1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright 2015, Joyent, Inc. All rights reserved. 24 * Copyright 2015 Nexenta Systems, Inc. All rights reserved. 25 */ 26 27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 28 /* All Rights Reserved */ 29 30 /* 31 * University Copyright- Copyright (c) 1982, 1986, 1988 32 * The Regents of the University of California 33 * All Rights Reserved 34 * 35 * University Acknowledgment- Portions of this document are derived from 36 * software developed by the University of California, Berkeley, and its 37 * contributors. 38 */ 39 40 /* 41 * VM - shared or copy-on-write from a vnode/anonymous memory. 
*/

#include <sys/types.h>
#include <sys/param.h>
#include <sys/t_lock.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/debug.h>
#include <sys/cred.h>
#include <sys/vmsystm.h>
#include <sys/tuneable.h>
#include <sys/bitmap.h>
#include <sys/swap.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/vtrace.h>
#include <sys/cmn_err.h>
#include <sys/callb.h>
#include <sys/vm.h>
#include <sys/dumphdr.h>
#include <sys/lgrp.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_vn.h>
#include <vm/pvn.h>
#include <vm/anon.h>
#include <vm/page.h>
#include <vm/vpage.h>
#include <sys/proc.h>
#include <sys/task.h>
#include <sys/project.h>
#include <sys/zone.h>
#include <sys/shm_impl.h>

/*
 * segvn_fault needs a temporary page list array.  To avoid calling kmem all
 * the time, it creates a small (PVN_GETPAGE_NUM entry) array and uses it if
 * it can.  In the rare case when this page list is not large enough, it
 * goes and gets a large enough array from kmem.
 *
 * This small page list array covers either 8 pages or 64kB worth of pages -
 * whichever is smaller.
 */
#define	PVN_MAX_GETPAGE_SZ	0x10000
#define	PVN_MAX_GETPAGE_NUM	0x8

/*
 * Pick whichever limit is smaller for this platform's PAGESIZE: the
 * page-count cap (first branch) or the byte-size cap (second branch).
 */
#if PVN_MAX_GETPAGE_SZ > PVN_MAX_GETPAGE_NUM * PAGESIZE
#define	PVN_GETPAGE_SZ	ptob(PVN_MAX_GETPAGE_NUM)
#define	PVN_GETPAGE_NUM	PVN_MAX_GETPAGE_NUM
#else
#define	PVN_GETPAGE_SZ	PVN_MAX_GETPAGE_SZ
#define	PVN_GETPAGE_NUM	btop(PVN_MAX_GETPAGE_SZ)
#endif

/*
 * Private seg op routines.
 */
static int	segvn_dup(struct seg *seg, struct seg *newseg);
static int	segvn_unmap(struct seg *seg, caddr_t addr, size_t len);
static void	segvn_free(struct seg *seg);
static faultcode_t segvn_fault(struct hat *hat, struct seg *seg,
		    caddr_t addr, size_t len, enum fault_type type,
		    enum seg_rw rw);
static faultcode_t segvn_faulta(struct seg *seg, caddr_t addr);
static int	segvn_setprot(struct seg *seg, caddr_t addr,
		    size_t len, uint_t prot);
static int	segvn_checkprot(struct seg *seg, caddr_t addr,
		    size_t len, uint_t prot);
static int	segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta);
static size_t	segvn_swapout(struct seg *seg);
static int	segvn_sync(struct seg *seg, caddr_t addr, size_t len,
		    int attr, uint_t flags);
static size_t	segvn_incore(struct seg *seg, caddr_t addr, size_t len,
		    char *vec);
static int	segvn_lockop(struct seg *seg, caddr_t addr, size_t len,
		    int attr, int op, ulong_t *lockmap, size_t pos);
static int	segvn_getprot(struct seg *seg, caddr_t addr, size_t len,
		    uint_t *protv);
static u_offset_t	segvn_getoffset(struct seg *seg, caddr_t addr);
static int	segvn_gettype(struct seg *seg, caddr_t addr);
static int	segvn_getvp(struct seg *seg, caddr_t addr,
		    struct vnode **vpp);
static int	segvn_advise(struct seg *seg, caddr_t addr, size_t len,
		    uint_t behav);
static void	segvn_dump(struct seg *seg);
static int	segvn_pagelock(struct seg *seg, caddr_t addr, size_t len,
		    struct page ***ppp, enum lock_type type, enum seg_rw rw);
static int	segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len,
		    uint_t szc);
static int	segvn_getmemid(struct seg *seg, caddr_t addr,
		    memid_t *memidp);
static lgrp_mem_policy_info_t	*segvn_getpolicy(struct seg *, caddr_t);
static int	segvn_capable(struct seg *seg, segcapability_t capable);
static int	segvn_inherit(struct seg *, caddr_t, size_t, uint_t);

/*
 * Segment operations vector for segvn segments.  The initializer is
 * positional (no designators), so the entries must stay in exactly the
 * member order of struct seg_ops.
 */
struct seg_ops segvn_ops = {
	segvn_dup,
	segvn_unmap,
	segvn_free,
	segvn_fault,
	segvn_faulta,
	segvn_setprot,
	segvn_checkprot,
	segvn_kluster,
	segvn_swapout,
	segvn_sync,
	segvn_incore,
	segvn_lockop,
	segvn_getprot,
	segvn_getoffset,
	segvn_gettype,
	segvn_getvp,
	segvn_advise,
	segvn_dump,
	segvn_pagelock,
	segvn_setpagesize,
	segvn_getmemid,
	segvn_getpolicy,
	segvn_capable,
	segvn_inherit
};

/*
 * Common zfod structures, provided as a shorthand for others to use.
 */
static segvn_crargs_t zfod_segvn_crargs =
	SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
static segvn_crargs_t kzfod_segvn_crargs =
	SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_USER,
	PROT_ALL & ~PROT_USER);
static segvn_crargs_t stack_noexec_crargs =
	SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_EXEC, PROT_ALL);

/*
 * Exported argument pointers.  Note that stack_exec_argsp aliases the same
 * crargs structure as zfod_argsp.
 */
caddr_t	zfod_argsp = (caddr_t)&zfod_segvn_crargs;	/* user zfod argsp */
caddr_t	kzfod_argsp = (caddr_t)&kzfod_segvn_crargs;	/* kernel zfod argsp */
caddr_t	stack_exec_argsp = (caddr_t)&zfod_segvn_crargs;	/* executable stack */
caddr_t	stack_noexec_argsp = (caddr_t)&stack_noexec_crargs; /* noexec stack */

#define	vpgtob(n)	((n) * sizeof (struct vpage))	/* For brevity */

size_t	segvn_comb_thrshld = UINT_MAX;	/* patchable -- see 1196681 */

/*
 * segvn_pglock_comb_bshift and segvn_pglock_comb_palign are derived from
 * segvn_pglock_comb_balign in segvn_init(), which also resets balign to 64K
 * if it is not a power of two or is smaller than PAGESIZE.
 */
size_t	segvn_pglock_comb_thrshld = (1UL << 16);	/* 64K */
size_t	segvn_pglock_comb_balign = (1UL << 16);		/* 64K */
uint_t	segvn_pglock_comb_bshift;
size_t	segvn_pglock_comb_palign;

/*
 * Private helper routines (not part of the seg_ops vector).
 */
static int	segvn_concat(struct seg *, struct seg *, int);
static int	segvn_extend_prev(struct seg *, struct seg *,
		    struct segvn_crargs *, size_t);
static int	segvn_extend_next(struct seg *, struct seg *,
		    struct segvn_crargs *, size_t);
static void	segvn_softunlock(struct seg *, caddr_t, size_t, enum seg_rw);
static void	segvn_pagelist_rele(page_t **);
static void	segvn_setvnode_mpss(vnode_t *);
static void	segvn_relocate_pages(page_t **, page_t *);
static int	segvn_full_szcpages(page_t **, uint_t, int *, uint_t *);
static int	segvn_fill_vp_pages(struct segvn_data *, vnode_t *, u_offset_t,
    uint_t, page_t **, page_t **, uint_t *, int *);
static faultcode_t segvn_fault_vnodepages(struct hat *, struct seg *, caddr_t,
    caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int);
static faultcode_t segvn_fault_anonpages(struct hat *, struct seg *, caddr_t,
    caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int);
static faultcode_t segvn_faultpage(struct hat *, struct seg *, caddr_t,
    u_offset_t, struct vpage *, page_t **, uint_t,
    enum fault_type, enum seg_rw, int);
static void	segvn_vpage(struct seg *);
static size_t	segvn_count_swap_by_vpages(struct seg *);

static void segvn_purge(struct seg *seg);
static int segvn_reclaim(void *, caddr_t, size_t, struct page **,
    enum seg_rw, int);
static int shamp_reclaim(void *, caddr_t, size_t, struct page **,
    enum seg_rw, int);

static int sameprot(struct seg *, caddr_t, size_t);

static int segvn_demote_range(struct seg *, caddr_t, size_t, int, uint_t);
static int segvn_clrszc(struct seg *);
static struct seg *segvn_split_seg(struct seg *, caddr_t);
static int segvn_claim_pages(struct seg *, struct vpage *, u_offset_t,
    ulong_t, uint_t);

static void segvn_hat_rgn_unload_callback(caddr_t, caddr_t, caddr_t,
    size_t, void *, u_offset_t);

/*
 * kmem caches set up by segvn_init(): segvn_cache holds struct segvn_data
 * objects; segvn_szc_cache[szc] (szc >= 1) holds arrays of
 * page_get_pagecnt(szc) page_t pointers.
 */
static struct kmem_cache *segvn_cache;
static struct kmem_cache **segvn_szc_cache;

#ifdef VM_STATS
/*
 * Statistics arrays, present only when VM_STATS is defined.
 * NOTE(review): each array appears to belong to the like-named routine;
 * the meaning of individual slots is not visible here.
 */
static struct segvnvmstats_str {
	ulong_t	fill_vp_pages[31];
	ulong_t fltvnpages[49];
	ulong_t	fullszcpages[10];
	ulong_t	relocatepages[3];
	ulong_t	fltanpages[17];
	ulong_t pagelock[2];
	ulong_t	demoterange[3];
} segvnvmstats;
#endif /* VM_STATS */

#define	SDR_RANGE	1		/* demote entire range */
#define	SDR_END		2		/* demote
non aligned ends only */ 248 249 #define CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr) { \ 250 if ((len) != 0) { \ 251 lpgaddr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz); \ 252 ASSERT(lpgaddr >= (seg)->s_base); \ 253 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)((addr) + \ 254 (len)), pgsz); \ 255 ASSERT(lpgeaddr > lpgaddr); \ 256 ASSERT(lpgeaddr <= (seg)->s_base + (seg)->s_size); \ 257 } else { \ 258 lpgeaddr = lpgaddr = (addr); \ 259 } \ 260 } 261 262 /*ARGSUSED*/ 263 static int 264 segvn_cache_constructor(void *buf, void *cdrarg, int kmflags) 265 { 266 struct segvn_data *svd = buf; 267 268 rw_init(&svd->lock, NULL, RW_DEFAULT, NULL); 269 mutex_init(&svd->segfree_syncmtx, NULL, MUTEX_DEFAULT, NULL); 270 svd->svn_trnext = svd->svn_trprev = NULL; 271 return (0); 272 } 273 274 /*ARGSUSED1*/ 275 static void 276 segvn_cache_destructor(void *buf, void *cdrarg) 277 { 278 struct segvn_data *svd = buf; 279 280 rw_destroy(&svd->lock); 281 mutex_destroy(&svd->segfree_syncmtx); 282 } 283 284 /*ARGSUSED*/ 285 static int 286 svntr_cache_constructor(void *buf, void *cdrarg, int kmflags) 287 { 288 bzero(buf, sizeof (svntr_t)); 289 return (0); 290 } 291 292 /* 293 * Patching this variable to non-zero allows the system to run with 294 * stacks marked as "not executable". It's a bit of a kludge, but is 295 * provided as a tweakable for platforms that export those ABIs 296 * (e.g. sparc V8) that have executable stacks enabled by default. 297 * There are also some restrictions for platforms that don't actually 298 * implement 'noexec' protections. 299 * 300 * Once enabled, the system is (therefore) unable to provide a fully 301 * ABI-compliant execution environment, though practically speaking, 302 * most everything works. The exceptions are generally some interpreters 303 * and debuggers that create executable code on the stack and jump 304 * into it (without explicitly mprotecting the address range to include 305 * PROT_EXEC). 
306 * 307 * One important class of applications that are disabled are those 308 * that have been transformed into malicious agents using one of the 309 * numerous "buffer overflow" attacks. See 4007890. 310 */ 311 int noexec_user_stack = 0; 312 int noexec_user_stack_log = 1; 313 314 int segvn_lpg_disable = 0; 315 uint_t segvn_maxpgszc = 0; 316 317 ulong_t segvn_vmpss_clrszc_cnt; 318 ulong_t segvn_vmpss_clrszc_err; 319 ulong_t segvn_fltvnpages_clrszc_cnt; 320 ulong_t segvn_fltvnpages_clrszc_err; 321 ulong_t segvn_setpgsz_align_err; 322 ulong_t segvn_setpgsz_anon_align_err; 323 ulong_t segvn_setpgsz_getattr_err; 324 ulong_t segvn_setpgsz_eof_err; 325 ulong_t segvn_faultvnmpss_align_err1; 326 ulong_t segvn_faultvnmpss_align_err2; 327 ulong_t segvn_faultvnmpss_align_err3; 328 ulong_t segvn_faultvnmpss_align_err4; 329 ulong_t segvn_faultvnmpss_align_err5; 330 ulong_t segvn_vmpss_pageio_deadlk_err; 331 332 int segvn_use_regions = 1; 333 334 /* 335 * Segvn supports text replication optimization for NUMA platforms. Text 336 * replica's are represented by anon maps (amp). There's one amp per text file 337 * region per lgroup. A process chooses the amp for each of its text mappings 338 * based on the lgroup assignment of its main thread (t_tid = 1). All 339 * processes that want a replica on a particular lgroup for the same text file 340 * mapping share the same amp. amp's are looked up in svntr_hashtab hash table 341 * with vp,off,size,szc used as a key. Text replication segments are read only 342 * MAP_PRIVATE|MAP_TEXT segments that map vnode. Replication is achieved by 343 * forcing COW faults from vnode to amp and mapping amp pages instead of vnode 344 * pages. Replication amp is assigned to a segment when it gets its first 345 * pagefault. To handle main thread lgroup rehoming segvn_trasync_thread 346 * rechecks periodically if the process still maps an amp local to the main 347 * thread. 
If not, the async thread forces the process to remap to an amp in the new
 * home lgroup of the main thread. The current text replication implementation
 * only provides a benefit to workloads that do most of their work in the
 * main thread of a process, or where all the threads of a process run in the
 * same lgroup. To extend the text replication benefit to different types of
 * multithreaded workloads, further work would be needed in the hat layer to
 * allow the same virtual address in the same hat to simultaneously map
 * different physical addresses (i.e. page table replication would be needed
 * for x86).
 *
 * amp pages are used instead of vnode pages as long as the segment has a very
 * simple life cycle.  It's created via segvn_create(), handles S_EXEC
 * (S_READ) pagefaults and is fully unmapped.  If anything more complicated
 * happens, such as the protection being changed, a real COW fault happening,
 * the pagesize being changed, MC_LOCK being requested or the segment being
 * partially unmapped, we turn off text replication by converting the segment
 * back to a vnode only segment (unmap the segment's address range and set
 * svd->amp to NULL).
 *
 * The original file can be changed after the amp is inserted into
 * svntr_hashtab. Processes that are launched after the file is already
 * changed can't use the replicas created prior to the file change. To
 * implement this functionality hash entries are timestamped. Replicas can
 * only be used if the current file modification time is the same as the
 * timestamp saved when the hash entry was created. However, timestamps alone
 * are not sufficient to detect file modification via mmap(MAP_SHARED)
 * mappings. We deal with file changes via MAP_SHARED mappings differently.
 * When writable MAP_SHARED mappings are created to vnodes marked as
 * executable we mark all existing replicas for this vnode as not usable for
 * future text mappings.
And we don't create new replica's for files that currently have 376 * potentially writable MAP_SHARED mappings (i.e. vn_is_mapped(V_WRITE) is 377 * true). 378 */ 379 380 #define SEGVN_TEXTREPL_MAXBYTES_FACTOR (20) 381 size_t segvn_textrepl_max_bytes_factor = SEGVN_TEXTREPL_MAXBYTES_FACTOR; 382 383 static ulong_t svntr_hashtab_sz = 512; 384 static svntr_bucket_t *svntr_hashtab = NULL; 385 static struct kmem_cache *svntr_cache; 386 static svntr_stats_t *segvn_textrepl_stats; 387 static ksema_t segvn_trasync_sem; 388 389 int segvn_disable_textrepl = 1; 390 size_t textrepl_size_thresh = (size_t)-1; 391 size_t segvn_textrepl_bytes = 0; 392 size_t segvn_textrepl_max_bytes = 0; 393 clock_t segvn_update_textrepl_interval = 0; 394 int segvn_update_tr_time = 10; 395 int segvn_disable_textrepl_update = 0; 396 397 static void segvn_textrepl(struct seg *); 398 static void segvn_textunrepl(struct seg *, int); 399 static void segvn_inval_trcache(vnode_t *); 400 static void segvn_trasync_thread(void); 401 static void segvn_trupdate_wakeup(void *); 402 static void segvn_trupdate(void); 403 static void segvn_trupdate_seg(struct seg *, segvn_data_t *, svntr_t *, 404 ulong_t); 405 406 /* 407 * Initialize segvn data structures 408 */ 409 void 410 segvn_init(void) 411 { 412 uint_t maxszc; 413 uint_t szc; 414 size_t pgsz; 415 416 segvn_cache = kmem_cache_create("segvn_cache", 417 sizeof (struct segvn_data), 0, 418 segvn_cache_constructor, segvn_cache_destructor, NULL, 419 NULL, NULL, 0); 420 421 if (segvn_lpg_disable == 0) { 422 szc = maxszc = page_num_pagesizes() - 1; 423 if (szc == 0) { 424 segvn_lpg_disable = 1; 425 } 426 if (page_get_pagesize(0) != PAGESIZE) { 427 panic("segvn_init: bad szc 0"); 428 /*NOTREACHED*/ 429 } 430 while (szc != 0) { 431 pgsz = page_get_pagesize(szc); 432 if (pgsz <= PAGESIZE || !IS_P2ALIGNED(pgsz, pgsz)) { 433 panic("segvn_init: bad szc %d", szc); 434 /*NOTREACHED*/ 435 } 436 szc--; 437 } 438 if (segvn_maxpgszc == 0 || segvn_maxpgszc > maxszc) 439 
segvn_maxpgszc = maxszc; 440 } 441 442 if (segvn_maxpgszc) { 443 segvn_szc_cache = (struct kmem_cache **)kmem_alloc( 444 (segvn_maxpgszc + 1) * sizeof (struct kmem_cache *), 445 KM_SLEEP); 446 } 447 448 for (szc = 1; szc <= segvn_maxpgszc; szc++) { 449 char str[32]; 450 451 (void) sprintf(str, "segvn_szc_cache%d", szc); 452 segvn_szc_cache[szc] = kmem_cache_create(str, 453 page_get_pagecnt(szc) * sizeof (page_t *), 0, 454 NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG); 455 } 456 457 458 if (segvn_use_regions && !hat_supported(HAT_SHARED_REGIONS, NULL)) 459 segvn_use_regions = 0; 460 461 /* 462 * For now shared regions and text replication segvn support 463 * are mutually exclusive. This is acceptable because 464 * currently significant benefit from text replication was 465 * only observed on AMD64 NUMA platforms (due to relatively 466 * small L2$ size) and currently we don't support shared 467 * regions on x86. 468 */ 469 if (segvn_use_regions && !segvn_disable_textrepl) { 470 segvn_disable_textrepl = 1; 471 } 472 473 #if defined(_LP64) 474 if (lgrp_optimizations() && textrepl_size_thresh != (size_t)-1 && 475 !segvn_disable_textrepl) { 476 ulong_t i; 477 size_t hsz = svntr_hashtab_sz * sizeof (svntr_bucket_t); 478 479 svntr_cache = kmem_cache_create("svntr_cache", 480 sizeof (svntr_t), 0, svntr_cache_constructor, NULL, 481 NULL, NULL, NULL, 0); 482 svntr_hashtab = kmem_zalloc(hsz, KM_SLEEP); 483 for (i = 0; i < svntr_hashtab_sz; i++) { 484 mutex_init(&svntr_hashtab[i].tr_lock, NULL, 485 MUTEX_DEFAULT, NULL); 486 } 487 segvn_textrepl_max_bytes = ptob(physmem) / 488 segvn_textrepl_max_bytes_factor; 489 segvn_textrepl_stats = kmem_zalloc(NCPU * 490 sizeof (svntr_stats_t), KM_SLEEP); 491 sema_init(&segvn_trasync_sem, 0, NULL, SEMA_DEFAULT, NULL); 492 (void) thread_create(NULL, 0, segvn_trasync_thread, 493 NULL, 0, &p0, TS_RUN, minclsyspri); 494 } 495 #endif 496 497 if (!ISP2(segvn_pglock_comb_balign) || 498 segvn_pglock_comb_balign < PAGESIZE) { 499 
segvn_pglock_comb_balign = 1UL << 16; /* 64K */ 500 } 501 segvn_pglock_comb_bshift = highbit(segvn_pglock_comb_balign) - 1; 502 segvn_pglock_comb_palign = btop(segvn_pglock_comb_balign); 503 } 504 505 #define SEGVN_PAGEIO ((void *)0x1) 506 #define SEGVN_NOPAGEIO ((void *)0x2) 507 508 static void 509 segvn_setvnode_mpss(vnode_t *vp) 510 { 511 int err; 512 513 ASSERT(vp->v_mpssdata == NULL || 514 vp->v_mpssdata == SEGVN_PAGEIO || 515 vp->v_mpssdata == SEGVN_NOPAGEIO); 516 517 if (vp->v_mpssdata == NULL) { 518 if (vn_vmpss_usepageio(vp)) { 519 err = VOP_PAGEIO(vp, (page_t *)NULL, 520 (u_offset_t)0, 0, 0, CRED(), NULL); 521 } else { 522 err = ENOSYS; 523 } 524 /* 525 * set v_mpssdata just once per vnode life 526 * so that it never changes. 527 */ 528 mutex_enter(&vp->v_lock); 529 if (vp->v_mpssdata == NULL) { 530 if (err == EINVAL) { 531 vp->v_mpssdata = SEGVN_PAGEIO; 532 } else { 533 vp->v_mpssdata = SEGVN_NOPAGEIO; 534 } 535 } 536 mutex_exit(&vp->v_lock); 537 } 538 } 539 540 int 541 segvn_create(struct seg *seg, void *argsp) 542 { 543 struct segvn_crargs *a = (struct segvn_crargs *)argsp; 544 struct segvn_data *svd; 545 size_t swresv = 0; 546 struct cred *cred; 547 struct anon_map *amp; 548 int error = 0; 549 size_t pgsz; 550 lgrp_mem_policy_t mpolicy = LGRP_MEM_POLICY_DEFAULT; 551 int use_rgn = 0; 552 int trok = 0; 553 554 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as)); 555 556 if (a->type != MAP_PRIVATE && a->type != MAP_SHARED) { 557 panic("segvn_create type"); 558 /*NOTREACHED*/ 559 } 560 561 /* 562 * Check arguments. If a shared anon structure is given then 563 * it is illegal to also specify a vp. 564 */ 565 if (a->amp != NULL && a->vp != NULL) { 566 panic("segvn_create anon_map"); 567 /*NOTREACHED*/ 568 } 569 570 if (a->type == MAP_PRIVATE && (a->flags & MAP_TEXT) && 571 a->vp != NULL && a->prot == (PROT_USER | PROT_READ | PROT_EXEC) && 572 segvn_use_regions) { 573 use_rgn = 1; 574 } 575 576 /* MAP_NORESERVE on a MAP_SHARED segment is meaningless. 
*/ 577 if (a->type == MAP_SHARED) 578 a->flags &= ~MAP_NORESERVE; 579 580 if (a->szc != 0) { 581 if (segvn_lpg_disable != 0 || (a->szc == AS_MAP_NO_LPOOB) || 582 (a->amp != NULL && a->type == MAP_PRIVATE) || 583 (a->flags & MAP_NORESERVE) || seg->s_as == &kas) { 584 a->szc = 0; 585 } else { 586 if (a->szc > segvn_maxpgszc) 587 a->szc = segvn_maxpgszc; 588 pgsz = page_get_pagesize(a->szc); 589 if (!IS_P2ALIGNED(seg->s_base, pgsz) || 590 !IS_P2ALIGNED(seg->s_size, pgsz)) { 591 a->szc = 0; 592 } else if (a->vp != NULL) { 593 if (IS_SWAPFSVP(a->vp) || VN_ISKAS(a->vp)) { 594 /* 595 * paranoid check. 596 * hat_page_demote() is not supported 597 * on swapfs pages. 598 */ 599 a->szc = 0; 600 } else if (map_addr_vacalign_check(seg->s_base, 601 a->offset & PAGEMASK)) { 602 a->szc = 0; 603 } 604 } else if (a->amp != NULL) { 605 pgcnt_t anum = btopr(a->offset); 606 pgcnt_t pgcnt = page_get_pagecnt(a->szc); 607 if (!IS_P2ALIGNED(anum, pgcnt)) { 608 a->szc = 0; 609 } 610 } 611 } 612 } 613 614 /* 615 * If segment may need private pages, reserve them now. 616 */ 617 if (!(a->flags & MAP_NORESERVE) && ((a->vp == NULL && a->amp == NULL) || 618 (a->type == MAP_PRIVATE && (a->prot & PROT_WRITE)))) { 619 if (anon_resv_zone(seg->s_size, 620 seg->s_as->a_proc->p_zone) == 0) 621 return (EAGAIN); 622 swresv = seg->s_size; 623 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 624 seg, swresv, 1); 625 } 626 627 /* 628 * Reserve any mapping structures that may be required. 629 * 630 * Don't do it for segments that may use regions. It's currently a 631 * noop in the hat implementations anyway. 
632 */ 633 if (!use_rgn) { 634 hat_map(seg->s_as->a_hat, seg->s_base, seg->s_size, HAT_MAP); 635 } 636 637 if (a->cred) { 638 cred = a->cred; 639 crhold(cred); 640 } else { 641 crhold(cred = CRED()); 642 } 643 644 /* Inform the vnode of the new mapping */ 645 if (a->vp != NULL) { 646 error = VOP_ADDMAP(a->vp, a->offset & PAGEMASK, 647 seg->s_as, seg->s_base, seg->s_size, a->prot, 648 a->maxprot, a->type, cred, NULL); 649 if (error) { 650 if (swresv != 0) { 651 anon_unresv_zone(swresv, 652 seg->s_as->a_proc->p_zone); 653 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 654 "anon proc:%p %lu %u", seg, swresv, 0); 655 } 656 crfree(cred); 657 if (!use_rgn) { 658 hat_unload(seg->s_as->a_hat, seg->s_base, 659 seg->s_size, HAT_UNLOAD_UNMAP); 660 } 661 return (error); 662 } 663 /* 664 * svntr_hashtab will be NULL if we support shared regions. 665 */ 666 trok = ((a->flags & MAP_TEXT) && 667 (seg->s_size > textrepl_size_thresh || 668 (a->flags & _MAP_TEXTREPL)) && 669 lgrp_optimizations() && svntr_hashtab != NULL && 670 a->type == MAP_PRIVATE && swresv == 0 && 671 !(a->flags & MAP_NORESERVE) && 672 seg->s_as != &kas && a->vp->v_type == VREG); 673 674 ASSERT(!trok || !use_rgn); 675 } 676 677 /* 678 * MAP_NORESERVE mappings don't count towards the VSZ of a process 679 * until we fault the pages in. 680 */ 681 if ((a->vp == NULL || a->vp->v_type != VREG) && 682 a->flags & MAP_NORESERVE) { 683 seg->s_as->a_resvsize -= seg->s_size; 684 } 685 686 /* 687 * If more than one segment in the address space, and they're adjacent 688 * virtually, try to concatenate them. Don't concatenate if an 689 * explicit anon_map structure was supplied (e.g., SystemV shared 690 * memory) or if we'll use text replication for this segment. 
691 */ 692 if (a->amp == NULL && !use_rgn && !trok) { 693 struct seg *pseg, *nseg; 694 struct segvn_data *psvd, *nsvd; 695 lgrp_mem_policy_t ppolicy, npolicy; 696 uint_t lgrp_mem_policy_flags = 0; 697 extern lgrp_mem_policy_t lgrp_mem_default_policy; 698 699 /* 700 * Memory policy flags (lgrp_mem_policy_flags) is valid when 701 * extending stack/heap segments. 702 */ 703 if ((a->vp == NULL) && (a->type == MAP_PRIVATE) && 704 !(a->flags & MAP_NORESERVE) && (seg->s_as != &kas)) { 705 lgrp_mem_policy_flags = a->lgrp_mem_policy_flags; 706 } else { 707 /* 708 * Get policy when not extending it from another segment 709 */ 710 mpolicy = lgrp_mem_policy_default(seg->s_size, a->type); 711 } 712 713 /* 714 * First, try to concatenate the previous and new segments 715 */ 716 pseg = AS_SEGPREV(seg->s_as, seg); 717 if (pseg != NULL && 718 pseg->s_base + pseg->s_size == seg->s_base && 719 pseg->s_ops == &segvn_ops) { 720 /* 721 * Get memory allocation policy from previous segment. 722 * When extension is specified (e.g. for heap) apply 723 * this policy to the new segment regardless of the 724 * outcome of segment concatenation. Extension occurs 725 * for non-default policy otherwise default policy is 726 * used and is based on extended segment size. 727 */ 728 psvd = (struct segvn_data *)pseg->s_data; 729 ppolicy = psvd->policy_info.mem_policy; 730 if (lgrp_mem_policy_flags == 731 LGRP_MP_FLAG_EXTEND_UP) { 732 if (ppolicy != lgrp_mem_default_policy) { 733 mpolicy = ppolicy; 734 } else { 735 mpolicy = lgrp_mem_policy_default( 736 pseg->s_size + seg->s_size, 737 a->type); 738 } 739 } 740 741 if (mpolicy == ppolicy && 742 (pseg->s_size + seg->s_size <= 743 segvn_comb_thrshld || psvd->amp == NULL) && 744 segvn_extend_prev(pseg, seg, a, swresv) == 0) { 745 /* 746 * success! 
now try to concatenate 747 * with following seg 748 */ 749 crfree(cred); 750 nseg = AS_SEGNEXT(pseg->s_as, pseg); 751 if (nseg != NULL && 752 nseg != pseg && 753 nseg->s_ops == &segvn_ops && 754 pseg->s_base + pseg->s_size == 755 nseg->s_base) 756 (void) segvn_concat(pseg, nseg, 0); 757 ASSERT(pseg->s_szc == 0 || 758 (a->szc == pseg->s_szc && 759 IS_P2ALIGNED(pseg->s_base, pgsz) && 760 IS_P2ALIGNED(pseg->s_size, pgsz))); 761 return (0); 762 } 763 } 764 765 /* 766 * Failed, so try to concatenate with following seg 767 */ 768 nseg = AS_SEGNEXT(seg->s_as, seg); 769 if (nseg != NULL && 770 seg->s_base + seg->s_size == nseg->s_base && 771 nseg->s_ops == &segvn_ops) { 772 /* 773 * Get memory allocation policy from next segment. 774 * When extension is specified (e.g. for stack) apply 775 * this policy to the new segment regardless of the 776 * outcome of segment concatenation. Extension occurs 777 * for non-default policy otherwise default policy is 778 * used and is based on extended segment size. 
779 */ 780 nsvd = (struct segvn_data *)nseg->s_data; 781 npolicy = nsvd->policy_info.mem_policy; 782 if (lgrp_mem_policy_flags == 783 LGRP_MP_FLAG_EXTEND_DOWN) { 784 if (npolicy != lgrp_mem_default_policy) { 785 mpolicy = npolicy; 786 } else { 787 mpolicy = lgrp_mem_policy_default( 788 nseg->s_size + seg->s_size, 789 a->type); 790 } 791 } 792 793 if (mpolicy == npolicy && 794 segvn_extend_next(seg, nseg, a, swresv) == 0) { 795 crfree(cred); 796 ASSERT(nseg->s_szc == 0 || 797 (a->szc == nseg->s_szc && 798 IS_P2ALIGNED(nseg->s_base, pgsz) && 799 IS_P2ALIGNED(nseg->s_size, pgsz))); 800 return (0); 801 } 802 } 803 } 804 805 if (a->vp != NULL) { 806 VN_HOLD(a->vp); 807 if (a->type == MAP_SHARED) 808 lgrp_shm_policy_init(NULL, a->vp); 809 } 810 svd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 811 812 seg->s_ops = &segvn_ops; 813 seg->s_data = (void *)svd; 814 seg->s_szc = a->szc; 815 816 svd->seg = seg; 817 svd->vp = a->vp; 818 /* 819 * Anonymous mappings have no backing file so the offset is meaningless. 820 */ 821 svd->offset = a->vp ? (a->offset & PAGEMASK) : 0; 822 svd->prot = a->prot; 823 svd->maxprot = a->maxprot; 824 svd->pageprot = 0; 825 svd->type = a->type; 826 svd->vpage = NULL; 827 svd->cred = cred; 828 svd->advice = MADV_NORMAL; 829 svd->pageadvice = 0; 830 svd->flags = (ushort_t)a->flags; 831 svd->softlockcnt = 0; 832 svd->softlockcnt_sbase = 0; 833 svd->softlockcnt_send = 0; 834 svd->svn_inz = 0; 835 svd->rcookie = HAT_INVALID_REGION_COOKIE; 836 svd->pageswap = 0; 837 838 if (a->szc != 0 && a->vp != NULL) { 839 segvn_setvnode_mpss(a->vp); 840 } 841 if (svd->type == MAP_SHARED && svd->vp != NULL && 842 (svd->vp->v_flag & VVMEXEC) && (svd->prot & PROT_WRITE)) { 843 ASSERT(vn_is_mapped(svd->vp, V_WRITE)); 844 segvn_inval_trcache(svd->vp); 845 } 846 847 amp = a->amp; 848 if ((svd->amp = amp) == NULL) { 849 svd->anon_index = 0; 850 if (svd->type == MAP_SHARED) { 851 svd->swresv = 0; 852 /* 853 * Shared mappings to a vp need no other setup. 
854 * If we have a shared mapping to an anon_map object 855 * which hasn't been allocated yet, allocate the 856 * struct now so that it will be properly shared 857 * by remembering the swap reservation there. 858 */ 859 if (a->vp == NULL) { 860 svd->amp = anonmap_alloc(seg->s_size, swresv, 861 ANON_SLEEP); 862 svd->amp->a_szc = seg->s_szc; 863 } 864 } else { 865 /* 866 * Private mapping (with or without a vp). 867 * Allocate anon_map when needed. 868 */ 869 svd->swresv = swresv; 870 } 871 } else { 872 pgcnt_t anon_num; 873 874 /* 875 * Mapping to an existing anon_map structure without a vp. 876 * For now we will insure that the segment size isn't larger 877 * than the size - offset gives us. Later on we may wish to 878 * have the anon array dynamically allocated itself so that 879 * we don't always have to allocate all the anon pointer slots. 880 * This of course involves adding extra code to check that we 881 * aren't trying to use an anon pointer slot beyond the end 882 * of the currently allocated anon array. 883 */ 884 if ((amp->size - a->offset) < seg->s_size) { 885 panic("segvn_create anon_map size"); 886 /*NOTREACHED*/ 887 } 888 889 anon_num = btopr(a->offset); 890 891 if (a->type == MAP_SHARED) { 892 /* 893 * SHARED mapping to a given anon_map. 894 */ 895 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 896 amp->refcnt++; 897 if (a->szc > amp->a_szc) { 898 amp->a_szc = a->szc; 899 } 900 ANON_LOCK_EXIT(&->a_rwlock); 901 svd->anon_index = anon_num; 902 svd->swresv = 0; 903 } else { 904 /* 905 * PRIVATE mapping to a given anon_map. 906 * Make sure that all the needed anon 907 * structures are created (so that we will 908 * share the underlying pages if nothing 909 * is written by this mapping) and then 910 * duplicate the anon array as is done 911 * when a privately mapped segment is dup'ed. 
912 */ 913 struct anon *ap; 914 caddr_t addr; 915 caddr_t eaddr; 916 ulong_t anon_idx; 917 int hat_flag = HAT_LOAD; 918 919 if (svd->flags & MAP_TEXT) { 920 hat_flag |= HAT_LOAD_TEXT; 921 } 922 923 svd->amp = anonmap_alloc(seg->s_size, 0, ANON_SLEEP); 924 svd->amp->a_szc = seg->s_szc; 925 svd->anon_index = 0; 926 svd->swresv = swresv; 927 928 /* 929 * Prevent 2 threads from allocating anon 930 * slots simultaneously. 931 */ 932 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 933 eaddr = seg->s_base + seg->s_size; 934 935 for (anon_idx = anon_num, addr = seg->s_base; 936 addr < eaddr; addr += PAGESIZE, anon_idx++) { 937 page_t *pp; 938 939 if ((ap = anon_get_ptr(amp->ahp, 940 anon_idx)) != NULL) 941 continue; 942 943 /* 944 * Allocate the anon struct now. 945 * Might as well load up translation 946 * to the page while we're at it... 947 */ 948 pp = anon_zero(seg, addr, &ap, cred); 949 if (ap == NULL || pp == NULL) { 950 panic("segvn_create anon_zero"); 951 /*NOTREACHED*/ 952 } 953 954 /* 955 * Re-acquire the anon_map lock and 956 * initialize the anon array entry. 
957 */ 958 ASSERT(anon_get_ptr(amp->ahp, 959 anon_idx) == NULL); 960 (void) anon_set_ptr(amp->ahp, anon_idx, ap, 961 ANON_SLEEP); 962 963 ASSERT(seg->s_szc == 0); 964 ASSERT(!IS_VMODSORT(pp->p_vnode)); 965 966 ASSERT(use_rgn == 0); 967 hat_memload(seg->s_as->a_hat, addr, pp, 968 svd->prot & ~PROT_WRITE, hat_flag); 969 970 page_unlock(pp); 971 } 972 ASSERT(seg->s_szc == 0); 973 anon_dup(amp->ahp, anon_num, svd->amp->ahp, 974 0, seg->s_size); 975 ANON_LOCK_EXIT(&->a_rwlock); 976 } 977 } 978 979 /* 980 * Set default memory allocation policy for segment 981 * 982 * Always set policy for private memory at least for initialization 983 * even if this is a shared memory segment 984 */ 985 (void) lgrp_privm_policy_set(mpolicy, &svd->policy_info, seg->s_size); 986 987 if (svd->type == MAP_SHARED) 988 (void) lgrp_shm_policy_set(mpolicy, svd->amp, svd->anon_index, 989 svd->vp, svd->offset, seg->s_size); 990 991 if (use_rgn) { 992 ASSERT(!trok); 993 ASSERT(svd->amp == NULL); 994 svd->rcookie = hat_join_region(seg->s_as->a_hat, seg->s_base, 995 seg->s_size, (void *)svd->vp, svd->offset, svd->prot, 996 (uchar_t)seg->s_szc, segvn_hat_rgn_unload_callback, 997 HAT_REGION_TEXT); 998 } 999 1000 ASSERT(!trok || !(svd->prot & PROT_WRITE)); 1001 svd->tr_state = trok ? SEGVN_TR_INIT : SEGVN_TR_OFF; 1002 1003 return (0); 1004 } 1005 1006 /* 1007 * Concatenate two existing segments, if possible. 1008 * Return 0 on success, -1 if two segments are not compatible 1009 * or -2 on memory allocation failure. 
1010 * If amp_cat == 1 then try and concat segments with anon maps 1011 */ 1012 static int 1013 segvn_concat(struct seg *seg1, struct seg *seg2, int amp_cat) 1014 { 1015 struct segvn_data *svd1 = seg1->s_data; 1016 struct segvn_data *svd2 = seg2->s_data; 1017 struct anon_map *amp1 = svd1->amp; 1018 struct anon_map *amp2 = svd2->amp; 1019 struct vpage *vpage1 = svd1->vpage; 1020 struct vpage *vpage2 = svd2->vpage, *nvpage = NULL; 1021 size_t size, nvpsize; 1022 pgcnt_t npages1, npages2; 1023 1024 ASSERT(seg1->s_as && seg2->s_as && seg1->s_as == seg2->s_as); 1025 ASSERT(AS_WRITE_HELD(seg1->s_as)); 1026 ASSERT(seg1->s_ops == seg2->s_ops); 1027 1028 if (HAT_IS_REGION_COOKIE_VALID(svd1->rcookie) || 1029 HAT_IS_REGION_COOKIE_VALID(svd2->rcookie)) { 1030 return (-1); 1031 } 1032 1033 /* both segments exist, try to merge them */ 1034 #define incompat(x) (svd1->x != svd2->x) 1035 if (incompat(vp) || incompat(maxprot) || 1036 (!svd1->pageadvice && !svd2->pageadvice && incompat(advice)) || 1037 (!svd1->pageprot && !svd2->pageprot && incompat(prot)) || 1038 incompat(type) || incompat(cred) || incompat(flags) || 1039 seg1->s_szc != seg2->s_szc || incompat(policy_info.mem_policy) || 1040 (svd2->softlockcnt > 0) || svd1->softlockcnt_send > 0) 1041 return (-1); 1042 #undef incompat 1043 1044 /* 1045 * vp == NULL implies zfod, offset doesn't matter 1046 */ 1047 if (svd1->vp != NULL && 1048 svd1->offset + seg1->s_size != svd2->offset) { 1049 return (-1); 1050 } 1051 1052 /* 1053 * Don't concatenate if either segment uses text replication. 1054 */ 1055 if (svd1->tr_state != SEGVN_TR_OFF || svd2->tr_state != SEGVN_TR_OFF) { 1056 return (-1); 1057 } 1058 1059 /* 1060 * Fail early if we're not supposed to concatenate 1061 * segments with non NULL amp. 
1062 */ 1063 if (amp_cat == 0 && (amp1 != NULL || amp2 != NULL)) { 1064 return (-1); 1065 } 1066 1067 if (svd1->vp == NULL && svd1->type == MAP_SHARED) { 1068 if (amp1 != amp2) { 1069 return (-1); 1070 } 1071 if (amp1 != NULL && svd1->anon_index + btop(seg1->s_size) != 1072 svd2->anon_index) { 1073 return (-1); 1074 } 1075 ASSERT(amp1 == NULL || amp1->refcnt >= 2); 1076 } 1077 1078 /* 1079 * If either seg has vpages, create a new merged vpage array. 1080 */ 1081 if (vpage1 != NULL || vpage2 != NULL) { 1082 struct vpage *vp, *evp; 1083 1084 npages1 = seg_pages(seg1); 1085 npages2 = seg_pages(seg2); 1086 nvpsize = vpgtob(npages1 + npages2); 1087 1088 if ((nvpage = kmem_zalloc(nvpsize, KM_NOSLEEP)) == NULL) { 1089 return (-2); 1090 } 1091 1092 if (vpage1 != NULL) { 1093 bcopy(vpage1, nvpage, vpgtob(npages1)); 1094 } else { 1095 evp = nvpage + npages1; 1096 for (vp = nvpage; vp < evp; vp++) { 1097 VPP_SETPROT(vp, svd1->prot); 1098 VPP_SETADVICE(vp, svd1->advice); 1099 } 1100 } 1101 1102 if (vpage2 != NULL) { 1103 bcopy(vpage2, nvpage + npages1, vpgtob(npages2)); 1104 } else { 1105 evp = nvpage + npages1 + npages2; 1106 for (vp = nvpage + npages1; vp < evp; vp++) { 1107 VPP_SETPROT(vp, svd2->prot); 1108 VPP_SETADVICE(vp, svd2->advice); 1109 } 1110 } 1111 1112 if (svd2->pageswap && (!svd1->pageswap && svd1->swresv)) { 1113 ASSERT(svd1->swresv == seg1->s_size); 1114 ASSERT(!(svd1->flags & MAP_NORESERVE)); 1115 ASSERT(!(svd2->flags & MAP_NORESERVE)); 1116 evp = nvpage + npages1; 1117 for (vp = nvpage; vp < evp; vp++) { 1118 VPP_SETSWAPRES(vp); 1119 } 1120 } 1121 1122 if (svd1->pageswap && (!svd2->pageswap && svd2->swresv)) { 1123 ASSERT(svd2->swresv == seg2->s_size); 1124 ASSERT(!(svd1->flags & MAP_NORESERVE)); 1125 ASSERT(!(svd2->flags & MAP_NORESERVE)); 1126 vp = nvpage + npages1; 1127 evp = vp + npages2; 1128 for (; vp < evp; vp++) { 1129 VPP_SETSWAPRES(vp); 1130 } 1131 } 1132 } 1133 ASSERT((vpage1 != NULL || vpage2 != NULL) || 1134 (svd1->pageswap == 0 && 
svd2->pageswap == 0)); 1135 1136 /* 1137 * If either segment has private pages, create a new merged anon 1138 * array. If mergeing shared anon segments just decrement anon map's 1139 * refcnt. 1140 */ 1141 if (amp1 != NULL && svd1->type == MAP_SHARED) { 1142 ASSERT(amp1 == amp2 && svd1->vp == NULL); 1143 ANON_LOCK_ENTER(&1->a_rwlock, RW_WRITER); 1144 ASSERT(amp1->refcnt >= 2); 1145 amp1->refcnt--; 1146 ANON_LOCK_EXIT(&1->a_rwlock); 1147 svd2->amp = NULL; 1148 } else if (amp1 != NULL || amp2 != NULL) { 1149 struct anon_hdr *nahp; 1150 struct anon_map *namp = NULL; 1151 size_t asize; 1152 1153 ASSERT(svd1->type == MAP_PRIVATE); 1154 1155 asize = seg1->s_size + seg2->s_size; 1156 if ((nahp = anon_create(btop(asize), ANON_NOSLEEP)) == NULL) { 1157 if (nvpage != NULL) { 1158 kmem_free(nvpage, nvpsize); 1159 } 1160 return (-2); 1161 } 1162 if (amp1 != NULL) { 1163 /* 1164 * XXX anon rwlock is not really needed because 1165 * this is a private segment and we are writers. 1166 */ 1167 ANON_LOCK_ENTER(&1->a_rwlock, RW_WRITER); 1168 ASSERT(amp1->refcnt == 1); 1169 if (anon_copy_ptr(amp1->ahp, svd1->anon_index, 1170 nahp, 0, btop(seg1->s_size), ANON_NOSLEEP)) { 1171 anon_release(nahp, btop(asize)); 1172 ANON_LOCK_EXIT(&1->a_rwlock); 1173 if (nvpage != NULL) { 1174 kmem_free(nvpage, nvpsize); 1175 } 1176 return (-2); 1177 } 1178 } 1179 if (amp2 != NULL) { 1180 ANON_LOCK_ENTER(&2->a_rwlock, RW_WRITER); 1181 ASSERT(amp2->refcnt == 1); 1182 if (anon_copy_ptr(amp2->ahp, svd2->anon_index, 1183 nahp, btop(seg1->s_size), btop(seg2->s_size), 1184 ANON_NOSLEEP)) { 1185 anon_release(nahp, btop(asize)); 1186 ANON_LOCK_EXIT(&2->a_rwlock); 1187 if (amp1 != NULL) { 1188 ANON_LOCK_EXIT(&1->a_rwlock); 1189 } 1190 if (nvpage != NULL) { 1191 kmem_free(nvpage, nvpsize); 1192 } 1193 return (-2); 1194 } 1195 } 1196 if (amp1 != NULL) { 1197 namp = amp1; 1198 anon_release(amp1->ahp, btop(amp1->size)); 1199 } 1200 if (amp2 != NULL) { 1201 if (namp == NULL) { 1202 ASSERT(amp1 == NULL); 1203 namp = 
amp2; 1204 anon_release(amp2->ahp, btop(amp2->size)); 1205 } else { 1206 amp2->refcnt--; 1207 ANON_LOCK_EXIT(&2->a_rwlock); 1208 anonmap_free(amp2); 1209 } 1210 svd2->amp = NULL; /* needed for seg_free */ 1211 } 1212 namp->ahp = nahp; 1213 namp->size = asize; 1214 svd1->amp = namp; 1215 svd1->anon_index = 0; 1216 ANON_LOCK_EXIT(&namp->a_rwlock); 1217 } 1218 /* 1219 * Now free the old vpage structures. 1220 */ 1221 if (nvpage != NULL) { 1222 if (vpage1 != NULL) { 1223 kmem_free(vpage1, vpgtob(npages1)); 1224 } 1225 if (vpage2 != NULL) { 1226 svd2->vpage = NULL; 1227 kmem_free(vpage2, vpgtob(npages2)); 1228 } 1229 if (svd2->pageprot) { 1230 svd1->pageprot = 1; 1231 } 1232 if (svd2->pageadvice) { 1233 svd1->pageadvice = 1; 1234 } 1235 if (svd2->pageswap) { 1236 svd1->pageswap = 1; 1237 } 1238 svd1->vpage = nvpage; 1239 } 1240 1241 /* all looks ok, merge segments */ 1242 svd1->swresv += svd2->swresv; 1243 svd2->swresv = 0; /* so seg_free doesn't release swap space */ 1244 size = seg2->s_size; 1245 seg_free(seg2); 1246 seg1->s_size += size; 1247 return (0); 1248 } 1249 1250 /* 1251 * Extend the previous segment (seg1) to include the 1252 * new segment (seg2 + a), if possible. 1253 * Return 0 on success. 1254 */ 1255 static int 1256 segvn_extend_prev(seg1, seg2, a, swresv) 1257 struct seg *seg1, *seg2; 1258 struct segvn_crargs *a; 1259 size_t swresv; 1260 { 1261 struct segvn_data *svd1 = (struct segvn_data *)seg1->s_data; 1262 size_t size; 1263 struct anon_map *amp1; 1264 struct vpage *new_vpage; 1265 1266 /* 1267 * We don't need any segment level locks for "segvn" data 1268 * since the address space is "write" locked. 
1269 */ 1270 ASSERT(seg1->s_as && AS_WRITE_HELD(seg1->s_as)); 1271 1272 if (HAT_IS_REGION_COOKIE_VALID(svd1->rcookie)) { 1273 return (-1); 1274 } 1275 1276 /* second segment is new, try to extend first */ 1277 /* XXX - should also check cred */ 1278 if (svd1->vp != a->vp || svd1->maxprot != a->maxprot || 1279 (!svd1->pageprot && (svd1->prot != a->prot)) || 1280 svd1->type != a->type || svd1->flags != a->flags || 1281 seg1->s_szc != a->szc || svd1->softlockcnt_send > 0) 1282 return (-1); 1283 1284 /* vp == NULL implies zfod, offset doesn't matter */ 1285 if (svd1->vp != NULL && 1286 svd1->offset + seg1->s_size != (a->offset & PAGEMASK)) 1287 return (-1); 1288 1289 if (svd1->tr_state != SEGVN_TR_OFF) { 1290 return (-1); 1291 } 1292 1293 amp1 = svd1->amp; 1294 if (amp1) { 1295 pgcnt_t newpgs; 1296 1297 /* 1298 * Segment has private pages, can data structures 1299 * be expanded? 1300 * 1301 * Acquire the anon_map lock to prevent it from changing, 1302 * if it is shared. This ensures that the anon_map 1303 * will not change while a thread which has a read/write 1304 * lock on an address space references it. 1305 * XXX - Don't need the anon_map lock at all if "refcnt" 1306 * is 1. 1307 * 1308 * Can't grow a MAP_SHARED segment with an anonmap because 1309 * there may be existing anon slots where we want to extend 1310 * the segment and we wouldn't know what to do with them 1311 * (e.g., for tmpfs right thing is to just leave them there, 1312 * for /dev/zero they should be cleared out). 
1313 */ 1314 if (svd1->type == MAP_SHARED) 1315 return (-1); 1316 1317 ANON_LOCK_ENTER(&1->a_rwlock, RW_WRITER); 1318 if (amp1->refcnt > 1) { 1319 ANON_LOCK_EXIT(&1->a_rwlock); 1320 return (-1); 1321 } 1322 newpgs = anon_grow(amp1->ahp, &svd1->anon_index, 1323 btop(seg1->s_size), btop(seg2->s_size), ANON_NOSLEEP); 1324 1325 if (newpgs == 0) { 1326 ANON_LOCK_EXIT(&1->a_rwlock); 1327 return (-1); 1328 } 1329 amp1->size = ptob(newpgs); 1330 ANON_LOCK_EXIT(&1->a_rwlock); 1331 } 1332 if (svd1->vpage != NULL) { 1333 struct vpage *vp, *evp; 1334 new_vpage = 1335 kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)), 1336 KM_NOSLEEP); 1337 if (new_vpage == NULL) 1338 return (-1); 1339 bcopy(svd1->vpage, new_vpage, vpgtob(seg_pages(seg1))); 1340 kmem_free(svd1->vpage, vpgtob(seg_pages(seg1))); 1341 svd1->vpage = new_vpage; 1342 1343 vp = new_vpage + seg_pages(seg1); 1344 evp = vp + seg_pages(seg2); 1345 for (; vp < evp; vp++) 1346 VPP_SETPROT(vp, a->prot); 1347 if (svd1->pageswap && swresv) { 1348 ASSERT(!(svd1->flags & MAP_NORESERVE)); 1349 ASSERT(swresv == seg2->s_size); 1350 vp = new_vpage + seg_pages(seg1); 1351 for (; vp < evp; vp++) { 1352 VPP_SETSWAPRES(vp); 1353 } 1354 } 1355 } 1356 ASSERT(svd1->vpage != NULL || svd1->pageswap == 0); 1357 size = seg2->s_size; 1358 seg_free(seg2); 1359 seg1->s_size += size; 1360 svd1->swresv += swresv; 1361 if (svd1->pageprot && (a->prot & PROT_WRITE) && 1362 svd1->type == MAP_SHARED && svd1->vp != NULL && 1363 (svd1->vp->v_flag & VVMEXEC)) { 1364 ASSERT(vn_is_mapped(svd1->vp, V_WRITE)); 1365 segvn_inval_trcache(svd1->vp); 1366 } 1367 return (0); 1368 } 1369 1370 /* 1371 * Extend the next segment (seg2) to include the 1372 * new segment (seg1 + a), if possible. 1373 * Return 0 on success. 
1374 */ 1375 static int 1376 segvn_extend_next( 1377 struct seg *seg1, 1378 struct seg *seg2, 1379 struct segvn_crargs *a, 1380 size_t swresv) 1381 { 1382 struct segvn_data *svd2 = (struct segvn_data *)seg2->s_data; 1383 size_t size; 1384 struct anon_map *amp2; 1385 struct vpage *new_vpage; 1386 1387 /* 1388 * We don't need any segment level locks for "segvn" data 1389 * since the address space is "write" locked. 1390 */ 1391 ASSERT(seg2->s_as && AS_WRITE_HELD(seg2->s_as)); 1392 1393 if (HAT_IS_REGION_COOKIE_VALID(svd2->rcookie)) { 1394 return (-1); 1395 } 1396 1397 /* first segment is new, try to extend second */ 1398 /* XXX - should also check cred */ 1399 if (svd2->vp != a->vp || svd2->maxprot != a->maxprot || 1400 (!svd2->pageprot && (svd2->prot != a->prot)) || 1401 svd2->type != a->type || svd2->flags != a->flags || 1402 seg2->s_szc != a->szc || svd2->softlockcnt_sbase > 0) 1403 return (-1); 1404 /* vp == NULL implies zfod, offset doesn't matter */ 1405 if (svd2->vp != NULL && 1406 (a->offset & PAGEMASK) + seg1->s_size != svd2->offset) 1407 return (-1); 1408 1409 if (svd2->tr_state != SEGVN_TR_OFF) { 1410 return (-1); 1411 } 1412 1413 amp2 = svd2->amp; 1414 if (amp2) { 1415 pgcnt_t newpgs; 1416 1417 /* 1418 * Segment has private pages, can data structures 1419 * be expanded? 1420 * 1421 * Acquire the anon_map lock to prevent it from changing, 1422 * if it is shared. This ensures that the anon_map 1423 * will not change while a thread which has a read/write 1424 * lock on an address space references it. 1425 * 1426 * XXX - Don't need the anon_map lock at all if "refcnt" 1427 * is 1. 
1428 */ 1429 if (svd2->type == MAP_SHARED) 1430 return (-1); 1431 1432 ANON_LOCK_ENTER(&2->a_rwlock, RW_WRITER); 1433 if (amp2->refcnt > 1) { 1434 ANON_LOCK_EXIT(&2->a_rwlock); 1435 return (-1); 1436 } 1437 newpgs = anon_grow(amp2->ahp, &svd2->anon_index, 1438 btop(seg2->s_size), btop(seg1->s_size), 1439 ANON_NOSLEEP | ANON_GROWDOWN); 1440 1441 if (newpgs == 0) { 1442 ANON_LOCK_EXIT(&2->a_rwlock); 1443 return (-1); 1444 } 1445 amp2->size = ptob(newpgs); 1446 ANON_LOCK_EXIT(&2->a_rwlock); 1447 } 1448 if (svd2->vpage != NULL) { 1449 struct vpage *vp, *evp; 1450 new_vpage = 1451 kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)), 1452 KM_NOSLEEP); 1453 if (new_vpage == NULL) { 1454 /* Not merging segments so adjust anon_index back */ 1455 if (amp2) 1456 svd2->anon_index += seg_pages(seg1); 1457 return (-1); 1458 } 1459 bcopy(svd2->vpage, new_vpage + seg_pages(seg1), 1460 vpgtob(seg_pages(seg2))); 1461 kmem_free(svd2->vpage, vpgtob(seg_pages(seg2))); 1462 svd2->vpage = new_vpage; 1463 1464 vp = new_vpage; 1465 evp = vp + seg_pages(seg1); 1466 for (; vp < evp; vp++) 1467 VPP_SETPROT(vp, a->prot); 1468 if (svd2->pageswap && swresv) { 1469 ASSERT(!(svd2->flags & MAP_NORESERVE)); 1470 ASSERT(swresv == seg1->s_size); 1471 vp = new_vpage; 1472 for (; vp < evp; vp++) { 1473 VPP_SETSWAPRES(vp); 1474 } 1475 } 1476 } 1477 ASSERT(svd2->vpage != NULL || svd2->pageswap == 0); 1478 size = seg1->s_size; 1479 seg_free(seg1); 1480 seg2->s_size += size; 1481 seg2->s_base -= size; 1482 svd2->offset -= size; 1483 svd2->swresv += swresv; 1484 if (svd2->pageprot && (a->prot & PROT_WRITE) && 1485 svd2->type == MAP_SHARED && svd2->vp != NULL && 1486 (svd2->vp->v_flag & VVMEXEC)) { 1487 ASSERT(vn_is_mapped(svd2->vp, V_WRITE)); 1488 segvn_inval_trcache(svd2->vp); 1489 } 1490 return (0); 1491 } 1492 1493 /* 1494 * Duplicate all the pages in the segment. This may break COW sharing for a 1495 * given page. 
If the page is marked with inherit zero set, then instead of 1496 * duplicating the page, we zero the page. 1497 */ 1498 static int 1499 segvn_dup_pages(struct seg *seg, struct seg *newseg) 1500 { 1501 int error; 1502 uint_t prot; 1503 page_t *pp; 1504 struct anon *ap, *newap; 1505 size_t i; 1506 caddr_t addr; 1507 1508 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1509 struct segvn_data *newsvd = (struct segvn_data *)newseg->s_data; 1510 ulong_t old_idx = svd->anon_index; 1511 ulong_t new_idx = 0; 1512 1513 i = btopr(seg->s_size); 1514 addr = seg->s_base; 1515 1516 /* 1517 * XXX break cow sharing using PAGESIZE 1518 * pages. They will be relocated into larger 1519 * pages at fault time. 1520 */ 1521 while (i-- > 0) { 1522 if ((ap = anon_get_ptr(svd->amp->ahp, old_idx)) != NULL) { 1523 struct vpage *vpp; 1524 1525 vpp = &svd->vpage[seg_page(seg, addr)]; 1526 1527 /* 1528 * prot need not be computed below 'cause anon_private 1529 * is going to ignore it anyway as child doesn't inherit 1530 * pagelock from parent. 1531 */ 1532 prot = svd->pageprot ? VPP_PROT(vpp) : svd->prot; 1533 1534 /* 1535 * Check whether we should zero this or dup it. 
1536 */ 1537 if (svd->svn_inz == SEGVN_INZ_ALL || 1538 (svd->svn_inz == SEGVN_INZ_VPP && 1539 VPP_ISINHZERO(vpp))) { 1540 pp = anon_zero(newseg, addr, &newap, 1541 newsvd->cred); 1542 } else { 1543 page_t *anon_pl[1+1]; 1544 uint_t vpprot; 1545 error = anon_getpage(&ap, &vpprot, anon_pl, 1546 PAGESIZE, seg, addr, S_READ, svd->cred); 1547 if (error != 0) 1548 return (error); 1549 1550 pp = anon_private(&newap, newseg, addr, prot, 1551 anon_pl[0], 0, newsvd->cred); 1552 } 1553 if (pp == NULL) { 1554 return (ENOMEM); 1555 } 1556 (void) anon_set_ptr(newsvd->amp->ahp, new_idx, newap, 1557 ANON_SLEEP); 1558 page_unlock(pp); 1559 } 1560 addr += PAGESIZE; 1561 old_idx++; 1562 new_idx++; 1563 } 1564 1565 return (0); 1566 } 1567 1568 static int 1569 segvn_dup(struct seg *seg, struct seg *newseg) 1570 { 1571 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1572 struct segvn_data *newsvd; 1573 pgcnt_t npages = seg_pages(seg); 1574 int error = 0; 1575 size_t len; 1576 struct anon_map *amp; 1577 1578 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as)); 1579 ASSERT(newseg->s_as->a_proc->p_parent == curproc); 1580 1581 /* 1582 * If segment has anon reserved, reserve more for the new seg. 1583 * For a MAP_NORESERVE segment swresv will be a count of all the 1584 * allocated anon slots; thus we reserve for the child as many slots 1585 * as the parent has allocated. This semantic prevents the child or 1586 * parent from dieing during a copy-on-write fault caused by trying 1587 * to write a shared pre-existing anon page. 
1588 */ 1589 if ((len = svd->swresv) != 0) { 1590 if (anon_resv(svd->swresv) == 0) 1591 return (ENOMEM); 1592 1593 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 1594 seg, len, 0); 1595 } 1596 1597 newsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 1598 1599 newseg->s_ops = &segvn_ops; 1600 newseg->s_data = (void *)newsvd; 1601 newseg->s_szc = seg->s_szc; 1602 1603 newsvd->seg = newseg; 1604 if ((newsvd->vp = svd->vp) != NULL) { 1605 VN_HOLD(svd->vp); 1606 if (svd->type == MAP_SHARED) 1607 lgrp_shm_policy_init(NULL, svd->vp); 1608 } 1609 newsvd->offset = svd->offset; 1610 newsvd->prot = svd->prot; 1611 newsvd->maxprot = svd->maxprot; 1612 newsvd->pageprot = svd->pageprot; 1613 newsvd->type = svd->type; 1614 newsvd->cred = svd->cred; 1615 crhold(newsvd->cred); 1616 newsvd->advice = svd->advice; 1617 newsvd->pageadvice = svd->pageadvice; 1618 newsvd->svn_inz = svd->svn_inz; 1619 newsvd->swresv = svd->swresv; 1620 newsvd->pageswap = svd->pageswap; 1621 newsvd->flags = svd->flags; 1622 newsvd->softlockcnt = 0; 1623 newsvd->softlockcnt_sbase = 0; 1624 newsvd->softlockcnt_send = 0; 1625 newsvd->policy_info = svd->policy_info; 1626 newsvd->rcookie = HAT_INVALID_REGION_COOKIE; 1627 1628 if ((amp = svd->amp) == NULL || svd->tr_state == SEGVN_TR_ON) { 1629 /* 1630 * Not attaching to a shared anon object. 
1631 */ 1632 ASSERT(!HAT_IS_REGION_COOKIE_VALID(svd->rcookie) || 1633 svd->tr_state == SEGVN_TR_OFF); 1634 if (svd->tr_state == SEGVN_TR_ON) { 1635 ASSERT(newsvd->vp != NULL && amp != NULL); 1636 newsvd->tr_state = SEGVN_TR_INIT; 1637 } else { 1638 newsvd->tr_state = svd->tr_state; 1639 } 1640 newsvd->amp = NULL; 1641 newsvd->anon_index = 0; 1642 } else { 1643 /* regions for now are only used on pure vnode segments */ 1644 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 1645 ASSERT(svd->tr_state == SEGVN_TR_OFF); 1646 newsvd->tr_state = SEGVN_TR_OFF; 1647 if (svd->type == MAP_SHARED) { 1648 ASSERT(svd->svn_inz == SEGVN_INZ_NONE); 1649 newsvd->amp = amp; 1650 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1651 amp->refcnt++; 1652 ANON_LOCK_EXIT(&->a_rwlock); 1653 newsvd->anon_index = svd->anon_index; 1654 } else { 1655 int reclaim = 1; 1656 1657 /* 1658 * Allocate and initialize new anon_map structure. 1659 */ 1660 newsvd->amp = anonmap_alloc(newseg->s_size, 0, 1661 ANON_SLEEP); 1662 newsvd->amp->a_szc = newseg->s_szc; 1663 newsvd->anon_index = 0; 1664 ASSERT(svd->svn_inz == SEGVN_INZ_NONE || 1665 svd->svn_inz == SEGVN_INZ_ALL || 1666 svd->svn_inz == SEGVN_INZ_VPP); 1667 1668 /* 1669 * We don't have to acquire the anon_map lock 1670 * for the new segment (since it belongs to an 1671 * address space that is still not associated 1672 * with any process), or the segment in the old 1673 * address space (since all threads in it 1674 * are stopped while duplicating the address space). 1675 */ 1676 1677 /* 1678 * The goal of the following code is to make sure that 1679 * softlocked pages do not end up as copy on write 1680 * pages. This would cause problems where one 1681 * thread writes to a page that is COW and a different 1682 * thread in the same process has softlocked it. The 1683 * softlock lock would move away from this process 1684 * because the write would cause this process to get 1685 * a copy (without the softlock). 
1686 * 1687 * The strategy here is to just break the 1688 * sharing on pages that could possibly be 1689 * softlocked. 1690 * 1691 * In addition, if any pages have been marked that they 1692 * should be inherited as zero, then we immediately go 1693 * ahead and break COW and zero them. In the case of a 1694 * softlocked page that should be inherited zero, we 1695 * break COW and just get a zero page. 1696 */ 1697 retry: 1698 if (svd->softlockcnt || 1699 svd->svn_inz != SEGVN_INZ_NONE) { 1700 /* 1701 * The softlock count might be non zero 1702 * because some pages are still stuck in the 1703 * cache for lazy reclaim. Flush the cache 1704 * now. This should drop the count to zero. 1705 * [or there is really I/O going on to these 1706 * pages]. Note, we have the writers lock so 1707 * nothing gets inserted during the flush. 1708 */ 1709 if (svd->softlockcnt && reclaim == 1) { 1710 segvn_purge(seg); 1711 reclaim = 0; 1712 goto retry; 1713 } 1714 1715 error = segvn_dup_pages(seg, newseg); 1716 if (error != 0) { 1717 newsvd->vpage = NULL; 1718 goto out; 1719 } 1720 } else { /* common case */ 1721 if (seg->s_szc != 0) { 1722 /* 1723 * If at least one of anon slots of a 1724 * large page exists then make sure 1725 * all anon slots of a large page 1726 * exist to avoid partial cow sharing 1727 * of a large page in the future. 1728 */ 1729 anon_dup_fill_holes(amp->ahp, 1730 svd->anon_index, newsvd->amp->ahp, 1731 0, seg->s_size, seg->s_szc, 1732 svd->vp != NULL); 1733 } else { 1734 anon_dup(amp->ahp, svd->anon_index, 1735 newsvd->amp->ahp, 0, seg->s_size); 1736 } 1737 1738 hat_clrattr(seg->s_as->a_hat, seg->s_base, 1739 seg->s_size, PROT_WRITE); 1740 } 1741 } 1742 } 1743 /* 1744 * If necessary, create a vpage structure for the new segment. 1745 * Do not copy any page lock indications. 
1746 */ 1747 if (svd->vpage != NULL) { 1748 uint_t i; 1749 struct vpage *ovp = svd->vpage; 1750 struct vpage *nvp; 1751 1752 nvp = newsvd->vpage = 1753 kmem_alloc(vpgtob(npages), KM_SLEEP); 1754 for (i = 0; i < npages; i++) { 1755 *nvp = *ovp++; 1756 VPP_CLRPPLOCK(nvp++); 1757 } 1758 } else 1759 newsvd->vpage = NULL; 1760 1761 /* Inform the vnode of the new mapping */ 1762 if (newsvd->vp != NULL) { 1763 error = VOP_ADDMAP(newsvd->vp, (offset_t)newsvd->offset, 1764 newseg->s_as, newseg->s_base, newseg->s_size, newsvd->prot, 1765 newsvd->maxprot, newsvd->type, newsvd->cred, NULL); 1766 } 1767 out: 1768 if (error == 0 && HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) { 1769 ASSERT(newsvd->amp == NULL); 1770 ASSERT(newsvd->tr_state == SEGVN_TR_OFF); 1771 newsvd->rcookie = svd->rcookie; 1772 hat_dup_region(newseg->s_as->a_hat, newsvd->rcookie); 1773 } 1774 return (error); 1775 } 1776 1777 1778 /* 1779 * callback function to invoke free_vp_pages() for only those pages actually 1780 * processed by the HAT when a shared region is destroyed. 
1781 */ 1782 extern int free_pages; 1783 1784 static void 1785 segvn_hat_rgn_unload_callback(caddr_t saddr, caddr_t eaddr, caddr_t r_saddr, 1786 size_t r_size, void *r_obj, u_offset_t r_objoff) 1787 { 1788 u_offset_t off; 1789 size_t len; 1790 vnode_t *vp = (vnode_t *)r_obj; 1791 1792 ASSERT(eaddr > saddr); 1793 ASSERT(saddr >= r_saddr); 1794 ASSERT(saddr < r_saddr + r_size); 1795 ASSERT(eaddr > r_saddr); 1796 ASSERT(eaddr <= r_saddr + r_size); 1797 ASSERT(vp != NULL); 1798 1799 if (!free_pages) { 1800 return; 1801 } 1802 1803 len = eaddr - saddr; 1804 off = (saddr - r_saddr) + r_objoff; 1805 free_vp_pages(vp, off, len); 1806 } 1807 1808 /* 1809 * callback function used by segvn_unmap to invoke free_vp_pages() for only 1810 * those pages actually processed by the HAT 1811 */ 1812 static void 1813 segvn_hat_unload_callback(hat_callback_t *cb) 1814 { 1815 struct seg *seg = cb->hcb_data; 1816 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1817 size_t len; 1818 u_offset_t off; 1819 1820 ASSERT(svd->vp != NULL); 1821 ASSERT(cb->hcb_end_addr > cb->hcb_start_addr); 1822 ASSERT(cb->hcb_start_addr >= seg->s_base); 1823 1824 len = cb->hcb_end_addr - cb->hcb_start_addr; 1825 off = cb->hcb_start_addr - seg->s_base; 1826 free_vp_pages(svd->vp, svd->offset + off, len); 1827 } 1828 1829 /* 1830 * This function determines the number of bytes of swap reserved by 1831 * a segment for which per-page accounting is present. It is used to 1832 * calculate the correct value of a segvn_data's swresv. 
1833 */ 1834 static size_t 1835 segvn_count_swap_by_vpages(struct seg *seg) 1836 { 1837 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1838 struct vpage *vp, *evp; 1839 size_t nswappages = 0; 1840 1841 ASSERT(svd->pageswap); 1842 ASSERT(svd->vpage != NULL); 1843 1844 evp = &svd->vpage[seg_page(seg, seg->s_base + seg->s_size)]; 1845 1846 for (vp = svd->vpage; vp < evp; vp++) { 1847 if (VPP_ISSWAPRES(vp)) 1848 nswappages++; 1849 } 1850 1851 return (nswappages << PAGESHIFT); 1852 } 1853 1854 static int 1855 segvn_unmap(struct seg *seg, caddr_t addr, size_t len) 1856 { 1857 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1858 struct segvn_data *nsvd; 1859 struct seg *nseg; 1860 struct anon_map *amp; 1861 pgcnt_t opages; /* old segment size in pages */ 1862 pgcnt_t npages; /* new segment size in pages */ 1863 pgcnt_t dpages; /* pages being deleted (unmapped) */ 1864 hat_callback_t callback; /* used for free_vp_pages() */ 1865 hat_callback_t *cbp = NULL; 1866 caddr_t nbase; 1867 size_t nsize; 1868 size_t oswresv; 1869 int reclaim = 1; 1870 1871 /* 1872 * We don't need any segment level locks for "segvn" data 1873 * since the address space is "write" locked. 1874 */ 1875 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as)); 1876 1877 /* 1878 * Fail the unmap if pages are SOFTLOCKed through this mapping. 1879 * softlockcnt is protected from change by the as write lock. 1880 */ 1881 retry: 1882 if (svd->softlockcnt > 0) { 1883 ASSERT(svd->tr_state == SEGVN_TR_OFF); 1884 1885 /* 1886 * If this is shared segment non 0 softlockcnt 1887 * means locked pages are still in use. 1888 */ 1889 if (svd->type == MAP_SHARED) { 1890 return (EAGAIN); 1891 } 1892 1893 /* 1894 * since we do have the writers lock nobody can fill 1895 * the cache during the purge. The flush either succeeds 1896 * or we still have pending I/Os. 
1897 */ 1898 if (reclaim == 1) { 1899 segvn_purge(seg); 1900 reclaim = 0; 1901 goto retry; 1902 } 1903 return (EAGAIN); 1904 } 1905 1906 /* 1907 * Check for bad sizes 1908 */ 1909 if (addr < seg->s_base || addr + len > seg->s_base + seg->s_size || 1910 (len & PAGEOFFSET) || ((uintptr_t)addr & PAGEOFFSET)) { 1911 panic("segvn_unmap"); 1912 /*NOTREACHED*/ 1913 } 1914 1915 if (seg->s_szc != 0) { 1916 size_t pgsz = page_get_pagesize(seg->s_szc); 1917 int err; 1918 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) { 1919 ASSERT(seg->s_base != addr || seg->s_size != len); 1920 if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) { 1921 ASSERT(svd->amp == NULL); 1922 ASSERT(svd->tr_state == SEGVN_TR_OFF); 1923 hat_leave_region(seg->s_as->a_hat, 1924 svd->rcookie, HAT_REGION_TEXT); 1925 svd->rcookie = HAT_INVALID_REGION_COOKIE; 1926 /* 1927 * could pass a flag to segvn_demote_range() 1928 * below to tell it not to do any unloads but 1929 * this case is rare enough to not bother for 1930 * now. 1931 */ 1932 } else if (svd->tr_state == SEGVN_TR_INIT) { 1933 svd->tr_state = SEGVN_TR_OFF; 1934 } else if (svd->tr_state == SEGVN_TR_ON) { 1935 ASSERT(svd->amp != NULL); 1936 segvn_textunrepl(seg, 1); 1937 ASSERT(svd->amp == NULL); 1938 ASSERT(svd->tr_state == SEGVN_TR_OFF); 1939 } 1940 VM_STAT_ADD(segvnvmstats.demoterange[0]); 1941 err = segvn_demote_range(seg, addr, len, SDR_END, 0); 1942 if (err == 0) { 1943 return (IE_RETRY); 1944 } 1945 return (err); 1946 } 1947 } 1948 1949 /* Inform the vnode of the unmapping. */ 1950 if (svd->vp) { 1951 int error; 1952 1953 error = VOP_DELMAP(svd->vp, 1954 (offset_t)svd->offset + (uintptr_t)(addr - seg->s_base), 1955 seg->s_as, addr, len, svd->prot, svd->maxprot, 1956 svd->type, svd->cred, NULL); 1957 1958 if (error == EAGAIN) 1959 return (error); 1960 } 1961 1962 /* 1963 * Remove any page locks set through this mapping. 1964 * If text replication is not off no page locks could have been 1965 * established via this mapping. 
1966 */ 1967 if (svd->tr_state == SEGVN_TR_OFF) { 1968 (void) segvn_lockop(seg, addr, len, 0, MC_UNLOCK, NULL, 0); 1969 } 1970 1971 if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) { 1972 ASSERT(svd->amp == NULL); 1973 ASSERT(svd->tr_state == SEGVN_TR_OFF); 1974 ASSERT(svd->type == MAP_PRIVATE); 1975 hat_leave_region(seg->s_as->a_hat, svd->rcookie, 1976 HAT_REGION_TEXT); 1977 svd->rcookie = HAT_INVALID_REGION_COOKIE; 1978 } else if (svd->tr_state == SEGVN_TR_ON) { 1979 ASSERT(svd->amp != NULL); 1980 ASSERT(svd->pageprot == 0 && !(svd->prot & PROT_WRITE)); 1981 segvn_textunrepl(seg, 1); 1982 ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF); 1983 } else { 1984 if (svd->tr_state != SEGVN_TR_OFF) { 1985 ASSERT(svd->tr_state == SEGVN_TR_INIT); 1986 svd->tr_state = SEGVN_TR_OFF; 1987 } 1988 /* 1989 * Unload any hardware translations in the range to be taken 1990 * out. Use a callback to invoke free_vp_pages() effectively. 1991 */ 1992 if (svd->vp != NULL && free_pages != 0) { 1993 callback.hcb_data = seg; 1994 callback.hcb_function = segvn_hat_unload_callback; 1995 cbp = &callback; 1996 } 1997 hat_unload_callback(seg->s_as->a_hat, addr, len, 1998 HAT_UNLOAD_UNMAP, cbp); 1999 2000 if (svd->type == MAP_SHARED && svd->vp != NULL && 2001 (svd->vp->v_flag & VVMEXEC) && 2002 ((svd->prot & PROT_WRITE) || svd->pageprot)) { 2003 segvn_inval_trcache(svd->vp); 2004 } 2005 } 2006 2007 /* 2008 * Check for entire segment 2009 */ 2010 if (addr == seg->s_base && len == seg->s_size) { 2011 seg_free(seg); 2012 return (0); 2013 } 2014 2015 opages = seg_pages(seg); 2016 dpages = btop(len); 2017 npages = opages - dpages; 2018 amp = svd->amp; 2019 ASSERT(amp == NULL || amp->a_szc >= seg->s_szc); 2020 2021 /* 2022 * Check for beginning of segment 2023 */ 2024 if (addr == seg->s_base) { 2025 if (svd->vpage != NULL) { 2026 size_t nbytes; 2027 struct vpage *ovpage; 2028 2029 ovpage = svd->vpage; /* keep pointer to vpage */ 2030 2031 nbytes = vpgtob(npages); 2032 svd->vpage = 
kmem_alloc(nbytes, KM_SLEEP); 2033 bcopy(&ovpage[dpages], svd->vpage, nbytes); 2034 2035 /* free up old vpage */ 2036 kmem_free(ovpage, vpgtob(opages)); 2037 } 2038 if (amp != NULL) { 2039 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 2040 if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { 2041 /* 2042 * Shared anon map is no longer in use. Before 2043 * freeing its pages purge all entries from 2044 * pcache that belong to this amp. 2045 */ 2046 if (svd->type == MAP_SHARED) { 2047 ASSERT(amp->refcnt == 1); 2048 ASSERT(svd->softlockcnt == 0); 2049 anonmap_purge(amp); 2050 } 2051 /* 2052 * Free up now unused parts of anon_map array. 2053 */ 2054 if (amp->a_szc == seg->s_szc) { 2055 if (seg->s_szc != 0) { 2056 anon_free_pages(amp->ahp, 2057 svd->anon_index, len, 2058 seg->s_szc); 2059 } else { 2060 anon_free(amp->ahp, 2061 svd->anon_index, 2062 len); 2063 } 2064 } else { 2065 ASSERT(svd->type == MAP_SHARED); 2066 ASSERT(amp->a_szc > seg->s_szc); 2067 anon_shmap_free_pages(amp, 2068 svd->anon_index, len); 2069 } 2070 2071 /* 2072 * Unreserve swap space for the 2073 * unmapped chunk of this segment in 2074 * case it's MAP_SHARED 2075 */ 2076 if (svd->type == MAP_SHARED) { 2077 anon_unresv_zone(len, 2078 seg->s_as->a_proc->p_zone); 2079 amp->swresv -= len; 2080 } 2081 } 2082 ANON_LOCK_EXIT(&->a_rwlock); 2083 svd->anon_index += dpages; 2084 } 2085 if (svd->vp != NULL) 2086 svd->offset += len; 2087 2088 seg->s_base += len; 2089 seg->s_size -= len; 2090 2091 if (svd->swresv) { 2092 if (svd->flags & MAP_NORESERVE) { 2093 ASSERT(amp); 2094 oswresv = svd->swresv; 2095 2096 svd->swresv = ptob(anon_pages(amp->ahp, 2097 svd->anon_index, npages)); 2098 anon_unresv_zone(oswresv - svd->swresv, 2099 seg->s_as->a_proc->p_zone); 2100 if (SEG_IS_PARTIAL_RESV(seg)) 2101 seg->s_as->a_resvsize -= oswresv - 2102 svd->swresv; 2103 } else { 2104 size_t unlen; 2105 2106 if (svd->pageswap) { 2107 oswresv = svd->swresv; 2108 svd->swresv = 2109 segvn_count_swap_by_vpages(seg); 2110 ASSERT(oswresv >= 
svd->swresv); 2111 unlen = oswresv - svd->swresv; 2112 } else { 2113 svd->swresv -= len; 2114 ASSERT(svd->swresv == seg->s_size); 2115 unlen = len; 2116 } 2117 anon_unresv_zone(unlen, 2118 seg->s_as->a_proc->p_zone); 2119 } 2120 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 2121 seg, len, 0); 2122 } 2123 2124 return (0); 2125 } 2126 2127 /* 2128 * Check for end of segment 2129 */ 2130 if (addr + len == seg->s_base + seg->s_size) { 2131 if (svd->vpage != NULL) { 2132 size_t nbytes; 2133 struct vpage *ovpage; 2134 2135 ovpage = svd->vpage; /* keep pointer to vpage */ 2136 2137 nbytes = vpgtob(npages); 2138 svd->vpage = kmem_alloc(nbytes, KM_SLEEP); 2139 bcopy(ovpage, svd->vpage, nbytes); 2140 2141 /* free up old vpage */ 2142 kmem_free(ovpage, vpgtob(opages)); 2143 2144 } 2145 if (amp != NULL) { 2146 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 2147 if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { 2148 /* 2149 * Free up now unused parts of anon_map array. 2150 */ 2151 ulong_t an_idx = svd->anon_index + npages; 2152 2153 /* 2154 * Shared anon map is no longer in use. Before 2155 * freeing its pages purge all entries from 2156 * pcache that belong to this amp. 
2157 */ 2158 if (svd->type == MAP_SHARED) { 2159 ASSERT(amp->refcnt == 1); 2160 ASSERT(svd->softlockcnt == 0); 2161 anonmap_purge(amp); 2162 } 2163 2164 if (amp->a_szc == seg->s_szc) { 2165 if (seg->s_szc != 0) { 2166 anon_free_pages(amp->ahp, 2167 an_idx, len, 2168 seg->s_szc); 2169 } else { 2170 anon_free(amp->ahp, an_idx, 2171 len); 2172 } 2173 } else { 2174 ASSERT(svd->type == MAP_SHARED); 2175 ASSERT(amp->a_szc > seg->s_szc); 2176 anon_shmap_free_pages(amp, 2177 an_idx, len); 2178 } 2179 2180 /* 2181 * Unreserve swap space for the 2182 * unmapped chunk of this segment in 2183 * case it's MAP_SHARED 2184 */ 2185 if (svd->type == MAP_SHARED) { 2186 anon_unresv_zone(len, 2187 seg->s_as->a_proc->p_zone); 2188 amp->swresv -= len; 2189 } 2190 } 2191 ANON_LOCK_EXIT(&->a_rwlock); 2192 } 2193 2194 seg->s_size -= len; 2195 2196 if (svd->swresv) { 2197 if (svd->flags & MAP_NORESERVE) { 2198 ASSERT(amp); 2199 oswresv = svd->swresv; 2200 svd->swresv = ptob(anon_pages(amp->ahp, 2201 svd->anon_index, npages)); 2202 anon_unresv_zone(oswresv - svd->swresv, 2203 seg->s_as->a_proc->p_zone); 2204 if (SEG_IS_PARTIAL_RESV(seg)) 2205 seg->s_as->a_resvsize -= oswresv - 2206 svd->swresv; 2207 } else { 2208 size_t unlen; 2209 2210 if (svd->pageswap) { 2211 oswresv = svd->swresv; 2212 svd->swresv = 2213 segvn_count_swap_by_vpages(seg); 2214 ASSERT(oswresv >= svd->swresv); 2215 unlen = oswresv - svd->swresv; 2216 } else { 2217 svd->swresv -= len; 2218 ASSERT(svd->swresv == seg->s_size); 2219 unlen = len; 2220 } 2221 anon_unresv_zone(unlen, 2222 seg->s_as->a_proc->p_zone); 2223 } 2224 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 2225 "anon proc:%p %lu %u", seg, len, 0); 2226 } 2227 2228 return (0); 2229 } 2230 2231 /* 2232 * The section to go is in the middle of the segment, 2233 * have to make it into two segments. nseg is made for 2234 * the high end while seg is cut down at the low end. 
2235 */ 2236 nbase = addr + len; /* new seg base */ 2237 nsize = (seg->s_base + seg->s_size) - nbase; /* new seg size */ 2238 seg->s_size = addr - seg->s_base; /* shrink old seg */ 2239 nseg = seg_alloc(seg->s_as, nbase, nsize); 2240 if (nseg == NULL) { 2241 panic("segvn_unmap seg_alloc"); 2242 /*NOTREACHED*/ 2243 } 2244 nseg->s_ops = seg->s_ops; 2245 nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 2246 nseg->s_data = (void *)nsvd; 2247 nseg->s_szc = seg->s_szc; 2248 *nsvd = *svd; 2249 nsvd->seg = nseg; 2250 nsvd->offset = svd->offset + (uintptr_t)(nseg->s_base - seg->s_base); 2251 nsvd->swresv = 0; 2252 nsvd->softlockcnt = 0; 2253 nsvd->softlockcnt_sbase = 0; 2254 nsvd->softlockcnt_send = 0; 2255 nsvd->svn_inz = svd->svn_inz; 2256 ASSERT(nsvd->rcookie == HAT_INVALID_REGION_COOKIE); 2257 2258 if (svd->vp != NULL) { 2259 VN_HOLD(nsvd->vp); 2260 if (nsvd->type == MAP_SHARED) 2261 lgrp_shm_policy_init(NULL, nsvd->vp); 2262 } 2263 crhold(svd->cred); 2264 2265 if (svd->vpage == NULL) { 2266 nsvd->vpage = NULL; 2267 } else { 2268 /* need to split vpage into two arrays */ 2269 size_t nbytes; 2270 struct vpage *ovpage; 2271 2272 ovpage = svd->vpage; /* keep pointer to vpage */ 2273 2274 npages = seg_pages(seg); /* seg has shrunk */ 2275 nbytes = vpgtob(npages); 2276 svd->vpage = kmem_alloc(nbytes, KM_SLEEP); 2277 2278 bcopy(ovpage, svd->vpage, nbytes); 2279 2280 npages = seg_pages(nseg); 2281 nbytes = vpgtob(npages); 2282 nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP); 2283 2284 bcopy(&ovpage[opages - npages], nsvd->vpage, nbytes); 2285 2286 /* free up old vpage */ 2287 kmem_free(ovpage, vpgtob(opages)); 2288 } 2289 2290 if (amp == NULL) { 2291 nsvd->amp = NULL; 2292 nsvd->anon_index = 0; 2293 } else { 2294 /* 2295 * Need to create a new anon map for the new segment. 2296 * We'll also allocate a new smaller array for the old 2297 * smaller segment to save space. 
2298 */ 2299 opages = btop((uintptr_t)(addr - seg->s_base)); 2300 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 2301 if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { 2302 /* 2303 * Free up now unused parts of anon_map array. 2304 */ 2305 ulong_t an_idx = svd->anon_index + opages; 2306 2307 /* 2308 * Shared anon map is no longer in use. Before 2309 * freeing its pages purge all entries from 2310 * pcache that belong to this amp. 2311 */ 2312 if (svd->type == MAP_SHARED) { 2313 ASSERT(amp->refcnt == 1); 2314 ASSERT(svd->softlockcnt == 0); 2315 anonmap_purge(amp); 2316 } 2317 2318 if (amp->a_szc == seg->s_szc) { 2319 if (seg->s_szc != 0) { 2320 anon_free_pages(amp->ahp, an_idx, len, 2321 seg->s_szc); 2322 } else { 2323 anon_free(amp->ahp, an_idx, 2324 len); 2325 } 2326 } else { 2327 ASSERT(svd->type == MAP_SHARED); 2328 ASSERT(amp->a_szc > seg->s_szc); 2329 anon_shmap_free_pages(amp, an_idx, len); 2330 } 2331 2332 /* 2333 * Unreserve swap space for the 2334 * unmapped chunk of this segment in 2335 * case it's MAP_SHARED 2336 */ 2337 if (svd->type == MAP_SHARED) { 2338 anon_unresv_zone(len, 2339 seg->s_as->a_proc->p_zone); 2340 amp->swresv -= len; 2341 } 2342 } 2343 nsvd->anon_index = svd->anon_index + 2344 btop((uintptr_t)(nseg->s_base - seg->s_base)); 2345 if (svd->type == MAP_SHARED) { 2346 amp->refcnt++; 2347 nsvd->amp = amp; 2348 } else { 2349 struct anon_map *namp; 2350 struct anon_hdr *nahp; 2351 2352 ASSERT(svd->type == MAP_PRIVATE); 2353 nahp = anon_create(btop(seg->s_size), ANON_SLEEP); 2354 namp = anonmap_alloc(nseg->s_size, 0, ANON_SLEEP); 2355 namp->a_szc = seg->s_szc; 2356 (void) anon_copy_ptr(amp->ahp, svd->anon_index, nahp, 2357 0, btop(seg->s_size), ANON_SLEEP); 2358 (void) anon_copy_ptr(amp->ahp, nsvd->anon_index, 2359 namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP); 2360 anon_release(amp->ahp, btop(amp->size)); 2361 svd->anon_index = 0; 2362 nsvd->anon_index = 0; 2363 amp->ahp = nahp; 2364 amp->size = seg->s_size; 2365 nsvd->amp = namp; 2366 } 2367 
ANON_LOCK_EXIT(&->a_rwlock); 2368 } 2369 if (svd->swresv) { 2370 if (svd->flags & MAP_NORESERVE) { 2371 ASSERT(amp); 2372 oswresv = svd->swresv; 2373 svd->swresv = ptob(anon_pages(amp->ahp, 2374 svd->anon_index, btop(seg->s_size))); 2375 nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp, 2376 nsvd->anon_index, btop(nseg->s_size))); 2377 ASSERT(oswresv >= (svd->swresv + nsvd->swresv)); 2378 anon_unresv_zone(oswresv - (svd->swresv + nsvd->swresv), 2379 seg->s_as->a_proc->p_zone); 2380 if (SEG_IS_PARTIAL_RESV(seg)) 2381 seg->s_as->a_resvsize -= oswresv - 2382 (svd->swresv + nsvd->swresv); 2383 } else { 2384 size_t unlen; 2385 2386 if (svd->pageswap) { 2387 oswresv = svd->swresv; 2388 svd->swresv = segvn_count_swap_by_vpages(seg); 2389 nsvd->swresv = segvn_count_swap_by_vpages(nseg); 2390 ASSERT(oswresv >= (svd->swresv + nsvd->swresv)); 2391 unlen = oswresv - (svd->swresv + nsvd->swresv); 2392 } else { 2393 if (seg->s_size + nseg->s_size + len != 2394 svd->swresv) { 2395 panic("segvn_unmap: cannot split " 2396 "swap reservation"); 2397 /*NOTREACHED*/ 2398 } 2399 svd->swresv = seg->s_size; 2400 nsvd->swresv = nseg->s_size; 2401 unlen = len; 2402 } 2403 anon_unresv_zone(unlen, 2404 seg->s_as->a_proc->p_zone); 2405 } 2406 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 2407 seg, len, 0); 2408 } 2409 2410 return (0); /* I'm glad that's all over with! */ 2411 } 2412 2413 static void 2414 segvn_free(struct seg *seg) 2415 { 2416 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 2417 pgcnt_t npages = seg_pages(seg); 2418 struct anon_map *amp; 2419 size_t len; 2420 2421 /* 2422 * We don't need any segment level locks for "segvn" data 2423 * since the address space is "write" locked. 2424 */ 2425 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as)); 2426 ASSERT(svd->tr_state == SEGVN_TR_OFF); 2427 2428 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 2429 2430 /* 2431 * Be sure to unlock pages. XXX Why do things get free'ed instead 2432 * of unmapped? 
XXX 2433 */ 2434 (void) segvn_lockop(seg, seg->s_base, seg->s_size, 2435 0, MC_UNLOCK, NULL, 0); 2436 2437 /* 2438 * Deallocate the vpage and anon pointers if necessary and possible. 2439 */ 2440 if (svd->vpage != NULL) { 2441 kmem_free(svd->vpage, vpgtob(npages)); 2442 svd->vpage = NULL; 2443 } 2444 if ((amp = svd->amp) != NULL) { 2445 /* 2446 * If there are no more references to this anon_map 2447 * structure, then deallocate the structure after freeing 2448 * up all the anon slot pointers that we can. 2449 */ 2450 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 2451 ASSERT(amp->a_szc >= seg->s_szc); 2452 if (--amp->refcnt == 0) { 2453 if (svd->type == MAP_PRIVATE) { 2454 /* 2455 * Private - we only need to anon_free 2456 * the part that this segment refers to. 2457 */ 2458 if (seg->s_szc != 0) { 2459 anon_free_pages(amp->ahp, 2460 svd->anon_index, seg->s_size, 2461 seg->s_szc); 2462 } else { 2463 anon_free(amp->ahp, svd->anon_index, 2464 seg->s_size); 2465 } 2466 } else { 2467 2468 /* 2469 * Shared anon map is no longer in use. Before 2470 * freeing its pages purge all entries from 2471 * pcache that belong to this amp. 2472 */ 2473 ASSERT(svd->softlockcnt == 0); 2474 anonmap_purge(amp); 2475 2476 /* 2477 * Shared - anon_free the entire 2478 * anon_map's worth of stuff and 2479 * release any swap reservation. 2480 */ 2481 if (amp->a_szc != 0) { 2482 anon_shmap_free_pages(amp, 0, 2483 amp->size); 2484 } else { 2485 anon_free(amp->ahp, 0, amp->size); 2486 } 2487 if ((len = amp->swresv) != 0) { 2488 anon_unresv_zone(len, 2489 seg->s_as->a_proc->p_zone); 2490 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 2491 "anon proc:%p %lu %u", seg, len, 0); 2492 } 2493 } 2494 svd->amp = NULL; 2495 ANON_LOCK_EXIT(&->a_rwlock); 2496 anonmap_free(amp); 2497 } else if (svd->type == MAP_PRIVATE) { 2498 /* 2499 * We had a private mapping which still has 2500 * a held anon_map so just free up all the 2501 * anon slot pointers that we were using. 
2502 */ 2503 if (seg->s_szc != 0) { 2504 anon_free_pages(amp->ahp, svd->anon_index, 2505 seg->s_size, seg->s_szc); 2506 } else { 2507 anon_free(amp->ahp, svd->anon_index, 2508 seg->s_size); 2509 } 2510 ANON_LOCK_EXIT(&->a_rwlock); 2511 } else { 2512 ANON_LOCK_EXIT(&->a_rwlock); 2513 } 2514 } 2515 2516 /* 2517 * Release swap reservation. 2518 */ 2519 if ((len = svd->swresv) != 0) { 2520 anon_unresv_zone(svd->swresv, 2521 seg->s_as->a_proc->p_zone); 2522 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 2523 seg, len, 0); 2524 if (SEG_IS_PARTIAL_RESV(seg)) 2525 seg->s_as->a_resvsize -= svd->swresv; 2526 svd->swresv = 0; 2527 } 2528 /* 2529 * Release claim on vnode, credentials, and finally free the 2530 * private data. 2531 */ 2532 if (svd->vp != NULL) { 2533 if (svd->type == MAP_SHARED) 2534 lgrp_shm_policy_fini(NULL, svd->vp); 2535 VN_RELE(svd->vp); 2536 svd->vp = NULL; 2537 } 2538 crfree(svd->cred); 2539 svd->pageprot = 0; 2540 svd->pageadvice = 0; 2541 svd->pageswap = 0; 2542 svd->cred = NULL; 2543 2544 /* 2545 * Take segfree_syncmtx lock to let segvn_reclaim() finish if it's 2546 * still working with this segment without holding as lock (in case 2547 * it's called by pcache async thread). 2548 */ 2549 ASSERT(svd->softlockcnt == 0); 2550 mutex_enter(&svd->segfree_syncmtx); 2551 mutex_exit(&svd->segfree_syncmtx); 2552 2553 seg->s_data = NULL; 2554 kmem_cache_free(segvn_cache, svd); 2555 } 2556 2557 /* 2558 * Do a F_SOFTUNLOCK call over the range requested. The range must have 2559 * already been F_SOFTLOCK'ed. 2560 * Caller must always match addr and len of a softunlock with a previous 2561 * softlock with exactly the same addr and len. 
2562 */ 2563 static void 2564 segvn_softunlock(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw) 2565 { 2566 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 2567 page_t *pp; 2568 caddr_t adr; 2569 struct vnode *vp; 2570 u_offset_t offset; 2571 ulong_t anon_index; 2572 struct anon_map *amp; 2573 struct anon *ap = NULL; 2574 2575 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); 2576 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 2577 2578 if ((amp = svd->amp) != NULL) 2579 anon_index = svd->anon_index + seg_page(seg, addr); 2580 2581 if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) { 2582 ASSERT(svd->tr_state == SEGVN_TR_OFF); 2583 hat_unlock_region(seg->s_as->a_hat, addr, len, svd->rcookie); 2584 } else { 2585 hat_unlock(seg->s_as->a_hat, addr, len); 2586 } 2587 for (adr = addr; adr < addr + len; adr += PAGESIZE) { 2588 if (amp != NULL) { 2589 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 2590 if ((ap = anon_get_ptr(amp->ahp, anon_index++)) 2591 != NULL) { 2592 swap_xlate(ap, &vp, &offset); 2593 } else { 2594 vp = svd->vp; 2595 offset = svd->offset + 2596 (uintptr_t)(adr - seg->s_base); 2597 } 2598 ANON_LOCK_EXIT(&->a_rwlock); 2599 } else { 2600 vp = svd->vp; 2601 offset = svd->offset + 2602 (uintptr_t)(adr - seg->s_base); 2603 } 2604 2605 /* 2606 * Use page_find() instead of page_lookup() to 2607 * find the page since we know that it is locked. 
2608 */ 2609 pp = page_find(vp, offset); 2610 if (pp == NULL) { 2611 panic( 2612 "segvn_softunlock: addr %p, ap %p, vp %p, off %llx", 2613 (void *)adr, (void *)ap, (void *)vp, offset); 2614 /*NOTREACHED*/ 2615 } 2616 2617 if (rw == S_WRITE) { 2618 hat_setrefmod(pp); 2619 if (seg->s_as->a_vbits) 2620 hat_setstat(seg->s_as, adr, PAGESIZE, 2621 P_REF | P_MOD); 2622 } else if (rw != S_OTHER) { 2623 hat_setref(pp); 2624 if (seg->s_as->a_vbits) 2625 hat_setstat(seg->s_as, adr, PAGESIZE, P_REF); 2626 } 2627 TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT, 2628 "segvn_fault:pp %p vp %p offset %llx", pp, vp, offset); 2629 page_unlock(pp); 2630 } 2631 ASSERT(svd->softlockcnt >= btop(len)); 2632 if (!atomic_add_long_nv((ulong_t *)&svd->softlockcnt, -btop(len))) { 2633 /* 2634 * All SOFTLOCKS are gone. Wakeup any waiting 2635 * unmappers so they can try again to unmap. 2636 * Check for waiters first without the mutex 2637 * held so we don't always grab the mutex on 2638 * softunlocks. 2639 */ 2640 if (AS_ISUNMAPWAIT(seg->s_as)) { 2641 mutex_enter(&seg->s_as->a_contents); 2642 if (AS_ISUNMAPWAIT(seg->s_as)) { 2643 AS_CLRUNMAPWAIT(seg->s_as); 2644 cv_broadcast(&seg->s_as->a_cv); 2645 } 2646 mutex_exit(&seg->s_as->a_contents); 2647 } 2648 } 2649 } 2650 2651 #define PAGE_HANDLED ((page_t *)-1) 2652 2653 /* 2654 * Release all the pages in the NULL terminated ppp list 2655 * which haven't already been converted to PAGE_HANDLED. 2656 */ 2657 static void 2658 segvn_pagelist_rele(page_t **ppp) 2659 { 2660 for (; *ppp != NULL; ppp++) { 2661 if (*ppp != PAGE_HANDLED) 2662 page_unlock(*ppp); 2663 } 2664 } 2665 2666 static int stealcow = 1; 2667 2668 /* 2669 * Workaround for viking chip bug. See bug id 1220902. 2670 * To fix this down in pagefault() would require importing so 2671 * much as and segvn code as to be unmaintainable. 2672 */ 2673 int enable_mbit_wa = 0; 2674 2675 /* 2676 * Handles all the dirty work of getting the right 2677 * anonymous pages and loading up the translations. 
2678 * This routine is called only from segvn_fault() 2679 * when looping over the range of addresses requested. 2680 * 2681 * The basic algorithm here is: 2682 * If this is an anon_zero case 2683 * Call anon_zero to allocate page 2684 * Load up translation 2685 * Return 2686 * endif 2687 * If this is an anon page 2688 * Use anon_getpage to get the page 2689 * else 2690 * Find page in pl[] list passed in 2691 * endif 2692 * If not a cow 2693 * Load up the translation to the page 2694 * return 2695 * endif 2696 * Call anon_private to handle cow 2697 * Load up (writable) translation to new page 2698 */ 2699 static faultcode_t 2700 segvn_faultpage( 2701 struct hat *hat, /* the hat to use for mapping */ 2702 struct seg *seg, /* seg_vn of interest */ 2703 caddr_t addr, /* address in as */ 2704 u_offset_t off, /* offset in vp */ 2705 struct vpage *vpage, /* pointer to vpage for vp, off */ 2706 page_t *pl[], /* object source page pointer */ 2707 uint_t vpprot, /* access allowed to object pages */ 2708 enum fault_type type, /* type of fault */ 2709 enum seg_rw rw, /* type of access at fault */ 2710 int brkcow) /* we may need to break cow */ 2711 { 2712 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 2713 page_t *pp, **ppp; 2714 uint_t pageflags = 0; 2715 page_t *anon_pl[1 + 1]; 2716 page_t *opp = NULL; /* original page */ 2717 uint_t prot; 2718 int err; 2719 int cow; 2720 int claim; 2721 int steal = 0; 2722 ulong_t anon_index; 2723 struct anon *ap, *oldap; 2724 struct anon_map *amp; 2725 int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD; 2726 int anon_lock = 0; 2727 anon_sync_obj_t cookie; 2728 2729 if (svd->flags & MAP_TEXT) { 2730 hat_flag |= HAT_LOAD_TEXT; 2731 } 2732 2733 ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock)); 2734 ASSERT(seg->s_szc == 0); 2735 ASSERT(svd->tr_state != SEGVN_TR_INIT); 2736 2737 /* 2738 * Initialize protection value for this page. 2739 * If we have per page protection values check it now. 
2740 */ 2741 if (svd->pageprot) { 2742 uint_t protchk; 2743 2744 switch (rw) { 2745 case S_READ: 2746 protchk = PROT_READ; 2747 break; 2748 case S_WRITE: 2749 protchk = PROT_WRITE; 2750 break; 2751 case S_EXEC: 2752 protchk = PROT_EXEC; 2753 break; 2754 case S_OTHER: 2755 default: 2756 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 2757 break; 2758 } 2759 2760 prot = VPP_PROT(vpage); 2761 if ((prot & protchk) == 0) 2762 return (FC_PROT); /* illegal access type */ 2763 } else { 2764 prot = svd->prot; 2765 } 2766 2767 if (type == F_SOFTLOCK) { 2768 atomic_inc_ulong((ulong_t *)&svd->softlockcnt); 2769 } 2770 2771 /* 2772 * Always acquire the anon array lock to prevent 2 threads from 2773 * allocating separate anon slots for the same "addr". 2774 */ 2775 2776 if ((amp = svd->amp) != NULL) { 2777 ASSERT(RW_READ_HELD(&->a_rwlock)); 2778 anon_index = svd->anon_index + seg_page(seg, addr); 2779 anon_array_enter(amp, anon_index, &cookie); 2780 anon_lock = 1; 2781 } 2782 2783 if (svd->vp == NULL && amp != NULL) { 2784 if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL) { 2785 /* 2786 * Allocate a (normally) writable anonymous page of 2787 * zeroes. If no advance reservations, reserve now. 2788 */ 2789 if (svd->flags & MAP_NORESERVE) { 2790 if (anon_resv_zone(ptob(1), 2791 seg->s_as->a_proc->p_zone)) { 2792 atomic_add_long(&svd->swresv, ptob(1)); 2793 atomic_add_long(&seg->s_as->a_resvsize, 2794 ptob(1)); 2795 } else { 2796 err = ENOMEM; 2797 goto out; 2798 } 2799 } 2800 if ((pp = anon_zero(seg, addr, &ap, 2801 svd->cred)) == NULL) { 2802 err = ENOMEM; 2803 goto out; /* out of swap space */ 2804 } 2805 /* 2806 * Re-acquire the anon_map lock and 2807 * initialize the anon array entry. 
2808 */ 2809 (void) anon_set_ptr(amp->ahp, anon_index, ap, 2810 ANON_SLEEP); 2811 2812 ASSERT(pp->p_szc == 0); 2813 2814 /* 2815 * Handle pages that have been marked for migration 2816 */ 2817 if (lgrp_optimizations()) 2818 page_migrate(seg, addr, &pp, 1); 2819 2820 if (enable_mbit_wa) { 2821 if (rw == S_WRITE) 2822 hat_setmod(pp); 2823 else if (!hat_ismod(pp)) 2824 prot &= ~PROT_WRITE; 2825 } 2826 /* 2827 * If AS_PAGLCK is set in a_flags (via memcntl(2) 2828 * with MC_LOCKAS, MCL_FUTURE) and this is a 2829 * MAP_NORESERVE segment, we may need to 2830 * permanently lock the page as it is being faulted 2831 * for the first time. The following text applies 2832 * only to MAP_NORESERVE segments: 2833 * 2834 * As per memcntl(2), if this segment was created 2835 * after MCL_FUTURE was applied (a "future" 2836 * segment), its pages must be locked. If this 2837 * segment existed at MCL_FUTURE application (a 2838 * "past" segment), the interface is unclear. 2839 * 2840 * We decide to lock only if vpage is present: 2841 * 2842 * - "future" segments will have a vpage array (see 2843 * as_map), and so will be locked as required 2844 * 2845 * - "past" segments may not have a vpage array, 2846 * depending on whether events (such as 2847 * mprotect) have occurred. Locking if vpage 2848 * exists will preserve legacy behavior. Not 2849 * locking if vpage is absent, will not break 2850 * the interface or legacy behavior. Note that 2851 * allocating vpage here if it's absent requires 2852 * upgrading the segvn reader lock, the cost of 2853 * which does not seem worthwhile. 2854 * 2855 * Usually testing and setting VPP_ISPPLOCK and 2856 * VPP_SETPPLOCK requires holding the segvn lock as 2857 * writer, but in this case all readers are 2858 * serializing on the anon array lock. 
2859 */ 2860 if (AS_ISPGLCK(seg->s_as) && vpage != NULL && 2861 (svd->flags & MAP_NORESERVE) && 2862 !VPP_ISPPLOCK(vpage)) { 2863 proc_t *p = seg->s_as->a_proc; 2864 ASSERT(svd->type == MAP_PRIVATE); 2865 mutex_enter(&p->p_lock); 2866 if (rctl_incr_locked_mem(p, NULL, PAGESIZE, 2867 1) == 0) { 2868 claim = VPP_PROT(vpage) & PROT_WRITE; 2869 if (page_pp_lock(pp, claim, 0)) { 2870 VPP_SETPPLOCK(vpage); 2871 } else { 2872 rctl_decr_locked_mem(p, NULL, 2873 PAGESIZE, 1); 2874 } 2875 } 2876 mutex_exit(&p->p_lock); 2877 } 2878 2879 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 2880 hat_memload(hat, addr, pp, prot, hat_flag); 2881 2882 if (!(hat_flag & HAT_LOAD_LOCK)) 2883 page_unlock(pp); 2884 2885 anon_array_exit(&cookie); 2886 return (0); 2887 } 2888 } 2889 2890 /* 2891 * Obtain the page structure via anon_getpage() if it is 2892 * a private copy of an object (the result of a previous 2893 * copy-on-write). 2894 */ 2895 if (amp != NULL) { 2896 if ((ap = anon_get_ptr(amp->ahp, anon_index)) != NULL) { 2897 err = anon_getpage(&ap, &vpprot, anon_pl, PAGESIZE, 2898 seg, addr, rw, svd->cred); 2899 if (err) 2900 goto out; 2901 2902 if (svd->type == MAP_SHARED) { 2903 /* 2904 * If this is a shared mapping to an 2905 * anon_map, then ignore the write 2906 * permissions returned by anon_getpage(). 2907 * They apply to the private mappings 2908 * of this anon_map. 2909 */ 2910 vpprot |= PROT_WRITE; 2911 } 2912 opp = anon_pl[0]; 2913 } 2914 } 2915 2916 /* 2917 * Search the pl[] list passed in if it is from the 2918 * original object (i.e., not a private copy). 2919 */ 2920 if (opp == NULL) { 2921 /* 2922 * Find original page. We must be bringing it in 2923 * from the list in pl[]. 
2924 */ 2925 for (ppp = pl; (opp = *ppp) != NULL; ppp++) { 2926 if (opp == PAGE_HANDLED) 2927 continue; 2928 ASSERT(opp->p_vnode == svd->vp); /* XXX */ 2929 if (opp->p_offset == off) 2930 break; 2931 } 2932 if (opp == NULL) { 2933 panic("segvn_faultpage not found"); 2934 /*NOTREACHED*/ 2935 } 2936 *ppp = PAGE_HANDLED; 2937 2938 } 2939 2940 ASSERT(PAGE_LOCKED(opp)); 2941 2942 TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT, 2943 "segvn_fault:pp %p vp %p offset %llx", opp, NULL, 0); 2944 2945 /* 2946 * The fault is treated as a copy-on-write fault if a 2947 * write occurs on a private segment and the object 2948 * page (i.e., mapping) is write protected. We assume 2949 * that fatal protection checks have already been made. 2950 */ 2951 2952 if (brkcow) { 2953 ASSERT(svd->tr_state == SEGVN_TR_OFF); 2954 cow = !(vpprot & PROT_WRITE); 2955 } else if (svd->tr_state == SEGVN_TR_ON) { 2956 /* 2957 * If we are doing text replication COW on first touch. 2958 */ 2959 ASSERT(amp != NULL); 2960 ASSERT(svd->vp != NULL); 2961 ASSERT(rw != S_WRITE); 2962 cow = (ap == NULL); 2963 } else { 2964 cow = 0; 2965 } 2966 2967 /* 2968 * If not a copy-on-write case load the translation 2969 * and return. 
2970 */ 2971 if (cow == 0) { 2972 2973 /* 2974 * Handle pages that have been marked for migration 2975 */ 2976 if (lgrp_optimizations()) 2977 page_migrate(seg, addr, &opp, 1); 2978 2979 if (IS_VMODSORT(opp->p_vnode) || enable_mbit_wa) { 2980 if (rw == S_WRITE) 2981 hat_setmod(opp); 2982 else if (rw != S_OTHER && !hat_ismod(opp)) 2983 prot &= ~PROT_WRITE; 2984 } 2985 2986 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE || 2987 (!svd->pageprot && svd->prot == (prot & vpprot))); 2988 ASSERT(amp == NULL || 2989 svd->rcookie == HAT_INVALID_REGION_COOKIE); 2990 hat_memload_region(hat, addr, opp, prot & vpprot, hat_flag, 2991 svd->rcookie); 2992 2993 if (!(hat_flag & HAT_LOAD_LOCK)) 2994 page_unlock(opp); 2995 2996 if (anon_lock) { 2997 anon_array_exit(&cookie); 2998 } 2999 return (0); 3000 } 3001 3002 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 3003 3004 hat_setref(opp); 3005 3006 ASSERT(amp != NULL && anon_lock); 3007 3008 /* 3009 * Steal the page only if it isn't a private page 3010 * since stealing a private page is not worth the effort. 3011 */ 3012 if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL) 3013 steal = 1; 3014 3015 /* 3016 * Steal the original page if the following conditions are true: 3017 * 3018 * We are low on memory, the page is not private, page is not large, 3019 * not shared, not modified, not `locked' or if we have it `locked' 3020 * (i.e., p_cowcnt == 1 and p_lckcnt == 0, which also implies 3021 * that the page is not shared) and if it doesn't have any 3022 * translations. page_struct_lock isn't needed to look at p_cowcnt 3023 * and p_lckcnt because we first get exclusive lock on page. 
3024 */ 3025 (void) hat_pagesync(opp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD); 3026 3027 if (stealcow && freemem < minfree && steal && opp->p_szc == 0 && 3028 page_tryupgrade(opp) && !hat_ismod(opp) && 3029 ((opp->p_lckcnt == 0 && opp->p_cowcnt == 0) || 3030 (opp->p_lckcnt == 0 && opp->p_cowcnt == 1 && 3031 vpage != NULL && VPP_ISPPLOCK(vpage)))) { 3032 /* 3033 * Check if this page has other translations 3034 * after unloading our translation. 3035 */ 3036 if (hat_page_is_mapped(opp)) { 3037 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 3038 hat_unload(seg->s_as->a_hat, addr, PAGESIZE, 3039 HAT_UNLOAD); 3040 } 3041 3042 /* 3043 * hat_unload() might sync back someone else's recent 3044 * modification, so check again. 3045 */ 3046 if (!hat_ismod(opp) && !hat_page_is_mapped(opp)) 3047 pageflags |= STEAL_PAGE; 3048 } 3049 3050 /* 3051 * If we have a vpage pointer, see if it indicates that we have 3052 * ``locked'' the page we map -- if so, tell anon_private to 3053 * transfer the locking resource to the new page. 3054 * 3055 * See Statement at the beginning of segvn_lockop regarding 3056 * the way lockcnts/cowcnts are handled during COW. 3057 * 3058 */ 3059 if (vpage != NULL && VPP_ISPPLOCK(vpage)) 3060 pageflags |= LOCK_PAGE; 3061 3062 /* 3063 * Allocate a private page and perform the copy. 3064 * For MAP_NORESERVE reserve swap space now, unless this 3065 * is a cow fault on an existing anon page in which case 3066 * MAP_NORESERVE will have made advance reservations. 
3067 */ 3068 if ((svd->flags & MAP_NORESERVE) && (ap == NULL)) { 3069 if (anon_resv_zone(ptob(1), seg->s_as->a_proc->p_zone)) { 3070 atomic_add_long(&svd->swresv, ptob(1)); 3071 atomic_add_long(&seg->s_as->a_resvsize, ptob(1)); 3072 } else { 3073 page_unlock(opp); 3074 err = ENOMEM; 3075 goto out; 3076 } 3077 } 3078 oldap = ap; 3079 pp = anon_private(&ap, seg, addr, prot, opp, pageflags, svd->cred); 3080 if (pp == NULL) { 3081 err = ENOMEM; /* out of swap space */ 3082 goto out; 3083 } 3084 3085 /* 3086 * If we copied away from an anonymous page, then 3087 * we are one step closer to freeing up an anon slot. 3088 * 3089 * NOTE: The original anon slot must be released while 3090 * holding the "anon_map" lock. This is necessary to prevent 3091 * other threads from obtaining a pointer to the anon slot 3092 * which may be freed if its "refcnt" is 1. 3093 */ 3094 if (oldap != NULL) 3095 anon_decref(oldap); 3096 3097 (void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP); 3098 3099 /* 3100 * Handle pages that have been marked for migration 3101 */ 3102 if (lgrp_optimizations()) 3103 page_migrate(seg, addr, &pp, 1); 3104 3105 ASSERT(pp->p_szc == 0); 3106 3107 ASSERT(!IS_VMODSORT(pp->p_vnode)); 3108 if (enable_mbit_wa) { 3109 if (rw == S_WRITE) 3110 hat_setmod(pp); 3111 else if (!hat_ismod(pp)) 3112 prot &= ~PROT_WRITE; 3113 } 3114 3115 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 3116 hat_memload(hat, addr, pp, prot, hat_flag); 3117 3118 if (!(hat_flag & HAT_LOAD_LOCK)) 3119 page_unlock(pp); 3120 3121 ASSERT(anon_lock); 3122 anon_array_exit(&cookie); 3123 return (0); 3124 out: 3125 if (anon_lock) 3126 anon_array_exit(&cookie); 3127 3128 if (type == F_SOFTLOCK) { 3129 atomic_dec_ulong((ulong_t *)&svd->softlockcnt); 3130 } 3131 return (FC_MAKE_ERR(err)); 3132 } 3133 3134 /* 3135 * relocate a bunch of smaller targ pages into one large repl page. all targ 3136 * pages must be complete pages smaller than replacement pages. 
3137 * it's assumed that no page's szc can change since they are all PAGESIZE or 3138 * complete large pages locked SHARED. 3139 */ 3140 static void 3141 segvn_relocate_pages(page_t **targ, page_t *replacement) 3142 { 3143 page_t *pp; 3144 pgcnt_t repl_npgs, curnpgs; 3145 pgcnt_t i; 3146 uint_t repl_szc = replacement->p_szc; 3147 page_t *first_repl = replacement; 3148 page_t *repl; 3149 spgcnt_t npgs; 3150 3151 VM_STAT_ADD(segvnvmstats.relocatepages[0]); 3152 3153 ASSERT(repl_szc != 0); 3154 npgs = repl_npgs = page_get_pagecnt(repl_szc); 3155 3156 i = 0; 3157 while (repl_npgs) { 3158 spgcnt_t nreloc; 3159 int err; 3160 ASSERT(replacement != NULL); 3161 pp = targ[i]; 3162 ASSERT(pp->p_szc < repl_szc); 3163 ASSERT(PAGE_EXCL(pp)); 3164 ASSERT(!PP_ISFREE(pp)); 3165 curnpgs = page_get_pagecnt(pp->p_szc); 3166 if (curnpgs == 1) { 3167 VM_STAT_ADD(segvnvmstats.relocatepages[1]); 3168 repl = replacement; 3169 page_sub(&replacement, repl); 3170 ASSERT(PAGE_EXCL(repl)); 3171 ASSERT(!PP_ISFREE(repl)); 3172 ASSERT(repl->p_szc == repl_szc); 3173 } else { 3174 page_t *repl_savepp; 3175 int j; 3176 VM_STAT_ADD(segvnvmstats.relocatepages[2]); 3177 repl_savepp = replacement; 3178 for (j = 0; j < curnpgs; j++) { 3179 repl = replacement; 3180 page_sub(&replacement, repl); 3181 ASSERT(PAGE_EXCL(repl)); 3182 ASSERT(!PP_ISFREE(repl)); 3183 ASSERT(repl->p_szc == repl_szc); 3184 ASSERT(page_pptonum(targ[i + j]) == 3185 page_pptonum(targ[i]) + j); 3186 } 3187 repl = repl_savepp; 3188 ASSERT(IS_P2ALIGNED(page_pptonum(repl), curnpgs)); 3189 } 3190 err = page_relocate(&pp, &repl, 0, 1, &nreloc, NULL); 3191 if (err || nreloc != curnpgs) { 3192 panic("segvn_relocate_pages: " 3193 "page_relocate failed err=%d curnpgs=%ld " 3194 "nreloc=%ld", err, curnpgs, nreloc); 3195 } 3196 ASSERT(curnpgs <= repl_npgs); 3197 repl_npgs -= curnpgs; 3198 i += curnpgs; 3199 } 3200 ASSERT(replacement == NULL); 3201 3202 repl = first_repl; 3203 repl_npgs = npgs; 3204 for (i = 0; i < repl_npgs; i++) { 3205 
ASSERT(PAGE_EXCL(repl)); 3206 ASSERT(!PP_ISFREE(repl)); 3207 targ[i] = repl; 3208 page_downgrade(targ[i]); 3209 repl++; 3210 } 3211 } 3212 3213 /* 3214 * Check if all pages in ppa array are complete smaller than szc pages and 3215 * their roots will still be aligned relative to their current size if the 3216 * entire ppa array is relocated into one szc page. If these conditions are 3217 * not met return 0. 3218 * 3219 * If all pages are properly aligned attempt to upgrade their locks 3220 * to exclusive mode. If it fails set *upgrdfail to 1 and return 0. 3221 * upgrdfail was set to 0 by caller. 3222 * 3223 * Return 1 if all pages are aligned and locked exclusively. 3224 * 3225 * If all pages in ppa array happen to be physically contiguous to make one 3226 * szc page and all exclusive locks are successfully obtained promote the page 3227 * size to szc and set *pszc to szc. Return 1 with pages locked shared. 3228 */ 3229 static int 3230 segvn_full_szcpages(page_t **ppa, uint_t szc, int *upgrdfail, uint_t *pszc) 3231 { 3232 page_t *pp; 3233 pfn_t pfn; 3234 pgcnt_t totnpgs = page_get_pagecnt(szc); 3235 pfn_t first_pfn; 3236 int contig = 1; 3237 pgcnt_t i; 3238 pgcnt_t j; 3239 uint_t curszc; 3240 pgcnt_t curnpgs; 3241 int root = 0; 3242 3243 ASSERT(szc > 0); 3244 3245 VM_STAT_ADD(segvnvmstats.fullszcpages[0]); 3246 3247 for (i = 0; i < totnpgs; i++) { 3248 pp = ppa[i]; 3249 ASSERT(PAGE_SHARED(pp)); 3250 ASSERT(!PP_ISFREE(pp)); 3251 pfn = page_pptonum(pp); 3252 if (i == 0) { 3253 if (!IS_P2ALIGNED(pfn, totnpgs)) { 3254 contig = 0; 3255 } else { 3256 first_pfn = pfn; 3257 } 3258 } else if (contig && pfn != first_pfn + i) { 3259 contig = 0; 3260 } 3261 if (pp->p_szc == 0) { 3262 if (root) { 3263 VM_STAT_ADD(segvnvmstats.fullszcpages[1]); 3264 return (0); 3265 } 3266 } else if (!root) { 3267 if ((curszc = pp->p_szc) >= szc) { 3268 VM_STAT_ADD(segvnvmstats.fullszcpages[2]); 3269 return (0); 3270 } 3271 if (curszc == 0) { 3272 /* 3273 * p_szc changed means we don't have all 
pages 3274 * locked. return failure. 3275 */ 3276 VM_STAT_ADD(segvnvmstats.fullszcpages[3]); 3277 return (0); 3278 } 3279 curnpgs = page_get_pagecnt(curszc); 3280 if (!IS_P2ALIGNED(pfn, curnpgs) || 3281 !IS_P2ALIGNED(i, curnpgs)) { 3282 VM_STAT_ADD(segvnvmstats.fullszcpages[4]); 3283 return (0); 3284 } 3285 root = 1; 3286 } else { 3287 ASSERT(i > 0); 3288 VM_STAT_ADD(segvnvmstats.fullszcpages[5]); 3289 if (pp->p_szc != curszc) { 3290 VM_STAT_ADD(segvnvmstats.fullszcpages[6]); 3291 return (0); 3292 } 3293 if (pfn - 1 != page_pptonum(ppa[i - 1])) { 3294 panic("segvn_full_szcpages: " 3295 "large page not physically contiguous"); 3296 } 3297 if (P2PHASE(pfn, curnpgs) == curnpgs - 1) { 3298 root = 0; 3299 } 3300 } 3301 } 3302 3303 for (i = 0; i < totnpgs; i++) { 3304 ASSERT(ppa[i]->p_szc < szc); 3305 if (!page_tryupgrade(ppa[i])) { 3306 for (j = 0; j < i; j++) { 3307 page_downgrade(ppa[j]); 3308 } 3309 *pszc = ppa[i]->p_szc; 3310 *upgrdfail = 1; 3311 VM_STAT_ADD(segvnvmstats.fullszcpages[7]); 3312 return (0); 3313 } 3314 } 3315 3316 /* 3317 * When a page is put a free cachelist its szc is set to 0. if file 3318 * system reclaimed pages from cachelist targ pages will be physically 3319 * contiguous with 0 p_szc. in this case just upgrade szc of targ 3320 * pages without any relocations. 3321 * To avoid any hat issues with previous small mappings 3322 * hat_pageunload() the target pages first. 
3323 */ 3324 if (contig) { 3325 VM_STAT_ADD(segvnvmstats.fullszcpages[8]); 3326 for (i = 0; i < totnpgs; i++) { 3327 (void) hat_pageunload(ppa[i], HAT_FORCE_PGUNLOAD); 3328 } 3329 for (i = 0; i < totnpgs; i++) { 3330 ppa[i]->p_szc = szc; 3331 } 3332 for (i = 0; i < totnpgs; i++) { 3333 ASSERT(PAGE_EXCL(ppa[i])); 3334 page_downgrade(ppa[i]); 3335 } 3336 if (pszc != NULL) { 3337 *pszc = szc; 3338 } 3339 } 3340 VM_STAT_ADD(segvnvmstats.fullszcpages[9]); 3341 return (1); 3342 } 3343 3344 /* 3345 * Create physically contiguous pages for [vp, off] - [vp, off + 3346 * page_size(szc)) range and for private segment return them in ppa array. 3347 * Pages are created either via IO or relocations. 3348 * 3349 * Return 1 on success and 0 on failure. 3350 * 3351 * If physically contiguous pages already exist for this range return 1 without 3352 * filling ppa array. Caller initializes ppa[0] as NULL to detect that ppa 3353 * array wasn't filled. In this case caller fills ppa array via VOP_GETPAGE(). 3354 */ 3355 3356 static int 3357 segvn_fill_vp_pages(struct segvn_data *svd, vnode_t *vp, u_offset_t off, 3358 uint_t szc, page_t **ppa, page_t **ppplist, uint_t *ret_pszc, 3359 int *downsize) 3360 3361 { 3362 page_t *pplist = *ppplist; 3363 size_t pgsz = page_get_pagesize(szc); 3364 pgcnt_t pages = btop(pgsz); 3365 ulong_t start_off = off; 3366 u_offset_t eoff = off + pgsz; 3367 spgcnt_t nreloc; 3368 u_offset_t io_off = off; 3369 size_t io_len; 3370 page_t *io_pplist = NULL; 3371 page_t *done_pplist = NULL; 3372 pgcnt_t pgidx = 0; 3373 page_t *pp; 3374 page_t *newpp; 3375 page_t *targpp; 3376 int io_err = 0; 3377 int i; 3378 pfn_t pfn; 3379 ulong_t ppages; 3380 page_t *targ_pplist = NULL; 3381 page_t *repl_pplist = NULL; 3382 page_t *tmp_pplist; 3383 int nios = 0; 3384 uint_t pszc; 3385 struct vattr va; 3386 3387 VM_STAT_ADD(segvnvmstats.fill_vp_pages[0]); 3388 3389 ASSERT(szc != 0); 3390 ASSERT(pplist->p_szc == szc); 3391 3392 /* 3393 * downsize will be set to 1 only if we fail to 
lock pages. this will 3394 * allow subsequent faults to try to relocate the page again. If we 3395 * fail due to misalignment don't downsize and let the caller map the 3396 * whole region with small mappings to avoid more faults into the area 3397 * where we can't get large pages anyway. 3398 */ 3399 *downsize = 0; 3400 3401 while (off < eoff) { 3402 newpp = pplist; 3403 ASSERT(newpp != NULL); 3404 ASSERT(PAGE_EXCL(newpp)); 3405 ASSERT(!PP_ISFREE(newpp)); 3406 /* 3407 * we pass NULL for nrelocp to page_lookup_create() 3408 * so that it doesn't relocate. We relocate here 3409 * later only after we make sure we can lock all 3410 * pages in the range we handle and they are all 3411 * aligned. 3412 */ 3413 pp = page_lookup_create(vp, off, SE_SHARED, newpp, NULL, 0); 3414 ASSERT(pp != NULL); 3415 ASSERT(!PP_ISFREE(pp)); 3416 ASSERT(pp->p_vnode == vp); 3417 ASSERT(pp->p_offset == off); 3418 if (pp == newpp) { 3419 VM_STAT_ADD(segvnvmstats.fill_vp_pages[1]); 3420 page_sub(&pplist, pp); 3421 ASSERT(PAGE_EXCL(pp)); 3422 ASSERT(page_iolock_assert(pp)); 3423 page_list_concat(&io_pplist, &pp); 3424 off += PAGESIZE; 3425 continue; 3426 } 3427 VM_STAT_ADD(segvnvmstats.fill_vp_pages[2]); 3428 pfn = page_pptonum(pp); 3429 pszc = pp->p_szc; 3430 if (pszc >= szc && targ_pplist == NULL && io_pplist == NULL && 3431 IS_P2ALIGNED(pfn, pages)) { 3432 ASSERT(repl_pplist == NULL); 3433 ASSERT(done_pplist == NULL); 3434 ASSERT(pplist == *ppplist); 3435 page_unlock(pp); 3436 page_free_replacement_page(pplist); 3437 page_create_putback(pages); 3438 *ppplist = NULL; 3439 VM_STAT_ADD(segvnvmstats.fill_vp_pages[3]); 3440 return (1); 3441 } 3442 if (pszc >= szc) { 3443 page_unlock(pp); 3444 segvn_faultvnmpss_align_err1++; 3445 goto out; 3446 } 3447 ppages = page_get_pagecnt(pszc); 3448 if (!IS_P2ALIGNED(pfn, ppages)) { 3449 ASSERT(pszc > 0); 3450 /* 3451 * sizing down to pszc won't help. 
3452 */ 3453 page_unlock(pp); 3454 segvn_faultvnmpss_align_err2++; 3455 goto out; 3456 } 3457 pfn = page_pptonum(newpp); 3458 if (!IS_P2ALIGNED(pfn, ppages)) { 3459 ASSERT(pszc > 0); 3460 /* 3461 * sizing down to pszc won't help. 3462 */ 3463 page_unlock(pp); 3464 segvn_faultvnmpss_align_err3++; 3465 goto out; 3466 } 3467 if (!PAGE_EXCL(pp)) { 3468 VM_STAT_ADD(segvnvmstats.fill_vp_pages[4]); 3469 page_unlock(pp); 3470 *downsize = 1; 3471 *ret_pszc = pp->p_szc; 3472 goto out; 3473 } 3474 targpp = pp; 3475 if (io_pplist != NULL) { 3476 VM_STAT_ADD(segvnvmstats.fill_vp_pages[5]); 3477 io_len = off - io_off; 3478 /* 3479 * Some file systems like NFS don't check EOF 3480 * conditions in VOP_PAGEIO(). Check it here 3481 * now that pages are locked SE_EXCL. Any file 3482 * truncation will wait until the pages are 3483 * unlocked so no need to worry that file will 3484 * be truncated after we check its size here. 3485 * XXX fix NFS to remove this check. 3486 */ 3487 va.va_mask = AT_SIZE; 3488 if (VOP_GETATTR(vp, &va, ATTR_HINT, svd->cred, NULL)) { 3489 VM_STAT_ADD(segvnvmstats.fill_vp_pages[6]); 3490 page_unlock(targpp); 3491 goto out; 3492 } 3493 if (btopr(va.va_size) < btopr(io_off + io_len)) { 3494 VM_STAT_ADD(segvnvmstats.fill_vp_pages[7]); 3495 *downsize = 1; 3496 *ret_pszc = 0; 3497 page_unlock(targpp); 3498 goto out; 3499 } 3500 io_err = VOP_PAGEIO(vp, io_pplist, io_off, io_len, 3501 B_READ, svd->cred, NULL); 3502 if (io_err) { 3503 VM_STAT_ADD(segvnvmstats.fill_vp_pages[8]); 3504 page_unlock(targpp); 3505 if (io_err == EDEADLK) { 3506 segvn_vmpss_pageio_deadlk_err++; 3507 } 3508 goto out; 3509 } 3510 nios++; 3511 VM_STAT_ADD(segvnvmstats.fill_vp_pages[9]); 3512 while (io_pplist != NULL) { 3513 pp = io_pplist; 3514 page_sub(&io_pplist, pp); 3515 ASSERT(page_iolock_assert(pp)); 3516 page_io_unlock(pp); 3517 pgidx = (pp->p_offset - start_off) >> 3518 PAGESHIFT; 3519 ASSERT(pgidx < pages); 3520 ppa[pgidx] = pp; 3521 page_list_concat(&done_pplist, &pp); 3522 } 3523 } 
3524 pp = targpp; 3525 ASSERT(PAGE_EXCL(pp)); 3526 ASSERT(pp->p_szc <= pszc); 3527 if (pszc != 0 && !group_page_trylock(pp, SE_EXCL)) { 3528 VM_STAT_ADD(segvnvmstats.fill_vp_pages[10]); 3529 page_unlock(pp); 3530 *downsize = 1; 3531 *ret_pszc = pp->p_szc; 3532 goto out; 3533 } 3534 VM_STAT_ADD(segvnvmstats.fill_vp_pages[11]); 3535 /* 3536 * page szc chould have changed before the entire group was 3537 * locked. reread page szc. 3538 */ 3539 pszc = pp->p_szc; 3540 ppages = page_get_pagecnt(pszc); 3541 3542 /* link just the roots */ 3543 page_list_concat(&targ_pplist, &pp); 3544 page_sub(&pplist, newpp); 3545 page_list_concat(&repl_pplist, &newpp); 3546 off += PAGESIZE; 3547 while (--ppages != 0) { 3548 newpp = pplist; 3549 page_sub(&pplist, newpp); 3550 off += PAGESIZE; 3551 } 3552 io_off = off; 3553 } 3554 if (io_pplist != NULL) { 3555 VM_STAT_ADD(segvnvmstats.fill_vp_pages[12]); 3556 io_len = eoff - io_off; 3557 va.va_mask = AT_SIZE; 3558 if (VOP_GETATTR(vp, &va, ATTR_HINT, svd->cred, NULL) != 0) { 3559 VM_STAT_ADD(segvnvmstats.fill_vp_pages[13]); 3560 goto out; 3561 } 3562 if (btopr(va.va_size) < btopr(io_off + io_len)) { 3563 VM_STAT_ADD(segvnvmstats.fill_vp_pages[14]); 3564 *downsize = 1; 3565 *ret_pszc = 0; 3566 goto out; 3567 } 3568 io_err = VOP_PAGEIO(vp, io_pplist, io_off, io_len, 3569 B_READ, svd->cred, NULL); 3570 if (io_err) { 3571 VM_STAT_ADD(segvnvmstats.fill_vp_pages[15]); 3572 if (io_err == EDEADLK) { 3573 segvn_vmpss_pageio_deadlk_err++; 3574 } 3575 goto out; 3576 } 3577 nios++; 3578 while (io_pplist != NULL) { 3579 pp = io_pplist; 3580 page_sub(&io_pplist, pp); 3581 ASSERT(page_iolock_assert(pp)); 3582 page_io_unlock(pp); 3583 pgidx = (pp->p_offset - start_off) >> PAGESHIFT; 3584 ASSERT(pgidx < pages); 3585 ppa[pgidx] = pp; 3586 } 3587 } 3588 /* 3589 * we're now bound to succeed or panic. 3590 * remove pages from done_pplist. it's not needed anymore. 
3591 */ 3592 while (done_pplist != NULL) { 3593 pp = done_pplist; 3594 page_sub(&done_pplist, pp); 3595 } 3596 VM_STAT_ADD(segvnvmstats.fill_vp_pages[16]); 3597 ASSERT(pplist == NULL); 3598 *ppplist = NULL; 3599 while (targ_pplist != NULL) { 3600 int ret; 3601 VM_STAT_ADD(segvnvmstats.fill_vp_pages[17]); 3602 ASSERT(repl_pplist); 3603 pp = targ_pplist; 3604 page_sub(&targ_pplist, pp); 3605 pgidx = (pp->p_offset - start_off) >> PAGESHIFT; 3606 newpp = repl_pplist; 3607 page_sub(&repl_pplist, newpp); 3608 #ifdef DEBUG 3609 pfn = page_pptonum(pp); 3610 pszc = pp->p_szc; 3611 ppages = page_get_pagecnt(pszc); 3612 ASSERT(IS_P2ALIGNED(pfn, ppages)); 3613 pfn = page_pptonum(newpp); 3614 ASSERT(IS_P2ALIGNED(pfn, ppages)); 3615 ASSERT(P2PHASE(pfn, pages) == pgidx); 3616 #endif 3617 nreloc = 0; 3618 ret = page_relocate(&pp, &newpp, 0, 1, &nreloc, NULL); 3619 if (ret != 0 || nreloc == 0) { 3620 panic("segvn_fill_vp_pages: " 3621 "page_relocate failed"); 3622 } 3623 pp = newpp; 3624 while (nreloc-- != 0) { 3625 ASSERT(PAGE_EXCL(pp)); 3626 ASSERT(pp->p_vnode == vp); 3627 ASSERT(pgidx == 3628 ((pp->p_offset - start_off) >> PAGESHIFT)); 3629 ppa[pgidx++] = pp; 3630 pp++; 3631 } 3632 } 3633 3634 if (svd->type == MAP_PRIVATE) { 3635 VM_STAT_ADD(segvnvmstats.fill_vp_pages[18]); 3636 for (i = 0; i < pages; i++) { 3637 ASSERT(ppa[i] != NULL); 3638 ASSERT(PAGE_EXCL(ppa[i])); 3639 ASSERT(ppa[i]->p_vnode == vp); 3640 ASSERT(ppa[i]->p_offset == 3641 start_off + (i << PAGESHIFT)); 3642 page_downgrade(ppa[i]); 3643 } 3644 ppa[pages] = NULL; 3645 } else { 3646 VM_STAT_ADD(segvnvmstats.fill_vp_pages[19]); 3647 /* 3648 * the caller will still call VOP_GETPAGE() for shared segments 3649 * to check FS write permissions. For private segments we map 3650 * file read only anyway. so no VOP_GETPAGE is needed. 
3651 */ 3652 for (i = 0; i < pages; i++) { 3653 ASSERT(ppa[i] != NULL); 3654 ASSERT(PAGE_EXCL(ppa[i])); 3655 ASSERT(ppa[i]->p_vnode == vp); 3656 ASSERT(ppa[i]->p_offset == 3657 start_off + (i << PAGESHIFT)); 3658 page_unlock(ppa[i]); 3659 } 3660 ppa[0] = NULL; 3661 } 3662 3663 return (1); 3664 out: 3665 /* 3666 * Do the cleanup. Unlock target pages we didn't relocate. They are 3667 * linked on targ_pplist by root pages. reassemble unused replacement 3668 * and io pages back to pplist. 3669 */ 3670 if (io_pplist != NULL) { 3671 VM_STAT_ADD(segvnvmstats.fill_vp_pages[20]); 3672 pp = io_pplist; 3673 do { 3674 ASSERT(pp->p_vnode == vp); 3675 ASSERT(pp->p_offset == io_off); 3676 ASSERT(page_iolock_assert(pp)); 3677 page_io_unlock(pp); 3678 page_hashout(pp, NULL); 3679 io_off += PAGESIZE; 3680 } while ((pp = pp->p_next) != io_pplist); 3681 page_list_concat(&io_pplist, &pplist); 3682 pplist = io_pplist; 3683 } 3684 tmp_pplist = NULL; 3685 while (targ_pplist != NULL) { 3686 VM_STAT_ADD(segvnvmstats.fill_vp_pages[21]); 3687 pp = targ_pplist; 3688 ASSERT(PAGE_EXCL(pp)); 3689 page_sub(&targ_pplist, pp); 3690 3691 pszc = pp->p_szc; 3692 ppages = page_get_pagecnt(pszc); 3693 ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages)); 3694 3695 if (pszc != 0) { 3696 group_page_unlock(pp); 3697 } 3698 page_unlock(pp); 3699 3700 pp = repl_pplist; 3701 ASSERT(pp != NULL); 3702 ASSERT(PAGE_EXCL(pp)); 3703 ASSERT(pp->p_szc == szc); 3704 page_sub(&repl_pplist, pp); 3705 3706 ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages)); 3707 3708 /* relink replacement page */ 3709 page_list_concat(&tmp_pplist, &pp); 3710 while (--ppages != 0) { 3711 VM_STAT_ADD(segvnvmstats.fill_vp_pages[22]); 3712 pp++; 3713 ASSERT(PAGE_EXCL(pp)); 3714 ASSERT(pp->p_szc == szc); 3715 page_list_concat(&tmp_pplist, &pp); 3716 } 3717 } 3718 if (tmp_pplist != NULL) { 3719 VM_STAT_ADD(segvnvmstats.fill_vp_pages[23]); 3720 page_list_concat(&tmp_pplist, &pplist); 3721 pplist = tmp_pplist; 3722 } 3723 /* 3724 * at this point all pages 
are either on done_pplist or 3725 * pplist. They can't be all on done_pplist otherwise 3726 * we'd've been done. 3727 */ 3728 ASSERT(pplist != NULL); 3729 if (nios != 0) { 3730 VM_STAT_ADD(segvnvmstats.fill_vp_pages[24]); 3731 pp = pplist; 3732 do { 3733 VM_STAT_ADD(segvnvmstats.fill_vp_pages[25]); 3734 ASSERT(pp->p_szc == szc); 3735 ASSERT(PAGE_EXCL(pp)); 3736 ASSERT(pp->p_vnode != vp); 3737 pp->p_szc = 0; 3738 } while ((pp = pp->p_next) != pplist); 3739 3740 pp = done_pplist; 3741 do { 3742 VM_STAT_ADD(segvnvmstats.fill_vp_pages[26]); 3743 ASSERT(pp->p_szc == szc); 3744 ASSERT(PAGE_EXCL(pp)); 3745 ASSERT(pp->p_vnode == vp); 3746 pp->p_szc = 0; 3747 } while ((pp = pp->p_next) != done_pplist); 3748 3749 while (pplist != NULL) { 3750 VM_STAT_ADD(segvnvmstats.fill_vp_pages[27]); 3751 pp = pplist; 3752 page_sub(&pplist, pp); 3753 page_free(pp, 0); 3754 } 3755 3756 while (done_pplist != NULL) { 3757 VM_STAT_ADD(segvnvmstats.fill_vp_pages[28]); 3758 pp = done_pplist; 3759 page_sub(&done_pplist, pp); 3760 page_unlock(pp); 3761 } 3762 *ppplist = NULL; 3763 return (0); 3764 } 3765 ASSERT(pplist == *ppplist); 3766 if (io_err) { 3767 VM_STAT_ADD(segvnvmstats.fill_vp_pages[29]); 3768 /* 3769 * don't downsize on io error. 3770 * see if vop_getpage succeeds. 3771 * pplist may still be used in this case 3772 * for relocations. 
3773 */ 3774 return (0); 3775 } 3776 VM_STAT_ADD(segvnvmstats.fill_vp_pages[30]); 3777 page_free_replacement_page(pplist); 3778 page_create_putback(pages); 3779 *ppplist = NULL; 3780 return (0); 3781 } 3782 3783 int segvn_anypgsz = 0; 3784 3785 #define SEGVN_RESTORE_SOFTLOCK_VP(type, pages) \ 3786 if ((type) == F_SOFTLOCK) { \ 3787 atomic_add_long((ulong_t *)&(svd)->softlockcnt, \ 3788 -(pages)); \ 3789 } 3790 3791 #define SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot) \ 3792 if (IS_VMODSORT((ppa)[0]->p_vnode)) { \ 3793 if ((rw) == S_WRITE) { \ 3794 for (i = 0; i < (pages); i++) { \ 3795 ASSERT((ppa)[i]->p_vnode == \ 3796 (ppa)[0]->p_vnode); \ 3797 hat_setmod((ppa)[i]); \ 3798 } \ 3799 } else if ((rw) != S_OTHER && \ 3800 ((prot) & (vpprot) & PROT_WRITE)) { \ 3801 for (i = 0; i < (pages); i++) { \ 3802 ASSERT((ppa)[i]->p_vnode == \ 3803 (ppa)[0]->p_vnode); \ 3804 if (!hat_ismod((ppa)[i])) { \ 3805 prot &= ~PROT_WRITE; \ 3806 break; \ 3807 } \ 3808 } \ 3809 } \ 3810 } 3811 3812 #ifdef VM_STATS 3813 3814 #define SEGVN_VMSTAT_FLTVNPAGES(idx) \ 3815 VM_STAT_ADD(segvnvmstats.fltvnpages[(idx)]); 3816 3817 #else /* VM_STATS */ 3818 3819 #define SEGVN_VMSTAT_FLTVNPAGES(idx) 3820 3821 #endif 3822 3823 static faultcode_t 3824 segvn_fault_vnodepages(struct hat *hat, struct seg *seg, caddr_t lpgaddr, 3825 caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr, 3826 caddr_t eaddr, int brkcow) 3827 { 3828 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 3829 struct anon_map *amp = svd->amp; 3830 uchar_t segtype = svd->type; 3831 uint_t szc = seg->s_szc; 3832 size_t pgsz = page_get_pagesize(szc); 3833 size_t maxpgsz = pgsz; 3834 pgcnt_t pages = btop(pgsz); 3835 pgcnt_t maxpages = pages; 3836 size_t ppasize = (pages + 1) * sizeof (page_t *); 3837 caddr_t a = lpgaddr; 3838 caddr_t maxlpgeaddr = lpgeaddr; 3839 u_offset_t off = svd->offset + (uintptr_t)(a - seg->s_base); 3840 ulong_t aindx = svd->anon_index + seg_page(seg, a); 3841 struct vpage *vpage = 
(svd->vpage != NULL) ? 3842 &svd->vpage[seg_page(seg, a)] : NULL; 3843 vnode_t *vp = svd->vp; 3844 page_t **ppa; 3845 uint_t pszc; 3846 size_t ppgsz; 3847 pgcnt_t ppages; 3848 faultcode_t err = 0; 3849 int ierr; 3850 int vop_size_err = 0; 3851 uint_t protchk, prot, vpprot; 3852 ulong_t i; 3853 int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD; 3854 anon_sync_obj_t an_cookie; 3855 enum seg_rw arw; 3856 int alloc_failed = 0; 3857 int adjszc_chk; 3858 struct vattr va; 3859 page_t *pplist; 3860 pfn_t pfn; 3861 int physcontig; 3862 int upgrdfail; 3863 int segvn_anypgsz_vnode = 0; /* for now map vnode with 2 page sizes */ 3864 int tron = (svd->tr_state == SEGVN_TR_ON); 3865 3866 ASSERT(szc != 0); 3867 ASSERT(vp != NULL); 3868 ASSERT(brkcow == 0 || amp != NULL); 3869 ASSERT(tron == 0 || amp != NULL); 3870 ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */ 3871 ASSERT(!(svd->flags & MAP_NORESERVE)); 3872 ASSERT(type != F_SOFTUNLOCK); 3873 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 3874 ASSERT(amp == NULL || IS_P2ALIGNED(aindx, maxpages)); 3875 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 3876 ASSERT(seg->s_szc < NBBY * sizeof (int)); 3877 ASSERT(type != F_SOFTLOCK || lpgeaddr - a == maxpgsz); 3878 ASSERT(svd->tr_state != SEGVN_TR_INIT); 3879 3880 VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltvnpages[0]); 3881 VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltvnpages[1]); 3882 3883 if (svd->flags & MAP_TEXT) { 3884 hat_flag |= HAT_LOAD_TEXT; 3885 } 3886 3887 if (svd->pageprot) { 3888 switch (rw) { 3889 case S_READ: 3890 protchk = PROT_READ; 3891 break; 3892 case S_WRITE: 3893 protchk = PROT_WRITE; 3894 break; 3895 case S_EXEC: 3896 protchk = PROT_EXEC; 3897 break; 3898 case S_OTHER: 3899 default: 3900 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 3901 break; 3902 } 3903 } else { 3904 prot = svd->prot; 3905 /* caller has already done segment level protection check. 
*/ 3906 } 3907 3908 if (rw == S_WRITE && segtype == MAP_PRIVATE) { 3909 SEGVN_VMSTAT_FLTVNPAGES(2); 3910 arw = S_READ; 3911 } else { 3912 arw = rw; 3913 } 3914 3915 ppa = kmem_alloc(ppasize, KM_SLEEP); 3916 3917 VM_STAT_COND_ADD(amp != NULL, segvnvmstats.fltvnpages[3]); 3918 3919 for (;;) { 3920 adjszc_chk = 0; 3921 for (; a < lpgeaddr; a += pgsz, off += pgsz, aindx += pages) { 3922 if (adjszc_chk) { 3923 while (szc < seg->s_szc) { 3924 uintptr_t e; 3925 uint_t tszc; 3926 tszc = segvn_anypgsz_vnode ? szc + 1 : 3927 seg->s_szc; 3928 ppgsz = page_get_pagesize(tszc); 3929 if (!IS_P2ALIGNED(a, ppgsz) || 3930 ((alloc_failed >> tszc) & 0x1)) { 3931 break; 3932 } 3933 SEGVN_VMSTAT_FLTVNPAGES(4); 3934 szc = tszc; 3935 pgsz = ppgsz; 3936 pages = btop(pgsz); 3937 e = P2ROUNDUP((uintptr_t)eaddr, pgsz); 3938 lpgeaddr = (caddr_t)e; 3939 } 3940 } 3941 3942 again: 3943 if (IS_P2ALIGNED(a, maxpgsz) && amp != NULL) { 3944 ASSERT(IS_P2ALIGNED(aindx, maxpages)); 3945 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 3946 anon_array_enter(amp, aindx, &an_cookie); 3947 if (anon_get_ptr(amp->ahp, aindx) != NULL) { 3948 SEGVN_VMSTAT_FLTVNPAGES(5); 3949 ASSERT(anon_pages(amp->ahp, aindx, 3950 maxpages) == maxpages); 3951 anon_array_exit(&an_cookie); 3952 ANON_LOCK_EXIT(&->a_rwlock); 3953 err = segvn_fault_anonpages(hat, seg, 3954 a, a + maxpgsz, type, rw, 3955 MAX(a, addr), 3956 MIN(a + maxpgsz, eaddr), brkcow); 3957 if (err != 0) { 3958 SEGVN_VMSTAT_FLTVNPAGES(6); 3959 goto out; 3960 } 3961 if (szc < seg->s_szc) { 3962 szc = seg->s_szc; 3963 pgsz = maxpgsz; 3964 pages = maxpages; 3965 lpgeaddr = maxlpgeaddr; 3966 } 3967 goto next; 3968 } else { 3969 ASSERT(anon_pages(amp->ahp, aindx, 3970 maxpages) == 0); 3971 SEGVN_VMSTAT_FLTVNPAGES(7); 3972 anon_array_exit(&an_cookie); 3973 ANON_LOCK_EXIT(&->a_rwlock); 3974 } 3975 } 3976 ASSERT(!brkcow || IS_P2ALIGNED(a, maxpgsz)); 3977 ASSERT(!tron || IS_P2ALIGNED(a, maxpgsz)); 3978 3979 if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) { 3980 ASSERT(vpage 
!= NULL); 3981 prot = VPP_PROT(vpage); 3982 ASSERT(sameprot(seg, a, maxpgsz)); 3983 if ((prot & protchk) == 0) { 3984 SEGVN_VMSTAT_FLTVNPAGES(8); 3985 err = FC_PROT; 3986 goto out; 3987 } 3988 } 3989 if (type == F_SOFTLOCK) { 3990 atomic_add_long((ulong_t *)&svd->softlockcnt, 3991 pages); 3992 } 3993 3994 pplist = NULL; 3995 physcontig = 0; 3996 ppa[0] = NULL; 3997 if (!brkcow && !tron && szc && 3998 !page_exists_physcontig(vp, off, szc, 3999 segtype == MAP_PRIVATE ? ppa : NULL)) { 4000 SEGVN_VMSTAT_FLTVNPAGES(9); 4001 if (page_alloc_pages(vp, seg, a, &pplist, NULL, 4002 szc, 0, 0) && type != F_SOFTLOCK) { 4003 SEGVN_VMSTAT_FLTVNPAGES(10); 4004 pszc = 0; 4005 ierr = -1; 4006 alloc_failed |= (1 << szc); 4007 break; 4008 } 4009 if (pplist != NULL && 4010 vp->v_mpssdata == SEGVN_PAGEIO) { 4011 int downsize; 4012 SEGVN_VMSTAT_FLTVNPAGES(11); 4013 physcontig = segvn_fill_vp_pages(svd, 4014 vp, off, szc, ppa, &pplist, 4015 &pszc, &downsize); 4016 ASSERT(!physcontig || pplist == NULL); 4017 if (!physcontig && downsize && 4018 type != F_SOFTLOCK) { 4019 ASSERT(pplist == NULL); 4020 SEGVN_VMSTAT_FLTVNPAGES(12); 4021 ierr = -1; 4022 break; 4023 } 4024 ASSERT(!physcontig || 4025 segtype == MAP_PRIVATE || 4026 ppa[0] == NULL); 4027 if (physcontig && ppa[0] == NULL) { 4028 physcontig = 0; 4029 } 4030 } 4031 } else if (!brkcow && !tron && szc && ppa[0] != NULL) { 4032 SEGVN_VMSTAT_FLTVNPAGES(13); 4033 ASSERT(segtype == MAP_PRIVATE); 4034 physcontig = 1; 4035 } 4036 4037 if (!physcontig) { 4038 SEGVN_VMSTAT_FLTVNPAGES(14); 4039 ppa[0] = NULL; 4040 ierr = VOP_GETPAGE(vp, (offset_t)off, pgsz, 4041 &vpprot, ppa, pgsz, seg, a, arw, 4042 svd->cred, NULL); 4043 #ifdef DEBUG 4044 if (ierr == 0) { 4045 for (i = 0; i < pages; i++) { 4046 ASSERT(PAGE_LOCKED(ppa[i])); 4047 ASSERT(!PP_ISFREE(ppa[i])); 4048 ASSERT(ppa[i]->p_vnode == vp); 4049 ASSERT(ppa[i]->p_offset == 4050 off + (i << PAGESHIFT)); 4051 } 4052 } 4053 #endif /* DEBUG */ 4054 if (segtype == MAP_PRIVATE) { 4055 
SEGVN_VMSTAT_FLTVNPAGES(15); 4056 vpprot &= ~PROT_WRITE; 4057 } 4058 } else { 4059 ASSERT(segtype == MAP_PRIVATE); 4060 SEGVN_VMSTAT_FLTVNPAGES(16); 4061 vpprot = PROT_ALL & ~PROT_WRITE; 4062 ierr = 0; 4063 } 4064 4065 if (ierr != 0) { 4066 SEGVN_VMSTAT_FLTVNPAGES(17); 4067 if (pplist != NULL) { 4068 SEGVN_VMSTAT_FLTVNPAGES(18); 4069 page_free_replacement_page(pplist); 4070 page_create_putback(pages); 4071 } 4072 SEGVN_RESTORE_SOFTLOCK_VP(type, pages); 4073 if (a + pgsz <= eaddr) { 4074 SEGVN_VMSTAT_FLTVNPAGES(19); 4075 err = FC_MAKE_ERR(ierr); 4076 goto out; 4077 } 4078 va.va_mask = AT_SIZE; 4079 if (VOP_GETATTR(vp, &va, 0, svd->cred, NULL)) { 4080 SEGVN_VMSTAT_FLTVNPAGES(20); 4081 err = FC_MAKE_ERR(EIO); 4082 goto out; 4083 } 4084 if (btopr(va.va_size) >= btopr(off + pgsz)) { 4085 SEGVN_VMSTAT_FLTVNPAGES(21); 4086 err = FC_MAKE_ERR(ierr); 4087 goto out; 4088 } 4089 if (btopr(va.va_size) < 4090 btopr(off + (eaddr - a))) { 4091 SEGVN_VMSTAT_FLTVNPAGES(22); 4092 err = FC_MAKE_ERR(ierr); 4093 goto out; 4094 } 4095 if (brkcow || tron || type == F_SOFTLOCK) { 4096 /* can't reduce map area */ 4097 SEGVN_VMSTAT_FLTVNPAGES(23); 4098 vop_size_err = 1; 4099 goto out; 4100 } 4101 SEGVN_VMSTAT_FLTVNPAGES(24); 4102 ASSERT(szc != 0); 4103 pszc = 0; 4104 ierr = -1; 4105 break; 4106 } 4107 4108 if (amp != NULL) { 4109 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 4110 anon_array_enter(amp, aindx, &an_cookie); 4111 } 4112 if (amp != NULL && 4113 anon_get_ptr(amp->ahp, aindx) != NULL) { 4114 ulong_t taindx = P2ALIGN(aindx, maxpages); 4115 4116 SEGVN_VMSTAT_FLTVNPAGES(25); 4117 ASSERT(anon_pages(amp->ahp, taindx, 4118 maxpages) == maxpages); 4119 for (i = 0; i < pages; i++) { 4120 page_unlock(ppa[i]); 4121 } 4122 anon_array_exit(&an_cookie); 4123 ANON_LOCK_EXIT(&->a_rwlock); 4124 if (pplist != NULL) { 4125 page_free_replacement_page(pplist); 4126 page_create_putback(pages); 4127 } 4128 SEGVN_RESTORE_SOFTLOCK_VP(type, pages); 4129 if (szc < seg->s_szc) { 4130 SEGVN_VMSTAT_FLTVNPAGES(26); 
4131 /* 4132 * For private segments SOFTLOCK 4133 * either always breaks cow (any rw 4134 * type except S_READ_NOCOW) or 4135 * address space is locked as writer 4136 * (S_READ_NOCOW case) and anon slots 4137 * can't show up on second check. 4138 * Therefore if we are here for 4139 * SOFTLOCK case it must be a cow 4140 * break but cow break never reduces 4141 * szc. text replication (tron) in 4142 * this case works as cow break. 4143 * Thus the assert below. 4144 */ 4145 ASSERT(!brkcow && !tron && 4146 type != F_SOFTLOCK); 4147 pszc = seg->s_szc; 4148 ierr = -2; 4149 break; 4150 } 4151 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 4152 goto again; 4153 } 4154 #ifdef DEBUG 4155 if (amp != NULL) { 4156 ulong_t taindx = P2ALIGN(aindx, maxpages); 4157 ASSERT(!anon_pages(amp->ahp, taindx, maxpages)); 4158 } 4159 #endif /* DEBUG */ 4160 4161 if (brkcow || tron) { 4162 ASSERT(amp != NULL); 4163 ASSERT(pplist == NULL); 4164 ASSERT(szc == seg->s_szc); 4165 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 4166 ASSERT(IS_P2ALIGNED(aindx, maxpages)); 4167 SEGVN_VMSTAT_FLTVNPAGES(27); 4168 ierr = anon_map_privatepages(amp, aindx, szc, 4169 seg, a, prot, ppa, vpage, segvn_anypgsz, 4170 tron ? PG_LOCAL : 0, svd->cred); 4171 if (ierr != 0) { 4172 SEGVN_VMSTAT_FLTVNPAGES(28); 4173 anon_array_exit(&an_cookie); 4174 ANON_LOCK_EXIT(&->a_rwlock); 4175 SEGVN_RESTORE_SOFTLOCK_VP(type, pages); 4176 err = FC_MAKE_ERR(ierr); 4177 goto out; 4178 } 4179 4180 ASSERT(!IS_VMODSORT(ppa[0]->p_vnode)); 4181 /* 4182 * p_szc can't be changed for locked 4183 * swapfs pages. 
4184 */ 4185 ASSERT(svd->rcookie == 4186 HAT_INVALID_REGION_COOKIE); 4187 hat_memload_array(hat, a, pgsz, ppa, prot, 4188 hat_flag); 4189 4190 if (!(hat_flag & HAT_LOAD_LOCK)) { 4191 SEGVN_VMSTAT_FLTVNPAGES(29); 4192 for (i = 0; i < pages; i++) { 4193 page_unlock(ppa[i]); 4194 } 4195 } 4196 anon_array_exit(&an_cookie); 4197 ANON_LOCK_EXIT(&->a_rwlock); 4198 goto next; 4199 } 4200 4201 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE || 4202 (!svd->pageprot && svd->prot == (prot & vpprot))); 4203 4204 pfn = page_pptonum(ppa[0]); 4205 /* 4206 * hat_page_demote() needs an SE_EXCL lock on one of 4207 * constituent page_t's and it decreases root's p_szc 4208 * last. This means if root's p_szc is equal szc and 4209 * all its constituent pages are locked 4210 * hat_page_demote() that could have changed p_szc to 4211 * szc is already done and no new have page_demote() 4212 * can start for this large page. 4213 */ 4214 4215 /* 4216 * we need to make sure same mapping size is used for 4217 * the same address range if there's a possibility the 4218 * adddress is already mapped because hat layer panics 4219 * when translation is loaded for the range already 4220 * mapped with a different page size. We achieve it 4221 * by always using largest page size possible subject 4222 * to the constraints of page size, segment page size 4223 * and page alignment. Since mappings are invalidated 4224 * when those constraints change and make it 4225 * impossible to use previously used mapping size no 4226 * mapping size conflicts should happen. 
4227 */ 4228 4229 chkszc: 4230 if ((pszc = ppa[0]->p_szc) == szc && 4231 IS_P2ALIGNED(pfn, pages)) { 4232 4233 SEGVN_VMSTAT_FLTVNPAGES(30); 4234 #ifdef DEBUG 4235 for (i = 0; i < pages; i++) { 4236 ASSERT(PAGE_LOCKED(ppa[i])); 4237 ASSERT(!PP_ISFREE(ppa[i])); 4238 ASSERT(page_pptonum(ppa[i]) == 4239 pfn + i); 4240 ASSERT(ppa[i]->p_szc == szc); 4241 ASSERT(ppa[i]->p_vnode == vp); 4242 ASSERT(ppa[i]->p_offset == 4243 off + (i << PAGESHIFT)); 4244 } 4245 #endif /* DEBUG */ 4246 /* 4247 * All pages are of szc we need and they are 4248 * all locked so they can't change szc. load 4249 * translations. 4250 * 4251 * if page got promoted since last check 4252 * we don't need pplist. 4253 */ 4254 if (pplist != NULL) { 4255 page_free_replacement_page(pplist); 4256 page_create_putback(pages); 4257 } 4258 if (PP_ISMIGRATE(ppa[0])) { 4259 page_migrate(seg, a, ppa, pages); 4260 } 4261 SEGVN_UPDATE_MODBITS(ppa, pages, rw, 4262 prot, vpprot); 4263 hat_memload_array_region(hat, a, pgsz, 4264 ppa, prot & vpprot, hat_flag, 4265 svd->rcookie); 4266 4267 if (!(hat_flag & HAT_LOAD_LOCK)) { 4268 for (i = 0; i < pages; i++) { 4269 page_unlock(ppa[i]); 4270 } 4271 } 4272 if (amp != NULL) { 4273 anon_array_exit(&an_cookie); 4274 ANON_LOCK_EXIT(&->a_rwlock); 4275 } 4276 goto next; 4277 } 4278 4279 /* 4280 * See if upsize is possible. 
4281 */ 4282 if (pszc > szc && szc < seg->s_szc && 4283 (segvn_anypgsz_vnode || pszc >= seg->s_szc)) { 4284 pgcnt_t aphase; 4285 uint_t pszc1 = MIN(pszc, seg->s_szc); 4286 ppgsz = page_get_pagesize(pszc1); 4287 ppages = btop(ppgsz); 4288 aphase = btop(P2PHASE((uintptr_t)a, ppgsz)); 4289 4290 ASSERT(type != F_SOFTLOCK); 4291 4292 SEGVN_VMSTAT_FLTVNPAGES(31); 4293 if (aphase != P2PHASE(pfn, ppages)) { 4294 segvn_faultvnmpss_align_err4++; 4295 } else { 4296 SEGVN_VMSTAT_FLTVNPAGES(32); 4297 if (pplist != NULL) { 4298 page_t *pl = pplist; 4299 page_free_replacement_page(pl); 4300 page_create_putback(pages); 4301 } 4302 for (i = 0; i < pages; i++) { 4303 page_unlock(ppa[i]); 4304 } 4305 if (amp != NULL) { 4306 anon_array_exit(&an_cookie); 4307 ANON_LOCK_EXIT(&->a_rwlock); 4308 } 4309 pszc = pszc1; 4310 ierr = -2; 4311 break; 4312 } 4313 } 4314 4315 /* 4316 * check if we should use smallest mapping size. 4317 */ 4318 upgrdfail = 0; 4319 if (szc == 0 || 4320 (pszc >= szc && 4321 !IS_P2ALIGNED(pfn, pages)) || 4322 (pszc < szc && 4323 !segvn_full_szcpages(ppa, szc, &upgrdfail, 4324 &pszc))) { 4325 4326 if (upgrdfail && type != F_SOFTLOCK) { 4327 /* 4328 * segvn_full_szcpages failed to lock 4329 * all pages EXCL. Size down. 
4330 */ 4331 ASSERT(pszc < szc); 4332 4333 SEGVN_VMSTAT_FLTVNPAGES(33); 4334 4335 if (pplist != NULL) { 4336 page_t *pl = pplist; 4337 page_free_replacement_page(pl); 4338 page_create_putback(pages); 4339 } 4340 4341 for (i = 0; i < pages; i++) { 4342 page_unlock(ppa[i]); 4343 } 4344 if (amp != NULL) { 4345 anon_array_exit(&an_cookie); 4346 ANON_LOCK_EXIT(&->a_rwlock); 4347 } 4348 ierr = -1; 4349 break; 4350 } 4351 if (szc != 0 && !upgrdfail) { 4352 segvn_faultvnmpss_align_err5++; 4353 } 4354 SEGVN_VMSTAT_FLTVNPAGES(34); 4355 if (pplist != NULL) { 4356 page_free_replacement_page(pplist); 4357 page_create_putback(pages); 4358 } 4359 SEGVN_UPDATE_MODBITS(ppa, pages, rw, 4360 prot, vpprot); 4361 if (upgrdfail && segvn_anypgsz_vnode) { 4362 /* SOFTLOCK case */ 4363 hat_memload_array_region(hat, a, pgsz, 4364 ppa, prot & vpprot, hat_flag, 4365 svd->rcookie); 4366 } else { 4367 for (i = 0; i < pages; i++) { 4368 hat_memload_region(hat, 4369 a + (i << PAGESHIFT), 4370 ppa[i], prot & vpprot, 4371 hat_flag, svd->rcookie); 4372 } 4373 } 4374 if (!(hat_flag & HAT_LOAD_LOCK)) { 4375 for (i = 0; i < pages; i++) { 4376 page_unlock(ppa[i]); 4377 } 4378 } 4379 if (amp != NULL) { 4380 anon_array_exit(&an_cookie); 4381 ANON_LOCK_EXIT(&->a_rwlock); 4382 } 4383 goto next; 4384 } 4385 4386 if (pszc == szc) { 4387 /* 4388 * segvn_full_szcpages() upgraded pages szc. 4389 */ 4390 ASSERT(pszc == ppa[0]->p_szc); 4391 ASSERT(IS_P2ALIGNED(pfn, pages)); 4392 goto chkszc; 4393 } 4394 4395 if (pszc > szc) { 4396 kmutex_t *szcmtx; 4397 SEGVN_VMSTAT_FLTVNPAGES(35); 4398 /* 4399 * p_szc of ppa[0] can change since we haven't 4400 * locked all constituent pages. Call 4401 * page_lock_szc() to prevent szc changes. 4402 * This should be a rare case that happens when 4403 * multiple segments use a different page size 4404 * to map the same file offsets. 
4405 */ 4406 szcmtx = page_szc_lock(ppa[0]); 4407 pszc = ppa[0]->p_szc; 4408 ASSERT(szcmtx != NULL || pszc == 0); 4409 ASSERT(ppa[0]->p_szc <= pszc); 4410 if (pszc <= szc) { 4411 SEGVN_VMSTAT_FLTVNPAGES(36); 4412 if (szcmtx != NULL) { 4413 mutex_exit(szcmtx); 4414 } 4415 goto chkszc; 4416 } 4417 if (pplist != NULL) { 4418 /* 4419 * page got promoted since last check. 4420 * we don't need preaalocated large 4421 * page. 4422 */ 4423 SEGVN_VMSTAT_FLTVNPAGES(37); 4424 page_free_replacement_page(pplist); 4425 page_create_putback(pages); 4426 } 4427 SEGVN_UPDATE_MODBITS(ppa, pages, rw, 4428 prot, vpprot); 4429 hat_memload_array_region(hat, a, pgsz, ppa, 4430 prot & vpprot, hat_flag, svd->rcookie); 4431 mutex_exit(szcmtx); 4432 if (!(hat_flag & HAT_LOAD_LOCK)) { 4433 for (i = 0; i < pages; i++) { 4434 page_unlock(ppa[i]); 4435 } 4436 } 4437 if (amp != NULL) { 4438 anon_array_exit(&an_cookie); 4439 ANON_LOCK_EXIT(&->a_rwlock); 4440 } 4441 goto next; 4442 } 4443 4444 /* 4445 * if page got demoted since last check 4446 * we could have not allocated larger page. 4447 * allocate now. 
4448 */ 4449 if (pplist == NULL && 4450 page_alloc_pages(vp, seg, a, &pplist, NULL, 4451 szc, 0, 0) && type != F_SOFTLOCK) { 4452 SEGVN_VMSTAT_FLTVNPAGES(38); 4453 for (i = 0; i < pages; i++) { 4454 page_unlock(ppa[i]); 4455 } 4456 if (amp != NULL) { 4457 anon_array_exit(&an_cookie); 4458 ANON_LOCK_EXIT(&->a_rwlock); 4459 } 4460 ierr = -1; 4461 alloc_failed |= (1 << szc); 4462 break; 4463 } 4464 4465 SEGVN_VMSTAT_FLTVNPAGES(39); 4466 4467 if (pplist != NULL) { 4468 segvn_relocate_pages(ppa, pplist); 4469 #ifdef DEBUG 4470 } else { 4471 ASSERT(type == F_SOFTLOCK); 4472 SEGVN_VMSTAT_FLTVNPAGES(40); 4473 #endif /* DEBUG */ 4474 } 4475 4476 SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot); 4477 4478 if (pplist == NULL && segvn_anypgsz_vnode == 0) { 4479 ASSERT(type == F_SOFTLOCK); 4480 for (i = 0; i < pages; i++) { 4481 ASSERT(ppa[i]->p_szc < szc); 4482 hat_memload_region(hat, 4483 a + (i << PAGESHIFT), 4484 ppa[i], prot & vpprot, hat_flag, 4485 svd->rcookie); 4486 } 4487 } else { 4488 ASSERT(pplist != NULL || type == F_SOFTLOCK); 4489 hat_memload_array_region(hat, a, pgsz, ppa, 4490 prot & vpprot, hat_flag, svd->rcookie); 4491 } 4492 if (!(hat_flag & HAT_LOAD_LOCK)) { 4493 for (i = 0; i < pages; i++) { 4494 ASSERT(PAGE_SHARED(ppa[i])); 4495 page_unlock(ppa[i]); 4496 } 4497 } 4498 if (amp != NULL) { 4499 anon_array_exit(&an_cookie); 4500 ANON_LOCK_EXIT(&->a_rwlock); 4501 } 4502 4503 next: 4504 if (vpage != NULL) { 4505 vpage += pages; 4506 } 4507 adjszc_chk = 1; 4508 } 4509 if (a == lpgeaddr) 4510 break; 4511 ASSERT(a < lpgeaddr); 4512 4513 ASSERT(!brkcow && !tron && type != F_SOFTLOCK); 4514 4515 /* 4516 * ierr == -1 means we failed to map with a large page. 4517 * (either due to allocation/relocation failures or 4518 * misalignment with other mappings to this file. 4519 * 4520 * ierr == -2 means some other thread allocated a large page 4521 * after we gave up tp map with a large page. retry with 4522 * larger mapping. 
4523 */ 4524 ASSERT(ierr == -1 || ierr == -2); 4525 ASSERT(ierr == -2 || szc != 0); 4526 ASSERT(ierr == -1 || szc < seg->s_szc); 4527 if (ierr == -2) { 4528 SEGVN_VMSTAT_FLTVNPAGES(41); 4529 ASSERT(pszc > szc && pszc <= seg->s_szc); 4530 szc = pszc; 4531 } else if (segvn_anypgsz_vnode) { 4532 SEGVN_VMSTAT_FLTVNPAGES(42); 4533 szc--; 4534 } else { 4535 SEGVN_VMSTAT_FLTVNPAGES(43); 4536 ASSERT(pszc < szc); 4537 /* 4538 * other process created pszc large page. 4539 * but we still have to drop to 0 szc. 4540 */ 4541 szc = 0; 4542 } 4543 4544 pgsz = page_get_pagesize(szc); 4545 pages = btop(pgsz); 4546 if (ierr == -2) { 4547 /* 4548 * Size up case. Note lpgaddr may only be needed for 4549 * softlock case so we don't adjust it here. 4550 */ 4551 a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz); 4552 ASSERT(a >= lpgaddr); 4553 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4554 off = svd->offset + (uintptr_t)(a - seg->s_base); 4555 aindx = svd->anon_index + seg_page(seg, a); 4556 vpage = (svd->vpage != NULL) ? 4557 &svd->vpage[seg_page(seg, a)] : NULL; 4558 } else { 4559 /* 4560 * Size down case. Note lpgaddr may only be needed for 4561 * softlock case so we don't adjust it here. 4562 */ 4563 ASSERT(IS_P2ALIGNED(a, pgsz)); 4564 ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz)); 4565 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4566 ASSERT(a < lpgeaddr); 4567 if (a < addr) { 4568 SEGVN_VMSTAT_FLTVNPAGES(44); 4569 /* 4570 * The beginning of the large page region can 4571 * be pulled to the right to make a smaller 4572 * region. We haven't yet faulted a single 4573 * page. 4574 */ 4575 a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); 4576 ASSERT(a >= lpgaddr); 4577 off = svd->offset + 4578 (uintptr_t)(a - seg->s_base); 4579 aindx = svd->anon_index + seg_page(seg, a); 4580 vpage = (svd->vpage != NULL) ? 
4581 &svd->vpage[seg_page(seg, a)] : NULL; 4582 } 4583 } 4584 } 4585 out: 4586 kmem_free(ppa, ppasize); 4587 if (!err && !vop_size_err) { 4588 SEGVN_VMSTAT_FLTVNPAGES(45); 4589 return (0); 4590 } 4591 if (type == F_SOFTLOCK && a > lpgaddr) { 4592 SEGVN_VMSTAT_FLTVNPAGES(46); 4593 segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER); 4594 } 4595 if (!vop_size_err) { 4596 SEGVN_VMSTAT_FLTVNPAGES(47); 4597 return (err); 4598 } 4599 ASSERT(brkcow || tron || type == F_SOFTLOCK); 4600 /* 4601 * Large page end is mapped beyond the end of file and it's a cow 4602 * fault (can be a text replication induced cow) or softlock so we can't 4603 * reduce the map area. For now just demote the segment. This should 4604 * really only happen if the end of the file changed after the mapping 4605 * was established since when large page segments are created we make 4606 * sure they don't extend beyond the end of the file. 4607 */ 4608 SEGVN_VMSTAT_FLTVNPAGES(48); 4609 4610 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4611 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4612 err = 0; 4613 if (seg->s_szc != 0) { 4614 segvn_fltvnpages_clrszc_cnt++; 4615 ASSERT(svd->softlockcnt == 0); 4616 err = segvn_clrszc(seg); 4617 if (err != 0) { 4618 segvn_fltvnpages_clrszc_err++; 4619 } 4620 } 4621 ASSERT(err || seg->s_szc == 0); 4622 SEGVN_LOCK_DOWNGRADE(seg->s_as, &svd->lock); 4623 /* segvn_fault will do its job as if szc had been zero to begin with */ 4624 return (err == 0 ? IE_RETRY : FC_MAKE_ERR(err)); 4625 } 4626 4627 /* 4628 * This routine will attempt to fault in one large page. 4629 * it will use smaller pages if that fails. 4630 * It should only be called for pure anonymous segments. 
4631 */ 4632 static faultcode_t 4633 segvn_fault_anonpages(struct hat *hat, struct seg *seg, caddr_t lpgaddr, 4634 caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr, 4635 caddr_t eaddr, int brkcow) 4636 { 4637 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 4638 struct anon_map *amp = svd->amp; 4639 uchar_t segtype = svd->type; 4640 uint_t szc = seg->s_szc; 4641 size_t pgsz = page_get_pagesize(szc); 4642 size_t maxpgsz = pgsz; 4643 pgcnt_t pages = btop(pgsz); 4644 uint_t ppaszc = szc; 4645 caddr_t a = lpgaddr; 4646 ulong_t aindx = svd->anon_index + seg_page(seg, a); 4647 struct vpage *vpage = (svd->vpage != NULL) ? 4648 &svd->vpage[seg_page(seg, a)] : NULL; 4649 page_t **ppa; 4650 uint_t ppa_szc; 4651 faultcode_t err; 4652 int ierr; 4653 uint_t protchk, prot, vpprot; 4654 ulong_t i; 4655 int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD; 4656 anon_sync_obj_t cookie; 4657 int adjszc_chk; 4658 int pgflags = (svd->tr_state == SEGVN_TR_ON) ? PG_LOCAL : 0; 4659 4660 ASSERT(szc != 0); 4661 ASSERT(amp != NULL); 4662 ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */ 4663 ASSERT(!(svd->flags & MAP_NORESERVE)); 4664 ASSERT(type != F_SOFTUNLOCK); 4665 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 4666 ASSERT(!brkcow || svd->tr_state == SEGVN_TR_OFF); 4667 ASSERT(svd->tr_state != SEGVN_TR_INIT); 4668 4669 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 4670 4671 VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltanpages[0]); 4672 VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltanpages[1]); 4673 4674 if (svd->flags & MAP_TEXT) { 4675 hat_flag |= HAT_LOAD_TEXT; 4676 } 4677 4678 if (svd->pageprot) { 4679 switch (rw) { 4680 case S_READ: 4681 protchk = PROT_READ; 4682 break; 4683 case S_WRITE: 4684 protchk = PROT_WRITE; 4685 break; 4686 case S_EXEC: 4687 protchk = PROT_EXEC; 4688 break; 4689 case S_OTHER: 4690 default: 4691 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 4692 break; 4693 } 4694 
VM_STAT_ADD(segvnvmstats.fltanpages[2]); 4695 } else { 4696 prot = svd->prot; 4697 /* caller has already done segment level protection check. */ 4698 } 4699 4700 ppa = kmem_cache_alloc(segvn_szc_cache[ppaszc], KM_SLEEP); 4701 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 4702 for (;;) { 4703 adjszc_chk = 0; 4704 for (; a < lpgeaddr; a += pgsz, aindx += pages) { 4705 if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) { 4706 VM_STAT_ADD(segvnvmstats.fltanpages[3]); 4707 ASSERT(vpage != NULL); 4708 prot = VPP_PROT(vpage); 4709 ASSERT(sameprot(seg, a, maxpgsz)); 4710 if ((prot & protchk) == 0) { 4711 err = FC_PROT; 4712 goto error; 4713 } 4714 } 4715 if (adjszc_chk && IS_P2ALIGNED(a, maxpgsz) && 4716 pgsz < maxpgsz) { 4717 ASSERT(a > lpgaddr); 4718 szc = seg->s_szc; 4719 pgsz = maxpgsz; 4720 pages = btop(pgsz); 4721 ASSERT(IS_P2ALIGNED(aindx, pages)); 4722 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, 4723 pgsz); 4724 } 4725 if (type == F_SOFTLOCK) { 4726 atomic_add_long((ulong_t *)&svd->softlockcnt, 4727 pages); 4728 } 4729 anon_array_enter(amp, aindx, &cookie); 4730 ppa_szc = (uint_t)-1; 4731 ierr = anon_map_getpages(amp, aindx, szc, seg, a, 4732 prot, &vpprot, ppa, &ppa_szc, vpage, rw, brkcow, 4733 segvn_anypgsz, pgflags, svd->cred); 4734 if (ierr != 0) { 4735 anon_array_exit(&cookie); 4736 VM_STAT_ADD(segvnvmstats.fltanpages[4]); 4737 if (type == F_SOFTLOCK) { 4738 atomic_add_long( 4739 (ulong_t *)&svd->softlockcnt, 4740 -pages); 4741 } 4742 if (ierr > 0) { 4743 VM_STAT_ADD(segvnvmstats.fltanpages[6]); 4744 err = FC_MAKE_ERR(ierr); 4745 goto error; 4746 } 4747 break; 4748 } 4749 4750 ASSERT(!IS_VMODSORT(ppa[0]->p_vnode)); 4751 4752 ASSERT(segtype == MAP_SHARED || 4753 ppa[0]->p_szc <= szc); 4754 ASSERT(segtype == MAP_PRIVATE || 4755 ppa[0]->p_szc >= szc); 4756 4757 /* 4758 * Handle pages that have been marked for migration 4759 */ 4760 if (lgrp_optimizations()) 4761 page_migrate(seg, a, ppa, pages); 4762 4763 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 4764 
4765 if (segtype == MAP_SHARED) { 4766 vpprot |= PROT_WRITE; 4767 } 4768 4769 hat_memload_array(hat, a, pgsz, ppa, 4770 prot & vpprot, hat_flag); 4771 4772 if (hat_flag & HAT_LOAD_LOCK) { 4773 VM_STAT_ADD(segvnvmstats.fltanpages[7]); 4774 } else { 4775 VM_STAT_ADD(segvnvmstats.fltanpages[8]); 4776 for (i = 0; i < pages; i++) 4777 page_unlock(ppa[i]); 4778 } 4779 if (vpage != NULL) 4780 vpage += pages; 4781 4782 anon_array_exit(&cookie); 4783 adjszc_chk = 1; 4784 } 4785 if (a == lpgeaddr) 4786 break; 4787 ASSERT(a < lpgeaddr); 4788 /* 4789 * ierr == -1 means we failed to allocate a large page. 4790 * so do a size down operation. 4791 * 4792 * ierr == -2 means some other process that privately shares 4793 * pages with this process has allocated a larger page and we 4794 * need to retry with larger pages. So do a size up 4795 * operation. This relies on the fact that large pages are 4796 * never partially shared i.e. if we share any constituent 4797 * page of a large page with another process we must share the 4798 * entire large page. Note this cannot happen for SOFTLOCK 4799 * case, unless current address (a) is at the beginning of the 4800 * next page size boundary because the other process couldn't 4801 * have relocated locked pages. 4802 */ 4803 ASSERT(ierr == -1 || ierr == -2); 4804 4805 if (segvn_anypgsz) { 4806 ASSERT(ierr == -2 || szc != 0); 4807 ASSERT(ierr == -1 || szc < seg->s_szc); 4808 szc = (ierr == -1) ? szc - 1 : szc + 1; 4809 } else { 4810 /* 4811 * For non COW faults and segvn_anypgsz == 0 4812 * we need to be careful not to loop forever 4813 * if existing page is found with szc other 4814 * than 0 or seg->s_szc. This could be due 4815 * to page relocations on behalf of DR or 4816 * more likely large page creation. For this 4817 * case simply re-size to existing page's szc 4818 * if returned by anon_map_getpages(). 4819 */ 4820 if (ppa_szc == (uint_t)-1) { 4821 szc = (ierr == -1) ? 
0 : seg->s_szc; 4822 } else { 4823 ASSERT(ppa_szc <= seg->s_szc); 4824 ASSERT(ierr == -2 || ppa_szc < szc); 4825 ASSERT(ierr == -1 || ppa_szc > szc); 4826 szc = ppa_szc; 4827 } 4828 } 4829 4830 pgsz = page_get_pagesize(szc); 4831 pages = btop(pgsz); 4832 ASSERT(type != F_SOFTLOCK || ierr == -1 || 4833 (IS_P2ALIGNED(a, pgsz) && IS_P2ALIGNED(lpgeaddr, pgsz))); 4834 if (type == F_SOFTLOCK) { 4835 /* 4836 * For softlocks we cannot reduce the fault area 4837 * (calculated based on the largest page size for this 4838 * segment) for size down and a is already next 4839 * page size aligned as assertted above for size 4840 * ups. Therefore just continue in case of softlock. 4841 */ 4842 VM_STAT_ADD(segvnvmstats.fltanpages[9]); 4843 continue; /* keep lint happy */ 4844 } else if (ierr == -2) { 4845 4846 /* 4847 * Size up case. Note lpgaddr may only be needed for 4848 * softlock case so we don't adjust it here. 4849 */ 4850 VM_STAT_ADD(segvnvmstats.fltanpages[10]); 4851 a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz); 4852 ASSERT(a >= lpgaddr); 4853 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4854 aindx = svd->anon_index + seg_page(seg, a); 4855 vpage = (svd->vpage != NULL) ? 4856 &svd->vpage[seg_page(seg, a)] : NULL; 4857 } else { 4858 /* 4859 * Size down case. Note lpgaddr may only be needed for 4860 * softlock case so we don't adjust it here. 4861 */ 4862 VM_STAT_ADD(segvnvmstats.fltanpages[11]); 4863 ASSERT(IS_P2ALIGNED(a, pgsz)); 4864 ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz)); 4865 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4866 ASSERT(a < lpgeaddr); 4867 if (a < addr) { 4868 /* 4869 * The beginning of the large page region can 4870 * be pulled to the right to make a smaller 4871 * region. We haven't yet faulted a single 4872 * page. 4873 */ 4874 VM_STAT_ADD(segvnvmstats.fltanpages[12]); 4875 a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); 4876 ASSERT(a >= lpgaddr); 4877 aindx = svd->anon_index + seg_page(seg, a); 4878 vpage = (svd->vpage != NULL) ? 
4879 &svd->vpage[seg_page(seg, a)] : NULL; 4880 } 4881 } 4882 } 4883 VM_STAT_ADD(segvnvmstats.fltanpages[13]); 4884 ANON_LOCK_EXIT(&->a_rwlock); 4885 kmem_cache_free(segvn_szc_cache[ppaszc], ppa); 4886 return (0); 4887 error: 4888 VM_STAT_ADD(segvnvmstats.fltanpages[14]); 4889 ANON_LOCK_EXIT(&->a_rwlock); 4890 kmem_cache_free(segvn_szc_cache[ppaszc], ppa); 4891 if (type == F_SOFTLOCK && a > lpgaddr) { 4892 VM_STAT_ADD(segvnvmstats.fltanpages[15]); 4893 segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER); 4894 } 4895 return (err); 4896 } 4897 4898 int fltadvice = 1; /* set to free behind pages for sequential access */ 4899 4900 /* 4901 * This routine is called via a machine specific fault handling routine. 4902 * It is also called by software routines wishing to lock or unlock 4903 * a range of addresses. 4904 * 4905 * Here is the basic algorithm: 4906 * If unlocking 4907 * Call segvn_softunlock 4908 * Return 4909 * endif 4910 * Checking and set up work 4911 * If we will need some non-anonymous pages 4912 * Call VOP_GETPAGE over the range of non-anonymous pages 4913 * endif 4914 * Loop over all addresses requested 4915 * Call segvn_faultpage passing in page list 4916 * to load up translations and handle anonymous pages 4917 * endloop 4918 * Load up translation to any additional pages in page list not 4919 * already handled that fit into this segment 4920 */ 4921 static faultcode_t 4922 segvn_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len, 4923 enum fault_type type, enum seg_rw rw) 4924 { 4925 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 4926 page_t **plp, **ppp, *pp; 4927 u_offset_t off; 4928 caddr_t a; 4929 struct vpage *vpage; 4930 uint_t vpprot, prot; 4931 int err; 4932 page_t *pl[PVN_GETPAGE_NUM + 1]; 4933 size_t plsz, pl_alloc_sz; 4934 size_t page; 4935 ulong_t anon_index; 4936 struct anon_map *amp; 4937 int dogetpage = 0; 4938 caddr_t lpgaddr, lpgeaddr; 4939 size_t pgsz; 4940 anon_sync_obj_t cookie; 4941 int brkcow = 
BREAK_COW_SHARE(rw, type, svd->type); 4942 4943 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); 4944 ASSERT(svd->amp == NULL || svd->rcookie == HAT_INVALID_REGION_COOKIE); 4945 4946 /* 4947 * First handle the easy stuff 4948 */ 4949 if (type == F_SOFTUNLOCK) { 4950 if (rw == S_READ_NOCOW) { 4951 rw = S_READ; 4952 ASSERT(AS_WRITE_HELD(seg->s_as)); 4953 } 4954 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 4955 pgsz = (seg->s_szc == 0) ? PAGESIZE : 4956 page_get_pagesize(seg->s_szc); 4957 VM_STAT_COND_ADD(pgsz > PAGESIZE, segvnvmstats.fltanpages[16]); 4958 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 4959 segvn_softunlock(seg, lpgaddr, lpgeaddr - lpgaddr, rw); 4960 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4961 return (0); 4962 } 4963 4964 ASSERT(svd->tr_state == SEGVN_TR_OFF || 4965 !HAT_IS_REGION_COOKIE_VALID(svd->rcookie)); 4966 if (brkcow == 0) { 4967 if (svd->tr_state == SEGVN_TR_INIT) { 4968 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4969 if (svd->tr_state == SEGVN_TR_INIT) { 4970 ASSERT(svd->vp != NULL && svd->amp == NULL); 4971 ASSERT(svd->flags & MAP_TEXT); 4972 ASSERT(svd->type == MAP_PRIVATE); 4973 segvn_textrepl(seg); 4974 ASSERT(svd->tr_state != SEGVN_TR_INIT); 4975 ASSERT(svd->tr_state != SEGVN_TR_ON || 4976 svd->amp != NULL); 4977 } 4978 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4979 } 4980 } else if (svd->tr_state != SEGVN_TR_OFF) { 4981 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4982 4983 if (rw == S_WRITE && svd->tr_state != SEGVN_TR_OFF) { 4984 ASSERT(!svd->pageprot && !(svd->prot & PROT_WRITE)); 4985 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4986 return (FC_PROT); 4987 } 4988 4989 if (svd->tr_state == SEGVN_TR_ON) { 4990 ASSERT(svd->vp != NULL && svd->amp != NULL); 4991 segvn_textunrepl(seg, 0); 4992 ASSERT(svd->amp == NULL && 4993 svd->tr_state == SEGVN_TR_OFF); 4994 } else if (svd->tr_state != SEGVN_TR_OFF) { 4995 svd->tr_state = SEGVN_TR_OFF; 4996 } 4997 ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF); 4998 
SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4999 } 5000 5001 top: 5002 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 5003 5004 /* 5005 * If we have the same protections for the entire segment, 5006 * insure that the access being attempted is legitimate. 5007 */ 5008 5009 if (svd->pageprot == 0) { 5010 uint_t protchk; 5011 5012 switch (rw) { 5013 case S_READ: 5014 case S_READ_NOCOW: 5015 protchk = PROT_READ; 5016 break; 5017 case S_WRITE: 5018 protchk = PROT_WRITE; 5019 break; 5020 case S_EXEC: 5021 protchk = PROT_EXEC; 5022 break; 5023 case S_OTHER: 5024 default: 5025 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 5026 break; 5027 } 5028 5029 if ((svd->prot & protchk) == 0) { 5030 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5031 return (FC_PROT); /* illegal access type */ 5032 } 5033 } 5034 5035 if (brkcow && HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) { 5036 /* this must be SOFTLOCK S_READ fault */ 5037 ASSERT(svd->amp == NULL); 5038 ASSERT(svd->tr_state == SEGVN_TR_OFF); 5039 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5040 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 5041 if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) { 5042 /* 5043 * this must be the first ever non S_READ_NOCOW 5044 * softlock for this segment. 5045 */ 5046 ASSERT(svd->softlockcnt == 0); 5047 hat_leave_region(seg->s_as->a_hat, svd->rcookie, 5048 HAT_REGION_TEXT); 5049 svd->rcookie = HAT_INVALID_REGION_COOKIE; 5050 } 5051 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5052 goto top; 5053 } 5054 5055 /* 5056 * We can't allow the long term use of softlocks for vmpss segments, 5057 * because in some file truncation cases we should be able to demote 5058 * the segment, which requires that there are no softlocks. The 5059 * only case where it's ok to allow a SOFTLOCK fault against a vmpss 5060 * segment is S_READ_NOCOW, where the caller holds the address space 5061 * locked as writer and calls softunlock before dropping the as lock. 5062 * S_READ_NOCOW is used by /proc to read memory from another user. 
5063 * 5064 * Another deadlock between SOFTLOCK and file truncation can happen 5065 * because segvn_fault_vnodepages() calls the FS one pagesize at 5066 * a time. A second VOP_GETPAGE() call by segvn_fault_vnodepages() 5067 * can cause a deadlock because the first set of page_t's remain 5068 * locked SE_SHARED. To avoid this, we demote segments on a first 5069 * SOFTLOCK if they have a length greater than the segment's 5070 * page size. 5071 * 5072 * So for now, we only avoid demoting a segment on a SOFTLOCK when 5073 * the access type is S_READ_NOCOW and the fault length is less than 5074 * or equal to the segment's page size. While this is quite restrictive, 5075 * it should be the most common case of SOFTLOCK against a vmpss 5076 * segment. 5077 * 5078 * For S_READ_NOCOW, it's safe not to do a copy on write because the 5079 * caller makes sure no COW will be caused by another thread for a 5080 * softlocked page. 5081 */ 5082 if (type == F_SOFTLOCK && svd->vp != NULL && seg->s_szc != 0) { 5083 int demote = 0; 5084 5085 if (rw != S_READ_NOCOW) { 5086 demote = 1; 5087 } 5088 if (!demote && len > PAGESIZE) { 5089 pgsz = page_get_pagesize(seg->s_szc); 5090 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, 5091 lpgeaddr); 5092 if (lpgeaddr - lpgaddr > pgsz) { 5093 demote = 1; 5094 } 5095 } 5096 5097 ASSERT(demote || AS_WRITE_HELD(seg->s_as)); 5098 5099 if (demote) { 5100 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5101 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 5102 if (seg->s_szc != 0) { 5103 segvn_vmpss_clrszc_cnt++; 5104 ASSERT(svd->softlockcnt == 0); 5105 err = segvn_clrszc(seg); 5106 if (err) { 5107 segvn_vmpss_clrszc_err++; 5108 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5109 return (FC_MAKE_ERR(err)); 5110 } 5111 } 5112 ASSERT(seg->s_szc == 0); 5113 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5114 goto top; 5115 } 5116 } 5117 5118 /* 5119 * Check to see if we need to allocate an anon_map structure. 
5120 */ 5121 if (svd->amp == NULL && (svd->vp == NULL || brkcow)) { 5122 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 5123 /* 5124 * Drop the "read" lock on the segment and acquire 5125 * the "write" version since we have to allocate the 5126 * anon_map. 5127 */ 5128 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5129 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 5130 5131 if (svd->amp == NULL) { 5132 svd->amp = anonmap_alloc(seg->s_size, 0, ANON_SLEEP); 5133 svd->amp->a_szc = seg->s_szc; 5134 } 5135 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5136 5137 /* 5138 * Start all over again since segment protections 5139 * may have changed after we dropped the "read" lock. 5140 */ 5141 goto top; 5142 } 5143 5144 /* 5145 * S_READ_NOCOW vs S_READ distinction was 5146 * only needed for the code above. After 5147 * that we treat it as S_READ. 5148 */ 5149 if (rw == S_READ_NOCOW) { 5150 ASSERT(type == F_SOFTLOCK); 5151 ASSERT(AS_WRITE_HELD(seg->s_as)); 5152 rw = S_READ; 5153 } 5154 5155 amp = svd->amp; 5156 5157 /* 5158 * MADV_SEQUENTIAL work is ignored for large page segments. 
5159 */ 5160 if (seg->s_szc != 0) { 5161 pgsz = page_get_pagesize(seg->s_szc); 5162 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 5163 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 5164 if (svd->vp == NULL) { 5165 err = segvn_fault_anonpages(hat, seg, lpgaddr, 5166 lpgeaddr, type, rw, addr, addr + len, brkcow); 5167 } else { 5168 err = segvn_fault_vnodepages(hat, seg, lpgaddr, 5169 lpgeaddr, type, rw, addr, addr + len, brkcow); 5170 if (err == IE_RETRY) { 5171 ASSERT(seg->s_szc == 0); 5172 ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock)); 5173 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5174 goto top; 5175 } 5176 } 5177 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5178 return (err); 5179 } 5180 5181 page = seg_page(seg, addr); 5182 if (amp != NULL) { 5183 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 5184 anon_index = svd->anon_index + page; 5185 5186 if (type == F_PROT && rw == S_READ && 5187 svd->tr_state == SEGVN_TR_OFF && 5188 svd->type == MAP_PRIVATE && svd->pageprot == 0) { 5189 size_t index = anon_index; 5190 struct anon *ap; 5191 5192 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5193 /* 5194 * The fast path could apply to S_WRITE also, except 5195 * that the protection fault could be caused by lazy 5196 * tlb flush when ro->rw. In this case, the pte is 5197 * RW already. But RO in the other cpu's tlb causes 5198 * the fault. Since hat_chgprot won't do anything if 5199 * pte doesn't change, we may end up faulting 5200 * indefinitely until the RO tlb entry gets replaced. 
5201 */ 5202 for (a = addr; a < addr + len; a += PAGESIZE, index++) { 5203 anon_array_enter(amp, index, &cookie); 5204 ap = anon_get_ptr(amp->ahp, index); 5205 anon_array_exit(&cookie); 5206 if ((ap == NULL) || (ap->an_refcnt != 1)) { 5207 ANON_LOCK_EXIT(&->a_rwlock); 5208 goto slow; 5209 } 5210 } 5211 hat_chgprot(seg->s_as->a_hat, addr, len, svd->prot); 5212 ANON_LOCK_EXIT(&->a_rwlock); 5213 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5214 return (0); 5215 } 5216 } 5217 slow: 5218 5219 if (svd->vpage == NULL) 5220 vpage = NULL; 5221 else 5222 vpage = &svd->vpage[page]; 5223 5224 off = svd->offset + (uintptr_t)(addr - seg->s_base); 5225 5226 /* 5227 * If MADV_SEQUENTIAL has been set for the particular page we 5228 * are faulting on, free behind all pages in the segment and put 5229 * them on the free list. 5230 */ 5231 5232 if ((page != 0) && fltadvice && svd->tr_state != SEGVN_TR_ON) { 5233 struct vpage *vpp; 5234 ulong_t fanon_index; 5235 size_t fpage; 5236 u_offset_t pgoff, fpgoff; 5237 struct vnode *fvp; 5238 struct anon *fap = NULL; 5239 5240 if (svd->advice == MADV_SEQUENTIAL || 5241 (svd->pageadvice && 5242 VPP_ADVICE(vpage) == MADV_SEQUENTIAL)) { 5243 pgoff = off - PAGESIZE; 5244 fpage = page - 1; 5245 if (vpage != NULL) 5246 vpp = &svd->vpage[fpage]; 5247 if (amp != NULL) 5248 fanon_index = svd->anon_index + fpage; 5249 5250 while (pgoff > svd->offset) { 5251 if (svd->advice != MADV_SEQUENTIAL && 5252 (!svd->pageadvice || (vpage && 5253 VPP_ADVICE(vpp) != MADV_SEQUENTIAL))) 5254 break; 5255 5256 /* 5257 * If this is an anon page, we must find the 5258 * correct <vp, offset> for it 5259 */ 5260 fap = NULL; 5261 if (amp != NULL) { 5262 ANON_LOCK_ENTER(&->a_rwlock, 5263 RW_READER); 5264 anon_array_enter(amp, fanon_index, 5265 &cookie); 5266 fap = anon_get_ptr(amp->ahp, 5267 fanon_index); 5268 if (fap != NULL) { 5269 swap_xlate(fap, &fvp, &fpgoff); 5270 } else { 5271 fpgoff = pgoff; 5272 fvp = svd->vp; 5273 } 5274 anon_array_exit(&cookie); 5275 
ANON_LOCK_EXIT(&->a_rwlock); 5276 } else { 5277 fpgoff = pgoff; 5278 fvp = svd->vp; 5279 } 5280 if (fvp == NULL) 5281 break; /* XXX */ 5282 /* 5283 * Skip pages that are free or have an 5284 * "exclusive" lock. 5285 */ 5286 pp = page_lookup_nowait(fvp, fpgoff, SE_SHARED); 5287 if (pp == NULL) 5288 break; 5289 /* 5290 * We don't need the page_struct_lock to test 5291 * as this is only advisory; even if we 5292 * acquire it someone might race in and lock 5293 * the page after we unlock and before the 5294 * PUTPAGE, then VOP_PUTPAGE will do nothing. 5295 */ 5296 if (pp->p_lckcnt == 0 && pp->p_cowcnt == 0) { 5297 /* 5298 * Hold the vnode before releasing 5299 * the page lock to prevent it from 5300 * being freed and re-used by some 5301 * other thread. 5302 */ 5303 VN_HOLD(fvp); 5304 page_unlock(pp); 5305 /* 5306 * We should build a page list 5307 * to kluster putpages XXX 5308 */ 5309 (void) VOP_PUTPAGE(fvp, 5310 (offset_t)fpgoff, PAGESIZE, 5311 (B_DONTNEED|B_FREE|B_ASYNC), 5312 svd->cred, NULL); 5313 VN_RELE(fvp); 5314 } else { 5315 /* 5316 * XXX - Should the loop terminate if 5317 * the page is `locked'? 5318 */ 5319 page_unlock(pp); 5320 } 5321 --vpp; 5322 --fanon_index; 5323 pgoff -= PAGESIZE; 5324 } 5325 } 5326 } 5327 5328 plp = pl; 5329 *plp = NULL; 5330 pl_alloc_sz = 0; 5331 5332 /* 5333 * See if we need to call VOP_GETPAGE for 5334 * *any* of the range being faulted on. 5335 * We can skip all of this work if there 5336 * was no original vnode. 5337 */ 5338 if (svd->vp != NULL) { 5339 u_offset_t vp_off; 5340 size_t vp_len; 5341 struct anon *ap; 5342 vnode_t *vp; 5343 5344 vp_off = off; 5345 vp_len = len; 5346 5347 if (amp == NULL) 5348 dogetpage = 1; 5349 else { 5350 /* 5351 * Only acquire reader lock to prevent amp->ahp 5352 * from being changed. 
It's ok to miss pages, 5353 * hence we don't do anon_array_enter 5354 */ 5355 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5356 ap = anon_get_ptr(amp->ahp, anon_index); 5357 5358 if (len <= PAGESIZE) 5359 /* inline non_anon() */ 5360 dogetpage = (ap == NULL); 5361 else 5362 dogetpage = non_anon(amp->ahp, anon_index, 5363 &vp_off, &vp_len); 5364 ANON_LOCK_EXIT(&->a_rwlock); 5365 } 5366 5367 if (dogetpage) { 5368 enum seg_rw arw; 5369 struct as *as = seg->s_as; 5370 5371 if (len > ptob((sizeof (pl) / sizeof (pl[0])) - 1)) { 5372 /* 5373 * Page list won't fit in local array, 5374 * allocate one of the needed size. 5375 */ 5376 pl_alloc_sz = 5377 (btop(len) + 1) * sizeof (page_t *); 5378 plp = kmem_alloc(pl_alloc_sz, KM_SLEEP); 5379 plp[0] = NULL; 5380 plsz = len; 5381 } else if (rw == S_WRITE && svd->type == MAP_PRIVATE || 5382 svd->tr_state == SEGVN_TR_ON || rw == S_OTHER || 5383 (((size_t)(addr + PAGESIZE) < 5384 (size_t)(seg->s_base + seg->s_size)) && 5385 hat_probe(as->a_hat, addr + PAGESIZE))) { 5386 /* 5387 * Ask VOP_GETPAGE to return the exact number 5388 * of pages if 5389 * (a) this is a COW fault, or 5390 * (b) this is a software fault, or 5391 * (c) next page is already mapped. 5392 */ 5393 plsz = len; 5394 } else { 5395 /* 5396 * Ask VOP_GETPAGE to return adjacent pages 5397 * within the segment. 5398 */ 5399 plsz = MIN((size_t)PVN_GETPAGE_SZ, (size_t) 5400 ((seg->s_base + seg->s_size) - addr)); 5401 ASSERT((addr + plsz) <= 5402 (seg->s_base + seg->s_size)); 5403 } 5404 5405 /* 5406 * Need to get some non-anonymous pages. 5407 * We need to make only one call to GETPAGE to do 5408 * this to prevent certain deadlocking conditions 5409 * when we are doing locking. In this case 5410 * non_anon() should have picked up the smallest 5411 * range which includes all the non-anonymous 5412 * pages in the requested range. 
We have to 5413 * be careful regarding which rw flag to pass in 5414 * because on a private mapping, the underlying 5415 * object is never allowed to be written. 5416 */ 5417 if (rw == S_WRITE && svd->type == MAP_PRIVATE) { 5418 arw = S_READ; 5419 } else { 5420 arw = rw; 5421 } 5422 vp = svd->vp; 5423 TRACE_3(TR_FAC_VM, TR_SEGVN_GETPAGE, 5424 "segvn_getpage:seg %p addr %p vp %p", 5425 seg, addr, vp); 5426 err = VOP_GETPAGE(vp, (offset_t)vp_off, vp_len, 5427 &vpprot, plp, plsz, seg, addr + (vp_off - off), arw, 5428 svd->cred, NULL); 5429 if (err) { 5430 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5431 segvn_pagelist_rele(plp); 5432 if (pl_alloc_sz) 5433 kmem_free(plp, pl_alloc_sz); 5434 return (FC_MAKE_ERR(err)); 5435 } 5436 if (svd->type == MAP_PRIVATE) 5437 vpprot &= ~PROT_WRITE; 5438 } 5439 } 5440 5441 /* 5442 * N.B. at this time the plp array has all the needed non-anon 5443 * pages in addition to (possibly) having some adjacent pages. 5444 */ 5445 5446 /* 5447 * Always acquire the anon_array_lock to prevent 5448 * 2 threads from allocating separate anon slots for 5449 * the same "addr". 5450 * 5451 * If this is a copy-on-write fault and we don't already 5452 * have the anon_array_lock, acquire it to prevent the 5453 * fault routine from handling multiple copy-on-write faults 5454 * on the same "addr" in the same address space. 5455 * 5456 * Only one thread should deal with the fault since after 5457 * it is handled, the other threads can acquire a translation 5458 * to the newly created private page. This prevents two or 5459 * more threads from creating different private pages for the 5460 * same fault. 5461 * 5462 * We grab "serialization" lock here if this is a MAP_PRIVATE segment 5463 * to prevent deadlock between this thread and another thread 5464 * which has soft-locked this page and wants to acquire serial_lock. 
5465 * ( bug 4026339 ) 5466 * 5467 * The fix for bug 4026339 becomes unnecessary when using the 5468 * locking scheme with per amp rwlock and a global set of hash 5469 * lock, anon_array_lock. If we steal a vnode page when low 5470 * on memory and upgrad the page lock through page_rename, 5471 * then the page is PAGE_HANDLED, nothing needs to be done 5472 * for this page after returning from segvn_faultpage. 5473 * 5474 * But really, the page lock should be downgraded after 5475 * the stolen page is page_rename'd. 5476 */ 5477 5478 if (amp != NULL) 5479 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5480 5481 /* 5482 * Ok, now loop over the address range and handle faults 5483 */ 5484 for (a = addr; a < addr + len; a += PAGESIZE, off += PAGESIZE) { 5485 err = segvn_faultpage(hat, seg, a, off, vpage, plp, vpprot, 5486 type, rw, brkcow); 5487 if (err) { 5488 if (amp != NULL) 5489 ANON_LOCK_EXIT(&->a_rwlock); 5490 if (type == F_SOFTLOCK && a > addr) { 5491 segvn_softunlock(seg, addr, (a - addr), 5492 S_OTHER); 5493 } 5494 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5495 segvn_pagelist_rele(plp); 5496 if (pl_alloc_sz) 5497 kmem_free(plp, pl_alloc_sz); 5498 return (err); 5499 } 5500 if (vpage) { 5501 vpage++; 5502 } else if (svd->vpage) { 5503 page = seg_page(seg, addr); 5504 vpage = &svd->vpage[++page]; 5505 } 5506 } 5507 5508 /* Didn't get pages from the underlying fs so we're done */ 5509 if (!dogetpage) 5510 goto done; 5511 5512 /* 5513 * Now handle any other pages in the list returned. 5514 * If the page can be used, load up the translations now. 5515 * Note that the for loop will only be entered if "plp" 5516 * is pointing to a non-NULL page pointer which means that 5517 * VOP_GETPAGE() was called and vpprot has been initialized. 
5518 */ 5519 if (svd->pageprot == 0) 5520 prot = svd->prot & vpprot; 5521 5522 5523 /* 5524 * Large Files: diff should be unsigned value because we started 5525 * supporting > 2GB segment sizes from 2.5.1 and when a 5526 * large file of size > 2GB gets mapped to address space 5527 * the diff value can be > 2GB. 5528 */ 5529 5530 for (ppp = plp; (pp = *ppp) != NULL; ppp++) { 5531 size_t diff; 5532 struct anon *ap; 5533 int anon_index; 5534 anon_sync_obj_t cookie; 5535 int hat_flag = HAT_LOAD_ADV; 5536 5537 if (svd->flags & MAP_TEXT) { 5538 hat_flag |= HAT_LOAD_TEXT; 5539 } 5540 5541 if (pp == PAGE_HANDLED) 5542 continue; 5543 5544 if (svd->tr_state != SEGVN_TR_ON && 5545 pp->p_offset >= svd->offset && 5546 pp->p_offset < svd->offset + seg->s_size) { 5547 5548 diff = pp->p_offset - svd->offset; 5549 5550 /* 5551 * Large Files: Following is the assertion 5552 * validating the above cast. 5553 */ 5554 ASSERT(svd->vp == pp->p_vnode); 5555 5556 page = btop(diff); 5557 if (svd->pageprot) 5558 prot = VPP_PROT(&svd->vpage[page]) & vpprot; 5559 5560 /* 5561 * Prevent other threads in the address space from 5562 * creating private pages (i.e., allocating anon slots) 5563 * while we are in the process of loading translations 5564 * to additional pages returned by the underlying 5565 * object. 
5566 */ 5567 if (amp != NULL) { 5568 anon_index = svd->anon_index + page; 5569 anon_array_enter(amp, anon_index, &cookie); 5570 ap = anon_get_ptr(amp->ahp, anon_index); 5571 } 5572 if ((amp == NULL) || (ap == NULL)) { 5573 if (IS_VMODSORT(pp->p_vnode) || 5574 enable_mbit_wa) { 5575 if (rw == S_WRITE) 5576 hat_setmod(pp); 5577 else if (rw != S_OTHER && 5578 !hat_ismod(pp)) 5579 prot &= ~PROT_WRITE; 5580 } 5581 /* 5582 * Skip mapping read ahead pages marked 5583 * for migration, so they will get migrated 5584 * properly on fault 5585 */ 5586 ASSERT(amp == NULL || 5587 svd->rcookie == HAT_INVALID_REGION_COOKIE); 5588 if ((prot & PROT_READ) && !PP_ISMIGRATE(pp)) { 5589 hat_memload_region(hat, 5590 seg->s_base + diff, 5591 pp, prot, hat_flag, 5592 svd->rcookie); 5593 } 5594 } 5595 if (amp != NULL) 5596 anon_array_exit(&cookie); 5597 } 5598 page_unlock(pp); 5599 } 5600 done: 5601 if (amp != NULL) 5602 ANON_LOCK_EXIT(&->a_rwlock); 5603 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5604 if (pl_alloc_sz) 5605 kmem_free(plp, pl_alloc_sz); 5606 return (0); 5607 }

/*
 * This routine is used to start I/O on pages asynchronously.  XXX it will
 * only create PAGESIZE pages. At fault time they will be relocated into
 * larger pages.
 *
 * seg	- segment that contains addr
 * addr	- address whose backing page should be brought in
 *
 * Returns 0 when the request was issued (or when there was nothing to do
 * because the page is zero-fill-on-demand), otherwise FC_MAKE_ERR(err) with
 * the error reported by anon_getpage() or VOP_GETPAGE().
 *
 * The caller must hold the address space lock (asserted below).  The
 * segment lock is taken as reader for the duration of the call and is
 * dropped on every return path.
 */
static faultcode_t
segvn_faulta(struct seg *seg, caddr_t addr)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	int err;
	struct anon_map *amp;
	vnode_t *vp;

	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));

	SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
	if ((amp = svd->amp) != NULL) {
		struct anon *ap;

		/*
		 * Reader lock to prevent amp->ahp from being changed.
		 * This is advisory, it's ok to miss a page, so
		 * we don't do anon_array_enter lock.
		 */
		ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
		if ((ap = anon_get_ptr(amp->ahp,
		    svd->anon_index + seg_page(seg, addr))) != NULL) {

			/*
			 * An anon slot exists for this page, so the anon
			 * layer owns it; no page list is requested.
			 */
			err = anon_getpage(&ap, NULL, NULL,
			    0, seg, addr, S_READ, svd->cred);

			ANON_LOCK_EXIT(&amp->a_rwlock);
			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
			if (err)
				return (FC_MAKE_ERR(err));
			return (0);
		}
		ANON_LOCK_EXIT(&amp->a_rwlock);
	}

	if (svd->vp == NULL) {
		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
		return (0);			/* zfod page - do nothing now */
	}

	/*
	 * No anon slot: ask the vnode for the page.  A NULL page list with
	 * S_OTHER is passed, so no page is handed back to us here.
	 */
	vp = svd->vp;
	TRACE_3(TR_FAC_VM, TR_SEGVN_GETPAGE,
	    "segvn_getpage:seg %p addr %p vp %p", seg, addr, vp);
	err = VOP_GETPAGE(vp,
	    (offset_t)(svd->offset + (uintptr_t)(addr - seg->s_base)),
	    PAGESIZE, NULL, NULL, 0, seg, addr,
	    S_OTHER, svd->cred, NULL);

	SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
	if (err)
		return (FC_MAKE_ERR(err));
	return (0);
}

5668 static int 5669 segvn_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) 5670 { 5671 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5672 struct vpage *cvp, *svp, *evp; 5673 struct vnode *vp; 5674 size_t pgsz; 5675 pgcnt_t pgcnt; 5676 anon_sync_obj_t cookie; 5677 int unload_done = 0; 5678 5679 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); 5680 5681 if ((svd->maxprot & prot) != prot) 5682 return (EACCES); /* violated maxprot */ 5683 5684 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 5685 5686 /* return if prot is the same */ 5687 if (!svd->pageprot && svd->prot == prot) { 5688 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5689 return (0); 5690 } 5691 5692 /* 5693 * Since we change protections we first have to flush the cache. 5694 * This makes sure all the pagelock calls have to recheck 5695 * protections.
5696 */ 5697 if (svd->softlockcnt > 0) { 5698 ASSERT(svd->tr_state == SEGVN_TR_OFF); 5699 5700 /* 5701 * If this is shared segment non 0 softlockcnt 5702 * means locked pages are still in use. 5703 */ 5704 if (svd->type == MAP_SHARED) { 5705 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5706 return (EAGAIN); 5707 } 5708 5709 /* 5710 * Since we do have the segvn writers lock nobody can fill 5711 * the cache with entries belonging to this seg during 5712 * the purge. The flush either succeeds or we still have 5713 * pending I/Os. 5714 */ 5715 segvn_purge(seg); 5716 if (svd->softlockcnt > 0) { 5717 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5718 return (EAGAIN); 5719 } 5720 } 5721 5722 if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) { 5723 ASSERT(svd->amp == NULL); 5724 ASSERT(svd->tr_state == SEGVN_TR_OFF); 5725 hat_leave_region(seg->s_as->a_hat, svd->rcookie, 5726 HAT_REGION_TEXT); 5727 svd->rcookie = HAT_INVALID_REGION_COOKIE; 5728 unload_done = 1; 5729 } else if (svd->tr_state == SEGVN_TR_INIT) { 5730 svd->tr_state = SEGVN_TR_OFF; 5731 } else if (svd->tr_state == SEGVN_TR_ON) { 5732 ASSERT(svd->amp != NULL); 5733 segvn_textunrepl(seg, 0); 5734 ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF); 5735 unload_done = 1; 5736 } 5737 5738 if ((prot & PROT_WRITE) && svd->type == MAP_SHARED && 5739 svd->vp != NULL && (svd->vp->v_flag & VVMEXEC)) { 5740 ASSERT(vn_is_mapped(svd->vp, V_WRITE)); 5741 segvn_inval_trcache(svd->vp); 5742 } 5743 if (seg->s_szc != 0) { 5744 int err; 5745 pgsz = page_get_pagesize(seg->s_szc); 5746 pgcnt = pgsz >> PAGESHIFT; 5747 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 5748 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) { 5749 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5750 ASSERT(seg->s_base != addr || seg->s_size != len); 5751 /* 5752 * If we are holding the as lock as a reader then 5753 * we need to return IE_RETRY and let the as 5754 * layer drop and re-acquire the lock as a writer. 
5755 */ 5756 if (AS_READ_HELD(seg->s_as)) 5757 return (IE_RETRY); 5758 VM_STAT_ADD(segvnvmstats.demoterange[1]); 5759 if (svd->type == MAP_PRIVATE || svd->vp != NULL) { 5760 err = segvn_demote_range(seg, addr, len, 5761 SDR_END, 0); 5762 } else { 5763 uint_t szcvec = map_pgszcvec(seg->s_base, 5764 pgsz, (uintptr_t)seg->s_base, 5765 (svd->flags & MAP_TEXT), MAPPGSZC_SHM, 0); 5766 err = segvn_demote_range(seg, addr, len, 5767 SDR_END, szcvec); 5768 } 5769 if (err == 0) 5770 return (IE_RETRY); 5771 if (err == ENOMEM) 5772 return (IE_NOMEM); 5773 return (err); 5774 } 5775 } 5776 5777 5778 /* 5779 * If it's a private mapping and we're making it writable then we 5780 * may have to reserve the additional swap space now. If we are 5781 * making writable only a part of the segment then we use its vpage 5782 * array to keep a record of the pages for which we have reserved 5783 * swap. In this case we set the pageswap field in the segment's 5784 * segvn structure to record this. 5785 * 5786 * If it's a private mapping to a file (i.e., vp != NULL) and we're 5787 * removing write permission on the entire segment and we haven't 5788 * modified any pages, we can release the swap space. 5789 */ 5790 if (svd->type == MAP_PRIVATE) { 5791 if (prot & PROT_WRITE) { 5792 if (!(svd->flags & MAP_NORESERVE) && 5793 !(svd->swresv && svd->pageswap == 0)) { 5794 size_t sz = 0; 5795 5796 /* 5797 * Start by determining how much swap 5798 * space is required. 5799 */ 5800 if (addr == seg->s_base && 5801 len == seg->s_size && 5802 svd->pageswap == 0) { 5803 /* The whole segment */ 5804 sz = seg->s_size; 5805 } else { 5806 /* 5807 * Make sure that the vpage array 5808 * exists, and make a note of the 5809 * range of elements corresponding 5810 * to len. 
5811 */ 5812 segvn_vpage(seg); 5813 if (svd->vpage == NULL) { 5814 SEGVN_LOCK_EXIT(seg->s_as, 5815 &svd->lock); 5816 return (ENOMEM); 5817 } 5818 svp = &svd->vpage[seg_page(seg, addr)]; 5819 evp = &svd->vpage[seg_page(seg, 5820 addr + len)]; 5821 5822 if (svd->pageswap == 0) { 5823 /* 5824 * This is the first time we've 5825 * asked for a part of this 5826 * segment, so we need to 5827 * reserve everything we've 5828 * been asked for. 5829 */ 5830 sz = len; 5831 } else { 5832 /* 5833 * We have to count the number 5834 * of pages required. 5835 */ 5836 for (cvp = svp; cvp < evp; 5837 cvp++) { 5838 if (!VPP_ISSWAPRES(cvp)) 5839 sz++; 5840 } 5841 sz <<= PAGESHIFT; 5842 } 5843 } 5844 5845 /* Try to reserve the necessary swap. */ 5846 if (anon_resv_zone(sz, 5847 seg->s_as->a_proc->p_zone) == 0) { 5848 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5849 return (IE_NOMEM); 5850 } 5851 5852 /* 5853 * Make a note of how much swap space 5854 * we've reserved. 5855 */ 5856 if (svd->pageswap == 0 && sz == seg->s_size) { 5857 svd->swresv = sz; 5858 } else { 5859 ASSERT(svd->vpage != NULL); 5860 svd->swresv += sz; 5861 svd->pageswap = 1; 5862 for (cvp = svp; cvp < evp; cvp++) { 5863 if (!VPP_ISSWAPRES(cvp)) 5864 VPP_SETSWAPRES(cvp); 5865 } 5866 } 5867 } 5868 } else { 5869 /* 5870 * Swap space is released only if this segment 5871 * does not map anonymous memory, since read faults 5872 * on such segments still need an anon slot to read 5873 * in the data. 
5874 */ 5875 if (svd->swresv != 0 && svd->vp != NULL && 5876 svd->amp == NULL && addr == seg->s_base && 5877 len == seg->s_size && svd->pageprot == 0) { 5878 ASSERT(svd->pageswap == 0); 5879 anon_unresv_zone(svd->swresv, 5880 seg->s_as->a_proc->p_zone); 5881 svd->swresv = 0; 5882 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 5883 "anon proc:%p %lu %u", seg, 0, 0); 5884 } 5885 } 5886 } 5887 5888 if (addr == seg->s_base && len == seg->s_size && svd->vpage == NULL) { 5889 if (svd->prot == prot) { 5890 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5891 return (0); /* all done */ 5892 } 5893 svd->prot = (uchar_t)prot; 5894 } else if (svd->type == MAP_PRIVATE) { 5895 struct anon *ap = NULL; 5896 page_t *pp; 5897 u_offset_t offset, off; 5898 struct anon_map *amp; 5899 ulong_t anon_idx = 0; 5900 5901 /* 5902 * A vpage structure exists or else the change does not 5903 * involve the entire segment. Establish a vpage structure 5904 * if none is there. Then, for each page in the range, 5905 * adjust its individual permissions. Note that write- 5906 * enabling a MAP_PRIVATE page can affect the claims for 5907 * locked down memory. Overcommitting memory terminates 5908 * the operation. 5909 */ 5910 segvn_vpage(seg); 5911 if (svd->vpage == NULL) { 5912 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5913 return (ENOMEM); 5914 } 5915 svd->pageprot = 1; 5916 if ((amp = svd->amp) != NULL) { 5917 anon_idx = svd->anon_index + seg_page(seg, addr); 5918 ASSERT(seg->s_szc == 0 || 5919 IS_P2ALIGNED(anon_idx, pgcnt)); 5920 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5921 } 5922 5923 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 5924 evp = &svd->vpage[seg_page(seg, addr + len)]; 5925 5926 /* 5927 * See Statement at the beginning of segvn_lockop regarding 5928 * the way cowcnts and lckcnts are handled. 
5929 */ 5930 for (svp = &svd->vpage[seg_page(seg, addr)]; svp < evp; svp++) { 5931 5932 if (seg->s_szc != 0) { 5933 if (amp != NULL) { 5934 anon_array_enter(amp, anon_idx, 5935 &cookie); 5936 } 5937 if (IS_P2ALIGNED(anon_idx, pgcnt) && 5938 !segvn_claim_pages(seg, svp, offset, 5939 anon_idx, prot)) { 5940 if (amp != NULL) { 5941 anon_array_exit(&cookie); 5942 } 5943 break; 5944 } 5945 if (amp != NULL) { 5946 anon_array_exit(&cookie); 5947 } 5948 anon_idx++; 5949 } else { 5950 if (amp != NULL) { 5951 anon_array_enter(amp, anon_idx, 5952 &cookie); 5953 ap = anon_get_ptr(amp->ahp, anon_idx++); 5954 } 5955 5956 if (VPP_ISPPLOCK(svp) && 5957 VPP_PROT(svp) != prot) { 5958 5959 if (amp == NULL || ap == NULL) { 5960 vp = svd->vp; 5961 off = offset; 5962 } else 5963 swap_xlate(ap, &vp, &off); 5964 if (amp != NULL) 5965 anon_array_exit(&cookie); 5966 5967 if ((pp = page_lookup(vp, off, 5968 SE_SHARED)) == NULL) { 5969 panic("segvn_setprot: no page"); 5970 /*NOTREACHED*/ 5971 } 5972 ASSERT(seg->s_szc == 0); 5973 if ((VPP_PROT(svp) ^ prot) & 5974 PROT_WRITE) { 5975 if (prot & PROT_WRITE) { 5976 if (!page_addclaim( 5977 pp)) { 5978 page_unlock(pp); 5979 break; 5980 } 5981 } else { 5982 if (!page_subclaim( 5983 pp)) { 5984 page_unlock(pp); 5985 break; 5986 } 5987 } 5988 } 5989 page_unlock(pp); 5990 } else if (amp != NULL) 5991 anon_array_exit(&cookie); 5992 } 5993 VPP_SETPROT(svp, prot); 5994 offset += PAGESIZE; 5995 } 5996 if (amp != NULL) 5997 ANON_LOCK_EXIT(&->a_rwlock); 5998 5999 /* 6000 * Did we terminate prematurely? If so, simply unload 6001 * the translations to the things we've updated so far. 
6002 */ 6003 if (svp != evp) { 6004 if (unload_done) { 6005 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6006 return (IE_NOMEM); 6007 } 6008 len = (svp - &svd->vpage[seg_page(seg, addr)]) * 6009 PAGESIZE; 6010 ASSERT(seg->s_szc == 0 || IS_P2ALIGNED(len, pgsz)); 6011 if (len != 0) 6012 hat_unload(seg->s_as->a_hat, addr, 6013 len, HAT_UNLOAD); 6014 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6015 return (IE_NOMEM); 6016 } 6017 } else { 6018 segvn_vpage(seg); 6019 if (svd->vpage == NULL) { 6020 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6021 return (ENOMEM); 6022 } 6023 svd->pageprot = 1; 6024 evp = &svd->vpage[seg_page(seg, addr + len)]; 6025 for (svp = &svd->vpage[seg_page(seg, addr)]; svp < evp; svp++) { 6026 VPP_SETPROT(svp, prot); 6027 } 6028 } 6029 6030 if (unload_done) { 6031 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6032 return (0); 6033 } 6034 6035 if (((prot & PROT_WRITE) != 0 && 6036 (svd->vp != NULL || svd->type == MAP_PRIVATE)) || 6037 (prot & ~PROT_USER) == PROT_NONE) { 6038 /* 6039 * Either private or shared data with write access (in 6040 * which case we need to throw out all former translations 6041 * so that we get the right translations set up on fault 6042 * and we don't allow write access to any copy-on-write pages 6043 * that might be around or to prevent write access to pages 6044 * representing holes in a file), or we don't have permission 6045 * to access the memory at all (in which case we have to 6046 * unload any current translations that might exist). 6047 */ 6048 hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD); 6049 } else { 6050 /* 6051 * A shared mapping or a private mapping in which write 6052 * protection is going to be denied - just change all the 6053 * protections over the range of addresses in question. 6054 * segvn does not support any other attributes other 6055 * than prot so we can use hat_chgattr. 
6056 */ 6057 hat_chgattr(seg->s_as->a_hat, addr, len, prot); 6058 } 6059 6060 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6061 6062 return (0); 6063 } 6064 6065 /* 6066 * segvn_setpagesize is called via SEGOP_SETPAGESIZE from as_setpagesize, 6067 * to determine if the seg is capable of mapping the requested szc. 6068 */ 6069 static int 6070 segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc) 6071 { 6072 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6073 struct segvn_data *nsvd; 6074 struct anon_map *amp = svd->amp; 6075 struct seg *nseg; 6076 caddr_t eaddr = addr + len, a; 6077 size_t pgsz = page_get_pagesize(szc); 6078 pgcnt_t pgcnt = page_get_pagecnt(szc); 6079 int err; 6080 u_offset_t off = svd->offset + (uintptr_t)(addr - seg->s_base); 6081 6082 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as)); 6083 ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size); 6084 6085 if (seg->s_szc == szc || segvn_lpg_disable != 0) { 6086 return (0); 6087 } 6088 6089 /* 6090 * addr should always be pgsz aligned but eaddr may be misaligned if 6091 * it's at the end of the segment. 6092 * 6093 * XXX we should assert this condition since as_setpagesize() logic 6094 * guarantees it. 
6095 */ 6096 if (!IS_P2ALIGNED(addr, pgsz) || 6097 (!IS_P2ALIGNED(eaddr, pgsz) && 6098 eaddr != seg->s_base + seg->s_size)) { 6099 6100 segvn_setpgsz_align_err++; 6101 return (EINVAL); 6102 } 6103 6104 if (amp != NULL && svd->type == MAP_SHARED) { 6105 ulong_t an_idx = svd->anon_index + seg_page(seg, addr); 6106 if (!IS_P2ALIGNED(an_idx, pgcnt)) { 6107 6108 segvn_setpgsz_anon_align_err++; 6109 return (EINVAL); 6110 } 6111 } 6112 6113 if ((svd->flags & MAP_NORESERVE) || seg->s_as == &kas || 6114 szc > segvn_maxpgszc) { 6115 return (EINVAL); 6116 } 6117 6118 /* paranoid check */ 6119 if (svd->vp != NULL && 6120 (IS_SWAPFSVP(svd->vp) || VN_ISKAS(svd->vp))) { 6121 return (EINVAL); 6122 } 6123 6124 if (seg->s_szc == 0 && svd->vp != NULL && 6125 map_addr_vacalign_check(addr, off)) { 6126 return (EINVAL); 6127 } 6128 6129 /* 6130 * Check that protections are the same within new page 6131 * size boundaries. 6132 */ 6133 if (svd->pageprot) { 6134 for (a = addr; a < eaddr; a += pgsz) { 6135 if ((a + pgsz) > eaddr) { 6136 if (!sameprot(seg, a, eaddr - a)) { 6137 return (EINVAL); 6138 } 6139 } else { 6140 if (!sameprot(seg, a, pgsz)) { 6141 return (EINVAL); 6142 } 6143 } 6144 } 6145 } 6146 6147 /* 6148 * Since we are changing page size we first have to flush 6149 * the cache. This makes sure all the pagelock calls have 6150 * to recheck protections. 6151 */ 6152 if (svd->softlockcnt > 0) { 6153 ASSERT(svd->tr_state == SEGVN_TR_OFF); 6154 6155 /* 6156 * If this is shared segment non 0 softlockcnt 6157 * means locked pages are still in use. 6158 */ 6159 if (svd->type == MAP_SHARED) { 6160 return (EAGAIN); 6161 } 6162 6163 /* 6164 * Since we do have the segvn writers lock nobody can fill 6165 * the cache with entries belonging to this seg during 6166 * the purge. The flush either succeeds or we still have 6167 * pending I/Os. 
6168 */ 6169 segvn_purge(seg); 6170 if (svd->softlockcnt > 0) { 6171 return (EAGAIN); 6172 } 6173 } 6174 6175 if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) { 6176 ASSERT(svd->amp == NULL); 6177 ASSERT(svd->tr_state == SEGVN_TR_OFF); 6178 hat_leave_region(seg->s_as->a_hat, svd->rcookie, 6179 HAT_REGION_TEXT); 6180 svd->rcookie = HAT_INVALID_REGION_COOKIE; 6181 } else if (svd->tr_state == SEGVN_TR_INIT) { 6182 svd->tr_state = SEGVN_TR_OFF; 6183 } else if (svd->tr_state == SEGVN_TR_ON) { 6184 ASSERT(svd->amp != NULL); 6185 segvn_textunrepl(seg, 1); 6186 ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF); 6187 amp = NULL; 6188 } 6189 6190 /* 6191 * Operation for sub range of existing segment. 6192 */ 6193 if (addr != seg->s_base || eaddr != (seg->s_base + seg->s_size)) { 6194 if (szc < seg->s_szc) { 6195 VM_STAT_ADD(segvnvmstats.demoterange[2]); 6196 err = segvn_demote_range(seg, addr, len, SDR_RANGE, 0); 6197 if (err == 0) { 6198 return (IE_RETRY); 6199 } 6200 if (err == ENOMEM) { 6201 return (IE_NOMEM); 6202 } 6203 return (err); 6204 } 6205 if (addr != seg->s_base) { 6206 nseg = segvn_split_seg(seg, addr); 6207 if (eaddr != (nseg->s_base + nseg->s_size)) { 6208 /* eaddr is szc aligned */ 6209 (void) segvn_split_seg(nseg, eaddr); 6210 } 6211 return (IE_RETRY); 6212 } 6213 if (eaddr != (seg->s_base + seg->s_size)) { 6214 /* eaddr is szc aligned */ 6215 (void) segvn_split_seg(seg, eaddr); 6216 } 6217 return (IE_RETRY); 6218 } 6219 6220 /* 6221 * Break any low level sharing and reset seg->s_szc to 0. 6222 */ 6223 if ((err = segvn_clrszc(seg)) != 0) { 6224 if (err == ENOMEM) { 6225 err = IE_NOMEM; 6226 } 6227 return (err); 6228 } 6229 ASSERT(seg->s_szc == 0); 6230 6231 /* 6232 * If the end of the current segment is not pgsz aligned 6233 * then attempt to concatenate with the next segment. 
6234 */ 6235 if (!IS_P2ALIGNED(eaddr, pgsz)) { 6236 nseg = AS_SEGNEXT(seg->s_as, seg); 6237 if (nseg == NULL || nseg == seg || eaddr != nseg->s_base) { 6238 return (ENOMEM); 6239 } 6240 if (nseg->s_ops != &segvn_ops) { 6241 return (EINVAL); 6242 } 6243 nsvd = (struct segvn_data *)nseg->s_data; 6244 if (nsvd->softlockcnt > 0) { 6245 /* 6246 * If this is shared segment non 0 softlockcnt 6247 * means locked pages are still in use. 6248 */ 6249 if (nsvd->type == MAP_SHARED) { 6250 return (EAGAIN); 6251 } 6252 segvn_purge(nseg); 6253 if (nsvd->softlockcnt > 0) { 6254 return (EAGAIN); 6255 } 6256 } 6257 err = segvn_clrszc(nseg); 6258 if (err == ENOMEM) { 6259 err = IE_NOMEM; 6260 } 6261 if (err != 0) { 6262 return (err); 6263 } 6264 ASSERT(nsvd->rcookie == HAT_INVALID_REGION_COOKIE); 6265 err = segvn_concat(seg, nseg, 1); 6266 if (err == -1) { 6267 return (EINVAL); 6268 } 6269 if (err == -2) { 6270 return (IE_NOMEM); 6271 } 6272 return (IE_RETRY); 6273 } 6274 6275 /* 6276 * May need to re-align anon array to 6277 * new szc. 
6278 */ 6279 if (amp != NULL) { 6280 if (!IS_P2ALIGNED(svd->anon_index, pgcnt)) { 6281 struct anon_hdr *nahp; 6282 6283 ASSERT(svd->type == MAP_PRIVATE); 6284 6285 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 6286 ASSERT(amp->refcnt == 1); 6287 nahp = anon_create(btop(amp->size), ANON_NOSLEEP); 6288 if (nahp == NULL) { 6289 ANON_LOCK_EXIT(&->a_rwlock); 6290 return (IE_NOMEM); 6291 } 6292 if (anon_copy_ptr(amp->ahp, svd->anon_index, 6293 nahp, 0, btop(seg->s_size), ANON_NOSLEEP)) { 6294 anon_release(nahp, btop(amp->size)); 6295 ANON_LOCK_EXIT(&->a_rwlock); 6296 return (IE_NOMEM); 6297 } 6298 anon_release(amp->ahp, btop(amp->size)); 6299 amp->ahp = nahp; 6300 svd->anon_index = 0; 6301 ANON_LOCK_EXIT(&->a_rwlock); 6302 } 6303 } 6304 if (svd->vp != NULL && szc != 0) { 6305 struct vattr va; 6306 u_offset_t eoffpage = svd->offset; 6307 va.va_mask = AT_SIZE; 6308 eoffpage += seg->s_size; 6309 eoffpage = btopr(eoffpage); 6310 if (VOP_GETATTR(svd->vp, &va, 0, svd->cred, NULL) != 0) { 6311 segvn_setpgsz_getattr_err++; 6312 return (EINVAL); 6313 } 6314 if (btopr(va.va_size) < eoffpage) { 6315 segvn_setpgsz_eof_err++; 6316 return (EINVAL); 6317 } 6318 if (amp != NULL) { 6319 /* 6320 * anon_fill_cow_holes() may call VOP_GETPAGE(). 6321 * don't take anon map lock here to avoid holding it 6322 * across VOP_GETPAGE() calls that may call back into 6323 * segvn for klsutering checks. We don't really need 6324 * anon map lock here since it's a private segment and 6325 * we hold as level lock as writers. 
6326 */ 6327 if ((err = anon_fill_cow_holes(seg, seg->s_base, 6328 amp->ahp, svd->anon_index, svd->vp, svd->offset, 6329 seg->s_size, szc, svd->prot, svd->vpage, 6330 svd->cred)) != 0) { 6331 return (EINVAL); 6332 } 6333 } 6334 segvn_setvnode_mpss(svd->vp); 6335 } 6336 6337 if (amp != NULL) { 6338 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 6339 if (svd->type == MAP_PRIVATE) { 6340 amp->a_szc = szc; 6341 } else if (szc > amp->a_szc) { 6342 amp->a_szc = szc; 6343 } 6344 ANON_LOCK_EXIT(&->a_rwlock); 6345 } 6346 6347 seg->s_szc = szc; 6348 6349 return (0); 6350 } 6351 6352 static int 6353 segvn_clrszc(struct seg *seg) 6354 { 6355 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6356 struct anon_map *amp = svd->amp; 6357 size_t pgsz; 6358 pgcnt_t pages; 6359 int err = 0; 6360 caddr_t a = seg->s_base; 6361 caddr_t ea = a + seg->s_size; 6362 ulong_t an_idx = svd->anon_index; 6363 vnode_t *vp = svd->vp; 6364 struct vpage *vpage = svd->vpage; 6365 page_t *anon_pl[1 + 1], *pp; 6366 struct anon *ap, *oldap; 6367 uint_t prot = svd->prot, vpprot; 6368 int pageflag = 0; 6369 6370 ASSERT(AS_WRITE_HELD(seg->s_as) || 6371 SEGVN_WRITE_HELD(seg->s_as, &svd->lock)); 6372 ASSERT(svd->softlockcnt == 0); 6373 6374 if (vp == NULL && amp == NULL) { 6375 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 6376 seg->s_szc = 0; 6377 return (0); 6378 } 6379 6380 if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) { 6381 ASSERT(svd->amp == NULL); 6382 ASSERT(svd->tr_state == SEGVN_TR_OFF); 6383 hat_leave_region(seg->s_as->a_hat, svd->rcookie, 6384 HAT_REGION_TEXT); 6385 svd->rcookie = HAT_INVALID_REGION_COOKIE; 6386 } else if (svd->tr_state == SEGVN_TR_ON) { 6387 ASSERT(svd->amp != NULL); 6388 segvn_textunrepl(seg, 1); 6389 ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF); 6390 amp = NULL; 6391 } else { 6392 if (svd->tr_state != SEGVN_TR_OFF) { 6393 ASSERT(svd->tr_state == SEGVN_TR_INIT); 6394 svd->tr_state = SEGVN_TR_OFF; 6395 } 6396 6397 /* 6398 * do HAT_UNLOAD_UNMAP since we are 
changing the pagesize. 6399 * unload argument is 0 when we are freeing the segment 6400 * and unload was already done. 6401 */ 6402 hat_unload(seg->s_as->a_hat, seg->s_base, seg->s_size, 6403 HAT_UNLOAD_UNMAP); 6404 } 6405 6406 if (amp == NULL || svd->type == MAP_SHARED) { 6407 seg->s_szc = 0; 6408 return (0); 6409 } 6410 6411 pgsz = page_get_pagesize(seg->s_szc); 6412 pages = btop(pgsz); 6413 6414 /* 6415 * XXX anon rwlock is not really needed because this is a 6416 * private segment and we are writers. 6417 */ 6418 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 6419 6420 for (; a < ea; a += pgsz, an_idx += pages) { 6421 if ((oldap = anon_get_ptr(amp->ahp, an_idx)) != NULL) { 6422 ASSERT(vpage != NULL || svd->pageprot == 0); 6423 if (vpage != NULL) { 6424 ASSERT(sameprot(seg, a, pgsz)); 6425 prot = VPP_PROT(vpage); 6426 pageflag = VPP_ISPPLOCK(vpage) ? LOCK_PAGE : 0; 6427 } 6428 if (seg->s_szc != 0) { 6429 ASSERT(vp == NULL || anon_pages(amp->ahp, 6430 an_idx, pages) == pages); 6431 if ((err = anon_map_demotepages(amp, an_idx, 6432 seg, a, prot, vpage, svd->cred)) != 0) { 6433 goto out; 6434 } 6435 } else { 6436 if (oldap->an_refcnt == 1) { 6437 continue; 6438 } 6439 if ((err = anon_getpage(&oldap, &vpprot, 6440 anon_pl, PAGESIZE, seg, a, S_READ, 6441 svd->cred))) { 6442 goto out; 6443 } 6444 if ((pp = anon_private(&ap, seg, a, prot, 6445 anon_pl[0], pageflag, svd->cred)) == NULL) { 6446 err = ENOMEM; 6447 goto out; 6448 } 6449 anon_decref(oldap); 6450 (void) anon_set_ptr(amp->ahp, an_idx, ap, 6451 ANON_SLEEP); 6452 page_unlock(pp); 6453 } 6454 } 6455 vpage = (vpage == NULL) ? 
NULL : vpage + pages; 6456 } 6457 6458 amp->a_szc = 0; 6459 seg->s_szc = 0; 6460 out: 6461 ANON_LOCK_EXIT(&->a_rwlock); 6462 return (err); 6463 } 6464 6465 static int 6466 segvn_claim_pages( 6467 struct seg *seg, 6468 struct vpage *svp, 6469 u_offset_t off, 6470 ulong_t anon_idx, 6471 uint_t prot) 6472 { 6473 pgcnt_t pgcnt = page_get_pagecnt(seg->s_szc); 6474 size_t ppasize = (pgcnt + 1) * sizeof (page_t *); 6475 page_t **ppa; 6476 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6477 struct anon_map *amp = svd->amp; 6478 struct vpage *evp = svp + pgcnt; 6479 caddr_t addr = ((uintptr_t)(svp - svd->vpage) << PAGESHIFT) 6480 + seg->s_base; 6481 struct anon *ap; 6482 struct vnode *vp = svd->vp; 6483 page_t *pp; 6484 pgcnt_t pg_idx, i; 6485 int err = 0; 6486 anoff_t aoff; 6487 int anon = (amp != NULL) ? 1 : 0; 6488 6489 ASSERT(svd->type == MAP_PRIVATE); 6490 ASSERT(svd->vpage != NULL); 6491 ASSERT(seg->s_szc != 0); 6492 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 6493 ASSERT(amp == NULL || IS_P2ALIGNED(anon_idx, pgcnt)); 6494 ASSERT(sameprot(seg, addr, pgcnt << PAGESHIFT)); 6495 6496 if (VPP_PROT(svp) == prot) 6497 return (1); 6498 if (!((VPP_PROT(svp) ^ prot) & PROT_WRITE)) 6499 return (1); 6500 6501 ppa = kmem_alloc(ppasize, KM_SLEEP); 6502 if (anon && vp != NULL) { 6503 if (anon_get_ptr(amp->ahp, anon_idx) == NULL) { 6504 anon = 0; 6505 ASSERT(!anon_pages(amp->ahp, anon_idx, pgcnt)); 6506 } 6507 ASSERT(!anon || 6508 anon_pages(amp->ahp, anon_idx, pgcnt) == pgcnt); 6509 } 6510 6511 for (*ppa = NULL, pg_idx = 0; svp < evp; svp++, anon_idx++) { 6512 if (!VPP_ISPPLOCK(svp)) 6513 continue; 6514 if (anon) { 6515 ap = anon_get_ptr(amp->ahp, anon_idx); 6516 if (ap == NULL) { 6517 panic("segvn_claim_pages: no anon slot"); 6518 } 6519 swap_xlate(ap, &vp, &aoff); 6520 off = (u_offset_t)aoff; 6521 } 6522 ASSERT(vp != NULL); 6523 if ((pp = page_lookup(vp, 6524 (u_offset_t)off, SE_SHARED)) == NULL) { 6525 panic("segvn_claim_pages: no page"); 6526 } 6527 ppa[pg_idx++] = pp; 6528 
off += PAGESIZE; 6529 } 6530 6531 if (ppa[0] == NULL) { 6532 kmem_free(ppa, ppasize); 6533 return (1); 6534 } 6535 6536 ASSERT(pg_idx <= pgcnt); 6537 ppa[pg_idx] = NULL; 6538 6539 6540 /* Find each large page within ppa, and adjust its claim */ 6541 6542 /* Does ppa cover a single large page? */ 6543 if (ppa[0]->p_szc == seg->s_szc) { 6544 if (prot & PROT_WRITE) 6545 err = page_addclaim_pages(ppa); 6546 else 6547 err = page_subclaim_pages(ppa); 6548 } else { 6549 for (i = 0; ppa[i]; i += pgcnt) { 6550 ASSERT(IS_P2ALIGNED(page_pptonum(ppa[i]), pgcnt)); 6551 if (prot & PROT_WRITE) 6552 err = page_addclaim_pages(&ppa[i]); 6553 else 6554 err = page_subclaim_pages(&ppa[i]); 6555 if (err == 0) 6556 break; 6557 } 6558 } 6559 6560 for (i = 0; i < pg_idx; i++) { 6561 ASSERT(ppa[i] != NULL); 6562 page_unlock(ppa[i]); 6563 } 6564 6565 kmem_free(ppa, ppasize); 6566 return (err); 6567 } 6568 6569 /* 6570 * Returns right (upper address) segment if split occurred. 6571 * If the address is equal to the beginning or end of its segment it returns 6572 * the current segment. 
6573 */
static struct seg *
segvn_split_seg(struct seg *seg, caddr_t addr)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	struct segvn_data *nsvd;
	struct seg *nseg;
	size_t upper_len;

	ASSERT(AS_WRITE_HELD(seg->s_as));
	ASSERT(svd->tr_state == SEGVN_TR_OFF);

	ASSERT(addr >= seg->s_base);
	ASSERT(addr <= seg->s_base + seg->s_size);
	ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);

	/* A split point on either boundary leaves the segment untouched. */
	if (addr == seg->s_base || addr == seg->s_base + seg->s_size)
		return (seg);

	/*
	 * Shrink the existing segment to [s_base, addr) and create a new
	 * one for [addr, old end).
	 */
	upper_len = seg->s_base + seg->s_size - addr;
	seg->s_size = addr - seg->s_base;
	nseg = seg_alloc(seg->s_as, addr, upper_len);
	ASSERT(nseg != NULL);
	nseg->s_ops = seg->s_ops;
	nseg->s_szc = seg->s_szc;

	/*
	 * The new private data starts out as a copy of the old one; the
	 * fields that must differ are fixed up below.
	 */
	nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP);
	*nsvd = *svd;
	nseg->s_data = (void *)nsvd;
	ASSERT(nsvd->rcookie == HAT_INVALID_REGION_COOKIE);
	nsvd->seg = nseg;
	rw_init(&nsvd->lock, NULL, RW_DEFAULT, NULL);

	if (nsvd->vp == NULL) {
		/*
		 * For an anonymous segment the offset carries no meaning as
		 * a position within a file.  Deriving it from the address
		 * delta (as done for vnode segments) would make the output
		 * of /proc/<pid>/xmap harder to read: two seemingly
		 * contiguous prxmap_t structures could then be either
		 * different segments or one segment reported in pieces
		 * (e.g. when part of it has not been faulted in yet).
		 */
		nsvd->offset = 0;
	} else {
		VN_HOLD(nsvd->vp);
		nsvd->offset = svd->offset +
		    (uintptr_t)(nseg->s_base - seg->s_base);
		if (nsvd->type == MAP_SHARED)
			lgrp_shm_policy_init(NULL, nsvd->vp);
	}

	ASSERT(svd->softlockcnt == 0);
	ASSERT(svd->softlockcnt_sbase == 0);
	ASSERT(svd->softlockcnt_send == 0);
	crhold(svd->cred);

	/* Give each half its own copy of its part of the vpage array. */
	if (svd->vpage != NULL) {
		struct vpage *whole = svd->vpage;
		size_t lower_sz = vpgtob(seg_pages(seg));
		size_t upper_sz = vpgtob(seg_pages(nseg));

		svd->vpage = kmem_alloc(lower_sz, KM_SLEEP);
		nsvd->vpage = kmem_alloc(upper_sz, KM_SLEEP);
		bcopy(whole, svd->vpage, lower_sz);
		bcopy(whole + seg_pages(seg), nsvd->vpage, upper_sz);
		kmem_free(whole, lower_sz + upper_sz);
	}

	if (svd->amp != NULL) {
		if (svd->type == MAP_PRIVATE) {
			/*
			 * Private anon map: the lower half keeps the map
			 * with a new, smaller header; the upper half gets a
			 * freshly allocated map.  Both end up at index 0.
			 */
			struct anon_map *oamp = svd->amp;
			struct anon_map *upper_amp;
			struct anon_hdr *lower_ahp;
			pgcnt_t lower_pgs = btop(seg->s_size);

			ANON_LOCK_ENTER(&oamp->a_rwlock, RW_WRITER);
			ASSERT(oamp->refcnt == 1);

			lower_ahp = anon_create(lower_pgs, ANON_SLEEP);
			(void) anon_copy_ptr(oamp->ahp, svd->anon_index,
			    lower_ahp, 0, lower_pgs, ANON_SLEEP);

			upper_amp = anonmap_alloc(nseg->s_size, 0, ANON_SLEEP);
			upper_amp->a_szc = nseg->s_szc;
			(void) anon_copy_ptr(oamp->ahp,
			    svd->anon_index + lower_pgs,
			    upper_amp->ahp, 0, btop(nseg->s_size), ANON_SLEEP);

			anon_release(oamp->ahp, btop(oamp->size));
			oamp->ahp = lower_ahp;
			oamp->size = seg->s_size;
			svd->anon_index = 0;
			nsvd->amp = upper_amp;
			nsvd->anon_index = 0;
			ANON_LOCK_EXIT(&oamp->a_rwlock);
		} else {
			/*
			 * Any other mapping type: both halves keep using the
			 * same anon map, so take another reference and point
			 * the upper half at its first slot.
			 */
			pgcnt_t pgcnt = page_get_pagecnt(seg->s_szc);
			ASSERT(svd->amp == nsvd->amp);
			ASSERT(seg->s_szc <= svd->amp->a_szc);
			nsvd->anon_index = svd->anon_index + seg_pages(seg);
			ASSERT(IS_P2ALIGNED(nsvd->anon_index, pgcnt));
			ANON_LOCK_ENTER(&svd->amp->a_rwlock, RW_WRITER);
			svd->amp->refcnt++;
			ANON_LOCK_EXIT(&svd->amp->a_rwlock);
		}
	}

	/* No swap was reserved, so there is nothing left to divide. */
	if (svd->swresv == 0)
		return (nseg);

	/*
	 * Divide the swap reservation between the two halves.  With
	 * MAP_NORESERVE only the pages actually in use are charged;
	 * otherwise each half is charged either per its vpage marks or for
	 * its full size.
	 */
	if (svd->flags & MAP_NORESERVE) {
		size_t oswresv;

		ASSERT(svd->amp);
		oswresv = svd->swresv;
		svd->swresv = ptob(anon_pages(svd->amp->ahp,
		    svd->anon_index, btop(seg->s_size)));
		nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp,
		    nsvd->anon_index, btop(nseg->s_size)));
		ASSERT(oswresv >= (svd->swresv + nsvd->swresv));
	} else if (svd->pageswap) {
		/* nsvd->swresv still holds the pre-split total here. */
		svd->swresv = segvn_count_swap_by_vpages(seg);
		ASSERT(nsvd->swresv >= svd->swresv);
		nsvd->swresv -= svd->swresv;
	} else {
		ASSERT(svd->swresv == seg->s_size +
		    nseg->s_size);
		svd->swresv = seg->s_size;
		nsvd->swresv = nseg->s_size;
	}

	return (nseg);
}

6711 /* 6712 * called on memory operations (unmap, setprot, setpagesize) for a subset 6713 * of a large page segment to either demote the memory range (SDR_RANGE) 6714 * or the ends (SDR_END) by addr/len. 6715 * 6716 * returns 0 on success. returns errno, including ENOMEM, on failure.
6717 */ 6718 static int 6719 segvn_demote_range( 6720 struct seg *seg, 6721 caddr_t addr, 6722 size_t len, 6723 int flag, 6724 uint_t szcvec) 6725 { 6726 caddr_t eaddr = addr + len; 6727 caddr_t lpgaddr, lpgeaddr; 6728 struct seg *nseg; 6729 struct seg *badseg1 = NULL; 6730 struct seg *badseg2 = NULL; 6731 size_t pgsz; 6732 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6733 int err; 6734 uint_t szc = seg->s_szc; 6735 uint_t tszcvec; 6736 6737 ASSERT(AS_WRITE_HELD(seg->s_as)); 6738 ASSERT(svd->tr_state == SEGVN_TR_OFF); 6739 ASSERT(szc != 0); 6740 pgsz = page_get_pagesize(szc); 6741 ASSERT(seg->s_base != addr || seg->s_size != len); 6742 ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size); 6743 ASSERT(svd->softlockcnt == 0); 6744 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 6745 ASSERT(szcvec == 0 || (flag == SDR_END && svd->type == MAP_SHARED)); 6746 6747 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 6748 ASSERT(flag == SDR_RANGE || eaddr < lpgeaddr || addr > lpgaddr); 6749 if (flag == SDR_RANGE) { 6750 /* demote entire range */ 6751 badseg1 = nseg = segvn_split_seg(seg, lpgaddr); 6752 (void) segvn_split_seg(nseg, lpgeaddr); 6753 ASSERT(badseg1->s_base == lpgaddr); 6754 ASSERT(badseg1->s_size == lpgeaddr - lpgaddr); 6755 } else if (addr != lpgaddr) { 6756 ASSERT(flag == SDR_END); 6757 badseg1 = nseg = segvn_split_seg(seg, lpgaddr); 6758 if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz && 6759 eaddr < lpgaddr + 2 * pgsz) { 6760 (void) segvn_split_seg(nseg, lpgeaddr); 6761 ASSERT(badseg1->s_base == lpgaddr); 6762 ASSERT(badseg1->s_size == 2 * pgsz); 6763 } else { 6764 nseg = segvn_split_seg(nseg, lpgaddr + pgsz); 6765 ASSERT(badseg1->s_base == lpgaddr); 6766 ASSERT(badseg1->s_size == pgsz); 6767 if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz) { 6768 ASSERT(lpgeaddr - lpgaddr > 2 * pgsz); 6769 nseg = segvn_split_seg(nseg, lpgeaddr - pgsz); 6770 badseg2 = nseg; 6771 (void) segvn_split_seg(nseg, lpgeaddr); 6772 
ASSERT(badseg2->s_base == lpgeaddr - pgsz); 6773 ASSERT(badseg2->s_size == pgsz); 6774 } 6775 } 6776 } else { 6777 ASSERT(flag == SDR_END); 6778 ASSERT(eaddr < lpgeaddr); 6779 badseg1 = nseg = segvn_split_seg(seg, lpgeaddr - pgsz); 6780 (void) segvn_split_seg(nseg, lpgeaddr); 6781 ASSERT(badseg1->s_base == lpgeaddr - pgsz); 6782 ASSERT(badseg1->s_size == pgsz); 6783 } 6784 6785 ASSERT(badseg1 != NULL); 6786 ASSERT(badseg1->s_szc == szc); 6787 ASSERT(flag == SDR_RANGE || badseg1->s_size == pgsz || 6788 badseg1->s_size == 2 * pgsz); 6789 ASSERT(sameprot(badseg1, badseg1->s_base, pgsz)); 6790 ASSERT(badseg1->s_size == pgsz || 6791 sameprot(badseg1, badseg1->s_base + pgsz, pgsz)); 6792 if (err = segvn_clrszc(badseg1)) { 6793 return (err); 6794 } 6795 ASSERT(badseg1->s_szc == 0); 6796 6797 if (szc > 1 && (tszcvec = P2PHASE(szcvec, 1 << szc)) > 1) { 6798 uint_t tszc = highbit(tszcvec) - 1; 6799 caddr_t ta = MAX(addr, badseg1->s_base); 6800 caddr_t te; 6801 size_t tpgsz = page_get_pagesize(tszc); 6802 6803 ASSERT(svd->type == MAP_SHARED); 6804 ASSERT(flag == SDR_END); 6805 ASSERT(tszc < szc && tszc > 0); 6806 6807 if (eaddr > badseg1->s_base + badseg1->s_size) { 6808 te = badseg1->s_base + badseg1->s_size; 6809 } else { 6810 te = eaddr; 6811 } 6812 6813 ASSERT(ta <= te); 6814 badseg1->s_szc = tszc; 6815 if (!IS_P2ALIGNED(ta, tpgsz) || !IS_P2ALIGNED(te, tpgsz)) { 6816 if (badseg2 != NULL) { 6817 err = segvn_demote_range(badseg1, ta, te - ta, 6818 SDR_END, tszcvec); 6819 if (err != 0) { 6820 return (err); 6821 } 6822 } else { 6823 return (segvn_demote_range(badseg1, ta, 6824 te - ta, SDR_END, tszcvec)); 6825 } 6826 } 6827 } 6828 6829 if (badseg2 == NULL) 6830 return (0); 6831 ASSERT(badseg2->s_szc == szc); 6832 ASSERT(badseg2->s_size == pgsz); 6833 ASSERT(sameprot(badseg2, badseg2->s_base, badseg2->s_size)); 6834 if (err = segvn_clrszc(badseg2)) { 6835 return (err); 6836 } 6837 ASSERT(badseg2->s_szc == 0); 6838 6839 if (szc > 1 && (tszcvec = P2PHASE(szcvec, 1 << szc)) > 1) 
{ 6840 uint_t tszc = highbit(tszcvec) - 1; 6841 size_t tpgsz = page_get_pagesize(tszc); 6842 6843 ASSERT(svd->type == MAP_SHARED); 6844 ASSERT(flag == SDR_END); 6845 ASSERT(tszc < szc && tszc > 0); 6846 ASSERT(badseg2->s_base > addr); 6847 ASSERT(eaddr > badseg2->s_base); 6848 ASSERT(eaddr < badseg2->s_base + badseg2->s_size); 6849 6850 badseg2->s_szc = tszc; 6851 if (!IS_P2ALIGNED(eaddr, tpgsz)) { 6852 return (segvn_demote_range(badseg2, badseg2->s_base, 6853 eaddr - badseg2->s_base, SDR_END, tszcvec)); 6854 } 6855 } 6856 6857 return (0); 6858 } 6859 6860 static int 6861 segvn_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) 6862 { 6863 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6864 struct vpage *vp, *evp; 6865 6866 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); 6867 6868 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6869 /* 6870 * If segment protection can be used, simply check against them. 6871 */ 6872 if (svd->pageprot == 0) { 6873 int err; 6874 6875 err = ((svd->prot & prot) != prot) ? EACCES : 0; 6876 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6877 return (err); 6878 } 6879 6880 /* 6881 * Have to check down to the vpage level. 
6882 */ 6883 evp = &svd->vpage[seg_page(seg, addr + len)]; 6884 for (vp = &svd->vpage[seg_page(seg, addr)]; vp < evp; vp++) { 6885 if ((VPP_PROT(vp) & prot) != prot) { 6886 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6887 return (EACCES); 6888 } 6889 } 6890 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6891 return (0); 6892 } 6893 6894 static int 6895 segvn_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv) 6896 { 6897 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6898 size_t pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1; 6899 6900 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); 6901 6902 if (pgno != 0) { 6903 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6904 if (svd->pageprot == 0) { 6905 do { 6906 protv[--pgno] = svd->prot; 6907 } while (pgno != 0); 6908 } else { 6909 size_t pgoff = seg_page(seg, addr); 6910 6911 do { 6912 pgno--; 6913 protv[pgno] = VPP_PROT(&svd->vpage[pgno+pgoff]); 6914 } while (pgno != 0); 6915 } 6916 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6917 } 6918 return (0); 6919 } 6920 6921 static u_offset_t 6922 segvn_getoffset(struct seg *seg, caddr_t addr) 6923 { 6924 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6925 6926 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); 6927 6928 return (svd->offset + (uintptr_t)(addr - seg->s_base)); 6929 } 6930 6931 /*ARGSUSED*/ 6932 static int 6933 segvn_gettype(struct seg *seg, caddr_t addr) 6934 { 6935 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6936 6937 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); 6938 6939 return (svd->type | (svd->flags & (MAP_NORESERVE | MAP_TEXT | 6940 MAP_INITDATA))); 6941 } 6942 6943 /*ARGSUSED*/ 6944 static int 6945 segvn_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp) 6946 { 6947 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6948 6949 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); 6950 6951 *vpp = svd->vp; 6952 return (0); 6953 } 6954 6955 /* 6956 * Check to see if it makes sense to do kluster/read 
ahead to 6957 * addr + delta relative to the mapping at addr. We assume here 6958 * that delta is a signed PAGESIZE'd multiple (which can be negative). 6959 * 6960 * For segvn, we currently "approve" of the action if we are 6961 * still in the segment and it maps from the same vp/off, 6962 * or if the advice stored in segvn_data or vpages allows it. 6963 * Currently, klustering is not allowed only if MADV_RANDOM is set. 6964 */ 6965 static int 6966 segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta) 6967 { 6968 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6969 struct anon *oap, *ap; 6970 ssize_t pd; 6971 size_t page; 6972 struct vnode *vp1, *vp2; 6973 u_offset_t off1, off2; 6974 struct anon_map *amp; 6975 6976 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); 6977 ASSERT(AS_WRITE_HELD(seg->s_as) || 6978 SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 6979 6980 if (addr + delta < seg->s_base || 6981 addr + delta >= (seg->s_base + seg->s_size)) 6982 return (-1); /* exceeded segment bounds */ 6983 6984 pd = delta / (ssize_t)PAGESIZE; /* divide to preserve sign bit */ 6985 page = seg_page(seg, addr); 6986 6987 /* 6988 * Check to see if either of the pages addr or addr + delta 6989 * have advice set that prevents klustering (if MADV_RANDOM advice 6990 * is set for entire segment, or MADV_SEQUENTIAL is set and delta 6991 * is negative). 
6992 */ 6993 if (svd->advice == MADV_RANDOM || 6994 svd->advice == MADV_SEQUENTIAL && delta < 0) 6995 return (-1); 6996 else if (svd->pageadvice && svd->vpage) { 6997 struct vpage *bvpp, *evpp; 6998 6999 bvpp = &svd->vpage[page]; 7000 evpp = &svd->vpage[page + pd]; 7001 if (VPP_ADVICE(bvpp) == MADV_RANDOM || 7002 VPP_ADVICE(evpp) == MADV_SEQUENTIAL && delta < 0) 7003 return (-1); 7004 if (VPP_ADVICE(bvpp) != VPP_ADVICE(evpp) && 7005 VPP_ADVICE(evpp) == MADV_RANDOM) 7006 return (-1); 7007 } 7008 7009 if (svd->type == MAP_SHARED) 7010 return (0); /* shared mapping - all ok */ 7011 7012 if ((amp = svd->amp) == NULL) 7013 return (0); /* off original vnode */ 7014 7015 page += svd->anon_index; 7016 7017 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7018 7019 oap = anon_get_ptr(amp->ahp, page); 7020 ap = anon_get_ptr(amp->ahp, page + pd); 7021 7022 ANON_LOCK_EXIT(&->a_rwlock); 7023 7024 if ((oap == NULL && ap != NULL) || (oap != NULL && ap == NULL)) { 7025 return (-1); /* one with and one without an anon */ 7026 } 7027 7028 if (oap == NULL) { /* implies that ap == NULL */ 7029 return (0); /* off original vnode */ 7030 } 7031 7032 /* 7033 * Now we know we have two anon pointers - check to 7034 * see if they happen to be properly allocated. 7035 */ 7036 7037 /* 7038 * XXX We cheat here and don't lock the anon slots. We can't because 7039 * we may have been called from the anon layer which might already 7040 * have locked them. We are holding a refcnt on the slots so they 7041 * can't disappear. The worst that will happen is we'll get the wrong 7042 * names (vp, off) for the slots and make a poor klustering decision. 7043 */ 7044 swap_xlate(ap, &vp1, &off1); 7045 swap_xlate(oap, &vp2, &off2); 7046 7047 7048 if (!VOP_CMP(vp1, vp2, NULL) || off1 - off2 != delta) 7049 return (-1); 7050 return (0); 7051 } 7052 7053 /* 7054 * Swap the pages of seg out to secondary storage, returning the 7055 * number of bytes of storage freed. 
7056 * 7057 * The basic idea is first to unload all translations and then to call 7058 * VOP_PUTPAGE() for all newly-unmapped pages, to push them out to the 7059 * swap device. Pages to which other segments have mappings will remain 7060 * mapped and won't be swapped. Our caller (as_swapout) has already 7061 * performed the unloading step. 7062 * 7063 * The value returned is intended to correlate well with the process's 7064 * memory requirements. However, there are some caveats: 7065 * 1) When given a shared segment as argument, this routine will 7066 * only succeed in swapping out pages for the last sharer of the 7067 * segment. (Previous callers will only have decremented mapping 7068 * reference counts.) 7069 * 2) We assume that the hat layer maintains a large enough translation 7070 * cache to capture process reference patterns. 7071 */ 7072 static size_t 7073 segvn_swapout(struct seg *seg) 7074 { 7075 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7076 struct anon_map *amp; 7077 pgcnt_t pgcnt = 0; 7078 pgcnt_t npages; 7079 pgcnt_t page; 7080 ulong_t anon_index; 7081 7082 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); 7083 7084 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7085 /* 7086 * Find pages unmapped by our caller and force them 7087 * out to the virtual swap device. 7088 */ 7089 if ((amp = svd->amp) != NULL) 7090 anon_index = svd->anon_index; 7091 npages = seg->s_size >> PAGESHIFT; 7092 for (page = 0; page < npages; page++) { 7093 page_t *pp; 7094 struct anon *ap; 7095 struct vnode *vp; 7096 u_offset_t off; 7097 anon_sync_obj_t cookie; 7098 7099 /* 7100 * Obtain <vp, off> pair for the page, then look it up. 7101 * 7102 * Note that this code is willing to consider regular 7103 * pages as well as anon pages. Is this appropriate here? 
7104 */ 7105 ap = NULL; 7106 if (amp != NULL) { 7107 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7108 if (anon_array_try_enter(amp, anon_index + page, 7109 &cookie)) { 7110 ANON_LOCK_EXIT(&->a_rwlock); 7111 continue; 7112 } 7113 ap = anon_get_ptr(amp->ahp, anon_index + page); 7114 if (ap != NULL) { 7115 swap_xlate(ap, &vp, &off); 7116 } else { 7117 vp = svd->vp; 7118 off = svd->offset + ptob(page); 7119 } 7120 anon_array_exit(&cookie); 7121 ANON_LOCK_EXIT(&->a_rwlock); 7122 } else { 7123 vp = svd->vp; 7124 off = svd->offset + ptob(page); 7125 } 7126 if (vp == NULL) { /* untouched zfod page */ 7127 ASSERT(ap == NULL); 7128 continue; 7129 } 7130 7131 pp = page_lookup_nowait(vp, off, SE_SHARED); 7132 if (pp == NULL) 7133 continue; 7134 7135 7136 /* 7137 * Examine the page to see whether it can be tossed out, 7138 * keeping track of how many we've found. 7139 */ 7140 if (!page_tryupgrade(pp)) { 7141 /* 7142 * If the page has an i/o lock and no mappings, 7143 * it's very likely that the page is being 7144 * written out as a result of klustering. 7145 * Assume this is so and take credit for it here. 7146 */ 7147 if (!page_io_trylock(pp)) { 7148 if (!hat_page_is_mapped(pp)) 7149 pgcnt++; 7150 } else { 7151 page_io_unlock(pp); 7152 } 7153 page_unlock(pp); 7154 continue; 7155 } 7156 ASSERT(!page_iolock_assert(pp)); 7157 7158 7159 /* 7160 * Skip if page is locked or has mappings. 7161 * We don't need the page_struct_lock to look at lckcnt 7162 * and cowcnt because the page is exclusive locked. 7163 */ 7164 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0 || 7165 hat_page_is_mapped(pp)) { 7166 page_unlock(pp); 7167 continue; 7168 } 7169 7170 /* 7171 * dispose skips large pages so try to demote first. 7172 */ 7173 if (pp->p_szc != 0 && !page_try_demote_pages(pp)) { 7174 page_unlock(pp); 7175 /* 7176 * XXX should skip the remaining page_t's of this 7177 * large page. 
7178 */ 7179 continue; 7180 } 7181 7182 ASSERT(pp->p_szc == 0); 7183 7184 /* 7185 * No longer mapped -- we can toss it out. How 7186 * we do so depends on whether or not it's dirty. 7187 */ 7188 if (hat_ismod(pp) && pp->p_vnode) { 7189 /* 7190 * We must clean the page before it can be 7191 * freed. Setting B_FREE will cause pvn_done 7192 * to free the page when the i/o completes. 7193 * XXX: This also causes it to be accounted 7194 * as a pageout instead of a swap: need 7195 * B_SWAPOUT bit to use instead of B_FREE. 7196 * 7197 * Hold the vnode before releasing the page lock 7198 * to prevent it from being freed and re-used by 7199 * some other thread. 7200 */ 7201 VN_HOLD(vp); 7202 page_unlock(pp); 7203 7204 /* 7205 * Queue all i/o requests for the pageout thread 7206 * to avoid saturating the pageout devices. 7207 */ 7208 if (!queue_io_request(vp, off)) 7209 VN_RELE(vp); 7210 } else { 7211 /* 7212 * The page was clean, free it. 7213 * 7214 * XXX: Can we ever encounter modified pages 7215 * with no associated vnode here? 7216 */ 7217 ASSERT(pp->p_vnode != NULL); 7218 /*LINTED: constant in conditional context*/ 7219 VN_DISPOSE(pp, B_FREE, 0, kcred); 7220 } 7221 7222 /* 7223 * Credit now even if i/o is in progress. 7224 */ 7225 pgcnt++; 7226 } 7227 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7228 7229 /* 7230 * Wakeup pageout to initiate i/o on all queued requests. 7231 */ 7232 cv_signal_pageout(); 7233 return (ptob(pgcnt)); 7234 } 7235 7236 /* 7237 * Synchronize primary storage cache with real object in virtual memory. 7238 * 7239 * XXX - Anonymous pages should not be sync'ed out at all. 
7240 */ 7241 static int 7242 segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) 7243 { 7244 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7245 struct vpage *vpp; 7246 page_t *pp; 7247 u_offset_t offset; 7248 struct vnode *vp; 7249 u_offset_t off; 7250 caddr_t eaddr; 7251 int bflags; 7252 int err = 0; 7253 int segtype; 7254 int pageprot; 7255 int prot; 7256 ulong_t anon_index; 7257 struct anon_map *amp; 7258 struct anon *ap; 7259 anon_sync_obj_t cookie; 7260 7261 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); 7262 7263 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7264 7265 if (svd->softlockcnt > 0) { 7266 /* 7267 * If this is shared segment non 0 softlockcnt 7268 * means locked pages are still in use. 7269 */ 7270 if (svd->type == MAP_SHARED) { 7271 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7272 return (EAGAIN); 7273 } 7274 7275 /* 7276 * flush all pages from seg cache 7277 * otherwise we may deadlock in swap_putpage 7278 * for B_INVAL page (4175402). 7279 * 7280 * Even if we grab segvn WRITER's lock 7281 * here, there might be another thread which could've 7282 * successfully performed lookup/insert just before 7283 * we acquired the lock here. So, grabbing either 7284 * lock here is of not much use. Until we devise 7285 * a strategy at upper layers to solve the 7286 * synchronization issues completely, we expect 7287 * applications to handle this appropriately. 7288 */ 7289 segvn_purge(seg); 7290 if (svd->softlockcnt > 0) { 7291 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7292 return (EAGAIN); 7293 } 7294 } else if (svd->type == MAP_SHARED && svd->amp != NULL && 7295 svd->amp->a_softlockcnt > 0) { 7296 /* 7297 * Try to purge this amp's entries from pcache. It will 7298 * succeed only if other segments that share the amp have no 7299 * outstanding softlock's. 
7300 */ 7301 segvn_purge(seg); 7302 if (svd->amp->a_softlockcnt > 0 || svd->softlockcnt > 0) { 7303 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7304 return (EAGAIN); 7305 } 7306 } 7307 7308 vpp = svd->vpage; 7309 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 7310 bflags = ((flags & MS_ASYNC) ? B_ASYNC : 0) | 7311 ((flags & MS_INVALIDATE) ? B_INVAL : 0); 7312 7313 if (attr) { 7314 pageprot = attr & ~(SHARED|PRIVATE); 7315 segtype = (attr & SHARED) ? MAP_SHARED : MAP_PRIVATE; 7316 7317 /* 7318 * We are done if the segment types don't match 7319 * or if we have segment level protections and 7320 * they don't match. 7321 */ 7322 if (svd->type != segtype) { 7323 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7324 return (0); 7325 } 7326 if (vpp == NULL) { 7327 if (svd->prot != pageprot) { 7328 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7329 return (0); 7330 } 7331 prot = svd->prot; 7332 } else 7333 vpp = &svd->vpage[seg_page(seg, addr)]; 7334 7335 } else if (svd->vp && svd->amp == NULL && 7336 (flags & MS_INVALIDATE) == 0) { 7337 7338 /* 7339 * No attributes, no anonymous pages and MS_INVALIDATE flag 7340 * is not on, just use one big request. 
7341 */ 7342 err = VOP_PUTPAGE(svd->vp, (offset_t)offset, len, 7343 bflags, svd->cred, NULL); 7344 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7345 return (err); 7346 } 7347 7348 if ((amp = svd->amp) != NULL) 7349 anon_index = svd->anon_index + seg_page(seg, addr); 7350 7351 for (eaddr = addr + len; addr < eaddr; addr += PAGESIZE) { 7352 ap = NULL; 7353 if (amp != NULL) { 7354 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7355 anon_array_enter(amp, anon_index, &cookie); 7356 ap = anon_get_ptr(amp->ahp, anon_index++); 7357 if (ap != NULL) { 7358 swap_xlate(ap, &vp, &off); 7359 } else { 7360 vp = svd->vp; 7361 off = offset; 7362 } 7363 anon_array_exit(&cookie); 7364 ANON_LOCK_EXIT(&->a_rwlock); 7365 } else { 7366 vp = svd->vp; 7367 off = offset; 7368 } 7369 offset += PAGESIZE; 7370 7371 if (vp == NULL) /* untouched zfod page */ 7372 continue; 7373 7374 if (attr) { 7375 if (vpp) { 7376 prot = VPP_PROT(vpp); 7377 vpp++; 7378 } 7379 if (prot != pageprot) { 7380 continue; 7381 } 7382 } 7383 7384 /* 7385 * See if any of these pages are locked -- if so, then we 7386 * will have to truncate an invalidate request at the first 7387 * locked one. We don't need the page_struct_lock to test 7388 * as this is only advisory; even if we acquire it someone 7389 * might race in and lock the page after we unlock and before 7390 * we do the PUTPAGE, then PUTPAGE simply does nothing. 7391 */ 7392 if (flags & MS_INVALIDATE) { 7393 if ((pp = page_lookup(vp, off, SE_SHARED)) != NULL) { 7394 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 7395 page_unlock(pp); 7396 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7397 return (EBUSY); 7398 } 7399 if (ap != NULL && pp->p_szc != 0 && 7400 page_tryupgrade(pp)) { 7401 if (pp->p_lckcnt == 0 && 7402 pp->p_cowcnt == 0) { 7403 /* 7404 * swapfs VN_DISPOSE() won't 7405 * invalidate large pages. 7406 * Attempt to demote. 7407 * XXX can't help it if it 7408 * fails. But for swapfs 7409 * pages it is no big deal. 
7410 */ 7411 (void) page_try_demote_pages( 7412 pp); 7413 } 7414 } 7415 page_unlock(pp); 7416 } 7417 } else if (svd->type == MAP_SHARED && amp != NULL) { 7418 /* 7419 * Avoid writing out to disk ISM's large pages 7420 * because segspt_free_pages() relies on NULL an_pvp 7421 * of anon slots of such pages. 7422 */ 7423 7424 ASSERT(svd->vp == NULL); 7425 /* 7426 * swapfs uses page_lookup_nowait if not freeing or 7427 * invalidating and skips a page if 7428 * page_lookup_nowait returns NULL. 7429 */ 7430 pp = page_lookup_nowait(vp, off, SE_SHARED); 7431 if (pp == NULL) { 7432 continue; 7433 } 7434 if (pp->p_szc != 0) { 7435 page_unlock(pp); 7436 continue; 7437 } 7438 7439 /* 7440 * Note ISM pages are created large so (vp, off)'s 7441 * page cannot suddenly become large after we unlock 7442 * pp. 7443 */ 7444 page_unlock(pp); 7445 } 7446 /* 7447 * XXX - Should ultimately try to kluster 7448 * calls to VOP_PUTPAGE() for performance. 7449 */ 7450 VN_HOLD(vp); 7451 err = VOP_PUTPAGE(vp, (offset_t)off, PAGESIZE, 7452 (bflags | (IS_SWAPFSVP(vp) ? B_PAGE_NOWAIT : 0)), 7453 svd->cred, NULL); 7454 7455 VN_RELE(vp); 7456 if (err) 7457 break; 7458 } 7459 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7460 return (err); 7461 } 7462 7463 /* 7464 * Determine if we have data corresponding to pages in the 7465 * primary storage virtual memory cache (i.e., "in core"). 
7466 */ 7467 static size_t 7468 segvn_incore(struct seg *seg, caddr_t addr, size_t len, char *vec) 7469 { 7470 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7471 struct vnode *vp, *avp; 7472 u_offset_t offset, aoffset; 7473 size_t p, ep; 7474 int ret; 7475 struct vpage *vpp; 7476 page_t *pp; 7477 uint_t start; 7478 struct anon_map *amp; /* XXX - for locknest */ 7479 struct anon *ap; 7480 uint_t attr; 7481 anon_sync_obj_t cookie; 7482 7483 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); 7484 7485 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7486 if (svd->amp == NULL && svd->vp == NULL) { 7487 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7488 bzero(vec, btopr(len)); 7489 return (len); /* no anonymous pages created yet */ 7490 } 7491 7492 p = seg_page(seg, addr); 7493 ep = seg_page(seg, addr + len); 7494 start = svd->vp ? SEG_PAGE_VNODEBACKED : 0; 7495 7496 amp = svd->amp; 7497 for (; p < ep; p++, addr += PAGESIZE) { 7498 vpp = (svd->vpage) ? &svd->vpage[p]: NULL; 7499 ret = start; 7500 ap = NULL; 7501 avp = NULL; 7502 /* Grab the vnode/offset for the anon slot */ 7503 if (amp != NULL) { 7504 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7505 anon_array_enter(amp, svd->anon_index + p, &cookie); 7506 ap = anon_get_ptr(amp->ahp, svd->anon_index + p); 7507 if (ap != NULL) { 7508 swap_xlate(ap, &avp, &aoffset); 7509 } 7510 anon_array_exit(&cookie); 7511 ANON_LOCK_EXIT(&->a_rwlock); 7512 } 7513 if ((avp != NULL) && page_exists(avp, aoffset)) { 7514 /* A page exists for the anon slot */ 7515 ret |= SEG_PAGE_INCORE; 7516 7517 /* 7518 * If page is mapped and writable 7519 */ 7520 attr = (uint_t)0; 7521 if ((hat_getattr(seg->s_as->a_hat, addr, 7522 &attr) != -1) && (attr & PROT_WRITE)) { 7523 ret |= SEG_PAGE_ANON; 7524 } 7525 /* 7526 * Don't get page_struct lock for lckcnt and cowcnt, 7527 * since this is purely advisory. 
7528 */ 7529 if ((pp = page_lookup_nowait(avp, aoffset, 7530 SE_SHARED)) != NULL) { 7531 if (pp->p_lckcnt) 7532 ret |= SEG_PAGE_SOFTLOCK; 7533 if (pp->p_cowcnt) 7534 ret |= SEG_PAGE_HASCOW; 7535 page_unlock(pp); 7536 } 7537 } 7538 7539 /* Gather vnode statistics */ 7540 vp = svd->vp; 7541 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 7542 7543 if (vp != NULL) { 7544 /* 7545 * Try to obtain a "shared" lock on the page 7546 * without blocking. If this fails, determine 7547 * if the page is in memory. 7548 */ 7549 pp = page_lookup_nowait(vp, offset, SE_SHARED); 7550 if ((pp == NULL) && (page_exists(vp, offset))) { 7551 /* Page is incore, and is named */ 7552 ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE); 7553 } 7554 /* 7555 * Don't get page_struct lock for lckcnt and cowcnt, 7556 * since this is purely advisory. 7557 */ 7558 if (pp != NULL) { 7559 ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE); 7560 if (pp->p_lckcnt) 7561 ret |= SEG_PAGE_SOFTLOCK; 7562 if (pp->p_cowcnt) 7563 ret |= SEG_PAGE_HASCOW; 7564 page_unlock(pp); 7565 } 7566 } 7567 7568 /* Gather virtual page information */ 7569 if (vpp) { 7570 if (VPP_ISPPLOCK(vpp)) 7571 ret |= SEG_PAGE_LOCKED; 7572 vpp++; 7573 } 7574 7575 *vec++ = (char)ret; 7576 } 7577 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7578 return (len); 7579 } 7580 7581 /* 7582 * Statement for p_cowcnts/p_lckcnts. 7583 * 7584 * p_cowcnt is updated while mlock/munlocking MAP_PRIVATE and PROT_WRITE region 7585 * irrespective of the following factors or anything else: 7586 * 7587 * (1) anon slots are populated or not 7588 * (2) cow is broken or not 7589 * (3) refcnt on ap is 1 or greater than 1 7590 * 7591 * If it's not MAP_PRIVATE and PROT_WRITE, p_lckcnt is updated during mlock 7592 * and munlock. 
 *
 *
 * Handling p_cowcnts/p_lckcnts during copy-on-write fault:
 *
 *	if vpage has PROT_WRITE
 *		transfer cowcnt on the oldpage -> cowcnt on the newpage
 *	else
 *		transfer lckcnt on the oldpage -> lckcnt on the newpage
 *
 *	During copy-on-write, decrement p_cowcnt on the oldpage and increment
 *	p_cowcnt on the newpage *if* the corresponding vpage has PROT_WRITE.
 *
 *	We may also break COW if softlocking on read access in the physio case.
 *	In this case, vpage may not have PROT_WRITE. So, we need to decrement
 *	p_lckcnt on the oldpage and increment p_lckcnt on the newpage *if* the
 *	vpage doesn't have PROT_WRITE.
 *
 *
 * Handling p_cowcnts/p_lckcnts during mprotect on mlocked region:
 *
 *	If a MAP_PRIVATE region loses PROT_WRITE, we decrement p_cowcnt and
 *	increment p_lckcnt by calling page_subclaim() which takes care of
 *	availrmem accounting and p_lckcnt overflow.
 *
 *	If a MAP_PRIVATE region gains PROT_WRITE, we decrement p_lckcnt and
 *	increment p_cowcnt by calling page_addclaim() which takes care of
 *	availrmem availability and p_cowcnt overflow.
 */

/*
 * Lock down (or unlock) pages mapped by this segment.
 *
 * XXX only creates PAGESIZE pages if anon slots are not initialized.
 * At fault time they will be relocated into larger pages.
7627 */ 7628 static int 7629 segvn_lockop(struct seg *seg, caddr_t addr, size_t len, 7630 int attr, int op, ulong_t *lockmap, size_t pos) 7631 { 7632 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7633 struct vpage *vpp; 7634 struct vpage *evp; 7635 page_t *pp; 7636 u_offset_t offset; 7637 u_offset_t off; 7638 int segtype; 7639 int pageprot; 7640 int claim; 7641 struct vnode *vp; 7642 ulong_t anon_index; 7643 struct anon_map *amp; 7644 struct anon *ap; 7645 struct vattr va; 7646 anon_sync_obj_t cookie; 7647 struct kshmid *sp = NULL; 7648 struct proc *p = curproc; 7649 kproject_t *proj = NULL; 7650 int chargeproc = 1; 7651 size_t locked_bytes = 0; 7652 size_t unlocked_bytes = 0; 7653 int err = 0; 7654 7655 /* 7656 * Hold write lock on address space because may split or concatenate 7657 * segments 7658 */ 7659 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); 7660 7661 /* 7662 * If this is a shm, use shm's project and zone, else use 7663 * project and zone of calling process 7664 */ 7665 7666 /* Determine if this segment backs a sysV shm */ 7667 if (svd->amp != NULL && svd->amp->a_sp != NULL) { 7668 ASSERT(svd->type == MAP_SHARED); 7669 ASSERT(svd->tr_state == SEGVN_TR_OFF); 7670 sp = svd->amp->a_sp; 7671 proj = sp->shm_perm.ipc_proj; 7672 chargeproc = 0; 7673 } 7674 7675 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 7676 if (attr) { 7677 pageprot = attr & ~(SHARED|PRIVATE); 7678 segtype = attr & SHARED ? MAP_SHARED : MAP_PRIVATE; 7679 7680 /* 7681 * We are done if the segment types don't match 7682 * or if we have segment level protections and 7683 * they don't match. 
7684 */ 7685 if (svd->type != segtype) { 7686 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7687 return (0); 7688 } 7689 if (svd->pageprot == 0 && svd->prot != pageprot) { 7690 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7691 return (0); 7692 } 7693 } 7694 7695 if (op == MC_LOCK) { 7696 if (svd->tr_state == SEGVN_TR_INIT) { 7697 svd->tr_state = SEGVN_TR_OFF; 7698 } else if (svd->tr_state == SEGVN_TR_ON) { 7699 ASSERT(svd->amp != NULL); 7700 segvn_textunrepl(seg, 0); 7701 ASSERT(svd->amp == NULL && 7702 svd->tr_state == SEGVN_TR_OFF); 7703 } 7704 } 7705 7706 /* 7707 * If we're locking, then we must create a vpage structure if 7708 * none exists. If we're unlocking, then check to see if there 7709 * is a vpage -- if not, then we could not have locked anything. 7710 */ 7711 7712 if ((vpp = svd->vpage) == NULL) { 7713 if (op == MC_LOCK) { 7714 segvn_vpage(seg); 7715 if (svd->vpage == NULL) { 7716 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7717 return (ENOMEM); 7718 } 7719 } else { 7720 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7721 return (0); 7722 } 7723 } 7724 7725 /* 7726 * The anonymous data vector (i.e., previously 7727 * unreferenced mapping to swap space) can be allocated 7728 * by lazily testing for its existence. 
7729 */ 7730 if (op == MC_LOCK && svd->amp == NULL && svd->vp == NULL) { 7731 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 7732 svd->amp = anonmap_alloc(seg->s_size, 0, ANON_SLEEP); 7733 svd->amp->a_szc = seg->s_szc; 7734 } 7735 7736 if ((amp = svd->amp) != NULL) { 7737 anon_index = svd->anon_index + seg_page(seg, addr); 7738 } 7739 7740 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 7741 evp = &svd->vpage[seg_page(seg, addr + len)]; 7742 7743 if (sp != NULL) 7744 mutex_enter(&sp->shm_mlock); 7745 7746 /* determine number of unlocked bytes in range for lock operation */ 7747 if (op == MC_LOCK) { 7748 7749 if (sp == NULL) { 7750 for (vpp = &svd->vpage[seg_page(seg, addr)]; vpp < evp; 7751 vpp++) { 7752 if (!VPP_ISPPLOCK(vpp)) 7753 unlocked_bytes += PAGESIZE; 7754 } 7755 } else { 7756 ulong_t i_idx, i_edx; 7757 anon_sync_obj_t i_cookie; 7758 struct anon *i_ap; 7759 struct vnode *i_vp; 7760 u_offset_t i_off; 7761 7762 /* Only count sysV pages once for locked memory */ 7763 i_edx = svd->anon_index + seg_page(seg, addr + len); 7764 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7765 for (i_idx = anon_index; i_idx < i_edx; i_idx++) { 7766 anon_array_enter(amp, i_idx, &i_cookie); 7767 i_ap = anon_get_ptr(amp->ahp, i_idx); 7768 if (i_ap == NULL) { 7769 unlocked_bytes += PAGESIZE; 7770 anon_array_exit(&i_cookie); 7771 continue; 7772 } 7773 swap_xlate(i_ap, &i_vp, &i_off); 7774 anon_array_exit(&i_cookie); 7775 pp = page_lookup(i_vp, i_off, SE_SHARED); 7776 if (pp == NULL) { 7777 unlocked_bytes += PAGESIZE; 7778 continue; 7779 } else if (pp->p_lckcnt == 0) 7780 unlocked_bytes += PAGESIZE; 7781 page_unlock(pp); 7782 } 7783 ANON_LOCK_EXIT(&->a_rwlock); 7784 } 7785 7786 mutex_enter(&p->p_lock); 7787 err = rctl_incr_locked_mem(p, proj, unlocked_bytes, 7788 chargeproc); 7789 mutex_exit(&p->p_lock); 7790 7791 if (err) { 7792 if (sp != NULL) 7793 mutex_exit(&sp->shm_mlock); 7794 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7795 return (err); 7796 } 7797 } 7798 /* 7799 * Loop 
over all pages in the range. Process if we're locking and 7800 * page has not already been locked in this mapping; or if we're 7801 * unlocking and the page has been locked. 7802 */ 7803 for (vpp = &svd->vpage[seg_page(seg, addr)]; vpp < evp; 7804 vpp++, pos++, addr += PAGESIZE, offset += PAGESIZE, anon_index++) { 7805 if ((attr == 0 || VPP_PROT(vpp) == pageprot) && 7806 ((op == MC_LOCK && !VPP_ISPPLOCK(vpp)) || 7807 (op == MC_UNLOCK && VPP_ISPPLOCK(vpp)))) { 7808 7809 if (amp != NULL) 7810 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7811 /* 7812 * If this isn't a MAP_NORESERVE segment and 7813 * we're locking, allocate anon slots if they 7814 * don't exist. The page is brought in later on. 7815 */ 7816 if (op == MC_LOCK && svd->vp == NULL && 7817 ((svd->flags & MAP_NORESERVE) == 0) && 7818 amp != NULL && 7819 ((ap = anon_get_ptr(amp->ahp, anon_index)) 7820 == NULL)) { 7821 anon_array_enter(amp, anon_index, &cookie); 7822 7823 if ((ap = anon_get_ptr(amp->ahp, 7824 anon_index)) == NULL) { 7825 pp = anon_zero(seg, addr, &ap, 7826 svd->cred); 7827 if (pp == NULL) { 7828 anon_array_exit(&cookie); 7829 ANON_LOCK_EXIT(&->a_rwlock); 7830 err = ENOMEM; 7831 goto out; 7832 } 7833 ASSERT(anon_get_ptr(amp->ahp, 7834 anon_index) == NULL); 7835 (void) anon_set_ptr(amp->ahp, 7836 anon_index, ap, ANON_SLEEP); 7837 page_unlock(pp); 7838 } 7839 anon_array_exit(&cookie); 7840 } 7841 7842 /* 7843 * Get name for page, accounting for 7844 * existence of private copy. 
7845 */ 7846 ap = NULL; 7847 if (amp != NULL) { 7848 anon_array_enter(amp, anon_index, &cookie); 7849 ap = anon_get_ptr(amp->ahp, anon_index); 7850 if (ap != NULL) { 7851 swap_xlate(ap, &vp, &off); 7852 } else { 7853 if (svd->vp == NULL && 7854 (svd->flags & MAP_NORESERVE)) { 7855 anon_array_exit(&cookie); 7856 ANON_LOCK_EXIT(&->a_rwlock); 7857 continue; 7858 } 7859 vp = svd->vp; 7860 off = offset; 7861 } 7862 if (op != MC_LOCK || ap == NULL) { 7863 anon_array_exit(&cookie); 7864 ANON_LOCK_EXIT(&->a_rwlock); 7865 } 7866 } else { 7867 vp = svd->vp; 7868 off = offset; 7869 } 7870 7871 /* 7872 * Get page frame. It's ok if the page is 7873 * not available when we're unlocking, as this 7874 * may simply mean that a page we locked got 7875 * truncated out of existence after we locked it. 7876 * 7877 * Invoke VOP_GETPAGE() to obtain the page struct 7878 * since we may need to read it from disk if its 7879 * been paged out. 7880 */ 7881 if (op != MC_LOCK) 7882 pp = page_lookup(vp, off, SE_SHARED); 7883 else { 7884 page_t *pl[1 + 1]; 7885 int error; 7886 7887 ASSERT(vp != NULL); 7888 7889 error = VOP_GETPAGE(vp, (offset_t)off, PAGESIZE, 7890 (uint_t *)NULL, pl, PAGESIZE, seg, addr, 7891 S_OTHER, svd->cred, NULL); 7892 7893 if (error && ap != NULL) { 7894 anon_array_exit(&cookie); 7895 ANON_LOCK_EXIT(&->a_rwlock); 7896 } 7897 7898 /* 7899 * If the error is EDEADLK then we must bounce 7900 * up and drop all vm subsystem locks and then 7901 * retry the operation later 7902 * This behavior is a temporary measure because 7903 * ufs/sds logging is badly designed and will 7904 * deadlock if we don't allow this bounce to 7905 * happen. The real solution is to re-design 7906 * the logging code to work properly. See bug 7907 * 4125102 for details of the problem. 7908 */ 7909 if (error == EDEADLK) { 7910 err = error; 7911 goto out; 7912 } 7913 /* 7914 * Quit if we fail to fault in the page. 
Treat 7915 * the failure as an error, unless the addr 7916 * is mapped beyond the end of a file. 7917 */ 7918 if (error && svd->vp) { 7919 va.va_mask = AT_SIZE; 7920 if (VOP_GETATTR(svd->vp, &va, 0, 7921 svd->cred, NULL) != 0) { 7922 err = EIO; 7923 goto out; 7924 } 7925 if (btopr(va.va_size) >= 7926 btopr(off + 1)) { 7927 err = EIO; 7928 goto out; 7929 } 7930 goto out; 7931 7932 } else if (error) { 7933 err = EIO; 7934 goto out; 7935 } 7936 pp = pl[0]; 7937 ASSERT(pp != NULL); 7938 } 7939 7940 /* 7941 * See Statement at the beginning of this routine. 7942 * 7943 * claim is always set if MAP_PRIVATE and PROT_WRITE 7944 * irrespective of following factors: 7945 * 7946 * (1) anon slots are populated or not 7947 * (2) cow is broken or not 7948 * (3) refcnt on ap is 1 or greater than 1 7949 * 7950 * See 4140683 for details 7951 */ 7952 claim = ((VPP_PROT(vpp) & PROT_WRITE) && 7953 (svd->type == MAP_PRIVATE)); 7954 7955 /* 7956 * Perform page-level operation appropriate to 7957 * operation. If locking, undo the SOFTLOCK 7958 * performed to bring the page into memory 7959 * after setting the lock. If unlocking, 7960 * and no page was found, account for the claim 7961 * separately. 
7962 */ 7963 if (op == MC_LOCK) { 7964 int ret = 1; /* Assume success */ 7965 7966 ASSERT(!VPP_ISPPLOCK(vpp)); 7967 7968 ret = page_pp_lock(pp, claim, 0); 7969 if (ap != NULL) { 7970 if (ap->an_pvp != NULL) { 7971 anon_swap_free(ap, pp); 7972 } 7973 anon_array_exit(&cookie); 7974 ANON_LOCK_EXIT(&->a_rwlock); 7975 } 7976 if (ret == 0) { 7977 /* locking page failed */ 7978 page_unlock(pp); 7979 err = EAGAIN; 7980 goto out; 7981 } 7982 VPP_SETPPLOCK(vpp); 7983 if (sp != NULL) { 7984 if (pp->p_lckcnt == 1) 7985 locked_bytes += PAGESIZE; 7986 } else 7987 locked_bytes += PAGESIZE; 7988 7989 if (lockmap != (ulong_t *)NULL) 7990 BT_SET(lockmap, pos); 7991 7992 page_unlock(pp); 7993 } else { 7994 ASSERT(VPP_ISPPLOCK(vpp)); 7995 if (pp != NULL) { 7996 /* sysV pages should be locked */ 7997 ASSERT(sp == NULL || pp->p_lckcnt > 0); 7998 page_pp_unlock(pp, claim, 0); 7999 if (sp != NULL) { 8000 if (pp->p_lckcnt == 0) 8001 unlocked_bytes 8002 += PAGESIZE; 8003 } else 8004 unlocked_bytes += PAGESIZE; 8005 page_unlock(pp); 8006 } else { 8007 ASSERT(sp == NULL); 8008 unlocked_bytes += PAGESIZE; 8009 } 8010 VPP_CLRPPLOCK(vpp); 8011 } 8012 } 8013 } 8014 out: 8015 if (op == MC_LOCK) { 8016 /* Credit back bytes that did not get locked */ 8017 if ((unlocked_bytes - locked_bytes) > 0) { 8018 if (proj == NULL) 8019 mutex_enter(&p->p_lock); 8020 rctl_decr_locked_mem(p, proj, 8021 (unlocked_bytes - locked_bytes), chargeproc); 8022 if (proj == NULL) 8023 mutex_exit(&p->p_lock); 8024 } 8025 8026 } else { 8027 /* Account bytes that were unlocked */ 8028 if (unlocked_bytes > 0) { 8029 if (proj == NULL) 8030 mutex_enter(&p->p_lock); 8031 rctl_decr_locked_mem(p, proj, unlocked_bytes, 8032 chargeproc); 8033 if (proj == NULL) 8034 mutex_exit(&p->p_lock); 8035 } 8036 } 8037 if (sp != NULL) 8038 mutex_exit(&sp->shm_mlock); 8039 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8040 8041 return (err); 8042 } 8043 8044 /* 8045 * Set advice from user for specified pages 8046 * There are 9 types of advice: 8047 * 
MADV_NORMAL - Normal (default) behavior (whatever that is) 8048 * MADV_RANDOM - Random page references 8049 * do not allow readahead or 'klustering' 8050 * MADV_SEQUENTIAL - Sequential page references 8051 * Pages previous to the one currently being 8052 * accessed (determined by fault) are 'not needed' 8053 * and are freed immediately 8054 * MADV_WILLNEED - Pages are likely to be used (fault ahead in mctl) 8055 * MADV_DONTNEED - Pages are not needed (synced out in mctl) 8056 * MADV_FREE - Contents can be discarded 8057 * MADV_ACCESS_DEFAULT- Default access 8058 * MADV_ACCESS_LWP - Next LWP will access heavily 8059 * MADV_ACCESS_MANY- Many LWPs or processes will access heavily 8060 */ 8061 static int 8062 segvn_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav) 8063 { 8064 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 8065 size_t page; 8066 int err = 0; 8067 int already_set; 8068 struct anon_map *amp; 8069 ulong_t anon_index; 8070 struct seg *next; 8071 lgrp_mem_policy_t policy; 8072 struct seg *prev; 8073 struct vnode *vp; 8074 8075 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); 8076 8077 /* 8078 * In case of MADV_FREE, we won't be modifying any segment private 8079 * data structures; so, we only need to grab READER's lock 8080 */ 8081 if (behav != MADV_FREE) { 8082 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 8083 if (svd->tr_state != SEGVN_TR_OFF) { 8084 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8085 return (0); 8086 } 8087 } else { 8088 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 8089 } 8090 8091 /* 8092 * Large pages are assumed to be only turned on when accesses to the 8093 * segment's address range have spatial and temporal locality. That 8094 * justifies ignoring MADV_SEQUENTIAL for large page segments. 
8095 * Also, ignore advice affecting lgroup memory allocation 8096 * if don't need to do lgroup optimizations on this system 8097 */ 8098 8099 if ((behav == MADV_SEQUENTIAL && 8100 (seg->s_szc != 0 || HAT_IS_REGION_COOKIE_VALID(svd->rcookie))) || 8101 (!lgrp_optimizations() && (behav == MADV_ACCESS_DEFAULT || 8102 behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY))) { 8103 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8104 return (0); 8105 } 8106 8107 if (behav == MADV_SEQUENTIAL || behav == MADV_ACCESS_DEFAULT || 8108 behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY) { 8109 /* 8110 * Since we are going to unload hat mappings 8111 * we first have to flush the cache. Otherwise 8112 * this might lead to system panic if another 8113 * thread is doing physio on the range whose 8114 * mappings are unloaded by madvise(3C). 8115 */ 8116 if (svd->softlockcnt > 0) { 8117 /* 8118 * If this is shared segment non 0 softlockcnt 8119 * means locked pages are still in use. 8120 */ 8121 if (svd->type == MAP_SHARED) { 8122 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8123 return (EAGAIN); 8124 } 8125 /* 8126 * Since we do have the segvn writers lock 8127 * nobody can fill the cache with entries 8128 * belonging to this seg during the purge. 8129 * The flush either succeeds or we still 8130 * have pending I/Os. In the later case, 8131 * madvise(3C) fails. 8132 */ 8133 segvn_purge(seg); 8134 if (svd->softlockcnt > 0) { 8135 /* 8136 * Since madvise(3C) is advisory and 8137 * it's not part of UNIX98, madvise(3C) 8138 * failure here doesn't cause any hardship. 8139 * Note that we don't block in "as" layer. 8140 */ 8141 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8142 return (EAGAIN); 8143 } 8144 } else if (svd->type == MAP_SHARED && svd->amp != NULL && 8145 svd->amp->a_softlockcnt > 0) { 8146 /* 8147 * Try to purge this amp's entries from pcache. It 8148 * will succeed only if other segments that share the 8149 * amp have no outstanding softlock's. 
8150 */ 8151 segvn_purge(seg); 8152 } 8153 } 8154 8155 amp = svd->amp; 8156 vp = svd->vp; 8157 if (behav == MADV_FREE) { 8158 /* 8159 * MADV_FREE is not supported for segments with 8160 * underlying object; if anonmap is NULL, anon slots 8161 * are not yet populated and there is nothing for 8162 * us to do. As MADV_FREE is advisory, we don't 8163 * return error in either case. 8164 */ 8165 if (vp != NULL || amp == NULL) { 8166 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8167 return (0); 8168 } 8169 8170 segvn_purge(seg); 8171 8172 page = seg_page(seg, addr); 8173 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 8174 anon_disclaim(amp, svd->anon_index + page, len); 8175 ANON_LOCK_EXIT(&->a_rwlock); 8176 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8177 return (0); 8178 } 8179 8180 /* 8181 * If advice is to be applied to entire segment, 8182 * use advice field in seg_data structure 8183 * otherwise use appropriate vpage entry. 8184 */ 8185 if ((addr == seg->s_base) && (len == seg->s_size)) { 8186 switch (behav) { 8187 case MADV_ACCESS_LWP: 8188 case MADV_ACCESS_MANY: 8189 case MADV_ACCESS_DEFAULT: 8190 /* 8191 * Set memory allocation policy for this segment 8192 */ 8193 policy = lgrp_madv_to_policy(behav, len, svd->type); 8194 if (svd->type == MAP_SHARED) 8195 already_set = lgrp_shm_policy_set(policy, amp, 8196 svd->anon_index, vp, svd->offset, len); 8197 else { 8198 /* 8199 * For private memory, need writers lock on 8200 * address space because the segment may be 8201 * split or concatenated when changing policy 8202 */ 8203 if (AS_READ_HELD(seg->s_as)) { 8204 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8205 return (IE_RETRY); 8206 } 8207 8208 already_set = lgrp_privm_policy_set(policy, 8209 &svd->policy_info, len); 8210 } 8211 8212 /* 8213 * If policy set already and it shouldn't be reapplied, 8214 * don't do anything. 
8215 */ 8216 if (already_set && 8217 !LGRP_MEM_POLICY_REAPPLICABLE(policy)) 8218 break; 8219 8220 /* 8221 * Mark any existing pages in given range for 8222 * migration 8223 */ 8224 page_mark_migrate(seg, addr, len, amp, svd->anon_index, 8225 vp, svd->offset, 1); 8226 8227 /* 8228 * If same policy set already or this is a shared 8229 * memory segment, don't need to try to concatenate 8230 * segment with adjacent ones. 8231 */ 8232 if (already_set || svd->type == MAP_SHARED) 8233 break; 8234 8235 /* 8236 * Try to concatenate this segment with previous 8237 * one and next one, since we changed policy for 8238 * this one and it may be compatible with adjacent 8239 * ones now. 8240 */ 8241 prev = AS_SEGPREV(seg->s_as, seg); 8242 next = AS_SEGNEXT(seg->s_as, seg); 8243 8244 if (next && next->s_ops == &segvn_ops && 8245 addr + len == next->s_base) 8246 (void) segvn_concat(seg, next, 1); 8247 8248 if (prev && prev->s_ops == &segvn_ops && 8249 addr == prev->s_base + prev->s_size) { 8250 /* 8251 * Drop lock for private data of current 8252 * segment before concatenating (deleting) it 8253 * and return IE_REATTACH to tell as_ctl() that 8254 * current segment has changed 8255 */ 8256 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8257 if (!segvn_concat(prev, seg, 1)) 8258 err = IE_REATTACH; 8259 8260 return (err); 8261 } 8262 break; 8263 8264 case MADV_SEQUENTIAL: 8265 /* 8266 * unloading mapping guarantees 8267 * detection in segvn_fault 8268 */ 8269 ASSERT(seg->s_szc == 0); 8270 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 8271 hat_unload(seg->s_as->a_hat, addr, len, 8272 HAT_UNLOAD); 8273 /* FALLTHROUGH */ 8274 case MADV_NORMAL: 8275 case MADV_RANDOM: 8276 svd->advice = (uchar_t)behav; 8277 svd->pageadvice = 0; 8278 break; 8279 case MADV_WILLNEED: /* handled in memcntl */ 8280 case MADV_DONTNEED: /* handled in memcntl */ 8281 case MADV_FREE: /* handled above */ 8282 break; 8283 default: 8284 err = EINVAL; 8285 } 8286 } else { 8287 caddr_t eaddr; 8288 struct seg *new_seg; 
8289 struct segvn_data *new_svd; 8290 u_offset_t off; 8291 caddr_t oldeaddr; 8292 8293 page = seg_page(seg, addr); 8294 8295 segvn_vpage(seg); 8296 if (svd->vpage == NULL) { 8297 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8298 return (ENOMEM); 8299 } 8300 8301 switch (behav) { 8302 struct vpage *bvpp, *evpp; 8303 8304 case MADV_ACCESS_LWP: 8305 case MADV_ACCESS_MANY: 8306 case MADV_ACCESS_DEFAULT: 8307 /* 8308 * Set memory allocation policy for portion of this 8309 * segment 8310 */ 8311 8312 /* 8313 * Align address and length of advice to page 8314 * boundaries for large pages 8315 */ 8316 if (seg->s_szc != 0) { 8317 size_t pgsz; 8318 8319 pgsz = page_get_pagesize(seg->s_szc); 8320 addr = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); 8321 len = P2ROUNDUP(len, pgsz); 8322 } 8323 8324 /* 8325 * Check to see whether policy is set already 8326 */ 8327 policy = lgrp_madv_to_policy(behav, len, svd->type); 8328 8329 anon_index = svd->anon_index + page; 8330 off = svd->offset + (uintptr_t)(addr - seg->s_base); 8331 8332 if (svd->type == MAP_SHARED) 8333 already_set = lgrp_shm_policy_set(policy, amp, 8334 anon_index, vp, off, len); 8335 else 8336 already_set = 8337 (policy == svd->policy_info.mem_policy); 8338 8339 /* 8340 * If policy set already and it shouldn't be reapplied, 8341 * don't do anything. 
8342 */ 8343 if (already_set && 8344 !LGRP_MEM_POLICY_REAPPLICABLE(policy)) 8345 break; 8346 8347 /* 8348 * For private memory, need writers lock on 8349 * address space because the segment may be 8350 * split or concatenated when changing policy 8351 */ 8352 if (svd->type == MAP_PRIVATE && 8353 AS_READ_HELD(seg->s_as)) { 8354 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8355 return (IE_RETRY); 8356 } 8357 8358 /* 8359 * Mark any existing pages in given range for 8360 * migration 8361 */ 8362 page_mark_migrate(seg, addr, len, amp, svd->anon_index, 8363 vp, svd->offset, 1); 8364 8365 /* 8366 * Don't need to try to split or concatenate 8367 * segments, since policy is same or this is a shared 8368 * memory segment 8369 */ 8370 if (already_set || svd->type == MAP_SHARED) 8371 break; 8372 8373 if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) { 8374 ASSERT(svd->amp == NULL); 8375 ASSERT(svd->tr_state == SEGVN_TR_OFF); 8376 ASSERT(svd->softlockcnt == 0); 8377 hat_leave_region(seg->s_as->a_hat, svd->rcookie, 8378 HAT_REGION_TEXT); 8379 svd->rcookie = HAT_INVALID_REGION_COOKIE; 8380 } 8381 8382 /* 8383 * Split off new segment if advice only applies to a 8384 * portion of existing segment starting in middle 8385 */ 8386 new_seg = NULL; 8387 eaddr = addr + len; 8388 oldeaddr = seg->s_base + seg->s_size; 8389 if (addr > seg->s_base) { 8390 /* 8391 * Must flush I/O page cache 8392 * before splitting segment 8393 */ 8394 if (svd->softlockcnt > 0) 8395 segvn_purge(seg); 8396 8397 /* 8398 * Split segment and return IE_REATTACH to tell 8399 * as_ctl() that current segment changed 8400 */ 8401 new_seg = segvn_split_seg(seg, addr); 8402 new_svd = (struct segvn_data *)new_seg->s_data; 8403 err = IE_REATTACH; 8404 8405 /* 8406 * If new segment ends where old one 8407 * did, try to concatenate the new 8408 * segment with next one. 
8409 */ 8410 if (eaddr == oldeaddr) { 8411 /* 8412 * Set policy for new segment 8413 */ 8414 (void) lgrp_privm_policy_set(policy, 8415 &new_svd->policy_info, 8416 new_seg->s_size); 8417 8418 next = AS_SEGNEXT(new_seg->s_as, 8419 new_seg); 8420 8421 if (next && 8422 next->s_ops == &segvn_ops && 8423 eaddr == next->s_base) 8424 (void) segvn_concat(new_seg, 8425 next, 1); 8426 } 8427 } 8428 8429 /* 8430 * Split off end of existing segment if advice only 8431 * applies to a portion of segment ending before 8432 * end of the existing segment 8433 */ 8434 if (eaddr < oldeaddr) { 8435 /* 8436 * Must flush I/O page cache 8437 * before splitting segment 8438 */ 8439 if (svd->softlockcnt > 0) 8440 segvn_purge(seg); 8441 8442 /* 8443 * If beginning of old segment was already 8444 * split off, use new segment to split end off 8445 * from. 8446 */ 8447 if (new_seg != NULL && new_seg != seg) { 8448 /* 8449 * Split segment 8450 */ 8451 (void) segvn_split_seg(new_seg, eaddr); 8452 8453 /* 8454 * Set policy for new segment 8455 */ 8456 (void) lgrp_privm_policy_set(policy, 8457 &new_svd->policy_info, 8458 new_seg->s_size); 8459 } else { 8460 /* 8461 * Split segment and return IE_REATTACH 8462 * to tell as_ctl() that current 8463 * segment changed 8464 */ 8465 (void) segvn_split_seg(seg, eaddr); 8466 err = IE_REATTACH; 8467 8468 (void) lgrp_privm_policy_set(policy, 8469 &svd->policy_info, seg->s_size); 8470 8471 /* 8472 * If new segment starts where old one 8473 * did, try to concatenate it with 8474 * previous segment. 
8475 */ 8476 if (addr == seg->s_base) { 8477 prev = AS_SEGPREV(seg->s_as, 8478 seg); 8479 8480 /* 8481 * Drop lock for private data 8482 * of current segment before 8483 * concatenating (deleting) it 8484 */ 8485 if (prev && 8486 prev->s_ops == 8487 &segvn_ops && 8488 addr == prev->s_base + 8489 prev->s_size) { 8490 SEGVN_LOCK_EXIT( 8491 seg->s_as, 8492 &svd->lock); 8493 (void) segvn_concat( 8494 prev, seg, 1); 8495 return (err); 8496 } 8497 } 8498 } 8499 } 8500 break; 8501 case MADV_SEQUENTIAL: 8502 ASSERT(seg->s_szc == 0); 8503 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 8504 hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD); 8505 /* FALLTHROUGH */ 8506 case MADV_NORMAL: 8507 case MADV_RANDOM: 8508 bvpp = &svd->vpage[page]; 8509 evpp = &svd->vpage[page + (len >> PAGESHIFT)]; 8510 for (; bvpp < evpp; bvpp++) 8511 VPP_SETADVICE(bvpp, behav); 8512 svd->advice = MADV_NORMAL; 8513 break; 8514 case MADV_WILLNEED: /* handled in memcntl */ 8515 case MADV_DONTNEED: /* handled in memcntl */ 8516 case MADV_FREE: /* handled above */ 8517 break; 8518 default: 8519 err = EINVAL; 8520 } 8521 } 8522 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8523 return (err); 8524 } 8525 8526 /* 8527 * There is one kind of inheritance that can be specified for pages: 8528 * 8529 * SEGP_INH_ZERO - Pages should be zeroed in the child 8530 */ 8531 static int 8532 segvn_inherit(struct seg *seg, caddr_t addr, size_t len, uint_t behav) 8533 { 8534 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 8535 struct vpage *bvpp, *evpp; 8536 size_t page; 8537 int ret = 0; 8538 8539 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); 8540 8541 /* Can't support something we don't know about */ 8542 if (behav != SEGP_INH_ZERO) 8543 return (ENOTSUP); 8544 8545 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 8546 8547 /* 8548 * This must be a straightforward anonymous segment that is mapped 8549 * privately and is not backed by a vnode. 
8550 */ 8551 if (svd->tr_state != SEGVN_TR_OFF || 8552 svd->type != MAP_PRIVATE || 8553 svd->vp != NULL) { 8554 ret = EINVAL; 8555 goto out; 8556 } 8557 8558 /* 8559 * If the entire segment has been marked as inherit zero, then no reason 8560 * to do anything else. 8561 */ 8562 if (svd->svn_inz == SEGVN_INZ_ALL) { 8563 ret = 0; 8564 goto out; 8565 } 8566 8567 /* 8568 * If this applies to the entire segment, simply mark it and we're done. 8569 */ 8570 if ((addr == seg->s_base) && (len == seg->s_size)) { 8571 svd->svn_inz = SEGVN_INZ_ALL; 8572 ret = 0; 8573 goto out; 8574 } 8575 8576 /* 8577 * We've been asked to mark a subset of this segment as inherit zero, 8578 * therefore we need to mainpulate its vpages. 8579 */ 8580 if (svd->vpage == NULL) { 8581 segvn_vpage(seg); 8582 if (svd->vpage == NULL) { 8583 ret = ENOMEM; 8584 goto out; 8585 } 8586 } 8587 8588 svd->svn_inz = SEGVN_INZ_VPP; 8589 page = seg_page(seg, addr); 8590 bvpp = &svd->vpage[page]; 8591 evpp = &svd->vpage[page + (len >> PAGESHIFT)]; 8592 for (; bvpp < evpp; bvpp++) 8593 VPP_SETINHZERO(bvpp); 8594 ret = 0; 8595 8596 out: 8597 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8598 return (ret); 8599 } 8600 8601 /* 8602 * Create a vpage structure for this seg. 8603 */ 8604 static void 8605 segvn_vpage(struct seg *seg) 8606 { 8607 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 8608 struct vpage *vp, *evp; 8609 static pgcnt_t page_limit = 0; 8610 8611 ASSERT(SEGVN_WRITE_HELD(seg->s_as, &svd->lock)); 8612 8613 /* 8614 * If no vpage structure exists, allocate one. Copy the protections 8615 * and the advice from the segment itself to the individual pages. 8616 */ 8617 if (svd->vpage == NULL) { 8618 /* 8619 * Start by calculating the number of pages we must allocate to 8620 * track the per-page vpage structs needs for this entire 8621 * segment. If we know now that it will require more than our 8622 * heuristic for the maximum amount of kmem we can consume then 8623 * fail. 
We do this here, instead of trying to detect this deep 8624 * in page_resv and propagating the error up, since the entire 8625 * memory allocation stack is not amenable to passing this 8626 * back. Instead, it wants to keep trying. 8627 * 8628 * As a heuristic we set a page limit of 5/8s of total_pages 8629 * for this allocation. We use shifts so that no floating 8630 * point conversion takes place and only need to do the 8631 * calculation once. 8632 */ 8633 ulong_t mem_needed = seg_pages(seg) * sizeof (struct vpage); 8634 pgcnt_t npages = mem_needed >> PAGESHIFT; 8635 8636 if (page_limit == 0) 8637 page_limit = (total_pages >> 1) + (total_pages >> 3); 8638 8639 if (npages > page_limit) 8640 return; 8641 8642 svd->pageadvice = 1; 8643 svd->vpage = kmem_zalloc(mem_needed, KM_SLEEP); 8644 evp = &svd->vpage[seg_page(seg, seg->s_base + seg->s_size)]; 8645 for (vp = svd->vpage; vp < evp; vp++) { 8646 VPP_SETPROT(vp, svd->prot); 8647 VPP_SETADVICE(vp, svd->advice); 8648 } 8649 } 8650 } 8651 8652 /* 8653 * Dump the pages belonging to this segvn segment. 
8654 */ 8655 static void 8656 segvn_dump(struct seg *seg) 8657 { 8658 struct segvn_data *svd; 8659 page_t *pp; 8660 struct anon_map *amp; 8661 ulong_t anon_index; 8662 struct vnode *vp; 8663 u_offset_t off, offset; 8664 pfn_t pfn; 8665 pgcnt_t page, npages; 8666 caddr_t addr; 8667 8668 npages = seg_pages(seg); 8669 svd = (struct segvn_data *)seg->s_data; 8670 vp = svd->vp; 8671 off = offset = svd->offset; 8672 addr = seg->s_base; 8673 8674 if ((amp = svd->amp) != NULL) { 8675 anon_index = svd->anon_index; 8676 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 8677 } 8678 8679 for (page = 0; page < npages; page++, offset += PAGESIZE) { 8680 struct anon *ap; 8681 int we_own_it = 0; 8682 8683 if (amp && (ap = anon_get_ptr(svd->amp->ahp, anon_index++))) { 8684 swap_xlate_nopanic(ap, &vp, &off); 8685 } else { 8686 vp = svd->vp; 8687 off = offset; 8688 } 8689 8690 /* 8691 * If pp == NULL, the page either does not exist 8692 * or is exclusively locked. So determine if it 8693 * exists before searching for it. 8694 */ 8695 8696 if ((pp = page_lookup_nowait(vp, off, SE_SHARED))) 8697 we_own_it = 1; 8698 else 8699 pp = page_exists(vp, off); 8700 8701 if (pp) { 8702 pfn = page_pptonum(pp); 8703 dump_addpage(seg->s_as, addr, pfn); 8704 if (we_own_it) 8705 page_unlock(pp); 8706 } 8707 addr += PAGESIZE; 8708 dump_timeleft = dump_timeout; 8709 } 8710 8711 if (amp != NULL) 8712 ANON_LOCK_EXIT(&->a_rwlock); 8713 } 8714 8715 #ifdef DEBUG 8716 static uint32_t segvn_pglock_mtbf = 0; 8717 #endif 8718 8719 #define PCACHE_SHWLIST ((page_t *)-2) 8720 #define NOPCACHE_SHWLIST ((page_t *)-1) 8721 8722 /* 8723 * Lock/Unlock anon pages over a given range. Return shadow list. This routine 8724 * uses global segment pcache to cache shadow lists (i.e. pp arrays) of pages 8725 * to avoid the overhead of per page locking, unlocking for subsequent IOs to 8726 * the same parts of the segment. Currently shadow list creation is only 8727 * supported for pure anon segments. 
MAP_PRIVATE segment pcache entries are 8728 * tagged with segment pointer, starting virtual address and length. This 8729 * approach for MAP_SHARED segments may add many pcache entries for the same 8730 * set of pages and lead to long hash chains that decrease pcache lookup 8731 * performance. To avoid this issue for shared segments shared anon map and 8732 * starting anon index are used for pcache entry tagging. This allows all 8733 * segments to share pcache entries for the same anon range and reduces pcache 8734 * chain's length as well as memory overhead from duplicate shadow lists and 8735 * pcache entries. 8736 * 8737 * softlockcnt field in segvn_data structure counts the number of F_SOFTLOCK'd 8738 * pages via segvn_fault() and pagelock'd pages via this routine. But pagelock 8739 * part of softlockcnt accounting is done differently for private and shared 8740 * segments. In private segment case softlock is only incremented when a new 8741 * shadow list is created but not when an existing one is found via 8742 * seg_plookup(). pcache entries have reference count incremented/decremented 8743 * by each seg_plookup()/seg_pinactive() operation. Only entries that have 0 8744 * reference count can be purged (and purging is needed before segment can be 8745 * freed). When a private segment pcache entry is purged segvn_reclaim() will 8746 * decrement softlockcnt. Since in private segment case each of its pcache 8747 * entries only belongs to this segment we can expect that when 8748 * segvn_pagelock(L_PAGEUNLOCK) was called for all outstanding IOs in this 8749 * segment purge will succeed and softlockcnt will drop to 0. In shared 8750 * segment case reference count in pcache entry counts active locks from many 8751 * different segments so we can't expect segment purging to succeed even when 8752 * segvn_pagelock(L_PAGEUNLOCK) was called for all outstanding IOs in this 8753 * segment. 
To be able to determine when there're no pending pagelocks in 8754 * shared segment case we don't rely on purging to make softlockcnt drop to 0 8755 * but instead softlockcnt is incremented and decremented for every 8756 * segvn_pagelock(L_PAGELOCK/L_PAGEUNLOCK) call regardless if a new shadow 8757 * list was created or an existing one was found. When softlockcnt drops to 0 8758 * this segment no longer has any claims for pcached shadow lists and the 8759 * segment can be freed even if there're still active pcache entries 8760 * shared by this segment anon map. Shared segment pcache entries belong to 8761 * anon map and are typically removed when anon map is freed after all 8762 * processes destroy the segments that use this anon map. 8763 */ 8764 static int 8765 segvn_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***ppp, 8766 enum lock_type type, enum seg_rw rw) 8767 { 8768 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 8769 size_t np; 8770 pgcnt_t adjustpages; 8771 pgcnt_t npages; 8772 ulong_t anon_index; 8773 uint_t protchk = (rw == S_READ) ? 
PROT_READ : PROT_WRITE; 8774 uint_t error; 8775 struct anon_map *amp; 8776 pgcnt_t anpgcnt; 8777 struct page **pplist, **pl, *pp; 8778 caddr_t a; 8779 size_t page; 8780 caddr_t lpgaddr, lpgeaddr; 8781 anon_sync_obj_t cookie; 8782 int anlock; 8783 struct anon_map *pamp; 8784 caddr_t paddr; 8785 seg_preclaim_cbfunc_t preclaim_callback; 8786 size_t pgsz; 8787 int use_pcache; 8788 size_t wlen; 8789 uint_t pflags = 0; 8790 int sftlck_sbase = 0; 8791 int sftlck_send = 0; 8792 8793 #ifdef DEBUG 8794 if (type == L_PAGELOCK && segvn_pglock_mtbf) { 8795 hrtime_t ts = gethrtime(); 8796 if ((ts % segvn_pglock_mtbf) == 0) { 8797 return (ENOTSUP); 8798 } 8799 if ((ts % segvn_pglock_mtbf) == 1) { 8800 return (EFAULT); 8801 } 8802 } 8803 #endif 8804 8805 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_START, 8806 "segvn_pagelock: start seg %p addr %p", seg, addr); 8807 8808 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); 8809 ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK); 8810 8811 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 8812 8813 /* 8814 * for now we only support pagelock to anon memory. We would have to 8815 * check protections for vnode objects and call into the vnode driver. 8816 * That's too much for a fast path. Let the fault entry point handle 8817 * it. 
8818 */ 8819 if (svd->vp != NULL) { 8820 if (type == L_PAGELOCK) { 8821 error = ENOTSUP; 8822 goto out; 8823 } 8824 panic("segvn_pagelock(L_PAGEUNLOCK): vp != NULL"); 8825 } 8826 if ((amp = svd->amp) == NULL) { 8827 if (type == L_PAGELOCK) { 8828 error = EFAULT; 8829 goto out; 8830 } 8831 panic("segvn_pagelock(L_PAGEUNLOCK): amp == NULL"); 8832 } 8833 if (rw != S_READ && rw != S_WRITE) { 8834 if (type == L_PAGELOCK) { 8835 error = ENOTSUP; 8836 goto out; 8837 } 8838 panic("segvn_pagelock(L_PAGEUNLOCK): bad rw"); 8839 } 8840 8841 if (seg->s_szc != 0) { 8842 /* 8843 * We are adjusting the pagelock region to the large page size 8844 * boundary because the unlocked part of a large page cannot 8845 * be freed anyway unless all constituent pages of a large 8846 * page are locked. Bigger regions reduce pcache chain length 8847 * and improve lookup performance. The tradeoff is that the 8848 * very first segvn_pagelock() call for a given page is more 8849 * expensive if only 1 page_t is needed for IO. This is only 8850 * an issue if pcache entry doesn't get reused by several 8851 * subsequent calls. We optimize here for the case when pcache 8852 * is heavily used by repeated IOs to the same address range. 8853 * 8854 * Note segment's page size cannot change while we are holding 8855 * as lock. And then it cannot change while softlockcnt is 8856 * not 0. This will allow us to correctly recalculate large 8857 * page size region for the matching pageunlock/reclaim call 8858 * since as_pageunlock() caller must always match 8859 * as_pagelock() call's addr and len. 8860 * 8861 * For pageunlock *ppp points to the pointer of page_t that 8862 * corresponds to the real unadjusted start address. Similar 8863 * for pagelock *ppp must point to the pointer of page_t that 8864 * corresponds to the real unadjusted start address. 
8865 */ 8866 pgsz = page_get_pagesize(seg->s_szc); 8867 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 8868 adjustpages = btop((uintptr_t)(addr - lpgaddr)); 8869 } else if (len < segvn_pglock_comb_thrshld) { 8870 lpgaddr = addr; 8871 lpgeaddr = addr + len; 8872 adjustpages = 0; 8873 pgsz = PAGESIZE; 8874 } else { 8875 /* 8876 * Align the address range of large enough requests to allow 8877 * combining of different shadow lists into 1 to reduce memory 8878 * overhead from potentially overlapping large shadow lists 8879 * (worst case is we have a 1MB IO into buffers with start 8880 * addresses separated by 4K). Alignment is only possible if 8881 * padded chunks have sufficient access permissions. Note 8882 * permissions won't change between L_PAGELOCK and 8883 * L_PAGEUNLOCK calls since non 0 softlockcnt will force 8884 * segvn_setprot() to wait until softlockcnt drops to 0. This 8885 * allows us to determine in L_PAGEUNLOCK the same range we 8886 * computed in L_PAGELOCK. 8887 * 8888 * If alignment is limited by segment ends set 8889 * sftlck_sbase/sftlck_send flags. In L_PAGELOCK case when 8890 * these flags are set bump softlockcnt_sbase/softlockcnt_send 8891 * per segment counters. In L_PAGEUNLOCK case decrease 8892 * softlockcnt_sbase/softlockcnt_send counters if 8893 * sftlck_sbase/sftlck_send flags are set. When 8894 * softlockcnt_sbase/softlockcnt_send are non 0 8895 * segvn_concat()/segvn_extend_prev()/segvn_extend_next() 8896 * won't merge the segments. This restriction combined with 8897 * restriction on segment unmapping and splitting for segments 8898 * that have non 0 softlockcnt allows L_PAGEUNLOCK to 8899 * correctly determine the same range that was previously 8900 * locked by matching L_PAGELOCK. 
8901 */ 8902 pflags = SEGP_PSHIFT | (segvn_pglock_comb_bshift << 16); 8903 pgsz = PAGESIZE; 8904 if (svd->type == MAP_PRIVATE) { 8905 lpgaddr = (caddr_t)P2ALIGN((uintptr_t)addr, 8906 segvn_pglock_comb_balign); 8907 if (lpgaddr < seg->s_base) { 8908 lpgaddr = seg->s_base; 8909 sftlck_sbase = 1; 8910 } 8911 } else { 8912 ulong_t aix = svd->anon_index + seg_page(seg, addr); 8913 ulong_t aaix = P2ALIGN(aix, segvn_pglock_comb_palign); 8914 if (aaix < svd->anon_index) { 8915 lpgaddr = seg->s_base; 8916 sftlck_sbase = 1; 8917 } else { 8918 lpgaddr = addr - ptob(aix - aaix); 8919 ASSERT(lpgaddr >= seg->s_base); 8920 } 8921 } 8922 if (svd->pageprot && lpgaddr != addr) { 8923 struct vpage *vp = &svd->vpage[seg_page(seg, lpgaddr)]; 8924 struct vpage *evp = &svd->vpage[seg_page(seg, addr)]; 8925 while (vp < evp) { 8926 if ((VPP_PROT(vp) & protchk) == 0) { 8927 break; 8928 } 8929 vp++; 8930 } 8931 if (vp < evp) { 8932 lpgaddr = addr; 8933 pflags = 0; 8934 } 8935 } 8936 lpgeaddr = addr + len; 8937 if (pflags) { 8938 if (svd->type == MAP_PRIVATE) { 8939 lpgeaddr = (caddr_t)P2ROUNDUP( 8940 (uintptr_t)lpgeaddr, 8941 segvn_pglock_comb_balign); 8942 } else { 8943 ulong_t aix = svd->anon_index + 8944 seg_page(seg, lpgeaddr); 8945 ulong_t aaix = P2ROUNDUP(aix, 8946 segvn_pglock_comb_palign); 8947 if (aaix < aix) { 8948 lpgeaddr = 0; 8949 } else { 8950 lpgeaddr += ptob(aaix - aix); 8951 } 8952 } 8953 if (lpgeaddr == 0 || 8954 lpgeaddr > seg->s_base + seg->s_size) { 8955 lpgeaddr = seg->s_base + seg->s_size; 8956 sftlck_send = 1; 8957 } 8958 } 8959 if (svd->pageprot && lpgeaddr != addr + len) { 8960 struct vpage *vp; 8961 struct vpage *evp; 8962 8963 vp = &svd->vpage[seg_page(seg, addr + len)]; 8964 evp = &svd->vpage[seg_page(seg, lpgeaddr)]; 8965 8966 while (vp < evp) { 8967 if ((VPP_PROT(vp) & protchk) == 0) { 8968 break; 8969 } 8970 vp++; 8971 } 8972 if (vp < evp) { 8973 lpgeaddr = addr + len; 8974 } 8975 } 8976 adjustpages = btop((uintptr_t)(addr - lpgaddr)); 8977 } 8978 8979 /* 8980 
* For MAP_SHARED segments we create pcache entries tagged by amp and 8981 * anon index so that we can share pcache entries with other segments 8982 * that map this amp. For private segments pcache entries are tagged 8983 * with segment and virtual address. 8984 */ 8985 if (svd->type == MAP_SHARED) { 8986 pamp = amp; 8987 paddr = (caddr_t)((lpgaddr - seg->s_base) + 8988 ptob(svd->anon_index)); 8989 preclaim_callback = shamp_reclaim; 8990 } else { 8991 pamp = NULL; 8992 paddr = lpgaddr; 8993 preclaim_callback = segvn_reclaim; 8994 } 8995 8996 if (type == L_PAGEUNLOCK) { 8997 VM_STAT_ADD(segvnvmstats.pagelock[0]); 8998 8999 /* 9000 * update hat ref bits for /proc. We need to make sure 9001 * that threads tracing the ref and mod bits of the 9002 * address space get the right data. 9003 * Note: page ref and mod bits are updated at reclaim time 9004 */ 9005 if (seg->s_as->a_vbits) { 9006 for (a = addr; a < addr + len; a += PAGESIZE) { 9007 if (rw == S_WRITE) { 9008 hat_setstat(seg->s_as, a, 9009 PAGESIZE, P_REF | P_MOD); 9010 } else { 9011 hat_setstat(seg->s_as, a, 9012 PAGESIZE, P_REF); 9013 } 9014 } 9015 } 9016 9017 /* 9018 * Check the shadow list entry after the last page used in 9019 * this IO request. If it's NOPCACHE_SHWLIST the shadow list 9020 * was not inserted into pcache and is not large page 9021 * adjusted. In this case call reclaim callback directly and 9022 * don't adjust the shadow list start and size for large 9023 * pages. 
9024 */ 9025 npages = btop(len); 9026 if ((*ppp)[npages] == NOPCACHE_SHWLIST) { 9027 void *ptag; 9028 if (pamp != NULL) { 9029 ASSERT(svd->type == MAP_SHARED); 9030 ptag = (void *)pamp; 9031 paddr = (caddr_t)((addr - seg->s_base) + 9032 ptob(svd->anon_index)); 9033 } else { 9034 ptag = (void *)seg; 9035 paddr = addr; 9036 } 9037 (*preclaim_callback)(ptag, paddr, len, *ppp, rw, 0); 9038 } else { 9039 ASSERT((*ppp)[npages] == PCACHE_SHWLIST || 9040 IS_SWAPFSVP((*ppp)[npages]->p_vnode)); 9041 len = lpgeaddr - lpgaddr; 9042 npages = btop(len); 9043 seg_pinactive(seg, pamp, paddr, len, 9044 *ppp - adjustpages, rw, pflags, preclaim_callback); 9045 } 9046 9047 if (pamp != NULL) { 9048 ASSERT(svd->type == MAP_SHARED); 9049 ASSERT(svd->softlockcnt >= npages); 9050 atomic_add_long((ulong_t *)&svd->softlockcnt, -npages); 9051 } 9052 9053 if (sftlck_sbase) { 9054 ASSERT(svd->softlockcnt_sbase > 0); 9055 atomic_dec_ulong((ulong_t *)&svd->softlockcnt_sbase); 9056 } 9057 if (sftlck_send) { 9058 ASSERT(svd->softlockcnt_send > 0); 9059 atomic_dec_ulong((ulong_t *)&svd->softlockcnt_send); 9060 } 9061 9062 /* 9063 * If someone is blocked while unmapping, we purge 9064 * segment page cache and thus reclaim pplist synchronously 9065 * without waiting for seg_pasync_thread. This speeds up 9066 * unmapping in cases where munmap(2) is called, while 9067 * raw async i/o is still in progress or where a thread 9068 * exits on data fault in a multithreaded application. 9069 */ 9070 if (AS_ISUNMAPWAIT(seg->s_as)) { 9071 if (svd->softlockcnt == 0) { 9072 mutex_enter(&seg->s_as->a_contents); 9073 if (AS_ISUNMAPWAIT(seg->s_as)) { 9074 AS_CLRUNMAPWAIT(seg->s_as); 9075 cv_broadcast(&seg->s_as->a_cv); 9076 } 9077 mutex_exit(&seg->s_as->a_contents); 9078 } else if (pamp == NULL) { 9079 /* 9080 * softlockcnt is not 0 and this is a 9081 * MAP_PRIVATE segment. Try to purge its 9082 * pcache entries to reduce softlockcnt. 
9083 * If it drops to 0 segvn_reclaim() 9084 * will wake up a thread waiting on 9085 * unmapwait flag. 9086 * 9087 * We don't purge MAP_SHARED segments with non 9088 * 0 softlockcnt since IO is still in progress 9089 * for such segments. 9090 */ 9091 ASSERT(svd->type == MAP_PRIVATE); 9092 segvn_purge(seg); 9093 } 9094 } 9095 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 9096 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_UNLOCK_END, 9097 "segvn_pagelock: unlock seg %p addr %p", seg, addr); 9098 return (0); 9099 } 9100 9101 /* The L_PAGELOCK case ... */ 9102 9103 VM_STAT_ADD(segvnvmstats.pagelock[1]); 9104 9105 /* 9106 * For MAP_SHARED segments we have to check protections before 9107 * seg_plookup() since pcache entries may be shared by many segments 9108 * with potentially different page protections. 9109 */ 9110 if (pamp != NULL) { 9111 ASSERT(svd->type == MAP_SHARED); 9112 if (svd->pageprot == 0) { 9113 if ((svd->prot & protchk) == 0) { 9114 error = EACCES; 9115 goto out; 9116 } 9117 } else { 9118 /* 9119 * check page protections 9120 */ 9121 caddr_t ea; 9122 9123 if (seg->s_szc) { 9124 a = lpgaddr; 9125 ea = lpgeaddr; 9126 } else { 9127 a = addr; 9128 ea = addr + len; 9129 } 9130 for (; a < ea; a += pgsz) { 9131 struct vpage *vp; 9132 9133 ASSERT(seg->s_szc == 0 || 9134 sameprot(seg, a, pgsz)); 9135 vp = &svd->vpage[seg_page(seg, a)]; 9136 if ((VPP_PROT(vp) & protchk) == 0) { 9137 error = EACCES; 9138 goto out; 9139 } 9140 } 9141 } 9142 } 9143 9144 /* 9145 * try to find pages in segment page cache 9146 */ 9147 pplist = seg_plookup(seg, pamp, paddr, lpgeaddr - lpgaddr, rw, pflags); 9148 if (pplist != NULL) { 9149 if (pamp != NULL) { 9150 npages = btop((uintptr_t)(lpgeaddr - lpgaddr)); 9151 ASSERT(svd->type == MAP_SHARED); 9152 atomic_add_long((ulong_t *)&svd->softlockcnt, 9153 npages); 9154 } 9155 if (sftlck_sbase) { 9156 atomic_inc_ulong((ulong_t *)&svd->softlockcnt_sbase); 9157 } 9158 if (sftlck_send) { 9159 atomic_inc_ulong((ulong_t *)&svd->softlockcnt_send); 9160 } 9161 
SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 9162 *ppp = pplist + adjustpages; 9163 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_HIT_END, 9164 "segvn_pagelock: cache hit seg %p addr %p", seg, addr); 9165 return (0); 9166 } 9167 9168 /* 9169 * For MAP_SHARED segments we already verified above that segment 9170 * protections allow this pagelock operation. 9171 */ 9172 if (pamp == NULL) { 9173 ASSERT(svd->type == MAP_PRIVATE); 9174 if (svd->pageprot == 0) { 9175 if ((svd->prot & protchk) == 0) { 9176 error = EACCES; 9177 goto out; 9178 } 9179 if (svd->prot & PROT_WRITE) { 9180 wlen = lpgeaddr - lpgaddr; 9181 } else { 9182 wlen = 0; 9183 ASSERT(rw == S_READ); 9184 } 9185 } else { 9186 int wcont = 1; 9187 /* 9188 * check page protections 9189 */ 9190 for (a = lpgaddr, wlen = 0; a < lpgeaddr; a += pgsz) { 9191 struct vpage *vp; 9192 9193 ASSERT(seg->s_szc == 0 || 9194 sameprot(seg, a, pgsz)); 9195 vp = &svd->vpage[seg_page(seg, a)]; 9196 if ((VPP_PROT(vp) & protchk) == 0) { 9197 error = EACCES; 9198 goto out; 9199 } 9200 if (wcont && (VPP_PROT(vp) & PROT_WRITE)) { 9201 wlen += pgsz; 9202 } else { 9203 wcont = 0; 9204 ASSERT(rw == S_READ); 9205 } 9206 } 9207 } 9208 ASSERT(rw == S_READ || wlen == lpgeaddr - lpgaddr); 9209 ASSERT(rw == S_WRITE || wlen <= lpgeaddr - lpgaddr); 9210 } 9211 9212 /* 9213 * Only build large page adjusted shadow list if we expect to insert 9214 * it into pcache. For large enough pages it's a big overhead to 9215 * create a shadow list of the entire large page. But this overhead 9216 * should be amortized over repeated pcache hits on subsequent reuse 9217 * of this shadow list (IO into any range within this shadow list will 9218 * find it in pcache since we large page align the request for pcache 9219 * lookups). pcache performance is improved with bigger shadow lists 9220 * as it reduces the time to pcache the entire big segment and reduces 9221 * pcache chain length. 
9222 */ 9223 if (seg_pinsert_check(seg, pamp, paddr, 9224 lpgeaddr - lpgaddr, pflags) == SEGP_SUCCESS) { 9225 addr = lpgaddr; 9226 len = lpgeaddr - lpgaddr; 9227 use_pcache = 1; 9228 } else { 9229 use_pcache = 0; 9230 /* 9231 * Since this entry will not be inserted into the pcache, we 9232 * will not do any adjustments to the starting address or 9233 * size of the memory to be locked. 9234 */ 9235 adjustpages = 0; 9236 } 9237 npages = btop(len); 9238 9239 pplist = kmem_alloc(sizeof (page_t *) * (npages + 1), KM_SLEEP); 9240 pl = pplist; 9241 *ppp = pplist + adjustpages; 9242 /* 9243 * If use_pcache is 0 this shadow list is not large page adjusted. 9244 * Record this info in the last entry of shadow array so that 9245 * L_PAGEUNLOCK can determine if it should large page adjust the 9246 * address range to find the real range that was locked. 9247 */ 9248 pl[npages] = use_pcache ? PCACHE_SHWLIST : NOPCACHE_SHWLIST; 9249 9250 page = seg_page(seg, addr); 9251 anon_index = svd->anon_index + page; 9252 9253 anlock = 0; 9254 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 9255 ASSERT(amp->a_szc >= seg->s_szc); 9256 anpgcnt = page_get_pagecnt(amp->a_szc); 9257 for (a = addr; a < addr + len; a += PAGESIZE, anon_index++) { 9258 struct anon *ap; 9259 struct vnode *vp; 9260 u_offset_t off; 9261 9262 /* 9263 * Lock and unlock anon array only once per large page. 9264 * anon_array_enter() locks the root anon slot according to 9265 * a_szc which can't change while anon map is locked. We lock 9266 * anon the first time through this loop and each time we 9267 * reach anon index that corresponds to a root of a large 9268 * page. 
9269 */ 9270 if (a == addr || P2PHASE(anon_index, anpgcnt) == 0) { 9271 ASSERT(anlock == 0); 9272 anon_array_enter(amp, anon_index, &cookie); 9273 anlock = 1; 9274 } 9275 ap = anon_get_ptr(amp->ahp, anon_index); 9276 9277 /* 9278 * We must never use seg_pcache for COW pages 9279 * because we might end up with original page still 9280 * lying in seg_pcache even after private page is 9281 * created. This leads to data corruption as 9282 * aio_write refers to the page still in cache 9283 * while all other accesses refer to the private 9284 * page. 9285 */ 9286 if (ap == NULL || ap->an_refcnt != 1) { 9287 struct vpage *vpage; 9288 9289 if (seg->s_szc) { 9290 error = EFAULT; 9291 break; 9292 } 9293 if (svd->vpage != NULL) { 9294 vpage = &svd->vpage[seg_page(seg, a)]; 9295 } else { 9296 vpage = NULL; 9297 } 9298 ASSERT(anlock); 9299 anon_array_exit(&cookie); 9300 anlock = 0; 9301 pp = NULL; 9302 error = segvn_faultpage(seg->s_as->a_hat, seg, a, 0, 9303 vpage, &pp, 0, F_INVAL, rw, 1); 9304 if (error) { 9305 error = fc_decode(error); 9306 break; 9307 } 9308 anon_array_enter(amp, anon_index, &cookie); 9309 anlock = 1; 9310 ap = anon_get_ptr(amp->ahp, anon_index); 9311 if (ap == NULL || ap->an_refcnt != 1) { 9312 error = EFAULT; 9313 break; 9314 } 9315 } 9316 swap_xlate(ap, &vp, &off); 9317 pp = page_lookup_nowait(vp, off, SE_SHARED); 9318 if (pp == NULL) { 9319 error = EFAULT; 9320 break; 9321 } 9322 if (ap->an_pvp != NULL) { 9323 anon_swap_free(ap, pp); 9324 } 9325 /* 9326 * Unlock anon if this is the last slot in a large page. 
9327 */ 9328 if (P2PHASE(anon_index, anpgcnt) == anpgcnt - 1) { 9329 ASSERT(anlock); 9330 anon_array_exit(&cookie); 9331 anlock = 0; 9332 } 9333 *pplist++ = pp; 9334 } 9335 if (anlock) { /* Ensure the lock is dropped */ 9336 anon_array_exit(&cookie); 9337 } 9338 ANON_LOCK_EXIT(&->a_rwlock); 9339 9340 if (a >= addr + len) { 9341 atomic_add_long((ulong_t *)&svd->softlockcnt, npages); 9342 if (pamp != NULL) { 9343 ASSERT(svd->type == MAP_SHARED); 9344 atomic_add_long((ulong_t *)&pamp->a_softlockcnt, 9345 npages); 9346 wlen = len; 9347 } 9348 if (sftlck_sbase) { 9349 atomic_inc_ulong((ulong_t *)&svd->softlockcnt_sbase); 9350 } 9351 if (sftlck_send) { 9352 atomic_inc_ulong((ulong_t *)&svd->softlockcnt_send); 9353 } 9354 if (use_pcache) { 9355 (void) seg_pinsert(seg, pamp, paddr, len, wlen, pl, 9356 rw, pflags, preclaim_callback); 9357 } 9358 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 9359 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_FILL_END, 9360 "segvn_pagelock: cache fill seg %p addr %p", seg, addr); 9361 return (0); 9362 } 9363 9364 pplist = pl; 9365 np = ((uintptr_t)(a - addr)) >> PAGESHIFT; 9366 while (np > (uint_t)0) { 9367 ASSERT(PAGE_LOCKED(*pplist)); 9368 page_unlock(*pplist); 9369 np--; 9370 pplist++; 9371 } 9372 kmem_free(pl, sizeof (page_t *) * (npages + 1)); 9373 out: 9374 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 9375 *ppp = NULL; 9376 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, 9377 "segvn_pagelock: cache miss seg %p addr %p", seg, addr); 9378 return (error); 9379 } 9380 9381 /* 9382 * purge any cached pages in the I/O page cache 9383 */ 9384 static void 9385 segvn_purge(struct seg *seg) 9386 { 9387 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 9388 9389 /* 9390 * pcache is only used by pure anon segments. 9391 */ 9392 if (svd->amp == NULL || svd->vp != NULL) { 9393 return; 9394 } 9395 9396 /* 9397 * For MAP_SHARED segments non 0 segment's softlockcnt means 9398 * active IO is still in progress via this segment. 
So we only 9399 * purge MAP_SHARED segments when their softlockcnt is 0. 9400 */ 9401 if (svd->type == MAP_PRIVATE) { 9402 if (svd->softlockcnt) { 9403 seg_ppurge(seg, NULL, 0); 9404 } 9405 } else if (svd->softlockcnt == 0 && svd->amp->a_softlockcnt != 0) { 9406 seg_ppurge(seg, svd->amp, 0); 9407 } 9408 } 9409 9410 /* 9411 * If async argument is not 0 we are called from pcache async thread and don't 9412 * hold AS lock. 9413 */ 9414 9415 /*ARGSUSED*/ 9416 static int 9417 segvn_reclaim(void *ptag, caddr_t addr, size_t len, struct page **pplist, 9418 enum seg_rw rw, int async) 9419 { 9420 struct seg *seg = (struct seg *)ptag; 9421 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 9422 pgcnt_t np, npages; 9423 struct page **pl; 9424 9425 npages = np = btop(len); 9426 ASSERT(npages); 9427 9428 ASSERT(svd->vp == NULL && svd->amp != NULL); 9429 ASSERT(svd->softlockcnt >= npages); 9430 ASSERT(async || AS_LOCK_HELD(seg->s_as)); 9431 9432 pl = pplist; 9433 9434 ASSERT(pl[np] == NOPCACHE_SHWLIST || pl[np] == PCACHE_SHWLIST); 9435 ASSERT(!async || pl[np] == PCACHE_SHWLIST); 9436 9437 while (np > (uint_t)0) { 9438 if (rw == S_WRITE) { 9439 hat_setrefmod(*pplist); 9440 } else { 9441 hat_setref(*pplist); 9442 } 9443 page_unlock(*pplist); 9444 np--; 9445 pplist++; 9446 } 9447 9448 kmem_free(pl, sizeof (page_t *) * (npages + 1)); 9449 9450 /* 9451 * If we are pcache async thread we don't hold AS lock. This means if 9452 * softlockcnt drops to 0 after the decrement below address space may 9453 * get freed. We can't allow it since after softlock derement to 0 we 9454 * still need to access as structure for possible wakeup of unmap 9455 * waiters. To prevent the disappearance of as we take this segment 9456 * segfree_syncmtx. segvn_free() also takes this mutex as a barrier to 9457 * make sure this routine completes before segment is freed. 9458 * 9459 * The second complication we have to deal with in async case is a 9460 * possibility of missed wake up of unmap wait thread. 
When we don't 9461 * hold as lock here we may take a_contents lock before unmap wait 9462 * thread that was first to see softlockcnt was still not 0. As a 9463 * result we'll fail to wake up an unmap wait thread. To avoid this 9464 * race we set nounmapwait flag in as structure if we drop softlockcnt 9465 * to 0 when we were called by pcache async thread. unmapwait thread 9466 * will not block if this flag is set. 9467 */ 9468 if (async) { 9469 mutex_enter(&svd->segfree_syncmtx); 9470 } 9471 9472 if (!atomic_add_long_nv((ulong_t *)&svd->softlockcnt, -npages)) { 9473 if (async || AS_ISUNMAPWAIT(seg->s_as)) { 9474 mutex_enter(&seg->s_as->a_contents); 9475 if (async) { 9476 AS_SETNOUNMAPWAIT(seg->s_as); 9477 } 9478 if (AS_ISUNMAPWAIT(seg->s_as)) { 9479 AS_CLRUNMAPWAIT(seg->s_as); 9480 cv_broadcast(&seg->s_as->a_cv); 9481 } 9482 mutex_exit(&seg->s_as->a_contents); 9483 } 9484 } 9485 9486 if (async) { 9487 mutex_exit(&svd->segfree_syncmtx); 9488 } 9489 return (0); 9490 } 9491 9492 /*ARGSUSED*/ 9493 static int 9494 shamp_reclaim(void *ptag, caddr_t addr, size_t len, struct page **pplist, 9495 enum seg_rw rw, int async) 9496 { 9497 amp_t *amp = (amp_t *)ptag; 9498 pgcnt_t np, npages; 9499 struct page **pl; 9500 9501 npages = np = btop(len); 9502 ASSERT(npages); 9503 ASSERT(amp->a_softlockcnt >= npages); 9504 9505 pl = pplist; 9506 9507 ASSERT(pl[np] == NOPCACHE_SHWLIST || pl[np] == PCACHE_SHWLIST); 9508 ASSERT(!async || pl[np] == PCACHE_SHWLIST); 9509 9510 while (np > (uint_t)0) { 9511 if (rw == S_WRITE) { 9512 hat_setrefmod(*pplist); 9513 } else { 9514 hat_setref(*pplist); 9515 } 9516 page_unlock(*pplist); 9517 np--; 9518 pplist++; 9519 } 9520 9521 kmem_free(pl, sizeof (page_t *) * (npages + 1)); 9522 9523 /* 9524 * If somebody sleeps in anonmap_purge() wake them up if a_softlockcnt 9525 * drops to 0. anon map can't be freed until a_softlockcnt drops to 0 9526 * and anonmap_purge() acquires a_purgemtx. 
9527 */ 9528 mutex_enter(&->a_purgemtx); 9529 if (!atomic_add_long_nv((ulong_t *)&->a_softlockcnt, -npages) && 9530 amp->a_purgewait) { 9531 amp->a_purgewait = 0; 9532 cv_broadcast(&->a_purgecv); 9533 } 9534 mutex_exit(&->a_purgemtx); 9535 return (0); 9536 } 9537 9538 /* 9539 * get a memory ID for an addr in a given segment 9540 * 9541 * XXX only creates PAGESIZE pages if anon slots are not initialized. 9542 * At fault time they will be relocated into larger pages. 9543 */ 9544 static int 9545 segvn_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp) 9546 { 9547 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 9548 struct anon *ap = NULL; 9549 ulong_t anon_index; 9550 struct anon_map *amp; 9551 anon_sync_obj_t cookie; 9552 9553 if (svd->type == MAP_PRIVATE) { 9554 memidp->val[0] = (uintptr_t)seg->s_as; 9555 memidp->val[1] = (uintptr_t)addr; 9556 return (0); 9557 } 9558 9559 if (svd->type == MAP_SHARED) { 9560 if (svd->vp) { 9561 memidp->val[0] = (uintptr_t)svd->vp; 9562 memidp->val[1] = (u_longlong_t)svd->offset + 9563 (uintptr_t)(addr - seg->s_base); 9564 return (0); 9565 } else { 9566 9567 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 9568 if ((amp = svd->amp) != NULL) { 9569 anon_index = svd->anon_index + 9570 seg_page(seg, addr); 9571 } 9572 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 9573 9574 ASSERT(amp != NULL); 9575 9576 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 9577 anon_array_enter(amp, anon_index, &cookie); 9578 ap = anon_get_ptr(amp->ahp, anon_index); 9579 if (ap == NULL) { 9580 page_t *pp; 9581 9582 pp = anon_zero(seg, addr, &ap, svd->cred); 9583 if (pp == NULL) { 9584 anon_array_exit(&cookie); 9585 ANON_LOCK_EXIT(&->a_rwlock); 9586 return (ENOMEM); 9587 } 9588 ASSERT(anon_get_ptr(amp->ahp, anon_index) 9589 == NULL); 9590 (void) anon_set_ptr(amp->ahp, anon_index, 9591 ap, ANON_SLEEP); 9592 page_unlock(pp); 9593 } 9594 9595 anon_array_exit(&cookie); 9596 ANON_LOCK_EXIT(&->a_rwlock); 9597 9598 memidp->val[0] = (uintptr_t)ap; 9599 
memidp->val[1] = (uintptr_t)addr & PAGEOFFSET; 9600 return (0); 9601 } 9602 } 9603 return (EINVAL); 9604 } 9605 9606 static int 9607 sameprot(struct seg *seg, caddr_t a, size_t len) 9608 { 9609 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 9610 struct vpage *vpage; 9611 spgcnt_t pages = btop(len); 9612 uint_t prot; 9613 9614 if (svd->pageprot == 0) 9615 return (1); 9616 9617 ASSERT(svd->vpage != NULL); 9618 9619 vpage = &svd->vpage[seg_page(seg, a)]; 9620 prot = VPP_PROT(vpage); 9621 vpage++; 9622 pages--; 9623 while (pages-- > 0) { 9624 if (prot != VPP_PROT(vpage)) 9625 return (0); 9626 vpage++; 9627 } 9628 return (1); 9629 } 9630 9631 /* 9632 * Get memory allocation policy info for specified address in given segment 9633 */ 9634 static lgrp_mem_policy_info_t * 9635 segvn_getpolicy(struct seg *seg, caddr_t addr) 9636 { 9637 struct anon_map *amp; 9638 ulong_t anon_index; 9639 lgrp_mem_policy_info_t *policy_info; 9640 struct segvn_data *svn_data; 9641 u_offset_t vn_off; 9642 vnode_t *vp; 9643 9644 ASSERT(seg != NULL); 9645 9646 svn_data = (struct segvn_data *)seg->s_data; 9647 if (svn_data == NULL) 9648 return (NULL); 9649 9650 /* 9651 * Get policy info for private or shared memory 9652 */ 9653 if (svn_data->type != MAP_SHARED) { 9654 if (svn_data->tr_state != SEGVN_TR_ON) { 9655 policy_info = &svn_data->policy_info; 9656 } else { 9657 policy_info = &svn_data->tr_policy_info; 9658 ASSERT(policy_info->mem_policy == 9659 LGRP_MEM_POLICY_NEXT_SEG); 9660 } 9661 } else { 9662 amp = svn_data->amp; 9663 anon_index = svn_data->anon_index + seg_page(seg, addr); 9664 vp = svn_data->vp; 9665 vn_off = svn_data->offset + (uintptr_t)(addr - seg->s_base); 9666 policy_info = lgrp_shm_policy_get(amp, anon_index, vp, vn_off); 9667 } 9668 9669 return (policy_info); 9670 } 9671 9672 /*ARGSUSED*/ 9673 static int 9674 segvn_capable(struct seg *seg, segcapability_t capability) 9675 { 9676 return (0); 9677 } 9678 9679 /* 9680 * Bind text vnode segment to an amp. 
If we bind successfully mappings will be 9681 * established to per vnode mapping per lgroup amp pages instead of to vnode 9682 * pages. There's one amp per vnode text mapping per lgroup. Many processes 9683 * may share the same text replication amp. If a suitable amp doesn't already 9684 * exist in svntr hash table create a new one. We may fail to bind to amp if 9685 * segment is not eligible for text replication. Code below first checks for 9686 * these conditions. If binding is successful segment tr_state is set to on 9687 * and svd->amp points to the amp to use. Otherwise tr_state is set to off and 9688 * svd->amp remains as NULL. 9689 */ 9690 static void 9691 segvn_textrepl(struct seg *seg) 9692 { 9693 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 9694 vnode_t *vp = svd->vp; 9695 u_offset_t off = svd->offset; 9696 size_t size = seg->s_size; 9697 u_offset_t eoff = off + size; 9698 uint_t szc = seg->s_szc; 9699 ulong_t hash = SVNTR_HASH_FUNC(vp); 9700 svntr_t *svntrp; 9701 struct vattr va; 9702 proc_t *p = seg->s_as->a_proc; 9703 lgrp_id_t lgrp_id; 9704 lgrp_id_t olid; 9705 int first; 9706 struct anon_map *amp; 9707 9708 ASSERT(AS_LOCK_HELD(seg->s_as)); 9709 ASSERT(SEGVN_WRITE_HELD(seg->s_as, &svd->lock)); 9710 ASSERT(p != NULL); 9711 ASSERT(svd->tr_state == SEGVN_TR_INIT); 9712 ASSERT(!HAT_IS_REGION_COOKIE_VALID(svd->rcookie)); 9713 ASSERT(svd->flags & MAP_TEXT); 9714 ASSERT(svd->type == MAP_PRIVATE); 9715 ASSERT(vp != NULL && svd->amp == NULL); 9716 ASSERT(!svd->pageprot && !(svd->prot & PROT_WRITE)); 9717 ASSERT(!(svd->flags & MAP_NORESERVE) && svd->swresv == 0); 9718 ASSERT(seg->s_as != &kas); 9719 ASSERT(off < eoff); 9720 ASSERT(svntr_hashtab != NULL); 9721 9722 /* 9723 * If numa optimizations are no longer desired bail out. 9724 */ 9725 if (!lgrp_optimizations()) { 9726 svd->tr_state = SEGVN_TR_OFF; 9727 return; 9728 } 9729 9730 /* 9731 * Avoid creating anon maps with size bigger than the file size. 9732 * If VOP_GETATTR() call fails bail out. 
9733 */ 9734 va.va_mask = AT_SIZE | AT_MTIME | AT_CTIME; 9735 if (VOP_GETATTR(vp, &va, 0, svd->cred, NULL) != 0) { 9736 svd->tr_state = SEGVN_TR_OFF; 9737 SEGVN_TR_ADDSTAT(gaerr); 9738 return; 9739 } 9740 if (btopr(va.va_size) < btopr(eoff)) { 9741 svd->tr_state = SEGVN_TR_OFF; 9742 SEGVN_TR_ADDSTAT(overmap); 9743 return; 9744 } 9745 9746 /* 9747 * VVMEXEC may not be set yet if exec() prefaults text segment. Set 9748 * this flag now before vn_is_mapped(V_WRITE) so that MAP_SHARED 9749 * mapping that checks if trcache for this vnode needs to be 9750 * invalidated can't miss us. 9751 */ 9752 if (!(vp->v_flag & VVMEXEC)) { 9753 mutex_enter(&vp->v_lock); 9754 vp->v_flag |= VVMEXEC; 9755 mutex_exit(&vp->v_lock); 9756 } 9757 mutex_enter(&svntr_hashtab[hash].tr_lock); 9758 /* 9759 * Bail out if potentially MAP_SHARED writable mappings exist to this 9760 * vnode. We don't want to use old file contents from existing 9761 * replicas if this mapping was established after the original file 9762 * was changed. 9763 */ 9764 if (vn_is_mapped(vp, V_WRITE)) { 9765 mutex_exit(&svntr_hashtab[hash].tr_lock); 9766 svd->tr_state = SEGVN_TR_OFF; 9767 SEGVN_TR_ADDSTAT(wrcnt); 9768 return; 9769 } 9770 svntrp = svntr_hashtab[hash].tr_head; 9771 for (; svntrp != NULL; svntrp = svntrp->tr_next) { 9772 ASSERT(svntrp->tr_refcnt != 0); 9773 if (svntrp->tr_vp != vp) { 9774 continue; 9775 } 9776 9777 /* 9778 * Bail out if the file or its attributes were changed after 9779 * this replication entry was created since we need to use the 9780 * latest file contents. Note that mtime test alone is not 9781 * sufficient because a user can explicitly change mtime via 9782 * utimes(2) interfaces back to the old value after modifiying 9783 * the file contents. To detect this case we also have to test 9784 * ctime which among other things records the time of the last 9785 * mtime change by utimes(2). 
ctime is not changed when the file 9786 * is only read or executed so we expect that typically existing 9787 * replication amp's can be used most of the time. 9788 */ 9789 if (!svntrp->tr_valid || 9790 svntrp->tr_mtime.tv_sec != va.va_mtime.tv_sec || 9791 svntrp->tr_mtime.tv_nsec != va.va_mtime.tv_nsec || 9792 svntrp->tr_ctime.tv_sec != va.va_ctime.tv_sec || 9793 svntrp->tr_ctime.tv_nsec != va.va_ctime.tv_nsec) { 9794 mutex_exit(&svntr_hashtab[hash].tr_lock); 9795 svd->tr_state = SEGVN_TR_OFF; 9796 SEGVN_TR_ADDSTAT(stale); 9797 return; 9798 } 9799 /* 9800 * if off, eoff and szc match current segment we found the 9801 * existing entry we can use. 9802 */ 9803 if (svntrp->tr_off == off && svntrp->tr_eoff == eoff && 9804 svntrp->tr_szc == szc) { 9805 break; 9806 } 9807 /* 9808 * Don't create different but overlapping in file offsets 9809 * entries to avoid replication of the same file pages more 9810 * than once per lgroup. 9811 */ 9812 if ((off >= svntrp->tr_off && off < svntrp->tr_eoff) || 9813 (eoff > svntrp->tr_off && eoff <= svntrp->tr_eoff)) { 9814 mutex_exit(&svntr_hashtab[hash].tr_lock); 9815 svd->tr_state = SEGVN_TR_OFF; 9816 SEGVN_TR_ADDSTAT(overlap); 9817 return; 9818 } 9819 } 9820 /* 9821 * If we didn't find existing entry create a new one. 
9822 */ 9823 if (svntrp == NULL) { 9824 svntrp = kmem_cache_alloc(svntr_cache, KM_NOSLEEP); 9825 if (svntrp == NULL) { 9826 mutex_exit(&svntr_hashtab[hash].tr_lock); 9827 svd->tr_state = SEGVN_TR_OFF; 9828 SEGVN_TR_ADDSTAT(nokmem); 9829 return; 9830 } 9831 #ifdef DEBUG 9832 { 9833 lgrp_id_t i; 9834 for (i = 0; i < NLGRPS_MAX; i++) { 9835 ASSERT(svntrp->tr_amp[i] == NULL); 9836 } 9837 } 9838 #endif /* DEBUG */ 9839 svntrp->tr_vp = vp; 9840 svntrp->tr_off = off; 9841 svntrp->tr_eoff = eoff; 9842 svntrp->tr_szc = szc; 9843 svntrp->tr_valid = 1; 9844 svntrp->tr_mtime = va.va_mtime; 9845 svntrp->tr_ctime = va.va_ctime; 9846 svntrp->tr_refcnt = 0; 9847 svntrp->tr_next = svntr_hashtab[hash].tr_head; 9848 svntr_hashtab[hash].tr_head = svntrp; 9849 } 9850 first = 1; 9851 again: 9852 /* 9853 * We want to pick a replica with pages on main thread's (t_tid = 1, 9854 * aka T1) lgrp. Currently text replication is only optimized for 9855 * workloads that either have all threads of a process on the same 9856 * lgrp or execute their large text primarily on main thread. 9857 */ 9858 lgrp_id = p->p_t1_lgrpid; 9859 if (lgrp_id == LGRP_NONE) { 9860 /* 9861 * In case exec() prefaults text on non main thread use 9862 * current thread lgrpid. It will become main thread anyway 9863 * soon. 9864 */ 9865 lgrp_id = lgrp_home_id(curthread); 9866 } 9867 /* 9868 * Set p_tr_lgrpid to lgrpid if it hasn't been set yet. Otherwise 9869 * just set it to NLGRPS_MAX if it's different from current process T1 9870 * home lgrp. p_tr_lgrpid is used to detect if process uses text 9871 * replication and T1 new home is different from lgrp used for text 9872 * replication. When this happens asyncronous segvn thread rechecks if 9873 * segments should change lgrps used for text replication. If we fail 9874 * to set p_tr_lgrpid with atomic_cas_32 then set it to NLGRPS_MAX 9875 * without cas if it's not already NLGRPS_MAX and not equal lgrp_id 9876 * we want to use. 
We don't need to use cas in this case because 9877 * another thread that races in between our non atomic check and set 9878 * may only change p_tr_lgrpid to NLGRPS_MAX at this point. 9879 */ 9880 ASSERT(lgrp_id != LGRP_NONE && lgrp_id < NLGRPS_MAX); 9881 olid = p->p_tr_lgrpid; 9882 if (lgrp_id != olid && olid != NLGRPS_MAX) { 9883 lgrp_id_t nlid = (olid == LGRP_NONE) ? lgrp_id : NLGRPS_MAX; 9884 if (atomic_cas_32((uint32_t *)&p->p_tr_lgrpid, olid, nlid) != 9885 olid) { 9886 olid = p->p_tr_lgrpid; 9887 ASSERT(olid != LGRP_NONE); 9888 if (olid != lgrp_id && olid != NLGRPS_MAX) { 9889 p->p_tr_lgrpid = NLGRPS_MAX; 9890 } 9891 } 9892 ASSERT(p->p_tr_lgrpid != LGRP_NONE); 9893 membar_producer(); 9894 /* 9895 * lgrp_move_thread() won't schedule async recheck after 9896 * p->p_t1_lgrpid update unless p->p_tr_lgrpid is not 9897 * LGRP_NONE. Recheck p_t1_lgrpid once now that p->p_tr_lgrpid 9898 * is not LGRP_NONE. 9899 */ 9900 if (first && p->p_t1_lgrpid != LGRP_NONE && 9901 p->p_t1_lgrpid != lgrp_id) { 9902 first = 0; 9903 goto again; 9904 } 9905 } 9906 /* 9907 * If no amp was created yet for lgrp_id create a new one as long as 9908 * we have enough memory to afford it. 
9909 */ 9910 if ((amp = svntrp->tr_amp[lgrp_id]) == NULL) { 9911 size_t trmem = atomic_add_long_nv(&segvn_textrepl_bytes, size); 9912 if (trmem > segvn_textrepl_max_bytes) { 9913 SEGVN_TR_ADDSTAT(normem); 9914 goto fail; 9915 } 9916 if (anon_try_resv_zone(size, NULL) == 0) { 9917 SEGVN_TR_ADDSTAT(noanon); 9918 goto fail; 9919 } 9920 amp = anonmap_alloc(size, size, ANON_NOSLEEP); 9921 if (amp == NULL) { 9922 anon_unresv_zone(size, NULL); 9923 SEGVN_TR_ADDSTAT(nokmem); 9924 goto fail; 9925 } 9926 ASSERT(amp->refcnt == 1); 9927 amp->a_szc = szc; 9928 svntrp->tr_amp[lgrp_id] = amp; 9929 SEGVN_TR_ADDSTAT(newamp); 9930 } 9931 svntrp->tr_refcnt++; 9932 ASSERT(svd->svn_trnext == NULL); 9933 ASSERT(svd->svn_trprev == NULL); 9934 svd->svn_trnext = svntrp->tr_svnhead; 9935 svd->svn_trprev = NULL; 9936 if (svntrp->tr_svnhead != NULL) { 9937 svntrp->tr_svnhead->svn_trprev = svd; 9938 } 9939 svntrp->tr_svnhead = svd; 9940 ASSERT(amp->a_szc == szc && amp->size == size && amp->swresv == size); 9941 ASSERT(amp->refcnt >= 1); 9942 svd->amp = amp; 9943 svd->anon_index = 0; 9944 svd->tr_policy_info.mem_policy = LGRP_MEM_POLICY_NEXT_SEG; 9945 svd->tr_policy_info.mem_lgrpid = lgrp_id; 9946 svd->tr_state = SEGVN_TR_ON; 9947 mutex_exit(&svntr_hashtab[hash].tr_lock); 9948 SEGVN_TR_ADDSTAT(repl); 9949 return; 9950 fail: 9951 ASSERT(segvn_textrepl_bytes >= size); 9952 atomic_add_long(&segvn_textrepl_bytes, -size); 9953 ASSERT(svntrp != NULL); 9954 ASSERT(svntrp->tr_amp[lgrp_id] == NULL); 9955 if (svntrp->tr_refcnt == 0) { 9956 ASSERT(svntrp == svntr_hashtab[hash].tr_head); 9957 svntr_hashtab[hash].tr_head = svntrp->tr_next; 9958 mutex_exit(&svntr_hashtab[hash].tr_lock); 9959 kmem_cache_free(svntr_cache, svntrp); 9960 } else { 9961 mutex_exit(&svntr_hashtab[hash].tr_lock); 9962 } 9963 svd->tr_state = SEGVN_TR_OFF; 9964 } 9965 9966 /* 9967 * Convert seg back to regular vnode mapping seg by unbinding it from its text 9968 * replication amp. 
This routine is most typically called when segment is 9969 * unmapped but can also be called when segment no longer qualifies for text 9970 * replication (e.g. due to protection changes). If unload_unmap is set use 9971 * HAT_UNLOAD_UNMAP flag in hat_unload_callback(). If we are the last user of 9972 * svntr free all its anon maps and remove it from the hash table. 9973 */ 9974 static void 9975 segvn_textunrepl(struct seg *seg, int unload_unmap) 9976 { 9977 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 9978 vnode_t *vp = svd->vp; 9979 u_offset_t off = svd->offset; 9980 size_t size = seg->s_size; 9981 u_offset_t eoff = off + size; 9982 uint_t szc = seg->s_szc; 9983 ulong_t hash = SVNTR_HASH_FUNC(vp); 9984 svntr_t *svntrp; 9985 svntr_t **prv_svntrp; 9986 lgrp_id_t lgrp_id = svd->tr_policy_info.mem_lgrpid; 9987 lgrp_id_t i; 9988 9989 ASSERT(AS_LOCK_HELD(seg->s_as)); 9990 ASSERT(AS_WRITE_HELD(seg->s_as) || 9991 SEGVN_WRITE_HELD(seg->s_as, &svd->lock)); 9992 ASSERT(svd->tr_state == SEGVN_TR_ON); 9993 ASSERT(!HAT_IS_REGION_COOKIE_VALID(svd->rcookie)); 9994 ASSERT(svd->amp != NULL); 9995 ASSERT(svd->amp->refcnt >= 1); 9996 ASSERT(svd->anon_index == 0); 9997 ASSERT(lgrp_id != LGRP_NONE && lgrp_id < NLGRPS_MAX); 9998 ASSERT(svntr_hashtab != NULL); 9999 10000 mutex_enter(&svntr_hashtab[hash].tr_lock); 10001 prv_svntrp = &svntr_hashtab[hash].tr_head; 10002 for (; (svntrp = *prv_svntrp) != NULL; prv_svntrp = &svntrp->tr_next) { 10003 ASSERT(svntrp->tr_refcnt != 0); 10004 if (svntrp->tr_vp == vp && svntrp->tr_off == off && 10005 svntrp->tr_eoff == eoff && svntrp->tr_szc == szc) { 10006 break; 10007 } 10008 } 10009 if (svntrp == NULL) { 10010 panic("segvn_textunrepl: svntr record not found"); 10011 } 10012 if (svntrp->tr_amp[lgrp_id] != svd->amp) { 10013 panic("segvn_textunrepl: amp mismatch"); 10014 } 10015 svd->tr_state = SEGVN_TR_OFF; 10016 svd->amp = NULL; 10017 if (svd->svn_trprev == NULL) { 10018 ASSERT(svntrp->tr_svnhead == svd); 10019 svntrp->tr_svnhead = 
svd->svn_trnext; 10020 if (svntrp->tr_svnhead != NULL) { 10021 svntrp->tr_svnhead->svn_trprev = NULL; 10022 } 10023 svd->svn_trnext = NULL; 10024 } else { 10025 svd->svn_trprev->svn_trnext = svd->svn_trnext; 10026 if (svd->svn_trnext != NULL) { 10027 svd->svn_trnext->svn_trprev = svd->svn_trprev; 10028 svd->svn_trnext = NULL; 10029 } 10030 svd->svn_trprev = NULL; 10031 } 10032 if (--svntrp->tr_refcnt) { 10033 mutex_exit(&svntr_hashtab[hash].tr_lock); 10034 goto done; 10035 } 10036 *prv_svntrp = svntrp->tr_next; 10037 mutex_exit(&svntr_hashtab[hash].tr_lock); 10038 for (i = 0; i < NLGRPS_MAX; i++) { 10039 struct anon_map *amp = svntrp->tr_amp[i]; 10040 if (amp == NULL) { 10041 continue; 10042 } 10043 ASSERT(amp->refcnt == 1); 10044 ASSERT(amp->swresv == size); 10045 ASSERT(amp->size == size); 10046 ASSERT(amp->a_szc == szc); 10047 if (amp->a_szc != 0) { 10048 anon_free_pages(amp->ahp, 0, size, szc); 10049 } else { 10050 anon_free(amp->ahp, 0, size); 10051 } 10052 svntrp->tr_amp[i] = NULL; 10053 ASSERT(segvn_textrepl_bytes >= size); 10054 atomic_add_long(&segvn_textrepl_bytes, -size); 10055 anon_unresv_zone(amp->swresv, NULL); 10056 amp->refcnt = 0; 10057 anonmap_free(amp); 10058 } 10059 kmem_cache_free(svntr_cache, svntrp); 10060 done: 10061 hat_unload_callback(seg->s_as->a_hat, seg->s_base, size, 10062 unload_unmap ? HAT_UNLOAD_UNMAP : 0, NULL); 10063 } 10064 10065 /* 10066 * This is called when a MAP_SHARED writable mapping is created to a vnode 10067 * that is currently used for execution (VVMEXEC flag is set). In this case we 10068 * need to prevent further use of existing replicas. 
10069 */ 10070 static void 10071 segvn_inval_trcache(vnode_t *vp) 10072 { 10073 ulong_t hash = SVNTR_HASH_FUNC(vp); 10074 svntr_t *svntrp; 10075 10076 ASSERT(vp->v_flag & VVMEXEC); 10077 10078 if (svntr_hashtab == NULL) { 10079 return; 10080 } 10081 10082 mutex_enter(&svntr_hashtab[hash].tr_lock); 10083 svntrp = svntr_hashtab[hash].tr_head; 10084 for (; svntrp != NULL; svntrp = svntrp->tr_next) { 10085 ASSERT(svntrp->tr_refcnt != 0); 10086 if (svntrp->tr_vp == vp && svntrp->tr_valid) { 10087 svntrp->tr_valid = 0; 10088 } 10089 } 10090 mutex_exit(&svntr_hashtab[hash].tr_lock); 10091 } 10092 10093 static void 10094 segvn_trasync_thread(void) 10095 { 10096 callb_cpr_t cpr_info; 10097 kmutex_t cpr_lock; /* just for CPR stuff */ 10098 10099 mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL); 10100 10101 CALLB_CPR_INIT(&cpr_info, &cpr_lock, 10102 callb_generic_cpr, "segvn_async"); 10103 10104 if (segvn_update_textrepl_interval == 0) { 10105 segvn_update_textrepl_interval = segvn_update_tr_time * hz; 10106 } else { 10107 segvn_update_textrepl_interval *= hz; 10108 } 10109 (void) timeout(segvn_trupdate_wakeup, NULL, 10110 segvn_update_textrepl_interval); 10111 10112 for (;;) { 10113 mutex_enter(&cpr_lock); 10114 CALLB_CPR_SAFE_BEGIN(&cpr_info); 10115 mutex_exit(&cpr_lock); 10116 sema_p(&segvn_trasync_sem); 10117 mutex_enter(&cpr_lock); 10118 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock); 10119 mutex_exit(&cpr_lock); 10120 segvn_trupdate(); 10121 } 10122 } 10123 10124 static uint64_t segvn_lgrp_trthr_migrs_snpsht = 0; 10125 10126 static void 10127 segvn_trupdate_wakeup(void *dummy) 10128 { 10129 uint64_t cur_lgrp_trthr_migrs = lgrp_get_trthr_migrations(); 10130 10131 if (cur_lgrp_trthr_migrs != segvn_lgrp_trthr_migrs_snpsht) { 10132 segvn_lgrp_trthr_migrs_snpsht = cur_lgrp_trthr_migrs; 10133 sema_v(&segvn_trasync_sem); 10134 } 10135 10136 if (!segvn_disable_textrepl_update && 10137 segvn_update_textrepl_interval != 0) { 10138 (void) timeout(segvn_trupdate_wakeup, dummy, 10139 
segvn_update_textrepl_interval); 10140 } 10141 } 10142 10143 static void 10144 segvn_trupdate(void) 10145 { 10146 ulong_t hash; 10147 svntr_t *svntrp; 10148 segvn_data_t *svd; 10149 10150 ASSERT(svntr_hashtab != NULL); 10151 10152 for (hash = 0; hash < svntr_hashtab_sz; hash++) { 10153 mutex_enter(&svntr_hashtab[hash].tr_lock); 10154 svntrp = svntr_hashtab[hash].tr_head; 10155 for (; svntrp != NULL; svntrp = svntrp->tr_next) { 10156 ASSERT(svntrp->tr_refcnt != 0); 10157 svd = svntrp->tr_svnhead; 10158 for (; svd != NULL; svd = svd->svn_trnext) { 10159 segvn_trupdate_seg(svd->seg, svd, svntrp, 10160 hash); 10161 } 10162 } 10163 mutex_exit(&svntr_hashtab[hash].tr_lock); 10164 } 10165 } 10166 10167 static void 10168 segvn_trupdate_seg(struct seg *seg, 10169 segvn_data_t *svd, 10170 svntr_t *svntrp, 10171 ulong_t hash) 10172 { 10173 proc_t *p; 10174 lgrp_id_t lgrp_id; 10175 struct as *as; 10176 size_t size; 10177 struct anon_map *amp; 10178 10179 ASSERT(svd->vp != NULL); 10180 ASSERT(svd->vp == svntrp->tr_vp); 10181 ASSERT(svd->offset == svntrp->tr_off); 10182 ASSERT(svd->offset + seg->s_size == svntrp->tr_eoff); 10183 ASSERT(seg != NULL); 10184 ASSERT(svd->seg == seg); 10185 ASSERT(seg->s_data == (void *)svd); 10186 ASSERT(seg->s_szc == svntrp->tr_szc); 10187 ASSERT(svd->tr_state == SEGVN_TR_ON); 10188 ASSERT(!HAT_IS_REGION_COOKIE_VALID(svd->rcookie)); 10189 ASSERT(svd->amp != NULL); 10190 ASSERT(svd->tr_policy_info.mem_policy == LGRP_MEM_POLICY_NEXT_SEG); 10191 ASSERT(svd->tr_policy_info.mem_lgrpid != LGRP_NONE); 10192 ASSERT(svd->tr_policy_info.mem_lgrpid < NLGRPS_MAX); 10193 ASSERT(svntrp->tr_amp[svd->tr_policy_info.mem_lgrpid] == svd->amp); 10194 ASSERT(svntrp->tr_refcnt != 0); 10195 ASSERT(mutex_owned(&svntr_hashtab[hash].tr_lock)); 10196 10197 as = seg->s_as; 10198 ASSERT(as != NULL && as != &kas); 10199 p = as->a_proc; 10200 ASSERT(p != NULL); 10201 ASSERT(p->p_tr_lgrpid != LGRP_NONE); 10202 lgrp_id = p->p_t1_lgrpid; 10203 if (lgrp_id == LGRP_NONE) { 10204 
return; 10205 } 10206 ASSERT(lgrp_id < NLGRPS_MAX); 10207 if (svd->tr_policy_info.mem_lgrpid == lgrp_id) { 10208 return; 10209 } 10210 10211 /* 10212 * Use tryenter locking since we are locking as/seg and svntr hash 10213 * lock in reverse from syncrounous thread order. 10214 */ 10215 if (!AS_LOCK_TRYENTER(as, RW_READER)) { 10216 SEGVN_TR_ADDSTAT(nolock); 10217 if (segvn_lgrp_trthr_migrs_snpsht) { 10218 segvn_lgrp_trthr_migrs_snpsht = 0; 10219 } 10220 return; 10221 } 10222 if (!SEGVN_LOCK_TRYENTER(seg->s_as, &svd->lock, RW_WRITER)) { 10223 AS_LOCK_EXIT(as); 10224 SEGVN_TR_ADDSTAT(nolock); 10225 if (segvn_lgrp_trthr_migrs_snpsht) { 10226 segvn_lgrp_trthr_migrs_snpsht = 0; 10227 } 10228 return; 10229 } 10230 size = seg->s_size; 10231 if (svntrp->tr_amp[lgrp_id] == NULL) { 10232 size_t trmem = atomic_add_long_nv(&segvn_textrepl_bytes, size); 10233 if (trmem > segvn_textrepl_max_bytes) { 10234 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 10235 AS_LOCK_EXIT(as); 10236 atomic_add_long(&segvn_textrepl_bytes, -size); 10237 SEGVN_TR_ADDSTAT(normem); 10238 return; 10239 } 10240 if (anon_try_resv_zone(size, NULL) == 0) { 10241 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 10242 AS_LOCK_EXIT(as); 10243 atomic_add_long(&segvn_textrepl_bytes, -size); 10244 SEGVN_TR_ADDSTAT(noanon); 10245 return; 10246 } 10247 amp = anonmap_alloc(size, size, KM_NOSLEEP); 10248 if (amp == NULL) { 10249 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 10250 AS_LOCK_EXIT(as); 10251 atomic_add_long(&segvn_textrepl_bytes, -size); 10252 anon_unresv_zone(size, NULL); 10253 SEGVN_TR_ADDSTAT(nokmem); 10254 return; 10255 } 10256 ASSERT(amp->refcnt == 1); 10257 amp->a_szc = seg->s_szc; 10258 svntrp->tr_amp[lgrp_id] = amp; 10259 } 10260 /* 10261 * We don't need to drop the bucket lock but here we give other 10262 * threads a chance. svntr and svd can't be unlinked as long as 10263 * segment lock is held as a writer and AS held as well. After we 10264 * retake bucket lock we'll continue from where we left. 
We'll be able 10265 * to reach the end of either list since new entries are always added 10266 * to the beginning of the lists. 10267 */ 10268 mutex_exit(&svntr_hashtab[hash].tr_lock); 10269 hat_unload_callback(as->a_hat, seg->s_base, size, 0, NULL); 10270 mutex_enter(&svntr_hashtab[hash].tr_lock); 10271 10272 ASSERT(svd->tr_state == SEGVN_TR_ON); 10273 ASSERT(svd->amp != NULL); 10274 ASSERT(svd->tr_policy_info.mem_policy == LGRP_MEM_POLICY_NEXT_SEG); 10275 ASSERT(svd->tr_policy_info.mem_lgrpid != lgrp_id); 10276 ASSERT(svd->amp != svntrp->tr_amp[lgrp_id]); 10277 10278 svd->tr_policy_info.mem_lgrpid = lgrp_id; 10279 svd->amp = svntrp->tr_amp[lgrp_id]; 10280 p->p_tr_lgrpid = NLGRPS_MAX; 10281 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 10282 AS_LOCK_EXIT(as); 10283 10284 ASSERT(svntrp->tr_refcnt != 0); 10285 ASSERT(svd->vp == svntrp->tr_vp); 10286 ASSERT(svd->tr_policy_info.mem_lgrpid == lgrp_id); 10287 ASSERT(svd->amp != NULL && svd->amp == svntrp->tr_amp[lgrp_id]); 10288 ASSERT(svd->seg == seg); 10289 ASSERT(svd->tr_state == SEGVN_TR_ON); 10290 10291 SEGVN_TR_ADDSTAT(asyncrepl); 10292 } 10293