1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright 2015, Joyent, Inc. All rights reserved. 24 * Copyright 2015 Nexenta Systems, Inc. All rights reserved. 25 */ 26 27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 28 /* All Rights Reserved */ 29 30 /* 31 * University Copyright- Copyright (c) 1982, 1986, 1988 32 * The Regents of the University of California 33 * All Rights Reserved 34 * 35 * University Acknowledgment- Portions of this document are derived from 36 * software developed by the University of California, Berkeley, and its 37 * contributors. 38 */ 39 40 /* 41 * VM - shared or copy-on-write from a vnode/anonymous memory. 42 */ 43 44 #include <sys/types.h> 45 #include <sys/param.h> 46 #include <sys/t_lock.h> 47 #include <sys/errno.h> 48 #include <sys/systm.h> 49 #include <sys/mman.h> 50 #include <sys/debug.h> 51 #include <sys/cred.h> 52 #include <sys/vmsystm.h> 53 #include <sys/tuneable.h> 54 #include <sys/bitmap.h> 55 #include <sys/swap.h> 56 #include <sys/kmem.h> 57 #include <sys/sysmacros.h> 58 #include <sys/vtrace.h> 59 #include <sys/cmn_err.h> 60 #include <sys/callb.h> 61 #include <sys/vm.h> 62 #include <sys/dumphdr.h> 63 #include <sys/lgrp.h> 64 65 #include <vm/hat.h> 66 #include <vm/as.h> 67 #include <vm/seg.h> 68 #include <vm/seg_vn.h> 69 #include <vm/pvn.h> 70 #include <vm/anon.h> 71 #include <vm/page.h> 72 #include <vm/vpage.h> 73 #include <sys/proc.h> 74 #include <sys/task.h> 75 #include <sys/project.h> 76 #include <sys/zone.h> 77 #include <sys/shm_impl.h> 78 79 /* 80 * segvn_fault needs a temporary page list array. To avoid calling kmem all 81 * the time, it creates a small (PVN_GETPAGE_NUM entry) array and uses it if 82 * it can. In the rare case when this page list is not large enough, it 83 * goes and gets a large enough array from kmem. 84 * 85 * This small page list array covers either 8 pages or 64kB worth of pages - 86 * whichever is smaller. 87 */ 88 #define PVN_MAX_GETPAGE_SZ 0x10000 89 #define PVN_MAX_GETPAGE_NUM 0x8 90 91 #if PVN_MAX_GETPAGE_SZ > PVN_MAX_GETPAGE_NUM * PAGESIZE 92 #define PVN_GETPAGE_SZ ptob(PVN_MAX_GETPAGE_NUM) 93 #define PVN_GETPAGE_NUM PVN_MAX_GETPAGE_NUM 94 #else 95 #define PVN_GETPAGE_SZ PVN_MAX_GETPAGE_SZ 96 #define PVN_GETPAGE_NUM btop(PVN_MAX_GETPAGE_SZ) 97 #endif 98 99 /* 100 * Private seg op routines. 
101 */ 102 static int segvn_dup(struct seg *seg, struct seg *newseg); 103 static int segvn_unmap(struct seg *seg, caddr_t addr, size_t len); 104 static void segvn_free(struct seg *seg); 105 static faultcode_t segvn_fault(struct hat *hat, struct seg *seg, 106 caddr_t addr, size_t len, enum fault_type type, 107 enum seg_rw rw); 108 static faultcode_t segvn_faulta(struct seg *seg, caddr_t addr); 109 static int segvn_setprot(struct seg *seg, caddr_t addr, 110 size_t len, uint_t prot); 111 static int segvn_checkprot(struct seg *seg, caddr_t addr, 112 size_t len, uint_t prot); 113 static int segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta); 114 static size_t segvn_swapout(struct seg *seg); 115 static int segvn_sync(struct seg *seg, caddr_t addr, size_t len, 116 int attr, uint_t flags); 117 static size_t segvn_incore(struct seg *seg, caddr_t addr, size_t len, 118 char *vec); 119 static int segvn_lockop(struct seg *seg, caddr_t addr, size_t len, 120 int attr, int op, ulong_t *lockmap, size_t pos); 121 static int segvn_getprot(struct seg *seg, caddr_t addr, size_t len, 122 uint_t *protv); 123 static u_offset_t segvn_getoffset(struct seg *seg, caddr_t addr); 124 static int segvn_gettype(struct seg *seg, caddr_t addr); 125 static int segvn_getvp(struct seg *seg, caddr_t addr, 126 struct vnode **vpp); 127 static int segvn_advise(struct seg *seg, caddr_t addr, size_t len, 128 uint_t behav); 129 static void segvn_dump(struct seg *seg); 130 static int segvn_pagelock(struct seg *seg, caddr_t addr, size_t len, 131 struct page ***ppp, enum lock_type type, enum seg_rw rw); 132 static int segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len, 133 uint_t szc); 134 static int segvn_getmemid(struct seg *seg, caddr_t addr, 135 memid_t *memidp); 136 static lgrp_mem_policy_info_t *segvn_getpolicy(struct seg *, caddr_t); 137 static int segvn_capable(struct seg *seg, segcapability_t capable); 138 static int segvn_inherit(struct seg *, caddr_t, size_t, uint_t); 139 140 struct seg_ops segvn_ops = { 141 segvn_dup, 142 segvn_unmap, 143 segvn_free, 144 segvn_fault, 145 segvn_faulta, 146 segvn_setprot, 147 segvn_checkprot, 148 segvn_kluster, 149 segvn_swapout, 150 segvn_sync, 151 segvn_incore, 152 segvn_lockop, 153 segvn_getprot, 154 segvn_getoffset, 155 segvn_gettype, 156 segvn_getvp, 157 segvn_advise, 158 segvn_dump, 159 segvn_pagelock, 160 segvn_setpagesize, 161 segvn_getmemid, 162 segvn_getpolicy, 163 segvn_capable, 164 segvn_inherit 165 }; 166 167 /* 168 * Common zfod structures, provided as a shorthand for others to use. 
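 *
 * Purely as an illustration (this call site is not part of this file):
 * callers normally hand one of the argsp pointers below to as_map()
 * together with segvn_create; a user zero-fill-on-demand mapping is
 * typically established along the lines of
 *
 *	(void) as_map(as, addr, len, segvn_create, zfod_argsp);
 *
 * where as, addr and len are the caller's address space and range.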
169 */ 170 static segvn_crargs_t zfod_segvn_crargs = 171 SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL); 172 static segvn_crargs_t kzfod_segvn_crargs = 173 SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_USER, 174 PROT_ALL & ~PROT_USER); 175 static segvn_crargs_t stack_noexec_crargs = 176 SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_EXEC, PROT_ALL); 177 178 caddr_t zfod_argsp = (caddr_t)&zfod_segvn_crargs; /* user zfod argsp */ 179 caddr_t kzfod_argsp = (caddr_t)&kzfod_segvn_crargs; /* kernel zfod argsp */ 180 caddr_t stack_exec_argsp = (caddr_t)&zfod_segvn_crargs; /* executable stack */ 181 caddr_t stack_noexec_argsp = (caddr_t)&stack_noexec_crargs; /* noexec stack */ 182 183 #define vpgtob(n) ((n) * sizeof (struct vpage)) /* For brevity */ 184 185 size_t segvn_comb_thrshld = UINT_MAX; /* patchable -- see 1196681 */ 186 187 size_t segvn_pglock_comb_thrshld = (1UL << 16); /* 64K */ 188 size_t segvn_pglock_comb_balign = (1UL << 16); /* 64K */ 189 uint_t segvn_pglock_comb_bshift; 190 size_t segvn_pglock_comb_palign; 191 192 static int segvn_concat(struct seg *, struct seg *, int); 193 static int segvn_extend_prev(struct seg *, struct seg *, 194 struct segvn_crargs *, size_t); 195 static int segvn_extend_next(struct seg *, struct seg *, 196 struct segvn_crargs *, size_t); 197 static void segvn_softunlock(struct seg *, caddr_t, size_t, enum seg_rw); 198 static void segvn_pagelist_rele(page_t **); 199 static void segvn_setvnode_mpss(vnode_t *); 200 static void segvn_relocate_pages(page_t **, page_t *); 201 static int segvn_full_szcpages(page_t **, uint_t, int *, uint_t *); 202 static int segvn_fill_vp_pages(struct segvn_data *, vnode_t *, u_offset_t, 203 uint_t, page_t **, page_t **, uint_t *, int *); 204 static faultcode_t segvn_fault_vnodepages(struct hat *, struct seg *, caddr_t, 205 caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int); 206 static faultcode_t segvn_fault_anonpages(struct hat *, struct seg *, caddr_t, 207 caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int); 208 static faultcode_t segvn_faultpage(struct hat *, struct seg *, caddr_t, 209 u_offset_t, struct vpage *, page_t **, uint_t, 210 enum fault_type, enum seg_rw, int); 211 static void segvn_vpage(struct seg *); 212 static size_t segvn_count_swap_by_vpages(struct seg *); 213 214 static void segvn_purge(struct seg *seg); 215 static int segvn_reclaim(void *, caddr_t, size_t, struct page **, 216 enum seg_rw, int); 217 static int shamp_reclaim(void *, caddr_t, size_t, struct page **, 218 enum seg_rw, int); 219 220 static int sameprot(struct seg *, caddr_t, size_t); 221 222 static int segvn_demote_range(struct seg *, caddr_t, size_t, int, uint_t); 223 static int segvn_clrszc(struct seg *); 224 static struct seg *segvn_split_seg(struct seg *, caddr_t); 225 static int segvn_claim_pages(struct seg *, struct vpage *, u_offset_t, 226 ulong_t, uint_t); 227 228 static void segvn_hat_rgn_unload_callback(caddr_t, caddr_t, caddr_t, 229 size_t, void *, u_offset_t); 230 231 static struct kmem_cache *segvn_cache; 232 static struct kmem_cache **segvn_szc_cache; 233 234 #ifdef VM_STATS 235 static struct segvnvmstats_str { 236 ulong_t fill_vp_pages[31]; 237 ulong_t fltvnpages[49]; 238 ulong_t fullszcpages[10]; 239 ulong_t relocatepages[3]; 240 ulong_t fltanpages[17]; 241 ulong_t pagelock[2]; 242 ulong_t demoterange[3]; 243 } segvnvmstats; 244 #endif /* VM_STATS */ 245 246 #define SDR_RANGE 1 /* demote entire range */ 247 #define SDR_END 2 /* demote non aligned ends only */ 248 249 #define CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr) { \ 250 if 
((len) != 0) { \ 251 lpgaddr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz); \ 252 ASSERT(lpgaddr >= (seg)->s_base); \ 253 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)((addr) + \ 254 (len)), pgsz); \ 255 ASSERT(lpgeaddr > lpgaddr); \ 256 ASSERT(lpgeaddr <= (seg)->s_base + (seg)->s_size); \ 257 } else { \ 258 lpgeaddr = lpgaddr = (addr); \ 259 } \ 260 } 261 262 /*ARGSUSED*/ 263 static int 264 segvn_cache_constructor(void *buf, void *cdrarg, int kmflags) 265 { 266 struct segvn_data *svd = buf; 267 268 rw_init(&svd->lock, NULL, RW_DEFAULT, NULL); 269 mutex_init(&svd->segfree_syncmtx, NULL, MUTEX_DEFAULT, NULL); 270 svd->svn_trnext = svd->svn_trprev = NULL; 271 return (0); 272 } 273 274 /*ARGSUSED1*/ 275 static void 276 segvn_cache_destructor(void *buf, void *cdrarg) 277 { 278 struct segvn_data *svd = buf; 279 280 rw_destroy(&svd->lock); 281 mutex_destroy(&svd->segfree_syncmtx); 282 } 283 284 /*ARGSUSED*/ 285 static int 286 svntr_cache_constructor(void *buf, void *cdrarg, int kmflags) 287 { 288 bzero(buf, sizeof (svntr_t)); 289 return (0); 290 } 291 292 /* 293 * Patching this variable to non-zero allows the system to run with 294 * stacks marked as "not executable". It's a bit of a kludge, but is 295 * provided as a tweakable for platforms that export those ABIs 296 * (e.g. sparc V8) that have executable stacks enabled by default. 297 * There are also some restrictions for platforms that don't actually 298 * implement 'noexec' protections. 299 * 300 * Once enabled, the system is (therefore) unable to provide a fully 301 * ABI-compliant execution environment, though practically speaking, 302 * most everything works. The exceptions are generally some interpreters 303 * and debuggers that create executable code on the stack and jump 304 * into it (without explicitly mprotecting the address range to include 305 * PROT_EXEC). 306 * 307 * One important class of applications that are disabled are those 308 * that have been transformed into malicious agents using one of the 309 * numerous "buffer overflow" attacks. See 4007890. 310 */ 311 int noexec_user_stack = 0; 312 int noexec_user_stack_log = 1; 313 314 int segvn_lpg_disable = 0; 315 uint_t segvn_maxpgszc = 0; 316 317 ulong_t segvn_vmpss_clrszc_cnt; 318 ulong_t segvn_vmpss_clrszc_err; 319 ulong_t segvn_fltvnpages_clrszc_cnt; 320 ulong_t segvn_fltvnpages_clrszc_err; 321 ulong_t segvn_setpgsz_align_err; 322 ulong_t segvn_setpgsz_anon_align_err; 323 ulong_t segvn_setpgsz_getattr_err; 324 ulong_t segvn_setpgsz_eof_err; 325 ulong_t segvn_faultvnmpss_align_err1; 326 ulong_t segvn_faultvnmpss_align_err2; 327 ulong_t segvn_faultvnmpss_align_err3; 328 ulong_t segvn_faultvnmpss_align_err4; 329 ulong_t segvn_faultvnmpss_align_err5; 330 ulong_t segvn_vmpss_pageio_deadlk_err; 331 332 int segvn_use_regions = 1; 333 334 /* 335 * Segvn supports text replication optimization for NUMA platforms. Text 336 * replica's are represented by anon maps (amp). There's one amp per text file 337 * region per lgroup. A process chooses the amp for each of its text mappings 338 * based on the lgroup assignment of its main thread (t_tid = 1). All 339 * processes that want a replica on a particular lgroup for the same text file 340 * mapping share the same amp. amp's are looked up in svntr_hashtab hash table 341 * with vp,off,size,szc used as a key. Text replication segments are read only 342 * MAP_PRIVATE|MAP_TEXT segments that map vnode. Replication is achieved by 343 * forcing COW faults from vnode to amp and mapping amp pages instead of vnode 344 * pages. 
Replication amp is assigned to a segment when it gets its first 345 * pagefault. To handle main thread lgroup rehoming segvn_trasync_thread 346 * rechecks periodically if the process still maps an amp local to the main 347 * thread. If not async thread forces process to remap to an amp in the new 348 * home lgroup of the main thread. Current text replication implementation 349 * only provides the benefit to workloads that do most of their work in the 350 * main thread of a process or all the threads of a process run in the same 351 * lgroup. To extend text replication benefit to different types of 352 * multithreaded workloads further work would be needed in the hat layer to 353 * allow the same virtual address in the same hat to simultaneously map 354 * different physical addresses (i.e. page table replication would be needed 355 * for x86). 356 * 357 * amp pages are used instead of vnode pages as long as segment has a very 358 * simple life cycle. It's created via segvn_create(), handles S_EXEC 359 * (S_READ) pagefaults and is fully unmapped. If anything more complicated 360 * happens such as protection is changed, real COW fault happens, pagesize is 361 * changed, MC_LOCK is requested or segment is partially unmapped we turn off 362 * text replication by converting the segment back to vnode only segment 363 * (unmap segment's address range and set svd->amp to NULL). 364 * 365 * The original file can be changed after amp is inserted into 366 * svntr_hashtab. Processes that are launched after the file is already 367 * changed can't use the replica's created prior to the file change. To 368 * implement this functionality hash entries are timestamped. Replica's can 369 * only be used if current file modification time is the same as the timestamp 370 * saved when hash entry was created. However just timestamps alone are not 371 * sufficient to detect file modification via mmap(MAP_SHARED) mappings. We 372 * deal with file changes via MAP_SHARED mappings differently. When writable 373 * MAP_SHARED mappings are created to vnodes marked as executable we mark all 374 * existing replica's for this vnode as not usable for future text 375 * mappings. And we don't create new replica's for files that currently have 376 * potentially writable MAP_SHARED mappings (i.e. vn_is_mapped(V_WRITE) is 377 * true). 
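 *
 * A minimal sketch of the lookup described above (illustrative only;
 * the svntr field names used here are assumptions, not definitions
 * taken from this file): a hash entry is only reused for a new text
 * mapping if the vnode, offset, size and page size code all match and
 * the file has not been modified since the entry was created, roughly
 *
 *	if (svntrp->tr_vp == vp && svntrp->tr_off == off &&
 *	    svntrp->tr_sz == sz && svntrp->tr_szc == szc &&
 *	    svntrp->tr_mtime == cur_mtime)
 *		reuse the replica amp for this lgroup;
 *
 * otherwise the stale entry is skipped and replication starts over.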
378 */ 379 380 #define SEGVN_TEXTREPL_MAXBYTES_FACTOR (20) 381 size_t segvn_textrepl_max_bytes_factor = SEGVN_TEXTREPL_MAXBYTES_FACTOR; 382 383 static ulong_t svntr_hashtab_sz = 512; 384 static svntr_bucket_t *svntr_hashtab = NULL; 385 static struct kmem_cache *svntr_cache; 386 static svntr_stats_t *segvn_textrepl_stats; 387 static ksema_t segvn_trasync_sem; 388 389 int segvn_disable_textrepl = 1; 390 size_t textrepl_size_thresh = (size_t)-1; 391 size_t segvn_textrepl_bytes = 0; 392 size_t segvn_textrepl_max_bytes = 0; 393 clock_t segvn_update_textrepl_interval = 0; 394 int segvn_update_tr_time = 10; 395 int segvn_disable_textrepl_update = 0; 396 397 static void segvn_textrepl(struct seg *); 398 static void segvn_textunrepl(struct seg *, int); 399 static void segvn_inval_trcache(vnode_t *); 400 static void segvn_trasync_thread(void); 401 static void segvn_trupdate_wakeup(void *); 402 static void segvn_trupdate(void); 403 static void segvn_trupdate_seg(struct seg *, segvn_data_t *, svntr_t *, 404 ulong_t); 405 406 /* 407 * Initialize segvn data structures 408 */ 409 void 410 segvn_init(void) 411 { 412 uint_t maxszc; 413 uint_t szc; 414 size_t pgsz; 415 416 segvn_cache = kmem_cache_create("segvn_cache", 417 sizeof (struct segvn_data), 0, 418 segvn_cache_constructor, segvn_cache_destructor, NULL, 419 NULL, NULL, 0); 420 421 if (segvn_lpg_disable == 0) { 422 szc = maxszc = page_num_pagesizes() - 1; 423 if (szc == 0) { 424 segvn_lpg_disable = 1; 425 } 426 if (page_get_pagesize(0) != PAGESIZE) { 427 panic("segvn_init: bad szc 0"); 428 /*NOTREACHED*/ 429 } 430 while (szc != 0) { 431 pgsz = page_get_pagesize(szc); 432 if (pgsz <= PAGESIZE || !IS_P2ALIGNED(pgsz, pgsz)) { 433 panic("segvn_init: bad szc %d", szc); 434 /*NOTREACHED*/ 435 } 436 szc--; 437 } 438 if (segvn_maxpgszc == 0 || segvn_maxpgszc > maxszc) 439 segvn_maxpgszc = maxszc; 440 } 441 442 if (segvn_maxpgszc) { 443 segvn_szc_cache = (struct kmem_cache **)kmem_alloc( 444 (segvn_maxpgszc + 1) * sizeof (struct kmem_cache *), 445 KM_SLEEP); 446 } 447 448 for (szc = 1; szc <= segvn_maxpgszc; szc++) { 449 char str[32]; 450 451 (void) sprintf(str, "segvn_szc_cache%d", szc); 452 segvn_szc_cache[szc] = kmem_cache_create(str, 453 page_get_pagecnt(szc) * sizeof (page_t *), 0, 454 NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG); 455 } 456 457 458 if (segvn_use_regions && !hat_supported(HAT_SHARED_REGIONS, NULL)) 459 segvn_use_regions = 0; 460 461 /* 462 * For now shared regions and text replication segvn support 463 * are mutually exclusive. This is acceptable because 464 * currently significant benefit from text replication was 465 * only observed on AMD64 NUMA platforms (due to relatively 466 * small L2$ size) and currently we don't support shared 467 * regions on x86. 
468 */ 469 if (segvn_use_regions && !segvn_disable_textrepl) { 470 segvn_disable_textrepl = 1; 471 } 472 473 #if defined(_LP64) 474 if (lgrp_optimizations() && textrepl_size_thresh != (size_t)-1 && 475 !segvn_disable_textrepl) { 476 ulong_t i; 477 size_t hsz = svntr_hashtab_sz * sizeof (svntr_bucket_t); 478 479 svntr_cache = kmem_cache_create("svntr_cache", 480 sizeof (svntr_t), 0, svntr_cache_constructor, NULL, 481 NULL, NULL, NULL, 0); 482 svntr_hashtab = kmem_zalloc(hsz, KM_SLEEP); 483 for (i = 0; i < svntr_hashtab_sz; i++) { 484 mutex_init(&svntr_hashtab[i].tr_lock, NULL, 485 MUTEX_DEFAULT, NULL); 486 } 487 segvn_textrepl_max_bytes = ptob(physmem) / 488 segvn_textrepl_max_bytes_factor; 489 segvn_textrepl_stats = kmem_zalloc(NCPU * 490 sizeof (svntr_stats_t), KM_SLEEP); 491 sema_init(&segvn_trasync_sem, 0, NULL, SEMA_DEFAULT, NULL); 492 (void) thread_create(NULL, 0, segvn_trasync_thread, 493 NULL, 0, &p0, TS_RUN, minclsyspri); 494 } 495 #endif 496 497 if (!ISP2(segvn_pglock_comb_balign) || 498 segvn_pglock_comb_balign < PAGESIZE) { 499 segvn_pglock_comb_balign = 1UL << 16; /* 64K */ 500 } 501 segvn_pglock_comb_bshift = highbit(segvn_pglock_comb_balign) - 1; 502 segvn_pglock_comb_palign = btop(segvn_pglock_comb_balign); 503 } 504 505 #define SEGVN_PAGEIO ((void *)0x1) 506 #define SEGVN_NOPAGEIO ((void *)0x2) 507 508 static void 509 segvn_setvnode_mpss(vnode_t *vp) 510 { 511 int err; 512 513 ASSERT(vp->v_mpssdata == NULL || 514 vp->v_mpssdata == SEGVN_PAGEIO || 515 vp->v_mpssdata == SEGVN_NOPAGEIO); 516 517 if (vp->v_mpssdata == NULL) { 518 if (vn_vmpss_usepageio(vp)) { 519 err = VOP_PAGEIO(vp, (page_t *)NULL, 520 (u_offset_t)0, 0, 0, CRED(), NULL); 521 } else { 522 err = ENOSYS; 523 } 524 /* 525 * set v_mpssdata just once per vnode life 526 * so that it never changes. 527 */ 528 mutex_enter(&vp->v_lock); 529 if (vp->v_mpssdata == NULL) { 530 if (err == EINVAL) { 531 vp->v_mpssdata = SEGVN_PAGEIO; 532 } else { 533 vp->v_mpssdata = SEGVN_NOPAGEIO; 534 } 535 } 536 mutex_exit(&vp->v_lock); 537 } 538 } 539 540 int 541 segvn_create(struct seg *seg, void *argsp) 542 { 543 struct segvn_crargs *a = (struct segvn_crargs *)argsp; 544 struct segvn_data *svd; 545 size_t swresv = 0; 546 struct cred *cred; 547 struct anon_map *amp; 548 int error = 0; 549 size_t pgsz; 550 lgrp_mem_policy_t mpolicy = LGRP_MEM_POLICY_DEFAULT; 551 int use_rgn = 0; 552 int trok = 0; 553 554 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as)); 555 556 if (a->type != MAP_PRIVATE && a->type != MAP_SHARED) { 557 panic("segvn_create type"); 558 /*NOTREACHED*/ 559 } 560 561 /* 562 * Check arguments. If a shared anon structure is given then 563 * it is illegal to also specify a vp. 564 */ 565 if (a->amp != NULL && a->vp != NULL) { 566 panic("segvn_create anon_map"); 567 /*NOTREACHED*/ 568 } 569 570 if (a->type == MAP_PRIVATE && (a->flags & MAP_TEXT) && 571 a->vp != NULL && a->prot == (PROT_USER | PROT_READ | PROT_EXEC) && 572 segvn_use_regions) { 573 use_rgn = 1; 574 } 575 576 /* MAP_NORESERVE on a MAP_SHARED segment is meaningless. 
*/ 577 if (a->type == MAP_SHARED) 578 a->flags &= ~MAP_NORESERVE; 579 580 if (a->szc != 0) { 581 if (segvn_lpg_disable != 0 || (a->szc == AS_MAP_NO_LPOOB) || 582 (a->amp != NULL && a->type == MAP_PRIVATE) || 583 (a->flags & MAP_NORESERVE) || seg->s_as == &kas) { 584 a->szc = 0; 585 } else { 586 if (a->szc > segvn_maxpgszc) 587 a->szc = segvn_maxpgszc; 588 pgsz = page_get_pagesize(a->szc); 589 if (!IS_P2ALIGNED(seg->s_base, pgsz) || 590 !IS_P2ALIGNED(seg->s_size, pgsz)) { 591 a->szc = 0; 592 } else if (a->vp != NULL) { 593 if (IS_SWAPFSVP(a->vp) || VN_ISKAS(a->vp)) { 594 /* 595 * paranoid check. 596 * hat_page_demote() is not supported 597 * on swapfs pages. 598 */ 599 a->szc = 0; 600 } else if (map_addr_vacalign_check(seg->s_base, 601 a->offset & PAGEMASK)) { 602 a->szc = 0; 603 } 604 } else if (a->amp != NULL) { 605 pgcnt_t anum = btopr(a->offset); 606 pgcnt_t pgcnt = page_get_pagecnt(a->szc); 607 if (!IS_P2ALIGNED(anum, pgcnt)) { 608 a->szc = 0; 609 } 610 } 611 } 612 } 613 614 /* 615 * If segment may need private pages, reserve them now. 616 */ 617 if (!(a->flags & MAP_NORESERVE) && ((a->vp == NULL && a->amp == NULL) || 618 (a->type == MAP_PRIVATE && (a->prot & PROT_WRITE)))) { 619 if (anon_resv_zone(seg->s_size, 620 seg->s_as->a_proc->p_zone) == 0) 621 return (EAGAIN); 622 swresv = seg->s_size; 623 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 624 seg, swresv, 1); 625 } 626 627 /* 628 * Reserve any mapping structures that may be required. 629 * 630 * Don't do it for segments that may use regions. It's currently a 631 * noop in the hat implementations anyway. 632 */ 633 if (!use_rgn) { 634 hat_map(seg->s_as->a_hat, seg->s_base, seg->s_size, HAT_MAP); 635 } 636 637 if (a->cred) { 638 cred = a->cred; 639 crhold(cred); 640 } else { 641 crhold(cred = CRED()); 642 } 643 644 /* Inform the vnode of the new mapping */ 645 if (a->vp != NULL) { 646 error = VOP_ADDMAP(a->vp, a->offset & PAGEMASK, 647 seg->s_as, seg->s_base, seg->s_size, a->prot, 648 a->maxprot, a->type, cred, NULL); 649 if (error) { 650 if (swresv != 0) { 651 anon_unresv_zone(swresv, 652 seg->s_as->a_proc->p_zone); 653 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 654 "anon proc:%p %lu %u", seg, swresv, 0); 655 } 656 crfree(cred); 657 if (!use_rgn) { 658 hat_unload(seg->s_as->a_hat, seg->s_base, 659 seg->s_size, HAT_UNLOAD_UNMAP); 660 } 661 return (error); 662 } 663 /* 664 * svntr_hashtab will be NULL if we support shared regions. 665 */ 666 trok = ((a->flags & MAP_TEXT) && 667 (seg->s_size > textrepl_size_thresh || 668 (a->flags & _MAP_TEXTREPL)) && 669 lgrp_optimizations() && svntr_hashtab != NULL && 670 a->type == MAP_PRIVATE && swresv == 0 && 671 !(a->flags & MAP_NORESERVE) && 672 seg->s_as != &kas && a->vp->v_type == VREG); 673 674 ASSERT(!trok || !use_rgn); 675 } 676 677 /* 678 * MAP_NORESERVE mappings don't count towards the VSZ of a process 679 * until we fault the pages in. 680 */ 681 if ((a->vp == NULL || a->vp->v_type != VREG) && 682 a->flags & MAP_NORESERVE) { 683 seg->s_as->a_resvsize -= seg->s_size; 684 } 685 686 /* 687 * If more than one segment in the address space, and they're adjacent 688 * virtually, try to concatenate them. Don't concatenate if an 689 * explicit anon_map structure was supplied (e.g., SystemV shared 690 * memory) or if we'll use text replication for this segment. 
691 */ 692 if (a->amp == NULL && !use_rgn && !trok) { 693 struct seg *pseg, *nseg; 694 struct segvn_data *psvd, *nsvd; 695 lgrp_mem_policy_t ppolicy, npolicy; 696 uint_t lgrp_mem_policy_flags = 0; 697 extern lgrp_mem_policy_t lgrp_mem_default_policy; 698 699 /* 700 * Memory policy flags (lgrp_mem_policy_flags) is valid when 701 * extending stack/heap segments. 702 */ 703 if ((a->vp == NULL) && (a->type == MAP_PRIVATE) && 704 !(a->flags & MAP_NORESERVE) && (seg->s_as != &kas)) { 705 lgrp_mem_policy_flags = a->lgrp_mem_policy_flags; 706 } else { 707 /* 708 * Get policy when not extending it from another segment 709 */ 710 mpolicy = lgrp_mem_policy_default(seg->s_size, a->type); 711 } 712 713 /* 714 * First, try to concatenate the previous and new segments 715 */ 716 pseg = AS_SEGPREV(seg->s_as, seg); 717 if (pseg != NULL && 718 pseg->s_base + pseg->s_size == seg->s_base && 719 pseg->s_ops == &segvn_ops) { 720 /* 721 * Get memory allocation policy from previous segment. 722 * When extension is specified (e.g. for heap) apply 723 * this policy to the new segment regardless of the 724 * outcome of segment concatenation. Extension occurs 725 * for non-default policy otherwise default policy is 726 * used and is based on extended segment size. 727 */ 728 psvd = (struct segvn_data *)pseg->s_data; 729 ppolicy = psvd->policy_info.mem_policy; 730 if (lgrp_mem_policy_flags == 731 LGRP_MP_FLAG_EXTEND_UP) { 732 if (ppolicy != lgrp_mem_default_policy) { 733 mpolicy = ppolicy; 734 } else { 735 mpolicy = lgrp_mem_policy_default( 736 pseg->s_size + seg->s_size, 737 a->type); 738 } 739 } 740 741 if (mpolicy == ppolicy && 742 (pseg->s_size + seg->s_size <= 743 segvn_comb_thrshld || psvd->amp == NULL) && 744 segvn_extend_prev(pseg, seg, a, swresv) == 0) { 745 /* 746 * success! now try to concatenate 747 * with following seg 748 */ 749 crfree(cred); 750 nseg = AS_SEGNEXT(pseg->s_as, pseg); 751 if (nseg != NULL && 752 nseg != pseg && 753 nseg->s_ops == &segvn_ops && 754 pseg->s_base + pseg->s_size == 755 nseg->s_base) 756 (void) segvn_concat(pseg, nseg, 0); 757 ASSERT(pseg->s_szc == 0 || 758 (a->szc == pseg->s_szc && 759 IS_P2ALIGNED(pseg->s_base, pgsz) && 760 IS_P2ALIGNED(pseg->s_size, pgsz))); 761 return (0); 762 } 763 } 764 765 /* 766 * Failed, so try to concatenate with following seg 767 */ 768 nseg = AS_SEGNEXT(seg->s_as, seg); 769 if (nseg != NULL && 770 seg->s_base + seg->s_size == nseg->s_base && 771 nseg->s_ops == &segvn_ops) { 772 /* 773 * Get memory allocation policy from next segment. 774 * When extension is specified (e.g. for stack) apply 775 * this policy to the new segment regardless of the 776 * outcome of segment concatenation. Extension occurs 777 * for non-default policy otherwise default policy is 778 * used and is based on extended segment size. 
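			 *
			 * For example (an illustrative restatement of the
			 * code below, not additional behavior): a stack
			 * being grown downward arrives here with
			 * LGRP_MP_FLAG_EXTEND_DOWN; if the existing stack
			 * segment has a non-default policy that policy is
			 * adopted, otherwise the default is recomputed for
			 * the combined size, roughly
			 *
			 *	mpolicy = lgrp_mem_policy_default(
			 *	    nseg->s_size + seg->s_size, a->type);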
779 */ 780 nsvd = (struct segvn_data *)nseg->s_data; 781 npolicy = nsvd->policy_info.mem_policy; 782 if (lgrp_mem_policy_flags == 783 LGRP_MP_FLAG_EXTEND_DOWN) { 784 if (npolicy != lgrp_mem_default_policy) { 785 mpolicy = npolicy; 786 } else { 787 mpolicy = lgrp_mem_policy_default( 788 nseg->s_size + seg->s_size, 789 a->type); 790 } 791 } 792 793 if (mpolicy == npolicy && 794 segvn_extend_next(seg, nseg, a, swresv) == 0) { 795 crfree(cred); 796 ASSERT(nseg->s_szc == 0 || 797 (a->szc == nseg->s_szc && 798 IS_P2ALIGNED(nseg->s_base, pgsz) && 799 IS_P2ALIGNED(nseg->s_size, pgsz))); 800 return (0); 801 } 802 } 803 } 804 805 if (a->vp != NULL) { 806 VN_HOLD(a->vp); 807 if (a->type == MAP_SHARED) 808 lgrp_shm_policy_init(NULL, a->vp); 809 } 810 svd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 811 812 seg->s_ops = &segvn_ops; 813 seg->s_data = (void *)svd; 814 seg->s_szc = a->szc; 815 816 svd->seg = seg; 817 svd->vp = a->vp; 818 /* 819 * Anonymous mappings have no backing file so the offset is meaningless. 820 */ 821 svd->offset = a->vp ? (a->offset & PAGEMASK) : 0; 822 svd->prot = a->prot; 823 svd->maxprot = a->maxprot; 824 svd->pageprot = 0; 825 svd->type = a->type; 826 svd->vpage = NULL; 827 svd->cred = cred; 828 svd->advice = MADV_NORMAL; 829 svd->pageadvice = 0; 830 svd->flags = (ushort_t)a->flags; 831 svd->softlockcnt = 0; 832 svd->softlockcnt_sbase = 0; 833 svd->softlockcnt_send = 0; 834 svd->svn_inz = 0; 835 svd->rcookie = HAT_INVALID_REGION_COOKIE; 836 svd->pageswap = 0; 837 838 if (a->szc != 0 && a->vp != NULL) { 839 segvn_setvnode_mpss(a->vp); 840 } 841 if (svd->type == MAP_SHARED && svd->vp != NULL && 842 (svd->vp->v_flag & VVMEXEC) && (svd->prot & PROT_WRITE)) { 843 ASSERT(vn_is_mapped(svd->vp, V_WRITE)); 844 segvn_inval_trcache(svd->vp); 845 } 846 847 amp = a->amp; 848 if ((svd->amp = amp) == NULL) { 849 svd->anon_index = 0; 850 if (svd->type == MAP_SHARED) { 851 svd->swresv = 0; 852 /* 853 * Shared mappings to a vp need no other setup. 854 * If we have a shared mapping to an anon_map object 855 * which hasn't been allocated yet, allocate the 856 * struct now so that it will be properly shared 857 * by remembering the swap reservation there. 858 */ 859 if (a->vp == NULL) { 860 svd->amp = anonmap_alloc(seg->s_size, swresv, 861 ANON_SLEEP); 862 svd->amp->a_szc = seg->s_szc; 863 } 864 } else { 865 /* 866 * Private mapping (with or without a vp). 867 * Allocate anon_map when needed. 868 */ 869 svd->swresv = swresv; 870 } 871 } else { 872 pgcnt_t anon_num; 873 874 /* 875 * Mapping to an existing anon_map structure without a vp. 876 * For now we will insure that the segment size isn't larger 877 * than the size - offset gives us. Later on we may wish to 878 * have the anon array dynamically allocated itself so that 879 * we don't always have to allocate all the anon pointer slots. 880 * This of course involves adding extra code to check that we 881 * aren't trying to use an anon pointer slot beyond the end 882 * of the currently allocated anon array. 883 */ 884 if ((amp->size - a->offset) < seg->s_size) { 885 panic("segvn_create anon_map size"); 886 /*NOTREACHED*/ 887 } 888 889 anon_num = btopr(a->offset); 890 891 if (a->type == MAP_SHARED) { 892 /* 893 * SHARED mapping to a given anon_map. 894 */ 895 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 896 amp->refcnt++; 897 if (a->szc > amp->a_szc) { 898 amp->a_szc = a->szc; 899 } 900 ANON_LOCK_EXIT(&->a_rwlock); 901 svd->anon_index = anon_num; 902 svd->swresv = 0; 903 } else { 904 /* 905 * PRIVATE mapping to a given anon_map. 
906 * Make sure that all the needed anon 907 * structures are created (so that we will 908 * share the underlying pages if nothing 909 * is written by this mapping) and then 910 * duplicate the anon array as is done 911 * when a privately mapped segment is dup'ed. 912 */ 913 struct anon *ap; 914 caddr_t addr; 915 caddr_t eaddr; 916 ulong_t anon_idx; 917 int hat_flag = HAT_LOAD; 918 919 if (svd->flags & MAP_TEXT) { 920 hat_flag |= HAT_LOAD_TEXT; 921 } 922 923 svd->amp = anonmap_alloc(seg->s_size, 0, ANON_SLEEP); 924 svd->amp->a_szc = seg->s_szc; 925 svd->anon_index = 0; 926 svd->swresv = swresv; 927 928 /* 929 * Prevent 2 threads from allocating anon 930 * slots simultaneously. 931 */ 932 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 933 eaddr = seg->s_base + seg->s_size; 934 935 for (anon_idx = anon_num, addr = seg->s_base; 936 addr < eaddr; addr += PAGESIZE, anon_idx++) { 937 page_t *pp; 938 939 if ((ap = anon_get_ptr(amp->ahp, 940 anon_idx)) != NULL) 941 continue; 942 943 /* 944 * Allocate the anon struct now. 945 * Might as well load up translation 946 * to the page while we're at it... 947 */ 948 pp = anon_zero(seg, addr, &ap, cred); 949 if (ap == NULL || pp == NULL) { 950 panic("segvn_create anon_zero"); 951 /*NOTREACHED*/ 952 } 953 954 /* 955 * Re-acquire the anon_map lock and 956 * initialize the anon array entry. 957 */ 958 ASSERT(anon_get_ptr(amp->ahp, 959 anon_idx) == NULL); 960 (void) anon_set_ptr(amp->ahp, anon_idx, ap, 961 ANON_SLEEP); 962 963 ASSERT(seg->s_szc == 0); 964 ASSERT(!IS_VMODSORT(pp->p_vnode)); 965 966 ASSERT(use_rgn == 0); 967 hat_memload(seg->s_as->a_hat, addr, pp, 968 svd->prot & ~PROT_WRITE, hat_flag); 969 970 page_unlock(pp); 971 } 972 ASSERT(seg->s_szc == 0); 973 anon_dup(amp->ahp, anon_num, svd->amp->ahp, 974 0, seg->s_size); 975 ANON_LOCK_EXIT(&->a_rwlock); 976 } 977 } 978 979 /* 980 * Set default memory allocation policy for segment 981 * 982 * Always set policy for private memory at least for initialization 983 * even if this is a shared memory segment 984 */ 985 (void) lgrp_privm_policy_set(mpolicy, &svd->policy_info, seg->s_size); 986 987 if (svd->type == MAP_SHARED) 988 (void) lgrp_shm_policy_set(mpolicy, svd->amp, svd->anon_index, 989 svd->vp, svd->offset, seg->s_size); 990 991 if (use_rgn) { 992 ASSERT(!trok); 993 ASSERT(svd->amp == NULL); 994 svd->rcookie = hat_join_region(seg->s_as->a_hat, seg->s_base, 995 seg->s_size, (void *)svd->vp, svd->offset, svd->prot, 996 (uchar_t)seg->s_szc, segvn_hat_rgn_unload_callback, 997 HAT_REGION_TEXT); 998 } 999 1000 ASSERT(!trok || !(svd->prot & PROT_WRITE)); 1001 svd->tr_state = trok ? SEGVN_TR_INIT : SEGVN_TR_OFF; 1002 1003 return (0); 1004 } 1005 1006 /* 1007 * Concatenate two existing segments, if possible. 1008 * Return 0 on success, -1 if two segments are not compatible 1009 * or -2 on memory allocation failure. 
1010 * If amp_cat == 1 then try and concat segments with anon maps 1011 */ 1012 static int 1013 segvn_concat(struct seg *seg1, struct seg *seg2, int amp_cat) 1014 { 1015 struct segvn_data *svd1 = seg1->s_data; 1016 struct segvn_data *svd2 = seg2->s_data; 1017 struct anon_map *amp1 = svd1->amp; 1018 struct anon_map *amp2 = svd2->amp; 1019 struct vpage *vpage1 = svd1->vpage; 1020 struct vpage *vpage2 = svd2->vpage, *nvpage = NULL; 1021 size_t size, nvpsize; 1022 pgcnt_t npages1, npages2; 1023 1024 ASSERT(seg1->s_as && seg2->s_as && seg1->s_as == seg2->s_as); 1025 ASSERT(AS_WRITE_HELD(seg1->s_as)); 1026 ASSERT(seg1->s_ops == seg2->s_ops); 1027 1028 if (HAT_IS_REGION_COOKIE_VALID(svd1->rcookie) || 1029 HAT_IS_REGION_COOKIE_VALID(svd2->rcookie)) { 1030 return (-1); 1031 } 1032 1033 /* both segments exist, try to merge them */ 1034 #define incompat(x) (svd1->x != svd2->x) 1035 if (incompat(vp) || incompat(maxprot) || 1036 (!svd1->pageadvice && !svd2->pageadvice && incompat(advice)) || 1037 (!svd1->pageprot && !svd2->pageprot && incompat(prot)) || 1038 incompat(type) || incompat(cred) || incompat(flags) || 1039 seg1->s_szc != seg2->s_szc || incompat(policy_info.mem_policy) || 1040 (svd2->softlockcnt > 0) || svd1->softlockcnt_send > 0) 1041 return (-1); 1042 #undef incompat 1043 1044 /* 1045 * vp == NULL implies zfod, offset doesn't matter 1046 */ 1047 if (svd1->vp != NULL && 1048 svd1->offset + seg1->s_size != svd2->offset) { 1049 return (-1); 1050 } 1051 1052 /* 1053 * Don't concatenate if either segment uses text replication. 1054 */ 1055 if (svd1->tr_state != SEGVN_TR_OFF || svd2->tr_state != SEGVN_TR_OFF) { 1056 return (-1); 1057 } 1058 1059 /* 1060 * Fail early if we're not supposed to concatenate 1061 * segments with non NULL amp. 1062 */ 1063 if (amp_cat == 0 && (amp1 != NULL || amp2 != NULL)) { 1064 return (-1); 1065 } 1066 1067 if (svd1->vp == NULL && svd1->type == MAP_SHARED) { 1068 if (amp1 != amp2) { 1069 return (-1); 1070 } 1071 if (amp1 != NULL && svd1->anon_index + btop(seg1->s_size) != 1072 svd2->anon_index) { 1073 return (-1); 1074 } 1075 ASSERT(amp1 == NULL || amp1->refcnt >= 2); 1076 } 1077 1078 /* 1079 * If either seg has vpages, create a new merged vpage array. 
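	 *
	 * (Illustrative sizing note, not additional behavior: the merged
	 * array needs one struct vpage per page of the combined range,
	 * e.g. with 4K pages an 8K seg1 plus a 16K seg2 needs
	 * vpgtob(2 + 4), i.e. 6 * sizeof (struct vpage) bytes.)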
	 */
	if (vpage1 != NULL || vpage2 != NULL) {
		struct vpage *vp, *evp;

		npages1 = seg_pages(seg1);
		npages2 = seg_pages(seg2);
		nvpsize = vpgtob(npages1 + npages2);

		if ((nvpage = kmem_zalloc(nvpsize, KM_NOSLEEP)) == NULL) {
			return (-2);
		}

		if (vpage1 != NULL) {
			bcopy(vpage1, nvpage, vpgtob(npages1));
		} else {
			evp = nvpage + npages1;
			for (vp = nvpage; vp < evp; vp++) {
				VPP_SETPROT(vp, svd1->prot);
				VPP_SETADVICE(vp, svd1->advice);
			}
		}

		if (vpage2 != NULL) {
			bcopy(vpage2, nvpage + npages1, vpgtob(npages2));
		} else {
			evp = nvpage + npages1 + npages2;
			for (vp = nvpage + npages1; vp < evp; vp++) {
				VPP_SETPROT(vp, svd2->prot);
				VPP_SETADVICE(vp, svd2->advice);
			}
		}

		if (svd2->pageswap && (!svd1->pageswap && svd1->swresv)) {
			ASSERT(svd1->swresv == seg1->s_size);
			ASSERT(!(svd1->flags & MAP_NORESERVE));
			ASSERT(!(svd2->flags & MAP_NORESERVE));
			evp = nvpage + npages1;
			for (vp = nvpage; vp < evp; vp++) {
				VPP_SETSWAPRES(vp);
			}
		}

		if (svd1->pageswap && (!svd2->pageswap && svd2->swresv)) {
			ASSERT(svd2->swresv == seg2->s_size);
			ASSERT(!(svd1->flags & MAP_NORESERVE));
			ASSERT(!(svd2->flags & MAP_NORESERVE));
			vp = nvpage + npages1;
			evp = vp + npages2;
			for (; vp < evp; vp++) {
				VPP_SETSWAPRES(vp);
			}
		}
	}
	ASSERT((vpage1 != NULL || vpage2 != NULL) ||
	    (svd1->pageswap == 0 && svd2->pageswap == 0));

	/*
	 * If either segment has private pages, create a new merged anon
	 * array. If merging shared anon segments just decrement anon map's
	 * refcnt.
	 */
	if (amp1 != NULL && svd1->type == MAP_SHARED) {
		ASSERT(amp1 == amp2 && svd1->vp == NULL);
		ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
		ASSERT(amp1->refcnt >= 2);
		amp1->refcnt--;
		ANON_LOCK_EXIT(&amp1->a_rwlock);
		svd2->amp = NULL;
	} else if (amp1 != NULL || amp2 != NULL) {
		struct anon_hdr *nahp;
		struct anon_map *namp = NULL;
		size_t asize;

		ASSERT(svd1->type == MAP_PRIVATE);

		asize = seg1->s_size + seg2->s_size;
		if ((nahp = anon_create(btop(asize), ANON_NOSLEEP)) == NULL) {
			if (nvpage != NULL) {
				kmem_free(nvpage, nvpsize);
			}
			return (-2);
		}
		if (amp1 != NULL) {
			/*
			 * XXX anon rwlock is not really needed because
			 * this is a private segment and we are writers.
			 */
			ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
			ASSERT(amp1->refcnt == 1);
			if (anon_copy_ptr(amp1->ahp, svd1->anon_index,
			    nahp, 0, btop(seg1->s_size), ANON_NOSLEEP)) {
				anon_release(nahp, btop(asize));
				ANON_LOCK_EXIT(&amp1->a_rwlock);
				if (nvpage != NULL) {
					kmem_free(nvpage, nvpsize);
				}
				return (-2);
			}
		}
		if (amp2 != NULL) {
			ANON_LOCK_ENTER(&amp2->a_rwlock, RW_WRITER);
			ASSERT(amp2->refcnt == 1);
			if (anon_copy_ptr(amp2->ahp, svd2->anon_index,
			    nahp, btop(seg1->s_size), btop(seg2->s_size),
			    ANON_NOSLEEP)) {
				anon_release(nahp, btop(asize));
				ANON_LOCK_EXIT(&amp2->a_rwlock);
				if (amp1 != NULL) {
					ANON_LOCK_EXIT(&amp1->a_rwlock);
				}
				if (nvpage != NULL) {
					kmem_free(nvpage, nvpsize);
				}
				return (-2);
			}
		}
		if (amp1 != NULL) {
			namp = amp1;
			anon_release(amp1->ahp, btop(amp1->size));
		}
		if (amp2 != NULL) {
			if (namp == NULL) {
				ASSERT(amp1 == NULL);
				namp = amp2;
				anon_release(amp2->ahp, btop(amp2->size));
			} else {
				amp2->refcnt--;
				ANON_LOCK_EXIT(&amp2->a_rwlock);
				anonmap_free(amp2);
			}
			svd2->amp = NULL;	/* needed for seg_free */
		}
		namp->ahp = nahp;
		namp->size = asize;
		svd1->amp = namp;
		svd1->anon_index = 0;
		ANON_LOCK_EXIT(&namp->a_rwlock);
	}
	/*
	 * Now free the old vpage structures.
	 */
	if (nvpage != NULL) {
		if (vpage1 != NULL) {
			kmem_free(vpage1, vpgtob(npages1));
		}
		if (vpage2 != NULL) {
			svd2->vpage = NULL;
			kmem_free(vpage2, vpgtob(npages2));
		}
		if (svd2->pageprot) {
			svd1->pageprot = 1;
		}
		if (svd2->pageadvice) {
			svd1->pageadvice = 1;
		}
		if (svd2->pageswap) {
			svd1->pageswap = 1;
		}
		svd1->vpage = nvpage;
	}

	/* all looks ok, merge segments */
	svd1->swresv += svd2->swresv;
	svd2->swresv = 0;	/* so seg_free doesn't release swap space */
	size = seg2->s_size;
	seg_free(seg2);
	seg1->s_size += size;
	return (0);
}

/*
 * Extend the previous segment (seg1) to include the
 * new segment (seg2 + a), if possible.
 * Return 0 on success.
 */
static int
segvn_extend_prev(struct seg *seg1, struct seg *seg2, struct segvn_crargs *a,
    size_t swresv)
{
	struct segvn_data *svd1 = (struct segvn_data *)seg1->s_data;
	size_t size;
	struct anon_map *amp1;
	struct vpage *new_vpage;

	/*
	 * We don't need any segment level locks for "segvn" data
	 * since the address space is "write" locked.
	 */
	ASSERT(seg1->s_as && AS_WRITE_HELD(seg1->s_as));

	if (HAT_IS_REGION_COOKIE_VALID(svd1->rcookie)) {
		return (-1);
	}

	/* second segment is new, try to extend first */
	/* XXX - should also check cred */
	if (svd1->vp != a->vp || svd1->maxprot != a->maxprot ||
	    (!svd1->pageprot && (svd1->prot != a->prot)) ||
	    svd1->type != a->type || svd1->flags != a->flags ||
	    seg1->s_szc != a->szc || svd1->softlockcnt_send > 0)
		return (-1);

	/* vp == NULL implies zfod, offset doesn't matter */
	if (svd1->vp != NULL &&
	    svd1->offset + seg1->s_size != (a->offset & PAGEMASK))
		return (-1);

	if (svd1->tr_state != SEGVN_TR_OFF) {
		return (-1);
	}

	amp1 = svd1->amp;
	if (amp1) {
		pgcnt_t newpgs;

		/*
		 * Segment has private pages, can data structures
		 * be expanded?
		 *
		 * Acquire the anon_map lock to prevent it from changing,
		 * if it is shared. This ensures that the anon_map
		 * will not change while a thread which has a read/write
		 * lock on an address space references it.
		 * XXX - Don't need the anon_map lock at all if "refcnt"
		 * is 1.
		 *
		 * Can't grow a MAP_SHARED segment with an anonmap because
		 * there may be existing anon slots where we want to extend
		 * the segment and we wouldn't know what to do with them
		 * (e.g., for tmpfs right thing is to just leave them there,
		 * for /dev/zero they should be cleared out).
		 */
		if (svd1->type == MAP_SHARED)
			return (-1);

		ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
		if (amp1->refcnt > 1) {
			ANON_LOCK_EXIT(&amp1->a_rwlock);
			return (-1);
		}
		newpgs = anon_grow(amp1->ahp, &svd1->anon_index,
		    btop(seg1->s_size), btop(seg2->s_size), ANON_NOSLEEP);

		if (newpgs == 0) {
			ANON_LOCK_EXIT(&amp1->a_rwlock);
			return (-1);
		}
		amp1->size = ptob(newpgs);
		ANON_LOCK_EXIT(&amp1->a_rwlock);
	}
	if (svd1->vpage != NULL) {
		struct vpage *vp, *evp;
		new_vpage =
		    kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)),
		    KM_NOSLEEP);
		if (new_vpage == NULL)
			return (-1);
		bcopy(svd1->vpage, new_vpage, vpgtob(seg_pages(seg1)));
		kmem_free(svd1->vpage, vpgtob(seg_pages(seg1)));
		svd1->vpage = new_vpage;

		vp = new_vpage + seg_pages(seg1);
		evp = vp + seg_pages(seg2);
		for (; vp < evp; vp++)
			VPP_SETPROT(vp, a->prot);
		if (svd1->pageswap && swresv) {
			ASSERT(!(svd1->flags & MAP_NORESERVE));
			ASSERT(swresv == seg2->s_size);
			vp = new_vpage + seg_pages(seg1);
			for (; vp < evp; vp++) {
				VPP_SETSWAPRES(vp);
			}
		}
	}
	ASSERT(svd1->vpage != NULL || svd1->pageswap == 0);
	size = seg2->s_size;
	seg_free(seg2);
	seg1->s_size += size;
	svd1->swresv += swresv;
	if (svd1->pageprot && (a->prot & PROT_WRITE) &&
	    svd1->type == MAP_SHARED && svd1->vp != NULL &&
	    (svd1->vp->v_flag & VVMEXEC)) {
		ASSERT(vn_is_mapped(svd1->vp, V_WRITE));
		segvn_inval_trcache(svd1->vp);
	}
	return (0);
}

/*
 * Extend the next segment (seg2) to include the
 * new segment (seg1 + a), if possible.
 * Return 0 on success.
 */
static int
segvn_extend_next(
	struct seg *seg1,
	struct seg *seg2,
	struct segvn_crargs *a,
	size_t swresv)
{
	struct segvn_data *svd2 = (struct segvn_data *)seg2->s_data;
	size_t size;
	struct anon_map *amp2;
	struct vpage *new_vpage;

	/*
	 * We don't need any segment level locks for "segvn" data
	 * since the address space is "write" locked.
	 */
	ASSERT(seg2->s_as && AS_WRITE_HELD(seg2->s_as));

	if (HAT_IS_REGION_COOKIE_VALID(svd2->rcookie)) {
		return (-1);
	}

	/* first segment is new, try to extend second */
	/* XXX - should also check cred */
	if (svd2->vp != a->vp || svd2->maxprot != a->maxprot ||
	    (!svd2->pageprot && (svd2->prot != a->prot)) ||
	    svd2->type != a->type || svd2->flags != a->flags ||
	    seg2->s_szc != a->szc || svd2->softlockcnt_sbase > 0)
		return (-1);
	/* vp == NULL implies zfod, offset doesn't matter */
	if (svd2->vp != NULL &&
	    (a->offset & PAGEMASK) + seg1->s_size != svd2->offset)
		return (-1);

	if (svd2->tr_state != SEGVN_TR_OFF) {
		return (-1);
	}

	amp2 = svd2->amp;
	if (amp2) {
		pgcnt_t newpgs;

		/*
		 * Segment has private pages, can data structures
		 * be expanded?
		 *
		 * Acquire the anon_map lock to prevent it from changing,
		 * if it is shared. This ensures that the anon_map
		 * will not change while a thread which has a read/write
		 * lock on an address space references it.
		 *
		 * XXX - Don't need the anon_map lock at all if "refcnt"
		 * is 1.
		 */
		if (svd2->type == MAP_SHARED)
			return (-1);

		ANON_LOCK_ENTER(&amp2->a_rwlock, RW_WRITER);
		if (amp2->refcnt > 1) {
			ANON_LOCK_EXIT(&amp2->a_rwlock);
			return (-1);
		}
		newpgs = anon_grow(amp2->ahp, &svd2->anon_index,
		    btop(seg2->s_size), btop(seg1->s_size),
		    ANON_NOSLEEP | ANON_GROWDOWN);

		if (newpgs == 0) {
			ANON_LOCK_EXIT(&amp2->a_rwlock);
			return (-1);
		}
		amp2->size = ptob(newpgs);
		ANON_LOCK_EXIT(&amp2->a_rwlock);
	}
	if (svd2->vpage != NULL) {
		struct vpage *vp, *evp;
		new_vpage =
		    kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)),
		    KM_NOSLEEP);
		if (new_vpage == NULL) {
			/* Not merging segments so adjust anon_index back */
			if (amp2)
				svd2->anon_index += seg_pages(seg1);
			return (-1);
		}
		bcopy(svd2->vpage, new_vpage + seg_pages(seg1),
		    vpgtob(seg_pages(seg2)));
		kmem_free(svd2->vpage, vpgtob(seg_pages(seg2)));
		svd2->vpage = new_vpage;

		vp = new_vpage;
		evp = vp + seg_pages(seg1);
		for (; vp < evp; vp++)
			VPP_SETPROT(vp, a->prot);
		if (svd2->pageswap && swresv) {
			ASSERT(!(svd2->flags & MAP_NORESERVE));
			ASSERT(swresv == seg1->s_size);
			vp = new_vpage;
			for (; vp < evp; vp++) {
				VPP_SETSWAPRES(vp);
			}
		}
	}
	ASSERT(svd2->vpage != NULL || svd2->pageswap == 0);
	size = seg1->s_size;
	seg_free(seg1);
	seg2->s_size += size;
	seg2->s_base -= size;
	svd2->offset -= size;
	svd2->swresv += swresv;
	if (svd2->pageprot && (a->prot & PROT_WRITE) &&
	    svd2->type == MAP_SHARED && svd2->vp != NULL &&
	    (svd2->vp->v_flag & VVMEXEC)) {
		ASSERT(vn_is_mapped(svd2->vp, V_WRITE));
		segvn_inval_trcache(svd2->vp);
	}
	return (0);
}

/*
 * Duplicate all the pages in the segment. This may break COW sharing for a
 * given page. If the page is marked with inherit zero set, then instead of
 * duplicating the page, we zero the page.
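 *
 * A sketch of the per-page decision made in the loop below (an
 * illustrative restatement, not additional behavior):
 *
 *	if (svd->svn_inz == SEGVN_INZ_ALL ||
 *	    (svd->svn_inz == SEGVN_INZ_VPP && VPP_ISINHZERO(vpp)))
 *		pp = anon_zero(newseg, addr, &newap, newsvd->cred);
 *	else
 *		pp = anon_private(...);	following anon_getpage() of
 *					the parent's page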
1497 */ 1498 static int 1499 segvn_dup_pages(struct seg *seg, struct seg *newseg) 1500 { 1501 int error; 1502 uint_t prot; 1503 page_t *pp; 1504 struct anon *ap, *newap; 1505 size_t i; 1506 caddr_t addr; 1507 1508 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1509 struct segvn_data *newsvd = (struct segvn_data *)newseg->s_data; 1510 ulong_t old_idx = svd->anon_index; 1511 ulong_t new_idx = 0; 1512 1513 i = btopr(seg->s_size); 1514 addr = seg->s_base; 1515 1516 /* 1517 * XXX break cow sharing using PAGESIZE 1518 * pages. They will be relocated into larger 1519 * pages at fault time. 1520 */ 1521 while (i-- > 0) { 1522 if ((ap = anon_get_ptr(svd->amp->ahp, old_idx)) != NULL) { 1523 struct vpage *vpp; 1524 1525 vpp = &svd->vpage[seg_page(seg, addr)]; 1526 1527 /* 1528 * prot need not be computed below 'cause anon_private 1529 * is going to ignore it anyway as child doesn't inherit 1530 * pagelock from parent. 1531 */ 1532 prot = svd->pageprot ? VPP_PROT(vpp) : svd->prot; 1533 1534 /* 1535 * Check whether we should zero this or dup it. 1536 */ 1537 if (svd->svn_inz == SEGVN_INZ_ALL || 1538 (svd->svn_inz == SEGVN_INZ_VPP && 1539 VPP_ISINHZERO(vpp))) { 1540 pp = anon_zero(newseg, addr, &newap, 1541 newsvd->cred); 1542 } else { 1543 page_t *anon_pl[1+1]; 1544 uint_t vpprot; 1545 error = anon_getpage(&ap, &vpprot, anon_pl, 1546 PAGESIZE, seg, addr, S_READ, svd->cred); 1547 if (error != 0) 1548 return (error); 1549 1550 pp = anon_private(&newap, newseg, addr, prot, 1551 anon_pl[0], 0, newsvd->cred); 1552 } 1553 if (pp == NULL) { 1554 return (ENOMEM); 1555 } 1556 (void) anon_set_ptr(newsvd->amp->ahp, new_idx, newap, 1557 ANON_SLEEP); 1558 page_unlock(pp); 1559 } 1560 addr += PAGESIZE; 1561 old_idx++; 1562 new_idx++; 1563 } 1564 1565 return (0); 1566 } 1567 1568 static int 1569 segvn_dup(struct seg *seg, struct seg *newseg) 1570 { 1571 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1572 struct segvn_data *newsvd; 1573 pgcnt_t npages = seg_pages(seg); 1574 int error = 0; 1575 size_t len; 1576 struct anon_map *amp; 1577 1578 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as)); 1579 ASSERT(newseg->s_as->a_proc->p_parent == curproc); 1580 1581 /* 1582 * If segment has anon reserved, reserve more for the new seg. 1583 * For a MAP_NORESERVE segment swresv will be a count of all the 1584 * allocated anon slots; thus we reserve for the child as many slots 1585 * as the parent has allocated. This semantic prevents the child or 1586 * parent from dieing during a copy-on-write fault caused by trying 1587 * to write a shared pre-existing anon page. 
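	 *
	 * (Illustrative restatement of the code just below: the child
	 * simply repeats the parent's reservation and the dup fails if
	 * that reservation cannot be obtained.)
	 *
	 *	if ((len = svd->swresv) != 0) {
	 *		if (anon_resv(svd->swresv) == 0)
	 *			return (ENOMEM);
	 *	}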
1588 */ 1589 if ((len = svd->swresv) != 0) { 1590 if (anon_resv(svd->swresv) == 0) 1591 return (ENOMEM); 1592 1593 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 1594 seg, len, 0); 1595 } 1596 1597 newsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 1598 1599 newseg->s_ops = &segvn_ops; 1600 newseg->s_data = (void *)newsvd; 1601 newseg->s_szc = seg->s_szc; 1602 1603 newsvd->seg = newseg; 1604 if ((newsvd->vp = svd->vp) != NULL) { 1605 VN_HOLD(svd->vp); 1606 if (svd->type == MAP_SHARED) 1607 lgrp_shm_policy_init(NULL, svd->vp); 1608 } 1609 newsvd->offset = svd->offset; 1610 newsvd->prot = svd->prot; 1611 newsvd->maxprot = svd->maxprot; 1612 newsvd->pageprot = svd->pageprot; 1613 newsvd->type = svd->type; 1614 newsvd->cred = svd->cred; 1615 crhold(newsvd->cred); 1616 newsvd->advice = svd->advice; 1617 newsvd->pageadvice = svd->pageadvice; 1618 newsvd->svn_inz = svd->svn_inz; 1619 newsvd->swresv = svd->swresv; 1620 newsvd->pageswap = svd->pageswap; 1621 newsvd->flags = svd->flags; 1622 newsvd->softlockcnt = 0; 1623 newsvd->softlockcnt_sbase = 0; 1624 newsvd->softlockcnt_send = 0; 1625 newsvd->policy_info = svd->policy_info; 1626 newsvd->rcookie = HAT_INVALID_REGION_COOKIE; 1627 1628 if ((amp = svd->amp) == NULL || svd->tr_state == SEGVN_TR_ON) { 1629 /* 1630 * Not attaching to a shared anon object. 1631 */ 1632 ASSERT(!HAT_IS_REGION_COOKIE_VALID(svd->rcookie) || 1633 svd->tr_state == SEGVN_TR_OFF); 1634 if (svd->tr_state == SEGVN_TR_ON) { 1635 ASSERT(newsvd->vp != NULL && amp != NULL); 1636 newsvd->tr_state = SEGVN_TR_INIT; 1637 } else { 1638 newsvd->tr_state = svd->tr_state; 1639 } 1640 newsvd->amp = NULL; 1641 newsvd->anon_index = 0; 1642 } else { 1643 /* regions for now are only used on pure vnode segments */ 1644 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 1645 ASSERT(svd->tr_state == SEGVN_TR_OFF); 1646 newsvd->tr_state = SEGVN_TR_OFF; 1647 if (svd->type == MAP_SHARED) { 1648 ASSERT(svd->svn_inz == SEGVN_INZ_NONE); 1649 newsvd->amp = amp; 1650 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1651 amp->refcnt++; 1652 ANON_LOCK_EXIT(&->a_rwlock); 1653 newsvd->anon_index = svd->anon_index; 1654 } else { 1655 int reclaim = 1; 1656 1657 /* 1658 * Allocate and initialize new anon_map structure. 1659 */ 1660 newsvd->amp = anonmap_alloc(newseg->s_size, 0, 1661 ANON_SLEEP); 1662 newsvd->amp->a_szc = newseg->s_szc; 1663 newsvd->anon_index = 0; 1664 ASSERT(svd->svn_inz == SEGVN_INZ_NONE || 1665 svd->svn_inz == SEGVN_INZ_ALL || 1666 svd->svn_inz == SEGVN_INZ_VPP); 1667 1668 /* 1669 * We don't have to acquire the anon_map lock 1670 * for the new segment (since it belongs to an 1671 * address space that is still not associated 1672 * with any process), or the segment in the old 1673 * address space (since all threads in it 1674 * are stopped while duplicating the address space). 1675 */ 1676 1677 /* 1678 * The goal of the following code is to make sure that 1679 * softlocked pages do not end up as copy on write 1680 * pages. This would cause problems where one 1681 * thread writes to a page that is COW and a different 1682 * thread in the same process has softlocked it. The 1683 * softlock lock would move away from this process 1684 * because the write would cause this process to get 1685 * a copy (without the softlock). 1686 * 1687 * The strategy here is to just break the 1688 * sharing on pages that could possibly be 1689 * softlocked. 
1690 * 1691 * In addition, if any pages have been marked that they 1692 * should be inherited as zero, then we immediately go 1693 * ahead and break COW and zero them. In the case of a 1694 * softlocked page that should be inherited zero, we 1695 * break COW and just get a zero page. 1696 */ 1697 retry: 1698 if (svd->softlockcnt || 1699 svd->svn_inz != SEGVN_INZ_NONE) { 1700 /* 1701 * The softlock count might be non zero 1702 * because some pages are still stuck in the 1703 * cache for lazy reclaim. Flush the cache 1704 * now. This should drop the count to zero. 1705 * [or there is really I/O going on to these 1706 * pages]. Note, we have the writers lock so 1707 * nothing gets inserted during the flush. 1708 */ 1709 if (svd->softlockcnt && reclaim == 1) { 1710 segvn_purge(seg); 1711 reclaim = 0; 1712 goto retry; 1713 } 1714 1715 error = segvn_dup_pages(seg, newseg); 1716 if (error != 0) { 1717 newsvd->vpage = NULL; 1718 goto out; 1719 } 1720 } else { /* common case */ 1721 if (seg->s_szc != 0) { 1722 /* 1723 * If at least one of anon slots of a 1724 * large page exists then make sure 1725 * all anon slots of a large page 1726 * exist to avoid partial cow sharing 1727 * of a large page in the future. 1728 */ 1729 anon_dup_fill_holes(amp->ahp, 1730 svd->anon_index, newsvd->amp->ahp, 1731 0, seg->s_size, seg->s_szc, 1732 svd->vp != NULL); 1733 } else { 1734 anon_dup(amp->ahp, svd->anon_index, 1735 newsvd->amp->ahp, 0, seg->s_size); 1736 } 1737 1738 hat_clrattr(seg->s_as->a_hat, seg->s_base, 1739 seg->s_size, PROT_WRITE); 1740 } 1741 } 1742 } 1743 /* 1744 * If necessary, create a vpage structure for the new segment. 1745 * Do not copy any page lock indications. 1746 */ 1747 if (svd->vpage != NULL) { 1748 uint_t i; 1749 struct vpage *ovp = svd->vpage; 1750 struct vpage *nvp; 1751 1752 nvp = newsvd->vpage = 1753 kmem_alloc(vpgtob(npages), KM_SLEEP); 1754 for (i = 0; i < npages; i++) { 1755 *nvp = *ovp++; 1756 VPP_CLRPPLOCK(nvp++); 1757 } 1758 } else 1759 newsvd->vpage = NULL; 1760 1761 /* Inform the vnode of the new mapping */ 1762 if (newsvd->vp != NULL) { 1763 error = VOP_ADDMAP(newsvd->vp, (offset_t)newsvd->offset, 1764 newseg->s_as, newseg->s_base, newseg->s_size, newsvd->prot, 1765 newsvd->maxprot, newsvd->type, newsvd->cred, NULL); 1766 } 1767 out: 1768 if (error == 0 && HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) { 1769 ASSERT(newsvd->amp == NULL); 1770 ASSERT(newsvd->tr_state == SEGVN_TR_OFF); 1771 newsvd->rcookie = svd->rcookie; 1772 hat_dup_region(newseg->s_as->a_hat, newsvd->rcookie); 1773 } 1774 return (error); 1775 } 1776 1777 1778 /* 1779 * callback function to invoke free_vp_pages() for only those pages actually 1780 * processed by the HAT when a shared region is destroyed. 
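 *
 * (Illustrative example of the offset arithmetic done below: if saddr
 * lies two pages past r_saddr and the region's object offset r_objoff
 * is 0x20000, pages are freed starting at vnode offset
 * 0x20000 + 2 * PAGESIZE, for eaddr - saddr bytes.)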
1781 */ 1782 extern int free_pages; 1783 1784 static void 1785 segvn_hat_rgn_unload_callback(caddr_t saddr, caddr_t eaddr, caddr_t r_saddr, 1786 size_t r_size, void *r_obj, u_offset_t r_objoff) 1787 { 1788 u_offset_t off; 1789 size_t len; 1790 vnode_t *vp = (vnode_t *)r_obj; 1791 1792 ASSERT(eaddr > saddr); 1793 ASSERT(saddr >= r_saddr); 1794 ASSERT(saddr < r_saddr + r_size); 1795 ASSERT(eaddr > r_saddr); 1796 ASSERT(eaddr <= r_saddr + r_size); 1797 ASSERT(vp != NULL); 1798 1799 if (!free_pages) { 1800 return; 1801 } 1802 1803 len = eaddr - saddr; 1804 off = (saddr - r_saddr) + r_objoff; 1805 free_vp_pages(vp, off, len); 1806 } 1807 1808 /* 1809 * callback function used by segvn_unmap to invoke free_vp_pages() for only 1810 * those pages actually processed by the HAT 1811 */ 1812 static void 1813 segvn_hat_unload_callback(hat_callback_t *cb) 1814 { 1815 struct seg *seg = cb->hcb_data; 1816 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1817 size_t len; 1818 u_offset_t off; 1819 1820 ASSERT(svd->vp != NULL); 1821 ASSERT(cb->hcb_end_addr > cb->hcb_start_addr); 1822 ASSERT(cb->hcb_start_addr >= seg->s_base); 1823 1824 len = cb->hcb_end_addr - cb->hcb_start_addr; 1825 off = cb->hcb_start_addr - seg->s_base; 1826 free_vp_pages(svd->vp, svd->offset + off, len); 1827 } 1828 1829 /* 1830 * This function determines the number of bytes of swap reserved by 1831 * a segment for which per-page accounting is present. It is used to 1832 * calculate the correct value of a segvn_data's swresv. 1833 */ 1834 static size_t 1835 segvn_count_swap_by_vpages(struct seg *seg) 1836 { 1837 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1838 struct vpage *vp, *evp; 1839 size_t nswappages = 0; 1840 1841 ASSERT(svd->pageswap); 1842 ASSERT(svd->vpage != NULL); 1843 1844 evp = &svd->vpage[seg_page(seg, seg->s_base + seg->s_size)]; 1845 1846 for (vp = svd->vpage; vp < evp; vp++) { 1847 if (VPP_ISSWAPRES(vp)) 1848 nswappages++; 1849 } 1850 1851 return (nswappages << PAGESHIFT); 1852 } 1853 1854 static int 1855 segvn_unmap(struct seg *seg, caddr_t addr, size_t len) 1856 { 1857 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1858 struct segvn_data *nsvd; 1859 struct seg *nseg; 1860 struct anon_map *amp; 1861 pgcnt_t opages; /* old segment size in pages */ 1862 pgcnt_t npages; /* new segment size in pages */ 1863 pgcnt_t dpages; /* pages being deleted (unmapped) */ 1864 hat_callback_t callback; /* used for free_vp_pages() */ 1865 hat_callback_t *cbp = NULL; 1866 caddr_t nbase; 1867 size_t nsize; 1868 size_t oswresv; 1869 int reclaim = 1; 1870 1871 /* 1872 * We don't need any segment level locks for "segvn" data 1873 * since the address space is "write" locked. 1874 */ 1875 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as)); 1876 1877 /* 1878 * Fail the unmap if pages are SOFTLOCKed through this mapping. 1879 * softlockcnt is protected from change by the as write lock. 1880 */ 1881 retry: 1882 if (svd->softlockcnt > 0) { 1883 ASSERT(svd->tr_state == SEGVN_TR_OFF); 1884 1885 /* 1886 * If this is shared segment non 0 softlockcnt 1887 * means locked pages are still in use. 1888 */ 1889 if (svd->type == MAP_SHARED) { 1890 return (EAGAIN); 1891 } 1892 1893 /* 1894 * since we do have the writers lock nobody can fill 1895 * the cache during the purge. The flush either succeeds 1896 * or we still have pending I/Os. 
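		 *
		 * Sketch of the retry idiom used here (an illustrative
		 * restatement of the code below):
		 *
		 *	if (reclaim == 1) {
		 *		segvn_purge(seg);
		 *		reclaim = 0;
		 *		goto retry;
		 *	}
		 *	return (EAGAIN);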
1897 */ 1898 if (reclaim == 1) { 1899 segvn_purge(seg); 1900 reclaim = 0; 1901 goto retry; 1902 } 1903 return (EAGAIN); 1904 } 1905 1906 /* 1907 * Check for bad sizes 1908 */ 1909 if (addr < seg->s_base || addr + len > seg->s_base + seg->s_size || 1910 (len & PAGEOFFSET) || ((uintptr_t)addr & PAGEOFFSET)) { 1911 panic("segvn_unmap"); 1912 /*NOTREACHED*/ 1913 } 1914 1915 if (seg->s_szc != 0) { 1916 size_t pgsz = page_get_pagesize(seg->s_szc); 1917 int err; 1918 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) { 1919 ASSERT(seg->s_base != addr || seg->s_size != len); 1920 if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) { 1921 ASSERT(svd->amp == NULL); 1922 ASSERT(svd->tr_state == SEGVN_TR_OFF); 1923 hat_leave_region(seg->s_as->a_hat, 1924 svd->rcookie, HAT_REGION_TEXT); 1925 svd->rcookie = HAT_INVALID_REGION_COOKIE; 1926 /* 1927 * could pass a flag to segvn_demote_range() 1928 * below to tell it not to do any unloads but 1929 * this case is rare enough to not bother for 1930 * now. 1931 */ 1932 } else if (svd->tr_state == SEGVN_TR_INIT) { 1933 svd->tr_state = SEGVN_TR_OFF; 1934 } else if (svd->tr_state == SEGVN_TR_ON) { 1935 ASSERT(svd->amp != NULL); 1936 segvn_textunrepl(seg, 1); 1937 ASSERT(svd->amp == NULL); 1938 ASSERT(svd->tr_state == SEGVN_TR_OFF); 1939 } 1940 VM_STAT_ADD(segvnvmstats.demoterange[0]); 1941 err = segvn_demote_range(seg, addr, len, SDR_END, 0); 1942 if (err == 0) { 1943 return (IE_RETRY); 1944 } 1945 return (err); 1946 } 1947 } 1948 1949 /* Inform the vnode of the unmapping. */ 1950 if (svd->vp) { 1951 int error; 1952 1953 error = VOP_DELMAP(svd->vp, 1954 (offset_t)svd->offset + (uintptr_t)(addr - seg->s_base), 1955 seg->s_as, addr, len, svd->prot, svd->maxprot, 1956 svd->type, svd->cred, NULL); 1957 1958 if (error == EAGAIN) 1959 return (error); 1960 } 1961 1962 /* 1963 * Remove any page locks set through this mapping. 1964 * If text replication is not off no page locks could have been 1965 * established via this mapping. 1966 */ 1967 if (svd->tr_state == SEGVN_TR_OFF) { 1968 (void) segvn_lockop(seg, addr, len, 0, MC_UNLOCK, NULL, 0); 1969 } 1970 1971 if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) { 1972 ASSERT(svd->amp == NULL); 1973 ASSERT(svd->tr_state == SEGVN_TR_OFF); 1974 ASSERT(svd->type == MAP_PRIVATE); 1975 hat_leave_region(seg->s_as->a_hat, svd->rcookie, 1976 HAT_REGION_TEXT); 1977 svd->rcookie = HAT_INVALID_REGION_COOKIE; 1978 } else if (svd->tr_state == SEGVN_TR_ON) { 1979 ASSERT(svd->amp != NULL); 1980 ASSERT(svd->pageprot == 0 && !(svd->prot & PROT_WRITE)); 1981 segvn_textunrepl(seg, 1); 1982 ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF); 1983 } else { 1984 if (svd->tr_state != SEGVN_TR_OFF) { 1985 ASSERT(svd->tr_state == SEGVN_TR_INIT); 1986 svd->tr_state = SEGVN_TR_OFF; 1987 } 1988 /* 1989 * Unload any hardware translations in the range to be taken 1990 * out. Use a callback to invoke free_vp_pages() effectively. 
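 *
 * The callback is only wired up when there is a vnode whose pages can
 * be freed; otherwise a NULL callback pointer is passed and the HAT
 * simply unloads the translations. A condensed sketch of the setup
 * performed below:
 *
 *	hat_callback_t callback, *cbp = NULL;
 *
 *	if (svd->vp != NULL && free_pages != 0) {
 *		callback.hcb_data = seg;
 *		callback.hcb_function = segvn_hat_unload_callback;
 *		cbp = &callback;
 *	}
 *	hat_unload_callback(seg->s_as->a_hat, addr, len,
 *	    HAT_UNLOAD_UNMAP, cbp);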
1991 */ 1992 if (svd->vp != NULL && free_pages != 0) { 1993 callback.hcb_data = seg; 1994 callback.hcb_function = segvn_hat_unload_callback; 1995 cbp = &callback; 1996 } 1997 hat_unload_callback(seg->s_as->a_hat, addr, len, 1998 HAT_UNLOAD_UNMAP, cbp); 1999 2000 if (svd->type == MAP_SHARED && svd->vp != NULL && 2001 (svd->vp->v_flag & VVMEXEC) && 2002 ((svd->prot & PROT_WRITE) || svd->pageprot)) { 2003 segvn_inval_trcache(svd->vp); 2004 } 2005 } 2006 2007 /* 2008 * Check for entire segment 2009 */ 2010 if (addr == seg->s_base && len == seg->s_size) { 2011 seg_free(seg); 2012 return (0); 2013 } 2014 2015 opages = seg_pages(seg); 2016 dpages = btop(len); 2017 npages = opages - dpages; 2018 amp = svd->amp; 2019 ASSERT(amp == NULL || amp->a_szc >= seg->s_szc); 2020 2021 /* 2022 * Check for beginning of segment 2023 */ 2024 if (addr == seg->s_base) { 2025 if (svd->vpage != NULL) { 2026 size_t nbytes; 2027 struct vpage *ovpage; 2028 2029 ovpage = svd->vpage; /* keep pointer to vpage */ 2030 2031 nbytes = vpgtob(npages); 2032 svd->vpage = kmem_alloc(nbytes, KM_SLEEP); 2033 bcopy(&ovpage[dpages], svd->vpage, nbytes); 2034 2035 /* free up old vpage */ 2036 kmem_free(ovpage, vpgtob(opages)); 2037 } 2038 if (amp != NULL) { 2039 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 2040 if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { 2041 /* 2042 * Shared anon map is no longer in use. Before 2043 * freeing its pages purge all entries from 2044 * pcache that belong to this amp. 2045 */ 2046 if (svd->type == MAP_SHARED) { 2047 ASSERT(amp->refcnt == 1); 2048 ASSERT(svd->softlockcnt == 0); 2049 anonmap_purge(amp); 2050 } 2051 /* 2052 * Free up now unused parts of anon_map array. 2053 */ 2054 if (amp->a_szc == seg->s_szc) { 2055 if (seg->s_szc != 0) { 2056 anon_free_pages(amp->ahp, 2057 svd->anon_index, len, 2058 seg->s_szc); 2059 } else { 2060 anon_free(amp->ahp, 2061 svd->anon_index, 2062 len); 2063 } 2064 } else { 2065 ASSERT(svd->type == MAP_SHARED); 2066 ASSERT(amp->a_szc > seg->s_szc); 2067 anon_shmap_free_pages(amp, 2068 svd->anon_index, len); 2069 } 2070 2071 /* 2072 * Unreserve swap space for the 2073 * unmapped chunk of this segment in 2074 * case it's MAP_SHARED 2075 */ 2076 if (svd->type == MAP_SHARED) { 2077 anon_unresv_zone(len, 2078 seg->s_as->a_proc->p_zone); 2079 amp->swresv -= len; 2080 } 2081 } 2082 ANON_LOCK_EXIT(&->a_rwlock); 2083 svd->anon_index += dpages; 2084 } 2085 if (svd->vp != NULL) 2086 svd->offset += len; 2087 2088 seg->s_base += len; 2089 seg->s_size -= len; 2090 2091 if (svd->swresv) { 2092 if (svd->flags & MAP_NORESERVE) { 2093 ASSERT(amp); 2094 oswresv = svd->swresv; 2095 2096 svd->swresv = ptob(anon_pages(amp->ahp, 2097 svd->anon_index, npages)); 2098 anon_unresv_zone(oswresv - svd->swresv, 2099 seg->s_as->a_proc->p_zone); 2100 if (SEG_IS_PARTIAL_RESV(seg)) 2101 seg->s_as->a_resvsize -= oswresv - 2102 svd->swresv; 2103 } else { 2104 size_t unlen; 2105 2106 if (svd->pageswap) { 2107 oswresv = svd->swresv; 2108 svd->swresv = 2109 segvn_count_swap_by_vpages(seg); 2110 ASSERT(oswresv >= svd->swresv); 2111 unlen = oswresv - svd->swresv; 2112 } else { 2113 svd->swresv -= len; 2114 ASSERT(svd->swresv == seg->s_size); 2115 unlen = len; 2116 } 2117 anon_unresv_zone(unlen, 2118 seg->s_as->a_proc->p_zone); 2119 } 2120 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 2121 seg, len, 0); 2122 } 2123 2124 return (0); 2125 } 2126 2127 /* 2128 * Check for end of segment 2129 */ 2130 if (addr + len == seg->s_base + seg->s_size) { 2131 if (svd->vpage != NULL) { 2132 size_t nbytes; 2133 struct 
vpage *ovpage; 2134 2135 ovpage = svd->vpage; /* keep pointer to vpage */ 2136 2137 nbytes = vpgtob(npages); 2138 svd->vpage = kmem_alloc(nbytes, KM_SLEEP); 2139 bcopy(ovpage, svd->vpage, nbytes); 2140 2141 /* free up old vpage */ 2142 kmem_free(ovpage, vpgtob(opages)); 2143 2144 } 2145 if (amp != NULL) { 2146 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 2147 if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { 2148 /* 2149 * Free up now unused parts of anon_map array. 2150 */ 2151 ulong_t an_idx = svd->anon_index + npages; 2152 2153 /* 2154 * Shared anon map is no longer in use. Before 2155 * freeing its pages purge all entries from 2156 * pcache that belong to this amp. 2157 */ 2158 if (svd->type == MAP_SHARED) { 2159 ASSERT(amp->refcnt == 1); 2160 ASSERT(svd->softlockcnt == 0); 2161 anonmap_purge(amp); 2162 } 2163 2164 if (amp->a_szc == seg->s_szc) { 2165 if (seg->s_szc != 0) { 2166 anon_free_pages(amp->ahp, 2167 an_idx, len, 2168 seg->s_szc); 2169 } else { 2170 anon_free(amp->ahp, an_idx, 2171 len); 2172 } 2173 } else { 2174 ASSERT(svd->type == MAP_SHARED); 2175 ASSERT(amp->a_szc > seg->s_szc); 2176 anon_shmap_free_pages(amp, 2177 an_idx, len); 2178 } 2179 2180 /* 2181 * Unreserve swap space for the 2182 * unmapped chunk of this segment in 2183 * case it's MAP_SHARED 2184 */ 2185 if (svd->type == MAP_SHARED) { 2186 anon_unresv_zone(len, 2187 seg->s_as->a_proc->p_zone); 2188 amp->swresv -= len; 2189 } 2190 } 2191 ANON_LOCK_EXIT(&->a_rwlock); 2192 } 2193 2194 seg->s_size -= len; 2195 2196 if (svd->swresv) { 2197 if (svd->flags & MAP_NORESERVE) { 2198 ASSERT(amp); 2199 oswresv = svd->swresv; 2200 svd->swresv = ptob(anon_pages(amp->ahp, 2201 svd->anon_index, npages)); 2202 anon_unresv_zone(oswresv - svd->swresv, 2203 seg->s_as->a_proc->p_zone); 2204 if (SEG_IS_PARTIAL_RESV(seg)) 2205 seg->s_as->a_resvsize -= oswresv - 2206 svd->swresv; 2207 } else { 2208 size_t unlen; 2209 2210 if (svd->pageswap) { 2211 oswresv = svd->swresv; 2212 svd->swresv = 2213 segvn_count_swap_by_vpages(seg); 2214 ASSERT(oswresv >= svd->swresv); 2215 unlen = oswresv - svd->swresv; 2216 } else { 2217 svd->swresv -= len; 2218 ASSERT(svd->swresv == seg->s_size); 2219 unlen = len; 2220 } 2221 anon_unresv_zone(unlen, 2222 seg->s_as->a_proc->p_zone); 2223 } 2224 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 2225 "anon proc:%p %lu %u", seg, len, 0); 2226 } 2227 2228 return (0); 2229 } 2230 2231 /* 2232 * The section to go is in the middle of the segment, 2233 * have to make it into two segments. nseg is made for 2234 * the high end while seg is cut down at the low end. 
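 *
 * Illustrative layout of the split (the ranges are schematic only):
 *
 *	before:	seg	[s_base ........................ s_base + s_size)
 *	unmap:		         [addr ..... addr + len)
 *	after:	seg	[s_base .. addr)
 *		nseg	                 [addr + len ..... s_base + s_size)
 *
 * which is exactly the arithmetic performed below:
 *
 *	nbase = addr + len;
 *	nsize = (seg->s_base + seg->s_size) - nbase;
 *	seg->s_size = addr - seg->s_base;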
2235 */ 2236 nbase = addr + len; /* new seg base */ 2237 nsize = (seg->s_base + seg->s_size) - nbase; /* new seg size */ 2238 seg->s_size = addr - seg->s_base; /* shrink old seg */ 2239 nseg = seg_alloc(seg->s_as, nbase, nsize); 2240 if (nseg == NULL) { 2241 panic("segvn_unmap seg_alloc"); 2242 /*NOTREACHED*/ 2243 } 2244 nseg->s_ops = seg->s_ops; 2245 nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 2246 nseg->s_data = (void *)nsvd; 2247 nseg->s_szc = seg->s_szc; 2248 *nsvd = *svd; 2249 nsvd->seg = nseg; 2250 nsvd->offset = svd->offset + (uintptr_t)(nseg->s_base - seg->s_base); 2251 nsvd->swresv = 0; 2252 nsvd->softlockcnt = 0; 2253 nsvd->softlockcnt_sbase = 0; 2254 nsvd->softlockcnt_send = 0; 2255 nsvd->svn_inz = svd->svn_inz; 2256 ASSERT(nsvd->rcookie == HAT_INVALID_REGION_COOKIE); 2257 2258 if (svd->vp != NULL) { 2259 VN_HOLD(nsvd->vp); 2260 if (nsvd->type == MAP_SHARED) 2261 lgrp_shm_policy_init(NULL, nsvd->vp); 2262 } 2263 crhold(svd->cred); 2264 2265 if (svd->vpage == NULL) { 2266 nsvd->vpage = NULL; 2267 } else { 2268 /* need to split vpage into two arrays */ 2269 size_t nbytes; 2270 struct vpage *ovpage; 2271 2272 ovpage = svd->vpage; /* keep pointer to vpage */ 2273 2274 npages = seg_pages(seg); /* seg has shrunk */ 2275 nbytes = vpgtob(npages); 2276 svd->vpage = kmem_alloc(nbytes, KM_SLEEP); 2277 2278 bcopy(ovpage, svd->vpage, nbytes); 2279 2280 npages = seg_pages(nseg); 2281 nbytes = vpgtob(npages); 2282 nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP); 2283 2284 bcopy(&ovpage[opages - npages], nsvd->vpage, nbytes); 2285 2286 /* free up old vpage */ 2287 kmem_free(ovpage, vpgtob(opages)); 2288 } 2289 2290 if (amp == NULL) { 2291 nsvd->amp = NULL; 2292 nsvd->anon_index = 0; 2293 } else { 2294 /* 2295 * Need to create a new anon map for the new segment. 2296 * We'll also allocate a new smaller array for the old 2297 * smaller segment to save space. 2298 */ 2299 opages = btop((uintptr_t)(addr - seg->s_base)); 2300 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 2301 if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { 2302 /* 2303 * Free up now unused parts of anon_map array. 2304 */ 2305 ulong_t an_idx = svd->anon_index + opages; 2306 2307 /* 2308 * Shared anon map is no longer in use. Before 2309 * freeing its pages purge all entries from 2310 * pcache that belong to this amp. 
2311 */ 2312 if (svd->type == MAP_SHARED) { 2313 ASSERT(amp->refcnt == 1); 2314 ASSERT(svd->softlockcnt == 0); 2315 anonmap_purge(amp); 2316 } 2317 2318 if (amp->a_szc == seg->s_szc) { 2319 if (seg->s_szc != 0) { 2320 anon_free_pages(amp->ahp, an_idx, len, 2321 seg->s_szc); 2322 } else { 2323 anon_free(amp->ahp, an_idx, 2324 len); 2325 } 2326 } else { 2327 ASSERT(svd->type == MAP_SHARED); 2328 ASSERT(amp->a_szc > seg->s_szc); 2329 anon_shmap_free_pages(amp, an_idx, len); 2330 } 2331 2332 /* 2333 * Unreserve swap space for the 2334 * unmapped chunk of this segment in 2335 * case it's MAP_SHARED 2336 */ 2337 if (svd->type == MAP_SHARED) { 2338 anon_unresv_zone(len, 2339 seg->s_as->a_proc->p_zone); 2340 amp->swresv -= len; 2341 } 2342 } 2343 nsvd->anon_index = svd->anon_index + 2344 btop((uintptr_t)(nseg->s_base - seg->s_base)); 2345 if (svd->type == MAP_SHARED) { 2346 amp->refcnt++; 2347 nsvd->amp = amp; 2348 } else { 2349 struct anon_map *namp; 2350 struct anon_hdr *nahp; 2351 2352 ASSERT(svd->type == MAP_PRIVATE); 2353 nahp = anon_create(btop(seg->s_size), ANON_SLEEP); 2354 namp = anonmap_alloc(nseg->s_size, 0, ANON_SLEEP); 2355 namp->a_szc = seg->s_szc; 2356 (void) anon_copy_ptr(amp->ahp, svd->anon_index, nahp, 2357 0, btop(seg->s_size), ANON_SLEEP); 2358 (void) anon_copy_ptr(amp->ahp, nsvd->anon_index, 2359 namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP); 2360 anon_release(amp->ahp, btop(amp->size)); 2361 svd->anon_index = 0; 2362 nsvd->anon_index = 0; 2363 amp->ahp = nahp; 2364 amp->size = seg->s_size; 2365 nsvd->amp = namp; 2366 } 2367 ANON_LOCK_EXIT(&->a_rwlock); 2368 } 2369 if (svd->swresv) { 2370 if (svd->flags & MAP_NORESERVE) { 2371 ASSERT(amp); 2372 oswresv = svd->swresv; 2373 svd->swresv = ptob(anon_pages(amp->ahp, 2374 svd->anon_index, btop(seg->s_size))); 2375 nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp, 2376 nsvd->anon_index, btop(nseg->s_size))); 2377 ASSERT(oswresv >= (svd->swresv + nsvd->swresv)); 2378 anon_unresv_zone(oswresv - (svd->swresv + nsvd->swresv), 2379 seg->s_as->a_proc->p_zone); 2380 if (SEG_IS_PARTIAL_RESV(seg)) 2381 seg->s_as->a_resvsize -= oswresv - 2382 (svd->swresv + nsvd->swresv); 2383 } else { 2384 size_t unlen; 2385 2386 if (svd->pageswap) { 2387 oswresv = svd->swresv; 2388 svd->swresv = segvn_count_swap_by_vpages(seg); 2389 nsvd->swresv = segvn_count_swap_by_vpages(nseg); 2390 ASSERT(oswresv >= (svd->swresv + nsvd->swresv)); 2391 unlen = oswresv - (svd->swresv + nsvd->swresv); 2392 } else { 2393 if (seg->s_size + nseg->s_size + len != 2394 svd->swresv) { 2395 panic("segvn_unmap: cannot split " 2396 "swap reservation"); 2397 /*NOTREACHED*/ 2398 } 2399 svd->swresv = seg->s_size; 2400 nsvd->swresv = nseg->s_size; 2401 unlen = len; 2402 } 2403 anon_unresv_zone(unlen, 2404 seg->s_as->a_proc->p_zone); 2405 } 2406 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 2407 seg, len, 0); 2408 } 2409 2410 return (0); /* I'm glad that's all over with! */ 2411 } 2412 2413 static void 2414 segvn_free(struct seg *seg) 2415 { 2416 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 2417 pgcnt_t npages = seg_pages(seg); 2418 struct anon_map *amp; 2419 size_t len; 2420 2421 /* 2422 * We don't need any segment level locks for "segvn" data 2423 * since the address space is "write" locked. 2424 */ 2425 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as)); 2426 ASSERT(svd->tr_state == SEGVN_TR_OFF); 2427 2428 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 2429 2430 /* 2431 * Be sure to unlock pages. XXX Why do things get free'ed instead 2432 * of unmapped? 
XXX 2433 */ 2434 (void) segvn_lockop(seg, seg->s_base, seg->s_size, 2435 0, MC_UNLOCK, NULL, 0); 2436 2437 /* 2438 * Deallocate the vpage and anon pointers if necessary and possible. 2439 */ 2440 if (svd->vpage != NULL) { 2441 kmem_free(svd->vpage, vpgtob(npages)); 2442 svd->vpage = NULL; 2443 } 2444 if ((amp = svd->amp) != NULL) { 2445 /* 2446 * If there are no more references to this anon_map 2447 * structure, then deallocate the structure after freeing 2448 * up all the anon slot pointers that we can. 2449 */ 2450 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 2451 ASSERT(amp->a_szc >= seg->s_szc); 2452 if (--amp->refcnt == 0) { 2453 if (svd->type == MAP_PRIVATE) { 2454 /* 2455 * Private - we only need to anon_free 2456 * the part that this segment refers to. 2457 */ 2458 if (seg->s_szc != 0) { 2459 anon_free_pages(amp->ahp, 2460 svd->anon_index, seg->s_size, 2461 seg->s_szc); 2462 } else { 2463 anon_free(amp->ahp, svd->anon_index, 2464 seg->s_size); 2465 } 2466 } else { 2467 2468 /* 2469 * Shared anon map is no longer in use. Before 2470 * freeing its pages purge all entries from 2471 * pcache that belong to this amp. 2472 */ 2473 ASSERT(svd->softlockcnt == 0); 2474 anonmap_purge(amp); 2475 2476 /* 2477 * Shared - anon_free the entire 2478 * anon_map's worth of stuff and 2479 * release any swap reservation. 2480 */ 2481 if (amp->a_szc != 0) { 2482 anon_shmap_free_pages(amp, 0, 2483 amp->size); 2484 } else { 2485 anon_free(amp->ahp, 0, amp->size); 2486 } 2487 if ((len = amp->swresv) != 0) { 2488 anon_unresv_zone(len, 2489 seg->s_as->a_proc->p_zone); 2490 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 2491 "anon proc:%p %lu %u", seg, len, 0); 2492 } 2493 } 2494 svd->amp = NULL; 2495 ANON_LOCK_EXIT(&->a_rwlock); 2496 anonmap_free(amp); 2497 } else if (svd->type == MAP_PRIVATE) { 2498 /* 2499 * We had a private mapping which still has 2500 * a held anon_map so just free up all the 2501 * anon slot pointers that we were using. 2502 */ 2503 if (seg->s_szc != 0) { 2504 anon_free_pages(amp->ahp, svd->anon_index, 2505 seg->s_size, seg->s_szc); 2506 } else { 2507 anon_free(amp->ahp, svd->anon_index, 2508 seg->s_size); 2509 } 2510 ANON_LOCK_EXIT(&->a_rwlock); 2511 } else { 2512 ANON_LOCK_EXIT(&->a_rwlock); 2513 } 2514 } 2515 2516 /* 2517 * Release swap reservation. 2518 */ 2519 if ((len = svd->swresv) != 0) { 2520 anon_unresv_zone(svd->swresv, 2521 seg->s_as->a_proc->p_zone); 2522 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 2523 seg, len, 0); 2524 if (SEG_IS_PARTIAL_RESV(seg)) 2525 seg->s_as->a_resvsize -= svd->swresv; 2526 svd->swresv = 0; 2527 } 2528 /* 2529 * Release claim on vnode, credentials, and finally free the 2530 * private data. 2531 */ 2532 if (svd->vp != NULL) { 2533 if (svd->type == MAP_SHARED) 2534 lgrp_shm_policy_fini(NULL, svd->vp); 2535 VN_RELE(svd->vp); 2536 svd->vp = NULL; 2537 } 2538 crfree(svd->cred); 2539 svd->pageprot = 0; 2540 svd->pageadvice = 0; 2541 svd->pageswap = 0; 2542 svd->cred = NULL; 2543 2544 /* 2545 * Take segfree_syncmtx lock to let segvn_reclaim() finish if it's 2546 * still working with this segment without holding as lock (in case 2547 * it's called by pcache async thread). 2548 */ 2549 ASSERT(svd->softlockcnt == 0); 2550 mutex_enter(&svd->segfree_syncmtx); 2551 mutex_exit(&svd->segfree_syncmtx); 2552 2553 seg->s_data = NULL; 2554 kmem_cache_free(segvn_cache, svd); 2555 } 2556 2557 /* 2558 * Do a F_SOFTUNLOCK call over the range requested. The range must have 2559 * already been F_SOFTLOCK'ed. 
2560 * Caller must always match addr and len of a softunlock with a previous 2561 * softlock with exactly the same addr and len. 2562 */ 2563 static void 2564 segvn_softunlock(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw) 2565 { 2566 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 2567 page_t *pp; 2568 caddr_t adr; 2569 struct vnode *vp; 2570 u_offset_t offset; 2571 ulong_t anon_index; 2572 struct anon_map *amp; 2573 struct anon *ap = NULL; 2574 2575 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); 2576 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 2577 2578 if ((amp = svd->amp) != NULL) 2579 anon_index = svd->anon_index + seg_page(seg, addr); 2580 2581 if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) { 2582 ASSERT(svd->tr_state == SEGVN_TR_OFF); 2583 hat_unlock_region(seg->s_as->a_hat, addr, len, svd->rcookie); 2584 } else { 2585 hat_unlock(seg->s_as->a_hat, addr, len); 2586 } 2587 for (adr = addr; adr < addr + len; adr += PAGESIZE) { 2588 if (amp != NULL) { 2589 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 2590 if ((ap = anon_get_ptr(amp->ahp, anon_index++)) 2591 != NULL) { 2592 swap_xlate(ap, &vp, &offset); 2593 } else { 2594 vp = svd->vp; 2595 offset = svd->offset + 2596 (uintptr_t)(adr - seg->s_base); 2597 } 2598 ANON_LOCK_EXIT(&->a_rwlock); 2599 } else { 2600 vp = svd->vp; 2601 offset = svd->offset + 2602 (uintptr_t)(adr - seg->s_base); 2603 } 2604 2605 /* 2606 * Use page_find() instead of page_lookup() to 2607 * find the page since we know that it is locked. 2608 */ 2609 pp = page_find(vp, offset); 2610 if (pp == NULL) { 2611 panic( 2612 "segvn_softunlock: addr %p, ap %p, vp %p, off %llx", 2613 (void *)adr, (void *)ap, (void *)vp, offset); 2614 /*NOTREACHED*/ 2615 } 2616 2617 if (rw == S_WRITE) { 2618 hat_setrefmod(pp); 2619 if (seg->s_as->a_vbits) 2620 hat_setstat(seg->s_as, adr, PAGESIZE, 2621 P_REF | P_MOD); 2622 } else if (rw != S_OTHER) { 2623 hat_setref(pp); 2624 if (seg->s_as->a_vbits) 2625 hat_setstat(seg->s_as, adr, PAGESIZE, P_REF); 2626 } 2627 TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT, 2628 "segvn_fault:pp %p vp %p offset %llx", pp, vp, offset); 2629 page_unlock(pp); 2630 } 2631 ASSERT(svd->softlockcnt >= btop(len)); 2632 if (!atomic_add_long_nv((ulong_t *)&svd->softlockcnt, -btop(len))) { 2633 /* 2634 * All SOFTLOCKS are gone. Wakeup any waiting 2635 * unmappers so they can try again to unmap. 2636 * Check for waiters first without the mutex 2637 * held so we don't always grab the mutex on 2638 * softunlocks. 2639 */ 2640 if (AS_ISUNMAPWAIT(seg->s_as)) { 2641 mutex_enter(&seg->s_as->a_contents); 2642 if (AS_ISUNMAPWAIT(seg->s_as)) { 2643 AS_CLRUNMAPWAIT(seg->s_as); 2644 cv_broadcast(&seg->s_as->a_cv); 2645 } 2646 mutex_exit(&seg->s_as->a_contents); 2647 } 2648 } 2649 } 2650 2651 #define PAGE_HANDLED ((page_t *)-1) 2652 2653 /* 2654 * Release all the pages in the NULL terminated ppp list 2655 * which haven't already been converted to PAGE_HANDLED. 2656 */ 2657 static void 2658 segvn_pagelist_rele(page_t **ppp) 2659 { 2660 for (; *ppp != NULL; ppp++) { 2661 if (*ppp != PAGE_HANDLED) 2662 page_unlock(*ppp); 2663 } 2664 } 2665 2666 static int stealcow = 1; 2667 2668 /* 2669 * Workaround for viking chip bug. See bug id 1220902. 2670 * To fix this down in pagefault() would require importing so 2671 * much as and segvn code as to be unmaintainable. 2672 */ 2673 int enable_mbit_wa = 0; 2674 2675 /* 2676 * Handles all the dirty work of getting the right 2677 * anonymous pages and loading up the translations. 
2678 * This routine is called only from segvn_fault() 2679 * when looping over the range of addresses requested. 2680 * 2681 * The basic algorithm here is: 2682 * If this is an anon_zero case 2683 * Call anon_zero to allocate page 2684 * Load up translation 2685 * Return 2686 * endif 2687 * If this is an anon page 2688 * Use anon_getpage to get the page 2689 * else 2690 * Find page in pl[] list passed in 2691 * endif 2692 * If not a cow 2693 * Load up the translation to the page 2694 * return 2695 * endif 2696 * Call anon_private to handle cow 2697 * Load up (writable) translation to new page 2698 */ 2699 static faultcode_t 2700 segvn_faultpage( 2701 struct hat *hat, /* the hat to use for mapping */ 2702 struct seg *seg, /* seg_vn of interest */ 2703 caddr_t addr, /* address in as */ 2704 u_offset_t off, /* offset in vp */ 2705 struct vpage *vpage, /* pointer to vpage for vp, off */ 2706 page_t *pl[], /* object source page pointer */ 2707 uint_t vpprot, /* access allowed to object pages */ 2708 enum fault_type type, /* type of fault */ 2709 enum seg_rw rw, /* type of access at fault */ 2710 int brkcow) /* we may need to break cow */ 2711 { 2712 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 2713 page_t *pp, **ppp; 2714 uint_t pageflags = 0; 2715 page_t *anon_pl[1 + 1]; 2716 page_t *opp = NULL; /* original page */ 2717 uint_t prot; 2718 int err; 2719 int cow; 2720 int claim; 2721 int steal = 0; 2722 ulong_t anon_index; 2723 struct anon *ap, *oldap; 2724 struct anon_map *amp; 2725 int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD; 2726 int anon_lock = 0; 2727 anon_sync_obj_t cookie; 2728 2729 if (svd->flags & MAP_TEXT) { 2730 hat_flag |= HAT_LOAD_TEXT; 2731 } 2732 2733 ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock)); 2734 ASSERT(seg->s_szc == 0); 2735 ASSERT(svd->tr_state != SEGVN_TR_INIT); 2736 2737 /* 2738 * Initialize protection value for this page. 2739 * If we have per page protection values check it now. 2740 */ 2741 if (svd->pageprot) { 2742 uint_t protchk; 2743 2744 switch (rw) { 2745 case S_READ: 2746 protchk = PROT_READ; 2747 break; 2748 case S_WRITE: 2749 protchk = PROT_WRITE; 2750 break; 2751 case S_EXEC: 2752 protchk = PROT_EXEC; 2753 break; 2754 case S_OTHER: 2755 default: 2756 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 2757 break; 2758 } 2759 2760 prot = VPP_PROT(vpage); 2761 if ((prot & protchk) == 0) 2762 return (FC_PROT); /* illegal access type */ 2763 } else { 2764 prot = svd->prot; 2765 } 2766 2767 if (type == F_SOFTLOCK) { 2768 atomic_inc_ulong((ulong_t *)&svd->softlockcnt); 2769 } 2770 2771 /* 2772 * Always acquire the anon array lock to prevent 2 threads from 2773 * allocating separate anon slots for the same "addr". 2774 */ 2775 2776 if ((amp = svd->amp) != NULL) { 2777 ASSERT(RW_READ_HELD(&->a_rwlock)); 2778 anon_index = svd->anon_index + seg_page(seg, addr); 2779 anon_array_enter(amp, anon_index, &cookie); 2780 anon_lock = 1; 2781 } 2782 2783 if (svd->vp == NULL && amp != NULL) { 2784 if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL) { 2785 /* 2786 * Allocate a (normally) writable anonymous page of 2787 * zeroes. If no advance reservations, reserve now. 
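 *
 * For MAP_NORESERVE mappings swap is reserved lazily, one page at a
 * time, as each anonymous page is first faulted in; other segments
 * reserved their swap up front at map time. A minimal sketch of the
 * per-page reservation performed below (failure simply returns ENOMEM
 * to the faulting thread):
 *
 *	if (svd->flags & MAP_NORESERVE) {
 *		if (anon_resv_zone(ptob(1),
 *		    seg->s_as->a_proc->p_zone) == 0)
 *			fail with ENOMEM;
 *		atomic_add_long(&svd->swresv, ptob(1));
 *		atomic_add_long(&seg->s_as->a_resvsize, ptob(1));
 *	}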
2788 */ 2789 if (svd->flags & MAP_NORESERVE) { 2790 if (anon_resv_zone(ptob(1), 2791 seg->s_as->a_proc->p_zone)) { 2792 atomic_add_long(&svd->swresv, ptob(1)); 2793 atomic_add_long(&seg->s_as->a_resvsize, 2794 ptob(1)); 2795 } else { 2796 err = ENOMEM; 2797 goto out; 2798 } 2799 } 2800 if ((pp = anon_zero(seg, addr, &ap, 2801 svd->cred)) == NULL) { 2802 err = ENOMEM; 2803 goto out; /* out of swap space */ 2804 } 2805 /* 2806 * Re-acquire the anon_map lock and 2807 * initialize the anon array entry. 2808 */ 2809 (void) anon_set_ptr(amp->ahp, anon_index, ap, 2810 ANON_SLEEP); 2811 2812 ASSERT(pp->p_szc == 0); 2813 2814 /* 2815 * Handle pages that have been marked for migration 2816 */ 2817 if (lgrp_optimizations()) 2818 page_migrate(seg, addr, &pp, 1); 2819 2820 if (enable_mbit_wa) { 2821 if (rw == S_WRITE) 2822 hat_setmod(pp); 2823 else if (!hat_ismod(pp)) 2824 prot &= ~PROT_WRITE; 2825 } 2826 /* 2827 * If AS_PAGLCK is set in a_flags (via memcntl(2) 2828 * with MC_LOCKAS, MCL_FUTURE) and this is a 2829 * MAP_NORESERVE segment, we may need to 2830 * permanently lock the page as it is being faulted 2831 * for the first time. The following text applies 2832 * only to MAP_NORESERVE segments: 2833 * 2834 * As per memcntl(2), if this segment was created 2835 * after MCL_FUTURE was applied (a "future" 2836 * segment), its pages must be locked. If this 2837 * segment existed at MCL_FUTURE application (a 2838 * "past" segment), the interface is unclear. 2839 * 2840 * We decide to lock only if vpage is present: 2841 * 2842 * - "future" segments will have a vpage array (see 2843 * as_map), and so will be locked as required 2844 * 2845 * - "past" segments may not have a vpage array, 2846 * depending on whether events (such as 2847 * mprotect) have occurred. Locking if vpage 2848 * exists will preserve legacy behavior. Not 2849 * locking if vpage is absent, will not break 2850 * the interface or legacy behavior. Note that 2851 * allocating vpage here if it's absent requires 2852 * upgrading the segvn reader lock, the cost of 2853 * which does not seem worthwhile. 2854 * 2855 * Usually testing and setting VPP_ISPPLOCK and 2856 * VPP_SETPPLOCK requires holding the segvn lock as 2857 * writer, but in this case all readers are 2858 * serializing on the anon array lock. 2859 */ 2860 if (AS_ISPGLCK(seg->s_as) && vpage != NULL && 2861 (svd->flags & MAP_NORESERVE) && 2862 !VPP_ISPPLOCK(vpage)) { 2863 proc_t *p = seg->s_as->a_proc; 2864 ASSERT(svd->type == MAP_PRIVATE); 2865 mutex_enter(&p->p_lock); 2866 if (rctl_incr_locked_mem(p, NULL, PAGESIZE, 2867 1) == 0) { 2868 claim = VPP_PROT(vpage) & PROT_WRITE; 2869 if (page_pp_lock(pp, claim, 0)) { 2870 VPP_SETPPLOCK(vpage); 2871 } else { 2872 rctl_decr_locked_mem(p, NULL, 2873 PAGESIZE, 1); 2874 } 2875 } 2876 mutex_exit(&p->p_lock); 2877 } 2878 2879 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 2880 hat_memload(hat, addr, pp, prot, hat_flag); 2881 2882 if (!(hat_flag & HAT_LOAD_LOCK)) 2883 page_unlock(pp); 2884 2885 anon_array_exit(&cookie); 2886 return (0); 2887 } 2888 } 2889 2890 /* 2891 * Obtain the page structure via anon_getpage() if it is 2892 * a private copy of an object (the result of a previous 2893 * copy-on-write). 
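 *
 * In summary (error handling elided, this only mirrors the code that
 * follows): when an anon slot already exists the page comes from the
 * anon layer rather than from the pl[] list,
 *
 *	if ((ap = anon_get_ptr(amp->ahp, anon_index)) != NULL) {
 *		anon_getpage(&ap, &vpprot, anon_pl, PAGESIZE,
 *		    seg, addr, rw, svd->cred);
 *		opp = anon_pl[0];
 *	}
 *
 * and for MAP_SHARED anon mappings PROT_WRITE is OR-ed back into
 * vpprot, because the protections reported by anon_getpage() describe
 * the private (COW) mappings of the same anon_map.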
2894 */ 2895 if (amp != NULL) { 2896 if ((ap = anon_get_ptr(amp->ahp, anon_index)) != NULL) { 2897 err = anon_getpage(&ap, &vpprot, anon_pl, PAGESIZE, 2898 seg, addr, rw, svd->cred); 2899 if (err) 2900 goto out; 2901 2902 if (svd->type == MAP_SHARED) { 2903 /* 2904 * If this is a shared mapping to an 2905 * anon_map, then ignore the write 2906 * permissions returned by anon_getpage(). 2907 * They apply to the private mappings 2908 * of this anon_map. 2909 */ 2910 vpprot |= PROT_WRITE; 2911 } 2912 opp = anon_pl[0]; 2913 } 2914 } 2915 2916 /* 2917 * Search the pl[] list passed in if it is from the 2918 * original object (i.e., not a private copy). 2919 */ 2920 if (opp == NULL) { 2921 /* 2922 * Find original page. We must be bringing it in 2923 * from the list in pl[]. 2924 */ 2925 for (ppp = pl; (opp = *ppp) != NULL; ppp++) { 2926 if (opp == PAGE_HANDLED) 2927 continue; 2928 ASSERT(opp->p_vnode == svd->vp); /* XXX */ 2929 if (opp->p_offset == off) 2930 break; 2931 } 2932 if (opp == NULL) { 2933 panic("segvn_faultpage not found"); 2934 /*NOTREACHED*/ 2935 } 2936 *ppp = PAGE_HANDLED; 2937 2938 } 2939 2940 ASSERT(PAGE_LOCKED(opp)); 2941 2942 TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT, 2943 "segvn_fault:pp %p vp %p offset %llx", opp, NULL, 0); 2944 2945 /* 2946 * The fault is treated as a copy-on-write fault if a 2947 * write occurs on a private segment and the object 2948 * page (i.e., mapping) is write protected. We assume 2949 * that fatal protection checks have already been made. 2950 */ 2951 2952 if (brkcow) { 2953 ASSERT(svd->tr_state == SEGVN_TR_OFF); 2954 cow = !(vpprot & PROT_WRITE); 2955 } else if (svd->tr_state == SEGVN_TR_ON) { 2956 /* 2957 * If we are doing text replication COW on first touch. 2958 */ 2959 ASSERT(amp != NULL); 2960 ASSERT(svd->vp != NULL); 2961 ASSERT(rw != S_WRITE); 2962 cow = (ap == NULL); 2963 } else { 2964 cow = 0; 2965 } 2966 2967 /* 2968 * If not a copy-on-write case load the translation 2969 * and return. 2970 */ 2971 if (cow == 0) { 2972 2973 /* 2974 * Handle pages that have been marked for migration 2975 */ 2976 if (lgrp_optimizations()) 2977 page_migrate(seg, addr, &opp, 1); 2978 2979 if (IS_VMODSORT(opp->p_vnode) || enable_mbit_wa) { 2980 if (rw == S_WRITE) 2981 hat_setmod(opp); 2982 else if (rw != S_OTHER && !hat_ismod(opp)) 2983 prot &= ~PROT_WRITE; 2984 } 2985 2986 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE || 2987 (!svd->pageprot && svd->prot == (prot & vpprot))); 2988 ASSERT(amp == NULL || 2989 svd->rcookie == HAT_INVALID_REGION_COOKIE); 2990 hat_memload_region(hat, addr, opp, prot & vpprot, hat_flag, 2991 svd->rcookie); 2992 2993 if (!(hat_flag & HAT_LOAD_LOCK)) 2994 page_unlock(opp); 2995 2996 if (anon_lock) { 2997 anon_array_exit(&cookie); 2998 } 2999 return (0); 3000 } 3001 3002 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 3003 3004 hat_setref(opp); 3005 3006 ASSERT(amp != NULL && anon_lock); 3007 3008 /* 3009 * Steal the page only if it isn't a private page 3010 * since stealing a private page is not worth the effort. 3011 */ 3012 if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL) 3013 steal = 1; 3014 3015 /* 3016 * Steal the original page if the following conditions are true: 3017 * 3018 * We are low on memory, the page is not private, page is not large, 3019 * not shared, not modified, not `locked' or if we have it `locked' 3020 * (i.e., p_cowcnt == 1 and p_lckcnt == 0, which also implies 3021 * that the page is not shared) and if it doesn't have any 3022 * translations. 
page_struct_lock isn't needed to look at p_cowcnt 3023 * and p_lckcnt because we first get exclusive lock on page. 3024 */ 3025 (void) hat_pagesync(opp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD); 3026 3027 if (stealcow && freemem < minfree && steal && opp->p_szc == 0 && 3028 page_tryupgrade(opp) && !hat_ismod(opp) && 3029 ((opp->p_lckcnt == 0 && opp->p_cowcnt == 0) || 3030 (opp->p_lckcnt == 0 && opp->p_cowcnt == 1 && 3031 vpage != NULL && VPP_ISPPLOCK(vpage)))) { 3032 /* 3033 * Check if this page has other translations 3034 * after unloading our translation. 3035 */ 3036 if (hat_page_is_mapped(opp)) { 3037 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 3038 hat_unload(seg->s_as->a_hat, addr, PAGESIZE, 3039 HAT_UNLOAD); 3040 } 3041 3042 /* 3043 * hat_unload() might sync back someone else's recent 3044 * modification, so check again. 3045 */ 3046 if (!hat_ismod(opp) && !hat_page_is_mapped(opp)) 3047 pageflags |= STEAL_PAGE; 3048 } 3049 3050 /* 3051 * If we have a vpage pointer, see if it indicates that we have 3052 * ``locked'' the page we map -- if so, tell anon_private to 3053 * transfer the locking resource to the new page. 3054 * 3055 * See Statement at the beginning of segvn_lockop regarding 3056 * the way lockcnts/cowcnts are handled during COW. 3057 * 3058 */ 3059 if (vpage != NULL && VPP_ISPPLOCK(vpage)) 3060 pageflags |= LOCK_PAGE; 3061 3062 /* 3063 * Allocate a private page and perform the copy. 3064 * For MAP_NORESERVE reserve swap space now, unless this 3065 * is a cow fault on an existing anon page in which case 3066 * MAP_NORESERVE will have made advance reservations. 3067 */ 3068 if ((svd->flags & MAP_NORESERVE) && (ap == NULL)) { 3069 if (anon_resv_zone(ptob(1), seg->s_as->a_proc->p_zone)) { 3070 atomic_add_long(&svd->swresv, ptob(1)); 3071 atomic_add_long(&seg->s_as->a_resvsize, ptob(1)); 3072 } else { 3073 page_unlock(opp); 3074 err = ENOMEM; 3075 goto out; 3076 } 3077 } 3078 oldap = ap; 3079 pp = anon_private(&ap, seg, addr, prot, opp, pageflags, svd->cred); 3080 if (pp == NULL) { 3081 err = ENOMEM; /* out of swap space */ 3082 goto out; 3083 } 3084 3085 /* 3086 * If we copied away from an anonymous page, then 3087 * we are one step closer to freeing up an anon slot. 3088 * 3089 * NOTE: The original anon slot must be released while 3090 * holding the "anon_map" lock. This is necessary to prevent 3091 * other threads from obtaining a pointer to the anon slot 3092 * which may be freed if its "refcnt" is 1. 3093 */ 3094 if (oldap != NULL) 3095 anon_decref(oldap); 3096 3097 (void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP); 3098 3099 /* 3100 * Handle pages that have been marked for migration 3101 */ 3102 if (lgrp_optimizations()) 3103 page_migrate(seg, addr, &pp, 1); 3104 3105 ASSERT(pp->p_szc == 0); 3106 3107 ASSERT(!IS_VMODSORT(pp->p_vnode)); 3108 if (enable_mbit_wa) { 3109 if (rw == S_WRITE) 3110 hat_setmod(pp); 3111 else if (!hat_ismod(pp)) 3112 prot &= ~PROT_WRITE; 3113 } 3114 3115 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 3116 hat_memload(hat, addr, pp, prot, hat_flag); 3117 3118 if (!(hat_flag & HAT_LOAD_LOCK)) 3119 page_unlock(pp); 3120 3121 ASSERT(anon_lock); 3122 anon_array_exit(&cookie); 3123 return (0); 3124 out: 3125 if (anon_lock) 3126 anon_array_exit(&cookie); 3127 3128 if (type == F_SOFTLOCK) { 3129 atomic_dec_ulong((ulong_t *)&svd->softlockcnt); 3130 } 3131 return (FC_MAKE_ERR(err)); 3132 } 3133 3134 /* 3135 * relocate a bunch of smaller targ pages into one large repl page. 
all targ 3136 * pages must be complete pages smaller than replacement pages. 3137 * it's assumed that no page's szc can change since they are all PAGESIZE or 3138 * complete large pages locked SHARED. 3139 */ 3140 static void 3141 segvn_relocate_pages(page_t **targ, page_t *replacement) 3142 { 3143 page_t *pp; 3144 pgcnt_t repl_npgs, curnpgs; 3145 pgcnt_t i; 3146 uint_t repl_szc = replacement->p_szc; 3147 page_t *first_repl = replacement; 3148 page_t *repl; 3149 spgcnt_t npgs; 3150 3151 VM_STAT_ADD(segvnvmstats.relocatepages[0]); 3152 3153 ASSERT(repl_szc != 0); 3154 npgs = repl_npgs = page_get_pagecnt(repl_szc); 3155 3156 i = 0; 3157 while (repl_npgs) { 3158 spgcnt_t nreloc; 3159 int err; 3160 ASSERT(replacement != NULL); 3161 pp = targ[i]; 3162 ASSERT(pp->p_szc < repl_szc); 3163 ASSERT(PAGE_EXCL(pp)); 3164 ASSERT(!PP_ISFREE(pp)); 3165 curnpgs = page_get_pagecnt(pp->p_szc); 3166 if (curnpgs == 1) { 3167 VM_STAT_ADD(segvnvmstats.relocatepages[1]); 3168 repl = replacement; 3169 page_sub(&replacement, repl); 3170 ASSERT(PAGE_EXCL(repl)); 3171 ASSERT(!PP_ISFREE(repl)); 3172 ASSERT(repl->p_szc == repl_szc); 3173 } else { 3174 page_t *repl_savepp; 3175 int j; 3176 VM_STAT_ADD(segvnvmstats.relocatepages[2]); 3177 repl_savepp = replacement; 3178 for (j = 0; j < curnpgs; j++) { 3179 repl = replacement; 3180 page_sub(&replacement, repl); 3181 ASSERT(PAGE_EXCL(repl)); 3182 ASSERT(!PP_ISFREE(repl)); 3183 ASSERT(repl->p_szc == repl_szc); 3184 ASSERT(page_pptonum(targ[i + j]) == 3185 page_pptonum(targ[i]) + j); 3186 } 3187 repl = repl_savepp; 3188 ASSERT(IS_P2ALIGNED(page_pptonum(repl), curnpgs)); 3189 } 3190 err = page_relocate(&pp, &repl, 0, 1, &nreloc, NULL); 3191 if (err || nreloc != curnpgs) { 3192 panic("segvn_relocate_pages: " 3193 "page_relocate failed err=%d curnpgs=%ld " 3194 "nreloc=%ld", err, curnpgs, nreloc); 3195 } 3196 ASSERT(curnpgs <= repl_npgs); 3197 repl_npgs -= curnpgs; 3198 i += curnpgs; 3199 } 3200 ASSERT(replacement == NULL); 3201 3202 repl = first_repl; 3203 repl_npgs = npgs; 3204 for (i = 0; i < repl_npgs; i++) { 3205 ASSERT(PAGE_EXCL(repl)); 3206 ASSERT(!PP_ISFREE(repl)); 3207 targ[i] = repl; 3208 page_downgrade(targ[i]); 3209 repl++; 3210 } 3211 } 3212 3213 /* 3214 * Check if all pages in ppa array are complete smaller than szc pages and 3215 * their roots will still be aligned relative to their current size if the 3216 * entire ppa array is relocated into one szc page. If these conditions are 3217 * not met return 0. 3218 * 3219 * If all pages are properly aligned attempt to upgrade their locks 3220 * to exclusive mode. If it fails set *upgrdfail to 1 and return 0. 3221 * upgrdfail was set to 0 by caller. 3222 * 3223 * Return 1 if all pages are aligned and locked exclusively. 3224 * 3225 * If all pages in ppa array happen to be physically contiguous to make one 3226 * szc page and all exclusive locks are successfully obtained promote the page 3227 * size to szc and set *pszc to szc. Return 1 with pages locked shared. 
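 *
 * A hedged usage sketch of this contract (the caller shown here is
 * illustrative; the real caller is the large page fault path later in
 * this file):
 *
 *	int upgrdfail = 0;
 *	uint_t pszc = 0;
 *
 *	if (segvn_full_szcpages(ppa, szc, &upgrdfail, &pszc)) {
 *		if (pszc == szc)
 *			(already one contiguous szc page, promoted and
 *			 left locked shared)
 *		else
 *			(all pages locked SE_EXCL, ready to relocate)
 *	} else if (upgrdfail) {
 *		(a lock upgrade failed; pszc reports that page's size)
 *	} else {
 *		(pages cannot form an aligned szc page, map small)
 *	}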
3228 */ 3229 static int 3230 segvn_full_szcpages(page_t **ppa, uint_t szc, int *upgrdfail, uint_t *pszc) 3231 { 3232 page_t *pp; 3233 pfn_t pfn; 3234 pgcnt_t totnpgs = page_get_pagecnt(szc); 3235 pfn_t first_pfn; 3236 int contig = 1; 3237 pgcnt_t i; 3238 pgcnt_t j; 3239 uint_t curszc; 3240 pgcnt_t curnpgs; 3241 int root = 0; 3242 3243 ASSERT(szc > 0); 3244 3245 VM_STAT_ADD(segvnvmstats.fullszcpages[0]); 3246 3247 for (i = 0; i < totnpgs; i++) { 3248 pp = ppa[i]; 3249 ASSERT(PAGE_SHARED(pp)); 3250 ASSERT(!PP_ISFREE(pp)); 3251 pfn = page_pptonum(pp); 3252 if (i == 0) { 3253 if (!IS_P2ALIGNED(pfn, totnpgs)) { 3254 contig = 0; 3255 } else { 3256 first_pfn = pfn; 3257 } 3258 } else if (contig && pfn != first_pfn + i) { 3259 contig = 0; 3260 } 3261 if (pp->p_szc == 0) { 3262 if (root) { 3263 VM_STAT_ADD(segvnvmstats.fullszcpages[1]); 3264 return (0); 3265 } 3266 } else if (!root) { 3267 if ((curszc = pp->p_szc) >= szc) { 3268 VM_STAT_ADD(segvnvmstats.fullszcpages[2]); 3269 return (0); 3270 } 3271 if (curszc == 0) { 3272 /* 3273 * p_szc changed means we don't have all pages 3274 * locked. return failure. 3275 */ 3276 VM_STAT_ADD(segvnvmstats.fullszcpages[3]); 3277 return (0); 3278 } 3279 curnpgs = page_get_pagecnt(curszc); 3280 if (!IS_P2ALIGNED(pfn, curnpgs) || 3281 !IS_P2ALIGNED(i, curnpgs)) { 3282 VM_STAT_ADD(segvnvmstats.fullszcpages[4]); 3283 return (0); 3284 } 3285 root = 1; 3286 } else { 3287 ASSERT(i > 0); 3288 VM_STAT_ADD(segvnvmstats.fullszcpages[5]); 3289 if (pp->p_szc != curszc) { 3290 VM_STAT_ADD(segvnvmstats.fullszcpages[6]); 3291 return (0); 3292 } 3293 if (pfn - 1 != page_pptonum(ppa[i - 1])) { 3294 panic("segvn_full_szcpages: " 3295 "large page not physically contiguous"); 3296 } 3297 if (P2PHASE(pfn, curnpgs) == curnpgs - 1) { 3298 root = 0; 3299 } 3300 } 3301 } 3302 3303 for (i = 0; i < totnpgs; i++) { 3304 ASSERT(ppa[i]->p_szc < szc); 3305 if (!page_tryupgrade(ppa[i])) { 3306 for (j = 0; j < i; j++) { 3307 page_downgrade(ppa[j]); 3308 } 3309 *pszc = ppa[i]->p_szc; 3310 *upgrdfail = 1; 3311 VM_STAT_ADD(segvnvmstats.fullszcpages[7]); 3312 return (0); 3313 } 3314 } 3315 3316 /* 3317 * When a page is put a free cachelist its szc is set to 0. if file 3318 * system reclaimed pages from cachelist targ pages will be physically 3319 * contiguous with 0 p_szc. in this case just upgrade szc of targ 3320 * pages without any relocations. 3321 * To avoid any hat issues with previous small mappings 3322 * hat_pageunload() the target pages first. 3323 */ 3324 if (contig) { 3325 VM_STAT_ADD(segvnvmstats.fullszcpages[8]); 3326 for (i = 0; i < totnpgs; i++) { 3327 (void) hat_pageunload(ppa[i], HAT_FORCE_PGUNLOAD); 3328 } 3329 for (i = 0; i < totnpgs; i++) { 3330 ppa[i]->p_szc = szc; 3331 } 3332 for (i = 0; i < totnpgs; i++) { 3333 ASSERT(PAGE_EXCL(ppa[i])); 3334 page_downgrade(ppa[i]); 3335 } 3336 if (pszc != NULL) { 3337 *pszc = szc; 3338 } 3339 } 3340 VM_STAT_ADD(segvnvmstats.fullszcpages[9]); 3341 return (1); 3342 } 3343 3344 /* 3345 * Create physically contiguous pages for [vp, off] - [vp, off + 3346 * page_size(szc)) range and for private segment return them in ppa array. 3347 * Pages are created either via IO or relocations. 3348 * 3349 * Return 1 on success and 0 on failure. 3350 * 3351 * If physically contiguous pages already exist for this range return 1 without 3352 * filling ppa array. Caller initializes ppa[0] as NULL to detect that ppa 3353 * array wasn't filled. In this case caller fills ppa array via VOP_GETPAGE(). 
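 *
 * A sketch of the calling convention, condensed from the caller later
 * in this file (segvn_fault_vnodepages()):
 *
 *	ppa[0] = NULL;
 *	physcontig = segvn_fill_vp_pages(svd, vp, off, szc, ppa,
 *	    &pplist, &pszc, &downsize);
 *	if (physcontig && ppa[0] == NULL)
 *		physcontig = 0;		(contiguous pages already exist
 *					 but ppa was not filled)
 *	if (!physcontig)
 *		(fill ppa via VOP_GETPAGE() instead)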
3354 */ 3355 3356 static int 3357 segvn_fill_vp_pages(struct segvn_data *svd, vnode_t *vp, u_offset_t off, 3358 uint_t szc, page_t **ppa, page_t **ppplist, uint_t *ret_pszc, 3359 int *downsize) 3360 3361 { 3362 page_t *pplist = *ppplist; 3363 size_t pgsz = page_get_pagesize(szc); 3364 pgcnt_t pages = btop(pgsz); 3365 ulong_t start_off = off; 3366 u_offset_t eoff = off + pgsz; 3367 spgcnt_t nreloc; 3368 u_offset_t io_off = off; 3369 size_t io_len; 3370 page_t *io_pplist = NULL; 3371 page_t *done_pplist = NULL; 3372 pgcnt_t pgidx = 0; 3373 page_t *pp; 3374 page_t *newpp; 3375 page_t *targpp; 3376 int io_err = 0; 3377 int i; 3378 pfn_t pfn; 3379 ulong_t ppages; 3380 page_t *targ_pplist = NULL; 3381 page_t *repl_pplist = NULL; 3382 page_t *tmp_pplist; 3383 int nios = 0; 3384 uint_t pszc; 3385 struct vattr va; 3386 3387 VM_STAT_ADD(segvnvmstats.fill_vp_pages[0]); 3388 3389 ASSERT(szc != 0); 3390 ASSERT(pplist->p_szc == szc); 3391 3392 /* 3393 * downsize will be set to 1 only if we fail to lock pages. this will 3394 * allow subsequent faults to try to relocate the page again. If we 3395 * fail due to misalignment don't downsize and let the caller map the 3396 * whole region with small mappings to avoid more faults into the area 3397 * where we can't get large pages anyway. 3398 */ 3399 *downsize = 0; 3400 3401 while (off < eoff) { 3402 newpp = pplist; 3403 ASSERT(newpp != NULL); 3404 ASSERT(PAGE_EXCL(newpp)); 3405 ASSERT(!PP_ISFREE(newpp)); 3406 /* 3407 * we pass NULL for nrelocp to page_lookup_create() 3408 * so that it doesn't relocate. We relocate here 3409 * later only after we make sure we can lock all 3410 * pages in the range we handle and they are all 3411 * aligned. 3412 */ 3413 pp = page_lookup_create(vp, off, SE_SHARED, newpp, NULL, 0); 3414 ASSERT(pp != NULL); 3415 ASSERT(!PP_ISFREE(pp)); 3416 ASSERT(pp->p_vnode == vp); 3417 ASSERT(pp->p_offset == off); 3418 if (pp == newpp) { 3419 VM_STAT_ADD(segvnvmstats.fill_vp_pages[1]); 3420 page_sub(&pplist, pp); 3421 ASSERT(PAGE_EXCL(pp)); 3422 ASSERT(page_iolock_assert(pp)); 3423 page_list_concat(&io_pplist, &pp); 3424 off += PAGESIZE; 3425 continue; 3426 } 3427 VM_STAT_ADD(segvnvmstats.fill_vp_pages[2]); 3428 pfn = page_pptonum(pp); 3429 pszc = pp->p_szc; 3430 if (pszc >= szc && targ_pplist == NULL && io_pplist == NULL && 3431 IS_P2ALIGNED(pfn, pages)) { 3432 ASSERT(repl_pplist == NULL); 3433 ASSERT(done_pplist == NULL); 3434 ASSERT(pplist == *ppplist); 3435 page_unlock(pp); 3436 page_free_replacement_page(pplist); 3437 page_create_putback(pages); 3438 *ppplist = NULL; 3439 VM_STAT_ADD(segvnvmstats.fill_vp_pages[3]); 3440 return (1); 3441 } 3442 if (pszc >= szc) { 3443 page_unlock(pp); 3444 segvn_faultvnmpss_align_err1++; 3445 goto out; 3446 } 3447 ppages = page_get_pagecnt(pszc); 3448 if (!IS_P2ALIGNED(pfn, ppages)) { 3449 ASSERT(pszc > 0); 3450 /* 3451 * sizing down to pszc won't help. 3452 */ 3453 page_unlock(pp); 3454 segvn_faultvnmpss_align_err2++; 3455 goto out; 3456 } 3457 pfn = page_pptonum(newpp); 3458 if (!IS_P2ALIGNED(pfn, ppages)) { 3459 ASSERT(pszc > 0); 3460 /* 3461 * sizing down to pszc won't help. 
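 *
 * (Here it is the replacement page that is misaligned with respect to
 * the existing page's size, so retrying the fault at the smaller pszc
 * would hit the same problem; the caller is left to map this range
 * with small pages rather than being told to downsize.)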
3462 */ 3463 page_unlock(pp); 3464 segvn_faultvnmpss_align_err3++; 3465 goto out; 3466 } 3467 if (!PAGE_EXCL(pp)) { 3468 VM_STAT_ADD(segvnvmstats.fill_vp_pages[4]); 3469 page_unlock(pp); 3470 *downsize = 1; 3471 *ret_pszc = pp->p_szc; 3472 goto out; 3473 } 3474 targpp = pp; 3475 if (io_pplist != NULL) { 3476 VM_STAT_ADD(segvnvmstats.fill_vp_pages[5]); 3477 io_len = off - io_off; 3478 /* 3479 * Some file systems like NFS don't check EOF 3480 * conditions in VOP_PAGEIO(). Check it here 3481 * now that pages are locked SE_EXCL. Any file 3482 * truncation will wait until the pages are 3483 * unlocked so no need to worry that file will 3484 * be truncated after we check its size here. 3485 * XXX fix NFS to remove this check. 3486 */ 3487 va.va_mask = AT_SIZE; 3488 if (VOP_GETATTR(vp, &va, ATTR_HINT, svd->cred, NULL)) { 3489 VM_STAT_ADD(segvnvmstats.fill_vp_pages[6]); 3490 page_unlock(targpp); 3491 goto out; 3492 } 3493 if (btopr(va.va_size) < btopr(io_off + io_len)) { 3494 VM_STAT_ADD(segvnvmstats.fill_vp_pages[7]); 3495 *downsize = 1; 3496 *ret_pszc = 0; 3497 page_unlock(targpp); 3498 goto out; 3499 } 3500 io_err = VOP_PAGEIO(vp, io_pplist, io_off, io_len, 3501 B_READ, svd->cred, NULL); 3502 if (io_err) { 3503 VM_STAT_ADD(segvnvmstats.fill_vp_pages[8]); 3504 page_unlock(targpp); 3505 if (io_err == EDEADLK) { 3506 segvn_vmpss_pageio_deadlk_err++; 3507 } 3508 goto out; 3509 } 3510 nios++; 3511 VM_STAT_ADD(segvnvmstats.fill_vp_pages[9]); 3512 while (io_pplist != NULL) { 3513 pp = io_pplist; 3514 page_sub(&io_pplist, pp); 3515 ASSERT(page_iolock_assert(pp)); 3516 page_io_unlock(pp); 3517 pgidx = (pp->p_offset - start_off) >> 3518 PAGESHIFT; 3519 ASSERT(pgidx < pages); 3520 ppa[pgidx] = pp; 3521 page_list_concat(&done_pplist, &pp); 3522 } 3523 } 3524 pp = targpp; 3525 ASSERT(PAGE_EXCL(pp)); 3526 ASSERT(pp->p_szc <= pszc); 3527 if (pszc != 0 && !group_page_trylock(pp, SE_EXCL)) { 3528 VM_STAT_ADD(segvnvmstats.fill_vp_pages[10]); 3529 page_unlock(pp); 3530 *downsize = 1; 3531 *ret_pszc = pp->p_szc; 3532 goto out; 3533 } 3534 VM_STAT_ADD(segvnvmstats.fill_vp_pages[11]); 3535 /* 3536 * page szc chould have changed before the entire group was 3537 * locked. reread page szc. 
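 *
 * (That is, between the earlier read of pp->p_szc and the successful
 * group_page_trylock() another thread may have promoted or demoted
 * the page, so pszc and ppages are recomputed now that the group is
 * locked.)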
3538 */ 3539 pszc = pp->p_szc; 3540 ppages = page_get_pagecnt(pszc); 3541 3542 /* link just the roots */ 3543 page_list_concat(&targ_pplist, &pp); 3544 page_sub(&pplist, newpp); 3545 page_list_concat(&repl_pplist, &newpp); 3546 off += PAGESIZE; 3547 while (--ppages != 0) { 3548 newpp = pplist; 3549 page_sub(&pplist, newpp); 3550 off += PAGESIZE; 3551 } 3552 io_off = off; 3553 } 3554 if (io_pplist != NULL) { 3555 VM_STAT_ADD(segvnvmstats.fill_vp_pages[12]); 3556 io_len = eoff - io_off; 3557 va.va_mask = AT_SIZE; 3558 if (VOP_GETATTR(vp, &va, ATTR_HINT, svd->cred, NULL) != 0) { 3559 VM_STAT_ADD(segvnvmstats.fill_vp_pages[13]); 3560 goto out; 3561 } 3562 if (btopr(va.va_size) < btopr(io_off + io_len)) { 3563 VM_STAT_ADD(segvnvmstats.fill_vp_pages[14]); 3564 *downsize = 1; 3565 *ret_pszc = 0; 3566 goto out; 3567 } 3568 io_err = VOP_PAGEIO(vp, io_pplist, io_off, io_len, 3569 B_READ, svd->cred, NULL); 3570 if (io_err) { 3571 VM_STAT_ADD(segvnvmstats.fill_vp_pages[15]); 3572 if (io_err == EDEADLK) { 3573 segvn_vmpss_pageio_deadlk_err++; 3574 } 3575 goto out; 3576 } 3577 nios++; 3578 while (io_pplist != NULL) { 3579 pp = io_pplist; 3580 page_sub(&io_pplist, pp); 3581 ASSERT(page_iolock_assert(pp)); 3582 page_io_unlock(pp); 3583 pgidx = (pp->p_offset - start_off) >> PAGESHIFT; 3584 ASSERT(pgidx < pages); 3585 ppa[pgidx] = pp; 3586 } 3587 } 3588 /* 3589 * we're now bound to succeed or panic. 3590 * remove pages from done_pplist. it's not needed anymore. 3591 */ 3592 while (done_pplist != NULL) { 3593 pp = done_pplist; 3594 page_sub(&done_pplist, pp); 3595 } 3596 VM_STAT_ADD(segvnvmstats.fill_vp_pages[16]); 3597 ASSERT(pplist == NULL); 3598 *ppplist = NULL; 3599 while (targ_pplist != NULL) { 3600 int ret; 3601 VM_STAT_ADD(segvnvmstats.fill_vp_pages[17]); 3602 ASSERT(repl_pplist); 3603 pp = targ_pplist; 3604 page_sub(&targ_pplist, pp); 3605 pgidx = (pp->p_offset - start_off) >> PAGESHIFT; 3606 newpp = repl_pplist; 3607 page_sub(&repl_pplist, newpp); 3608 #ifdef DEBUG 3609 pfn = page_pptonum(pp); 3610 pszc = pp->p_szc; 3611 ppages = page_get_pagecnt(pszc); 3612 ASSERT(IS_P2ALIGNED(pfn, ppages)); 3613 pfn = page_pptonum(newpp); 3614 ASSERT(IS_P2ALIGNED(pfn, ppages)); 3615 ASSERT(P2PHASE(pfn, pages) == pgidx); 3616 #endif 3617 nreloc = 0; 3618 ret = page_relocate(&pp, &newpp, 0, 1, &nreloc, NULL); 3619 if (ret != 0 || nreloc == 0) { 3620 panic("segvn_fill_vp_pages: " 3621 "page_relocate failed"); 3622 } 3623 pp = newpp; 3624 while (nreloc-- != 0) { 3625 ASSERT(PAGE_EXCL(pp)); 3626 ASSERT(pp->p_vnode == vp); 3627 ASSERT(pgidx == 3628 ((pp->p_offset - start_off) >> PAGESHIFT)); 3629 ppa[pgidx++] = pp; 3630 pp++; 3631 } 3632 } 3633 3634 if (svd->type == MAP_PRIVATE) { 3635 VM_STAT_ADD(segvnvmstats.fill_vp_pages[18]); 3636 for (i = 0; i < pages; i++) { 3637 ASSERT(ppa[i] != NULL); 3638 ASSERT(PAGE_EXCL(ppa[i])); 3639 ASSERT(ppa[i]->p_vnode == vp); 3640 ASSERT(ppa[i]->p_offset == 3641 start_off + (i << PAGESHIFT)); 3642 page_downgrade(ppa[i]); 3643 } 3644 ppa[pages] = NULL; 3645 } else { 3646 VM_STAT_ADD(segvnvmstats.fill_vp_pages[19]); 3647 /* 3648 * the caller will still call VOP_GETPAGE() for shared segments 3649 * to check FS write permissions. For private segments we map 3650 * file read only anyway. so no VOP_GETPAGE is needed. 
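 *
 * Hence the two different ppa termination conventions used here:
 * private segments keep the pages locked (downgraded to shared) and
 * NULL-terminate the array at ppa[pages], while shared segments drop
 * the page locks and reset ppa[0] to NULL so the caller knows it must
 * refill the array via VOP_GETPAGE().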
3651 */ 3652 for (i = 0; i < pages; i++) { 3653 ASSERT(ppa[i] != NULL); 3654 ASSERT(PAGE_EXCL(ppa[i])); 3655 ASSERT(ppa[i]->p_vnode == vp); 3656 ASSERT(ppa[i]->p_offset == 3657 start_off + (i << PAGESHIFT)); 3658 page_unlock(ppa[i]); 3659 } 3660 ppa[0] = NULL; 3661 } 3662 3663 return (1); 3664 out: 3665 /* 3666 * Do the cleanup. Unlock target pages we didn't relocate. They are 3667 * linked on targ_pplist by root pages. reassemble unused replacement 3668 * and io pages back to pplist. 3669 */ 3670 if (io_pplist != NULL) { 3671 VM_STAT_ADD(segvnvmstats.fill_vp_pages[20]); 3672 pp = io_pplist; 3673 do { 3674 ASSERT(pp->p_vnode == vp); 3675 ASSERT(pp->p_offset == io_off); 3676 ASSERT(page_iolock_assert(pp)); 3677 page_io_unlock(pp); 3678 page_hashout(pp, NULL); 3679 io_off += PAGESIZE; 3680 } while ((pp = pp->p_next) != io_pplist); 3681 page_list_concat(&io_pplist, &pplist); 3682 pplist = io_pplist; 3683 } 3684 tmp_pplist = NULL; 3685 while (targ_pplist != NULL) { 3686 VM_STAT_ADD(segvnvmstats.fill_vp_pages[21]); 3687 pp = targ_pplist; 3688 ASSERT(PAGE_EXCL(pp)); 3689 page_sub(&targ_pplist, pp); 3690 3691 pszc = pp->p_szc; 3692 ppages = page_get_pagecnt(pszc); 3693 ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages)); 3694 3695 if (pszc != 0) { 3696 group_page_unlock(pp); 3697 } 3698 page_unlock(pp); 3699 3700 pp = repl_pplist; 3701 ASSERT(pp != NULL); 3702 ASSERT(PAGE_EXCL(pp)); 3703 ASSERT(pp->p_szc == szc); 3704 page_sub(&repl_pplist, pp); 3705 3706 ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages)); 3707 3708 /* relink replacement page */ 3709 page_list_concat(&tmp_pplist, &pp); 3710 while (--ppages != 0) { 3711 VM_STAT_ADD(segvnvmstats.fill_vp_pages[22]); 3712 pp++; 3713 ASSERT(PAGE_EXCL(pp)); 3714 ASSERT(pp->p_szc == szc); 3715 page_list_concat(&tmp_pplist, &pp); 3716 } 3717 } 3718 if (tmp_pplist != NULL) { 3719 VM_STAT_ADD(segvnvmstats.fill_vp_pages[23]); 3720 page_list_concat(&tmp_pplist, &pplist); 3721 pplist = tmp_pplist; 3722 } 3723 /* 3724 * at this point all pages are either on done_pplist or 3725 * pplist. They can't be all on done_pplist otherwise 3726 * we'd've been done. 3727 */ 3728 ASSERT(pplist != NULL); 3729 if (nios != 0) { 3730 VM_STAT_ADD(segvnvmstats.fill_vp_pages[24]); 3731 pp = pplist; 3732 do { 3733 VM_STAT_ADD(segvnvmstats.fill_vp_pages[25]); 3734 ASSERT(pp->p_szc == szc); 3735 ASSERT(PAGE_EXCL(pp)); 3736 ASSERT(pp->p_vnode != vp); 3737 pp->p_szc = 0; 3738 } while ((pp = pp->p_next) != pplist); 3739 3740 pp = done_pplist; 3741 do { 3742 VM_STAT_ADD(segvnvmstats.fill_vp_pages[26]); 3743 ASSERT(pp->p_szc == szc); 3744 ASSERT(PAGE_EXCL(pp)); 3745 ASSERT(pp->p_vnode == vp); 3746 pp->p_szc = 0; 3747 } while ((pp = pp->p_next) != done_pplist); 3748 3749 while (pplist != NULL) { 3750 VM_STAT_ADD(segvnvmstats.fill_vp_pages[27]); 3751 pp = pplist; 3752 page_sub(&pplist, pp); 3753 page_free(pp, 0); 3754 } 3755 3756 while (done_pplist != NULL) { 3757 VM_STAT_ADD(segvnvmstats.fill_vp_pages[28]); 3758 pp = done_pplist; 3759 page_sub(&done_pplist, pp); 3760 page_unlock(pp); 3761 } 3762 *ppplist = NULL; 3763 return (0); 3764 } 3765 ASSERT(pplist == *ppplist); 3766 if (io_err) { 3767 VM_STAT_ADD(segvnvmstats.fill_vp_pages[29]); 3768 /* 3769 * don't downsize on io error. 3770 * see if vop_getpage succeeds. 3771 * pplist may still be used in this case 3772 * for relocations. 
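 *
 * (Returning 0 here without setting *downsize makes the caller retry
 * the same large page size through VOP_GETPAGE(); since *ppplist is
 * left intact, the preallocated replacement pages remain available
 * for a later relocation attempt.)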
3773 */ 3774 return (0); 3775 } 3776 VM_STAT_ADD(segvnvmstats.fill_vp_pages[30]); 3777 page_free_replacement_page(pplist); 3778 page_create_putback(pages); 3779 *ppplist = NULL; 3780 return (0); 3781 } 3782 3783 int segvn_anypgsz = 0; 3784 3785 #define SEGVN_RESTORE_SOFTLOCK_VP(type, pages) \ 3786 if ((type) == F_SOFTLOCK) { \ 3787 atomic_add_long((ulong_t *)&(svd)->softlockcnt, \ 3788 -(pages)); \ 3789 } 3790 3791 #define SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot) \ 3792 if (IS_VMODSORT((ppa)[0]->p_vnode)) { \ 3793 if ((rw) == S_WRITE) { \ 3794 for (i = 0; i < (pages); i++) { \ 3795 ASSERT((ppa)[i]->p_vnode == \ 3796 (ppa)[0]->p_vnode); \ 3797 hat_setmod((ppa)[i]); \ 3798 } \ 3799 } else if ((rw) != S_OTHER && \ 3800 ((prot) & (vpprot) & PROT_WRITE)) { \ 3801 for (i = 0; i < (pages); i++) { \ 3802 ASSERT((ppa)[i]->p_vnode == \ 3803 (ppa)[0]->p_vnode); \ 3804 if (!hat_ismod((ppa)[i])) { \ 3805 prot &= ~PROT_WRITE; \ 3806 break; \ 3807 } \ 3808 } \ 3809 } \ 3810 } 3811 3812 #ifdef VM_STATS 3813 3814 #define SEGVN_VMSTAT_FLTVNPAGES(idx) \ 3815 VM_STAT_ADD(segvnvmstats.fltvnpages[(idx)]); 3816 3817 #else /* VM_STATS */ 3818 3819 #define SEGVN_VMSTAT_FLTVNPAGES(idx) 3820 3821 #endif 3822 3823 static faultcode_t 3824 segvn_fault_vnodepages(struct hat *hat, struct seg *seg, caddr_t lpgaddr, 3825 caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr, 3826 caddr_t eaddr, int brkcow) 3827 { 3828 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 3829 struct anon_map *amp = svd->amp; 3830 uchar_t segtype = svd->type; 3831 uint_t szc = seg->s_szc; 3832 size_t pgsz = page_get_pagesize(szc); 3833 size_t maxpgsz = pgsz; 3834 pgcnt_t pages = btop(pgsz); 3835 pgcnt_t maxpages = pages; 3836 size_t ppasize = (pages + 1) * sizeof (page_t *); 3837 caddr_t a = lpgaddr; 3838 caddr_t maxlpgeaddr = lpgeaddr; 3839 u_offset_t off = svd->offset + (uintptr_t)(a - seg->s_base); 3840 ulong_t aindx = svd->anon_index + seg_page(seg, a); 3841 struct vpage *vpage = (svd->vpage != NULL) ? 3842 &svd->vpage[seg_page(seg, a)] : NULL; 3843 vnode_t *vp = svd->vp; 3844 page_t **ppa; 3845 uint_t pszc; 3846 size_t ppgsz; 3847 pgcnt_t ppages; 3848 faultcode_t err = 0; 3849 int ierr; 3850 int vop_size_err = 0; 3851 uint_t protchk, prot, vpprot; 3852 ulong_t i; 3853 int hat_flag = (type == F_SOFTLOCK) ? 
HAT_LOAD_LOCK : HAT_LOAD; 3854 anon_sync_obj_t an_cookie; 3855 enum seg_rw arw; 3856 int alloc_failed = 0; 3857 int adjszc_chk; 3858 struct vattr va; 3859 int xhat = 0; 3860 page_t *pplist; 3861 pfn_t pfn; 3862 int physcontig; 3863 int upgrdfail; 3864 int segvn_anypgsz_vnode = 0; /* for now map vnode with 2 page sizes */ 3865 int tron = (svd->tr_state == SEGVN_TR_ON); 3866 3867 ASSERT(szc != 0); 3868 ASSERT(vp != NULL); 3869 ASSERT(brkcow == 0 || amp != NULL); 3870 ASSERT(tron == 0 || amp != NULL); 3871 ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */ 3872 ASSERT(!(svd->flags & MAP_NORESERVE)); 3873 ASSERT(type != F_SOFTUNLOCK); 3874 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 3875 ASSERT(amp == NULL || IS_P2ALIGNED(aindx, maxpages)); 3876 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 3877 ASSERT(seg->s_szc < NBBY * sizeof (int)); 3878 ASSERT(type != F_SOFTLOCK || lpgeaddr - a == maxpgsz); 3879 ASSERT(svd->tr_state != SEGVN_TR_INIT); 3880 3881 VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltvnpages[0]); 3882 VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltvnpages[1]); 3883 3884 if (svd->flags & MAP_TEXT) { 3885 hat_flag |= HAT_LOAD_TEXT; 3886 } 3887 3888 if (svd->pageprot) { 3889 switch (rw) { 3890 case S_READ: 3891 protchk = PROT_READ; 3892 break; 3893 case S_WRITE: 3894 protchk = PROT_WRITE; 3895 break; 3896 case S_EXEC: 3897 protchk = PROT_EXEC; 3898 break; 3899 case S_OTHER: 3900 default: 3901 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 3902 break; 3903 } 3904 } else { 3905 prot = svd->prot; 3906 /* caller has already done segment level protection check. */ 3907 } 3908 3909 if (seg->s_as->a_hat != hat) { 3910 xhat = 1; 3911 } 3912 3913 if (rw == S_WRITE && segtype == MAP_PRIVATE) { 3914 SEGVN_VMSTAT_FLTVNPAGES(2); 3915 arw = S_READ; 3916 } else { 3917 arw = rw; 3918 } 3919 3920 ppa = kmem_alloc(ppasize, KM_SLEEP); 3921 3922 VM_STAT_COND_ADD(amp != NULL, segvnvmstats.fltvnpages[3]); 3923 3924 for (;;) { 3925 adjszc_chk = 0; 3926 for (; a < lpgeaddr; a += pgsz, off += pgsz, aindx += pages) { 3927 if (adjszc_chk) { 3928 while (szc < seg->s_szc) { 3929 uintptr_t e; 3930 uint_t tszc; 3931 tszc = segvn_anypgsz_vnode ? 
szc + 1 : 3932 seg->s_szc; 3933 ppgsz = page_get_pagesize(tszc); 3934 if (!IS_P2ALIGNED(a, ppgsz) || 3935 ((alloc_failed >> tszc) & 0x1)) { 3936 break; 3937 } 3938 SEGVN_VMSTAT_FLTVNPAGES(4); 3939 szc = tszc; 3940 pgsz = ppgsz; 3941 pages = btop(pgsz); 3942 e = P2ROUNDUP((uintptr_t)eaddr, pgsz); 3943 lpgeaddr = (caddr_t)e; 3944 } 3945 } 3946 3947 again: 3948 if (IS_P2ALIGNED(a, maxpgsz) && amp != NULL) { 3949 ASSERT(IS_P2ALIGNED(aindx, maxpages)); 3950 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 3951 anon_array_enter(amp, aindx, &an_cookie); 3952 if (anon_get_ptr(amp->ahp, aindx) != NULL) { 3953 SEGVN_VMSTAT_FLTVNPAGES(5); 3954 ASSERT(anon_pages(amp->ahp, aindx, 3955 maxpages) == maxpages); 3956 anon_array_exit(&an_cookie); 3957 ANON_LOCK_EXIT(&->a_rwlock); 3958 err = segvn_fault_anonpages(hat, seg, 3959 a, a + maxpgsz, type, rw, 3960 MAX(a, addr), 3961 MIN(a + maxpgsz, eaddr), brkcow); 3962 if (err != 0) { 3963 SEGVN_VMSTAT_FLTVNPAGES(6); 3964 goto out; 3965 } 3966 if (szc < seg->s_szc) { 3967 szc = seg->s_szc; 3968 pgsz = maxpgsz; 3969 pages = maxpages; 3970 lpgeaddr = maxlpgeaddr; 3971 } 3972 goto next; 3973 } else { 3974 ASSERT(anon_pages(amp->ahp, aindx, 3975 maxpages) == 0); 3976 SEGVN_VMSTAT_FLTVNPAGES(7); 3977 anon_array_exit(&an_cookie); 3978 ANON_LOCK_EXIT(&->a_rwlock); 3979 } 3980 } 3981 ASSERT(!brkcow || IS_P2ALIGNED(a, maxpgsz)); 3982 ASSERT(!tron || IS_P2ALIGNED(a, maxpgsz)); 3983 3984 if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) { 3985 ASSERT(vpage != NULL); 3986 prot = VPP_PROT(vpage); 3987 ASSERT(sameprot(seg, a, maxpgsz)); 3988 if ((prot & protchk) == 0) { 3989 SEGVN_VMSTAT_FLTVNPAGES(8); 3990 err = FC_PROT; 3991 goto out; 3992 } 3993 } 3994 if (type == F_SOFTLOCK) { 3995 atomic_add_long((ulong_t *)&svd->softlockcnt, 3996 pages); 3997 } 3998 3999 pplist = NULL; 4000 physcontig = 0; 4001 ppa[0] = NULL; 4002 if (!brkcow && !tron && szc && 4003 !page_exists_physcontig(vp, off, szc, 4004 segtype == MAP_PRIVATE ? 
ppa : NULL)) { 4005 SEGVN_VMSTAT_FLTVNPAGES(9); 4006 if (page_alloc_pages(vp, seg, a, &pplist, NULL, 4007 szc, 0, 0) && type != F_SOFTLOCK) { 4008 SEGVN_VMSTAT_FLTVNPAGES(10); 4009 pszc = 0; 4010 ierr = -1; 4011 alloc_failed |= (1 << szc); 4012 break; 4013 } 4014 if (pplist != NULL && 4015 vp->v_mpssdata == SEGVN_PAGEIO) { 4016 int downsize; 4017 SEGVN_VMSTAT_FLTVNPAGES(11); 4018 physcontig = segvn_fill_vp_pages(svd, 4019 vp, off, szc, ppa, &pplist, 4020 &pszc, &downsize); 4021 ASSERT(!physcontig || pplist == NULL); 4022 if (!physcontig && downsize && 4023 type != F_SOFTLOCK) { 4024 ASSERT(pplist == NULL); 4025 SEGVN_VMSTAT_FLTVNPAGES(12); 4026 ierr = -1; 4027 break; 4028 } 4029 ASSERT(!physcontig || 4030 segtype == MAP_PRIVATE || 4031 ppa[0] == NULL); 4032 if (physcontig && ppa[0] == NULL) { 4033 physcontig = 0; 4034 } 4035 } 4036 } else if (!brkcow && !tron && szc && ppa[0] != NULL) { 4037 SEGVN_VMSTAT_FLTVNPAGES(13); 4038 ASSERT(segtype == MAP_PRIVATE); 4039 physcontig = 1; 4040 } 4041 4042 if (!physcontig) { 4043 SEGVN_VMSTAT_FLTVNPAGES(14); 4044 ppa[0] = NULL; 4045 ierr = VOP_GETPAGE(vp, (offset_t)off, pgsz, 4046 &vpprot, ppa, pgsz, seg, a, arw, 4047 svd->cred, NULL); 4048 #ifdef DEBUG 4049 if (ierr == 0) { 4050 for (i = 0; i < pages; i++) { 4051 ASSERT(PAGE_LOCKED(ppa[i])); 4052 ASSERT(!PP_ISFREE(ppa[i])); 4053 ASSERT(ppa[i]->p_vnode == vp); 4054 ASSERT(ppa[i]->p_offset == 4055 off + (i << PAGESHIFT)); 4056 } 4057 } 4058 #endif /* DEBUG */ 4059 if (segtype == MAP_PRIVATE) { 4060 SEGVN_VMSTAT_FLTVNPAGES(15); 4061 vpprot &= ~PROT_WRITE; 4062 } 4063 } else { 4064 ASSERT(segtype == MAP_PRIVATE); 4065 SEGVN_VMSTAT_FLTVNPAGES(16); 4066 vpprot = PROT_ALL & ~PROT_WRITE; 4067 ierr = 0; 4068 } 4069 4070 if (ierr != 0) { 4071 SEGVN_VMSTAT_FLTVNPAGES(17); 4072 if (pplist != NULL) { 4073 SEGVN_VMSTAT_FLTVNPAGES(18); 4074 page_free_replacement_page(pplist); 4075 page_create_putback(pages); 4076 } 4077 SEGVN_RESTORE_SOFTLOCK_VP(type, pages); 4078 if (a + pgsz <= eaddr) { 4079 SEGVN_VMSTAT_FLTVNPAGES(19); 4080 err = FC_MAKE_ERR(ierr); 4081 goto out; 4082 } 4083 va.va_mask = AT_SIZE; 4084 if (VOP_GETATTR(vp, &va, 0, svd->cred, NULL)) { 4085 SEGVN_VMSTAT_FLTVNPAGES(20); 4086 err = FC_MAKE_ERR(EIO); 4087 goto out; 4088 } 4089 if (btopr(va.va_size) >= btopr(off + pgsz)) { 4090 SEGVN_VMSTAT_FLTVNPAGES(21); 4091 err = FC_MAKE_ERR(ierr); 4092 goto out; 4093 } 4094 if (btopr(va.va_size) < 4095 btopr(off + (eaddr - a))) { 4096 SEGVN_VMSTAT_FLTVNPAGES(22); 4097 err = FC_MAKE_ERR(ierr); 4098 goto out; 4099 } 4100 if (brkcow || tron || type == F_SOFTLOCK) { 4101 /* can't reduce map area */ 4102 SEGVN_VMSTAT_FLTVNPAGES(23); 4103 vop_size_err = 1; 4104 goto out; 4105 } 4106 SEGVN_VMSTAT_FLTVNPAGES(24); 4107 ASSERT(szc != 0); 4108 pszc = 0; 4109 ierr = -1; 4110 break; 4111 } 4112 4113 if (amp != NULL) { 4114 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 4115 anon_array_enter(amp, aindx, &an_cookie); 4116 } 4117 if (amp != NULL && 4118 anon_get_ptr(amp->ahp, aindx) != NULL) { 4119 ulong_t taindx = P2ALIGN(aindx, maxpages); 4120 4121 SEGVN_VMSTAT_FLTVNPAGES(25); 4122 ASSERT(anon_pages(amp->ahp, taindx, 4123 maxpages) == maxpages); 4124 for (i = 0; i < pages; i++) { 4125 page_unlock(ppa[i]); 4126 } 4127 anon_array_exit(&an_cookie); 4128 ANON_LOCK_EXIT(&->a_rwlock); 4129 if (pplist != NULL) { 4130 page_free_replacement_page(pplist); 4131 page_create_putback(pages); 4132 } 4133 SEGVN_RESTORE_SOFTLOCK_VP(type, pages); 4134 if (szc < seg->s_szc) { 4135 SEGVN_VMSTAT_FLTVNPAGES(26); 4136 /* 4137 * For private segments SOFTLOCK 
                    * either always breaks cow (any rw
                    * type except S_READ_NOCOW) or the
                    * address space is locked as writer
                    * (S_READ_NOCOW case) and anon slots
                    * can't show up on the second check.
                    * Therefore if we are here for the
                    * SOFTLOCK case it must be a cow
                    * break, but a cow break never reduces
                    * szc.  Text replication (tron) in
                    * this case works as a cow break.
                    * Thus the assert below.
                    */
                    ASSERT(!brkcow && !tron &&
                        type != F_SOFTLOCK);
                    pszc = seg->s_szc;
                    ierr = -2;
                    break;
                }
                ASSERT(IS_P2ALIGNED(a, maxpgsz));
                goto again;
            }
#ifdef DEBUG
            if (amp != NULL) {
                ulong_t taindx = P2ALIGN(aindx, maxpages);
                ASSERT(!anon_pages(amp->ahp, taindx, maxpages));
            }
#endif /* DEBUG */

            if (brkcow || tron) {
                ASSERT(amp != NULL);
                ASSERT(pplist == NULL);
                ASSERT(szc == seg->s_szc);
                ASSERT(IS_P2ALIGNED(a, maxpgsz));
                ASSERT(IS_P2ALIGNED(aindx, maxpages));
                SEGVN_VMSTAT_FLTVNPAGES(27);
                ierr = anon_map_privatepages(amp, aindx, szc,
                    seg, a, prot, ppa, vpage, segvn_anypgsz,
                    tron ? PG_LOCAL : 0, svd->cred);
                if (ierr != 0) {
                    SEGVN_VMSTAT_FLTVNPAGES(28);
                    anon_array_exit(&an_cookie);
                    ANON_LOCK_EXIT(&amp->a_rwlock);
                    SEGVN_RESTORE_SOFTLOCK_VP(type, pages);
                    err = FC_MAKE_ERR(ierr);
                    goto out;
                }

                ASSERT(!IS_VMODSORT(ppa[0]->p_vnode));
                /*
                 * p_szc can't be changed for locked
                 * swapfs pages.
                 */
                ASSERT(svd->rcookie ==
                    HAT_INVALID_REGION_COOKIE);
                hat_memload_array(hat, a, pgsz, ppa, prot,
                    hat_flag);

                if (!(hat_flag & HAT_LOAD_LOCK)) {
                    SEGVN_VMSTAT_FLTVNPAGES(29);
                    for (i = 0; i < pages; i++) {
                        page_unlock(ppa[i]);
                    }
                }
                anon_array_exit(&an_cookie);
                ANON_LOCK_EXIT(&amp->a_rwlock);
                goto next;
            }

            ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE ||
                (!svd->pageprot && svd->prot == (prot & vpprot)));

            pfn = page_pptonum(ppa[0]);
            /*
             * hat_page_demote() needs an SE_EXCL lock on one of the
             * constituent page_t's and it decreases the root's p_szc
             * last.  This means that if the root's p_szc is equal to
             * szc and all its constituent pages are locked, any
             * hat_page_demote() that could have changed p_szc to szc
             * is already done and no new hat_page_demote() can start
             * for this large page.
             */

            /*
             * We need to make sure the same mapping size is used for
             * the same address range if there's a possibility the
             * address is already mapped, because the hat layer panics
             * when a translation is loaded for a range that is already
             * mapped with a different page size.  We achieve this by
             * always using the largest page size possible subject to
             * the constraints of page size, segment page size and page
             * alignment.  Since mappings are invalidated when those
             * constraints change and make it impossible to use a
             * previously used mapping size, no mapping size conflicts
             * should happen.
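             * For example, if an 8K translation were ever loaded into
             * a range still covered by a live 512K mapping, the hat
             * layer would panic on the overlap; recomputing the same
             * maximal size from the same constraints on every fault
             * avoids ever creating such a conflict.  (The 8K/512K
             * sizes here are only illustrative.)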
4232 */ 4233 4234 chkszc: 4235 if ((pszc = ppa[0]->p_szc) == szc && 4236 IS_P2ALIGNED(pfn, pages)) { 4237 4238 SEGVN_VMSTAT_FLTVNPAGES(30); 4239 #ifdef DEBUG 4240 for (i = 0; i < pages; i++) { 4241 ASSERT(PAGE_LOCKED(ppa[i])); 4242 ASSERT(!PP_ISFREE(ppa[i])); 4243 ASSERT(page_pptonum(ppa[i]) == 4244 pfn + i); 4245 ASSERT(ppa[i]->p_szc == szc); 4246 ASSERT(ppa[i]->p_vnode == vp); 4247 ASSERT(ppa[i]->p_offset == 4248 off + (i << PAGESHIFT)); 4249 } 4250 #endif /* DEBUG */ 4251 /* 4252 * All pages are of szc we need and they are 4253 * all locked so they can't change szc. load 4254 * translations. 4255 * 4256 * if page got promoted since last check 4257 * we don't need pplist. 4258 */ 4259 if (pplist != NULL) { 4260 page_free_replacement_page(pplist); 4261 page_create_putback(pages); 4262 } 4263 if (PP_ISMIGRATE(ppa[0])) { 4264 page_migrate(seg, a, ppa, pages); 4265 } 4266 SEGVN_UPDATE_MODBITS(ppa, pages, rw, 4267 prot, vpprot); 4268 if (!xhat) { 4269 hat_memload_array_region(hat, a, pgsz, 4270 ppa, prot & vpprot, hat_flag, 4271 svd->rcookie); 4272 } else { 4273 /* 4274 * avoid large xhat mappings to FS 4275 * pages so that hat_page_demote() 4276 * doesn't need to check for xhat 4277 * large mappings. 4278 * Don't use regions with xhats. 4279 */ 4280 for (i = 0; i < pages; i++) { 4281 hat_memload(hat, 4282 a + (i << PAGESHIFT), 4283 ppa[i], prot & vpprot, 4284 hat_flag); 4285 } 4286 } 4287 4288 if (!(hat_flag & HAT_LOAD_LOCK)) { 4289 for (i = 0; i < pages; i++) { 4290 page_unlock(ppa[i]); 4291 } 4292 } 4293 if (amp != NULL) { 4294 anon_array_exit(&an_cookie); 4295 ANON_LOCK_EXIT(&->a_rwlock); 4296 } 4297 goto next; 4298 } 4299 4300 /* 4301 * See if upsize is possible. 4302 */ 4303 if (pszc > szc && szc < seg->s_szc && 4304 (segvn_anypgsz_vnode || pszc >= seg->s_szc)) { 4305 pgcnt_t aphase; 4306 uint_t pszc1 = MIN(pszc, seg->s_szc); 4307 ppgsz = page_get_pagesize(pszc1); 4308 ppages = btop(ppgsz); 4309 aphase = btop(P2PHASE((uintptr_t)a, ppgsz)); 4310 4311 ASSERT(type != F_SOFTLOCK); 4312 4313 SEGVN_VMSTAT_FLTVNPAGES(31); 4314 if (aphase != P2PHASE(pfn, ppages)) { 4315 segvn_faultvnmpss_align_err4++; 4316 } else { 4317 SEGVN_VMSTAT_FLTVNPAGES(32); 4318 if (pplist != NULL) { 4319 page_t *pl = pplist; 4320 page_free_replacement_page(pl); 4321 page_create_putback(pages); 4322 } 4323 for (i = 0; i < pages; i++) { 4324 page_unlock(ppa[i]); 4325 } 4326 if (amp != NULL) { 4327 anon_array_exit(&an_cookie); 4328 ANON_LOCK_EXIT(&->a_rwlock); 4329 } 4330 pszc = pszc1; 4331 ierr = -2; 4332 break; 4333 } 4334 } 4335 4336 /* 4337 * check if we should use smallest mapping size. 4338 */ 4339 upgrdfail = 0; 4340 if (szc == 0 || xhat || 4341 (pszc >= szc && 4342 !IS_P2ALIGNED(pfn, pages)) || 4343 (pszc < szc && 4344 !segvn_full_szcpages(ppa, szc, &upgrdfail, 4345 &pszc))) { 4346 4347 if (upgrdfail && type != F_SOFTLOCK) { 4348 /* 4349 * segvn_full_szcpages failed to lock 4350 * all pages EXCL. Size down. 
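                     * (The break below with ierr == -1 returns to the
                     * outer loop, which retries this range with a
                     * smaller page size; roughly, a 4M attempt steps
                     * down to 512K or directly to 8K depending on
                     * segvn_anypgsz_vnode.  The sizes are illustrative.)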
                     */
                    ASSERT(pszc < szc);

                    SEGVN_VMSTAT_FLTVNPAGES(33);

                    if (pplist != NULL) {
                        page_t *pl = pplist;
                        page_free_replacement_page(pl);
                        page_create_putback(pages);
                    }

                    for (i = 0; i < pages; i++) {
                        page_unlock(ppa[i]);
                    }
                    if (amp != NULL) {
                        anon_array_exit(&an_cookie);
                        ANON_LOCK_EXIT(&amp->a_rwlock);
                    }
                    ierr = -1;
                    break;
                }
                if (szc != 0 && !xhat && !upgrdfail) {
                    segvn_faultvnmpss_align_err5++;
                }
                SEGVN_VMSTAT_FLTVNPAGES(34);
                if (pplist != NULL) {
                    page_free_replacement_page(pplist);
                    page_create_putback(pages);
                }
                SEGVN_UPDATE_MODBITS(ppa, pages, rw,
                    prot, vpprot);
                if (upgrdfail && segvn_anypgsz_vnode) {
                    /* SOFTLOCK case */
                    hat_memload_array_region(hat, a, pgsz,
                        ppa, prot & vpprot, hat_flag,
                        svd->rcookie);
                } else {
                    for (i = 0; i < pages; i++) {
                        hat_memload_region(hat,
                            a + (i << PAGESHIFT),
                            ppa[i], prot & vpprot,
                            hat_flag, svd->rcookie);
                    }
                }
                if (!(hat_flag & HAT_LOAD_LOCK)) {
                    for (i = 0; i < pages; i++) {
                        page_unlock(ppa[i]);
                    }
                }
                if (amp != NULL) {
                    anon_array_exit(&an_cookie);
                    ANON_LOCK_EXIT(&amp->a_rwlock);
                }
                goto next;
            }

            if (pszc == szc) {
                /*
                 * segvn_full_szcpages() upgraded the pages' szc.
                 */
                ASSERT(pszc == ppa[0]->p_szc);
                ASSERT(IS_P2ALIGNED(pfn, pages));
                goto chkszc;
            }

            if (pszc > szc) {
                kmutex_t *szcmtx;
                SEGVN_VMSTAT_FLTVNPAGES(35);
                /*
                 * p_szc of ppa[0] can change since we haven't
                 * locked all constituent pages.  Call
                 * page_szc_lock() to prevent szc changes.
                 * This should be a rare case that happens when
                 * multiple segments use a different page size
                 * to map the same file offsets.
                 */
                szcmtx = page_szc_lock(ppa[0]);
                pszc = ppa[0]->p_szc;
                ASSERT(szcmtx != NULL || pszc == 0);
                ASSERT(ppa[0]->p_szc <= pszc);
                if (pszc <= szc) {
                    SEGVN_VMSTAT_FLTVNPAGES(36);
                    if (szcmtx != NULL) {
                        mutex_exit(szcmtx);
                    }
                    goto chkszc;
                }
                if (pplist != NULL) {
                    /*
                     * The page got promoted since the last
                     * check, so we don't need the
                     * preallocated large page.
                     */
                    SEGVN_VMSTAT_FLTVNPAGES(37);
                    page_free_replacement_page(pplist);
                    page_create_putback(pages);
                }
                SEGVN_UPDATE_MODBITS(ppa, pages, rw,
                    prot, vpprot);
                hat_memload_array_region(hat, a, pgsz, ppa,
                    prot & vpprot, hat_flag, svd->rcookie);
                mutex_exit(szcmtx);
                if (!(hat_flag & HAT_LOAD_LOCK)) {
                    for (i = 0; i < pages; i++) {
                        page_unlock(ppa[i]);
                    }
                }
                if (amp != NULL) {
                    anon_array_exit(&an_cookie);
                    ANON_LOCK_EXIT(&amp->a_rwlock);
                }
                goto next;
            }

            /*
             * If the page got demoted since the last check we may not
             * have allocated a larger page.  Allocate one now.
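             * (If this late allocation fails for a non-SOFTLOCK fault,
             * the code below breaks out with ierr == -1 and retries the
             * range with a smaller page size; the SOFTLOCK case instead
             * proceeds without a replacement page and loads the
             * constituent pages at whatever size they already have.)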
             */
            if (pplist == NULL &&
                page_alloc_pages(vp, seg, a, &pplist, NULL,
                szc, 0, 0) && type != F_SOFTLOCK) {
                SEGVN_VMSTAT_FLTVNPAGES(38);
                for (i = 0; i < pages; i++) {
                    page_unlock(ppa[i]);
                }
                if (amp != NULL) {
                    anon_array_exit(&an_cookie);
                    ANON_LOCK_EXIT(&amp->a_rwlock);
                }
                ierr = -1;
                alloc_failed |= (1 << szc);
                break;
            }

            SEGVN_VMSTAT_FLTVNPAGES(39);

            if (pplist != NULL) {
                segvn_relocate_pages(ppa, pplist);
#ifdef DEBUG
            } else {
                ASSERT(type == F_SOFTLOCK);
                SEGVN_VMSTAT_FLTVNPAGES(40);
#endif /* DEBUG */
            }

            SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot);

            if (pplist == NULL && segvn_anypgsz_vnode == 0) {
                ASSERT(type == F_SOFTLOCK);
                for (i = 0; i < pages; i++) {
                    ASSERT(ppa[i]->p_szc < szc);
                    hat_memload_region(hat,
                        a + (i << PAGESHIFT),
                        ppa[i], prot & vpprot, hat_flag,
                        svd->rcookie);
                }
            } else {
                ASSERT(pplist != NULL || type == F_SOFTLOCK);
                hat_memload_array_region(hat, a, pgsz, ppa,
                    prot & vpprot, hat_flag, svd->rcookie);
            }
            if (!(hat_flag & HAT_LOAD_LOCK)) {
                for (i = 0; i < pages; i++) {
                    ASSERT(PAGE_SHARED(ppa[i]));
                    page_unlock(ppa[i]);
                }
            }
            if (amp != NULL) {
                anon_array_exit(&an_cookie);
                ANON_LOCK_EXIT(&amp->a_rwlock);
            }

        next:
            if (vpage != NULL) {
                vpage += pages;
            }
            adjszc_chk = 1;
        }
        if (a == lpgeaddr)
            break;
        ASSERT(a < lpgeaddr);

        ASSERT(!brkcow && !tron && type != F_SOFTLOCK);

        /*
         * ierr == -1 means we failed to map with a large page
         * (either due to allocation/relocation failures or
         * misalignment with other mappings to this file).
         *
         * ierr == -2 means some other thread allocated a large page
         * after we gave up trying to map with a large page.  Retry
         * with a larger mapping.
         */
        ASSERT(ierr == -1 || ierr == -2);
        ASSERT(ierr == -2 || szc != 0);
        ASSERT(ierr == -1 || szc < seg->s_szc);
        if (ierr == -2) {
            SEGVN_VMSTAT_FLTVNPAGES(41);
            ASSERT(pszc > szc && pszc <= seg->s_szc);
            szc = pszc;
        } else if (segvn_anypgsz_vnode) {
            SEGVN_VMSTAT_FLTVNPAGES(42);
            szc--;
        } else {
            SEGVN_VMSTAT_FLTVNPAGES(43);
            ASSERT(pszc < szc);
            /*
             * Another process created a pszc large page, but we
             * still have to drop to szc 0.
             */
            szc = 0;
        }

        pgsz = page_get_pagesize(szc);
        pages = btop(pgsz);
        if (ierr == -2) {
            /*
             * Size up case.  Note lpgaddr may only be needed for
             * the softlock case so we don't adjust it here.
             */
            a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz);
            ASSERT(a >= lpgaddr);
            lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz);
            off = svd->offset + (uintptr_t)(a - seg->s_base);
            aindx = svd->anon_index + seg_page(seg, a);
            vpage = (svd->vpage != NULL) ?
                &svd->vpage[seg_page(seg, a)] : NULL;
        } else {
            /*
             * Size down case.  Note lpgaddr may only be needed for
             * the softlock case so we don't adjust it here.
             */
            ASSERT(IS_P2ALIGNED(a, pgsz));
            ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz));
            lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz);
            ASSERT(a < lpgeaddr);
            if (a < addr) {
                SEGVN_VMSTAT_FLTVNPAGES(44);
                /*
                 * The beginning of the large page region can
                 * be pulled to the right to make a smaller
                 * region.  We haven't yet faulted a single
                 * page.
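                 * (For example, when a fault at an address in the
                 * middle of the original large page region falls back
                 * to a smaller size, re-aligning the start to addr's
                 * smaller page boundary keeps the loop from faulting
                 * in pages below the range that was actually asked
                 * for.)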
4595 */ 4596 a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); 4597 ASSERT(a >= lpgaddr); 4598 off = svd->offset + 4599 (uintptr_t)(a - seg->s_base); 4600 aindx = svd->anon_index + seg_page(seg, a); 4601 vpage = (svd->vpage != NULL) ? 4602 &svd->vpage[seg_page(seg, a)] : NULL; 4603 } 4604 } 4605 } 4606 out: 4607 kmem_free(ppa, ppasize); 4608 if (!err && !vop_size_err) { 4609 SEGVN_VMSTAT_FLTVNPAGES(45); 4610 return (0); 4611 } 4612 if (type == F_SOFTLOCK && a > lpgaddr) { 4613 SEGVN_VMSTAT_FLTVNPAGES(46); 4614 segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER); 4615 } 4616 if (!vop_size_err) { 4617 SEGVN_VMSTAT_FLTVNPAGES(47); 4618 return (err); 4619 } 4620 ASSERT(brkcow || tron || type == F_SOFTLOCK); 4621 /* 4622 * Large page end is mapped beyond the end of file and it's a cow 4623 * fault (can be a text replication induced cow) or softlock so we can't 4624 * reduce the map area. For now just demote the segment. This should 4625 * really only happen if the end of the file changed after the mapping 4626 * was established since when large page segments are created we make 4627 * sure they don't extend beyond the end of the file. 4628 */ 4629 SEGVN_VMSTAT_FLTVNPAGES(48); 4630 4631 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4632 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4633 err = 0; 4634 if (seg->s_szc != 0) { 4635 segvn_fltvnpages_clrszc_cnt++; 4636 ASSERT(svd->softlockcnt == 0); 4637 err = segvn_clrszc(seg); 4638 if (err != 0) { 4639 segvn_fltvnpages_clrszc_err++; 4640 } 4641 } 4642 ASSERT(err || seg->s_szc == 0); 4643 SEGVN_LOCK_DOWNGRADE(seg->s_as, &svd->lock); 4644 /* segvn_fault will do its job as if szc had been zero to begin with */ 4645 return (err == 0 ? IE_RETRY : FC_MAKE_ERR(err)); 4646 } 4647 4648 /* 4649 * This routine will attempt to fault in one large page. 4650 * it will use smaller pages if that fails. 4651 * It should only be called for pure anonymous segments. 4652 */ 4653 static faultcode_t 4654 segvn_fault_anonpages(struct hat *hat, struct seg *seg, caddr_t lpgaddr, 4655 caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr, 4656 caddr_t eaddr, int brkcow) 4657 { 4658 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 4659 struct anon_map *amp = svd->amp; 4660 uchar_t segtype = svd->type; 4661 uint_t szc = seg->s_szc; 4662 size_t pgsz = page_get_pagesize(szc); 4663 size_t maxpgsz = pgsz; 4664 pgcnt_t pages = btop(pgsz); 4665 uint_t ppaszc = szc; 4666 caddr_t a = lpgaddr; 4667 ulong_t aindx = svd->anon_index + seg_page(seg, a); 4668 struct vpage *vpage = (svd->vpage != NULL) ? 4669 &svd->vpage[seg_page(seg, a)] : NULL; 4670 page_t **ppa; 4671 uint_t ppa_szc; 4672 faultcode_t err; 4673 int ierr; 4674 uint_t protchk, prot, vpprot; 4675 ulong_t i; 4676 int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD; 4677 anon_sync_obj_t cookie; 4678 int adjszc_chk; 4679 int pgflags = (svd->tr_state == SEGVN_TR_ON) ? 
PG_LOCAL : 0; 4680 4681 ASSERT(szc != 0); 4682 ASSERT(amp != NULL); 4683 ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */ 4684 ASSERT(!(svd->flags & MAP_NORESERVE)); 4685 ASSERT(type != F_SOFTUNLOCK); 4686 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 4687 ASSERT(!brkcow || svd->tr_state == SEGVN_TR_OFF); 4688 ASSERT(svd->tr_state != SEGVN_TR_INIT); 4689 4690 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 4691 4692 VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltanpages[0]); 4693 VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltanpages[1]); 4694 4695 if (svd->flags & MAP_TEXT) { 4696 hat_flag |= HAT_LOAD_TEXT; 4697 } 4698 4699 if (svd->pageprot) { 4700 switch (rw) { 4701 case S_READ: 4702 protchk = PROT_READ; 4703 break; 4704 case S_WRITE: 4705 protchk = PROT_WRITE; 4706 break; 4707 case S_EXEC: 4708 protchk = PROT_EXEC; 4709 break; 4710 case S_OTHER: 4711 default: 4712 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 4713 break; 4714 } 4715 VM_STAT_ADD(segvnvmstats.fltanpages[2]); 4716 } else { 4717 prot = svd->prot; 4718 /* caller has already done segment level protection check. */ 4719 } 4720 4721 ppa = kmem_cache_alloc(segvn_szc_cache[ppaszc], KM_SLEEP); 4722 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 4723 for (;;) { 4724 adjszc_chk = 0; 4725 for (; a < lpgeaddr; a += pgsz, aindx += pages) { 4726 if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) { 4727 VM_STAT_ADD(segvnvmstats.fltanpages[3]); 4728 ASSERT(vpage != NULL); 4729 prot = VPP_PROT(vpage); 4730 ASSERT(sameprot(seg, a, maxpgsz)); 4731 if ((prot & protchk) == 0) { 4732 err = FC_PROT; 4733 goto error; 4734 } 4735 } 4736 if (adjszc_chk && IS_P2ALIGNED(a, maxpgsz) && 4737 pgsz < maxpgsz) { 4738 ASSERT(a > lpgaddr); 4739 szc = seg->s_szc; 4740 pgsz = maxpgsz; 4741 pages = btop(pgsz); 4742 ASSERT(IS_P2ALIGNED(aindx, pages)); 4743 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, 4744 pgsz); 4745 } 4746 if (type == F_SOFTLOCK) { 4747 atomic_add_long((ulong_t *)&svd->softlockcnt, 4748 pages); 4749 } 4750 anon_array_enter(amp, aindx, &cookie); 4751 ppa_szc = (uint_t)-1; 4752 ierr = anon_map_getpages(amp, aindx, szc, seg, a, 4753 prot, &vpprot, ppa, &ppa_szc, vpage, rw, brkcow, 4754 segvn_anypgsz, pgflags, svd->cred); 4755 if (ierr != 0) { 4756 anon_array_exit(&cookie); 4757 VM_STAT_ADD(segvnvmstats.fltanpages[4]); 4758 if (type == F_SOFTLOCK) { 4759 atomic_add_long( 4760 (ulong_t *)&svd->softlockcnt, 4761 -pages); 4762 } 4763 if (ierr > 0) { 4764 VM_STAT_ADD(segvnvmstats.fltanpages[6]); 4765 err = FC_MAKE_ERR(ierr); 4766 goto error; 4767 } 4768 break; 4769 } 4770 4771 ASSERT(!IS_VMODSORT(ppa[0]->p_vnode)); 4772 4773 ASSERT(segtype == MAP_SHARED || 4774 ppa[0]->p_szc <= szc); 4775 ASSERT(segtype == MAP_PRIVATE || 4776 ppa[0]->p_szc >= szc); 4777 4778 /* 4779 * Handle pages that have been marked for migration 4780 */ 4781 if (lgrp_optimizations()) 4782 page_migrate(seg, a, ppa, pages); 4783 4784 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 4785 4786 if (segtype == MAP_SHARED) { 4787 vpprot |= PROT_WRITE; 4788 } 4789 4790 hat_memload_array(hat, a, pgsz, ppa, 4791 prot & vpprot, hat_flag); 4792 4793 if (hat_flag & HAT_LOAD_LOCK) { 4794 VM_STAT_ADD(segvnvmstats.fltanpages[7]); 4795 } else { 4796 VM_STAT_ADD(segvnvmstats.fltanpages[8]); 4797 for (i = 0; i < pages; i++) 4798 page_unlock(ppa[i]); 4799 } 4800 if (vpage != NULL) 4801 vpage += pages; 4802 4803 anon_array_exit(&cookie); 4804 adjszc_chk = 1; 4805 } 4806 if (a == lpgeaddr) 4807 break; 4808 ASSERT(a < lpgeaddr); 4809 /* 4810 * ierr == -1 means we failed to 
allocate a large page, so do a size down operation.
         *
         * ierr == -2 means some other process that privately shares
         * pages with this process has allocated a larger page and we
         * need to retry with larger pages.  So do a size up
         * operation.  This relies on the fact that large pages are
         * never partially shared, i.e., if we share any constituent
         * page of a large page with another process we must share the
         * entire large page.  Note this cannot happen for the SOFTLOCK
         * case, unless the current address (a) is at the beginning of
         * the next page size boundary, because the other process
         * couldn't have relocated locked pages.
         */
        ASSERT(ierr == -1 || ierr == -2);

        if (segvn_anypgsz) {
            ASSERT(ierr == -2 || szc != 0);
            ASSERT(ierr == -1 || szc < seg->s_szc);
            szc = (ierr == -1) ? szc - 1 : szc + 1;
        } else {
            /*
             * For non-COW faults and segvn_anypgsz == 0
             * we need to be careful not to loop forever
             * if an existing page is found with an szc other
             * than 0 or seg->s_szc.  This could be due
             * to page relocations on behalf of DR or,
             * more likely, large page creation.  For this
             * case simply re-size to the existing page's szc
             * as returned by anon_map_getpages().
             */
            if (ppa_szc == (uint_t)-1) {
                szc = (ierr == -1) ? 0 : seg->s_szc;
            } else {
                ASSERT(ppa_szc <= seg->s_szc);
                ASSERT(ierr == -2 || ppa_szc < szc);
                ASSERT(ierr == -1 || ppa_szc > szc);
                szc = ppa_szc;
            }
        }

        pgsz = page_get_pagesize(szc);
        pages = btop(pgsz);
        ASSERT(type != F_SOFTLOCK || ierr == -1 ||
            (IS_P2ALIGNED(a, pgsz) && IS_P2ALIGNED(lpgeaddr, pgsz)));
        if (type == F_SOFTLOCK) {
            /*
             * For softlocks we cannot reduce the fault area
             * (calculated based on the largest page size for this
             * segment) in the size down case, and a is already
             * aligned to the next page size, as asserted above, in
             * the size up case.  Therefore just continue in the
             * softlock case.
             */
            VM_STAT_ADD(segvnvmstats.fltanpages[9]);
            continue; /* keep lint happy */
        } else if (ierr == -2) {

            /*
             * Size up case.  Note lpgaddr may only be needed for
             * the softlock case so we don't adjust it here.
             */
            VM_STAT_ADD(segvnvmstats.fltanpages[10]);
            a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz);
            ASSERT(a >= lpgaddr);
            lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz);
            aindx = svd->anon_index + seg_page(seg, a);
            vpage = (svd->vpage != NULL) ?
                &svd->vpage[seg_page(seg, a)] : NULL;
        } else {
            /*
             * Size down case.  Note lpgaddr may only be needed for
             * the softlock case so we don't adjust it here.
             */
            VM_STAT_ADD(segvnvmstats.fltanpages[11]);
            ASSERT(IS_P2ALIGNED(a, pgsz));
            ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz));
            lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz);
            ASSERT(a < lpgeaddr);
            if (a < addr) {
                /*
                 * The beginning of the large page region can
                 * be pulled to the right to make a smaller
                 * region.  We haven't yet faulted a single
                 * page.
                 */
                VM_STAT_ADD(segvnvmstats.fltanpages[12]);
                a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz);
                ASSERT(a >= lpgaddr);
                aindx = svd->anon_index + seg_page(seg, a);
                vpage = (svd->vpage != NULL) ?
4900 &svd->vpage[seg_page(seg, a)] : NULL; 4901 } 4902 } 4903 } 4904 VM_STAT_ADD(segvnvmstats.fltanpages[13]); 4905 ANON_LOCK_EXIT(&->a_rwlock); 4906 kmem_cache_free(segvn_szc_cache[ppaszc], ppa); 4907 return (0); 4908 error: 4909 VM_STAT_ADD(segvnvmstats.fltanpages[14]); 4910 ANON_LOCK_EXIT(&->a_rwlock); 4911 kmem_cache_free(segvn_szc_cache[ppaszc], ppa); 4912 if (type == F_SOFTLOCK && a > lpgaddr) { 4913 VM_STAT_ADD(segvnvmstats.fltanpages[15]); 4914 segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER); 4915 } 4916 return (err); 4917 } 4918 4919 int fltadvice = 1; /* set to free behind pages for sequential access */ 4920 4921 /* 4922 * This routine is called via a machine specific fault handling routine. 4923 * It is also called by software routines wishing to lock or unlock 4924 * a range of addresses. 4925 * 4926 * Here is the basic algorithm: 4927 * If unlocking 4928 * Call segvn_softunlock 4929 * Return 4930 * endif 4931 * Checking and set up work 4932 * If we will need some non-anonymous pages 4933 * Call VOP_GETPAGE over the range of non-anonymous pages 4934 * endif 4935 * Loop over all addresses requested 4936 * Call segvn_faultpage passing in page list 4937 * to load up translations and handle anonymous pages 4938 * endloop 4939 * Load up translation to any additional pages in page list not 4940 * already handled that fit into this segment 4941 */ 4942 static faultcode_t 4943 segvn_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len, 4944 enum fault_type type, enum seg_rw rw) 4945 { 4946 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 4947 page_t **plp, **ppp, *pp; 4948 u_offset_t off; 4949 caddr_t a; 4950 struct vpage *vpage; 4951 uint_t vpprot, prot; 4952 int err; 4953 page_t *pl[PVN_GETPAGE_NUM + 1]; 4954 size_t plsz, pl_alloc_sz; 4955 size_t page; 4956 ulong_t anon_index; 4957 struct anon_map *amp; 4958 int dogetpage = 0; 4959 caddr_t lpgaddr, lpgeaddr; 4960 size_t pgsz; 4961 anon_sync_obj_t cookie; 4962 int brkcow = BREAK_COW_SHARE(rw, type, svd->type); 4963 4964 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); 4965 ASSERT(svd->amp == NULL || svd->rcookie == HAT_INVALID_REGION_COOKIE); 4966 4967 /* 4968 * First handle the easy stuff 4969 */ 4970 if (type == F_SOFTUNLOCK) { 4971 if (rw == S_READ_NOCOW) { 4972 rw = S_READ; 4973 ASSERT(AS_WRITE_HELD(seg->s_as)); 4974 } 4975 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 4976 pgsz = (seg->s_szc == 0) ? 
PAGESIZE : 4977 page_get_pagesize(seg->s_szc); 4978 VM_STAT_COND_ADD(pgsz > PAGESIZE, segvnvmstats.fltanpages[16]); 4979 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 4980 segvn_softunlock(seg, lpgaddr, lpgeaddr - lpgaddr, rw); 4981 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4982 return (0); 4983 } 4984 4985 ASSERT(svd->tr_state == SEGVN_TR_OFF || 4986 !HAT_IS_REGION_COOKIE_VALID(svd->rcookie)); 4987 if (brkcow == 0) { 4988 if (svd->tr_state == SEGVN_TR_INIT) { 4989 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4990 if (svd->tr_state == SEGVN_TR_INIT) { 4991 ASSERT(svd->vp != NULL && svd->amp == NULL); 4992 ASSERT(svd->flags & MAP_TEXT); 4993 ASSERT(svd->type == MAP_PRIVATE); 4994 segvn_textrepl(seg); 4995 ASSERT(svd->tr_state != SEGVN_TR_INIT); 4996 ASSERT(svd->tr_state != SEGVN_TR_ON || 4997 svd->amp != NULL); 4998 } 4999 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5000 } 5001 } else if (svd->tr_state != SEGVN_TR_OFF) { 5002 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 5003 5004 if (rw == S_WRITE && svd->tr_state != SEGVN_TR_OFF) { 5005 ASSERT(!svd->pageprot && !(svd->prot & PROT_WRITE)); 5006 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5007 return (FC_PROT); 5008 } 5009 5010 if (svd->tr_state == SEGVN_TR_ON) { 5011 ASSERT(svd->vp != NULL && svd->amp != NULL); 5012 segvn_textunrepl(seg, 0); 5013 ASSERT(svd->amp == NULL && 5014 svd->tr_state == SEGVN_TR_OFF); 5015 } else if (svd->tr_state != SEGVN_TR_OFF) { 5016 svd->tr_state = SEGVN_TR_OFF; 5017 } 5018 ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF); 5019 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5020 } 5021 5022 top: 5023 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 5024 5025 /* 5026 * If we have the same protections for the entire segment, 5027 * insure that the access being attempted is legitimate. 5028 */ 5029 5030 if (svd->pageprot == 0) { 5031 uint_t protchk; 5032 5033 switch (rw) { 5034 case S_READ: 5035 case S_READ_NOCOW: 5036 protchk = PROT_READ; 5037 break; 5038 case S_WRITE: 5039 protchk = PROT_WRITE; 5040 break; 5041 case S_EXEC: 5042 protchk = PROT_EXEC; 5043 break; 5044 case S_OTHER: 5045 default: 5046 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 5047 break; 5048 } 5049 5050 if ((svd->prot & protchk) == 0) { 5051 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5052 return (FC_PROT); /* illegal access type */ 5053 } 5054 } 5055 5056 if (brkcow && HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) { 5057 /* this must be SOFTLOCK S_READ fault */ 5058 ASSERT(svd->amp == NULL); 5059 ASSERT(svd->tr_state == SEGVN_TR_OFF); 5060 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5061 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 5062 if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) { 5063 /* 5064 * this must be the first ever non S_READ_NOCOW 5065 * softlock for this segment. 5066 */ 5067 ASSERT(svd->softlockcnt == 0); 5068 hat_leave_region(seg->s_as->a_hat, svd->rcookie, 5069 HAT_REGION_TEXT); 5070 svd->rcookie = HAT_INVALID_REGION_COOKIE; 5071 } 5072 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5073 goto top; 5074 } 5075 5076 /* 5077 * We can't allow the long term use of softlocks for vmpss segments, 5078 * because in some file truncation cases we should be able to demote 5079 * the segment, which requires that there are no softlocks. The 5080 * only case where it's ok to allow a SOFTLOCK fault against a vmpss 5081 * segment is S_READ_NOCOW, where the caller holds the address space 5082 * locked as writer and calls softunlock before dropping the as lock. 5083 * S_READ_NOCOW is used by /proc to read memory from another user. 
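     * (For instance, a debugger read of /proc/<pid>/as ends up here
     * with type F_SOFTLOCK and rw S_READ_NOCOW.)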
5084 * 5085 * Another deadlock between SOFTLOCK and file truncation can happen 5086 * because segvn_fault_vnodepages() calls the FS one pagesize at 5087 * a time. A second VOP_GETPAGE() call by segvn_fault_vnodepages() 5088 * can cause a deadlock because the first set of page_t's remain 5089 * locked SE_SHARED. To avoid this, we demote segments on a first 5090 * SOFTLOCK if they have a length greater than the segment's 5091 * page size. 5092 * 5093 * So for now, we only avoid demoting a segment on a SOFTLOCK when 5094 * the access type is S_READ_NOCOW and the fault length is less than 5095 * or equal to the segment's page size. While this is quite restrictive, 5096 * it should be the most common case of SOFTLOCK against a vmpss 5097 * segment. 5098 * 5099 * For S_READ_NOCOW, it's safe not to do a copy on write because the 5100 * caller makes sure no COW will be caused by another thread for a 5101 * softlocked page. 5102 */ 5103 if (type == F_SOFTLOCK && svd->vp != NULL && seg->s_szc != 0) { 5104 int demote = 0; 5105 5106 if (rw != S_READ_NOCOW) { 5107 demote = 1; 5108 } 5109 if (!demote && len > PAGESIZE) { 5110 pgsz = page_get_pagesize(seg->s_szc); 5111 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, 5112 lpgeaddr); 5113 if (lpgeaddr - lpgaddr > pgsz) { 5114 demote = 1; 5115 } 5116 } 5117 5118 ASSERT(demote || AS_WRITE_HELD(seg->s_as)); 5119 5120 if (demote) { 5121 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5122 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 5123 if (seg->s_szc != 0) { 5124 segvn_vmpss_clrszc_cnt++; 5125 ASSERT(svd->softlockcnt == 0); 5126 err = segvn_clrszc(seg); 5127 if (err) { 5128 segvn_vmpss_clrszc_err++; 5129 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5130 return (FC_MAKE_ERR(err)); 5131 } 5132 } 5133 ASSERT(seg->s_szc == 0); 5134 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5135 goto top; 5136 } 5137 } 5138 5139 /* 5140 * Check to see if we need to allocate an anon_map structure. 5141 */ 5142 if (svd->amp == NULL && (svd->vp == NULL || brkcow)) { 5143 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 5144 /* 5145 * Drop the "read" lock on the segment and acquire 5146 * the "write" version since we have to allocate the 5147 * anon_map. 5148 */ 5149 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5150 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 5151 5152 if (svd->amp == NULL) { 5153 svd->amp = anonmap_alloc(seg->s_size, 0, ANON_SLEEP); 5154 svd->amp->a_szc = seg->s_szc; 5155 } 5156 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5157 5158 /* 5159 * Start all over again since segment protections 5160 * may have changed after we dropped the "read" lock. 5161 */ 5162 goto top; 5163 } 5164 5165 /* 5166 * S_READ_NOCOW vs S_READ distinction was 5167 * only needed for the code above. After 5168 * that we treat it as S_READ. 5169 */ 5170 if (rw == S_READ_NOCOW) { 5171 ASSERT(type == F_SOFTLOCK); 5172 ASSERT(AS_WRITE_HELD(seg->s_as)); 5173 rw = S_READ; 5174 } 5175 5176 amp = svd->amp; 5177 5178 /* 5179 * MADV_SEQUENTIAL work is ignored for large page segments. 
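     * (The free-behind work further below walks backwards one PAGESIZE
     * page at a time, so it is simply skipped when the segment is
     * mapped with large pages.)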
5180 */ 5181 if (seg->s_szc != 0) { 5182 pgsz = page_get_pagesize(seg->s_szc); 5183 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 5184 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 5185 if (svd->vp == NULL) { 5186 err = segvn_fault_anonpages(hat, seg, lpgaddr, 5187 lpgeaddr, type, rw, addr, addr + len, brkcow); 5188 } else { 5189 err = segvn_fault_vnodepages(hat, seg, lpgaddr, 5190 lpgeaddr, type, rw, addr, addr + len, brkcow); 5191 if (err == IE_RETRY) { 5192 ASSERT(seg->s_szc == 0); 5193 ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock)); 5194 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5195 goto top; 5196 } 5197 } 5198 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5199 return (err); 5200 } 5201 5202 page = seg_page(seg, addr); 5203 if (amp != NULL) { 5204 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 5205 anon_index = svd->anon_index + page; 5206 5207 if (type == F_PROT && rw == S_READ && 5208 svd->tr_state == SEGVN_TR_OFF && 5209 svd->type == MAP_PRIVATE && svd->pageprot == 0) { 5210 size_t index = anon_index; 5211 struct anon *ap; 5212 5213 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5214 /* 5215 * The fast path could apply to S_WRITE also, except 5216 * that the protection fault could be caused by lazy 5217 * tlb flush when ro->rw. In this case, the pte is 5218 * RW already. But RO in the other cpu's tlb causes 5219 * the fault. Since hat_chgprot won't do anything if 5220 * pte doesn't change, we may end up faulting 5221 * indefinitely until the RO tlb entry gets replaced. 5222 */ 5223 for (a = addr; a < addr + len; a += PAGESIZE, index++) { 5224 anon_array_enter(amp, index, &cookie); 5225 ap = anon_get_ptr(amp->ahp, index); 5226 anon_array_exit(&cookie); 5227 if ((ap == NULL) || (ap->an_refcnt != 1)) { 5228 ANON_LOCK_EXIT(&->a_rwlock); 5229 goto slow; 5230 } 5231 } 5232 hat_chgprot(seg->s_as->a_hat, addr, len, svd->prot); 5233 ANON_LOCK_EXIT(&->a_rwlock); 5234 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5235 return (0); 5236 } 5237 } 5238 slow: 5239 5240 if (svd->vpage == NULL) 5241 vpage = NULL; 5242 else 5243 vpage = &svd->vpage[page]; 5244 5245 off = svd->offset + (uintptr_t)(addr - seg->s_base); 5246 5247 /* 5248 * If MADV_SEQUENTIAL has been set for the particular page we 5249 * are faulting on, free behind all pages in the segment and put 5250 * them on the free list. 
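     * For example, after madvise(addr, len, MADV_SEQUENTIAL) a process
     * reading the mapping from front to back causes each new fault to
     * push the pages it has already consumed back to the free list via
     * VOP_PUTPAGE(..., B_DONTNEED | B_FREE | B_ASYNC), keeping the
     * resident set small for strictly sequential access.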
5251 */ 5252 5253 if ((page != 0) && fltadvice && svd->tr_state != SEGVN_TR_ON) { 5254 struct vpage *vpp; 5255 ulong_t fanon_index; 5256 size_t fpage; 5257 u_offset_t pgoff, fpgoff; 5258 struct vnode *fvp; 5259 struct anon *fap = NULL; 5260 5261 if (svd->advice == MADV_SEQUENTIAL || 5262 (svd->pageadvice && 5263 VPP_ADVICE(vpage) == MADV_SEQUENTIAL)) { 5264 pgoff = off - PAGESIZE; 5265 fpage = page - 1; 5266 if (vpage != NULL) 5267 vpp = &svd->vpage[fpage]; 5268 if (amp != NULL) 5269 fanon_index = svd->anon_index + fpage; 5270 5271 while (pgoff > svd->offset) { 5272 if (svd->advice != MADV_SEQUENTIAL && 5273 (!svd->pageadvice || (vpage && 5274 VPP_ADVICE(vpp) != MADV_SEQUENTIAL))) 5275 break; 5276 5277 /* 5278 * If this is an anon page, we must find the 5279 * correct <vp, offset> for it 5280 */ 5281 fap = NULL; 5282 if (amp != NULL) { 5283 ANON_LOCK_ENTER(&->a_rwlock, 5284 RW_READER); 5285 anon_array_enter(amp, fanon_index, 5286 &cookie); 5287 fap = anon_get_ptr(amp->ahp, 5288 fanon_index); 5289 if (fap != NULL) { 5290 swap_xlate(fap, &fvp, &fpgoff); 5291 } else { 5292 fpgoff = pgoff; 5293 fvp = svd->vp; 5294 } 5295 anon_array_exit(&cookie); 5296 ANON_LOCK_EXIT(&->a_rwlock); 5297 } else { 5298 fpgoff = pgoff; 5299 fvp = svd->vp; 5300 } 5301 if (fvp == NULL) 5302 break; /* XXX */ 5303 /* 5304 * Skip pages that are free or have an 5305 * "exclusive" lock. 5306 */ 5307 pp = page_lookup_nowait(fvp, fpgoff, SE_SHARED); 5308 if (pp == NULL) 5309 break; 5310 /* 5311 * We don't need the page_struct_lock to test 5312 * as this is only advisory; even if we 5313 * acquire it someone might race in and lock 5314 * the page after we unlock and before the 5315 * PUTPAGE, then VOP_PUTPAGE will do nothing. 5316 */ 5317 if (pp->p_lckcnt == 0 && pp->p_cowcnt == 0) { 5318 /* 5319 * Hold the vnode before releasing 5320 * the page lock to prevent it from 5321 * being freed and re-used by some 5322 * other thread. 5323 */ 5324 VN_HOLD(fvp); 5325 page_unlock(pp); 5326 /* 5327 * We should build a page list 5328 * to kluster putpages XXX 5329 */ 5330 (void) VOP_PUTPAGE(fvp, 5331 (offset_t)fpgoff, PAGESIZE, 5332 (B_DONTNEED|B_FREE|B_ASYNC), 5333 svd->cred, NULL); 5334 VN_RELE(fvp); 5335 } else { 5336 /* 5337 * XXX - Should the loop terminate if 5338 * the page is `locked'? 5339 */ 5340 page_unlock(pp); 5341 } 5342 --vpp; 5343 --fanon_index; 5344 pgoff -= PAGESIZE; 5345 } 5346 } 5347 } 5348 5349 plp = pl; 5350 *plp = NULL; 5351 pl_alloc_sz = 0; 5352 5353 /* 5354 * See if we need to call VOP_GETPAGE for 5355 * *any* of the range being faulted on. 5356 * We can skip all of this work if there 5357 * was no original vnode. 5358 */ 5359 if (svd->vp != NULL) { 5360 u_offset_t vp_off; 5361 size_t vp_len; 5362 struct anon *ap; 5363 vnode_t *vp; 5364 5365 vp_off = off; 5366 vp_len = len; 5367 5368 if (amp == NULL) 5369 dogetpage = 1; 5370 else { 5371 /* 5372 * Only acquire reader lock to prevent amp->ahp 5373 * from being changed. 
It's ok to miss pages, 5374 * hence we don't do anon_array_enter 5375 */ 5376 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5377 ap = anon_get_ptr(amp->ahp, anon_index); 5378 5379 if (len <= PAGESIZE) 5380 /* inline non_anon() */ 5381 dogetpage = (ap == NULL); 5382 else 5383 dogetpage = non_anon(amp->ahp, anon_index, 5384 &vp_off, &vp_len); 5385 ANON_LOCK_EXIT(&->a_rwlock); 5386 } 5387 5388 if (dogetpage) { 5389 enum seg_rw arw; 5390 struct as *as = seg->s_as; 5391 5392 if (len > ptob((sizeof (pl) / sizeof (pl[0])) - 1)) { 5393 /* 5394 * Page list won't fit in local array, 5395 * allocate one of the needed size. 5396 */ 5397 pl_alloc_sz = 5398 (btop(len) + 1) * sizeof (page_t *); 5399 plp = kmem_alloc(pl_alloc_sz, KM_SLEEP); 5400 plp[0] = NULL; 5401 plsz = len; 5402 } else if (rw == S_WRITE && svd->type == MAP_PRIVATE || 5403 svd->tr_state == SEGVN_TR_ON || rw == S_OTHER || 5404 (((size_t)(addr + PAGESIZE) < 5405 (size_t)(seg->s_base + seg->s_size)) && 5406 hat_probe(as->a_hat, addr + PAGESIZE))) { 5407 /* 5408 * Ask VOP_GETPAGE to return the exact number 5409 * of pages if 5410 * (a) this is a COW fault, or 5411 * (b) this is a software fault, or 5412 * (c) next page is already mapped. 5413 */ 5414 plsz = len; 5415 } else { 5416 /* 5417 * Ask VOP_GETPAGE to return adjacent pages 5418 * within the segment. 5419 */ 5420 plsz = MIN((size_t)PVN_GETPAGE_SZ, (size_t) 5421 ((seg->s_base + seg->s_size) - addr)); 5422 ASSERT((addr + plsz) <= 5423 (seg->s_base + seg->s_size)); 5424 } 5425 5426 /* 5427 * Need to get some non-anonymous pages. 5428 * We need to make only one call to GETPAGE to do 5429 * this to prevent certain deadlocking conditions 5430 * when we are doing locking. In this case 5431 * non_anon() should have picked up the smallest 5432 * range which includes all the non-anonymous 5433 * pages in the requested range. We have to 5434 * be careful regarding which rw flag to pass in 5435 * because on a private mapping, the underlying 5436 * object is never allowed to be written. 5437 */ 5438 if (rw == S_WRITE && svd->type == MAP_PRIVATE) { 5439 arw = S_READ; 5440 } else { 5441 arw = rw; 5442 } 5443 vp = svd->vp; 5444 TRACE_3(TR_FAC_VM, TR_SEGVN_GETPAGE, 5445 "segvn_getpage:seg %p addr %p vp %p", 5446 seg, addr, vp); 5447 err = VOP_GETPAGE(vp, (offset_t)vp_off, vp_len, 5448 &vpprot, plp, plsz, seg, addr + (vp_off - off), arw, 5449 svd->cred, NULL); 5450 if (err) { 5451 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5452 segvn_pagelist_rele(plp); 5453 if (pl_alloc_sz) 5454 kmem_free(plp, pl_alloc_sz); 5455 return (FC_MAKE_ERR(err)); 5456 } 5457 if (svd->type == MAP_PRIVATE) 5458 vpprot &= ~PROT_WRITE; 5459 } 5460 } 5461 5462 /* 5463 * N.B. at this time the plp array has all the needed non-anon 5464 * pages in addition to (possibly) having some adjacent pages. 5465 */ 5466 5467 /* 5468 * Always acquire the anon_array_lock to prevent 5469 * 2 threads from allocating separate anon slots for 5470 * the same "addr". 5471 * 5472 * If this is a copy-on-write fault and we don't already 5473 * have the anon_array_lock, acquire it to prevent the 5474 * fault routine from handling multiple copy-on-write faults 5475 * on the same "addr" in the same address space. 5476 * 5477 * Only one thread should deal with the fault since after 5478 * it is handled, the other threads can acquire a translation 5479 * to the newly created private page. This prevents two or 5480 * more threads from creating different private pages for the 5481 * same fault. 
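     * (Without this, two threads write-faulting the same MAP_PRIVATE
     * page at the same time could each allocate their own anon page
     * for the same slot, and one thread's copy would be lost.)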
     *
     * We grab "serialization" lock here if this is a MAP_PRIVATE segment
     * to prevent deadlock between this thread and another thread
     * which has soft-locked this page and wants to acquire serial_lock.
     * (bug 4026339)
     *
     * The fix for bug 4026339 becomes unnecessary when using the
     * locking scheme with per amp rwlock and a global set of hash
     * locks, anon_array_lock.  If we steal a vnode page when low
     * on memory and upgrade the page lock through page_rename,
     * then the page is PAGE_HANDLED, and nothing needs to be done
     * for this page after returning from segvn_faultpage.
     *
     * But really, the page lock should be downgraded after
     * the stolen page is page_rename'd.
     */

    if (amp != NULL)
        ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);

    /*
     * Ok, now loop over the address range and handle faults
     */
    for (a = addr; a < addr + len; a += PAGESIZE, off += PAGESIZE) {
        err = segvn_faultpage(hat, seg, a, off, vpage, plp, vpprot,
            type, rw, brkcow);
        if (err) {
            if (amp != NULL)
                ANON_LOCK_EXIT(&amp->a_rwlock);
            if (type == F_SOFTLOCK && a > addr) {
                segvn_softunlock(seg, addr, (a - addr),
                    S_OTHER);
            }
            SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
            segvn_pagelist_rele(plp);
            if (pl_alloc_sz)
                kmem_free(plp, pl_alloc_sz);
            return (err);
        }
        if (vpage) {
            vpage++;
        } else if (svd->vpage) {
            page = seg_page(seg, addr);
            vpage = &svd->vpage[++page];
        }
    }

    /* Didn't get pages from the underlying fs so we're done */
    if (!dogetpage)
        goto done;

    /*
     * Now handle any other pages in the list returned.
     * If the page can be used, load up the translations now.
     * Note that the for loop will only be entered if "plp"
     * is pointing to a non-NULL page pointer which means that
     * VOP_GETPAGE() was called and vpprot has been initialized.
     */
    if (svd->pageprot == 0)
        prot = svd->prot & vpprot;

    /*
     * Large Files: diff should be an unsigned value because we started
     * supporting > 2GB segment sizes from 2.5.1 and when a
     * large file of size > 2GB gets mapped into the address space
     * the diff value can be > 2GB.
     */

    for (ppp = plp; (pp = *ppp) != NULL; ppp++) {
        size_t diff;
        struct anon *ap;
        int anon_index;
        anon_sync_obj_t cookie;
        int hat_flag = HAT_LOAD_ADV;

        if (svd->flags & MAP_TEXT) {
            hat_flag |= HAT_LOAD_TEXT;
        }

        if (pp == PAGE_HANDLED)
            continue;

        if (svd->tr_state != SEGVN_TR_ON &&
            pp->p_offset >= svd->offset &&
            pp->p_offset < svd->offset + seg->s_size) {

            diff = pp->p_offset - svd->offset;

            /*
             * Large Files: The following is the assertion
             * validating the above cast.
             */
            ASSERT(svd->vp == pp->p_vnode);

            page = btop(diff);
            if (svd->pageprot)
                prot = VPP_PROT(&svd->vpage[page]) & vpprot;

            /*
             * Prevent other threads in the address space from
             * creating private pages (i.e., allocating anon slots)
             * while we are in the process of loading translations
             * to additional pages returned by the underlying
             * object.
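             * (This is why anon_array_enter() is taken just below,
             * before the anon slot is examined, and is only dropped
             * after the translation has been loaded.)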
5587 */ 5588 if (amp != NULL) { 5589 anon_index = svd->anon_index + page; 5590 anon_array_enter(amp, anon_index, &cookie); 5591 ap = anon_get_ptr(amp->ahp, anon_index); 5592 } 5593 if ((amp == NULL) || (ap == NULL)) { 5594 if (IS_VMODSORT(pp->p_vnode) || 5595 enable_mbit_wa) { 5596 if (rw == S_WRITE) 5597 hat_setmod(pp); 5598 else if (rw != S_OTHER && 5599 !hat_ismod(pp)) 5600 prot &= ~PROT_WRITE; 5601 } 5602 /* 5603 * Skip mapping read ahead pages marked 5604 * for migration, so they will get migrated 5605 * properly on fault 5606 */ 5607 ASSERT(amp == NULL || 5608 svd->rcookie == HAT_INVALID_REGION_COOKIE); 5609 if ((prot & PROT_READ) && !PP_ISMIGRATE(pp)) { 5610 hat_memload_region(hat, 5611 seg->s_base + diff, 5612 pp, prot, hat_flag, 5613 svd->rcookie); 5614 } 5615 } 5616 if (amp != NULL) 5617 anon_array_exit(&cookie); 5618 } 5619 page_unlock(pp); 5620 } 5621 done: 5622 if (amp != NULL) 5623 ANON_LOCK_EXIT(&->a_rwlock); 5624 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5625 if (pl_alloc_sz) 5626 kmem_free(plp, pl_alloc_sz); 5627 return (0); 5628 } 5629 5630 /* 5631 * This routine is used to start I/O on pages asynchronously. XXX it will 5632 * only create PAGESIZE pages. At fault time they will be relocated into 5633 * larger pages. 5634 */ 5635 static faultcode_t 5636 segvn_faulta(struct seg *seg, caddr_t addr) 5637 { 5638 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5639 int err; 5640 struct anon_map *amp; 5641 vnode_t *vp; 5642 5643 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); 5644 5645 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 5646 if ((amp = svd->amp) != NULL) { 5647 struct anon *ap; 5648 5649 /* 5650 * Reader lock to prevent amp->ahp from being changed. 5651 * This is advisory, it's ok to miss a page, so 5652 * we don't do anon_array_enter lock. 5653 */ 5654 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5655 if ((ap = anon_get_ptr(amp->ahp, 5656 svd->anon_index + seg_page(seg, addr))) != NULL) { 5657 5658 err = anon_getpage(&ap, NULL, NULL, 5659 0, seg, addr, S_READ, svd->cred); 5660 5661 ANON_LOCK_EXIT(&->a_rwlock); 5662 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5663 if (err) 5664 return (FC_MAKE_ERR(err)); 5665 return (0); 5666 } 5667 ANON_LOCK_EXIT(&->a_rwlock); 5668 } 5669 5670 if (svd->vp == NULL) { 5671 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5672 return (0); /* zfod page - do nothing now */ 5673 } 5674 5675 vp = svd->vp; 5676 TRACE_3(TR_FAC_VM, TR_SEGVN_GETPAGE, 5677 "segvn_getpage:seg %p addr %p vp %p", seg, addr, vp); 5678 err = VOP_GETPAGE(vp, 5679 (offset_t)(svd->offset + (uintptr_t)(addr - seg->s_base)), 5680 PAGESIZE, NULL, NULL, 0, seg, addr, 5681 S_OTHER, svd->cred, NULL); 5682 5683 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5684 if (err) 5685 return (FC_MAKE_ERR(err)); 5686 return (0); 5687 } 5688 5689 static int 5690 segvn_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) 5691 { 5692 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5693 struct vpage *cvp, *svp, *evp; 5694 struct vnode *vp; 5695 size_t pgsz; 5696 pgcnt_t pgcnt; 5697 anon_sync_obj_t cookie; 5698 int unload_done = 0; 5699 5700 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); 5701 5702 if ((svd->maxprot & prot) != prot) 5703 return (EACCES); /* violated maxprot */ 5704 5705 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 5706 5707 /* return if prot is the same */ 5708 if (!svd->pageprot && svd->prot == prot) { 5709 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5710 return (0); 5711 } 5712 5713 /* 5714 * Since we change protections we first have to flush the cache. 
5715 * This makes sure all the pagelock calls have to recheck 5716 * protections. 5717 */ 5718 if (svd->softlockcnt > 0) { 5719 ASSERT(svd->tr_state == SEGVN_TR_OFF); 5720 5721 /* 5722 * If this is shared segment non 0 softlockcnt 5723 * means locked pages are still in use. 5724 */ 5725 if (svd->type == MAP_SHARED) { 5726 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5727 return (EAGAIN); 5728 } 5729 5730 /* 5731 * Since we do have the segvn writers lock nobody can fill 5732 * the cache with entries belonging to this seg during 5733 * the purge. The flush either succeeds or we still have 5734 * pending I/Os. 5735 */ 5736 segvn_purge(seg); 5737 if (svd->softlockcnt > 0) { 5738 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5739 return (EAGAIN); 5740 } 5741 } 5742 5743 if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) { 5744 ASSERT(svd->amp == NULL); 5745 ASSERT(svd->tr_state == SEGVN_TR_OFF); 5746 hat_leave_region(seg->s_as->a_hat, svd->rcookie, 5747 HAT_REGION_TEXT); 5748 svd->rcookie = HAT_INVALID_REGION_COOKIE; 5749 unload_done = 1; 5750 } else if (svd->tr_state == SEGVN_TR_INIT) { 5751 svd->tr_state = SEGVN_TR_OFF; 5752 } else if (svd->tr_state == SEGVN_TR_ON) { 5753 ASSERT(svd->amp != NULL); 5754 segvn_textunrepl(seg, 0); 5755 ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF); 5756 unload_done = 1; 5757 } 5758 5759 if ((prot & PROT_WRITE) && svd->type == MAP_SHARED && 5760 svd->vp != NULL && (svd->vp->v_flag & VVMEXEC)) { 5761 ASSERT(vn_is_mapped(svd->vp, V_WRITE)); 5762 segvn_inval_trcache(svd->vp); 5763 } 5764 if (seg->s_szc != 0) { 5765 int err; 5766 pgsz = page_get_pagesize(seg->s_szc); 5767 pgcnt = pgsz >> PAGESHIFT; 5768 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 5769 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) { 5770 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5771 ASSERT(seg->s_base != addr || seg->s_size != len); 5772 /* 5773 * If we are holding the as lock as a reader then 5774 * we need to return IE_RETRY and let the as 5775 * layer drop and re-acquire the lock as a writer. 5776 */ 5777 if (AS_READ_HELD(seg->s_as)) 5778 return (IE_RETRY); 5779 VM_STAT_ADD(segvnvmstats.demoterange[1]); 5780 if (svd->type == MAP_PRIVATE || svd->vp != NULL) { 5781 err = segvn_demote_range(seg, addr, len, 5782 SDR_END, 0); 5783 } else { 5784 uint_t szcvec = map_pgszcvec(seg->s_base, 5785 pgsz, (uintptr_t)seg->s_base, 5786 (svd->flags & MAP_TEXT), MAPPGSZC_SHM, 0); 5787 err = segvn_demote_range(seg, addr, len, 5788 SDR_END, szcvec); 5789 } 5790 if (err == 0) 5791 return (IE_RETRY); 5792 if (err == ENOMEM) 5793 return (IE_NOMEM); 5794 return (err); 5795 } 5796 } 5797 5798 5799 /* 5800 * If it's a private mapping and we're making it writable then we 5801 * may have to reserve the additional swap space now. If we are 5802 * making writable only a part of the segment then we use its vpage 5803 * array to keep a record of the pages for which we have reserved 5804 * swap. In this case we set the pageswap field in the segment's 5805 * segvn structure to record this. 5806 * 5807 * If it's a private mapping to a file (i.e., vp != NULL) and we're 5808 * removing write permission on the entire segment and we haven't 5809 * modified any pages, we can release the swap space. 5810 */ 5811 if (svd->type == MAP_PRIVATE) { 5812 if (prot & PROT_WRITE) { 5813 if (!(svd->flags & MAP_NORESERVE) && 5814 !(svd->swresv && svd->pageswap == 0)) { 5815 size_t sz = 0; 5816 5817 /* 5818 * Start by determining how much swap 5819 * space is required. 
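                 * (For example, write-enabling only 64K of a larger
                 * mapping for the first time reserves 64K of swap and
                 * marks those vpages SWAPRES; a later overlapping call
                 * only reserves swap for pages not already marked.
                 * The 64K figure is illustrative.)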
5820 */ 5821 if (addr == seg->s_base && 5822 len == seg->s_size && 5823 svd->pageswap == 0) { 5824 /* The whole segment */ 5825 sz = seg->s_size; 5826 } else { 5827 /* 5828 * Make sure that the vpage array 5829 * exists, and make a note of the 5830 * range of elements corresponding 5831 * to len. 5832 */ 5833 segvn_vpage(seg); 5834 if (svd->vpage == NULL) { 5835 SEGVN_LOCK_EXIT(seg->s_as, 5836 &svd->lock); 5837 return (ENOMEM); 5838 } 5839 svp = &svd->vpage[seg_page(seg, addr)]; 5840 evp = &svd->vpage[seg_page(seg, 5841 addr + len)]; 5842 5843 if (svd->pageswap == 0) { 5844 /* 5845 * This is the first time we've 5846 * asked for a part of this 5847 * segment, so we need to 5848 * reserve everything we've 5849 * been asked for. 5850 */ 5851 sz = len; 5852 } else { 5853 /* 5854 * We have to count the number 5855 * of pages required. 5856 */ 5857 for (cvp = svp; cvp < evp; 5858 cvp++) { 5859 if (!VPP_ISSWAPRES(cvp)) 5860 sz++; 5861 } 5862 sz <<= PAGESHIFT; 5863 } 5864 } 5865 5866 /* Try to reserve the necessary swap. */ 5867 if (anon_resv_zone(sz, 5868 seg->s_as->a_proc->p_zone) == 0) { 5869 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5870 return (IE_NOMEM); 5871 } 5872 5873 /* 5874 * Make a note of how much swap space 5875 * we've reserved. 5876 */ 5877 if (svd->pageswap == 0 && sz == seg->s_size) { 5878 svd->swresv = sz; 5879 } else { 5880 ASSERT(svd->vpage != NULL); 5881 svd->swresv += sz; 5882 svd->pageswap = 1; 5883 for (cvp = svp; cvp < evp; cvp++) { 5884 if (!VPP_ISSWAPRES(cvp)) 5885 VPP_SETSWAPRES(cvp); 5886 } 5887 } 5888 } 5889 } else { 5890 /* 5891 * Swap space is released only if this segment 5892 * does not map anonymous memory, since read faults 5893 * on such segments still need an anon slot to read 5894 * in the data. 5895 */ 5896 if (svd->swresv != 0 && svd->vp != NULL && 5897 svd->amp == NULL && addr == seg->s_base && 5898 len == seg->s_size && svd->pageprot == 0) { 5899 ASSERT(svd->pageswap == 0); 5900 anon_unresv_zone(svd->swresv, 5901 seg->s_as->a_proc->p_zone); 5902 svd->swresv = 0; 5903 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 5904 "anon proc:%p %lu %u", seg, 0, 0); 5905 } 5906 } 5907 } 5908 5909 if (addr == seg->s_base && len == seg->s_size && svd->vpage == NULL) { 5910 if (svd->prot == prot) { 5911 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5912 return (0); /* all done */ 5913 } 5914 svd->prot = (uchar_t)prot; 5915 } else if (svd->type == MAP_PRIVATE) { 5916 struct anon *ap = NULL; 5917 page_t *pp; 5918 u_offset_t offset, off; 5919 struct anon_map *amp; 5920 ulong_t anon_idx = 0; 5921 5922 /* 5923 * A vpage structure exists or else the change does not 5924 * involve the entire segment. Establish a vpage structure 5925 * if none is there. Then, for each page in the range, 5926 * adjust its individual permissions. Note that write- 5927 * enabling a MAP_PRIVATE page can affect the claims for 5928 * locked down memory. Overcommitting memory terminates 5929 * the operation. 5930 */ 5931 segvn_vpage(seg); 5932 if (svd->vpage == NULL) { 5933 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5934 return (ENOMEM); 5935 } 5936 svd->pageprot = 1; 5937 if ((amp = svd->amp) != NULL) { 5938 anon_idx = svd->anon_index + seg_page(seg, addr); 5939 ASSERT(seg->s_szc == 0 || 5940 IS_P2ALIGNED(anon_idx, pgcnt)); 5941 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5942 } 5943 5944 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 5945 evp = &svd->vpage[seg_page(seg, addr + len)]; 5946 5947 /* 5948 * See Statement at the beginning of segvn_lockop regarding 5949 * the way cowcnts and lckcnts are handled. 
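         * (In short: this is why, below, flipping PROT_WRITE on a page
         * that is locked down (VPP_ISPPLOCK) has to go through
         * page_addclaim()/page_subclaim(), and the protection change is
         * abandoned with IE_NOMEM if the claim cannot be moved.)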
5950 */ 5951 for (svp = &svd->vpage[seg_page(seg, addr)]; svp < evp; svp++) { 5952 5953 if (seg->s_szc != 0) { 5954 if (amp != NULL) { 5955 anon_array_enter(amp, anon_idx, 5956 &cookie); 5957 } 5958 if (IS_P2ALIGNED(anon_idx, pgcnt) && 5959 !segvn_claim_pages(seg, svp, offset, 5960 anon_idx, prot)) { 5961 if (amp != NULL) { 5962 anon_array_exit(&cookie); 5963 } 5964 break; 5965 } 5966 if (amp != NULL) { 5967 anon_array_exit(&cookie); 5968 } 5969 anon_idx++; 5970 } else { 5971 if (amp != NULL) { 5972 anon_array_enter(amp, anon_idx, 5973 &cookie); 5974 ap = anon_get_ptr(amp->ahp, anon_idx++); 5975 } 5976 5977 if (VPP_ISPPLOCK(svp) && 5978 VPP_PROT(svp) != prot) { 5979 5980 if (amp == NULL || ap == NULL) { 5981 vp = svd->vp; 5982 off = offset; 5983 } else 5984 swap_xlate(ap, &vp, &off); 5985 if (amp != NULL) 5986 anon_array_exit(&cookie); 5987 5988 if ((pp = page_lookup(vp, off, 5989 SE_SHARED)) == NULL) { 5990 panic("segvn_setprot: no page"); 5991 /*NOTREACHED*/ 5992 } 5993 ASSERT(seg->s_szc == 0); 5994 if ((VPP_PROT(svp) ^ prot) & 5995 PROT_WRITE) { 5996 if (prot & PROT_WRITE) { 5997 if (!page_addclaim( 5998 pp)) { 5999 page_unlock(pp); 6000 break; 6001 } 6002 } else { 6003 if (!page_subclaim( 6004 pp)) { 6005 page_unlock(pp); 6006 break; 6007 } 6008 } 6009 } 6010 page_unlock(pp); 6011 } else if (amp != NULL) 6012 anon_array_exit(&cookie); 6013 } 6014 VPP_SETPROT(svp, prot); 6015 offset += PAGESIZE; 6016 } 6017 if (amp != NULL) 6018 ANON_LOCK_EXIT(&->a_rwlock); 6019 6020 /* 6021 * Did we terminate prematurely? If so, simply unload 6022 * the translations to the things we've updated so far. 6023 */ 6024 if (svp != evp) { 6025 if (unload_done) { 6026 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6027 return (IE_NOMEM); 6028 } 6029 len = (svp - &svd->vpage[seg_page(seg, addr)]) * 6030 PAGESIZE; 6031 ASSERT(seg->s_szc == 0 || IS_P2ALIGNED(len, pgsz)); 6032 if (len != 0) 6033 hat_unload(seg->s_as->a_hat, addr, 6034 len, HAT_UNLOAD); 6035 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6036 return (IE_NOMEM); 6037 } 6038 } else { 6039 segvn_vpage(seg); 6040 if (svd->vpage == NULL) { 6041 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6042 return (ENOMEM); 6043 } 6044 svd->pageprot = 1; 6045 evp = &svd->vpage[seg_page(seg, addr + len)]; 6046 for (svp = &svd->vpage[seg_page(seg, addr)]; svp < evp; svp++) { 6047 VPP_SETPROT(svp, prot); 6048 } 6049 } 6050 6051 if (unload_done) { 6052 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6053 return (0); 6054 } 6055 6056 if (((prot & PROT_WRITE) != 0 && 6057 (svd->vp != NULL || svd->type == MAP_PRIVATE)) || 6058 (prot & ~PROT_USER) == PROT_NONE) { 6059 /* 6060 * Either private or shared data with write access (in 6061 * which case we need to throw out all former translations 6062 * so that we get the right translations set up on fault 6063 * and we don't allow write access to any copy-on-write pages 6064 * that might be around or to prevent write access to pages 6065 * representing holes in a file), or we don't have permission 6066 * to access the memory at all (in which case we have to 6067 * unload any current translations that might exist). 6068 */ 6069 hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD); 6070 } else { 6071 /* 6072 * A shared mapping or a private mapping in which write 6073 * protection is going to be denied - just change all the 6074 * protections over the range of addresses in question. 6075 * segvn does not support any other attributes other 6076 * than prot so we can use hat_chgattr. 
6077 */ 6078 hat_chgattr(seg->s_as->a_hat, addr, len, prot); 6079 } 6080 6081 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6082 6083 return (0); 6084 } 6085 6086 /* 6087 * segvn_setpagesize is called via SEGOP_SETPAGESIZE from as_setpagesize, 6088 * to determine if the seg is capable of mapping the requested szc. 6089 */ 6090 static int 6091 segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc) 6092 { 6093 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6094 struct segvn_data *nsvd; 6095 struct anon_map *amp = svd->amp; 6096 struct seg *nseg; 6097 caddr_t eaddr = addr + len, a; 6098 size_t pgsz = page_get_pagesize(szc); 6099 pgcnt_t pgcnt = page_get_pagecnt(szc); 6100 int err; 6101 u_offset_t off = svd->offset + (uintptr_t)(addr - seg->s_base); 6102 6103 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as)); 6104 ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size); 6105 6106 if (seg->s_szc == szc || segvn_lpg_disable != 0) { 6107 return (0); 6108 } 6109 6110 /* 6111 * addr should always be pgsz aligned but eaddr may be misaligned if 6112 * it's at the end of the segment. 6113 * 6114 * XXX we should assert this condition since as_setpagesize() logic 6115 * guarantees it. 6116 */ 6117 if (!IS_P2ALIGNED(addr, pgsz) || 6118 (!IS_P2ALIGNED(eaddr, pgsz) && 6119 eaddr != seg->s_base + seg->s_size)) { 6120 6121 segvn_setpgsz_align_err++; 6122 return (EINVAL); 6123 } 6124 6125 if (amp != NULL && svd->type == MAP_SHARED) { 6126 ulong_t an_idx = svd->anon_index + seg_page(seg, addr); 6127 if (!IS_P2ALIGNED(an_idx, pgcnt)) { 6128 6129 segvn_setpgsz_anon_align_err++; 6130 return (EINVAL); 6131 } 6132 } 6133 6134 if ((svd->flags & MAP_NORESERVE) || seg->s_as == &kas || 6135 szc > segvn_maxpgszc) { 6136 return (EINVAL); 6137 } 6138 6139 /* paranoid check */ 6140 if (svd->vp != NULL && 6141 (IS_SWAPFSVP(svd->vp) || VN_ISKAS(svd->vp))) { 6142 return (EINVAL); 6143 } 6144 6145 if (seg->s_szc == 0 && svd->vp != NULL && 6146 map_addr_vacalign_check(addr, off)) { 6147 return (EINVAL); 6148 } 6149 6150 /* 6151 * Check that protections are the same within new page 6152 * size boundaries. 6153 */ 6154 if (svd->pageprot) { 6155 for (a = addr; a < eaddr; a += pgsz) { 6156 if ((a + pgsz) > eaddr) { 6157 if (!sameprot(seg, a, eaddr - a)) { 6158 return (EINVAL); 6159 } 6160 } else { 6161 if (!sameprot(seg, a, pgsz)) { 6162 return (EINVAL); 6163 } 6164 } 6165 } 6166 } 6167 6168 /* 6169 * Since we are changing page size we first have to flush 6170 * the cache. This makes sure all the pagelock calls have 6171 * to recheck protections. 6172 */ 6173 if (svd->softlockcnt > 0) { 6174 ASSERT(svd->tr_state == SEGVN_TR_OFF); 6175 6176 /* 6177 * If this is shared segment non 0 softlockcnt 6178 * means locked pages are still in use. 6179 */ 6180 if (svd->type == MAP_SHARED) { 6181 return (EAGAIN); 6182 } 6183 6184 /* 6185 * Since we do have the segvn writers lock nobody can fill 6186 * the cache with entries belonging to this seg during 6187 * the purge. The flush either succeeds or we still have 6188 * pending I/Os. 
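 * In the latter case EAGAIN is returned so the operation can be
 * retried once the outstanding I/O completes.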
6189 */ 6190 segvn_purge(seg); 6191 if (svd->softlockcnt > 0) { 6192 return (EAGAIN); 6193 } 6194 } 6195 6196 if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) { 6197 ASSERT(svd->amp == NULL); 6198 ASSERT(svd->tr_state == SEGVN_TR_OFF); 6199 hat_leave_region(seg->s_as->a_hat, svd->rcookie, 6200 HAT_REGION_TEXT); 6201 svd->rcookie = HAT_INVALID_REGION_COOKIE; 6202 } else if (svd->tr_state == SEGVN_TR_INIT) { 6203 svd->tr_state = SEGVN_TR_OFF; 6204 } else if (svd->tr_state == SEGVN_TR_ON) { 6205 ASSERT(svd->amp != NULL); 6206 segvn_textunrepl(seg, 1); 6207 ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF); 6208 amp = NULL; 6209 } 6210 6211 /* 6212 * Operation for sub range of existing segment. 6213 */ 6214 if (addr != seg->s_base || eaddr != (seg->s_base + seg->s_size)) { 6215 if (szc < seg->s_szc) { 6216 VM_STAT_ADD(segvnvmstats.demoterange[2]); 6217 err = segvn_demote_range(seg, addr, len, SDR_RANGE, 0); 6218 if (err == 0) { 6219 return (IE_RETRY); 6220 } 6221 if (err == ENOMEM) { 6222 return (IE_NOMEM); 6223 } 6224 return (err); 6225 } 6226 if (addr != seg->s_base) { 6227 nseg = segvn_split_seg(seg, addr); 6228 if (eaddr != (nseg->s_base + nseg->s_size)) { 6229 /* eaddr is szc aligned */ 6230 (void) segvn_split_seg(nseg, eaddr); 6231 } 6232 return (IE_RETRY); 6233 } 6234 if (eaddr != (seg->s_base + seg->s_size)) { 6235 /* eaddr is szc aligned */ 6236 (void) segvn_split_seg(seg, eaddr); 6237 } 6238 return (IE_RETRY); 6239 } 6240 6241 /* 6242 * Break any low level sharing and reset seg->s_szc to 0. 6243 */ 6244 if ((err = segvn_clrszc(seg)) != 0) { 6245 if (err == ENOMEM) { 6246 err = IE_NOMEM; 6247 } 6248 return (err); 6249 } 6250 ASSERT(seg->s_szc == 0); 6251 6252 /* 6253 * If the end of the current segment is not pgsz aligned 6254 * then attempt to concatenate with the next segment. 6255 */ 6256 if (!IS_P2ALIGNED(eaddr, pgsz)) { 6257 nseg = AS_SEGNEXT(seg->s_as, seg); 6258 if (nseg == NULL || nseg == seg || eaddr != nseg->s_base) { 6259 return (ENOMEM); 6260 } 6261 if (nseg->s_ops != &segvn_ops) { 6262 return (EINVAL); 6263 } 6264 nsvd = (struct segvn_data *)nseg->s_data; 6265 if (nsvd->softlockcnt > 0) { 6266 /* 6267 * If this is shared segment non 0 softlockcnt 6268 * means locked pages are still in use. 6269 */ 6270 if (nsvd->type == MAP_SHARED) { 6271 return (EAGAIN); 6272 } 6273 segvn_purge(nseg); 6274 if (nsvd->softlockcnt > 0) { 6275 return (EAGAIN); 6276 } 6277 } 6278 err = segvn_clrszc(nseg); 6279 if (err == ENOMEM) { 6280 err = IE_NOMEM; 6281 } 6282 if (err != 0) { 6283 return (err); 6284 } 6285 ASSERT(nsvd->rcookie == HAT_INVALID_REGION_COOKIE); 6286 err = segvn_concat(seg, nseg, 1); 6287 if (err == -1) { 6288 return (EINVAL); 6289 } 6290 if (err == -2) { 6291 return (IE_NOMEM); 6292 } 6293 return (IE_RETRY); 6294 } 6295 6296 /* 6297 * May need to re-align anon array to 6298 * new szc. 
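 * For a private segment whose anon_index is not aligned to the
 * new page count, a fresh anon_hdr is allocated and the existing
 * slots are copied down to index 0 so that the anon array is
 * naturally aligned for the larger page size.  Allocation
 * failures are reported as IE_NOMEM.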
6299 */
6300 if (amp != NULL) {
6301 if (!IS_P2ALIGNED(svd->anon_index, pgcnt)) {
6302 struct anon_hdr *nahp;
6303
6304 ASSERT(svd->type == MAP_PRIVATE);
6305
6306 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
6307 ASSERT(amp->refcnt == 1);
6308 nahp = anon_create(btop(amp->size), ANON_NOSLEEP);
6309 if (nahp == NULL) {
6310 ANON_LOCK_EXIT(&amp->a_rwlock);
6311 return (IE_NOMEM);
6312 }
6313 if (anon_copy_ptr(amp->ahp, svd->anon_index,
6314 nahp, 0, btop(seg->s_size), ANON_NOSLEEP)) {
6315 anon_release(nahp, btop(amp->size));
6316 ANON_LOCK_EXIT(&amp->a_rwlock);
6317 return (IE_NOMEM);
6318 }
6319 anon_release(amp->ahp, btop(amp->size));
6320 amp->ahp = nahp;
6321 svd->anon_index = 0;
6322 ANON_LOCK_EXIT(&amp->a_rwlock);
6323 }
6324 }
6325 if (svd->vp != NULL && szc != 0) {
6326 struct vattr va;
6327 u_offset_t eoffpage = svd->offset;
6328 va.va_mask = AT_SIZE;
6329 eoffpage += seg->s_size;
6330 eoffpage = btopr(eoffpage);
6331 if (VOP_GETATTR(svd->vp, &va, 0, svd->cred, NULL) != 0) {
6332 segvn_setpgsz_getattr_err++;
6333 return (EINVAL);
6334 }
6335 if (btopr(va.va_size) < eoffpage) {
6336 segvn_setpgsz_eof_err++;
6337 return (EINVAL);
6338 }
6339 if (amp != NULL) {
6340 /*
6341 * anon_fill_cow_holes() may call VOP_GETPAGE().
6342 * don't take anon map lock here to avoid holding it
6343 * across VOP_GETPAGE() calls that may call back into
6344 * segvn for klustering checks. We don't really need
6345 * anon map lock here since it's a private segment and
6346 * we hold as level lock as writers.
6347 */
6348 if ((err = anon_fill_cow_holes(seg, seg->s_base,
6349 amp->ahp, svd->anon_index, svd->vp, svd->offset,
6350 seg->s_size, szc, svd->prot, svd->vpage,
6351 svd->cred)) != 0) {
6352 return (EINVAL);
6353 }
6354 }
6355 segvn_setvnode_mpss(svd->vp);
6356 }
6357
6358 if (amp != NULL) {
6359 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
6360 if (svd->type == MAP_PRIVATE) {
6361 amp->a_szc = szc;
6362 } else if (szc > amp->a_szc) {
6363 amp->a_szc = szc;
6364 }
6365 ANON_LOCK_EXIT(&amp->a_rwlock);
6366 }
6367
6368 seg->s_szc = szc;
6369
6370 return (0);
6371 }
6372
6373 static int
6374 segvn_clrszc(struct seg *seg)
6375 {
6376 struct segvn_data *svd = (struct segvn_data *)seg->s_data;
6377 struct anon_map *amp = svd->amp;
6378 size_t pgsz;
6379 pgcnt_t pages;
6380 int err = 0;
6381 caddr_t a = seg->s_base;
6382 caddr_t ea = a + seg->s_size;
6383 ulong_t an_idx = svd->anon_index;
6384 vnode_t *vp = svd->vp;
6385 struct vpage *vpage = svd->vpage;
6386 page_t *anon_pl[1 + 1], *pp;
6387 struct anon *ap, *oldap;
6388 uint_t prot = svd->prot, vpprot;
6389 int pageflag = 0;
6390
6391 ASSERT(AS_WRITE_HELD(seg->s_as) ||
6392 SEGVN_WRITE_HELD(seg->s_as, &svd->lock));
6393 ASSERT(svd->softlockcnt == 0);
6394
6395 if (vp == NULL && amp == NULL) {
6396 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
6397 seg->s_szc = 0;
6398 return (0);
6399 }
6400
6401 if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) {
6402 ASSERT(svd->amp == NULL);
6403 ASSERT(svd->tr_state == SEGVN_TR_OFF);
6404 hat_leave_region(seg->s_as->a_hat, svd->rcookie,
6405 HAT_REGION_TEXT);
6406 svd->rcookie = HAT_INVALID_REGION_COOKIE;
6407 } else if (svd->tr_state == SEGVN_TR_ON) {
6408 ASSERT(svd->amp != NULL);
6409 segvn_textunrepl(seg, 1);
6410 ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF);
6411 amp = NULL;
6412 } else {
6413 if (svd->tr_state != SEGVN_TR_OFF) {
6414 ASSERT(svd->tr_state == SEGVN_TR_INIT);
6415 svd->tr_state = SEGVN_TR_OFF;
6416 }
6417
6418 /*
6419 * do HAT_UNLOAD_UNMAP since we are changing the pagesize.
6420 * unload argument is 0 when we are freeing the segment
6421 * and unload was already done.
6422 */
6423 hat_unload(seg->s_as->a_hat, seg->s_base, seg->s_size,
6424 HAT_UNLOAD_UNMAP);
6425 }
6426
6427 if (amp == NULL || svd->type == MAP_SHARED) {
6428 seg->s_szc = 0;
6429 return (0);
6430 }
6431
6432 pgsz = page_get_pagesize(seg->s_szc);
6433 pages = btop(pgsz);
6434
6435 /*
6436 * XXX anon rwlock is not really needed because this is a
6437 * private segment and we are writers.
6438 */
6439 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
6440
6441 for (; a < ea; a += pgsz, an_idx += pages) {
6442 if ((oldap = anon_get_ptr(amp->ahp, an_idx)) != NULL) {
6443 ASSERT(vpage != NULL || svd->pageprot == 0);
6444 if (vpage != NULL) {
6445 ASSERT(sameprot(seg, a, pgsz));
6446 prot = VPP_PROT(vpage);
6447 pageflag = VPP_ISPPLOCK(vpage) ? LOCK_PAGE : 0;
6448 }
6449 if (seg->s_szc != 0) {
6450 ASSERT(vp == NULL || anon_pages(amp->ahp,
6451 an_idx, pages) == pages);
6452 if ((err = anon_map_demotepages(amp, an_idx,
6453 seg, a, prot, vpage, svd->cred)) != 0) {
6454 goto out;
6455 }
6456 } else {
6457 if (oldap->an_refcnt == 1) {
6458 continue;
6459 }
6460 if ((err = anon_getpage(&oldap, &vpprot,
6461 anon_pl, PAGESIZE, seg, a, S_READ,
6462 svd->cred))) {
6463 goto out;
6464 }
6465 if ((pp = anon_private(&ap, seg, a, prot,
6466 anon_pl[0], pageflag, svd->cred)) == NULL) {
6467 err = ENOMEM;
6468 goto out;
6469 }
6470 anon_decref(oldap);
6471 (void) anon_set_ptr(amp->ahp, an_idx, ap,
6472 ANON_SLEEP);
6473 page_unlock(pp);
6474 }
6475 }
6476 vpage = (vpage == NULL) ? NULL : vpage + pages;
6477 }
6478
6479 amp->a_szc = 0;
6480 seg->s_szc = 0;
6481 out:
6482 ANON_LOCK_EXIT(&amp->a_rwlock);
6483 return (err);
6484 }
6485
6486 static int
6487 segvn_claim_pages(
6488 struct seg *seg,
6489 struct vpage *svp,
6490 u_offset_t off,
6491 ulong_t anon_idx,
6492 uint_t prot)
6493 {
6494 pgcnt_t pgcnt = page_get_pagecnt(seg->s_szc);
6495 size_t ppasize = (pgcnt + 1) * sizeof (page_t *);
6496 page_t **ppa;
6497 struct segvn_data *svd = (struct segvn_data *)seg->s_data;
6498 struct anon_map *amp = svd->amp;
6499 struct vpage *evp = svp + pgcnt;
6500 caddr_t addr = ((uintptr_t)(svp - svd->vpage) << PAGESHIFT)
6501 + seg->s_base;
6502 struct anon *ap;
6503 struct vnode *vp = svd->vp;
6504 page_t *pp;
6505 pgcnt_t pg_idx, i;
6506 int err = 0;
6507 anoff_t aoff;
6508 int anon = (amp != NULL) ?
1 : 0; 6509 6510 ASSERT(svd->type == MAP_PRIVATE); 6511 ASSERT(svd->vpage != NULL); 6512 ASSERT(seg->s_szc != 0); 6513 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 6514 ASSERT(amp == NULL || IS_P2ALIGNED(anon_idx, pgcnt)); 6515 ASSERT(sameprot(seg, addr, pgcnt << PAGESHIFT)); 6516 6517 if (VPP_PROT(svp) == prot) 6518 return (1); 6519 if (!((VPP_PROT(svp) ^ prot) & PROT_WRITE)) 6520 return (1); 6521 6522 ppa = kmem_alloc(ppasize, KM_SLEEP); 6523 if (anon && vp != NULL) { 6524 if (anon_get_ptr(amp->ahp, anon_idx) == NULL) { 6525 anon = 0; 6526 ASSERT(!anon_pages(amp->ahp, anon_idx, pgcnt)); 6527 } 6528 ASSERT(!anon || 6529 anon_pages(amp->ahp, anon_idx, pgcnt) == pgcnt); 6530 } 6531 6532 for (*ppa = NULL, pg_idx = 0; svp < evp; svp++, anon_idx++) { 6533 if (!VPP_ISPPLOCK(svp)) 6534 continue; 6535 if (anon) { 6536 ap = anon_get_ptr(amp->ahp, anon_idx); 6537 if (ap == NULL) { 6538 panic("segvn_claim_pages: no anon slot"); 6539 } 6540 swap_xlate(ap, &vp, &aoff); 6541 off = (u_offset_t)aoff; 6542 } 6543 ASSERT(vp != NULL); 6544 if ((pp = page_lookup(vp, 6545 (u_offset_t)off, SE_SHARED)) == NULL) { 6546 panic("segvn_claim_pages: no page"); 6547 } 6548 ppa[pg_idx++] = pp; 6549 off += PAGESIZE; 6550 } 6551 6552 if (ppa[0] == NULL) { 6553 kmem_free(ppa, ppasize); 6554 return (1); 6555 } 6556 6557 ASSERT(pg_idx <= pgcnt); 6558 ppa[pg_idx] = NULL; 6559 6560 6561 /* Find each large page within ppa, and adjust its claim */ 6562 6563 /* Does ppa cover a single large page? */ 6564 if (ppa[0]->p_szc == seg->s_szc) { 6565 if (prot & PROT_WRITE) 6566 err = page_addclaim_pages(ppa); 6567 else 6568 err = page_subclaim_pages(ppa); 6569 } else { 6570 for (i = 0; ppa[i]; i += pgcnt) { 6571 ASSERT(IS_P2ALIGNED(page_pptonum(ppa[i]), pgcnt)); 6572 if (prot & PROT_WRITE) 6573 err = page_addclaim_pages(&ppa[i]); 6574 else 6575 err = page_subclaim_pages(&ppa[i]); 6576 if (err == 0) 6577 break; 6578 } 6579 } 6580 6581 for (i = 0; i < pg_idx; i++) { 6582 ASSERT(ppa[i] != NULL); 6583 page_unlock(ppa[i]); 6584 } 6585 6586 kmem_free(ppa, ppasize); 6587 return (err); 6588 } 6589 6590 /* 6591 * Returns right (upper address) segment if split occurred. 6592 * If the address is equal to the beginning or end of its segment it returns 6593 * the current segment. 
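 * The address space must be held as writer.  The original
 * segment keeps the range below addr; the new segment takes over
 * everything from addr up, including copies of the vpage array
 * and, for private mappings, its own share of the anon map and
 * swap reservation.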
6594 */
6595 static struct seg *
6596 segvn_split_seg(struct seg *seg, caddr_t addr)
6597 {
6598 struct segvn_data *svd = (struct segvn_data *)seg->s_data;
6599 struct seg *nseg;
6600 size_t nsize;
6601 struct segvn_data *nsvd;
6602
6603 ASSERT(AS_WRITE_HELD(seg->s_as));
6604 ASSERT(svd->tr_state == SEGVN_TR_OFF);
6605
6606 ASSERT(addr >= seg->s_base);
6607 ASSERT(addr <= seg->s_base + seg->s_size);
6608 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
6609
6610 if (addr == seg->s_base || addr == seg->s_base + seg->s_size)
6611 return (seg);
6612
6613 nsize = seg->s_base + seg->s_size - addr;
6614 seg->s_size = addr - seg->s_base;
6615 nseg = seg_alloc(seg->s_as, addr, nsize);
6616 ASSERT(nseg != NULL);
6617 nseg->s_ops = seg->s_ops;
6618 nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP);
6619 nseg->s_data = (void *)nsvd;
6620 nseg->s_szc = seg->s_szc;
6621 *nsvd = *svd;
6622 ASSERT(nsvd->rcookie == HAT_INVALID_REGION_COOKIE);
6623 nsvd->seg = nseg;
6624 rw_init(&nsvd->lock, NULL, RW_DEFAULT, NULL);
6625
6626 if (nsvd->vp != NULL) {
6627 VN_HOLD(nsvd->vp);
6628 nsvd->offset = svd->offset +
6629 (uintptr_t)(nseg->s_base - seg->s_base);
6630 if (nsvd->type == MAP_SHARED)
6631 lgrp_shm_policy_init(NULL, nsvd->vp);
6632 } else {
6633 /*
6634 * The offset for an anonymous segment has no significance in
6635 * terms of an offset into a file. If we were to use the above
6636 * calculation instead, the structures read out of
6637 * /proc/<pid>/xmap would be more difficult to decipher since
6638 * it would be unclear whether two seemingly contiguous
6639 * prxmap_t structures represented different segments or a
6640 * single segment that had been split up into multiple prxmap_t
6641 * structures (e.g. if some part of the segment had not yet
6642 * been faulted in).
6643 */ 6644 nsvd->offset = 0; 6645 } 6646 6647 ASSERT(svd->softlockcnt == 0); 6648 ASSERT(svd->softlockcnt_sbase == 0); 6649 ASSERT(svd->softlockcnt_send == 0); 6650 crhold(svd->cred); 6651 6652 if (svd->vpage != NULL) { 6653 size_t bytes = vpgtob(seg_pages(seg)); 6654 size_t nbytes = vpgtob(seg_pages(nseg)); 6655 struct vpage *ovpage = svd->vpage; 6656 6657 svd->vpage = kmem_alloc(bytes, KM_SLEEP); 6658 bcopy(ovpage, svd->vpage, bytes); 6659 nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP); 6660 bcopy(ovpage + seg_pages(seg), nsvd->vpage, nbytes); 6661 kmem_free(ovpage, bytes + nbytes); 6662 } 6663 if (svd->amp != NULL && svd->type == MAP_PRIVATE) { 6664 struct anon_map *oamp = svd->amp, *namp; 6665 struct anon_hdr *nahp; 6666 6667 ANON_LOCK_ENTER(&oamp->a_rwlock, RW_WRITER); 6668 ASSERT(oamp->refcnt == 1); 6669 nahp = anon_create(btop(seg->s_size), ANON_SLEEP); 6670 (void) anon_copy_ptr(oamp->ahp, svd->anon_index, 6671 nahp, 0, btop(seg->s_size), ANON_SLEEP); 6672 6673 namp = anonmap_alloc(nseg->s_size, 0, ANON_SLEEP); 6674 namp->a_szc = nseg->s_szc; 6675 (void) anon_copy_ptr(oamp->ahp, 6676 svd->anon_index + btop(seg->s_size), 6677 namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP); 6678 anon_release(oamp->ahp, btop(oamp->size)); 6679 oamp->ahp = nahp; 6680 oamp->size = seg->s_size; 6681 svd->anon_index = 0; 6682 nsvd->amp = namp; 6683 nsvd->anon_index = 0; 6684 ANON_LOCK_EXIT(&oamp->a_rwlock); 6685 } else if (svd->amp != NULL) { 6686 pgcnt_t pgcnt = page_get_pagecnt(seg->s_szc); 6687 ASSERT(svd->amp == nsvd->amp); 6688 ASSERT(seg->s_szc <= svd->amp->a_szc); 6689 nsvd->anon_index = svd->anon_index + seg_pages(seg); 6690 ASSERT(IS_P2ALIGNED(nsvd->anon_index, pgcnt)); 6691 ANON_LOCK_ENTER(&svd->amp->a_rwlock, RW_WRITER); 6692 svd->amp->refcnt++; 6693 ANON_LOCK_EXIT(&svd->amp->a_rwlock); 6694 } 6695 6696 /* 6697 * Split the amount of swap reserved. 6698 */ 6699 if (svd->swresv) { 6700 /* 6701 * For MAP_NORESERVE, only allocate swap reserve for pages 6702 * being used. Other segments get enough to cover whole 6703 * segment. 6704 */ 6705 if (svd->flags & MAP_NORESERVE) { 6706 size_t oswresv; 6707 6708 ASSERT(svd->amp); 6709 oswresv = svd->swresv; 6710 svd->swresv = ptob(anon_pages(svd->amp->ahp, 6711 svd->anon_index, btop(seg->s_size))); 6712 nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp, 6713 nsvd->anon_index, btop(nseg->s_size))); 6714 ASSERT(oswresv >= (svd->swresv + nsvd->swresv)); 6715 } else { 6716 if (svd->pageswap) { 6717 svd->swresv = segvn_count_swap_by_vpages(seg); 6718 ASSERT(nsvd->swresv >= svd->swresv); 6719 nsvd->swresv -= svd->swresv; 6720 } else { 6721 ASSERT(svd->swresv == seg->s_size + 6722 nseg->s_size); 6723 svd->swresv = seg->s_size; 6724 nsvd->swresv = nseg->s_size; 6725 } 6726 } 6727 } 6728 6729 return (nseg); 6730 } 6731 6732 /* 6733 * called on memory operations (unmap, setprot, setpagesize) for a subset 6734 * of a large page segment to either demote the memory range (SDR_RANGE) 6735 * or the ends (SDR_END) by addr/len. 6736 * 6737 * returns 0 on success. returns errno, including ENOMEM, on failure. 
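 * SDR_RANGE demotes every large page overlapping [addr, addr + len);
 * SDR_END splits off and demotes only the large page(s) straddling
 * the ends of the range.  A nonzero szcvec (SDR_END on MAP_SHARED
 * segments only) lets the demoted ends be re-marked with a smaller
 * supported page size instead of dropping all the way to PAGESIZE.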
6738 */ 6739 static int 6740 segvn_demote_range( 6741 struct seg *seg, 6742 caddr_t addr, 6743 size_t len, 6744 int flag, 6745 uint_t szcvec) 6746 { 6747 caddr_t eaddr = addr + len; 6748 caddr_t lpgaddr, lpgeaddr; 6749 struct seg *nseg; 6750 struct seg *badseg1 = NULL; 6751 struct seg *badseg2 = NULL; 6752 size_t pgsz; 6753 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6754 int err; 6755 uint_t szc = seg->s_szc; 6756 uint_t tszcvec; 6757 6758 ASSERT(AS_WRITE_HELD(seg->s_as)); 6759 ASSERT(svd->tr_state == SEGVN_TR_OFF); 6760 ASSERT(szc != 0); 6761 pgsz = page_get_pagesize(szc); 6762 ASSERT(seg->s_base != addr || seg->s_size != len); 6763 ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size); 6764 ASSERT(svd->softlockcnt == 0); 6765 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 6766 ASSERT(szcvec == 0 || (flag == SDR_END && svd->type == MAP_SHARED)); 6767 6768 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 6769 ASSERT(flag == SDR_RANGE || eaddr < lpgeaddr || addr > lpgaddr); 6770 if (flag == SDR_RANGE) { 6771 /* demote entire range */ 6772 badseg1 = nseg = segvn_split_seg(seg, lpgaddr); 6773 (void) segvn_split_seg(nseg, lpgeaddr); 6774 ASSERT(badseg1->s_base == lpgaddr); 6775 ASSERT(badseg1->s_size == lpgeaddr - lpgaddr); 6776 } else if (addr != lpgaddr) { 6777 ASSERT(flag == SDR_END); 6778 badseg1 = nseg = segvn_split_seg(seg, lpgaddr); 6779 if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz && 6780 eaddr < lpgaddr + 2 * pgsz) { 6781 (void) segvn_split_seg(nseg, lpgeaddr); 6782 ASSERT(badseg1->s_base == lpgaddr); 6783 ASSERT(badseg1->s_size == 2 * pgsz); 6784 } else { 6785 nseg = segvn_split_seg(nseg, lpgaddr + pgsz); 6786 ASSERT(badseg1->s_base == lpgaddr); 6787 ASSERT(badseg1->s_size == pgsz); 6788 if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz) { 6789 ASSERT(lpgeaddr - lpgaddr > 2 * pgsz); 6790 nseg = segvn_split_seg(nseg, lpgeaddr - pgsz); 6791 badseg2 = nseg; 6792 (void) segvn_split_seg(nseg, lpgeaddr); 6793 ASSERT(badseg2->s_base == lpgeaddr - pgsz); 6794 ASSERT(badseg2->s_size == pgsz); 6795 } 6796 } 6797 } else { 6798 ASSERT(flag == SDR_END); 6799 ASSERT(eaddr < lpgeaddr); 6800 badseg1 = nseg = segvn_split_seg(seg, lpgeaddr - pgsz); 6801 (void) segvn_split_seg(nseg, lpgeaddr); 6802 ASSERT(badseg1->s_base == lpgeaddr - pgsz); 6803 ASSERT(badseg1->s_size == pgsz); 6804 } 6805 6806 ASSERT(badseg1 != NULL); 6807 ASSERT(badseg1->s_szc == szc); 6808 ASSERT(flag == SDR_RANGE || badseg1->s_size == pgsz || 6809 badseg1->s_size == 2 * pgsz); 6810 ASSERT(sameprot(badseg1, badseg1->s_base, pgsz)); 6811 ASSERT(badseg1->s_size == pgsz || 6812 sameprot(badseg1, badseg1->s_base + pgsz, pgsz)); 6813 if (err = segvn_clrszc(badseg1)) { 6814 return (err); 6815 } 6816 ASSERT(badseg1->s_szc == 0); 6817 6818 if (szc > 1 && (tszcvec = P2PHASE(szcvec, 1 << szc)) > 1) { 6819 uint_t tszc = highbit(tszcvec) - 1; 6820 caddr_t ta = MAX(addr, badseg1->s_base); 6821 caddr_t te; 6822 size_t tpgsz = page_get_pagesize(tszc); 6823 6824 ASSERT(svd->type == MAP_SHARED); 6825 ASSERT(flag == SDR_END); 6826 ASSERT(tszc < szc && tszc > 0); 6827 6828 if (eaddr > badseg1->s_base + badseg1->s_size) { 6829 te = badseg1->s_base + badseg1->s_size; 6830 } else { 6831 te = eaddr; 6832 } 6833 6834 ASSERT(ta <= te); 6835 badseg1->s_szc = tszc; 6836 if (!IS_P2ALIGNED(ta, tpgsz) || !IS_P2ALIGNED(te, tpgsz)) { 6837 if (badseg2 != NULL) { 6838 err = segvn_demote_range(badseg1, ta, te - ta, 6839 SDR_END, tszcvec); 6840 if (err != 0) { 6841 return (err); 6842 } 6843 } else { 6844 return 
(segvn_demote_range(badseg1, ta, 6845 te - ta, SDR_END, tszcvec)); 6846 } 6847 } 6848 } 6849 6850 if (badseg2 == NULL) 6851 return (0); 6852 ASSERT(badseg2->s_szc == szc); 6853 ASSERT(badseg2->s_size == pgsz); 6854 ASSERT(sameprot(badseg2, badseg2->s_base, badseg2->s_size)); 6855 if (err = segvn_clrszc(badseg2)) { 6856 return (err); 6857 } 6858 ASSERT(badseg2->s_szc == 0); 6859 6860 if (szc > 1 && (tszcvec = P2PHASE(szcvec, 1 << szc)) > 1) { 6861 uint_t tszc = highbit(tszcvec) - 1; 6862 size_t tpgsz = page_get_pagesize(tszc); 6863 6864 ASSERT(svd->type == MAP_SHARED); 6865 ASSERT(flag == SDR_END); 6866 ASSERT(tszc < szc && tszc > 0); 6867 ASSERT(badseg2->s_base > addr); 6868 ASSERT(eaddr > badseg2->s_base); 6869 ASSERT(eaddr < badseg2->s_base + badseg2->s_size); 6870 6871 badseg2->s_szc = tszc; 6872 if (!IS_P2ALIGNED(eaddr, tpgsz)) { 6873 return (segvn_demote_range(badseg2, badseg2->s_base, 6874 eaddr - badseg2->s_base, SDR_END, tszcvec)); 6875 } 6876 } 6877 6878 return (0); 6879 } 6880 6881 static int 6882 segvn_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) 6883 { 6884 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6885 struct vpage *vp, *evp; 6886 6887 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); 6888 6889 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6890 /* 6891 * If segment protection can be used, simply check against them. 6892 */ 6893 if (svd->pageprot == 0) { 6894 int err; 6895 6896 err = ((svd->prot & prot) != prot) ? EACCES : 0; 6897 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6898 return (err); 6899 } 6900 6901 /* 6902 * Have to check down to the vpage level. 6903 */ 6904 evp = &svd->vpage[seg_page(seg, addr + len)]; 6905 for (vp = &svd->vpage[seg_page(seg, addr)]; vp < evp; vp++) { 6906 if ((VPP_PROT(vp) & prot) != prot) { 6907 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6908 return (EACCES); 6909 } 6910 } 6911 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6912 return (0); 6913 } 6914 6915 static int 6916 segvn_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv) 6917 { 6918 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6919 size_t pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1; 6920 6921 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); 6922 6923 if (pgno != 0) { 6924 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6925 if (svd->pageprot == 0) { 6926 do { 6927 protv[--pgno] = svd->prot; 6928 } while (pgno != 0); 6929 } else { 6930 size_t pgoff = seg_page(seg, addr); 6931 6932 do { 6933 pgno--; 6934 protv[pgno] = VPP_PROT(&svd->vpage[pgno+pgoff]); 6935 } while (pgno != 0); 6936 } 6937 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6938 } 6939 return (0); 6940 } 6941 6942 static u_offset_t 6943 segvn_getoffset(struct seg *seg, caddr_t addr) 6944 { 6945 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6946 6947 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); 6948 6949 return (svd->offset + (uintptr_t)(addr - seg->s_base)); 6950 } 6951 6952 /*ARGSUSED*/ 6953 static int 6954 segvn_gettype(struct seg *seg, caddr_t addr) 6955 { 6956 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6957 6958 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); 6959 6960 return (svd->type | (svd->flags & (MAP_NORESERVE | MAP_TEXT | 6961 MAP_INITDATA))); 6962 } 6963 6964 /*ARGSUSED*/ 6965 static int 6966 segvn_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp) 6967 { 6968 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6969 6970 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); 6971 6972 *vpp = svd->vp; 6973 return (0); 6974 
} 6975 6976 /* 6977 * Check to see if it makes sense to do kluster/read ahead to 6978 * addr + delta relative to the mapping at addr. We assume here 6979 * that delta is a signed PAGESIZE'd multiple (which can be negative). 6980 * 6981 * For segvn, we currently "approve" of the action if we are 6982 * still in the segment and it maps from the same vp/off, 6983 * or if the advice stored in segvn_data or vpages allows it. 6984 * Currently, klustering is not allowed only if MADV_RANDOM is set. 6985 */ 6986 static int 6987 segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta) 6988 { 6989 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6990 struct anon *oap, *ap; 6991 ssize_t pd; 6992 size_t page; 6993 struct vnode *vp1, *vp2; 6994 u_offset_t off1, off2; 6995 struct anon_map *amp; 6996 6997 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); 6998 ASSERT(AS_WRITE_HELD(seg->s_as) || 6999 SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 7000 7001 if (addr + delta < seg->s_base || 7002 addr + delta >= (seg->s_base + seg->s_size)) 7003 return (-1); /* exceeded segment bounds */ 7004 7005 pd = delta / (ssize_t)PAGESIZE; /* divide to preserve sign bit */ 7006 page = seg_page(seg, addr); 7007 7008 /* 7009 * Check to see if either of the pages addr or addr + delta 7010 * have advice set that prevents klustering (if MADV_RANDOM advice 7011 * is set for entire segment, or MADV_SEQUENTIAL is set and delta 7012 * is negative). 7013 */ 7014 if (svd->advice == MADV_RANDOM || 7015 svd->advice == MADV_SEQUENTIAL && delta < 0) 7016 return (-1); 7017 else if (svd->pageadvice && svd->vpage) { 7018 struct vpage *bvpp, *evpp; 7019 7020 bvpp = &svd->vpage[page]; 7021 evpp = &svd->vpage[page + pd]; 7022 if (VPP_ADVICE(bvpp) == MADV_RANDOM || 7023 VPP_ADVICE(evpp) == MADV_SEQUENTIAL && delta < 0) 7024 return (-1); 7025 if (VPP_ADVICE(bvpp) != VPP_ADVICE(evpp) && 7026 VPP_ADVICE(evpp) == MADV_RANDOM) 7027 return (-1); 7028 } 7029 7030 if (svd->type == MAP_SHARED) 7031 return (0); /* shared mapping - all ok */ 7032 7033 if ((amp = svd->amp) == NULL) 7034 return (0); /* off original vnode */ 7035 7036 page += svd->anon_index; 7037 7038 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7039 7040 oap = anon_get_ptr(amp->ahp, page); 7041 ap = anon_get_ptr(amp->ahp, page + pd); 7042 7043 ANON_LOCK_EXIT(&->a_rwlock); 7044 7045 if ((oap == NULL && ap != NULL) || (oap != NULL && ap == NULL)) { 7046 return (-1); /* one with and one without an anon */ 7047 } 7048 7049 if (oap == NULL) { /* implies that ap == NULL */ 7050 return (0); /* off original vnode */ 7051 } 7052 7053 /* 7054 * Now we know we have two anon pointers - check to 7055 * see if they happen to be properly allocated. 7056 */ 7057 7058 /* 7059 * XXX We cheat here and don't lock the anon slots. We can't because 7060 * we may have been called from the anon layer which might already 7061 * have locked them. We are holding a refcnt on the slots so they 7062 * can't disappear. The worst that will happen is we'll get the wrong 7063 * names (vp, off) for the slots and make a poor klustering decision. 7064 */ 7065 swap_xlate(ap, &vp1, &off1); 7066 swap_xlate(oap, &vp2, &off2); 7067 7068 7069 if (!VOP_CMP(vp1, vp2, NULL) || off1 - off2 != delta) 7070 return (-1); 7071 return (0); 7072 } 7073 7074 /* 7075 * Swap the pages of seg out to secondary storage, returning the 7076 * number of bytes of storage freed. 7077 * 7078 * The basic idea is first to unload all translations and then to call 7079 * VOP_PUTPAGE() for all newly-unmapped pages, to push them out to the 7080 * swap device. 
Pages to which other segments have mappings will remain 7081 * mapped and won't be swapped. Our caller (as_swapout) has already 7082 * performed the unloading step. 7083 * 7084 * The value returned is intended to correlate well with the process's 7085 * memory requirements. However, there are some caveats: 7086 * 1) When given a shared segment as argument, this routine will 7087 * only succeed in swapping out pages for the last sharer of the 7088 * segment. (Previous callers will only have decremented mapping 7089 * reference counts.) 7090 * 2) We assume that the hat layer maintains a large enough translation 7091 * cache to capture process reference patterns. 7092 */ 7093 static size_t 7094 segvn_swapout(struct seg *seg) 7095 { 7096 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7097 struct anon_map *amp; 7098 pgcnt_t pgcnt = 0; 7099 pgcnt_t npages; 7100 pgcnt_t page; 7101 ulong_t anon_index; 7102 7103 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); 7104 7105 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7106 /* 7107 * Find pages unmapped by our caller and force them 7108 * out to the virtual swap device. 7109 */ 7110 if ((amp = svd->amp) != NULL) 7111 anon_index = svd->anon_index; 7112 npages = seg->s_size >> PAGESHIFT; 7113 for (page = 0; page < npages; page++) { 7114 page_t *pp; 7115 struct anon *ap; 7116 struct vnode *vp; 7117 u_offset_t off; 7118 anon_sync_obj_t cookie; 7119 7120 /* 7121 * Obtain <vp, off> pair for the page, then look it up. 7122 * 7123 * Note that this code is willing to consider regular 7124 * pages as well as anon pages. Is this appropriate here? 7125 */ 7126 ap = NULL; 7127 if (amp != NULL) { 7128 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7129 if (anon_array_try_enter(amp, anon_index + page, 7130 &cookie)) { 7131 ANON_LOCK_EXIT(&->a_rwlock); 7132 continue; 7133 } 7134 ap = anon_get_ptr(amp->ahp, anon_index + page); 7135 if (ap != NULL) { 7136 swap_xlate(ap, &vp, &off); 7137 } else { 7138 vp = svd->vp; 7139 off = svd->offset + ptob(page); 7140 } 7141 anon_array_exit(&cookie); 7142 ANON_LOCK_EXIT(&->a_rwlock); 7143 } else { 7144 vp = svd->vp; 7145 off = svd->offset + ptob(page); 7146 } 7147 if (vp == NULL) { /* untouched zfod page */ 7148 ASSERT(ap == NULL); 7149 continue; 7150 } 7151 7152 pp = page_lookup_nowait(vp, off, SE_SHARED); 7153 if (pp == NULL) 7154 continue; 7155 7156 7157 /* 7158 * Examine the page to see whether it can be tossed out, 7159 * keeping track of how many we've found. 7160 */ 7161 if (!page_tryupgrade(pp)) { 7162 /* 7163 * If the page has an i/o lock and no mappings, 7164 * it's very likely that the page is being 7165 * written out as a result of klustering. 7166 * Assume this is so and take credit for it here. 7167 */ 7168 if (!page_io_trylock(pp)) { 7169 if (!hat_page_is_mapped(pp)) 7170 pgcnt++; 7171 } else { 7172 page_io_unlock(pp); 7173 } 7174 page_unlock(pp); 7175 continue; 7176 } 7177 ASSERT(!page_iolock_assert(pp)); 7178 7179 7180 /* 7181 * Skip if page is locked or has mappings. 7182 * We don't need the page_struct_lock to look at lckcnt 7183 * and cowcnt because the page is exclusive locked. 7184 */ 7185 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0 || 7186 hat_page_is_mapped(pp)) { 7187 page_unlock(pp); 7188 continue; 7189 } 7190 7191 /* 7192 * dispose skips large pages so try to demote first. 7193 */ 7194 if (pp->p_szc != 0 && !page_try_demote_pages(pp)) { 7195 page_unlock(pp); 7196 /* 7197 * XXX should skip the remaining page_t's of this 7198 * large page. 
7199 */ 7200 continue; 7201 } 7202 7203 ASSERT(pp->p_szc == 0); 7204 7205 /* 7206 * No longer mapped -- we can toss it out. How 7207 * we do so depends on whether or not it's dirty. 7208 */ 7209 if (hat_ismod(pp) && pp->p_vnode) { 7210 /* 7211 * We must clean the page before it can be 7212 * freed. Setting B_FREE will cause pvn_done 7213 * to free the page when the i/o completes. 7214 * XXX: This also causes it to be accounted 7215 * as a pageout instead of a swap: need 7216 * B_SWAPOUT bit to use instead of B_FREE. 7217 * 7218 * Hold the vnode before releasing the page lock 7219 * to prevent it from being freed and re-used by 7220 * some other thread. 7221 */ 7222 VN_HOLD(vp); 7223 page_unlock(pp); 7224 7225 /* 7226 * Queue all i/o requests for the pageout thread 7227 * to avoid saturating the pageout devices. 7228 */ 7229 if (!queue_io_request(vp, off)) 7230 VN_RELE(vp); 7231 } else { 7232 /* 7233 * The page was clean, free it. 7234 * 7235 * XXX: Can we ever encounter modified pages 7236 * with no associated vnode here? 7237 */ 7238 ASSERT(pp->p_vnode != NULL); 7239 /*LINTED: constant in conditional context*/ 7240 VN_DISPOSE(pp, B_FREE, 0, kcred); 7241 } 7242 7243 /* 7244 * Credit now even if i/o is in progress. 7245 */ 7246 pgcnt++; 7247 } 7248 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7249 7250 /* 7251 * Wakeup pageout to initiate i/o on all queued requests. 7252 */ 7253 cv_signal_pageout(); 7254 return (ptob(pgcnt)); 7255 } 7256 7257 /* 7258 * Synchronize primary storage cache with real object in virtual memory. 7259 * 7260 * XXX - Anonymous pages should not be sync'ed out at all. 7261 */ 7262 static int 7263 segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) 7264 { 7265 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7266 struct vpage *vpp; 7267 page_t *pp; 7268 u_offset_t offset; 7269 struct vnode *vp; 7270 u_offset_t off; 7271 caddr_t eaddr; 7272 int bflags; 7273 int err = 0; 7274 int segtype; 7275 int pageprot; 7276 int prot; 7277 ulong_t anon_index; 7278 struct anon_map *amp; 7279 struct anon *ap; 7280 anon_sync_obj_t cookie; 7281 7282 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); 7283 7284 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7285 7286 if (svd->softlockcnt > 0) { 7287 /* 7288 * If this is shared segment non 0 softlockcnt 7289 * means locked pages are still in use. 7290 */ 7291 if (svd->type == MAP_SHARED) { 7292 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7293 return (EAGAIN); 7294 } 7295 7296 /* 7297 * flush all pages from seg cache 7298 * otherwise we may deadlock in swap_putpage 7299 * for B_INVAL page (4175402). 7300 * 7301 * Even if we grab segvn WRITER's lock 7302 * here, there might be another thread which could've 7303 * successfully performed lookup/insert just before 7304 * we acquired the lock here. So, grabbing either 7305 * lock here is of not much use. Until we devise 7306 * a strategy at upper layers to solve the 7307 * synchronization issues completely, we expect 7308 * applications to handle this appropriately. 7309 */ 7310 segvn_purge(seg); 7311 if (svd->softlockcnt > 0) { 7312 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7313 return (EAGAIN); 7314 } 7315 } else if (svd->type == MAP_SHARED && svd->amp != NULL && 7316 svd->amp->a_softlockcnt > 0) { 7317 /* 7318 * Try to purge this amp's entries from pcache. It will 7319 * succeed only if other segments that share the amp have no 7320 * outstanding softlock's. 
7321 */ 7322 segvn_purge(seg); 7323 if (svd->amp->a_softlockcnt > 0 || svd->softlockcnt > 0) { 7324 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7325 return (EAGAIN); 7326 } 7327 } 7328 7329 vpp = svd->vpage; 7330 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 7331 bflags = ((flags & MS_ASYNC) ? B_ASYNC : 0) | 7332 ((flags & MS_INVALIDATE) ? B_INVAL : 0); 7333 7334 if (attr) { 7335 pageprot = attr & ~(SHARED|PRIVATE); 7336 segtype = (attr & SHARED) ? MAP_SHARED : MAP_PRIVATE; 7337 7338 /* 7339 * We are done if the segment types don't match 7340 * or if we have segment level protections and 7341 * they don't match. 7342 */ 7343 if (svd->type != segtype) { 7344 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7345 return (0); 7346 } 7347 if (vpp == NULL) { 7348 if (svd->prot != pageprot) { 7349 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7350 return (0); 7351 } 7352 prot = svd->prot; 7353 } else 7354 vpp = &svd->vpage[seg_page(seg, addr)]; 7355 7356 } else if (svd->vp && svd->amp == NULL && 7357 (flags & MS_INVALIDATE) == 0) { 7358 7359 /* 7360 * No attributes, no anonymous pages and MS_INVALIDATE flag 7361 * is not on, just use one big request. 7362 */ 7363 err = VOP_PUTPAGE(svd->vp, (offset_t)offset, len, 7364 bflags, svd->cred, NULL); 7365 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7366 return (err); 7367 } 7368 7369 if ((amp = svd->amp) != NULL) 7370 anon_index = svd->anon_index + seg_page(seg, addr); 7371 7372 for (eaddr = addr + len; addr < eaddr; addr += PAGESIZE) { 7373 ap = NULL; 7374 if (amp != NULL) { 7375 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7376 anon_array_enter(amp, anon_index, &cookie); 7377 ap = anon_get_ptr(amp->ahp, anon_index++); 7378 if (ap != NULL) { 7379 swap_xlate(ap, &vp, &off); 7380 } else { 7381 vp = svd->vp; 7382 off = offset; 7383 } 7384 anon_array_exit(&cookie); 7385 ANON_LOCK_EXIT(&->a_rwlock); 7386 } else { 7387 vp = svd->vp; 7388 off = offset; 7389 } 7390 offset += PAGESIZE; 7391 7392 if (vp == NULL) /* untouched zfod page */ 7393 continue; 7394 7395 if (attr) { 7396 if (vpp) { 7397 prot = VPP_PROT(vpp); 7398 vpp++; 7399 } 7400 if (prot != pageprot) { 7401 continue; 7402 } 7403 } 7404 7405 /* 7406 * See if any of these pages are locked -- if so, then we 7407 * will have to truncate an invalidate request at the first 7408 * locked one. We don't need the page_struct_lock to test 7409 * as this is only advisory; even if we acquire it someone 7410 * might race in and lock the page after we unlock and before 7411 * we do the PUTPAGE, then PUTPAGE simply does nothing. 7412 */ 7413 if (flags & MS_INVALIDATE) { 7414 if ((pp = page_lookup(vp, off, SE_SHARED)) != NULL) { 7415 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 7416 page_unlock(pp); 7417 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7418 return (EBUSY); 7419 } 7420 if (ap != NULL && pp->p_szc != 0 && 7421 page_tryupgrade(pp)) { 7422 if (pp->p_lckcnt == 0 && 7423 pp->p_cowcnt == 0) { 7424 /* 7425 * swapfs VN_DISPOSE() won't 7426 * invalidate large pages. 7427 * Attempt to demote. 7428 * XXX can't help it if it 7429 * fails. But for swapfs 7430 * pages it is no big deal. 7431 */ 7432 (void) page_try_demote_pages( 7433 pp); 7434 } 7435 } 7436 page_unlock(pp); 7437 } 7438 } else if (svd->type == MAP_SHARED && amp != NULL) { 7439 /* 7440 * Avoid writing out to disk ISM's large pages 7441 * because segspt_free_pages() relies on NULL an_pvp 7442 * of anon slots of such pages. 
7443 */ 7444 7445 ASSERT(svd->vp == NULL); 7446 /* 7447 * swapfs uses page_lookup_nowait if not freeing or 7448 * invalidating and skips a page if 7449 * page_lookup_nowait returns NULL. 7450 */ 7451 pp = page_lookup_nowait(vp, off, SE_SHARED); 7452 if (pp == NULL) { 7453 continue; 7454 } 7455 if (pp->p_szc != 0) { 7456 page_unlock(pp); 7457 continue; 7458 } 7459 7460 /* 7461 * Note ISM pages are created large so (vp, off)'s 7462 * page cannot suddenly become large after we unlock 7463 * pp. 7464 */ 7465 page_unlock(pp); 7466 } 7467 /* 7468 * XXX - Should ultimately try to kluster 7469 * calls to VOP_PUTPAGE() for performance. 7470 */ 7471 VN_HOLD(vp); 7472 err = VOP_PUTPAGE(vp, (offset_t)off, PAGESIZE, 7473 (bflags | (IS_SWAPFSVP(vp) ? B_PAGE_NOWAIT : 0)), 7474 svd->cred, NULL); 7475 7476 VN_RELE(vp); 7477 if (err) 7478 break; 7479 } 7480 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7481 return (err); 7482 } 7483 7484 /* 7485 * Determine if we have data corresponding to pages in the 7486 * primary storage virtual memory cache (i.e., "in core"). 7487 */ 7488 static size_t 7489 segvn_incore(struct seg *seg, caddr_t addr, size_t len, char *vec) 7490 { 7491 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7492 struct vnode *vp, *avp; 7493 u_offset_t offset, aoffset; 7494 size_t p, ep; 7495 int ret; 7496 struct vpage *vpp; 7497 page_t *pp; 7498 uint_t start; 7499 struct anon_map *amp; /* XXX - for locknest */ 7500 struct anon *ap; 7501 uint_t attr; 7502 anon_sync_obj_t cookie; 7503 7504 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); 7505 7506 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7507 if (svd->amp == NULL && svd->vp == NULL) { 7508 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7509 bzero(vec, btopr(len)); 7510 return (len); /* no anonymous pages created yet */ 7511 } 7512 7513 p = seg_page(seg, addr); 7514 ep = seg_page(seg, addr + len); 7515 start = svd->vp ? SEG_PAGE_VNODEBACKED : 0; 7516 7517 amp = svd->amp; 7518 for (; p < ep; p++, addr += PAGESIZE) { 7519 vpp = (svd->vpage) ? &svd->vpage[p]: NULL; 7520 ret = start; 7521 ap = NULL; 7522 avp = NULL; 7523 /* Grab the vnode/offset for the anon slot */ 7524 if (amp != NULL) { 7525 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7526 anon_array_enter(amp, svd->anon_index + p, &cookie); 7527 ap = anon_get_ptr(amp->ahp, svd->anon_index + p); 7528 if (ap != NULL) { 7529 swap_xlate(ap, &avp, &aoffset); 7530 } 7531 anon_array_exit(&cookie); 7532 ANON_LOCK_EXIT(&->a_rwlock); 7533 } 7534 if ((avp != NULL) && page_exists(avp, aoffset)) { 7535 /* A page exists for the anon slot */ 7536 ret |= SEG_PAGE_INCORE; 7537 7538 /* 7539 * If page is mapped and writable 7540 */ 7541 attr = (uint_t)0; 7542 if ((hat_getattr(seg->s_as->a_hat, addr, 7543 &attr) != -1) && (attr & PROT_WRITE)) { 7544 ret |= SEG_PAGE_ANON; 7545 } 7546 /* 7547 * Don't get page_struct lock for lckcnt and cowcnt, 7548 * since this is purely advisory. 7549 */ 7550 if ((pp = page_lookup_nowait(avp, aoffset, 7551 SE_SHARED)) != NULL) { 7552 if (pp->p_lckcnt) 7553 ret |= SEG_PAGE_SOFTLOCK; 7554 if (pp->p_cowcnt) 7555 ret |= SEG_PAGE_HASCOW; 7556 page_unlock(pp); 7557 } 7558 } 7559 7560 /* Gather vnode statistics */ 7561 vp = svd->vp; 7562 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 7563 7564 if (vp != NULL) { 7565 /* 7566 * Try to obtain a "shared" lock on the page 7567 * without blocking. If this fails, determine 7568 * if the page is in memory. 
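 * A failed nowait lookup does not necessarily mean the page is
 * gone; page_exists() is consulted as well so that a page that is
 * merely exclusively locked elsewhere is still reported in core.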
7569 */ 7570 pp = page_lookup_nowait(vp, offset, SE_SHARED); 7571 if ((pp == NULL) && (page_exists(vp, offset))) { 7572 /* Page is incore, and is named */ 7573 ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE); 7574 } 7575 /* 7576 * Don't get page_struct lock for lckcnt and cowcnt, 7577 * since this is purely advisory. 7578 */ 7579 if (pp != NULL) { 7580 ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE); 7581 if (pp->p_lckcnt) 7582 ret |= SEG_PAGE_SOFTLOCK; 7583 if (pp->p_cowcnt) 7584 ret |= SEG_PAGE_HASCOW; 7585 page_unlock(pp); 7586 } 7587 } 7588 7589 /* Gather virtual page information */ 7590 if (vpp) { 7591 if (VPP_ISPPLOCK(vpp)) 7592 ret |= SEG_PAGE_LOCKED; 7593 vpp++; 7594 } 7595 7596 *vec++ = (char)ret; 7597 } 7598 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7599 return (len); 7600 } 7601 7602 /* 7603 * Statement for p_cowcnts/p_lckcnts. 7604 * 7605 * p_cowcnt is updated while mlock/munlocking MAP_PRIVATE and PROT_WRITE region 7606 * irrespective of the following factors or anything else: 7607 * 7608 * (1) anon slots are populated or not 7609 * (2) cow is broken or not 7610 * (3) refcnt on ap is 1 or greater than 1 7611 * 7612 * If it's not MAP_PRIVATE and PROT_WRITE, p_lckcnt is updated during mlock 7613 * and munlock. 7614 * 7615 * 7616 * Handling p_cowcnts/p_lckcnts during copy-on-write fault: 7617 * 7618 * if vpage has PROT_WRITE 7619 * transfer cowcnt on the oldpage -> cowcnt on the newpage 7620 * else 7621 * transfer lckcnt on the oldpage -> lckcnt on the newpage 7622 * 7623 * During copy-on-write, decrement p_cowcnt on the oldpage and increment 7624 * p_cowcnt on the newpage *if* the corresponding vpage has PROT_WRITE. 7625 * 7626 * We may also break COW if softlocking on read access in the physio case. 7627 * In this case, vpage may not have PROT_WRITE. So, we need to decrement 7628 * p_lckcnt on the oldpage and increment p_lckcnt on the newpage *if* the 7629 * vpage doesn't have PROT_WRITE. 7630 * 7631 * 7632 * Handling p_cowcnts/p_lckcnts during mprotect on mlocked region: 7633 * 7634 * If a MAP_PRIVATE region loses PROT_WRITE, we decrement p_cowcnt and 7635 * increment p_lckcnt by calling page_subclaim() which takes care of 7636 * availrmem accounting and p_lckcnt overflow. 7637 * 7638 * If a MAP_PRIVATE region gains PROT_WRITE, we decrement p_lckcnt and 7639 * increment p_cowcnt by calling page_addclaim() which takes care of 7640 * availrmem availability and p_cowcnt overflow. 7641 */ 7642 7643 /* 7644 * Lock down (or unlock) pages mapped by this segment. 7645 * 7646 * XXX only creates PAGESIZE pages if anon slots are not initialized. 7647 * At fault time they will be relocated into larger pages. 
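 * For SysV shared memory the locked-memory resource control is
 * charged against the shm's project rather than the calling
 * process, and each physical page is charged only once across all
 * sharers by watching p_lckcnt transitions.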
7648 */ 7649 static int 7650 segvn_lockop(struct seg *seg, caddr_t addr, size_t len, 7651 int attr, int op, ulong_t *lockmap, size_t pos) 7652 { 7653 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7654 struct vpage *vpp; 7655 struct vpage *evp; 7656 page_t *pp; 7657 u_offset_t offset; 7658 u_offset_t off; 7659 int segtype; 7660 int pageprot; 7661 int claim; 7662 struct vnode *vp; 7663 ulong_t anon_index; 7664 struct anon_map *amp; 7665 struct anon *ap; 7666 struct vattr va; 7667 anon_sync_obj_t cookie; 7668 struct kshmid *sp = NULL; 7669 struct proc *p = curproc; 7670 kproject_t *proj = NULL; 7671 int chargeproc = 1; 7672 size_t locked_bytes = 0; 7673 size_t unlocked_bytes = 0; 7674 int err = 0; 7675 7676 /* 7677 * Hold write lock on address space because may split or concatenate 7678 * segments 7679 */ 7680 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); 7681 7682 /* 7683 * If this is a shm, use shm's project and zone, else use 7684 * project and zone of calling process 7685 */ 7686 7687 /* Determine if this segment backs a sysV shm */ 7688 if (svd->amp != NULL && svd->amp->a_sp != NULL) { 7689 ASSERT(svd->type == MAP_SHARED); 7690 ASSERT(svd->tr_state == SEGVN_TR_OFF); 7691 sp = svd->amp->a_sp; 7692 proj = sp->shm_perm.ipc_proj; 7693 chargeproc = 0; 7694 } 7695 7696 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 7697 if (attr) { 7698 pageprot = attr & ~(SHARED|PRIVATE); 7699 segtype = attr & SHARED ? MAP_SHARED : MAP_PRIVATE; 7700 7701 /* 7702 * We are done if the segment types don't match 7703 * or if we have segment level protections and 7704 * they don't match. 7705 */ 7706 if (svd->type != segtype) { 7707 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7708 return (0); 7709 } 7710 if (svd->pageprot == 0 && svd->prot != pageprot) { 7711 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7712 return (0); 7713 } 7714 } 7715 7716 if (op == MC_LOCK) { 7717 if (svd->tr_state == SEGVN_TR_INIT) { 7718 svd->tr_state = SEGVN_TR_OFF; 7719 } else if (svd->tr_state == SEGVN_TR_ON) { 7720 ASSERT(svd->amp != NULL); 7721 segvn_textunrepl(seg, 0); 7722 ASSERT(svd->amp == NULL && 7723 svd->tr_state == SEGVN_TR_OFF); 7724 } 7725 } 7726 7727 /* 7728 * If we're locking, then we must create a vpage structure if 7729 * none exists. If we're unlocking, then check to see if there 7730 * is a vpage -- if not, then we could not have locked anything. 7731 */ 7732 7733 if ((vpp = svd->vpage) == NULL) { 7734 if (op == MC_LOCK) { 7735 segvn_vpage(seg); 7736 if (svd->vpage == NULL) { 7737 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7738 return (ENOMEM); 7739 } 7740 } else { 7741 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7742 return (0); 7743 } 7744 } 7745 7746 /* 7747 * The anonymous data vector (i.e., previously 7748 * unreferenced mapping to swap space) can be allocated 7749 * by lazily testing for its existence. 
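 * (An MC_LOCK of a segment with neither a vnode nor an anon map
 * allocates the anon map here, so that anon slots can be created
 * and zero-filled pages locked down in the loop below.)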
7750 */
7751 if (op == MC_LOCK && svd->amp == NULL && svd->vp == NULL) {
7752 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE);
7753 svd->amp = anonmap_alloc(seg->s_size, 0, ANON_SLEEP);
7754 svd->amp->a_szc = seg->s_szc;
7755 }
7756
7757 if ((amp = svd->amp) != NULL) {
7758 anon_index = svd->anon_index + seg_page(seg, addr);
7759 }
7760
7761 offset = svd->offset + (uintptr_t)(addr - seg->s_base);
7762 evp = &svd->vpage[seg_page(seg, addr + len)];
7763
7764 if (sp != NULL)
7765 mutex_enter(&sp->shm_mlock);
7766
7767 /* determine number of unlocked bytes in range for lock operation */
7768 if (op == MC_LOCK) {
7769
7770 if (sp == NULL) {
7771 for (vpp = &svd->vpage[seg_page(seg, addr)]; vpp < evp;
7772 vpp++) {
7773 if (!VPP_ISPPLOCK(vpp))
7774 unlocked_bytes += PAGESIZE;
7775 }
7776 } else {
7777 ulong_t i_idx, i_edx;
7778 anon_sync_obj_t i_cookie;
7779 struct anon *i_ap;
7780 struct vnode *i_vp;
7781 u_offset_t i_off;
7782
7783 /* Only count sysV pages once for locked memory */
7784 i_edx = svd->anon_index + seg_page(seg, addr + len);
7785 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
7786 for (i_idx = anon_index; i_idx < i_edx; i_idx++) {
7787 anon_array_enter(amp, i_idx, &i_cookie);
7788 i_ap = anon_get_ptr(amp->ahp, i_idx);
7789 if (i_ap == NULL) {
7790 unlocked_bytes += PAGESIZE;
7791 anon_array_exit(&i_cookie);
7792 continue;
7793 }
7794 swap_xlate(i_ap, &i_vp, &i_off);
7795 anon_array_exit(&i_cookie);
7796 pp = page_lookup(i_vp, i_off, SE_SHARED);
7797 if (pp == NULL) {
7798 unlocked_bytes += PAGESIZE;
7799 continue;
7800 } else if (pp->p_lckcnt == 0)
7801 unlocked_bytes += PAGESIZE;
7802 page_unlock(pp);
7803 }
7804 ANON_LOCK_EXIT(&amp->a_rwlock);
7805 }
7806
7807 mutex_enter(&p->p_lock);
7808 err = rctl_incr_locked_mem(p, proj, unlocked_bytes,
7809 chargeproc);
7810 mutex_exit(&p->p_lock);
7811
7812 if (err) {
7813 if (sp != NULL)
7814 mutex_exit(&sp->shm_mlock);
7815 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7816 return (err);
7817 }
7818 }
7819 /*
7820 * Loop over all pages in the range. Process if we're locking and
7821 * page has not already been locked in this mapping; or if we're
7822 * unlocking and the page has been locked.
7823 */
7824 for (vpp = &svd->vpage[seg_page(seg, addr)]; vpp < evp;
7825 vpp++, pos++, addr += PAGESIZE, offset += PAGESIZE, anon_index++) {
7826 if ((attr == 0 || VPP_PROT(vpp) == pageprot) &&
7827 ((op == MC_LOCK && !VPP_ISPPLOCK(vpp)) ||
7828 (op == MC_UNLOCK && VPP_ISPPLOCK(vpp)))) {
7829
7830 if (amp != NULL)
7831 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
7832 /*
7833 * If this isn't a MAP_NORESERVE segment and
7834 * we're locking, allocate anon slots if they
7835 * don't exist. The page is brought in later on.
7836 */
7837 if (op == MC_LOCK && svd->vp == NULL &&
7838 ((svd->flags & MAP_NORESERVE) == 0) &&
7839 amp != NULL &&
7840 ((ap = anon_get_ptr(amp->ahp, anon_index))
7841 == NULL)) {
7842 anon_array_enter(amp, anon_index, &cookie);
7843
7844 if ((ap = anon_get_ptr(amp->ahp,
7845 anon_index)) == NULL) {
7846 pp = anon_zero(seg, addr, &ap,
7847 svd->cred);
7848 if (pp == NULL) {
7849 anon_array_exit(&cookie);
7850 ANON_LOCK_EXIT(&amp->a_rwlock);
7851 err = ENOMEM;
7852 goto out;
7853 }
7854 ASSERT(anon_get_ptr(amp->ahp,
7855 anon_index) == NULL);
7856 (void) anon_set_ptr(amp->ahp,
7857 anon_index, ap, ANON_SLEEP);
7858 page_unlock(pp);
7859 }
7860 anon_array_exit(&cookie);
7861 }
7862
7863 /*
7864 * Get name for page, accounting for
7865 * existence of private copy.
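 * An existing anon slot names the page by its <swap vnode, offset>
 * pair via swap_xlate(); otherwise the segment's backing vnode and
 * the corresponding file offset are used.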
7866 */ 7867 ap = NULL; 7868 if (amp != NULL) { 7869 anon_array_enter(amp, anon_index, &cookie); 7870 ap = anon_get_ptr(amp->ahp, anon_index); 7871 if (ap != NULL) { 7872 swap_xlate(ap, &vp, &off); 7873 } else { 7874 if (svd->vp == NULL && 7875 (svd->flags & MAP_NORESERVE)) { 7876 anon_array_exit(&cookie); 7877 ANON_LOCK_EXIT(&->a_rwlock); 7878 continue; 7879 } 7880 vp = svd->vp; 7881 off = offset; 7882 } 7883 if (op != MC_LOCK || ap == NULL) { 7884 anon_array_exit(&cookie); 7885 ANON_LOCK_EXIT(&->a_rwlock); 7886 } 7887 } else { 7888 vp = svd->vp; 7889 off = offset; 7890 } 7891 7892 /* 7893 * Get page frame. It's ok if the page is 7894 * not available when we're unlocking, as this 7895 * may simply mean that a page we locked got 7896 * truncated out of existence after we locked it. 7897 * 7898 * Invoke VOP_GETPAGE() to obtain the page struct 7899 * since we may need to read it from disk if its 7900 * been paged out. 7901 */ 7902 if (op != MC_LOCK) 7903 pp = page_lookup(vp, off, SE_SHARED); 7904 else { 7905 page_t *pl[1 + 1]; 7906 int error; 7907 7908 ASSERT(vp != NULL); 7909 7910 error = VOP_GETPAGE(vp, (offset_t)off, PAGESIZE, 7911 (uint_t *)NULL, pl, PAGESIZE, seg, addr, 7912 S_OTHER, svd->cred, NULL); 7913 7914 if (error && ap != NULL) { 7915 anon_array_exit(&cookie); 7916 ANON_LOCK_EXIT(&->a_rwlock); 7917 } 7918 7919 /* 7920 * If the error is EDEADLK then we must bounce 7921 * up and drop all vm subsystem locks and then 7922 * retry the operation later 7923 * This behavior is a temporary measure because 7924 * ufs/sds logging is badly designed and will 7925 * deadlock if we don't allow this bounce to 7926 * happen. The real solution is to re-design 7927 * the logging code to work properly. See bug 7928 * 4125102 for details of the problem. 7929 */ 7930 if (error == EDEADLK) { 7931 err = error; 7932 goto out; 7933 } 7934 /* 7935 * Quit if we fail to fault in the page. Treat 7936 * the failure as an error, unless the addr 7937 * is mapped beyond the end of a file. 7938 */ 7939 if (error && svd->vp) { 7940 va.va_mask = AT_SIZE; 7941 if (VOP_GETATTR(svd->vp, &va, 0, 7942 svd->cred, NULL) != 0) { 7943 err = EIO; 7944 goto out; 7945 } 7946 if (btopr(va.va_size) >= 7947 btopr(off + 1)) { 7948 err = EIO; 7949 goto out; 7950 } 7951 goto out; 7952 7953 } else if (error) { 7954 err = EIO; 7955 goto out; 7956 } 7957 pp = pl[0]; 7958 ASSERT(pp != NULL); 7959 } 7960 7961 /* 7962 * See Statement at the beginning of this routine. 7963 * 7964 * claim is always set if MAP_PRIVATE and PROT_WRITE 7965 * irrespective of following factors: 7966 * 7967 * (1) anon slots are populated or not 7968 * (2) cow is broken or not 7969 * (3) refcnt on ap is 1 or greater than 1 7970 * 7971 * See 4140683 for details 7972 */ 7973 claim = ((VPP_PROT(vpp) & PROT_WRITE) && 7974 (svd->type == MAP_PRIVATE)); 7975 7976 /* 7977 * Perform page-level operation appropriate to 7978 * operation. If locking, undo the SOFTLOCK 7979 * performed to bring the page into memory 7980 * after setting the lock. If unlocking, 7981 * and no page was found, account for the claim 7982 * separately. 
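 * page_pp_lock()/page_pp_unlock() adjust p_lckcnt or p_cowcnt
 * depending on 'claim'; VPP_SETPPLOCK()/VPP_CLRPPLOCK() record the
 * per-page lock state in the vpage array, and lockmap, when
 * supplied, notes which pages this call actually locked.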
7983 */ 7984 if (op == MC_LOCK) { 7985 int ret = 1; /* Assume success */ 7986 7987 ASSERT(!VPP_ISPPLOCK(vpp)); 7988 7989 ret = page_pp_lock(pp, claim, 0); 7990 if (ap != NULL) { 7991 if (ap->an_pvp != NULL) { 7992 anon_swap_free(ap, pp); 7993 } 7994 anon_array_exit(&cookie); 7995 ANON_LOCK_EXIT(&->a_rwlock); 7996 } 7997 if (ret == 0) { 7998 /* locking page failed */ 7999 page_unlock(pp); 8000 err = EAGAIN; 8001 goto out; 8002 } 8003 VPP_SETPPLOCK(vpp); 8004 if (sp != NULL) { 8005 if (pp->p_lckcnt == 1) 8006 locked_bytes += PAGESIZE; 8007 } else 8008 locked_bytes += PAGESIZE; 8009 8010 if (lockmap != (ulong_t *)NULL) 8011 BT_SET(lockmap, pos); 8012 8013 page_unlock(pp); 8014 } else { 8015 ASSERT(VPP_ISPPLOCK(vpp)); 8016 if (pp != NULL) { 8017 /* sysV pages should be locked */ 8018 ASSERT(sp == NULL || pp->p_lckcnt > 0); 8019 page_pp_unlock(pp, claim, 0); 8020 if (sp != NULL) { 8021 if (pp->p_lckcnt == 0) 8022 unlocked_bytes 8023 += PAGESIZE; 8024 } else 8025 unlocked_bytes += PAGESIZE; 8026 page_unlock(pp); 8027 } else { 8028 ASSERT(sp == NULL); 8029 unlocked_bytes += PAGESIZE; 8030 } 8031 VPP_CLRPPLOCK(vpp); 8032 } 8033 } 8034 } 8035 out: 8036 if (op == MC_LOCK) { 8037 /* Credit back bytes that did not get locked */ 8038 if ((unlocked_bytes - locked_bytes) > 0) { 8039 if (proj == NULL) 8040 mutex_enter(&p->p_lock); 8041 rctl_decr_locked_mem(p, proj, 8042 (unlocked_bytes - locked_bytes), chargeproc); 8043 if (proj == NULL) 8044 mutex_exit(&p->p_lock); 8045 } 8046 8047 } else { 8048 /* Account bytes that were unlocked */ 8049 if (unlocked_bytes > 0) { 8050 if (proj == NULL) 8051 mutex_enter(&p->p_lock); 8052 rctl_decr_locked_mem(p, proj, unlocked_bytes, 8053 chargeproc); 8054 if (proj == NULL) 8055 mutex_exit(&p->p_lock); 8056 } 8057 } 8058 if (sp != NULL) 8059 mutex_exit(&sp->shm_mlock); 8060 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8061 8062 return (err); 8063 } 8064 8065 /* 8066 * Set advice from user for specified pages 8067 * There are 10 types of advice: 8068 * MADV_NORMAL - Normal (default) behavior (whatever that is) 8069 * MADV_RANDOM - Random page references 8070 * do not allow readahead or 'klustering' 8071 * MADV_SEQUENTIAL - Sequential page references 8072 * Pages previous to the one currently being 8073 * accessed (determined by fault) are 'not needed' 8074 * and are freed immediately 8075 * MADV_WILLNEED - Pages are likely to be used (fault ahead in mctl) 8076 * MADV_DONTNEED - Pages are not needed (synced out in mctl) 8077 * MADV_FREE - Contents can be discarded 8078 * MADV_ACCESS_DEFAULT- Default access 8079 * MADV_ACCESS_LWP - Next LWP will access heavily 8080 * MADV_ACCESS_MANY- Many LWPs or processes will access heavily 8081 * MADV_PURGE - Contents will be immediately discarded 8082 */ 8083 static int 8084 segvn_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav) 8085 { 8086 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 8087 size_t page; 8088 int err = 0; 8089 int already_set; 8090 struct anon_map *amp; 8091 ulong_t anon_index; 8092 struct seg *next; 8093 lgrp_mem_policy_t policy; 8094 struct seg *prev; 8095 struct vnode *vp; 8096 8097 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); 8098 8099 /* 8100 * In case of MADV_FREE/MADV_PURGE, we won't be modifying any segment 8101 * private data structures; so, we only need to grab READER's lock 8102 */ 8103 if (behav != MADV_FREE && behav != MADV_PURGE) { 8104 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 8105 if (svd->tr_state != SEGVN_TR_OFF) { 8106 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8107 
return (0); 8108 } 8109 } else { 8110 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 8111 } 8112 8113 /* 8114 * Large pages are assumed to be only turned on when accesses to the 8115 * segment's address range have spatial and temporal locality. That 8116 * justifies ignoring MADV_SEQUENTIAL for large page segments. 8117 * Also, ignore advice affecting lgroup memory allocation 8118 * if don't need to do lgroup optimizations on this system 8119 */ 8120 8121 if ((behav == MADV_SEQUENTIAL && 8122 (seg->s_szc != 0 || HAT_IS_REGION_COOKIE_VALID(svd->rcookie))) || 8123 (!lgrp_optimizations() && (behav == MADV_ACCESS_DEFAULT || 8124 behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY))) { 8125 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8126 return (0); 8127 } 8128 8129 if (behav == MADV_SEQUENTIAL || behav == MADV_ACCESS_DEFAULT || 8130 behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY) { 8131 /* 8132 * Since we are going to unload hat mappings 8133 * we first have to flush the cache. Otherwise 8134 * this might lead to system panic if another 8135 * thread is doing physio on the range whose 8136 * mappings are unloaded by madvise(3C). 8137 */ 8138 if (svd->softlockcnt > 0) { 8139 /* 8140 * If this is shared segment non 0 softlockcnt 8141 * means locked pages are still in use. 8142 */ 8143 if (svd->type == MAP_SHARED) { 8144 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8145 return (EAGAIN); 8146 } 8147 /* 8148 * Since we do have the segvn writers lock 8149 * nobody can fill the cache with entries 8150 * belonging to this seg during the purge. 8151 * The flush either succeeds or we still 8152 * have pending I/Os. In the later case, 8153 * madvise(3C) fails. 8154 */ 8155 segvn_purge(seg); 8156 if (svd->softlockcnt > 0) { 8157 /* 8158 * Since madvise(3C) is advisory and 8159 * it's not part of UNIX98, madvise(3C) 8160 * failure here doesn't cause any hardship. 8161 * Note that we don't block in "as" layer. 8162 */ 8163 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8164 return (EAGAIN); 8165 } 8166 } else if (svd->type == MAP_SHARED && svd->amp != NULL && 8167 svd->amp->a_softlockcnt > 0) { 8168 /* 8169 * Try to purge this amp's entries from pcache. It 8170 * will succeed only if other segments that share the 8171 * amp have no outstanding softlock's. 8172 */ 8173 segvn_purge(seg); 8174 } 8175 } 8176 8177 amp = svd->amp; 8178 vp = svd->vp; 8179 if (behav == MADV_FREE || behav == MADV_PURGE) { 8180 pgcnt_t purged; 8181 8182 if (behav == MADV_FREE && (vp != NULL || amp == NULL)) { 8183 /* 8184 * MADV_FREE is not supported for segments with an 8185 * underlying object; if anonmap is NULL, anon slots 8186 * are not yet populated and there is nothing for us 8187 * to do. As MADV_FREE is advisory, we don't return an 8188 * error in either case. 8189 */ 8190 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8191 return (0); 8192 } 8193 8194 if (amp == NULL) { 8195 /* 8196 * If we're here with a NULL anonmap, it's because we 8197 * are doing a MADV_PURGE. We have nothing to do, but 8198 * because MADV_PURGE isn't merely advisory, we return 8199 * an error in this case. 8200 */ 8201 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8202 return (EBUSY); 8203 } 8204 8205 segvn_purge(seg); 8206 8207 page = seg_page(seg, addr); 8208 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 8209 err = anon_disclaim(amp, 8210 svd->anon_index + page, len, behav, &purged); 8211 8212 if (purged != 0 && (svd->flags & MAP_NORESERVE)) { 8213 /* 8214 * If we purged pages on a MAP_NORESERVE mapping, we 8215 * need to be sure to now unreserve our reserved swap. 
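* For illustration, assuming 8K pages, a purge that discarded 32 anon
* pages returns ptob(32) = 256K to the zone's anon reservation and
* subtracts the same amount from svd->swresv and the address space's
* a_resvsize.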
8216 * (We use the atomic operations to manipulate our 8217 * segment and address space counters because we only 8218 * have the corresponding locks held as reader, not 8219 * writer.) 8220 */ 8221 ssize_t bytes = ptob(purged); 8222 8223 anon_unresv_zone(bytes, seg->s_as->a_proc->p_zone); 8224 atomic_add_long(&svd->swresv, -bytes); 8225 atomic_add_long(&seg->s_as->a_resvsize, -bytes); 8226 } 8227 8228 ANON_LOCK_EXIT(&->a_rwlock); 8229 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8230 8231 /* 8232 * MADV_PURGE and MADV_FREE differ in their return semantics: 8233 * because MADV_PURGE is designed to be bug-for-bug compatible 8234 * with its clumsy Linux forebear, it will fail where MADV_FREE 8235 * does not. 8236 */ 8237 return (behav == MADV_PURGE ? err : 0); 8238 } 8239 8240 /* 8241 * If advice is to be applied to entire segment, 8242 * use advice field in seg_data structure 8243 * otherwise use appropriate vpage entry. 8244 */ 8245 if ((addr == seg->s_base) && (len == seg->s_size)) { 8246 switch (behav) { 8247 case MADV_ACCESS_LWP: 8248 case MADV_ACCESS_MANY: 8249 case MADV_ACCESS_DEFAULT: 8250 /* 8251 * Set memory allocation policy for this segment 8252 */ 8253 policy = lgrp_madv_to_policy(behav, len, svd->type); 8254 if (svd->type == MAP_SHARED) 8255 already_set = lgrp_shm_policy_set(policy, amp, 8256 svd->anon_index, vp, svd->offset, len); 8257 else { 8258 /* 8259 * For private memory, need writers lock on 8260 * address space because the segment may be 8261 * split or concatenated when changing policy 8262 */ 8263 if (AS_READ_HELD(seg->s_as)) { 8264 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8265 return (IE_RETRY); 8266 } 8267 8268 already_set = lgrp_privm_policy_set(policy, 8269 &svd->policy_info, len); 8270 } 8271 8272 /* 8273 * If policy set already and it shouldn't be reapplied, 8274 * don't do anything. 8275 */ 8276 if (already_set && 8277 !LGRP_MEM_POLICY_REAPPLICABLE(policy)) 8278 break; 8279 8280 /* 8281 * Mark any existing pages in given range for 8282 * migration 8283 */ 8284 page_mark_migrate(seg, addr, len, amp, svd->anon_index, 8285 vp, svd->offset, 1); 8286 8287 /* 8288 * If same policy set already or this is a shared 8289 * memory segment, don't need to try to concatenate 8290 * segment with adjacent ones. 8291 */ 8292 if (already_set || svd->type == MAP_SHARED) 8293 break; 8294 8295 /* 8296 * Try to concatenate this segment with previous 8297 * one and next one, since we changed policy for 8298 * this one and it may be compatible with adjacent 8299 * ones now. 
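* Schematically, assuming the neighbors are segvn segments whose
* attributes now match:
*
*	before:	[ prev ][ seg ][ next ]
*	after:	[        prev         ]
*
* Merging "next" into "seg" leaves "seg" valid, but merging "seg" into
* "prev" frees "seg", which is why that path below must drop svd->lock
* first and return IE_REATTACH so as_ctl() looks the segment up again.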
8300 */ 8301 prev = AS_SEGPREV(seg->s_as, seg); 8302 next = AS_SEGNEXT(seg->s_as, seg); 8303 8304 if (next && next->s_ops == &segvn_ops && 8305 addr + len == next->s_base) 8306 (void) segvn_concat(seg, next, 1); 8307 8308 if (prev && prev->s_ops == &segvn_ops && 8309 addr == prev->s_base + prev->s_size) { 8310 /* 8311 * Drop lock for private data of current 8312 * segment before concatenating (deleting) it 8313 * and return IE_REATTACH to tell as_ctl() that 8314 * current segment has changed 8315 */ 8316 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8317 if (!segvn_concat(prev, seg, 1)) 8318 err = IE_REATTACH; 8319 8320 return (err); 8321 } 8322 break; 8323 8324 case MADV_SEQUENTIAL: 8325 /* 8326 * unloading mapping guarantees 8327 * detection in segvn_fault 8328 */ 8329 ASSERT(seg->s_szc == 0); 8330 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 8331 hat_unload(seg->s_as->a_hat, addr, len, 8332 HAT_UNLOAD); 8333 /* FALLTHROUGH */ 8334 case MADV_NORMAL: 8335 case MADV_RANDOM: 8336 svd->advice = (uchar_t)behav; 8337 svd->pageadvice = 0; 8338 break; 8339 case MADV_WILLNEED: /* handled in memcntl */ 8340 case MADV_DONTNEED: /* handled in memcntl */ 8341 case MADV_FREE: /* handled above */ 8342 case MADV_PURGE: /* handled above */ 8343 break; 8344 default: 8345 err = EINVAL; 8346 } 8347 } else { 8348 caddr_t eaddr; 8349 struct seg *new_seg; 8350 struct segvn_data *new_svd; 8351 u_offset_t off; 8352 caddr_t oldeaddr; 8353 8354 page = seg_page(seg, addr); 8355 8356 segvn_vpage(seg); 8357 if (svd->vpage == NULL) { 8358 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8359 return (ENOMEM); 8360 } 8361 8362 switch (behav) { 8363 struct vpage *bvpp, *evpp; 8364 8365 case MADV_ACCESS_LWP: 8366 case MADV_ACCESS_MANY: 8367 case MADV_ACCESS_DEFAULT: 8368 /* 8369 * Set memory allocation policy for portion of this 8370 * segment 8371 */ 8372 8373 /* 8374 * Align address and length of advice to page 8375 * boundaries for large pages 8376 */ 8377 if (seg->s_szc != 0) { 8378 size_t pgsz; 8379 8380 pgsz = page_get_pagesize(seg->s_szc); 8381 addr = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); 8382 len = P2ROUNDUP(len, pgsz); 8383 } 8384 8385 /* 8386 * Check to see whether policy is set already 8387 */ 8388 policy = lgrp_madv_to_policy(behav, len, svd->type); 8389 8390 anon_index = svd->anon_index + page; 8391 off = svd->offset + (uintptr_t)(addr - seg->s_base); 8392 8393 if (svd->type == MAP_SHARED) 8394 already_set = lgrp_shm_policy_set(policy, amp, 8395 anon_index, vp, off, len); 8396 else 8397 already_set = 8398 (policy == svd->policy_info.mem_policy); 8399 8400 /* 8401 * If policy set already and it shouldn't be reapplied, 8402 * don't do anything. 
8403 */ 8404 if (already_set && 8405 !LGRP_MEM_POLICY_REAPPLICABLE(policy)) 8406 break; 8407 8408 /* 8409 * For private memory, need writers lock on 8410 * address space because the segment may be 8411 * split or concatenated when changing policy 8412 */ 8413 if (svd->type == MAP_PRIVATE && 8414 AS_READ_HELD(seg->s_as)) { 8415 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8416 return (IE_RETRY); 8417 } 8418 8419 /* 8420 * Mark any existing pages in given range for 8421 * migration 8422 */ 8423 page_mark_migrate(seg, addr, len, amp, svd->anon_index, 8424 vp, svd->offset, 1); 8425 8426 /* 8427 * Don't need to try to split or concatenate 8428 * segments, since policy is same or this is a shared 8429 * memory segment 8430 */ 8431 if (already_set || svd->type == MAP_SHARED) 8432 break; 8433 8434 if (HAT_IS_REGION_COOKIE_VALID(svd->rcookie)) { 8435 ASSERT(svd->amp == NULL); 8436 ASSERT(svd->tr_state == SEGVN_TR_OFF); 8437 ASSERT(svd->softlockcnt == 0); 8438 hat_leave_region(seg->s_as->a_hat, svd->rcookie, 8439 HAT_REGION_TEXT); 8440 svd->rcookie = HAT_INVALID_REGION_COOKIE; 8441 } 8442 8443 /* 8444 * Split off new segment if advice only applies to a 8445 * portion of existing segment starting in middle 8446 */ 8447 new_seg = NULL; 8448 eaddr = addr + len; 8449 oldeaddr = seg->s_base + seg->s_size; 8450 if (addr > seg->s_base) { 8451 /* 8452 * Must flush I/O page cache 8453 * before splitting segment 8454 */ 8455 if (svd->softlockcnt > 0) 8456 segvn_purge(seg); 8457 8458 /* 8459 * Split segment and return IE_REATTACH to tell 8460 * as_ctl() that current segment changed 8461 */ 8462 new_seg = segvn_split_seg(seg, addr); 8463 new_svd = (struct segvn_data *)new_seg->s_data; 8464 err = IE_REATTACH; 8465 8466 /* 8467 * If new segment ends where old one 8468 * did, try to concatenate the new 8469 * segment with next one. 8470 */ 8471 if (eaddr == oldeaddr) { 8472 /* 8473 * Set policy for new segment 8474 */ 8475 (void) lgrp_privm_policy_set(policy, 8476 &new_svd->policy_info, 8477 new_seg->s_size); 8478 8479 next = AS_SEGNEXT(new_seg->s_as, 8480 new_seg); 8481 8482 if (next && 8483 next->s_ops == &segvn_ops && 8484 eaddr == next->s_base) 8485 (void) segvn_concat(new_seg, 8486 next, 1); 8487 } 8488 } 8489 8490 /* 8491 * Split off end of existing segment if advice only 8492 * applies to a portion of segment ending before 8493 * end of the existing segment 8494 */ 8495 if (eaddr < oldeaddr) { 8496 /* 8497 * Must flush I/O page cache 8498 * before splitting segment 8499 */ 8500 if (svd->softlockcnt > 0) 8501 segvn_purge(seg); 8502 8503 /* 8504 * If beginning of old segment was already 8505 * split off, use new segment to split end off 8506 * from. 8507 */ 8508 if (new_seg != NULL && new_seg != seg) { 8509 /* 8510 * Split segment 8511 */ 8512 (void) segvn_split_seg(new_seg, eaddr); 8513 8514 /* 8515 * Set policy for new segment 8516 */ 8517 (void) lgrp_privm_policy_set(policy, 8518 &new_svd->policy_info, 8519 new_seg->s_size); 8520 } else { 8521 /* 8522 * Split segment and return IE_REATTACH 8523 * to tell as_ctl() that current 8524 * segment changed 8525 */ 8526 (void) segvn_split_seg(seg, eaddr); 8527 err = IE_REATTACH; 8528 8529 (void) lgrp_privm_policy_set(policy, 8530 &svd->policy_info, seg->s_size); 8531 8532 /* 8533 * If new segment starts where old one 8534 * did, try to concatenate it with 8535 * previous segment. 
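* (This is the case where nothing was split off the front: after the
* split at eaddr above, "seg" covers exactly [addr, eaddr) with the
* new policy and may have become mergeable with the segment just below
* it. As in the whole-segment case earlier, segvn_concat(prev, seg, 1)
* deletes "seg", so svd->lock is dropped before the merge and the
* IE_REATTACH already in err is returned so as_ctl() re-evaluates the
* segment list.)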
8536 */ 8537 if (addr == seg->s_base) { 8538 prev = AS_SEGPREV(seg->s_as, 8539 seg); 8540 8541 /* 8542 * Drop lock for private data 8543 * of current segment before 8544 * concatenating (deleting) it 8545 */ 8546 if (prev && 8547 prev->s_ops == 8548 &segvn_ops && 8549 addr == prev->s_base + 8550 prev->s_size) { 8551 SEGVN_LOCK_EXIT( 8552 seg->s_as, 8553 &svd->lock); 8554 (void) segvn_concat( 8555 prev, seg, 1); 8556 return (err); 8557 } 8558 } 8559 } 8560 } 8561 break; 8562 case MADV_SEQUENTIAL: 8563 ASSERT(seg->s_szc == 0); 8564 ASSERT(svd->rcookie == HAT_INVALID_REGION_COOKIE); 8565 hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD); 8566 /* FALLTHROUGH */ 8567 case MADV_NORMAL: 8568 case MADV_RANDOM: 8569 bvpp = &svd->vpage[page]; 8570 evpp = &svd->vpage[page + (len >> PAGESHIFT)]; 8571 for (; bvpp < evpp; bvpp++) 8572 VPP_SETADVICE(bvpp, behav); 8573 svd->advice = MADV_NORMAL; 8574 break; 8575 case MADV_WILLNEED: /* handled in memcntl */ 8576 case MADV_DONTNEED: /* handled in memcntl */ 8577 case MADV_FREE: /* handled above */ 8578 case MADV_PURGE: /* handled above */ 8579 break; 8580 default: 8581 err = EINVAL; 8582 } 8583 } 8584 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8585 return (err); 8586 } 8587 8588 /* 8589 * There is one kind of inheritance that can be specified for pages: 8590 * 8591 * SEGP_INH_ZERO - Pages should be zeroed in the child 8592 */ 8593 static int 8594 segvn_inherit(struct seg *seg, caddr_t addr, size_t len, uint_t behav) 8595 { 8596 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 8597 struct vpage *bvpp, *evpp; 8598 size_t page; 8599 int ret = 0; 8600 8601 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); 8602 8603 /* Can't support something we don't know about */ 8604 if (behav != SEGP_INH_ZERO) 8605 return (ENOTSUP); 8606 8607 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 8608 8609 /* 8610 * This must be a straightforward anonymous segment that is mapped 8611 * privately and is not backed by a vnode. 8612 */ 8613 if (svd->tr_state != SEGVN_TR_OFF || 8614 svd->type != MAP_PRIVATE || 8615 svd->vp != NULL) { 8616 ret = EINVAL; 8617 goto out; 8618 } 8619 8620 /* 8621 * If the entire segment has been marked as inherit zero, then no reason 8622 * to do anything else. 8623 */ 8624 if (svd->svn_inz == SEGVN_INZ_ALL) { 8625 ret = 0; 8626 goto out; 8627 } 8628 8629 /* 8630 * If this applies to the entire segment, simply mark it and we're done. 8631 */ 8632 if ((addr == seg->s_base) && (len == seg->s_size)) { 8633 svd->svn_inz = SEGVN_INZ_ALL; 8634 ret = 0; 8635 goto out; 8636 } 8637 8638 /* 8639 * We've been asked to mark a subset of this segment as inherit zero, 8640 * therefore we need to mainpulate its vpages. 8641 */ 8642 if (svd->vpage == NULL) { 8643 segvn_vpage(seg); 8644 if (svd->vpage == NULL) { 8645 ret = ENOMEM; 8646 goto out; 8647 } 8648 } 8649 8650 svd->svn_inz = SEGVN_INZ_VPP; 8651 page = seg_page(seg, addr); 8652 bvpp = &svd->vpage[page]; 8653 evpp = &svd->vpage[page + (len >> PAGESHIFT)]; 8654 for (; bvpp < evpp; bvpp++) 8655 VPP_SETINHZERO(bvpp); 8656 ret = 0; 8657 8658 out: 8659 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8660 return (ret); 8661 } 8662 8663 /* 8664 * Create a vpage structure for this seg. 8665 */ 8666 static void 8667 segvn_vpage(struct seg *seg) 8668 { 8669 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 8670 struct vpage *vp, *evp; 8671 static pgcnt_t page_limit = 0; 8672 8673 ASSERT(SEGVN_WRITE_HELD(seg->s_as, &svd->lock)); 8674 8675 /* 8676 * If no vpage structure exists, allocate one. 
Copy the protections 8677 * and the advice from the segment itself to the individual pages. 8678 */ 8679 if (svd->vpage == NULL) { 8680 /* 8681 * Start by calculating the number of pages we must allocate to 8682 * track the per-page vpage structs needs for this entire 8683 * segment. If we know now that it will require more than our 8684 * heuristic for the maximum amount of kmem we can consume then 8685 * fail. We do this here, instead of trying to detect this deep 8686 * in page_resv and propagating the error up, since the entire 8687 * memory allocation stack is not amenable to passing this 8688 * back. Instead, it wants to keep trying. 8689 * 8690 * As a heuristic we set a page limit of 5/8s of total_pages 8691 * for this allocation. We use shifts so that no floating 8692 * point conversion takes place and only need to do the 8693 * calculation once. 8694 */ 8695 ulong_t mem_needed = seg_pages(seg) * sizeof (struct vpage); 8696 pgcnt_t npages = mem_needed >> PAGESHIFT; 8697 8698 if (page_limit == 0) 8699 page_limit = (total_pages >> 1) + (total_pages >> 3); 8700 8701 if (npages > page_limit) 8702 return; 8703 8704 svd->pageadvice = 1; 8705 svd->vpage = kmem_zalloc(mem_needed, KM_SLEEP); 8706 evp = &svd->vpage[seg_page(seg, seg->s_base + seg->s_size)]; 8707 for (vp = svd->vpage; vp < evp; vp++) { 8708 VPP_SETPROT(vp, svd->prot); 8709 VPP_SETADVICE(vp, svd->advice); 8710 } 8711 } 8712 } 8713 8714 /* 8715 * Dump the pages belonging to this segvn segment. 8716 */ 8717 static void 8718 segvn_dump(struct seg *seg) 8719 { 8720 struct segvn_data *svd; 8721 page_t *pp; 8722 struct anon_map *amp; 8723 ulong_t anon_index; 8724 struct vnode *vp; 8725 u_offset_t off, offset; 8726 pfn_t pfn; 8727 pgcnt_t page, npages; 8728 caddr_t addr; 8729 8730 npages = seg_pages(seg); 8731 svd = (struct segvn_data *)seg->s_data; 8732 vp = svd->vp; 8733 off = offset = svd->offset; 8734 addr = seg->s_base; 8735 8736 if ((amp = svd->amp) != NULL) { 8737 anon_index = svd->anon_index; 8738 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 8739 } 8740 8741 for (page = 0; page < npages; page++, offset += PAGESIZE) { 8742 struct anon *ap; 8743 int we_own_it = 0; 8744 8745 if (amp && (ap = anon_get_ptr(svd->amp->ahp, anon_index++))) { 8746 swap_xlate_nopanic(ap, &vp, &off); 8747 } else { 8748 vp = svd->vp; 8749 off = offset; 8750 } 8751 8752 /* 8753 * If pp == NULL, the page either does not exist 8754 * or is exclusively locked. So determine if it 8755 * exists before searching for it. 8756 */ 8757 8758 if ((pp = page_lookup_nowait(vp, off, SE_SHARED))) 8759 we_own_it = 1; 8760 else 8761 pp = page_exists(vp, off); 8762 8763 if (pp) { 8764 pfn = page_pptonum(pp); 8765 dump_addpage(seg->s_as, addr, pfn); 8766 if (we_own_it) 8767 page_unlock(pp); 8768 } 8769 addr += PAGESIZE; 8770 dump_timeleft = dump_timeout; 8771 } 8772 8773 if (amp != NULL) 8774 ANON_LOCK_EXIT(&->a_rwlock); 8775 } 8776 8777 #ifdef DEBUG 8778 static uint32_t segvn_pglock_mtbf = 0; 8779 #endif 8780 8781 #define PCACHE_SHWLIST ((page_t *)-2) 8782 #define NOPCACHE_SHWLIST ((page_t *)-1) 8783 8784 /* 8785 * Lock/Unlock anon pages over a given range. Return shadow list. This routine 8786 * uses global segment pcache to cache shadow lists (i.e. pp arrays) of pages 8787 * to avoid the overhead of per page locking, unlocking for subsequent IOs to 8788 * the same parts of the segment. Currently shadow list creation is only 8789 * supported for pure anon segments. MAP_PRIVATE segment pcache entries are 8790 * tagged with segment pointer, starting virtual address and length. 
This 8791 * approach for MAP_SHARED segments may add many pcache entries for the same 8792 * set of pages and lead to long hash chains that decrease pcache lookup 8793 * performance. To avoid this issue for shared segments shared anon map and 8794 * starting anon index are used for pcache entry tagging. This allows all 8795 * segments to share pcache entries for the same anon range and reduces pcache 8796 * chain's length as well as memory overhead from duplicate shadow lists and 8797 * pcache entries. 8798 * 8799 * softlockcnt field in segvn_data structure counts the number of F_SOFTLOCK'd 8800 * pages via segvn_fault() and pagelock'd pages via this routine. But pagelock 8801 * part of softlockcnt accounting is done differently for private and shared 8802 * segments. In private segment case softlock is only incremented when a new 8803 * shadow list is created but not when an existing one is found via 8804 * seg_plookup(). pcache entries have reference count incremented/decremented 8805 * by each seg_plookup()/seg_pinactive() operation. Only entries that have 0 8806 * reference count can be purged (and purging is needed before segment can be 8807 * freed). When a private segment pcache entry is purged segvn_reclaim() will 8808 * decrement softlockcnt. Since in private segment case each of its pcache 8809 * entries only belongs to this segment we can expect that when 8810 * segvn_pagelock(L_PAGEUNLOCK) was called for all outstanding IOs in this 8811 * segment purge will succeed and softlockcnt will drop to 0. In shared 8812 * segment case reference count in pcache entry counts active locks from many 8813 * different segments so we can't expect segment purging to succeed even when 8814 * segvn_pagelock(L_PAGEUNLOCK) was called for all outstanding IOs in this 8815 * segment. To be able to determine when there're no pending pagelocks in 8816 * shared segment case we don't rely on purging to make softlockcnt drop to 0 8817 * but instead softlockcnt is incremented and decremented for every 8818 * segvn_pagelock(L_PAGELOCK/L_PAGEUNLOCK) call regardless if a new shadow 8819 * list was created or an existing one was found. When softlockcnt drops to 0 8820 * this segment no longer has any claims for pcached shadow lists and the 8821 * segment can be freed even if there're still active pcache entries 8822 * shared by this segment anon map. Shared segment pcache entries belong to 8823 * anon map and are typically removed when anon map is freed after all 8824 * processes destroy the segments that use this anon map. 8825 */ 8826 static int 8827 segvn_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***ppp, 8828 enum lock_type type, enum seg_rw rw) 8829 { 8830 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 8831 size_t np; 8832 pgcnt_t adjustpages; 8833 pgcnt_t npages; 8834 ulong_t anon_index; 8835 uint_t protchk = (rw == S_READ) ? 
PROT_READ : PROT_WRITE; 8836 uint_t error; 8837 struct anon_map *amp; 8838 pgcnt_t anpgcnt; 8839 struct page **pplist, **pl, *pp; 8840 caddr_t a; 8841 size_t page; 8842 caddr_t lpgaddr, lpgeaddr; 8843 anon_sync_obj_t cookie; 8844 int anlock; 8845 struct anon_map *pamp; 8846 caddr_t paddr; 8847 seg_preclaim_cbfunc_t preclaim_callback; 8848 size_t pgsz; 8849 int use_pcache; 8850 size_t wlen; 8851 uint_t pflags = 0; 8852 int sftlck_sbase = 0; 8853 int sftlck_send = 0; 8854 8855 #ifdef DEBUG 8856 if (type == L_PAGELOCK && segvn_pglock_mtbf) { 8857 hrtime_t ts = gethrtime(); 8858 if ((ts % segvn_pglock_mtbf) == 0) { 8859 return (ENOTSUP); 8860 } 8861 if ((ts % segvn_pglock_mtbf) == 1) { 8862 return (EFAULT); 8863 } 8864 } 8865 #endif 8866 8867 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_START, 8868 "segvn_pagelock: start seg %p addr %p", seg, addr); 8869 8870 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); 8871 ASSERT(type == L_PAGELOCK || type == L_PAGEUNLOCK); 8872 8873 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 8874 8875 /* 8876 * for now we only support pagelock to anon memory. We would have to 8877 * check protections for vnode objects and call into the vnode driver. 8878 * That's too much for a fast path. Let the fault entry point handle 8879 * it. 8880 */ 8881 if (svd->vp != NULL) { 8882 if (type == L_PAGELOCK) { 8883 error = ENOTSUP; 8884 goto out; 8885 } 8886 panic("segvn_pagelock(L_PAGEUNLOCK): vp != NULL"); 8887 } 8888 if ((amp = svd->amp) == NULL) { 8889 if (type == L_PAGELOCK) { 8890 error = EFAULT; 8891 goto out; 8892 } 8893 panic("segvn_pagelock(L_PAGEUNLOCK): amp == NULL"); 8894 } 8895 if (rw != S_READ && rw != S_WRITE) { 8896 if (type == L_PAGELOCK) { 8897 error = ENOTSUP; 8898 goto out; 8899 } 8900 panic("segvn_pagelock(L_PAGEUNLOCK): bad rw"); 8901 } 8902 8903 if (seg->s_szc != 0) { 8904 /* 8905 * We are adjusting the pagelock region to the large page size 8906 * boundary because the unlocked part of a large page cannot 8907 * be freed anyway unless all constituent pages of a large 8908 * page are locked. Bigger regions reduce pcache chain length 8909 * and improve lookup performance. The tradeoff is that the 8910 * very first segvn_pagelock() call for a given page is more 8911 * expensive if only 1 page_t is needed for IO. This is only 8912 * an issue if pcache entry doesn't get reused by several 8913 * subsequent calls. We optimize here for the case when pcache 8914 * is heavily used by repeated IOs to the same address range. 8915 * 8916 * Note segment's page size cannot change while we are holding 8917 * as lock. And then it cannot change while softlockcnt is 8918 * not 0. This will allow us to correctly recalculate large 8919 * page size region for the matching pageunlock/reclaim call 8920 * since as_pageunlock() caller must always match 8921 * as_pagelock() call's addr and len. 8922 * 8923 * For pageunlock *ppp points to the pointer of page_t that 8924 * corresponds to the real unadjusted start address. Similar 8925 * for pagelock *ppp must point to the pointer of page_t that 8926 * corresponds to the real unadjusted start address. 
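* A worked example with illustrative sizes: for a 4M large page and 8K
* base pages, an IO against addr = seg base + 24K, len = 16K is
* widened to lpgaddr = seg base and lpgeaddr = seg base + 4M. The
* shadow list then covers the whole 4M region (assuming it is inserted
* into pcache), adjustpages = btop(addr - lpgaddr) = 3, and the caller
* gets *ppp = pplist + 3 so that (*ppp)[0] is still the page at the
* original unadjusted addr.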
8927 */ 8928 pgsz = page_get_pagesize(seg->s_szc); 8929 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 8930 adjustpages = btop((uintptr_t)(addr - lpgaddr)); 8931 } else if (len < segvn_pglock_comb_thrshld) { 8932 lpgaddr = addr; 8933 lpgeaddr = addr + len; 8934 adjustpages = 0; 8935 pgsz = PAGESIZE; 8936 } else { 8937 /* 8938 * Align the address range of large enough requests to allow 8939 * combining of different shadow lists into 1 to reduce memory 8940 * overhead from potentially overlapping large shadow lists 8941 * (worst case is we have a 1MB IO into buffers with start 8942 * addresses separated by 4K). Alignment is only possible if 8943 * padded chunks have sufficient access permissions. Note 8944 * permissions won't change between L_PAGELOCK and 8945 * L_PAGEUNLOCK calls since non 0 softlockcnt will force 8946 * segvn_setprot() to wait until softlockcnt drops to 0. This 8947 * allows us to determine in L_PAGEUNLOCK the same range we 8948 * computed in L_PAGELOCK. 8949 * 8950 * If alignment is limited by segment ends set 8951 * sftlck_sbase/sftlck_send flags. In L_PAGELOCK case when 8952 * these flags are set bump softlockcnt_sbase/softlockcnt_send 8953 * per segment counters. In L_PAGEUNLOCK case decrease 8954 * softlockcnt_sbase/softlockcnt_send counters if 8955 * sftlck_sbase/sftlck_send flags are set. When 8956 * softlockcnt_sbase/softlockcnt_send are non 0 8957 * segvn_concat()/segvn_extend_prev()/segvn_extend_next() 8958 * won't merge the segments. This restriction combined with 8959 * restriction on segment unmapping and splitting for segments 8960 * that have non 0 softlockcnt allows L_PAGEUNLOCK to 8961 * correctly determine the same range that was previously 8962 * locked by matching L_PAGELOCK. 8963 */ 8964 pflags = SEGP_PSHIFT | (segvn_pglock_comb_bshift << 16); 8965 pgsz = PAGESIZE; 8966 if (svd->type == MAP_PRIVATE) { 8967 lpgaddr = (caddr_t)P2ALIGN((uintptr_t)addr, 8968 segvn_pglock_comb_balign); 8969 if (lpgaddr < seg->s_base) { 8970 lpgaddr = seg->s_base; 8971 sftlck_sbase = 1; 8972 } 8973 } else { 8974 ulong_t aix = svd->anon_index + seg_page(seg, addr); 8975 ulong_t aaix = P2ALIGN(aix, segvn_pglock_comb_palign); 8976 if (aaix < svd->anon_index) { 8977 lpgaddr = seg->s_base; 8978 sftlck_sbase = 1; 8979 } else { 8980 lpgaddr = addr - ptob(aix - aaix); 8981 ASSERT(lpgaddr >= seg->s_base); 8982 } 8983 } 8984 if (svd->pageprot && lpgaddr != addr) { 8985 struct vpage *vp = &svd->vpage[seg_page(seg, lpgaddr)]; 8986 struct vpage *evp = &svd->vpage[seg_page(seg, addr)]; 8987 while (vp < evp) { 8988 if ((VPP_PROT(vp) & protchk) == 0) { 8989 break; 8990 } 8991 vp++; 8992 } 8993 if (vp < evp) { 8994 lpgaddr = addr; 8995 pflags = 0; 8996 } 8997 } 8998 lpgeaddr = addr + len; 8999 if (pflags) { 9000 if (svd->type == MAP_PRIVATE) { 9001 lpgeaddr = (caddr_t)P2ROUNDUP( 9002 (uintptr_t)lpgeaddr, 9003 segvn_pglock_comb_balign); 9004 } else { 9005 ulong_t aix = svd->anon_index + 9006 seg_page(seg, lpgeaddr); 9007 ulong_t aaix = P2ROUNDUP(aix, 9008 segvn_pglock_comb_palign); 9009 if (aaix < aix) { 9010 lpgeaddr = 0; 9011 } else { 9012 lpgeaddr += ptob(aaix - aix); 9013 } 9014 } 9015 if (lpgeaddr == 0 || 9016 lpgeaddr > seg->s_base + seg->s_size) { 9017 lpgeaddr = seg->s_base + seg->s_size; 9018 sftlck_send = 1; 9019 } 9020 } 9021 if (svd->pageprot && lpgeaddr != addr + len) { 9022 struct vpage *vp; 9023 struct vpage *evp; 9024 9025 vp = &svd->vpage[seg_page(seg, addr + len)]; 9026 evp = &svd->vpage[seg_page(seg, lpgeaddr)]; 9027 9028 while (vp < evp) { 9029 if ((VPP_PROT(vp) 
& protchk) == 0) { 9030 break; 9031 } 9032 vp++; 9033 } 9034 if (vp < evp) { 9035 lpgeaddr = addr + len; 9036 } 9037 } 9038 adjustpages = btop((uintptr_t)(addr - lpgaddr)); 9039 } 9040 9041 /* 9042 * For MAP_SHARED segments we create pcache entries tagged by amp and 9043 * anon index so that we can share pcache entries with other segments 9044 * that map this amp. For private segments pcache entries are tagged 9045 * with segment and virtual address. 9046 */ 9047 if (svd->type == MAP_SHARED) { 9048 pamp = amp; 9049 paddr = (caddr_t)((lpgaddr - seg->s_base) + 9050 ptob(svd->anon_index)); 9051 preclaim_callback = shamp_reclaim; 9052 } else { 9053 pamp = NULL; 9054 paddr = lpgaddr; 9055 preclaim_callback = segvn_reclaim; 9056 } 9057 9058 if (type == L_PAGEUNLOCK) { 9059 VM_STAT_ADD(segvnvmstats.pagelock[0]); 9060 9061 /* 9062 * update hat ref bits for /proc. We need to make sure 9063 * that threads tracing the ref and mod bits of the 9064 * address space get the right data. 9065 * Note: page ref and mod bits are updated at reclaim time 9066 */ 9067 if (seg->s_as->a_vbits) { 9068 for (a = addr; a < addr + len; a += PAGESIZE) { 9069 if (rw == S_WRITE) { 9070 hat_setstat(seg->s_as, a, 9071 PAGESIZE, P_REF | P_MOD); 9072 } else { 9073 hat_setstat(seg->s_as, a, 9074 PAGESIZE, P_REF); 9075 } 9076 } 9077 } 9078 9079 /* 9080 * Check the shadow list entry after the last page used in 9081 * this IO request. If it's NOPCACHE_SHWLIST the shadow list 9082 * was not inserted into pcache and is not large page 9083 * adjusted. In this case call reclaim callback directly and 9084 * don't adjust the shadow list start and size for large 9085 * pages. 9086 */ 9087 npages = btop(len); 9088 if ((*ppp)[npages] == NOPCACHE_SHWLIST) { 9089 void *ptag; 9090 if (pamp != NULL) { 9091 ASSERT(svd->type == MAP_SHARED); 9092 ptag = (void *)pamp; 9093 paddr = (caddr_t)((addr - seg->s_base) + 9094 ptob(svd->anon_index)); 9095 } else { 9096 ptag = (void *)seg; 9097 paddr = addr; 9098 } 9099 (*preclaim_callback)(ptag, paddr, len, *ppp, rw, 0); 9100 } else { 9101 ASSERT((*ppp)[npages] == PCACHE_SHWLIST || 9102 IS_SWAPFSVP((*ppp)[npages]->p_vnode)); 9103 len = lpgeaddr - lpgaddr; 9104 npages = btop(len); 9105 seg_pinactive(seg, pamp, paddr, len, 9106 *ppp - adjustpages, rw, pflags, preclaim_callback); 9107 } 9108 9109 if (pamp != NULL) { 9110 ASSERT(svd->type == MAP_SHARED); 9111 ASSERT(svd->softlockcnt >= npages); 9112 atomic_add_long((ulong_t *)&svd->softlockcnt, -npages); 9113 } 9114 9115 if (sftlck_sbase) { 9116 ASSERT(svd->softlockcnt_sbase > 0); 9117 atomic_dec_ulong((ulong_t *)&svd->softlockcnt_sbase); 9118 } 9119 if (sftlck_send) { 9120 ASSERT(svd->softlockcnt_send > 0); 9121 atomic_dec_ulong((ulong_t *)&svd->softlockcnt_send); 9122 } 9123 9124 /* 9125 * If someone is blocked while unmapping, we purge 9126 * segment page cache and thus reclaim pplist synchronously 9127 * without waiting for seg_pasync_thread. This speeds up 9128 * unmapping in cases where munmap(2) is called, while 9129 * raw async i/o is still in progress or where a thread 9130 * exits on data fault in a multithreaded application. 9131 */ 9132 if (AS_ISUNMAPWAIT(seg->s_as)) { 9133 if (svd->softlockcnt == 0) { 9134 mutex_enter(&seg->s_as->a_contents); 9135 if (AS_ISUNMAPWAIT(seg->s_as)) { 9136 AS_CLRUNMAPWAIT(seg->s_as); 9137 cv_broadcast(&seg->s_as->a_cv); 9138 } 9139 mutex_exit(&seg->s_as->a_contents); 9140 } else if (pamp == NULL) { 9141 /* 9142 * softlockcnt is not 0 and this is a 9143 * MAP_PRIVATE segment. 
Try to purge its 9144 * pcache entries to reduce softlockcnt. 9145 * If it drops to 0 segvn_reclaim() 9146 * will wake up a thread waiting on 9147 * unmapwait flag. 9148 * 9149 * We don't purge MAP_SHARED segments with non 9150 * 0 softlockcnt since IO is still in progress 9151 * for such segments. 9152 */ 9153 ASSERT(svd->type == MAP_PRIVATE); 9154 segvn_purge(seg); 9155 } 9156 } 9157 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 9158 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_UNLOCK_END, 9159 "segvn_pagelock: unlock seg %p addr %p", seg, addr); 9160 return (0); 9161 } 9162 9163 /* The L_PAGELOCK case ... */ 9164 9165 VM_STAT_ADD(segvnvmstats.pagelock[1]); 9166 9167 /* 9168 * For MAP_SHARED segments we have to check protections before 9169 * seg_plookup() since pcache entries may be shared by many segments 9170 * with potentially different page protections. 9171 */ 9172 if (pamp != NULL) { 9173 ASSERT(svd->type == MAP_SHARED); 9174 if (svd->pageprot == 0) { 9175 if ((svd->prot & protchk) == 0) { 9176 error = EACCES; 9177 goto out; 9178 } 9179 } else { 9180 /* 9181 * check page protections 9182 */ 9183 caddr_t ea; 9184 9185 if (seg->s_szc) { 9186 a = lpgaddr; 9187 ea = lpgeaddr; 9188 } else { 9189 a = addr; 9190 ea = addr + len; 9191 } 9192 for (; a < ea; a += pgsz) { 9193 struct vpage *vp; 9194 9195 ASSERT(seg->s_szc == 0 || 9196 sameprot(seg, a, pgsz)); 9197 vp = &svd->vpage[seg_page(seg, a)]; 9198 if ((VPP_PROT(vp) & protchk) == 0) { 9199 error = EACCES; 9200 goto out; 9201 } 9202 } 9203 } 9204 } 9205 9206 /* 9207 * try to find pages in segment page cache 9208 */ 9209 pplist = seg_plookup(seg, pamp, paddr, lpgeaddr - lpgaddr, rw, pflags); 9210 if (pplist != NULL) { 9211 if (pamp != NULL) { 9212 npages = btop((uintptr_t)(lpgeaddr - lpgaddr)); 9213 ASSERT(svd->type == MAP_SHARED); 9214 atomic_add_long((ulong_t *)&svd->softlockcnt, 9215 npages); 9216 } 9217 if (sftlck_sbase) { 9218 atomic_inc_ulong((ulong_t *)&svd->softlockcnt_sbase); 9219 } 9220 if (sftlck_send) { 9221 atomic_inc_ulong((ulong_t *)&svd->softlockcnt_send); 9222 } 9223 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 9224 *ppp = pplist + adjustpages; 9225 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_HIT_END, 9226 "segvn_pagelock: cache hit seg %p addr %p", seg, addr); 9227 return (0); 9228 } 9229 9230 /* 9231 * For MAP_SHARED segments we already verified above that segment 9232 * protections allow this pagelock operation. 9233 */ 9234 if (pamp == NULL) { 9235 ASSERT(svd->type == MAP_PRIVATE); 9236 if (svd->pageprot == 0) { 9237 if ((svd->prot & protchk) == 0) { 9238 error = EACCES; 9239 goto out; 9240 } 9241 if (svd->prot & PROT_WRITE) { 9242 wlen = lpgeaddr - lpgaddr; 9243 } else { 9244 wlen = 0; 9245 ASSERT(rw == S_READ); 9246 } 9247 } else { 9248 int wcont = 1; 9249 /* 9250 * check page protections 9251 */ 9252 for (a = lpgaddr, wlen = 0; a < lpgeaddr; a += pgsz) { 9253 struct vpage *vp; 9254 9255 ASSERT(seg->s_szc == 0 || 9256 sameprot(seg, a, pgsz)); 9257 vp = &svd->vpage[seg_page(seg, a)]; 9258 if ((VPP_PROT(vp) & protchk) == 0) { 9259 error = EACCES; 9260 goto out; 9261 } 9262 if (wcont && (VPP_PROT(vp) & PROT_WRITE)) { 9263 wlen += pgsz; 9264 } else { 9265 wcont = 0; 9266 ASSERT(rw == S_READ); 9267 } 9268 } 9269 } 9270 ASSERT(rw == S_READ || wlen == lpgeaddr - lpgaddr); 9271 ASSERT(rw == S_WRITE || wlen <= lpgeaddr - lpgaddr); 9272 } 9273 9274 /* 9275 * Only build large page adjusted shadow list if we expect to insert 9276 * it into pcache. For large enough pages it's a big overhead to 9277 * create a shadow list of the entire large page. 
But this overhead 9278 * should be amortized over repeated pcache hits on subsequent reuse 9279 * of this shadow list (IO into any range within this shadow list will 9280 * find it in pcache since we large page align the request for pcache 9281 * lookups). pcache performance is improved with bigger shadow lists 9282 * as it reduces the time to pcache the entire big segment and reduces 9283 * pcache chain length. 9284 */ 9285 if (seg_pinsert_check(seg, pamp, paddr, 9286 lpgeaddr - lpgaddr, pflags) == SEGP_SUCCESS) { 9287 addr = lpgaddr; 9288 len = lpgeaddr - lpgaddr; 9289 use_pcache = 1; 9290 } else { 9291 use_pcache = 0; 9292 /* 9293 * Since this entry will not be inserted into the pcache, we 9294 * will not do any adjustments to the starting address or 9295 * size of the memory to be locked. 9296 */ 9297 adjustpages = 0; 9298 } 9299 npages = btop(len); 9300 9301 pplist = kmem_alloc(sizeof (page_t *) * (npages + 1), KM_SLEEP); 9302 pl = pplist; 9303 *ppp = pplist + adjustpages; 9304 /* 9305 * If use_pcache is 0 this shadow list is not large page adjusted. 9306 * Record this info in the last entry of shadow array so that 9307 * L_PAGEUNLOCK can determine if it should large page adjust the 9308 * address range to find the real range that was locked. 9309 */ 9310 pl[npages] = use_pcache ? PCACHE_SHWLIST : NOPCACHE_SHWLIST; 9311 9312 page = seg_page(seg, addr); 9313 anon_index = svd->anon_index + page; 9314 9315 anlock = 0; 9316 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 9317 ASSERT(amp->a_szc >= seg->s_szc); 9318 anpgcnt = page_get_pagecnt(amp->a_szc); 9319 for (a = addr; a < addr + len; a += PAGESIZE, anon_index++) { 9320 struct anon *ap; 9321 struct vnode *vp; 9322 u_offset_t off; 9323 9324 /* 9325 * Lock and unlock anon array only once per large page. 9326 * anon_array_enter() locks the root anon slot according to 9327 * a_szc which can't change while anon map is locked. We lock 9328 * anon the first time through this loop and each time we 9329 * reach anon index that corresponds to a root of a large 9330 * page. 9331 */ 9332 if (a == addr || P2PHASE(anon_index, anpgcnt) == 0) { 9333 ASSERT(anlock == 0); 9334 anon_array_enter(amp, anon_index, &cookie); 9335 anlock = 1; 9336 } 9337 ap = anon_get_ptr(amp->ahp, anon_index); 9338 9339 /* 9340 * We must never use seg_pcache for COW pages 9341 * because we might end up with original page still 9342 * lying in seg_pcache even after private page is 9343 * created. This leads to data corruption as 9344 * aio_write refers to the page still in cache 9345 * while all other accesses refer to the private 9346 * page. 
9347 */ 9348 if (ap == NULL || ap->an_refcnt != 1) { 9349 struct vpage *vpage; 9350 9351 if (seg->s_szc) { 9352 error = EFAULT; 9353 break; 9354 } 9355 if (svd->vpage != NULL) { 9356 vpage = &svd->vpage[seg_page(seg, a)]; 9357 } else { 9358 vpage = NULL; 9359 } 9360 ASSERT(anlock); 9361 anon_array_exit(&cookie); 9362 anlock = 0; 9363 pp = NULL; 9364 error = segvn_faultpage(seg->s_as->a_hat, seg, a, 0, 9365 vpage, &pp, 0, F_INVAL, rw, 1); 9366 if (error) { 9367 error = fc_decode(error); 9368 break; 9369 } 9370 anon_array_enter(amp, anon_index, &cookie); 9371 anlock = 1; 9372 ap = anon_get_ptr(amp->ahp, anon_index); 9373 if (ap == NULL || ap->an_refcnt != 1) { 9374 error = EFAULT; 9375 break; 9376 } 9377 } 9378 swap_xlate(ap, &vp, &off); 9379 pp = page_lookup_nowait(vp, off, SE_SHARED); 9380 if (pp == NULL) { 9381 error = EFAULT; 9382 break; 9383 } 9384 if (ap->an_pvp != NULL) { 9385 anon_swap_free(ap, pp); 9386 } 9387 /* 9388 * Unlock anon if this is the last slot in a large page. 9389 */ 9390 if (P2PHASE(anon_index, anpgcnt) == anpgcnt - 1) { 9391 ASSERT(anlock); 9392 anon_array_exit(&cookie); 9393 anlock = 0; 9394 } 9395 *pplist++ = pp; 9396 } 9397 if (anlock) { /* Ensure the lock is dropped */ 9398 anon_array_exit(&cookie); 9399 } 9400 ANON_LOCK_EXIT(&->a_rwlock); 9401 9402 if (a >= addr + len) { 9403 atomic_add_long((ulong_t *)&svd->softlockcnt, npages); 9404 if (pamp != NULL) { 9405 ASSERT(svd->type == MAP_SHARED); 9406 atomic_add_long((ulong_t *)&pamp->a_softlockcnt, 9407 npages); 9408 wlen = len; 9409 } 9410 if (sftlck_sbase) { 9411 atomic_inc_ulong((ulong_t *)&svd->softlockcnt_sbase); 9412 } 9413 if (sftlck_send) { 9414 atomic_inc_ulong((ulong_t *)&svd->softlockcnt_send); 9415 } 9416 if (use_pcache) { 9417 (void) seg_pinsert(seg, pamp, paddr, len, wlen, pl, 9418 rw, pflags, preclaim_callback); 9419 } 9420 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 9421 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_FILL_END, 9422 "segvn_pagelock: cache fill seg %p addr %p", seg, addr); 9423 return (0); 9424 } 9425 9426 pplist = pl; 9427 np = ((uintptr_t)(a - addr)) >> PAGESHIFT; 9428 while (np > (uint_t)0) { 9429 ASSERT(PAGE_LOCKED(*pplist)); 9430 page_unlock(*pplist); 9431 np--; 9432 pplist++; 9433 } 9434 kmem_free(pl, sizeof (page_t *) * (npages + 1)); 9435 out: 9436 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 9437 *ppp = NULL; 9438 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, 9439 "segvn_pagelock: cache miss seg %p addr %p", seg, addr); 9440 return (error); 9441 } 9442 9443 /* 9444 * purge any cached pages in the I/O page cache 9445 */ 9446 static void 9447 segvn_purge(struct seg *seg) 9448 { 9449 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 9450 9451 /* 9452 * pcache is only used by pure anon segments. 9453 */ 9454 if (svd->amp == NULL || svd->vp != NULL) { 9455 return; 9456 } 9457 9458 /* 9459 * For MAP_SHARED segments non 0 segment's softlockcnt means 9460 * active IO is still in progress via this segment. So we only 9461 * purge MAP_SHARED segments when their softlockcnt is 0. 9462 */ 9463 if (svd->type == MAP_PRIVATE) { 9464 if (svd->softlockcnt) { 9465 seg_ppurge(seg, NULL, 0); 9466 } 9467 } else if (svd->softlockcnt == 0 && svd->amp->a_softlockcnt != 0) { 9468 seg_ppurge(seg, svd->amp, 0); 9469 } 9470 } 9471 9472 /* 9473 * If async argument is not 0 we are called from pcache async thread and don't 9474 * hold AS lock. 
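* (That is, segvn_reclaim() below is running on behalf of the pcache
* async thread, seg_pasync_thread, rather than a caller that came in
* through the as layer; the locking consequences are handled with
* segfree_syncmtx and the AS_SETNOUNMAPWAIT protocol described in the
* body of the function.)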
9475 */ 9476 9477 /*ARGSUSED*/ 9478 static int 9479 segvn_reclaim(void *ptag, caddr_t addr, size_t len, struct page **pplist, 9480 enum seg_rw rw, int async) 9481 { 9482 struct seg *seg = (struct seg *)ptag; 9483 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 9484 pgcnt_t np, npages; 9485 struct page **pl; 9486 9487 npages = np = btop(len); 9488 ASSERT(npages); 9489 9490 ASSERT(svd->vp == NULL && svd->amp != NULL); 9491 ASSERT(svd->softlockcnt >= npages); 9492 ASSERT(async || AS_LOCK_HELD(seg->s_as)); 9493 9494 pl = pplist; 9495 9496 ASSERT(pl[np] == NOPCACHE_SHWLIST || pl[np] == PCACHE_SHWLIST); 9497 ASSERT(!async || pl[np] == PCACHE_SHWLIST); 9498 9499 while (np > (uint_t)0) { 9500 if (rw == S_WRITE) { 9501 hat_setrefmod(*pplist); 9502 } else { 9503 hat_setref(*pplist); 9504 } 9505 page_unlock(*pplist); 9506 np--; 9507 pplist++; 9508 } 9509 9510 kmem_free(pl, sizeof (page_t *) * (npages + 1)); 9511 9512 /* 9513 * If we are pcache async thread we don't hold AS lock. This means if 9514 * softlockcnt drops to 0 after the decrement below address space may 9515 * get freed. We can't allow it since after softlock derement to 0 we 9516 * still need to access as structure for possible wakeup of unmap 9517 * waiters. To prevent the disappearance of as we take this segment 9518 * segfree_syncmtx. segvn_free() also takes this mutex as a barrier to 9519 * make sure this routine completes before segment is freed. 9520 * 9521 * The second complication we have to deal with in async case is a 9522 * possibility of missed wake up of unmap wait thread. When we don't 9523 * hold as lock here we may take a_contents lock before unmap wait 9524 * thread that was first to see softlockcnt was still not 0. As a 9525 * result we'll fail to wake up an unmap wait thread. To avoid this 9526 * race we set nounmapwait flag in as structure if we drop softlockcnt 9527 * to 0 when we were called by pcache async thread. unmapwait thread 9528 * will not block if this flag is set. 9529 */ 9530 if (async) { 9531 mutex_enter(&svd->segfree_syncmtx); 9532 } 9533 9534 if (!atomic_add_long_nv((ulong_t *)&svd->softlockcnt, -npages)) { 9535 if (async || AS_ISUNMAPWAIT(seg->s_as)) { 9536 mutex_enter(&seg->s_as->a_contents); 9537 if (async) { 9538 AS_SETNOUNMAPWAIT(seg->s_as); 9539 } 9540 if (AS_ISUNMAPWAIT(seg->s_as)) { 9541 AS_CLRUNMAPWAIT(seg->s_as); 9542 cv_broadcast(&seg->s_as->a_cv); 9543 } 9544 mutex_exit(&seg->s_as->a_contents); 9545 } 9546 } 9547 9548 if (async) { 9549 mutex_exit(&svd->segfree_syncmtx); 9550 } 9551 return (0); 9552 } 9553 9554 /*ARGSUSED*/ 9555 static int 9556 shamp_reclaim(void *ptag, caddr_t addr, size_t len, struct page **pplist, 9557 enum seg_rw rw, int async) 9558 { 9559 amp_t *amp = (amp_t *)ptag; 9560 pgcnt_t np, npages; 9561 struct page **pl; 9562 9563 npages = np = btop(len); 9564 ASSERT(npages); 9565 ASSERT(amp->a_softlockcnt >= npages); 9566 9567 pl = pplist; 9568 9569 ASSERT(pl[np] == NOPCACHE_SHWLIST || pl[np] == PCACHE_SHWLIST); 9570 ASSERT(!async || pl[np] == PCACHE_SHWLIST); 9571 9572 while (np > (uint_t)0) { 9573 if (rw == S_WRITE) { 9574 hat_setrefmod(*pplist); 9575 } else { 9576 hat_setref(*pplist); 9577 } 9578 page_unlock(*pplist); 9579 np--; 9580 pplist++; 9581 } 9582 9583 kmem_free(pl, sizeof (page_t *) * (npages + 1)); 9584 9585 /* 9586 * If somebody sleeps in anonmap_purge() wake them up if a_softlockcnt 9587 * drops to 0. anon map can't be freed until a_softlockcnt drops to 0 9588 * and anonmap_purge() acquires a_purgemtx. 
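* The handshake is presumably: the waiter in anonmap_purge() sets
* a_purgewait and blocks on a_purgecv under a_purgemtx until the
* shared amp has no pagelocked pages left, while here the final
* reclaim drops a_softlockcnt to 0 under the same mutex, clears
* a_purgewait and issues the broadcast.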
9589 */
9590 mutex_enter(&amp->a_purgemtx);
9591 if (!atomic_add_long_nv((ulong_t *)&amp->a_softlockcnt, -npages) &&
9592 amp->a_purgewait) {
9593 amp->a_purgewait = 0;
9594 cv_broadcast(&amp->a_purgecv);
9595 }
9596 mutex_exit(&amp->a_purgemtx);
9597 return (0);
9598 }
9599
9600 /*
9601 * get a memory ID for an addr in a given segment
9602 *
9603 * XXX only creates PAGESIZE pages if anon slots are not initialized.
9604 * At fault time they will be relocated into larger pages.
9605 */
9606 static int
9607 segvn_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
9608 {
9609 struct segvn_data *svd = (struct segvn_data *)seg->s_data;
9610 struct anon *ap = NULL;
9611 ulong_t anon_index;
9612 struct anon_map *amp;
9613 anon_sync_obj_t cookie;
9614
9615 if (svd->type == MAP_PRIVATE) {
9616 memidp->val[0] = (uintptr_t)seg->s_as;
9617 memidp->val[1] = (uintptr_t)addr;
9618 return (0);
9619 }
9620
9621 if (svd->type == MAP_SHARED) {
9622 if (svd->vp) {
9623 memidp->val[0] = (uintptr_t)svd->vp;
9624 memidp->val[1] = (u_longlong_t)svd->offset +
9625 (uintptr_t)(addr - seg->s_base);
9626 return (0);
9627 } else {
9628
9629 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
9630 if ((amp = svd->amp) != NULL) {
9631 anon_index = svd->anon_index +
9632 seg_page(seg, addr);
9633 }
9634 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
9635
9636 ASSERT(amp != NULL);
9637
9638 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
9639 anon_array_enter(amp, anon_index, &cookie);
9640 ap = anon_get_ptr(amp->ahp, anon_index);
9641 if (ap == NULL) {
9642 page_t *pp;
9643
9644 pp = anon_zero(seg, addr, &ap, svd->cred);
9645 if (pp == NULL) {
9646 anon_array_exit(&cookie);
9647 ANON_LOCK_EXIT(&amp->a_rwlock);
9648 return (ENOMEM);
9649 }
9650 ASSERT(anon_get_ptr(amp->ahp, anon_index)
9651 == NULL);
9652 (void) anon_set_ptr(amp->ahp, anon_index,
9653 ap, ANON_SLEEP);
9654 page_unlock(pp);
9655 }
9656
9657 anon_array_exit(&cookie);
9658 ANON_LOCK_EXIT(&amp->a_rwlock);
9659
9660 memidp->val[0] = (uintptr_t)ap;
9661 memidp->val[1] = (uintptr_t)addr & PAGEOFFSET;
9662 return (0);
9663 }
9664 }
9665 return (EINVAL);
9666 }
9667
9668 static int
9669 sameprot(struct seg *seg, caddr_t a, size_t len)
9670 {
9671 struct segvn_data *svd = (struct segvn_data *)seg->s_data;
9672 struct vpage *vpage;
9673 spgcnt_t pages = btop(len);
9674 uint_t prot;
9675
9676 if (svd->pageprot == 0)
9677 return (1);
9678
9679 ASSERT(svd->vpage != NULL);
9680
9681 vpage = &svd->vpage[seg_page(seg, a)];
9682 prot = VPP_PROT(vpage);
9683 vpage++;
9684 pages--;
9685 while (pages-- > 0) {
9686 if (prot != VPP_PROT(vpage))
9687 return (0);
9688 vpage++;
9689 }
9690 return (1);
9691 }
9692
9693 /*
9694 * Get memory allocation policy info for specified address in given segment
9695 */
9696 static lgrp_mem_policy_info_t *
9697 segvn_getpolicy(struct seg *seg, caddr_t addr)
9698 {
9699 struct anon_map *amp;
9700 ulong_t anon_index;
9701 lgrp_mem_policy_info_t *policy_info;
9702 struct segvn_data *svn_data;
9703 u_offset_t vn_off;
9704 vnode_t *vp;
9705
9706 ASSERT(seg != NULL);
9707
9708 svn_data = (struct segvn_data *)seg->s_data;
9709 if (svn_data == NULL)
9710 return (NULL);
9711
9712 /*
9713 * Get policy info for private or shared memory
9714 */
9715 if (svn_data->type != MAP_SHARED) {
9716 if (svn_data->tr_state != SEGVN_TR_ON) {
9717 policy_info = &svn_data->policy_info;
9718 } else {
9719 policy_info = &svn_data->tr_policy_info;
9720 ASSERT(policy_info->mem_policy ==
9721 LGRP_MEM_POLICY_NEXT_SEG);
9722 }
9723 } else {
9724 amp = svn_data->amp;
9725 anon_index =
svn_data->anon_index + seg_page(seg, addr); 9726 vp = svn_data->vp; 9727 vn_off = svn_data->offset + (uintptr_t)(addr - seg->s_base); 9728 policy_info = lgrp_shm_policy_get(amp, anon_index, vp, vn_off); 9729 } 9730 9731 return (policy_info); 9732 } 9733 9734 /*ARGSUSED*/ 9735 static int 9736 segvn_capable(struct seg *seg, segcapability_t capability) 9737 { 9738 return (0); 9739 } 9740 9741 /* 9742 * Bind text vnode segment to an amp. If we bind successfully mappings will be 9743 * established to per vnode mapping per lgroup amp pages instead of to vnode 9744 * pages. There's one amp per vnode text mapping per lgroup. Many processes 9745 * may share the same text replication amp. If a suitable amp doesn't already 9746 * exist in svntr hash table create a new one. We may fail to bind to amp if 9747 * segment is not eligible for text replication. Code below first checks for 9748 * these conditions. If binding is successful segment tr_state is set to on 9749 * and svd->amp points to the amp to use. Otherwise tr_state is set to off and 9750 * svd->amp remains as NULL. 9751 */ 9752 static void 9753 segvn_textrepl(struct seg *seg) 9754 { 9755 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 9756 vnode_t *vp = svd->vp; 9757 u_offset_t off = svd->offset; 9758 size_t size = seg->s_size; 9759 u_offset_t eoff = off + size; 9760 uint_t szc = seg->s_szc; 9761 ulong_t hash = SVNTR_HASH_FUNC(vp); 9762 svntr_t *svntrp; 9763 struct vattr va; 9764 proc_t *p = seg->s_as->a_proc; 9765 lgrp_id_t lgrp_id; 9766 lgrp_id_t olid; 9767 int first; 9768 struct anon_map *amp; 9769 9770 ASSERT(AS_LOCK_HELD(seg->s_as)); 9771 ASSERT(SEGVN_WRITE_HELD(seg->s_as, &svd->lock)); 9772 ASSERT(p != NULL); 9773 ASSERT(svd->tr_state == SEGVN_TR_INIT); 9774 ASSERT(!HAT_IS_REGION_COOKIE_VALID(svd->rcookie)); 9775 ASSERT(svd->flags & MAP_TEXT); 9776 ASSERT(svd->type == MAP_PRIVATE); 9777 ASSERT(vp != NULL && svd->amp == NULL); 9778 ASSERT(!svd->pageprot && !(svd->prot & PROT_WRITE)); 9779 ASSERT(!(svd->flags & MAP_NORESERVE) && svd->swresv == 0); 9780 ASSERT(seg->s_as != &kas); 9781 ASSERT(off < eoff); 9782 ASSERT(svntr_hashtab != NULL); 9783 9784 /* 9785 * If numa optimizations are no longer desired bail out. 9786 */ 9787 if (!lgrp_optimizations()) { 9788 svd->tr_state = SEGVN_TR_OFF; 9789 return; 9790 } 9791 9792 /* 9793 * Avoid creating anon maps with size bigger than the file size. 9794 * If VOP_GETATTR() call fails bail out. 9795 */ 9796 va.va_mask = AT_SIZE | AT_MTIME | AT_CTIME; 9797 if (VOP_GETATTR(vp, &va, 0, svd->cred, NULL) != 0) { 9798 svd->tr_state = SEGVN_TR_OFF; 9799 SEGVN_TR_ADDSTAT(gaerr); 9800 return; 9801 } 9802 if (btopr(va.va_size) < btopr(eoff)) { 9803 svd->tr_state = SEGVN_TR_OFF; 9804 SEGVN_TR_ADDSTAT(overmap); 9805 return; 9806 } 9807 9808 /* 9809 * VVMEXEC may not be set yet if exec() prefaults text segment. Set 9810 * this flag now before vn_is_mapped(V_WRITE) so that MAP_SHARED 9811 * mapping that checks if trcache for this vnode needs to be 9812 * invalidated can't miss us. 9813 */ 9814 if (!(vp->v_flag & VVMEXEC)) { 9815 mutex_enter(&vp->v_lock); 9816 vp->v_flag |= VVMEXEC; 9817 mutex_exit(&vp->v_lock); 9818 } 9819 mutex_enter(&svntr_hashtab[hash].tr_lock); 9820 /* 9821 * Bail out if potentially MAP_SHARED writable mappings exist to this 9822 * vnode. We don't want to use old file contents from existing 9823 * replicas if this mapping was established after the original file 9824 * was changed. 
9825 */ 9826 if (vn_is_mapped(vp, V_WRITE)) { 9827 mutex_exit(&svntr_hashtab[hash].tr_lock); 9828 svd->tr_state = SEGVN_TR_OFF; 9829 SEGVN_TR_ADDSTAT(wrcnt); 9830 return; 9831 } 9832 svntrp = svntr_hashtab[hash].tr_head; 9833 for (; svntrp != NULL; svntrp = svntrp->tr_next) { 9834 ASSERT(svntrp->tr_refcnt != 0); 9835 if (svntrp->tr_vp != vp) { 9836 continue; 9837 } 9838 9839 /* 9840 * Bail out if the file or its attributes were changed after 9841 * this replication entry was created since we need to use the 9842 * latest file contents. Note that mtime test alone is not 9843 * sufficient because a user can explicitly change mtime via 9844 * utimes(2) interfaces back to the old value after modifiying 9845 * the file contents. To detect this case we also have to test 9846 * ctime which among other things records the time of the last 9847 * mtime change by utimes(2). ctime is not changed when the file 9848 * is only read or executed so we expect that typically existing 9849 * replication amp's can be used most of the time. 9850 */ 9851 if (!svntrp->tr_valid || 9852 svntrp->tr_mtime.tv_sec != va.va_mtime.tv_sec || 9853 svntrp->tr_mtime.tv_nsec != va.va_mtime.tv_nsec || 9854 svntrp->tr_ctime.tv_sec != va.va_ctime.tv_sec || 9855 svntrp->tr_ctime.tv_nsec != va.va_ctime.tv_nsec) { 9856 mutex_exit(&svntr_hashtab[hash].tr_lock); 9857 svd->tr_state = SEGVN_TR_OFF; 9858 SEGVN_TR_ADDSTAT(stale); 9859 return; 9860 } 9861 /* 9862 * if off, eoff and szc match current segment we found the 9863 * existing entry we can use. 9864 */ 9865 if (svntrp->tr_off == off && svntrp->tr_eoff == eoff && 9866 svntrp->tr_szc == szc) { 9867 break; 9868 } 9869 /* 9870 * Don't create different but overlapping in file offsets 9871 * entries to avoid replication of the same file pages more 9872 * than once per lgroup. 9873 */ 9874 if ((off >= svntrp->tr_off && off < svntrp->tr_eoff) || 9875 (eoff > svntrp->tr_off && eoff <= svntrp->tr_eoff)) { 9876 mutex_exit(&svntr_hashtab[hash].tr_lock); 9877 svd->tr_state = SEGVN_TR_OFF; 9878 SEGVN_TR_ADDSTAT(overlap); 9879 return; 9880 } 9881 } 9882 /* 9883 * If we didn't find existing entry create a new one. 9884 */ 9885 if (svntrp == NULL) { 9886 svntrp = kmem_cache_alloc(svntr_cache, KM_NOSLEEP); 9887 if (svntrp == NULL) { 9888 mutex_exit(&svntr_hashtab[hash].tr_lock); 9889 svd->tr_state = SEGVN_TR_OFF; 9890 SEGVN_TR_ADDSTAT(nokmem); 9891 return; 9892 } 9893 #ifdef DEBUG 9894 { 9895 lgrp_id_t i; 9896 for (i = 0; i < NLGRPS_MAX; i++) { 9897 ASSERT(svntrp->tr_amp[i] == NULL); 9898 } 9899 } 9900 #endif /* DEBUG */ 9901 svntrp->tr_vp = vp; 9902 svntrp->tr_off = off; 9903 svntrp->tr_eoff = eoff; 9904 svntrp->tr_szc = szc; 9905 svntrp->tr_valid = 1; 9906 svntrp->tr_mtime = va.va_mtime; 9907 svntrp->tr_ctime = va.va_ctime; 9908 svntrp->tr_refcnt = 0; 9909 svntrp->tr_next = svntr_hashtab[hash].tr_head; 9910 svntr_hashtab[hash].tr_head = svntrp; 9911 } 9912 first = 1; 9913 again: 9914 /* 9915 * We want to pick a replica with pages on main thread's (t_tid = 1, 9916 * aka T1) lgrp. Currently text replication is only optimized for 9917 * workloads that either have all threads of a process on the same 9918 * lgrp or execute their large text primarily on main thread. 9919 */ 9920 lgrp_id = p->p_t1_lgrpid; 9921 if (lgrp_id == LGRP_NONE) { 9922 /* 9923 * In case exec() prefaults text on non main thread use 9924 * current thread lgrpid. It will become main thread anyway 9925 * soon. 9926 */ 9927 lgrp_id = lgrp_home_id(curthread); 9928 } 9929 /* 9930 * Set p_tr_lgrpid to lgrpid if it hasn't been set yet. 
	first = 1;
again:
	/*
	 * We want to pick a replica with pages on the main thread's
	 * (t_tid == 1, aka T1) lgrp. Currently text replication is only
	 * optimized for workloads that either have all threads of a process
	 * on the same lgrp or execute their large text primarily on the main
	 * thread.
	 */
	lgrp_id = p->p_t1_lgrpid;
	if (lgrp_id == LGRP_NONE) {
		/*
		 * In case exec() prefaults text on a non-main thread, use the
		 * current thread's lgrpid. It will become the main thread
		 * soon anyway.
		 */
		lgrp_id = lgrp_home_id(curthread);
	}
	/*
	 * Set p_tr_lgrpid to lgrp_id if it hasn't been set yet. Otherwise
	 * just set it to NLGRPS_MAX if it's different from the current
	 * process T1 home lgrp. p_tr_lgrpid is used to detect if the process
	 * uses text replication and T1's new home is different from the lgrp
	 * used for text replication. When this happens the asynchronous segvn
	 * thread rechecks if segments should change the lgrps used for text
	 * replication. If we fail to set p_tr_lgrpid with atomic_cas_32 then
	 * set it to NLGRPS_MAX without cas if it's not already NLGRPS_MAX and
	 * not equal to the lgrp_id we want to use. We don't need to use cas
	 * in this case because another thread that races in between our
	 * non-atomic check and set may only change p_tr_lgrpid to NLGRPS_MAX
	 * at this point.
	 */
	ASSERT(lgrp_id != LGRP_NONE && lgrp_id < NLGRPS_MAX);
	olid = p->p_tr_lgrpid;
	if (lgrp_id != olid && olid != NLGRPS_MAX) {
		lgrp_id_t nlid = (olid == LGRP_NONE) ? lgrp_id : NLGRPS_MAX;
		if (atomic_cas_32((uint32_t *)&p->p_tr_lgrpid, olid, nlid) !=
		    olid) {
			olid = p->p_tr_lgrpid;
			ASSERT(olid != LGRP_NONE);
			if (olid != lgrp_id && olid != NLGRPS_MAX) {
				p->p_tr_lgrpid = NLGRPS_MAX;
			}
		}
		ASSERT(p->p_tr_lgrpid != LGRP_NONE);
		membar_producer();
		/*
		 * lgrp_move_thread() won't schedule an async recheck after a
		 * p->p_t1_lgrpid update unless p->p_tr_lgrpid is not
		 * LGRP_NONE. Recheck p_t1_lgrpid once now that p->p_tr_lgrpid
		 * is not LGRP_NONE.
		 */
		if (first && p->p_t1_lgrpid != LGRP_NONE &&
		    p->p_t1_lgrpid != lgrp_id) {
			first = 0;
			goto again;
		}
	}
	/*
	 * If no amp was created yet for lgrp_id, create a new one as long as
	 * we can afford the memory.
	 */
	if ((amp = svntrp->tr_amp[lgrp_id]) == NULL) {
		size_t trmem = atomic_add_long_nv(&segvn_textrepl_bytes, size);
		if (trmem > segvn_textrepl_max_bytes) {
			SEGVN_TR_ADDSTAT(normem);
			goto fail;
		}
		if (anon_try_resv_zone(size, NULL) == 0) {
			SEGVN_TR_ADDSTAT(noanon);
			goto fail;
		}
		amp = anonmap_alloc(size, size, ANON_NOSLEEP);
		if (amp == NULL) {
			anon_unresv_zone(size, NULL);
			SEGVN_TR_ADDSTAT(nokmem);
			goto fail;
		}
		ASSERT(amp->refcnt == 1);
		amp->a_szc = szc;
		svntrp->tr_amp[lgrp_id] = amp;
		SEGVN_TR_ADDSTAT(newamp);
	}
	svntrp->tr_refcnt++;
	ASSERT(svd->svn_trnext == NULL);
	ASSERT(svd->svn_trprev == NULL);
	svd->svn_trnext = svntrp->tr_svnhead;
	svd->svn_trprev = NULL;
	if (svntrp->tr_svnhead != NULL) {
		svntrp->tr_svnhead->svn_trprev = svd;
	}
	svntrp->tr_svnhead = svd;
	ASSERT(amp->a_szc == szc && amp->size == size && amp->swresv == size);
	ASSERT(amp->refcnt >= 1);
	svd->amp = amp;
	svd->anon_index = 0;
	svd->tr_policy_info.mem_policy = LGRP_MEM_POLICY_NEXT_SEG;
	svd->tr_policy_info.mem_lgrpid = lgrp_id;
	svd->tr_state = SEGVN_TR_ON;
	mutex_exit(&svntr_hashtab[hash].tr_lock);
	SEGVN_TR_ADDSTAT(repl);
	return;
fail:
	ASSERT(segvn_textrepl_bytes >= size);
	atomic_add_long(&segvn_textrepl_bytes, -size);
	ASSERT(svntrp != NULL);
	ASSERT(svntrp->tr_amp[lgrp_id] == NULL);
	if (svntrp->tr_refcnt == 0) {
		ASSERT(svntrp == svntr_hashtab[hash].tr_head);
		svntr_hashtab[hash].tr_head = svntrp->tr_next;
		mutex_exit(&svntr_hashtab[hash].tr_lock);
		kmem_cache_free(svntr_cache, svntrp);
	} else {
		mutex_exit(&svntr_hashtab[hash].tr_lock);
	}
	svd->tr_state = SEGVN_TR_OFF;
}

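/*
 * Accounting note: segvn_textrepl() above charges segvn_textrepl_bytes up
 * front and undoes the charge on its fail: path. Anon swap reserved with
 * anon_try_resv_zone() is released either inline when the amp allocation
 * fails, or later in segvn_textunrepl() below once the last segment using an
 * svntr entry goes away.
 */
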
/*
 * Convert the seg back to a regular vnode mapping seg by unbinding it from
 * its text replication amp. This routine is most typically called when a
 * segment is unmapped, but it can also be called when a segment no longer
 * qualifies for text replication (e.g. due to protection changes). If
 * unload_unmap is set, use the HAT_UNLOAD_UNMAP flag in
 * hat_unload_callback(). If we are the last user of the svntr entry, free
 * all its anon maps and remove it from the hash table.
 */
static void
segvn_textunrepl(struct seg *seg, int unload_unmap)
{
	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
	vnode_t *vp = svd->vp;
	u_offset_t off = svd->offset;
	size_t size = seg->s_size;
	u_offset_t eoff = off + size;
	uint_t szc = seg->s_szc;
	ulong_t hash = SVNTR_HASH_FUNC(vp);
	svntr_t *svntrp;
	svntr_t **prv_svntrp;
	lgrp_id_t lgrp_id = svd->tr_policy_info.mem_lgrpid;
	lgrp_id_t i;

	ASSERT(AS_LOCK_HELD(seg->s_as));
	ASSERT(AS_WRITE_HELD(seg->s_as) ||
	    SEGVN_WRITE_HELD(seg->s_as, &svd->lock));
	ASSERT(svd->tr_state == SEGVN_TR_ON);
	ASSERT(!HAT_IS_REGION_COOKIE_VALID(svd->rcookie));
	ASSERT(svd->amp != NULL);
	ASSERT(svd->amp->refcnt >= 1);
	ASSERT(svd->anon_index == 0);
	ASSERT(lgrp_id != LGRP_NONE && lgrp_id < NLGRPS_MAX);
	ASSERT(svntr_hashtab != NULL);

	mutex_enter(&svntr_hashtab[hash].tr_lock);
	prv_svntrp = &svntr_hashtab[hash].tr_head;
	for (; (svntrp = *prv_svntrp) != NULL; prv_svntrp = &svntrp->tr_next) {
		ASSERT(svntrp->tr_refcnt != 0);
		if (svntrp->tr_vp == vp && svntrp->tr_off == off &&
		    svntrp->tr_eoff == eoff && svntrp->tr_szc == szc) {
			break;
		}
	}
	if (svntrp == NULL) {
		panic("segvn_textunrepl: svntr record not found");
	}
	if (svntrp->tr_amp[lgrp_id] != svd->amp) {
		panic("segvn_textunrepl: amp mismatch");
	}
	svd->tr_state = SEGVN_TR_OFF;
	svd->amp = NULL;
	if (svd->svn_trprev == NULL) {
		ASSERT(svntrp->tr_svnhead == svd);
		svntrp->tr_svnhead = svd->svn_trnext;
		if (svntrp->tr_svnhead != NULL) {
			svntrp->tr_svnhead->svn_trprev = NULL;
		}
		svd->svn_trnext = NULL;
	} else {
		svd->svn_trprev->svn_trnext = svd->svn_trnext;
		if (svd->svn_trnext != NULL) {
			svd->svn_trnext->svn_trprev = svd->svn_trprev;
			svd->svn_trnext = NULL;
		}
		svd->svn_trprev = NULL;
	}
	if (--svntrp->tr_refcnt) {
		mutex_exit(&svntr_hashtab[hash].tr_lock);
		goto done;
	}
	*prv_svntrp = svntrp->tr_next;
	mutex_exit(&svntr_hashtab[hash].tr_lock);
	for (i = 0; i < NLGRPS_MAX; i++) {
		struct anon_map *amp = svntrp->tr_amp[i];
		if (amp == NULL) {
			continue;
		}
		ASSERT(amp->refcnt == 1);
		ASSERT(amp->swresv == size);
		ASSERT(amp->size == size);
		ASSERT(amp->a_szc == szc);
		if (amp->a_szc != 0) {
			anon_free_pages(amp->ahp, 0, size, szc);
		} else {
			anon_free(amp->ahp, 0, size);
		}
		svntrp->tr_amp[i] = NULL;
		ASSERT(segvn_textrepl_bytes >= size);
		atomic_add_long(&segvn_textrepl_bytes, -size);
		anon_unresv_zone(amp->swresv, NULL);
		amp->refcnt = 0;
		anonmap_free(amp);
	}
	kmem_cache_free(svntr_cache, svntrp);
done:
	hat_unload_callback(seg->s_as->a_hat, seg->s_base, size,
	    unload_unmap ? HAT_UNLOAD_UNMAP : 0, NULL);
}

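/*
 * Each svntr entry keeps a doubly linked list of the segments that share it:
 * tr_svnhead points at the first segvn_data and the svn_trnext/svn_trprev
 * fields link the rest. segvn_textrepl() adds a segment at the head and bumps
 * tr_refcnt; segvn_textunrepl() above unlinks the segment and, once the last
 * reference is dropped, frees the per-lgroup anon maps and the entry itself.
 * All list manipulations are done under the bucket's tr_lock.
 */
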
/*
 * This is called when a MAP_SHARED writable mapping is created to a vnode
 * that is currently used for execution (VVMEXEC flag is set). In this case we
 * need to prevent further use of existing replicas.
 */
static void
segvn_inval_trcache(vnode_t *vp)
{
	ulong_t hash = SVNTR_HASH_FUNC(vp);
	svntr_t *svntrp;

	ASSERT(vp->v_flag & VVMEXEC);

	if (svntr_hashtab == NULL) {
		return;
	}

	mutex_enter(&svntr_hashtab[hash].tr_lock);
	svntrp = svntr_hashtab[hash].tr_head;
	for (; svntrp != NULL; svntrp = svntrp->tr_next) {
		ASSERT(svntrp->tr_refcnt != 0);
		if (svntrp->tr_vp == vp && svntrp->tr_valid) {
			svntrp->tr_valid = 0;
		}
	}
	mutex_exit(&svntr_hashtab[hash].tr_lock);
}

static void
segvn_trasync_thread(void)
{
	callb_cpr_t cpr_info;
	kmutex_t cpr_lock;	/* just for CPR stuff */

	mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);

	CALLB_CPR_INIT(&cpr_info, &cpr_lock,
	    callb_generic_cpr, "segvn_async");

	if (segvn_update_textrepl_interval == 0) {
		segvn_update_textrepl_interval = segvn_update_tr_time * hz;
	} else {
		segvn_update_textrepl_interval *= hz;
	}
	(void) timeout(segvn_trupdate_wakeup, NULL,
	    segvn_update_textrepl_interval);

	for (;;) {
		mutex_enter(&cpr_lock);
		CALLB_CPR_SAFE_BEGIN(&cpr_info);
		mutex_exit(&cpr_lock);
		sema_p(&segvn_trasync_sem);
		mutex_enter(&cpr_lock);
		CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
		mutex_exit(&cpr_lock);
		segvn_trupdate();
	}
}

static uint64_t segvn_lgrp_trthr_migrs_snpsht = 0;

static void
segvn_trupdate_wakeup(void *dummy)
{
	uint64_t cur_lgrp_trthr_migrs = lgrp_get_trthr_migrations();

	if (cur_lgrp_trthr_migrs != segvn_lgrp_trthr_migrs_snpsht) {
		segvn_lgrp_trthr_migrs_snpsht = cur_lgrp_trthr_migrs;
		sema_v(&segvn_trasync_sem);
	}

	if (!segvn_disable_textrepl_update &&
	    segvn_update_textrepl_interval != 0) {
		(void) timeout(segvn_trupdate_wakeup, dummy,
		    segvn_update_textrepl_interval);
	}
}

static void
segvn_trupdate(void)
{
	ulong_t hash;
	svntr_t *svntrp;
	segvn_data_t *svd;

	ASSERT(svntr_hashtab != NULL);

	for (hash = 0; hash < svntr_hashtab_sz; hash++) {
		mutex_enter(&svntr_hashtab[hash].tr_lock);
		svntrp = svntr_hashtab[hash].tr_head;
		for (; svntrp != NULL; svntrp = svntrp->tr_next) {
			ASSERT(svntrp->tr_refcnt != 0);
			svd = svntrp->tr_svnhead;
			for (; svd != NULL; svd = svd->svn_trnext) {
				segvn_trupdate_seg(svd->seg, svd, svntrp,
				    hash);
			}
		}
		mutex_exit(&svntr_hashtab[hash].tr_lock);
	}
}

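/*
 * Asynchronous replication update path (sketch of the code above):
 *
 *	segvn_trupdate_wakeup()		timeout handler; re-arms itself and
 *					posts segvn_trasync_sem when the lgrp
 *					T1 migration count has changed
 *	segvn_trasync_thread()		waits on segvn_trasync_sem and calls
 *					segvn_trupdate()
 *	segvn_trupdate()		walks every svntr hash bucket and every
 *					segment on each entry's tr_svnhead list
 *	segvn_trupdate_seg()		below; switches one segment to the amp
 *					of its process' current T1 home lgrp
 */
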
static void
segvn_trupdate_seg(struct seg *seg, segvn_data_t *svd, svntr_t *svntrp,
    ulong_t hash)
{
	proc_t *p;
	lgrp_id_t lgrp_id;
	struct as *as;
	size_t size;
	struct anon_map *amp;

	ASSERT(svd->vp != NULL);
	ASSERT(svd->vp == svntrp->tr_vp);
	ASSERT(svd->offset == svntrp->tr_off);
	ASSERT(svd->offset + seg->s_size == svntrp->tr_eoff);
	ASSERT(seg != NULL);
	ASSERT(svd->seg == seg);
	ASSERT(seg->s_data == (void *)svd);
	ASSERT(seg->s_szc == svntrp->tr_szc);
	ASSERT(svd->tr_state == SEGVN_TR_ON);
	ASSERT(!HAT_IS_REGION_COOKIE_VALID(svd->rcookie));
	ASSERT(svd->amp != NULL);
	ASSERT(svd->tr_policy_info.mem_policy == LGRP_MEM_POLICY_NEXT_SEG);
	ASSERT(svd->tr_policy_info.mem_lgrpid != LGRP_NONE);
	ASSERT(svd->tr_policy_info.mem_lgrpid < NLGRPS_MAX);
	ASSERT(svntrp->tr_amp[svd->tr_policy_info.mem_lgrpid] == svd->amp);
	ASSERT(svntrp->tr_refcnt != 0);
	ASSERT(mutex_owned(&svntr_hashtab[hash].tr_lock));

	as = seg->s_as;
	ASSERT(as != NULL && as != &kas);
	p = as->a_proc;
	ASSERT(p != NULL);
	ASSERT(p->p_tr_lgrpid != LGRP_NONE);
	lgrp_id = p->p_t1_lgrpid;
	if (lgrp_id == LGRP_NONE) {
		return;
	}
	ASSERT(lgrp_id < NLGRPS_MAX);
	if (svd->tr_policy_info.mem_lgrpid == lgrp_id) {
		return;
	}

	/*
	 * Use tryenter locking since we take the as/seg locks while holding
	 * the svntr hash lock, which is the reverse of the synchronous
	 * threads' lock order.
	 */
	if (!AS_LOCK_TRYENTER(as, RW_READER)) {
		SEGVN_TR_ADDSTAT(nolock);
		if (segvn_lgrp_trthr_migrs_snpsht) {
			segvn_lgrp_trthr_migrs_snpsht = 0;
		}
		return;
	}
	if (!SEGVN_LOCK_TRYENTER(seg->s_as, &svd->lock, RW_WRITER)) {
		AS_LOCK_EXIT(as);
		SEGVN_TR_ADDSTAT(nolock);
		if (segvn_lgrp_trthr_migrs_snpsht) {
			segvn_lgrp_trthr_migrs_snpsht = 0;
		}
		return;
	}
	size = seg->s_size;
	if (svntrp->tr_amp[lgrp_id] == NULL) {
		size_t trmem = atomic_add_long_nv(&segvn_textrepl_bytes, size);
		if (trmem > segvn_textrepl_max_bytes) {
			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
			AS_LOCK_EXIT(as);
			atomic_add_long(&segvn_textrepl_bytes, -size);
			SEGVN_TR_ADDSTAT(normem);
			return;
		}
		if (anon_try_resv_zone(size, NULL) == 0) {
			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
			AS_LOCK_EXIT(as);
			atomic_add_long(&segvn_textrepl_bytes, -size);
			SEGVN_TR_ADDSTAT(noanon);
			return;
		}
		amp = anonmap_alloc(size, size, KM_NOSLEEP);
		if (amp == NULL) {
			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
			AS_LOCK_EXIT(as);
			atomic_add_long(&segvn_textrepl_bytes, -size);
			anon_unresv_zone(size, NULL);
			SEGVN_TR_ADDSTAT(nokmem);
			return;
		}
		ASSERT(amp->refcnt == 1);
		amp->a_szc = seg->s_szc;
		svntrp->tr_amp[lgrp_id] = amp;
	}
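	/*
	 * Unloading the existing translations below ensures that subsequent
	 * faults on this segment are satisfied from the newly chosen lgroup's
	 * amp once svd->amp is switched over.
	 */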
	/*
	 * We don't need to drop the bucket lock here, but we do so to give
	 * other threads a chance. svntr and svd can't be unlinked as long as
	 * the segment lock is held as a writer and the AS lock is held as
	 * well. After we retake the bucket lock we'll continue from where we
	 * left off. We'll be able to reach the end of either list since new
	 * entries are always added to the beginning of the lists.
	 */
	mutex_exit(&svntr_hashtab[hash].tr_lock);
	hat_unload_callback(as->a_hat, seg->s_base, size, 0, NULL);
	mutex_enter(&svntr_hashtab[hash].tr_lock);

	ASSERT(svd->tr_state == SEGVN_TR_ON);
	ASSERT(svd->amp != NULL);
	ASSERT(svd->tr_policy_info.mem_policy == LGRP_MEM_POLICY_NEXT_SEG);
	ASSERT(svd->tr_policy_info.mem_lgrpid != lgrp_id);
	ASSERT(svd->amp != svntrp->tr_amp[lgrp_id]);

	svd->tr_policy_info.mem_lgrpid = lgrp_id;
	svd->amp = svntrp->tr_amp[lgrp_id];
	p->p_tr_lgrpid = NLGRPS_MAX;
	SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
	AS_LOCK_EXIT(as);

	ASSERT(svntrp->tr_refcnt != 0);
	ASSERT(svd->vp == svntrp->tr_vp);
	ASSERT(svd->tr_policy_info.mem_lgrpid == lgrp_id);
	ASSERT(svd->amp != NULL && svd->amp == svntrp->tr_amp[lgrp_id]);
	ASSERT(svd->seg == seg);
	ASSERT(svd->tr_state == SEGVN_TR_ON);

	SEGVN_TR_ADDSTAT(asyncrepl);
}