1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * University Copyright- Copyright (c) 1982, 1986, 1988 31 * The Regents of the University of California 32 * All Rights Reserved 33 * 34 * University Acknowledgment- Portions of this document are derived from 35 * software developed by the University of California, Berkeley, and its 36 * contributors. 37 */ 38 39 #pragma ident "%Z%%M% %I% %E% SMI" 40 41 /* 42 * VM - shared or copy-on-write from a vnode/anonymous memory. 43 */ 44 45 #include <sys/types.h> 46 #include <sys/param.h> 47 #include <sys/t_lock.h> 48 #include <sys/errno.h> 49 #include <sys/systm.h> 50 #include <sys/mman.h> 51 #include <sys/debug.h> 52 #include <sys/cred.h> 53 #include <sys/vmsystm.h> 54 #include <sys/tuneable.h> 55 #include <sys/bitmap.h> 56 #include <sys/swap.h> 57 #include <sys/kmem.h> 58 #include <sys/sysmacros.h> 59 #include <sys/vtrace.h> 60 #include <sys/cmn_err.h> 61 #include <sys/callb.h> 62 #include <sys/vm.h> 63 #include <sys/dumphdr.h> 64 #include <sys/lgrp.h> 65 66 #include <vm/hat.h> 67 #include <vm/as.h> 68 #include <vm/seg.h> 69 #include <vm/seg_vn.h> 70 #include <vm/pvn.h> 71 #include <vm/anon.h> 72 #include <vm/page.h> 73 #include <vm/vpage.h> 74 #include <sys/proc.h> 75 #include <sys/task.h> 76 #include <sys/project.h> 77 #include <sys/zone.h> 78 #include <sys/shm_impl.h> 79 /* 80 * Private seg op routines. 
81 */ 82 static int segvn_dup(struct seg *seg, struct seg *newseg); 83 static int segvn_unmap(struct seg *seg, caddr_t addr, size_t len); 84 static void segvn_free(struct seg *seg); 85 static faultcode_t segvn_fault(struct hat *hat, struct seg *seg, 86 caddr_t addr, size_t len, enum fault_type type, 87 enum seg_rw rw); 88 static faultcode_t segvn_faulta(struct seg *seg, caddr_t addr); 89 static int segvn_setprot(struct seg *seg, caddr_t addr, 90 size_t len, uint_t prot); 91 static int segvn_checkprot(struct seg *seg, caddr_t addr, 92 size_t len, uint_t prot); 93 static int segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta); 94 static size_t segvn_swapout(struct seg *seg); 95 static int segvn_sync(struct seg *seg, caddr_t addr, size_t len, 96 int attr, uint_t flags); 97 static size_t segvn_incore(struct seg *seg, caddr_t addr, size_t len, 98 char *vec); 99 static int segvn_lockop(struct seg *seg, caddr_t addr, size_t len, 100 int attr, int op, ulong_t *lockmap, size_t pos); 101 static int segvn_getprot(struct seg *seg, caddr_t addr, size_t len, 102 uint_t *protv); 103 static u_offset_t segvn_getoffset(struct seg *seg, caddr_t addr); 104 static int segvn_gettype(struct seg *seg, caddr_t addr); 105 static int segvn_getvp(struct seg *seg, caddr_t addr, 106 struct vnode **vpp); 107 static int segvn_advise(struct seg *seg, caddr_t addr, size_t len, 108 uint_t behav); 109 static void segvn_dump(struct seg *seg); 110 static int segvn_pagelock(struct seg *seg, caddr_t addr, size_t len, 111 struct page ***ppp, enum lock_type type, enum seg_rw rw); 112 static int segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len, 113 uint_t szc); 114 static int segvn_getmemid(struct seg *seg, caddr_t addr, 115 memid_t *memidp); 116 static lgrp_mem_policy_info_t *segvn_getpolicy(struct seg *, caddr_t); 117 static int segvn_capable(struct seg *seg, segcapability_t capable); 118 119 struct seg_ops segvn_ops = { 120 segvn_dup, 121 segvn_unmap, 122 segvn_free, 123 segvn_fault, 124 segvn_faulta, 125 segvn_setprot, 126 segvn_checkprot, 127 segvn_kluster, 128 segvn_swapout, 129 segvn_sync, 130 segvn_incore, 131 segvn_lockop, 132 segvn_getprot, 133 segvn_getoffset, 134 segvn_gettype, 135 segvn_getvp, 136 segvn_advise, 137 segvn_dump, 138 segvn_pagelock, 139 segvn_setpagesize, 140 segvn_getmemid, 141 segvn_getpolicy, 142 segvn_capable, 143 }; 144 145 /* 146 * Common zfod structures, provided as a shorthand for others to use. 
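 *
 * Illustrative usage only (caller code sketched from the typical pattern
 * elsewhere in the kernel, not part of this file): a caller that wants an
 * anonymous zero-fill-on-demand mapping passes one of the argsp pointers
 * below to as_map(), e.g.
 *
 *	error = as_map(as, addr, len, segvn_create, zfod_argsp);
 *
 * which creates a vnode-less segvn segment with PROT_ZFOD protections.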
147 */ 148 static segvn_crargs_t zfod_segvn_crargs = 149 SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL); 150 static segvn_crargs_t kzfod_segvn_crargs = 151 SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_USER, 152 PROT_ALL & ~PROT_USER); 153 static segvn_crargs_t stack_noexec_crargs = 154 SEGVN_ZFOD_ARGS(PROT_ZFOD & ~PROT_EXEC, PROT_ALL); 155 156 caddr_t zfod_argsp = (caddr_t)&zfod_segvn_crargs; /* user zfod argsp */ 157 caddr_t kzfod_argsp = (caddr_t)&kzfod_segvn_crargs; /* kernel zfod argsp */ 158 caddr_t stack_exec_argsp = (caddr_t)&zfod_segvn_crargs; /* executable stack */ 159 caddr_t stack_noexec_argsp = (caddr_t)&stack_noexec_crargs; /* noexec stack */ 160 161 #define vpgtob(n) ((n) * sizeof (struct vpage)) /* For brevity */ 162 163 size_t segvn_comb_thrshld = UINT_MAX; /* patchable -- see 1196681 */ 164 165 static int segvn_concat(struct seg *, struct seg *, int); 166 static int segvn_extend_prev(struct seg *, struct seg *, 167 struct segvn_crargs *, size_t); 168 static int segvn_extend_next(struct seg *, struct seg *, 169 struct segvn_crargs *, size_t); 170 static void segvn_softunlock(struct seg *, caddr_t, size_t, enum seg_rw); 171 static void segvn_pagelist_rele(page_t **); 172 static void segvn_setvnode_mpss(vnode_t *); 173 static void segvn_relocate_pages(page_t **, page_t *); 174 static int segvn_full_szcpages(page_t **, uint_t, int *, uint_t *); 175 static int segvn_fill_vp_pages(struct segvn_data *, vnode_t *, u_offset_t, 176 uint_t, page_t **, page_t **, uint_t *, int *); 177 static faultcode_t segvn_fault_vnodepages(struct hat *, struct seg *, caddr_t, 178 caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int); 179 static faultcode_t segvn_fault_anonpages(struct hat *, struct seg *, caddr_t, 180 caddr_t, enum fault_type, enum seg_rw, caddr_t, caddr_t, int); 181 static faultcode_t segvn_faultpage(struct hat *, struct seg *, caddr_t, 182 u_offset_t, struct vpage *, page_t **, uint_t, 183 enum fault_type, enum seg_rw, int, int); 184 static void segvn_vpage(struct seg *); 185 186 static void segvn_purge(struct seg *seg); 187 static int segvn_reclaim(struct seg *, caddr_t, size_t, struct page **, 188 enum seg_rw); 189 190 static int sameprot(struct seg *, caddr_t, size_t); 191 192 static int segvn_demote_range(struct seg *, caddr_t, size_t, int, uint_t); 193 static int segvn_clrszc(struct seg *); 194 static struct seg *segvn_split_seg(struct seg *, caddr_t); 195 static int segvn_claim_pages(struct seg *, struct vpage *, u_offset_t, 196 ulong_t, uint_t); 197 198 static int segvn_pp_lock_anonpages(page_t *, int); 199 static void segvn_pp_unlock_anonpages(page_t *, int); 200 201 static struct kmem_cache *segvn_cache; 202 203 #ifdef VM_STATS 204 static struct segvnvmstats_str { 205 ulong_t fill_vp_pages[31]; 206 ulong_t fltvnpages[49]; 207 ulong_t fullszcpages[10]; 208 ulong_t relocatepages[3]; 209 ulong_t fltanpages[17]; 210 ulong_t pagelock[3]; 211 ulong_t demoterange[3]; 212 } segvnvmstats; 213 #endif /* VM_STATS */ 214 215 #define SDR_RANGE 1 /* demote entire range */ 216 #define SDR_END 2 /* demote non aligned ends only */ 217 218 #define CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr) { \ 219 if ((len) != 0) { \ 220 lpgaddr = (caddr_t)P2ALIGN((uintptr_t)(addr), pgsz); \ 221 ASSERT(lpgaddr >= (seg)->s_base); \ 222 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)((addr) + \ 223 (len)), pgsz); \ 224 ASSERT(lpgeaddr > lpgaddr); \ 225 ASSERT(lpgeaddr <= (seg)->s_base + (seg)->s_size); \ 226 } else { \ 227 lpgeaddr = lpgaddr = (addr); \ 228 } \ 229 } 230 231 /*ARGSUSED*/ 232 static int 233 
segvn_cache_constructor(void *buf, void *cdrarg, int kmflags) 234 { 235 struct segvn_data *svd = buf; 236 237 rw_init(&svd->lock, NULL, RW_DEFAULT, NULL); 238 mutex_init(&svd->segp_slock, NULL, MUTEX_DEFAULT, NULL); 239 svd->svn_trnext = svd->svn_trprev = NULL; 240 return (0); 241 } 242 243 /*ARGSUSED1*/ 244 static void 245 segvn_cache_destructor(void *buf, void *cdrarg) 246 { 247 struct segvn_data *svd = buf; 248 249 rw_destroy(&svd->lock); 250 mutex_destroy(&svd->segp_slock); 251 } 252 253 /*ARGSUSED*/ 254 static int 255 svntr_cache_constructor(void *buf, void *cdrarg, int kmflags) 256 { 257 bzero(buf, sizeof (svntr_t)); 258 return (0); 259 } 260 261 /* 262 * Patching this variable to non-zero allows the system to run with 263 * stacks marked as "not executable". It's a bit of a kludge, but is 264 * provided as a tweakable for platforms that export those ABIs 265 * (e.g. sparc V8) that have executable stacks enabled by default. 266 * There are also some restrictions for platforms that don't actually 267 * implement 'noexec' protections. 268 * 269 * Once enabled, the system is (therefore) unable to provide a fully 270 * ABI-compliant execution environment, though practically speaking, 271 * most everything works. The exceptions are generally some interpreters 272 * and debuggers that create executable code on the stack and jump 273 * into it (without explicitly mprotecting the address range to include 274 * PROT_EXEC). 275 * 276 * One important class of applications that are disabled are those 277 * that have been transformed into malicious agents using one of the 278 * numerous "buffer overflow" attacks. See 4007890. 279 */ 280 int noexec_user_stack = 0; 281 int noexec_user_stack_log = 1; 282 283 int segvn_lpg_disable = 0; 284 uint_t segvn_maxpgszc = 0; 285 286 ulong_t segvn_vmpss_clrszc_cnt; 287 ulong_t segvn_vmpss_clrszc_err; 288 ulong_t segvn_fltvnpages_clrszc_cnt; 289 ulong_t segvn_fltvnpages_clrszc_err; 290 ulong_t segvn_setpgsz_align_err; 291 ulong_t segvn_setpgsz_anon_align_err; 292 ulong_t segvn_setpgsz_getattr_err; 293 ulong_t segvn_setpgsz_eof_err; 294 ulong_t segvn_faultvnmpss_align_err1; 295 ulong_t segvn_faultvnmpss_align_err2; 296 ulong_t segvn_faultvnmpss_align_err3; 297 ulong_t segvn_faultvnmpss_align_err4; 298 ulong_t segvn_faultvnmpss_align_err5; 299 ulong_t segvn_vmpss_pageio_deadlk_err; 300 301 /* 302 * Segvn supports text replication optimization for NUMA platforms. Text 303 * replica's are represented by anon maps (amp). There's one amp per text file 304 * region per lgroup. A process chooses the amp for each of its text mappings 305 * based on the lgroup assignment of its main thread (t_tid = 1). All 306 * processes that want a replica on a particular lgroup for the same text file 307 * mapping share the same amp. amp's are looked up in svntr_hashtab hash table 308 * with vp,off,size,szc used as a key. Text replication segments are read only 309 * MAP_PRIVATE|MAP_TEXT segments that map vnode. Replication is achieved by 310 * forcing COW faults from vnode to amp and mapping amp pages instead of vnode 311 * pages. Replication amp is assigned to a segment when it gets its first 312 * pagefault. To handle main thread lgroup rehoming segvn_trasync_thread 313 * rechecks periodically if the process still maps an amp local to the main 314 * thread. If not async thread forces process to remap to an amp in the new 315 * home lgroup of the main thread. 
 * The current text replication implementation only benefits workloads that
 * do most of their work in the main thread of a process, or whose threads
 * all run in the same lgroup.  To extend the benefit to other kinds of
 * multithreaded workloads, further work would be needed in the hat layer to
 * allow the same virtual address in the same hat to simultaneously map
 * different physical addresses (i.e. page table replication would be needed
 * for x86).
 *
 * amp pages are used instead of vnode pages only as long as the segment has
 * a very simple life cycle: it is created via segvn_create(), handles S_EXEC
 * (S_READ) pagefaults and is fully unmapped.  If anything more complicated
 * happens, such as a protection change, a real COW fault, a pagesize change,
 * an MC_LOCK request or a partial unmap, we turn off text replication by
 * converting the segment back to a vnode-only segment (unmap the segment's
 * address range and set svd->amp to NULL).
 *
 * The original file can be changed after an amp is inserted into
 * svntr_hashtab.  Processes that are launched after the file has already
 * changed can't use the replicas created prior to the file change.  To
 * implement this, hash entries are timestamped: a replica can only be used
 * if the current file modification time matches the timestamp saved when
 * the hash entry was created.  However, timestamps alone are not sufficient
 * to detect file modification via mmap(MAP_SHARED) mappings, so file changes
 * via MAP_SHARED mappings are handled differently.  When writable MAP_SHARED
 * mappings are created to vnodes marked as executable, we mark all existing
 * replicas for that vnode as unusable for future text mappings.  And we
 * don't create new replicas for files that currently have potentially
 * writable MAP_SHARED mappings (i.e. vn_is_mapped(V_WRITE) is true).
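 *
 * As an illustration only (the field and hash-function names below are
 * approximate, not the actual svntr_t layout), the replica lookup amounts to:
 *
 *	bucket = &svntr_hashtab[HASH(vp)];
 *	foreach entry on bucket:
 *		if (entry matches <vp, off, size, szc> &&
 *		    entry timestamp == current file mtime)
 *			use entry's amp for the main thread's home lgroup;
 *
 * i.e. one amp per (text file region, lgroup), shared by every process
 * whose main thread is homed on that lgroup.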
345 */ 346 347 #define SEGVN_TEXTREPL_MAXBYTES_FACTOR (20) 348 size_t segvn_textrepl_max_bytes_factor = SEGVN_TEXTREPL_MAXBYTES_FACTOR; 349 350 static ulong_t svntr_hashtab_sz = 512; 351 static svntr_bucket_t *svntr_hashtab = NULL; 352 static struct kmem_cache *svntr_cache; 353 static svntr_stats_t *segvn_textrepl_stats; 354 static ksema_t segvn_trasync_sem; 355 356 int segvn_disable_textrepl = 0; 357 size_t textrepl_size_thresh = (size_t)-1; 358 size_t segvn_textrepl_bytes = 0; 359 size_t segvn_textrepl_max_bytes = 0; 360 clock_t segvn_update_textrepl_interval = 0; 361 int segvn_update_tr_time = 10; 362 int segvn_disable_textrepl_update = 0; 363 364 static void segvn_textrepl(struct seg *); 365 static void segvn_textunrepl(struct seg *, int); 366 static void segvn_inval_trcache(vnode_t *); 367 static void segvn_trasync_thread(void); 368 static void segvn_trupdate_wakeup(void *); 369 static void segvn_trupdate(void); 370 static void segvn_trupdate_seg(struct seg *, segvn_data_t *, svntr_t *, 371 ulong_t); 372 373 /* 374 * Initialize segvn data structures 375 */ 376 void 377 segvn_init(void) 378 { 379 uint_t maxszc; 380 uint_t szc; 381 size_t pgsz; 382 383 segvn_cache = kmem_cache_create("segvn_cache", 384 sizeof (struct segvn_data), 0, 385 segvn_cache_constructor, segvn_cache_destructor, NULL, 386 NULL, NULL, 0); 387 388 if (segvn_lpg_disable != 0) 389 return; 390 szc = maxszc = page_num_pagesizes() - 1; 391 if (szc == 0) { 392 segvn_lpg_disable = 1; 393 return; 394 } 395 if (page_get_pagesize(0) != PAGESIZE) { 396 panic("segvn_init: bad szc 0"); 397 /*NOTREACHED*/ 398 } 399 while (szc != 0) { 400 pgsz = page_get_pagesize(szc); 401 if (pgsz <= PAGESIZE || !IS_P2ALIGNED(pgsz, pgsz)) { 402 panic("segvn_init: bad szc %d", szc); 403 /*NOTREACHED*/ 404 } 405 szc--; 406 } 407 if (segvn_maxpgszc == 0 || segvn_maxpgszc > maxszc) 408 segvn_maxpgszc = maxszc; 409 410 if (lgrp_optimizations() && textrepl_size_thresh != (size_t)-1 && 411 !segvn_disable_textrepl) { 412 ulong_t i; 413 size_t hsz = svntr_hashtab_sz * sizeof (svntr_bucket_t); 414 415 svntr_cache = kmem_cache_create("svntr_cache", 416 sizeof (svntr_t), 0, svntr_cache_constructor, NULL, 417 NULL, NULL, NULL, 0); 418 svntr_hashtab = kmem_zalloc(hsz, KM_SLEEP); 419 for (i = 0; i < svntr_hashtab_sz; i++) { 420 mutex_init(&svntr_hashtab[i].tr_lock, NULL, 421 MUTEX_DEFAULT, NULL); 422 } 423 segvn_textrepl_max_bytes = ptob(physmem) / 424 segvn_textrepl_max_bytes_factor; 425 segvn_textrepl_stats = kmem_zalloc(NCPU * 426 sizeof (svntr_stats_t), KM_SLEEP); 427 sema_init(&segvn_trasync_sem, 0, NULL, SEMA_DEFAULT, NULL); 428 (void) thread_create(NULL, 0, segvn_trasync_thread, 429 NULL, 0, &p0, TS_RUN, minclsyspri); 430 } 431 } 432 433 #define SEGVN_PAGEIO ((void *)0x1) 434 #define SEGVN_NOPAGEIO ((void *)0x2) 435 436 static void 437 segvn_setvnode_mpss(vnode_t *vp) 438 { 439 int err; 440 441 ASSERT(vp->v_mpssdata == NULL || 442 vp->v_mpssdata == SEGVN_PAGEIO || 443 vp->v_mpssdata == SEGVN_NOPAGEIO); 444 445 if (vp->v_mpssdata == NULL) { 446 if (vn_vmpss_usepageio(vp)) { 447 err = VOP_PAGEIO(vp, (page_t *)NULL, 448 (u_offset_t)0, 0, 0, CRED()); 449 } else { 450 err = ENOSYS; 451 } 452 /* 453 * set v_mpssdata just once per vnode life 454 * so that it never changes. 
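		 *
		 * Note on the probe above: the zero-length VOP_PAGEIO() is
		 * used purely as a capability check.  A filesystem that
		 * supports pageio is expected to reject the degenerate
		 * request with EINVAL, which is why EINVAL selects
		 * SEGVN_PAGEIO below and any other error (e.g. ENOSYS)
		 * selects SEGVN_NOPAGEIO.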
455 */ 456 mutex_enter(&vp->v_lock); 457 if (vp->v_mpssdata == NULL) { 458 if (err == EINVAL) { 459 vp->v_mpssdata = SEGVN_PAGEIO; 460 } else { 461 vp->v_mpssdata = SEGVN_NOPAGEIO; 462 } 463 } 464 mutex_exit(&vp->v_lock); 465 } 466 } 467 468 int 469 segvn_create(struct seg *seg, void *argsp) 470 { 471 struct segvn_crargs *a = (struct segvn_crargs *)argsp; 472 struct segvn_data *svd; 473 size_t swresv = 0; 474 struct cred *cred; 475 struct anon_map *amp; 476 int error = 0; 477 size_t pgsz; 478 lgrp_mem_policy_t mpolicy = LGRP_MEM_POLICY_DEFAULT; 479 int trok = 0; 480 481 482 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 483 484 if (a->type != MAP_PRIVATE && a->type != MAP_SHARED) { 485 panic("segvn_create type"); 486 /*NOTREACHED*/ 487 } 488 489 /* 490 * Check arguments. If a shared anon structure is given then 491 * it is illegal to also specify a vp. 492 */ 493 if (a->amp != NULL && a->vp != NULL) { 494 panic("segvn_create anon_map"); 495 /*NOTREACHED*/ 496 } 497 498 /* MAP_NORESERVE on a MAP_SHARED segment is meaningless. */ 499 if (a->type == MAP_SHARED) 500 a->flags &= ~MAP_NORESERVE; 501 502 if (a->szc != 0) { 503 if (segvn_lpg_disable != 0 || (a->szc == AS_MAP_NO_LPOOB) || 504 (a->amp != NULL && a->type == MAP_PRIVATE) || 505 (a->flags & MAP_NORESERVE) || seg->s_as == &kas) { 506 a->szc = 0; 507 } else { 508 if (a->szc > segvn_maxpgszc) 509 a->szc = segvn_maxpgszc; 510 pgsz = page_get_pagesize(a->szc); 511 if (!IS_P2ALIGNED(seg->s_base, pgsz) || 512 !IS_P2ALIGNED(seg->s_size, pgsz)) { 513 a->szc = 0; 514 } else if (a->vp != NULL) { 515 extern struct vnode kvp; 516 if (IS_SWAPFSVP(a->vp) || VN_ISKAS(a->vp)) { 517 /* 518 * paranoid check. 519 * hat_page_demote() is not supported 520 * on swapfs pages. 521 */ 522 a->szc = 0; 523 } else if (map_addr_vacalign_check(seg->s_base, 524 a->offset & PAGEMASK)) { 525 a->szc = 0; 526 } 527 } else if (a->amp != NULL) { 528 pgcnt_t anum = btopr(a->offset); 529 pgcnt_t pgcnt = page_get_pagecnt(a->szc); 530 if (!IS_P2ALIGNED(anum, pgcnt)) { 531 a->szc = 0; 532 } 533 } 534 } 535 } 536 537 /* 538 * If segment may need private pages, reserve them now. 539 */ 540 if (!(a->flags & MAP_NORESERVE) && ((a->vp == NULL && a->amp == NULL) || 541 (a->type == MAP_PRIVATE && (a->prot & PROT_WRITE)))) { 542 if (anon_resv(seg->s_size) == 0) 543 return (EAGAIN); 544 swresv = seg->s_size; 545 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 546 seg, swresv, 1); 547 } 548 549 /* 550 * Reserve any mapping structures that may be required. 
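	 * (Note: the anon_resv() above is all-or-nothing; e.g. a 1MB
	 * MAP_PRIVATE mapping of a file with PROT_WRITE reserves a full
	 * 1MB of anon/swap space up front even though pages are only
	 * privatized on first write.  The hat_map() call below merely
	 * lets the HAT pre-allocate mapping resources for the range; no
	 * translations are loaded yet.)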
551 */ 552 hat_map(seg->s_as->a_hat, seg->s_base, seg->s_size, HAT_MAP); 553 554 if (a->cred) { 555 cred = a->cred; 556 crhold(cred); 557 } else { 558 crhold(cred = CRED()); 559 } 560 561 /* Inform the vnode of the new mapping */ 562 if (a->vp != NULL) { 563 error = VOP_ADDMAP(a->vp, a->offset & PAGEMASK, 564 seg->s_as, seg->s_base, seg->s_size, a->prot, 565 a->maxprot, a->type, cred); 566 if (error) { 567 if (swresv != 0) { 568 anon_unresv(swresv); 569 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 570 "anon proc:%p %lu %u", 571 seg, swresv, 0); 572 } 573 crfree(cred); 574 hat_unload(seg->s_as->a_hat, seg->s_base, 575 seg->s_size, HAT_UNLOAD_UNMAP); 576 return (error); 577 } 578 trok = ((a->flags & MAP_TEXT) && 579 (seg->s_size > textrepl_size_thresh || 580 (a->flags & _MAP_TEXTREPL)) && 581 lgrp_optimizations() && svntr_hashtab != NULL && 582 a->type == MAP_PRIVATE && swresv == 0 && 583 !(a->flags & MAP_NORESERVE) && 584 seg->s_as != &kas && a->vp->v_type == VREG); 585 } 586 587 /* 588 * If more than one segment in the address space, and they're adjacent 589 * virtually, try to concatenate them. Don't concatenate if an 590 * explicit anon_map structure was supplied (e.g., SystemV shared 591 * memory) or if we'll use text replication for this segment. 592 */ 593 if (a->amp == NULL && !trok) { 594 struct seg *pseg, *nseg; 595 struct segvn_data *psvd, *nsvd; 596 lgrp_mem_policy_t ppolicy, npolicy; 597 uint_t lgrp_mem_policy_flags = 0; 598 extern lgrp_mem_policy_t lgrp_mem_default_policy; 599 600 /* 601 * Memory policy flags (lgrp_mem_policy_flags) is valid when 602 * extending stack/heap segments. 603 */ 604 if ((a->vp == NULL) && (a->type == MAP_PRIVATE) && 605 !(a->flags & MAP_NORESERVE) && (seg->s_as != &kas)) { 606 lgrp_mem_policy_flags = a->lgrp_mem_policy_flags; 607 } else { 608 /* 609 * Get policy when not extending it from another segment 610 */ 611 mpolicy = lgrp_mem_policy_default(seg->s_size, a->type); 612 } 613 614 /* 615 * First, try to concatenate the previous and new segments 616 */ 617 pseg = AS_SEGPREV(seg->s_as, seg); 618 if (pseg != NULL && 619 pseg->s_base + pseg->s_size == seg->s_base && 620 pseg->s_ops == &segvn_ops) { 621 /* 622 * Get memory allocation policy from previous segment. 623 * When extension is specified (e.g. for heap) apply 624 * this policy to the new segment regardless of the 625 * outcome of segment concatenation. Extension occurs 626 * for non-default policy otherwise default policy is 627 * used and is based on extended segment size. 628 */ 629 psvd = (struct segvn_data *)pseg->s_data; 630 ppolicy = psvd->policy_info.mem_policy; 631 if (lgrp_mem_policy_flags == 632 LGRP_MP_FLAG_EXTEND_UP) { 633 if (ppolicy != lgrp_mem_default_policy) { 634 mpolicy = ppolicy; 635 } else { 636 mpolicy = lgrp_mem_policy_default( 637 pseg->s_size + seg->s_size, 638 a->type); 639 } 640 } 641 642 if (mpolicy == ppolicy && 643 (pseg->s_size + seg->s_size <= 644 segvn_comb_thrshld || psvd->amp == NULL) && 645 segvn_extend_prev(pseg, seg, a, swresv) == 0) { 646 /* 647 * success! 
now try to concatenate 648 * with following seg 649 */ 650 crfree(cred); 651 nseg = AS_SEGNEXT(pseg->s_as, pseg); 652 if (nseg != NULL && 653 nseg != pseg && 654 nseg->s_ops == &segvn_ops && 655 pseg->s_base + pseg->s_size == 656 nseg->s_base) 657 (void) segvn_concat(pseg, nseg, 0); 658 ASSERT(pseg->s_szc == 0 || 659 (a->szc == pseg->s_szc && 660 IS_P2ALIGNED(pseg->s_base, pgsz) && 661 IS_P2ALIGNED(pseg->s_size, pgsz))); 662 return (0); 663 } 664 } 665 666 /* 667 * Failed, so try to concatenate with following seg 668 */ 669 nseg = AS_SEGNEXT(seg->s_as, seg); 670 if (nseg != NULL && 671 seg->s_base + seg->s_size == nseg->s_base && 672 nseg->s_ops == &segvn_ops) { 673 /* 674 * Get memory allocation policy from next segment. 675 * When extension is specified (e.g. for stack) apply 676 * this policy to the new segment regardless of the 677 * outcome of segment concatenation. Extension occurs 678 * for non-default policy otherwise default policy is 679 * used and is based on extended segment size. 680 */ 681 nsvd = (struct segvn_data *)nseg->s_data; 682 npolicy = nsvd->policy_info.mem_policy; 683 if (lgrp_mem_policy_flags == 684 LGRP_MP_FLAG_EXTEND_DOWN) { 685 if (npolicy != lgrp_mem_default_policy) { 686 mpolicy = npolicy; 687 } else { 688 mpolicy = lgrp_mem_policy_default( 689 nseg->s_size + seg->s_size, 690 a->type); 691 } 692 } 693 694 if (mpolicy == npolicy && 695 segvn_extend_next(seg, nseg, a, swresv) == 0) { 696 crfree(cred); 697 ASSERT(nseg->s_szc == 0 || 698 (a->szc == nseg->s_szc && 699 IS_P2ALIGNED(nseg->s_base, pgsz) && 700 IS_P2ALIGNED(nseg->s_size, pgsz))); 701 return (0); 702 } 703 } 704 } 705 706 if (a->vp != NULL) { 707 VN_HOLD(a->vp); 708 if (a->type == MAP_SHARED) 709 lgrp_shm_policy_init(NULL, a->vp); 710 } 711 svd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 712 713 seg->s_ops = &segvn_ops; 714 seg->s_data = (void *)svd; 715 seg->s_szc = a->szc; 716 717 svd->seg = seg; 718 svd->vp = a->vp; 719 /* 720 * Anonymous mappings have no backing file so the offset is meaningless. 721 */ 722 svd->offset = a->vp ? (a->offset & PAGEMASK) : 0; 723 svd->prot = a->prot; 724 svd->maxprot = a->maxprot; 725 svd->pageprot = 0; 726 svd->type = a->type; 727 svd->vpage = NULL; 728 svd->cred = cred; 729 svd->advice = MADV_NORMAL; 730 svd->pageadvice = 0; 731 svd->flags = (ushort_t)a->flags; 732 svd->softlockcnt = 0; 733 if (a->szc != 0 && a->vp != NULL) { 734 segvn_setvnode_mpss(a->vp); 735 } 736 if (svd->type == MAP_SHARED && svd->vp != NULL && 737 (svd->vp->v_flag & VVMEXEC) && (svd->prot & PROT_WRITE)) { 738 ASSERT(vn_is_mapped(svd->vp, V_WRITE)); 739 segvn_inval_trcache(svd->vp); 740 } 741 742 amp = a->amp; 743 if ((svd->amp = amp) == NULL) { 744 svd->anon_index = 0; 745 if (svd->type == MAP_SHARED) { 746 svd->swresv = 0; 747 /* 748 * Shared mappings to a vp need no other setup. 749 * If we have a shared mapping to an anon_map object 750 * which hasn't been allocated yet, allocate the 751 * struct now so that it will be properly shared 752 * by remembering the swap reservation there. 753 */ 754 if (a->vp == NULL) { 755 svd->amp = anonmap_alloc(seg->s_size, swresv, 756 ANON_SLEEP); 757 svd->amp->a_szc = seg->s_szc; 758 } 759 } else { 760 /* 761 * Private mapping (with or without a vp). 762 * Allocate anon_map when needed. 763 */ 764 svd->swresv = swresv; 765 } 766 } else { 767 pgcnt_t anon_num; 768 769 /* 770 * Mapping to an existing anon_map structure without a vp. 771 * For now we will insure that the segment size isn't larger 772 * than the size - offset gives us. 
Later on we may wish to 773 * have the anon array dynamically allocated itself so that 774 * we don't always have to allocate all the anon pointer slots. 775 * This of course involves adding extra code to check that we 776 * aren't trying to use an anon pointer slot beyond the end 777 * of the currently allocated anon array. 778 */ 779 if ((amp->size - a->offset) < seg->s_size) { 780 panic("segvn_create anon_map size"); 781 /*NOTREACHED*/ 782 } 783 784 anon_num = btopr(a->offset); 785 786 if (a->type == MAP_SHARED) { 787 /* 788 * SHARED mapping to a given anon_map. 789 */ 790 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 791 amp->refcnt++; 792 if (a->szc > amp->a_szc) { 793 amp->a_szc = a->szc; 794 } 795 ANON_LOCK_EXIT(&->a_rwlock); 796 svd->anon_index = anon_num; 797 svd->swresv = 0; 798 } else { 799 /* 800 * PRIVATE mapping to a given anon_map. 801 * Make sure that all the needed anon 802 * structures are created (so that we will 803 * share the underlying pages if nothing 804 * is written by this mapping) and then 805 * duplicate the anon array as is done 806 * when a privately mapped segment is dup'ed. 807 */ 808 struct anon *ap; 809 caddr_t addr; 810 caddr_t eaddr; 811 ulong_t anon_idx; 812 int hat_flag = HAT_LOAD; 813 814 if (svd->flags & MAP_TEXT) { 815 hat_flag |= HAT_LOAD_TEXT; 816 } 817 818 svd->amp = anonmap_alloc(seg->s_size, 0, ANON_SLEEP); 819 svd->amp->a_szc = seg->s_szc; 820 svd->anon_index = 0; 821 svd->swresv = swresv; 822 823 /* 824 * Prevent 2 threads from allocating anon 825 * slots simultaneously. 826 */ 827 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 828 eaddr = seg->s_base + seg->s_size; 829 830 for (anon_idx = anon_num, addr = seg->s_base; 831 addr < eaddr; addr += PAGESIZE, anon_idx++) { 832 page_t *pp; 833 834 if ((ap = anon_get_ptr(amp->ahp, 835 anon_idx)) != NULL) 836 continue; 837 838 /* 839 * Allocate the anon struct now. 840 * Might as well load up translation 841 * to the page while we're at it... 842 */ 843 pp = anon_zero(seg, addr, &ap, cred); 844 if (ap == NULL || pp == NULL) { 845 panic("segvn_create anon_zero"); 846 /*NOTREACHED*/ 847 } 848 849 /* 850 * Re-acquire the anon_map lock and 851 * initialize the anon array entry. 852 */ 853 ASSERT(anon_get_ptr(amp->ahp, 854 anon_idx) == NULL); 855 (void) anon_set_ptr(amp->ahp, anon_idx, ap, 856 ANON_SLEEP); 857 858 ASSERT(seg->s_szc == 0); 859 ASSERT(!IS_VMODSORT(pp->p_vnode)); 860 861 hat_memload(seg->s_as->a_hat, addr, pp, 862 svd->prot & ~PROT_WRITE, hat_flag); 863 864 page_unlock(pp); 865 } 866 ASSERT(seg->s_szc == 0); 867 anon_dup(amp->ahp, anon_num, svd->amp->ahp, 868 0, seg->s_size); 869 ANON_LOCK_EXIT(&->a_rwlock); 870 } 871 } 872 873 /* 874 * Set default memory allocation policy for segment 875 * 876 * Always set policy for private memory at least for initialization 877 * even if this is a shared memory segment 878 */ 879 (void) lgrp_privm_policy_set(mpolicy, &svd->policy_info, seg->s_size); 880 881 if (svd->type == MAP_SHARED) 882 (void) lgrp_shm_policy_set(mpolicy, svd->amp, svd->anon_index, 883 svd->vp, svd->offset, seg->s_size); 884 885 ASSERT(!trok || !(svd->prot & PROT_WRITE)); 886 svd->tr_state = trok ? SEGVN_TR_INIT : SEGVN_TR_OFF; 887 888 return (0); 889 } 890 891 /* 892 * Concatenate two existing segments, if possible. 893 * Return 0 on success, -1 if two segments are not compatible 894 * or -2 on memory allocation failure. 
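 * Both failure returns are benign: callers treat segvn_concat() as a
 * best-effort optimization and simply leave the two segments separate
 * when it fails (hence the (void) casts at the call sites).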
895 * If amp_cat == 1 then try and concat segments with anon maps 896 */ 897 static int 898 segvn_concat(struct seg *seg1, struct seg *seg2, int amp_cat) 899 { 900 struct segvn_data *svd1 = seg1->s_data; 901 struct segvn_data *svd2 = seg2->s_data; 902 struct anon_map *amp1 = svd1->amp; 903 struct anon_map *amp2 = svd2->amp; 904 struct vpage *vpage1 = svd1->vpage; 905 struct vpage *vpage2 = svd2->vpage, *nvpage = NULL; 906 size_t size, nvpsize; 907 pgcnt_t npages1, npages2; 908 909 ASSERT(seg1->s_as && seg2->s_as && seg1->s_as == seg2->s_as); 910 ASSERT(AS_WRITE_HELD(seg1->s_as, &seg1->s_as->a_lock)); 911 ASSERT(seg1->s_ops == seg2->s_ops); 912 913 /* both segments exist, try to merge them */ 914 #define incompat(x) (svd1->x != svd2->x) 915 if (incompat(vp) || incompat(maxprot) || 916 (!svd1->pageadvice && !svd2->pageadvice && incompat(advice)) || 917 (!svd1->pageprot && !svd2->pageprot && incompat(prot)) || 918 incompat(type) || incompat(cred) || incompat(flags) || 919 seg1->s_szc != seg2->s_szc || incompat(policy_info.mem_policy) || 920 (svd2->softlockcnt > 0)) 921 return (-1); 922 #undef incompat 923 924 /* 925 * vp == NULL implies zfod, offset doesn't matter 926 */ 927 if (svd1->vp != NULL && 928 svd1->offset + seg1->s_size != svd2->offset) { 929 return (-1); 930 } 931 932 /* 933 * Don't concatenate if either segment uses text replication. 934 */ 935 if (svd1->tr_state != SEGVN_TR_OFF || svd2->tr_state != SEGVN_TR_OFF) { 936 return (-1); 937 } 938 939 /* 940 * Fail early if we're not supposed to concatenate 941 * segments with non NULL amp. 942 */ 943 if (amp_cat == 0 && (amp1 != NULL || amp2 != NULL)) { 944 return (-1); 945 } 946 947 if (svd1->vp == NULL && svd1->type == MAP_SHARED) { 948 if (amp1 != amp2) { 949 return (-1); 950 } 951 if (amp1 != NULL && svd1->anon_index + btop(seg1->s_size) != 952 svd2->anon_index) { 953 return (-1); 954 } 955 ASSERT(amp1 == NULL || amp1->refcnt >= 2); 956 } 957 958 /* 959 * If either seg has vpages, create a new merged vpage array. 960 */ 961 if (vpage1 != NULL || vpage2 != NULL) { 962 struct vpage *vp; 963 964 npages1 = seg_pages(seg1); 965 npages2 = seg_pages(seg2); 966 nvpsize = vpgtob(npages1 + npages2); 967 968 if ((nvpage = kmem_zalloc(nvpsize, KM_NOSLEEP)) == NULL) { 969 return (-2); 970 } 971 if (vpage1 != NULL) { 972 bcopy(vpage1, nvpage, vpgtob(npages1)); 973 } 974 if (vpage2 != NULL) { 975 bcopy(vpage2, nvpage + npages1, vpgtob(npages2)); 976 } 977 for (vp = nvpage; vp < nvpage + npages1; vp++) { 978 if (svd2->pageprot && !svd1->pageprot) { 979 VPP_SETPROT(vp, svd1->prot); 980 } 981 if (svd2->pageadvice && !svd1->pageadvice) { 982 VPP_SETADVICE(vp, svd1->advice); 983 } 984 } 985 for (vp = nvpage + npages1; 986 vp < nvpage + npages1 + npages2; vp++) { 987 if (svd1->pageprot && !svd2->pageprot) { 988 VPP_SETPROT(vp, svd2->prot); 989 } 990 if (svd1->pageadvice && !svd2->pageadvice) { 991 VPP_SETADVICE(vp, svd2->advice); 992 } 993 } 994 } 995 996 /* 997 * If either segment has private pages, create a new merged anon 998 * array. If mergeing shared anon segments just decrement anon map's 999 * refcnt. 
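	 *
	 * For the private case the merged array is laid out as (illustrative):
	 *
	 *	nahp: [ seg1 slots 0 .. btop(seg1->s_size) - 1 |
	 *		seg2 slots 0 .. btop(seg2->s_size) - 1 ]
	 *
	 * seg2's slots are copied in starting at index btop(seg1->s_size),
	 * which is why the merged segment can use anon_index 0.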
 */
	if (amp1 != NULL && svd1->type == MAP_SHARED) {
		ASSERT(amp1 == amp2 && svd1->vp == NULL);
		ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
		ASSERT(amp1->refcnt >= 2);
		amp1->refcnt--;
		ANON_LOCK_EXIT(&amp1->a_rwlock);
		svd2->amp = NULL;
	} else if (amp1 != NULL || amp2 != NULL) {
		struct anon_hdr *nahp;
		struct anon_map *namp = NULL;
		size_t asize;

		ASSERT(svd1->type == MAP_PRIVATE);

		asize = seg1->s_size + seg2->s_size;
		if ((nahp = anon_create(btop(asize), ANON_NOSLEEP)) == NULL) {
			if (nvpage != NULL) {
				kmem_free(nvpage, nvpsize);
			}
			return (-2);
		}
		if (amp1 != NULL) {
			/*
			 * XXX anon rwlock is not really needed because
			 * this is a private segment and we are writers.
			 */
			ANON_LOCK_ENTER(&amp1->a_rwlock, RW_WRITER);
			ASSERT(amp1->refcnt == 1);
			if (anon_copy_ptr(amp1->ahp, svd1->anon_index,
			    nahp, 0, btop(seg1->s_size), ANON_NOSLEEP)) {
				anon_release(nahp, btop(asize));
				ANON_LOCK_EXIT(&amp1->a_rwlock);
				if (nvpage != NULL) {
					kmem_free(nvpage, nvpsize);
				}
				return (-2);
			}
		}
		if (amp2 != NULL) {
			ANON_LOCK_ENTER(&amp2->a_rwlock, RW_WRITER);
			ASSERT(amp2->refcnt == 1);
			if (anon_copy_ptr(amp2->ahp, svd2->anon_index,
			    nahp, btop(seg1->s_size), btop(seg2->s_size),
			    ANON_NOSLEEP)) {
				anon_release(nahp, btop(asize));
				ANON_LOCK_EXIT(&amp2->a_rwlock);
				if (amp1 != NULL) {
					ANON_LOCK_EXIT(&amp1->a_rwlock);
				}
				if (nvpage != NULL) {
					kmem_free(nvpage, nvpsize);
				}
				return (-2);
			}
		}
		if (amp1 != NULL) {
			namp = amp1;
			anon_release(amp1->ahp, btop(amp1->size));
		}
		if (amp2 != NULL) {
			if (namp == NULL) {
				ASSERT(amp1 == NULL);
				namp = amp2;
				anon_release(amp2->ahp, btop(amp2->size));
			} else {
				amp2->refcnt--;
				ANON_LOCK_EXIT(&amp2->a_rwlock);
				anonmap_free(amp2);
			}
			svd2->amp = NULL; /* needed for seg_free */
		}
		namp->ahp = nahp;
		namp->size = asize;
		svd1->amp = namp;
		svd1->anon_index = 0;
		ANON_LOCK_EXIT(&namp->a_rwlock);
	}
	/*
	 * Now free the old vpage structures.
	 */
	if (nvpage != NULL) {
		if (vpage1 != NULL) {
			kmem_free(vpage1, vpgtob(npages1));
		}
		if (vpage2 != NULL) {
			svd2->vpage = NULL;
			kmem_free(vpage2, vpgtob(npages2));
		}
		if (svd2->pageprot) {
			svd1->pageprot = 1;
		}
		if (svd2->pageadvice) {
			svd1->pageadvice = 1;
		}
		svd1->vpage = nvpage;
	}

	/* all looks ok, merge segments */
	svd1->swresv += svd2->swresv;
	svd2->swresv = 0;	/* so seg_free doesn't release swap space */
	size = seg2->s_size;
	seg_free(seg2);
	seg1->s_size += size;
	return (0);
}

/*
 * Extend the previous segment (seg1) to include the
 * new segment (seg2 + a), if possible.
 * Return 0 on success.
 */
static int
segvn_extend_prev(seg1, seg2, a, swresv)
	struct seg *seg1, *seg2;
	struct segvn_crargs *a;
	size_t swresv;
{
	struct segvn_data *svd1 = (struct segvn_data *)seg1->s_data;
	size_t size;
	struct anon_map *amp1;
	struct vpage *new_vpage;

	/*
	 * We don't need any segment level locks for "segvn" data
	 * since the address space is "write" locked.
1126 */ 1127 ASSERT(seg1->s_as && AS_WRITE_HELD(seg1->s_as, &seg1->s_as->a_lock)); 1128 1129 /* second segment is new, try to extend first */ 1130 /* XXX - should also check cred */ 1131 if (svd1->vp != a->vp || svd1->maxprot != a->maxprot || 1132 (!svd1->pageprot && (svd1->prot != a->prot)) || 1133 svd1->type != a->type || svd1->flags != a->flags || 1134 seg1->s_szc != a->szc) 1135 return (-1); 1136 1137 /* vp == NULL implies zfod, offset doesn't matter */ 1138 if (svd1->vp != NULL && 1139 svd1->offset + seg1->s_size != (a->offset & PAGEMASK)) 1140 return (-1); 1141 1142 if (svd1->tr_state != SEGVN_TR_OFF) { 1143 return (-1); 1144 } 1145 1146 amp1 = svd1->amp; 1147 if (amp1) { 1148 pgcnt_t newpgs; 1149 1150 /* 1151 * Segment has private pages, can data structures 1152 * be expanded? 1153 * 1154 * Acquire the anon_map lock to prevent it from changing, 1155 * if it is shared. This ensures that the anon_map 1156 * will not change while a thread which has a read/write 1157 * lock on an address space references it. 1158 * XXX - Don't need the anon_map lock at all if "refcnt" 1159 * is 1. 1160 * 1161 * Can't grow a MAP_SHARED segment with an anonmap because 1162 * there may be existing anon slots where we want to extend 1163 * the segment and we wouldn't know what to do with them 1164 * (e.g., for tmpfs right thing is to just leave them there, 1165 * for /dev/zero they should be cleared out). 1166 */ 1167 if (svd1->type == MAP_SHARED) 1168 return (-1); 1169 1170 ANON_LOCK_ENTER(&1->a_rwlock, RW_WRITER); 1171 if (amp1->refcnt > 1) { 1172 ANON_LOCK_EXIT(&1->a_rwlock); 1173 return (-1); 1174 } 1175 newpgs = anon_grow(amp1->ahp, &svd1->anon_index, 1176 btop(seg1->s_size), btop(seg2->s_size), ANON_NOSLEEP); 1177 1178 if (newpgs == 0) { 1179 ANON_LOCK_EXIT(&1->a_rwlock); 1180 return (-1); 1181 } 1182 amp1->size = ptob(newpgs); 1183 ANON_LOCK_EXIT(&1->a_rwlock); 1184 } 1185 if (svd1->vpage != NULL) { 1186 new_vpage = 1187 kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)), 1188 KM_NOSLEEP); 1189 if (new_vpage == NULL) 1190 return (-1); 1191 bcopy(svd1->vpage, new_vpage, vpgtob(seg_pages(seg1))); 1192 kmem_free(svd1->vpage, vpgtob(seg_pages(seg1))); 1193 svd1->vpage = new_vpage; 1194 if (svd1->pageprot) { 1195 struct vpage *vp, *evp; 1196 1197 vp = new_vpage + seg_pages(seg1); 1198 evp = vp + seg_pages(seg2); 1199 for (; vp < evp; vp++) 1200 VPP_SETPROT(vp, a->prot); 1201 } 1202 } 1203 size = seg2->s_size; 1204 seg_free(seg2); 1205 seg1->s_size += size; 1206 svd1->swresv += swresv; 1207 if (svd1->pageprot && (a->prot & PROT_WRITE) && 1208 svd1->type == MAP_SHARED && svd1->vp != NULL && 1209 (svd1->vp->v_flag & VVMEXEC)) { 1210 ASSERT(vn_is_mapped(svd1->vp, V_WRITE)); 1211 segvn_inval_trcache(svd1->vp); 1212 } 1213 return (0); 1214 } 1215 1216 /* 1217 * Extend the next segment (seg2) to include the 1218 * new segment (seg1 + a), if possible. 1219 * Return 0 on success. 1220 */ 1221 static int 1222 segvn_extend_next( 1223 struct seg *seg1, 1224 struct seg *seg2, 1225 struct segvn_crargs *a, 1226 size_t swresv) 1227 { 1228 struct segvn_data *svd2 = (struct segvn_data *)seg2->s_data; 1229 size_t size; 1230 struct anon_map *amp2; 1231 struct vpage *new_vpage; 1232 1233 /* 1234 * We don't need any segment level locks for "segvn" data 1235 * since the address space is "write" locked. 
1236 */ 1237 ASSERT(seg2->s_as && AS_WRITE_HELD(seg2->s_as, &seg2->s_as->a_lock)); 1238 1239 /* first segment is new, try to extend second */ 1240 /* XXX - should also check cred */ 1241 if (svd2->vp != a->vp || svd2->maxprot != a->maxprot || 1242 (!svd2->pageprot && (svd2->prot != a->prot)) || 1243 svd2->type != a->type || svd2->flags != a->flags || 1244 seg2->s_szc != a->szc) 1245 return (-1); 1246 /* vp == NULL implies zfod, offset doesn't matter */ 1247 if (svd2->vp != NULL && 1248 (a->offset & PAGEMASK) + seg1->s_size != svd2->offset) 1249 return (-1); 1250 1251 if (svd2->tr_state != SEGVN_TR_OFF) { 1252 return (-1); 1253 } 1254 1255 amp2 = svd2->amp; 1256 if (amp2) { 1257 pgcnt_t newpgs; 1258 1259 /* 1260 * Segment has private pages, can data structures 1261 * be expanded? 1262 * 1263 * Acquire the anon_map lock to prevent it from changing, 1264 * if it is shared. This ensures that the anon_map 1265 * will not change while a thread which has a read/write 1266 * lock on an address space references it. 1267 * 1268 * XXX - Don't need the anon_map lock at all if "refcnt" 1269 * is 1. 1270 */ 1271 if (svd2->type == MAP_SHARED) 1272 return (-1); 1273 1274 ANON_LOCK_ENTER(&2->a_rwlock, RW_WRITER); 1275 if (amp2->refcnt > 1) { 1276 ANON_LOCK_EXIT(&2->a_rwlock); 1277 return (-1); 1278 } 1279 newpgs = anon_grow(amp2->ahp, &svd2->anon_index, 1280 btop(seg2->s_size), btop(seg1->s_size), 1281 ANON_NOSLEEP | ANON_GROWDOWN); 1282 1283 if (newpgs == 0) { 1284 ANON_LOCK_EXIT(&2->a_rwlock); 1285 return (-1); 1286 } 1287 amp2->size = ptob(newpgs); 1288 ANON_LOCK_EXIT(&2->a_rwlock); 1289 } 1290 if (svd2->vpage != NULL) { 1291 new_vpage = 1292 kmem_zalloc(vpgtob(seg_pages(seg1) + seg_pages(seg2)), 1293 KM_NOSLEEP); 1294 if (new_vpage == NULL) { 1295 /* Not merging segments so adjust anon_index back */ 1296 if (amp2) 1297 svd2->anon_index += seg_pages(seg1); 1298 return (-1); 1299 } 1300 bcopy(svd2->vpage, new_vpage + seg_pages(seg1), 1301 vpgtob(seg_pages(seg2))); 1302 kmem_free(svd2->vpage, vpgtob(seg_pages(seg2))); 1303 svd2->vpage = new_vpage; 1304 if (svd2->pageprot) { 1305 struct vpage *vp, *evp; 1306 1307 vp = new_vpage; 1308 evp = vp + seg_pages(seg1); 1309 for (; vp < evp; vp++) 1310 VPP_SETPROT(vp, a->prot); 1311 } 1312 } 1313 size = seg1->s_size; 1314 seg_free(seg1); 1315 seg2->s_size += size; 1316 seg2->s_base -= size; 1317 svd2->offset -= size; 1318 svd2->swresv += swresv; 1319 if (svd2->pageprot && (a->prot & PROT_WRITE) && 1320 svd2->type == MAP_SHARED && svd2->vp != NULL && 1321 (svd2->vp->v_flag & VVMEXEC)) { 1322 ASSERT(vn_is_mapped(svd2->vp, V_WRITE)); 1323 segvn_inval_trcache(svd2->vp); 1324 } 1325 return (0); 1326 } 1327 1328 static int 1329 segvn_dup(struct seg *seg, struct seg *newseg) 1330 { 1331 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1332 struct segvn_data *newsvd; 1333 pgcnt_t npages = seg_pages(seg); 1334 int error = 0; 1335 uint_t prot; 1336 size_t len; 1337 struct anon_map *amp; 1338 1339 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 1340 1341 /* 1342 * If segment has anon reserved, reserve more for the new seg. 1343 * For a MAP_NORESERVE segment swresv will be a count of all the 1344 * allocated anon slots; thus we reserve for the child as many slots 1345 * as the parent has allocated. This semantic prevents the child or 1346 * parent from dieing during a copy-on-write fault caused by trying 1347 * to write a shared pre-existing anon page. 
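	 * For example (illustrative numbers only): a 100 page MAP_NORESERVE
	 * parent segment that has touched 10 pages has swresv equal to 10
	 * pages worth of swap, so the child reserves 10 pages here rather
	 * than 100; both processes can then safely take COW faults on those
	 * 10 shared anon pages.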
1348 */ 1349 if ((len = svd->swresv) != 0) { 1350 if (anon_resv(svd->swresv) == 0) 1351 return (ENOMEM); 1352 1353 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 1354 seg, len, 0); 1355 } 1356 1357 newsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 1358 1359 newseg->s_ops = &segvn_ops; 1360 newseg->s_data = (void *)newsvd; 1361 newseg->s_szc = seg->s_szc; 1362 1363 newsvd->seg = newseg; 1364 if ((newsvd->vp = svd->vp) != NULL) { 1365 VN_HOLD(svd->vp); 1366 if (svd->type == MAP_SHARED) 1367 lgrp_shm_policy_init(NULL, svd->vp); 1368 } 1369 newsvd->offset = svd->offset; 1370 newsvd->prot = svd->prot; 1371 newsvd->maxprot = svd->maxprot; 1372 newsvd->pageprot = svd->pageprot; 1373 newsvd->type = svd->type; 1374 newsvd->cred = svd->cred; 1375 crhold(newsvd->cred); 1376 newsvd->advice = svd->advice; 1377 newsvd->pageadvice = svd->pageadvice; 1378 newsvd->swresv = svd->swresv; 1379 newsvd->flags = svd->flags; 1380 newsvd->softlockcnt = 0; 1381 newsvd->policy_info = svd->policy_info; 1382 if ((amp = svd->amp) == NULL || svd->tr_state == SEGVN_TR_ON) { 1383 /* 1384 * Not attaching to a shared anon object. 1385 */ 1386 if (svd->tr_state == SEGVN_TR_ON) { 1387 ASSERT(newsvd->vp != NULL && amp != NULL); 1388 newsvd->tr_state = SEGVN_TR_INIT; 1389 } else { 1390 newsvd->tr_state = svd->tr_state; 1391 } 1392 newsvd->amp = NULL; 1393 newsvd->anon_index = 0; 1394 } else { 1395 ASSERT(svd->tr_state == SEGVN_TR_OFF); 1396 newsvd->tr_state = SEGVN_TR_OFF; 1397 if (svd->type == MAP_SHARED) { 1398 newsvd->amp = amp; 1399 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1400 amp->refcnt++; 1401 ANON_LOCK_EXIT(&->a_rwlock); 1402 newsvd->anon_index = svd->anon_index; 1403 } else { 1404 int reclaim = 1; 1405 1406 /* 1407 * Allocate and initialize new anon_map structure. 1408 */ 1409 newsvd->amp = anonmap_alloc(newseg->s_size, 0, 1410 ANON_SLEEP); 1411 newsvd->amp->a_szc = newseg->s_szc; 1412 newsvd->anon_index = 0; 1413 1414 /* 1415 * We don't have to acquire the anon_map lock 1416 * for the new segment (since it belongs to an 1417 * address space that is still not associated 1418 * with any process), or the segment in the old 1419 * address space (since all threads in it 1420 * are stopped while duplicating the address space). 1421 */ 1422 1423 /* 1424 * The goal of the following code is to make sure that 1425 * softlocked pages do not end up as copy on write 1426 * pages. This would cause problems where one 1427 * thread writes to a page that is COW and a different 1428 * thread in the same process has softlocked it. The 1429 * softlock lock would move away from this process 1430 * because the write would cause this process to get 1431 * a copy (without the softlock). 1432 * 1433 * The strategy here is to just break the 1434 * sharing on pages that could possibly be 1435 * softlocked. 1436 */ 1437 retry: 1438 if (svd->softlockcnt) { 1439 struct anon *ap, *newap; 1440 size_t i; 1441 uint_t vpprot; 1442 page_t *anon_pl[1+1], *pp; 1443 caddr_t addr; 1444 ulong_t anon_idx = 0; 1445 1446 /* 1447 * The softlock count might be non zero 1448 * because some pages are still stuck in the 1449 * cache for lazy reclaim. Flush the cache 1450 * now. This should drop the count to zero. 1451 * [or there is really I/O going on to these 1452 * pages]. Note, we have the writers lock so 1453 * nothing gets inserted during the flush. 
1454 */ 1455 if (reclaim == 1) { 1456 segvn_purge(seg); 1457 reclaim = 0; 1458 goto retry; 1459 } 1460 i = btopr(seg->s_size); 1461 addr = seg->s_base; 1462 /* 1463 * XXX break cow sharing using PAGESIZE 1464 * pages. They will be relocated into larger 1465 * pages at fault time. 1466 */ 1467 while (i-- > 0) { 1468 if (ap = anon_get_ptr(amp->ahp, 1469 anon_idx)) { 1470 error = anon_getpage(&ap, 1471 &vpprot, anon_pl, PAGESIZE, 1472 seg, addr, S_READ, 1473 svd->cred); 1474 if (error) { 1475 newsvd->vpage = NULL; 1476 goto out; 1477 } 1478 /* 1479 * prot need not be computed 1480 * below 'cause anon_private is 1481 * going to ignore it anyway 1482 * as child doesn't inherit 1483 * pagelock from parent. 1484 */ 1485 prot = svd->pageprot ? 1486 VPP_PROT( 1487 &svd->vpage[ 1488 seg_page(seg, addr)]) 1489 : svd->prot; 1490 pp = anon_private(&newap, 1491 newseg, addr, prot, 1492 anon_pl[0], 0, 1493 newsvd->cred); 1494 if (pp == NULL) { 1495 /* no mem abort */ 1496 newsvd->vpage = NULL; 1497 error = ENOMEM; 1498 goto out; 1499 } 1500 (void) anon_set_ptr( 1501 newsvd->amp->ahp, anon_idx, 1502 newap, ANON_SLEEP); 1503 page_unlock(pp); 1504 } 1505 addr += PAGESIZE; 1506 anon_idx++; 1507 } 1508 } else { /* common case */ 1509 if (seg->s_szc != 0) { 1510 /* 1511 * If at least one of anon slots of a 1512 * large page exists then make sure 1513 * all anon slots of a large page 1514 * exist to avoid partial cow sharing 1515 * of a large page in the future. 1516 */ 1517 anon_dup_fill_holes(amp->ahp, 1518 svd->anon_index, newsvd->amp->ahp, 1519 0, seg->s_size, seg->s_szc, 1520 svd->vp != NULL); 1521 } else { 1522 anon_dup(amp->ahp, svd->anon_index, 1523 newsvd->amp->ahp, 0, seg->s_size); 1524 } 1525 1526 hat_clrattr(seg->s_as->a_hat, seg->s_base, 1527 seg->s_size, PROT_WRITE); 1528 } 1529 } 1530 } 1531 /* 1532 * If necessary, create a vpage structure for the new segment. 1533 * Do not copy any page lock indications. 
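	 * (Page lock (PPLOCK) bits are deliberately not inherited; memory
	 * locks set up via mlock()/memcntl() apply only to the parent's
	 * address space and do not carry across fork().)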
1534 */ 1535 if (svd->vpage != NULL) { 1536 uint_t i; 1537 struct vpage *ovp = svd->vpage; 1538 struct vpage *nvp; 1539 1540 nvp = newsvd->vpage = 1541 kmem_alloc(vpgtob(npages), KM_SLEEP); 1542 for (i = 0; i < npages; i++) { 1543 *nvp = *ovp++; 1544 VPP_CLRPPLOCK(nvp++); 1545 } 1546 } else 1547 newsvd->vpage = NULL; 1548 1549 /* Inform the vnode of the new mapping */ 1550 if (newsvd->vp != NULL) { 1551 error = VOP_ADDMAP(newsvd->vp, (offset_t)newsvd->offset, 1552 newseg->s_as, newseg->s_base, newseg->s_size, newsvd->prot, 1553 newsvd->maxprot, newsvd->type, newsvd->cred); 1554 } 1555 out: 1556 return (error); 1557 } 1558 1559 1560 /* 1561 * callback function used by segvn_unmap to invoke free_vp_pages() for only 1562 * those pages actually processed by the HAT 1563 */ 1564 extern int free_pages; 1565 1566 static void 1567 segvn_hat_unload_callback(hat_callback_t *cb) 1568 { 1569 struct seg *seg = cb->hcb_data; 1570 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1571 size_t len; 1572 u_offset_t off; 1573 1574 ASSERT(svd->vp != NULL); 1575 ASSERT(cb->hcb_end_addr > cb->hcb_start_addr); 1576 ASSERT(cb->hcb_start_addr >= seg->s_base); 1577 1578 len = cb->hcb_end_addr - cb->hcb_start_addr; 1579 off = cb->hcb_start_addr - seg->s_base; 1580 free_vp_pages(svd->vp, svd->offset + off, len); 1581 } 1582 1583 1584 static int 1585 segvn_unmap(struct seg *seg, caddr_t addr, size_t len) 1586 { 1587 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 1588 struct segvn_data *nsvd; 1589 struct seg *nseg; 1590 struct anon_map *amp; 1591 pgcnt_t opages; /* old segment size in pages */ 1592 pgcnt_t npages; /* new segment size in pages */ 1593 pgcnt_t dpages; /* pages being deleted (unmapped) */ 1594 hat_callback_t callback; /* used for free_vp_pages() */ 1595 hat_callback_t *cbp = NULL; 1596 caddr_t nbase; 1597 size_t nsize; 1598 size_t oswresv; 1599 int reclaim = 1; 1600 int unmap = 1; 1601 1602 /* 1603 * We don't need any segment level locks for "segvn" data 1604 * since the address space is "write" locked. 1605 */ 1606 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 1607 1608 /* 1609 * Fail the unmap if pages are SOFTLOCKed through this mapping. 1610 * softlockcnt is protected from change by the as write lock. 1611 */ 1612 retry: 1613 if (svd->softlockcnt > 0) { 1614 ASSERT(svd->tr_state == SEGVN_TR_OFF); 1615 /* 1616 * since we do have the writers lock nobody can fill 1617 * the cache during the purge. The flush either succeeds 1618 * or we still have pending I/Os. 
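		 * The pattern here is: purge the pagelock cache once and
		 * retry; if softlockcnt is still nonzero after the purge,
		 * genuine SOFTLOCK'ed I/O is in flight and the unmap fails
		 * with EAGAIN so the caller can try again later.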
1619 */ 1620 if (reclaim == 1) { 1621 segvn_purge(seg); 1622 reclaim = 0; 1623 goto retry; 1624 } 1625 return (EAGAIN); 1626 } 1627 1628 /* 1629 * Check for bad sizes 1630 */ 1631 if (addr < seg->s_base || addr + len > seg->s_base + seg->s_size || 1632 (len & PAGEOFFSET) || ((uintptr_t)addr & PAGEOFFSET)) { 1633 panic("segvn_unmap"); 1634 /*NOTREACHED*/ 1635 } 1636 1637 if (seg->s_szc != 0) { 1638 size_t pgsz = page_get_pagesize(seg->s_szc); 1639 int err; 1640 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) { 1641 ASSERT(seg->s_base != addr || seg->s_size != len); 1642 if (svd->tr_state == SEGVN_TR_INIT) { 1643 svd->tr_state = SEGVN_TR_OFF; 1644 } else if (svd->tr_state == SEGVN_TR_ON) { 1645 ASSERT(svd->amp != NULL); 1646 segvn_textunrepl(seg, 1); 1647 ASSERT(svd->amp == NULL); 1648 ASSERT(svd->tr_state == SEGVN_TR_OFF); 1649 } 1650 VM_STAT_ADD(segvnvmstats.demoterange[0]); 1651 err = segvn_demote_range(seg, addr, len, SDR_END, 0); 1652 if (err == 0) { 1653 return (IE_RETRY); 1654 } 1655 return (err); 1656 } 1657 } 1658 1659 /* Inform the vnode of the unmapping. */ 1660 if (svd->vp) { 1661 int error; 1662 1663 error = VOP_DELMAP(svd->vp, 1664 (offset_t)svd->offset + (uintptr_t)(addr - seg->s_base), 1665 seg->s_as, addr, len, svd->prot, svd->maxprot, 1666 svd->type, svd->cred); 1667 1668 if (error == EAGAIN) 1669 return (error); 1670 } 1671 1672 if (svd->tr_state == SEGVN_TR_INIT) { 1673 svd->tr_state = SEGVN_TR_OFF; 1674 } else if (svd->tr_state == SEGVN_TR_ON) { 1675 ASSERT(svd->amp != NULL); 1676 ASSERT(svd->pageprot == 0 && !(svd->prot & PROT_WRITE)); 1677 segvn_textunrepl(seg, 1); 1678 ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF); 1679 unmap = 0; 1680 } 1681 1682 /* 1683 * Remove any page locks set through this mapping. 1684 */ 1685 (void) segvn_lockop(seg, addr, len, 0, MC_UNLOCK, NULL, 0); 1686 1687 if (unmap) { 1688 /* 1689 * Unload any hardware translations in the range to be taken 1690 * out. Use a callback to invoke free_vp_pages() effectively. 1691 */ 1692 if (svd->vp != NULL && free_pages != 0) { 1693 callback.hcb_data = seg; 1694 callback.hcb_function = segvn_hat_unload_callback; 1695 cbp = &callback; 1696 } 1697 hat_unload_callback(seg->s_as->a_hat, addr, len, 1698 HAT_UNLOAD_UNMAP, cbp); 1699 1700 if (svd->type == MAP_SHARED && svd->vp != NULL && 1701 (svd->vp->v_flag & VVMEXEC) && 1702 ((svd->prot & PROT_WRITE) || svd->pageprot)) { 1703 segvn_inval_trcache(svd->vp); 1704 } 1705 } 1706 1707 /* 1708 * Check for entire segment 1709 */ 1710 if (addr == seg->s_base && len == seg->s_size) { 1711 seg_free(seg); 1712 return (0); 1713 } 1714 1715 opages = seg_pages(seg); 1716 dpages = btop(len); 1717 npages = opages - dpages; 1718 amp = svd->amp; 1719 ASSERT(amp == NULL || amp->a_szc >= seg->s_szc); 1720 1721 /* 1722 * Check for beginning of segment 1723 */ 1724 if (addr == seg->s_base) { 1725 if (svd->vpage != NULL) { 1726 size_t nbytes; 1727 struct vpage *ovpage; 1728 1729 ovpage = svd->vpage; /* keep pointer to vpage */ 1730 1731 nbytes = vpgtob(npages); 1732 svd->vpage = kmem_alloc(nbytes, KM_SLEEP); 1733 bcopy(&ovpage[dpages], svd->vpage, nbytes); 1734 1735 /* free up old vpage */ 1736 kmem_free(ovpage, vpgtob(opages)); 1737 } 1738 if (amp != NULL) { 1739 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1740 if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { 1741 /* 1742 * Free up now unused parts of anon_map array. 
1743 */ 1744 if (amp->a_szc == seg->s_szc) { 1745 if (seg->s_szc != 0) { 1746 anon_free_pages(amp->ahp, 1747 svd->anon_index, len, 1748 seg->s_szc); 1749 } else { 1750 anon_free(amp->ahp, 1751 svd->anon_index, 1752 len); 1753 } 1754 } else { 1755 ASSERT(svd->type == MAP_SHARED); 1756 ASSERT(amp->a_szc > seg->s_szc); 1757 anon_shmap_free_pages(amp, 1758 svd->anon_index, len); 1759 } 1760 1761 /* 1762 * Unreserve swap space for the 1763 * unmapped chunk of this segment in 1764 * case it's MAP_SHARED 1765 */ 1766 if (svd->type == MAP_SHARED) { 1767 anon_unresv(len); 1768 amp->swresv -= len; 1769 } 1770 } 1771 ANON_LOCK_EXIT(&->a_rwlock); 1772 svd->anon_index += dpages; 1773 } 1774 if (svd->vp != NULL) 1775 svd->offset += len; 1776 1777 if (svd->swresv) { 1778 if (svd->flags & MAP_NORESERVE) { 1779 ASSERT(amp); 1780 oswresv = svd->swresv; 1781 1782 svd->swresv = ptob(anon_pages(amp->ahp, 1783 svd->anon_index, npages)); 1784 anon_unresv(oswresv - svd->swresv); 1785 } else { 1786 anon_unresv(len); 1787 svd->swresv -= len; 1788 } 1789 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 1790 seg, len, 0); 1791 } 1792 1793 seg->s_base += len; 1794 seg->s_size -= len; 1795 return (0); 1796 } 1797 1798 /* 1799 * Check for end of segment 1800 */ 1801 if (addr + len == seg->s_base + seg->s_size) { 1802 if (svd->vpage != NULL) { 1803 size_t nbytes; 1804 struct vpage *ovpage; 1805 1806 ovpage = svd->vpage; /* keep pointer to vpage */ 1807 1808 nbytes = vpgtob(npages); 1809 svd->vpage = kmem_alloc(nbytes, KM_SLEEP); 1810 bcopy(ovpage, svd->vpage, nbytes); 1811 1812 /* free up old vpage */ 1813 kmem_free(ovpage, vpgtob(opages)); 1814 1815 } 1816 if (amp != NULL) { 1817 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1818 if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { 1819 /* 1820 * Free up now unused parts of anon_map array. 1821 */ 1822 ulong_t an_idx = svd->anon_index + npages; 1823 if (amp->a_szc == seg->s_szc) { 1824 if (seg->s_szc != 0) { 1825 anon_free_pages(amp->ahp, 1826 an_idx, len, 1827 seg->s_szc); 1828 } else { 1829 anon_free(amp->ahp, an_idx, 1830 len); 1831 } 1832 } else { 1833 ASSERT(svd->type == MAP_SHARED); 1834 ASSERT(amp->a_szc > seg->s_szc); 1835 anon_shmap_free_pages(amp, 1836 an_idx, len); 1837 } 1838 1839 /* 1840 * Unreserve swap space for the 1841 * unmapped chunk of this segment in 1842 * case it's MAP_SHARED 1843 */ 1844 if (svd->type == MAP_SHARED) { 1845 anon_unresv(len); 1846 amp->swresv -= len; 1847 } 1848 } 1849 ANON_LOCK_EXIT(&->a_rwlock); 1850 } 1851 1852 if (svd->swresv) { 1853 if (svd->flags & MAP_NORESERVE) { 1854 ASSERT(amp); 1855 oswresv = svd->swresv; 1856 svd->swresv = ptob(anon_pages(amp->ahp, 1857 svd->anon_index, npages)); 1858 anon_unresv(oswresv - svd->swresv); 1859 } else { 1860 anon_unresv(len); 1861 svd->swresv -= len; 1862 } 1863 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 1864 "anon proc:%p %lu %u", seg, len, 0); 1865 } 1866 1867 seg->s_size -= len; 1868 return (0); 1869 } 1870 1871 /* 1872 * The section to go is in the middle of the segment, 1873 * have to make it into two segments. nseg is made for 1874 * the high end while seg is cut down at the low end. 
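	 *
	 * Before:  [ s_base ................................ s_base+s_size )
	 * After:   [ s_base .. addr )   <unmapped>   [ addr+len .. old end )
	 *              seg (shrunk)                        nseg (new)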
1875 */ 1876 nbase = addr + len; /* new seg base */ 1877 nsize = (seg->s_base + seg->s_size) - nbase; /* new seg size */ 1878 seg->s_size = addr - seg->s_base; /* shrink old seg */ 1879 nseg = seg_alloc(seg->s_as, nbase, nsize); 1880 if (nseg == NULL) { 1881 panic("segvn_unmap seg_alloc"); 1882 /*NOTREACHED*/ 1883 } 1884 nseg->s_ops = seg->s_ops; 1885 nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 1886 nseg->s_data = (void *)nsvd; 1887 nseg->s_szc = seg->s_szc; 1888 *nsvd = *svd; 1889 nsvd->seg = nseg; 1890 nsvd->offset = svd->offset + (uintptr_t)(nseg->s_base - seg->s_base); 1891 nsvd->swresv = 0; 1892 nsvd->softlockcnt = 0; 1893 1894 if (svd->vp != NULL) { 1895 VN_HOLD(nsvd->vp); 1896 if (nsvd->type == MAP_SHARED) 1897 lgrp_shm_policy_init(NULL, nsvd->vp); 1898 } 1899 crhold(svd->cred); 1900 1901 if (svd->vpage == NULL) { 1902 nsvd->vpage = NULL; 1903 } else { 1904 /* need to split vpage into two arrays */ 1905 size_t nbytes; 1906 struct vpage *ovpage; 1907 1908 ovpage = svd->vpage; /* keep pointer to vpage */ 1909 1910 npages = seg_pages(seg); /* seg has shrunk */ 1911 nbytes = vpgtob(npages); 1912 svd->vpage = kmem_alloc(nbytes, KM_SLEEP); 1913 1914 bcopy(ovpage, svd->vpage, nbytes); 1915 1916 npages = seg_pages(nseg); 1917 nbytes = vpgtob(npages); 1918 nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP); 1919 1920 bcopy(&ovpage[opages - npages], nsvd->vpage, nbytes); 1921 1922 /* free up old vpage */ 1923 kmem_free(ovpage, vpgtob(opages)); 1924 } 1925 1926 if (amp == NULL) { 1927 nsvd->amp = NULL; 1928 nsvd->anon_index = 0; 1929 } else { 1930 /* 1931 * Need to create a new anon map for the new segment. 1932 * We'll also allocate a new smaller array for the old 1933 * smaller segment to save space. 1934 */ 1935 opages = btop((uintptr_t)(addr - seg->s_base)); 1936 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 1937 if (amp->refcnt == 1 || svd->type == MAP_PRIVATE) { 1938 /* 1939 * Free up now unused parts of anon_map array. 
1940 */ 1941 ulong_t an_idx = svd->anon_index + opages; 1942 if (amp->a_szc == seg->s_szc) { 1943 if (seg->s_szc != 0) { 1944 anon_free_pages(amp->ahp, an_idx, len, 1945 seg->s_szc); 1946 } else { 1947 anon_free(amp->ahp, an_idx, 1948 len); 1949 } 1950 } else { 1951 ASSERT(svd->type == MAP_SHARED); 1952 ASSERT(amp->a_szc > seg->s_szc); 1953 anon_shmap_free_pages(amp, an_idx, len); 1954 } 1955 1956 /* 1957 * Unreserve swap space for the 1958 * unmapped chunk of this segment in 1959 * case it's MAP_SHARED 1960 */ 1961 if (svd->type == MAP_SHARED) { 1962 anon_unresv(len); 1963 amp->swresv -= len; 1964 } 1965 } 1966 nsvd->anon_index = svd->anon_index + 1967 btop((uintptr_t)(nseg->s_base - seg->s_base)); 1968 if (svd->type == MAP_SHARED) { 1969 amp->refcnt++; 1970 nsvd->amp = amp; 1971 } else { 1972 struct anon_map *namp; 1973 struct anon_hdr *nahp; 1974 1975 ASSERT(svd->type == MAP_PRIVATE); 1976 nahp = anon_create(btop(seg->s_size), ANON_SLEEP); 1977 namp = anonmap_alloc(nseg->s_size, 0, ANON_SLEEP); 1978 namp->a_szc = seg->s_szc; 1979 (void) anon_copy_ptr(amp->ahp, svd->anon_index, nahp, 1980 0, btop(seg->s_size), ANON_SLEEP); 1981 (void) anon_copy_ptr(amp->ahp, nsvd->anon_index, 1982 namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP); 1983 anon_release(amp->ahp, btop(amp->size)); 1984 svd->anon_index = 0; 1985 nsvd->anon_index = 0; 1986 amp->ahp = nahp; 1987 amp->size = seg->s_size; 1988 nsvd->amp = namp; 1989 } 1990 ANON_LOCK_EXIT(&->a_rwlock); 1991 } 1992 if (svd->swresv) { 1993 if (svd->flags & MAP_NORESERVE) { 1994 ASSERT(amp); 1995 oswresv = svd->swresv; 1996 svd->swresv = ptob(anon_pages(amp->ahp, 1997 svd->anon_index, btop(seg->s_size))); 1998 nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp, 1999 nsvd->anon_index, btop(nseg->s_size))); 2000 ASSERT(oswresv >= (svd->swresv + nsvd->swresv)); 2001 anon_unresv(oswresv - (svd->swresv + nsvd->swresv)); 2002 } else { 2003 if (seg->s_size + nseg->s_size + len != svd->swresv) { 2004 panic("segvn_unmap: " 2005 "cannot split swap reservation"); 2006 /*NOTREACHED*/ 2007 } 2008 anon_unresv(len); 2009 svd->swresv = seg->s_size; 2010 nsvd->swresv = nseg->s_size; 2011 } 2012 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 2013 seg, len, 0); 2014 } 2015 2016 return (0); /* I'm glad that's all over with! */ 2017 } 2018 2019 static void 2020 segvn_free(struct seg *seg) 2021 { 2022 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 2023 pgcnt_t npages = seg_pages(seg); 2024 struct anon_map *amp; 2025 size_t len; 2026 2027 /* 2028 * We don't need any segment level locks for "segvn" data 2029 * since the address space is "write" locked. 2030 */ 2031 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 2032 ASSERT(svd->tr_state == SEGVN_TR_OFF); 2033 2034 /* 2035 * Be sure to unlock pages. XXX Why do things get free'ed instead 2036 * of unmapped? XXX 2037 */ 2038 (void) segvn_lockop(seg, seg->s_base, seg->s_size, 2039 0, MC_UNLOCK, NULL, 0); 2040 2041 /* 2042 * Deallocate the vpage and anon pointers if necessary and possible. 2043 */ 2044 if (svd->vpage != NULL) { 2045 kmem_free(svd->vpage, vpgtob(npages)); 2046 svd->vpage = NULL; 2047 } 2048 if ((amp = svd->amp) != NULL) { 2049 /* 2050 * If there are no more references to this anon_map 2051 * structure, then deallocate the structure after freeing 2052 * up all the anon slot pointers that we can. 
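 *
 * Three cases follow.  If the refcnt drops to zero the amp itself is
 * freed with anonmap_free(): for MAP_PRIVATE only the slots this
 * segment was using are released, while for MAP_SHARED the whole amp's
 * worth of anon slots is freed and its swap reservation returned.
 * Otherwise, for a still-referenced MAP_PRIVATE amp, just our slot
 * pointers are released.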
2053 */ 2054 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER); 2055 ASSERT(amp->a_szc >= seg->s_szc); 2056 if (--amp->refcnt == 0) { 2057 if (svd->type == MAP_PRIVATE) { 2058 /* 2059 * Private - we only need to anon_free 2060 * the part that this segment refers to. 2061 */ 2062 if (seg->s_szc != 0) { 2063 anon_free_pages(amp->ahp, 2064 svd->anon_index, seg->s_size, 2065 seg->s_szc); 2066 } else { 2067 anon_free(amp->ahp, svd->anon_index, 2068 seg->s_size); 2069 } 2070 } else { 2071 /* 2072 * Shared - anon_free the entire 2073 * anon_map's worth of stuff and 2074 * release any swap reservation. 2075 */ 2076 if (amp->a_szc != 0) { 2077 anon_shmap_free_pages(amp, 0, 2078 amp->size); 2079 } else { 2080 anon_free(amp->ahp, 0, amp->size); 2081 } 2082 if ((len = amp->swresv) != 0) { 2083 anon_unresv(len); 2084 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 2085 "anon proc:%p %lu %u", 2086 seg, len, 0); 2087 } 2088 } 2089 svd->amp = NULL; 2090 ANON_LOCK_EXIT(&amp->a_rwlock); 2091 anonmap_free(amp); 2092 } else if (svd->type == MAP_PRIVATE) { 2093 /* 2094 * We had a private mapping which still has 2095 * a held anon_map so just free up all the 2096 * anon slot pointers that we were using. 2097 */ 2098 if (seg->s_szc != 0) { 2099 anon_free_pages(amp->ahp, svd->anon_index, 2100 seg->s_size, seg->s_szc); 2101 } else { 2102 anon_free(amp->ahp, svd->anon_index, 2103 seg->s_size); 2104 } 2105 ANON_LOCK_EXIT(&amp->a_rwlock); 2106 } else { 2107 ANON_LOCK_EXIT(&amp->a_rwlock); 2108 } 2109 } 2110 2111 /* 2112 * Release swap reservation. 2113 */ 2114 if ((len = svd->swresv) != 0) { 2115 anon_unresv(svd->swresv); 2116 TRACE_3(TR_FAC_VM, TR_ANON_PROC, "anon proc:%p %lu %u", 2117 seg, len, 0); 2118 svd->swresv = 0; 2119 } 2120 /* 2121 * Release claim on vnode, credentials, and finally free the 2122 * private data. 2123 */ 2124 if (svd->vp != NULL) { 2125 if (svd->type == MAP_SHARED) 2126 lgrp_shm_policy_fini(NULL, svd->vp); 2127 VN_RELE(svd->vp); 2128 svd->vp = NULL; 2129 } 2130 crfree(svd->cred); 2131 svd->cred = NULL; 2132 2133 seg->s_data = NULL; 2134 kmem_cache_free(segvn_cache, svd); 2135 } 2136 2137 ulong_t segvn_lpglck_limit = 0; 2138 /* 2139 * Support routines used by segvn_pagelock() and softlock faults for anonymous 2140 * pages to implement availrmem accounting in a way that makes sure the 2141 * same memory is accounted just once for all softlock/pagelock purposes. 2142 * This prevents a bug where availrmem is quickly and incorrectly exhausted by 2143 * several pagelocks to different parts of the same large page since each 2144 * pagelock has to decrement availrmem by the size of the entire large 2145 * page. Note those pages are not COW shared until softunlock/pageunlock so 2146 * we don't need to use cow style accounting here. We also need to make sure 2147 * the entire large page is accounted even if the softlock range is less than the 2148 * entire large page because large anon pages can't be demoted when any of 2149 * the constituent pages is locked. The caller calls this routine for every page_t 2150 * it locks. The very first page in the range may not be the root page of a 2151 * large page. For all other pages it's guaranteed we are going to visit the 2152 * root of a particular large page before any other constituent page as we are 2153 * locking sequential pages belonging to the same anon map. So we do all the 2154 * locking when the root is encountered except for the very first page.
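 *
 * For example, if page_get_pagecnt(p_szc) were 512, the first lock that
 * reaches the large page's root charges availrmem for all 512
 * constituent pages and sets the root's p_slckcnt to 1; later locks of
 * other constituents of the same large page only bump p_slckcnt, and
 * availrmem is credited back once p_slckcnt drops to zero on unlock.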
Since 2155 * softlocking is not supported (except S_READ_NOCOW special case) for vmpss 2156 * segments and since vnode pages can be demoted without locking all 2157 * constituent pages vnode pages don't come here. Unlocking relies on the 2158 * fact that pagesize can't change whenever any of constituent large pages is 2159 * locked at least SE_SHARED. This allows unlocking code to find the right 2160 * root and decrement availrmem by the same amount it was incremented when the 2161 * page was locked. 2162 */ 2163 static int 2164 segvn_pp_lock_anonpages(page_t *pp, int first) 2165 { 2166 pgcnt_t pages; 2167 pfn_t pfn; 2168 uchar_t szc = pp->p_szc; 2169 2170 ASSERT(PAGE_LOCKED(pp)); 2171 ASSERT(pp->p_vnode != NULL); 2172 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 2173 2174 /* 2175 * pagesize won't change as long as any constituent page is locked. 2176 */ 2177 pages = page_get_pagecnt(pp->p_szc); 2178 pfn = page_pptonum(pp); 2179 2180 if (!first) { 2181 if (!IS_P2ALIGNED(pfn, pages)) { 2182 #ifdef DEBUG 2183 pp = &pp[-(spgcnt_t)(pfn & (pages - 1))]; 2184 pfn = page_pptonum(pp); 2185 ASSERT(IS_P2ALIGNED(pfn, pages)); 2186 ASSERT(pp->p_szc == szc); 2187 ASSERT(pp->p_vnode != NULL); 2188 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 2189 ASSERT(pp->p_slckcnt != 0); 2190 #endif /* DEBUG */ 2191 return (1); 2192 } 2193 } else if (!IS_P2ALIGNED(pfn, pages)) { 2194 pp = &pp[-(spgcnt_t)(pfn & (pages - 1))]; 2195 #ifdef DEBUG 2196 pfn = page_pptonum(pp); 2197 ASSERT(IS_P2ALIGNED(pfn, pages)); 2198 ASSERT(pp->p_szc == szc); 2199 ASSERT(pp->p_vnode != NULL); 2200 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 2201 #endif /* DEBUG */ 2202 } 2203 2204 /* 2205 * pp is a root page. 2206 * We haven't locked this large page yet. 2207 */ 2208 page_struct_lock(pp); 2209 if (pp->p_slckcnt != 0) { 2210 if (pp->p_slckcnt < PAGE_SLOCK_MAXIMUM) { 2211 pp->p_slckcnt++; 2212 page_struct_unlock(pp); 2213 return (1); 2214 } 2215 page_struct_unlock(pp); 2216 segvn_lpglck_limit++; 2217 return (0); 2218 } 2219 mutex_enter(&freemem_lock); 2220 if (availrmem < tune.t_minarmem + pages) { 2221 mutex_exit(&freemem_lock); 2222 page_struct_unlock(pp); 2223 return (0); 2224 } 2225 pp->p_slckcnt++; 2226 availrmem -= pages; 2227 mutex_exit(&freemem_lock); 2228 page_struct_unlock(pp); 2229 return (1); 2230 } 2231 2232 static void 2233 segvn_pp_unlock_anonpages(page_t *pp, int first) 2234 { 2235 pgcnt_t pages; 2236 pfn_t pfn; 2237 2238 ASSERT(PAGE_LOCKED(pp)); 2239 ASSERT(pp->p_vnode != NULL); 2240 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 2241 2242 /* 2243 * pagesize won't change as long as any constituent page is locked. 2244 */ 2245 pages = page_get_pagecnt(pp->p_szc); 2246 pfn = page_pptonum(pp); 2247 2248 if (!first) { 2249 if (!IS_P2ALIGNED(pfn, pages)) { 2250 return; 2251 } 2252 } else if (!IS_P2ALIGNED(pfn, pages)) { 2253 pp = &pp[-(spgcnt_t)(pfn & (pages - 1))]; 2254 #ifdef DEBUG 2255 pfn = page_pptonum(pp); 2256 ASSERT(IS_P2ALIGNED(pfn, pages)); 2257 #endif /* DEBUG */ 2258 } 2259 ASSERT(pp->p_vnode != NULL); 2260 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 2261 ASSERT(pp->p_slckcnt != 0); 2262 page_struct_lock(pp); 2263 if (--pp->p_slckcnt == 0) { 2264 mutex_enter(&freemem_lock); 2265 availrmem += pages; 2266 mutex_exit(&freemem_lock); 2267 } 2268 page_struct_unlock(pp); 2269 } 2270 2271 /* 2272 * Do a F_SOFTUNLOCK call over the range requested. The range must have 2273 * already been F_SOFTLOCK'ed. 2274 * Caller must always match addr and len of a softunlock with a previous 2275 * softlock with exactly the same addr and len. 
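 *
 * In outline, the routine below drops the hat lock on the whole range
 * and then walks it a page at a time: each address is resolved to a
 * (vnode, offset) pair, going through the anon map when one is present,
 * the page is looked up with page_find() (it is known to be held),
 * ref/mod bits are updated according to the access type, and the page
 * is unlocked.  Afterwards availrmem (for vnode-backed pages),
 * segvn_pages_locked and softlockcnt are adjusted, and waiting
 * unmappers are woken once softlockcnt reaches zero.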
2276 */ 2277 static void 2278 segvn_softunlock(struct seg *seg, caddr_t addr, size_t len, enum seg_rw rw) 2279 { 2280 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 2281 page_t *pp; 2282 caddr_t adr; 2283 struct vnode *vp; 2284 u_offset_t offset; 2285 ulong_t anon_index; 2286 struct anon_map *amp; 2287 struct anon *ap = NULL; 2288 2289 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 2290 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 2291 2292 if ((amp = svd->amp) != NULL) 2293 anon_index = svd->anon_index + seg_page(seg, addr); 2294 2295 hat_unlock(seg->s_as->a_hat, addr, len); 2296 for (adr = addr; adr < addr + len; adr += PAGESIZE) { 2297 if (amp != NULL) { 2298 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 2299 if ((ap = anon_get_ptr(amp->ahp, anon_index++)) 2300 != NULL) { 2301 swap_xlate(ap, &vp, &offset); 2302 } else { 2303 vp = svd->vp; 2304 offset = svd->offset + 2305 (uintptr_t)(adr - seg->s_base); 2306 } 2307 ANON_LOCK_EXIT(&->a_rwlock); 2308 } else { 2309 vp = svd->vp; 2310 offset = svd->offset + 2311 (uintptr_t)(adr - seg->s_base); 2312 } 2313 2314 /* 2315 * Use page_find() instead of page_lookup() to 2316 * find the page since we know that it is locked. 2317 */ 2318 pp = page_find(vp, offset); 2319 if (pp == NULL) { 2320 panic( 2321 "segvn_softunlock: addr %p, ap %p, vp %p, off %llx", 2322 (void *)adr, (void *)ap, (void *)vp, offset); 2323 /*NOTREACHED*/ 2324 } 2325 2326 if (rw == S_WRITE) { 2327 hat_setrefmod(pp); 2328 if (seg->s_as->a_vbits) 2329 hat_setstat(seg->s_as, adr, PAGESIZE, 2330 P_REF | P_MOD); 2331 } else if (rw != S_OTHER) { 2332 hat_setref(pp); 2333 if (seg->s_as->a_vbits) 2334 hat_setstat(seg->s_as, adr, PAGESIZE, P_REF); 2335 } 2336 TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT, 2337 "segvn_fault:pp %p vp %p offset %llx", pp, vp, offset); 2338 if (svd->vp == NULL) { 2339 segvn_pp_unlock_anonpages(pp, adr == addr); 2340 } 2341 page_unlock(pp); 2342 } 2343 mutex_enter(&freemem_lock); /* for availrmem */ 2344 if (svd->vp != NULL) { 2345 availrmem += btop(len); 2346 } 2347 segvn_pages_locked -= btop(len); 2348 svd->softlockcnt -= btop(len); 2349 mutex_exit(&freemem_lock); 2350 if (svd->softlockcnt == 0) { 2351 /* 2352 * All SOFTLOCKS are gone. Wakeup any waiting 2353 * unmappers so they can try again to unmap. 2354 * Check for waiters first without the mutex 2355 * held so we don't always grab the mutex on 2356 * softunlocks. 2357 */ 2358 if (AS_ISUNMAPWAIT(seg->s_as)) { 2359 mutex_enter(&seg->s_as->a_contents); 2360 if (AS_ISUNMAPWAIT(seg->s_as)) { 2361 AS_CLRUNMAPWAIT(seg->s_as); 2362 cv_broadcast(&seg->s_as->a_cv); 2363 } 2364 mutex_exit(&seg->s_as->a_contents); 2365 } 2366 } 2367 } 2368 2369 #define PAGE_HANDLED ((page_t *)-1) 2370 2371 /* 2372 * Release all the pages in the NULL terminated ppp list 2373 * which haven't already been converted to PAGE_HANDLED. 2374 */ 2375 static void 2376 segvn_pagelist_rele(page_t **ppp) 2377 { 2378 for (; *ppp != NULL; ppp++) { 2379 if (*ppp != PAGE_HANDLED) 2380 page_unlock(*ppp); 2381 } 2382 } 2383 2384 static int stealcow = 1; 2385 2386 /* 2387 * Workaround for viking chip bug. See bug id 1220902. 2388 * To fix this down in pagefault() would require importing so 2389 * much as and segvn code as to be unmaintainable. 2390 */ 2391 int enable_mbit_wa = 0; 2392 2393 /* 2394 * Handles all the dirty work of getting the right 2395 * anonymous pages and loading up the translations. 2396 * This routine is called only from segvn_fault() 2397 * when looping over the range of addresses requested. 
2398 * 2399 * The basic algorithm here is: 2400 * If this is an anon_zero case 2401 * Call anon_zero to allocate page 2402 * Load up translation 2403 * Return 2404 * endif 2405 * If this is an anon page 2406 * Use anon_getpage to get the page 2407 * else 2408 * Find page in pl[] list passed in 2409 * endif 2410 * If not a cow 2411 * Load up the translation to the page 2412 * return 2413 * endif 2414 * Call anon_private to handle cow 2415 * Load up (writable) translation to new page 2416 */ 2417 static faultcode_t 2418 segvn_faultpage( 2419 struct hat *hat, /* the hat to use for mapping */ 2420 struct seg *seg, /* seg_vn of interest */ 2421 caddr_t addr, /* address in as */ 2422 u_offset_t off, /* offset in vp */ 2423 struct vpage *vpage, /* pointer to vpage for vp, off */ 2424 page_t *pl[], /* object source page pointer */ 2425 uint_t vpprot, /* access allowed to object pages */ 2426 enum fault_type type, /* type of fault */ 2427 enum seg_rw rw, /* type of access at fault */ 2428 int brkcow, /* we may need to break cow */ 2429 int first) /* first page for this fault if 1 */ 2430 { 2431 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 2432 page_t *pp, **ppp; 2433 uint_t pageflags = 0; 2434 page_t *anon_pl[1 + 1]; 2435 page_t *opp = NULL; /* original page */ 2436 uint_t prot; 2437 int err; 2438 int cow; 2439 int claim; 2440 int steal = 0; 2441 ulong_t anon_index; 2442 struct anon *ap, *oldap; 2443 struct anon_map *amp; 2444 int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD; 2445 int anon_lock = 0; 2446 anon_sync_obj_t cookie; 2447 2448 if (svd->flags & MAP_TEXT) { 2449 hat_flag |= HAT_LOAD_TEXT; 2450 } 2451 2452 ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock)); 2453 ASSERT(seg->s_szc == 0); 2454 2455 /* 2456 * Initialize protection value for this page. 2457 * If we have per page protection values check it now. 2458 */ 2459 if (svd->pageprot) { 2460 uint_t protchk; 2461 2462 switch (rw) { 2463 case S_READ: 2464 protchk = PROT_READ; 2465 break; 2466 case S_WRITE: 2467 protchk = PROT_WRITE; 2468 break; 2469 case S_EXEC: 2470 protchk = PROT_EXEC; 2471 break; 2472 case S_OTHER: 2473 default: 2474 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 2475 break; 2476 } 2477 2478 prot = VPP_PROT(vpage); 2479 if ((prot & protchk) == 0) 2480 return (FC_PROT); /* illegal access type */ 2481 } else { 2482 prot = svd->prot; 2483 } 2484 2485 if (type == F_SOFTLOCK && svd->vp != NULL) { 2486 mutex_enter(&freemem_lock); 2487 if (availrmem <= tune.t_minarmem) { 2488 mutex_exit(&freemem_lock); 2489 return (FC_MAKE_ERR(ENOMEM)); /* out of real memory */ 2490 } else { 2491 availrmem--; 2492 svd->softlockcnt++; 2493 segvn_pages_locked++; 2494 } 2495 mutex_exit(&freemem_lock); 2496 } 2497 2498 /* 2499 * Always acquire the anon array lock to prevent 2 threads from 2500 * allocating separate anon slots for the same "addr". 2501 */ 2502 2503 if ((amp = svd->amp) != NULL) { 2504 ASSERT(RW_READ_HELD(&->a_rwlock)); 2505 anon_index = svd->anon_index + seg_page(seg, addr); 2506 anon_array_enter(amp, anon_index, &cookie); 2507 anon_lock = 1; 2508 } 2509 2510 if (svd->vp == NULL && amp != NULL) { 2511 if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL) { 2512 /* 2513 * Allocate a (normally) writable anonymous page of 2514 * zeroes. If no advance reservations, reserve now. 
2515 */ 2516 if (svd->flags & MAP_NORESERVE) { 2517 if (anon_resv_zone(ptob(1), 2518 seg->s_as->a_proc->p_zone)) { 2519 atomic_add_long(&svd->swresv, ptob(1)); 2520 } else { 2521 err = ENOMEM; 2522 goto out; 2523 } 2524 } 2525 if ((pp = anon_zero(seg, addr, &ap, 2526 svd->cred)) == NULL) { 2527 err = ENOMEM; 2528 goto out; /* out of swap space */ 2529 } 2530 /* 2531 * Re-acquire the anon_map lock and 2532 * initialize the anon array entry. 2533 */ 2534 (void) anon_set_ptr(amp->ahp, anon_index, ap, 2535 ANON_SLEEP); 2536 2537 ASSERT(pp->p_szc == 0); 2538 2539 /* 2540 * Handle pages that have been marked for migration 2541 */ 2542 if (lgrp_optimizations()) 2543 page_migrate(seg, addr, &pp, 1); 2544 2545 if (type == F_SOFTLOCK) { 2546 if (!segvn_pp_lock_anonpages(pp, first)) { 2547 page_unlock(pp); 2548 err = ENOMEM; 2549 goto out; 2550 } else { 2551 mutex_enter(&freemem_lock); 2552 svd->softlockcnt++; 2553 segvn_pages_locked++; 2554 mutex_exit(&freemem_lock); 2555 } 2556 } 2557 2558 if (enable_mbit_wa) { 2559 if (rw == S_WRITE) 2560 hat_setmod(pp); 2561 else if (!hat_ismod(pp)) 2562 prot &= ~PROT_WRITE; 2563 } 2564 /* 2565 * If AS_PAGLCK is set in a_flags (via memcntl(2) 2566 * with MC_LOCKAS, MCL_FUTURE) and this is a 2567 * MAP_NORESERVE segment, we may need to 2568 * permanently lock the page as it is being faulted 2569 * for the first time. The following text applies 2570 * only to MAP_NORESERVE segments: 2571 * 2572 * As per memcntl(2), if this segment was created 2573 * after MCL_FUTURE was applied (a "future" 2574 * segment), its pages must be locked. If this 2575 * segment existed at MCL_FUTURE application (a 2576 * "past" segment), the interface is unclear. 2577 * 2578 * We decide to lock only if vpage is present: 2579 * 2580 * - "future" segments will have a vpage array (see 2581 * as_map), and so will be locked as required 2582 * 2583 * - "past" segments may not have a vpage array, 2584 * depending on whether events (such as 2585 * mprotect) have occurred. Locking if vpage 2586 * exists will preserve legacy behavior. Not 2587 * locking if vpage is absent, will not break 2588 * the interface or legacy behavior. Note that 2589 * allocating vpage here if it's absent requires 2590 * upgrading the segvn reader lock, the cost of 2591 * which does not seem worthwhile. 2592 * 2593 * Usually testing and setting VPP_ISPPLOCK and 2594 * VPP_SETPPLOCK requires holding the segvn lock as 2595 * writer, but in this case all readers are 2596 * serializing on the anon array lock. 2597 */ 2598 if (AS_ISPGLCK(seg->s_as) && vpage != NULL && 2599 (svd->flags & MAP_NORESERVE) && 2600 !VPP_ISPPLOCK(vpage)) { 2601 proc_t *p = seg->s_as->a_proc; 2602 ASSERT(svd->type == MAP_PRIVATE); 2603 mutex_enter(&p->p_lock); 2604 if (rctl_incr_locked_mem(p, NULL, PAGESIZE, 2605 1) == 0) { 2606 claim = VPP_PROT(vpage) & PROT_WRITE; 2607 if (page_pp_lock(pp, claim, 0)) { 2608 VPP_SETPPLOCK(vpage); 2609 } else { 2610 rctl_decr_locked_mem(p, NULL, 2611 PAGESIZE, 1); 2612 } 2613 } 2614 mutex_exit(&p->p_lock); 2615 } 2616 2617 hat_memload(hat, addr, pp, prot, hat_flag); 2618 2619 if (!(hat_flag & HAT_LOAD_LOCK)) 2620 page_unlock(pp); 2621 2622 anon_array_exit(&cookie); 2623 return (0); 2624 } 2625 } 2626 2627 /* 2628 * Obtain the page structure via anon_getpage() if it is 2629 * a private copy of an object (the result of a previous 2630 * copy-on-write). 
2631 */ 2632 if (amp != NULL) { 2633 if ((ap = anon_get_ptr(amp->ahp, anon_index)) != NULL) { 2634 err = anon_getpage(&ap, &vpprot, anon_pl, PAGESIZE, 2635 seg, addr, rw, svd->cred); 2636 if (err) 2637 goto out; 2638 2639 if (svd->type == MAP_SHARED) { 2640 /* 2641 * If this is a shared mapping to an 2642 * anon_map, then ignore the write 2643 * permissions returned by anon_getpage(). 2644 * They apply to the private mappings 2645 * of this anon_map. 2646 */ 2647 vpprot |= PROT_WRITE; 2648 } 2649 opp = anon_pl[0]; 2650 } 2651 } 2652 2653 /* 2654 * Search the pl[] list passed in if it is from the 2655 * original object (i.e., not a private copy). 2656 */ 2657 if (opp == NULL) { 2658 /* 2659 * Find original page. We must be bringing it in 2660 * from the list in pl[]. 2661 */ 2662 for (ppp = pl; (opp = *ppp) != NULL; ppp++) { 2663 if (opp == PAGE_HANDLED) 2664 continue; 2665 ASSERT(opp->p_vnode == svd->vp); /* XXX */ 2666 if (opp->p_offset == off) 2667 break; 2668 } 2669 if (opp == NULL) { 2670 panic("segvn_faultpage not found"); 2671 /*NOTREACHED*/ 2672 } 2673 *ppp = PAGE_HANDLED; 2674 2675 } 2676 2677 ASSERT(PAGE_LOCKED(opp)); 2678 2679 TRACE_3(TR_FAC_VM, TR_SEGVN_FAULT, 2680 "segvn_fault:pp %p vp %p offset %llx", 2681 opp, NULL, 0); 2682 2683 /* 2684 * The fault is treated as a copy-on-write fault if a 2685 * write occurs on a private segment and the object 2686 * page (i.e., mapping) is write protected. We assume 2687 * that fatal protection checks have already been made. 2688 */ 2689 2690 if (brkcow) { 2691 ASSERT(svd->tr_state == SEGVN_TR_OFF); 2692 cow = !(vpprot & PROT_WRITE); 2693 } else if (svd->tr_state == SEGVN_TR_ON) { 2694 /* 2695 * If we are doing text replication COW on first touch. 2696 */ 2697 ASSERT(amp != NULL); 2698 ASSERT(svd->vp != NULL); 2699 ASSERT(rw != S_WRITE); 2700 cow = (ap == NULL); 2701 } else { 2702 cow = 0; 2703 } 2704 2705 /* 2706 * If not a copy-on-write case load the translation 2707 * and return. 2708 */ 2709 if (cow == 0) { 2710 2711 /* 2712 * Handle pages that have been marked for migration 2713 */ 2714 if (lgrp_optimizations()) 2715 page_migrate(seg, addr, &opp, 1); 2716 2717 if (type == F_SOFTLOCK && svd->vp == NULL) { 2718 2719 ASSERT(opp->p_szc == 0 || 2720 (svd->type == MAP_SHARED && 2721 amp != NULL && amp->a_szc != 0)); 2722 2723 if (!segvn_pp_lock_anonpages(opp, first)) { 2724 page_unlock(opp); 2725 err = ENOMEM; 2726 goto out; 2727 } else { 2728 mutex_enter(&freemem_lock); 2729 svd->softlockcnt++; 2730 segvn_pages_locked++; 2731 mutex_exit(&freemem_lock); 2732 } 2733 } 2734 if (IS_VMODSORT(opp->p_vnode) || enable_mbit_wa) { 2735 if (rw == S_WRITE) 2736 hat_setmod(opp); 2737 else if (rw != S_OTHER && !hat_ismod(opp)) 2738 prot &= ~PROT_WRITE; 2739 } 2740 2741 hat_memload(hat, addr, opp, prot & vpprot, hat_flag); 2742 2743 if (!(hat_flag & HAT_LOAD_LOCK)) 2744 page_unlock(opp); 2745 2746 if (anon_lock) { 2747 anon_array_exit(&cookie); 2748 } 2749 return (0); 2750 } 2751 2752 hat_setref(opp); 2753 2754 ASSERT(amp != NULL && anon_lock); 2755 2756 /* 2757 * Steal the page only if it isn't a private page 2758 * since stealing a private page is not worth the effort. 
2759 */ 2760 if ((ap = anon_get_ptr(amp->ahp, anon_index)) == NULL) 2761 steal = 1; 2762 2763 /* 2764 * Steal the original page if the following conditions are true: 2765 * 2766 * We are low on memory, the page is not private, page is not large, 2767 * not shared, not modified, not `locked' or if we have it `locked' 2768 * (i.e., p_cowcnt == 1 and p_lckcnt == 0, which also implies 2769 * that the page is not shared) and if it doesn't have any 2770 * translations. page_struct_lock isn't needed to look at p_cowcnt 2771 * and p_lckcnt because we first get exclusive lock on page. 2772 */ 2773 (void) hat_pagesync(opp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD); 2774 2775 if (stealcow && freemem < minfree && steal && opp->p_szc == 0 && 2776 page_tryupgrade(opp) && !hat_ismod(opp) && 2777 ((opp->p_lckcnt == 0 && opp->p_cowcnt == 0) || 2778 (opp->p_lckcnt == 0 && opp->p_cowcnt == 1 && 2779 vpage != NULL && VPP_ISPPLOCK(vpage)))) { 2780 /* 2781 * Check if this page has other translations 2782 * after unloading our translation. 2783 */ 2784 if (hat_page_is_mapped(opp)) { 2785 hat_unload(seg->s_as->a_hat, addr, PAGESIZE, 2786 HAT_UNLOAD); 2787 } 2788 2789 /* 2790 * hat_unload() might sync back someone else's recent 2791 * modification, so check again. 2792 */ 2793 if (!hat_ismod(opp) && !hat_page_is_mapped(opp)) 2794 pageflags |= STEAL_PAGE; 2795 } 2796 2797 /* 2798 * If we have a vpage pointer, see if it indicates that we have 2799 * ``locked'' the page we map -- if so, tell anon_private to 2800 * transfer the locking resource to the new page. 2801 * 2802 * See Statement at the beginning of segvn_lockop regarding 2803 * the way lockcnts/cowcnts are handled during COW. 2804 * 2805 */ 2806 if (vpage != NULL && VPP_ISPPLOCK(vpage)) 2807 pageflags |= LOCK_PAGE; 2808 2809 /* 2810 * Allocate a private page and perform the copy. 2811 * For MAP_NORESERVE reserve swap space now, unless this 2812 * is a cow fault on an existing anon page in which case 2813 * MAP_NORESERVE will have made advance reservations. 2814 */ 2815 if ((svd->flags & MAP_NORESERVE) && (ap == NULL)) { 2816 if (anon_resv_zone(ptob(1), seg->s_as->a_proc->p_zone)) { 2817 atomic_add_long(&svd->swresv, ptob(1)); 2818 } else { 2819 page_unlock(opp); 2820 err = ENOMEM; 2821 goto out; 2822 } 2823 } 2824 oldap = ap; 2825 pp = anon_private(&ap, seg, addr, prot, opp, pageflags, svd->cred); 2826 if (pp == NULL) { 2827 err = ENOMEM; /* out of swap space */ 2828 goto out; 2829 } 2830 2831 /* 2832 * If we copied away from an anonymous page, then 2833 * we are one step closer to freeing up an anon slot. 2834 * 2835 * NOTE: The original anon slot must be released while 2836 * holding the "anon_map" lock. This is necessary to prevent 2837 * other threads from obtaining a pointer to the anon slot 2838 * which may be freed if its "refcnt" is 1. 
2839 */ 2840 if (oldap != NULL) 2841 anon_decref(oldap); 2842 2843 (void) anon_set_ptr(amp->ahp, anon_index, ap, ANON_SLEEP); 2844 2845 /* 2846 * Handle pages that have been marked for migration 2847 */ 2848 if (lgrp_optimizations()) 2849 page_migrate(seg, addr, &pp, 1); 2850 2851 ASSERT(pp->p_szc == 0); 2852 if (type == F_SOFTLOCK && svd->vp == NULL) { 2853 if (!segvn_pp_lock_anonpages(pp, first)) { 2854 page_unlock(pp); 2855 err = ENOMEM; 2856 goto out; 2857 } else { 2858 mutex_enter(&freemem_lock); 2859 svd->softlockcnt++; 2860 segvn_pages_locked++; 2861 mutex_exit(&freemem_lock); 2862 } 2863 } 2864 2865 ASSERT(!IS_VMODSORT(pp->p_vnode)); 2866 if (enable_mbit_wa) { 2867 if (rw == S_WRITE) 2868 hat_setmod(pp); 2869 else if (!hat_ismod(pp)) 2870 prot &= ~PROT_WRITE; 2871 } 2872 2873 hat_memload(hat, addr, pp, prot, hat_flag); 2874 2875 if (!(hat_flag & HAT_LOAD_LOCK)) 2876 page_unlock(pp); 2877 2878 ASSERT(anon_lock); 2879 anon_array_exit(&cookie); 2880 return (0); 2881 out: 2882 if (anon_lock) 2883 anon_array_exit(&cookie); 2884 2885 if (type == F_SOFTLOCK && svd->vp != NULL) { 2886 mutex_enter(&freemem_lock); 2887 availrmem++; 2888 segvn_pages_locked--; 2889 svd->softlockcnt--; 2890 mutex_exit(&freemem_lock); 2891 } 2892 return (FC_MAKE_ERR(err)); 2893 } 2894 2895 /* 2896 * relocate a bunch of smaller targ pages into one large repl page. all targ 2897 * pages must be complete pages smaller than replacement pages. 2898 * it's assumed that no page's szc can change since they are all PAGESIZE or 2899 * complete large pages locked SHARED. 2900 */ 2901 static void 2902 segvn_relocate_pages(page_t **targ, page_t *replacement) 2903 { 2904 page_t *pp; 2905 pgcnt_t repl_npgs, curnpgs; 2906 pgcnt_t i; 2907 uint_t repl_szc = replacement->p_szc; 2908 page_t *first_repl = replacement; 2909 page_t *repl; 2910 spgcnt_t npgs; 2911 2912 VM_STAT_ADD(segvnvmstats.relocatepages[0]); 2913 2914 ASSERT(repl_szc != 0); 2915 npgs = repl_npgs = page_get_pagecnt(repl_szc); 2916 2917 i = 0; 2918 while (repl_npgs) { 2919 spgcnt_t nreloc; 2920 int err; 2921 ASSERT(replacement != NULL); 2922 pp = targ[i]; 2923 ASSERT(pp->p_szc < repl_szc); 2924 ASSERT(PAGE_EXCL(pp)); 2925 ASSERT(!PP_ISFREE(pp)); 2926 curnpgs = page_get_pagecnt(pp->p_szc); 2927 if (curnpgs == 1) { 2928 VM_STAT_ADD(segvnvmstats.relocatepages[1]); 2929 repl = replacement; 2930 page_sub(&replacement, repl); 2931 ASSERT(PAGE_EXCL(repl)); 2932 ASSERT(!PP_ISFREE(repl)); 2933 ASSERT(repl->p_szc == repl_szc); 2934 } else { 2935 page_t *repl_savepp; 2936 int j; 2937 VM_STAT_ADD(segvnvmstats.relocatepages[2]); 2938 repl_savepp = replacement; 2939 for (j = 0; j < curnpgs; j++) { 2940 repl = replacement; 2941 page_sub(&replacement, repl); 2942 ASSERT(PAGE_EXCL(repl)); 2943 ASSERT(!PP_ISFREE(repl)); 2944 ASSERT(repl->p_szc == repl_szc); 2945 ASSERT(page_pptonum(targ[i + j]) == 2946 page_pptonum(targ[i]) + j); 2947 } 2948 repl = repl_savepp; 2949 ASSERT(IS_P2ALIGNED(page_pptonum(repl), curnpgs)); 2950 } 2951 err = page_relocate(&pp, &repl, 0, 1, &nreloc, NULL); 2952 if (err || nreloc != curnpgs) { 2953 panic("segvn_relocate_pages: " 2954 "page_relocate failed err=%d curnpgs=%ld " 2955 "nreloc=%ld", err, curnpgs, nreloc); 2956 } 2957 ASSERT(curnpgs <= repl_npgs); 2958 repl_npgs -= curnpgs; 2959 i += curnpgs; 2960 } 2961 ASSERT(replacement == NULL); 2962 2963 repl = first_repl; 2964 repl_npgs = npgs; 2965 for (i = 0; i < repl_npgs; i++) { 2966 ASSERT(PAGE_EXCL(repl)); 2967 ASSERT(!PP_ISFREE(repl)); 2968 targ[i] = repl; 2969 page_downgrade(targ[i]); 2970 repl++; 2971 
} 2972 } 2973 2974 /* 2975 * Check if all pages in ppa array are complete smaller than szc pages and 2976 * their roots will still be aligned relative to their current size if the 2977 * entire ppa array is relocated into one szc page. If these conditions are 2978 * not met return 0. 2979 * 2980 * If all pages are properly aligned attempt to upgrade their locks 2981 * to exclusive mode. If it fails set *upgrdfail to 1 and return 0. 2982 * upgrdfail was set to 0 by caller. 2983 * 2984 * Return 1 if all pages are aligned and locked exclusively. 2985 * 2986 * If all pages in ppa array happen to be physically contiguous to make one 2987 * szc page and all exclusive locks are successfully obtained promote the page 2988 * size to szc and set *pszc to szc. Return 1 with pages locked shared. 2989 */ 2990 static int 2991 segvn_full_szcpages(page_t **ppa, uint_t szc, int *upgrdfail, uint_t *pszc) 2992 { 2993 page_t *pp; 2994 pfn_t pfn; 2995 pgcnt_t totnpgs = page_get_pagecnt(szc); 2996 pfn_t first_pfn; 2997 int contig = 1; 2998 pgcnt_t i; 2999 pgcnt_t j; 3000 uint_t curszc; 3001 pgcnt_t curnpgs; 3002 int root = 0; 3003 3004 ASSERT(szc > 0); 3005 3006 VM_STAT_ADD(segvnvmstats.fullszcpages[0]); 3007 3008 for (i = 0; i < totnpgs; i++) { 3009 pp = ppa[i]; 3010 ASSERT(PAGE_SHARED(pp)); 3011 ASSERT(!PP_ISFREE(pp)); 3012 pfn = page_pptonum(pp); 3013 if (i == 0) { 3014 if (!IS_P2ALIGNED(pfn, totnpgs)) { 3015 contig = 0; 3016 } else { 3017 first_pfn = pfn; 3018 } 3019 } else if (contig && pfn != first_pfn + i) { 3020 contig = 0; 3021 } 3022 if (pp->p_szc == 0) { 3023 if (root) { 3024 VM_STAT_ADD(segvnvmstats.fullszcpages[1]); 3025 return (0); 3026 } 3027 } else if (!root) { 3028 if ((curszc = pp->p_szc) >= szc) { 3029 VM_STAT_ADD(segvnvmstats.fullszcpages[2]); 3030 return (0); 3031 } 3032 if (curszc == 0) { 3033 /* 3034 * p_szc changed means we don't have all pages 3035 * locked. return failure. 3036 */ 3037 VM_STAT_ADD(segvnvmstats.fullszcpages[3]); 3038 return (0); 3039 } 3040 curnpgs = page_get_pagecnt(curszc); 3041 if (!IS_P2ALIGNED(pfn, curnpgs) || 3042 !IS_P2ALIGNED(i, curnpgs)) { 3043 VM_STAT_ADD(segvnvmstats.fullszcpages[4]); 3044 return (0); 3045 } 3046 root = 1; 3047 } else { 3048 ASSERT(i > 0); 3049 VM_STAT_ADD(segvnvmstats.fullszcpages[5]); 3050 if (pp->p_szc != curszc) { 3051 VM_STAT_ADD(segvnvmstats.fullszcpages[6]); 3052 return (0); 3053 } 3054 if (pfn - 1 != page_pptonum(ppa[i - 1])) { 3055 panic("segvn_full_szcpages: " 3056 "large page not physically contiguous"); 3057 } 3058 if (P2PHASE(pfn, curnpgs) == curnpgs - 1) { 3059 root = 0; 3060 } 3061 } 3062 } 3063 3064 for (i = 0; i < totnpgs; i++) { 3065 ASSERT(ppa[i]->p_szc < szc); 3066 if (!page_tryupgrade(ppa[i])) { 3067 for (j = 0; j < i; j++) { 3068 page_downgrade(ppa[j]); 3069 } 3070 *pszc = ppa[i]->p_szc; 3071 *upgrdfail = 1; 3072 VM_STAT_ADD(segvnvmstats.fullszcpages[7]); 3073 return (0); 3074 } 3075 } 3076 3077 /* 3078 * When a page is put a free cachelist its szc is set to 0. if file 3079 * system reclaimed pages from cachelist targ pages will be physically 3080 * contiguous with 0 p_szc. in this case just upgrade szc of targ 3081 * pages without any relocations. 3082 * To avoid any hat issues with previous small mappings 3083 * hat_pageunload() the target pages first. 
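 *
 * The promotion below is done in three passes over the whole group:
 * hat_pageunload() every constituent page first, then set p_szc on
 * every page, and only then page_downgrade() the EXCL locks back to
 * SHARED, once every p_szc has been updated.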
3084 */ 3085 if (contig) { 3086 VM_STAT_ADD(segvnvmstats.fullszcpages[8]); 3087 for (i = 0; i < totnpgs; i++) { 3088 (void) hat_pageunload(ppa[i], HAT_FORCE_PGUNLOAD); 3089 } 3090 for (i = 0; i < totnpgs; i++) { 3091 ppa[i]->p_szc = szc; 3092 } 3093 for (i = 0; i < totnpgs; i++) { 3094 ASSERT(PAGE_EXCL(ppa[i])); 3095 page_downgrade(ppa[i]); 3096 } 3097 if (pszc != NULL) { 3098 *pszc = szc; 3099 } 3100 } 3101 VM_STAT_ADD(segvnvmstats.fullszcpages[9]); 3102 return (1); 3103 } 3104 3105 /* 3106 * Create physically contiguous pages for [vp, off] - [vp, off + 3107 * page_size(szc)) range and for a private segment return them in the ppa array. 3108 * Pages are created either via IO or relocations. 3109 * 3110 * Return 1 on success and 0 on failure. 3111 * 3112 * If physically contiguous pages already exist for this range return 1 without 3113 * filling the ppa array. Caller initializes ppa[0] as NULL to detect that ppa 3114 * array wasn't filled. In this case caller fills ppa array via VOP_GETPAGE(). 3115 */ 3116 3117 static int 3118 segvn_fill_vp_pages(struct segvn_data *svd, vnode_t *vp, u_offset_t off, 3119 uint_t szc, page_t **ppa, page_t **ppplist, uint_t *ret_pszc, 3120 int *downsize) 3121 3122 { 3123 page_t *pplist = *ppplist; 3124 size_t pgsz = page_get_pagesize(szc); 3125 pgcnt_t pages = btop(pgsz); 3126 ulong_t start_off = off; 3127 u_offset_t eoff = off + pgsz; 3128 spgcnt_t nreloc; 3129 u_offset_t io_off = off; 3130 size_t io_len; 3131 page_t *io_pplist = NULL; 3132 page_t *done_pplist = NULL; 3133 pgcnt_t pgidx = 0; 3134 page_t *pp; 3135 page_t *newpp; 3136 page_t *targpp; 3137 int io_err = 0; 3138 int i; 3139 pfn_t pfn; 3140 ulong_t ppages; 3141 page_t *targ_pplist = NULL; 3142 page_t *repl_pplist = NULL; 3143 page_t *tmp_pplist; 3144 int nios = 0; 3145 uint_t pszc; 3146 struct vattr va; 3147 3148 VM_STAT_ADD(segvnvmstats.fill_vp_pages[0]); 3149 3150 ASSERT(szc != 0); 3151 ASSERT(pplist->p_szc == szc); 3152 3153 /* 3154 * downsize will be set to 1 only if we fail to lock pages. This will 3155 * allow subsequent faults to try to relocate the page again. If we 3156 * fail due to misalignment don't downsize and let the caller map the 3157 * whole region with small mappings to avoid more faults into the area 3158 * where we can't get large pages anyway. 3159 */ 3160 *downsize = 0; 3161 3162 while (off < eoff) { 3163 newpp = pplist; 3164 ASSERT(newpp != NULL); 3165 ASSERT(PAGE_EXCL(newpp)); 3166 ASSERT(!PP_ISFREE(newpp)); 3167 /* 3168 * we pass NULL for nrelocp to page_lookup_create() 3169 * so that it doesn't relocate. We relocate here 3170 * later only after we make sure we can lock all 3171 * pages in the range we handle and they are all 3172 * aligned.
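 *
 * page_lookup_create() below either inserts our preallocated
 * replacement page (pp == newpp), in which case the page is queued on
 * io_pplist to be read in later with VOP_PAGEIO(), or it returns an
 * existing page, in which case the alignment and size checks that
 * follow decide whether to use it as is, relocate it into the large
 * page, or give up on the large mapping.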
3173 */ 3174 pp = page_lookup_create(vp, off, SE_SHARED, newpp, NULL, 0); 3175 ASSERT(pp != NULL); 3176 ASSERT(!PP_ISFREE(pp)); 3177 ASSERT(pp->p_vnode == vp); 3178 ASSERT(pp->p_offset == off); 3179 if (pp == newpp) { 3180 VM_STAT_ADD(segvnvmstats.fill_vp_pages[1]); 3181 page_sub(&pplist, pp); 3182 ASSERT(PAGE_EXCL(pp)); 3183 ASSERT(page_iolock_assert(pp)); 3184 page_list_concat(&io_pplist, &pp); 3185 off += PAGESIZE; 3186 continue; 3187 } 3188 VM_STAT_ADD(segvnvmstats.fill_vp_pages[2]); 3189 pfn = page_pptonum(pp); 3190 pszc = pp->p_szc; 3191 if (pszc >= szc && targ_pplist == NULL && io_pplist == NULL && 3192 IS_P2ALIGNED(pfn, pages)) { 3193 ASSERT(repl_pplist == NULL); 3194 ASSERT(done_pplist == NULL); 3195 ASSERT(pplist == *ppplist); 3196 page_unlock(pp); 3197 page_free_replacement_page(pplist); 3198 page_create_putback(pages); 3199 *ppplist = NULL; 3200 VM_STAT_ADD(segvnvmstats.fill_vp_pages[3]); 3201 return (1); 3202 } 3203 if (pszc >= szc) { 3204 page_unlock(pp); 3205 segvn_faultvnmpss_align_err1++; 3206 goto out; 3207 } 3208 ppages = page_get_pagecnt(pszc); 3209 if (!IS_P2ALIGNED(pfn, ppages)) { 3210 ASSERT(pszc > 0); 3211 /* 3212 * sizing down to pszc won't help. 3213 */ 3214 page_unlock(pp); 3215 segvn_faultvnmpss_align_err2++; 3216 goto out; 3217 } 3218 pfn = page_pptonum(newpp); 3219 if (!IS_P2ALIGNED(pfn, ppages)) { 3220 ASSERT(pszc > 0); 3221 /* 3222 * sizing down to pszc won't help. 3223 */ 3224 page_unlock(pp); 3225 segvn_faultvnmpss_align_err3++; 3226 goto out; 3227 } 3228 if (!PAGE_EXCL(pp)) { 3229 VM_STAT_ADD(segvnvmstats.fill_vp_pages[4]); 3230 page_unlock(pp); 3231 *downsize = 1; 3232 *ret_pszc = pp->p_szc; 3233 goto out; 3234 } 3235 targpp = pp; 3236 if (io_pplist != NULL) { 3237 VM_STAT_ADD(segvnvmstats.fill_vp_pages[5]); 3238 io_len = off - io_off; 3239 /* 3240 * Some file systems like NFS don't check EOF 3241 * conditions in VOP_PAGEIO(). Check it here 3242 * now that pages are locked SE_EXCL. Any file 3243 * truncation will wait until the pages are 3244 * unlocked so no need to worry that file will 3245 * be truncated after we check its size here. 3246 * XXX fix NFS to remove this check. 
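 *
 * If the file turns out to be too short to cover the pending I/O range
 * (btopr(va.va_size) < btopr(io_off + io_len)) the large page attempt
 * is abandoned: *downsize is set and *ret_pszc is cleared so the caller
 * can fall back to a smaller mapping size.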
3247 */ 3248 va.va_mask = AT_SIZE; 3249 if (VOP_GETATTR(vp, &va, ATTR_HINT, svd->cred) != 0) { 3250 VM_STAT_ADD(segvnvmstats.fill_vp_pages[6]); 3251 page_unlock(targpp); 3252 goto out; 3253 } 3254 if (btopr(va.va_size) < btopr(io_off + io_len)) { 3255 VM_STAT_ADD(segvnvmstats.fill_vp_pages[7]); 3256 *downsize = 1; 3257 *ret_pszc = 0; 3258 page_unlock(targpp); 3259 goto out; 3260 } 3261 io_err = VOP_PAGEIO(vp, io_pplist, io_off, io_len, 3262 B_READ, svd->cred); 3263 if (io_err) { 3264 VM_STAT_ADD(segvnvmstats.fill_vp_pages[8]); 3265 page_unlock(targpp); 3266 if (io_err == EDEADLK) { 3267 segvn_vmpss_pageio_deadlk_err++; 3268 } 3269 goto out; 3270 } 3271 nios++; 3272 VM_STAT_ADD(segvnvmstats.fill_vp_pages[9]); 3273 while (io_pplist != NULL) { 3274 pp = io_pplist; 3275 page_sub(&io_pplist, pp); 3276 ASSERT(page_iolock_assert(pp)); 3277 page_io_unlock(pp); 3278 pgidx = (pp->p_offset - start_off) >> 3279 PAGESHIFT; 3280 ASSERT(pgidx < pages); 3281 ppa[pgidx] = pp; 3282 page_list_concat(&done_pplist, &pp); 3283 } 3284 } 3285 pp = targpp; 3286 ASSERT(PAGE_EXCL(pp)); 3287 ASSERT(pp->p_szc <= pszc); 3288 if (pszc != 0 && !group_page_trylock(pp, SE_EXCL)) { 3289 VM_STAT_ADD(segvnvmstats.fill_vp_pages[10]); 3290 page_unlock(pp); 3291 *downsize = 1; 3292 *ret_pszc = pp->p_szc; 3293 goto out; 3294 } 3295 VM_STAT_ADD(segvnvmstats.fill_vp_pages[11]); 3296 /* 3297 * page szc chould have changed before the entire group was 3298 * locked. reread page szc. 3299 */ 3300 pszc = pp->p_szc; 3301 ppages = page_get_pagecnt(pszc); 3302 3303 /* link just the roots */ 3304 page_list_concat(&targ_pplist, &pp); 3305 page_sub(&pplist, newpp); 3306 page_list_concat(&repl_pplist, &newpp); 3307 off += PAGESIZE; 3308 while (--ppages != 0) { 3309 newpp = pplist; 3310 page_sub(&pplist, newpp); 3311 off += PAGESIZE; 3312 } 3313 io_off = off; 3314 } 3315 if (io_pplist != NULL) { 3316 VM_STAT_ADD(segvnvmstats.fill_vp_pages[12]); 3317 io_len = eoff - io_off; 3318 va.va_mask = AT_SIZE; 3319 if (VOP_GETATTR(vp, &va, ATTR_HINT, svd->cred) != 0) { 3320 VM_STAT_ADD(segvnvmstats.fill_vp_pages[13]); 3321 goto out; 3322 } 3323 if (btopr(va.va_size) < btopr(io_off + io_len)) { 3324 VM_STAT_ADD(segvnvmstats.fill_vp_pages[14]); 3325 *downsize = 1; 3326 *ret_pszc = 0; 3327 goto out; 3328 } 3329 io_err = VOP_PAGEIO(vp, io_pplist, io_off, io_len, 3330 B_READ, svd->cred); 3331 if (io_err) { 3332 VM_STAT_ADD(segvnvmstats.fill_vp_pages[15]); 3333 if (io_err == EDEADLK) { 3334 segvn_vmpss_pageio_deadlk_err++; 3335 } 3336 goto out; 3337 } 3338 nios++; 3339 while (io_pplist != NULL) { 3340 pp = io_pplist; 3341 page_sub(&io_pplist, pp); 3342 ASSERT(page_iolock_assert(pp)); 3343 page_io_unlock(pp); 3344 pgidx = (pp->p_offset - start_off) >> PAGESHIFT; 3345 ASSERT(pgidx < pages); 3346 ppa[pgidx] = pp; 3347 } 3348 } 3349 /* 3350 * we're now bound to succeed or panic. 3351 * remove pages from done_pplist. it's not needed anymore. 
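 *
 * From here on the target pages collected on targ_pplist are moved into
 * the preallocated replacement pages with page_relocate(); a failure at
 * this point is treated as fatal, matching the "bound to succeed or
 * panic" note above.  Each relocated constituent is recorded in ppa[]
 * at its offset within the large page.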
3352 */ 3353 while (done_pplist != NULL) { 3354 pp = done_pplist; 3355 page_sub(&done_pplist, pp); 3356 } 3357 VM_STAT_ADD(segvnvmstats.fill_vp_pages[16]); 3358 ASSERT(pplist == NULL); 3359 *ppplist = NULL; 3360 while (targ_pplist != NULL) { 3361 int ret; 3362 VM_STAT_ADD(segvnvmstats.fill_vp_pages[17]); 3363 ASSERT(repl_pplist); 3364 pp = targ_pplist; 3365 page_sub(&targ_pplist, pp); 3366 pgidx = (pp->p_offset - start_off) >> PAGESHIFT; 3367 newpp = repl_pplist; 3368 page_sub(&repl_pplist, newpp); 3369 #ifdef DEBUG 3370 pfn = page_pptonum(pp); 3371 pszc = pp->p_szc; 3372 ppages = page_get_pagecnt(pszc); 3373 ASSERT(IS_P2ALIGNED(pfn, ppages)); 3374 pfn = page_pptonum(newpp); 3375 ASSERT(IS_P2ALIGNED(pfn, ppages)); 3376 ASSERT(P2PHASE(pfn, pages) == pgidx); 3377 #endif 3378 nreloc = 0; 3379 ret = page_relocate(&pp, &newpp, 0, 1, &nreloc, NULL); 3380 if (ret != 0 || nreloc == 0) { 3381 panic("segvn_fill_vp_pages: " 3382 "page_relocate failed"); 3383 } 3384 pp = newpp; 3385 while (nreloc-- != 0) { 3386 ASSERT(PAGE_EXCL(pp)); 3387 ASSERT(pp->p_vnode == vp); 3388 ASSERT(pgidx == 3389 ((pp->p_offset - start_off) >> PAGESHIFT)); 3390 ppa[pgidx++] = pp; 3391 pp++; 3392 } 3393 } 3394 3395 if (svd->type == MAP_PRIVATE) { 3396 VM_STAT_ADD(segvnvmstats.fill_vp_pages[18]); 3397 for (i = 0; i < pages; i++) { 3398 ASSERT(ppa[i] != NULL); 3399 ASSERT(PAGE_EXCL(ppa[i])); 3400 ASSERT(ppa[i]->p_vnode == vp); 3401 ASSERT(ppa[i]->p_offset == 3402 start_off + (i << PAGESHIFT)); 3403 page_downgrade(ppa[i]); 3404 } 3405 ppa[pages] = NULL; 3406 } else { 3407 VM_STAT_ADD(segvnvmstats.fill_vp_pages[19]); 3408 /* 3409 * the caller will still call VOP_GETPAGE() for shared segments 3410 * to check FS write permissions. For private segments we map 3411 * file read only anyway. so no VOP_GETPAGE is needed. 3412 */ 3413 for (i = 0; i < pages; i++) { 3414 ASSERT(ppa[i] != NULL); 3415 ASSERT(PAGE_EXCL(ppa[i])); 3416 ASSERT(ppa[i]->p_vnode == vp); 3417 ASSERT(ppa[i]->p_offset == 3418 start_off + (i << PAGESHIFT)); 3419 page_unlock(ppa[i]); 3420 } 3421 ppa[0] = NULL; 3422 } 3423 3424 return (1); 3425 out: 3426 /* 3427 * Do the cleanup. Unlock target pages we didn't relocate. They are 3428 * linked on targ_pplist by root pages. reassemble unused replacement 3429 * and io pages back to pplist. 
3430 */ 3431 if (io_pplist != NULL) { 3432 VM_STAT_ADD(segvnvmstats.fill_vp_pages[20]); 3433 pp = io_pplist; 3434 do { 3435 ASSERT(pp->p_vnode == vp); 3436 ASSERT(pp->p_offset == io_off); 3437 ASSERT(page_iolock_assert(pp)); 3438 page_io_unlock(pp); 3439 page_hashout(pp, NULL); 3440 io_off += PAGESIZE; 3441 } while ((pp = pp->p_next) != io_pplist); 3442 page_list_concat(&io_pplist, &pplist); 3443 pplist = io_pplist; 3444 } 3445 tmp_pplist = NULL; 3446 while (targ_pplist != NULL) { 3447 VM_STAT_ADD(segvnvmstats.fill_vp_pages[21]); 3448 pp = targ_pplist; 3449 ASSERT(PAGE_EXCL(pp)); 3450 page_sub(&targ_pplist, pp); 3451 3452 pszc = pp->p_szc; 3453 ppages = page_get_pagecnt(pszc); 3454 ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages)); 3455 3456 if (pszc != 0) { 3457 group_page_unlock(pp); 3458 } 3459 page_unlock(pp); 3460 3461 pp = repl_pplist; 3462 ASSERT(pp != NULL); 3463 ASSERT(PAGE_EXCL(pp)); 3464 ASSERT(pp->p_szc == szc); 3465 page_sub(&repl_pplist, pp); 3466 3467 ASSERT(IS_P2ALIGNED(page_pptonum(pp), ppages)); 3468 3469 /* relink replacement page */ 3470 page_list_concat(&tmp_pplist, &pp); 3471 while (--ppages != 0) { 3472 VM_STAT_ADD(segvnvmstats.fill_vp_pages[22]); 3473 pp++; 3474 ASSERT(PAGE_EXCL(pp)); 3475 ASSERT(pp->p_szc == szc); 3476 page_list_concat(&tmp_pplist, &pp); 3477 } 3478 } 3479 if (tmp_pplist != NULL) { 3480 VM_STAT_ADD(segvnvmstats.fill_vp_pages[23]); 3481 page_list_concat(&tmp_pplist, &pplist); 3482 pplist = tmp_pplist; 3483 } 3484 /* 3485 * at this point all pages are either on done_pplist or 3486 * pplist. They can't be all on done_pplist otherwise 3487 * we'd've been done. 3488 */ 3489 ASSERT(pplist != NULL); 3490 if (nios != 0) { 3491 VM_STAT_ADD(segvnvmstats.fill_vp_pages[24]); 3492 pp = pplist; 3493 do { 3494 VM_STAT_ADD(segvnvmstats.fill_vp_pages[25]); 3495 ASSERT(pp->p_szc == szc); 3496 ASSERT(PAGE_EXCL(pp)); 3497 ASSERT(pp->p_vnode != vp); 3498 pp->p_szc = 0; 3499 } while ((pp = pp->p_next) != pplist); 3500 3501 pp = done_pplist; 3502 do { 3503 VM_STAT_ADD(segvnvmstats.fill_vp_pages[26]); 3504 ASSERT(pp->p_szc == szc); 3505 ASSERT(PAGE_EXCL(pp)); 3506 ASSERT(pp->p_vnode == vp); 3507 pp->p_szc = 0; 3508 } while ((pp = pp->p_next) != done_pplist); 3509 3510 while (pplist != NULL) { 3511 VM_STAT_ADD(segvnvmstats.fill_vp_pages[27]); 3512 pp = pplist; 3513 page_sub(&pplist, pp); 3514 page_free(pp, 0); 3515 } 3516 3517 while (done_pplist != NULL) { 3518 VM_STAT_ADD(segvnvmstats.fill_vp_pages[28]); 3519 pp = done_pplist; 3520 page_sub(&done_pplist, pp); 3521 page_unlock(pp); 3522 } 3523 *ppplist = NULL; 3524 return (0); 3525 } 3526 ASSERT(pplist == *ppplist); 3527 if (io_err) { 3528 VM_STAT_ADD(segvnvmstats.fill_vp_pages[29]); 3529 /* 3530 * don't downsize on io error. 3531 * see if vop_getpage succeeds. 3532 * pplist may still be used in this case 3533 * for relocations. 
3534 */ 3535 return (0); 3536 } 3537 VM_STAT_ADD(segvnvmstats.fill_vp_pages[30]); 3538 page_free_replacement_page(pplist); 3539 page_create_putback(pages); 3540 *ppplist = NULL; 3541 return (0); 3542 } 3543 3544 int segvn_anypgsz = 0; 3545 3546 #define SEGVN_RESTORE_SOFTLOCK(type, pages) \ 3547 if ((type) == F_SOFTLOCK) { \ 3548 mutex_enter(&freemem_lock); \ 3549 availrmem += (pages); \ 3550 segvn_pages_locked -= (pages); \ 3551 svd->softlockcnt -= (pages); \ 3552 mutex_exit(&freemem_lock); \ 3553 } 3554 3555 #define SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot) \ 3556 if (IS_VMODSORT((ppa)[0]->p_vnode)) { \ 3557 if ((rw) == S_WRITE) { \ 3558 for (i = 0; i < (pages); i++) { \ 3559 ASSERT((ppa)[i]->p_vnode == \ 3560 (ppa)[0]->p_vnode); \ 3561 hat_setmod((ppa)[i]); \ 3562 } \ 3563 } else if ((rw) != S_OTHER && \ 3564 ((prot) & (vpprot) & PROT_WRITE)) { \ 3565 for (i = 0; i < (pages); i++) { \ 3566 ASSERT((ppa)[i]->p_vnode == \ 3567 (ppa)[0]->p_vnode); \ 3568 if (!hat_ismod((ppa)[i])) { \ 3569 prot &= ~PROT_WRITE; \ 3570 break; \ 3571 } \ 3572 } \ 3573 } \ 3574 } 3575 3576 #ifdef VM_STATS 3577 3578 #define SEGVN_VMSTAT_FLTVNPAGES(idx) \ 3579 VM_STAT_ADD(segvnvmstats.fltvnpages[(idx)]); 3580 3581 #else /* VM_STATS */ 3582 3583 #define SEGVN_VMSTAT_FLTVNPAGES(idx) 3584 3585 #endif 3586 3587 static faultcode_t 3588 segvn_fault_vnodepages(struct hat *hat, struct seg *seg, caddr_t lpgaddr, 3589 caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr, 3590 caddr_t eaddr, int brkcow) 3591 { 3592 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 3593 struct anon_map *amp = svd->amp; 3594 uchar_t segtype = svd->type; 3595 uint_t szc = seg->s_szc; 3596 size_t pgsz = page_get_pagesize(szc); 3597 size_t maxpgsz = pgsz; 3598 pgcnt_t pages = btop(pgsz); 3599 pgcnt_t maxpages = pages; 3600 size_t ppasize = (pages + 1) * sizeof (page_t *); 3601 caddr_t a = lpgaddr; 3602 caddr_t maxlpgeaddr = lpgeaddr; 3603 u_offset_t off = svd->offset + (uintptr_t)(a - seg->s_base); 3604 ulong_t aindx = svd->anon_index + seg_page(seg, a); 3605 struct vpage *vpage = (svd->vpage != NULL) ? 3606 &svd->vpage[seg_page(seg, a)] : NULL; 3607 vnode_t *vp = svd->vp; 3608 page_t **ppa; 3609 uint_t pszc; 3610 size_t ppgsz; 3611 pgcnt_t ppages; 3612 faultcode_t err = 0; 3613 int ierr; 3614 int vop_size_err = 0; 3615 uint_t protchk, prot, vpprot; 3616 ulong_t i; 3617 int hat_flag = (type == F_SOFTLOCK) ? 
HAT_LOAD_LOCK : HAT_LOAD; 3618 anon_sync_obj_t an_cookie; 3619 enum seg_rw arw; 3620 int alloc_failed = 0; 3621 int adjszc_chk; 3622 struct vattr va; 3623 int xhat = 0; 3624 page_t *pplist; 3625 pfn_t pfn; 3626 int physcontig; 3627 int upgrdfail; 3628 int segvn_anypgsz_vnode = 0; /* for now map vnode with 2 page sizes */ 3629 int tron = (svd->tr_state == SEGVN_TR_ON); 3630 3631 ASSERT(szc != 0); 3632 ASSERT(vp != NULL); 3633 ASSERT(brkcow == 0 || amp != NULL); 3634 ASSERT(tron == 0 || amp != NULL); 3635 ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */ 3636 ASSERT(!(svd->flags & MAP_NORESERVE)); 3637 ASSERT(type != F_SOFTUNLOCK); 3638 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 3639 ASSERT(amp == NULL || IS_P2ALIGNED(aindx, maxpages)); 3640 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 3641 ASSERT(seg->s_szc < NBBY * sizeof (int)); 3642 ASSERT(type != F_SOFTLOCK || lpgeaddr - a == maxpgsz); 3643 3644 VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltvnpages[0]); 3645 VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltvnpages[1]); 3646 3647 if (svd->flags & MAP_TEXT) { 3648 hat_flag |= HAT_LOAD_TEXT; 3649 } 3650 3651 if (svd->pageprot) { 3652 switch (rw) { 3653 case S_READ: 3654 protchk = PROT_READ; 3655 break; 3656 case S_WRITE: 3657 protchk = PROT_WRITE; 3658 break; 3659 case S_EXEC: 3660 protchk = PROT_EXEC; 3661 break; 3662 case S_OTHER: 3663 default: 3664 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 3665 break; 3666 } 3667 } else { 3668 prot = svd->prot; 3669 /* caller has already done segment level protection check. */ 3670 } 3671 3672 if (seg->s_as->a_hat != hat) { 3673 xhat = 1; 3674 } 3675 3676 if (rw == S_WRITE && segtype == MAP_PRIVATE) { 3677 SEGVN_VMSTAT_FLTVNPAGES(2); 3678 arw = S_READ; 3679 } else { 3680 arw = rw; 3681 } 3682 3683 ppa = kmem_alloc(ppasize, KM_SLEEP); 3684 3685 VM_STAT_COND_ADD(amp != NULL, segvnvmstats.fltvnpages[3]); 3686 3687 for (;;) { 3688 adjszc_chk = 0; 3689 for (; a < lpgeaddr; a += pgsz, off += pgsz, aindx += pages) { 3690 if (adjszc_chk) { 3691 while (szc < seg->s_szc) { 3692 uintptr_t e; 3693 uint_t tszc; 3694 tszc = segvn_anypgsz_vnode ? 
szc + 1 : 3695 seg->s_szc; 3696 ppgsz = page_get_pagesize(tszc); 3697 if (!IS_P2ALIGNED(a, ppgsz) || 3698 ((alloc_failed >> tszc) & 3699 0x1)) { 3700 break; 3701 } 3702 SEGVN_VMSTAT_FLTVNPAGES(4); 3703 szc = tszc; 3704 pgsz = ppgsz; 3705 pages = btop(pgsz); 3706 e = P2ROUNDUP((uintptr_t)eaddr, pgsz); 3707 lpgeaddr = (caddr_t)e; 3708 } 3709 } 3710 3711 again: 3712 if (IS_P2ALIGNED(a, maxpgsz) && amp != NULL) { 3713 ASSERT(IS_P2ALIGNED(aindx, maxpages)); 3714 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 3715 anon_array_enter(amp, aindx, &an_cookie); 3716 if (anon_get_ptr(amp->ahp, aindx) != NULL) { 3717 SEGVN_VMSTAT_FLTVNPAGES(5); 3718 ASSERT(anon_pages(amp->ahp, aindx, 3719 maxpages) == maxpages); 3720 anon_array_exit(&an_cookie); 3721 ANON_LOCK_EXIT(&->a_rwlock); 3722 err = segvn_fault_anonpages(hat, seg, 3723 a, a + maxpgsz, type, rw, 3724 MAX(a, addr), 3725 MIN(a + maxpgsz, eaddr), brkcow); 3726 if (err != 0) { 3727 SEGVN_VMSTAT_FLTVNPAGES(6); 3728 goto out; 3729 } 3730 if (szc < seg->s_szc) { 3731 szc = seg->s_szc; 3732 pgsz = maxpgsz; 3733 pages = maxpages; 3734 lpgeaddr = maxlpgeaddr; 3735 } 3736 goto next; 3737 } else { 3738 ASSERT(anon_pages(amp->ahp, aindx, 3739 maxpages) == 0); 3740 SEGVN_VMSTAT_FLTVNPAGES(7); 3741 anon_array_exit(&an_cookie); 3742 ANON_LOCK_EXIT(&->a_rwlock); 3743 } 3744 } 3745 ASSERT(!brkcow || IS_P2ALIGNED(a, maxpgsz)); 3746 ASSERT(!tron || IS_P2ALIGNED(a, maxpgsz)); 3747 3748 if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) { 3749 ASSERT(vpage != NULL); 3750 prot = VPP_PROT(vpage); 3751 ASSERT(sameprot(seg, a, maxpgsz)); 3752 if ((prot & protchk) == 0) { 3753 SEGVN_VMSTAT_FLTVNPAGES(8); 3754 err = FC_PROT; 3755 goto out; 3756 } 3757 } 3758 if (type == F_SOFTLOCK) { 3759 mutex_enter(&freemem_lock); 3760 if (availrmem < tune.t_minarmem + pages) { 3761 mutex_exit(&freemem_lock); 3762 err = FC_MAKE_ERR(ENOMEM); 3763 goto out; 3764 } else { 3765 availrmem -= pages; 3766 segvn_pages_locked += pages; 3767 svd->softlockcnt += pages; 3768 } 3769 mutex_exit(&freemem_lock); 3770 } 3771 3772 pplist = NULL; 3773 physcontig = 0; 3774 ppa[0] = NULL; 3775 if (!brkcow && !tron && szc && 3776 !page_exists_physcontig(vp, off, szc, 3777 segtype == MAP_PRIVATE ? 
ppa : NULL)) { 3778 SEGVN_VMSTAT_FLTVNPAGES(9); 3779 if (page_alloc_pages(vp, seg, a, &pplist, NULL, 3780 szc, 0, 0) && type != F_SOFTLOCK) { 3781 SEGVN_VMSTAT_FLTVNPAGES(10); 3782 pszc = 0; 3783 ierr = -1; 3784 alloc_failed |= (1 << szc); 3785 break; 3786 } 3787 if (pplist != NULL && 3788 vp->v_mpssdata == SEGVN_PAGEIO) { 3789 int downsize; 3790 SEGVN_VMSTAT_FLTVNPAGES(11); 3791 physcontig = segvn_fill_vp_pages(svd, 3792 vp, off, szc, ppa, &pplist, 3793 &pszc, &downsize); 3794 ASSERT(!physcontig || pplist == NULL); 3795 if (!physcontig && downsize && 3796 type != F_SOFTLOCK) { 3797 ASSERT(pplist == NULL); 3798 SEGVN_VMSTAT_FLTVNPAGES(12); 3799 ierr = -1; 3800 break; 3801 } 3802 ASSERT(!physcontig || 3803 segtype == MAP_PRIVATE || 3804 ppa[0] == NULL); 3805 if (physcontig && ppa[0] == NULL) { 3806 physcontig = 0; 3807 } 3808 } 3809 } else if (!brkcow && !tron && szc && ppa[0] != NULL) { 3810 SEGVN_VMSTAT_FLTVNPAGES(13); 3811 ASSERT(segtype == MAP_PRIVATE); 3812 physcontig = 1; 3813 } 3814 3815 if (!physcontig) { 3816 SEGVN_VMSTAT_FLTVNPAGES(14); 3817 ppa[0] = NULL; 3818 ierr = VOP_GETPAGE(vp, (offset_t)off, pgsz, 3819 &vpprot, ppa, pgsz, seg, a, arw, 3820 svd->cred); 3821 #ifdef DEBUG 3822 if (ierr == 0) { 3823 for (i = 0; i < pages; i++) { 3824 ASSERT(PAGE_LOCKED(ppa[i])); 3825 ASSERT(!PP_ISFREE(ppa[i])); 3826 ASSERT(ppa[i]->p_vnode == vp); 3827 ASSERT(ppa[i]->p_offset == 3828 off + (i << PAGESHIFT)); 3829 } 3830 } 3831 #endif /* DEBUG */ 3832 if (segtype == MAP_PRIVATE) { 3833 SEGVN_VMSTAT_FLTVNPAGES(15); 3834 vpprot &= ~PROT_WRITE; 3835 } 3836 } else { 3837 ASSERT(segtype == MAP_PRIVATE); 3838 SEGVN_VMSTAT_FLTVNPAGES(16); 3839 vpprot = PROT_ALL & ~PROT_WRITE; 3840 ierr = 0; 3841 } 3842 3843 if (ierr != 0) { 3844 SEGVN_VMSTAT_FLTVNPAGES(17); 3845 if (pplist != NULL) { 3846 SEGVN_VMSTAT_FLTVNPAGES(18); 3847 page_free_replacement_page(pplist); 3848 page_create_putback(pages); 3849 } 3850 SEGVN_RESTORE_SOFTLOCK(type, pages); 3851 if (a + pgsz <= eaddr) { 3852 SEGVN_VMSTAT_FLTVNPAGES(19); 3853 err = FC_MAKE_ERR(ierr); 3854 goto out; 3855 } 3856 va.va_mask = AT_SIZE; 3857 if (VOP_GETATTR(vp, &va, 0, svd->cred) != 0) { 3858 SEGVN_VMSTAT_FLTVNPAGES(20); 3859 err = FC_MAKE_ERR(EIO); 3860 goto out; 3861 } 3862 if (btopr(va.va_size) >= btopr(off + pgsz)) { 3863 SEGVN_VMSTAT_FLTVNPAGES(21); 3864 err = FC_MAKE_ERR(ierr); 3865 goto out; 3866 } 3867 if (btopr(va.va_size) < 3868 btopr(off + (eaddr - a))) { 3869 SEGVN_VMSTAT_FLTVNPAGES(22); 3870 err = FC_MAKE_ERR(ierr); 3871 goto out; 3872 } 3873 if (brkcow || tron || type == F_SOFTLOCK) { 3874 /* can't reduce map area */ 3875 SEGVN_VMSTAT_FLTVNPAGES(23); 3876 vop_size_err = 1; 3877 goto out; 3878 } 3879 SEGVN_VMSTAT_FLTVNPAGES(24); 3880 ASSERT(szc != 0); 3881 pszc = 0; 3882 ierr = -1; 3883 break; 3884 } 3885 3886 if (amp != NULL) { 3887 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 3888 anon_array_enter(amp, aindx, &an_cookie); 3889 } 3890 if (amp != NULL && 3891 anon_get_ptr(amp->ahp, aindx) != NULL) { 3892 ulong_t taindx = P2ALIGN(aindx, maxpages); 3893 3894 SEGVN_VMSTAT_FLTVNPAGES(25); 3895 ASSERT(anon_pages(amp->ahp, taindx, 3896 maxpages) == maxpages); 3897 for (i = 0; i < pages; i++) { 3898 page_unlock(ppa[i]); 3899 } 3900 anon_array_exit(&an_cookie); 3901 ANON_LOCK_EXIT(&->a_rwlock); 3902 if (pplist != NULL) { 3903 page_free_replacement_page(pplist); 3904 page_create_putback(pages); 3905 } 3906 SEGVN_RESTORE_SOFTLOCK(type, pages); 3907 if (szc < seg->s_szc) { 3908 SEGVN_VMSTAT_FLTVNPAGES(26); 3909 /* 3910 * For private segments SOFTLOCK 3911 * either 
always breaks cow (any rw 3912 * type except S_READ_NOCOW) or 3913 * address space is locked as writer 3914 * (S_READ_NOCOW case) and anon slots 3915 * can't show up on second check. 3916 * Therefore if we are here for 3917 * SOFTLOCK case it must be a cow 3918 * break but cow break never reduces 3919 * szc. text replication (tron) in 3920 * this case works as cow break. 3921 * Thus the assert below. 3922 */ 3923 ASSERT(!brkcow && !tron && 3924 type != F_SOFTLOCK); 3925 pszc = seg->s_szc; 3926 ierr = -2; 3927 break; 3928 } 3929 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 3930 goto again; 3931 } 3932 #ifdef DEBUG 3933 if (amp != NULL) { 3934 ulong_t taindx = P2ALIGN(aindx, maxpages); 3935 ASSERT(!anon_pages(amp->ahp, taindx, maxpages)); 3936 } 3937 #endif /* DEBUG */ 3938 3939 if (brkcow || tron) { 3940 ASSERT(amp != NULL); 3941 ASSERT(pplist == NULL); 3942 ASSERT(szc == seg->s_szc); 3943 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 3944 ASSERT(IS_P2ALIGNED(aindx, maxpages)); 3945 SEGVN_VMSTAT_FLTVNPAGES(27); 3946 ierr = anon_map_privatepages(amp, aindx, szc, 3947 seg, a, prot, ppa, vpage, segvn_anypgsz, 3948 tron ? PG_LOCAL : 0, svd->cred); 3949 if (ierr != 0) { 3950 SEGVN_VMSTAT_FLTVNPAGES(28); 3951 anon_array_exit(&an_cookie); 3952 ANON_LOCK_EXIT(&->a_rwlock); 3953 SEGVN_RESTORE_SOFTLOCK(type, pages); 3954 err = FC_MAKE_ERR(ierr); 3955 goto out; 3956 } 3957 3958 ASSERT(!IS_VMODSORT(ppa[0]->p_vnode)); 3959 /* 3960 * p_szc can't be changed for locked 3961 * swapfs pages. 3962 */ 3963 hat_memload_array(hat, a, pgsz, ppa, prot, 3964 hat_flag); 3965 3966 if (!(hat_flag & HAT_LOAD_LOCK)) { 3967 SEGVN_VMSTAT_FLTVNPAGES(29); 3968 for (i = 0; i < pages; i++) { 3969 page_unlock(ppa[i]); 3970 } 3971 } 3972 anon_array_exit(&an_cookie); 3973 ANON_LOCK_EXIT(&->a_rwlock); 3974 goto next; 3975 } 3976 3977 pfn = page_pptonum(ppa[0]); 3978 /* 3979 * hat_page_demote() needs an EXCl lock on one of 3980 * constituent page_t's and it decreases root's p_szc 3981 * last. This means if root's p_szc is equal szc and 3982 * all its constituent pages are locked 3983 * hat_page_demote() that could have changed p_szc to 3984 * szc is already done and no new have page_demote() 3985 * can start for this large page. 3986 */ 3987 3988 /* 3989 * we need to make sure same mapping size is used for 3990 * the same address range if there's a possibility the 3991 * adddress is already mapped because hat layer panics 3992 * when translation is loaded for the range already 3993 * mapped with a different page size. We achieve it 3994 * by always using largest page size possible subject 3995 * to the constraints of page size, segment page size 3996 * and page alignment. Since mappings are invalidated 3997 * when those constraints change and make it 3998 * impossible to use previously used mapping size no 3999 * mapping size conflicts should happen. 4000 */ 4001 4002 chkszc: 4003 if ((pszc = ppa[0]->p_szc) == szc && 4004 IS_P2ALIGNED(pfn, pages)) { 4005 4006 SEGVN_VMSTAT_FLTVNPAGES(30); 4007 #ifdef DEBUG 4008 for (i = 0; i < pages; i++) { 4009 ASSERT(PAGE_LOCKED(ppa[i])); 4010 ASSERT(!PP_ISFREE(ppa[i])); 4011 ASSERT(page_pptonum(ppa[i]) == 4012 pfn + i); 4013 ASSERT(ppa[i]->p_szc == szc); 4014 ASSERT(ppa[i]->p_vnode == vp); 4015 ASSERT(ppa[i]->p_offset == 4016 off + (i << PAGESHIFT)); 4017 } 4018 #endif /* DEBUG */ 4019 /* 4020 * All pages are of szc we need and they are 4021 * all locked so they can't change szc. load 4022 * translations. 4023 * 4024 * if page got promoted since last check 4025 * we don't need pplist. 
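 *
 * In that case the preallocated replacement list obtained earlier from
 * page_alloc_pages() is simply handed back below via
 * page_free_replacement_page()/page_create_putback(pages).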
4026 */ 4027 if (pplist != NULL) { 4028 page_free_replacement_page(pplist); 4029 page_create_putback(pages); 4030 } 4031 if (PP_ISMIGRATE(ppa[0])) { 4032 page_migrate(seg, a, ppa, pages); 4033 } 4034 SEGVN_UPDATE_MODBITS(ppa, pages, rw, 4035 prot, vpprot); 4036 if (!xhat) { 4037 hat_memload_array(hat, a, pgsz, ppa, 4038 prot & vpprot, hat_flag); 4039 } else { 4040 /* 4041 * avoid large xhat mappings to FS 4042 * pages so that hat_page_demote() 4043 * doesn't need to check for xhat 4044 * large mappings. 4045 */ 4046 for (i = 0; i < pages; i++) { 4047 hat_memload(hat, 4048 a + (i << PAGESHIFT), 4049 ppa[i], prot & vpprot, 4050 hat_flag); 4051 } 4052 } 4053 4054 if (!(hat_flag & HAT_LOAD_LOCK)) { 4055 for (i = 0; i < pages; i++) { 4056 page_unlock(ppa[i]); 4057 } 4058 } 4059 if (amp != NULL) { 4060 anon_array_exit(&an_cookie); 4061 ANON_LOCK_EXIT(&->a_rwlock); 4062 } 4063 goto next; 4064 } 4065 4066 /* 4067 * See if upsize is possible. 4068 */ 4069 if (pszc > szc && szc < seg->s_szc && 4070 (segvn_anypgsz_vnode || pszc >= seg->s_szc)) { 4071 pgcnt_t aphase; 4072 uint_t pszc1 = MIN(pszc, seg->s_szc); 4073 ppgsz = page_get_pagesize(pszc1); 4074 ppages = btop(ppgsz); 4075 aphase = btop(P2PHASE((uintptr_t)a, ppgsz)); 4076 4077 ASSERT(type != F_SOFTLOCK); 4078 4079 SEGVN_VMSTAT_FLTVNPAGES(31); 4080 if (aphase != P2PHASE(pfn, ppages)) { 4081 segvn_faultvnmpss_align_err4++; 4082 } else { 4083 SEGVN_VMSTAT_FLTVNPAGES(32); 4084 if (pplist != NULL) { 4085 page_t *pl = pplist; 4086 page_free_replacement_page(pl); 4087 page_create_putback(pages); 4088 } 4089 for (i = 0; i < pages; i++) { 4090 page_unlock(ppa[i]); 4091 } 4092 if (amp != NULL) { 4093 anon_array_exit(&an_cookie); 4094 ANON_LOCK_EXIT(&->a_rwlock); 4095 } 4096 pszc = pszc1; 4097 ierr = -2; 4098 break; 4099 } 4100 } 4101 4102 /* 4103 * check if we should use smallest mapping size. 4104 */ 4105 upgrdfail = 0; 4106 if (szc == 0 || xhat || 4107 (pszc >= szc && 4108 !IS_P2ALIGNED(pfn, pages)) || 4109 (pszc < szc && 4110 !segvn_full_szcpages(ppa, szc, &upgrdfail, 4111 &pszc))) { 4112 4113 if (upgrdfail && type != F_SOFTLOCK) { 4114 /* 4115 * segvn_full_szcpages failed to lock 4116 * all pages EXCL. Size down. 4117 */ 4118 ASSERT(pszc < szc); 4119 4120 SEGVN_VMSTAT_FLTVNPAGES(33); 4121 4122 if (pplist != NULL) { 4123 page_t *pl = pplist; 4124 page_free_replacement_page(pl); 4125 page_create_putback(pages); 4126 } 4127 4128 for (i = 0; i < pages; i++) { 4129 page_unlock(ppa[i]); 4130 } 4131 if (amp != NULL) { 4132 anon_array_exit(&an_cookie); 4133 ANON_LOCK_EXIT(&->a_rwlock); 4134 } 4135 ierr = -1; 4136 break; 4137 } 4138 if (szc != 0 && !xhat && !upgrdfail) { 4139 segvn_faultvnmpss_align_err5++; 4140 } 4141 SEGVN_VMSTAT_FLTVNPAGES(34); 4142 if (pplist != NULL) { 4143 page_free_replacement_page(pplist); 4144 page_create_putback(pages); 4145 } 4146 SEGVN_UPDATE_MODBITS(ppa, pages, rw, 4147 prot, vpprot); 4148 if (upgrdfail && segvn_anypgsz_vnode) { 4149 /* SOFTLOCK case */ 4150 hat_memload_array(hat, a, pgsz, 4151 ppa, prot & vpprot, hat_flag); 4152 } else { 4153 for (i = 0; i < pages; i++) { 4154 hat_memload(hat, 4155 a + (i << PAGESHIFT), 4156 ppa[i], prot & vpprot, 4157 hat_flag); 4158 } 4159 } 4160 if (!(hat_flag & HAT_LOAD_LOCK)) { 4161 for (i = 0; i < pages; i++) { 4162 page_unlock(ppa[i]); 4163 } 4164 } 4165 if (amp != NULL) { 4166 anon_array_exit(&an_cookie); 4167 ANON_LOCK_EXIT(&->a_rwlock); 4168 } 4169 goto next; 4170 } 4171 4172 if (pszc == szc) { 4173 /* 4174 * segvn_full_szcpages() upgraded pages szc. 
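 *
 * With pszc now equal to szc and the constituent pages still locked,
 * the jump back to chkszc normally falls straight into the aligned
 * large page path and loads the mapping with hat_memload_array().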
4175 */ 4176 ASSERT(pszc == ppa[0]->p_szc); 4177 ASSERT(IS_P2ALIGNED(pfn, pages)); 4178 goto chkszc; 4179 } 4180 4181 if (pszc > szc) { 4182 kmutex_t *szcmtx; 4183 SEGVN_VMSTAT_FLTVNPAGES(35); 4184 /* 4185 * p_szc of ppa[0] can change since we haven't 4186 * locked all constituent pages. Call 4187 * page_lock_szc() to prevent szc changes. 4188 * This should be a rare case that happens when 4189 * multiple segments use a different page size 4190 * to map the same file offsets. 4191 */ 4192 szcmtx = page_szc_lock(ppa[0]); 4193 pszc = ppa[0]->p_szc; 4194 ASSERT(szcmtx != NULL || pszc == 0); 4195 ASSERT(ppa[0]->p_szc <= pszc); 4196 if (pszc <= szc) { 4197 SEGVN_VMSTAT_FLTVNPAGES(36); 4198 if (szcmtx != NULL) { 4199 mutex_exit(szcmtx); 4200 } 4201 goto chkszc; 4202 } 4203 if (pplist != NULL) { 4204 /* 4205 * page got promoted since last check. 4206 * we don't need preaalocated large 4207 * page. 4208 */ 4209 SEGVN_VMSTAT_FLTVNPAGES(37); 4210 page_free_replacement_page(pplist); 4211 page_create_putback(pages); 4212 } 4213 SEGVN_UPDATE_MODBITS(ppa, pages, rw, 4214 prot, vpprot); 4215 hat_memload_array(hat, a, pgsz, ppa, 4216 prot & vpprot, hat_flag); 4217 mutex_exit(szcmtx); 4218 if (!(hat_flag & HAT_LOAD_LOCK)) { 4219 for (i = 0; i < pages; i++) { 4220 page_unlock(ppa[i]); 4221 } 4222 } 4223 if (amp != NULL) { 4224 anon_array_exit(&an_cookie); 4225 ANON_LOCK_EXIT(&->a_rwlock); 4226 } 4227 goto next; 4228 } 4229 4230 /* 4231 * if page got demoted since last check 4232 * we could have not allocated larger page. 4233 * allocate now. 4234 */ 4235 if (pplist == NULL && 4236 page_alloc_pages(vp, seg, a, &pplist, NULL, 4237 szc, 0, 0) && type != F_SOFTLOCK) { 4238 SEGVN_VMSTAT_FLTVNPAGES(38); 4239 for (i = 0; i < pages; i++) { 4240 page_unlock(ppa[i]); 4241 } 4242 if (amp != NULL) { 4243 anon_array_exit(&an_cookie); 4244 ANON_LOCK_EXIT(&->a_rwlock); 4245 } 4246 ierr = -1; 4247 alloc_failed |= (1 << szc); 4248 break; 4249 } 4250 4251 SEGVN_VMSTAT_FLTVNPAGES(39); 4252 4253 if (pplist != NULL) { 4254 segvn_relocate_pages(ppa, pplist); 4255 #ifdef DEBUG 4256 } else { 4257 ASSERT(type == F_SOFTLOCK); 4258 SEGVN_VMSTAT_FLTVNPAGES(40); 4259 #endif /* DEBUG */ 4260 } 4261 4262 SEGVN_UPDATE_MODBITS(ppa, pages, rw, prot, vpprot); 4263 4264 if (pplist == NULL && segvn_anypgsz_vnode == 0) { 4265 ASSERT(type == F_SOFTLOCK); 4266 for (i = 0; i < pages; i++) { 4267 ASSERT(ppa[i]->p_szc < szc); 4268 hat_memload(hat, a + (i << PAGESHIFT), 4269 ppa[i], prot & vpprot, hat_flag); 4270 } 4271 } else { 4272 ASSERT(pplist != NULL || type == F_SOFTLOCK); 4273 hat_memload_array(hat, a, pgsz, ppa, 4274 prot & vpprot, hat_flag); 4275 } 4276 if (!(hat_flag & HAT_LOAD_LOCK)) { 4277 for (i = 0; i < pages; i++) { 4278 ASSERT(PAGE_SHARED(ppa[i])); 4279 page_unlock(ppa[i]); 4280 } 4281 } 4282 if (amp != NULL) { 4283 anon_array_exit(&an_cookie); 4284 ANON_LOCK_EXIT(&->a_rwlock); 4285 } 4286 4287 next: 4288 if (vpage != NULL) { 4289 vpage += pages; 4290 } 4291 adjszc_chk = 1; 4292 } 4293 if (a == lpgeaddr) 4294 break; 4295 ASSERT(a < lpgeaddr); 4296 4297 ASSERT(!brkcow && !tron && type != F_SOFTLOCK); 4298 4299 /* 4300 * ierr == -1 means we failed to map with a large page. 4301 * (either due to allocation/relocation failures or 4302 * misalignment with other mappings to this file. 4303 * 4304 * ierr == -2 means some other thread allocated a large page 4305 * after we gave up tp map with a large page. retry with 4306 * larger mapping. 
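 *
 * Rough sketch of the retry protocol below (page sizes here are only
 * for illustration, e.g. sun4u's 8K/64K/512K/4M for szc 0..3): on
 * ierr == -2 szc is raised to the pszc observed on the existing pages
 * and the same range is retried with that larger size; on ierr == -1
 * szc is stepped down by one when segvn_anypgsz_vnode is set,
 * otherwise it drops all the way to 0 and the range is mapped with
 * base pages.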
4307 */ 4308 ASSERT(ierr == -1 || ierr == -2); 4309 ASSERT(ierr == -2 || szc != 0); 4310 ASSERT(ierr == -1 || szc < seg->s_szc); 4311 if (ierr == -2) { 4312 SEGVN_VMSTAT_FLTVNPAGES(41); 4313 ASSERT(pszc > szc && pszc <= seg->s_szc); 4314 szc = pszc; 4315 } else if (segvn_anypgsz_vnode) { 4316 SEGVN_VMSTAT_FLTVNPAGES(42); 4317 szc--; 4318 } else { 4319 SEGVN_VMSTAT_FLTVNPAGES(43); 4320 ASSERT(pszc < szc); 4321 /* 4322 * other process created pszc large page. 4323 * but we still have to drop to 0 szc. 4324 */ 4325 szc = 0; 4326 } 4327 4328 pgsz = page_get_pagesize(szc); 4329 pages = btop(pgsz); 4330 if (ierr == -2) { 4331 /* 4332 * Size up case. Note lpgaddr may only be needed for 4333 * softlock case so we don't adjust it here. 4334 */ 4335 a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz); 4336 ASSERT(a >= lpgaddr); 4337 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4338 off = svd->offset + (uintptr_t)(a - seg->s_base); 4339 aindx = svd->anon_index + seg_page(seg, a); 4340 vpage = (svd->vpage != NULL) ? 4341 &svd->vpage[seg_page(seg, a)] : NULL; 4342 } else { 4343 /* 4344 * Size down case. Note lpgaddr may only be needed for 4345 * softlock case so we don't adjust it here. 4346 */ 4347 ASSERT(IS_P2ALIGNED(a, pgsz)); 4348 ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz)); 4349 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4350 ASSERT(a < lpgeaddr); 4351 if (a < addr) { 4352 SEGVN_VMSTAT_FLTVNPAGES(44); 4353 /* 4354 * The beginning of the large page region can 4355 * be pulled to the right to make a smaller 4356 * region. We haven't yet faulted a single 4357 * page. 4358 */ 4359 a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); 4360 ASSERT(a >= lpgaddr); 4361 off = svd->offset + 4362 (uintptr_t)(a - seg->s_base); 4363 aindx = svd->anon_index + seg_page(seg, a); 4364 vpage = (svd->vpage != NULL) ? 4365 &svd->vpage[seg_page(seg, a)] : NULL; 4366 } 4367 } 4368 } 4369 out: 4370 kmem_free(ppa, ppasize); 4371 if (!err && !vop_size_err) { 4372 SEGVN_VMSTAT_FLTVNPAGES(45); 4373 return (0); 4374 } 4375 if (type == F_SOFTLOCK && a > lpgaddr) { 4376 SEGVN_VMSTAT_FLTVNPAGES(46); 4377 segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER); 4378 } 4379 if (!vop_size_err) { 4380 SEGVN_VMSTAT_FLTVNPAGES(47); 4381 return (err); 4382 } 4383 ASSERT(brkcow || tron || type == F_SOFTLOCK); 4384 /* 4385 * Large page end is mapped beyond the end of file and it's a cow 4386 * fault (can be a text replication induced cow) or softlock so we can't 4387 * reduce the map area. For now just demote the segment. This should 4388 * really only happen if the end of the file changed after the mapping 4389 * was established since when large page segments are created we make 4390 * sure they don't extend beyond the end of the file. 4391 */ 4392 SEGVN_VMSTAT_FLTVNPAGES(48); 4393 4394 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4395 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4396 err = 0; 4397 if (seg->s_szc != 0) { 4398 segvn_fltvnpages_clrszc_cnt++; 4399 ASSERT(svd->softlockcnt == 0); 4400 err = segvn_clrszc(seg); 4401 if (err != 0) { 4402 segvn_fltvnpages_clrszc_err++; 4403 } 4404 } 4405 ASSERT(err || seg->s_szc == 0); 4406 SEGVN_LOCK_DOWNGRADE(seg->s_as, &svd->lock); 4407 /* segvn_fault will do its job as if szc had been zero to begin with */ 4408 return (err == 0 ? IE_RETRY : FC_MAKE_ERR(err)); 4409 } 4410 4411 /* 4412 * This routine will attempt to fault in one large page. 4413 * it will use smaller pages if that fails. 4414 * It should only be called for pure anonymous segments. 
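 *
 * It is reached from segvn_fault() only when seg->s_szc != 0 and
 * svd->vp == NULL, with the region already expanded to large page
 * boundaries, roughly:
 *
 *	pgsz = page_get_pagesize(seg->s_szc);
 *	CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr);
 *	err = segvn_fault_anonpages(hat, seg, lpgaddr, lpgeaddr,
 *	    type, rw, addr, addr + len, brkcow);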
4415 */ 4416 static faultcode_t 4417 segvn_fault_anonpages(struct hat *hat, struct seg *seg, caddr_t lpgaddr, 4418 caddr_t lpgeaddr, enum fault_type type, enum seg_rw rw, caddr_t addr, 4419 caddr_t eaddr, int brkcow) 4420 { 4421 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 4422 struct anon_map *amp = svd->amp; 4423 uchar_t segtype = svd->type; 4424 uint_t szc = seg->s_szc; 4425 size_t pgsz = page_get_pagesize(szc); 4426 size_t maxpgsz = pgsz; 4427 pgcnt_t pages = btop(pgsz); 4428 size_t ppasize = pages * sizeof (page_t *); 4429 caddr_t a = lpgaddr; 4430 ulong_t aindx = svd->anon_index + seg_page(seg, a); 4431 struct vpage *vpage = (svd->vpage != NULL) ? 4432 &svd->vpage[seg_page(seg, a)] : NULL; 4433 page_t **ppa; 4434 uint_t ppa_szc; 4435 faultcode_t err; 4436 int ierr; 4437 uint_t protchk, prot, vpprot; 4438 ulong_t i; 4439 int hat_flag = (type == F_SOFTLOCK) ? HAT_LOAD_LOCK : HAT_LOAD; 4440 anon_sync_obj_t cookie; 4441 int first = 1; 4442 int adjszc_chk; 4443 int purged = 0; 4444 int pgflags = (svd->tr_state == SEGVN_TR_ON) ? PG_LOCAL : 0; 4445 4446 ASSERT(szc != 0); 4447 ASSERT(amp != NULL); 4448 ASSERT(enable_mbit_wa == 0); /* no mbit simulations with large pages */ 4449 ASSERT(!(svd->flags & MAP_NORESERVE)); 4450 ASSERT(type != F_SOFTUNLOCK); 4451 ASSERT(IS_P2ALIGNED(a, maxpgsz)); 4452 ASSERT(!brkcow || svd->tr_state == SEGVN_TR_OFF); 4453 4454 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 4455 4456 VM_STAT_COND_ADD(type == F_SOFTLOCK, segvnvmstats.fltanpages[0]); 4457 VM_STAT_COND_ADD(type != F_SOFTLOCK, segvnvmstats.fltanpages[1]); 4458 4459 if (svd->flags & MAP_TEXT) { 4460 hat_flag |= HAT_LOAD_TEXT; 4461 } 4462 4463 if (svd->pageprot) { 4464 switch (rw) { 4465 case S_READ: 4466 protchk = PROT_READ; 4467 break; 4468 case S_WRITE: 4469 protchk = PROT_WRITE; 4470 break; 4471 case S_EXEC: 4472 protchk = PROT_EXEC; 4473 break; 4474 case S_OTHER: 4475 default: 4476 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 4477 break; 4478 } 4479 VM_STAT_ADD(segvnvmstats.fltanpages[2]); 4480 } else { 4481 prot = svd->prot; 4482 /* caller has already done segment level protection check. 
*/ 4483 } 4484 4485 ppa = kmem_alloc(ppasize, KM_SLEEP); 4486 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 4487 for (;;) { 4488 adjszc_chk = 0; 4489 for (; a < lpgeaddr; a += pgsz, aindx += pages) { 4490 if (svd->pageprot != 0 && IS_P2ALIGNED(a, maxpgsz)) { 4491 VM_STAT_ADD(segvnvmstats.fltanpages[3]); 4492 ASSERT(vpage != NULL); 4493 prot = VPP_PROT(vpage); 4494 ASSERT(sameprot(seg, a, maxpgsz)); 4495 if ((prot & protchk) == 0) { 4496 err = FC_PROT; 4497 goto error; 4498 } 4499 } 4500 if (adjszc_chk && IS_P2ALIGNED(a, maxpgsz) && 4501 pgsz < maxpgsz) { 4502 ASSERT(a > lpgaddr); 4503 szc = seg->s_szc; 4504 pgsz = maxpgsz; 4505 pages = btop(pgsz); 4506 ASSERT(IS_P2ALIGNED(aindx, pages)); 4507 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, 4508 pgsz); 4509 } 4510 if (type == F_SOFTLOCK && svd->vp != NULL) { 4511 mutex_enter(&freemem_lock); 4512 if (availrmem < tune.t_minarmem + pages) { 4513 mutex_exit(&freemem_lock); 4514 err = FC_MAKE_ERR(ENOMEM); 4515 goto error; 4516 } else { 4517 availrmem -= pages; 4518 segvn_pages_locked += pages; 4519 svd->softlockcnt += pages; 4520 } 4521 mutex_exit(&freemem_lock); 4522 } 4523 anon_array_enter(amp, aindx, &cookie); 4524 ppa_szc = (uint_t)-1; 4525 ierr = anon_map_getpages(amp, aindx, szc, seg, a, 4526 prot, &vpprot, ppa, &ppa_szc, vpage, rw, brkcow, 4527 segvn_anypgsz, pgflags, svd->cred); 4528 if (ierr != 0) { 4529 anon_array_exit(&cookie); 4530 VM_STAT_ADD(segvnvmstats.fltanpages[4]); 4531 if (type == F_SOFTLOCK && svd->vp != NULL) { 4532 VM_STAT_ADD(segvnvmstats.fltanpages[5]); 4533 mutex_enter(&freemem_lock); 4534 availrmem += pages; 4535 segvn_pages_locked -= pages; 4536 svd->softlockcnt -= pages; 4537 mutex_exit(&freemem_lock); 4538 } 4539 if (ierr > 0) { 4540 VM_STAT_ADD(segvnvmstats.fltanpages[6]); 4541 err = FC_MAKE_ERR(ierr); 4542 goto error; 4543 } 4544 break; 4545 } 4546 4547 ASSERT(!IS_VMODSORT(ppa[0]->p_vnode)); 4548 4549 ASSERT(segtype == MAP_SHARED || 4550 ppa[0]->p_szc <= szc); 4551 ASSERT(segtype == MAP_PRIVATE || 4552 ppa[0]->p_szc >= szc); 4553 4554 /* 4555 * Handle pages that have been marked for migration 4556 */ 4557 if (lgrp_optimizations()) 4558 page_migrate(seg, a, ppa, pages); 4559 4560 if (type == F_SOFTLOCK && svd->vp == NULL) { 4561 /* 4562 * All pages in ppa array belong to the same 4563 * large page. This means it's ok to call 4564 * segvn_pp_lock_anonpages just for ppa[0]. 4565 */ 4566 if (!segvn_pp_lock_anonpages(ppa[0], first)) { 4567 for (i = 0; i < pages; i++) { 4568 page_unlock(ppa[i]); 4569 } 4570 err = FC_MAKE_ERR(ENOMEM); 4571 goto error; 4572 } 4573 first = 0; 4574 mutex_enter(&freemem_lock); 4575 svd->softlockcnt += pages; 4576 segvn_pages_locked += pages; 4577 mutex_exit(&freemem_lock); 4578 } 4579 4580 if (segtype == MAP_SHARED) { 4581 vpprot |= PROT_WRITE; 4582 } 4583 4584 hat_memload_array(hat, a, pgsz, ppa, 4585 prot & vpprot, hat_flag); 4586 4587 if (hat_flag & HAT_LOAD_LOCK) { 4588 VM_STAT_ADD(segvnvmstats.fltanpages[7]); 4589 } else { 4590 VM_STAT_ADD(segvnvmstats.fltanpages[8]); 4591 for (i = 0; i < pages; i++) 4592 page_unlock(ppa[i]); 4593 } 4594 if (vpage != NULL) 4595 vpage += pages; 4596 4597 anon_array_exit(&cookie); 4598 adjszc_chk = 1; 4599 } 4600 if (a == lpgeaddr) 4601 break; 4602 ASSERT(a < lpgeaddr); 4603 /* 4604 * ierr == -1 means we failed to allocate a large page. 4605 * so do a size down operation. 4606 * 4607 * ierr == -2 means some other process that privately shares 4608 * pages with this process has allocated a larger page and we 4609 * need to retry with larger pages. 
So do a size up 4610 * operation. This relies on the fact that large pages are 4611 * never partially shared i.e. if we share any constituent 4612 * page of a large page with another process we must share the 4613 * entire large page. Note this cannot happen for SOFTLOCK 4614 * case, unless current address (a) is at the beginning of the 4615 * next page size boundary because the other process couldn't 4616 * have relocated locked pages. 4617 */ 4618 ASSERT(ierr == -1 || ierr == -2); 4619 /* 4620 * For the very first relocation failure try to purge this 4621 * segment's cache so that the relocator can obtain an 4622 * exclusive lock on pages we want to relocate. 4623 */ 4624 if (!purged && ierr == -1 && ppa_szc != (uint_t)-1 && 4625 svd->softlockcnt != 0) { 4626 purged = 1; 4627 segvn_purge(seg); 4628 continue; 4629 } 4630 4631 if (segvn_anypgsz) { 4632 ASSERT(ierr == -2 || szc != 0); 4633 ASSERT(ierr == -1 || szc < seg->s_szc); 4634 szc = (ierr == -1) ? szc - 1 : szc + 1; 4635 } else { 4636 /* 4637 * For non COW faults and segvn_anypgsz == 0 4638 * we need to be careful not to loop forever 4639 * if existing page is found with szc other 4640 * than 0 or seg->s_szc. This could be due 4641 * to page relocations on behalf of DR or 4642 * more likely large page creation. For this 4643 * case simply re-size to existing page's szc 4644 * if returned by anon_map_getpages(). 4645 */ 4646 if (ppa_szc == (uint_t)-1) { 4647 szc = (ierr == -1) ? 0 : seg->s_szc; 4648 } else { 4649 ASSERT(ppa_szc <= seg->s_szc); 4650 ASSERT(ierr == -2 || ppa_szc < szc); 4651 ASSERT(ierr == -1 || ppa_szc > szc); 4652 szc = ppa_szc; 4653 } 4654 } 4655 4656 pgsz = page_get_pagesize(szc); 4657 pages = btop(pgsz); 4658 ASSERT(type != F_SOFTLOCK || ierr == -1 || 4659 (IS_P2ALIGNED(a, pgsz) && IS_P2ALIGNED(lpgeaddr, pgsz))); 4660 if (type == F_SOFTLOCK) { 4661 /* 4662 * For softlocks we cannot reduce the fault area 4663 * (calculated based on the largest page size for this 4664 * segment) for size down and a is already next 4665 * page size aligned as assertted above for size 4666 * ups. Therefore just continue in case of softlock. 4667 */ 4668 VM_STAT_ADD(segvnvmstats.fltanpages[9]); 4669 continue; /* keep lint happy */ 4670 } else if (ierr == -2) { 4671 4672 /* 4673 * Size up case. Note lpgaddr may only be needed for 4674 * softlock case so we don't adjust it here. 4675 */ 4676 VM_STAT_ADD(segvnvmstats.fltanpages[10]); 4677 a = (caddr_t)P2ALIGN((uintptr_t)a, pgsz); 4678 ASSERT(a >= lpgaddr); 4679 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4680 aindx = svd->anon_index + seg_page(seg, a); 4681 vpage = (svd->vpage != NULL) ? 4682 &svd->vpage[seg_page(seg, a)] : NULL; 4683 } else { 4684 /* 4685 * Size down case. Note lpgaddr may only be needed for 4686 * softlock case so we don't adjust it here. 4687 */ 4688 VM_STAT_ADD(segvnvmstats.fltanpages[11]); 4689 ASSERT(IS_P2ALIGNED(a, pgsz)); 4690 ASSERT(IS_P2ALIGNED(lpgeaddr, pgsz)); 4691 lpgeaddr = (caddr_t)P2ROUNDUP((uintptr_t)eaddr, pgsz); 4692 ASSERT(a < lpgeaddr); 4693 if (a < addr) { 4694 /* 4695 * The beginning of the large page region can 4696 * be pulled to the right to make a smaller 4697 * region. We haven't yet faulted a single 4698 * page. 4699 */ 4700 VM_STAT_ADD(segvnvmstats.fltanpages[12]); 4701 a = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); 4702 ASSERT(a >= lpgaddr); 4703 aindx = svd->anon_index + seg_page(seg, a); 4704 vpage = (svd->vpage != NULL) ? 
4705 &svd->vpage[seg_page(seg, a)] : NULL; 4706 } 4707 } 4708 } 4709 VM_STAT_ADD(segvnvmstats.fltanpages[13]); 4710 ANON_LOCK_EXIT(&amp->a_rwlock); 4711 kmem_free(ppa, ppasize); 4712 return (0); 4713 error: 4714 VM_STAT_ADD(segvnvmstats.fltanpages[14]); 4715 ANON_LOCK_EXIT(&amp->a_rwlock); 4716 kmem_free(ppa, ppasize); 4717 if (type == F_SOFTLOCK && a > lpgaddr) { 4718 VM_STAT_ADD(segvnvmstats.fltanpages[15]); 4719 segvn_softunlock(seg, lpgaddr, a - lpgaddr, S_OTHER); 4720 } 4721 return (err); 4722 } 4723 4724 int fltadvice = 1; /* set to free behind pages for sequential access */ 4725 4726 /* 4727 * This routine is called via a machine specific fault handling routine. 4728 * It is also called by software routines wishing to lock or unlock 4729 * a range of addresses. 4730 * 4731 * Here is the basic algorithm: 4732 * If unlocking 4733 * Call segvn_softunlock 4734 * Return 4735 * endif 4736 * Checking and set up work 4737 * If we will need some non-anonymous pages 4738 * Call VOP_GETPAGE over the range of non-anonymous pages 4739 * endif 4740 * Loop over all addresses requested 4741 * Call segvn_faultpage passing in page list 4742 * to load up translations and handle anonymous pages 4743 * endloop 4744 * Load up translation to any additional pages in page list not 4745 * already handled that fit into this segment 4746 */ 4747 static faultcode_t 4748 segvn_fault(struct hat *hat, struct seg *seg, caddr_t addr, size_t len, 4749 enum fault_type type, enum seg_rw rw) 4750 { 4751 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 4752 page_t **plp, **ppp, *pp; 4753 u_offset_t off; 4754 caddr_t a; 4755 struct vpage *vpage; 4756 uint_t vpprot, prot; 4757 int err; 4758 page_t *pl[PVN_GETPAGE_NUM + 1]; 4759 size_t plsz, pl_alloc_sz; 4760 size_t page; 4761 ulong_t anon_index; 4762 struct anon_map *amp; 4763 int dogetpage = 0; 4764 caddr_t lpgaddr, lpgeaddr; 4765 size_t pgsz; 4766 anon_sync_obj_t cookie; 4767 int brkcow = BREAK_COW_SHARE(rw, type, svd->type); 4768 4769 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 4770 4771 /* 4772 * First handle the easy stuff 4773 */ 4774 if (type == F_SOFTUNLOCK) { 4775 if (rw == S_READ_NOCOW) { 4776 rw = S_READ; 4777 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 4778 } 4779 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 4780 pgsz = (seg->s_szc == 0) ?
PAGESIZE : 4781 page_get_pagesize(seg->s_szc); 4782 VM_STAT_COND_ADD(pgsz > PAGESIZE, segvnvmstats.fltanpages[16]); 4783 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 4784 segvn_softunlock(seg, lpgaddr, lpgeaddr - lpgaddr, rw); 4785 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4786 return (0); 4787 } 4788 4789 if (brkcow == 0) { 4790 if (svd->tr_state == SEGVN_TR_INIT) { 4791 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4792 if (svd->tr_state == SEGVN_TR_INIT) { 4793 ASSERT(svd->vp != NULL && svd->amp == NULL); 4794 ASSERT(svd->flags & MAP_TEXT); 4795 ASSERT(svd->type == MAP_PRIVATE); 4796 segvn_textrepl(seg); 4797 ASSERT(svd->tr_state != SEGVN_TR_INIT); 4798 ASSERT(svd->tr_state != SEGVN_TR_ON || 4799 svd->amp != NULL); 4800 } 4801 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4802 } 4803 } else if (svd->tr_state != SEGVN_TR_OFF) { 4804 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4805 if (svd->tr_state == SEGVN_TR_ON) { 4806 ASSERT(svd->vp != NULL && svd->amp != NULL); 4807 segvn_textunrepl(seg, 0); 4808 ASSERT(svd->amp == NULL && 4809 svd->tr_state == SEGVN_TR_OFF); 4810 } else if (svd->tr_state != SEGVN_TR_OFF) { 4811 svd->tr_state = SEGVN_TR_OFF; 4812 } 4813 ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF); 4814 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4815 } 4816 4817 top: 4818 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 4819 4820 /* 4821 * If we have the same protections for the entire segment, 4822 * insure that the access being attempted is legitimate. 4823 */ 4824 4825 if (svd->pageprot == 0) { 4826 uint_t protchk; 4827 4828 switch (rw) { 4829 case S_READ: 4830 case S_READ_NOCOW: 4831 protchk = PROT_READ; 4832 break; 4833 case S_WRITE: 4834 protchk = PROT_WRITE; 4835 break; 4836 case S_EXEC: 4837 protchk = PROT_EXEC; 4838 break; 4839 case S_OTHER: 4840 default: 4841 protchk = PROT_READ | PROT_WRITE | PROT_EXEC; 4842 break; 4843 } 4844 4845 if ((svd->prot & protchk) == 0) { 4846 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4847 return (FC_PROT); /* illegal access type */ 4848 } 4849 } 4850 4851 /* 4852 * We can't allow the long term use of softlocks for vmpss segments, 4853 * because in some file truncation cases we should be able to demote 4854 * the segment, which requires that there are no softlocks. The 4855 * only case where it's ok to allow a SOFTLOCK fault against a vmpss 4856 * segment is S_READ_NOCOW, where the caller holds the address space 4857 * locked as writer and calls softunlock before dropping the as lock. 4858 * S_READ_NOCOW is used by /proc to read memory from another user. 4859 * 4860 * Another deadlock between SOFTLOCK and file truncation can happen 4861 * because segvn_fault_vnodepages() calls the FS one pagesize at 4862 * a time. A second VOP_GETPAGE() call by segvn_fault_vnodepages() 4863 * can cause a deadlock because the first set of page_t's remain 4864 * locked SE_SHARED. To avoid this, we demote segments on a first 4865 * SOFTLOCK if they have a length greater than the segment's 4866 * page size. 4867 * 4868 * So for now, we only avoid demoting a segment on a SOFTLOCK when 4869 * the access type is S_READ_NOCOW and the fault length is less than 4870 * or equal to the segment's page size. While this is quite restrictive, 4871 * it should be the most common case of SOFTLOCK against a vmpss 4872 * segment. 4873 * 4874 * For S_READ_NOCOW, it's safe not to do a copy on write because the 4875 * caller makes sure no COW will be caused by another thread for a 4876 * softlocked page. 
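 *
 * In short, the check below demotes a large page vnode segment on
 * F_SOFTLOCK unless rw == S_READ_NOCOW and the region computed by
 * CALC_LPG_REGION() spans no more than one large page
 * (lpgeaddr - lpgaddr <= pgsz).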
4877 */ 4878 if (type == F_SOFTLOCK && svd->vp != NULL && seg->s_szc != 0) { 4879 int demote = 0; 4880 4881 if (rw != S_READ_NOCOW) { 4882 demote = 1; 4883 } 4884 if (!demote && len > PAGESIZE) { 4885 pgsz = page_get_pagesize(seg->s_szc); 4886 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, 4887 lpgeaddr); 4888 if (lpgeaddr - lpgaddr > pgsz) { 4889 demote = 1; 4890 } 4891 } 4892 4893 ASSERT(demote || AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 4894 4895 if (demote) { 4896 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4897 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4898 if (seg->s_szc != 0) { 4899 segvn_vmpss_clrszc_cnt++; 4900 ASSERT(svd->softlockcnt == 0); 4901 err = segvn_clrszc(seg); 4902 if (err) { 4903 segvn_vmpss_clrszc_err++; 4904 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4905 return (FC_MAKE_ERR(err)); 4906 } 4907 } 4908 ASSERT(seg->s_szc == 0); 4909 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4910 goto top; 4911 } 4912 } 4913 4914 /* 4915 * Check to see if we need to allocate an anon_map structure. 4916 */ 4917 if (svd->amp == NULL && (svd->vp == NULL || brkcow)) { 4918 /* 4919 * Drop the "read" lock on the segment and acquire 4920 * the "write" version since we have to allocate the 4921 * anon_map. 4922 */ 4923 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4924 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 4925 4926 if (svd->amp == NULL) { 4927 svd->amp = anonmap_alloc(seg->s_size, 0, ANON_SLEEP); 4928 svd->amp->a_szc = seg->s_szc; 4929 } 4930 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4931 4932 /* 4933 * Start all over again since segment protections 4934 * may have changed after we dropped the "read" lock. 4935 */ 4936 goto top; 4937 } 4938 4939 /* 4940 * S_READ_NOCOW vs S_READ distinction was 4941 * only needed for the code above. After 4942 * that we treat it as S_READ. 4943 */ 4944 if (rw == S_READ_NOCOW) { 4945 ASSERT(type == F_SOFTLOCK); 4946 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 4947 rw = S_READ; 4948 } 4949 4950 amp = svd->amp; 4951 4952 /* 4953 * MADV_SEQUENTIAL work is ignored for large page segments. 4954 */ 4955 if (seg->s_szc != 0) { 4956 pgsz = page_get_pagesize(seg->s_szc); 4957 ASSERT(SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 4958 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 4959 if (svd->vp == NULL) { 4960 err = segvn_fault_anonpages(hat, seg, lpgaddr, 4961 lpgeaddr, type, rw, addr, addr + len, brkcow); 4962 } else { 4963 err = segvn_fault_vnodepages(hat, seg, lpgaddr, 4964 lpgeaddr, type, rw, addr, addr + len, brkcow); 4965 if (err == IE_RETRY) { 4966 ASSERT(seg->s_szc == 0); 4967 ASSERT(SEGVN_READ_HELD(seg->s_as, &svd->lock)); 4968 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4969 goto top; 4970 } 4971 } 4972 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 4973 return (err); 4974 } 4975 4976 page = seg_page(seg, addr); 4977 if (amp != NULL) { 4978 anon_index = svd->anon_index + page; 4979 4980 if (type == F_PROT && rw == S_READ && 4981 svd->tr_state == SEGVN_TR_OFF && 4982 svd->type == MAP_PRIVATE && svd->pageprot == 0) { 4983 size_t index = anon_index; 4984 struct anon *ap; 4985 4986 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 4987 /* 4988 * The fast path could apply to S_WRITE also, except 4989 * that the protection fault could be caused by lazy 4990 * tlb flush when ro->rw. In this case, the pte is 4991 * RW already. But RO in the other cpu's tlb causes 4992 * the fault. Since hat_chgprot won't do anything if 4993 * pte doesn't change, we may end up faulting 4994 * indefinitely until the RO tlb entry gets replaced. 
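 *
 * The loop below therefore takes the fast path only when every page in
 * the faulting range already has its own anon slot with
 * an_refcnt == 1; in that case a single hat_chgprot() call refreshes
 * the protections and no pages have to be looked up or locked.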
4995 */ 4996 for (a = addr; a < addr + len; a += PAGESIZE, index++) { 4997 anon_array_enter(amp, index, &cookie); 4998 ap = anon_get_ptr(amp->ahp, index); 4999 anon_array_exit(&cookie); 5000 if ((ap == NULL) || (ap->an_refcnt != 1)) { 5001 ANON_LOCK_EXIT(&->a_rwlock); 5002 goto slow; 5003 } 5004 } 5005 hat_chgprot(seg->s_as->a_hat, addr, len, svd->prot); 5006 ANON_LOCK_EXIT(&->a_rwlock); 5007 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5008 return (0); 5009 } 5010 } 5011 slow: 5012 5013 if (svd->vpage == NULL) 5014 vpage = NULL; 5015 else 5016 vpage = &svd->vpage[page]; 5017 5018 off = svd->offset + (uintptr_t)(addr - seg->s_base); 5019 5020 /* 5021 * If MADV_SEQUENTIAL has been set for the particular page we 5022 * are faulting on, free behind all pages in the segment and put 5023 * them on the free list. 5024 */ 5025 5026 if ((page != 0) && fltadvice && svd->tr_state != SEGVN_TR_ON) { 5027 struct vpage *vpp; 5028 ulong_t fanon_index; 5029 size_t fpage; 5030 u_offset_t pgoff, fpgoff; 5031 struct vnode *fvp; 5032 struct anon *fap = NULL; 5033 5034 if (svd->advice == MADV_SEQUENTIAL || 5035 (svd->pageadvice && 5036 VPP_ADVICE(vpage) == MADV_SEQUENTIAL)) { 5037 pgoff = off - PAGESIZE; 5038 fpage = page - 1; 5039 if (vpage != NULL) 5040 vpp = &svd->vpage[fpage]; 5041 if (amp != NULL) 5042 fanon_index = svd->anon_index + fpage; 5043 5044 while (pgoff > svd->offset) { 5045 if (svd->advice != MADV_SEQUENTIAL && 5046 (!svd->pageadvice || (vpage && 5047 VPP_ADVICE(vpp) != MADV_SEQUENTIAL))) 5048 break; 5049 5050 /* 5051 * If this is an anon page, we must find the 5052 * correct <vp, offset> for it 5053 */ 5054 fap = NULL; 5055 if (amp != NULL) { 5056 ANON_LOCK_ENTER(&->a_rwlock, 5057 RW_READER); 5058 anon_array_enter(amp, fanon_index, 5059 &cookie); 5060 fap = anon_get_ptr(amp->ahp, 5061 fanon_index); 5062 if (fap != NULL) { 5063 swap_xlate(fap, &fvp, &fpgoff); 5064 } else { 5065 fpgoff = pgoff; 5066 fvp = svd->vp; 5067 } 5068 anon_array_exit(&cookie); 5069 ANON_LOCK_EXIT(&->a_rwlock); 5070 } else { 5071 fpgoff = pgoff; 5072 fvp = svd->vp; 5073 } 5074 if (fvp == NULL) 5075 break; /* XXX */ 5076 /* 5077 * Skip pages that are free or have an 5078 * "exclusive" lock. 5079 */ 5080 pp = page_lookup_nowait(fvp, fpgoff, SE_SHARED); 5081 if (pp == NULL) 5082 break; 5083 /* 5084 * We don't need the page_struct_lock to test 5085 * as this is only advisory; even if we 5086 * acquire it someone might race in and lock 5087 * the page after we unlock and before the 5088 * PUTPAGE, then VOP_PUTPAGE will do nothing. 5089 */ 5090 if (pp->p_lckcnt == 0 && pp->p_cowcnt == 0) { 5091 /* 5092 * Hold the vnode before releasing 5093 * the page lock to prevent it from 5094 * being freed and re-used by some 5095 * other thread. 5096 */ 5097 VN_HOLD(fvp); 5098 page_unlock(pp); 5099 /* 5100 * We should build a page list 5101 * to kluster putpages XXX 5102 */ 5103 (void) VOP_PUTPAGE(fvp, 5104 (offset_t)fpgoff, PAGESIZE, 5105 (B_DONTNEED|B_FREE|B_ASYNC), 5106 svd->cred); 5107 VN_RELE(fvp); 5108 } else { 5109 /* 5110 * XXX - Should the loop terminate if 5111 * the page is `locked'? 5112 */ 5113 page_unlock(pp); 5114 } 5115 --vpp; 5116 --fanon_index; 5117 pgoff -= PAGESIZE; 5118 } 5119 } 5120 } 5121 5122 plp = pl; 5123 *plp = NULL; 5124 pl_alloc_sz = 0; 5125 5126 /* 5127 * See if we need to call VOP_GETPAGE for 5128 * *any* of the range being faulted on. 5129 * We can skip all of this work if there 5130 * was no original vnode. 
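 *
 * When there is a vnode, dogetpage is derived below either from a
 * single anon_get_ptr() check (one page fault) or from non_anon(),
 * which trims vp_off/vp_len to the smallest range that still covers
 * all of the non-anonymous pages.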
5131 */ 5132 if (svd->vp != NULL) { 5133 u_offset_t vp_off; 5134 size_t vp_len; 5135 struct anon *ap; 5136 vnode_t *vp; 5137 5138 vp_off = off; 5139 vp_len = len; 5140 5141 if (amp == NULL) 5142 dogetpage = 1; 5143 else { 5144 /* 5145 * Only acquire reader lock to prevent amp->ahp 5146 * from being changed. It's ok to miss pages, 5147 * hence we don't do anon_array_enter 5148 */ 5149 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5150 ap = anon_get_ptr(amp->ahp, anon_index); 5151 5152 if (len <= PAGESIZE) 5153 /* inline non_anon() */ 5154 dogetpage = (ap == NULL); 5155 else 5156 dogetpage = non_anon(amp->ahp, anon_index, 5157 &vp_off, &vp_len); 5158 ANON_LOCK_EXIT(&->a_rwlock); 5159 } 5160 5161 if (dogetpage) { 5162 enum seg_rw arw; 5163 struct as *as = seg->s_as; 5164 5165 if (len > ptob((sizeof (pl) / sizeof (pl[0])) - 1)) { 5166 /* 5167 * Page list won't fit in local array, 5168 * allocate one of the needed size. 5169 */ 5170 pl_alloc_sz = 5171 (btop(len) + 1) * sizeof (page_t *); 5172 plp = kmem_alloc(pl_alloc_sz, KM_SLEEP); 5173 plp[0] = NULL; 5174 plsz = len; 5175 } else if (rw == S_WRITE && svd->type == MAP_PRIVATE || 5176 svd->tr_state == SEGVN_TR_ON || rw == S_OTHER || 5177 (((size_t)(addr + PAGESIZE) < 5178 (size_t)(seg->s_base + seg->s_size)) && 5179 hat_probe(as->a_hat, addr + PAGESIZE))) { 5180 /* 5181 * Ask VOP_GETPAGE to return the exact number 5182 * of pages if 5183 * (a) this is a COW fault, or 5184 * (b) this is a software fault, or 5185 * (c) next page is already mapped. 5186 */ 5187 plsz = len; 5188 } else { 5189 /* 5190 * Ask VOP_GETPAGE to return adjacent pages 5191 * within the segment. 5192 */ 5193 plsz = MIN((size_t)PVN_GETPAGE_SZ, (size_t) 5194 ((seg->s_base + seg->s_size) - addr)); 5195 ASSERT((addr + plsz) <= 5196 (seg->s_base + seg->s_size)); 5197 } 5198 5199 /* 5200 * Need to get some non-anonymous pages. 5201 * We need to make only one call to GETPAGE to do 5202 * this to prevent certain deadlocking conditions 5203 * when we are doing locking. In this case 5204 * non_anon() should have picked up the smallest 5205 * range which includes all the non-anonymous 5206 * pages in the requested range. We have to 5207 * be careful regarding which rw flag to pass in 5208 * because on a private mapping, the underlying 5209 * object is never allowed to be written. 5210 */ 5211 if (rw == S_WRITE && svd->type == MAP_PRIVATE) { 5212 arw = S_READ; 5213 } else { 5214 arw = rw; 5215 } 5216 vp = svd->vp; 5217 TRACE_3(TR_FAC_VM, TR_SEGVN_GETPAGE, 5218 "segvn_getpage:seg %p addr %p vp %p", 5219 seg, addr, vp); 5220 err = VOP_GETPAGE(vp, (offset_t)vp_off, vp_len, 5221 &vpprot, plp, plsz, seg, addr + (vp_off - off), arw, 5222 svd->cred); 5223 if (err) { 5224 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5225 segvn_pagelist_rele(plp); 5226 if (pl_alloc_sz) 5227 kmem_free(plp, pl_alloc_sz); 5228 return (FC_MAKE_ERR(err)); 5229 } 5230 if (svd->type == MAP_PRIVATE) 5231 vpprot &= ~PROT_WRITE; 5232 } 5233 } 5234 5235 /* 5236 * N.B. at this time the plp array has all the needed non-anon 5237 * pages in addition to (possibly) having some adjacent pages. 5238 */ 5239 5240 /* 5241 * Always acquire the anon_array_lock to prevent 5242 * 2 threads from allocating separate anon slots for 5243 * the same "addr". 5244 * 5245 * If this is a copy-on-write fault and we don't already 5246 * have the anon_array_lock, acquire it to prevent the 5247 * fault routine from handling multiple copy-on-write faults 5248 * on the same "addr" in the same address space. 
5249 * 5250 * Only one thread should deal with the fault since after 5251 * it is handled, the other threads can acquire a translation 5252 * to the newly created private page. This prevents two or 5253 * more threads from creating different private pages for the 5254 * same fault. 5255 * 5256 * We grab "serialization" lock here if this is a MAP_PRIVATE segment 5257 * to prevent deadlock between this thread and another thread 5258 * which has soft-locked this page and wants to acquire serial_lock. 5259 * ( bug 4026339 ) 5260 * 5261 * The fix for bug 4026339 becomes unnecessary when using the 5262 * locking scheme with per amp rwlock and a global set of hash 5263 * lock, anon_array_lock. If we steal a vnode page when low 5264 * on memory and upgrad the page lock through page_rename, 5265 * then the page is PAGE_HANDLED, nothing needs to be done 5266 * for this page after returning from segvn_faultpage. 5267 * 5268 * But really, the page lock should be downgraded after 5269 * the stolen page is page_rename'd. 5270 */ 5271 5272 if (amp != NULL) 5273 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5274 5275 /* 5276 * Ok, now loop over the address range and handle faults 5277 */ 5278 for (a = addr; a < addr + len; a += PAGESIZE, off += PAGESIZE) { 5279 err = segvn_faultpage(hat, seg, a, off, vpage, plp, vpprot, 5280 type, rw, brkcow, a == addr); 5281 if (err) { 5282 if (amp != NULL) 5283 ANON_LOCK_EXIT(&->a_rwlock); 5284 if (type == F_SOFTLOCK && a > addr) { 5285 segvn_softunlock(seg, addr, (a - addr), 5286 S_OTHER); 5287 } 5288 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5289 segvn_pagelist_rele(plp); 5290 if (pl_alloc_sz) 5291 kmem_free(plp, pl_alloc_sz); 5292 return (err); 5293 } 5294 if (vpage) { 5295 vpage++; 5296 } else if (svd->vpage) { 5297 page = seg_page(seg, addr); 5298 vpage = &svd->vpage[++page]; 5299 } 5300 } 5301 5302 /* Didn't get pages from the underlying fs so we're done */ 5303 if (!dogetpage) 5304 goto done; 5305 5306 /* 5307 * Now handle any other pages in the list returned. 5308 * If the page can be used, load up the translations now. 5309 * Note that the for loop will only be entered if "plp" 5310 * is pointing to a non-NULL page pointer which means that 5311 * VOP_GETPAGE() was called and vpprot has been initialized. 5312 */ 5313 if (svd->pageprot == 0) 5314 prot = svd->prot & vpprot; 5315 5316 5317 /* 5318 * Large Files: diff should be unsigned value because we started 5319 * supporting > 2GB segment sizes from 2.5.1 and when a 5320 * large file of size > 2GB gets mapped to address space 5321 * the diff value can be > 2GB. 5322 */ 5323 5324 for (ppp = plp; (pp = *ppp) != NULL; ppp++) { 5325 size_t diff; 5326 struct anon *ap; 5327 int anon_index; 5328 anon_sync_obj_t cookie; 5329 int hat_flag = HAT_LOAD_ADV; 5330 5331 if (svd->flags & MAP_TEXT) { 5332 hat_flag |= HAT_LOAD_TEXT; 5333 } 5334 5335 if (pp == PAGE_HANDLED) 5336 continue; 5337 5338 if (svd->tr_state != SEGVN_TR_ON && 5339 pp->p_offset >= svd->offset && 5340 pp->p_offset < svd->offset + seg->s_size) { 5341 5342 diff = pp->p_offset - svd->offset; 5343 5344 /* 5345 * Large Files: Following is the assertion 5346 * validating the above cast. 
5347 */ 5348 ASSERT(svd->vp == pp->p_vnode); 5349 5350 page = btop(diff); 5351 if (svd->pageprot) 5352 prot = VPP_PROT(&svd->vpage[page]) & vpprot; 5353 5354 /* 5355 * Prevent other threads in the address space from 5356 * creating private pages (i.e., allocating anon slots) 5357 * while we are in the process of loading translations 5358 * to additional pages returned by the underlying 5359 * object. 5360 */ 5361 if (amp != NULL) { 5362 anon_index = svd->anon_index + page; 5363 anon_array_enter(amp, anon_index, &cookie); 5364 ap = anon_get_ptr(amp->ahp, anon_index); 5365 } 5366 if ((amp == NULL) || (ap == NULL)) { 5367 if (IS_VMODSORT(pp->p_vnode) || 5368 enable_mbit_wa) { 5369 if (rw == S_WRITE) 5370 hat_setmod(pp); 5371 else if (rw != S_OTHER && 5372 !hat_ismod(pp)) 5373 prot &= ~PROT_WRITE; 5374 } 5375 /* 5376 * Skip mapping read ahead pages marked 5377 * for migration, so they will get migrated 5378 * properly on fault 5379 */ 5380 if ((prot & PROT_READ) && !PP_ISMIGRATE(pp)) { 5381 hat_memload(hat, seg->s_base + diff, 5382 pp, prot, hat_flag); 5383 } 5384 } 5385 if (amp != NULL) 5386 anon_array_exit(&cookie); 5387 } 5388 page_unlock(pp); 5389 } 5390 done: 5391 if (amp != NULL) 5392 ANON_LOCK_EXIT(&->a_rwlock); 5393 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5394 if (pl_alloc_sz) 5395 kmem_free(plp, pl_alloc_sz); 5396 return (0); 5397 } 5398 5399 /* 5400 * This routine is used to start I/O on pages asynchronously. XXX it will 5401 * only create PAGESIZE pages. At fault time they will be relocated into 5402 * larger pages. 5403 */ 5404 static faultcode_t 5405 segvn_faulta(struct seg *seg, caddr_t addr) 5406 { 5407 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5408 int err; 5409 struct anon_map *amp; 5410 vnode_t *vp; 5411 5412 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5413 5414 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 5415 if ((amp = svd->amp) != NULL) { 5416 struct anon *ap; 5417 5418 /* 5419 * Reader lock to prevent amp->ahp from being changed. 5420 * This is advisory, it's ok to miss a page, so 5421 * we don't do anon_array_enter lock. 
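 *
 * If an anon slot is found the read is started with anon_getpage();
 * otherwise, for a vnode backed page, VOP_GETPAGE() is issued below
 * with a NULL page list so the filesystem only initiates the I/O
 * asynchronously.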
5422 */ 5423 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5424 if ((ap = anon_get_ptr(amp->ahp, 5425 svd->anon_index + seg_page(seg, addr))) != NULL) { 5426 5427 err = anon_getpage(&ap, NULL, NULL, 5428 0, seg, addr, S_READ, svd->cred); 5429 5430 ANON_LOCK_EXIT(&->a_rwlock); 5431 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5432 if (err) 5433 return (FC_MAKE_ERR(err)); 5434 return (0); 5435 } 5436 ANON_LOCK_EXIT(&->a_rwlock); 5437 } 5438 5439 if (svd->vp == NULL) { 5440 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5441 return (0); /* zfod page - do nothing now */ 5442 } 5443 5444 vp = svd->vp; 5445 TRACE_3(TR_FAC_VM, TR_SEGVN_GETPAGE, 5446 "segvn_getpage:seg %p addr %p vp %p", seg, addr, vp); 5447 err = VOP_GETPAGE(vp, 5448 (offset_t)(svd->offset + (uintptr_t)(addr - seg->s_base)), 5449 PAGESIZE, NULL, NULL, 0, seg, addr, 5450 S_OTHER, svd->cred); 5451 5452 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5453 if (err) 5454 return (FC_MAKE_ERR(err)); 5455 return (0); 5456 } 5457 5458 static int 5459 segvn_setprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) 5460 { 5461 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5462 struct vpage *svp, *evp; 5463 struct vnode *vp; 5464 size_t pgsz; 5465 pgcnt_t pgcnt; 5466 anon_sync_obj_t cookie; 5467 5468 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5469 5470 if ((svd->maxprot & prot) != prot) 5471 return (EACCES); /* violated maxprot */ 5472 5473 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 5474 5475 /* return if prot is the same */ 5476 if (!svd->pageprot && svd->prot == prot) { 5477 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5478 return (0); 5479 } 5480 5481 /* 5482 * Since we change protections we first have to flush the cache. 5483 * This makes sure all the pagelock calls have to recheck 5484 * protections. 5485 */ 5486 if (svd->softlockcnt > 0) { 5487 ASSERT(svd->tr_state == SEGVN_TR_OFF); 5488 /* 5489 * Since we do have the segvn writers lock nobody can fill 5490 * the cache with entries belonging to this seg during 5491 * the purge. The flush either succeeds or we still have 5492 * pending I/Os. 5493 */ 5494 segvn_purge(seg); 5495 if (svd->softlockcnt > 0) { 5496 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5497 return (EAGAIN); 5498 } 5499 } 5500 5501 if (svd->tr_state == SEGVN_TR_INIT) { 5502 svd->tr_state = SEGVN_TR_OFF; 5503 } else if (svd->tr_state == SEGVN_TR_ON) { 5504 ASSERT(svd->amp != NULL); 5505 segvn_textunrepl(seg, 0); 5506 ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF); 5507 } 5508 5509 if ((prot & PROT_WRITE) && svd->type == MAP_SHARED && 5510 svd->vp != NULL && (svd->vp->v_flag & VVMEXEC)) { 5511 ASSERT(vn_is_mapped(svd->vp, V_WRITE)); 5512 segvn_inval_trcache(svd->vp); 5513 } 5514 5515 if (seg->s_szc != 0) { 5516 int err; 5517 pgsz = page_get_pagesize(seg->s_szc); 5518 pgcnt = pgsz >> PAGESHIFT; 5519 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 5520 if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) { 5521 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5522 ASSERT(seg->s_base != addr || seg->s_size != len); 5523 /* 5524 * If we are holding the as lock as a reader then 5525 * we need to return IE_RETRY and let the as 5526 * layer drop and re-aquire the lock as a writer. 
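 *
 * Once the lock is re-taken as writer the affected range is demoted
 * with segvn_demote_range(..., SDR_END, ...); anonymous MAP_SHARED
 * segments pass a page size vector from map_pgszcvec() instead of 0.
 * A successful demotion also returns IE_RETRY so the caller restarts
 * the operation.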
5527 */ 5528 if (AS_READ_HELD(seg->s_as, &seg->s_as->a_lock)) 5529 return (IE_RETRY); 5530 VM_STAT_ADD(segvnvmstats.demoterange[1]); 5531 if (svd->type == MAP_PRIVATE || svd->vp != NULL) { 5532 err = segvn_demote_range(seg, addr, len, 5533 SDR_END, 0); 5534 } else { 5535 uint_t szcvec = map_pgszcvec(seg->s_base, 5536 pgsz, (uintptr_t)seg->s_base, 5537 (svd->flags & MAP_TEXT), MAPPGSZC_SHM, 0); 5538 err = segvn_demote_range(seg, addr, len, 5539 SDR_END, szcvec); 5540 } 5541 if (err == 0) 5542 return (IE_RETRY); 5543 if (err == ENOMEM) 5544 return (IE_NOMEM); 5545 return (err); 5546 } 5547 } 5548 5549 5550 /* 5551 * If it's a private mapping and we're making it writable 5552 * and no swap space has been reserved, have to reserve 5553 * it all now. If it's a private mapping to a file (i.e., vp != NULL) 5554 * and we're removing write permission on the entire segment and 5555 * we haven't modified any pages, we can release the swap space. 5556 */ 5557 if (svd->type == MAP_PRIVATE) { 5558 if (prot & PROT_WRITE) { 5559 size_t sz; 5560 if (svd->swresv == 0 && !(svd->flags & MAP_NORESERVE)) { 5561 if (anon_resv_zone(seg->s_size, 5562 seg->s_as->a_proc->p_zone) == 0) { 5563 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5564 return (IE_NOMEM); 5565 } 5566 sz = svd->swresv = seg->s_size; 5567 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 5568 "anon proc:%p %lu %u", 5569 seg, sz, 1); 5570 } 5571 } else { 5572 /* 5573 * Swap space is released only if this segment 5574 * does not map anonymous memory, since read faults 5575 * on such segments still need an anon slot to read 5576 * in the data. 5577 */ 5578 if (svd->swresv != 0 && svd->vp != NULL && 5579 svd->amp == NULL && addr == seg->s_base && 5580 len == seg->s_size && svd->pageprot == 0) { 5581 anon_unresv_zone(svd->swresv, 5582 seg->s_as->a_proc->p_zone); 5583 svd->swresv = 0; 5584 TRACE_3(TR_FAC_VM, TR_ANON_PROC, 5585 "anon proc:%p %lu %u", 5586 seg, 0, 0); 5587 } 5588 } 5589 } 5590 5591 if (addr == seg->s_base && len == seg->s_size && svd->pageprot == 0) { 5592 if (svd->prot == prot) { 5593 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5594 return (0); /* all done */ 5595 } 5596 svd->prot = (uchar_t)prot; 5597 } else if (svd->type == MAP_PRIVATE) { 5598 struct anon *ap = NULL; 5599 page_t *pp; 5600 u_offset_t offset, off; 5601 struct anon_map *amp; 5602 ulong_t anon_idx = 0; 5603 5604 /* 5605 * A vpage structure exists or else the change does not 5606 * involve the entire segment. Establish a vpage structure 5607 * if none is there. Then, for each page in the range, 5608 * adjust its individual permissions. Note that write- 5609 * enabling a MAP_PRIVATE page can affect the claims for 5610 * locked down memory. Overcommitting memory terminates 5611 * the operation. 5612 */ 5613 segvn_vpage(seg); 5614 if ((amp = svd->amp) != NULL) { 5615 anon_idx = svd->anon_index + seg_page(seg, addr); 5616 ASSERT(seg->s_szc == 0 || 5617 IS_P2ALIGNED(anon_idx, pgcnt)); 5618 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5619 } 5620 5621 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 5622 evp = &svd->vpage[seg_page(seg, addr + len)]; 5623 5624 /* 5625 * See Statement at the beginning of segvn_lockop regarding 5626 * the way cowcnts and lckcnts are handled. 
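 *
 * Of note below: for a PPLOCK'ed page whose PROT_WRITE bit changes,
 * the existing claim has to follow (page_addclaim() when write
 * enabling, page_subclaim() when write disabling); if a claim cannot
 * be granted the loop stops early, the translations set up so far are
 * unloaded and IE_NOMEM is returned.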
5627 */ 5628 for (svp = &svd->vpage[seg_page(seg, addr)]; svp < evp; svp++) { 5629 5630 if (seg->s_szc != 0) { 5631 if (amp != NULL) { 5632 anon_array_enter(amp, anon_idx, 5633 &cookie); 5634 } 5635 if (IS_P2ALIGNED(anon_idx, pgcnt) && 5636 !segvn_claim_pages(seg, svp, offset, 5637 anon_idx, prot)) { 5638 if (amp != NULL) { 5639 anon_array_exit(&cookie); 5640 } 5641 break; 5642 } 5643 if (amp != NULL) { 5644 anon_array_exit(&cookie); 5645 } 5646 anon_idx++; 5647 } else { 5648 if (amp != NULL) { 5649 anon_array_enter(amp, anon_idx, 5650 &cookie); 5651 ap = anon_get_ptr(amp->ahp, anon_idx++); 5652 } 5653 5654 if (VPP_ISPPLOCK(svp) && 5655 VPP_PROT(svp) != prot) { 5656 5657 if (amp == NULL || ap == NULL) { 5658 vp = svd->vp; 5659 off = offset; 5660 } else 5661 swap_xlate(ap, &vp, &off); 5662 if (amp != NULL) 5663 anon_array_exit(&cookie); 5664 5665 if ((pp = page_lookup(vp, off, 5666 SE_SHARED)) == NULL) { 5667 panic("segvn_setprot: no page"); 5668 /*NOTREACHED*/ 5669 } 5670 ASSERT(seg->s_szc == 0); 5671 if ((VPP_PROT(svp) ^ prot) & 5672 PROT_WRITE) { 5673 if (prot & PROT_WRITE) { 5674 if (!page_addclaim(pp)) { 5675 page_unlock(pp); 5676 break; 5677 } 5678 } else { 5679 if (!page_subclaim(pp)) { 5680 page_unlock(pp); 5681 break; 5682 } 5683 } 5684 } 5685 page_unlock(pp); 5686 } else if (amp != NULL) 5687 anon_array_exit(&cookie); 5688 } 5689 VPP_SETPROT(svp, prot); 5690 offset += PAGESIZE; 5691 } 5692 if (amp != NULL) 5693 ANON_LOCK_EXIT(&->a_rwlock); 5694 5695 /* 5696 * Did we terminate prematurely? If so, simply unload 5697 * the translations to the things we've updated so far. 5698 */ 5699 if (svp != evp) { 5700 len = (svp - &svd->vpage[seg_page(seg, addr)]) * 5701 PAGESIZE; 5702 ASSERT(seg->s_szc == 0 || IS_P2ALIGNED(len, pgsz)); 5703 if (len != 0) 5704 hat_unload(seg->s_as->a_hat, addr, 5705 len, HAT_UNLOAD); 5706 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5707 return (IE_NOMEM); 5708 } 5709 } else { 5710 segvn_vpage(seg); 5711 evp = &svd->vpage[seg_page(seg, addr + len)]; 5712 for (svp = &svd->vpage[seg_page(seg, addr)]; svp < evp; svp++) { 5713 VPP_SETPROT(svp, prot); 5714 } 5715 } 5716 5717 if (((prot & PROT_WRITE) != 0 && 5718 (svd->vp != NULL || svd->type == MAP_PRIVATE)) || 5719 (prot & ~PROT_USER) == PROT_NONE) { 5720 /* 5721 * Either private or shared data with write access (in 5722 * which case we need to throw out all former translations 5723 * so that we get the right translations set up on fault 5724 * and we don't allow write access to any copy-on-write pages 5725 * that might be around or to prevent write access to pages 5726 * representing holes in a file), or we don't have permission 5727 * to access the memory at all (in which case we have to 5728 * unload any current translations that might exist). 5729 */ 5730 hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD); 5731 } else { 5732 /* 5733 * A shared mapping or a private mapping in which write 5734 * protection is going to be denied - just change all the 5735 * protections over the range of addresses in question. 5736 * segvn does not support any other attributes other 5737 * than prot so we can use hat_chgattr. 5738 */ 5739 hat_chgattr(seg->s_as->a_hat, addr, len, prot); 5740 } 5741 5742 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 5743 5744 return (0); 5745 } 5746 5747 /* 5748 * segvn_setpagesize is called via SEGOP_SETPAGESIZE from as_setpagesize, 5749 * to determine if the seg is capable of mapping the requested szc. 
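 *
 * In brief it returns 0 when the segment is already at szc (or large
 * pages are disabled), EINVAL for the various alignment and "can never
 * be mapped large" cases, EAGAIN while softlocked pagelock cache
 * entries are still draining, and IE_RETRY (or IE_NOMEM on allocation
 * failure) when the segment first had to be demoted, split or
 * concatenated and the as layer should retry the operation.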
5750 */ 5751 static int 5752 segvn_setpagesize(struct seg *seg, caddr_t addr, size_t len, uint_t szc) 5753 { 5754 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 5755 struct segvn_data *nsvd; 5756 struct anon_map *amp = svd->amp; 5757 struct seg *nseg; 5758 caddr_t eaddr = addr + len, a; 5759 size_t pgsz = page_get_pagesize(szc); 5760 pgcnt_t pgcnt = page_get_pagecnt(szc); 5761 int err; 5762 u_offset_t off = svd->offset + (uintptr_t)(addr - seg->s_base); 5763 extern struct vnode kvp; 5764 5765 ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 5766 ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size); 5767 5768 if (seg->s_szc == szc || segvn_lpg_disable != 0) { 5769 return (0); 5770 } 5771 5772 /* 5773 * addr should always be pgsz aligned but eaddr may be misaligned if 5774 * it's at the end of the segment. 5775 * 5776 * XXX we should assert this condition since as_setpagesize() logic 5777 * guarantees it. 5778 */ 5779 if (!IS_P2ALIGNED(addr, pgsz) || 5780 (!IS_P2ALIGNED(eaddr, pgsz) && 5781 eaddr != seg->s_base + seg->s_size)) { 5782 5783 segvn_setpgsz_align_err++; 5784 return (EINVAL); 5785 } 5786 5787 if (amp != NULL && svd->type == MAP_SHARED) { 5788 ulong_t an_idx = svd->anon_index + seg_page(seg, addr); 5789 if (!IS_P2ALIGNED(an_idx, pgcnt)) { 5790 5791 segvn_setpgsz_anon_align_err++; 5792 return (EINVAL); 5793 } 5794 } 5795 5796 if ((svd->flags & MAP_NORESERVE) || seg->s_as == &kas || 5797 szc > segvn_maxpgszc) { 5798 return (EINVAL); 5799 } 5800 5801 /* paranoid check */ 5802 if (svd->vp != NULL && 5803 (IS_SWAPFSVP(svd->vp) || VN_ISKAS(svd->vp))) { 5804 return (EINVAL); 5805 } 5806 5807 if (seg->s_szc == 0 && svd->vp != NULL && 5808 map_addr_vacalign_check(addr, off)) { 5809 return (EINVAL); 5810 } 5811 5812 /* 5813 * Check that protections are the same within new page 5814 * size boundaries. 5815 */ 5816 if (svd->pageprot) { 5817 for (a = addr; a < eaddr; a += pgsz) { 5818 if ((a + pgsz) > eaddr) { 5819 if (!sameprot(seg, a, eaddr - a)) { 5820 return (EINVAL); 5821 } 5822 } else { 5823 if (!sameprot(seg, a, pgsz)) { 5824 return (EINVAL); 5825 } 5826 } 5827 } 5828 } 5829 5830 /* 5831 * Since we are changing page size we first have to flush 5832 * the cache. This makes sure all the pagelock calls have 5833 * to recheck protections. 5834 */ 5835 if (svd->softlockcnt > 0) { 5836 ASSERT(svd->tr_state == SEGVN_TR_OFF); 5837 /* 5838 * Since we do have the segvn writers lock nobody can fill 5839 * the cache with entries belonging to this seg during 5840 * the purge. The flush either succeeds or we still have 5841 * pending I/Os. 5842 */ 5843 segvn_purge(seg); 5844 if (svd->softlockcnt > 0) { 5845 return (EAGAIN); 5846 } 5847 } 5848 5849 if (svd->tr_state == SEGVN_TR_INIT) { 5850 svd->tr_state = SEGVN_TR_OFF; 5851 } else if (svd->tr_state == SEGVN_TR_ON) { 5852 ASSERT(svd->amp != NULL); 5853 segvn_textunrepl(seg, 1); 5854 ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF); 5855 amp = NULL; 5856 } 5857 5858 /* 5859 * Operation for sub range of existing segment. 
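 *
 * A sub range is never resized in place: lowering szc demotes just
 * that range (SDR_RANGE), while raising it splits the segment at addr
 * and, if necessary, at eaddr; both paths return IE_RETRY so the
 * request is retried against a segment that exactly matches the range.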
5860  */
5861 	if (addr != seg->s_base || eaddr != (seg->s_base + seg->s_size)) {
5862 		if (szc < seg->s_szc) {
5863 			VM_STAT_ADD(segvnvmstats.demoterange[2]);
5864 			err = segvn_demote_range(seg, addr, len, SDR_RANGE, 0);
5865 			if (err == 0) {
5866 				return (IE_RETRY);
5867 			}
5868 			if (err == ENOMEM) {
5869 				return (IE_NOMEM);
5870 			}
5871 			return (err);
5872 		}
5873 		if (addr != seg->s_base) {
5874 			nseg = segvn_split_seg(seg, addr);
5875 			if (eaddr != (nseg->s_base + nseg->s_size)) {
5876 				/* eaddr is szc aligned */
5877 				(void) segvn_split_seg(nseg, eaddr);
5878 			}
5879 			return (IE_RETRY);
5880 		}
5881 		if (eaddr != (seg->s_base + seg->s_size)) {
5882 			/* eaddr is szc aligned */
5883 			(void) segvn_split_seg(seg, eaddr);
5884 		}
5885 		return (IE_RETRY);
5886 	}
5887 
5888 	/*
5889 	 * Break any low level sharing and reset seg->s_szc to 0.
5890 	 */
5891 	if ((err = segvn_clrszc(seg)) != 0) {
5892 		if (err == ENOMEM) {
5893 			err = IE_NOMEM;
5894 		}
5895 		return (err);
5896 	}
5897 	ASSERT(seg->s_szc == 0);
5898 
5899 	/*
5900 	 * If the end of the current segment is not pgsz aligned
5901 	 * then attempt to concatenate with the next segment.
5902 	 */
5903 	if (!IS_P2ALIGNED(eaddr, pgsz)) {
5904 		nseg = AS_SEGNEXT(seg->s_as, seg);
5905 		if (nseg == NULL || nseg == seg || eaddr != nseg->s_base) {
5906 			return (ENOMEM);
5907 		}
5908 		if (nseg->s_ops != &segvn_ops) {
5909 			return (EINVAL);
5910 		}
5911 		nsvd = (struct segvn_data *)nseg->s_data;
5912 		if (nsvd->softlockcnt > 0) {
5913 			segvn_purge(nseg);
5914 			if (nsvd->softlockcnt > 0) {
5915 				return (EAGAIN);
5916 			}
5917 		}
5918 		err = segvn_clrszc(nseg);
5919 		if (err == ENOMEM) {
5920 			err = IE_NOMEM;
5921 		}
5922 		if (err != 0) {
5923 			return (err);
5924 		}
5925 		err = segvn_concat(seg, nseg, 1);
5926 		if (err == -1) {
5927 			return (EINVAL);
5928 		}
5929 		if (err == -2) {
5930 			return (IE_NOMEM);
5931 		}
5932 		return (IE_RETRY);
5933 	}
5934 
5935 	/*
5936 	 * May need to re-align anon array to
5937 	 * new szc.
5938 	 */
5939 	if (amp != NULL) {
5940 		if (!IS_P2ALIGNED(svd->anon_index, pgcnt)) {
5941 			struct anon_hdr *nahp;
5942 
5943 			ASSERT(svd->type == MAP_PRIVATE);
5944 
5945 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
5946 			ASSERT(amp->refcnt == 1);
5947 			nahp = anon_create(btop(amp->size), ANON_NOSLEEP);
5948 			if (nahp == NULL) {
5949 				ANON_LOCK_EXIT(&amp->a_rwlock);
5950 				return (IE_NOMEM);
5951 			}
5952 			if (anon_copy_ptr(amp->ahp, svd->anon_index,
5953 			    nahp, 0, btop(seg->s_size), ANON_NOSLEEP)) {
5954 				anon_release(nahp, btop(amp->size));
5955 				ANON_LOCK_EXIT(&amp->a_rwlock);
5956 				return (IE_NOMEM);
5957 			}
5958 			anon_release(amp->ahp, btop(amp->size));
5959 			amp->ahp = nahp;
5960 			svd->anon_index = 0;
5961 			ANON_LOCK_EXIT(&amp->a_rwlock);
5962 		}
5963 	}
5964 	if (svd->vp != NULL && szc != 0) {
5965 		struct vattr va;
5966 		u_offset_t eoffpage = svd->offset;
5967 		va.va_mask = AT_SIZE;
5968 		eoffpage += seg->s_size;
5969 		eoffpage = btopr(eoffpage);
5970 		if (VOP_GETATTR(svd->vp, &va, 0, svd->cred) != 0) {
5971 			segvn_setpgsz_getattr_err++;
5972 			return (EINVAL);
5973 		}
5974 		if (btopr(va.va_size) < eoffpage) {
5975 			segvn_setpgsz_eof_err++;
5976 			return (EINVAL);
5977 		}
5978 		if (amp != NULL) {
5979 			/*
5980 			 * anon_fill_cow_holes() may call VOP_GETPAGE().
5981 			 * don't take anon map lock here to avoid holding it
5982 			 * across VOP_GETPAGE() calls that may call back into
5983 			 * segvn for klustering checks. We don't really need
5984 			 * anon map lock here since it's a private segment and
5985 			 * we hold as level lock as writers.
5986 			 */
5987 			if ((err = anon_fill_cow_holes(seg, seg->s_base,
5988 			    amp->ahp, svd->anon_index, svd->vp, svd->offset,
5989 			    seg->s_size, szc, svd->prot, svd->vpage,
5990 			    svd->cred)) != 0) {
5991 				return (EINVAL);
5992 			}
5993 		}
5994 		segvn_setvnode_mpss(svd->vp);
5995 	}
5996 
5997 	if (amp != NULL) {
5998 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
5999 		if (svd->type == MAP_PRIVATE) {
6000 			amp->a_szc = szc;
6001 		} else if (szc > amp->a_szc) {
6002 			amp->a_szc = szc;
6003 		}
6004 		ANON_LOCK_EXIT(&amp->a_rwlock);
6005 	}
6006 
6007 	seg->s_szc = szc;
6008 
6009 	return (0);
6010 }
6011 
6012 static int
6013 segvn_clrszc(struct seg *seg)
6014 {
6015 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
6016 	struct anon_map *amp = svd->amp;
6017 	size_t pgsz;
6018 	pgcnt_t pages;
6019 	int err = 0;
6020 	caddr_t a = seg->s_base;
6021 	caddr_t ea = a + seg->s_size;
6022 	ulong_t an_idx = svd->anon_index;
6023 	vnode_t *vp = svd->vp;
6024 	struct vpage *vpage = svd->vpage;
6025 	page_t *anon_pl[1 + 1], *pp;
6026 	struct anon *ap, *oldap;
6027 	uint_t prot = svd->prot, vpprot;
6028 	int pageflag = 0;
6029 	int unmap = 1;
6030 
6031 	ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) ||
6032 	    SEGVN_WRITE_HELD(seg->s_as, &svd->lock));
6033 
6034 	if (vp == NULL && amp == NULL) {
6035 		seg->s_szc = 0;
6036 		return (0);
6037 	}
6038 
6039 	if (svd->tr_state == SEGVN_TR_INIT) {
6040 		svd->tr_state = SEGVN_TR_OFF;
6041 	} else if (svd->tr_state == SEGVN_TR_ON) {
6042 		ASSERT(svd->amp != NULL);
6043 		segvn_textunrepl(seg, 1);
6044 		ASSERT(svd->amp == NULL && svd->tr_state == SEGVN_TR_OFF);
6045 		amp = NULL;
6046 		unmap = 0;
6047 	}
6048 
6049 	if (unmap) {
6050 		/*
6051 		 * do HAT_UNLOAD_UNMAP since we are changing the pagesize.
6052 		 * unload argument is 0 when we are freeing the segment
6053 		 * and unload was already done.
6054 		 */
6055 		hat_unload(seg->s_as->a_hat, seg->s_base, seg->s_size,
6056 		    HAT_UNLOAD_UNMAP);
6057 	}
6058 
6059 	if (amp == NULL || svd->type == MAP_SHARED) {
6060 		seg->s_szc = 0;
6061 		return (0);
6062 	}
6063 
6064 	pgsz = page_get_pagesize(seg->s_szc);
6065 	pages = btop(pgsz);
6066 
6067 	/*
6068 	 * XXX anon rwlock is not really needed because this is a
6069 	 * private segment and we are writers.
6070 	 */
6071 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
6072 
6073 	for (; a < ea; a += pgsz, an_idx += pages) {
6074 		if ((oldap = anon_get_ptr(amp->ahp, an_idx)) != NULL) {
6075 			ASSERT(vpage != NULL || svd->pageprot == 0);
6076 			if (vpage != NULL) {
6077 				ASSERT(sameprot(seg, a, pgsz));
6078 				prot = VPP_PROT(vpage);
6079 				pageflag = VPP_ISPPLOCK(vpage) ? LOCK_PAGE : 0;
6080 			}
6081 			if (seg->s_szc != 0) {
6082 				ASSERT(vp == NULL || anon_pages(amp->ahp,
6083 				    an_idx, pages) == pages);
6084 				if ((err = anon_map_demotepages(amp, an_idx,
6085 				    seg, a, prot, vpage, svd->cred)) != 0) {
6086 					goto out;
6087 				}
6088 			} else {
6089 				if (oldap->an_refcnt == 1) {
6090 					continue;
6091 				}
6092 				if ((err = anon_getpage(&oldap, &vpprot,
6093 				    anon_pl, PAGESIZE, seg, a, S_READ,
6094 				    svd->cred))) {
6095 					goto out;
6096 				}
6097 				if ((pp = anon_private(&ap, seg, a, prot,
6098 				    anon_pl[0], pageflag, svd->cred)) == NULL) {
6099 					err = ENOMEM;
6100 					goto out;
6101 				}
6102 				anon_decref(oldap);
6103 				(void) anon_set_ptr(amp->ahp, an_idx, ap,
6104 				    ANON_SLEEP);
6105 				page_unlock(pp);
6106 			}
6107 		}
6108 		vpage = (vpage == NULL) ? NULL : vpage + pages;
6109 	}
6110 
6111 	amp->a_szc = 0;
6112 	seg->s_szc = 0;
6113 out:
6114 	ANON_LOCK_EXIT(&amp->a_rwlock);
6115 	return (err);
6116 }
6117 
6118 static int
6119 segvn_claim_pages(
6120 	struct seg *seg,
6121 	struct vpage *svp,
6122 	u_offset_t off,
6123 	ulong_t anon_idx,
6124 	uint_t prot)
6125 {
6126 	pgcnt_t	pgcnt = page_get_pagecnt(seg->s_szc);
6127 	size_t ppasize = (pgcnt + 1) * sizeof (page_t *);
6128 	page_t	**ppa;
6129 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
6130 	struct anon_map *amp = svd->amp;
6131 	struct vpage *evp = svp + pgcnt;
6132 	caddr_t addr = ((uintptr_t)(svp - svd->vpage) << PAGESHIFT)
6133 	    + seg->s_base;
6134 	struct anon *ap;
6135 	struct vnode *vp = svd->vp;
6136 	page_t *pp;
6137 	pgcnt_t pg_idx, i;
6138 	int err = 0;
6139 	anoff_t aoff;
6140 	int anon = (amp != NULL) ? 1 : 0;
6141 
6142 	ASSERT(svd->type == MAP_PRIVATE);
6143 	ASSERT(svd->vpage != NULL);
6144 	ASSERT(seg->s_szc != 0);
6145 	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
6146 	ASSERT(amp == NULL || IS_P2ALIGNED(anon_idx, pgcnt));
6147 	ASSERT(sameprot(seg, addr, pgcnt << PAGESHIFT));
6148 
6149 	if (VPP_PROT(svp) == prot)
6150 		return (1);
6151 	if (!((VPP_PROT(svp) ^ prot) & PROT_WRITE))
6152 		return (1);
6153 
6154 	ppa = kmem_alloc(ppasize, KM_SLEEP);
6155 	if (anon && vp != NULL) {
6156 		if (anon_get_ptr(amp->ahp, anon_idx) == NULL) {
6157 			anon = 0;
6158 			ASSERT(!anon_pages(amp->ahp, anon_idx, pgcnt));
6159 		}
6160 		ASSERT(!anon ||
6161 		    anon_pages(amp->ahp, anon_idx, pgcnt) == pgcnt);
6162 	}
6163 
6164 	for (*ppa = NULL, pg_idx = 0; svp < evp; svp++, anon_idx++) {
6165 		if (!VPP_ISPPLOCK(svp))
6166 			continue;
6167 		if (anon) {
6168 			ap = anon_get_ptr(amp->ahp, anon_idx);
6169 			if (ap == NULL) {
6170 				panic("segvn_claim_pages: no anon slot");
6171 			}
6172 			swap_xlate(ap, &vp, &aoff);
6173 			off = (u_offset_t)aoff;
6174 		}
6175 		ASSERT(vp != NULL);
6176 		if ((pp = page_lookup(vp,
6177 		    (u_offset_t)off, SE_SHARED)) == NULL) {
6178 			panic("segvn_claim_pages: no page");
6179 		}
6180 		ppa[pg_idx++] = pp;
6181 		off += PAGESIZE;
6182 	}
6183 
6184 	if (ppa[0] == NULL) {
6185 		kmem_free(ppa, ppasize);
6186 		return (1);
6187 	}
6188 
6189 	ASSERT(pg_idx <= pgcnt);
6190 	ppa[pg_idx] = NULL;
6191 
6192 	if (prot & PROT_WRITE)
6193 		err = page_addclaim_pages(ppa);
6194 	else
6195 		err = page_subclaim_pages(ppa);
6196 
6197 	for (i = 0; i < pg_idx; i++) {
6198 		ASSERT(ppa[i] != NULL);
6199 		page_unlock(ppa[i]);
6200 	}
6201 
6202 	kmem_free(ppa, ppasize);
6203 	return (err);
6204 }
6205 
6206 /*
6207  * Returns right (upper address) segment if split occurred.
6208  * If the address is equal to the beginning or end of its segment it returns
6209  * the current segment.
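 *
 * For example, with a segment covering [base, base + 8 * pgsz), a call to
 * segvn_split_seg(seg, base + 2 * pgsz) shrinks the original seg to
 * [base, base + 2 * pgsz) and returns a newly allocated seg covering
 * [base + 2 * pgsz, base + 8 * pgsz); calling it with addr equal to
 * seg->s_base or seg->s_base + seg->s_size does nothing and returns seg.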
6210 */ 6211 static struct seg * 6212 segvn_split_seg(struct seg *seg, caddr_t addr) 6213 { 6214 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6215 struct seg *nseg; 6216 size_t nsize; 6217 struct segvn_data *nsvd; 6218 6219 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 6220 ASSERT(svd->tr_state == SEGVN_TR_OFF); 6221 6222 ASSERT(addr >= seg->s_base); 6223 ASSERT(addr <= seg->s_base + seg->s_size); 6224 6225 if (addr == seg->s_base || addr == seg->s_base + seg->s_size) 6226 return (seg); 6227 6228 nsize = seg->s_base + seg->s_size - addr; 6229 seg->s_size = addr - seg->s_base; 6230 nseg = seg_alloc(seg->s_as, addr, nsize); 6231 ASSERT(nseg != NULL); 6232 nseg->s_ops = seg->s_ops; 6233 nsvd = kmem_cache_alloc(segvn_cache, KM_SLEEP); 6234 nseg->s_data = (void *)nsvd; 6235 nseg->s_szc = seg->s_szc; 6236 *nsvd = *svd; 6237 nsvd->seg = nseg; 6238 rw_init(&nsvd->lock, NULL, RW_DEFAULT, NULL); 6239 6240 if (nsvd->vp != NULL) { 6241 VN_HOLD(nsvd->vp); 6242 nsvd->offset = svd->offset + 6243 (uintptr_t)(nseg->s_base - seg->s_base); 6244 if (nsvd->type == MAP_SHARED) 6245 lgrp_shm_policy_init(NULL, nsvd->vp); 6246 } else { 6247 /* 6248 * The offset for an anonymous segment has no signifigance in 6249 * terms of an offset into a file. If we were to use the above 6250 * calculation instead, the structures read out of 6251 * /proc/<pid>/xmap would be more difficult to decipher since 6252 * it would be unclear whether two seemingly contiguous 6253 * prxmap_t structures represented different segments or a 6254 * single segment that had been split up into multiple prxmap_t 6255 * structures (e.g. if some part of the segment had not yet 6256 * been faulted in). 6257 */ 6258 nsvd->offset = 0; 6259 } 6260 6261 ASSERT(svd->softlockcnt == 0); 6262 crhold(svd->cred); 6263 6264 if (svd->vpage != NULL) { 6265 size_t bytes = vpgtob(seg_pages(seg)); 6266 size_t nbytes = vpgtob(seg_pages(nseg)); 6267 struct vpage *ovpage = svd->vpage; 6268 6269 svd->vpage = kmem_alloc(bytes, KM_SLEEP); 6270 bcopy(ovpage, svd->vpage, bytes); 6271 nsvd->vpage = kmem_alloc(nbytes, KM_SLEEP); 6272 bcopy(ovpage + seg_pages(seg), nsvd->vpage, nbytes); 6273 kmem_free(ovpage, bytes + nbytes); 6274 } 6275 if (svd->amp != NULL && svd->type == MAP_PRIVATE) { 6276 struct anon_map *oamp = svd->amp, *namp; 6277 struct anon_hdr *nahp; 6278 6279 ANON_LOCK_ENTER(&oamp->a_rwlock, RW_WRITER); 6280 ASSERT(oamp->refcnt == 1); 6281 nahp = anon_create(btop(seg->s_size), ANON_SLEEP); 6282 (void) anon_copy_ptr(oamp->ahp, svd->anon_index, 6283 nahp, 0, btop(seg->s_size), ANON_SLEEP); 6284 6285 namp = anonmap_alloc(nseg->s_size, 0, ANON_SLEEP); 6286 namp->a_szc = nseg->s_szc; 6287 (void) anon_copy_ptr(oamp->ahp, 6288 svd->anon_index + btop(seg->s_size), 6289 namp->ahp, 0, btop(nseg->s_size), ANON_SLEEP); 6290 anon_release(oamp->ahp, btop(oamp->size)); 6291 oamp->ahp = nahp; 6292 oamp->size = seg->s_size; 6293 svd->anon_index = 0; 6294 nsvd->amp = namp; 6295 nsvd->anon_index = 0; 6296 ANON_LOCK_EXIT(&oamp->a_rwlock); 6297 } else if (svd->amp != NULL) { 6298 pgcnt_t pgcnt = page_get_pagecnt(seg->s_szc); 6299 ASSERT(svd->amp == nsvd->amp); 6300 ASSERT(seg->s_szc <= svd->amp->a_szc); 6301 nsvd->anon_index = svd->anon_index + seg_pages(seg); 6302 ASSERT(IS_P2ALIGNED(nsvd->anon_index, pgcnt)); 6303 ANON_LOCK_ENTER(&svd->amp->a_rwlock, RW_WRITER); 6304 svd->amp->refcnt++; 6305 ANON_LOCK_EXIT(&svd->amp->a_rwlock); 6306 } 6307 6308 /* 6309 * Split amount of swap reserve 6310 */ 6311 if (svd->swresv) { 6312 /* 6313 * For MAP_NORESERVE, only allocate 
swap reserve for pages 6314 * being used. Other segments get enough to cover whole 6315 * segment. 6316 */ 6317 if (svd->flags & MAP_NORESERVE) { 6318 size_t oswresv; 6319 6320 ASSERT(svd->amp); 6321 oswresv = svd->swresv; 6322 svd->swresv = ptob(anon_pages(svd->amp->ahp, 6323 svd->anon_index, btop(seg->s_size))); 6324 nsvd->swresv = ptob(anon_pages(nsvd->amp->ahp, 6325 nsvd->anon_index, btop(nseg->s_size))); 6326 ASSERT(oswresv >= (svd->swresv + nsvd->swresv)); 6327 } else { 6328 ASSERT(svd->swresv == seg->s_size + nseg->s_size); 6329 svd->swresv = seg->s_size; 6330 nsvd->swresv = nseg->s_size; 6331 } 6332 } 6333 6334 return (nseg); 6335 } 6336 6337 /* 6338 * called on memory operations (unmap, setprot, setpagesize) for a subset 6339 * of a large page segment to either demote the memory range (SDR_RANGE) 6340 * or the ends (SDR_END) by addr/len. 6341 * 6342 * returns 0 on success. returns errno, including ENOMEM, on failure. 6343 */ 6344 static int 6345 segvn_demote_range( 6346 struct seg *seg, 6347 caddr_t addr, 6348 size_t len, 6349 int flag, 6350 uint_t szcvec) 6351 { 6352 caddr_t eaddr = addr + len; 6353 caddr_t lpgaddr, lpgeaddr; 6354 struct seg *nseg; 6355 struct seg *badseg1 = NULL; 6356 struct seg *badseg2 = NULL; 6357 size_t pgsz; 6358 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6359 int err; 6360 uint_t szc = seg->s_szc; 6361 uint_t tszcvec; 6362 6363 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock)); 6364 ASSERT(svd->tr_state == SEGVN_TR_OFF); 6365 ASSERT(szc != 0); 6366 pgsz = page_get_pagesize(szc); 6367 ASSERT(seg->s_base != addr || seg->s_size != len); 6368 ASSERT(addr >= seg->s_base && eaddr <= seg->s_base + seg->s_size); 6369 ASSERT(svd->softlockcnt == 0); 6370 ASSERT(szcvec == 0 || (flag == SDR_END && svd->type == MAP_SHARED)); 6371 6372 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 6373 ASSERT(flag == SDR_RANGE || eaddr < lpgeaddr || addr > lpgaddr); 6374 if (flag == SDR_RANGE) { 6375 /* demote entire range */ 6376 badseg1 = nseg = segvn_split_seg(seg, lpgaddr); 6377 (void) segvn_split_seg(nseg, lpgeaddr); 6378 ASSERT(badseg1->s_base == lpgaddr); 6379 ASSERT(badseg1->s_size == lpgeaddr - lpgaddr); 6380 } else if (addr != lpgaddr) { 6381 ASSERT(flag == SDR_END); 6382 badseg1 = nseg = segvn_split_seg(seg, lpgaddr); 6383 if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz && 6384 eaddr < lpgaddr + 2 * pgsz) { 6385 (void) segvn_split_seg(nseg, lpgeaddr); 6386 ASSERT(badseg1->s_base == lpgaddr); 6387 ASSERT(badseg1->s_size == 2 * pgsz); 6388 } else { 6389 nseg = segvn_split_seg(nseg, lpgaddr + pgsz); 6390 ASSERT(badseg1->s_base == lpgaddr); 6391 ASSERT(badseg1->s_size == pgsz); 6392 if (eaddr != lpgeaddr && eaddr > lpgaddr + pgsz) { 6393 ASSERT(lpgeaddr - lpgaddr > 2 * pgsz); 6394 nseg = segvn_split_seg(nseg, lpgeaddr - pgsz); 6395 badseg2 = nseg; 6396 (void) segvn_split_seg(nseg, lpgeaddr); 6397 ASSERT(badseg2->s_base == lpgeaddr - pgsz); 6398 ASSERT(badseg2->s_size == pgsz); 6399 } 6400 } 6401 } else { 6402 ASSERT(flag == SDR_END); 6403 ASSERT(eaddr < lpgeaddr); 6404 badseg1 = nseg = segvn_split_seg(seg, lpgeaddr - pgsz); 6405 (void) segvn_split_seg(nseg, lpgeaddr); 6406 ASSERT(badseg1->s_base == lpgeaddr - pgsz); 6407 ASSERT(badseg1->s_size == pgsz); 6408 } 6409 6410 ASSERT(badseg1 != NULL); 6411 ASSERT(badseg1->s_szc == szc); 6412 ASSERT(flag == SDR_RANGE || badseg1->s_size == pgsz || 6413 badseg1->s_size == 2 * pgsz); 6414 ASSERT(sameprot(badseg1, badseg1->s_base, pgsz)); 6415 ASSERT(badseg1->s_size == pgsz || 6416 sameprot(badseg1, 
badseg1->s_base + pgsz, pgsz)); 6417 if (err = segvn_clrszc(badseg1)) { 6418 return (err); 6419 } 6420 ASSERT(badseg1->s_szc == 0); 6421 6422 if (szc > 1 && (tszcvec = P2PHASE(szcvec, 1 << szc)) > 1) { 6423 uint_t tszc = highbit(tszcvec) - 1; 6424 caddr_t ta = MAX(addr, badseg1->s_base); 6425 caddr_t te; 6426 size_t tpgsz = page_get_pagesize(tszc); 6427 6428 ASSERT(svd->type == MAP_SHARED); 6429 ASSERT(flag == SDR_END); 6430 ASSERT(tszc < szc && tszc > 0); 6431 6432 if (eaddr > badseg1->s_base + badseg1->s_size) { 6433 te = badseg1->s_base + badseg1->s_size; 6434 } else { 6435 te = eaddr; 6436 } 6437 6438 ASSERT(ta <= te); 6439 badseg1->s_szc = tszc; 6440 if (!IS_P2ALIGNED(ta, tpgsz) || !IS_P2ALIGNED(te, tpgsz)) { 6441 if (badseg2 != NULL) { 6442 err = segvn_demote_range(badseg1, ta, te - ta, 6443 SDR_END, tszcvec); 6444 if (err != 0) { 6445 return (err); 6446 } 6447 } else { 6448 return (segvn_demote_range(badseg1, ta, 6449 te - ta, SDR_END, tszcvec)); 6450 } 6451 } 6452 } 6453 6454 if (badseg2 == NULL) 6455 return (0); 6456 ASSERT(badseg2->s_szc == szc); 6457 ASSERT(badseg2->s_size == pgsz); 6458 ASSERT(sameprot(badseg2, badseg2->s_base, badseg2->s_size)); 6459 if (err = segvn_clrszc(badseg2)) { 6460 return (err); 6461 } 6462 ASSERT(badseg2->s_szc == 0); 6463 6464 if (szc > 1 && (tszcvec = P2PHASE(szcvec, 1 << szc)) > 1) { 6465 uint_t tszc = highbit(tszcvec) - 1; 6466 size_t tpgsz = page_get_pagesize(tszc); 6467 6468 ASSERT(svd->type == MAP_SHARED); 6469 ASSERT(flag == SDR_END); 6470 ASSERT(tszc < szc && tszc > 0); 6471 ASSERT(badseg2->s_base > addr); 6472 ASSERT(eaddr > badseg2->s_base); 6473 ASSERT(eaddr < badseg2->s_base + badseg2->s_size); 6474 6475 badseg2->s_szc = tszc; 6476 if (!IS_P2ALIGNED(eaddr, tpgsz)) { 6477 return (segvn_demote_range(badseg2, badseg2->s_base, 6478 eaddr - badseg2->s_base, SDR_END, tszcvec)); 6479 } 6480 } 6481 6482 return (0); 6483 } 6484 6485 static int 6486 segvn_checkprot(struct seg *seg, caddr_t addr, size_t len, uint_t prot) 6487 { 6488 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6489 struct vpage *vp, *evp; 6490 6491 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6492 6493 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6494 /* 6495 * If segment protection can be used, simply check against them. 6496 */ 6497 if (svd->pageprot == 0) { 6498 int err; 6499 6500 err = ((svd->prot & prot) != prot) ? EACCES : 0; 6501 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6502 return (err); 6503 } 6504 6505 /* 6506 * Have to check down to the vpage level. 
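 *
 * For example, once an mprotect() of part of the segment has set
 * svd->pageprot, a checkprot request for PROT_READ|PROT_WRITE over a range
 * fails with EACCES as soon as a single vpage in that range lacks
 * PROT_WRITE, even though the segment-level svd->prot may allow it.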
6507 */ 6508 evp = &svd->vpage[seg_page(seg, addr + len)]; 6509 for (vp = &svd->vpage[seg_page(seg, addr)]; vp < evp; vp++) { 6510 if ((VPP_PROT(vp) & prot) != prot) { 6511 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6512 return (EACCES); 6513 } 6514 } 6515 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6516 return (0); 6517 } 6518 6519 static int 6520 segvn_getprot(struct seg *seg, caddr_t addr, size_t len, uint_t *protv) 6521 { 6522 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6523 size_t pgno = seg_page(seg, addr + len) - seg_page(seg, addr) + 1; 6524 6525 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6526 6527 if (pgno != 0) { 6528 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6529 if (svd->pageprot == 0) { 6530 do 6531 protv[--pgno] = svd->prot; 6532 while (pgno != 0); 6533 } else { 6534 size_t pgoff = seg_page(seg, addr); 6535 6536 do { 6537 pgno--; 6538 protv[pgno] = VPP_PROT(&svd->vpage[pgno+pgoff]); 6539 } while (pgno != 0); 6540 } 6541 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6542 } 6543 return (0); 6544 } 6545 6546 static u_offset_t 6547 segvn_getoffset(struct seg *seg, caddr_t addr) 6548 { 6549 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6550 6551 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6552 6553 return (svd->offset + (uintptr_t)(addr - seg->s_base)); 6554 } 6555 6556 /*ARGSUSED*/ 6557 static int 6558 segvn_gettype(struct seg *seg, caddr_t addr) 6559 { 6560 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6561 6562 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6563 6564 return (svd->type | (svd->flags & (MAP_NORESERVE | MAP_TEXT | 6565 MAP_INITDATA))); 6566 } 6567 6568 /*ARGSUSED*/ 6569 static int 6570 segvn_getvp(struct seg *seg, caddr_t addr, struct vnode **vpp) 6571 { 6572 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6573 6574 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6575 6576 *vpp = svd->vp; 6577 return (0); 6578 } 6579 6580 /* 6581 * Check to see if it makes sense to do kluster/read ahead to 6582 * addr + delta relative to the mapping at addr. We assume here 6583 * that delta is a signed PAGESIZE'd multiple (which can be negative). 6584 * 6585 * For segvn, we currently "approve" of the action if we are 6586 * still in the segment and it maps from the same vp/off, 6587 * or if the advice stored in segvn_data or vpages allows it. 6588 * Currently, klustering is not allowed only if MADV_RANDOM is set. 6589 */ 6590 static int 6591 segvn_kluster(struct seg *seg, caddr_t addr, ssize_t delta) 6592 { 6593 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6594 struct anon *oap, *ap; 6595 ssize_t pd; 6596 size_t page; 6597 struct vnode *vp1, *vp2; 6598 u_offset_t off1, off2; 6599 struct anon_map *amp; 6600 6601 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6602 ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) || 6603 SEGVN_LOCK_HELD(seg->s_as, &svd->lock)); 6604 6605 if (addr + delta < seg->s_base || 6606 addr + delta >= (seg->s_base + seg->s_size)) 6607 return (-1); /* exceeded segment bounds */ 6608 6609 pd = delta / (ssize_t)PAGESIZE; /* divide to preserve sign bit */ 6610 page = seg_page(seg, addr); 6611 6612 /* 6613 * Check to see if either of the pages addr or addr + delta 6614 * have advice set that prevents klustering (if MADV_RANDOM advice 6615 * is set for entire segment, or MADV_SEQUENTIAL is set and delta 6616 * is negative). 
6617 */ 6618 if (svd->advice == MADV_RANDOM || 6619 svd->advice == MADV_SEQUENTIAL && delta < 0) 6620 return (-1); 6621 else if (svd->pageadvice && svd->vpage) { 6622 struct vpage *bvpp, *evpp; 6623 6624 bvpp = &svd->vpage[page]; 6625 evpp = &svd->vpage[page + pd]; 6626 if (VPP_ADVICE(bvpp) == MADV_RANDOM || 6627 VPP_ADVICE(evpp) == MADV_SEQUENTIAL && delta < 0) 6628 return (-1); 6629 if (VPP_ADVICE(bvpp) != VPP_ADVICE(evpp) && 6630 VPP_ADVICE(evpp) == MADV_RANDOM) 6631 return (-1); 6632 } 6633 6634 if (svd->type == MAP_SHARED) 6635 return (0); /* shared mapping - all ok */ 6636 6637 if ((amp = svd->amp) == NULL) 6638 return (0); /* off original vnode */ 6639 6640 page += svd->anon_index; 6641 6642 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 6643 6644 oap = anon_get_ptr(amp->ahp, page); 6645 ap = anon_get_ptr(amp->ahp, page + pd); 6646 6647 ANON_LOCK_EXIT(&->a_rwlock); 6648 6649 if ((oap == NULL && ap != NULL) || (oap != NULL && ap == NULL)) { 6650 return (-1); /* one with and one without an anon */ 6651 } 6652 6653 if (oap == NULL) { /* implies that ap == NULL */ 6654 return (0); /* off original vnode */ 6655 } 6656 6657 /* 6658 * Now we know we have two anon pointers - check to 6659 * see if they happen to be properly allocated. 6660 */ 6661 6662 /* 6663 * XXX We cheat here and don't lock the anon slots. We can't because 6664 * we may have been called from the anon layer which might already 6665 * have locked them. We are holding a refcnt on the slots so they 6666 * can't disappear. The worst that will happen is we'll get the wrong 6667 * names (vp, off) for the slots and make a poor klustering decision. 6668 */ 6669 swap_xlate(ap, &vp1, &off1); 6670 swap_xlate(oap, &vp2, &off2); 6671 6672 6673 if (!VOP_CMP(vp1, vp2) || off1 - off2 != delta) 6674 return (-1); 6675 return (0); 6676 } 6677 6678 /* 6679 * Swap the pages of seg out to secondary storage, returning the 6680 * number of bytes of storage freed. 6681 * 6682 * The basic idea is first to unload all translations and then to call 6683 * VOP_PUTPAGE() for all newly-unmapped pages, to push them out to the 6684 * swap device. Pages to which other segments have mappings will remain 6685 * mapped and won't be swapped. Our caller (as_swapout) has already 6686 * performed the unloading step. 6687 * 6688 * The value returned is intended to correlate well with the process's 6689 * memory requirements. However, there are some caveats: 6690 * 1) When given a shared segment as argument, this routine will 6691 * only succeed in swapping out pages for the last sharer of the 6692 * segment. (Previous callers will only have decremented mapping 6693 * reference counts.) 6694 * 2) We assume that the hat layer maintains a large enough translation 6695 * cache to capture process reference patterns. 6696 */ 6697 static size_t 6698 segvn_swapout(struct seg *seg) 6699 { 6700 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6701 struct anon_map *amp; 6702 pgcnt_t pgcnt = 0; 6703 pgcnt_t npages; 6704 pgcnt_t page; 6705 ulong_t anon_index; 6706 6707 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6708 6709 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6710 /* 6711 * Find pages unmapped by our caller and force them 6712 * out to the virtual swap device. 
6713 */ 6714 if ((amp = svd->amp) != NULL) 6715 anon_index = svd->anon_index; 6716 npages = seg->s_size >> PAGESHIFT; 6717 for (page = 0; page < npages; page++) { 6718 page_t *pp; 6719 struct anon *ap; 6720 struct vnode *vp; 6721 u_offset_t off; 6722 anon_sync_obj_t cookie; 6723 6724 /* 6725 * Obtain <vp, off> pair for the page, then look it up. 6726 * 6727 * Note that this code is willing to consider regular 6728 * pages as well as anon pages. Is this appropriate here? 6729 */ 6730 ap = NULL; 6731 if (amp != NULL) { 6732 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 6733 if (anon_array_try_enter(amp, anon_index + page, 6734 &cookie)) { 6735 ANON_LOCK_EXIT(&->a_rwlock); 6736 continue; 6737 } 6738 ap = anon_get_ptr(amp->ahp, anon_index + page); 6739 if (ap != NULL) { 6740 swap_xlate(ap, &vp, &off); 6741 } else { 6742 vp = svd->vp; 6743 off = svd->offset + ptob(page); 6744 } 6745 anon_array_exit(&cookie); 6746 ANON_LOCK_EXIT(&->a_rwlock); 6747 } else { 6748 vp = svd->vp; 6749 off = svd->offset + ptob(page); 6750 } 6751 if (vp == NULL) { /* untouched zfod page */ 6752 ASSERT(ap == NULL); 6753 continue; 6754 } 6755 6756 pp = page_lookup_nowait(vp, off, SE_SHARED); 6757 if (pp == NULL) 6758 continue; 6759 6760 6761 /* 6762 * Examine the page to see whether it can be tossed out, 6763 * keeping track of how many we've found. 6764 */ 6765 if (!page_tryupgrade(pp)) { 6766 /* 6767 * If the page has an i/o lock and no mappings, 6768 * it's very likely that the page is being 6769 * written out as a result of klustering. 6770 * Assume this is so and take credit for it here. 6771 */ 6772 if (!page_io_trylock(pp)) { 6773 if (!hat_page_is_mapped(pp)) 6774 pgcnt++; 6775 } else { 6776 page_io_unlock(pp); 6777 } 6778 page_unlock(pp); 6779 continue; 6780 } 6781 ASSERT(!page_iolock_assert(pp)); 6782 6783 6784 /* 6785 * Skip if page is locked or has mappings. 6786 * We don't need the page_struct_lock to look at lckcnt 6787 * and cowcnt because the page is exclusive locked. 6788 */ 6789 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0 || 6790 hat_page_is_mapped(pp)) { 6791 page_unlock(pp); 6792 continue; 6793 } 6794 6795 /* 6796 * dispose skips large pages so try to demote first. 6797 */ 6798 if (pp->p_szc != 0 && !page_try_demote_pages(pp)) { 6799 page_unlock(pp); 6800 /* 6801 * XXX should skip the remaining page_t's of this 6802 * large page. 6803 */ 6804 continue; 6805 } 6806 6807 ASSERT(pp->p_szc == 0); 6808 6809 /* 6810 * No longer mapped -- we can toss it out. How 6811 * we do so depends on whether or not it's dirty. 6812 */ 6813 if (hat_ismod(pp) && pp->p_vnode) { 6814 /* 6815 * We must clean the page before it can be 6816 * freed. Setting B_FREE will cause pvn_done 6817 * to free the page when the i/o completes. 6818 * XXX: This also causes it to be accounted 6819 * as a pageout instead of a swap: need 6820 * B_SWAPOUT bit to use instead of B_FREE. 6821 * 6822 * Hold the vnode before releasing the page lock 6823 * to prevent it from being freed and re-used by 6824 * some other thread. 6825 */ 6826 VN_HOLD(vp); 6827 page_unlock(pp); 6828 6829 /* 6830 * Queue all i/o requests for the pageout thread 6831 * to avoid saturating the pageout devices. 6832 */ 6833 if (!queue_io_request(vp, off)) 6834 VN_RELE(vp); 6835 } else { 6836 /* 6837 * The page was clean, free it. 6838 * 6839 * XXX: Can we ever encounter modified pages 6840 * with no associated vnode here? 
6841 */ 6842 ASSERT(pp->p_vnode != NULL); 6843 /*LINTED: constant in conditional context*/ 6844 VN_DISPOSE(pp, B_FREE, 0, kcred); 6845 } 6846 6847 /* 6848 * Credit now even if i/o is in progress. 6849 */ 6850 pgcnt++; 6851 } 6852 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6853 6854 /* 6855 * Wakeup pageout to initiate i/o on all queued requests. 6856 */ 6857 cv_signal_pageout(); 6858 return (ptob(pgcnt)); 6859 } 6860 6861 /* 6862 * Synchronize primary storage cache with real object in virtual memory. 6863 * 6864 * XXX - Anonymous pages should not be sync'ed out at all. 6865 */ 6866 static int 6867 segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) 6868 { 6869 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 6870 struct vpage *vpp; 6871 page_t *pp; 6872 u_offset_t offset; 6873 struct vnode *vp; 6874 u_offset_t off; 6875 caddr_t eaddr; 6876 int bflags; 6877 int err = 0; 6878 int segtype; 6879 int pageprot; 6880 int prot; 6881 ulong_t anon_index; 6882 struct anon_map *amp; 6883 struct anon *ap; 6884 anon_sync_obj_t cookie; 6885 6886 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6887 6888 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 6889 6890 if (svd->softlockcnt > 0) { 6891 /* 6892 * flush all pages from seg cache 6893 * otherwise we may deadlock in swap_putpage 6894 * for B_INVAL page (4175402). 6895 * 6896 * Even if we grab segvn WRITER's lock or segp_slock 6897 * here, there might be another thread which could've 6898 * successfully performed lookup/insert just before 6899 * we acquired the lock here. So, grabbing either 6900 * lock here is of not much use. Until we devise 6901 * a strategy at upper layers to solve the 6902 * synchronization issues completely, we expect 6903 * applications to handle this appropriately. 6904 */ 6905 segvn_purge(seg); 6906 if (svd->softlockcnt > 0) { 6907 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6908 return (EAGAIN); 6909 } 6910 } 6911 6912 vpp = svd->vpage; 6913 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 6914 bflags = ((flags & MS_ASYNC) ? B_ASYNC : 0) | 6915 ((flags & MS_INVALIDATE) ? B_INVAL : 0); 6916 6917 if (attr) { 6918 pageprot = attr & ~(SHARED|PRIVATE); 6919 segtype = (attr & SHARED) ? MAP_SHARED : MAP_PRIVATE; 6920 6921 /* 6922 * We are done if the segment types don't match 6923 * or if we have segment level protections and 6924 * they don't match. 6925 */ 6926 if (svd->type != segtype) { 6927 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6928 return (0); 6929 } 6930 if (vpp == NULL) { 6931 if (svd->prot != pageprot) { 6932 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6933 return (0); 6934 } 6935 prot = svd->prot; 6936 } else 6937 vpp = &svd->vpage[seg_page(seg, addr)]; 6938 6939 } else if (svd->vp && svd->amp == NULL && 6940 (flags & MS_INVALIDATE) == 0) { 6941 6942 /* 6943 * No attributes, no anonymous pages and MS_INVALIDATE flag 6944 * is not on, just use one big request. 
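 *
 * For example, msync(addr, len, MS_ASYNC) over a file mapping that has no
 * anon pages (and no attr filter) collapses to a single
 * VOP_PUTPAGE(vp, offset, len, B_ASYNC, cred) call below instead of a
 * page-by-page walk.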
6945 */ 6946 err = VOP_PUTPAGE(svd->vp, (offset_t)offset, len, 6947 bflags, svd->cred); 6948 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 6949 return (err); 6950 } 6951 6952 if ((amp = svd->amp) != NULL) 6953 anon_index = svd->anon_index + seg_page(seg, addr); 6954 6955 for (eaddr = addr + len; addr < eaddr; addr += PAGESIZE) { 6956 ap = NULL; 6957 if (amp != NULL) { 6958 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 6959 anon_array_enter(amp, anon_index, &cookie); 6960 ap = anon_get_ptr(amp->ahp, anon_index++); 6961 if (ap != NULL) { 6962 swap_xlate(ap, &vp, &off); 6963 } else { 6964 vp = svd->vp; 6965 off = offset; 6966 } 6967 anon_array_exit(&cookie); 6968 ANON_LOCK_EXIT(&->a_rwlock); 6969 } else { 6970 vp = svd->vp; 6971 off = offset; 6972 } 6973 offset += PAGESIZE; 6974 6975 if (vp == NULL) /* untouched zfod page */ 6976 continue; 6977 6978 if (attr) { 6979 if (vpp) { 6980 prot = VPP_PROT(vpp); 6981 vpp++; 6982 } 6983 if (prot != pageprot) { 6984 continue; 6985 } 6986 } 6987 6988 /* 6989 * See if any of these pages are locked -- if so, then we 6990 * will have to truncate an invalidate request at the first 6991 * locked one. We don't need the page_struct_lock to test 6992 * as this is only advisory; even if we acquire it someone 6993 * might race in and lock the page after we unlock and before 6994 * we do the PUTPAGE, then PUTPAGE simply does nothing. 6995 */ 6996 if (flags & MS_INVALIDATE) { 6997 if ((pp = page_lookup(vp, off, SE_SHARED)) != NULL) { 6998 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 6999 page_unlock(pp); 7000 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7001 return (EBUSY); 7002 } 7003 if (ap != NULL && pp->p_szc != 0 && 7004 page_tryupgrade(pp)) { 7005 if (pp->p_lckcnt == 0 && 7006 pp->p_cowcnt == 0) { 7007 /* 7008 * swapfs VN_DISPOSE() won't 7009 * invalidate large pages. 7010 * Attempt to demote. 7011 * XXX can't help it if it 7012 * fails. But for swapfs 7013 * pages it is no big deal. 7014 */ 7015 (void) page_try_demote_pages( 7016 pp); 7017 } 7018 } 7019 page_unlock(pp); 7020 } 7021 } else if (svd->type == MAP_SHARED && amp != NULL) { 7022 /* 7023 * Avoid writting out to disk ISM's large pages 7024 * because segspt_free_pages() relies on NULL an_pvp 7025 * of anon slots of such pages. 7026 */ 7027 7028 ASSERT(svd->vp == NULL); 7029 /* 7030 * swapfs uses page_lookup_nowait if not freeing or 7031 * invalidating and skips a page if 7032 * page_lookup_nowait returns NULL. 7033 */ 7034 pp = page_lookup_nowait(vp, off, SE_SHARED); 7035 if (pp == NULL) { 7036 continue; 7037 } 7038 if (pp->p_szc != 0) { 7039 page_unlock(pp); 7040 continue; 7041 } 7042 7043 /* 7044 * Note ISM pages are created large so (vp, off)'s 7045 * page cannot suddenly become large after we unlock 7046 * pp. 7047 */ 7048 page_unlock(pp); 7049 } 7050 /* 7051 * XXX - Should ultimately try to kluster 7052 * calls to VOP_PUTPAGE() for performance. 7053 */ 7054 VN_HOLD(vp); 7055 err = VOP_PUTPAGE(vp, (offset_t)off, PAGESIZE, 7056 bflags, svd->cred); 7057 VN_RELE(vp); 7058 if (err) 7059 break; 7060 } 7061 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7062 return (err); 7063 } 7064 7065 /* 7066 * Determine if we have data corresponding to pages in the 7067 * primary storage virtual memory cache (i.e., "in core"). 
7068 */ 7069 static size_t 7070 segvn_incore(struct seg *seg, caddr_t addr, size_t len, char *vec) 7071 { 7072 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7073 struct vnode *vp, *avp; 7074 u_offset_t offset, aoffset; 7075 size_t p, ep; 7076 int ret; 7077 struct vpage *vpp; 7078 page_t *pp; 7079 uint_t start; 7080 struct anon_map *amp; /* XXX - for locknest */ 7081 struct anon *ap; 7082 uint_t attr; 7083 anon_sync_obj_t cookie; 7084 7085 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 7086 7087 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 7088 if (svd->amp == NULL && svd->vp == NULL) { 7089 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7090 bzero(vec, btopr(len)); 7091 return (len); /* no anonymous pages created yet */ 7092 } 7093 7094 p = seg_page(seg, addr); 7095 ep = seg_page(seg, addr + len); 7096 start = svd->vp ? SEG_PAGE_VNODEBACKED : 0; 7097 7098 amp = svd->amp; 7099 for (; p < ep; p++, addr += PAGESIZE) { 7100 vpp = (svd->vpage) ? &svd->vpage[p]: NULL; 7101 ret = start; 7102 ap = NULL; 7103 avp = NULL; 7104 /* Grab the vnode/offset for the anon slot */ 7105 if (amp != NULL) { 7106 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7107 anon_array_enter(amp, svd->anon_index + p, &cookie); 7108 ap = anon_get_ptr(amp->ahp, svd->anon_index + p); 7109 if (ap != NULL) { 7110 swap_xlate(ap, &avp, &aoffset); 7111 } 7112 anon_array_exit(&cookie); 7113 ANON_LOCK_EXIT(&->a_rwlock); 7114 } 7115 if ((avp != NULL) && page_exists(avp, aoffset)) { 7116 /* A page exists for the anon slot */ 7117 ret |= SEG_PAGE_INCORE; 7118 7119 /* 7120 * If page is mapped and writable 7121 */ 7122 attr = (uint_t)0; 7123 if ((hat_getattr(seg->s_as->a_hat, addr, 7124 &attr) != -1) && (attr & PROT_WRITE)) { 7125 ret |= SEG_PAGE_ANON; 7126 } 7127 /* 7128 * Don't get page_struct lock for lckcnt and cowcnt, 7129 * since this is purely advisory. 7130 */ 7131 if ((pp = page_lookup_nowait(avp, aoffset, 7132 SE_SHARED)) != NULL) { 7133 if (pp->p_lckcnt) 7134 ret |= SEG_PAGE_SOFTLOCK; 7135 if (pp->p_cowcnt) 7136 ret |= SEG_PAGE_HASCOW; 7137 page_unlock(pp); 7138 } 7139 } 7140 7141 /* Gather vnode statistics */ 7142 vp = svd->vp; 7143 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 7144 7145 if (vp != NULL) { 7146 /* 7147 * Try to obtain a "shared" lock on the page 7148 * without blocking. If this fails, determine 7149 * if the page is in memory. 7150 */ 7151 pp = page_lookup_nowait(vp, offset, SE_SHARED); 7152 if ((pp == NULL) && (page_exists(vp, offset))) { 7153 /* Page is incore, and is named */ 7154 ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE); 7155 } 7156 /* 7157 * Don't get page_struct lock for lckcnt and cowcnt, 7158 * since this is purely advisory. 7159 */ 7160 if (pp != NULL) { 7161 ret |= (SEG_PAGE_INCORE | SEG_PAGE_VNODE); 7162 if (pp->p_lckcnt) 7163 ret |= SEG_PAGE_SOFTLOCK; 7164 if (pp->p_cowcnt) 7165 ret |= SEG_PAGE_HASCOW; 7166 page_unlock(pp); 7167 } 7168 } 7169 7170 /* Gather virtual page information */ 7171 if (vpp) { 7172 if (VPP_ISPPLOCK(vpp)) 7173 ret |= SEG_PAGE_LOCKED; 7174 vpp++; 7175 } 7176 7177 *vec++ = (char)ret; 7178 } 7179 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7180 return (len); 7181 } 7182 7183 /* 7184 * Statement for p_cowcnts/p_lckcnts. 
7185 * 7186 * p_cowcnt is updated while mlock/munlocking MAP_PRIVATE and PROT_WRITE region 7187 * irrespective of the following factors or anything else: 7188 * 7189 * (1) anon slots are populated or not 7190 * (2) cow is broken or not 7191 * (3) refcnt on ap is 1 or greater than 1 7192 * 7193 * If it's not MAP_PRIVATE and PROT_WRITE, p_lckcnt is updated during mlock 7194 * and munlock. 7195 * 7196 * 7197 * Handling p_cowcnts/p_lckcnts during copy-on-write fault: 7198 * 7199 * if vpage has PROT_WRITE 7200 * transfer cowcnt on the oldpage -> cowcnt on the newpage 7201 * else 7202 * transfer lckcnt on the oldpage -> lckcnt on the newpage 7203 * 7204 * During copy-on-write, decrement p_cowcnt on the oldpage and increment 7205 * p_cowcnt on the newpage *if* the corresponding vpage has PROT_WRITE. 7206 * 7207 * We may also break COW if softlocking on read access in the physio case. 7208 * In this case, vpage may not have PROT_WRITE. So, we need to decrement 7209 * p_lckcnt on the oldpage and increment p_lckcnt on the newpage *if* the 7210 * vpage doesn't have PROT_WRITE. 7211 * 7212 * 7213 * Handling p_cowcnts/p_lckcnts during mprotect on mlocked region: 7214 * 7215 * If a MAP_PRIVATE region loses PROT_WRITE, we decrement p_cowcnt and 7216 * increment p_lckcnt by calling page_subclaim() which takes care of 7217 * availrmem accounting and p_lckcnt overflow. 7218 * 7219 * If a MAP_PRIVATE region gains PROT_WRITE, we decrement p_lckcnt and 7220 * increment p_cowcnt by calling page_addclaim() which takes care of 7221 * availrmem availability and p_cowcnt overflow. 7222 */ 7223 7224 /* 7225 * Lock down (or unlock) pages mapped by this segment. 7226 * 7227 * XXX only creates PAGESIZE pages if anon slots are not initialized. 7228 * At fault time they will be relocated into larger pages. 7229 */ 7230 static int 7231 segvn_lockop(struct seg *seg, caddr_t addr, size_t len, 7232 int attr, int op, ulong_t *lockmap, size_t pos) 7233 { 7234 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 7235 struct vpage *vpp; 7236 struct vpage *evp; 7237 page_t *pp; 7238 u_offset_t offset; 7239 u_offset_t off; 7240 int segtype; 7241 int pageprot; 7242 int claim; 7243 struct vnode *vp; 7244 ulong_t anon_index; 7245 struct anon_map *amp; 7246 struct anon *ap; 7247 struct vattr va; 7248 anon_sync_obj_t cookie; 7249 struct kshmid *sp = NULL; 7250 struct proc *p = curproc; 7251 kproject_t *proj = NULL; 7252 int chargeproc = 1; 7253 size_t locked_bytes = 0; 7254 size_t unlocked_bytes = 0; 7255 int err = 0; 7256 7257 /* 7258 * Hold write lock on address space because may split or concatenate 7259 * segments 7260 */ 7261 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 7262 7263 /* 7264 * If this is a shm, use shm's project and zone, else use 7265 * project and zone of calling process 7266 */ 7267 7268 /* Determine if this segment backs a sysV shm */ 7269 if (svd->amp != NULL && svd->amp->a_sp != NULL) { 7270 ASSERT(svd->type == MAP_SHARED); 7271 ASSERT(svd->tr_state == SEGVN_TR_OFF); 7272 sp = svd->amp->a_sp; 7273 proj = sp->shm_perm.ipc_proj; 7274 chargeproc = 0; 7275 } 7276 7277 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); 7278 if (attr) { 7279 pageprot = attr & ~(SHARED|PRIVATE); 7280 segtype = attr & SHARED ? MAP_SHARED : MAP_PRIVATE; 7281 7282 /* 7283 * We are done if the segment types don't match 7284 * or if we have segment level protections and 7285 * they don't match. 
7286 */ 7287 if (svd->type != segtype) { 7288 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7289 return (0); 7290 } 7291 if (svd->pageprot == 0 && svd->prot != pageprot) { 7292 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7293 return (0); 7294 } 7295 } 7296 7297 if (op == MC_LOCK) { 7298 if (svd->tr_state == SEGVN_TR_INIT) { 7299 svd->tr_state = SEGVN_TR_OFF; 7300 } else if (svd->tr_state == SEGVN_TR_ON) { 7301 ASSERT(svd->amp != NULL); 7302 segvn_textunrepl(seg, 0); 7303 ASSERT(svd->amp == NULL && 7304 svd->tr_state == SEGVN_TR_OFF); 7305 } 7306 } 7307 7308 /* 7309 * If we're locking, then we must create a vpage structure if 7310 * none exists. If we're unlocking, then check to see if there 7311 * is a vpage -- if not, then we could not have locked anything. 7312 */ 7313 7314 if ((vpp = svd->vpage) == NULL) { 7315 if (op == MC_LOCK) 7316 segvn_vpage(seg); 7317 else { 7318 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7319 return (0); 7320 } 7321 } 7322 7323 /* 7324 * The anonymous data vector (i.e., previously 7325 * unreferenced mapping to swap space) can be allocated 7326 * by lazily testing for its existence. 7327 */ 7328 if (op == MC_LOCK && svd->amp == NULL && svd->vp == NULL) { 7329 svd->amp = anonmap_alloc(seg->s_size, 0, ANON_SLEEP); 7330 svd->amp->a_szc = seg->s_szc; 7331 } 7332 7333 if ((amp = svd->amp) != NULL) { 7334 anon_index = svd->anon_index + seg_page(seg, addr); 7335 } 7336 7337 offset = svd->offset + (uintptr_t)(addr - seg->s_base); 7338 evp = &svd->vpage[seg_page(seg, addr + len)]; 7339 7340 if (sp != NULL) 7341 mutex_enter(&sp->shm_mlock); 7342 7343 /* determine number of unlocked bytes in range for lock operation */ 7344 if (op == MC_LOCK) { 7345 7346 if (sp == NULL) { 7347 for (vpp = &svd->vpage[seg_page(seg, addr)]; vpp < evp; 7348 vpp++) { 7349 if (!VPP_ISPPLOCK(vpp)) 7350 unlocked_bytes += PAGESIZE; 7351 } 7352 } else { 7353 ulong_t i_idx, i_edx; 7354 anon_sync_obj_t i_cookie; 7355 struct anon *i_ap; 7356 struct vnode *i_vp; 7357 u_offset_t i_off; 7358 7359 /* Only count sysV pages once for locked memory */ 7360 i_edx = svd->anon_index + seg_page(seg, addr + len); 7361 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 7362 for (i_idx = anon_index; i_idx < i_edx; i_idx++) { 7363 anon_array_enter(amp, i_idx, &i_cookie); 7364 i_ap = anon_get_ptr(amp->ahp, i_idx); 7365 if (i_ap == NULL) { 7366 unlocked_bytes += PAGESIZE; 7367 anon_array_exit(&i_cookie); 7368 continue; 7369 } 7370 swap_xlate(i_ap, &i_vp, &i_off); 7371 anon_array_exit(&i_cookie); 7372 pp = page_lookup(i_vp, i_off, SE_SHARED); 7373 if (pp == NULL) { 7374 unlocked_bytes += PAGESIZE; 7375 continue; 7376 } else if (pp->p_lckcnt == 0) 7377 unlocked_bytes += PAGESIZE; 7378 page_unlock(pp); 7379 } 7380 ANON_LOCK_EXIT(&->a_rwlock); 7381 } 7382 7383 mutex_enter(&p->p_lock); 7384 err = rctl_incr_locked_mem(p, proj, unlocked_bytes, 7385 chargeproc); 7386 mutex_exit(&p->p_lock); 7387 7388 if (err) { 7389 if (sp != NULL) 7390 mutex_exit(&sp->shm_mlock); 7391 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7392 return (err); 7393 } 7394 } 7395 /* 7396 * Loop over all pages in the range. Process if we're locking and 7397 * page has not already been locked in this mapping; or if we're 7398 * unlocking and the page has been locked. 
7399  */
7400 	for (vpp = &svd->vpage[seg_page(seg, addr)]; vpp < evp;
7401 	    vpp++, pos++, addr += PAGESIZE, offset += PAGESIZE, anon_index++) {
7402 		if ((attr == 0 || VPP_PROT(vpp) == pageprot) &&
7403 		    ((op == MC_LOCK && !VPP_ISPPLOCK(vpp)) ||
7404 		    (op == MC_UNLOCK && VPP_ISPPLOCK(vpp)))) {
7405 
7406 			if (amp != NULL)
7407 				ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
7408 			/*
7409 			 * If this isn't a MAP_NORESERVE segment and
7410 			 * we're locking, allocate anon slots if they
7411 			 * don't exist.  The page is brought in later on.
7412 			 */
7413 			if (op == MC_LOCK && svd->vp == NULL &&
7414 			    ((svd->flags & MAP_NORESERVE) == 0) &&
7415 			    amp != NULL &&
7416 			    ((ap = anon_get_ptr(amp->ahp, anon_index))
7417 			    == NULL)) {
7418 				anon_array_enter(amp, anon_index, &cookie);
7419 
7420 				if ((ap = anon_get_ptr(amp->ahp,
7421 				    anon_index)) == NULL) {
7422 					pp = anon_zero(seg, addr, &ap,
7423 					    svd->cred);
7424 					if (pp == NULL) {
7425 						anon_array_exit(&cookie);
7426 						ANON_LOCK_EXIT(&amp->a_rwlock);
7427 						err = ENOMEM;
7428 						goto out;
7429 					}
7430 					ASSERT(anon_get_ptr(amp->ahp,
7431 					    anon_index) == NULL);
7432 					(void) anon_set_ptr(amp->ahp,
7433 					    anon_index, ap, ANON_SLEEP);
7434 					page_unlock(pp);
7435 				}
7436 				anon_array_exit(&cookie);
7437 			}
7438 
7439 			/*
7440 			 * Get name for page, accounting for
7441 			 * existence of private copy.
7442 			 */
7443 			ap = NULL;
7444 			if (amp != NULL) {
7445 				anon_array_enter(amp, anon_index, &cookie);
7446 				ap = anon_get_ptr(amp->ahp, anon_index);
7447 				if (ap != NULL) {
7448 					swap_xlate(ap, &vp, &off);
7449 				} else {
7450 					if (svd->vp == NULL &&
7451 					    (svd->flags & MAP_NORESERVE)) {
7452 						anon_array_exit(&cookie);
7453 						ANON_LOCK_EXIT(&amp->a_rwlock);
7454 						continue;
7455 					}
7456 					vp = svd->vp;
7457 					off = offset;
7458 				}
7459 				anon_array_exit(&cookie);
7460 				ANON_LOCK_EXIT(&amp->a_rwlock);
7461 			} else {
7462 				vp = svd->vp;
7463 				off = offset;
7464 			}
7465 
7466 			/*
7467 			 * Get page frame.  It's ok if the page is
7468 			 * not available when we're unlocking, as this
7469 			 * may simply mean that a page we locked got
7470 			 * truncated out of existence after we locked it.
7471 			 *
7472 			 * Invoke VOP_GETPAGE() to obtain the page struct
7473 			 * since we may need to read it from disk if it's
7474 			 * been paged out.
7475 			 */
7476 			if (op != MC_LOCK)
7477 				pp = page_lookup(vp, off, SE_SHARED);
7478 			else {
7479 				page_t *pl[1 + 1];
7480 				int error;
7481 
7482 				ASSERT(vp != NULL);
7483 
7484 				error = VOP_GETPAGE(vp, (offset_t)off, PAGESIZE,
7485 				    (uint_t *)NULL, pl, PAGESIZE, seg, addr,
7486 				    S_OTHER, svd->cred);
7487 
7488 				/*
7489 				 * If the error is EDEADLK then we must bounce
7490 				 * up and drop all vm subsystem locks and then
7491 				 * retry the operation later.
7492 				 * This behavior is a temporary measure because
7493 				 * ufs/sds logging is badly designed and will
7494 				 * deadlock if we don't allow this bounce to
7495 				 * happen.  The real solution is to re-design
7496 				 * the logging code to work properly.  See bug
7497 				 * 4125102 for details of the problem.
7498 				 */
7499 				if (error == EDEADLK) {
7500 					err = error;
7501 					goto out;
7502 				}
7503 				/*
7504 				 * Quit if we fail to fault in the page.  Treat
7505 				 * the failure as an error, unless the addr
7506 				 * is mapped beyond the end of a file.
7507 */ 7508 if (error && svd->vp) { 7509 va.va_mask = AT_SIZE; 7510 if (VOP_GETATTR(svd->vp, &va, 0, 7511 svd->cred) != 0) { 7512 err = EIO; 7513 goto out; 7514 } 7515 if (btopr(va.va_size) >= 7516 btopr(off + 1)) { 7517 err = EIO; 7518 goto out; 7519 } 7520 goto out; 7521 7522 } else if (error) { 7523 err = EIO; 7524 goto out; 7525 } 7526 pp = pl[0]; 7527 ASSERT(pp != NULL); 7528 } 7529 7530 /* 7531 * See Statement at the beginning of this routine. 7532 * 7533 * claim is always set if MAP_PRIVATE and PROT_WRITE 7534 * irrespective of following factors: 7535 * 7536 * (1) anon slots are populated or not 7537 * (2) cow is broken or not 7538 * (3) refcnt on ap is 1 or greater than 1 7539 * 7540 * See 4140683 for details 7541 */ 7542 claim = ((VPP_PROT(vpp) & PROT_WRITE) && 7543 (svd->type == MAP_PRIVATE)); 7544 7545 /* 7546 * Perform page-level operation appropriate to 7547 * operation. If locking, undo the SOFTLOCK 7548 * performed to bring the page into memory 7549 * after setting the lock. If unlocking, 7550 * and no page was found, account for the claim 7551 * separately. 7552 */ 7553 if (op == MC_LOCK) { 7554 int ret = 1; /* Assume success */ 7555 7556 ASSERT(!VPP_ISPPLOCK(vpp)); 7557 7558 ret = page_pp_lock(pp, claim, 0); 7559 if (ret == 0) { 7560 /* locking page failed */ 7561 page_unlock(pp); 7562 err = EAGAIN; 7563 goto out; 7564 } 7565 VPP_SETPPLOCK(vpp); 7566 if (sp != NULL) { 7567 if (pp->p_lckcnt == 1) 7568 locked_bytes += PAGESIZE; 7569 } else 7570 locked_bytes += PAGESIZE; 7571 7572 if (lockmap != (ulong_t *)NULL) 7573 BT_SET(lockmap, pos); 7574 7575 page_unlock(pp); 7576 } else { 7577 ASSERT(VPP_ISPPLOCK(vpp)); 7578 if (pp != NULL) { 7579 /* sysV pages should be locked */ 7580 ASSERT(sp == NULL || pp->p_lckcnt > 0); 7581 page_pp_unlock(pp, claim, 0); 7582 if (sp != NULL) { 7583 if (pp->p_lckcnt == 0) 7584 unlocked_bytes 7585 += PAGESIZE; 7586 } else 7587 unlocked_bytes += PAGESIZE; 7588 page_unlock(pp); 7589 } else { 7590 ASSERT(sp == NULL); 7591 unlocked_bytes += PAGESIZE; 7592 } 7593 VPP_CLRPPLOCK(vpp); 7594 } 7595 } 7596 } 7597 out: 7598 if (op == MC_LOCK) { 7599 /* Credit back bytes that did not get locked */ 7600 if ((unlocked_bytes - locked_bytes) > 0) { 7601 if (proj == NULL) 7602 mutex_enter(&p->p_lock); 7603 rctl_decr_locked_mem(p, proj, 7604 (unlocked_bytes - locked_bytes), chargeproc); 7605 if (proj == NULL) 7606 mutex_exit(&p->p_lock); 7607 } 7608 7609 } else { 7610 /* Account bytes that were unlocked */ 7611 if (unlocked_bytes > 0) { 7612 if (proj == NULL) 7613 mutex_enter(&p->p_lock); 7614 rctl_decr_locked_mem(p, proj, unlocked_bytes, 7615 chargeproc); 7616 if (proj == NULL) 7617 mutex_exit(&p->p_lock); 7618 } 7619 } 7620 if (sp != NULL) 7621 mutex_exit(&sp->shm_mlock); 7622 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7623 7624 return (err); 7625 } 7626 7627 /* 7628 * Set advice from user for specified pages 7629 * There are 5 types of advice: 7630 * MADV_NORMAL - Normal (default) behavior (whatever that is) 7631 * MADV_RANDOM - Random page references 7632 * do not allow readahead or 'klustering' 7633 * MADV_SEQUENTIAL - Sequential page references 7634 * Pages previous to the one currently being 7635 * accessed (determined by fault) are 'not needed' 7636 * and are freed immediately 7637 * MADV_WILLNEED - Pages are likely to be used (fault ahead in mctl) 7638 * MADV_DONTNEED - Pages are not needed (synced out in mctl) 7639 * MADV_FREE - Contents can be discarded 7640 * MADV_ACCESS_DEFAULT- Default access 7641 * MADV_ACCESS_LWP - Next LWP will access heavily 7642 * 
MADV_ACCESS_MANY- Many LWPs or processes will access heavily
7643  */
7644 static int
7645 segvn_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav)
7646 {
7647 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
7648 	size_t page;
7649 	int err = 0;
7650 	int already_set;
7651 	struct anon_map *amp;
7652 	ulong_t anon_index;
7653 	struct seg *next;
7654 	lgrp_mem_policy_t policy;
7655 	struct seg *prev;
7656 	struct vnode *vp;
7657 
7658 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
7659 
7660 	/*
7661 	 * In case of MADV_FREE, we won't be modifying any segment private
7662 	 * data structures; so, we only need to grab READER's lock
7663 	 */
7664 	if (behav != MADV_FREE) {
7665 		SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER);
7666 		if (svd->tr_state != SEGVN_TR_OFF) {
7667 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7668 			return (0);
7669 		}
7670 	} else {
7671 		SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
7672 	}
7673 
7674 	/*
7675 	 * Large pages are assumed to be only turned on when accesses to the
7676 	 * segment's address range have spatial and temporal locality. That
7677 	 * justifies ignoring MADV_SEQUENTIAL for large page segments.
7678 	 * Also, ignore advice affecting lgroup memory allocation
7679 	 * if we don't need to do lgroup optimizations on this system
7680 	 */
7681 
7682 	if ((behav == MADV_SEQUENTIAL && seg->s_szc != 0) ||
7683 	    (!lgrp_optimizations() && (behav == MADV_ACCESS_DEFAULT ||
7684 	    behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY))) {
7685 		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7686 		return (0);
7687 	}
7688 
7689 	if (behav == MADV_SEQUENTIAL || behav == MADV_ACCESS_DEFAULT ||
7690 	    behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY) {
7691 		/*
7692 		 * Since we are going to unload hat mappings
7693 		 * we first have to flush the cache. Otherwise
7694 		 * this might lead to system panic if another
7695 		 * thread is doing physio on the range whose
7696 		 * mappings are unloaded by madvise(3C).
7697 		 */
7698 		if (svd->softlockcnt > 0) {
7699 			/*
7700 			 * Since we do have the segvn writers lock
7701 			 * nobody can fill the cache with entries
7702 			 * belonging to this seg during the purge.
7703 			 * The flush either succeeds or we still
7704 			 * have pending I/Os. In the latter case,
7705 			 * madvise(3C) fails.
7706 			 */
7707 			segvn_purge(seg);
7708 			if (svd->softlockcnt > 0) {
7709 				/*
7710 				 * Since madvise(3C) is advisory and
7711 				 * it's not part of UNIX98, madvise(3C)
7712 				 * failure here doesn't cause any hardship.
7713 				 * Note that we don't block in "as" layer.
7714 				 */
7715 				SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7716 				return (EAGAIN);
7717 			}
7718 		}
7719 	}
7720 
7721 	amp = svd->amp;
7722 	vp = svd->vp;
7723 	if (behav == MADV_FREE) {
7724 		/*
7725 		 * MADV_FREE is not supported for segments with
7726 		 * underlying object; if anonmap is NULL, anon slots
7727 		 * are not yet populated and there is nothing for
7728 		 * us to do. As MADV_FREE is advisory, we don't
7729 		 * return error in either case.
7730 		 */
7731 		if (vp != NULL || amp == NULL) {
7732 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7733 			return (0);
7734 		}
7735 
7736 		page = seg_page(seg, addr);
7737 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
7738 		anon_disclaim(amp, svd->anon_index + page, len, 0);
7739 		ANON_LOCK_EXIT(&amp->a_rwlock);
7740 		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
7741 		return (0);
7742 	}
7743 
7744 	/*
7745 	 * If advice is to be applied to entire segment,
7746 	 * use advice field in seg_data structure
7747 	 * otherwise use appropriate vpage entry.
7748 */ 7749 if ((addr == seg->s_base) && (len == seg->s_size)) { 7750 switch (behav) { 7751 case MADV_ACCESS_LWP: 7752 case MADV_ACCESS_MANY: 7753 case MADV_ACCESS_DEFAULT: 7754 /* 7755 * Set memory allocation policy for this segment 7756 */ 7757 policy = lgrp_madv_to_policy(behav, len, svd->type); 7758 if (svd->type == MAP_SHARED) 7759 already_set = lgrp_shm_policy_set(policy, amp, 7760 svd->anon_index, vp, svd->offset, len); 7761 else { 7762 /* 7763 * For private memory, need writers lock on 7764 * address space because the segment may be 7765 * split or concatenated when changing policy 7766 */ 7767 if (AS_READ_HELD(seg->s_as, 7768 &seg->s_as->a_lock)) { 7769 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7770 return (IE_RETRY); 7771 } 7772 7773 already_set = lgrp_privm_policy_set(policy, 7774 &svd->policy_info, len); 7775 } 7776 7777 /* 7778 * If policy set already and it shouldn't be reapplied, 7779 * don't do anything. 7780 */ 7781 if (already_set && 7782 !LGRP_MEM_POLICY_REAPPLICABLE(policy)) 7783 break; 7784 7785 /* 7786 * Mark any existing pages in given range for 7787 * migration 7788 */ 7789 page_mark_migrate(seg, addr, len, amp, svd->anon_index, 7790 vp, svd->offset, 1); 7791 7792 /* 7793 * If same policy set already or this is a shared 7794 * memory segment, don't need to try to concatenate 7795 * segment with adjacent ones. 7796 */ 7797 if (already_set || svd->type == MAP_SHARED) 7798 break; 7799 7800 /* 7801 * Try to concatenate this segment with previous 7802 * one and next one, since we changed policy for 7803 * this one and it may be compatible with adjacent 7804 * ones now. 7805 */ 7806 prev = AS_SEGPREV(seg->s_as, seg); 7807 next = AS_SEGNEXT(seg->s_as, seg); 7808 7809 if (next && next->s_ops == &segvn_ops && 7810 addr + len == next->s_base) 7811 (void) segvn_concat(seg, next, 1); 7812 7813 if (prev && prev->s_ops == &segvn_ops && 7814 addr == prev->s_base + prev->s_size) { 7815 /* 7816 * Drop lock for private data of current 7817 * segment before concatenating (deleting) it 7818 * and return IE_REATTACH to tell as_ctl() that 7819 * current segment has changed 7820 */ 7821 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7822 if (!segvn_concat(prev, seg, 1)) 7823 err = IE_REATTACH; 7824 7825 return (err); 7826 } 7827 break; 7828 7829 case MADV_SEQUENTIAL: 7830 /* 7831 * unloading mapping guarantees 7832 * detection in segvn_fault 7833 */ 7834 ASSERT(seg->s_szc == 0); 7835 hat_unload(seg->s_as->a_hat, addr, len, 7836 HAT_UNLOAD); 7837 /* FALLTHROUGH */ 7838 case MADV_NORMAL: 7839 case MADV_RANDOM: 7840 svd->advice = (uchar_t)behav; 7841 svd->pageadvice = 0; 7842 break; 7843 case MADV_WILLNEED: /* handled in memcntl */ 7844 case MADV_DONTNEED: /* handled in memcntl */ 7845 case MADV_FREE: /* handled above */ 7846 break; 7847 default: 7848 err = EINVAL; 7849 } 7850 } else { 7851 caddr_t eaddr; 7852 struct seg *new_seg; 7853 struct segvn_data *new_svd; 7854 u_offset_t off; 7855 caddr_t oldeaddr; 7856 7857 page = seg_page(seg, addr); 7858 7859 segvn_vpage(seg); 7860 7861 switch (behav) { 7862 struct vpage *bvpp, *evpp; 7863 7864 case MADV_ACCESS_LWP: 7865 case MADV_ACCESS_MANY: 7866 case MADV_ACCESS_DEFAULT: 7867 /* 7868 * Set memory allocation policy for portion of this 7869 * segment 7870 */ 7871 7872 /* 7873 * Align address and length of advice to page 7874 * boundaries for large pages 7875 */ 7876 if (seg->s_szc != 0) { 7877 size_t pgsz; 7878 7879 pgsz = page_get_pagesize(seg->s_szc); 7880 addr = (caddr_t)P2ALIGN((uintptr_t)addr, pgsz); 7881 len = P2ROUNDUP(len, pgsz); 7882 } 7883 7884 
/* 7885 * Check to see whether policy is set already 7886 */ 7887 policy = lgrp_madv_to_policy(behav, len, svd->type); 7888 7889 anon_index = svd->anon_index + page; 7890 off = svd->offset + (uintptr_t)(addr - seg->s_base); 7891 7892 if (svd->type == MAP_SHARED) 7893 already_set = lgrp_shm_policy_set(policy, amp, 7894 anon_index, vp, off, len); 7895 else 7896 already_set = 7897 (policy == svd->policy_info.mem_policy); 7898 7899 /* 7900 * If policy set already and it shouldn't be reapplied, 7901 * don't do anything. 7902 */ 7903 if (already_set && 7904 !LGRP_MEM_POLICY_REAPPLICABLE(policy)) 7905 break; 7906 7907 /* 7908 * For private memory, need writers lock on 7909 * address space because the segment may be 7910 * split or concatenated when changing policy 7911 */ 7912 if (svd->type == MAP_PRIVATE && 7913 AS_READ_HELD(seg->s_as, &seg->s_as->a_lock)) { 7914 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 7915 return (IE_RETRY); 7916 } 7917 7918 /* 7919 * Mark any existing pages in given range for 7920 * migration 7921 */ 7922 page_mark_migrate(seg, addr, len, amp, svd->anon_index, 7923 vp, svd->offset, 1); 7924 7925 /* 7926 * Don't need to try to split or concatenate 7927 * segments, since policy is same or this is a shared 7928 * memory segment 7929 */ 7930 if (already_set || svd->type == MAP_SHARED) 7931 break; 7932 7933 /* 7934 * Split off new segment if advice only applies to a 7935 * portion of existing segment starting in middle 7936 */ 7937 new_seg = NULL; 7938 eaddr = addr + len; 7939 oldeaddr = seg->s_base + seg->s_size; 7940 if (addr > seg->s_base) { 7941 /* 7942 * Must flush I/O page cache 7943 * before splitting segment 7944 */ 7945 if (svd->softlockcnt > 0) 7946 segvn_purge(seg); 7947 7948 /* 7949 * Split segment and return IE_REATTACH to tell 7950 * as_ctl() that current segment changed 7951 */ 7952 new_seg = segvn_split_seg(seg, addr); 7953 new_svd = (struct segvn_data *)new_seg->s_data; 7954 err = IE_REATTACH; 7955 7956 /* 7957 * If new segment ends where old one 7958 * did, try to concatenate the new 7959 * segment with next one. 7960 */ 7961 if (eaddr == oldeaddr) { 7962 /* 7963 * Set policy for new segment 7964 */ 7965 (void) lgrp_privm_policy_set(policy, 7966 &new_svd->policy_info, 7967 new_seg->s_size); 7968 7969 next = AS_SEGNEXT(new_seg->s_as, 7970 new_seg); 7971 7972 if (next && 7973 next->s_ops == &segvn_ops && 7974 eaddr == next->s_base) 7975 (void) segvn_concat(new_seg, 7976 next, 1); 7977 } 7978 } 7979 7980 /* 7981 * Split off end of existing segment if advice only 7982 * applies to a portion of segment ending before 7983 * end of the existing segment 7984 */ 7985 if (eaddr < oldeaddr) { 7986 /* 7987 * Must flush I/O page cache 7988 * before splitting segment 7989 */ 7990 if (svd->softlockcnt > 0) 7991 segvn_purge(seg); 7992 7993 /* 7994 * If beginning of old segment was already 7995 * split off, use new segment to split end off 7996 * from. 
7997 */ 7998 if (new_seg != NULL && new_seg != seg) { 7999 /* 8000 * Split segment 8001 */ 8002 (void) segvn_split_seg(new_seg, eaddr); 8003 8004 /* 8005 * Set policy for new segment 8006 */ 8007 (void) lgrp_privm_policy_set(policy, 8008 &new_svd->policy_info, 8009 new_seg->s_size); 8010 } else { 8011 /* 8012 * Split segment and return IE_REATTACH 8013 * to tell as_ctl() that current 8014 * segment changed 8015 */ 8016 (void) segvn_split_seg(seg, eaddr); 8017 err = IE_REATTACH; 8018 8019 (void) lgrp_privm_policy_set(policy, 8020 &svd->policy_info, seg->s_size); 8021 8022 /* 8023 * If new segment starts where old one 8024 * did, try to concatenate it with 8025 * previous segment. 8026 */ 8027 if (addr == seg->s_base) { 8028 prev = AS_SEGPREV(seg->s_as, 8029 seg); 8030 8031 /* 8032 * Drop lock for private data 8033 * of current segment before 8034 * concatenating (deleting) it 8035 */ 8036 if (prev && 8037 prev->s_ops == 8038 &segvn_ops && 8039 addr == prev->s_base + 8040 prev->s_size) { 8041 SEGVN_LOCK_EXIT( 8042 seg->s_as, 8043 &svd->lock); 8044 (void) segvn_concat( 8045 prev, seg, 1); 8046 return (err); 8047 } 8048 } 8049 } 8050 } 8051 break; 8052 case MADV_SEQUENTIAL: 8053 ASSERT(seg->s_szc == 0); 8054 hat_unload(seg->s_as->a_hat, addr, len, HAT_UNLOAD); 8055 /* FALLTHROUGH */ 8056 case MADV_NORMAL: 8057 case MADV_RANDOM: 8058 bvpp = &svd->vpage[page]; 8059 evpp = &svd->vpage[page + (len >> PAGESHIFT)]; 8060 for (; bvpp < evpp; bvpp++) 8061 VPP_SETADVICE(bvpp, behav); 8062 svd->advice = MADV_NORMAL; 8063 break; 8064 case MADV_WILLNEED: /* handled in memcntl */ 8065 case MADV_DONTNEED: /* handled in memcntl */ 8066 case MADV_FREE: /* handled above */ 8067 break; 8068 default: 8069 err = EINVAL; 8070 } 8071 } 8072 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8073 return (err); 8074 } 8075 8076 /* 8077 * Create a vpage structure for this seg. 8078 */ 8079 static void 8080 segvn_vpage(struct seg *seg) 8081 { 8082 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 8083 struct vpage *vp, *evp; 8084 8085 ASSERT(SEGVN_WRITE_HELD(seg->s_as, &svd->lock)); 8086 8087 /* 8088 * If no vpage structure exists, allocate one. Copy the protections 8089 * and the advice from the segment itself to the individual pages. 8090 */ 8091 if (svd->vpage == NULL) { 8092 svd->pageprot = 1; 8093 svd->pageadvice = 1; 8094 svd->vpage = kmem_zalloc(seg_pages(seg) * sizeof (struct vpage), 8095 KM_SLEEP); 8096 evp = &svd->vpage[seg_page(seg, seg->s_base + seg->s_size)]; 8097 for (vp = svd->vpage; vp < evp; vp++) { 8098 VPP_SETPROT(vp, svd->prot); 8099 VPP_SETADVICE(vp, svd->advice); 8100 } 8101 } 8102 } 8103 8104 /* 8105 * Dump the pages belonging to this segvn segment. 
8106  */
8107 static void
8108 segvn_dump(struct seg *seg)
8109 {
8110 	struct segvn_data *svd;
8111 	page_t *pp;
8112 	struct anon_map *amp;
8113 	ulong_t	anon_index;
8114 	struct vnode *vp;
8115 	u_offset_t off, offset;
8116 	pfn_t pfn;
8117 	pgcnt_t page, npages;
8118 	caddr_t addr;
8119 
8120 	npages = seg_pages(seg);
8121 	svd = (struct segvn_data *)seg->s_data;
8122 	vp = svd->vp;
8123 	off = offset = svd->offset;
8124 	addr = seg->s_base;
8125 
8126 	if ((amp = svd->amp) != NULL) {
8127 		anon_index = svd->anon_index;
8128 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
8129 	}
8130 
8131 	for (page = 0; page < npages; page++, offset += PAGESIZE) {
8132 		struct anon *ap;
8133 		int we_own_it = 0;
8134 
8135 		if (amp && (ap = anon_get_ptr(svd->amp->ahp, anon_index++))) {
8136 			swap_xlate_nopanic(ap, &vp, &off);
8137 		} else {
8138 			vp = svd->vp;
8139 			off = offset;
8140 		}
8141 
8142 		/*
8143 		 * If pp == NULL, the page either does not exist
8144 		 * or is exclusively locked. So determine if it
8145 		 * exists before searching for it.
8146 		 */
8147 
8148 		if ((pp = page_lookup_nowait(vp, off, SE_SHARED)))
8149 			we_own_it = 1;
8150 		else
8151 			pp = page_exists(vp, off);
8152 
8153 		if (pp) {
8154 			pfn = page_pptonum(pp);
8155 			dump_addpage(seg->s_as, addr, pfn);
8156 			if (we_own_it)
8157 				page_unlock(pp);
8158 		}
8159 		addr += PAGESIZE;
8160 		dump_timeleft = dump_timeout;
8161 	}
8162 
8163 	if (amp != NULL)
8164 		ANON_LOCK_EXIT(&amp->a_rwlock);
8165 }
8166 
8167 /*
8168  * lock/unlock anon pages over a given range. Return shadow list.
8169  */
8170 static int
8171 segvn_pagelock(struct seg *seg, caddr_t addr, size_t len, struct page ***ppp,
8172 	enum lock_type type, enum seg_rw rw)
8173 {
8174 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
8175 	size_t np, adjustpages = 0, npages = (len >> PAGESHIFT);
8176 	ulong_t anon_index;
8177 	uint_t protchk;
8178 	uint_t error;
8179 	struct anon_map *amp;
8180 	struct page **pplist, **pl, *pp;
8181 	caddr_t a;
8182 	size_t page;
8183 	caddr_t lpgaddr, lpgeaddr;
8184 	pgcnt_t szc0_npages = 0;
8185 
8186 	TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_START,
8187 	    "segvn_pagelock: start seg %p addr %p", seg, addr);
8188 
8189 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
8190 	if (seg->s_szc != 0 && (type == L_PAGELOCK || type == L_PAGEUNLOCK)) {
8191 		/*
8192 		 * We are adjusting the pagelock region to the large page size
8193 		 * boundary because the unlocked part of a large page cannot
8194 		 * be freed anyway unless all constituent pages of a large
8195 		 * page are locked. Therefore this adjustment allows us to
8196 		 * decrement availrmem by the right value (note we don't want
8197 		 * to just decrement availrmem by the large page size without
8198 		 * adjusting addr and len because then we may end up
8199 		 * decrementing availrmem by large page size for every
8200 		 * constituent page locked by a new as_pagelock call).
8201 		 * as_pageunlock caller must always match as_pagelock call's
8202 		 * addr and len.
8203 		 *
8204 		 * Note segment's page size cannot change while we are holding
8205 		 * as lock. And then it cannot change while softlockcnt is
8206 		 * not 0. This will allow us to correctly recalculate large
8207 		 * page size region for the matching pageunlock/reclaim call.
8208 		 *
8209 		 * For pageunlock *ppp points to the pointer of page_t that
8210 		 * corresponds to the real unadjusted start address. Similarly,
8211 		 * for pagelock *ppp must point to the pointer of page_t that
8212 		 * corresponds to the real unadjusted start address.
8213 */ 8214 size_t pgsz = page_get_pagesize(seg->s_szc); 8215 CALC_LPG_REGION(pgsz, seg, addr, len, lpgaddr, lpgeaddr); 8216 adjustpages = ((uintptr_t)(addr - lpgaddr)) >> PAGESHIFT; 8217 } 8218 8219 if (type == L_PAGEUNLOCK) { 8220 8221 /* 8222 * update hat ref bits for /proc. We need to make sure 8223 * that threads tracing the ref and mod bits of the 8224 * address space get the right data. 8225 * Note: page ref and mod bits are updated at reclaim time 8226 */ 8227 if (seg->s_as->a_vbits) { 8228 for (a = addr; a < addr + len; a += PAGESIZE) { 8229 if (rw == S_WRITE) { 8230 hat_setstat(seg->s_as, a, 8231 PAGESIZE, P_REF | P_MOD); 8232 } else { 8233 hat_setstat(seg->s_as, a, 8234 PAGESIZE, P_REF); 8235 } 8236 } 8237 } 8238 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 8239 if (seg->s_szc != 0) { 8240 VM_STAT_ADD(segvnvmstats.pagelock[0]); 8241 seg_pinactive(seg, lpgaddr, lpgeaddr - lpgaddr, 8242 *ppp - adjustpages, rw, segvn_reclaim); 8243 } else { 8244 seg_pinactive(seg, addr, len, *ppp, rw, segvn_reclaim); 8245 } 8246 8247 /* 8248 * If someone is blocked while unmapping, we purge 8249 * segment page cache and thus reclaim pplist synchronously 8250 * without waiting for seg_pasync_thread. This speeds up 8251 * unmapping in cases where munmap(2) is called, while 8252 * raw async i/o is still in progress or where a thread 8253 * exits on data fault in a multithreaded application. 8254 */ 8255 if (AS_ISUNMAPWAIT(seg->s_as) && (svd->softlockcnt > 0)) { 8256 /* 8257 * Even if we grab segvn WRITER's lock or segp_slock 8258 * here, there might be another thread which could've 8259 * successfully performed lookup/insert just before 8260 * we acquired the lock here. So, grabbing either 8261 * lock here is of not much use. Until we devise 8262 * a strategy at upper layers to solve the 8263 * synchronization issues completely, we expect 8264 * applications to handle this appropriately. 8265 */ 8266 segvn_purge(seg); 8267 } 8268 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8269 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_UNLOCK_END, 8270 "segvn_pagelock: unlock seg %p addr %p", seg, addr); 8271 return (0); 8272 } else if (type == L_PAGERECLAIM) { 8273 VM_STAT_COND_ADD(seg->s_szc != 0, segvnvmstats.pagelock[1]); 8274 SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER); 8275 (void) segvn_reclaim(seg, addr, len, *ppp, rw); 8276 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 8277 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_UNLOCK_END, 8278 "segvn_pagelock: reclaim seg %p addr %p", seg, addr); 8279 return (0); 8280 } 8281 8282 if (seg->s_szc != 0) { 8283 VM_STAT_ADD(segvnvmstats.pagelock[2]); 8284 addr = lpgaddr; 8285 len = lpgeaddr - lpgaddr; 8286 npages = (len >> PAGESHIFT); 8287 } 8288 8289 /* 8290 * for now we only support pagelock to anon memory. We've to check 8291 * protections for vnode objects and call into the vnode driver. 8292 * That's too much for a fast path. Let the fault entry point handle it. 8293 */ 8294 if (svd->vp != NULL) { 8295 TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END, 8296 "segvn_pagelock: mapped vnode seg %p addr %p", seg, addr); 8297 *ppp = NULL; 8298 return (ENOTSUP); 8299 } 8300 8301 /* 8302 * if anonmap is not yet created, let the fault entry point populate it 8303 * with anon ptrs. 
8304  */
8305 	if ((amp = svd->amp) == NULL) {
8306 		TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END,
8307 		    "segvn_pagelock: anonmap null seg %p addr %p", seg, addr);
8308 		*ppp = NULL;
8309 		return (EFAULT);
8310 	}
8311 
8312 	SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
8313 
8314 	/*
8315 	 * we acquire segp_slock to prevent duplicate entries
8316 	 * in seg_pcache
8317 	 */
8318 	mutex_enter(&svd->segp_slock);
8319 
8320 	/*
8321 	 * try to find pages in segment page cache
8322 	 */
8323 	pplist = seg_plookup(seg, addr, len, rw);
8324 	if (pplist != NULL) {
8325 		mutex_exit(&svd->segp_slock);
8326 		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
8327 		*ppp = pplist + adjustpages;
8328 		TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_HIT_END,
8329 		    "segvn_pagelock: cache hit seg %p addr %p", seg, addr);
8330 		return (0);
8331 	}
8332 
8333 	if (rw == S_READ) {
8334 		protchk = PROT_READ;
8335 	} else {
8336 		protchk = PROT_WRITE;
8337 	}
8338 
8339 	if (svd->pageprot == 0) {
8340 		if ((svd->prot & protchk) == 0) {
8341 			mutex_exit(&svd->segp_slock);
8342 			error = EFAULT;
8343 			goto out;
8344 		}
8345 	} else {
8346 		/*
8347 		 * check page protections
8348 		 */
8349 		for (a = addr; a < addr + len; a += PAGESIZE) {
8350 			struct vpage *vp;
8351 
8352 			vp = &svd->vpage[seg_page(seg, a)];
8353 			if ((VPP_PROT(vp) & protchk) == 0) {
8354 				mutex_exit(&svd->segp_slock);
8355 				error = EFAULT;
8356 				goto out;
8357 			}
8358 		}
8359 	}
8360 
8361 	/*
8362 	 * Avoid per page overhead of segvn_pp_lock_anonpages() for small
8363 	 * pages. For large pages segvn_pp_lock_anonpages() only does real
8364 	 * work once per large page. The tradeoff is that we may decrement
8365 	 * availrmem more than once for the same page but this is ok
8366 	 * for small pages.
8367 	 */
8368 	if (seg->s_szc == 0) {
8369 		mutex_enter(&freemem_lock);
8370 		if (availrmem < tune.t_minarmem + npages) {
8371 			mutex_exit(&freemem_lock);
8372 			mutex_exit(&svd->segp_slock);
8373 			error = ENOMEM;
8374 			goto out;
8375 		}
8376 		availrmem -= npages;
8377 		mutex_exit(&freemem_lock);
8378 	}
8379 
8380 	pplist = kmem_alloc(sizeof (page_t *) * npages, KM_SLEEP);
8381 	pl = pplist;
8382 	*ppp = pplist + adjustpages;
8383 
8384 	page = seg_page(seg, addr);
8385 	anon_index = svd->anon_index + page;
8386 
8387 	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
8388 	for (a = addr; a < addr + len; a += PAGESIZE, anon_index++) {
8389 		struct anon *ap;
8390 		struct vnode *vp;
8391 		u_offset_t off;
8392 		anon_sync_obj_t cookie;
8393 
8394 		anon_array_enter(amp, anon_index, &cookie);
8395 		ap = anon_get_ptr(amp->ahp, anon_index);
8396 		if (ap == NULL) {
8397 			anon_array_exit(&cookie);
8398 			break;
8399 		} else {
8400 			/*
8401 			 * We must never use seg_pcache for COW pages
8402 			 * because we might end up with original page still
8403 			 * lying in seg_pcache even after private page is
8404 			 * created. This leads to data corruption as
8405 			 * aio_write refers to the page still in cache
8406 			 * while all other accesses refer to the private
8407 			 * page.
8408 			 */
8409 			if (ap->an_refcnt != 1) {
8410 				anon_array_exit(&cookie);
8411 				break;
8412 			}
8413 		}
8414 		swap_xlate(ap, &vp, &off);
8415 		anon_array_exit(&cookie);
8416 
8417 		pp = page_lookup_nowait(vp, off, SE_SHARED);
8418 		if (pp == NULL) {
8419 			break;
8420 		}
8421 		if (seg->s_szc != 0 || pp->p_szc != 0) {
8422 			if (!segvn_pp_lock_anonpages(pp, a == addr)) {
8423 				page_unlock(pp);
8424 				break;
8425 			}
8426 		} else {
8427 			szc0_npages++;
8428 		}
8429 		*pplist++ = pp;
8430 	}
8431 	ANON_LOCK_EXIT(&amp->a_rwlock);
8432 
8433 	ASSERT(npages >= szc0_npages);
8434 
8435 	if (a >= addr + len) {
8436 		mutex_enter(&freemem_lock);
8437 		if (seg->s_szc == 0 && npages != szc0_npages) {
8438 			ASSERT(svd->type == MAP_SHARED && amp->a_szc > 0);
8439 			availrmem += (npages - szc0_npages);
8440 		}
8441 		svd->softlockcnt += npages;
8442 		segvn_pages_locked += npages;
8443 		mutex_exit(&freemem_lock);
8444 		(void) seg_pinsert(seg, addr, len, pl, rw, SEGP_ASYNC_FLUSH,
8445 		    segvn_reclaim);
8446 		mutex_exit(&svd->segp_slock);
8447 		SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
8448 		TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_FILL_END,
8449 		    "segvn_pagelock: cache fill seg %p addr %p", seg, addr);
8450 		return (0);
8451 	}
8452 
8453 	mutex_exit(&svd->segp_slock);
8454 	if (seg->s_szc == 0) {
8455 		mutex_enter(&freemem_lock);
8456 		availrmem += npages;
8457 		mutex_exit(&freemem_lock);
8458 	}
8459 	error = EFAULT;
8460 	pplist = pl;
8461 	np = ((uintptr_t)(a - addr)) >> PAGESHIFT;
8462 	while (np > (uint_t)0) {
8463 		ASSERT(PAGE_LOCKED(*pplist));
8464 		if (seg->s_szc != 0 || (*pplist)->p_szc != 0) {
8465 			segvn_pp_unlock_anonpages(*pplist, pplist == pl);
8466 		}
8467 		page_unlock(*pplist);
8468 		np--;
8469 		pplist++;
8470 	}
8471 	kmem_free(pl, sizeof (page_t *) * npages);
8472 out:
8473 	SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
8474 	*ppp = NULL;
8475 	TRACE_2(TR_FAC_PHYSIO, TR_PHYSIO_SEGVN_MISS_END,
8476 	    "segvn_pagelock: cache miss seg %p addr %p", seg, addr);
8477 	return (error);
8478 }
8479 
8480 /*
8481  * purge any cached pages in the I/O page cache
8482  */
8483 static void
8484 segvn_purge(struct seg *seg)
8485 {
8486 	seg_ppurge(seg);
8487 }
8488 
8489 static int
8490 segvn_reclaim(struct seg *seg, caddr_t addr, size_t len, struct page **pplist,
8491 	enum seg_rw rw)
8492 {
8493 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
8494 	pgcnt_t np, npages;
8495 	struct page **pl;
8496 	pgcnt_t szc0_npages = 0;
8497 
8498 #ifdef lint
8499 	addr = addr;
8500 #endif
8501 
8502 	npages = np = (len >> PAGESHIFT);
8503 	ASSERT(npages);
8504 	pl = pplist;
8505 	if (seg->s_szc != 0) {
8506 		size_t pgsz = page_get_pagesize(seg->s_szc);
8507 		if (!IS_P2ALIGNED(addr, pgsz) || !IS_P2ALIGNED(len, pgsz)) {
8508 			panic("segvn_reclaim: unaligned addr or len");
8509 			/*NOTREACHED*/
8510 		}
8511 	}
8512 
8513 	ASSERT(svd->vp == NULL && svd->amp != NULL);
8514 
8515 	while (np > (uint_t)0) {
8516 		if (rw == S_WRITE) {
8517 			hat_setrefmod(*pplist);
8518 		} else {
8519 			hat_setref(*pplist);
8520 		}
8521 		if (seg->s_szc != 0 || (*pplist)->p_szc != 0) {
8522 			segvn_pp_unlock_anonpages(*pplist, pplist == pl);
8523 		} else {
8524 			szc0_npages++;
8525 		}
8526 		page_unlock(*pplist);
8527 		np--;
8528 		pplist++;
8529 	}
8530 	kmem_free(pl, sizeof (page_t *) * npages);
8531 
8532 	mutex_enter(&freemem_lock);
8533 	segvn_pages_locked -= npages;
8534 	svd->softlockcnt -= npages;
8535 	if (szc0_npages != 0) {
8536 		availrmem += szc0_npages;
8537 	}
8538 	mutex_exit(&freemem_lock);
8539 	if (svd->softlockcnt <= 0) {
8540 		if (AS_ISUNMAPWAIT(seg->s_as)) {
8541 			mutex_enter(&seg->s_as->a_contents);
8542 			if (AS_ISUNMAPWAIT(seg->s_as)) {
8543 				AS_CLRUNMAPWAIT(seg->s_as);
8544 				cv_broadcast(&seg->s_as->a_cv);
8545 			}
8546 			mutex_exit(&seg->s_as->a_contents);
8547 		}
8548 	}
8549 	return (0);
8550 }
8551 /*
8552  * get a memory ID for an addr in a given segment
8553  *
8554  * XXX only creates PAGESIZE pages if anon slots are not initialized.
8555  * At fault time they will be relocated into larger pages.
8556  */
8557 static int
8558 segvn_getmemid(struct seg *seg, caddr_t addr, memid_t *memidp)
8559 {
8560 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
8561 	struct anon *ap = NULL;
8562 	ulong_t anon_index;
8563 	struct anon_map *amp;
8564 	anon_sync_obj_t cookie;
8565 
8566 	if (svd->type == MAP_PRIVATE) {
8567 		memidp->val[0] = (uintptr_t)seg->s_as;
8568 		memidp->val[1] = (uintptr_t)addr;
8569 		return (0);
8570 	}
8571 
8572 	if (svd->type == MAP_SHARED) {
8573 		if (svd->vp) {
8574 			memidp->val[0] = (uintptr_t)svd->vp;
8575 			memidp->val[1] = (u_longlong_t)svd->offset +
8576 			    (uintptr_t)(addr - seg->s_base);
8577 			return (0);
8578 		} else {
8579 
8580 			SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_READER);
8581 			if ((amp = svd->amp) != NULL) {
8582 				anon_index = svd->anon_index +
8583 				    seg_page(seg, addr);
8584 			}
8585 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
8586 
8587 			ASSERT(amp != NULL);
8588 
8589 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
8590 			anon_array_enter(amp, anon_index, &cookie);
8591 			ap = anon_get_ptr(amp->ahp, anon_index);
8592 			if (ap == NULL) {
8593 				page_t *pp;
8594 
8595 				pp = anon_zero(seg, addr, &ap, svd->cred);
8596 				if (pp == NULL) {
8597 					anon_array_exit(&cookie);
8598 					ANON_LOCK_EXIT(&amp->a_rwlock);
8599 					return (ENOMEM);
8600 				}
8601 				ASSERT(anon_get_ptr(amp->ahp, anon_index)
8602 				    == NULL);
8603 				(void) anon_set_ptr(amp->ahp, anon_index,
8604 				    ap, ANON_SLEEP);
8605 				page_unlock(pp);
8606 			}
8607 
8608 			anon_array_exit(&cookie);
8609 			ANON_LOCK_EXIT(&amp->a_rwlock);
8610 
8611 			memidp->val[0] = (uintptr_t)ap;
8612 			memidp->val[1] = (uintptr_t)addr & PAGEOFFSET;
8613 			return (0);
8614 		}
8615 	}
8616 	return (EINVAL);
8617 }
8618 
8619 static int
8620 sameprot(struct seg *seg, caddr_t a, size_t len)
8621 {
8622 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
8623 	struct vpage *vpage;
8624 	spgcnt_t pages = btop(len);
8625 	uint_t prot;
8626 
8627 	if (svd->pageprot == 0)
8628 		return (1);
8629 
8630 	ASSERT(svd->vpage != NULL);
8631 
8632 	vpage = &svd->vpage[seg_page(seg, a)];
8633 	prot = VPP_PROT(vpage);
8634 	vpage++;
8635 	pages--;
8636 	while (pages-- > 0) {
8637 		if (prot != VPP_PROT(vpage))
8638 			return (0);
8639 		vpage++;
8640 	}
8641 	return (1);
8642 }
8643 
8644 /*
8645  * Get memory allocation policy info for specified address in given segment
8646  */
8647 static lgrp_mem_policy_info_t *
8648 segvn_getpolicy(struct seg *seg, caddr_t addr)
8649 {
8650 	struct anon_map *amp;
8651 	ulong_t anon_index;
8652 	lgrp_mem_policy_info_t *policy_info;
8653 	struct segvn_data *svn_data;
8654 	u_offset_t vn_off;
8655 	vnode_t *vp;
8656 
8657 	ASSERT(seg != NULL);
8658 
8659 	svn_data = (struct segvn_data *)seg->s_data;
8660 	if (svn_data == NULL)
8661 		return (NULL);
8662 
8663 	/*
8664 	 * Get policy info for private or shared memory
8665 	 */
8666 	if (svn_data->type != MAP_SHARED) {
8667 		if (svn_data->tr_state != SEGVN_TR_ON) {
8668 			policy_info = &svn_data->policy_info;
8669 		} else {
8670 			policy_info = &svn_data->tr_policy_info;
8671 			ASSERT(policy_info->mem_policy ==
8672 			    LGRP_MEM_POLICY_NEXT_SEG);
8673 		}
8674 	} else {
8675 		amp = svn_data->amp;
8676 		anon_index = svn_data->anon_index + seg_page(seg, addr);
8677 		vp = svn_data->vp;
8678 		vn_off = svn_data->offset + (uintptr_t)(addr -
seg->s_base); 8679 policy_info = lgrp_shm_policy_get(amp, anon_index, vp, vn_off); 8680 } 8681 8682 return (policy_info); 8683 } 8684 8685 /*ARGSUSED*/ 8686 static int 8687 segvn_capable(struct seg *seg, segcapability_t capability) 8688 { 8689 return (0); 8690 } 8691 8692 /* 8693 * Bind text vnode segment to an amp. If we bind successfully mappings will be 8694 * established to per vnode mapping per lgroup amp pages instead of to vnode 8695 * pages. There's one amp per vnode text mapping per lgroup. Many processes 8696 * may share the same text replication amp. If a suitable amp doesn't already 8697 * exist in svntr hash table create a new one. We may fail to bind to amp if 8698 * segment is not eligible for text replication. Code below first checks for 8699 * these conditions. If binding is successful segment tr_state is set to on 8700 * and svd->amp points to the amp to use. Otherwise tr_state is set to off and 8701 * svd->amp remains as NULL. 8702 */ 8703 static void 8704 segvn_textrepl(struct seg *seg) 8705 { 8706 struct segvn_data *svd = (struct segvn_data *)seg->s_data; 8707 vnode_t *vp = svd->vp; 8708 u_offset_t off = svd->offset; 8709 size_t size = seg->s_size; 8710 u_offset_t eoff = off + size; 8711 uint_t szc = seg->s_szc; 8712 ulong_t hash = SVNTR_HASH_FUNC(vp); 8713 svntr_t *svntrp; 8714 struct vattr va; 8715 proc_t *p = seg->s_as->a_proc; 8716 lgrp_id_t lgrp_id; 8717 lgrp_id_t olid; 8718 int first; 8719 struct anon_map *amp; 8720 8721 ASSERT(AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 8722 ASSERT(SEGVN_WRITE_HELD(seg->s_as, &svd->lock)); 8723 ASSERT(p != NULL); 8724 ASSERT(svd->tr_state == SEGVN_TR_INIT); 8725 ASSERT(svd->flags & MAP_TEXT); 8726 ASSERT(svd->type == MAP_PRIVATE); 8727 ASSERT(vp != NULL && svd->amp == NULL); 8728 ASSERT(!svd->pageprot && !(svd->prot & PROT_WRITE)); 8729 ASSERT(!(svd->flags & MAP_NORESERVE) && svd->swresv == 0); 8730 ASSERT(seg->s_as != &kas); 8731 ASSERT(off < eoff); 8732 ASSERT(svntr_hashtab != NULL); 8733 8734 /* 8735 * If numa optimizations are no longer desired bail out. 8736 */ 8737 if (!lgrp_optimizations()) { 8738 svd->tr_state = SEGVN_TR_OFF; 8739 return; 8740 } 8741 8742 /* 8743 * Avoid creating anon maps with size bigger than the file size. 8744 * If VOP_GETATTR() call fails bail out. 8745 */ 8746 va.va_mask = AT_SIZE | AT_MTIME; 8747 if (VOP_GETATTR(vp, &va, 0, svd->cred) != 0) { 8748 svd->tr_state = SEGVN_TR_OFF; 8749 SEGVN_TR_ADDSTAT(gaerr); 8750 return; 8751 } 8752 if (btopr(va.va_size) < btopr(eoff)) { 8753 svd->tr_state = SEGVN_TR_OFF; 8754 SEGVN_TR_ADDSTAT(overmap); 8755 return; 8756 } 8757 8758 /* 8759 * VVMEXEC may not be set yet if exec() prefaults text segment. Set 8760 * this flag now before vn_is_mapped(V_WRITE) so that MAP_SHARED 8761 * mapping that checks if trcache for this vnode needs to be 8762 * invalidated can't miss us. 8763 */ 8764 if (!(vp->v_flag & VVMEXEC)) { 8765 mutex_enter(&vp->v_lock); 8766 vp->v_flag |= VVMEXEC; 8767 mutex_exit(&vp->v_lock); 8768 } 8769 mutex_enter(&svntr_hashtab[hash].tr_lock); 8770 /* 8771 * Bail out if potentially MAP_SHARED writable mappings exist to this 8772 * vnode. We don't want to use old file contents from existing 8773 * replicas if this mapping was established after the original file 8774 * was changed. 
8775  */
8776 	if (vn_is_mapped(vp, V_WRITE)) {
8777 		mutex_exit(&svntr_hashtab[hash].tr_lock);
8778 		svd->tr_state = SEGVN_TR_OFF;
8779 		SEGVN_TR_ADDSTAT(wrcnt);
8780 		return;
8781 	}
8782 	svntrp = svntr_hashtab[hash].tr_head;
8783 	for (; svntrp != NULL; svntrp = svntrp->tr_next) {
8784 		ASSERT(svntrp->tr_refcnt != 0);
8785 		if (svntrp->tr_vp != vp) {
8786 			continue;
8787 		}
8788 		/*
8789 		 * Bail out if file was changed after this replication entry
8790 		 * was created since we need to use the latest file contents.
8791 		 */
8792 		if (!svntrp->tr_valid ||
8793 		    svntrp->tr_mtime.tv_sec != va.va_mtime.tv_sec ||
8794 		    svntrp->tr_mtime.tv_nsec != va.va_mtime.tv_nsec) {
8795 			mutex_exit(&svntr_hashtab[hash].tr_lock);
8796 			svd->tr_state = SEGVN_TR_OFF;
8797 			SEGVN_TR_ADDSTAT(stale);
8798 			return;
8799 		}
8800 		/*
8801 		 * If off, eoff and szc match current segment we found the
8802 		 * existing entry we can use.
8803 		 */
8804 		if (svntrp->tr_off == off && svntrp->tr_eoff == eoff &&
8805 		    svntrp->tr_szc == szc) {
8806 			break;
8807 		}
8808 		/*
8809 		 * Don't create different but overlapping in file offsets
8810 		 * entries to avoid replication of the same file pages more
8811 		 * than once per lgroup.
8812 		 */
8813 		if ((off >= svntrp->tr_off && off < svntrp->tr_eoff) ||
8814 		    (eoff > svntrp->tr_off && eoff <= svntrp->tr_eoff)) {
8815 			mutex_exit(&svntr_hashtab[hash].tr_lock);
8816 			svd->tr_state = SEGVN_TR_OFF;
8817 			SEGVN_TR_ADDSTAT(overlap);
8818 			return;
8819 		}
8820 	}
8821 	/*
8822 	 * If we didn't find an existing entry create a new one.
8823 	 */
8824 	if (svntrp == NULL) {
8825 		svntrp = kmem_cache_alloc(svntr_cache, KM_NOSLEEP);
8826 		if (svntrp == NULL) {
8827 			mutex_exit(&svntr_hashtab[hash].tr_lock);
8828 			svd->tr_state = SEGVN_TR_OFF;
8829 			SEGVN_TR_ADDSTAT(nokmem);
8830 			return;
8831 		}
8832 #ifdef DEBUG
8833 		{
8834 			lgrp_id_t i;
8835 			for (i = 0; i < NLGRPS_MAX; i++) {
8836 				ASSERT(svntrp->tr_amp[i] == NULL);
8837 			}
8838 		}
8839 #endif /* DEBUG */
8840 		svntrp->tr_vp = vp;
8841 		svntrp->tr_off = off;
8842 		svntrp->tr_eoff = eoff;
8843 		svntrp->tr_szc = szc;
8844 		svntrp->tr_valid = 1;
8845 		svntrp->tr_mtime = va.va_mtime;
8846 		svntrp->tr_refcnt = 0;
8847 		svntrp->tr_next = svntr_hashtab[hash].tr_head;
8848 		svntr_hashtab[hash].tr_head = svntrp;
8849 	}
8850 	first = 1;
8851 again:
8852 	/*
8853 	 * We want to pick a replica with pages on main thread's (t_tid = 1,
8854 	 * aka T1) lgrp. Currently text replication is only optimized for
8855 	 * workloads that either have all threads of a process on the same
8856 	 * lgrp or execute their large text primarily on main thread.
8857 	 */
8858 	lgrp_id = p->p_t1_lgrpid;
8859 	if (lgrp_id == LGRP_NONE) {
8860 		/*
8861 		 * In case exec() prefaults text on a non main thread use
8862 		 * the current thread's lgrpid. It will become the main
8863 		 * thread soon anyway.
8864 		 */
8865 		lgrp_id = lgrp_home_id(curthread);
8866 	}
8867 	/*
8868 	 * Set p_tr_lgrpid to lgrpid if it hasn't been set yet. Otherwise
8869 	 * just set it to NLGRPS_MAX if it's different from current process T1
8870 	 * home lgrp. p_tr_lgrpid is used to detect if process uses text
8871 	 * replication and T1 new home is different from lgrp used for text
8872 	 * replication. When this happens the asynchronous segvn thread
8873 	 * rechecks if segments should change the lgrps used for text
8874 	 * replication. If we fail to set p_tr_lgrpid with cas32 then set it
8875 	 * to NLGRPS_MAX without cas if it's not already NLGRPS_MAX and not
8876 	 * equal to the lgrp_id we want to use.
We don't need to use cas in this case because another thread 8877 * that races in between our non atomic check and set may only change 8878 * p_tr_lgrpid to NLGRPS_MAX at this point. 8879 */ 8880 ASSERT(lgrp_id != LGRP_NONE && lgrp_id < NLGRPS_MAX); 8881 olid = p->p_tr_lgrpid; 8882 if (lgrp_id != olid && olid != NLGRPS_MAX) { 8883 lgrp_id_t nlid = (olid == LGRP_NONE) ? lgrp_id : NLGRPS_MAX; 8884 if (cas32((uint32_t *)&p->p_tr_lgrpid, olid, nlid) != olid) { 8885 olid = p->p_tr_lgrpid; 8886 ASSERT(olid != LGRP_NONE); 8887 if (olid != lgrp_id && olid != NLGRPS_MAX) { 8888 p->p_tr_lgrpid = NLGRPS_MAX; 8889 } 8890 } 8891 ASSERT(p->p_tr_lgrpid != LGRP_NONE); 8892 membar_producer(); 8893 /* 8894 * lgrp_move_thread() won't schedule async recheck after 8895 * p->p_t1_lgrpid update unless p->p_tr_lgrpid is not 8896 * LGRP_NONE. Recheck p_t1_lgrpid once now that p->p_tr_lgrpid 8897 * is not LGRP_NONE. 8898 */ 8899 if (first && p->p_t1_lgrpid != LGRP_NONE && 8900 p->p_t1_lgrpid != lgrp_id) { 8901 first = 0; 8902 goto again; 8903 } 8904 } 8905 /* 8906 * If no amp was created yet for lgrp_id create a new one as long as 8907 * we have enough memory to afford it. 8908 */ 8909 if ((amp = svntrp->tr_amp[lgrp_id]) == NULL) { 8910 size_t trmem = atomic_add_long_nv(&segvn_textrepl_bytes, size); 8911 if (trmem > segvn_textrepl_max_bytes) { 8912 SEGVN_TR_ADDSTAT(normem); 8913 goto fail; 8914 } 8915 if (anon_try_resv_zone(size, NULL) == 0) { 8916 SEGVN_TR_ADDSTAT(noanon); 8917 goto fail; 8918 } 8919 amp = anonmap_alloc(size, size, ANON_NOSLEEP); 8920 if (amp == NULL) { 8921 anon_unresv_zone(size, NULL); 8922 SEGVN_TR_ADDSTAT(nokmem); 8923 goto fail; 8924 } 8925 ASSERT(amp->refcnt == 1); 8926 amp->a_szc = szc; 8927 svntrp->tr_amp[lgrp_id] = amp; 8928 SEGVN_TR_ADDSTAT(newamp); 8929 } 8930 svntrp->tr_refcnt++; 8931 ASSERT(svd->svn_trnext == NULL); 8932 ASSERT(svd->svn_trprev == NULL); 8933 svd->svn_trnext = svntrp->tr_svnhead; 8934 svd->svn_trprev = NULL; 8935 if (svntrp->tr_svnhead != NULL) { 8936 svntrp->tr_svnhead->svn_trprev = svd; 8937 } 8938 svntrp->tr_svnhead = svd; 8939 ASSERT(amp->a_szc == szc && amp->size == size && amp->swresv == size); 8940 ASSERT(amp->refcnt >= 1); 8941 svd->amp = amp; 8942 svd->anon_index = 0; 8943 svd->tr_policy_info.mem_policy = LGRP_MEM_POLICY_NEXT_SEG; 8944 svd->tr_policy_info.mem_lgrpid = lgrp_id; 8945 svd->tr_state = SEGVN_TR_ON; 8946 mutex_exit(&svntr_hashtab[hash].tr_lock); 8947 SEGVN_TR_ADDSTAT(repl); 8948 return; 8949 fail: 8950 ASSERT(segvn_textrepl_bytes >= size); 8951 atomic_add_long(&segvn_textrepl_bytes, -size); 8952 ASSERT(svntrp != NULL); 8953 ASSERT(svntrp->tr_amp[lgrp_id] == NULL); 8954 if (svntrp->tr_refcnt == 0) { 8955 ASSERT(svntrp == svntr_hashtab[hash].tr_head); 8956 svntr_hashtab[hash].tr_head = svntrp->tr_next; 8957 mutex_exit(&svntr_hashtab[hash].tr_lock); 8958 kmem_cache_free(svntr_cache, svntrp); 8959 } else { 8960 mutex_exit(&svntr_hashtab[hash].tr_lock); 8961 } 8962 svd->tr_state = SEGVN_TR_OFF; 8963 } 8964 8965 /* 8966 * Convert seg back to regular vnode mapping seg by unbinding it from its text 8967 * replication amp. This routine is most typically called when segment is 8968 * unmapped but can also be called when segment no longer qualifies for text 8969 * replication (e.g. due to protection changes). If unload_unmap is set use 8970 * HAT_UNLOAD_UNMAP flag in hat_unload_callback(). If we are the last user of 8971 * svntr free all its anon maps and remove it from the hash table. 
8972  */
8973 static void
8974 segvn_textunrepl(struct seg *seg, int unload_unmap)
8975 {
8976 	struct segvn_data *svd = (struct segvn_data *)seg->s_data;
8977 	vnode_t *vp = svd->vp;
8978 	u_offset_t off = svd->offset;
8979 	size_t size = seg->s_size;
8980 	u_offset_t eoff = off + size;
8981 	uint_t szc = seg->s_szc;
8982 	ulong_t hash = SVNTR_HASH_FUNC(vp);
8983 	svntr_t *svntrp;
8984 	svntr_t **prv_svntrp;
8985 	lgrp_id_t lgrp_id = svd->tr_policy_info.mem_lgrpid;
8986 	lgrp_id_t i;
8987 
8988 	ASSERT(AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
8989 	ASSERT(AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock) ||
8990 	    SEGVN_WRITE_HELD(seg->s_as, &svd->lock));
8991 	ASSERT(svd->tr_state == SEGVN_TR_ON);
8992 	ASSERT(svd->amp != NULL);
8993 	ASSERT(svd->amp->refcnt >= 1);
8994 	ASSERT(svd->anon_index == 0);
8995 	ASSERT(lgrp_id != LGRP_NONE && lgrp_id < NLGRPS_MAX);
8996 	ASSERT(svntr_hashtab != NULL);
8997 
8998 	mutex_enter(&svntr_hashtab[hash].tr_lock);
8999 	prv_svntrp = &svntr_hashtab[hash].tr_head;
9000 	for (; (svntrp = *prv_svntrp) != NULL; prv_svntrp = &svntrp->tr_next) {
9001 		ASSERT(svntrp->tr_refcnt != 0);
9002 		if (svntrp->tr_vp == vp && svntrp->tr_off == off &&
9003 		    svntrp->tr_eoff == eoff && svntrp->tr_szc == szc) {
9004 			break;
9005 		}
9006 	}
9007 	if (svntrp == NULL) {
9008 		panic("segvn_textunrepl: svntr record not found");
9009 	}
9010 	if (svntrp->tr_amp[lgrp_id] != svd->amp) {
9011 		panic("segvn_textunrepl: amp mismatch");
9012 	}
9013 	svd->tr_state = SEGVN_TR_OFF;
9014 	svd->amp = NULL;
9015 	if (svd->svn_trprev == NULL) {
9016 		ASSERT(svntrp->tr_svnhead == svd);
9017 		svntrp->tr_svnhead = svd->svn_trnext;
9018 		if (svntrp->tr_svnhead != NULL) {
9019 			svntrp->tr_svnhead->svn_trprev = NULL;
9020 		}
9021 		svd->svn_trnext = NULL;
9022 	} else {
9023 		svd->svn_trprev->svn_trnext = svd->svn_trnext;
9024 		if (svd->svn_trnext != NULL) {
9025 			svd->svn_trnext->svn_trprev = svd->svn_trprev;
9026 			svd->svn_trnext = NULL;
9027 		}
9028 		svd->svn_trprev = NULL;
9029 	}
9030 	if (--svntrp->tr_refcnt) {
9031 		mutex_exit(&svntr_hashtab[hash].tr_lock);
9032 		goto done;
9033 	}
9034 	*prv_svntrp = svntrp->tr_next;
9035 	mutex_exit(&svntr_hashtab[hash].tr_lock);
9036 	for (i = 0; i < NLGRPS_MAX; i++) {
9037 		struct anon_map *amp = svntrp->tr_amp[i];
9038 		if (amp == NULL) {
9039 			continue;
9040 		}
9041 		ASSERT(amp->refcnt == 1);
9042 		ASSERT(amp->swresv == size);
9043 		ASSERT(amp->size == size);
9044 		ASSERT(amp->a_szc == szc);
9045 		if (amp->a_szc != 0) {
9046 			anon_free_pages(amp->ahp, 0, size, szc);
9047 		} else {
9048 			anon_free(amp->ahp, 0, size);
9049 		}
9050 		svntrp->tr_amp[i] = NULL;
9051 		ASSERT(segvn_textrepl_bytes >= size);
9052 		atomic_add_long(&segvn_textrepl_bytes, -size);
9053 		anon_unresv_zone(amp->swresv, NULL);
9054 		amp->refcnt = 0;
9055 		anonmap_free(amp);
9056 	}
9057 	kmem_cache_free(svntr_cache, svntrp);
9058 done:
9059 	hat_unload_callback(seg->s_as->a_hat, seg->s_base, size,
9060 	    unload_unmap ? HAT_UNLOAD_UNMAP : 0, NULL);
9061 }
9062 
9063 /*
9064  * This is called when a MAP_SHARED writable mapping is created to a vnode
9065  * that is currently used for execution (VVMEXEC flag is set). In this case we
9066  * need to prevent further use of existing replicas.
9067 */ 9068 static void 9069 segvn_inval_trcache(vnode_t *vp) 9070 { 9071 ulong_t hash = SVNTR_HASH_FUNC(vp); 9072 svntr_t *svntrp; 9073 9074 ASSERT(vp->v_flag & VVMEXEC); 9075 9076 if (svntr_hashtab == NULL) { 9077 return; 9078 } 9079 9080 mutex_enter(&svntr_hashtab[hash].tr_lock); 9081 svntrp = svntr_hashtab[hash].tr_head; 9082 for (; svntrp != NULL; svntrp = svntrp->tr_next) { 9083 ASSERT(svntrp->tr_refcnt != 0); 9084 if (svntrp->tr_vp == vp && svntrp->tr_valid) { 9085 svntrp->tr_valid = 0; 9086 } 9087 } 9088 mutex_exit(&svntr_hashtab[hash].tr_lock); 9089 } 9090 9091 static void 9092 segvn_trasync_thread(void) 9093 { 9094 callb_cpr_t cpr_info; 9095 kmutex_t cpr_lock; /* just for CPR stuff */ 9096 9097 mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL); 9098 9099 CALLB_CPR_INIT(&cpr_info, &cpr_lock, 9100 callb_generic_cpr, "segvn_async"); 9101 9102 if (segvn_update_textrepl_interval == 0) { 9103 segvn_update_textrepl_interval = segvn_update_tr_time * hz; 9104 } else { 9105 segvn_update_textrepl_interval *= hz; 9106 } 9107 (void) timeout(segvn_trupdate_wakeup, NULL, 9108 segvn_update_textrepl_interval); 9109 9110 for (;;) { 9111 mutex_enter(&cpr_lock); 9112 CALLB_CPR_SAFE_BEGIN(&cpr_info); 9113 mutex_exit(&cpr_lock); 9114 sema_p(&segvn_trasync_sem); 9115 mutex_enter(&cpr_lock); 9116 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock); 9117 mutex_exit(&cpr_lock); 9118 segvn_trupdate(); 9119 } 9120 } 9121 9122 static uint64_t segvn_lgrp_trthr_migrs_snpsht = 0; 9123 9124 static void 9125 segvn_trupdate_wakeup(void *dummy) 9126 { 9127 uint64_t cur_lgrp_trthr_migrs = lgrp_get_trthr_migrations(); 9128 9129 if (cur_lgrp_trthr_migrs != segvn_lgrp_trthr_migrs_snpsht) { 9130 segvn_lgrp_trthr_migrs_snpsht = cur_lgrp_trthr_migrs; 9131 sema_v(&segvn_trasync_sem); 9132 } 9133 9134 if (!segvn_disable_textrepl_update && 9135 segvn_update_textrepl_interval != 0) { 9136 (void) timeout(segvn_trupdate_wakeup, dummy, 9137 segvn_update_textrepl_interval); 9138 } 9139 } 9140 9141 static void 9142 segvn_trupdate(void) 9143 { 9144 ulong_t hash; 9145 svntr_t *svntrp; 9146 segvn_data_t *svd; 9147 9148 ASSERT(svntr_hashtab != NULL); 9149 9150 for (hash = 0; hash < svntr_hashtab_sz; hash++) { 9151 mutex_enter(&svntr_hashtab[hash].tr_lock); 9152 svntrp = svntr_hashtab[hash].tr_head; 9153 for (; svntrp != NULL; svntrp = svntrp->tr_next) { 9154 ASSERT(svntrp->tr_refcnt != 0); 9155 svd = svntrp->tr_svnhead; 9156 for (; svd != NULL; svd = svd->svn_trnext) { 9157 segvn_trupdate_seg(svd->seg, svd, svntrp, 9158 hash); 9159 } 9160 } 9161 mutex_exit(&svntr_hashtab[hash].tr_lock); 9162 } 9163 } 9164 9165 static void 9166 segvn_trupdate_seg(struct seg *seg, 9167 segvn_data_t *svd, 9168 svntr_t *svntrp, 9169 ulong_t hash) 9170 { 9171 proc_t *p; 9172 lgrp_id_t lgrp_id; 9173 struct as *as; 9174 size_t size; 9175 struct anon_map *amp; 9176 9177 ASSERT(svd->vp != NULL); 9178 ASSERT(svd->vp == svntrp->tr_vp); 9179 ASSERT(svd->offset == svntrp->tr_off); 9180 ASSERT(svd->offset + seg->s_size == svntrp->tr_eoff); 9181 ASSERT(seg != NULL); 9182 ASSERT(svd->seg == seg); 9183 ASSERT(seg->s_data == (void *)svd); 9184 ASSERT(seg->s_szc == svntrp->tr_szc); 9185 ASSERT(svd->tr_state == SEGVN_TR_ON); 9186 ASSERT(svd->amp != NULL); 9187 ASSERT(svd->tr_policy_info.mem_policy == LGRP_MEM_POLICY_NEXT_SEG); 9188 ASSERT(svd->tr_policy_info.mem_lgrpid != LGRP_NONE); 9189 ASSERT(svd->tr_policy_info.mem_lgrpid < NLGRPS_MAX); 9190 ASSERT(svntrp->tr_amp[svd->tr_policy_info.mem_lgrpid] == svd->amp); 9191 ASSERT(svntrp->tr_refcnt != 0); 9192 
ASSERT(mutex_owned(&svntr_hashtab[hash].tr_lock));
9193 
9194 	as = seg->s_as;
9195 	ASSERT(as != NULL && as != &kas);
9196 	p = as->a_proc;
9197 	ASSERT(p != NULL);
9198 	ASSERT(p->p_tr_lgrpid != LGRP_NONE);
9199 	lgrp_id = p->p_t1_lgrpid;
9200 	if (lgrp_id == LGRP_NONE) {
9201 		return;
9202 	}
9203 	ASSERT(lgrp_id < NLGRPS_MAX);
9204 	if (svd->tr_policy_info.mem_lgrpid == lgrp_id) {
9205 		return;
9206 	}
9207 
9208 	/*
9209 	 * Use tryenter locking since we are locking as/seg and svntr hash
9210 	 * lock in reverse from synchronous thread order.
9211 	 */
9212 	if (!AS_LOCK_TRYENTER(as, &as->a_lock, RW_READER)) {
9213 		SEGVN_TR_ADDSTAT(nolock);
9214 		if (segvn_lgrp_trthr_migrs_snpsht) {
9215 			segvn_lgrp_trthr_migrs_snpsht = 0;
9216 		}
9217 		return;
9218 	}
9219 	if (!SEGVN_LOCK_TRYENTER(seg->s_as, &svd->lock, RW_WRITER)) {
9220 		AS_LOCK_EXIT(as, &as->a_lock);
9221 		SEGVN_TR_ADDSTAT(nolock);
9222 		if (segvn_lgrp_trthr_migrs_snpsht) {
9223 			segvn_lgrp_trthr_migrs_snpsht = 0;
9224 		}
9225 		return;
9226 	}
9227 	size = seg->s_size;
9228 	if (svntrp->tr_amp[lgrp_id] == NULL) {
9229 		size_t trmem = atomic_add_long_nv(&segvn_textrepl_bytes, size);
9230 		if (trmem > segvn_textrepl_max_bytes) {
9231 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
9232 			AS_LOCK_EXIT(as, &as->a_lock);
9233 			atomic_add_long(&segvn_textrepl_bytes, -size);
9234 			SEGVN_TR_ADDSTAT(normem);
9235 			return;
9236 		}
9237 		if (anon_try_resv_zone(size, NULL) == 0) {
9238 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
9239 			AS_LOCK_EXIT(as, &as->a_lock);
9240 			atomic_add_long(&segvn_textrepl_bytes, -size);
9241 			SEGVN_TR_ADDSTAT(noanon);
9242 			return;
9243 		}
9244 		amp = anonmap_alloc(size, size, KM_NOSLEEP);
9245 		if (amp == NULL) {
9246 			SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
9247 			AS_LOCK_EXIT(as, &as->a_lock);
9248 			atomic_add_long(&segvn_textrepl_bytes, -size);
9249 			anon_unresv_zone(size, NULL);
9250 			SEGVN_TR_ADDSTAT(nokmem);
9251 			return;
9252 		}
9253 		ASSERT(amp->refcnt == 1);
9254 		amp->a_szc = seg->s_szc;
9255 		svntrp->tr_amp[lgrp_id] = amp;
9256 	}
9257 	/*
9258 	 * We don't need to drop the bucket lock but here we give other
9259 	 * threads a chance. svntr and svd can't be unlinked as long as
9260 	 * segment lock is held as a writer and AS held as well. After we
9261 	 * retake the bucket lock we'll continue from where we left off.
9262 	 * We'll be able to reach the end of either list since new entries
9263 	 * are always added to the beginning of the lists.
9264 	 */
9265 	mutex_exit(&svntr_hashtab[hash].tr_lock);
9266 	hat_unload_callback(as->a_hat, seg->s_base, size, 0, NULL);
9267 	mutex_enter(&svntr_hashtab[hash].tr_lock);
9268 
9269 	ASSERT(svd->tr_state == SEGVN_TR_ON);
9270 	ASSERT(svd->amp != NULL);
9271 	ASSERT(svd->tr_policy_info.mem_policy == LGRP_MEM_POLICY_NEXT_SEG);
9272 	ASSERT(svd->tr_policy_info.mem_lgrpid != lgrp_id);
9273 	ASSERT(svd->amp != svntrp->tr_amp[lgrp_id]);
9274 
9275 	svd->tr_policy_info.mem_lgrpid = lgrp_id;
9276 	svd->amp = svntrp->tr_amp[lgrp_id];
9277 	p->p_tr_lgrpid = NLGRPS_MAX;
9278 	SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
9279 	AS_LOCK_EXIT(as, &as->a_lock);
9280 
9281 	ASSERT(svntrp->tr_refcnt != 0);
9282 	ASSERT(svd->vp == svntrp->tr_vp);
9283 	ASSERT(svd->tr_policy_info.mem_lgrpid == lgrp_id);
9284 	ASSERT(svd->amp != NULL && svd->amp == svntrp->tr_amp[lgrp_id]);
9285 	ASSERT(svd->seg == seg);
9286 	ASSERT(svd->tr_state == SEGVN_TR_ON);
9287 
9288 	SEGVN_TR_ADDSTAT(asyncrepl);
9289 }
9290 
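/*
 * Illustrative userland sketch (not part of this file): the MADV_ACCESS_*
 * advice handled by segvn_advise() above is normally requested through
 * madvise(3C).  The mapping size below is arbitrary and error handling is
 * minimal; this only shows the call that exercises the lgroup policy path.
 *
 *	#include <sys/types.h>
 *	#include <sys/mman.h>
 *
 *	int
 *	main(void)
 *	{
 *		size_t len = 8UL * 1024 * 1024;
 *		caddr_t addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *		    MAP_PRIVATE | MAP_ANON, -1, 0);
 *
 *		if (addr == MAP_FAILED)
 *			return (1);
 *		if (madvise(addr, len, MADV_ACCESS_LWP) != 0)
 *			return (1);
 *		return (0);
 *	}
 */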