/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
 */
/*
 * Copyright (c) 2010, Intel Corporation.
 * All rights reserved.
 */

/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/*	All Rights Reserved	*/

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

/*
 * UNIX machine dependent virtual memory support.
 */
#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/proc.h>
#include <sys/kmem.h>
#include <sys/vmem.h>
#include <sys/buf.h>
#include <sys/cpuvar.h>
#include <sys/lgrp.h>
#include <sys/disp.h>
#include <sys/vm.h>
#include <sys/mman.h>
#include <sys/vnode.h>
#include <sys/cred.h>
#include <sys/exec.h>
#include <sys/exechdr.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>
#include <sys/swap.h>
#include <sys/dumphdr.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_kp.h>
#include <vm/seg_vn.h>
#include <vm/page.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <vm/vm_dep.h>

#include <sys/cpu.h>
#include <sys/vm_machparam.h>
#include <sys/memlist.h>
#include <sys/bootconf.h> /* XXX the memlist stuff belongs in memlist_plat.h */
#include <vm/hat_i86.h>
#include <sys/x86_archext.h>
#include <sys/elf_386.h>
#include <sys/cmn_err.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>

#include <sys/vtrace.h>
#include <sys/ddidmareq.h>
#include <sys/promif.h>
#include <sys/memnode.h>
#include <sys/stack.h>
#include <util/qsort.h>
#include <sys/taskq.h>

#ifdef __xpv

#include <sys/hypervisor.h>
#include <sys/xen_mmu.h>
#include <sys/balloon_impl.h>

/*
 * Domain 0 pages usable for DMA are pre-allocated and kept in
 * distinct lists, ordered by increasing mfn.
 */
static kmutex_t io_pool_lock;
static kmutex_t contig_list_lock;
static page_t *io_pool_4g;	/* pool for 32 bit dma limited devices */
static page_t *io_pool_16m;	/* pool for 24 bit dma limited legacy devices */
static long io_pool_cnt;
static long io_pool_cnt_max = 0;
#define	DEFAULT_IO_POOL_MIN	128
static long io_pool_cnt_min = DEFAULT_IO_POOL_MIN;
static long io_pool_cnt_lowater = 0;
static long io_pool_shrink_attempts; /* how many times did we try to shrink */
static long io_pool_shrinks;	/* how many times did we really shrink */
static long io_pool_grows;	/* how many times did we grow */
static mfn_t start_mfn = 1;
static caddr_t io_pool_kva;	/* used to alloc pages when needed */

static int create_contig_pfnlist(uint_t);

/*
 * percentage of phys mem to hold in the i/o pool
 */
#define	DEFAULT_IO_POOL_PCT	2
static long io_pool_physmem_pct = DEFAULT_IO_POOL_PCT;
static void page_io_pool_sub(page_t **, page_t *, page_t *);
int ioalloc_dbg = 0;

#endif /* __xpv */

uint_t vac_colors = 1;

int largepagesupport = 0;
extern uint_t page_create_new;
extern uint_t page_create_exists;
extern uint_t page_create_putbacks;
/*
 * Allow users to disable the kernel's use of SSE.
 */
extern int use_sse_pagecopy, use_sse_pagezero;

/*
 * combined memory ranges from mnode and memranges[] to manage single
 * mnode/mtype dimension in the page lists.
 */
typedef struct {
	pfn_t	mnr_pfnlo;
	pfn_t	mnr_pfnhi;
	int	mnr_mnode;
	int	mnr_memrange;		/* index into memranges[] */
	int	mnr_next;		/* next lower PA mnoderange */
	int	mnr_exists;
	/* maintain page list stats */
	pgcnt_t	mnr_mt_clpgcnt;		/* cache list cnt */
	pgcnt_t	mnr_mt_flpgcnt[MMU_PAGE_SIZES];	/* free list cnt per szc */
	pgcnt_t	mnr_mt_totcnt;	/* sum of cache and free lists */
#ifdef DEBUG
	struct mnr_mts {		/* mnode/mtype szc stats */
		pgcnt_t	mnr_mts_pgcnt;
		int	mnr_mts_colors;
		pgcnt_t *mnr_mtsc_pgcnt;
	} *mnr_mts;
#endif
} mnoderange_t;

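/*
 * MEMRANGELO/MEMRANGEHI give the inclusive pfn bounds of the memory range
 * at index "mtype".  Worked example: with the default arch_memranges below,
 * the 16M-2G range (MRI_16M) spans MEMRANGELO(MRI_16M) == 0x1000 through
 * MEMRANGEHI(MRI_16M) == 0x7ffff; the highest range is capped by physmax.
 */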
#define	MEMRANGEHI(mtype)						\
	((mtype > 0) ? memranges[mtype - 1] - 1: physmax)
#define	MEMRANGELO(mtype)	(memranges[mtype])

#define	MTYPE_FREEMEM(mt)	(mnoderanges[mt].mnr_mt_totcnt)

/*
 * As the PC architecture evolved, memory was clumped into several
 * ranges for various historical I/O devices to do DMA.
 * < 16Meg - ISA bus
 * < 2Gig - ???
 * < 4Gig - PCI bus or drivers that don't understand PAE mode
 *
 * These are listed in reverse order, so that we can skip over unused
 * ranges on machines with small memories.
 *
 * For now under the Hypervisor, we'll only ever have one memrange.
 */
#define	PFN_4GIG	0x100000
#define	PFN_16MEG	0x1000
/* Indices into the memory range (arch_memranges) array. */
#define	MRI_4G		0
#define	MRI_2G		1
#define	MRI_16M		2
#define	MRI_0		3
static pfn_t arch_memranges[NUM_MEM_RANGES] = {
    PFN_4GIG,	/* pfn range for 4G and above */
    0x80000,	/* pfn range for 2G-4G */
    PFN_16MEG,	/* pfn range for 16M-2G */
    0x00000,	/* pfn range for 0-16M */
};
pfn_t *memranges = &arch_memranges[0];
int nranges = NUM_MEM_RANGES;

/*
 * This combines mem_node_config and memranges into one data
 * structure to be used for page list management.
 */
mnoderange_t *mnoderanges;
int mnoderangecnt;
int mtype4g;
int mtype16m;
int mtypetop;	/* index of highest pfn'ed mnoderange */

/*
 * 4g memory management variables for systems with more than 4g of memory:
 *
 * physical memory below 4g is required for 32bit dma devices and, currently,
 * for kmem memory. On systems with more than 4g of memory, the pool of memory
 * below 4g can be depleted without any paging activity given that there is
 * likely to be sufficient memory above 4g.
 *
 * physmax4g is set true if the largest pfn is over 4g. The rest of the
 * 4g memory management code is enabled only when physmax4g is true.
 *
 * maxmem4g is the count of the maximum number of pages on the page lists
 * with physical addresses below 4g. It can be a lot less than 4g given that
 * BIOS may reserve large chunks of space below 4g for hot plug pci devices,
 * agp aperture etc.
 *
 * freemem4g maintains the count of the number of available pages on the
 * page lists with physical addresses below 4g.
 *
 * DESFREE4G specifies the desired amount of below 4g memory. It defaults to
 * 6% (desfree4gshift = 4) of maxmem4g.
 *
 * RESTRICT4G_ALLOC returns true if freemem4g falls below DESFREE4G
 * and the amount of physical memory above 4g is greater than freemem4g.
 * In this case, page_get_* routines will restrict below 4g allocations
 * for requests that don't specifically require it.
 */

#define	DESFREE4G	(maxmem4g >> desfree4gshift)

#define	RESTRICT4G_ALLOC					\
	(physmax4g && (freemem4g < DESFREE4G) && ((freemem4g << 1) < freemem))

static pgcnt_t	maxmem4g;
static pgcnt_t	freemem4g;
static int	physmax4g;
static int	desfree4gshift = 4;	/* maxmem4g shift to derive DESFREE4G */
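
/*
 * Worked example: with desfree4gshift = 4, DESFREE4G is maxmem4g / 16,
 * i.e. roughly 6.25% of the below-4g pages.  RESTRICT4G_ALLOC then trips
 * once fewer than that many below-4g pages remain free and less than
 * half of all free memory lies below 4g.
 */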

/*
 * 16m memory management:
 *
 * reserve some amount of physical memory below 16m for legacy devices.
 *
 * RESTRICT16M_ALLOC returns true if there are sufficient free pages above
 * 16m or if the 16m pool drops below DESFREE16M.
 *
 * In this case, general page allocations via page_get_{free,cache}list
 * routines will be restricted from allocating from the 16m pool. Allocations
 * that require specific pfn ranges (page_get_anylist) and PG_PANIC allocations
 * are not restricted.
 */

#define	FREEMEM16M	MTYPE_FREEMEM(mtype16m)
#define	DESFREE16M	desfree16m
#define	RESTRICT16M_ALLOC(freemem, pgcnt, flags)		\
	((freemem != 0) && ((flags & PG_PANIC) == 0) &&		\
	    ((freemem >= (FREEMEM16M)) ||			\
	    (FREEMEM16M < (DESFREE16M + pgcnt))))

static pgcnt_t	desfree16m = 0x380;
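
/*
 * Worked example: desfree16m = 0x380 is 896 pages, about 3.5MB with 4K
 * pages.  RESTRICT16M_ALLOC steers a non-panic request for "pgcnt" pages
 * away from the 16m pool when the caller's free-page count covers the
 * pool anyway, or when taking pgcnt pages would drop the pool below
 * desfree16m.
 */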

/*
 * This can be patched via /etc/system to allow old non-PAE aware device
 * drivers to use kmem_alloc'd memory on 32 bit systems with > 4Gig RAM.
 */
int restricted_kmemalloc = 0;

#ifdef VM_STATS
struct {
	ulong_t	pga_alloc;
	ulong_t	pga_notfullrange;
	ulong_t	pga_nulldmaattr;
	ulong_t	pga_allocok;
	ulong_t	pga_allocfailed;
	ulong_t	pgma_alloc;
	ulong_t	pgma_allocok;
	ulong_t	pgma_allocfailed;
	ulong_t	pgma_allocempty;
} pga_vmstats;
#endif

uint_t mmu_page_sizes;

/* How many page sizes the users can see */
uint_t mmu_exported_page_sizes;

/* page sizes that legacy applications can see */
uint_t mmu_legacy_page_sizes;

/*
 * Number of pages in 1 GB.  Don't enable automatic large pages if we have
 * fewer than this many pages.
 */
pgcnt_t shm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);
pgcnt_t privm_lpg_min_physmem = 1 << (30 - MMU_PAGESHIFT);

/*
 * Maximum and default segment size tunables for user private
 * and shared anon memory, and user text and initialized data.
 * These can be patched via /etc/system to allow large pages
 * to be used for mapping application private and shared anon memory.
 */
size_t mcntl0_lpsize = MMU_PAGESIZE;
size_t max_uheap_lpsize = MMU_PAGESIZE;
size_t default_uheap_lpsize = MMU_PAGESIZE;
size_t max_ustack_lpsize = MMU_PAGESIZE;
size_t default_ustack_lpsize = MMU_PAGESIZE;
size_t max_privmap_lpsize = MMU_PAGESIZE;
size_t max_uidata_lpsize = MMU_PAGESIZE;
size_t max_utext_lpsize = MMU_PAGESIZE;
size_t max_shm_lpsize = MMU_PAGESIZE;


/*
 * initialized by page_coloring_init().
 */
uint_t	page_colors;
uint_t	page_colors_mask;
uint_t	page_coloring_shift;
int	cpu_page_colors;
static uint_t	l2_colors;

/*
 * Page freelists and cachelists are dynamically allocated once mnoderangecnt
 * and page_colors are calculated from the l2 cache n-way set size. Within a
 * mnode range, the page freelist and cachelist are hashed into bins based on
 * color. This makes it easier to search for a page within a specific memory
 * range.
 */
#define	PAGE_COLORS_MIN	16

page_t ****page_freelists;
page_t ***page_cachelists;


/*
 * Used by page layer to know about page sizes
 */
hw_pagesize_t hw_page_array[MAX_NUM_LEVEL + 1];

kmutex_t	*fpc_mutex[NPC_MUTEX];
kmutex_t	*cpc_mutex[NPC_MUTEX];

/* Lock to protect mnoderanges array for memory DR operations. */
static kmutex_t mnoderange_lock;

/*
 * Only let one thread at a time try to coalesce large pages, to
 * prevent them from working against each other.
 */
static kmutex_t	contig_lock;
#define	CONTIG_LOCK()	mutex_enter(&contig_lock);
#define	CONTIG_UNLOCK()	mutex_exit(&contig_lock);

#define	PFN_16M		(mmu_btop((uint64_t)0x1000000))

/*
 * Return the optimum page size for a given mapping
 */
/*ARGSUSED*/
size_t
map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int memcntl)
{
	level_t l = 0;
	size_t pgsz = MMU_PAGESIZE;
	size_t max_lpsize;
	uint_t mszc;

	ASSERT(maptype != MAPPGSZ_VA);

	if (maptype != MAPPGSZ_ISM && physmem < privm_lpg_min_physmem) {
		return (MMU_PAGESIZE);
	}

	switch (maptype) {
	case MAPPGSZ_HEAP:
	case MAPPGSZ_STK:
		max_lpsize = memcntl ? mcntl0_lpsize : (maptype ==
		    MAPPGSZ_HEAP ? max_uheap_lpsize : max_ustack_lpsize);
		if (max_lpsize == MMU_PAGESIZE) {
			return (MMU_PAGESIZE);
		}
		if (len == 0) {
			len = (maptype == MAPPGSZ_HEAP) ? p->p_brkbase +
			    p->p_brksize - p->p_bssbase : p->p_stksize;
		}
		len = (maptype == MAPPGSZ_HEAP) ? MAX(len,
		    default_uheap_lpsize) : MAX(len, default_ustack_lpsize);

		/*
		 * use the page size that best fits len
		 */
		for (l = mmu.umax_page_level; l > 0; --l) {
			if (LEVEL_SIZE(l) > max_lpsize || len < LEVEL_SIZE(l)) {
				continue;
			} else {
				pgsz = LEVEL_SIZE(l);
			}
			break;
		}

		mszc = (maptype == MAPPGSZ_HEAP ? p->p_brkpageszc :
		    p->p_stkpageszc);
		if (addr == 0 && (pgsz < hw_page_array[mszc].hp_size)) {
			pgsz = hw_page_array[mszc].hp_size;
		}
		return (pgsz);

	case MAPPGSZ_ISM:
		for (l = mmu.umax_page_level; l > 0; --l) {
			if (len >= LEVEL_SIZE(l))
				return (LEVEL_SIZE(l));
		}
		return (LEVEL_SIZE(0));
	}
	return (pgsz);
}

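/*
 * map_szcvec() computes the bit vector of page size codes usable for the
 * range [addr, addr + size), given the offset phase "off".  Worked
 * example: if the szc 1 page size (2M/4M) is the largest that fits
 * aligned within the range, the result is ((1 << 2) - 1) & ~1 == 0x2;
 * the base page size (bit 0) is always cleared.
 */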
static uint_t
map_szcvec(caddr_t addr, size_t size, uintptr_t off, size_t max_lpsize,
    size_t min_physmem)
{
	caddr_t eaddr = addr + size;
	uint_t szcvec = 0;
	caddr_t raddr;
	caddr_t readdr;
	size_t	pgsz;
	int i;

	if (physmem < min_physmem || max_lpsize <= MMU_PAGESIZE) {
		return (0);
	}

	for (i = mmu_exported_page_sizes - 1; i > 0; i--) {
		pgsz = page_get_pagesize(i);
		if (pgsz > max_lpsize) {
			continue;
		}
		raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
		readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
		if (raddr < addr || raddr >= readdr) {
			continue;
		}
		if (P2PHASE((uintptr_t)addr ^ off, pgsz)) {
			continue;
		}
		/*
		 * Set szcvec to the remaining page sizes.
		 */
		szcvec = ((1 << (i + 1)) - 1) & ~1;
		break;
	}
	return (szcvec);
}

/*
 * Return a bit vector of large page size codes that
 * can be used to map [addr, addr + len) region.
 */
/*ARGSUSED*/
uint_t
map_pgszcvec(caddr_t addr, size_t size, uintptr_t off, int flags, int type,
    int memcntl)
{
	size_t max_lpsize = mcntl0_lpsize;

	if (mmu.max_page_level == 0)
		return (0);

	if (flags & MAP_TEXT) {
		if (!memcntl)
			max_lpsize = max_utext_lpsize;
		return (map_szcvec(addr, size, off, max_lpsize,
		    shm_lpg_min_physmem));

	} else if (flags & MAP_INITDATA) {
		if (!memcntl)
			max_lpsize = max_uidata_lpsize;
		return (map_szcvec(addr, size, off, max_lpsize,
		    privm_lpg_min_physmem));

	} else if (type == MAPPGSZC_SHM) {
		if (!memcntl)
			max_lpsize = max_shm_lpsize;
		return (map_szcvec(addr, size, off, max_lpsize,
		    shm_lpg_min_physmem));

	} else if (type == MAPPGSZC_HEAP) {
		if (!memcntl)
			max_lpsize = max_uheap_lpsize;
		return (map_szcvec(addr, size, off, max_lpsize,
		    privm_lpg_min_physmem));

	} else if (type == MAPPGSZC_STACK) {
		if (!memcntl)
			max_lpsize = max_ustack_lpsize;
		return (map_szcvec(addr, size, off, max_lpsize,
		    privm_lpg_min_physmem));

	} else {
		if (!memcntl)
			max_lpsize = max_privmap_lpsize;
		return (map_szcvec(addr, size, off, max_lpsize,
		    privm_lpg_min_physmem));
	}
}

/*
 * Handle a pagefault.
 */
faultcode_t
pagefault(
	caddr_t addr,
	enum fault_type type,
	enum seg_rw rw,
	int iskernel)
{
	struct as *as;
	struct hat *hat;
	struct proc *p;
	kthread_t *t;
	faultcode_t res;
	caddr_t base;
	size_t len;
	int err;
	int mapped_red;
	uintptr_t ea;

	ASSERT_STACK_ALIGNED();

	if (INVALID_VADDR(addr))
		return (FC_NOMAP);

	mapped_red = segkp_map_red();

	if (iskernel) {
		as = &kas;
		hat = as->a_hat;
	} else {
		t = curthread;
		p = ttoproc(t);
		as = p->p_as;
		hat = as->a_hat;
	}

	/*
	 * Dispatch pagefault.
	 */
	res = as_fault(hat, as, addr, 1, type, rw);

	/*
	 * If this isn't a potential unmapped hole in the user's
	 * UNIX data or stack segments, just return status info.
	 */
	if (res != FC_NOMAP || iskernel)
		goto out;

	/*
	 * Check to see if we happened to fault on a currently unmapped
	 * part of the UNIX data or stack segments.  If so, create a zfod
	 * mapping there and then try calling the fault routine again.
	 */
	base = p->p_brkbase;
	len = p->p_brksize;

	if (addr < base || addr >= base + len) {	/* data seg? */
		base = (caddr_t)p->p_usrstack - p->p_stksize;
		len = p->p_stksize;
		if (addr < base || addr >= p->p_usrstack) {	/* stack seg? */
			/* not in either UNIX data or stack segments */
			res = FC_NOMAP;
			goto out;
		}
	}

	/*
	 * The rest of this function implements 3.X/4.X/5.X compatibility.
	 * This code is probably not needed anymore.
	 */
	if (p->p_model == DATAMODEL_ILP32) {

		/* expand the gap to the page boundaries on each side */
		ea = P2ROUNDUP((uintptr_t)base + len, MMU_PAGESIZE);
		base = (caddr_t)P2ALIGN((uintptr_t)base, MMU_PAGESIZE);
		len = ea - (uintptr_t)base;

		as_rangelock(as);
		if (as_gap(as, MMU_PAGESIZE, &base, &len, AH_CONTAIN, addr) ==
		    0) {
			err = as_map(as, base, len, segvn_create, zfod_argsp);
			as_rangeunlock(as);
			if (err) {
				res = FC_MAKE_ERR(err);
				goto out;
			}
		} else {
			/*
			 * This page is already mapped by another thread after
			 * we returned from as_fault() above.  We just fall
			 * through as_fault() below.
			 */
			as_rangeunlock(as);
		}

		res = as_fault(hat, as, addr, 1, F_INVAL, rw);
	}

out:
	if (mapped_red)
		segkp_unmap_red();

	return (res);
}

void
map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
{
	struct proc *p = curproc;
	caddr_t userlimit = (flags & _MAP_LOW32) ?
	    (caddr_t)_userlimit32 : p->p_as->a_userlimit;

	map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags);
}

/*ARGSUSED*/
int
map_addr_vacalign_check(caddr_t addr, u_offset_t off)
{
	return (0);
}

/*
 * map_addr_proc() is the routine called when the system is to
 * choose an address for the user.  We will pick an address
 * range which is the highest available below userlimit.
 *
 * Every mapping will have a redzone of a single page on either side of
 * the request.  This is done to leave one page unmapped between segments.
 * This is not required, but it's useful for the user because if their
 * program strays across a segment boundary, it will catch a fault
 * immediately, making debugging a little easier.  Currently the redzone
 * is mandatory.
 *
 * addrp is a value/result parameter.
 *	On input it is a hint from the user to be used in a completely
 *	machine dependent fashion.  We decide to completely ignore this hint.
 *	If MAP_ALIGN was specified, addrp contains the minimal alignment, which
 *	must be some "power of two" multiple of pagesize.
 *
 *	On output it is NULL if no address can be found in the current
 *	processes address space or else an address that is currently
 *	not mapped for len bytes with a page of red zone on either side.
 *
 * vacalign is not needed on x86 (it's for virtually addressed caches)
 */
/*ARGSUSED*/
void
map_addr_proc(
	caddr_t *addrp,
	size_t len,
	offset_t off,
	int vacalign,
	caddr_t userlimit,
	struct proc *p,
	uint_t flags)
{
	struct as *as = p->p_as;
	caddr_t addr;
	caddr_t base;
	size_t slen;
	size_t align_amount;

	ASSERT32(userlimit == as->a_userlimit);

	base = p->p_brkbase;
#if defined(__amd64)
	/*
	 * XX64 Yes, this needs more work.
	 */
	if (p->p_model == DATAMODEL_NATIVE) {
		if (userlimit < as->a_userlimit) {
			/*
			 * This happens when a program wants to map
			 * something in a range that's accessible to a
			 * program in a smaller address space.  For example,
			 * a 64-bit program calling mmap32(2) to guarantee
			 * that the returned address is below 4Gbytes.
			 */
			ASSERT((uintptr_t)userlimit < ADDRESS_C(0xffffffff));

			if (userlimit > base)
				slen = userlimit - base;
			else {
				*addrp = NULL;
				return;
			}
		} else {
			/*
			 * XX64 This layout is probably wrong .. but in
			 * the event we make the amd64 address space look
			 * like sparcv9 i.e. with the stack -above- the
			 * heap, this bit of code might even be correct.
			 */
			slen = p->p_usrstack - base -
			    ((p->p_stk_ctl + PAGEOFFSET) & PAGEMASK);
		}
	} else
#endif
		slen = userlimit - base;

	/* Make len be a multiple of PAGESIZE */
	len = (len + PAGEOFFSET) & PAGEMASK;

	/*
	 * figure out what the alignment should be
	 *
	 * XX64 -- is there an ELF_AMD64_MAXPGSZ or is it the same????
	 */
	if (len <= ELF_386_MAXPGSZ) {
		/*
		 * Align virtual addresses to ensure that ELF shared libraries
		 * are mapped with the appropriate alignment constraints by
		 * the run-time linker.
		 */
		align_amount = ELF_386_MAXPGSZ;
	} else {
		/*
		 * For 32-bit processes, only those which have specified
		 * MAP_ALIGN and an addr will be aligned on a larger page size.
		 * Not doing so can potentially waste up to 1G of process
		 * address space.
		 */
		int lvl = (p->p_model == DATAMODEL_ILP32) ? 1 :
		    mmu.umax_page_level;

		while (lvl && len < LEVEL_SIZE(lvl))
			--lvl;

		align_amount = LEVEL_SIZE(lvl);
	}
	if ((flags & MAP_ALIGN) && ((uintptr_t)*addrp > align_amount))
		align_amount = (uintptr_t)*addrp;

	ASSERT(ISP2(align_amount));
	ASSERT(align_amount == 0 || align_amount >= PAGESIZE);

	off = off & (align_amount - 1);
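	/*
	 * Only the phase of "off" within the alignment matters from here
	 * on: the address picked below is rounded down to align_amount and
	 * then offset by this phase, so that the mapping's virtual address
	 * and file offset agree modulo align_amount, which allows large
	 * pages to be used for the mapping.
	 */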
	/*
	 * Look for a large enough hole starting below userlimit.
	 * After finding it, use the upper part.
	 */
	if (as_gap_aligned(as, len, &base, &slen, AH_HI, NULL, align_amount,
	    PAGESIZE, off) == 0) {
		caddr_t as_addr;

		/*
		 * addr is the highest possible address to use since we have
		 * a PAGESIZE redzone at the beginning and end.
		 */
		addr = base + slen - (PAGESIZE + len);
		as_addr = addr;
		/*
		 * Round address DOWN to the alignment amount and
		 * add the offset in.
		 * If addr is greater than as_addr, len would not be large
		 * enough to include the redzone, so we must adjust down
		 * by the alignment amount.
		 */
		addr = (caddr_t)((uintptr_t)addr & (~(align_amount - 1)));
		addr += (uintptr_t)off;
		if (addr > as_addr) {
			addr -= align_amount;
		}

		ASSERT(addr > base);
		ASSERT(addr + len < base + slen);
		ASSERT(((uintptr_t)addr & (align_amount - 1)) ==
		    ((uintptr_t)(off)));
		*addrp = addr;
	} else {
		*addrp = NULL;	/* no more virtual space */
	}
}

int valid_va_range_aligned_wraparound;

/*
 * Determine whether [*basep, *basep + *lenp) contains a mappable range of
 * addresses at least "minlen" long, where the base of the range is at "off"
 * phase from an "align" boundary and there is space for a "redzone"-sized
 * redzone on either side of the range.  On success, 1 is returned and *basep
 * and *lenp are adjusted to describe the acceptable range (including
 * the redzone).  On failure, 0 is returned.
 */
/*ARGSUSED3*/
int
valid_va_range_aligned(caddr_t *basep, size_t *lenp, size_t minlen, int dir,
    size_t align, size_t redzone, size_t off)
{
	uintptr_t hi, lo;
	size_t tot_len;

	ASSERT(align == 0 ? off == 0 : off < align);
	ASSERT(ISP2(align));
	ASSERT(align == 0 || align >= PAGESIZE);

	lo = (uintptr_t)*basep;
	hi = lo + *lenp;
	tot_len = minlen + 2 * redzone; /* need at least this much space */

	/*
	 * If hi rolled over the top, try cutting back.
	 */
	if (hi < lo) {
		*lenp = 0UL - lo - 1UL;
		/* See if this really happens. If so, then we figure out why */
		valid_va_range_aligned_wraparound++;
		hi = lo + *lenp;
	}
	if (*lenp < tot_len) {
		return (0);
	}

#if defined(__amd64)
	/*
	 * Deal with a possible hole in the address range between
	 * hole_start and hole_end that should never be mapped.
	 */
	if (lo < hole_start) {
		if (hi > hole_start) {
			if (hi < hole_end) {
				hi = hole_start;
			} else {
				/* lo < hole_start && hi >= hole_end */
				if (dir == AH_LO) {
					/*
					 * prefer lowest range
					 */
					if (hole_start - lo >= tot_len)
						hi = hole_start;
					else if (hi - hole_end >= tot_len)
						lo = hole_end;
					else
						return (0);
				} else {
					/*
					 * prefer highest range
					 */
					if (hi - hole_end >= tot_len)
						lo = hole_end;
					else if (hole_start - lo >= tot_len)
						hi = hole_start;
					else
						return (0);
				}
			}
		}
	} else {
		/* lo >= hole_start */
		if (hi < hole_end)
			return (0);
		if (lo < hole_end)
			lo = hole_end;
	}
#endif

	if (hi - lo < tot_len)
		return (0);

	if (align > 1) {
		uintptr_t tlo = lo + redzone;
		uintptr_t thi = hi - redzone;
		tlo = (uintptr_t)P2PHASEUP(tlo, align, off);
		if (tlo < lo + redzone) {
			return (0);
		}
		if (thi < tlo || thi - tlo < minlen) {
			return (0);
		}
	}

	*basep = (caddr_t)lo;
	*lenp = hi - lo;
	return (1);
}

/*
 * Determine whether [*basep, *basep + *lenp) contains a mappable range of
 * addresses at least "minlen" long.  On success, 1 is returned and *basep
 * and *lenp are adjusted to describe the acceptable range.  On failure, 0
 * is returned.
 */
int
valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
{
	return (valid_va_range_aligned(basep, lenp, minlen, dir, 0, 0, 0));
}

/*
 * Determine whether [addr, addr+len] are valid user addresses.
 */
/*ARGSUSED*/
int
valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
    caddr_t userlimit)
{
	caddr_t eaddr = addr + len;

	if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
		return (RANGE_BADADDR);

#if defined(__amd64)
	/*
	 * Check for the VA hole
	 */
	if (eaddr > (caddr_t)hole_start && addr < (caddr_t)hole_end)
		return (RANGE_BADADDR);
#endif

	return (RANGE_OKAY);
}

/*
 * Return 1 if the page frame is onboard memory, else 0.
 */
int
pf_is_memory(pfn_t pf)
{
	if (pfn_is_foreign(pf))
		return (0);
	return (address_in_memlist(phys_install, pfn_to_pa(pf), 1));
}

/*
 * return the memrange containing pfn
 */
int
memrange_num(pfn_t pfn)
{
	int n;

	for (n = 0; n < nranges - 1; ++n) {
		if (pfn >= memranges[n])
			break;
	}
	return (n);
}
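
/*
 * Worked example: with the default arch_memranges, memrange_num(0x120000)
 * returns MRI_4G (0) since pfn 0x120000 >= PFN_4GIG, while a pfn of 0x800
 * (below 16M) fails every test and falls out of the loop at MRI_0 (3).
 */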

/*
 * return the mnoderange containing pfn
 */
/*ARGSUSED*/
int
pfn_2_mtype(pfn_t pfn)
{
#if defined(__xpv)
	return (0);
#else
	int	n;

	/* Always start from highest pfn and work our way down */
	for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) {
		if (pfn >= mnoderanges[n].mnr_pfnlo) {
			break;
		}
	}
	return (n);
#endif
}

#if !defined(__xpv)
/*
 * is_contigpage_free:
 *	returns a page list of contiguous pages.  It minimally has to return
 *	minctg pages.  Caller determines minctg based on the scatter-gather
 *	list length.
 *
 *	pfnp is set to the next page frame to search on return.
 */
static page_t *
is_contigpage_free(
	pfn_t *pfnp,
	pgcnt_t *pgcnt,
	pgcnt_t minctg,
	uint64_t pfnseg,
	int iolock)
{
	int	i = 0;
	pfn_t	pfn = *pfnp;
	page_t	*pp;
	page_t	*plist = NULL;

	/*
	 * fail if pfn + minctg crosses a segment boundary.
	 * Adjust for next starting pfn to begin at segment boundary.
	 */

	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg)) {
		*pfnp = roundup(*pfnp, pfnseg + 1);
		return (NULL);
	}

	do {
retry:
		pp = page_numtopp_nolock(pfn + i);
		if ((pp == NULL) || IS_DUMP_PAGE(pp) ||
		    (page_trylock(pp, SE_EXCL) == 0)) {
			(*pfnp)++;
			break;
		}
		if (page_pptonum(pp) != pfn + i) {
			page_unlock(pp);
			goto retry;
		}

		if (!(PP_ISFREE(pp))) {
			page_unlock(pp);
			(*pfnp)++;
			break;
		}

		if (!PP_ISAGED(pp)) {
			page_list_sub(pp, PG_CACHE_LIST);
			page_hashout(pp, (kmutex_t *)NULL);
		} else {
			page_list_sub(pp, PG_FREE_LIST);
		}

		if (iolock)
			page_io_lock(pp);
		page_list_concat(&plist, &pp);

		/*
		 * exit loop when pgcnt satisfied or segment boundary reached.
		 */

	} while ((++i < *pgcnt) && ((pfn + i) & pfnseg));

	*pfnp += i;		/* set to next pfn to search */

	if (i >= minctg) {
		*pgcnt -= i;
		return (plist);
	}

	/*
	 * failure: minctg not satisfied.
	 *
	 * if next request crosses segment boundary, set next pfn
	 * to search from the segment boundary.
	 */
	if (((*pfnp + minctg - 1) & pfnseg) < (*pfnp & pfnseg))
		*pfnp = roundup(*pfnp, pfnseg + 1);

	/* clean up any pages already allocated */

	while (plist) {
		pp = plist;
		page_sub(&plist, pp);
		page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
		if (iolock)
			page_io_unlock(pp);
		page_unlock(pp);
	}

	return (NULL);
}
#endif	/* !__xpv */

/*
 * verify that pages being returned from allocator have correct DMA attribute
 */
#ifndef DEBUG
#define	check_dma(a, b, c) (void)(0)
#else
static void
check_dma(ddi_dma_attr_t *dma_attr, page_t *pp, int cnt)
{
	if (dma_attr == NULL)
		return;

	while (cnt-- > 0) {
		if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) <
		    dma_attr->dma_attr_addr_lo)
			panic("PFN (pp=%p) below dma_attr_addr_lo", (void *)pp);
		if (pa_to_ma(pfn_to_pa(pp->p_pagenum)) >=
		    dma_attr->dma_attr_addr_hi)
			panic("PFN (pp=%p) above dma_attr_addr_hi", (void *)pp);
		pp = pp->p_next;
	}
}
#endif

#if !defined(__xpv)
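/*
 * page_get_contigpage() gathers *pgcnt physically contiguous pages (in up
 * to sgllen chunks of at least minctg pages each) that satisfy the DMA
 * limits in "mattr".  The search resumes from where the previous call
 * left off (startpfn), wrapping around to the low bound if needed, and
 * starts above 16M when possible to conserve the legacy DMA pool.
 */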
static page_t *
page_get_contigpage(pgcnt_t *pgcnt, ddi_dma_attr_t *mattr, int iolock)
{
	pfn_t		pfn;
	int		sgllen;
	uint64_t	pfnseg;
	pgcnt_t		minctg;
	page_t		*pplist = NULL, *plist;
	uint64_t	lo, hi;
	pgcnt_t		pfnalign = 0;
	static pfn_t	startpfn;
	static pgcnt_t	lastctgcnt;
	uintptr_t	align;

	CONTIG_LOCK();

	if (mattr) {
		lo = mmu_btop((mattr->dma_attr_addr_lo + MMU_PAGEOFFSET));
		hi = mmu_btop(mattr->dma_attr_addr_hi);
		if (hi >= physmax)
			hi = physmax - 1;
		sgllen = mattr->dma_attr_sgllen;
		pfnseg = mmu_btop(mattr->dma_attr_seg);

		align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
		if (align > MMU_PAGESIZE)
			pfnalign = mmu_btop(align);

		/*
		 * in order to satisfy the request, must minimally
		 * acquire minctg contiguous pages
		 */
		minctg = howmany(*pgcnt, sgllen);

		ASSERT(hi >= lo);

		/*
		 * start from where we last searched if minctg >= lastctgcnt
		 */
		if (minctg < lastctgcnt || startpfn < lo || startpfn > hi)
			startpfn = lo;
	} else {
		hi = physmax - 1;
		lo = 0;
		sgllen = 1;
		pfnseg = mmu.highest_pfn;
		minctg = *pgcnt;

		if (minctg < lastctgcnt)
			startpfn = lo;
	}
	lastctgcnt = minctg;

	ASSERT(pfnseg + 1 >= (uint64_t)minctg);

	/* conserve 16m memory - start search above 16m when possible */
	if (hi > PFN_16M && startpfn < PFN_16M)
		startpfn = PFN_16M;

	pfn = startpfn;
	if (pfnalign)
		pfn = P2ROUNDUP(pfn, pfnalign);

	while (pfn + minctg - 1 <= hi) {

		plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock);
		if (plist) {
			page_list_concat(&pplist, &plist);
			sgllen--;
			/*
			 * return when contig pages no longer needed
			 */
			if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) {
				startpfn = pfn;
				CONTIG_UNLOCK();
				check_dma(mattr, pplist, *pgcnt);
				return (pplist);
			}
			minctg = howmany(*pgcnt, sgllen);
		}
11817c478bd9Sstevel@tonic-gate } 11827c478bd9Sstevel@tonic-gate if (pfnalign) 11837c478bd9Sstevel@tonic-gate pfn = P2ROUNDUP(pfn, pfnalign); 11847c478bd9Sstevel@tonic-gate } 11857c478bd9Sstevel@tonic-gate 11867c478bd9Sstevel@tonic-gate /* cannot find contig pages in specified range */ 11877c478bd9Sstevel@tonic-gate if (startpfn == lo) { 11887c478bd9Sstevel@tonic-gate CONTIG_UNLOCK(); 11897c478bd9Sstevel@tonic-gate return (NULL); 11907c478bd9Sstevel@tonic-gate } 11917c478bd9Sstevel@tonic-gate 11927c478bd9Sstevel@tonic-gate /* did not start with lo previously */ 11937c478bd9Sstevel@tonic-gate pfn = lo; 11947c478bd9Sstevel@tonic-gate if (pfnalign) 11957c478bd9Sstevel@tonic-gate pfn = P2ROUNDUP(pfn, pfnalign); 11967c478bd9Sstevel@tonic-gate 11977c478bd9Sstevel@tonic-gate /* allow search to go above startpfn */ 11987c478bd9Sstevel@tonic-gate while (pfn < startpfn) { 11997c478bd9Sstevel@tonic-gate 12007c478bd9Sstevel@tonic-gate plist = is_contigpage_free(&pfn, pgcnt, minctg, pfnseg, iolock); 12017c478bd9Sstevel@tonic-gate if (plist != NULL) { 12027c478bd9Sstevel@tonic-gate 12037c478bd9Sstevel@tonic-gate page_list_concat(&pplist, &plist); 12047c478bd9Sstevel@tonic-gate sgllen--; 12057c478bd9Sstevel@tonic-gate 12067c478bd9Sstevel@tonic-gate /* 12077c478bd9Sstevel@tonic-gate * return when contig pages no longer needed 12087c478bd9Sstevel@tonic-gate */ 12097c478bd9Sstevel@tonic-gate if (!*pgcnt || ((*pgcnt <= sgllen) && !pfnalign)) { 12107c478bd9Sstevel@tonic-gate startpfn = pfn; 12117c478bd9Sstevel@tonic-gate CONTIG_UNLOCK(); 12127c478bd9Sstevel@tonic-gate check_dma(mattr, pplist, *pgcnt); 12137c478bd9Sstevel@tonic-gate return (pplist); 12147c478bd9Sstevel@tonic-gate } 12157c478bd9Sstevel@tonic-gate minctg = howmany(*pgcnt, sgllen); 12167c478bd9Sstevel@tonic-gate } 12177c478bd9Sstevel@tonic-gate if (pfnalign) 12187c478bd9Sstevel@tonic-gate pfn = P2ROUNDUP(pfn, pfnalign); 12197c478bd9Sstevel@tonic-gate } 12207c478bd9Sstevel@tonic-gate CONTIG_UNLOCK(); 12217c478bd9Sstevel@tonic-gate return (NULL); 12227c478bd9Sstevel@tonic-gate } 1223843e1988Sjohnlev #endif /* !__xpv */ 12247c478bd9Sstevel@tonic-gate 12257c478bd9Sstevel@tonic-gate /* 12267c478bd9Sstevel@tonic-gate * mnode_range_cnt() calculates the number of memory ranges for mnode and 12277c478bd9Sstevel@tonic-gate * memranges[]. Used to determine the size of page lists and mnoderanges. 12287c478bd9Sstevel@tonic-gate */ 12297c478bd9Sstevel@tonic-gate int 12305d07b933Sdp78419 mnode_range_cnt(int mnode) 12317c478bd9Sstevel@tonic-gate { 1232843e1988Sjohnlev #if defined(__xpv) 1233843e1988Sjohnlev ASSERT(mnode == 0); 1234843e1988Sjohnlev return (1); 1235843e1988Sjohnlev #else /* __xpv */ 12367c478bd9Sstevel@tonic-gate int mri; 12377c478bd9Sstevel@tonic-gate int mnrcnt = 0; 12387c478bd9Sstevel@tonic-gate 12395d07b933Sdp78419 if (mem_node_config[mnode].exists != 0) { 12407c478bd9Sstevel@tonic-gate mri = nranges - 1; 12417c478bd9Sstevel@tonic-gate 12427c478bd9Sstevel@tonic-gate /* find the memranges index below contained in mnode range */ 12437c478bd9Sstevel@tonic-gate 12447c478bd9Sstevel@tonic-gate while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase) 12457c478bd9Sstevel@tonic-gate mri--; 12467c478bd9Sstevel@tonic-gate 12477c478bd9Sstevel@tonic-gate /* 12487c478bd9Sstevel@tonic-gate * increment mnode range counter when memranges or mnode 12497c478bd9Sstevel@tonic-gate * boundary is reached. 
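		 * Each pass through the loop below accounts for one overlap
		 * between [physbase, physmax] and a memrange, so mnrcnt ends
		 * up as the number of distinct (mnode, memrange) pairs this
		 * node contributes.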
12507c478bd9Sstevel@tonic-gate */ 12517c478bd9Sstevel@tonic-gate while (mri >= 0 && 12527c478bd9Sstevel@tonic-gate mem_node_config[mnode].physmax >= MEMRANGELO(mri)) { 12537c478bd9Sstevel@tonic-gate mnrcnt++; 12547c478bd9Sstevel@tonic-gate if (mem_node_config[mnode].physmax > MEMRANGEHI(mri)) 12557c478bd9Sstevel@tonic-gate mri--; 12567c478bd9Sstevel@tonic-gate else 12577c478bd9Sstevel@tonic-gate break; 12587c478bd9Sstevel@tonic-gate } 12597c478bd9Sstevel@tonic-gate } 12605d07b933Sdp78419 ASSERT(mnrcnt <= MAX_MNODE_MRANGES); 12617c478bd9Sstevel@tonic-gate return (mnrcnt); 1262843e1988Sjohnlev #endif /* __xpv */ 12637c478bd9Sstevel@tonic-gate } 12647c478bd9Sstevel@tonic-gate 1265843e1988Sjohnlev /* 1266843e1988Sjohnlev * mnode_range_setup() initializes mnoderanges. 1267843e1988Sjohnlev */ 12687c478bd9Sstevel@tonic-gate void 12697c478bd9Sstevel@tonic-gate mnode_range_setup(mnoderange_t *mnoderanges) 12707c478bd9Sstevel@tonic-gate { 1271a3114836SGerry Liu mnoderange_t *mp = mnoderanges; 12727c478bd9Sstevel@tonic-gate int mnode, mri; 1273a3114836SGerry Liu int mindex = 0; /* current index into mnoderanges array */ 1274a3114836SGerry Liu int i, j; 1275a3114836SGerry Liu pfn_t hipfn; 1276a3114836SGerry Liu int last, hi; 12777c478bd9Sstevel@tonic-gate 12787c478bd9Sstevel@tonic-gate for (mnode = 0; mnode < max_mem_nodes; mnode++) { 12797c478bd9Sstevel@tonic-gate if (mem_node_config[mnode].exists == 0) 12807c478bd9Sstevel@tonic-gate continue; 12817c478bd9Sstevel@tonic-gate 12827c478bd9Sstevel@tonic-gate mri = nranges - 1; 12837c478bd9Sstevel@tonic-gate 12847c478bd9Sstevel@tonic-gate while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase) 12857c478bd9Sstevel@tonic-gate mri--; 12867c478bd9Sstevel@tonic-gate 12877c478bd9Sstevel@tonic-gate while (mri >= 0 && mem_node_config[mnode].physmax >= 12887c478bd9Sstevel@tonic-gate MEMRANGELO(mri)) { 1289843e1988Sjohnlev mnoderanges->mnr_pfnlo = MAX(MEMRANGELO(mri), 12907c478bd9Sstevel@tonic-gate mem_node_config[mnode].physbase); 1291843e1988Sjohnlev mnoderanges->mnr_pfnhi = MIN(MEMRANGEHI(mri), 12927c478bd9Sstevel@tonic-gate mem_node_config[mnode].physmax); 12937c478bd9Sstevel@tonic-gate mnoderanges->mnr_mnode = mnode; 12947c478bd9Sstevel@tonic-gate mnoderanges->mnr_memrange = mri; 1295a3114836SGerry Liu mnoderanges->mnr_exists = 1; 12967c478bd9Sstevel@tonic-gate mnoderanges++; 1297a3114836SGerry Liu mindex++; 12987c478bd9Sstevel@tonic-gate if (mem_node_config[mnode].physmax > MEMRANGEHI(mri)) 12997c478bd9Sstevel@tonic-gate mri--; 13007c478bd9Sstevel@tonic-gate else 13017c478bd9Sstevel@tonic-gate break; 13027c478bd9Sstevel@tonic-gate } 13037c478bd9Sstevel@tonic-gate } 1304a3114836SGerry Liu 1305a3114836SGerry Liu /* 1306a3114836SGerry Liu * For now do a simple sort of the mnoderanges array to fill in 1307a3114836SGerry Liu * the mnr_next fields. Since mindex is expected to be relatively 1308a3114836SGerry Liu * small, using a simple O(N^2) algorithm. 
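	 * The result is a singly linked list threaded through the mnr_next
	 * fields in descending pfn order: mtypetop names the range with the
	 * highest pfns, and the range starting at pfn 0 (recorded in
	 * mtype16m) terminates the list with an mnr_next of -1.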
1309a3114836SGerry Liu */ 1310a3114836SGerry Liu for (i = 0; i < mindex; i++) { 1311a3114836SGerry Liu if (mp[i].mnr_pfnlo == 0) /* find lowest */ 1312a3114836SGerry Liu break; 13137c478bd9Sstevel@tonic-gate } 1314a3114836SGerry Liu ASSERT(i < mindex); 1315a3114836SGerry Liu last = i; 1316a3114836SGerry Liu mtype16m = last; 1317a3114836SGerry Liu mp[last].mnr_next = -1; 1318a3114836SGerry Liu for (i = 0; i < mindex - 1; i++) { 1319a3114836SGerry Liu hipfn = (pfn_t)(-1); 1320a3114836SGerry Liu hi = -1; 1321a3114836SGerry Liu /* find next highest mnode range */ 1322a3114836SGerry Liu for (j = 0; j < mindex; j++) { 1323a3114836SGerry Liu if (mp[j].mnr_pfnlo > mp[last].mnr_pfnlo && 1324a3114836SGerry Liu mp[j].mnr_pfnlo < hipfn) { 1325a3114836SGerry Liu hipfn = mp[j].mnr_pfnlo; 1326a3114836SGerry Liu hi = j; 1327a3114836SGerry Liu } 1328a3114836SGerry Liu } 1329a3114836SGerry Liu mp[hi].mnr_next = last; 1330a3114836SGerry Liu last = hi; 1331a3114836SGerry Liu } 1332a3114836SGerry Liu mtypetop = last; 1333a3114836SGerry Liu } 1334a3114836SGerry Liu 1335a3114836SGerry Liu #ifndef __xpv 1336a3114836SGerry Liu /* 1337a3114836SGerry Liu * Update mnoderanges for memory hot-add DR operations. 1338a3114836SGerry Liu */ 1339a3114836SGerry Liu static void 1340a3114836SGerry Liu mnode_range_add(int mnode) 1341a3114836SGerry Liu { 1342a3114836SGerry Liu int *prev; 1343a3114836SGerry Liu int n, mri; 1344a3114836SGerry Liu pfn_t start, end; 1345a3114836SGerry Liu extern void membar_sync(void); 1346a3114836SGerry Liu 1347a3114836SGerry Liu ASSERT(0 <= mnode && mnode < max_mem_nodes); 1348a3114836SGerry Liu ASSERT(mem_node_config[mnode].exists); 1349a3114836SGerry Liu start = mem_node_config[mnode].physbase; 1350a3114836SGerry Liu end = mem_node_config[mnode].physmax; 1351a3114836SGerry Liu ASSERT(start <= end); 1352a3114836SGerry Liu mutex_enter(&mnoderange_lock); 1353a3114836SGerry Liu 1354a3114836SGerry Liu #ifdef DEBUG 1355a3114836SGerry Liu /* Check whether it interleaves with other memory nodes. */ 1356a3114836SGerry Liu for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) { 1357a3114836SGerry Liu ASSERT(mnoderanges[n].mnr_exists); 1358a3114836SGerry Liu if (mnoderanges[n].mnr_mnode == mnode) 1359a3114836SGerry Liu continue; 1360a3114836SGerry Liu ASSERT(start > mnoderanges[n].mnr_pfnhi || 1361a3114836SGerry Liu end < mnoderanges[n].mnr_pfnlo); 1362a3114836SGerry Liu } 1363a3114836SGerry Liu #endif /* DEBUG */ 1364a3114836SGerry Liu 1365a3114836SGerry Liu mri = nranges - 1; 1366a3114836SGerry Liu while (MEMRANGEHI(mri) < mem_node_config[mnode].physbase) 1367a3114836SGerry Liu mri--; 1368a3114836SGerry Liu while (mri >= 0 && mem_node_config[mnode].physmax >= MEMRANGELO(mri)) { 1369a3114836SGerry Liu /* Check whether mtype already exists. */ 1370a3114836SGerry Liu for (n = mtypetop; n != -1; n = mnoderanges[n].mnr_next) { 1371a3114836SGerry Liu if (mnoderanges[n].mnr_mnode == mnode && 1372a3114836SGerry Liu mnoderanges[n].mnr_memrange == mri) { 1373a3114836SGerry Liu mnoderanges[n].mnr_pfnlo = MAX(MEMRANGELO(mri), 1374a3114836SGerry Liu start); 1375a3114836SGerry Liu mnoderanges[n].mnr_pfnhi = MIN(MEMRANGEHI(mri), 1376a3114836SGerry Liu end); 1377a3114836SGerry Liu break; 1378a3114836SGerry Liu } 1379a3114836SGerry Liu } 1380a3114836SGerry Liu 1381a3114836SGerry Liu /* Add a new entry if it doesn't exist yet. */ 1382a3114836SGerry Liu if (n == -1) { 1383a3114836SGerry Liu /* Try to find an unused entry in mnoderanges array. 
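			 * Unused slots are marked mnr_exists == 0;
			 * page_coloring_init() reserves spare slots whenever
			 * memory DR is supported, so this search is expected
			 * to succeed (hence the ASSERT below).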
*/ 1384a3114836SGerry Liu for (n = 0; n < mnoderangecnt; n++) { 1385a3114836SGerry Liu if (mnoderanges[n].mnr_exists == 0) 1386a3114836SGerry Liu break; 1387a3114836SGerry Liu } 1388a3114836SGerry Liu ASSERT(n < mnoderangecnt); 1389a3114836SGerry Liu mnoderanges[n].mnr_pfnlo = MAX(MEMRANGELO(mri), start); 1390a3114836SGerry Liu mnoderanges[n].mnr_pfnhi = MIN(MEMRANGEHI(mri), end); 1391a3114836SGerry Liu mnoderanges[n].mnr_mnode = mnode; 1392a3114836SGerry Liu mnoderanges[n].mnr_memrange = mri; 1393a3114836SGerry Liu mnoderanges[n].mnr_exists = 1; 1394a3114836SGerry Liu /* Page 0 should always be present. */ 1395a3114836SGerry Liu for (prev = &mtypetop; 1396a3114836SGerry Liu mnoderanges[*prev].mnr_pfnlo > start; 1397a3114836SGerry Liu prev = &mnoderanges[*prev].mnr_next) { 1398a3114836SGerry Liu ASSERT(mnoderanges[*prev].mnr_next >= 0); 1399a3114836SGerry Liu ASSERT(mnoderanges[*prev].mnr_pfnlo > end); 1400a3114836SGerry Liu } 1401a3114836SGerry Liu mnoderanges[n].mnr_next = *prev; 1402a3114836SGerry Liu membar_sync(); 1403a3114836SGerry Liu *prev = n; 1404a3114836SGerry Liu } 1405a3114836SGerry Liu 1406a3114836SGerry Liu if (mem_node_config[mnode].physmax > MEMRANGEHI(mri)) 1407a3114836SGerry Liu mri--; 1408a3114836SGerry Liu else 1409a3114836SGerry Liu break; 1410a3114836SGerry Liu } 1411a3114836SGerry Liu 1412a3114836SGerry Liu mutex_exit(&mnoderange_lock); 1413a3114836SGerry Liu } 1414a3114836SGerry Liu 1415a3114836SGerry Liu /* 1416a3114836SGerry Liu * Update mnoderanges for memory hot-removal DR operations. 1417a3114836SGerry Liu */ 1418a3114836SGerry Liu static void 1419a3114836SGerry Liu mnode_range_del(int mnode) 1420a3114836SGerry Liu { 1421a3114836SGerry Liu _NOTE(ARGUNUSED(mnode)); 1422a3114836SGerry Liu ASSERT(0 <= mnode && mnode < max_mem_nodes); 1423a3114836SGerry Liu /* TODO: support deletion operation. */ 1424a3114836SGerry Liu ASSERT(0); 1425a3114836SGerry Liu } 1426a3114836SGerry Liu 1427a3114836SGerry Liu void 1428a3114836SGerry Liu plat_slice_add(pfn_t start, pfn_t end) 1429a3114836SGerry Liu { 1430a3114836SGerry Liu mem_node_add_slice(start, end); 1431a3114836SGerry Liu if (plat_dr_enabled()) { 1432a3114836SGerry Liu mnode_range_add(PFN_2_MEM_NODE(start)); 1433a3114836SGerry Liu } 1434a3114836SGerry Liu } 1435a3114836SGerry Liu 1436a3114836SGerry Liu void 1437a3114836SGerry Liu plat_slice_del(pfn_t start, pfn_t end) 1438a3114836SGerry Liu { 1439a3114836SGerry Liu ASSERT(PFN_2_MEM_NODE(start) == PFN_2_MEM_NODE(end)); 1440a3114836SGerry Liu ASSERT(plat_dr_enabled()); 1441a3114836SGerry Liu mnode_range_del(PFN_2_MEM_NODE(start)); 1442a3114836SGerry Liu mem_node_del_slice(start, end); 1443a3114836SGerry Liu } 1444a3114836SGerry Liu #endif /* __xpv */ 14457c478bd9Sstevel@tonic-gate 1446843e1988Sjohnlev /*ARGSUSED*/ 1447843e1988Sjohnlev int 1448843e1988Sjohnlev mtype_init(vnode_t *vp, caddr_t vaddr, uint_t *flags, size_t pgsz) 1449843e1988Sjohnlev { 1450a3114836SGerry Liu int mtype = mtypetop; 1451843e1988Sjohnlev 1452843e1988Sjohnlev #if !defined(__xpv) 1453843e1988Sjohnlev #if defined(__i386) 1454843e1988Sjohnlev /* 1455843e1988Sjohnlev * set the mtype range 1456a3114836SGerry Liu * - kmem requests need to be below 4g if restricted_kmemalloc is set. 1457843e1988Sjohnlev * - for non kmem requests, set range to above 4g if memory below 4g 1458843e1988Sjohnlev * runs low. 
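	 * Either way, the mtype returned is only a starting point; the
	 * PGI_MT_* flags tell the page_get routines how far down the mtype
	 * list they may fall back if the preferred range turns out empty.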
1459843e1988Sjohnlev */ 1460843e1988Sjohnlev if (restricted_kmemalloc && VN_ISKAS(vp) && 1461843e1988Sjohnlev (caddr_t)(vaddr) >= kernelheap && 1462843e1988Sjohnlev (caddr_t)(vaddr) < ekernelheap) { 1463843e1988Sjohnlev ASSERT(physmax4g); 1464843e1988Sjohnlev mtype = mtype4g; 1465843e1988Sjohnlev if (RESTRICT16M_ALLOC(freemem4g - btop(pgsz), 1466843e1988Sjohnlev btop(pgsz), *flags)) { 1467843e1988Sjohnlev *flags |= PGI_MT_RANGE16M; 1468843e1988Sjohnlev } else { 1469843e1988Sjohnlev VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt); 1470843e1988Sjohnlev VM_STAT_COND_ADD((*flags & PG_PANIC), 1471843e1988Sjohnlev vmm_vmstats.pgpanicalloc); 1472843e1988Sjohnlev *flags |= PGI_MT_RANGE0; 1473843e1988Sjohnlev } 1474843e1988Sjohnlev return (mtype); 1475843e1988Sjohnlev } 1476843e1988Sjohnlev #endif /* __i386 */ 1477843e1988Sjohnlev 1478843e1988Sjohnlev if (RESTRICT4G_ALLOC) { 1479843e1988Sjohnlev VM_STAT_ADD(vmm_vmstats.restrict4gcnt); 1480843e1988Sjohnlev /* here only for > 4g systems */ 1481843e1988Sjohnlev *flags |= PGI_MT_RANGE4G; 1482843e1988Sjohnlev } else if (RESTRICT16M_ALLOC(freemem, btop(pgsz), *flags)) { 1483843e1988Sjohnlev *flags |= PGI_MT_RANGE16M; 1484843e1988Sjohnlev } else { 1485843e1988Sjohnlev VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt); 1486843e1988Sjohnlev VM_STAT_COND_ADD((*flags & PG_PANIC), vmm_vmstats.pgpanicalloc); 1487843e1988Sjohnlev *flags |= PGI_MT_RANGE0; 1488843e1988Sjohnlev } 1489843e1988Sjohnlev #endif /* !__xpv */ 1490843e1988Sjohnlev return (mtype); 1491843e1988Sjohnlev } 1492843e1988Sjohnlev 1493843e1988Sjohnlev 1494843e1988Sjohnlev /* mtype init for page_get_replacement_page */ 1495843e1988Sjohnlev /*ARGSUSED*/ 1496843e1988Sjohnlev int 1497843e1988Sjohnlev mtype_pgr_init(int *flags, page_t *pp, int mnode, pgcnt_t pgcnt) 1498843e1988Sjohnlev { 1499a3114836SGerry Liu int mtype = mtypetop; 1500a3114836SGerry Liu #if !defined(__xpv) 1501843e1988Sjohnlev if (RESTRICT16M_ALLOC(freemem, pgcnt, *flags)) { 1502843e1988Sjohnlev *flags |= PGI_MT_RANGE16M; 1503843e1988Sjohnlev } else { 1504843e1988Sjohnlev VM_STAT_ADD(vmm_vmstats.unrestrict16mcnt); 1505843e1988Sjohnlev *flags |= PGI_MT_RANGE0; 1506843e1988Sjohnlev } 1507843e1988Sjohnlev #endif 1508843e1988Sjohnlev return (mtype); 1509843e1988Sjohnlev } 1510843e1988Sjohnlev 15117c478bd9Sstevel@tonic-gate /* 15127c478bd9Sstevel@tonic-gate * Determine if the mnode range specified in mtype contains memory belonging 15137c478bd9Sstevel@tonic-gate * to memory node mnode. If flags & PGI_MT_RANGE is set then mtype contains 1514a3114836SGerry Liu * the range from high pfn to 0, 16m or 4g. 15157c478bd9Sstevel@tonic-gate * 15167c478bd9Sstevel@tonic-gate * Return first mnode range type index found otherwise return -1 if none found. 
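 * When PGI_MT_NEXT is also set, the search starts at the successor of the
 * given mtype rather than the mtype itself; this is how callers such as
 * mnode_pgcnt() below walk every range owned by one mnode.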
15177c478bd9Sstevel@tonic-gate */ 15187c478bd9Sstevel@tonic-gate int 15197c478bd9Sstevel@tonic-gate mtype_func(int mnode, int mtype, uint_t flags) 15207c478bd9Sstevel@tonic-gate { 15217c478bd9Sstevel@tonic-gate if (flags & PGI_MT_RANGE) { 1522a3114836SGerry Liu int mnr_lim = MRI_0; 15237c478bd9Sstevel@tonic-gate 1524a3114836SGerry Liu if (flags & PGI_MT_NEXT) { 1525a3114836SGerry Liu mtype = mnoderanges[mtype].mnr_next; 1526a3114836SGerry Liu } 1527843e1988Sjohnlev if (flags & PGI_MT_RANGE4G) 1528a3114836SGerry Liu mnr_lim = MRI_4G; /* exclude 0-4g range */ 152907ad560dSkchow else if (flags & PGI_MT_RANGE16M) 1530a3114836SGerry Liu mnr_lim = MRI_16M; /* exclude 0-16m range */ 1531a3114836SGerry Liu while (mtype != -1 && 1532a3114836SGerry Liu mnoderanges[mtype].mnr_memrange <= mnr_lim) { 15337c478bd9Sstevel@tonic-gate if (mnoderanges[mtype].mnr_mnode == mnode) 15347c478bd9Sstevel@tonic-gate return (mtype); 1535a3114836SGerry Liu mtype = mnoderanges[mtype].mnr_next; 15367c478bd9Sstevel@tonic-gate } 1537843e1988Sjohnlev } else if (mnoderanges[mtype].mnr_mnode == mnode) { 15387c478bd9Sstevel@tonic-gate return (mtype); 15397c478bd9Sstevel@tonic-gate } 15407c478bd9Sstevel@tonic-gate return (-1); 15417c478bd9Sstevel@tonic-gate } 15427c478bd9Sstevel@tonic-gate 15437c478bd9Sstevel@tonic-gate /* 1544e21bae1bSkchow * Update the page list max counts with the pfn range specified by the 1545a3114836SGerry Liu * input parameters. 1546e21bae1bSkchow */ 1547e21bae1bSkchow void 1548e21bae1bSkchow mtype_modify_max(pfn_t startpfn, long cnt) 1549e21bae1bSkchow { 1550a3114836SGerry Liu int mtype; 1551e21bae1bSkchow pgcnt_t inc; 1552a3114836SGerry Liu spgcnt_t scnt = (spgcnt_t)(cnt); 1553a3114836SGerry Liu pgcnt_t acnt = ABS(scnt); 1554a3114836SGerry Liu pfn_t endpfn = startpfn + acnt; 1555a3114836SGerry Liu pfn_t pfn, lo; 1556e21bae1bSkchow 1557843e1988Sjohnlev if (!physmax4g) 1558843e1988Sjohnlev return; 1559843e1988Sjohnlev 1560a3114836SGerry Liu mtype = mtypetop; 1561a3114836SGerry Liu for (pfn = endpfn; pfn > startpfn; ) { 1562a3114836SGerry Liu ASSERT(mtype != -1); 1563a3114836SGerry Liu lo = mnoderanges[mtype].mnr_pfnlo; 1564a3114836SGerry Liu if (pfn > lo) { 1565a3114836SGerry Liu if (startpfn >= lo) { 1566a3114836SGerry Liu inc = pfn - startpfn; 1567e21bae1bSkchow } else { 1568a3114836SGerry Liu inc = pfn - lo; 1569e21bae1bSkchow } 1570a3114836SGerry Liu if (mnoderanges[mtype].mnr_memrange != MRI_4G) { 1571a3114836SGerry Liu if (scnt > 0) 1572e21bae1bSkchow maxmem4g += inc; 1573a3114836SGerry Liu else 1574a3114836SGerry Liu maxmem4g -= inc; 1575e21bae1bSkchow } 1576a3114836SGerry Liu pfn -= inc; 1577a3114836SGerry Liu } 1578a3114836SGerry Liu mtype = mnoderanges[mtype].mnr_next; 1579e21bae1bSkchow } 1580e21bae1bSkchow } 1581e21bae1bSkchow 1582843e1988Sjohnlev int 1583843e1988Sjohnlev mtype_2_mrange(int mtype) 1584843e1988Sjohnlev { 1585843e1988Sjohnlev return (mnoderanges[mtype].mnr_memrange); 1586843e1988Sjohnlev } 1587843e1988Sjohnlev 1588843e1988Sjohnlev void 1589843e1988Sjohnlev mnodetype_2_pfn(int mnode, int mtype, pfn_t *pfnlo, pfn_t *pfnhi) 1590843e1988Sjohnlev { 1591a3114836SGerry Liu _NOTE(ARGUNUSED(mnode)); 1592843e1988Sjohnlev ASSERT(mnoderanges[mtype].mnr_mnode == mnode); 1593843e1988Sjohnlev *pfnlo = mnoderanges[mtype].mnr_pfnlo; 1594843e1988Sjohnlev *pfnhi = mnoderanges[mtype].mnr_pfnhi; 1595843e1988Sjohnlev } 1596843e1988Sjohnlev 1597843e1988Sjohnlev size_t 1598843e1988Sjohnlev plcnt_sz(size_t ctrs_sz) 1599843e1988Sjohnlev { 1600843e1988Sjohnlev #ifdef DEBUG 1601843e1988Sjohnlev int szc, 
colors; 1602843e1988Sjohnlev 1603843e1988Sjohnlev ctrs_sz += mnoderangecnt * sizeof (struct mnr_mts) * mmu_page_sizes; 1604843e1988Sjohnlev for (szc = 0; szc < mmu_page_sizes; szc++) { 1605843e1988Sjohnlev colors = page_get_pagecolors(szc); 1606843e1988Sjohnlev ctrs_sz += mnoderangecnt * sizeof (pgcnt_t) * colors; 1607843e1988Sjohnlev } 1608843e1988Sjohnlev #endif 1609843e1988Sjohnlev return (ctrs_sz); 1610843e1988Sjohnlev } 1611843e1988Sjohnlev 1612843e1988Sjohnlev caddr_t 1613843e1988Sjohnlev plcnt_init(caddr_t addr) 1614843e1988Sjohnlev { 1615843e1988Sjohnlev #ifdef DEBUG 1616843e1988Sjohnlev int mt, szc, colors; 1617843e1988Sjohnlev 1618843e1988Sjohnlev for (mt = 0; mt < mnoderangecnt; mt++) { 1619843e1988Sjohnlev mnoderanges[mt].mnr_mts = (struct mnr_mts *)addr; 1620843e1988Sjohnlev addr += (sizeof (struct mnr_mts) * mmu_page_sizes); 1621843e1988Sjohnlev for (szc = 0; szc < mmu_page_sizes; szc++) { 1622843e1988Sjohnlev colors = page_get_pagecolors(szc); 1623843e1988Sjohnlev mnoderanges[mt].mnr_mts[szc].mnr_mts_colors = colors; 1624843e1988Sjohnlev mnoderanges[mt].mnr_mts[szc].mnr_mtsc_pgcnt = 1625843e1988Sjohnlev (pgcnt_t *)addr; 1626843e1988Sjohnlev addr += (sizeof (pgcnt_t) * colors); 1627843e1988Sjohnlev } 1628843e1988Sjohnlev } 1629843e1988Sjohnlev #endif 1630843e1988Sjohnlev return (addr); 1631843e1988Sjohnlev } 1632843e1988Sjohnlev 1633843e1988Sjohnlev void 1634843e1988Sjohnlev plcnt_inc_dec(page_t *pp, int mtype, int szc, long cnt, int flags) 1635843e1988Sjohnlev { 1636a3114836SGerry Liu _NOTE(ARGUNUSED(pp)); 1637843e1988Sjohnlev #ifdef DEBUG 1638843e1988Sjohnlev int bin = PP_2_BIN(pp); 1639843e1988Sjohnlev 1640843e1988Sjohnlev atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].mnr_mts_pgcnt, cnt); 1641843e1988Sjohnlev atomic_add_long(&mnoderanges[mtype].mnr_mts[szc].mnr_mtsc_pgcnt[bin], 1642843e1988Sjohnlev cnt); 1643843e1988Sjohnlev #endif 1644843e1988Sjohnlev ASSERT(mtype == PP_2_MTYPE(pp)); 1645a3114836SGerry Liu if (physmax4g && mnoderanges[mtype].mnr_memrange != MRI_4G) 1646843e1988Sjohnlev atomic_add_long(&freemem4g, cnt); 1647843e1988Sjohnlev if (flags & PG_CACHE_LIST) 1648843e1988Sjohnlev atomic_add_long(&mnoderanges[mtype].mnr_mt_clpgcnt, cnt); 1649843e1988Sjohnlev else 165078b03d3aSkchow atomic_add_long(&mnoderanges[mtype].mnr_mt_flpgcnt[szc], cnt); 165178b03d3aSkchow atomic_add_long(&mnoderanges[mtype].mnr_mt_totcnt, cnt); 1652843e1988Sjohnlev } 1653843e1988Sjohnlev 1654e21bae1bSkchow /* 1655affbd3ccSkchow * Returns the free page count for mnode 1656affbd3ccSkchow */ 1657affbd3ccSkchow int 1658affbd3ccSkchow mnode_pgcnt(int mnode) 1659affbd3ccSkchow { 1660a3114836SGerry Liu int mtype = mtypetop; 1661affbd3ccSkchow int flags = PGI_MT_RANGE0; 1662affbd3ccSkchow pgcnt_t pgcnt = 0; 1663affbd3ccSkchow 1664affbd3ccSkchow mtype = mtype_func(mnode, mtype, flags); 1665affbd3ccSkchow 1666affbd3ccSkchow while (mtype != -1) { 166707ad560dSkchow pgcnt += MTYPE_FREEMEM(mtype); 1668affbd3ccSkchow mtype = mtype_func(mnode, mtype, flags | PGI_MT_NEXT); 1669affbd3ccSkchow } 1670affbd3ccSkchow return (pgcnt); 1671affbd3ccSkchow } 1672affbd3ccSkchow 1673affbd3ccSkchow /* 16747c478bd9Sstevel@tonic-gate * Initialize page coloring variables based on the l2 cache parameters. 16757c478bd9Sstevel@tonic-gate * Calculate and return memory needed for page coloring data structures. 
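 * The caller is expected to allocate a single buffer of the returned size
 * and hand it to page_coloring_setup(), which carves it up into the
 * mnoderanges array, the freelist/cachelist mutexes and the per-color
 * page list heads.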
16767c478bd9Sstevel@tonic-gate */ 16777c478bd9Sstevel@tonic-gate size_t 16787c478bd9Sstevel@tonic-gate page_coloring_init(uint_t l2_sz, int l2_linesz, int l2_assoc) 16797c478bd9Sstevel@tonic-gate { 1680a3114836SGerry Liu _NOTE(ARGUNUSED(l2_linesz)); 16817c478bd9Sstevel@tonic-gate size_t colorsz = 0; 16827c478bd9Sstevel@tonic-gate int i; 16837c478bd9Sstevel@tonic-gate int colors; 16847c478bd9Sstevel@tonic-gate 1685843e1988Sjohnlev #if defined(__xpv) 1686843e1988Sjohnlev /* 1687843e1988Sjohnlev * Hypervisor domains currently don't have any concept of NUMA. 1688843e1988Sjohnlev * Hence we'll act like there is only 1 memrange. 1689843e1988Sjohnlev */ 1690843e1988Sjohnlev i = memrange_num(1); 1691843e1988Sjohnlev #else /* !__xpv */ 16927c478bd9Sstevel@tonic-gate /* 16937c478bd9Sstevel@tonic-gate * Reduce the memory ranges lists if we don't have large amounts 16947c478bd9Sstevel@tonic-gate * of memory. This avoids searching known empty free lists. 1695a3114836SGerry Liu * To support memory DR operations, we need to keep memory ranges 1696a3114836SGerry Liu * for possible memory hot-add operations. 16977c478bd9Sstevel@tonic-gate */ 1698a3114836SGerry Liu if (plat_dr_physmax > physmax) 1699a3114836SGerry Liu i = memrange_num(plat_dr_physmax); 1700a3114836SGerry Liu else 17017c478bd9Sstevel@tonic-gate i = memrange_num(physmax); 17027c478bd9Sstevel@tonic-gate #if defined(__i386) 1703a3114836SGerry Liu if (i > MRI_4G) 17047c478bd9Sstevel@tonic-gate restricted_kmemalloc = 0; 17057c478bd9Sstevel@tonic-gate #endif 17067c478bd9Sstevel@tonic-gate /* physmax greater than 4g */ 1707a3114836SGerry Liu if (i == MRI_4G) 17087c478bd9Sstevel@tonic-gate physmax4g = 1; 1709843e1988Sjohnlev #endif /* !__xpv */ 1710843e1988Sjohnlev memranges += i; 1711843e1988Sjohnlev nranges -= i; 17127c478bd9Sstevel@tonic-gate 171302bc52beSkchow ASSERT(mmu_page_sizes <= MMU_PAGE_SIZES); 171402bc52beSkchow 17157c478bd9Sstevel@tonic-gate ASSERT(ISP2(l2_linesz)); 17167c478bd9Sstevel@tonic-gate ASSERT(l2_sz > MMU_PAGESIZE); 17177c478bd9Sstevel@tonic-gate 17187c478bd9Sstevel@tonic-gate /* l2_assoc is 0 for fully associative l2 cache */ 17197c478bd9Sstevel@tonic-gate if (l2_assoc) 17207c478bd9Sstevel@tonic-gate l2_colors = MAX(1, l2_sz / (l2_assoc * MMU_PAGESIZE)); 17217c478bd9Sstevel@tonic-gate else 17227c478bd9Sstevel@tonic-gate l2_colors = 1; 17237c478bd9Sstevel@tonic-gate 1724e4ab3d6dSvd224797 ASSERT(ISP2(l2_colors)); 1725e4ab3d6dSvd224797 17267c478bd9Sstevel@tonic-gate /* for scalability, configure at least PAGE_COLORS_MIN color bins */ 17277c478bd9Sstevel@tonic-gate page_colors = MAX(l2_colors, PAGE_COLORS_MIN); 17287c478bd9Sstevel@tonic-gate 17297c478bd9Sstevel@tonic-gate /* 17307c478bd9Sstevel@tonic-gate * cpu_page_colors is non-zero when a page color may be spread across 17317c478bd9Sstevel@tonic-gate * multiple bins. 
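	 * If, say, l2_colors works out to 2 while PAGE_COLORS_MIN raises
	 * page_colors to 16, a single hardware color is spread over
	 * 16/2 == 8 bins and cpu_page_colors is set to 2 below.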
17327c478bd9Sstevel@tonic-gate  */
17337c478bd9Sstevel@tonic-gate 	if (l2_colors < page_colors)
17347c478bd9Sstevel@tonic-gate 		cpu_page_colors = l2_colors;
17357c478bd9Sstevel@tonic-gate 
17367c478bd9Sstevel@tonic-gate 	ASSERT(ISP2(page_colors));
17377c478bd9Sstevel@tonic-gate 
17387c478bd9Sstevel@tonic-gate 	page_colors_mask = page_colors - 1;
17397c478bd9Sstevel@tonic-gate 
17407c478bd9Sstevel@tonic-gate 	ASSERT(ISP2(CPUSETSIZE()));
17417c478bd9Sstevel@tonic-gate 	page_coloring_shift = lowbit(CPUSETSIZE());
17427c478bd9Sstevel@tonic-gate 
17435d07b933Sdp78419 	/* initialize number of colors per page size */
17445d07b933Sdp78419 	for (i = 0; i <= mmu.max_page_level; i++) {
17455d07b933Sdp78419 		hw_page_array[i].hp_size = LEVEL_SIZE(i);
17465d07b933Sdp78419 		hw_page_array[i].hp_shift = LEVEL_SHIFT(i);
17475d07b933Sdp78419 		hw_page_array[i].hp_pgcnt = LEVEL_SIZE(i) >> LEVEL_SHIFT(0);
17485d07b933Sdp78419 		hw_page_array[i].hp_colors = (page_colors_mask >>
17495d07b933Sdp78419 		    (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift))
17505d07b933Sdp78419 		    + 1;
1751932dc8e5Sdp78419 		colorequivszc[i] = 0;
17525d07b933Sdp78419 	}
17535d07b933Sdp78419 
17545d07b933Sdp78419 	/*
17555d07b933Sdp78419 	 * The value of cpu_page_colors determines if additional color bins
17565d07b933Sdp78419 	 * need to be checked for a particular color in the page_get routines.
17575d07b933Sdp78419 	 */
17585d07b933Sdp78419 	if (cpu_page_colors != 0) {
17595d07b933Sdp78419 
17605d07b933Sdp78419 		int a = lowbit(page_colors) - lowbit(cpu_page_colors);
17615d07b933Sdp78419 		ASSERT(a > 0);
17625d07b933Sdp78419 		ASSERT(a < 16);
17635d07b933Sdp78419 
17645d07b933Sdp78419 		for (i = 0; i <= mmu.max_page_level; i++) {
17655d07b933Sdp78419 			if ((colors = hw_page_array[i].hp_colors) <= 1) {
17665d07b933Sdp78419 				colorequivszc[i] = 0;
17675d07b933Sdp78419 				continue;
17685d07b933Sdp78419 			}
17695d07b933Sdp78419 			while ((colors >> a) == 0)
17705d07b933Sdp78419 				a--;
17715d07b933Sdp78419 			ASSERT(a >= 0);
17725d07b933Sdp78419 
17735d07b933Sdp78419 			/* higher 4 bits encode color equiv mask */
17745d07b933Sdp78419 			colorequivszc[i] = (a << 4);
17755d07b933Sdp78419 		}
17765d07b933Sdp78419 	}
17775d07b933Sdp78419 
1778843e1988Sjohnlev 	/* factor in colorequiv to check additional 'equivalent' bins. */
1779843e1988Sjohnlev 	if (colorequiv > 1) {
1780843e1988Sjohnlev 
1781843e1988Sjohnlev 		int a = lowbit(colorequiv) - 1;
1782843e1988Sjohnlev 		if (a > 15)
1783843e1988Sjohnlev 			a = 15;
1784843e1988Sjohnlev 
1785843e1988Sjohnlev 		for (i = 0; i <= mmu.max_page_level; i++) {
1786843e1988Sjohnlev 			if ((colors = hw_page_array[i].hp_colors) <= 1) {
1787843e1988Sjohnlev 				continue;
1788843e1988Sjohnlev 			}
1789843e1988Sjohnlev 			while ((colors >> a) == 0)
1790843e1988Sjohnlev 				a--;
1791843e1988Sjohnlev 			if ((a << 4) > colorequivszc[i]) {
1792843e1988Sjohnlev 				colorequivszc[i] = (a << 4);
1793843e1988Sjohnlev 			}
1794843e1988Sjohnlev 		}
1795843e1988Sjohnlev 	}
1796843e1988Sjohnlev 
17977c478bd9Sstevel@tonic-gate 	/* size for mnoderanges */
17985d07b933Sdp78419 	for (mnoderangecnt = 0, i = 0; i < max_mem_nodes; i++)
17995d07b933Sdp78419 		mnoderangecnt += mnode_range_cnt(i);
1800a3114836SGerry Liu 	if (plat_dr_support_memory()) {
1801a3114836SGerry Liu 		/*
1802a3114836SGerry Liu 		 * Reserve enough space for memory DR operations.
1803a3114836SGerry Liu 		 * Two extra mnoderanges for possible fragmentation:
1804a3114836SGerry Liu 		 * one for the 2G boundary and the other for the 4G boundary.
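		 * (A range that straddles a memrange boundary has to be
		 * split into separate mnoderange entries, which is what
		 * costs the extra slots.)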
1805a3114836SGerry Liu * We don't expect a memory board crossing the 16M boundary 1806a3114836SGerry Liu * for memory hot-add operations on x86 platforms. 1807a3114836SGerry Liu */ 1808a3114836SGerry Liu mnoderangecnt += 2 + max_mem_nodes - lgrp_plat_node_cnt; 1809a3114836SGerry Liu } 18107c478bd9Sstevel@tonic-gate colorsz = mnoderangecnt * sizeof (mnoderange_t); 18117c478bd9Sstevel@tonic-gate 18127c478bd9Sstevel@tonic-gate /* size for fpc_mutex and cpc_mutex */ 18137c478bd9Sstevel@tonic-gate colorsz += (2 * max_mem_nodes * sizeof (kmutex_t) * NPC_MUTEX); 18147c478bd9Sstevel@tonic-gate 18157c478bd9Sstevel@tonic-gate /* size of page_freelists */ 18167c478bd9Sstevel@tonic-gate colorsz += mnoderangecnt * sizeof (page_t ***); 18177c478bd9Sstevel@tonic-gate colorsz += mnoderangecnt * mmu_page_sizes * sizeof (page_t **); 18187c478bd9Sstevel@tonic-gate 18197c478bd9Sstevel@tonic-gate for (i = 0; i < mmu_page_sizes; i++) { 18207c478bd9Sstevel@tonic-gate colors = page_get_pagecolors(i); 18217c478bd9Sstevel@tonic-gate colorsz += mnoderangecnt * colors * sizeof (page_t *); 18227c478bd9Sstevel@tonic-gate } 18237c478bd9Sstevel@tonic-gate 18247c478bd9Sstevel@tonic-gate /* size of page_cachelists */ 18257c478bd9Sstevel@tonic-gate colorsz += mnoderangecnt * sizeof (page_t **); 18267c478bd9Sstevel@tonic-gate colorsz += mnoderangecnt * page_colors * sizeof (page_t *); 18277c478bd9Sstevel@tonic-gate 18287c478bd9Sstevel@tonic-gate return (colorsz); 18297c478bd9Sstevel@tonic-gate } 18307c478bd9Sstevel@tonic-gate 18317c478bd9Sstevel@tonic-gate /* 18327c478bd9Sstevel@tonic-gate * Called once at startup to configure page_coloring data structures and 18337c478bd9Sstevel@tonic-gate * does the 1st page_free()/page_freelist_add(). 18347c478bd9Sstevel@tonic-gate */ 18357c478bd9Sstevel@tonic-gate void 18367c478bd9Sstevel@tonic-gate page_coloring_setup(caddr_t pcmemaddr) 18377c478bd9Sstevel@tonic-gate { 18387c478bd9Sstevel@tonic-gate int i; 18397c478bd9Sstevel@tonic-gate int j; 18407c478bd9Sstevel@tonic-gate int k; 18417c478bd9Sstevel@tonic-gate caddr_t addr; 18427c478bd9Sstevel@tonic-gate int colors; 18437c478bd9Sstevel@tonic-gate 18447c478bd9Sstevel@tonic-gate /* 18457c478bd9Sstevel@tonic-gate * do page coloring setup 18467c478bd9Sstevel@tonic-gate */ 18477c478bd9Sstevel@tonic-gate addr = pcmemaddr; 18487c478bd9Sstevel@tonic-gate 18497c478bd9Sstevel@tonic-gate mnoderanges = (mnoderange_t *)addr; 18507c478bd9Sstevel@tonic-gate addr += (mnoderangecnt * sizeof (mnoderange_t)); 18517c478bd9Sstevel@tonic-gate 18527c478bd9Sstevel@tonic-gate mnode_range_setup(mnoderanges); 18537c478bd9Sstevel@tonic-gate 18547c478bd9Sstevel@tonic-gate if (physmax4g) 18557c478bd9Sstevel@tonic-gate mtype4g = pfn_2_mtype(0xfffff); 18567c478bd9Sstevel@tonic-gate 18577c478bd9Sstevel@tonic-gate for (k = 0; k < NPC_MUTEX; k++) { 18587c478bd9Sstevel@tonic-gate fpc_mutex[k] = (kmutex_t *)addr; 18597c478bd9Sstevel@tonic-gate addr += (max_mem_nodes * sizeof (kmutex_t)); 18607c478bd9Sstevel@tonic-gate } 18617c478bd9Sstevel@tonic-gate for (k = 0; k < NPC_MUTEX; k++) { 18627c478bd9Sstevel@tonic-gate cpc_mutex[k] = (kmutex_t *)addr; 18637c478bd9Sstevel@tonic-gate addr += (max_mem_nodes * sizeof (kmutex_t)); 18647c478bd9Sstevel@tonic-gate } 1865d94ffb28Sjmcp page_freelists = (page_t ****)addr; 18667c478bd9Sstevel@tonic-gate addr += (mnoderangecnt * sizeof (page_t ***)); 18677c478bd9Sstevel@tonic-gate 18687c478bd9Sstevel@tonic-gate page_cachelists = (page_t ***)addr; 18697c478bd9Sstevel@tonic-gate addr += (mnoderangecnt * sizeof (page_t **)); 
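	/*
	 * The rest of the buffer is carved into the per-mnoderange
	 * freelist and cachelist head arrays below; the order and sizes
	 * here must stay in sync with the layout computed in
	 * page_coloring_init().
	 */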
18707c478bd9Sstevel@tonic-gate 18717c478bd9Sstevel@tonic-gate for (i = 0; i < mnoderangecnt; i++) { 1872d94ffb28Sjmcp page_freelists[i] = (page_t ***)addr; 18737c478bd9Sstevel@tonic-gate addr += (mmu_page_sizes * sizeof (page_t **)); 18747c478bd9Sstevel@tonic-gate 18757c478bd9Sstevel@tonic-gate for (j = 0; j < mmu_page_sizes; j++) { 18767c478bd9Sstevel@tonic-gate colors = page_get_pagecolors(j); 1877d94ffb28Sjmcp page_freelists[i][j] = (page_t **)addr; 18787c478bd9Sstevel@tonic-gate addr += (colors * sizeof (page_t *)); 18797c478bd9Sstevel@tonic-gate } 18807c478bd9Sstevel@tonic-gate page_cachelists[i] = (page_t **)addr; 18817c478bd9Sstevel@tonic-gate addr += (page_colors * sizeof (page_t *)); 18827c478bd9Sstevel@tonic-gate } 18837c478bd9Sstevel@tonic-gate } 18847c478bd9Sstevel@tonic-gate 1885843e1988Sjohnlev #if defined(__xpv) 1886843e1988Sjohnlev /* 1887843e1988Sjohnlev * Give back 10% of the io_pool pages to the free list. 1888843e1988Sjohnlev * Don't shrink the pool below some absolute minimum. 1889843e1988Sjohnlev */ 1890843e1988Sjohnlev static void 1891843e1988Sjohnlev page_io_pool_shrink() 1892843e1988Sjohnlev { 1893843e1988Sjohnlev int retcnt; 1894843e1988Sjohnlev page_t *pp, *pp_first, *pp_last, **curpool; 1895843e1988Sjohnlev mfn_t mfn; 1896843e1988Sjohnlev int bothpools = 0; 1897843e1988Sjohnlev 1898843e1988Sjohnlev mutex_enter(&io_pool_lock); 1899843e1988Sjohnlev io_pool_shrink_attempts++; /* should be a kstat? */ 1900843e1988Sjohnlev retcnt = io_pool_cnt / 10; 1901843e1988Sjohnlev if (io_pool_cnt - retcnt < io_pool_cnt_min) 1902843e1988Sjohnlev retcnt = io_pool_cnt - io_pool_cnt_min; 1903843e1988Sjohnlev if (retcnt <= 0) 1904843e1988Sjohnlev goto done; 1905843e1988Sjohnlev io_pool_shrinks++; /* should be a kstat? */ 1906843e1988Sjohnlev curpool = &io_pool_4g; 1907843e1988Sjohnlev domore: 1908843e1988Sjohnlev /* 1909843e1988Sjohnlev * Loop through taking pages from the end of the list 1910843e1988Sjohnlev * (highest mfns) till amount to return reached. 1911843e1988Sjohnlev */ 1912843e1988Sjohnlev for (pp = *curpool; pp && retcnt > 0; ) { 1913843e1988Sjohnlev pp_first = pp_last = pp->p_prev; 1914843e1988Sjohnlev if (pp_first == *curpool) 1915843e1988Sjohnlev break; 1916843e1988Sjohnlev retcnt--; 1917843e1988Sjohnlev io_pool_cnt--; 1918843e1988Sjohnlev page_io_pool_sub(curpool, pp_first, pp_last); 1919843e1988Sjohnlev if ((mfn = pfn_to_mfn(pp->p_pagenum)) < start_mfn) 1920843e1988Sjohnlev start_mfn = mfn; 1921843e1988Sjohnlev page_free(pp_first, 1); 1922843e1988Sjohnlev pp = *curpool; 1923843e1988Sjohnlev } 1924843e1988Sjohnlev if (retcnt != 0 && !bothpools) { 1925843e1988Sjohnlev /* 1926843e1988Sjohnlev * If not enough found in less constrained pool try the 1927843e1988Sjohnlev * more constrained one. 1928843e1988Sjohnlev */ 1929843e1988Sjohnlev curpool = &io_pool_16m; 1930843e1988Sjohnlev bothpools = 1; 1931843e1988Sjohnlev goto domore; 1932843e1988Sjohnlev } 1933843e1988Sjohnlev done: 1934843e1988Sjohnlev mutex_exit(&io_pool_lock); 1935843e1988Sjohnlev } 1936843e1988Sjohnlev 1937843e1988Sjohnlev #endif /* __xpv */ 1938843e1988Sjohnlev 1939843e1988Sjohnlev uint_t 1940843e1988Sjohnlev page_create_update_flags_x86(uint_t flags) 1941843e1988Sjohnlev { 1942843e1988Sjohnlev #if defined(__xpv) 1943843e1988Sjohnlev /* 1944843e1988Sjohnlev * Check this is an urgent allocation and free pages are depleted. 
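	 * If so, shrinking the io_pool puts hoarded pages back on the free
	 * lists, giving a non-waiting allocation a better chance of success.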
1945843e1988Sjohnlev */ 1946843e1988Sjohnlev if (!(flags & PG_WAIT) && freemem < desfree) 1947843e1988Sjohnlev page_io_pool_shrink(); 1948843e1988Sjohnlev #else /* !__xpv */ 1949843e1988Sjohnlev /* 1950843e1988Sjohnlev * page_create_get_something may call this because 4g memory may be 1951843e1988Sjohnlev * depleted. Set flags to allow for relocation of base page below 1952843e1988Sjohnlev * 4g if necessary. 1953843e1988Sjohnlev */ 1954843e1988Sjohnlev if (physmax4g) 1955843e1988Sjohnlev flags |= (PGI_PGCPSZC0 | PGI_PGCPHIPRI); 1956843e1988Sjohnlev #endif /* __xpv */ 1957843e1988Sjohnlev return (flags); 1958843e1988Sjohnlev } 1959843e1988Sjohnlev 19607c478bd9Sstevel@tonic-gate /*ARGSUSED*/ 19617c478bd9Sstevel@tonic-gate int 19627c478bd9Sstevel@tonic-gate bp_color(struct buf *bp) 19637c478bd9Sstevel@tonic-gate { 19647c478bd9Sstevel@tonic-gate return (0); 19657c478bd9Sstevel@tonic-gate } 19667c478bd9Sstevel@tonic-gate 1967843e1988Sjohnlev #if defined(__xpv) 1968843e1988Sjohnlev 1969843e1988Sjohnlev /* 1970843e1988Sjohnlev * Take pages out of an io_pool 1971843e1988Sjohnlev */ 1972843e1988Sjohnlev static void 1973843e1988Sjohnlev page_io_pool_sub(page_t **poolp, page_t *pp_first, page_t *pp_last) 1974843e1988Sjohnlev { 1975843e1988Sjohnlev if (*poolp == pp_first) { 1976843e1988Sjohnlev *poolp = pp_last->p_next; 1977843e1988Sjohnlev if (*poolp == pp_first) 1978843e1988Sjohnlev *poolp = NULL; 1979843e1988Sjohnlev } 1980843e1988Sjohnlev pp_first->p_prev->p_next = pp_last->p_next; 1981843e1988Sjohnlev pp_last->p_next->p_prev = pp_first->p_prev; 1982843e1988Sjohnlev pp_first->p_prev = pp_last; 1983843e1988Sjohnlev pp_last->p_next = pp_first; 1984843e1988Sjohnlev } 1985843e1988Sjohnlev 1986843e1988Sjohnlev /* 1987843e1988Sjohnlev * Put a page on the io_pool list. The list is ordered by increasing MFN. 1988843e1988Sjohnlev */ 1989843e1988Sjohnlev static void 1990843e1988Sjohnlev page_io_pool_add(page_t **poolp, page_t *pp) 1991843e1988Sjohnlev { 1992843e1988Sjohnlev page_t *look; 1993843e1988Sjohnlev mfn_t mfn = mfn_list[pp->p_pagenum]; 1994843e1988Sjohnlev 1995843e1988Sjohnlev if (*poolp == NULL) { 1996843e1988Sjohnlev *poolp = pp; 1997843e1988Sjohnlev pp->p_next = pp; 1998843e1988Sjohnlev pp->p_prev = pp; 1999843e1988Sjohnlev return; 2000843e1988Sjohnlev } 2001843e1988Sjohnlev 2002843e1988Sjohnlev /* 2003843e1988Sjohnlev * Since we try to take pages from the high end of the pool 2004843e1988Sjohnlev * chances are good that the pages to be put on the list will 2005843e1988Sjohnlev * go at or near the end of the list. so start at the end and 2006843e1988Sjohnlev * work backwards. 
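	 * Either way the list must remain ordered by increasing mfn;
	 * starting at the tail is purely a heuristic to shorten the
	 * linear scan.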
2007843e1988Sjohnlev */ 2008843e1988Sjohnlev look = (*poolp)->p_prev; 2009843e1988Sjohnlev while (mfn < mfn_list[look->p_pagenum]) { 2010843e1988Sjohnlev look = look->p_prev; 2011843e1988Sjohnlev if (look == (*poolp)->p_prev) 2012843e1988Sjohnlev break; /* backed all the way to front of list */ 2013843e1988Sjohnlev } 2014843e1988Sjohnlev 2015843e1988Sjohnlev /* insert after look */ 2016843e1988Sjohnlev pp->p_prev = look; 2017843e1988Sjohnlev pp->p_next = look->p_next; 2018843e1988Sjohnlev pp->p_next->p_prev = pp; 2019843e1988Sjohnlev look->p_next = pp; 2020843e1988Sjohnlev if (mfn < mfn_list[(*poolp)->p_pagenum]) { 2021843e1988Sjohnlev /* 2022843e1988Sjohnlev * we inserted a new first list element 2023843e1988Sjohnlev * adjust pool pointer to newly inserted element 2024843e1988Sjohnlev */ 2025843e1988Sjohnlev *poolp = pp; 2026843e1988Sjohnlev } 2027843e1988Sjohnlev } 2028843e1988Sjohnlev 2029843e1988Sjohnlev /* 2030843e1988Sjohnlev * Add a page to the io_pool. Setting the force flag will force the page 2031843e1988Sjohnlev * into the io_pool no matter what. 2032843e1988Sjohnlev */ 2033843e1988Sjohnlev static void 2034843e1988Sjohnlev add_page_to_pool(page_t *pp, int force) 2035843e1988Sjohnlev { 2036843e1988Sjohnlev page_t *highest; 2037843e1988Sjohnlev page_t *freep = NULL; 2038843e1988Sjohnlev 2039843e1988Sjohnlev mutex_enter(&io_pool_lock); 2040843e1988Sjohnlev /* 2041843e1988Sjohnlev * Always keep the scarce low memory pages 2042843e1988Sjohnlev */ 2043843e1988Sjohnlev if (mfn_list[pp->p_pagenum] < PFN_16MEG) { 2044843e1988Sjohnlev ++io_pool_cnt; 2045843e1988Sjohnlev page_io_pool_add(&io_pool_16m, pp); 2046843e1988Sjohnlev goto done; 2047843e1988Sjohnlev } 2048cf902cd2Ssmaybe if (io_pool_cnt < io_pool_cnt_max || force || io_pool_4g == NULL) { 2049843e1988Sjohnlev ++io_pool_cnt; 2050843e1988Sjohnlev page_io_pool_add(&io_pool_4g, pp); 2051843e1988Sjohnlev } else { 2052843e1988Sjohnlev highest = io_pool_4g->p_prev; 2053843e1988Sjohnlev if (mfn_list[pp->p_pagenum] < mfn_list[highest->p_pagenum]) { 2054843e1988Sjohnlev page_io_pool_sub(&io_pool_4g, highest, highest); 2055843e1988Sjohnlev page_io_pool_add(&io_pool_4g, pp); 2056843e1988Sjohnlev freep = highest; 2057843e1988Sjohnlev } else { 2058843e1988Sjohnlev freep = pp; 2059843e1988Sjohnlev } 2060843e1988Sjohnlev } 2061843e1988Sjohnlev done: 2062843e1988Sjohnlev mutex_exit(&io_pool_lock); 2063843e1988Sjohnlev if (freep) 2064843e1988Sjohnlev page_free(freep, 1); 2065843e1988Sjohnlev } 2066843e1988Sjohnlev 2067843e1988Sjohnlev 2068843e1988Sjohnlev int contig_pfn_cnt; /* no of pfns in the contig pfn list */ 2069843e1988Sjohnlev int contig_pfn_max; /* capacity of the contig pfn list */ 2070843e1988Sjohnlev int next_alloc_pfn; /* next position in list to start a contig search */ 2071843e1988Sjohnlev int contig_pfnlist_updates; /* pfn list update count */ 2072843e1988Sjohnlev int contig_pfnlist_builds; /* how many times have we (re)built list */ 2073843e1988Sjohnlev int contig_pfnlist_buildfailed; /* how many times has list build failed */ 2074843e1988Sjohnlev int create_contig_pending; /* nonzero means taskq creating contig list */ 2075843e1988Sjohnlev pfn_t *contig_pfn_list = NULL; /* list of contig pfns in ascending mfn order */ 2076843e1988Sjohnlev 2077843e1988Sjohnlev /* 2078843e1988Sjohnlev * Function to use in sorting a list of pfns by their underlying mfns. 
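 * It follows the usual negative/zero/positive comparator contract, so
 * qsort() leaves contig_pfn_list in ascending mfn order.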
2079843e1988Sjohnlev  */
2080843e1988Sjohnlev static int
2081843e1988Sjohnlev mfn_compare(const void *pfnp1, const void *pfnp2)
2082843e1988Sjohnlev {
2083843e1988Sjohnlev 	mfn_t mfn1 = mfn_list[*(pfn_t *)pfnp1];
2084843e1988Sjohnlev 	mfn_t mfn2 = mfn_list[*(pfn_t *)pfnp2];
2085843e1988Sjohnlev 
2086843e1988Sjohnlev 	if (mfn1 > mfn2)
2087843e1988Sjohnlev 		return (1);
2088843e1988Sjohnlev 	if (mfn1 < mfn2)
2089843e1988Sjohnlev 		return (-1);
2090843e1988Sjohnlev 	return (0);
2091843e1988Sjohnlev }
2092843e1988Sjohnlev 
2093843e1988Sjohnlev /*
2094843e1988Sjohnlev  * Compact the contig_pfn_list by tossing all the non-contiguous
2095843e1988Sjohnlev  * elements from the list.
2096843e1988Sjohnlev  */
2097843e1988Sjohnlev static void
2098843e1988Sjohnlev compact_contig_pfn_list(void)
2099843e1988Sjohnlev {
2100843e1988Sjohnlev 	pfn_t pfn, lapfn, prev_lapfn;
2101843e1988Sjohnlev 	mfn_t mfn;
2102843e1988Sjohnlev 	int i, newcnt = 0;
2103843e1988Sjohnlev 
2104843e1988Sjohnlev 	prev_lapfn = 0;
2105843e1988Sjohnlev 	for (i = 0; i < contig_pfn_cnt - 1; i++) {
2106843e1988Sjohnlev 		pfn = contig_pfn_list[i];
2107843e1988Sjohnlev 		lapfn = contig_pfn_list[i + 1];
2108843e1988Sjohnlev 		mfn = mfn_list[pfn];
2109843e1988Sjohnlev 		/*
2110843e1988Sjohnlev 		 * See if next pfn is for a contig mfn
2111843e1988Sjohnlev 		 */
2112843e1988Sjohnlev 		if (mfn_list[lapfn] != mfn + 1)
2113843e1988Sjohnlev 			continue;
2114843e1988Sjohnlev 		/*
2115843e1988Sjohnlev 		 * pfn and lookahead are both put in list
2116843e1988Sjohnlev 		 * unless pfn is the previous lookahead.
2117843e1988Sjohnlev 		 */
2118843e1988Sjohnlev 		if (pfn != prev_lapfn)
2119843e1988Sjohnlev 			contig_pfn_list[newcnt++] = pfn;
2120843e1988Sjohnlev 		contig_pfn_list[newcnt++] = lapfn;
2121843e1988Sjohnlev 		prev_lapfn = lapfn;
2122843e1988Sjohnlev 	}
2123843e1988Sjohnlev 	for (i = newcnt; i < contig_pfn_cnt; i++)
2124843e1988Sjohnlev 		contig_pfn_list[i] = 0;
2125843e1988Sjohnlev 	contig_pfn_cnt = newcnt;
2126843e1988Sjohnlev }
2127843e1988Sjohnlev 
2128843e1988Sjohnlev /*ARGSUSED*/
2129843e1988Sjohnlev static void
2130843e1988Sjohnlev call_create_contiglist(void *arg)
2131843e1988Sjohnlev {
2132843e1988Sjohnlev 	(void) create_contig_pfnlist(PG_WAIT);
2133843e1988Sjohnlev }
2134843e1988Sjohnlev 
2135843e1988Sjohnlev /*
2136843e1988Sjohnlev  * Create list of freelist pfns that have underlying
2137843e1988Sjohnlev  * contiguous mfns.  The list is kept in ascending mfn order.
2138843e1988Sjohnlev  * Returns 1 if the list was created, else 0.
2139843e1988Sjohnlev  */
2140843e1988Sjohnlev static int
2141843e1988Sjohnlev create_contig_pfnlist(uint_t flags)
2142843e1988Sjohnlev {
2143843e1988Sjohnlev 	pfn_t pfn;
2144843e1988Sjohnlev 	page_t *pp;
2145b9bc7f78Ssmaybe 	int ret = 1;
2146843e1988Sjohnlev 
2147b9bc7f78Ssmaybe 	mutex_enter(&contig_list_lock);
2148843e1988Sjohnlev 	if (contig_pfn_list != NULL)
2149b9bc7f78Ssmaybe 		goto out;
2150843e1988Sjohnlev 	contig_pfn_max = freemem + (freemem / 10);
2151843e1988Sjohnlev 	contig_pfn_list = kmem_zalloc(contig_pfn_max * sizeof (pfn_t),
2152843e1988Sjohnlev 	    (flags & PG_WAIT) ? KM_SLEEP : KM_NOSLEEP);
2153843e1988Sjohnlev 	if (contig_pfn_list == NULL) {
2154843e1988Sjohnlev 		/*
2155843e1988Sjohnlev 		 * If we could not create the contig list (because
2156843e1988Sjohnlev 		 * we could not sleep for memory), dispatch a taskq that can
2157843e1988Sjohnlev 		 * sleep to get the memory.
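		 * The taskq callback simply retries with PG_WAIT;
		 * create_contig_pending ensures only one such retry is
		 * queued at a time.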
2158843e1988Sjohnlev */ 2159843e1988Sjohnlev if (!create_contig_pending) { 2160843e1988Sjohnlev if (taskq_dispatch(system_taskq, call_create_contiglist, 2161843e1988Sjohnlev NULL, TQ_NOSLEEP) != NULL) 2162843e1988Sjohnlev create_contig_pending = 1; 2163843e1988Sjohnlev } 2164843e1988Sjohnlev contig_pfnlist_buildfailed++; /* count list build failures */ 2165b9bc7f78Ssmaybe ret = 0; 2166b9bc7f78Ssmaybe goto out; 2167843e1988Sjohnlev } 2168b9bc7f78Ssmaybe create_contig_pending = 0; 2169843e1988Sjohnlev ASSERT(contig_pfn_cnt == 0); 2170843e1988Sjohnlev for (pfn = 0; pfn < mfn_count; pfn++) { 2171843e1988Sjohnlev pp = page_numtopp_nolock(pfn); 2172843e1988Sjohnlev if (pp == NULL || !PP_ISFREE(pp)) 2173843e1988Sjohnlev continue; 2174843e1988Sjohnlev contig_pfn_list[contig_pfn_cnt] = pfn; 2175843e1988Sjohnlev if (++contig_pfn_cnt == contig_pfn_max) 2176843e1988Sjohnlev break; 2177843e1988Sjohnlev } 21786358f641SStuart Maybee /* 21796358f641SStuart Maybee * Sanity check the new list. 21806358f641SStuart Maybee */ 21816358f641SStuart Maybee if (contig_pfn_cnt < 2) { /* no contig pfns */ 21826358f641SStuart Maybee contig_pfn_cnt = 0; 21836358f641SStuart Maybee contig_pfnlist_buildfailed++; 21846358f641SStuart Maybee kmem_free(contig_pfn_list, contig_pfn_max * sizeof (pfn_t)); 21856358f641SStuart Maybee contig_pfn_list = NULL; 21866358f641SStuart Maybee contig_pfn_max = 0; 21876358f641SStuart Maybee ret = 0; 21886358f641SStuart Maybee goto out; 21896358f641SStuart Maybee } 2190843e1988Sjohnlev qsort(contig_pfn_list, contig_pfn_cnt, sizeof (pfn_t), mfn_compare); 2191843e1988Sjohnlev compact_contig_pfn_list(); 2192843e1988Sjohnlev /* 2193843e1988Sjohnlev * Make sure next search of the newly created contiguous pfn 2194843e1988Sjohnlev * list starts at the beginning of the list. 2195843e1988Sjohnlev */ 2196843e1988Sjohnlev next_alloc_pfn = 0; 2197843e1988Sjohnlev contig_pfnlist_builds++; /* count list builds */ 2198b9bc7f78Ssmaybe out: 2199b9bc7f78Ssmaybe mutex_exit(&contig_list_lock); 2200b9bc7f78Ssmaybe return (ret); 2201843e1988Sjohnlev } 2202843e1988Sjohnlev 2203843e1988Sjohnlev 2204843e1988Sjohnlev /* 2205843e1988Sjohnlev * Toss the current contig pfnlist. Someone is about to do a massive 2206843e1988Sjohnlev * update to pfn<->mfn mappings. So we have them destroy the list and lock 2207843e1988Sjohnlev * it till they are done with their update. 2208843e1988Sjohnlev */ 2209843e1988Sjohnlev void 2210843e1988Sjohnlev clear_and_lock_contig_pfnlist() 2211843e1988Sjohnlev { 2212843e1988Sjohnlev pfn_t *listp = NULL; 2213843e1988Sjohnlev size_t listsize; 2214843e1988Sjohnlev 2215b9bc7f78Ssmaybe mutex_enter(&contig_list_lock); 2216843e1988Sjohnlev if (contig_pfn_list != NULL) { 2217843e1988Sjohnlev listp = contig_pfn_list; 2218843e1988Sjohnlev listsize = contig_pfn_max * sizeof (pfn_t); 2219843e1988Sjohnlev contig_pfn_list = NULL; 2220843e1988Sjohnlev contig_pfn_max = contig_pfn_cnt = 0; 2221843e1988Sjohnlev } 2222843e1988Sjohnlev if (listp != NULL) 2223843e1988Sjohnlev kmem_free(listp, listsize); 2224843e1988Sjohnlev } 2225843e1988Sjohnlev 2226843e1988Sjohnlev /* 2227843e1988Sjohnlev * Unlock the contig_pfn_list. The next attempted use of it will cause 2228843e1988Sjohnlev * it to be re-created. 
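 * (find_contig_free() rebuilds it lazily via create_contig_pfnlist()
 * on its next invocation.)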
2229843e1988Sjohnlev */ 2230843e1988Sjohnlev void 2231843e1988Sjohnlev unlock_contig_pfnlist() 2232843e1988Sjohnlev { 2233b9bc7f78Ssmaybe mutex_exit(&contig_list_lock); 2234843e1988Sjohnlev } 2235843e1988Sjohnlev 2236843e1988Sjohnlev /* 2237843e1988Sjohnlev * Update the contiguous pfn list in response to a pfn <-> mfn reassignment 2238843e1988Sjohnlev */ 2239843e1988Sjohnlev void 2240843e1988Sjohnlev update_contig_pfnlist(pfn_t pfn, mfn_t oldmfn, mfn_t newmfn) 2241843e1988Sjohnlev { 2242843e1988Sjohnlev int probe_hi, probe_lo, probe_pos, insert_after, insert_point; 2243843e1988Sjohnlev pfn_t probe_pfn; 2244843e1988Sjohnlev mfn_t probe_mfn; 2245b9bc7f78Ssmaybe int drop_lock = 0; 2246843e1988Sjohnlev 2247b9bc7f78Ssmaybe if (mutex_owner(&contig_list_lock) != curthread) { 2248b9bc7f78Ssmaybe drop_lock = 1; 2249b9bc7f78Ssmaybe mutex_enter(&contig_list_lock); 2250b9bc7f78Ssmaybe } 2251843e1988Sjohnlev if (contig_pfn_list == NULL) 2252b9bc7f78Ssmaybe goto done; 2253843e1988Sjohnlev contig_pfnlist_updates++; 2254843e1988Sjohnlev /* 2255843e1988Sjohnlev * Find the pfn in the current list. Use a binary chop to locate it. 2256843e1988Sjohnlev */ 2257843e1988Sjohnlev probe_hi = contig_pfn_cnt - 1; 2258843e1988Sjohnlev probe_lo = 0; 2259843e1988Sjohnlev probe_pos = (probe_hi + probe_lo) / 2; 2260843e1988Sjohnlev while ((probe_pfn = contig_pfn_list[probe_pos]) != pfn) { 2261843e1988Sjohnlev if (probe_pos == probe_lo) { /* pfn not in list */ 2262843e1988Sjohnlev probe_pos = -1; 2263843e1988Sjohnlev break; 2264843e1988Sjohnlev } 2265843e1988Sjohnlev if (pfn_to_mfn(probe_pfn) <= oldmfn) 2266843e1988Sjohnlev probe_lo = probe_pos; 2267843e1988Sjohnlev else 2268843e1988Sjohnlev probe_hi = probe_pos; 2269843e1988Sjohnlev probe_pos = (probe_hi + probe_lo) / 2; 2270843e1988Sjohnlev } 22716358f641SStuart Maybee if (probe_pos >= 0) { 22726358f641SStuart Maybee /* 22736358f641SStuart Maybee * Remove pfn from list and ensure next alloc 22746358f641SStuart Maybee * position stays in bounds. 
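		 * next_alloc_pfn is an index into contig_pfn_list, so it
		 * must be reset once the list shrinks to or below that
		 * index.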
22756358f641SStuart Maybee */ 22766358f641SStuart Maybee if (--contig_pfn_cnt <= next_alloc_pfn) 22776358f641SStuart Maybee next_alloc_pfn = 0; 2278349b53ddSStuart Maybee if (contig_pfn_cnt < 2) { /* no contig pfns */ 2279349b53ddSStuart Maybee contig_pfn_cnt = 0; 2280349b53ddSStuart Maybee kmem_free(contig_pfn_list, 2281349b53ddSStuart Maybee contig_pfn_max * sizeof (pfn_t)); 2282349b53ddSStuart Maybee contig_pfn_list = NULL; 2283349b53ddSStuart Maybee contig_pfn_max = 0; 2284349b53ddSStuart Maybee goto done; 2285349b53ddSStuart Maybee } 2286843e1988Sjohnlev ovbcopy(&contig_pfn_list[probe_pos + 1], 2287843e1988Sjohnlev &contig_pfn_list[probe_pos], 2288843e1988Sjohnlev (contig_pfn_cnt - probe_pos) * sizeof (pfn_t)); 2289843e1988Sjohnlev } 2290843e1988Sjohnlev if (newmfn == MFN_INVALID) 2291843e1988Sjohnlev goto done; 2292843e1988Sjohnlev /* 2293843e1988Sjohnlev * Check if new mfn has adjacent mfns in the list 2294843e1988Sjohnlev */ 2295843e1988Sjohnlev probe_hi = contig_pfn_cnt - 1; 2296843e1988Sjohnlev probe_lo = 0; 2297843e1988Sjohnlev insert_after = -2; 2298843e1988Sjohnlev do { 2299843e1988Sjohnlev probe_pos = (probe_hi + probe_lo) / 2; 2300843e1988Sjohnlev probe_mfn = pfn_to_mfn(contig_pfn_list[probe_pos]); 2301843e1988Sjohnlev if (newmfn == probe_mfn + 1) 2302843e1988Sjohnlev insert_after = probe_pos; 2303843e1988Sjohnlev else if (newmfn == probe_mfn - 1) 2304843e1988Sjohnlev insert_after = probe_pos - 1; 2305843e1988Sjohnlev if (probe_pos == probe_lo) 2306843e1988Sjohnlev break; 2307843e1988Sjohnlev if (probe_mfn <= newmfn) 2308843e1988Sjohnlev probe_lo = probe_pos; 2309843e1988Sjohnlev else 2310843e1988Sjohnlev probe_hi = probe_pos; 2311843e1988Sjohnlev } while (insert_after == -2); 2312843e1988Sjohnlev /* 2313843e1988Sjohnlev * If there is space in the list and there are adjacent mfns 2314843e1988Sjohnlev * insert the pfn in to its proper place in the list. 2315843e1988Sjohnlev */ 2316843e1988Sjohnlev if (insert_after != -2 && contig_pfn_cnt + 1 <= contig_pfn_max) { 2317843e1988Sjohnlev insert_point = insert_after + 1; 2318843e1988Sjohnlev ovbcopy(&contig_pfn_list[insert_point], 2319843e1988Sjohnlev &contig_pfn_list[insert_point + 1], 2320843e1988Sjohnlev (contig_pfn_cnt - insert_point) * sizeof (pfn_t)); 2321843e1988Sjohnlev contig_pfn_list[insert_point] = pfn; 2322843e1988Sjohnlev contig_pfn_cnt++; 2323843e1988Sjohnlev } 2324843e1988Sjohnlev done: 2325b9bc7f78Ssmaybe if (drop_lock) 2326b9bc7f78Ssmaybe mutex_exit(&contig_list_lock); 2327843e1988Sjohnlev } 2328843e1988Sjohnlev 2329843e1988Sjohnlev /* 2330843e1988Sjohnlev * Called to (re-)populate the io_pool from the free page lists. 2331843e1988Sjohnlev */ 2332843e1988Sjohnlev long 2333843e1988Sjohnlev populate_io_pool(void) 2334843e1988Sjohnlev { 2335843e1988Sjohnlev pfn_t pfn; 2336843e1988Sjohnlev mfn_t mfn, max_mfn; 2337843e1988Sjohnlev page_t *pp; 2338843e1988Sjohnlev 2339843e1988Sjohnlev /* 2340843e1988Sjohnlev * Figure out the bounds of the pool on first invocation. 2341843e1988Sjohnlev * We use a percentage of memory for the io pool size. 2342843e1988Sjohnlev * we allow that to shrink, but not to less than a fixed minimum 2343843e1988Sjohnlev */ 2344843e1988Sjohnlev if (io_pool_cnt_max == 0) { 2345843e1988Sjohnlev io_pool_cnt_max = physmem / (100 / io_pool_physmem_pct); 2346843e1988Sjohnlev io_pool_cnt_lowater = io_pool_cnt_max; 2347843e1988Sjohnlev /* 2348843e1988Sjohnlev * This is the first time in populate_io_pool, grab a va to use 2349843e1988Sjohnlev * when we need to allocate pages. 
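		 * The VA is allocated once here, where VM_SLEEP is safe,
		 * and kept for the lifetime of the pool.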
2350843e1988Sjohnlev */ 2351843e1988Sjohnlev io_pool_kva = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP); 2352843e1988Sjohnlev } 2353843e1988Sjohnlev /* 2354843e1988Sjohnlev * If we are out of pages in the pool, then grow the size of the pool 2355843e1988Sjohnlev */ 2356cf902cd2Ssmaybe if (io_pool_cnt == 0) { 2357cf902cd2Ssmaybe /* 2358cf902cd2Ssmaybe * Grow the max size of the io pool by 5%, but never more than 2359cf902cd2Ssmaybe * 25% of physical memory. 2360cf902cd2Ssmaybe */ 2361cf902cd2Ssmaybe if (io_pool_cnt_max < physmem / 4) 2362cf902cd2Ssmaybe io_pool_cnt_max += io_pool_cnt_max / 20; 2363cf902cd2Ssmaybe } 2364843e1988Sjohnlev io_pool_grows++; /* should be a kstat? */ 2365843e1988Sjohnlev 2366843e1988Sjohnlev /* 2367843e1988Sjohnlev * Get highest mfn on this platform, but limit to the 32 bit DMA max. 2368843e1988Sjohnlev */ 2369843e1988Sjohnlev (void) mfn_to_pfn(start_mfn); 2370843e1988Sjohnlev max_mfn = MIN(cached_max_mfn, PFN_4GIG); 2371843e1988Sjohnlev for (mfn = start_mfn; mfn < max_mfn; start_mfn = ++mfn) { 2372843e1988Sjohnlev pfn = mfn_to_pfn(mfn); 2373843e1988Sjohnlev if (pfn & PFN_IS_FOREIGN_MFN) 2374843e1988Sjohnlev continue; 2375843e1988Sjohnlev /* 2376843e1988Sjohnlev * try to allocate it from free pages 2377843e1988Sjohnlev */ 2378843e1988Sjohnlev pp = page_numtopp_alloc(pfn); 2379843e1988Sjohnlev if (pp == NULL) 2380843e1988Sjohnlev continue; 2381843e1988Sjohnlev PP_CLRFREE(pp); 2382843e1988Sjohnlev add_page_to_pool(pp, 1); 2383843e1988Sjohnlev if (io_pool_cnt >= io_pool_cnt_max) 2384843e1988Sjohnlev break; 2385843e1988Sjohnlev } 2386843e1988Sjohnlev 2387843e1988Sjohnlev return (io_pool_cnt); 2388843e1988Sjohnlev } 2389843e1988Sjohnlev 2390843e1988Sjohnlev /* 2391843e1988Sjohnlev * Destroy a page that was being used for DMA I/O. It may or 2392843e1988Sjohnlev * may not actually go back to the io_pool. 2393843e1988Sjohnlev */ 2394843e1988Sjohnlev void 2395843e1988Sjohnlev page_destroy_io(page_t *pp) 2396843e1988Sjohnlev { 2397843e1988Sjohnlev mfn_t mfn = mfn_list[pp->p_pagenum]; 2398843e1988Sjohnlev 2399843e1988Sjohnlev /* 2400843e1988Sjohnlev * When the page was alloc'd a reservation was made, release it now 2401843e1988Sjohnlev */ 2402843e1988Sjohnlev page_unresv(1); 2403843e1988Sjohnlev /* 2404843e1988Sjohnlev * Unload translations, if any, then hash out the 2405843e1988Sjohnlev * page to erase its identity. 2406843e1988Sjohnlev */ 2407843e1988Sjohnlev (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 2408843e1988Sjohnlev page_hashout(pp, NULL); 2409843e1988Sjohnlev 2410843e1988Sjohnlev /* 2411843e1988Sjohnlev * If the page came from the free lists, just put it back to them. 2412843e1988Sjohnlev * DomU pages always go on the free lists as well. 
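	 * Only dom0 pages with MFNs below 4G are worth hoarding for DMA;
	 * everything else is more useful back on the regular free lists.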
2413843e1988Sjohnlev */ 2414843e1988Sjohnlev if (!DOMAIN_IS_INITDOMAIN(xen_info) || mfn >= PFN_4GIG) { 2415843e1988Sjohnlev page_free(pp, 1); 2416843e1988Sjohnlev return; 2417843e1988Sjohnlev } 2418843e1988Sjohnlev 2419843e1988Sjohnlev add_page_to_pool(pp, 0); 2420843e1988Sjohnlev } 2421843e1988Sjohnlev 2422843e1988Sjohnlev 2423843e1988Sjohnlev long contig_searches; /* count of times contig pages requested */ 2424843e1988Sjohnlev long contig_search_restarts; /* count of contig ranges tried */ 2425843e1988Sjohnlev long contig_search_failed; /* count of contig alloc failures */ 2426843e1988Sjohnlev 2427843e1988Sjohnlev /* 2428349b53ddSStuart Maybee * Free partial page list 2429349b53ddSStuart Maybee */ 2430349b53ddSStuart Maybee static void 2431349b53ddSStuart Maybee free_partial_list(page_t **pplist) 2432349b53ddSStuart Maybee { 2433349b53ddSStuart Maybee page_t *pp; 2434349b53ddSStuart Maybee 2435349b53ddSStuart Maybee while (*pplist != NULL) { 2436349b53ddSStuart Maybee pp = *pplist; 2437349b53ddSStuart Maybee page_io_pool_sub(pplist, pp, pp); 2438349b53ddSStuart Maybee page_free(pp, 1); 2439349b53ddSStuart Maybee } 2440349b53ddSStuart Maybee } 2441349b53ddSStuart Maybee 2442349b53ddSStuart Maybee /* 2443843e1988Sjohnlev * Look thru the contiguous pfns that are not part of the io_pool for 2444843e1988Sjohnlev * contiguous free pages. Return a list of the found pages or NULL. 2445843e1988Sjohnlev */ 2446843e1988Sjohnlev page_t * 2447349b53ddSStuart Maybee find_contig_free(uint_t npages, uint_t flags, uint64_t pfnseg, 2448349b53ddSStuart Maybee pgcnt_t pfnalign) 2449843e1988Sjohnlev { 2450843e1988Sjohnlev page_t *pp, *plist = NULL; 24512e8a1712Ssmaybe mfn_t mfn, prev_mfn, start_mfn; 2452843e1988Sjohnlev pfn_t pfn; 2453843e1988Sjohnlev int pages_needed, pages_requested; 2454843e1988Sjohnlev int search_start; 2455843e1988Sjohnlev 2456843e1988Sjohnlev /* 2457843e1988Sjohnlev * create the contig pfn list if not already done 2458843e1988Sjohnlev */ 2459b9bc7f78Ssmaybe retry: 2460b9bc7f78Ssmaybe mutex_enter(&contig_list_lock); 2461843e1988Sjohnlev if (contig_pfn_list == NULL) { 2462b9bc7f78Ssmaybe mutex_exit(&contig_list_lock); 2463b9bc7f78Ssmaybe if (!create_contig_pfnlist(flags)) { 2464843e1988Sjohnlev return (NULL); 2465843e1988Sjohnlev } 2466b9bc7f78Ssmaybe goto retry; 2467843e1988Sjohnlev } 2468843e1988Sjohnlev contig_searches++; 2469843e1988Sjohnlev /* 2470843e1988Sjohnlev * Search contiguous pfn list for physically contiguous pages not in 2471843e1988Sjohnlev * the io_pool. Start the search where the last search left off. 2472843e1988Sjohnlev */ 24736f235fc0Ssmaybe pages_requested = pages_needed = npages; 2474843e1988Sjohnlev search_start = next_alloc_pfn; 24752e8a1712Ssmaybe start_mfn = prev_mfn = 0; 2476843e1988Sjohnlev while (pages_needed) { 2477843e1988Sjohnlev pfn = contig_pfn_list[next_alloc_pfn]; 2478843e1988Sjohnlev mfn = pfn_to_mfn(pfn); 24792e8a1712Ssmaybe /* 24802e8a1712Ssmaybe * Check if mfn is first one or contig to previous one and 24812e8a1712Ssmaybe * if page corresponding to mfn is free and that mfn 24822e8a1712Ssmaybe * range is not crossing a segment boundary. 
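		 * All three conditions must hold for the page to be
		 * claimed; if any fails, the partial list is freed and the
		 * run restarts (see the else clause below).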
24832e8a1712Ssmaybe */ 2484843e1988Sjohnlev if ((prev_mfn == 0 || mfn == prev_mfn + 1) && 24852e8a1712Ssmaybe (pp = page_numtopp_alloc(pfn)) != NULL && 24862e8a1712Ssmaybe !((mfn & pfnseg) < (start_mfn & pfnseg))) { 2487843e1988Sjohnlev PP_CLRFREE(pp); 2488843e1988Sjohnlev page_io_pool_add(&plist, pp); 2489843e1988Sjohnlev pages_needed--; 2490349b53ddSStuart Maybee if (prev_mfn == 0) { 2491349b53ddSStuart Maybee if (pfnalign && 2492349b53ddSStuart Maybee mfn != P2ROUNDUP(mfn, pfnalign)) { 2493349b53ddSStuart Maybee /* 2494349b53ddSStuart Maybee * not properly aligned 2495349b53ddSStuart Maybee */ 2496349b53ddSStuart Maybee contig_search_restarts++; 2497349b53ddSStuart Maybee free_partial_list(&plist); 2498349b53ddSStuart Maybee pages_needed = pages_requested; 2499349b53ddSStuart Maybee start_mfn = prev_mfn = 0; 2500349b53ddSStuart Maybee goto skip; 2501349b53ddSStuart Maybee } 25022e8a1712Ssmaybe start_mfn = mfn; 2503349b53ddSStuart Maybee } 2504843e1988Sjohnlev prev_mfn = mfn; 2505843e1988Sjohnlev } else { 2506843e1988Sjohnlev contig_search_restarts++; 2507349b53ddSStuart Maybee free_partial_list(&plist); 2508843e1988Sjohnlev pages_needed = pages_requested; 25092e8a1712Ssmaybe start_mfn = prev_mfn = 0; 2510843e1988Sjohnlev } 2511349b53ddSStuart Maybee skip: 2512843e1988Sjohnlev if (++next_alloc_pfn == contig_pfn_cnt) 2513843e1988Sjohnlev next_alloc_pfn = 0; 2514843e1988Sjohnlev if (next_alloc_pfn == search_start) 2515843e1988Sjohnlev break; /* all pfns searched */ 2516843e1988Sjohnlev } 2517b9bc7f78Ssmaybe mutex_exit(&contig_list_lock); 2518843e1988Sjohnlev if (pages_needed) { 2519843e1988Sjohnlev contig_search_failed++; 2520843e1988Sjohnlev /* 2521843e1988Sjohnlev * Failed to find enough contig pages. 2522843e1988Sjohnlev * free partial page list 2523843e1988Sjohnlev */ 2524349b53ddSStuart Maybee free_partial_list(&plist); 2525843e1988Sjohnlev } 2526843e1988Sjohnlev return (plist); 2527843e1988Sjohnlev } 2528843e1988Sjohnlev 2529843e1988Sjohnlev /* 25306f235fc0Ssmaybe * Search the reserved io pool pages for a page range with the 25316f235fc0Ssmaybe * desired characteristics. 2532843e1988Sjohnlev */ 2533843e1988Sjohnlev page_t * 25346f235fc0Ssmaybe page_io_pool_alloc(ddi_dma_attr_t *mattr, int contig, pgcnt_t minctg) 2535843e1988Sjohnlev { 25366f235fc0Ssmaybe page_t *pp_first, *pp_last; 25376f235fc0Ssmaybe page_t *pp, **poolp; 25386f235fc0Ssmaybe pgcnt_t nwanted, pfnalign; 2539843e1988Sjohnlev uint64_t pfnseg; 25406f235fc0Ssmaybe mfn_t mfn, tmfn, hi_mfn, lo_mfn; 25416f235fc0Ssmaybe int align, attempt = 0; 2542843e1988Sjohnlev 25436f235fc0Ssmaybe if (minctg == 1) 25446f235fc0Ssmaybe contig = 0; 2545843e1988Sjohnlev lo_mfn = mmu_btop(mattr->dma_attr_addr_lo); 2546843e1988Sjohnlev hi_mfn = mmu_btop(mattr->dma_attr_addr_hi); 25476f235fc0Ssmaybe pfnseg = mmu_btop(mattr->dma_attr_seg); 2548843e1988Sjohnlev align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer); 2549843e1988Sjohnlev if (align > MMU_PAGESIZE) 2550843e1988Sjohnlev pfnalign = mmu_btop(align); 25516f235fc0Ssmaybe else 25526f235fc0Ssmaybe pfnalign = 0; 2553843e1988Sjohnlev 2554843e1988Sjohnlev try_again: 2555843e1988Sjohnlev /* 2556843e1988Sjohnlev * See if we want pages for a legacy device 2557843e1988Sjohnlev */ 2558843e1988Sjohnlev if (hi_mfn < PFN_16MEG) 2559843e1988Sjohnlev poolp = &io_pool_16m; 2560843e1988Sjohnlev else 2561843e1988Sjohnlev poolp = &io_pool_4g; 2562843e1988Sjohnlev try_smaller: 2563843e1988Sjohnlev /* 25646f235fc0Ssmaybe * Take pages from I/O pool. 
We'll use pages from the highest
25656f235fc0Ssmaybe  * MFN range possible.
2566843e1988Sjohnlev  */
2567843e1988Sjohnlev 	pp_first = pp_last = NULL;
2568843e1988Sjohnlev 	mutex_enter(&io_pool_lock);
25696f235fc0Ssmaybe 	nwanted = minctg;
25706f235fc0Ssmaybe 	for (pp = *poolp; pp && nwanted > 0; ) {
2571843e1988Sjohnlev 		pp = pp->p_prev;
2572843e1988Sjohnlev 
2573843e1988Sjohnlev 		/*
2574843e1988Sjohnlev 		 * skip pages above allowable range
2575843e1988Sjohnlev 		 */
2576843e1988Sjohnlev 		mfn = mfn_list[pp->p_pagenum];
2577843e1988Sjohnlev 		if (hi_mfn < mfn)
2578843e1988Sjohnlev 			goto skip;
2579843e1988Sjohnlev 
2580843e1988Sjohnlev 		/*
2581843e1988Sjohnlev 		 * stop at pages below allowable range
2582843e1988Sjohnlev 		 */
2583843e1988Sjohnlev 		if (lo_mfn > mfn)
2584843e1988Sjohnlev 			break;
2585843e1988Sjohnlev restart:
2586843e1988Sjohnlev 		if (pp_last == NULL) {
2587843e1988Sjohnlev 			/*
2588843e1988Sjohnlev 			 * Check alignment
2589843e1988Sjohnlev 			 */
25906f235fc0Ssmaybe 			tmfn = mfn - (minctg - 1);
25916f235fc0Ssmaybe 			if (pfnalign && tmfn != P2ROUNDUP(tmfn, pfnalign))
2592843e1988Sjohnlev 				goto skip; /* not properly aligned */
2593843e1988Sjohnlev 			/*
2594843e1988Sjohnlev 			 * Check segment
2595843e1988Sjohnlev 			 */
2596843e1988Sjohnlev 			if ((mfn & pfnseg) < (tmfn & pfnseg))
25976f235fc0Ssmaybe 				goto skip; /* crosses seg boundary */
2598843e1988Sjohnlev 			/*
2599843e1988Sjohnlev 			 * Start building page list
2600843e1988Sjohnlev 			 */
2601843e1988Sjohnlev 			pp_first = pp_last = pp;
26026f235fc0Ssmaybe 			nwanted--;
2603843e1988Sjohnlev 		} else {
2604843e1988Sjohnlev 			/*
2605843e1988Sjohnlev 			 * check physical contiguity if required
2606843e1988Sjohnlev 			 */
2607843e1988Sjohnlev 			if (contig &&
2608843e1988Sjohnlev 			    mfn_list[pp_first->p_pagenum] != mfn + 1) {
2609843e1988Sjohnlev 				/*
2610843e1988Sjohnlev 				 * not a contiguous page; restart the list.
2611843e1988Sjohnlev 				 */
2612843e1988Sjohnlev 				pp_last = NULL;
26136f235fc0Ssmaybe 				nwanted = minctg;
2614843e1988Sjohnlev 				goto restart;
2615843e1988Sjohnlev 			} else { /* add page to list */
2616843e1988Sjohnlev 				pp_first = pp;
26176f235fc0Ssmaybe 				nwanted--;
2618843e1988Sjohnlev 			}
2619843e1988Sjohnlev 		}
2620843e1988Sjohnlev skip:
2621843e1988Sjohnlev 		if (pp == *poolp)
2622843e1988Sjohnlev 			break;
2623843e1988Sjohnlev 	}
2624843e1988Sjohnlev 
2625843e1988Sjohnlev 	/*
2626843e1988Sjohnlev 	 * If we didn't find enough memory, try the more constrained
26276f235fc0Ssmaybe 	 * pool, then sweep free pages into the DMA pool and try again.
2628843e1988Sjohnlev 	 */
26296f235fc0Ssmaybe 	if (nwanted != 0) {
2630843e1988Sjohnlev 		mutex_exit(&io_pool_lock);
2631843e1988Sjohnlev 		/*
26326f235fc0Ssmaybe 		 * If we were looking in the less constrained pool and
26336f235fc0Ssmaybe 		 * didn't find pages, try the more constrained pool.
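		 * As a last resort, kmem_reap() and sweep more free
		 * pages into the pool with populate_io_pool(),
		 * retrying the whole allocation until the attempt
		 * limit is reached.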
2634843e1988Sjohnlev */ 2635843e1988Sjohnlev if (poolp == &io_pool_4g) { 2636843e1988Sjohnlev poolp = &io_pool_16m; 2637843e1988Sjohnlev goto try_smaller; 2638843e1988Sjohnlev } 2639843e1988Sjohnlev kmem_reap(); 2640843e1988Sjohnlev if (++attempt < 4) { 2641843e1988Sjohnlev /* 2642843e1988Sjohnlev * Grab some more io_pool pages 2643843e1988Sjohnlev */ 2644843e1988Sjohnlev (void) populate_io_pool(); 26456f235fc0Ssmaybe goto try_again; /* go around and retry */ 26466f235fc0Ssmaybe } 26476f235fc0Ssmaybe return (NULL); 26486f235fc0Ssmaybe } 26496f235fc0Ssmaybe /* 26506f235fc0Ssmaybe * Found the pages, now snip them from the list 26516f235fc0Ssmaybe */ 26526f235fc0Ssmaybe page_io_pool_sub(poolp, pp_first, pp_last); 26536f235fc0Ssmaybe io_pool_cnt -= minctg; 26546f235fc0Ssmaybe /* 26556f235fc0Ssmaybe * reset low water mark 26566f235fc0Ssmaybe */ 26576f235fc0Ssmaybe if (io_pool_cnt < io_pool_cnt_lowater) 26586f235fc0Ssmaybe io_pool_cnt_lowater = io_pool_cnt; 26596f235fc0Ssmaybe mutex_exit(&io_pool_lock); 26606f235fc0Ssmaybe return (pp_first); 2661843e1988Sjohnlev } 2662843e1988Sjohnlev 26636f235fc0Ssmaybe page_t * 26646f235fc0Ssmaybe page_swap_with_hypervisor(struct vnode *vp, u_offset_t off, caddr_t vaddr, 26656f235fc0Ssmaybe ddi_dma_attr_t *mattr, uint_t flags, pgcnt_t minctg) 26666f235fc0Ssmaybe { 26676f235fc0Ssmaybe uint_t kflags; 26686f235fc0Ssmaybe int order, extra, extpages, i, contig, nbits, extents; 26696f235fc0Ssmaybe page_t *pp, *expp, *pp_first, **pplist = NULL; 26706f235fc0Ssmaybe mfn_t *mfnlist = NULL; 26716f235fc0Ssmaybe 26726f235fc0Ssmaybe contig = flags & PG_PHYSCONTIG; 26736f235fc0Ssmaybe if (minctg == 1) 26746f235fc0Ssmaybe contig = 0; 26756f235fc0Ssmaybe flags &= ~PG_PHYSCONTIG; 2676843e1988Sjohnlev kflags = flags & PG_WAIT ? KM_SLEEP : KM_NOSLEEP; 2677843e1988Sjohnlev /* 26786f235fc0Ssmaybe * Hypervisor will allocate extents, if we want contig 26796f235fc0Ssmaybe * pages extent must be >= minctg 2680843e1988Sjohnlev */ 2681843e1988Sjohnlev if (contig) { 26826f235fc0Ssmaybe order = highbit(minctg) - 1; 26836f235fc0Ssmaybe if (minctg & ((1 << order) - 1)) 2684843e1988Sjohnlev order++; 2685843e1988Sjohnlev extpages = 1 << order; 2686843e1988Sjohnlev } else { 2687843e1988Sjohnlev order = 0; 26886f235fc0Ssmaybe extpages = minctg; 2689843e1988Sjohnlev } 26906f235fc0Ssmaybe if (extpages > minctg) { 26916f235fc0Ssmaybe extra = extpages - minctg; 2692843e1988Sjohnlev if (!page_resv(extra, kflags)) 2693843e1988Sjohnlev return (NULL); 2694843e1988Sjohnlev } 26956f235fc0Ssmaybe pp_first = NULL; 2696843e1988Sjohnlev pplist = kmem_alloc(extpages * sizeof (page_t *), kflags); 2697843e1988Sjohnlev if (pplist == NULL) 26986f235fc0Ssmaybe goto balloon_fail; 2699843e1988Sjohnlev mfnlist = kmem_alloc(extpages * sizeof (mfn_t), kflags); 2700843e1988Sjohnlev if (mfnlist == NULL) 27016f235fc0Ssmaybe goto balloon_fail; 27026f235fc0Ssmaybe pp = page_create_va(vp, off, minctg * PAGESIZE, flags, &kvseg, vaddr); 2703843e1988Sjohnlev if (pp == NULL) 27046f235fc0Ssmaybe goto balloon_fail; 2705843e1988Sjohnlev pp_first = pp; 27066f235fc0Ssmaybe if (extpages > minctg) { 2707843e1988Sjohnlev /* 27086f235fc0Ssmaybe * fill out the rest of extent pages to swap 27096f235fc0Ssmaybe * with the hypervisor 2710843e1988Sjohnlev */ 2711843e1988Sjohnlev for (i = 0; i < extra; i++) { 2712843e1988Sjohnlev expp = page_create_va(vp, 2713843e1988Sjohnlev (u_offset_t)(uintptr_t)io_pool_kva, 2714843e1988Sjohnlev PAGESIZE, flags, &kvseg, io_pool_kva); 2715843e1988Sjohnlev if (expp == NULL) 2716843e1988Sjohnlev goto 
balloon_fail; 2717843e1988Sjohnlev (void) hat_pageunload(expp, HAT_FORCE_PGUNLOAD); 2718843e1988Sjohnlev page_io_unlock(expp); 2719843e1988Sjohnlev page_hashout(expp, NULL); 2720843e1988Sjohnlev page_io_lock(expp); 2721843e1988Sjohnlev /* 2722843e1988Sjohnlev * add page to end of list 2723843e1988Sjohnlev */ 2724843e1988Sjohnlev expp->p_prev = pp_first->p_prev; 2725843e1988Sjohnlev expp->p_next = pp_first; 2726843e1988Sjohnlev expp->p_prev->p_next = expp; 2727843e1988Sjohnlev pp_first->p_prev = expp; 2728843e1988Sjohnlev } 2729843e1988Sjohnlev 2730843e1988Sjohnlev } 2731843e1988Sjohnlev for (i = 0; i < extpages; i++) { 2732843e1988Sjohnlev pplist[i] = pp; 2733843e1988Sjohnlev pp = pp->p_next; 2734843e1988Sjohnlev } 2735843e1988Sjohnlev nbits = highbit(mattr->dma_attr_addr_hi); 27366f235fc0Ssmaybe extents = contig ? 1 : minctg; 2737843e1988Sjohnlev if (balloon_replace_pages(extents, pplist, nbits, order, 2738b9bc7f78Ssmaybe mfnlist) != extents) { 2739b9bc7f78Ssmaybe if (ioalloc_dbg) 27406f235fc0Ssmaybe cmn_err(CE_NOTE, "request to hypervisor" 27416f235fc0Ssmaybe " for %d pages, maxaddr %" PRIx64 " failed", 2742b9bc7f78Ssmaybe extpages, mattr->dma_attr_addr_hi); 2743843e1988Sjohnlev goto balloon_fail; 2744b9bc7f78Ssmaybe } 2745843e1988Sjohnlev 2746843e1988Sjohnlev kmem_free(pplist, extpages * sizeof (page_t *)); 2747843e1988Sjohnlev kmem_free(mfnlist, extpages * sizeof (mfn_t)); 2748843e1988Sjohnlev /* 2749843e1988Sjohnlev * Return any excess pages to free list 2750843e1988Sjohnlev */ 27516f235fc0Ssmaybe if (extpages > minctg) { 2752843e1988Sjohnlev for (i = 0; i < extra; i++) { 2753843e1988Sjohnlev pp = pp_first->p_prev; 2754843e1988Sjohnlev page_sub(&pp_first, pp); 2755843e1988Sjohnlev page_io_unlock(pp); 2756843e1988Sjohnlev page_unresv(1); 2757843e1988Sjohnlev page_free(pp, 1); 2758843e1988Sjohnlev } 2759843e1988Sjohnlev } 2760843e1988Sjohnlev return (pp_first); 2761843e1988Sjohnlev balloon_fail: 2762843e1988Sjohnlev /* 2763843e1988Sjohnlev * Return pages to free list and return failure 2764843e1988Sjohnlev */ 2765843e1988Sjohnlev while (pp_first != NULL) { 2766843e1988Sjohnlev pp = pp_first; 2767843e1988Sjohnlev page_sub(&pp_first, pp); 2768843e1988Sjohnlev page_io_unlock(pp); 2769843e1988Sjohnlev if (pp->p_vnode != NULL) 2770843e1988Sjohnlev page_hashout(pp, NULL); 2771843e1988Sjohnlev page_free(pp, 1); 2772843e1988Sjohnlev } 2773843e1988Sjohnlev if (pplist) 2774843e1988Sjohnlev kmem_free(pplist, extpages * sizeof (page_t *)); 2775843e1988Sjohnlev if (mfnlist) 2776843e1988Sjohnlev kmem_free(mfnlist, extpages * sizeof (mfn_t)); 27776f235fc0Ssmaybe page_unresv(extpages - minctg); 27786f235fc0Ssmaybe return (NULL); 27796f235fc0Ssmaybe } 27806f235fc0Ssmaybe 27816f235fc0Ssmaybe static void 27826f235fc0Ssmaybe return_partial_alloc(page_t *plist) 27836f235fc0Ssmaybe { 27846f235fc0Ssmaybe page_t *pp; 27856f235fc0Ssmaybe 27866f235fc0Ssmaybe while (plist != NULL) { 27876f235fc0Ssmaybe pp = plist; 27886f235fc0Ssmaybe page_sub(&plist, pp); 2789d21b39ddSmrj page_io_unlock(pp); 27906f235fc0Ssmaybe page_destroy_io(pp); 27916f235fc0Ssmaybe } 27926f235fc0Ssmaybe } 27936f235fc0Ssmaybe 27946f235fc0Ssmaybe static page_t * 27956f235fc0Ssmaybe page_get_contigpages( 27966f235fc0Ssmaybe struct vnode *vp, 27976f235fc0Ssmaybe u_offset_t off, 27986f235fc0Ssmaybe int *npagesp, 27996f235fc0Ssmaybe uint_t flags, 28006f235fc0Ssmaybe caddr_t vaddr, 28016f235fc0Ssmaybe ddi_dma_attr_t *mattr) 28026f235fc0Ssmaybe { 28036f235fc0Ssmaybe mfn_t max_mfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL); 
28046f235fc0Ssmaybe page_t *plist; /* list to return */ 28056f235fc0Ssmaybe page_t *pp, *mcpl; 28066f235fc0Ssmaybe int contig, anyaddr, npages, getone = 0; 28076f235fc0Ssmaybe mfn_t lo_mfn; 28086f235fc0Ssmaybe mfn_t hi_mfn; 28096f235fc0Ssmaybe pgcnt_t pfnalign = 0; 28106f235fc0Ssmaybe int align, sgllen; 28116f235fc0Ssmaybe uint64_t pfnseg; 28126f235fc0Ssmaybe pgcnt_t minctg; 28136f235fc0Ssmaybe 28146f235fc0Ssmaybe npages = *npagesp; 28156f235fc0Ssmaybe ASSERT(mattr != NULL); 28166f235fc0Ssmaybe lo_mfn = mmu_btop(mattr->dma_attr_addr_lo); 28176f235fc0Ssmaybe hi_mfn = mmu_btop(mattr->dma_attr_addr_hi); 28186f235fc0Ssmaybe sgllen = mattr->dma_attr_sgllen; 28196f235fc0Ssmaybe pfnseg = mmu_btop(mattr->dma_attr_seg); 28206f235fc0Ssmaybe align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer); 28216f235fc0Ssmaybe if (align > MMU_PAGESIZE) 28226f235fc0Ssmaybe pfnalign = mmu_btop(align); 28236f235fc0Ssmaybe 2824349b53ddSStuart Maybee contig = flags & PG_PHYSCONTIG; 2825349b53ddSStuart Maybee if (npages == -1) { 2826349b53ddSStuart Maybee npages = 1; 2827349b53ddSStuart Maybee pfnalign = 0; 2828349b53ddSStuart Maybee } 28296f235fc0Ssmaybe /* 28306f235fc0Ssmaybe * Clear the contig flag if only one page is needed. 28316f235fc0Ssmaybe */ 28326f235fc0Ssmaybe if (npages == 1) { 28336f235fc0Ssmaybe getone = 1; 28346f235fc0Ssmaybe contig = 0; 28356f235fc0Ssmaybe } 28366f235fc0Ssmaybe 28376f235fc0Ssmaybe /* 28386f235fc0Ssmaybe * Check if any page in the system is fine. 28396f235fc0Ssmaybe */ 2840349b53ddSStuart Maybee anyaddr = lo_mfn == 0 && hi_mfn >= max_mfn; 2841349b53ddSStuart Maybee if (!contig && anyaddr && !pfnalign) { 28426f235fc0Ssmaybe flags &= ~PG_PHYSCONTIG; 28436f235fc0Ssmaybe plist = page_create_va(vp, off, npages * MMU_PAGESIZE, 28446f235fc0Ssmaybe flags, &kvseg, vaddr); 28456f235fc0Ssmaybe if (plist != NULL) { 28466f235fc0Ssmaybe *npagesp = 0; 28476f235fc0Ssmaybe return (plist); 28486f235fc0Ssmaybe } 28496f235fc0Ssmaybe } 28506f235fc0Ssmaybe plist = NULL; 28516f235fc0Ssmaybe minctg = howmany(npages, sgllen); 28526f235fc0Ssmaybe while (npages > sgllen || getone) { 2853804cf79fSsmaybe if (minctg > npages) 2854804cf79fSsmaybe minctg = npages; 2855804cf79fSsmaybe mcpl = NULL; 28566f235fc0Ssmaybe /* 2857349b53ddSStuart Maybee * We could want contig pages with no address range limits. 28586f235fc0Ssmaybe */ 28592e8a1712Ssmaybe if (anyaddr && contig) { 28606f235fc0Ssmaybe /* 28616f235fc0Ssmaybe * Look for free contig pages to satisfy the request. 
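			 * find_contig_free() walks the contig_pfn_list
			 * (ordered by MFN) looking for a
			 * machine-contiguous run of free pages that
			 * also honors the pfnseg and pfnalign
			 * constraints.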
28626f235fc0Ssmaybe */ 2863349b53ddSStuart Maybee mcpl = find_contig_free(minctg, flags, pfnseg, 2864349b53ddSStuart Maybee pfnalign); 28656f235fc0Ssmaybe } 28666f235fc0Ssmaybe /* 28676f235fc0Ssmaybe * Try the reserved io pools next 28686f235fc0Ssmaybe */ 28696f235fc0Ssmaybe if (mcpl == NULL) 28706f235fc0Ssmaybe mcpl = page_io_pool_alloc(mattr, contig, minctg); 28716f235fc0Ssmaybe if (mcpl != NULL) { 28726f235fc0Ssmaybe pp = mcpl; 28736f235fc0Ssmaybe do { 28746f235fc0Ssmaybe if (!page_hashin(pp, vp, off, NULL)) { 28756f235fc0Ssmaybe panic("page_get_contigpages:" 28766f235fc0Ssmaybe " hashin failed" 28776f235fc0Ssmaybe " pp %p, vp %p, off %llx", 28786f235fc0Ssmaybe (void *)pp, (void *)vp, off); 28796f235fc0Ssmaybe } 28806f235fc0Ssmaybe off += MMU_PAGESIZE; 28816f235fc0Ssmaybe PP_CLRFREE(pp); 28826f235fc0Ssmaybe PP_CLRAGED(pp); 28836f235fc0Ssmaybe page_set_props(pp, P_REF); 28846f235fc0Ssmaybe page_io_lock(pp); 28856f235fc0Ssmaybe pp = pp->p_next; 28866f235fc0Ssmaybe } while (pp != mcpl); 28876f235fc0Ssmaybe } else { 28886f235fc0Ssmaybe /* 28896f235fc0Ssmaybe * Hypervisor exchange doesn't handle segment or 28906f235fc0Ssmaybe * alignment constraints 28916f235fc0Ssmaybe */ 28926f235fc0Ssmaybe if (mattr->dma_attr_seg < mattr->dma_attr_addr_hi || 28936f235fc0Ssmaybe pfnalign) 28946f235fc0Ssmaybe goto fail; 28956f235fc0Ssmaybe /* 28966f235fc0Ssmaybe * Try exchanging pages with the hypervisor 28976f235fc0Ssmaybe */ 28986f235fc0Ssmaybe mcpl = page_swap_with_hypervisor(vp, off, vaddr, mattr, 28996f235fc0Ssmaybe flags, minctg); 29006f235fc0Ssmaybe if (mcpl == NULL) 29016f235fc0Ssmaybe goto fail; 29026f235fc0Ssmaybe off += minctg * MMU_PAGESIZE; 29036f235fc0Ssmaybe } 29046f235fc0Ssmaybe check_dma(mattr, mcpl, minctg); 29056f235fc0Ssmaybe /* 29066f235fc0Ssmaybe * Here with a minctg run of contiguous pages, add them to the 29076f235fc0Ssmaybe * list we will return for this request. 29086f235fc0Ssmaybe */ 29096f235fc0Ssmaybe page_list_concat(&plist, &mcpl); 29106f235fc0Ssmaybe npages -= minctg; 29116f235fc0Ssmaybe *npagesp = npages; 29126f235fc0Ssmaybe sgllen--; 2913804cf79fSsmaybe if (getone) 2914804cf79fSsmaybe break; 29156f235fc0Ssmaybe } 29166f235fc0Ssmaybe return (plist); 29176f235fc0Ssmaybe fail: 29186f235fc0Ssmaybe return_partial_alloc(plist); 29196f235fc0Ssmaybe return (NULL); 29206f235fc0Ssmaybe } 29216f235fc0Ssmaybe 29226f235fc0Ssmaybe /* 29236f235fc0Ssmaybe * Allocator for domain 0 I/O pages. We match the required 29246f235fc0Ssmaybe * DMA attributes and contiguity constraints. 
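 * The overall strategy, in order: plain page_create_va() when any
 * page will do, then free contiguous ranges via find_contig_free(),
 * then the reserved io_pool, and finally a page exchange with the
 * hypervisor.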
29256f235fc0Ssmaybe  */
29266f235fc0Ssmaybe /*ARGSUSED*/
29276f235fc0Ssmaybe page_t *
29286f235fc0Ssmaybe page_create_io(
29296f235fc0Ssmaybe 	struct vnode	*vp,
29306f235fc0Ssmaybe 	u_offset_t	off,
29316f235fc0Ssmaybe 	uint_t		bytes,
29326f235fc0Ssmaybe 	uint_t		flags,
29336f235fc0Ssmaybe 	struct as	*as,
29346f235fc0Ssmaybe 	caddr_t		vaddr,
29356f235fc0Ssmaybe 	ddi_dma_attr_t	*mattr)
29366f235fc0Ssmaybe {
29376f235fc0Ssmaybe 	page_t	*plist = NULL, *pp;
29386f235fc0Ssmaybe 	int	npages = 0, contig, anyaddr, pages_req;
29396f235fc0Ssmaybe 	mfn_t	lo_mfn;
29406f235fc0Ssmaybe 	mfn_t	hi_mfn;
29416f235fc0Ssmaybe 	pgcnt_t	pfnalign = 0;
29426f235fc0Ssmaybe 	int	align;
29436f235fc0Ssmaybe 	int	is_domu = 0;
29446f235fc0Ssmaybe 	int	dummy, bytes_got;
29456f235fc0Ssmaybe 	mfn_t	max_mfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL);
29466f235fc0Ssmaybe 
29476f235fc0Ssmaybe 	ASSERT(mattr != NULL);
29486f235fc0Ssmaybe 	lo_mfn = mmu_btop(mattr->dma_attr_addr_lo);
29496f235fc0Ssmaybe 	hi_mfn = mmu_btop(mattr->dma_attr_addr_hi);
29506f235fc0Ssmaybe 	align = maxbit(mattr->dma_attr_align, mattr->dma_attr_minxfer);
29516f235fc0Ssmaybe 	if (align > MMU_PAGESIZE)
29526f235fc0Ssmaybe 		pfnalign = mmu_btop(align);
29536f235fc0Ssmaybe 
29546f235fc0Ssmaybe 	/*
29556f235fc0Ssmaybe 	 * Clear the contig flag if only one page is needed or the scatter
29566f235fc0Ssmaybe 	 * gather list length is >= npages.
29576f235fc0Ssmaybe 	 */
29586f235fc0Ssmaybe 	pages_req = npages = mmu_btopr(bytes);
29596f235fc0Ssmaybe 	contig = (flags & PG_PHYSCONTIG);
29606f235fc0Ssmaybe 	bytes = P2ROUNDUP(bytes, MMU_PAGESIZE);
29616f235fc0Ssmaybe 	if (bytes == MMU_PAGESIZE || mattr->dma_attr_sgllen >= npages)
29626f235fc0Ssmaybe 		contig = 0;
29636f235fc0Ssmaybe 
29646f235fc0Ssmaybe 	/*
29656f235fc0Ssmaybe 	 * Check if any old page in the system is fine.
29666f235fc0Ssmaybe 	 * DomU should always go down this path.
29676f235fc0Ssmaybe 	 */
29686f235fc0Ssmaybe 	is_domu = !DOMAIN_IS_INITDOMAIN(xen_info);
29696f235fc0Ssmaybe 	anyaddr = lo_mfn == 0 && hi_mfn >= max_mfn && !pfnalign;
29706f235fc0Ssmaybe 	if ((!contig && anyaddr) || is_domu) {
29716f235fc0Ssmaybe 		flags &= ~PG_PHYSCONTIG;
29726f235fc0Ssmaybe 		plist = page_create_va(vp, off, bytes, flags, &kvseg, vaddr);
29736f235fc0Ssmaybe 		if (plist != NULL)
29746f235fc0Ssmaybe 			return (plist);
29756f235fc0Ssmaybe 		else if (is_domu)
29766f235fc0Ssmaybe 			return (NULL); /* no memory available */
29776f235fc0Ssmaybe 	}
29786f235fc0Ssmaybe 	/*
29796f235fc0Ssmaybe 	 * DomU should never reach here
29806f235fc0Ssmaybe 	 */
29816f235fc0Ssmaybe 	if (contig) {
29826f235fc0Ssmaybe 		plist = page_get_contigpages(vp, off, &npages, flags, vaddr,
29836f235fc0Ssmaybe 		    mattr);
29846f235fc0Ssmaybe 		if (plist == NULL)
29856f235fc0Ssmaybe 			goto fail;
29866f235fc0Ssmaybe 		bytes_got = (pages_req - npages) << MMU_PAGESHIFT;
29876f235fc0Ssmaybe 		vaddr += bytes_got;
29886f235fc0Ssmaybe 		off += bytes_got;
29896f235fc0Ssmaybe 		/*
29906f235fc0Ssmaybe 		 * We now have all the contiguous pages we need, but
29916f235fc0Ssmaybe 		 * we may still need additional non-contiguous pages.
29926f235fc0Ssmaybe 		 */
29936f235fc0Ssmaybe 	}
29946f235fc0Ssmaybe 	/*
29956f235fc0Ssmaybe 	 * Now loop collecting the requested number of pages. These do
29966f235fc0Ssmaybe 	 * not have to be contiguous pages, but we will use the contig
29976f235fc0Ssmaybe 	 * page alloc code to get them, since it will honor any
29986f235fc0Ssmaybe 	 * other constraints the pages may have.
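	 * Each iteration passes npages == -1, which
	 * page_get_contigpages() treats as a single-page request with
	 * the alignment constraint waived.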
29996f235fc0Ssmaybe */ 30006f235fc0Ssmaybe while (npages--) { 3001349b53ddSStuart Maybee dummy = -1; 30026f235fc0Ssmaybe pp = page_get_contigpages(vp, off, &dummy, flags, vaddr, mattr); 30036f235fc0Ssmaybe if (pp == NULL) 30046f235fc0Ssmaybe goto fail; 30056f235fc0Ssmaybe page_add(&plist, pp); 30066f235fc0Ssmaybe vaddr += MMU_PAGESIZE; 30076f235fc0Ssmaybe off += MMU_PAGESIZE; 30086f235fc0Ssmaybe } 30096f235fc0Ssmaybe return (plist); 30106f235fc0Ssmaybe fail: 30116f235fc0Ssmaybe /* 30126f235fc0Ssmaybe * Failed to get enough pages, return ones we did get 30136f235fc0Ssmaybe */ 30146f235fc0Ssmaybe return_partial_alloc(plist); 3015843e1988Sjohnlev return (NULL); 3016843e1988Sjohnlev } 3017843e1988Sjohnlev 3018843e1988Sjohnlev /* 3019843e1988Sjohnlev * Lock and return the page with the highest mfn that we can find. last_mfn 3020843e1988Sjohnlev * holds the last one found, so the next search can start from there. We 3021843e1988Sjohnlev * also keep a counter so that we don't loop forever if the machine has no 3022843e1988Sjohnlev * free pages. 3023843e1988Sjohnlev * 3024843e1988Sjohnlev * This is called from the balloon thread to find pages to give away. new_high 3025843e1988Sjohnlev * is used when new mfn's have been added to the system - we will reset our 3026843e1988Sjohnlev * search if the new mfn's are higher than our current search position. 3027843e1988Sjohnlev */ 3028843e1988Sjohnlev page_t * 3029843e1988Sjohnlev page_get_high_mfn(mfn_t new_high) 3030843e1988Sjohnlev { 3031843e1988Sjohnlev static mfn_t last_mfn = 0; 3032843e1988Sjohnlev pfn_t pfn; 3033843e1988Sjohnlev page_t *pp; 3034843e1988Sjohnlev ulong_t loop_count = 0; 3035843e1988Sjohnlev 3036843e1988Sjohnlev if (new_high > last_mfn) 3037843e1988Sjohnlev last_mfn = new_high; 3038843e1988Sjohnlev 3039843e1988Sjohnlev for (; loop_count < mfn_count; loop_count++, last_mfn--) { 3040843e1988Sjohnlev if (last_mfn == 0) { 3041843e1988Sjohnlev last_mfn = cached_max_mfn; 3042843e1988Sjohnlev } 3043843e1988Sjohnlev 3044843e1988Sjohnlev pfn = mfn_to_pfn(last_mfn); 3045843e1988Sjohnlev if (pfn & PFN_IS_FOREIGN_MFN) 3046843e1988Sjohnlev continue; 3047843e1988Sjohnlev 3048843e1988Sjohnlev /* See if the page is free. If so, lock it. 
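		 * page_numtopp_alloc() succeeds only for a free page
		 * that it could exclusively lock, which is what makes
		 * the PP_CLRFREE() below safe.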
*/ 3049843e1988Sjohnlev pp = page_numtopp_alloc(pfn); 3050843e1988Sjohnlev if (pp == NULL) 3051843e1988Sjohnlev continue; 3052843e1988Sjohnlev PP_CLRFREE(pp); 3053843e1988Sjohnlev 3054843e1988Sjohnlev ASSERT(PAGE_EXCL(pp)); 3055843e1988Sjohnlev ASSERT(pp->p_vnode == NULL); 3056843e1988Sjohnlev ASSERT(!hat_page_is_mapped(pp)); 3057843e1988Sjohnlev last_mfn--; 3058843e1988Sjohnlev return (pp); 3059843e1988Sjohnlev } 3060843e1988Sjohnlev return (NULL); 3061843e1988Sjohnlev } 3062843e1988Sjohnlev 3063843e1988Sjohnlev #else /* !__xpv */ 3064843e1988Sjohnlev 30657c478bd9Sstevel@tonic-gate /* 30667c478bd9Sstevel@tonic-gate * get a page from any list with the given mnode 30677c478bd9Sstevel@tonic-gate */ 3068843e1988Sjohnlev static page_t * 30697c478bd9Sstevel@tonic-gate page_get_mnode_anylist(ulong_t origbin, uchar_t szc, uint_t flags, 30707c478bd9Sstevel@tonic-gate int mnode, int mtype, ddi_dma_attr_t *dma_attr) 30717c478bd9Sstevel@tonic-gate { 30727c478bd9Sstevel@tonic-gate kmutex_t *pcm; 30737c478bd9Sstevel@tonic-gate int i; 30747c478bd9Sstevel@tonic-gate page_t *pp; 30757c478bd9Sstevel@tonic-gate page_t *first_pp; 30767c478bd9Sstevel@tonic-gate uint64_t pgaddr; 30777c478bd9Sstevel@tonic-gate ulong_t bin; 30787c478bd9Sstevel@tonic-gate int mtypestart; 30795d07b933Sdp78419 int plw_initialized; 30805d07b933Sdp78419 page_list_walker_t plw; 30817c478bd9Sstevel@tonic-gate 30827c478bd9Sstevel@tonic-gate VM_STAT_ADD(pga_vmstats.pgma_alloc); 30837c478bd9Sstevel@tonic-gate 30847c478bd9Sstevel@tonic-gate ASSERT((flags & PG_MATCH_COLOR) == 0); 30857c478bd9Sstevel@tonic-gate ASSERT(szc == 0); 30867c478bd9Sstevel@tonic-gate ASSERT(dma_attr != NULL); 30877c478bd9Sstevel@tonic-gate 30887c478bd9Sstevel@tonic-gate MTYPE_START(mnode, mtype, flags); 30897c478bd9Sstevel@tonic-gate if (mtype < 0) { 30907c478bd9Sstevel@tonic-gate VM_STAT_ADD(pga_vmstats.pgma_allocempty); 30917c478bd9Sstevel@tonic-gate return (NULL); 30927c478bd9Sstevel@tonic-gate } 30937c478bd9Sstevel@tonic-gate 30947c478bd9Sstevel@tonic-gate mtypestart = mtype; 30957c478bd9Sstevel@tonic-gate 30967c478bd9Sstevel@tonic-gate bin = origbin; 30977c478bd9Sstevel@tonic-gate 30987c478bd9Sstevel@tonic-gate /* 30997c478bd9Sstevel@tonic-gate * check up to page_colors + 1 bins - origbin may be checked twice 31007c478bd9Sstevel@tonic-gate * because of BIN_STEP skip 31017c478bd9Sstevel@tonic-gate */ 31027c478bd9Sstevel@tonic-gate do { 31035d07b933Sdp78419 plw_initialized = 0; 31045d07b933Sdp78419 31055d07b933Sdp78419 for (plw.plw_count = 0; 31065d07b933Sdp78419 plw.plw_count < page_colors; plw.plw_count++) { 31075d07b933Sdp78419 3108d94ffb28Sjmcp if (PAGE_FREELISTS(mnode, szc, bin, mtype) == NULL) 31097c478bd9Sstevel@tonic-gate goto nextfreebin; 31107c478bd9Sstevel@tonic-gate 3111d94ffb28Sjmcp pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 31127c478bd9Sstevel@tonic-gate mutex_enter(pcm); 3113d94ffb28Sjmcp pp = PAGE_FREELISTS(mnode, szc, bin, mtype); 31147c478bd9Sstevel@tonic-gate first_pp = pp; 31157c478bd9Sstevel@tonic-gate while (pp != NULL) { 3116*1f84c0d7SDave Plauger if (IS_DUMP_PAGE(pp) || page_trylock(pp, 3117*1f84c0d7SDave Plauger SE_EXCL) == 0) { 31187c478bd9Sstevel@tonic-gate pp = pp->p_next; 31197c478bd9Sstevel@tonic-gate if (pp == first_pp) { 31207c478bd9Sstevel@tonic-gate pp = NULL; 31217c478bd9Sstevel@tonic-gate } 31227c478bd9Sstevel@tonic-gate continue; 31237c478bd9Sstevel@tonic-gate } 31247c478bd9Sstevel@tonic-gate 31257c478bd9Sstevel@tonic-gate ASSERT(PP_ISFREE(pp)); 31267c478bd9Sstevel@tonic-gate ASSERT(PP_ISAGED(pp)); 
31277c478bd9Sstevel@tonic-gate ASSERT(pp->p_vnode == NULL); 31287c478bd9Sstevel@tonic-gate ASSERT(pp->p_hash == NULL); 31297c478bd9Sstevel@tonic-gate ASSERT(pp->p_offset == (u_offset_t)-1); 31307c478bd9Sstevel@tonic-gate ASSERT(pp->p_szc == szc); 31317c478bd9Sstevel@tonic-gate ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 31327c478bd9Sstevel@tonic-gate /* check if page within DMA attributes */ 3133ae115bc7Smrj pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum)); 31347c478bd9Sstevel@tonic-gate if ((pgaddr >= dma_attr->dma_attr_addr_lo) && 31357c478bd9Sstevel@tonic-gate (pgaddr + MMU_PAGESIZE - 1 <= 31367c478bd9Sstevel@tonic-gate dma_attr->dma_attr_addr_hi)) { 31377c478bd9Sstevel@tonic-gate break; 31387c478bd9Sstevel@tonic-gate } 31397c478bd9Sstevel@tonic-gate 31407c478bd9Sstevel@tonic-gate /* continue looking */ 31417c478bd9Sstevel@tonic-gate page_unlock(pp); 31427c478bd9Sstevel@tonic-gate pp = pp->p_next; 31437c478bd9Sstevel@tonic-gate if (pp == first_pp) 31447c478bd9Sstevel@tonic-gate pp = NULL; 31457c478bd9Sstevel@tonic-gate 31467c478bd9Sstevel@tonic-gate } 31477c478bd9Sstevel@tonic-gate if (pp != NULL) { 31487c478bd9Sstevel@tonic-gate ASSERT(mtype == PP_2_MTYPE(pp)); 31497c478bd9Sstevel@tonic-gate ASSERT(pp->p_szc == 0); 31507c478bd9Sstevel@tonic-gate 31517c478bd9Sstevel@tonic-gate /* found a page with specified DMA attributes */ 3152d94ffb28Sjmcp page_sub(&PAGE_FREELISTS(mnode, szc, bin, 3153d94ffb28Sjmcp mtype), pp); 3154affbd3ccSkchow page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST); 31557c478bd9Sstevel@tonic-gate 31567c478bd9Sstevel@tonic-gate if ((PP_ISFREE(pp) == 0) || 31577c478bd9Sstevel@tonic-gate (PP_ISAGED(pp) == 0)) { 31587c478bd9Sstevel@tonic-gate cmn_err(CE_PANIC, "page %p is not free", 31597c478bd9Sstevel@tonic-gate (void *)pp); 31607c478bd9Sstevel@tonic-gate } 31617c478bd9Sstevel@tonic-gate 31627c478bd9Sstevel@tonic-gate mutex_exit(pcm); 31637c478bd9Sstevel@tonic-gate check_dma(dma_attr, pp, 1); 31647c478bd9Sstevel@tonic-gate VM_STAT_ADD(pga_vmstats.pgma_allocok); 31657c478bd9Sstevel@tonic-gate return (pp); 31667c478bd9Sstevel@tonic-gate } 31677c478bd9Sstevel@tonic-gate mutex_exit(pcm); 31687c478bd9Sstevel@tonic-gate nextfreebin: 31695d07b933Sdp78419 if (plw_initialized == 0) { 31705d07b933Sdp78419 page_list_walk_init(szc, 0, bin, 1, 0, &plw); 31715d07b933Sdp78419 ASSERT(plw.plw_ceq_dif == page_colors); 31725d07b933Sdp78419 plw_initialized = 1; 31737c478bd9Sstevel@tonic-gate } 31745d07b933Sdp78419 31755d07b933Sdp78419 if (plw.plw_do_split) { 31765d07b933Sdp78419 pp = page_freelist_split(szc, bin, mnode, 31775d07b933Sdp78419 mtype, 317819397407SSherry Moore mmu_btop(dma_attr->dma_attr_addr_lo), 31795d07b933Sdp78419 mmu_btop(dma_attr->dma_attr_addr_hi + 1), 31805d07b933Sdp78419 &plw); 318119397407SSherry Moore if (pp != NULL) { 318219397407SSherry Moore check_dma(dma_attr, pp, 1); 31835d07b933Sdp78419 return (pp); 31845d07b933Sdp78419 } 318519397407SSherry Moore } 31865d07b933Sdp78419 31875d07b933Sdp78419 bin = page_list_walk_next_bin(szc, bin, &plw); 31885d07b933Sdp78419 } 31895d07b933Sdp78419 3190affbd3ccSkchow MTYPE_NEXT(mnode, mtype, flags); 3191affbd3ccSkchow } while (mtype >= 0); 31927c478bd9Sstevel@tonic-gate 31937c478bd9Sstevel@tonic-gate /* failed to find a page in the freelist; try it in the cachelist */ 31947c478bd9Sstevel@tonic-gate 31957c478bd9Sstevel@tonic-gate /* reset mtype start for cachelist search */ 31967c478bd9Sstevel@tonic-gate mtype = mtypestart; 31977c478bd9Sstevel@tonic-gate ASSERT(mtype >= 0); 31987c478bd9Sstevel@tonic-gate 31997c478bd9Sstevel@tonic-gate /* 
start with the bin of matching color */ 32007c478bd9Sstevel@tonic-gate bin = origbin; 32017c478bd9Sstevel@tonic-gate 32027c478bd9Sstevel@tonic-gate do { 32037c478bd9Sstevel@tonic-gate for (i = 0; i <= page_colors; i++) { 32047c478bd9Sstevel@tonic-gate if (PAGE_CACHELISTS(mnode, bin, mtype) == NULL) 32057c478bd9Sstevel@tonic-gate goto nextcachebin; 3206d94ffb28Sjmcp pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST); 32077c478bd9Sstevel@tonic-gate mutex_enter(pcm); 32087c478bd9Sstevel@tonic-gate pp = PAGE_CACHELISTS(mnode, bin, mtype); 32097c478bd9Sstevel@tonic-gate first_pp = pp; 32107c478bd9Sstevel@tonic-gate while (pp != NULL) { 3211*1f84c0d7SDave Plauger if (IS_DUMP_PAGE(pp) || page_trylock(pp, 3212*1f84c0d7SDave Plauger SE_EXCL) == 0) { 32137c478bd9Sstevel@tonic-gate pp = pp->p_next; 32147c478bd9Sstevel@tonic-gate if (pp == first_pp) 3215e172a44eSSherry Moore pp = NULL; 32167c478bd9Sstevel@tonic-gate continue; 32177c478bd9Sstevel@tonic-gate } 32187c478bd9Sstevel@tonic-gate ASSERT(pp->p_vnode); 32197c478bd9Sstevel@tonic-gate ASSERT(PP_ISAGED(pp) == 0); 32207c478bd9Sstevel@tonic-gate ASSERT(pp->p_szc == 0); 32217c478bd9Sstevel@tonic-gate ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 32227c478bd9Sstevel@tonic-gate 32237c478bd9Sstevel@tonic-gate /* check if page within DMA attributes */ 32247c478bd9Sstevel@tonic-gate 3225ae115bc7Smrj pgaddr = pa_to_ma(pfn_to_pa(pp->p_pagenum)); 32267c478bd9Sstevel@tonic-gate if ((pgaddr >= dma_attr->dma_attr_addr_lo) && 32277c478bd9Sstevel@tonic-gate (pgaddr + MMU_PAGESIZE - 1 <= 32287c478bd9Sstevel@tonic-gate dma_attr->dma_attr_addr_hi)) { 32297c478bd9Sstevel@tonic-gate break; 32307c478bd9Sstevel@tonic-gate } 32317c478bd9Sstevel@tonic-gate 32327c478bd9Sstevel@tonic-gate /* continue looking */ 32337c478bd9Sstevel@tonic-gate page_unlock(pp); 32347c478bd9Sstevel@tonic-gate pp = pp->p_next; 32357c478bd9Sstevel@tonic-gate if (pp == first_pp) 32367c478bd9Sstevel@tonic-gate pp = NULL; 32377c478bd9Sstevel@tonic-gate } 32387c478bd9Sstevel@tonic-gate 32397c478bd9Sstevel@tonic-gate if (pp != NULL) { 32407c478bd9Sstevel@tonic-gate ASSERT(mtype == PP_2_MTYPE(pp)); 32417c478bd9Sstevel@tonic-gate ASSERT(pp->p_szc == 0); 32427c478bd9Sstevel@tonic-gate 32437c478bd9Sstevel@tonic-gate /* found a page with specified DMA attributes */ 32447c478bd9Sstevel@tonic-gate page_sub(&PAGE_CACHELISTS(mnode, bin, 32457c478bd9Sstevel@tonic-gate mtype), pp); 3246affbd3ccSkchow page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST); 32477c478bd9Sstevel@tonic-gate 32487c478bd9Sstevel@tonic-gate mutex_exit(pcm); 32497c478bd9Sstevel@tonic-gate ASSERT(pp->p_vnode); 32507c478bd9Sstevel@tonic-gate ASSERT(PP_ISAGED(pp) == 0); 32517c478bd9Sstevel@tonic-gate check_dma(dma_attr, pp, 1); 32527c478bd9Sstevel@tonic-gate VM_STAT_ADD(pga_vmstats.pgma_allocok); 32537c478bd9Sstevel@tonic-gate return (pp); 32547c478bd9Sstevel@tonic-gate } 32557c478bd9Sstevel@tonic-gate mutex_exit(pcm); 32567c478bd9Sstevel@tonic-gate nextcachebin: 32577c478bd9Sstevel@tonic-gate bin += (i == 0) ? 
BIN_STEP : 1;
32587c478bd9Sstevel@tonic-gate 			bin &= page_colors_mask;
32597c478bd9Sstevel@tonic-gate 		}
3260affbd3ccSkchow 		MTYPE_NEXT(mnode, mtype, flags);
3261affbd3ccSkchow 	} while (mtype >= 0);
32627c478bd9Sstevel@tonic-gate 
32637c478bd9Sstevel@tonic-gate 	VM_STAT_ADD(pga_vmstats.pgma_allocfailed);
32647c478bd9Sstevel@tonic-gate 	return (NULL);
32657c478bd9Sstevel@tonic-gate }
32667c478bd9Sstevel@tonic-gate 
32677c478bd9Sstevel@tonic-gate /*
32687c478bd9Sstevel@tonic-gate  * This function is similar to page_get_freelist()/page_get_cachelist()
32697c478bd9Sstevel@tonic-gate  * but it searches both lists to find a page with the specified
32707c478bd9Sstevel@tonic-gate  * color (or no color) and DMA attributes. The search is done in the
32717c478bd9Sstevel@tonic-gate  * freelist first and then in the cache list within the highest memory
32727c478bd9Sstevel@tonic-gate  * range (based on DMA attributes) before searching in the lower
32737c478bd9Sstevel@tonic-gate  * memory ranges.
32747c478bd9Sstevel@tonic-gate  *
32757c478bd9Sstevel@tonic-gate  * Note: This function is called only by page_create_io().
32767c478bd9Sstevel@tonic-gate  */
32777c478bd9Sstevel@tonic-gate /*ARGSUSED*/
3278843e1988Sjohnlev static page_t *
32797c478bd9Sstevel@tonic-gate page_get_anylist(struct vnode *vp, u_offset_t off, struct as *as, caddr_t vaddr,
32807c478bd9Sstevel@tonic-gate     size_t size, uint_t flags, ddi_dma_attr_t *dma_attr, lgrp_t *lgrp)
32817c478bd9Sstevel@tonic-gate {
32827c478bd9Sstevel@tonic-gate 	uint_t bin;
32837c478bd9Sstevel@tonic-gate 	int mtype;
32847c478bd9Sstevel@tonic-gate 	page_t *pp;
32857c478bd9Sstevel@tonic-gate 	int n;
32867c478bd9Sstevel@tonic-gate 	int m;
32877c478bd9Sstevel@tonic-gate 	int szc;
32887c478bd9Sstevel@tonic-gate 	int fullrange;
32897c478bd9Sstevel@tonic-gate 	int mnode;
32907c478bd9Sstevel@tonic-gate 	int local_failed_stat = 0;
32917c478bd9Sstevel@tonic-gate 	lgrp_mnode_cookie_t lgrp_cookie;
32927c478bd9Sstevel@tonic-gate 
32937c478bd9Sstevel@tonic-gate 	VM_STAT_ADD(pga_vmstats.pga_alloc);
32947c478bd9Sstevel@tonic-gate 
32957c478bd9Sstevel@tonic-gate 	/* only base pagesize currently supported */
32967c478bd9Sstevel@tonic-gate 	if (size != MMU_PAGESIZE)
32977c478bd9Sstevel@tonic-gate 		return (NULL);
32987c478bd9Sstevel@tonic-gate 
32997c478bd9Sstevel@tonic-gate 	/*
33007c478bd9Sstevel@tonic-gate 	 * If we're passed a specific lgroup, we use it. Otherwise,
33017c478bd9Sstevel@tonic-gate 	 * assume first-touch placement is desired.
33027c478bd9Sstevel@tonic-gate 	 */
33037c478bd9Sstevel@tonic-gate 	if (!LGRP_EXISTS(lgrp))
33047c478bd9Sstevel@tonic-gate 		lgrp = lgrp_home_lgrp();
33057c478bd9Sstevel@tonic-gate 
33067c478bd9Sstevel@tonic-gate 	/* LINTED */
3307d94ffb28Sjmcp 	AS_2_BIN(as, seg, vp, vaddr, bin, 0);
33087c478bd9Sstevel@tonic-gate 
33097c478bd9Sstevel@tonic-gate 	/*
33107c478bd9Sstevel@tonic-gate 	 * Only hold one freelist or cachelist lock at a time; that way we
33117c478bd9Sstevel@tonic-gate 	 * can start anywhere and not have to worry about lock
33127c478bd9Sstevel@tonic-gate 	 * ordering.
33137c478bd9Sstevel@tonic-gate 	 */
33147c478bd9Sstevel@tonic-gate 	if (dma_attr == NULL) {
3315a3114836SGerry Liu 		n = mtype16m;
3316a3114836SGerry Liu 		m = mtypetop;
33177c478bd9Sstevel@tonic-gate 		fullrange = 1;
33187c478bd9Sstevel@tonic-gate 		VM_STAT_ADD(pga_vmstats.pga_nulldmaattr);
33197c478bd9Sstevel@tonic-gate 	} else {
33207c478bd9Sstevel@tonic-gate 		pfn_t pfnlo = mmu_btop(dma_attr->dma_attr_addr_lo);
33217c478bd9Sstevel@tonic-gate 		pfn_t pfnhi = mmu_btop(dma_attr->dma_attr_addr_hi);
33227c478bd9Sstevel@tonic-gate 
33237c478bd9Sstevel@tonic-gate 		/*
33247c478bd9Sstevel@tonic-gate 		 * We can only guarantee alignment to a page boundary.
33257c478bd9Sstevel@tonic-gate 		 */
33267c478bd9Sstevel@tonic-gate 		if (dma_attr->dma_attr_align > MMU_PAGESIZE)
33277c478bd9Sstevel@tonic-gate 			return (NULL);
33287c478bd9Sstevel@tonic-gate 
3329a3114836SGerry Liu 		/* Sanity check the dma_attr */
3330a3114836SGerry Liu 		if (pfnlo > pfnhi)
3331a3114836SGerry Liu 			return (NULL);
3332a3114836SGerry Liu 
33337c478bd9Sstevel@tonic-gate 		n = pfn_2_mtype(pfnlo);
33347c478bd9Sstevel@tonic-gate 		m = pfn_2_mtype(pfnhi);
33357c478bd9Sstevel@tonic-gate 
33367c478bd9Sstevel@tonic-gate 		fullrange = ((pfnlo == mnoderanges[n].mnr_pfnlo) &&
33377c478bd9Sstevel@tonic-gate 		    (pfnhi >= mnoderanges[m].mnr_pfnhi));
33387c478bd9Sstevel@tonic-gate 	}
33397c478bd9Sstevel@tonic-gate 	VM_STAT_COND_ADD(fullrange == 0, pga_vmstats.pga_notfullrange);
33407c478bd9Sstevel@tonic-gate 
33417c478bd9Sstevel@tonic-gate 	szc = 0;
33427c478bd9Sstevel@tonic-gate 
3343a3114836SGerry Liu 	/* cycling thru mtype handled by RANGE0 if n == mtype16m */
3344a3114836SGerry Liu 	if (n == mtype16m) {
33457c478bd9Sstevel@tonic-gate 		flags |= PGI_MT_RANGE0;
33467c478bd9Sstevel@tonic-gate 		n = m;
33477c478bd9Sstevel@tonic-gate 	}
33487c478bd9Sstevel@tonic-gate 
33497c478bd9Sstevel@tonic-gate 	/*
33507c478bd9Sstevel@tonic-gate 	 * Try local memory node first, but try remote if we can't
33517c478bd9Sstevel@tonic-gate 	 * get a page of the right color.
33527c478bd9Sstevel@tonic-gate 	 */
33537c478bd9Sstevel@tonic-gate 	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_HIER);
33547c478bd9Sstevel@tonic-gate 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
33557c478bd9Sstevel@tonic-gate 		/*
33567c478bd9Sstevel@tonic-gate 		 * allocate pages from high pfn to low.
33577c478bd9Sstevel@tonic-gate 		 */
3358a3114836SGerry Liu 		mtype = m;
3359a3114836SGerry Liu 		do {
33607c478bd9Sstevel@tonic-gate 			if (fullrange != 0) {
3361d94ffb28Sjmcp 				pp = page_get_mnode_freelist(mnode,
33627c478bd9Sstevel@tonic-gate 				    bin, mtype, szc, flags);
33637c478bd9Sstevel@tonic-gate 				if (pp == NULL) {
33647c478bd9Sstevel@tonic-gate 					pp = page_get_mnode_cachelist(
33657c478bd9Sstevel@tonic-gate 					    bin, flags, mnode, mtype);
33667c478bd9Sstevel@tonic-gate 				}
33677c478bd9Sstevel@tonic-gate 			} else {
33687c478bd9Sstevel@tonic-gate 				pp = page_get_mnode_anylist(bin, szc,
33697c478bd9Sstevel@tonic-gate 				    flags, mnode, mtype, dma_attr);
33707c478bd9Sstevel@tonic-gate 			}
33717c478bd9Sstevel@tonic-gate 			if (pp != NULL) {
33727c478bd9Sstevel@tonic-gate 				VM_STAT_ADD(pga_vmstats.pga_allocok);
33737c478bd9Sstevel@tonic-gate 				check_dma(dma_attr, pp, 1);
33747c478bd9Sstevel@tonic-gate 				return (pp);
33757c478bd9Sstevel@tonic-gate 			}
3376a3114836SGerry Liu 		} while (mtype != n &&
3377a3114836SGerry Liu 		    (mtype = mnoderanges[mtype].mnr_next) != -1);
33787c478bd9Sstevel@tonic-gate 		if (!local_failed_stat) {
33797c478bd9Sstevel@tonic-gate 			lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
33807c478bd9Sstevel@tonic-gate 			local_failed_stat = 1;
33817c478bd9Sstevel@tonic-gate 		}
33827c478bd9Sstevel@tonic-gate 	}
33837c478bd9Sstevel@tonic-gate 	VM_STAT_ADD(pga_vmstats.pga_allocfailed);
33847c478bd9Sstevel@tonic-gate 
33857c478bd9Sstevel@tonic-gate 	return (NULL);
33867c478bd9Sstevel@tonic-gate }
33877c478bd9Sstevel@tonic-gate 
33887c478bd9Sstevel@tonic-gate /*
33897c478bd9Sstevel@tonic-gate  * page_create_io()
33907c478bd9Sstevel@tonic-gate  *
33917c478bd9Sstevel@tonic-gate  * This function is a copy of page_create_va() with an additional
33927c478bd9Sstevel@tonic-gate  * argument 'mattr' that specifies DMA memory requirements to
33937c478bd9Sstevel@tonic-gate  * the page list functions. This function is used by the segkmem
33947c478bd9Sstevel@tonic-gate  * allocator, so it is used only to create new pages (i.e., PG_EXCL
33957c478bd9Sstevel@tonic-gate  * is set).
33967c478bd9Sstevel@tonic-gate  *
33977c478bd9Sstevel@tonic-gate  * Note: This interface is currently used only by the x86 PSM and is
33987c478bd9Sstevel@tonic-gate  * not fully specified, so the commitment level is that of a private
33997c478bd9Sstevel@tonic-gate  * interface specific to x86. It uses the PSM-specific
34007c478bd9Sstevel@tonic-gate  * page_get_anylist() interface.
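 *
 * A hypothetical caller asking for a single page below 4GB might look
 * like the sketch below; the attribute values are purely illustrative,
 * not taken from any real driver:
 *
 *	ddi_dma_attr_t attr = { 0 };
 *	attr.dma_attr_addr_lo = 0;
 *	attr.dma_attr_addr_hi = 0xffffffffULL;
 *	attr.dma_attr_align = MMU_PAGESIZE;
 *	attr.dma_attr_minxfer = 1;
 *	attr.dma_attr_seg = 0xffffffffULL;
 *	attr.dma_attr_sgllen = 1;
 *	pp = page_create_io(&kvp, off, MMU_PAGESIZE,
 *	    PG_EXCL | PG_WAIT, &kas, vaddr, &attr);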
34017c478bd9Sstevel@tonic-gate */ 34027c478bd9Sstevel@tonic-gate 34037c478bd9Sstevel@tonic-gate #define PAGE_HASH_SEARCH(index, pp, vp, off) { \ 34047c478bd9Sstevel@tonic-gate for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \ 34057c478bd9Sstevel@tonic-gate if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \ 34067c478bd9Sstevel@tonic-gate break; \ 34077c478bd9Sstevel@tonic-gate } \ 34087c478bd9Sstevel@tonic-gate } 34097c478bd9Sstevel@tonic-gate 34107c478bd9Sstevel@tonic-gate 34117c478bd9Sstevel@tonic-gate page_t * 34127c478bd9Sstevel@tonic-gate page_create_io( 34137c478bd9Sstevel@tonic-gate struct vnode *vp, 34147c478bd9Sstevel@tonic-gate u_offset_t off, 34157c478bd9Sstevel@tonic-gate uint_t bytes, 34167c478bd9Sstevel@tonic-gate uint_t flags, 34177c478bd9Sstevel@tonic-gate struct as *as, 34187c478bd9Sstevel@tonic-gate caddr_t vaddr, 34197c478bd9Sstevel@tonic-gate ddi_dma_attr_t *mattr) /* DMA memory attributes if any */ 34207c478bd9Sstevel@tonic-gate { 34217c478bd9Sstevel@tonic-gate page_t *plist = NULL; 34227c478bd9Sstevel@tonic-gate uint_t plist_len = 0; 34237c478bd9Sstevel@tonic-gate pgcnt_t npages; 34247c478bd9Sstevel@tonic-gate page_t *npp = NULL; 34257c478bd9Sstevel@tonic-gate uint_t pages_req; 34267c478bd9Sstevel@tonic-gate page_t *pp; 34277c478bd9Sstevel@tonic-gate kmutex_t *phm = NULL; 34287c478bd9Sstevel@tonic-gate uint_t index; 34297c478bd9Sstevel@tonic-gate 34307c478bd9Sstevel@tonic-gate TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START, 34317c478bd9Sstevel@tonic-gate "page_create_start:vp %p off %llx bytes %u flags %x", 34327c478bd9Sstevel@tonic-gate vp, off, bytes, flags); 34337c478bd9Sstevel@tonic-gate 34347c478bd9Sstevel@tonic-gate ASSERT((flags & ~(PG_EXCL | PG_WAIT | PG_PHYSCONTIG)) == 0); 34357c478bd9Sstevel@tonic-gate 34367c478bd9Sstevel@tonic-gate pages_req = npages = mmu_btopr(bytes); 34377c478bd9Sstevel@tonic-gate 34387c478bd9Sstevel@tonic-gate /* 34397c478bd9Sstevel@tonic-gate * Do the freemem and pcf accounting. 34407c478bd9Sstevel@tonic-gate */ 34417c478bd9Sstevel@tonic-gate if (!page_create_wait(npages, flags)) { 34427c478bd9Sstevel@tonic-gate return (NULL); 34437c478bd9Sstevel@tonic-gate } 34447c478bd9Sstevel@tonic-gate 34457c478bd9Sstevel@tonic-gate TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS, 3446843e1988Sjohnlev "page_create_success:vp %p off %llx", vp, off); 34477c478bd9Sstevel@tonic-gate 34487c478bd9Sstevel@tonic-gate /* 34497c478bd9Sstevel@tonic-gate * If satisfying this request has left us with too little 34507c478bd9Sstevel@tonic-gate * memory, start the wheels turning to get some back. The 34517c478bd9Sstevel@tonic-gate * first clause of the test prevents waking up the pageout 34527c478bd9Sstevel@tonic-gate * daemon in situations where it would decide that there's 34537c478bd9Sstevel@tonic-gate * nothing to do. 
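	 * (nscan < desscan means the scanner has not yet met its scan
	 * target for this cycle, so waking it will actually result in
	 * work; freemem < minfree means we are genuinely short of
	 * memory.)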
34547c478bd9Sstevel@tonic-gate */ 34557c478bd9Sstevel@tonic-gate if (nscan < desscan && freemem < minfree) { 34567c478bd9Sstevel@tonic-gate TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL, 34577c478bd9Sstevel@tonic-gate "pageout_cv_signal:freemem %ld", freemem); 34587c478bd9Sstevel@tonic-gate cv_signal(&proc_pageout->p_cv); 34597c478bd9Sstevel@tonic-gate } 34607c478bd9Sstevel@tonic-gate 34617c478bd9Sstevel@tonic-gate if (flags & PG_PHYSCONTIG) { 34627c478bd9Sstevel@tonic-gate 34637c478bd9Sstevel@tonic-gate plist = page_get_contigpage(&npages, mattr, 1); 34647c478bd9Sstevel@tonic-gate if (plist == NULL) { 34657c478bd9Sstevel@tonic-gate page_create_putback(npages); 34667c478bd9Sstevel@tonic-gate return (NULL); 34677c478bd9Sstevel@tonic-gate } 34687c478bd9Sstevel@tonic-gate 34697c478bd9Sstevel@tonic-gate pp = plist; 34707c478bd9Sstevel@tonic-gate 34717c478bd9Sstevel@tonic-gate do { 34727c478bd9Sstevel@tonic-gate if (!page_hashin(pp, vp, off, NULL)) { 34737c478bd9Sstevel@tonic-gate panic("pg_creat_io: hashin failed %p %p %llx", 34747c478bd9Sstevel@tonic-gate (void *)pp, (void *)vp, off); 34757c478bd9Sstevel@tonic-gate } 34767c478bd9Sstevel@tonic-gate VM_STAT_ADD(page_create_new); 34777c478bd9Sstevel@tonic-gate off += MMU_PAGESIZE; 34787c478bd9Sstevel@tonic-gate PP_CLRFREE(pp); 34797c478bd9Sstevel@tonic-gate PP_CLRAGED(pp); 34807c478bd9Sstevel@tonic-gate page_set_props(pp, P_REF); 34817c478bd9Sstevel@tonic-gate pp = pp->p_next; 34827c478bd9Sstevel@tonic-gate } while (pp != plist); 34837c478bd9Sstevel@tonic-gate 34847c478bd9Sstevel@tonic-gate if (!npages) { 34857c478bd9Sstevel@tonic-gate check_dma(mattr, plist, pages_req); 34867c478bd9Sstevel@tonic-gate return (plist); 34877c478bd9Sstevel@tonic-gate } else { 34887c478bd9Sstevel@tonic-gate vaddr += (pages_req - npages) << MMU_PAGESHIFT; 34897c478bd9Sstevel@tonic-gate } 34907c478bd9Sstevel@tonic-gate 34917c478bd9Sstevel@tonic-gate /* 34927c478bd9Sstevel@tonic-gate * fall-thru: 34937c478bd9Sstevel@tonic-gate * 34947c478bd9Sstevel@tonic-gate * page_get_contigpage returns when npages <= sgllen. 34957c478bd9Sstevel@tonic-gate * Grab the rest of the non-contig pages below from anylist. 34967c478bd9Sstevel@tonic-gate */ 34977c478bd9Sstevel@tonic-gate } 34987c478bd9Sstevel@tonic-gate 34997c478bd9Sstevel@tonic-gate /* 35007c478bd9Sstevel@tonic-gate * Loop around collecting the requested number of pages. 35017c478bd9Sstevel@tonic-gate * Most of the time, we have to `create' a new page. With 35027c478bd9Sstevel@tonic-gate * this in mind, pull the page off the free list before 35037c478bd9Sstevel@tonic-gate * getting the hash lock. This will minimize the hash 35047c478bd9Sstevel@tonic-gate * lock hold time, nesting, and the like. If it turns 35057c478bd9Sstevel@tonic-gate * out we don't need the page, we put it back at the end. 35067c478bd9Sstevel@tonic-gate */ 35077c478bd9Sstevel@tonic-gate while (npages--) { 35087c478bd9Sstevel@tonic-gate phm = NULL; 35097c478bd9Sstevel@tonic-gate 35107c478bd9Sstevel@tonic-gate index = PAGE_HASH_FUNC(vp, off); 35117c478bd9Sstevel@tonic-gate top: 35127c478bd9Sstevel@tonic-gate ASSERT(phm == NULL); 35137c478bd9Sstevel@tonic-gate ASSERT(index == PAGE_HASH_FUNC(vp, off)); 35147c478bd9Sstevel@tonic-gate ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 35157c478bd9Sstevel@tonic-gate 35167c478bd9Sstevel@tonic-gate if (npp == NULL) { 35177c478bd9Sstevel@tonic-gate /* 35187c478bd9Sstevel@tonic-gate * Try to get the page of any color either from 35197c478bd9Sstevel@tonic-gate * the freelist or from the cache list. 
35207c478bd9Sstevel@tonic-gate 			 */
35217c478bd9Sstevel@tonic-gate 			npp = page_get_anylist(vp, off, as, vaddr, MMU_PAGESIZE,
35227c478bd9Sstevel@tonic-gate 			    flags & ~PG_MATCH_COLOR, mattr, NULL);
35237c478bd9Sstevel@tonic-gate 			if (npp == NULL) {
35247c478bd9Sstevel@tonic-gate 				if (mattr == NULL) {
35257c478bd9Sstevel@tonic-gate 					/*
35267c478bd9Sstevel@tonic-gate 					 * Not looking for a special page;
35277c478bd9Sstevel@tonic-gate 					 * panic!
35287c478bd9Sstevel@tonic-gate 					 */
35297c478bd9Sstevel@tonic-gate 					panic("no page found %d", (int)npages);
35307c478bd9Sstevel@tonic-gate 				}
35317c478bd9Sstevel@tonic-gate 				/*
35327c478bd9Sstevel@tonic-gate 				 * No page found! This can happen
35337c478bd9Sstevel@tonic-gate 				 * if we are looking for a page
35347c478bd9Sstevel@tonic-gate 				 * within a specific memory range
35357c478bd9Sstevel@tonic-gate 				 * for DMA purposes. If PG_WAIT is
35367c478bd9Sstevel@tonic-gate 				 * specified then we wait for a
35377c478bd9Sstevel@tonic-gate 				 * while and then try again. The
35387c478bd9Sstevel@tonic-gate 				 * wait could be forever if we
35397c478bd9Sstevel@tonic-gate 				 * don't get the page(s) we need.
35407c478bd9Sstevel@tonic-gate 				 *
35417c478bd9Sstevel@tonic-gate 				 * Note: XXX We really need a mechanism
35427c478bd9Sstevel@tonic-gate 				 * to wait for pages in the desired
35437c478bd9Sstevel@tonic-gate 				 * range. For now, we wait for any
35447c478bd9Sstevel@tonic-gate 				 * pages and see if we can use them.
35457c478bd9Sstevel@tonic-gate 				 */
35467c478bd9Sstevel@tonic-gate 
35477c478bd9Sstevel@tonic-gate 				if ((mattr != NULL) && (flags & PG_WAIT)) {
35487c478bd9Sstevel@tonic-gate 					delay(10);
35497c478bd9Sstevel@tonic-gate 					goto top;
35507c478bd9Sstevel@tonic-gate 				}
35517c478bd9Sstevel@tonic-gate 				goto fail; /* undo accounting stuff */
35527c478bd9Sstevel@tonic-gate 			}
35537c478bd9Sstevel@tonic-gate 
35547c478bd9Sstevel@tonic-gate 			if (PP_ISAGED(npp) == 0) {
35557c478bd9Sstevel@tonic-gate 				/*
35567c478bd9Sstevel@tonic-gate 				 * Since this page came from the
35577c478bd9Sstevel@tonic-gate 				 * cachelist, we must destroy the
35587c478bd9Sstevel@tonic-gate 				 * old vnode association.
35597c478bd9Sstevel@tonic-gate 				 */
35607c478bd9Sstevel@tonic-gate 				page_hashout(npp, (kmutex_t *)NULL);
35617c478bd9Sstevel@tonic-gate 			}
35627c478bd9Sstevel@tonic-gate 		}
35637c478bd9Sstevel@tonic-gate 
35647c478bd9Sstevel@tonic-gate 		/*
35657c478bd9Sstevel@tonic-gate 		 * We own this page!
35667c478bd9Sstevel@tonic-gate 		 */
35677c478bd9Sstevel@tonic-gate 		ASSERT(PAGE_EXCL(npp));
35687c478bd9Sstevel@tonic-gate 		ASSERT(npp->p_vnode == NULL);
35697c478bd9Sstevel@tonic-gate 		ASSERT(!hat_page_is_mapped(npp));
35707c478bd9Sstevel@tonic-gate 		PP_CLRFREE(npp);
35717c478bd9Sstevel@tonic-gate 		PP_CLRAGED(npp);
35727c478bd9Sstevel@tonic-gate 
35737c478bd9Sstevel@tonic-gate 		/*
35747c478bd9Sstevel@tonic-gate 		 * Here we have a page in our hot little mitts and are
35757c478bd9Sstevel@tonic-gate 		 * just waiting to stuff it on the appropriate lists.
35767c478bd9Sstevel@tonic-gate 		 * Get the mutex and check to see if it really does
35777c478bd9Sstevel@tonic-gate 		 * not exist.
35787c478bd9Sstevel@tonic-gate 		 */
35797c478bd9Sstevel@tonic-gate 		phm = PAGE_HASH_MUTEX(index);
35807c478bd9Sstevel@tonic-gate 		mutex_enter(phm);
35817c478bd9Sstevel@tonic-gate 		PAGE_HASH_SEARCH(index, pp, vp, off);
35827c478bd9Sstevel@tonic-gate 		if (pp == NULL) {
35837c478bd9Sstevel@tonic-gate 			VM_STAT_ADD(page_create_new);
35847c478bd9Sstevel@tonic-gate 			pp = npp;
35857c478bd9Sstevel@tonic-gate 			npp = NULL;
35867c478bd9Sstevel@tonic-gate 			if (!page_hashin(pp, vp, off, phm)) {
35877c478bd9Sstevel@tonic-gate 				/*
35887c478bd9Sstevel@tonic-gate 				 * Since we hold the page hash mutex and
35897c478bd9Sstevel@tonic-gate 				 * just searched for this page, page_hashin
35907c478bd9Sstevel@tonic-gate 				 * had better not fail. If it does, that
35917c478bd9Sstevel@tonic-gate 				 * means some thread did not follow the
35927c478bd9Sstevel@tonic-gate 				 * page hash mutex rules. Panic now and
35937c478bd9Sstevel@tonic-gate 				 * get it over with. As usual, go down
35947c478bd9Sstevel@tonic-gate 				 * holding all the locks.
35957c478bd9Sstevel@tonic-gate 				 */
35967c478bd9Sstevel@tonic-gate 				ASSERT(MUTEX_HELD(phm));
35977c478bd9Sstevel@tonic-gate 				panic("page_create: hashin fail %p %p %llx %p",
35987c478bd9Sstevel@tonic-gate 				    (void *)pp, (void *)vp, off, (void *)phm);
35997c478bd9Sstevel@tonic-gate 
36007c478bd9Sstevel@tonic-gate 			}
36017c478bd9Sstevel@tonic-gate 			ASSERT(MUTEX_HELD(phm));
36027c478bd9Sstevel@tonic-gate 			mutex_exit(phm);
36037c478bd9Sstevel@tonic-gate 			phm = NULL;
36047c478bd9Sstevel@tonic-gate 
36057c478bd9Sstevel@tonic-gate 			/*
36067c478bd9Sstevel@tonic-gate 			 * Hat layer locking need not be done to set
36077c478bd9Sstevel@tonic-gate 			 * the following bits since the page is not hashed
36087c478bd9Sstevel@tonic-gate 			 * and was on the free list (i.e., had no mappings).
36097c478bd9Sstevel@tonic-gate 			 *
36107c478bd9Sstevel@tonic-gate 			 * Set the reference bit to protect
36117c478bd9Sstevel@tonic-gate 			 * against immediate pageout
36127c478bd9Sstevel@tonic-gate 			 *
36137c478bd9Sstevel@tonic-gate 			 * XXXmh modify freelist code to set reference
36147c478bd9Sstevel@tonic-gate 			 * bit so we don't have to do it here.
36157c478bd9Sstevel@tonic-gate 			 */
36167c478bd9Sstevel@tonic-gate 			page_set_props(pp, P_REF);
36177c478bd9Sstevel@tonic-gate 		} else {
36187c478bd9Sstevel@tonic-gate 			ASSERT(MUTEX_HELD(phm));
36197c478bd9Sstevel@tonic-gate 			mutex_exit(phm);
36207c478bd9Sstevel@tonic-gate 			phm = NULL;
36217c478bd9Sstevel@tonic-gate 			/*
36227c478bd9Sstevel@tonic-gate 			 * NOTE: This should not happen for pages associated
36237c478bd9Sstevel@tonic-gate 			 * with kernel vnode 'kvp'.
36247c478bd9Sstevel@tonic-gate 			 */
36257c478bd9Sstevel@tonic-gate 			/* XX64 - to debug why this happens! */
3626ad23a2dbSjohansen 			ASSERT(!VN_ISKAS(vp));
3627ad23a2dbSjohansen 			if (VN_ISKAS(vp))
36287c478bd9Sstevel@tonic-gate 				cmn_err(CE_NOTE,
36297c478bd9Sstevel@tonic-gate 				    "page_create: page not expected "
36307c478bd9Sstevel@tonic-gate 				    "in hash list for kernel vnode - pp 0x%p",
36317c478bd9Sstevel@tonic-gate 				    (void *)pp);
36327c478bd9Sstevel@tonic-gate 			VM_STAT_ADD(page_create_exists);
36337c478bd9Sstevel@tonic-gate 			goto fail;
36347c478bd9Sstevel@tonic-gate 		}
36357c478bd9Sstevel@tonic-gate 
36367c478bd9Sstevel@tonic-gate 		/*
36377c478bd9Sstevel@tonic-gate 		 * Got a page! It is locked. Acquire the i/o
36387c478bd9Sstevel@tonic-gate 		 * lock since we are going to use the p_next and
36397c478bd9Sstevel@tonic-gate 		 * p_prev fields to link the requested pages together.
36407c478bd9Sstevel@tonic-gate */ 36417c478bd9Sstevel@tonic-gate page_io_lock(pp); 36427c478bd9Sstevel@tonic-gate page_add(&plist, pp); 36437c478bd9Sstevel@tonic-gate plist = plist->p_next; 36447c478bd9Sstevel@tonic-gate off += MMU_PAGESIZE; 36457c478bd9Sstevel@tonic-gate vaddr += MMU_PAGESIZE; 36467c478bd9Sstevel@tonic-gate } 36477c478bd9Sstevel@tonic-gate 36487c478bd9Sstevel@tonic-gate check_dma(mattr, plist, pages_req); 36497c478bd9Sstevel@tonic-gate return (plist); 36507c478bd9Sstevel@tonic-gate 36517c478bd9Sstevel@tonic-gate fail: 36527c478bd9Sstevel@tonic-gate if (npp != NULL) { 36537c478bd9Sstevel@tonic-gate /* 36547c478bd9Sstevel@tonic-gate * Did not need this page after all. 36557c478bd9Sstevel@tonic-gate * Put it back on the free list. 36567c478bd9Sstevel@tonic-gate */ 36577c478bd9Sstevel@tonic-gate VM_STAT_ADD(page_create_putbacks); 36587c478bd9Sstevel@tonic-gate PP_SETFREE(npp); 36597c478bd9Sstevel@tonic-gate PP_SETAGED(npp); 36607c478bd9Sstevel@tonic-gate npp->p_offset = (u_offset_t)-1; 36617c478bd9Sstevel@tonic-gate page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL); 36627c478bd9Sstevel@tonic-gate page_unlock(npp); 36637c478bd9Sstevel@tonic-gate } 36647c478bd9Sstevel@tonic-gate 36657c478bd9Sstevel@tonic-gate /* 36667c478bd9Sstevel@tonic-gate * Give up the pages we already got. 36677c478bd9Sstevel@tonic-gate */ 36687c478bd9Sstevel@tonic-gate while (plist != NULL) { 36697c478bd9Sstevel@tonic-gate pp = plist; 36707c478bd9Sstevel@tonic-gate page_sub(&plist, pp); 36717c478bd9Sstevel@tonic-gate page_io_unlock(pp); 36727c478bd9Sstevel@tonic-gate plist_len++; 36737c478bd9Sstevel@tonic-gate /*LINTED: constant in conditional ctx*/ 36747c478bd9Sstevel@tonic-gate VN_DISPOSE(pp, B_INVAL, 0, kcred); 36757c478bd9Sstevel@tonic-gate } 36767c478bd9Sstevel@tonic-gate 36777c478bd9Sstevel@tonic-gate /* 36787c478bd9Sstevel@tonic-gate * VN_DISPOSE does freemem accounting for the pages in plist 36797c478bd9Sstevel@tonic-gate * by calling page_free. So, we need to undo the pcf accounting 36807c478bd9Sstevel@tonic-gate * for only the remaining pages. 36817c478bd9Sstevel@tonic-gate */ 36827c478bd9Sstevel@tonic-gate VM_STAT_ADD(page_create_putbacks); 36837c478bd9Sstevel@tonic-gate page_create_putback(pages_req - plist_len); 36847c478bd9Sstevel@tonic-gate 36857c478bd9Sstevel@tonic-gate return (NULL); 36867c478bd9Sstevel@tonic-gate } 3687843e1988Sjohnlev #endif /* !__xpv */ 36887c478bd9Sstevel@tonic-gate 36897c478bd9Sstevel@tonic-gate 36907c478bd9Sstevel@tonic-gate /* 36917c478bd9Sstevel@tonic-gate * Copy the data from the physical page represented by "frompp" to 36927c478bd9Sstevel@tonic-gate * that represented by "topp". ppcopy uses CPU->cpu_caddr1 and 36937c478bd9Sstevel@tonic-gate * CPU->cpu_caddr2. It assumes that no one uses either map at interrupt 36947c478bd9Sstevel@tonic-gate * level and no one sleeps with an active mapping there. 36957c478bd9Sstevel@tonic-gate * 36967c478bd9Sstevel@tonic-gate * Note that the ref/mod bits in the page_t's are not affected by 36977c478bd9Sstevel@tonic-gate * this operation, hence it is up to the caller to update them appropriately. 
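 * When kpm is enabled, both pages are already mapped by the kernel's
 * physical mapping segment and we use those addresses directly;
 * otherwise we borrow the per-CPU caddr1/caddr2 scratch mappings,
 * which is why preemption stays disabled for the duration.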
/*
 * Copy the data from the physical page represented by "frompp" to
 * that represented by "topp".  ppcopy uses CPU->cpu_caddr1 and
 * CPU->cpu_caddr2.  It assumes that no one uses either map at interrupt
 * level and no one sleeps with an active mapping there.
 *
 * Note that the ref/mod bits in the page_t's are not affected by
 * this operation, hence it is up to the caller to update them
 * appropriately.
 */
int
ppcopy(page_t *frompp, page_t *topp)
{
	caddr_t pp_addr1;
	caddr_t pp_addr2;
	hat_mempte_t pte1;
	hat_mempte_t pte2;
	kmutex_t *ppaddr_mutex;
	label_t ljb;
	int ret = 1;

	ASSERT_STACK_ALIGNED();
	ASSERT(PAGE_LOCKED(frompp));
	ASSERT(PAGE_LOCKED(topp));

	if (kpm_enable) {
		pp_addr1 = hat_kpm_page2va(frompp, 0);
		pp_addr2 = hat_kpm_page2va(topp, 0);
		kpreempt_disable();
	} else {
		/*
		 * Disable preemption so that the CPU can't change
		 * underneath us.
		 */
		kpreempt_disable();

		pp_addr1 = CPU->cpu_caddr1;
		pp_addr2 = CPU->cpu_caddr2;
		pte1 = CPU->cpu_caddr1pte;
		pte2 = CPU->cpu_caddr2pte;

		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
		mutex_enter(ppaddr_mutex);

		hat_mempte_remap(page_pptonum(frompp), pp_addr1, pte1,
		    PROT_READ | HAT_STORECACHING_OK, HAT_LOAD_NOCONSIST);
		hat_mempte_remap(page_pptonum(topp), pp_addr2, pte2,
		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
		    HAT_LOAD_NOCONSIST);
	}

	if (on_fault(&ljb)) {
		ret = 0;
		goto faulted;
	}
	if (use_sse_pagecopy)
#ifdef __xpv
		page_copy_no_xmm(pp_addr2, pp_addr1);
#else
		hwblkpagecopy(pp_addr1, pp_addr2);
#endif
	else
		bcopy(pp_addr1, pp_addr2, PAGESIZE);

	no_fault();
faulted:
	if (!kpm_enable) {
#ifdef __xpv
		/*
		 * We can't leave unused mappings lying around under
		 * the hypervisor, so blow them away.
		 */
		if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr1, 0,
		    UVMF_INVLPG | UVMF_LOCAL) < 0)
			panic("HYPERVISOR_update_va_mapping() failed");
		if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0,
		    UVMF_INVLPG | UVMF_LOCAL) < 0)
			panic("HYPERVISOR_update_va_mapping() failed");
#endif
		mutex_exit(ppaddr_mutex);
	}
	kpreempt_enable();
	return (ret);
}
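
/*
 * Illustrative caller of ppcopy(), compiled out.  The helper name is
 * hypothetical; what it demonstrates is the contract above: both pages
 * must be locked by the caller, and a return of 0 means the copy took
 * a fault (e.g. a UE on the source page) and the destination data is
 * not valid.
 */
#ifdef notdef
static int
copy_page_checked(page_t *src, page_t *dst)
{
	ASSERT(PAGE_LOCKED(src));
	ASSERT(PAGE_LOCKED(dst));

	return (ppcopy(src, dst));	/* 0 == faulted, 1 == copied */
}
#endif /* notdef */
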
void
pagezero(page_t *pp, uint_t off, uint_t len)
{
	ASSERT(PAGE_LOCKED(pp));
	pfnzero(page_pptonum(pp), off, len);
}

/*
 * Zero the physical page from off to off + len given by pfn, without
 * changing the reference and modified bits of the page.
 *
 * This uses the CPU-private page address #2; see ppcopy() for more
 * info.  pfnzero() must not be called at interrupt level.
 */
void
pfnzero(pfn_t pfn, uint_t off, uint_t len)
{
	caddr_t pp_addr2;
	hat_mempte_t pte2;
	kmutex_t *ppaddr_mutex = NULL;

	ASSERT_STACK_ALIGNED();
	ASSERT(len <= MMU_PAGESIZE);
	ASSERT(off <= MMU_PAGESIZE);
	ASSERT(off + len <= MMU_PAGESIZE);

	if (kpm_enable && !pfn_is_foreign(pfn)) {
		pp_addr2 = hat_kpm_pfn2va(pfn);
		kpreempt_disable();
	} else {
		kpreempt_disable();

		pp_addr2 = CPU->cpu_caddr2;
		pte2 = CPU->cpu_caddr2pte;

		ppaddr_mutex = &CPU->cpu_ppaddr_mutex;
		mutex_enter(ppaddr_mutex);

		hat_mempte_remap(pfn, pp_addr2, pte2,
		    PROT_READ | PROT_WRITE | HAT_STORECACHING_OK,
		    HAT_LOAD_NOCONSIST);
	}

	if (use_sse_pagezero) {
#ifdef __xpv
		uint_t rem;

		/*
		 * Zero a byte at a time until properly aligned for
		 * block_zero_no_xmm().
		 */
		while (!P2NPHASE(off, ((uint_t)BLOCKZEROALIGN)) && len-- > 0)
			pp_addr2[off++] = 0;

		/*
		 * Now use the faster block_zero_no_xmm() for any range
		 * that is properly aligned and sized.
		 */
		rem = P2PHASE(len, ((uint_t)BLOCKZEROALIGN));
		len -= rem;
		if (len != 0) {
			block_zero_no_xmm(pp_addr2 + off, len);
			off += len;
		}

		/*
		 * Zero the remainder with byte stores.
		 */
		while (rem-- > 0)
			pp_addr2[off++] = 0;
#else
		hwblkclr(pp_addr2 + off, len);
#endif
	} else {
		bzero(pp_addr2 + off, len);
	}

	if (!kpm_enable || pfn_is_foreign(pfn)) {
#ifdef __xpv
		/*
		 * On the hypervisor this page might get used for a page
		 * table before any intervening change to this mapping,
		 * so blow it away.
		 */
		if (HYPERVISOR_update_va_mapping((uintptr_t)pp_addr2, 0,
		    UVMF_INVLPG) < 0)
			panic("HYPERVISOR_update_va_mapping() failed");
#endif
		mutex_exit(ppaddr_mutex);
	}

	kpreempt_enable();
}
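
/*
 * The head/middle/tail technique used by the __xpv arm of pfnzero()
 * above, restated standalone and compiled out.  'blkzero' is a
 * hypothetical stand-in for block_zero_no_xmm(); P2NPHASE()/P2PHASE()
 * are the usual sys/sysmacros.h helpers (bytes up to the next 'align'
 * boundary, and bytes past the previous one, respectively).
 */
#ifdef notdef
static void
aligned_zero_sketch(caddr_t buf, uint_t off, uint_t len, uint_t align,
    void (*blkzero)(void *, size_t))
{
	uint_t head = P2NPHASE(off, align);
	uint_t rem;

	if (head > len)
		head = len;
	len -= head;
	while (head-- > 0)		/* byte stores up to alignment */
		buf[off++] = 0;

	rem = P2PHASE(len, align);	/* tail too small for a block */
	len -= rem;
	if (len != 0) {			/* aligned, block-sized middle */
		blkzero(buf + off, len);
		off += len;
	}
	while (rem-- > 0)		/* byte stores for the tail */
		buf[off++] = 0;
}
#endif /* notdef */
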
/*
 * Platform-dependent page scrub call.
 */
void
pagescrub(page_t *pp, uint_t off, uint_t len)
{
	/*
	 * For now, we rely on the fact that pagezero() will
	 * always clear UEs.
	 */
	pagezero(pp, off, len);
}
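
/*
 * A hedged usage sketch for pagezero()/pagescrub(), compiled out.  The
 * helper and 'validlen' are hypothetical; the bounds mirror the
 * ASSERTs in pfnzero() above: off, len and off + len must all stay
 * within MMU_PAGESIZE, and the page must be locked.
 */
#ifdef notdef
static void
zero_page_tail(page_t *pp, uint_t validlen)
{
	ASSERT(PAGE_LOCKED(pp));
	ASSERT(validlen <= MMU_PAGESIZE);

	if (validlen < MMU_PAGESIZE)
		pagezero(pp, validlen, MMU_PAGESIZE - validlen);
}
#endif /* notdef */
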
/*
 * Set up two private addresses on a given CPU for use in ppcopy().
 */
void
setup_vaddr_for_ppcopy(struct cpu *cpup)
{
	void *addr;
	hat_mempte_t pte_pa;

	addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
	pte_pa = hat_mempte_setup(addr);
	cpup->cpu_caddr1 = addr;
	cpup->cpu_caddr1pte = pte_pa;

	addr = vmem_alloc(heap_arena, mmu_ptob(1), VM_SLEEP);
	pte_pa = hat_mempte_setup(addr);
	cpup->cpu_caddr2 = addr;
	cpup->cpu_caddr2pte = pte_pa;

	mutex_init(&cpup->cpu_ppaddr_mutex, NULL, MUTEX_DEFAULT, NULL);
}

/*
 * Undo setup_vaddr_for_ppcopy().
 */
void
teardown_vaddr_for_ppcopy(struct cpu *cpup)
{
	mutex_destroy(&cpup->cpu_ppaddr_mutex);

	hat_mempte_release(cpup->cpu_caddr2, cpup->cpu_caddr2pte);
	cpup->cpu_caddr2pte = 0;
	vmem_free(heap_arena, cpup->cpu_caddr2, mmu_ptob(1));
	cpup->cpu_caddr2 = 0;

	hat_mempte_release(cpup->cpu_caddr1, cpup->cpu_caddr1pte);
	cpup->cpu_caddr1pte = 0;
	vmem_free(heap_arena, cpup->cpu_caddr1, mmu_ptob(1));
	cpup->cpu_caddr1 = 0;
}

/*
 * Function for flushing D-cache when performing module relocations
 * to an alternate mapping.  Unnecessary on Intel / AMD platforms.
 */
void
dcache_flushall()
{}

size_t
exec_get_spslew(void)
{
	return (0);
}
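
/*
 * Compiled-out sketch of the intended pairing of
 * setup_vaddr_for_ppcopy() and teardown_vaddr_for_ppcopy(): a CPU gets
 * its private ppcopy()/pfnzero() mappings when it is brought up and
 * gives them back when it is torn down.  The hook itself is
 * hypothetical; in the real kernel these calls sit in the CPU startup
 * and removal paths.
 */
#ifdef notdef
static void
cpu_ppcopy_hook(struct cpu *cp, boolean_t starting)
{
	if (starting)
		setup_vaddr_for_ppcopy(cp);
	else
		teardown_vaddr_for_ppcopy(cp);
}
#endif /* notdef */
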
/*
 * Allocate a memory page.  The argument 'seed' can be any pseudo-random
 * number to vary where the page comes from.  This is quite a hacked-up
 * method -- it works for now, but really needs to be fixed up a bit.
 *
 * We currently use page_create_va() on the kvp with fake offsets,
 * segments and virt address.  This is pretty bogus, but was copied from
 * the old hat_i86.c code.  A better approach would be to specify either
 * mnode random or mnode local and take a page from whatever color has
 * the MOST available -- this would have a minimal impact on page
 * coloring.
 */
page_t *
page_get_physical(uintptr_t seed)
{
	page_t *pp;
	u_offset_t offset;
	static struct seg tmpseg;
	static uintptr_t ctr = 0;

	/*
	 * This code is gross; we really need a simpler page allocator.
	 *
	 * We need to assign an offset for the page in order to call
	 * page_create_va().  To avoid conflicts with other pages, we
	 * get creative with the offset:
	 * for 32 bits, we need an offset > 4 Gig;
	 * for 64 bits, we need an offset somewhere in the VA hole.
	 */
	offset = seed;
	if (offset > kernelbase)
		offset -= kernelbase;
	offset <<= MMU_PAGESHIFT;
#if defined(__amd64)
	offset += mmu.hole_start;	/* something in VA hole */
#else
	offset += 1ULL << 40;		/* something > 4 Gig */
#endif

	if (page_resv(1, KM_NOSLEEP) == 0)
		return (NULL);

#ifdef DEBUG
	pp = page_exists(&kvp, offset);
	if (pp != NULL)
		panic("page already exists %p", (void *)pp);
#endif

	pp = page_create_va(&kvp, offset, MMU_PAGESIZE, PG_EXCL,
	    &tmpseg, (caddr_t)(ctr += MMU_PAGESIZE));	/* changing VA usage */
	if (pp != NULL) {
		page_io_unlock(pp);
		page_downgrade(pp);
	}
	return (pp);
}
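
/*
 * Hedged usage sketch for page_get_physical(), compiled out.  The
 * wrapper is hypothetical; it shows what the function hands back: on
 * success the page has been page_downgrade()d to a shared lock and
 * carries the page_resv(1, ...) reservation, both of which the caller
 * must eventually undo.
 */
#ifdef notdef
static page_t *
grab_anonymous_page(uintptr_t seed)
{
	page_t *pp = page_get_physical(seed);

	if (pp == NULL)			/* reservation or create failed */
		return (NULL);
	ASSERT(PAGE_SHARED(pp));	/* downgraded by page_get_physical */
	return (pp);
}
#endif /* notdef */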