/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * UNIX machine dependent virtual memory support.
 */

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

/*
 * These variables are set by module specific config routines.
 * They are only set by modules which will use physical cache page coloring
 * and/or virtual cache page coloring.
 */
int do_pg_coloring = 0;
int do_virtual_coloring = 0;

/*
 * These variables can be conveniently patched at kernel load time to
 * prevent do_pg_coloring or do_virtual_coloring from being enabled by
 * module specific config routines.
 */
int use_page_coloring = 1;
int use_virtual_coloring = 1;

/*
 * initialized by page_coloring_init()
 */
extern uint_t page_colors;
extern uint_t page_colors_mask;
extern uint_t page_coloring_shift;
int cpu_page_colors;
uint_t vac_colors = 0;
uint_t vac_colors_mask = 0;

/* cpu specific coloring initialization */
extern void page_coloring_init_cpu();
#pragma weak page_coloring_init_cpu

/*
 * get the ecache setsize for the current cpu.
 */
#define	CPUSETSIZE()	(cpunodes[CPU->cpu_id].ecache_setsize)

plcnt_t		plcnt;		/* page list count */

/*
 * This variable is set by the cpu module to contain the lowest
 * address not affected by the SF_ERRATA_57 workaround.  It should
 * remain 0 if the workaround is not needed.
 */
#if defined(SF_ERRATA_57)
caddr_t errata57_limit;
#endif

extern int disable_auto_large_pages;	/* used by map_pgsz*() routines */

extern void page_relocate_hash(page_t *, page_t *);

/*
 * these must be defined in platform specific areas
 */
extern void map_addr_proc(caddr_t *, size_t, offset_t, int, caddr_t,
	struct proc *, uint_t);
extern page_t *page_get_freelist(struct vnode *, u_offset_t, struct seg *,
	caddr_t, size_t, uint_t, struct lgrp *);

/*
 * Convert page frame number to an OBMEM page frame number
 * (i.e. put in the type bits -- zero for this implementation)
 */
pfn_t
impl_obmem_pfnum(pfn_t pf)
{
	return (pf);
}

/*
 * Use physmax to determine the highest physical page of DRAM memory.
 * It is assumed that any physical address above physmax is in IO space.
 * We don't bother checking the low end because we assume that memory space
 * begins at physical page frame 0.
 *
 * Return 1 if the page frame is onboard DRAM memory, else 0.
 * Returns 0 for nvram so it won't be cached.
 */
int
pf_is_memory(pfn_t pf)
{
	/* We must be IO space */
	if (pf > physmax)
		return (0);

	/* We must be memory space */
	return (1);
}
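/*
 * Illustrative example of the physmax check above (assumed configuration,
 * not from the original source): with 8K pages and 4 GB of installed DRAM
 * there are 0x80000 page frames, so physmax would be 0x7ffff.
 * pf_is_memory(0x7ffff) then returns 1 (cacheable DRAM), while
 * pf_is_memory(0x80000), an IO-space frame, returns 0.
 */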
/*
 * Handle a pagefault.
 */
faultcode_t
pagefault(caddr_t addr, enum fault_type type, enum seg_rw rw, int iskernel)
{
	struct as *as;
	struct proc *p;
	faultcode_t res;
	caddr_t base;
	size_t len;
	int err;

	if (INVALID_VADDR(addr))
		return (FC_NOMAP);

	if (iskernel) {
		as = &kas;
	} else {
		p = curproc;
		as = p->p_as;
#if defined(SF_ERRATA_57)
		/*
		 * Prevent infinite loops due to a segment driver
		 * setting the execute permissions and the sfmmu hat
		 * silently ignoring them.
		 */
		if (rw == S_EXEC && AS_TYPE_64BIT(as) &&
		    addr < errata57_limit) {
			res = FC_NOMAP;
			goto out;
		}
#endif
	}

	/*
	 * Dispatch pagefault.
	 */
	res = as_fault(as->a_hat, as, addr, 1, type, rw);

	/*
	 * If this isn't a potential unmapped hole in the user's
	 * UNIX data or stack segments, just return status info.
	 */
	if (!(res == FC_NOMAP && iskernel == 0))
		goto out;

	/*
	 * Check to see if we happened to fault on a currently unmapped
	 * part of the UNIX data or stack segments.  If so, create a zfod
	 * mapping there and then try calling the fault routine again.
	 */
	base = p->p_brkbase;
	len = p->p_brksize;

	if (addr < base || addr >= base + len) {	/* data seg? */
		base = (caddr_t)(p->p_usrstack - p->p_stksize);
		len = p->p_stksize;
		if (addr < base || addr >= p->p_usrstack) {	/* stack seg? */
			/* not in either UNIX data or stack segments */
			res = FC_NOMAP;
			goto out;
		}
	}

	/* the rest of this function implements a 3.X 4.X 5.X compatibility */
	/* This code is probably not needed anymore */

	/* expand the gap to the page boundaries on each side */
	len = (((uintptr_t)base + len + PAGEOFFSET) & PAGEMASK) -
	    ((uintptr_t)base & PAGEMASK);
	base = (caddr_t)((uintptr_t)base & PAGEMASK);

	as_rangelock(as);
	as_purge(as);
	if (as_gap(as, PAGESIZE, &base, &len, AH_CONTAIN, addr) == 0) {
		err = as_map(as, base, len, segvn_create, zfod_argsp);
		as_rangeunlock(as);
		if (err) {
			res = FC_MAKE_ERR(err);
			goto out;
		}
	} else {
		/*
		 * This page is already mapped by another thread after we
		 * returned from as_fault() above.  We just fall through to
		 * as_fault() below.
		 */
		as_rangeunlock(as);
	}

	res = as_fault(as->a_hat, as, addr, 1, F_INVAL, rw);

out:
	return (res);
}
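/*
 * Worked example of the gap expansion in pagefault() above (illustrative
 * values only, assuming 8K pages, i.e. PAGESIZE == 0x2000): for
 * base == 0x2001c and len == 0x2100, the expression rounds the range out
 * to the enclosing page boundaries, giving base == 0x20000 and
 * len == 0x4000 (two pages), which is then handed to as_gap()/as_map().
 */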
/*
 * This is the routine which defines the address limit implied
 * by the flag '_MAP_LOW32'.  USERLIMIT32 matches the highest
 * mappable address in a 32-bit process on this platform (though
 * perhaps we should make it be UINT32_MAX here?)
 */
void
map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
{
	struct proc *p = curproc;
	caddr_t userlimit = flags & _MAP_LOW32 ?
	    (caddr_t)USERLIMIT32 : p->p_as->a_userlimit;
	map_addr_proc(addrp, len, off, vacalign, userlimit, p, flags);
}

/*
 * Some V9 CPUs have holes in the middle of the 64-bit virtual address range.
 */
caddr_t	hole_start, hole_end;

/*
 * kpm mapping window
 */
caddr_t kpm_vbase;
size_t  kpm_size;
uchar_t kpm_size_shift;

/*
 * Determine whether [base, base+len] contains a mappable range of
 * addresses at least minlen long.  base and len are adjusted if
 * required to provide a mappable range.
 */
/* ARGSUSED */
int
valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
{
	caddr_t hi, lo;

	lo = *basep;
	hi = lo + *lenp;

	/*
	 * If hi rolled over the top, try cutting back.
	 */
	if (hi < lo) {
		size_t newlen = 0 - (uintptr_t)lo - 1l;

		if (newlen + (uintptr_t)hi < minlen)
			return (0);
		if (newlen < minlen)
			return (0);
		*lenp = newlen;
	} else if (hi - lo < minlen)
		return (0);

	/*
	 * Deal with a possible hole in the address range between
	 * hole_start and hole_end that should never be mapped by the MMU.
	 */
	hi = lo + *lenp;

	if (lo < hole_start) {
		if (hi > hole_start)
			if (hi < hole_end)
				hi = hole_start;
			else
				/* lo < hole_start && hi >= hole_end */
				if (dir == AH_LO) {
					/*
					 * prefer lowest range
					 */
					if (hole_start - lo >= minlen)
						hi = hole_start;
					else if (hi - hole_end >= minlen)
						lo = hole_end;
					else
						return (0);
				} else {
					/*
					 * prefer highest range
					 */
					if (hi - hole_end >= minlen)
						lo = hole_end;
					else if (hole_start - lo >= minlen)
						hi = hole_start;
					else
						return (0);
				}
	} else {
		/* lo >= hole_start */
		if (hi < hole_end)
			return (0);
		if (lo < hole_end)
			lo = hole_end;
	}

	if (hi - lo < minlen)
		return (0);

	*basep = lo;
	*lenp = hi - lo;

	return (1);
}

/*
 * Determine whether [addr, addr+len] with protections `prot' are valid
 * for a user address space.
 */
/*ARGSUSED*/
int
valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
    caddr_t userlimit)
{
	caddr_t eaddr = addr + len;

	if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
		return (RANGE_BADADDR);

	/*
	 * Determine if the address range falls within an illegal
	 * range of the MMU.
	 */
	if (eaddr > hole_start && addr < hole_end)
		return (RANGE_BADADDR);

#if defined(SF_ERRATA_57)
	/*
	 * Make sure USERLIMIT isn't raised too high
	 */
	ASSERT64(addr <= (caddr_t)0xffffffff80000000ul ||
	    errata57_limit == 0);

	if (AS_TYPE_64BIT(as) &&
	    (addr < errata57_limit) &&
	    (prot & PROT_EXEC))
		return (RANGE_BADPROT);
#endif /* SF_ERRATA_57 */
	return (RANGE_OKAY);
}

/*
 * Routine used to check to see if an a.out can be executed
 * by the current machine/architecture.
 */
int
chkaout(struct exdata *exp)
{
	if (exp->ux_mach == M_SPARC)
		return (0);
	else
		return (ENOEXEC);
}

/*
 * The following functions return information about an a.out
 * which is used when a program is executed.
 */

/*
 * Return the load memory address for the data segment.
 */
caddr_t
getdmem(struct exec *exp)
{
	/*
	 * XXX - Sparc Reference Hack approaching
	 * Remember that we are loading
	 * 8k executables into a 4k machine
	 * DATA_ALIGN == 2 * PAGESIZE
	 */
	if (exp->a_text)
		return ((caddr_t)(roundup(USRTEXT + exp->a_text, DATA_ALIGN)));
	else
		return ((caddr_t)USRTEXT);
}

/*
 * Return the starting disk address for the data segment.
 */
ulong_t
getdfile(struct exec *exp)
{
	if (exp->a_magic == ZMAGIC)
		return (exp->a_text);
	else
		return (sizeof (struct exec) + exp->a_text);
}

/*
 * Return the load memory address for the text segment.
 */
/*ARGSUSED*/
caddr_t
gettmem(struct exec *exp)
{
	return ((caddr_t)USRTEXT);
}

/*
 * Return the file byte offset for the text segment.
 */
uint_t
gettfile(struct exec *exp)
{
	if (exp->a_magic == ZMAGIC)
		return (0);
	else
		return (sizeof (struct exec));
}

void
getexinfo(
	struct exdata *edp_in,
	struct exdata *edp_out,
	int *pagetext,
	int *pagedata)
{
	*edp_out = *edp_in;	/* structure copy */

	if ((edp_in->ux_mag == ZMAGIC) &&
	    ((edp_in->vp->v_flag & VNOMAP) == 0)) {
		*pagetext = 1;
		*pagedata = 1;
	} else {
		*pagetext = 0;
		*pagedata = 0;
	}
}

#define	MAP_PGSZ_COMMON(pgsz, n, upper, lower, len)		\
	for ((n) = (upper); (n) > (lower); (n)--) {		\
		if (disable_auto_large_pages & (1 << (n)))	\
			continue;				\
		if (hw_page_array[(n)].hp_size <= (len)) {	\
			(pgsz) = hw_page_array[(n)].hp_size;	\
			break;					\
		}						\
	}
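/*
 * Illustrative expansion of MAP_PGSZ_COMMON (assumed values, not from the
 * original source, using the usual sun4u page sizes of 8K/64K/512K/4M):
 * with upper == TTE4M, lower == 0, len == 5 MB and nothing disabled in
 * disable_auto_large_pages, the loop starts at the 4M entry of
 * hw_page_array[], sees hp_size (4 MB) <= len, and sets pgsz to 4 MB.
 * For len == 256 KB it would skip the 4M and 512K entries and settle
 * on 64K.
 */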
/*ARGSUSED*/
static size_t
map_pgszva(struct proc *p, caddr_t addr, size_t len)
{
	size_t pgsz = MMU_PAGESIZE;
	int n, upper;

	/*
	 * Select the best fit page size within the constraints of
	 * auto_lpg_{min,max}szc.
	 *
	 * Note that we also take the heap size into account when
	 * deciding if we've crossed the threshold at which we should
	 * increase the page size.  This isn't perfect since the heap
	 * may not have reached its full size yet, but it's better than
	 * not considering it at all.
	 */
	len += p->p_brksize;
	if (ptob(auto_lpg_tlb_threshold) <= len) {

		upper = MIN(mmu_page_sizes - 1, auto_lpg_maxszc);

		/*
		 * Use auto_lpg_minszc - 1 as the limit so we never drop
		 * below auto_lpg_minszc.  We don't have a size code to refer
		 * to like we have for bss and stack, so we assume 0.
		 * auto_lpg_minszc should always be >= 0.  Using
		 * auto_lpg_minszc cuts off the loop.
		 */
		MAP_PGSZ_COMMON(pgsz, n, upper, auto_lpg_minszc - 1, len);
	}

	return (pgsz);
}

static size_t
map_pgszheap(struct proc *p, caddr_t addr, size_t len)
{
	size_t pgsz;
	int n, upper, lower;

	/*
	 * If len is zero, retrieve from proc and don't demote the page size.
	 */
	if (len == 0) {
		len = p->p_brksize;
	}

	/*
	 * Still zero?  Then we don't have a heap yet, so pick the default
	 * heap size.
	 */
	if (len == 0) {
		pgsz = auto_lpg_heap_default;
	} else {
		pgsz = hw_page_array[p->p_brkpageszc].hp_size;
	}

	if ((pgsz * auto_lpg_tlb_threshold) <= len) {
		/*
		 * We're past the threshold, so select the best fit
		 * page size within the constraints of
		 * auto_lpg_{min,max}szc and the minimum required
		 * alignment.
		 */
		upper = MIN(mmu_page_sizes - 1, auto_lpg_maxszc);
		lower = MAX(auto_lpg_minszc - 1, p->p_brkpageszc);
		MAP_PGSZ_COMMON(pgsz, n, upper, lower, len);
	}

	/*
	 * If addr == 0 we were called by memcntl() or exec_args() when the
	 * size code is 0.  Don't set pgsz less than current size.
	 */
	if (addr == 0 && (pgsz < hw_page_array[p->p_brkpageszc].hp_size)) {
		pgsz = hw_page_array[p->p_brkpageszc].hp_size;
	}

	return (pgsz);
}

static size_t
map_pgszstk(struct proc *p, caddr_t addr, size_t len)
{
	size_t pgsz;
	int n, upper, lower;

	/*
	 * If len is zero, retrieve from proc and don't demote the page size.
	 */
	if (len == 0) {
		len = p->p_stksize;
	}

	/*
	 * Still zero?  Then we don't have a stack yet, so pick the default
	 * stack size.
	 */
	if (len == 0) {
		pgsz = auto_lpg_stack_default;
	} else {
		pgsz = hw_page_array[p->p_stkpageszc].hp_size;
	}

	if ((pgsz * auto_lpg_tlb_threshold) <= len) {
		/*
		 * We're past the threshold, so select the best fit
		 * page size within the constraints of
		 * auto_lpg_{min,max}szc and the minimum required
		 * alignment.
		 */
		upper = MIN(mmu_page_sizes - 1, auto_lpg_maxszc);
		lower = MAX(auto_lpg_minszc - 1, p->p_brkpageszc);
		MAP_PGSZ_COMMON(pgsz, n, upper, lower, len);
	}

	/*
	 * If addr == 0 we were called by memcntl() or exec_args() when the
	 * size code is 0.  Don't set pgsz less than current size.
	 */
	if (addr == 0 && (pgsz < hw_page_array[p->p_stkpageszc].hp_size)) {
		pgsz = hw_page_array[p->p_stkpageszc].hp_size;
	}

	return (pgsz);
}

static size_t
map_pgszism(caddr_t addr, size_t len)
{
	uint_t szc;
	size_t pgsz;
	extern int disable_ism_large_pages;

	for (szc = mmu_page_sizes - 1; szc >= TTE4M; szc--) {
		if (disable_ism_large_pages & (1 << szc))
			continue;

		pgsz = hw_page_array[szc].hp_size;
		if ((len >= pgsz) && IS_P2ALIGNED(addr, pgsz))
			return (pgsz);
	}
	return (DEFAULT_ISM_PAGESIZE);
}
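/*
 * Example of the ISM selection above (assumed sun4u page sizes of
 * 8K/64K/512K/4M; illustrative only): an ISM segment of 6 MB whose base
 * address is 4 MB aligned gets a 4 MB page size, while a 2 MB segment,
 * or one that is not 4 MB aligned, falls back to DEFAULT_ISM_PAGESIZE.
 */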
/*
 * Suggest a page size to be used to map a segment of type maptype and length
 * len.  Returns a page size (not a size code).
 * If remap is non-NULL, fill in a value suggesting whether or not to remap
 * this segment.
 */
size_t
map_pgsz(int maptype, struct proc *p, caddr_t addr, size_t len, int *remap)
{
	size_t	pgsz = 0;

	if (remap != NULL)
		*remap = (len > auto_lpg_remap_threshold);

	switch (maptype) {
	case MAPPGSZ_ISM:
		pgsz = map_pgszism(addr, len);
		break;

	case MAPPGSZ_VA:
		pgsz = map_pgszva(p, addr, len);
		break;

	case MAPPGSZ_STK:
		pgsz = map_pgszstk(p, addr, len);
		break;

	case MAPPGSZ_HEAP:
		pgsz = map_pgszheap(p, addr, len);
		break;
	}
	return (pgsz);
}

/*
 * Return a non-0 value if the address may cause a VAC alias with KPM mappings.
 * KPM selects an address such that its offset is equal modulo shm_alignment
 * and assumes it can't be in VAC conflict with any larger than PAGESIZE
 * mapping.
 */
int
map_addr_vacalign_check(caddr_t addr, u_offset_t off)
{
	if (vac) {
		return (((uintptr_t)addr ^ off) & shm_alignment - 1);
	} else {
		return (0);
	}
}
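/*
 * Worked example of the alias check above (illustrative values only): with
 * a shm_alignment of 64K (0x10000), addr == 0x30000 and off == 0x2000 give
 * (0x30000 ^ 0x2000) & 0xffff == 0x2000, a nonzero result, so the mapping
 * could alias in a virtually indexed cache.  With off == 0x10000 the same
 * address yields 0 and no alias is possible.
 */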
/*
 * use_text_pgsz64k, use_initdata_pgsz64k and use_text_pgsz4m
 * can be set in platform or CPU specific code but user can change the
 * default values via /etc/system.
 *
 * Initial values are defined in architecture specific mach_vm_dep.c file.
 */
extern int use_text_pgsz64k;
extern int use_text_pgsz4m;
extern int use_initdata_pgsz64k;

/*
 * disable_text_largepages and disable_initdata_largepages bitmasks are set in
 * platform or CPU specific code to disable page sizes that should not be
 * used.  These variables normally shouldn't be changed via /etc/system.  A
 * particular page size for text or initialized data will be used by default
 * only if the corresponding use_* variable is set to 1 AND this page size is
 * not disabled in the corresponding disable_* bitmask variable.
 *
 * Initial values are defined in architecture specific mach_vm_dep.c file.
 */
extern int disable_text_largepages;
extern int disable_initdata_largepages;

/*
 * Minimum segment size tunables before 64K or 4M large pages
 * should be used to map it.
 *
 * Initial values are defined in architecture specific mach_vm_dep.c file.
 */
extern size_t text_pgsz64k_minsize;
extern size_t text_pgsz4m_minsize;
extern size_t initdata_pgsz64k_minsize;

/*
 * Sanity control.  Don't use large pages regardless of user
 * settings if there's less than execseg_lpg_min_physmem memory installed.
 * The unit for this variable is 8K pages.
 */
pgcnt_t execseg_lpg_min_physmem = 131072;	/* 1GB */

extern int disable_shm_large_pages;
pgcnt_t shm_lpg_min_physmem = 131072;	/* 1GB */
extern size_t max_shm_lpsize;

/* assumes TTE8K...TTE4M == szc */

static uint_t
map_text_pgsz4m(caddr_t addr, size_t len)
{
	caddr_t a;

	if (len < text_pgsz4m_minsize) {
		return (0);
	}

	a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE4M, uintptr_t);
	if (a < addr || a >= addr + len) {
		return (0);
	}
	len -= (a - addr);
	if (len < MMU_PAGESIZE4M) {
		return (0);
	}

	return (1 << TTE4M);
}

static uint_t
map_text_pgsz64k(caddr_t addr, size_t len)
{
	caddr_t a;
	size_t svlen = len;

	if (len < text_pgsz64k_minsize) {
		return (0);
	}

	a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE64K, uintptr_t);
	if (a < addr || a >= addr + len) {
		return (0);
	}
	len -= (a - addr);
	if (len < MMU_PAGESIZE64K) {
		return (0);
	}

	if (!use_text_pgsz4m ||
	    disable_text_largepages & (1 << TTE4M)) {
		return (1 << TTE64K);
	}

	if (svlen < text_pgsz4m_minsize) {
		return (1 << TTE64K);
	}

	addr = a;
	a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE4M, uintptr_t);
	if (a < addr || a >= addr + len) {
		return (1 << TTE64K);
	}
	len -= (a - addr);
	if (len < MMU_PAGESIZE4M) {
		return (1 << TTE64K);
	}

	return ((1 << TTE4M) | (1 << TTE64K));
}

static uint_t
map_initdata_pgsz64k(caddr_t addr, size_t len)
{
	caddr_t a;

	if (len < initdata_pgsz64k_minsize) {
		return (0);
	}

	a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE64K, uintptr_t);
	if (a < addr || a >= addr + len) {
		return (0);
	}
	len -= (a - addr);
	if (len < MMU_PAGESIZE64K) {
		return (0);
	}
	return (1 << TTE64K);
}

/*
 * Return a bit vector of large page size codes that
 * can be used to map [addr, addr + len) region.
 */
uint_t
map_execseg_pgszcvec(int text, caddr_t addr, size_t len)
{
	uint_t ret = 0;

	if (physmem < execseg_lpg_min_physmem) {
		return (0);
	}

	if (text) {
		if (use_text_pgsz64k &&
		    !(disable_text_largepages & (1 << TTE64K))) {
			ret = map_text_pgsz64k(addr, len);
		} else if (use_text_pgsz4m &&
		    !(disable_text_largepages & (1 << TTE4M))) {
			ret = map_text_pgsz4m(addr, len);
		}
	} else if (use_initdata_pgsz64k &&
	    !(disable_initdata_largepages & (1 << TTE64K))) {
		ret = map_initdata_pgsz64k(addr, len);
	}

	return (ret);
}

uint_t
map_shm_pgszcvec(caddr_t addr, size_t size, uintptr_t off)
{
	caddr_t eaddr = addr + size;
	uint_t szcvec = 0;
	int i;
	caddr_t raddr;
	caddr_t readdr;
	size_t pgsz;

	if (physmem < shm_lpg_min_physmem || mmu_page_sizes <= 1 ||
	    max_shm_lpsize <= MMU_PAGESIZE) {
		return (0);
	}

	for (i = mmu_page_sizes - 1; i > 0; i--) {
		if (disable_shm_large_pages & (1 << i)) {
			continue;
		}
		pgsz = page_get_pagesize(i);
		if (pgsz > max_shm_lpsize) {
			continue;
		}
		raddr = (caddr_t)P2ROUNDUP((uintptr_t)addr, pgsz);
		readdr = (caddr_t)P2ALIGN((uintptr_t)eaddr, pgsz);
		if (raddr < addr || raddr >= readdr) {
			continue;
		}
		if (P2PHASE((uintptr_t)addr ^ off, pgsz)) {
			continue;
		}
		szcvec |= (1 << i);
		/*
		 * OR in the remaining enabled page sizes.
		 */
		szcvec |= P2PHASE(~disable_shm_large_pages, (1 << i));
		szcvec &= ~1; /* no need to return 8K pagesize */
		break;
	}
	return (szcvec);
}
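/*
 * Illustrative example of the size-code vectors returned above (assuming
 * the usual sun4u size codes TTE8K == 0, TTE64K == 1, TTE512K == 2 and
 * TTE4M == 3; the values are not from the original source): for a 5 MB
 * text segment that spans a 4 MB aligned region, map_execseg_pgszcvec()
 * can return (1 << TTE4M) | (1 << TTE64K) == 0xa, meaning both 4M and 64K
 * pages may be used, while a small or badly aligned segment yields 0
 * (8K pages only).
 */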
/*
 * Anchored in the table below are counters used to keep track
 * of free contiguous physical memory.  Each element of the table contains
 * the array of counters, the size of array which is allocated during
 * startup based on physmax and a shift value used to convert a pagenum
 * into a counter array index or vice versa.  The table has page size
 * for rows and region size for columns:
 *
 *	page_counters[page_size][region_size]
 *
 *	page_size:	TTE size code of pages on page_size freelist.
 *
 *	region_size:	TTE size code of a candidate larger page made up
 *			of contiguous free page_size pages.
 *
 * As you go across a page_size row increasing region_size each
 * element keeps track of how many (region_size - 1) size groups
 * made up of page_size free pages can be coalesced into a
 * region_size page.  Yuck!  Let's try an example:
 *
 *	page_counters[1][3] is the table element used for identifying
 *	candidate 4M pages from contiguous pages off the 64K free list.
 *	Each index in the page_counters[1][3].array spans 4M.  It's the
 *	number of free 512K size (region_size - 1) groups of contiguous
 *	64K free pages.  So when page_counters[1][3].counters[n] == 8
 *	we know we have a candidate 4M page made up of 512K size groups
 *	of 64K free pages.
 */

/*
 * Per page size free lists.  3rd (max_mem_nodes) and 4th (page coloring bins)
 * dimensions are allocated dynamically.
 */
page_t ***page_freelists[MMU_PAGE_SIZES][MAX_MEM_TYPES];

/*
 * For now there is only a single size cache list.
 * Allocated dynamically.
 */
page_t ***page_cachelists[MAX_MEM_TYPES];

kmutex_t *fpc_mutex[NPC_MUTEX];
kmutex_t *cpc_mutex[NPC_MUTEX];

caddr_t
alloc_page_freelists(int mnode, caddr_t alloc_base, int alloc_align)
{
	int	mtype;
	uint_t	szc;

	alloc_base = (caddr_t)roundup((uintptr_t)alloc_base, alloc_align);

	/*
	 * We only support small pages in the cachelist.
	 */
	for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
		page_cachelists[mtype][mnode] = (page_t **)alloc_base;
		alloc_base += (sizeof (page_t *) * page_get_pagecolors(0));
		/*
		 * Allocate freelists bins for all
		 * supported page sizes.
		 */
		for (szc = 0; szc < mmu_page_sizes; szc++) {
			page_freelists[szc][mtype][mnode] =
			    (page_t **)alloc_base;
			alloc_base += ((sizeof (page_t *) *
			    page_get_pagecolors(szc)));
		}
	}

	alloc_base = (caddr_t)roundup((uintptr_t)alloc_base, alloc_align);
	return (alloc_base);
}
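/*
 * Rough sizing example for the carve-out above (assumed, illustrative
 * numbers only): with MAX_MEM_TYPES == 2 and page_get_pagecolors()
 * returning 1024 colors for 8K pages and fewer for each larger size, one
 * memnode consumes a little over 2 * (1024 + the sum of the per-size
 * color counts) page_t pointers, rounded up to alloc_align.  The exact
 * counts depend on the ecache configuration computed in
 * page_coloring_init() below.
 */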
/*
 * Allocate page_freelists bin headers for a memnode from the
 * nucleus data area.  This is the first time that mmu_page_sizes is
 * used during sun4u bootup, so check mmu_page_sizes initialization.
 */
int
ndata_alloc_page_freelists(struct memlist *ndata, int mnode)
{
	size_t alloc_sz;
	caddr_t alloc_base;
	caddr_t end;
	int	mtype;
	uint_t	szc;
	int32_t allp = 0;

	if (&mmu_init_mmu_page_sizes) {
		if (!mmu_init_mmu_page_sizes(allp)) {
			cmn_err(CE_PANIC, "mmu_page_sizes %d not initialized",
			    mmu_page_sizes);
		}
	}
	ASSERT(mmu_page_sizes >= DEFAULT_MMU_PAGE_SIZES);

	/* first time called - allocate max_mem_nodes dimension */
	if (mnode == 0) {
		int	i;

		/* page_cachelists */
		alloc_sz = MAX_MEM_TYPES * max_mem_nodes *
		    sizeof (page_t **);

		/* page_freelists */
		alloc_sz += MAX_MEM_TYPES * mmu_page_sizes * max_mem_nodes *
		    sizeof (page_t **);

		/* fpc_mutex and cpc_mutex */
		alloc_sz += 2 * NPC_MUTEX * max_mem_nodes * sizeof (kmutex_t);

		alloc_base = ndata_alloc(ndata, alloc_sz, ecache_alignsize);
		if (alloc_base == NULL)
			return (-1);

		ASSERT(((uintptr_t)alloc_base & (ecache_alignsize - 1)) == 0);

		for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
			page_cachelists[mtype] = (page_t ***)alloc_base;
			alloc_base += (max_mem_nodes * sizeof (page_t **));
			for (szc = 0; szc < mmu_page_sizes; szc++) {
				page_freelists[szc][mtype] =
				    (page_t ***)alloc_base;
				alloc_base += (max_mem_nodes *
				    sizeof (page_t **));
			}
		}
		for (i = 0; i < NPC_MUTEX; i++) {
			fpc_mutex[i] = (kmutex_t *)alloc_base;
			alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
			cpc_mutex[i] = (kmutex_t *)alloc_base;
			alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
		}
		alloc_sz = 0;
	}

	/*
	 * Calculate the size needed by alloc_page_freelists().
	 */
	for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
		alloc_sz += sizeof (page_t *) * page_get_pagecolors(0);

		for (szc = 0; szc < mmu_page_sizes; szc++)
			alloc_sz += sizeof (page_t *) *
			    page_get_pagecolors(szc);
	}

	alloc_base = ndata_alloc(ndata, alloc_sz, ecache_alignsize);
	if (alloc_base == NULL)
		return (-1);

	end = alloc_page_freelists(mnode, alloc_base, ecache_alignsize);
	ASSERT((uintptr_t)end == roundup((uintptr_t)alloc_base + alloc_sz,
	    ecache_alignsize));

	return (0);
}

/*
 * To select our starting bin, we stride through the bins with a stride
 * of 337.  Why 337?  It's prime, it's largeish, and it performs well both
 * in simulation and practice for different workloads on varying cache sizes.
 */
uint32_t color_start_current = 0;
uint32_t color_start_stride = 337;
int color_start_random = 0;

/* ARGSUSED */
uint_t
get_color_start(struct as *as)
{
	uint32_t old, new;

	if (consistent_coloring == 2 || color_start_random) {
		return ((uint_t)(((gettick()) << (vac_shift - MMU_PAGESHIFT)) &
		    (hw_page_array[0].hp_colors - 1)));
	}

	do {
		old = color_start_current;
		new = old +
		    (color_start_stride << (vac_shift - MMU_PAGESHIFT));
	} while (cas32(&color_start_current, old, new) != old);

	return ((uint_t)(new));
}

/*
 * Called once at startup from kphysm_init() -- before memialloc()
 * is invoked to do the 1st page_free()/page_freelist_add().
 *
 * Initializes page_colors and page_colors_mask based on ecache_setsize.
 *
 * Also initializes the counter locks.
 */
void
page_coloring_init()
{
	int	a, i;
	uint_t colors;

	if (do_pg_coloring == 0) {
		page_colors = 1;
		for (i = 0; i < mmu_page_sizes; i++)
			hw_page_array[i].hp_colors = 1;
		return;
	}

	/*
	 * Calculate page_colors from ecache_setsize.  ecache_setsize contains
	 * the max ecache setsize of all cpus configured in the system or, for
	 * cheetah+ systems, the max possible ecache setsize for all possible
	 * cheetah+ cpus.
	 */
	page_colors = ecache_setsize / MMU_PAGESIZE;
	page_colors_mask = page_colors - 1;

	vac_colors = vac_size / MMU_PAGESIZE;
	vac_colors_mask = vac_colors - 1;

	page_coloring_shift = 0;
	a = ecache_setsize;
	while (a >>= 1) {
		page_coloring_shift++;
	}

	/* initialize number of colors per page size */
	for (i = 0; i < mmu_page_sizes; i++) {
		hw_page_array[i].hp_colors = (page_colors_mask >>
		    (hw_page_array[i].hp_shift - hw_page_array[0].hp_shift))
		    + 1;
	}

	/*
	 * initialize cpu_page_colors if ecache setsizes are homogeneous.
	 * cpu_page_colors set to -1 during DR operation or during startup
	 * if setsizes are heterogeneous.
	 *
	 * The value of cpu_page_colors determines if additional color bins
	 * need to be checked for a particular color in the page_get routines.
	 */
	if ((cpu_page_colors == 0) && (cpu_setsize < ecache_setsize)) {
		cpu_page_colors = cpu_setsize / MMU_PAGESIZE;
		a = lowbit(page_colors) - lowbit(cpu_page_colors);
		ASSERT(a > 0);
		ASSERT(a < 16);

		for (i = 0; i < mmu_page_sizes; i++) {
			if ((colors = hw_page_array[i].hp_colors) <= 1) {
				colorequivszc[i] = 0;
				continue;
			}
			while ((colors >> a) == 0)
				a--;
			ASSERT(a >= 0);

			/* higher 4 bits encodes color equiv mask */
			colorequivszc[i] = (a << 4);
		}
	}

	/* factor in colorequiv to check additional 'equivalent' bins. */
	if (colorequiv > 1 && &page_coloring_init_cpu == NULL) {
		a = lowbit(colorequiv) - 1;
		if (a > 15)
			a = 15;

		for (i = 0; i < mmu_page_sizes; i++) {
			if ((colors = hw_page_array[i].hp_colors) <= 1) {
				continue;
			}
			while ((colors >> a) == 0)
				a--;
			if ((a << 4) > colorequivszc[i]) {
				colorequivszc[i] = (a << 4);
			}
		}
	}

	/* do cpu specific color initialization */
	if (&page_coloring_init_cpu) {
		page_coloring_init_cpu();
	}
}
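/*
 * Worked example of the coloring math above (illustrative configuration,
 * not from the original source): with an 8 MB ecache setsize and 8K pages,
 * page_colors == 8MB / 8K == 1024 and page_coloring_shift == 23.  A 16K
 * VAC gives vac_colors == 2.  If cpu_setsize were 1 MB while ecache_setsize
 * is 8 MB, cpu_page_colors would be 128 and
 * a == lowbit(1024) - lowbit(128) == 3, so 2^3 == 8 color bins are treated
 * as equivalent for 8K pages.
 */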
int
bp_color(struct buf *bp)
{
	int color = -1;

	if (vac) {
		if ((bp->b_flags & B_PAGEIO) != 0) {
			color = sfmmu_get_ppvcolor(bp->b_pages);
		} else if (bp->b_un.b_addr != NULL) {
			color = sfmmu_get_addrvcolor(bp->b_un.b_addr);
		}
	}

	return (color < 0 ? 0 : ptob(color));
}

/*
 * Create & Initialise pageout scanner thread.  The thread starts at
 * `procedure', with process pp and priority pri.
 */
void
pageout_init(void (*procedure)(), proc_t *pp, pri_t pri)
{
	(void) thread_create(NULL, 0, procedure, NULL, 0, pp, TS_RUN, pri);
}

/*
 * Function for flushing D-cache when performing module relocations
 * to an alternate mapping.  Stubbed out on all platforms except sun4u,
 * at least for now.
 */
void
dcache_flushall()
{
	sfmmu_cache_flushall();
}

static int
kdi_range_overlap(uintptr_t va1, size_t sz1, uintptr_t va2, size_t sz2)
{
	if (va1 < va2 && va1 + sz1 <= va2)
		return (0);

	if (va2 < va1 && va2 + sz2 <= va1)
		return (0);

	return (1);
}

/*
 * Return the number of bytes, relative to the beginning of a given range, that
 * are non-toxic (can be read from and written to with relative impunity).
 */
size_t
kdi_range_is_nontoxic(uintptr_t va, size_t sz, int write)
{
	/* OBP reads are harmless, but we don't want people writing there */
	if (write && kdi_range_overlap(va, sz, OFW_START_ADDR, OFW_END_ADDR -
	    OFW_START_ADDR + 1))
		return (va < OFW_START_ADDR ? OFW_START_ADDR - va : 0);

	if (kdi_range_overlap(va, sz, PIOMAPBASE, PIOMAPSIZE))
		return (va < PIOMAPBASE ? PIOMAPBASE - va : 0);

	return (sz); /* no overlap */
}

/*
 * Minimum physmem required for enabling large pages for kernel heap.
 * Currently we do not enable lp for kmem on systems with less
 * than 1GB of memory.  This value can be changed via /etc/system.
 */
size_t segkmem_lpminphysmem = 0x40000000;	/* 1GB */

/*
 * this function chooses large page size for kernel heap
 */
size_t
get_segkmem_lpsize(size_t lpsize)
{
	size_t memtotal = physmem * PAGESIZE;
	size_t mmusz;
	uint_t szc;
	extern int disable_large_pages;

	if (memtotal < segkmem_lpminphysmem)
		return (PAGESIZE);

	if (plat_lpkmem_is_supported != NULL &&
	    plat_lpkmem_is_supported() == 0)
		return (PAGESIZE);

	mmusz = mmu_get_kernel_lpsize(lpsize);
	szc = page_szc(mmusz);

	while (szc) {
		if (!(disable_large_pages & (1 << szc)))
			return (page_get_pagesize(szc));
		szc--;
	}
	return (PAGESIZE);
}
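/*
 * Illustrative behaviour of get_segkmem_lpsize() (assumed values, not from
 * the original source): on a machine with 4 GB of memory where
 * mmu_get_kernel_lpsize() picks 4 MB and 4M pages are not disabled in
 * disable_large_pages, the kernel heap is mapped with 4 MB pages; if 4M
 * were disabled, the loop falls back to the next enabled smaller size, and
 * ultimately to PAGESIZE.
 */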