1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 25 /* 26 * UNIX machine dependent virtual memory support. 27 */ 28 29 #ifndef _VM_DEP_H 30 #define _VM_DEP_H 31 32 #ifdef __cplusplus 33 extern "C" { 34 #endif 35 36 #include <vm/hat_sfmmu.h> 37 #include <sys/archsystm.h> 38 #include <sys/memnode.h> 39 40 #define GETTICK() gettick() 41 42 /* #define for keeping code architecturally neutral */ 43 #define randtick() gettick() 44 45 /* 46 * Per page size free lists. Allocated dynamically. 47 */ 48 #define MAX_MEM_TYPES 2 /* 0 = reloc, 1 = noreloc */ 49 #define MTYPE_RELOC 0 50 #define MTYPE_NORELOC 1 51 52 #define PP_2_MTYPE(pp) (PP_ISNORELOC(pp) ? MTYPE_NORELOC : MTYPE_RELOC) 53 54 #define MTYPE_INIT(mtype, vp, vaddr, flags, pgsz) \ 55 mtype = (flags & PG_NORELOC) ? MTYPE_NORELOC : MTYPE_RELOC; 56 57 /* mtype init for page_get_replacement_page */ 58 #define MTYPE_PGR_INIT(mtype, flags, pp, mnode, pgcnt) \ 59 mtype = (flags & PG_NORELOC) ? MTYPE_NORELOC : MTYPE_RELOC; 60 61 #define MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi) \ 62 pfnlo = mem_node_config[mnode].physbase; \ 63 pfnhi = mem_node_config[mnode].physmax; 64 65 /* 66 * candidate counters in vm_pagelist.c are indexed by color and range 67 */ 68 #define MAX_MNODE_MRANGES MAX_MEM_TYPES 69 #define MNODE_RANGE_CNT(mnode) MAX_MNODE_MRANGES 70 #define MNODE_MAX_MRANGE(mnode) (MAX_MEM_TYPES - 1) 71 #define MTYPE_2_MRANGE(mnode, mtype) (mtype) 72 73 /* 74 * Internal PG_ flags. 75 */ 76 #define PGI_RELOCONLY 0x10000 /* acts in the opposite sense to PG_NORELOC */ 77 #define PGI_NOCAGE 0x20000 /* indicates Cage is disabled */ 78 #define PGI_PGCPHIPRI 0x40000 /* page_get_contig_page priority allocation */ 79 #define PGI_PGCPSZC0 0x80000 /* relocate base pagesize page */ 80 81 /* 82 * PGI mtype flags - should not overlap PGI flags 83 */ 84 #define PGI_MT_RANGE 0x1000000 /* mtype range */ 85 #define PGI_MT_NEXT 0x2000000 /* get next mtype */ 86 87 extern page_t ***page_cachelists[MAX_MEM_TYPES]; 88 89 #define PAGE_CACHELISTS(mnode, color, mtype) \ 90 (*(page_cachelists[mtype][mnode] + (color))) 91 92 /* 93 * There are 'page_colors' colors/bins. Spread them out under a 94 * couple of locks. There are mutexes for both the page freelist 95 * and the page cachelist. We want enough locks to make contention 96 * reasonable, but not too many -- otherwise page_freelist_lock() gets 97 * so expensive that it becomes the bottleneck! 98 */ 99 #define NPC_MUTEX 16 100 101 extern kmutex_t *fpc_mutex[NPC_MUTEX]; 102 extern kmutex_t *cpc_mutex[NPC_MUTEX]; 103 104 /* 105 * Iterator provides the info needed to convert RA to PA. 106 * MEM_NODE_ITERATOR_INIT() should be called before 107 * PAGE_NEXT_PFN_FOR_COLOR() if pfn was not obtained via a previous 108 * PAGE_NEXT_PFN_FOR_COLOR() call. Iterator caches color 2 hash 109 * translations requiring initializer call if color or ceq_mask changes, 110 * even if pfn doesn't. MEM_NODE_ITERATOR_INIT() must also be called before 111 * PFN_2_COLOR() that uses a valid iterator argument. 112 * 113 * plat_mem_node_iterator_init() starts from last mblock in continuation 114 * case which may be invalid because memory DR. To detect this situation 115 * mi_genid is checked against mpo_genid which is incremented after a 116 * memory DR operation. See also plat_slice_add()/plat_slice_del(). 117 */ 118 #ifdef sun4v 119 120 typedef struct mem_node_iterator { 121 uint_t mi_mnode; /* mnode in which to iterate */ 122 int mi_init; /* set to 1 when first init */ 123 int mi_genid; /* set/checked against mpo_genid */ 124 int mi_last_mblock; /* last mblock visited */ 125 uint_t mi_hash_ceq_mask; /* cached copy of ceq_mask */ 126 uint_t mi_hash_color; /* cached copy of color */ 127 uint_t mi_mnode_mask; /* number of mask bits */ 128 uint_t mi_mnode_pfn_shift; /* mnode position in pfn */ 129 pfn_t mi_mblock_base; /* first valid pfn in current mblock */ 130 pfn_t mi_mblock_end; /* last valid pfn in current mblock */ 131 pfn_t mi_ra_to_pa; /* ra adjustment for current mblock */ 132 pfn_t mi_mnode_pfn_mask; /* mask to obtain mnode id bits */ 133 } mem_node_iterator_t; 134 135 #define MEM_NODE_ITERATOR_DECL(it) \ 136 mem_node_iterator_t it 137 #define MEM_NODE_ITERATOR_INIT(pfn, mnode, szc, it) \ 138 (pfn) = plat_mem_node_iterator_init((pfn), (mnode), (szc), (it), 1) 139 140 extern pfn_t plat_mem_node_iterator_init(pfn_t, int, uchar_t, 141 mem_node_iterator_t *, int); 142 extern pfn_t plat_rapfn_to_papfn(pfn_t); 143 extern int interleaved_mnodes; 144 145 #else /* sun4v */ 146 147 #define MEM_NODE_ITERATOR_DECL(it) \ 148 void *it = NULL 149 #define MEM_NODE_ITERATOR_INIT(pfn, mnode, szc, it) 150 151 #endif /* sun4v */ 152 153 /* 154 * Return the mnode limits so that hpc_counters length and base 155 * index can be determined. When interleaved_mnodes is set, we 156 * create an array only for the first mnode that exists. All other 157 * mnodes will share the array in this case. 158 * If interleaved_mnodes is not set, simply return the limits for 159 * the given mnode. 160 */ 161 #define HPM_COUNTERS_LIMITS(mnode, physbase, physmax, first) \ 162 if (!interleaved_mnodes) { \ 163 (physbase) = mem_node_config[(mnode)].physbase; \ 164 (physmax) = mem_node_config[(mnode)].physmax; \ 165 (first) = (mnode); \ 166 } else if ((first) < 0) { \ 167 mem_node_max_range(&(physbase), &(physmax)); \ 168 (first) = (mnode); \ 169 } 170 171 #define PAGE_CTRS_WRITE_LOCK(mnode) \ 172 if (!interleaved_mnodes) { \ 173 rw_enter(&page_ctrs_rwlock[(mnode)], RW_WRITER); \ 174 page_freelist_lock(mnode); \ 175 } else { \ 176 /* changing shared hpm_counters */ \ 177 int _i; \ 178 for (_i = 0; _i < max_mem_nodes; _i++) { \ 179 rw_enter(&page_ctrs_rwlock[_i], RW_WRITER); \ 180 page_freelist_lock(_i); \ 181 } \ 182 } 183 184 #define PAGE_CTRS_WRITE_UNLOCK(mnode) \ 185 if (!interleaved_mnodes) { \ 186 page_freelist_unlock(mnode); \ 187 rw_exit(&page_ctrs_rwlock[(mnode)]); \ 188 } else { \ 189 int _i; \ 190 for (_i = 0; _i < max_mem_nodes; _i++) { \ 191 page_freelist_unlock(_i); \ 192 rw_exit(&page_ctrs_rwlock[_i]); \ 193 } \ 194 } 195 196 /* 197 * cpu specific color conversion functions 198 */ 199 extern uint_t page_get_nsz_color_mask_cpu(uchar_t, uint_t); 200 #pragma weak page_get_nsz_color_mask_cpu 201 202 extern uint_t page_get_nsz_color_cpu(uchar_t, uint_t); 203 #pragma weak page_get_nsz_color_cpu 204 205 extern uint_t page_get_color_shift_cpu(uchar_t, uchar_t); 206 #pragma weak page_get_color_shift_cpu 207 208 extern uint_t page_convert_color_cpu(uint_t, uchar_t, uchar_t); 209 #pragma weak page_convert_color_cpu 210 211 extern pfn_t page_next_pfn_for_color_cpu(pfn_t, 212 uchar_t, uint_t, uint_t, uint_t, void *); 213 #pragma weak page_next_pfn_for_color_cpu 214 215 extern uint_t page_pfn_2_color_cpu(pfn_t, uchar_t, void *); 216 #pragma weak page_pfn_2_color_cpu 217 218 #define PAGE_GET_COLOR_SHIFT(szc, nszc) \ 219 ((&page_get_color_shift_cpu != NULL) ? \ 220 page_get_color_shift_cpu(szc, nszc) : \ 221 (hw_page_array[(nszc)].hp_shift - \ 222 hw_page_array[(szc)].hp_shift)) 223 224 #define PAGE_CONVERT_COLOR(ncolor, szc, nszc) \ 225 ((&page_convert_color_cpu != NULL) ? \ 226 page_convert_color_cpu(ncolor, szc, nszc) : \ 227 ((ncolor) << PAGE_GET_COLOR_SHIFT((szc), (nszc)))) 228 229 #define PFN_2_COLOR(pfn, szc, it) \ 230 ((&page_pfn_2_color_cpu != NULL) ? \ 231 page_pfn_2_color_cpu(pfn, szc, it) : \ 232 ((pfn & (hw_page_array[0].hp_colors - 1)) >> \ 233 (hw_page_array[szc].hp_shift - \ 234 hw_page_array[0].hp_shift))) 235 236 #define PNUM_SIZE(szc) \ 237 (hw_page_array[(szc)].hp_pgcnt) 238 #define PNUM_SHIFT(szc) \ 239 (hw_page_array[(szc)].hp_shift - hw_page_array[0].hp_shift) 240 #define PAGE_GET_SHIFT(szc) \ 241 (hw_page_array[(szc)].hp_shift) 242 #define PAGE_GET_PAGECOLORS(szc) \ 243 (hw_page_array[(szc)].hp_colors) 244 245 /* 246 * This macro calculates the next sequential pfn with the specified 247 * color using color equivalency mask 248 */ 249 #define PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, color, ceq_mask, color_mask, it) \ 250 { \ 251 ASSERT(((color) & ~(ceq_mask)) == 0); \ 252 if (&page_next_pfn_for_color_cpu == NULL) { \ 253 uint_t pfn_shift = PAGE_BSZS_SHIFT(szc); \ 254 pfn_t spfn = pfn >> pfn_shift; \ 255 pfn_t stride = (ceq_mask) + 1; \ 256 ASSERT((((ceq_mask) + 1) & (ceq_mask)) == 0); \ 257 if (((spfn ^ (color)) & (ceq_mask)) == 0) { \ 258 pfn += stride << pfn_shift; \ 259 } else { \ 260 pfn = (spfn & ~(pfn_t)(ceq_mask)) | (color); \ 261 pfn = (pfn > spfn ? pfn : pfn + stride) << \ 262 pfn_shift; \ 263 } \ 264 } else { \ 265 pfn = page_next_pfn_for_color_cpu(pfn, szc, color, \ 266 ceq_mask, color_mask, it); \ 267 } \ 268 } 269 270 /* get the color equivalency mask for the next szc */ 271 #define PAGE_GET_NSZ_MASK(szc, mask) \ 272 ((&page_get_nsz_color_mask_cpu == NULL) ? \ 273 ((mask) >> (PAGE_GET_SHIFT((szc) + 1) - PAGE_GET_SHIFT(szc))) : \ 274 page_get_nsz_color_mask_cpu(szc, mask)) 275 276 /* get the color of the next szc */ 277 #define PAGE_GET_NSZ_COLOR(szc, color) \ 278 ((&page_get_nsz_color_cpu == NULL) ? \ 279 ((color) >> (PAGE_GET_SHIFT((szc) + 1) - PAGE_GET_SHIFT(szc))) : \ 280 page_get_nsz_color_cpu(szc, color)) 281 282 /* Find the bin for the given page if it was of size szc */ 283 #define PP_2_BIN_SZC(pp, szc) (PFN_2_COLOR(pp->p_pagenum, szc, (void *)(-1))) 284 285 #define PP_2_BIN(pp) (PP_2_BIN_SZC(pp, pp->p_szc)) 286 287 #define PP_2_MEM_NODE(pp) (PFN_2_MEM_NODE(pp->p_pagenum)) 288 289 #define PC_BIN_MUTEX(iskflt, mnode, bin, flags) ((flags & PG_FREE_LIST) ? \ 290 &fpc_mutex[(bin) & (NPC_MUTEX - 1)][mnode] : \ 291 &cpc_mutex[(bin) & (NPC_MUTEX - 1)][mnode]) 292 293 #define FPC_MUTEX(mnode, i) (&fpc_mutex[i][mnode]) 294 #define CPC_MUTEX(mnode, i) (&cpc_mutex[i][mnode]) 295 296 #define PFN_BASE(pfnum, szc) (pfnum & ~((1 << PAGE_BSZS_SHIFT(szc)) - 1)) 297 298 /* 299 * this structure is used for walking free page lists 300 * controls when to split large pages into smaller pages, 301 * and when to coalesce smaller pages into larger pages 302 */ 303 typedef struct page_list_walker { 304 uint_t plw_colors; /* num of colors for szc */ 305 uint_t plw_color_mask; /* colors-1 */ 306 uint_t plw_bin_step; /* next bin: 1 or 2 */ 307 uint_t plw_count; /* loop count */ 308 uint_t plw_bin0; /* starting bin */ 309 uint_t plw_bin_marker; /* bin after initial jump */ 310 uint_t plw_bin_split_prev; /* last bin we tried to split */ 311 uint_t plw_do_split; /* set if OK to split */ 312 uint_t plw_split_next; /* next bin to split */ 313 uint_t plw_ceq_dif; /* number of different color groups */ 314 /* to check */ 315 uint_t plw_ceq_mask[MMU_PAGE_SIZES + 1]; /* color equiv mask */ 316 uint_t plw_bins[MMU_PAGE_SIZES + 1]; /* num of bins */ 317 } page_list_walker_t; 318 319 void page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin, 320 int can_split, int use_ceq, page_list_walker_t *plw); 321 322 /* 323 * Page freelists have a single freelist type, the user page freelist. The 324 * kernel page freelist is disabled on SPARC platforms. The definitions related 325 * to the freelist type structure are grouped below. 326 */ 327 328 #define MAX_PFLT_POLICIES 3 329 #define MAX_PFLT_TYPE 2 330 enum freelist_types {PFLT_USER, PFLT_KMEM}; 331 332 /* 333 * The kernel only needs a small number of page colors, far fewer than user 334 * programs. 335 */ 336 #define KFLT_PAGE_COLORS 16 337 /* flag used by the kflt_export function when calling page_promote */ 338 #define PC_KFLT_EXPORT 0x4 339 #define PC_ISKFLT(fltp) (fltp->pflt_type == PFLT_KMEM) 340 341 typedef struct page_freelist_type page_freelist_type_t; 342 extern page_freelist_type_t flt_user; 343 extern page_freelist_type_t *ufltp; 344 345 typedef page_t *(*pflt_get_func_p) (struct vnode *, u_offset_t, struct seg *, 346 caddr_t, size_t, uint_t, lgrp_t *); 347 typedef page_t *(*pflt_policy_func_p)(page_freelist_type_t *, int, uint_t, int, 348 uchar_t, uint_t); 349 typedef void (*pflt_list_walk_init_func_p)(uchar_t, uint_t, uint_t, int, int, 350 page_list_walker_t *); 351 typedef uint_t (*pflt_list_walk_next_func_p)(uchar_t, uint_t, 352 page_list_walker_t *); 353 354 page_t *page_get_uflt(struct vnode *, u_offset_t, struct seg *, caddr_t, 355 size_t, uint_t, struct lgrp *); 356 extern page_t *page_get_mnode_freelist(page_freelist_type_t *, int, uint_t, 357 int, uchar_t, uint_t); 358 extern page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int); 359 extern page_t *page_get_contig_pages(page_freelist_type_t *, int, uint_t, int, 360 uchar_t, uint_t); 361 extern void page_list_walk_init(uchar_t, uint_t, uint_t, int, int, 362 page_list_walker_t *); 363 extern uint_t page_list_walk_next_bin(uchar_t, uint_t, page_list_walker_t *); 364 365 /* 366 * Page freelists are organized as freelist types, on Sparc systems there 367 * is only a single user freelist type as the kernel cage provides a 368 * similar function to kernel freelist in that it prevents memory 369 * fragmentation. 370 * 371 * The page freelists have fixed page size and memory type dimensions. 372 * the 3rd (max_mem_nodes) and 4th (page coloring bins) dimensions are 373 * allocated dynamically. 374 */ 375 struct page_freelist_type { 376 int pflt_type; 377 pflt_get_func_p pflt_get_free; 378 pflt_list_walk_init_func_p pflt_walk_init; 379 pflt_list_walk_next_func_p pflt_walk_next; 380 int pflt_num_policies; 381 pflt_policy_func_p pflt_policy[MAX_PFLT_POLICIES]; 382 page_t ***pflt_freelists[MMU_PAGE_SIZES][MAX_MEM_TYPES]; 383 }; 384 385 #define PAGE_FREELISTP(is_kflt, mnode, szc, color, mtype) \ 386 ((ufltp->pflt_freelists[szc][mtype][mnode] + (color))) 387 388 #define PAGE_FREELISTS(is_kflt, mnode, szc, color, mtype) \ 389 (*(ufltp->pflt_freelists[szc][mtype][mnode] + (color))) 390 391 #define PAGE_GET_FREELISTS(pp, vp, off, seg, vaddr, size, flags, lgrp) \ 392 pp = ufltp->pflt_get_free(vp, off, seg, vaddr, size, \ 393 flags, lgrp); 394 395 #define PAGE_GET_FREELISTS_POLICY(fp, i) \ 396 (fp->pflt_policy[i]) 397 398 #define PAGE_LIST_WALK_INIT(fp, szc, flags, bin, can_split, use_ceq, plw) \ 399 fp->pflt_walk_init(szc, flags, bin, can_split, use_ceq, plw) 400 401 #define PAGE_LIST_WALK_NEXT(fp, szc, bin, plw) \ 402 fp->pflt_walk_next(szc, bin, plw) 403 404 typedef char hpmctr_t; 405 406 #ifdef DEBUG 407 #define CHK_LPG(pp, szc) chk_lpg(pp, szc) 408 extern void chk_lpg(page_t *, uchar_t); 409 #else 410 #define CHK_LPG(pp, szc) 411 #endif 412 413 /* 414 * page list count per mnode and type. 415 */ 416 typedef struct { 417 pgcnt_t plc_mt_pgmax; /* max page cnt */ 418 pgcnt_t plc_mt_clpgcnt; /* cache list cnt */ 419 pgcnt_t plc_mt_flpgcnt; /* free list cnt - small pages */ 420 pgcnt_t plc_mt_lgpgcnt; /* free list cnt - large pages */ 421 #ifdef DEBUG 422 struct { 423 pgcnt_t plc_mts_pgcnt; /* per page size count */ 424 int plc_mts_colors; 425 pgcnt_t *plc_mtsc_pgcnt; /* per color bin count */ 426 } plc_mts[MMU_PAGE_SIZES]; 427 #endif 428 } plcnt_t[MAX_MEM_NODES][MAX_MEM_TYPES]; 429 430 #ifdef DEBUG 431 432 #define PLCNT_SZ(ctrs_sz) { \ 433 int szc; \ 434 for (szc = 0; szc < mmu_page_sizes; szc++) { \ 435 int colors = page_get_pagecolors(szc); \ 436 ctrs_sz += (max_mem_nodes * MAX_MEM_TYPES * \ 437 colors * sizeof (pgcnt_t)); \ 438 } \ 439 } 440 441 #define PLCNT_INIT(base) { \ 442 int mn, mt, szc, colors; \ 443 for (szc = 0; szc < mmu_page_sizes; szc++) { \ 444 colors = page_get_pagecolors(szc); \ 445 for (mn = 0; mn < max_mem_nodes; mn++) { \ 446 for (mt = 0; mt < MAX_MEM_TYPES; mt++) { \ 447 plcnt[mn][mt].plc_mts[szc]. \ 448 plc_mts_colors = colors; \ 449 plcnt[mn][mt].plc_mts[szc]. \ 450 plc_mtsc_pgcnt = (pgcnt_t *)base; \ 451 base += (colors * sizeof (pgcnt_t)); \ 452 } \ 453 } \ 454 } \ 455 } 456 457 #define PLCNT_DO(pp, mn, mtype, szc, cnt, flags) { \ 458 int bin = PP_2_BIN(pp); \ 459 if (flags & PG_CACHE_LIST) \ 460 atomic_add_long(&plcnt[mn][mtype].plc_mt_clpgcnt, cnt); \ 461 else if (szc) \ 462 atomic_add_long(&plcnt[mn][mtype].plc_mt_lgpgcnt, cnt); \ 463 else \ 464 atomic_add_long(&plcnt[mn][mtype].plc_mt_flpgcnt, cnt); \ 465 atomic_add_long(&plcnt[mn][mtype].plc_mts[szc].plc_mts_pgcnt, \ 466 cnt); \ 467 atomic_add_long(&plcnt[mn][mtype].plc_mts[szc]. \ 468 plc_mtsc_pgcnt[bin], cnt); \ 469 } 470 471 #else 472 473 #define PLCNT_SZ(ctrs_sz) 474 475 #define PLCNT_INIT(base) 476 477 /* PG_FREE_LIST may not be explicitly set in flags for large pages */ 478 479 #define PLCNT_DO(pp, mn, mtype, szc, cnt, flags) { \ 480 if (flags & PG_CACHE_LIST) \ 481 atomic_add_long(&plcnt[mn][mtype].plc_mt_clpgcnt, cnt); \ 482 else if (szc) \ 483 atomic_add_long(&plcnt[mn][mtype].plc_mt_lgpgcnt, cnt); \ 484 else \ 485 atomic_add_long(&plcnt[mn][mtype].plc_mt_flpgcnt, cnt); \ 486 } 487 488 #endif 489 490 #define PLCNT_INCR(pp, mn, mtype, szc, flags) { \ 491 long cnt = (1 << PAGE_BSZS_SHIFT(szc)); \ 492 PLCNT_DO(pp, mn, mtype, szc, cnt, flags); \ 493 } 494 495 #define PLCNT_DECR(pp, mn, mtype, szc, flags) { \ 496 long cnt = ((-1) << PAGE_BSZS_SHIFT(szc)); \ 497 PLCNT_DO(pp, mn, mtype, szc, cnt, flags); \ 498 } 499 500 /* 501 * macros to update page list max counts - done when pages transferred 502 * from RELOC to NORELOC mtype (kcage_init or kcage_assimilate_page). 503 */ 504 505 #define PLCNT_XFER_NORELOC(pp) { \ 506 long cnt = (1 << PAGE_BSZS_SHIFT((pp)->p_szc)); \ 507 int mn = PP_2_MEM_NODE(pp); \ 508 atomic_add_long(&plcnt[mn][MTYPE_NORELOC].plc_mt_pgmax, cnt); \ 509 atomic_add_long(&plcnt[mn][MTYPE_RELOC].plc_mt_pgmax, -cnt); \ 510 } 511 512 /* 513 * macro to modify the page list max counts when memory is added to 514 * the page lists during startup (add_physmem) or during a DR operation 515 * when memory is added (kphysm_add_memory_dynamic) or deleted 516 * (kphysm_del_cleanup). 517 */ 518 #define PLCNT_MODIFY_MAX(pfn, cnt) { \ 519 spgcnt_t _cnt = (spgcnt_t)(cnt); \ 520 pgcnt_t _acnt = ABS(_cnt); \ 521 int _mn; \ 522 pgcnt_t _np; \ 523 if (&plat_mem_node_intersect_range != NULL) { \ 524 for (_mn = 0; _mn < max_mem_nodes; _mn++) { \ 525 plat_mem_node_intersect_range((pfn), _acnt, _mn, &_np);\ 526 if (_np == 0) \ 527 continue; \ 528 atomic_add_long(&plcnt[_mn][MTYPE_RELOC].plc_mt_pgmax, \ 529 (_cnt < 0) ? -_np : _np); \ 530 } \ 531 } else { \ 532 pfn_t _pfn = (pfn); \ 533 pfn_t _endpfn = _pfn + _acnt; \ 534 while (_pfn < _endpfn) { \ 535 _mn = PFN_2_MEM_NODE(_pfn); \ 536 _np = MIN(_endpfn, mem_node_config[_mn].physmax + 1) - \ 537 _pfn; \ 538 _pfn += _np; \ 539 atomic_add_long(&plcnt[_mn][MTYPE_RELOC].plc_mt_pgmax, \ 540 (_cnt < 0) ? -_np : _np); \ 541 } \ 542 } \ 543 } 544 545 /* 546 * macro to call page_ctrs_adjust() when memory is added 547 * during a DR operation. 548 */ 549 #define PAGE_CTRS_ADJUST(pfn, cnt, rv) { \ 550 spgcnt_t _cnt = (spgcnt_t)(cnt); \ 551 int _mn; \ 552 pgcnt_t _np; \ 553 if (&plat_mem_node_intersect_range != NULL) { \ 554 for (_mn = 0; _mn < max_mem_nodes; _mn++) { \ 555 plat_mem_node_intersect_range((pfn), _cnt, _mn, &_np); \ 556 if (_np == 0) \ 557 continue; \ 558 if ((rv = page_ctrs_adjust(_mn)) != 0) \ 559 break; \ 560 } \ 561 } else { \ 562 pfn_t _pfn = (pfn); \ 563 pfn_t _endpfn = _pfn + _cnt; \ 564 while (_pfn < _endpfn) { \ 565 _mn = PFN_2_MEM_NODE(_pfn); \ 566 _np = MIN(_endpfn, mem_node_config[_mn].physmax + 1) - \ 567 _pfn; \ 568 _pfn += _np; \ 569 if ((rv = page_ctrs_adjust(_mn)) != 0) \ 570 break; \ 571 } \ 572 } \ 573 } 574 575 extern plcnt_t plcnt; 576 577 #define MNODE_PGCNT(mn) \ 578 (plcnt[mn][MTYPE_RELOC].plc_mt_clpgcnt + \ 579 plcnt[mn][MTYPE_NORELOC].plc_mt_clpgcnt + \ 580 plcnt[mn][MTYPE_RELOC].plc_mt_flpgcnt + \ 581 plcnt[mn][MTYPE_NORELOC].plc_mt_flpgcnt + \ 582 plcnt[mn][MTYPE_RELOC].plc_mt_lgpgcnt + \ 583 plcnt[mn][MTYPE_NORELOC].plc_mt_lgpgcnt) 584 585 #define MNODETYPE_PGCNT(mn, mtype) \ 586 (plcnt[mn][mtype].plc_mt_clpgcnt + \ 587 plcnt[mn][mtype].plc_mt_flpgcnt + \ 588 plcnt[mn][mtype].plc_mt_lgpgcnt) 589 590 /* 591 * macros to loop through the mtype range - MTYPE_START returns -1 in 592 * mtype if no pages in mnode/mtype and possibly NEXT mtype. 593 */ 594 #define MTYPE_START(mnode, mtype, flags) { \ 595 if (plcnt[mnode][mtype].plc_mt_pgmax == 0) { \ 596 ASSERT(mtype == MTYPE_RELOC || \ 597 MNODETYPE_PGCNT(mnode, mtype) == 0 || \ 598 plcnt[mnode][mtype].plc_mt_pgmax != 0); \ 599 MTYPE_NEXT(mnode, mtype, flags); \ 600 } \ 601 } 602 603 /* 604 * if allocation from the RELOC pool failed and there is sufficient cage 605 * memory, attempt to allocate from the NORELOC pool. 606 */ 607 #define MTYPE_NEXT(mnode, mtype, flags) { \ 608 if (!(flags & (PG_NORELOC | PGI_NOCAGE | PGI_RELOCONLY)) && \ 609 (kcage_freemem >= kcage_lotsfree)) { \ 610 if (plcnt[mnode][MTYPE_NORELOC].plc_mt_pgmax == 0) { \ 611 ASSERT(MNODETYPE_PGCNT(mnode, MTYPE_NORELOC) == 0 || \ 612 plcnt[mnode][MTYPE_NORELOC].plc_mt_pgmax != 0); \ 613 mtype = -1; \ 614 } else { \ 615 mtype = MTYPE_NORELOC; \ 616 flags |= PG_NORELOC; \ 617 } \ 618 } else { \ 619 mtype = -1; \ 620 } \ 621 } 622 623 /* 624 * get the ecache setsize for the current cpu. 625 */ 626 #define CPUSETSIZE() (cpunodes[CPU->cpu_id].ecache_setsize) 627 628 extern struct cpu cpu0; 629 #define CPU0 &cpu0 630 631 #define PAGE_BSZS_SHIFT(szc) TTE_BSZS_SHIFT(szc) 632 /* 633 * For sfmmu each larger page is 8 times the size of the previous 634 * size page. 635 */ 636 #define FULL_REGION_CNT(rg_szc) (8) 637 638 /* 639 * The counter base must be per page_counter element to prevent 640 * races when re-indexing, and the base page size element should 641 * be aligned on a boundary of the given region size. 642 * 643 * We also round up the number of pages spanned by the counters 644 * for a given region to PC_BASE_ALIGN in certain situations to simplify 645 * the coding for some non-performance critical routines. 646 */ 647 #define PC_BASE_ALIGN ((pfn_t)1 << PAGE_BSZS_SHIFT(mmu_page_sizes-1)) 648 #define PC_BASE_ALIGN_MASK (PC_BASE_ALIGN - 1) 649 650 extern int ecache_alignsize; 651 #define L2CACHE_ALIGN ecache_alignsize 652 #define L2CACHE_ALIGN_MAX 512 653 654 extern int update_proc_pgcolorbase_after_fork; 655 extern int consistent_coloring; 656 extern uint_t vac_colors_mask; 657 extern int vac_size; 658 extern int vac_shift; 659 660 /* 661 * Kernel mem segment in 64-bit space 662 */ 663 extern caddr_t kmem64_base, kmem64_end, kmem64_aligned_end; 664 extern int kmem64_alignsize, kmem64_szc; 665 extern uint64_t kmem64_pabase; 666 extern int max_bootlp_tteszc; 667 668 /* 669 * Maximum and default values for user heap, stack, private and shared 670 * anonymous memory, and user text and initialized data. 671 * 672 * Initial values are defined in architecture specific mach_vm_dep.c file. 673 * Used by map_pgsz*() routines. 674 */ 675 extern size_t max_uheap_lpsize; 676 extern size_t default_uheap_lpsize; 677 extern size_t max_ustack_lpsize; 678 extern size_t default_ustack_lpsize; 679 extern size_t max_privmap_lpsize; 680 extern size_t max_uidata_lpsize; 681 extern size_t max_utext_lpsize; 682 extern size_t max_shm_lpsize; 683 684 /* 685 * For adjusting the default lpsize, for DTLB-limited page sizes. 686 */ 687 extern void adjust_data_maxlpsize(size_t ismpagesize); 688 689 /* 690 * Sanity control. Don't use large pages regardless of user 691 * settings if there's less than priv or shm_lpg_min_physmem memory installed. 692 * The units for this variable are 8K pages. 693 */ 694 extern pgcnt_t privm_lpg_min_physmem; 695 extern pgcnt_t shm_lpg_min_physmem; 696 697 /* 698 * AS_2_BIN macro controls the page coloring policy. 699 * 0 (default) uses various vaddr bits 700 * 1 virtual=paddr 701 * 2 bin hopping 702 */ 703 #define AS_2_BIN(kflt, as, seg, vp, addr, bin, szc) \ 704 switch (consistent_coloring) { \ 705 default: \ 706 cmn_err(CE_WARN, \ 707 "AS_2_BIN: bad consistent coloring value"); \ 708 /* assume default algorithm -> continue */ \ 709 case 0: { \ 710 uint32_t ndx, new; \ 711 int slew = 0; \ 712 pfn_t pfn; \ 713 \ 714 if (vp != NULL && IS_SWAPVP(vp) && \ 715 seg->s_ops == &segvn_ops) \ 716 slew = as_color_bin(as); \ 717 \ 718 pfn = ((uintptr_t)addr >> MMU_PAGESHIFT) + \ 719 (((uintptr_t)addr >> page_coloring_shift) << \ 720 (vac_shift - MMU_PAGESHIFT)); \ 721 if ((szc) == 0 || &page_pfn_2_color_cpu == NULL) { \ 722 pfn += slew; \ 723 bin = PFN_2_COLOR(pfn, szc, NULL); \ 724 } else { \ 725 bin = PFN_2_COLOR(pfn, szc, NULL); \ 726 bin += slew >> (vac_shift - MMU_PAGESHIFT); \ 727 bin &= hw_page_array[(szc)].hp_colors - 1; \ 728 } \ 729 break; \ 730 } \ 731 case 1: \ 732 bin = PFN_2_COLOR(((uintptr_t)addr >> MMU_PAGESHIFT), \ 733 szc, NULL); \ 734 break; \ 735 case 2: { \ 736 int cnt = as_color_bin(as); \ 737 uint_t color_mask = page_get_pagecolors(0) - 1; \ 738 \ 739 /* make sure physical color aligns with vac color */ \ 740 while ((cnt & vac_colors_mask) != \ 741 addr_to_vcolor(addr)) { \ 742 cnt++; \ 743 } \ 744 bin = cnt = cnt & color_mask; \ 745 bin >>= PAGE_GET_COLOR_SHIFT(0, szc); \ 746 /* update per as page coloring fields */ \ 747 cnt = (cnt + 1) & color_mask; \ 748 if (cnt == (as_color_start(as) & color_mask)) { \ 749 cnt = as_color_start(as) = as_color_start(as) + \ 750 PGCLR_LOOPFACTOR; \ 751 } \ 752 as_color_bin(as) = cnt & color_mask; \ 753 break; \ 754 } \ 755 } \ 756 ASSERT(bin < page_get_pagecolors(szc)); 757 758 /* 759 * cpu private vm data - accessed thru CPU->cpu_vm_data 760 * vc_pnum_memseg: tracks last memseg visited in page_numtopp_nolock() 761 * vc_pnext_memseg: tracks last memseg visited in page_nextn() 762 * vc_kmptr: unaligned kmem pointer for this vm_cpu_data_t 763 * vc_kmsize: orignal kmem size for this vm_cpu_data_t 764 */ 765 766 typedef struct { 767 struct memseg *vc_pnum_memseg; 768 struct memseg *vc_pnext_memseg; 769 void *vc_kmptr; 770 size_t vc_kmsize; 771 } vm_cpu_data_t; 772 773 /* allocation size to ensure vm_cpu_data_t resides in its own cache line */ 774 #define VM_CPU_DATA_PADSIZE \ 775 (P2ROUNDUP(sizeof (vm_cpu_data_t), L2CACHE_ALIGN_MAX)) 776 777 /* for boot cpu before kmem is initialized */ 778 extern char vm_cpu_data0[]; 779 780 /* 781 * Function to get an ecache color bin: F(as, cnt, vcolor). 782 * the goal of this function is to: 783 * - to spread a processes' physical pages across the entire ecache to 784 * maximize its use. 785 * - to minimize vac flushes caused when we reuse a physical page on a 786 * different vac color than it was previously used. 787 * - to prevent all processes to use the same exact colors and trash each 788 * other. 789 * 790 * cnt is a bin ptr kept on a per as basis. As we page_create we increment 791 * the ptr so we spread out the physical pages to cover the entire ecache. 792 * The virtual color is made a subset of the physical color in order to 793 * in minimize virtual cache flushing. 794 * We add in the as to spread out different as. This happens when we 795 * initialize the start count value. 796 * sizeof(struct as) is 60 so we shift by 3 to get into the bit range 797 * that will tend to change. For example, on spitfire based machines 798 * (vcshft == 1) contigous as are spread bu ~6 bins. 799 * vcshft provides for proper virtual color alignment. 800 * In theory cnt should be updated using cas only but if we are off by one 801 * or 2 it is no big deal. 802 * We also keep a start value which is used to randomize on what bin we 803 * start counting when it is time to start another loop. This avoids 804 * contigous allocations of ecache size to point to the same bin. 805 * Why 3? Seems work ok. Better than 7 or anything larger. 806 */ 807 #define PGCLR_LOOPFACTOR 3 808 809 /* 810 * When a bin is empty, and we can't satisfy a color request correctly, 811 * we scan. If we assume that the programs have reasonable spatial 812 * behavior, then it will not be a good idea to use the adjacent color. 813 * Using the adjacent color would result in virtually adjacent addresses 814 * mapping into the same spot in the cache. So, if we stumble across 815 * an empty bin, skip a bunch before looking. After the first skip, 816 * then just look one bin at a time so we don't miss our cache on 817 * every look. Be sure to check every bin. Page_create() will panic 818 * if we miss a page. 819 * 820 * This also explains the `<=' in the for loops in both page_get_freelist() 821 * and page_get_cachelist(). Since we checked the target bin, skipped 822 * a bunch, then continued one a time, we wind up checking the target bin 823 * twice to make sure we get all of them bins. 824 */ 825 #define BIN_STEP 20 826 827 #ifdef VM_STATS 828 struct vmm_vmstats_str { 829 /* page_get_uflt and page_get_kflt */ 830 ulong_t pgf_alloc[MMU_PAGE_SIZES][MAX_PFLT_TYPE]; 831 ulong_t pgf_allocok[MMU_PAGE_SIZES][MAX_PFLT_TYPE]; 832 ulong_t pgf_allocokrem[MMU_PAGE_SIZES][MAX_PFLT_TYPE]; 833 ulong_t pgf_allocfailed[MMU_PAGE_SIZES][MAX_PFLT_TYPE]; 834 ulong_t pgf_allocdeferred; 835 ulong_t pgf_allocretry[MMU_PAGE_SIZES][MAX_PFLT_TYPE]; 836 ulong_t pgik_allocok; /* page_import_kflt */ 837 ulong_t pgik_allocfailed; 838 ulong_t pgkx_allocok; /* kflt_expand */ 839 ulong_t pgkx_allocfailed; 840 ulong_t puak_allocok; /* page_user_alloc_kflt */ 841 ulong_t puak_allocfailed; 842 ulong_t pgexportok; /* kflt_export */ 843 ulong_t pgexportfail; 844 ulong_t pgkflt_disable; /* kflt_user_evict */ 845 ulong_t pgc_alloc; /* page_get_cachelist */ 846 ulong_t pgc_allocok; 847 ulong_t pgc_allocokrem; 848 ulong_t pgc_allocokdeferred; 849 ulong_t pgc_allocfailed; 850 ulong_t pgcp_alloc[MMU_PAGE_SIZES]; /* page_get_contig_pages */ 851 ulong_t pgcp_allocfailed[MMU_PAGE_SIZES]; 852 ulong_t pgcp_allocempty[MMU_PAGE_SIZES]; 853 ulong_t pgcp_allocok[MMU_PAGE_SIZES]; 854 ulong_t ptcp[MMU_PAGE_SIZES]; /* page_trylock_contig_pages */ 855 ulong_t ptcpfreethresh[MMU_PAGE_SIZES]; 856 ulong_t ptcpfailexcl[MMU_PAGE_SIZES]; 857 ulong_t ptcpfailszc[MMU_PAGE_SIZES]; 858 ulong_t ptcpfailcage[MMU_PAGE_SIZES]; 859 ulong_t ptcpfailkflt[MMU_PAGE_SIZES]; 860 ulong_t ptcpok[MMU_PAGE_SIZES]; 861 ulong_t pgmf_alloc[MMU_PAGE_SIZES]; /* page_get_mnode_freelist */ 862 ulong_t pgmf_allocfailed[MMU_PAGE_SIZES]; 863 ulong_t pgmf_allocempty[MMU_PAGE_SIZES]; 864 ulong_t pgmf_allocok[MMU_PAGE_SIZES]; 865 ulong_t pgmc_alloc; /* page_get_mnode_cachelist */ 866 ulong_t pgmc_allocfailed; 867 ulong_t pgmc_allocempty; 868 ulong_t pgmc_allocok; 869 ulong_t pladd_free[MMU_PAGE_SIZES]; /* page_list_add/sub */ 870 ulong_t plsub_free[MMU_PAGE_SIZES]; 871 ulong_t pladd_cache; 872 ulong_t plsub_cache; 873 ulong_t plsubpages_szcbig; 874 ulong_t plsubpages_szc0; 875 ulong_t pfs_req[MMU_PAGE_SIZES]; /* page_freelist_split */ 876 ulong_t pfs_demote[MMU_PAGE_SIZES]; 877 ulong_t pfc_coalok[MMU_PAGE_SIZES][MAX_MNODE_MRANGES]; 878 ulong_t ppr_reloc[MMU_PAGE_SIZES]; /* page_relocate */ 879 ulong_t ppr_relocnoroot[MMU_PAGE_SIZES]; 880 ulong_t ppr_reloc_replnoroot[MMU_PAGE_SIZES]; 881 ulong_t ppr_relocnolock[MMU_PAGE_SIZES]; 882 ulong_t ppr_relocnomem[MMU_PAGE_SIZES]; 883 ulong_t ppr_relocok[MMU_PAGE_SIZES]; 884 ulong_t ppr_krelocfail[MMU_PAGE_SIZES]; 885 ulong_t ppr_copyfail; 886 /* page coalesce counter */ 887 ulong_t page_ctrs_coalesce[MMU_PAGE_SIZES][MAX_MNODE_MRANGES]; 888 /* candidates useful */ 889 ulong_t page_ctrs_cands_skip[MMU_PAGE_SIZES][MAX_MNODE_MRANGES]; 890 /* ctrs changed after locking */ 891 ulong_t page_ctrs_changed[MMU_PAGE_SIZES][MAX_MNODE_MRANGES]; 892 /* page_freelist_coalesce failed */ 893 ulong_t page_ctrs_failed[MMU_PAGE_SIZES][MAX_MNODE_MRANGES]; 894 ulong_t page_ctrs_coalesce_all; /* page coalesce all counter */ 895 ulong_t page_ctrs_cands_skip_all; /* candidates useful for all func */ 896 }; 897 extern struct vmm_vmstats_str vmm_vmstats; 898 #endif /* VM_STATS */ 899 900 /* 901 * Used to hold off page relocations into the cage until OBP has completed 902 * its boot-time handoff of its resources to the kernel. 903 */ 904 extern int page_relocate_ready; 905 906 /* 907 * cpu/mmu-dependent vm variables may be reset at bootup. 908 */ 909 extern uint_t mmu_page_sizes; 910 extern uint_t max_mmu_page_sizes; 911 extern uint_t mmu_hashcnt; 912 extern uint_t max_mmu_hashcnt; 913 extern size_t mmu_ism_pagesize; 914 extern int mmu_exported_pagesize_mask; 915 extern uint_t mmu_exported_page_sizes; 916 extern uint_t szc_2_userszc[]; 917 extern uint_t userszc_2_szc[]; 918 919 #define mmu_legacy_page_sizes mmu_exported_page_sizes 920 #define USERSZC_2_SZC(userszc) (userszc_2_szc[userszc]) 921 #define SZC_2_USERSZC(szc) (szc_2_userszc[szc]) 922 923 /* 924 * Platform specific page routines 925 */ 926 extern void mach_page_add(page_t **, page_t *); 927 extern void mach_page_sub(page_t **, page_t *); 928 extern uint_t page_get_pagecolors(uint_t); 929 extern void ppcopy_kernel__relocatable(page_t *, page_t *); 930 #define ppcopy_kernel(p1, p2) ppcopy_kernel__relocatable(p1, p2) 931 932 /* 933 * platform specific large pages for kernel heap support 934 */ 935 extern size_t get_segkmem_lpsize(size_t lpsize); 936 extern size_t mmu_get_kernel_lpsize(size_t lpsize); 937 extern void mmu_init_kernel_pgsz(struct hat *hat); 938 extern void mmu_init_kcontext(); 939 extern uint64_t kcontextreg; 940 941 /* 942 * Nucleus data page allocator routines 943 */ 944 extern void ndata_alloc_init(struct memlist *, uintptr_t, uintptr_t); 945 extern void *ndata_alloc(struct memlist *, size_t, size_t); 946 extern void *ndata_extra_base(struct memlist *, size_t, caddr_t); 947 extern size_t ndata_maxsize(struct memlist *); 948 extern size_t ndata_spare(struct memlist *, size_t, size_t); 949 950 #ifdef __cplusplus 951 } 952 #endif 953 954 #endif /* _VM_DEP_H */ 955