1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 /* 29 * VM - Hardware Address Translation management for Spitfire MMU. 30 * 31 * This file implements the machine specific hardware translation 32 * needed by the VM system. The machine independent interface is 33 * described in <vm/hat.h> while the machine dependent interface 34 * and data structures are described in <vm/hat_sfmmu.h>. 35 * 36 * The hat layer manages the address translation hardware as a cache 37 * driven by calls from the higher levels in the VM system. 38 */ 39 40 #include <sys/types.h> 41 #include <sys/kstat.h> 42 #include <vm/hat.h> 43 #include <vm/hat_sfmmu.h> 44 #include <vm/page.h> 45 #include <sys/pte.h> 46 #include <sys/systm.h> 47 #include <sys/mman.h> 48 #include <sys/sysmacros.h> 49 #include <sys/machparam.h> 50 #include <sys/vtrace.h> 51 #include <sys/kmem.h> 52 #include <sys/mmu.h> 53 #include <sys/cmn_err.h> 54 #include <sys/cpu.h> 55 #include <sys/cpuvar.h> 56 #include <sys/debug.h> 57 #include <sys/lgrp.h> 58 #include <sys/archsystm.h> 59 #include <sys/machsystm.h> 60 #include <sys/vmsystm.h> 61 #include <vm/as.h> 62 #include <vm/seg.h> 63 #include <vm/seg_kp.h> 64 #include <vm/seg_kmem.h> 65 #include <vm/seg_kpm.h> 66 #include <vm/rm.h> 67 #include <sys/t_lock.h> 68 #include <sys/obpdefs.h> 69 #include <sys/vm_machparam.h> 70 #include <sys/var.h> 71 #include <sys/trap.h> 72 #include <sys/machtrap.h> 73 #include <sys/scb.h> 74 #include <sys/bitmap.h> 75 #include <sys/machlock.h> 76 #include <sys/membar.h> 77 #include <sys/atomic.h> 78 #include <sys/cpu_module.h> 79 #include <sys/prom_debug.h> 80 #include <sys/ksynch.h> 81 #include <sys/mem_config.h> 82 #include <sys/mem_cage.h> 83 #include <sys/dtrace.h> 84 #include <vm/vm_dep.h> 85 #include <vm/xhat_sfmmu.h> 86 #include <sys/fpu/fpusystm.h> 87 88 #if defined(SF_ERRATA_57) 89 extern caddr_t errata57_limit; 90 #endif 91 92 #define HME8BLK_SZ_RND ((roundup(HME8BLK_SZ, sizeof (int64_t))) / \ 93 (sizeof (int64_t))) 94 #define HBLK_RESERVE ((struct hme_blk *)hblk_reserve) 95 96 #define HBLK_RESERVE_CNT 128 97 #define HBLK_RESERVE_MIN 20 98 99 static struct hme_blk *freehblkp; 100 static kmutex_t freehblkp_lock; 101 static int freehblkcnt; 102 103 static int64_t hblk_reserve[HME8BLK_SZ_RND]; 104 static kmutex_t hblk_reserve_lock; 105 static kthread_t *hblk_reserve_thread; 106 107 static nucleus_hblk8_info_t nucleus_hblk8; 108 static nucleus_hblk1_info_t nucleus_hblk1; 109 110 /* 111 * SFMMU specific hat functions 112 */ 113 void hat_pagecachectl(struct page *, int); 114 115 /* flags for hat_pagecachectl */ 116 #define 
HAT_CACHE 0x1 117 #define HAT_UNCACHE 0x2 118 #define HAT_TMPNC 0x4 119 120 /* 121 * Flag to allow the creation of non-cacheable translations 122 * to system memory. It is off by default. At the moment this 123 * flag is used by the ecache error injector. The error injector 124 * will turn it on when creating such a translation then shut it 125 * off when it's finished. 126 */ 127 128 int sfmmu_allow_nc_trans = 0; 129 130 /* 131 * Flag to disable large page support. 132 * value of 1 => disable all large pages. 133 * bits 1, 2, and 3 are to disable 64K, 512K and 4M pages respectively. 134 * 135 * For example, use the value 0x4 to disable 512K pages. 136 * 137 */ 138 #define LARGE_PAGES_OFF 0x1 139 140 /* 141 * WARNING: 512K pages MUST be disabled for ISM/DISM. If not 142 * a process would page fault indefinitely if it tried to 143 * access a 512K page. 144 */ 145 int disable_ism_large_pages = (1 << TTE512K); 146 int disable_large_pages = 0; 147 int disable_auto_large_pages = 0; 148 149 /* 150 * Private sfmmu data structures for hat management 151 */ 152 static struct kmem_cache *sfmmuid_cache; 153 static struct kmem_cache *mmuctxdom_cache; 154 155 /* 156 * Private sfmmu data structures for tsb management 157 */ 158 static struct kmem_cache *sfmmu_tsbinfo_cache; 159 static struct kmem_cache *sfmmu_tsb8k_cache; 160 static struct kmem_cache *sfmmu_tsb_cache[NLGRPS_MAX]; 161 static vmem_t *kmem_tsb_arena; 162 163 /* 164 * sfmmu static variables for hmeblk resource management. 165 */ 166 static vmem_t *hat_memload1_arena; /* HAT translation arena for sfmmu1_cache */ 167 static struct kmem_cache *sfmmu8_cache; 168 static struct kmem_cache *sfmmu1_cache; 169 static struct kmem_cache *pa_hment_cache; 170 171 static kmutex_t ism_mlist_lock; /* mutex for ism mapping list */ 172 /* 173 * private data for ism 174 */ 175 static struct kmem_cache *ism_blk_cache; 176 static struct kmem_cache *ism_ment_cache; 177 #define ISMID_STARTADDR NULL 178 179 /* 180 * Whether to delay TLB flushes and use Cheetah's flush-all support 181 * when removing contexts from the dirty list. 182 */ 183 int delay_tlb_flush; 184 int disable_delay_tlb_flush; 185 186 /* 187 * ``hat_lock'' is a hashed mutex lock for protecting sfmmu TSB lists, 188 * HAT flags, synchronizing TLB/TSB coherency, and context management. 189 * The lock is hashed on the sfmmup since the case where we need to lock 190 * all processes is rare but does occur (e.g. we need to unload a shared 191 * mapping from all processes using the mapping). We have a lot of buckets, 192 * and each slab of sfmmu_t's can use about a quarter of them, giving us 193 * a fairly good distribution without wasting too much space and overhead 194 * when we have to grab them all. 195 */ 196 #define SFMMU_NUM_LOCK 128 /* must be power of two */ 197 hatlock_t hat_lock[SFMMU_NUM_LOCK]; 198 199 /* 200 * Hash algorithm optimized for a small number of slabs. 201 * 7 is (highbit((sizeof sfmmu_t)) - 1) 202 * This hash algorithm is based upon the knowledge that sfmmu_t's come from a 203 * kmem_cache, and thus they will be sequential within that cache. In 204 * addition, each new slab will have a different "color" up to cache_maxcolor 205 * which will skew the hashing for each successive slab which is allocated. 206 * If the size of sfmmu_t changed to a larger size, this algorithm may need 207 * to be revisited. 
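 *
 * Purely as an illustration of the hashing (addresses are made up), with
 * TSB_HASH_SHIFT_BITS == 7 and SFMMU_NUM_LOCK == 128:
 *
 *	sfmmup == (sfmmu_t *)0x1000  ->  PTR_HASH() == 0x20  ->  hat_lock[32]
 *	sfmmup == (sfmmu_t *)0x1080  ->  PTR_HASH() == 0x21  ->  hat_lock[33]
 *
 * i.e. two sfmmu_t's 128 bytes apart land in adjacent hat_lock buckets,
 * which is what produces the even spread described above.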
208 */ 209 #define TSB_HASH_SHIFT_BITS (7) 210 #define PTR_HASH(x) ((uintptr_t)x >> TSB_HASH_SHIFT_BITS) 211 212 #ifdef DEBUG 213 int tsb_hash_debug = 0; 214 #define TSB_HASH(sfmmup) \ 215 (tsb_hash_debug ? &hat_lock[0] : \ 216 &hat_lock[PTR_HASH(sfmmup) & (SFMMU_NUM_LOCK-1)]) 217 #else /* DEBUG */ 218 #define TSB_HASH(sfmmup) &hat_lock[PTR_HASH(sfmmup) & (SFMMU_NUM_LOCK-1)] 219 #endif /* DEBUG */ 220 221 222 /* sfmmu_replace_tsb() return codes. */ 223 typedef enum tsb_replace_rc { 224 TSB_SUCCESS, 225 TSB_ALLOCFAIL, 226 TSB_LOSTRACE, 227 TSB_ALREADY_SWAPPED, 228 TSB_CANTGROW 229 } tsb_replace_rc_t; 230 231 /* 232 * Flags for TSB allocation routines. 233 */ 234 #define TSB_ALLOC 0x01 235 #define TSB_FORCEALLOC 0x02 236 #define TSB_GROW 0x04 237 #define TSB_SHRINK 0x08 238 #define TSB_SWAPIN 0x10 239 240 /* 241 * Support for HAT callbacks. 242 */ 243 #define SFMMU_MAX_RELOC_CALLBACKS 10 244 int sfmmu_max_cb_id = SFMMU_MAX_RELOC_CALLBACKS; 245 static id_t sfmmu_cb_nextid = 0; 246 static id_t sfmmu_tsb_cb_id; 247 struct sfmmu_callback *sfmmu_cb_table; 248 249 /* 250 * Kernel page relocation is enabled by default for non-caged 251 * kernel pages. This has little effect unless segkmem_reloc is 252 * set, since by default kernel memory comes from inside the 253 * kernel cage. 254 */ 255 int hat_kpr_enabled = 1; 256 257 kmutex_t kpr_mutex; 258 kmutex_t kpr_suspendlock; 259 kthread_t *kreloc_thread; 260 261 /* 262 * Enable VA->PA translation sanity checking on DEBUG kernels. 263 * Disabled by default. This is incompatible with some 264 * drivers (error injector, RSM) so if it breaks you get 265 * to keep both pieces. 266 */ 267 int hat_check_vtop = 0; 268 269 /* 270 * Private sfmmu routines (prototypes) 271 */ 272 static struct hme_blk *sfmmu_shadow_hcreate(sfmmu_t *, caddr_t, int, uint_t); 273 static struct hme_blk *sfmmu_hblk_alloc(sfmmu_t *, caddr_t, 274 struct hmehash_bucket *, uint_t, hmeblk_tag, uint_t); 275 static caddr_t sfmmu_hblk_unload(struct hat *, struct hme_blk *, caddr_t, 276 caddr_t, demap_range_t *, uint_t); 277 static caddr_t sfmmu_hblk_sync(struct hat *, struct hme_blk *, caddr_t, 278 caddr_t, int); 279 static void sfmmu_hblk_free(struct hmehash_bucket *, struct hme_blk *, 280 uint64_t, struct hme_blk **); 281 static void sfmmu_hblks_list_purge(struct hme_blk **); 282 static uint_t sfmmu_get_free_hblk(struct hme_blk **, uint_t); 283 static uint_t sfmmu_put_free_hblk(struct hme_blk *, uint_t); 284 static struct hme_blk *sfmmu_hblk_steal(int); 285 static int sfmmu_steal_this_hblk(struct hmehash_bucket *, 286 struct hme_blk *, uint64_t, uint64_t, 287 struct hme_blk *); 288 static caddr_t sfmmu_hblk_unlock(struct hme_blk *, caddr_t, caddr_t); 289 290 static void sfmmu_memload_batchsmall(struct hat *, caddr_t, page_t **, 291 uint_t, uint_t, pgcnt_t); 292 void sfmmu_tteload(struct hat *, tte_t *, caddr_t, page_t *, 293 uint_t); 294 static int sfmmu_tteload_array(sfmmu_t *, tte_t *, caddr_t, page_t **, 295 uint_t); 296 static struct hmehash_bucket *sfmmu_tteload_acquire_hashbucket(sfmmu_t *, 297 caddr_t, int); 298 static struct hme_blk *sfmmu_tteload_find_hmeblk(sfmmu_t *, 299 struct hmehash_bucket *, caddr_t, uint_t, uint_t); 300 static int sfmmu_tteload_addentry(sfmmu_t *, struct hme_blk *, tte_t *, 301 caddr_t, page_t **, uint_t); 302 static void sfmmu_tteload_release_hashbucket(struct hmehash_bucket *); 303 304 static int sfmmu_pagearray_setup(caddr_t, page_t **, tte_t *, int); 305 pfn_t sfmmu_uvatopfn(caddr_t, sfmmu_t *); 306 void sfmmu_memtte(tte_t *, pfn_t, uint_t, int); 307 
static void sfmmu_vac_conflict(struct hat *, caddr_t, page_t *); 308 static int sfmmu_vacconflict_array(caddr_t, page_t *, int *); 309 static int tst_tnc(page_t *pp, pgcnt_t); 310 static void conv_tnc(page_t *pp, int); 311 312 static void sfmmu_get_ctx(sfmmu_t *); 313 static void sfmmu_free_sfmmu(sfmmu_t *); 314 315 static void sfmmu_gettte(struct hat *, caddr_t, tte_t *); 316 static void sfmmu_ttesync(struct hat *, caddr_t, tte_t *, page_t *); 317 static void sfmmu_chgattr(struct hat *, caddr_t, size_t, uint_t, int); 318 319 static cpuset_t sfmmu_pageunload(page_t *, struct sf_hment *, int); 320 static void hat_pagereload(struct page *, struct page *); 321 static cpuset_t sfmmu_pagesync(page_t *, struct sf_hment *, uint_t); 322 static void sfmmu_page_cache_array(page_t *, int, int, pgcnt_t); 323 static void sfmmu_page_cache(page_t *, int, int, int); 324 325 static void sfmmu_tlbcache_demap(caddr_t, sfmmu_t *, struct hme_blk *, 326 pfn_t, int, int, int, int); 327 static void sfmmu_ismtlbcache_demap(caddr_t, sfmmu_t *, struct hme_blk *, 328 pfn_t, int); 329 static void sfmmu_tlb_demap(caddr_t, sfmmu_t *, struct hme_blk *, int, int); 330 static void sfmmu_tlb_range_demap(demap_range_t *); 331 static void sfmmu_invalidate_ctx(sfmmu_t *); 332 static void sfmmu_sync_mmustate(sfmmu_t *); 333 334 static void sfmmu_tsbinfo_setup_phys(struct tsb_info *, pfn_t); 335 static int sfmmu_tsbinfo_alloc(struct tsb_info **, int, int, uint_t, 336 sfmmu_t *); 337 static void sfmmu_tsb_free(struct tsb_info *); 338 static void sfmmu_tsbinfo_free(struct tsb_info *); 339 static int sfmmu_init_tsbinfo(struct tsb_info *, int, int, uint_t, 340 sfmmu_t *); 341 342 static void sfmmu_tsb_swapin(sfmmu_t *, hatlock_t *); 343 static int sfmmu_select_tsb_szc(pgcnt_t); 344 static void sfmmu_mod_tsb(sfmmu_t *, caddr_t, tte_t *, int); 345 #define sfmmu_load_tsb(sfmmup, vaddr, tte, szc) \ 346 sfmmu_mod_tsb(sfmmup, vaddr, tte, szc) 347 #define sfmmu_unload_tsb(sfmmup, vaddr, szc) \ 348 sfmmu_mod_tsb(sfmmup, vaddr, NULL, szc) 349 static void sfmmu_copy_tsb(struct tsb_info *, struct tsb_info *); 350 static tsb_replace_rc_t sfmmu_replace_tsb(sfmmu_t *, struct tsb_info *, uint_t, 351 hatlock_t *, uint_t); 352 static void sfmmu_size_tsb(sfmmu_t *, int, uint64_t, uint64_t, int); 353 354 static void sfmmu_cache_flush(pfn_t, int); 355 void sfmmu_cache_flushcolor(int, pfn_t); 356 static caddr_t sfmmu_hblk_chgattr(sfmmu_t *, struct hme_blk *, caddr_t, 357 caddr_t, demap_range_t *, uint_t, int); 358 359 static uint64_t sfmmu_vtop_attr(uint_t, int mode, tte_t *); 360 static uint_t sfmmu_ptov_attr(tte_t *); 361 static caddr_t sfmmu_hblk_chgprot(sfmmu_t *, struct hme_blk *, caddr_t, 362 caddr_t, demap_range_t *, uint_t); 363 static uint_t sfmmu_vtop_prot(uint_t, uint_t *); 364 static int sfmmu_idcache_constructor(void *, void *, int); 365 static void sfmmu_idcache_destructor(void *, void *); 366 static int sfmmu_hblkcache_constructor(void *, void *, int); 367 static void sfmmu_hblkcache_destructor(void *, void *); 368 static void sfmmu_hblkcache_reclaim(void *); 369 static void sfmmu_shadow_hcleanup(sfmmu_t *, struct hme_blk *, 370 struct hmehash_bucket *); 371 static void sfmmu_free_hblks(sfmmu_t *, caddr_t, caddr_t, int); 372 static void sfmmu_rm_large_mappings(page_t *, int); 373 374 static void hat_lock_init(void); 375 static void hat_kstat_init(void); 376 static int sfmmu_kstat_percpu_update(kstat_t *ksp, int rw); 377 static void sfmmu_check_page_sizes(sfmmu_t *, int); 378 static int fnd_mapping_sz(page_t *); 379 static void 
iment_add(struct ism_ment *, struct hat *);
380 static void iment_sub(struct ism_ment *, struct hat *);
381 static pgcnt_t ism_tsb_entries(sfmmu_t *, int szc);
382 extern void sfmmu_setup_tsbinfo(sfmmu_t *);
383 extern void sfmmu_clear_utsbinfo(void);
384
385 /* kpm prototypes */
386 static caddr_t sfmmu_kpm_mapin(page_t *);
387 static void sfmmu_kpm_mapout(page_t *, caddr_t);
388 static int sfmmu_kpme_lookup(struct kpme *, page_t *);
389 static void sfmmu_kpme_add(struct kpme *, page_t *);
390 static void sfmmu_kpme_sub(struct kpme *, page_t *);
391 static caddr_t sfmmu_kpm_getvaddr(page_t *, int *);
392 static int sfmmu_kpm_fault(caddr_t, struct memseg *, page_t *);
393 static int sfmmu_kpm_fault_small(caddr_t, struct memseg *, page_t *);
394 static void sfmmu_kpm_vac_conflict(page_t *, caddr_t);
395 static void sfmmu_kpm_pageunload(page_t *);
396 static void sfmmu_kpm_vac_unload(page_t *, caddr_t);
397 static void sfmmu_kpm_demap_large(caddr_t);
398 static void sfmmu_kpm_demap_small(caddr_t);
399 static void sfmmu_kpm_demap_tlbs(caddr_t);
400 static void sfmmu_kpm_hme_unload(page_t *);
401 static kpm_hlk_t *sfmmu_kpm_kpmp_enter(page_t *, pgcnt_t);
402 static void sfmmu_kpm_kpmp_exit(kpm_hlk_t *kpmp);
403 static void sfmmu_kpm_page_cache(page_t *, int, int);
404
405 static void sfmmu_ctx_wrap_around(mmu_ctx_t *);
406
407 /* kpm globals */
408 #ifdef DEBUG
409 /*
410 * Enable trap level tsbmiss handling
411 */
412 int kpm_tsbmtl = 1;
413
414 /*
415 * Flush the TLB on kpm mapout. Note: Xcalls are used (again) for the
416 * required TLB shootdowns in this case, so handle w/ care. Off by default.
417 */
418 int kpm_tlb_flush;
419 #endif /* DEBUG */
420
421 static void *sfmmu_vmem_xalloc_aligned_wrapper(vmem_t *, size_t, int);
422
423 #ifdef DEBUG
424 static void sfmmu_check_hblk_flist();
425 #endif
426
427 /*
428 * Semi-private sfmmu data structures. Some of them are initialized in
429 * startup or in hat_init.
Some of them are private but accessed by 430 * assembly code or mach_sfmmu.c 431 */ 432 struct hmehash_bucket *uhme_hash; /* user hmeblk hash table */ 433 struct hmehash_bucket *khme_hash; /* kernel hmeblk hash table */ 434 uint64_t uhme_hash_pa; /* PA of uhme_hash */ 435 uint64_t khme_hash_pa; /* PA of khme_hash */ 436 int uhmehash_num; /* # of buckets in user hash table */ 437 int khmehash_num; /* # of buckets in kernel hash table */ 438 439 uint_t max_mmu_ctxdoms = 0; /* max context domains in the system */ 440 mmu_ctx_t **mmu_ctxs_tbl; /* global array of context domains */ 441 uint64_t mmu_saved_gnum = 0; /* to init incoming MMUs' gnums */ 442 443 #define DEFAULT_NUM_CTXS_PER_MMU 8192 444 static uint_t nctxs = DEFAULT_NUM_CTXS_PER_MMU; 445 446 int cache; /* describes system cache */ 447 448 caddr_t ktsb_base; /* kernel 8k-indexed tsb base address */ 449 uint64_t ktsb_pbase; /* kernel 8k-indexed tsb phys address */ 450 int ktsb_szcode; /* kernel 8k-indexed tsb size code */ 451 int ktsb_sz; /* kernel 8k-indexed tsb size */ 452 453 caddr_t ktsb4m_base; /* kernel 4m-indexed tsb base address */ 454 uint64_t ktsb4m_pbase; /* kernel 4m-indexed tsb phys address */ 455 int ktsb4m_szcode; /* kernel 4m-indexed tsb size code */ 456 int ktsb4m_sz; /* kernel 4m-indexed tsb size */ 457 458 uint64_t kpm_tsbbase; /* kernel seg_kpm 4M TSB base address */ 459 int kpm_tsbsz; /* kernel seg_kpm 4M TSB size code */ 460 uint64_t kpmsm_tsbbase; /* kernel seg_kpm 8K TSB base address */ 461 int kpmsm_tsbsz; /* kernel seg_kpm 8K TSB size code */ 462 463 #ifndef sun4v 464 int utsb_dtlb_ttenum = -1; /* index in TLB for utsb locked TTE */ 465 int utsb4m_dtlb_ttenum = -1; /* index in TLB for 4M TSB TTE */ 466 int dtlb_resv_ttenum; /* index in TLB of first reserved TTE */ 467 caddr_t utsb_vabase; /* reserved kernel virtual memory */ 468 caddr_t utsb4m_vabase; /* for trap handler TSB accesses */ 469 #endif /* sun4v */ 470 uint64_t tsb_alloc_bytes = 0; /* bytes allocated to TSBs */ 471 vmem_t *kmem_tsb_default_arena[NLGRPS_MAX]; /* For dynamic TSBs */ 472 473 /* 474 * Size to use for TSB slabs. Future platforms that support page sizes 475 * larger than 4M may wish to change these values, and provide their own 476 * assembly macros for building and decoding the TSB base register contents. 477 */ 478 uint_t tsb_slab_size = MMU_PAGESIZE4M; 479 uint_t tsb_slab_shift = MMU_PAGESHIFT4M; 480 uint_t tsb_slab_ttesz = TTE4M; 481 uint_t tsb_slab_mask = 0x1ff; /* 4M page alignment for 8K pfn */ 482 483 /* largest TSB size to grow to, will be smaller on smaller memory systems */ 484 int tsb_max_growsize = UTSB_MAX_SZCODE; 485 486 /* 487 * Tunable parameters dealing with TSB policies. 488 */ 489 490 /* 491 * This undocumented tunable forces all 8K TSBs to be allocated from 492 * the kernel heap rather than from the kmem_tsb_default_arena arenas. 493 */ 494 #ifdef DEBUG 495 int tsb_forceheap = 0; 496 #endif /* DEBUG */ 497 498 /* 499 * Decide whether to use per-lgroup arenas, or one global set of 500 * TSB arenas. The default is not to break up per-lgroup, since 501 * most platforms don't recognize any tangible benefit from it. 502 */ 503 int tsb_lgrp_affinity = 0; 504 505 /* 506 * Used for growing the TSB based on the process RSS. 507 * tsb_rss_factor is based on the smallest TSB, and is 508 * shifted by the TSB size to determine if we need to grow. 
509 * The default will grow the TSB if the number of TTEs for 510 * this page size exceeds 75% of the number of TSB entries, 511 * which should _almost_ eliminate all conflict misses 512 * (at the expense of using up lots and lots of memory). 513 */ 514 #define TSB_RSS_FACTOR (TSB_ENTRIES(TSB_MIN_SZCODE) * 0.75) 515 #define SFMMU_RSS_TSBSIZE(tsbszc) (tsb_rss_factor << tsbszc) 516 #define SELECT_TSB_SIZECODE(pgcnt) ( \ 517 (enable_tsb_rss_sizing)? sfmmu_select_tsb_szc(pgcnt) : \ 518 default_tsb_size) 519 #define TSB_OK_SHRINK() \ 520 (tsb_alloc_bytes > tsb_alloc_hiwater || freemem < desfree) 521 #define TSB_OK_GROW() \ 522 (tsb_alloc_bytes < tsb_alloc_hiwater && freemem > desfree) 523 524 int enable_tsb_rss_sizing = 1; 525 int tsb_rss_factor = (int)TSB_RSS_FACTOR; 526 527 /* which TSB size code to use for new address spaces or if rss sizing off */ 528 int default_tsb_size = TSB_8K_SZCODE; 529 530 static uint64_t tsb_alloc_hiwater; /* limit TSB reserved memory */ 531 uint64_t tsb_alloc_hiwater_factor; /* tsb_alloc_hiwater = physmem / this */ 532 #define TSB_ALLOC_HIWATER_FACTOR_DEFAULT 32 533 534 #ifdef DEBUG 535 static int tsb_random_size = 0; /* set to 1 to test random tsb sizes on alloc */ 536 static int tsb_grow_stress = 0; /* if set to 1, keep replacing TSB w/ random */ 537 static int tsb_alloc_mtbf = 0; /* fail allocation every n attempts */ 538 static int tsb_alloc_fail_mtbf = 0; 539 static int tsb_alloc_count = 0; 540 #endif /* DEBUG */ 541 542 /* if set to 1, will remap valid TTEs when growing TSB. */ 543 int tsb_remap_ttes = 1; 544 545 /* 546 * If we have more than this many mappings, allocate a second TSB. 547 * This default is chosen because the I/D fully associative TLBs are 548 * assumed to have at least 8 available entries. Platforms with a 549 * larger fully-associative TLB could probably override the default. 550 */ 551 int tsb_sectsb_threshold = 8; 552 553 /* 554 * kstat data 555 */ 556 struct sfmmu_global_stat sfmmu_global_stat; 557 struct sfmmu_tsbsize_stat sfmmu_tsbsize_stat; 558 559 /* 560 * Global data 561 */ 562 sfmmu_t *ksfmmup; /* kernel's hat id */ 563 564 #ifdef DEBUG 565 static void chk_tte(tte_t *, tte_t *, tte_t *, struct hme_blk *); 566 #endif 567 568 /* sfmmu locking operations */ 569 static kmutex_t *sfmmu_mlspl_enter(struct page *, int); 570 static int sfmmu_mlspl_held(struct page *, int); 571 572 static kmutex_t *sfmmu_page_enter(page_t *); 573 static void sfmmu_page_exit(kmutex_t *); 574 static int sfmmu_page_spl_held(struct page *); 575 576 /* sfmmu internal locking operations - accessed directly */ 577 static void sfmmu_mlist_reloc_enter(page_t *, page_t *, 578 kmutex_t **, kmutex_t **); 579 static void sfmmu_mlist_reloc_exit(kmutex_t *, kmutex_t *); 580 static hatlock_t * 581 sfmmu_hat_enter(sfmmu_t *); 582 static hatlock_t * 583 sfmmu_hat_tryenter(sfmmu_t *); 584 static void sfmmu_hat_exit(hatlock_t *); 585 static void sfmmu_hat_lock_all(void); 586 static void sfmmu_hat_unlock_all(void); 587 static void sfmmu_ismhat_enter(sfmmu_t *, int); 588 static void sfmmu_ismhat_exit(sfmmu_t *, int); 589 590 /* 591 * Array of mutexes protecting a page's mapping list and p_nrm field. 
592 *
593 * The hash function looks complicated, but is made up so that:
594 *
595 * "pp" not shifted, so adjacent pp values will hash to different cache lines
596 * (8 byte alignment * 8 bytes/mutex == 64 byte coherency subblock)
597 *
598 * "pp" >> mml_shift, incorporates more source bits into the hash result
599 *
600 * "& (mml_table_sz - 1)", should be faster than using remainder "%"
601 *
602 * Hopefully, mml_table, mml_table_sz and mml_shift are all in the same
603 * cacheline, since they get declared next to each other below. We'll trust
604 * ld not to do something random.
605 */
606 #ifdef DEBUG
607 int mlist_hash_debug = 0;
608 #define MLIST_HASH(pp) (mlist_hash_debug ? &mml_table[0] : \
609 &mml_table[((uintptr_t)(pp) + \
610 ((uintptr_t)(pp) >> mml_shift)) & (mml_table_sz - 1)])
611 #else /* !DEBUG */
612 #define MLIST_HASH(pp) &mml_table[ \
613 ((uintptr_t)(pp) + ((uintptr_t)(pp) >> mml_shift)) & (mml_table_sz - 1)]
614 #endif /* !DEBUG */
615
616 kmutex_t *mml_table;
617 uint_t mml_table_sz; /* must be a power of 2 */
618 uint_t mml_shift; /* log2(mml_table_sz) + 3 for align */
619
620 /*
621 * kpm_page lock hash.
622 * All slots should be used equally and 2 adjacent kpm_page_t's
623 * shouldn't have their mutexes in the same cache line.
624 */
625 #ifdef DEBUG
626 int kpmp_hash_debug = 0;
627 #define KPMP_HASH(kpp) (kpmp_hash_debug ? &kpmp_table[0] : &kpmp_table[ \
628 ((uintptr_t)(kpp) + ((uintptr_t)(kpp) >> kpmp_shift)) \
629 & (kpmp_table_sz - 1)])
630 #else /* !DEBUG */
631 #define KPMP_HASH(kpp) &kpmp_table[ \
632 ((uintptr_t)(kpp) + ((uintptr_t)(kpp) >> kpmp_shift)) \
633 & (kpmp_table_sz - 1)]
634 #endif /* DEBUG */
635
636 kpm_hlk_t *kpmp_table;
637 uint_t kpmp_table_sz; /* must be a power of 2 */
638 uchar_t kpmp_shift;
639
640 #ifdef DEBUG
641 #define KPMP_SHASH(kpp) (kpmp_hash_debug ? &kpmp_stable[0] : &kpmp_stable[ \
642 (((uintptr_t)(kpp) << kpmp_shift) + (uintptr_t)(kpp)) \
643 & (kpmp_stable_sz - 1)])
644 #else /* !DEBUG */
645 #define KPMP_SHASH(kpp) &kpmp_stable[ \
646 (((uintptr_t)(kpp) << kpmp_shift) + (uintptr_t)(kpp)) \
647 & (kpmp_stable_sz - 1)]
648 #endif /* DEBUG */
649
650 kpm_shlk_t *kpmp_stable;
651 uint_t kpmp_stable_sz; /* must be a power of 2 */
652
653 /*
654 * SPL_HASH was improved to avoid false cache line sharing
655 */
656 #define SPL_TABLE_SIZE 128
657 #define SPL_MASK (SPL_TABLE_SIZE - 1)
658 #define SPL_SHIFT 7 /* log2(SPL_TABLE_SIZE) */
659
660 #define SPL_INDEX(pp) \
661 ((((uintptr_t)(pp) >> SPL_SHIFT) ^ \
662 ((uintptr_t)(pp) >> (SPL_SHIFT << 1))) & \
663 (SPL_TABLE_SIZE - 1))
664
665 #define SPL_HASH(pp) \
666 (&sfmmu_page_lock[SPL_INDEX(pp) & SPL_MASK].pad_mutex)
667
668 static pad_mutex_t sfmmu_page_lock[SPL_TABLE_SIZE];
669
670
671 /*
672 * hat_unload_callback() will group together callbacks in order
673 * to avoid xt_sync() calls. This is the maximum size of the group.
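 *
 * Conceptually, the batching works roughly like this (a sketch only,
 * not the actual code):
 *
 *	while (hme_blks remain to be unloaded) {
 *		gather up to MAX_CB_ADDR (addr, len) pairs;
 *		demap those addresses;
 *		xt_sync(cpuset);	(one sync for the whole batch)
 *		invoke the caller's callback on each pair;
 *	}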
674 */ 675 #define MAX_CB_ADDR 32 676 677 tte_t hw_tte; 678 static ulong_t sfmmu_dmr_maxbit = DMR_MAXBIT; 679 680 static char *mmu_ctx_kstat_names[] = { 681 "mmu_ctx_tsb_exceptions", 682 "mmu_ctx_tsb_raise_exception", 683 "mmu_ctx_wrap_around", 684 }; 685 686 /* 687 * kpm virtual address to physical address 688 */ 689 #define SFMMU_KPM_VTOP(vaddr, paddr) { \ 690 uintptr_t r, v; \ 691 \ 692 r = ((vaddr) - kpm_vbase) >> (uintptr_t)kpm_size_shift; \ 693 (paddr) = (vaddr) - kpm_vbase; \ 694 if (r != 0) { \ 695 v = ((uintptr_t)(vaddr) >> MMU_PAGESHIFT) & \ 696 vac_colors_mask; \ 697 (paddr) -= r << kpm_size_shift; \ 698 if (r > v) \ 699 (paddr) += (r - v) << MMU_PAGESHIFT; \ 700 else \ 701 (paddr) -= r << MMU_PAGESHIFT; \ 702 } \ 703 } 704 705 /* 706 * Wrapper for vmem_xalloc since vmem_create only allows limited 707 * parameters for vm_source_alloc functions. This function allows us 708 * to specify alignment consistent with the size of the object being 709 * allocated. 710 */ 711 static void * 712 sfmmu_vmem_xalloc_aligned_wrapper(vmem_t *vmp, size_t size, int vmflag) 713 { 714 return (vmem_xalloc(vmp, size, size, 0, 0, NULL, NULL, vmflag)); 715 } 716 717 /* Common code for setting tsb_alloc_hiwater. */ 718 #define SFMMU_SET_TSB_ALLOC_HIWATER(pages) tsb_alloc_hiwater = \ 719 ptob(pages) / tsb_alloc_hiwater_factor 720 721 /* 722 * Set tsb_max_growsize to allow at most all of physical memory to be mapped by 723 * a single TSB. physmem is the number of physical pages so we need physmem 8K 724 * TTEs to represent all those physical pages. We round this up by using 725 * 1<<highbit(). To figure out which size code to use, remember that the size 726 * code is just an amount to shift the smallest TSB size to get the size of 727 * this TSB. So we subtract that size, TSB_START_SIZE, from highbit() (or 728 * highbit() - 1) to get the size code for the smallest TSB that can represent 729 * all of physical memory, while erring on the side of too much. 730 * 731 * If the computed size code is less than the current tsb_max_growsize, we set 732 * tsb_max_growsize to the computed size code. In the case where the computed 733 * size code is greater than tsb_max_growsize, we have these restrictions that 734 * apply to increasing tsb_max_growsize: 735 * 1) TSBs can't grow larger than the TSB slab size 736 * 2) TSBs can't grow larger than UTSB_MAX_SZCODE. 737 */ 738 #define SFMMU_SET_TSB_MAX_GROWSIZE(pages) { \ 739 int i, szc; \ 740 \ 741 i = highbit(pages); \ 742 if ((1 << (i - 1)) == (pages)) \ 743 i--; /* 2^n case, round down */ \ 744 szc = i - TSB_START_SIZE; \ 745 if (szc < tsb_max_growsize) \ 746 tsb_max_growsize = szc; \ 747 else if ((szc > tsb_max_growsize) && \ 748 (szc <= tsb_slab_shift - (TSB_START_SIZE + TSB_ENTRY_SHIFT))) \ 749 tsb_max_growsize = MIN(szc, UTSB_MAX_SZCODE); \ 750 } 751 752 /* 753 * Given a pointer to an sfmmu and a TTE size code, return a pointer to the 754 * tsb_info which handles that TTE size. 755 */ 756 #define SFMMU_GET_TSBINFO(tsbinfop, sfmmup, tte_szc) \ 757 (tsbinfop) = (sfmmup)->sfmmu_tsb; \ 758 ASSERT(sfmmu_hat_lock_held(sfmmup)); \ 759 if ((tte_szc) >= TTE4M) \ 760 (tsbinfop) = (tsbinfop)->tsb_next; 761 762 /* 763 * Return the number of mappings present in the HAT 764 * for a particular process and page size. 765 */ 766 #define SFMMU_TTE_CNT(sfmmup, szc) \ 767 (sfmmup)->sfmmu_iblk? \ 768 (sfmmup)->sfmmu_ismttecnt[(szc)] + \ 769 (sfmmup)->sfmmu_ttecnt[(szc)] : \ 770 (sfmmup)->sfmmu_ttecnt[(szc)]; 771 772 /* 773 * Macro to use to unload entries from the TSB. 
774 * It has knowledge of which page sizes get replicated in the TSB
775 * and will call the appropriate unload routine for the appropriate size.
776 */
777 #define SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp) \
778 { \
779 int ttesz = get_hblk_ttesz(hmeblkp); \
780 if (ttesz == TTE8K || ttesz == TTE4M) { \
781 sfmmu_unload_tsb(sfmmup, addr, ttesz); \
782 } else { \
783 caddr_t sva = (caddr_t)get_hblk_base(hmeblkp); \
784 caddr_t eva = sva + get_hblk_span(hmeblkp); \
785 ASSERT(addr >= sva && addr < eva); \
786 sfmmu_unload_tsb_range(sfmmup, sva, eva, ttesz); \
787 } \
788 }
789
790
791 /* Update tsb_alloc_hiwater after memory is configured. */
792 /*ARGSUSED*/
793 static void
794 sfmmu_update_tsb_post_add(void *arg, pgcnt_t delta_pages)
795 {
796 /* Assumes physmem has already been updated. */
797 SFMMU_SET_TSB_ALLOC_HIWATER(physmem);
798 SFMMU_SET_TSB_MAX_GROWSIZE(physmem);
799 }
800
801 /*
802 * Update tsb_alloc_hiwater before memory is deleted. We'll do nothing here
803 * and update tsb_alloc_hiwater and tsb_max_growsize after the memory is
804 * deleted.
805 */
806 /*ARGSUSED*/
807 static int
808 sfmmu_update_tsb_pre_del(void *arg, pgcnt_t delta_pages)
809 {
810 return (0);
811 }
812
813 /* Update tsb_alloc_hiwater after memory fails to be unconfigured. */
814 /*ARGSUSED*/
815 static void
816 sfmmu_update_tsb_post_del(void *arg, pgcnt_t delta_pages, int cancelled)
817 {
818 /*
819 * Whether the delete was cancelled or not, just go ahead and update
820 * tsb_alloc_hiwater and tsb_max_growsize.
821 */
822 SFMMU_SET_TSB_ALLOC_HIWATER(physmem);
823 SFMMU_SET_TSB_MAX_GROWSIZE(physmem);
824 }
825
826 static kphysm_setup_vector_t sfmmu_update_tsb_vec = {
827 KPHYSM_SETUP_VECTOR_VERSION, /* version */
828 sfmmu_update_tsb_post_add, /* post_add */
829 sfmmu_update_tsb_pre_del, /* pre_del */
830 sfmmu_update_tsb_post_del /* post_del */
831 };
832
833
834 /*
835 * HME_BLK HASH PRIMITIVES
836 */
837
838 /*
839 * Enter a hme on the mapping list for page pp.
840 * When large pages are more prevalent in the system, we might want to
841 * keep the mapping list in ascending order by the hment size. For now,
842 * small pages are more frequent, so don't slow it down.
843 */
844 #define HME_ADD(hme, pp) \
845 { \
846 ASSERT(sfmmu_mlist_held(pp)); \
847 \
848 hme->hme_prev = NULL; \
849 hme->hme_next = pp->p_mapping; \
850 hme->hme_page = pp; \
851 if (pp->p_mapping) { \
852 ((struct sf_hment *)(pp->p_mapping))->hme_prev = hme;\
853 ASSERT(pp->p_share > 0); \
854 } else { \
855 /* EMPTY */ \
856 ASSERT(pp->p_share == 0); \
857 } \
858 pp->p_mapping = hme; \
859 pp->p_share++; \
860 }
861
862 /*
863 * Remove a hme from the mapping list for page pp.
864 * If we are unmapping a large translation, we need to make sure that the
865 * change is reflected in the corresponding bit of the p_index field.
866 */
867 #define HME_SUB(hme, pp) \
868 { \
869 ASSERT(sfmmu_mlist_held(pp)); \
870 ASSERT(hme->hme_page == pp || IS_PAHME(hme)); \
871 \
872 if (pp->p_mapping == NULL) { \
873 panic("hme_remove - no mappings"); \
874 } \
875 \
876 membar_stst(); /* ensure previous stores finish */ \
877 \
878 ASSERT(pp->p_share > 0); \
879 pp->p_share--; \
880 \
881 if (hme->hme_prev) { \
882 ASSERT(pp->p_mapping != hme); \
883 ASSERT(hme->hme_prev->hme_page == pp || \
884 IS_PAHME(hme->hme_prev)); \
885 hme->hme_prev->hme_next = hme->hme_next; \
886 } else { \
887 ASSERT(pp->p_mapping == hme); \
888 pp->p_mapping = hme->hme_next; \
889 ASSERT((pp->p_mapping == NULL) ?
\ 890 (pp->p_share == 0) : 1); \ 891 } \ 892 \ 893 if (hme->hme_next) { \ 894 ASSERT(hme->hme_next->hme_page == pp || \ 895 IS_PAHME(hme->hme_next)); \ 896 hme->hme_next->hme_prev = hme->hme_prev; \ 897 } \ 898 \ 899 /* zero out the entry */ \ 900 hme->hme_next = NULL; \ 901 hme->hme_prev = NULL; \ 902 hme->hme_page = NULL; \ 903 \ 904 if (hme_size(hme) > TTE8K) { \ 905 /* remove mappings for remainder of large pg */ \ 906 sfmmu_rm_large_mappings(pp, hme_size(hme)); \ 907 } \ 908 } 909 910 /* 911 * This function returns the hment given the hme_blk and a vaddr. 912 * It assumes addr has already been checked to belong to hme_blk's 913 * range. 914 */ 915 #define HBLKTOHME(hment, hmeblkp, addr) \ 916 { \ 917 int index; \ 918 HBLKTOHME_IDX(hment, hmeblkp, addr, index) \ 919 } 920 921 /* 922 * Version of HBLKTOHME that also returns the index in hmeblkp 923 * of the hment. 924 */ 925 #define HBLKTOHME_IDX(hment, hmeblkp, addr, idx) \ 926 { \ 927 ASSERT(in_hblk_range((hmeblkp), (addr))); \ 928 \ 929 if (get_hblk_ttesz(hmeblkp) == TTE8K) { \ 930 idx = (((uintptr_t)(addr) >> MMU_PAGESHIFT) & (NHMENTS-1)); \ 931 } else \ 932 idx = 0; \ 933 \ 934 (hment) = &(hmeblkp)->hblk_hme[idx]; \ 935 } 936 937 /* 938 * Disable any page sizes not supported by the CPU 939 */ 940 void 941 hat_init_pagesizes() 942 { 943 int i; 944 945 mmu_exported_page_sizes = 0; 946 for (i = TTE8K; i < max_mmu_page_sizes; i++) { 947 extern int disable_text_largepages; 948 extern int disable_initdata_largepages; 949 950 szc_2_userszc[i] = (uint_t)-1; 951 userszc_2_szc[i] = (uint_t)-1; 952 953 if ((mmu_exported_pagesize_mask & (1 << i)) == 0) { 954 disable_large_pages |= (1 << i); 955 disable_ism_large_pages |= (1 << i); 956 disable_text_largepages |= (1 << i); 957 disable_initdata_largepages |= (1 << i); 958 } else { 959 szc_2_userszc[i] = mmu_exported_page_sizes; 960 userszc_2_szc[mmu_exported_page_sizes] = i; 961 mmu_exported_page_sizes++; 962 } 963 } 964 965 disable_auto_large_pages = disable_large_pages; 966 967 /* 968 * Initialize mmu-specific large page sizes. 969 */ 970 if ((mmu_page_sizes == max_mmu_page_sizes) && 971 (&mmu_large_pages_disabled)) { 972 disable_large_pages |= mmu_large_pages_disabled(HAT_LOAD); 973 disable_ism_large_pages |= 974 mmu_large_pages_disabled(HAT_LOAD_SHARE); 975 disable_auto_large_pages |= 976 mmu_large_pages_disabled(HAT_LOAD_AUTOLPG); 977 } 978 979 } 980 981 /* 982 * Initialize the hardware address translation structures. 983 */ 984 void 985 hat_init(void) 986 { 987 int i; 988 size_t size; 989 990 hat_lock_init(); 991 hat_kstat_init(); 992 993 /* 994 * Hardware-only bits in a TTE 995 */ 996 MAKE_TTE_MASK(&hw_tte); 997 998 hat_init_pagesizes(); 999 1000 /* Initialize the hash locks */ 1001 for (i = 0; i < khmehash_num; i++) { 1002 mutex_init(&khme_hash[i].hmehash_mutex, NULL, 1003 MUTEX_DEFAULT, NULL); 1004 } 1005 for (i = 0; i < uhmehash_num; i++) { 1006 mutex_init(&uhme_hash[i].hmehash_mutex, NULL, 1007 MUTEX_DEFAULT, NULL); 1008 } 1009 khmehash_num--; /* make sure counter starts from 0 */ 1010 uhmehash_num--; /* make sure counter starts from 0 */ 1011 1012 /* 1013 * Allocate context domain structures. 1014 * 1015 * A platform may choose to modify max_mmu_ctxdoms in 1016 * set_platform_defaults(). If a platform does not define 1017 * a set_platform_defaults() or does not choose to modify 1018 * max_mmu_ctxdoms, it gets one MMU context domain for every CPU. 1019 * 1020 * For sun4v, there will be one global context domain, this is to 1021 * avoid the ldom cpu substitution problem. 
1022 * 1023 * For all platforms that have CPUs sharing MMUs, this 1024 * value must be defined. 1025 */ 1026 if (max_mmu_ctxdoms == 0) { 1027 #ifndef sun4v 1028 max_mmu_ctxdoms = max_ncpus; 1029 #else /* sun4v */ 1030 max_mmu_ctxdoms = 1; 1031 #endif /* sun4v */ 1032 } 1033 1034 size = max_mmu_ctxdoms * sizeof (mmu_ctx_t *); 1035 mmu_ctxs_tbl = kmem_zalloc(size, KM_SLEEP); 1036 1037 /* mmu_ctx_t is 64 bytes aligned */ 1038 mmuctxdom_cache = kmem_cache_create("mmuctxdom_cache", 1039 sizeof (mmu_ctx_t), 64, NULL, NULL, NULL, NULL, NULL, 0); 1040 /* 1041 * MMU context domain initialization for the Boot CPU. 1042 * This needs the context domains array allocated above. 1043 */ 1044 mutex_enter(&cpu_lock); 1045 sfmmu_cpu_init(CPU); 1046 mutex_exit(&cpu_lock); 1047 1048 /* 1049 * Intialize ism mapping list lock. 1050 */ 1051 1052 mutex_init(&ism_mlist_lock, NULL, MUTEX_DEFAULT, NULL); 1053 1054 /* 1055 * Each sfmmu structure carries an array of MMU context info 1056 * structures, one per context domain. The size of this array depends 1057 * on the maximum number of context domains. So, the size of the 1058 * sfmmu structure varies per platform. 1059 * 1060 * sfmmu is allocated from static arena, because trap 1061 * handler at TL > 0 is not allowed to touch kernel relocatable 1062 * memory. sfmmu's alignment is changed to 64 bytes from 1063 * default 8 bytes, as the lower 6 bits will be used to pass 1064 * pgcnt to vtag_flush_pgcnt_tl1. 1065 */ 1066 size = sizeof (sfmmu_t) + sizeof (sfmmu_ctx_t) * (max_mmu_ctxdoms - 1); 1067 1068 sfmmuid_cache = kmem_cache_create("sfmmuid_cache", size, 1069 64, sfmmu_idcache_constructor, sfmmu_idcache_destructor, 1070 NULL, NULL, static_arena, 0); 1071 1072 sfmmu_tsbinfo_cache = kmem_cache_create("sfmmu_tsbinfo_cache", 1073 sizeof (struct tsb_info), 0, NULL, NULL, NULL, NULL, NULL, 0); 1074 1075 /* 1076 * Since we only use the tsb8k cache to "borrow" pages for TSBs 1077 * from the heap when low on memory or when TSB_FORCEALLOC is 1078 * specified, don't use magazines to cache them--we want to return 1079 * them to the system as quickly as possible. 1080 */ 1081 sfmmu_tsb8k_cache = kmem_cache_create("sfmmu_tsb8k_cache", 1082 MMU_PAGESIZE, MMU_PAGESIZE, NULL, NULL, NULL, NULL, 1083 static_arena, KMC_NOMAGAZINE); 1084 1085 /* 1086 * Set tsb_alloc_hiwater to 1/tsb_alloc_hiwater_factor of physical 1087 * memory, which corresponds to the old static reserve for TSBs. 1088 * tsb_alloc_hiwater_factor defaults to 32. This caps the amount of 1089 * memory we'll allocate for TSB slabs; beyond this point TSB 1090 * allocations will be taken from the kernel heap (via 1091 * sfmmu_tsb8k_cache) and will be throttled as would any other kmem 1092 * consumer. 1093 */ 1094 if (tsb_alloc_hiwater_factor == 0) { 1095 tsb_alloc_hiwater_factor = TSB_ALLOC_HIWATER_FACTOR_DEFAULT; 1096 } 1097 SFMMU_SET_TSB_ALLOC_HIWATER(physmem); 1098 1099 /* Set tsb_max_growsize. */ 1100 SFMMU_SET_TSB_MAX_GROWSIZE(physmem); 1101 1102 /* 1103 * On smaller memory systems, allocate TSB memory in 512K chunks 1104 * instead of the default 4M slab size. The trap handlers need to 1105 * be patched with the final slab shift since they need to be able 1106 * to construct the TSB pointer at runtime. 
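 *
 * For reference, the two slab configurations selected between here are
 * (defaults come from the file-scope initializers above; the 512K values
 * are assigned just below):
 *
 *	config		tsb_slab_shift	tsb_slab_ttesz	tsb_slab_mask
 *	4M (default)	22		TTE4M		0x1ff
 *	512K (small)	19		TTE512K		0x3f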
1107 */ 1108 if ((tsb_max_growsize <= TSB_512K_SZCODE) && 1109 !(disable_large_pages & (1 << TTE512K))) { 1110 tsb_slab_size = MMU_PAGESIZE512K; 1111 tsb_slab_shift = MMU_PAGESHIFT512K; 1112 tsb_slab_ttesz = TTE512K; 1113 tsb_slab_mask = 0x3f; /* 512K page alignment for 8K pfn */ 1114 } 1115 1116 /* 1117 * Set up memory callback to update tsb_alloc_hiwater and 1118 * tsb_max_growsize. 1119 */ 1120 i = kphysm_setup_func_register(&sfmmu_update_tsb_vec, (void *) 0); 1121 ASSERT(i == 0); 1122 1123 /* 1124 * kmem_tsb_arena is the source from which large TSB slabs are 1125 * drawn. The quantum of this arena corresponds to the largest 1126 * TSB size we can dynamically allocate for user processes. 1127 * Currently it must also be a supported page size since we 1128 * use exactly one translation entry to map each slab page. 1129 * 1130 * The per-lgroup kmem_tsb_default_arena arenas are the arenas from 1131 * which most TSBs are allocated. Since most TSB allocations are 1132 * typically 8K we have a kmem cache we stack on top of each 1133 * kmem_tsb_default_arena to speed up those allocations. 1134 * 1135 * Note the two-level scheme of arenas is required only 1136 * because vmem_create doesn't allow us to specify alignment 1137 * requirements. If this ever changes the code could be 1138 * simplified to use only one level of arenas. 1139 */ 1140 kmem_tsb_arena = vmem_create("kmem_tsb", NULL, 0, tsb_slab_size, 1141 sfmmu_vmem_xalloc_aligned_wrapper, vmem_xfree, heap_arena, 1142 0, VM_SLEEP); 1143 1144 if (tsb_lgrp_affinity) { 1145 char s[50]; 1146 for (i = 0; i < NLGRPS_MAX; i++) { 1147 (void) sprintf(s, "kmem_tsb_lgrp%d", i); 1148 kmem_tsb_default_arena[i] = 1149 vmem_create(s, NULL, 0, PAGESIZE, 1150 sfmmu_tsb_segkmem_alloc, sfmmu_tsb_segkmem_free, 1151 kmem_tsb_arena, 0, VM_SLEEP | VM_BESTFIT); 1152 (void) sprintf(s, "sfmmu_tsb_lgrp%d_cache", i); 1153 sfmmu_tsb_cache[i] = kmem_cache_create(s, PAGESIZE, 1154 PAGESIZE, NULL, NULL, NULL, NULL, 1155 kmem_tsb_default_arena[i], 0); 1156 } 1157 } else { 1158 kmem_tsb_default_arena[0] = vmem_create("kmem_tsb_default", 1159 NULL, 0, PAGESIZE, sfmmu_tsb_segkmem_alloc, 1160 sfmmu_tsb_segkmem_free, kmem_tsb_arena, 0, 1161 VM_SLEEP | VM_BESTFIT); 1162 1163 sfmmu_tsb_cache[0] = kmem_cache_create("sfmmu_tsb_cache", 1164 PAGESIZE, PAGESIZE, NULL, NULL, NULL, NULL, 1165 kmem_tsb_default_arena[0], 0); 1166 } 1167 1168 sfmmu8_cache = kmem_cache_create("sfmmu8_cache", HME8BLK_SZ, 1169 HMEBLK_ALIGN, sfmmu_hblkcache_constructor, 1170 sfmmu_hblkcache_destructor, 1171 sfmmu_hblkcache_reclaim, (void *)HME8BLK_SZ, 1172 hat_memload_arena, KMC_NOHASH); 1173 1174 hat_memload1_arena = vmem_create("hat_memload1", NULL, 0, PAGESIZE, 1175 segkmem_alloc_permanent, segkmem_free, heap_arena, 0, VM_SLEEP); 1176 1177 sfmmu1_cache = kmem_cache_create("sfmmu1_cache", HME1BLK_SZ, 1178 HMEBLK_ALIGN, sfmmu_hblkcache_constructor, 1179 sfmmu_hblkcache_destructor, 1180 NULL, (void *)HME1BLK_SZ, 1181 hat_memload1_arena, KMC_NOHASH); 1182 1183 pa_hment_cache = kmem_cache_create("pa_hment_cache", PAHME_SZ, 1184 0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH); 1185 1186 ism_blk_cache = kmem_cache_create("ism_blk_cache", 1187 sizeof (ism_blk_t), ecache_alignsize, NULL, NULL, 1188 NULL, NULL, static_arena, KMC_NOHASH); 1189 1190 ism_ment_cache = kmem_cache_create("ism_ment_cache", 1191 sizeof (ism_ment_t), 0, NULL, NULL, 1192 NULL, NULL, NULL, 0); 1193 1194 /* 1195 * We grab the first hat for the kernel, 1196 */ 1197 AS_LOCK_ENTER(&kas, &kas.a_lock, RW_WRITER); 1198 kas.a_hat = 
hat_alloc(&kas); 1199 AS_LOCK_EXIT(&kas, &kas.a_lock); 1200 1201 /* 1202 * Initialize hblk_reserve. 1203 */ 1204 ((struct hme_blk *)hblk_reserve)->hblk_nextpa = 1205 va_to_pa((caddr_t)hblk_reserve); 1206 1207 #ifndef UTSB_PHYS 1208 /* 1209 * Reserve some kernel virtual address space for the locked TTEs 1210 * that allow us to probe the TSB from TL>0. 1211 */ 1212 utsb_vabase = vmem_xalloc(heap_arena, tsb_slab_size, tsb_slab_size, 1213 0, 0, NULL, NULL, VM_SLEEP); 1214 utsb4m_vabase = vmem_xalloc(heap_arena, tsb_slab_size, tsb_slab_size, 1215 0, 0, NULL, NULL, VM_SLEEP); 1216 #endif 1217 1218 /* 1219 * The big page VAC handling code assumes VAC 1220 * will not be bigger than the smallest big 1221 * page- which is 64K. 1222 */ 1223 if (TTEPAGES(TTE64K) < CACHE_NUM_COLOR) { 1224 cmn_err(CE_PANIC, "VAC too big!"); 1225 } 1226 1227 (void) xhat_init(); 1228 1229 uhme_hash_pa = va_to_pa(uhme_hash); 1230 khme_hash_pa = va_to_pa(khme_hash); 1231 1232 /* 1233 * Initialize relocation locks. kpr_suspendlock is held 1234 * at PIL_MAX to prevent interrupts from pinning the holder 1235 * of a suspended TTE which may access it leading to a 1236 * deadlock condition. 1237 */ 1238 mutex_init(&kpr_mutex, NULL, MUTEX_DEFAULT, NULL); 1239 mutex_init(&kpr_suspendlock, NULL, MUTEX_SPIN, (void *)PIL_MAX); 1240 } 1241 1242 /* 1243 * Initialize locking for the hat layer, called early during boot. 1244 */ 1245 static void 1246 hat_lock_init() 1247 { 1248 int i; 1249 1250 /* 1251 * initialize the array of mutexes protecting a page's mapping 1252 * list and p_nrm field. 1253 */ 1254 for (i = 0; i < mml_table_sz; i++) 1255 mutex_init(&mml_table[i], NULL, MUTEX_DEFAULT, NULL); 1256 1257 if (kpm_enable) { 1258 for (i = 0; i < kpmp_table_sz; i++) { 1259 mutex_init(&kpmp_table[i].khl_mutex, NULL, 1260 MUTEX_DEFAULT, NULL); 1261 } 1262 } 1263 1264 /* 1265 * Initialize array of mutex locks that protects sfmmu fields and 1266 * TSB lists. 1267 */ 1268 for (i = 0; i < SFMMU_NUM_LOCK; i++) 1269 mutex_init(HATLOCK_MUTEXP(&hat_lock[i]), NULL, MUTEX_DEFAULT, 1270 NULL); 1271 } 1272 1273 extern caddr_t kmem64_base, kmem64_end; 1274 1275 #define SFMMU_KERNEL_MAXVA \ 1276 (kmem64_base ? (uintptr_t)kmem64_end : (SYSLIMIT)) 1277 1278 /* 1279 * Allocate a hat structure. 1280 * Called when an address space first uses a hat. 1281 */ 1282 struct hat * 1283 hat_alloc(struct as *as) 1284 { 1285 sfmmu_t *sfmmup; 1286 int i; 1287 uint64_t cnum; 1288 extern uint_t get_color_start(struct as *); 1289 1290 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 1291 sfmmup = kmem_cache_alloc(sfmmuid_cache, KM_SLEEP); 1292 sfmmup->sfmmu_as = as; 1293 sfmmup->sfmmu_flags = 0; 1294 LOCK_INIT_CLEAR(&sfmmup->sfmmu_ctx_lock); 1295 1296 if (as == &kas) { 1297 ksfmmup = sfmmup; 1298 sfmmup->sfmmu_cext = 0; 1299 cnum = KCONTEXT; 1300 1301 sfmmup->sfmmu_clrstart = 0; 1302 sfmmup->sfmmu_tsb = NULL; 1303 /* 1304 * hat_kern_setup() will call sfmmu_init_ktsbinfo() 1305 * to setup tsb_info for ksfmmup. 1306 */ 1307 } else { 1308 1309 /* 1310 * Just set to invalid ctx. When it faults, it will 1311 * get a valid ctx. This would avoid the situation 1312 * where we get a ctx, but it gets stolen and then 1313 * we fault when we try to run and so have to get 1314 * another ctx. 
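 * (The valid context is obtained later, via sfmmu_get_ctx(), once the
 * process actually runs and faults.)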
1315 */ 1316 sfmmup->sfmmu_cext = 0; 1317 cnum = INVALID_CONTEXT; 1318 1319 /* initialize original physical page coloring bin */ 1320 sfmmup->sfmmu_clrstart = get_color_start(as); 1321 #ifdef DEBUG 1322 if (tsb_random_size) { 1323 uint32_t randval = (uint32_t)gettick() >> 4; 1324 int size = randval % (tsb_max_growsize + 1); 1325 1326 /* chose a random tsb size for stress testing */ 1327 (void) sfmmu_tsbinfo_alloc(&sfmmup->sfmmu_tsb, size, 1328 TSB8K|TSB64K|TSB512K, 0, sfmmup); 1329 } else 1330 #endif /* DEBUG */ 1331 (void) sfmmu_tsbinfo_alloc(&sfmmup->sfmmu_tsb, 1332 default_tsb_size, 1333 TSB8K|TSB64K|TSB512K, 0, sfmmup); 1334 sfmmup->sfmmu_flags = HAT_SWAPPED; 1335 ASSERT(sfmmup->sfmmu_tsb != NULL); 1336 } 1337 1338 ASSERT(max_mmu_ctxdoms > 0); 1339 for (i = 0; i < max_mmu_ctxdoms; i++) { 1340 sfmmup->sfmmu_ctxs[i].cnum = cnum; 1341 sfmmup->sfmmu_ctxs[i].gnum = 0; 1342 } 1343 1344 sfmmu_setup_tsbinfo(sfmmup); 1345 for (i = 0; i < max_mmu_page_sizes; i++) { 1346 sfmmup->sfmmu_ttecnt[i] = 0; 1347 sfmmup->sfmmu_ismttecnt[i] = 0; 1348 sfmmup->sfmmu_pgsz[i] = TTE8K; 1349 } 1350 1351 sfmmup->sfmmu_iblk = NULL; 1352 sfmmup->sfmmu_ismhat = 0; 1353 sfmmup->sfmmu_ismblkpa = (uint64_t)-1; 1354 if (sfmmup == ksfmmup) { 1355 CPUSET_ALL(sfmmup->sfmmu_cpusran); 1356 } else { 1357 CPUSET_ZERO(sfmmup->sfmmu_cpusran); 1358 } 1359 sfmmup->sfmmu_free = 0; 1360 sfmmup->sfmmu_rmstat = 0; 1361 sfmmup->sfmmu_clrbin = sfmmup->sfmmu_clrstart; 1362 sfmmup->sfmmu_xhat_provider = NULL; 1363 cv_init(&sfmmup->sfmmu_tsb_cv, NULL, CV_DEFAULT, NULL); 1364 return (sfmmup); 1365 } 1366 1367 /* 1368 * Create per-MMU context domain kstats for a given MMU ctx. 1369 */ 1370 static void 1371 sfmmu_mmu_kstat_create(mmu_ctx_t *mmu_ctxp) 1372 { 1373 mmu_ctx_stat_t stat; 1374 kstat_t *mmu_kstat; 1375 1376 ASSERT(MUTEX_HELD(&cpu_lock)); 1377 ASSERT(mmu_ctxp->mmu_kstat == NULL); 1378 1379 mmu_kstat = kstat_create("unix", mmu_ctxp->mmu_idx, "mmu_ctx", 1380 "hat", KSTAT_TYPE_NAMED, MMU_CTX_NUM_STATS, KSTAT_FLAG_VIRTUAL); 1381 1382 if (mmu_kstat == NULL) { 1383 cmn_err(CE_WARN, "kstat_create for MMU %d failed", 1384 mmu_ctxp->mmu_idx); 1385 } else { 1386 mmu_kstat->ks_data = mmu_ctxp->mmu_kstat_data; 1387 for (stat = 0; stat < MMU_CTX_NUM_STATS; stat++) 1388 kstat_named_init(&mmu_ctxp->mmu_kstat_data[stat], 1389 mmu_ctx_kstat_names[stat], KSTAT_DATA_INT64); 1390 mmu_ctxp->mmu_kstat = mmu_kstat; 1391 kstat_install(mmu_kstat); 1392 } 1393 } 1394 1395 /* 1396 * plat_cpuid_to_mmu_ctx_info() is a platform interface that returns MMU 1397 * context domain information for a given CPU. If a platform does not 1398 * specify that interface, then the function below is used instead to return 1399 * default information. The defaults are as follows: 1400 * 1401 * - For sun4u systems there's one MMU context domain per CPU. 1402 * This default is used by all sun4u systems except OPL. OPL systems 1403 * provide platform specific interface to map CPU ids to MMU ids 1404 * because on OPL more than 1 CPU shares a single MMU. 1405 * Note that on sun4v, there is one global context domain for 1406 * the entire system. This is to avoid running into potential problem 1407 * with ldom physical cpu substitution feature. 1408 * - The number of MMU context IDs supported on any CPU in the 1409 * system is 8K. 
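 *
 * A platform that provides plat_cpuid_to_mmu_ctx_info() fills in the same
 * two fields. For example (a sketch only, not an actual platform
 * implementation; CPUS_PER_MMU is hypothetical):
 *
 *	void
 *	plat_cpuid_to_mmu_ctx_info(processorid_t cpuid, mmu_ctx_info_t *infop)
 *	{
 *		infop->mmu_idx = cpuid / CPUS_PER_MMU;
 *		infop->mmu_nctxs = 1 << 13;	(8K context IDs)
 *	}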
1410 */ 1411 /*ARGSUSED*/ 1412 static void 1413 sfmmu_cpuid_to_mmu_ctx_info(processorid_t cpuid, mmu_ctx_info_t *infop) 1414 { 1415 infop->mmu_nctxs = nctxs; 1416 #ifndef sun4v 1417 infop->mmu_idx = cpu[cpuid]->cpu_seqid; 1418 #else /* sun4v */ 1419 infop->mmu_idx = 0; 1420 #endif /* sun4v */ 1421 } 1422 1423 /* 1424 * Called during CPU initialization to set the MMU context-related information 1425 * for a CPU. 1426 * 1427 * cpu_lock serializes accesses to mmu_ctxs and mmu_saved_gnum. 1428 */ 1429 void 1430 sfmmu_cpu_init(cpu_t *cp) 1431 { 1432 mmu_ctx_info_t info; 1433 mmu_ctx_t *mmu_ctxp; 1434 1435 ASSERT(MUTEX_HELD(&cpu_lock)); 1436 1437 if (&plat_cpuid_to_mmu_ctx_info == NULL) 1438 sfmmu_cpuid_to_mmu_ctx_info(cp->cpu_id, &info); 1439 else 1440 plat_cpuid_to_mmu_ctx_info(cp->cpu_id, &info); 1441 1442 ASSERT(info.mmu_idx < max_mmu_ctxdoms); 1443 1444 if ((mmu_ctxp = mmu_ctxs_tbl[info.mmu_idx]) == NULL) { 1445 /* Each mmu_ctx is cacheline aligned. */ 1446 mmu_ctxp = kmem_cache_alloc(mmuctxdom_cache, KM_SLEEP); 1447 bzero(mmu_ctxp, sizeof (mmu_ctx_t)); 1448 1449 mutex_init(&mmu_ctxp->mmu_lock, NULL, MUTEX_SPIN, 1450 (void *)ipltospl(DISP_LEVEL)); 1451 mmu_ctxp->mmu_idx = info.mmu_idx; 1452 mmu_ctxp->mmu_nctxs = info.mmu_nctxs; 1453 /* 1454 * Globally for lifetime of a system, 1455 * gnum must always increase. 1456 * mmu_saved_gnum is protected by the cpu_lock. 1457 */ 1458 mmu_ctxp->mmu_gnum = mmu_saved_gnum + 1; 1459 mmu_ctxp->mmu_cnum = NUM_LOCKED_CTXS; 1460 1461 sfmmu_mmu_kstat_create(mmu_ctxp); 1462 1463 mmu_ctxs_tbl[info.mmu_idx] = mmu_ctxp; 1464 } else { 1465 ASSERT(mmu_ctxp->mmu_idx == info.mmu_idx); 1466 } 1467 1468 /* 1469 * The mmu_lock is acquired here to prevent races with 1470 * the wrap-around code. 1471 */ 1472 mutex_enter(&mmu_ctxp->mmu_lock); 1473 1474 1475 mmu_ctxp->mmu_ncpus++; 1476 CPUSET_ADD(mmu_ctxp->mmu_cpuset, cp->cpu_id); 1477 CPU_MMU_IDX(cp) = info.mmu_idx; 1478 CPU_MMU_CTXP(cp) = mmu_ctxp; 1479 1480 mutex_exit(&mmu_ctxp->mmu_lock); 1481 } 1482 1483 /* 1484 * Called to perform MMU context-related cleanup for a CPU. 1485 */ 1486 void 1487 sfmmu_cpu_cleanup(cpu_t *cp) 1488 { 1489 mmu_ctx_t *mmu_ctxp; 1490 1491 ASSERT(MUTEX_HELD(&cpu_lock)); 1492 1493 mmu_ctxp = CPU_MMU_CTXP(cp); 1494 ASSERT(mmu_ctxp != NULL); 1495 1496 /* 1497 * The mmu_lock is acquired here to prevent races with 1498 * the wrap-around code. 1499 */ 1500 mutex_enter(&mmu_ctxp->mmu_lock); 1501 1502 CPU_MMU_CTXP(cp) = NULL; 1503 1504 CPUSET_DEL(mmu_ctxp->mmu_cpuset, cp->cpu_id); 1505 if (--mmu_ctxp->mmu_ncpus == 0) { 1506 mmu_ctxs_tbl[mmu_ctxp->mmu_idx] = NULL; 1507 mutex_exit(&mmu_ctxp->mmu_lock); 1508 mutex_destroy(&mmu_ctxp->mmu_lock); 1509 1510 if (mmu_ctxp->mmu_kstat) 1511 kstat_delete(mmu_ctxp->mmu_kstat); 1512 1513 /* mmu_saved_gnum is protected by the cpu_lock. */ 1514 if (mmu_saved_gnum < mmu_ctxp->mmu_gnum) 1515 mmu_saved_gnum = mmu_ctxp->mmu_gnum; 1516 1517 kmem_cache_free(mmuctxdom_cache, mmu_ctxp); 1518 1519 return; 1520 } 1521 1522 mutex_exit(&mmu_ctxp->mmu_lock); 1523 } 1524 1525 /* 1526 * Hat_setup, makes an address space context the current active one. 1527 * In sfmmu this translates to setting the secondary context with the 1528 * corresponding context. 1529 */ 1530 void 1531 hat_setup(struct hat *sfmmup, int allocflag) 1532 { 1533 hatlock_t *hatlockp; 1534 1535 /* Init needs some special treatment. */ 1536 if (allocflag == HAT_INIT) { 1537 /* 1538 * Make sure that we have 1539 * 1. a TSB 1540 * 2. a valid ctx that doesn't get stolen after this point. 
1541 */ 1542 hatlockp = sfmmu_hat_enter(sfmmup); 1543 1544 /* 1545 * Swap in the TSB. hat_init() allocates tsbinfos without 1546 * TSBs, but we need one for init, since the kernel does some 1547 * special things to set up its stack and needs the TSB to 1548 * resolve page faults. 1549 */ 1550 sfmmu_tsb_swapin(sfmmup, hatlockp); 1551 1552 sfmmu_get_ctx(sfmmup); 1553 1554 sfmmu_hat_exit(hatlockp); 1555 } else { 1556 ASSERT(allocflag == HAT_ALLOC); 1557 1558 hatlockp = sfmmu_hat_enter(sfmmup); 1559 kpreempt_disable(); 1560 1561 CPUSET_ADD(sfmmup->sfmmu_cpusran, CPU->cpu_id); 1562 1563 /* 1564 * sfmmu_setctx_sec takes <pgsz|cnum> as a parameter, 1565 * pagesize bits don't matter in this case since we are passing 1566 * INVALID_CONTEXT to it. 1567 */ 1568 sfmmu_setctx_sec(INVALID_CONTEXT); 1569 sfmmu_clear_utsbinfo(); 1570 1571 kpreempt_enable(); 1572 sfmmu_hat_exit(hatlockp); 1573 } 1574 } 1575 1576 /* 1577 * Free all the translation resources for the specified address space. 1578 * Called from as_free when an address space is being destroyed. 1579 */ 1580 void 1581 hat_free_start(struct hat *sfmmup) 1582 { 1583 ASSERT(AS_WRITE_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 1584 ASSERT(sfmmup != ksfmmup); 1585 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 1586 1587 sfmmup->sfmmu_free = 1; 1588 } 1589 1590 void 1591 hat_free_end(struct hat *sfmmup) 1592 { 1593 int i; 1594 1595 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 1596 if (sfmmup->sfmmu_ismhat) { 1597 for (i = 0; i < mmu_page_sizes; i++) { 1598 sfmmup->sfmmu_ttecnt[i] = 0; 1599 sfmmup->sfmmu_ismttecnt[i] = 0; 1600 } 1601 } else { 1602 /* EMPTY */ 1603 ASSERT(sfmmup->sfmmu_ttecnt[TTE8K] == 0); 1604 ASSERT(sfmmup->sfmmu_ttecnt[TTE64K] == 0); 1605 ASSERT(sfmmup->sfmmu_ttecnt[TTE512K] == 0); 1606 ASSERT(sfmmup->sfmmu_ttecnt[TTE4M] == 0); 1607 ASSERT(sfmmup->sfmmu_ttecnt[TTE32M] == 0); 1608 ASSERT(sfmmup->sfmmu_ttecnt[TTE256M] == 0); 1609 } 1610 1611 if (sfmmup->sfmmu_rmstat) { 1612 hat_freestat(sfmmup->sfmmu_as, NULL); 1613 } 1614 1615 while (sfmmup->sfmmu_tsb != NULL) { 1616 struct tsb_info *next = sfmmup->sfmmu_tsb->tsb_next; 1617 sfmmu_tsbinfo_free(sfmmup->sfmmu_tsb); 1618 sfmmup->sfmmu_tsb = next; 1619 } 1620 sfmmu_free_sfmmu(sfmmup); 1621 1622 kmem_cache_free(sfmmuid_cache, sfmmup); 1623 } 1624 1625 /* 1626 * Set up any translation structures, for the specified address space, 1627 * that are needed or preferred when the process is being swapped in. 1628 */ 1629 /* ARGSUSED */ 1630 void 1631 hat_swapin(struct hat *hat) 1632 { 1633 ASSERT(hat->sfmmu_xhat_provider == NULL); 1634 } 1635 1636 /* 1637 * Free all of the translation resources, for the specified address space, 1638 * that can be freed while the process is swapped out. Called from as_swapout. 1639 * Also, free up the ctx that this process was using. 1640 */ 1641 void 1642 hat_swapout(struct hat *sfmmup) 1643 { 1644 struct hmehash_bucket *hmebp; 1645 struct hme_blk *hmeblkp; 1646 struct hme_blk *pr_hblk = NULL; 1647 struct hme_blk *nx_hblk; 1648 int i; 1649 uint64_t hblkpa, prevpa, nx_pa; 1650 struct hme_blk *list = NULL; 1651 hatlock_t *hatlockp; 1652 struct tsb_info *tsbinfop; 1653 struct free_tsb { 1654 struct free_tsb *next; 1655 struct tsb_info *tsbinfop; 1656 }; /* free list of TSBs */ 1657 struct free_tsb *freelist, *last, *next; 1658 1659 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 1660 SFMMU_STAT(sf_swapout); 1661 1662 /* 1663 * There is no way to go from an as to all its translations in sfmmu. 
1664 * Here is one of the times when we take the big hit and traverse 1665 * the hash looking for hme_blks to free up. Not only do we free up 1666 * this as hme_blks but all those that are free. We are obviously 1667 * swapping because we need memory so let's free up as much 1668 * as we can. 1669 * 1670 * Note that we don't flush TLB/TSB here -- it's not necessary 1671 * because: 1672 * 1) we free the ctx we're using and throw away the TSB(s); 1673 * 2) processes aren't runnable while being swapped out. 1674 */ 1675 ASSERT(sfmmup != KHATID); 1676 for (i = 0; i <= UHMEHASH_SZ; i++) { 1677 hmebp = &uhme_hash[i]; 1678 SFMMU_HASH_LOCK(hmebp); 1679 hmeblkp = hmebp->hmeblkp; 1680 hblkpa = hmebp->hmeh_nextpa; 1681 prevpa = 0; 1682 pr_hblk = NULL; 1683 while (hmeblkp) { 1684 1685 ASSERT(!hmeblkp->hblk_xhat_bit); 1686 1687 if ((hmeblkp->hblk_tag.htag_id == sfmmup) && 1688 !hmeblkp->hblk_shw_bit && !hmeblkp->hblk_lckcnt) { 1689 (void) sfmmu_hblk_unload(sfmmup, hmeblkp, 1690 (caddr_t)get_hblk_base(hmeblkp), 1691 get_hblk_endaddr(hmeblkp), 1692 NULL, HAT_UNLOAD); 1693 } 1694 nx_hblk = hmeblkp->hblk_next; 1695 nx_pa = hmeblkp->hblk_nextpa; 1696 if (!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) { 1697 ASSERT(!hmeblkp->hblk_lckcnt); 1698 sfmmu_hblk_hash_rm(hmebp, hmeblkp, 1699 prevpa, pr_hblk); 1700 sfmmu_hblk_free(hmebp, hmeblkp, hblkpa, &list); 1701 } else { 1702 pr_hblk = hmeblkp; 1703 prevpa = hblkpa; 1704 } 1705 hmeblkp = nx_hblk; 1706 hblkpa = nx_pa; 1707 } 1708 SFMMU_HASH_UNLOCK(hmebp); 1709 } 1710 1711 sfmmu_hblks_list_purge(&list); 1712 1713 /* 1714 * Now free up the ctx so that others can reuse it. 1715 */ 1716 hatlockp = sfmmu_hat_enter(sfmmup); 1717 1718 sfmmu_invalidate_ctx(sfmmup); 1719 1720 /* 1721 * Free TSBs, but not tsbinfos, and set SWAPPED flag. 1722 * If TSBs were never swapped in, just return. 1723 * This implies that we don't support partial swapping 1724 * of TSBs -- either all are swapped out, or none are. 1725 * 1726 * We must hold the HAT lock here to prevent racing with another 1727 * thread trying to unmap TTEs from the TSB or running the post- 1728 * relocator after relocating the TSB's memory. Unfortunately, we 1729 * can't free memory while holding the HAT lock or we could 1730 * deadlock, so we build a list of TSBs to be freed after marking 1731 * the tsbinfos as swapped out and free them after dropping the 1732 * lock. 1733 */ 1734 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) { 1735 sfmmu_hat_exit(hatlockp); 1736 return; 1737 } 1738 1739 SFMMU_FLAGS_SET(sfmmup, HAT_SWAPPED); 1740 last = freelist = NULL; 1741 for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL; 1742 tsbinfop = tsbinfop->tsb_next) { 1743 ASSERT((tsbinfop->tsb_flags & TSB_SWAPPED) == 0); 1744 1745 /* 1746 * Cast the TSB into a struct free_tsb and put it on the free 1747 * list. 1748 */ 1749 if (freelist == NULL) { 1750 last = freelist = (struct free_tsb *)tsbinfop->tsb_va; 1751 } else { 1752 last->next = (struct free_tsb *)tsbinfop->tsb_va; 1753 last = last->next; 1754 } 1755 last->next = NULL; 1756 last->tsbinfop = tsbinfop; 1757 tsbinfop->tsb_flags |= TSB_SWAPPED; 1758 /* 1759 * Zero out the TTE to clear the valid bit. 1760 * Note we can't use a value like 0xbad because we want to 1761 * ensure diagnostic bits are NEVER set on TTEs that might 1762 * be loaded. The intent is to catch any invalid access 1763 * to the swapped TSB, such as a thread running with a valid 1764 * context without first calling sfmmu_tsb_swapin() to 1765 * allocate TSB memory. 
1766 */ 1767 tsbinfop->tsb_tte.ll = 0; 1768 } 1769 1770 /* Now we can drop the lock and free the TSB memory. */ 1771 sfmmu_hat_exit(hatlockp); 1772 for (; freelist != NULL; freelist = next) { 1773 next = freelist->next; 1774 sfmmu_tsb_free(freelist->tsbinfop); 1775 } 1776 } 1777 1778 /* 1779 * Duplicate the translations of an as into another newas 1780 */ 1781 /* ARGSUSED */ 1782 int 1783 hat_dup(struct hat *hat, struct hat *newhat, caddr_t addr, size_t len, 1784 uint_t flag) 1785 { 1786 ASSERT(hat->sfmmu_xhat_provider == NULL); 1787 ASSERT((flag == 0) || (flag == HAT_DUP_ALL) || (flag == HAT_DUP_COW)); 1788 1789 if (flag == HAT_DUP_COW) { 1790 panic("hat_dup: HAT_DUP_COW not supported"); 1791 } 1792 return (0); 1793 } 1794 1795 /* 1796 * Set up addr to map to page pp with protection prot. 1797 * As an optimization we also load the TSB with the 1798 * corresponding tte but it is no big deal if the tte gets kicked out. 1799 */ 1800 void 1801 hat_memload(struct hat *hat, caddr_t addr, struct page *pp, 1802 uint_t attr, uint_t flags) 1803 { 1804 tte_t tte; 1805 1806 1807 ASSERT(hat != NULL); 1808 ASSERT(PAGE_LOCKED(pp)); 1809 ASSERT(!((uintptr_t)addr & MMU_PAGEOFFSET)); 1810 ASSERT(!(flags & ~SFMMU_LOAD_ALLFLAG)); 1811 ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR)); 1812 1813 if (PP_ISFREE(pp)) { 1814 panic("hat_memload: loading a mapping to free page %p", 1815 (void *)pp); 1816 } 1817 1818 if (hat->sfmmu_xhat_provider) { 1819 XHAT_MEMLOAD(hat, addr, pp, attr, flags); 1820 return; 1821 } 1822 1823 ASSERT((hat == ksfmmup) || 1824 AS_LOCK_HELD(hat->sfmmu_as, &hat->sfmmu_as->a_lock)); 1825 1826 if (flags & ~SFMMU_LOAD_ALLFLAG) 1827 cmn_err(CE_NOTE, "hat_memload: unsupported flags %d", 1828 flags & ~SFMMU_LOAD_ALLFLAG); 1829 1830 if (hat->sfmmu_rmstat) 1831 hat_resvstat(MMU_PAGESIZE, hat->sfmmu_as, addr); 1832 1833 #if defined(SF_ERRATA_57) 1834 if ((hat != ksfmmup) && AS_TYPE_64BIT(hat->sfmmu_as) && 1835 (addr < errata57_limit) && (attr & PROT_EXEC) && 1836 !(flags & HAT_LOAD_SHARE)) { 1837 cmn_err(CE_WARN, "hat_memload: illegal attempt to make user " 1838 " page executable"); 1839 attr &= ~PROT_EXEC; 1840 } 1841 #endif 1842 1843 sfmmu_memtte(&tte, pp->p_pagenum, attr, TTE8K); 1844 (void) sfmmu_tteload_array(hat, &tte, addr, &pp, flags); 1845 1846 /* 1847 * Check TSB and TLB page sizes. 1848 */ 1849 if ((flags & HAT_LOAD_SHARE) == 0) { 1850 sfmmu_check_page_sizes(hat, 1); 1851 } 1852 } 1853 1854 /* 1855 * hat_devload can be called to map real memory (e.g. 1856 * /dev/kmem) and even though hat_devload will determine pf is 1857 * for memory, it will be unable to get a shared lock on the 1858 * page (because someone else has it exclusively) and will 1859 * pass dp = NULL. If tteload doesn't get a non-NULL 1860 * page pointer it can't cache memory. 
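 *
 * Illustrative call only (a sketch; "dev_va" and "dev_pfn" are
 * hypothetical values a driver would supply):
 *
 *	hat_devload(kas.a_hat, dev_va, MMU_PAGESIZE, dev_pfn,
 *	    PROT_READ | PROT_WRITE | HAT_STRICTORDER,
 *	    HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
 *
 * With HAT_LOAD_NOCONSIST no page_t is looked up, so the translation
 * is made virtually uncacheable and is never placed on a mapping list.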
1861 */ 1862 void 1863 hat_devload(struct hat *hat, caddr_t addr, size_t len, pfn_t pfn, 1864 uint_t attr, int flags) 1865 { 1866 tte_t tte; 1867 struct page *pp = NULL; 1868 int use_lgpg = 0; 1869 1870 ASSERT(hat != NULL); 1871 1872 if (hat->sfmmu_xhat_provider) { 1873 XHAT_DEVLOAD(hat, addr, len, pfn, attr, flags); 1874 return; 1875 } 1876 1877 ASSERT(!(flags & ~SFMMU_LOAD_ALLFLAG)); 1878 ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR)); 1879 ASSERT((hat == ksfmmup) || 1880 AS_LOCK_HELD(hat->sfmmu_as, &hat->sfmmu_as->a_lock)); 1881 if (len == 0) 1882 panic("hat_devload: zero len"); 1883 if (flags & ~SFMMU_LOAD_ALLFLAG) 1884 cmn_err(CE_NOTE, "hat_devload: unsupported flags %d", 1885 flags & ~SFMMU_LOAD_ALLFLAG); 1886 1887 #if defined(SF_ERRATA_57) 1888 if ((hat != ksfmmup) && AS_TYPE_64BIT(hat->sfmmu_as) && 1889 (addr < errata57_limit) && (attr & PROT_EXEC) && 1890 !(flags & HAT_LOAD_SHARE)) { 1891 cmn_err(CE_WARN, "hat_devload: illegal attempt to make user " 1892 " page executable"); 1893 attr &= ~PROT_EXEC; 1894 } 1895 #endif 1896 1897 /* 1898 * If it's a memory page find its pp 1899 */ 1900 if (!(flags & HAT_LOAD_NOCONSIST) && pf_is_memory(pfn)) { 1901 pp = page_numtopp_nolock(pfn); 1902 if (pp == NULL) { 1903 flags |= HAT_LOAD_NOCONSIST; 1904 } else { 1905 if (PP_ISFREE(pp)) { 1906 panic("hat_memload: loading " 1907 "a mapping to free page %p", 1908 (void *)pp); 1909 } 1910 if (!PAGE_LOCKED(pp) && !PP_ISNORELOC(pp)) { 1911 panic("hat_memload: loading a mapping " 1912 "to unlocked relocatable page %p", 1913 (void *)pp); 1914 } 1915 ASSERT(len == MMU_PAGESIZE); 1916 } 1917 } 1918 1919 if (hat->sfmmu_rmstat) 1920 hat_resvstat(len, hat->sfmmu_as, addr); 1921 1922 if (flags & HAT_LOAD_NOCONSIST) { 1923 attr |= SFMMU_UNCACHEVTTE; 1924 use_lgpg = 1; 1925 } 1926 if (!pf_is_memory(pfn)) { 1927 attr |= SFMMU_UNCACHEPTTE | HAT_NOSYNC; 1928 use_lgpg = 1; 1929 switch (attr & HAT_ORDER_MASK) { 1930 case HAT_STRICTORDER: 1931 case HAT_UNORDERED_OK: 1932 /* 1933 * we set the side effect bit for all non 1934 * memory mappings unless merging is ok 1935 */ 1936 attr |= SFMMU_SIDEFFECT; 1937 break; 1938 case HAT_MERGING_OK: 1939 case HAT_LOADCACHING_OK: 1940 case HAT_STORECACHING_OK: 1941 break; 1942 default: 1943 panic("hat_devload: bad attr"); 1944 break; 1945 } 1946 } 1947 while (len) { 1948 if (!use_lgpg) { 1949 sfmmu_memtte(&tte, pfn, attr, TTE8K); 1950 (void) sfmmu_tteload_array(hat, &tte, addr, &pp, 1951 flags); 1952 len -= MMU_PAGESIZE; 1953 addr += MMU_PAGESIZE; 1954 pfn++; 1955 continue; 1956 } 1957 /* 1958 * try to use large pages, check va/pa alignments 1959 * Note that 32M/256M page sizes are not (yet) supported. 
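 *
 * Each arm of the cascade below applies the same test to a candidate
 * size; in terms of the macros used elsewhere in this file it amounts
 * to (sketch):
 *
 *	len >= TTEBYTES(ttesz) &&
 *	!((uintptr_t)addr & TTE_PAGE_OFFSET(ttesz)) &&
 *	!(mmu_ptob(pfn) & TTE_PAGE_OFFSET(ttesz)) &&
 *	!(disable_large_pages & (1 << ttesz))
 *
 * with the largest passing size chosen and 8K as the fallback.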
1960 */ 1961 if ((len >= MMU_PAGESIZE4M) && 1962 !((uintptr_t)addr & MMU_PAGEOFFSET4M) && 1963 !(disable_large_pages & (1 << TTE4M)) && 1964 !(mmu_ptob(pfn) & MMU_PAGEOFFSET4M)) { 1965 sfmmu_memtte(&tte, pfn, attr, TTE4M); 1966 (void) sfmmu_tteload_array(hat, &tte, addr, &pp, 1967 flags); 1968 len -= MMU_PAGESIZE4M; 1969 addr += MMU_PAGESIZE4M; 1970 pfn += MMU_PAGESIZE4M / MMU_PAGESIZE; 1971 } else if ((len >= MMU_PAGESIZE512K) && 1972 !((uintptr_t)addr & MMU_PAGEOFFSET512K) && 1973 !(disable_large_pages & (1 << TTE512K)) && 1974 !(mmu_ptob(pfn) & MMU_PAGEOFFSET512K)) { 1975 sfmmu_memtte(&tte, pfn, attr, TTE512K); 1976 (void) sfmmu_tteload_array(hat, &tte, addr, &pp, 1977 flags); 1978 len -= MMU_PAGESIZE512K; 1979 addr += MMU_PAGESIZE512K; 1980 pfn += MMU_PAGESIZE512K / MMU_PAGESIZE; 1981 } else if ((len >= MMU_PAGESIZE64K) && 1982 !((uintptr_t)addr & MMU_PAGEOFFSET64K) && 1983 !(disable_large_pages & (1 << TTE64K)) && 1984 !(mmu_ptob(pfn) & MMU_PAGEOFFSET64K)) { 1985 sfmmu_memtte(&tte, pfn, attr, TTE64K); 1986 (void) sfmmu_tteload_array(hat, &tte, addr, &pp, 1987 flags); 1988 len -= MMU_PAGESIZE64K; 1989 addr += MMU_PAGESIZE64K; 1990 pfn += MMU_PAGESIZE64K / MMU_PAGESIZE; 1991 } else { 1992 sfmmu_memtte(&tte, pfn, attr, TTE8K); 1993 (void) sfmmu_tteload_array(hat, &tte, addr, &pp, 1994 flags); 1995 len -= MMU_PAGESIZE; 1996 addr += MMU_PAGESIZE; 1997 pfn++; 1998 } 1999 } 2000 2001 /* 2002 * Check TSB and TLB page sizes. 2003 */ 2004 if ((flags & HAT_LOAD_SHARE) == 0) { 2005 sfmmu_check_page_sizes(hat, 1); 2006 } 2007 } 2008 2009 /* 2010 * Map the largest extend possible out of the page array. The array may NOT 2011 * be in order. The largest possible mapping a page can have 2012 * is specified in the p_szc field. The p_szc field 2013 * cannot change as long as there any mappings (large or small) 2014 * to any of the pages that make up the large page. (ie. any 2015 * promotion/demotion of page size is not up to the hat but up to 2016 * the page free list manager). The array 2017 * should consist of properly aligned contigous pages that are 2018 * part of a big page for a large mapping to be created. 2019 */ 2020 void 2021 hat_memload_array(struct hat *hat, caddr_t addr, size_t len, 2022 struct page **pps, uint_t attr, uint_t flags) 2023 { 2024 int ttesz; 2025 size_t mapsz; 2026 pgcnt_t numpg, npgs; 2027 tte_t tte; 2028 page_t *pp; 2029 int large_pages_disable; 2030 2031 ASSERT(!((uintptr_t)addr & MMU_PAGEOFFSET)); 2032 2033 if (hat->sfmmu_xhat_provider) { 2034 XHAT_MEMLOAD_ARRAY(hat, addr, len, pps, attr, flags); 2035 return; 2036 } 2037 2038 if (hat->sfmmu_rmstat) 2039 hat_resvstat(len, hat->sfmmu_as, addr); 2040 2041 #if defined(SF_ERRATA_57) 2042 if ((hat != ksfmmup) && AS_TYPE_64BIT(hat->sfmmu_as) && 2043 (addr < errata57_limit) && (attr & PROT_EXEC) && 2044 !(flags & HAT_LOAD_SHARE)) { 2045 cmn_err(CE_WARN, "hat_memload_array: illegal attempt to make " 2046 "user page executable"); 2047 attr &= ~PROT_EXEC; 2048 } 2049 #endif 2050 2051 /* Get number of pages */ 2052 npgs = len >> MMU_PAGESHIFT; 2053 2054 if (flags & HAT_LOAD_SHARE) { 2055 large_pages_disable = disable_ism_large_pages; 2056 } else { 2057 large_pages_disable = disable_large_pages; 2058 } 2059 2060 if (npgs < NHMENTS || large_pages_disable == LARGE_PAGES_OFF) { 2061 sfmmu_memload_batchsmall(hat, addr, pps, attr, flags, npgs); 2062 return; 2063 } 2064 2065 while (npgs >= NHMENTS) { 2066 pp = *pps; 2067 for (ttesz = pp->p_szc; ttesz != TTE8K; ttesz--) { 2068 /* 2069 * Check if this page size is disabled. 
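 *
 * Candidate sizes are tried from pp->p_szc downward; e.g. a
 * constituent page of a 4M large page whose virtual address is only
 * 512K aligned fails the TTE4M alignment check and is retried at
 * TTE512K, and only when every large size fails (ttesz reaches TTE8K)
 * does the small-page batching fallback below take over.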
2070 */ 2071 if (large_pages_disable & (1 << ttesz)) 2072 continue; 2073 2074 numpg = TTEPAGES(ttesz); 2075 mapsz = numpg << MMU_PAGESHIFT; 2076 if ((npgs >= numpg) && 2077 IS_P2ALIGNED(addr, mapsz) && 2078 IS_P2ALIGNED(pp->p_pagenum, numpg)) { 2079 /* 2080 * At this point we have enough pages and 2081 * we know the virtual address and the pfn 2082 * are properly aligned. We still need 2083 * to check for physical contiguity but since 2084 * it is very likely that this is the case 2085 * we will assume they are so and undo 2086 * the request if necessary. It would 2087 * be great if we could get a hint flag 2088 * like HAT_CONTIG which would tell us 2089 * the pages are contigous for sure. 2090 */ 2091 sfmmu_memtte(&tte, (*pps)->p_pagenum, 2092 attr, ttesz); 2093 if (!sfmmu_tteload_array(hat, &tte, addr, 2094 pps, flags)) { 2095 break; 2096 } 2097 } 2098 } 2099 if (ttesz == TTE8K) { 2100 /* 2101 * We were not able to map array using a large page 2102 * batch a hmeblk or fraction at a time. 2103 */ 2104 numpg = ((uintptr_t)addr >> MMU_PAGESHIFT) 2105 & (NHMENTS-1); 2106 numpg = NHMENTS - numpg; 2107 ASSERT(numpg <= npgs); 2108 mapsz = numpg * MMU_PAGESIZE; 2109 sfmmu_memload_batchsmall(hat, addr, pps, attr, flags, 2110 numpg); 2111 } 2112 addr += mapsz; 2113 npgs -= numpg; 2114 pps += numpg; 2115 } 2116 2117 if (npgs) { 2118 sfmmu_memload_batchsmall(hat, addr, pps, attr, flags, npgs); 2119 } 2120 2121 /* 2122 * Check TSB and TLB page sizes. 2123 */ 2124 if ((flags & HAT_LOAD_SHARE) == 0) { 2125 sfmmu_check_page_sizes(hat, 1); 2126 } 2127 } 2128 2129 /* 2130 * Function tries to batch 8K pages into the same hme blk. 2131 */ 2132 static void 2133 sfmmu_memload_batchsmall(struct hat *hat, caddr_t vaddr, page_t **pps, 2134 uint_t attr, uint_t flags, pgcnt_t npgs) 2135 { 2136 tte_t tte; 2137 page_t *pp; 2138 struct hmehash_bucket *hmebp; 2139 struct hme_blk *hmeblkp; 2140 int index; 2141 2142 while (npgs) { 2143 /* 2144 * Acquire the hash bucket. 2145 */ 2146 hmebp = sfmmu_tteload_acquire_hashbucket(hat, vaddr, TTE8K); 2147 ASSERT(hmebp); 2148 2149 /* 2150 * Find the hment block. 2151 */ 2152 hmeblkp = sfmmu_tteload_find_hmeblk(hat, hmebp, vaddr, 2153 TTE8K, flags); 2154 ASSERT(hmeblkp); 2155 2156 do { 2157 /* 2158 * Make the tte. 2159 */ 2160 pp = *pps; 2161 sfmmu_memtte(&tte, pp->p_pagenum, attr, TTE8K); 2162 2163 /* 2164 * Add the translation. 2165 */ 2166 (void) sfmmu_tteload_addentry(hat, hmeblkp, &tte, 2167 vaddr, pps, flags); 2168 2169 /* 2170 * Goto next page. 2171 */ 2172 pps++; 2173 npgs--; 2174 2175 /* 2176 * Goto next address. 2177 */ 2178 vaddr += MMU_PAGESIZE; 2179 2180 /* 2181 * Don't crossover into a different hmentblk. 2182 */ 2183 index = (int)(((uintptr_t)vaddr >> MMU_PAGESHIFT) & 2184 (NHMENTS-1)); 2185 2186 } while (index != 0 && npgs != 0); 2187 2188 /* 2189 * Release the hash bucket. 
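 *
 * The do/while above exits when the hment index wraps to zero, i.e.
 * when vaddr crosses into the range served by the next hme_blk, so
 * each pass of the outer loop loads at most one hme_blk's worth of
 * 8K pages under a single bucket lock (released just below).  For
 * example, with NHMENTS == 8 a 20-page load starting on an hme_blk
 * boundary takes the bucket lock three times (8 + 8 + 4 pages).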
2190 */ 2191 2192 sfmmu_tteload_release_hashbucket(hmebp); 2193 } 2194 } 2195 2196 /* 2197 * Construct a tte for a page: 2198 * 2199 * tte_valid = 1 2200 * tte_size2 = size & TTE_SZ2_BITS (Panther and Olympus-C only) 2201 * tte_size = size 2202 * tte_nfo = attr & HAT_NOFAULT 2203 * tte_ie = attr & HAT_STRUCTURE_LE 2204 * tte_hmenum = hmenum 2205 * tte_pahi = pp->p_pagenum >> TTE_PASHIFT; 2206 * tte_palo = pp->p_pagenum & TTE_PALOMASK; 2207 * tte_ref = 1 (optimization) 2208 * tte_wr_perm = attr & PROT_WRITE; 2209 * tte_no_sync = attr & HAT_NOSYNC 2210 * tte_lock = attr & SFMMU_LOCKTTE 2211 * tte_cp = !(attr & SFMMU_UNCACHEPTTE) 2212 * tte_cv = !(attr & SFMMU_UNCACHEVTTE) 2213 * tte_e = attr & SFMMU_SIDEFFECT 2214 * tte_priv = !(attr & PROT_USER) 2215 * tte_hwwr = if nosync is set and it is writable we set the mod bit (opt) 2216 * tte_glb = 0 2217 */ 2218 void 2219 sfmmu_memtte(tte_t *ttep, pfn_t pfn, uint_t attr, int tte_sz) 2220 { 2221 ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR)); 2222 2223 ttep->tte_inthi = MAKE_TTE_INTHI(pfn, attr, tte_sz, 0 /* hmenum */); 2224 ttep->tte_intlo = MAKE_TTE_INTLO(pfn, attr, tte_sz, 0 /* hmenum */); 2225 2226 if (TTE_IS_NOSYNC(ttep)) { 2227 TTE_SET_REF(ttep); 2228 if (TTE_IS_WRITABLE(ttep)) { 2229 TTE_SET_MOD(ttep); 2230 } 2231 } 2232 if (TTE_IS_NFO(ttep) && TTE_IS_EXECUTABLE(ttep)) { 2233 panic("sfmmu_memtte: can't set both NFO and EXEC bits"); 2234 } 2235 } 2236 2237 /* 2238 * This function will add a translation to the hme_blk and allocate the 2239 * hme_blk if one does not exist. 2240 * If a page structure is specified then it will add the 2241 * corresponding hment to the mapping list. 2242 * It will also update the hmenum field for the tte. 2243 */ 2244 void 2245 sfmmu_tteload(struct hat *sfmmup, tte_t *ttep, caddr_t vaddr, page_t *pp, 2246 uint_t flags) 2247 { 2248 (void) sfmmu_tteload_array(sfmmup, ttep, vaddr, &pp, flags); 2249 } 2250 2251 /* 2252 * Load (ttep != NULL) or unload (ttep == NULL) one entry in the TSB. 2253 * Assumes that a particular page size may only be resident in one TSB. 2254 */ 2255 static void 2256 sfmmu_mod_tsb(sfmmu_t *sfmmup, caddr_t vaddr, tte_t *ttep, int ttesz) 2257 { 2258 struct tsb_info *tsbinfop = NULL; 2259 uint64_t tag; 2260 struct tsbe *tsbe_addr; 2261 uint64_t tsb_base; 2262 uint_t tsb_size; 2263 int vpshift = MMU_PAGESHIFT; 2264 int phys = 0; 2265 2266 if (sfmmup == ksfmmup) { /* No support for 32/256M ksfmmu pages */ 2267 phys = ktsb_phys; 2268 if (ttesz >= TTE4M) { 2269 #ifndef sun4v 2270 ASSERT((ttesz != TTE32M) && (ttesz != TTE256M)); 2271 #endif 2272 tsb_base = (phys)? ktsb4m_pbase : (uint64_t)ktsb4m_base; 2273 tsb_size = ktsb4m_szcode; 2274 } else { 2275 tsb_base = (phys)? ktsb_pbase : (uint64_t)ktsb_base; 2276 tsb_size = ktsb_szcode; 2277 } 2278 } else { 2279 SFMMU_GET_TSBINFO(tsbinfop, sfmmup, ttesz); 2280 2281 /* 2282 * If there isn't a TSB for this page size, or the TSB is 2283 * swapped out, there is nothing to do. Note that the latter 2284 * case seems impossible but can occur if hat_pageunload() 2285 * is called on an ISM mapping while the process is swapped 2286 * out. 2287 */ 2288 if (tsbinfop == NULL || (tsbinfop->tsb_flags & TSB_SWAPPED)) 2289 return; 2290 2291 /* 2292 * If another thread is in the middle of relocating a TSB 2293 * we can't unload the entry so set a flag so that the 2294 * TSB will be flushed before it can be accessed by the 2295 * process. 
2296 */ 2297 if ((tsbinfop->tsb_flags & TSB_RELOC_FLAG) != 0) { 2298 if (ttep == NULL) 2299 tsbinfop->tsb_flags |= TSB_FLUSH_NEEDED; 2300 return; 2301 } 2302 #if defined(UTSB_PHYS) 2303 phys = 1; 2304 tsb_base = (uint64_t)tsbinfop->tsb_pa; 2305 #else 2306 tsb_base = (uint64_t)tsbinfop->tsb_va; 2307 #endif 2308 tsb_size = tsbinfop->tsb_szc; 2309 } 2310 if (ttesz >= TTE4M) 2311 vpshift = MMU_PAGESHIFT4M; 2312 2313 tsbe_addr = sfmmu_get_tsbe(tsb_base, vaddr, vpshift, tsb_size); 2314 tag = sfmmu_make_tsbtag(vaddr); 2315 2316 if (ttep == NULL) { 2317 sfmmu_unload_tsbe(tsbe_addr, tag, phys); 2318 } else { 2319 if (ttesz >= TTE4M) { 2320 SFMMU_STAT(sf_tsb_load4m); 2321 } else { 2322 SFMMU_STAT(sf_tsb_load8k); 2323 } 2324 2325 sfmmu_load_tsbe(tsbe_addr, tag, ttep, phys); 2326 } 2327 } 2328 2329 /* 2330 * Unmap all entries from [start, end) matching the given page size. 2331 * 2332 * This function is used primarily to unmap replicated 64K or 512K entries 2333 * from the TSB that are inserted using the base page size TSB pointer, but 2334 * it may also be called to unmap a range of addresses from the TSB. 2335 */ 2336 void 2337 sfmmu_unload_tsb_range(sfmmu_t *sfmmup, caddr_t start, caddr_t end, int ttesz) 2338 { 2339 struct tsb_info *tsbinfop; 2340 uint64_t tag; 2341 struct tsbe *tsbe_addr; 2342 caddr_t vaddr; 2343 uint64_t tsb_base; 2344 int vpshift, vpgsz; 2345 uint_t tsb_size; 2346 int phys = 0; 2347 2348 /* 2349 * Assumptions: 2350 * If ttesz == 8K, 64K or 512K, we walk through the range 8K 2351 * at a time shooting down any valid entries we encounter. 2352 * 2353 * If ttesz >= 4M we walk the range 4M at a time shooting 2354 * down any valid mappings we find. 2355 */ 2356 if (sfmmup == ksfmmup) { 2357 phys = ktsb_phys; 2358 if (ttesz >= TTE4M) { 2359 #ifndef sun4v 2360 ASSERT((ttesz != TTE32M) && (ttesz != TTE256M)); 2361 #endif 2362 tsb_base = (phys)? ktsb4m_pbase : (uint64_t)ktsb4m_base; 2363 tsb_size = ktsb4m_szcode; 2364 } else { 2365 tsb_base = (phys)? ktsb_pbase : (uint64_t)ktsb_base; 2366 tsb_size = ktsb_szcode; 2367 } 2368 } else { 2369 SFMMU_GET_TSBINFO(tsbinfop, sfmmup, ttesz); 2370 2371 /* 2372 * If there isn't a TSB for this page size, or the TSB is 2373 * swapped out, there is nothing to do. Note that the latter 2374 * case seems impossible but can occur if hat_pageunload() 2375 * is called on an ISM mapping while the process is swapped 2376 * out. 2377 */ 2378 if (tsbinfop == NULL || (tsbinfop->tsb_flags & TSB_SWAPPED)) 2379 return; 2380 2381 /* 2382 * If another thread is in the middle of relocating a TSB 2383 * we can't unload the entry so set a flag so that the 2384 * TSB will be flushed before it can be accessed by the 2385 * process. 2386 */ 2387 if ((tsbinfop->tsb_flags & TSB_RELOC_FLAG) != 0) { 2388 tsbinfop->tsb_flags |= TSB_FLUSH_NEEDED; 2389 return; 2390 } 2391 #if defined(UTSB_PHYS) 2392 phys = 1; 2393 tsb_base = (uint64_t)tsbinfop->tsb_pa; 2394 #else 2395 tsb_base = (uint64_t)tsbinfop->tsb_va; 2396 #endif 2397 tsb_size = tsbinfop->tsb_szc; 2398 } 2399 if (ttesz >= TTE4M) { 2400 vpshift = MMU_PAGESHIFT4M; 2401 vpgsz = MMU_PAGESIZE4M; 2402 } else { 2403 vpshift = MMU_PAGESHIFT; 2404 vpgsz = MMU_PAGESIZE; 2405 } 2406 2407 for (vaddr = start; vaddr < end; vaddr += vpgsz) { 2408 tag = sfmmu_make_tsbtag(vaddr); 2409 tsbe_addr = sfmmu_get_tsbe(tsb_base, vaddr, vpshift, tsb_size); 2410 sfmmu_unload_tsbe(tsbe_addr, tag, phys); 2411 } 2412 } 2413 2414 /* 2415 * Select the optimum TSB size given the number of mappings 2416 * that need to be cached. 
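 *
 * The loop below picks the smallest size code szc for which
 * pgcnt <= SFMMU_RSS_TSBSIZE(szc), capped at tsb_max_growsize;
 * equivalently (sketch):
 *
 *	for (szc = 0; szc < tsb_max_growsize; szc++)
 *		if (pgcnt <= SFMMU_RSS_TSBSIZE(szc))
 *			break;
 *	return (szc);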
2417 */ 2418 static int 2419 sfmmu_select_tsb_szc(pgcnt_t pgcnt) 2420 { 2421 int szc = 0; 2422 2423 #ifdef DEBUG 2424 if (tsb_grow_stress) { 2425 uint32_t randval = (uint32_t)gettick() >> 4; 2426 return (randval % (tsb_max_growsize + 1)); 2427 } 2428 #endif /* DEBUG */ 2429 2430 while ((szc < tsb_max_growsize) && (pgcnt > SFMMU_RSS_TSBSIZE(szc))) 2431 szc++; 2432 return (szc); 2433 } 2434 2435 /* 2436 * This function will add a translation to the hme_blk and allocate the 2437 * hme_blk if one does not exist. 2438 * If a page structure is specified then it will add the 2439 * corresponding hment to the mapping list. 2440 * It will also update the hmenum field for the tte. 2441 * Furthermore, it attempts to create a large page translation 2442 * for <addr,hat> at page array pps. It assumes addr and first 2443 * pp is correctly aligned. It returns 0 if successful and 1 otherwise. 2444 */ 2445 static int 2446 sfmmu_tteload_array(sfmmu_t *sfmmup, tte_t *ttep, caddr_t vaddr, 2447 page_t **pps, uint_t flags) 2448 { 2449 struct hmehash_bucket *hmebp; 2450 struct hme_blk *hmeblkp; 2451 int ret; 2452 uint_t size; 2453 2454 /* 2455 * Get mapping size. 2456 */ 2457 size = TTE_CSZ(ttep); 2458 ASSERT(!((uintptr_t)vaddr & TTE_PAGE_OFFSET(size))); 2459 2460 /* 2461 * Acquire the hash bucket. 2462 */ 2463 hmebp = sfmmu_tteload_acquire_hashbucket(sfmmup, vaddr, size); 2464 ASSERT(hmebp); 2465 2466 /* 2467 * Find the hment block. 2468 */ 2469 hmeblkp = sfmmu_tteload_find_hmeblk(sfmmup, hmebp, vaddr, size, flags); 2470 ASSERT(hmeblkp); 2471 2472 /* 2473 * Add the translation. 2474 */ 2475 ret = sfmmu_tteload_addentry(sfmmup, hmeblkp, ttep, vaddr, pps, flags); 2476 2477 /* 2478 * Release the hash bucket. 2479 */ 2480 sfmmu_tteload_release_hashbucket(hmebp); 2481 2482 return (ret); 2483 } 2484 2485 /* 2486 * Function locks and returns a pointer to the hash bucket for vaddr and size. 2487 */ 2488 static struct hmehash_bucket * 2489 sfmmu_tteload_acquire_hashbucket(sfmmu_t *sfmmup, caddr_t vaddr, int size) 2490 { 2491 struct hmehash_bucket *hmebp; 2492 int hmeshift; 2493 2494 hmeshift = HME_HASH_SHIFT(size); 2495 2496 hmebp = HME_HASH_FUNCTION(sfmmup, vaddr, hmeshift); 2497 2498 SFMMU_HASH_LOCK(hmebp); 2499 2500 return (hmebp); 2501 } 2502 2503 /* 2504 * Function returns a pointer to an hmeblk in the hash bucket, hmebp. If the 2505 * hmeblk doesn't exists for the [sfmmup, vaddr & size] signature, a hmeblk is 2506 * allocated. 2507 */ 2508 static struct hme_blk * 2509 sfmmu_tteload_find_hmeblk(sfmmu_t *sfmmup, struct hmehash_bucket *hmebp, 2510 caddr_t vaddr, uint_t size, uint_t flags) 2511 { 2512 hmeblk_tag hblktag; 2513 int hmeshift; 2514 struct hme_blk *hmeblkp, *pr_hblk, *list = NULL; 2515 uint64_t hblkpa, prevpa; 2516 struct kmem_cache *sfmmu_cache; 2517 uint_t forcefree; 2518 2519 hblktag.htag_id = sfmmup; 2520 hmeshift = HME_HASH_SHIFT(size); 2521 hblktag.htag_bspage = HME_HASH_BSPAGE(vaddr, hmeshift); 2522 hblktag.htag_rehash = HME_HASH_REHASH(size); 2523 2524 ttearray_realloc: 2525 2526 HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, hblkpa, 2527 pr_hblk, prevpa, &list); 2528 2529 /* 2530 * We block until hblk_reserve_lock is released; it's held by 2531 * the thread, temporarily using hblk_reserve, until hblk_reserve is 2532 * replaced by a hblk from sfmmu8_cache. 
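 *
 * The wait below is the usual "bounce off the mutex" idiom: taking
 * and immediately releasing hblk_reserve_lock simply blocks this
 * thread until the owner has finished swapping hblk_reserve out of
 * the hash, after which the hash lock is retaken and the search is
 * restarted from ttearray_realloc.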
2533 */ 2534 if (hmeblkp == (struct hme_blk *)hblk_reserve && 2535 hblk_reserve_thread != curthread) { 2536 SFMMU_HASH_UNLOCK(hmebp); 2537 mutex_enter(&hblk_reserve_lock); 2538 mutex_exit(&hblk_reserve_lock); 2539 SFMMU_STAT(sf_hblk_reserve_hit); 2540 SFMMU_HASH_LOCK(hmebp); 2541 goto ttearray_realloc; 2542 } 2543 2544 if (hmeblkp == NULL) { 2545 hmeblkp = sfmmu_hblk_alloc(sfmmup, vaddr, hmebp, size, 2546 hblktag, flags); 2547 } else { 2548 /* 2549 * It is possible for 8k and 64k hblks to collide since they 2550 * have the same rehash value. This is because we 2551 * lazily free hblks and 8K/64K blks could be lingering. 2552 * If we find size mismatch we free the block and & try again. 2553 */ 2554 if (get_hblk_ttesz(hmeblkp) != size) { 2555 ASSERT(!hmeblkp->hblk_vcnt); 2556 ASSERT(!hmeblkp->hblk_hmecnt); 2557 sfmmu_hblk_hash_rm(hmebp, hmeblkp, prevpa, pr_hblk); 2558 sfmmu_hblk_free(hmebp, hmeblkp, hblkpa, &list); 2559 goto ttearray_realloc; 2560 } 2561 if (hmeblkp->hblk_shw_bit) { 2562 /* 2563 * if the hblk was previously used as a shadow hblk then 2564 * we will change it to a normal hblk 2565 */ 2566 if (hmeblkp->hblk_shw_mask) { 2567 sfmmu_shadow_hcleanup(sfmmup, hmeblkp, hmebp); 2568 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 2569 goto ttearray_realloc; 2570 } else { 2571 hmeblkp->hblk_shw_bit = 0; 2572 } 2573 } 2574 SFMMU_STAT(sf_hblk_hit); 2575 } 2576 2577 /* 2578 * hat_memload() should never call kmem_cache_free(); see block 2579 * comment showing the stacktrace in sfmmu_hblk_alloc(); 2580 * enqueue each hblk in the list to reserve list if it's created 2581 * from sfmmu8_cache *and* sfmmup == KHATID. 2582 */ 2583 forcefree = (sfmmup == KHATID) ? 1 : 0; 2584 while ((pr_hblk = list) != NULL) { 2585 list = pr_hblk->hblk_next; 2586 sfmmu_cache = get_hblk_cache(pr_hblk); 2587 if ((sfmmu_cache == sfmmu8_cache) && 2588 sfmmu_put_free_hblk(pr_hblk, forcefree)) 2589 continue; 2590 2591 ASSERT(sfmmup != KHATID); 2592 kmem_cache_free(sfmmu_cache, pr_hblk); 2593 } 2594 2595 ASSERT(get_hblk_ttesz(hmeblkp) == size); 2596 ASSERT(!hmeblkp->hblk_shw_bit); 2597 2598 return (hmeblkp); 2599 } 2600 2601 /* 2602 * Function adds a tte entry into the hmeblk. It returns 0 if successful and 1 2603 * otherwise. 2604 */ 2605 static int 2606 sfmmu_tteload_addentry(sfmmu_t *sfmmup, struct hme_blk *hmeblkp, tte_t *ttep, 2607 caddr_t vaddr, page_t **pps, uint_t flags) 2608 { 2609 page_t *pp = *pps; 2610 int hmenum, size, remap; 2611 tte_t tteold, flush_tte; 2612 #ifdef DEBUG 2613 tte_t orig_old; 2614 #endif /* DEBUG */ 2615 struct sf_hment *sfhme; 2616 kmutex_t *pml, *pmtx; 2617 hatlock_t *hatlockp; 2618 2619 /* 2620 * remove this panic when we decide to let user virtual address 2621 * space be >= USERLIMIT. 2622 */ 2623 if (!TTE_IS_PRIVILEGED(ttep) && vaddr >= (caddr_t)USERLIMIT) 2624 panic("user addr %p in kernel space", vaddr); 2625 #if defined(TTE_IS_GLOBAL) 2626 if (TTE_IS_GLOBAL(ttep)) 2627 panic("sfmmu_tteload: creating global tte"); 2628 #endif 2629 2630 #ifdef DEBUG 2631 if (pf_is_memory(sfmmu_ttetopfn(ttep, vaddr)) && 2632 !TTE_IS_PCACHEABLE(ttep) && !sfmmu_allow_nc_trans) 2633 panic("sfmmu_tteload: non cacheable memory tte"); 2634 #endif /* DEBUG */ 2635 2636 if ((flags & HAT_LOAD_SHARE) || !TTE_IS_REF(ttep) || 2637 !TTE_IS_MOD(ttep)) { 2638 /* 2639 * Don't load TSB for dummy as in ISM. Also don't preload 2640 * the TSB if the TTE isn't writable since we're likely to 2641 * fault on it again -- preloading can be fairly expensive. 
2642 */ 2643 flags |= SFMMU_NO_TSBLOAD; 2644 } 2645 2646 size = TTE_CSZ(ttep); 2647 switch (size) { 2648 case TTE8K: 2649 SFMMU_STAT(sf_tteload8k); 2650 break; 2651 case TTE64K: 2652 SFMMU_STAT(sf_tteload64k); 2653 break; 2654 case TTE512K: 2655 SFMMU_STAT(sf_tteload512k); 2656 break; 2657 case TTE4M: 2658 SFMMU_STAT(sf_tteload4m); 2659 break; 2660 case (TTE32M): 2661 SFMMU_STAT(sf_tteload32m); 2662 ASSERT(mmu_page_sizes == max_mmu_page_sizes); 2663 break; 2664 case (TTE256M): 2665 SFMMU_STAT(sf_tteload256m); 2666 ASSERT(mmu_page_sizes == max_mmu_page_sizes); 2667 break; 2668 } 2669 2670 ASSERT(!((uintptr_t)vaddr & TTE_PAGE_OFFSET(size))); 2671 2672 HBLKTOHME_IDX(sfhme, hmeblkp, vaddr, hmenum); 2673 2674 /* 2675 * Need to grab mlist lock here so that pageunload 2676 * will not change tte behind us. 2677 */ 2678 if (pp) { 2679 pml = sfmmu_mlist_enter(pp); 2680 } 2681 2682 sfmmu_copytte(&sfhme->hme_tte, &tteold); 2683 /* 2684 * Look for corresponding hment and if valid verify 2685 * pfns are equal. 2686 */ 2687 remap = TTE_IS_VALID(&tteold); 2688 if (remap) { 2689 pfn_t new_pfn, old_pfn; 2690 2691 old_pfn = TTE_TO_PFN(vaddr, &tteold); 2692 new_pfn = TTE_TO_PFN(vaddr, ttep); 2693 2694 if (flags & HAT_LOAD_REMAP) { 2695 /* make sure we are remapping same type of pages */ 2696 if (pf_is_memory(old_pfn) != pf_is_memory(new_pfn)) { 2697 panic("sfmmu_tteload - tte remap io<->memory"); 2698 } 2699 if (old_pfn != new_pfn && 2700 (pp != NULL || sfhme->hme_page != NULL)) { 2701 panic("sfmmu_tteload - tte remap pp != NULL"); 2702 } 2703 } else if (old_pfn != new_pfn) { 2704 panic("sfmmu_tteload - tte remap, hmeblkp 0x%p", 2705 (void *)hmeblkp); 2706 } 2707 ASSERT(TTE_CSZ(&tteold) == TTE_CSZ(ttep)); 2708 } 2709 2710 if (pp) { 2711 if (size == TTE8K) { 2712 /* 2713 * Handle VAC consistency 2714 */ 2715 if (!remap && (cache & CACHE_VAC) && !PP_ISNC(pp)) { 2716 sfmmu_vac_conflict(sfmmup, vaddr, pp); 2717 } 2718 2719 if (TTE_IS_WRITABLE(ttep) && PP_ISRO(pp)) { 2720 pmtx = sfmmu_page_enter(pp); 2721 PP_CLRRO(pp); 2722 sfmmu_page_exit(pmtx); 2723 } else if (!PP_ISMAPPED(pp) && 2724 (!TTE_IS_WRITABLE(ttep)) && !(PP_ISMOD(pp))) { 2725 pmtx = sfmmu_page_enter(pp); 2726 if (!(PP_ISMOD(pp))) { 2727 PP_SETRO(pp); 2728 } 2729 sfmmu_page_exit(pmtx); 2730 } 2731 2732 } else if (sfmmu_pagearray_setup(vaddr, pps, ttep, remap)) { 2733 /* 2734 * sfmmu_pagearray_setup failed so return 2735 */ 2736 sfmmu_mlist_exit(pml); 2737 return (1); 2738 } 2739 } 2740 2741 /* 2742 * Make sure hment is not on a mapping list. 2743 */ 2744 ASSERT(remap || (sfhme->hme_page == NULL)); 2745 2746 /* if it is not a remap then hme->next better be NULL */ 2747 ASSERT((!remap) ? sfhme->hme_next == NULL : 1); 2748 2749 if (flags & HAT_LOAD_LOCK) { 2750 if (((int)hmeblkp->hblk_lckcnt + 1) >= MAX_HBLK_LCKCNT) { 2751 panic("too high lckcnt-hmeblk %p", 2752 (void *)hmeblkp); 2753 } 2754 atomic_add_16(&hmeblkp->hblk_lckcnt, 1); 2755 2756 HBLK_STACK_TRACE(hmeblkp, HBLK_LOCK); 2757 } 2758 2759 if (pp && PP_ISNC(pp)) { 2760 /* 2761 * If the physical page is marked to be uncacheable, like 2762 * by a vac conflict, make sure the new mapping is also 2763 * uncacheable. 
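 *
 * Worked example (assuming Spitfire's 16K direct-mapped, virtually
 * indexed D-cache and 8K pages, i.e. two virtual colors): if the page
 * is already mapped at a virtual address of color 0 and a new mapping
 * arrives at color 1, the aliases would index different cache lines,
 * so the page has been flagged non-cacheable (PP_ISNC) and every
 * mapping to it, including the one being loaded here, must be made
 * virtually uncacheable as well.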
2764 */ 2765 TTE_CLR_VCACHEABLE(ttep); 2766 ASSERT(PP_GET_VCOLOR(pp) == NO_VCOLOR); 2767 } 2768 ttep->tte_hmenum = hmenum; 2769 2770 #ifdef DEBUG 2771 orig_old = tteold; 2772 #endif /* DEBUG */ 2773 2774 while (sfmmu_modifytte_try(&tteold, ttep, &sfhme->hme_tte) < 0) { 2775 if ((sfmmup == KHATID) && 2776 (flags & (HAT_LOAD_LOCK | HAT_LOAD_REMAP))) { 2777 sfmmu_copytte(&sfhme->hme_tte, &tteold); 2778 } 2779 #ifdef DEBUG 2780 chk_tte(&orig_old, &tteold, ttep, hmeblkp); 2781 #endif /* DEBUG */ 2782 } 2783 2784 if (!TTE_IS_VALID(&tteold)) { 2785 2786 atomic_add_16(&hmeblkp->hblk_vcnt, 1); 2787 atomic_add_long(&sfmmup->sfmmu_ttecnt[size], 1); 2788 2789 /* 2790 * HAT_RELOAD_SHARE has been deprecated with lpg DISM. 2791 */ 2792 2793 if (size > TTE8K && (flags & HAT_LOAD_SHARE) == 0 && 2794 sfmmup != ksfmmup) { 2795 /* 2796 * If this is the first large mapping for the process 2797 * we must force any CPUs running this process to TL=0 2798 * where they will reload the HAT flags from the 2799 * tsbmiss area. This is necessary to make the large 2800 * mappings we are about to load visible to those CPUs; 2801 * otherwise they'll loop forever calling pagefault() 2802 * since we don't search large hash chains by default. 2803 */ 2804 hatlockp = sfmmu_hat_enter(sfmmup); 2805 if (size == TTE512K && 2806 !SFMMU_FLAGS_ISSET(sfmmup, HAT_512K_FLAG)) { 2807 SFMMU_FLAGS_SET(sfmmup, HAT_512K_FLAG); 2808 sfmmu_sync_mmustate(sfmmup); 2809 } else if (size == TTE4M && 2810 !SFMMU_FLAGS_ISSET(sfmmup, HAT_4M_FLAG)) { 2811 SFMMU_FLAGS_SET(sfmmup, HAT_4M_FLAG); 2812 sfmmu_sync_mmustate(sfmmup); 2813 } else if (size == TTE64K && 2814 !SFMMU_FLAGS_ISSET(sfmmup, HAT_64K_FLAG)) { 2815 SFMMU_FLAGS_SET(sfmmup, HAT_64K_FLAG); 2816 /* no sync mmustate; 64K shares 8K hashes */ 2817 } else if (mmu_page_sizes == max_mmu_page_sizes) { 2818 if (size == TTE32M && 2819 !SFMMU_FLAGS_ISSET(sfmmup, HAT_32M_FLAG)) { 2820 SFMMU_FLAGS_SET(sfmmup, HAT_32M_FLAG); 2821 sfmmu_sync_mmustate(sfmmup); 2822 } else if (size == TTE256M && 2823 !SFMMU_FLAGS_ISSET(sfmmup, HAT_256M_FLAG)) { 2824 SFMMU_FLAGS_SET(sfmmup, HAT_256M_FLAG); 2825 sfmmu_sync_mmustate(sfmmup); 2826 } 2827 } 2828 if (size >= TTE4M && (flags & HAT_LOAD_TEXT) && 2829 !SFMMU_FLAGS_ISSET(sfmmup, HAT_4MTEXT_FLAG)) { 2830 SFMMU_FLAGS_SET(sfmmup, HAT_4MTEXT_FLAG); 2831 } 2832 sfmmu_hat_exit(hatlockp); 2833 } 2834 } 2835 ASSERT(TTE_IS_VALID(&sfhme->hme_tte)); 2836 2837 flush_tte.tte_intlo = (tteold.tte_intlo ^ ttep->tte_intlo) & 2838 hw_tte.tte_intlo; 2839 flush_tte.tte_inthi = (tteold.tte_inthi ^ ttep->tte_inthi) & 2840 hw_tte.tte_inthi; 2841 2842 if (remap && (flush_tte.tte_inthi || flush_tte.tte_intlo)) { 2843 /* 2844 * If remap and new tte differs from old tte we need 2845 * to sync the mod bit and flush TLB/TSB. We don't 2846 * need to sync ref bit because we currently always set 2847 * ref bit in tteload. 2848 */ 2849 ASSERT(TTE_IS_REF(ttep)); 2850 if (TTE_IS_MOD(&tteold)) { 2851 sfmmu_ttesync(sfmmup, vaddr, &tteold, pp); 2852 } 2853 sfmmu_tlb_demap(vaddr, sfmmup, hmeblkp, 0, 0); 2854 xt_sync(sfmmup->sfmmu_cpusran); 2855 } 2856 2857 if ((flags & SFMMU_NO_TSBLOAD) == 0) { 2858 /* 2859 * We only preload 8K and 4M mappings into the TSB, since 2860 * 64K and 512K mappings are replicated and hence don't 2861 * have a single, unique TSB entry. Ditto for 32M/256M. 
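 *
 * "Replicated" means the TSB is indexed with the 8K (or 4M) virtual
 * page number, so a single 64K TTE corresponds to eight distinct
 * 8K-indexed slots (sixty-four for 512K); the trap handler inserts
 * whichever slot actually misses, and sfmmu_unload_tsb_range() above
 * removes the copies 8K at a time.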
2862 */ 2863 if (size == TTE8K || size == TTE4M) { 2864 hatlockp = sfmmu_hat_enter(sfmmup); 2865 sfmmu_load_tsb(sfmmup, vaddr, &sfhme->hme_tte, size); 2866 sfmmu_hat_exit(hatlockp); 2867 } 2868 } 2869 if (pp) { 2870 if (!remap) { 2871 HME_ADD(sfhme, pp); 2872 atomic_add_16(&hmeblkp->hblk_hmecnt, 1); 2873 ASSERT(hmeblkp->hblk_hmecnt > 0); 2874 2875 /* 2876 * Cannot ASSERT(hmeblkp->hblk_hmecnt <= NHMENTS) 2877 * see pageunload() for comment. 2878 */ 2879 } 2880 sfmmu_mlist_exit(pml); 2881 } 2882 2883 return (0); 2884 } 2885 /* 2886 * Function unlocks hash bucket. 2887 */ 2888 static void 2889 sfmmu_tteload_release_hashbucket(struct hmehash_bucket *hmebp) 2890 { 2891 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 2892 SFMMU_HASH_UNLOCK(hmebp); 2893 } 2894 2895 /* 2896 * function which checks and sets up page array for a large 2897 * translation. Will set p_vcolor, p_index, p_ro fields. 2898 * Assumes addr and pfnum of first page are properly aligned. 2899 * Will check for physical contiguity. If check fails it return 2900 * non null. 2901 */ 2902 static int 2903 sfmmu_pagearray_setup(caddr_t addr, page_t **pps, tte_t *ttep, int remap) 2904 { 2905 int i, index, ttesz, osz; 2906 pfn_t pfnum; 2907 pgcnt_t npgs; 2908 int cflags = 0; 2909 page_t *pp, *pp1; 2910 kmutex_t *pmtx; 2911 int vac_err = 0; 2912 int newidx = 0; 2913 2914 ttesz = TTE_CSZ(ttep); 2915 2916 ASSERT(ttesz > TTE8K); 2917 2918 npgs = TTEPAGES(ttesz); 2919 index = PAGESZ_TO_INDEX(ttesz); 2920 2921 pfnum = (*pps)->p_pagenum; 2922 ASSERT(IS_P2ALIGNED(pfnum, npgs)); 2923 2924 /* 2925 * Save the first pp so we can do HAT_TMPNC at the end. 2926 */ 2927 pp1 = *pps; 2928 osz = fnd_mapping_sz(pp1); 2929 2930 for (i = 0; i < npgs; i++, pps++) { 2931 pp = *pps; 2932 ASSERT(PAGE_LOCKED(pp)); 2933 ASSERT(pp->p_szc >= ttesz); 2934 ASSERT(pp->p_szc == pp1->p_szc); 2935 ASSERT(sfmmu_mlist_held(pp)); 2936 2937 /* 2938 * XXX is it possible to maintain P_RO on the root only? 2939 */ 2940 if (TTE_IS_WRITABLE(ttep) && PP_ISRO(pp)) { 2941 pmtx = sfmmu_page_enter(pp); 2942 PP_CLRRO(pp); 2943 sfmmu_page_exit(pmtx); 2944 } else if (!PP_ISMAPPED(pp) && !TTE_IS_WRITABLE(ttep) && 2945 !PP_ISMOD(pp)) { 2946 pmtx = sfmmu_page_enter(pp); 2947 if (!(PP_ISMOD(pp))) { 2948 PP_SETRO(pp); 2949 } 2950 sfmmu_page_exit(pmtx); 2951 } 2952 2953 /* 2954 * If this is a remap we skip vac & contiguity checks. 2955 */ 2956 if (remap) 2957 continue; 2958 2959 /* 2960 * set p_vcolor and detect any vac conflicts. 2961 */ 2962 if (vac_err == 0) { 2963 vac_err = sfmmu_vacconflict_array(addr, pp, &cflags); 2964 2965 } 2966 2967 /* 2968 * Save current index in case we need to undo it. 2969 * Note: "PAGESZ_TO_INDEX(sz) (1 << (sz))" 2970 * "SFMMU_INDEX_SHIFT 6" 2971 * "SFMMU_INDEX_MASK ((1 << SFMMU_INDEX_SHIFT) - 1)" 2972 * "PP_MAPINDEX(p_index) (p_index & SFMMU_INDEX_MASK)" 2973 * 2974 * So: index = PAGESZ_TO_INDEX(ttesz); 2975 * if ttesz == 1 then index = 0x2 2976 * 2 then index = 0x4 2977 * 3 then index = 0x8 2978 * 4 then index = 0x10 2979 * 5 then index = 0x20 2980 * The code below checks if it's a new pagesize (ie, newidx) 2981 * in case we need to take it back out of p_index, 2982 * and then or's the new index into the existing index. 2983 */ 2984 if ((PP_MAPINDEX(pp) & index) == 0) 2985 newidx = 1; 2986 pp->p_index = (PP_MAPINDEX(pp) | index); 2987 2988 /* 2989 * contiguity check 2990 */ 2991 if (pp->p_pagenum != pfnum) { 2992 /* 2993 * If we fail the contiguity test then 2994 * the only thing we need to fix is the p_index field. 
2995 * We might get a few extra flushes but since this 2996 * path is rare that is ok. The p_ro field will 2997 * get automatically fixed on the next tteload to 2998 * the page. NO TNC bit is set yet. 2999 */ 3000 while (i >= 0) { 3001 pp = *pps; 3002 if (newidx) 3003 pp->p_index = (PP_MAPINDEX(pp) & 3004 ~index); 3005 pps--; 3006 i--; 3007 } 3008 return (1); 3009 } 3010 pfnum++; 3011 addr += MMU_PAGESIZE; 3012 } 3013 3014 if (vac_err) { 3015 if (ttesz > osz) { 3016 /* 3017 * There are some smaller mappings that causes vac 3018 * conflicts. Convert all existing small mappings to 3019 * TNC. 3020 */ 3021 SFMMU_STAT_ADD(sf_uncache_conflict, npgs); 3022 sfmmu_page_cache_array(pp1, HAT_TMPNC, CACHE_FLUSH, 3023 npgs); 3024 } else { 3025 /* EMPTY */ 3026 /* 3027 * If there exists an big page mapping, 3028 * that means the whole existing big page 3029 * has TNC setting already. No need to covert to 3030 * TNC again. 3031 */ 3032 ASSERT(PP_ISTNC(pp1)); 3033 } 3034 } 3035 3036 return (0); 3037 } 3038 3039 /* 3040 * Routine that detects vac consistency for a large page. It also 3041 * sets virtual color for all pp's for this big mapping. 3042 */ 3043 static int 3044 sfmmu_vacconflict_array(caddr_t addr, page_t *pp, int *cflags) 3045 { 3046 int vcolor, ocolor; 3047 3048 ASSERT(sfmmu_mlist_held(pp)); 3049 3050 if (PP_ISNC(pp)) { 3051 return (HAT_TMPNC); 3052 } 3053 3054 vcolor = addr_to_vcolor(addr); 3055 if (PP_NEWPAGE(pp)) { 3056 PP_SET_VCOLOR(pp, vcolor); 3057 return (0); 3058 } 3059 3060 ocolor = PP_GET_VCOLOR(pp); 3061 if (ocolor == vcolor) { 3062 return (0); 3063 } 3064 3065 if (!PP_ISMAPPED(pp)) { 3066 /* 3067 * Previous user of page had a differnet color 3068 * but since there are no current users 3069 * we just flush the cache and change the color. 3070 * As an optimization for large pages we flush the 3071 * entire cache of that color and set a flag. 3072 */ 3073 SFMMU_STAT(sf_pgcolor_conflict); 3074 if (!CacheColor_IsFlushed(*cflags, ocolor)) { 3075 CacheColor_SetFlushed(*cflags, ocolor); 3076 sfmmu_cache_flushcolor(ocolor, pp->p_pagenum); 3077 } 3078 PP_SET_VCOLOR(pp, vcolor); 3079 return (0); 3080 } 3081 3082 /* 3083 * We got a real conflict with a current mapping. 3084 * set flags to start unencaching all mappings 3085 * and return failure so we restart looping 3086 * the pp array from the beginning. 3087 */ 3088 return (HAT_TMPNC); 3089 } 3090 3091 /* 3092 * creates a large page shadow hmeblk for a tte. 3093 * The purpose of this routine is to allow us to do quick unloads because 3094 * the vm layer can easily pass a very large but sparsely populated range. 
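 *
 * The shadow hblk lives at the next larger mapping size and records,
 * in hblk_shw_mask, which of its (at most eight) sub-ranges currently
 * have hblks at the smaller size, so that a very large but sparsely
 * populated range can be torn down without probing every hash chain
 * at every size for every address.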
3095 */ 3096 static struct hme_blk * 3097 sfmmu_shadow_hcreate(sfmmu_t *sfmmup, caddr_t vaddr, int ttesz, uint_t flags) 3098 { 3099 struct hmehash_bucket *hmebp; 3100 hmeblk_tag hblktag; 3101 int hmeshift, size, vshift; 3102 uint_t shw_mask, newshw_mask; 3103 struct hme_blk *hmeblkp; 3104 3105 ASSERT(sfmmup != KHATID); 3106 if (mmu_page_sizes == max_mmu_page_sizes) { 3107 ASSERT(ttesz < TTE256M); 3108 } else { 3109 ASSERT(ttesz < TTE4M); 3110 ASSERT(sfmmup->sfmmu_ttecnt[TTE32M] == 0); 3111 ASSERT(sfmmup->sfmmu_ttecnt[TTE256M] == 0); 3112 } 3113 3114 if (ttesz == TTE8K) { 3115 size = TTE512K; 3116 } else { 3117 size = ++ttesz; 3118 } 3119 3120 hblktag.htag_id = sfmmup; 3121 hmeshift = HME_HASH_SHIFT(size); 3122 hblktag.htag_bspage = HME_HASH_BSPAGE(vaddr, hmeshift); 3123 hblktag.htag_rehash = HME_HASH_REHASH(size); 3124 hmebp = HME_HASH_FUNCTION(sfmmup, vaddr, hmeshift); 3125 3126 SFMMU_HASH_LOCK(hmebp); 3127 3128 HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp); 3129 ASSERT(hmeblkp != (struct hme_blk *)hblk_reserve); 3130 if (hmeblkp == NULL) { 3131 hmeblkp = sfmmu_hblk_alloc(sfmmup, vaddr, hmebp, size, 3132 hblktag, flags); 3133 } 3134 ASSERT(hmeblkp); 3135 if (!hmeblkp->hblk_shw_mask) { 3136 /* 3137 * if this is a unused hblk it was just allocated or could 3138 * potentially be a previous large page hblk so we need to 3139 * set the shadow bit. 3140 */ 3141 hmeblkp->hblk_shw_bit = 1; 3142 } 3143 ASSERT(hmeblkp->hblk_shw_bit == 1); 3144 vshift = vaddr_to_vshift(hblktag, vaddr, size); 3145 ASSERT(vshift < 8); 3146 /* 3147 * Atomically set shw mask bit 3148 */ 3149 do { 3150 shw_mask = hmeblkp->hblk_shw_mask; 3151 newshw_mask = shw_mask | (1 << vshift); 3152 newshw_mask = cas32(&hmeblkp->hblk_shw_mask, shw_mask, 3153 newshw_mask); 3154 } while (newshw_mask != shw_mask); 3155 3156 SFMMU_HASH_UNLOCK(hmebp); 3157 3158 return (hmeblkp); 3159 } 3160 3161 /* 3162 * This routine cleanup a previous shadow hmeblk and changes it to 3163 * a regular hblk. This happens rarely but it is possible 3164 * when a process wants to use large pages and there are hblks still 3165 * lying around from the previous as that used these hmeblks. 3166 * The alternative was to cleanup the shadow hblks at unload time 3167 * but since so few user processes actually use large pages, it is 3168 * better to be lazy and cleanup at this time. 
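 *
 * The cleanup drops the hash-bucket lock, walks the shadow hblk's
 * address range one hash level down freeing the child hblks via
 * sfmmu_free_hblks(), and then retakes the lock; callers rescan
 * afterwards (the ttearray_realloc retry above) because the bucket
 * may have changed while the lock was dropped.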
3169 */ 3170 static void 3171 sfmmu_shadow_hcleanup(sfmmu_t *sfmmup, struct hme_blk *hmeblkp, 3172 struct hmehash_bucket *hmebp) 3173 { 3174 caddr_t addr, endaddr; 3175 int hashno, size; 3176 3177 ASSERT(hmeblkp->hblk_shw_bit); 3178 3179 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 3180 3181 if (!hmeblkp->hblk_shw_mask) { 3182 hmeblkp->hblk_shw_bit = 0; 3183 return; 3184 } 3185 addr = (caddr_t)get_hblk_base(hmeblkp); 3186 endaddr = get_hblk_endaddr(hmeblkp); 3187 size = get_hblk_ttesz(hmeblkp); 3188 hashno = size - 1; 3189 ASSERT(hashno > 0); 3190 SFMMU_HASH_UNLOCK(hmebp); 3191 3192 sfmmu_free_hblks(sfmmup, addr, endaddr, hashno); 3193 3194 SFMMU_HASH_LOCK(hmebp); 3195 } 3196 3197 static void 3198 sfmmu_free_hblks(sfmmu_t *sfmmup, caddr_t addr, caddr_t endaddr, 3199 int hashno) 3200 { 3201 int hmeshift, shadow = 0; 3202 hmeblk_tag hblktag; 3203 struct hmehash_bucket *hmebp; 3204 struct hme_blk *hmeblkp; 3205 struct hme_blk *nx_hblk, *pr_hblk, *list = NULL; 3206 uint64_t hblkpa, prevpa, nx_pa; 3207 3208 ASSERT(hashno > 0); 3209 hblktag.htag_id = sfmmup; 3210 hblktag.htag_rehash = hashno; 3211 3212 hmeshift = HME_HASH_SHIFT(hashno); 3213 3214 while (addr < endaddr) { 3215 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 3216 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift); 3217 SFMMU_HASH_LOCK(hmebp); 3218 /* inline HME_HASH_SEARCH */ 3219 hmeblkp = hmebp->hmeblkp; 3220 hblkpa = hmebp->hmeh_nextpa; 3221 prevpa = 0; 3222 pr_hblk = NULL; 3223 while (hmeblkp) { 3224 ASSERT(hblkpa == va_to_pa((caddr_t)hmeblkp)); 3225 if (HTAGS_EQ(hmeblkp->hblk_tag, hblktag)) { 3226 /* found hme_blk */ 3227 if (hmeblkp->hblk_shw_bit) { 3228 if (hmeblkp->hblk_shw_mask) { 3229 shadow = 1; 3230 sfmmu_shadow_hcleanup(sfmmup, 3231 hmeblkp, hmebp); 3232 break; 3233 } else { 3234 hmeblkp->hblk_shw_bit = 0; 3235 } 3236 } 3237 3238 /* 3239 * Hblk_hmecnt and hblk_vcnt could be non zero 3240 * since hblk_unload() does not gurantee that. 3241 * 3242 * XXX - this could cause tteload() to spin 3243 * where sfmmu_shadow_hcleanup() is called. 3244 */ 3245 } 3246 3247 nx_hblk = hmeblkp->hblk_next; 3248 nx_pa = hmeblkp->hblk_nextpa; 3249 if (!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) { 3250 sfmmu_hblk_hash_rm(hmebp, hmeblkp, prevpa, 3251 pr_hblk); 3252 sfmmu_hblk_free(hmebp, hmeblkp, hblkpa, &list); 3253 } else { 3254 pr_hblk = hmeblkp; 3255 prevpa = hblkpa; 3256 } 3257 hmeblkp = nx_hblk; 3258 hblkpa = nx_pa; 3259 } 3260 3261 SFMMU_HASH_UNLOCK(hmebp); 3262 3263 if (shadow) { 3264 /* 3265 * We found another shadow hblk so cleaned its 3266 * children. We need to go back and cleanup 3267 * the original hblk so we don't change the 3268 * addr. 3269 */ 3270 shadow = 0; 3271 } else { 3272 addr = (caddr_t)roundup((uintptr_t)addr + 1, 3273 (1 << hmeshift)); 3274 } 3275 } 3276 sfmmu_hblks_list_purge(&list); 3277 } 3278 3279 /* 3280 * Release one hardware address translation lock on the given address range. 3281 */ 3282 void 3283 hat_unlock(struct hat *sfmmup, caddr_t addr, size_t len) 3284 { 3285 struct hmehash_bucket *hmebp; 3286 hmeblk_tag hblktag; 3287 int hmeshift, hashno = 1; 3288 struct hme_blk *hmeblkp, *list = NULL; 3289 caddr_t endaddr; 3290 3291 ASSERT(sfmmup != NULL); 3292 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 3293 3294 ASSERT((sfmmup == ksfmmup) || 3295 AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 3296 ASSERT((len & MMU_PAGEOFFSET) == 0); 3297 endaddr = addr + len; 3298 hblktag.htag_id = sfmmup; 3299 3300 /* 3301 * Spitfire supports 4 page sizes. 
3302 * Most pages are expected to be of the smallest page size (8K) and 3303 * these will not need to be rehashed. 64K pages also don't need to be 3304 * rehashed because an hmeblk spans 64K of address space. 512K pages 3305 * might need 1 rehash and and 4M pages might need 2 rehashes. 3306 */ 3307 while (addr < endaddr) { 3308 hmeshift = HME_HASH_SHIFT(hashno); 3309 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 3310 hblktag.htag_rehash = hashno; 3311 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift); 3312 3313 SFMMU_HASH_LOCK(hmebp); 3314 3315 HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list); 3316 if (hmeblkp != NULL) { 3317 /* 3318 * If we encounter a shadow hmeblk then 3319 * we know there are no valid hmeblks mapping 3320 * this address at this size or larger. 3321 * Just increment address by the smallest 3322 * page size. 3323 */ 3324 if (hmeblkp->hblk_shw_bit) { 3325 addr += MMU_PAGESIZE; 3326 } else { 3327 addr = sfmmu_hblk_unlock(hmeblkp, addr, 3328 endaddr); 3329 } 3330 SFMMU_HASH_UNLOCK(hmebp); 3331 hashno = 1; 3332 continue; 3333 } 3334 SFMMU_HASH_UNLOCK(hmebp); 3335 3336 if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) { 3337 /* 3338 * We have traversed the whole list and rehashed 3339 * if necessary without finding the address to unlock 3340 * which should never happen. 3341 */ 3342 panic("sfmmu_unlock: addr not found. " 3343 "addr %p hat %p", (void *)addr, (void *)sfmmup); 3344 } else { 3345 hashno++; 3346 } 3347 } 3348 3349 sfmmu_hblks_list_purge(&list); 3350 } 3351 3352 /* 3353 * Function to unlock a range of addresses in an hmeblk. It returns the 3354 * next address that needs to be unlocked. 3355 * Should be called with the hash lock held. 3356 */ 3357 static caddr_t 3358 sfmmu_hblk_unlock(struct hme_blk *hmeblkp, caddr_t addr, caddr_t endaddr) 3359 { 3360 struct sf_hment *sfhme; 3361 tte_t tteold, ttemod; 3362 int ttesz, ret; 3363 3364 ASSERT(in_hblk_range(hmeblkp, addr)); 3365 ASSERT(hmeblkp->hblk_shw_bit == 0); 3366 3367 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp)); 3368 ttesz = get_hblk_ttesz(hmeblkp); 3369 3370 HBLKTOHME(sfhme, hmeblkp, addr); 3371 while (addr < endaddr) { 3372 readtte: 3373 sfmmu_copytte(&sfhme->hme_tte, &tteold); 3374 if (TTE_IS_VALID(&tteold)) { 3375 3376 ttemod = tteold; 3377 3378 ret = sfmmu_modifytte_try(&tteold, &ttemod, 3379 &sfhme->hme_tte); 3380 3381 if (ret < 0) 3382 goto readtte; 3383 3384 if (hmeblkp->hblk_lckcnt == 0) 3385 panic("zero hblk lckcnt"); 3386 3387 if (((uintptr_t)addr + TTEBYTES(ttesz)) > 3388 (uintptr_t)endaddr) 3389 panic("can't unlock large tte"); 3390 3391 ASSERT(hmeblkp->hblk_lckcnt > 0); 3392 atomic_add_16(&hmeblkp->hblk_lckcnt, -1); 3393 HBLK_STACK_TRACE(hmeblkp, HBLK_UNLOCK); 3394 } else { 3395 panic("sfmmu_hblk_unlock: invalid tte"); 3396 } 3397 addr += TTEBYTES(ttesz); 3398 sfhme++; 3399 } 3400 return (addr); 3401 } 3402 3403 /* 3404 * Physical Address Mapping Framework 3405 * 3406 * General rules: 3407 * 3408 * (1) Applies only to seg_kmem memory pages. To make things easier, 3409 * seg_kpm addresses are also accepted by the routines, but nothing 3410 * is done with them since by definition their PA mappings are static. 3411 * (2) hat_add_callback() may only be called while holding the page lock 3412 * SE_SHARED or SE_EXCL of the underlying page (e.g., as_pagelock()). 3413 * (3) prehandler() and posthandler() may not call hat_add_callback() or 3414 * hat_delete_callback(), nor should they allocate memory. Post quiesce 3415 * callbacks may not sleep or acquire adaptive mutex locks. 
3416 * (4) Either prehandler() or posthandler() (but not both) may be specified 3417 * as being NULL. Specifying an errhandler() is optional. 3418 * 3419 * Details of using the framework: 3420 * 3421 * registering a callback (hat_register_callback()) 3422 * 3423 * Pass prehandler, posthandler, errhandler addresses 3424 * as described below. If capture_cpus argument is nonzero, 3425 * suspend callback to the prehandler will occur with CPUs 3426 * captured and executing xc_loop() and CPUs will remain 3427 * captured until after the posthandler suspend callback 3428 * occurs. 3429 * 3430 * adding a callback (hat_add_callback()) 3431 * 3432 * as_pagelock(); 3433 * hat_add_callback(); 3434 * save returned pfn in private data structures or program registers; 3435 * as_pageunlock(); 3436 * 3437 * prehandler() 3438 * 3439 * Stop all accesses by physical address to this memory page. 3440 * Called twice: the first, PRESUSPEND, is a context safe to acquire 3441 * adaptive locks. The second, SUSPEND, is called at high PIL with 3442 * CPUs captured so adaptive locks may NOT be acquired (and all spin 3443 * locks must be XCALL_PIL or higher locks). 3444 * 3445 * May return the following errors: 3446 * EIO: A fatal error has occurred. This will result in panic. 3447 * EAGAIN: The page cannot be suspended. This will fail the 3448 * relocation. 3449 * 0: Success. 3450 * 3451 * posthandler() 3452 * 3453 * Save new pfn in private data structures or program registers; 3454 * not allowed to fail (non-zero return values will result in panic). 3455 * 3456 * errhandler() 3457 * 3458 * called when an error occurs related to the callback. Currently 3459 * the only such error is HAT_CB_ERR_LEAKED which indicates that 3460 * a page is being freed, but there are still outstanding callback(s) 3461 * registered on the page. 3462 * 3463 * removing a callback (hat_delete_callback(); e.g., prior to freeing memory) 3464 * 3465 * stop using physical address 3466 * hat_delete_callback(); 3467 * 3468 */ 3469 3470 /* 3471 * Register a callback class. Each subsystem should do this once and 3472 * cache the id_t returned for use in setting up and tearing down callbacks. 3473 * 3474 * There is no facility for removing callback IDs once they are created; 3475 * the "key" should be unique for each module, so in case a module is unloaded 3476 * and subsequently re-loaded, we can recycle the module's previous entry. 3477 */ 3478 id_t 3479 hat_register_callback(int key, 3480 int (*prehandler)(caddr_t, uint_t, uint_t, void *), 3481 int (*posthandler)(caddr_t, uint_t, uint_t, void *, pfn_t), 3482 int (*errhandler)(caddr_t, uint_t, uint_t, void *), 3483 int capture_cpus) 3484 { 3485 id_t id; 3486 3487 /* 3488 * Search the table for a pre-existing callback associated with 3489 * the identifier "key". If one exists, we re-use that entry in 3490 * the table for this instance, otherwise we assign the next 3491 * available table slot. 
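 *
 * For reference, a typical registration looks like this sketch, where
 * XX_CB_KEY, xx_pre, xx_post and xx_cb_id are hypothetical names and
 * the handler signatures match the parameters above:
 *
 *	static int xx_pre(caddr_t va, uint_t len, uint_t flags, void *pvt);
 *	static int xx_post(caddr_t va, uint_t len, uint_t flags, void *pvt,
 *	    pfn_t newpfn);
 *
 *	xx_cb_id = hat_register_callback(XX_CB_KEY, xx_pre, xx_post,
 *	    NULL, 0);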
3492 */ 3493 for (id = 0; id < sfmmu_max_cb_id; id++) { 3494 if (sfmmu_cb_table[id].key == key) 3495 break; 3496 } 3497 3498 if (id == sfmmu_max_cb_id) { 3499 id = sfmmu_cb_nextid++; 3500 if (id >= sfmmu_max_cb_id) 3501 panic("hat_register_callback: out of callback IDs"); 3502 } 3503 3504 ASSERT(prehandler != NULL || posthandler != NULL); 3505 3506 sfmmu_cb_table[id].key = key; 3507 sfmmu_cb_table[id].prehandler = prehandler; 3508 sfmmu_cb_table[id].posthandler = posthandler; 3509 sfmmu_cb_table[id].errhandler = errhandler; 3510 sfmmu_cb_table[id].capture_cpus = capture_cpus; 3511 3512 return (id); 3513 } 3514 3515 /* 3516 * Add relocation callbacks to the specified addr/len which will be called 3517 * when relocating the associated page. See the description of pre and 3518 * posthandler above for more details. IMPT: this operation is only valid 3519 * on seg_kmem pages!! 3520 * 3521 * If HAC_PAGELOCK is included in flags, the underlying memory page is 3522 * locked internally so the caller must be able to deal with the callback 3523 * running even before this function has returned. If HAC_PAGELOCK is not 3524 * set, it is assumed that the underlying memory pages are locked. 3525 * 3526 * Since the caller must track the individual page boundaries anyway, 3527 * we only allow a callback to be added to a single page (large 3528 * or small). Thus [addr, addr + len) MUST be contained within a single 3529 * page. 3530 * 3531 * Registering multiple callbacks on the same [addr, addr+len) is supported, 3532 * in which case the corresponding callback will be called once with each 3533 * unique parameter specified. The number of subsequent deletes must match 3534 * since reference counts are held. If a callback is desired for each 3535 * virtual object with the same parameter specified for multiple callbacks, 3536 * a different virtual address should be specified at the time of 3537 * callback registration. 3538 * 3539 * Returns the pfn of the underlying kernel page in *rpfn 3540 * on success, or PFN_INVALID on failure. 3541 * 3542 * Returns values: 3543 * 0: success 3544 * ENOMEM: memory allocation failure (e.g. flags was passed as HAC_NOSLEEP) 3545 * EINVAL: callback ID is not valid 3546 * ENXIO: ["vaddr", "vaddr" + len) is not mapped in the kernel's address 3547 * space, or crosses a page boundary 3548 */ 3549 int 3550 hat_add_callback(id_t callback_id, caddr_t vaddr, uint_t len, uint_t flags, 3551 void *pvt, pfn_t *rpfn) 3552 { 3553 struct hmehash_bucket *hmebp; 3554 hmeblk_tag hblktag; 3555 struct hme_blk *hmeblkp; 3556 int hmeshift, hashno; 3557 caddr_t saddr, eaddr, baseaddr; 3558 struct pa_hment *pahmep, *tpahmep; 3559 struct sf_hment *sfhmep, *osfhmep, *tsfhmep; 3560 kmutex_t *pml; 3561 tte_t tte; 3562 page_t *pp, *rpp; 3563 pfn_t pfn; 3564 int kmflags = (flags & HAC_SLEEP)? KM_SLEEP : KM_NOSLEEP; 3565 int locked = 0; 3566 3567 /* 3568 * For KPM mappings, just return the physical address since we 3569 * don't need to register any callbacks. 
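 *
 * By contrast, a non-KPM caller typically does the following (a
 * sketch with hypothetical names; error is whatever the caller maps
 * a failure to):
 *
 *	if (hat_add_callback(xx_cb_id, va, MMU_PAGESIZE, HAC_PAGELOCK,
 *	    xx_pvt, &pfn) != 0)
 *		return (error);
 *
 * and later issues a matching hat_delete_callback() before the
 * memory is freed.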
3570 */ 3571 if (IS_KPM_ADDR(vaddr)) { 3572 uint64_t paddr; 3573 SFMMU_KPM_VTOP(vaddr, paddr); 3574 *rpfn = btop(paddr); 3575 return (0); 3576 } 3577 3578 if (callback_id < (id_t)0 || callback_id >= sfmmu_cb_nextid) { 3579 *rpfn = PFN_INVALID; 3580 return (EINVAL); 3581 } 3582 3583 if ((pahmep = kmem_cache_alloc(pa_hment_cache, kmflags)) == NULL) { 3584 *rpfn = PFN_INVALID; 3585 return (ENOMEM); 3586 } 3587 3588 sfhmep = &pahmep->sfment; 3589 3590 saddr = (caddr_t)((uintptr_t)vaddr & MMU_PAGEMASK); 3591 eaddr = saddr + len; 3592 3593 rehash: 3594 /* Find the mapping(s) for this page */ 3595 for (hashno = TTE64K, hmeblkp = NULL; 3596 hmeblkp == NULL && hashno <= mmu_hashcnt; 3597 hashno++) { 3598 hmeshift = HME_HASH_SHIFT(hashno); 3599 hblktag.htag_id = ksfmmup; 3600 hblktag.htag_bspage = HME_HASH_BSPAGE(saddr, hmeshift); 3601 hblktag.htag_rehash = hashno; 3602 hmebp = HME_HASH_FUNCTION(ksfmmup, saddr, hmeshift); 3603 3604 SFMMU_HASH_LOCK(hmebp); 3605 3606 HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp); 3607 3608 if (hmeblkp == NULL) 3609 SFMMU_HASH_UNLOCK(hmebp); 3610 } 3611 3612 if (hmeblkp == NULL) { 3613 kmem_cache_free(pa_hment_cache, pahmep); 3614 *rpfn = PFN_INVALID; 3615 return (ENXIO); 3616 } 3617 3618 /* 3619 * Make sure the boundaries for the callback fall within this 3620 * single mapping. 3621 */ 3622 baseaddr = (caddr_t)get_hblk_base(hmeblkp); 3623 ASSERT(saddr >= baseaddr); 3624 if (eaddr > (caddr_t)get_hblk_endaddr(hmeblkp)) { 3625 SFMMU_HASH_UNLOCK(hmebp); 3626 kmem_cache_free(pa_hment_cache, pahmep); 3627 *rpfn = PFN_INVALID; 3628 return (ENXIO); 3629 } 3630 3631 HBLKTOHME(osfhmep, hmeblkp, saddr); 3632 sfmmu_copytte(&osfhmep->hme_tte, &tte); 3633 3634 ASSERT(TTE_IS_VALID(&tte)); 3635 pfn = sfmmu_ttetopfn(&tte, vaddr); 3636 3637 /* 3638 * The pfn may not have a page_t underneath in which case we 3639 * just return it. This can happen if we are doing I/O to a 3640 * static portion of the kernel's address space, for instance. 3641 */ 3642 pp = osfhmep->hme_page; 3643 if (pp == NULL || pp->p_vnode != &kvp) { 3644 SFMMU_HASH_UNLOCK(hmebp); 3645 kmem_cache_free(pa_hment_cache, pahmep); 3646 *rpfn = pfn; 3647 return (0); 3648 } 3649 3650 pml = sfmmu_mlist_enter(pp); 3651 3652 if ((flags & HAC_PAGELOCK) && !locked) { 3653 if (!page_trylock(pp, SE_SHARED)) { 3654 page_t *tpp; 3655 3656 /* 3657 * Somebody is holding SE_EXCL lock. Drop all 3658 * our locks, lookup the page in &kvp, and 3659 * retry. If it doesn't exist in &kvp, then we 3660 * die here; we should have caught it above, 3661 * meaning the page must have changed identity 3662 * (e.g. the caller didn't hold onto the page 3663 * lock after establishing the kernel mapping) 3664 */ 3665 sfmmu_mlist_exit(pml); 3666 SFMMU_HASH_UNLOCK(hmebp); 3667 tpp = page_lookup(&kvp, (u_offset_t)saddr, SE_SHARED); 3668 if (tpp == NULL) { 3669 panic("hat_add_callback: page not found: 0x%p", 3670 pp); 3671 } 3672 pp = tpp; 3673 rpp = PP_PAGEROOT(pp); 3674 if (rpp != pp) { 3675 page_unlock(pp); 3676 (void) page_lock(rpp, SE_SHARED, NULL, 3677 P_NO_RECLAIM); 3678 } 3679 locked = 1; 3680 goto rehash; 3681 } 3682 locked = 1; 3683 } 3684 3685 if (!PAGE_LOCKED(pp) && !panicstr) 3686 panic("hat_add_callback: page 0x%p not locked", pp); 3687 3688 if (osfhmep->hme_page != pp || pp->p_vnode != &kvp || 3689 pp->p_offset < (u_offset_t)baseaddr || 3690 pp->p_offset > (u_offset_t)eaddr) { 3691 /* 3692 * The page moved before we got our hands on it. Drop 3693 * all the locks and try again. 
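 *
 * The re-validation works because the page's identity can only be
 * trusted once both the mapping-list lock and the page lock are held;
 * if the hment's page, its vnode, or its offset no longer matches
 * what was looked up, everything is unwound and the lookup restarts
 * from the hash search at the rehash label.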
3694 */ 3695 ASSERT((flags & HAC_PAGELOCK) != 0); 3696 sfmmu_mlist_exit(pml); 3697 SFMMU_HASH_UNLOCK(hmebp); 3698 page_unlock(pp); 3699 locked = 0; 3700 goto rehash; 3701 } 3702 3703 ASSERT(osfhmep->hme_page == pp); 3704 3705 for (tsfhmep = pp->p_mapping; tsfhmep != NULL; 3706 tsfhmep = tsfhmep->hme_next) { 3707 3708 /* 3709 * skip va to pa mappings 3710 */ 3711 if (!IS_PAHME(tsfhmep)) 3712 continue; 3713 3714 tpahmep = tsfhmep->hme_data; 3715 ASSERT(tpahmep != NULL); 3716 3717 /* 3718 * See if the pahment already exists. 3719 */ 3720 if ((tpahmep->pvt == pvt) && 3721 (tpahmep->addr == vaddr) && 3722 (tpahmep->len == len)) { 3723 ASSERT(tpahmep->cb_id == callback_id); 3724 tpahmep->refcnt++; 3725 pp->p_share++; 3726 3727 sfmmu_mlist_exit(pml); 3728 SFMMU_HASH_UNLOCK(hmebp); 3729 3730 if (locked) 3731 page_unlock(pp); 3732 3733 kmem_cache_free(pa_hment_cache, pahmep); 3734 3735 *rpfn = pfn; 3736 return (0); 3737 } 3738 } 3739 3740 /* 3741 * setup this shiny new pa_hment .. 3742 */ 3743 pp->p_share++; 3744 pahmep->cb_id = callback_id; 3745 pahmep->addr = vaddr; 3746 pahmep->len = len; 3747 pahmep->refcnt = 1; 3748 pahmep->flags = 0; 3749 pahmep->pvt = pvt; 3750 3751 /* 3752 * .. and also set up the sf_hment and link to p_mapping list. 3753 */ 3754 sfhmep->hme_tte.ll = 0; 3755 sfhmep->hme_data = pahmep; 3756 sfhmep->hme_prev = osfhmep; 3757 sfhmep->hme_next = osfhmep->hme_next; 3758 3759 if (osfhmep->hme_next) 3760 osfhmep->hme_next->hme_prev = sfhmep; 3761 3762 osfhmep->hme_next = sfhmep; 3763 3764 sfmmu_mlist_exit(pml); 3765 SFMMU_HASH_UNLOCK(hmebp); 3766 3767 *rpfn = pfn; 3768 if (locked) 3769 page_unlock(pp); 3770 3771 return (0); 3772 } 3773 3774 /* 3775 * Remove the relocation callbacks from the specified addr/len. 3776 */ 3777 void 3778 hat_delete_callback(caddr_t vaddr, uint_t len, void *pvt, uint_t flags) 3779 { 3780 struct hmehash_bucket *hmebp; 3781 hmeblk_tag hblktag; 3782 struct hme_blk *hmeblkp; 3783 int hmeshift, hashno; 3784 caddr_t saddr, eaddr, baseaddr; 3785 struct pa_hment *pahmep; 3786 struct sf_hment *sfhmep, *osfhmep; 3787 kmutex_t *pml; 3788 tte_t tte; 3789 page_t *pp, *rpp; 3790 int locked = 0; 3791 3792 if (IS_KPM_ADDR(vaddr)) 3793 return; 3794 3795 saddr = (caddr_t)((uintptr_t)vaddr & MMU_PAGEMASK); 3796 eaddr = saddr + len; 3797 3798 rehash: 3799 /* Find the mapping(s) for this page */ 3800 for (hashno = TTE64K, hmeblkp = NULL; 3801 hmeblkp == NULL && hashno <= mmu_hashcnt; 3802 hashno++) { 3803 hmeshift = HME_HASH_SHIFT(hashno); 3804 hblktag.htag_id = ksfmmup; 3805 hblktag.htag_bspage = HME_HASH_BSPAGE(saddr, hmeshift); 3806 hblktag.htag_rehash = hashno; 3807 hmebp = HME_HASH_FUNCTION(ksfmmup, saddr, hmeshift); 3808 3809 SFMMU_HASH_LOCK(hmebp); 3810 3811 HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp); 3812 3813 if (hmeblkp == NULL) 3814 SFMMU_HASH_UNLOCK(hmebp); 3815 } 3816 3817 if (hmeblkp == NULL) { 3818 if (!panicstr) { 3819 panic("hat_delete_callback: addr 0x%p not found", 3820 saddr); 3821 } 3822 return; 3823 } 3824 3825 baseaddr = (caddr_t)get_hblk_base(hmeblkp); 3826 HBLKTOHME(osfhmep, hmeblkp, saddr); 3827 3828 sfmmu_copytte(&osfhmep->hme_tte, &tte); 3829 ASSERT(TTE_IS_VALID(&tte)); 3830 3831 pp = osfhmep->hme_page; 3832 if (pp == NULL || pp->p_vnode != &kvp) { 3833 SFMMU_HASH_UNLOCK(hmebp); 3834 return; 3835 } 3836 3837 pml = sfmmu_mlist_enter(pp); 3838 3839 if ((flags & HAC_PAGELOCK) && !locked) { 3840 if (!page_trylock(pp, SE_SHARED)) { 3841 /* 3842 * Somebody is holding SE_EXCL lock. Drop all 3843 * our locks, lookup the page in &kvp, and 3844 * retry. 
3845 */ 3846 sfmmu_mlist_exit(pml); 3847 SFMMU_HASH_UNLOCK(hmebp); 3848 pp = page_lookup(&kvp, (u_offset_t)saddr, SE_SHARED); 3849 ASSERT(pp != NULL); 3850 rpp = PP_PAGEROOT(pp); 3851 if (rpp != pp) { 3852 page_unlock(pp); 3853 (void) page_lock(rpp, SE_SHARED, NULL, 3854 P_NO_RECLAIM); 3855 } 3856 locked = 1; 3857 goto rehash; 3858 } 3859 locked = 1; 3860 } 3861 3862 ASSERT(PAGE_LOCKED(pp)); 3863 3864 if (osfhmep->hme_page != pp || pp->p_vnode != &kvp || 3865 pp->p_offset < (u_offset_t)baseaddr || 3866 pp->p_offset > (u_offset_t)eaddr) { 3867 /* 3868 * The page moved before we got our hands on it. Drop 3869 * all the locks and try again. 3870 */ 3871 ASSERT((flags & HAC_PAGELOCK) != 0); 3872 sfmmu_mlist_exit(pml); 3873 SFMMU_HASH_UNLOCK(hmebp); 3874 page_unlock(pp); 3875 locked = 0; 3876 goto rehash; 3877 } 3878 3879 ASSERT(osfhmep->hme_page == pp); 3880 3881 for (sfhmep = pp->p_mapping; sfhmep != NULL; 3882 sfhmep = sfhmep->hme_next) { 3883 3884 /* 3885 * skip va<->pa mappings 3886 */ 3887 if (!IS_PAHME(sfhmep)) 3888 continue; 3889 3890 pahmep = sfhmep->hme_data; 3891 ASSERT(pahmep != NULL); 3892 3893 /* 3894 * if pa_hment matches, remove it 3895 */ 3896 if ((pahmep->pvt == pvt) && 3897 (pahmep->addr == vaddr) && 3898 (pahmep->len == len)) { 3899 break; 3900 } 3901 } 3902 3903 if (sfhmep == NULL) { 3904 if (!panicstr) { 3905 panic("hat_delete_callback: pa_hment not found, pp %p", 3906 (void *)pp); 3907 } 3908 return; 3909 } 3910 3911 /* 3912 * Note: at this point a valid kernel mapping must still be 3913 * present on this page. 3914 */ 3915 pp->p_share--; 3916 if (pp->p_share <= 0) 3917 panic("hat_delete_callback: zero p_share"); 3918 3919 if (--pahmep->refcnt == 0) { 3920 if (pahmep->flags != 0) 3921 panic("hat_delete_callback: pa_hment is busy"); 3922 3923 /* 3924 * Remove sfhmep from the mapping list for the page. 3925 */ 3926 if (sfhmep->hme_prev) { 3927 sfhmep->hme_prev->hme_next = sfhmep->hme_next; 3928 } else { 3929 pp->p_mapping = sfhmep->hme_next; 3930 } 3931 3932 if (sfhmep->hme_next) 3933 sfhmep->hme_next->hme_prev = sfhmep->hme_prev; 3934 3935 sfmmu_mlist_exit(pml); 3936 SFMMU_HASH_UNLOCK(hmebp); 3937 3938 if (locked) 3939 page_unlock(pp); 3940 3941 kmem_cache_free(pa_hment_cache, pahmep); 3942 return; 3943 } 3944 3945 sfmmu_mlist_exit(pml); 3946 SFMMU_HASH_UNLOCK(hmebp); 3947 if (locked) 3948 page_unlock(pp); 3949 } 3950 3951 /* 3952 * hat_probe returns 1 if the translation for the address 'addr' is 3953 * loaded, zero otherwise. 3954 * 3955 * hat_probe should be used only for advisorary purposes because it may 3956 * occasionally return the wrong value. The implementation must guarantee that 3957 * returning the wrong value is a very rare event. hat_probe is used 3958 * to implement optimizations in the segment drivers. 
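 *
 * A minimal, purely illustrative use (hypothetical caller, not code in
 * this file) is as a cheap hint in a segment driver fault path:
 *
 *	if (hat_probe(as->a_hat, addr) == 0)
 *		.. translation probably not loaded, go fault it in ..
 *
 * Callers must tolerate the occasional wrong answer in either
 * direction.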
3959 * 3960 */ 3961 int 3962 hat_probe(struct hat *sfmmup, caddr_t addr) 3963 { 3964 pfn_t pfn; 3965 tte_t tte; 3966 3967 ASSERT(sfmmup != NULL); 3968 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 3969 3970 ASSERT((sfmmup == ksfmmup) || 3971 AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 3972 3973 if (sfmmup == ksfmmup) { 3974 while ((pfn = sfmmu_vatopfn(addr, sfmmup, &tte)) 3975 == PFN_SUSPENDED) { 3976 sfmmu_vatopfn_suspended(addr, sfmmup, &tte); 3977 } 3978 } else { 3979 pfn = sfmmu_uvatopfn(addr, sfmmup); 3980 } 3981 3982 if (pfn != PFN_INVALID) 3983 return (1); 3984 else 3985 return (0); 3986 } 3987 3988 ssize_t 3989 hat_getpagesize(struct hat *sfmmup, caddr_t addr) 3990 { 3991 tte_t tte; 3992 3993 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 3994 3995 sfmmu_gettte(sfmmup, addr, &tte); 3996 if (TTE_IS_VALID(&tte)) { 3997 return (TTEBYTES(TTE_CSZ(&tte))); 3998 } 3999 return (-1); 4000 } 4001 4002 static void 4003 sfmmu_gettte(struct hat *sfmmup, caddr_t addr, tte_t *ttep) 4004 { 4005 struct hmehash_bucket *hmebp; 4006 hmeblk_tag hblktag; 4007 int hmeshift, hashno = 1; 4008 struct hme_blk *hmeblkp, *list = NULL; 4009 struct sf_hment *sfhmep; 4010 4011 /* support for ISM */ 4012 ism_map_t *ism_map; 4013 ism_blk_t *ism_blkp; 4014 int i; 4015 sfmmu_t *ism_hatid = NULL; 4016 sfmmu_t *locked_hatid = NULL; 4017 4018 ASSERT(!((uintptr_t)addr & MMU_PAGEOFFSET)); 4019 4020 ism_blkp = sfmmup->sfmmu_iblk; 4021 if (ism_blkp) { 4022 sfmmu_ismhat_enter(sfmmup, 0); 4023 locked_hatid = sfmmup; 4024 } 4025 while (ism_blkp && ism_hatid == NULL) { 4026 ism_map = ism_blkp->iblk_maps; 4027 for (i = 0; ism_map[i].imap_ismhat && i < ISM_MAP_SLOTS; i++) { 4028 if (addr >= ism_start(ism_map[i]) && 4029 addr < ism_end(ism_map[i])) { 4030 sfmmup = ism_hatid = ism_map[i].imap_ismhat; 4031 addr = (caddr_t)(addr - 4032 ism_start(ism_map[i])); 4033 break; 4034 } 4035 } 4036 ism_blkp = ism_blkp->iblk_next; 4037 } 4038 if (locked_hatid) { 4039 sfmmu_ismhat_exit(locked_hatid, 0); 4040 } 4041 4042 hblktag.htag_id = sfmmup; 4043 ttep->ll = 0; 4044 4045 do { 4046 hmeshift = HME_HASH_SHIFT(hashno); 4047 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 4048 hblktag.htag_rehash = hashno; 4049 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift); 4050 4051 SFMMU_HASH_LOCK(hmebp); 4052 4053 HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list); 4054 if (hmeblkp != NULL) { 4055 HBLKTOHME(sfhmep, hmeblkp, addr); 4056 sfmmu_copytte(&sfhmep->hme_tte, ttep); 4057 SFMMU_HASH_UNLOCK(hmebp); 4058 break; 4059 } 4060 SFMMU_HASH_UNLOCK(hmebp); 4061 hashno++; 4062 } while (HME_REHASH(sfmmup) && (hashno <= mmu_hashcnt)); 4063 4064 sfmmu_hblks_list_purge(&list); 4065 } 4066 4067 uint_t 4068 hat_getattr(struct hat *sfmmup, caddr_t addr, uint_t *attr) 4069 { 4070 tte_t tte; 4071 4072 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 4073 4074 sfmmu_gettte(sfmmup, addr, &tte); 4075 if (TTE_IS_VALID(&tte)) { 4076 *attr = sfmmu_ptov_attr(&tte); 4077 return (0); 4078 } 4079 *attr = 0; 4080 return ((uint_t)0xffffffff); 4081 } 4082 4083 /* 4084 * Enables more attributes on specified address range (ie. logical OR) 4085 */ 4086 void 4087 hat_setattr(struct hat *hat, caddr_t addr, size_t len, uint_t attr) 4088 { 4089 if (hat->sfmmu_xhat_provider) { 4090 XHAT_SETATTR(hat, addr, len, attr); 4091 return; 4092 } else { 4093 /* 4094 * This must be a CPU HAT. 
If the address space has 4095 * XHATs attached, change attributes for all of them, 4096 * just in case 4097 */ 4098 ASSERT(hat->sfmmu_as != NULL); 4099 if (hat->sfmmu_as->a_xhat != NULL) 4100 xhat_setattr_all(hat->sfmmu_as, addr, len, attr); 4101 } 4102 4103 sfmmu_chgattr(hat, addr, len, attr, SFMMU_SETATTR); 4104 } 4105 4106 /* 4107 * Assigns attributes to the specified address range. All the attributes 4108 * are specified. 4109 */ 4110 void 4111 hat_chgattr(struct hat *hat, caddr_t addr, size_t len, uint_t attr) 4112 { 4113 if (hat->sfmmu_xhat_provider) { 4114 XHAT_CHGATTR(hat, addr, len, attr); 4115 return; 4116 } else { 4117 /* 4118 * This must be a CPU HAT. If the address space has 4119 * XHATs attached, change attributes for all of them, 4120 * just in case 4121 */ 4122 ASSERT(hat->sfmmu_as != NULL); 4123 if (hat->sfmmu_as->a_xhat != NULL) 4124 xhat_chgattr_all(hat->sfmmu_as, addr, len, attr); 4125 } 4126 4127 sfmmu_chgattr(hat, addr, len, attr, SFMMU_CHGATTR); 4128 } 4129 4130 /* 4131 * Remove attributes on the specified address range (ie. loginal NAND) 4132 */ 4133 void 4134 hat_clrattr(struct hat *hat, caddr_t addr, size_t len, uint_t attr) 4135 { 4136 if (hat->sfmmu_xhat_provider) { 4137 XHAT_CLRATTR(hat, addr, len, attr); 4138 return; 4139 } else { 4140 /* 4141 * This must be a CPU HAT. If the address space has 4142 * XHATs attached, change attributes for all of them, 4143 * just in case 4144 */ 4145 ASSERT(hat->sfmmu_as != NULL); 4146 if (hat->sfmmu_as->a_xhat != NULL) 4147 xhat_clrattr_all(hat->sfmmu_as, addr, len, attr); 4148 } 4149 4150 sfmmu_chgattr(hat, addr, len, attr, SFMMU_CLRATTR); 4151 } 4152 4153 /* 4154 * Change attributes on an address range to that specified by attr and mode. 4155 */ 4156 static void 4157 sfmmu_chgattr(struct hat *sfmmup, caddr_t addr, size_t len, uint_t attr, 4158 int mode) 4159 { 4160 struct hmehash_bucket *hmebp; 4161 hmeblk_tag hblktag; 4162 int hmeshift, hashno = 1; 4163 struct hme_blk *hmeblkp, *list = NULL; 4164 caddr_t endaddr; 4165 cpuset_t cpuset; 4166 demap_range_t dmr; 4167 4168 CPUSET_ZERO(cpuset); 4169 4170 ASSERT((sfmmup == ksfmmup) || 4171 AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 4172 ASSERT((len & MMU_PAGEOFFSET) == 0); 4173 ASSERT(((uintptr_t)addr & MMU_PAGEOFFSET) == 0); 4174 4175 if ((attr & PROT_USER) && (mode != SFMMU_CLRATTR) && 4176 ((addr + len) > (caddr_t)USERLIMIT)) { 4177 panic("user addr %p in kernel space", 4178 (void *)addr); 4179 } 4180 4181 endaddr = addr + len; 4182 hblktag.htag_id = sfmmup; 4183 DEMAP_RANGE_INIT(sfmmup, &dmr); 4184 4185 while (addr < endaddr) { 4186 hmeshift = HME_HASH_SHIFT(hashno); 4187 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 4188 hblktag.htag_rehash = hashno; 4189 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift); 4190 4191 SFMMU_HASH_LOCK(hmebp); 4192 4193 HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list); 4194 if (hmeblkp != NULL) { 4195 /* 4196 * We've encountered a shadow hmeblk so skip the range 4197 * of the next smaller mapping size. 
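 * The shadow hmeblk is only a place holder telling us that smaller
 * hmeblks exist within this large-page sized region.  Since the
 * smaller-size probes for this address have already missed, the chunk
 * of the next smaller mapping size containing addr holds no mappings;
 * we step past it and restart the search at the smallest hash size.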
4198 */ 4199 if (hmeblkp->hblk_shw_bit) { 4200 ASSERT(sfmmup != ksfmmup); 4201 ASSERT(hashno > 1); 4202 addr = (caddr_t)P2END((uintptr_t)addr, 4203 TTEBYTES(hashno - 1)); 4204 } else { 4205 addr = sfmmu_hblk_chgattr(sfmmup, 4206 hmeblkp, addr, endaddr, &dmr, attr, mode); 4207 } 4208 SFMMU_HASH_UNLOCK(hmebp); 4209 hashno = 1; 4210 continue; 4211 } 4212 SFMMU_HASH_UNLOCK(hmebp); 4213 4214 if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) { 4215 /* 4216 * We have traversed the whole list and rehashed 4217 * if necessary without finding the address to chgattr. 4218 * This is ok, so we increment the address by the 4219 * smallest hmeblk range for kernel mappings or for 4220 * user mappings with no large pages, and the largest 4221 * hmeblk range, to account for shadow hmeblks, for 4222 * user mappings with large pages and continue. 4223 */ 4224 if (sfmmup == ksfmmup) 4225 addr = (caddr_t)P2END((uintptr_t)addr, 4226 TTEBYTES(1)); 4227 else 4228 addr = (caddr_t)P2END((uintptr_t)addr, 4229 TTEBYTES(hashno)); 4230 hashno = 1; 4231 } else { 4232 hashno++; 4233 } 4234 } 4235 4236 sfmmu_hblks_list_purge(&list); 4237 DEMAP_RANGE_FLUSH(&dmr); 4238 cpuset = sfmmup->sfmmu_cpusran; 4239 xt_sync(cpuset); 4240 } 4241 4242 /* 4243 * This function chgattr on a range of addresses in an hmeblk. It returns the 4244 * next addres that needs to be chgattr. 4245 * It should be called with the hash lock held. 4246 * XXX It should be possible to optimize chgattr by not flushing every time but 4247 * on the other hand: 4248 * 1. do one flush crosscall. 4249 * 2. only flush if we are increasing permissions (make sure this will work) 4250 */ 4251 static caddr_t 4252 sfmmu_hblk_chgattr(struct hat *sfmmup, struct hme_blk *hmeblkp, caddr_t addr, 4253 caddr_t endaddr, demap_range_t *dmrp, uint_t attr, int mode) 4254 { 4255 tte_t tte, tteattr, tteflags, ttemod; 4256 struct sf_hment *sfhmep; 4257 int ttesz; 4258 struct page *pp = NULL; 4259 kmutex_t *pml, *pmtx; 4260 int ret; 4261 int use_demap_range; 4262 #if defined(SF_ERRATA_57) 4263 int check_exec; 4264 #endif 4265 4266 ASSERT(in_hblk_range(hmeblkp, addr)); 4267 ASSERT(hmeblkp->hblk_shw_bit == 0); 4268 4269 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp)); 4270 ttesz = get_hblk_ttesz(hmeblkp); 4271 4272 /* 4273 * Flush the current demap region if addresses have been 4274 * skipped or the page size doesn't match. 4275 */ 4276 use_demap_range = (TTEBYTES(ttesz) == DEMAP_RANGE_PGSZ(dmrp)); 4277 if (use_demap_range) { 4278 DEMAP_RANGE_CONTINUE(dmrp, addr, endaddr); 4279 } else { 4280 DEMAP_RANGE_FLUSH(dmrp); 4281 } 4282 4283 tteattr.ll = sfmmu_vtop_attr(attr, mode, &tteflags); 4284 #if defined(SF_ERRATA_57) 4285 check_exec = (sfmmup != ksfmmup) && 4286 AS_TYPE_64BIT(sfmmup->sfmmu_as) && 4287 TTE_IS_EXECUTABLE(&tteattr); 4288 #endif 4289 HBLKTOHME(sfhmep, hmeblkp, addr); 4290 while (addr < endaddr) { 4291 sfmmu_copytte(&sfhmep->hme_tte, &tte); 4292 if (TTE_IS_VALID(&tte)) { 4293 if ((tte.ll & tteflags.ll) == tteattr.ll) { 4294 /* 4295 * if the new attr is the same as old 4296 * continue 4297 */ 4298 goto next_addr; 4299 } 4300 if (!TTE_IS_WRITABLE(&tteattr)) { 4301 /* 4302 * make sure we clear hw modify bit if we 4303 * removing write protections 4304 */ 4305 tteflags.tte_intlo |= TTE_HWWR_INT; 4306 } 4307 4308 pml = NULL; 4309 pp = sfhmep->hme_page; 4310 if (pp) { 4311 pml = sfmmu_mlist_enter(pp); 4312 } 4313 4314 if (pp != sfhmep->hme_page) { 4315 /* 4316 * tte must have been unloaded. 
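 * (page_unload can invalidate it between our sfmmu_copytte() and
 * taking the mapping list lock.)  Drop the list lock and examine
 * this tte again.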
4317 */ 4318 ASSERT(pml); 4319 sfmmu_mlist_exit(pml); 4320 continue; 4321 } 4322 4323 ASSERT(pp == NULL || sfmmu_mlist_held(pp)); 4324 4325 ttemod = tte; 4326 ttemod.ll = (ttemod.ll & ~tteflags.ll) | tteattr.ll; 4327 ASSERT(TTE_TO_TTEPFN(&ttemod) == TTE_TO_TTEPFN(&tte)); 4328 4329 #if defined(SF_ERRATA_57) 4330 if (check_exec && addr < errata57_limit) 4331 ttemod.tte_exec_perm = 0; 4332 #endif 4333 ret = sfmmu_modifytte_try(&tte, &ttemod, 4334 &sfhmep->hme_tte); 4335 4336 if (ret < 0) { 4337 /* tte changed underneath us */ 4338 if (pml) { 4339 sfmmu_mlist_exit(pml); 4340 } 4341 continue; 4342 } 4343 4344 if (tteflags.tte_intlo & TTE_HWWR_INT) { 4345 /* 4346 * need to sync if we are clearing modify bit. 4347 */ 4348 sfmmu_ttesync(sfmmup, addr, &tte, pp); 4349 } 4350 4351 if (pp && PP_ISRO(pp)) { 4352 if (tteattr.tte_intlo & TTE_WRPRM_INT) { 4353 pmtx = sfmmu_page_enter(pp); 4354 PP_CLRRO(pp); 4355 sfmmu_page_exit(pmtx); 4356 } 4357 } 4358 4359 if (ret > 0 && use_demap_range) { 4360 DEMAP_RANGE_MARKPG(dmrp, addr); 4361 } else if (ret > 0) { 4362 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0); 4363 } 4364 4365 if (pml) { 4366 sfmmu_mlist_exit(pml); 4367 } 4368 } 4369 next_addr: 4370 addr += TTEBYTES(ttesz); 4371 sfhmep++; 4372 DEMAP_RANGE_NEXTPG(dmrp); 4373 } 4374 return (addr); 4375 } 4376 4377 /* 4378 * This routine converts virtual attributes to physical ones. It will 4379 * update the tteflags field with the tte mask corresponding to the attributes 4380 * affected and it returns the new attributes. It will also clear the modify 4381 * bit if we are taking away write permission. This is necessary since the 4382 * modify bit is the hardware permission bit and we need to clear it in order 4383 * to detect write faults. 4384 */ 4385 static uint64_t 4386 sfmmu_vtop_attr(uint_t attr, int mode, tte_t *ttemaskp) 4387 { 4388 tte_t ttevalue; 4389 4390 ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR)); 4391 4392 switch (mode) { 4393 case SFMMU_CHGATTR: 4394 /* all attributes specified */ 4395 ttevalue.tte_inthi = MAKE_TTEATTR_INTHI(attr); 4396 ttevalue.tte_intlo = MAKE_TTEATTR_INTLO(attr); 4397 ttemaskp->tte_inthi = TTEINTHI_ATTR; 4398 ttemaskp->tte_intlo = TTEINTLO_ATTR; 4399 break; 4400 case SFMMU_SETATTR: 4401 ASSERT(!(attr & ~HAT_PROT_MASK)); 4402 ttemaskp->ll = 0; 4403 ttevalue.ll = 0; 4404 /* 4405 * a valid tte implies exec and read for sfmmu 4406 * so no need to do anything about them. 4407 * since priviledged access implies user access 4408 * PROT_USER doesn't make sense either. 
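 * The only attribute SFMMU_SETATTR can actually turn on is therefore
 * the write permission handled below.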
4409 */ 4410 if (attr & PROT_WRITE) { 4411 ttemaskp->tte_intlo |= TTE_WRPRM_INT; 4412 ttevalue.tte_intlo |= TTE_WRPRM_INT; 4413 } 4414 break; 4415 case SFMMU_CLRATTR: 4416 /* attributes will be nand with current ones */ 4417 if (attr & ~(PROT_WRITE | PROT_USER)) { 4418 panic("sfmmu: attr %x not supported", attr); 4419 } 4420 ttemaskp->ll = 0; 4421 ttevalue.ll = 0; 4422 if (attr & PROT_WRITE) { 4423 /* clear both writable and modify bit */ 4424 ttemaskp->tte_intlo |= TTE_WRPRM_INT | TTE_HWWR_INT; 4425 } 4426 if (attr & PROT_USER) { 4427 ttemaskp->tte_intlo |= TTE_PRIV_INT; 4428 ttevalue.tte_intlo |= TTE_PRIV_INT; 4429 } 4430 break; 4431 default: 4432 panic("sfmmu_vtop_attr: bad mode %x", mode); 4433 } 4434 ASSERT(TTE_TO_TTEPFN(&ttevalue) == 0); 4435 return (ttevalue.ll); 4436 } 4437 4438 static uint_t 4439 sfmmu_ptov_attr(tte_t *ttep) 4440 { 4441 uint_t attr; 4442 4443 ASSERT(TTE_IS_VALID(ttep)); 4444 4445 attr = PROT_READ; 4446 4447 if (TTE_IS_WRITABLE(ttep)) { 4448 attr |= PROT_WRITE; 4449 } 4450 if (TTE_IS_EXECUTABLE(ttep)) { 4451 attr |= PROT_EXEC; 4452 } 4453 if (!TTE_IS_PRIVILEGED(ttep)) { 4454 attr |= PROT_USER; 4455 } 4456 if (TTE_IS_NFO(ttep)) { 4457 attr |= HAT_NOFAULT; 4458 } 4459 if (TTE_IS_NOSYNC(ttep)) { 4460 attr |= HAT_NOSYNC; 4461 } 4462 if (TTE_IS_SIDEFFECT(ttep)) { 4463 attr |= SFMMU_SIDEFFECT; 4464 } 4465 if (!TTE_IS_VCACHEABLE(ttep)) { 4466 attr |= SFMMU_UNCACHEVTTE; 4467 } 4468 if (!TTE_IS_PCACHEABLE(ttep)) { 4469 attr |= SFMMU_UNCACHEPTTE; 4470 } 4471 return (attr); 4472 } 4473 4474 /* 4475 * hat_chgprot is a deprecated hat call. New segment drivers 4476 * should store all attributes and use hat_*attr calls. 4477 * 4478 * Change the protections in the virtual address range 4479 * given to the specified virtual protection. If vprot is ~PROT_WRITE, 4480 * then remove write permission, leaving the other 4481 * permissions unchanged. If vprot is ~PROT_USER, remove user permissions. 4482 * 4483 */ 4484 void 4485 hat_chgprot(struct hat *sfmmup, caddr_t addr, size_t len, uint_t vprot) 4486 { 4487 struct hmehash_bucket *hmebp; 4488 hmeblk_tag hblktag; 4489 int hmeshift, hashno = 1; 4490 struct hme_blk *hmeblkp, *list = NULL; 4491 caddr_t endaddr; 4492 cpuset_t cpuset; 4493 demap_range_t dmr; 4494 4495 ASSERT((len & MMU_PAGEOFFSET) == 0); 4496 ASSERT(((uintptr_t)addr & MMU_PAGEOFFSET) == 0); 4497 4498 if (sfmmup->sfmmu_xhat_provider) { 4499 XHAT_CHGPROT(sfmmup, addr, len, vprot); 4500 return; 4501 } else { 4502 /* 4503 * This must be a CPU HAT. If the address space has 4504 * XHATs attached, change attributes for all of them, 4505 * just in case 4506 */ 4507 ASSERT(sfmmup->sfmmu_as != NULL); 4508 if (sfmmup->sfmmu_as->a_xhat != NULL) 4509 xhat_chgprot_all(sfmmup->sfmmu_as, addr, len, vprot); 4510 } 4511 4512 CPUSET_ZERO(cpuset); 4513 4514 if ((vprot != (uint_t)~PROT_WRITE) && (vprot & PROT_USER) && 4515 ((addr + len) > (caddr_t)USERLIMIT)) { 4516 panic("user addr %p vprot %x in kernel space", 4517 (void *)addr, vprot); 4518 } 4519 endaddr = addr + len; 4520 hblktag.htag_id = sfmmup; 4521 DEMAP_RANGE_INIT(sfmmup, &dmr); 4522 4523 while (addr < endaddr) { 4524 hmeshift = HME_HASH_SHIFT(hashno); 4525 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 4526 hblktag.htag_rehash = hashno; 4527 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift); 4528 4529 SFMMU_HASH_LOCK(hmebp); 4530 4531 HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list); 4532 if (hmeblkp != NULL) { 4533 /* 4534 * We've encountered a shadow hmeblk so skip the range 4535 * of the next smaller mapping size. 
4536 */ 4537 if (hmeblkp->hblk_shw_bit) { 4538 ASSERT(sfmmup != ksfmmup); 4539 ASSERT(hashno > 1); 4540 addr = (caddr_t)P2END((uintptr_t)addr, 4541 TTEBYTES(hashno - 1)); 4542 } else { 4543 addr = sfmmu_hblk_chgprot(sfmmup, hmeblkp, 4544 addr, endaddr, &dmr, vprot); 4545 } 4546 SFMMU_HASH_UNLOCK(hmebp); 4547 hashno = 1; 4548 continue; 4549 } 4550 SFMMU_HASH_UNLOCK(hmebp); 4551 4552 if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) { 4553 /* 4554 * We have traversed the whole list and rehashed 4555 * if necessary without finding the address to chgprot. 4556 * This is ok so we increment the address by the 4557 * smallest hmeblk range for kernel mappings and the 4558 * largest hmeblk range, to account for shadow hmeblks, 4559 * for user mappings and continue. 4560 */ 4561 if (sfmmup == ksfmmup) 4562 addr = (caddr_t)P2END((uintptr_t)addr, 4563 TTEBYTES(1)); 4564 else 4565 addr = (caddr_t)P2END((uintptr_t)addr, 4566 TTEBYTES(hashno)); 4567 hashno = 1; 4568 } else { 4569 hashno++; 4570 } 4571 } 4572 4573 sfmmu_hblks_list_purge(&list); 4574 DEMAP_RANGE_FLUSH(&dmr); 4575 cpuset = sfmmup->sfmmu_cpusran; 4576 xt_sync(cpuset); 4577 } 4578 4579 /* 4580 * This function chgprots a range of addresses in an hmeblk. It returns the 4581 * next addres that needs to be chgprot. 4582 * It should be called with the hash lock held. 4583 * XXX It shold be possible to optimize chgprot by not flushing every time but 4584 * on the other hand: 4585 * 1. do one flush crosscall. 4586 * 2. only flush if we are increasing permissions (make sure this will work) 4587 */ 4588 static caddr_t 4589 sfmmu_hblk_chgprot(sfmmu_t *sfmmup, struct hme_blk *hmeblkp, caddr_t addr, 4590 caddr_t endaddr, demap_range_t *dmrp, uint_t vprot) 4591 { 4592 uint_t pprot; 4593 tte_t tte, ttemod; 4594 struct sf_hment *sfhmep; 4595 uint_t tteflags; 4596 int ttesz; 4597 struct page *pp = NULL; 4598 kmutex_t *pml, *pmtx; 4599 int ret; 4600 int use_demap_range; 4601 #if defined(SF_ERRATA_57) 4602 int check_exec; 4603 #endif 4604 4605 ASSERT(in_hblk_range(hmeblkp, addr)); 4606 ASSERT(hmeblkp->hblk_shw_bit == 0); 4607 4608 #ifdef DEBUG 4609 if (get_hblk_ttesz(hmeblkp) != TTE8K && 4610 (endaddr < get_hblk_endaddr(hmeblkp))) { 4611 panic("sfmmu_hblk_chgprot: partial chgprot of large page"); 4612 } 4613 #endif /* DEBUG */ 4614 4615 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp)); 4616 ttesz = get_hblk_ttesz(hmeblkp); 4617 4618 pprot = sfmmu_vtop_prot(vprot, &tteflags); 4619 #if defined(SF_ERRATA_57) 4620 check_exec = (sfmmup != ksfmmup) && 4621 AS_TYPE_64BIT(sfmmup->sfmmu_as) && 4622 ((vprot & PROT_EXEC) == PROT_EXEC); 4623 #endif 4624 HBLKTOHME(sfhmep, hmeblkp, addr); 4625 4626 /* 4627 * Flush the current demap region if addresses have been 4628 * skipped or the page size doesn't match. 4629 */ 4630 use_demap_range = (TTEBYTES(ttesz) == MMU_PAGESIZE); 4631 if (use_demap_range) { 4632 DEMAP_RANGE_CONTINUE(dmrp, addr, endaddr); 4633 } else { 4634 DEMAP_RANGE_FLUSH(dmrp); 4635 } 4636 4637 while (addr < endaddr) { 4638 sfmmu_copytte(&sfhmep->hme_tte, &tte); 4639 if (TTE_IS_VALID(&tte)) { 4640 if (TTE_GET_LOFLAGS(&tte, tteflags) == pprot) { 4641 /* 4642 * if the new protection is the same as old 4643 * continue 4644 */ 4645 goto next_addr; 4646 } 4647 pml = NULL; 4648 pp = sfhmep->hme_page; 4649 if (pp) { 4650 pml = sfmmu_mlist_enter(pp); 4651 } 4652 if (pp != sfhmep->hme_page) { 4653 /* 4654 * tte most have been unloaded 4655 * underneath us. 
Recheck 4656 */ 4657 ASSERT(pml); 4658 sfmmu_mlist_exit(pml); 4659 continue; 4660 } 4661 4662 ASSERT(pp == NULL || sfmmu_mlist_held(pp)); 4663 4664 ttemod = tte; 4665 TTE_SET_LOFLAGS(&ttemod, tteflags, pprot); 4666 #if defined(SF_ERRATA_57) 4667 if (check_exec && addr < errata57_limit) 4668 ttemod.tte_exec_perm = 0; 4669 #endif 4670 ret = sfmmu_modifytte_try(&tte, &ttemod, 4671 &sfhmep->hme_tte); 4672 4673 if (ret < 0) { 4674 /* tte changed underneath us */ 4675 if (pml) { 4676 sfmmu_mlist_exit(pml); 4677 } 4678 continue; 4679 } 4680 4681 if (tteflags & TTE_HWWR_INT) { 4682 /* 4683 * need to sync if we are clearing modify bit. 4684 */ 4685 sfmmu_ttesync(sfmmup, addr, &tte, pp); 4686 } 4687 4688 if (pp && PP_ISRO(pp)) { 4689 if (pprot & TTE_WRPRM_INT) { 4690 pmtx = sfmmu_page_enter(pp); 4691 PP_CLRRO(pp); 4692 sfmmu_page_exit(pmtx); 4693 } 4694 } 4695 4696 if (ret > 0 && use_demap_range) { 4697 DEMAP_RANGE_MARKPG(dmrp, addr); 4698 } else if (ret > 0) { 4699 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0); 4700 } 4701 4702 if (pml) { 4703 sfmmu_mlist_exit(pml); 4704 } 4705 } 4706 next_addr: 4707 addr += TTEBYTES(ttesz); 4708 sfhmep++; 4709 DEMAP_RANGE_NEXTPG(dmrp); 4710 } 4711 return (addr); 4712 } 4713 4714 /* 4715 * This routine is deprecated and should only be used by hat_chgprot. 4716 * The correct routine is sfmmu_vtop_attr. 4717 * This routine converts virtual page protections to physical ones. It will 4718 * update the tteflags field with the tte mask corresponding to the protections 4719 * affected and it returns the new protections. It will also clear the modify 4720 * bit if we are taking away write permission. This is necessary since the 4721 * modify bit is the hardware permission bit and we need to clear it in order 4722 * to detect write faults. 4723 * It accepts the following special protections: 4724 * ~PROT_WRITE = remove write permissions. 4725 * ~PROT_USER = remove user permissions. 
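 *
 * For example (illustrative only), a caller of the deprecated
 * hat_chgprot() interface can write-protect a range with
 *
 *	hat_chgprot(hat, addr, len, (uint_t)~PROT_WRITE);
 *
 * leaving the other permissions unchanged, whereas an ordinary value
 * such as (PROT_READ | PROT_WRITE) replaces the protections outright.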
4726 */ 4727 static uint_t 4728 sfmmu_vtop_prot(uint_t vprot, uint_t *tteflagsp) 4729 { 4730 if (vprot == (uint_t)~PROT_WRITE) { 4731 *tteflagsp = TTE_WRPRM_INT | TTE_HWWR_INT; 4732 return (0); /* will cause wrprm to be cleared */ 4733 } 4734 if (vprot == (uint_t)~PROT_USER) { 4735 *tteflagsp = TTE_PRIV_INT; 4736 return (0); /* will cause privprm to be cleared */ 4737 } 4738 if ((vprot == 0) || (vprot == PROT_USER) || 4739 ((vprot & PROT_ALL) != vprot)) { 4740 panic("sfmmu_vtop_prot -- bad prot %x", vprot); 4741 } 4742 4743 switch (vprot) { 4744 case (PROT_READ): 4745 case (PROT_EXEC): 4746 case (PROT_EXEC | PROT_READ): 4747 *tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT | TTE_HWWR_INT; 4748 return (TTE_PRIV_INT); /* set prv and clr wrt */ 4749 case (PROT_WRITE): 4750 case (PROT_WRITE | PROT_READ): 4751 case (PROT_EXEC | PROT_WRITE): 4752 case (PROT_EXEC | PROT_WRITE | PROT_READ): 4753 *tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT; 4754 return (TTE_PRIV_INT | TTE_WRPRM_INT); /* set prv and wrt */ 4755 case (PROT_USER | PROT_READ): 4756 case (PROT_USER | PROT_EXEC): 4757 case (PROT_USER | PROT_EXEC | PROT_READ): 4758 *tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT | TTE_HWWR_INT; 4759 return (0); /* clr prv and wrt */ 4760 case (PROT_USER | PROT_WRITE): 4761 case (PROT_USER | PROT_WRITE | PROT_READ): 4762 case (PROT_USER | PROT_EXEC | PROT_WRITE): 4763 case (PROT_USER | PROT_EXEC | PROT_WRITE | PROT_READ): 4764 *tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT; 4765 return (TTE_WRPRM_INT); /* clr prv and set wrt */ 4766 default: 4767 panic("sfmmu_vtop_prot -- bad prot %x", vprot); 4768 } 4769 return (0); 4770 } 4771 4772 /* 4773 * Alternate unload for very large virtual ranges. With a true 64 bit VA, 4774 * the normal algorithm would take too long for a very large VA range with 4775 * few real mappings. This routine just walks thru all HMEs in the global 4776 * hash table to find and remove mappings. 4777 */ 4778 static void 4779 hat_unload_large_virtual( 4780 struct hat *sfmmup, 4781 caddr_t startaddr, 4782 size_t len, 4783 uint_t flags, 4784 hat_callback_t *callback) 4785 { 4786 struct hmehash_bucket *hmebp; 4787 struct hme_blk *hmeblkp; 4788 struct hme_blk *pr_hblk = NULL; 4789 struct hme_blk *nx_hblk; 4790 struct hme_blk *list = NULL; 4791 int i; 4792 uint64_t hblkpa, prevpa, nx_pa; 4793 demap_range_t dmr, *dmrp; 4794 cpuset_t cpuset; 4795 caddr_t endaddr = startaddr + len; 4796 caddr_t sa; 4797 caddr_t ea; 4798 caddr_t cb_sa[MAX_CB_ADDR]; 4799 caddr_t cb_ea[MAX_CB_ADDR]; 4800 int addr_cnt = 0; 4801 int a = 0; 4802 4803 if (sfmmup->sfmmu_free) { 4804 dmrp = NULL; 4805 } else { 4806 dmrp = &dmr; 4807 DEMAP_RANGE_INIT(sfmmup, dmrp); 4808 } 4809 4810 /* 4811 * Loop through all the hash buckets of HME blocks looking for matches. 
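 * Callback address ranges are collected in cb_sa[]/cb_ea[] so that a
 * single demap flush and xt_sync() can precede each batch of up to
 * MAX_CB_ADDR callback invocations.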
4812 */ 4813 for (i = 0; i <= UHMEHASH_SZ; i++) { 4814 hmebp = &uhme_hash[i]; 4815 SFMMU_HASH_LOCK(hmebp); 4816 hmeblkp = hmebp->hmeblkp; 4817 hblkpa = hmebp->hmeh_nextpa; 4818 prevpa = 0; 4819 pr_hblk = NULL; 4820 while (hmeblkp) { 4821 nx_hblk = hmeblkp->hblk_next; 4822 nx_pa = hmeblkp->hblk_nextpa; 4823 4824 /* 4825 * skip if not this context, if a shadow block or 4826 * if the mapping is not in the requested range 4827 */ 4828 if (hmeblkp->hblk_tag.htag_id != sfmmup || 4829 hmeblkp->hblk_shw_bit || 4830 (sa = (caddr_t)get_hblk_base(hmeblkp)) >= endaddr || 4831 (ea = get_hblk_endaddr(hmeblkp)) <= startaddr) { 4832 pr_hblk = hmeblkp; 4833 prevpa = hblkpa; 4834 goto next_block; 4835 } 4836 4837 /* 4838 * unload if there are any current valid mappings 4839 */ 4840 if (hmeblkp->hblk_vcnt != 0 || 4841 hmeblkp->hblk_hmecnt != 0) 4842 (void) sfmmu_hblk_unload(sfmmup, hmeblkp, 4843 sa, ea, dmrp, flags); 4844 4845 /* 4846 * on unmap we also release the HME block itself, once 4847 * all mappings are gone. 4848 */ 4849 if ((flags & HAT_UNLOAD_UNMAP) != 0 && 4850 !hmeblkp->hblk_vcnt && 4851 !hmeblkp->hblk_hmecnt) { 4852 ASSERT(!hmeblkp->hblk_lckcnt); 4853 sfmmu_hblk_hash_rm(hmebp, hmeblkp, 4854 prevpa, pr_hblk); 4855 sfmmu_hblk_free(hmebp, hmeblkp, hblkpa, &list); 4856 } else { 4857 pr_hblk = hmeblkp; 4858 prevpa = hblkpa; 4859 } 4860 4861 if (callback == NULL) 4862 goto next_block; 4863 4864 /* 4865 * HME blocks may span more than one page, but we may be 4866 * unmapping only one page, so check for a smaller range 4867 * for the callback 4868 */ 4869 if (sa < startaddr) 4870 sa = startaddr; 4871 if (--ea > endaddr) 4872 ea = endaddr - 1; 4873 4874 cb_sa[addr_cnt] = sa; 4875 cb_ea[addr_cnt] = ea; 4876 if (++addr_cnt == MAX_CB_ADDR) { 4877 if (dmrp != NULL) { 4878 DEMAP_RANGE_FLUSH(dmrp); 4879 cpuset = sfmmup->sfmmu_cpusran; 4880 xt_sync(cpuset); 4881 } 4882 4883 for (a = 0; a < MAX_CB_ADDR; ++a) { 4884 callback->hcb_start_addr = cb_sa[a]; 4885 callback->hcb_end_addr = cb_ea[a]; 4886 callback->hcb_function(callback); 4887 } 4888 addr_cnt = 0; 4889 } 4890 4891 next_block: 4892 hmeblkp = nx_hblk; 4893 hblkpa = nx_pa; 4894 } 4895 SFMMU_HASH_UNLOCK(hmebp); 4896 } 4897 4898 sfmmu_hblks_list_purge(&list); 4899 if (dmrp != NULL) { 4900 DEMAP_RANGE_FLUSH(dmrp); 4901 cpuset = sfmmup->sfmmu_cpusran; 4902 xt_sync(cpuset); 4903 } 4904 4905 for (a = 0; a < addr_cnt; ++a) { 4906 callback->hcb_start_addr = cb_sa[a]; 4907 callback->hcb_end_addr = cb_ea[a]; 4908 callback->hcb_function(callback); 4909 } 4910 4911 /* 4912 * Check TSB and TLB page sizes if the process isn't exiting. 4913 */ 4914 if (!sfmmup->sfmmu_free) 4915 sfmmu_check_page_sizes(sfmmup, 0); 4916 } 4917 4918 /* 4919 * Unload all the mappings in the range [addr..addr+len). addr and len must 4920 * be MMU_PAGESIZE aligned. 
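 *
 * Note that empty kernel hmeblks are freed as they are encountered,
 * except for those backing segkmap mappings (see the ISSEGKMAP() test
 * below), which are left in place.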
4921 */ 4922 4923 extern struct seg *segkmap; 4924 #define ISSEGKMAP(sfmmup, addr) (sfmmup == ksfmmup && \ 4925 segkmap->s_base <= (addr) && (addr) < (segkmap->s_base + segkmap->s_size)) 4926 4927 4928 void 4929 hat_unload_callback( 4930 struct hat *sfmmup, 4931 caddr_t addr, 4932 size_t len, 4933 uint_t flags, 4934 hat_callback_t *callback) 4935 { 4936 struct hmehash_bucket *hmebp; 4937 hmeblk_tag hblktag; 4938 int hmeshift, hashno, iskernel; 4939 struct hme_blk *hmeblkp, *pr_hblk, *list = NULL; 4940 caddr_t endaddr; 4941 cpuset_t cpuset; 4942 uint64_t hblkpa, prevpa; 4943 int addr_count = 0; 4944 int a; 4945 caddr_t cb_start_addr[MAX_CB_ADDR]; 4946 caddr_t cb_end_addr[MAX_CB_ADDR]; 4947 int issegkmap = ISSEGKMAP(sfmmup, addr); 4948 demap_range_t dmr, *dmrp; 4949 4950 if (sfmmup->sfmmu_xhat_provider) { 4951 XHAT_UNLOAD_CALLBACK(sfmmup, addr, len, flags, callback); 4952 return; 4953 } else { 4954 /* 4955 * This must be a CPU HAT. If the address space has 4956 * XHATs attached, unload the mappings for all of them, 4957 * just in case 4958 */ 4959 ASSERT(sfmmup->sfmmu_as != NULL); 4960 if (sfmmup->sfmmu_as->a_xhat != NULL) 4961 xhat_unload_callback_all(sfmmup->sfmmu_as, addr, 4962 len, flags, callback); 4963 } 4964 4965 ASSERT((sfmmup == ksfmmup) || (flags & HAT_UNLOAD_OTHER) || \ 4966 AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 4967 4968 ASSERT(sfmmup != NULL); 4969 ASSERT((len & MMU_PAGEOFFSET) == 0); 4970 ASSERT(!((uintptr_t)addr & MMU_PAGEOFFSET)); 4971 4972 /* 4973 * Probing through a large VA range (say 63 bits) will be slow, even 4974 * at 4 Meg steps between the probes. So, when the virtual address range 4975 * is very large, search the HME entries for what to unload. 4976 * 4977 * len >> TTE_PAGE_SHIFT(TTE4M) is the # of 4Meg probes we'd need 4978 * 4979 * UHMEHASH_SZ is number of hash buckets to examine 4980 * 4981 */ 4982 if (sfmmup != KHATID && (len >> TTE_PAGE_SHIFT(TTE4M)) > UHMEHASH_SZ) { 4983 hat_unload_large_virtual(sfmmup, addr, len, flags, callback); 4984 return; 4985 } 4986 4987 CPUSET_ZERO(cpuset); 4988 4989 /* 4990 * If the process is exiting, we can save a lot of fuss since 4991 * we'll flush the TLB when we free the ctx anyway. 4992 */ 4993 if (sfmmup->sfmmu_free) 4994 dmrp = NULL; 4995 else 4996 dmrp = &dmr; 4997 4998 DEMAP_RANGE_INIT(sfmmup, dmrp); 4999 endaddr = addr + len; 5000 hblktag.htag_id = sfmmup; 5001 5002 /* 5003 * It is likely for the vm to call unload over a wide range of 5004 * addresses that are actually very sparsely populated by 5005 * translations. In order to speed this up the sfmmu hat supports 5006 * the concept of shadow hmeblks. Dummy large page hmeblks that 5007 * correspond to actual small translations are allocated at tteload 5008 * time and are referred to as shadow hmeblks. Now, during unload 5009 * time, we first check if we have a shadow hmeblk for that 5010 * translation. The absence of one means the corresponding address 5011 * range is empty and can be skipped. 5012 * 5013 * The kernel is an exception to above statement and that is why 5014 * we don't use shadow hmeblks and hash starting from the smallest 5015 * page size. 
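 *
 * Concretely: a user hat is probed starting at the largest supported
 * page size (TTE256M, or TTE4M when fewer page sizes are enabled); a
 * miss there means neither a real nor a shadow hmeblk covers that
 * chunk, so the whole chunk is skipped.  The kernel hat starts at
 * TTE64K instead and rehashes upward when a probe misses.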
5016 */ 5017 if (sfmmup == KHATID) { 5018 iskernel = 1; 5019 hashno = TTE64K; 5020 } else { 5021 iskernel = 0; 5022 if (mmu_page_sizes == max_mmu_page_sizes) { 5023 hashno = TTE256M; 5024 } else { 5025 hashno = TTE4M; 5026 } 5027 } 5028 while (addr < endaddr) { 5029 hmeshift = HME_HASH_SHIFT(hashno); 5030 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 5031 hblktag.htag_rehash = hashno; 5032 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift); 5033 5034 SFMMU_HASH_LOCK(hmebp); 5035 5036 HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, hblkpa, pr_hblk, 5037 prevpa, &list); 5038 if (hmeblkp == NULL) { 5039 /* 5040 * didn't find an hmeblk. skip the appropiate 5041 * address range. 5042 */ 5043 SFMMU_HASH_UNLOCK(hmebp); 5044 if (iskernel) { 5045 if (hashno < mmu_hashcnt) { 5046 hashno++; 5047 continue; 5048 } else { 5049 hashno = TTE64K; 5050 addr = (caddr_t)roundup((uintptr_t)addr 5051 + 1, MMU_PAGESIZE64K); 5052 continue; 5053 } 5054 } 5055 addr = (caddr_t)roundup((uintptr_t)addr + 1, 5056 (1 << hmeshift)); 5057 if ((uintptr_t)addr & MMU_PAGEOFFSET512K) { 5058 ASSERT(hashno == TTE64K); 5059 continue; 5060 } 5061 if ((uintptr_t)addr & MMU_PAGEOFFSET4M) { 5062 hashno = TTE512K; 5063 continue; 5064 } 5065 if (mmu_page_sizes == max_mmu_page_sizes) { 5066 if ((uintptr_t)addr & MMU_PAGEOFFSET32M) { 5067 hashno = TTE4M; 5068 continue; 5069 } 5070 if ((uintptr_t)addr & MMU_PAGEOFFSET256M) { 5071 hashno = TTE32M; 5072 continue; 5073 } 5074 hashno = TTE256M; 5075 continue; 5076 } else { 5077 hashno = TTE4M; 5078 continue; 5079 } 5080 } 5081 ASSERT(hmeblkp); 5082 if (!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) { 5083 /* 5084 * If the valid count is zero we can skip the range 5085 * mapped by this hmeblk. 5086 * We free hblks in the case of HAT_UNMAP. HAT_UNMAP 5087 * is used by segment drivers as a hint 5088 * that the mapping resource won't be used any longer. 5089 * The best example of this is during exit(). 5090 */ 5091 addr = (caddr_t)roundup((uintptr_t)addr + 1, 5092 get_hblk_span(hmeblkp)); 5093 if ((flags & HAT_UNLOAD_UNMAP) || 5094 (iskernel && !issegkmap)) { 5095 sfmmu_hblk_hash_rm(hmebp, hmeblkp, prevpa, 5096 pr_hblk); 5097 sfmmu_hblk_free(hmebp, hmeblkp, hblkpa, &list); 5098 } 5099 SFMMU_HASH_UNLOCK(hmebp); 5100 5101 if (iskernel) { 5102 hashno = TTE64K; 5103 continue; 5104 } 5105 if ((uintptr_t)addr & MMU_PAGEOFFSET512K) { 5106 ASSERT(hashno == TTE64K); 5107 continue; 5108 } 5109 if ((uintptr_t)addr & MMU_PAGEOFFSET4M) { 5110 hashno = TTE512K; 5111 continue; 5112 } 5113 if (mmu_page_sizes == max_mmu_page_sizes) { 5114 if ((uintptr_t)addr & MMU_PAGEOFFSET32M) { 5115 hashno = TTE4M; 5116 continue; 5117 } 5118 if ((uintptr_t)addr & MMU_PAGEOFFSET256M) { 5119 hashno = TTE32M; 5120 continue; 5121 } 5122 hashno = TTE256M; 5123 continue; 5124 } else { 5125 hashno = TTE4M; 5126 continue; 5127 } 5128 } 5129 if (hmeblkp->hblk_shw_bit) { 5130 /* 5131 * If we encounter a shadow hmeblk we know there is 5132 * smaller sized hmeblks mapping the same address space. 5133 * Decrement the hash size and rehash. 5134 */ 5135 ASSERT(sfmmup != KHATID); 5136 hashno--; 5137 SFMMU_HASH_UNLOCK(hmebp); 5138 continue; 5139 } 5140 5141 /* 5142 * track callback address ranges. 
5143 * only start a new range when it's not contiguous 5144 */ 5145 if (callback != NULL) { 5146 if (addr_count > 0 && 5147 addr == cb_end_addr[addr_count - 1]) 5148 --addr_count; 5149 else 5150 cb_start_addr[addr_count] = addr; 5151 } 5152 5153 addr = sfmmu_hblk_unload(sfmmup, hmeblkp, addr, endaddr, 5154 dmrp, flags); 5155 5156 if (callback != NULL) 5157 cb_end_addr[addr_count++] = addr; 5158 5159 if (((flags & HAT_UNLOAD_UNMAP) || (iskernel && !issegkmap)) && 5160 !hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) { 5161 sfmmu_hblk_hash_rm(hmebp, hmeblkp, prevpa, 5162 pr_hblk); 5163 sfmmu_hblk_free(hmebp, hmeblkp, hblkpa, &list); 5164 } 5165 SFMMU_HASH_UNLOCK(hmebp); 5166 5167 /* 5168 * Notify our caller as to exactly which pages 5169 * have been unloaded. We do these in clumps, 5170 * to minimize the number of xt_sync()s that need to occur. 5171 */ 5172 if (callback != NULL && addr_count == MAX_CB_ADDR) { 5173 DEMAP_RANGE_FLUSH(dmrp); 5174 if (dmrp != NULL) { 5175 cpuset = sfmmup->sfmmu_cpusran; 5176 xt_sync(cpuset); 5177 } 5178 5179 for (a = 0; a < MAX_CB_ADDR; ++a) { 5180 callback->hcb_start_addr = cb_start_addr[a]; 5181 callback->hcb_end_addr = cb_end_addr[a]; 5182 callback->hcb_function(callback); 5183 } 5184 addr_count = 0; 5185 } 5186 if (iskernel) { 5187 hashno = TTE64K; 5188 continue; 5189 } 5190 if ((uintptr_t)addr & MMU_PAGEOFFSET512K) { 5191 ASSERT(hashno == TTE64K); 5192 continue; 5193 } 5194 if ((uintptr_t)addr & MMU_PAGEOFFSET4M) { 5195 hashno = TTE512K; 5196 continue; 5197 } 5198 if (mmu_page_sizes == max_mmu_page_sizes) { 5199 if ((uintptr_t)addr & MMU_PAGEOFFSET32M) { 5200 hashno = TTE4M; 5201 continue; 5202 } 5203 if ((uintptr_t)addr & MMU_PAGEOFFSET256M) { 5204 hashno = TTE32M; 5205 continue; 5206 } 5207 hashno = TTE256M; 5208 } else { 5209 hashno = TTE4M; 5210 } 5211 } 5212 5213 sfmmu_hblks_list_purge(&list); 5214 DEMAP_RANGE_FLUSH(dmrp); 5215 if (dmrp != NULL) { 5216 cpuset = sfmmup->sfmmu_cpusran; 5217 xt_sync(cpuset); 5218 } 5219 if (callback && addr_count != 0) { 5220 for (a = 0; a < addr_count; ++a) { 5221 callback->hcb_start_addr = cb_start_addr[a]; 5222 callback->hcb_end_addr = cb_end_addr[a]; 5223 callback->hcb_function(callback); 5224 } 5225 } 5226 5227 /* 5228 * Check TSB and TLB page sizes if the process isn't exiting. 5229 */ 5230 if (!sfmmup->sfmmu_free) 5231 sfmmu_check_page_sizes(sfmmup, 0); 5232 } 5233 5234 /* 5235 * Unload all the mappings in the range [addr..addr+len). addr and len must 5236 * be MMU_PAGESIZE aligned. 5237 */ 5238 void 5239 hat_unload(struct hat *sfmmup, caddr_t addr, size_t len, uint_t flags) 5240 { 5241 if (sfmmup->sfmmu_xhat_provider) { 5242 XHAT_UNLOAD(sfmmup, addr, len, flags); 5243 return; 5244 } 5245 hat_unload_callback(sfmmup, addr, len, flags, NULL); 5246 } 5247 5248 5249 /* 5250 * Find the largest mapping size for this page. 5251 */ 5252 static int 5253 fnd_mapping_sz(page_t *pp) 5254 { 5255 int sz; 5256 int p_index; 5257 5258 p_index = PP_MAPINDEX(pp); 5259 5260 sz = 0; 5261 p_index >>= 1; /* don't care about 8K bit */ 5262 for (; p_index; p_index >>= 1) { 5263 sz++; 5264 } 5265 5266 return (sz); 5267 } 5268 5269 /* 5270 * This function unloads a range of addresses for an hmeblk. 5271 * It returns the next address to be unloaded. 5272 * It should be called with the hash lock held. 
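 * The address returned never extends past endaddr or the end of this
 * hmeblk, so the caller can continue its hash walk from the returned
 * value.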
5273 */ 5274 static caddr_t 5275 sfmmu_hblk_unload(struct hat *sfmmup, struct hme_blk *hmeblkp, caddr_t addr, 5276 caddr_t endaddr, demap_range_t *dmrp, uint_t flags) 5277 { 5278 tte_t tte, ttemod; 5279 struct sf_hment *sfhmep; 5280 int ttesz; 5281 long ttecnt; 5282 page_t *pp; 5283 kmutex_t *pml; 5284 int ret; 5285 int use_demap_range; 5286 5287 ASSERT(in_hblk_range(hmeblkp, addr)); 5288 ASSERT(!hmeblkp->hblk_shw_bit); 5289 #ifdef DEBUG 5290 if (get_hblk_ttesz(hmeblkp) != TTE8K && 5291 (endaddr < get_hblk_endaddr(hmeblkp))) { 5292 panic("sfmmu_hblk_unload: partial unload of large page"); 5293 } 5294 #endif /* DEBUG */ 5295 5296 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp)); 5297 ttesz = get_hblk_ttesz(hmeblkp); 5298 5299 use_demap_range = (do_virtual_coloring && 5300 ((dmrp == NULL) || TTEBYTES(ttesz) == DEMAP_RANGE_PGSZ(dmrp))); 5301 if (use_demap_range) { 5302 DEMAP_RANGE_CONTINUE(dmrp, addr, endaddr); 5303 } else { 5304 DEMAP_RANGE_FLUSH(dmrp); 5305 } 5306 ttecnt = 0; 5307 HBLKTOHME(sfhmep, hmeblkp, addr); 5308 5309 while (addr < endaddr) { 5310 pml = NULL; 5311 again: 5312 sfmmu_copytte(&sfhmep->hme_tte, &tte); 5313 if (TTE_IS_VALID(&tte)) { 5314 pp = sfhmep->hme_page; 5315 if (pp && pml == NULL) { 5316 pml = sfmmu_mlist_enter(pp); 5317 } 5318 5319 /* 5320 * Verify if hme still points to 'pp' now that 5321 * we have p_mapping lock. 5322 */ 5323 if (sfhmep->hme_page != pp) { 5324 if (pp != NULL && sfhmep->hme_page != NULL) { 5325 if (pml) { 5326 sfmmu_mlist_exit(pml); 5327 } 5328 /* Re-start this iteration. */ 5329 continue; 5330 } 5331 ASSERT((pp != NULL) && 5332 (sfhmep->hme_page == NULL)); 5333 goto tte_unloaded; 5334 } 5335 5336 /* 5337 * This point on we have both HASH and p_mapping 5338 * lock. 5339 */ 5340 ASSERT(pp == sfhmep->hme_page); 5341 ASSERT(pp == NULL || sfmmu_mlist_held(pp)); 5342 5343 /* 5344 * We need to loop on modify tte because it is 5345 * possible for pagesync to come along and 5346 * change the software bits beneath us. 5347 * 5348 * Page_unload can also invalidate the tte after 5349 * we read tte outside of p_mapping lock. 5350 */ 5351 ttemod = tte; 5352 5353 TTE_SET_INVALID(&ttemod); 5354 ret = sfmmu_modifytte_try(&tte, &ttemod, 5355 &sfhmep->hme_tte); 5356 5357 if (ret <= 0) { 5358 if (TTE_IS_VALID(&tte)) { 5359 goto again; 5360 } else { 5361 /* 5362 * We read in a valid pte, but it 5363 * is unloaded by page_unload. 5364 * hme_page has become NULL and 5365 * we hold no p_mapping lock. 5366 */ 5367 ASSERT(pp == NULL && pml == NULL); 5368 goto tte_unloaded; 5369 } 5370 } 5371 5372 if (!(flags & HAT_UNLOAD_NOSYNC)) { 5373 sfmmu_ttesync(sfmmup, addr, &tte, pp); 5374 } 5375 5376 /* 5377 * Ok- we invalidated the tte. Do the rest of the job. 5378 */ 5379 ttecnt++; 5380 5381 if (flags & HAT_UNLOAD_UNLOCK) { 5382 ASSERT(hmeblkp->hblk_lckcnt > 0); 5383 atomic_add_16(&hmeblkp->hblk_lckcnt, -1); 5384 HBLK_STACK_TRACE(hmeblkp, HBLK_UNLOCK); 5385 } 5386 5387 /* 5388 * Normally we would need to flush the page 5389 * from the virtual cache at this point in 5390 * order to prevent a potential cache alias 5391 * inconsistency. 5392 * The particular scenario we need to worry 5393 * about is: 5394 * Given: va1 and va2 are two virtual address 5395 * that alias and map the same physical 5396 * address. 5397 * 1. mapping exists from va1 to pa and data 5398 * has been read into the cache. 5399 * 2. unload va1. 5400 * 3. load va2 and modify data using va2. 5401 * 4 unload va2. 5402 * 5. load va1 and reference data. 
Unless we 5403 * flush the data cache when we unload we will 5404 * get stale data. 5405 * Fortunately, page coloring eliminates the 5406 * above scenario by remembering the color a 5407 * physical page was last or is currently 5408 * mapped to. Now, we delay the flush until 5409 * the loading of translations. Only when the 5410 * new translation is of a different color 5411 * are we forced to flush. 5412 */ 5413 if (use_demap_range) { 5414 /* 5415 * Mark this page as needing a demap. 5416 */ 5417 DEMAP_RANGE_MARKPG(dmrp, addr); 5418 } else { 5419 if (do_virtual_coloring) { 5420 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 5421 sfmmup->sfmmu_free, 0); 5422 } else { 5423 pfn_t pfnum; 5424 5425 pfnum = TTE_TO_PFN(addr, &tte); 5426 sfmmu_tlbcache_demap(addr, sfmmup, 5427 hmeblkp, pfnum, sfmmup->sfmmu_free, 5428 FLUSH_NECESSARY_CPUS, 5429 CACHE_FLUSH, 0); 5430 } 5431 } 5432 5433 if (pp) { 5434 /* 5435 * Remove the hment from the mapping list 5436 */ 5437 ASSERT(hmeblkp->hblk_hmecnt > 0); 5438 5439 /* 5440 * Again, we cannot 5441 * ASSERT(hmeblkp->hblk_hmecnt <= NHMENTS); 5442 */ 5443 HME_SUB(sfhmep, pp); 5444 membar_stst(); 5445 atomic_add_16(&hmeblkp->hblk_hmecnt, -1); 5446 } 5447 5448 ASSERT(hmeblkp->hblk_vcnt > 0); 5449 atomic_add_16(&hmeblkp->hblk_vcnt, -1); 5450 5451 ASSERT(hmeblkp->hblk_hmecnt || hmeblkp->hblk_vcnt || 5452 !hmeblkp->hblk_lckcnt); 5453 5454 if (pp && (pp->p_nrm & (P_KPMC | P_KPMS | P_TNC))) { 5455 if (PP_ISTNC(pp)) { 5456 /* 5457 * If page was temporary 5458 * uncached, try to recache 5459 * it. Note that HME_SUB() was 5460 * called above so p_index and 5461 * mlist had been updated. 5462 */ 5463 conv_tnc(pp, ttesz); 5464 } else if (pp->p_mapping == NULL) { 5465 ASSERT(kpm_enable); 5466 /* 5467 * Page is marked to be in VAC conflict 5468 * to an existing kpm mapping and/or is 5469 * kpm mapped using only the regular 5470 * pagesize. 5471 */ 5472 sfmmu_kpm_hme_unload(pp); 5473 } 5474 } 5475 } else if ((pp = sfhmep->hme_page) != NULL) { 5476 /* 5477 * TTE is invalid but the hme 5478 * still exists. let pageunload 5479 * complete its job. 5480 */ 5481 ASSERT(pml == NULL); 5482 pml = sfmmu_mlist_enter(pp); 5483 if (sfhmep->hme_page != NULL) { 5484 sfmmu_mlist_exit(pml); 5485 pml = NULL; 5486 goto again; 5487 } 5488 ASSERT(sfhmep->hme_page == NULL); 5489 } else if (hmeblkp->hblk_hmecnt != 0) { 5490 /* 5491 * pageunload may have not finished decrementing 5492 * hblk_vcnt and hblk_hmecnt. Find page_t if any and 5493 * wait for pageunload to finish. Rely on pageunload 5494 * to decrement hblk_hmecnt after hblk_vcnt. 5495 */ 5496 pfn_t pfn = TTE_TO_TTEPFN(&tte); 5497 ASSERT(pml == NULL); 5498 if (pf_is_memory(pfn)) { 5499 pp = page_numtopp_nolock(pfn); 5500 if (pp != NULL) { 5501 pml = sfmmu_mlist_enter(pp); 5502 sfmmu_mlist_exit(pml); 5503 pml = NULL; 5504 } 5505 } 5506 } 5507 5508 tte_unloaded: 5509 /* 5510 * At this point, the tte we are looking at 5511 * should be unloaded, and hme has been unlinked 5512 * from page too. This is important because in 5513 * pageunload, it does ttesync() then HME_SUB. 5514 * We need to make sure HME_SUB has been completed 5515 * so we know ttesync() has been completed. Otherwise, 5516 * at exit time, after return from hat layer, VM will 5517 * release as structure which hat_setstat() (called 5518 * by ttesync()) needs. 
5519 */ 5520 #ifdef DEBUG 5521 { 5522 tte_t dtte; 5523 5524 ASSERT(sfhmep->hme_page == NULL); 5525 5526 sfmmu_copytte(&sfhmep->hme_tte, &dtte); 5527 ASSERT(!TTE_IS_VALID(&dtte)); 5528 } 5529 #endif 5530 5531 if (pml) { 5532 sfmmu_mlist_exit(pml); 5533 } 5534 5535 addr += TTEBYTES(ttesz); 5536 sfhmep++; 5537 DEMAP_RANGE_NEXTPG(dmrp); 5538 } 5539 if (ttecnt > 0) 5540 atomic_add_long(&sfmmup->sfmmu_ttecnt[ttesz], -ttecnt); 5541 return (addr); 5542 } 5543 5544 /* 5545 * Synchronize all the mappings in the range [addr..addr+len). 5546 * Can be called with clearflag having two states: 5547 * HAT_SYNC_DONTZERO means just return the rm stats 5548 * HAT_SYNC_ZERORM means zero rm bits in the tte and return the stats 5549 */ 5550 void 5551 hat_sync(struct hat *sfmmup, caddr_t addr, size_t len, uint_t clearflag) 5552 { 5553 struct hmehash_bucket *hmebp; 5554 hmeblk_tag hblktag; 5555 int hmeshift, hashno = 1; 5556 struct hme_blk *hmeblkp, *list = NULL; 5557 caddr_t endaddr; 5558 cpuset_t cpuset; 5559 5560 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 5561 ASSERT((sfmmup == ksfmmup) || 5562 AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 5563 ASSERT((len & MMU_PAGEOFFSET) == 0); 5564 ASSERT((clearflag == HAT_SYNC_DONTZERO) || 5565 (clearflag == HAT_SYNC_ZERORM)); 5566 5567 CPUSET_ZERO(cpuset); 5568 5569 endaddr = addr + len; 5570 hblktag.htag_id = sfmmup; 5571 /* 5572 * Spitfire supports 4 page sizes. 5573 * Most pages are expected to be of the smallest page 5574 * size (8K) and these will not need to be rehashed. 64K 5575 * pages also don't need to be rehashed because the an hmeblk 5576 * spans 64K of address space. 512K pages might need 1 rehash and 5577 * and 4M pages 2 rehashes. 5578 */ 5579 while (addr < endaddr) { 5580 hmeshift = HME_HASH_SHIFT(hashno); 5581 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 5582 hblktag.htag_rehash = hashno; 5583 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift); 5584 5585 SFMMU_HASH_LOCK(hmebp); 5586 5587 HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list); 5588 if (hmeblkp != NULL) { 5589 /* 5590 * We've encountered a shadow hmeblk so skip the range 5591 * of the next smaller mapping size. 5592 */ 5593 if (hmeblkp->hblk_shw_bit) { 5594 ASSERT(sfmmup != ksfmmup); 5595 ASSERT(hashno > 1); 5596 addr = (caddr_t)P2END((uintptr_t)addr, 5597 TTEBYTES(hashno - 1)); 5598 } else { 5599 addr = sfmmu_hblk_sync(sfmmup, hmeblkp, 5600 addr, endaddr, clearflag); 5601 } 5602 SFMMU_HASH_UNLOCK(hmebp); 5603 hashno = 1; 5604 continue; 5605 } 5606 SFMMU_HASH_UNLOCK(hmebp); 5607 5608 if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) { 5609 /* 5610 * We have traversed the whole list and rehashed 5611 * if necessary without finding the address to sync. 5612 * This is ok so we increment the address by the 5613 * smallest hmeblk range for kernel mappings and the 5614 * largest hmeblk range, to account for shadow hmeblks, 5615 * for user mappings and continue. 
5616 */ 5617 if (sfmmup == ksfmmup) 5618 addr = (caddr_t)P2END((uintptr_t)addr, 5619 TTEBYTES(1)); 5620 else 5621 addr = (caddr_t)P2END((uintptr_t)addr, 5622 TTEBYTES(hashno)); 5623 hashno = 1; 5624 } else { 5625 hashno++; 5626 } 5627 } 5628 sfmmu_hblks_list_purge(&list); 5629 cpuset = sfmmup->sfmmu_cpusran; 5630 xt_sync(cpuset); 5631 } 5632 5633 static caddr_t 5634 sfmmu_hblk_sync(struct hat *sfmmup, struct hme_blk *hmeblkp, caddr_t addr, 5635 caddr_t endaddr, int clearflag) 5636 { 5637 tte_t tte, ttemod; 5638 struct sf_hment *sfhmep; 5639 int ttesz; 5640 struct page *pp; 5641 kmutex_t *pml; 5642 int ret; 5643 5644 ASSERT(hmeblkp->hblk_shw_bit == 0); 5645 5646 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp)); 5647 5648 ttesz = get_hblk_ttesz(hmeblkp); 5649 HBLKTOHME(sfhmep, hmeblkp, addr); 5650 5651 while (addr < endaddr) { 5652 sfmmu_copytte(&sfhmep->hme_tte, &tte); 5653 if (TTE_IS_VALID(&tte)) { 5654 pml = NULL; 5655 pp = sfhmep->hme_page; 5656 if (pp) { 5657 pml = sfmmu_mlist_enter(pp); 5658 } 5659 if (pp != sfhmep->hme_page) { 5660 /* 5661 * tte most have been unloaded 5662 * underneath us. Recheck 5663 */ 5664 ASSERT(pml); 5665 sfmmu_mlist_exit(pml); 5666 continue; 5667 } 5668 5669 ASSERT(pp == NULL || sfmmu_mlist_held(pp)); 5670 5671 if (clearflag == HAT_SYNC_ZERORM) { 5672 ttemod = tte; 5673 TTE_CLR_RM(&ttemod); 5674 ret = sfmmu_modifytte_try(&tte, &ttemod, 5675 &sfhmep->hme_tte); 5676 if (ret < 0) { 5677 if (pml) { 5678 sfmmu_mlist_exit(pml); 5679 } 5680 continue; 5681 } 5682 5683 if (ret > 0) { 5684 sfmmu_tlb_demap(addr, sfmmup, 5685 hmeblkp, 0, 0); 5686 } 5687 } 5688 sfmmu_ttesync(sfmmup, addr, &tte, pp); 5689 if (pml) { 5690 sfmmu_mlist_exit(pml); 5691 } 5692 } 5693 addr += TTEBYTES(ttesz); 5694 sfhmep++; 5695 } 5696 return (addr); 5697 } 5698 5699 /* 5700 * This function will sync a tte to the page struct and it will 5701 * update the hat stats. Currently it allows us to pass a NULL pp 5702 * and we will simply update the stats. We may want to change this 5703 * so we only keep stats for pages backed by pp's. 5704 */ 5705 static void 5706 sfmmu_ttesync(struct hat *sfmmup, caddr_t addr, tte_t *ttep, page_t *pp) 5707 { 5708 uint_t rm = 0; 5709 int sz; 5710 pgcnt_t npgs; 5711 5712 ASSERT(TTE_IS_VALID(ttep)); 5713 5714 if (TTE_IS_NOSYNC(ttep)) { 5715 return; 5716 } 5717 5718 if (TTE_IS_REF(ttep)) { 5719 rm = P_REF; 5720 } 5721 if (TTE_IS_MOD(ttep)) { 5722 rm |= P_MOD; 5723 } 5724 5725 if (rm == 0) { 5726 return; 5727 } 5728 5729 sz = TTE_CSZ(ttep); 5730 if (sfmmup->sfmmu_rmstat) { 5731 int i; 5732 caddr_t vaddr = addr; 5733 5734 for (i = 0; i < TTEPAGES(sz); i++, vaddr += MMU_PAGESIZE) { 5735 hat_setstat(sfmmup->sfmmu_as, vaddr, MMU_PAGESIZE, rm); 5736 } 5737 5738 } 5739 5740 /* 5741 * XXX I want to use cas to update nrm bits but they 5742 * currently belong in common/vm and not in hat where 5743 * they should be. 5744 * The nrm bits are protected by the same mutex as 5745 * the one that protects the page's mapping list. 5746 */ 5747 if (!pp) 5748 return; 5749 ASSERT(sfmmu_mlist_held(pp)); 5750 /* 5751 * If the tte is for a large page, we need to sync all the 5752 * pages covered by the tte. 5753 */ 5754 if (sz != TTE8K) { 5755 ASSERT(pp->p_szc != 0); 5756 pp = PP_GROUPLEADER(pp, sz); 5757 ASSERT(sfmmu_mlist_held(pp)); 5758 } 5759 5760 /* Get number of pages from tte size. 
*/ 5761 npgs = TTEPAGES(sz); 5762 5763 do { 5764 ASSERT(pp); 5765 ASSERT(sfmmu_mlist_held(pp)); 5766 if (((rm & P_REF) != 0 && !PP_ISREF(pp)) || 5767 ((rm & P_MOD) != 0 && !PP_ISMOD(pp))) 5768 hat_page_setattr(pp, rm); 5769 5770 /* 5771 * Are we done? If not, we must have a large mapping. 5772 * For large mappings we need to sync the rest of the pages 5773 * covered by this tte; goto the next page. 5774 */ 5775 } while (--npgs > 0 && (pp = PP_PAGENEXT(pp))); 5776 } 5777 5778 /* 5779 * Execute pre-callback handler of each pa_hment linked to pp 5780 * 5781 * Inputs: 5782 * flag: either HAT_PRESUSPEND or HAT_SUSPEND. 5783 * capture_cpus: pointer to return value (below) 5784 * 5785 * Returns: 5786 * Propagates the subsystem callback return values back to the caller; 5787 * returns 0 on success. If capture_cpus is non-NULL, the value returned 5788 * is zero if all of the pa_hments are of a type that do not require 5789 * capturing CPUs prior to suspending the mapping, else it is 1. 5790 */ 5791 static int 5792 hat_pageprocess_precallbacks(struct page *pp, uint_t flag, int *capture_cpus) 5793 { 5794 struct sf_hment *sfhmep; 5795 struct pa_hment *pahmep; 5796 int (*f)(caddr_t, uint_t, uint_t, void *); 5797 int ret; 5798 id_t id; 5799 int locked = 0; 5800 kmutex_t *pml; 5801 5802 ASSERT(PAGE_EXCL(pp)); 5803 if (!sfmmu_mlist_held(pp)) { 5804 pml = sfmmu_mlist_enter(pp); 5805 locked = 1; 5806 } 5807 5808 if (capture_cpus) 5809 *capture_cpus = 0; 5810 5811 top: 5812 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) { 5813 /* 5814 * skip sf_hments corresponding to VA<->PA mappings; 5815 * for pa_hment's, hme_tte.ll is zero 5816 */ 5817 if (!IS_PAHME(sfhmep)) 5818 continue; 5819 5820 pahmep = sfhmep->hme_data; 5821 ASSERT(pahmep != NULL); 5822 5823 /* 5824 * skip if pre-handler has been called earlier in this loop 5825 */ 5826 if (pahmep->flags & flag) 5827 continue; 5828 5829 id = pahmep->cb_id; 5830 ASSERT(id >= (id_t)0 && id < sfmmu_cb_nextid); 5831 if (capture_cpus && sfmmu_cb_table[id].capture_cpus != 0) 5832 *capture_cpus = 1; 5833 if ((f = sfmmu_cb_table[id].prehandler) == NULL) { 5834 pahmep->flags |= flag; 5835 continue; 5836 } 5837 5838 /* 5839 * Drop the mapping list lock to avoid locking order issues. 5840 */ 5841 if (locked) 5842 sfmmu_mlist_exit(pml); 5843 5844 ret = f(pahmep->addr, pahmep->len, flag, pahmep->pvt); 5845 if (ret != 0) 5846 return (ret); /* caller must do the cleanup */ 5847 5848 if (locked) { 5849 pml = sfmmu_mlist_enter(pp); 5850 pahmep->flags |= flag; 5851 goto top; 5852 } 5853 5854 pahmep->flags |= flag; 5855 } 5856 5857 if (locked) 5858 sfmmu_mlist_exit(pml); 5859 5860 return (0); 5861 } 5862 5863 /* 5864 * Execute post-callback handler of each pa_hment linked to pp 5865 * 5866 * Same overall assumptions and restrictions apply as for 5867 * hat_pageprocess_precallbacks(). 
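 * In addition, each post-handler is passed the new pfn of the
 * constituent page, reconstructed below from the page's base pfn and
 * the page offset of the callback address within the large page.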
5868 */ 5869 static void 5870 hat_pageprocess_postcallbacks(struct page *pp, uint_t flag) 5871 { 5872 pfn_t pgpfn = pp->p_pagenum; 5873 pfn_t pgmask = btop(page_get_pagesize(pp->p_szc)) - 1; 5874 pfn_t newpfn; 5875 struct sf_hment *sfhmep; 5876 struct pa_hment *pahmep; 5877 int (*f)(caddr_t, uint_t, uint_t, void *, pfn_t); 5878 id_t id; 5879 int locked = 0; 5880 kmutex_t *pml; 5881 5882 ASSERT(PAGE_EXCL(pp)); 5883 if (!sfmmu_mlist_held(pp)) { 5884 pml = sfmmu_mlist_enter(pp); 5885 locked = 1; 5886 } 5887 5888 top: 5889 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) { 5890 /* 5891 * skip sf_hments corresponding to VA<->PA mappings; 5892 * for pa_hment's, hme_tte.ll is zero 5893 */ 5894 if (!IS_PAHME(sfhmep)) 5895 continue; 5896 5897 pahmep = sfhmep->hme_data; 5898 ASSERT(pahmep != NULL); 5899 5900 if ((pahmep->flags & flag) == 0) 5901 continue; 5902 5903 pahmep->flags &= ~flag; 5904 5905 id = pahmep->cb_id; 5906 ASSERT(id >= (id_t)0 && id < sfmmu_cb_nextid); 5907 if ((f = sfmmu_cb_table[id].posthandler) == NULL) 5908 continue; 5909 5910 /* 5911 * Convert the base page PFN into the constituent PFN 5912 * which is needed by the callback handler. 5913 */ 5914 newpfn = pgpfn | (btop((uintptr_t)pahmep->addr) & pgmask); 5915 5916 /* 5917 * Drop the mapping list lock to avoid locking order issues. 5918 */ 5919 if (locked) 5920 sfmmu_mlist_exit(pml); 5921 5922 if (f(pahmep->addr, pahmep->len, flag, pahmep->pvt, newpfn) 5923 != 0) 5924 panic("sfmmu: posthandler failed"); 5925 5926 if (locked) { 5927 pml = sfmmu_mlist_enter(pp); 5928 goto top; 5929 } 5930 } 5931 5932 if (locked) 5933 sfmmu_mlist_exit(pml); 5934 } 5935 5936 /* 5937 * Suspend locked kernel mapping 5938 */ 5939 void 5940 hat_pagesuspend(struct page *pp) 5941 { 5942 struct sf_hment *sfhmep; 5943 sfmmu_t *sfmmup; 5944 tte_t tte, ttemod; 5945 struct hme_blk *hmeblkp; 5946 caddr_t addr; 5947 int index, cons; 5948 cpuset_t cpuset; 5949 5950 ASSERT(PAGE_EXCL(pp)); 5951 ASSERT(sfmmu_mlist_held(pp)); 5952 5953 mutex_enter(&kpr_suspendlock); 5954 5955 /* 5956 * Call into dtrace to tell it we're about to suspend a 5957 * kernel mapping. This prevents us from running into issues 5958 * with probe context trying to touch a suspended page 5959 * in the relocation codepath itself. 5960 */ 5961 if (dtrace_kreloc_init) 5962 (*dtrace_kreloc_init)(); 5963 5964 index = PP_MAPINDEX(pp); 5965 cons = TTE8K; 5966 5967 retry: 5968 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) { 5969 5970 if (IS_PAHME(sfhmep)) 5971 continue; 5972 5973 if (get_hblk_ttesz(sfmmu_hmetohblk(sfhmep)) != cons) 5974 continue; 5975 5976 /* 5977 * Loop until we successfully set the suspend bit in 5978 * the TTE. 5979 */ 5980 again: 5981 sfmmu_copytte(&sfhmep->hme_tte, &tte); 5982 ASSERT(TTE_IS_VALID(&tte)); 5983 5984 ttemod = tte; 5985 TTE_SET_SUSPEND(&ttemod); 5986 if (sfmmu_modifytte_try(&tte, &ttemod, 5987 &sfhmep->hme_tte) < 0) 5988 goto again; 5989 5990 /* 5991 * Invalidate TSB entry 5992 */ 5993 hmeblkp = sfmmu_hmetohblk(sfhmep); 5994 5995 sfmmup = hblktosfmmu(hmeblkp); 5996 ASSERT(sfmmup == ksfmmup); 5997 5998 addr = tte_to_vaddr(hmeblkp, tte); 5999 6000 /* 6001 * No need to make sure that the TSB for this sfmmu is 6002 * not being relocated since it is ksfmmup and thus it 6003 * will never be relocated. 
6004 */ 6005 SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp); 6006 6007 /* 6008 * Update xcall stats 6009 */ 6010 cpuset = cpu_ready_set; 6011 CPUSET_DEL(cpuset, CPU->cpu_id); 6012 6013 /* LINTED: constant in conditional context */ 6014 SFMMU_XCALL_STATS(ksfmmup); 6015 6016 /* 6017 * Flush TLB entry on remote CPU's 6018 */ 6019 xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)addr, 6020 (uint64_t)ksfmmup); 6021 xt_sync(cpuset); 6022 6023 /* 6024 * Flush TLB entry on local CPU 6025 */ 6026 vtag_flushpage(addr, (uint64_t)ksfmmup); 6027 } 6028 6029 while (index != 0) { 6030 index = index >> 1; 6031 if (index != 0) 6032 cons++; 6033 if (index & 0x1) { 6034 pp = PP_GROUPLEADER(pp, cons); 6035 goto retry; 6036 } 6037 } 6038 } 6039 6040 #ifdef DEBUG 6041 6042 #define N_PRLE 1024 6043 struct prle { 6044 page_t *targ; 6045 page_t *repl; 6046 int status; 6047 int pausecpus; 6048 hrtime_t whence; 6049 }; 6050 6051 static struct prle page_relocate_log[N_PRLE]; 6052 static int prl_entry; 6053 static kmutex_t prl_mutex; 6054 6055 #define PAGE_RELOCATE_LOG(t, r, s, p) \ 6056 mutex_enter(&prl_mutex); \ 6057 page_relocate_log[prl_entry].targ = *(t); \ 6058 page_relocate_log[prl_entry].repl = *(r); \ 6059 page_relocate_log[prl_entry].status = (s); \ 6060 page_relocate_log[prl_entry].pausecpus = (p); \ 6061 page_relocate_log[prl_entry].whence = gethrtime(); \ 6062 prl_entry = (prl_entry == (N_PRLE - 1))? 0 : prl_entry + 1; \ 6063 mutex_exit(&prl_mutex); 6064 6065 #else /* !DEBUG */ 6066 #define PAGE_RELOCATE_LOG(t, r, s, p) 6067 #endif 6068 6069 /* 6070 * Core Kernel Page Relocation Algorithm 6071 * 6072 * Input: 6073 * 6074 * target : constituent pages are SE_EXCL locked. 6075 * replacement: constituent pages are SE_EXCL locked. 6076 * 6077 * Output: 6078 * 6079 * nrelocp: number of pages relocated 6080 */ 6081 int 6082 hat_page_relocate(page_t **target, page_t **replacement, spgcnt_t *nrelocp) 6083 { 6084 page_t *targ, *repl; 6085 page_t *tpp, *rpp; 6086 kmutex_t *low, *high; 6087 spgcnt_t npages, i; 6088 page_t *pl = NULL; 6089 int old_pil; 6090 cpuset_t cpuset; 6091 int cap_cpus; 6092 int ret; 6093 6094 if (hat_kpr_enabled == 0 || !kcage_on || PP_ISNORELOC(*target)) { 6095 PAGE_RELOCATE_LOG(target, replacement, EAGAIN, -1); 6096 return (EAGAIN); 6097 } 6098 6099 mutex_enter(&kpr_mutex); 6100 kreloc_thread = curthread; 6101 6102 targ = *target; 6103 repl = *replacement; 6104 ASSERT(repl != NULL); 6105 ASSERT(targ->p_szc == repl->p_szc); 6106 6107 npages = page_get_pagecnt(targ->p_szc); 6108 6109 /* 6110 * unload VA<->PA mappings that are not locked 6111 */ 6112 tpp = targ; 6113 for (i = 0; i < npages; i++) { 6114 (void) hat_pageunload(tpp, SFMMU_KERNEL_RELOC); 6115 tpp++; 6116 } 6117 6118 /* 6119 * Do "presuspend" callbacks, in a context from which we can still 6120 * block as needed. Note that we don't hold the mapping list lock 6121 * of "targ" at this point due to potential locking order issues; 6122 * we assume that between the hat_pageunload() above and holding 6123 * the SE_EXCL lock that the mapping list *cannot* change at this 6124 * point. 6125 */ 6126 ret = hat_pageprocess_precallbacks(targ, HAT_PRESUSPEND, &cap_cpus); 6127 if (ret != 0) { 6128 /* 6129 * EIO translates to fatal error, for all others cleanup 6130 * and return EAGAIN. 
6131 */ 6132 ASSERT(ret != EIO); 6133 hat_pageprocess_postcallbacks(targ, HAT_POSTUNSUSPEND); 6134 PAGE_RELOCATE_LOG(target, replacement, ret, -1); 6135 kreloc_thread = NULL; 6136 mutex_exit(&kpr_mutex); 6137 return (EAGAIN); 6138 } 6139 6140 /* 6141 * acquire p_mapping list lock for both the target and replacement 6142 * root pages. 6143 * 6144 * low and high refer to the need to grab the mlist locks in a 6145 * specific order in order to prevent race conditions. Thus the 6146 * lower lock must be grabbed before the higher lock. 6147 * 6148 * This will block hat_unload's accessing p_mapping list. Since 6149 * we have SE_EXCL lock, hat_memload and hat_pageunload will be 6150 * blocked. Thus, no one else will be accessing the p_mapping list 6151 * while we suspend and reload the locked mapping below. 6152 */ 6153 tpp = targ; 6154 rpp = repl; 6155 sfmmu_mlist_reloc_enter(tpp, rpp, &low, &high); 6156 6157 kpreempt_disable(); 6158 6159 /* 6160 * If the replacement page is of a different virtual color 6161 * than the page it is replacing, we need to handle the VAC 6162 * consistency for it just as we would if we were setting up 6163 * a new mapping to a page. 6164 */ 6165 if ((tpp->p_szc == 0) && (PP_GET_VCOLOR(rpp) != NO_VCOLOR)) { 6166 if (tpp->p_vcolor != rpp->p_vcolor) { 6167 sfmmu_cache_flushcolor(PP_GET_VCOLOR(rpp), 6168 rpp->p_pagenum); 6169 } 6170 } 6171 6172 /* 6173 * We raise our PIL to 13 so that we don't get captured by 6174 * another CPU or pinned by an interrupt thread. We can't go to 6175 * PIL 14 since the nexus driver(s) may need to interrupt at 6176 * that level in the case of IOMMU pseudo mappings. 6177 */ 6178 cpuset = cpu_ready_set; 6179 CPUSET_DEL(cpuset, CPU->cpu_id); 6180 if (!cap_cpus || CPUSET_ISNULL(cpuset)) { 6181 old_pil = splr(XCALL_PIL); 6182 } else { 6183 old_pil = -1; 6184 xc_attention(cpuset); 6185 } 6186 ASSERT(getpil() == XCALL_PIL); 6187 6188 /* 6189 * Now do suspend callbacks. In the case of an IOMMU mapping 6190 * this will suspend all DMA activity to the page while it is 6191 * being relocated. Since we are well above LOCK_LEVEL and CPUs 6192 * may be captured at this point we should have acquired any needed 6193 * locks in the presuspend callback. 6194 */ 6195 ret = hat_pageprocess_precallbacks(targ, HAT_SUSPEND, NULL); 6196 if (ret != 0) { 6197 repl = targ; 6198 goto suspend_fail; 6199 } 6200 6201 /* 6202 * Raise the PIL yet again, this time to block all high-level 6203 * interrupts on this CPU. This is necessary to prevent an 6204 * interrupt routine from pinning the thread which holds the 6205 * mapping suspended and then touching the suspended page. 6206 * 6207 * Once the page is suspended we also need to be careful to 6208 * avoid calling any functions which touch any seg_kmem memory 6209 * since that memory may be backed by the very page we are 6210 * relocating in here! 6211 */ 6212 hat_pagesuspend(targ); 6213 6214 /* 6215 * Now that we are confident everybody has stopped using this page, 6216 * copy the page contents. Note we use a physical copy to prevent 6217 * locking issues and to avoid fpRAS because we can't handle it in 6218 * this context. 6219 */ 6220 for (i = 0; i < npages; i++, tpp++, rpp++) { 6221 /* 6222 * Copy the contents of the page. 6223 */ 6224 ppcopy_kernel(tpp, rpp); 6225 } 6226 6227 tpp = targ; 6228 rpp = repl; 6229 for (i = 0; i < npages; i++, tpp++, rpp++) { 6230 /* 6231 * Copy attributes. VAC consistency was handled above, 6232 * if required. 
6233 */ 6234 rpp->p_nrm = tpp->p_nrm; 6235 tpp->p_nrm = 0; 6236 rpp->p_index = tpp->p_index; 6237 tpp->p_index = 0; 6238 rpp->p_vcolor = tpp->p_vcolor; 6239 } 6240 6241 /* 6242 * First, unsuspend the page, if we set the suspend bit, and transfer 6243 * the mapping list from the target page to the replacement page. 6244 * Next process postcallbacks; since pa_hment's are linked only to the 6245 * p_mapping list of root page, we don't iterate over the constituent 6246 * pages. 6247 */ 6248 hat_pagereload(targ, repl); 6249 6250 suspend_fail: 6251 hat_pageprocess_postcallbacks(repl, HAT_UNSUSPEND); 6252 6253 /* 6254 * Now lower our PIL and release any captured CPUs since we 6255 * are out of the "danger zone". After this it will again be 6256 * safe to acquire adaptive mutex locks, or to drop them... 6257 */ 6258 if (old_pil != -1) { 6259 splx(old_pil); 6260 } else { 6261 xc_dismissed(cpuset); 6262 } 6263 6264 kpreempt_enable(); 6265 6266 sfmmu_mlist_reloc_exit(low, high); 6267 6268 /* 6269 * Postsuspend callbacks should drop any locks held across 6270 * the suspend callbacks. As before, we don't hold the mapping 6271 * list lock at this point.. our assumption is that the mapping 6272 * list still can't change due to our holding SE_EXCL lock and 6273 * there being no unlocked mappings left. Hence the restriction 6274 * on calling context to hat_delete_callback() 6275 */ 6276 hat_pageprocess_postcallbacks(repl, HAT_POSTUNSUSPEND); 6277 if (ret != 0) { 6278 /* 6279 * The second presuspend call failed: we got here through 6280 * the suspend_fail label above. 6281 */ 6282 ASSERT(ret != EIO); 6283 PAGE_RELOCATE_LOG(target, replacement, ret, cap_cpus); 6284 kreloc_thread = NULL; 6285 mutex_exit(&kpr_mutex); 6286 return (EAGAIN); 6287 } 6288 6289 /* 6290 * Now that we're out of the performance critical section we can 6291 * take care of updating the hash table, since we still 6292 * hold all the pages locked SE_EXCL at this point we 6293 * needn't worry about things changing out from under us. 6294 */ 6295 tpp = targ; 6296 rpp = repl; 6297 for (i = 0; i < npages; i++, tpp++, rpp++) { 6298 6299 /* 6300 * replace targ with replacement in page_hash table 6301 */ 6302 targ = tpp; 6303 page_relocate_hash(rpp, targ); 6304 6305 /* 6306 * concatenate target; caller of platform_page_relocate() 6307 * expects target to be concatenated after returning. 6308 */ 6309 ASSERT(targ->p_next == targ); 6310 ASSERT(targ->p_prev == targ); 6311 page_list_concat(&pl, &targ); 6312 } 6313 6314 ASSERT(*target == pl); 6315 *nrelocp = npages; 6316 PAGE_RELOCATE_LOG(target, replacement, 0, cap_cpus); 6317 kreloc_thread = NULL; 6318 mutex_exit(&kpr_mutex); 6319 return (0); 6320 } 6321 6322 /* 6323 * Called when stray pa_hments are found attached to a page which is 6324 * being freed. Notify the subsystem which attached the pa_hment of 6325 * the error if it registered a suitable handler, else panic. 6326 */ 6327 static void 6328 sfmmu_pahment_leaked(struct pa_hment *pahmep) 6329 { 6330 id_t cb_id = pahmep->cb_id; 6331 6332 ASSERT(cb_id >= (id_t)0 && cb_id < sfmmu_cb_nextid); 6333 if (sfmmu_cb_table[cb_id].errhandler != NULL) { 6334 if (sfmmu_cb_table[cb_id].errhandler(pahmep->addr, pahmep->len, 6335 HAT_CB_ERR_LEAKED, pahmep->pvt) == 0) 6336 return; /* non-fatal */ 6337 } 6338 panic("pa_hment leaked: 0x%p", pahmep); 6339 } 6340 6341 /* 6342 * Remove all mappings to page 'pp'. 
6343 */ 6344 int 6345 hat_pageunload(struct page *pp, uint_t forceflag) 6346 { 6347 struct page *origpp = pp; 6348 struct sf_hment *sfhme, *tmphme; 6349 struct hme_blk *hmeblkp; 6350 kmutex_t *pml, *pmtx; 6351 cpuset_t cpuset, tset; 6352 int index, cons; 6353 int xhme_blks; 6354 int pa_hments; 6355 6356 ASSERT(PAGE_EXCL(pp)); 6357 6358 retry_xhat: 6359 tmphme = NULL; 6360 xhme_blks = 0; 6361 pa_hments = 0; 6362 CPUSET_ZERO(cpuset); 6363 6364 pml = sfmmu_mlist_enter(pp); 6365 6366 if (pp->p_kpmref) 6367 sfmmu_kpm_pageunload(pp); 6368 ASSERT(!PP_ISMAPPED_KPM(pp)); 6369 6370 index = PP_MAPINDEX(pp); 6371 cons = TTE8K; 6372 retry: 6373 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) { 6374 tmphme = sfhme->hme_next; 6375 6376 if (IS_PAHME(sfhme)) { 6377 ASSERT(sfhme->hme_data != NULL); 6378 pa_hments++; 6379 continue; 6380 } 6381 6382 hmeblkp = sfmmu_hmetohblk(sfhme); 6383 if (hmeblkp->hblk_xhat_bit) { 6384 struct xhat_hme_blk *xblk = 6385 (struct xhat_hme_blk *)hmeblkp; 6386 6387 (void) XHAT_PAGEUNLOAD(xblk->xhat_hme_blk_hat, 6388 pp, forceflag, XBLK2PROVBLK(xblk)); 6389 6390 xhme_blks = 1; 6391 continue; 6392 } 6393 6394 /* 6395 * If there are kernel mappings don't unload them, they will 6396 * be suspended. 6397 */ 6398 if (forceflag == SFMMU_KERNEL_RELOC && hmeblkp->hblk_lckcnt && 6399 hmeblkp->hblk_tag.htag_id == ksfmmup) 6400 continue; 6401 6402 tset = sfmmu_pageunload(pp, sfhme, cons); 6403 CPUSET_OR(cpuset, tset); 6404 } 6405 6406 while (index != 0) { 6407 index = index >> 1; 6408 if (index != 0) 6409 cons++; 6410 if (index & 0x1) { 6411 /* Go to leading page */ 6412 pp = PP_GROUPLEADER(pp, cons); 6413 ASSERT(sfmmu_mlist_held(pp)); 6414 goto retry; 6415 } 6416 } 6417 6418 /* 6419 * cpuset may be empty if the page was only mapped by segkpm, 6420 * in which case we won't actually cross-trap. 6421 */ 6422 xt_sync(cpuset); 6423 6424 /* 6425 * The page should have no mappings at this point, unless 6426 * we were called from hat_page_relocate() in which case we 6427 * leave the locked mappings which will be suspended later. 6428 */ 6429 ASSERT(!PP_ISMAPPED(origpp) || xhme_blks || pa_hments || 6430 (forceflag == SFMMU_KERNEL_RELOC)); 6431 6432 if (PP_ISTNC(pp)) { 6433 if (cons == TTE8K) { 6434 pmtx = sfmmu_page_enter(pp); 6435 PP_CLRTNC(pp); 6436 sfmmu_page_exit(pmtx); 6437 } else { 6438 conv_tnc(pp, cons); 6439 } 6440 } 6441 6442 if (pa_hments && forceflag != SFMMU_KERNEL_RELOC) { 6443 /* 6444 * Unlink any pa_hments and free them, calling back 6445 * the responsible subsystem to notify it of the error. 6446 * This can occur in situations such as drivers leaking 6447 * DMA handles: naughty, but common enough that we'd like 6448 * to keep the system running rather than bringing it 6449 * down with an obscure error like "pa_hment leaked" 6450 * which doesn't aid the user in debugging their driver. 6451 */ 6452 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) { 6453 tmphme = sfhme->hme_next; 6454 if (IS_PAHME(sfhme)) { 6455 struct pa_hment *pahmep = sfhme->hme_data; 6456 sfmmu_pahment_leaked(pahmep); 6457 HME_SUB(sfhme, pp); 6458 kmem_cache_free(pa_hment_cache, pahmep); 6459 } 6460 } 6461 6462 ASSERT(!PP_ISMAPPED(origpp) || xhme_blks); 6463 } 6464 6465 sfmmu_mlist_exit(pml); 6466 6467 /* 6468 * XHAT may not have finished unloading pages 6469 * because some other thread was waiting for 6470 * mlist lock and XHAT_PAGEUNLOAD let it do 6471 * the job. 
6472 */ 6473 if (xhme_blks) { 6474 pp = origpp; 6475 goto retry_xhat; 6476 } 6477 6478 return (0); 6479 } 6480 6481 static cpuset_t 6482 sfmmu_pageunload(page_t *pp, struct sf_hment *sfhme, int cons) 6483 { 6484 struct hme_blk *hmeblkp; 6485 sfmmu_t *sfmmup; 6486 tte_t tte, ttemod; 6487 #ifdef DEBUG 6488 tte_t orig_old; 6489 #endif /* DEBUG */ 6490 caddr_t addr; 6491 int ttesz; 6492 int ret; 6493 cpuset_t cpuset; 6494 6495 ASSERT(pp != NULL); 6496 ASSERT(sfmmu_mlist_held(pp)); 6497 ASSERT(pp->p_vnode != &kvp); 6498 6499 CPUSET_ZERO(cpuset); 6500 6501 hmeblkp = sfmmu_hmetohblk(sfhme); 6502 6503 readtte: 6504 sfmmu_copytte(&sfhme->hme_tte, &tte); 6505 if (TTE_IS_VALID(&tte)) { 6506 sfmmup = hblktosfmmu(hmeblkp); 6507 ttesz = get_hblk_ttesz(hmeblkp); 6508 /* 6509 * Only unload mappings of 'cons' size. 6510 */ 6511 if (ttesz != cons) 6512 return (cpuset); 6513 6514 /* 6515 * Note that we have p_mapping lock, but no hash lock here. 6516 * hblk_unload() has to have both hash lock AND p_mapping 6517 * lock before it tries to modify tte. So, the tte could 6518 * not become invalid in the sfmmu_modifytte_try() below. 6519 */ 6520 ttemod = tte; 6521 #ifdef DEBUG 6522 orig_old = tte; 6523 #endif /* DEBUG */ 6524 6525 TTE_SET_INVALID(&ttemod); 6526 ret = sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte); 6527 if (ret < 0) { 6528 #ifdef DEBUG 6529 /* only R/M bits can change. */ 6530 chk_tte(&orig_old, &tte, &ttemod, hmeblkp); 6531 #endif /* DEBUG */ 6532 goto readtte; 6533 } 6534 6535 if (ret == 0) { 6536 panic("pageunload: cas failed?"); 6537 } 6538 6539 addr = tte_to_vaddr(hmeblkp, tte); 6540 6541 sfmmu_ttesync(sfmmup, addr, &tte, pp); 6542 6543 atomic_add_long(&sfmmup->sfmmu_ttecnt[ttesz], -1); 6544 6545 /* 6546 * We need to flush the page from the virtual cache 6547 * in order to prevent a virtual cache alias 6548 * inconsistency. The particular scenario we need 6549 * to worry about is: 6550 * Given: va1 and va2 are two virtual address that 6551 * alias and will map the same physical address. 6552 * 1. mapping exists from va1 to pa and data has 6553 * been read into the cache. 6554 * 2. unload va1. 6555 * 3. load va2 and modify data using va2. 6556 * 4 unload va2. 6557 * 5. load va1 and reference data. Unless we flush 6558 * the data cache when we unload we will get 6559 * stale data. 6560 * This scenario is taken care of by using virtual 6561 * page coloring. 6562 */ 6563 if (sfmmup->sfmmu_ismhat) { 6564 /* 6565 * Flush TSBs, TLBs and caches 6566 * of every process 6567 * sharing this ism segment. 6568 */ 6569 sfmmu_hat_lock_all(); 6570 mutex_enter(&ism_mlist_lock); 6571 kpreempt_disable(); 6572 if (do_virtual_coloring) 6573 sfmmu_ismtlbcache_demap(addr, sfmmup, hmeblkp, 6574 pp->p_pagenum, CACHE_NO_FLUSH); 6575 else 6576 sfmmu_ismtlbcache_demap(addr, sfmmup, hmeblkp, 6577 pp->p_pagenum, CACHE_FLUSH); 6578 kpreempt_enable(); 6579 mutex_exit(&ism_mlist_lock); 6580 sfmmu_hat_unlock_all(); 6581 cpuset = cpu_ready_set; 6582 } else if (do_virtual_coloring) { 6583 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0); 6584 cpuset = sfmmup->sfmmu_cpusran; 6585 } else { 6586 sfmmu_tlbcache_demap(addr, sfmmup, hmeblkp, 6587 pp->p_pagenum, 0, FLUSH_NECESSARY_CPUS, 6588 CACHE_FLUSH, 0); 6589 cpuset = sfmmup->sfmmu_cpusran; 6590 } 6591 6592 /* 6593 * Hme_sub has to run after ttesync() and a_rss update. 6594 * See hblk_unload(). 
6595 */ 6596 HME_SUB(sfhme, pp); 6597 membar_stst(); 6598 6599 /* 6600 * We can not make ASSERT(hmeblkp->hblk_hmecnt <= NHMENTS) 6601 * since pteload may have done a HME_ADD() right after 6602 * we did the HME_SUB() above. Hmecnt is now maintained 6603 * by cas only. no lock guranteed its value. The only 6604 * gurantee we have is the hmecnt should not be less than 6605 * what it should be so the hblk will not be taken away. 6606 * It's also important that we decremented the hmecnt after 6607 * we are done with hmeblkp so that this hmeblk won't be 6608 * stolen. 6609 */ 6610 ASSERT(hmeblkp->hblk_hmecnt > 0); 6611 ASSERT(hmeblkp->hblk_vcnt > 0); 6612 atomic_add_16(&hmeblkp->hblk_vcnt, -1); 6613 atomic_add_16(&hmeblkp->hblk_hmecnt, -1); 6614 /* 6615 * This is bug 4063182. 6616 * XXX: fixme 6617 * ASSERT(hmeblkp->hblk_hmecnt || hmeblkp->hblk_vcnt || 6618 * !hmeblkp->hblk_lckcnt); 6619 */ 6620 } else { 6621 panic("invalid tte? pp %p &tte %p", 6622 (void *)pp, (void *)&tte); 6623 } 6624 6625 return (cpuset); 6626 } 6627 6628 /* 6629 * While relocating a kernel page, this function will move the mappings 6630 * from tpp to dpp and modify any associated data with these mappings. 6631 * It also unsuspends the suspended kernel mapping. 6632 */ 6633 static void 6634 hat_pagereload(struct page *tpp, struct page *dpp) 6635 { 6636 struct sf_hment *sfhme; 6637 tte_t tte, ttemod; 6638 int index, cons; 6639 6640 ASSERT(getpil() == PIL_MAX); 6641 ASSERT(sfmmu_mlist_held(tpp)); 6642 ASSERT(sfmmu_mlist_held(dpp)); 6643 6644 index = PP_MAPINDEX(tpp); 6645 cons = TTE8K; 6646 6647 /* Update real mappings to the page */ 6648 retry: 6649 for (sfhme = tpp->p_mapping; sfhme != NULL; sfhme = sfhme->hme_next) { 6650 if (IS_PAHME(sfhme)) 6651 continue; 6652 sfmmu_copytte(&sfhme->hme_tte, &tte); 6653 ttemod = tte; 6654 6655 /* 6656 * replace old pfn with new pfn in TTE 6657 */ 6658 PFN_TO_TTE(ttemod, dpp->p_pagenum); 6659 6660 /* 6661 * clear suspend bit 6662 */ 6663 ASSERT(TTE_IS_SUSPEND(&ttemod)); 6664 TTE_CLR_SUSPEND(&ttemod); 6665 6666 if (sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte) < 0) 6667 panic("hat_pagereload(): sfmmu_modifytte_try() failed"); 6668 6669 /* 6670 * set hme_page point to new page 6671 */ 6672 sfhme->hme_page = dpp; 6673 } 6674 6675 /* 6676 * move p_mapping list from old page to new page 6677 */ 6678 dpp->p_mapping = tpp->p_mapping; 6679 tpp->p_mapping = NULL; 6680 dpp->p_share = tpp->p_share; 6681 tpp->p_share = 0; 6682 6683 while (index != 0) { 6684 index = index >> 1; 6685 if (index != 0) 6686 cons++; 6687 if (index & 0x1) { 6688 tpp = PP_GROUPLEADER(tpp, cons); 6689 dpp = PP_GROUPLEADER(dpp, cons); 6690 goto retry; 6691 } 6692 } 6693 6694 if (dtrace_kreloc_fini) 6695 (*dtrace_kreloc_fini)(); 6696 mutex_exit(&kpr_suspendlock); 6697 } 6698 6699 uint_t 6700 hat_pagesync(struct page *pp, uint_t clearflag) 6701 { 6702 struct sf_hment *sfhme, *tmphme = NULL; 6703 struct hme_blk *hmeblkp; 6704 kmutex_t *pml; 6705 cpuset_t cpuset, tset; 6706 int index, cons; 6707 extern ulong_t po_share; 6708 page_t *save_pp = pp; 6709 6710 CPUSET_ZERO(cpuset); 6711 6712 if (PP_ISRO(pp) && (clearflag & HAT_SYNC_STOPON_MOD)) { 6713 return (PP_GENERIC_ATTR(pp)); 6714 } 6715 6716 if ((clearflag == (HAT_SYNC_STOPON_REF | HAT_SYNC_DONTZERO)) && 6717 PP_ISREF(pp)) { 6718 return (PP_GENERIC_ATTR(pp)); 6719 } 6720 6721 if ((clearflag == (HAT_SYNC_STOPON_MOD | HAT_SYNC_DONTZERO)) && 6722 PP_ISMOD(pp)) { 6723 return (PP_GENERIC_ATTR(pp)); 6724 } 6725 6726 if ((clearflag & HAT_SYNC_STOPON_SHARED) != 0 && 6727 (pp->p_share 
> po_share) && 6728 !(clearflag & HAT_SYNC_ZERORM)) { 6729 if (PP_ISRO(pp)) 6730 hat_page_setattr(pp, P_REF); 6731 return (PP_GENERIC_ATTR(pp)); 6732 } 6733 6734 clearflag &= ~HAT_SYNC_STOPON_SHARED; 6735 pml = sfmmu_mlist_enter(pp); 6736 index = PP_MAPINDEX(pp); 6737 cons = TTE8K; 6738 retry: 6739 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) { 6740 /* 6741 * We need to save the next hment on the list since 6742 * it is possible for pagesync to remove an invalid hment 6743 * from the list. 6744 */ 6745 tmphme = sfhme->hme_next; 6746 /* 6747 * If we are looking for large mappings and this hme doesn't 6748 * reach the range we are seeking, just ignore it. 6749 */ 6750 hmeblkp = sfmmu_hmetohblk(sfhme); 6751 if (hmeblkp->hblk_xhat_bit) 6752 continue; 6753 6754 if (hme_size(sfhme) < cons) 6755 continue; 6756 tset = sfmmu_pagesync(pp, sfhme, 6757 clearflag & ~HAT_SYNC_STOPON_RM); 6758 CPUSET_OR(cpuset, tset); 6759 /* 6760 * If clearflag is HAT_SYNC_DONTZERO, break out as soon 6761 * as the "ref" or "mod" is set. 6762 */ 6763 if ((clearflag & ~HAT_SYNC_STOPON_RM) == HAT_SYNC_DONTZERO && 6764 ((clearflag & HAT_SYNC_STOPON_MOD) && PP_ISMOD(save_pp)) || 6765 ((clearflag & HAT_SYNC_STOPON_REF) && PP_ISREF(save_pp))) { 6766 index = 0; 6767 break; 6768 } 6769 } 6770 6771 while (index) { 6772 index = index >> 1; 6773 cons++; 6774 if (index & 0x1) { 6775 /* Go to leading page */ 6776 pp = PP_GROUPLEADER(pp, cons); 6777 goto retry; 6778 } 6779 } 6780 6781 xt_sync(cpuset); 6782 sfmmu_mlist_exit(pml); 6783 return (PP_GENERIC_ATTR(save_pp)); 6784 } 6785 6786 /* 6787 * Get all the hardware dependent attributes for a page struct 6788 */ 6789 static cpuset_t 6790 sfmmu_pagesync(struct page *pp, struct sf_hment *sfhme, 6791 uint_t clearflag) 6792 { 6793 caddr_t addr; 6794 tte_t tte, ttemod; 6795 struct hme_blk *hmeblkp; 6796 int ret; 6797 sfmmu_t *sfmmup; 6798 cpuset_t cpuset; 6799 6800 ASSERT(pp != NULL); 6801 ASSERT(sfmmu_mlist_held(pp)); 6802 ASSERT((clearflag == HAT_SYNC_DONTZERO) || 6803 (clearflag == HAT_SYNC_ZERORM)); 6804 6805 SFMMU_STAT(sf_pagesync); 6806 6807 CPUSET_ZERO(cpuset); 6808 6809 sfmmu_pagesync_retry: 6810 6811 sfmmu_copytte(&sfhme->hme_tte, &tte); 6812 if (TTE_IS_VALID(&tte)) { 6813 hmeblkp = sfmmu_hmetohblk(sfhme); 6814 sfmmup = hblktosfmmu(hmeblkp); 6815 addr = tte_to_vaddr(hmeblkp, tte); 6816 if (clearflag == HAT_SYNC_ZERORM) { 6817 ttemod = tte; 6818 TTE_CLR_RM(&ttemod); 6819 ret = sfmmu_modifytte_try(&tte, &ttemod, 6820 &sfhme->hme_tte); 6821 if (ret < 0) { 6822 /* 6823 * cas failed and the new value is not what 6824 * we want. 6825 */ 6826 goto sfmmu_pagesync_retry; 6827 } 6828 6829 if (ret > 0) { 6830 /* we win the cas */ 6831 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0); 6832 cpuset = sfmmup->sfmmu_cpusran; 6833 } 6834 } 6835 6836 sfmmu_ttesync(sfmmup, addr, &tte, pp); 6837 } 6838 return (cpuset); 6839 } 6840 6841 /* 6842 * Remove write permission from a mapping to a page, so that 6843 * we can detect the next modification of it. This requires modifying 6844 * the TTE then invalidating (demap) any TLB entry using that TTE. 6845 * This code is similar to sfmmu_pagesync().
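 * Both the writable and modify bits are cleared with a cas; if the cas
 * loses we simply retry, and if it wins we demap the TLB entry so the
 * next store to the page takes a protection fault.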
6846 */ 6847 static cpuset_t 6848 sfmmu_pageclrwrt(struct page *pp, struct sf_hment *sfhme) 6849 { 6850 caddr_t addr; 6851 tte_t tte; 6852 tte_t ttemod; 6853 struct hme_blk *hmeblkp; 6854 int ret; 6855 sfmmu_t *sfmmup; 6856 cpuset_t cpuset; 6857 6858 ASSERT(pp != NULL); 6859 ASSERT(sfmmu_mlist_held(pp)); 6860 6861 CPUSET_ZERO(cpuset); 6862 SFMMU_STAT(sf_clrwrt); 6863 6864 retry: 6865 6866 sfmmu_copytte(&sfhme->hme_tte, &tte); 6867 if (TTE_IS_VALID(&tte) && TTE_IS_WRITABLE(&tte)) { 6868 hmeblkp = sfmmu_hmetohblk(sfhme); 6869 6870 /* 6871 * xhat mappings should never be to a VMODSORT page. 6872 */ 6873 ASSERT(hmeblkp->hblk_xhat_bit == 0); 6874 6875 sfmmup = hblktosfmmu(hmeblkp); 6876 addr = tte_to_vaddr(hmeblkp, tte); 6877 6878 ttemod = tte; 6879 TTE_CLR_WRT(&ttemod); 6880 TTE_CLR_MOD(&ttemod); 6881 ret = sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte); 6882 6883 /* 6884 * if cas failed and the new value is not what 6885 * we want retry 6886 */ 6887 if (ret < 0) 6888 goto retry; 6889 6890 /* we win the cas */ 6891 if (ret > 0) { 6892 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0); 6893 cpuset = sfmmup->sfmmu_cpusran; 6894 } 6895 } 6896 6897 return (cpuset); 6898 } 6899 6900 /* 6901 * Walk all mappings of a page, removing write permission and clearing the 6902 * ref/mod bits. This code is similar to hat_pagesync() 6903 */ 6904 static void 6905 hat_page_clrwrt(page_t *pp) 6906 { 6907 struct sf_hment *sfhme; 6908 struct sf_hment *tmphme = NULL; 6909 kmutex_t *pml; 6910 cpuset_t cpuset; 6911 cpuset_t tset; 6912 int index; 6913 int cons; 6914 6915 CPUSET_ZERO(cpuset); 6916 6917 pml = sfmmu_mlist_enter(pp); 6918 index = PP_MAPINDEX(pp); 6919 cons = TTE8K; 6920 retry: 6921 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) { 6922 tmphme = sfhme->hme_next; 6923 6924 /* 6925 * If we are looking for large mappings and this hme doesn't 6926 * reach the range we are seeking, just ignore its. 6927 */ 6928 6929 if (hme_size(sfhme) < cons) 6930 continue; 6931 6932 tset = sfmmu_pageclrwrt(pp, sfhme); 6933 CPUSET_OR(cpuset, tset); 6934 } 6935 6936 while (index) { 6937 index = index >> 1; 6938 cons++; 6939 if (index & 0x1) { 6940 /* Go to leading page */ 6941 pp = PP_GROUPLEADER(pp, cons); 6942 goto retry; 6943 } 6944 } 6945 6946 xt_sync(cpuset); 6947 sfmmu_mlist_exit(pml); 6948 } 6949 6950 /* 6951 * Set the given REF/MOD/RO bits for the given page. 6952 * For a vnode with a sorted v_pages list, we need to change 6953 * the attributes and the v_pages list together under page_vnode_mutex. 6954 */ 6955 void 6956 hat_page_setattr(page_t *pp, uint_t flag) 6957 { 6958 vnode_t *vp = pp->p_vnode; 6959 page_t **listp; 6960 kmutex_t *pmtx; 6961 kmutex_t *vphm = NULL; 6962 6963 ASSERT(!(flag & ~(P_MOD | P_REF | P_RO))); 6964 6965 /* 6966 * nothing to do if attribute already set 6967 */ 6968 if ((pp->p_nrm & flag) == flag) 6969 return; 6970 6971 if ((flag & P_MOD) != 0 && vp != NULL && IS_VMODSORT(vp)) { 6972 vphm = page_vnode_mutex(vp); 6973 mutex_enter(vphm); 6974 } 6975 6976 pmtx = sfmmu_page_enter(pp); 6977 pp->p_nrm |= flag; 6978 sfmmu_page_exit(pmtx); 6979 6980 if (vphm != NULL) { 6981 /* 6982 * Some File Systems examine v_pages for NULL w/o 6983 * grabbing the vphm mutex. Must not let it become NULL when 6984 * pp is the only page on the list. 
6985 */ 6986 if (pp->p_vpnext != pp) { 6987 page_vpsub(&vp->v_pages, pp); 6988 if (vp->v_pages != NULL) 6989 listp = &vp->v_pages->p_vpprev->p_vpnext; 6990 else 6991 listp = &vp->v_pages; 6992 page_vpadd(listp, pp); 6993 } 6994 mutex_exit(vphm); 6995 } 6996 } 6997 6998 void 6999 hat_page_clrattr(page_t *pp, uint_t flag) 7000 { 7001 vnode_t *vp = pp->p_vnode; 7002 kmutex_t *vphm = NULL; 7003 kmutex_t *pmtx; 7004 7005 ASSERT(!(flag & ~(P_MOD | P_REF | P_RO))); 7006 7007 /* 7008 * For vnode with a sorted v_pages list, we need to change 7009 * the attributes and the v_pages list together under page_vnode_mutex. 7010 */ 7011 if ((flag & P_MOD) != 0 && vp != NULL && IS_VMODSORT(vp)) { 7012 vphm = page_vnode_mutex(vp); 7013 mutex_enter(vphm); 7014 } 7015 7016 pmtx = sfmmu_page_enter(pp); 7017 pp->p_nrm &= ~flag; 7018 sfmmu_page_exit(pmtx); 7019 7020 if (vphm != NULL) { 7021 /* 7022 * Some File Systems examine v_pages for NULL w/o 7023 * grabbing the vphm mutex. Must not let it become NULL when 7024 * pp is the only page on the list. 7025 */ 7026 if (pp->p_vpnext != pp) { 7027 page_vpsub(&vp->v_pages, pp); 7028 page_vpadd(&vp->v_pages, pp); 7029 } 7030 mutex_exit(vphm); 7031 7032 /* 7033 * VMODSORT works by removing write permissions and getting 7034 * a fault when a page is made dirty. At this point 7035 * we need to remove write permission from all mappings 7036 * to this page. 7037 */ 7038 hat_page_clrwrt(pp); 7039 } 7040 } 7041 7042 7043 uint_t 7044 hat_page_getattr(page_t *pp, uint_t flag) 7045 { 7046 ASSERT(!(flag & ~(P_MOD | P_REF | P_RO))); 7047 return ((uint_t)(pp->p_nrm & flag)); 7048 } 7049 7050 /* 7051 * DEBUG kernels: verify that a kernel va<->pa translation 7052 * is safe by checking the underlying page_t is in a page 7053 * relocation-safe state. 7054 */ 7055 #ifdef DEBUG 7056 void 7057 sfmmu_check_kpfn(pfn_t pfn) 7058 { 7059 page_t *pp; 7060 int index, cons; 7061 7062 if (hat_check_vtop == 0) 7063 return; 7064 7065 if (hat_kpr_enabled == 0 || kvseg.s_base == NULL || panicstr) 7066 return; 7067 7068 pp = page_numtopp_nolock(pfn); 7069 if (!pp) 7070 return; 7071 7072 if (PAGE_LOCKED(pp) || PP_ISNORELOC(pp)) 7073 return; 7074 7075 /* 7076 * Handed a large kernel page, we dig up the root page since we 7077 * know the root page might have the lock also. 7078 */ 7079 if (pp->p_szc != 0) { 7080 index = PP_MAPINDEX(pp); 7081 cons = TTE8K; 7082 again: 7083 while (index != 0) { 7084 index >>= 1; 7085 if (index != 0) 7086 cons++; 7087 if (index & 0x1) { 7088 pp = PP_GROUPLEADER(pp, cons); 7089 goto again; 7090 } 7091 } 7092 } 7093 7094 if (PAGE_LOCKED(pp) || PP_ISNORELOC(pp)) 7095 return; 7096 7097 /* 7098 * Pages need to be locked or allocated "permanent" (either from 7099 * static_arena arena or explicitly setting PG_NORELOC when calling 7100 * page_create_va()) for VA->PA translations to be valid. 7101 */ 7102 if (!PP_ISNORELOC(pp)) 7103 panic("Illegal VA->PA translation, pp 0x%p not permanent", pp); 7104 else 7105 panic("Illegal VA->PA translation, pp 0x%p not locked", pp); 7106 } 7107 #endif /* DEBUG */ 7108 7109 /* 7110 * Returns a page frame number for a given virtual address. 
7111 * Returns PFN_INVALID to indicate an invalid mapping 7112 */ 7113 pfn_t 7114 hat_getpfnum(struct hat *hat, caddr_t addr) 7115 { 7116 pfn_t pfn; 7117 tte_t tte; 7118 7119 /* 7120 * We would like to 7121 * ASSERT(AS_LOCK_HELD(as, &as->a_lock)); 7122 * but we can't because the iommu driver will call this 7123 * routine at interrupt time and it can't grab the as lock 7124 * or it will deadlock: A thread could have the as lock 7125 * and be waiting for io. The io can't complete 7126 * because the interrupt thread is blocked trying to grab 7127 * the as lock. 7128 */ 7129 7130 ASSERT(hat->sfmmu_xhat_provider == NULL); 7131 7132 if (hat == ksfmmup) { 7133 if (segkpm && IS_KPM_ADDR(addr)) 7134 return (sfmmu_kpm_vatopfn(addr)); 7135 while ((pfn = sfmmu_vatopfn(addr, ksfmmup, &tte)) 7136 == PFN_SUSPENDED) { 7137 sfmmu_vatopfn_suspended(addr, ksfmmup, &tte); 7138 } 7139 sfmmu_check_kpfn(pfn); 7140 return (pfn); 7141 } else { 7142 return (sfmmu_uvatopfn(addr, hat)); 7143 } 7144 } 7145 7146 /* 7147 * hat_getkpfnum() is an obsolete DDI routine, and its use is discouraged. 7148 * Use hat_getpfnum(kas.a_hat, ...) instead. 7149 * 7150 * We'd like to return PFN_INVALID if the mappings have underlying page_t's 7151 * but can't right now due to the fact that some software has grown to use 7152 * this interface incorrectly. So for now when the interface is misused, 7153 * return a warning to the user that in the future it won't work in the 7154 * way they're abusing it, and carry on (after disabling page relocation). 7155 */ 7156 pfn_t 7157 hat_getkpfnum(caddr_t addr) 7158 { 7159 pfn_t pfn; 7160 tte_t tte; 7161 int badcaller = 0; 7162 extern int segkmem_reloc; 7163 7164 if (segkpm && IS_KPM_ADDR(addr)) { 7165 badcaller = 1; 7166 pfn = sfmmu_kpm_vatopfn(addr); 7167 } else { 7168 while ((pfn = sfmmu_vatopfn(addr, ksfmmup, &tte)) 7169 == PFN_SUSPENDED) { 7170 sfmmu_vatopfn_suspended(addr, ksfmmup, &tte); 7171 } 7172 badcaller = pf_is_memory(pfn); 7173 } 7174 7175 if (badcaller) { 7176 /* 7177 * We can't return PFN_INVALID or the caller may panic 7178 * or corrupt the system. The only alternative is to 7179 * disable page relocation at this point for all kernel 7180 * memory. This will impact any callers of page_relocate() 7181 * such as FMA or DR. 7182 * 7183 * RFE: Add junk here to spit out an ereport so the sysadmin 7184 * can be advised that he should upgrade his device driver 7185 * so that this doesn't happen. 7186 */ 7187 hat_getkpfnum_badcall(caller()); 7188 if (hat_kpr_enabled && segkmem_reloc) { 7189 hat_kpr_enabled = 0; 7190 segkmem_reloc = 0; 7191 cmn_err(CE_WARN, "Kernel Page Relocation is DISABLED"); 7192 } 7193 } 7194 return (pfn); 7195 } 7196 7197 pfn_t 7198 sfmmu_uvatopfn(caddr_t vaddr, struct hat *sfmmup) 7199 { 7200 struct hmehash_bucket *hmebp; 7201 hmeblk_tag hblktag; 7202 int hmeshift, hashno = 1; 7203 struct hme_blk *hmeblkp = NULL; 7204 7205 struct sf_hment *sfhmep; 7206 tte_t tte; 7207 pfn_t pfn; 7208 7209 /* support for ISM */ 7210 ism_map_t *ism_map; 7211 ism_blk_t *ism_blkp; 7212 int i; 7213 sfmmu_t *ism_hatid = NULL; 7214 sfmmu_t *locked_hatid = NULL; 7215 7216 7217 ASSERT(sfmmup != ksfmmup); 7218 SFMMU_STAT(sf_user_vtop); 7219 /* 7220 * Set ism_hatid if vaddr falls in a ISM segment. 
7221 */ 7222 ism_blkp = sfmmup->sfmmu_iblk; 7223 if (ism_blkp) { 7224 sfmmu_ismhat_enter(sfmmup, 0); 7225 locked_hatid = sfmmup; 7226 } 7227 while (ism_blkp && ism_hatid == NULL) { 7228 ism_map = ism_blkp->iblk_maps; 7229 for (i = 0; ism_map[i].imap_ismhat && i < ISM_MAP_SLOTS; i++) { 7230 if (vaddr >= ism_start(ism_map[i]) && 7231 vaddr < ism_end(ism_map[i])) { 7232 sfmmup = ism_hatid = ism_map[i].imap_ismhat; 7233 vaddr = (caddr_t)(vaddr - 7234 ism_start(ism_map[i])); 7235 break; 7236 } 7237 } 7238 ism_blkp = ism_blkp->iblk_next; 7239 } 7240 if (locked_hatid) { 7241 sfmmu_ismhat_exit(locked_hatid, 0); 7242 } 7243 7244 hblktag.htag_id = sfmmup; 7245 do { 7246 hmeshift = HME_HASH_SHIFT(hashno); 7247 hblktag.htag_bspage = HME_HASH_BSPAGE(vaddr, hmeshift); 7248 hblktag.htag_rehash = hashno; 7249 hmebp = HME_HASH_FUNCTION(sfmmup, vaddr, hmeshift); 7250 7251 SFMMU_HASH_LOCK(hmebp); 7252 7253 HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp); 7254 if (hmeblkp != NULL) { 7255 HBLKTOHME(sfhmep, hmeblkp, vaddr); 7256 sfmmu_copytte(&sfhmep->hme_tte, &tte); 7257 if (TTE_IS_VALID(&tte)) { 7258 pfn = TTE_TO_PFN(vaddr, &tte); 7259 } else { 7260 pfn = PFN_INVALID; 7261 } 7262 SFMMU_HASH_UNLOCK(hmebp); 7263 return (pfn); 7264 } 7265 SFMMU_HASH_UNLOCK(hmebp); 7266 hashno++; 7267 } while (HME_REHASH(sfmmup) && (hashno <= mmu_hashcnt)); 7268 return (PFN_INVALID); 7269 } 7270 7271 7272 /* 7273 * For compatibility with AT&T and later optimizations 7274 */ 7275 /* ARGSUSED */ 7276 void 7277 hat_map(struct hat *hat, caddr_t addr, size_t len, uint_t flags) 7278 { 7279 ASSERT(hat != NULL); 7280 ASSERT(hat->sfmmu_xhat_provider == NULL); 7281 } 7282 7283 /* 7284 * Return the number of mappings to a particular page. 7285 * This number is an approximation of the number of 7286 * people sharing the page. 7287 */ 7288 ulong_t 7289 hat_page_getshare(page_t *pp) 7290 { 7291 page_t *spp = pp; /* start page */ 7292 kmutex_t *pml; 7293 ulong_t cnt; 7294 int index, sz = TTE64K; 7295 7296 /* 7297 * We need to grab the mlist lock to make sure any outstanding 7298 * load/unloads complete. Otherwise we could return zero 7299 * even though the unload(s) hasn't finished yet. 7300 */ 7301 pml = sfmmu_mlist_enter(spp); 7302 cnt = spp->p_share; 7303 7304 if (kpm_enable) 7305 cnt += spp->p_kpmref; 7306 7307 /* 7308 * If we have any large mappings, we count the number of 7309 * mappings that this large page is part of. 7310 */ 7311 index = PP_MAPINDEX(spp); 7312 index >>= 1; 7313 while (index) { 7314 pp = PP_GROUPLEADER(spp, sz); 7315 if ((index & 0x1) && pp != spp) { 7316 cnt += pp->p_share; 7317 spp = pp; 7318 } 7319 index >>= 1; 7320 sz++; 7321 } 7322 sfmmu_mlist_exit(pml); 7323 return (cnt); 7324 } 7325 7326 /* 7327 * Unload all large mappings to the pp and reset the p_szc field of every 7328 * constituent page according to the remaining mappings. 7329 * 7330 * pp must be locked SE_EXCL. Even though no other constituent pages are 7331 * locked it's legal to unload the large mappings to the pp because all 7332 * constituent pages of large locked mappings have to be locked SE_SHARED. 7333 * This means if we have SE_EXCL lock on one of constituent pages none of the 7334 * large mappings to pp are locked. 7335 * 7336 * Decrease p_szc field starting from the last constituent page and ending 7337 * with the root page. This method is used because other threads rely on the 7338 * root's p_szc to find the lock to synchronize on. Once a root page_t's p_szc 7339 * is demoted, other threads will succeed in sfmmu_mlspl_enter().
This 7340 * ensures that p_szc changes of the constituent pages appears atomic for all 7341 * threads that use sfmmu_mlspl_enter() to examine p_szc field. 7342 * 7343 * This mechanism is only used for file system pages where it's not always 7344 * possible to get SE_EXCL locks on all constituent pages to demote the size 7345 * code (as is done for anonymous or kernel large pages). 7346 * 7347 * See more comments in front of sfmmu_mlspl_enter(). 7348 */ 7349 void 7350 hat_page_demote(page_t *pp) 7351 { 7352 int index; 7353 int sz; 7354 cpuset_t cpuset; 7355 int sync = 0; 7356 page_t *rootpp; 7357 struct sf_hment *sfhme; 7358 struct sf_hment *tmphme = NULL; 7359 struct hme_blk *hmeblkp; 7360 uint_t pszc; 7361 page_t *lastpp; 7362 cpuset_t tset; 7363 pgcnt_t npgs; 7364 kmutex_t *pml; 7365 kmutex_t *pmtx = NULL; 7366 7367 ASSERT(PAGE_EXCL(pp)); 7368 ASSERT(!PP_ISFREE(pp)); 7369 ASSERT(page_szc_lock_assert(pp)); 7370 pml = sfmmu_mlist_enter(pp); 7371 7372 pszc = pp->p_szc; 7373 if (pszc == 0) { 7374 goto out; 7375 } 7376 7377 index = PP_MAPINDEX(pp) >> 1; 7378 7379 if (index) { 7380 CPUSET_ZERO(cpuset); 7381 sz = TTE64K; 7382 sync = 1; 7383 } 7384 7385 while (index) { 7386 if (!(index & 0x1)) { 7387 index >>= 1; 7388 sz++; 7389 continue; 7390 } 7391 ASSERT(sz <= pszc); 7392 rootpp = PP_GROUPLEADER(pp, sz); 7393 for (sfhme = rootpp->p_mapping; sfhme; sfhme = tmphme) { 7394 tmphme = sfhme->hme_next; 7395 hmeblkp = sfmmu_hmetohblk(sfhme); 7396 if (hme_size(sfhme) != sz) { 7397 continue; 7398 } 7399 if (hmeblkp->hblk_xhat_bit) { 7400 cmn_err(CE_PANIC, 7401 "hat_page_demote: xhat hmeblk"); 7402 } 7403 tset = sfmmu_pageunload(rootpp, sfhme, sz); 7404 CPUSET_OR(cpuset, tset); 7405 } 7406 if (index >>= 1) { 7407 sz++; 7408 } 7409 } 7410 7411 ASSERT(!PP_ISMAPPED_LARGE(pp)); 7412 7413 if (sync) { 7414 xt_sync(cpuset); 7415 if (PP_ISTNC(pp)) { 7416 conv_tnc(rootpp, sz); 7417 } 7418 } 7419 7420 pmtx = sfmmu_page_enter(pp); 7421 7422 ASSERT(pp->p_szc == pszc); 7423 rootpp = PP_PAGEROOT(pp); 7424 ASSERT(rootpp->p_szc == pszc); 7425 lastpp = PP_PAGENEXT_N(rootpp, TTEPAGES(pszc) - 1); 7426 7427 while (lastpp != rootpp) { 7428 sz = PP_MAPINDEX(lastpp) ? fnd_mapping_sz(lastpp) : 0; 7429 ASSERT(sz < pszc); 7430 npgs = (sz == 0) ? 1 : TTEPAGES(sz); 7431 ASSERT(P2PHASE(lastpp->p_pagenum, npgs) == npgs - 1); 7432 while (--npgs > 0) { 7433 lastpp->p_szc = (uchar_t)sz; 7434 lastpp = PP_PAGEPREV(lastpp); 7435 } 7436 if (sz) { 7437 /* 7438 * make sure before current root's pszc 7439 * is updated all updates to constituent pages pszc 7440 * fields are globally visible. 7441 */ 7442 membar_producer(); 7443 } 7444 lastpp->p_szc = sz; 7445 ASSERT(IS_P2ALIGNED(lastpp->p_pagenum, TTEPAGES(sz))); 7446 if (lastpp != rootpp) { 7447 lastpp = PP_PAGEPREV(lastpp); 7448 } 7449 } 7450 if (sz == 0) { 7451 /* the loop above doesn't cover this case */ 7452 rootpp->p_szc = 0; 7453 } 7454 out: 7455 ASSERT(pp->p_szc == 0); 7456 if (pmtx != NULL) { 7457 sfmmu_page_exit(pmtx); 7458 } 7459 sfmmu_mlist_exit(pml); 7460 } 7461 7462 /* 7463 * Refresh the HAT ismttecnt[] element for size szc. 7464 * Caller must have set ISM busy flag to prevent mapping 7465 * lists from changing while we're traversing them. 
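 * The count is simply the sum of the ISM hats' sfmmu_ttecnt[szc] over
 * every ISM segment mapped by this process.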
7466 */ 7467 pgcnt_t 7468 ism_tsb_entries(sfmmu_t *sfmmup, int szc) 7469 { 7470 ism_blk_t *ism_blkp = sfmmup->sfmmu_iblk; 7471 ism_map_t *ism_map; 7472 pgcnt_t npgs = 0; 7473 int j; 7474 7475 ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)); 7476 for (; ism_blkp != NULL; ism_blkp = ism_blkp->iblk_next) { 7477 ism_map = ism_blkp->iblk_maps; 7478 for (j = 0; ism_map[j].imap_ismhat && j < ISM_MAP_SLOTS; j++) 7479 npgs += ism_map[j].imap_ismhat->sfmmu_ttecnt[szc]; 7480 } 7481 sfmmup->sfmmu_ismttecnt[szc] = npgs; 7482 return (npgs); 7483 } 7484 7485 /* 7486 * Yield the memory claim requirement for an address space. 7487 * 7488 * This is currently implemented as the number of bytes that have active 7489 * hardware translations that have page structures. Therefore, it can 7490 * underestimate the traditional resident set size, eg, if the 7491 * physical page is present and the hardware translation is missing; 7492 * and it can overestimate the rss, eg, if there are active 7493 * translations to a frame buffer with page structs. 7494 * Also, it does not take sharing into account. 7495 * 7496 * Note that we don't acquire locks here since this function is most often 7497 * called from the clock thread. 7498 */ 7499 size_t 7500 hat_get_mapped_size(struct hat *hat) 7501 { 7502 size_t assize = 0; 7503 int i; 7504 7505 if (hat == NULL) 7506 return (0); 7507 7508 ASSERT(hat->sfmmu_xhat_provider == NULL); 7509 7510 for (i = 0; i < mmu_page_sizes; i++) 7511 assize += (pgcnt_t)hat->sfmmu_ttecnt[i] * TTEBYTES(i); 7512 7513 if (hat->sfmmu_iblk == NULL) 7514 return (assize); 7515 7516 for (i = 0; i < mmu_page_sizes; i++) 7517 assize += (pgcnt_t)hat->sfmmu_ismttecnt[i] * TTEBYTES(i); 7518 7519 return (assize); 7520 } 7521 7522 int 7523 hat_stats_enable(struct hat *hat) 7524 { 7525 hatlock_t *hatlockp; 7526 7527 ASSERT(hat->sfmmu_xhat_provider == NULL); 7528 7529 hatlockp = sfmmu_hat_enter(hat); 7530 hat->sfmmu_rmstat++; 7531 sfmmu_hat_exit(hatlockp); 7532 return (1); 7533 } 7534 7535 void 7536 hat_stats_disable(struct hat *hat) 7537 { 7538 hatlock_t *hatlockp; 7539 7540 ASSERT(hat->sfmmu_xhat_provider == NULL); 7541 7542 hatlockp = sfmmu_hat_enter(hat); 7543 hat->sfmmu_rmstat--; 7544 sfmmu_hat_exit(hatlockp); 7545 } 7546 7547 /* 7548 * Routines for entering or removing ourselves from the 7549 * ism_hat's mapping list. 7550 */ 7551 static void 7552 iment_add(struct ism_ment *iment, struct hat *ism_hat) 7553 { 7554 ASSERT(MUTEX_HELD(&ism_mlist_lock)); 7555 7556 iment->iment_prev = NULL; 7557 iment->iment_next = ism_hat->sfmmu_iment; 7558 if (ism_hat->sfmmu_iment) { 7559 ism_hat->sfmmu_iment->iment_prev = iment; 7560 } 7561 ism_hat->sfmmu_iment = iment; 7562 } 7563 7564 static void 7565 iment_sub(struct ism_ment *iment, struct hat *ism_hat) 7566 { 7567 ASSERT(MUTEX_HELD(&ism_mlist_lock)); 7568 7569 if (ism_hat->sfmmu_iment == NULL) { 7570 panic("ism map entry remove - no entries"); 7571 } 7572 7573 if (iment->iment_prev) { 7574 ASSERT(ism_hat->sfmmu_iment != iment); 7575 iment->iment_prev->iment_next = iment->iment_next; 7576 } else { 7577 ASSERT(ism_hat->sfmmu_iment == iment); 7578 ism_hat->sfmmu_iment = iment->iment_next; 7579 } 7580 7581 if (iment->iment_next) { 7582 iment->iment_next->iment_prev = iment->iment_prev; 7583 } 7584 7585 /* 7586 * zero out the entry 7587 */ 7588 iment->iment_next = NULL; 7589 iment->iment_prev = NULL; 7590 iment->iment_hat = NULL; 7591 } 7592 7593 /* 7594 * Hat_share()/unshare() return an (non-zero) error 7595 * when saddr and daddr are not properly aligned. 
7596 * 7597 * The top level mapping element determines the alignment 7598 * requirement for saddr and daddr, depending on different 7599 * architectures. 7600 * 7601 * When hat_share()/unshare() are not supported, 7602 * HATOP_SHARE()/UNSHARE() return 0 7603 */ 7604 int 7605 hat_share(struct hat *sfmmup, caddr_t addr, 7606 struct hat *ism_hatid, caddr_t sptaddr, size_t len, uint_t ismszc) 7607 { 7608 ism_blk_t *ism_blkp; 7609 ism_blk_t *new_iblk; 7610 ism_map_t *ism_map; 7611 ism_ment_t *ism_ment; 7612 int i, added; 7613 hatlock_t *hatlockp; 7614 int reload_mmu = 0; 7615 uint_t ismshift = page_get_shift(ismszc); 7616 size_t ismpgsz = page_get_pagesize(ismszc); 7617 uint_t ismmask = (uint_t)ismpgsz - 1; 7618 size_t sh_size = ISM_SHIFT(ismshift, len); 7619 ushort_t ismhatflag; 7620 7621 #ifdef DEBUG 7622 caddr_t eaddr = addr + len; 7623 #endif /* DEBUG */ 7624 7625 ASSERT(ism_hatid != NULL && sfmmup != NULL); 7626 ASSERT(sptaddr == ISMID_STARTADDR); 7627 /* 7628 * Check the alignment. 7629 */ 7630 if (!ISM_ALIGNED(ismshift, addr) || !ISM_ALIGNED(ismshift, sptaddr)) 7631 return (EINVAL); 7632 7633 /* 7634 * Check size alignment. 7635 */ 7636 if (!ISM_ALIGNED(ismshift, len)) 7637 return (EINVAL); 7638 7639 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 7640 7641 /* 7642 * Allocate ism_ment for the ism_hat's mapping list, and an 7643 * ism map blk in case we need one. We must do our 7644 * allocations before acquiring locks to prevent a deadlock 7645 * in the kmem allocator on the mapping list lock. 7646 */ 7647 new_iblk = kmem_cache_alloc(ism_blk_cache, KM_SLEEP); 7648 ism_ment = kmem_cache_alloc(ism_ment_cache, KM_SLEEP); 7649 7650 /* 7651 * Serialize ISM mappings with the ISM busy flag, and also the 7652 * trap handlers. 7653 */ 7654 sfmmu_ismhat_enter(sfmmup, 0); 7655 7656 /* 7657 * Allocate an ism map blk if necessary. 7658 */ 7659 if (sfmmup->sfmmu_iblk == NULL) { 7660 sfmmup->sfmmu_iblk = new_iblk; 7661 bzero(new_iblk, sizeof (*new_iblk)); 7662 new_iblk->iblk_nextpa = (uint64_t)-1; 7663 membar_stst(); /* make sure next ptr visible to all CPUs */ 7664 sfmmup->sfmmu_ismblkpa = va_to_pa((caddr_t)new_iblk); 7665 reload_mmu = 1; 7666 new_iblk = NULL; 7667 } 7668 7669 #ifdef DEBUG 7670 /* 7671 * Make sure mapping does not already exist. 7672 */ 7673 ism_blkp = sfmmup->sfmmu_iblk; 7674 while (ism_blkp) { 7675 ism_map = ism_blkp->iblk_maps; 7676 for (i = 0; i < ISM_MAP_SLOTS && ism_map[i].imap_ismhat; i++) { 7677 if ((addr >= ism_start(ism_map[i]) && 7678 addr < ism_end(ism_map[i])) || 7679 eaddr > ism_start(ism_map[i]) && 7680 eaddr <= ism_end(ism_map[i])) { 7681 panic("sfmmu_share: Already mapped!"); 7682 } 7683 } 7684 ism_blkp = ism_blkp->iblk_next; 7685 } 7686 #endif /* DEBUG */ 7687 7688 ASSERT(ismszc >= TTE4M); 7689 if (ismszc == TTE4M) { 7690 ismhatflag = HAT_4M_FLAG; 7691 } else if (ismszc == TTE32M) { 7692 ismhatflag = HAT_32M_FLAG; 7693 } else if (ismszc == TTE256M) { 7694 ismhatflag = HAT_256M_FLAG; 7695 } 7696 /* 7697 * Add mapping to first available mapping slot. 7698 */ 7699 ism_blkp = sfmmup->sfmmu_iblk; 7700 added = 0; 7701 while (!added) { 7702 ism_map = ism_blkp->iblk_maps; 7703 for (i = 0; i < ISM_MAP_SLOTS; i++) { 7704 if (ism_map[i].imap_ismhat == NULL) { 7705 7706 ism_map[i].imap_ismhat = ism_hatid; 7707 ism_map[i].imap_vb_shift = (ushort_t)ismshift; 7708 ism_map[i].imap_hatflags = ismhatflag; 7709 ism_map[i].imap_sz_mask = ismmask; 7710 /* 7711 * imap_seg is checked in ISM_CHECK to see if 7712 * non-NULL, then other info assumed valid. 
7713 */ 7714 membar_stst(); 7715 ism_map[i].imap_seg = (uintptr_t)addr | sh_size; 7716 ism_map[i].imap_ment = ism_ment; 7717 7718 /* 7719 * Now add ourselves to the ism_hat's 7720 * mapping list. 7721 */ 7722 ism_ment->iment_hat = sfmmup; 7723 ism_ment->iment_base_va = addr; 7724 ism_hatid->sfmmu_ismhat = 1; 7725 ism_hatid->sfmmu_flags = 0; 7726 mutex_enter(&ism_mlist_lock); 7727 iment_add(ism_ment, ism_hatid); 7728 mutex_exit(&ism_mlist_lock); 7729 added = 1; 7730 break; 7731 } 7732 } 7733 if (!added && ism_blkp->iblk_next == NULL) { 7734 ism_blkp->iblk_next = new_iblk; 7735 new_iblk = NULL; 7736 bzero(ism_blkp->iblk_next, 7737 sizeof (*ism_blkp->iblk_next)); 7738 ism_blkp->iblk_next->iblk_nextpa = (uint64_t)-1; 7739 membar_stst(); 7740 ism_blkp->iblk_nextpa = 7741 va_to_pa((caddr_t)ism_blkp->iblk_next); 7742 } 7743 ism_blkp = ism_blkp->iblk_next; 7744 } 7745 7746 /* 7747 * Update our counters for this sfmmup's ism mappings. 7748 */ 7749 for (i = 0; i <= ismszc; i++) { 7750 if (!(disable_ism_large_pages & (1 << i))) 7751 (void) ism_tsb_entries(sfmmup, i); 7752 } 7753 7754 hatlockp = sfmmu_hat_enter(sfmmup); 7755 7756 /* 7757 * For ISM and DISM we do not support 512K pages, so we only 7758 * only search the 4M and 8K/64K hashes for 4 pagesize cpus, and search 7759 * the 256M or 32M, and 4M and 8K/64K hashes for 6 pagesize cpus. 7760 */ 7761 ASSERT((disable_ism_large_pages & (1 << TTE512K)) != 0); 7762 7763 if (ismszc > TTE4M && !SFMMU_FLAGS_ISSET(sfmmup, HAT_4M_FLAG)) 7764 SFMMU_FLAGS_SET(sfmmup, HAT_4M_FLAG); 7765 7766 if (!SFMMU_FLAGS_ISSET(sfmmup, HAT_64K_FLAG)) 7767 SFMMU_FLAGS_SET(sfmmup, HAT_64K_FLAG); 7768 7769 /* 7770 * If we updated the ismblkpa for this HAT or we need 7771 * to start searching the 256M or 32M or 4M hash, we must 7772 * make sure all CPUs running this process reload their 7773 * tsbmiss area. Otherwise they will fail to load the mappings 7774 * in the tsbmiss handler and will loop calling pagefault(). 7775 */ 7776 switch (ismszc) { 7777 case TTE256M: 7778 if (reload_mmu || !SFMMU_FLAGS_ISSET(sfmmup, HAT_256M_FLAG)) { 7779 SFMMU_FLAGS_SET(sfmmup, HAT_256M_FLAG); 7780 sfmmu_sync_mmustate(sfmmup); 7781 } 7782 break; 7783 case TTE32M: 7784 if (reload_mmu || !SFMMU_FLAGS_ISSET(sfmmup, HAT_32M_FLAG)) { 7785 SFMMU_FLAGS_SET(sfmmup, HAT_32M_FLAG); 7786 sfmmu_sync_mmustate(sfmmup); 7787 } 7788 break; 7789 case TTE4M: 7790 if (reload_mmu || !SFMMU_FLAGS_ISSET(sfmmup, HAT_4M_FLAG)) { 7791 SFMMU_FLAGS_SET(sfmmup, HAT_4M_FLAG); 7792 sfmmu_sync_mmustate(sfmmup); 7793 } 7794 break; 7795 default: 7796 break; 7797 } 7798 7799 /* 7800 * Now we can drop the locks. 7801 */ 7802 sfmmu_ismhat_exit(sfmmup, 1); 7803 sfmmu_hat_exit(hatlockp); 7804 7805 /* 7806 * Free up ismblk if we didn't use it. 7807 */ 7808 if (new_iblk != NULL) 7809 kmem_cache_free(ism_blk_cache, new_iblk); 7810 7811 /* 7812 * Check TSB and TLB page sizes. 7813 */ 7814 sfmmu_check_page_sizes(sfmmup, 1); 7815 7816 return (0); 7817 } 7818 7819 /* 7820 * hat_unshare removes exactly one ism_map from 7821 * this process's as. It expects multiple calls 7822 * to hat_unshare for multiple shm segments. 
7823 */ 7824 void 7825 hat_unshare(struct hat *sfmmup, caddr_t addr, size_t len, uint_t ismszc) 7826 { 7827 ism_map_t *ism_map; 7828 ism_ment_t *free_ment = NULL; 7829 ism_blk_t *ism_blkp; 7830 struct hat *ism_hatid; 7831 int found, i; 7832 hatlock_t *hatlockp; 7833 struct tsb_info *tsbinfo; 7834 uint_t ismshift = page_get_shift(ismszc); 7835 size_t sh_size = ISM_SHIFT(ismshift, len); 7836 7837 ASSERT(ISM_ALIGNED(ismshift, addr)); 7838 ASSERT(ISM_ALIGNED(ismshift, len)); 7839 ASSERT(sfmmup != NULL); 7840 ASSERT(sfmmup != ksfmmup); 7841 7842 if (sfmmup->sfmmu_xhat_provider) { 7843 XHAT_UNSHARE(sfmmup, addr, len); 7844 return; 7845 } else { 7846 /* 7847 * This must be a CPU HAT. If the address space has 7848 * XHATs attached, inform all XHATs that ISM segment 7849 * is going away 7850 */ 7851 ASSERT(sfmmup->sfmmu_as != NULL); 7852 if (sfmmup->sfmmu_as->a_xhat != NULL) 7853 xhat_unshare_all(sfmmup->sfmmu_as, addr, len); 7854 } 7855 7856 /* 7857 * Make sure that during the entire time ISM mappings are removed, 7858 * the trap handlers serialize behind us, and that no one else 7859 * can be mucking with ISM mappings. This also lets us get away 7860 * with not doing expensive cross calls to flush the TLB -- we 7861 * just discard the context, flush the entire TSB, and call it 7862 * a day. 7863 */ 7864 sfmmu_ismhat_enter(sfmmup, 0); 7865 7866 /* 7867 * Remove the mapping. 7868 * 7869 * We can't have any holes in the ism map. 7870 * The tsb miss code while searching the ism map will 7871 * stop on an empty map slot. So we must move 7872 * everyone past the hole up 1 if any. 7873 * 7874 * Also empty ism map blks are not freed until the 7875 * process exits. This is to prevent a MT race condition 7876 * between sfmmu_unshare() and sfmmu_tsbmiss_exception(). 7877 */ 7878 found = 0; 7879 ism_blkp = sfmmup->sfmmu_iblk; 7880 while (!found && ism_blkp) { 7881 ism_map = ism_blkp->iblk_maps; 7882 for (i = 0; i < ISM_MAP_SLOTS; i++) { 7883 if (addr == ism_start(ism_map[i]) && 7884 sh_size == (size_t)(ism_size(ism_map[i]))) { 7885 found = 1; 7886 break; 7887 } 7888 } 7889 if (!found) 7890 ism_blkp = ism_blkp->iblk_next; 7891 } 7892 7893 if (found) { 7894 ism_hatid = ism_map[i].imap_ismhat; 7895 ASSERT(ism_hatid != NULL); 7896 ASSERT(ism_hatid->sfmmu_ismhat == 1); 7897 7898 /* 7899 * First remove ourselves from the ism mapping list. 7900 */ 7901 mutex_enter(&ism_mlist_lock); 7902 iment_sub(ism_map[i].imap_ment, ism_hatid); 7903 mutex_exit(&ism_mlist_lock); 7904 free_ment = ism_map[i].imap_ment; 7905 7906 /* 7907 * Now gurantee that any other cpu 7908 * that tries to process an ISM miss 7909 * will go to tl=0. 7910 */ 7911 hatlockp = sfmmu_hat_enter(sfmmup); 7912 7913 sfmmu_invalidate_ctx(sfmmup); 7914 7915 sfmmu_hat_exit(hatlockp); 7916 7917 /* 7918 * We delete the ism map by copying 7919 * the next map over the current one. 7920 * We will take the next one in the maps 7921 * array or from the next ism_blk. 
7922 */ 7923 while (ism_blkp) { 7924 ism_map = ism_blkp->iblk_maps; 7925 while (i < (ISM_MAP_SLOTS - 1)) { 7926 ism_map[i] = ism_map[i + 1]; 7927 i++; 7928 } 7929 /* i == (ISM_MAP_SLOTS - 1) */ 7930 ism_blkp = ism_blkp->iblk_next; 7931 if (ism_blkp) { 7932 ism_map[i] = ism_blkp->iblk_maps[0]; 7933 i = 0; 7934 } else { 7935 ism_map[i].imap_seg = 0; 7936 ism_map[i].imap_vb_shift = 0; 7937 ism_map[i].imap_hatflags = 0; 7938 ism_map[i].imap_sz_mask = 0; 7939 ism_map[i].imap_ismhat = NULL; 7940 ism_map[i].imap_ment = NULL; 7941 } 7942 } 7943 7944 /* 7945 * Now flush entire TSB for the process, since 7946 * demapping page by page can be too expensive. 7947 * We don't have to flush the TLB here anymore 7948 * since we switch to a new TLB ctx instead. 7949 * Also, there is no need to flush if the process 7950 * is exiting since the TSB will be freed later. 7951 */ 7952 if (!sfmmup->sfmmu_free) { 7953 hatlockp = sfmmu_hat_enter(sfmmup); 7954 for (tsbinfo = sfmmup->sfmmu_tsb; tsbinfo != NULL; 7955 tsbinfo = tsbinfo->tsb_next) { 7956 if (tsbinfo->tsb_flags & TSB_SWAPPED) 7957 continue; 7958 sfmmu_inv_tsb(tsbinfo->tsb_va, 7959 TSB_BYTES(tsbinfo->tsb_szc)); 7960 } 7961 sfmmu_hat_exit(hatlockp); 7962 } 7963 } 7964 7965 /* 7966 * Update our counters for this sfmmup's ism mappings. 7967 */ 7968 for (i = 0; i <= ismszc; i++) { 7969 if (!(disable_ism_large_pages & (1 << i))) 7970 (void) ism_tsb_entries(sfmmup, i); 7971 } 7972 7973 sfmmu_ismhat_exit(sfmmup, 0); 7974 7975 /* 7976 * We must do our freeing here after dropping locks 7977 * to prevent a deadlock in the kmem allocator on the 7978 * mapping list lock. 7979 */ 7980 if (free_ment != NULL) 7981 kmem_cache_free(ism_ment_cache, free_ment); 7982 7983 /* 7984 * Check TSB and TLB page sizes if the process isn't exiting. 7985 */ 7986 if (!sfmmup->sfmmu_free) 7987 sfmmu_check_page_sizes(sfmmup, 0); 7988 } 7989 7990 /* ARGSUSED */ 7991 static int 7992 sfmmu_idcache_constructor(void *buf, void *cdrarg, int kmflags) 7993 { 7994 /* void *buf is sfmmu_t pointer */ 7995 return (0); 7996 } 7997 7998 /* ARGSUSED */ 7999 static void 8000 sfmmu_idcache_destructor(void *buf, void *cdrarg) 8001 { 8002 /* void *buf is sfmmu_t pointer */ 8003 } 8004 8005 /* 8006 * setup kmem hmeblks by bzeroing all members and initializing the nextpa 8007 * field to be the pa of this hmeblk 8008 */ 8009 /* ARGSUSED */ 8010 static int 8011 sfmmu_hblkcache_constructor(void *buf, void *cdrarg, int kmflags) 8012 { 8013 struct hme_blk *hmeblkp; 8014 8015 bzero(buf, (size_t)cdrarg); 8016 hmeblkp = (struct hme_blk *)buf; 8017 hmeblkp->hblk_nextpa = va_to_pa((caddr_t)hmeblkp); 8018 8019 #ifdef HBLK_TRACE 8020 mutex_init(&hmeblkp->hblk_audit_lock, NULL, MUTEX_DEFAULT, NULL); 8021 #endif /* HBLK_TRACE */ 8022 8023 return (0); 8024 } 8025 8026 /* ARGSUSED */ 8027 static void 8028 sfmmu_hblkcache_destructor(void *buf, void *cdrarg) 8029 { 8030 8031 #ifdef HBLK_TRACE 8032 8033 struct hme_blk *hmeblkp; 8034 8035 hmeblkp = (struct hme_blk *)buf; 8036 mutex_destroy(&hmeblkp->hblk_audit_lock); 8037 8038 #endif /* HBLK_TRACE */ 8039 } 8040 8041 #define SFMMU_CACHE_RECLAIM_SCAN_RATIO 8 8042 static int sfmmu_cache_reclaim_scan_ratio = SFMMU_CACHE_RECLAIM_SCAN_RATIO; 8043 /* 8044 * The kmem allocator will callback into our reclaim routine when the system 8045 * is running low in memory. We traverse the hash and free up all unused but 8046 * still cached hme_blks. We also traverse the free list and free them up 8047 * as well. 
8048 */ 8049 /*ARGSUSED*/ 8050 static void 8051 sfmmu_hblkcache_reclaim(void *cdrarg) 8052 { 8053 int i; 8054 uint64_t hblkpa, prevpa, nx_pa; 8055 struct hmehash_bucket *hmebp; 8056 struct hme_blk *hmeblkp, *nx_hblk, *pr_hblk = NULL; 8057 static struct hmehash_bucket *uhmehash_reclaim_hand; 8058 static struct hmehash_bucket *khmehash_reclaim_hand; 8059 struct hme_blk *list = NULL; 8060 8061 hmebp = uhmehash_reclaim_hand; 8062 if (hmebp == NULL || hmebp > &uhme_hash[UHMEHASH_SZ]) 8063 uhmehash_reclaim_hand = hmebp = uhme_hash; 8064 uhmehash_reclaim_hand += UHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio; 8065 8066 for (i = UHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio; i; i--) { 8067 if (SFMMU_HASH_LOCK_TRYENTER(hmebp) != 0) { 8068 hmeblkp = hmebp->hmeblkp; 8069 hblkpa = hmebp->hmeh_nextpa; 8070 prevpa = 0; 8071 pr_hblk = NULL; 8072 while (hmeblkp) { 8073 nx_hblk = hmeblkp->hblk_next; 8074 nx_pa = hmeblkp->hblk_nextpa; 8075 if (!hmeblkp->hblk_vcnt && 8076 !hmeblkp->hblk_hmecnt) { 8077 sfmmu_hblk_hash_rm(hmebp, hmeblkp, 8078 prevpa, pr_hblk); 8079 sfmmu_hblk_free(hmebp, hmeblkp, 8080 hblkpa, &list); 8081 } else { 8082 pr_hblk = hmeblkp; 8083 prevpa = hblkpa; 8084 } 8085 hmeblkp = nx_hblk; 8086 hblkpa = nx_pa; 8087 } 8088 SFMMU_HASH_UNLOCK(hmebp); 8089 } 8090 if (hmebp++ == &uhme_hash[UHMEHASH_SZ]) 8091 hmebp = uhme_hash; 8092 } 8093 8094 hmebp = khmehash_reclaim_hand; 8095 if (hmebp == NULL || hmebp > &khme_hash[KHMEHASH_SZ]) 8096 khmehash_reclaim_hand = hmebp = khme_hash; 8097 khmehash_reclaim_hand += KHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio; 8098 8099 for (i = KHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio; i; i--) { 8100 if (SFMMU_HASH_LOCK_TRYENTER(hmebp) != 0) { 8101 hmeblkp = hmebp->hmeblkp; 8102 hblkpa = hmebp->hmeh_nextpa; 8103 prevpa = 0; 8104 pr_hblk = NULL; 8105 while (hmeblkp) { 8106 nx_hblk = hmeblkp->hblk_next; 8107 nx_pa = hmeblkp->hblk_nextpa; 8108 if (!hmeblkp->hblk_vcnt && 8109 !hmeblkp->hblk_hmecnt) { 8110 sfmmu_hblk_hash_rm(hmebp, hmeblkp, 8111 prevpa, pr_hblk); 8112 sfmmu_hblk_free(hmebp, hmeblkp, 8113 hblkpa, &list); 8114 } else { 8115 pr_hblk = hmeblkp; 8116 prevpa = hblkpa; 8117 } 8118 hmeblkp = nx_hblk; 8119 hblkpa = nx_pa; 8120 } 8121 SFMMU_HASH_UNLOCK(hmebp); 8122 } 8123 if (hmebp++ == &khme_hash[KHMEHASH_SZ]) 8124 hmebp = khme_hash; 8125 } 8126 sfmmu_hblks_list_purge(&list); 8127 } 8128 8129 /* 8130 * sfmmu_get_ppvcolor should become a vm_machdep or hatop interface. 8131 * same goes for sfmmu_get_addrvcolor(). 8132 * 8133 * This function will return the virtual color for the specified page. The 8134 * virtual color corresponds to this page current mapping or its last mapping. 8135 * It is used by memory allocators to choose addresses with the correct 8136 * alignment so vac consistency is automatically maintained. If the page 8137 * has no color it returns -1. 8138 */ 8139 int 8140 sfmmu_get_ppvcolor(struct page *pp) 8141 { 8142 int color; 8143 8144 if (!(cache & CACHE_VAC) || PP_NEWPAGE(pp)) { 8145 return (-1); 8146 } 8147 color = PP_GET_VCOLOR(pp); 8148 ASSERT(color < mmu_btop(shm_alignment)); 8149 return (color); 8150 } 8151 8152 /* 8153 * This function will return the desired alignment for vac consistency 8154 * (vac color) given a virtual address. If no vac is present it returns -1. 8155 */ 8156 int 8157 sfmmu_get_addrvcolor(caddr_t vaddr) 8158 { 8159 if (cache & CACHE_VAC) { 8160 return (addr_to_vcolor(vaddr)); 8161 } else { 8162 return (-1); 8163 } 8164 8165 } 8166 8167 /* 8168 * Check for conflicts. 
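 *
 * (Illustrative aside, simplified formula: the virtual color of a mapping
 * is its cache-bin index within the shm_alignment span, roughly
 *
 *	vcolor = ((uintptr_t)vaddr & (shm_alignment - 1)) >> MMU_PAGESHIFT;
 *
 * so two mappings of one physical page conflict when these values differ.)
 *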
8169 * A conflict exists if the new and existent mappings do not match in 8170 * their "shm_alignment fields. If conflicts exist, the existant mappings 8171 * are flushed unless one of them is locked. If one of them is locked, then 8172 * the mappings are flushed and converted to non-cacheable mappings. 8173 */ 8174 static void 8175 sfmmu_vac_conflict(struct hat *hat, caddr_t addr, page_t *pp) 8176 { 8177 struct hat *tmphat; 8178 struct sf_hment *sfhmep, *tmphme = NULL; 8179 struct hme_blk *hmeblkp; 8180 int vcolor; 8181 tte_t tte; 8182 8183 ASSERT(sfmmu_mlist_held(pp)); 8184 ASSERT(!PP_ISNC(pp)); /* page better be cacheable */ 8185 8186 vcolor = addr_to_vcolor(addr); 8187 if (PP_NEWPAGE(pp)) { 8188 PP_SET_VCOLOR(pp, vcolor); 8189 return; 8190 } 8191 8192 if (PP_GET_VCOLOR(pp) == vcolor) { 8193 return; 8194 } 8195 8196 if (!PP_ISMAPPED(pp) && !PP_ISMAPPED_KPM(pp)) { 8197 /* 8198 * Previous user of page had a different color 8199 * but since there are no current users 8200 * we just flush the cache and change the color. 8201 */ 8202 SFMMU_STAT(sf_pgcolor_conflict); 8203 sfmmu_cache_flush(pp->p_pagenum, PP_GET_VCOLOR(pp)); 8204 PP_SET_VCOLOR(pp, vcolor); 8205 return; 8206 } 8207 8208 /* 8209 * If we get here we have a vac conflict with a current 8210 * mapping. VAC conflict policy is as follows. 8211 * - The default is to unload the other mappings unless: 8212 * - If we have a large mapping we uncache the page. 8213 * We need to uncache the rest of the large page too. 8214 * - If any of the mappings are locked we uncache the page. 8215 * - If the requested mapping is inconsistent 8216 * with another mapping and that mapping 8217 * is in the same address space we have to 8218 * make it non-cached. The default thing 8219 * to do is unload the inconsistent mapping 8220 * but if they are in the same address space 8221 * we run the risk of unmapping the pc or the 8222 * stack which we will use as we return to the user, 8223 * in which case we can then fault on the thing 8224 * we just unloaded and get into an infinite loop. 8225 */ 8226 if (PP_ISMAPPED_LARGE(pp)) { 8227 int sz; 8228 8229 /* 8230 * Existing mapping is for big pages. We don't unload 8231 * existing big mappings to satisfy new mappings. 8232 * Always convert all mappings to TNC. 8233 */ 8234 sz = fnd_mapping_sz(pp); 8235 pp = PP_GROUPLEADER(pp, sz); 8236 SFMMU_STAT_ADD(sf_uncache_conflict, TTEPAGES(sz)); 8237 sfmmu_page_cache_array(pp, HAT_TMPNC, CACHE_FLUSH, 8238 TTEPAGES(sz)); 8239 8240 return; 8241 } 8242 8243 /* 8244 * check if any mapping is in same as or if it is locked 8245 * since in that case we need to uncache. 8246 */ 8247 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = tmphme) { 8248 tmphme = sfhmep->hme_next; 8249 hmeblkp = sfmmu_hmetohblk(sfhmep); 8250 if (hmeblkp->hblk_xhat_bit) 8251 continue; 8252 tmphat = hblktosfmmu(hmeblkp); 8253 sfmmu_copytte(&sfhmep->hme_tte, &tte); 8254 ASSERT(TTE_IS_VALID(&tte)); 8255 if ((tmphat == hat) || hmeblkp->hblk_lckcnt) { 8256 /* 8257 * We have an uncache conflict 8258 */ 8259 SFMMU_STAT(sf_uncache_conflict); 8260 sfmmu_page_cache_array(pp, HAT_TMPNC, CACHE_FLUSH, 1); 8261 return; 8262 } 8263 } 8264 8265 /* 8266 * We have an unload conflict 8267 * We have already checked for LARGE mappings, therefore 8268 * the remaining mapping(s) must be TTE8K. 
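	 *
	 * Illustrative sketch (hypothetical helper, not in this file): the
	 * policy applied above and below boils down to
	 *
	 *	enum vac_action { VAC_RECOLOR, VAC_UNCACHE, VAC_UNLOAD };
	 *
	 *	static enum vac_action
	 *	vac_policy(int mapped, int large, int locked_or_same_as)
	 *	{
	 *		if (!mapped)
	 *			return (VAC_RECOLOR);
	 *		if (large || locked_or_same_as)
	 *			return (VAC_UNCACHE);
	 *		return (VAC_UNLOAD);
	 *	}
	 *
	 * where VAC_UNCACHE maps to sfmmu_page_cache_array(HAT_TMPNC) and
	 * VAC_UNLOAD to the sfmmu_pageunload() loop that follows.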
8269 */ 8270 SFMMU_STAT(sf_unload_conflict); 8271 8272 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = tmphme) { 8273 tmphme = sfhmep->hme_next; 8274 hmeblkp = sfmmu_hmetohblk(sfhmep); 8275 if (hmeblkp->hblk_xhat_bit) 8276 continue; 8277 (void) sfmmu_pageunload(pp, sfhmep, TTE8K); 8278 } 8279 8280 if (PP_ISMAPPED_KPM(pp)) 8281 sfmmu_kpm_vac_unload(pp, addr); 8282 8283 /* 8284 * Unloads only do TLB flushes so we need to flush the 8285 * cache here. 8286 */ 8287 sfmmu_cache_flush(pp->p_pagenum, PP_GET_VCOLOR(pp)); 8288 PP_SET_VCOLOR(pp, vcolor); 8289 } 8290 8291 /* 8292 * Whenever a mapping is unloaded and the page is in TNC state, 8293 * we see if the page can be made cacheable again. 'pp' is 8294 * the page that we just unloaded a mapping from, the size 8295 * of mapping that was unloaded is 'ottesz'. 8296 * Remark: 8297 * The recache policy for mpss pages can leave a performance problem 8298 * under the following circumstances: 8299 * . A large page in uncached mode has just been unmapped. 8300 * . All constituent pages are TNC due to a conflicting small mapping. 8301 * . There are many other, non conflicting, small mappings around for 8302 * a lot of the constituent pages. 8303 * . We're called w/ the "old" groupleader page and the old ottesz, 8304 * but this is irrelevant, since we're no more "PP_ISMAPPED_LARGE", so 8305 * we end up w/ TTE8K or npages == 1. 8306 * . We call tst_tnc w/ the old groupleader only, and if there is no 8307 * conflict, we re-cache only this page. 8308 * . All other small mappings are not checked and will be left in TNC mode. 8309 * The problem is not very serious because: 8310 * . mpss is actually only defined for heap and stack, so the probability 8311 * is not very high that a large page mapping exists in parallel to a small 8312 * one (this is possible, but seems to be bad programming style in the 8313 * appl). 8314 * . The problem gets a little bit more serious, when those TNC pages 8315 * have to be mapped into kernel space, e.g. for networking. 8316 * . When VAC alias conflicts occur in applications, this is regarded 8317 * as an application bug. So if kstat's show them, the appl should 8318 * be changed anyway. 8319 */ 8320 static void 8321 conv_tnc(page_t *pp, int ottesz) 8322 { 8323 int cursz, dosz; 8324 pgcnt_t curnpgs, dopgs; 8325 pgcnt_t pg64k; 8326 page_t *pp2; 8327 8328 /* 8329 * Determine how big a range we check for TNC and find 8330 * leader page. cursz is the size of the biggest 8331 * mapping that still exist on 'pp'. 8332 */ 8333 if (PP_ISMAPPED_LARGE(pp)) { 8334 cursz = fnd_mapping_sz(pp); 8335 } else { 8336 cursz = TTE8K; 8337 } 8338 8339 if (ottesz >= cursz) { 8340 dosz = ottesz; 8341 pp2 = pp; 8342 } else { 8343 dosz = cursz; 8344 pp2 = PP_GROUPLEADER(pp, dosz); 8345 } 8346 8347 pg64k = TTEPAGES(TTE64K); 8348 dopgs = TTEPAGES(dosz); 8349 8350 ASSERT(dopgs == 1 || ((dopgs & (pg64k - 1)) == 0)); 8351 8352 while (dopgs != 0) { 8353 curnpgs = TTEPAGES(cursz); 8354 if (tst_tnc(pp2, curnpgs)) { 8355 SFMMU_STAT_ADD(sf_recache, curnpgs); 8356 sfmmu_page_cache_array(pp2, HAT_CACHE, CACHE_NO_FLUSH, 8357 curnpgs); 8358 } 8359 8360 ASSERT(dopgs >= curnpgs); 8361 dopgs -= curnpgs; 8362 8363 if (dopgs == 0) { 8364 break; 8365 } 8366 8367 pp2 = PP_PAGENEXT_N(pp2, curnpgs); 8368 if (((dopgs & (pg64k - 1)) == 0) && PP_ISMAPPED_LARGE(pp2)) { 8369 cursz = fnd_mapping_sz(pp2); 8370 } else { 8371 cursz = TTE8K; 8372 } 8373 } 8374 } 8375 8376 /* 8377 * Returns 1 if page(s) can be converted from TNC to cacheable setting, 8378 * returns 0 otherwise. 
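 *
 * Illustrative sketch (hypothetical helper, simplified to one page and a
 * flat array of mapping addresses): a page may be re-cached only if every
 * remaining mapping resolves to a single virtual color, e.g.
 *
 *	static int
 *	colors_consistent(caddr_t *vaddrs, int cnt)
 *	{
 *		int i, c0;
 *
 *		if (cnt == 0)
 *			return (1);
 *		c0 = addr_to_vcolor(vaddrs[0]);
 *		for (i = 1; i < cnt; i++)
 *			if (addr_to_vcolor(vaddrs[i]) != c0)
 *				return (0);
 *		return (1);
 *	}
 *
 * The real routine below additionally folds in the kpm mapping's color
 * and, for large pages, requires each constituent page's color to equal
 * its (index % number-of-colors) position within the large page.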
 */
static int
tst_tnc(page_t *pp, pgcnt_t npages)
{
	struct sf_hment	*sfhme;
	struct hme_blk	*hmeblkp;
	tte_t		tte;
	caddr_t		vaddr;
	int		clr_valid = 0;
	int		color, color1, bcolor;
	int		i, ncolors;

	ASSERT(pp != NULL);
	ASSERT(!(cache & CACHE_WRITEBACK));

	if (npages > 1) {
		ncolors = CACHE_NUM_COLOR;
	}

	for (i = 0; i < npages; i++) {
		ASSERT(sfmmu_mlist_held(pp));
		ASSERT(PP_ISTNC(pp));
		ASSERT(PP_GET_VCOLOR(pp) == NO_VCOLOR);

		if (PP_ISPNC(pp)) {
			return (0);
		}

		clr_valid = 0;
		if (PP_ISMAPPED_KPM(pp)) {
			caddr_t kpmvaddr;

			ASSERT(kpm_enable);
			kpmvaddr = hat_kpm_page2va(pp, 1);
			ASSERT(!(npages > 1 && IS_KPM_ALIAS_RANGE(kpmvaddr)));
			color1 = addr_to_vcolor(kpmvaddr);
			clr_valid = 1;
		}

		for (sfhme = pp->p_mapping; sfhme; sfhme = sfhme->hme_next) {
			hmeblkp = sfmmu_hmetohblk(sfhme);
			if (hmeblkp->hblk_xhat_bit)
				continue;

			sfmmu_copytte(&sfhme->hme_tte, &tte);
			ASSERT(TTE_IS_VALID(&tte));

			vaddr = tte_to_vaddr(hmeblkp, tte);
			color = addr_to_vcolor(vaddr);

			if (npages > 1) {
				/*
				 * If there is a big mapping, make sure
				 * 8K mapping is consistent with the big
				 * mapping.
				 */
				bcolor = i % ncolors;
				if (color != bcolor) {
					return (0);
				}
			}
			if (!clr_valid) {
				clr_valid = 1;
				color1 = color;
			}

			if (color1 != color) {
				return (0);
			}
		}

		pp = PP_PAGENEXT(pp);
	}

	return (1);
}

static void
sfmmu_page_cache_array(page_t *pp, int flags, int cache_flush_flag,
	pgcnt_t npages)
{
	kmutex_t *pmtx;
	int i, ncolors, bcolor;
	kpm_hlk_t *kpmp;
	cpuset_t cpuset;

	ASSERT(pp != NULL);
	ASSERT(!(cache & CACHE_WRITEBACK));

	kpmp = sfmmu_kpm_kpmp_enter(pp, npages);
	pmtx = sfmmu_page_enter(pp);

	/*
	 * Fast path caching single unmapped page
	 */
	if (npages == 1 && !PP_ISMAPPED(pp) && !PP_ISMAPPED_KPM(pp) &&
	    flags == HAT_CACHE) {
		PP_CLRTNC(pp);
		PP_CLRPNC(pp);
		sfmmu_page_exit(pmtx);
		sfmmu_kpm_kpmp_exit(kpmp);
		return;
	}

	/*
	 * We need to capture all cpus in order to change cacheability
	 * because we can't allow one cpu to access the same physical
	 * page using a cacheable and a non-cacheable mapping at the same
	 * time.  Since we may end up walking the ism mapping list, we
	 * have to grab its lock now since we can't after all the
	 * cpus have been captured.
	 */
	sfmmu_hat_lock_all();
	mutex_enter(&ism_mlist_lock);
	kpreempt_disable();
	cpuset = cpu_ready_set;
	xc_attention(cpuset);

	if (npages > 1) {
		/*
		 * Make sure all colors are flushed since the
		 * sfmmu_page_cache() only flushes one color -
		 * it does not know big pages.
		 */
		ncolors = CACHE_NUM_COLOR;
		if (flags & HAT_TMPNC) {
			for (i = 0; i < ncolors; i++) {
				sfmmu_cache_flushcolor(i, pp->p_pagenum);
			}
			cache_flush_flag = CACHE_NO_FLUSH;
		}
	}

	for (i = 0; i < npages; i++) {

		ASSERT(sfmmu_mlist_held(pp));

		if (!(flags == HAT_TMPNC && PP_ISTNC(pp))) {

			if (npages > 1) {
				bcolor = i % ncolors;
			} else {
				bcolor = NO_VCOLOR;
			}

			sfmmu_page_cache(pp, flags, cache_flush_flag,
			    bcolor);
		}

		pp = PP_PAGENEXT(pp);
	}

	xt_sync(cpuset);
	xc_dismissed(cpuset);
	mutex_exit(&ism_mlist_lock);
	sfmmu_hat_unlock_all();
	sfmmu_page_exit(pmtx);
	sfmmu_kpm_kpmp_exit(kpmp);
	kpreempt_enable();
}

/*
 * This function changes the virtual cacheability of all mappings to a
 * particular page.  When changing from uncache to cacheable, the mappings
 * will only be changed if all of them have the same virtual color.
 * We need to flush the cache on all cpus.  It is possible that
 * a process referenced a page as cacheable but has since exited
 * and cleared the mapping list.  We still need to flush it but have no
 * state, so flushing on all cpus is the only alternative.
 */
static void
sfmmu_page_cache(page_t *pp, int flags, int cache_flush_flag, int bcolor)
{
	struct sf_hment	*sfhme;
	struct hme_blk	*hmeblkp;
	sfmmu_t		*sfmmup;
	tte_t		tte, ttemod;
	caddr_t		vaddr;
	int		ret, color;
	pfn_t		pfn;

	color = bcolor;
	pfn = pp->p_pagenum;

	for (sfhme = pp->p_mapping; sfhme; sfhme = sfhme->hme_next) {

		hmeblkp = sfmmu_hmetohblk(sfhme);

		if (hmeblkp->hblk_xhat_bit)
			continue;

		sfmmu_copytte(&sfhme->hme_tte, &tte);
		ASSERT(TTE_IS_VALID(&tte));
		vaddr = tte_to_vaddr(hmeblkp, tte);
		color = addr_to_vcolor(vaddr);

#ifdef DEBUG
		if ((flags & HAT_CACHE) && bcolor != NO_VCOLOR) {
			ASSERT(color == bcolor);
		}
#endif

		ASSERT(flags != HAT_TMPNC || color == PP_GET_VCOLOR(pp));

		ttemod = tte;
		if (flags & (HAT_UNCACHE | HAT_TMPNC)) {
			TTE_CLR_VCACHEABLE(&ttemod);
		} else {	/* flags & HAT_CACHE */
			TTE_SET_VCACHEABLE(&ttemod);
		}
		ret = sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte);
		if (ret < 0) {
			/*
			 * Since all cpus are captured modifytte should not
			 * fail.
			 */
			panic("sfmmu_page_cache: write to tte failed");
		}

		sfmmup = hblktosfmmu(hmeblkp);
		if (cache_flush_flag == CACHE_FLUSH) {
			/*
			 * Flush TSBs, TLBs and caches
			 */
			if (sfmmup->sfmmu_ismhat) {
				if (flags & HAT_CACHE) {
					SFMMU_STAT(sf_ism_recache);
				} else {
					SFMMU_STAT(sf_ism_uncache);
				}
				sfmmu_ismtlbcache_demap(vaddr, sfmmup, hmeblkp,
				    pfn, CACHE_FLUSH);
			} else {
				sfmmu_tlbcache_demap(vaddr, sfmmup, hmeblkp,
				    pfn, 0, FLUSH_ALL_CPUS, CACHE_FLUSH, 1);
			}

			/*
			 * all cache entries belonging to this pfn are
			 * now flushed.
			 */
			cache_flush_flag = CACHE_NO_FLUSH;
		} else {

			/*
			 * Flush only TSBs and TLBs.
8626 */ 8627 if (sfmmup->sfmmu_ismhat) { 8628 if (flags & HAT_CACHE) { 8629 SFMMU_STAT(sf_ism_recache); 8630 } else { 8631 SFMMU_STAT(sf_ism_uncache); 8632 } 8633 sfmmu_ismtlbcache_demap(vaddr, sfmmup, hmeblkp, 8634 pfn, CACHE_NO_FLUSH); 8635 } else { 8636 sfmmu_tlb_demap(vaddr, sfmmup, hmeblkp, 0, 1); 8637 } 8638 } 8639 } 8640 8641 if (PP_ISMAPPED_KPM(pp)) 8642 sfmmu_kpm_page_cache(pp, flags, cache_flush_flag); 8643 8644 switch (flags) { 8645 8646 default: 8647 panic("sfmmu_pagecache: unknown flags"); 8648 break; 8649 8650 case HAT_CACHE: 8651 PP_CLRTNC(pp); 8652 PP_CLRPNC(pp); 8653 PP_SET_VCOLOR(pp, color); 8654 break; 8655 8656 case HAT_TMPNC: 8657 PP_SETTNC(pp); 8658 PP_SET_VCOLOR(pp, NO_VCOLOR); 8659 break; 8660 8661 case HAT_UNCACHE: 8662 PP_SETPNC(pp); 8663 PP_CLRTNC(pp); 8664 PP_SET_VCOLOR(pp, NO_VCOLOR); 8665 break; 8666 } 8667 } 8668 8669 8670 /* 8671 * Wrapper routine used to return a context. 8672 * 8673 * It's the responsibility of the caller to guarantee that the 8674 * process serializes on calls here by taking the HAT lock for 8675 * the hat. 8676 * 8677 */ 8678 static void 8679 sfmmu_get_ctx(sfmmu_t *sfmmup) 8680 { 8681 mmu_ctx_t *mmu_ctxp; 8682 uint_t pstate_save; 8683 8684 ASSERT(sfmmu_hat_lock_held(sfmmup)); 8685 ASSERT(sfmmup != ksfmmup); 8686 8687 kpreempt_disable(); 8688 8689 mmu_ctxp = CPU_MMU_CTXP(CPU); 8690 ASSERT(mmu_ctxp); 8691 ASSERT(mmu_ctxp->mmu_idx < max_mmu_ctxdoms); 8692 ASSERT(mmu_ctxp == mmu_ctxs_tbl[mmu_ctxp->mmu_idx]); 8693 8694 /* 8695 * Do a wrap-around if cnum reaches the max # cnum supported by a MMU. 8696 */ 8697 if (mmu_ctxp->mmu_cnum == mmu_ctxp->mmu_nctxs) 8698 sfmmu_ctx_wrap_around(mmu_ctxp); 8699 8700 /* 8701 * Let the MMU set up the page sizes to use for 8702 * this context in the TLB. Don't program 2nd dtlb for ism hat. 8703 */ 8704 if ((&mmu_set_ctx_page_sizes) && (sfmmup->sfmmu_ismhat == 0)) { 8705 mmu_set_ctx_page_sizes(sfmmup); 8706 } 8707 8708 /* 8709 * sfmmu_alloc_ctx and sfmmu_load_mmustate will be performed with 8710 * interrupts disabled to prevent race condition with wrap-around 8711 * ctx invalidatation. In sun4v, ctx invalidation also involves 8712 * a HV call to set the number of TSBs to 0. If interrupts are not 8713 * disabled until after sfmmu_load_mmustate is complete TSBs may 8714 * become assigned to INVALID_CONTEXT. This is not allowed. 8715 */ 8716 pstate_save = sfmmu_disable_intrs(); 8717 8718 sfmmu_alloc_ctx(sfmmup, 1, CPU); 8719 sfmmu_load_mmustate(sfmmup); 8720 8721 sfmmu_enable_intrs(pstate_save); 8722 8723 kpreempt_enable(); 8724 } 8725 8726 /* 8727 * When all cnums are used up in a MMU, cnum will wrap around to the 8728 * next generation and start from 2. 
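 *
 * Illustrative sketch (condensed restatement of the steps below;
 * cross_call_invalidate_contexts() and flush_user_tlb_entries() are
 * hypothetical stand-ins for the xt_some()/vtag_flushall*() calls):
 *
 *	mutex_enter(&mmu_ctxp->mmu_lock);
 *	if (mmu_ctxp->mmu_cnum >= mmu_ctxp->mmu_nctxs) {
 *		mmu_ctxp->mmu_gnum++;
 *		cross_call_invalidate_contexts();
 *		flush_user_tlb_entries();
 *		mmu_ctxp->mmu_cnum = NUM_LOCKED_CTXS;
 *	}
 *	mutex_exit(&mmu_ctxp->mmu_lock);
 *
 * Bumping mmu_gnum is what makes every previously handed out context
 * stale; the cnum space itself is simply reused for the new generation.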
 */
static void
sfmmu_ctx_wrap_around(mmu_ctx_t *mmu_ctxp)
{

	/* caller must have disabled the preemption */
	ASSERT(curthread->t_preempt >= 1);
	ASSERT(mmu_ctxp != NULL);

	/* acquire Per-MMU (PM) spin lock */
	mutex_enter(&mmu_ctxp->mmu_lock);

	/* re-check to see if wrap-around is needed */
	if (mmu_ctxp->mmu_cnum < mmu_ctxp->mmu_nctxs)
		goto done;

	SFMMU_MMU_STAT(mmu_wrap_around);

	/* update gnum */
	ASSERT(mmu_ctxp->mmu_gnum != 0);
	mmu_ctxp->mmu_gnum++;
	if (mmu_ctxp->mmu_gnum == 0 ||
	    mmu_ctxp->mmu_gnum > MAX_SFMMU_GNUM_VAL) {
		cmn_err(CE_PANIC, "mmu_gnum of mmu_ctx 0x%p is out of bound.",
		    (void *)mmu_ctxp);
	}

	if (mmu_ctxp->mmu_ncpus > 1) {
		cpuset_t cpuset;

		membar_enter(); /* make sure updated gnum visible */

		SFMMU_XCALL_STATS(NULL);

		/* xcall to others on the same MMU to invalidate ctx */
		cpuset = mmu_ctxp->mmu_cpuset;
		ASSERT(CPU_IN_SET(cpuset, CPU->cpu_id));
		CPUSET_DEL(cpuset, CPU->cpu_id);
		CPUSET_AND(cpuset, cpu_ready_set);

		/*
		 * Pass in INVALID_CONTEXT as the first parameter to
		 * sfmmu_raise_tsb_exception, which invalidates the context
		 * of any process running on the CPUs in the MMU.
		 */
		xt_some(cpuset, sfmmu_raise_tsb_exception,
		    INVALID_CONTEXT, INVALID_CONTEXT);
		xt_sync(cpuset);

		SFMMU_MMU_STAT(mmu_tsb_raise_exception);
	}

	if (sfmmu_getctx_sec() != INVALID_CONTEXT) {
		sfmmu_setctx_sec(INVALID_CONTEXT);
		sfmmu_clear_utsbinfo();
	}

	/*
	 * No xcall is needed here.  For sun4u systems all CPUs in a context
	 * domain share a single physical MMU, therefore it's enough to flush
	 * the TLB on the local CPU.  On sun4v systems we use 1 global context
	 * domain and flush all remote TLBs in the sfmmu_raise_tsb_exception
	 * handler.  Note that vtag_flushall_uctxs() is called
	 * for Ultra II machines, where the equivalent flushall functionality
	 * is implemented in SW, and only user ctx TLB entries are flushed.
	 */
	if (&vtag_flushall_uctxs != NULL) {
		vtag_flushall_uctxs();
	} else {
		vtag_flushall();
	}

	/* reset mmu cnum, skipping cnum 0 and 1 */
	mmu_ctxp->mmu_cnum = NUM_LOCKED_CTXS;

done:
	mutex_exit(&mmu_ctxp->mmu_lock);
}


/*
 * For a multi-threaded process, set the process context to INVALID_CONTEXT
 * so that it faults and reloads the MMU state from TL=0.  For a
 * single-threaded process, we can just load the MMU state directly
 * without having to set the context invalid.  The caller must hold the
 * hat lock since we don't acquire it here.
 */
static void
sfmmu_sync_mmustate(sfmmu_t *sfmmup)
{
	uint_t cnum;
	uint_t pstate_save;

	ASSERT(sfmmup != ksfmmup);
	ASSERT(sfmmu_hat_lock_held(sfmmup));

	kpreempt_disable();

	/*
	 * We check whether the passed-in sfmmup is the same as the
	 * current running proc.  This is to make sure the current proc
	 * stays single-threaded if it already is.
	 */
	if ((sfmmup == curthread->t_procp->p_as->a_hat) &&
	    (curthread->t_procp->p_lwpcnt == 1)) {
		/* single-thread */
		cnum = sfmmup->sfmmu_ctxs[CPU_MMU_IDX(CPU)].cnum;
		if (cnum != INVALID_CONTEXT) {
			uint_t curcnum;
			/*
			 * Disable interrupts to prevent race condition
			 * with sfmmu_ctx_wrap_around ctx invalidation.
8841 * In sun4v, ctx invalidation involves setting 8842 * TSB to NULL, hence, interrupts should be disabled 8843 * untill after sfmmu_load_mmustate is completed. 8844 */ 8845 pstate_save = sfmmu_disable_intrs(); 8846 curcnum = sfmmu_getctx_sec(); 8847 if (curcnum == cnum) 8848 sfmmu_load_mmustate(sfmmup); 8849 sfmmu_enable_intrs(pstate_save); 8850 ASSERT(curcnum == cnum || curcnum == INVALID_CONTEXT); 8851 } 8852 } else { 8853 /* 8854 * multi-thread 8855 * or when sfmmup is not the same as the curproc. 8856 */ 8857 sfmmu_invalidate_ctx(sfmmup); 8858 } 8859 8860 kpreempt_enable(); 8861 } 8862 8863 8864 /* 8865 * Replace the specified TSB with a new TSB. This function gets called when 8866 * we grow, shrink or swapin a TSB. When swapping in a TSB (TSB_SWAPIN), the 8867 * TSB_FORCEALLOC flag may be used to force allocation of a minimum-sized TSB 8868 * (8K). 8869 * 8870 * Caller must hold the HAT lock, but should assume any tsb_info 8871 * pointers it has are no longer valid after calling this function. 8872 * 8873 * Return values: 8874 * TSB_ALLOCFAIL Failed to allocate a TSB, due to memory constraints 8875 * TSB_LOSTRACE HAT is busy, i.e. another thread is already doing 8876 * something to this tsbinfo/TSB 8877 * TSB_SUCCESS Operation succeeded 8878 */ 8879 static tsb_replace_rc_t 8880 sfmmu_replace_tsb(sfmmu_t *sfmmup, struct tsb_info *old_tsbinfo, uint_t szc, 8881 hatlock_t *hatlockp, uint_t flags) 8882 { 8883 struct tsb_info *new_tsbinfo = NULL; 8884 struct tsb_info *curtsb, *prevtsb; 8885 uint_t tte_sz_mask; 8886 int i; 8887 8888 ASSERT(sfmmup != ksfmmup); 8889 ASSERT(sfmmup->sfmmu_ismhat == 0); 8890 ASSERT(sfmmu_hat_lock_held(sfmmup)); 8891 ASSERT(szc <= tsb_max_growsize); 8892 8893 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_BUSY)) 8894 return (TSB_LOSTRACE); 8895 8896 /* 8897 * Find the tsb_info ahead of this one in the list, and 8898 * also make sure that the tsb_info passed in really 8899 * exists! 8900 */ 8901 for (prevtsb = NULL, curtsb = sfmmup->sfmmu_tsb; 8902 curtsb != old_tsbinfo && curtsb != NULL; 8903 prevtsb = curtsb, curtsb = curtsb->tsb_next); 8904 ASSERT(curtsb != NULL); 8905 8906 if (!(flags & TSB_SWAPIN) && SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) { 8907 /* 8908 * The process is swapped out, so just set the new size 8909 * code. When it swaps back in, we'll allocate a new one 8910 * of the new chosen size. 8911 */ 8912 curtsb->tsb_szc = szc; 8913 return (TSB_SUCCESS); 8914 } 8915 SFMMU_FLAGS_SET(sfmmup, HAT_BUSY); 8916 8917 tte_sz_mask = old_tsbinfo->tsb_ttesz_mask; 8918 8919 /* 8920 * All initialization is done inside of sfmmu_tsbinfo_alloc(). 8921 * If we fail to allocate a TSB, exit. 8922 */ 8923 sfmmu_hat_exit(hatlockp); 8924 if (sfmmu_tsbinfo_alloc(&new_tsbinfo, szc, tte_sz_mask, 8925 flags, sfmmup)) { 8926 (void) sfmmu_hat_enter(sfmmup); 8927 if (!(flags & TSB_SWAPIN)) 8928 SFMMU_STAT(sf_tsb_resize_failures); 8929 SFMMU_FLAGS_CLEAR(sfmmup, HAT_BUSY); 8930 return (TSB_ALLOCFAIL); 8931 } 8932 (void) sfmmu_hat_enter(sfmmup); 8933 8934 /* 8935 * Re-check to make sure somebody else didn't muck with us while we 8936 * didn't hold the HAT lock. If the process swapped out, fine, just 8937 * exit; this can happen if we try to shrink the TSB from the context 8938 * of another process (such as on an ISM unmap), though it is rare. 
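	 *
	 * Illustrative sketch (generic idiom, hypothetical names): the
	 * shape of the dance is drop the lock, allocate (possibly
	 * sleeping), reacquire, then revalidate and undo if the world
	 * changed underneath us:
	 *
	 *	mutex_exit(lock);
	 *	obj = alloc_sleeping();
	 *	mutex_enter(lock);
	 *	if (state_changed()) {
	 *		free_obj(obj);
	 *		return (TSB_LOSTRACE);
	 *	}
	 *	install(obj);
	 *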
8939 */ 8940 if (!(flags & TSB_SWAPIN) && SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) { 8941 SFMMU_STAT(sf_tsb_resize_failures); 8942 SFMMU_FLAGS_CLEAR(sfmmup, HAT_BUSY); 8943 sfmmu_hat_exit(hatlockp); 8944 sfmmu_tsbinfo_free(new_tsbinfo); 8945 (void) sfmmu_hat_enter(sfmmup); 8946 return (TSB_LOSTRACE); 8947 } 8948 8949 #ifdef DEBUG 8950 /* Reverify that the tsb_info still exists.. for debugging only */ 8951 for (prevtsb = NULL, curtsb = sfmmup->sfmmu_tsb; 8952 curtsb != old_tsbinfo && curtsb != NULL; 8953 prevtsb = curtsb, curtsb = curtsb->tsb_next); 8954 ASSERT(curtsb != NULL); 8955 #endif /* DEBUG */ 8956 8957 /* 8958 * Quiesce any CPUs running this process on their next TLB miss 8959 * so they atomically see the new tsb_info. We temporarily set the 8960 * context to invalid context so new threads that come on processor 8961 * after we do the xcall to cpusran will also serialize behind the 8962 * HAT lock on TLB miss and will see the new TSB. Since this short 8963 * race with a new thread coming on processor is relatively rare, 8964 * this synchronization mechanism should be cheaper than always 8965 * pausing all CPUs for the duration of the setup, which is what 8966 * the old implementation did. This is particuarly true if we are 8967 * copying a huge chunk of memory around during that window. 8968 * 8969 * The memory barriers are to make sure things stay consistent 8970 * with resume() since it does not hold the HAT lock while 8971 * walking the list of tsb_info structures. 8972 */ 8973 if ((flags & TSB_SWAPIN) != TSB_SWAPIN) { 8974 /* The TSB is either growing or shrinking. */ 8975 sfmmu_invalidate_ctx(sfmmup); 8976 } else { 8977 /* 8978 * It is illegal to swap in TSBs from a process other 8979 * than a process being swapped in. This in turn 8980 * implies we do not have a valid MMU context here 8981 * since a process needs one to resolve translation 8982 * misses. 8983 */ 8984 ASSERT(curthread->t_procp->p_as->a_hat == sfmmup); 8985 } 8986 8987 #ifdef DEBUG 8988 ASSERT(max_mmu_ctxdoms > 0); 8989 8990 /* 8991 * Process should have INVALID_CONTEXT on all MMUs 8992 */ 8993 for (i = 0; i < max_mmu_ctxdoms; i++) { 8994 8995 ASSERT(sfmmup->sfmmu_ctxs[i].cnum == INVALID_CONTEXT); 8996 } 8997 #endif 8998 8999 new_tsbinfo->tsb_next = old_tsbinfo->tsb_next; 9000 membar_stst(); /* strict ordering required */ 9001 if (prevtsb) 9002 prevtsb->tsb_next = new_tsbinfo; 9003 else 9004 sfmmup->sfmmu_tsb = new_tsbinfo; 9005 membar_enter(); /* make sure new TSB globally visible */ 9006 sfmmu_setup_tsbinfo(sfmmup); 9007 9008 /* 9009 * We need to migrate TSB entries from the old TSB to the new TSB 9010 * if tsb_remap_ttes is set and the TSB is growing. 9011 */ 9012 if (tsb_remap_ttes && ((flags & TSB_GROW) == TSB_GROW)) 9013 sfmmu_copy_tsb(old_tsbinfo, new_tsbinfo); 9014 9015 SFMMU_FLAGS_CLEAR(sfmmup, HAT_BUSY); 9016 9017 /* 9018 * Drop the HAT lock to free our old tsb_info. 9019 */ 9020 sfmmu_hat_exit(hatlockp); 9021 9022 if ((flags & TSB_GROW) == TSB_GROW) { 9023 SFMMU_STAT(sf_tsb_grow); 9024 } else if ((flags & TSB_SHRINK) == TSB_SHRINK) { 9025 SFMMU_STAT(sf_tsb_shrink); 9026 } 9027 9028 sfmmu_tsbinfo_free(old_tsbinfo); 9029 9030 (void) sfmmu_hat_enter(sfmmup); 9031 return (TSB_SUCCESS); 9032 } 9033 9034 /* 9035 * This function will re-program hat pgsz array, and invalidate the 9036 * process' context, forcing the process to switch to another 9037 * context on the next TLB miss, and therefore start using the 9038 * TLB that is reprogrammed for the new page sizes. 
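 *
 * Illustrative sketch (hypothetical caller; compute_preferred_pgsz() is a
 * stand-in for platform policy code such as the mmu_check_page_sizes()
 * hook):
 *
 *	uint8_t new_pgsz[MMU_PAGE_SIZES];
 *
 *	compute_preferred_pgsz(sfmmup, new_pgsz);
 *	sfmmu_reprog_pgsz_arr(sfmmup, new_pgsz);
 *
 * No TLB entries are touched directly; invalidating the context is enough
 * to make every thread of the process pick up the new programming on its
 * next TLB miss.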
9039 */ 9040 void 9041 sfmmu_reprog_pgsz_arr(sfmmu_t *sfmmup, uint8_t *tmp_pgsz) 9042 { 9043 int i; 9044 hatlock_t *hatlockp = NULL; 9045 9046 hatlockp = sfmmu_hat_enter(sfmmup); 9047 /* USIII+-IV+ optimization, requires hat lock */ 9048 if (tmp_pgsz) { 9049 for (i = 0; i < mmu_page_sizes; i++) 9050 sfmmup->sfmmu_pgsz[i] = tmp_pgsz[i]; 9051 } 9052 SFMMU_STAT(sf_tlb_reprog_pgsz); 9053 9054 sfmmu_invalidate_ctx(sfmmup); 9055 9056 sfmmu_hat_exit(hatlockp); 9057 } 9058 9059 /* 9060 * This function assumes that there are either four or six supported page 9061 * sizes and at most two programmable TLBs, so we need to decide which 9062 * page sizes are most important and then tell the MMU layer so it 9063 * can adjust the TLB page sizes accordingly (if supported). 9064 * 9065 * If these assumptions change, this function will need to be 9066 * updated to support whatever the new limits are. 9067 * 9068 * The growing flag is nonzero if we are growing the address space, 9069 * and zero if it is shrinking. This allows us to decide whether 9070 * to grow or shrink our TSB, depending upon available memory 9071 * conditions. 9072 */ 9073 static void 9074 sfmmu_check_page_sizes(sfmmu_t *sfmmup, int growing) 9075 { 9076 uint64_t ttecnt[MMU_PAGE_SIZES]; 9077 uint64_t tte8k_cnt, tte4m_cnt; 9078 uint8_t i; 9079 int sectsb_thresh; 9080 9081 /* 9082 * Kernel threads, processes with small address spaces not using 9083 * large pages, and dummy ISM HATs need not apply. 9084 */ 9085 if (sfmmup == ksfmmup || sfmmup->sfmmu_ismhat != NULL) 9086 return; 9087 9088 if ((sfmmup->sfmmu_flags & HAT_LGPG_FLAGS) == 0 && 9089 sfmmup->sfmmu_ttecnt[TTE8K] <= tsb_rss_factor) 9090 return; 9091 9092 for (i = 0; i < mmu_page_sizes; i++) { 9093 ttecnt[i] = SFMMU_TTE_CNT(sfmmup, i); 9094 } 9095 9096 /* Check pagesizes in use, and possibly reprogram DTLB. */ 9097 if (&mmu_check_page_sizes) 9098 mmu_check_page_sizes(sfmmup, ttecnt); 9099 9100 /* 9101 * Calculate the number of 8k ttes to represent the span of these 9102 * pages. 9103 */ 9104 tte8k_cnt = ttecnt[TTE8K] + 9105 (ttecnt[TTE64K] << (MMU_PAGESHIFT64K - MMU_PAGESHIFT)) + 9106 (ttecnt[TTE512K] << (MMU_PAGESHIFT512K - MMU_PAGESHIFT)); 9107 if (mmu_page_sizes == max_mmu_page_sizes) { 9108 tte4m_cnt = ttecnt[TTE4M] + 9109 (ttecnt[TTE32M] << (MMU_PAGESHIFT32M - MMU_PAGESHIFT4M)) + 9110 (ttecnt[TTE256M] << (MMU_PAGESHIFT256M - MMU_PAGESHIFT4M)); 9111 } else { 9112 tte4m_cnt = ttecnt[TTE4M]; 9113 } 9114 9115 /* 9116 * Inflate TSB sizes by a factor of 2 if this process 9117 * uses 4M text pages to minimize extra conflict misses 9118 * in the first TSB since without counting text pages 9119 * 8K TSB may become too small. 9120 * 9121 * Also double the size of the second TSB to minimize 9122 * extra conflict misses due to competition between 4M text pages 9123 * and data pages. 9124 * 9125 * We need to adjust the second TSB allocation threshold by the 9126 * inflation factor, since there is no point in creating a second 9127 * TSB when we know all the mappings can fit in the I/D TLBs. 9128 */ 9129 sectsb_thresh = tsb_sectsb_threshold; 9130 if (sfmmup->sfmmu_flags & HAT_4MTEXT_FLAG) { 9131 tte8k_cnt <<= 1; 9132 tte4m_cnt <<= 1; 9133 sectsb_thresh <<= 1; 9134 } 9135 9136 /* 9137 * Check to see if our TSB is the right size; we may need to 9138 * grow or shrink it. If the process is small, our work is 9139 * finished at this point. 
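	 *
	 * Worked example (illustrative numbers, standard sun4u shifts of
	 * 13 for 8K and 16 for 64K pages): with ttecnt[TTE8K] == 1000,
	 * ttecnt[TTE64K] == 10 and ttecnt[TTE4M] == 3,
	 *
	 *	tte8k_cnt = 1000 + (10 << (16 - 13)) = 1080
	 *	tte4m_cnt = 3
	 *
	 * and with HAT_4MTEXT_FLAG set these double to 2160 and 6, so the
	 * comparison below is effectively against an 8K-equivalent RSS.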
9140 */ 9141 if (tte8k_cnt <= tsb_rss_factor && tte4m_cnt <= sectsb_thresh) { 9142 return; 9143 } 9144 sfmmu_size_tsb(sfmmup, growing, tte8k_cnt, tte4m_cnt, sectsb_thresh); 9145 } 9146 9147 static void 9148 sfmmu_size_tsb(sfmmu_t *sfmmup, int growing, uint64_t tte8k_cnt, 9149 uint64_t tte4m_cnt, int sectsb_thresh) 9150 { 9151 int tsb_bits; 9152 uint_t tsb_szc; 9153 struct tsb_info *tsbinfop; 9154 hatlock_t *hatlockp = NULL; 9155 9156 hatlockp = sfmmu_hat_enter(sfmmup); 9157 ASSERT(hatlockp != NULL); 9158 tsbinfop = sfmmup->sfmmu_tsb; 9159 ASSERT(tsbinfop != NULL); 9160 9161 /* 9162 * If we're growing, select the size based on RSS. If we're 9163 * shrinking, leave some room so we don't have to turn around and 9164 * grow again immediately. 9165 */ 9166 if (growing) 9167 tsb_szc = SELECT_TSB_SIZECODE(tte8k_cnt); 9168 else 9169 tsb_szc = SELECT_TSB_SIZECODE(tte8k_cnt << 1); 9170 9171 if (!growing && (tsb_szc < tsbinfop->tsb_szc) && 9172 (tsb_szc >= default_tsb_size) && TSB_OK_SHRINK()) { 9173 (void) sfmmu_replace_tsb(sfmmup, tsbinfop, tsb_szc, 9174 hatlockp, TSB_SHRINK); 9175 } else if (growing && tsb_szc > tsbinfop->tsb_szc && TSB_OK_GROW()) { 9176 (void) sfmmu_replace_tsb(sfmmup, tsbinfop, tsb_szc, 9177 hatlockp, TSB_GROW); 9178 } 9179 tsbinfop = sfmmup->sfmmu_tsb; 9180 9181 /* 9182 * With the TLB and first TSB out of the way, we need to see if 9183 * we need a second TSB for 4M pages. If we managed to reprogram 9184 * the TLB page sizes above, the process will start using this new 9185 * TSB right away; otherwise, it will start using it on the next 9186 * context switch. Either way, it's no big deal so there's no 9187 * synchronization with the trap handlers here unless we grow the 9188 * TSB (in which case it's required to prevent using the old one 9189 * after it's freed). Note: second tsb is required for 32M/256M 9190 * page sizes. 9191 */ 9192 if (tte4m_cnt > sectsb_thresh) { 9193 /* 9194 * If we're growing, select the size based on RSS. If we're 9195 * shrinking, leave some room so we don't have to turn 9196 * around and grow again immediately. 9197 */ 9198 if (growing) 9199 tsb_szc = SELECT_TSB_SIZECODE(tte4m_cnt); 9200 else 9201 tsb_szc = SELECT_TSB_SIZECODE(tte4m_cnt << 1); 9202 if (tsbinfop->tsb_next == NULL) { 9203 struct tsb_info *newtsb; 9204 int allocflags = SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)? 9205 0 : TSB_ALLOC; 9206 9207 sfmmu_hat_exit(hatlockp); 9208 9209 /* 9210 * Try to allocate a TSB for 4[32|256]M pages. If we 9211 * can't get the size we want, retry w/a minimum sized 9212 * TSB. If that still didn't work, give up; we can 9213 * still run without one. 9214 */ 9215 tsb_bits = (mmu_page_sizes == max_mmu_page_sizes)? 9216 TSB4M|TSB32M|TSB256M:TSB4M; 9217 if ((sfmmu_tsbinfo_alloc(&newtsb, tsb_szc, tsb_bits, 9218 allocflags, sfmmup) != 0) && 9219 (sfmmu_tsbinfo_alloc(&newtsb, TSB_MIN_SZCODE, 9220 tsb_bits, allocflags, sfmmup) != 0)) { 9221 return; 9222 } 9223 9224 hatlockp = sfmmu_hat_enter(sfmmup); 9225 9226 if (sfmmup->sfmmu_tsb->tsb_next == NULL) { 9227 sfmmup->sfmmu_tsb->tsb_next = newtsb; 9228 SFMMU_STAT(sf_tsb_sectsb_create); 9229 sfmmu_setup_tsbinfo(sfmmup); 9230 sfmmu_hat_exit(hatlockp); 9231 return; 9232 } else { 9233 /* 9234 * It's annoying, but possible for us 9235 * to get here.. we dropped the HAT lock 9236 * because of locking order in the kmem 9237 * allocator, and while we were off getting 9238 * our memory, some other thread decided to 9239 * do us a favor and won the race to get a 9240 * second TSB for this process. Sigh. 
9241 */ 9242 sfmmu_hat_exit(hatlockp); 9243 sfmmu_tsbinfo_free(newtsb); 9244 return; 9245 } 9246 } 9247 9248 /* 9249 * We have a second TSB, see if it's big enough. 9250 */ 9251 tsbinfop = tsbinfop->tsb_next; 9252 9253 /* 9254 * Check to see if our second TSB is the right size; 9255 * we may need to grow or shrink it. 9256 * To prevent thrashing (e.g. growing the TSB on a 9257 * subsequent map operation), only try to shrink if 9258 * the TSB reach exceeds twice the virtual address 9259 * space size. 9260 */ 9261 if (!growing && (tsb_szc < tsbinfop->tsb_szc) && 9262 (tsb_szc >= default_tsb_size) && TSB_OK_SHRINK()) { 9263 (void) sfmmu_replace_tsb(sfmmup, tsbinfop, 9264 tsb_szc, hatlockp, TSB_SHRINK); 9265 } else if (growing && tsb_szc > tsbinfop->tsb_szc && 9266 TSB_OK_GROW()) { 9267 (void) sfmmu_replace_tsb(sfmmup, tsbinfop, 9268 tsb_szc, hatlockp, TSB_GROW); 9269 } 9270 } 9271 9272 sfmmu_hat_exit(hatlockp); 9273 } 9274 9275 /* 9276 * Get the preferred page size code for a hat. 9277 * This is only advice, so locking is not done; 9278 * this transitory information could change 9279 * following the call anyway. This interface is 9280 * sun4 private. 9281 */ 9282 /*ARGSUSED*/ 9283 uint_t 9284 hat_preferred_pgsz(struct hat *hat, caddr_t vaddr, size_t maplen, int maptype) 9285 { 9286 sfmmu_t *sfmmup = (sfmmu_t *)hat; 9287 uint_t szc, maxszc = mmu_page_sizes - 1; 9288 size_t pgsz; 9289 9290 if (maptype == MAPPGSZ_ISM) { 9291 for (szc = maxszc; szc >= TTE4M; szc--) { 9292 if (disable_ism_large_pages & (1 << szc)) 9293 continue; 9294 9295 pgsz = hw_page_array[szc].hp_size; 9296 if ((maplen >= pgsz) && IS_P2ALIGNED(vaddr, pgsz)) 9297 return (szc); 9298 } 9299 return (TTE4M); 9300 } else if (&mmu_preferred_pgsz) { /* USIII+-USIV+ */ 9301 return (mmu_preferred_pgsz(sfmmup, vaddr, maplen)); 9302 } else { /* USIII, USII, Niagara */ 9303 for (szc = maxszc; szc > TTE8K; szc--) { 9304 if (disable_large_pages & (1 << szc)) 9305 continue; 9306 9307 pgsz = hw_page_array[szc].hp_size; 9308 if ((maplen >= pgsz) && IS_P2ALIGNED(vaddr, pgsz)) 9309 return (szc); 9310 } 9311 return (TTE8K); 9312 } 9313 } 9314 9315 /* 9316 * Free up a sfmmu 9317 * Since the sfmmu is currently embedded in the hat struct we simply zero 9318 * out our fields and free up the ism map blk list if any. 9319 */ 9320 static void 9321 sfmmu_free_sfmmu(sfmmu_t *sfmmup) 9322 { 9323 ism_blk_t *blkp, *nx_blkp; 9324 #ifdef DEBUG 9325 ism_map_t *map; 9326 int i; 9327 #endif 9328 9329 ASSERT(sfmmup->sfmmu_ttecnt[TTE8K] == 0); 9330 ASSERT(sfmmup->sfmmu_ttecnt[TTE64K] == 0); 9331 ASSERT(sfmmup->sfmmu_ttecnt[TTE512K] == 0); 9332 ASSERT(sfmmup->sfmmu_ttecnt[TTE4M] == 0); 9333 ASSERT(sfmmup->sfmmu_ttecnt[TTE32M] == 0); 9334 ASSERT(sfmmup->sfmmu_ttecnt[TTE256M] == 0); 9335 9336 sfmmup->sfmmu_free = 0; 9337 sfmmup->sfmmu_ismhat = 0; 9338 9339 blkp = sfmmup->sfmmu_iblk; 9340 sfmmup->sfmmu_iblk = NULL; 9341 9342 while (blkp) { 9343 #ifdef DEBUG 9344 map = blkp->iblk_maps; 9345 for (i = 0; i < ISM_MAP_SLOTS; i++) { 9346 ASSERT(map[i].imap_seg == 0); 9347 ASSERT(map[i].imap_ismhat == NULL); 9348 ASSERT(map[i].imap_ment == NULL); 9349 } 9350 #endif 9351 nx_blkp = blkp->iblk_next; 9352 blkp->iblk_next = NULL; 9353 blkp->iblk_nextpa = (uint64_t)-1; 9354 kmem_cache_free(ism_blk_cache, blkp); 9355 blkp = nx_blkp; 9356 } 9357 } 9358 9359 /* 9360 * Locking primitves accessed by HATLOCK macros 9361 */ 9362 9363 #define SFMMU_SPL_MTX (0x0) 9364 #define SFMMU_ML_MTX (0x1) 9365 9366 #define SFMMU_MLSPL_MTX(type, pg) (((type) == SFMMU_SPL_MTX) ? 
\ 9367 SPL_HASH(pg) : MLIST_HASH(pg)) 9368 9369 kmutex_t * 9370 sfmmu_page_enter(struct page *pp) 9371 { 9372 return (sfmmu_mlspl_enter(pp, SFMMU_SPL_MTX)); 9373 } 9374 9375 static void 9376 sfmmu_page_exit(kmutex_t *spl) 9377 { 9378 mutex_exit(spl); 9379 } 9380 9381 static int 9382 sfmmu_page_spl_held(struct page *pp) 9383 { 9384 return (sfmmu_mlspl_held(pp, SFMMU_SPL_MTX)); 9385 } 9386 9387 kmutex_t * 9388 sfmmu_mlist_enter(struct page *pp) 9389 { 9390 return (sfmmu_mlspl_enter(pp, SFMMU_ML_MTX)); 9391 } 9392 9393 void 9394 sfmmu_mlist_exit(kmutex_t *mml) 9395 { 9396 mutex_exit(mml); 9397 } 9398 9399 int 9400 sfmmu_mlist_held(struct page *pp) 9401 { 9402 9403 return (sfmmu_mlspl_held(pp, SFMMU_ML_MTX)); 9404 } 9405 9406 /* 9407 * Common code for sfmmu_mlist_enter() and sfmmu_page_enter(). For 9408 * sfmmu_mlist_enter() case mml_table lock array is used and for 9409 * sfmmu_page_enter() sfmmu_page_lock lock array is used. 9410 * 9411 * The lock is taken on a root page so that it protects an operation on all 9412 * constituent pages of a large page pp belongs to. 9413 * 9414 * The routine takes a lock from the appropriate array. The lock is determined 9415 * by hashing the root page. After taking the lock this routine checks if the 9416 * root page has the same size code that was used to determine the root (i.e 9417 * that root hasn't changed). If root page has the expected p_szc field we 9418 * have the right lock and it's returned to the caller. If root's p_szc 9419 * decreased we release the lock and retry from the beginning. This case can 9420 * happen due to hat_page_demote() decreasing p_szc between our load of p_szc 9421 * value and taking the lock. The number of retries due to p_szc decrease is 9422 * limited by the maximum p_szc value. If p_szc is 0 we return the lock 9423 * determined by hashing pp itself. 9424 * 9425 * If our caller doesn't hold a SE_SHARED or SE_EXCL lock on pp it's also 9426 * possible that p_szc can increase. To increase p_szc a thread has to lock 9427 * all constituent pages EXCL and do hat_pageunload() on all of them. All the 9428 * callers that don't hold a page locked recheck if hmeblk through which pp 9429 * was found still maps this pp. If it doesn't map it anymore returned lock 9430 * is immediately dropped. Therefore if sfmmu_mlspl_enter() hits the case of 9431 * p_szc increase after taking the lock it returns this lock without further 9432 * retries because in this case the caller doesn't care about which lock was 9433 * taken. The caller will drop it right away. 9434 * 9435 * After the routine returns it's guaranteed that hat_page_demote() can't 9436 * change p_szc field of any of constituent pages of a large page pp belongs 9437 * to as long as pp was either locked at least SHARED prior to this call or 9438 * the caller finds that hment that pointed to this pp still references this 9439 * pp (this also assumes that the caller holds hme hash bucket lock so that 9440 * the same pp can't be remapped into the same hmeblk after it was unmapped by 9441 * hat_pageunload()). 
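 *
 * Illustrative sketch (condensed; HASH_TO_MUTEX() and
 * root_szc_still_covers() are hypothetical stand-ins for the
 * SPL_HASH()/MLIST_HASH() selection and the p_szc comparisons spelled out
 * below): the retry protocol is essentially
 *
 *	for (;;) {
 *		rootpp = PP_GROUPLEADER(pp, pszc);
 *		mtx = HASH_TO_MUTEX(rootpp);
 *		mutex_enter(mtx);
 *		if (root_szc_still_covers(rootpp, pszc))
 *			return (mtx);
 *		mutex_exit(mtx);
 *		pszc = pp->p_szc;
 *	}
 *
 * The loop terminates because p_szc can only decrease a bounded number of
 * times (at most the maximum page size code).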
9442 */ 9443 static kmutex_t * 9444 sfmmu_mlspl_enter(struct page *pp, int type) 9445 { 9446 kmutex_t *mtx; 9447 uint_t prev_rszc = UINT_MAX; 9448 page_t *rootpp; 9449 uint_t szc; 9450 uint_t rszc; 9451 uint_t pszc = pp->p_szc; 9452 9453 ASSERT(pp != NULL); 9454 9455 again: 9456 if (pszc == 0) { 9457 mtx = SFMMU_MLSPL_MTX(type, pp); 9458 mutex_enter(mtx); 9459 return (mtx); 9460 } 9461 9462 /* The lock lives in the root page */ 9463 rootpp = PP_GROUPLEADER(pp, pszc); 9464 mtx = SFMMU_MLSPL_MTX(type, rootpp); 9465 mutex_enter(mtx); 9466 9467 /* 9468 * Return mml in the following 3 cases: 9469 * 9470 * 1) If pp itself is root since if its p_szc decreased before we took 9471 * the lock pp is still the root of smaller szc page. And if its p_szc 9472 * increased it doesn't matter what lock we return (see comment in 9473 * front of this routine). 9474 * 9475 * 2) If pp's not root but rootpp is the root of a rootpp->p_szc size 9476 * large page we have the right lock since any previous potential 9477 * hat_page_demote() is done demoting from greater than current root's 9478 * p_szc because hat_page_demote() changes root's p_szc last. No 9479 * further hat_page_demote() can start or be in progress since it 9480 * would need the same lock we currently hold. 9481 * 9482 * 3) If rootpp's p_szc increased since previous iteration it doesn't 9483 * matter what lock we return (see comment in front of this routine). 9484 */ 9485 if (pp == rootpp || (rszc = rootpp->p_szc) == pszc || 9486 rszc >= prev_rszc) { 9487 return (mtx); 9488 } 9489 9490 /* 9491 * hat_page_demote() could have decreased root's p_szc. 9492 * In this case pp's p_szc must also be smaller than pszc. 9493 * Retry. 9494 */ 9495 if (rszc < pszc) { 9496 szc = pp->p_szc; 9497 if (szc < pszc) { 9498 mutex_exit(mtx); 9499 pszc = szc; 9500 goto again; 9501 } 9502 /* 9503 * pp's p_szc increased after it was decreased. 9504 * page cannot be mapped. Return current lock. The caller 9505 * will drop it right away. 9506 */ 9507 return (mtx); 9508 } 9509 9510 /* 9511 * root's p_szc is greater than pp's p_szc. 9512 * hat_page_demote() is not done with all pages 9513 * yet. Wait for it to complete. 9514 */ 9515 mutex_exit(mtx); 9516 rootpp = PP_GROUPLEADER(rootpp, rszc); 9517 mtx = SFMMU_MLSPL_MTX(type, rootpp); 9518 mutex_enter(mtx); 9519 mutex_exit(mtx); 9520 prev_rszc = rszc; 9521 goto again; 9522 } 9523 9524 static int 9525 sfmmu_mlspl_held(struct page *pp, int type) 9526 { 9527 kmutex_t *mtx; 9528 9529 ASSERT(pp != NULL); 9530 /* The lock lives in the root page */ 9531 pp = PP_PAGEROOT(pp); 9532 ASSERT(pp != NULL); 9533 9534 mtx = SFMMU_MLSPL_MTX(type, pp); 9535 return (MUTEX_HELD(mtx)); 9536 } 9537 9538 static uint_t 9539 sfmmu_get_free_hblk(struct hme_blk **hmeblkpp, uint_t critical) 9540 { 9541 struct hme_blk *hblkp; 9542 9543 if (freehblkp != NULL) { 9544 mutex_enter(&freehblkp_lock); 9545 if (freehblkp != NULL) { 9546 /* 9547 * If the current thread is owning hblk_reserve, 9548 * let it succede even if freehblkcnt is really low. 
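			 *
			 * (Illustrative aside, generic idiom: the unlocked
			 * freehblkp test above is the usual double-checked
			 * pattern
			 *
			 *	if (ptr != NULL) {
			 *		mutex_enter(&lock);
			 *		if (ptr != NULL)
			 *			take_one(ptr);
			 *		mutex_exit(&lock);
			 *	}
			 *
			 * which is safe here because a stale read of
			 * freehblkp only costs a missed fast path, never
			 * correctness.)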
9549 */ 9550 if (freehblkcnt <= HBLK_RESERVE_MIN && !critical) { 9551 SFMMU_STAT(sf_get_free_throttle); 9552 mutex_exit(&freehblkp_lock); 9553 return (0); 9554 } 9555 freehblkcnt--; 9556 *hmeblkpp = freehblkp; 9557 hblkp = *hmeblkpp; 9558 freehblkp = hblkp->hblk_next; 9559 mutex_exit(&freehblkp_lock); 9560 hblkp->hblk_next = NULL; 9561 SFMMU_STAT(sf_get_free_success); 9562 return (1); 9563 } 9564 mutex_exit(&freehblkp_lock); 9565 } 9566 SFMMU_STAT(sf_get_free_fail); 9567 return (0); 9568 } 9569 9570 static uint_t 9571 sfmmu_put_free_hblk(struct hme_blk *hmeblkp, uint_t critical) 9572 { 9573 struct hme_blk *hblkp; 9574 9575 /* 9576 * If the current thread is mapping into kernel space, 9577 * let it succede even if freehblkcnt is max 9578 * so that it will avoid freeing it to kmem. 9579 * This will prevent stack overflow due to 9580 * possible recursion since kmem_cache_free() 9581 * might require creation of a slab which 9582 * in turn needs an hmeblk to map that slab; 9583 * let's break this vicious chain at the first 9584 * opportunity. 9585 */ 9586 if (freehblkcnt < HBLK_RESERVE_CNT || critical) { 9587 mutex_enter(&freehblkp_lock); 9588 if (freehblkcnt < HBLK_RESERVE_CNT || critical) { 9589 SFMMU_STAT(sf_put_free_success); 9590 freehblkcnt++; 9591 hmeblkp->hblk_next = freehblkp; 9592 freehblkp = hmeblkp; 9593 mutex_exit(&freehblkp_lock); 9594 return (1); 9595 } 9596 mutex_exit(&freehblkp_lock); 9597 } 9598 9599 /* 9600 * Bring down freehblkcnt to HBLK_RESERVE_CNT. We are here 9601 * only if freehblkcnt is at least HBLK_RESERVE_CNT *and* 9602 * we are not in the process of mapping into kernel space. 9603 */ 9604 ASSERT(!critical); 9605 while (freehblkcnt > HBLK_RESERVE_CNT) { 9606 mutex_enter(&freehblkp_lock); 9607 if (freehblkcnt > HBLK_RESERVE_CNT) { 9608 freehblkcnt--; 9609 hblkp = freehblkp; 9610 freehblkp = hblkp->hblk_next; 9611 mutex_exit(&freehblkp_lock); 9612 ASSERT(get_hblk_cache(hblkp) == sfmmu8_cache); 9613 kmem_cache_free(sfmmu8_cache, hblkp); 9614 continue; 9615 } 9616 mutex_exit(&freehblkp_lock); 9617 } 9618 SFMMU_STAT(sf_put_free_fail); 9619 return (0); 9620 } 9621 9622 static void 9623 sfmmu_hblk_swap(struct hme_blk *new) 9624 { 9625 struct hme_blk *old, *hblkp, *prev; 9626 uint64_t hblkpa, prevpa, newpa; 9627 caddr_t base, vaddr, endaddr; 9628 struct hmehash_bucket *hmebp; 9629 struct sf_hment *osfhme, *nsfhme; 9630 page_t *pp; 9631 kmutex_t *pml; 9632 tte_t tte; 9633 9634 #ifdef DEBUG 9635 hmeblk_tag hblktag; 9636 struct hme_blk *found; 9637 #endif 9638 old = HBLK_RESERVE; 9639 9640 /* 9641 * save pa before bcopy clobbers it 9642 */ 9643 newpa = new->hblk_nextpa; 9644 9645 base = (caddr_t)get_hblk_base(old); 9646 endaddr = base + get_hblk_span(old); 9647 9648 /* 9649 * acquire hash bucket lock. 9650 */ 9651 hmebp = sfmmu_tteload_acquire_hashbucket(ksfmmup, base, TTE8K); 9652 9653 /* 9654 * copy contents from old to new 9655 */ 9656 bcopy((void *)old, (void *)new, HME8BLK_SZ); 9657 9658 /* 9659 * add new to hash chain 9660 */ 9661 sfmmu_hblk_hash_add(hmebp, new, newpa); 9662 9663 /* 9664 * search hash chain for hblk_reserve; this needs to be performed 9665 * after adding new, otherwise prevpa and prev won't correspond 9666 * to the hblk which is prior to old in hash chain when we call 9667 * sfmmu_hblk_hash_rm to remove old later. 
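	 *
	 * Illustrative sketch (generic singly-linked-list predecessor scan,
	 * hypothetical node type): the loop below is the standard
	 *
	 *	struct node *prev = NULL, *p;
	 *
	 *	for (p = head; p != NULL && p != old; p = p->next)
	 *		prev = p;
	 *
	 * pattern, carried out in parallel on the virtual (hblk_next) and
	 * physical (hblk_nextpa) links so that sfmmu_hblk_hash_rm() can
	 * later patch both chains.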
9668 */ 9669 for (prevpa = 0, prev = NULL, 9670 hblkpa = hmebp->hmeh_nextpa, hblkp = hmebp->hmeblkp; 9671 hblkp != NULL && hblkp != old; 9672 prevpa = hblkpa, prev = hblkp, 9673 hblkpa = hblkp->hblk_nextpa, hblkp = hblkp->hblk_next); 9674 9675 if (hblkp != old) 9676 panic("sfmmu_hblk_swap: hblk_reserve not found"); 9677 9678 /* 9679 * p_mapping list is still pointing to hments in hblk_reserve; 9680 * fix up p_mapping list so that they point to hments in new. 9681 * 9682 * Since all these mappings are created by hblk_reserve_thread 9683 * on the way and it's using at least one of the buffers from each of 9684 * the newly minted slabs, there is no danger of any of these 9685 * mappings getting unloaded by another thread. 9686 * 9687 * tsbmiss could only modify ref/mod bits of hments in old/new. 9688 * Since all of these hments hold mappings established by segkmem 9689 * and mappings in segkmem are setup with HAT_NOSYNC, ref/mod bits 9690 * have no meaning for the mappings in hblk_reserve. hments in 9691 * old and new are identical except for ref/mod bits. 9692 */ 9693 for (vaddr = base; vaddr < endaddr; vaddr += TTEBYTES(TTE8K)) { 9694 9695 HBLKTOHME(osfhme, old, vaddr); 9696 sfmmu_copytte(&osfhme->hme_tte, &tte); 9697 9698 if (TTE_IS_VALID(&tte)) { 9699 if ((pp = osfhme->hme_page) == NULL) 9700 panic("sfmmu_hblk_swap: page not mapped"); 9701 9702 pml = sfmmu_mlist_enter(pp); 9703 9704 if (pp != osfhme->hme_page) 9705 panic("sfmmu_hblk_swap: mapping changed"); 9706 9707 HBLKTOHME(nsfhme, new, vaddr); 9708 9709 HME_ADD(nsfhme, pp); 9710 HME_SUB(osfhme, pp); 9711 9712 sfmmu_mlist_exit(pml); 9713 } 9714 } 9715 9716 /* 9717 * remove old from hash chain 9718 */ 9719 sfmmu_hblk_hash_rm(hmebp, old, prevpa, prev); 9720 9721 #ifdef DEBUG 9722 9723 hblktag.htag_id = ksfmmup; 9724 hblktag.htag_bspage = HME_HASH_BSPAGE(base, HME_HASH_SHIFT(TTE8K)); 9725 hblktag.htag_rehash = HME_HASH_REHASH(TTE8K); 9726 HME_HASH_FAST_SEARCH(hmebp, hblktag, found); 9727 9728 if (found != new) 9729 panic("sfmmu_hblk_swap: new hblk not found"); 9730 #endif 9731 9732 SFMMU_HASH_UNLOCK(hmebp); 9733 9734 /* 9735 * Reset hblk_reserve 9736 */ 9737 bzero((void *)old, HME8BLK_SZ); 9738 old->hblk_nextpa = va_to_pa((caddr_t)old); 9739 } 9740 9741 /* 9742 * Grab the mlist mutex for both pages passed in. 9743 * 9744 * low and high will be returned as pointers to the mutexes for these pages. 9745 * low refers to the mutex residing in the lower bin of the mlist hash, while 9746 * high refers to the mutex residing in the higher bin of the mlist hash. This 9747 * is due to the locking order restrictions on the same thread grabbing 9748 * multiple mlist mutexes. The low lock must be acquired before the high lock. 9749 * 9750 * If both pages hash to the same mutex, only grab that single mutex, and 9751 * high will be returned as NULL 9752 * If the pages hash to different bins in the hash, grab the lower addressed 9753 * lock first and then the higher addressed lock in order to follow the locking 9754 * rules involved with the same thread grabbing multiple mlist mutexes. 9755 * low and high will both have non-NULL values. 9756 */ 9757 static void 9758 sfmmu_mlist_reloc_enter(struct page *targ, struct page *repl, 9759 kmutex_t **low, kmutex_t **high) 9760 { 9761 kmutex_t *mml_targ, *mml_repl; 9762 9763 /* 9764 * no need to do the dance around szc as in sfmmu_mlist_enter() 9765 * because this routine is only called by hat_page_relocate() and all 9766 * targ and repl pages are already locked EXCL so szc can't change. 
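	 *
	 * Illustrative sketch (generic idiom): the two locks are always
	 * taken in ascending address order so that concurrent relocations
	 * can never deadlock against each other:
	 *
	 *	if (a == b) {
	 *		mutex_enter(a);
	 *	} else if (a < b) {
	 *		mutex_enter(a);
	 *		mutex_enter(b);
	 *	} else {
	 *		mutex_enter(b);
	 *		mutex_enter(a);
	 *	}
	 *
	 * which is exactly what the low/high split below implements.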
9767 */ 9768 9769 mml_targ = MLIST_HASH(PP_PAGEROOT(targ)); 9770 mml_repl = MLIST_HASH(PP_PAGEROOT(repl)); 9771 9772 if (mml_targ == mml_repl) { 9773 *low = mml_targ; 9774 *high = NULL; 9775 } else { 9776 if (mml_targ < mml_repl) { 9777 *low = mml_targ; 9778 *high = mml_repl; 9779 } else { 9780 *low = mml_repl; 9781 *high = mml_targ; 9782 } 9783 } 9784 9785 mutex_enter(*low); 9786 if (*high) 9787 mutex_enter(*high); 9788 } 9789 9790 static void 9791 sfmmu_mlist_reloc_exit(kmutex_t *low, kmutex_t *high) 9792 { 9793 if (high) 9794 mutex_exit(high); 9795 mutex_exit(low); 9796 } 9797 9798 static hatlock_t * 9799 sfmmu_hat_enter(sfmmu_t *sfmmup) 9800 { 9801 hatlock_t *hatlockp; 9802 9803 if (sfmmup != ksfmmup) { 9804 hatlockp = TSB_HASH(sfmmup); 9805 mutex_enter(HATLOCK_MUTEXP(hatlockp)); 9806 return (hatlockp); 9807 } 9808 return (NULL); 9809 } 9810 9811 static hatlock_t * 9812 sfmmu_hat_tryenter(sfmmu_t *sfmmup) 9813 { 9814 hatlock_t *hatlockp; 9815 9816 if (sfmmup != ksfmmup) { 9817 hatlockp = TSB_HASH(sfmmup); 9818 if (mutex_tryenter(HATLOCK_MUTEXP(hatlockp)) == 0) 9819 return (NULL); 9820 return (hatlockp); 9821 } 9822 return (NULL); 9823 } 9824 9825 static void 9826 sfmmu_hat_exit(hatlock_t *hatlockp) 9827 { 9828 if (hatlockp != NULL) 9829 mutex_exit(HATLOCK_MUTEXP(hatlockp)); 9830 } 9831 9832 static void 9833 sfmmu_hat_lock_all(void) 9834 { 9835 int i; 9836 for (i = 0; i < SFMMU_NUM_LOCK; i++) 9837 mutex_enter(HATLOCK_MUTEXP(&hat_lock[i])); 9838 } 9839 9840 static void 9841 sfmmu_hat_unlock_all(void) 9842 { 9843 int i; 9844 for (i = SFMMU_NUM_LOCK - 1; i >= 0; i--) 9845 mutex_exit(HATLOCK_MUTEXP(&hat_lock[i])); 9846 } 9847 9848 int 9849 sfmmu_hat_lock_held(sfmmu_t *sfmmup) 9850 { 9851 ASSERT(sfmmup != ksfmmup); 9852 return (MUTEX_HELD(HATLOCK_MUTEXP(TSB_HASH(sfmmup)))); 9853 } 9854 9855 /* 9856 * Locking primitives to provide consistency between ISM unmap 9857 * and other operations. Since ISM unmap can take a long time, we 9858 * use HAT_ISMBUSY flag (protected by the hatlock) to avoid creating 9859 * contention on the hatlock buckets while ISM segments are being 9860 * unmapped. The tradeoff is that the flags don't prevent priority 9861 * inversion from occurring, so we must request kernel priority in 9862 * case we have to sleep to keep from getting buried while holding 9863 * the HAT_ISMBUSY flag set, which in turn could block other kernel 9864 * threads from running (for example, in sfmmu_uvatopfn()). 9865 */ 9866 static void 9867 sfmmu_ismhat_enter(sfmmu_t *sfmmup, int hatlock_held) 9868 { 9869 hatlock_t *hatlockp; 9870 9871 THREAD_KPRI_REQUEST(); 9872 if (!hatlock_held) 9873 hatlockp = sfmmu_hat_enter(sfmmup); 9874 while (SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)) 9875 cv_wait(&sfmmup->sfmmu_tsb_cv, HATLOCK_MUTEXP(hatlockp)); 9876 SFMMU_FLAGS_SET(sfmmup, HAT_ISMBUSY); 9877 if (!hatlock_held) 9878 sfmmu_hat_exit(hatlockp); 9879 } 9880 9881 static void 9882 sfmmu_ismhat_exit(sfmmu_t *sfmmup, int hatlock_held) 9883 { 9884 hatlock_t *hatlockp; 9885 9886 if (!hatlock_held) 9887 hatlockp = sfmmu_hat_enter(sfmmup); 9888 ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)); 9889 SFMMU_FLAGS_CLEAR(sfmmup, HAT_ISMBUSY); 9890 cv_broadcast(&sfmmup->sfmmu_tsb_cv); 9891 if (!hatlock_held) 9892 sfmmu_hat_exit(hatlockp); 9893 THREAD_KPRI_RELEASE(); 9894 } 9895 9896 /* 9897 * 9898 * Algorithm: 9899 * 9900 * (1) if segkmem is not ready, allocate hblk from an array of pre-alloc'ed 9901 * hblks. 
9902 * 9903 * (2) if we are allocating an hblk for mapping a slab in sfmmu_cache, 9904 * 9905 * (a) try to return an hblk from reserve pool of free hblks; 9906 * (b) if the reserve pool is empty, acquire hblk_reserve_lock 9907 * and return hblk_reserve. 9908 * 9909 * (3) call kmem_cache_alloc() to allocate hblk; 9910 * 9911 * (a) if hblk_reserve_lock is held by the current thread, 9912 * atomically replace hblk_reserve by the hblk that is 9913 * returned by kmem_cache_alloc; release hblk_reserve_lock 9914 * and call kmem_cache_alloc() again. 9915 * (b) if reserve pool is not full, add the hblk that is 9916 * returned by kmem_cache_alloc to reserve pool and 9917 * call kmem_cache_alloc again. 9918 * 9919 */ 9920 static struct hme_blk * 9921 sfmmu_hblk_alloc(sfmmu_t *sfmmup, caddr_t vaddr, 9922 struct hmehash_bucket *hmebp, uint_t size, hmeblk_tag hblktag, 9923 uint_t flags) 9924 { 9925 struct hme_blk *hmeblkp = NULL; 9926 struct hme_blk *newhblkp; 9927 struct hme_blk *shw_hblkp = NULL; 9928 struct kmem_cache *sfmmu_cache = NULL; 9929 uint64_t hblkpa; 9930 ulong_t index; 9931 uint_t owner; /* set to 1 if using hblk_reserve */ 9932 uint_t forcefree; 9933 int sleep; 9934 9935 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 9936 9937 /* 9938 * If segkmem is not created yet, allocate from static hmeblks 9939 * created at the end of startup_modules(). See the block comment 9940 * in startup_modules() describing how we estimate the number of 9941 * static hmeblks that will be needed during re-map. 9942 */ 9943 if (!hblk_alloc_dynamic) { 9944 9945 if (size == TTE8K) { 9946 index = nucleus_hblk8.index; 9947 if (index >= nucleus_hblk8.len) { 9948 /* 9949 * If we panic here, see startup_modules() to 9950 * make sure that we are calculating the 9951 * number of hblk8's that we need correctly. 9952 */ 9953 panic("no nucleus hblk8 to allocate"); 9954 } 9955 hmeblkp = 9956 (struct hme_blk *)&nucleus_hblk8.list[index]; 9957 nucleus_hblk8.index++; 9958 SFMMU_STAT(sf_hblk8_nalloc); 9959 } else { 9960 index = nucleus_hblk1.index; 9961 if (nucleus_hblk1.index >= nucleus_hblk1.len) { 9962 /* 9963 * If we panic here, see startup_modules() 9964 * and H8TOH1; most likely you need to 9965 * update the calculation of the number 9966 * of hblk1's the kernel needs to boot. 9967 */ 9968 panic("no nucleus hblk1 to allocate"); 9969 } 9970 hmeblkp = 9971 (struct hme_blk *)&nucleus_hblk1.list[index]; 9972 nucleus_hblk1.index++; 9973 SFMMU_STAT(sf_hblk1_nalloc); 9974 } 9975 9976 goto hblk_init; 9977 } 9978 9979 SFMMU_HASH_UNLOCK(hmebp); 9980 9981 if (sfmmup != KHATID) { 9982 if (mmu_page_sizes == max_mmu_page_sizes) { 9983 if (size < TTE256M) 9984 shw_hblkp = sfmmu_shadow_hcreate(sfmmup, vaddr, 9985 size, flags); 9986 } else { 9987 if (size < TTE4M) 9988 shw_hblkp = sfmmu_shadow_hcreate(sfmmup, vaddr, 9989 size, flags); 9990 } 9991 } 9992 9993 fill_hblk: 9994 owner = (hblk_reserve_thread == curthread) ? 1 : 0; 9995 9996 if (owner && size == TTE8K) { 9997 9998 /* 9999 * We are really in a tight spot. We already own 10000 * hblk_reserve and we need another hblk. In anticipation 10001 * of this kind of scenario, we specifically set aside 10002 * HBLK_RESERVE_MIN number of hblks to be used exclusively 10003 * by owner of hblk_reserve. 
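		 *
		 * Illustrative sketch (hypothetical helpers restating the
		 * pool policy of sfmmu_get_free_hblk() and
		 * sfmmu_put_free_hblk()): non-critical callers may neither
		 * drain the pool below HBLK_RESERVE_MIN nor grow it past
		 * HBLK_RESERVE_CNT:
		 *
		 *	static int
		 *	pool_get_ok(int cnt, uint_t critical)
		 *	{
		 *		return (critical || cnt > HBLK_RESERVE_MIN);
		 *	}
		 *
		 *	static int
		 *	pool_put_ok(int cnt, uint_t critical)
		 *	{
		 *		return (critical || cnt < HBLK_RESERVE_CNT);
		 *	}
		 *
		 * so the critical allocation below is expected to find an
		 * hblk, and the panic that follows a failed pop points at
		 * an accounting bug rather than ordinary memory pressure.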
10004 */ 10005 SFMMU_STAT(sf_hblk_recurse_cnt); 10006 10007 if (!sfmmu_get_free_hblk(&hmeblkp, 1)) 10008 panic("sfmmu_hblk_alloc: reserve list is empty"); 10009 10010 goto hblk_verify; 10011 } 10012 10013 ASSERT(!owner); 10014 10015 if ((flags & HAT_NO_KALLOC) == 0) { 10016 10017 sfmmu_cache = ((size == TTE8K) ? sfmmu8_cache : sfmmu1_cache); 10018 sleep = ((sfmmup == KHATID) ? KM_NOSLEEP : KM_SLEEP); 10019 10020 if ((hmeblkp = kmem_cache_alloc(sfmmu_cache, sleep)) == NULL) { 10021 hmeblkp = sfmmu_hblk_steal(size); 10022 } else { 10023 /* 10024 * if we are the owner of hblk_reserve, 10025 * swap hblk_reserve with hmeblkp and 10026 * start a fresh life. Hope things go 10027 * better this time. 10028 */ 10029 if (hblk_reserve_thread == curthread) { 10030 ASSERT(sfmmu_cache == sfmmu8_cache); 10031 sfmmu_hblk_swap(hmeblkp); 10032 hblk_reserve_thread = NULL; 10033 mutex_exit(&hblk_reserve_lock); 10034 goto fill_hblk; 10035 } 10036 /* 10037 * let's donate this hblk to our reserve list if 10038 * we are not mapping kernel range 10039 */ 10040 if (size == TTE8K && sfmmup != KHATID) 10041 if (sfmmu_put_free_hblk(hmeblkp, 0)) 10042 goto fill_hblk; 10043 } 10044 } else { 10045 /* 10046 * We are here to map the slab in sfmmu8_cache; let's 10047 * check if we could tap our reserve list; if successful, 10048 * this will avoid the pain of going thru sfmmu_hblk_swap 10049 */ 10050 SFMMU_STAT(sf_hblk_slab_cnt); 10051 if (!sfmmu_get_free_hblk(&hmeblkp, 0)) { 10052 /* 10053 * let's start hblk_reserve dance 10054 */ 10055 SFMMU_STAT(sf_hblk_reserve_cnt); 10056 owner = 1; 10057 mutex_enter(&hblk_reserve_lock); 10058 hmeblkp = HBLK_RESERVE; 10059 hblk_reserve_thread = curthread; 10060 } 10061 } 10062 10063 hblk_verify: 10064 ASSERT(hmeblkp != NULL); 10065 set_hblk_sz(hmeblkp, size); 10066 ASSERT(hmeblkp->hblk_nextpa == va_to_pa((caddr_t)hmeblkp)); 10067 SFMMU_HASH_LOCK(hmebp); 10068 HME_HASH_FAST_SEARCH(hmebp, hblktag, newhblkp); 10069 if (newhblkp != NULL) { 10070 SFMMU_HASH_UNLOCK(hmebp); 10071 if (hmeblkp != HBLK_RESERVE) { 10072 /* 10073 * This is really tricky! 10074 * 10075 * vmem_alloc(vmem_seg_arena) 10076 * vmem_alloc(vmem_internal_arena) 10077 * segkmem_alloc(heap_arena) 10078 * vmem_alloc(heap_arena) 10079 * page_create() 10080 * hat_memload() 10081 * kmem_cache_free() 10082 * kmem_cache_alloc() 10083 * kmem_slab_create() 10084 * vmem_alloc(kmem_internal_arena) 10085 * segkmem_alloc(heap_arena) 10086 * vmem_alloc(heap_arena) 10087 * page_create() 10088 * hat_memload() 10089 * kmem_cache_free() 10090 * ... 10091 * 10092 * Thus, hat_memload() could call kmem_cache_free 10093 * for enough number of times that we could easily 10094 * hit the bottom of the stack or run out of reserve 10095 * list of vmem_seg structs. So, we must donate 10096 * this hblk to reserve list if it's allocated 10097 * from sfmmu8_cache *and* mapping kernel range. 10098 * We don't need to worry about freeing hmeblk1's 10099 * to kmem since they don't map any kmem slabs. 10100 * 10101 * Note: When segkmem supports largepages, we must 10102 * free hmeblk1's to reserve list as well. 10103 */ 10104 forcefree = (sfmmup == KHATID) ? 1 : 0; 10105 if (size == TTE8K && 10106 sfmmu_put_free_hblk(hmeblkp, forcefree)) { 10107 goto re_verify; 10108 } 10109 ASSERT(sfmmup != KHATID); 10110 kmem_cache_free(get_hblk_cache(hmeblkp), hmeblkp); 10111 } else { 10112 /* 10113 * Hey! we don't need hblk_reserve any more. 
10114 */ 10115 ASSERT(owner); 10116 hblk_reserve_thread = NULL; 10117 mutex_exit(&hblk_reserve_lock); 10118 owner = 0; 10119 } 10120 re_verify: 10121 /* 10122 * let's check if the goodies are still present 10123 */ 10124 SFMMU_HASH_LOCK(hmebp); 10125 HME_HASH_FAST_SEARCH(hmebp, hblktag, newhblkp); 10126 if (newhblkp != NULL) { 10127 /* 10128 * return newhblkp if it's not hblk_reserve; 10129 * if newhblkp is hblk_reserve, return it 10130 * _only if_ we are the owner of hblk_reserve. 10131 */ 10132 if (newhblkp != HBLK_RESERVE || owner) { 10133 return (newhblkp); 10134 } else { 10135 /* 10136 * we just hit hblk_reserve in the hash and 10137 * we are not the owner of that; 10138 * 10139 * block until hblk_reserve_thread completes 10140 * swapping hblk_reserve and try the dance 10141 * once again. 10142 */ 10143 SFMMU_HASH_UNLOCK(hmebp); 10144 mutex_enter(&hblk_reserve_lock); 10145 mutex_exit(&hblk_reserve_lock); 10146 SFMMU_STAT(sf_hblk_reserve_hit); 10147 goto fill_hblk; 10148 } 10149 } else { 10150 /* 10151 * it's no more! try the dance once again. 10152 */ 10153 SFMMU_HASH_UNLOCK(hmebp); 10154 goto fill_hblk; 10155 } 10156 } 10157 10158 hblk_init: 10159 set_hblk_sz(hmeblkp, size); 10160 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 10161 hmeblkp->hblk_next = (struct hme_blk *)NULL; 10162 hmeblkp->hblk_tag = hblktag; 10163 hmeblkp->hblk_shadow = shw_hblkp; 10164 hblkpa = hmeblkp->hblk_nextpa; 10165 hmeblkp->hblk_nextpa = 0; 10166 10167 ASSERT(get_hblk_ttesz(hmeblkp) == size); 10168 ASSERT(get_hblk_span(hmeblkp) == HMEBLK_SPAN(size)); 10169 ASSERT(hmeblkp->hblk_hmecnt == 0); 10170 ASSERT(hmeblkp->hblk_vcnt == 0); 10171 ASSERT(hmeblkp->hblk_lckcnt == 0); 10172 ASSERT(hblkpa == va_to_pa((caddr_t)hmeblkp)); 10173 sfmmu_hblk_hash_add(hmebp, hmeblkp, hblkpa); 10174 return (hmeblkp); 10175 } 10176 10177 /* 10178 * This function performs any cleanup required on the hme_blk 10179 * and returns it to the free list. 10180 */ 10181 /* ARGSUSED */ 10182 static void 10183 sfmmu_hblk_free(struct hmehash_bucket *hmebp, struct hme_blk *hmeblkp, 10184 uint64_t hblkpa, struct hme_blk **listp) 10185 { 10186 int shw_size, vshift; 10187 struct hme_blk *shw_hblkp; 10188 uint_t shw_mask, newshw_mask; 10189 uintptr_t vaddr; 10190 int size; 10191 uint_t critical; 10192 10193 ASSERT(hmeblkp); 10194 ASSERT(!hmeblkp->hblk_hmecnt); 10195 ASSERT(!hmeblkp->hblk_vcnt); 10196 ASSERT(!hmeblkp->hblk_lckcnt); 10197 ASSERT(hblkpa == va_to_pa((caddr_t)hmeblkp)); 10198 ASSERT(hmeblkp != (struct hme_blk *)hblk_reserve); 10199 10200 critical = (hblktosfmmu(hmeblkp) == KHATID) ? 
1 : 0; 10201 10202 size = get_hblk_ttesz(hmeblkp); 10203 shw_hblkp = hmeblkp->hblk_shadow; 10204 if (shw_hblkp) { 10205 ASSERT(hblktosfmmu(hmeblkp) != KHATID); 10206 if (mmu_page_sizes == max_mmu_page_sizes) { 10207 ASSERT(size < TTE256M); 10208 } else { 10209 ASSERT(size < TTE4M); 10210 } 10211 10212 shw_size = get_hblk_ttesz(shw_hblkp); 10213 vaddr = get_hblk_base(hmeblkp); 10214 vshift = vaddr_to_vshift(shw_hblkp->hblk_tag, vaddr, shw_size); 10215 ASSERT(vshift < 8); 10216 /* 10217 * Atomically clear shadow mask bit 10218 */ 10219 do { 10220 shw_mask = shw_hblkp->hblk_shw_mask; 10221 ASSERT(shw_mask & (1 << vshift)); 10222 newshw_mask = shw_mask & ~(1 << vshift); 10223 newshw_mask = cas32(&shw_hblkp->hblk_shw_mask, 10224 shw_mask, newshw_mask); 10225 } while (newshw_mask != shw_mask); 10226 hmeblkp->hblk_shadow = NULL; 10227 } 10228 hmeblkp->hblk_next = NULL; 10229 hmeblkp->hblk_nextpa = hblkpa; 10230 hmeblkp->hblk_shw_bit = 0; 10231 10232 if (hmeblkp->hblk_nuc_bit == 0) { 10233 10234 if (size == TTE8K && sfmmu_put_free_hblk(hmeblkp, critical)) 10235 return; 10236 10237 hmeblkp->hblk_next = *listp; 10238 *listp = hmeblkp; 10239 } 10240 } 10241 10242 static void 10243 sfmmu_hblks_list_purge(struct hme_blk **listp) 10244 { 10245 struct hme_blk *hmeblkp; 10246 10247 while ((hmeblkp = *listp) != NULL) { 10248 *listp = hmeblkp->hblk_next; 10249 kmem_cache_free(get_hblk_cache(hmeblkp), hmeblkp); 10250 } 10251 } 10252 10253 #define BUCKETS_TO_SEARCH_BEFORE_UNLOAD 30 10254 10255 static uint_t sfmmu_hblk_steal_twice; 10256 static uint_t sfmmu_hblk_steal_count, sfmmu_hblk_steal_unload_count; 10257 10258 /* 10259 * Steal a hmeblk 10260 * Enough hmeblks were allocated at startup (nucleus hmeblks) and also 10261 * hmeblks were added dynamically. We should never ever not be able to 10262 * find one. Look for an unused/unlocked hmeblk in user hash table. 10263 */ 10264 static struct hme_blk * 10265 sfmmu_hblk_steal(int size) 10266 { 10267 static struct hmehash_bucket *uhmehash_steal_hand = NULL; 10268 struct hmehash_bucket *hmebp; 10269 struct hme_blk *hmeblkp = NULL, *pr_hblk; 10270 uint64_t hblkpa, prevpa; 10271 int i; 10272 10273 for (;;) { 10274 hmebp = (uhmehash_steal_hand == NULL) ? uhme_hash : 10275 uhmehash_steal_hand; 10276 ASSERT(hmebp >= uhme_hash && hmebp <= &uhme_hash[UHMEHASH_SZ]); 10277 10278 for (i = 0; hmeblkp == NULL && i <= UHMEHASH_SZ + 10279 BUCKETS_TO_SEARCH_BEFORE_UNLOAD; i++) { 10280 SFMMU_HASH_LOCK(hmebp); 10281 hmeblkp = hmebp->hmeblkp; 10282 hblkpa = hmebp->hmeh_nextpa; 10283 prevpa = 0; 10284 pr_hblk = NULL; 10285 while (hmeblkp) { 10286 /* 10287 * check if it is a hmeblk that is not locked 10288 * and not shared. skip shadow hmeblks with 10289 * shadow_mask set i.e valid count non zero. 10290 */ 10291 if ((get_hblk_ttesz(hmeblkp) == size) && 10292 (hmeblkp->hblk_shw_bit == 0 || 10293 hmeblkp->hblk_vcnt == 0) && 10294 (hmeblkp->hblk_lckcnt == 0)) { 10295 /* 10296 * there is a high probability that we 10297 * will find a free one. search some 10298 * buckets for a free hmeblk initially 10299 * before unloading a valid hmeblk. 
10300 */ 10301 if ((hmeblkp->hblk_vcnt == 0 && 10302 hmeblkp->hblk_hmecnt == 0) || (i >= 10303 BUCKETS_TO_SEARCH_BEFORE_UNLOAD)) { 10304 if (sfmmu_steal_this_hblk(hmebp, 10305 hmeblkp, hblkpa, prevpa, 10306 pr_hblk)) { 10307 /* 10308 * Hblk is unloaded 10309 * successfully 10310 */ 10311 break; 10312 } 10313 } 10314 } 10315 pr_hblk = hmeblkp; 10316 prevpa = hblkpa; 10317 hblkpa = hmeblkp->hblk_nextpa; 10318 hmeblkp = hmeblkp->hblk_next; 10319 } 10320 10321 SFMMU_HASH_UNLOCK(hmebp); 10322 if (hmebp++ == &uhme_hash[UHMEHASH_SZ]) 10323 hmebp = uhme_hash; 10324 } 10325 uhmehash_steal_hand = hmebp; 10326 10327 if (hmeblkp != NULL) 10328 break; 10329 10330 /* 10331 * in the worst case, look for a free one in the kernel 10332 * hash table. 10333 */ 10334 for (i = 0, hmebp = khme_hash; i <= KHMEHASH_SZ; i++) { 10335 SFMMU_HASH_LOCK(hmebp); 10336 hmeblkp = hmebp->hmeblkp; 10337 hblkpa = hmebp->hmeh_nextpa; 10338 prevpa = 0; 10339 pr_hblk = NULL; 10340 while (hmeblkp) { 10341 /* 10342 * check if it is free hmeblk 10343 */ 10344 if ((get_hblk_ttesz(hmeblkp) == size) && 10345 (hmeblkp->hblk_lckcnt == 0) && 10346 (hmeblkp->hblk_vcnt == 0) && 10347 (hmeblkp->hblk_hmecnt == 0)) { 10348 if (sfmmu_steal_this_hblk(hmebp, 10349 hmeblkp, hblkpa, prevpa, pr_hblk)) { 10350 break; 10351 } else { 10352 /* 10353 * Cannot fail since we have 10354 * hash lock. 10355 */ 10356 panic("fail to steal?"); 10357 } 10358 } 10359 10360 pr_hblk = hmeblkp; 10361 prevpa = hblkpa; 10362 hblkpa = hmeblkp->hblk_nextpa; 10363 hmeblkp = hmeblkp->hblk_next; 10364 } 10365 10366 SFMMU_HASH_UNLOCK(hmebp); 10367 if (hmebp++ == &khme_hash[KHMEHASH_SZ]) 10368 hmebp = khme_hash; 10369 } 10370 10371 if (hmeblkp != NULL) 10372 break; 10373 sfmmu_hblk_steal_twice++; 10374 } 10375 return (hmeblkp); 10376 } 10377 10378 /* 10379 * This routine does real work to prepare a hblk to be "stolen" by 10380 * unloading the mappings, updating shadow counts .... 10381 * It returns 1 if the block is ready to be reused (stolen), or 0 10382 * means the block cannot be stolen yet- pageunload is still working 10383 * on this hblk. 10384 */ 10385 static int 10386 sfmmu_steal_this_hblk(struct hmehash_bucket *hmebp, struct hme_blk *hmeblkp, 10387 uint64_t hblkpa, uint64_t prevpa, struct hme_blk *pr_hblk) 10388 { 10389 int shw_size, vshift; 10390 struct hme_blk *shw_hblkp; 10391 uintptr_t vaddr; 10392 uint_t shw_mask, newshw_mask; 10393 10394 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 10395 10396 /* 10397 * check if the hmeblk is free, unload if necessary 10398 */ 10399 if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) { 10400 sfmmu_t *sfmmup; 10401 demap_range_t dmr; 10402 10403 sfmmup = hblktosfmmu(hmeblkp); 10404 DEMAP_RANGE_INIT(sfmmup, &dmr); 10405 (void) sfmmu_hblk_unload(sfmmup, hmeblkp, 10406 (caddr_t)get_hblk_base(hmeblkp), 10407 get_hblk_endaddr(hmeblkp), &dmr, HAT_UNLOAD); 10408 DEMAP_RANGE_FLUSH(&dmr); 10409 if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) { 10410 /* 10411 * Pageunload is working on the same hblk. 
10412 */ 10413 return (0); 10414 } 10415 10416 sfmmu_hblk_steal_unload_count++; 10417 } 10418 10419 ASSERT(hmeblkp->hblk_lckcnt == 0); 10420 ASSERT(hmeblkp->hblk_vcnt == 0 && hmeblkp->hblk_hmecnt == 0); 10421 10422 sfmmu_hblk_hash_rm(hmebp, hmeblkp, prevpa, pr_hblk); 10423 hmeblkp->hblk_nextpa = hblkpa; 10424 10425 shw_hblkp = hmeblkp->hblk_shadow; 10426 if (shw_hblkp) { 10427 shw_size = get_hblk_ttesz(shw_hblkp); 10428 vaddr = get_hblk_base(hmeblkp); 10429 vshift = vaddr_to_vshift(shw_hblkp->hblk_tag, vaddr, shw_size); 10430 ASSERT(vshift < 8); 10431 /* 10432 * Atomically clear shadow mask bit 10433 */ 10434 do { 10435 shw_mask = shw_hblkp->hblk_shw_mask; 10436 ASSERT(shw_mask & (1 << vshift)); 10437 newshw_mask = shw_mask & ~(1 << vshift); 10438 newshw_mask = cas32(&shw_hblkp->hblk_shw_mask, 10439 shw_mask, newshw_mask); 10440 } while (newshw_mask != shw_mask); 10441 hmeblkp->hblk_shadow = NULL; 10442 } 10443 10444 /* 10445 * remove shadow bit if we are stealing an unused shadow hmeblk. 10446 * sfmmu_hblk_alloc needs it that way, will set shadow bit later if 10447 * we are indeed allocating a shadow hmeblk. 10448 */ 10449 hmeblkp->hblk_shw_bit = 0; 10450 10451 sfmmu_hblk_steal_count++; 10452 SFMMU_STAT(sf_steal_count); 10453 10454 return (1); 10455 } 10456 10457 struct hme_blk * 10458 sfmmu_hmetohblk(struct sf_hment *sfhme) 10459 { 10460 struct hme_blk *hmeblkp; 10461 struct sf_hment *sfhme0; 10462 struct hme_blk *hblk_dummy = 0; 10463 10464 /* 10465 * No dummy sf_hments, please. 10466 */ 10467 ASSERT(sfhme->hme_tte.ll != 0); 10468 10469 sfhme0 = sfhme - sfhme->hme_tte.tte_hmenum; 10470 hmeblkp = (struct hme_blk *)((uintptr_t)sfhme0 - 10471 (uintptr_t)&hblk_dummy->hblk_hme[0]); 10472 10473 return (hmeblkp); 10474 } 10475 10476 /* 10477 * On swapin, get appropriately sized TSB(s) and clear the HAT_SWAPPED flag. 10478 * If we can't get appropriately sized TSB(s), try for 8K TSB(s) using 10479 * KM_SLEEP allocation. 10480 * 10481 * Return 0 on success, -1 otherwise. 10482 */ 10483 static void 10484 sfmmu_tsb_swapin(sfmmu_t *sfmmup, hatlock_t *hatlockp) 10485 { 10486 struct tsb_info *tsbinfop, *next; 10487 tsb_replace_rc_t rc; 10488 boolean_t gotfirst = B_FALSE; 10489 10490 ASSERT(sfmmup != ksfmmup); 10491 ASSERT(sfmmu_hat_lock_held(sfmmup)); 10492 10493 while (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPIN)) { 10494 cv_wait(&sfmmup->sfmmu_tsb_cv, HATLOCK_MUTEXP(hatlockp)); 10495 } 10496 10497 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) { 10498 SFMMU_FLAGS_SET(sfmmup, HAT_SWAPIN); 10499 } else { 10500 return; 10501 } 10502 10503 ASSERT(sfmmup->sfmmu_tsb != NULL); 10504 10505 /* 10506 * Loop over all tsbinfo's replacing them with ones that actually have 10507 * a TSB. If any of the replacements ever fail, bail out of the loop. 10508 */ 10509 for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL; tsbinfop = next) { 10510 ASSERT(tsbinfop->tsb_flags & TSB_SWAPPED); 10511 next = tsbinfop->tsb_next; 10512 rc = sfmmu_replace_tsb(sfmmup, tsbinfop, tsbinfop->tsb_szc, 10513 hatlockp, TSB_SWAPIN); 10514 if (rc != TSB_SUCCESS) { 10515 break; 10516 } 10517 gotfirst = B_TRUE; 10518 } 10519 10520 switch (rc) { 10521 case TSB_SUCCESS: 10522 SFMMU_FLAGS_CLEAR(sfmmup, HAT_SWAPPED|HAT_SWAPIN); 10523 cv_broadcast(&sfmmup->sfmmu_tsb_cv); 10524 return; 10525 case TSB_ALLOCFAIL: 10526 break; 10527 default: 10528 panic("sfmmu_replace_tsb returned unrecognized failure code " 10529 "%d", rc); 10530 } 10531 10532 /* 10533 * In this case, we failed to get one of our TSBs. 
If we failed to
10534 * get the first TSB, get one of minimum size (8KB). Walk the list
10535 * and throw away the tsbinfos, starting where the allocation failed;
10536 * we can get by with just one TSB as long as we don't leave the
10537 * SWAPPED tsbinfo structures lying around.
10538 */
10539 tsbinfop = sfmmup->sfmmu_tsb;
10540 next = tsbinfop->tsb_next;
10541 tsbinfop->tsb_next = NULL;
10542
10543 sfmmu_hat_exit(hatlockp);
10544 for (tsbinfop = next; tsbinfop != NULL; tsbinfop = next) {
10545 next = tsbinfop->tsb_next;
10546 sfmmu_tsbinfo_free(tsbinfop);
10547 }
10548 hatlockp = sfmmu_hat_enter(sfmmup);
10549
10550 /*
10551 * If we don't have any TSBs, get a single 8K TSB for 8K, 64K and 512K
10552 * pages.
10553 */
10554 if (!gotfirst) {
10555 tsbinfop = sfmmup->sfmmu_tsb;
10556 rc = sfmmu_replace_tsb(sfmmup, tsbinfop, TSB_MIN_SZCODE,
10557 hatlockp, TSB_SWAPIN | TSB_FORCEALLOC);
10558 ASSERT(rc == TSB_SUCCESS);
10559 }
10560
10561 SFMMU_FLAGS_CLEAR(sfmmup, HAT_SWAPPED|HAT_SWAPIN);
10562 cv_broadcast(&sfmmup->sfmmu_tsb_cv);
10563 }
10564
10565 /*
10566 * Handle exceptions for the low level tsb handler.
10567 *
10568 * There are many scenarios that could land us here:
10569 *
10570 * If the context is invalid we land here. The context can be invalid
10571 * for 3 reasons: 1) we couldn't allocate a new context and now need to
10572 * perform a wrap around operation in order to allocate a new context.
10573 * 2) Context was invalidated to change pagesize programming. 3) ISM or
10574 * TSB configuration is changing for this process and we are forced in
10575 * here to do a synchronization operation. If the context is valid we can
10576 * be here from the window trap handler. In this case just call trap to
10577 * handle the fault.
10578 *
10579 * Note that the process will run in INVALID_CONTEXT before
10580 * faulting into here and subsequently loading the MMU registers
10581 * (including the TSB base register) associated with this process.
10582 * For this reason, the trap handlers must all test for
10583 * INVALID_CONTEXT before attempting to access any registers other
10584 * than the context registers.
10585 */
10586 void
10587 sfmmu_tsbmiss_exception(struct regs *rp, uintptr_t tagaccess, uint_t traptype)
10588 {
10589 sfmmu_t *sfmmup;
10590 uint_t ctxnum;
10591 klwp_id_t lwp;
10592 char lwp_save_state;
10593 hatlock_t *hatlockp;
10594 struct tsb_info *tsbinfop;
10595
10596 SFMMU_STAT(sf_tsb_exceptions);
10597 SFMMU_MMU_STAT(mmu_tsb_exceptions);
10598 sfmmup = astosfmmu(curthread->t_procp->p_as);
10599 ctxnum = tagaccess & TAGACC_CTX_MASK;
10600
10601 ASSERT(sfmmup != ksfmmup && ctxnum != KCONTEXT);
10602 ASSERT(sfmmup->sfmmu_ismhat == 0);
10603 /*
10604 * First, make sure we come out of here with a valid ctx,
10605 * since if we don't get one we'll simply loop on the
10606 * faulting instruction.
10607 *
10608 * If the ISM mappings are changing, the TSB is being relocated, or
10609 * the process is swapped out we serialize behind the controlling
10610 * thread with the sfmmu_flags and sfmmu_tsb_cv condition variable.
10611 * Otherwise we synchronize with the context stealer or the thread
10612 * that required us to change out our MMU registers (such
10613 * as a thread changing out our TSB while we were running) by
10614 * locking the HAT and grabbing the rwlock on the context as a
10615 * reader temporarily.
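 *
 * Schematically, the INVALID_CONTEXT path below amounts to the
 * following (an illustrative summary of the code, nothing more):
 *
 *	acquire the hat lock
 *	while (a tsbinfo has TSB_RELOC_FLAG set ||
 *	    HAT_ISMBUSY is set || HAT_SWAPPED is set)
 *		wait on sfmmu_tsb_cv (swapping TSBs back in if needed)
 *	sfmmu_get_ctx()			install a valid context
 *	release the hat lock
 *
 * and only then do we decide whether trap() still has work to do.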
10616 */ 10617 ASSERT(!SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED) || 10618 ctxnum == INVALID_CONTEXT); 10619 10620 if (ctxnum == INVALID_CONTEXT) { 10621 /* 10622 * Must set lwp state to LWP_SYS before 10623 * trying to acquire any adaptive lock 10624 */ 10625 lwp = ttolwp(curthread); 10626 ASSERT(lwp); 10627 lwp_save_state = lwp->lwp_state; 10628 lwp->lwp_state = LWP_SYS; 10629 10630 hatlockp = sfmmu_hat_enter(sfmmup); 10631 retry: 10632 for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL; 10633 tsbinfop = tsbinfop->tsb_next) { 10634 if (tsbinfop->tsb_flags & TSB_RELOC_FLAG) { 10635 cv_wait(&sfmmup->sfmmu_tsb_cv, 10636 HATLOCK_MUTEXP(hatlockp)); 10637 goto retry; 10638 } 10639 } 10640 10641 /* 10642 * Wait for ISM maps to be updated. 10643 */ 10644 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)) { 10645 cv_wait(&sfmmup->sfmmu_tsb_cv, 10646 HATLOCK_MUTEXP(hatlockp)); 10647 goto retry; 10648 } 10649 10650 /* 10651 * If we're swapping in, get TSB(s). Note that we must do 10652 * this before we get a ctx or load the MMU state. Once 10653 * we swap in we have to recheck to make sure the TSB(s) and 10654 * ISM mappings didn't change while we slept. 10655 */ 10656 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) { 10657 sfmmu_tsb_swapin(sfmmup, hatlockp); 10658 goto retry; 10659 } 10660 10661 sfmmu_get_ctx(sfmmup); 10662 10663 sfmmu_hat_exit(hatlockp); 10664 /* 10665 * Must restore lwp_state if not calling 10666 * trap() for further processing. Restore 10667 * it anyway. 10668 */ 10669 lwp->lwp_state = lwp_save_state; 10670 if (sfmmup->sfmmu_ttecnt[TTE8K] != 0 || 10671 sfmmup->sfmmu_ttecnt[TTE64K] != 0 || 10672 sfmmup->sfmmu_ttecnt[TTE512K] != 0 || 10673 sfmmup->sfmmu_ttecnt[TTE4M] != 0 || 10674 sfmmup->sfmmu_ttecnt[TTE32M] != 0 || 10675 sfmmup->sfmmu_ttecnt[TTE256M] != 0) { 10676 return; 10677 } 10678 if (traptype == T_DATA_PROT) { 10679 traptype = T_DATA_MMU_MISS; 10680 } 10681 } 10682 trap(rp, (caddr_t)tagaccess, traptype, 0); 10683 } 10684 10685 /* 10686 * sfmmu_vatopfn_suspended is called from GET_TTE when TL=0 and 10687 * TTE_SUSPENDED bit set in tte we block on aquiring a page lock 10688 * rather than spinning to avoid send mondo timeouts with 10689 * interrupts enabled. When the lock is acquired it is immediately 10690 * released and we return back to sfmmu_vatopfn just after 10691 * the GET_TTE call. 10692 */ 10693 void 10694 sfmmu_vatopfn_suspended(caddr_t vaddr, sfmmu_t *sfmmu, tte_t *ttep) 10695 { 10696 struct page **pp; 10697 10698 (void) as_pagelock(sfmmu->sfmmu_as, &pp, vaddr, TTE_CSZ(ttep), S_WRITE); 10699 as_pageunlock(sfmmu->sfmmu_as, pp, vaddr, TTE_CSZ(ttep), S_WRITE); 10700 } 10701 10702 /* 10703 * sfmmu_tsbmiss_suspended is called from GET_TTE when TL>0 and 10704 * TTE_SUSPENDED bit set in tte. We do this so that we can handle 10705 * cross traps which cannot be handled while spinning in the 10706 * trap handlers. Simply enter and exit the kpr_suspendlock spin 10707 * mutex, which is held by the holder of the suspend bit, and then 10708 * retry the trapped instruction after unwinding. 10709 */ 10710 /*ARGSUSED*/ 10711 void 10712 sfmmu_tsbmiss_suspended(struct regs *rp, uintptr_t tagacc, uint_t traptype) 10713 { 10714 ASSERT(curthread != kreloc_thread); 10715 mutex_enter(&kpr_suspendlock); 10716 mutex_exit(&kpr_suspendlock); 10717 } 10718 10719 /* 10720 * Special routine to flush out ism mappings- TSBs, TLBs and D-caches. 10721 * This routine may be called with all cpu's captured. Therefore, the 10722 * caller is responsible for holding all locks and disabling kernel 10723 * preemption. 
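 *
 * Roughly, the per-sharer address computation done below is
 * (an illustrative restatement of the loop, not extra semantics):
 *
 *	offset = addr - ISMID_STARTADDR;
 *	for (ment = ism_sfmmup->sfmmu_iment; ment != NULL;
 *	    ment = ment->iment_next) {
 *		va = ment->iment_base_va + offset;
 *		unload the TSB entries for va, cross-call the
 *		sharing cpus to demap va, and optionally flush
 *		the D$ on all cpus
 *	}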
10724 */ 10725 /* ARGSUSED */ 10726 static void 10727 sfmmu_ismtlbcache_demap(caddr_t addr, sfmmu_t *ism_sfmmup, 10728 struct hme_blk *hmeblkp, pfn_t pfnum, int cache_flush_flag) 10729 { 10730 cpuset_t cpuset; 10731 caddr_t va; 10732 ism_ment_t *ment; 10733 sfmmu_t *sfmmup; 10734 int vcolor; 10735 int ttesz; 10736 10737 /* 10738 * Walk the ism_hat's mapping list and flush the page 10739 * from every hat sharing this ism_hat. This routine 10740 * may be called while all cpu's have been captured. 10741 * Therefore we can't attempt to grab any locks. For now 10742 * this means we will protect the ism mapping list under 10743 * a single lock which will be grabbed by the caller. 10744 * If hat_share/unshare scalibility becomes a performance 10745 * problem then we may need to re-think ism mapping list locking. 10746 */ 10747 ASSERT(ism_sfmmup->sfmmu_ismhat); 10748 ASSERT(MUTEX_HELD(&ism_mlist_lock)); 10749 addr = addr - ISMID_STARTADDR; 10750 for (ment = ism_sfmmup->sfmmu_iment; ment; ment = ment->iment_next) { 10751 10752 sfmmup = ment->iment_hat; 10753 10754 va = ment->iment_base_va; 10755 va = (caddr_t)((uintptr_t)va + (uintptr_t)addr); 10756 10757 /* 10758 * Flush TSB of ISM mappings. 10759 */ 10760 ttesz = get_hblk_ttesz(hmeblkp); 10761 if (ttesz == TTE8K || ttesz == TTE4M) { 10762 sfmmu_unload_tsb(sfmmup, va, ttesz); 10763 } else { 10764 caddr_t sva = va; 10765 caddr_t eva; 10766 ASSERT(addr == (caddr_t)get_hblk_base(hmeblkp)); 10767 eva = sva + get_hblk_span(hmeblkp); 10768 sfmmu_unload_tsb_range(sfmmup, sva, eva, ttesz); 10769 } 10770 10771 cpuset = sfmmup->sfmmu_cpusran; 10772 CPUSET_AND(cpuset, cpu_ready_set); 10773 CPUSET_DEL(cpuset, CPU->cpu_id); 10774 10775 SFMMU_XCALL_STATS(sfmmup); 10776 xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)va, 10777 (uint64_t)sfmmup); 10778 10779 vtag_flushpage(va, (uint64_t)sfmmup); 10780 10781 /* 10782 * Flush D$ 10783 * When flushing D$ we must flush all 10784 * cpu's. See sfmmu_cache_flush(). 10785 */ 10786 if (cache_flush_flag == CACHE_FLUSH) { 10787 cpuset = cpu_ready_set; 10788 CPUSET_DEL(cpuset, CPU->cpu_id); 10789 10790 SFMMU_XCALL_STATS(sfmmup); 10791 vcolor = addr_to_vcolor(va); 10792 xt_some(cpuset, vac_flushpage_tl1, pfnum, vcolor); 10793 vac_flushpage(pfnum, vcolor); 10794 } 10795 } 10796 } 10797 10798 /* 10799 * Demaps the TSB, CPU caches, and flushes all TLBs on all CPUs of 10800 * a particular virtual address and ctx. If noflush is set we do not 10801 * flush the TLB/TSB. This function may or may not be called with the 10802 * HAT lock held. 10803 */ 10804 static void 10805 sfmmu_tlbcache_demap(caddr_t addr, sfmmu_t *sfmmup, struct hme_blk *hmeblkp, 10806 pfn_t pfnum, int tlb_noflush, int cpu_flag, int cache_flush_flag, 10807 int hat_lock_held) 10808 { 10809 int vcolor; 10810 cpuset_t cpuset; 10811 hatlock_t *hatlockp; 10812 10813 /* 10814 * There is no longer a need to protect against ctx being 10815 * stolen here since we don't store the ctx in the TSB anymore. 10816 */ 10817 vcolor = addr_to_vcolor(addr); 10818 10819 /* 10820 * We must hold the hat lock during the flush of TLB, 10821 * to avoid a race with sfmmu_invalidate_ctx(), where 10822 * sfmmu_cnum on a MMU could be set to INVALID_CONTEXT, 10823 * causing TLB demap routine to skip flush on that MMU. 10824 * If the context on a MMU has already been set to 10825 * INVALID_CONTEXT, we just get an extra flush on 10826 * that MMU. 
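 *
 * The interleaving being guarded against looks roughly like this
 * (illustrative only):
 *
 *	THIS CPU				OTHER THREAD
 *	--------				------------
 *						sfmmu_invalidate_ctx()
 *						sets cnum = INVALID_CONTEXT
 *	demap sees INVALID_CONTEXT and
 *	skips the flush on that MMU
 *
 * Taking the hat lock here serializes against sfmmu_invalidate_ctx()
 * (which runs with the hat lock held), so the worst case is an
 * extra, harmless flush.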
10827 */ 10828 if (!hat_lock_held && !tlb_noflush) 10829 hatlockp = sfmmu_hat_enter(sfmmup); 10830 10831 kpreempt_disable(); 10832 if (!tlb_noflush) { 10833 /* 10834 * Flush the TSB and TLB. 10835 */ 10836 SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp); 10837 10838 cpuset = sfmmup->sfmmu_cpusran; 10839 CPUSET_AND(cpuset, cpu_ready_set); 10840 CPUSET_DEL(cpuset, CPU->cpu_id); 10841 10842 SFMMU_XCALL_STATS(sfmmup); 10843 10844 xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)addr, 10845 (uint64_t)sfmmup); 10846 10847 vtag_flushpage(addr, (uint64_t)sfmmup); 10848 10849 } 10850 10851 if (!hat_lock_held && !tlb_noflush) 10852 sfmmu_hat_exit(hatlockp); 10853 10854 10855 /* 10856 * Flush the D$ 10857 * 10858 * Even if the ctx is stolen, we need to flush the 10859 * cache. Our ctx stealer only flushes the TLBs. 10860 */ 10861 if (cache_flush_flag == CACHE_FLUSH) { 10862 if (cpu_flag & FLUSH_ALL_CPUS) { 10863 cpuset = cpu_ready_set; 10864 } else { 10865 cpuset = sfmmup->sfmmu_cpusran; 10866 CPUSET_AND(cpuset, cpu_ready_set); 10867 } 10868 CPUSET_DEL(cpuset, CPU->cpu_id); 10869 SFMMU_XCALL_STATS(sfmmup); 10870 xt_some(cpuset, vac_flushpage_tl1, pfnum, vcolor); 10871 vac_flushpage(pfnum, vcolor); 10872 } 10873 kpreempt_enable(); 10874 } 10875 10876 /* 10877 * Demaps the TSB and flushes all TLBs on all cpus for a particular virtual 10878 * address and ctx. If noflush is set we do not currently do anything. 10879 * This function may or may not be called with the HAT lock held. 10880 */ 10881 static void 10882 sfmmu_tlb_demap(caddr_t addr, sfmmu_t *sfmmup, struct hme_blk *hmeblkp, 10883 int tlb_noflush, int hat_lock_held) 10884 { 10885 cpuset_t cpuset; 10886 hatlock_t *hatlockp; 10887 10888 /* 10889 * If the process is exiting we have nothing to do. 10890 */ 10891 if (tlb_noflush) 10892 return; 10893 10894 /* 10895 * Flush TSB. 10896 */ 10897 if (!hat_lock_held) 10898 hatlockp = sfmmu_hat_enter(sfmmup); 10899 SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp); 10900 10901 kpreempt_disable(); 10902 10903 cpuset = sfmmup->sfmmu_cpusran; 10904 CPUSET_AND(cpuset, cpu_ready_set); 10905 CPUSET_DEL(cpuset, CPU->cpu_id); 10906 10907 SFMMU_XCALL_STATS(sfmmup); 10908 xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)addr, (uint64_t)sfmmup); 10909 10910 vtag_flushpage(addr, (uint64_t)sfmmup); 10911 10912 if (!hat_lock_held) 10913 sfmmu_hat_exit(hatlockp); 10914 10915 kpreempt_enable(); 10916 10917 } 10918 10919 /* 10920 * Special case of sfmmu_tlb_demap for MMU_PAGESIZE hblks. Use the xcall 10921 * call handler that can flush a range of pages to save on xcalls. 10922 */ 10923 static int sfmmu_xcall_save; 10924 10925 static void 10926 sfmmu_tlb_range_demap(demap_range_t *dmrp) 10927 { 10928 sfmmu_t *sfmmup = dmrp->dmr_sfmmup; 10929 hatlock_t *hatlockp; 10930 cpuset_t cpuset; 10931 uint64_t sfmmu_pgcnt; 10932 pgcnt_t pgcnt = 0; 10933 int pgunload = 0; 10934 int dirtypg = 0; 10935 caddr_t addr = dmrp->dmr_addr; 10936 caddr_t eaddr; 10937 uint64_t bitvec = dmrp->dmr_bitvec; 10938 10939 ASSERT(bitvec & 1); 10940 10941 /* 10942 * Flush TSB and calculate number of pages to flush. 10943 */ 10944 while (bitvec != 0) { 10945 dirtypg = 0; 10946 /* 10947 * Find the first page to flush and then count how many 10948 * pages there are after it that also need to be flushed. 10949 * This way the number of TSB flushes is minimized. 
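 *
 * For instance (purely illustrative), with
 *
 *	bitvec = ...0011100111
 *
 * the loops below first consume the run of three dirty pages and
 * unload them with a single sfmmu_unload_tsb_range() call, step
 * over the two clean pages, then unload the next run of three,
 * advancing addr by MMU_PAGESIZE for every bit consumed.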
10950 */ 10951 while ((bitvec & 1) == 0) { 10952 pgcnt++; 10953 addr += MMU_PAGESIZE; 10954 bitvec >>= 1; 10955 } 10956 while (bitvec & 1) { 10957 dirtypg++; 10958 bitvec >>= 1; 10959 } 10960 eaddr = addr + ptob(dirtypg); 10961 hatlockp = sfmmu_hat_enter(sfmmup); 10962 sfmmu_unload_tsb_range(sfmmup, addr, eaddr, TTE8K); 10963 sfmmu_hat_exit(hatlockp); 10964 pgunload += dirtypg; 10965 addr = eaddr; 10966 pgcnt += dirtypg; 10967 } 10968 10969 ASSERT((pgcnt<<MMU_PAGESHIFT) <= dmrp->dmr_endaddr - dmrp->dmr_addr); 10970 if (sfmmup->sfmmu_free == 0) { 10971 addr = dmrp->dmr_addr; 10972 bitvec = dmrp->dmr_bitvec; 10973 10974 /* 10975 * make sure it has SFMMU_PGCNT_SHIFT bits only, 10976 * as it will be used to pack argument for xt_some 10977 */ 10978 ASSERT((pgcnt > 0) && 10979 (pgcnt <= (1 << SFMMU_PGCNT_SHIFT))); 10980 10981 /* 10982 * Encode pgcnt as (pgcnt -1 ), and pass (pgcnt - 1) in 10983 * the low 6 bits of sfmmup. This is doable since pgcnt 10984 * always >= 1. 10985 */ 10986 ASSERT(!((uint64_t)sfmmup & SFMMU_PGCNT_MASK)); 10987 sfmmu_pgcnt = (uint64_t)sfmmup | 10988 ((pgcnt - 1) & SFMMU_PGCNT_MASK); 10989 10990 /* 10991 * We must hold the hat lock during the flush of TLB, 10992 * to avoid a race with sfmmu_invalidate_ctx(), where 10993 * sfmmu_cnum on a MMU could be set to INVALID_CONTEXT, 10994 * causing TLB demap routine to skip flush on that MMU. 10995 * If the context on a MMU has already been set to 10996 * INVALID_CONTEXT, we just get an extra flush on 10997 * that MMU. 10998 */ 10999 hatlockp = sfmmu_hat_enter(sfmmup); 11000 kpreempt_disable(); 11001 11002 cpuset = sfmmup->sfmmu_cpusran; 11003 CPUSET_AND(cpuset, cpu_ready_set); 11004 CPUSET_DEL(cpuset, CPU->cpu_id); 11005 11006 SFMMU_XCALL_STATS(sfmmup); 11007 xt_some(cpuset, vtag_flush_pgcnt_tl1, (uint64_t)addr, 11008 sfmmu_pgcnt); 11009 11010 for (; bitvec != 0; bitvec >>= 1) { 11011 if (bitvec & 1) 11012 vtag_flushpage(addr, (uint64_t)sfmmup); 11013 addr += MMU_PAGESIZE; 11014 } 11015 kpreempt_enable(); 11016 sfmmu_hat_exit(hatlockp); 11017 11018 sfmmu_xcall_save += (pgunload-1); 11019 } 11020 dmrp->dmr_bitvec = 0; 11021 } 11022 11023 /* 11024 * In cases where we need to synchronize with TLB/TSB miss trap 11025 * handlers, _and_ need to flush the TLB, it's a lot easier to 11026 * throw away the context from the process than to do a 11027 * special song and dance to keep things consistent for the 11028 * handlers. 11029 * 11030 * Since the process suddenly ends up without a context and our caller 11031 * holds the hat lock, threads that fault after this function is called 11032 * will pile up on the lock. We can then do whatever we need to 11033 * atomically from the context of the caller. The first blocked thread 11034 * to resume executing will get the process a new context, and the 11035 * process will resume executing. 11036 * 11037 * One added advantage of this approach is that on MMUs that 11038 * support a "flush all" operation, we will delay the flush until 11039 * cnum wrap-around, and then flush the TLB one time. This 11040 * is rather rare, so it's a lot less expensive than making 8000 11041 * x-calls to flush the TLB 8000 times. 11042 * 11043 * A per-process (PP) lock is used to synchronize ctx allocations in 11044 * resume() and ctx invalidations here. 
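 *
 * In outline, the function below does the following (a summary of
 * the code, not additional behaviour):
 *
 *	take the PP lock, set sfmmu_ctxs[i].cnum = INVALID_CONTEXT
 *	    for every context domain, membar, drop the PP lock
 *	cross-call the other cpus in sfmmu_cpusran with
 *	    sfmmu_raise_tsb_exception so they notice the invalidation
 *	if this cpu is currently running the victim hat, set the
 *	    secondary context to INVALID_CONTEXT and clear the
 *	    user tsbinfo as well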
11045 */
11046 static void
11047 sfmmu_invalidate_ctx(sfmmu_t *sfmmup)
11048 {
11049 cpuset_t cpuset;
11050 int cnum, currcnum;
11051 mmu_ctx_t *mmu_ctxp;
11052 int i;
11053 uint_t pstate_save;
11054
11055 SFMMU_STAT(sf_ctx_inv);
11056
11057 ASSERT(sfmmu_hat_lock_held(sfmmup));
11058 ASSERT(sfmmup != ksfmmup);
11059
11060 kpreempt_disable();
11061
11062 mmu_ctxp = CPU_MMU_CTXP(CPU);
11063 ASSERT(mmu_ctxp);
11064 ASSERT(mmu_ctxp->mmu_idx < max_mmu_ctxdoms);
11065 ASSERT(mmu_ctxp == mmu_ctxs_tbl[mmu_ctxp->mmu_idx]);
11066
11067 currcnum = sfmmup->sfmmu_ctxs[mmu_ctxp->mmu_idx].cnum;
11068
11069 pstate_save = sfmmu_disable_intrs();
11070
11071 lock_set(&sfmmup->sfmmu_ctx_lock); /* acquire PP lock */
11072 /* set HAT cnum invalid across all context domains. */
11073 for (i = 0; i < max_mmu_ctxdoms; i++) {
11074
11075 cnum = sfmmup->sfmmu_ctxs[i].cnum;
11076 if (cnum == INVALID_CONTEXT) {
11077 continue;
11078 }
11079
11080 sfmmup->sfmmu_ctxs[i].cnum = INVALID_CONTEXT;
11081 }
11082 membar_enter(); /* make sure globally visible to all CPUs */
11083 lock_clear(&sfmmup->sfmmu_ctx_lock); /* release PP lock */
11084
11085 sfmmu_enable_intrs(pstate_save);
11086
11087 cpuset = sfmmup->sfmmu_cpusran;
11088 CPUSET_DEL(cpuset, CPU->cpu_id);
11089 CPUSET_AND(cpuset, cpu_ready_set);
11090 if (!CPUSET_ISNULL(cpuset)) {
11091 SFMMU_XCALL_STATS(sfmmup);
11092 xt_some(cpuset, sfmmu_raise_tsb_exception,
11093 (uint64_t)sfmmup, INVALID_CONTEXT);
11094 xt_sync(cpuset);
11095 SFMMU_STAT(sf_tsb_raise_exception);
11096 SFMMU_MMU_STAT(mmu_tsb_raise_exception);
11097 }
11098
11099 /*
11100 * If the hat being invalidated is the same as the current
11101 * process on the local CPU, we need to invalidate
11102 * this CPU's context as well.
11103 */
11104 if ((sfmmu_getctx_sec() == currcnum) &&
11105 (currcnum != INVALID_CONTEXT)) {
11106 sfmmu_setctx_sec(INVALID_CONTEXT);
11107 sfmmu_clear_utsbinfo();
11108 }
11109
11110 kpreempt_enable();
11111
11112 /*
11113 * We hold the hat lock, so nobody should be able to allocate
11114 * a context for us yet.
11115 */
11116 ASSERT(sfmmup->sfmmu_ctxs[mmu_ctxp->mmu_idx].cnum == INVALID_CONTEXT);
11117 }
11118
11119 /*
11120 * We need to flush the cache on all cpus. It is possible that
11121 * a process referenced a page as cacheable but has since exited
11122 * and cleared the mapping list. We still need to flush the page, but
11123 * we have no state, so flushing on all cpus is the only alternative.
11124 */
11125 void
11126 sfmmu_cache_flush(pfn_t pfnum, int vcolor)
11127 {
11128 cpuset_t cpuset;
11129
11130 kpreempt_disable();
11131 cpuset = cpu_ready_set;
11132 CPUSET_DEL(cpuset, CPU->cpu_id);
11133 SFMMU_XCALL_STATS(NULL); /* account to any ctx */
11134 xt_some(cpuset, vac_flushpage_tl1, pfnum, vcolor);
11135 xt_sync(cpuset);
11136 vac_flushpage(pfnum, vcolor);
11137 kpreempt_enable();
11138 }
11139
11140 void
11141 sfmmu_cache_flushcolor(int vcolor, pfn_t pfnum)
11142 {
11143 cpuset_t cpuset;
11144
11145 ASSERT(vcolor >= 0);
11146
11147 kpreempt_disable();
11148 cpuset = cpu_ready_set;
11149 CPUSET_DEL(cpuset, CPU->cpu_id);
11150 SFMMU_XCALL_STATS(NULL); /* account to any ctx */
11151 xt_some(cpuset, vac_flushcolor_tl1, vcolor, pfnum);
11152 xt_sync(cpuset);
11153 vac_flushcolor(vcolor, pfnum);
11154 kpreempt_enable();
11155 }
11156
11157 /*
11158 * We need to prevent processes from accessing the TSB using a cached physical
11159 * address. It's alright if they try to access the TSB via virtual address
11160 * since they will just fault on that virtual address once the mapping has
11161 * been suspended.
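 *
 * The pre/post relocation callbacks below pair up roughly as
 * follows (illustrative outline):
 *
 *	sfmmu_tsb_pre_relocator()	set TSB_RELOC_FLAG, wait out any
 *					active Cheetah+ mondo recovery,
 *					then invalidate the context so no
 *					cpu keeps using a cached TSB PA
 *	(kernel relocates the TSB page)
 *	sfmmu_tsb_post_relocator()	recompute the physical address via
 *					sfmmu_tsbinfo_setup_phys(), clear
 *					TSB_RELOC_FLAG and wake waiters
 *					on sfmmu_tsb_cv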
11162 */ 11163 #pragma weak sendmondo_in_recover 11164 11165 /* ARGSUSED */ 11166 static int 11167 sfmmu_tsb_pre_relocator(caddr_t va, uint_t tsbsz, uint_t flags, void *tsbinfo) 11168 { 11169 hatlock_t *hatlockp; 11170 struct tsb_info *tsbinfop = (struct tsb_info *)tsbinfo; 11171 sfmmu_t *sfmmup = tsbinfop->tsb_sfmmu; 11172 extern uint32_t sendmondo_in_recover; 11173 11174 if (flags != HAT_PRESUSPEND) 11175 return (0); 11176 11177 hatlockp = sfmmu_hat_enter(sfmmup); 11178 11179 tsbinfop->tsb_flags |= TSB_RELOC_FLAG; 11180 11181 /* 11182 * For Cheetah+ Erratum 25: 11183 * Wait for any active recovery to finish. We can't risk 11184 * relocating the TSB of the thread running mondo_recover_proc() 11185 * since, if we did that, we would deadlock. The scenario we are 11186 * trying to avoid is as follows: 11187 * 11188 * THIS CPU RECOVER CPU 11189 * -------- ----------- 11190 * Begins recovery, walking through TSB 11191 * hat_pagesuspend() TSB TTE 11192 * TLB miss on TSB TTE, spins at TL1 11193 * xt_sync() 11194 * send_mondo_timeout() 11195 * mondo_recover_proc() 11196 * ((deadlocked)) 11197 * 11198 * The second half of the workaround is that mondo_recover_proc() 11199 * checks to see if the tsb_info has the RELOC flag set, and if it 11200 * does, it skips over that TSB without ever touching tsbinfop->tsb_va 11201 * and hence avoiding the TLB miss that could result in a deadlock. 11202 */ 11203 if (&sendmondo_in_recover) { 11204 membar_enter(); /* make sure RELOC flag visible */ 11205 while (sendmondo_in_recover) { 11206 drv_usecwait(1); 11207 membar_consumer(); 11208 } 11209 } 11210 11211 sfmmu_invalidate_ctx(sfmmup); 11212 sfmmu_hat_exit(hatlockp); 11213 11214 return (0); 11215 } 11216 11217 /* ARGSUSED */ 11218 static int 11219 sfmmu_tsb_post_relocator(caddr_t va, uint_t tsbsz, uint_t flags, 11220 void *tsbinfo, pfn_t newpfn) 11221 { 11222 hatlock_t *hatlockp; 11223 struct tsb_info *tsbinfop = (struct tsb_info *)tsbinfo; 11224 sfmmu_t *sfmmup = tsbinfop->tsb_sfmmu; 11225 11226 if (flags != HAT_POSTUNSUSPEND) 11227 return (0); 11228 11229 hatlockp = sfmmu_hat_enter(sfmmup); 11230 11231 SFMMU_STAT(sf_tsb_reloc); 11232 11233 /* 11234 * The process may have swapped out while we were relocating one 11235 * of its TSBs. If so, don't bother doing the setup since the 11236 * process can't be using the memory anymore. 11237 */ 11238 if ((tsbinfop->tsb_flags & TSB_SWAPPED) == 0) { 11239 ASSERT(va == tsbinfop->tsb_va); 11240 sfmmu_tsbinfo_setup_phys(tsbinfop, newpfn); 11241 sfmmu_setup_tsbinfo(sfmmup); 11242 11243 if (tsbinfop->tsb_flags & TSB_FLUSH_NEEDED) { 11244 sfmmu_inv_tsb(tsbinfop->tsb_va, 11245 TSB_BYTES(tsbinfop->tsb_szc)); 11246 tsbinfop->tsb_flags &= ~TSB_FLUSH_NEEDED; 11247 } 11248 } 11249 11250 membar_exit(); 11251 tsbinfop->tsb_flags &= ~TSB_RELOC_FLAG; 11252 cv_broadcast(&sfmmup->sfmmu_tsb_cv); 11253 11254 sfmmu_hat_exit(hatlockp); 11255 11256 return (0); 11257 } 11258 11259 /* 11260 * Allocate and initialize a tsb_info structure. Note that we may or may not 11261 * allocate a TSB here, depending on the flags passed in. 
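 *
 * A minimal caller sketch (illustrative only; error handling elided,
 * and the particular size code, tte mask and TSB_GROW flag are just
 * examples of reasonable arguments):
 *
 *	struct tsb_info *tsbinfop;
 *
 *	if (sfmmu_tsbinfo_alloc(&tsbinfop, TSB_MIN_SZCODE,
 *	    TSB8K|TSB64K|TSB512K, TSB_GROW, sfmmup) != 0)
 *		back off, e.g. retry with a smaller size code
 *	otherwise link tsbinfop onto sfmmup->sfmmu_tsb under the
 *	hat lock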
11262 */ 11263 static int 11264 sfmmu_tsbinfo_alloc(struct tsb_info **tsbinfopp, int tsb_szc, int tte_sz_mask, 11265 uint_t flags, sfmmu_t *sfmmup) 11266 { 11267 int err; 11268 11269 *tsbinfopp = (struct tsb_info *)kmem_cache_alloc( 11270 sfmmu_tsbinfo_cache, KM_SLEEP); 11271 11272 if ((err = sfmmu_init_tsbinfo(*tsbinfopp, tte_sz_mask, 11273 tsb_szc, flags, sfmmup)) != 0) { 11274 kmem_cache_free(sfmmu_tsbinfo_cache, *tsbinfopp); 11275 SFMMU_STAT(sf_tsb_allocfail); 11276 *tsbinfopp = NULL; 11277 return (err); 11278 } 11279 SFMMU_STAT(sf_tsb_alloc); 11280 11281 /* 11282 * Bump the TSB size counters for this TSB size. 11283 */ 11284 (*(((int *)&sfmmu_tsbsize_stat) + tsb_szc))++; 11285 return (0); 11286 } 11287 11288 static void 11289 sfmmu_tsb_free(struct tsb_info *tsbinfo) 11290 { 11291 caddr_t tsbva = tsbinfo->tsb_va; 11292 uint_t tsb_size = TSB_BYTES(tsbinfo->tsb_szc); 11293 struct kmem_cache *kmem_cachep = tsbinfo->tsb_cache; 11294 vmem_t *vmp = tsbinfo->tsb_vmp; 11295 11296 /* 11297 * If we allocated this TSB from relocatable kernel memory, then we 11298 * need to uninstall the callback handler. 11299 */ 11300 if (tsbinfo->tsb_cache != sfmmu_tsb8k_cache) { 11301 uintptr_t slab_mask = ~((uintptr_t)tsb_slab_mask) << PAGESHIFT; 11302 caddr_t slab_vaddr = (caddr_t)((uintptr_t)tsbva & slab_mask); 11303 page_t **ppl; 11304 int ret; 11305 11306 ret = as_pagelock(&kas, &ppl, slab_vaddr, PAGESIZE, S_WRITE); 11307 ASSERT(ret == 0); 11308 hat_delete_callback(tsbva, (uint_t)tsb_size, (void *)tsbinfo, 11309 0); 11310 as_pageunlock(&kas, ppl, slab_vaddr, PAGESIZE, S_WRITE); 11311 } 11312 11313 if (kmem_cachep != NULL) { 11314 kmem_cache_free(kmem_cachep, tsbva); 11315 } else { 11316 vmem_xfree(vmp, (void *)tsbva, tsb_size); 11317 } 11318 tsbinfo->tsb_va = (caddr_t)0xbad00bad; 11319 atomic_add_64(&tsb_alloc_bytes, -(int64_t)tsb_size); 11320 } 11321 11322 static void 11323 sfmmu_tsbinfo_free(struct tsb_info *tsbinfo) 11324 { 11325 if ((tsbinfo->tsb_flags & TSB_SWAPPED) == 0) { 11326 sfmmu_tsb_free(tsbinfo); 11327 } 11328 kmem_cache_free(sfmmu_tsbinfo_cache, tsbinfo); 11329 11330 } 11331 11332 /* 11333 * Setup all the references to physical memory for this tsbinfo. 11334 * The underlying page(s) must be locked. 11335 */ 11336 static void 11337 sfmmu_tsbinfo_setup_phys(struct tsb_info *tsbinfo, pfn_t pfn) 11338 { 11339 ASSERT(pfn != PFN_INVALID); 11340 ASSERT(pfn == va_to_pfn(tsbinfo->tsb_va)); 11341 11342 #ifndef sun4v 11343 if (tsbinfo->tsb_szc == 0) { 11344 sfmmu_memtte(&tsbinfo->tsb_tte, pfn, 11345 PROT_WRITE|PROT_READ, TTE8K); 11346 } else { 11347 /* 11348 * Round down PA and use a large mapping; the handlers will 11349 * compute the TSB pointer at the correct offset into the 11350 * big virtual page. NOTE: this assumes all TSBs larger 11351 * than 8K must come from physically contiguous slabs of 11352 * size tsb_slab_size. 
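 *
 * As a purely illustrative example: with 4M TSB slabs (so
 * tsb_slab_mask covers the low 9 pfn bits), a 64K TSB whose first
 * page is pfn 0x12345 gets a TTE for pfn 0x12200, the base of its
 * slab; the miss handlers then add the TSB's offset within the
 * large page when forming the entry pointer.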
11353 */ 11354 sfmmu_memtte(&tsbinfo->tsb_tte, pfn & ~tsb_slab_mask, 11355 PROT_WRITE|PROT_READ, tsb_slab_ttesz); 11356 } 11357 tsbinfo->tsb_pa = ptob(pfn); 11358 11359 TTE_SET_LOCKED(&tsbinfo->tsb_tte); /* lock the tte into dtlb */ 11360 TTE_SET_MOD(&tsbinfo->tsb_tte); /* enable writes */ 11361 11362 ASSERT(TTE_IS_PRIVILEGED(&tsbinfo->tsb_tte)); 11363 ASSERT(TTE_IS_LOCKED(&tsbinfo->tsb_tte)); 11364 #else /* sun4v */ 11365 tsbinfo->tsb_pa = ptob(pfn); 11366 #endif /* sun4v */ 11367 } 11368 11369 11370 /* 11371 * Returns zero on success, ENOMEM if over the high water mark, 11372 * or EAGAIN if the caller needs to retry with a smaller TSB 11373 * size (or specify TSB_FORCEALLOC if the allocation can't fail). 11374 * 11375 * This call cannot fail to allocate a TSB if TSB_FORCEALLOC 11376 * is specified and the TSB requested is PAGESIZE, though it 11377 * may sleep waiting for memory if sufficient memory is not 11378 * available. 11379 */ 11380 static int 11381 sfmmu_init_tsbinfo(struct tsb_info *tsbinfo, int tteszmask, 11382 int tsbcode, uint_t flags, sfmmu_t *sfmmup) 11383 { 11384 caddr_t vaddr = NULL; 11385 caddr_t slab_vaddr; 11386 uintptr_t slab_mask = ~((uintptr_t)tsb_slab_mask) << PAGESHIFT; 11387 int tsbbytes = TSB_BYTES(tsbcode); 11388 int lowmem = 0; 11389 struct kmem_cache *kmem_cachep = NULL; 11390 vmem_t *vmp = NULL; 11391 lgrp_id_t lgrpid = LGRP_NONE; 11392 pfn_t pfn; 11393 uint_t cbflags = HAC_SLEEP; 11394 page_t **pplist; 11395 int ret; 11396 11397 if (flags & (TSB_FORCEALLOC | TSB_SWAPIN | TSB_GROW | TSB_SHRINK)) 11398 flags |= TSB_ALLOC; 11399 11400 ASSERT((flags & TSB_FORCEALLOC) == 0 || tsbcode == TSB_MIN_SZCODE); 11401 11402 tsbinfo->tsb_sfmmu = sfmmup; 11403 11404 /* 11405 * If not allocating a TSB, set up the tsbinfo, set TSB_SWAPPED, and 11406 * return. 11407 */ 11408 if ((flags & TSB_ALLOC) == 0) { 11409 tsbinfo->tsb_szc = tsbcode; 11410 tsbinfo->tsb_ttesz_mask = tteszmask; 11411 tsbinfo->tsb_va = (caddr_t)0xbadbadbeef; 11412 tsbinfo->tsb_pa = -1; 11413 tsbinfo->tsb_tte.ll = 0; 11414 tsbinfo->tsb_next = NULL; 11415 tsbinfo->tsb_flags = TSB_SWAPPED; 11416 tsbinfo->tsb_cache = NULL; 11417 tsbinfo->tsb_vmp = NULL; 11418 return (0); 11419 } 11420 11421 #ifdef DEBUG 11422 /* 11423 * For debugging: 11424 * Randomly force allocation failures every tsb_alloc_mtbf 11425 * tries if TSB_FORCEALLOC is not specified. This will 11426 * return ENOMEM if tsb_alloc_mtbf is odd, or EAGAIN if 11427 * it is even, to allow testing of both failure paths... 11428 */ 11429 if (tsb_alloc_mtbf && ((flags & TSB_FORCEALLOC) == 0) && 11430 (tsb_alloc_count++ == tsb_alloc_mtbf)) { 11431 tsb_alloc_count = 0; 11432 tsb_alloc_fail_mtbf++; 11433 return ((tsb_alloc_mtbf & 1)? ENOMEM : EAGAIN); 11434 } 11435 #endif /* DEBUG */ 11436 11437 /* 11438 * Enforce high water mark if we are not doing a forced allocation 11439 * and are not shrinking a process' TSB. 11440 */ 11441 if ((flags & TSB_SHRINK) == 0 && 11442 (tsbbytes + tsb_alloc_bytes) > tsb_alloc_hiwater) { 11443 if ((flags & TSB_FORCEALLOC) == 0) 11444 return (ENOMEM); 11445 lowmem = 1; 11446 } 11447 11448 /* 11449 * Allocate from the correct location based upon the size of the TSB 11450 * compared to the base page size, and what memory conditions dictate. 11451 * Note we always do nonblocking allocations from the TSB arena since 11452 * we don't want memory fragmentation to cause processes to block 11453 * indefinitely waiting for memory; until the kernel algorithms that 11454 * coalesce large pages are improved this is our best option. 
11455 * 11456 * Algorithm: 11457 * If allocating a "large" TSB (>8K), allocate from the 11458 * appropriate kmem_tsb_default_arena vmem arena 11459 * else if low on memory or the TSB_FORCEALLOC flag is set or 11460 * tsb_forceheap is set 11461 * Allocate from kernel heap via sfmmu_tsb8k_cache with 11462 * KM_SLEEP (never fails) 11463 * else 11464 * Allocate from appropriate sfmmu_tsb_cache with 11465 * KM_NOSLEEP 11466 * endif 11467 */ 11468 if (tsb_lgrp_affinity) 11469 lgrpid = lgrp_home_id(curthread); 11470 if (lgrpid == LGRP_NONE) 11471 lgrpid = 0; /* use lgrp of boot CPU */ 11472 11473 if (tsbbytes > MMU_PAGESIZE) { 11474 vmp = kmem_tsb_default_arena[lgrpid]; 11475 vaddr = (caddr_t)vmem_xalloc(vmp, tsbbytes, tsbbytes, 0, 0, 11476 NULL, NULL, VM_NOSLEEP); 11477 #ifdef DEBUG 11478 } else if (lowmem || (flags & TSB_FORCEALLOC) || tsb_forceheap) { 11479 #else /* !DEBUG */ 11480 } else if (lowmem || (flags & TSB_FORCEALLOC)) { 11481 #endif /* DEBUG */ 11482 kmem_cachep = sfmmu_tsb8k_cache; 11483 vaddr = (caddr_t)kmem_cache_alloc(kmem_cachep, KM_SLEEP); 11484 ASSERT(vaddr != NULL); 11485 } else { 11486 kmem_cachep = sfmmu_tsb_cache[lgrpid]; 11487 vaddr = (caddr_t)kmem_cache_alloc(kmem_cachep, KM_NOSLEEP); 11488 } 11489 11490 tsbinfo->tsb_cache = kmem_cachep; 11491 tsbinfo->tsb_vmp = vmp; 11492 11493 if (vaddr == NULL) { 11494 return (EAGAIN); 11495 } 11496 11497 atomic_add_64(&tsb_alloc_bytes, (int64_t)tsbbytes); 11498 kmem_cachep = tsbinfo->tsb_cache; 11499 11500 /* 11501 * If we are allocating from outside the cage, then we need to 11502 * register a relocation callback handler. Note that for now 11503 * since pseudo mappings always hang off of the slab's root page, 11504 * we need only lock the first 8K of the TSB slab. This is a bit 11505 * hacky but it is good for performance. 11506 */ 11507 if (kmem_cachep != sfmmu_tsb8k_cache) { 11508 slab_vaddr = (caddr_t)((uintptr_t)vaddr & slab_mask); 11509 ret = as_pagelock(&kas, &pplist, slab_vaddr, PAGESIZE, S_WRITE); 11510 ASSERT(ret == 0); 11511 ret = hat_add_callback(sfmmu_tsb_cb_id, vaddr, (uint_t)tsbbytes, 11512 cbflags, (void *)tsbinfo, &pfn); 11513 11514 /* 11515 * Need to free up resources if we could not successfully 11516 * add the callback function and return an error condition. 11517 */ 11518 if (ret != 0) { 11519 if (kmem_cachep) { 11520 kmem_cache_free(kmem_cachep, vaddr); 11521 } else { 11522 vmem_xfree(vmp, (void *)vaddr, tsbbytes); 11523 } 11524 as_pageunlock(&kas, pplist, slab_vaddr, PAGESIZE, 11525 S_WRITE); 11526 return (EAGAIN); 11527 } 11528 } else { 11529 /* 11530 * Since allocation of 8K TSBs from heap is rare and occurs 11531 * during memory pressure we allocate them from permanent 11532 * memory rather than using callbacks to get the PFN. 
11533 */ 11534 pfn = hat_getpfnum(kas.a_hat, vaddr); 11535 } 11536 11537 tsbinfo->tsb_va = vaddr; 11538 tsbinfo->tsb_szc = tsbcode; 11539 tsbinfo->tsb_ttesz_mask = tteszmask; 11540 tsbinfo->tsb_next = NULL; 11541 tsbinfo->tsb_flags = 0; 11542 11543 sfmmu_tsbinfo_setup_phys(tsbinfo, pfn); 11544 11545 if (kmem_cachep != sfmmu_tsb8k_cache) { 11546 as_pageunlock(&kas, pplist, slab_vaddr, PAGESIZE, S_WRITE); 11547 } 11548 11549 sfmmu_inv_tsb(vaddr, tsbbytes); 11550 return (0); 11551 } 11552 11553 /* 11554 * Initialize per cpu tsb and per cpu tsbmiss_area 11555 */ 11556 void 11557 sfmmu_init_tsbs(void) 11558 { 11559 int i; 11560 struct tsbmiss *tsbmissp; 11561 struct kpmtsbm *kpmtsbmp; 11562 #ifndef sun4v 11563 extern int dcache_line_mask; 11564 #endif /* sun4v */ 11565 extern uint_t vac_colors; 11566 11567 /* 11568 * Init. tsb miss area. 11569 */ 11570 tsbmissp = tsbmiss_area; 11571 11572 for (i = 0; i < NCPU; tsbmissp++, i++) { 11573 /* 11574 * initialize the tsbmiss area. 11575 * Do this for all possible CPUs as some may be added 11576 * while the system is running. There is no cost to this. 11577 */ 11578 tsbmissp->ksfmmup = ksfmmup; 11579 #ifndef sun4v 11580 tsbmissp->dcache_line_mask = (uint16_t)dcache_line_mask; 11581 #endif /* sun4v */ 11582 tsbmissp->khashstart = 11583 (struct hmehash_bucket *)va_to_pa((caddr_t)khme_hash); 11584 tsbmissp->uhashstart = 11585 (struct hmehash_bucket *)va_to_pa((caddr_t)uhme_hash); 11586 tsbmissp->khashsz = khmehash_num; 11587 tsbmissp->uhashsz = uhmehash_num; 11588 } 11589 11590 sfmmu_tsb_cb_id = hat_register_callback('T'<<16 | 'S' << 8 | 'B', 11591 sfmmu_tsb_pre_relocator, sfmmu_tsb_post_relocator, NULL, 0); 11592 11593 if (kpm_enable == 0) 11594 return; 11595 11596 /* -- Begin KPM specific init -- */ 11597 11598 if (kpm_smallpages) { 11599 /* 11600 * If we're using base pagesize pages for seg_kpm 11601 * mappings, we use the kernel TSB since we can't afford 11602 * to allocate a second huge TSB for these mappings. 11603 */ 11604 kpm_tsbbase = ktsb_phys? ktsb_pbase : (uint64_t)ktsb_base; 11605 kpm_tsbsz = ktsb_szcode; 11606 kpmsm_tsbbase = kpm_tsbbase; 11607 kpmsm_tsbsz = kpm_tsbsz; 11608 } else { 11609 /* 11610 * In VAC conflict case, just put the entries in the 11611 * kernel 8K indexed TSB for now so we can find them. 11612 * This could really be changed in the future if we feel 11613 * the need... 11614 */ 11615 kpmsm_tsbbase = ktsb_phys? ktsb_pbase : (uint64_t)ktsb_base; 11616 kpmsm_tsbsz = ktsb_szcode; 11617 kpm_tsbbase = ktsb_phys? ktsb4m_pbase : (uint64_t)ktsb4m_base; 11618 kpm_tsbsz = ktsb4m_szcode; 11619 } 11620 11621 kpmtsbmp = kpmtsbm_area; 11622 for (i = 0; i < NCPU; kpmtsbmp++, i++) { 11623 /* 11624 * Initialize the kpmtsbm area. 11625 * Do this for all possible CPUs as some may be added 11626 * while the system is running. There is no cost to this. 11627 */ 11628 kpmtsbmp->vbase = kpm_vbase; 11629 kpmtsbmp->vend = kpm_vbase + kpm_size * vac_colors; 11630 kpmtsbmp->sz_shift = kpm_size_shift; 11631 kpmtsbmp->kpmp_shift = kpmp_shift; 11632 kpmtsbmp->kpmp2pshft = (uchar_t)kpmp2pshft; 11633 if (kpm_smallpages == 0) { 11634 kpmtsbmp->kpmp_table_sz = kpmp_table_sz; 11635 kpmtsbmp->kpmp_tablepa = va_to_pa(kpmp_table); 11636 } else { 11637 kpmtsbmp->kpmp_table_sz = kpmp_stable_sz; 11638 kpmtsbmp->kpmp_tablepa = va_to_pa(kpmp_stable); 11639 } 11640 kpmtsbmp->msegphashpa = va_to_pa(memseg_phash); 11641 kpmtsbmp->flags = KPMTSBM_ENABLE_FLAG; 11642 #ifdef DEBUG 11643 kpmtsbmp->flags |= (kpm_tsbmtl) ? 
KPMTSBM_TLTSBM_FLAG : 0; 11644 #endif /* DEBUG */ 11645 if (ktsb_phys) 11646 kpmtsbmp->flags |= KPMTSBM_TSBPHYS_FLAG; 11647 } 11648 11649 /* -- End KPM specific init -- */ 11650 } 11651 11652 /* Avoid using sfmmu_tsbinfo_alloc() to avoid kmem_alloc - no real reason */ 11653 struct tsb_info ktsb_info[2]; 11654 11655 /* 11656 * Called from hat_kern_setup() to setup the tsb_info for ksfmmup. 11657 */ 11658 void 11659 sfmmu_init_ktsbinfo() 11660 { 11661 ASSERT(ksfmmup != NULL); 11662 ASSERT(ksfmmup->sfmmu_tsb == NULL); 11663 /* 11664 * Allocate tsbinfos for kernel and copy in data 11665 * to make debug easier and sun4v setup easier. 11666 */ 11667 ktsb_info[0].tsb_sfmmu = ksfmmup; 11668 ktsb_info[0].tsb_szc = ktsb_szcode; 11669 ktsb_info[0].tsb_ttesz_mask = TSB8K|TSB64K|TSB512K; 11670 ktsb_info[0].tsb_va = ktsb_base; 11671 ktsb_info[0].tsb_pa = ktsb_pbase; 11672 ktsb_info[0].tsb_flags = 0; 11673 ktsb_info[0].tsb_tte.ll = 0; 11674 ktsb_info[0].tsb_cache = NULL; 11675 11676 ktsb_info[1].tsb_sfmmu = ksfmmup; 11677 ktsb_info[1].tsb_szc = ktsb4m_szcode; 11678 ktsb_info[1].tsb_ttesz_mask = TSB4M; 11679 ktsb_info[1].tsb_va = ktsb4m_base; 11680 ktsb_info[1].tsb_pa = ktsb4m_pbase; 11681 ktsb_info[1].tsb_flags = 0; 11682 ktsb_info[1].tsb_tte.ll = 0; 11683 ktsb_info[1].tsb_cache = NULL; 11684 11685 /* Link them into ksfmmup. */ 11686 ktsb_info[0].tsb_next = &ktsb_info[1]; 11687 ktsb_info[1].tsb_next = NULL; 11688 ksfmmup->sfmmu_tsb = &ktsb_info[0]; 11689 11690 sfmmu_setup_tsbinfo(ksfmmup); 11691 } 11692 11693 /* 11694 * Cache the last value returned from va_to_pa(). If the VA specified 11695 * in the current call to cached_va_to_pa() maps to the same Page (as the 11696 * previous call to cached_va_to_pa()), then compute the PA using 11697 * cached info, else call va_to_pa(). 11698 * 11699 * Note: this function is neither MT-safe nor consistent in the presence 11700 * of multiple, interleaved threads. This function was created to enable 11701 * an optimization used during boot (at a point when there's only one thread 11702 * executing on the "boot CPU", and before startup_vm() has been called). 11703 */ 11704 static uint64_t 11705 cached_va_to_pa(void *vaddr) 11706 { 11707 static uint64_t prev_vaddr_base = 0; 11708 static uint64_t prev_pfn = 0; 11709 11710 if ((((uint64_t)vaddr) & MMU_PAGEMASK) == prev_vaddr_base) { 11711 return (prev_pfn | ((uint64_t)vaddr & MMU_PAGEOFFSET)); 11712 } else { 11713 uint64_t pa = va_to_pa(vaddr); 11714 11715 if (pa != ((uint64_t)-1)) { 11716 /* 11717 * Computed physical address is valid. Cache its 11718 * related info for the next cached_va_to_pa() call. 11719 */ 11720 prev_pfn = pa & MMU_PAGEMASK; 11721 prev_vaddr_base = ((uint64_t)vaddr) & MMU_PAGEMASK; 11722 } 11723 11724 return (pa); 11725 } 11726 } 11727 11728 /* 11729 * Carve up our nucleus hblk region. We may allocate more hblks than 11730 * asked due to rounding errors but we are guaranteed to have at least 11731 * enough space to allocate the requested number of hblk8's and hblk1's. 
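 *
 * A made-up example of the carve-up below (the structure sizes are
 * illustrative, not the real HME8BLK_SZ/HME1BLK_SZ values): with
 * size = 64K, hme8blk_sz = 184, hme1blk_sz = 104 and nhblk1 = 10,
 * hblk8_bound = 65536 - 10*104 - 184 = 64312, so 350 hblk8's are
 * carved out first and whatever space remains past them is cut
 * into hblk1's.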
11732 */ 11733 void 11734 sfmmu_init_nucleus_hblks(caddr_t addr, size_t size, int nhblk8, int nhblk1) 11735 { 11736 struct hme_blk *hmeblkp; 11737 size_t hme8blk_sz, hme1blk_sz; 11738 size_t i; 11739 size_t hblk8_bound; 11740 ulong_t j = 0, k = 0; 11741 11742 ASSERT(addr != NULL && size != 0); 11743 11744 /* Need to use proper structure alignment */ 11745 hme8blk_sz = roundup(HME8BLK_SZ, sizeof (int64_t)); 11746 hme1blk_sz = roundup(HME1BLK_SZ, sizeof (int64_t)); 11747 11748 nucleus_hblk8.list = (void *)addr; 11749 nucleus_hblk8.index = 0; 11750 11751 /* 11752 * Use as much memory as possible for hblk8's since we 11753 * expect all bop_alloc'ed memory to be allocated in 8k chunks. 11754 * We need to hold back enough space for the hblk1's which 11755 * we'll allocate next. 11756 */ 11757 hblk8_bound = size - (nhblk1 * hme1blk_sz) - hme8blk_sz; 11758 for (i = 0; i <= hblk8_bound; i += hme8blk_sz, j++) { 11759 hmeblkp = (struct hme_blk *)addr; 11760 addr += hme8blk_sz; 11761 hmeblkp->hblk_nuc_bit = 1; 11762 hmeblkp->hblk_nextpa = cached_va_to_pa((caddr_t)hmeblkp); 11763 } 11764 nucleus_hblk8.len = j; 11765 ASSERT(j >= nhblk8); 11766 SFMMU_STAT_ADD(sf_hblk8_ncreate, j); 11767 11768 nucleus_hblk1.list = (void *)addr; 11769 nucleus_hblk1.index = 0; 11770 for (; i <= (size - hme1blk_sz); i += hme1blk_sz, k++) { 11771 hmeblkp = (struct hme_blk *)addr; 11772 addr += hme1blk_sz; 11773 hmeblkp->hblk_nuc_bit = 1; 11774 hmeblkp->hblk_nextpa = cached_va_to_pa((caddr_t)hmeblkp); 11775 } 11776 ASSERT(k >= nhblk1); 11777 nucleus_hblk1.len = k; 11778 SFMMU_STAT_ADD(sf_hblk1_ncreate, k); 11779 } 11780 11781 /* 11782 * This function is currently not supported on this platform. For what 11783 * it's supposed to do, see hat.c and hat_srmmu.c 11784 */ 11785 /* ARGSUSED */ 11786 faultcode_t 11787 hat_softlock(struct hat *hat, caddr_t addr, size_t *lenp, page_t **ppp, 11788 uint_t flags) 11789 { 11790 ASSERT(hat->sfmmu_xhat_provider == NULL); 11791 return (FC_NOSUPPORT); 11792 } 11793 11794 /* 11795 * Searchs the mapping list of the page for a mapping of the same size. If not 11796 * found the corresponding bit is cleared in the p_index field. When large 11797 * pages are more prevalent in the system, we can maintain the mapping list 11798 * in order and we don't have to traverse the list each time. Just check the 11799 * next and prev entries, and if both are of different size, we clear the bit. 11800 */ 11801 static void 11802 sfmmu_rm_large_mappings(page_t *pp, int ttesz) 11803 { 11804 struct sf_hment *sfhmep; 11805 struct hme_blk *hmeblkp; 11806 int index; 11807 pgcnt_t npgs; 11808 11809 ASSERT(ttesz > TTE8K); 11810 11811 ASSERT(sfmmu_mlist_held(pp)); 11812 11813 ASSERT(PP_ISMAPPED_LARGE(pp)); 11814 11815 /* 11816 * Traverse mapping list looking for another mapping of same size. 11817 * since we only want to clear index field if all mappings of 11818 * that size are gone. 11819 */ 11820 11821 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) { 11822 hmeblkp = sfmmu_hmetohblk(sfhmep); 11823 if (hmeblkp->hblk_xhat_bit) 11824 continue; 11825 if (hme_size(sfhmep) == ttesz) { 11826 /* 11827 * another mapping of the same size. don't clear index. 11828 */ 11829 return; 11830 } 11831 } 11832 11833 /* 11834 * Clear the p_index bit for large page. 
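 * For a 4M mapping, for example, the loop below visits all
 * TTEPAGES(TTE4M) constituent pages and clears the
 * PAGESZ_TO_INDEX(TTE4M) bit in each page's p_index, now that we
 * know no other 4M mapping of this large page remains.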
11835 */ 11836 index = PAGESZ_TO_INDEX(ttesz); 11837 npgs = TTEPAGES(ttesz); 11838 while (npgs-- > 0) { 11839 ASSERT(pp->p_index & index); 11840 pp->p_index &= ~index; 11841 pp = PP_PAGENEXT(pp); 11842 } 11843 } 11844 11845 /* 11846 * return supported features 11847 */ 11848 /* ARGSUSED */ 11849 int 11850 hat_supported(enum hat_features feature, void *arg) 11851 { 11852 switch (feature) { 11853 case HAT_SHARED_PT: 11854 case HAT_DYNAMIC_ISM_UNMAP: 11855 case HAT_VMODSORT: 11856 return (1); 11857 default: 11858 return (0); 11859 } 11860 } 11861 11862 void 11863 hat_enter(struct hat *hat) 11864 { 11865 hatlock_t *hatlockp; 11866 11867 if (hat != ksfmmup) { 11868 hatlockp = TSB_HASH(hat); 11869 mutex_enter(HATLOCK_MUTEXP(hatlockp)); 11870 } 11871 } 11872 11873 void 11874 hat_exit(struct hat *hat) 11875 { 11876 hatlock_t *hatlockp; 11877 11878 if (hat != ksfmmup) { 11879 hatlockp = TSB_HASH(hat); 11880 mutex_exit(HATLOCK_MUTEXP(hatlockp)); 11881 } 11882 } 11883 11884 /*ARGSUSED*/ 11885 void 11886 hat_reserve(struct as *as, caddr_t addr, size_t len) 11887 { 11888 } 11889 11890 static void 11891 hat_kstat_init(void) 11892 { 11893 kstat_t *ksp; 11894 11895 ksp = kstat_create("unix", 0, "sfmmu_global_stat", "hat", 11896 KSTAT_TYPE_RAW, sizeof (struct sfmmu_global_stat), 11897 KSTAT_FLAG_VIRTUAL); 11898 if (ksp) { 11899 ksp->ks_data = (void *) &sfmmu_global_stat; 11900 kstat_install(ksp); 11901 } 11902 ksp = kstat_create("unix", 0, "sfmmu_tsbsize_stat", "hat", 11903 KSTAT_TYPE_RAW, sizeof (struct sfmmu_tsbsize_stat), 11904 KSTAT_FLAG_VIRTUAL); 11905 if (ksp) { 11906 ksp->ks_data = (void *) &sfmmu_tsbsize_stat; 11907 kstat_install(ksp); 11908 } 11909 ksp = kstat_create("unix", 0, "sfmmu_percpu_stat", "hat", 11910 KSTAT_TYPE_RAW, sizeof (struct sfmmu_percpu_stat) * NCPU, 11911 KSTAT_FLAG_WRITABLE); 11912 if (ksp) { 11913 ksp->ks_update = sfmmu_kstat_percpu_update; 11914 kstat_install(ksp); 11915 } 11916 } 11917 11918 /* ARGSUSED */ 11919 static int 11920 sfmmu_kstat_percpu_update(kstat_t *ksp, int rw) 11921 { 11922 struct sfmmu_percpu_stat *cpu_kstat = ksp->ks_data; 11923 struct tsbmiss *tsbm = tsbmiss_area; 11924 struct kpmtsbm *kpmtsbm = kpmtsbm_area; 11925 int i; 11926 11927 ASSERT(cpu_kstat); 11928 if (rw == KSTAT_READ) { 11929 for (i = 0; i < NCPU; cpu_kstat++, tsbm++, kpmtsbm++, i++) { 11930 cpu_kstat->sf_itlb_misses = tsbm->itlb_misses; 11931 cpu_kstat->sf_dtlb_misses = tsbm->dtlb_misses; 11932 cpu_kstat->sf_utsb_misses = tsbm->utsb_misses - 11933 tsbm->uprot_traps; 11934 cpu_kstat->sf_ktsb_misses = tsbm->ktsb_misses + 11935 kpmtsbm->kpm_tsb_misses - tsbm->kprot_traps; 11936 11937 if (tsbm->itlb_misses > 0 && tsbm->dtlb_misses > 0) { 11938 cpu_kstat->sf_tsb_hits = 11939 (tsbm->itlb_misses + tsbm->dtlb_misses) - 11940 (tsbm->utsb_misses + tsbm->ktsb_misses + 11941 kpmtsbm->kpm_tsb_misses); 11942 } else { 11943 cpu_kstat->sf_tsb_hits = 0; 11944 } 11945 cpu_kstat->sf_umod_faults = tsbm->uprot_traps; 11946 cpu_kstat->sf_kmod_faults = tsbm->kprot_traps; 11947 } 11948 } else { 11949 /* KSTAT_WRITE is used to clear stats */ 11950 for (i = 0; i < NCPU; tsbm++, kpmtsbm++, i++) { 11951 tsbm->itlb_misses = 0; 11952 tsbm->dtlb_misses = 0; 11953 tsbm->utsb_misses = 0; 11954 tsbm->ktsb_misses = 0; 11955 tsbm->uprot_traps = 0; 11956 tsbm->kprot_traps = 0; 11957 kpmtsbm->kpm_dtlb_misses = 0; 11958 kpmtsbm->kpm_tsb_misses = 0; 11959 } 11960 } 11961 return (0); 11962 } 11963 11964 #ifdef DEBUG 11965 11966 tte_t *gorig[NCPU], *gcur[NCPU], *gnew[NCPU]; 11967 11968 /* 11969 * A tte checker. 
*orig_old is the value we read before cas. 11970 * *cur is the value returned by cas. 11971 * *new is the desired value when we do the cas. 11972 * 11973 * *hmeblkp is currently unused. 11974 */ 11975 11976 /* ARGSUSED */ 11977 void 11978 chk_tte(tte_t *orig_old, tte_t *cur, tte_t *new, struct hme_blk *hmeblkp) 11979 { 11980 pfn_t i, j, k; 11981 int cpuid = CPU->cpu_id; 11982 11983 gorig[cpuid] = orig_old; 11984 gcur[cpuid] = cur; 11985 gnew[cpuid] = new; 11986 11987 #ifdef lint 11988 hmeblkp = hmeblkp; 11989 #endif 11990 11991 if (TTE_IS_VALID(orig_old)) { 11992 if (TTE_IS_VALID(cur)) { 11993 i = TTE_TO_TTEPFN(orig_old); 11994 j = TTE_TO_TTEPFN(cur); 11995 k = TTE_TO_TTEPFN(new); 11996 if (i != j) { 11997 /* remap error? */ 11998 panic("chk_tte: bad pfn, 0x%lx, 0x%lx", i, j); 11999 } 12000 12001 if (i != k) { 12002 /* remap error? */ 12003 panic("chk_tte: bad pfn2, 0x%lx, 0x%lx", i, k); 12004 } 12005 } else { 12006 if (TTE_IS_VALID(new)) { 12007 panic("chk_tte: invalid cur? "); 12008 } 12009 12010 i = TTE_TO_TTEPFN(orig_old); 12011 k = TTE_TO_TTEPFN(new); 12012 if (i != k) { 12013 panic("chk_tte: bad pfn3, 0x%lx, 0x%lx", i, k); 12014 } 12015 } 12016 } else { 12017 if (TTE_IS_VALID(cur)) { 12018 j = TTE_TO_TTEPFN(cur); 12019 if (TTE_IS_VALID(new)) { 12020 k = TTE_TO_TTEPFN(new); 12021 if (j != k) { 12022 panic("chk_tte: bad pfn4, 0x%lx, 0x%lx", 12023 j, k); 12024 } 12025 } else { 12026 panic("chk_tte: why here?"); 12027 } 12028 } else { 12029 if (!TTE_IS_VALID(new)) { 12030 panic("chk_tte: why here2 ?"); 12031 } 12032 } 12033 } 12034 } 12035 12036 #endif /* DEBUG */ 12037 12038 extern void prefetch_tsbe_read(struct tsbe *); 12039 extern void prefetch_tsbe_write(struct tsbe *); 12040 12041 12042 /* 12043 * We want to prefetch 7 cache lines ahead for our read prefetch. This gives 12044 * us optimal performance on Cheetah+. You can only have 8 outstanding 12045 * prefetches at any one time, so we opted for 7 read prefetches and 1 write 12046 * prefetch to make the most utilization of the prefetch capability. 12047 */ 12048 #define TSBE_PREFETCH_STRIDE (7) 12049 12050 void 12051 sfmmu_copy_tsb(struct tsb_info *old_tsbinfo, struct tsb_info *new_tsbinfo) 12052 { 12053 int old_bytes = TSB_BYTES(old_tsbinfo->tsb_szc); 12054 int new_bytes = TSB_BYTES(new_tsbinfo->tsb_szc); 12055 int old_entries = TSB_ENTRIES(old_tsbinfo->tsb_szc); 12056 int new_entries = TSB_ENTRIES(new_tsbinfo->tsb_szc); 12057 struct tsbe *old; 12058 struct tsbe *new; 12059 struct tsbe *new_base = (struct tsbe *)new_tsbinfo->tsb_va; 12060 uint64_t va; 12061 int new_offset; 12062 int i; 12063 int vpshift; 12064 int last_prefetch; 12065 12066 if (old_bytes == new_bytes) { 12067 bcopy(old_tsbinfo->tsb_va, new_tsbinfo->tsb_va, new_bytes); 12068 } else { 12069 12070 /* 12071 * A TSBE is 16 bytes which means there are four TSBE's per 12072 * P$ line (64 bytes), thus every 4 TSBE's we prefetch. 12073 */ 12074 old = (struct tsbe *)old_tsbinfo->tsb_va; 12075 last_prefetch = old_entries - (4*(TSBE_PREFETCH_STRIDE+1)); 12076 for (i = 0; i < old_entries; i++, old++) { 12077 if (((i & (4-1)) == 0) && (i < last_prefetch)) 12078 prefetch_tsbe_read(old); 12079 if (!old->tte_tag.tag_invalid) { 12080 /* 12081 * We have a valid TTE to remap. Check the 12082 * size. We won't remap 64K or 512K TTEs 12083 * because they span more than one TSB entry 12084 * and are indexed using an 8K virt. page. 12085 * Ditto for 32M and 256M TTEs. 
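 *
 * (Illustrative note, not part of the original comment: for the
 * TTE sizes that are copied, the code below reconstructs the
 * virtual pfn from the TSB tag plus the old index i and then
 * rehashes it into the new TSB via
 * new_offset = va & (new_entries - 1).)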
12086 */ 12087 if (TTE_CSZ(&old->tte_data) == TTE64K || 12088 TTE_CSZ(&old->tte_data) == TTE512K) 12089 continue; 12090 if (mmu_page_sizes == max_mmu_page_sizes) { 12091 if (TTE_CSZ(&old->tte_data) == TTE32M || 12092 TTE_CSZ(&old->tte_data) == TTE256M) 12093 continue; 12094 } 12095 12096 /* clear the lower 22 bits of the va */ 12097 va = *(uint64_t *)old << 22; 12098 /* turn va into a virtual pfn */ 12099 va >>= 22 - TSB_START_SIZE; 12100 /* 12101 * or in bits from the offset in the tsb 12102 * to get the real virtual pfn. These 12103 * correspond to bits [21:13] in the va 12104 */ 12105 vpshift = 12106 TTE_BSZS_SHIFT(TTE_CSZ(&old->tte_data)) & 12107 0x1ff; 12108 va |= (i << vpshift); 12109 va >>= vpshift; 12110 new_offset = va & (new_entries - 1); 12111 new = new_base + new_offset; 12112 prefetch_tsbe_write(new); 12113 *new = *old; 12114 } 12115 } 12116 } 12117 } 12118 12119 /* 12120 * Kernel Physical Mapping (kpm) facility 12121 */ 12122 12123 /* -- hat_kpm interface section -- */ 12124 12125 /* 12126 * Mapin a locked page and return the vaddr. 12127 * When a kpme is provided by the caller it is added to 12128 * the page p_kpmelist. The page to be mapped in must 12129 * be at least read locked (p_selock). 12130 */ 12131 caddr_t 12132 hat_kpm_mapin(struct page *pp, struct kpme *kpme) 12133 { 12134 kmutex_t *pml; 12135 caddr_t vaddr; 12136 12137 if (kpm_enable == 0) { 12138 cmn_err(CE_WARN, "hat_kpm_mapin: kpm_enable not set"); 12139 return ((caddr_t)NULL); 12140 } 12141 12142 if (pp == NULL || PAGE_LOCKED(pp) == 0) { 12143 cmn_err(CE_WARN, "hat_kpm_mapin: pp zero or not locked"); 12144 return ((caddr_t)NULL); 12145 } 12146 12147 pml = sfmmu_mlist_enter(pp); 12148 ASSERT(pp->p_kpmref >= 0); 12149 12150 vaddr = (pp->p_kpmref == 0) ? 12151 sfmmu_kpm_mapin(pp) : hat_kpm_page2va(pp, 1); 12152 12153 if (kpme != NULL) { 12154 /* 12155 * Tolerate multiple mapins for the same kpme to avoid 12156 * the need for an extra serialization. 12157 */ 12158 if ((sfmmu_kpme_lookup(kpme, pp)) == 0) 12159 sfmmu_kpme_add(kpme, pp); 12160 12161 ASSERT(pp->p_kpmref > 0); 12162 12163 } else { 12164 pp->p_kpmref++; 12165 } 12166 12167 sfmmu_mlist_exit(pml); 12168 return (vaddr); 12169 } 12170 12171 /* 12172 * Mapout a locked page. 12173 * When a kpme is provided by the caller it is removed from 12174 * the page p_kpmelist. The page to be mapped out must be at 12175 * least read locked (p_selock). 12176 * Note: The seg_kpm layer provides a mapout interface for the 12177 * case that a kpme is used and the underlying page is unlocked. 12178 * This can be used instead of calling this function directly. 
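 *
 * Hypothetical caller sequence, shown here only to illustrate the
 * pairing with hat_kpm_mapin(); vp/off are assumed to identify a page
 * the caller already knows about:
 *
 *	pp = page_lookup(vp, off, SE_SHARED);
 *	vaddr = hat_kpm_mapin(pp, NULL);
 *	... access the page contents through vaddr ...
 *	hat_kpm_mapout(pp, NULL, vaddr);
 *	page_unlock(pp);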
12179 */ 12180 void 12181 hat_kpm_mapout(struct page *pp, struct kpme *kpme, caddr_t vaddr) 12182 { 12183 kmutex_t *pml; 12184 12185 if (kpm_enable == 0) { 12186 cmn_err(CE_WARN, "hat_kpm_mapout: kpm_enable not set"); 12187 return; 12188 } 12189 12190 if (IS_KPM_ADDR(vaddr) == 0) { 12191 cmn_err(CE_WARN, "hat_kpm_mapout: no kpm address"); 12192 return; 12193 } 12194 12195 if (pp == NULL || PAGE_LOCKED(pp) == 0) { 12196 cmn_err(CE_WARN, "hat_kpm_mapout: page zero or not locked"); 12197 return; 12198 } 12199 12200 if (kpme != NULL) { 12201 ASSERT(pp == kpme->kpe_page); 12202 pp = kpme->kpe_page; 12203 pml = sfmmu_mlist_enter(pp); 12204 12205 if (sfmmu_kpme_lookup(kpme, pp) == 0) 12206 panic("hat_kpm_mapout: kpme not found pp=%p", 12207 (void *)pp); 12208 12209 ASSERT(pp->p_kpmref > 0); 12210 sfmmu_kpme_sub(kpme, pp); 12211 12212 } else { 12213 pml = sfmmu_mlist_enter(pp); 12214 pp->p_kpmref--; 12215 } 12216 12217 ASSERT(pp->p_kpmref >= 0); 12218 if (pp->p_kpmref == 0) 12219 sfmmu_kpm_mapout(pp, vaddr); 12220 12221 sfmmu_mlist_exit(pml); 12222 } 12223 12224 /* 12225 * Return the kpm virtual address for the page at pp. 12226 * If checkswap is non zero and the page is backed by a 12227 * swap vnode the physical address is used rather than 12228 * p_offset to determine the kpm region. 12229 * Note: The function has to be used w/ extreme care. The 12230 * stability of the page identity is in the responsibility 12231 * of the caller. 12232 */ 12233 caddr_t 12234 hat_kpm_page2va(struct page *pp, int checkswap) 12235 { 12236 int vcolor, vcolor_pa; 12237 uintptr_t paddr, vaddr; 12238 12239 ASSERT(kpm_enable); 12240 12241 paddr = ptob(pp->p_pagenum); 12242 vcolor_pa = addr_to_vcolor(paddr); 12243 12244 if (checkswap && pp->p_vnode && IS_SWAPFSVP(pp->p_vnode)) 12245 vcolor = (PP_ISNC(pp)) ? vcolor_pa : PP_GET_VCOLOR(pp); 12246 else 12247 vcolor = addr_to_vcolor(pp->p_offset); 12248 12249 vaddr = (uintptr_t)kpm_vbase + paddr; 12250 12251 if (vcolor_pa != vcolor) { 12252 vaddr += ((uintptr_t)(vcolor - vcolor_pa) << MMU_PAGESHIFT); 12253 vaddr += (vcolor_pa > vcolor) ? 12254 ((uintptr_t)vcolor_pa << kpm_size_shift) : 12255 ((uintptr_t)(vcolor - vcolor_pa) << kpm_size_shift); 12256 } 12257 12258 return ((caddr_t)vaddr); 12259 } 12260 12261 /* 12262 * Return the page for the kpm virtual address vaddr. 12263 * Caller is responsible for the kpm mapping and lock 12264 * state of the page. 
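 *
 * Illustrative invariant (an assumption added here, not stated in the
 * original comment): for a locked, kpm mapped page the two translations
 * are expected to be inverse operations, i.e.
 *
 *	hat_kpm_vaddr2page(hat_kpm_page2va(pp, 1)) == pp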
12265 */ 12266 page_t * 12267 hat_kpm_vaddr2page(caddr_t vaddr) 12268 { 12269 uintptr_t paddr; 12270 pfn_t pfn; 12271 12272 ASSERT(IS_KPM_ADDR(vaddr)); 12273 12274 SFMMU_KPM_VTOP(vaddr, paddr); 12275 pfn = (pfn_t)btop(paddr); 12276 12277 return (page_numtopp_nolock(pfn)); 12278 } 12279 12280 /* page to kpm_page */ 12281 #define PP2KPMPG(pp, kp) { \ 12282 struct memseg *mseg; \ 12283 pgcnt_t inx; \ 12284 pfn_t pfn; \ 12285 \ 12286 pfn = pp->p_pagenum; \ 12287 mseg = page_numtomemseg_nolock(pfn); \ 12288 ASSERT(mseg); \ 12289 inx = ptokpmp(kpmptop(ptokpmp(pfn)) - mseg->kpm_pbase); \ 12290 ASSERT(inx < mseg->kpm_nkpmpgs); \ 12291 kp = &mseg->kpm_pages[inx]; \ 12292 } 12293 12294 /* page to kpm_spage */ 12295 #define PP2KPMSPG(pp, ksp) { \ 12296 struct memseg *mseg; \ 12297 pgcnt_t inx; \ 12298 pfn_t pfn; \ 12299 \ 12300 pfn = pp->p_pagenum; \ 12301 mseg = page_numtomemseg_nolock(pfn); \ 12302 ASSERT(mseg); \ 12303 inx = pfn - mseg->kpm_pbase; \ 12304 ksp = &mseg->kpm_spages[inx]; \ 12305 } 12306 12307 /* 12308 * hat_kpm_fault is called from segkpm_fault when a kpm tsbmiss occurred 12309 * which could not be resolved by the trap level tsbmiss handler for the 12310 * following reasons: 12311 * . The vaddr is in VAC alias range (always PAGESIZE mapping size). 12312 * . The kpm (s)page range of vaddr is in a VAC alias prevention state. 12313 * . tsbmiss handling at trap level is not desired (DEBUG kernel only, 12314 * kpm_tsbmtl == 0). 12315 */ 12316 int 12317 hat_kpm_fault(struct hat *hat, caddr_t vaddr) 12318 { 12319 int error; 12320 uintptr_t paddr; 12321 pfn_t pfn; 12322 struct memseg *mseg; 12323 page_t *pp; 12324 12325 if (kpm_enable == 0) { 12326 cmn_err(CE_WARN, "hat_kpm_fault: kpm_enable not set"); 12327 return (ENOTSUP); 12328 } 12329 12330 ASSERT(hat == ksfmmup); 12331 ASSERT(IS_KPM_ADDR(vaddr)); 12332 12333 SFMMU_KPM_VTOP(vaddr, paddr); 12334 pfn = (pfn_t)btop(paddr); 12335 mseg = page_numtomemseg_nolock(pfn); 12336 if (mseg == NULL) 12337 return (EFAULT); 12338 12339 pp = &mseg->pages[(pgcnt_t)(pfn - mseg->pages_base)]; 12340 ASSERT((pfn_t)pp->p_pagenum == pfn); 12341 12342 if (!PAGE_LOCKED(pp)) 12343 return (EFAULT); 12344 12345 if (kpm_smallpages == 0) 12346 error = sfmmu_kpm_fault(vaddr, mseg, pp); 12347 else 12348 error = sfmmu_kpm_fault_small(vaddr, mseg, pp); 12349 12350 return (error); 12351 } 12352 12353 extern krwlock_t memsegslock; 12354 12355 /* 12356 * memseg_hash[] was cleared, need to clear memseg_phash[] too. 12357 */ 12358 void 12359 hat_kpm_mseghash_clear(int nentries) 12360 { 12361 pgcnt_t i; 12362 12363 if (kpm_enable == 0) 12364 return; 12365 12366 for (i = 0; i < nentries; i++) 12367 memseg_phash[i] = MSEG_NULLPTR_PA; 12368 } 12369 12370 /* 12371 * Update memseg_phash[inx] when memseg_hash[inx] was changed. 12372 */ 12373 void 12374 hat_kpm_mseghash_update(pgcnt_t inx, struct memseg *msp) 12375 { 12376 if (kpm_enable == 0) 12377 return; 12378 12379 memseg_phash[inx] = (msp) ? va_to_pa(msp) : MSEG_NULLPTR_PA; 12380 } 12381 12382 /* 12383 * Update kpm memseg members from basic memseg info. 
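 *
 * (Assumed calling order for the memory add path, added here only as
 * orientation and not guaranteed by this file: hat_kpm_addmem_mseg_update()
 * is expected to be called first, then hat_kpm_addmem_mseg_insert(), and
 * finally hat_kpm_addmem_memsegs_update() just before memsegs itself is
 * updated.)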
12384 */ 12385 void 12386 hat_kpm_addmem_mseg_update(struct memseg *msp, pgcnt_t nkpmpgs, 12387 offset_t kpm_pages_off) 12388 { 12389 if (kpm_enable == 0) 12390 return; 12391 12392 msp->kpm_pages = (kpm_page_t *)((caddr_t)msp->pages + kpm_pages_off); 12393 msp->kpm_nkpmpgs = nkpmpgs; 12394 msp->kpm_pbase = kpmptop(ptokpmp(msp->pages_base)); 12395 msp->pagespa = va_to_pa(msp->pages); 12396 msp->epagespa = va_to_pa(msp->epages); 12397 msp->kpm_pagespa = va_to_pa(msp->kpm_pages); 12398 } 12399 12400 /* 12401 * Setup nextpa when a memseg is inserted. 12402 * Assumes that the memsegslock is already held. 12403 */ 12404 void 12405 hat_kpm_addmem_mseg_insert(struct memseg *msp) 12406 { 12407 if (kpm_enable == 0) 12408 return; 12409 12410 ASSERT(RW_LOCK_HELD(&memsegslock)); 12411 msp->nextpa = (memsegs) ? va_to_pa(memsegs) : MSEG_NULLPTR_PA; 12412 } 12413 12414 /* 12415 * Setup memsegspa when a memseg is (head) inserted. 12416 * Called before memsegs is updated to complete a 12417 * memseg insert operation. 12418 * Assumes that the memsegslock is already held. 12419 */ 12420 void 12421 hat_kpm_addmem_memsegs_update(struct memseg *msp) 12422 { 12423 if (kpm_enable == 0) 12424 return; 12425 12426 ASSERT(RW_LOCK_HELD(&memsegslock)); 12427 ASSERT(memsegs); 12428 memsegspa = va_to_pa(msp); 12429 } 12430 12431 /* 12432 * Return end of metadata for an already setup memseg. 12433 * 12434 * Note: kpm_pages and kpm_spages are aliases and the underlying 12435 * member of struct memseg is a union, therefore they always have 12436 * the same address within a memseg. They must be differentiated 12437 * when pointer arithmetic is used with them. 12438 */ 12439 caddr_t 12440 hat_kpm_mseg_reuse(struct memseg *msp) 12441 { 12442 caddr_t end; 12443 12444 if (kpm_smallpages == 0) 12445 end = (caddr_t)(msp->kpm_pages + msp->kpm_nkpmpgs); 12446 else 12447 end = (caddr_t)(msp->kpm_spages + msp->kpm_nkpmpgs); 12448 12449 return (end); 12450 } 12451 12452 /* 12453 * Update memsegspa (when first memseg in list 12454 * is deleted) or nextpa when a memseg deleted. 12455 * Assumes that the memsegslock is already held. 12456 */ 12457 void 12458 hat_kpm_delmem_mseg_update(struct memseg *msp, struct memseg **mspp) 12459 { 12460 struct memseg *lmsp; 12461 12462 if (kpm_enable == 0) 12463 return; 12464 12465 ASSERT(RW_LOCK_HELD(&memsegslock)); 12466 12467 if (mspp == &memsegs) { 12468 memsegspa = (msp->next) ? 12469 va_to_pa(msp->next) : MSEG_NULLPTR_PA; 12470 } else { 12471 lmsp = (struct memseg *) 12472 ((uint64_t)mspp - offsetof(struct memseg, next)); 12473 lmsp->nextpa = (msp->next) ? 12474 va_to_pa(msp->next) : MSEG_NULLPTR_PA; 12475 } 12476 } 12477 12478 /* 12479 * Update kpm members for all memseg's involved in a split operation 12480 * and do the atomic update of the physical memseg chain. 12481 * 12482 * Note: kpm_pages and kpm_spages are aliases and the underlying member 12483 * of struct memseg is a union, therefore they always have the same 12484 * address within a memseg. With that the direct assignments and 12485 * va_to_pa conversions below don't have to be distinguished wrt. to 12486 * kpm_smallpages. They must be differentiated when pointer arithmetic 12487 * is used with them. 12488 * 12489 * Assumes that the memsegslock is already held. 
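 *
 * Illustrative example (not part of the original comment): after a
 * split the new kpm_pages pointers are plain offsets into the metadata
 * of the original memseg, e.g. for the middle piece
 *
 *	mid->kpm_pages = msp->kpm_pages + (kstart - kbase);
 *
 * where kbase and kstart are the kpm page indices of msp and mid.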
12490 */ 12491 void 12492 hat_kpm_split_mseg_update(struct memseg *msp, struct memseg **mspp, 12493 struct memseg *lo, struct memseg *mid, struct memseg *hi) 12494 { 12495 pgcnt_t start, end, kbase, kstart, num; 12496 struct memseg *lmsp; 12497 12498 if (kpm_enable == 0) 12499 return; 12500 12501 ASSERT(RW_LOCK_HELD(&memsegslock)); 12502 ASSERT(msp && mid && msp->kpm_pages); 12503 12504 kbase = ptokpmp(msp->kpm_pbase); 12505 12506 if (lo) { 12507 num = lo->pages_end - lo->pages_base; 12508 start = kpmptop(ptokpmp(lo->pages_base)); 12509 /* align end to kpm page size granularity */ 12510 end = kpmptop(ptokpmp(start + num - 1)) + kpmpnpgs; 12511 lo->kpm_pbase = start; 12512 lo->kpm_nkpmpgs = ptokpmp(end - start); 12513 lo->kpm_pages = msp->kpm_pages; 12514 lo->kpm_pagespa = va_to_pa(lo->kpm_pages); 12515 lo->pagespa = va_to_pa(lo->pages); 12516 lo->epagespa = va_to_pa(lo->epages); 12517 lo->nextpa = va_to_pa(lo->next); 12518 } 12519 12520 /* mid */ 12521 num = mid->pages_end - mid->pages_base; 12522 kstart = ptokpmp(mid->pages_base); 12523 start = kpmptop(kstart); 12524 /* align end to kpm page size granularity */ 12525 end = kpmptop(ptokpmp(start + num - 1)) + kpmpnpgs; 12526 mid->kpm_pbase = start; 12527 mid->kpm_nkpmpgs = ptokpmp(end - start); 12528 if (kpm_smallpages == 0) { 12529 mid->kpm_pages = msp->kpm_pages + (kstart - kbase); 12530 } else { 12531 mid->kpm_spages = msp->kpm_spages + (kstart - kbase); 12532 } 12533 mid->kpm_pagespa = va_to_pa(mid->kpm_pages); 12534 mid->pagespa = va_to_pa(mid->pages); 12535 mid->epagespa = va_to_pa(mid->epages); 12536 mid->nextpa = (mid->next) ? va_to_pa(mid->next) : MSEG_NULLPTR_PA; 12537 12538 if (hi) { 12539 num = hi->pages_end - hi->pages_base; 12540 kstart = ptokpmp(hi->pages_base); 12541 start = kpmptop(kstart); 12542 /* align end to kpm page size granularity */ 12543 end = kpmptop(ptokpmp(start + num - 1)) + kpmpnpgs; 12544 hi->kpm_pbase = start; 12545 hi->kpm_nkpmpgs = ptokpmp(end - start); 12546 if (kpm_smallpages == 0) { 12547 hi->kpm_pages = msp->kpm_pages + (kstart - kbase); 12548 } else { 12549 hi->kpm_spages = msp->kpm_spages + (kstart - kbase); 12550 } 12551 hi->kpm_pagespa = va_to_pa(hi->kpm_pages); 12552 hi->pagespa = va_to_pa(hi->pages); 12553 hi->epagespa = va_to_pa(hi->epages); 12554 hi->nextpa = (hi->next) ? va_to_pa(hi->next) : MSEG_NULLPTR_PA; 12555 } 12556 12557 /* 12558 * Atomic update of the physical memseg chain 12559 */ 12560 if (mspp == &memsegs) { 12561 memsegspa = (lo) ? va_to_pa(lo) : va_to_pa(mid); 12562 } else { 12563 lmsp = (struct memseg *) 12564 ((uint64_t)mspp - offsetof(struct memseg, next)); 12565 lmsp->nextpa = (lo) ? va_to_pa(lo) : va_to_pa(mid); 12566 } 12567 } 12568 12569 /* 12570 * Walk the memsegs chain, applying func to each memseg span and vcolor. 12571 */ 12572 void 12573 hat_kpm_walk(void (*func)(void *, void *, size_t), void *arg) 12574 { 12575 pfn_t pbase, pend; 12576 int vcolor; 12577 void *base; 12578 size_t size; 12579 struct memseg *msp; 12580 extern uint_t vac_colors; 12581 12582 for (msp = memsegs; msp; msp = msp->next) { 12583 pbase = msp->pages_base; 12584 pend = msp->pages_end; 12585 for (vcolor = 0; vcolor < vac_colors; vcolor++) { 12586 base = ptob(pbase) + kpm_vbase + kpm_size * vcolor; 12587 size = ptob(pend - pbase); 12588 func(arg, base, size); 12589 } 12590 } 12591 } 12592 12593 12594 /* -- sfmmu_kpm internal section -- */ 12595 12596 /* 12597 * Return the page frame number if a valid segkpm mapping exists 12598 * for vaddr, otherwise return PFN_INVALID. No locks are grabbed. 
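 *
 * Minimal usage sketch (hypothetical caller, for illustration only):
 *
 *	if ((pfn = sfmmu_kpm_vatopfn(vaddr)) != PFN_INVALID)
 *		... vaddr is currently backed by a kpm mapped page ...
 *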
12599 * Should only be used by other sfmmu routines. 12600 */ 12601 pfn_t 12602 sfmmu_kpm_vatopfn(caddr_t vaddr) 12603 { 12604 uintptr_t paddr; 12605 pfn_t pfn; 12606 page_t *pp; 12607 12608 ASSERT(kpm_enable && IS_KPM_ADDR(vaddr)); 12609 12610 SFMMU_KPM_VTOP(vaddr, paddr); 12611 pfn = (pfn_t)btop(paddr); 12612 pp = page_numtopp_nolock(pfn); 12613 if (pp && pp->p_kpmref) 12614 return (pfn); 12615 else 12616 return ((pfn_t)PFN_INVALID); 12617 } 12618 12619 /* 12620 * Lookup a kpme in the p_kpmelist. 12621 */ 12622 static int 12623 sfmmu_kpme_lookup(struct kpme *kpme, page_t *pp) 12624 { 12625 struct kpme *p; 12626 12627 for (p = pp->p_kpmelist; p; p = p->kpe_next) { 12628 if (p == kpme) 12629 return (1); 12630 } 12631 return (0); 12632 } 12633 12634 /* 12635 * Insert a kpme into the p_kpmelist and increment 12636 * the per page kpm reference count. 12637 */ 12638 static void 12639 sfmmu_kpme_add(struct kpme *kpme, page_t *pp) 12640 { 12641 ASSERT(pp->p_kpmref >= 0); 12642 12643 /* head insert */ 12644 kpme->kpe_prev = NULL; 12645 kpme->kpe_next = pp->p_kpmelist; 12646 12647 if (pp->p_kpmelist) 12648 pp->p_kpmelist->kpe_prev = kpme; 12649 12650 pp->p_kpmelist = kpme; 12651 kpme->kpe_page = pp; 12652 pp->p_kpmref++; 12653 } 12654 12655 /* 12656 * Remove a kpme from the p_kpmelist and decrement 12657 * the per page kpm reference count. 12658 */ 12659 static void 12660 sfmmu_kpme_sub(struct kpme *kpme, page_t *pp) 12661 { 12662 ASSERT(pp->p_kpmref > 0); 12663 12664 if (kpme->kpe_prev) { 12665 ASSERT(pp->p_kpmelist != kpme); 12666 ASSERT(kpme->kpe_prev->kpe_page == pp); 12667 kpme->kpe_prev->kpe_next = kpme->kpe_next; 12668 } else { 12669 ASSERT(pp->p_kpmelist == kpme); 12670 pp->p_kpmelist = kpme->kpe_next; 12671 } 12672 12673 if (kpme->kpe_next) { 12674 ASSERT(kpme->kpe_next->kpe_page == pp); 12675 kpme->kpe_next->kpe_prev = kpme->kpe_prev; 12676 } 12677 12678 kpme->kpe_next = kpme->kpe_prev = NULL; 12679 kpme->kpe_page = NULL; 12680 pp->p_kpmref--; 12681 } 12682 12683 /* 12684 * Mapin a single page, it is called every time a page changes it's state 12685 * from kpm-unmapped to kpm-mapped. It may not be called, when only a new 12686 * kpm instance does a mapin and wants to share the mapping. 12687 * Assumes that the mlist mutex is already grabbed. 12688 */ 12689 static caddr_t 12690 sfmmu_kpm_mapin(page_t *pp) 12691 { 12692 kpm_page_t *kp; 12693 kpm_hlk_t *kpmp; 12694 caddr_t vaddr; 12695 int kpm_vac_range; 12696 pfn_t pfn; 12697 tte_t tte; 12698 kmutex_t *pmtx; 12699 int uncached; 12700 kpm_spage_t *ksp; 12701 kpm_shlk_t *kpmsp; 12702 int oldval; 12703 12704 ASSERT(sfmmu_mlist_held(pp)); 12705 ASSERT(pp->p_kpmref == 0); 12706 12707 vaddr = sfmmu_kpm_getvaddr(pp, &kpm_vac_range); 12708 12709 ASSERT(IS_KPM_ADDR(vaddr)); 12710 uncached = PP_ISNC(pp); 12711 pfn = pp->p_pagenum; 12712 12713 if (kpm_smallpages) 12714 goto smallpages_mapin; 12715 12716 PP2KPMPG(pp, kp); 12717 12718 kpmp = KPMP_HASH(kp); 12719 mutex_enter(&kpmp->khl_mutex); 12720 12721 ASSERT(PP_ISKPMC(pp) == 0); 12722 ASSERT(PP_ISKPMS(pp) == 0); 12723 12724 if (uncached) { 12725 /* ASSERT(pp->p_share); XXX use hat_page_getshare */ 12726 if (kpm_vac_range == 0) { 12727 if (kp->kp_refcnts == 0) { 12728 /* 12729 * Must remove large page mapping if it exists. 12730 * Pages in uncached state can only be mapped 12731 * small (PAGESIZE) within the regular kpm 12732 * range. 
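 *
 * (Clarifying note added here, not part of the original comment:
 * kp_refcnts == 0 means no other page of this kpm_page is mapped
 * small yet, so a leftover large TSB entry may still be around and,
 * if present, is demapped below before the first uncached small
 * mapping is dropped in.)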
12733 */ 12734 if (kp->kp_refcntc == -1) { 12735 /* remove go indication */ 12736 sfmmu_kpm_tsbmtl(&kp->kp_refcntc, 12737 &kpmp->khl_lock, KPMTSBM_STOP); 12738 } 12739 if (kp->kp_refcnt > 0 && kp->kp_refcntc == 0) 12740 sfmmu_kpm_demap_large(vaddr); 12741 } 12742 ASSERT(kp->kp_refcntc >= 0); 12743 kp->kp_refcntc++; 12744 } 12745 pmtx = sfmmu_page_enter(pp); 12746 PP_SETKPMC(pp); 12747 sfmmu_page_exit(pmtx); 12748 } 12749 12750 if ((kp->kp_refcntc > 0 || kp->kp_refcnts > 0) && kpm_vac_range == 0) { 12751 /* 12752 * Have to do a small (PAGESIZE) mapin within this kpm_page 12753 * range since it is marked to be in VAC conflict mode or 12754 * when there are still other small mappings around. 12755 */ 12756 12757 /* tte assembly */ 12758 if (uncached == 0) 12759 KPM_TTE_VCACHED(tte.ll, pfn, TTE8K); 12760 else 12761 KPM_TTE_VUNCACHED(tte.ll, pfn, TTE8K); 12762 12763 /* tsb dropin */ 12764 sfmmu_kpm_load_tsb(vaddr, &tte, MMU_PAGESHIFT); 12765 12766 pmtx = sfmmu_page_enter(pp); 12767 PP_SETKPMS(pp); 12768 sfmmu_page_exit(pmtx); 12769 12770 kp->kp_refcnts++; 12771 ASSERT(kp->kp_refcnts > 0); 12772 goto exit; 12773 } 12774 12775 if (kpm_vac_range == 0) { 12776 /* 12777 * Fast path / regular case, no VAC conflict handling 12778 * in progress within this kpm_page range. 12779 */ 12780 if (kp->kp_refcnt == 0) { 12781 12782 /* tte assembly */ 12783 KPM_TTE_VCACHED(tte.ll, pfn, TTE4M); 12784 12785 /* tsb dropin */ 12786 sfmmu_kpm_load_tsb(vaddr, &tte, MMU_PAGESHIFT4M); 12787 12788 /* Set go flag for TL tsbmiss handler */ 12789 if (kp->kp_refcntc == 0) 12790 sfmmu_kpm_tsbmtl(&kp->kp_refcntc, 12791 &kpmp->khl_lock, KPMTSBM_START); 12792 12793 ASSERT(kp->kp_refcntc == -1); 12794 } 12795 kp->kp_refcnt++; 12796 ASSERT(kp->kp_refcnt); 12797 12798 } else { 12799 /* 12800 * The page is not setup according to the common VAC 12801 * prevention rules for the regular and kpm mapping layer 12802 * E.g. the page layer was not able to deliver a right 12803 * vcolor'ed page for a given vaddr corresponding to 12804 * the wanted p_offset. It has to be mapped in small in 12805 * within the corresponding kpm vac range in order to 12806 * prevent VAC alias conflicts. 12807 */ 12808 12809 /* tte assembly */ 12810 if (uncached == 0) { 12811 KPM_TTE_VCACHED(tte.ll, pfn, TTE8K); 12812 } else { 12813 KPM_TTE_VUNCACHED(tte.ll, pfn, TTE8K); 12814 } 12815 12816 /* tsb dropin */ 12817 sfmmu_kpm_load_tsb(vaddr, &tte, MMU_PAGESHIFT); 12818 12819 kp->kp_refcnta++; 12820 if (kp->kp_refcntc == -1) { 12821 ASSERT(kp->kp_refcnt > 0); 12822 12823 /* remove go indication */ 12824 sfmmu_kpm_tsbmtl(&kp->kp_refcntc, &kpmp->khl_lock, 12825 KPMTSBM_STOP); 12826 } 12827 ASSERT(kp->kp_refcntc >= 0); 12828 } 12829 exit: 12830 mutex_exit(&kpmp->khl_mutex); 12831 return (vaddr); 12832 12833 smallpages_mapin: 12834 if (uncached == 0) { 12835 /* tte assembly */ 12836 KPM_TTE_VCACHED(tte.ll, pfn, TTE8K); 12837 } else { 12838 /* ASSERT(pp->p_share); XXX use hat_page_getshare */ 12839 pmtx = sfmmu_page_enter(pp); 12840 PP_SETKPMC(pp); 12841 sfmmu_page_exit(pmtx); 12842 /* tte assembly */ 12843 KPM_TTE_VUNCACHED(tte.ll, pfn, TTE8K); 12844 } 12845 12846 /* tsb dropin */ 12847 sfmmu_kpm_load_tsb(vaddr, &tte, MMU_PAGESHIFT); 12848 12849 PP2KPMSPG(pp, ksp); 12850 kpmsp = KPMP_SHASH(ksp); 12851 12852 oldval = sfmmu_kpm_stsbmtl(&ksp->kp_mapped, &kpmsp->kshl_lock, 12853 (uncached) ? 
KPM_MAPPEDSC : KPM_MAPPEDS); 12854 12855 if (oldval != 0) 12856 panic("sfmmu_kpm_mapin: stale smallpages mapping"); 12857 12858 return (vaddr); 12859 } 12860 12861 /* 12862 * Mapout a single page, it is called every time a page changes it's state 12863 * from kpm-mapped to kpm-unmapped. It may not be called, when only a kpm 12864 * instance calls mapout and there are still other instances mapping the 12865 * page. Assumes that the mlist mutex is already grabbed. 12866 * 12867 * Note: In normal mode (no VAC conflict prevention pending) TLB's are 12868 * not flushed. This is the core segkpm behavior to avoid xcalls. It is 12869 * no problem because a translation from a segkpm virtual address to a 12870 * physical address is always the same. The only downside is a slighty 12871 * increased window of vulnerability for misbehaving _kernel_ modules. 12872 */ 12873 static void 12874 sfmmu_kpm_mapout(page_t *pp, caddr_t vaddr) 12875 { 12876 kpm_page_t *kp; 12877 kpm_hlk_t *kpmp; 12878 int alias_range; 12879 kmutex_t *pmtx; 12880 kpm_spage_t *ksp; 12881 kpm_shlk_t *kpmsp; 12882 int oldval; 12883 12884 ASSERT(sfmmu_mlist_held(pp)); 12885 ASSERT(pp->p_kpmref == 0); 12886 12887 alias_range = IS_KPM_ALIAS_RANGE(vaddr); 12888 12889 if (kpm_smallpages) 12890 goto smallpages_mapout; 12891 12892 PP2KPMPG(pp, kp); 12893 kpmp = KPMP_HASH(kp); 12894 mutex_enter(&kpmp->khl_mutex); 12895 12896 if (alias_range) { 12897 ASSERT(PP_ISKPMS(pp) == 0); 12898 if (kp->kp_refcnta <= 0) { 12899 panic("sfmmu_kpm_mapout: bad refcnta kp=%p", 12900 (void *)kp); 12901 } 12902 12903 if (PP_ISTNC(pp)) { 12904 if (PP_ISKPMC(pp) == 0) { 12905 /* 12906 * Uncached kpm mappings must always have 12907 * forced "small page" mode. 12908 */ 12909 panic("sfmmu_kpm_mapout: uncached page not " 12910 "kpm marked"); 12911 } 12912 sfmmu_kpm_demap_small(vaddr); 12913 12914 pmtx = sfmmu_page_enter(pp); 12915 PP_CLRKPMC(pp); 12916 sfmmu_page_exit(pmtx); 12917 12918 /* 12919 * Check if we can resume cached mode. This might 12920 * be the case if the kpm mapping was the only 12921 * mapping in conflict with other non rule 12922 * compliant mappings. The page is no more marked 12923 * as kpm mapped, so the conv_tnc path will not 12924 * change kpm state. 12925 */ 12926 conv_tnc(pp, TTE8K); 12927 12928 } else if (PP_ISKPMC(pp) == 0) { 12929 /* remove TSB entry only */ 12930 sfmmu_kpm_unload_tsb(vaddr, MMU_PAGESHIFT); 12931 12932 } else { 12933 /* already demapped */ 12934 pmtx = sfmmu_page_enter(pp); 12935 PP_CLRKPMC(pp); 12936 sfmmu_page_exit(pmtx); 12937 } 12938 kp->kp_refcnta--; 12939 goto exit; 12940 } 12941 12942 if (kp->kp_refcntc <= 0 && kp->kp_refcnts == 0) { 12943 /* 12944 * Fast path / regular case. 12945 */ 12946 ASSERT(kp->kp_refcntc >= -1); 12947 ASSERT(!(pp->p_nrm & (P_KPMC | P_KPMS | P_TNC | P_PNC))); 12948 12949 if (kp->kp_refcnt <= 0) 12950 panic("sfmmu_kpm_mapout: bad refcnt kp=%p", (void *)kp); 12951 12952 if (--kp->kp_refcnt == 0) { 12953 /* remove go indication */ 12954 if (kp->kp_refcntc == -1) { 12955 sfmmu_kpm_tsbmtl(&kp->kp_refcntc, 12956 &kpmp->khl_lock, KPMTSBM_STOP); 12957 } 12958 ASSERT(kp->kp_refcntc == 0); 12959 12960 /* remove TSB entry */ 12961 sfmmu_kpm_unload_tsb(vaddr, MMU_PAGESHIFT4M); 12962 #ifdef DEBUG 12963 if (kpm_tlb_flush) 12964 sfmmu_kpm_demap_tlbs(vaddr); 12965 #endif 12966 } 12967 12968 } else { 12969 /* 12970 * The VAC alias path. 12971 * We come here if the kpm vaddr is not in any alias_range 12972 * and we are unmapping a page within the regular kpm_page 12973 * range. 
The kpm_page either holds conflict pages and/or 12974 * is in "small page" mode. If the page is not marked 12975 * P_KPMS it couldn't have a valid PAGESIZE sized TSB 12976 * entry. Dcache flushing is done lazy and follows the 12977 * rules of the regular virtual page coloring scheme. 12978 * 12979 * Per page states and required actions: 12980 * P_KPMC: remove a kpm mapping that is conflicting. 12981 * P_KPMS: remove a small kpm mapping within a kpm_page. 12982 * P_TNC: check if we can re-cache the page. 12983 * P_PNC: we cannot re-cache, sorry. 12984 * Per kpm_page: 12985 * kp_refcntc > 0: page is part of a kpm_page with conflicts. 12986 * kp_refcnts > 0: rm a small mapped page within a kpm_page. 12987 */ 12988 12989 if (PP_ISKPMS(pp)) { 12990 if (kp->kp_refcnts < 1) { 12991 panic("sfmmu_kpm_mapout: bad refcnts kp=%p", 12992 (void *)kp); 12993 } 12994 sfmmu_kpm_demap_small(vaddr); 12995 12996 /* 12997 * Check if we can resume cached mode. This might 12998 * be the case if the kpm mapping was the only 12999 * mapping in conflict with other non rule 13000 * compliant mappings. The page is no more marked 13001 * as kpm mapped, so the conv_tnc path will not 13002 * change kpm state. 13003 */ 13004 if (PP_ISTNC(pp)) { 13005 if (!PP_ISKPMC(pp)) { 13006 /* 13007 * Uncached kpm mappings must always 13008 * have forced "small page" mode. 13009 */ 13010 panic("sfmmu_kpm_mapout: uncached " 13011 "page not kpm marked"); 13012 } 13013 conv_tnc(pp, TTE8K); 13014 } 13015 kp->kp_refcnts--; 13016 kp->kp_refcnt++; 13017 pmtx = sfmmu_page_enter(pp); 13018 PP_CLRKPMS(pp); 13019 sfmmu_page_exit(pmtx); 13020 } 13021 13022 if (PP_ISKPMC(pp)) { 13023 if (kp->kp_refcntc < 1) { 13024 panic("sfmmu_kpm_mapout: bad refcntc kp=%p", 13025 (void *)kp); 13026 } 13027 pmtx = sfmmu_page_enter(pp); 13028 PP_CLRKPMC(pp); 13029 sfmmu_page_exit(pmtx); 13030 kp->kp_refcntc--; 13031 } 13032 13033 if (kp->kp_refcnt-- < 1) 13034 panic("sfmmu_kpm_mapout: bad refcnt kp=%p", (void *)kp); 13035 } 13036 exit: 13037 mutex_exit(&kpmp->khl_mutex); 13038 return; 13039 13040 smallpages_mapout: 13041 PP2KPMSPG(pp, ksp); 13042 kpmsp = KPMP_SHASH(ksp); 13043 13044 if (PP_ISKPMC(pp) == 0) { 13045 oldval = sfmmu_kpm_stsbmtl(&ksp->kp_mapped, 13046 &kpmsp->kshl_lock, 0); 13047 13048 if (oldval != KPM_MAPPEDS) { 13049 /* 13050 * When we're called after sfmmu_kpm_hme_unload, 13051 * KPM_MAPPEDSC is valid too. 13052 */ 13053 if (oldval != KPM_MAPPEDSC) 13054 panic("sfmmu_kpm_mapout: incorrect mapping"); 13055 } 13056 13057 /* remove TSB entry */ 13058 sfmmu_kpm_unload_tsb(vaddr, MMU_PAGESHIFT); 13059 #ifdef DEBUG 13060 if (kpm_tlb_flush) 13061 sfmmu_kpm_demap_tlbs(vaddr); 13062 #endif 13063 13064 } else if (PP_ISTNC(pp)) { 13065 oldval = sfmmu_kpm_stsbmtl(&ksp->kp_mapped, 13066 &kpmsp->kshl_lock, 0); 13067 13068 if (oldval != KPM_MAPPEDSC || PP_ISKPMC(pp) == 0) 13069 panic("sfmmu_kpm_mapout: inconsistent TNC mapping"); 13070 13071 sfmmu_kpm_demap_small(vaddr); 13072 13073 pmtx = sfmmu_page_enter(pp); 13074 PP_CLRKPMC(pp); 13075 sfmmu_page_exit(pmtx); 13076 13077 /* 13078 * Check if we can resume cached mode. This might be 13079 * the case if the kpm mapping was the only mapping 13080 * in conflict with other non rule compliant mappings. 13081 * The page is no more marked as kpm mapped, so the 13082 * conv_tnc path will not change the kpm state. 
13083 */ 13084 conv_tnc(pp, TTE8K); 13085 13086 } else { 13087 oldval = sfmmu_kpm_stsbmtl(&ksp->kp_mapped, 13088 &kpmsp->kshl_lock, 0); 13089 13090 if (oldval != KPM_MAPPEDSC) 13091 panic("sfmmu_kpm_mapout: inconsistent mapping"); 13092 13093 pmtx = sfmmu_page_enter(pp); 13094 PP_CLRKPMC(pp); 13095 sfmmu_page_exit(pmtx); 13096 } 13097 } 13098 13099 #define abs(x) ((x) < 0 ? -(x) : (x)) 13100 13101 /* 13102 * Determine appropriate kpm mapping address and handle any kpm/hme 13103 * conflicts. Page mapping list and its vcolor parts must be protected. 13104 */ 13105 static caddr_t 13106 sfmmu_kpm_getvaddr(page_t *pp, int *kpm_vac_rangep) 13107 { 13108 int vcolor, vcolor_pa; 13109 caddr_t vaddr; 13110 uintptr_t paddr; 13111 13112 13113 ASSERT(sfmmu_mlist_held(pp)); 13114 13115 paddr = ptob(pp->p_pagenum); 13116 vcolor_pa = addr_to_vcolor(paddr); 13117 13118 if (pp->p_vnode && IS_SWAPFSVP(pp->p_vnode)) { 13119 vcolor = (PP_NEWPAGE(pp) || PP_ISNC(pp)) ? 13120 vcolor_pa : PP_GET_VCOLOR(pp); 13121 } else { 13122 vcolor = addr_to_vcolor(pp->p_offset); 13123 } 13124 13125 vaddr = kpm_vbase + paddr; 13126 *kpm_vac_rangep = 0; 13127 13128 if (vcolor_pa != vcolor) { 13129 *kpm_vac_rangep = abs(vcolor - vcolor_pa); 13130 vaddr += ((uintptr_t)(vcolor - vcolor_pa) << MMU_PAGESHIFT); 13131 vaddr += (vcolor_pa > vcolor) ? 13132 ((uintptr_t)vcolor_pa << kpm_size_shift) : 13133 ((uintptr_t)(vcolor - vcolor_pa) << kpm_size_shift); 13134 13135 ASSERT(!PP_ISMAPPED_LARGE(pp)); 13136 } 13137 13138 if (PP_ISNC(pp)) 13139 return (vaddr); 13140 13141 if (PP_NEWPAGE(pp)) { 13142 PP_SET_VCOLOR(pp, vcolor); 13143 return (vaddr); 13144 } 13145 13146 if (PP_GET_VCOLOR(pp) == vcolor) 13147 return (vaddr); 13148 13149 ASSERT(!PP_ISMAPPED_KPM(pp)); 13150 sfmmu_kpm_vac_conflict(pp, vaddr); 13151 13152 return (vaddr); 13153 } 13154 13155 /* 13156 * VAC conflict state bit values. 13157 * The following defines are used to make the handling of the 13158 * various input states more concise. For that the kpm states 13159 * per kpm_page and per page are combined in a summary state. 13160 * Each single state has a corresponding bit value in the 13161 * summary state. These defines only apply for kpm large page 13162 * mappings. Within comments the abbreviations "kc, c, ks, s" 13163 * are used as short form of the actual state, e.g. "kc" for 13164 * "kp_refcntc > 0", etc. 13165 */ 13166 #define KPM_KC 0x00000008 /* kpm_page: kp_refcntc > 0 */ 13167 #define KPM_C 0x00000004 /* page: P_KPMC set */ 13168 #define KPM_KS 0x00000002 /* kpm_page: kp_refcnts > 0 */ 13169 #define KPM_S 0x00000001 /* page: P_KPMS set */ 13170 13171 /* 13172 * Summary states used in sfmmu_kpm_fault (KPM_TSBM_*). 13173 * See also more detailed comments within in the sfmmu_kpm_fault switch. 13174 * Abbreviations used: 13175 * CONFL: VAC conflict(s) within a kpm_page. 13176 * MAPS: Mapped small: Page mapped in using a regular page size kpm mapping. 13177 * RASM: Re-assembling of a large page mapping possible. 13178 * RPLS: Replace: TSB miss due to TSB replacement only. 13179 * BRKO: Breakup Other: A large kpm mapping has to be broken because another 13180 * page within the kpm_page is already involved in a VAC conflict. 13181 * BRKT: Breakup This: A large kpm mapping has to be broken, this page is 13182 * is involved in a VAC conflict. 
13183 */ 13184 #define KPM_TSBM_CONFL_GONE (0) 13185 #define KPM_TSBM_MAPS_RASM (KPM_KS) 13186 #define KPM_TSBM_RPLS_RASM (KPM_KS | KPM_S) 13187 #define KPM_TSBM_MAPS_BRKO (KPM_KC) 13188 #define KPM_TSBM_MAPS (KPM_KC | KPM_KS) 13189 #define KPM_TSBM_RPLS (KPM_KC | KPM_KS | KPM_S) 13190 #define KPM_TSBM_MAPS_BRKT (KPM_KC | KPM_C) 13191 #define KPM_TSBM_MAPS_CONFL (KPM_KC | KPM_C | KPM_KS) 13192 #define KPM_TSBM_RPLS_CONFL (KPM_KC | KPM_C | KPM_KS | KPM_S) 13193 13194 /* 13195 * kpm fault handler for mappings with large page size. 13196 */ 13197 int 13198 sfmmu_kpm_fault(caddr_t vaddr, struct memseg *mseg, page_t *pp) 13199 { 13200 int error; 13201 pgcnt_t inx; 13202 kpm_page_t *kp; 13203 tte_t tte; 13204 pfn_t pfn = pp->p_pagenum; 13205 kpm_hlk_t *kpmp; 13206 kmutex_t *pml; 13207 int alias_range; 13208 int uncached = 0; 13209 kmutex_t *pmtx; 13210 int badstate; 13211 uint_t tsbmcase; 13212 13213 alias_range = IS_KPM_ALIAS_RANGE(vaddr); 13214 13215 inx = ptokpmp(kpmptop(ptokpmp(pfn)) - mseg->kpm_pbase); 13216 if (inx >= mseg->kpm_nkpmpgs) { 13217 cmn_err(CE_PANIC, "sfmmu_kpm_fault: kpm overflow in memseg " 13218 "0x%p pp 0x%p", (void *)mseg, (void *)pp); 13219 } 13220 13221 kp = &mseg->kpm_pages[inx]; 13222 kpmp = KPMP_HASH(kp); 13223 13224 pml = sfmmu_mlist_enter(pp); 13225 13226 if (!PP_ISMAPPED_KPM(pp)) { 13227 sfmmu_mlist_exit(pml); 13228 return (EFAULT); 13229 } 13230 13231 mutex_enter(&kpmp->khl_mutex); 13232 13233 if (alias_range) { 13234 ASSERT(!PP_ISMAPPED_LARGE(pp)); 13235 if (kp->kp_refcnta > 0) { 13236 if (PP_ISKPMC(pp)) { 13237 pmtx = sfmmu_page_enter(pp); 13238 PP_CLRKPMC(pp); 13239 sfmmu_page_exit(pmtx); 13240 } 13241 /* 13242 * Check for vcolor conflicts. Return here 13243 * w/ either no conflict (fast path), removed hme 13244 * mapping chains (unload conflict) or uncached 13245 * (uncache conflict). VACaches are cleaned and 13246 * p_vcolor and PP_TNC are set accordingly for the 13247 * conflict cases. Drop kpmp for uncache conflict 13248 * cases since it will be grabbed within 13249 * sfmmu_kpm_page_cache in case of an uncache 13250 * conflict. 13251 */ 13252 mutex_exit(&kpmp->khl_mutex); 13253 sfmmu_kpm_vac_conflict(pp, vaddr); 13254 mutex_enter(&kpmp->khl_mutex); 13255 13256 if (PP_ISNC(pp)) { 13257 uncached = 1; 13258 pmtx = sfmmu_page_enter(pp); 13259 PP_SETKPMC(pp); 13260 sfmmu_page_exit(pmtx); 13261 } 13262 goto smallexit; 13263 13264 } else { 13265 /* 13266 * We got a tsbmiss on a not active kpm_page range. 13267 * Let segkpm_fault decide how to panic. 13268 */ 13269 error = EFAULT; 13270 } 13271 goto exit; 13272 } 13273 13274 badstate = (kp->kp_refcnt < 0 || kp->kp_refcnts < 0); 13275 if (kp->kp_refcntc == -1) { 13276 /* 13277 * We should come here only if trap level tsb miss 13278 * handler is disabled. 13279 */ 13280 badstate |= (kp->kp_refcnt == 0 || kp->kp_refcnts > 0 || 13281 PP_ISKPMC(pp) || PP_ISKPMS(pp) || PP_ISNC(pp)); 13282 13283 if (badstate == 0) 13284 goto largeexit; 13285 } 13286 13287 if (badstate || kp->kp_refcntc < 0) 13288 goto badstate_exit; 13289 13290 /* 13291 * Combine the per kpm_page and per page kpm VAC states to 13292 * a summary state in order to make the kpm fault handling 13293 * more concise. 13294 */ 13295 tsbmcase = (((kp->kp_refcntc > 0) ? KPM_KC : 0) | 13296 ((kp->kp_refcnts > 0) ? KPM_KS : 0) | 13297 (PP_ISKPMC(pp) ? KPM_C : 0) | 13298 (PP_ISKPMS(pp) ? 
KPM_S : 0)); 13299 13300 switch (tsbmcase) { 13301 case KPM_TSBM_CONFL_GONE: /* - - - - */ 13302 /* 13303 * That's fine, we either have no more vac conflict in 13304 * this kpm page or someone raced in and has solved the 13305 * vac conflict for us -- call sfmmu_kpm_vac_conflict 13306 * to take care for correcting the vcolor and flushing 13307 * the dcache if required. 13308 */ 13309 mutex_exit(&kpmp->khl_mutex); 13310 sfmmu_kpm_vac_conflict(pp, vaddr); 13311 mutex_enter(&kpmp->khl_mutex); 13312 13313 if (PP_ISNC(pp) || kp->kp_refcnt <= 0 || 13314 addr_to_vcolor(vaddr) != PP_GET_VCOLOR(pp)) { 13315 panic("sfmmu_kpm_fault: inconsistent CONFL_GONE " 13316 "state, pp=%p", (void *)pp); 13317 } 13318 goto largeexit; 13319 13320 case KPM_TSBM_MAPS_RASM: /* - - ks - */ 13321 /* 13322 * All conflicts in this kpm page are gone but there are 13323 * already small mappings around, so we also map this 13324 * page small. This could be the trigger case for a 13325 * small mapping reaper, if this is really needed. 13326 * For now fall thru to the KPM_TSBM_MAPS handling. 13327 */ 13328 13329 case KPM_TSBM_MAPS: /* kc - ks - */ 13330 /* 13331 * Large page mapping is already broken, this page is not 13332 * conflicting, so map it small. Call sfmmu_kpm_vac_conflict 13333 * to take care for correcting the vcolor and flushing 13334 * the dcache if required. 13335 */ 13336 mutex_exit(&kpmp->khl_mutex); 13337 sfmmu_kpm_vac_conflict(pp, vaddr); 13338 mutex_enter(&kpmp->khl_mutex); 13339 13340 if (PP_ISNC(pp) || kp->kp_refcnt <= 0 || 13341 addr_to_vcolor(vaddr) != PP_GET_VCOLOR(pp)) { 13342 panic("sfmmu_kpm_fault: inconsistent MAPS state, " 13343 "pp=%p", (void *)pp); 13344 } 13345 kp->kp_refcnt--; 13346 kp->kp_refcnts++; 13347 pmtx = sfmmu_page_enter(pp); 13348 PP_SETKPMS(pp); 13349 sfmmu_page_exit(pmtx); 13350 goto smallexit; 13351 13352 case KPM_TSBM_RPLS_RASM: /* - - ks s */ 13353 /* 13354 * All conflicts in this kpm page are gone but this page 13355 * is mapped small. This could be the trigger case for a 13356 * small mapping reaper, if this is really needed. 13357 * For now we drop it in small again. Fall thru to the 13358 * KPM_TSBM_RPLS handling. 13359 */ 13360 13361 case KPM_TSBM_RPLS: /* kc - ks s */ 13362 /* 13363 * Large page mapping is already broken, this page is not 13364 * conflicting but already mapped small, so drop it in 13365 * small again. 13366 */ 13367 if (PP_ISNC(pp) || 13368 addr_to_vcolor(vaddr) != PP_GET_VCOLOR(pp)) { 13369 panic("sfmmu_kpm_fault: inconsistent RPLS state, " 13370 "pp=%p", (void *)pp); 13371 } 13372 goto smallexit; 13373 13374 case KPM_TSBM_MAPS_BRKO: /* kc - - - */ 13375 /* 13376 * The kpm page where we live in is marked conflicting 13377 * but this page is not conflicting. So we have to map it 13378 * in small. Call sfmmu_kpm_vac_conflict to take care for 13379 * correcting the vcolor and flushing the dcache if required. 
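 *
 * (Illustrative decoding of this summary state, added here and not
 * part of the original comment: KPM_TSBM_MAPS_BRKO == KPM_KC, i.e.
 * the enclosing kpm_page has kp_refcntc > 0 while neither P_KPMC nor
 * P_KPMS is set on this particular page.)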
13380 */ 13381 mutex_exit(&kpmp->khl_mutex); 13382 sfmmu_kpm_vac_conflict(pp, vaddr); 13383 mutex_enter(&kpmp->khl_mutex); 13384 13385 if (PP_ISNC(pp) || kp->kp_refcnt <= 0 || 13386 addr_to_vcolor(vaddr) != PP_GET_VCOLOR(pp)) { 13387 panic("sfmmu_kpm_fault: inconsistent MAPS_BRKO state, " 13388 "pp=%p", (void *)pp); 13389 } 13390 kp->kp_refcnt--; 13391 kp->kp_refcnts++; 13392 pmtx = sfmmu_page_enter(pp); 13393 PP_SETKPMS(pp); 13394 sfmmu_page_exit(pmtx); 13395 goto smallexit; 13396 13397 case KPM_TSBM_MAPS_BRKT: /* kc c - - */ 13398 case KPM_TSBM_MAPS_CONFL: /* kc c ks - */ 13399 if (!PP_ISMAPPED(pp)) { 13400 /* 13401 * We got a tsbmiss on kpm large page range that is 13402 * marked to contain vac conflicting pages introduced 13403 * by hme mappings. The hme mappings are all gone and 13404 * must have bypassed the kpm alias prevention logic. 13405 */ 13406 panic("sfmmu_kpm_fault: stale VAC conflict, pp=%p", 13407 (void *)pp); 13408 } 13409 13410 /* 13411 * Check for vcolor conflicts. Return here w/ either no 13412 * conflict (fast path), removed hme mapping chains 13413 * (unload conflict) or uncached (uncache conflict). 13414 * Dcache is cleaned and p_vcolor and P_TNC are set 13415 * accordingly. Drop kpmp for uncache conflict cases 13416 * since it will be grabbed within sfmmu_kpm_page_cache 13417 * in case of an uncache conflict. 13418 */ 13419 mutex_exit(&kpmp->khl_mutex); 13420 sfmmu_kpm_vac_conflict(pp, vaddr); 13421 mutex_enter(&kpmp->khl_mutex); 13422 13423 if (kp->kp_refcnt <= 0) 13424 panic("sfmmu_kpm_fault: bad refcnt kp=%p", (void *)kp); 13425 13426 if (PP_ISNC(pp)) { 13427 uncached = 1; 13428 } else { 13429 /* 13430 * When an unload conflict is solved and there are 13431 * no other small mappings around, we can resume 13432 * largepage mode. Otherwise we have to map or drop 13433 * in small. This could be a trigger for a small 13434 * mapping reaper when this was the last conflict 13435 * within the kpm page and when there are only 13436 * other small mappings around. 13437 */ 13438 ASSERT(addr_to_vcolor(vaddr) == PP_GET_VCOLOR(pp)); 13439 ASSERT(kp->kp_refcntc > 0); 13440 kp->kp_refcntc--; 13441 pmtx = sfmmu_page_enter(pp); 13442 PP_CLRKPMC(pp); 13443 sfmmu_page_exit(pmtx); 13444 ASSERT(PP_ISKPMS(pp) == 0); 13445 if (kp->kp_refcntc == 0 && kp->kp_refcnts == 0) 13446 goto largeexit; 13447 } 13448 13449 kp->kp_refcnt--; 13450 kp->kp_refcnts++; 13451 pmtx = sfmmu_page_enter(pp); 13452 PP_SETKPMS(pp); 13453 sfmmu_page_exit(pmtx); 13454 goto smallexit; 13455 13456 case KPM_TSBM_RPLS_CONFL: /* kc c ks s */ 13457 if (!PP_ISMAPPED(pp)) { 13458 /* 13459 * We got a tsbmiss on kpm large page range that is 13460 * marked to contain vac conflicting pages introduced 13461 * by hme mappings. They are all gone and must have 13462 * somehow bypassed the kpm alias prevention logic. 13463 */ 13464 panic("sfmmu_kpm_fault: stale VAC conflict, pp=%p", 13465 (void *)pp); 13466 } 13467 13468 /* 13469 * This state is only possible for an uncached mapping. 
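 *
 * (Added clarification, not part of the original comment: in this case
 * all of kc, c, ks and s are set; a page carrying both P_KPMC and
 * P_KPMS is always mapped small and uncached, which is what the
 * PP_ISNC() check below re-verifies.)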
13470 */ 13471 if (!PP_ISNC(pp)) { 13472 panic("sfmmu_kpm_fault: page not uncached, pp=%p", 13473 (void *)pp); 13474 } 13475 uncached = 1; 13476 goto smallexit; 13477 13478 default: 13479 badstate_exit: 13480 panic("sfmmu_kpm_fault: inconsistent VAC state, vaddr=%p kp=%p " 13481 "pp=%p", (void *)vaddr, (void *)kp, (void *)pp); 13482 } 13483 13484 smallexit: 13485 /* tte assembly */ 13486 if (uncached == 0) 13487 KPM_TTE_VCACHED(tte.ll, pfn, TTE8K); 13488 else 13489 KPM_TTE_VUNCACHED(tte.ll, pfn, TTE8K); 13490 13491 /* tsb dropin */ 13492 sfmmu_kpm_load_tsb(vaddr, &tte, MMU_PAGESHIFT); 13493 13494 error = 0; 13495 goto exit; 13496 13497 largeexit: 13498 if (kp->kp_refcnt > 0) { 13499 13500 /* tte assembly */ 13501 KPM_TTE_VCACHED(tte.ll, pfn, TTE4M); 13502 13503 /* tsb dropin */ 13504 sfmmu_kpm_load_tsb(vaddr, &tte, MMU_PAGESHIFT4M); 13505 13506 if (kp->kp_refcntc == 0) { 13507 /* Set "go" flag for TL tsbmiss handler */ 13508 sfmmu_kpm_tsbmtl(&kp->kp_refcntc, &kpmp->khl_lock, 13509 KPMTSBM_START); 13510 } 13511 ASSERT(kp->kp_refcntc == -1); 13512 error = 0; 13513 13514 } else 13515 error = EFAULT; 13516 exit: 13517 mutex_exit(&kpmp->khl_mutex); 13518 sfmmu_mlist_exit(pml); 13519 return (error); 13520 } 13521 13522 /* 13523 * kpm fault handler for mappings with small page size. 13524 */ 13525 int 13526 sfmmu_kpm_fault_small(caddr_t vaddr, struct memseg *mseg, page_t *pp) 13527 { 13528 int error = 0; 13529 pgcnt_t inx; 13530 kpm_spage_t *ksp; 13531 kpm_shlk_t *kpmsp; 13532 kmutex_t *pml; 13533 pfn_t pfn = pp->p_pagenum; 13534 tte_t tte; 13535 kmutex_t *pmtx; 13536 int oldval; 13537 13538 inx = pfn - mseg->kpm_pbase; 13539 ksp = &mseg->kpm_spages[inx]; 13540 kpmsp = KPMP_SHASH(ksp); 13541 13542 pml = sfmmu_mlist_enter(pp); 13543 13544 if (!PP_ISMAPPED_KPM(pp)) { 13545 sfmmu_mlist_exit(pml); 13546 return (EFAULT); 13547 } 13548 13549 /* 13550 * kp_mapped lookup protected by mlist mutex 13551 */ 13552 if (ksp->kp_mapped == KPM_MAPPEDS) { 13553 /* 13554 * Fast path tsbmiss 13555 */ 13556 ASSERT(!PP_ISKPMC(pp)); 13557 ASSERT(!PP_ISNC(pp)); 13558 13559 /* tte assembly */ 13560 KPM_TTE_VCACHED(tte.ll, pfn, TTE8K); 13561 13562 /* tsb dropin */ 13563 sfmmu_kpm_load_tsb(vaddr, &tte, MMU_PAGESHIFT); 13564 13565 } else if (ksp->kp_mapped == KPM_MAPPEDSC) { 13566 /* 13567 * Got here due to existing or gone kpm/hme VAC conflict. 13568 * Recheck for vcolor conflicts. Return here w/ either 13569 * no conflict, removed hme mapping chain (unload 13570 * conflict) or uncached (uncache conflict). VACaches 13571 * are cleaned and p_vcolor and PP_TNC are set accordingly 13572 * for the conflict cases. 13573 */ 13574 sfmmu_kpm_vac_conflict(pp, vaddr); 13575 13576 if (PP_ISNC(pp)) { 13577 /* ASSERT(pp->p_share); XXX use hat_page_getshare */ 13578 13579 /* tte assembly */ 13580 KPM_TTE_VUNCACHED(tte.ll, pfn, TTE8K); 13581 13582 /* tsb dropin */ 13583 sfmmu_kpm_load_tsb(vaddr, &tte, MMU_PAGESHIFT); 13584 13585 } else { 13586 if (PP_ISKPMC(pp)) { 13587 pmtx = sfmmu_page_enter(pp); 13588 PP_CLRKPMC(pp); 13589 sfmmu_page_exit(pmtx); 13590 } 13591 13592 /* tte assembly */ 13593 KPM_TTE_VCACHED(tte.ll, pfn, TTE8K); 13594 13595 /* tsb dropin */ 13596 sfmmu_kpm_load_tsb(vaddr, &tte, MMU_PAGESHIFT); 13597 13598 oldval = sfmmu_kpm_stsbmtl(&ksp->kp_mapped, 13599 &kpmsp->kshl_lock, KPM_MAPPEDS); 13600 13601 if (oldval != KPM_MAPPEDSC) 13602 panic("sfmmu_kpm_fault_small: " 13603 "stale smallpages mapping"); 13604 } 13605 13606 } else { 13607 /* 13608 * We got a tsbmiss on a not active kpm_page range. 
13609 * Let segkpm_fault decide how to panic. 13610 */ 13611 error = EFAULT; 13612 } 13613 13614 sfmmu_mlist_exit(pml); 13615 return (error); 13616 } 13617 13618 /* 13619 * Check/handle potential hme/kpm mapping conflicts 13620 */ 13621 static void 13622 sfmmu_kpm_vac_conflict(page_t *pp, caddr_t vaddr) 13623 { 13624 int vcolor; 13625 struct sf_hment *sfhmep; 13626 struct hat *tmphat; 13627 struct sf_hment *tmphme = NULL; 13628 struct hme_blk *hmeblkp; 13629 tte_t tte; 13630 13631 ASSERT(sfmmu_mlist_held(pp)); 13632 13633 if (PP_ISNC(pp)) 13634 return; 13635 13636 vcolor = addr_to_vcolor(vaddr); 13637 if (PP_GET_VCOLOR(pp) == vcolor) 13638 return; 13639 13640 /* 13641 * There could be no vcolor conflict between a large cached 13642 * hme page and a non alias range kpm page (neither large nor 13643 * small mapped). So if a hme conflict already exists between 13644 * a constituent page of a large hme mapping and a shared small 13645 * conflicting hme mapping, both mappings must be already 13646 * uncached at this point. 13647 */ 13648 ASSERT(!PP_ISMAPPED_LARGE(pp)); 13649 13650 if (!PP_ISMAPPED(pp)) { 13651 /* 13652 * Previous hme user of page had a different color 13653 * but since there are no current users 13654 * we just flush the cache and change the color. 13655 */ 13656 SFMMU_STAT(sf_pgcolor_conflict); 13657 sfmmu_cache_flush(pp->p_pagenum, PP_GET_VCOLOR(pp)); 13658 PP_SET_VCOLOR(pp, vcolor); 13659 return; 13660 } 13661 13662 /* 13663 * If we get here we have a vac conflict with a current hme 13664 * mapping. This must have been established by forcing a wrong 13665 * colored mapping, e.g. by using mmap(2) with MAP_FIXED. 13666 */ 13667 13668 /* 13669 * Check if any mapping is in the same address space (i.e. it is 13670 * a kernel mapping) or is locked, since in that case we need to uncache. 13671 */ 13672 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = tmphme) { 13673 tmphme = sfhmep->hme_next; 13674 hmeblkp = sfmmu_hmetohblk(sfhmep); 13675 if (hmeblkp->hblk_xhat_bit) 13676 continue; 13677 tmphat = hblktosfmmu(hmeblkp); 13678 sfmmu_copytte(&sfhmep->hme_tte, &tte); 13679 ASSERT(TTE_IS_VALID(&tte)); 13680 if ((tmphat == ksfmmup) || hmeblkp->hblk_lckcnt) { 13681 /* 13682 * We have an uncache conflict 13683 */ 13684 SFMMU_STAT(sf_uncache_conflict); 13685 sfmmu_page_cache_array(pp, HAT_TMPNC, CACHE_FLUSH, 1); 13686 return; 13687 } 13688 } 13689 13690 /* 13691 * We have an unload conflict 13692 */ 13693 SFMMU_STAT(sf_unload_conflict); 13694 13695 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = tmphme) { 13696 tmphme = sfhmep->hme_next; 13697 hmeblkp = sfmmu_hmetohblk(sfhmep); 13698 if (hmeblkp->hblk_xhat_bit) 13699 continue; 13700 (void) sfmmu_pageunload(pp, sfhmep, TTE8K); 13701 } 13702 13703 /* 13704 * Unloads only do tlb flushes so we need to flush the 13705 * dcache vcolor here. 13706 */ 13707 sfmmu_cache_flush(pp->p_pagenum, PP_GET_VCOLOR(pp)); 13708 PP_SET_VCOLOR(pp, vcolor); 13709 } 13710 13711 /* 13712 * Remove all kpm mappings using kpme's for pp and check that 13713 * all kpm mappings (w/ and w/o kpme's) are gone.
13714 */ 13715 static void 13716 sfmmu_kpm_pageunload(page_t *pp) 13717 { 13718 caddr_t vaddr; 13719 struct kpme *kpme, *nkpme; 13720 13721 ASSERT(pp != NULL); 13722 ASSERT(pp->p_kpmref); 13723 ASSERT(sfmmu_mlist_held(pp)); 13724 13725 vaddr = hat_kpm_page2va(pp, 1); 13726 13727 for (kpme = pp->p_kpmelist; kpme; kpme = nkpme) { 13728 ASSERT(kpme->kpe_page == pp); 13729 13730 if (pp->p_kpmref == 0) 13731 panic("sfmmu_kpm_pageunload: stale p_kpmref pp=%p " 13732 "kpme=%p", (void *)pp, (void *)kpme); 13733 13734 nkpme = kpme->kpe_next; 13735 13736 /* Add instance callback here if needed later */ 13737 sfmmu_kpme_sub(kpme, pp); 13738 } 13739 13740 /* 13741 * Also correct after mixed kpme/nonkpme mappings. If nonkpme 13742 * segkpm clients have unlocked the page and forgotten to mapout, 13743 * we panic here. 13744 */ 13745 if (pp->p_kpmref != 0) 13746 panic("sfmmu_kpm_pageunload: bad refcnt pp=%p", (void *)pp); 13747 13748 sfmmu_kpm_mapout(pp, vaddr); 13749 } 13750 13751 /* 13752 * Remove a large kpm mapping from kernel TSB and all TLB's. 13753 */ 13754 static void 13755 sfmmu_kpm_demap_large(caddr_t vaddr) 13756 { 13757 sfmmu_kpm_unload_tsb(vaddr, MMU_PAGESHIFT4M); 13758 sfmmu_kpm_demap_tlbs(vaddr); 13759 } 13760 13761 /* 13762 * Remove a small kpm mapping from kernel TSB and all TLB's. 13763 */ 13764 static void 13765 sfmmu_kpm_demap_small(caddr_t vaddr) 13766 { 13767 sfmmu_kpm_unload_tsb(vaddr, MMU_PAGESHIFT); 13768 sfmmu_kpm_demap_tlbs(vaddr); 13769 } 13770 13771 /* 13772 * Demap a kpm mapping in all TLB's. 13773 */ 13774 static void 13775 sfmmu_kpm_demap_tlbs(caddr_t vaddr) 13776 { 13777 cpuset_t cpuset; 13778 13779 kpreempt_disable(); 13780 cpuset = ksfmmup->sfmmu_cpusran; 13781 CPUSET_AND(cpuset, cpu_ready_set); 13782 CPUSET_DEL(cpuset, CPU->cpu_id); 13783 SFMMU_XCALL_STATS(ksfmmup); 13784 13785 xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)vaddr, 13786 (uint64_t)ksfmmup); 13787 vtag_flushpage(vaddr, (uint64_t)ksfmmup); 13788 13789 kpreempt_enable(); 13790 } 13791 13792 /* 13793 * Summary states used in sfmmu_kpm_vac_unload (KPM_VUL_*). 13794 * See also more detailed comments within the sfmmu_kpm_vac_unload switch. 13795 * Abbreviations used: 13796 * BIG: Large page kpm mapping in use. 13797 * CONFL: VAC conflict(s) within a kpm_page. 13798 * INCR: Count of conflicts within a kpm_page is going to be incremented. 13799 * DECR: Count of conflicts within a kpm_page is going to be decremented. 13800 * UNMAP_SMALL: A small (regular page size) mapping is going to be unmapped. 13801 * TNC: Temporary non cached: a kpm mapped page is mapped in TNC state. 13802 */ 13803 #define KPM_VUL_BIG (0) 13804 #define KPM_VUL_CONFL_INCR1 (KPM_KS) 13805 #define KPM_VUL_UNMAP_SMALL1 (KPM_KS | KPM_S) 13806 #define KPM_VUL_CONFL_INCR2 (KPM_KC) 13807 #define KPM_VUL_CONFL_INCR3 (KPM_KC | KPM_KS) 13808 #define KPM_VUL_UNMAP_SMALL2 (KPM_KC | KPM_KS | KPM_S) 13809 #define KPM_VUL_CONFL_DECR1 (KPM_KC | KPM_C) 13810 #define KPM_VUL_CONFL_DECR2 (KPM_KC | KPM_C | KPM_KS) 13811 #define KPM_VUL_TNC (KPM_KC | KPM_C | KPM_KS | KPM_S) 13812 13813 /* 13814 * Handle VAC unload conflicts introduced by hme mappings or vice 13815 * versa when a hme conflict mapping is replaced by a non conflict 13816 * one. Perform actions and state transitions according to the 13817 * various page and kpm_page entry states. VACache flushes are in 13818 * the responsibility of the caller. We still hold the mlist lock.
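 *
 * (Illustrative decoding, not part of the original comment: e.g.
 * KPM_VUL_UNMAP_SMALL1 == (KPM_KS | KPM_S) means the kpm_page already
 * carries small mappings and this page itself is mapped small, so its
 * small TSB/TLB entry has to be removed; KPM_VUL_BIG == 0 means the
 * page is still covered only by the large kpm mapping.)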

/*
 * Handle VAC unload conflicts introduced by hme mappings, or vice
 * versa, when an hme conflict mapping is replaced by a non-conflicting
 * one. Perform actions and state transitions according to the
 * various page and kpm_page entry states. VAC flushes are the
 * responsibility of the caller. We still hold the mlist lock.
 */
static void
sfmmu_kpm_vac_unload(page_t *pp, caddr_t vaddr)
{
	kpm_page_t	*kp;
	kpm_hlk_t	*kpmp;
	caddr_t		kpmvaddr = hat_kpm_page2va(pp, 1);
	int		newcolor;
	kmutex_t	*pmtx;
	uint_t		vacunlcase;
	int		badstate = 0;
	kpm_spage_t	*ksp;
	kpm_shlk_t	*kpmsp;

	ASSERT(PAGE_LOCKED(pp));
	ASSERT(sfmmu_mlist_held(pp));
	ASSERT(!PP_ISNC(pp));

	newcolor = addr_to_vcolor(kpmvaddr) != addr_to_vcolor(vaddr);
	if (kpm_smallpages)
		goto smallpages_vac_unload;

	PP2KPMPG(pp, kp);
	kpmp = KPMP_HASH(kp);
	mutex_enter(&kpmp->khl_mutex);

	if (IS_KPM_ALIAS_RANGE(kpmvaddr)) {
		if (kp->kp_refcnta < 1) {
			panic("sfmmu_kpm_vac_unload: bad refcnta kpm_page=%p\n",
			    (void *)kp);
		}

		if (PP_ISKPMC(pp) == 0) {
			if (newcolor == 0)
				goto exit;
			sfmmu_kpm_demap_small(kpmvaddr);
			pmtx = sfmmu_page_enter(pp);
			PP_SETKPMC(pp);
			sfmmu_page_exit(pmtx);

		} else if (newcolor == 0) {
			pmtx = sfmmu_page_enter(pp);
			PP_CLRKPMC(pp);
			sfmmu_page_exit(pmtx);

		} else {
			badstate++;
		}

		goto exit;
	}

	badstate = (kp->kp_refcnt < 0 || kp->kp_refcnts < 0);
	if (kp->kp_refcntc == -1) {
		/*
		 * We should come here only if the trap level tsbmiss
		 * handler is disabled.
		 */
		badstate |= (kp->kp_refcnt == 0 || kp->kp_refcnts > 0 ||
		    PP_ISKPMC(pp) || PP_ISKPMS(pp) || PP_ISNC(pp));
	} else {
		badstate |= (kp->kp_refcntc < 0);
	}

	if (badstate)
		goto exit;

	if (PP_ISKPMC(pp) == 0 && newcolor == 0) {
		ASSERT(PP_ISKPMS(pp) == 0);
		goto exit;
	}

	/*
	 * Combine the per kpm_page and per page kpm VAC states
	 * into a summary state in order to make the VAC unload
	 * handling more concise.
	 */
	vacunlcase = (((kp->kp_refcntc > 0) ? KPM_KC : 0) |
	    ((kp->kp_refcnts > 0) ? KPM_KS : 0) |
	    (PP_ISKPMC(pp) ? KPM_C : 0) |
	    (PP_ISKPMS(pp) ? KPM_S : 0));

	switch (vacunlcase) {
	case KPM_VUL_BIG:				/* - - - - */
		/*
		 * Have to break up the large page mapping to be
		 * able to handle the conflicting hme vaddr.
		 */
		if (kp->kp_refcntc == -1) {
			/* remove go indication */
			sfmmu_kpm_tsbmtl(&kp->kp_refcntc,
			    &kpmp->khl_lock, KPMTSBM_STOP);
		}
		sfmmu_kpm_demap_large(kpmvaddr);

		ASSERT(kp->kp_refcntc == 0);
		kp->kp_refcntc++;
		pmtx = sfmmu_page_enter(pp);
		PP_SETKPMC(pp);
		sfmmu_page_exit(pmtx);
		break;

	case KPM_VUL_UNMAP_SMALL1:			/* -  - ks s */
	case KPM_VUL_UNMAP_SMALL2:			/* kc - ks s */
		/*
		 * New conflict with an active kpm page, actually mapped
		 * in by small TSB/TLB entries. Remove the mapping and
		 * update states.
		 */
		ASSERT(newcolor);
		sfmmu_kpm_demap_small(kpmvaddr);
		kp->kp_refcnts--;
		kp->kp_refcnt++;
		kp->kp_refcntc++;
		pmtx = sfmmu_page_enter(pp);
		PP_CLRKPMS(pp);
		PP_SETKPMC(pp);
		sfmmu_page_exit(pmtx);
		break;

	case KPM_VUL_CONFL_INCR1:			/* -  - ks - */
	case KPM_VUL_CONFL_INCR2:			/* kc - -  - */
	case KPM_VUL_CONFL_INCR3:			/* kc - ks - */
		/*
		 * New conflict on an active kpm mapped page not yet in
		 * TSB/TLB. Mark the page and increment the kpm_page
		 * conflict count.
		 */
		ASSERT(newcolor);
		kp->kp_refcntc++;
		pmtx = sfmmu_page_enter(pp);
		PP_SETKPMC(pp);
		sfmmu_page_exit(pmtx);
		break;

	case KPM_VUL_CONFL_DECR1:			/* kc c -  - */
	case KPM_VUL_CONFL_DECR2:			/* kc c ks - */
		/*
		 * A conflicting hme mapping is removed for an active
		 * kpm page not yet in TSB/TLB. Unmark the page and
		 * decrement the kpm_page conflict count.
		 */
		ASSERT(newcolor == 0);
		kp->kp_refcntc--;
		pmtx = sfmmu_page_enter(pp);
		PP_CLRKPMC(pp);
		sfmmu_page_exit(pmtx);
		break;

	case KPM_VUL_TNC:				/* kc c ks s */
		cmn_err(CE_NOTE, "sfmmu_kpm_vac_unload: "
		    "page not in NC state");
		/* FALLTHRU */

	default:
		badstate++;
	}
exit:
	if (badstate) {
		panic("sfmmu_kpm_vac_unload: inconsistent VAC state, "
		    "kpmvaddr=%p kp=%p pp=%p",
		    (void *)kpmvaddr, (void *)kp, (void *)pp);
	}
	mutex_exit(&kpmp->khl_mutex);

	return;

smallpages_vac_unload:
	if (newcolor == 0)
		return;

	PP2KPMSPG(pp, ksp);
	kpmsp = KPMP_SHASH(ksp);

	if (PP_ISKPMC(pp) == 0) {
		if (ksp->kp_mapped == KPM_MAPPEDS) {
			/*
			 * Stop TL tsbmiss handling.
			 */
			(void) sfmmu_kpm_stsbmtl(&ksp->kp_mapped,
			    &kpmsp->kshl_lock, KPM_MAPPEDSC);

			sfmmu_kpm_demap_small(kpmvaddr);

		} else if (ksp->kp_mapped != KPM_MAPPEDSC) {
			panic("sfmmu_kpm_vac_unload: inconsistent mapping");
		}

		pmtx = sfmmu_page_enter(pp);
		PP_SETKPMC(pp);
		sfmmu_page_exit(pmtx);

	} else {
		if (ksp->kp_mapped != KPM_MAPPEDSC)
			panic("sfmmu_kpm_vac_unload: inconsistent mapping");
	}
}
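
/*
 * Illustrative note (inferred from the surrounding code and comments):
 * in the kpm_smallpages case the per-page kp_mapped state is switched
 * from KPM_MAPPEDS to KPM_MAPPEDSC above via sfmmu_kpm_stsbmtl(). This
 * disables trap level tsbmiss handling for the page, so the next access
 * is resolved through hat_kpm_fault, where handling can start over with
 * the correct VAC state.
 */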

/*
 * The page is either marked as being in VAC conflict with an existing
 * kpm mapping, or it is kpm mapped using only the regular pagesize.
 * Called from sfmmu_hblk_unload when an mlist is completely removed.
 */
static void
sfmmu_kpm_hme_unload(page_t *pp)
{
	/* tte assembly */
	kpm_page_t	*kp;
	kpm_hlk_t	*kpmp;
	caddr_t		vaddr;
	kmutex_t	*pmtx;
	uint_t		flags;
	kpm_spage_t	*ksp;

	ASSERT(sfmmu_mlist_held(pp));
	ASSERT(PP_ISMAPPED_KPM(pp));

	flags = pp->p_nrm & (P_KPMC | P_KPMS);
	if (kpm_smallpages)
		goto smallpages_hme_unload;

	if (flags == (P_KPMC | P_KPMS)) {
		panic("sfmmu_kpm_hme_unload: page should be uncached");

	} else if (flags == P_KPMS) {
		/*
		 * Page is mapped small but not involved in a VAC conflict.
		 */
		return;
	}

	vaddr = hat_kpm_page2va(pp, 1);

	PP2KPMPG(pp, kp);
	kpmp = KPMP_HASH(kp);
	mutex_enter(&kpmp->khl_mutex);

	if (IS_KPM_ALIAS_RANGE(vaddr)) {
		if (kp->kp_refcnta < 1) {
			panic("sfmmu_kpm_hme_unload: bad refcnta kpm_page=%p\n",
			    (void *)kp);
		}

	} else {
		if (kp->kp_refcntc < 1) {
			panic("sfmmu_kpm_hme_unload: bad refcntc kpm_page=%p\n",
			    (void *)kp);
		}
		kp->kp_refcntc--;
	}

	pmtx = sfmmu_page_enter(pp);
	PP_CLRKPMC(pp);
	sfmmu_page_exit(pmtx);

	mutex_exit(&kpmp->khl_mutex);
	return;

smallpages_hme_unload:
	if (flags != P_KPMC)
		panic("sfmmu_kpm_hme_unload: page should be uncached");

	vaddr = hat_kpm_page2va(pp, 1);
	PP2KPMSPG(pp, ksp);

	if (ksp->kp_mapped != KPM_MAPPEDSC)
		panic("sfmmu_kpm_hme_unload: inconsistent mapping");

	/*
	 * Keep KPM_MAPPEDSC until the next kpm tsbmiss, where it
	 * prevents TL tsbmiss handling and forces a hat_kpm_fault.
	 * There we can start over again.
	 */

	pmtx = sfmmu_page_enter(pp);
	PP_CLRKPMC(pp);
	sfmmu_page_exit(pmtx);
}

/*
 * Special hooks for sfmmu_page_cache_array() when changing the
 * cacheability of a page. They are used to obey the hat_kpm lock
 * ordering (mlist -> kpmp -> spl, and back).
 */
static kpm_hlk_t *
sfmmu_kpm_kpmp_enter(page_t *pp, pgcnt_t npages)
{
	kpm_page_t	*kp;
	kpm_hlk_t	*kpmp;

	ASSERT(sfmmu_mlist_held(pp));

	if (kpm_smallpages || PP_ISMAPPED_KPM(pp) == 0)
		return (NULL);

	ASSERT(npages <= kpmpnpgs);

	PP2KPMPG(pp, kp);
	kpmp = KPMP_HASH(kp);
	mutex_enter(&kpmp->khl_mutex);

	return (kpmp);
}

static void
sfmmu_kpm_kpmp_exit(kpm_hlk_t *kpmp)
{
	if (kpm_smallpages || kpmp == NULL)
		return;

	mutex_exit(&kpmp->khl_mutex);
}
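
/*
 * Usage sketch (illustrative only; the real caller is
 * sfmmu_page_cache_array(), with details omitted). The enter hook may
 * return NULL (e.g. for kpm_smallpages), which the exit hook tolerates:
 *
 *	pml = sfmmu_mlist_enter(pp);
 *	kpmp = sfmmu_kpm_kpmp_enter(pp, npages);
 *	pmtx = sfmmu_page_enter(pp);
 *	... change the cacheability of the page ...
 *	sfmmu_page_exit(pmtx);
 *	sfmmu_kpm_kpmp_exit(kpmp);
 *	sfmmu_mlist_exit(pml);
 */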

/*
 * Summary states used in sfmmu_kpm_page_cache (KPM_*).
 * See also the more detailed comments within the sfmmu_kpm_page_cache switch.
 * Abbreviations used:
 * UNC: Input state for an uncache request.
 * BIG: Large page kpm mapping in use.
 * SMALL: Page has a small kpm mapping within a kpm_page range.
 * NODEMAP: No demap needed.
 * NOP: No operation needed on this input state.
 * CACHE: Input state for a re-cache request.
 * MAPS: Page is in TNC and kpm VAC conflict state and kpm mapped small.
 * NOMAP: Page is in TNC and kpm VAC conflict state, but not small kpm
 *	mapped.
 * NOMAPO: Page is in TNC and kpm VAC conflict state, but not small kpm
 *	mapped. There are also other small kpm mappings within this
 *	kpm_page.
 */
#define	KPM_UNC_BIG		(0)
#define	KPM_UNC_NODEMAP1	(KPM_KS)
#define	KPM_UNC_SMALL1		(KPM_KS | KPM_S)
#define	KPM_UNC_NODEMAP2	(KPM_KC)
#define	KPM_UNC_NODEMAP3	(KPM_KC | KPM_KS)
#define	KPM_UNC_SMALL2		(KPM_KC | KPM_KS | KPM_S)
#define	KPM_UNC_NOP1		(KPM_KC | KPM_C)
#define	KPM_UNC_NOP2		(KPM_KC | KPM_C | KPM_KS)
#define	KPM_CACHE_NOMAP		(KPM_KC | KPM_C)
#define	KPM_CACHE_NOMAPO	(KPM_KC | KPM_C | KPM_KS)
#define	KPM_CACHE_MAPS		(KPM_KC | KPM_C | KPM_KS | KPM_S)

/*
 * This function is called when the virtual cacheability of a page
 * is changed and the page has an active kpm mapping. The mlist mutex,
 * the spl hash lock and the kpmp mutex (if needed) are already grabbed.
 */
static void
sfmmu_kpm_page_cache(page_t *pp, int flags, int cache_flush_tag)
{
	kpm_page_t	*kp;
	kpm_hlk_t	*kpmp;
	caddr_t		kpmvaddr;
	int		badstate = 0;
	uint_t		pgcacase;
	kpm_spage_t	*ksp;
	kpm_shlk_t	*kpmsp;
	int		oldval;

	ASSERT(PP_ISMAPPED_KPM(pp));
	ASSERT(sfmmu_mlist_held(pp));
	ASSERT(sfmmu_page_spl_held(pp));

	if (flags != HAT_TMPNC && flags != HAT_CACHE)
		panic("sfmmu_kpm_page_cache: bad flags");

	kpmvaddr = hat_kpm_page2va(pp, 1);

	if (flags == HAT_TMPNC && cache_flush_tag == CACHE_FLUSH) {
		pfn_t pfn = pp->p_pagenum;
		int vcolor = addr_to_vcolor(kpmvaddr);
		cpuset_t cpuset = cpu_ready_set;

		/* Flush vcolor in DCache */
		CPUSET_DEL(cpuset, CPU->cpu_id);
		SFMMU_XCALL_STATS(ksfmmup);
		xt_some(cpuset, vac_flushpage_tl1, pfn, vcolor);
		vac_flushpage(pfn, vcolor);
	}

	if (kpm_smallpages)
		goto smallpages_page_cache;

	PP2KPMPG(pp, kp);
	kpmp = KPMP_HASH(kp);
	ASSERT(MUTEX_HELD(&kpmp->khl_mutex));

	if (IS_KPM_ALIAS_RANGE(kpmvaddr)) {
		if (kp->kp_refcnta < 1) {
			panic("sfmmu_kpm_page_cache: bad refcnta "
			    "kpm_page=%p\n", (void *)kp);
		}
		sfmmu_kpm_demap_small(kpmvaddr);
		if (flags == HAT_TMPNC) {
			PP_SETKPMC(pp);
			ASSERT(!PP_ISKPMS(pp));
		} else {
			ASSERT(PP_ISKPMC(pp));
			PP_CLRKPMC(pp);
		}
		goto exit;
	}

	badstate = (kp->kp_refcnt < 0 || kp->kp_refcnts < 0);
	if (kp->kp_refcntc == -1) {
		/*
		 * We should come here only if the trap level tsbmiss
		 * handler is disabled.
		 */
		badstate |= (kp->kp_refcnt == 0 || kp->kp_refcnts > 0 ||
		    PP_ISKPMC(pp) || PP_ISKPMS(pp) || PP_ISNC(pp));
	} else {
		badstate |= (kp->kp_refcntc < 0);
	}

	if (badstate)
		goto exit;

	/*
	 * Combine the per kpm_page and per page kpm VAC states into
	 * a summary state in order to make the VAC cache/uncache
	 * handling more concise.
	 */
	pgcacase = (((kp->kp_refcntc > 0) ? KPM_KC : 0) |
	    ((kp->kp_refcnts > 0) ? KPM_KS : 0) |
	    (PP_ISKPMC(pp) ? KPM_C : 0) |
	    (PP_ISKPMS(pp) ? KPM_S : 0));
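
	/*
	 * Illustrative examples of the summary encoding (not exhaustive):
	 * an uncache request (HAT_TMPNC) for a page covered only by the
	 * large kpm mapping has all four bits clear, giving KPM_UNC_BIG
	 * (the 4M mapping is broken up below); a re-cache request
	 * (HAT_CACHE) for a conflict page that is still kpm mapped small
	 * gives (KPM_KC | KPM_C | KPM_KS | KPM_S), i.e. KPM_CACHE_MAPS.
	 */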

	if (flags == HAT_CACHE) {
		switch (pgcacase) {
		case KPM_CACHE_MAPS:			/* kc c ks s */
			sfmmu_kpm_demap_small(kpmvaddr);
			if (kp->kp_refcnts < 1) {
				panic("sfmmu_kpm_page_cache: bad refcnts "
				    "kpm_page=%p\n", (void *)kp);
			}
			kp->kp_refcnts--;
			kp->kp_refcnt++;
			PP_CLRKPMS(pp);
			/* FALLTHRU */

		case KPM_CACHE_NOMAP:			/* kc c -  - */
		case KPM_CACHE_NOMAPO:			/* kc c ks - */
			kp->kp_refcntc--;
			PP_CLRKPMC(pp);
			break;

		default:
			badstate++;
		}
		goto exit;
	}

	switch (pgcacase) {
	case KPM_UNC_BIG:				/* - - - - */
		if (kp->kp_refcnt < 1) {
			panic("sfmmu_kpm_page_cache: bad refcnt "
			    "kpm_page=%p\n", (void *)kp);
		}

		/*
		 * Have to break up the large page mapping in preparation
		 * for the upcoming TNC mode handled by small mappings.
		 * The demap may already have been done due to another
		 * conflict within the kpm_page.
		 */
		if (kp->kp_refcntc == -1) {
			/* remove go indication */
			sfmmu_kpm_tsbmtl(&kp->kp_refcntc,
			    &kpmp->khl_lock, KPMTSBM_STOP);
		}
		ASSERT(kp->kp_refcntc == 0);
		sfmmu_kpm_demap_large(kpmvaddr);
		kp->kp_refcntc++;
		PP_SETKPMC(pp);
		break;

	case KPM_UNC_SMALL1:				/* -  - ks s */
	case KPM_UNC_SMALL2:				/* kc - ks s */
		/*
		 * Have to demap an already small kpm mapping in preparation
		 * for the upcoming TNC mode. The demap may already have
		 * been done due to another conflict within the kpm_page.
		 */
		sfmmu_kpm_demap_small(kpmvaddr);
		kp->kp_refcntc++;
		kp->kp_refcnts--;
		kp->kp_refcnt++;
		PP_CLRKPMS(pp);
		PP_SETKPMC(pp);
		break;

	case KPM_UNC_NODEMAP1:				/* -  - ks - */
		/* fallthru */

	case KPM_UNC_NODEMAP2:				/* kc - -  - */
	case KPM_UNC_NODEMAP3:				/* kc - ks - */
		kp->kp_refcntc++;
		PP_SETKPMC(pp);
		break;

	case KPM_UNC_NOP1:				/* kc c -  - */
	case KPM_UNC_NOP2:				/* kc c ks - */
		break;

	default:
		badstate++;
	}
exit:
	if (badstate) {
		panic("sfmmu_kpm_page_cache: inconsistent VAC state "
		    "kpmvaddr=%p kp=%p pp=%p", (void *)kpmvaddr,
		    (void *)kp, (void *)pp);
	}
	return;

smallpages_page_cache:
	PP2KPMSPG(pp, ksp);
	kpmsp = KPMP_SHASH(ksp);

	oldval = sfmmu_kpm_stsbmtl(&ksp->kp_mapped,
	    &kpmsp->kshl_lock, KPM_MAPPEDSC);

	if (!(oldval == KPM_MAPPEDS || oldval == KPM_MAPPEDSC))
		panic("smallpages_page_cache: inconsistent mapping");

	sfmmu_kpm_demap_small(kpmvaddr);

	if (flags == HAT_TMPNC) {
		PP_SETKPMC(pp);
		ASSERT(!PP_ISKPMS(pp));

	} else {
		ASSERT(PP_ISKPMC(pp));
		PP_CLRKPMC(pp);
	}

	/*
	 * Keep KPM_MAPPEDSC until the next kpm tsbmiss, where it
	 * prevents TL tsbmiss handling and forces a hat_kpm_fault.
	 * There we can start over again.
	 */
}

/*
 * unused in sfmmu
 */
void
hat_dump(void)
{
}

/*
 * Called when a thread is exiting and we have switched to the kernel address
 * space. Perform the same VM initialization that resume() uses when switching
 * processes.
 *
 * Note that sfmmu_load_mmustate() is currently a no-op for kernel threads, but
 * we call it anyway in case the semantics change in the future.
 */
/*ARGSUSED*/
void
hat_thread_exit(kthread_t *thd)
{
	uint64_t	pgsz_cnum;
	uint_t		pstate_save;

	ASSERT(thd->t_procp->p_as == &kas);

	pgsz_cnum = KCONTEXT;
#ifdef sun4u
	pgsz_cnum |= (ksfmmup->sfmmu_cext << CTXREG_EXT_SHIFT);
#endif
	/*
	 * Note that sfmmu_load_mmustate() is currently a no-op for
	 * kernel threads. We need to disable interrupts here because
	 * sfmmu_load_mmustate() would panic if its caller has not
	 * disabled them.
	 */
	pstate_save = sfmmu_disable_intrs();
	sfmmu_setctx_sec(pgsz_cnum);
	sfmmu_load_mmustate(ksfmmup);
	sfmmu_enable_intrs(pstate_save);
}