1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * VM - Hardware Address Translation management for Spitfire MMU. 31 * 32 * This file implements the machine specific hardware translation 33 * needed by the VM system. The machine independent interface is 34 * described in <vm/hat.h> while the machine dependent interface 35 * and data structures are described in <vm/hat_sfmmu.h>. 36 * 37 * The hat layer manages the address translation hardware as a cache 38 * driven by calls from the higher levels in the VM system. 39 */ 40 41 #include <sys/types.h> 42 #include <vm/hat.h> 43 #include <vm/hat_sfmmu.h> 44 #include <vm/page.h> 45 #include <sys/pte.h> 46 #include <sys/systm.h> 47 #include <sys/mman.h> 48 #include <sys/sysmacros.h> 49 #include <sys/machparam.h> 50 #include <sys/vtrace.h> 51 #include <sys/kmem.h> 52 #include <sys/mmu.h> 53 #include <sys/cmn_err.h> 54 #include <sys/cpu.h> 55 #include <sys/cpuvar.h> 56 #include <sys/debug.h> 57 #include <sys/lgrp.h> 58 #include <sys/archsystm.h> 59 #include <sys/machsystm.h> 60 #include <sys/vmsystm.h> 61 #include <vm/as.h> 62 #include <vm/seg.h> 63 #include <vm/seg_kp.h> 64 #include <vm/seg_kmem.h> 65 #include <vm/seg_kpm.h> 66 #include <vm/rm.h> 67 #include <sys/t_lock.h> 68 #include <sys/obpdefs.h> 69 #include <sys/vm_machparam.h> 70 #include <sys/var.h> 71 #include <sys/trap.h> 72 #include <sys/machtrap.h> 73 #include <sys/scb.h> 74 #include <sys/bitmap.h> 75 #include <sys/machlock.h> 76 #include <sys/membar.h> 77 #include <sys/atomic.h> 78 #include <sys/cpu_module.h> 79 #include <sys/prom_debug.h> 80 #include <sys/ksynch.h> 81 #include <sys/mem_config.h> 82 #include <sys/mem_cage.h> 83 #include <sys/dtrace.h> 84 #include <vm/vm_dep.h> 85 #include <vm/xhat_sfmmu.h> 86 #include <sys/fpu/fpusystm.h> 87 88 #if defined(SF_ERRATA_57) 89 extern caddr_t errata57_limit; 90 #endif 91 92 #define HME8BLK_SZ_RND ((roundup(HME8BLK_SZ, sizeof (int64_t))) / \ 93 (sizeof (int64_t))) 94 #define HBLK_RESERVE ((struct hme_blk *)hblk_reserve) 95 96 #define HBLK_RESERVE_CNT 128 97 #define HBLK_RESERVE_MIN 20 98 99 static struct hme_blk *freehblkp; 100 static kmutex_t freehblkp_lock; 101 static int freehblkcnt; 102 103 static int64_t hblk_reserve[HME8BLK_SZ_RND]; 104 static kmutex_t hblk_reserve_lock; 105 static kthread_t *hblk_reserve_thread; 106 107 static nucleus_hblk8_info_t nucleus_hblk8; 108 static nucleus_hblk1_info_t nucleus_hblk1; 109 110 /* 111 * SFMMU specific hat functions 112 */ 113 void hat_pagecachectl(struct page *, int); 114 115 /* flags for hat_pagecachectl */ 116 #define 
HAT_CACHE 0x1 117 #define HAT_UNCACHE 0x2 118 #define HAT_TMPNC 0x4 119 120 /* 121 * Flag to allow the creation of non-cacheable translations 122 * to system memory. It is off by default. At the moment this 123 * flag is used by the ecache error injector. The error injector 124 * will turn it on when creating such a translation then shut it 125 * off when it's finished. 126 */ 127 128 int sfmmu_allow_nc_trans = 0; 129 130 /* 131 * Flag to disable large page support. 132 * value of 1 => disable all large pages. 133 * bits 1, 2, and 3 are to disable 64K, 512K and 4M pages respectively. 134 * 135 * For example, use the value 0x4 to disable 512K pages. 136 * 137 */ 138 #define LARGE_PAGES_OFF 0x1 139 140 /* 141 * WARNING: 512K pages MUST be disabled for ISM/DISM. If not 142 * a process would page fault indefinitely if it tried to 143 * access a 512K page. 144 */ 145 int disable_ism_large_pages = (1 << TTE512K); 146 int disable_large_pages = 0; 147 int disable_auto_large_pages = 0; 148 149 /* 150 * Private sfmmu data structures for hat management 151 */ 152 static struct kmem_cache *sfmmuid_cache; 153 154 /* 155 * Private sfmmu data structures for ctx management 156 */ 157 static struct ctx *ctxhand; /* hand used while stealing ctxs */ 158 static struct ctx *ctxfree; /* head of free ctx list */ 159 static struct ctx *ctxdirty; /* head of dirty ctx list */ 160 161 /* 162 * Private sfmmu data structures for tsb management 163 */ 164 static struct kmem_cache *sfmmu_tsbinfo_cache; 165 static struct kmem_cache *sfmmu_tsb8k_cache; 166 static struct kmem_cache *sfmmu_tsb_cache[NLGRPS_MAX]; 167 static vmem_t *kmem_tsb_arena; 168 169 /* 170 * sfmmu static variables for hmeblk resource management. 171 */ 172 static vmem_t *hat_memload1_arena; /* HAT translation arena for sfmmu1_cache */ 173 static struct kmem_cache *sfmmu8_cache; 174 static struct kmem_cache *sfmmu1_cache; 175 static struct kmem_cache *pa_hment_cache; 176 177 static kmutex_t ctx_list_lock; /* mutex for ctx free/dirty lists */ 178 static kmutex_t ism_mlist_lock; /* mutex for ism mapping list */ 179 /* 180 * private data for ism 181 */ 182 static struct kmem_cache *ism_blk_cache; 183 static struct kmem_cache *ism_ment_cache; 184 #define ISMID_STARTADDR NULL 185 186 /* 187 * Whether to delay TLB flushes and use Cheetah's flush-all support 188 * when removing contexts from the dirty list. 189 */ 190 int delay_tlb_flush; 191 int disable_delay_tlb_flush; 192 193 /* 194 * ``hat_lock'' is a hashed mutex lock for protecting sfmmu TSB lists, 195 * HAT flags, synchronizing TLB/TSB coherency, and context management. 196 * The lock is hashed on the sfmmup since the case where we need to lock 197 * all processes is rare but does occur (e.g. we need to unload a shared 198 * mapping from all processes using the mapping). We have a lot of buckets, 199 * and each slab of sfmmu_t's can use about a quarter of them, giving us 200 * a fairly good distribution without wasting too much space and overhead 201 * when we have to grab them all. 202 */ 203 #define SFMMU_NUM_LOCK 128 /* must be power of two */ 204 hatlock_t hat_lock[SFMMU_NUM_LOCK]; 205 206 /* 207 * Hash algorithm optimized for a small number of slabs. 208 * 7 is (highbit((sizeof sfmmu_t)) - 1) 209 * This hash algorithm is based upon the knowledge that sfmmu_t's come from a 210 * kmem_cache, and thus they will be sequential within that cache. 
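 *
 * As a stand-alone illustration (not part of the original source; the
 * constants simply mirror PTR_HASH/TSB_HASH below, with SFMMU_NUM_LOCK == 128
 * and a 7-bit shift):
 *
 *	uintptr_t a = (uintptr_t)sfmmup;	some sfmmu_t in a slab
 *	uintptr_t b = a + (1 << 7);		a neighbor 128 bytes later
 *	int bucket_a = (a >> 7) & (128 - 1);
 *	int bucket_b = (b >> 7) & (128 - 1);	the next bucket (mod 128)
 *
 * so sfmmu_t's handed out sequentially from one slab spread across the lock
 * buckets instead of piling onto a single hat_lock entry.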
In 211 * addition, each new slab will have a different "color" up to cache_maxcolor 212 * which will skew the hashing for each successive slab which is allocated. 213 * If the size of sfmmu_t changed to a larger size, this algorithm may need 214 * to be revisited. 215 */ 216 #define TSB_HASH_SHIFT_BITS (7) 217 #define PTR_HASH(x) ((uintptr_t)x >> TSB_HASH_SHIFT_BITS) 218 219 #ifdef DEBUG 220 int tsb_hash_debug = 0; 221 #define TSB_HASH(sfmmup) \ 222 (tsb_hash_debug ? &hat_lock[0] : \ 223 &hat_lock[PTR_HASH(sfmmup) & (SFMMU_NUM_LOCK-1)]) 224 #else /* DEBUG */ 225 #define TSB_HASH(sfmmup) &hat_lock[PTR_HASH(sfmmup) & (SFMMU_NUM_LOCK-1)] 226 #endif /* DEBUG */ 227 228 229 /* sfmmu_replace_tsb() return codes. */ 230 typedef enum tsb_replace_rc { 231 TSB_SUCCESS, 232 TSB_ALLOCFAIL, 233 TSB_LOSTRACE, 234 TSB_ALREADY_SWAPPED, 235 TSB_CANTGROW 236 } tsb_replace_rc_t; 237 238 /* 239 * Flags for TSB allocation routines. 240 */ 241 #define TSB_ALLOC 0x01 242 #define TSB_FORCEALLOC 0x02 243 #define TSB_GROW 0x04 244 #define TSB_SHRINK 0x08 245 #define TSB_SWAPIN 0x10 246 247 /* 248 * Support for HAT callbacks. 249 */ 250 #define SFMMU_MAX_RELOC_CALLBACKS 10 251 int sfmmu_max_cb_id = SFMMU_MAX_RELOC_CALLBACKS; 252 static id_t sfmmu_cb_nextid = 0; 253 static id_t sfmmu_tsb_cb_id; 254 struct sfmmu_callback *sfmmu_cb_table; 255 256 /* 257 * Kernel page relocation is enabled by default for non-caged 258 * kernel pages. This has little effect unless segkmem_reloc is 259 * set, since by default kernel memory comes from inside the 260 * kernel cage. 261 */ 262 int hat_kpr_enabled = 1; 263 264 kmutex_t kpr_mutex; 265 kmutex_t kpr_suspendlock; 266 kthread_t *kreloc_thread; 267 268 /* 269 * Enable VA->PA translation sanity checking on DEBUG kernels. 270 * Disabled by default. This is incompatible with some 271 * drivers (error injector, RSM) so if it breaks you get 272 * to keep both pieces. 
273 */ 274 int hat_check_vtop = 0; 275 276 /* 277 * Private sfmmu routines (prototypes) 278 */ 279 static struct hme_blk *sfmmu_shadow_hcreate(sfmmu_t *, caddr_t, int, uint_t); 280 static struct hme_blk *sfmmu_hblk_alloc(sfmmu_t *, caddr_t, 281 struct hmehash_bucket *, uint_t, hmeblk_tag, uint_t); 282 static caddr_t sfmmu_hblk_unload(struct hat *, struct hme_blk *, caddr_t, 283 caddr_t, demap_range_t *, uint_t); 284 static caddr_t sfmmu_hblk_sync(struct hat *, struct hme_blk *, caddr_t, 285 caddr_t, int); 286 static void sfmmu_hblk_free(struct hmehash_bucket *, struct hme_blk *, 287 uint64_t, struct hme_blk **); 288 static void sfmmu_hblks_list_purge(struct hme_blk **); 289 static uint_t sfmmu_get_free_hblk(struct hme_blk **, uint_t); 290 static uint_t sfmmu_put_free_hblk(struct hme_blk *, uint_t); 291 static struct hme_blk *sfmmu_hblk_steal(int); 292 static int sfmmu_steal_this_hblk(struct hmehash_bucket *, 293 struct hme_blk *, uint64_t, uint64_t, 294 struct hme_blk *); 295 static caddr_t sfmmu_hblk_unlock(struct hme_blk *, caddr_t, caddr_t); 296 297 static void sfmmu_memload_batchsmall(struct hat *, caddr_t, page_t **, 298 uint_t, uint_t, pgcnt_t); 299 void sfmmu_tteload(struct hat *, tte_t *, caddr_t, page_t *, 300 uint_t); 301 static int sfmmu_tteload_array(sfmmu_t *, tte_t *, caddr_t, page_t **, 302 uint_t); 303 static struct hmehash_bucket *sfmmu_tteload_acquire_hashbucket(sfmmu_t *, 304 caddr_t, int); 305 static struct hme_blk *sfmmu_tteload_find_hmeblk(sfmmu_t *, 306 struct hmehash_bucket *, caddr_t, uint_t, uint_t); 307 static int sfmmu_tteload_addentry(sfmmu_t *, struct hme_blk *, tte_t *, 308 caddr_t, page_t **, uint_t); 309 static void sfmmu_tteload_release_hashbucket(struct hmehash_bucket *); 310 311 static int sfmmu_pagearray_setup(caddr_t, page_t **, tte_t *, int); 312 pfn_t sfmmu_uvatopfn(caddr_t, sfmmu_t *); 313 void sfmmu_memtte(tte_t *, pfn_t, uint_t, int); 314 static void sfmmu_vac_conflict(struct hat *, caddr_t, page_t *); 315 static int sfmmu_vacconflict_array(caddr_t, page_t *, int *); 316 static int tst_tnc(page_t *pp, pgcnt_t); 317 static void conv_tnc(page_t *pp, int); 318 319 static struct ctx *sfmmu_get_ctx(sfmmu_t *); 320 static void sfmmu_free_ctx(sfmmu_t *, struct ctx *); 321 static void sfmmu_free_sfmmu(sfmmu_t *); 322 323 static void sfmmu_gettte(struct hat *, caddr_t, tte_t *); 324 static void sfmmu_ttesync(struct hat *, caddr_t, tte_t *, page_t *); 325 static void sfmmu_chgattr(struct hat *, caddr_t, size_t, uint_t, int); 326 327 static cpuset_t sfmmu_pageunload(page_t *, struct sf_hment *, int); 328 static void hat_pagereload(struct page *, struct page *); 329 static cpuset_t sfmmu_pagesync(page_t *, struct sf_hment *, uint_t); 330 static void sfmmu_page_cache_array(page_t *, int, int, pgcnt_t); 331 static void sfmmu_page_cache(page_t *, int, int, int); 332 333 static void sfmmu_tlbcache_demap(caddr_t, sfmmu_t *, struct hme_blk *, 334 pfn_t, int, int, int, int); 335 static void sfmmu_ismtlbcache_demap(caddr_t, sfmmu_t *, struct hme_blk *, 336 pfn_t, int); 337 static void sfmmu_tlb_demap(caddr_t, sfmmu_t *, struct hme_blk *, int, int); 338 static void sfmmu_tlb_range_demap(demap_range_t *); 339 static void sfmmu_tlb_ctx_demap(sfmmu_t *); 340 static void sfmmu_tlb_all_demap(void); 341 static void sfmmu_tlb_swap_ctx(sfmmu_t *, struct ctx *); 342 static void sfmmu_sync_mmustate(sfmmu_t *); 343 344 static void sfmmu_tsbinfo_setup_phys(struct tsb_info *, pfn_t); 345 static int sfmmu_tsbinfo_alloc(struct tsb_info **, int, int, uint_t, 346 sfmmu_t *); 347 
static void sfmmu_tsb_free(struct tsb_info *); 348 static void sfmmu_tsbinfo_free(struct tsb_info *); 349 static int sfmmu_init_tsbinfo(struct tsb_info *, int, int, uint_t, 350 sfmmu_t *); 351 352 static void sfmmu_tsb_swapin(sfmmu_t *, hatlock_t *); 353 static int sfmmu_select_tsb_szc(pgcnt_t); 354 static void sfmmu_mod_tsb(sfmmu_t *, caddr_t, tte_t *, int); 355 #define sfmmu_load_tsb(sfmmup, vaddr, tte, szc) \ 356 sfmmu_mod_tsb(sfmmup, vaddr, tte, szc) 357 #define sfmmu_unload_tsb(sfmmup, vaddr, szc) \ 358 sfmmu_mod_tsb(sfmmup, vaddr, NULL, szc) 359 static void sfmmu_copy_tsb(struct tsb_info *, struct tsb_info *); 360 static tsb_replace_rc_t sfmmu_replace_tsb(sfmmu_t *, struct tsb_info *, uint_t, 361 hatlock_t *, uint_t); 362 static void sfmmu_size_tsb(sfmmu_t *, int, uint64_t, uint64_t); 363 364 static void sfmmu_cache_flush(pfn_t, int); 365 void sfmmu_cache_flushcolor(int, pfn_t); 366 static caddr_t sfmmu_hblk_chgattr(sfmmu_t *, struct hme_blk *, caddr_t, 367 caddr_t, demap_range_t *, uint_t, int); 368 369 static uint64_t sfmmu_vtop_attr(uint_t, int mode, tte_t *); 370 static uint_t sfmmu_ptov_attr(tte_t *); 371 static caddr_t sfmmu_hblk_chgprot(sfmmu_t *, struct hme_blk *, caddr_t, 372 caddr_t, demap_range_t *, uint_t); 373 static uint_t sfmmu_vtop_prot(uint_t, uint_t *); 374 static int sfmmu_idcache_constructor(void *, void *, int); 375 static void sfmmu_idcache_destructor(void *, void *); 376 static int sfmmu_hblkcache_constructor(void *, void *, int); 377 static void sfmmu_hblkcache_destructor(void *, void *); 378 static void sfmmu_hblkcache_reclaim(void *); 379 static void sfmmu_shadow_hcleanup(sfmmu_t *, struct hme_blk *, 380 struct hmehash_bucket *); 381 static void sfmmu_free_hblks(sfmmu_t *, caddr_t, caddr_t, int); 382 383 static void sfmmu_reuse_ctx(struct ctx *, sfmmu_t *); 384 static void sfmmu_disallow_ctx_steal(sfmmu_t *); 385 static void sfmmu_allow_ctx_steal(sfmmu_t *); 386 387 static void sfmmu_rm_large_mappings(page_t *, int); 388 389 static void hat_lock_init(void); 390 static void hat_kstat_init(void); 391 static int sfmmu_kstat_percpu_update(kstat_t *ksp, int rw); 392 static void sfmmu_check_page_sizes(sfmmu_t *, int); 393 static int fnd_mapping_sz(page_t *); 394 static void iment_add(struct ism_ment *, struct hat *); 395 static void iment_sub(struct ism_ment *, struct hat *); 396 static pgcnt_t ism_tsb_entries(sfmmu_t *, int szc); 397 extern void sfmmu_setup_tsbinfo(sfmmu_t *); 398 extern void sfmmu_clear_utsbinfo(void); 399 400 /* kpm prototypes */ 401 static caddr_t sfmmu_kpm_mapin(page_t *); 402 static void sfmmu_kpm_mapout(page_t *, caddr_t); 403 static int sfmmu_kpme_lookup(struct kpme *, page_t *); 404 static void sfmmu_kpme_add(struct kpme *, page_t *); 405 static void sfmmu_kpme_sub(struct kpme *, page_t *); 406 static caddr_t sfmmu_kpm_getvaddr(page_t *, int *); 407 static int sfmmu_kpm_fault(caddr_t, struct memseg *, page_t *); 408 static int sfmmu_kpm_fault_small(caddr_t, struct memseg *, page_t *); 409 static void sfmmu_kpm_vac_conflict(page_t *, caddr_t); 410 static void sfmmu_kpm_pageunload(page_t *); 411 static void sfmmu_kpm_vac_unload(page_t *, caddr_t); 412 static void sfmmu_kpm_demap_large(caddr_t); 413 static void sfmmu_kpm_demap_small(caddr_t); 414 static void sfmmu_kpm_demap_tlbs(caddr_t, int); 415 static void sfmmu_kpm_hme_unload(page_t *); 416 static kpm_hlk_t *sfmmu_kpm_kpmp_enter(page_t *, pgcnt_t); 417 static void sfmmu_kpm_kpmp_exit(kpm_hlk_t *kpmp); 418 static void sfmmu_kpm_page_cache(page_t *, int, int); 419 420 /* kpm globals */ 421 
#ifdef DEBUG 422 /* 423 * Enable trap level tsbmiss handling 424 */ 425 int kpm_tsbmtl = 1; 426 427 /* 428 * Flush the TLB on kpm mapout. Note: Xcalls are used (again) for the 429 * required TLB shootdowns in this case, so handle w/ care. Off by default. 430 */ 431 int kpm_tlb_flush; 432 #endif /* DEBUG */ 433 434 static void *sfmmu_vmem_xalloc_aligned_wrapper(vmem_t *, size_t, int); 435 436 #ifdef DEBUG 437 static void sfmmu_check_hblk_flist(); 438 #endif 439 440 /* 441 * Semi-private sfmmu data structures. Some of them are initialize in 442 * startup or in hat_init. Some of them are private but accessed by 443 * assembly code or mach_sfmmu.c 444 */ 445 struct hmehash_bucket *uhme_hash; /* user hmeblk hash table */ 446 struct hmehash_bucket *khme_hash; /* kernel hmeblk hash table */ 447 uint64_t uhme_hash_pa; /* PA of uhme_hash */ 448 uint64_t khme_hash_pa; /* PA of khme_hash */ 449 int uhmehash_num; /* # of buckets in user hash table */ 450 int khmehash_num; /* # of buckets in kernel hash table */ 451 struct ctx *ctxs; /* used by <machine/mmu.c> */ 452 uint_t nctxs; /* total number of contexts */ 453 454 int cache; /* describes system cache */ 455 456 caddr_t ktsb_base; /* kernel 8k-indexed tsb base address */ 457 uint64_t ktsb_pbase; /* kernel 8k-indexed tsb phys address */ 458 int ktsb_szcode; /* kernel 8k-indexed tsb size code */ 459 int ktsb_sz; /* kernel 8k-indexed tsb size */ 460 461 caddr_t ktsb4m_base; /* kernel 4m-indexed tsb base address */ 462 uint64_t ktsb4m_pbase; /* kernel 4m-indexed tsb phys address */ 463 int ktsb4m_szcode; /* kernel 4m-indexed tsb size code */ 464 int ktsb4m_sz; /* kernel 4m-indexed tsb size */ 465 466 uint64_t kpm_tsbbase; /* kernel seg_kpm 4M TSB base address */ 467 int kpm_tsbsz; /* kernel seg_kpm 4M TSB size code */ 468 uint64_t kpmsm_tsbbase; /* kernel seg_kpm 8K TSB base address */ 469 int kpmsm_tsbsz; /* kernel seg_kpm 8K TSB size code */ 470 471 #ifndef sun4v 472 int utsb_dtlb_ttenum = -1; /* index in TLB for utsb locked TTE */ 473 int utsb4m_dtlb_ttenum = -1; /* index in TLB for 4M TSB TTE */ 474 int dtlb_resv_ttenum; /* index in TLB of first reserved TTE */ 475 caddr_t utsb_vabase; /* reserved kernel virtual memory */ 476 caddr_t utsb4m_vabase; /* for trap handler TSB accesses */ 477 #endif /* sun4v */ 478 uint64_t tsb_alloc_bytes = 0; /* bytes allocated to TSBs */ 479 vmem_t *kmem_tsb_default_arena[NLGRPS_MAX]; /* For dynamic TSBs */ 480 481 /* 482 * Size to use for TSB slabs. Future platforms that support page sizes 483 * larger than 4M may wish to change these values, and provide their own 484 * assembly macros for building and decoding the TSB base register contents. 485 */ 486 uint_t tsb_slab_size = MMU_PAGESIZE4M; 487 uint_t tsb_slab_shift = MMU_PAGESHIFT4M; 488 uint_t tsb_slab_ttesz = TTE4M; 489 uint_t tsb_slab_mask = 0x1ff; /* 4M page alignment for 8K pfn */ 490 491 /* largest TSB size to grow to, will be smaller on smaller memory systems */ 492 int tsb_max_growsize = UTSB_MAX_SZCODE; 493 494 /* 495 * Tunable parameters dealing with TSB policies. 496 */ 497 498 /* 499 * This undocumented tunable forces all 8K TSBs to be allocated from 500 * the kernel heap rather than from the kmem_tsb_default_arena arenas. 501 */ 502 #ifdef DEBUG 503 int tsb_forceheap = 0; 504 #endif /* DEBUG */ 505 506 /* 507 * Decide whether to use per-lgroup arenas, or one global set of 508 * TSB arenas. The default is not to break up per-lgroup, since 509 * most platforms don't recognize any tangible benefit from it. 
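 *
 * (Illustrative sketch only, not from the original: a consumer of these
 * arenas would index kmem_tsb_default_arena[] by lgroup only when the knob
 * is set, e.g.
 *
 *	int idx = tsb_lgrp_affinity ? my_home_lgrp_id() : 0;
 *	vmem_t *arena = kmem_tsb_default_arena[idx];
 *
 * where my_home_lgrp_id() is just a placeholder for "this thread's home
 * lgroup"; hat_init() below creates either NLGRPS_MAX per-lgroup arenas or a
 * single arena at index 0 accordingly.)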
510 */ 511 int tsb_lgrp_affinity = 0; 512 513 /* 514 * Used for growing the TSB based on the process RSS. 515 * tsb_rss_factor is based on the smallest TSB, and is 516 * shifted by the TSB size to determine if we need to grow. 517 * The default will grow the TSB if the number of TTEs for 518 * this page size exceeds 75% of the number of TSB entries, 519 * which should _almost_ eliminate all conflict misses 520 * (at the expense of using up lots and lots of memory). 521 */ 522 #define TSB_RSS_FACTOR (TSB_ENTRIES(TSB_MIN_SZCODE) * 0.75) 523 #define SFMMU_RSS_TSBSIZE(tsbszc) (tsb_rss_factor << tsbszc) 524 #define SELECT_TSB_SIZECODE(pgcnt) ( \ 525 (enable_tsb_rss_sizing)? sfmmu_select_tsb_szc(pgcnt) : \ 526 default_tsb_size) 527 #define TSB_OK_SHRINK() \ 528 (tsb_alloc_bytes > tsb_alloc_hiwater || freemem < desfree) 529 #define TSB_OK_GROW() \ 530 (tsb_alloc_bytes < tsb_alloc_hiwater && freemem > desfree) 531 532 int enable_tsb_rss_sizing = 1; 533 int tsb_rss_factor = (int)TSB_RSS_FACTOR; 534 535 /* which TSB size code to use for new address spaces or if rss sizing off */ 536 int default_tsb_size = TSB_8K_SZCODE; 537 538 static uint64_t tsb_alloc_hiwater; /* limit TSB reserved memory */ 539 uint64_t tsb_alloc_hiwater_factor; /* tsb_alloc_hiwater = physmem / this */ 540 #define TSB_ALLOC_HIWATER_FACTOR_DEFAULT 32 541 542 #ifdef DEBUG 543 static int tsb_random_size = 0; /* set to 1 to test random tsb sizes on alloc */ 544 static int tsb_grow_stress = 0; /* if set to 1, keep replacing TSB w/ random */ 545 static int tsb_alloc_mtbf = 0; /* fail allocation every n attempts */ 546 static int tsb_alloc_fail_mtbf = 0; 547 static int tsb_alloc_count = 0; 548 #endif /* DEBUG */ 549 550 /* if set to 1, will remap valid TTEs when growing TSB. */ 551 int tsb_remap_ttes = 1; 552 553 /* if we have more than this many mappings, allocate a second TSB. */ 554 int tsb_sectsb_threshold = 1; 555 556 /* 557 * kstat data 558 */ 559 struct sfmmu_global_stat sfmmu_global_stat; 560 struct sfmmu_tsbsize_stat sfmmu_tsbsize_stat; 561 562 /* 563 * Global data 564 */ 565 sfmmu_t *ksfmmup; /* kernel's hat id */ 566 struct ctx *kctx; /* kernel's context */ 567 568 #ifdef DEBUG 569 static void chk_tte(tte_t *, tte_t *, tte_t *, struct hme_blk *); 570 #endif 571 572 /* sfmmu locking operations */ 573 static kmutex_t *sfmmu_mlspl_enter(struct page *, int); 574 static int sfmmu_mlspl_held(struct page *, int); 575 576 static kmutex_t *sfmmu_page_enter(page_t *); 577 static void sfmmu_page_exit(kmutex_t *); 578 static int sfmmu_page_spl_held(struct page *); 579 580 /* sfmmu internal locking operations - accessed directly */ 581 static void sfmmu_mlist_reloc_enter(page_t *, page_t *, 582 kmutex_t **, kmutex_t **); 583 static void sfmmu_mlist_reloc_exit(kmutex_t *, kmutex_t *); 584 static hatlock_t * 585 sfmmu_hat_enter(sfmmu_t *); 586 static hatlock_t * 587 sfmmu_hat_tryenter(sfmmu_t *); 588 static void sfmmu_hat_exit(hatlock_t *); 589 static void sfmmu_hat_lock_all(void); 590 static void sfmmu_hat_unlock_all(void); 591 static void sfmmu_ismhat_enter(sfmmu_t *, int); 592 static void sfmmu_ismhat_exit(sfmmu_t *, int); 593 594 /* 595 * Array of mutexes protecting a page's mapping list and p_nrm field. 
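 *
 * (A brief worked example of the RSS-based sizing macros above, purely
 * illustrative and assuming TSB_ENTRIES(TSB_MIN_SZCODE) == 512: then
 * tsb_rss_factor is 512 * 0.75 = 384, and a TSB with size code 2 is
 * outgrown once the resident mapping count passes
 * SFMMU_RSS_TSBSIZE(2) == 384 << 2 == 1536 TTEs, i.e. 75% of the 2048
 * entries such a TSB holds.)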
596 * 597 * The hash function looks complicated, but is made up so that: 598 * 599 * "pp" not shifted, so adjacent pp values will hash to different cache lines 600 * (8 byte alignment * 8 bytes/mutes == 64 byte coherency subblock) 601 * 602 * "pp" >> mml_shift, incorporates more source bits into the hash result 603 * 604 * "& (mml_table_size - 1), should be faster than using remainder "%" 605 * 606 * Hopefully, mml_table, mml_table_size and mml_shift are all in the same 607 * cacheline, since they get declared next to each other below. We'll trust 608 * ld not to do something random. 609 */ 610 #ifdef DEBUG 611 int mlist_hash_debug = 0; 612 #define MLIST_HASH(pp) (mlist_hash_debug ? &mml_table[0] : \ 613 &mml_table[((uintptr_t)(pp) + \ 614 ((uintptr_t)(pp) >> mml_shift)) & (mml_table_sz - 1)]) 615 #else /* !DEBUG */ 616 #define MLIST_HASH(pp) &mml_table[ \ 617 ((uintptr_t)(pp) + ((uintptr_t)(pp) >> mml_shift)) & (mml_table_sz - 1)] 618 #endif /* !DEBUG */ 619 620 kmutex_t *mml_table; 621 uint_t mml_table_sz; /* must be a power of 2 */ 622 uint_t mml_shift; /* log2(mml_table_sz) + 3 for align */ 623 624 /* 625 * kpm_page lock hash. 626 * All slots should be used equally and 2 adjacent kpm_page_t's 627 * shouldn't have their mutexes in the same cache line. 628 */ 629 #ifdef DEBUG 630 int kpmp_hash_debug = 0; 631 #define KPMP_HASH(kpp) (kpmp_hash_debug ? &kpmp_table[0] : &kpmp_table[ \ 632 ((uintptr_t)(kpp) + ((uintptr_t)(kpp) >> kpmp_shift)) \ 633 & (kpmp_table_sz - 1)]) 634 #else /* !DEBUG */ 635 #define KPMP_HASH(kpp) &kpmp_table[ \ 636 ((uintptr_t)(kpp) + ((uintptr_t)(kpp) >> kpmp_shift)) \ 637 & (kpmp_table_sz - 1)] 638 #endif /* DEBUG */ 639 640 kpm_hlk_t *kpmp_table; 641 uint_t kpmp_table_sz; /* must be a power of 2 */ 642 uchar_t kpmp_shift; 643 644 #ifdef DEBUG 645 #define KPMP_SHASH(kpp) (kpmp_hash_debug ? &kpmp_stable[0] : &kpmp_stable[ \ 646 (((uintptr_t)(kpp) << kpmp_shift) + (uintptr_t)(kpp)) \ 647 & (kpmp_stable_sz - 1)]) 648 #else /* !DEBUG */ 649 #define KPMP_SHASH(kpp) &kpmp_stable[ \ 650 (((uintptr_t)(kpp) << kpmp_shift) + (uintptr_t)(kpp)) \ 651 & (kpmp_stable_sz - 1)] 652 #endif /* DEBUG */ 653 654 kpm_shlk_t *kpmp_stable; 655 uint_t kpmp_stable_sz; /* must be a power of 2 */ 656 657 /* 658 * SPL_HASH was improved to avoid false cache line sharing 659 */ 660 #define SPL_TABLE_SIZE 128 661 #define SPL_MASK (SPL_TABLE_SIZE - 1) 662 #define SPL_SHIFT 7 /* log2(SPL_TABLE_SIZE) */ 663 664 #define SPL_INDEX(pp) \ 665 ((((uintptr_t)(pp) >> SPL_SHIFT) ^ \ 666 ((uintptr_t)(pp) >> (SPL_SHIFT << 1))) & \ 667 (SPL_TABLE_SIZE - 1)) 668 669 #define SPL_HASH(pp) \ 670 (&sfmmu_page_lock[SPL_INDEX(pp) & SPL_MASK].pad_mutex) 671 672 static pad_mutex_t sfmmu_page_lock[SPL_TABLE_SIZE]; 673 674 675 /* 676 * hat_unload_callback() will group together callbacks in order 677 * to avoid xt_sync() calls. This is the maximum size of the group. 678 */ 679 #define MAX_CB_ADDR 32 680 681 #ifdef DEBUG 682 683 /* 684 * Debugging trace ring buffer for stolen and freed ctxs. The 685 * stolen_ctxs[] array is protected by the ctx_trace_mutex. 
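 *
 * (Aside, continuing the MLIST_HASH discussion above with a stand-alone
 * sketch; mml_table_sz and mml_shift are sized at startup, so the parameters
 * here are only stand-ins:
 *
 *	kmutex_t *
 *	mlist_hash_sketch(uintptr_t pp, uint_t tbl_sz, uint_t shift)
 *	{
 *		return (&mml_table[(pp + (pp >> shift)) & (tbl_sz - 1)]);
 *	}
 *
 * Adding the unshifted pointer keeps adjacent page_t's on different mutexes,
 * while the shifted term folds higher address bits into the index so pages
 * from far-apart regions do not all collide on the same buckets.)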
686 */ 687 struct ctx_trace stolen_ctxs[TRSIZE]; 688 struct ctx_trace *ctx_trace_first = &stolen_ctxs[0]; 689 struct ctx_trace *ctx_trace_last = &stolen_ctxs[TRSIZE-1]; 690 struct ctx_trace *ctx_trace_ptr = &stolen_ctxs[0]; 691 kmutex_t ctx_trace_mutex; 692 uint_t num_ctx_stolen = 0; 693 694 int ism_debug = 0; 695 696 #endif /* DEBUG */ 697 698 tte_t hw_tte; 699 static ulong_t sfmmu_dmr_maxbit = DMR_MAXBIT; 700 701 /* 702 * kpm virtual address to physical address 703 */ 704 #define SFMMU_KPM_VTOP(vaddr, paddr) { \ 705 uintptr_t r, v; \ 706 \ 707 r = ((vaddr) - kpm_vbase) >> (uintptr_t)kpm_size_shift; \ 708 (paddr) = (vaddr) - kpm_vbase; \ 709 if (r != 0) { \ 710 v = ((uintptr_t)(vaddr) >> MMU_PAGESHIFT) & \ 711 vac_colors_mask; \ 712 (paddr) -= r << kpm_size_shift; \ 713 if (r > v) \ 714 (paddr) += (r - v) << MMU_PAGESHIFT; \ 715 else \ 716 (paddr) -= r << MMU_PAGESHIFT; \ 717 } \ 718 } 719 720 /* 721 * Wrapper for vmem_xalloc since vmem_create only allows limited 722 * parameters for vm_source_alloc functions. This function allows us 723 * to specify alignment consistent with the size of the object being 724 * allocated. 725 */ 726 static void * 727 sfmmu_vmem_xalloc_aligned_wrapper(vmem_t *vmp, size_t size, int vmflag) 728 { 729 return (vmem_xalloc(vmp, size, size, 0, 0, NULL, NULL, vmflag)); 730 } 731 732 /* Common code for setting tsb_alloc_hiwater. */ 733 #define SFMMU_SET_TSB_ALLOC_HIWATER(pages) tsb_alloc_hiwater = \ 734 ptob(pages) / tsb_alloc_hiwater_factor 735 736 /* 737 * Set tsb_max_growsize to allow at most all of physical memory to be mapped by 738 * a single TSB. physmem is the number of physical pages so we need physmem 8K 739 * TTEs to represent all those physical pages. We round this up by using 740 * 1<<highbit(). To figure out which size code to use, remember that the size 741 * code is just an amount to shift the smallest TSB size to get the size of 742 * this TSB. So we subtract that size, TSB_START_SIZE, from highbit() (or 743 * highbit() - 1) to get the size code for the smallest TSB that can represent 744 * all of physical memory, while erring on the side of too much. 745 * 746 * If the computed size code is less than the current tsb_max_growsize, we set 747 * tsb_max_growsize to the computed size code. In the case where the computed 748 * size code is greater than tsb_max_growsize, we have these restrictions that 749 * apply to increasing tsb_max_growsize: 750 * 1) TSBs can't grow larger than the TSB slab size 751 * 2) TSBs can't grow larger than UTSB_MAX_SZCODE. 752 */ 753 #define SFMMU_SET_TSB_MAX_GROWSIZE(pages) { \ 754 int i, szc; \ 755 \ 756 i = highbit(pages); \ 757 if ((1 << (i - 1)) == (pages)) \ 758 i--; /* 2^n case, round down */ \ 759 szc = i - TSB_START_SIZE; \ 760 if (szc < tsb_max_growsize) \ 761 tsb_max_growsize = szc; \ 762 else if ((szc > tsb_max_growsize) && \ 763 (szc <= tsb_slab_shift - (TSB_START_SIZE + TSB_ENTRY_SHIFT))) \ 764 tsb_max_growsize = MIN(szc, UTSB_MAX_SZCODE); \ 765 } 766 767 /* 768 * Given a pointer to an sfmmu and a TTE size code, return a pointer to the 769 * tsb_info which handles that TTE size. 770 */ 771 #define SFMMU_GET_TSBINFO(tsbinfop, sfmmup, tte_szc) \ 772 (tsbinfop) = (sfmmup)->sfmmu_tsb; \ 773 ASSERT(sfmmu_hat_lock_held(sfmmup)); \ 774 if ((tte_szc) >= TTE4M) \ 775 (tsbinfop) = (tsbinfop)->tsb_next; 776 777 /* 778 * Return the number of mappings present in the HAT 779 * for a particular process and page size. 780 */ 781 #define SFMMU_TTE_CNT(sfmmup, szc) \ 782 (sfmmup)->sfmmu_iblk? 
	(sfmmup)->sfmmu_ismttecnt[(szc)] +				\
	(sfmmup)->sfmmu_ttecnt[(szc)] :					\
	(sfmmup)->sfmmu_ttecnt[(szc)];

/*
 * Macro to use to unload entries from the TSB.
 * It has knowledge of which page sizes get replicated in the TSB
 * and will call the appropriate unload routine for the appropriate size.
 */
#define	SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp)				\
{									\
	int ttesz = get_hblk_ttesz(hmeblkp);				\
	if (ttesz == TTE8K || ttesz == TTE4M) {				\
		sfmmu_unload_tsb(sfmmup, addr, ttesz);			\
	} else {							\
		caddr_t sva = (caddr_t)get_hblk_base(hmeblkp);		\
		caddr_t eva = sva + get_hblk_span(hmeblkp);		\
		ASSERT(addr >= sva && addr < eva);			\
		sfmmu_unload_tsb_range(sfmmup, sva, eva, ttesz);	\
	}								\
}


/* Update tsb_alloc_hiwater after memory is configured. */
/*ARGSUSED*/
static void
sfmmu_update_tsb_post_add(void *arg, pgcnt_t delta_pages)
{
	/* Assumes physmem has already been updated. */
	SFMMU_SET_TSB_ALLOC_HIWATER(physmem);
	SFMMU_SET_TSB_MAX_GROWSIZE(physmem);
}

/*
 * Update tsb_alloc_hiwater before memory is deleted.  We'll do nothing here
 * and update tsb_alloc_hiwater and tsb_max_growsize after the memory is
 * deleted.
 */
/*ARGSUSED*/
static int
sfmmu_update_tsb_pre_del(void *arg, pgcnt_t delta_pages)
{
	return (0);
}

/* Update tsb_alloc_hiwater after memory fails to be unconfigured. */
/*ARGSUSED*/
static void
sfmmu_update_tsb_post_del(void *arg, pgcnt_t delta_pages, int cancelled)
{
	/*
	 * Whether the delete was cancelled or not, just go ahead and update
	 * tsb_alloc_hiwater and tsb_max_growsize.
	 */
	SFMMU_SET_TSB_ALLOC_HIWATER(physmem);
	SFMMU_SET_TSB_MAX_GROWSIZE(physmem);
}

static kphysm_setup_vector_t sfmmu_update_tsb_vec = {
	KPHYSM_SETUP_VECTOR_VERSION,	/* version */
	sfmmu_update_tsb_post_add,	/* post_add */
	sfmmu_update_tsb_pre_del,	/* pre_del */
	sfmmu_update_tsb_post_del	/* post_del */
};


/*
 * HME_BLK HASH PRIMITIVES
 */

/*
 * Enter a hme on the mapping list for page pp.
 * When large pages are more prevalent in the system we might want to
 * keep the mapping list in ascending order by the hment size. For now,
 * small pages are more frequent, so don't slow it down.
 */
#define	HME_ADD(hme, pp)					\
{								\
	ASSERT(sfmmu_mlist_held(pp));				\
								\
	hme->hme_prev = NULL;					\
	hme->hme_next = pp->p_mapping;				\
	hme->hme_page = pp;					\
	if (pp->p_mapping) {					\
		((struct sf_hment *)(pp->p_mapping))->hme_prev = hme;\
		ASSERT(pp->p_share > 0);			\
	} else	{						\
		/* EMPTY */					\
		ASSERT(pp->p_share == 0);			\
	}							\
	pp->p_mapping = hme;					\
	pp->p_share++;						\
}

/*
 * Remove a hme from the mapping list for page pp.
 * If we are unmapping a large translation, we need to make sure that the
 * change is reflected in the corresponding bit of the p_index field.
881 */ 882 #define HME_SUB(hme, pp) \ 883 { \ 884 ASSERT(sfmmu_mlist_held(pp)); \ 885 ASSERT(hme->hme_page == pp || IS_PAHME(hme)); \ 886 \ 887 if (pp->p_mapping == NULL) { \ 888 panic("hme_remove - no mappings"); \ 889 } \ 890 \ 891 membar_stst(); /* ensure previous stores finish */ \ 892 \ 893 ASSERT(pp->p_share > 0); \ 894 pp->p_share--; \ 895 \ 896 if (hme->hme_prev) { \ 897 ASSERT(pp->p_mapping != hme); \ 898 ASSERT(hme->hme_prev->hme_page == pp || \ 899 IS_PAHME(hme->hme_prev)); \ 900 hme->hme_prev->hme_next = hme->hme_next; \ 901 } else { \ 902 ASSERT(pp->p_mapping == hme); \ 903 pp->p_mapping = hme->hme_next; \ 904 ASSERT((pp->p_mapping == NULL) ? \ 905 (pp->p_share == 0) : 1); \ 906 } \ 907 \ 908 if (hme->hme_next) { \ 909 ASSERT(hme->hme_next->hme_page == pp || \ 910 IS_PAHME(hme->hme_next)); \ 911 hme->hme_next->hme_prev = hme->hme_prev; \ 912 } \ 913 \ 914 /* zero out the entry */ \ 915 hme->hme_next = NULL; \ 916 hme->hme_prev = NULL; \ 917 hme->hme_page = NULL; \ 918 \ 919 if (hme_size(hme) > TTE8K) { \ 920 /* remove mappings for remainder of large pg */ \ 921 sfmmu_rm_large_mappings(pp, hme_size(hme)); \ 922 } \ 923 } 924 925 /* 926 * This function returns the hment given the hme_blk and a vaddr. 927 * It assumes addr has already been checked to belong to hme_blk's 928 * range. 929 */ 930 #define HBLKTOHME(hment, hmeblkp, addr) \ 931 { \ 932 int index; \ 933 HBLKTOHME_IDX(hment, hmeblkp, addr, index) \ 934 } 935 936 /* 937 * Version of HBLKTOHME that also returns the index in hmeblkp 938 * of the hment. 939 */ 940 #define HBLKTOHME_IDX(hment, hmeblkp, addr, idx) \ 941 { \ 942 ASSERT(in_hblk_range((hmeblkp), (addr))); \ 943 \ 944 if (get_hblk_ttesz(hmeblkp) == TTE8K) { \ 945 idx = (((uintptr_t)(addr) >> MMU_PAGESHIFT) & (NHMENTS-1)); \ 946 } else \ 947 idx = 0; \ 948 \ 949 (hment) = &(hmeblkp)->hblk_hme[idx]; \ 950 } 951 952 /* 953 * Disable any page sizes not supported by the CPU 954 */ 955 void 956 hat_init_pagesizes() 957 { 958 int i; 959 960 mmu_exported_page_sizes = 0; 961 for (i = TTE8K; i < max_mmu_page_sizes; i++) { 962 extern int disable_text_largepages; 963 extern int disable_initdata_largepages; 964 965 szc_2_userszc[i] = (uint_t)-1; 966 userszc_2_szc[i] = (uint_t)-1; 967 968 if ((mmu_exported_pagesize_mask & (1 << i)) == 0) { 969 disable_large_pages |= (1 << i); 970 disable_ism_large_pages |= (1 << i); 971 disable_text_largepages |= (1 << i); 972 disable_initdata_largepages |= (1 << i); 973 } else { 974 szc_2_userszc[i] = mmu_exported_page_sizes; 975 userszc_2_szc[mmu_exported_page_sizes] = i; 976 mmu_exported_page_sizes++; 977 } 978 } 979 980 disable_auto_large_pages = disable_large_pages; 981 982 /* 983 * Initialize mmu-specific large page sizes. 984 */ 985 if ((mmu_page_sizes == max_mmu_page_sizes) && 986 (&mmu_large_pages_disabled)) { 987 disable_large_pages |= mmu_large_pages_disabled(HAT_LOAD); 988 disable_ism_large_pages |= 989 mmu_large_pages_disabled(HAT_LOAD_SHARE); 990 disable_auto_large_pages |= 991 mmu_large_pages_disabled(HAT_LOAD_AUTOLPG); 992 } 993 994 } 995 996 /* 997 * Initialize the hardware address translation structures. 
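 *
 * (Worked example for hat_init_pagesizes() above, illustrative only: suppose
 * mmu_exported_pagesize_mask exports just the 8K, 64K and 4M sizes, i.e.
 * bits 0, 1 and 3.  The loop then leaves
 *
 *	szc_2_userszc[] = { 0, 1, (uint_t)-1, 2, ... }
 *	userszc_2_szc[] = { TTE8K, TTE64K, TTE4M, ... }
 *	mmu_exported_page_sizes = 3
 *
 * and ORs (1 << TTE512K) -- and likewise the 32M/256M bits, if those sizes
 * exist on the platform -- into disable_large_pages,
 * disable_ism_large_pages, disable_text_largepages and
 * disable_initdata_largepages, so the missing sizes are never used.)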
998 */ 999 void 1000 hat_init(void) 1001 { 1002 struct ctx *ctx; 1003 struct ctx *cur_ctx = NULL; 1004 int i; 1005 1006 hat_lock_init(); 1007 hat_kstat_init(); 1008 1009 /* 1010 * Hardware-only bits in a TTE 1011 */ 1012 MAKE_TTE_MASK(&hw_tte); 1013 1014 hat_init_pagesizes(); 1015 1016 /* Initialize the hash locks */ 1017 for (i = 0; i < khmehash_num; i++) { 1018 mutex_init(&khme_hash[i].hmehash_mutex, NULL, 1019 MUTEX_DEFAULT, NULL); 1020 } 1021 for (i = 0; i < uhmehash_num; i++) { 1022 mutex_init(&uhme_hash[i].hmehash_mutex, NULL, 1023 MUTEX_DEFAULT, NULL); 1024 } 1025 khmehash_num--; /* make sure counter starts from 0 */ 1026 uhmehash_num--; /* make sure counter starts from 0 */ 1027 1028 /* 1029 * Initialize ctx structures and list lock. 1030 * We keep two lists of ctxs. The "free" list contains contexts 1031 * ready to use. The "dirty" list contains contexts that are OK 1032 * to use after flushing the TLBs of any stale mappings. 1033 */ 1034 mutex_init(&ctx_list_lock, NULL, MUTEX_DEFAULT, NULL); 1035 kctx = &ctxs[KCONTEXT]; 1036 ctx = &ctxs[NUM_LOCKED_CTXS]; 1037 ctxhand = ctxfree = ctx; /* head of free list */ 1038 ctxdirty = NULL; 1039 for (i = NUM_LOCKED_CTXS; i < nctxs; i++) { 1040 cur_ctx = &ctxs[i]; 1041 cur_ctx->ctx_flags = CTX_FREE_FLAG; 1042 cur_ctx->ctx_free = &ctxs[i + 1]; 1043 } 1044 cur_ctx->ctx_free = NULL; /* tail of free list */ 1045 1046 /* 1047 * Intialize ism mapping list lock. 1048 */ 1049 mutex_init(&ism_mlist_lock, NULL, MUTEX_DEFAULT, NULL); 1050 1051 sfmmuid_cache = kmem_cache_create("sfmmuid_cache", sizeof (sfmmu_t), 1052 0, sfmmu_idcache_constructor, sfmmu_idcache_destructor, 1053 NULL, NULL, NULL, 0); 1054 1055 sfmmu_tsbinfo_cache = kmem_cache_create("sfmmu_tsbinfo_cache", 1056 sizeof (struct tsb_info), 0, NULL, NULL, NULL, NULL, NULL, 0); 1057 1058 /* 1059 * Since we only use the tsb8k cache to "borrow" pages for TSBs 1060 * from the heap when low on memory or when TSB_FORCEALLOC is 1061 * specified, don't use magazines to cache them--we want to return 1062 * them to the system as quickly as possible. 1063 */ 1064 sfmmu_tsb8k_cache = kmem_cache_create("sfmmu_tsb8k_cache", 1065 MMU_PAGESIZE, MMU_PAGESIZE, NULL, NULL, NULL, NULL, 1066 static_arena, KMC_NOMAGAZINE); 1067 1068 /* 1069 * Set tsb_alloc_hiwater to 1/tsb_alloc_hiwater_factor of physical 1070 * memory, which corresponds to the old static reserve for TSBs. 1071 * tsb_alloc_hiwater_factor defaults to 32. This caps the amount of 1072 * memory we'll allocate for TSB slabs; beyond this point TSB 1073 * allocations will be taken from the kernel heap (via 1074 * sfmmu_tsb8k_cache) and will be throttled as would any other kmem 1075 * consumer. 1076 */ 1077 if (tsb_alloc_hiwater_factor == 0) { 1078 tsb_alloc_hiwater_factor = TSB_ALLOC_HIWATER_FACTOR_DEFAULT; 1079 } 1080 SFMMU_SET_TSB_ALLOC_HIWATER(physmem); 1081 1082 /* Set tsb_max_growsize. */ 1083 SFMMU_SET_TSB_MAX_GROWSIZE(physmem); 1084 1085 /* 1086 * On smaller memory systems, allocate TSB memory in 512K chunks 1087 * instead of the default 4M slab size. The trap handlers need to 1088 * be patched with the final slab shift since they need to be able 1089 * to construct the TSB pointer at runtime. 
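 *
 * (Numeric aside for the hiwater setting above, illustrative only: with the
 * default tsb_alloc_hiwater_factor of 32 and, say, 4GB of physical memory,
 * SFMMU_SET_TSB_ALLOC_HIWATER(physmem) caps TSB slab allocations at
 * ptob(physmem) / 32 = 128MB; anything beyond that falls back to the
 * sfmmu_tsb8k_cache heap path described earlier.)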
1090 */ 1091 if ((tsb_max_growsize <= TSB_512K_SZCODE) && 1092 !(disable_large_pages & (1 << TTE512K))) { 1093 tsb_slab_size = MMU_PAGESIZE512K; 1094 tsb_slab_shift = MMU_PAGESHIFT512K; 1095 tsb_slab_ttesz = TTE512K; 1096 tsb_slab_mask = 0x3f; /* 512K page alignment for 8K pfn */ 1097 } 1098 1099 /* 1100 * Set up memory callback to update tsb_alloc_hiwater and 1101 * tsb_max_growsize. 1102 */ 1103 i = kphysm_setup_func_register(&sfmmu_update_tsb_vec, (void *) 0); 1104 ASSERT(i == 0); 1105 1106 /* 1107 * kmem_tsb_arena is the source from which large TSB slabs are 1108 * drawn. The quantum of this arena corresponds to the largest 1109 * TSB size we can dynamically allocate for user processes. 1110 * Currently it must also be a supported page size since we 1111 * use exactly one translation entry to map each slab page. 1112 * 1113 * The per-lgroup kmem_tsb_default_arena arenas are the arenas from 1114 * which most TSBs are allocated. Since most TSB allocations are 1115 * typically 8K we have a kmem cache we stack on top of each 1116 * kmem_tsb_default_arena to speed up those allocations. 1117 * 1118 * Note the two-level scheme of arenas is required only 1119 * because vmem_create doesn't allow us to specify alignment 1120 * requirements. If this ever changes the code could be 1121 * simplified to use only one level of arenas. 1122 */ 1123 kmem_tsb_arena = vmem_create("kmem_tsb", NULL, 0, tsb_slab_size, 1124 sfmmu_vmem_xalloc_aligned_wrapper, vmem_xfree, heap_arena, 1125 0, VM_SLEEP); 1126 1127 if (tsb_lgrp_affinity) { 1128 char s[50]; 1129 for (i = 0; i < NLGRPS_MAX; i++) { 1130 (void) sprintf(s, "kmem_tsb_lgrp%d", i); 1131 kmem_tsb_default_arena[i] = 1132 vmem_create(s, NULL, 0, PAGESIZE, 1133 sfmmu_tsb_segkmem_alloc, sfmmu_tsb_segkmem_free, 1134 kmem_tsb_arena, 0, VM_SLEEP | VM_BESTFIT); 1135 (void) sprintf(s, "sfmmu_tsb_lgrp%d_cache", i); 1136 sfmmu_tsb_cache[i] = kmem_cache_create(s, PAGESIZE, 1137 PAGESIZE, NULL, NULL, NULL, NULL, 1138 kmem_tsb_default_arena[i], 0); 1139 } 1140 } else { 1141 kmem_tsb_default_arena[0] = vmem_create("kmem_tsb_default", 1142 NULL, 0, PAGESIZE, sfmmu_tsb_segkmem_alloc, 1143 sfmmu_tsb_segkmem_free, kmem_tsb_arena, 0, 1144 VM_SLEEP | VM_BESTFIT); 1145 1146 sfmmu_tsb_cache[0] = kmem_cache_create("sfmmu_tsb_cache", 1147 PAGESIZE, PAGESIZE, NULL, NULL, NULL, NULL, 1148 kmem_tsb_default_arena[0], 0); 1149 } 1150 1151 sfmmu8_cache = kmem_cache_create("sfmmu8_cache", HME8BLK_SZ, 1152 HMEBLK_ALIGN, sfmmu_hblkcache_constructor, 1153 sfmmu_hblkcache_destructor, 1154 sfmmu_hblkcache_reclaim, (void *)HME8BLK_SZ, 1155 hat_memload_arena, KMC_NOHASH); 1156 1157 hat_memload1_arena = vmem_create("hat_memload1", NULL, 0, PAGESIZE, 1158 segkmem_alloc_permanent, segkmem_free, heap_arena, 0, VM_SLEEP); 1159 1160 sfmmu1_cache = kmem_cache_create("sfmmu1_cache", HME1BLK_SZ, 1161 HMEBLK_ALIGN, sfmmu_hblkcache_constructor, 1162 sfmmu_hblkcache_destructor, 1163 NULL, (void *)HME1BLK_SZ, 1164 hat_memload1_arena, KMC_NOHASH); 1165 1166 pa_hment_cache = kmem_cache_create("pa_hment_cache", PAHME_SZ, 1167 0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH); 1168 1169 ism_blk_cache = kmem_cache_create("ism_blk_cache", 1170 sizeof (ism_blk_t), ecache_alignsize, NULL, NULL, 1171 NULL, NULL, static_arena, KMC_NOHASH); 1172 1173 ism_ment_cache = kmem_cache_create("ism_ment_cache", 1174 sizeof (ism_ment_t), 0, NULL, NULL, 1175 NULL, NULL, NULL, 0); 1176 1177 /* 1178 * We grab the first hat for the kernel, 1179 */ 1180 AS_LOCK_ENTER(&kas, &kas.a_lock, RW_WRITER); 1181 kas.a_hat = 
hat_alloc(&kas); 1182 AS_LOCK_EXIT(&kas, &kas.a_lock); 1183 1184 /* 1185 * Initialize hblk_reserve. 1186 */ 1187 ((struct hme_blk *)hblk_reserve)->hblk_nextpa = 1188 va_to_pa((caddr_t)hblk_reserve); 1189 1190 #ifndef sun4v 1191 /* 1192 * Reserve some kernel virtual address space for the locked TTEs 1193 * that allow us to probe the TSB from TL>0. 1194 */ 1195 utsb_vabase = vmem_xalloc(heap_arena, tsb_slab_size, tsb_slab_size, 1196 0, 0, NULL, NULL, VM_SLEEP); 1197 utsb4m_vabase = vmem_xalloc(heap_arena, tsb_slab_size, tsb_slab_size, 1198 0, 0, NULL, NULL, VM_SLEEP); 1199 #endif 1200 1201 /* 1202 * The big page VAC handling code assumes VAC 1203 * will not be bigger than the smallest big 1204 * page- which is 64K. 1205 */ 1206 if (TTEPAGES(TTE64K) < CACHE_NUM_COLOR) { 1207 cmn_err(CE_PANIC, "VAC too big!"); 1208 } 1209 1210 (void) xhat_init(); 1211 1212 uhme_hash_pa = va_to_pa(uhme_hash); 1213 khme_hash_pa = va_to_pa(khme_hash); 1214 1215 /* 1216 * Initialize relocation locks. kpr_suspendlock is held 1217 * at PIL_MAX to prevent interrupts from pinning the holder 1218 * of a suspended TTE which may access it leading to a 1219 * deadlock condition. 1220 */ 1221 mutex_init(&kpr_mutex, NULL, MUTEX_DEFAULT, NULL); 1222 mutex_init(&kpr_suspendlock, NULL, MUTEX_SPIN, (void *)PIL_MAX); 1223 } 1224 1225 /* 1226 * Initialize locking for the hat layer, called early during boot. 1227 */ 1228 static void 1229 hat_lock_init() 1230 { 1231 int i; 1232 struct ctx *ctx; 1233 1234 /* 1235 * initialize the array of mutexes protecting a page's mapping 1236 * list and p_nrm field. 1237 */ 1238 for (i = 0; i < mml_table_sz; i++) 1239 mutex_init(&mml_table[i], NULL, MUTEX_DEFAULT, NULL); 1240 1241 if (kpm_enable) { 1242 for (i = 0; i < kpmp_table_sz; i++) { 1243 mutex_init(&kpmp_table[i].khl_mutex, NULL, 1244 MUTEX_DEFAULT, NULL); 1245 } 1246 } 1247 1248 /* 1249 * Initialize array of mutex locks that protects sfmmu fields and 1250 * TSB lists. 1251 */ 1252 for (i = 0; i < SFMMU_NUM_LOCK; i++) 1253 mutex_init(HATLOCK_MUTEXP(&hat_lock[i]), NULL, MUTEX_DEFAULT, 1254 NULL); 1255 1256 #ifdef DEBUG 1257 mutex_init(&ctx_trace_mutex, NULL, MUTEX_DEFAULT, NULL); 1258 #endif /* DEBUG */ 1259 1260 for (ctx = ctxs, i = 0; i < nctxs; i++, ctx++) { 1261 rw_init(&ctx->ctx_rwlock, NULL, RW_DEFAULT, NULL); 1262 } 1263 } 1264 1265 extern caddr_t kmem64_base, kmem64_end; 1266 1267 #define SFMMU_KERNEL_MAXVA \ 1268 (kmem64_base ? (uintptr_t)kmem64_end : (SYSLIMIT)) 1269 1270 /* 1271 * Allocate a hat structure. 1272 * Called when an address space first uses a hat. 1273 */ 1274 struct hat * 1275 hat_alloc(struct as *as) 1276 { 1277 sfmmu_t *sfmmup; 1278 struct ctx *ctx; 1279 int i; 1280 extern uint_t get_color_start(struct as *); 1281 1282 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 1283 sfmmup = kmem_cache_alloc(sfmmuid_cache, KM_SLEEP); 1284 sfmmup->sfmmu_as = as; 1285 sfmmup->sfmmu_flags = 0; 1286 1287 if (as == &kas) { 1288 ctx = kctx; 1289 ksfmmup = sfmmup; 1290 sfmmup->sfmmu_cnum = ctxtoctxnum(ctx); 1291 ASSERT(sfmmup->sfmmu_cnum == KCONTEXT); 1292 sfmmup->sfmmu_cext = 0; 1293 ctx->ctx_sfmmu = sfmmup; 1294 ctx->ctx_flags = 0; 1295 sfmmup->sfmmu_clrstart = 0; 1296 sfmmup->sfmmu_tsb = NULL; 1297 /* 1298 * hat_kern_setup() will call sfmmu_init_ktsbinfo() 1299 * to setup tsb_info for ksfmmup. 1300 */ 1301 } else { 1302 1303 /* 1304 * Just set to invalid ctx. When it faults, it will 1305 * get a valid ctx. 
This would avoid the situation 1306 * where we get a ctx, but it gets stolen and then 1307 * we fault when we try to run and so have to get 1308 * another ctx. 1309 */ 1310 sfmmup->sfmmu_cnum = INVALID_CONTEXT; 1311 sfmmup->sfmmu_cext = 0; 1312 /* initialize original physical page coloring bin */ 1313 sfmmup->sfmmu_clrstart = get_color_start(as); 1314 #ifdef DEBUG 1315 if (tsb_random_size) { 1316 uint32_t randval = (uint32_t)gettick() >> 4; 1317 int size = randval % (tsb_max_growsize + 1); 1318 1319 /* chose a random tsb size for stress testing */ 1320 (void) sfmmu_tsbinfo_alloc(&sfmmup->sfmmu_tsb, size, 1321 TSB8K|TSB64K|TSB512K, 0, sfmmup); 1322 } else 1323 #endif /* DEBUG */ 1324 (void) sfmmu_tsbinfo_alloc(&sfmmup->sfmmu_tsb, 1325 default_tsb_size, 1326 TSB8K|TSB64K|TSB512K, 0, sfmmup); 1327 sfmmup->sfmmu_flags = HAT_SWAPPED; 1328 ASSERT(sfmmup->sfmmu_tsb != NULL); 1329 } 1330 sfmmu_setup_tsbinfo(sfmmup); 1331 for (i = 0; i < max_mmu_page_sizes; i++) { 1332 sfmmup->sfmmu_ttecnt[i] = 0; 1333 sfmmup->sfmmu_ismttecnt[i] = 0; 1334 sfmmup->sfmmu_pgsz[i] = TTE8K; 1335 } 1336 1337 sfmmup->sfmmu_iblk = NULL; 1338 sfmmup->sfmmu_ismhat = 0; 1339 sfmmup->sfmmu_ismblkpa = (uint64_t)-1; 1340 if (sfmmup == ksfmmup) { 1341 CPUSET_ALL(sfmmup->sfmmu_cpusran); 1342 } else { 1343 CPUSET_ZERO(sfmmup->sfmmu_cpusran); 1344 } 1345 sfmmup->sfmmu_free = 0; 1346 sfmmup->sfmmu_rmstat = 0; 1347 sfmmup->sfmmu_clrbin = sfmmup->sfmmu_clrstart; 1348 sfmmup->sfmmu_xhat_provider = NULL; 1349 cv_init(&sfmmup->sfmmu_tsb_cv, NULL, CV_DEFAULT, NULL); 1350 return (sfmmup); 1351 } 1352 1353 /* 1354 * Hat_setup, makes an address space context the current active one. 1355 * In sfmmu this translates to setting the secondary context with the 1356 * corresponding context. 1357 */ 1358 void 1359 hat_setup(struct hat *sfmmup, int allocflag) 1360 { 1361 struct ctx *ctx; 1362 uint_t ctx_num; 1363 hatlock_t *hatlockp; 1364 1365 /* Init needs some special treatment. */ 1366 if (allocflag == HAT_INIT) { 1367 /* 1368 * Make sure that we have 1369 * 1. a TSB 1370 * 2. a valid ctx that doesn't get stolen after this point. 1371 */ 1372 hatlockp = sfmmu_hat_enter(sfmmup); 1373 1374 /* 1375 * Swap in the TSB. hat_init() allocates tsbinfos without 1376 * TSBs, but we need one for init, since the kernel does some 1377 * special things to set up its stack and needs the TSB to 1378 * resolve page faults. 1379 */ 1380 sfmmu_tsb_swapin(sfmmup, hatlockp); 1381 1382 sfmmu_disallow_ctx_steal(sfmmup); 1383 1384 kpreempt_disable(); 1385 1386 ctx = sfmmutoctx(sfmmup); 1387 CPUSET_ADD(sfmmup->sfmmu_cpusran, CPU->cpu_id); 1388 ctx_num = ctxtoctxnum(ctx); 1389 ASSERT(sfmmup == ctx->ctx_sfmmu); 1390 ASSERT(ctx_num >= NUM_LOCKED_CTXS); 1391 sfmmu_setctx_sec(ctx_num); 1392 sfmmu_load_mmustate(sfmmup); 1393 1394 kpreempt_enable(); 1395 1396 /* 1397 * Allow ctx to be stolen. 1398 */ 1399 sfmmu_allow_ctx_steal(sfmmup); 1400 sfmmu_hat_exit(hatlockp); 1401 } else { 1402 ASSERT(allocflag == HAT_ALLOC); 1403 1404 hatlockp = sfmmu_hat_enter(sfmmup); 1405 kpreempt_disable(); 1406 1407 CPUSET_ADD(sfmmup->sfmmu_cpusran, CPU->cpu_id); 1408 sfmmu_setctx_sec(INVALID_CONTEXT); 1409 sfmmu_clear_utsbinfo(); 1410 1411 kpreempt_enable(); 1412 sfmmu_hat_exit(hatlockp); 1413 } 1414 } 1415 1416 /* 1417 * Free all the translation resources for the specified address space. 1418 * Called from as_free when an address space is being destroyed. 
1419 */ 1420 void 1421 hat_free_start(struct hat *sfmmup) 1422 { 1423 ASSERT(AS_WRITE_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 1424 ASSERT(sfmmup != ksfmmup); 1425 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 1426 1427 sfmmup->sfmmu_free = 1; 1428 } 1429 1430 void 1431 hat_free_end(struct hat *sfmmup) 1432 { 1433 int i; 1434 1435 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 1436 if (sfmmup->sfmmu_ismhat) { 1437 for (i = 0; i < mmu_page_sizes; i++) { 1438 sfmmup->sfmmu_ttecnt[i] = 0; 1439 sfmmup->sfmmu_ismttecnt[i] = 0; 1440 } 1441 } else { 1442 /* EMPTY */ 1443 ASSERT(sfmmup->sfmmu_ttecnt[TTE8K] == 0); 1444 ASSERT(sfmmup->sfmmu_ttecnt[TTE64K] == 0); 1445 ASSERT(sfmmup->sfmmu_ttecnt[TTE512K] == 0); 1446 ASSERT(sfmmup->sfmmu_ttecnt[TTE4M] == 0); 1447 ASSERT(sfmmup->sfmmu_ttecnt[TTE32M] == 0); 1448 ASSERT(sfmmup->sfmmu_ttecnt[TTE256M] == 0); 1449 } 1450 1451 if (sfmmup->sfmmu_rmstat) { 1452 hat_freestat(sfmmup->sfmmu_as, NULL); 1453 } 1454 if (!delay_tlb_flush) { 1455 sfmmu_tlb_ctx_demap(sfmmup); 1456 xt_sync(sfmmup->sfmmu_cpusran); 1457 } else { 1458 SFMMU_STAT(sf_tlbflush_deferred); 1459 } 1460 sfmmu_free_ctx(sfmmup, sfmmutoctx(sfmmup)); 1461 while (sfmmup->sfmmu_tsb != NULL) { 1462 struct tsb_info *next = sfmmup->sfmmu_tsb->tsb_next; 1463 sfmmu_tsbinfo_free(sfmmup->sfmmu_tsb); 1464 sfmmup->sfmmu_tsb = next; 1465 } 1466 sfmmu_free_sfmmu(sfmmup); 1467 1468 kmem_cache_free(sfmmuid_cache, sfmmup); 1469 } 1470 1471 /* 1472 * Set up any translation structures, for the specified address space, 1473 * that are needed or preferred when the process is being swapped in. 1474 */ 1475 /* ARGSUSED */ 1476 void 1477 hat_swapin(struct hat *hat) 1478 { 1479 ASSERT(hat->sfmmu_xhat_provider == NULL); 1480 } 1481 1482 /* 1483 * Free all of the translation resources, for the specified address space, 1484 * that can be freed while the process is swapped out. Called from as_swapout. 1485 * Also, free up the ctx that this process was using. 1486 */ 1487 void 1488 hat_swapout(struct hat *sfmmup) 1489 { 1490 struct hmehash_bucket *hmebp; 1491 struct hme_blk *hmeblkp; 1492 struct hme_blk *pr_hblk = NULL; 1493 struct hme_blk *nx_hblk; 1494 struct ctx *ctx; 1495 int cnum; 1496 int i; 1497 uint64_t hblkpa, prevpa, nx_pa; 1498 struct hme_blk *list = NULL; 1499 hatlock_t *hatlockp; 1500 struct tsb_info *tsbinfop; 1501 struct free_tsb { 1502 struct free_tsb *next; 1503 struct tsb_info *tsbinfop; 1504 }; /* free list of TSBs */ 1505 struct free_tsb *freelist, *last, *next; 1506 1507 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 1508 SFMMU_STAT(sf_swapout); 1509 1510 /* 1511 * There is no way to go from an as to all its translations in sfmmu. 1512 * Here is one of the times when we take the big hit and traverse 1513 * the hash looking for hme_blks to free up. Not only do we free up 1514 * this as hme_blks but all those that are free. We are obviously 1515 * swapping because we need memory so let's free up as much 1516 * as we can. 1517 * 1518 * Note that we don't flush TLB/TSB here -- it's not necessary 1519 * because: 1520 * 1) we free the ctx we're using and throw away the TSB(s); 1521 * 2) processes aren't runnable while being swapped out. 
1522 */ 1523 ASSERT(sfmmup != KHATID); 1524 for (i = 0; i <= UHMEHASH_SZ; i++) { 1525 hmebp = &uhme_hash[i]; 1526 SFMMU_HASH_LOCK(hmebp); 1527 hmeblkp = hmebp->hmeblkp; 1528 hblkpa = hmebp->hmeh_nextpa; 1529 prevpa = 0; 1530 pr_hblk = NULL; 1531 while (hmeblkp) { 1532 1533 ASSERT(!hmeblkp->hblk_xhat_bit); 1534 1535 if ((hmeblkp->hblk_tag.htag_id == sfmmup) && 1536 !hmeblkp->hblk_shw_bit && !hmeblkp->hblk_lckcnt) { 1537 (void) sfmmu_hblk_unload(sfmmup, hmeblkp, 1538 (caddr_t)get_hblk_base(hmeblkp), 1539 get_hblk_endaddr(hmeblkp), 1540 NULL, HAT_UNLOAD); 1541 } 1542 nx_hblk = hmeblkp->hblk_next; 1543 nx_pa = hmeblkp->hblk_nextpa; 1544 if (!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) { 1545 ASSERT(!hmeblkp->hblk_lckcnt); 1546 sfmmu_hblk_hash_rm(hmebp, hmeblkp, 1547 prevpa, pr_hblk); 1548 sfmmu_hblk_free(hmebp, hmeblkp, hblkpa, &list); 1549 } else { 1550 pr_hblk = hmeblkp; 1551 prevpa = hblkpa; 1552 } 1553 hmeblkp = nx_hblk; 1554 hblkpa = nx_pa; 1555 } 1556 SFMMU_HASH_UNLOCK(hmebp); 1557 } 1558 1559 sfmmu_hblks_list_purge(&list); 1560 1561 /* 1562 * Now free up the ctx so that others can reuse it. 1563 */ 1564 hatlockp = sfmmu_hat_enter(sfmmup); 1565 ctx = sfmmutoctx(sfmmup); 1566 cnum = ctxtoctxnum(ctx); 1567 1568 if (cnum != INVALID_CONTEXT) { 1569 rw_enter(&ctx->ctx_rwlock, RW_WRITER); 1570 if (sfmmup->sfmmu_cnum == cnum) { 1571 sfmmu_reuse_ctx(ctx, sfmmup); 1572 /* 1573 * Put ctx back to the free list. 1574 */ 1575 mutex_enter(&ctx_list_lock); 1576 CTX_SET_FLAGS(ctx, CTX_FREE_FLAG); 1577 ctx->ctx_free = ctxfree; 1578 ctxfree = ctx; 1579 mutex_exit(&ctx_list_lock); 1580 } 1581 rw_exit(&ctx->ctx_rwlock); 1582 } 1583 1584 /* 1585 * Free TSBs, but not tsbinfos, and set SWAPPED flag. 1586 * If TSBs were never swapped in, just return. 1587 * This implies that we don't support partial swapping 1588 * of TSBs -- either all are swapped out, or none are. 1589 * 1590 * We must hold the HAT lock here to prevent racing with another 1591 * thread trying to unmap TTEs from the TSB or running the post- 1592 * relocator after relocating the TSB's memory. Unfortunately, we 1593 * can't free memory while holding the HAT lock or we could 1594 * deadlock, so we build a list of TSBs to be freed after marking 1595 * the tsbinfos as swapped out and free them after dropping the 1596 * lock. 1597 */ 1598 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) { 1599 sfmmu_hat_exit(hatlockp); 1600 return; 1601 } 1602 1603 SFMMU_FLAGS_SET(sfmmup, HAT_SWAPPED); 1604 last = freelist = NULL; 1605 for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL; 1606 tsbinfop = tsbinfop->tsb_next) { 1607 ASSERT((tsbinfop->tsb_flags & TSB_SWAPPED) == 0); 1608 1609 /* 1610 * Cast the TSB into a struct free_tsb and put it on the free 1611 * list. 1612 */ 1613 if (freelist == NULL) { 1614 last = freelist = (struct free_tsb *)tsbinfop->tsb_va; 1615 } else { 1616 last->next = (struct free_tsb *)tsbinfop->tsb_va; 1617 last = last->next; 1618 } 1619 last->next = NULL; 1620 last->tsbinfop = tsbinfop; 1621 tsbinfop->tsb_flags |= TSB_SWAPPED; 1622 /* 1623 * Zero out the TTE to clear the valid bit. 1624 * Note we can't use a value like 0xbad because we want to 1625 * ensure diagnostic bits are NEVER set on TTEs that might 1626 * be loaded. The intent is to catch any invalid access 1627 * to the swapped TSB, such as a thread running with a valid 1628 * context without first calling sfmmu_tsb_swapin() to 1629 * allocate TSB memory. 1630 */ 1631 tsbinfop->tsb_tte.ll = 0; 1632 } 1633 1634 /* Now we can drop the lock and free the TSB memory. 
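 *
 * (The deferred-free pattern used here, sketched generically and purely for
 * illustration -- the TSB memory being retired is itself reused to hold the
 * list links, so nothing needs to be allocated while the HAT lock is held:
 *
 *	struct deferred { struct deferred *next; };
 *	struct deferred *head = NULL;
 *	...while locked, for each buffer buf being retired...
 *		((struct deferred *)buf)->next = head;
 *		head = (struct deferred *)buf;
 *	...after dropping the lock...
 *	while (head != NULL) {
 *		struct deferred *n = head->next;
 *		really_free(head);	hypothetical release routine
 *		head = n;
 *	}
 *
 * so the actual frees happen with no locks held.)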
*/ 1635 sfmmu_hat_exit(hatlockp); 1636 for (; freelist != NULL; freelist = next) { 1637 next = freelist->next; 1638 sfmmu_tsb_free(freelist->tsbinfop); 1639 } 1640 } 1641 1642 /* 1643 * Duplicate the translations of an as into another newas 1644 */ 1645 /* ARGSUSED */ 1646 int 1647 hat_dup(struct hat *hat, struct hat *newhat, caddr_t addr, size_t len, 1648 uint_t flag) 1649 { 1650 ASSERT(hat->sfmmu_xhat_provider == NULL); 1651 ASSERT((flag == 0) || (flag == HAT_DUP_ALL) || (flag == HAT_DUP_COW)); 1652 1653 if (flag == HAT_DUP_COW) { 1654 panic("hat_dup: HAT_DUP_COW not supported"); 1655 } 1656 return (0); 1657 } 1658 1659 /* 1660 * Set up addr to map to page pp with protection prot. 1661 * As an optimization we also load the TSB with the 1662 * corresponding tte but it is no big deal if the tte gets kicked out. 1663 */ 1664 void 1665 hat_memload(struct hat *hat, caddr_t addr, struct page *pp, 1666 uint_t attr, uint_t flags) 1667 { 1668 tte_t tte; 1669 1670 1671 ASSERT(hat != NULL); 1672 ASSERT(PAGE_LOCKED(pp)); 1673 ASSERT(!((uintptr_t)addr & MMU_PAGEOFFSET)); 1674 ASSERT(!(flags & ~SFMMU_LOAD_ALLFLAG)); 1675 ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR)); 1676 1677 if (PP_ISFREE(pp)) { 1678 panic("hat_memload: loading a mapping to free page %p", 1679 (void *)pp); 1680 } 1681 1682 if (hat->sfmmu_xhat_provider) { 1683 XHAT_MEMLOAD(hat, addr, pp, attr, flags); 1684 return; 1685 } 1686 1687 ASSERT((hat == ksfmmup) || 1688 AS_LOCK_HELD(hat->sfmmu_as, &hat->sfmmu_as->a_lock)); 1689 1690 if (flags & ~SFMMU_LOAD_ALLFLAG) 1691 cmn_err(CE_NOTE, "hat_memload: unsupported flags %d", 1692 flags & ~SFMMU_LOAD_ALLFLAG); 1693 1694 if (hat->sfmmu_rmstat) 1695 hat_resvstat(MMU_PAGESIZE, hat->sfmmu_as, addr); 1696 1697 #if defined(SF_ERRATA_57) 1698 if ((hat != ksfmmup) && AS_TYPE_64BIT(hat->sfmmu_as) && 1699 (addr < errata57_limit) && (attr & PROT_EXEC) && 1700 !(flags & HAT_LOAD_SHARE)) { 1701 cmn_err(CE_WARN, "hat_memload: illegal attempt to make user " 1702 " page executable"); 1703 attr &= ~PROT_EXEC; 1704 } 1705 #endif 1706 1707 sfmmu_memtte(&tte, pp->p_pagenum, attr, TTE8K); 1708 (void) sfmmu_tteload_array(hat, &tte, addr, &pp, flags); 1709 1710 /* 1711 * Check TSB and TLB page sizes. 1712 */ 1713 if ((flags & HAT_LOAD_SHARE) == 0) { 1714 sfmmu_check_page_sizes(hat, 1); 1715 } 1716 } 1717 1718 /* 1719 * hat_devload can be called to map real memory (e.g. 1720 * /dev/kmem) and even though hat_devload will determine pf is 1721 * for memory, it will be unable to get a shared lock on the 1722 * page (because someone else has it exclusively) and will 1723 * pass dp = NULL. If tteload doesn't get a non-NULL 1724 * page pointer it can't cache memory. 
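 *
 * (Illustrative call only, not from the original source: a segment driver
 * mapping one locked page of device registers might do something like
 *
 *	hat_devload(kas.a_hat, kvaddr, MMU_PAGESIZE, pfn,
 *	    PROT_READ | PROT_WRITE | HAT_NOSYNC,
 *	    HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
 *
 * where kvaddr and pfn stand in for a previously reserved kernel VA and the
 * device page frame; the uncached attributes for non-memory pfns are added
 * automatically below.)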
1725 */ 1726 void 1727 hat_devload(struct hat *hat, caddr_t addr, size_t len, pfn_t pfn, 1728 uint_t attr, int flags) 1729 { 1730 tte_t tte; 1731 struct page *pp = NULL; 1732 int use_lgpg = 0; 1733 1734 ASSERT(hat != NULL); 1735 1736 if (hat->sfmmu_xhat_provider) { 1737 XHAT_DEVLOAD(hat, addr, len, pfn, attr, flags); 1738 return; 1739 } 1740 1741 ASSERT(!(flags & ~SFMMU_LOAD_ALLFLAG)); 1742 ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR)); 1743 ASSERT((hat == ksfmmup) || 1744 AS_LOCK_HELD(hat->sfmmu_as, &hat->sfmmu_as->a_lock)); 1745 if (len == 0) 1746 panic("hat_devload: zero len"); 1747 if (flags & ~SFMMU_LOAD_ALLFLAG) 1748 cmn_err(CE_NOTE, "hat_devload: unsupported flags %d", 1749 flags & ~SFMMU_LOAD_ALLFLAG); 1750 1751 #if defined(SF_ERRATA_57) 1752 if ((hat != ksfmmup) && AS_TYPE_64BIT(hat->sfmmu_as) && 1753 (addr < errata57_limit) && (attr & PROT_EXEC) && 1754 !(flags & HAT_LOAD_SHARE)) { 1755 cmn_err(CE_WARN, "hat_devload: illegal attempt to make user " 1756 " page executable"); 1757 attr &= ~PROT_EXEC; 1758 } 1759 #endif 1760 1761 /* 1762 * If it's a memory page find its pp 1763 */ 1764 if (!(flags & HAT_LOAD_NOCONSIST) && pf_is_memory(pfn)) { 1765 pp = page_numtopp_nolock(pfn); 1766 if (pp == NULL) { 1767 flags |= HAT_LOAD_NOCONSIST; 1768 } else { 1769 if (PP_ISFREE(pp)) { 1770 panic("hat_memload: loading " 1771 "a mapping to free page %p", 1772 (void *)pp); 1773 } 1774 if (!PAGE_LOCKED(pp) && !PP_ISNORELOC(pp)) { 1775 panic("hat_memload: loading a mapping " 1776 "to unlocked relocatable page %p", 1777 (void *)pp); 1778 } 1779 ASSERT(len == MMU_PAGESIZE); 1780 } 1781 } 1782 1783 if (hat->sfmmu_rmstat) 1784 hat_resvstat(len, hat->sfmmu_as, addr); 1785 1786 if (flags & HAT_LOAD_NOCONSIST) { 1787 attr |= SFMMU_UNCACHEVTTE; 1788 use_lgpg = 1; 1789 } 1790 if (!pf_is_memory(pfn)) { 1791 attr |= SFMMU_UNCACHEPTTE | HAT_NOSYNC; 1792 use_lgpg = 1; 1793 switch (attr & HAT_ORDER_MASK) { 1794 case HAT_STRICTORDER: 1795 case HAT_UNORDERED_OK: 1796 /* 1797 * we set the side effect bit for all non 1798 * memory mappings unless merging is ok 1799 */ 1800 attr |= SFMMU_SIDEFFECT; 1801 break; 1802 case HAT_MERGING_OK: 1803 case HAT_LOADCACHING_OK: 1804 case HAT_STORECACHING_OK: 1805 break; 1806 default: 1807 panic("hat_devload: bad attr"); 1808 break; 1809 } 1810 } 1811 while (len) { 1812 if (!use_lgpg) { 1813 sfmmu_memtte(&tte, pfn, attr, TTE8K); 1814 (void) sfmmu_tteload_array(hat, &tte, addr, &pp, 1815 flags); 1816 len -= MMU_PAGESIZE; 1817 addr += MMU_PAGESIZE; 1818 pfn++; 1819 continue; 1820 } 1821 /* 1822 * try to use large pages, check va/pa alignments 1823 * Note that 32M/256M page sizes are not (yet) supported. 
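 *
 * For example (sizes for illustration only): a request with len = 8M
 * whose virtual address and physical address are both 4M aligned is
 * satisfied below by two TTE4M loads; the same range aligned only to
 * 512K falls back to sixteen TTE512K loads, and an arbitrarily
 * aligned range degenerates to plain 8K loads.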
1824 */ 1825 if ((len >= MMU_PAGESIZE4M) && 1826 !((uintptr_t)addr & MMU_PAGEOFFSET4M) && 1827 !(disable_large_pages & (1 << TTE4M)) && 1828 !(mmu_ptob(pfn) & MMU_PAGEOFFSET4M)) { 1829 sfmmu_memtte(&tte, pfn, attr, TTE4M); 1830 (void) sfmmu_tteload_array(hat, &tte, addr, &pp, 1831 flags); 1832 len -= MMU_PAGESIZE4M; 1833 addr += MMU_PAGESIZE4M; 1834 pfn += MMU_PAGESIZE4M / MMU_PAGESIZE; 1835 } else if ((len >= MMU_PAGESIZE512K) && 1836 !((uintptr_t)addr & MMU_PAGEOFFSET512K) && 1837 !(disable_large_pages & (1 << TTE512K)) && 1838 !(mmu_ptob(pfn) & MMU_PAGEOFFSET512K)) { 1839 sfmmu_memtte(&tte, pfn, attr, TTE512K); 1840 (void) sfmmu_tteload_array(hat, &tte, addr, &pp, 1841 flags); 1842 len -= MMU_PAGESIZE512K; 1843 addr += MMU_PAGESIZE512K; 1844 pfn += MMU_PAGESIZE512K / MMU_PAGESIZE; 1845 } else if ((len >= MMU_PAGESIZE64K) && 1846 !((uintptr_t)addr & MMU_PAGEOFFSET64K) && 1847 !(disable_large_pages & (1 << TTE64K)) && 1848 !(mmu_ptob(pfn) & MMU_PAGEOFFSET64K)) { 1849 sfmmu_memtte(&tte, pfn, attr, TTE64K); 1850 (void) sfmmu_tteload_array(hat, &tte, addr, &pp, 1851 flags); 1852 len -= MMU_PAGESIZE64K; 1853 addr += MMU_PAGESIZE64K; 1854 pfn += MMU_PAGESIZE64K / MMU_PAGESIZE; 1855 } else { 1856 sfmmu_memtte(&tte, pfn, attr, TTE8K); 1857 (void) sfmmu_tteload_array(hat, &tte, addr, &pp, 1858 flags); 1859 len -= MMU_PAGESIZE; 1860 addr += MMU_PAGESIZE; 1861 pfn++; 1862 } 1863 } 1864 1865 /* 1866 * Check TSB and TLB page sizes. 1867 */ 1868 if ((flags & HAT_LOAD_SHARE) == 0) { 1869 sfmmu_check_page_sizes(hat, 1); 1870 } 1871 } 1872 1873 /* 1874 * Map the largest extend possible out of the page array. The array may NOT 1875 * be in order. The largest possible mapping a page can have 1876 * is specified in the p_szc field. The p_szc field 1877 * cannot change as long as there any mappings (large or small) 1878 * to any of the pages that make up the large page. (ie. any 1879 * promotion/demotion of page size is not up to the hat but up to 1880 * the page free list manager). The array 1881 * should consist of properly aligned contigous pages that are 1882 * part of a big page for a large mapping to be created. 1883 */ 1884 void 1885 hat_memload_array(struct hat *hat, caddr_t addr, size_t len, 1886 struct page **pps, uint_t attr, uint_t flags) 1887 { 1888 int ttesz; 1889 size_t mapsz; 1890 pgcnt_t numpg, npgs; 1891 tte_t tte; 1892 page_t *pp; 1893 int large_pages_disable; 1894 1895 ASSERT(!((uintptr_t)addr & MMU_PAGEOFFSET)); 1896 1897 if (hat->sfmmu_xhat_provider) { 1898 XHAT_MEMLOAD_ARRAY(hat, addr, len, pps, attr, flags); 1899 return; 1900 } 1901 1902 if (hat->sfmmu_rmstat) 1903 hat_resvstat(len, hat->sfmmu_as, addr); 1904 1905 #if defined(SF_ERRATA_57) 1906 if ((hat != ksfmmup) && AS_TYPE_64BIT(hat->sfmmu_as) && 1907 (addr < errata57_limit) && (attr & PROT_EXEC) && 1908 !(flags & HAT_LOAD_SHARE)) { 1909 cmn_err(CE_WARN, "hat_memload_array: illegal attempt to make " 1910 "user page executable"); 1911 attr &= ~PROT_EXEC; 1912 } 1913 #endif 1914 1915 /* Get number of pages */ 1916 npgs = len >> MMU_PAGESHIFT; 1917 1918 if (flags & HAT_LOAD_SHARE) { 1919 large_pages_disable = disable_ism_large_pages; 1920 } else { 1921 large_pages_disable = disable_large_pages; 1922 } 1923 1924 if (npgs < NHMENTS || large_pages_disable == LARGE_PAGES_OFF) { 1925 sfmmu_memload_batchsmall(hat, addr, pps, attr, flags, npgs); 1926 return; 1927 } 1928 1929 while (npgs >= NHMENTS) { 1930 pp = *pps; 1931 for (ttesz = pp->p_szc; ttesz != TTE8K; ttesz--) { 1932 /* 1933 * Check if this page size is disabled. 
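 *
 * For instance, when the root page has p_szc == TTE4M the loop tries
 * TTE4M, then TTE512K, then TTE64K (skipping any size whose bit is
 * set in the large_pages_disable mask chosen above) before the TTE8K
 * fallback below batches the rest of the current hmeblk.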
1934 */ 1935 if (large_pages_disable & (1 << ttesz)) 1936 continue; 1937 1938 numpg = TTEPAGES(ttesz); 1939 mapsz = numpg << MMU_PAGESHIFT; 1940 if ((npgs >= numpg) && 1941 IS_P2ALIGNED(addr, mapsz) && 1942 IS_P2ALIGNED(pp->p_pagenum, numpg)) { 1943 /* 1944 * At this point we have enough pages and 1945 * we know the virtual address and the pfn 1946 * are properly aligned. We still need 1947 * to check for physical contiguity but since 1948 * it is very likely that this is the case 1949 * we will assume they are so and undo 1950 * the request if necessary. It would 1951 * be great if we could get a hint flag 1952 * like HAT_CONTIG which would tell us 1953 * the pages are contigous for sure. 1954 */ 1955 sfmmu_memtte(&tte, (*pps)->p_pagenum, 1956 attr, ttesz); 1957 if (!sfmmu_tteload_array(hat, &tte, addr, 1958 pps, flags)) { 1959 break; 1960 } 1961 } 1962 } 1963 if (ttesz == TTE8K) { 1964 /* 1965 * We were not able to map array using a large page 1966 * batch a hmeblk or fraction at a time. 1967 */ 1968 numpg = ((uintptr_t)addr >> MMU_PAGESHIFT) 1969 & (NHMENTS-1); 1970 numpg = NHMENTS - numpg; 1971 ASSERT(numpg <= npgs); 1972 mapsz = numpg * MMU_PAGESIZE; 1973 sfmmu_memload_batchsmall(hat, addr, pps, attr, flags, 1974 numpg); 1975 } 1976 addr += mapsz; 1977 npgs -= numpg; 1978 pps += numpg; 1979 } 1980 1981 if (npgs) { 1982 sfmmu_memload_batchsmall(hat, addr, pps, attr, flags, npgs); 1983 } 1984 1985 /* 1986 * Check TSB and TLB page sizes. 1987 */ 1988 if ((flags & HAT_LOAD_SHARE) == 0) { 1989 sfmmu_check_page_sizes(hat, 1); 1990 } 1991 } 1992 1993 /* 1994 * Function tries to batch 8K pages into the same hme blk. 1995 */ 1996 static void 1997 sfmmu_memload_batchsmall(struct hat *hat, caddr_t vaddr, page_t **pps, 1998 uint_t attr, uint_t flags, pgcnt_t npgs) 1999 { 2000 tte_t tte; 2001 page_t *pp; 2002 struct hmehash_bucket *hmebp; 2003 struct hme_blk *hmeblkp; 2004 int index; 2005 2006 while (npgs) { 2007 /* 2008 * Acquire the hash bucket. 2009 */ 2010 hmebp = sfmmu_tteload_acquire_hashbucket(hat, vaddr, TTE8K); 2011 ASSERT(hmebp); 2012 2013 /* 2014 * Find the hment block. 2015 */ 2016 hmeblkp = sfmmu_tteload_find_hmeblk(hat, hmebp, vaddr, 2017 TTE8K, flags); 2018 ASSERT(hmeblkp); 2019 2020 do { 2021 /* 2022 * Make the tte. 2023 */ 2024 pp = *pps; 2025 sfmmu_memtte(&tte, pp->p_pagenum, attr, TTE8K); 2026 2027 /* 2028 * Add the translation. 2029 */ 2030 (void) sfmmu_tteload_addentry(hat, hmeblkp, &tte, 2031 vaddr, pps, flags); 2032 2033 /* 2034 * Goto next page. 2035 */ 2036 pps++; 2037 npgs--; 2038 2039 /* 2040 * Goto next address. 2041 */ 2042 vaddr += MMU_PAGESIZE; 2043 2044 /* 2045 * Don't crossover into a different hmentblk. 2046 */ 2047 index = (int)(((uintptr_t)vaddr >> MMU_PAGESHIFT) & 2048 (NHMENTS-1)); 2049 2050 } while (index != 0 && npgs != 0); 2051 2052 /* 2053 * Release the hash bucket. 
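 *
 * (The inner do/while above stops as soon as the 8K index within the
 * hmeblk wraps back to 0, so each pass of the outer loop loads at
 * most one hmeblk's worth of 8K ttes -- NHMENTS of them when vaddr
 * started on an hmeblk boundary, fewer otherwise -- under a single
 * hash bucket lock before it is released here.)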
2054 */ 2055 2056 sfmmu_tteload_release_hashbucket(hmebp); 2057 } 2058 } 2059 2060 /* 2061 * Construct a tte for a page: 2062 * 2063 * tte_valid = 1 2064 * tte_size2 = size & TTE_SZ2_BITS (Panther-only) 2065 * tte_size = size 2066 * tte_nfo = attr & HAT_NOFAULT 2067 * tte_ie = attr & HAT_STRUCTURE_LE 2068 * tte_hmenum = hmenum 2069 * tte_pahi = pp->p_pagenum >> TTE_PASHIFT; 2070 * tte_palo = pp->p_pagenum & TTE_PALOMASK; 2071 * tte_ref = 1 (optimization) 2072 * tte_wr_perm = attr & PROT_WRITE; 2073 * tte_no_sync = attr & HAT_NOSYNC 2074 * tte_lock = attr & SFMMU_LOCKTTE 2075 * tte_cp = !(attr & SFMMU_UNCACHEPTTE) 2076 * tte_cv = !(attr & SFMMU_UNCACHEVTTE) 2077 * tte_e = attr & SFMMU_SIDEFFECT 2078 * tte_priv = !(attr & PROT_USER) 2079 * tte_hwwr = if nosync is set and it is writable we set the mod bit (opt) 2080 * tte_glb = 0 2081 */ 2082 void 2083 sfmmu_memtte(tte_t *ttep, pfn_t pfn, uint_t attr, int tte_sz) 2084 { 2085 ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR)); 2086 2087 ttep->tte_inthi = MAKE_TTE_INTHI(pfn, attr, tte_sz, 0 /* hmenum */); 2088 ttep->tte_intlo = MAKE_TTE_INTLO(pfn, attr, tte_sz, 0 /* hmenum */); 2089 2090 if (TTE_IS_NOSYNC(ttep)) { 2091 TTE_SET_REF(ttep); 2092 if (TTE_IS_WRITABLE(ttep)) { 2093 TTE_SET_MOD(ttep); 2094 } 2095 } 2096 if (TTE_IS_NFO(ttep) && TTE_IS_EXECUTABLE(ttep)) { 2097 panic("sfmmu_memtte: can't set both NFO and EXEC bits"); 2098 } 2099 } 2100 2101 /* 2102 * This function will add a translation to the hme_blk and allocate the 2103 * hme_blk if one does not exist. 2104 * If a page structure is specified then it will add the 2105 * corresponding hment to the mapping list. 2106 * It will also update the hmenum field for the tte. 2107 */ 2108 void 2109 sfmmu_tteload(struct hat *sfmmup, tte_t *ttep, caddr_t vaddr, page_t *pp, 2110 uint_t flags) 2111 { 2112 (void) sfmmu_tteload_array(sfmmup, ttep, vaddr, &pp, flags); 2113 } 2114 2115 /* 2116 * Load (ttep != NULL) or unload (ttep == NULL) one entry in the TSB. 2117 * Assumes that a particular page size may only be resident in one TSB. 2118 */ 2119 static void 2120 sfmmu_mod_tsb(sfmmu_t *sfmmup, caddr_t vaddr, tte_t *ttep, int ttesz) 2121 { 2122 struct tsb_info *tsbinfop = NULL; 2123 uint64_t tag; 2124 struct tsbe *tsbe_addr; 2125 uint64_t tsb_base; 2126 uint_t tsb_size; 2127 int vpshift = MMU_PAGESHIFT; 2128 int phys = 0; 2129 2130 if (sfmmup == ksfmmup) { /* No support for 32/256M ksfmmu pages */ 2131 phys = ktsb_phys; 2132 if (ttesz >= TTE4M) { 2133 #ifndef sun4v 2134 ASSERT((ttesz != TTE32M) && (ttesz != TTE256M)); 2135 #endif 2136 tsb_base = (phys)? ktsb4m_pbase : (uint64_t)ktsb4m_base; 2137 tsb_size = ktsb4m_szcode; 2138 } else { 2139 tsb_base = (phys)? ktsb_pbase : (uint64_t)ktsb_base; 2140 tsb_size = ktsb_szcode; 2141 } 2142 } else { 2143 SFMMU_GET_TSBINFO(tsbinfop, sfmmup, ttesz); 2144 2145 /* 2146 * If there isn't a TSB for this page size, or the TSB is 2147 * swapped out, there is nothing to do. Note that the latter 2148 * case seems impossible but can occur if hat_pageunload() 2149 * is called on an ISM mapping while the process is swapped 2150 * out. 2151 */ 2152 if (tsbinfop == NULL || (tsbinfop->tsb_flags & TSB_SWAPPED)) 2153 return; 2154 2155 /* 2156 * If another thread is in the middle of relocating a TSB 2157 * we can't unload the entry so set a flag so that the 2158 * TSB will be flushed before it can be accessed by the 2159 * process. 
2160 */ 2161 if ((tsbinfop->tsb_flags & TSB_RELOC_FLAG) != 0) { 2162 if (ttep == NULL) 2163 tsbinfop->tsb_flags |= TSB_FLUSH_NEEDED; 2164 return; 2165 } 2166 #if defined(UTSB_PHYS) 2167 phys = 1; 2168 tsb_base = (uint64_t)tsbinfop->tsb_pa; 2169 #else 2170 tsb_base = (uint64_t)tsbinfop->tsb_va; 2171 #endif 2172 tsb_size = tsbinfop->tsb_szc; 2173 } 2174 if (ttesz >= TTE4M) 2175 vpshift = MMU_PAGESHIFT4M; 2176 2177 tsbe_addr = sfmmu_get_tsbe(tsb_base, vaddr, vpshift, tsb_size); 2178 tag = sfmmu_make_tsbtag(vaddr); 2179 2180 if (ttep == NULL) { 2181 sfmmu_unload_tsbe(tsbe_addr, tag, phys); 2182 } else { 2183 if (ttesz >= TTE4M) { 2184 SFMMU_STAT(sf_tsb_load4m); 2185 } else { 2186 SFMMU_STAT(sf_tsb_load8k); 2187 } 2188 2189 sfmmu_load_tsbe(tsbe_addr, tag, ttep, phys); 2190 } 2191 } 2192 2193 /* 2194 * Unmap all entries from [start, end) matching the given page size. 2195 * 2196 * This function is used primarily to unmap replicated 64K or 512K entries 2197 * from the TSB that are inserted using the base page size TSB pointer, but 2198 * it may also be called to unmap a range of addresses from the TSB. 2199 */ 2200 void 2201 sfmmu_unload_tsb_range(sfmmu_t *sfmmup, caddr_t start, caddr_t end, int ttesz) 2202 { 2203 struct tsb_info *tsbinfop; 2204 uint64_t tag; 2205 struct tsbe *tsbe_addr; 2206 caddr_t vaddr; 2207 uint64_t tsb_base; 2208 int vpshift, vpgsz; 2209 uint_t tsb_size; 2210 int phys = 0; 2211 2212 /* 2213 * Assumptions: 2214 * If ttesz == 8K, 64K or 512K, we walk through the range 8K 2215 * at a time shooting down any valid entries we encounter. 2216 * 2217 * If ttesz >= 4M we walk the range 4M at a time shooting 2218 * down any valid mappings we find. 2219 */ 2220 if (sfmmup == ksfmmup) { 2221 phys = ktsb_phys; 2222 if (ttesz >= TTE4M) { 2223 #ifndef sun4v 2224 ASSERT((ttesz != TTE32M) && (ttesz != TTE256M)); 2225 #endif 2226 tsb_base = (phys)? ktsb4m_pbase : (uint64_t)ktsb4m_base; 2227 tsb_size = ktsb4m_szcode; 2228 } else { 2229 tsb_base = (phys)? ktsb_pbase : (uint64_t)ktsb_base; 2230 tsb_size = ktsb_szcode; 2231 } 2232 } else { 2233 SFMMU_GET_TSBINFO(tsbinfop, sfmmup, ttesz); 2234 2235 /* 2236 * If there isn't a TSB for this page size, or the TSB is 2237 * swapped out, there is nothing to do. Note that the latter 2238 * case seems impossible but can occur if hat_pageunload() 2239 * is called on an ISM mapping while the process is swapped 2240 * out. 2241 */ 2242 if (tsbinfop == NULL || (tsbinfop->tsb_flags & TSB_SWAPPED)) 2243 return; 2244 2245 /* 2246 * If another thread is in the middle of relocating a TSB 2247 * we can't unload the entry so set a flag so that the 2248 * TSB will be flushed before it can be accessed by the 2249 * process. 2250 */ 2251 if ((tsbinfop->tsb_flags & TSB_RELOC_FLAG) != 0) { 2252 tsbinfop->tsb_flags |= TSB_FLUSH_NEEDED; 2253 return; 2254 } 2255 #if defined(UTSB_PHYS) 2256 phys = 1; 2257 tsb_base = (uint64_t)tsbinfop->tsb_pa; 2258 #else 2259 tsb_base = (uint64_t)tsbinfop->tsb_va; 2260 #endif 2261 tsb_size = tsbinfop->tsb_szc; 2262 } 2263 if (ttesz >= TTE4M) { 2264 vpshift = MMU_PAGESHIFT4M; 2265 vpgsz = MMU_PAGESIZE4M; 2266 } else { 2267 vpshift = MMU_PAGESHIFT; 2268 vpgsz = MMU_PAGESIZE; 2269 } 2270 2271 for (vaddr = start; vaddr < end; vaddr += vpgsz) { 2272 tag = sfmmu_make_tsbtag(vaddr); 2273 tsbe_addr = sfmmu_get_tsbe(tsb_base, vaddr, vpshift, tsb_size); 2274 sfmmu_unload_tsbe(tsbe_addr, tag, phys); 2275 } 2276 } 2277 2278 /* 2279 * Select the optimum TSB size given the number of mappings 2280 * that need to be cached. 
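 *
 * Roughly: the size code below is bumped until pgcnt fits, so a
 * pgcnt just over SFMMU_RSS_TSBSIZE(0) yields szc 1, while anything
 * larger than the biggest threshold is capped at tsb_max_growsize
 * (both SFMMU_RSS_TSBSIZE() and tsb_max_growsize are defined
 * elsewhere).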
2281 */ 2282 static int 2283 sfmmu_select_tsb_szc(pgcnt_t pgcnt) 2284 { 2285 int szc = 0; 2286 2287 #ifdef DEBUG 2288 if (tsb_grow_stress) { 2289 uint32_t randval = (uint32_t)gettick() >> 4; 2290 return (randval % (tsb_max_growsize + 1)); 2291 } 2292 #endif /* DEBUG */ 2293 2294 while ((szc < tsb_max_growsize) && (pgcnt > SFMMU_RSS_TSBSIZE(szc))) 2295 szc++; 2296 return (szc); 2297 } 2298 2299 /* 2300 * This function will add a translation to the hme_blk and allocate the 2301 * hme_blk if one does not exist. 2302 * If a page structure is specified then it will add the 2303 * corresponding hment to the mapping list. 2304 * It will also update the hmenum field for the tte. 2305 * Furthermore, it attempts to create a large page translation 2306 * for <addr,hat> at page array pps. It assumes addr and first 2307 * pp is correctly aligned. It returns 0 if successful and 1 otherwise. 2308 */ 2309 static int 2310 sfmmu_tteload_array(sfmmu_t *sfmmup, tte_t *ttep, caddr_t vaddr, 2311 page_t **pps, uint_t flags) 2312 { 2313 struct hmehash_bucket *hmebp; 2314 struct hme_blk *hmeblkp; 2315 int ret; 2316 uint_t size; 2317 2318 /* 2319 * Get mapping size. 2320 */ 2321 size = TTE_CSZ(ttep); 2322 ASSERT(!((uintptr_t)vaddr & TTE_PAGE_OFFSET(size))); 2323 2324 /* 2325 * Acquire the hash bucket. 2326 */ 2327 hmebp = sfmmu_tteload_acquire_hashbucket(sfmmup, vaddr, size); 2328 ASSERT(hmebp); 2329 2330 /* 2331 * Find the hment block. 2332 */ 2333 hmeblkp = sfmmu_tteload_find_hmeblk(sfmmup, hmebp, vaddr, size, flags); 2334 ASSERT(hmeblkp); 2335 2336 /* 2337 * Add the translation. 2338 */ 2339 ret = sfmmu_tteload_addentry(sfmmup, hmeblkp, ttep, vaddr, pps, flags); 2340 2341 /* 2342 * Release the hash bucket. 2343 */ 2344 sfmmu_tteload_release_hashbucket(hmebp); 2345 2346 return (ret); 2347 } 2348 2349 /* 2350 * Function locks and returns a pointer to the hash bucket for vaddr and size. 2351 */ 2352 static struct hmehash_bucket * 2353 sfmmu_tteload_acquire_hashbucket(sfmmu_t *sfmmup, caddr_t vaddr, int size) 2354 { 2355 struct hmehash_bucket *hmebp; 2356 int hmeshift; 2357 2358 hmeshift = HME_HASH_SHIFT(size); 2359 2360 hmebp = HME_HASH_FUNCTION(sfmmup, vaddr, hmeshift); 2361 2362 SFMMU_HASH_LOCK(hmebp); 2363 2364 return (hmebp); 2365 } 2366 2367 /* 2368 * Function returns a pointer to an hmeblk in the hash bucket, hmebp. If the 2369 * hmeblk doesn't exists for the [sfmmup, vaddr & size] signature, a hmeblk is 2370 * allocated. 2371 */ 2372 static struct hme_blk * 2373 sfmmu_tteload_find_hmeblk(sfmmu_t *sfmmup, struct hmehash_bucket *hmebp, 2374 caddr_t vaddr, uint_t size, uint_t flags) 2375 { 2376 hmeblk_tag hblktag; 2377 int hmeshift; 2378 struct hme_blk *hmeblkp, *pr_hblk, *list = NULL; 2379 uint64_t hblkpa, prevpa; 2380 struct kmem_cache *sfmmu_cache; 2381 uint_t forcefree; 2382 2383 hblktag.htag_id = sfmmup; 2384 hmeshift = HME_HASH_SHIFT(size); 2385 hblktag.htag_bspage = HME_HASH_BSPAGE(vaddr, hmeshift); 2386 hblktag.htag_rehash = HME_HASH_REHASH(size); 2387 2388 ttearray_realloc: 2389 2390 HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, hblkpa, 2391 pr_hblk, prevpa, &list); 2392 2393 /* 2394 * We block until hblk_reserve_lock is released; it's held by 2395 * the thread, temporarily using hblk_reserve, until hblk_reserve is 2396 * replaced by a hblk from sfmmu8_cache. 
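 *
 * (That is why the code below does a bare mutex_enter() followed
 * immediately by mutex_exit() on hblk_reserve_lock: the pair exists
 * purely to park this thread until the owner releases the lock,
 * after which we retake the hash lock and retry from
 * ttearray_realloc.)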
2397 */ 2398 if (hmeblkp == (struct hme_blk *)hblk_reserve && 2399 hblk_reserve_thread != curthread) { 2400 SFMMU_HASH_UNLOCK(hmebp); 2401 mutex_enter(&hblk_reserve_lock); 2402 mutex_exit(&hblk_reserve_lock); 2403 SFMMU_STAT(sf_hblk_reserve_hit); 2404 SFMMU_HASH_LOCK(hmebp); 2405 goto ttearray_realloc; 2406 } 2407 2408 if (hmeblkp == NULL) { 2409 hmeblkp = sfmmu_hblk_alloc(sfmmup, vaddr, hmebp, size, 2410 hblktag, flags); 2411 } else { 2412 /* 2413 * It is possible for 8k and 64k hblks to collide since they 2414 * have the same rehash value. This is because we 2415 * lazily free hblks and 8K/64K blks could be lingering. 2416 * If we find size mismatch we free the block and & try again. 2417 */ 2418 if (get_hblk_ttesz(hmeblkp) != size) { 2419 ASSERT(!hmeblkp->hblk_vcnt); 2420 ASSERT(!hmeblkp->hblk_hmecnt); 2421 sfmmu_hblk_hash_rm(hmebp, hmeblkp, prevpa, pr_hblk); 2422 sfmmu_hblk_free(hmebp, hmeblkp, hblkpa, &list); 2423 goto ttearray_realloc; 2424 } 2425 if (hmeblkp->hblk_shw_bit) { 2426 /* 2427 * if the hblk was previously used as a shadow hblk then 2428 * we will change it to a normal hblk 2429 */ 2430 if (hmeblkp->hblk_shw_mask) { 2431 sfmmu_shadow_hcleanup(sfmmup, hmeblkp, hmebp); 2432 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 2433 goto ttearray_realloc; 2434 } else { 2435 hmeblkp->hblk_shw_bit = 0; 2436 } 2437 } 2438 SFMMU_STAT(sf_hblk_hit); 2439 } 2440 2441 /* 2442 * hat_memload() should never call kmem_cache_free(); see block 2443 * comment showing the stacktrace in sfmmu_hblk_alloc(); 2444 * enqueue each hblk in the list to reserve list if it's created 2445 * from sfmmu8_cache *and* sfmmup == KHATID. 2446 */ 2447 forcefree = (sfmmup == KHATID) ? 1 : 0; 2448 while ((pr_hblk = list) != NULL) { 2449 list = pr_hblk->hblk_next; 2450 sfmmu_cache = get_hblk_cache(pr_hblk); 2451 if ((sfmmu_cache == sfmmu8_cache) && 2452 sfmmu_put_free_hblk(pr_hblk, forcefree)) 2453 continue; 2454 2455 ASSERT(sfmmup != KHATID); 2456 kmem_cache_free(sfmmu_cache, pr_hblk); 2457 } 2458 2459 ASSERT(get_hblk_ttesz(hmeblkp) == size); 2460 ASSERT(!hmeblkp->hblk_shw_bit); 2461 2462 return (hmeblkp); 2463 } 2464 2465 /* 2466 * Function adds a tte entry into the hmeblk. It returns 0 if successful and 1 2467 * otherwise. 2468 */ 2469 static int 2470 sfmmu_tteload_addentry(sfmmu_t *sfmmup, struct hme_blk *hmeblkp, tte_t *ttep, 2471 caddr_t vaddr, page_t **pps, uint_t flags) 2472 { 2473 page_t *pp = *pps; 2474 int hmenum, size, remap; 2475 tte_t tteold, flush_tte; 2476 #ifdef DEBUG 2477 tte_t orig_old; 2478 #endif /* DEBUG */ 2479 struct sf_hment *sfhme; 2480 kmutex_t *pml, *pmtx; 2481 hatlock_t *hatlockp; 2482 2483 /* 2484 * remove this panic when we decide to let user virtual address 2485 * space be >= USERLIMIT. 2486 */ 2487 if (!TTE_IS_PRIVILEGED(ttep) && vaddr >= (caddr_t)USERLIMIT) 2488 panic("user addr %p in kernel space", vaddr); 2489 #if defined(TTE_IS_GLOBAL) 2490 if (TTE_IS_GLOBAL(ttep)) 2491 panic("sfmmu_tteload: creating global tte"); 2492 #endif 2493 2494 #ifdef DEBUG 2495 if (pf_is_memory(sfmmu_ttetopfn(ttep, vaddr)) && 2496 !TTE_IS_PCACHEABLE(ttep) && !sfmmu_allow_nc_trans) 2497 panic("sfmmu_tteload: non cacheable memory tte"); 2498 #endif /* DEBUG */ 2499 2500 if ((flags & HAT_LOAD_SHARE) || !TTE_IS_REF(ttep) || 2501 !TTE_IS_MOD(ttep)) { 2502 /* 2503 * Don't load TSB for dummy as in ISM. Also don't preload 2504 * the TSB if the TTE isn't writable since we're likely to 2505 * fault on it again -- preloading can be fairly expensive. 
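 *
 * (SFMMU_NO_TSBLOAD only suppresses the optional TSB preload near
 * the end of this function; the hme translation is still installed,
 * so the worst case is one extra TSB miss when the address is first
 * touched.)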
2506 */ 2507 flags |= SFMMU_NO_TSBLOAD; 2508 } 2509 2510 size = TTE_CSZ(ttep); 2511 switch (size) { 2512 case TTE8K: 2513 SFMMU_STAT(sf_tteload8k); 2514 break; 2515 case TTE64K: 2516 SFMMU_STAT(sf_tteload64k); 2517 break; 2518 case TTE512K: 2519 SFMMU_STAT(sf_tteload512k); 2520 break; 2521 case TTE4M: 2522 SFMMU_STAT(sf_tteload4m); 2523 break; 2524 case (TTE32M): 2525 SFMMU_STAT(sf_tteload32m); 2526 ASSERT(mmu_page_sizes == max_mmu_page_sizes); 2527 break; 2528 case (TTE256M): 2529 SFMMU_STAT(sf_tteload256m); 2530 ASSERT(mmu_page_sizes == max_mmu_page_sizes); 2531 break; 2532 } 2533 2534 ASSERT(!((uintptr_t)vaddr & TTE_PAGE_OFFSET(size))); 2535 2536 HBLKTOHME_IDX(sfhme, hmeblkp, vaddr, hmenum); 2537 2538 /* 2539 * Need to grab mlist lock here so that pageunload 2540 * will not change tte behind us. 2541 */ 2542 if (pp) { 2543 pml = sfmmu_mlist_enter(pp); 2544 } 2545 2546 sfmmu_copytte(&sfhme->hme_tte, &tteold); 2547 /* 2548 * Look for corresponding hment and if valid verify 2549 * pfns are equal. 2550 */ 2551 remap = TTE_IS_VALID(&tteold); 2552 if (remap) { 2553 pfn_t new_pfn, old_pfn; 2554 2555 old_pfn = TTE_TO_PFN(vaddr, &tteold); 2556 new_pfn = TTE_TO_PFN(vaddr, ttep); 2557 2558 if (flags & HAT_LOAD_REMAP) { 2559 /* make sure we are remapping same type of pages */ 2560 if (pf_is_memory(old_pfn) != pf_is_memory(new_pfn)) { 2561 panic("sfmmu_tteload - tte remap io<->memory"); 2562 } 2563 if (old_pfn != new_pfn && 2564 (pp != NULL || sfhme->hme_page != NULL)) { 2565 panic("sfmmu_tteload - tte remap pp != NULL"); 2566 } 2567 } else if (old_pfn != new_pfn) { 2568 panic("sfmmu_tteload - tte remap, hmeblkp 0x%p", 2569 (void *)hmeblkp); 2570 } 2571 ASSERT(TTE_CSZ(&tteold) == TTE_CSZ(ttep)); 2572 } 2573 2574 if (pp) { 2575 if (size == TTE8K) { 2576 /* 2577 * Handle VAC consistency 2578 */ 2579 if (!remap && (cache & CACHE_VAC) && !PP_ISNC(pp)) { 2580 sfmmu_vac_conflict(sfmmup, vaddr, pp); 2581 } 2582 2583 if (TTE_IS_WRITABLE(ttep) && PP_ISRO(pp)) { 2584 pmtx = sfmmu_page_enter(pp); 2585 PP_CLRRO(pp); 2586 sfmmu_page_exit(pmtx); 2587 } else if (!PP_ISMAPPED(pp) && 2588 (!TTE_IS_WRITABLE(ttep)) && !(PP_ISMOD(pp))) { 2589 pmtx = sfmmu_page_enter(pp); 2590 if (!(PP_ISMOD(pp))) { 2591 PP_SETRO(pp); 2592 } 2593 sfmmu_page_exit(pmtx); 2594 } 2595 2596 } else if (sfmmu_pagearray_setup(vaddr, pps, ttep, remap)) { 2597 /* 2598 * sfmmu_pagearray_setup failed so return 2599 */ 2600 sfmmu_mlist_exit(pml); 2601 return (1); 2602 } 2603 } 2604 2605 /* 2606 * Make sure hment is not on a mapping list. 2607 */ 2608 ASSERT(remap || (sfhme->hme_page == NULL)); 2609 2610 /* if it is not a remap then hme->next better be NULL */ 2611 ASSERT((!remap) ? sfhme->hme_next == NULL : 1); 2612 2613 if (flags & HAT_LOAD_LOCK) { 2614 if (((int)hmeblkp->hblk_lckcnt + 1) >= MAX_HBLK_LCKCNT) { 2615 panic("too high lckcnt-hmeblk %p", 2616 (void *)hmeblkp); 2617 } 2618 atomic_add_16(&hmeblkp->hblk_lckcnt, 1); 2619 2620 HBLK_STACK_TRACE(hmeblkp, HBLK_LOCK); 2621 } 2622 2623 if (pp && PP_ISNC(pp)) { 2624 /* 2625 * If the physical page is marked to be uncacheable, like 2626 * by a vac conflict, make sure the new mapping is also 2627 * uncacheable. 
2628 */ 2629 TTE_CLR_VCACHEABLE(ttep); 2630 ASSERT(PP_GET_VCOLOR(pp) == NO_VCOLOR); 2631 } 2632 ttep->tte_hmenum = hmenum; 2633 2634 #ifdef DEBUG 2635 orig_old = tteold; 2636 #endif /* DEBUG */ 2637 2638 while (sfmmu_modifytte_try(&tteold, ttep, &sfhme->hme_tte) < 0) { 2639 if ((sfmmup == KHATID) && 2640 (flags & (HAT_LOAD_LOCK | HAT_LOAD_REMAP))) { 2641 sfmmu_copytte(&sfhme->hme_tte, &tteold); 2642 } 2643 #ifdef DEBUG 2644 chk_tte(&orig_old, &tteold, ttep, hmeblkp); 2645 #endif /* DEBUG */ 2646 } 2647 2648 if (!TTE_IS_VALID(&tteold)) { 2649 2650 atomic_add_16(&hmeblkp->hblk_vcnt, 1); 2651 atomic_add_long(&sfmmup->sfmmu_ttecnt[size], 1); 2652 2653 /* 2654 * HAT_RELOAD_SHARE has been deprecated with lpg DISM. 2655 */ 2656 2657 if (size > TTE8K && (flags & HAT_LOAD_SHARE) == 0 && 2658 sfmmup != ksfmmup) { 2659 /* 2660 * If this is the first large mapping for the process 2661 * we must force any CPUs running this process to TL=0 2662 * where they will reload the HAT flags from the 2663 * tsbmiss area. This is necessary to make the large 2664 * mappings we are about to load visible to those CPUs; 2665 * otherwise they'll loop forever calling pagefault() 2666 * since we don't search large hash chains by default. 2667 */ 2668 hatlockp = sfmmu_hat_enter(sfmmup); 2669 if (size == TTE512K && 2670 !SFMMU_FLAGS_ISSET(sfmmup, HAT_512K_FLAG)) { 2671 SFMMU_FLAGS_SET(sfmmup, HAT_512K_FLAG); 2672 sfmmu_sync_mmustate(sfmmup); 2673 } else if (size == TTE4M && 2674 !SFMMU_FLAGS_ISSET(sfmmup, HAT_4M_FLAG)) { 2675 SFMMU_FLAGS_SET(sfmmup, HAT_4M_FLAG); 2676 sfmmu_sync_mmustate(sfmmup); 2677 } else if (size == TTE64K && 2678 !SFMMU_FLAGS_ISSET(sfmmup, HAT_64K_FLAG)) { 2679 SFMMU_FLAGS_SET(sfmmup, HAT_64K_FLAG); 2680 /* no sync mmustate; 64K shares 8K hashes */ 2681 } else if (mmu_page_sizes == max_mmu_page_sizes) { 2682 if (size == TTE32M && 2683 !SFMMU_FLAGS_ISSET(sfmmup, HAT_32M_FLAG)) { 2684 SFMMU_FLAGS_SET(sfmmup, HAT_32M_FLAG); 2685 sfmmu_sync_mmustate(sfmmup); 2686 } else if (size == TTE256M && 2687 !SFMMU_FLAGS_ISSET(sfmmup, HAT_256M_FLAG)) { 2688 SFMMU_FLAGS_SET(sfmmup, HAT_256M_FLAG); 2689 sfmmu_sync_mmustate(sfmmup); 2690 } 2691 } 2692 if (size >= TTE4M && (flags & HAT_LOAD_TEXT) && 2693 !SFMMU_FLAGS_ISSET(sfmmup, HAT_4MTEXT_FLAG)) { 2694 SFMMU_FLAGS_SET(sfmmup, HAT_4MTEXT_FLAG); 2695 } 2696 sfmmu_hat_exit(hatlockp); 2697 } 2698 } 2699 ASSERT(TTE_IS_VALID(&sfhme->hme_tte)); 2700 2701 flush_tte.tte_intlo = (tteold.tte_intlo ^ ttep->tte_intlo) & 2702 hw_tte.tte_intlo; 2703 flush_tte.tte_inthi = (tteold.tte_inthi ^ ttep->tte_inthi) & 2704 hw_tte.tte_inthi; 2705 2706 if (remap && (flush_tte.tte_inthi || flush_tte.tte_intlo)) { 2707 /* 2708 * If remap and new tte differs from old tte we need 2709 * to sync the mod bit and flush TLB/TSB. We don't 2710 * need to sync ref bit because we currently always set 2711 * ref bit in tteload. 2712 */ 2713 ASSERT(TTE_IS_REF(ttep)); 2714 if (TTE_IS_MOD(&tteold)) { 2715 sfmmu_ttesync(sfmmup, vaddr, &tteold, pp); 2716 } 2717 sfmmu_tlb_demap(vaddr, sfmmup, hmeblkp, 0, 0); 2718 xt_sync(sfmmup->sfmmu_cpusran); 2719 } 2720 2721 if ((flags & SFMMU_NO_TSBLOAD) == 0) { 2722 /* 2723 * We only preload 8K and 4M mappings into the TSB, since 2724 * 64K and 512K mappings are replicated and hence don't 2725 * have a single, unique TSB entry. Ditto for 32M/256M. 
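 *
 * (The replicated sizes get filled in one base-page-indexed slot at
 * a time on TSB misses instead, which is why sfmmu_unload_tsb_range()
 * above walks such ranges 8K at a time when shooting them down.)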
2726 */ 2727 if (size == TTE8K || size == TTE4M) { 2728 hatlockp = sfmmu_hat_enter(sfmmup); 2729 sfmmu_load_tsb(sfmmup, vaddr, &sfhme->hme_tte, size); 2730 sfmmu_hat_exit(hatlockp); 2731 } 2732 } 2733 if (pp) { 2734 if (!remap) { 2735 HME_ADD(sfhme, pp); 2736 atomic_add_16(&hmeblkp->hblk_hmecnt, 1); 2737 ASSERT(hmeblkp->hblk_hmecnt > 0); 2738 2739 /* 2740 * Cannot ASSERT(hmeblkp->hblk_hmecnt <= NHMENTS) 2741 * see pageunload() for comment. 2742 */ 2743 } 2744 sfmmu_mlist_exit(pml); 2745 } 2746 2747 return (0); 2748 } 2749 /* 2750 * Function unlocks hash bucket. 2751 */ 2752 static void 2753 sfmmu_tteload_release_hashbucket(struct hmehash_bucket *hmebp) 2754 { 2755 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 2756 SFMMU_HASH_UNLOCK(hmebp); 2757 } 2758 2759 /* 2760 * function which checks and sets up page array for a large 2761 * translation. Will set p_vcolor, p_index, p_ro fields. 2762 * Assumes addr and pfnum of first page are properly aligned. 2763 * Will check for physical contiguity. If check fails it return 2764 * non null. 2765 */ 2766 static int 2767 sfmmu_pagearray_setup(caddr_t addr, page_t **pps, tte_t *ttep, int remap) 2768 { 2769 int i, index, ttesz, osz; 2770 pfn_t pfnum; 2771 pgcnt_t npgs; 2772 int cflags = 0; 2773 page_t *pp, *pp1; 2774 kmutex_t *pmtx; 2775 int vac_err = 0; 2776 int newidx = 0; 2777 2778 ttesz = TTE_CSZ(ttep); 2779 2780 ASSERT(ttesz > TTE8K); 2781 2782 npgs = TTEPAGES(ttesz); 2783 index = PAGESZ_TO_INDEX(ttesz); 2784 2785 pfnum = (*pps)->p_pagenum; 2786 ASSERT(IS_P2ALIGNED(pfnum, npgs)); 2787 2788 /* 2789 * Save the first pp so we can do HAT_TMPNC at the end. 2790 */ 2791 pp1 = *pps; 2792 osz = fnd_mapping_sz(pp1); 2793 2794 for (i = 0; i < npgs; i++, pps++) { 2795 pp = *pps; 2796 ASSERT(PAGE_LOCKED(pp)); 2797 ASSERT(pp->p_szc >= ttesz); 2798 ASSERT(pp->p_szc == pp1->p_szc); 2799 ASSERT(sfmmu_mlist_held(pp)); 2800 2801 /* 2802 * XXX is it possible to maintain P_RO on the root only? 2803 */ 2804 if (TTE_IS_WRITABLE(ttep) && PP_ISRO(pp)) { 2805 pmtx = sfmmu_page_enter(pp); 2806 PP_CLRRO(pp); 2807 sfmmu_page_exit(pmtx); 2808 } else if (!PP_ISMAPPED(pp) && !TTE_IS_WRITABLE(ttep) && 2809 !PP_ISMOD(pp)) { 2810 pmtx = sfmmu_page_enter(pp); 2811 if (!(PP_ISMOD(pp))) { 2812 PP_SETRO(pp); 2813 } 2814 sfmmu_page_exit(pmtx); 2815 } 2816 2817 /* 2818 * If this is a remap we skip vac & contiguity checks. 2819 */ 2820 if (remap) 2821 continue; 2822 2823 /* 2824 * set p_vcolor and detect any vac conflicts. 2825 */ 2826 if (vac_err == 0) { 2827 vac_err = sfmmu_vacconflict_array(addr, pp, &cflags); 2828 2829 } 2830 2831 /* 2832 * Save current index in case we need to undo it. 2833 * Note: "PAGESZ_TO_INDEX(sz) (1 << (sz))" 2834 * "SFMMU_INDEX_SHIFT 6" 2835 * "SFMMU_INDEX_MASK ((1 << SFMMU_INDEX_SHIFT) - 1)" 2836 * "PP_MAPINDEX(p_index) (p_index & SFMMU_INDEX_MASK)" 2837 * 2838 * So: index = PAGESZ_TO_INDEX(ttesz); 2839 * if ttesz == 1 then index = 0x2 2840 * 2 then index = 0x4 2841 * 3 then index = 0x8 2842 * 4 then index = 0x10 2843 * 5 then index = 0x20 2844 * The code below checks if it's a new pagesize (ie, newidx) 2845 * in case we need to take it back out of p_index, 2846 * and then or's the new index into the existing index. 2847 */ 2848 if ((PP_MAPINDEX(pp) & index) == 0) 2849 newidx = 1; 2850 pp->p_index = (PP_MAPINDEX(pp) | index); 2851 2852 /* 2853 * contiguity check 2854 */ 2855 if (pp->p_pagenum != pfnum) { 2856 /* 2857 * If we fail the contiguity test then 2858 * the only thing we need to fix is the p_index field. 
2859 * We might get a few extra flushes but since this 2860 * path is rare that is ok. The p_ro field will 2861 * get automatically fixed on the next tteload to 2862 * the page. NO TNC bit is set yet. 2863 */ 2864 while (i >= 0) { 2865 pp = *pps; 2866 if (newidx) 2867 pp->p_index = (PP_MAPINDEX(pp) & 2868 ~index); 2869 pps--; 2870 i--; 2871 } 2872 return (1); 2873 } 2874 pfnum++; 2875 addr += MMU_PAGESIZE; 2876 } 2877 2878 if (vac_err) { 2879 if (ttesz > osz) { 2880 /* 2881 * There are some smaller mappings that causes vac 2882 * conflicts. Convert all existing small mappings to 2883 * TNC. 2884 */ 2885 SFMMU_STAT_ADD(sf_uncache_conflict, npgs); 2886 sfmmu_page_cache_array(pp1, HAT_TMPNC, CACHE_FLUSH, 2887 npgs); 2888 } else { 2889 /* EMPTY */ 2890 /* 2891 * If there exists an big page mapping, 2892 * that means the whole existing big page 2893 * has TNC setting already. No need to covert to 2894 * TNC again. 2895 */ 2896 ASSERT(PP_ISTNC(pp1)); 2897 } 2898 } 2899 2900 return (0); 2901 } 2902 2903 /* 2904 * Routine that detects vac consistency for a large page. It also 2905 * sets virtual color for all pp's for this big mapping. 2906 */ 2907 static int 2908 sfmmu_vacconflict_array(caddr_t addr, page_t *pp, int *cflags) 2909 { 2910 int vcolor, ocolor; 2911 2912 ASSERT(sfmmu_mlist_held(pp)); 2913 2914 if (PP_ISNC(pp)) { 2915 return (HAT_TMPNC); 2916 } 2917 2918 vcolor = addr_to_vcolor(addr); 2919 if (PP_NEWPAGE(pp)) { 2920 PP_SET_VCOLOR(pp, vcolor); 2921 return (0); 2922 } 2923 2924 ocolor = PP_GET_VCOLOR(pp); 2925 if (ocolor == vcolor) { 2926 return (0); 2927 } 2928 2929 if (!PP_ISMAPPED(pp)) { 2930 /* 2931 * Previous user of page had a differnet color 2932 * but since there are no current users 2933 * we just flush the cache and change the color. 2934 * As an optimization for large pages we flush the 2935 * entire cache of that color and set a flag. 2936 */ 2937 SFMMU_STAT(sf_pgcolor_conflict); 2938 if (!CacheColor_IsFlushed(*cflags, ocolor)) { 2939 CacheColor_SetFlushed(*cflags, ocolor); 2940 sfmmu_cache_flushcolor(ocolor, pp->p_pagenum); 2941 } 2942 PP_SET_VCOLOR(pp, vcolor); 2943 return (0); 2944 } 2945 2946 /* 2947 * We got a real conflict with a current mapping. 2948 * set flags to start unencaching all mappings 2949 * and return failure so we restart looping 2950 * the pp array from the beginning. 2951 */ 2952 return (HAT_TMPNC); 2953 } 2954 2955 /* 2956 * creates a large page shadow hmeblk for a tte. 2957 * The purpose of this routine is to allow us to do quick unloads because 2958 * the vm layer can easily pass a very large but sparsely populated range. 
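 *
 * Rough illustration of the size promotion done below: a TTE8K hblk
 * is shadowed at 512K (8K and 64K share the same rehash level, so
 * 64K would not buy a new hash level), and every other size is
 * shadowed at the next size up, e.g. a 512K hblk gets a 4M shadow.
 * The shadow's hblk_shw_mask then records, one bit per vshift, which
 * sub-ranges actually contain hblks, so unloading a huge sparse
 * range only has to visit the marked ones.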
2959 */ 2960 static struct hme_blk * 2961 sfmmu_shadow_hcreate(sfmmu_t *sfmmup, caddr_t vaddr, int ttesz, uint_t flags) 2962 { 2963 struct hmehash_bucket *hmebp; 2964 hmeblk_tag hblktag; 2965 int hmeshift, size, vshift; 2966 uint_t shw_mask, newshw_mask; 2967 struct hme_blk *hmeblkp; 2968 2969 ASSERT(sfmmup != KHATID); 2970 if (mmu_page_sizes == max_mmu_page_sizes) { 2971 ASSERT(ttesz < TTE256M); 2972 } else { 2973 ASSERT(ttesz < TTE4M); 2974 ASSERT(sfmmup->sfmmu_ttecnt[TTE32M] == 0); 2975 ASSERT(sfmmup->sfmmu_ttecnt[TTE256M] == 0); 2976 } 2977 2978 if (ttesz == TTE8K) { 2979 size = TTE512K; 2980 } else { 2981 size = ++ttesz; 2982 } 2983 2984 hblktag.htag_id = sfmmup; 2985 hmeshift = HME_HASH_SHIFT(size); 2986 hblktag.htag_bspage = HME_HASH_BSPAGE(vaddr, hmeshift); 2987 hblktag.htag_rehash = HME_HASH_REHASH(size); 2988 hmebp = HME_HASH_FUNCTION(sfmmup, vaddr, hmeshift); 2989 2990 SFMMU_HASH_LOCK(hmebp); 2991 2992 HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp); 2993 ASSERT(hmeblkp != (struct hme_blk *)hblk_reserve); 2994 if (hmeblkp == NULL) { 2995 hmeblkp = sfmmu_hblk_alloc(sfmmup, vaddr, hmebp, size, 2996 hblktag, flags); 2997 } 2998 ASSERT(hmeblkp); 2999 if (!hmeblkp->hblk_shw_mask) { 3000 /* 3001 * if this is a unused hblk it was just allocated or could 3002 * potentially be a previous large page hblk so we need to 3003 * set the shadow bit. 3004 */ 3005 hmeblkp->hblk_shw_bit = 1; 3006 } 3007 ASSERT(hmeblkp->hblk_shw_bit == 1); 3008 vshift = vaddr_to_vshift(hblktag, vaddr, size); 3009 ASSERT(vshift < 8); 3010 /* 3011 * Atomically set shw mask bit 3012 */ 3013 do { 3014 shw_mask = hmeblkp->hblk_shw_mask; 3015 newshw_mask = shw_mask | (1 << vshift); 3016 newshw_mask = cas32(&hmeblkp->hblk_shw_mask, shw_mask, 3017 newshw_mask); 3018 } while (newshw_mask != shw_mask); 3019 3020 SFMMU_HASH_UNLOCK(hmebp); 3021 3022 return (hmeblkp); 3023 } 3024 3025 /* 3026 * This routine cleanup a previous shadow hmeblk and changes it to 3027 * a regular hblk. This happens rarely but it is possible 3028 * when a process wants to use large pages and there are hblks still 3029 * lying around from the previous as that used these hmeblks. 3030 * The alternative was to cleanup the shadow hblks at unload time 3031 * but since so few user processes actually use large pages, it is 3032 * better to be lazy and cleanup at this time. 
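 *
 * (Note that the cleanup below must drop the caller's hash bucket
 * lock while it frees the child hblks via sfmmu_free_hblks(), and
 * reacquires it before returning; callers such as
 * sfmmu_tteload_find_hmeblk() therefore re-run their hash search
 * afterwards.)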
3033 */ 3034 static void 3035 sfmmu_shadow_hcleanup(sfmmu_t *sfmmup, struct hme_blk *hmeblkp, 3036 struct hmehash_bucket *hmebp) 3037 { 3038 caddr_t addr, endaddr; 3039 int hashno, size; 3040 3041 ASSERT(hmeblkp->hblk_shw_bit); 3042 3043 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 3044 3045 if (!hmeblkp->hblk_shw_mask) { 3046 hmeblkp->hblk_shw_bit = 0; 3047 return; 3048 } 3049 addr = (caddr_t)get_hblk_base(hmeblkp); 3050 endaddr = get_hblk_endaddr(hmeblkp); 3051 size = get_hblk_ttesz(hmeblkp); 3052 hashno = size - 1; 3053 ASSERT(hashno > 0); 3054 SFMMU_HASH_UNLOCK(hmebp); 3055 3056 sfmmu_free_hblks(sfmmup, addr, endaddr, hashno); 3057 3058 SFMMU_HASH_LOCK(hmebp); 3059 } 3060 3061 static void 3062 sfmmu_free_hblks(sfmmu_t *sfmmup, caddr_t addr, caddr_t endaddr, 3063 int hashno) 3064 { 3065 int hmeshift, shadow = 0; 3066 hmeblk_tag hblktag; 3067 struct hmehash_bucket *hmebp; 3068 struct hme_blk *hmeblkp; 3069 struct hme_blk *nx_hblk, *pr_hblk, *list = NULL; 3070 uint64_t hblkpa, prevpa, nx_pa; 3071 3072 ASSERT(hashno > 0); 3073 hblktag.htag_id = sfmmup; 3074 hblktag.htag_rehash = hashno; 3075 3076 hmeshift = HME_HASH_SHIFT(hashno); 3077 3078 while (addr < endaddr) { 3079 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 3080 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift); 3081 SFMMU_HASH_LOCK(hmebp); 3082 /* inline HME_HASH_SEARCH */ 3083 hmeblkp = hmebp->hmeblkp; 3084 hblkpa = hmebp->hmeh_nextpa; 3085 prevpa = 0; 3086 pr_hblk = NULL; 3087 while (hmeblkp) { 3088 ASSERT(hblkpa == va_to_pa((caddr_t)hmeblkp)); 3089 if (HTAGS_EQ(hmeblkp->hblk_tag, hblktag)) { 3090 /* found hme_blk */ 3091 if (hmeblkp->hblk_shw_bit) { 3092 if (hmeblkp->hblk_shw_mask) { 3093 shadow = 1; 3094 sfmmu_shadow_hcleanup(sfmmup, 3095 hmeblkp, hmebp); 3096 break; 3097 } else { 3098 hmeblkp->hblk_shw_bit = 0; 3099 } 3100 } 3101 3102 /* 3103 * Hblk_hmecnt and hblk_vcnt could be non zero 3104 * since hblk_unload() does not gurantee that. 3105 * 3106 * XXX - this could cause tteload() to spin 3107 * where sfmmu_shadow_hcleanup() is called. 3108 */ 3109 } 3110 3111 nx_hblk = hmeblkp->hblk_next; 3112 nx_pa = hmeblkp->hblk_nextpa; 3113 if (!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) { 3114 sfmmu_hblk_hash_rm(hmebp, hmeblkp, prevpa, 3115 pr_hblk); 3116 sfmmu_hblk_free(hmebp, hmeblkp, hblkpa, &list); 3117 } else { 3118 pr_hblk = hmeblkp; 3119 prevpa = hblkpa; 3120 } 3121 hmeblkp = nx_hblk; 3122 hblkpa = nx_pa; 3123 } 3124 3125 SFMMU_HASH_UNLOCK(hmebp); 3126 3127 if (shadow) { 3128 /* 3129 * We found another shadow hblk so cleaned its 3130 * children. We need to go back and cleanup 3131 * the original hblk so we don't change the 3132 * addr. 3133 */ 3134 shadow = 0; 3135 } else { 3136 addr = (caddr_t)roundup((uintptr_t)addr + 1, 3137 (1 << hmeshift)); 3138 } 3139 } 3140 sfmmu_hblks_list_purge(&list); 3141 } 3142 3143 /* 3144 * Release one hardware address translation lock on the given address range. 3145 */ 3146 void 3147 hat_unlock(struct hat *sfmmup, caddr_t addr, size_t len) 3148 { 3149 struct hmehash_bucket *hmebp; 3150 hmeblk_tag hblktag; 3151 int hmeshift, hashno = 1; 3152 struct hme_blk *hmeblkp, *list = NULL; 3153 caddr_t endaddr; 3154 3155 ASSERT(sfmmup != NULL); 3156 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 3157 3158 ASSERT((sfmmup == ksfmmup) || 3159 AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 3160 ASSERT((len & MMU_PAGEOFFSET) == 0); 3161 endaddr = addr + len; 3162 hblktag.htag_id = sfmmup; 3163 3164 /* 3165 * Spitfire supports 4 page sizes. 
3166 * Most pages are expected to be of the smallest page size (8K) and 3167 * these will not need to be rehashed. 64K pages also don't need to be 3168 * rehashed because an hmeblk spans 64K of address space. 512K pages 3169 * might need 1 rehash and and 4M pages might need 2 rehashes. 3170 */ 3171 while (addr < endaddr) { 3172 hmeshift = HME_HASH_SHIFT(hashno); 3173 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 3174 hblktag.htag_rehash = hashno; 3175 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift); 3176 3177 SFMMU_HASH_LOCK(hmebp); 3178 3179 HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list); 3180 if (hmeblkp != NULL) { 3181 /* 3182 * If we encounter a shadow hmeblk then 3183 * we know there are no valid hmeblks mapping 3184 * this address at this size or larger. 3185 * Just increment address by the smallest 3186 * page size. 3187 */ 3188 if (hmeblkp->hblk_shw_bit) { 3189 addr += MMU_PAGESIZE; 3190 } else { 3191 addr = sfmmu_hblk_unlock(hmeblkp, addr, 3192 endaddr); 3193 } 3194 SFMMU_HASH_UNLOCK(hmebp); 3195 hashno = 1; 3196 continue; 3197 } 3198 SFMMU_HASH_UNLOCK(hmebp); 3199 3200 if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) { 3201 /* 3202 * We have traversed the whole list and rehashed 3203 * if necessary without finding the address to unlock 3204 * which should never happen. 3205 */ 3206 panic("sfmmu_unlock: addr not found. " 3207 "addr %p hat %p", (void *)addr, (void *)sfmmup); 3208 } else { 3209 hashno++; 3210 } 3211 } 3212 3213 sfmmu_hblks_list_purge(&list); 3214 } 3215 3216 /* 3217 * Function to unlock a range of addresses in an hmeblk. It returns the 3218 * next address that needs to be unlocked. 3219 * Should be called with the hash lock held. 3220 */ 3221 static caddr_t 3222 sfmmu_hblk_unlock(struct hme_blk *hmeblkp, caddr_t addr, caddr_t endaddr) 3223 { 3224 struct sf_hment *sfhme; 3225 tte_t tteold, ttemod; 3226 int ttesz, ret; 3227 3228 ASSERT(in_hblk_range(hmeblkp, addr)); 3229 ASSERT(hmeblkp->hblk_shw_bit == 0); 3230 3231 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp)); 3232 ttesz = get_hblk_ttesz(hmeblkp); 3233 3234 HBLKTOHME(sfhme, hmeblkp, addr); 3235 while (addr < endaddr) { 3236 readtte: 3237 sfmmu_copytte(&sfhme->hme_tte, &tteold); 3238 if (TTE_IS_VALID(&tteold)) { 3239 3240 ttemod = tteold; 3241 3242 ret = sfmmu_modifytte_try(&tteold, &ttemod, 3243 &sfhme->hme_tte); 3244 3245 if (ret < 0) 3246 goto readtte; 3247 3248 if (hmeblkp->hblk_lckcnt == 0) 3249 panic("zero hblk lckcnt"); 3250 3251 if (((uintptr_t)addr + TTEBYTES(ttesz)) > 3252 (uintptr_t)endaddr) 3253 panic("can't unlock large tte"); 3254 3255 ASSERT(hmeblkp->hblk_lckcnt > 0); 3256 atomic_add_16(&hmeblkp->hblk_lckcnt, -1); 3257 HBLK_STACK_TRACE(hmeblkp, HBLK_UNLOCK); 3258 } else { 3259 panic("sfmmu_hblk_unlock: invalid tte"); 3260 } 3261 addr += TTEBYTES(ttesz); 3262 sfhme++; 3263 } 3264 return (addr); 3265 } 3266 3267 /* 3268 * Physical Address Mapping Framework 3269 * 3270 * General rules: 3271 * 3272 * (1) Applies only to seg_kmem memory pages. To make things easier, 3273 * seg_kpm addresses are also accepted by the routines, but nothing 3274 * is done with them since by definition their PA mappings are static. 3275 * (2) hat_add_callback() may only be called while holding the page lock 3276 * SE_SHARED or SE_EXCL of the underlying page (e.g., as_pagelock()). 3277 * (3) prehandler() and posthandler() may not call hat_add_callback() or 3278 * hat_delete_callback(), nor should they allocate memory. Post quiesce 3279 * callbacks may not sleep or acquire adaptive mutex locks. 
3280 * (4) Either prehandler() or posthandler() (but not both) may be specified 3281 * as being NULL. Specifying an errhandler() is optional. 3282 * 3283 * Details of using the framework: 3284 * 3285 * registering a callback (hat_register_callback()) 3286 * 3287 * Pass prehandler, posthandler, errhandler addresses 3288 * as described below. If capture_cpus argument is nonzero, 3289 * suspend callback to the prehandler will occur with CPUs 3290 * captured and executing xc_loop() and CPUs will remain 3291 * captured until after the posthandler suspend callback 3292 * occurs. 3293 * 3294 * adding a callback (hat_add_callback()) 3295 * 3296 * as_pagelock(); 3297 * hat_add_callback(); 3298 * save returned pfn in private data structures or program registers; 3299 * as_pageunlock(); 3300 * 3301 * prehandler() 3302 * 3303 * Stop all accesses by physical address to this memory page. 3304 * Called twice: the first, PRESUSPEND, is a context safe to acquire 3305 * adaptive locks. The second, SUSPEND, is called at high PIL with 3306 * CPUs captured so adaptive locks may NOT be acquired (and all spin 3307 * locks must be XCALL_PIL or higher locks). 3308 * 3309 * May return the following errors: 3310 * EIO: A fatal error has occurred. This will result in panic. 3311 * EAGAIN: The page cannot be suspended. This will fail the 3312 * relocation. 3313 * 0: Success. 3314 * 3315 * posthandler() 3316 * 3317 * Save new pfn in private data structures or program registers; 3318 * not allowed to fail (non-zero return values will result in panic). 3319 * 3320 * errhandler() 3321 * 3322 * called when an error occurs related to the callback. Currently 3323 * the only such error is HAT_CB_ERR_LEAKED which indicates that 3324 * a page is being freed, but there are still outstanding callback(s) 3325 * registered on the page. 3326 * 3327 * removing a callback (hat_delete_callback(); e.g., prior to freeing memory) 3328 * 3329 * stop using physical address 3330 * hat_delete_callback(); 3331 * 3332 */ 3333 3334 /* 3335 * Register a callback class. Each subsystem should do this once and 3336 * cache the id_t returned for use in setting up and tearing down callbacks. 3337 * There is no facility for removing callback IDs once they are created, 3338 * so this should only be called from modules which cannot be unloaded. 3339 */ 3340 id_t 3341 hat_register_callback(int (*prehandler)(caddr_t, uint_t, uint_t, void *), 3342 int (*posthandler)(caddr_t, uint_t, uint_t, void *, pfn_t), 3343 int (*errhandler)(caddr_t, uint_t, uint_t, void *), 3344 int capture_cpus) 3345 { 3346 id_t id; 3347 3348 /* 3349 * If this callback has already been registered just return the 3350 * ID for it. 3351 */ 3352 for (id = 0; id < sfmmu_max_cb_id; id++) { 3353 if (sfmmu_cb_table[id].prehandler == prehandler && 3354 sfmmu_cb_table[id].posthandler == posthandler && 3355 sfmmu_cb_table[id].errhandler == errhandler && 3356 sfmmu_cb_table[id].capture_cpus == capture_cpus) { 3357 return (id); 3358 } 3359 } 3360 3361 id = sfmmu_cb_nextid++; 3362 3363 ASSERT(prehandler != NULL || posthandler != NULL); 3364 3365 if (id >= sfmmu_max_cb_id) 3366 panic("hat_register_callback: out of callback IDs"); 3367 3368 sfmmu_cb_table[id].prehandler = prehandler; 3369 sfmmu_cb_table[id].posthandler = posthandler; 3370 sfmmu_cb_table[id].errhandler = errhandler; 3371 sfmmu_cb_table[id].capture_cpus = capture_cpus; 3372 3373 return (id); 3374 } 3375 3376 /* 3377 * Add relocation callbacks to the specified addr/len which will be called 3378 * when relocating the associated page. 
See the description of pre and 3379 * posthandler above for more details. IMPT: this operation is only valid 3380 * on seg_kmem pages!! 3381 * 3382 * If HAC_PAGELOCK is included in flags, the underlying memory page is 3383 * locked internally so the caller must be able to deal with the callback 3384 * running even before this function has returned. If HAC_PAGELOCK is not 3385 * set, it is assumed that the underlying memory pages are locked. 3386 * 3387 * Since the caller must track the individual page boundaries anyway, 3388 * we only allow a callback to be added to a single page (large 3389 * or small). Thus [addr, addr + len) MUST be contained within a single 3390 * page. 3391 * 3392 * Registering multiple callbacks on the same [addr, addr+len) is supported, 3393 * in which case the corresponding callback will be called once with each 3394 * unique parameter specified. The number of subsequent deletes must match 3395 * since reference counts are held. If a callback is desired for each 3396 * virtual object with the same parameter specified for multiple callbacks, 3397 * a different virtual address should be specified at the time of 3398 * callback registration. 3399 * 3400 * Returns the pfn of the underlying kernel page in *rpfn 3401 * on success, or PFN_INVALID on failure. 3402 * 3403 * Returns values: 3404 * 0: success 3405 * ENOMEM: memory allocation failure (e.g. flags was passed as HAC_NOSLEEP) 3406 * EINVAL: callback ID is not valid 3407 * ENXIO: ["vaddr", "vaddr" + len) is not mapped in the kernel's address 3408 * space, or crosses a page boundary 3409 */ 3410 int 3411 hat_add_callback(id_t callback_id, caddr_t vaddr, uint_t len, uint_t flags, 3412 void *pvt, pfn_t *rpfn) 3413 { 3414 struct hmehash_bucket *hmebp; 3415 hmeblk_tag hblktag; 3416 struct hme_blk *hmeblkp; 3417 int hmeshift, hashno; 3418 caddr_t saddr, eaddr, baseaddr; 3419 struct pa_hment *pahmep, *tpahmep; 3420 struct sf_hment *sfhmep, *osfhmep, *tsfhmep; 3421 kmutex_t *pml; 3422 tte_t tte; 3423 page_t *pp, *rpp; 3424 pfn_t pfn; 3425 int kmflags = (flags & HAC_SLEEP)? KM_SLEEP : KM_NOSLEEP; 3426 int locked = 0; 3427 3428 /* 3429 * For KPM mappings, just return the physical address since we 3430 * don't need to register any callbacks. 3431 */ 3432 if (IS_KPM_ADDR(vaddr)) { 3433 uint64_t paddr; 3434 SFMMU_KPM_VTOP(vaddr, paddr); 3435 *rpfn = btop(paddr); 3436 return (0); 3437 } 3438 3439 if (callback_id < (id_t)0 || callback_id >= sfmmu_cb_nextid) { 3440 *rpfn = PFN_INVALID; 3441 return (EINVAL); 3442 } 3443 3444 if ((pahmep = kmem_cache_alloc(pa_hment_cache, kmflags)) == NULL) { 3445 *rpfn = PFN_INVALID; 3446 return (ENOMEM); 3447 } 3448 3449 sfhmep = &pahmep->sfment; 3450 3451 saddr = (caddr_t)((uintptr_t)vaddr & MMU_PAGEMASK); 3452 eaddr = saddr + len; 3453 3454 rehash: 3455 /* Find the mapping(s) for this page */ 3456 for (hashno = TTE64K, hmeblkp = NULL; 3457 hmeblkp == NULL && hashno <= mmu_hashcnt; 3458 hashno++) { 3459 hmeshift = HME_HASH_SHIFT(hashno); 3460 hblktag.htag_id = ksfmmup; 3461 hblktag.htag_bspage = HME_HASH_BSPAGE(saddr, hmeshift); 3462 hblktag.htag_rehash = hashno; 3463 hmebp = HME_HASH_FUNCTION(ksfmmup, saddr, hmeshift); 3464 3465 SFMMU_HASH_LOCK(hmebp); 3466 3467 HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp); 3468 3469 if (hmeblkp == NULL) 3470 SFMMU_HASH_UNLOCK(hmebp); 3471 } 3472 3473 if (hmeblkp == NULL) { 3474 *rpfn = PFN_INVALID; 3475 return (ENXIO); 3476 } 3477 3478 /* 3479 * Make sure the boundaries for the callback fall within this 3480 * single mapping. 
3481 */ 3482 baseaddr = (caddr_t)get_hblk_base(hmeblkp); 3483 ASSERT(saddr >= baseaddr); 3484 if (eaddr > (caddr_t)get_hblk_endaddr(hmeblkp)) { 3485 SFMMU_HASH_UNLOCK(hmebp); 3486 *rpfn = PFN_INVALID; 3487 return (ENXIO); 3488 } 3489 3490 HBLKTOHME(osfhmep, hmeblkp, saddr); 3491 sfmmu_copytte(&osfhmep->hme_tte, &tte); 3492 3493 ASSERT(TTE_IS_VALID(&tte)); 3494 pfn = sfmmu_ttetopfn(&tte, vaddr); 3495 3496 pp = osfhmep->hme_page; 3497 pml = sfmmu_mlist_enter(pp); 3498 3499 if ((flags & HAC_PAGELOCK) && !locked) { 3500 if (!page_trylock(pp, SE_SHARED)) { 3501 /* 3502 * Somebody is holding SE_EXCL lock. Drop all 3503 * our locks, lookup the page in &kvp, and 3504 * retry. 3505 */ 3506 sfmmu_mlist_exit(pml); 3507 SFMMU_HASH_UNLOCK(hmebp); 3508 pp = page_lookup(&kvp, (u_offset_t)saddr, SE_SHARED); 3509 ASSERT(pp != NULL); 3510 rpp = PP_PAGEROOT(pp); 3511 if (rpp != pp) { 3512 page_unlock(pp); 3513 (void) page_lock(rpp, SE_SHARED, NULL, 3514 P_NO_RECLAIM); 3515 } 3516 locked = 1; 3517 goto rehash; 3518 } 3519 locked = 1; 3520 } 3521 3522 if (!PAGE_LOCKED(pp) && !panicstr) 3523 panic("hat_add_callback: page 0x%p not locked", pp); 3524 3525 if (osfhmep->hme_page != pp || pp->p_vnode != &kvp || 3526 pp->p_offset < (u_offset_t)baseaddr || 3527 pp->p_offset > (u_offset_t)eaddr) { 3528 /* 3529 * The page moved before we got our hands on it. Drop 3530 * all the locks and try again. 3531 */ 3532 ASSERT((flags & HAC_PAGELOCK) != 0); 3533 sfmmu_mlist_exit(pml); 3534 SFMMU_HASH_UNLOCK(hmebp); 3535 page_unlock(pp); 3536 locked = 0; 3537 goto rehash; 3538 } 3539 3540 ASSERT(osfhmep->hme_page == pp); 3541 3542 for (tsfhmep = pp->p_mapping; tsfhmep != NULL; 3543 tsfhmep = tsfhmep->hme_next) { 3544 3545 /* 3546 * skip va to pa mappings 3547 */ 3548 if (!IS_PAHME(tsfhmep)) 3549 continue; 3550 3551 tpahmep = tsfhmep->hme_data; 3552 ASSERT(tpahmep != NULL); 3553 3554 /* 3555 * See if the pahment already exists. 3556 */ 3557 if ((tpahmep->pvt == pvt) && 3558 (tpahmep->addr == vaddr) && 3559 (tpahmep->len == len)) { 3560 ASSERT(tpahmep->cb_id == callback_id); 3561 tpahmep->refcnt++; 3562 pp->p_share++; 3563 3564 sfmmu_mlist_exit(pml); 3565 SFMMU_HASH_UNLOCK(hmebp); 3566 3567 if (locked) 3568 page_unlock(pp); 3569 3570 kmem_cache_free(pa_hment_cache, pahmep); 3571 3572 *rpfn = pfn; 3573 return (0); 3574 } 3575 } 3576 3577 /* 3578 * setup this shiny new pa_hment .. 3579 */ 3580 pp->p_share++; 3581 pahmep->cb_id = callback_id; 3582 pahmep->addr = vaddr; 3583 pahmep->len = len; 3584 pahmep->refcnt = 1; 3585 pahmep->flags = 0; 3586 pahmep->pvt = pvt; 3587 3588 /* 3589 * .. and also set up the sf_hment and link to p_mapping list. 3590 */ 3591 sfhmep->hme_tte.ll = 0; 3592 sfhmep->hme_data = pahmep; 3593 sfhmep->hme_prev = osfhmep; 3594 sfhmep->hme_next = osfhmep->hme_next; 3595 3596 if (osfhmep->hme_next) 3597 osfhmep->hme_next->hme_prev = sfhmep; 3598 3599 osfhmep->hme_next = sfhmep; 3600 3601 sfmmu_mlist_exit(pml); 3602 SFMMU_HASH_UNLOCK(hmebp); 3603 3604 *rpfn = pfn; 3605 if (locked) 3606 page_unlock(pp); 3607 3608 return (0); 3609 } 3610 3611 /* 3612 * Remove the relocation callbacks from the specified addr/len. 
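 *
 * A minimal end-to-end sketch of the framework, for illustration
 * only (my_pre(), my_post(), cookie, va and len are made-up names
 * and all error handling is omitted):
 *
 *	id = hat_register_callback(my_pre, my_post, NULL, 0);
 *	...
 *	ret = hat_add_callback(id, va, len, HAC_SLEEP | HAC_PAGELOCK,
 *	    cookie, &pfn);
 *	... use pfn for physical access; my_pre() and my_post() run
 *	    around any relocation of the underlying page ...
 *	hat_delete_callback(va, len, cookie, HAC_PAGELOCK);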
3613 */ 3614 void 3615 hat_delete_callback(caddr_t vaddr, uint_t len, void *pvt, uint_t flags) 3616 { 3617 struct hmehash_bucket *hmebp; 3618 hmeblk_tag hblktag; 3619 struct hme_blk *hmeblkp; 3620 int hmeshift, hashno; 3621 caddr_t saddr, eaddr, baseaddr; 3622 struct pa_hment *pahmep; 3623 struct sf_hment *sfhmep, *osfhmep; 3624 kmutex_t *pml; 3625 tte_t tte; 3626 page_t *pp, *rpp; 3627 int locked = 0; 3628 3629 if (IS_KPM_ADDR(vaddr)) 3630 return; 3631 3632 saddr = (caddr_t)((uintptr_t)vaddr & MMU_PAGEMASK); 3633 eaddr = saddr + len; 3634 3635 rehash: 3636 /* Find the mapping(s) for this page */ 3637 for (hashno = TTE64K, hmeblkp = NULL; 3638 hmeblkp == NULL && hashno <= mmu_hashcnt; 3639 hashno++) { 3640 hmeshift = HME_HASH_SHIFT(hashno); 3641 hblktag.htag_id = ksfmmup; 3642 hblktag.htag_bspage = HME_HASH_BSPAGE(saddr, hmeshift); 3643 hblktag.htag_rehash = hashno; 3644 hmebp = HME_HASH_FUNCTION(ksfmmup, saddr, hmeshift); 3645 3646 SFMMU_HASH_LOCK(hmebp); 3647 3648 HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp); 3649 3650 if (hmeblkp == NULL) 3651 SFMMU_HASH_UNLOCK(hmebp); 3652 } 3653 3654 if (hmeblkp == NULL) { 3655 if (!panicstr) { 3656 panic("hat_delete_callback: addr 0x%p not found", 3657 saddr); 3658 } 3659 return; 3660 } 3661 3662 baseaddr = (caddr_t)get_hblk_base(hmeblkp); 3663 HBLKTOHME(osfhmep, hmeblkp, saddr); 3664 3665 sfmmu_copytte(&osfhmep->hme_tte, &tte); 3666 ASSERT(TTE_IS_VALID(&tte)); 3667 3668 pp = osfhmep->hme_page; 3669 pml = sfmmu_mlist_enter(pp); 3670 3671 if ((flags & HAC_PAGELOCK) && !locked) { 3672 if (!page_trylock(pp, SE_SHARED)) { 3673 /* 3674 * Somebody is holding SE_EXCL lock. Drop all 3675 * our locks, lookup the page in &kvp, and 3676 * retry. 3677 */ 3678 sfmmu_mlist_exit(pml); 3679 SFMMU_HASH_UNLOCK(hmebp); 3680 pp = page_lookup(&kvp, (u_offset_t)saddr, SE_SHARED); 3681 ASSERT(pp != NULL); 3682 rpp = PP_PAGEROOT(pp); 3683 if (rpp != pp) { 3684 page_unlock(pp); 3685 (void) page_lock(rpp, SE_SHARED, NULL, 3686 P_NO_RECLAIM); 3687 } 3688 locked = 1; 3689 goto rehash; 3690 } 3691 locked = 1; 3692 } 3693 3694 ASSERT(PAGE_LOCKED(pp)); 3695 3696 if (osfhmep->hme_page != pp || pp->p_vnode != &kvp || 3697 pp->p_offset < (u_offset_t)baseaddr || 3698 pp->p_offset > (u_offset_t)eaddr) { 3699 /* 3700 * The page moved before we got our hands on it. Drop 3701 * all the locks and try again. 3702 */ 3703 ASSERT((flags & HAC_PAGELOCK) != 0); 3704 sfmmu_mlist_exit(pml); 3705 SFMMU_HASH_UNLOCK(hmebp); 3706 page_unlock(pp); 3707 locked = 0; 3708 goto rehash; 3709 } 3710 3711 ASSERT(osfhmep->hme_page == pp); 3712 3713 for (sfhmep = pp->p_mapping; sfhmep != NULL; 3714 sfhmep = sfhmep->hme_next) { 3715 3716 /* 3717 * skip va<->pa mappings 3718 */ 3719 if (!IS_PAHME(sfhmep)) 3720 continue; 3721 3722 pahmep = sfhmep->hme_data; 3723 ASSERT(pahmep != NULL); 3724 3725 /* 3726 * if pa_hment matches, remove it 3727 */ 3728 if ((pahmep->pvt == pvt) && 3729 (pahmep->addr == vaddr) && 3730 (pahmep->len == len)) { 3731 break; 3732 } 3733 } 3734 3735 if (sfhmep == NULL) { 3736 if (!panicstr) { 3737 panic("hat_delete_callback: pa_hment not found, pp %p", 3738 (void *)pp); 3739 } 3740 return; 3741 } 3742 3743 /* 3744 * Note: at this point a valid kernel mapping must still be 3745 * present on this page. 3746 */ 3747 pp->p_share--; 3748 if (pp->p_share <= 0) 3749 panic("hat_delete_callback: zero p_share"); 3750 3751 if (--pahmep->refcnt == 0) { 3752 if (pahmep->flags != 0) 3753 panic("hat_delete_callback: pa_hment is busy"); 3754 3755 /* 3756 * Remove sfhmep from the mapping list for the page. 
3757 */ 3758 if (sfhmep->hme_prev) { 3759 sfhmep->hme_prev->hme_next = sfhmep->hme_next; 3760 } else { 3761 pp->p_mapping = sfhmep->hme_next; 3762 } 3763 3764 if (sfhmep->hme_next) 3765 sfhmep->hme_next->hme_prev = sfhmep->hme_prev; 3766 3767 sfmmu_mlist_exit(pml); 3768 SFMMU_HASH_UNLOCK(hmebp); 3769 3770 if (locked) 3771 page_unlock(pp); 3772 3773 kmem_cache_free(pa_hment_cache, pahmep); 3774 return; 3775 } 3776 3777 sfmmu_mlist_exit(pml); 3778 SFMMU_HASH_UNLOCK(hmebp); 3779 if (locked) 3780 page_unlock(pp); 3781 } 3782 3783 /* 3784 * hat_probe returns 1 if the translation for the address 'addr' is 3785 * loaded, zero otherwise. 3786 * 3787 * hat_probe should be used only for advisorary purposes because it may 3788 * occasionally return the wrong value. The implementation must guarantee that 3789 * returning the wrong value is a very rare event. hat_probe is used 3790 * to implement optimizations in the segment drivers. 3791 * 3792 */ 3793 int 3794 hat_probe(struct hat *sfmmup, caddr_t addr) 3795 { 3796 pfn_t pfn; 3797 tte_t tte; 3798 3799 ASSERT(sfmmup != NULL); 3800 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 3801 3802 ASSERT((sfmmup == ksfmmup) || 3803 AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 3804 3805 if (sfmmup == ksfmmup) { 3806 while ((pfn = sfmmu_vatopfn(addr, sfmmup, &tte)) 3807 == PFN_SUSPENDED) { 3808 sfmmu_vatopfn_suspended(addr, sfmmup, &tte); 3809 } 3810 } else { 3811 pfn = sfmmu_uvatopfn(addr, sfmmup); 3812 } 3813 3814 if (pfn != PFN_INVALID) 3815 return (1); 3816 else 3817 return (0); 3818 } 3819 3820 ssize_t 3821 hat_getpagesize(struct hat *sfmmup, caddr_t addr) 3822 { 3823 tte_t tte; 3824 3825 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 3826 3827 sfmmu_gettte(sfmmup, addr, &tte); 3828 if (TTE_IS_VALID(&tte)) { 3829 return (TTEBYTES(TTE_CSZ(&tte))); 3830 } 3831 return (-1); 3832 } 3833 3834 static void 3835 sfmmu_gettte(struct hat *sfmmup, caddr_t addr, tte_t *ttep) 3836 { 3837 struct hmehash_bucket *hmebp; 3838 hmeblk_tag hblktag; 3839 int hmeshift, hashno = 1; 3840 struct hme_blk *hmeblkp, *list = NULL; 3841 struct sf_hment *sfhmep; 3842 3843 /* support for ISM */ 3844 ism_map_t *ism_map; 3845 ism_blk_t *ism_blkp; 3846 int i; 3847 sfmmu_t *ism_hatid = NULL; 3848 sfmmu_t *locked_hatid = NULL; 3849 3850 ASSERT(!((uintptr_t)addr & MMU_PAGEOFFSET)); 3851 3852 ism_blkp = sfmmup->sfmmu_iblk; 3853 if (ism_blkp) { 3854 sfmmu_ismhat_enter(sfmmup, 0); 3855 locked_hatid = sfmmup; 3856 } 3857 while (ism_blkp && ism_hatid == NULL) { 3858 ism_map = ism_blkp->iblk_maps; 3859 for (i = 0; ism_map[i].imap_ismhat && i < ISM_MAP_SLOTS; i++) { 3860 if (addr >= ism_start(ism_map[i]) && 3861 addr < ism_end(ism_map[i])) { 3862 sfmmup = ism_hatid = ism_map[i].imap_ismhat; 3863 addr = (caddr_t)(addr - 3864 ism_start(ism_map[i])); 3865 break; 3866 } 3867 } 3868 ism_blkp = ism_blkp->iblk_next; 3869 } 3870 if (locked_hatid) { 3871 sfmmu_ismhat_exit(locked_hatid, 0); 3872 } 3873 3874 hblktag.htag_id = sfmmup; 3875 ttep->ll = 0; 3876 3877 do { 3878 hmeshift = HME_HASH_SHIFT(hashno); 3879 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 3880 hblktag.htag_rehash = hashno; 3881 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift); 3882 3883 SFMMU_HASH_LOCK(hmebp); 3884 3885 HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list); 3886 if (hmeblkp != NULL) { 3887 HBLKTOHME(sfhmep, hmeblkp, addr); 3888 sfmmu_copytte(&sfhmep->hme_tte, ttep); 3889 SFMMU_HASH_UNLOCK(hmebp); 3890 break; 3891 } 3892 SFMMU_HASH_UNLOCK(hmebp); 3893 hashno++; 3894 } while (HME_REHASH(sfmmup) && (hashno <= 
mmu_hashcnt)); 3895 3896 sfmmu_hblks_list_purge(&list); 3897 } 3898 3899 uint_t 3900 hat_getattr(struct hat *sfmmup, caddr_t addr, uint_t *attr) 3901 { 3902 tte_t tte; 3903 3904 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 3905 3906 sfmmu_gettte(sfmmup, addr, &tte); 3907 if (TTE_IS_VALID(&tte)) { 3908 *attr = sfmmu_ptov_attr(&tte); 3909 return (0); 3910 } 3911 *attr = 0; 3912 return ((uint_t)0xffffffff); 3913 } 3914 3915 /* 3916 * Enables more attributes on specified address range (ie. logical OR) 3917 */ 3918 void 3919 hat_setattr(struct hat *hat, caddr_t addr, size_t len, uint_t attr) 3920 { 3921 if (hat->sfmmu_xhat_provider) { 3922 XHAT_SETATTR(hat, addr, len, attr); 3923 return; 3924 } else { 3925 /* 3926 * This must be a CPU HAT. If the address space has 3927 * XHATs attached, change attributes for all of them, 3928 * just in case 3929 */ 3930 ASSERT(hat->sfmmu_as != NULL); 3931 if (hat->sfmmu_as->a_xhat != NULL) 3932 xhat_setattr_all(hat->sfmmu_as, addr, len, attr); 3933 } 3934 3935 sfmmu_chgattr(hat, addr, len, attr, SFMMU_SETATTR); 3936 } 3937 3938 /* 3939 * Assigns attributes to the specified address range. All the attributes 3940 * are specified. 3941 */ 3942 void 3943 hat_chgattr(struct hat *hat, caddr_t addr, size_t len, uint_t attr) 3944 { 3945 if (hat->sfmmu_xhat_provider) { 3946 XHAT_CHGATTR(hat, addr, len, attr); 3947 return; 3948 } else { 3949 /* 3950 * This must be a CPU HAT. If the address space has 3951 * XHATs attached, change attributes for all of them, 3952 * just in case 3953 */ 3954 ASSERT(hat->sfmmu_as != NULL); 3955 if (hat->sfmmu_as->a_xhat != NULL) 3956 xhat_chgattr_all(hat->sfmmu_as, addr, len, attr); 3957 } 3958 3959 sfmmu_chgattr(hat, addr, len, attr, SFMMU_CHGATTR); 3960 } 3961 3962 /* 3963 * Remove attributes on the specified address range (ie. loginal NAND) 3964 */ 3965 void 3966 hat_clrattr(struct hat *hat, caddr_t addr, size_t len, uint_t attr) 3967 { 3968 if (hat->sfmmu_xhat_provider) { 3969 XHAT_CLRATTR(hat, addr, len, attr); 3970 return; 3971 } else { 3972 /* 3973 * This must be a CPU HAT. If the address space has 3974 * XHATs attached, change attributes for all of them, 3975 * just in case 3976 */ 3977 ASSERT(hat->sfmmu_as != NULL); 3978 if (hat->sfmmu_as->a_xhat != NULL) 3979 xhat_clrattr_all(hat->sfmmu_as, addr, len, attr); 3980 } 3981 3982 sfmmu_chgattr(hat, addr, len, attr, SFMMU_CLRATTR); 3983 } 3984 3985 /* 3986 * Change attributes on an address range to that specified by attr and mode. 
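 *
 * The three wrappers above (hat_setattr, hat_clrattr, hat_chgattr) reduce
 * every request to a (mask, value) pair via sfmmu_vtop_attr(), and the
 * pair is then applied with one expression in sfmmu_hblk_chgattr().  A
 * minimal stand-alone model of that scheme (plain C; apply_attr is an
 * invented name, not a kernel routine):
 *
 *    #include <stdint.h>
 *
 *    // mask selects the attribute bits affected; value, always a subset
 *    // of mask, gives their new state.
 *    static uint64_t
 *    apply_attr(uint64_t cur, uint64_t mask, uint64_t value)
 *    {
 *        return ((cur & ~mask) | value);
 *    }
 *
 *    // set bits B   (SFMMU_SETATTR): mask = B,        value = B
 *    // clear bits B (SFMMU_CLRATTR): mask = B,        value = 0
 *    // replace all  (SFMMU_CHGATTR): mask = all bits, value = new attrs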
3987 */ 3988 static void 3989 sfmmu_chgattr(struct hat *sfmmup, caddr_t addr, size_t len, uint_t attr, 3990 int mode) 3991 { 3992 struct hmehash_bucket *hmebp; 3993 hmeblk_tag hblktag; 3994 int hmeshift, hashno = 1; 3995 struct hme_blk *hmeblkp, *list = NULL; 3996 caddr_t endaddr; 3997 cpuset_t cpuset; 3998 demap_range_t dmr; 3999 4000 CPUSET_ZERO(cpuset); 4001 4002 ASSERT((sfmmup == ksfmmup) || 4003 AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 4004 ASSERT((len & MMU_PAGEOFFSET) == 0); 4005 ASSERT(((uintptr_t)addr & MMU_PAGEOFFSET) == 0); 4006 4007 if ((attr & PROT_USER) && (mode != SFMMU_CLRATTR) && 4008 ((addr + len) > (caddr_t)USERLIMIT)) { 4009 panic("user addr %p in kernel space", 4010 (void *)addr); 4011 } 4012 4013 endaddr = addr + len; 4014 hblktag.htag_id = sfmmup; 4015 DEMAP_RANGE_INIT(sfmmup, &dmr); 4016 4017 while (addr < endaddr) { 4018 hmeshift = HME_HASH_SHIFT(hashno); 4019 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 4020 hblktag.htag_rehash = hashno; 4021 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift); 4022 4023 SFMMU_HASH_LOCK(hmebp); 4024 4025 HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list); 4026 if (hmeblkp != NULL) { 4027 /* 4028 * We've encountered a shadow hmeblk so skip the range 4029 * of the next smaller mapping size. 4030 */ 4031 if (hmeblkp->hblk_shw_bit) { 4032 ASSERT(sfmmup != ksfmmup); 4033 ASSERT(hashno > 1); 4034 addr = (caddr_t)P2END((uintptr_t)addr, 4035 TTEBYTES(hashno - 1)); 4036 } else { 4037 addr = sfmmu_hblk_chgattr(sfmmup, 4038 hmeblkp, addr, endaddr, &dmr, attr, mode); 4039 } 4040 SFMMU_HASH_UNLOCK(hmebp); 4041 hashno = 1; 4042 continue; 4043 } 4044 SFMMU_HASH_UNLOCK(hmebp); 4045 4046 if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) { 4047 /* 4048 * We have traversed the whole list and rehashed 4049 * if necessary without finding the address to chgattr. 4050 * This is ok, so we increment the address by the 4051 * smallest hmeblk range for kernel mappings or for 4052 * user mappings with no large pages, and the largest 4053 * hmeblk range, to account for shadow hmeblks, for 4054 * user mappings with large pages and continue. 4055 */ 4056 if (sfmmup == ksfmmup) 4057 addr = (caddr_t)P2END((uintptr_t)addr, 4058 TTEBYTES(1)); 4059 else 4060 addr = (caddr_t)P2END((uintptr_t)addr, 4061 TTEBYTES(hashno)); 4062 hashno = 1; 4063 } else { 4064 hashno++; 4065 } 4066 } 4067 4068 sfmmu_hblks_list_purge(&list); 4069 DEMAP_RANGE_FLUSH(&dmr); 4070 cpuset = sfmmup->sfmmu_cpusran; 4071 xt_sync(cpuset); 4072 } 4073 4074 /* 4075 * This function chgattr on a range of addresses in an hmeblk. It returns the 4076 * next addres that needs to be chgattr. 4077 * It should be called with the hash lock held. 4078 * XXX It should be possible to optimize chgattr by not flushing every time but 4079 * on the other hand: 4080 * 1. do one flush crosscall. 4081 * 2. 
only flush if we are increasing permissions (make sure this will work) 4082 */ 4083 static caddr_t 4084 sfmmu_hblk_chgattr(struct hat *sfmmup, struct hme_blk *hmeblkp, caddr_t addr, 4085 caddr_t endaddr, demap_range_t *dmrp, uint_t attr, int mode) 4086 { 4087 tte_t tte, tteattr, tteflags, ttemod; 4088 struct sf_hment *sfhmep; 4089 int ttesz; 4090 struct page *pp = NULL; 4091 kmutex_t *pml, *pmtx; 4092 int ret; 4093 int use_demap_range; 4094 #if defined(SF_ERRATA_57) 4095 int check_exec; 4096 #endif 4097 4098 ASSERT(in_hblk_range(hmeblkp, addr)); 4099 ASSERT(hmeblkp->hblk_shw_bit == 0); 4100 4101 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp)); 4102 ttesz = get_hblk_ttesz(hmeblkp); 4103 4104 /* 4105 * Flush the current demap region if addresses have been 4106 * skipped or the page size doesn't match. 4107 */ 4108 use_demap_range = (TTEBYTES(ttesz) == DEMAP_RANGE_PGSZ(dmrp)); 4109 if (use_demap_range) { 4110 DEMAP_RANGE_CONTINUE(dmrp, addr, endaddr); 4111 } else { 4112 DEMAP_RANGE_FLUSH(dmrp); 4113 } 4114 4115 tteattr.ll = sfmmu_vtop_attr(attr, mode, &tteflags); 4116 #if defined(SF_ERRATA_57) 4117 check_exec = (sfmmup != ksfmmup) && 4118 AS_TYPE_64BIT(sfmmup->sfmmu_as) && 4119 TTE_IS_EXECUTABLE(&tteattr); 4120 #endif 4121 HBLKTOHME(sfhmep, hmeblkp, addr); 4122 while (addr < endaddr) { 4123 sfmmu_copytte(&sfhmep->hme_tte, &tte); 4124 if (TTE_IS_VALID(&tte)) { 4125 if ((tte.ll & tteflags.ll) == tteattr.ll) { 4126 /* 4127 * if the new attr is the same as old 4128 * continue 4129 */ 4130 goto next_addr; 4131 } 4132 if (!TTE_IS_WRITABLE(&tteattr)) { 4133 /* 4134 * make sure we clear hw modify bit if we 4135 * removing write protections 4136 */ 4137 tteflags.tte_intlo |= TTE_HWWR_INT; 4138 } 4139 4140 pml = NULL; 4141 pp = sfhmep->hme_page; 4142 if (pp) { 4143 pml = sfmmu_mlist_enter(pp); 4144 } 4145 4146 if (pp != sfhmep->hme_page) { 4147 /* 4148 * tte must have been unloaded. 4149 */ 4150 ASSERT(pml); 4151 sfmmu_mlist_exit(pml); 4152 continue; 4153 } 4154 4155 ASSERT(pp == NULL || sfmmu_mlist_held(pp)); 4156 4157 ttemod = tte; 4158 ttemod.ll = (ttemod.ll & ~tteflags.ll) | tteattr.ll; 4159 ASSERT(TTE_TO_TTEPFN(&ttemod) == TTE_TO_TTEPFN(&tte)); 4160 4161 #if defined(SF_ERRATA_57) 4162 if (check_exec && addr < errata57_limit) 4163 ttemod.tte_exec_perm = 0; 4164 #endif 4165 ret = sfmmu_modifytte_try(&tte, &ttemod, 4166 &sfhmep->hme_tte); 4167 4168 if (ret < 0) { 4169 /* tte changed underneath us */ 4170 if (pml) { 4171 sfmmu_mlist_exit(pml); 4172 } 4173 continue; 4174 } 4175 4176 if (tteflags.tte_intlo & TTE_HWWR_INT) { 4177 /* 4178 * need to sync if we are clearing modify bit. 4179 */ 4180 sfmmu_ttesync(sfmmup, addr, &tte, pp); 4181 } 4182 4183 if (pp && PP_ISRO(pp)) { 4184 if (tteattr.tte_intlo & TTE_WRPRM_INT) { 4185 pmtx = sfmmu_page_enter(pp); 4186 PP_CLRRO(pp); 4187 sfmmu_page_exit(pmtx); 4188 } 4189 } 4190 4191 if (ret > 0 && use_demap_range) { 4192 DEMAP_RANGE_MARKPG(dmrp, addr); 4193 } else if (ret > 0) { 4194 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0); 4195 } 4196 4197 if (pml) { 4198 sfmmu_mlist_exit(pml); 4199 } 4200 } 4201 next_addr: 4202 addr += TTEBYTES(ttesz); 4203 sfhmep++; 4204 DEMAP_RANGE_NEXTPG(dmrp); 4205 } 4206 return (addr); 4207 } 4208 4209 /* 4210 * This routine converts virtual attributes to physical ones. It will 4211 * update the tteflags field with the tte mask corresponding to the attributes 4212 * affected and it returns the new attributes. It will also clear the modify 4213 * bit if we are taking away write permission. 
This is necessary since the 4214 * modify bit is the hardware permission bit and we need to clear it in order 4215 * to detect write faults. 4216 */ 4217 static uint64_t 4218 sfmmu_vtop_attr(uint_t attr, int mode, tte_t *ttemaskp) 4219 { 4220 tte_t ttevalue; 4221 4222 ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR)); 4223 4224 switch (mode) { 4225 case SFMMU_CHGATTR: 4226 /* all attributes specified */ 4227 ttevalue.tte_inthi = MAKE_TTEATTR_INTHI(attr); 4228 ttevalue.tte_intlo = MAKE_TTEATTR_INTLO(attr); 4229 ttemaskp->tte_inthi = TTEINTHI_ATTR; 4230 ttemaskp->tte_intlo = TTEINTLO_ATTR; 4231 break; 4232 case SFMMU_SETATTR: 4233 ASSERT(!(attr & ~HAT_PROT_MASK)); 4234 ttemaskp->ll = 0; 4235 ttevalue.ll = 0; 4236 /* 4237 * a valid tte implies exec and read for sfmmu 4238 * so no need to do anything about them. 4239 * since priviledged access implies user access 4240 * PROT_USER doesn't make sense either. 4241 */ 4242 if (attr & PROT_WRITE) { 4243 ttemaskp->tte_intlo |= TTE_WRPRM_INT; 4244 ttevalue.tte_intlo |= TTE_WRPRM_INT; 4245 } 4246 break; 4247 case SFMMU_CLRATTR: 4248 /* attributes will be nand with current ones */ 4249 if (attr & ~(PROT_WRITE | PROT_USER)) { 4250 panic("sfmmu: attr %x not supported", attr); 4251 } 4252 ttemaskp->ll = 0; 4253 ttevalue.ll = 0; 4254 if (attr & PROT_WRITE) { 4255 /* clear both writable and modify bit */ 4256 ttemaskp->tte_intlo |= TTE_WRPRM_INT | TTE_HWWR_INT; 4257 } 4258 if (attr & PROT_USER) { 4259 ttemaskp->tte_intlo |= TTE_PRIV_INT; 4260 ttevalue.tte_intlo |= TTE_PRIV_INT; 4261 } 4262 break; 4263 default: 4264 panic("sfmmu_vtop_attr: bad mode %x", mode); 4265 } 4266 ASSERT(TTE_TO_TTEPFN(&ttevalue) == 0); 4267 return (ttevalue.ll); 4268 } 4269 4270 static uint_t 4271 sfmmu_ptov_attr(tte_t *ttep) 4272 { 4273 uint_t attr; 4274 4275 ASSERT(TTE_IS_VALID(ttep)); 4276 4277 attr = PROT_READ; 4278 4279 if (TTE_IS_WRITABLE(ttep)) { 4280 attr |= PROT_WRITE; 4281 } 4282 if (TTE_IS_EXECUTABLE(ttep)) { 4283 attr |= PROT_EXEC; 4284 } 4285 if (!TTE_IS_PRIVILEGED(ttep)) { 4286 attr |= PROT_USER; 4287 } 4288 if (TTE_IS_NFO(ttep)) { 4289 attr |= HAT_NOFAULT; 4290 } 4291 if (TTE_IS_NOSYNC(ttep)) { 4292 attr |= HAT_NOSYNC; 4293 } 4294 if (TTE_IS_SIDEFFECT(ttep)) { 4295 attr |= SFMMU_SIDEFFECT; 4296 } 4297 if (!TTE_IS_VCACHEABLE(ttep)) { 4298 attr |= SFMMU_UNCACHEVTTE; 4299 } 4300 if (!TTE_IS_PCACHEABLE(ttep)) { 4301 attr |= SFMMU_UNCACHEPTTE; 4302 } 4303 return (attr); 4304 } 4305 4306 /* 4307 * hat_chgprot is a deprecated hat call. New segment drivers 4308 * should store all attributes and use hat_*attr calls. 4309 * 4310 * Change the protections in the virtual address range 4311 * given to the specified virtual protection. If vprot is ~PROT_WRITE, 4312 * then remove write permission, leaving the other 4313 * permissions unchanged. If vprot is ~PROT_USER, remove user permissions. 4314 * 4315 */ 4316 void 4317 hat_chgprot(struct hat *sfmmup, caddr_t addr, size_t len, uint_t vprot) 4318 { 4319 struct hmehash_bucket *hmebp; 4320 hmeblk_tag hblktag; 4321 int hmeshift, hashno = 1; 4322 struct hme_blk *hmeblkp, *list = NULL; 4323 caddr_t endaddr; 4324 cpuset_t cpuset; 4325 demap_range_t dmr; 4326 4327 ASSERT((len & MMU_PAGEOFFSET) == 0); 4328 ASSERT(((uintptr_t)addr & MMU_PAGEOFFSET) == 0); 4329 4330 if (sfmmup->sfmmu_xhat_provider) { 4331 XHAT_CHGPROT(sfmmup, addr, len, vprot); 4332 return; 4333 } else { 4334 /* 4335 * This must be a CPU HAT. 
If the address space has 4336 * XHATs attached, change attributes for all of them, 4337 * just in case 4338 */ 4339 ASSERT(sfmmup->sfmmu_as != NULL); 4340 if (sfmmup->sfmmu_as->a_xhat != NULL) 4341 xhat_chgprot_all(sfmmup->sfmmu_as, addr, len, vprot); 4342 } 4343 4344 CPUSET_ZERO(cpuset); 4345 4346 if ((vprot != (uint_t)~PROT_WRITE) && (vprot & PROT_USER) && 4347 ((addr + len) > (caddr_t)USERLIMIT)) { 4348 panic("user addr %p vprot %x in kernel space", 4349 (void *)addr, vprot); 4350 } 4351 endaddr = addr + len; 4352 hblktag.htag_id = sfmmup; 4353 DEMAP_RANGE_INIT(sfmmup, &dmr); 4354 4355 while (addr < endaddr) { 4356 hmeshift = HME_HASH_SHIFT(hashno); 4357 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 4358 hblktag.htag_rehash = hashno; 4359 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift); 4360 4361 SFMMU_HASH_LOCK(hmebp); 4362 4363 HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list); 4364 if (hmeblkp != NULL) { 4365 /* 4366 * We've encountered a shadow hmeblk so skip the range 4367 * of the next smaller mapping size. 4368 */ 4369 if (hmeblkp->hblk_shw_bit) { 4370 ASSERT(sfmmup != ksfmmup); 4371 ASSERT(hashno > 1); 4372 addr = (caddr_t)P2END((uintptr_t)addr, 4373 TTEBYTES(hashno - 1)); 4374 } else { 4375 addr = sfmmu_hblk_chgprot(sfmmup, hmeblkp, 4376 addr, endaddr, &dmr, vprot); 4377 } 4378 SFMMU_HASH_UNLOCK(hmebp); 4379 hashno = 1; 4380 continue; 4381 } 4382 SFMMU_HASH_UNLOCK(hmebp); 4383 4384 if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) { 4385 /* 4386 * We have traversed the whole list and rehashed 4387 * if necessary without finding the address to chgprot. 4388 * This is ok so we increment the address by the 4389 * smallest hmeblk range for kernel mappings and the 4390 * largest hmeblk range, to account for shadow hmeblks, 4391 * for user mappings and continue. 4392 */ 4393 if (sfmmup == ksfmmup) 4394 addr = (caddr_t)P2END((uintptr_t)addr, 4395 TTEBYTES(1)); 4396 else 4397 addr = (caddr_t)P2END((uintptr_t)addr, 4398 TTEBYTES(hashno)); 4399 hashno = 1; 4400 } else { 4401 hashno++; 4402 } 4403 } 4404 4405 sfmmu_hblks_list_purge(&list); 4406 DEMAP_RANGE_FLUSH(&dmr); 4407 cpuset = sfmmup->sfmmu_cpusran; 4408 xt_sync(cpuset); 4409 } 4410 4411 /* 4412 * This function chgprots a range of addresses in an hmeblk. It returns the 4413 * next addres that needs to be chgprot. 4414 * It should be called with the hash lock held. 4415 * XXX It shold be possible to optimize chgprot by not flushing every time but 4416 * on the other hand: 4417 * 1. do one flush crosscall. 4418 * 2. 
only flush if we are increasing permissions (make sure this will work) 4419 */ 4420 static caddr_t 4421 sfmmu_hblk_chgprot(sfmmu_t *sfmmup, struct hme_blk *hmeblkp, caddr_t addr, 4422 caddr_t endaddr, demap_range_t *dmrp, uint_t vprot) 4423 { 4424 uint_t pprot; 4425 tte_t tte, ttemod; 4426 struct sf_hment *sfhmep; 4427 uint_t tteflags; 4428 int ttesz; 4429 struct page *pp = NULL; 4430 kmutex_t *pml, *pmtx; 4431 int ret; 4432 int use_demap_range; 4433 #if defined(SF_ERRATA_57) 4434 int check_exec; 4435 #endif 4436 4437 ASSERT(in_hblk_range(hmeblkp, addr)); 4438 ASSERT(hmeblkp->hblk_shw_bit == 0); 4439 4440 #ifdef DEBUG 4441 if (get_hblk_ttesz(hmeblkp) != TTE8K && 4442 (endaddr < get_hblk_endaddr(hmeblkp))) { 4443 panic("sfmmu_hblk_chgprot: partial chgprot of large page"); 4444 } 4445 #endif /* DEBUG */ 4446 4447 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp)); 4448 ttesz = get_hblk_ttesz(hmeblkp); 4449 4450 pprot = sfmmu_vtop_prot(vprot, &tteflags); 4451 #if defined(SF_ERRATA_57) 4452 check_exec = (sfmmup != ksfmmup) && 4453 AS_TYPE_64BIT(sfmmup->sfmmu_as) && 4454 ((vprot & PROT_EXEC) == PROT_EXEC); 4455 #endif 4456 HBLKTOHME(sfhmep, hmeblkp, addr); 4457 4458 /* 4459 * Flush the current demap region if addresses have been 4460 * skipped or the page size doesn't match. 4461 */ 4462 use_demap_range = (TTEBYTES(ttesz) == MMU_PAGESIZE); 4463 if (use_demap_range) { 4464 DEMAP_RANGE_CONTINUE(dmrp, addr, endaddr); 4465 } else { 4466 DEMAP_RANGE_FLUSH(dmrp); 4467 } 4468 4469 while (addr < endaddr) { 4470 sfmmu_copytte(&sfhmep->hme_tte, &tte); 4471 if (TTE_IS_VALID(&tte)) { 4472 if (TTE_GET_LOFLAGS(&tte, tteflags) == pprot) { 4473 /* 4474 * if the new protection is the same as old 4475 * continue 4476 */ 4477 goto next_addr; 4478 } 4479 pml = NULL; 4480 pp = sfhmep->hme_page; 4481 if (pp) { 4482 pml = sfmmu_mlist_enter(pp); 4483 } 4484 if (pp != sfhmep->hme_page) { 4485 /* 4486 * tte most have been unloaded 4487 * underneath us. Recheck 4488 */ 4489 ASSERT(pml); 4490 sfmmu_mlist_exit(pml); 4491 continue; 4492 } 4493 4494 ASSERT(pp == NULL || sfmmu_mlist_held(pp)); 4495 4496 ttemod = tte; 4497 TTE_SET_LOFLAGS(&ttemod, tteflags, pprot); 4498 #if defined(SF_ERRATA_57) 4499 if (check_exec && addr < errata57_limit) 4500 ttemod.tte_exec_perm = 0; 4501 #endif 4502 ret = sfmmu_modifytte_try(&tte, &ttemod, 4503 &sfhmep->hme_tte); 4504 4505 if (ret < 0) { 4506 /* tte changed underneath us */ 4507 if (pml) { 4508 sfmmu_mlist_exit(pml); 4509 } 4510 continue; 4511 } 4512 4513 if (tteflags & TTE_HWWR_INT) { 4514 /* 4515 * need to sync if we are clearing modify bit. 4516 */ 4517 sfmmu_ttesync(sfmmup, addr, &tte, pp); 4518 } 4519 4520 if (pp && PP_ISRO(pp)) { 4521 if (pprot & TTE_WRPRM_INT) { 4522 pmtx = sfmmu_page_enter(pp); 4523 PP_CLRRO(pp); 4524 sfmmu_page_exit(pmtx); 4525 } 4526 } 4527 4528 if (ret > 0 && use_demap_range) { 4529 DEMAP_RANGE_MARKPG(dmrp, addr); 4530 } else if (ret > 0) { 4531 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0); 4532 } 4533 4534 if (pml) { 4535 sfmmu_mlist_exit(pml); 4536 } 4537 } 4538 next_addr: 4539 addr += TTEBYTES(ttesz); 4540 sfhmep++; 4541 DEMAP_RANGE_NEXTPG(dmrp); 4542 } 4543 return (addr); 4544 } 4545 4546 /* 4547 * This routine is deprecated and should only be used by hat_chgprot. 4548 * The correct routine is sfmmu_vtop_attr. 4549 * This routine converts virtual page protections to physical ones. It will 4550 * update the tteflags field with the tte mask corresponding to the protections 4551 * affected and it returns the new protections. 
It will also clear the modify 4552 * bit if we are taking away write permission. This is necessary since the 4553 * modify bit is the hardware permission bit and we need to clear it in order 4554 * to detect write faults. 4555 * It accepts the following special protections: 4556 * ~PROT_WRITE = remove write permissions. 4557 * ~PROT_USER = remove user permissions. 4558 */ 4559 static uint_t 4560 sfmmu_vtop_prot(uint_t vprot, uint_t *tteflagsp) 4561 { 4562 if (vprot == (uint_t)~PROT_WRITE) { 4563 *tteflagsp = TTE_WRPRM_INT | TTE_HWWR_INT; 4564 return (0); /* will cause wrprm to be cleared */ 4565 } 4566 if (vprot == (uint_t)~PROT_USER) { 4567 *tteflagsp = TTE_PRIV_INT; 4568 return (0); /* will cause privprm to be cleared */ 4569 } 4570 if ((vprot == 0) || (vprot == PROT_USER) || 4571 ((vprot & PROT_ALL) != vprot)) { 4572 panic("sfmmu_vtop_prot -- bad prot %x", vprot); 4573 } 4574 4575 switch (vprot) { 4576 case (PROT_READ): 4577 case (PROT_EXEC): 4578 case (PROT_EXEC | PROT_READ): 4579 *tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT | TTE_HWWR_INT; 4580 return (TTE_PRIV_INT); /* set prv and clr wrt */ 4581 case (PROT_WRITE): 4582 case (PROT_WRITE | PROT_READ): 4583 case (PROT_EXEC | PROT_WRITE): 4584 case (PROT_EXEC | PROT_WRITE | PROT_READ): 4585 *tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT; 4586 return (TTE_PRIV_INT | TTE_WRPRM_INT); /* set prv and wrt */ 4587 case (PROT_USER | PROT_READ): 4588 case (PROT_USER | PROT_EXEC): 4589 case (PROT_USER | PROT_EXEC | PROT_READ): 4590 *tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT | TTE_HWWR_INT; 4591 return (0); /* clr prv and wrt */ 4592 case (PROT_USER | PROT_WRITE): 4593 case (PROT_USER | PROT_WRITE | PROT_READ): 4594 case (PROT_USER | PROT_EXEC | PROT_WRITE): 4595 case (PROT_USER | PROT_EXEC | PROT_WRITE | PROT_READ): 4596 *tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT; 4597 return (TTE_WRPRM_INT); /* clr prv and set wrt */ 4598 default: 4599 panic("sfmmu_vtop_prot -- bad prot %x", vprot); 4600 } 4601 return (0); 4602 } 4603 4604 /* 4605 * Alternate unload for very large virtual ranges. With a true 64 bit VA, 4606 * the normal algorithm would take too long for a very large VA range with 4607 * few real mappings. This routine just walks thru all HMEs in the global 4608 * hash table to find and remove mappings. 4609 */ 4610 static void 4611 hat_unload_large_virtual( 4612 struct hat *sfmmup, 4613 caddr_t startaddr, 4614 size_t len, 4615 uint_t flags, 4616 hat_callback_t *callback) 4617 { 4618 struct hmehash_bucket *hmebp; 4619 struct hme_blk *hmeblkp; 4620 struct hme_blk *pr_hblk = NULL; 4621 struct hme_blk *nx_hblk; 4622 struct hme_blk *list = NULL; 4623 int i; 4624 uint64_t hblkpa, prevpa, nx_pa; 4625 hatlock_t *hatlockp; 4626 struct tsb_info *tsbinfop; 4627 struct ctx *ctx; 4628 caddr_t endaddr = startaddr + len; 4629 caddr_t sa; 4630 caddr_t ea; 4631 caddr_t cb_sa[MAX_CB_ADDR]; 4632 caddr_t cb_ea[MAX_CB_ADDR]; 4633 int addr_cnt = 0; 4634 int a = 0; 4635 int cnum; 4636 4637 hatlockp = sfmmu_hat_enter(sfmmup); 4638 4639 /* 4640 * Since we know we're unmapping a huge range of addresses, 4641 * just throw away the context and switch to another. It's 4642 * cheaper than trying to unmap all of the TTEs we may find 4643 * from the TLB individually, which is too expensive in terms 4644 * of xcalls. Better yet, if we're exiting, no need to flush 4645 * anything at all! 
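 *
 * The overall strategy can be modelled in user-level terms: rather than
 * probing the hash once per address in a huge range, scan every bucket
 * once and strip out the entries owned by one address space.  A
 * simplified sketch (struct entry and purge_owner are invented for
 * illustration and carry none of the physical-address or locking detail):
 *
 *    #include <stdlib.h>
 *
 *    struct entry {
 *        struct entry *next;
 *        int owner;
 *    };
 *
 *    // Remove every entry owned by 'owner' with one pass over the table.
 *    static void
 *    purge_owner(struct entry **buckets, int nbuckets, int owner)
 *    {
 *        for (int i = 0; i < nbuckets; i++) {
 *            struct entry **pp = &buckets[i];
 *            while (*pp != NULL) {
 *                if ((*pp)->owner == owner) {
 *                    struct entry *dead = *pp;
 *                    *pp = dead->next;        // unlink
 *                    free(dead);              // real code defers the free
 *                } else {
 *                    pp = &(*pp)->next;
 *                }
 *            }
 *        }
 *    }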
4646 */ 4647 if (!sfmmup->sfmmu_free) { 4648 ctx = sfmmutoctx(sfmmup); 4649 rw_enter(&ctx->ctx_rwlock, RW_WRITER); 4650 cnum = sfmmutoctxnum(sfmmup); 4651 if (cnum != INVALID_CONTEXT) { 4652 sfmmu_tlb_swap_ctx(sfmmup, ctx); 4653 } 4654 rw_exit(&ctx->ctx_rwlock); 4655 4656 for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL; 4657 tsbinfop = tsbinfop->tsb_next) { 4658 if (tsbinfop->tsb_flags & TSB_SWAPPED) 4659 continue; 4660 sfmmu_inv_tsb(tsbinfop->tsb_va, 4661 TSB_BYTES(tsbinfop->tsb_szc)); 4662 } 4663 } 4664 4665 /* 4666 * Loop through all the hash buckets of HME blocks looking for matches. 4667 */ 4668 for (i = 0; i <= UHMEHASH_SZ; i++) { 4669 hmebp = &uhme_hash[i]; 4670 SFMMU_HASH_LOCK(hmebp); 4671 hmeblkp = hmebp->hmeblkp; 4672 hblkpa = hmebp->hmeh_nextpa; 4673 prevpa = 0; 4674 pr_hblk = NULL; 4675 while (hmeblkp) { 4676 nx_hblk = hmeblkp->hblk_next; 4677 nx_pa = hmeblkp->hblk_nextpa; 4678 4679 /* 4680 * skip if not this context, if a shadow block or 4681 * if the mapping is not in the requested range 4682 */ 4683 if (hmeblkp->hblk_tag.htag_id != sfmmup || 4684 hmeblkp->hblk_shw_bit || 4685 (sa = (caddr_t)get_hblk_base(hmeblkp)) >= endaddr || 4686 (ea = get_hblk_endaddr(hmeblkp)) <= startaddr) { 4687 pr_hblk = hmeblkp; 4688 prevpa = hblkpa; 4689 goto next_block; 4690 } 4691 4692 /* 4693 * unload if there are any current valid mappings 4694 */ 4695 if (hmeblkp->hblk_vcnt != 0 || 4696 hmeblkp->hblk_hmecnt != 0) 4697 (void) sfmmu_hblk_unload(sfmmup, hmeblkp, 4698 sa, ea, NULL, flags); 4699 4700 /* 4701 * on unmap we also release the HME block itself, once 4702 * all mappings are gone. 4703 */ 4704 if ((flags & HAT_UNLOAD_UNMAP) != 0 && 4705 !hmeblkp->hblk_vcnt && 4706 !hmeblkp->hblk_hmecnt) { 4707 ASSERT(!hmeblkp->hblk_lckcnt); 4708 sfmmu_hblk_hash_rm(hmebp, hmeblkp, 4709 prevpa, pr_hblk); 4710 sfmmu_hblk_free(hmebp, hmeblkp, hblkpa, &list); 4711 } else { 4712 pr_hblk = hmeblkp; 4713 prevpa = hblkpa; 4714 } 4715 4716 if (callback == NULL) 4717 goto next_block; 4718 4719 /* 4720 * HME blocks may span more than one page, but we may be 4721 * unmapping only one page, so check for a smaller range 4722 * for the callback 4723 */ 4724 if (sa < startaddr) 4725 sa = startaddr; 4726 if (--ea > endaddr) 4727 ea = endaddr - 1; 4728 4729 cb_sa[addr_cnt] = sa; 4730 cb_ea[addr_cnt] = ea; 4731 if (++addr_cnt == MAX_CB_ADDR) { 4732 for (a = 0; a < MAX_CB_ADDR; ++a) { 4733 callback->hcb_start_addr = cb_sa[a]; 4734 callback->hcb_end_addr = cb_ea[a]; 4735 callback->hcb_function(callback); 4736 } 4737 addr_cnt = 0; 4738 } 4739 4740 next_block: 4741 hmeblkp = nx_hblk; 4742 hblkpa = nx_pa; 4743 } 4744 SFMMU_HASH_UNLOCK(hmebp); 4745 } 4746 4747 sfmmu_hblks_list_purge(&list); 4748 4749 for (a = 0; a < addr_cnt; ++a) { 4750 callback->hcb_start_addr = cb_sa[a]; 4751 callback->hcb_end_addr = cb_ea[a]; 4752 callback->hcb_function(callback); 4753 } 4754 4755 sfmmu_hat_exit(hatlockp); 4756 4757 /* 4758 * Check TSB and TLB page sizes if the process isn't exiting. 4759 */ 4760 if (!sfmmup->sfmmu_free) 4761 sfmmu_check_page_sizes(sfmmup, 0); 4762 } 4763 4764 4765 /* 4766 * Unload all the mappings in the range [addr..addr+len). addr and len must 4767 * be MMU_PAGESIZE aligned. 
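 *
 * The callback bookkeeping used above (cb_sa/cb_ea with MAX_CB_ADDR), and
 * again by hat_unload_callback() below, is simply a fixed-size batch that
 * is drained whenever it fills, so that one xt_sync() can cover many
 * ranges.  A stand-alone sketch of the idea (BATCH, range_batch and the
 * notify hook are invented for illustration):
 *
 *    #include <stdint.h>
 *
 *    #define BATCH 16
 *
 *    struct range_batch {
 *        uintptr_t start[BATCH];
 *        uintptr_t end[BATCH];
 *        int cnt;
 *        void (*notify)(uintptr_t s, uintptr_t e);
 *    };
 *
 *    static void
 *    batch_drain(struct range_batch *b)
 *    {
 *        for (int i = 0; i < b->cnt; i++)
 *            b->notify(b->start[i], b->end[i]);
 *        b->cnt = 0;
 *    }
 *
 *    // Queue one range; drain when the batch is full.
 *    static void
 *    batch_add(struct range_batch *b, uintptr_t s, uintptr_t e)
 *    {
 *        b->start[b->cnt] = s;
 *        b->end[b->cnt] = e;
 *        if (++b->cnt == BATCH)
 *            batch_drain(b);
 *    }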
4768 */ 4769 4770 extern struct seg *segkmap; 4771 #define ISSEGKMAP(sfmmup, addr) (sfmmup == ksfmmup && \ 4772 segkmap->s_base <= (addr) && (addr) < (segkmap->s_base + segkmap->s_size)) 4773 4774 4775 void 4776 hat_unload_callback( 4777 struct hat *sfmmup, 4778 caddr_t addr, 4779 size_t len, 4780 uint_t flags, 4781 hat_callback_t *callback) 4782 { 4783 struct hmehash_bucket *hmebp; 4784 hmeblk_tag hblktag; 4785 int hmeshift, hashno, iskernel; 4786 struct hme_blk *hmeblkp, *pr_hblk, *list = NULL; 4787 caddr_t endaddr; 4788 cpuset_t cpuset; 4789 uint64_t hblkpa, prevpa; 4790 int addr_count = 0; 4791 int a; 4792 caddr_t cb_start_addr[MAX_CB_ADDR]; 4793 caddr_t cb_end_addr[MAX_CB_ADDR]; 4794 int issegkmap = ISSEGKMAP(sfmmup, addr); 4795 demap_range_t dmr, *dmrp; 4796 4797 if (sfmmup->sfmmu_xhat_provider) { 4798 XHAT_UNLOAD_CALLBACK(sfmmup, addr, len, flags, callback); 4799 return; 4800 } else { 4801 /* 4802 * This must be a CPU HAT. If the address space has 4803 * XHATs attached, unload the mappings for all of them, 4804 * just in case 4805 */ 4806 ASSERT(sfmmup->sfmmu_as != NULL); 4807 if (sfmmup->sfmmu_as->a_xhat != NULL) 4808 xhat_unload_callback_all(sfmmup->sfmmu_as, addr, 4809 len, flags, callback); 4810 } 4811 4812 ASSERT((sfmmup == ksfmmup) || (flags & HAT_UNLOAD_OTHER) || \ 4813 AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 4814 4815 ASSERT(sfmmup != NULL); 4816 ASSERT((len & MMU_PAGEOFFSET) == 0); 4817 ASSERT(!((uintptr_t)addr & MMU_PAGEOFFSET)); 4818 4819 /* 4820 * Probing through a large VA range (say 63 bits) will be slow, even 4821 * at 4 Meg steps between the probes. So, when the virtual address range 4822 * is very large, search the HME entries for what to unload. 4823 * 4824 * len >> TTE_PAGE_SHIFT(TTE4M) is the # of 4Meg probes we'd need 4825 * 4826 * UHMEHASH_SZ is number of hash buckets to examine 4827 * 4828 */ 4829 if (sfmmup != KHATID && (len >> TTE_PAGE_SHIFT(TTE4M)) > UHMEHASH_SZ) { 4830 hat_unload_large_virtual(sfmmup, addr, len, flags, callback); 4831 return; 4832 } 4833 4834 CPUSET_ZERO(cpuset); 4835 4836 /* 4837 * If the process is exiting, we can save a lot of fuss since 4838 * we'll flush the TLB when we free the ctx anyway. 4839 */ 4840 if (sfmmup->sfmmu_free) 4841 dmrp = NULL; 4842 else 4843 dmrp = &dmr; 4844 4845 DEMAP_RANGE_INIT(sfmmup, dmrp); 4846 endaddr = addr + len; 4847 hblktag.htag_id = sfmmup; 4848 4849 /* 4850 * It is likely for the vm to call unload over a wide range of 4851 * addresses that are actually very sparsely populated by 4852 * translations. In order to speed this up the sfmmu hat supports 4853 * the concept of shadow hmeblks. Dummy large page hmeblks that 4854 * correspond to actual small translations are allocated at tteload 4855 * time and are referred to as shadow hmeblks. Now, during unload 4856 * time, we first check if we have a shadow hmeblk for that 4857 * translation. The absence of one means the corresponding address 4858 * range is empty and can be skipped. 4859 * 4860 * The kernel is an exception to above statement and that is why 4861 * we don't use shadow hmeblks and hash starting from the smallest 4862 * page size. 
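 *
 * The effect of a shadow hmeblk can be pictured as a per-large-region
 * summary: if no entry exists at the large size, the whole region is
 * known to be empty and is skipped without probing each small page.
 * A simplified sketch of that idea (the region size and the names are
 * illustrative only, not the hmeblk layout):
 *
 *    #include <stdint.h>
 *
 *    #define SMALL_PER_REGION 64    // e.g. 64 x 8K pages per 512K region
 *
 *    struct region {
 *        uint64_t present;          // one bit per small page; nonzero
 *                                   // plays the role of a shadow block
 *    };
 *
 *    // Count mapped small pages, skipping regions with no summary bits.
 *    static int
 *    count_mapped(const struct region *r, int nregions)
 *    {
 *        int n = 0;
 *
 *        for (int i = 0; i < nregions; i++) {
 *            if (r[i].present == 0)
 *                continue;          // no shadow entry: skip it all
 *            for (int j = 0; j < SMALL_PER_REGION; j++)
 *                if (r[i].present & ((uint64_t)1 << j))
 *                    n++;
 *        }
 *        return (n);
 *    }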
4863 */ 4864 if (sfmmup == KHATID) { 4865 iskernel = 1; 4866 hashno = TTE64K; 4867 } else { 4868 iskernel = 0; 4869 if (mmu_page_sizes == max_mmu_page_sizes) { 4870 hashno = TTE256M; 4871 } else { 4872 hashno = TTE4M; 4873 } 4874 } 4875 while (addr < endaddr) { 4876 hmeshift = HME_HASH_SHIFT(hashno); 4877 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 4878 hblktag.htag_rehash = hashno; 4879 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift); 4880 4881 SFMMU_HASH_LOCK(hmebp); 4882 4883 HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, hblkpa, pr_hblk, 4884 prevpa, &list); 4885 if (hmeblkp == NULL) { 4886 /* 4887 * didn't find an hmeblk. skip the appropiate 4888 * address range. 4889 */ 4890 SFMMU_HASH_UNLOCK(hmebp); 4891 if (iskernel) { 4892 if (hashno < mmu_hashcnt) { 4893 hashno++; 4894 continue; 4895 } else { 4896 hashno = TTE64K; 4897 addr = (caddr_t)roundup((uintptr_t)addr 4898 + 1, MMU_PAGESIZE64K); 4899 continue; 4900 } 4901 } 4902 addr = (caddr_t)roundup((uintptr_t)addr + 1, 4903 (1 << hmeshift)); 4904 if ((uintptr_t)addr & MMU_PAGEOFFSET512K) { 4905 ASSERT(hashno == TTE64K); 4906 continue; 4907 } 4908 if ((uintptr_t)addr & MMU_PAGEOFFSET4M) { 4909 hashno = TTE512K; 4910 continue; 4911 } 4912 if (mmu_page_sizes == max_mmu_page_sizes) { 4913 if ((uintptr_t)addr & MMU_PAGEOFFSET32M) { 4914 hashno = TTE4M; 4915 continue; 4916 } 4917 if ((uintptr_t)addr & MMU_PAGEOFFSET256M) { 4918 hashno = TTE32M; 4919 continue; 4920 } 4921 hashno = TTE256M; 4922 continue; 4923 } else { 4924 hashno = TTE4M; 4925 continue; 4926 } 4927 } 4928 ASSERT(hmeblkp); 4929 if (!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) { 4930 /* 4931 * If the valid count is zero we can skip the range 4932 * mapped by this hmeblk. 4933 * We free hblks in the case of HAT_UNMAP. HAT_UNMAP 4934 * is used by segment drivers as a hint 4935 * that the mapping resource won't be used any longer. 4936 * The best example of this is during exit(). 4937 */ 4938 addr = (caddr_t)roundup((uintptr_t)addr + 1, 4939 get_hblk_span(hmeblkp)); 4940 if ((flags & HAT_UNLOAD_UNMAP) || 4941 (iskernel && !issegkmap)) { 4942 sfmmu_hblk_hash_rm(hmebp, hmeblkp, prevpa, 4943 pr_hblk); 4944 sfmmu_hblk_free(hmebp, hmeblkp, hblkpa, &list); 4945 } 4946 SFMMU_HASH_UNLOCK(hmebp); 4947 4948 if (iskernel) { 4949 hashno = TTE64K; 4950 continue; 4951 } 4952 if ((uintptr_t)addr & MMU_PAGEOFFSET512K) { 4953 ASSERT(hashno == TTE64K); 4954 continue; 4955 } 4956 if ((uintptr_t)addr & MMU_PAGEOFFSET4M) { 4957 hashno = TTE512K; 4958 continue; 4959 } 4960 if (mmu_page_sizes == max_mmu_page_sizes) { 4961 if ((uintptr_t)addr & MMU_PAGEOFFSET32M) { 4962 hashno = TTE4M; 4963 continue; 4964 } 4965 if ((uintptr_t)addr & MMU_PAGEOFFSET256M) { 4966 hashno = TTE32M; 4967 continue; 4968 } 4969 hashno = TTE256M; 4970 continue; 4971 } else { 4972 hashno = TTE4M; 4973 continue; 4974 } 4975 } 4976 if (hmeblkp->hblk_shw_bit) { 4977 /* 4978 * If we encounter a shadow hmeblk we know there is 4979 * smaller sized hmeblks mapping the same address space. 4980 * Decrement the hash size and rehash. 4981 */ 4982 ASSERT(sfmmup != KHATID); 4983 hashno--; 4984 SFMMU_HASH_UNLOCK(hmebp); 4985 continue; 4986 } 4987 4988 /* 4989 * track callback address ranges. 
4990 * only start a new range when it's not contiguous 4991 */ 4992 if (callback != NULL) { 4993 if (addr_count > 0 && 4994 addr == cb_end_addr[addr_count - 1]) 4995 --addr_count; 4996 else 4997 cb_start_addr[addr_count] = addr; 4998 } 4999 5000 addr = sfmmu_hblk_unload(sfmmup, hmeblkp, addr, endaddr, 5001 dmrp, flags); 5002 5003 if (callback != NULL) 5004 cb_end_addr[addr_count++] = addr; 5005 5006 if (((flags & HAT_UNLOAD_UNMAP) || (iskernel && !issegkmap)) && 5007 !hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) { 5008 sfmmu_hblk_hash_rm(hmebp, hmeblkp, prevpa, 5009 pr_hblk); 5010 sfmmu_hblk_free(hmebp, hmeblkp, hblkpa, &list); 5011 } 5012 SFMMU_HASH_UNLOCK(hmebp); 5013 5014 /* 5015 * Notify our caller as to exactly which pages 5016 * have been unloaded. We do these in clumps, 5017 * to minimize the number of xt_sync()s that need to occur. 5018 */ 5019 if (callback != NULL && addr_count == MAX_CB_ADDR) { 5020 DEMAP_RANGE_FLUSH(dmrp); 5021 if (dmrp != NULL) { 5022 cpuset = sfmmup->sfmmu_cpusran; 5023 xt_sync(cpuset); 5024 } 5025 5026 for (a = 0; a < MAX_CB_ADDR; ++a) { 5027 callback->hcb_start_addr = cb_start_addr[a]; 5028 callback->hcb_end_addr = cb_end_addr[a]; 5029 callback->hcb_function(callback); 5030 } 5031 addr_count = 0; 5032 } 5033 if (iskernel) { 5034 hashno = TTE64K; 5035 continue; 5036 } 5037 if ((uintptr_t)addr & MMU_PAGEOFFSET512K) { 5038 ASSERT(hashno == TTE64K); 5039 continue; 5040 } 5041 if ((uintptr_t)addr & MMU_PAGEOFFSET4M) { 5042 hashno = TTE512K; 5043 continue; 5044 } 5045 if (mmu_page_sizes == max_mmu_page_sizes) { 5046 if ((uintptr_t)addr & MMU_PAGEOFFSET32M) { 5047 hashno = TTE4M; 5048 continue; 5049 } 5050 if ((uintptr_t)addr & MMU_PAGEOFFSET256M) { 5051 hashno = TTE32M; 5052 continue; 5053 } 5054 hashno = TTE256M; 5055 } else { 5056 hashno = TTE4M; 5057 } 5058 } 5059 5060 sfmmu_hblks_list_purge(&list); 5061 DEMAP_RANGE_FLUSH(dmrp); 5062 if (dmrp != NULL) { 5063 cpuset = sfmmup->sfmmu_cpusran; 5064 xt_sync(cpuset); 5065 } 5066 if (callback && addr_count != 0) { 5067 for (a = 0; a < addr_count; ++a) { 5068 callback->hcb_start_addr = cb_start_addr[a]; 5069 callback->hcb_end_addr = cb_end_addr[a]; 5070 callback->hcb_function(callback); 5071 } 5072 } 5073 5074 /* 5075 * Check TSB and TLB page sizes if the process isn't exiting. 5076 */ 5077 if (!sfmmup->sfmmu_free) 5078 sfmmu_check_page_sizes(sfmmup, 0); 5079 } 5080 5081 /* 5082 * Unload all the mappings in the range [addr..addr+len). addr and len must 5083 * be MMU_PAGESIZE aligned. 5084 */ 5085 void 5086 hat_unload(struct hat *sfmmup, caddr_t addr, size_t len, uint_t flags) 5087 { 5088 if (sfmmup->sfmmu_xhat_provider) { 5089 XHAT_UNLOAD(sfmmup, addr, len, flags); 5090 return; 5091 } 5092 hat_unload_callback(sfmmup, addr, len, flags, NULL); 5093 } 5094 5095 5096 /* 5097 * Find the largest mapping size for this page. 5098 */ 5099 static int 5100 fnd_mapping_sz(page_t *pp) 5101 { 5102 int sz; 5103 int p_index; 5104 5105 p_index = PP_MAPINDEX(pp); 5106 5107 sz = 0; 5108 p_index >>= 1; /* don't care about 8K bit */ 5109 for (; p_index; p_index >>= 1) { 5110 sz++; 5111 } 5112 5113 return (sz); 5114 } 5115 5116 /* 5117 * This function unloads a range of addresses for an hmeblk. 5118 * It returns the next address to be unloaded. 5119 * It should be called with the hash lock held. 
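 *
 * The cb_start_addr[] and cb_end_addr[] handling above also merges a
 * range that begins exactly where the previous one ended, so a run of
 * contiguous hmeblks produces a single callback.  A minimal stand-alone
 * model (MAXRANGES and the names are invented for illustration):
 *
 *    #include <stdint.h>
 *
 *    #define MAXRANGES 32
 *
 *    struct ranges {
 *        uintptr_t start[MAXRANGES];
 *        uintptr_t end[MAXRANGES];
 *        int cnt;
 *    };
 *
 *    // Record [s, e); extend the previous range when the two abut.
 *    // The caller drains the array before cnt can exceed MAXRANGES.
 *    static void
 *    record_range(struct ranges *r, uintptr_t s, uintptr_t e)
 *    {
 *        if (r->cnt > 0 && s == r->end[r->cnt - 1]) {
 *            r->end[r->cnt - 1] = e;    // contiguous: grow the last range
 *            return;
 *        }
 *        r->start[r->cnt] = s;
 *        r->end[r->cnt] = e;
 *        r->cnt++;
 *    }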
5120 */ 5121 static caddr_t 5122 sfmmu_hblk_unload(struct hat *sfmmup, struct hme_blk *hmeblkp, caddr_t addr, 5123 caddr_t endaddr, demap_range_t *dmrp, uint_t flags) 5124 { 5125 tte_t tte, ttemod; 5126 struct sf_hment *sfhmep; 5127 int ttesz; 5128 long ttecnt; 5129 page_t *pp; 5130 kmutex_t *pml; 5131 int ret; 5132 int use_demap_range; 5133 5134 ASSERT(in_hblk_range(hmeblkp, addr)); 5135 ASSERT(!hmeblkp->hblk_shw_bit); 5136 #ifdef DEBUG 5137 if (get_hblk_ttesz(hmeblkp) != TTE8K && 5138 (endaddr < get_hblk_endaddr(hmeblkp))) { 5139 panic("sfmmu_hblk_unload: partial unload of large page"); 5140 } 5141 #endif /* DEBUG */ 5142 5143 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp)); 5144 ttesz = get_hblk_ttesz(hmeblkp); 5145 5146 use_demap_range = (do_virtual_coloring && 5147 TTEBYTES(ttesz) == DEMAP_RANGE_PGSZ(dmrp)); 5148 if (use_demap_range) { 5149 DEMAP_RANGE_CONTINUE(dmrp, addr, endaddr); 5150 } else { 5151 DEMAP_RANGE_FLUSH(dmrp); 5152 } 5153 ttecnt = 0; 5154 HBLKTOHME(sfhmep, hmeblkp, addr); 5155 5156 while (addr < endaddr) { 5157 pml = NULL; 5158 again: 5159 sfmmu_copytte(&sfhmep->hme_tte, &tte); 5160 if (TTE_IS_VALID(&tte)) { 5161 pp = sfhmep->hme_page; 5162 if (pp && pml == NULL) { 5163 pml = sfmmu_mlist_enter(pp); 5164 } 5165 5166 /* 5167 * Verify if hme still points to 'pp' now that 5168 * we have p_mapping lock. 5169 */ 5170 if (sfhmep->hme_page != pp) { 5171 if (pp != NULL && sfhmep->hme_page != NULL) { 5172 if (pml) { 5173 sfmmu_mlist_exit(pml); 5174 } 5175 /* Re-start this iteration. */ 5176 continue; 5177 } 5178 ASSERT((pp != NULL) && 5179 (sfhmep->hme_page == NULL)); 5180 goto tte_unloaded; 5181 } 5182 5183 /* 5184 * This point on we have both HASH and p_mapping 5185 * lock. 5186 */ 5187 ASSERT(pp == sfhmep->hme_page); 5188 ASSERT(pp == NULL || sfmmu_mlist_held(pp)); 5189 5190 /* 5191 * We need to loop on modify tte because it is 5192 * possible for pagesync to come along and 5193 * change the software bits beneath us. 5194 * 5195 * Page_unload can also invalidate the tte after 5196 * we read tte outside of p_mapping lock. 5197 */ 5198 ttemod = tte; 5199 5200 TTE_SET_INVALID(&ttemod); 5201 ret = sfmmu_modifytte_try(&tte, &ttemod, 5202 &sfhmep->hme_tte); 5203 5204 if (ret <= 0) { 5205 if (TTE_IS_VALID(&tte)) { 5206 goto again; 5207 } else { 5208 /* 5209 * We read in a valid pte, but it 5210 * is unloaded by page_unload. 5211 * hme_page has become NULL and 5212 * we hold no p_mapping lock. 5213 */ 5214 ASSERT(pp == NULL && pml == NULL); 5215 goto tte_unloaded; 5216 } 5217 } 5218 5219 if (!(flags & HAT_UNLOAD_NOSYNC)) { 5220 sfmmu_ttesync(sfmmup, addr, &tte, pp); 5221 } 5222 5223 /* 5224 * Ok- we invalidated the tte. Do the rest of the job. 5225 */ 5226 ttecnt++; 5227 5228 if (flags & HAT_UNLOAD_UNLOCK) { 5229 ASSERT(hmeblkp->hblk_lckcnt > 0); 5230 atomic_add_16(&hmeblkp->hblk_lckcnt, -1); 5231 HBLK_STACK_TRACE(hmeblkp, HBLK_UNLOCK); 5232 } 5233 5234 /* 5235 * Normally we would need to flush the page 5236 * from the virtual cache at this point in 5237 * order to prevent a potential cache alias 5238 * inconsistency. 5239 * The particular scenario we need to worry 5240 * about is: 5241 * Given: va1 and va2 are two virtual address 5242 * that alias and map the same physical 5243 * address. 5244 * 1. mapping exists from va1 to pa and data 5245 * has been read into the cache. 5246 * 2. unload va1. 5247 * 3. load va2 and modify data using va2. 5248 * 4 unload va2. 5249 * 5. load va1 and reference data. Unless we 5250 * flush the data cache when we unload we will 5251 * get stale data. 
5252 * Fortunately, page coloring eliminates the 5253 * above scenario by remembering the color a 5254 * physical page was last or is currently 5255 * mapped to. Now, we delay the flush until 5256 * the loading of translations. Only when the 5257 * new translation is of a different color 5258 * are we forced to flush. 5259 */ 5260 if (use_demap_range) { 5261 /* 5262 * Mark this page as needing a demap. 5263 */ 5264 DEMAP_RANGE_MARKPG(dmrp, addr); 5265 } else { 5266 if (do_virtual_coloring) { 5267 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 5268 sfmmup->sfmmu_free, 0); 5269 } else { 5270 pfn_t pfnum; 5271 5272 pfnum = TTE_TO_PFN(addr, &tte); 5273 sfmmu_tlbcache_demap(addr, sfmmup, 5274 hmeblkp, pfnum, sfmmup->sfmmu_free, 5275 FLUSH_NECESSARY_CPUS, 5276 CACHE_FLUSH, 0); 5277 } 5278 } 5279 5280 if (pp) { 5281 /* 5282 * Remove the hment from the mapping list 5283 */ 5284 ASSERT(hmeblkp->hblk_hmecnt > 0); 5285 5286 /* 5287 * Again, we cannot 5288 * ASSERT(hmeblkp->hblk_hmecnt <= NHMENTS); 5289 */ 5290 HME_SUB(sfhmep, pp); 5291 membar_stst(); 5292 atomic_add_16(&hmeblkp->hblk_hmecnt, -1); 5293 } 5294 5295 ASSERT(hmeblkp->hblk_vcnt > 0); 5296 atomic_add_16(&hmeblkp->hblk_vcnt, -1); 5297 5298 ASSERT(hmeblkp->hblk_hmecnt || hmeblkp->hblk_vcnt || 5299 !hmeblkp->hblk_lckcnt); 5300 5301 if (pp && (pp->p_nrm & (P_KPMC | P_KPMS | P_TNC))) { 5302 if (PP_ISTNC(pp)) { 5303 /* 5304 * If page was temporary 5305 * uncached, try to recache 5306 * it. Note that HME_SUB() was 5307 * called above so p_index and 5308 * mlist had been updated. 5309 */ 5310 conv_tnc(pp, ttesz); 5311 } else if (pp->p_mapping == NULL) { 5312 ASSERT(kpm_enable); 5313 /* 5314 * Page is marked to be in VAC conflict 5315 * to an existing kpm mapping and/or is 5316 * kpm mapped using only the regular 5317 * pagesize. 5318 */ 5319 sfmmu_kpm_hme_unload(pp); 5320 } 5321 } 5322 } else if ((pp = sfhmep->hme_page) != NULL) { 5323 /* 5324 * TTE is invalid but the hme 5325 * still exists. let pageunload 5326 * complete its job. 5327 */ 5328 ASSERT(pml == NULL); 5329 pml = sfmmu_mlist_enter(pp); 5330 if (sfhmep->hme_page != NULL) { 5331 sfmmu_mlist_exit(pml); 5332 pml = NULL; 5333 goto again; 5334 } 5335 ASSERT(sfhmep->hme_page == NULL); 5336 } else if (hmeblkp->hblk_hmecnt != 0) { 5337 /* 5338 * pageunload may have not finished decrementing 5339 * hblk_vcnt and hblk_hmecnt. Find page_t if any and 5340 * wait for pageunload to finish. Rely on pageunload 5341 * to decrement hblk_hmecnt after hblk_vcnt. 5342 */ 5343 pfn_t pfn = TTE_TO_TTEPFN(&tte); 5344 ASSERT(pml == NULL); 5345 if (pf_is_memory(pfn)) { 5346 pp = page_numtopp_nolock(pfn); 5347 if (pp != NULL) { 5348 pml = sfmmu_mlist_enter(pp); 5349 sfmmu_mlist_exit(pml); 5350 pml = NULL; 5351 } 5352 } 5353 } 5354 5355 tte_unloaded: 5356 /* 5357 * At this point, the tte we are looking at 5358 * should be unloaded, and hme has been unlinked 5359 * from page too. This is important because in 5360 * pageunload, it does ttesync() then HME_SUB. 5361 * We need to make sure HME_SUB has been completed 5362 * so we know ttesync() has been completed. Otherwise, 5363 * at exit time, after return from hat layer, VM will 5364 * release as structure which hat_setstat() (called 5365 * by ttesync()) needs. 
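 *
 * Every TTE update in this routine follows the same pattern: copy the
 * entry, modify the copy, and let sfmmu_modifytte_try() install it
 * atomically, retrying when the entry changed in between.  A user-level
 * equivalent of that retry loop using C11 atomics (VALID_BIT and
 * invalidate_entry are invented for illustration):
 *
 *    #include <stdatomic.h>
 *    #include <stdint.h>
 *
 *    #define VALID_BIT ((uint64_t)1 << 63)
 *
 *    // Clear the valid bit; retry until our compare-and-swap wins.
 *    // Returns the value that was actually invalidated.
 *    static uint64_t
 *    invalidate_entry(_Atomic uint64_t *entry)
 *    {
 *        uint64_t old = atomic_load(entry);
 *
 *        for (;;) {
 *            uint64_t new = old & ~VALID_BIT;
 *            if (atomic_compare_exchange_weak(entry, &old, new))
 *                return (old);
 *            // 'old' was refreshed by the failed CAS; loop and try again
 *        }
 *    }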
5366 */ 5367 #ifdef DEBUG 5368 { 5369 tte_t dtte; 5370 5371 ASSERT(sfhmep->hme_page == NULL); 5372 5373 sfmmu_copytte(&sfhmep->hme_tte, &dtte); 5374 ASSERT(!TTE_IS_VALID(&dtte)); 5375 } 5376 #endif 5377 5378 if (pml) { 5379 sfmmu_mlist_exit(pml); 5380 } 5381 5382 addr += TTEBYTES(ttesz); 5383 sfhmep++; 5384 DEMAP_RANGE_NEXTPG(dmrp); 5385 } 5386 if (ttecnt > 0) 5387 atomic_add_long(&sfmmup->sfmmu_ttecnt[ttesz], -ttecnt); 5388 return (addr); 5389 } 5390 5391 /* 5392 * Synchronize all the mappings in the range [addr..addr+len). 5393 * Can be called with clearflag having two states: 5394 * HAT_SYNC_DONTZERO means just return the rm stats 5395 * HAT_SYNC_ZERORM means zero rm bits in the tte and return the stats 5396 */ 5397 void 5398 hat_sync(struct hat *sfmmup, caddr_t addr, size_t len, uint_t clearflag) 5399 { 5400 struct hmehash_bucket *hmebp; 5401 hmeblk_tag hblktag; 5402 int hmeshift, hashno = 1; 5403 struct hme_blk *hmeblkp, *list = NULL; 5404 caddr_t endaddr; 5405 cpuset_t cpuset; 5406 5407 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 5408 ASSERT((sfmmup == ksfmmup) || 5409 AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 5410 ASSERT((len & MMU_PAGEOFFSET) == 0); 5411 ASSERT((clearflag == HAT_SYNC_DONTZERO) || 5412 (clearflag == HAT_SYNC_ZERORM)); 5413 5414 CPUSET_ZERO(cpuset); 5415 5416 endaddr = addr + len; 5417 hblktag.htag_id = sfmmup; 5418 /* 5419 * Spitfire supports 4 page sizes. 5420 * Most pages are expected to be of the smallest page 5421 * size (8K) and these will not need to be rehashed. 64K 5422 * pages also don't need to be rehashed because the an hmeblk 5423 * spans 64K of address space. 512K pages might need 1 rehash and 5424 * and 4M pages 2 rehashes. 5425 */ 5426 while (addr < endaddr) { 5427 hmeshift = HME_HASH_SHIFT(hashno); 5428 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 5429 hblktag.htag_rehash = hashno; 5430 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift); 5431 5432 SFMMU_HASH_LOCK(hmebp); 5433 5434 HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list); 5435 if (hmeblkp != NULL) { 5436 /* 5437 * We've encountered a shadow hmeblk so skip the range 5438 * of the next smaller mapping size. 5439 */ 5440 if (hmeblkp->hblk_shw_bit) { 5441 ASSERT(sfmmup != ksfmmup); 5442 ASSERT(hashno > 1); 5443 addr = (caddr_t)P2END((uintptr_t)addr, 5444 TTEBYTES(hashno - 1)); 5445 } else { 5446 addr = sfmmu_hblk_sync(sfmmup, hmeblkp, 5447 addr, endaddr, clearflag); 5448 } 5449 SFMMU_HASH_UNLOCK(hmebp); 5450 hashno = 1; 5451 continue; 5452 } 5453 SFMMU_HASH_UNLOCK(hmebp); 5454 5455 if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) { 5456 /* 5457 * We have traversed the whole list and rehashed 5458 * if necessary without finding the address to sync. 5459 * This is ok so we increment the address by the 5460 * smallest hmeblk range for kernel mappings and the 5461 * largest hmeblk range, to account for shadow hmeblks, 5462 * for user mappings and continue. 
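 *
 * The probe-and-skip logic above, shared in spirit with sfmmu_chgattr()
 * and hat_chgprot(), reduces to: probe each page size in turn and, when
 * nothing is found at any size, advance by the span of the largest block
 * probed.  A simplified stand-alone model (the shift table is
 * illustrative; lookup() is a stand-in for the hash probe):
 *
 *    #include <stdint.h>
 *    #include <stddef.h>
 *
 *    // Page sizes probed in order (8K, 64K, 512K, 4M), as shifts.
 *    static const int probe_shift[] = { 13, 16, 19, 22 };
 *    #define NPROBES (sizeof (probe_shift) / sizeof (probe_shift[0]))
 *
 *    static void
 *    walk(uintptr_t addr, uintptr_t end, int (*lookup)(uintptr_t, int))
 *    {
 *        size_t probe = 0;
 *
 *        while (addr < end) {
 *            uintptr_t span = (uintptr_t)1 << probe_shift[probe];
 *
 *            if (lookup(addr, probe_shift[probe])) {
 *                addr += span;      // found: a stand-in for the real
 *                probe = 0;         // per-hmeblk worker routine
 *            } else if (probe + 1 < NPROBES) {
 *                probe++;           // rehash at the next larger size
 *            } else {
 *                // nothing at any size: skip to the next boundary of
 *                // the largest span, as P2END() does above
 *                addr = (addr + span) & ~(span - 1);
 *                probe = 0;
 *            }
 *        }
 *    }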
5463 */ 5464 if (sfmmup == ksfmmup) 5465 addr = (caddr_t)P2END((uintptr_t)addr, 5466 TTEBYTES(1)); 5467 else 5468 addr = (caddr_t)P2END((uintptr_t)addr, 5469 TTEBYTES(hashno)); 5470 hashno = 1; 5471 } else { 5472 hashno++; 5473 } 5474 } 5475 sfmmu_hblks_list_purge(&list); 5476 cpuset = sfmmup->sfmmu_cpusran; 5477 xt_sync(cpuset); 5478 } 5479 5480 static caddr_t 5481 sfmmu_hblk_sync(struct hat *sfmmup, struct hme_blk *hmeblkp, caddr_t addr, 5482 caddr_t endaddr, int clearflag) 5483 { 5484 tte_t tte, ttemod; 5485 struct sf_hment *sfhmep; 5486 int ttesz; 5487 struct page *pp; 5488 kmutex_t *pml; 5489 int ret; 5490 5491 ASSERT(hmeblkp->hblk_shw_bit == 0); 5492 5493 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp)); 5494 5495 ttesz = get_hblk_ttesz(hmeblkp); 5496 HBLKTOHME(sfhmep, hmeblkp, addr); 5497 5498 while (addr < endaddr) { 5499 sfmmu_copytte(&sfhmep->hme_tte, &tte); 5500 if (TTE_IS_VALID(&tte)) { 5501 pml = NULL; 5502 pp = sfhmep->hme_page; 5503 if (pp) { 5504 pml = sfmmu_mlist_enter(pp); 5505 } 5506 if (pp != sfhmep->hme_page) { 5507 /* 5508 * tte most have been unloaded 5509 * underneath us. Recheck 5510 */ 5511 ASSERT(pml); 5512 sfmmu_mlist_exit(pml); 5513 continue; 5514 } 5515 5516 ASSERT(pp == NULL || sfmmu_mlist_held(pp)); 5517 5518 if (clearflag == HAT_SYNC_ZERORM) { 5519 ttemod = tte; 5520 TTE_CLR_RM(&ttemod); 5521 ret = sfmmu_modifytte_try(&tte, &ttemod, 5522 &sfhmep->hme_tte); 5523 if (ret < 0) { 5524 if (pml) { 5525 sfmmu_mlist_exit(pml); 5526 } 5527 continue; 5528 } 5529 5530 if (ret > 0) { 5531 sfmmu_tlb_demap(addr, sfmmup, 5532 hmeblkp, 0, 0); 5533 } 5534 } 5535 sfmmu_ttesync(sfmmup, addr, &tte, pp); 5536 if (pml) { 5537 sfmmu_mlist_exit(pml); 5538 } 5539 } 5540 addr += TTEBYTES(ttesz); 5541 sfhmep++; 5542 } 5543 return (addr); 5544 } 5545 5546 /* 5547 * This function will sync a tte to the page struct and it will 5548 * update the hat stats. Currently it allows us to pass a NULL pp 5549 * and we will simply update the stats. We may want to change this 5550 * so we only keep stats for pages backed by pp's. 5551 */ 5552 static void 5553 sfmmu_ttesync(struct hat *sfmmup, caddr_t addr, tte_t *ttep, page_t *pp) 5554 { 5555 uint_t rm = 0; 5556 int sz; 5557 pgcnt_t npgs; 5558 5559 ASSERT(TTE_IS_VALID(ttep)); 5560 5561 if (TTE_IS_NOSYNC(ttep)) { 5562 return; 5563 } 5564 5565 if (TTE_IS_REF(ttep)) { 5566 rm = P_REF; 5567 } 5568 if (TTE_IS_MOD(ttep)) { 5569 rm |= P_MOD; 5570 } 5571 5572 if (rm == 0) { 5573 return; 5574 } 5575 5576 sz = TTE_CSZ(ttep); 5577 if (sfmmup->sfmmu_rmstat) { 5578 int i; 5579 caddr_t vaddr = addr; 5580 5581 for (i = 0; i < TTEPAGES(sz); i++, vaddr += MMU_PAGESIZE) { 5582 hat_setstat(sfmmup->sfmmu_as, vaddr, MMU_PAGESIZE, rm); 5583 } 5584 5585 } 5586 5587 /* 5588 * XXX I want to use cas to update nrm bits but they 5589 * currently belong in common/vm and not in hat where 5590 * they should be. 5591 * The nrm bits are protected by the same mutex as 5592 * the one that protects the page's mapping list. 5593 */ 5594 if (!pp) 5595 return; 5596 ASSERT(sfmmu_mlist_held(pp)); 5597 /* 5598 * If the tte is for a large page, we need to sync all the 5599 * pages covered by the tte. 5600 */ 5601 if (sz != TTE8K) { 5602 ASSERT(pp->p_szc != 0); 5603 pp = PP_GROUPLEADER(pp, sz); 5604 ASSERT(sfmmu_mlist_held(pp)); 5605 } 5606 5607 /* Get number of pages from tte size. 
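 *
 * The loop below just folds the referenced/modified state of one
 * (possibly large) translation into every constituent page of the group.
 * Reduced to its essentials, and assuming for the sketch that the
 * constituent pages sit in a plain array (smallpage and sync_rm are
 * invented names, not page_t):
 *
 *    #include <stdint.h>
 *
 *    #define RM_REF 0x1
 *    #define RM_MOD 0x2
 *
 *    struct smallpage {
 *        uint8_t rmflags;
 *    };
 *
 *    // 'leader' is the first of npgs constituent pages of the mapping.
 *    static void
 *    sync_rm(struct smallpage *leader, int npgs, uint8_t rm)
 *    {
 *        if (rm == 0)
 *            return;                // neither bit set: nothing to record
 *        for (int i = 0; i < npgs; i++) {
 *            if ((leader[i].rmflags & rm) != rm)
 *                leader[i].rmflags |= rm;    // only touch missing bits
 *        }
 *    }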
*/ 5608 npgs = TTEPAGES(sz); 5609 5610 do { 5611 ASSERT(pp); 5612 ASSERT(sfmmu_mlist_held(pp)); 5613 if (((rm & P_REF) != 0 && !PP_ISREF(pp)) || 5614 ((rm & P_MOD) != 0 && !PP_ISMOD(pp))) 5615 hat_page_setattr(pp, rm); 5616 5617 /* 5618 * Are we done? If not, we must have a large mapping. 5619 * For large mappings we need to sync the rest of the pages 5620 * covered by this tte; goto the next page. 5621 */ 5622 } while (--npgs > 0 && (pp = PP_PAGENEXT(pp))); 5623 } 5624 5625 /* 5626 * Execute pre-callback handler of each pa_hment linked to pp 5627 * 5628 * Inputs: 5629 * flag: either HAT_PRESUSPEND or HAT_SUSPEND. 5630 * capture_cpus: pointer to return value (below) 5631 * 5632 * Returns: 5633 * Propagates the subsystem callback return values back to the caller; 5634 * returns 0 on success. If capture_cpus is non-NULL, the value returned 5635 * is zero if all of the pa_hments are of a type that do not require 5636 * capturing CPUs prior to suspending the mapping, else it is 1. 5637 */ 5638 static int 5639 hat_pageprocess_precallbacks(struct page *pp, uint_t flag, int *capture_cpus) 5640 { 5641 struct sf_hment *sfhmep; 5642 struct pa_hment *pahmep; 5643 int (*f)(caddr_t, uint_t, uint_t, void *); 5644 int ret; 5645 id_t id; 5646 int locked = 0; 5647 kmutex_t *pml; 5648 5649 ASSERT(PAGE_EXCL(pp)); 5650 if (!sfmmu_mlist_held(pp)) { 5651 pml = sfmmu_mlist_enter(pp); 5652 locked = 1; 5653 } 5654 5655 if (capture_cpus) 5656 *capture_cpus = 0; 5657 5658 top: 5659 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) { 5660 /* 5661 * skip sf_hments corresponding to VA<->PA mappings; 5662 * for pa_hment's, hme_tte.ll is zero 5663 */ 5664 if (!IS_PAHME(sfhmep)) 5665 continue; 5666 5667 pahmep = sfhmep->hme_data; 5668 ASSERT(pahmep != NULL); 5669 5670 /* 5671 * skip if pre-handler has been called earlier in this loop 5672 */ 5673 if (pahmep->flags & flag) 5674 continue; 5675 5676 id = pahmep->cb_id; 5677 ASSERT(id >= (id_t)0 && id < sfmmu_cb_nextid); 5678 if (capture_cpus && sfmmu_cb_table[id].capture_cpus != 0) 5679 *capture_cpus = 1; 5680 if ((f = sfmmu_cb_table[id].prehandler) == NULL) { 5681 pahmep->flags |= flag; 5682 continue; 5683 } 5684 5685 /* 5686 * Drop the mapping list lock to avoid locking order issues. 5687 */ 5688 if (locked) 5689 sfmmu_mlist_exit(pml); 5690 5691 ret = f(pahmep->addr, pahmep->len, flag, pahmep->pvt); 5692 if (ret != 0) 5693 return (ret); /* caller must do the cleanup */ 5694 5695 if (locked) { 5696 pml = sfmmu_mlist_enter(pp); 5697 pahmep->flags |= flag; 5698 goto top; 5699 } 5700 5701 pahmep->flags |= flag; 5702 } 5703 5704 if (locked) 5705 sfmmu_mlist_exit(pml); 5706 5707 return (0); 5708 } 5709 5710 /* 5711 * Execute post-callback handler of each pa_hment linked to pp 5712 * 5713 * Same overall assumptions and restrictions apply as for 5714 * hat_pageprocess_precallbacks(). 
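 *
 * Both callback walks share one shape: visit each node at most once, drop
 * the list lock around the call-out, and restart the scan from the head
 * afterwards because the list may have changed while unlocked.  A
 * stand-alone sketch of that shape (struct cb, run_callbacks and the
 * lock hooks are invented for illustration):
 *
 *    #include <stddef.h>
 *
 *    struct cb {
 *        struct cb *next;
 *        int done;                  // plays the role of pahmep->flags
 *        void (*handler)(void *);
 *        void *arg;
 *    };
 *
 *    static void
 *    run_callbacks(struct cb **headp, void (*lock)(void),
 *        void (*unlock)(void))
 *    {
 *    restart:
 *        lock();
 *        for (struct cb *n = *headp; n != NULL; n = n->next) {
 *            if (n->done)
 *                continue;
 *            n->done = 1;           // mark first so the restart skips it
 *            unlock();              // can't call out with the lock held
 *            n->handler(n->arg);
 *            goto restart;          // list may have changed meanwhile
 *        }
 *        unlock();
 *    }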
5715 */ 5716 static void 5717 hat_pageprocess_postcallbacks(struct page *pp, uint_t flag) 5718 { 5719 pfn_t pgpfn = pp->p_pagenum; 5720 pfn_t pgmask = btop(page_get_pagesize(pp->p_szc)) - 1; 5721 pfn_t newpfn; 5722 struct sf_hment *sfhmep; 5723 struct pa_hment *pahmep; 5724 int (*f)(caddr_t, uint_t, uint_t, void *, pfn_t); 5725 id_t id; 5726 int locked = 0; 5727 kmutex_t *pml; 5728 5729 ASSERT(PAGE_EXCL(pp)); 5730 if (!sfmmu_mlist_held(pp)) { 5731 pml = sfmmu_mlist_enter(pp); 5732 locked = 1; 5733 } 5734 5735 top: 5736 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) { 5737 /* 5738 * skip sf_hments corresponding to VA<->PA mappings; 5739 * for pa_hment's, hme_tte.ll is zero 5740 */ 5741 if (!IS_PAHME(sfhmep)) 5742 continue; 5743 5744 pahmep = sfhmep->hme_data; 5745 ASSERT(pahmep != NULL); 5746 5747 if ((pahmep->flags & flag) == 0) 5748 continue; 5749 5750 pahmep->flags &= ~flag; 5751 5752 id = pahmep->cb_id; 5753 ASSERT(id >= (id_t)0 && id < sfmmu_cb_nextid); 5754 if ((f = sfmmu_cb_table[id].posthandler) == NULL) 5755 continue; 5756 5757 /* 5758 * Convert the base page PFN into the constituent PFN 5759 * which is needed by the callback handler. 5760 */ 5761 newpfn = pgpfn | (btop((uintptr_t)pahmep->addr) & pgmask); 5762 5763 /* 5764 * Drop the mapping list lock to avoid locking order issues. 5765 */ 5766 if (locked) 5767 sfmmu_mlist_exit(pml); 5768 5769 if (f(pahmep->addr, pahmep->len, flag, pahmep->pvt, newpfn) 5770 != 0) 5771 panic("sfmmu: posthandler failed"); 5772 5773 if (locked) { 5774 pml = sfmmu_mlist_enter(pp); 5775 goto top; 5776 } 5777 } 5778 5779 if (locked) 5780 sfmmu_mlist_exit(pml); 5781 } 5782 5783 /* 5784 * Suspend locked kernel mapping 5785 */ 5786 void 5787 hat_pagesuspend(struct page *pp) 5788 { 5789 struct sf_hment *sfhmep; 5790 sfmmu_t *sfmmup; 5791 tte_t tte, ttemod; 5792 struct hme_blk *hmeblkp; 5793 caddr_t addr; 5794 int index, cons; 5795 cpuset_t cpuset; 5796 5797 ASSERT(PAGE_EXCL(pp)); 5798 ASSERT(sfmmu_mlist_held(pp)); 5799 5800 mutex_enter(&kpr_suspendlock); 5801 5802 /* 5803 * Call into dtrace to tell it we're about to suspend a 5804 * kernel mapping. This prevents us from running into issues 5805 * with probe context trying to touch a suspended page 5806 * in the relocation codepath itself. 5807 */ 5808 if (dtrace_kreloc_init) 5809 (*dtrace_kreloc_init)(); 5810 5811 index = PP_MAPINDEX(pp); 5812 cons = TTE8K; 5813 5814 retry: 5815 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) { 5816 5817 if (IS_PAHME(sfhmep)) 5818 continue; 5819 5820 if (get_hblk_ttesz(sfmmu_hmetohblk(sfhmep)) != cons) 5821 continue; 5822 5823 /* 5824 * Loop until we successfully set the suspend bit in 5825 * the TTE. 5826 */ 5827 again: 5828 sfmmu_copytte(&sfhmep->hme_tte, &tte); 5829 ASSERT(TTE_IS_VALID(&tte)); 5830 5831 ttemod = tte; 5832 TTE_SET_SUSPEND(&ttemod); 5833 if (sfmmu_modifytte_try(&tte, &ttemod, 5834 &sfhmep->hme_tte) < 0) 5835 goto again; 5836 5837 /* 5838 * Invalidate TSB entry 5839 */ 5840 hmeblkp = sfmmu_hmetohblk(sfhmep); 5841 5842 sfmmup = hblktosfmmu(hmeblkp); 5843 ASSERT(sfmmup == ksfmmup); 5844 5845 addr = tte_to_vaddr(hmeblkp, tte); 5846 5847 /* 5848 * No need to make sure that the TSB for this sfmmu is 5849 * not being relocated since it is ksfmmup and thus it 5850 * will never be relocated. 
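 *
 * The index/cons walk at the end of this routine visits the group leader
 * for every larger mapping size recorded in the page's mapping index.
 * Its control flow reduces to the following (do_one() is a stand-in for
 * the per-size suspend pass above; the bit layout is simplified):
 *
 *    #include <stdint.h>
 *
 *    static void
 *    for_each_size_class(uint32_t index, void (*do_one)(int szc))
 *    {
 *        int cons = 0;              // smallest size class first (8K)
 *
 *        do_one(cons);              // the base size is always handled
 *        while (index != 0) {
 *            index >>= 1;
 *            if (index != 0)
 *                cons++;
 *            if (index & 0x1)
 *                do_one(cons);      // redo at this larger size class
 *        }
 *    }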
5851 */ 5852 SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp); 5853 5854 /* 5855 * Update xcall stats 5856 */ 5857 cpuset = cpu_ready_set; 5858 CPUSET_DEL(cpuset, CPU->cpu_id); 5859 5860 /* LINTED: constant in conditional context */ 5861 SFMMU_XCALL_STATS(KCONTEXT); 5862 5863 /* 5864 * Flush TLB entry on remote CPU's 5865 */ 5866 xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)addr, KCONTEXT); 5867 xt_sync(cpuset); 5868 5869 /* 5870 * Flush TLB entry on local CPU 5871 */ 5872 vtag_flushpage(addr, KCONTEXT); 5873 } 5874 5875 while (index != 0) { 5876 index = index >> 1; 5877 if (index != 0) 5878 cons++; 5879 if (index & 0x1) { 5880 pp = PP_GROUPLEADER(pp, cons); 5881 goto retry; 5882 } 5883 } 5884 } 5885 5886 #ifdef DEBUG 5887 5888 #define N_PRLE 1024 5889 struct prle { 5890 page_t *targ; 5891 page_t *repl; 5892 int status; 5893 int pausecpus; 5894 hrtime_t whence; 5895 }; 5896 5897 static struct prle page_relocate_log[N_PRLE]; 5898 static int prl_entry; 5899 static kmutex_t prl_mutex; 5900 5901 #define PAGE_RELOCATE_LOG(t, r, s, p) \ 5902 mutex_enter(&prl_mutex); \ 5903 page_relocate_log[prl_entry].targ = *(t); \ 5904 page_relocate_log[prl_entry].repl = *(r); \ 5905 page_relocate_log[prl_entry].status = (s); \ 5906 page_relocate_log[prl_entry].pausecpus = (p); \ 5907 page_relocate_log[prl_entry].whence = gethrtime(); \ 5908 prl_entry = (prl_entry == (N_PRLE - 1))? 0 : prl_entry + 1; \ 5909 mutex_exit(&prl_mutex); 5910 5911 #else /* !DEBUG */ 5912 #define PAGE_RELOCATE_LOG(t, r, s, p) 5913 #endif 5914 5915 /* 5916 * Core Kernel Page Relocation Algorithm 5917 * 5918 * Input: 5919 * 5920 * target : constituent pages are SE_EXCL locked. 5921 * replacement: constituent pages are SE_EXCL locked. 5922 * 5923 * Output: 5924 * 5925 * nrelocp: number of pages relocated 5926 */ 5927 int 5928 hat_page_relocate(page_t **target, page_t **replacement, spgcnt_t *nrelocp) 5929 { 5930 page_t *targ, *repl; 5931 page_t *tpp, *rpp; 5932 kmutex_t *low, *high; 5933 spgcnt_t npages, i; 5934 page_t *pl = NULL; 5935 int old_pil; 5936 cpuset_t cpuset; 5937 int cap_cpus; 5938 int ret; 5939 5940 if (hat_kpr_enabled == 0 || !kcage_on || PP_ISNORELOC(*target)) { 5941 PAGE_RELOCATE_LOG(target, replacement, EAGAIN, -1); 5942 return (EAGAIN); 5943 } 5944 5945 mutex_enter(&kpr_mutex); 5946 kreloc_thread = curthread; 5947 5948 targ = *target; 5949 repl = *replacement; 5950 ASSERT(repl != NULL); 5951 ASSERT(targ->p_szc == repl->p_szc); 5952 5953 npages = page_get_pagecnt(targ->p_szc); 5954 5955 /* 5956 * unload VA<->PA mappings that are not locked 5957 */ 5958 tpp = targ; 5959 for (i = 0; i < npages; i++) { 5960 (void) hat_pageunload(tpp, SFMMU_KERNEL_RELOC); 5961 tpp++; 5962 } 5963 5964 /* 5965 * Do "presuspend" callbacks, in a context from which we can still 5966 * block as needed. Note that we don't hold the mapping list lock 5967 * of "targ" at this point due to potential locking order issues; 5968 * we assume that between the hat_pageunload() above and holding 5969 * the SE_EXCL lock that the mapping list *cannot* change at this 5970 * point. 5971 */ 5972 ret = hat_pageprocess_precallbacks(targ, HAT_PRESUSPEND, &cap_cpus); 5973 if (ret != 0) { 5974 /* 5975 * EIO translates to fatal error, for all others cleanup 5976 * and return EAGAIN. 
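 *
 * Callers are expected to treat EAGAIN as retryable.  A hypothetical
 * caller sketch (targ and repl are illustrative page_t pointers):
 *
 *	spgcnt_t nreloc;
 *
 *	if (hat_page_relocate(&targ, &repl, &nreloc) == EAGAIN)
 *		... leave the page where it is and retry later ...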
5977 */ 5978 ASSERT(ret != EIO); 5979 hat_pageprocess_postcallbacks(targ, HAT_POSTUNSUSPEND); 5980 PAGE_RELOCATE_LOG(target, replacement, ret, -1); 5981 kreloc_thread = NULL; 5982 mutex_exit(&kpr_mutex); 5983 return (EAGAIN); 5984 } 5985 5986 /* 5987 * acquire p_mapping list lock for both the target and replacement 5988 * root pages. 5989 * 5990 * low and high refer to the need to grab the mlist locks in a 5991 * specific order in order to prevent race conditions. Thus the 5992 * lower lock must be grabbed before the higher lock. 5993 * 5994 * This will block hat_unload's accessing p_mapping list. Since 5995 * we have SE_EXCL lock, hat_memload and hat_pageunload will be 5996 * blocked. Thus, no one else will be accessing the p_mapping list 5997 * while we suspend and reload the locked mapping below. 5998 */ 5999 tpp = targ; 6000 rpp = repl; 6001 sfmmu_mlist_reloc_enter(tpp, rpp, &low, &high); 6002 6003 kpreempt_disable(); 6004 6005 /* 6006 * If the replacement page is of a different virtual color 6007 * than the page it is replacing, we need to handle the VAC 6008 * consistency for it just as we would if we were setting up 6009 * a new mapping to a page. 6010 */ 6011 if ((tpp->p_szc == 0) && (PP_GET_VCOLOR(rpp) != NO_VCOLOR)) { 6012 if (tpp->p_vcolor != rpp->p_vcolor) { 6013 sfmmu_cache_flushcolor(PP_GET_VCOLOR(rpp), 6014 rpp->p_pagenum); 6015 } 6016 } 6017 6018 /* 6019 * We raise our PIL to 13 so that we don't get captured by 6020 * another CPU or pinned by an interrupt thread. We can't go to 6021 * PIL 14 since the nexus driver(s) may need to interrupt at 6022 * that level in the case of IOMMU pseudo mappings. 6023 */ 6024 cpuset = cpu_ready_set; 6025 CPUSET_DEL(cpuset, CPU->cpu_id); 6026 if (!cap_cpus || CPUSET_ISNULL(cpuset)) { 6027 old_pil = splr(XCALL_PIL); 6028 } else { 6029 old_pil = -1; 6030 xc_attention(cpuset); 6031 } 6032 ASSERT(getpil() == XCALL_PIL); 6033 6034 /* 6035 * Now do suspend callbacks. In the case of an IOMMU mapping 6036 * this will suspend all DMA activity to the page while it is 6037 * being relocated. Since we are well above LOCK_LEVEL and CPUs 6038 * may be captured at this point we should have acquired any needed 6039 * locks in the presuspend callback. 6040 */ 6041 ret = hat_pageprocess_precallbacks(targ, HAT_SUSPEND, NULL); 6042 if (ret != 0) { 6043 repl = targ; 6044 goto suspend_fail; 6045 } 6046 6047 /* 6048 * Raise the PIL yet again, this time to block all high-level 6049 * interrupts on this CPU. This is necessary to prevent an 6050 * interrupt routine from pinning the thread which holds the 6051 * mapping suspended and then touching the suspended page. 6052 * 6053 * Once the page is suspended we also need to be careful to 6054 * avoid calling any functions which touch any seg_kmem memory 6055 * since that memory may be backed by the very page we are 6056 * relocating in here! 6057 */ 6058 hat_pagesuspend(targ); 6059 6060 /* 6061 * Now that we are confident everybody has stopped using this page, 6062 * copy the page contents. Note we use a physical copy to prevent 6063 * locking issues and to avoid fpRAS because we can't handle it in 6064 * this context. 6065 */ 6066 for (i = 0; i < npages; i++, tpp++, rpp++) { 6067 /* 6068 * Copy the contents of the page. 6069 */ 6070 ppcopy_kernel(tpp, rpp); 6071 } 6072 6073 tpp = targ; 6074 rpp = repl; 6075 for (i = 0; i < npages; i++, tpp++, rpp++) { 6076 /* 6077 * Copy attributes. VAC consistency was handled above, 6078 * if required. 
6079 */ 6080 rpp->p_nrm = tpp->p_nrm; 6081 tpp->p_nrm = 0; 6082 rpp->p_index = tpp->p_index; 6083 tpp->p_index = 0; 6084 rpp->p_vcolor = tpp->p_vcolor; 6085 } 6086 6087 /* 6088 * First, unsuspend the page, if we set the suspend bit, and transfer 6089 * the mapping list from the target page to the replacement page. 6090 * Next process postcallbacks; since pa_hment's are linked only to the 6091 * p_mapping list of root page, we don't iterate over the constituent 6092 * pages. 6093 */ 6094 hat_pagereload(targ, repl); 6095 6096 suspend_fail: 6097 hat_pageprocess_postcallbacks(repl, HAT_UNSUSPEND); 6098 6099 /* 6100 * Now lower our PIL and release any captured CPUs since we 6101 * are out of the "danger zone". After this it will again be 6102 * safe to acquire adaptive mutex locks, or to drop them... 6103 */ 6104 if (old_pil != -1) { 6105 splx(old_pil); 6106 } else { 6107 xc_dismissed(cpuset); 6108 } 6109 6110 kpreempt_enable(); 6111 6112 sfmmu_mlist_reloc_exit(low, high); 6113 6114 /* 6115 * Postsuspend callbacks should drop any locks held across 6116 * the suspend callbacks. As before, we don't hold the mapping 6117 * list lock at this point.. our assumption is that the mapping 6118 * list still can't change due to our holding SE_EXCL lock and 6119 * there being no unlocked mappings left. Hence the restriction 6120 * on calling context to hat_delete_callback() 6121 */ 6122 hat_pageprocess_postcallbacks(repl, HAT_POSTUNSUSPEND); 6123 if (ret != 0) { 6124 /* 6125 * The second presuspend call failed: we got here through 6126 * the suspend_fail label above. 6127 */ 6128 ASSERT(ret != EIO); 6129 PAGE_RELOCATE_LOG(target, replacement, ret, cap_cpus); 6130 kreloc_thread = NULL; 6131 mutex_exit(&kpr_mutex); 6132 return (EAGAIN); 6133 } 6134 6135 /* 6136 * Now that we're out of the performance critical section we can 6137 * take care of updating the hash table, since we still 6138 * hold all the pages locked SE_EXCL at this point we 6139 * needn't worry about things changing out from under us. 6140 */ 6141 tpp = targ; 6142 rpp = repl; 6143 for (i = 0; i < npages; i++, tpp++, rpp++) { 6144 6145 /* 6146 * replace targ with replacement in page_hash table 6147 */ 6148 targ = tpp; 6149 page_relocate_hash(rpp, targ); 6150 6151 /* 6152 * concatenate target; caller of platform_page_relocate() 6153 * expects target to be concatenated after returning. 6154 */ 6155 ASSERT(targ->p_next == targ); 6156 ASSERT(targ->p_prev == targ); 6157 page_list_concat(&pl, &targ); 6158 } 6159 6160 ASSERT(*target == pl); 6161 *nrelocp = npages; 6162 PAGE_RELOCATE_LOG(target, replacement, 0, cap_cpus); 6163 kreloc_thread = NULL; 6164 mutex_exit(&kpr_mutex); 6165 return (0); 6166 } 6167 6168 /* 6169 * Called when stray pa_hments are found attached to a page which is 6170 * being freed. Notify the subsystem which attached the pa_hment of 6171 * the error if it registered a suitable handler, else panic. 6172 */ 6173 static void 6174 sfmmu_pahment_leaked(struct pa_hment *pahmep) 6175 { 6176 id_t cb_id = pahmep->cb_id; 6177 6178 ASSERT(cb_id >= (id_t)0 && cb_id < sfmmu_cb_nextid); 6179 if (sfmmu_cb_table[cb_id].errhandler != NULL) { 6180 if (sfmmu_cb_table[cb_id].errhandler(pahmep->addr, pahmep->len, 6181 HAT_CB_ERR_LEAKED, pahmep->pvt) == 0) 6182 return; /* non-fatal */ 6183 } 6184 panic("pa_hment leaked: 0x%p", pahmep); 6185 } 6186 6187 /* 6188 * Remove all mappings to page 'pp'. 
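 *
 * A hypothetical caller sketch (HAT_FORCE_PGUNLOAD is assumed to be the
 * generic forceflag from <vm/hat.h>; SFMMU_KERNEL_RELOC is reserved for
 * the kernel page relocation path above, and the page must be held
 * SE_EXCL as asserted below):
 *
 *	ASSERT(PAGE_EXCL(pp));
 *	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);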
6189 */ 6190 int 6191 hat_pageunload(struct page *pp, uint_t forceflag) 6192 { 6193 struct page *origpp = pp; 6194 struct sf_hment *sfhme, *tmphme; 6195 struct hme_blk *hmeblkp; 6196 kmutex_t *pml, *pmtx; 6197 cpuset_t cpuset, tset; 6198 int index, cons; 6199 int xhme_blks; 6200 int pa_hments; 6201 6202 ASSERT(PAGE_EXCL(pp)); 6203 6204 retry_xhat: 6205 tmphme = NULL; 6206 xhme_blks = 0; 6207 pa_hments = 0; 6208 CPUSET_ZERO(cpuset); 6209 6210 pml = sfmmu_mlist_enter(pp); 6211 6212 if (pp->p_kpmref) 6213 sfmmu_kpm_pageunload(pp); 6214 ASSERT(!PP_ISMAPPED_KPM(pp)); 6215 6216 index = PP_MAPINDEX(pp); 6217 cons = TTE8K; 6218 retry: 6219 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) { 6220 tmphme = sfhme->hme_next; 6221 6222 if (IS_PAHME(sfhme)) { 6223 ASSERT(sfhme->hme_data != NULL); 6224 pa_hments++; 6225 continue; 6226 } 6227 6228 hmeblkp = sfmmu_hmetohblk(sfhme); 6229 if (hmeblkp->hblk_xhat_bit) { 6230 struct xhat_hme_blk *xblk = 6231 (struct xhat_hme_blk *)hmeblkp; 6232 6233 (void) XHAT_PAGEUNLOAD(xblk->xhat_hme_blk_hat, 6234 pp, forceflag, XBLK2PROVBLK(xblk)); 6235 6236 xhme_blks = 1; 6237 continue; 6238 } 6239 6240 /* 6241 * If there are kernel mappings don't unload them, they will 6242 * be suspended. 6243 */ 6244 if (forceflag == SFMMU_KERNEL_RELOC && hmeblkp->hblk_lckcnt && 6245 hmeblkp->hblk_tag.htag_id == ksfmmup) 6246 continue; 6247 6248 tset = sfmmu_pageunload(pp, sfhme, cons); 6249 CPUSET_OR(cpuset, tset); 6250 } 6251 6252 while (index != 0) { 6253 index = index >> 1; 6254 if (index != 0) 6255 cons++; 6256 if (index & 0x1) { 6257 /* Go to leading page */ 6258 pp = PP_GROUPLEADER(pp, cons); 6259 ASSERT(sfmmu_mlist_held(pp)); 6260 goto retry; 6261 } 6262 } 6263 6264 /* 6265 * cpuset may be empty if the page was only mapped by segkpm, 6266 * in which case we won't actually cross-trap. 6267 */ 6268 xt_sync(cpuset); 6269 6270 /* 6271 * The page should have no mappings at this point, unless 6272 * we were called from hat_page_relocate() in which case we 6273 * leave the locked mappings which will be suspended later. 6274 */ 6275 ASSERT(!PP_ISMAPPED(origpp) || xhme_blks || pa_hments || 6276 (forceflag == SFMMU_KERNEL_RELOC)); 6277 6278 if (PP_ISTNC(pp)) { 6279 if (cons == TTE8K) { 6280 pmtx = sfmmu_page_enter(pp); 6281 PP_CLRTNC(pp); 6282 sfmmu_page_exit(pmtx); 6283 } else { 6284 conv_tnc(pp, cons); 6285 } 6286 } 6287 6288 if (pa_hments && forceflag != SFMMU_KERNEL_RELOC) { 6289 /* 6290 * Unlink any pa_hments and free them, calling back 6291 * the responsible subsystem to notify it of the error. 6292 * This can occur in situations such as drivers leaking 6293 * DMA handles: naughty, but common enough that we'd like 6294 * to keep the system running rather than bringing it 6295 * down with an obscure error like "pa_hment leaked" 6296 * which doesn't aid the user in debugging their driver. 6297 */ 6298 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) { 6299 tmphme = sfhme->hme_next; 6300 if (IS_PAHME(sfhme)) { 6301 struct pa_hment *pahmep = sfhme->hme_data; 6302 sfmmu_pahment_leaked(pahmep); 6303 HME_SUB(sfhme, pp); 6304 kmem_cache_free(pa_hment_cache, pahmep); 6305 } 6306 } 6307 6308 ASSERT(!PP_ISMAPPED(origpp) || xhme_blks); 6309 } 6310 6311 sfmmu_mlist_exit(pml); 6312 6313 /* 6314 * XHAT may not have finished unloading pages 6315 * because some other thread was waiting for 6316 * mlist lock and XHAT_PAGEUNLOAD let it do 6317 * the job. 
6318 */ 6319 if (xhme_blks) { 6320 pp = origpp; 6321 goto retry_xhat; 6322 } 6323 6324 return (0); 6325 } 6326 6327 static cpuset_t 6328 sfmmu_pageunload(page_t *pp, struct sf_hment *sfhme, int cons) 6329 { 6330 struct hme_blk *hmeblkp; 6331 sfmmu_t *sfmmup; 6332 tte_t tte, ttemod; 6333 #ifdef DEBUG 6334 tte_t orig_old; 6335 #endif /* DEBUG */ 6336 caddr_t addr; 6337 int ttesz; 6338 int ret; 6339 cpuset_t cpuset; 6340 6341 ASSERT(pp != NULL); 6342 ASSERT(sfmmu_mlist_held(pp)); 6343 ASSERT(pp->p_vnode != &kvp); 6344 6345 CPUSET_ZERO(cpuset); 6346 6347 hmeblkp = sfmmu_hmetohblk(sfhme); 6348 6349 readtte: 6350 sfmmu_copytte(&sfhme->hme_tte, &tte); 6351 if (TTE_IS_VALID(&tte)) { 6352 sfmmup = hblktosfmmu(hmeblkp); 6353 ttesz = get_hblk_ttesz(hmeblkp); 6354 /* 6355 * Only unload mappings of 'cons' size. 6356 */ 6357 if (ttesz != cons) 6358 return (cpuset); 6359 6360 /* 6361 * Note that we have p_mapping lock, but no hash lock here. 6362 * hblk_unload() has to have both hash lock AND p_mapping 6363 * lock before it tries to modify tte. So, the tte could 6364 * not become invalid in the sfmmu_modifytte_try() below. 6365 */ 6366 ttemod = tte; 6367 #ifdef DEBUG 6368 orig_old = tte; 6369 #endif /* DEBUG */ 6370 6371 TTE_SET_INVALID(&ttemod); 6372 ret = sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte); 6373 if (ret < 0) { 6374 #ifdef DEBUG 6375 /* only R/M bits can change. */ 6376 chk_tte(&orig_old, &tte, &ttemod, hmeblkp); 6377 #endif /* DEBUG */ 6378 goto readtte; 6379 } 6380 6381 if (ret == 0) { 6382 panic("pageunload: cas failed?"); 6383 } 6384 6385 addr = tte_to_vaddr(hmeblkp, tte); 6386 6387 sfmmu_ttesync(sfmmup, addr, &tte, pp); 6388 6389 atomic_add_long(&sfmmup->sfmmu_ttecnt[ttesz], -1); 6390 6391 /* 6392 * We need to flush the page from the virtual cache 6393 * in order to prevent a virtual cache alias 6394 * inconsistency. The particular scenario we need 6395 * to worry about is: 6396 * Given: va1 and va2 are two virtual address that 6397 * alias and will map the same physical address. 6398 * 1. mapping exists from va1 to pa and data has 6399 * been read into the cache. 6400 * 2. unload va1. 6401 * 3. load va2 and modify data using va2. 6402 * 4 unload va2. 6403 * 5. load va1 and reference data. Unless we flush 6404 * the data cache when we unload we will get 6405 * stale data. 6406 * This scenario is taken care of by using virtual 6407 * page coloring. 6408 */ 6409 if (sfmmup->sfmmu_ismhat) { 6410 /* 6411 * Flush TSBs, TLBs and caches 6412 * of every process 6413 * sharing this ism segment. 6414 */ 6415 sfmmu_hat_lock_all(); 6416 mutex_enter(&ism_mlist_lock); 6417 kpreempt_disable(); 6418 if (do_virtual_coloring) 6419 sfmmu_ismtlbcache_demap(addr, sfmmup, hmeblkp, 6420 pp->p_pagenum, CACHE_NO_FLUSH); 6421 else 6422 sfmmu_ismtlbcache_demap(addr, sfmmup, hmeblkp, 6423 pp->p_pagenum, CACHE_FLUSH); 6424 kpreempt_enable(); 6425 mutex_exit(&ism_mlist_lock); 6426 sfmmu_hat_unlock_all(); 6427 cpuset = cpu_ready_set; 6428 } else if (do_virtual_coloring) { 6429 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0); 6430 cpuset = sfmmup->sfmmu_cpusran; 6431 } else { 6432 sfmmu_tlbcache_demap(addr, sfmmup, hmeblkp, 6433 pp->p_pagenum, 0, FLUSH_NECESSARY_CPUS, 6434 CACHE_FLUSH, 0); 6435 cpuset = sfmmup->sfmmu_cpusran; 6436 } 6437 6438 /* 6439 * Hme_sub has to run after ttesync() and a_rss update. 6440 * See hblk_unload(). 
6441 */ 6442 HME_SUB(sfhme, pp); 6443 membar_stst(); 6444 6445 /* 6446 * We can not make ASSERT(hmeblkp->hblk_hmecnt <= NHMENTS) 6447 * since pteload may have done a HME_ADD() right after 6448 * we did the HME_SUB() above. Hmecnt is now maintained 6449 * by cas only. no lock guranteed its value. The only 6450 * gurantee we have is the hmecnt should not be less than 6451 * what it should be so the hblk will not be taken away. 6452 * It's also important that we decremented the hmecnt after 6453 * we are done with hmeblkp so that this hmeblk won't be 6454 * stolen. 6455 */ 6456 ASSERT(hmeblkp->hblk_hmecnt > 0); 6457 ASSERT(hmeblkp->hblk_vcnt > 0); 6458 atomic_add_16(&hmeblkp->hblk_vcnt, -1); 6459 atomic_add_16(&hmeblkp->hblk_hmecnt, -1); 6460 /* 6461 * This is bug 4063182. 6462 * XXX: fixme 6463 * ASSERT(hmeblkp->hblk_hmecnt || hmeblkp->hblk_vcnt || 6464 * !hmeblkp->hblk_lckcnt); 6465 */ 6466 } else { 6467 panic("invalid tte? pp %p &tte %p", 6468 (void *)pp, (void *)&tte); 6469 } 6470 6471 return (cpuset); 6472 } 6473 6474 /* 6475 * While relocating a kernel page, this function will move the mappings 6476 * from tpp to dpp and modify any associated data with these mappings. 6477 * It also unsuspends the suspended kernel mapping. 6478 */ 6479 static void 6480 hat_pagereload(struct page *tpp, struct page *dpp) 6481 { 6482 struct sf_hment *sfhme; 6483 tte_t tte, ttemod; 6484 int index, cons; 6485 6486 ASSERT(getpil() == PIL_MAX); 6487 ASSERT(sfmmu_mlist_held(tpp)); 6488 ASSERT(sfmmu_mlist_held(dpp)); 6489 6490 index = PP_MAPINDEX(tpp); 6491 cons = TTE8K; 6492 6493 /* Update real mappings to the page */ 6494 retry: 6495 for (sfhme = tpp->p_mapping; sfhme != NULL; sfhme = sfhme->hme_next) { 6496 if (IS_PAHME(sfhme)) 6497 continue; 6498 sfmmu_copytte(&sfhme->hme_tte, &tte); 6499 ttemod = tte; 6500 6501 /* 6502 * replace old pfn with new pfn in TTE 6503 */ 6504 PFN_TO_TTE(ttemod, dpp->p_pagenum); 6505 6506 /* 6507 * clear suspend bit 6508 */ 6509 ASSERT(TTE_IS_SUSPEND(&ttemod)); 6510 TTE_CLR_SUSPEND(&ttemod); 6511 6512 if (sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte) < 0) 6513 panic("hat_pagereload(): sfmmu_modifytte_try() failed"); 6514 6515 /* 6516 * set hme_page point to new page 6517 */ 6518 sfhme->hme_page = dpp; 6519 } 6520 6521 /* 6522 * move p_mapping list from old page to new page 6523 */ 6524 dpp->p_mapping = tpp->p_mapping; 6525 tpp->p_mapping = NULL; 6526 dpp->p_share = tpp->p_share; 6527 tpp->p_share = 0; 6528 6529 while (index != 0) { 6530 index = index >> 1; 6531 if (index != 0) 6532 cons++; 6533 if (index & 0x1) { 6534 tpp = PP_GROUPLEADER(tpp, cons); 6535 dpp = PP_GROUPLEADER(dpp, cons); 6536 goto retry; 6537 } 6538 } 6539 6540 if (dtrace_kreloc_fini) 6541 (*dtrace_kreloc_fini)(); 6542 mutex_exit(&kpr_suspendlock); 6543 } 6544 6545 uint_t 6546 hat_pagesync(struct page *pp, uint_t clearflag) 6547 { 6548 struct sf_hment *sfhme, *tmphme = NULL; 6549 struct hme_blk *hmeblkp; 6550 kmutex_t *pml; 6551 cpuset_t cpuset, tset; 6552 int index, cons; 6553 extern ulong_t po_share; 6554 page_t *save_pp = pp; 6555 6556 CPUSET_ZERO(cpuset); 6557 6558 if (PP_ISRO(pp) && (clearflag & HAT_SYNC_STOPON_MOD)) { 6559 return (PP_GENERIC_ATTR(pp)); 6560 } 6561 6562 if ((clearflag == (HAT_SYNC_STOPON_REF | HAT_SYNC_DONTZERO)) && 6563 PP_ISREF(pp)) { 6564 return (PP_GENERIC_ATTR(pp)); 6565 } 6566 6567 if ((clearflag == (HAT_SYNC_STOPON_MOD | HAT_SYNC_DONTZERO)) && 6568 PP_ISMOD(pp)) { 6569 return (PP_GENERIC_ATTR(pp)); 6570 } 6571 6572 if ((clearflag & HAT_SYNC_STOPON_SHARED) != 0 && 6573 (pp->p_share 
> po_share) && 6574 !(clearflag & HAT_SYNC_ZERORM)) { 6575 if (PP_ISRO(pp)) 6576 hat_page_setattr(pp, P_REF); 6577 return (PP_GENERIC_ATTR(pp)); 6578 } 6579 6580 clearflag &= ~HAT_SYNC_STOPON_SHARED; 6581 pml = sfmmu_mlist_enter(pp); 6582 index = PP_MAPINDEX(pp); 6583 cons = TTE8K; 6584 retry: 6585 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) { 6586 /* 6587 * We need to save the next hment on the list since 6588 * it is possible for pagesync to remove an invalid hment 6589 * from the list. 6590 */ 6591 tmphme = sfhme->hme_next; 6592 /* 6593 * If we are looking for large mappings and this hme doesn't 6594 * reach the range we are seeking, just ignore its. 6595 */ 6596 hmeblkp = sfmmu_hmetohblk(sfhme); 6597 if (hmeblkp->hblk_xhat_bit) 6598 continue; 6599 6600 if (hme_size(sfhme) < cons) 6601 continue; 6602 tset = sfmmu_pagesync(pp, sfhme, 6603 clearflag & ~HAT_SYNC_STOPON_RM); 6604 CPUSET_OR(cpuset, tset); 6605 /* 6606 * If clearflag is HAT_SYNC_DONTZERO, break out as soon 6607 * as the "ref" or "mod" is set. 6608 */ 6609 if ((clearflag & ~HAT_SYNC_STOPON_RM) == HAT_SYNC_DONTZERO && 6610 ((clearflag & HAT_SYNC_STOPON_MOD) && PP_ISMOD(save_pp)) || 6611 ((clearflag & HAT_SYNC_STOPON_REF) && PP_ISREF(save_pp))) { 6612 index = 0; 6613 break; 6614 } 6615 } 6616 6617 while (index) { 6618 index = index >> 1; 6619 cons++; 6620 if (index & 0x1) { 6621 /* Go to leading page */ 6622 pp = PP_GROUPLEADER(pp, cons); 6623 goto retry; 6624 } 6625 } 6626 6627 xt_sync(cpuset); 6628 sfmmu_mlist_exit(pml); 6629 return (PP_GENERIC_ATTR(save_pp)); 6630 } 6631 6632 /* 6633 * Get all the hardware dependent attributes for a page struct 6634 */ 6635 static cpuset_t 6636 sfmmu_pagesync(struct page *pp, struct sf_hment *sfhme, 6637 uint_t clearflag) 6638 { 6639 caddr_t addr; 6640 tte_t tte, ttemod; 6641 struct hme_blk *hmeblkp; 6642 int ret; 6643 sfmmu_t *sfmmup; 6644 cpuset_t cpuset; 6645 6646 ASSERT(pp != NULL); 6647 ASSERT(sfmmu_mlist_held(pp)); 6648 ASSERT((clearflag == HAT_SYNC_DONTZERO) || 6649 (clearflag == HAT_SYNC_ZERORM)); 6650 6651 SFMMU_STAT(sf_pagesync); 6652 6653 CPUSET_ZERO(cpuset); 6654 6655 sfmmu_pagesync_retry: 6656 6657 sfmmu_copytte(&sfhme->hme_tte, &tte); 6658 if (TTE_IS_VALID(&tte)) { 6659 hmeblkp = sfmmu_hmetohblk(sfhme); 6660 sfmmup = hblktosfmmu(hmeblkp); 6661 addr = tte_to_vaddr(hmeblkp, tte); 6662 if (clearflag == HAT_SYNC_ZERORM) { 6663 ttemod = tte; 6664 TTE_CLR_RM(&ttemod); 6665 ret = sfmmu_modifytte_try(&tte, &ttemod, 6666 &sfhme->hme_tte); 6667 if (ret < 0) { 6668 /* 6669 * cas failed and the new value is not what 6670 * we want. 6671 */ 6672 goto sfmmu_pagesync_retry; 6673 } 6674 6675 if (ret > 0) { 6676 /* we win the cas */ 6677 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0); 6678 cpuset = sfmmup->sfmmu_cpusran; 6679 } 6680 } 6681 6682 sfmmu_ttesync(sfmmup, addr, &tte, pp); 6683 } 6684 return (cpuset); 6685 } 6686 6687 /* 6688 * Remove write permission from a mappings to a page, so that 6689 * we can detect the next modification of it. This requires modifying 6690 * the TTE then invalidating (demap) any TLB entry using that TTE. 6691 * This code is similar to sfmmu_pagesync(). 
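 *
 * Both the write-enable and the mod bit are cleared in a single
 * sfmmu_modifytte_try() update, and the TLB entry is then demapped so
 * that the next store to the page takes a protection fault and comes
 * back through the HAT, which re-dirties the page.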
6692 */ 6693 static cpuset_t 6694 sfmmu_pageclrwrt(struct page *pp, struct sf_hment *sfhme) 6695 { 6696 caddr_t addr; 6697 tte_t tte; 6698 tte_t ttemod; 6699 struct hme_blk *hmeblkp; 6700 int ret; 6701 sfmmu_t *sfmmup; 6702 cpuset_t cpuset; 6703 6704 ASSERT(pp != NULL); 6705 ASSERT(sfmmu_mlist_held(pp)); 6706 6707 CPUSET_ZERO(cpuset); 6708 SFMMU_STAT(sf_clrwrt); 6709 6710 retry: 6711 6712 sfmmu_copytte(&sfhme->hme_tte, &tte); 6713 if (TTE_IS_VALID(&tte) && TTE_IS_WRITABLE(&tte)) { 6714 hmeblkp = sfmmu_hmetohblk(sfhme); 6715 6716 /* 6717 * xhat mappings should never be to a VMODSORT page. 6718 */ 6719 ASSERT(hmeblkp->hblk_xhat_bit == 0); 6720 6721 sfmmup = hblktosfmmu(hmeblkp); 6722 addr = tte_to_vaddr(hmeblkp, tte); 6723 6724 ttemod = tte; 6725 TTE_CLR_WRT(&ttemod); 6726 TTE_CLR_MOD(&ttemod); 6727 ret = sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte); 6728 6729 /* 6730 * if cas failed and the new value is not what 6731 * we want retry 6732 */ 6733 if (ret < 0) 6734 goto retry; 6735 6736 /* we win the cas */ 6737 if (ret > 0) { 6738 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0); 6739 cpuset = sfmmup->sfmmu_cpusran; 6740 } 6741 } 6742 6743 return (cpuset); 6744 } 6745 6746 /* 6747 * Walk all mappings of a page, removing write permission and clearing the 6748 * ref/mod bits. This code is similar to hat_pagesync() 6749 */ 6750 static void 6751 hat_page_clrwrt(page_t *pp) 6752 { 6753 struct sf_hment *sfhme; 6754 struct sf_hment *tmphme = NULL; 6755 kmutex_t *pml; 6756 cpuset_t cpuset; 6757 cpuset_t tset; 6758 int index; 6759 int cons; 6760 6761 CPUSET_ZERO(cpuset); 6762 6763 pml = sfmmu_mlist_enter(pp); 6764 index = PP_MAPINDEX(pp); 6765 cons = TTE8K; 6766 retry: 6767 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) { 6768 tmphme = sfhme->hme_next; 6769 6770 /* 6771 * If we are looking for large mappings and this hme doesn't 6772 * reach the range we are seeking, just ignore its. 6773 */ 6774 6775 if (hme_size(sfhme) < cons) 6776 continue; 6777 6778 tset = sfmmu_pageclrwrt(pp, sfhme); 6779 CPUSET_OR(cpuset, tset); 6780 } 6781 6782 while (index) { 6783 index = index >> 1; 6784 cons++; 6785 if (index & 0x1) { 6786 /* Go to leading page */ 6787 pp = PP_GROUPLEADER(pp, cons); 6788 goto retry; 6789 } 6790 } 6791 6792 xt_sync(cpuset); 6793 sfmmu_mlist_exit(pml); 6794 } 6795 6796 /* 6797 * Set the given REF/MOD/RO bits for the given page. 6798 * For a vnode with a sorted v_pages list, we need to change 6799 * the attributes and the v_pages list together under page_vnode_mutex. 6800 */ 6801 void 6802 hat_page_setattr(page_t *pp, uint_t flag) 6803 { 6804 vnode_t *vp = pp->p_vnode; 6805 page_t **listp; 6806 kmutex_t *pmtx; 6807 kmutex_t *vphm = NULL; 6808 6809 ASSERT(!(flag & ~(P_MOD | P_REF | P_RO))); 6810 6811 /* 6812 * nothing to do if attribute already set 6813 */ 6814 if ((pp->p_nrm & flag) == flag) 6815 return; 6816 6817 if ((flag & P_MOD) != 0 && vp != NULL && IS_VMODSORT(vp)) { 6818 vphm = page_vnode_mutex(vp); 6819 mutex_enter(vphm); 6820 } 6821 6822 pmtx = sfmmu_page_enter(pp); 6823 pp->p_nrm |= flag; 6824 sfmmu_page_exit(pmtx); 6825 6826 if (vphm != NULL) { 6827 /* 6828 * Some File Systems examine v_pages for NULL w/o 6829 * grabbing the vphm mutex. Must not let it become NULL when 6830 * pp is the only page on the list. 
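 *
 * Hence the p_vpnext != pp check below: when other pages are present,
 * the page is unlinked and re-linked at the tail of v_pages (the
 * matching code in hat_page_clrattr() re-links at the head), which is
 * how VMODSORT keeps dirty and clean pages grouped; when pp is the only
 * page it is left alone so v_pages never goes NULL, even transiently.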
6831 */ 6832 if (pp->p_vpnext != pp) { 6833 page_vpsub(&vp->v_pages, pp); 6834 if (vp->v_pages != NULL) 6835 listp = &vp->v_pages->p_vpprev->p_vpnext; 6836 else 6837 listp = &vp->v_pages; 6838 page_vpadd(listp, pp); 6839 } 6840 mutex_exit(vphm); 6841 } 6842 } 6843 6844 void 6845 hat_page_clrattr(page_t *pp, uint_t flag) 6846 { 6847 vnode_t *vp = pp->p_vnode; 6848 kmutex_t *vphm = NULL; 6849 kmutex_t *pmtx; 6850 6851 ASSERT(!(flag & ~(P_MOD | P_REF | P_RO))); 6852 6853 /* 6854 * For vnode with a sorted v_pages list, we need to change 6855 * the attributes and the v_pages list together under page_vnode_mutex. 6856 */ 6857 if ((flag & P_MOD) != 0 && vp != NULL && IS_VMODSORT(vp)) { 6858 vphm = page_vnode_mutex(vp); 6859 mutex_enter(vphm); 6860 } 6861 6862 pmtx = sfmmu_page_enter(pp); 6863 pp->p_nrm &= ~flag; 6864 sfmmu_page_exit(pmtx); 6865 6866 if (vphm != NULL) { 6867 /* 6868 * Some File Systems examine v_pages for NULL w/o 6869 * grabbing the vphm mutex. Must not let it become NULL when 6870 * pp is the only page on the list. 6871 */ 6872 if (pp->p_vpnext != pp) { 6873 page_vpsub(&vp->v_pages, pp); 6874 page_vpadd(&vp->v_pages, pp); 6875 } 6876 mutex_exit(vphm); 6877 6878 /* 6879 * VMODSORT works by removing write permissions and getting 6880 * a fault when a page is made dirty. At this point 6881 * we need to remove write permission from all mappings 6882 * to this page. 6883 */ 6884 hat_page_clrwrt(pp); 6885 } 6886 } 6887 6888 6889 uint_t 6890 hat_page_getattr(page_t *pp, uint_t flag) 6891 { 6892 ASSERT(!(flag & ~(P_MOD | P_REF | P_RO))); 6893 return ((uint_t)(pp->p_nrm & flag)); 6894 } 6895 6896 /* 6897 * DEBUG kernels: verify that a kernel va<->pa translation 6898 * is safe by checking the underlying page_t is in a page 6899 * relocation-safe state. 6900 */ 6901 #ifdef DEBUG 6902 void 6903 sfmmu_check_kpfn(pfn_t pfn) 6904 { 6905 page_t *pp; 6906 int index, cons; 6907 6908 if (hat_check_vtop == 0) 6909 return; 6910 6911 if (hat_kpr_enabled == 0 || kvseg.s_base == NULL || panicstr) 6912 return; 6913 6914 pp = page_numtopp_nolock(pfn); 6915 if (!pp) 6916 return; 6917 6918 if (PAGE_LOCKED(pp) || PP_ISNORELOC(pp)) 6919 return; 6920 6921 /* 6922 * Handed a large kernel page, we dig up the root page since we 6923 * know the root page might have the lock also. 6924 */ 6925 if (pp->p_szc != 0) { 6926 index = PP_MAPINDEX(pp); 6927 cons = TTE8K; 6928 again: 6929 while (index != 0) { 6930 index >>= 1; 6931 if (index != 0) 6932 cons++; 6933 if (index & 0x1) { 6934 pp = PP_GROUPLEADER(pp, cons); 6935 goto again; 6936 } 6937 } 6938 } 6939 6940 if (PAGE_LOCKED(pp) || PP_ISNORELOC(pp)) 6941 return; 6942 6943 /* 6944 * Pages need to be locked or allocated "permanent" (either from 6945 * static_arena arena or explicitly setting PG_NORELOC when calling 6946 * page_create_va()) for VA->PA translations to be valid. 6947 */ 6948 if (!PP_ISNORELOC(pp)) 6949 panic("Illegal VA->PA translation, pp 0x%p not permanent", pp); 6950 else 6951 panic("Illegal VA->PA translation, pp 0x%p not locked", pp); 6952 } 6953 #endif /* DEBUG */ 6954 6955 /* 6956 * Returns a page frame number for a given virtual address. 
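 * A minimal caller sketch (hypothetical; kas.a_hat is the kernel HAT
 * handle, the one recommended below in place of hat_getkpfnum(), and
 * kaddr is an illustrative kernel virtual address):
 *
 *	pfn_t pfn = hat_getpfnum(kas.a_hat, kaddr);
 *	if (pfn == PFN_INVALID)
 *		... kaddr has no valid translation ...
 *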
6957 * Returns PFN_INVALID to indicate an invalid mapping 6958 */ 6959 pfn_t 6960 hat_getpfnum(struct hat *hat, caddr_t addr) 6961 { 6962 pfn_t pfn; 6963 tte_t tte; 6964 6965 /* 6966 * We would like to 6967 * ASSERT(AS_LOCK_HELD(as, &as->a_lock)); 6968 * but we can't because the iommu driver will call this 6969 * routine at interrupt time and it can't grab the as lock 6970 * or it will deadlock: A thread could have the as lock 6971 * and be waiting for io. The io can't complete 6972 * because the interrupt thread is blocked trying to grab 6973 * the as lock. 6974 */ 6975 6976 ASSERT(hat->sfmmu_xhat_provider == NULL); 6977 6978 if (hat == ksfmmup) { 6979 if (segkpm && IS_KPM_ADDR(addr)) 6980 return (sfmmu_kpm_vatopfn(addr)); 6981 while ((pfn = sfmmu_vatopfn(addr, ksfmmup, &tte)) 6982 == PFN_SUSPENDED) { 6983 sfmmu_vatopfn_suspended(addr, ksfmmup, &tte); 6984 } 6985 sfmmu_check_kpfn(pfn); 6986 return (pfn); 6987 } else { 6988 return (sfmmu_uvatopfn(addr, hat)); 6989 } 6990 } 6991 6992 /* 6993 * hat_getkpfnum() is an obsolete DDI routine, and its use is discouraged. 6994 * Use hat_getpfnum(kas.a_hat, ...) instead. 6995 * 6996 * We'd like to return PFN_INVALID if the mappings have underlying page_t's 6997 * but can't right now due to the fact that some software has grown to use 6998 * this interface incorrectly. So for now when the interface is misused, 6999 * return a warning to the user that in the future it won't work in the 7000 * way they're abusing it, and carry on (after disabling page relocation). 7001 */ 7002 pfn_t 7003 hat_getkpfnum(caddr_t addr) 7004 { 7005 pfn_t pfn; 7006 tte_t tte; 7007 int badcaller = 0; 7008 extern int segkmem_reloc; 7009 7010 if (segkpm && IS_KPM_ADDR(addr)) { 7011 badcaller = 1; 7012 pfn = sfmmu_kpm_vatopfn(addr); 7013 } else { 7014 while ((pfn = sfmmu_vatopfn(addr, ksfmmup, &tte)) 7015 == PFN_SUSPENDED) { 7016 sfmmu_vatopfn_suspended(addr, ksfmmup, &tte); 7017 } 7018 badcaller = pf_is_memory(pfn); 7019 } 7020 7021 if (badcaller) { 7022 /* 7023 * We can't return PFN_INVALID or the caller may panic 7024 * or corrupt the system. The only alternative is to 7025 * disable page relocation at this point for all kernel 7026 * memory. This will impact any callers of page_relocate() 7027 * such as FMA or DR. 7028 * 7029 * RFE: Add junk here to spit out an ereport so the sysadmin 7030 * can be advised that he should upgrade his device driver 7031 * so that this doesn't happen. 7032 */ 7033 hat_getkpfnum_badcall(caller()); 7034 if (hat_kpr_enabled && segkmem_reloc) { 7035 hat_kpr_enabled = 0; 7036 segkmem_reloc = 0; 7037 cmn_err(CE_WARN, "Kernel Page Relocation is DISABLED"); 7038 } 7039 } 7040 return (pfn); 7041 } 7042 7043 pfn_t 7044 sfmmu_uvatopfn(caddr_t vaddr, struct hat *sfmmup) 7045 { 7046 struct hmehash_bucket *hmebp; 7047 hmeblk_tag hblktag; 7048 int hmeshift, hashno = 1; 7049 struct hme_blk *hmeblkp = NULL; 7050 7051 struct sf_hment *sfhmep; 7052 tte_t tte; 7053 pfn_t pfn; 7054 7055 /* support for ISM */ 7056 ism_map_t *ism_map; 7057 ism_blk_t *ism_blkp; 7058 int i; 7059 sfmmu_t *ism_hatid = NULL; 7060 sfmmu_t *locked_hatid = NULL; 7061 7062 7063 ASSERT(sfmmup != ksfmmup); 7064 SFMMU_STAT(sf_user_vtop); 7065 /* 7066 * Set ism_hatid if vaddr falls in a ISM segment. 
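 *
 * In that case the lookup is redirected to the shared ISM hat and vaddr
 * is rebased to its offset within the ISM segment, so the hash search
 * below runs against the ISM hat's mappings rather than the process's.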
7067 */ 7068 ism_blkp = sfmmup->sfmmu_iblk; 7069 if (ism_blkp) { 7070 sfmmu_ismhat_enter(sfmmup, 0); 7071 locked_hatid = sfmmup; 7072 } 7073 while (ism_blkp && ism_hatid == NULL) { 7074 ism_map = ism_blkp->iblk_maps; 7075 for (i = 0; ism_map[i].imap_ismhat && i < ISM_MAP_SLOTS; i++) { 7076 if (vaddr >= ism_start(ism_map[i]) && 7077 vaddr < ism_end(ism_map[i])) { 7078 sfmmup = ism_hatid = ism_map[i].imap_ismhat; 7079 vaddr = (caddr_t)(vaddr - 7080 ism_start(ism_map[i])); 7081 break; 7082 } 7083 } 7084 ism_blkp = ism_blkp->iblk_next; 7085 } 7086 if (locked_hatid) { 7087 sfmmu_ismhat_exit(locked_hatid, 0); 7088 } 7089 7090 hblktag.htag_id = sfmmup; 7091 do { 7092 hmeshift = HME_HASH_SHIFT(hashno); 7093 hblktag.htag_bspage = HME_HASH_BSPAGE(vaddr, hmeshift); 7094 hblktag.htag_rehash = hashno; 7095 hmebp = HME_HASH_FUNCTION(sfmmup, vaddr, hmeshift); 7096 7097 SFMMU_HASH_LOCK(hmebp); 7098 7099 HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp); 7100 if (hmeblkp != NULL) { 7101 HBLKTOHME(sfhmep, hmeblkp, vaddr); 7102 sfmmu_copytte(&sfhmep->hme_tte, &tte); 7103 if (TTE_IS_VALID(&tte)) { 7104 pfn = TTE_TO_PFN(vaddr, &tte); 7105 } else { 7106 pfn = PFN_INVALID; 7107 } 7108 SFMMU_HASH_UNLOCK(hmebp); 7109 return (pfn); 7110 } 7111 SFMMU_HASH_UNLOCK(hmebp); 7112 hashno++; 7113 } while (HME_REHASH(sfmmup) && (hashno <= mmu_hashcnt)); 7114 return (PFN_INVALID); 7115 } 7116 7117 7118 /* 7119 * For compatability with AT&T and later optimizations 7120 */ 7121 /* ARGSUSED */ 7122 void 7123 hat_map(struct hat *hat, caddr_t addr, size_t len, uint_t flags) 7124 { 7125 ASSERT(hat != NULL); 7126 ASSERT(hat->sfmmu_xhat_provider == NULL); 7127 } 7128 7129 /* 7130 * Return the number of mappings to a particular page. 7131 * This number is an approximation of the number of 7132 * number of people sharing the page. 7133 */ 7134 ulong_t 7135 hat_page_getshare(page_t *pp) 7136 { 7137 page_t *spp = pp; /* start page */ 7138 kmutex_t *pml; 7139 ulong_t cnt; 7140 int index, sz = TTE64K; 7141 7142 /* 7143 * We need to grab the mlist lock to make sure any outstanding 7144 * load/unloads complete. Otherwise we could return zero 7145 * even though the unload(s) hasn't finished yet. 7146 */ 7147 pml = sfmmu_mlist_enter(spp); 7148 cnt = spp->p_share; 7149 7150 if (kpm_enable) 7151 cnt += spp->p_kpmref; 7152 7153 /* 7154 * If we have any large mappings, we count the number of 7155 * mappings that this large page is part of. 7156 */ 7157 index = PP_MAPINDEX(spp); 7158 index >>= 1; 7159 while (index) { 7160 pp = PP_GROUPLEADER(spp, sz); 7161 if ((index & 0x1) && pp != spp) { 7162 cnt += pp->p_share; 7163 spp = pp; 7164 } 7165 index >>= 1; 7166 sz++; 7167 } 7168 sfmmu_mlist_exit(pml); 7169 return (cnt); 7170 } 7171 7172 /* 7173 * Unload all large mappings to the pp and reset the p_szc field of every 7174 * constituent page according to the remaining mappings. 7175 * 7176 * pp must be locked SE_EXCL. Even though no other constituent pages are 7177 * locked it's legal to unload the large mappings to the pp because all 7178 * constituent pages of large locked mappings have to be locked SE_SHARED. 7179 * This means if we have SE_EXCL lock on one of constituent pages none of the 7180 * large mappings to pp are locked. 7181 * 7182 * Decrease p_szc field starting from the last constituent page and ending 7183 * with the root page. This method is used because other threads rely on the 7184 * root's p_szc to find the lock to syncronize on. After a root page_t's p_szc 7185 * is demoted then other threads will succeed in sfmmu_mlspl_enter(). 
This 7186 * ensures that p_szc changes of the constituent pages appears atomic for all 7187 * threads that use sfmmu_mlspl_enter() to examine p_szc field. 7188 * 7189 * This mechanism is only used for file system pages where it's not always 7190 * possible to get SE_EXCL locks on all constituent pages to demote the size 7191 * code (as is done for anonymous or kernel large pages). 7192 * 7193 * See more comments in front of sfmmu_mlspl_enter(). 7194 */ 7195 void 7196 hat_page_demote(page_t *pp) 7197 { 7198 int index; 7199 int sz; 7200 cpuset_t cpuset; 7201 int sync = 0; 7202 page_t *rootpp; 7203 struct sf_hment *sfhme; 7204 struct sf_hment *tmphme = NULL; 7205 struct hme_blk *hmeblkp; 7206 uint_t pszc; 7207 page_t *lastpp; 7208 cpuset_t tset; 7209 pgcnt_t npgs; 7210 kmutex_t *pml; 7211 kmutex_t *pmtx; 7212 7213 ASSERT(PAGE_EXCL(pp)); 7214 ASSERT(!PP_ISFREE(pp)); 7215 ASSERT(page_szc_lock_assert(pp)); 7216 pml = sfmmu_mlist_enter(pp); 7217 pmtx = sfmmu_page_enter(pp); 7218 7219 pszc = pp->p_szc; 7220 if (pszc == 0) { 7221 goto out; 7222 } 7223 7224 index = PP_MAPINDEX(pp) >> 1; 7225 7226 if (index) { 7227 CPUSET_ZERO(cpuset); 7228 sz = TTE64K; 7229 sync = 1; 7230 } 7231 7232 while (index) { 7233 if (!(index & 0x1)) { 7234 index >>= 1; 7235 sz++; 7236 continue; 7237 } 7238 ASSERT(sz <= pszc); 7239 rootpp = PP_GROUPLEADER(pp, sz); 7240 for (sfhme = rootpp->p_mapping; sfhme; sfhme = tmphme) { 7241 tmphme = sfhme->hme_next; 7242 hmeblkp = sfmmu_hmetohblk(sfhme); 7243 if (hme_size(sfhme) != sz) { 7244 continue; 7245 } 7246 if (hmeblkp->hblk_xhat_bit) { 7247 cmn_err(CE_PANIC, 7248 "hat_page_demote: xhat hmeblk"); 7249 } 7250 tset = sfmmu_pageunload(rootpp, sfhme, sz); 7251 CPUSET_OR(cpuset, tset); 7252 } 7253 if (index >>= 1) { 7254 sz++; 7255 } 7256 } 7257 7258 ASSERT(!PP_ISMAPPED_LARGE(pp)); 7259 7260 if (sync) { 7261 xt_sync(cpuset); 7262 if (PP_ISTNC(pp)) { 7263 conv_tnc(rootpp, sz); 7264 } 7265 } 7266 7267 ASSERT(pp->p_szc == pszc); 7268 rootpp = PP_PAGEROOT(pp); 7269 ASSERT(rootpp->p_szc == pszc); 7270 lastpp = PP_PAGENEXT_N(rootpp, TTEPAGES(pszc) - 1); 7271 7272 while (lastpp != rootpp) { 7273 sz = PP_MAPINDEX(lastpp) ? fnd_mapping_sz(lastpp) : 0; 7274 ASSERT(sz < pszc); 7275 npgs = (sz == 0) ? 1 : TTEPAGES(sz); 7276 ASSERT(P2PHASE(lastpp->p_pagenum, npgs) == npgs - 1); 7277 while (--npgs > 0) { 7278 lastpp->p_szc = (uchar_t)sz; 7279 lastpp = PP_PAGEPREV(lastpp); 7280 } 7281 if (sz) { 7282 /* 7283 * make sure before current root's pszc 7284 * is updated all updates to constituent pages pszc 7285 * fields are globally visible. 7286 */ 7287 membar_producer(); 7288 } 7289 lastpp->p_szc = sz; 7290 ASSERT(IS_P2ALIGNED(lastpp->p_pagenum, TTEPAGES(sz))); 7291 if (lastpp != rootpp) { 7292 lastpp = PP_PAGEPREV(lastpp); 7293 } 7294 } 7295 if (sz == 0) { 7296 /* the loop above doesn't cover this case */ 7297 rootpp->p_szc = 0; 7298 } 7299 out: 7300 ASSERT(pp->p_szc == 0); 7301 sfmmu_page_exit(pmtx); 7302 sfmmu_mlist_exit(pml); 7303 } 7304 7305 /* 7306 * Refresh the HAT ismttecnt[] element for size szc. 7307 * Caller must have set ISM busy flag to prevent mapping 7308 * lists from changing while we're traversing them. 
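 *
 * The value cached in sfmmu_ismttecnt[szc] is the sum of sfmmu_ttecnt[szc]
 * over every ISM hat attached through sfmmu_iblk; hat_get_mapped_size()
 * below adds it to the process's own ttecnt when reporting mapped size.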
7309 */ 7310 pgcnt_t 7311 ism_tsb_entries(sfmmu_t *sfmmup, int szc) 7312 { 7313 ism_blk_t *ism_blkp = sfmmup->sfmmu_iblk; 7314 ism_map_t *ism_map; 7315 pgcnt_t npgs = 0; 7316 int j; 7317 7318 ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)); 7319 for (; ism_blkp != NULL; ism_blkp = ism_blkp->iblk_next) { 7320 ism_map = ism_blkp->iblk_maps; 7321 for (j = 0; ism_map[j].imap_ismhat && j < ISM_MAP_SLOTS; j++) 7322 npgs += ism_map[j].imap_ismhat->sfmmu_ttecnt[szc]; 7323 } 7324 sfmmup->sfmmu_ismttecnt[szc] = npgs; 7325 return (npgs); 7326 } 7327 7328 /* 7329 * Yield the memory claim requirement for an address space. 7330 * 7331 * This is currently implemented as the number of bytes that have active 7332 * hardware translations that have page structures. Therefore, it can 7333 * underestimate the traditional resident set size, eg, if the 7334 * physical page is present and the hardware translation is missing; 7335 * and it can overestimate the rss, eg, if there are active 7336 * translations to a frame buffer with page structs. 7337 * Also, it does not take sharing into account. 7338 * 7339 * Note that we don't acquire locks here since this function is most often 7340 * called from the clock thread. 7341 */ 7342 size_t 7343 hat_get_mapped_size(struct hat *hat) 7344 { 7345 size_t assize = 0; 7346 int i; 7347 7348 if (hat == NULL) 7349 return (0); 7350 7351 ASSERT(hat->sfmmu_xhat_provider == NULL); 7352 7353 for (i = 0; i < mmu_page_sizes; i++) 7354 assize += (pgcnt_t)hat->sfmmu_ttecnt[i] * TTEBYTES(i); 7355 7356 if (hat->sfmmu_iblk == NULL) 7357 return (assize); 7358 7359 for (i = 0; i < mmu_page_sizes; i++) 7360 assize += (pgcnt_t)hat->sfmmu_ismttecnt[i] * TTEBYTES(i); 7361 7362 return (assize); 7363 } 7364 7365 int 7366 hat_stats_enable(struct hat *hat) 7367 { 7368 hatlock_t *hatlockp; 7369 7370 ASSERT(hat->sfmmu_xhat_provider == NULL); 7371 7372 hatlockp = sfmmu_hat_enter(hat); 7373 hat->sfmmu_rmstat++; 7374 sfmmu_hat_exit(hatlockp); 7375 return (1); 7376 } 7377 7378 void 7379 hat_stats_disable(struct hat *hat) 7380 { 7381 hatlock_t *hatlockp; 7382 7383 ASSERT(hat->sfmmu_xhat_provider == NULL); 7384 7385 hatlockp = sfmmu_hat_enter(hat); 7386 hat->sfmmu_rmstat--; 7387 sfmmu_hat_exit(hatlockp); 7388 } 7389 7390 /* 7391 * Routines for entering or removing ourselves from the 7392 * ism_hat's mapping list. 7393 */ 7394 static void 7395 iment_add(struct ism_ment *iment, struct hat *ism_hat) 7396 { 7397 ASSERT(MUTEX_HELD(&ism_mlist_lock)); 7398 7399 iment->iment_prev = NULL; 7400 iment->iment_next = ism_hat->sfmmu_iment; 7401 if (ism_hat->sfmmu_iment) { 7402 ism_hat->sfmmu_iment->iment_prev = iment; 7403 } 7404 ism_hat->sfmmu_iment = iment; 7405 } 7406 7407 static void 7408 iment_sub(struct ism_ment *iment, struct hat *ism_hat) 7409 { 7410 ASSERT(MUTEX_HELD(&ism_mlist_lock)); 7411 7412 if (ism_hat->sfmmu_iment == NULL) { 7413 panic("ism map entry remove - no entries"); 7414 } 7415 7416 if (iment->iment_prev) { 7417 ASSERT(ism_hat->sfmmu_iment != iment); 7418 iment->iment_prev->iment_next = iment->iment_next; 7419 } else { 7420 ASSERT(ism_hat->sfmmu_iment == iment); 7421 ism_hat->sfmmu_iment = iment->iment_next; 7422 } 7423 7424 if (iment->iment_next) { 7425 iment->iment_next->iment_prev = iment->iment_prev; 7426 } 7427 7428 /* 7429 * zero out the entry 7430 */ 7431 iment->iment_next = NULL; 7432 iment->iment_prev = NULL; 7433 iment->iment_hat = NULL; 7434 } 7435 7436 /* 7437 * Hat_share()/unshare() return an (non-zero) error 7438 * when saddr and daddr are not properly aligned. 
7439 * 7440 * The top level mapping element determines the alignment 7441 * requirement for saddr and daddr, depending on different 7442 * architectures. 7443 * 7444 * When hat_share()/unshare() are not supported, 7445 * HATOP_SHARE()/UNSHARE() return 0 7446 */ 7447 int 7448 hat_share(struct hat *sfmmup, caddr_t addr, 7449 struct hat *ism_hatid, caddr_t sptaddr, size_t len, uint_t ismszc) 7450 { 7451 ism_blk_t *ism_blkp; 7452 ism_blk_t *new_iblk; 7453 ism_map_t *ism_map; 7454 ism_ment_t *ism_ment; 7455 int i, added; 7456 hatlock_t *hatlockp; 7457 int reload_mmu = 0; 7458 uint_t ismshift = page_get_shift(ismszc); 7459 size_t ismpgsz = page_get_pagesize(ismszc); 7460 uint_t ismmask = (uint_t)ismpgsz - 1; 7461 size_t sh_size = ISM_SHIFT(ismshift, len); 7462 ushort_t ismhatflag; 7463 7464 #ifdef DEBUG 7465 caddr_t eaddr = addr + len; 7466 #endif /* DEBUG */ 7467 7468 ASSERT(ism_hatid != NULL && sfmmup != NULL); 7469 ASSERT(sptaddr == ISMID_STARTADDR); 7470 /* 7471 * Check the alignment. 7472 */ 7473 if (!ISM_ALIGNED(ismshift, addr) || !ISM_ALIGNED(ismshift, sptaddr)) 7474 return (EINVAL); 7475 7476 /* 7477 * Check size alignment. 7478 */ 7479 if (!ISM_ALIGNED(ismshift, len)) 7480 return (EINVAL); 7481 7482 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 7483 7484 /* 7485 * Allocate ism_ment for the ism_hat's mapping list, and an 7486 * ism map blk in case we need one. We must do our 7487 * allocations before acquiring locks to prevent a deadlock 7488 * in the kmem allocator on the mapping list lock. 7489 */ 7490 new_iblk = kmem_cache_alloc(ism_blk_cache, KM_SLEEP); 7491 ism_ment = kmem_cache_alloc(ism_ment_cache, KM_SLEEP); 7492 7493 /* 7494 * Serialize ISM mappings with the ISM busy flag, and also the 7495 * trap handlers. 7496 */ 7497 sfmmu_ismhat_enter(sfmmup, 0); 7498 7499 /* 7500 * Allocate an ism map blk if necessary. 7501 */ 7502 if (sfmmup->sfmmu_iblk == NULL) { 7503 sfmmup->sfmmu_iblk = new_iblk; 7504 bzero(new_iblk, sizeof (*new_iblk)); 7505 new_iblk->iblk_nextpa = (uint64_t)-1; 7506 membar_stst(); /* make sure next ptr visible to all CPUs */ 7507 sfmmup->sfmmu_ismblkpa = va_to_pa((caddr_t)new_iblk); 7508 reload_mmu = 1; 7509 new_iblk = NULL; 7510 } 7511 7512 #ifdef DEBUG 7513 /* 7514 * Make sure mapping does not already exist. 7515 */ 7516 ism_blkp = sfmmup->sfmmu_iblk; 7517 while (ism_blkp) { 7518 ism_map = ism_blkp->iblk_maps; 7519 for (i = 0; i < ISM_MAP_SLOTS && ism_map[i].imap_ismhat; i++) { 7520 if ((addr >= ism_start(ism_map[i]) && 7521 addr < ism_end(ism_map[i])) || 7522 eaddr > ism_start(ism_map[i]) && 7523 eaddr <= ism_end(ism_map[i])) { 7524 panic("sfmmu_share: Already mapped!"); 7525 } 7526 } 7527 ism_blkp = ism_blkp->iblk_next; 7528 } 7529 #endif /* DEBUG */ 7530 7531 ASSERT(ismszc >= TTE4M); 7532 if (ismszc == TTE4M) { 7533 ismhatflag = HAT_4M_FLAG; 7534 } else if (ismszc == TTE32M) { 7535 ismhatflag = HAT_32M_FLAG; 7536 } else if (ismszc == TTE256M) { 7537 ismhatflag = HAT_256M_FLAG; 7538 } 7539 /* 7540 * Add mapping to first available mapping slot. 7541 */ 7542 ism_blkp = sfmmup->sfmmu_iblk; 7543 added = 0; 7544 while (!added) { 7545 ism_map = ism_blkp->iblk_maps; 7546 for (i = 0; i < ISM_MAP_SLOTS; i++) { 7547 if (ism_map[i].imap_ismhat == NULL) { 7548 7549 ism_map[i].imap_ismhat = ism_hatid; 7550 ism_map[i].imap_vb_shift = (ushort_t)ismshift; 7551 ism_map[i].imap_hatflags = ismhatflag; 7552 ism_map[i].imap_sz_mask = ismmask; 7553 /* 7554 * imap_seg is checked in ISM_CHECK to see if 7555 * non-NULL, then other info assumed valid. 
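 *
 * Hence the membar_stst() below: every other field of the slot must be
 * globally visible before imap_seg is made non-NULL, since the tsbmiss
 * handler reads these slots without taking any lock.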
7556 */ 7557 membar_stst(); 7558 ism_map[i].imap_seg = (uintptr_t)addr | sh_size; 7559 ism_map[i].imap_ment = ism_ment; 7560 7561 /* 7562 * Now add ourselves to the ism_hat's 7563 * mapping list. 7564 */ 7565 ism_ment->iment_hat = sfmmup; 7566 ism_ment->iment_base_va = addr; 7567 ism_hatid->sfmmu_ismhat = 1; 7568 ism_hatid->sfmmu_flags = 0; 7569 mutex_enter(&ism_mlist_lock); 7570 iment_add(ism_ment, ism_hatid); 7571 mutex_exit(&ism_mlist_lock); 7572 added = 1; 7573 break; 7574 } 7575 } 7576 if (!added && ism_blkp->iblk_next == NULL) { 7577 ism_blkp->iblk_next = new_iblk; 7578 new_iblk = NULL; 7579 bzero(ism_blkp->iblk_next, 7580 sizeof (*ism_blkp->iblk_next)); 7581 ism_blkp->iblk_next->iblk_nextpa = (uint64_t)-1; 7582 membar_stst(); 7583 ism_blkp->iblk_nextpa = 7584 va_to_pa((caddr_t)ism_blkp->iblk_next); 7585 } 7586 ism_blkp = ism_blkp->iblk_next; 7587 } 7588 7589 /* 7590 * Update our counters for this sfmmup's ism mappings. 7591 */ 7592 for (i = 0; i <= ismszc; i++) { 7593 if (!(disable_ism_large_pages & (1 << i))) 7594 (void) ism_tsb_entries(sfmmup, i); 7595 } 7596 7597 hatlockp = sfmmu_hat_enter(sfmmup); 7598 7599 /* 7600 * For ISM and DISM we do not support 512K pages, so we only 7601 * only search the 4M and 8K/64K hashes for 4 pagesize cpus, and search 7602 * the 256M or 32M, and 4M and 8K/64K hashes for 6 pagesize cpus. 7603 */ 7604 ASSERT((disable_ism_large_pages & (1 << TTE512K)) != 0); 7605 7606 if (ismszc > TTE4M && !SFMMU_FLAGS_ISSET(sfmmup, HAT_4M_FLAG)) 7607 SFMMU_FLAGS_SET(sfmmup, HAT_4M_FLAG); 7608 7609 if (!SFMMU_FLAGS_ISSET(sfmmup, HAT_64K_FLAG)) 7610 SFMMU_FLAGS_SET(sfmmup, HAT_64K_FLAG); 7611 7612 /* 7613 * If we updated the ismblkpa for this HAT or we need 7614 * to start searching the 256M or 32M or 4M hash, we must 7615 * make sure all CPUs running this process reload their 7616 * tsbmiss area. Otherwise they will fail to load the mappings 7617 * in the tsbmiss handler and will loop calling pagefault(). 7618 */ 7619 switch (ismszc) { 7620 case TTE256M: 7621 if (reload_mmu || !SFMMU_FLAGS_ISSET(sfmmup, HAT_256M_FLAG)) { 7622 SFMMU_FLAGS_SET(sfmmup, HAT_256M_FLAG); 7623 sfmmu_sync_mmustate(sfmmup); 7624 } 7625 break; 7626 case TTE32M: 7627 if (reload_mmu || !SFMMU_FLAGS_ISSET(sfmmup, HAT_32M_FLAG)) { 7628 SFMMU_FLAGS_SET(sfmmup, HAT_32M_FLAG); 7629 sfmmu_sync_mmustate(sfmmup); 7630 } 7631 break; 7632 case TTE4M: 7633 if (reload_mmu || !SFMMU_FLAGS_ISSET(sfmmup, HAT_4M_FLAG)) { 7634 SFMMU_FLAGS_SET(sfmmup, HAT_4M_FLAG); 7635 sfmmu_sync_mmustate(sfmmup); 7636 } 7637 break; 7638 default: 7639 break; 7640 } 7641 7642 /* 7643 * Now we can drop the locks. 7644 */ 7645 sfmmu_ismhat_exit(sfmmup, 1); 7646 sfmmu_hat_exit(hatlockp); 7647 7648 /* 7649 * Free up ismblk if we didn't use it. 7650 */ 7651 if (new_iblk != NULL) 7652 kmem_cache_free(ism_blk_cache, new_iblk); 7653 7654 /* 7655 * Check TSB and TLB page sizes. 7656 */ 7657 sfmmu_check_page_sizes(sfmmup, 1); 7658 7659 return (0); 7660 } 7661 7662 /* 7663 * hat_unshare removes exactly one ism_map from 7664 * this process's as. It expects multiple calls 7665 * to hat_unshare for multiple shm segments. 
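 *
 * A hypothetical caller sketch, one call per previously shared segment
 * (seg_addr, seg_len and seg_szc are illustrative names):
 *
 *	hat_unshare(as->a_hat, seg_addr, seg_len, seg_szc);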
7666 */ 7667 void 7668 hat_unshare(struct hat *sfmmup, caddr_t addr, size_t len, uint_t ismszc) 7669 { 7670 ism_map_t *ism_map; 7671 ism_ment_t *free_ment = NULL; 7672 ism_blk_t *ism_blkp; 7673 struct hat *ism_hatid; 7674 struct ctx *ctx; 7675 int cnum, found, i; 7676 hatlock_t *hatlockp; 7677 struct tsb_info *tsbinfo; 7678 uint_t ismshift = page_get_shift(ismszc); 7679 size_t sh_size = ISM_SHIFT(ismshift, len); 7680 7681 ASSERT(ISM_ALIGNED(ismshift, addr)); 7682 ASSERT(ISM_ALIGNED(ismshift, len)); 7683 ASSERT(sfmmup != NULL); 7684 ASSERT(sfmmup != ksfmmup); 7685 7686 if (sfmmup->sfmmu_xhat_provider) { 7687 XHAT_UNSHARE(sfmmup, addr, len); 7688 return; 7689 } else { 7690 /* 7691 * This must be a CPU HAT. If the address space has 7692 * XHATs attached, inform all XHATs that ISM segment 7693 * is going away 7694 */ 7695 ASSERT(sfmmup->sfmmu_as != NULL); 7696 if (sfmmup->sfmmu_as->a_xhat != NULL) 7697 xhat_unshare_all(sfmmup->sfmmu_as, addr, len); 7698 } 7699 7700 /* 7701 * Make sure that during the entire time ISM mappings are removed, 7702 * the trap handlers serialize behind us, and that no one else 7703 * can be mucking with ISM mappings. This also lets us get away 7704 * with not doing expensive cross calls to flush the TLB -- we 7705 * just discard the context, flush the entire TSB, and call it 7706 * a day. 7707 */ 7708 sfmmu_ismhat_enter(sfmmup, 0); 7709 7710 /* 7711 * Remove the mapping. 7712 * 7713 * We can't have any holes in the ism map. 7714 * The tsb miss code while searching the ism map will 7715 * stop on an empty map slot. So we must move 7716 * everyone past the hole up 1 if any. 7717 * 7718 * Also empty ism map blks are not freed until the 7719 * process exits. This is to prevent a MT race condition 7720 * between sfmmu_unshare() and sfmmu_tsbmiss_exception(). 7721 */ 7722 found = 0; 7723 ism_blkp = sfmmup->sfmmu_iblk; 7724 while (!found && ism_blkp) { 7725 ism_map = ism_blkp->iblk_maps; 7726 for (i = 0; i < ISM_MAP_SLOTS; i++) { 7727 if (addr == ism_start(ism_map[i]) && 7728 sh_size == (size_t)(ism_size(ism_map[i]))) { 7729 found = 1; 7730 break; 7731 } 7732 } 7733 if (!found) 7734 ism_blkp = ism_blkp->iblk_next; 7735 } 7736 7737 if (found) { 7738 ism_hatid = ism_map[i].imap_ismhat; 7739 ASSERT(ism_hatid != NULL); 7740 ASSERT(ism_hatid->sfmmu_ismhat == 1); 7741 ASSERT(ism_hatid->sfmmu_cnum == INVALID_CONTEXT); 7742 7743 /* 7744 * First remove ourselves from the ism mapping list. 7745 */ 7746 mutex_enter(&ism_mlist_lock); 7747 iment_sub(ism_map[i].imap_ment, ism_hatid); 7748 mutex_exit(&ism_mlist_lock); 7749 free_ment = ism_map[i].imap_ment; 7750 7751 /* 7752 * Now gurantee that any other cpu 7753 * that tries to process an ISM miss 7754 * will go to tl=0. 7755 */ 7756 hatlockp = sfmmu_hat_enter(sfmmup); 7757 ctx = sfmmutoctx(sfmmup); 7758 rw_enter(&ctx->ctx_rwlock, RW_WRITER); 7759 cnum = sfmmutoctxnum(sfmmup); 7760 7761 if (cnum != INVALID_CONTEXT) { 7762 sfmmu_tlb_swap_ctx(sfmmup, ctx); 7763 } 7764 rw_exit(&ctx->ctx_rwlock); 7765 sfmmu_hat_exit(hatlockp); 7766 7767 /* 7768 * We delete the ism map by copying 7769 * the next map over the current one. 7770 * We will take the next one in the maps 7771 * array or from the next ism_blk. 
7772 */ 7773 while (ism_blkp) { 7774 ism_map = ism_blkp->iblk_maps; 7775 while (i < (ISM_MAP_SLOTS - 1)) { 7776 ism_map[i] = ism_map[i + 1]; 7777 i++; 7778 } 7779 /* i == (ISM_MAP_SLOTS - 1) */ 7780 ism_blkp = ism_blkp->iblk_next; 7781 if (ism_blkp) { 7782 ism_map[i] = ism_blkp->iblk_maps[0]; 7783 i = 0; 7784 } else { 7785 ism_map[i].imap_seg = 0; 7786 ism_map[i].imap_vb_shift = 0; 7787 ism_map[i].imap_hatflags = 0; 7788 ism_map[i].imap_sz_mask = 0; 7789 ism_map[i].imap_ismhat = NULL; 7790 ism_map[i].imap_ment = NULL; 7791 } 7792 } 7793 7794 /* 7795 * Now flush entire TSB for the process, since 7796 * demapping page by page can be too expensive. 7797 * We don't have to flush the TLB here anymore 7798 * since we switch to a new TLB ctx instead. 7799 * Also, there is no need to flush if the process 7800 * is exiting since the TSB will be freed later. 7801 */ 7802 if (!sfmmup->sfmmu_free) { 7803 hatlockp = sfmmu_hat_enter(sfmmup); 7804 for (tsbinfo = sfmmup->sfmmu_tsb; tsbinfo != NULL; 7805 tsbinfo = tsbinfo->tsb_next) { 7806 if (tsbinfo->tsb_flags & TSB_SWAPPED) 7807 continue; 7808 sfmmu_inv_tsb(tsbinfo->tsb_va, 7809 TSB_BYTES(tsbinfo->tsb_szc)); 7810 } 7811 sfmmu_hat_exit(hatlockp); 7812 } 7813 } 7814 7815 /* 7816 * Update our counters for this sfmmup's ism mappings. 7817 */ 7818 for (i = 0; i <= ismszc; i++) { 7819 if (!(disable_ism_large_pages & (1 << i))) 7820 (void) ism_tsb_entries(sfmmup, i); 7821 } 7822 7823 sfmmu_ismhat_exit(sfmmup, 0); 7824 7825 /* 7826 * We must do our freeing here after dropping locks 7827 * to prevent a deadlock in the kmem allocator on the 7828 * mapping list lock. 7829 */ 7830 if (free_ment != NULL) 7831 kmem_cache_free(ism_ment_cache, free_ment); 7832 7833 /* 7834 * Check TSB and TLB page sizes if the process isn't exiting. 7835 */ 7836 if (!sfmmup->sfmmu_free) 7837 sfmmu_check_page_sizes(sfmmup, 0); 7838 } 7839 7840 /* ARGSUSED */ 7841 static int 7842 sfmmu_idcache_constructor(void *buf, void *cdrarg, int kmflags) 7843 { 7844 /* void *buf is sfmmu_t pointer */ 7845 return (0); 7846 } 7847 7848 /* ARGSUSED */ 7849 static void 7850 sfmmu_idcache_destructor(void *buf, void *cdrarg) 7851 { 7852 /* void *buf is sfmmu_t pointer */ 7853 } 7854 7855 /* 7856 * setup kmem hmeblks by bzeroing all members and initializing the nextpa 7857 * field to be the pa of this hmeblk 7858 */ 7859 /* ARGSUSED */ 7860 static int 7861 sfmmu_hblkcache_constructor(void *buf, void *cdrarg, int kmflags) 7862 { 7863 struct hme_blk *hmeblkp; 7864 7865 bzero(buf, (size_t)cdrarg); 7866 hmeblkp = (struct hme_blk *)buf; 7867 hmeblkp->hblk_nextpa = va_to_pa((caddr_t)hmeblkp); 7868 7869 #ifdef HBLK_TRACE 7870 mutex_init(&hmeblkp->hblk_audit_lock, NULL, MUTEX_DEFAULT, NULL); 7871 #endif /* HBLK_TRACE */ 7872 7873 return (0); 7874 } 7875 7876 /* ARGSUSED */ 7877 static void 7878 sfmmu_hblkcache_destructor(void *buf, void *cdrarg) 7879 { 7880 7881 #ifdef HBLK_TRACE 7882 7883 struct hme_blk *hmeblkp; 7884 7885 hmeblkp = (struct hme_blk *)buf; 7886 mutex_destroy(&hmeblkp->hblk_audit_lock); 7887 7888 #endif /* HBLK_TRACE */ 7889 } 7890 7891 #define SFMMU_CACHE_RECLAIM_SCAN_RATIO 8 7892 static int sfmmu_cache_reclaim_scan_ratio = SFMMU_CACHE_RECLAIM_SCAN_RATIO; 7893 /* 7894 * The kmem allocator will callback into our reclaim routine when the system 7895 * is running low in memory. We traverse the hash and free up all unused but 7896 * still cached hme_blks. We also traverse the free list and free them up 7897 * as well. 
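 *
 * To bound the work done from the reclaim path, a static "reclaim hand"
 * remembers where the previous scan left off in each hash, and only
 * 1/sfmmu_cache_reclaim_scan_ratio of the user and kernel hash buckets
 * are scanned per callback, skipping any bucket whose lock is contended.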
7898 */ 7899 /*ARGSUSED*/ 7900 static void 7901 sfmmu_hblkcache_reclaim(void *cdrarg) 7902 { 7903 int i; 7904 uint64_t hblkpa, prevpa, nx_pa; 7905 struct hmehash_bucket *hmebp; 7906 struct hme_blk *hmeblkp, *nx_hblk, *pr_hblk = NULL; 7907 static struct hmehash_bucket *uhmehash_reclaim_hand; 7908 static struct hmehash_bucket *khmehash_reclaim_hand; 7909 struct hme_blk *list = NULL; 7910 7911 hmebp = uhmehash_reclaim_hand; 7912 if (hmebp == NULL || hmebp > &uhme_hash[UHMEHASH_SZ]) 7913 uhmehash_reclaim_hand = hmebp = uhme_hash; 7914 uhmehash_reclaim_hand += UHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio; 7915 7916 for (i = UHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio; i; i--) { 7917 if (SFMMU_HASH_LOCK_TRYENTER(hmebp) != 0) { 7918 hmeblkp = hmebp->hmeblkp; 7919 hblkpa = hmebp->hmeh_nextpa; 7920 prevpa = 0; 7921 pr_hblk = NULL; 7922 while (hmeblkp) { 7923 nx_hblk = hmeblkp->hblk_next; 7924 nx_pa = hmeblkp->hblk_nextpa; 7925 if (!hmeblkp->hblk_vcnt && 7926 !hmeblkp->hblk_hmecnt) { 7927 sfmmu_hblk_hash_rm(hmebp, hmeblkp, 7928 prevpa, pr_hblk); 7929 sfmmu_hblk_free(hmebp, hmeblkp, 7930 hblkpa, &list); 7931 } else { 7932 pr_hblk = hmeblkp; 7933 prevpa = hblkpa; 7934 } 7935 hmeblkp = nx_hblk; 7936 hblkpa = nx_pa; 7937 } 7938 SFMMU_HASH_UNLOCK(hmebp); 7939 } 7940 if (hmebp++ == &uhme_hash[UHMEHASH_SZ]) 7941 hmebp = uhme_hash; 7942 } 7943 7944 hmebp = khmehash_reclaim_hand; 7945 if (hmebp == NULL || hmebp > &khme_hash[KHMEHASH_SZ]) 7946 khmehash_reclaim_hand = hmebp = khme_hash; 7947 khmehash_reclaim_hand += KHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio; 7948 7949 for (i = KHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio; i; i--) { 7950 if (SFMMU_HASH_LOCK_TRYENTER(hmebp) != 0) { 7951 hmeblkp = hmebp->hmeblkp; 7952 hblkpa = hmebp->hmeh_nextpa; 7953 prevpa = 0; 7954 pr_hblk = NULL; 7955 while (hmeblkp) { 7956 nx_hblk = hmeblkp->hblk_next; 7957 nx_pa = hmeblkp->hblk_nextpa; 7958 if (!hmeblkp->hblk_vcnt && 7959 !hmeblkp->hblk_hmecnt) { 7960 sfmmu_hblk_hash_rm(hmebp, hmeblkp, 7961 prevpa, pr_hblk); 7962 sfmmu_hblk_free(hmebp, hmeblkp, 7963 hblkpa, &list); 7964 } else { 7965 pr_hblk = hmeblkp; 7966 prevpa = hblkpa; 7967 } 7968 hmeblkp = nx_hblk; 7969 hblkpa = nx_pa; 7970 } 7971 SFMMU_HASH_UNLOCK(hmebp); 7972 } 7973 if (hmebp++ == &khme_hash[KHMEHASH_SZ]) 7974 hmebp = khme_hash; 7975 } 7976 sfmmu_hblks_list_purge(&list); 7977 } 7978 7979 /* 7980 * sfmmu_get_ppvcolor should become a vm_machdep or hatop interface. 7981 * same goes for sfmmu_get_addrvcolor(). 7982 * 7983 * This function will return the virtual color for the specified page. The 7984 * virtual color corresponds to this page current mapping or its last mapping. 7985 * It is used by memory allocators to choose addresses with the correct 7986 * alignment so vac consistency is automatically maintained. If the page 7987 * has no color it returns -1. 7988 */ 7989 int 7990 sfmmu_get_ppvcolor(struct page *pp) 7991 { 7992 int color; 7993 7994 if (!(cache & CACHE_VAC) || PP_NEWPAGE(pp)) { 7995 return (-1); 7996 } 7997 color = PP_GET_VCOLOR(pp); 7998 ASSERT(color < mmu_btop(shm_alignment)); 7999 return (color); 8000 } 8001 8002 /* 8003 * This function will return the desired alignment for vac consistency 8004 * (vac color) given a virtual address. If no vac is present it returns -1. 8005 */ 8006 int 8007 sfmmu_get_addrvcolor(caddr_t vaddr) 8008 { 8009 if (cache & CACHE_VAC) { 8010 return (addr_to_vcolor(vaddr)); 8011 } else { 8012 return (-1); 8013 } 8014 8015 } 8016 8017 /* 8018 * Check for conflicts. 
8019 * A conflict exists if the new and existing mappings do not match in 8020 * their "shm_alignment" fields. If conflicts exist, the existing mappings 8021 * are flushed unless one of them is locked. If one of them is locked, then 8022 * the mappings are flushed and converted to non-cacheable mappings. 8023 */ 8024 static void 8025 sfmmu_vac_conflict(struct hat *hat, caddr_t addr, page_t *pp) 8026 { 8027 struct hat *tmphat; 8028 struct sf_hment *sfhmep, *tmphme = NULL; 8029 struct hme_blk *hmeblkp; 8030 int vcolor; 8031 tte_t tte; 8032 8033 ASSERT(sfmmu_mlist_held(pp)); 8034 ASSERT(!PP_ISNC(pp)); /* page better be cacheable */ 8035 8036 vcolor = addr_to_vcolor(addr); 8037 if (PP_NEWPAGE(pp)) { 8038 PP_SET_VCOLOR(pp, vcolor); 8039 return; 8040 } 8041 8042 if (PP_GET_VCOLOR(pp) == vcolor) { 8043 return; 8044 } 8045 8046 if (!PP_ISMAPPED(pp) && !PP_ISMAPPED_KPM(pp)) { 8047 /* 8048 * Previous user of page had a different color 8049 * but since there are no current users 8050 * we just flush the cache and change the color. 8051 */ 8052 SFMMU_STAT(sf_pgcolor_conflict); 8053 sfmmu_cache_flush(pp->p_pagenum, PP_GET_VCOLOR(pp)); 8054 PP_SET_VCOLOR(pp, vcolor); 8055 return; 8056 } 8057 8058 /* 8059 * If we get here we have a vac conflict with a current 8060 * mapping. VAC conflict policy is as follows. 8061 * - The default is to unload the other mappings unless: 8062 * - If we have a large mapping we uncache the page. 8063 * We need to uncache the rest of the large page too. 8064 * - If any of the mappings are locked we uncache the page. 8065 * - If the requested mapping is inconsistent 8066 * with another mapping and that mapping 8067 * is in the same address space we have to 8068 * make it non-cached. The default thing 8069 * to do is unload the inconsistent mapping 8070 * but if they are in the same address space 8071 * we run the risk of unmapping the pc or the 8072 * stack which we will use as we return to the user, 8073 * in which case we can then fault on the thing 8074 * we just unloaded and get into an infinite loop. 8075 */ 8076 if (PP_ISMAPPED_LARGE(pp)) { 8077 int sz; 8078 8079 /* 8080 * Existing mapping is for big pages. We don't unload 8081 * existing big mappings to satisfy new mappings. 8082 * Always convert all mappings to TNC. 8083 */ 8084 sz = fnd_mapping_sz(pp); 8085 pp = PP_GROUPLEADER(pp, sz); 8086 SFMMU_STAT_ADD(sf_uncache_conflict, TTEPAGES(sz)); 8087 sfmmu_page_cache_array(pp, HAT_TMPNC, CACHE_FLUSH, 8088 TTEPAGES(sz)); 8089 8090 return; 8091 } 8092 8093 /* 8094 * Check whether any existing mapping is in the same address space as 8095 * the new mapping or is locked, since in either case we need to uncache. 8096 */ 8097 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = tmphme) { 8098 tmphme = sfhmep->hme_next; 8099 hmeblkp = sfmmu_hmetohblk(sfhmep); 8100 if (hmeblkp->hblk_xhat_bit) 8101 continue; 8102 tmphat = hblktosfmmu(hmeblkp); 8103 sfmmu_copytte(&sfhmep->hme_tte, &tte); 8104 ASSERT(TTE_IS_VALID(&tte)); 8105 if ((tmphat == hat) || hmeblkp->hblk_lckcnt) { 8106 /* 8107 * We have an uncache conflict 8108 */ 8109 SFMMU_STAT(sf_uncache_conflict); 8110 sfmmu_page_cache_array(pp, HAT_TMPNC, CACHE_FLUSH, 1); 8111 return; 8112 } 8113 } 8114 8115 /* 8116 * We have an unload conflict. 8117 * We have already checked for LARGE mappings, therefore 8118 * the remaining mapping(s) must be TTE8K.
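 * Each remaining 8K mapping is unloaded below; the cache is then flushed
 * once for the old color and the page is recolored to the new mapping's
 * color.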
8119 */ 8120 SFMMU_STAT(sf_unload_conflict); 8121 8122 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = tmphme) { 8123 tmphme = sfhmep->hme_next; 8124 hmeblkp = sfmmu_hmetohblk(sfhmep); 8125 if (hmeblkp->hblk_xhat_bit) 8126 continue; 8127 (void) sfmmu_pageunload(pp, sfhmep, TTE8K); 8128 } 8129 8130 if (PP_ISMAPPED_KPM(pp)) 8131 sfmmu_kpm_vac_unload(pp, addr); 8132 8133 /* 8134 * Unloads only do TLB flushes so we need to flush the 8135 * cache here. 8136 */ 8137 sfmmu_cache_flush(pp->p_pagenum, PP_GET_VCOLOR(pp)); 8138 PP_SET_VCOLOR(pp, vcolor); 8139 } 8140 8141 /* 8142 * Whenever a mapping is unloaded and the page is in TNC state, 8143 * we see if the page can be made cacheable again. 'pp' is 8144 * the page that we just unloaded a mapping from, the size 8145 * of mapping that was unloaded is 'ottesz'. 8146 * Remark: 8147 * The recache policy for mpss pages can leave a performance problem 8148 * under the following circumstances: 8149 * . A large page in uncached mode has just been unmapped. 8150 * . All constituent pages are TNC due to a conflicting small mapping. 8151 * . There are many other, non conflicting, small mappings around for 8152 * a lot of the constituent pages. 8153 * . We're called w/ the "old" groupleader page and the old ottesz, 8154 * but this is irrelevant, since we're no more "PP_ISMAPPED_LARGE", so 8155 * we end up w/ TTE8K or npages == 1. 8156 * . We call tst_tnc w/ the old groupleader only, and if there is no 8157 * conflict, we re-cache only this page. 8158 * . All other small mappings are not checked and will be left in TNC mode. 8159 * The problem is not very serious because: 8160 * . mpss is actually only defined for heap and stack, so the probability 8161 * is not very high that a large page mapping exists in parallel to a small 8162 * one (this is possible, but seems to be bad programming style in the 8163 * appl). 8164 * . The problem gets a little bit more serious, when those TNC pages 8165 * have to be mapped into kernel space, e.g. for networking. 8166 * . When VAC alias conflicts occur in applications, this is regarded 8167 * as an application bug. So if kstat's show them, the appl should 8168 * be changed anyway. 8169 */ 8170 static void 8171 conv_tnc(page_t *pp, int ottesz) 8172 { 8173 int cursz, dosz; 8174 pgcnt_t curnpgs, dopgs; 8175 pgcnt_t pg64k; 8176 page_t *pp2; 8177 8178 /* 8179 * Determine how big a range we check for TNC and find 8180 * leader page. cursz is the size of the biggest 8181 * mapping that still exist on 'pp'. 8182 */ 8183 if (PP_ISMAPPED_LARGE(pp)) { 8184 cursz = fnd_mapping_sz(pp); 8185 } else { 8186 cursz = TTE8K; 8187 } 8188 8189 if (ottesz >= cursz) { 8190 dosz = ottesz; 8191 pp2 = pp; 8192 } else { 8193 dosz = cursz; 8194 pp2 = PP_GROUPLEADER(pp, dosz); 8195 } 8196 8197 pg64k = TTEPAGES(TTE64K); 8198 dopgs = TTEPAGES(dosz); 8199 8200 ASSERT(dopgs == 1 || ((dopgs & (pg64k - 1)) == 0)); 8201 8202 while (dopgs != 0) { 8203 curnpgs = TTEPAGES(cursz); 8204 if (tst_tnc(pp2, curnpgs)) { 8205 SFMMU_STAT_ADD(sf_recache, curnpgs); 8206 sfmmu_page_cache_array(pp2, HAT_CACHE, CACHE_NO_FLUSH, 8207 curnpgs); 8208 } 8209 8210 ASSERT(dopgs >= curnpgs); 8211 dopgs -= curnpgs; 8212 8213 if (dopgs == 0) { 8214 break; 8215 } 8216 8217 pp2 = PP_PAGENEXT_N(pp2, curnpgs); 8218 if (((dopgs & (pg64k - 1)) == 0) && PP_ISMAPPED_LARGE(pp2)) { 8219 cursz = fnd_mapping_sz(pp2); 8220 } else { 8221 cursz = TTE8K; 8222 } 8223 } 8224 } 8225 8226 /* 8227 * Returns 1 if page(s) can be converted from TNC to cacheable setting, 8228 * returns 0 otherwise. 
Note that oaddr argument is valid for only 8229 * 8k pages. 8230 */ 8231 static int 8232 tst_tnc(page_t *pp, pgcnt_t npages) 8233 { 8234 struct sf_hment *sfhme; 8235 struct hme_blk *hmeblkp; 8236 tte_t tte; 8237 caddr_t vaddr; 8238 int clr_valid = 0; 8239 int color, color1, bcolor; 8240 int i, ncolors; 8241 8242 ASSERT(pp != NULL); 8243 ASSERT(!(cache & CACHE_WRITEBACK)); 8244 8245 if (npages > 1) { 8246 ncolors = CACHE_NUM_COLOR; 8247 } 8248 8249 for (i = 0; i < npages; i++) { 8250 ASSERT(sfmmu_mlist_held(pp)); 8251 ASSERT(PP_ISTNC(pp)); 8252 ASSERT(PP_GET_VCOLOR(pp) == NO_VCOLOR); 8253 8254 if (PP_ISPNC(pp)) { 8255 return (0); 8256 } 8257 8258 clr_valid = 0; 8259 if (PP_ISMAPPED_KPM(pp)) { 8260 caddr_t kpmvaddr; 8261 8262 ASSERT(kpm_enable); 8263 kpmvaddr = hat_kpm_page2va(pp, 1); 8264 ASSERT(!(npages > 1 && IS_KPM_ALIAS_RANGE(kpmvaddr))); 8265 color1 = addr_to_vcolor(kpmvaddr); 8266 clr_valid = 1; 8267 } 8268 8269 for (sfhme = pp->p_mapping; sfhme; sfhme = sfhme->hme_next) { 8270 hmeblkp = sfmmu_hmetohblk(sfhme); 8271 if (hmeblkp->hblk_xhat_bit) 8272 continue; 8273 8274 sfmmu_copytte(&sfhme->hme_tte, &tte); 8275 ASSERT(TTE_IS_VALID(&tte)); 8276 8277 vaddr = tte_to_vaddr(hmeblkp, tte); 8278 color = addr_to_vcolor(vaddr); 8279 8280 if (npages > 1) { 8281 /* 8282 * If there is a big mapping, make sure 8283 * 8K mapping is consistent with the big 8284 * mapping. 8285 */ 8286 bcolor = i % ncolors; 8287 if (color != bcolor) { 8288 return (0); 8289 } 8290 } 8291 if (!clr_valid) { 8292 clr_valid = 1; 8293 color1 = color; 8294 } 8295 8296 if (color1 != color) { 8297 return (0); 8298 } 8299 } 8300 8301 pp = PP_PAGENEXT(pp); 8302 } 8303 8304 return (1); 8305 } 8306 8307 static void 8308 sfmmu_page_cache_array(page_t *pp, int flags, int cache_flush_flag, 8309 pgcnt_t npages) 8310 { 8311 kmutex_t *pmtx; 8312 int i, ncolors, bcolor; 8313 kpm_hlk_t *kpmp; 8314 cpuset_t cpuset; 8315 8316 ASSERT(pp != NULL); 8317 ASSERT(!(cache & CACHE_WRITEBACK)); 8318 8319 kpmp = sfmmu_kpm_kpmp_enter(pp, npages); 8320 pmtx = sfmmu_page_enter(pp); 8321 8322 /* 8323 * Fast path caching single unmapped page 8324 */ 8325 if (npages == 1 && !PP_ISMAPPED(pp) && !PP_ISMAPPED_KPM(pp) && 8326 flags == HAT_CACHE) { 8327 PP_CLRTNC(pp); 8328 PP_CLRPNC(pp); 8329 sfmmu_page_exit(pmtx); 8330 sfmmu_kpm_kpmp_exit(kpmp); 8331 return; 8332 } 8333 8334 /* 8335 * We need to capture all cpus in order to change cacheability 8336 * because we can't allow one cpu to access the same physical 8337 * page using a cacheable and a non-cacheable mapping at the same 8338 * time. Since we may end up walking the ism mapping list we 8339 * have to grab its lock now, since we can't after all the 8340 * cpus have been captured. 8341 */ 8342 sfmmu_hat_lock_all(); 8343 mutex_enter(&ism_mlist_lock); 8344 kpreempt_disable(); 8345 cpuset = cpu_ready_set; 8346 xc_attention(cpuset); 8347 8348 if (npages > 1) { 8349 /* 8350 * Make sure all colors are flushed since 8351 * sfmmu_page_cache() only flushes one color; 8352 * it does not know about big pages.
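 * For HAT_TMPNC we therefore flush all colors for this pfn up front and
 * pass CACHE_NO_FLUSH down to the per-page sfmmu_page_cache() calls.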
8353 */ 8354 ncolors = CACHE_NUM_COLOR; 8355 if (flags & HAT_TMPNC) { 8356 for (i = 0; i < ncolors; i++) { 8357 sfmmu_cache_flushcolor(i, pp->p_pagenum); 8358 } 8359 cache_flush_flag = CACHE_NO_FLUSH; 8360 } 8361 } 8362 8363 for (i = 0; i < npages; i++) { 8364 8365 ASSERT(sfmmu_mlist_held(pp)); 8366 8367 if (!(flags == HAT_TMPNC && PP_ISTNC(pp))) { 8368 8369 if (npages > 1) { 8370 bcolor = i % ncolors; 8371 } else { 8372 bcolor = NO_VCOLOR; 8373 } 8374 8375 sfmmu_page_cache(pp, flags, cache_flush_flag, 8376 bcolor); 8377 } 8378 8379 pp = PP_PAGENEXT(pp); 8380 } 8381 8382 xt_sync(cpuset); 8383 xc_dismissed(cpuset); 8384 mutex_exit(&ism_mlist_lock); 8385 sfmmu_hat_unlock_all(); 8386 sfmmu_page_exit(pmtx); 8387 sfmmu_kpm_kpmp_exit(kpmp); 8388 kpreempt_enable(); 8389 } 8390 8391 /* 8392 * This function changes the virtual cacheability of all mappings to a 8393 * particular page. When changing from uncache to cacheable the mappings will 8394 * only be changed if all of them have the same virtual color. 8395 * We need to flush the cache on all cpus. It is possible that 8396 * a process referenced a page as cacheable but has since exited 8397 * and cleared the mapping list. We still have to flush it but have no 8398 * state, so flushing on all cpus is the only alternative. 8399 */ 8400 static void 8401 sfmmu_page_cache(page_t *pp, int flags, int cache_flush_flag, int bcolor) 8402 { 8403 struct sf_hment *sfhme; 8404 struct hme_blk *hmeblkp; 8405 sfmmu_t *sfmmup; 8406 tte_t tte, ttemod; 8407 caddr_t vaddr; 8408 int ret, color; 8409 pfn_t pfn; 8410 8411 color = bcolor; 8412 pfn = pp->p_pagenum; 8413 8414 for (sfhme = pp->p_mapping; sfhme; sfhme = sfhme->hme_next) { 8415 8416 hmeblkp = sfmmu_hmetohblk(sfhme); 8417 8418 if (hmeblkp->hblk_xhat_bit) 8419 continue; 8420 8421 sfmmu_copytte(&sfhme->hme_tte, &tte); 8422 ASSERT(TTE_IS_VALID(&tte)); 8423 vaddr = tte_to_vaddr(hmeblkp, tte); 8424 color = addr_to_vcolor(vaddr); 8425 8426 #ifdef DEBUG 8427 if ((flags & HAT_CACHE) && bcolor != NO_VCOLOR) { 8428 ASSERT(color == bcolor); 8429 } 8430 #endif 8431 8432 ASSERT(flags != HAT_TMPNC || color == PP_GET_VCOLOR(pp)); 8433 8434 ttemod = tte; 8435 if (flags & (HAT_UNCACHE | HAT_TMPNC)) { 8436 TTE_CLR_VCACHEABLE(&ttemod); 8437 } else { /* flags & HAT_CACHE */ 8438 TTE_SET_VCACHEABLE(&ttemod); 8439 } 8440 ret = sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte); 8441 if (ret < 0) { 8442 /* 8443 * Since all cpus are captured modifytte should not 8444 * fail. 8445 */ 8446 panic("sfmmu_page_cache: write to tte failed"); 8447 } 8448 8449 sfmmup = hblktosfmmu(hmeblkp); 8450 if (cache_flush_flag == CACHE_FLUSH) { 8451 /* 8452 * Flush TSBs, TLBs and caches 8453 */ 8454 if (sfmmup->sfmmu_ismhat) { 8455 if (flags & HAT_CACHE) { 8456 SFMMU_STAT(sf_ism_recache); 8457 } else { 8458 SFMMU_STAT(sf_ism_uncache); 8459 } 8460 sfmmu_ismtlbcache_demap(vaddr, sfmmup, hmeblkp, 8461 pfn, CACHE_FLUSH); 8462 } else { 8463 sfmmu_tlbcache_demap(vaddr, sfmmup, hmeblkp, 8464 pfn, 0, FLUSH_ALL_CPUS, CACHE_FLUSH, 1); 8465 } 8466 8467 /* 8468 * All cache entries belonging to this pfn are 8469 * now flushed. 8470 */ 8471 cache_flush_flag = CACHE_NO_FLUSH; 8472 } else { 8473 8474 /* 8475 * Flush only TSBs and TLBs.
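 * (Either the caller asked for no cache flush, or the cache lines for
 * this pfn were already flushed on an earlier pass and cache_flush_flag
 * was downgraded to CACHE_NO_FLUSH above.)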
8476 */ 8477 if (sfmmup->sfmmu_ismhat) { 8478 if (flags & HAT_CACHE) { 8479 SFMMU_STAT(sf_ism_recache); 8480 } else { 8481 SFMMU_STAT(sf_ism_uncache); 8482 } 8483 sfmmu_ismtlbcache_demap(vaddr, sfmmup, hmeblkp, 8484 pfn, CACHE_NO_FLUSH); 8485 } else { 8486 sfmmu_tlb_demap(vaddr, sfmmup, hmeblkp, 0, 1); 8487 } 8488 } 8489 } 8490 8491 if (PP_ISMAPPED_KPM(pp)) 8492 sfmmu_kpm_page_cache(pp, flags, cache_flush_flag); 8493 8494 switch (flags) { 8495 8496 default: 8497 panic("sfmmu_pagecache: unknown flags"); 8498 break; 8499 8500 case HAT_CACHE: 8501 PP_CLRTNC(pp); 8502 PP_CLRPNC(pp); 8503 PP_SET_VCOLOR(pp, color); 8504 break; 8505 8506 case HAT_TMPNC: 8507 PP_SETTNC(pp); 8508 PP_SET_VCOLOR(pp, NO_VCOLOR); 8509 break; 8510 8511 case HAT_UNCACHE: 8512 PP_SETPNC(pp); 8513 PP_CLRTNC(pp); 8514 PP_SET_VCOLOR(pp, NO_VCOLOR); 8515 break; 8516 } 8517 } 8518 8519 /* 8520 * This routine gets called when the system has run out of free contexts. 8521 * This will simply choose context passed to it to be stolen and reused. 8522 */ 8523 /* ARGSUSED */ 8524 static void 8525 sfmmu_reuse_ctx(struct ctx *ctx, sfmmu_t *sfmmup) 8526 { 8527 sfmmu_t *stolen_sfmmup; 8528 cpuset_t cpuset; 8529 ushort_t cnum = ctxtoctxnum(ctx); 8530 8531 ASSERT(cnum != KCONTEXT); 8532 ASSERT(rw_read_locked(&ctx->ctx_rwlock) == 0); /* write locked */ 8533 8534 /* 8535 * simply steal and reuse the ctx passed to us. 8536 */ 8537 stolen_sfmmup = ctx->ctx_sfmmu; 8538 ASSERT(sfmmu_hat_lock_held(sfmmup)); 8539 ASSERT(stolen_sfmmup->sfmmu_cnum == cnum); 8540 ASSERT(stolen_sfmmup != ksfmmup); 8541 8542 TRACE_CTXS(&ctx_trace_mutex, ctx_trace_ptr, cnum, stolen_sfmmup, 8543 sfmmup, CTX_TRC_STEAL); 8544 SFMMU_STAT(sf_ctxsteal); 8545 8546 /* 8547 * Update sfmmu and ctx structs. After this point all threads 8548 * belonging to this hat/proc will fault and not use the ctx 8549 * being stolen. 8550 */ 8551 kpreempt_disable(); 8552 /* 8553 * Enforce reverse order of assignments from sfmmu_get_ctx(). This 8554 * is done to prevent a race where a thread faults with the context 8555 * but the TSB has changed. 8556 */ 8557 stolen_sfmmup->sfmmu_cnum = INVALID_CONTEXT; 8558 membar_enter(); 8559 ctx->ctx_sfmmu = NULL; 8560 8561 /* 8562 * 1. flush TLB in all CPUs that ran the process whose ctx 8563 * we are stealing. 8564 * 2. change context for all other CPUs to INVALID_CONTEXT, 8565 * if they are running in the context that we are going to steal. 8566 */ 8567 cpuset = stolen_sfmmup->sfmmu_cpusran; 8568 CPUSET_DEL(cpuset, CPU->cpu_id); 8569 CPUSET_AND(cpuset, cpu_ready_set); 8570 SFMMU_XCALL_STATS(cnum); 8571 xt_some(cpuset, sfmmu_ctx_steal_tl1, cnum, INVALID_CONTEXT); 8572 xt_sync(cpuset); 8573 8574 /* 8575 * flush TLB of local processor 8576 */ 8577 vtag_flushctx(cnum); 8578 8579 /* 8580 * If we just stole the ctx from the current process 8581 * on local cpu then we also invalidate his context 8582 * here. 8583 */ 8584 if (sfmmu_getctx_sec() == cnum) { 8585 sfmmu_setctx_sec(INVALID_CONTEXT); 8586 sfmmu_clear_utsbinfo(); 8587 } 8588 8589 kpreempt_enable(); 8590 SFMMU_STAT(sf_tlbflush_ctx); 8591 } 8592 8593 /* 8594 * Returns a context with the reader lock held. 8595 * 8596 * We maintain 2 different list of contexts. The first list 8597 * is the free list and it is headed by ctxfree. These contexts 8598 * are ready to use. The second list is the dirty list and is 8599 * headed by ctxdirty. These contexts have been freed but haven't 8600 * been flushed from the TLB. 
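 * Dirty contexts are recycled in sfmmu_get_ctx(): when the free list is
 * empty, the TLBs are flushed (if flushes were being delayed) and the
 * entire dirty list becomes the new free list.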
8601 * 8602 * It's the responsibility of the caller to guarantee that the 8603 * process serializes on calls here by taking the HAT lock for 8604 * the hat. 8605 * 8606 * Changing the page size is a rather complicated process, so 8607 * rather than jump through lots of hoops to special case it, 8608 * the easiest way to go about it is to tell the MMU we want 8609 * to change page sizes and then switch to using a different 8610 * context. When we program the context registers for the 8611 * process, we can take care of setting up the (new) page size 8612 * for that context at that point. 8613 */ 8614 8615 static struct ctx * 8616 sfmmu_get_ctx(sfmmu_t *sfmmup) 8617 { 8618 struct ctx *ctx; 8619 ushort_t cnum; 8620 struct ctx *lastctx = &ctxs[nctxs-1]; 8621 struct ctx *firstctx = &ctxs[NUM_LOCKED_CTXS]; 8622 uint_t found_stealable_ctx; 8623 uint_t retry_count = 0; 8624 8625 #define NEXT_CTX(ctx) (((ctx) >= lastctx) ? firstctx : ((ctx) + 1)) 8626 8627 retry: 8628 8629 ASSERT(sfmmup->sfmmu_cnum != KCONTEXT); 8630 /* 8631 * Check to see if this process has already got a ctx. 8632 * In that case just set the sec-ctx, grab a readers lock, and 8633 * return. 8634 * 8635 * We have to double check after we get the readers lock on the 8636 * context, since it could be stolen in this short window. 8637 */ 8638 if (sfmmup->sfmmu_cnum >= NUM_LOCKED_CTXS) { 8639 ctx = sfmmutoctx(sfmmup); 8640 rw_enter(&ctx->ctx_rwlock, RW_READER); 8641 if (ctx->ctx_sfmmu == sfmmup) { 8642 return (ctx); 8643 } else { 8644 rw_exit(&ctx->ctx_rwlock); 8645 } 8646 } 8647 8648 found_stealable_ctx = 0; 8649 mutex_enter(&ctx_list_lock); 8650 if ((ctx = ctxfree) != NULL) { 8651 /* 8652 * Found a ctx in free list. Delete it from the list and 8653 * use it. There's a short window where the stealer can 8654 * look at the context before we grab the lock on the 8655 * context, so we have to handle that with the free flag. 8656 */ 8657 SFMMU_STAT(sf_ctxfree); 8658 ctxfree = ctx->ctx_free; 8659 ctx->ctx_sfmmu = NULL; 8660 mutex_exit(&ctx_list_lock); 8661 rw_enter(&ctx->ctx_rwlock, RW_WRITER); 8662 ASSERT(ctx->ctx_sfmmu == NULL); 8663 ASSERT((ctx->ctx_flags & CTX_FREE_FLAG) != 0); 8664 } else if ((ctx = ctxdirty) != NULL) { 8665 /* 8666 * No free contexts. If we have at least one dirty ctx 8667 * then flush the TLBs on all cpus if necessary and move 8668 * the dirty list to the free list. 8669 */ 8670 SFMMU_STAT(sf_ctxdirty); 8671 ctxdirty = NULL; 8672 if (delay_tlb_flush) 8673 sfmmu_tlb_all_demap(); 8674 ctxfree = ctx->ctx_free; 8675 ctx->ctx_sfmmu = NULL; 8676 mutex_exit(&ctx_list_lock); 8677 rw_enter(&ctx->ctx_rwlock, RW_WRITER); 8678 ASSERT(ctx->ctx_sfmmu == NULL); 8679 ASSERT((ctx->ctx_flags & CTX_FREE_FLAG) != 0); 8680 } else { 8681 /* 8682 * No free context available, so steal one. 8683 * 8684 * The policy to choose the appropriate context is simple; 8685 * just sweep all the ctxs using ctxhand. This will steal 8686 * the LRU ctx. 8687 * 8688 * We however only steal a non-free context that can be 8689 * write locked. Keep searching till we find a stealable 8690 * ctx. 8691 */ 8692 mutex_exit(&ctx_list_lock); 8693 ctx = ctxhand; 8694 do { 8695 /* 8696 * If you get the writers lock, and the ctx isn't 8697 * a free ctx, THEN you can steal this ctx. 
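 * (CTX_FREE_FLAG is re-checked under the write lock because the context
 * could have been freed between the unlocked test and the tryenter.)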
8698 */ 8699 if ((ctx->ctx_flags & CTX_FREE_FLAG) == 0 && 8700 rw_tryenter(&ctx->ctx_rwlock, RW_WRITER) != 0) { 8701 if (ctx->ctx_flags & CTX_FREE_FLAG) { 8702 /* let the first guy have it */ 8703 rw_exit(&ctx->ctx_rwlock); 8704 } else { 8705 found_stealable_ctx = 1; 8706 break; 8707 } 8708 } 8709 ctx = NEXT_CTX(ctx); 8710 } while (ctx != ctxhand); 8711 8712 if (found_stealable_ctx) { 8713 /* 8714 * Try and reuse the ctx. 8715 */ 8716 sfmmu_reuse_ctx(ctx, sfmmup); 8717 8718 } else if (retry_count++ < GET_CTX_RETRY_CNT) { 8719 goto retry; 8720 8721 } else { 8722 panic("Can't find any stealable context"); 8723 } 8724 } 8725 8726 ASSERT(rw_read_locked(&ctx->ctx_rwlock) == 0); /* write locked */ 8727 ctx->ctx_sfmmu = sfmmup; 8728 8729 /* 8730 * Clear the ctx_flags field. 8731 */ 8732 ctx->ctx_flags = 0; 8733 8734 cnum = ctxtoctxnum(ctx); 8735 membar_exit(); 8736 sfmmup->sfmmu_cnum = cnum; 8737 8738 /* 8739 * Let the MMU set up the page sizes to use for 8740 * this context in the TLB. Don't program 2nd dtlb for ism hat. 8741 */ 8742 if ((&mmu_set_ctx_page_sizes) && (sfmmup->sfmmu_ismhat == 0)) 8743 mmu_set_ctx_page_sizes(sfmmup); 8744 8745 /* 8746 * Downgrade to reader's lock. 8747 */ 8748 rw_downgrade(&ctx->ctx_rwlock); 8749 8750 /* 8751 * If this value doesn't get set to what we want 8752 * it won't matter, so don't worry about locking. 8753 */ 8754 ctxhand = NEXT_CTX(ctx); 8755 8756 /* 8757 * Better not have been stolen while we held the ctx' 8758 * lock or we're hosed. 8759 */ 8760 ASSERT(sfmmup == sfmmutoctx(sfmmup)->ctx_sfmmu); 8761 8762 return (ctx); 8763 8764 #undef NEXT_CTX 8765 } 8766 8767 8768 /* 8769 * Set the process context to INVALID_CONTEXT (but 8770 * without stealing the ctx) so that it faults and 8771 * reloads the MMU state from TL=0. Caller must 8772 * hold the hat lock since we don't acquire it here. 8773 */ 8774 static void 8775 sfmmu_sync_mmustate(sfmmu_t *sfmmup) 8776 { 8777 int cnum; 8778 cpuset_t cpuset; 8779 8780 ASSERT(sfmmup != ksfmmup); 8781 ASSERT(sfmmu_hat_lock_held(sfmmup)); 8782 8783 kpreempt_disable(); 8784 8785 cnum = sfmmutoctxnum(sfmmup); 8786 if (cnum != INVALID_CONTEXT) { 8787 cpuset = sfmmup->sfmmu_cpusran; 8788 CPUSET_DEL(cpuset, CPU->cpu_id); 8789 CPUSET_AND(cpuset, cpu_ready_set); 8790 SFMMU_XCALL_STATS(cnum); 8791 8792 xt_some(cpuset, sfmmu_raise_tsb_exception, 8793 cnum, INVALID_CONTEXT); 8794 xt_sync(cpuset); 8795 8796 /* 8797 * If the process is running on the local CPU 8798 * we need to update the MMU state here as well. 8799 */ 8800 if (sfmmu_getctx_sec() == cnum) 8801 sfmmu_load_mmustate(sfmmup); 8802 8803 SFMMU_STAT(sf_tsb_raise_exception); 8804 } 8805 8806 kpreempt_enable(); 8807 } 8808 8809 8810 /* 8811 * Replace the specified TSB with a new TSB. This function gets called when 8812 * we grow, shrink or swapin a TSB. When swapping in a TSB (TSB_SWAPIN), the 8813 * TSB_FORCEALLOC flag may be used to force allocation of a minimum-sized TSB 8814 * (8K). 8815 * 8816 * Caller must hold the HAT lock, but should assume any tsb_info 8817 * pointers it has are no longer valid after calling this function. 8818 * 8819 * Return values: 8820 * TSB_ALLOCFAIL Failed to allocate a TSB, due to memory constraints 8821 * TSB_LOSTRACE HAT is busy, i.e. 
another thread is already doing 8822 * something to this tsbinfo/TSB 8823 * TSB_SUCCESS Operation succeeded 8824 */ 8825 static tsb_replace_rc_t 8826 sfmmu_replace_tsb(sfmmu_t *sfmmup, struct tsb_info *old_tsbinfo, uint_t szc, 8827 hatlock_t *hatlockp, uint_t flags) 8828 { 8829 struct tsb_info *new_tsbinfo = NULL; 8830 struct tsb_info *curtsb, *prevtsb; 8831 uint_t tte_sz_mask; 8832 cpuset_t cpuset; 8833 struct ctx *ctx = NULL; 8834 int ctxnum; 8835 8836 ASSERT(sfmmup != ksfmmup); 8837 ASSERT(sfmmup->sfmmu_ismhat == 0); 8838 ASSERT(sfmmu_hat_lock_held(sfmmup)); 8839 ASSERT(szc <= tsb_max_growsize); 8840 8841 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_BUSY)) 8842 return (TSB_LOSTRACE); 8843 8844 /* 8845 * Find the tsb_info ahead of this one in the list, and 8846 * also make sure that the tsb_info passed in really 8847 * exists! 8848 */ 8849 for (prevtsb = NULL, curtsb = sfmmup->sfmmu_tsb; 8850 curtsb != old_tsbinfo && curtsb != NULL; 8851 prevtsb = curtsb, curtsb = curtsb->tsb_next); 8852 ASSERT(curtsb != NULL); 8853 8854 if (!(flags & TSB_SWAPIN) && SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) { 8855 /* 8856 * The process is swapped out, so just set the new size 8857 * code. When it swaps back in, we'll allocate a new one 8858 * of the new chosen size. 8859 */ 8860 curtsb->tsb_szc = szc; 8861 return (TSB_SUCCESS); 8862 } 8863 SFMMU_FLAGS_SET(sfmmup, HAT_BUSY); 8864 8865 tte_sz_mask = old_tsbinfo->tsb_ttesz_mask; 8866 8867 /* 8868 * All initialization is done inside of sfmmu_tsbinfo_alloc(). 8869 * If we fail to allocate a TSB, exit. 8870 */ 8871 sfmmu_hat_exit(hatlockp); 8872 if (sfmmu_tsbinfo_alloc(&new_tsbinfo, szc, tte_sz_mask, 8873 flags, sfmmup)) { 8874 (void) sfmmu_hat_enter(sfmmup); 8875 if (!(flags & TSB_SWAPIN)) 8876 SFMMU_STAT(sf_tsb_resize_failures); 8877 SFMMU_FLAGS_CLEAR(sfmmup, HAT_BUSY); 8878 return (TSB_ALLOCFAIL); 8879 } 8880 (void) sfmmu_hat_enter(sfmmup); 8881 8882 /* 8883 * Re-check to make sure somebody else didn't muck with us while we 8884 * didn't hold the HAT lock. If the process swapped out, fine, just 8885 * exit; this can happen if we try to shrink the TSB from the context 8886 * of another process (such as on an ISM unmap), though it is rare. 8887 */ 8888 if (!(flags & TSB_SWAPIN) && SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) { 8889 SFMMU_STAT(sf_tsb_resize_failures); 8890 SFMMU_FLAGS_CLEAR(sfmmup, HAT_BUSY); 8891 sfmmu_hat_exit(hatlockp); 8892 sfmmu_tsbinfo_free(new_tsbinfo); 8893 (void) sfmmu_hat_enter(sfmmup); 8894 return (TSB_LOSTRACE); 8895 } 8896 8897 #ifdef DEBUG 8898 /* Reverify that the tsb_info still exists.. for debugging only */ 8899 for (prevtsb = NULL, curtsb = sfmmup->sfmmu_tsb; 8900 curtsb != old_tsbinfo && curtsb != NULL; 8901 prevtsb = curtsb, curtsb = curtsb->tsb_next); 8902 ASSERT(curtsb != NULL); 8903 #endif /* DEBUG */ 8904 8905 /* 8906 * Quiesce any CPUs running this process on their next TLB miss 8907 * so they atomically see the new tsb_info. We temporarily set the 8908 * context to invalid context so new threads that come on processor 8909 * after we do the xcall to cpusran will also serialize behind the 8910 * HAT lock on TLB miss and will see the new TSB. Since this short 8911 * race with a new thread coming on processor is relatively rare, 8912 * this synchronization mechanism should be cheaper than always 8913 * pausing all CPUs for the duration of the setup, which is what 8914 * the old implementation did. This is particuarly true if we are 8915 * copying a huge chunk of memory around during that window. 
8916 * 8917 * The memory barriers are to make sure things stay consistent 8918 * with resume() since it does not hold the HAT lock while 8919 * walking the list of tsb_info structures. 8920 */ 8921 if ((flags & TSB_SWAPIN) != TSB_SWAPIN) { 8922 /* The TSB is either growing or shrinking. */ 8923 ctx = sfmmutoctx(sfmmup); 8924 rw_enter(&ctx->ctx_rwlock, RW_WRITER); 8925 8926 ctxnum = sfmmutoctxnum(sfmmup); 8927 sfmmup->sfmmu_cnum = INVALID_CONTEXT; 8928 membar_enter(); /* make sure visible on all CPUs */ 8929 8930 kpreempt_disable(); 8931 if (ctxnum != INVALID_CONTEXT) { 8932 cpuset = sfmmup->sfmmu_cpusran; 8933 CPUSET_DEL(cpuset, CPU->cpu_id); 8934 CPUSET_AND(cpuset, cpu_ready_set); 8935 SFMMU_XCALL_STATS(ctxnum); 8936 8937 xt_some(cpuset, sfmmu_raise_tsb_exception, 8938 ctxnum, INVALID_CONTEXT); 8939 xt_sync(cpuset); 8940 8941 SFMMU_STAT(sf_tsb_raise_exception); 8942 } 8943 kpreempt_enable(); 8944 } else { 8945 /* 8946 * It is illegal to swap in TSBs from a process other 8947 * than a process being swapped in. This in turn 8948 * implies we do not have a valid MMU context here 8949 * since a process needs one to resolve translation 8950 * misses. 8951 */ 8952 ASSERT(curthread->t_procp->p_as->a_hat == sfmmup); 8953 ASSERT(sfmmutoctxnum(sfmmup) == INVALID_CONTEXT); 8954 } 8955 8956 new_tsbinfo->tsb_next = old_tsbinfo->tsb_next; 8957 membar_stst(); /* strict ordering required */ 8958 if (prevtsb) 8959 prevtsb->tsb_next = new_tsbinfo; 8960 else 8961 sfmmup->sfmmu_tsb = new_tsbinfo; 8962 membar_enter(); /* make sure new TSB globally visible */ 8963 sfmmu_setup_tsbinfo(sfmmup); 8964 8965 /* 8966 * We need to migrate TSB entries from the old TSB to the new TSB 8967 * if tsb_remap_ttes is set and the TSB is growing. 8968 */ 8969 if (tsb_remap_ttes && ((flags & TSB_GROW) == TSB_GROW)) 8970 sfmmu_copy_tsb(old_tsbinfo, new_tsbinfo); 8971 8972 if ((flags & TSB_SWAPIN) != TSB_SWAPIN) { 8973 kpreempt_disable(); 8974 membar_exit(); 8975 sfmmup->sfmmu_cnum = ctxnum; 8976 if (ctxnum != INVALID_CONTEXT && 8977 sfmmu_getctx_sec() == ctxnum) { 8978 sfmmu_load_mmustate(sfmmup); 8979 } 8980 kpreempt_enable(); 8981 rw_exit(&ctx->ctx_rwlock); 8982 } 8983 8984 SFMMU_FLAGS_CLEAR(sfmmup, HAT_BUSY); 8985 8986 /* 8987 * Drop the HAT lock to free our old tsb_info. 8988 */ 8989 sfmmu_hat_exit(hatlockp); 8990 8991 if ((flags & TSB_GROW) == TSB_GROW) { 8992 SFMMU_STAT(sf_tsb_grow); 8993 } else if ((flags & TSB_SHRINK) == TSB_SHRINK) { 8994 SFMMU_STAT(sf_tsb_shrink); 8995 } 8996 8997 sfmmu_tsbinfo_free(old_tsbinfo); 8998 8999 (void) sfmmu_hat_enter(sfmmup); 9000 return (TSB_SUCCESS); 9001 } 9002 9003 /* 9004 * Steal context from process, forcing the process to switch to another 9005 * context on the next TLB miss, and therefore start using the TLB that 9006 * is reprogrammed for the new page sizes. 
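 * The new page sizes take effect the next time the process acquires a
 * context, when sfmmu_get_ctx() calls mmu_set_ctx_page_sizes().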
9007 */ 9008 void 9009 sfmmu_steal_context(sfmmu_t *sfmmup, uint8_t *tmp_pgsz) 9010 { 9011 struct ctx *ctx; 9012 int i, cnum; 9013 hatlock_t *hatlockp = NULL; 9014 9015 hatlockp = sfmmu_hat_enter(sfmmup); 9016 /* USIII+-IV+ optimization, requires hat lock */ 9017 if (tmp_pgsz) { 9018 for (i = 0; i < mmu_page_sizes; i++) 9019 sfmmup->sfmmu_pgsz[i] = tmp_pgsz[i]; 9020 } 9021 SFMMU_STAT(sf_tlb_reprog_pgsz); 9022 ctx = sfmmutoctx(sfmmup); 9023 rw_enter(&ctx->ctx_rwlock, RW_WRITER); 9024 cnum = sfmmutoctxnum(sfmmup); 9025 9026 if (cnum != INVALID_CONTEXT) { 9027 sfmmu_tlb_swap_ctx(sfmmup, ctx); 9028 } 9029 rw_exit(&ctx->ctx_rwlock); 9030 sfmmu_hat_exit(hatlockp); 9031 } 9032 9033 /* 9034 * This function assumes that there are either four or six supported page 9035 * sizes and at most two programmable TLBs, so we need to decide which 9036 * page sizes are most important and then tell the MMU layer so it 9037 * can adjust the TLB page sizes accordingly (if supported). 9038 * 9039 * If these assumptions change, this function will need to be 9040 * updated to support whatever the new limits are. 9041 * 9042 * The growing flag is nonzero if we are growing the address space, 9043 * and zero if it is shrinking. This allows us to decide whether 9044 * to grow or shrink our TSB, depending upon available memory 9045 * conditions. 9046 */ 9047 static void 9048 sfmmu_check_page_sizes(sfmmu_t *sfmmup, int growing) 9049 { 9050 uint64_t ttecnt[MMU_PAGE_SIZES]; 9051 uint64_t tte8k_cnt, tte4m_cnt; 9052 uint8_t i; 9053 9054 /* 9055 * Kernel threads, processes with small address spaces not using 9056 * large pages, and dummy ISM HATs need not apply. 9057 */ 9058 if (sfmmup == ksfmmup || sfmmup->sfmmu_ismhat != NULL) 9059 return; 9060 9061 if ((sfmmup->sfmmu_flags & HAT_LGPG_FLAGS) == 0 && 9062 sfmmup->sfmmu_ttecnt[TTE8K] <= tsb_rss_factor) 9063 return; 9064 9065 for (i = 0; i < mmu_page_sizes; i++) { 9066 ttecnt[i] = SFMMU_TTE_CNT(sfmmup, i); 9067 } 9068 9069 /* Check pagesizes in use, and possibly reprogram DTLB. */ 9070 if (&mmu_check_page_sizes) 9071 mmu_check_page_sizes(sfmmup, ttecnt); 9072 9073 /* 9074 * Calculate the number of 8k ttes to represent the span of these 9075 * pages. 9076 */ 9077 tte8k_cnt = ttecnt[TTE8K] + 9078 (ttecnt[TTE64K] << (MMU_PAGESHIFT64K - MMU_PAGESHIFT)) + 9079 (ttecnt[TTE512K] << (MMU_PAGESHIFT512K - MMU_PAGESHIFT)); 9080 if (mmu_page_sizes == max_mmu_page_sizes) { 9081 tte4m_cnt = ttecnt[TTE4M] + 9082 (ttecnt[TTE32M] << (MMU_PAGESHIFT32M - MMU_PAGESHIFT4M)) + 9083 (ttecnt[TTE256M] << (MMU_PAGESHIFT256M - MMU_PAGESHIFT4M)); 9084 } else { 9085 tte4m_cnt = ttecnt[TTE4M]; 9086 } 9087 9088 /* 9089 * Inflate TSB sizes by a factor of 2 if this process 9090 * uses 4M text pages to minimize extra conflict misses 9091 * in the first TSB since without counting text pages 9092 * 8K TSB may become too small. 9093 * 9094 * Also double the size of the second TSB to minimize 9095 * extra conflict misses due to competition between 4M text pages 9096 * and data pages. 9097 */ 9098 if (sfmmup->sfmmu_flags & HAT_4MTEXT_FLAG) { 9099 tte8k_cnt <<= 1; 9100 tte4m_cnt <<= 1; 9101 } 9102 9103 /* 9104 * Check to see if our TSB is the right size; we may need to 9105 * grow or shrink it. If the process is small, our work is 9106 * finished at this point. 
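 * Here "small" means the 8K-equivalent count fits within tsb_rss_factor
 * and the 4M-equivalent count is at or below tsb_sectsb_threshold.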
9107 */ 9108 if (tte8k_cnt <= tsb_rss_factor && tte4m_cnt <= tsb_sectsb_threshold) { 9109 return; 9110 } 9111 sfmmu_size_tsb(sfmmup, growing, tte8k_cnt, tte4m_cnt); 9112 } 9113 9114 static void 9115 sfmmu_size_tsb(sfmmu_t *sfmmup, int growing, uint64_t tte8k_cnt, 9116 uint64_t tte4m_cnt) 9117 { 9118 int tsb_bits; 9119 uint_t tsb_szc; 9120 struct tsb_info *tsbinfop; 9121 hatlock_t *hatlockp = NULL; 9122 9123 hatlockp = sfmmu_hat_enter(sfmmup); 9124 ASSERT(hatlockp != NULL); 9125 tsbinfop = sfmmup->sfmmu_tsb; 9126 ASSERT(tsbinfop != NULL); 9127 9128 /* 9129 * If we're growing, select the size based on RSS. If we're 9130 * shrinking, leave some room so we don't have to turn around and 9131 * grow again immediately. 9132 */ 9133 if (growing) 9134 tsb_szc = SELECT_TSB_SIZECODE(tte8k_cnt); 9135 else 9136 tsb_szc = SELECT_TSB_SIZECODE(tte8k_cnt << 1); 9137 9138 if (!growing && (tsb_szc < tsbinfop->tsb_szc) && 9139 (tsb_szc >= default_tsb_size) && TSB_OK_SHRINK()) { 9140 (void) sfmmu_replace_tsb(sfmmup, tsbinfop, tsb_szc, 9141 hatlockp, TSB_SHRINK); 9142 } else if (growing && tsb_szc > tsbinfop->tsb_szc && TSB_OK_GROW()) { 9143 (void) sfmmu_replace_tsb(sfmmup, tsbinfop, tsb_szc, 9144 hatlockp, TSB_GROW); 9145 } 9146 tsbinfop = sfmmup->sfmmu_tsb; 9147 9148 /* 9149 * With the TLB and first TSB out of the way, we need to see if 9150 * we need a second TSB for 4M pages. If we managed to reprogram 9151 * the TLB page sizes above, the process will start using this new 9152 * TSB right away; otherwise, it will start using it on the next 9153 * context switch. Either way, it's no big deal so there's no 9154 * synchronization with the trap handlers here unless we grow the 9155 * TSB (in which case it's required to prevent using the old one 9156 * after it's freed). Note: second tsb is required for 32M/256M 9157 * page sizes. 9158 */ 9159 if (tte4m_cnt > tsb_sectsb_threshold) { 9160 /* 9161 * If we're growing, select the size based on RSS. If we're 9162 * shrinking, leave some room so we don't have to turn 9163 * around and grow again immediately. 9164 */ 9165 if (growing) 9166 tsb_szc = SELECT_TSB_SIZECODE(tte4m_cnt); 9167 else 9168 tsb_szc = SELECT_TSB_SIZECODE(tte4m_cnt << 1); 9169 if (tsbinfop->tsb_next == NULL) { 9170 struct tsb_info *newtsb; 9171 int allocflags = SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)? 9172 0 : TSB_ALLOC; 9173 9174 sfmmu_hat_exit(hatlockp); 9175 9176 /* 9177 * Try to allocate a TSB for 4[32|256]M pages. If we 9178 * can't get the size we want, retry w/a minimum sized 9179 * TSB. If that still didn't work, give up; we can 9180 * still run without one. 9181 */ 9182 tsb_bits = (mmu_page_sizes == max_mmu_page_sizes)? 9183 TSB4M|TSB32M|TSB256M:TSB4M; 9184 if ((sfmmu_tsbinfo_alloc(&newtsb, tsb_szc, tsb_bits, 9185 allocflags, sfmmup) != 0) && 9186 (sfmmu_tsbinfo_alloc(&newtsb, TSB_MIN_SZCODE, 9187 tsb_bits, allocflags, sfmmup) != 0)) { 9188 return; 9189 } 9190 9191 hatlockp = sfmmu_hat_enter(sfmmup); 9192 9193 if (sfmmup->sfmmu_tsb->tsb_next == NULL) { 9194 sfmmup->sfmmu_tsb->tsb_next = newtsb; 9195 SFMMU_STAT(sf_tsb_sectsb_create); 9196 sfmmu_setup_tsbinfo(sfmmup); 9197 sfmmu_hat_exit(hatlockp); 9198 return; 9199 } else { 9200 /* 9201 * It's annoying, but possible for us 9202 * to get here.. we dropped the HAT lock 9203 * because of locking order in the kmem 9204 * allocator, and while we were off getting 9205 * our memory, some other thread decided to 9206 * do us a favor and won the race to get a 9207 * second TSB for this process. Sigh. 
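 * In that case the TSB we just allocated is simply freed and the
 * winner's TSB is used instead.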
9208 */ 9209 sfmmu_hat_exit(hatlockp); 9210 sfmmu_tsbinfo_free(newtsb); 9211 return; 9212 } 9213 } 9214 9215 /* 9216 * We have a second TSB, see if it's big enough. 9217 */ 9218 tsbinfop = tsbinfop->tsb_next; 9219 9220 /* 9221 * Check to see if our second TSB is the right size; 9222 * we may need to grow or shrink it. 9223 * To prevent thrashing (e.g. growing the TSB on a 9224 * subsequent map operation), only try to shrink if 9225 * the TSB reach exceeds twice the virtual address 9226 * space size. 9227 */ 9228 if (!growing && (tsb_szc < tsbinfop->tsb_szc) && 9229 (tsb_szc >= default_tsb_size) && TSB_OK_SHRINK()) { 9230 (void) sfmmu_replace_tsb(sfmmup, tsbinfop, 9231 tsb_szc, hatlockp, TSB_SHRINK); 9232 } else if (growing && tsb_szc > tsbinfop->tsb_szc && 9233 TSB_OK_GROW()) { 9234 (void) sfmmu_replace_tsb(sfmmup, tsbinfop, 9235 tsb_szc, hatlockp, TSB_GROW); 9236 } 9237 } 9238 9239 sfmmu_hat_exit(hatlockp); 9240 } 9241 9242 /* 9243 * Get the preferred page size code for a hat. 9244 * This is only advice, so locking is not done; 9245 * this transitory information could change 9246 * following the call anyway. This interface is 9247 * sun4 private. 9248 */ 9249 /*ARGSUSED*/ 9250 uint_t 9251 hat_preferred_pgsz(struct hat *hat, caddr_t vaddr, size_t maplen, int maptype) 9252 { 9253 sfmmu_t *sfmmup = (sfmmu_t *)hat; 9254 uint_t szc, maxszc = mmu_page_sizes - 1; 9255 size_t pgsz; 9256 9257 if (maptype == MAPPGSZ_ISM) { 9258 for (szc = maxszc; szc >= TTE4M; szc--) { 9259 if (disable_ism_large_pages & (1 << szc)) 9260 continue; 9261 9262 pgsz = hw_page_array[szc].hp_size; 9263 if ((maplen >= pgsz) && IS_P2ALIGNED(vaddr, pgsz)) 9264 return (szc); 9265 } 9266 return (TTE4M); 9267 } else if (&mmu_preferred_pgsz) { /* USIII+-USIV+ */ 9268 return (mmu_preferred_pgsz(sfmmup, vaddr, maplen)); 9269 } else { /* USIII, USII, Niagara */ 9270 for (szc = maxszc; szc > TTE8K; szc--) { 9271 if (disable_large_pages & (1 << szc)) 9272 continue; 9273 9274 pgsz = hw_page_array[szc].hp_size; 9275 if ((maplen >= pgsz) && IS_P2ALIGNED(vaddr, pgsz)) 9276 return (szc); 9277 } 9278 return (TTE8K); 9279 } 9280 } 9281 9282 /* 9283 * Free up a ctx 9284 */ 9285 static void 9286 sfmmu_free_ctx(sfmmu_t *sfmmup, struct ctx *ctx) 9287 { 9288 int ctxnum; 9289 9290 rw_enter(&ctx->ctx_rwlock, RW_WRITER); 9291 9292 TRACE_CTXS(&ctx_trace_mutex, ctx_trace_ptr, sfmmup->sfmmu_cnum, 9293 sfmmup, 0, CTX_TRC_FREE); 9294 9295 if (sfmmup->sfmmu_cnum == INVALID_CONTEXT) { 9296 CPUSET_ZERO(sfmmup->sfmmu_cpusran); 9297 rw_exit(&ctx->ctx_rwlock); 9298 return; 9299 } 9300 9301 ASSERT(sfmmup == ctx->ctx_sfmmu); 9302 9303 ctx->ctx_sfmmu = NULL; 9304 ctx->ctx_flags = 0; 9305 sfmmup->sfmmu_cnum = INVALID_CONTEXT; 9306 membar_enter(); 9307 CPUSET_ZERO(sfmmup->sfmmu_cpusran); 9308 ctxnum = sfmmu_getctx_sec(); 9309 if (ctxnum == ctxtoctxnum(ctx)) { 9310 sfmmu_setctx_sec(INVALID_CONTEXT); 9311 sfmmu_clear_utsbinfo(); 9312 } 9313 9314 /* 9315 * Put the freed ctx on the dirty list 9316 */ 9317 mutex_enter(&ctx_list_lock); 9318 CTX_SET_FLAGS(ctx, CTX_FREE_FLAG); 9319 ctx->ctx_free = ctxdirty; 9320 ctxdirty = ctx; 9321 mutex_exit(&ctx_list_lock); 9322 9323 rw_exit(&ctx->ctx_rwlock); 9324 } 9325 9326 /* 9327 * Free up a sfmmu 9328 * Since the sfmmu is currently embedded in the hat struct we simply zero 9329 * out our fields and free up the ism map blk list if any. 
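 * By this point all translations must have been unloaded (every ttecnt
 * is zero) and the context must already have been freed.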
9330 */ 9331 static void 9332 sfmmu_free_sfmmu(sfmmu_t *sfmmup) 9333 { 9334 ism_blk_t *blkp, *nx_blkp; 9335 #ifdef DEBUG 9336 ism_map_t *map; 9337 int i; 9338 #endif 9339 9340 ASSERT(sfmmup->sfmmu_ttecnt[TTE8K] == 0); 9341 ASSERT(sfmmup->sfmmu_ttecnt[TTE64K] == 0); 9342 ASSERT(sfmmup->sfmmu_ttecnt[TTE512K] == 0); 9343 ASSERT(sfmmup->sfmmu_ttecnt[TTE4M] == 0); 9344 ASSERT(sfmmup->sfmmu_ttecnt[TTE32M] == 0); 9345 ASSERT(sfmmup->sfmmu_ttecnt[TTE256M] == 0); 9346 ASSERT(sfmmup->sfmmu_cnum == INVALID_CONTEXT); 9347 sfmmup->sfmmu_free = 0; 9348 sfmmup->sfmmu_ismhat = 0; 9349 9350 blkp = sfmmup->sfmmu_iblk; 9351 sfmmup->sfmmu_iblk = NULL; 9352 9353 while (blkp) { 9354 #ifdef DEBUG 9355 map = blkp->iblk_maps; 9356 for (i = 0; i < ISM_MAP_SLOTS; i++) { 9357 ASSERT(map[i].imap_seg == 0); 9358 ASSERT(map[i].imap_ismhat == NULL); 9359 ASSERT(map[i].imap_ment == NULL); 9360 } 9361 #endif 9362 nx_blkp = blkp->iblk_next; 9363 blkp->iblk_next = NULL; 9364 blkp->iblk_nextpa = (uint64_t)-1; 9365 kmem_cache_free(ism_blk_cache, blkp); 9366 blkp = nx_blkp; 9367 } 9368 } 9369 9370 /* 9371 * Locking primitves accessed by HATLOCK macros 9372 */ 9373 9374 #define SFMMU_SPL_MTX (0x0) 9375 #define SFMMU_ML_MTX (0x1) 9376 9377 #define SFMMU_MLSPL_MTX(type, pg) (((type) == SFMMU_SPL_MTX) ? \ 9378 SPL_HASH(pg) : MLIST_HASH(pg)) 9379 9380 kmutex_t * 9381 sfmmu_page_enter(struct page *pp) 9382 { 9383 return (sfmmu_mlspl_enter(pp, SFMMU_SPL_MTX)); 9384 } 9385 9386 static void 9387 sfmmu_page_exit(kmutex_t *spl) 9388 { 9389 mutex_exit(spl); 9390 } 9391 9392 static int 9393 sfmmu_page_spl_held(struct page *pp) 9394 { 9395 return (sfmmu_mlspl_held(pp, SFMMU_SPL_MTX)); 9396 } 9397 9398 kmutex_t * 9399 sfmmu_mlist_enter(struct page *pp) 9400 { 9401 return (sfmmu_mlspl_enter(pp, SFMMU_ML_MTX)); 9402 } 9403 9404 void 9405 sfmmu_mlist_exit(kmutex_t *mml) 9406 { 9407 mutex_exit(mml); 9408 } 9409 9410 int 9411 sfmmu_mlist_held(struct page *pp) 9412 { 9413 9414 return (sfmmu_mlspl_held(pp, SFMMU_ML_MTX)); 9415 } 9416 9417 /* 9418 * Common code for sfmmu_mlist_enter() and sfmmu_page_enter(). For 9419 * sfmmu_mlist_enter() case mml_table lock array is used and for 9420 * sfmmu_page_enter() sfmmu_page_lock lock array is used. 9421 * 9422 * The lock is taken on a root page so that it protects an operation on all 9423 * constituent pages of a large page pp belongs to. 9424 * 9425 * The routine takes a lock from the appropriate array. The lock is determined 9426 * by hashing the root page. After taking the lock this routine checks if the 9427 * root page has the same size code that was used to determine the root (i.e 9428 * that root hasn't changed). If root page has the expected p_szc field we 9429 * have the right lock and it's returned to the caller. If root's p_szc 9430 * decreased we release the lock and retry from the beginning. This case can 9431 * happen due to hat_page_demote() decreasing p_szc between our load of p_szc 9432 * value and taking the lock. The number of retries due to p_szc decrease is 9433 * limited by the maximum p_szc value. If p_szc is 0 we return the lock 9434 * determined by hashing pp itself. 9435 * 9436 * If our caller doesn't hold a SE_SHARED or SE_EXCL lock on pp it's also 9437 * possible that p_szc can increase. To increase p_szc a thread has to lock 9438 * all constituent pages EXCL and do hat_pageunload() on all of them. All the 9439 * callers that don't hold a page locked recheck if hmeblk through which pp 9440 * was found still maps this pp. 
If it doesn't map it anymore returned lock 9441 * is immediately dropped. Therefore if sfmmu_mlspl_enter() hits the case of 9442 * p_szc increase after taking the lock it returns this lock without further 9443 * retries because in this case the caller doesn't care about which lock was 9444 * taken. The caller will drop it right away. 9445 * 9446 * After the routine returns it's guaranteed that hat_page_demote() can't 9447 * change p_szc field of any of constituent pages of a large page pp belongs 9448 * to as long as pp was either locked at least SHARED prior to this call or 9449 * the caller finds that hment that pointed to this pp still references this 9450 * pp (this also assumes that the caller holds hme hash bucket lock so that 9451 * the same pp can't be remapped into the same hmeblk after it was unmapped by 9452 * hat_pageunload()). 9453 */ 9454 static kmutex_t * 9455 sfmmu_mlspl_enter(struct page *pp, int type) 9456 { 9457 kmutex_t *mtx; 9458 uint_t prev_rszc = UINT_MAX; 9459 page_t *rootpp; 9460 uint_t szc; 9461 uint_t rszc; 9462 uint_t pszc = pp->p_szc; 9463 9464 ASSERT(pp != NULL); 9465 9466 again: 9467 if (pszc == 0) { 9468 mtx = SFMMU_MLSPL_MTX(type, pp); 9469 mutex_enter(mtx); 9470 return (mtx); 9471 } 9472 9473 /* The lock lives in the root page */ 9474 rootpp = PP_GROUPLEADER(pp, pszc); 9475 mtx = SFMMU_MLSPL_MTX(type, rootpp); 9476 mutex_enter(mtx); 9477 9478 /* 9479 * Return mml in the following 3 cases: 9480 * 9481 * 1) If pp itself is root since if its p_szc decreased before we took 9482 * the lock pp is still the root of smaller szc page. And if its p_szc 9483 * increased it doesn't matter what lock we return (see comment in 9484 * front of this routine). 9485 * 9486 * 2) If pp's not root but rootpp is the root of a rootpp->p_szc size 9487 * large page we have the right lock since any previous potential 9488 * hat_page_demote() is done demoting from greater than current root's 9489 * p_szc because hat_page_demote() changes root's p_szc last. No 9490 * further hat_page_demote() can start or be in progress since it 9491 * would need the same lock we currently hold. 9492 * 9493 * 3) If rootpp's p_szc increased since previous iteration it doesn't 9494 * matter what lock we return (see comment in front of this routine). 9495 */ 9496 if (pp == rootpp || (rszc = rootpp->p_szc) == pszc || 9497 rszc >= prev_rszc) { 9498 return (mtx); 9499 } 9500 9501 /* 9502 * hat_page_demote() could have decreased root's p_szc. 9503 * In this case pp's p_szc must also be smaller than pszc. 9504 * Retry. 9505 */ 9506 if (rszc < pszc) { 9507 szc = pp->p_szc; 9508 if (szc < pszc) { 9509 mutex_exit(mtx); 9510 pszc = szc; 9511 goto again; 9512 } 9513 /* 9514 * pp's p_szc increased after it was decreased. 9515 * page cannot be mapped. Return current lock. The caller 9516 * will drop it right away. 9517 */ 9518 return (mtx); 9519 } 9520 9521 /* 9522 * root's p_szc is greater than pp's p_szc. 9523 * hat_page_demote() is not done with all pages 9524 * yet. Wait for it to complete. 
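 * We wait by acquiring and immediately dropping the mutex hashed from
 * the larger root page (presumably held by hat_page_demote() for the
 * duration of the demotion) and then retry from the top.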
9525 */ 9526 mutex_exit(mtx); 9527 rootpp = PP_GROUPLEADER(rootpp, rszc); 9528 mtx = SFMMU_MLSPL_MTX(type, rootpp); 9529 mutex_enter(mtx); 9530 mutex_exit(mtx); 9531 prev_rszc = rszc; 9532 goto again; 9533 } 9534 9535 static int 9536 sfmmu_mlspl_held(struct page *pp, int type) 9537 { 9538 kmutex_t *mtx; 9539 9540 ASSERT(pp != NULL); 9541 /* The lock lives in the root page */ 9542 pp = PP_PAGEROOT(pp); 9543 ASSERT(pp != NULL); 9544 9545 mtx = SFMMU_MLSPL_MTX(type, pp); 9546 return (MUTEX_HELD(mtx)); 9547 } 9548 9549 static uint_t 9550 sfmmu_get_free_hblk(struct hme_blk **hmeblkpp, uint_t critical) 9551 { 9552 struct hme_blk *hblkp; 9553 9554 if (freehblkp != NULL) { 9555 mutex_enter(&freehblkp_lock); 9556 if (freehblkp != NULL) { 9557 /* 9558 * If the current thread is owning hblk_reserve, 9559 * let it succede even if freehblkcnt is really low. 9560 */ 9561 if (freehblkcnt <= HBLK_RESERVE_MIN && !critical) { 9562 SFMMU_STAT(sf_get_free_throttle); 9563 mutex_exit(&freehblkp_lock); 9564 return (0); 9565 } 9566 freehblkcnt--; 9567 *hmeblkpp = freehblkp; 9568 hblkp = *hmeblkpp; 9569 freehblkp = hblkp->hblk_next; 9570 mutex_exit(&freehblkp_lock); 9571 hblkp->hblk_next = NULL; 9572 SFMMU_STAT(sf_get_free_success); 9573 return (1); 9574 } 9575 mutex_exit(&freehblkp_lock); 9576 } 9577 SFMMU_STAT(sf_get_free_fail); 9578 return (0); 9579 } 9580 9581 static uint_t 9582 sfmmu_put_free_hblk(struct hme_blk *hmeblkp, uint_t critical) 9583 { 9584 struct hme_blk *hblkp; 9585 9586 /* 9587 * If the current thread is mapping into kernel space, 9588 * let it succede even if freehblkcnt is max 9589 * so that it will avoid freeing it to kmem. 9590 * This will prevent stack overflow due to 9591 * possible recursion since kmem_cache_free() 9592 * might require creation of a slab which 9593 * in turn needs an hmeblk to map that slab; 9594 * let's break this vicious chain at the first 9595 * opportunity. 9596 */ 9597 if (freehblkcnt < HBLK_RESERVE_CNT || critical) { 9598 mutex_enter(&freehblkp_lock); 9599 if (freehblkcnt < HBLK_RESERVE_CNT || critical) { 9600 SFMMU_STAT(sf_put_free_success); 9601 freehblkcnt++; 9602 hmeblkp->hblk_next = freehblkp; 9603 freehblkp = hmeblkp; 9604 mutex_exit(&freehblkp_lock); 9605 return (1); 9606 } 9607 mutex_exit(&freehblkp_lock); 9608 } 9609 9610 /* 9611 * Bring down freehblkcnt to HBLK_RESERVE_CNT. We are here 9612 * only if freehblkcnt is at least HBLK_RESERVE_CNT *and* 9613 * we are not in the process of mapping into kernel space. 
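 * The loop below re-checks the count under the lock and drops the lock
 * before each kmem_cache_free() call, so the free path never runs with
 * freehblkp_lock held.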
9614 */ 9615 ASSERT(!critical); 9616 while (freehblkcnt > HBLK_RESERVE_CNT) { 9617 mutex_enter(&freehblkp_lock); 9618 if (freehblkcnt > HBLK_RESERVE_CNT) { 9619 freehblkcnt--; 9620 hblkp = freehblkp; 9621 freehblkp = hblkp->hblk_next; 9622 mutex_exit(&freehblkp_lock); 9623 ASSERT(get_hblk_cache(hblkp) == sfmmu8_cache); 9624 kmem_cache_free(sfmmu8_cache, hblkp); 9625 continue; 9626 } 9627 mutex_exit(&freehblkp_lock); 9628 } 9629 SFMMU_STAT(sf_put_free_fail); 9630 return (0); 9631 } 9632 9633 static void 9634 sfmmu_hblk_swap(struct hme_blk *new) 9635 { 9636 struct hme_blk *old, *hblkp, *prev; 9637 uint64_t hblkpa, prevpa, newpa; 9638 caddr_t base, vaddr, endaddr; 9639 struct hmehash_bucket *hmebp; 9640 struct sf_hment *osfhme, *nsfhme; 9641 page_t *pp; 9642 kmutex_t *pml; 9643 tte_t tte; 9644 9645 #ifdef DEBUG 9646 hmeblk_tag hblktag; 9647 struct hme_blk *found; 9648 #endif 9649 old = HBLK_RESERVE; 9650 9651 /* 9652 * save pa before bcopy clobbers it 9653 */ 9654 newpa = new->hblk_nextpa; 9655 9656 base = (caddr_t)get_hblk_base(old); 9657 endaddr = base + get_hblk_span(old); 9658 9659 /* 9660 * acquire hash bucket lock. 9661 */ 9662 hmebp = sfmmu_tteload_acquire_hashbucket(ksfmmup, base, TTE8K); 9663 9664 /* 9665 * copy contents from old to new 9666 */ 9667 bcopy((void *)old, (void *)new, HME8BLK_SZ); 9668 9669 /* 9670 * add new to hash chain 9671 */ 9672 sfmmu_hblk_hash_add(hmebp, new, newpa); 9673 9674 /* 9675 * search hash chain for hblk_reserve; this needs to be performed 9676 * after adding new, otherwise prevpa and prev won't correspond 9677 * to the hblk which is prior to old in hash chain when we call 9678 * sfmmu_hblk_hash_rm to remove old later. 9679 */ 9680 for (prevpa = 0, prev = NULL, 9681 hblkpa = hmebp->hmeh_nextpa, hblkp = hmebp->hmeblkp; 9682 hblkp != NULL && hblkp != old; 9683 prevpa = hblkpa, prev = hblkp, 9684 hblkpa = hblkp->hblk_nextpa, hblkp = hblkp->hblk_next); 9685 9686 if (hblkp != old) 9687 panic("sfmmu_hblk_swap: hblk_reserve not found"); 9688 9689 /* 9690 * p_mapping list is still pointing to hments in hblk_reserve; 9691 * fix up p_mapping list so that they point to hments in new. 9692 * 9693 * Since all these mappings are created by hblk_reserve_thread 9694 * on the way and it's using at least one of the buffers from each of 9695 * the newly minted slabs, there is no danger of any of these 9696 * mappings getting unloaded by another thread. 9697 * 9698 * tsbmiss could only modify ref/mod bits of hments in old/new. 9699 * Since all of these hments hold mappings established by segkmem 9700 * and mappings in segkmem are setup with HAT_NOSYNC, ref/mod bits 9701 * have no meaning for the mappings in hblk_reserve. hments in 9702 * old and new are identical except for ref/mod bits. 
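 * The loop below walks every 8K hment in hblk_reserve; for each valid
 * mapping it takes the page's mlist lock and swaps the old hment for the
 * corresponding hment in 'new' on the page's p_mapping list.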
9703 */ 9704 for (vaddr = base; vaddr < endaddr; vaddr += TTEBYTES(TTE8K)) { 9705 9706 HBLKTOHME(osfhme, old, vaddr); 9707 sfmmu_copytte(&osfhme->hme_tte, &tte); 9708 9709 if (TTE_IS_VALID(&tte)) { 9710 if ((pp = osfhme->hme_page) == NULL) 9711 panic("sfmmu_hblk_swap: page not mapped"); 9712 9713 pml = sfmmu_mlist_enter(pp); 9714 9715 if (pp != osfhme->hme_page) 9716 panic("sfmmu_hblk_swap: mapping changed"); 9717 9718 HBLKTOHME(nsfhme, new, vaddr); 9719 9720 HME_ADD(nsfhme, pp); 9721 HME_SUB(osfhme, pp); 9722 9723 sfmmu_mlist_exit(pml); 9724 } 9725 } 9726 9727 /* 9728 * remove old from hash chain 9729 */ 9730 sfmmu_hblk_hash_rm(hmebp, old, prevpa, prev); 9731 9732 #ifdef DEBUG 9733 9734 hblktag.htag_id = ksfmmup; 9735 hblktag.htag_bspage = HME_HASH_BSPAGE(base, HME_HASH_SHIFT(TTE8K)); 9736 hblktag.htag_rehash = HME_HASH_REHASH(TTE8K); 9737 HME_HASH_FAST_SEARCH(hmebp, hblktag, found); 9738 9739 if (found != new) 9740 panic("sfmmu_hblk_swap: new hblk not found"); 9741 #endif 9742 9743 SFMMU_HASH_UNLOCK(hmebp); 9744 9745 /* 9746 * Reset hblk_reserve 9747 */ 9748 bzero((void *)old, HME8BLK_SZ); 9749 old->hblk_nextpa = va_to_pa((caddr_t)old); 9750 } 9751 9752 /* 9753 * Grab the mlist mutex for both pages passed in. 9754 * 9755 * low and high will be returned as pointers to the mutexes for these pages. 9756 * low refers to the mutex residing in the lower bin of the mlist hash, while 9757 * high refers to the mutex residing in the higher bin of the mlist hash. This 9758 * is due to the locking order restrictions on the same thread grabbing 9759 * multiple mlist mutexes. The low lock must be acquired before the high lock. 9760 * 9761 * If both pages hash to the same mutex, only grab that single mutex, and 9762 * high will be returned as NULL 9763 * If the pages hash to different bins in the hash, grab the lower addressed 9764 * lock first and then the higher addressed lock in order to follow the locking 9765 * rules involved with the same thread grabbing multiple mlist mutexes. 9766 * low and high will both have non-NULL values. 9767 */ 9768 static void 9769 sfmmu_mlist_reloc_enter(struct page *targ, struct page *repl, 9770 kmutex_t **low, kmutex_t **high) 9771 { 9772 kmutex_t *mml_targ, *mml_repl; 9773 9774 /* 9775 * no need to do the dance around szc as in sfmmu_mlist_enter() 9776 * because this routine is only called by hat_page_relocate() and all 9777 * targ and repl pages are already locked EXCL so szc can't change. 
9778 */ 9779 9780 mml_targ = MLIST_HASH(PP_PAGEROOT(targ)); 9781 mml_repl = MLIST_HASH(PP_PAGEROOT(repl)); 9782 9783 if (mml_targ == mml_repl) { 9784 *low = mml_targ; 9785 *high = NULL; 9786 } else { 9787 if (mml_targ < mml_repl) { 9788 *low = mml_targ; 9789 *high = mml_repl; 9790 } else { 9791 *low = mml_repl; 9792 *high = mml_targ; 9793 } 9794 } 9795 9796 mutex_enter(*low); 9797 if (*high) 9798 mutex_enter(*high); 9799 } 9800 9801 static void 9802 sfmmu_mlist_reloc_exit(kmutex_t *low, kmutex_t *high) 9803 { 9804 if (high) 9805 mutex_exit(high); 9806 mutex_exit(low); 9807 } 9808 9809 static hatlock_t * 9810 sfmmu_hat_enter(sfmmu_t *sfmmup) 9811 { 9812 hatlock_t *hatlockp; 9813 9814 if (sfmmup != ksfmmup) { 9815 hatlockp = TSB_HASH(sfmmup); 9816 mutex_enter(HATLOCK_MUTEXP(hatlockp)); 9817 return (hatlockp); 9818 } 9819 return (NULL); 9820 } 9821 9822 static hatlock_t * 9823 sfmmu_hat_tryenter(sfmmu_t *sfmmup) 9824 { 9825 hatlock_t *hatlockp; 9826 9827 if (sfmmup != ksfmmup) { 9828 hatlockp = TSB_HASH(sfmmup); 9829 if (mutex_tryenter(HATLOCK_MUTEXP(hatlockp)) == 0) 9830 return (NULL); 9831 return (hatlockp); 9832 } 9833 return (NULL); 9834 } 9835 9836 static void 9837 sfmmu_hat_exit(hatlock_t *hatlockp) 9838 { 9839 if (hatlockp != NULL) 9840 mutex_exit(HATLOCK_MUTEXP(hatlockp)); 9841 } 9842 9843 static void 9844 sfmmu_hat_lock_all(void) 9845 { 9846 int i; 9847 for (i = 0; i < SFMMU_NUM_LOCK; i++) 9848 mutex_enter(HATLOCK_MUTEXP(&hat_lock[i])); 9849 } 9850 9851 static void 9852 sfmmu_hat_unlock_all(void) 9853 { 9854 int i; 9855 for (i = SFMMU_NUM_LOCK - 1; i >= 0; i--) 9856 mutex_exit(HATLOCK_MUTEXP(&hat_lock[i])); 9857 } 9858 9859 int 9860 sfmmu_hat_lock_held(sfmmu_t *sfmmup) 9861 { 9862 ASSERT(sfmmup != ksfmmup); 9863 return (MUTEX_HELD(HATLOCK_MUTEXP(TSB_HASH(sfmmup)))); 9864 } 9865 9866 /* 9867 * Locking primitives to provide consistency between ISM unmap 9868 * and other operations. Since ISM unmap can take a long time, we 9869 * use HAT_ISMBUSY flag (protected by the hatlock) to avoid creating 9870 * contention on the hatlock buckets while ISM segments are being 9871 * unmapped. The tradeoff is that the flags don't prevent priority 9872 * inversion from occurring, so we must request kernel priority in 9873 * case we have to sleep to keep from getting buried while holding 9874 * the HAT_ISMBUSY flag set, which in turn could block other kernel 9875 * threads from running (for example, in sfmmu_uvatopfn()). 9876 */ 9877 static void 9878 sfmmu_ismhat_enter(sfmmu_t *sfmmup, int hatlock_held) 9879 { 9880 hatlock_t *hatlockp; 9881 9882 THREAD_KPRI_REQUEST(); 9883 if (!hatlock_held) 9884 hatlockp = sfmmu_hat_enter(sfmmup); 9885 while (SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)) 9886 cv_wait(&sfmmup->sfmmu_tsb_cv, HATLOCK_MUTEXP(hatlockp)); 9887 SFMMU_FLAGS_SET(sfmmup, HAT_ISMBUSY); 9888 if (!hatlock_held) 9889 sfmmu_hat_exit(hatlockp); 9890 } 9891 9892 static void 9893 sfmmu_ismhat_exit(sfmmu_t *sfmmup, int hatlock_held) 9894 { 9895 hatlock_t *hatlockp; 9896 9897 if (!hatlock_held) 9898 hatlockp = sfmmu_hat_enter(sfmmup); 9899 ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)); 9900 SFMMU_FLAGS_CLEAR(sfmmup, HAT_ISMBUSY); 9901 cv_broadcast(&sfmmup->sfmmu_tsb_cv); 9902 if (!hatlock_held) 9903 sfmmu_hat_exit(hatlockp); 9904 THREAD_KPRI_RELEASE(); 9905 } 9906 9907 /* 9908 * 9909 * Algorithm: 9910 * 9911 * (1) if segkmem is not ready, allocate hblk from an array of pre-alloc'ed 9912 * hblks. 
9913 * 9914 * (2) if we are allocating an hblk for mapping a slab in sfmmu_cache, 9915 * 9916 * (a) try to return an hblk from reserve pool of free hblks; 9917 * (b) if the reserve pool is empty, acquire hblk_reserve_lock 9918 * and return hblk_reserve. 9919 * 9920 * (3) call kmem_cache_alloc() to allocate hblk; 9921 * 9922 * (a) if hblk_reserve_lock is held by the current thread, 9923 * atomically replace hblk_reserve by the hblk that is 9924 * returned by kmem_cache_alloc; release hblk_reserve_lock 9925 * and call kmem_cache_alloc() again. 9926 * (b) if reserve pool is not full, add the hblk that is 9927 * returned by kmem_cache_alloc to reserve pool and 9928 * call kmem_cache_alloc again. 9929 * 9930 */ 9931 static struct hme_blk * 9932 sfmmu_hblk_alloc(sfmmu_t *sfmmup, caddr_t vaddr, 9933 struct hmehash_bucket *hmebp, uint_t size, hmeblk_tag hblktag, 9934 uint_t flags) 9935 { 9936 struct hme_blk *hmeblkp = NULL; 9937 struct hme_blk *newhblkp; 9938 struct hme_blk *shw_hblkp = NULL; 9939 struct kmem_cache *sfmmu_cache = NULL; 9940 uint64_t hblkpa; 9941 ulong_t index; 9942 uint_t owner; /* set to 1 if using hblk_reserve */ 9943 uint_t forcefree; 9944 int sleep; 9945 9946 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 9947 9948 /* 9949 * If segkmem is not created yet, allocate from static hmeblks 9950 * created at the end of startup_modules(). See the block comment 9951 * in startup_modules() describing how we estimate the number of 9952 * static hmeblks that will be needed during re-map. 9953 */ 9954 if (!hblk_alloc_dynamic) { 9955 9956 if (size == TTE8K) { 9957 index = nucleus_hblk8.index; 9958 if (index >= nucleus_hblk8.len) { 9959 /* 9960 * If we panic here, see startup_modules() to 9961 * make sure that we are calculating the 9962 * number of hblk8's that we need correctly. 9963 */ 9964 panic("no nucleus hblk8 to allocate"); 9965 } 9966 hmeblkp = 9967 (struct hme_blk *)&nucleus_hblk8.list[index]; 9968 nucleus_hblk8.index++; 9969 SFMMU_STAT(sf_hblk8_nalloc); 9970 } else { 9971 index = nucleus_hblk1.index; 9972 if (nucleus_hblk1.index >= nucleus_hblk1.len) { 9973 /* 9974 * If we panic here, see startup_modules() 9975 * and H8TOH1; most likely you need to 9976 * update the calculation of the number 9977 * of hblk1's the kernel needs to boot. 9978 */ 9979 panic("no nucleus hblk1 to allocate"); 9980 } 9981 hmeblkp = 9982 (struct hme_blk *)&nucleus_hblk1.list[index]; 9983 nucleus_hblk1.index++; 9984 SFMMU_STAT(sf_hblk1_nalloc); 9985 } 9986 9987 goto hblk_init; 9988 } 9989 9990 SFMMU_HASH_UNLOCK(hmebp); 9991 9992 if (sfmmup != KHATID) { 9993 if (mmu_page_sizes == max_mmu_page_sizes) { 9994 if (size < TTE256M) 9995 shw_hblkp = sfmmu_shadow_hcreate(sfmmup, vaddr, 9996 size, flags); 9997 } else { 9998 if (size < TTE4M) 9999 shw_hblkp = sfmmu_shadow_hcreate(sfmmup, vaddr, 10000 size, flags); 10001 } 10002 } 10003 10004 fill_hblk: 10005 owner = (hblk_reserve_thread == curthread) ? 1 : 0; 10006 10007 if (owner && size == TTE8K) { 10008 10009 /* 10010 * We are really in a tight spot. We already own 10011 * hblk_reserve and we need another hblk. In anticipation 10012 * of this kind of scenario, we specifically set aside 10013 * HBLK_RESERVE_MIN number of hblks to be used exclusively 10014 * by owner of hblk_reserve. 
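 *
 * (One way this can happen, per the kmem/hat call chain documented in
 * the comment further below: while this thread still owns hblk_reserve
 * for mapping a new sfmmu8_cache slab, a nested hat_memload() needs
 * yet another 8K hblk before hblk_reserve has been swapped out.)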
10015 */ 10016 SFMMU_STAT(sf_hblk_recurse_cnt); 10017 10018 if (!sfmmu_get_free_hblk(&hmeblkp, 1)) 10019 panic("sfmmu_hblk_alloc: reserve list is empty"); 10020 10021 goto hblk_verify; 10022 } 10023 10024 ASSERT(!owner); 10025 10026 if ((flags & HAT_NO_KALLOC) == 0) { 10027 10028 sfmmu_cache = ((size == TTE8K) ? sfmmu8_cache : sfmmu1_cache); 10029 sleep = ((sfmmup == KHATID) ? KM_NOSLEEP : KM_SLEEP); 10030 10031 if ((hmeblkp = kmem_cache_alloc(sfmmu_cache, sleep)) == NULL) { 10032 hmeblkp = sfmmu_hblk_steal(size); 10033 } else { 10034 /* 10035 * if we are the owner of hblk_reserve, 10036 * swap hblk_reserve with hmeblkp and 10037 * start a fresh life. Hope things go 10038 * better this time. 10039 */ 10040 if (hblk_reserve_thread == curthread) { 10041 ASSERT(sfmmu_cache == sfmmu8_cache); 10042 sfmmu_hblk_swap(hmeblkp); 10043 hblk_reserve_thread = NULL; 10044 mutex_exit(&hblk_reserve_lock); 10045 goto fill_hblk; 10046 } 10047 /* 10048 * let's donate this hblk to our reserve list if 10049 * we are not mapping kernel range 10050 */ 10051 if (size == TTE8K && sfmmup != KHATID) 10052 if (sfmmu_put_free_hblk(hmeblkp, 0)) 10053 goto fill_hblk; 10054 } 10055 } else { 10056 /* 10057 * We are here to map the slab in sfmmu8_cache; let's 10058 * check if we could tap our reserve list; if successful, 10059 * this will avoid the pain of going thru sfmmu_hblk_swap 10060 */ 10061 SFMMU_STAT(sf_hblk_slab_cnt); 10062 if (!sfmmu_get_free_hblk(&hmeblkp, 0)) { 10063 /* 10064 * let's start hblk_reserve dance 10065 */ 10066 SFMMU_STAT(sf_hblk_reserve_cnt); 10067 owner = 1; 10068 mutex_enter(&hblk_reserve_lock); 10069 hmeblkp = HBLK_RESERVE; 10070 hblk_reserve_thread = curthread; 10071 } 10072 } 10073 10074 hblk_verify: 10075 ASSERT(hmeblkp != NULL); 10076 set_hblk_sz(hmeblkp, size); 10077 ASSERT(hmeblkp->hblk_nextpa == va_to_pa((caddr_t)hmeblkp)); 10078 SFMMU_HASH_LOCK(hmebp); 10079 HME_HASH_FAST_SEARCH(hmebp, hblktag, newhblkp); 10080 if (newhblkp != NULL) { 10081 SFMMU_HASH_UNLOCK(hmebp); 10082 if (hmeblkp != HBLK_RESERVE) { 10083 /* 10084 * This is really tricky! 10085 * 10086 * vmem_alloc(vmem_seg_arena) 10087 * vmem_alloc(vmem_internal_arena) 10088 * segkmem_alloc(heap_arena) 10089 * vmem_alloc(heap_arena) 10090 * page_create() 10091 * hat_memload() 10092 * kmem_cache_free() 10093 * kmem_cache_alloc() 10094 * kmem_slab_create() 10095 * vmem_alloc(kmem_internal_arena) 10096 * segkmem_alloc(heap_arena) 10097 * vmem_alloc(heap_arena) 10098 * page_create() 10099 * hat_memload() 10100 * kmem_cache_free() 10101 * ... 10102 * 10103 * Thus, hat_memload() could call kmem_cache_free 10104 * for enough number of times that we could easily 10105 * hit the bottom of the stack or run out of reserve 10106 * list of vmem_seg structs. So, we must donate 10107 * this hblk to reserve list if it's allocated 10108 * from sfmmu8_cache *and* mapping kernel range. 10109 * We don't need to worry about freeing hmeblk1's 10110 * to kmem since they don't map any kmem slabs. 10111 * 10112 * Note: When segkmem supports largepages, we must 10113 * free hmeblk1's to reserve list as well. 10114 */ 10115 forcefree = (sfmmup == KHATID) ? 1 : 0; 10116 if (size == TTE8K && 10117 sfmmu_put_free_hblk(hmeblkp, forcefree)) { 10118 goto re_verify; 10119 } 10120 ASSERT(sfmmup != KHATID); 10121 kmem_cache_free(get_hblk_cache(hmeblkp), hmeblkp); 10122 } else { 10123 /* 10124 * Hey! we don't need hblk_reserve any more. 
10125 */ 10126 ASSERT(owner); 10127 hblk_reserve_thread = NULL; 10128 mutex_exit(&hblk_reserve_lock); 10129 owner = 0; 10130 } 10131 re_verify: 10132 /* 10133 * let's check if the goodies are still present 10134 */ 10135 SFMMU_HASH_LOCK(hmebp); 10136 HME_HASH_FAST_SEARCH(hmebp, hblktag, newhblkp); 10137 if (newhblkp != NULL) { 10138 /* 10139 * return newhblkp if it's not hblk_reserve; 10140 * if newhblkp is hblk_reserve, return it 10141 * _only if_ we are the owner of hblk_reserve. 10142 */ 10143 if (newhblkp != HBLK_RESERVE || owner) { 10144 return (newhblkp); 10145 } else { 10146 /* 10147 * we just hit hblk_reserve in the hash and 10148 * we are not the owner of that; 10149 * 10150 * block until hblk_reserve_thread completes 10151 * swapping hblk_reserve and try the dance 10152 * once again. 10153 */ 10154 SFMMU_HASH_UNLOCK(hmebp); 10155 mutex_enter(&hblk_reserve_lock); 10156 mutex_exit(&hblk_reserve_lock); 10157 SFMMU_STAT(sf_hblk_reserve_hit); 10158 goto fill_hblk; 10159 } 10160 } else { 10161 /* 10162 * it's no more! try the dance once again. 10163 */ 10164 SFMMU_HASH_UNLOCK(hmebp); 10165 goto fill_hblk; 10166 } 10167 } 10168 10169 hblk_init: 10170 set_hblk_sz(hmeblkp, size); 10171 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 10172 hmeblkp->hblk_next = (struct hme_blk *)NULL; 10173 hmeblkp->hblk_tag = hblktag; 10174 hmeblkp->hblk_shadow = shw_hblkp; 10175 hblkpa = hmeblkp->hblk_nextpa; 10176 hmeblkp->hblk_nextpa = 0; 10177 10178 ASSERT(get_hblk_ttesz(hmeblkp) == size); 10179 ASSERT(get_hblk_span(hmeblkp) == HMEBLK_SPAN(size)); 10180 ASSERT(hmeblkp->hblk_hmecnt == 0); 10181 ASSERT(hmeblkp->hblk_vcnt == 0); 10182 ASSERT(hmeblkp->hblk_lckcnt == 0); 10183 ASSERT(hblkpa == va_to_pa((caddr_t)hmeblkp)); 10184 sfmmu_hblk_hash_add(hmebp, hmeblkp, hblkpa); 10185 return (hmeblkp); 10186 } 10187 10188 /* 10189 * This function performs any cleanup required on the hme_blk 10190 * and returns it to the free list. 10191 */ 10192 /* ARGSUSED */ 10193 static void 10194 sfmmu_hblk_free(struct hmehash_bucket *hmebp, struct hme_blk *hmeblkp, 10195 uint64_t hblkpa, struct hme_blk **listp) 10196 { 10197 int shw_size, vshift; 10198 struct hme_blk *shw_hblkp; 10199 uint_t shw_mask, newshw_mask; 10200 uintptr_t vaddr; 10201 int size; 10202 uint_t critical; 10203 10204 ASSERT(hmeblkp); 10205 ASSERT(!hmeblkp->hblk_hmecnt); 10206 ASSERT(!hmeblkp->hblk_vcnt); 10207 ASSERT(!hmeblkp->hblk_lckcnt); 10208 ASSERT(hblkpa == va_to_pa((caddr_t)hmeblkp)); 10209 ASSERT(hmeblkp != (struct hme_blk *)hblk_reserve); 10210 10211 critical = (hblktosfmmu(hmeblkp) == KHATID) ? 
1 : 0; 10212 10213 size = get_hblk_ttesz(hmeblkp); 10214 shw_hblkp = hmeblkp->hblk_shadow; 10215 if (shw_hblkp) { 10216 ASSERT(hblktosfmmu(hmeblkp) != KHATID); 10217 if (mmu_page_sizes == max_mmu_page_sizes) { 10218 ASSERT(size < TTE256M); 10219 } else { 10220 ASSERT(size < TTE4M); 10221 } 10222 10223 shw_size = get_hblk_ttesz(shw_hblkp); 10224 vaddr = get_hblk_base(hmeblkp); 10225 vshift = vaddr_to_vshift(shw_hblkp->hblk_tag, vaddr, shw_size); 10226 ASSERT(vshift < 8); 10227 /* 10228 * Atomically clear shadow mask bit 10229 */ 10230 do { 10231 shw_mask = shw_hblkp->hblk_shw_mask; 10232 ASSERT(shw_mask & (1 << vshift)); 10233 newshw_mask = shw_mask & ~(1 << vshift); 10234 newshw_mask = cas32(&shw_hblkp->hblk_shw_mask, 10235 shw_mask, newshw_mask); 10236 } while (newshw_mask != shw_mask); 10237 hmeblkp->hblk_shadow = NULL; 10238 } 10239 hmeblkp->hblk_next = NULL; 10240 hmeblkp->hblk_nextpa = hblkpa; 10241 hmeblkp->hblk_shw_bit = 0; 10242 10243 if (hmeblkp->hblk_nuc_bit == 0) { 10244 10245 if (size == TTE8K && sfmmu_put_free_hblk(hmeblkp, critical)) 10246 return; 10247 10248 hmeblkp->hblk_next = *listp; 10249 *listp = hmeblkp; 10250 } 10251 } 10252 10253 static void 10254 sfmmu_hblks_list_purge(struct hme_blk **listp) 10255 { 10256 struct hme_blk *hmeblkp; 10257 10258 while ((hmeblkp = *listp) != NULL) { 10259 *listp = hmeblkp->hblk_next; 10260 kmem_cache_free(get_hblk_cache(hmeblkp), hmeblkp); 10261 } 10262 } 10263 10264 #define BUCKETS_TO_SEARCH_BEFORE_UNLOAD 30 10265 10266 static uint_t sfmmu_hblk_steal_twice; 10267 static uint_t sfmmu_hblk_steal_count, sfmmu_hblk_steal_unload_count; 10268 10269 /* 10270 * Steal a hmeblk 10271 * Enough hmeblks were allocated at startup (nucleus hmeblks) and also 10272 * hmeblks were added dynamically. We should never ever not be able to 10273 * find one. Look for an unused/unlocked hmeblk in user hash table. 10274 */ 10275 static struct hme_blk * 10276 sfmmu_hblk_steal(int size) 10277 { 10278 static struct hmehash_bucket *uhmehash_steal_hand = NULL; 10279 struct hmehash_bucket *hmebp; 10280 struct hme_blk *hmeblkp = NULL, *pr_hblk; 10281 uint64_t hblkpa, prevpa; 10282 int i; 10283 10284 for (;;) { 10285 hmebp = (uhmehash_steal_hand == NULL) ? uhme_hash : 10286 uhmehash_steal_hand; 10287 ASSERT(hmebp >= uhme_hash && hmebp <= &uhme_hash[UHMEHASH_SZ]); 10288 10289 for (i = 0; hmeblkp == NULL && i <= UHMEHASH_SZ + 10290 BUCKETS_TO_SEARCH_BEFORE_UNLOAD; i++) { 10291 SFMMU_HASH_LOCK(hmebp); 10292 hmeblkp = hmebp->hmeblkp; 10293 hblkpa = hmebp->hmeh_nextpa; 10294 prevpa = 0; 10295 pr_hblk = NULL; 10296 while (hmeblkp) { 10297 /* 10298 * check if it is a hmeblk that is not locked 10299 * and not shared. skip shadow hmeblks with 10300 * shadow_mask set i.e valid count non zero. 10301 */ 10302 if ((get_hblk_ttesz(hmeblkp) == size) && 10303 (hmeblkp->hblk_shw_bit == 0 || 10304 hmeblkp->hblk_vcnt == 0) && 10305 (hmeblkp->hblk_lckcnt == 0)) { 10306 /* 10307 * there is a high probability that we 10308 * will find a free one. search some 10309 * buckets for a free hmeblk initially 10310 * before unloading a valid hmeblk. 
10311 */ 10312 if ((hmeblkp->hblk_vcnt == 0 && 10313 hmeblkp->hblk_hmecnt == 0) || (i >= 10314 BUCKETS_TO_SEARCH_BEFORE_UNLOAD)) { 10315 if (sfmmu_steal_this_hblk(hmebp, 10316 hmeblkp, hblkpa, prevpa, 10317 pr_hblk)) { 10318 /* 10319 * Hblk is unloaded 10320 * successfully 10321 */ 10322 break; 10323 } 10324 } 10325 } 10326 pr_hblk = hmeblkp; 10327 prevpa = hblkpa; 10328 hblkpa = hmeblkp->hblk_nextpa; 10329 hmeblkp = hmeblkp->hblk_next; 10330 } 10331 10332 SFMMU_HASH_UNLOCK(hmebp); 10333 if (hmebp++ == &uhme_hash[UHMEHASH_SZ]) 10334 hmebp = uhme_hash; 10335 } 10336 uhmehash_steal_hand = hmebp; 10337 10338 if (hmeblkp != NULL) 10339 break; 10340 10341 /* 10342 * in the worst case, look for a free one in the kernel 10343 * hash table. 10344 */ 10345 for (i = 0, hmebp = khme_hash; i <= KHMEHASH_SZ; i++) { 10346 SFMMU_HASH_LOCK(hmebp); 10347 hmeblkp = hmebp->hmeblkp; 10348 hblkpa = hmebp->hmeh_nextpa; 10349 prevpa = 0; 10350 pr_hblk = NULL; 10351 while (hmeblkp) { 10352 /* 10353 * check if it is free hmeblk 10354 */ 10355 if ((get_hblk_ttesz(hmeblkp) == size) && 10356 (hmeblkp->hblk_lckcnt == 0) && 10357 (hmeblkp->hblk_vcnt == 0) && 10358 (hmeblkp->hblk_hmecnt == 0)) { 10359 if (sfmmu_steal_this_hblk(hmebp, 10360 hmeblkp, hblkpa, prevpa, pr_hblk)) { 10361 break; 10362 } else { 10363 /* 10364 * Cannot fail since we have 10365 * hash lock. 10366 */ 10367 panic("fail to steal?"); 10368 } 10369 } 10370 10371 pr_hblk = hmeblkp; 10372 prevpa = hblkpa; 10373 hblkpa = hmeblkp->hblk_nextpa; 10374 hmeblkp = hmeblkp->hblk_next; 10375 } 10376 10377 SFMMU_HASH_UNLOCK(hmebp); 10378 if (hmebp++ == &khme_hash[KHMEHASH_SZ]) 10379 hmebp = khme_hash; 10380 } 10381 10382 if (hmeblkp != NULL) 10383 break; 10384 sfmmu_hblk_steal_twice++; 10385 } 10386 return (hmeblkp); 10387 } 10388 10389 /* 10390 * This routine does real work to prepare a hblk to be "stolen" by 10391 * unloading the mappings, updating shadow counts .... 10392 * It returns 1 if the block is ready to be reused (stolen), or 0 10393 * means the block cannot be stolen yet- pageunload is still working 10394 * on this hblk. 10395 */ 10396 static int 10397 sfmmu_steal_this_hblk(struct hmehash_bucket *hmebp, struct hme_blk *hmeblkp, 10398 uint64_t hblkpa, uint64_t prevpa, struct hme_blk *pr_hblk) 10399 { 10400 int shw_size, vshift; 10401 struct hme_blk *shw_hblkp; 10402 uintptr_t vaddr; 10403 uint_t shw_mask, newshw_mask; 10404 10405 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 10406 10407 /* 10408 * check if the hmeblk is free, unload if necessary 10409 */ 10410 if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) { 10411 sfmmu_t *sfmmup; 10412 demap_range_t dmr; 10413 10414 sfmmup = hblktosfmmu(hmeblkp); 10415 DEMAP_RANGE_INIT(sfmmup, &dmr); 10416 (void) sfmmu_hblk_unload(sfmmup, hmeblkp, 10417 (caddr_t)get_hblk_base(hmeblkp), 10418 get_hblk_endaddr(hmeblkp), &dmr, HAT_UNLOAD); 10419 DEMAP_RANGE_FLUSH(&dmr); 10420 if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) { 10421 /* 10422 * Pageunload is working on the same hblk. 
10423 */ 10424 return (0); 10425 } 10426 10427 sfmmu_hblk_steal_unload_count++; 10428 } 10429 10430 ASSERT(hmeblkp->hblk_lckcnt == 0); 10431 ASSERT(hmeblkp->hblk_vcnt == 0 && hmeblkp->hblk_hmecnt == 0); 10432 10433 sfmmu_hblk_hash_rm(hmebp, hmeblkp, prevpa, pr_hblk); 10434 hmeblkp->hblk_nextpa = hblkpa; 10435 10436 shw_hblkp = hmeblkp->hblk_shadow; 10437 if (shw_hblkp) { 10438 shw_size = get_hblk_ttesz(shw_hblkp); 10439 vaddr = get_hblk_base(hmeblkp); 10440 vshift = vaddr_to_vshift(shw_hblkp->hblk_tag, vaddr, shw_size); 10441 ASSERT(vshift < 8); 10442 /* 10443 * Atomically clear shadow mask bit 10444 */ 10445 do { 10446 shw_mask = shw_hblkp->hblk_shw_mask; 10447 ASSERT(shw_mask & (1 << vshift)); 10448 newshw_mask = shw_mask & ~(1 << vshift); 10449 newshw_mask = cas32(&shw_hblkp->hblk_shw_mask, 10450 shw_mask, newshw_mask); 10451 } while (newshw_mask != shw_mask); 10452 hmeblkp->hblk_shadow = NULL; 10453 } 10454 10455 /* 10456 * remove shadow bit if we are stealing an unused shadow hmeblk. 10457 * sfmmu_hblk_alloc needs it that way, will set shadow bit later if 10458 * we are indeed allocating a shadow hmeblk. 10459 */ 10460 hmeblkp->hblk_shw_bit = 0; 10461 10462 sfmmu_hblk_steal_count++; 10463 SFMMU_STAT(sf_steal_count); 10464 10465 return (1); 10466 } 10467 10468 struct hme_blk * 10469 sfmmu_hmetohblk(struct sf_hment *sfhme) 10470 { 10471 struct hme_blk *hmeblkp; 10472 struct sf_hment *sfhme0; 10473 struct hme_blk *hblk_dummy = 0; 10474 10475 /* 10476 * No dummy sf_hments, please. 10477 */ 10478 ASSERT(sfhme->hme_tte.ll != 0); 10479 10480 sfhme0 = sfhme - sfhme->hme_tte.tte_hmenum; 10481 hmeblkp = (struct hme_blk *)((uintptr_t)sfhme0 - 10482 (uintptr_t)&hblk_dummy->hblk_hme[0]); 10483 10484 return (hmeblkp); 10485 } 10486 10487 /* 10488 * Make sure that there is a valid ctx; if not, get a ctx. 10489 * Also, get a readers lock on the ctx, so that the ctx cannot 10490 * be stolen underneath us. 10491 */ 10492 static void 10493 sfmmu_disallow_ctx_steal(sfmmu_t *sfmmup) 10494 { 10495 struct ctx *ctx; 10496 10497 ASSERT(sfmmup != ksfmmup); 10498 ASSERT(sfmmup->sfmmu_ismhat == 0); 10499 10500 /* 10501 * If ctx has been stolen, get a ctx. 10502 */ 10503 if (sfmmup->sfmmu_cnum == INVALID_CONTEXT) { 10504 /* 10505 * Our ctx was stolen. Get a ctx with rlock. 10506 */ 10507 ctx = sfmmu_get_ctx(sfmmup); 10508 return; 10509 } else { 10510 ctx = sfmmutoctx(sfmmup); 10511 } 10512 10513 /* 10514 * Get the reader lock. 10515 */ 10516 rw_enter(&ctx->ctx_rwlock, RW_READER); 10517 if (ctx->ctx_sfmmu != sfmmup) { 10518 /* 10519 * The ctx got stolen, so spin again. 10520 */ 10521 rw_exit(&ctx->ctx_rwlock); 10522 ctx = sfmmu_get_ctx(sfmmup); 10523 } 10524 10525 ASSERT(sfmmup->sfmmu_cnum >= NUM_LOCKED_CTXS); 10526 } 10527 10528 /* 10529 * Decrement reference count for our ctx. If the reference count 10530 * becomes 0, our ctx can be stolen by someone. 10531 */ 10532 static void 10533 sfmmu_allow_ctx_steal(sfmmu_t *sfmmup) 10534 { 10535 struct ctx *ctx; 10536 10537 ASSERT(sfmmup != ksfmmup); 10538 ASSERT(sfmmup->sfmmu_ismhat == 0); 10539 ctx = sfmmutoctx(sfmmup); 10540 10541 ASSERT(sfmmup == ctx->ctx_sfmmu); 10542 ASSERT(sfmmup->sfmmu_cnum != INVALID_CONTEXT); 10543 rw_exit(&ctx->ctx_rwlock); 10544 } 10545 10546 /* 10547 * On swapin, get appropriately sized TSB(s) and clear the HAT_SWAPPED flag. 10548 * If we can't get appropriately sized TSB(s), try for 8K TSB(s) using 10549 * KM_SLEEP allocation. 10550 * 10551 * This routine does not return a value; the caller must hold the HAT lock.
10552 */ 10553 static void 10554 sfmmu_tsb_swapin(sfmmu_t *sfmmup, hatlock_t *hatlockp) 10555 { 10556 struct tsb_info *tsbinfop, *next; 10557 tsb_replace_rc_t rc; 10558 boolean_t gotfirst = B_FALSE; 10559 10560 ASSERT(sfmmup != ksfmmup); 10561 ASSERT(sfmmu_hat_lock_held(sfmmup)); 10562 10563 while (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPIN)) { 10564 cv_wait(&sfmmup->sfmmu_tsb_cv, HATLOCK_MUTEXP(hatlockp)); 10565 } 10566 10567 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) { 10568 SFMMU_FLAGS_SET(sfmmup, HAT_SWAPIN); 10569 } else { 10570 return; 10571 } 10572 10573 ASSERT(sfmmup->sfmmu_tsb != NULL); 10574 10575 /* 10576 * Loop over all tsbinfo's replacing them with ones that actually have 10577 * a TSB. If any of the replacements ever fail, bail out of the loop. 10578 */ 10579 for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL; tsbinfop = next) { 10580 ASSERT(tsbinfop->tsb_flags & TSB_SWAPPED); 10581 next = tsbinfop->tsb_next; 10582 rc = sfmmu_replace_tsb(sfmmup, tsbinfop, tsbinfop->tsb_szc, 10583 hatlockp, TSB_SWAPIN); 10584 if (rc != TSB_SUCCESS) { 10585 break; 10586 } 10587 gotfirst = B_TRUE; 10588 } 10589 10590 switch (rc) { 10591 case TSB_SUCCESS: 10592 SFMMU_FLAGS_CLEAR(sfmmup, HAT_SWAPPED|HAT_SWAPIN); 10593 cv_broadcast(&sfmmup->sfmmu_tsb_cv); 10594 return; 10595 case TSB_ALLOCFAIL: 10596 break; 10597 default: 10598 panic("sfmmu_replace_tsb returned unrecognized failure code " 10599 "%d", rc); 10600 } 10601 10602 /* 10603 * In this case, we failed to get one of our TSBs. If we failed to 10604 * get the first TSB, get one of minimum size (8KB). Walk the list 10605 * and throw away the tsbinfos, starting where the allocation failed; 10606 * we can get by with just one TSB as long as we don't leave the 10607 * SWAPPED tsbinfo structures lying around. 10608 */ 10609 tsbinfop = sfmmup->sfmmu_tsb; 10610 next = tsbinfop->tsb_next; 10611 tsbinfop->tsb_next = NULL; 10612 10613 sfmmu_hat_exit(hatlockp); 10614 for (tsbinfop = next; tsbinfop != NULL; tsbinfop = next) { 10615 next = tsbinfop->tsb_next; 10616 sfmmu_tsbinfo_free(tsbinfop); 10617 } 10618 hatlockp = sfmmu_hat_enter(sfmmup); 10619 10620 /* 10621 * If we don't have any TSBs, get a single 8K TSB for 8K, 64K and 512K 10622 * pages. 10623 */ 10624 if (!gotfirst) { 10625 tsbinfop = sfmmup->sfmmu_tsb; 10626 rc = sfmmu_replace_tsb(sfmmup, tsbinfop, TSB_MIN_SZCODE, 10627 hatlockp, TSB_SWAPIN | TSB_FORCEALLOC); 10628 ASSERT(rc == TSB_SUCCESS); 10629 } 10630 10631 SFMMU_FLAGS_CLEAR(sfmmup, HAT_SWAPPED|HAT_SWAPIN); 10632 cv_broadcast(&sfmmup->sfmmu_tsb_cv); 10633 } 10634 10635 /* 10636 * Handle exceptions for low level tsb_handler. 10637 * 10638 * There are many scenarios that could land us here: 10639 * 10640 * 1) Process has no context. In this case, ctx is 10641 * INVALID_CONTEXT and sfmmup->sfmmu_cnum == 1 so 10642 * we will acquire a context before returning. 10643 * 2) Need to re-load our MMU state. In this case, 10644 * ctx is INVALID_CONTEXT and sfmmup->sfmmu_cnum != 1. 10645 * 3) ISM mappings are being updated. This is handled 10646 * just like case #2. 10647 * 4) We wish to program a new page size into the TLB. 10648 * This is handled just like case #1, since changing 10649 * TLB page size requires us to flush the TLB. 10650 * 5) Window fault and no valid translation found. 10651 * 10652 * Cases 1-4, ctx is INVALID_CONTEXT so we handle it and then 10653 * exit which will retry the trapped instruction. Case #5 we 10654 * punt to trap() which will raise us a trap level and handle 10655 * the fault before unwinding. 
10656 * 10657 * Note that the process will run in INVALID_CONTEXT before 10658 * faulting into here and subsequently loading the MMU registers 10659 * (including the TSB base register) associated with this process. 10660 * For this reason, the trap handlers must all test for 10661 * INVALID_CONTEXT before attempting to access any registers other 10662 * than the context registers. 10663 */ 10664 void 10665 sfmmu_tsbmiss_exception(struct regs *rp, uintptr_t tagaccess, uint_t traptype) 10666 { 10667 sfmmu_t *sfmmup; 10668 uint_t ctxnum; 10669 klwp_id_t lwp; 10670 char lwp_save_state; 10671 hatlock_t *hatlockp; 10672 struct tsb_info *tsbinfop; 10673 10674 SFMMU_STAT(sf_tsb_exceptions); 10675 sfmmup = astosfmmu(curthread->t_procp->p_as); 10676 ctxnum = tagaccess & TAGACC_CTX_MASK; 10677 10678 ASSERT(sfmmup != ksfmmup && ctxnum != KCONTEXT); 10679 ASSERT(sfmmup->sfmmu_ismhat == 0); 10680 /* 10681 * First, make sure we come out of here with a valid ctx, 10682 * since if we don't get one we'll simply loop on the 10683 * faulting instruction. 10684 * 10685 * If the ISM mappings are changing, the TSB is being relocated, or 10686 * the process is swapped out we serialize behind the controlling 10687 * thread with the sfmmu_flags and sfmmu_tsb_cv condition variable. 10688 * Otherwise we synchronize with the context stealer or the thread 10689 * that required us to change out our MMU registers (such 10690 * as a thread changing out our TSB while we were running) by 10691 * locking the HAT and grabbing the rwlock on the context as a 10692 * reader temporarily. 10693 */ 10694 if (ctxnum == INVALID_CONTEXT || 10695 SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) { 10696 /* 10697 * Must set lwp state to LWP_SYS before 10698 * trying to acquire any adaptive lock 10699 */ 10700 lwp = ttolwp(curthread); 10701 ASSERT(lwp); 10702 lwp_save_state = lwp->lwp_state; 10703 lwp->lwp_state = LWP_SYS; 10704 10705 hatlockp = sfmmu_hat_enter(sfmmup); 10706 retry: 10707 for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL; 10708 tsbinfop = tsbinfop->tsb_next) { 10709 if (tsbinfop->tsb_flags & TSB_RELOC_FLAG) { 10710 cv_wait(&sfmmup->sfmmu_tsb_cv, 10711 HATLOCK_MUTEXP(hatlockp)); 10712 goto retry; 10713 } 10714 } 10715 10716 /* 10717 * Wait for ISM maps to be updated. 10718 */ 10719 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)) { 10720 cv_wait(&sfmmup->sfmmu_tsb_cv, 10721 HATLOCK_MUTEXP(hatlockp)); 10722 goto retry; 10723 } 10724 10725 /* 10726 * If we're swapping in, get TSB(s). Note that we must do 10727 * this before we get a ctx or load the MMU state. Once 10728 * we swap in we have to recheck to make sure the TSB(s) and 10729 * ISM mappings didn't change while we slept. 10730 */ 10731 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) { 10732 sfmmu_tsb_swapin(sfmmup, hatlockp); 10733 goto retry; 10734 } 10735 10736 sfmmu_disallow_ctx_steal(sfmmup); 10737 ctxnum = sfmmup->sfmmu_cnum; 10738 kpreempt_disable(); 10739 sfmmu_setctx_sec(ctxnum); 10740 sfmmu_load_mmustate(sfmmup); 10741 kpreempt_enable(); 10742 sfmmu_allow_ctx_steal(sfmmup); 10743 sfmmu_hat_exit(hatlockp); 10744 /* 10745 * Must restore lwp_state if not calling 10746 * trap() for further processing. Restore 10747 * it anyway. 
10748 */ 10749 lwp->lwp_state = lwp_save_state; 10750 if (sfmmup->sfmmu_ttecnt[TTE8K] != 0 || 10751 sfmmup->sfmmu_ttecnt[TTE64K] != 0 || 10752 sfmmup->sfmmu_ttecnt[TTE512K] != 0 || 10753 sfmmup->sfmmu_ttecnt[TTE4M] != 0 || 10754 sfmmup->sfmmu_ttecnt[TTE32M] != 0 || 10755 sfmmup->sfmmu_ttecnt[TTE256M] != 0) { 10756 return; 10757 } 10758 if (traptype == T_DATA_PROT) { 10759 traptype = T_DATA_MMU_MISS; 10760 } 10761 } 10762 trap(rp, (caddr_t)tagaccess, traptype, 0); 10763 } 10764 10765 /* 10766 * sfmmu_vatopfn_suspended is called from GET_TTE when TL=0 and 10767 * the TTE_SUSPENDED bit is set in the tte. We block on acquiring a page lock 10768 * rather than spinning to avoid send mondo timeouts with 10769 * interrupts enabled. When the lock is acquired it is immediately 10770 * released and we return back to sfmmu_vatopfn just after 10771 * the GET_TTE call. 10772 */ 10773 void 10774 sfmmu_vatopfn_suspended(caddr_t vaddr, sfmmu_t *sfmmu, tte_t *ttep) 10775 { 10776 struct page **pp; 10777 10778 (void) as_pagelock(sfmmu->sfmmu_as, &pp, vaddr, TTE_CSZ(ttep), S_WRITE); 10779 as_pageunlock(sfmmu->sfmmu_as, pp, vaddr, TTE_CSZ(ttep), S_WRITE); 10780 } 10781 10782 /* 10783 * sfmmu_tsbmiss_suspended is called from GET_TTE when TL>0 and 10784 * the TTE_SUSPENDED bit is set in the tte. We do this so that we can handle 10785 * cross traps which cannot be handled while spinning in the 10786 * trap handlers. Simply enter and exit the kpr_suspendlock spin 10787 * mutex, which is held by the holder of the suspend bit, and then 10788 * retry the trapped instruction after unwinding. 10789 */ 10790 /*ARGSUSED*/ 10791 void 10792 sfmmu_tsbmiss_suspended(struct regs *rp, uintptr_t tagacc, uint_t traptype) 10793 { 10794 ASSERT(curthread != kreloc_thread); 10795 mutex_enter(&kpr_suspendlock); 10796 mutex_exit(&kpr_suspendlock); 10797 } 10798 10799 /* 10800 * Special routine to flush out ism mappings: TSBs, TLBs and D-caches. 10801 * This routine may be called with all cpu's captured. Therefore, the 10802 * caller is responsible for holding all locks and disabling kernel 10803 * preemption. 10804 */ 10805 /* ARGSUSED */ 10806 static void 10807 sfmmu_ismtlbcache_demap(caddr_t addr, sfmmu_t *ism_sfmmup, 10808 struct hme_blk *hmeblkp, pfn_t pfnum, int cache_flush_flag) 10809 { 10810 cpuset_t cpuset; 10811 caddr_t va; 10812 ism_ment_t *ment; 10813 sfmmu_t *sfmmup; 10814 int ctxnum; 10815 int vcolor; 10816 int ttesz; 10817 10818 /* 10819 * Walk the ism_hat's mapping list and flush the page 10820 * from every hat sharing this ism_hat. This routine 10821 * may be called while all cpu's have been captured. 10822 * Therefore we can't attempt to grab any locks. For now 10823 * this means we will protect the ism mapping list under 10824 * a single lock which will be grabbed by the caller. 10825 * If hat_share/unshare scalability becomes a performance 10826 * problem then we may need to re-think ism mapping list locking. 10827 */ 10828 ASSERT(ism_sfmmup->sfmmu_ismhat); 10829 ASSERT(MUTEX_HELD(&ism_mlist_lock)); 10830 addr = addr - ISMID_STARTADDR; 10831 for (ment = ism_sfmmup->sfmmu_iment; ment; ment = ment->iment_next) { 10832 10833 sfmmup = ment->iment_hat; 10834 ctxnum = sfmmup->sfmmu_cnum; 10835 va = ment->iment_base_va; 10836 va = (caddr_t)((uintptr_t)va + (uintptr_t)addr); 10837 10838 /* 10839 * Flush TSB of ISM mappings.
10840 */ 10841 ttesz = get_hblk_ttesz(hmeblkp); 10842 if (ttesz == TTE8K || ttesz == TTE4M) { 10843 sfmmu_unload_tsb(sfmmup, va, ttesz); 10844 } else { 10845 caddr_t sva = va; 10846 caddr_t eva; 10847 ASSERT(addr == (caddr_t)get_hblk_base(hmeblkp)); 10848 eva = sva + get_hblk_span(hmeblkp); 10849 sfmmu_unload_tsb_range(sfmmup, sva, eva, ttesz); 10850 } 10851 10852 if (ctxnum != INVALID_CONTEXT) { 10853 /* 10854 * Flush TLBs. We don't need to do this for 10855 * invalid context since the flushing is already 10856 * done as part of context stealing. 10857 */ 10858 cpuset = sfmmup->sfmmu_cpusran; 10859 CPUSET_AND(cpuset, cpu_ready_set); 10860 CPUSET_DEL(cpuset, CPU->cpu_id); 10861 SFMMU_XCALL_STATS(ctxnum); 10862 xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)va, 10863 ctxnum); 10864 vtag_flushpage(va, ctxnum); 10865 } 10866 10867 /* 10868 * Flush D$ 10869 * When flushing D$ we must flush all 10870 * cpu's. See sfmmu_cache_flush(). 10871 */ 10872 if (cache_flush_flag == CACHE_FLUSH) { 10873 cpuset = cpu_ready_set; 10874 CPUSET_DEL(cpuset, CPU->cpu_id); 10875 SFMMU_XCALL_STATS(ctxnum); 10876 vcolor = addr_to_vcolor(va); 10877 xt_some(cpuset, vac_flushpage_tl1, pfnum, vcolor); 10878 vac_flushpage(pfnum, vcolor); 10879 } 10880 } 10881 } 10882 10883 /* 10884 * Demaps the TSB, CPU caches, and flushes all TLBs on all CPUs of 10885 * a particular virtual address and ctx. If noflush is set we do not 10886 * flush the TLB/TSB. This function may or may not be called with the 10887 * HAT lock held. 10888 */ 10889 static void 10890 sfmmu_tlbcache_demap(caddr_t addr, sfmmu_t *sfmmup, struct hme_blk *hmeblkp, 10891 pfn_t pfnum, int tlb_noflush, int cpu_flag, int cache_flush_flag, 10892 int hat_lock_held) 10893 { 10894 int ctxnum, vcolor; 10895 cpuset_t cpuset; 10896 hatlock_t *hatlockp; 10897 10898 /* 10899 * There is no longer a need to protect against ctx being 10900 * stolen here since we don't store the ctx in the TSB anymore. 10901 */ 10902 vcolor = addr_to_vcolor(addr); 10903 10904 kpreempt_disable(); 10905 if (!tlb_noflush) { 10906 /* 10907 * Flush the TSB. 10908 */ 10909 if (!hat_lock_held) 10910 hatlockp = sfmmu_hat_enter(sfmmup); 10911 SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp); 10912 ctxnum = (int)sfmmutoctxnum(sfmmup); 10913 if (!hat_lock_held) 10914 sfmmu_hat_exit(hatlockp); 10915 10916 if (ctxnum != INVALID_CONTEXT) { 10917 /* 10918 * Flush TLBs. We don't need to do this if our 10919 * context is invalid context. Since we hold the 10920 * HAT lock the context must have been stolen and 10921 * hence will be flushed before re-use. 10922 */ 10923 cpuset = sfmmup->sfmmu_cpusran; 10924 CPUSET_AND(cpuset, cpu_ready_set); 10925 CPUSET_DEL(cpuset, CPU->cpu_id); 10926 SFMMU_XCALL_STATS(ctxnum); 10927 xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)addr, 10928 ctxnum); 10929 vtag_flushpage(addr, ctxnum); 10930 } 10931 } 10932 10933 /* 10934 * Flush the D$ 10935 * 10936 * Even if the ctx is stolen, we need to flush the 10937 * cache. Our ctx stealer only flushes the TLBs. 
10938 */ 10939 if (cache_flush_flag == CACHE_FLUSH) { 10940 if (cpu_flag & FLUSH_ALL_CPUS) { 10941 cpuset = cpu_ready_set; 10942 } else { 10943 cpuset = sfmmup->sfmmu_cpusran; 10944 CPUSET_AND(cpuset, cpu_ready_set); 10945 } 10946 CPUSET_DEL(cpuset, CPU->cpu_id); 10947 SFMMU_XCALL_STATS(sfmmutoctxnum(sfmmup)); 10948 xt_some(cpuset, vac_flushpage_tl1, pfnum, vcolor); 10949 vac_flushpage(pfnum, vcolor); 10950 } 10951 kpreempt_enable(); 10952 } 10953 10954 /* 10955 * Demaps the TSB and flushes all TLBs on all cpus for a particular virtual 10956 * address and ctx. If noflush is set we do not currently do anything. 10957 * This function may or may not be called with the HAT lock held. 10958 */ 10959 static void 10960 sfmmu_tlb_demap(caddr_t addr, sfmmu_t *sfmmup, struct hme_blk *hmeblkp, 10961 int tlb_noflush, int hat_lock_held) 10962 { 10963 int ctxnum; 10964 cpuset_t cpuset; 10965 hatlock_t *hatlockp; 10966 10967 /* 10968 * If the process is exiting we have nothing to do. 10969 */ 10970 if (tlb_noflush) 10971 return; 10972 10973 /* 10974 * Flush TSB. 10975 */ 10976 if (!hat_lock_held) 10977 hatlockp = sfmmu_hat_enter(sfmmup); 10978 SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp); 10979 ctxnum = sfmmutoctxnum(sfmmup); 10980 if (!hat_lock_held) 10981 sfmmu_hat_exit(hatlockp); 10982 10983 /* 10984 * Flush TLBs. We don't need to do this if our context is invalid 10985 * context. Since we hold the HAT lock the context must have been 10986 * stolen and hence will be flushed before re-use. 10987 */ 10988 if (ctxnum != INVALID_CONTEXT) { 10989 /* 10990 * There is no need to protect against ctx being stolen. 10991 * If the ctx is stolen we will simply get an extra flush. 10992 */ 10993 kpreempt_disable(); 10994 cpuset = sfmmup->sfmmu_cpusran; 10995 CPUSET_AND(cpuset, cpu_ready_set); 10996 CPUSET_DEL(cpuset, CPU->cpu_id); 10997 SFMMU_XCALL_STATS(ctxnum); 10998 xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)addr, ctxnum); 10999 vtag_flushpage(addr, ctxnum); 11000 kpreempt_enable(); 11001 } 11002 } 11003 11004 /* 11005 * Special case of sfmmu_tlb_demap for MMU_PAGESIZE hblks. Use the xcall 11006 * call handler that can flush a range of pages to save on xcalls. 11007 */ 11008 static int sfmmu_xcall_save; 11009 11010 static void 11011 sfmmu_tlb_range_demap(demap_range_t *dmrp) 11012 { 11013 sfmmu_t *sfmmup = dmrp->dmr_sfmmup; 11014 int ctxnum; 11015 hatlock_t *hatlockp; 11016 cpuset_t cpuset; 11017 uint64_t ctx_pgcnt; 11018 pgcnt_t pgcnt = 0; 11019 int pgunload = 0; 11020 int dirtypg = 0; 11021 caddr_t addr = dmrp->dmr_addr; 11022 caddr_t eaddr; 11023 uint64_t bitvec = dmrp->dmr_bitvec; 11024 11025 ASSERT(bitvec & 1); 11026 11027 /* 11028 * Flush TSB and calculate number of pages to flush. 11029 */ 11030 while (bitvec != 0) { 11031 dirtypg = 0; 11032 /* 11033 * Find the first page to flush and then count how many 11034 * pages there are after it that also need to be flushed. 11035 * This way the number of TSB flushes is minimized. 11036 */ 11037 while ((bitvec & 1) == 0) { 11038 pgcnt++; 11039 addr += MMU_PAGESIZE; 11040 bitvec >>= 1; 11041 } 11042 while (bitvec & 1) { 11043 dirtypg++; 11044 bitvec >>= 1; 11045 } 11046 eaddr = addr + ptob(dirtypg); 11047 hatlockp = sfmmu_hat_enter(sfmmup); 11048 sfmmu_unload_tsb_range(sfmmup, addr, eaddr, TTE8K); 11049 sfmmu_hat_exit(hatlockp); 11050 pgunload += dirtypg; 11051 addr = eaddr; 11052 pgcnt += dirtypg; 11053 } 11054 11055 /* 11056 * In the case where context is invalid context, bail. 
11057 * We hold the hat lock while checking the ctx to prevent 11058 * a race with sfmmu_replace_tsb() which temporarily sets 11059 * the ctx to INVALID_CONTEXT to force processes to enter 11060 * sfmmu_tsbmiss_exception(). 11061 */ 11062 hatlockp = sfmmu_hat_enter(sfmmup); 11063 ctxnum = sfmmutoctxnum(sfmmup); 11064 sfmmu_hat_exit(hatlockp); 11065 if (ctxnum == INVALID_CONTEXT) { 11066 dmrp->dmr_bitvec = 0; 11067 return; 11068 } 11069 11070 ASSERT((pgcnt<<MMU_PAGESHIFT) <= dmrp->dmr_endaddr - dmrp->dmr_addr); 11071 if (sfmmup->sfmmu_free == 0) { 11072 addr = dmrp->dmr_addr; 11073 bitvec = dmrp->dmr_bitvec; 11074 ctx_pgcnt = (uint64_t)((ctxnum << 16) | pgcnt); 11075 kpreempt_disable(); 11076 cpuset = sfmmup->sfmmu_cpusran; 11077 CPUSET_AND(cpuset, cpu_ready_set); 11078 CPUSET_DEL(cpuset, CPU->cpu_id); 11079 SFMMU_XCALL_STATS(ctxnum); 11080 xt_some(cpuset, vtag_flush_pgcnt_tl1, (uint64_t)addr, 11081 ctx_pgcnt); 11082 for (; bitvec != 0; bitvec >>= 1) { 11083 if (bitvec & 1) 11084 vtag_flushpage(addr, ctxnum); 11085 addr += MMU_PAGESIZE; 11086 } 11087 kpreempt_enable(); 11088 sfmmu_xcall_save += (pgunload-1); 11089 } 11090 dmrp->dmr_bitvec = 0; 11091 } 11092 11093 /* 11094 * Flushes only TLB. 11095 */ 11096 static void 11097 sfmmu_tlb_ctx_demap(sfmmu_t *sfmmup) 11098 { 11099 int ctxnum; 11100 cpuset_t cpuset; 11101 11102 ctxnum = (int)sfmmutoctxnum(sfmmup); 11103 if (ctxnum == INVALID_CONTEXT) { 11104 /* 11105 * if ctx was stolen then simply return 11106 * whoever stole ctx is responsible for flush. 11107 */ 11108 return; 11109 } 11110 ASSERT(ctxnum != KCONTEXT); 11111 /* 11112 * There is no need to protect against ctx being stolen. If the 11113 * ctx is stolen we will simply get an extra flush. 11114 */ 11115 kpreempt_disable(); 11116 11117 cpuset = sfmmup->sfmmu_cpusran; 11118 CPUSET_DEL(cpuset, CPU->cpu_id); 11119 CPUSET_AND(cpuset, cpu_ready_set); 11120 SFMMU_XCALL_STATS(ctxnum); 11121 11122 /* 11123 * Flush TLB. 11124 * RFE: it might be worth delaying the TLB flush as well. In that 11125 * case each cpu would have to traverse the dirty list and flush 11126 * each one of those ctx from the TLB. 11127 */ 11128 vtag_flushctx(ctxnum); 11129 xt_some(cpuset, vtag_flushctx_tl1, ctxnum, 0); 11130 11131 kpreempt_enable(); 11132 SFMMU_STAT(sf_tlbflush_ctx); 11133 } 11134 11135 /* 11136 * Flushes all TLBs. 11137 */ 11138 static void 11139 sfmmu_tlb_all_demap(void) 11140 { 11141 cpuset_t cpuset; 11142 11143 /* 11144 * There is no need to protect against ctx being stolen. If the 11145 * ctx is stolen we will simply get an extra flush. 11146 */ 11147 kpreempt_disable(); 11148 11149 cpuset = cpu_ready_set; 11150 CPUSET_DEL(cpuset, CPU->cpu_id); 11151 /* LINTED: constant in conditional context */ 11152 SFMMU_XCALL_STATS(INVALID_CONTEXT); 11153 11154 vtag_flushall(); 11155 xt_some(cpuset, vtag_flushall_tl1, 0, 0); 11156 xt_sync(cpuset); 11157 11158 kpreempt_enable(); 11159 SFMMU_STAT(sf_tlbflush_all); 11160 } 11161 11162 /* 11163 * In cases where we need to synchronize with TLB/TSB miss trap 11164 * handlers, _and_ need to flush the TLB, it's a lot easier to 11165 * steal the context from the process and free it than to do a 11166 * special song and dance to keep things consistent for the 11167 * handlers. 11168 * 11169 * Since the process suddenly ends up without a context and our caller 11170 * holds the hat lock, threads that fault after this function is called 11171 * will pile up on the lock. We can then do whatever we need to 11172 * atomically from the context of the caller. 
The first blocked thread 11173 * to resume executing will get the process a new context, and the 11174 * process will resume executing. 11175 * 11176 * One added advantage of this approach is that on MMUs that 11177 * support a "flush all" operation, we will delay the flush until 11178 * we run out of contexts, and then flush the TLB one time. This 11179 * is rather rare, so it's a lot less expensive than making 8000 11180 * x-calls to flush the TLB 8000 times. Another is that we can do 11181 * all of this without pausing CPUs, due to some knowledge of how 11182 * resume() loads processes onto the processor; it sets the thread 11183 * into cpusran, and _then_ looks at cnum. Because we do things in 11184 * the reverse order here, we guarantee exactly one of the following 11185 * statements is always true: 11186 * 11187 * 1) Nobody is in resume() so we have nothing to worry about anyway. 11188 * 2) The thread in resume() isn't in cpusran when we do the xcall, 11189 * so we know when it does set itself it'll see cnum is 11190 * INVALID_CONTEXT. 11191 * 3) The thread in resume() is in cpusran, and already might have 11192 * looked at the old cnum. That's OK, because we'll xcall it 11193 * and, if necessary, flush the TLB along with the rest of the 11194 * crowd. 11195 */ 11196 static void 11197 sfmmu_tlb_swap_ctx(sfmmu_t *sfmmup, struct ctx *ctx) 11198 { 11199 cpuset_t cpuset; 11200 int cnum; 11201 11202 if (sfmmup->sfmmu_cnum == INVALID_CONTEXT) 11203 return; 11204 11205 SFMMU_STAT(sf_ctx_swap); 11206 11207 kpreempt_disable(); 11208 11209 ASSERT(rw_read_locked(&ctx->ctx_rwlock) == 0); 11210 ASSERT(ctx->ctx_sfmmu == sfmmup); 11211 11212 cnum = ctxtoctxnum(ctx); 11213 ASSERT(sfmmup->sfmmu_cnum == cnum); 11214 ASSERT(cnum >= NUM_LOCKED_CTXS); 11215 11216 sfmmup->sfmmu_cnum = INVALID_CONTEXT; 11217 membar_enter(); /* make sure visible on all CPUs */ 11218 ctx->ctx_sfmmu = NULL; 11219 11220 cpuset = sfmmup->sfmmu_cpusran; 11221 CPUSET_DEL(cpuset, CPU->cpu_id); 11222 CPUSET_AND(cpuset, cpu_ready_set); 11223 SFMMU_XCALL_STATS(cnum); 11224 11225 /* 11226 * Force anybody running this process on CPU 11227 * to enter sfmmu_tsbmiss_exception() on the 11228 * next TLB miss, synchronize behind us on 11229 * the HAT lock, and grab a new context. At 11230 * that point the new page size will become 11231 * active in the TLB for the new context. 11232 * See sfmmu_get_ctx() for details. 11233 */ 11234 if (delay_tlb_flush) { 11235 xt_some(cpuset, sfmmu_raise_tsb_exception, 11236 cnum, INVALID_CONTEXT); 11237 SFMMU_STAT(sf_tlbflush_deferred); 11238 } else { 11239 xt_some(cpuset, sfmmu_ctx_steal_tl1, cnum, INVALID_CONTEXT); 11240 vtag_flushctx(cnum); 11241 SFMMU_STAT(sf_tlbflush_ctx); 11242 } 11243 xt_sync(cpuset); 11244 11245 /* 11246 * If we just stole the ctx from the current 11247 * process on local CPU we need to invalidate 11248 * this CPU context as well. 11249 */ 11250 if (sfmmu_getctx_sec() == cnum) { 11251 sfmmu_setctx_sec(INVALID_CONTEXT); 11252 sfmmu_clear_utsbinfo(); 11253 } 11254 11255 kpreempt_enable(); 11256 11257 /* 11258 * Now put old ctx on the dirty list since we may not 11259 * have flushed the context out of the TLB. We'll let 11260 * the next guy who uses this ctx flush it instead. 11261 */ 11262 mutex_enter(&ctx_list_lock); 11263 CTX_SET_FLAGS(ctx, CTX_FREE_FLAG); 11264 ctx->ctx_free = ctxdirty; 11265 ctxdirty = ctx; 11266 mutex_exit(&ctx_list_lock); 11267 } 11268 11269 /* 11270 * We need to flush the cache in all cpus. 
It is possible that 11271 * a process referenced a page as cacheable but has since exited 11272 * and cleared the mapping list. We still need to flush the page, but we 11273 * have no state, so flushing it on all cpus is the only alternative. 11274 */ 11275 void 11276 sfmmu_cache_flush(pfn_t pfnum, int vcolor) 11277 { 11278 cpuset_t cpuset; 11279 int ctxnum = INVALID_CONTEXT; 11280 11281 kpreempt_disable(); 11282 cpuset = cpu_ready_set; 11283 CPUSET_DEL(cpuset, CPU->cpu_id); 11284 SFMMU_XCALL_STATS(ctxnum); /* account to any ctx */ 11285 xt_some(cpuset, vac_flushpage_tl1, pfnum, vcolor); 11286 xt_sync(cpuset); 11287 vac_flushpage(pfnum, vcolor); 11288 kpreempt_enable(); 11289 } 11290 11291 void 11292 sfmmu_cache_flushcolor(int vcolor, pfn_t pfnum) 11293 { 11294 cpuset_t cpuset; 11295 int ctxnum = INVALID_CONTEXT; 11296 11297 ASSERT(vcolor >= 0); 11298 11299 kpreempt_disable(); 11300 cpuset = cpu_ready_set; 11301 CPUSET_DEL(cpuset, CPU->cpu_id); 11302 SFMMU_XCALL_STATS(ctxnum); /* account to any ctx */ 11303 xt_some(cpuset, vac_flushcolor_tl1, vcolor, pfnum); 11304 xt_sync(cpuset); 11305 vac_flushcolor(vcolor, pfnum); 11306 kpreempt_enable(); 11307 } 11308 11309 /* 11310 * We need to prevent processes from accessing the TSB using a cached physical 11311 * address. It's alright if they try to access the TSB via virtual address 11312 * since they will just fault on that virtual address once the mapping has 11313 * been suspended. 11314 */ 11315 #pragma weak sendmondo_in_recover 11316 11317 /* ARGSUSED */ 11318 static int 11319 sfmmu_tsb_pre_relocator(caddr_t va, uint_t tsbsz, uint_t flags, void *tsbinfo) 11320 { 11321 hatlock_t *hatlockp; 11322 struct tsb_info *tsbinfop = (struct tsb_info *)tsbinfo; 11323 sfmmu_t *sfmmup = tsbinfop->tsb_sfmmu; 11324 struct ctx *ctx; 11325 int cnum; 11326 extern uint32_t sendmondo_in_recover; 11327 11328 if (flags != HAT_PRESUSPEND) 11329 return (0); 11330 11331 hatlockp = sfmmu_hat_enter(sfmmup); 11332 11333 tsbinfop->tsb_flags |= TSB_RELOC_FLAG; 11334 11335 /* 11336 * For Cheetah+ Erratum 25: 11337 * Wait for any active recovery to finish. We can't risk 11338 * relocating the TSB of the thread running mondo_recover_proc() 11339 * since, if we did that, we would deadlock. The scenario we are 11340 * trying to avoid is as follows: 11341 * 11342 * THIS CPU RECOVER CPU 11343 * -------- ----------- 11344 * Begins recovery, walking through TSB 11345 * hat_pagesuspend() TSB TTE 11346 * TLB miss on TSB TTE, spins at TL1 11347 * xt_sync() 11348 * send_mondo_timeout() 11349 * mondo_recover_proc() 11350 * ((deadlocked)) 11351 * 11352 * The second half of the workaround is that mondo_recover_proc() 11353 * checks to see if the tsb_info has the RELOC flag set, and if it 11354 * does, it skips over that TSB without ever touching tsbinfop->tsb_va 11355 * and hence avoiding the TLB miss that could result in a deadlock. 11356 */ 11357 if (&sendmondo_in_recover) { 11358 membar_enter(); /* make sure RELOC flag visible */ 11359 while (sendmondo_in_recover) { 11360 drv_usecwait(1); 11361 membar_consumer(); 11362 } 11363 } 11364 11365 ctx = sfmmutoctx(sfmmup); 11366 rw_enter(&ctx->ctx_rwlock, RW_WRITER); 11367 cnum = sfmmutoctxnum(sfmmup); 11368 11369 if (cnum != INVALID_CONTEXT) { 11370 /* 11371 * Force all threads for this sfmmu to sfmmu_tsbmiss_exception 11372 * on their next TLB miss.
11373 */ 11374 sfmmu_tlb_swap_ctx(sfmmup, ctx); 11375 } 11376 11377 rw_exit(&ctx->ctx_rwlock); 11378 11379 sfmmu_hat_exit(hatlockp); 11380 11381 return (0); 11382 } 11383 11384 /* ARGSUSED */ 11385 static int 11386 sfmmu_tsb_post_relocator(caddr_t va, uint_t tsbsz, uint_t flags, 11387 void *tsbinfo, pfn_t newpfn) 11388 { 11389 hatlock_t *hatlockp; 11390 struct tsb_info *tsbinfop = (struct tsb_info *)tsbinfo; 11391 sfmmu_t *sfmmup = tsbinfop->tsb_sfmmu; 11392 11393 if (flags != HAT_POSTUNSUSPEND) 11394 return (0); 11395 11396 hatlockp = sfmmu_hat_enter(sfmmup); 11397 11398 SFMMU_STAT(sf_tsb_reloc); 11399 11400 /* 11401 * The process may have swapped out while we were relocating one 11402 * of its TSBs. If so, don't bother doing the setup since the 11403 * process can't be using the memory anymore. 11404 */ 11405 if ((tsbinfop->tsb_flags & TSB_SWAPPED) == 0) { 11406 ASSERT(va == tsbinfop->tsb_va); 11407 sfmmu_tsbinfo_setup_phys(tsbinfop, newpfn); 11408 sfmmu_setup_tsbinfo(sfmmup); 11409 11410 if (tsbinfop->tsb_flags & TSB_FLUSH_NEEDED) { 11411 sfmmu_inv_tsb(tsbinfop->tsb_va, 11412 TSB_BYTES(tsbinfop->tsb_szc)); 11413 tsbinfop->tsb_flags &= ~TSB_FLUSH_NEEDED; 11414 } 11415 } 11416 11417 membar_exit(); 11418 tsbinfop->tsb_flags &= ~TSB_RELOC_FLAG; 11419 cv_broadcast(&sfmmup->sfmmu_tsb_cv); 11420 11421 sfmmu_hat_exit(hatlockp); 11422 11423 return (0); 11424 } 11425 11426 /* 11427 * Allocate and initialize a tsb_info structure. Note that we may or may not 11428 * allocate a TSB here, depending on the flags passed in. 11429 */ 11430 static int 11431 sfmmu_tsbinfo_alloc(struct tsb_info **tsbinfopp, int tsb_szc, int tte_sz_mask, 11432 uint_t flags, sfmmu_t *sfmmup) 11433 { 11434 int err; 11435 11436 *tsbinfopp = (struct tsb_info *)kmem_cache_alloc( 11437 sfmmu_tsbinfo_cache, KM_SLEEP); 11438 11439 if ((err = sfmmu_init_tsbinfo(*tsbinfopp, tte_sz_mask, 11440 tsb_szc, flags, sfmmup)) != 0) { 11441 kmem_cache_free(sfmmu_tsbinfo_cache, *tsbinfopp); 11442 SFMMU_STAT(sf_tsb_allocfail); 11443 *tsbinfopp = NULL; 11444 return (err); 11445 } 11446 SFMMU_STAT(sf_tsb_alloc); 11447 11448 /* 11449 * Bump the TSB size counters for this TSB size. 11450 */ 11451 (*(((int *)&sfmmu_tsbsize_stat) + tsb_szc))++; 11452 return (0); 11453 } 11454 11455 static void 11456 sfmmu_tsb_free(struct tsb_info *tsbinfo) 11457 { 11458 caddr_t tsbva = tsbinfo->tsb_va; 11459 uint_t tsb_size = TSB_BYTES(tsbinfo->tsb_szc); 11460 struct kmem_cache *kmem_cachep = tsbinfo->tsb_cache; 11461 vmem_t *vmp = tsbinfo->tsb_vmp; 11462 11463 /* 11464 * If we allocated this TSB from relocatable kernel memory, then we 11465 * need to uninstall the callback handler. 
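 *
 * (The callback removed here is the relocation callback that
 * sfmmu_init_tsbinfo(), below, installs via hat_add_callback() for
 * TSBs allocated from caches other than sfmmu_tsb8k_cache.)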
11466 */ 11467 if (tsbinfo->tsb_cache != sfmmu_tsb8k_cache) { 11468 uintptr_t slab_mask = ~((uintptr_t)tsb_slab_mask) << PAGESHIFT; 11469 caddr_t slab_vaddr = (caddr_t)((uintptr_t)tsbva & slab_mask); 11470 page_t **ppl; 11471 int ret; 11472 11473 ret = as_pagelock(&kas, &ppl, slab_vaddr, PAGESIZE, S_WRITE); 11474 ASSERT(ret == 0); 11475 hat_delete_callback(tsbva, (uint_t)tsb_size, (void *)tsbinfo, 11476 0); 11477 as_pageunlock(&kas, ppl, slab_vaddr, PAGESIZE, S_WRITE); 11478 } 11479 11480 if (kmem_cachep != NULL) { 11481 kmem_cache_free(kmem_cachep, tsbva); 11482 } else { 11483 vmem_xfree(vmp, (void *)tsbva, tsb_size); 11484 } 11485 tsbinfo->tsb_va = (caddr_t)0xbad00bad; 11486 atomic_add_64(&tsb_alloc_bytes, -(int64_t)tsb_size); 11487 } 11488 11489 static void 11490 sfmmu_tsbinfo_free(struct tsb_info *tsbinfo) 11491 { 11492 if ((tsbinfo->tsb_flags & TSB_SWAPPED) == 0) { 11493 sfmmu_tsb_free(tsbinfo); 11494 } 11495 kmem_cache_free(sfmmu_tsbinfo_cache, tsbinfo); 11496 11497 } 11498 11499 /* 11500 * Setup all the references to physical memory for this tsbinfo. 11501 * The underlying page(s) must be locked. 11502 */ 11503 static void 11504 sfmmu_tsbinfo_setup_phys(struct tsb_info *tsbinfo, pfn_t pfn) 11505 { 11506 ASSERT(pfn != PFN_INVALID); 11507 ASSERT(pfn == va_to_pfn(tsbinfo->tsb_va)); 11508 11509 #ifndef sun4v 11510 if (tsbinfo->tsb_szc == 0) { 11511 sfmmu_memtte(&tsbinfo->tsb_tte, pfn, 11512 PROT_WRITE|PROT_READ, TTE8K); 11513 } else { 11514 /* 11515 * Round down PA and use a large mapping; the handlers will 11516 * compute the TSB pointer at the correct offset into the 11517 * big virtual page. NOTE: this assumes all TSBs larger 11518 * than 8K must come from physically contiguous slabs of 11519 * size tsb_slab_size. 11520 */ 11521 sfmmu_memtte(&tsbinfo->tsb_tte, pfn & ~tsb_slab_mask, 11522 PROT_WRITE|PROT_READ, tsb_slab_ttesz); 11523 } 11524 tsbinfo->tsb_pa = ptob(pfn); 11525 11526 TTE_SET_LOCKED(&tsbinfo->tsb_tte); /* lock the tte into dtlb */ 11527 TTE_SET_MOD(&tsbinfo->tsb_tte); /* enable writes */ 11528 11529 ASSERT(TTE_IS_PRIVILEGED(&tsbinfo->tsb_tte)); 11530 ASSERT(TTE_IS_LOCKED(&tsbinfo->tsb_tte)); 11531 #else /* sun4v */ 11532 tsbinfo->tsb_pa = ptob(pfn); 11533 #endif /* sun4v */ 11534 } 11535 11536 11537 /* 11538 * Returns zero on success, ENOMEM if over the high water mark, 11539 * or EAGAIN if the caller needs to retry with a smaller TSB 11540 * size (or specify TSB_FORCEALLOC if the allocation can't fail). 11541 * 11542 * This call cannot fail to allocate a TSB if TSB_FORCEALLOC 11543 * is specified and the TSB requested is PAGESIZE, though it 11544 * may sleep waiting for memory if sufficient memory is not 11545 * available. 11546 */ 11547 static int 11548 sfmmu_init_tsbinfo(struct tsb_info *tsbinfo, int tteszmask, 11549 int tsbcode, uint_t flags, sfmmu_t *sfmmup) 11550 { 11551 caddr_t vaddr = NULL; 11552 caddr_t slab_vaddr; 11553 uintptr_t slab_mask = ~((uintptr_t)tsb_slab_mask) << PAGESHIFT; 11554 int tsbbytes = TSB_BYTES(tsbcode); 11555 int lowmem = 0; 11556 struct kmem_cache *kmem_cachep = NULL; 11557 vmem_t *vmp = NULL; 11558 lgrp_id_t lgrpid = LGRP_NONE; 11559 pfn_t pfn; 11560 uint_t cbflags = HAC_SLEEP; 11561 page_t **pplist; 11562 int ret; 11563 11564 if (flags & (TSB_FORCEALLOC | TSB_SWAPIN | TSB_GROW | TSB_SHRINK)) 11565 flags |= TSB_ALLOC; 11566 11567 ASSERT((flags & TSB_FORCEALLOC) == 0 || tsbcode == TSB_MIN_SZCODE); 11568 11569 tsbinfo->tsb_sfmmu = sfmmup; 11570 11571 /* 11572 * If not allocating a TSB, set up the tsbinfo, set TSB_SWAPPED, and 11573 * return. 
11574 */ 11575 if ((flags & TSB_ALLOC) == 0) { 11576 tsbinfo->tsb_szc = tsbcode; 11577 tsbinfo->tsb_ttesz_mask = tteszmask; 11578 tsbinfo->tsb_va = (caddr_t)0xbadbadbeef; 11579 tsbinfo->tsb_pa = -1; 11580 tsbinfo->tsb_tte.ll = 0; 11581 tsbinfo->tsb_next = NULL; 11582 tsbinfo->tsb_flags = TSB_SWAPPED; 11583 tsbinfo->tsb_cache = NULL; 11584 tsbinfo->tsb_vmp = NULL; 11585 return (0); 11586 } 11587 11588 #ifdef DEBUG 11589 /* 11590 * For debugging: 11591 * Randomly force allocation failures every tsb_alloc_mtbf 11592 * tries if TSB_FORCEALLOC is not specified. This will 11593 * return ENOMEM if tsb_alloc_mtbf is odd, or EAGAIN if 11594 * it is even, to allow testing of both failure paths... 11595 */ 11596 if (tsb_alloc_mtbf && ((flags & TSB_FORCEALLOC) == 0) && 11597 (tsb_alloc_count++ == tsb_alloc_mtbf)) { 11598 tsb_alloc_count = 0; 11599 tsb_alloc_fail_mtbf++; 11600 return ((tsb_alloc_mtbf & 1)? ENOMEM : EAGAIN); 11601 } 11602 #endif /* DEBUG */ 11603 11604 /* 11605 * Enforce high water mark if we are not doing a forced allocation 11606 * and are not shrinking a process' TSB. 11607 */ 11608 if ((flags & TSB_SHRINK) == 0 && 11609 (tsbbytes + tsb_alloc_bytes) > tsb_alloc_hiwater) { 11610 if ((flags & TSB_FORCEALLOC) == 0) 11611 return (ENOMEM); 11612 lowmem = 1; 11613 } 11614 11615 /* 11616 * Allocate from the correct location based upon the size of the TSB 11617 * compared to the base page size, and what memory conditions dictate. 11618 * Note we always do nonblocking allocations from the TSB arena since 11619 * we don't want memory fragmentation to cause processes to block 11620 * indefinitely waiting for memory; until the kernel algorithms that 11621 * coalesce large pages are improved this is our best option. 11622 * 11623 * Algorithm: 11624 * If allocating a "large" TSB (>8K), allocate from the 11625 * appropriate kmem_tsb_default_arena vmem arena 11626 * else if low on memory or the TSB_FORCEALLOC flag is set or 11627 * tsb_forceheap is set 11628 * Allocate from kernel heap via sfmmu_tsb8k_cache with 11629 * KM_SLEEP (never fails) 11630 * else 11631 * Allocate from appropriate sfmmu_tsb_cache with 11632 * KM_NOSLEEP 11633 * endif 11634 */ 11635 if (tsb_lgrp_affinity) 11636 lgrpid = lgrp_home_id(curthread); 11637 if (lgrpid == LGRP_NONE) 11638 lgrpid = 0; /* use lgrp of boot CPU */ 11639 11640 if (tsbbytes > MMU_PAGESIZE) { 11641 vmp = kmem_tsb_default_arena[lgrpid]; 11642 vaddr = (caddr_t)vmem_xalloc(vmp, tsbbytes, tsbbytes, 0, 0, 11643 NULL, NULL, VM_NOSLEEP); 11644 #ifdef DEBUG 11645 } else if (lowmem || (flags & TSB_FORCEALLOC) || tsb_forceheap) { 11646 #else /* !DEBUG */ 11647 } else if (lowmem || (flags & TSB_FORCEALLOC)) { 11648 #endif /* DEBUG */ 11649 kmem_cachep = sfmmu_tsb8k_cache; 11650 vaddr = (caddr_t)kmem_cache_alloc(kmem_cachep, KM_SLEEP); 11651 ASSERT(vaddr != NULL); 11652 } else { 11653 kmem_cachep = sfmmu_tsb_cache[lgrpid]; 11654 vaddr = (caddr_t)kmem_cache_alloc(kmem_cachep, KM_NOSLEEP); 11655 } 11656 11657 tsbinfo->tsb_cache = kmem_cachep; 11658 tsbinfo->tsb_vmp = vmp; 11659 11660 if (vaddr == NULL) { 11661 return (EAGAIN); 11662 } 11663 11664 atomic_add_64(&tsb_alloc_bytes, (int64_t)tsbbytes); 11665 kmem_cachep = tsbinfo->tsb_cache; 11666 11667 /* 11668 * If we are allocating from outside the cage, then we need to 11669 * register a relocation callback handler. Note that for now 11670 * since pseudo mappings always hang off of the slab's root page, 11671 * we need only lock the first 8K of the TSB slab. This is a bit 11672 * hacky but it is good for performance. 
11673 */ 11674 if (kmem_cachep != sfmmu_tsb8k_cache) { 11675 slab_vaddr = (caddr_t)((uintptr_t)vaddr & slab_mask); 11676 ret = as_pagelock(&kas, &pplist, slab_vaddr, PAGESIZE, S_WRITE); 11677 ASSERT(ret == 0); 11678 ret = hat_add_callback(sfmmu_tsb_cb_id, vaddr, (uint_t)tsbbytes, 11679 cbflags, (void *)tsbinfo, &pfn); 11680 11681 /* 11682 * Need to free up resources if we could not successfully 11683 * add the callback function and return an error condition. 11684 */ 11685 if (ret != 0) { 11686 if (kmem_cachep) { 11687 kmem_cache_free(kmem_cachep, vaddr); 11688 } else { 11689 vmem_xfree(vmp, (void *)vaddr, tsbbytes); 11690 } 11691 as_pageunlock(&kas, pplist, slab_vaddr, PAGESIZE, 11692 S_WRITE); 11693 return (EAGAIN); 11694 } 11695 } else { 11696 /* 11697 * Since allocation of 8K TSBs from heap is rare and occurs 11698 * during memory pressure we allocate them from permanent 11699 * memory rather than using callbacks to get the PFN. 11700 */ 11701 pfn = hat_getpfnum(kas.a_hat, vaddr); 11702 } 11703 11704 tsbinfo->tsb_va = vaddr; 11705 tsbinfo->tsb_szc = tsbcode; 11706 tsbinfo->tsb_ttesz_mask = tteszmask; 11707 tsbinfo->tsb_next = NULL; 11708 tsbinfo->tsb_flags = 0; 11709 11710 sfmmu_tsbinfo_setup_phys(tsbinfo, pfn); 11711 11712 if (kmem_cachep != sfmmu_tsb8k_cache) { 11713 as_pageunlock(&kas, pplist, slab_vaddr, PAGESIZE, S_WRITE); 11714 } 11715 11716 sfmmu_inv_tsb(vaddr, tsbbytes); 11717 return (0); 11718 } 11719 11720 /* 11721 * Initialize per cpu tsb and per cpu tsbmiss_area 11722 */ 11723 void 11724 sfmmu_init_tsbs(void) 11725 { 11726 int i; 11727 struct tsbmiss *tsbmissp; 11728 struct kpmtsbm *kpmtsbmp; 11729 #ifndef sun4v 11730 extern int dcache_line_mask; 11731 #endif /* sun4v */ 11732 extern uint_t vac_colors; 11733 11734 /* 11735 * Init. tsb miss area. 11736 */ 11737 tsbmissp = tsbmiss_area; 11738 11739 for (i = 0; i < NCPU; tsbmissp++, i++) { 11740 /* 11741 * initialize the tsbmiss area. 11742 * Do this for all possible CPUs as some may be added 11743 * while the system is running. There is no cost to this. 11744 */ 11745 tsbmissp->ksfmmup = ksfmmup; 11746 #ifndef sun4v 11747 tsbmissp->dcache_line_mask = (uint16_t)dcache_line_mask; 11748 #endif /* sun4v */ 11749 tsbmissp->khashstart = 11750 (struct hmehash_bucket *)va_to_pa((caddr_t)khme_hash); 11751 tsbmissp->uhashstart = 11752 (struct hmehash_bucket *)va_to_pa((caddr_t)uhme_hash); 11753 tsbmissp->khashsz = khmehash_num; 11754 tsbmissp->uhashsz = uhmehash_num; 11755 } 11756 11757 if (kpm_enable == 0) 11758 return; 11759 11760 if (kpm_smallpages) { 11761 /* 11762 * If we're using base pagesize pages for seg_kpm 11763 * mappings, we use the kernel TSB since we can't afford 11764 * to allocate a second huge TSB for these mappings. 11765 */ 11766 kpm_tsbbase = ktsb_phys? ktsb_pbase : (uint64_t)ktsb_base; 11767 kpm_tsbsz = ktsb_szcode; 11768 kpmsm_tsbbase = kpm_tsbbase; 11769 kpmsm_tsbsz = kpm_tsbsz; 11770 } else { 11771 /* 11772 * In VAC conflict case, just put the entries in the 11773 * kernel 8K indexed TSB for now so we can find them. 11774 * This could really be changed in the future if we feel 11775 * the need... 11776 */ 11777 kpmsm_tsbbase = ktsb_phys? ktsb_pbase : (uint64_t)ktsb_base; 11778 kpmsm_tsbsz = ktsb_szcode; 11779 kpm_tsbbase = ktsb_phys? ktsb4m_pbase : (uint64_t)ktsb4m_base; 11780 kpm_tsbsz = ktsb4m_szcode; 11781 } 11782 11783 kpmtsbmp = kpmtsbm_area; 11784 for (i = 0; i < NCPU; kpmtsbmp++, i++) { 11785 /* 11786 * Initialize the kpmtsbm area. 
11787 * Do this for all possible CPUs as some may be added 11788 * while the system is running. There is no cost to this. 11789 */ 11790 kpmtsbmp->vbase = kpm_vbase; 11791 kpmtsbmp->vend = kpm_vbase + kpm_size * vac_colors; 11792 kpmtsbmp->sz_shift = kpm_size_shift; 11793 kpmtsbmp->kpmp_shift = kpmp_shift; 11794 kpmtsbmp->kpmp2pshft = (uchar_t)kpmp2pshft; 11795 if (kpm_smallpages == 0) { 11796 kpmtsbmp->kpmp_table_sz = kpmp_table_sz; 11797 kpmtsbmp->kpmp_tablepa = va_to_pa(kpmp_table); 11798 } else { 11799 kpmtsbmp->kpmp_table_sz = kpmp_stable_sz; 11800 kpmtsbmp->kpmp_tablepa = va_to_pa(kpmp_stable); 11801 } 11802 kpmtsbmp->msegphashpa = va_to_pa(memseg_phash); 11803 kpmtsbmp->flags = KPMTSBM_ENABLE_FLAG; 11804 #ifdef DEBUG 11805 kpmtsbmp->flags |= (kpm_tsbmtl) ? KPMTSBM_TLTSBM_FLAG : 0; 11806 #endif /* DEBUG */ 11807 if (ktsb_phys) 11808 kpmtsbmp->flags |= KPMTSBM_TSBPHYS_FLAG; 11809 } 11810 11811 sfmmu_tsb_cb_id = hat_register_callback(sfmmu_tsb_pre_relocator, 11812 sfmmu_tsb_post_relocator, NULL, 0); 11813 } 11814 11815 /* Avoid using sfmmu_tsbinfo_alloc() to avoid kmem_alloc - no real reason */ 11816 struct tsb_info ktsb_info[2]; 11817 11818 /* 11819 * Called from hat_kern_setup() to setup the tsb_info for ksfmmup. 11820 */ 11821 void 11822 sfmmu_init_ktsbinfo() 11823 { 11824 ASSERT(ksfmmup != NULL); 11825 ASSERT(ksfmmup->sfmmu_tsb == NULL); 11826 /* 11827 * Allocate tsbinfos for kernel and copy in data 11828 * to make debug easier and sun4v setup easier. 11829 */ 11830 ktsb_info[0].tsb_sfmmu = ksfmmup; 11831 ktsb_info[0].tsb_szc = ktsb_szcode; 11832 ktsb_info[0].tsb_ttesz_mask = TSB8K|TSB64K|TSB512K; 11833 ktsb_info[0].tsb_va = ktsb_base; 11834 ktsb_info[0].tsb_pa = ktsb_pbase; 11835 ktsb_info[0].tsb_flags = 0; 11836 ktsb_info[0].tsb_tte.ll = 0; 11837 ktsb_info[0].tsb_cache = NULL; 11838 11839 ktsb_info[1].tsb_sfmmu = ksfmmup; 11840 ktsb_info[1].tsb_szc = ktsb4m_szcode; 11841 ktsb_info[1].tsb_ttesz_mask = TSB4M; 11842 ktsb_info[1].tsb_va = ktsb4m_base; 11843 ktsb_info[1].tsb_pa = ktsb4m_pbase; 11844 ktsb_info[1].tsb_flags = 0; 11845 ktsb_info[1].tsb_tte.ll = 0; 11846 ktsb_info[1].tsb_cache = NULL; 11847 11848 /* Link them into ksfmmup. */ 11849 ktsb_info[0].tsb_next = &ktsb_info[1]; 11850 ktsb_info[1].tsb_next = NULL; 11851 ksfmmup->sfmmu_tsb = &ktsb_info[0]; 11852 11853 sfmmu_setup_tsbinfo(ksfmmup); 11854 } 11855 11856 /* 11857 * Cache the last value returned from va_to_pa(). If the VA specified 11858 * in the current call to cached_va_to_pa() maps to the same Page (as the 11859 * previous call to cached_va_to_pa()), then compute the PA using 11860 * cached info, else call va_to_pa(). 11861 * 11862 * Note: this function is neither MT-safe nor consistent in the presence 11863 * of multiple, interleaved threads. This function was created to enable 11864 * an optimization used during boot (at a point when there's only one thread 11865 * executing on the "boot CPU", and before startup_vm() has been called). 11866 */ 11867 static uint64_t 11868 cached_va_to_pa(void *vaddr) 11869 { 11870 static uint64_t prev_vaddr_base = 0; 11871 static uint64_t prev_pfn = 0; 11872 11873 if ((((uint64_t)vaddr) & MMU_PAGEMASK) == prev_vaddr_base) { 11874 return (prev_pfn | ((uint64_t)vaddr & MMU_PAGEOFFSET)); 11875 } else { 11876 uint64_t pa = va_to_pa(vaddr); 11877 11878 if (pa != ((uint64_t)-1)) { 11879 /* 11880 * Computed physical address is valid. Cache its 11881 * related info for the next cached_va_to_pa() call. 
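 * Despite its name, prev_pfn caches the page-aligned physical
 * address, not a pfn. Illustrative example, assuming 8K pages: if
 * va_to_pa(0x1234e100) returns 0x7654e100, we cache
 * prev_vaddr_base = 0x1234e000 and prev_pfn = 0x7654e000, so a
 * following call for 0x1234e9f0 can return 0x7654e9f0 without
 * calling va_to_pa() again.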
11882 */ 11883 prev_pfn = pa & MMU_PAGEMASK; 11884 prev_vaddr_base = ((uint64_t)vaddr) & MMU_PAGEMASK; 11885 } 11886 11887 return (pa); 11888 } 11889 } 11890 11891 /* 11892 * Carve up our nucleus hblk region. We may allocate more hblks than 11893 * asked due to rounding errors but we are guaranteed to have at least 11894 * enough space to allocate the requested number of hblk8's and hblk1's. 11895 */ 11896 void 11897 sfmmu_init_nucleus_hblks(caddr_t addr, size_t size, int nhblk8, int nhblk1) 11898 { 11899 struct hme_blk *hmeblkp; 11900 size_t hme8blk_sz, hme1blk_sz; 11901 size_t i; 11902 size_t hblk8_bound; 11903 ulong_t j = 0, k = 0; 11904 11905 ASSERT(addr != NULL && size != 0); 11906 11907 /* Need to use proper structure alignment */ 11908 hme8blk_sz = roundup(HME8BLK_SZ, sizeof (int64_t)); 11909 hme1blk_sz = roundup(HME1BLK_SZ, sizeof (int64_t)); 11910 11911 nucleus_hblk8.list = (void *)addr; 11912 nucleus_hblk8.index = 0; 11913 11914 /* 11915 * Use as much memory as possible for hblk8's since we 11916 * expect all bop_alloc'ed memory to be allocated in 8k chunks. 11917 * We need to hold back enough space for the hblk1's which 11918 * we'll allocate next. 11919 */ 11920 hblk8_bound = size - (nhblk1 * hme1blk_sz) - hme8blk_sz; 11921 for (i = 0; i <= hblk8_bound; i += hme8blk_sz, j++) { 11922 hmeblkp = (struct hme_blk *)addr; 11923 addr += hme8blk_sz; 11924 hmeblkp->hblk_nuc_bit = 1; 11925 hmeblkp->hblk_nextpa = cached_va_to_pa((caddr_t)hmeblkp); 11926 } 11927 nucleus_hblk8.len = j; 11928 ASSERT(j >= nhblk8); 11929 SFMMU_STAT_ADD(sf_hblk8_ncreate, j); 11930 11931 nucleus_hblk1.list = (void *)addr; 11932 nucleus_hblk1.index = 0; 11933 for (; i <= (size - hme1blk_sz); i += hme1blk_sz, k++) { 11934 hmeblkp = (struct hme_blk *)addr; 11935 addr += hme1blk_sz; 11936 hmeblkp->hblk_nuc_bit = 1; 11937 hmeblkp->hblk_nextpa = cached_va_to_pa((caddr_t)hmeblkp); 11938 } 11939 ASSERT(k >= nhblk1); 11940 nucleus_hblk1.len = k; 11941 SFMMU_STAT_ADD(sf_hblk1_ncreate, k); 11942 } 11943 11944 /* 11945 * This function is currently not supported on this platform. For what 11946 * it's supposed to do, see hat.c and hat_srmmu.c 11947 */ 11948 /* ARGSUSED */ 11949 faultcode_t 11950 hat_softlock(struct hat *hat, caddr_t addr, size_t *lenp, page_t **ppp, 11951 uint_t flags) 11952 { 11953 ASSERT(hat->sfmmu_xhat_provider == NULL); 11954 return (FC_NOSUPPORT); 11955 } 11956 11957 /* 11958 * Searches the mapping list of the page for a mapping of the same size. If not 11959 * found the corresponding bit is cleared in the p_index field. When large 11960 * pages are more prevalent in the system, we can maintain the mapping list 11961 * in order and we don't have to traverse the list each time. Just check the 11962 * next and prev entries, and if both are of different size, we clear the bit. 11963 */ 11964 static void 11965 sfmmu_rm_large_mappings(page_t *pp, int ttesz) 11966 { 11967 struct sf_hment *sfhmep; 11968 struct hme_blk *hmeblkp; 11969 int index; 11970 pgcnt_t npgs; 11971 11972 ASSERT(ttesz > TTE8K); 11973 11974 ASSERT(sfmmu_mlist_held(pp)); 11975 11976 ASSERT(PP_ISMAPPED_LARGE(pp)); 11977 11978 /* 11979 * Traverse mapping list looking for another mapping of the same size, 11980 * since we only want to clear index field if all mappings of 11981 * that size are gone.
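 * Example for illustration: when the last 512K mapping of a page
 * goes away, PAGESZ_TO_INDEX(TTE512K) gives the p_index bit to
 * clear and TTEPAGES(TTE512K) == 64 (with 8K base pages), so the
 * bit is cleared in all 64 constituent page_t's below.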
11982 */ 11983 11984 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) { 11985 hmeblkp = sfmmu_hmetohblk(sfhmep); 11986 if (hmeblkp->hblk_xhat_bit) 11987 continue; 11988 if (hme_size(sfhmep) == ttesz) { 11989 /* 11990 * another mapping of the same size. don't clear index. 11991 */ 11992 return; 11993 } 11994 } 11995 11996 /* 11997 * Clear the p_index bit for large page. 11998 */ 11999 index = PAGESZ_TO_INDEX(ttesz); 12000 npgs = TTEPAGES(ttesz); 12001 while (npgs-- > 0) { 12002 ASSERT(pp->p_index & index); 12003 pp->p_index &= ~index; 12004 pp = PP_PAGENEXT(pp); 12005 } 12006 } 12007 12008 /* 12009 * return supported features 12010 */ 12011 /* ARGSUSED */ 12012 int 12013 hat_supported(enum hat_features feature, void *arg) 12014 { 12015 switch (feature) { 12016 case HAT_SHARED_PT: 12017 case HAT_DYNAMIC_ISM_UNMAP: 12018 case HAT_VMODSORT: 12019 return (1); 12020 default: 12021 return (0); 12022 } 12023 } 12024 12025 void 12026 hat_enter(struct hat *hat) 12027 { 12028 hatlock_t *hatlockp; 12029 12030 if (hat != ksfmmup) { 12031 hatlockp = TSB_HASH(hat); 12032 mutex_enter(HATLOCK_MUTEXP(hatlockp)); 12033 } 12034 } 12035 12036 void 12037 hat_exit(struct hat *hat) 12038 { 12039 hatlock_t *hatlockp; 12040 12041 if (hat != ksfmmup) { 12042 hatlockp = TSB_HASH(hat); 12043 mutex_exit(HATLOCK_MUTEXP(hatlockp)); 12044 } 12045 } 12046 12047 /*ARGSUSED*/ 12048 void 12049 hat_reserve(struct as *as, caddr_t addr, size_t len) 12050 { 12051 } 12052 12053 static void 12054 hat_kstat_init(void) 12055 { 12056 kstat_t *ksp; 12057 12058 ksp = kstat_create("unix", 0, "sfmmu_global_stat", "hat", 12059 KSTAT_TYPE_RAW, sizeof (struct sfmmu_global_stat), 12060 KSTAT_FLAG_VIRTUAL); 12061 if (ksp) { 12062 ksp->ks_data = (void *) &sfmmu_global_stat; 12063 kstat_install(ksp); 12064 } 12065 ksp = kstat_create("unix", 0, "sfmmu_tsbsize_stat", "hat", 12066 KSTAT_TYPE_RAW, sizeof (struct sfmmu_tsbsize_stat), 12067 KSTAT_FLAG_VIRTUAL); 12068 if (ksp) { 12069 ksp->ks_data = (void *) &sfmmu_tsbsize_stat; 12070 kstat_install(ksp); 12071 } 12072 ksp = kstat_create("unix", 0, "sfmmu_percpu_stat", "hat", 12073 KSTAT_TYPE_RAW, sizeof (struct sfmmu_percpu_stat) * NCPU, 12074 KSTAT_FLAG_WRITABLE); 12075 if (ksp) { 12076 ksp->ks_update = sfmmu_kstat_percpu_update; 12077 kstat_install(ksp); 12078 } 12079 } 12080 12081 /* ARGSUSED */ 12082 static int 12083 sfmmu_kstat_percpu_update(kstat_t *ksp, int rw) 12084 { 12085 struct sfmmu_percpu_stat *cpu_kstat = ksp->ks_data; 12086 struct tsbmiss *tsbm = tsbmiss_area; 12087 struct kpmtsbm *kpmtsbm = kpmtsbm_area; 12088 int i; 12089 12090 ASSERT(cpu_kstat); 12091 if (rw == KSTAT_READ) { 12092 for (i = 0; i < NCPU; cpu_kstat++, tsbm++, kpmtsbm++, i++) { 12093 cpu_kstat->sf_itlb_misses = tsbm->itlb_misses; 12094 cpu_kstat->sf_dtlb_misses = tsbm->dtlb_misses; 12095 cpu_kstat->sf_utsb_misses = tsbm->utsb_misses - 12096 tsbm->uprot_traps; 12097 cpu_kstat->sf_ktsb_misses = tsbm->ktsb_misses + 12098 kpmtsbm->kpm_tsb_misses - tsbm->kprot_traps; 12099 12100 if (tsbm->itlb_misses > 0 && tsbm->dtlb_misses > 0) { 12101 cpu_kstat->sf_tsb_hits = 12102 (tsbm->itlb_misses + tsbm->dtlb_misses) - 12103 (tsbm->utsb_misses + tsbm->ktsb_misses + 12104 kpmtsbm->kpm_tsb_misses); 12105 } else { 12106 cpu_kstat->sf_tsb_hits = 0; 12107 } 12108 cpu_kstat->sf_umod_faults = tsbm->uprot_traps; 12109 cpu_kstat->sf_kmod_faults = tsbm->kprot_traps; 12110 } 12111 } else { 12112 /* KSTAT_WRITE is used to clear stats */ 12113 for (i = 0; i < NCPU; tsbm++, kpmtsbm++, i++) { 12114 tsbm->itlb_misses = 0; 12115 
tsbm->dtlb_misses = 0; 12116 tsbm->utsb_misses = 0; 12117 tsbm->ktsb_misses = 0; 12118 tsbm->uprot_traps = 0; 12119 tsbm->kprot_traps = 0; 12120 kpmtsbm->kpm_dtlb_misses = 0; 12121 kpmtsbm->kpm_tsb_misses = 0; 12122 } 12123 } 12124 return (0); 12125 } 12126 12127 #ifdef DEBUG 12128 12129 tte_t *gorig[NCPU], *gcur[NCPU], *gnew[NCPU]; 12130 12131 /* 12132 * A tte checker. *orig_old is the value we read before cas. 12133 * *cur is the value returned by cas. 12134 * *new is the desired value when we do the cas. 12135 * 12136 * *hmeblkp is currently unused. 12137 */ 12138 12139 /* ARGSUSED */ 12140 void 12141 chk_tte(tte_t *orig_old, tte_t *cur, tte_t *new, struct hme_blk *hmeblkp) 12142 { 12143 uint_t i, j, k; 12144 int cpuid = CPU->cpu_id; 12145 12146 gorig[cpuid] = orig_old; 12147 gcur[cpuid] = cur; 12148 gnew[cpuid] = new; 12149 12150 #ifdef lint 12151 hmeblkp = hmeblkp; 12152 #endif 12153 12154 if (TTE_IS_VALID(orig_old)) { 12155 if (TTE_IS_VALID(cur)) { 12156 i = TTE_TO_TTEPFN(orig_old); 12157 j = TTE_TO_TTEPFN(cur); 12158 k = TTE_TO_TTEPFN(new); 12159 if (i != j) { 12160 /* remap error? */ 12161 panic("chk_tte: bad pfn, 0x%x, 0x%x", 12162 i, j); 12163 } 12164 12165 if (i != k) { 12166 /* remap error? */ 12167 panic("chk_tte: bad pfn2, 0x%x, 0x%x", 12168 i, k); 12169 } 12170 } else { 12171 if (TTE_IS_VALID(new)) { 12172 panic("chk_tte: invalid cur? "); 12173 } 12174 12175 i = TTE_TO_TTEPFN(orig_old); 12176 k = TTE_TO_TTEPFN(new); 12177 if (i != k) { 12178 panic("chk_tte: bad pfn3, 0x%x, 0x%x", 12179 i, k); 12180 } 12181 } 12182 } else { 12183 if (TTE_IS_VALID(cur)) { 12184 j = TTE_TO_TTEPFN(cur); 12185 if (TTE_IS_VALID(new)) { 12186 k = TTE_TO_TTEPFN(new); 12187 if (j != k) { 12188 panic("chk_tte: bad pfn4, 0x%x, 0x%x", 12189 j, k); 12190 } 12191 } else { 12192 panic("chk_tte: why here?"); 12193 } 12194 } else { 12195 if (!TTE_IS_VALID(new)) { 12196 panic("chk_tte: why here2 ?"); 12197 } 12198 } 12199 } 12200 } 12201 12202 #endif /* DEBUG */ 12203 12204 extern void prefetch_tsbe_read(struct tsbe *); 12205 extern void prefetch_tsbe_write(struct tsbe *); 12206 12207 12208 /* 12209 * We want to prefetch 7 cache lines ahead for our read prefetch. This gives 12210 * us optimal performance on Cheetah+. You can only have 8 outstanding 12211 * prefetches at any one time, so we opted for 7 read prefetches and 1 write 12212 * prefetch to make the most utilization of the prefetch capability. 12213 */ 12214 #define TSBE_PREFETCH_STRIDE (7) 12215 12216 void 12217 sfmmu_copy_tsb(struct tsb_info *old_tsbinfo, struct tsb_info *new_tsbinfo) 12218 { 12219 int old_bytes = TSB_BYTES(old_tsbinfo->tsb_szc); 12220 int new_bytes = TSB_BYTES(new_tsbinfo->tsb_szc); 12221 int old_entries = TSB_ENTRIES(old_tsbinfo->tsb_szc); 12222 int new_entries = TSB_ENTRIES(new_tsbinfo->tsb_szc); 12223 struct tsbe *old; 12224 struct tsbe *new; 12225 struct tsbe *new_base = (struct tsbe *)new_tsbinfo->tsb_va; 12226 uint64_t va; 12227 int new_offset; 12228 int i; 12229 int vpshift; 12230 int last_prefetch; 12231 12232 if (old_bytes == new_bytes) { 12233 bcopy(old_tsbinfo->tsb_va, new_tsbinfo->tsb_va, new_bytes); 12234 } else { 12235 12236 /* 12237 * A TSBE is 16 bytes which means there are four TSBE's per 12238 * P$ line (64 bytes), thus every 4 TSBE's we prefetch. 
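 * For illustration: with 512 old entries the read prefetch below is
 * issued at i = 0, 4, 8, ... (once per 64-byte line) and stops at
 * i < 480, i.e. 4 * (TSBE_PREFETCH_STRIDE + 1) == 32 entries before
 * the end, so a prefetch reaching several lines ahead never runs
 * past the old TSB.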
12239 */ 12240 old = (struct tsbe *)old_tsbinfo->tsb_va; 12241 last_prefetch = old_entries - (4*(TSBE_PREFETCH_STRIDE+1)); 12242 for (i = 0; i < old_entries; i++, old++) { 12243 if (((i & (4-1)) == 0) && (i < last_prefetch)) 12244 prefetch_tsbe_read(old); 12245 if (!old->tte_tag.tag_invalid) { 12246 /* 12247 * We have a valid TTE to remap. Check the 12248 * size. We won't remap 64K or 512K TTEs 12249 * because they span more than one TSB entry 12250 * and are indexed using an 8K virt. page. 12251 * Ditto for 32M and 256M TTEs. 12252 */ 12253 if (TTE_CSZ(&old->tte_data) == TTE64K || 12254 TTE_CSZ(&old->tte_data) == TTE512K) 12255 continue; 12256 if (mmu_page_sizes == max_mmu_page_sizes) { 12257 if (TTE_CSZ(&old->tte_data) == TTE32M || 12258 TTE_CSZ(&old->tte_data) == TTE256M) 12259 continue; 12260 } 12261 12262 /* clear the lower 22 bits of the va */ 12263 va = *(uint64_t *)old << 22; 12264 /* turn va into a virtual pfn */ 12265 va >>= 22 - TSB_START_SIZE; 12266 /* 12267 * or in bits from the offset in the tsb 12268 * to get the real virtual pfn. These 12269 * correspond to bits [21:13] in the va 12270 */ 12271 vpshift = 12272 TTE_BSZS_SHIFT(TTE_CSZ(&old->tte_data)) & 12273 0x1ff; 12274 va |= (i << vpshift); 12275 va >>= vpshift; 12276 new_offset = va & (new_entries - 1); 12277 new = new_base + new_offset; 12278 prefetch_tsbe_write(new); 12279 *new = *old; 12280 } 12281 } 12282 } 12283 } 12284 12285 /* 12286 * Kernel Physical Mapping (kpm) facility 12287 */ 12288 12289 /* -- hat_kpm interface section -- */ 12290 12291 /* 12292 * Mapin a locked page and return the vaddr. 12293 * When a kpme is provided by the caller it is added to 12294 * the page p_kpmelist. The page to be mapped in must 12295 * be at least read locked (p_selock). 12296 */ 12297 caddr_t 12298 hat_kpm_mapin(struct page *pp, struct kpme *kpme) 12299 { 12300 kmutex_t *pml; 12301 caddr_t vaddr; 12302 12303 if (kpm_enable == 0) { 12304 cmn_err(CE_WARN, "hat_kpm_mapin: kpm_enable not set"); 12305 return ((caddr_t)NULL); 12306 } 12307 12308 if (pp == NULL || PAGE_LOCKED(pp) == 0) { 12309 cmn_err(CE_WARN, "hat_kpm_mapin: pp zero or not locked"); 12310 return ((caddr_t)NULL); 12311 } 12312 12313 pml = sfmmu_mlist_enter(pp); 12314 ASSERT(pp->p_kpmref >= 0); 12315 12316 vaddr = (pp->p_kpmref == 0) ? 12317 sfmmu_kpm_mapin(pp) : hat_kpm_page2va(pp, 1); 12318 12319 if (kpme != NULL) { 12320 /* 12321 * Tolerate multiple mapins for the same kpme to avoid 12322 * the need for an extra serialization. 12323 */ 12324 if ((sfmmu_kpme_lookup(kpme, pp)) == 0) 12325 sfmmu_kpme_add(kpme, pp); 12326 12327 ASSERT(pp->p_kpmref > 0); 12328 12329 } else { 12330 pp->p_kpmref++; 12331 } 12332 12333 sfmmu_mlist_exit(pml); 12334 return (vaddr); 12335 } 12336 12337 /* 12338 * Mapout a locked page. 12339 * When a kpme is provided by the caller it is removed from 12340 * the page p_kpmelist. The page to be mapped out must be at 12341 * least read locked (p_selock). 12342 * Note: The seg_kpm layer provides a mapout interface for the 12343 * case that a kpme is used and the underlying page is unlocked. 12344 * This can be used instead of calling this function directly. 
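 *
 * Minimal usage sketch for illustration (assumes pp is a page_t the
 * caller holds at least share locked and that the caller supplies
 * its own kpme):
 *
 *	struct kpme kpme;
 *	caddr_t kva;
 *
 *	kva = hat_kpm_mapin(pp, &kpme);
 *	if (kva != NULL) {
 *		... access the page contents through kva ...
 *		hat_kpm_mapout(pp, &kpme, kva);
 *	}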
12345 */ 12346 void 12347 hat_kpm_mapout(struct page *pp, struct kpme *kpme, caddr_t vaddr) 12348 { 12349 kmutex_t *pml; 12350 12351 if (kpm_enable == 0) { 12352 cmn_err(CE_WARN, "hat_kpm_mapout: kpm_enable not set"); 12353 return; 12354 } 12355 12356 if (IS_KPM_ADDR(vaddr) == 0) { 12357 cmn_err(CE_WARN, "hat_kpm_mapout: no kpm address"); 12358 return; 12359 } 12360 12361 if (pp == NULL || PAGE_LOCKED(pp) == 0) { 12362 cmn_err(CE_WARN, "hat_kpm_mapout: page zero or not locked"); 12363 return; 12364 } 12365 12366 if (kpme != NULL) { 12367 ASSERT(pp == kpme->kpe_page); 12368 pp = kpme->kpe_page; 12369 pml = sfmmu_mlist_enter(pp); 12370 12371 if (sfmmu_kpme_lookup(kpme, pp) == 0) 12372 panic("hat_kpm_mapout: kpme not found pp=%p", 12373 (void *)pp); 12374 12375 ASSERT(pp->p_kpmref > 0); 12376 sfmmu_kpme_sub(kpme, pp); 12377 12378 } else { 12379 pml = sfmmu_mlist_enter(pp); 12380 pp->p_kpmref--; 12381 } 12382 12383 ASSERT(pp->p_kpmref >= 0); 12384 if (pp->p_kpmref == 0) 12385 sfmmu_kpm_mapout(pp, vaddr); 12386 12387 sfmmu_mlist_exit(pml); 12388 } 12389 12390 /* 12391 * Return the kpm virtual address for the page at pp. 12392 * If checkswap is non zero and the page is backed by a 12393 * swap vnode the physical address is used rather than 12394 * p_offset to determine the kpm region. 12395 * Note: The function has to be used w/ extreme care. The 12396 * stability of the page identity is in the responsibility 12397 * of the caller. 12398 */ 12399 caddr_t 12400 hat_kpm_page2va(struct page *pp, int checkswap) 12401 { 12402 int vcolor, vcolor_pa; 12403 uintptr_t paddr, vaddr; 12404 12405 ASSERT(kpm_enable); 12406 12407 paddr = ptob(pp->p_pagenum); 12408 vcolor_pa = addr_to_vcolor(paddr); 12409 12410 if (checkswap && pp->p_vnode && IS_SWAPFSVP(pp->p_vnode)) 12411 vcolor = (PP_ISNC(pp)) ? vcolor_pa : PP_GET_VCOLOR(pp); 12412 else 12413 vcolor = addr_to_vcolor(pp->p_offset); 12414 12415 vaddr = (uintptr_t)kpm_vbase + paddr; 12416 12417 if (vcolor_pa != vcolor) { 12418 vaddr += ((uintptr_t)(vcolor - vcolor_pa) << MMU_PAGESHIFT); 12419 vaddr += (vcolor_pa > vcolor) ? 12420 ((uintptr_t)vcolor_pa << kpm_size_shift) : 12421 ((uintptr_t)(vcolor - vcolor_pa) << kpm_size_shift); 12422 } 12423 12424 return ((caddr_t)vaddr); 12425 } 12426 12427 /* 12428 * Return the page for the kpm virtual address vaddr. 12429 * Caller is responsible for the kpm mapping and lock 12430 * state of the page. 
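 * In the common case (matching virtual and physical color) this is
 * simply the inverse of hat_kpm_page2va() above: SFMMU_KPM_VTOP()
 * recovers the physical address from the kpm virtual address and the
 * page is looked up by its pfn, so for a kpm mapped page one would
 * expect hat_kpm_vaddr2page(hat_kpm_page2va(pp, 1)) == pp.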
12431 */ 12432 page_t * 12433 hat_kpm_vaddr2page(caddr_t vaddr) 12434 { 12435 uintptr_t paddr; 12436 pfn_t pfn; 12437 12438 ASSERT(IS_KPM_ADDR(vaddr)); 12439 12440 SFMMU_KPM_VTOP(vaddr, paddr); 12441 pfn = (pfn_t)btop(paddr); 12442 12443 return (page_numtopp_nolock(pfn)); 12444 } 12445 12446 /* page to kpm_page */ 12447 #define PP2KPMPG(pp, kp) { \ 12448 struct memseg *mseg; \ 12449 pgcnt_t inx; \ 12450 pfn_t pfn; \ 12451 \ 12452 pfn = pp->p_pagenum; \ 12453 mseg = page_numtomemseg_nolock(pfn); \ 12454 ASSERT(mseg); \ 12455 inx = ptokpmp(kpmptop(ptokpmp(pfn)) - mseg->kpm_pbase); \ 12456 ASSERT(inx < mseg->kpm_nkpmpgs); \ 12457 kp = &mseg->kpm_pages[inx]; \ 12458 } 12459 12460 /* page to kpm_spage */ 12461 #define PP2KPMSPG(pp, ksp) { \ 12462 struct memseg *mseg; \ 12463 pgcnt_t inx; \ 12464 pfn_t pfn; \ 12465 \ 12466 pfn = pp->p_pagenum; \ 12467 mseg = page_numtomemseg_nolock(pfn); \ 12468 ASSERT(mseg); \ 12469 inx = pfn - mseg->kpm_pbase; \ 12470 ksp = &mseg->kpm_spages[inx]; \ 12471 } 12472 12473 /* 12474 * hat_kpm_fault is called from segkpm_fault when a kpm tsbmiss occurred 12475 * which could not be resolved by the trap level tsbmiss handler for the 12476 * following reasons: 12477 * . The vaddr is in VAC alias range (always PAGESIZE mapping size). 12478 * . The kpm (s)page range of vaddr is in a VAC alias prevention state. 12479 * . tsbmiss handling at trap level is not desired (DEBUG kernel only, 12480 * kpm_tsbmtl == 0). 12481 */ 12482 int 12483 hat_kpm_fault(struct hat *hat, caddr_t vaddr) 12484 { 12485 int error; 12486 uintptr_t paddr; 12487 pfn_t pfn; 12488 struct memseg *mseg; 12489 page_t *pp; 12490 12491 if (kpm_enable == 0) { 12492 cmn_err(CE_WARN, "hat_kpm_fault: kpm_enable not set"); 12493 return (ENOTSUP); 12494 } 12495 12496 ASSERT(hat == ksfmmup); 12497 ASSERT(IS_KPM_ADDR(vaddr)); 12498 12499 SFMMU_KPM_VTOP(vaddr, paddr); 12500 pfn = (pfn_t)btop(paddr); 12501 mseg = page_numtomemseg_nolock(pfn); 12502 if (mseg == NULL) 12503 return (EFAULT); 12504 12505 pp = &mseg->pages[(pgcnt_t)(pfn - mseg->pages_base)]; 12506 ASSERT((pfn_t)pp->p_pagenum == pfn); 12507 12508 if (!PAGE_LOCKED(pp)) 12509 return (EFAULT); 12510 12511 if (kpm_smallpages == 0) 12512 error = sfmmu_kpm_fault(vaddr, mseg, pp); 12513 else 12514 error = sfmmu_kpm_fault_small(vaddr, mseg, pp); 12515 12516 return (error); 12517 } 12518 12519 extern krwlock_t memsegslock; 12520 12521 /* 12522 * memseg_hash[] was cleared, need to clear memseg_phash[] too. 12523 */ 12524 void 12525 hat_kpm_mseghash_clear(int nentries) 12526 { 12527 pgcnt_t i; 12528 12529 if (kpm_enable == 0) 12530 return; 12531 12532 for (i = 0; i < nentries; i++) 12533 memseg_phash[i] = MSEG_NULLPTR_PA; 12534 } 12535 12536 /* 12537 * Update memseg_phash[inx] when memseg_hash[inx] was changed. 12538 */ 12539 void 12540 hat_kpm_mseghash_update(pgcnt_t inx, struct memseg *msp) 12541 { 12542 if (kpm_enable == 0) 12543 return; 12544 12545 memseg_phash[inx] = (msp) ? va_to_pa(msp) : MSEG_NULLPTR_PA; 12546 } 12547 12548 /* 12549 * Update kpm memseg members from basic memseg info. 
12550 */ 12551 void 12552 hat_kpm_addmem_mseg_update(struct memseg *msp, pgcnt_t nkpmpgs, 12553 offset_t kpm_pages_off) 12554 { 12555 if (kpm_enable == 0) 12556 return; 12557 12558 msp->kpm_pages = (kpm_page_t *)((caddr_t)msp->pages + kpm_pages_off); 12559 msp->kpm_nkpmpgs = nkpmpgs; 12560 msp->kpm_pbase = kpmptop(ptokpmp(msp->pages_base)); 12561 msp->pagespa = va_to_pa(msp->pages); 12562 msp->epagespa = va_to_pa(msp->epages); 12563 msp->kpm_pagespa = va_to_pa(msp->kpm_pages); 12564 } 12565 12566 /* 12567 * Setup nextpa when a memseg is inserted. 12568 * Assumes that the memsegslock is already held. 12569 */ 12570 void 12571 hat_kpm_addmem_mseg_insert(struct memseg *msp) 12572 { 12573 if (kpm_enable == 0) 12574 return; 12575 12576 ASSERT(RW_LOCK_HELD(&memsegslock)); 12577 msp->nextpa = (memsegs) ? va_to_pa(memsegs) : MSEG_NULLPTR_PA; 12578 } 12579 12580 /* 12581 * Setup memsegspa when a memseg is (head) inserted. 12582 * Called before memsegs is updated to complete a 12583 * memseg insert operation. 12584 * Assumes that the memsegslock is already held. 12585 */ 12586 void 12587 hat_kpm_addmem_memsegs_update(struct memseg *msp) 12588 { 12589 if (kpm_enable == 0) 12590 return; 12591 12592 ASSERT(RW_LOCK_HELD(&memsegslock)); 12593 ASSERT(memsegs); 12594 memsegspa = va_to_pa(msp); 12595 } 12596 12597 /* 12598 * Return end of metadata for an already setup memseg. 12599 * 12600 * Note: kpm_pages and kpm_spages are aliases and the underlying 12601 * member of struct memseg is a union, therefore they always have 12602 * the same address within a memseg. They must be differentiated 12603 * when pointer arithmetic is used with them. 12604 */ 12605 caddr_t 12606 hat_kpm_mseg_reuse(struct memseg *msp) 12607 { 12608 caddr_t end; 12609 12610 if (kpm_smallpages == 0) 12611 end = (caddr_t)(msp->kpm_pages + msp->kpm_nkpmpgs); 12612 else 12613 end = (caddr_t)(msp->kpm_spages + msp->kpm_nkpmpgs); 12614 12615 return (end); 12616 } 12617 12618 /* 12619 * Update memsegspa (when first memseg in list 12620 * is deleted) or nextpa when a memseg deleted. 12621 * Assumes that the memsegslock is already held. 12622 */ 12623 void 12624 hat_kpm_delmem_mseg_update(struct memseg *msp, struct memseg **mspp) 12625 { 12626 struct memseg *lmsp; 12627 12628 if (kpm_enable == 0) 12629 return; 12630 12631 ASSERT(RW_LOCK_HELD(&memsegslock)); 12632 12633 if (mspp == &memsegs) { 12634 memsegspa = (msp->next) ? 12635 va_to_pa(msp->next) : MSEG_NULLPTR_PA; 12636 } else { 12637 lmsp = (struct memseg *) 12638 ((uint64_t)mspp - offsetof(struct memseg, next)); 12639 lmsp->nextpa = (msp->next) ? 12640 va_to_pa(msp->next) : MSEG_NULLPTR_PA; 12641 } 12642 } 12643 12644 /* 12645 * Update kpm members for all memseg's involved in a split operation 12646 * and do the atomic update of the physical memseg chain. 12647 * 12648 * Note: kpm_pages and kpm_spages are aliases and the underlying member 12649 * of struct memseg is a union, therefore they always have the same 12650 * address within a memseg. With that the direct assignments and 12651 * va_to_pa conversions below don't have to be distinguished wrt. to 12652 * kpm_smallpages. They must be differentiated when pointer arithmetic 12653 * is used with them. 12654 * 12655 * Assumes that the memsegslock is already held. 
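 * The kpm range arithmetic below rounds every new memseg out to kpm
 * page granularity. Illustrative numbers, assuming 8K base pages and
 * 4M kpm pages (kpmpnpgs == 0x200): for pages_base == 0x10203 and
 * num == 0x3f0, start == kpmptop(ptokpmp(0x10203)) == 0x10200,
 * end == kpmptop(ptokpmp(0x10200 + 0x3f0 - 1)) + 0x200 == 0x10600,
 * giving kpm_nkpmpgs == ptokpmp(0x400) == 2.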
12656 */ 12657 void 12658 hat_kpm_split_mseg_update(struct memseg *msp, struct memseg **mspp, 12659 struct memseg *lo, struct memseg *mid, struct memseg *hi) 12660 { 12661 pgcnt_t start, end, kbase, kstart, num; 12662 struct memseg *lmsp; 12663 12664 if (kpm_enable == 0) 12665 return; 12666 12667 ASSERT(RW_LOCK_HELD(&memsegslock)); 12668 ASSERT(msp && mid && msp->kpm_pages); 12669 12670 kbase = ptokpmp(msp->kpm_pbase); 12671 12672 if (lo) { 12673 num = lo->pages_end - lo->pages_base; 12674 start = kpmptop(ptokpmp(lo->pages_base)); 12675 /* align end to kpm page size granularity */ 12676 end = kpmptop(ptokpmp(start + num - 1)) + kpmpnpgs; 12677 lo->kpm_pbase = start; 12678 lo->kpm_nkpmpgs = ptokpmp(end - start); 12679 lo->kpm_pages = msp->kpm_pages; 12680 lo->kpm_pagespa = va_to_pa(lo->kpm_pages); 12681 lo->pagespa = va_to_pa(lo->pages); 12682 lo->epagespa = va_to_pa(lo->epages); 12683 lo->nextpa = va_to_pa(lo->next); 12684 } 12685 12686 /* mid */ 12687 num = mid->pages_end - mid->pages_base; 12688 kstart = ptokpmp(mid->pages_base); 12689 start = kpmptop(kstart); 12690 /* align end to kpm page size granularity */ 12691 end = kpmptop(ptokpmp(start + num - 1)) + kpmpnpgs; 12692 mid->kpm_pbase = start; 12693 mid->kpm_nkpmpgs = ptokpmp(end - start); 12694 if (kpm_smallpages == 0) { 12695 mid->kpm_pages = msp->kpm_pages + (kstart - kbase); 12696 } else { 12697 mid->kpm_spages = msp->kpm_spages + (kstart - kbase); 12698 } 12699 mid->kpm_pagespa = va_to_pa(mid->kpm_pages); 12700 mid->pagespa = va_to_pa(mid->pages); 12701 mid->epagespa = va_to_pa(mid->epages); 12702 mid->nextpa = (mid->next) ? va_to_pa(mid->next) : MSEG_NULLPTR_PA; 12703 12704 if (hi) { 12705 num = hi->pages_end - hi->pages_base; 12706 kstart = ptokpmp(hi->pages_base); 12707 start = kpmptop(kstart); 12708 /* align end to kpm page size granularity */ 12709 end = kpmptop(ptokpmp(start + num - 1)) + kpmpnpgs; 12710 hi->kpm_pbase = start; 12711 hi->kpm_nkpmpgs = ptokpmp(end - start); 12712 if (kpm_smallpages == 0) { 12713 hi->kpm_pages = msp->kpm_pages + (kstart - kbase); 12714 } else { 12715 hi->kpm_spages = msp->kpm_spages + (kstart - kbase); 12716 } 12717 hi->kpm_pagespa = va_to_pa(hi->kpm_pages); 12718 hi->pagespa = va_to_pa(hi->pages); 12719 hi->epagespa = va_to_pa(hi->epages); 12720 hi->nextpa = (hi->next) ? va_to_pa(hi->next) : MSEG_NULLPTR_PA; 12721 } 12722 12723 /* 12724 * Atomic update of the physical memseg chain 12725 */ 12726 if (mspp == &memsegs) { 12727 memsegspa = (lo) ? va_to_pa(lo) : va_to_pa(mid); 12728 } else { 12729 lmsp = (struct memseg *) 12730 ((uint64_t)mspp - offsetof(struct memseg, next)); 12731 lmsp->nextpa = (lo) ? va_to_pa(lo) : va_to_pa(mid); 12732 } 12733 } 12734 12735 /* 12736 * Walk the memsegs chain, applying func to each memseg span and vcolor. 12737 */ 12738 void 12739 hat_kpm_walk(void (*func)(void *, void *, size_t), void *arg) 12740 { 12741 pfn_t pbase, pend; 12742 int vcolor; 12743 void *base; 12744 size_t size; 12745 struct memseg *msp; 12746 extern uint_t vac_colors; 12747 12748 for (msp = memsegs; msp; msp = msp->next) { 12749 pbase = msp->pages_base; 12750 pend = msp->pages_end; 12751 for (vcolor = 0; vcolor < vac_colors; vcolor++) { 12752 base = ptob(pbase) + kpm_vbase + kpm_size * vcolor; 12753 size = ptob(pend - pbase); 12754 func(arg, base, size); 12755 } 12756 } 12757 } 12758 12759 12760 /* -- sfmmu_kpm internal section -- */ 12761 12762 /* 12763 * Return the page frame number if a valid segkpm mapping exists 12764 * for vaddr, otherwise return PFN_INVALID. No locks are grabbed. 
12765 * Should only be used by other sfmmu routines. 12766 */ 12767 pfn_t 12768 sfmmu_kpm_vatopfn(caddr_t vaddr) 12769 { 12770 uintptr_t paddr; 12771 pfn_t pfn; 12772 page_t *pp; 12773 12774 ASSERT(kpm_enable && IS_KPM_ADDR(vaddr)); 12775 12776 SFMMU_KPM_VTOP(vaddr, paddr); 12777 pfn = (pfn_t)btop(paddr); 12778 pp = page_numtopp_nolock(pfn); 12779 if (pp && pp->p_kpmref) 12780 return (pfn); 12781 else 12782 return ((pfn_t)PFN_INVALID); 12783 } 12784 12785 /* 12786 * Lookup a kpme in the p_kpmelist. 12787 */ 12788 static int 12789 sfmmu_kpme_lookup(struct kpme *kpme, page_t *pp) 12790 { 12791 struct kpme *p; 12792 12793 for (p = pp->p_kpmelist; p; p = p->kpe_next) { 12794 if (p == kpme) 12795 return (1); 12796 } 12797 return (0); 12798 } 12799 12800 /* 12801 * Insert a kpme into the p_kpmelist and increment 12802 * the per page kpm reference count. 12803 */ 12804 static void 12805 sfmmu_kpme_add(struct kpme *kpme, page_t *pp) 12806 { 12807 ASSERT(pp->p_kpmref >= 0); 12808 12809 /* head insert */ 12810 kpme->kpe_prev = NULL; 12811 kpme->kpe_next = pp->p_kpmelist; 12812 12813 if (pp->p_kpmelist) 12814 pp->p_kpmelist->kpe_prev = kpme; 12815 12816 pp->p_kpmelist = kpme; 12817 kpme->kpe_page = pp; 12818 pp->p_kpmref++; 12819 } 12820 12821 /* 12822 * Remove a kpme from the p_kpmelist and decrement 12823 * the per page kpm reference count. 12824 */ 12825 static void 12826 sfmmu_kpme_sub(struct kpme *kpme, page_t *pp) 12827 { 12828 ASSERT(pp->p_kpmref > 0); 12829 12830 if (kpme->kpe_prev) { 12831 ASSERT(pp->p_kpmelist != kpme); 12832 ASSERT(kpme->kpe_prev->kpe_page == pp); 12833 kpme->kpe_prev->kpe_next = kpme->kpe_next; 12834 } else { 12835 ASSERT(pp->p_kpmelist == kpme); 12836 pp->p_kpmelist = kpme->kpe_next; 12837 } 12838 12839 if (kpme->kpe_next) { 12840 ASSERT(kpme->kpe_next->kpe_page == pp); 12841 kpme->kpe_next->kpe_prev = kpme->kpe_prev; 12842 } 12843 12844 kpme->kpe_next = kpme->kpe_prev = NULL; 12845 kpme->kpe_page = NULL; 12846 pp->p_kpmref--; 12847 } 12848 12849 /* 12850 * Mapin a single page, it is called every time a page changes it's state 12851 * from kpm-unmapped to kpm-mapped. It may not be called, when only a new 12852 * kpm instance does a mapin and wants to share the mapping. 12853 * Assumes that the mlist mutex is already grabbed. 12854 */ 12855 static caddr_t 12856 sfmmu_kpm_mapin(page_t *pp) 12857 { 12858 kpm_page_t *kp; 12859 kpm_hlk_t *kpmp; 12860 caddr_t vaddr; 12861 int kpm_vac_range; 12862 pfn_t pfn; 12863 tte_t tte; 12864 kmutex_t *pmtx; 12865 int uncached; 12866 kpm_spage_t *ksp; 12867 kpm_shlk_t *kpmsp; 12868 int oldval; 12869 12870 ASSERT(sfmmu_mlist_held(pp)); 12871 ASSERT(pp->p_kpmref == 0); 12872 12873 vaddr = sfmmu_kpm_getvaddr(pp, &kpm_vac_range); 12874 12875 ASSERT(IS_KPM_ADDR(vaddr)); 12876 uncached = PP_ISNC(pp); 12877 pfn = pp->p_pagenum; 12878 12879 if (kpm_smallpages) 12880 goto smallpages_mapin; 12881 12882 PP2KPMPG(pp, kp); 12883 12884 kpmp = KPMP_HASH(kp); 12885 mutex_enter(&kpmp->khl_mutex); 12886 12887 ASSERT(PP_ISKPMC(pp) == 0); 12888 ASSERT(PP_ISKPMS(pp) == 0); 12889 12890 if (uncached) { 12891 /* ASSERT(pp->p_share); XXX use hat_page_getshare */ 12892 if (kpm_vac_range == 0) { 12893 if (kp->kp_refcnts == 0) { 12894 /* 12895 * Must remove large page mapping if it exists. 12896 * Pages in uncached state can only be mapped 12897 * small (PAGESIZE) within the regular kpm 12898 * range. 
12899 */ 12900 if (kp->kp_refcntc == -1) { 12901 /* remove go indication */ 12902 sfmmu_kpm_tsbmtl(&kp->kp_refcntc, 12903 &kpmp->khl_lock, KPMTSBM_STOP); 12904 } 12905 if (kp->kp_refcnt > 0 && kp->kp_refcntc == 0) 12906 sfmmu_kpm_demap_large(vaddr); 12907 } 12908 ASSERT(kp->kp_refcntc >= 0); 12909 kp->kp_refcntc++; 12910 } 12911 pmtx = sfmmu_page_enter(pp); 12912 PP_SETKPMC(pp); 12913 sfmmu_page_exit(pmtx); 12914 } 12915 12916 if ((kp->kp_refcntc > 0 || kp->kp_refcnts > 0) && kpm_vac_range == 0) { 12917 /* 12918 * Have to do a small (PAGESIZE) mapin within this kpm_page 12919 * range since it is marked to be in VAC conflict mode or 12920 * when there are still other small mappings around. 12921 */ 12922 12923 /* tte assembly */ 12924 if (uncached == 0) 12925 KPM_TTE_VCACHED(tte.ll, pfn, TTE8K); 12926 else 12927 KPM_TTE_VUNCACHED(tte.ll, pfn, TTE8K); 12928 12929 /* tsb dropin */ 12930 sfmmu_kpm_load_tsb(vaddr, &tte, MMU_PAGESHIFT); 12931 12932 pmtx = sfmmu_page_enter(pp); 12933 PP_SETKPMS(pp); 12934 sfmmu_page_exit(pmtx); 12935 12936 kp->kp_refcnts++; 12937 ASSERT(kp->kp_refcnts > 0); 12938 goto exit; 12939 } 12940 12941 if (kpm_vac_range == 0) { 12942 /* 12943 * Fast path / regular case, no VAC conflict handling 12944 * in progress within this kpm_page range. 12945 */ 12946 if (kp->kp_refcnt == 0) { 12947 12948 /* tte assembly */ 12949 KPM_TTE_VCACHED(tte.ll, pfn, TTE4M); 12950 12951 /* tsb dropin */ 12952 sfmmu_kpm_load_tsb(vaddr, &tte, MMU_PAGESHIFT4M); 12953 12954 /* Set go flag for TL tsbmiss handler */ 12955 if (kp->kp_refcntc == 0) 12956 sfmmu_kpm_tsbmtl(&kp->kp_refcntc, 12957 &kpmp->khl_lock, KPMTSBM_START); 12958 12959 ASSERT(kp->kp_refcntc == -1); 12960 } 12961 kp->kp_refcnt++; 12962 ASSERT(kp->kp_refcnt); 12963 12964 } else { 12965 /* 12966 * The page is not setup according to the common VAC 12967 * prevention rules for the regular and kpm mapping layer 12968 * E.g. the page layer was not able to deliver a right 12969 * vcolor'ed page for a given vaddr corresponding to 12970 * the wanted p_offset. It has to be mapped in small in 12971 * within the corresponding kpm vac range in order to 12972 * prevent VAC alias conflicts. 12973 */ 12974 12975 /* tte assembly */ 12976 if (uncached == 0) { 12977 KPM_TTE_VCACHED(tte.ll, pfn, TTE8K); 12978 } else { 12979 KPM_TTE_VUNCACHED(tte.ll, pfn, TTE8K); 12980 } 12981 12982 /* tsb dropin */ 12983 sfmmu_kpm_load_tsb(vaddr, &tte, MMU_PAGESHIFT); 12984 12985 kp->kp_refcnta++; 12986 if (kp->kp_refcntc == -1) { 12987 ASSERT(kp->kp_refcnt > 0); 12988 12989 /* remove go indication */ 12990 sfmmu_kpm_tsbmtl(&kp->kp_refcntc, &kpmp->khl_lock, 12991 KPMTSBM_STOP); 12992 } 12993 ASSERT(kp->kp_refcntc >= 0); 12994 } 12995 exit: 12996 mutex_exit(&kpmp->khl_mutex); 12997 return (vaddr); 12998 12999 smallpages_mapin: 13000 if (uncached == 0) { 13001 /* tte assembly */ 13002 KPM_TTE_VCACHED(tte.ll, pfn, TTE8K); 13003 } else { 13004 /* ASSERT(pp->p_share); XXX use hat_page_getshare */ 13005 pmtx = sfmmu_page_enter(pp); 13006 PP_SETKPMC(pp); 13007 sfmmu_page_exit(pmtx); 13008 /* tte assembly */ 13009 KPM_TTE_VUNCACHED(tte.ll, pfn, TTE8K); 13010 } 13011 13012 /* tsb dropin */ 13013 sfmmu_kpm_load_tsb(vaddr, &tte, MMU_PAGESHIFT); 13014 13015 PP2KPMSPG(pp, ksp); 13016 kpmsp = KPMP_SHASH(ksp); 13017 13018 oldval = sfmmu_kpm_stsbmtl(&ksp->kp_mapped, &kpmsp->kshl_lock, 13019 (uncached) ? 
KPM_MAPPEDSC : KPM_MAPPEDS); 13020 13021 if (oldval != 0) 13022 panic("sfmmu_kpm_mapin: stale smallpages mapping"); 13023 13024 return (vaddr); 13025 } 13026 13027 /* 13028 * Mapout a single page, it is called every time a page changes it's state 13029 * from kpm-mapped to kpm-unmapped. It may not be called, when only a kpm 13030 * instance calls mapout and there are still other instances mapping the 13031 * page. Assumes that the mlist mutex is already grabbed. 13032 * 13033 * Note: In normal mode (no VAC conflict prevention pending) TLB's are 13034 * not flushed. This is the core segkpm behavior to avoid xcalls. It is 13035 * no problem because a translation from a segkpm virtual address to a 13036 * physical address is always the same. The only downside is a slighty 13037 * increased window of vulnerability for misbehaving _kernel_ modules. 13038 */ 13039 static void 13040 sfmmu_kpm_mapout(page_t *pp, caddr_t vaddr) 13041 { 13042 kpm_page_t *kp; 13043 kpm_hlk_t *kpmp; 13044 int alias_range; 13045 kmutex_t *pmtx; 13046 kpm_spage_t *ksp; 13047 kpm_shlk_t *kpmsp; 13048 int oldval; 13049 13050 ASSERT(sfmmu_mlist_held(pp)); 13051 ASSERT(pp->p_kpmref == 0); 13052 13053 alias_range = IS_KPM_ALIAS_RANGE(vaddr); 13054 13055 if (kpm_smallpages) 13056 goto smallpages_mapout; 13057 13058 PP2KPMPG(pp, kp); 13059 kpmp = KPMP_HASH(kp); 13060 mutex_enter(&kpmp->khl_mutex); 13061 13062 if (alias_range) { 13063 ASSERT(PP_ISKPMS(pp) == 0); 13064 if (kp->kp_refcnta <= 0) { 13065 panic("sfmmu_kpm_mapout: bad refcnta kp=%p", 13066 (void *)kp); 13067 } 13068 13069 if (PP_ISTNC(pp)) { 13070 if (PP_ISKPMC(pp) == 0) { 13071 /* 13072 * Uncached kpm mappings must always have 13073 * forced "small page" mode. 13074 */ 13075 panic("sfmmu_kpm_mapout: uncached page not " 13076 "kpm marked"); 13077 } 13078 sfmmu_kpm_demap_small(vaddr); 13079 13080 pmtx = sfmmu_page_enter(pp); 13081 PP_CLRKPMC(pp); 13082 sfmmu_page_exit(pmtx); 13083 13084 /* 13085 * Check if we can resume cached mode. This might 13086 * be the case if the kpm mapping was the only 13087 * mapping in conflict with other non rule 13088 * compliant mappings. The page is no more marked 13089 * as kpm mapped, so the conv_tnc path will not 13090 * change kpm state. 13091 */ 13092 conv_tnc(pp, TTE8K); 13093 13094 } else if (PP_ISKPMC(pp) == 0) { 13095 /* remove TSB entry only */ 13096 sfmmu_kpm_unload_tsb(vaddr, MMU_PAGESHIFT); 13097 13098 } else { 13099 /* already demapped */ 13100 pmtx = sfmmu_page_enter(pp); 13101 PP_CLRKPMC(pp); 13102 sfmmu_page_exit(pmtx); 13103 } 13104 kp->kp_refcnta--; 13105 goto exit; 13106 } 13107 13108 if (kp->kp_refcntc <= 0 && kp->kp_refcnts == 0) { 13109 /* 13110 * Fast path / regular case. 13111 */ 13112 ASSERT(kp->kp_refcntc >= -1); 13113 ASSERT(!(pp->p_nrm & (P_KPMC | P_KPMS | P_TNC | P_PNC))); 13114 13115 if (kp->kp_refcnt <= 0) 13116 panic("sfmmu_kpm_mapout: bad refcnt kp=%p", (void *)kp); 13117 13118 if (--kp->kp_refcnt == 0) { 13119 /* remove go indication */ 13120 if (kp->kp_refcntc == -1) { 13121 sfmmu_kpm_tsbmtl(&kp->kp_refcntc, 13122 &kpmp->khl_lock, KPMTSBM_STOP); 13123 } 13124 ASSERT(kp->kp_refcntc == 0); 13125 13126 /* remove TSB entry */ 13127 sfmmu_kpm_unload_tsb(vaddr, MMU_PAGESHIFT4M); 13128 #ifdef DEBUG 13129 if (kpm_tlb_flush) 13130 sfmmu_kpm_demap_tlbs(vaddr, KCONTEXT); 13131 #endif 13132 } 13133 13134 } else { 13135 /* 13136 * The VAC alias path. 13137 * We come here if the kpm vaddr is not in any alias_range 13138 * and we are unmapping a page within the regular kpm_page 13139 * range. 
The kpm_page either holds conflict pages and/or 13140 * is in "small page" mode. If the page is not marked 13141 * P_KPMS it couldn't have a valid PAGESIZE sized TSB 13142 * entry. Dcache flushing is done lazy and follows the 13143 * rules of the regular virtual page coloring scheme. 13144 * 13145 * Per page states and required actions: 13146 * P_KPMC: remove a kpm mapping that is conflicting. 13147 * P_KPMS: remove a small kpm mapping within a kpm_page. 13148 * P_TNC: check if we can re-cache the page. 13149 * P_PNC: we cannot re-cache, sorry. 13150 * Per kpm_page: 13151 * kp_refcntc > 0: page is part of a kpm_page with conflicts. 13152 * kp_refcnts > 0: rm a small mapped page within a kpm_page. 13153 */ 13154 13155 if (PP_ISKPMS(pp)) { 13156 if (kp->kp_refcnts < 1) { 13157 panic("sfmmu_kpm_mapout: bad refcnts kp=%p", 13158 (void *)kp); 13159 } 13160 sfmmu_kpm_demap_small(vaddr); 13161 13162 /* 13163 * Check if we can resume cached mode. This might 13164 * be the case if the kpm mapping was the only 13165 * mapping in conflict with other non rule 13166 * compliant mappings. The page is no more marked 13167 * as kpm mapped, so the conv_tnc path will not 13168 * change kpm state. 13169 */ 13170 if (PP_ISTNC(pp)) { 13171 if (!PP_ISKPMC(pp)) { 13172 /* 13173 * Uncached kpm mappings must always 13174 * have forced "small page" mode. 13175 */ 13176 panic("sfmmu_kpm_mapout: uncached " 13177 "page not kpm marked"); 13178 } 13179 conv_tnc(pp, TTE8K); 13180 } 13181 kp->kp_refcnts--; 13182 kp->kp_refcnt++; 13183 pmtx = sfmmu_page_enter(pp); 13184 PP_CLRKPMS(pp); 13185 sfmmu_page_exit(pmtx); 13186 } 13187 13188 if (PP_ISKPMC(pp)) { 13189 if (kp->kp_refcntc < 1) { 13190 panic("sfmmu_kpm_mapout: bad refcntc kp=%p", 13191 (void *)kp); 13192 } 13193 pmtx = sfmmu_page_enter(pp); 13194 PP_CLRKPMC(pp); 13195 sfmmu_page_exit(pmtx); 13196 kp->kp_refcntc--; 13197 } 13198 13199 if (kp->kp_refcnt-- < 1) 13200 panic("sfmmu_kpm_mapout: bad refcnt kp=%p", (void *)kp); 13201 } 13202 exit: 13203 mutex_exit(&kpmp->khl_mutex); 13204 return; 13205 13206 smallpages_mapout: 13207 PP2KPMSPG(pp, ksp); 13208 kpmsp = KPMP_SHASH(ksp); 13209 13210 if (PP_ISKPMC(pp) == 0) { 13211 oldval = sfmmu_kpm_stsbmtl(&ksp->kp_mapped, 13212 &kpmsp->kshl_lock, 0); 13213 13214 if (oldval != KPM_MAPPEDS) { 13215 /* 13216 * When we're called after sfmmu_kpm_hme_unload, 13217 * KPM_MAPPEDSC is valid too. 13218 */ 13219 if (oldval != KPM_MAPPEDSC) 13220 panic("sfmmu_kpm_mapout: incorrect mapping"); 13221 } 13222 13223 /* remove TSB entry */ 13224 sfmmu_kpm_unload_tsb(vaddr, MMU_PAGESHIFT); 13225 #ifdef DEBUG 13226 if (kpm_tlb_flush) 13227 sfmmu_kpm_demap_tlbs(vaddr, KCONTEXT); 13228 #endif 13229 13230 } else if (PP_ISTNC(pp)) { 13231 oldval = sfmmu_kpm_stsbmtl(&ksp->kp_mapped, 13232 &kpmsp->kshl_lock, 0); 13233 13234 if (oldval != KPM_MAPPEDSC || PP_ISKPMC(pp) == 0) 13235 panic("sfmmu_kpm_mapout: inconsistent TNC mapping"); 13236 13237 sfmmu_kpm_demap_small(vaddr); 13238 13239 pmtx = sfmmu_page_enter(pp); 13240 PP_CLRKPMC(pp); 13241 sfmmu_page_exit(pmtx); 13242 13243 /* 13244 * Check if we can resume cached mode. This might be 13245 * the case if the kpm mapping was the only mapping 13246 * in conflict with other non rule compliant mappings. 13247 * The page is no more marked as kpm mapped, so the 13248 * conv_tnc path will not change the kpm state. 
13249 */ 13250 conv_tnc(pp, TTE8K); 13251 13252 } else { 13253 oldval = sfmmu_kpm_stsbmtl(&ksp->kp_mapped, 13254 &kpmsp->kshl_lock, 0); 13255 13256 if (oldval != KPM_MAPPEDSC) 13257 panic("sfmmu_kpm_mapout: inconsistent mapping"); 13258 13259 pmtx = sfmmu_page_enter(pp); 13260 PP_CLRKPMC(pp); 13261 sfmmu_page_exit(pmtx); 13262 } 13263 } 13264 13265 #define abs(x) ((x) < 0 ? -(x) : (x)) 13266 13267 /* 13268 * Determine appropriate kpm mapping address and handle any kpm/hme 13269 * conflicts. Page mapping list and its vcolor parts must be protected. 13270 */ 13271 static caddr_t 13272 sfmmu_kpm_getvaddr(page_t *pp, int *kpm_vac_rangep) 13273 { 13274 int vcolor, vcolor_pa; 13275 caddr_t vaddr; 13276 uintptr_t paddr; 13277 13278 13279 ASSERT(sfmmu_mlist_held(pp)); 13280 13281 paddr = ptob(pp->p_pagenum); 13282 vcolor_pa = addr_to_vcolor(paddr); 13283 13284 if (IS_SWAPFSVP(pp->p_vnode)) { 13285 vcolor = (PP_NEWPAGE(pp) || PP_ISNC(pp)) ? 13286 vcolor_pa : PP_GET_VCOLOR(pp); 13287 } else { 13288 vcolor = addr_to_vcolor(pp->p_offset); 13289 } 13290 13291 vaddr = kpm_vbase + paddr; 13292 *kpm_vac_rangep = 0; 13293 13294 if (vcolor_pa != vcolor) { 13295 *kpm_vac_rangep = abs(vcolor - vcolor_pa); 13296 vaddr += ((uintptr_t)(vcolor - vcolor_pa) << MMU_PAGESHIFT); 13297 vaddr += (vcolor_pa > vcolor) ? 13298 ((uintptr_t)vcolor_pa << kpm_size_shift) : 13299 ((uintptr_t)(vcolor - vcolor_pa) << kpm_size_shift); 13300 13301 ASSERT(!PP_ISMAPPED_LARGE(pp)); 13302 } 13303 13304 if (PP_ISNC(pp)) 13305 return (vaddr); 13306 13307 if (PP_NEWPAGE(pp)) { 13308 PP_SET_VCOLOR(pp, vcolor); 13309 return (vaddr); 13310 } 13311 13312 if (PP_GET_VCOLOR(pp) == vcolor) 13313 return (vaddr); 13314 13315 ASSERT(!PP_ISMAPPED_KPM(pp)); 13316 sfmmu_kpm_vac_conflict(pp, vaddr); 13317 13318 return (vaddr); 13319 } 13320 13321 /* 13322 * VAC conflict state bit values. 13323 * The following defines are used to make the handling of the 13324 * various input states more concise. For that the kpm states 13325 * per kpm_page and per page are combined in a summary state. 13326 * Each single state has a corresponding bit value in the 13327 * summary state. These defines only apply for kpm large page 13328 * mappings. Within comments the abbreviations "kc, c, ks, s" 13329 * are used as short form of the actual state, e.g. "kc" for 13330 * "kp_refcntc > 0", etc. 13331 */ 13332 #define KPM_KC 0x00000008 /* kpm_page: kp_refcntc > 0 */ 13333 #define KPM_C 0x00000004 /* page: P_KPMC set */ 13334 #define KPM_KS 0x00000002 /* kpm_page: kp_refcnts > 0 */ 13335 #define KPM_S 0x00000001 /* page: P_KPMS set */ 13336 13337 /* 13338 * Summary states used in sfmmu_kpm_fault (KPM_TSBM_*). 13339 * See also more detailed comments within the sfmmu_kpm_fault switch. 13340 * Abbreviations used: 13341 * CONFL: VAC conflict(s) within a kpm_page. 13342 * MAPS: Mapped small: Page mapped in using a regular page size kpm mapping. 13343 * RASM: Re-assembling of a large page mapping possible. 13344 * RPLS: Replace: TSB miss due to TSB replacement only. 13345 * BRKO: Breakup Other: A large kpm mapping has to be broken because another 13346 * page within the kpm_page is already involved in a VAC conflict. 13347 * BRKT: Breakup This: A large kpm mapping has to be broken, this page 13348 * is involved in a VAC conflict.
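 *
 * For illustration: KPM_TSBM_MAPS_CONFL below is
 * (KPM_KC | KPM_C | KPM_KS) == 0xe, i.e. the kpm_page holds conflict
 * pages (kc), this page is marked conflicting (c) and small mappings
 * exist within the kpm_page (ks), but this page itself is not mapped
 * small (s clear).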
13349 */ 13350 #define KPM_TSBM_CONFL_GONE (0) 13351 #define KPM_TSBM_MAPS_RASM (KPM_KS) 13352 #define KPM_TSBM_RPLS_RASM (KPM_KS | KPM_S) 13353 #define KPM_TSBM_MAPS_BRKO (KPM_KC) 13354 #define KPM_TSBM_MAPS (KPM_KC | KPM_KS) 13355 #define KPM_TSBM_RPLS (KPM_KC | KPM_KS | KPM_S) 13356 #define KPM_TSBM_MAPS_BRKT (KPM_KC | KPM_C) 13357 #define KPM_TSBM_MAPS_CONFL (KPM_KC | KPM_C | KPM_KS) 13358 #define KPM_TSBM_RPLS_CONFL (KPM_KC | KPM_C | KPM_KS | KPM_S) 13359 13360 /* 13361 * kpm fault handler for mappings with large page size. 13362 */ 13363 int 13364 sfmmu_kpm_fault(caddr_t vaddr, struct memseg *mseg, page_t *pp) 13365 { 13366 int error; 13367 pgcnt_t inx; 13368 kpm_page_t *kp; 13369 tte_t tte; 13370 pfn_t pfn = pp->p_pagenum; 13371 kpm_hlk_t *kpmp; 13372 kmutex_t *pml; 13373 int alias_range; 13374 int uncached = 0; 13375 kmutex_t *pmtx; 13376 int badstate; 13377 uint_t tsbmcase; 13378 13379 alias_range = IS_KPM_ALIAS_RANGE(vaddr); 13380 13381 inx = ptokpmp(kpmptop(ptokpmp(pfn)) - mseg->kpm_pbase); 13382 if (inx >= mseg->kpm_nkpmpgs) { 13383 cmn_err(CE_PANIC, "sfmmu_kpm_fault: kpm overflow in memseg " 13384 "0x%p pp 0x%p", (void *)mseg, (void *)pp); 13385 } 13386 13387 kp = &mseg->kpm_pages[inx]; 13388 kpmp = KPMP_HASH(kp); 13389 13390 pml = sfmmu_mlist_enter(pp); 13391 13392 if (!PP_ISMAPPED_KPM(pp)) { 13393 sfmmu_mlist_exit(pml); 13394 return (EFAULT); 13395 } 13396 13397 mutex_enter(&kpmp->khl_mutex); 13398 13399 if (alias_range) { 13400 ASSERT(!PP_ISMAPPED_LARGE(pp)); 13401 if (kp->kp_refcnta > 0) { 13402 if (PP_ISKPMC(pp)) { 13403 pmtx = sfmmu_page_enter(pp); 13404 PP_CLRKPMC(pp); 13405 sfmmu_page_exit(pmtx); 13406 } 13407 /* 13408 * Check for vcolor conflicts. Return here 13409 * w/ either no conflict (fast path), removed hme 13410 * mapping chains (unload conflict) or uncached 13411 * (uncache conflict). VACaches are cleaned and 13412 * p_vcolor and PP_TNC are set accordingly for the 13413 * conflict cases. Drop kpmp for uncache conflict 13414 * cases since it will be grabbed within 13415 * sfmmu_kpm_page_cache in case of an uncache 13416 * conflict. 13417 */ 13418 mutex_exit(&kpmp->khl_mutex); 13419 sfmmu_kpm_vac_conflict(pp, vaddr); 13420 mutex_enter(&kpmp->khl_mutex); 13421 13422 if (PP_ISNC(pp)) { 13423 uncached = 1; 13424 pmtx = sfmmu_page_enter(pp); 13425 PP_SETKPMC(pp); 13426 sfmmu_page_exit(pmtx); 13427 } 13428 goto smallexit; 13429 13430 } else { 13431 /* 13432 * We got a tsbmiss on a not active kpm_page range. 13433 * Let segkpm_fault decide how to panic. 13434 */ 13435 error = EFAULT; 13436 } 13437 goto exit; 13438 } 13439 13440 badstate = (kp->kp_refcnt < 0 || kp->kp_refcnts < 0); 13441 if (kp->kp_refcntc == -1) { 13442 /* 13443 * We should come here only if trap level tsb miss 13444 * handler is disabled. 13445 */ 13446 badstate |= (kp->kp_refcnt == 0 || kp->kp_refcnts > 0 || 13447 PP_ISKPMC(pp) || PP_ISKPMS(pp) || PP_ISNC(pp)); 13448 13449 if (badstate == 0) 13450 goto largeexit; 13451 } 13452 13453 if (badstate || kp->kp_refcntc < 0) 13454 goto badstate_exit; 13455 13456 /* 13457 * Combine the per kpm_page and per page kpm VAC states to 13458 * a summary state in order to make the kpm fault handling 13459 * more concise. 13460 */ 13461 tsbmcase = (((kp->kp_refcntc > 0) ? KPM_KC : 0) | 13462 ((kp->kp_refcnts > 0) ? KPM_KS : 0) | 13463 (PP_ISKPMC(pp) ? KPM_C : 0) | 13464 (PP_ISKPMS(pp) ? 
KPM_S : 0)); 13465 13466 switch (tsbmcase) { 13467 case KPM_TSBM_CONFL_GONE: /* - - - - */ 13468 /* 13469 * That's fine, we either have no more vac conflict in 13470 * this kpm page or someone raced in and has solved the 13471 * vac conflict for us -- call sfmmu_kpm_vac_conflict 13472 * to take care for correcting the vcolor and flushing 13473 * the dcache if required. 13474 */ 13475 mutex_exit(&kpmp->khl_mutex); 13476 sfmmu_kpm_vac_conflict(pp, vaddr); 13477 mutex_enter(&kpmp->khl_mutex); 13478 13479 if (PP_ISNC(pp) || kp->kp_refcnt <= 0 || 13480 addr_to_vcolor(vaddr) != PP_GET_VCOLOR(pp)) { 13481 panic("sfmmu_kpm_fault: inconsistent CONFL_GONE " 13482 "state, pp=%p", (void *)pp); 13483 } 13484 goto largeexit; 13485 13486 case KPM_TSBM_MAPS_RASM: /* - - ks - */ 13487 /* 13488 * All conflicts in this kpm page are gone but there are 13489 * already small mappings around, so we also map this 13490 * page small. This could be the trigger case for a 13491 * small mapping reaper, if this is really needed. 13492 * For now fall thru to the KPM_TSBM_MAPS handling. 13493 */ 13494 13495 case KPM_TSBM_MAPS: /* kc - ks - */ 13496 /* 13497 * Large page mapping is already broken, this page is not 13498 * conflicting, so map it small. Call sfmmu_kpm_vac_conflict 13499 * to take care for correcting the vcolor and flushing 13500 * the dcache if required. 13501 */ 13502 mutex_exit(&kpmp->khl_mutex); 13503 sfmmu_kpm_vac_conflict(pp, vaddr); 13504 mutex_enter(&kpmp->khl_mutex); 13505 13506 if (PP_ISNC(pp) || kp->kp_refcnt <= 0 || 13507 addr_to_vcolor(vaddr) != PP_GET_VCOLOR(pp)) { 13508 panic("sfmmu_kpm_fault: inconsistent MAPS state, " 13509 "pp=%p", (void *)pp); 13510 } 13511 kp->kp_refcnt--; 13512 kp->kp_refcnts++; 13513 pmtx = sfmmu_page_enter(pp); 13514 PP_SETKPMS(pp); 13515 sfmmu_page_exit(pmtx); 13516 goto smallexit; 13517 13518 case KPM_TSBM_RPLS_RASM: /* - - ks s */ 13519 /* 13520 * All conflicts in this kpm page are gone but this page 13521 * is mapped small. This could be the trigger case for a 13522 * small mapping reaper, if this is really needed. 13523 * For now we drop it in small again. Fall thru to the 13524 * KPM_TSBM_RPLS handling. 13525 */ 13526 13527 case KPM_TSBM_RPLS: /* kc - ks s */ 13528 /* 13529 * Large page mapping is already broken, this page is not 13530 * conflicting but already mapped small, so drop it in 13531 * small again. 13532 */ 13533 if (PP_ISNC(pp) || 13534 addr_to_vcolor(vaddr) != PP_GET_VCOLOR(pp)) { 13535 panic("sfmmu_kpm_fault: inconsistent RPLS state, " 13536 "pp=%p", (void *)pp); 13537 } 13538 goto smallexit; 13539 13540 case KPM_TSBM_MAPS_BRKO: /* kc - - - */ 13541 /* 13542 * The kpm page where we live in is marked conflicting 13543 * but this page is not conflicting. So we have to map it 13544 * in small. Call sfmmu_kpm_vac_conflict to take care for 13545 * correcting the vcolor and flushing the dcache if required. 
13546 */ 13547 mutex_exit(&kpmp->khl_mutex); 13548 sfmmu_kpm_vac_conflict(pp, vaddr); 13549 mutex_enter(&kpmp->khl_mutex); 13550 13551 if (PP_ISNC(pp) || kp->kp_refcnt <= 0 || 13552 addr_to_vcolor(vaddr) != PP_GET_VCOLOR(pp)) { 13553 panic("sfmmu_kpm_fault: inconsistent MAPS_BRKO state, " 13554 "pp=%p", (void *)pp); 13555 } 13556 kp->kp_refcnt--; 13557 kp->kp_refcnts++; 13558 pmtx = sfmmu_page_enter(pp); 13559 PP_SETKPMS(pp); 13560 sfmmu_page_exit(pmtx); 13561 goto smallexit; 13562 13563 case KPM_TSBM_MAPS_BRKT: /* kc c - - */ 13564 case KPM_TSBM_MAPS_CONFL: /* kc c ks - */ 13565 if (!PP_ISMAPPED(pp)) { 13566 /* 13567 * We got a tsbmiss on kpm large page range that is 13568 * marked to contain vac conflicting pages introduced 13569 * by hme mappings. The hme mappings are all gone and 13570 * must have bypassed the kpm alias prevention logic. 13571 */ 13572 panic("sfmmu_kpm_fault: stale VAC conflict, pp=%p", 13573 (void *)pp); 13574 } 13575 13576 /* 13577 * Check for vcolor conflicts. Return here w/ either no 13578 * conflict (fast path), removed hme mapping chains 13579 * (unload conflict) or uncached (uncache conflict). 13580 * Dcache is cleaned and p_vcolor and P_TNC are set 13581 * accordingly. Drop kpmp for uncache conflict cases 13582 * since it will be grabbed within sfmmu_kpm_page_cache 13583 * in case of an uncache conflict. 13584 */ 13585 mutex_exit(&kpmp->khl_mutex); 13586 sfmmu_kpm_vac_conflict(pp, vaddr); 13587 mutex_enter(&kpmp->khl_mutex); 13588 13589 if (kp->kp_refcnt <= 0) 13590 panic("sfmmu_kpm_fault: bad refcnt kp=%p", (void *)kp); 13591 13592 if (PP_ISNC(pp)) { 13593 uncached = 1; 13594 } else { 13595 /* 13596 * When an unload conflict is solved and there are 13597 * no other small mappings around, we can resume 13598 * largepage mode. Otherwise we have to map or drop 13599 * in small. This could be a trigger for a small 13600 * mapping reaper when this was the last conflict 13601 * within the kpm page and when there are only 13602 * other small mappings around. 13603 */ 13604 ASSERT(addr_to_vcolor(vaddr) == PP_GET_VCOLOR(pp)); 13605 ASSERT(kp->kp_refcntc > 0); 13606 kp->kp_refcntc--; 13607 pmtx = sfmmu_page_enter(pp); 13608 PP_CLRKPMC(pp); 13609 sfmmu_page_exit(pmtx); 13610 ASSERT(PP_ISKPMS(pp) == 0); 13611 if (kp->kp_refcntc == 0 && kp->kp_refcnts == 0) 13612 goto largeexit; 13613 } 13614 13615 kp->kp_refcnt--; 13616 kp->kp_refcnts++; 13617 pmtx = sfmmu_page_enter(pp); 13618 PP_SETKPMS(pp); 13619 sfmmu_page_exit(pmtx); 13620 goto smallexit; 13621 13622 case KPM_TSBM_RPLS_CONFL: /* kc c ks s */ 13623 if (!PP_ISMAPPED(pp)) { 13624 /* 13625 * We got a tsbmiss on kpm large page range that is 13626 * marked to contain vac conflicting pages introduced 13627 * by hme mappings. They are all gone and must have 13628 * somehow bypassed the kpm alias prevention logic. 13629 */ 13630 panic("sfmmu_kpm_fault: stale VAC conflict, pp=%p", 13631 (void *)pp); 13632 } 13633 13634 /* 13635 * This state is only possible for an uncached mapping. 
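 * (A page with both P_KPMC and P_KPMS still set can only have been
 * forced into small mapping mode because it is uncached; see the
 * uncached handling in sfmmu_kpm_mapin().)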
13636 */ 13637 if (!PP_ISNC(pp)) { 13638 panic("sfmmu_kpm_fault: page not uncached, pp=%p", 13639 (void *)pp); 13640 } 13641 uncached = 1; 13642 goto smallexit; 13643 13644 default: 13645 badstate_exit: 13646 panic("sfmmu_kpm_fault: inconsistent VAC state, vaddr=%p kp=%p " 13647 "pp=%p", (void *)vaddr, (void *)kp, (void *)pp); 13648 } 13649 13650 smallexit: 13651 /* tte assembly */ 13652 if (uncached == 0) 13653 KPM_TTE_VCACHED(tte.ll, pfn, TTE8K); 13654 else 13655 KPM_TTE_VUNCACHED(tte.ll, pfn, TTE8K); 13656 13657 /* tsb dropin */ 13658 sfmmu_kpm_load_tsb(vaddr, &tte, MMU_PAGESHIFT); 13659 13660 error = 0; 13661 goto exit; 13662 13663 largeexit: 13664 if (kp->kp_refcnt > 0) { 13665 13666 /* tte assembly */ 13667 KPM_TTE_VCACHED(tte.ll, pfn, TTE4M); 13668 13669 /* tsb dropin */ 13670 sfmmu_kpm_load_tsb(vaddr, &tte, MMU_PAGESHIFT4M); 13671 13672 if (kp->kp_refcntc == 0) { 13673 /* Set "go" flag for TL tsbmiss handler */ 13674 sfmmu_kpm_tsbmtl(&kp->kp_refcntc, &kpmp->khl_lock, 13675 KPMTSBM_START); 13676 } 13677 ASSERT(kp->kp_refcntc == -1); 13678 error = 0; 13679 13680 } else 13681 error = EFAULT; 13682 exit: 13683 mutex_exit(&kpmp->khl_mutex); 13684 sfmmu_mlist_exit(pml); 13685 return (error); 13686 } 13687 13688 /* 13689 * kpm fault handler for mappings with small page size. 13690 */ 13691 int 13692 sfmmu_kpm_fault_small(caddr_t vaddr, struct memseg *mseg, page_t *pp) 13693 { 13694 int error = 0; 13695 pgcnt_t inx; 13696 kpm_spage_t *ksp; 13697 kpm_shlk_t *kpmsp; 13698 kmutex_t *pml; 13699 pfn_t pfn = pp->p_pagenum; 13700 tte_t tte; 13701 kmutex_t *pmtx; 13702 int oldval; 13703 13704 inx = pfn - mseg->kpm_pbase; 13705 ksp = &mseg->kpm_spages[inx]; 13706 kpmsp = KPMP_SHASH(ksp); 13707 13708 pml = sfmmu_mlist_enter(pp); 13709 13710 if (!PP_ISMAPPED_KPM(pp)) { 13711 sfmmu_mlist_exit(pml); 13712 return (EFAULT); 13713 } 13714 13715 /* 13716 * kp_mapped lookup protected by mlist mutex 13717 */ 13718 if (ksp->kp_mapped == KPM_MAPPEDS) { 13719 /* 13720 * Fast path tsbmiss 13721 */ 13722 ASSERT(!PP_ISKPMC(pp)); 13723 ASSERT(!PP_ISNC(pp)); 13724 13725 /* tte assembly */ 13726 KPM_TTE_VCACHED(tte.ll, pfn, TTE8K); 13727 13728 /* tsb dropin */ 13729 sfmmu_kpm_load_tsb(vaddr, &tte, MMU_PAGESHIFT); 13730 13731 } else if (ksp->kp_mapped == KPM_MAPPEDSC) { 13732 /* 13733 * Got here due to existing or gone kpm/hme VAC conflict. 13734 * Recheck for vcolor conflicts. Return here w/ either 13735 * no conflict, removed hme mapping chain (unload 13736 * conflict) or uncached (uncache conflict). VACaches 13737 * are cleaned and p_vcolor and PP_TNC are set accordingly 13738 * for the conflict cases. 13739 */ 13740 sfmmu_kpm_vac_conflict(pp, vaddr); 13741 13742 if (PP_ISNC(pp)) { 13743 /* ASSERT(pp->p_share); XXX use hat_page_getshare */ 13744 13745 /* tte assembly */ 13746 KPM_TTE_VUNCACHED(tte.ll, pfn, TTE8K); 13747 13748 /* tsb dropin */ 13749 sfmmu_kpm_load_tsb(vaddr, &tte, MMU_PAGESHIFT); 13750 13751 } else { 13752 if (PP_ISKPMC(pp)) { 13753 pmtx = sfmmu_page_enter(pp); 13754 PP_CLRKPMC(pp); 13755 sfmmu_page_exit(pmtx); 13756 } 13757 13758 /* tte assembly */ 13759 KPM_TTE_VCACHED(tte.ll, pfn, TTE8K); 13760 13761 /* tsb dropin */ 13762 sfmmu_kpm_load_tsb(vaddr, &tte, MMU_PAGESHIFT); 13763 13764 oldval = sfmmu_kpm_stsbmtl(&ksp->kp_mapped, 13765 &kpmsp->kshl_lock, KPM_MAPPEDS); 13766 13767 if (oldval != KPM_MAPPEDSC) 13768 panic("sfmmu_kpm_fault_small: " 13769 "stale smallpages mapping"); 13770 } 13771 13772 } else { 13773 /* 13774 * We got a tsbmiss on a not active kpm_page range. 
		 * Let segkpm_fault decide how to panic.
		 */
		error = EFAULT;
	}

	sfmmu_mlist_exit(pml);
	return (error);
}

/*
 * Check/handle potential hme/kpm mapping conflicts
 */
static void
sfmmu_kpm_vac_conflict(page_t *pp, caddr_t vaddr)
{
	int		vcolor;
	struct sf_hment	*sfhmep;
	struct hat	*tmphat;
	struct sf_hment	*tmphme = NULL;
	struct hme_blk	*hmeblkp;
	tte_t		tte;

	ASSERT(sfmmu_mlist_held(pp));

	if (PP_ISNC(pp))
		return;

	vcolor = addr_to_vcolor(vaddr);
	if (PP_GET_VCOLOR(pp) == vcolor)
		return;

	/*
	 * There could be no vcolor conflict between a large cached
	 * hme page and a non alias range kpm page (neither large nor
	 * small mapped). So if a hme conflict already exists between
	 * a constituent page of a large hme mapping and a shared small
	 * conflicting hme mapping, both mappings must be already
	 * uncached at this point.
	 */
	ASSERT(!PP_ISMAPPED_LARGE(pp));

	if (!PP_ISMAPPED(pp)) {
		/*
		 * Previous hme user of page had a different color
		 * but since there are no current users
		 * we just flush the cache and change the color.
		 */
		SFMMU_STAT(sf_pgcolor_conflict);
		sfmmu_cache_flush(pp->p_pagenum, PP_GET_VCOLOR(pp));
		PP_SET_VCOLOR(pp, vcolor);
		return;
	}

	/*
	 * If we get here we have a vac conflict with a current hme
	 * mapping. This must have been established by forcing a wrong
	 * colored mapping, e.g. by using mmap(2) with MAP_FIXED.
	 */

	/*
	 * Check if any mapping is in the same (i.e. the kernel's)
	 * address space or if it is locked, since in those cases
	 * we need to uncache.
	 */
	for (sfhmep = pp->p_mapping; sfhmep; sfhmep = tmphme) {
		tmphme = sfhmep->hme_next;
		hmeblkp = sfmmu_hmetohblk(sfhmep);
		if (hmeblkp->hblk_xhat_bit)
			continue;
		tmphat = hblktosfmmu(hmeblkp);
		sfmmu_copytte(&sfhmep->hme_tte, &tte);
		ASSERT(TTE_IS_VALID(&tte));
		if ((tmphat == ksfmmup) || hmeblkp->hblk_lckcnt) {
			/*
			 * We have an uncache conflict
			 */
			SFMMU_STAT(sf_uncache_conflict);
			sfmmu_page_cache_array(pp, HAT_TMPNC, CACHE_FLUSH, 1);
			return;
		}
	}

	/*
	 * We have an unload conflict
	 */
	SFMMU_STAT(sf_unload_conflict);

	for (sfhmep = pp->p_mapping; sfhmep; sfhmep = tmphme) {
		tmphme = sfhmep->hme_next;
		hmeblkp = sfmmu_hmetohblk(sfhmep);
		if (hmeblkp->hblk_xhat_bit)
			continue;
		(void) sfmmu_pageunload(pp, sfhmep, TTE8K);
	}

	/*
	 * Unload only does TLB flushes, so we need to flush the
	 * dcache vcolor here.
	 */
	sfmmu_cache_flush(pp->p_pagenum, PP_GET_VCOLOR(pp));
	PP_SET_VCOLOR(pp, vcolor);
}

/*
 * Remove all kpm mappings using kpme's for pp and check that
 * all kpm mappings (w/ and w/o kpme's) are gone.
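 *
 * Roughly: walk the p_kpmelist and drop each kpme mapping via
 * sfmmu_kpme_sub(); a non-zero p_kpmref afterwards means a segkpm
 * client forgot to mapout and is fatal; finally the kpm address itself
 * is unmapped via sfmmu_kpm_mapout().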
 */
static void
sfmmu_kpm_pageunload(page_t *pp)
{
	caddr_t		vaddr;
	struct kpme	*kpme, *nkpme;

	ASSERT(pp != NULL);
	ASSERT(pp->p_kpmref);
	ASSERT(sfmmu_mlist_held(pp));

	vaddr = hat_kpm_page2va(pp, 1);

	for (kpme = pp->p_kpmelist; kpme; kpme = nkpme) {
		ASSERT(kpme->kpe_page == pp);

		if (pp->p_kpmref == 0)
			panic("sfmmu_kpm_pageunload: stale p_kpmref pp=%p "
			    "kpme=%p", (void *)pp, (void *)kpme);

		nkpme = kpme->kpe_next;

		/* Add instance callback here if needed later */
		sfmmu_kpme_sub(kpme, pp);
	}

	/*
	 * Also correct after mixed kpme/nonkpme mappings. If nonkpme
	 * segkpm clients have unlocked the page and forgotten to mapout
	 * we panic here.
	 */
	if (pp->p_kpmref != 0)
		panic("sfmmu_kpm_pageunload: bad refcnt pp=%p", (void *)pp);

	sfmmu_kpm_mapout(pp, vaddr);
}

/*
 * Remove a large kpm mapping from kernel TSB and all TLB's.
 */
static void
sfmmu_kpm_demap_large(caddr_t vaddr)
{
	sfmmu_kpm_unload_tsb(vaddr, MMU_PAGESHIFT4M);
	sfmmu_kpm_demap_tlbs(vaddr, KCONTEXT);
}

/*
 * Remove a small kpm mapping from kernel TSB and all TLB's.
 */
static void
sfmmu_kpm_demap_small(caddr_t vaddr)
{
	sfmmu_kpm_unload_tsb(vaddr, MMU_PAGESHIFT);
	sfmmu_kpm_demap_tlbs(vaddr, KCONTEXT);
}

/*
 * Demap a kpm mapping in all TLB's.
 */
static void
sfmmu_kpm_demap_tlbs(caddr_t vaddr, int ctxnum)
{
	cpuset_t cpuset;

	kpreempt_disable();
	cpuset = ksfmmup->sfmmu_cpusran;
	CPUSET_AND(cpuset, cpu_ready_set);
	CPUSET_DEL(cpuset, CPU->cpu_id);
	SFMMU_XCALL_STATS(ctxnum);
	xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)vaddr, ctxnum);
	vtag_flushpage(vaddr, ctxnum);
	kpreempt_enable();
}

/*
 * Summary states used in sfmmu_kpm_vac_unload (KPM_VUL_*).
 * See also more detailed comments within the sfmmu_kpm_vac_unload switch.
 * Abbreviations used:
 * BIG:		Large page kpm mapping in use.
 * CONFL:	VAC conflict(s) within a kpm_page.
 * INCR:	Count of conflicts within a kpm_page is going to be incremented.
 * DECR:	Count of conflicts within a kpm_page is going to be decremented.
 * UNMAP_SMALL:	A small (regular page size) mapping is going to be unmapped.
 * TNC:		Temporary non-cached: a kpm mapped page is mapped in TNC state.
 */
#define	KPM_VUL_BIG		(0)
#define	KPM_VUL_CONFL_INCR1	(KPM_KS)
#define	KPM_VUL_UNMAP_SMALL1	(KPM_KS | KPM_S)
#define	KPM_VUL_CONFL_INCR2	(KPM_KC)
#define	KPM_VUL_CONFL_INCR3	(KPM_KC | KPM_KS)
#define	KPM_VUL_UNMAP_SMALL2	(KPM_KC | KPM_KS | KPM_S)
#define	KPM_VUL_CONFL_DECR1	(KPM_KC | KPM_C)
#define	KPM_VUL_CONFL_DECR2	(KPM_KC | KPM_C | KPM_KS)
#define	KPM_VUL_TNC		(KPM_KC | KPM_C | KPM_KS | KPM_S)

/*
 * Handle VAC unload conflicts introduced by hme mappings or vice
 * versa when a hme conflict mapping is replaced by a non conflict
 * one. Perform actions and state transitions according to the
 * various page and kpm_page entry states. VAC flushes are the
 * responsibility of the caller. We still hold the mlist lock.
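 *
 * For illustration, the summary state used in the switch below is just
 * the bitwise OR of the per kpm_page counters and the per page flags:
 *
 *	vacunlcase = ((kp->kp_refcntc > 0) ? KPM_KC : 0) |
 *	    ((kp->kp_refcnts > 0) ? KPM_KS : 0) |
 *	    (PP_ISKPMC(pp) ? KPM_C : 0) |
 *	    (PP_ISKPMS(pp) ? KPM_S : 0);
 *
 * e.g. KPM_VUL_UNMAP_SMALL1 (KPM_KS | KPM_S) means that some pages of
 * this kpm_page are mapped small (kp_refcnts > 0) and that this page
 * itself is small kpm mapped (P_KPMS set).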
 */
static void
sfmmu_kpm_vac_unload(page_t *pp, caddr_t vaddr)
{
	kpm_page_t	*kp;
	kpm_hlk_t	*kpmp;
	caddr_t		kpmvaddr = hat_kpm_page2va(pp, 1);
	int		newcolor;
	kmutex_t	*pmtx;
	uint_t		vacunlcase;
	int		badstate = 0;
	kpm_spage_t	*ksp;
	kpm_shlk_t	*kpmsp;

	ASSERT(PAGE_LOCKED(pp));
	ASSERT(sfmmu_mlist_held(pp));
	ASSERT(!PP_ISNC(pp));

	newcolor = addr_to_vcolor(kpmvaddr) != addr_to_vcolor(vaddr);
	if (kpm_smallpages)
		goto smallpages_vac_unload;

	PP2KPMPG(pp, kp);
	kpmp = KPMP_HASH(kp);
	mutex_enter(&kpmp->khl_mutex);

	if (IS_KPM_ALIAS_RANGE(kpmvaddr)) {
		if (kp->kp_refcnta < 1) {
			panic("sfmmu_kpm_vac_unload: bad refcnta kpm_page=%p\n",
			    (void *)kp);
		}

		if (PP_ISKPMC(pp) == 0) {
			if (newcolor == 0)
				goto exit;
			sfmmu_kpm_demap_small(kpmvaddr);
			pmtx = sfmmu_page_enter(pp);
			PP_SETKPMC(pp);
			sfmmu_page_exit(pmtx);

		} else if (newcolor == 0) {
			pmtx = sfmmu_page_enter(pp);
			PP_CLRKPMC(pp);
			sfmmu_page_exit(pmtx);

		} else {
			badstate++;
		}

		goto exit;
	}

	badstate = (kp->kp_refcnt < 0 || kp->kp_refcnts < 0);
	if (kp->kp_refcntc == -1) {
		/*
		 * We should come here only if trap level tsb miss
		 * handler is disabled.
		 */
		badstate |= (kp->kp_refcnt == 0 || kp->kp_refcnts > 0 ||
		    PP_ISKPMC(pp) || PP_ISKPMS(pp) || PP_ISNC(pp));
	} else {
		badstate |= (kp->kp_refcntc < 0);
	}

	if (badstate)
		goto exit;

	if (PP_ISKPMC(pp) == 0 && newcolor == 0) {
		ASSERT(PP_ISKPMS(pp) == 0);
		goto exit;
	}

	/*
	 * Combine the per kpm_page and per page kpm VAC states
	 * to a summary state in order to make the vac unload
	 * handling more concise.
	 */
	vacunlcase = (((kp->kp_refcntc > 0) ? KPM_KC : 0) |
	    ((kp->kp_refcnts > 0) ? KPM_KS : 0) |
	    (PP_ISKPMC(pp) ? KPM_C : 0) |
	    (PP_ISKPMS(pp) ? KPM_S : 0));

	switch (vacunlcase) {
	case KPM_VUL_BIG:			/* - - - - */
		/*
		 * Have to break up the large page mapping to be
		 * able to handle the conflicting hme vaddr.
		 */
		if (kp->kp_refcntc == -1) {
			/* remove go indication */
			sfmmu_kpm_tsbmtl(&kp->kp_refcntc,
			    &kpmp->khl_lock, KPMTSBM_STOP);
		}
		sfmmu_kpm_demap_large(kpmvaddr);

		ASSERT(kp->kp_refcntc == 0);
		kp->kp_refcntc++;
		pmtx = sfmmu_page_enter(pp);
		PP_SETKPMC(pp);
		sfmmu_page_exit(pmtx);
		break;

	case KPM_VUL_UNMAP_SMALL1:		/* - - ks s */
	case KPM_VUL_UNMAP_SMALL2:		/* kc - ks s */
		/*
		 * New conflict w/ an active kpm page, actually mapped
		 * in by small TSB/TLB entries. Remove the mapping and
		 * update states.
		 */
		ASSERT(newcolor);
		sfmmu_kpm_demap_small(kpmvaddr);
		kp->kp_refcnts--;
		kp->kp_refcnt++;
		kp->kp_refcntc++;
		pmtx = sfmmu_page_enter(pp);
		PP_CLRKPMS(pp);
		PP_SETKPMC(pp);
		sfmmu_page_exit(pmtx);
		break;

	case KPM_VUL_CONFL_INCR1:		/* - - ks - */
	case KPM_VUL_CONFL_INCR2:		/* kc - - - */
	case KPM_VUL_CONFL_INCR3:		/* kc - ks - */
		/*
		 * New conflict on an active kpm mapped page not yet in
		 * TSB/TLB. Mark page and increment the kpm_page conflict
		 * count.
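		 * (The conflict count is kp->kp_refcntc; P_KPMC marks the
		 * page itself as being in kpm VAC conflict state.)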
		 */
		ASSERT(newcolor);
		kp->kp_refcntc++;
		pmtx = sfmmu_page_enter(pp);
		PP_SETKPMC(pp);
		sfmmu_page_exit(pmtx);
		break;

	case KPM_VUL_CONFL_DECR1:		/* kc c - - */
	case KPM_VUL_CONFL_DECR2:		/* kc c ks - */
		/*
		 * A conflicting hme mapping is removed for an active
		 * kpm page not yet in TSB/TLB. Unmark page and decrement
		 * the kpm_page conflict count.
		 */
		ASSERT(newcolor == 0);
		kp->kp_refcntc--;
		pmtx = sfmmu_page_enter(pp);
		PP_CLRKPMC(pp);
		sfmmu_page_exit(pmtx);
		break;

	case KPM_VUL_TNC:			/* kc c ks s */
		cmn_err(CE_NOTE, "sfmmu_kpm_vac_unload: "
		    "page not in NC state");
		/* FALLTHRU */

	default:
		badstate++;
	}
exit:
	if (badstate) {
		panic("sfmmu_kpm_vac_unload: inconsistent VAC state, "
		    "kpmvaddr=%p kp=%p pp=%p",
		    (void *)kpmvaddr, (void *)kp, (void *)pp);
	}
	mutex_exit(&kpmp->khl_mutex);

	return;

smallpages_vac_unload:
	if (newcolor == 0)
		return;

	PP2KPMSPG(pp, ksp);
	kpmsp = KPMP_SHASH(ksp);

	if (PP_ISKPMC(pp) == 0) {
		if (ksp->kp_mapped == KPM_MAPPEDS) {
			/*
			 * Stop TL tsbmiss handling
			 */
			(void) sfmmu_kpm_stsbmtl(&ksp->kp_mapped,
			    &kpmsp->kshl_lock, KPM_MAPPEDSC);

			sfmmu_kpm_demap_small(kpmvaddr);

		} else if (ksp->kp_mapped != KPM_MAPPEDSC) {
			panic("sfmmu_kpm_vac_unload: inconsistent mapping");
		}

		pmtx = sfmmu_page_enter(pp);
		PP_SETKPMC(pp);
		sfmmu_page_exit(pmtx);

	} else {
		if (ksp->kp_mapped != KPM_MAPPEDSC)
			panic("sfmmu_kpm_vac_unload: inconsistent mapping");
	}
}

/*
 * Page is marked to be in VAC conflict with an existing kpm mapping
 * or is kpm mapped using only the regular pagesize. Called from
 * sfmmu_hblk_unload when a mlist is completely removed.
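 *
 * Roughly: a page that is only small kpm mapped (P_KPMS) needs no
 * cleanup here; otherwise the per page P_KPMC flag is cleared and,
 * outside the kpm alias range, the kpm_page conflict count is
 * decremented.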
 */
static void
sfmmu_kpm_hme_unload(page_t *pp)
{
	kpm_page_t	*kp;
	kpm_hlk_t	*kpmp;
	caddr_t		vaddr;
	kmutex_t	*pmtx;
	uint_t		flags;
	kpm_spage_t	*ksp;

	ASSERT(sfmmu_mlist_held(pp));
	ASSERT(PP_ISMAPPED_KPM(pp));

	flags = pp->p_nrm & (P_KPMC | P_KPMS);
	if (kpm_smallpages)
		goto smallpages_hme_unload;

	if (flags == (P_KPMC | P_KPMS)) {
		panic("sfmmu_kpm_hme_unload: page should be uncached");

	} else if (flags == P_KPMS) {
		/*
		 * Page mapped small but not involved in VAC conflict
		 */
		return;
	}

	vaddr = hat_kpm_page2va(pp, 1);

	PP2KPMPG(pp, kp);
	kpmp = KPMP_HASH(kp);
	mutex_enter(&kpmp->khl_mutex);

	if (IS_KPM_ALIAS_RANGE(vaddr)) {
		if (kp->kp_refcnta < 1) {
			panic("sfmmu_kpm_hme_unload: bad refcnta kpm_page=%p\n",
			    (void *)kp);
		}

	} else {
		if (kp->kp_refcntc < 1) {
			panic("sfmmu_kpm_hme_unload: bad refcntc kpm_page=%p\n",
			    (void *)kp);
		}
		kp->kp_refcntc--;
	}

	pmtx = sfmmu_page_enter(pp);
	PP_CLRKPMC(pp);
	sfmmu_page_exit(pmtx);

	mutex_exit(&kpmp->khl_mutex);
	return;

smallpages_hme_unload:
	if (flags != P_KPMC)
		panic("sfmmu_kpm_hme_unload: page should be uncached");

	vaddr = hat_kpm_page2va(pp, 1);
	PP2KPMSPG(pp, ksp);

	if (ksp->kp_mapped != KPM_MAPPEDSC)
		panic("sfmmu_kpm_hme_unload: inconsistent mapping");

	/*
	 * Keep KPM_MAPPEDSC until the next kpm tsbmiss, where it
	 * prevents TL tsbmiss handling and forces a hat_kpm_fault,
	 * where we can start over again.
	 */

	pmtx = sfmmu_page_enter(pp);
	PP_CLRKPMC(pp);
	sfmmu_page_exit(pmtx);
}

/*
 * Special hooks for sfmmu_page_cache_array() when changing the
 * cacheability of a page. They are used to obey the hat_kpm lock
 * ordering (mlist -> kpmp -> spl, and back).
 */
static kpm_hlk_t *
sfmmu_kpm_kpmp_enter(page_t *pp, pgcnt_t npages)
{
	kpm_page_t	*kp;
	kpm_hlk_t	*kpmp;

	ASSERT(sfmmu_mlist_held(pp));

	if (kpm_smallpages || PP_ISMAPPED_KPM(pp) == 0)
		return (NULL);

	ASSERT(npages <= kpmpnpgs);

	PP2KPMPG(pp, kp);
	kpmp = KPMP_HASH(kp);
	mutex_enter(&kpmp->khl_mutex);

	return (kpmp);
}

static void
sfmmu_kpm_kpmp_exit(kpm_hlk_t *kpmp)
{
	if (kpm_smallpages || kpmp == NULL)
		return;

	mutex_exit(&kpmp->khl_mutex);
}

/*
 * Summary states used in sfmmu_kpm_page_cache (KPM_*).
 * See also more detailed comments within the sfmmu_kpm_page_cache switch.
 * Abbreviations used:
 * UNC:		Input state for an uncache request.
 * BIG:		Large page kpm mapping in use.
 * SMALL:	Page has a small kpm mapping within a kpm_page range.
 * NODEMAP:	No demap needed.
 * NOP:		No operation needed on this input state.
 * CACHE:	Input state for a re-cache request.
 * MAPS:	Page is in TNC and kpm VAC conflict state and kpm mapped small.
 * NOMAP:	Page is in TNC and kpm VAC conflict state, but not small kpm
 *		mapped.
 * NOMAPO:	Page is in TNC and kpm VAC conflict state, but not small kpm
 *		mapped. There are also other small kpm mappings within this
 *		kpm_page.
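 *
 * As in sfmmu_kpm_vac_unload, the summary state is the bitwise OR of
 * KPM_KC/KPM_KS (derived from the per kpm_page counters) and
 * KPM_C/KPM_S (the per page P_KPMC/P_KPMS flags); e.g. KPM_UNC_SMALL1
 * (KPM_KS | KPM_S) is an uncache request for a page currently mapped
 * small within its kpm_page.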
 */
#define	KPM_UNC_BIG		(0)
#define	KPM_UNC_NODEMAP1	(KPM_KS)
#define	KPM_UNC_SMALL1		(KPM_KS | KPM_S)
#define	KPM_UNC_NODEMAP2	(KPM_KC)
#define	KPM_UNC_NODEMAP3	(KPM_KC | KPM_KS)
#define	KPM_UNC_SMALL2		(KPM_KC | KPM_KS | KPM_S)
#define	KPM_UNC_NOP1		(KPM_KC | KPM_C)
#define	KPM_UNC_NOP2		(KPM_KC | KPM_C | KPM_KS)
#define	KPM_CACHE_NOMAP		(KPM_KC | KPM_C)
#define	KPM_CACHE_NOMAPO	(KPM_KC | KPM_C | KPM_KS)
#define	KPM_CACHE_MAPS		(KPM_KC | KPM_C | KPM_KS | KPM_S)

/*
 * This function is called when the virtual cacheability of a page
 * is changed and the page has an active kpm mapping. The mlist mutex,
 * the spl hash lock and the kpmp mutex (if needed) are already grabbed.
 */
static void
sfmmu_kpm_page_cache(page_t *pp, int flags, int cache_flush_tag)
{
	kpm_page_t	*kp;
	kpm_hlk_t	*kpmp;
	caddr_t		kpmvaddr;
	int		badstate = 0;
	uint_t		pgcacase;
	kpm_spage_t	*ksp;
	kpm_shlk_t	*kpmsp;
	int		oldval;

	ASSERT(PP_ISMAPPED_KPM(pp));
	ASSERT(sfmmu_mlist_held(pp));
	ASSERT(sfmmu_page_spl_held(pp));

	if (flags != HAT_TMPNC && flags != HAT_CACHE)
		panic("sfmmu_kpm_page_cache: bad flags");

	kpmvaddr = hat_kpm_page2va(pp, 1);

	if (flags == HAT_TMPNC && cache_flush_tag == CACHE_FLUSH) {
		pfn_t pfn = pp->p_pagenum;
		int vcolor = addr_to_vcolor(kpmvaddr);
		cpuset_t cpuset = cpu_ready_set;

		/* Flush vcolor in DCache */
		CPUSET_DEL(cpuset, CPU->cpu_id);
		SFMMU_XCALL_STATS(ksfmmup->sfmmu_cnum);
		xt_some(cpuset, vac_flushpage_tl1, pfn, vcolor);
		vac_flushpage(pfn, vcolor);
	}

	if (kpm_smallpages)
		goto smallpages_page_cache;

	PP2KPMPG(pp, kp);
	kpmp = KPMP_HASH(kp);
	ASSERT(MUTEX_HELD(&kpmp->khl_mutex));

	if (IS_KPM_ALIAS_RANGE(kpmvaddr)) {
		if (kp->kp_refcnta < 1) {
			panic("sfmmu_kpm_page_cache: bad refcnta "
			    "kpm_page=%p\n", (void *)kp);
		}
		sfmmu_kpm_demap_small(kpmvaddr);
		if (flags == HAT_TMPNC) {
			PP_SETKPMC(pp);
			ASSERT(!PP_ISKPMS(pp));
		} else {
			ASSERT(PP_ISKPMC(pp));
			PP_CLRKPMC(pp);
		}
		goto exit;
	}

	badstate = (kp->kp_refcnt < 0 || kp->kp_refcnts < 0);
	if (kp->kp_refcntc == -1) {
		/*
		 * We should come here only if trap level tsb miss
		 * handler is disabled.
		 */
		badstate |= (kp->kp_refcnt == 0 || kp->kp_refcnts > 0 ||
		    PP_ISKPMC(pp) || PP_ISKPMS(pp) || PP_ISNC(pp));
	} else {
		badstate |= (kp->kp_refcntc < 0);
	}

	if (badstate)
		goto exit;

	/*
	 * Combine the per kpm_page and per page kpm VAC states to
	 * a summary state in order to make the VAC cache/uncache
	 * handling more concise.
	 */
	pgcacase = (((kp->kp_refcntc > 0) ? KPM_KC : 0) |
	    ((kp->kp_refcnts > 0) ? KPM_KS : 0) |
	    (PP_ISKPMC(pp) ? KPM_C : 0) |
	    (PP_ISKPMS(pp) ? KPM_S : 0));

	if (flags == HAT_CACHE) {
		switch (pgcacase) {
		case KPM_CACHE_MAPS:			/* kc c ks s */
			sfmmu_kpm_demap_small(kpmvaddr);
			if (kp->kp_refcnts < 1) {
				panic("sfmmu_kpm_page_cache: bad refcnts "
				    "kpm_page=%p\n", (void *)kp);
			}
			kp->kp_refcnts--;
			kp->kp_refcnt++;
			PP_CLRKPMS(pp);
			/* FALLTHRU */

		case KPM_CACHE_NOMAP:			/* kc c - - */
		case KPM_CACHE_NOMAPO:			/* kc c ks - */
			kp->kp_refcntc--;
			PP_CLRKPMC(pp);
			break;

		default:
			badstate++;
		}
		goto exit;
	}

	switch (pgcacase) {
	case KPM_UNC_BIG:				/* - - - - */
		if (kp->kp_refcnt < 1) {
			panic("sfmmu_kpm_page_cache: bad refcnt "
			    "kpm_page=%p\n", (void *)kp);
		}

		/*
		 * Have to break up the large page mapping in preparation
		 * for the upcoming TNC mode handled by small mappings.
		 * The demap may already have been done due to another
		 * conflict within the kpm_page.
		 */
		if (kp->kp_refcntc == -1) {
			/* remove go indication */
			sfmmu_kpm_tsbmtl(&kp->kp_refcntc,
			    &kpmp->khl_lock, KPMTSBM_STOP);
		}
		ASSERT(kp->kp_refcntc == 0);
		sfmmu_kpm_demap_large(kpmvaddr);
		kp->kp_refcntc++;
		PP_SETKPMC(pp);
		break;

	case KPM_UNC_SMALL1:				/* - - ks s */
	case KPM_UNC_SMALL2:				/* kc - ks s */
		/*
		 * Have to demap an already small kpm mapping in preparation
		 * for the upcoming TNC mode. The demap may already have been
		 * done due to another conflict within the kpm_page.
		 */
		sfmmu_kpm_demap_small(kpmvaddr);
		kp->kp_refcntc++;
		kp->kp_refcnts--;
		kp->kp_refcnt++;
		PP_CLRKPMS(pp);
		PP_SETKPMC(pp);
		break;

	case KPM_UNC_NODEMAP1:				/* - - ks - */
		/* fallthru */

	case KPM_UNC_NODEMAP2:				/* kc - - - */
	case KPM_UNC_NODEMAP3:				/* kc - ks - */
		kp->kp_refcntc++;
		PP_SETKPMC(pp);
		break;

	case KPM_UNC_NOP1:				/* kc c - - */
	case KPM_UNC_NOP2:				/* kc c ks - */
		break;

	default:
		badstate++;
	}
exit:
	if (badstate) {
		panic("sfmmu_kpm_page_cache: inconsistent VAC state "
		    "kpmvaddr=%p kp=%p pp=%p", (void *)kpmvaddr,
		    (void *)kp, (void *)pp);
	}
	return;

smallpages_page_cache:
	PP2KPMSPG(pp, ksp);
	kpmsp = KPMP_SHASH(ksp);

	oldval = sfmmu_kpm_stsbmtl(&ksp->kp_mapped,
	    &kpmsp->kshl_lock, KPM_MAPPEDSC);

	if (!(oldval == KPM_MAPPEDS || oldval == KPM_MAPPEDSC))
		panic("smallpages_page_cache: inconsistent mapping");

	sfmmu_kpm_demap_small(kpmvaddr);

	if (flags == HAT_TMPNC) {
		PP_SETKPMC(pp);
		ASSERT(!PP_ISKPMS(pp));

	} else {
		ASSERT(PP_ISKPMC(pp));
		PP_CLRKPMC(pp);
	}

	/*
	 * Keep KPM_MAPPEDSC until the next kpm tsbmiss, where it
	 * prevents TL tsbmiss handling and forces a hat_kpm_fault,
	 * where we can start over again.
	 */
}

/*
 * unused in sfmmu
 */
void
hat_dump(void)
{
}

/*
 * Called when a thread is exiting and we have switched to the kernel address
 * space. Perform the same VM initialization resume() uses when switching
 * processes.
 *
 * Note that sfmmu_load_mmustate() is currently a no-op for kernel threads, but
 * we call it anyway in case the semantics change in the future.
 */
/*ARGSUSED*/
void
hat_thread_exit(kthread_t *thd)
{
	ASSERT(thd->t_procp->p_as == &kas);

	sfmmu_setctx_sec(KCONTEXT);
	sfmmu_load_mmustate(ksfmmup);
}