1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * VM - Hardware Address Translation management for Spitfire MMU. 28 * 29 * This file implements the machine specific hardware translation 30 * needed by the VM system. The machine independent interface is 31 * described in <vm/hat.h> while the machine dependent interface 32 * and data structures are described in <vm/hat_sfmmu.h>. 33 * 34 * The hat layer manages the address translation hardware as a cache 35 * driven by calls from the higher levels in the VM system. 36 */ 37 38 #include <sys/types.h> 39 #include <sys/kstat.h> 40 #include <vm/hat.h> 41 #include <vm/hat_sfmmu.h> 42 #include <vm/page.h> 43 #include <sys/pte.h> 44 #include <sys/systm.h> 45 #include <sys/mman.h> 46 #include <sys/sysmacros.h> 47 #include <sys/machparam.h> 48 #include <sys/vtrace.h> 49 #include <sys/kmem.h> 50 #include <sys/mmu.h> 51 #include <sys/cmn_err.h> 52 #include <sys/cpu.h> 53 #include <sys/cpuvar.h> 54 #include <sys/debug.h> 55 #include <sys/lgrp.h> 56 #include <sys/archsystm.h> 57 #include <sys/machsystm.h> 58 #include <sys/vmsystm.h> 59 #include <vm/as.h> 60 #include <vm/seg.h> 61 #include <vm/seg_kp.h> 62 #include <vm/seg_kmem.h> 63 #include <vm/seg_kpm.h> 64 #include <vm/rm.h> 65 #include <sys/t_lock.h> 66 #include <sys/obpdefs.h> 67 #include <sys/vm_machparam.h> 68 #include <sys/var.h> 69 #include <sys/trap.h> 70 #include <sys/machtrap.h> 71 #include <sys/scb.h> 72 #include <sys/bitmap.h> 73 #include <sys/machlock.h> 74 #include <sys/membar.h> 75 #include <sys/atomic.h> 76 #include <sys/cpu_module.h> 77 #include <sys/prom_debug.h> 78 #include <sys/ksynch.h> 79 #include <sys/mem_config.h> 80 #include <sys/mem_cage.h> 81 #include <vm/vm_dep.h> 82 #include <vm/xhat_sfmmu.h> 83 #include <sys/fpu/fpusystm.h> 84 #include <vm/mach_kpm.h> 85 #include <sys/callb.h> 86 87 #ifdef DEBUG 88 #define SFMMU_VALIDATE_HMERID(hat, rid, saddr, len) \ 89 if (SFMMU_IS_SHMERID_VALID(rid)) { \ 90 caddr_t _eaddr = (saddr) + (len); \ 91 sf_srd_t *_srdp; \ 92 sf_region_t *_rgnp; \ 93 ASSERT((rid) < SFMMU_MAX_HME_REGIONS); \ 94 ASSERT(SF_RGNMAP_TEST(hat->sfmmu_hmeregion_map, rid)); \ 95 ASSERT((hat) != ksfmmup); \ 96 _srdp = (hat)->sfmmu_srdp; \ 97 ASSERT(_srdp != NULL); \ 98 ASSERT(_srdp->srd_refcnt != 0); \ 99 _rgnp = _srdp->srd_hmergnp[(rid)]; \ 100 ASSERT(_rgnp != NULL && _rgnp->rgn_id == rid); \ 101 ASSERT(_rgnp->rgn_refcnt != 0); \ 102 ASSERT(!(_rgnp->rgn_flags & SFMMU_REGION_FREE)); \ 103 ASSERT((_rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == \ 104 SFMMU_REGION_HME); \ 105 ASSERT((saddr) >= _rgnp->rgn_saddr); \ 106 ASSERT((saddr) < _rgnp->rgn_saddr + 
_rgnp->rgn_size); \ 107 ASSERT(_eaddr > _rgnp->rgn_saddr); \ 108 ASSERT(_eaddr <= _rgnp->rgn_saddr + _rgnp->rgn_size); \ 109 } 110 111 #define SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid) \ 112 { \ 113 caddr_t _hsva; \ 114 caddr_t _heva; \ 115 caddr_t _rsva; \ 116 caddr_t _reva; \ 117 int _ttesz = get_hblk_ttesz(hmeblkp); \ 118 int _flagtte; \ 119 ASSERT((srdp)->srd_refcnt != 0); \ 120 ASSERT((rid) < SFMMU_MAX_HME_REGIONS); \ 121 ASSERT((rgnp)->rgn_id == rid); \ 122 ASSERT(!((rgnp)->rgn_flags & SFMMU_REGION_FREE)); \ 123 ASSERT(((rgnp)->rgn_flags & SFMMU_REGION_TYPE_MASK) == \ 124 SFMMU_REGION_HME); \ 125 ASSERT(_ttesz <= (rgnp)->rgn_pgszc); \ 126 _hsva = (caddr_t)get_hblk_base(hmeblkp); \ 127 _heva = get_hblk_endaddr(hmeblkp); \ 128 _rsva = (caddr_t)P2ALIGN( \ 129 (uintptr_t)(rgnp)->rgn_saddr, HBLK_MIN_BYTES); \ 130 _reva = (caddr_t)P2ROUNDUP( \ 131 (uintptr_t)((rgnp)->rgn_saddr + (rgnp)->rgn_size), \ 132 HBLK_MIN_BYTES); \ 133 ASSERT(_hsva >= _rsva); \ 134 ASSERT(_hsva < _reva); \ 135 ASSERT(_heva > _rsva); \ 136 ASSERT(_heva <= _reva); \ 137 _flagtte = (_ttesz < HBLK_MIN_TTESZ) ? HBLK_MIN_TTESZ : \ 138 _ttesz; \ 139 ASSERT(rgnp->rgn_hmeflags & (0x1 << _flagtte)); \ 140 } 141 142 #else /* DEBUG */ 143 #define SFMMU_VALIDATE_HMERID(hat, rid, addr, len) 144 #define SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid) 145 #endif /* DEBUG */ 146 147 #if defined(SF_ERRATA_57) 148 extern caddr_t errata57_limit; 149 #endif 150 151 #define HME8BLK_SZ_RND ((roundup(HME8BLK_SZ, sizeof (int64_t))) / \ 152 (sizeof (int64_t))) 153 #define HBLK_RESERVE ((struct hme_blk *)hblk_reserve) 154 155 #define HBLK_RESERVE_CNT 128 156 #define HBLK_RESERVE_MIN 20 157 158 static struct hme_blk *freehblkp; 159 static kmutex_t freehblkp_lock; 160 static int freehblkcnt; 161 162 static int64_t hblk_reserve[HME8BLK_SZ_RND]; 163 static kmutex_t hblk_reserve_lock; 164 static kthread_t *hblk_reserve_thread; 165 166 static nucleus_hblk8_info_t nucleus_hblk8; 167 static nucleus_hblk1_info_t nucleus_hblk1; 168 169 /* 170 * Data to manage per-cpu hmeblk pending queues, hmeblks are queued here 171 * after the initial phase of removing an hmeblk from the hash chain, see 172 * the detailed comment in sfmmu_hblk_hash_rm() for further details. 173 */ 174 static cpu_hme_pend_t *cpu_hme_pend; 175 static uint_t cpu_hme_pend_thresh; 176 /* 177 * SFMMU specific hat functions 178 */ 179 void hat_pagecachectl(struct page *, int); 180 181 /* flags for hat_pagecachectl */ 182 #define HAT_CACHE 0x1 183 #define HAT_UNCACHE 0x2 184 #define HAT_TMPNC 0x4 185 186 /* 187 * This flag is set to 0 via the MD in platforms that do not support 188 * I-cache coherency in hardware. Used to enable "soft exec" mode. 189 * The MD "coherency" property is optional, and defaults to 1 (because 190 * coherent I-cache is the norm.) 191 */ 192 uint_t icache_is_coherent = 1; 193 194 /* 195 * Flag to allow the creation of non-cacheable translations 196 * to system memory. It is off by default. At the moment this 197 * flag is used by the ecache error injector. The error injector 198 * will turn it on when creating such a translation then shut it 199 * off when it's finished. 200 */ 201 202 int sfmmu_allow_nc_trans = 0; 203 204 /* 205 * Flag to disable large page support. 206 * value of 1 => disable all large pages. 207 * bits 1, 2, and 3 are to disable 64K, 512K and 4M pages respectively. 208 * 209 * For example, use the value 0x4 to disable 512K pages. 
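 * Similarly, 0x2 disables 64K pages and 0x8 disables 4M pages; the bit
 * values may be OR-ed together (e.g. 0xc disables both 512K and 4M pages).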
210 * 211 */ 212 #define LARGE_PAGES_OFF 0x1 213 214 /* 215 * The disable_large_pages and disable_ism_large_pages variables control 216 * hat_memload_array and the page sizes to be used by ISM and the kernel. 217 * 218 * The disable_auto_data_large_pages and disable_auto_text_large_pages variables 219 * are only used to control which OOB pages to use at upper VM segment creation 220 * time, and are set in hat_init_pagesizes and used in the map_pgsz* routines. 221 * Their values may come from platform or CPU specific code to disable page 222 * sizes that should not be used. 223 * 224 * WARNING: 512K pages are currently not supported for ISM/DISM. 225 */ 226 uint_t disable_large_pages = 0; 227 uint_t disable_ism_large_pages = (1 << TTE512K); 228 uint_t disable_auto_data_large_pages = 0; 229 uint_t disable_auto_text_large_pages = 0; 230 231 /* 232 * Private sfmmu data structures for hat management 233 */ 234 static struct kmem_cache *sfmmuid_cache; 235 static struct kmem_cache *mmuctxdom_cache; 236 237 /* 238 * Private sfmmu data structures for tsb management 239 */ 240 static struct kmem_cache *sfmmu_tsbinfo_cache; 241 static struct kmem_cache *sfmmu_tsb8k_cache; 242 static struct kmem_cache *sfmmu_tsb_cache[NLGRPS_MAX]; 243 static vmem_t *kmem_bigtsb_arena; 244 static vmem_t *kmem_tsb_arena; 245 246 /* 247 * sfmmu static variables for hmeblk resource management. 248 */ 249 static vmem_t *hat_memload1_arena; /* HAT translation arena for sfmmu1_cache */ 250 static struct kmem_cache *sfmmu8_cache; 251 static struct kmem_cache *sfmmu1_cache; 252 static struct kmem_cache *pa_hment_cache; 253 254 static kmutex_t ism_mlist_lock; /* mutex for ism mapping list */ 255 /* 256 * private data for ism 257 */ 258 static struct kmem_cache *ism_blk_cache; 259 static struct kmem_cache *ism_ment_cache; 260 #define ISMID_STARTADDR NULL 261 262 /* 263 * Region management data structures and function declarations. 
264 */ 265 266 static void sfmmu_leave_srd(sfmmu_t *); 267 static int sfmmu_srdcache_constructor(void *, void *, int); 268 static void sfmmu_srdcache_destructor(void *, void *); 269 static int sfmmu_rgncache_constructor(void *, void *, int); 270 static void sfmmu_rgncache_destructor(void *, void *); 271 static int sfrgnmap_isnull(sf_region_map_t *); 272 static int sfhmergnmap_isnull(sf_hmeregion_map_t *); 273 static int sfmmu_scdcache_constructor(void *, void *, int); 274 static void sfmmu_scdcache_destructor(void *, void *); 275 static void sfmmu_rgn_cb_noop(caddr_t, caddr_t, caddr_t, 276 size_t, void *, u_offset_t); 277 278 static uint_t srd_hashmask = SFMMU_MAX_SRD_BUCKETS - 1; 279 static sf_srd_bucket_t *srd_buckets; 280 static struct kmem_cache *srd_cache; 281 static uint_t srd_rgn_hashmask = SFMMU_MAX_REGION_BUCKETS - 1; 282 static struct kmem_cache *region_cache; 283 static struct kmem_cache *scd_cache; 284 285 #ifdef sun4v 286 int use_bigtsb_arena = 1; 287 #else 288 int use_bigtsb_arena = 0; 289 #endif 290 291 /* External /etc/system tunable, for turning on&off the shctx support */ 292 int disable_shctx = 0; 293 /* Internal variable, set by MD if the HW supports shctx feature */ 294 int shctx_on = 0; 295 296 #ifdef DEBUG 297 static void check_scd_sfmmu_list(sfmmu_t **, sfmmu_t *, int); 298 #endif 299 static void sfmmu_to_scd_list(sfmmu_t **, sfmmu_t *); 300 static void sfmmu_from_scd_list(sfmmu_t **, sfmmu_t *); 301 302 static sf_scd_t *sfmmu_alloc_scd(sf_srd_t *, sf_region_map_t *); 303 static void sfmmu_find_scd(sfmmu_t *); 304 static void sfmmu_join_scd(sf_scd_t *, sfmmu_t *); 305 static void sfmmu_finish_join_scd(sfmmu_t *); 306 static void sfmmu_leave_scd(sfmmu_t *, uchar_t); 307 static void sfmmu_destroy_scd(sf_srd_t *, sf_scd_t *, sf_region_map_t *); 308 static int sfmmu_alloc_scd_tsbs(sf_srd_t *, sf_scd_t *); 309 static void sfmmu_free_scd_tsbs(sfmmu_t *); 310 static void sfmmu_tsb_inv_ctx(sfmmu_t *); 311 static int find_ism_rid(sfmmu_t *, sfmmu_t *, caddr_t, uint_t *); 312 static void sfmmu_ism_hatflags(sfmmu_t *, int); 313 static int sfmmu_srd_lock_held(sf_srd_t *); 314 static void sfmmu_remove_scd(sf_scd_t **, sf_scd_t *); 315 static void sfmmu_add_scd(sf_scd_t **headp, sf_scd_t *); 316 static void sfmmu_link_scd_to_regions(sf_srd_t *, sf_scd_t *); 317 static void sfmmu_unlink_scd_from_regions(sf_srd_t *, sf_scd_t *); 318 static void sfmmu_link_to_hmeregion(sfmmu_t *, sf_region_t *); 319 static void sfmmu_unlink_from_hmeregion(sfmmu_t *, sf_region_t *); 320 321 /* 322 * ``hat_lock'' is a hashed mutex lock for protecting sfmmu TSB lists, 323 * HAT flags, synchronizing TLB/TSB coherency, and context management. 324 * The lock is hashed on the sfmmup since the case where we need to lock 325 * all processes is rare but does occur (e.g. we need to unload a shared 326 * mapping from all processes using the mapping). We have a lot of buckets, 327 * and each slab of sfmmu_t's can use about a quarter of them, giving us 328 * a fairly good distribution without wasting too much space and overhead 329 * when we have to grab them all. 330 */ 331 #define SFMMU_NUM_LOCK 128 /* must be power of two */ 332 hatlock_t hat_lock[SFMMU_NUM_LOCK]; 333 334 /* 335 * Hash algorithm optimized for a small number of slabs. 336 * 7 is (highbit((sizeof sfmmu_t)) - 1) 337 * This hash algorithm is based upon the knowledge that sfmmu_t's come from a 338 * kmem_cache, and thus they will be sequential within that cache. 
In 339 * addition, each new slab will have a different "color" up to cache_maxcolor 340 * which will skew the hashing for each successive slab which is allocated. 341 * If the size of sfmmu_t changed to a larger size, this algorithm may need 342 * to be revisited. 343 */ 344 #define TSB_HASH_SHIFT_BITS (7) 345 #define PTR_HASH(x) ((uintptr_t)x >> TSB_HASH_SHIFT_BITS) 346 347 #ifdef DEBUG 348 int tsb_hash_debug = 0; 349 #define TSB_HASH(sfmmup) \ 350 (tsb_hash_debug ? &hat_lock[0] : \ 351 &hat_lock[PTR_HASH(sfmmup) & (SFMMU_NUM_LOCK-1)]) 352 #else /* DEBUG */ 353 #define TSB_HASH(sfmmup) &hat_lock[PTR_HASH(sfmmup) & (SFMMU_NUM_LOCK-1)] 354 #endif /* DEBUG */ 355 356 357 /* sfmmu_replace_tsb() return codes. */ 358 typedef enum tsb_replace_rc { 359 TSB_SUCCESS, 360 TSB_ALLOCFAIL, 361 TSB_LOSTRACE, 362 TSB_ALREADY_SWAPPED, 363 TSB_CANTGROW 364 } tsb_replace_rc_t; 365 366 /* 367 * Flags for TSB allocation routines. 368 */ 369 #define TSB_ALLOC 0x01 370 #define TSB_FORCEALLOC 0x02 371 #define TSB_GROW 0x04 372 #define TSB_SHRINK 0x08 373 #define TSB_SWAPIN 0x10 374 375 /* 376 * Support for HAT callbacks. 377 */ 378 #define SFMMU_MAX_RELOC_CALLBACKS 10 379 int sfmmu_max_cb_id = SFMMU_MAX_RELOC_CALLBACKS; 380 static id_t sfmmu_cb_nextid = 0; 381 static id_t sfmmu_tsb_cb_id; 382 struct sfmmu_callback *sfmmu_cb_table; 383 384 /* 385 * Kernel page relocation is enabled by default for non-caged 386 * kernel pages. This has little effect unless segkmem_reloc is 387 * set, since by default kernel memory comes from inside the 388 * kernel cage. 389 */ 390 int hat_kpr_enabled = 1; 391 392 kmutex_t kpr_mutex; 393 kmutex_t kpr_suspendlock; 394 kthread_t *kreloc_thread; 395 396 /* 397 * Enable VA->PA translation sanity checking on DEBUG kernels. 398 * Disabled by default. This is incompatible with some 399 * drivers (error injector, RSM) so if it breaks you get 400 * to keep both pieces. 
401 */ 402 int hat_check_vtop = 0; 403 404 /* 405 * Private sfmmu routines (prototypes) 406 */ 407 static struct hme_blk *sfmmu_shadow_hcreate(sfmmu_t *, caddr_t, int, uint_t); 408 static struct hme_blk *sfmmu_hblk_alloc(sfmmu_t *, caddr_t, 409 struct hmehash_bucket *, uint_t, hmeblk_tag, uint_t, 410 uint_t); 411 static caddr_t sfmmu_hblk_unload(struct hat *, struct hme_blk *, caddr_t, 412 caddr_t, demap_range_t *, uint_t); 413 static caddr_t sfmmu_hblk_sync(struct hat *, struct hme_blk *, caddr_t, 414 caddr_t, int); 415 static void sfmmu_hblk_free(struct hme_blk **); 416 static void sfmmu_hblks_list_purge(struct hme_blk **, int); 417 static uint_t sfmmu_get_free_hblk(struct hme_blk **, uint_t); 418 static uint_t sfmmu_put_free_hblk(struct hme_blk *, uint_t); 419 static struct hme_blk *sfmmu_hblk_steal(int); 420 static int sfmmu_steal_this_hblk(struct hmehash_bucket *, 421 struct hme_blk *, uint64_t, struct hme_blk *); 422 static caddr_t sfmmu_hblk_unlock(struct hme_blk *, caddr_t, caddr_t); 423 424 static void hat_do_memload_array(struct hat *, caddr_t, size_t, 425 struct page **, uint_t, uint_t, uint_t); 426 static void hat_do_memload(struct hat *, caddr_t, struct page *, 427 uint_t, uint_t, uint_t); 428 static void sfmmu_memload_batchsmall(struct hat *, caddr_t, page_t **, 429 uint_t, uint_t, pgcnt_t, uint_t); 430 void sfmmu_tteload(struct hat *, tte_t *, caddr_t, page_t *, 431 uint_t); 432 static int sfmmu_tteload_array(sfmmu_t *, tte_t *, caddr_t, page_t **, 433 uint_t, uint_t); 434 static struct hmehash_bucket *sfmmu_tteload_acquire_hashbucket(sfmmu_t *, 435 caddr_t, int, uint_t); 436 static struct hme_blk *sfmmu_tteload_find_hmeblk(sfmmu_t *, 437 struct hmehash_bucket *, caddr_t, uint_t, uint_t, 438 uint_t); 439 static int sfmmu_tteload_addentry(sfmmu_t *, struct hme_blk *, tte_t *, 440 caddr_t, page_t **, uint_t, uint_t); 441 static void sfmmu_tteload_release_hashbucket(struct hmehash_bucket *); 442 443 static int sfmmu_pagearray_setup(caddr_t, page_t **, tte_t *, int); 444 static pfn_t sfmmu_uvatopfn(caddr_t, sfmmu_t *, tte_t *); 445 void sfmmu_memtte(tte_t *, pfn_t, uint_t, int); 446 #ifdef VAC 447 static void sfmmu_vac_conflict(struct hat *, caddr_t, page_t *); 448 static int sfmmu_vacconflict_array(caddr_t, page_t *, int *); 449 int tst_tnc(page_t *pp, pgcnt_t); 450 void conv_tnc(page_t *pp, int); 451 #endif 452 453 static void sfmmu_get_ctx(sfmmu_t *); 454 static void sfmmu_free_sfmmu(sfmmu_t *); 455 456 static void sfmmu_ttesync(struct hat *, caddr_t, tte_t *, page_t *); 457 static void sfmmu_chgattr(struct hat *, caddr_t, size_t, uint_t, int); 458 459 cpuset_t sfmmu_pageunload(page_t *, struct sf_hment *, int); 460 static void hat_pagereload(struct page *, struct page *); 461 static cpuset_t sfmmu_pagesync(page_t *, struct sf_hment *, uint_t); 462 #ifdef VAC 463 void sfmmu_page_cache_array(page_t *, int, int, pgcnt_t); 464 static void sfmmu_page_cache(page_t *, int, int, int); 465 #endif 466 467 cpuset_t sfmmu_rgntlb_demap(caddr_t, sf_region_t *, 468 struct hme_blk *, int); 469 static void sfmmu_tlbcache_demap(caddr_t, sfmmu_t *, struct hme_blk *, 470 pfn_t, int, int, int, int); 471 static void sfmmu_ismtlbcache_demap(caddr_t, sfmmu_t *, struct hme_blk *, 472 pfn_t, int); 473 static void sfmmu_tlb_demap(caddr_t, sfmmu_t *, struct hme_blk *, int, int); 474 static void sfmmu_tlb_range_demap(demap_range_t *); 475 static void sfmmu_invalidate_ctx(sfmmu_t *); 476 static void sfmmu_sync_mmustate(sfmmu_t *); 477 478 static void sfmmu_tsbinfo_setup_phys(struct tsb_info *, pfn_t); 
479 static int sfmmu_tsbinfo_alloc(struct tsb_info **, int, int, uint_t, 480 sfmmu_t *); 481 static void sfmmu_tsb_free(struct tsb_info *); 482 static void sfmmu_tsbinfo_free(struct tsb_info *); 483 static int sfmmu_init_tsbinfo(struct tsb_info *, int, int, uint_t, 484 sfmmu_t *); 485 static void sfmmu_tsb_chk_reloc(sfmmu_t *, hatlock_t *); 486 static void sfmmu_tsb_swapin(sfmmu_t *, hatlock_t *); 487 static int sfmmu_select_tsb_szc(pgcnt_t); 488 static void sfmmu_mod_tsb(sfmmu_t *, caddr_t, tte_t *, int); 489 #define sfmmu_load_tsb(sfmmup, vaddr, tte, szc) \ 490 sfmmu_mod_tsb(sfmmup, vaddr, tte, szc) 491 #define sfmmu_unload_tsb(sfmmup, vaddr, szc) \ 492 sfmmu_mod_tsb(sfmmup, vaddr, NULL, szc) 493 static void sfmmu_copy_tsb(struct tsb_info *, struct tsb_info *); 494 static tsb_replace_rc_t sfmmu_replace_tsb(sfmmu_t *, struct tsb_info *, uint_t, 495 hatlock_t *, uint_t); 496 static void sfmmu_size_tsb(sfmmu_t *, int, uint64_t, uint64_t, int); 497 498 #ifdef VAC 499 void sfmmu_cache_flush(pfn_t, int); 500 void sfmmu_cache_flushcolor(int, pfn_t); 501 #endif 502 static caddr_t sfmmu_hblk_chgattr(sfmmu_t *, struct hme_blk *, caddr_t, 503 caddr_t, demap_range_t *, uint_t, int); 504 505 static uint64_t sfmmu_vtop_attr(uint_t, int mode, tte_t *); 506 static uint_t sfmmu_ptov_attr(tte_t *); 507 static caddr_t sfmmu_hblk_chgprot(sfmmu_t *, struct hme_blk *, caddr_t, 508 caddr_t, demap_range_t *, uint_t); 509 static uint_t sfmmu_vtop_prot(uint_t, uint_t *); 510 static int sfmmu_idcache_constructor(void *, void *, int); 511 static void sfmmu_idcache_destructor(void *, void *); 512 static int sfmmu_hblkcache_constructor(void *, void *, int); 513 static void sfmmu_hblkcache_destructor(void *, void *); 514 static void sfmmu_hblkcache_reclaim(void *); 515 static void sfmmu_shadow_hcleanup(sfmmu_t *, struct hme_blk *, 516 struct hmehash_bucket *); 517 static void sfmmu_hblk_hash_rm(struct hmehash_bucket *, struct hme_blk *, 518 struct hme_blk *, struct hme_blk **, int); 519 static void sfmmu_hblk_hash_add(struct hmehash_bucket *, struct hme_blk *, 520 uint64_t); 521 static struct hme_blk *sfmmu_check_pending_hblks(int); 522 static void sfmmu_free_hblks(sfmmu_t *, caddr_t, caddr_t, int); 523 static void sfmmu_cleanup_rhblk(sf_srd_t *, caddr_t, uint_t, int); 524 static void sfmmu_unload_hmeregion_va(sf_srd_t *, uint_t, caddr_t, caddr_t, 525 int, caddr_t *); 526 static void sfmmu_unload_hmeregion(sf_srd_t *, sf_region_t *); 527 528 static void sfmmu_rm_large_mappings(page_t *, int); 529 530 static void hat_lock_init(void); 531 static void hat_kstat_init(void); 532 static int sfmmu_kstat_percpu_update(kstat_t *ksp, int rw); 533 static void sfmmu_set_scd_rttecnt(sf_srd_t *, sf_scd_t *); 534 static int sfmmu_is_rgnva(sf_srd_t *, caddr_t, ulong_t, ulong_t); 535 static void sfmmu_check_page_sizes(sfmmu_t *, int); 536 int fnd_mapping_sz(page_t *); 537 static void iment_add(struct ism_ment *, struct hat *); 538 static void iment_sub(struct ism_ment *, struct hat *); 539 static pgcnt_t ism_tsb_entries(sfmmu_t *, int szc); 540 extern void sfmmu_setup_tsbinfo(sfmmu_t *); 541 extern void sfmmu_clear_utsbinfo(void); 542 543 static void sfmmu_ctx_wrap_around(mmu_ctx_t *); 544 545 /* kpm globals */ 546 #ifdef DEBUG 547 /* 548 * Enable trap level tsbmiss handling 549 */ 550 int kpm_tsbmtl = 1; 551 552 /* 553 * Flush the TLB on kpm mapout. Note: Xcalls are used (again) for the 554 * required TLB shootdowns in this case, so handle w/ care. Off by default. 
555 */ 556 int kpm_tlb_flush; 557 #endif /* DEBUG */ 558 559 static void *sfmmu_vmem_xalloc_aligned_wrapper(vmem_t *, size_t, int); 560 561 #ifdef DEBUG 562 static void sfmmu_check_hblk_flist(); 563 #endif 564 565 /* 566 * Semi-private sfmmu data structures. Some of them are initialize in 567 * startup or in hat_init. Some of them are private but accessed by 568 * assembly code or mach_sfmmu.c 569 */ 570 struct hmehash_bucket *uhme_hash; /* user hmeblk hash table */ 571 struct hmehash_bucket *khme_hash; /* kernel hmeblk hash table */ 572 uint64_t uhme_hash_pa; /* PA of uhme_hash */ 573 uint64_t khme_hash_pa; /* PA of khme_hash */ 574 int uhmehash_num; /* # of buckets in user hash table */ 575 int khmehash_num; /* # of buckets in kernel hash table */ 576 577 uint_t max_mmu_ctxdoms = 0; /* max context domains in the system */ 578 mmu_ctx_t **mmu_ctxs_tbl; /* global array of context domains */ 579 uint64_t mmu_saved_gnum = 0; /* to init incoming MMUs' gnums */ 580 581 #define DEFAULT_NUM_CTXS_PER_MMU 8192 582 uint_t nctxs = DEFAULT_NUM_CTXS_PER_MMU; 583 584 int cache; /* describes system cache */ 585 586 caddr_t ktsb_base; /* kernel 8k-indexed tsb base address */ 587 uint64_t ktsb_pbase; /* kernel 8k-indexed tsb phys address */ 588 int ktsb_szcode; /* kernel 8k-indexed tsb size code */ 589 int ktsb_sz; /* kernel 8k-indexed tsb size */ 590 591 caddr_t ktsb4m_base; /* kernel 4m-indexed tsb base address */ 592 uint64_t ktsb4m_pbase; /* kernel 4m-indexed tsb phys address */ 593 int ktsb4m_szcode; /* kernel 4m-indexed tsb size code */ 594 int ktsb4m_sz; /* kernel 4m-indexed tsb size */ 595 596 uint64_t kpm_tsbbase; /* kernel seg_kpm 4M TSB base address */ 597 int kpm_tsbsz; /* kernel seg_kpm 4M TSB size code */ 598 uint64_t kpmsm_tsbbase; /* kernel seg_kpm 8K TSB base address */ 599 int kpmsm_tsbsz; /* kernel seg_kpm 8K TSB size code */ 600 601 #ifndef sun4v 602 int utsb_dtlb_ttenum = -1; /* index in TLB for utsb locked TTE */ 603 int utsb4m_dtlb_ttenum = -1; /* index in TLB for 4M TSB TTE */ 604 int dtlb_resv_ttenum; /* index in TLB of first reserved TTE */ 605 caddr_t utsb_vabase; /* reserved kernel virtual memory */ 606 caddr_t utsb4m_vabase; /* for trap handler TSB accesses */ 607 #endif /* sun4v */ 608 uint64_t tsb_alloc_bytes = 0; /* bytes allocated to TSBs */ 609 vmem_t *kmem_tsb_default_arena[NLGRPS_MAX]; /* For dynamic TSBs */ 610 vmem_t *kmem_bigtsb_default_arena[NLGRPS_MAX]; /* dynamic 256M TSBs */ 611 612 /* 613 * Size to use for TSB slabs. Future platforms that support page sizes 614 * larger than 4M may wish to change these values, and provide their own 615 * assembly macros for building and decoding the TSB base register contents. 616 * Note disable_large_pages will override the value set here. 617 */ 618 static uint_t tsb_slab_ttesz = TTE4M; 619 size_t tsb_slab_size = MMU_PAGESIZE4M; 620 uint_t tsb_slab_shift = MMU_PAGESHIFT4M; 621 /* PFN mask for TTE */ 622 size_t tsb_slab_mask = MMU_PAGEOFFSET4M >> MMU_PAGESHIFT; 623 624 /* 625 * Size to use for TSB slabs. These are used only when 256M tsb arenas 626 * exist. 627 */ 628 static uint_t bigtsb_slab_ttesz = TTE256M; 629 static size_t bigtsb_slab_size = MMU_PAGESIZE256M; 630 static uint_t bigtsb_slab_shift = MMU_PAGESHIFT256M; 631 /* 256M page alignment for 8K pfn */ 632 static size_t bigtsb_slab_mask = MMU_PAGEOFFSET256M >> MMU_PAGESHIFT; 633 634 /* largest TSB size to grow to, will be smaller on smaller memory systems */ 635 static int tsb_max_growsize = 0; 636 637 /* 638 * Tunable parameters dealing with TSB policies. 
639 */ 640 641 /* 642 * This undocumented tunable forces all 8K TSBs to be allocated from 643 * the kernel heap rather than from the kmem_tsb_default_arena arenas. 644 */ 645 #ifdef DEBUG 646 int tsb_forceheap = 0; 647 #endif /* DEBUG */ 648 649 /* 650 * Decide whether to use per-lgroup arenas, or one global set of 651 * TSB arenas. The default is not to break up per-lgroup, since 652 * most platforms don't recognize any tangible benefit from it. 653 */ 654 int tsb_lgrp_affinity = 0; 655 656 /* 657 * Used for growing the TSB based on the process RSS. 658 * tsb_rss_factor is based on the smallest TSB, and is 659 * shifted by the TSB size to determine if we need to grow. 660 * The default will grow the TSB if the number of TTEs for 661 * this page size exceeds 75% of the number of TSB entries, 662 * which should _almost_ eliminate all conflict misses 663 * (at the expense of using up lots and lots of memory). 664 */ 665 #define TSB_RSS_FACTOR (TSB_ENTRIES(TSB_MIN_SZCODE) * 0.75) 666 #define SFMMU_RSS_TSBSIZE(tsbszc) (tsb_rss_factor << tsbszc) 667 #define SELECT_TSB_SIZECODE(pgcnt) ( \ 668 (enable_tsb_rss_sizing)? sfmmu_select_tsb_szc(pgcnt) : \ 669 default_tsb_size) 670 #define TSB_OK_SHRINK() \ 671 (tsb_alloc_bytes > tsb_alloc_hiwater || freemem < desfree) 672 #define TSB_OK_GROW() \ 673 (tsb_alloc_bytes < tsb_alloc_hiwater && freemem > desfree) 674 675 int enable_tsb_rss_sizing = 1; 676 int tsb_rss_factor = (int)TSB_RSS_FACTOR; 677 678 /* which TSB size code to use for new address spaces or if rss sizing off */ 679 int default_tsb_size = TSB_8K_SZCODE; 680 681 static uint64_t tsb_alloc_hiwater; /* limit TSB reserved memory */ 682 uint64_t tsb_alloc_hiwater_factor; /* tsb_alloc_hiwater = physmem / this */ 683 #define TSB_ALLOC_HIWATER_FACTOR_DEFAULT 32 684 685 #ifdef DEBUG 686 static int tsb_random_size = 0; /* set to 1 to test random tsb sizes on alloc */ 687 static int tsb_grow_stress = 0; /* if set to 1, keep replacing TSB w/ random */ 688 static int tsb_alloc_mtbf = 0; /* fail allocation every n attempts */ 689 static int tsb_alloc_fail_mtbf = 0; 690 static int tsb_alloc_count = 0; 691 #endif /* DEBUG */ 692 693 /* if set to 1, will remap valid TTEs when growing TSB. */ 694 int tsb_remap_ttes = 1; 695 696 /* 697 * If we have more than this many mappings, allocate a second TSB. 698 * This default is chosen because the I/D fully associative TLBs are 699 * assumed to have at least 8 available entries. Platforms with a 700 * larger fully-associative TLB could probably override the default. 
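 * A threshold of 0 (the sun4v default below) means a second TSB is set up
 * as soon as any such mappings exist at all.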
 */

#ifdef sun4v
int	tsb_sectsb_threshold = 0;
#else
int	tsb_sectsb_threshold = 8;
#endif

/*
 * kstat data
 */
struct sfmmu_global_stat sfmmu_global_stat;
struct sfmmu_tsbsize_stat sfmmu_tsbsize_stat;

/*
 * Global data
 */
sfmmu_t	*ksfmmup;		/* kernel's hat id */

#ifdef DEBUG
static void	chk_tte(tte_t *, tte_t *, tte_t *, struct hme_blk *);
#endif

/* sfmmu locking operations */
static kmutex_t *sfmmu_mlspl_enter(struct page *, int);
static int	sfmmu_mlspl_held(struct page *, int);

kmutex_t *sfmmu_page_enter(page_t *);
void	sfmmu_page_exit(kmutex_t *);
int	sfmmu_page_spl_held(struct page *);

/* sfmmu internal locking operations - accessed directly */
static void	sfmmu_mlist_reloc_enter(page_t *, page_t *,
				kmutex_t **, kmutex_t **);
static void	sfmmu_mlist_reloc_exit(kmutex_t *, kmutex_t *);
static hatlock_t *
		sfmmu_hat_enter(sfmmu_t *);
static hatlock_t *
		sfmmu_hat_tryenter(sfmmu_t *);
static void	sfmmu_hat_exit(hatlock_t *);
static void	sfmmu_hat_lock_all(void);
static void	sfmmu_hat_unlock_all(void);
static void	sfmmu_ismhat_enter(sfmmu_t *, int);
static void	sfmmu_ismhat_exit(sfmmu_t *, int);

/*
 * Array of mutexes protecting a page's mapping list and p_nrm field.
 *
 * The hash function looks complicated, but is made up so that:
 *
 * "pp" not shifted, so adjacent pp values will hash to different cache lines
 *  (8 byte alignment * 8 bytes/mutex == 64 byte coherency subblock)
 *
 * "pp" >> mml_shift, incorporates more source bits into the hash result
 *
 * "& (mml_table_sz - 1)", should be faster than using remainder "%"
 *
 * Hopefully, mml_table, mml_table_sz and mml_shift are all in the same
 * cacheline, since they get declared next to each other below.  We'll trust
 * ld not to do something random.
 */
#ifdef	DEBUG
int mlist_hash_debug = 0;
#define	MLIST_HASH(pp)	(mlist_hash_debug ? &mml_table[0] : \
	&mml_table[((uintptr_t)(pp) + \
	((uintptr_t)(pp) >> mml_shift)) & (mml_table_sz - 1)])
#else	/* !DEBUG */
#define	MLIST_HASH(pp)   &mml_table[ \
	((uintptr_t)(pp) + ((uintptr_t)(pp) >> mml_shift)) & (mml_table_sz - 1)]
#endif	/* !DEBUG */

kmutex_t		*mml_table;
uint_t			mml_table_sz;	/* must be a power of 2 */
uint_t			mml_shift;	/* log2(mml_table_sz) + 3 for align */

kpm_hlk_t	*kpmp_table;
uint_t		kpmp_table_sz;	/* must be a power of 2 */
uchar_t		kpmp_shift;

kpm_shlk_t	*kpmp_stable;
uint_t		kpmp_stable_sz;	/* must be a power of 2 */

/*
 * SPL_HASH was improved to avoid false cache line sharing
 */
#define	SPL_TABLE_SIZE	128
#define	SPL_MASK	(SPL_TABLE_SIZE - 1)
#define	SPL_SHIFT	7		/* log2(SPL_TABLE_SIZE) */

#define	SPL_INDEX(pp) \
	((((uintptr_t)(pp) >> SPL_SHIFT) ^ \
	((uintptr_t)(pp) >> (SPL_SHIFT << 1))) & \
	(SPL_TABLE_SIZE - 1))

#define	SPL_HASH(pp)    \
	(&sfmmu_page_lock[SPL_INDEX(pp) & SPL_MASK].pad_mutex)

static	pad_mutex_t	sfmmu_page_lock[SPL_TABLE_SIZE];


/*
 * hat_unload_callback() will group together callbacks in order
 * to avoid xt_sync() calls.  This is the maximum size of the group.
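 * For example, one large hat_unload_callback() can queue up to MAX_CB_ADDR
 * unloaded ranges and then issue a single xt_sync() for the whole batch
 * instead of one per range.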
804 */ 805 #define MAX_CB_ADDR 32 806 807 tte_t hw_tte; 808 static ulong_t sfmmu_dmr_maxbit = DMR_MAXBIT; 809 810 static char *mmu_ctx_kstat_names[] = { 811 "mmu_ctx_tsb_exceptions", 812 "mmu_ctx_tsb_raise_exception", 813 "mmu_ctx_wrap_around", 814 }; 815 816 /* 817 * Wrapper for vmem_xalloc since vmem_create only allows limited 818 * parameters for vm_source_alloc functions. This function allows us 819 * to specify alignment consistent with the size of the object being 820 * allocated. 821 */ 822 static void * 823 sfmmu_vmem_xalloc_aligned_wrapper(vmem_t *vmp, size_t size, int vmflag) 824 { 825 return (vmem_xalloc(vmp, size, size, 0, 0, NULL, NULL, vmflag)); 826 } 827 828 /* Common code for setting tsb_alloc_hiwater. */ 829 #define SFMMU_SET_TSB_ALLOC_HIWATER(pages) tsb_alloc_hiwater = \ 830 ptob(pages) / tsb_alloc_hiwater_factor 831 832 /* 833 * Set tsb_max_growsize to allow at most all of physical memory to be mapped by 834 * a single TSB. physmem is the number of physical pages so we need physmem 8K 835 * TTEs to represent all those physical pages. We round this up by using 836 * 1<<highbit(). To figure out which size code to use, remember that the size 837 * code is just an amount to shift the smallest TSB size to get the size of 838 * this TSB. So we subtract that size, TSB_START_SIZE, from highbit() (or 839 * highbit() - 1) to get the size code for the smallest TSB that can represent 840 * all of physical memory, while erring on the side of too much. 841 * 842 * Restrict tsb_max_growsize to make sure that: 843 * 1) TSBs can't grow larger than the TSB slab size 844 * 2) TSBs can't grow larger than UTSB_MAX_SZCODE. 845 */ 846 #define SFMMU_SET_TSB_MAX_GROWSIZE(pages) { \ 847 int _i, _szc, _slabszc, _tsbszc; \ 848 \ 849 _i = highbit(pages); \ 850 if ((1 << (_i - 1)) == (pages)) \ 851 _i--; /* 2^n case, round down */ \ 852 _szc = _i - TSB_START_SIZE; \ 853 _slabszc = bigtsb_slab_shift - (TSB_START_SIZE + TSB_ENTRY_SHIFT); \ 854 _tsbszc = MIN(_szc, _slabszc); \ 855 tsb_max_growsize = MIN(_tsbszc, UTSB_MAX_SZCODE); \ 856 } 857 858 /* 859 * Given a pointer to an sfmmu and a TTE size code, return a pointer to the 860 * tsb_info which handles that TTE size. 861 */ 862 #define SFMMU_GET_TSBINFO(tsbinfop, sfmmup, tte_szc) { \ 863 (tsbinfop) = (sfmmup)->sfmmu_tsb; \ 864 ASSERT(((tsbinfop)->tsb_flags & TSB_SHAREDCTX) || \ 865 sfmmu_hat_lock_held(sfmmup)); \ 866 if ((tte_szc) >= TTE4M) { \ 867 ASSERT((tsbinfop) != NULL); \ 868 (tsbinfop) = (tsbinfop)->tsb_next; \ 869 } \ 870 } 871 872 /* 873 * Macro to use to unload entries from the TSB. 874 * It has knowledge of which page sizes get replicated in the TSB 875 * and will call the appropriate unload routine for the appropriate size. 876 */ 877 #define SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, ismhat) \ 878 { \ 879 int ttesz = get_hblk_ttesz(hmeblkp); \ 880 if (ttesz == TTE8K || ttesz == TTE4M) { \ 881 sfmmu_unload_tsb(sfmmup, addr, ttesz); \ 882 } else { \ 883 caddr_t sva = ismhat ? addr : \ 884 (caddr_t)get_hblk_base(hmeblkp); \ 885 caddr_t eva = sva + get_hblk_span(hmeblkp); \ 886 ASSERT(addr >= sva && addr < eva); \ 887 sfmmu_unload_tsb_range(sfmmup, sva, eva, ttesz); \ 888 } \ 889 } 890 891 892 /* Update tsb_alloc_hiwater after memory is configured. */ 893 /*ARGSUSED*/ 894 static void 895 sfmmu_update_post_add(void *arg, pgcnt_t delta_pages) 896 { 897 /* Assumes physmem has already been updated. 
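	 * E.g. after a memory add completes, physmem reflects the new page
	 * count, so the TSB allocation high-water mark and tsb_max_growsize
	 * are simply recomputed from it below.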
	 */
	SFMMU_SET_TSB_ALLOC_HIWATER(physmem);
	SFMMU_SET_TSB_MAX_GROWSIZE(physmem);
}

/*
 * Update tsb_alloc_hiwater before memory is deleted.  We'll do nothing here
 * and update tsb_alloc_hiwater and tsb_max_growsize after the memory is
 * deleted.
 */
/*ARGSUSED*/
static int
sfmmu_update_pre_del(void *arg, pgcnt_t delta_pages)
{
	return (0);
}

/* Update tsb_alloc_hiwater after memory fails to be unconfigured. */
/*ARGSUSED*/
static void
sfmmu_update_post_del(void *arg, pgcnt_t delta_pages, int cancelled)
{
	/*
	 * Whether the delete was cancelled or not, just go ahead and update
	 * tsb_alloc_hiwater and tsb_max_growsize.
	 */
	SFMMU_SET_TSB_ALLOC_HIWATER(physmem);
	SFMMU_SET_TSB_MAX_GROWSIZE(physmem);
}

static kphysm_setup_vector_t sfmmu_update_vec = {
	KPHYSM_SETUP_VECTOR_VERSION,	/* version */
	sfmmu_update_post_add,		/* post_add */
	sfmmu_update_pre_del,		/* pre_del */
	sfmmu_update_post_del		/* post_del */
};


/*
 * HME_BLK HASH PRIMITIVES
 */

/*
 * Enter a hme on the mapping list for page pp.
 * When large pages are more prevalent in the system we might want to
 * keep the mapping list in ascending order by the hment size. For now,
 * small pages are more frequent, so don't slow it down.
 */
#define	HME_ADD(hme, pp)					\
{								\
	ASSERT(sfmmu_mlist_held(pp));				\
								\
	hme->hme_prev = NULL;					\
	hme->hme_next = pp->p_mapping;				\
	hme->hme_page = pp;					\
	if (pp->p_mapping) {					\
		((struct sf_hment *)(pp->p_mapping))->hme_prev = hme;\
		ASSERT(pp->p_share > 0);			\
	} else	{						\
		/* EMPTY */					\
		ASSERT(pp->p_share == 0);			\
	}							\
	pp->p_mapping = hme;					\
	pp->p_share++;						\
}

/*
 * Remove a hme from the mapping list for page pp.
 * If we are unmapping a large translation, we need to make sure that the
 * change is reflected in the corresponding bit of the p_index field.
 */
#define	HME_SUB(hme, pp)					\
{								\
	ASSERT(sfmmu_mlist_held(pp));				\
	ASSERT(hme->hme_page == pp || IS_PAHME(hme));		\
								\
	if (pp->p_mapping == NULL) {				\
		panic("hme_remove - no mappings");		\
	}							\
								\
	membar_stst();	/* ensure previous stores finish */	\
								\
	ASSERT(pp->p_share > 0);				\
	pp->p_share--;						\
								\
	if (hme->hme_prev) {					\
		ASSERT(pp->p_mapping != hme);			\
		ASSERT(hme->hme_prev->hme_page == pp ||		\
			IS_PAHME(hme->hme_prev));		\
		hme->hme_prev->hme_next = hme->hme_next;	\
	} else {						\
		ASSERT(pp->p_mapping == hme);			\
		pp->p_mapping = hme->hme_next;			\
		ASSERT((pp->p_mapping == NULL) ?		\
			(pp->p_share == 0) : 1);		\
	}							\
								\
	if (hme->hme_next) {					\
		ASSERT(hme->hme_next->hme_page == pp ||		\
			IS_PAHME(hme->hme_next));		\
		hme->hme_next->hme_prev = hme->hme_prev;	\
	}							\
								\
	/* zero out the entry */				\
	hme->hme_next = NULL;					\
	hme->hme_prev = NULL;					\
	hme->hme_page = NULL;					\
								\
	if (hme_size(hme) > TTE8K) {				\
		/* remove mappings for remainder of large pg */	\
		sfmmu_rm_large_mappings(pp, hme_size(hme));	\
	}							\
}

/*
 * This function returns the hment given the hme_blk and a vaddr.
 * It assumes addr has already been checked to belong to hme_blk's
 * range.
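 * Roughly: for an 8K hmeblk the hment index is the page offset of addr
 * within the block, ((addr >> MMU_PAGESHIFT) & (NHMENTS - 1)); hmeblks for
 * larger page sizes hold a single hment at index 0 (see HBLKTOHME_IDX).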
1015 */ 1016 #define HBLKTOHME(hment, hmeblkp, addr) \ 1017 { \ 1018 int index; \ 1019 HBLKTOHME_IDX(hment, hmeblkp, addr, index) \ 1020 } 1021 1022 /* 1023 * Version of HBLKTOHME that also returns the index in hmeblkp 1024 * of the hment. 1025 */ 1026 #define HBLKTOHME_IDX(hment, hmeblkp, addr, idx) \ 1027 { \ 1028 ASSERT(in_hblk_range((hmeblkp), (addr))); \ 1029 \ 1030 if (get_hblk_ttesz(hmeblkp) == TTE8K) { \ 1031 idx = (((uintptr_t)(addr) >> MMU_PAGESHIFT) & (NHMENTS-1)); \ 1032 } else \ 1033 idx = 0; \ 1034 \ 1035 (hment) = &(hmeblkp)->hblk_hme[idx]; \ 1036 } 1037 1038 /* 1039 * Disable any page sizes not supported by the CPU 1040 */ 1041 void 1042 hat_init_pagesizes() 1043 { 1044 int i; 1045 1046 mmu_exported_page_sizes = 0; 1047 for (i = TTE8K; i < max_mmu_page_sizes; i++) { 1048 1049 szc_2_userszc[i] = (uint_t)-1; 1050 userszc_2_szc[i] = (uint_t)-1; 1051 1052 if ((mmu_exported_pagesize_mask & (1 << i)) == 0) { 1053 disable_large_pages |= (1 << i); 1054 } else { 1055 szc_2_userszc[i] = mmu_exported_page_sizes; 1056 userszc_2_szc[mmu_exported_page_sizes] = i; 1057 mmu_exported_page_sizes++; 1058 } 1059 } 1060 1061 disable_ism_large_pages |= disable_large_pages; 1062 disable_auto_data_large_pages = disable_large_pages; 1063 disable_auto_text_large_pages = disable_large_pages; 1064 1065 /* 1066 * Initialize mmu-specific large page sizes. 1067 */ 1068 if (&mmu_large_pages_disabled) { 1069 disable_large_pages |= mmu_large_pages_disabled(HAT_LOAD); 1070 disable_ism_large_pages |= 1071 mmu_large_pages_disabled(HAT_LOAD_SHARE); 1072 disable_auto_data_large_pages |= 1073 mmu_large_pages_disabled(HAT_AUTO_DATA); 1074 disable_auto_text_large_pages |= 1075 mmu_large_pages_disabled(HAT_AUTO_TEXT); 1076 } 1077 } 1078 1079 /* 1080 * Initialize the hardware address translation structures. 1081 */ 1082 void 1083 hat_init(void) 1084 { 1085 int i; 1086 uint_t sz; 1087 size_t size; 1088 1089 hat_lock_init(); 1090 hat_kstat_init(); 1091 1092 /* 1093 * Hardware-only bits in a TTE 1094 */ 1095 MAKE_TTE_MASK(&hw_tte); 1096 1097 hat_init_pagesizes(); 1098 1099 /* Initialize the hash locks */ 1100 for (i = 0; i < khmehash_num; i++) { 1101 mutex_init(&khme_hash[i].hmehash_mutex, NULL, 1102 MUTEX_DEFAULT, NULL); 1103 khme_hash[i].hmeh_nextpa = HMEBLK_ENDPA; 1104 } 1105 for (i = 0; i < uhmehash_num; i++) { 1106 mutex_init(&uhme_hash[i].hmehash_mutex, NULL, 1107 MUTEX_DEFAULT, NULL); 1108 uhme_hash[i].hmeh_nextpa = HMEBLK_ENDPA; 1109 } 1110 khmehash_num--; /* make sure counter starts from 0 */ 1111 uhmehash_num--; /* make sure counter starts from 0 */ 1112 1113 /* 1114 * Allocate context domain structures. 1115 * 1116 * A platform may choose to modify max_mmu_ctxdoms in 1117 * set_platform_defaults(). If a platform does not define 1118 * a set_platform_defaults() or does not choose to modify 1119 * max_mmu_ctxdoms, it gets one MMU context domain for every CPU. 1120 * 1121 * For sun4v, there will be one global context domain, this is to 1122 * avoid the ldom cpu substitution problem. 1123 * 1124 * For all platforms that have CPUs sharing MMUs, this 1125 * value must be defined. 
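	 * So, for example, a sun4u system that does not override the value
	 * ends up with max_ncpus context domains (one per possible CPU),
	 * while sun4v always uses a single global domain.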
1126 */ 1127 if (max_mmu_ctxdoms == 0) { 1128 #ifndef sun4v 1129 max_mmu_ctxdoms = max_ncpus; 1130 #else /* sun4v */ 1131 max_mmu_ctxdoms = 1; 1132 #endif /* sun4v */ 1133 } 1134 1135 size = max_mmu_ctxdoms * sizeof (mmu_ctx_t *); 1136 mmu_ctxs_tbl = kmem_zalloc(size, KM_SLEEP); 1137 1138 /* mmu_ctx_t is 64 bytes aligned */ 1139 mmuctxdom_cache = kmem_cache_create("mmuctxdom_cache", 1140 sizeof (mmu_ctx_t), 64, NULL, NULL, NULL, NULL, NULL, 0); 1141 /* 1142 * MMU context domain initialization for the Boot CPU. 1143 * This needs the context domains array allocated above. 1144 */ 1145 mutex_enter(&cpu_lock); 1146 sfmmu_cpu_init(CPU); 1147 mutex_exit(&cpu_lock); 1148 1149 /* 1150 * Intialize ism mapping list lock. 1151 */ 1152 1153 mutex_init(&ism_mlist_lock, NULL, MUTEX_DEFAULT, NULL); 1154 1155 /* 1156 * Each sfmmu structure carries an array of MMU context info 1157 * structures, one per context domain. The size of this array depends 1158 * on the maximum number of context domains. So, the size of the 1159 * sfmmu structure varies per platform. 1160 * 1161 * sfmmu is allocated from static arena, because trap 1162 * handler at TL > 0 is not allowed to touch kernel relocatable 1163 * memory. sfmmu's alignment is changed to 64 bytes from 1164 * default 8 bytes, as the lower 6 bits will be used to pass 1165 * pgcnt to vtag_flush_pgcnt_tl1. 1166 */ 1167 size = sizeof (sfmmu_t) + sizeof (sfmmu_ctx_t) * (max_mmu_ctxdoms - 1); 1168 1169 sfmmuid_cache = kmem_cache_create("sfmmuid_cache", size, 1170 64, sfmmu_idcache_constructor, sfmmu_idcache_destructor, 1171 NULL, NULL, static_arena, 0); 1172 1173 sfmmu_tsbinfo_cache = kmem_cache_create("sfmmu_tsbinfo_cache", 1174 sizeof (struct tsb_info), 0, NULL, NULL, NULL, NULL, NULL, 0); 1175 1176 /* 1177 * Since we only use the tsb8k cache to "borrow" pages for TSBs 1178 * from the heap when low on memory or when TSB_FORCEALLOC is 1179 * specified, don't use magazines to cache them--we want to return 1180 * them to the system as quickly as possible. 1181 */ 1182 sfmmu_tsb8k_cache = kmem_cache_create("sfmmu_tsb8k_cache", 1183 MMU_PAGESIZE, MMU_PAGESIZE, NULL, NULL, NULL, NULL, 1184 static_arena, KMC_NOMAGAZINE); 1185 1186 /* 1187 * Set tsb_alloc_hiwater to 1/tsb_alloc_hiwater_factor of physical 1188 * memory, which corresponds to the old static reserve for TSBs. 1189 * tsb_alloc_hiwater_factor defaults to 32. This caps the amount of 1190 * memory we'll allocate for TSB slabs; beyond this point TSB 1191 * allocations will be taken from the kernel heap (via 1192 * sfmmu_tsb8k_cache) and will be throttled as would any other kmem 1193 * consumer. 1194 */ 1195 if (tsb_alloc_hiwater_factor == 0) { 1196 tsb_alloc_hiwater_factor = TSB_ALLOC_HIWATER_FACTOR_DEFAULT; 1197 } 1198 SFMMU_SET_TSB_ALLOC_HIWATER(physmem); 1199 1200 for (sz = tsb_slab_ttesz; sz > 0; sz--) { 1201 if (!(disable_large_pages & (1 << sz))) 1202 break; 1203 } 1204 1205 if (sz < tsb_slab_ttesz) { 1206 tsb_slab_ttesz = sz; 1207 tsb_slab_shift = MMU_PAGESHIFT + (sz << 1) + sz; 1208 tsb_slab_size = 1 << tsb_slab_shift; 1209 tsb_slab_mask = (1 << (tsb_slab_shift - MMU_PAGESHIFT)) - 1; 1210 use_bigtsb_arena = 0; 1211 } else if (use_bigtsb_arena && 1212 (disable_large_pages & (1 << bigtsb_slab_ttesz))) { 1213 use_bigtsb_arena = 0; 1214 } 1215 1216 if (!use_bigtsb_arena) { 1217 bigtsb_slab_shift = tsb_slab_shift; 1218 } 1219 SFMMU_SET_TSB_MAX_GROWSIZE(physmem); 1220 1221 /* 1222 * On smaller memory systems, allocate TSB memory in smaller chunks 1223 * than the default 4M slab size. 
We also honor disable_large_pages 1224 * here. 1225 * 1226 * The trap handlers need to be patched with the final slab shift, 1227 * since they need to be able to construct the TSB pointer at runtime. 1228 */ 1229 if ((tsb_max_growsize <= TSB_512K_SZCODE) && 1230 !(disable_large_pages & (1 << TTE512K))) { 1231 tsb_slab_ttesz = TTE512K; 1232 tsb_slab_shift = MMU_PAGESHIFT512K; 1233 tsb_slab_size = MMU_PAGESIZE512K; 1234 tsb_slab_mask = MMU_PAGEOFFSET512K >> MMU_PAGESHIFT; 1235 use_bigtsb_arena = 0; 1236 } 1237 1238 if (!use_bigtsb_arena) { 1239 bigtsb_slab_ttesz = tsb_slab_ttesz; 1240 bigtsb_slab_shift = tsb_slab_shift; 1241 bigtsb_slab_size = tsb_slab_size; 1242 bigtsb_slab_mask = tsb_slab_mask; 1243 } 1244 1245 1246 /* 1247 * Set up memory callback to update tsb_alloc_hiwater and 1248 * tsb_max_growsize. 1249 */ 1250 i = kphysm_setup_func_register(&sfmmu_update_vec, (void *) 0); 1251 ASSERT(i == 0); 1252 1253 /* 1254 * kmem_tsb_arena is the source from which large TSB slabs are 1255 * drawn. The quantum of this arena corresponds to the largest 1256 * TSB size we can dynamically allocate for user processes. 1257 * Currently it must also be a supported page size since we 1258 * use exactly one translation entry to map each slab page. 1259 * 1260 * The per-lgroup kmem_tsb_default_arena arenas are the arenas from 1261 * which most TSBs are allocated. Since most TSB allocations are 1262 * typically 8K we have a kmem cache we stack on top of each 1263 * kmem_tsb_default_arena to speed up those allocations. 1264 * 1265 * Note the two-level scheme of arenas is required only 1266 * because vmem_create doesn't allow us to specify alignment 1267 * requirements. If this ever changes the code could be 1268 * simplified to use only one level of arenas. 1269 * 1270 * If 256M page support exists on sun4v, 256MB kmem_bigtsb_arena 1271 * will be provided in addition to the 4M kmem_tsb_arena. 
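	 * In other words, kmem_tsb_arena (and kmem_bigtsb_arena when present)
	 * carve out large, size-aligned slabs, and the kmem_tsb_default_arena
	 * and sfmmu_tsb_cache layers then hand out page-sized pieces of those
	 * slabs for individual TSBs.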
1272 */ 1273 if (use_bigtsb_arena) { 1274 kmem_bigtsb_arena = vmem_create("kmem_bigtsb", NULL, 0, 1275 bigtsb_slab_size, sfmmu_vmem_xalloc_aligned_wrapper, 1276 vmem_xfree, heap_arena, 0, VM_SLEEP); 1277 } 1278 1279 kmem_tsb_arena = vmem_create("kmem_tsb", NULL, 0, tsb_slab_size, 1280 sfmmu_vmem_xalloc_aligned_wrapper, 1281 vmem_xfree, heap_arena, 0, VM_SLEEP); 1282 1283 if (tsb_lgrp_affinity) { 1284 char s[50]; 1285 for (i = 0; i < NLGRPS_MAX; i++) { 1286 if (use_bigtsb_arena) { 1287 (void) sprintf(s, "kmem_bigtsb_lgrp%d", i); 1288 kmem_bigtsb_default_arena[i] = vmem_create(s, 1289 NULL, 0, 2 * tsb_slab_size, 1290 sfmmu_tsb_segkmem_alloc, 1291 sfmmu_tsb_segkmem_free, kmem_bigtsb_arena, 1292 0, VM_SLEEP | VM_BESTFIT); 1293 } 1294 1295 (void) sprintf(s, "kmem_tsb_lgrp%d", i); 1296 kmem_tsb_default_arena[i] = vmem_create(s, 1297 NULL, 0, PAGESIZE, sfmmu_tsb_segkmem_alloc, 1298 sfmmu_tsb_segkmem_free, kmem_tsb_arena, 0, 1299 VM_SLEEP | VM_BESTFIT); 1300 1301 (void) sprintf(s, "sfmmu_tsb_lgrp%d_cache", i); 1302 sfmmu_tsb_cache[i] = kmem_cache_create(s, 1303 PAGESIZE, PAGESIZE, NULL, NULL, NULL, NULL, 1304 kmem_tsb_default_arena[i], 0); 1305 } 1306 } else { 1307 if (use_bigtsb_arena) { 1308 kmem_bigtsb_default_arena[0] = 1309 vmem_create("kmem_bigtsb_default", NULL, 0, 1310 2 * tsb_slab_size, sfmmu_tsb_segkmem_alloc, 1311 sfmmu_tsb_segkmem_free, kmem_bigtsb_arena, 0, 1312 VM_SLEEP | VM_BESTFIT); 1313 } 1314 1315 kmem_tsb_default_arena[0] = vmem_create("kmem_tsb_default", 1316 NULL, 0, PAGESIZE, sfmmu_tsb_segkmem_alloc, 1317 sfmmu_tsb_segkmem_free, kmem_tsb_arena, 0, 1318 VM_SLEEP | VM_BESTFIT); 1319 sfmmu_tsb_cache[0] = kmem_cache_create("sfmmu_tsb_cache", 1320 PAGESIZE, PAGESIZE, NULL, NULL, NULL, NULL, 1321 kmem_tsb_default_arena[0], 0); 1322 } 1323 1324 sfmmu8_cache = kmem_cache_create("sfmmu8_cache", HME8BLK_SZ, 1325 HMEBLK_ALIGN, sfmmu_hblkcache_constructor, 1326 sfmmu_hblkcache_destructor, 1327 sfmmu_hblkcache_reclaim, (void *)HME8BLK_SZ, 1328 hat_memload_arena, KMC_NOHASH); 1329 1330 hat_memload1_arena = vmem_create("hat_memload1", NULL, 0, PAGESIZE, 1331 segkmem_alloc_permanent, segkmem_free, heap_arena, 0, VM_SLEEP); 1332 1333 sfmmu1_cache = kmem_cache_create("sfmmu1_cache", HME1BLK_SZ, 1334 HMEBLK_ALIGN, sfmmu_hblkcache_constructor, 1335 sfmmu_hblkcache_destructor, 1336 NULL, (void *)HME1BLK_SZ, 1337 hat_memload1_arena, KMC_NOHASH); 1338 1339 pa_hment_cache = kmem_cache_create("pa_hment_cache", PAHME_SZ, 1340 0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH); 1341 1342 ism_blk_cache = kmem_cache_create("ism_blk_cache", 1343 sizeof (ism_blk_t), ecache_alignsize, NULL, NULL, 1344 NULL, NULL, static_arena, KMC_NOHASH); 1345 1346 ism_ment_cache = kmem_cache_create("ism_ment_cache", 1347 sizeof (ism_ment_t), 0, NULL, NULL, 1348 NULL, NULL, NULL, 0); 1349 1350 /* 1351 * We grab the first hat for the kernel, 1352 */ 1353 AS_LOCK_ENTER(&kas, &kas.a_lock, RW_WRITER); 1354 kas.a_hat = hat_alloc(&kas); 1355 AS_LOCK_EXIT(&kas, &kas.a_lock); 1356 1357 /* 1358 * Initialize hblk_reserve. 1359 */ 1360 ((struct hme_blk *)hblk_reserve)->hblk_nextpa = 1361 va_to_pa((caddr_t)hblk_reserve); 1362 1363 #ifndef UTSB_PHYS 1364 /* 1365 * Reserve some kernel virtual address space for the locked TTEs 1366 * that allow us to probe the TSB from TL>0. 
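	 * utsb_vabase and utsb4m_vabase each get tsb_slab_size bytes of VA
	 * here; the trap handlers map the current process's TSBs into that
	 * space using the locked TTEs reserved for that purpose.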
1367 */ 1368 utsb_vabase = vmem_xalloc(heap_arena, tsb_slab_size, tsb_slab_size, 1369 0, 0, NULL, NULL, VM_SLEEP); 1370 utsb4m_vabase = vmem_xalloc(heap_arena, tsb_slab_size, tsb_slab_size, 1371 0, 0, NULL, NULL, VM_SLEEP); 1372 #endif 1373 1374 #ifdef VAC 1375 /* 1376 * The big page VAC handling code assumes VAC 1377 * will not be bigger than the smallest big 1378 * page- which is 64K. 1379 */ 1380 if (TTEPAGES(TTE64K) < CACHE_NUM_COLOR) { 1381 cmn_err(CE_PANIC, "VAC too big!"); 1382 } 1383 #endif 1384 1385 (void) xhat_init(); 1386 1387 uhme_hash_pa = va_to_pa(uhme_hash); 1388 khme_hash_pa = va_to_pa(khme_hash); 1389 1390 /* 1391 * Initialize relocation locks. kpr_suspendlock is held 1392 * at PIL_MAX to prevent interrupts from pinning the holder 1393 * of a suspended TTE which may access it leading to a 1394 * deadlock condition. 1395 */ 1396 mutex_init(&kpr_mutex, NULL, MUTEX_DEFAULT, NULL); 1397 mutex_init(&kpr_suspendlock, NULL, MUTEX_SPIN, (void *)PIL_MAX); 1398 1399 /* 1400 * If Shared context support is disabled via /etc/system 1401 * set shctx_on to 0 here if it was set to 1 earlier in boot 1402 * sequence by cpu module initialization code. 1403 */ 1404 if (shctx_on && disable_shctx) { 1405 shctx_on = 0; 1406 } 1407 1408 if (shctx_on) { 1409 srd_buckets = kmem_zalloc(SFMMU_MAX_SRD_BUCKETS * 1410 sizeof (srd_buckets[0]), KM_SLEEP); 1411 for (i = 0; i < SFMMU_MAX_SRD_BUCKETS; i++) { 1412 mutex_init(&srd_buckets[i].srdb_lock, NULL, 1413 MUTEX_DEFAULT, NULL); 1414 } 1415 1416 srd_cache = kmem_cache_create("srd_cache", sizeof (sf_srd_t), 1417 0, sfmmu_srdcache_constructor, sfmmu_srdcache_destructor, 1418 NULL, NULL, NULL, 0); 1419 region_cache = kmem_cache_create("region_cache", 1420 sizeof (sf_region_t), 0, sfmmu_rgncache_constructor, 1421 sfmmu_rgncache_destructor, NULL, NULL, NULL, 0); 1422 scd_cache = kmem_cache_create("scd_cache", sizeof (sf_scd_t), 1423 0, sfmmu_scdcache_constructor, sfmmu_scdcache_destructor, 1424 NULL, NULL, NULL, 0); 1425 } 1426 1427 /* 1428 * Pre-allocate hrm_hashtab before enabling the collection of 1429 * refmod statistics. Allocating on the fly would mean us 1430 * running the risk of suffering recursive mutex enters or 1431 * deadlocks. 1432 */ 1433 hrm_hashtab = kmem_zalloc(HRM_HASHSIZE * sizeof (struct hrmstat *), 1434 KM_SLEEP); 1435 1436 /* Allocate per-cpu pending freelist of hmeblks */ 1437 cpu_hme_pend = kmem_zalloc((NCPU * sizeof (cpu_hme_pend_t)) + 64, 1438 KM_SLEEP); 1439 cpu_hme_pend = (cpu_hme_pend_t *)P2ROUNDUP( 1440 (uintptr_t)cpu_hme_pend, 64); 1441 1442 for (i = 0; i < NCPU; i++) { 1443 mutex_init(&cpu_hme_pend[i].chp_mutex, NULL, MUTEX_DEFAULT, 1444 NULL); 1445 } 1446 1447 if (cpu_hme_pend_thresh == 0) { 1448 cpu_hme_pend_thresh = CPU_HME_PEND_THRESH; 1449 } 1450 } 1451 1452 /* 1453 * Initialize locking for the hat layer, called early during boot. 1454 */ 1455 static void 1456 hat_lock_init() 1457 { 1458 int i; 1459 1460 /* 1461 * initialize the array of mutexes protecting a page's mapping 1462 * list and p_nrm field. 1463 */ 1464 for (i = 0; i < mml_table_sz; i++) 1465 mutex_init(&mml_table[i], NULL, MUTEX_DEFAULT, NULL); 1466 1467 if (kpm_enable) { 1468 for (i = 0; i < kpmp_table_sz; i++) { 1469 mutex_init(&kpmp_table[i].khl_mutex, NULL, 1470 MUTEX_DEFAULT, NULL); 1471 } 1472 } 1473 1474 /* 1475 * Initialize array of mutex locks that protects sfmmu fields and 1476 * TSB lists. 
1477 */ 1478 for (i = 0; i < SFMMU_NUM_LOCK; i++) 1479 mutex_init(HATLOCK_MUTEXP(&hat_lock[i]), NULL, MUTEX_DEFAULT, 1480 NULL); 1481 } 1482 1483 #define SFMMU_KERNEL_MAXVA \ 1484 (kmem64_base ? (uintptr_t)kmem64_end : (SYSLIMIT)) 1485 1486 /* 1487 * Allocate a hat structure. 1488 * Called when an address space first uses a hat. 1489 */ 1490 struct hat * 1491 hat_alloc(struct as *as) 1492 { 1493 sfmmu_t *sfmmup; 1494 int i; 1495 uint64_t cnum; 1496 extern uint_t get_color_start(struct as *); 1497 1498 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 1499 sfmmup = kmem_cache_alloc(sfmmuid_cache, KM_SLEEP); 1500 sfmmup->sfmmu_as = as; 1501 sfmmup->sfmmu_flags = 0; 1502 sfmmup->sfmmu_tteflags = 0; 1503 sfmmup->sfmmu_rtteflags = 0; 1504 LOCK_INIT_CLEAR(&sfmmup->sfmmu_ctx_lock); 1505 1506 if (as == &kas) { 1507 ksfmmup = sfmmup; 1508 sfmmup->sfmmu_cext = 0; 1509 cnum = KCONTEXT; 1510 1511 sfmmup->sfmmu_clrstart = 0; 1512 sfmmup->sfmmu_tsb = NULL; 1513 /* 1514 * hat_kern_setup() will call sfmmu_init_ktsbinfo() 1515 * to setup tsb_info for ksfmmup. 1516 */ 1517 } else { 1518 1519 /* 1520 * Just set to invalid ctx. When it faults, it will 1521 * get a valid ctx. This would avoid the situation 1522 * where we get a ctx, but it gets stolen and then 1523 * we fault when we try to run and so have to get 1524 * another ctx. 1525 */ 1526 sfmmup->sfmmu_cext = 0; 1527 cnum = INVALID_CONTEXT; 1528 1529 /* initialize original physical page coloring bin */ 1530 sfmmup->sfmmu_clrstart = get_color_start(as); 1531 #ifdef DEBUG 1532 if (tsb_random_size) { 1533 uint32_t randval = (uint32_t)gettick() >> 4; 1534 int size = randval % (tsb_max_growsize + 1); 1535 1536 /* chose a random tsb size for stress testing */ 1537 (void) sfmmu_tsbinfo_alloc(&sfmmup->sfmmu_tsb, size, 1538 TSB8K|TSB64K|TSB512K, 0, sfmmup); 1539 } else 1540 #endif /* DEBUG */ 1541 (void) sfmmu_tsbinfo_alloc(&sfmmup->sfmmu_tsb, 1542 default_tsb_size, 1543 TSB8K|TSB64K|TSB512K, 0, sfmmup); 1544 sfmmup->sfmmu_flags = HAT_SWAPPED | HAT_ALLCTX_INVALID; 1545 ASSERT(sfmmup->sfmmu_tsb != NULL); 1546 } 1547 1548 ASSERT(max_mmu_ctxdoms > 0); 1549 for (i = 0; i < max_mmu_ctxdoms; i++) { 1550 sfmmup->sfmmu_ctxs[i].cnum = cnum; 1551 sfmmup->sfmmu_ctxs[i].gnum = 0; 1552 } 1553 1554 for (i = 0; i < max_mmu_page_sizes; i++) { 1555 sfmmup->sfmmu_ttecnt[i] = 0; 1556 sfmmup->sfmmu_scdrttecnt[i] = 0; 1557 sfmmup->sfmmu_ismttecnt[i] = 0; 1558 sfmmup->sfmmu_scdismttecnt[i] = 0; 1559 sfmmup->sfmmu_pgsz[i] = TTE8K; 1560 } 1561 sfmmup->sfmmu_tsb0_4minflcnt = 0; 1562 sfmmup->sfmmu_iblk = NULL; 1563 sfmmup->sfmmu_ismhat = 0; 1564 sfmmup->sfmmu_scdhat = 0; 1565 sfmmup->sfmmu_ismblkpa = (uint64_t)-1; 1566 if (sfmmup == ksfmmup) { 1567 CPUSET_ALL(sfmmup->sfmmu_cpusran); 1568 } else { 1569 CPUSET_ZERO(sfmmup->sfmmu_cpusran); 1570 } 1571 sfmmup->sfmmu_free = 0; 1572 sfmmup->sfmmu_rmstat = 0; 1573 sfmmup->sfmmu_clrbin = sfmmup->sfmmu_clrstart; 1574 sfmmup->sfmmu_xhat_provider = NULL; 1575 cv_init(&sfmmup->sfmmu_tsb_cv, NULL, CV_DEFAULT, NULL); 1576 sfmmup->sfmmu_srdp = NULL; 1577 SF_RGNMAP_ZERO(sfmmup->sfmmu_region_map); 1578 bzero(sfmmup->sfmmu_hmeregion_links, SFMMU_L1_HMERLINKS_SIZE); 1579 sfmmup->sfmmu_scdp = NULL; 1580 sfmmup->sfmmu_scd_link.next = NULL; 1581 sfmmup->sfmmu_scd_link.prev = NULL; 1582 return (sfmmup); 1583 } 1584 1585 /* 1586 * Create per-MMU context domain kstats for a given MMU ctx. 
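 * One "mmu_ctx" kstat instance is created per context domain, exporting the
 * counters named in mmu_ctx_kstat_names[] above.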
1587 */ 1588 static void 1589 sfmmu_mmu_kstat_create(mmu_ctx_t *mmu_ctxp) 1590 { 1591 mmu_ctx_stat_t stat; 1592 kstat_t *mmu_kstat; 1593 1594 ASSERT(MUTEX_HELD(&cpu_lock)); 1595 ASSERT(mmu_ctxp->mmu_kstat == NULL); 1596 1597 mmu_kstat = kstat_create("unix", mmu_ctxp->mmu_idx, "mmu_ctx", 1598 "hat", KSTAT_TYPE_NAMED, MMU_CTX_NUM_STATS, KSTAT_FLAG_VIRTUAL); 1599 1600 if (mmu_kstat == NULL) { 1601 cmn_err(CE_WARN, "kstat_create for MMU %d failed", 1602 mmu_ctxp->mmu_idx); 1603 } else { 1604 mmu_kstat->ks_data = mmu_ctxp->mmu_kstat_data; 1605 for (stat = 0; stat < MMU_CTX_NUM_STATS; stat++) 1606 kstat_named_init(&mmu_ctxp->mmu_kstat_data[stat], 1607 mmu_ctx_kstat_names[stat], KSTAT_DATA_INT64); 1608 mmu_ctxp->mmu_kstat = mmu_kstat; 1609 kstat_install(mmu_kstat); 1610 } 1611 } 1612 1613 /* 1614 * plat_cpuid_to_mmu_ctx_info() is a platform interface that returns MMU 1615 * context domain information for a given CPU. If a platform does not 1616 * specify that interface, then the function below is used instead to return 1617 * default information. The defaults are as follows: 1618 * 1619 * - For sun4u systems there's one MMU context domain per CPU. 1620 * This default is used by all sun4u systems except OPL. OPL systems 1621 * provide platform specific interface to map CPU ids to MMU ids 1622 * because on OPL more than 1 CPU shares a single MMU. 1623 * Note that on sun4v, there is one global context domain for 1624 * the entire system. This is to avoid running into potential problem 1625 * with ldom physical cpu substitution feature. 1626 * - The number of MMU context IDs supported on any CPU in the 1627 * system is 8K. 1628 */ 1629 /*ARGSUSED*/ 1630 static void 1631 sfmmu_cpuid_to_mmu_ctx_info(processorid_t cpuid, mmu_ctx_info_t *infop) 1632 { 1633 infop->mmu_nctxs = nctxs; 1634 #ifndef sun4v 1635 infop->mmu_idx = cpu[cpuid]->cpu_seqid; 1636 #else /* sun4v */ 1637 infop->mmu_idx = 0; 1638 #endif /* sun4v */ 1639 } 1640 1641 /* 1642 * Called during CPU initialization to set the MMU context-related information 1643 * for a CPU. 1644 * 1645 * cpu_lock serializes accesses to mmu_ctxs and mmu_saved_gnum. 1646 */ 1647 void 1648 sfmmu_cpu_init(cpu_t *cp) 1649 { 1650 mmu_ctx_info_t info; 1651 mmu_ctx_t *mmu_ctxp; 1652 1653 ASSERT(MUTEX_HELD(&cpu_lock)); 1654 1655 if (&plat_cpuid_to_mmu_ctx_info == NULL) 1656 sfmmu_cpuid_to_mmu_ctx_info(cp->cpu_id, &info); 1657 else 1658 plat_cpuid_to_mmu_ctx_info(cp->cpu_id, &info); 1659 1660 ASSERT(info.mmu_idx < max_mmu_ctxdoms); 1661 1662 if ((mmu_ctxp = mmu_ctxs_tbl[info.mmu_idx]) == NULL) { 1663 /* Each mmu_ctx is cacheline aligned. */ 1664 mmu_ctxp = kmem_cache_alloc(mmuctxdom_cache, KM_SLEEP); 1665 bzero(mmu_ctxp, sizeof (mmu_ctx_t)); 1666 1667 mutex_init(&mmu_ctxp->mmu_lock, NULL, MUTEX_SPIN, 1668 (void *)ipltospl(DISP_LEVEL)); 1669 mmu_ctxp->mmu_idx = info.mmu_idx; 1670 mmu_ctxp->mmu_nctxs = info.mmu_nctxs; 1671 /* 1672 * Globally for lifetime of a system, 1673 * gnum must always increase. 1674 * mmu_saved_gnum is protected by the cpu_lock. 1675 */ 1676 mmu_ctxp->mmu_gnum = mmu_saved_gnum + 1; 1677 mmu_ctxp->mmu_cnum = NUM_LOCKED_CTXS; 1678 1679 sfmmu_mmu_kstat_create(mmu_ctxp); 1680 1681 mmu_ctxs_tbl[info.mmu_idx] = mmu_ctxp; 1682 } else { 1683 ASSERT(mmu_ctxp->mmu_idx == info.mmu_idx); 1684 } 1685 1686 /* 1687 * The mmu_lock is acquired here to prevent races with 1688 * the wrap-around code. 
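	 * (The wrap-around code walks the domain's mmu_cpuset, so the set of
	 * CPUs belonging to the domain has to stay stable while it runs.)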
1689 */ 1690 mutex_enter(&mmu_ctxp->mmu_lock); 1691 1692 1693 mmu_ctxp->mmu_ncpus++; 1694 CPUSET_ADD(mmu_ctxp->mmu_cpuset, cp->cpu_id); 1695 CPU_MMU_IDX(cp) = info.mmu_idx; 1696 CPU_MMU_CTXP(cp) = mmu_ctxp; 1697 1698 mutex_exit(&mmu_ctxp->mmu_lock); 1699 } 1700 1701 /* 1702 * Called to perform MMU context-related cleanup for a CPU. 1703 */ 1704 void 1705 sfmmu_cpu_cleanup(cpu_t *cp) 1706 { 1707 mmu_ctx_t *mmu_ctxp; 1708 1709 ASSERT(MUTEX_HELD(&cpu_lock)); 1710 1711 mmu_ctxp = CPU_MMU_CTXP(cp); 1712 ASSERT(mmu_ctxp != NULL); 1713 1714 /* 1715 * The mmu_lock is acquired here to prevent races with 1716 * the wrap-around code. 1717 */ 1718 mutex_enter(&mmu_ctxp->mmu_lock); 1719 1720 CPU_MMU_CTXP(cp) = NULL; 1721 1722 CPUSET_DEL(mmu_ctxp->mmu_cpuset, cp->cpu_id); 1723 if (--mmu_ctxp->mmu_ncpus == 0) { 1724 mmu_ctxs_tbl[mmu_ctxp->mmu_idx] = NULL; 1725 mutex_exit(&mmu_ctxp->mmu_lock); 1726 mutex_destroy(&mmu_ctxp->mmu_lock); 1727 1728 if (mmu_ctxp->mmu_kstat) 1729 kstat_delete(mmu_ctxp->mmu_kstat); 1730 1731 /* mmu_saved_gnum is protected by the cpu_lock. */ 1732 if (mmu_saved_gnum < mmu_ctxp->mmu_gnum) 1733 mmu_saved_gnum = mmu_ctxp->mmu_gnum; 1734 1735 kmem_cache_free(mmuctxdom_cache, mmu_ctxp); 1736 1737 return; 1738 } 1739 1740 mutex_exit(&mmu_ctxp->mmu_lock); 1741 } 1742 1743 /* 1744 * Hat_setup, makes an address space context the current active one. 1745 * In sfmmu this translates to setting the secondary context with the 1746 * corresponding context. 1747 */ 1748 void 1749 hat_setup(struct hat *sfmmup, int allocflag) 1750 { 1751 hatlock_t *hatlockp; 1752 1753 /* Init needs some special treatment. */ 1754 if (allocflag == HAT_INIT) { 1755 /* 1756 * Make sure that we have 1757 * 1. a TSB 1758 * 2. a valid ctx that doesn't get stolen after this point. 1759 */ 1760 hatlockp = sfmmu_hat_enter(sfmmup); 1761 1762 /* 1763 * Swap in the TSB. hat_init() allocates tsbinfos without 1764 * TSBs, but we need one for init, since the kernel does some 1765 * special things to set up its stack and needs the TSB to 1766 * resolve page faults. 1767 */ 1768 sfmmu_tsb_swapin(sfmmup, hatlockp); 1769 1770 sfmmu_get_ctx(sfmmup); 1771 1772 sfmmu_hat_exit(hatlockp); 1773 } else { 1774 ASSERT(allocflag == HAT_ALLOC); 1775 1776 hatlockp = sfmmu_hat_enter(sfmmup); 1777 kpreempt_disable(); 1778 1779 CPUSET_ADD(sfmmup->sfmmu_cpusran, CPU->cpu_id); 1780 /* 1781 * sfmmu_setctx_sec takes <pgsz|cnum> as a parameter, 1782 * pagesize bits don't matter in this case since we are passing 1783 * INVALID_CONTEXT to it. 1784 * Compatibility Note: hw takes care of MMU_SCONTEXT1 1785 */ 1786 sfmmu_setctx_sec(INVALID_CONTEXT); 1787 sfmmu_clear_utsbinfo(); 1788 1789 kpreempt_enable(); 1790 sfmmu_hat_exit(hatlockp); 1791 } 1792 } 1793 1794 /* 1795 * Free all the translation resources for the specified address space. 1796 * Called from as_free when an address space is being destroyed. 
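 *
 * An illustrative sketch of the expected teardown order, as driven by
 * as_free() (sketch only, not a new interface):
 *
 *	hat_free_start(as->a_hat);	-- mark the hat as freeing and
 *					   leave any SCD it has joined
 *	... the segments of the as are unmapped and destroyed ...
 *	hat_free_end(as->a_hat);	-- free TSBs, region links and
 *					   finally the sfmmu itself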
1797 */ 1798 void 1799 hat_free_start(struct hat *sfmmup) 1800 { 1801 ASSERT(AS_WRITE_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 1802 ASSERT(sfmmup != ksfmmup); 1803 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 1804 1805 sfmmup->sfmmu_free = 1; 1806 if (sfmmup->sfmmu_scdp != NULL) { 1807 sfmmu_leave_scd(sfmmup, 0); 1808 } 1809 1810 ASSERT(sfmmup->sfmmu_scdp == NULL); 1811 } 1812 1813 void 1814 hat_free_end(struct hat *sfmmup) 1815 { 1816 int i; 1817 1818 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 1819 ASSERT(sfmmup->sfmmu_free == 1); 1820 ASSERT(sfmmup->sfmmu_ttecnt[TTE8K] == 0); 1821 ASSERT(sfmmup->sfmmu_ttecnt[TTE64K] == 0); 1822 ASSERT(sfmmup->sfmmu_ttecnt[TTE512K] == 0); 1823 ASSERT(sfmmup->sfmmu_ttecnt[TTE4M] == 0); 1824 ASSERT(sfmmup->sfmmu_ttecnt[TTE32M] == 0); 1825 ASSERT(sfmmup->sfmmu_ttecnt[TTE256M] == 0); 1826 1827 if (sfmmup->sfmmu_rmstat) { 1828 hat_freestat(sfmmup->sfmmu_as, NULL); 1829 } 1830 1831 while (sfmmup->sfmmu_tsb != NULL) { 1832 struct tsb_info *next = sfmmup->sfmmu_tsb->tsb_next; 1833 sfmmu_tsbinfo_free(sfmmup->sfmmu_tsb); 1834 sfmmup->sfmmu_tsb = next; 1835 } 1836 1837 if (sfmmup->sfmmu_srdp != NULL) { 1838 sfmmu_leave_srd(sfmmup); 1839 ASSERT(sfmmup->sfmmu_srdp == NULL); 1840 for (i = 0; i < SFMMU_L1_HMERLINKS; i++) { 1841 if (sfmmup->sfmmu_hmeregion_links[i] != NULL) { 1842 kmem_free(sfmmup->sfmmu_hmeregion_links[i], 1843 SFMMU_L2_HMERLINKS_SIZE); 1844 sfmmup->sfmmu_hmeregion_links[i] = NULL; 1845 } 1846 } 1847 } 1848 sfmmu_free_sfmmu(sfmmup); 1849 1850 #ifdef DEBUG 1851 for (i = 0; i < SFMMU_L1_HMERLINKS; i++) { 1852 ASSERT(sfmmup->sfmmu_hmeregion_links[i] == NULL); 1853 } 1854 #endif 1855 1856 kmem_cache_free(sfmmuid_cache, sfmmup); 1857 } 1858 1859 /* 1860 * Set up any translation structures, for the specified address space, 1861 * that are needed or preferred when the process is being swapped in. 1862 */ 1863 /* ARGSUSED */ 1864 void 1865 hat_swapin(struct hat *hat) 1866 { 1867 ASSERT(hat->sfmmu_xhat_provider == NULL); 1868 } 1869 1870 /* 1871 * Free all of the translation resources, for the specified address space, 1872 * that can be freed while the process is swapped out. Called from as_swapout. 1873 * Also, free up the ctx that this process was using. 1874 */ 1875 void 1876 hat_swapout(struct hat *sfmmup) 1877 { 1878 struct hmehash_bucket *hmebp; 1879 struct hme_blk *hmeblkp; 1880 struct hme_blk *pr_hblk = NULL; 1881 struct hme_blk *nx_hblk; 1882 int i; 1883 struct hme_blk *list = NULL; 1884 hatlock_t *hatlockp; 1885 struct tsb_info *tsbinfop; 1886 struct free_tsb { 1887 struct free_tsb *next; 1888 struct tsb_info *tsbinfop; 1889 }; /* free list of TSBs */ 1890 struct free_tsb *freelist, *last, *next; 1891 1892 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 1893 SFMMU_STAT(sf_swapout); 1894 1895 /* 1896 * There is no way to go from an as to all its translations in sfmmu. 1897 * Here is one of the times when we take the big hit and traverse 1898 * the hash looking for hme_blks to free up. Not only do we free up 1899 * this as hme_blks but all those that are free. We are obviously 1900 * swapping because we need memory so let's free up as much 1901 * as we can. 1902 * 1903 * Note that we don't flush TLB/TSB here -- it's not necessary 1904 * because: 1905 * 1) we free the ctx we're using and throw away the TSB(s); 1906 * 2) processes aren't runnable while being swapped out. 
1907 */ 1908 ASSERT(sfmmup != KHATID); 1909 for (i = 0; i <= UHMEHASH_SZ; i++) { 1910 hmebp = &uhme_hash[i]; 1911 SFMMU_HASH_LOCK(hmebp); 1912 hmeblkp = hmebp->hmeblkp; 1913 pr_hblk = NULL; 1914 while (hmeblkp) { 1915 1916 ASSERT(!hmeblkp->hblk_xhat_bit); 1917 1918 if ((hmeblkp->hblk_tag.htag_id == sfmmup) && 1919 !hmeblkp->hblk_shw_bit && !hmeblkp->hblk_lckcnt) { 1920 ASSERT(!hmeblkp->hblk_shared); 1921 (void) sfmmu_hblk_unload(sfmmup, hmeblkp, 1922 (caddr_t)get_hblk_base(hmeblkp), 1923 get_hblk_endaddr(hmeblkp), 1924 NULL, HAT_UNLOAD); 1925 } 1926 nx_hblk = hmeblkp->hblk_next; 1927 if (!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) { 1928 ASSERT(!hmeblkp->hblk_lckcnt); 1929 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, 1930 &list, 0); 1931 } else { 1932 pr_hblk = hmeblkp; 1933 } 1934 hmeblkp = nx_hblk; 1935 } 1936 SFMMU_HASH_UNLOCK(hmebp); 1937 } 1938 1939 sfmmu_hblks_list_purge(&list, 0); 1940 1941 /* 1942 * Now free up the ctx so that others can reuse it. 1943 */ 1944 hatlockp = sfmmu_hat_enter(sfmmup); 1945 1946 sfmmu_invalidate_ctx(sfmmup); 1947 1948 /* 1949 * Free TSBs, but not tsbinfos, and set SWAPPED flag. 1950 * If TSBs were never swapped in, just return. 1951 * This implies that we don't support partial swapping 1952 * of TSBs -- either all are swapped out, or none are. 1953 * 1954 * We must hold the HAT lock here to prevent racing with another 1955 * thread trying to unmap TTEs from the TSB or running the post- 1956 * relocator after relocating the TSB's memory. Unfortunately, we 1957 * can't free memory while holding the HAT lock or we could 1958 * deadlock, so we build a list of TSBs to be freed after marking 1959 * the tsbinfos as swapped out and free them after dropping the 1960 * lock. 1961 */ 1962 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) { 1963 sfmmu_hat_exit(hatlockp); 1964 return; 1965 } 1966 1967 SFMMU_FLAGS_SET(sfmmup, HAT_SWAPPED); 1968 last = freelist = NULL; 1969 for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL; 1970 tsbinfop = tsbinfop->tsb_next) { 1971 ASSERT((tsbinfop->tsb_flags & TSB_SWAPPED) == 0); 1972 1973 /* 1974 * Cast the TSB into a struct free_tsb and put it on the free 1975 * list. 1976 */ 1977 if (freelist == NULL) { 1978 last = freelist = (struct free_tsb *)tsbinfop->tsb_va; 1979 } else { 1980 last->next = (struct free_tsb *)tsbinfop->tsb_va; 1981 last = last->next; 1982 } 1983 last->next = NULL; 1984 last->tsbinfop = tsbinfop; 1985 tsbinfop->tsb_flags |= TSB_SWAPPED; 1986 /* 1987 * Zero out the TTE to clear the valid bit. 1988 * Note we can't use a value like 0xbad because we want to 1989 * ensure diagnostic bits are NEVER set on TTEs that might 1990 * be loaded. The intent is to catch any invalid access 1991 * to the swapped TSB, such as a thread running with a valid 1992 * context without first calling sfmmu_tsb_swapin() to 1993 * allocate TSB memory. 1994 */ 1995 tsbinfop->tsb_tte.ll = 0; 1996 } 1997 1998 /* Now we can drop the lock and free the TSB memory. 
*/ 1999 sfmmu_hat_exit(hatlockp); 2000 for (; freelist != NULL; freelist = next) { 2001 next = freelist->next; 2002 sfmmu_tsb_free(freelist->tsbinfop); 2003 } 2004 } 2005 2006 /* 2007 * Duplicate the translations of an as into another newas 2008 */ 2009 /* ARGSUSED */ 2010 int 2011 hat_dup(struct hat *hat, struct hat *newhat, caddr_t addr, size_t len, 2012 uint_t flag) 2013 { 2014 sf_srd_t *srdp; 2015 sf_scd_t *scdp; 2016 int i; 2017 extern uint_t get_color_start(struct as *); 2018 2019 ASSERT(hat->sfmmu_xhat_provider == NULL); 2020 ASSERT((flag == 0) || (flag == HAT_DUP_ALL) || (flag == HAT_DUP_COW) || 2021 (flag == HAT_DUP_SRD)); 2022 ASSERT(hat != ksfmmup); 2023 ASSERT(newhat != ksfmmup); 2024 ASSERT(flag != HAT_DUP_ALL || hat->sfmmu_srdp == newhat->sfmmu_srdp); 2025 2026 if (flag == HAT_DUP_COW) { 2027 panic("hat_dup: HAT_DUP_COW not supported"); 2028 } 2029 2030 if (flag == HAT_DUP_SRD && ((srdp = hat->sfmmu_srdp) != NULL)) { 2031 ASSERT(srdp->srd_evp != NULL); 2032 VN_HOLD(srdp->srd_evp); 2033 ASSERT(srdp->srd_refcnt > 0); 2034 newhat->sfmmu_srdp = srdp; 2035 atomic_add_32((volatile uint_t *)&srdp->srd_refcnt, 1); 2036 } 2037 2038 /* 2039 * HAT_DUP_ALL flag is used after as duplication is done. 2040 */ 2041 if (flag == HAT_DUP_ALL && ((srdp = newhat->sfmmu_srdp) != NULL)) { 2042 ASSERT(newhat->sfmmu_srdp->srd_refcnt >= 2); 2043 newhat->sfmmu_rtteflags = hat->sfmmu_rtteflags; 2044 if (hat->sfmmu_flags & HAT_4MTEXT_FLAG) { 2045 newhat->sfmmu_flags |= HAT_4MTEXT_FLAG; 2046 } 2047 2048 /* check if need to join scd */ 2049 if ((scdp = hat->sfmmu_scdp) != NULL && 2050 newhat->sfmmu_scdp != scdp) { 2051 int ret; 2052 SF_RGNMAP_IS_SUBSET(&newhat->sfmmu_region_map, 2053 &scdp->scd_region_map, ret); 2054 ASSERT(ret); 2055 sfmmu_join_scd(scdp, newhat); 2056 ASSERT(newhat->sfmmu_scdp == scdp && 2057 scdp->scd_refcnt >= 2); 2058 for (i = 0; i < max_mmu_page_sizes; i++) { 2059 newhat->sfmmu_ismttecnt[i] = 2060 hat->sfmmu_ismttecnt[i]; 2061 newhat->sfmmu_scdismttecnt[i] = 2062 hat->sfmmu_scdismttecnt[i]; 2063 } 2064 } 2065 2066 sfmmu_check_page_sizes(newhat, 1); 2067 } 2068 2069 if (flag == HAT_DUP_ALL && consistent_coloring == 0 && 2070 update_proc_pgcolorbase_after_fork != 0) { 2071 hat->sfmmu_clrbin = get_color_start(hat->sfmmu_as); 2072 } 2073 return (0); 2074 } 2075 2076 void 2077 hat_memload(struct hat *hat, caddr_t addr, struct page *pp, 2078 uint_t attr, uint_t flags) 2079 { 2080 hat_do_memload(hat, addr, pp, attr, flags, 2081 SFMMU_INVALID_SHMERID); 2082 } 2083 2084 void 2085 hat_memload_region(struct hat *hat, caddr_t addr, struct page *pp, 2086 uint_t attr, uint_t flags, hat_region_cookie_t rcookie) 2087 { 2088 uint_t rid; 2089 if (rcookie == HAT_INVALID_REGION_COOKIE || 2090 hat->sfmmu_xhat_provider != NULL) { 2091 hat_do_memload(hat, addr, pp, attr, flags, 2092 SFMMU_INVALID_SHMERID); 2093 return; 2094 } 2095 rid = (uint_t)((uint64_t)rcookie); 2096 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 2097 hat_do_memload(hat, addr, pp, attr, flags, rid); 2098 } 2099 2100 /* 2101 * Set up addr to map to page pp with protection prot. 2102 * As an optimization we also load the TSB with the 2103 * corresponding tte but it is no big deal if the tte gets kicked out. 
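 *
 * A minimal, illustrative call from a segment driver might look like
 * the following (the attr and flag values are examples only):
 *
 *	ASSERT(PAGE_LOCKED(pp));
 *	hat_memload(as->a_hat, addr, pp, PROT_READ | PROT_WRITE, HAT_LOAD);
 *
 * where addr is MMU_PAGESIZE aligned and pp is the locked page being
 * mapped at that address.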
2104 */ 2105 static void 2106 hat_do_memload(struct hat *hat, caddr_t addr, struct page *pp, 2107 uint_t attr, uint_t flags, uint_t rid) 2108 { 2109 tte_t tte; 2110 2111 2112 ASSERT(hat != NULL); 2113 ASSERT(PAGE_LOCKED(pp)); 2114 ASSERT(!((uintptr_t)addr & MMU_PAGEOFFSET)); 2115 ASSERT(!(flags & ~SFMMU_LOAD_ALLFLAG)); 2116 ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR)); 2117 SFMMU_VALIDATE_HMERID(hat, rid, addr, MMU_PAGESIZE); 2118 2119 if (PP_ISFREE(pp)) { 2120 panic("hat_memload: loading a mapping to free page %p", 2121 (void *)pp); 2122 } 2123 2124 if (hat->sfmmu_xhat_provider) { 2125 /* no regions for xhats */ 2126 ASSERT(!SFMMU_IS_SHMERID_VALID(rid)); 2127 XHAT_MEMLOAD(hat, addr, pp, attr, flags); 2128 return; 2129 } 2130 2131 ASSERT((hat == ksfmmup) || 2132 AS_LOCK_HELD(hat->sfmmu_as, &hat->sfmmu_as->a_lock)); 2133 2134 if (flags & ~SFMMU_LOAD_ALLFLAG) 2135 cmn_err(CE_NOTE, "hat_memload: unsupported flags %d", 2136 flags & ~SFMMU_LOAD_ALLFLAG); 2137 2138 if (hat->sfmmu_rmstat) 2139 hat_resvstat(MMU_PAGESIZE, hat->sfmmu_as, addr); 2140 2141 #if defined(SF_ERRATA_57) 2142 if ((hat != ksfmmup) && AS_TYPE_64BIT(hat->sfmmu_as) && 2143 (addr < errata57_limit) && (attr & PROT_EXEC) && 2144 !(flags & HAT_LOAD_SHARE)) { 2145 cmn_err(CE_WARN, "hat_memload: illegal attempt to make user " 2146 " page executable"); 2147 attr &= ~PROT_EXEC; 2148 } 2149 #endif 2150 2151 sfmmu_memtte(&tte, pp->p_pagenum, attr, TTE8K); 2152 (void) sfmmu_tteload_array(hat, &tte, addr, &pp, flags, rid); 2153 2154 /* 2155 * Check TSB and TLB page sizes. 2156 */ 2157 if ((flags & HAT_LOAD_SHARE) == 0) { 2158 sfmmu_check_page_sizes(hat, 1); 2159 } 2160 } 2161 2162 /* 2163 * hat_devload can be called to map real memory (e.g. 2164 * /dev/kmem) and even though hat_devload will determine pf is 2165 * for memory, it will be unable to get a shared lock on the 2166 * page (because someone else has it exclusively) and will 2167 * pass dp = NULL. If tteload doesn't get a non-NULL 2168 * page pointer it can't cache memory. 
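 *
 * As a hedged illustration (values are examples only), a driver
 * mapping device registers uncached might call:
 *
 *	hat_devload(kas.a_hat, kvaddr, MMU_PAGESIZE, pfn,
 *	    PROT_READ | PROT_WRITE | HAT_NOSYNC,
 *	    HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
 *
 * Since such a pfn is not memory, the code below adds
 * SFMMU_UNCACHEPTTE and, for strictly ordered mappings, the side
 * effect bit before loading the tte.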
2169 */ 2170 void 2171 hat_devload(struct hat *hat, caddr_t addr, size_t len, pfn_t pfn, 2172 uint_t attr, int flags) 2173 { 2174 tte_t tte; 2175 struct page *pp = NULL; 2176 int use_lgpg = 0; 2177 2178 ASSERT(hat != NULL); 2179 2180 if (hat->sfmmu_xhat_provider) { 2181 XHAT_DEVLOAD(hat, addr, len, pfn, attr, flags); 2182 return; 2183 } 2184 2185 ASSERT(!(flags & ~SFMMU_LOAD_ALLFLAG)); 2186 ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR)); 2187 ASSERT((hat == ksfmmup) || 2188 AS_LOCK_HELD(hat->sfmmu_as, &hat->sfmmu_as->a_lock)); 2189 if (len == 0) 2190 panic("hat_devload: zero len"); 2191 if (flags & ~SFMMU_LOAD_ALLFLAG) 2192 cmn_err(CE_NOTE, "hat_devload: unsupported flags %d", 2193 flags & ~SFMMU_LOAD_ALLFLAG); 2194 2195 #if defined(SF_ERRATA_57) 2196 if ((hat != ksfmmup) && AS_TYPE_64BIT(hat->sfmmu_as) && 2197 (addr < errata57_limit) && (attr & PROT_EXEC) && 2198 !(flags & HAT_LOAD_SHARE)) { 2199 cmn_err(CE_WARN, "hat_devload: illegal attempt to make user " 2200 " page executable"); 2201 attr &= ~PROT_EXEC; 2202 } 2203 #endif 2204 2205 /* 2206 * If it's a memory page find its pp 2207 */ 2208 if (!(flags & HAT_LOAD_NOCONSIST) && pf_is_memory(pfn)) { 2209 pp = page_numtopp_nolock(pfn); 2210 if (pp == NULL) { 2211 flags |= HAT_LOAD_NOCONSIST; 2212 } else { 2213 if (PP_ISFREE(pp)) { 2214 panic("hat_memload: loading " 2215 "a mapping to free page %p", 2216 (void *)pp); 2217 } 2218 if (!PAGE_LOCKED(pp) && !PP_ISNORELOC(pp)) { 2219 panic("hat_memload: loading a mapping " 2220 "to unlocked relocatable page %p", 2221 (void *)pp); 2222 } 2223 ASSERT(len == MMU_PAGESIZE); 2224 } 2225 } 2226 2227 if (hat->sfmmu_rmstat) 2228 hat_resvstat(len, hat->sfmmu_as, addr); 2229 2230 if (flags & HAT_LOAD_NOCONSIST) { 2231 attr |= SFMMU_UNCACHEVTTE; 2232 use_lgpg = 1; 2233 } 2234 if (!pf_is_memory(pfn)) { 2235 attr |= SFMMU_UNCACHEPTTE | HAT_NOSYNC; 2236 use_lgpg = 1; 2237 switch (attr & HAT_ORDER_MASK) { 2238 case HAT_STRICTORDER: 2239 case HAT_UNORDERED_OK: 2240 /* 2241 * we set the side effect bit for all non 2242 * memory mappings unless merging is ok 2243 */ 2244 attr |= SFMMU_SIDEFFECT; 2245 break; 2246 case HAT_MERGING_OK: 2247 case HAT_LOADCACHING_OK: 2248 case HAT_STORECACHING_OK: 2249 break; 2250 default: 2251 panic("hat_devload: bad attr"); 2252 break; 2253 } 2254 } 2255 while (len) { 2256 if (!use_lgpg) { 2257 sfmmu_memtte(&tte, pfn, attr, TTE8K); 2258 (void) sfmmu_tteload_array(hat, &tte, addr, &pp, 2259 flags, SFMMU_INVALID_SHMERID); 2260 len -= MMU_PAGESIZE; 2261 addr += MMU_PAGESIZE; 2262 pfn++; 2263 continue; 2264 } 2265 /* 2266 * try to use large pages, check va/pa alignments 2267 * Note that 32M/256M page sizes are not (yet) supported. 
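 *
 * Worked example (illustrative numbers): a request whose remaining len
 * is at least 4M, whose addr is MMU_PAGESIZE4M aligned and whose
 * mmu_ptob(pfn) is also 4M aligned (with TTE4M not set in
 * disable_large_pages) takes the TTE4M branch below and consumes 4M of
 * the range with a single tte; a range that only satisfies the 512K
 * checks falls through to the TTE512K branch, and so on down to 8K.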
2268  */
2269         if ((len >= MMU_PAGESIZE4M) &&
2270             !((uintptr_t)addr & MMU_PAGEOFFSET4M) &&
2271             !(disable_large_pages & (1 << TTE4M)) &&
2272             !(mmu_ptob(pfn) & MMU_PAGEOFFSET4M)) {
2273             sfmmu_memtte(&tte, pfn, attr, TTE4M);
2274             (void) sfmmu_tteload_array(hat, &tte, addr, &pp,
2275                 flags, SFMMU_INVALID_SHMERID);
2276             len -= MMU_PAGESIZE4M;
2277             addr += MMU_PAGESIZE4M;
2278             pfn += MMU_PAGESIZE4M / MMU_PAGESIZE;
2279         } else if ((len >= MMU_PAGESIZE512K) &&
2280             !((uintptr_t)addr & MMU_PAGEOFFSET512K) &&
2281             !(disable_large_pages & (1 << TTE512K)) &&
2282             !(mmu_ptob(pfn) & MMU_PAGEOFFSET512K)) {
2283             sfmmu_memtte(&tte, pfn, attr, TTE512K);
2284             (void) sfmmu_tteload_array(hat, &tte, addr, &pp,
2285                 flags, SFMMU_INVALID_SHMERID);
2286             len -= MMU_PAGESIZE512K;
2287             addr += MMU_PAGESIZE512K;
2288             pfn += MMU_PAGESIZE512K / MMU_PAGESIZE;
2289         } else if ((len >= MMU_PAGESIZE64K) &&
2290             !((uintptr_t)addr & MMU_PAGEOFFSET64K) &&
2291             !(disable_large_pages & (1 << TTE64K)) &&
2292             !(mmu_ptob(pfn) & MMU_PAGEOFFSET64K)) {
2293             sfmmu_memtte(&tte, pfn, attr, TTE64K);
2294             (void) sfmmu_tteload_array(hat, &tte, addr, &pp,
2295                 flags, SFMMU_INVALID_SHMERID);
2296             len -= MMU_PAGESIZE64K;
2297             addr += MMU_PAGESIZE64K;
2298             pfn += MMU_PAGESIZE64K / MMU_PAGESIZE;
2299         } else {
2300             sfmmu_memtte(&tte, pfn, attr, TTE8K);
2301             (void) sfmmu_tteload_array(hat, &tte, addr, &pp,
2302                 flags, SFMMU_INVALID_SHMERID);
2303             len -= MMU_PAGESIZE;
2304             addr += MMU_PAGESIZE;
2305             pfn++;
2306         }
2307     }
2308 
2309     /*
2310      * Check TSB and TLB page sizes.
2311      */
2312     if ((flags & HAT_LOAD_SHARE) == 0) {
2313         sfmmu_check_page_sizes(hat, 1);
2314     }
2315 }
2316 
2317 void
2318 hat_memload_array(struct hat *hat, caddr_t addr, size_t len,
2319     struct page **pps, uint_t attr, uint_t flags)
2320 {
2321     hat_do_memload_array(hat, addr, len, pps, attr, flags,
2322         SFMMU_INVALID_SHMERID);
2323 }
2324 
2325 void
2326 hat_memload_array_region(struct hat *hat, caddr_t addr, size_t len,
2327     struct page **pps, uint_t attr, uint_t flags,
2328     hat_region_cookie_t rcookie)
2329 {
2330     uint_t rid;
2331     if (rcookie == HAT_INVALID_REGION_COOKIE ||
2332         hat->sfmmu_xhat_provider != NULL) {
2333         hat_do_memload_array(hat, addr, len, pps, attr, flags,
2334             SFMMU_INVALID_SHMERID);
2335         return;
2336     }
2337     rid = (uint_t)((uint64_t)rcookie);
2338     ASSERT(rid < SFMMU_MAX_HME_REGIONS);
2339     hat_do_memload_array(hat, addr, len, pps, attr, flags, rid);
2340 }
2341 
2342 /*
2343  * Map the largest extent possible out of the page array. The array may NOT
2344  * be in order. The largest possible mapping a page can have
2345  * is specified in the p_szc field. The p_szc field
2346  * cannot change as long as there are any mappings (large or small)
2347  * to any of the pages that make up the large page. (i.e. any
2348  * promotion/demotion of page size is not up to the hat but up to
2349  * the page free list manager). The array
2350  * should consist of properly aligned contiguous pages that are
2351  * part of a big page for a large mapping to be created.
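 *
 * As a hedged illustration: if pps[] holds 512 8K pages (4M total)
 * whose root page has p_szc == TTE4M, and both addr and the starting
 * pfn are 4M aligned, a single TTE4M translation is attempted; if the
 * physical contiguity check done under sfmmu_tteload_array() fails,
 * the pages are instead loaded through sfmmu_memload_batchsmall(),
 * one hmeblk (up to NHMENTS 8K ttes) at a time.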
2352 */ 2353 static void 2354 hat_do_memload_array(struct hat *hat, caddr_t addr, size_t len, 2355 struct page **pps, uint_t attr, uint_t flags, uint_t rid) 2356 { 2357 int ttesz; 2358 size_t mapsz; 2359 pgcnt_t numpg, npgs; 2360 tte_t tte; 2361 page_t *pp; 2362 uint_t large_pages_disable; 2363 2364 ASSERT(!((uintptr_t)addr & MMU_PAGEOFFSET)); 2365 SFMMU_VALIDATE_HMERID(hat, rid, addr, len); 2366 2367 if (hat->sfmmu_xhat_provider) { 2368 ASSERT(!SFMMU_IS_SHMERID_VALID(rid)); 2369 XHAT_MEMLOAD_ARRAY(hat, addr, len, pps, attr, flags); 2370 return; 2371 } 2372 2373 if (hat->sfmmu_rmstat) 2374 hat_resvstat(len, hat->sfmmu_as, addr); 2375 2376 #if defined(SF_ERRATA_57) 2377 if ((hat != ksfmmup) && AS_TYPE_64BIT(hat->sfmmu_as) && 2378 (addr < errata57_limit) && (attr & PROT_EXEC) && 2379 !(flags & HAT_LOAD_SHARE)) { 2380 cmn_err(CE_WARN, "hat_memload_array: illegal attempt to make " 2381 "user page executable"); 2382 attr &= ~PROT_EXEC; 2383 } 2384 #endif 2385 2386 /* Get number of pages */ 2387 npgs = len >> MMU_PAGESHIFT; 2388 2389 if (flags & HAT_LOAD_SHARE) { 2390 large_pages_disable = disable_ism_large_pages; 2391 } else { 2392 large_pages_disable = disable_large_pages; 2393 } 2394 2395 if (npgs < NHMENTS || large_pages_disable == LARGE_PAGES_OFF) { 2396 sfmmu_memload_batchsmall(hat, addr, pps, attr, flags, npgs, 2397 rid); 2398 return; 2399 } 2400 2401 while (npgs >= NHMENTS) { 2402 pp = *pps; 2403 for (ttesz = pp->p_szc; ttesz != TTE8K; ttesz--) { 2404 /* 2405 * Check if this page size is disabled. 2406 */ 2407 if (large_pages_disable & (1 << ttesz)) 2408 continue; 2409 2410 numpg = TTEPAGES(ttesz); 2411 mapsz = numpg << MMU_PAGESHIFT; 2412 if ((npgs >= numpg) && 2413 IS_P2ALIGNED(addr, mapsz) && 2414 IS_P2ALIGNED(pp->p_pagenum, numpg)) { 2415 /* 2416 * At this point we have enough pages and 2417 * we know the virtual address and the pfn 2418 * are properly aligned. We still need 2419 * to check for physical contiguity but since 2420 * it is very likely that this is the case 2421 * we will assume they are so and undo 2422 * the request if necessary. It would 2423 * be great if we could get a hint flag 2424 * like HAT_CONTIG which would tell us 2425 * the pages are contigous for sure. 2426 */ 2427 sfmmu_memtte(&tte, (*pps)->p_pagenum, 2428 attr, ttesz); 2429 if (!sfmmu_tteload_array(hat, &tte, addr, 2430 pps, flags, rid)) { 2431 break; 2432 } 2433 } 2434 } 2435 if (ttesz == TTE8K) { 2436 /* 2437 * We were not able to map array using a large page 2438 * batch a hmeblk or fraction at a time. 2439 */ 2440 numpg = ((uintptr_t)addr >> MMU_PAGESHIFT) 2441 & (NHMENTS-1); 2442 numpg = NHMENTS - numpg; 2443 ASSERT(numpg <= npgs); 2444 mapsz = numpg * MMU_PAGESIZE; 2445 sfmmu_memload_batchsmall(hat, addr, pps, attr, flags, 2446 numpg, rid); 2447 } 2448 addr += mapsz; 2449 npgs -= numpg; 2450 pps += numpg; 2451 } 2452 2453 if (npgs) { 2454 sfmmu_memload_batchsmall(hat, addr, pps, attr, flags, npgs, 2455 rid); 2456 } 2457 2458 /* 2459 * Check TSB and TLB page sizes. 2460 */ 2461 if ((flags & HAT_LOAD_SHARE) == 0) { 2462 sfmmu_check_page_sizes(hat, 1); 2463 } 2464 } 2465 2466 /* 2467 * Function tries to batch 8K pages into the same hme blk. 2468 */ 2469 static void 2470 sfmmu_memload_batchsmall(struct hat *hat, caddr_t vaddr, page_t **pps, 2471 uint_t attr, uint_t flags, pgcnt_t npgs, uint_t rid) 2472 { 2473 tte_t tte; 2474 page_t *pp; 2475 struct hmehash_bucket *hmebp; 2476 struct hme_blk *hmeblkp; 2477 int index; 2478 2479 while (npgs) { 2480 /* 2481 * Acquire the hash bucket. 
2482 */ 2483 hmebp = sfmmu_tteload_acquire_hashbucket(hat, vaddr, TTE8K, 2484 rid); 2485 ASSERT(hmebp); 2486 2487 /* 2488 * Find the hment block. 2489 */ 2490 hmeblkp = sfmmu_tteload_find_hmeblk(hat, hmebp, vaddr, 2491 TTE8K, flags, rid); 2492 ASSERT(hmeblkp); 2493 2494 do { 2495 /* 2496 * Make the tte. 2497 */ 2498 pp = *pps; 2499 sfmmu_memtte(&tte, pp->p_pagenum, attr, TTE8K); 2500 2501 /* 2502 * Add the translation. 2503 */ 2504 (void) sfmmu_tteload_addentry(hat, hmeblkp, &tte, 2505 vaddr, pps, flags, rid); 2506 2507 /* 2508 * Goto next page. 2509 */ 2510 pps++; 2511 npgs--; 2512 2513 /* 2514 * Goto next address. 2515 */ 2516 vaddr += MMU_PAGESIZE; 2517 2518 /* 2519 * Don't crossover into a different hmentblk. 2520 */ 2521 index = (int)(((uintptr_t)vaddr >> MMU_PAGESHIFT) & 2522 (NHMENTS-1)); 2523 2524 } while (index != 0 && npgs != 0); 2525 2526 /* 2527 * Release the hash bucket. 2528 */ 2529 2530 sfmmu_tteload_release_hashbucket(hmebp); 2531 } 2532 } 2533 2534 /* 2535 * Construct a tte for a page: 2536 * 2537 * tte_valid = 1 2538 * tte_size2 = size & TTE_SZ2_BITS (Panther and Olympus-C only) 2539 * tte_size = size 2540 * tte_nfo = attr & HAT_NOFAULT 2541 * tte_ie = attr & HAT_STRUCTURE_LE 2542 * tte_hmenum = hmenum 2543 * tte_pahi = pp->p_pagenum >> TTE_PASHIFT; 2544 * tte_palo = pp->p_pagenum & TTE_PALOMASK; 2545 * tte_ref = 1 (optimization) 2546 * tte_wr_perm = attr & PROT_WRITE; 2547 * tte_no_sync = attr & HAT_NOSYNC 2548 * tte_lock = attr & SFMMU_LOCKTTE 2549 * tte_cp = !(attr & SFMMU_UNCACHEPTTE) 2550 * tte_cv = !(attr & SFMMU_UNCACHEVTTE) 2551 * tte_e = attr & SFMMU_SIDEFFECT 2552 * tte_priv = !(attr & PROT_USER) 2553 * tte_hwwr = if nosync is set and it is writable we set the mod bit (opt) 2554 * tte_glb = 0 2555 */ 2556 void 2557 sfmmu_memtte(tte_t *ttep, pfn_t pfn, uint_t attr, int tte_sz) 2558 { 2559 ASSERT((attr & ~(SFMMU_LOAD_ALLATTR | HAT_ATTR_NOSOFTEXEC)) == 0); 2560 2561 ttep->tte_inthi = MAKE_TTE_INTHI(pfn, attr, tte_sz, 0 /* hmenum */); 2562 ttep->tte_intlo = MAKE_TTE_INTLO(pfn, attr, tte_sz, 0 /* hmenum */); 2563 2564 if (TTE_IS_NOSYNC(ttep)) { 2565 TTE_SET_REF(ttep); 2566 if (TTE_IS_WRITABLE(ttep)) { 2567 TTE_SET_MOD(ttep); 2568 } 2569 } 2570 if (TTE_IS_NFO(ttep) && TTE_IS_EXECUTABLE(ttep)) { 2571 panic("sfmmu_memtte: can't set both NFO and EXEC bits"); 2572 } 2573 2574 /* 2575 * Disable hardware execute permission to force a fault if 2576 * this page is executed, so we can detect the execution. Set 2577 * the soft exec bit to remember that this TTE has execute 2578 * permission. 2579 */ 2580 if (TTE_IS_EXECUTABLE(ttep) && (attr & HAT_ATTR_NOSOFTEXEC) == 0 && 2581 icache_is_coherent == 0) { 2582 TTE_CLR_EXEC(ttep); 2583 TTE_SET_SOFTEXEC(ttep); 2584 } 2585 } 2586 2587 /* 2588 * This function will add a translation to the hme_blk and allocate the 2589 * hme_blk if one does not exist. 2590 * If a page structure is specified then it will add the 2591 * corresponding hment to the mapping list. 2592 * It will also update the hmenum field for the tte. 2593 * 2594 * Currently this function is only used for kernel mappings. 2595 * So pass invalid region to sfmmu_tteload_array(). 2596 */ 2597 void 2598 sfmmu_tteload(struct hat *sfmmup, tte_t *ttep, caddr_t vaddr, page_t *pp, 2599 uint_t flags) 2600 { 2601 ASSERT(sfmmup == ksfmmup); 2602 (void) sfmmu_tteload_array(sfmmup, ttep, vaddr, &pp, flags, 2603 SFMMU_INVALID_SHMERID); 2604 } 2605 2606 /* 2607 * Load (ttep != NULL) or unload (ttep == NULL) one entry in the TSB. 
2608 * Assumes that a particular page size may only be resident in one TSB. 2609 */ 2610 static void 2611 sfmmu_mod_tsb(sfmmu_t *sfmmup, caddr_t vaddr, tte_t *ttep, int ttesz) 2612 { 2613 struct tsb_info *tsbinfop = NULL; 2614 uint64_t tag; 2615 struct tsbe *tsbe_addr; 2616 uint64_t tsb_base; 2617 uint_t tsb_size; 2618 int vpshift = MMU_PAGESHIFT; 2619 int phys = 0; 2620 2621 if (sfmmup == ksfmmup) { /* No support for 32/256M ksfmmu pages */ 2622 phys = ktsb_phys; 2623 if (ttesz >= TTE4M) { 2624 #ifndef sun4v 2625 ASSERT((ttesz != TTE32M) && (ttesz != TTE256M)); 2626 #endif 2627 tsb_base = (phys)? ktsb4m_pbase : (uint64_t)ktsb4m_base; 2628 tsb_size = ktsb4m_szcode; 2629 } else { 2630 tsb_base = (phys)? ktsb_pbase : (uint64_t)ktsb_base; 2631 tsb_size = ktsb_szcode; 2632 } 2633 } else { 2634 SFMMU_GET_TSBINFO(tsbinfop, sfmmup, ttesz); 2635 2636 /* 2637 * If there isn't a TSB for this page size, or the TSB is 2638 * swapped out, there is nothing to do. Note that the latter 2639 * case seems impossible but can occur if hat_pageunload() 2640 * is called on an ISM mapping while the process is swapped 2641 * out. 2642 */ 2643 if (tsbinfop == NULL || (tsbinfop->tsb_flags & TSB_SWAPPED)) 2644 return; 2645 2646 /* 2647 * If another thread is in the middle of relocating a TSB 2648 * we can't unload the entry so set a flag so that the 2649 * TSB will be flushed before it can be accessed by the 2650 * process. 2651 */ 2652 if ((tsbinfop->tsb_flags & TSB_RELOC_FLAG) != 0) { 2653 if (ttep == NULL) 2654 tsbinfop->tsb_flags |= TSB_FLUSH_NEEDED; 2655 return; 2656 } 2657 #if defined(UTSB_PHYS) 2658 phys = 1; 2659 tsb_base = (uint64_t)tsbinfop->tsb_pa; 2660 #else 2661 tsb_base = (uint64_t)tsbinfop->tsb_va; 2662 #endif 2663 tsb_size = tsbinfop->tsb_szc; 2664 } 2665 if (ttesz >= TTE4M) 2666 vpshift = MMU_PAGESHIFT4M; 2667 2668 tsbe_addr = sfmmu_get_tsbe(tsb_base, vaddr, vpshift, tsb_size); 2669 tag = sfmmu_make_tsbtag(vaddr); 2670 2671 if (ttep == NULL) { 2672 sfmmu_unload_tsbe(tsbe_addr, tag, phys); 2673 } else { 2674 if (ttesz >= TTE4M) { 2675 SFMMU_STAT(sf_tsb_load4m); 2676 } else { 2677 SFMMU_STAT(sf_tsb_load8k); 2678 } 2679 2680 sfmmu_load_tsbe(tsbe_addr, tag, ttep, phys); 2681 } 2682 } 2683 2684 /* 2685 * Unmap all entries from [start, end) matching the given page size. 2686 * 2687 * This function is used primarily to unmap replicated 64K or 512K entries 2688 * from the TSB that are inserted using the base page size TSB pointer, but 2689 * it may also be called to unmap a range of addresses from the TSB. 2690 */ 2691 void 2692 sfmmu_unload_tsb_range(sfmmu_t *sfmmup, caddr_t start, caddr_t end, int ttesz) 2693 { 2694 struct tsb_info *tsbinfop; 2695 uint64_t tag; 2696 struct tsbe *tsbe_addr; 2697 caddr_t vaddr; 2698 uint64_t tsb_base; 2699 int vpshift, vpgsz; 2700 uint_t tsb_size; 2701 int phys = 0; 2702 2703 /* 2704 * Assumptions: 2705 * If ttesz == 8K, 64K or 512K, we walk through the range 8K 2706 * at a time shooting down any valid entries we encounter. 2707 * 2708 * If ttesz >= 4M we walk the range 4M at a time shooting 2709 * down any valid mappings we find. 2710 */ 2711 if (sfmmup == ksfmmup) { 2712 phys = ktsb_phys; 2713 if (ttesz >= TTE4M) { 2714 #ifndef sun4v 2715 ASSERT((ttesz != TTE32M) && (ttesz != TTE256M)); 2716 #endif 2717 tsb_base = (phys)? ktsb4m_pbase : (uint64_t)ktsb4m_base; 2718 tsb_size = ktsb4m_szcode; 2719 } else { 2720 tsb_base = (phys)? 
ktsb_pbase : (uint64_t)ktsb_base; 2721 tsb_size = ktsb_szcode; 2722 } 2723 } else { 2724 SFMMU_GET_TSBINFO(tsbinfop, sfmmup, ttesz); 2725 2726 /* 2727 * If there isn't a TSB for this page size, or the TSB is 2728 * swapped out, there is nothing to do. Note that the latter 2729 * case seems impossible but can occur if hat_pageunload() 2730 * is called on an ISM mapping while the process is swapped 2731 * out. 2732 */ 2733 if (tsbinfop == NULL || (tsbinfop->tsb_flags & TSB_SWAPPED)) 2734 return; 2735 2736 /* 2737 * If another thread is in the middle of relocating a TSB 2738 * we can't unload the entry so set a flag so that the 2739 * TSB will be flushed before it can be accessed by the 2740 * process. 2741 */ 2742 if ((tsbinfop->tsb_flags & TSB_RELOC_FLAG) != 0) { 2743 tsbinfop->tsb_flags |= TSB_FLUSH_NEEDED; 2744 return; 2745 } 2746 #if defined(UTSB_PHYS) 2747 phys = 1; 2748 tsb_base = (uint64_t)tsbinfop->tsb_pa; 2749 #else 2750 tsb_base = (uint64_t)tsbinfop->tsb_va; 2751 #endif 2752 tsb_size = tsbinfop->tsb_szc; 2753 } 2754 if (ttesz >= TTE4M) { 2755 vpshift = MMU_PAGESHIFT4M; 2756 vpgsz = MMU_PAGESIZE4M; 2757 } else { 2758 vpshift = MMU_PAGESHIFT; 2759 vpgsz = MMU_PAGESIZE; 2760 } 2761 2762 for (vaddr = start; vaddr < end; vaddr += vpgsz) { 2763 tag = sfmmu_make_tsbtag(vaddr); 2764 tsbe_addr = sfmmu_get_tsbe(tsb_base, vaddr, vpshift, tsb_size); 2765 sfmmu_unload_tsbe(tsbe_addr, tag, phys); 2766 } 2767 } 2768 2769 /* 2770 * Select the optimum TSB size given the number of mappings 2771 * that need to be cached. 2772 */ 2773 static int 2774 sfmmu_select_tsb_szc(pgcnt_t pgcnt) 2775 { 2776 int szc = 0; 2777 2778 #ifdef DEBUG 2779 if (tsb_grow_stress) { 2780 uint32_t randval = (uint32_t)gettick() >> 4; 2781 return (randval % (tsb_max_growsize + 1)); 2782 } 2783 #endif /* DEBUG */ 2784 2785 while ((szc < tsb_max_growsize) && (pgcnt > SFMMU_RSS_TSBSIZE(szc))) 2786 szc++; 2787 return (szc); 2788 } 2789 2790 /* 2791 * This function will add a translation to the hme_blk and allocate the 2792 * hme_blk if one does not exist. 2793 * If a page structure is specified then it will add the 2794 * corresponding hment to the mapping list. 2795 * It will also update the hmenum field for the tte. 2796 * Furthermore, it attempts to create a large page translation 2797 * for <addr,hat> at page array pps. It assumes addr and first 2798 * pp is correctly aligned. It returns 0 if successful and 1 otherwise. 2799 */ 2800 static int 2801 sfmmu_tteload_array(sfmmu_t *sfmmup, tte_t *ttep, caddr_t vaddr, 2802 page_t **pps, uint_t flags, uint_t rid) 2803 { 2804 struct hmehash_bucket *hmebp; 2805 struct hme_blk *hmeblkp; 2806 int ret; 2807 uint_t size; 2808 2809 /* 2810 * Get mapping size. 2811 */ 2812 size = TTE_CSZ(ttep); 2813 ASSERT(!((uintptr_t)vaddr & TTE_PAGE_OFFSET(size))); 2814 2815 /* 2816 * Acquire the hash bucket. 2817 */ 2818 hmebp = sfmmu_tteload_acquire_hashbucket(sfmmup, vaddr, size, rid); 2819 ASSERT(hmebp); 2820 2821 /* 2822 * Find the hment block. 2823 */ 2824 hmeblkp = sfmmu_tteload_find_hmeblk(sfmmup, hmebp, vaddr, size, flags, 2825 rid); 2826 ASSERT(hmeblkp); 2827 2828 /* 2829 * Add the translation. 2830 */ 2831 ret = sfmmu_tteload_addentry(sfmmup, hmeblkp, ttep, vaddr, pps, flags, 2832 rid); 2833 2834 /* 2835 * Release the hash bucket. 2836 */ 2837 sfmmu_tteload_release_hashbucket(hmebp); 2838 2839 return (ret); 2840 } 2841 2842 /* 2843 * Function locks and returns a pointer to the hash bucket for vaddr and size. 
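 *
 * Callers pair it with sfmmu_tteload_release_hashbucket(); an
 * illustrative sequence, mirroring sfmmu_tteload_array() above:
 *
 *	hmebp = sfmmu_tteload_acquire_hashbucket(sfmmup, vaddr, size, rid);
 *	hmeblkp = sfmmu_tteload_find_hmeblk(sfmmup, hmebp, vaddr, size,
 *	    flags, rid);
 *	(void) sfmmu_tteload_addentry(sfmmup, hmeblkp, ttep, vaddr, pps,
 *	    flags, rid);
 *	sfmmu_tteload_release_hashbucket(hmebp);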
2844 */ 2845 static struct hmehash_bucket * 2846 sfmmu_tteload_acquire_hashbucket(sfmmu_t *sfmmup, caddr_t vaddr, int size, 2847 uint_t rid) 2848 { 2849 struct hmehash_bucket *hmebp; 2850 int hmeshift; 2851 void *htagid = sfmmutohtagid(sfmmup, rid); 2852 2853 ASSERT(htagid != NULL); 2854 2855 hmeshift = HME_HASH_SHIFT(size); 2856 2857 hmebp = HME_HASH_FUNCTION(htagid, vaddr, hmeshift); 2858 2859 SFMMU_HASH_LOCK(hmebp); 2860 2861 return (hmebp); 2862 } 2863 2864 /* 2865 * Function returns a pointer to an hmeblk in the hash bucket, hmebp. If the 2866 * hmeblk doesn't exists for the [sfmmup, vaddr & size] signature, a hmeblk is 2867 * allocated. 2868 */ 2869 static struct hme_blk * 2870 sfmmu_tteload_find_hmeblk(sfmmu_t *sfmmup, struct hmehash_bucket *hmebp, 2871 caddr_t vaddr, uint_t size, uint_t flags, uint_t rid) 2872 { 2873 hmeblk_tag hblktag; 2874 int hmeshift; 2875 struct hme_blk *hmeblkp, *pr_hblk, *list = NULL; 2876 2877 SFMMU_VALIDATE_HMERID(sfmmup, rid, vaddr, TTEBYTES(size)); 2878 2879 hblktag.htag_id = sfmmutohtagid(sfmmup, rid); 2880 ASSERT(hblktag.htag_id != NULL); 2881 hmeshift = HME_HASH_SHIFT(size); 2882 hblktag.htag_bspage = HME_HASH_BSPAGE(vaddr, hmeshift); 2883 hblktag.htag_rehash = HME_HASH_REHASH(size); 2884 hblktag.htag_rid = rid; 2885 2886 ttearray_realloc: 2887 2888 HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, &list); 2889 2890 /* 2891 * We block until hblk_reserve_lock is released; it's held by 2892 * the thread, temporarily using hblk_reserve, until hblk_reserve is 2893 * replaced by a hblk from sfmmu8_cache. 2894 */ 2895 if (hmeblkp == (struct hme_blk *)hblk_reserve && 2896 hblk_reserve_thread != curthread) { 2897 SFMMU_HASH_UNLOCK(hmebp); 2898 mutex_enter(&hblk_reserve_lock); 2899 mutex_exit(&hblk_reserve_lock); 2900 SFMMU_STAT(sf_hblk_reserve_hit); 2901 SFMMU_HASH_LOCK(hmebp); 2902 goto ttearray_realloc; 2903 } 2904 2905 if (hmeblkp == NULL) { 2906 hmeblkp = sfmmu_hblk_alloc(sfmmup, vaddr, hmebp, size, 2907 hblktag, flags, rid); 2908 ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || hmeblkp->hblk_shared); 2909 ASSERT(SFMMU_IS_SHMERID_VALID(rid) || !hmeblkp->hblk_shared); 2910 } else { 2911 /* 2912 * It is possible for 8k and 64k hblks to collide since they 2913 * have the same rehash value. This is because we 2914 * lazily free hblks and 8K/64K blks could be lingering. 2915 * If we find size mismatch we free the block and & try again. 2916 */ 2917 if (get_hblk_ttesz(hmeblkp) != size) { 2918 ASSERT(!hmeblkp->hblk_vcnt); 2919 ASSERT(!hmeblkp->hblk_hmecnt); 2920 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, 2921 &list, 0); 2922 goto ttearray_realloc; 2923 } 2924 if (hmeblkp->hblk_shw_bit) { 2925 /* 2926 * if the hblk was previously used as a shadow hblk then 2927 * we will change it to a normal hblk 2928 */ 2929 ASSERT(!hmeblkp->hblk_shared); 2930 if (hmeblkp->hblk_shw_mask) { 2931 sfmmu_shadow_hcleanup(sfmmup, hmeblkp, hmebp); 2932 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 2933 goto ttearray_realloc; 2934 } else { 2935 hmeblkp->hblk_shw_bit = 0; 2936 } 2937 } 2938 SFMMU_STAT(sf_hblk_hit); 2939 } 2940 2941 /* 2942 * hat_memload() should never call kmem_cache_free() for kernel hmeblks; 2943 * see block comment showing the stacktrace in sfmmu_hblk_alloc(); 2944 * set the flag parameter to 1 so that sfmmu_hblks_list_purge() will 2945 * just add these hmeblks to the per-cpu pending queue. 
2946 */ 2947 sfmmu_hblks_list_purge(&list, 1); 2948 2949 ASSERT(get_hblk_ttesz(hmeblkp) == size); 2950 ASSERT(!hmeblkp->hblk_shw_bit); 2951 ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || hmeblkp->hblk_shared); 2952 ASSERT(SFMMU_IS_SHMERID_VALID(rid) || !hmeblkp->hblk_shared); 2953 ASSERT(hmeblkp->hblk_tag.htag_rid == rid); 2954 2955 return (hmeblkp); 2956 } 2957 2958 /* 2959 * Function adds a tte entry into the hmeblk. It returns 0 if successful and 1 2960 * otherwise. 2961 */ 2962 static int 2963 sfmmu_tteload_addentry(sfmmu_t *sfmmup, struct hme_blk *hmeblkp, tte_t *ttep, 2964 caddr_t vaddr, page_t **pps, uint_t flags, uint_t rid) 2965 { 2966 page_t *pp = *pps; 2967 int hmenum, size, remap; 2968 tte_t tteold, flush_tte; 2969 #ifdef DEBUG 2970 tte_t orig_old; 2971 #endif /* DEBUG */ 2972 struct sf_hment *sfhme; 2973 kmutex_t *pml, *pmtx; 2974 hatlock_t *hatlockp; 2975 int myflt; 2976 2977 /* 2978 * remove this panic when we decide to let user virtual address 2979 * space be >= USERLIMIT. 2980 */ 2981 if (!TTE_IS_PRIVILEGED(ttep) && vaddr >= (caddr_t)USERLIMIT) 2982 panic("user addr %p in kernel space", (void *)vaddr); 2983 #if defined(TTE_IS_GLOBAL) 2984 if (TTE_IS_GLOBAL(ttep)) 2985 panic("sfmmu_tteload: creating global tte"); 2986 #endif 2987 2988 #ifdef DEBUG 2989 if (pf_is_memory(sfmmu_ttetopfn(ttep, vaddr)) && 2990 !TTE_IS_PCACHEABLE(ttep) && !sfmmu_allow_nc_trans) 2991 panic("sfmmu_tteload: non cacheable memory tte"); 2992 #endif /* DEBUG */ 2993 2994 /* don't simulate dirty bit for writeable ISM/DISM mappings */ 2995 if ((flags & HAT_LOAD_SHARE) && TTE_IS_WRITABLE(ttep)) { 2996 TTE_SET_REF(ttep); 2997 TTE_SET_MOD(ttep); 2998 } 2999 3000 if ((flags & HAT_LOAD_SHARE) || !TTE_IS_REF(ttep) || 3001 !TTE_IS_MOD(ttep)) { 3002 /* 3003 * Don't load TSB for dummy as in ISM. Also don't preload 3004 * the TSB if the TTE isn't writable since we're likely to 3005 * fault on it again -- preloading can be fairly expensive. 3006 */ 3007 flags |= SFMMU_NO_TSBLOAD; 3008 } 3009 3010 size = TTE_CSZ(ttep); 3011 switch (size) { 3012 case TTE8K: 3013 SFMMU_STAT(sf_tteload8k); 3014 break; 3015 case TTE64K: 3016 SFMMU_STAT(sf_tteload64k); 3017 break; 3018 case TTE512K: 3019 SFMMU_STAT(sf_tteload512k); 3020 break; 3021 case TTE4M: 3022 SFMMU_STAT(sf_tteload4m); 3023 break; 3024 case (TTE32M): 3025 SFMMU_STAT(sf_tteload32m); 3026 ASSERT(mmu_page_sizes == max_mmu_page_sizes); 3027 break; 3028 case (TTE256M): 3029 SFMMU_STAT(sf_tteload256m); 3030 ASSERT(mmu_page_sizes == max_mmu_page_sizes); 3031 break; 3032 } 3033 3034 ASSERT(!((uintptr_t)vaddr & TTE_PAGE_OFFSET(size))); 3035 SFMMU_VALIDATE_HMERID(sfmmup, rid, vaddr, TTEBYTES(size)); 3036 ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || hmeblkp->hblk_shared); 3037 ASSERT(SFMMU_IS_SHMERID_VALID(rid) || !hmeblkp->hblk_shared); 3038 3039 HBLKTOHME_IDX(sfhme, hmeblkp, vaddr, hmenum); 3040 3041 /* 3042 * Need to grab mlist lock here so that pageunload 3043 * will not change tte behind us. 3044 */ 3045 if (pp) { 3046 pml = sfmmu_mlist_enter(pp); 3047 } 3048 3049 sfmmu_copytte(&sfhme->hme_tte, &tteold); 3050 /* 3051 * Look for corresponding hment and if valid verify 3052 * pfns are equal. 
3053 */ 3054 remap = TTE_IS_VALID(&tteold); 3055 if (remap) { 3056 pfn_t new_pfn, old_pfn; 3057 3058 old_pfn = TTE_TO_PFN(vaddr, &tteold); 3059 new_pfn = TTE_TO_PFN(vaddr, ttep); 3060 3061 if (flags & HAT_LOAD_REMAP) { 3062 /* make sure we are remapping same type of pages */ 3063 if (pf_is_memory(old_pfn) != pf_is_memory(new_pfn)) { 3064 panic("sfmmu_tteload - tte remap io<->memory"); 3065 } 3066 if (old_pfn != new_pfn && 3067 (pp != NULL || sfhme->hme_page != NULL)) { 3068 panic("sfmmu_tteload - tte remap pp != NULL"); 3069 } 3070 } else if (old_pfn != new_pfn) { 3071 panic("sfmmu_tteload - tte remap, hmeblkp 0x%p", 3072 (void *)hmeblkp); 3073 } 3074 ASSERT(TTE_CSZ(&tteold) == TTE_CSZ(ttep)); 3075 3076 if (TTE_IS_EXECUTABLE(&tteold) && TTE_IS_SOFTEXEC(ttep)) { 3077 TTE_SET_EXEC(ttep); 3078 } 3079 } 3080 3081 if (pp) { 3082 /* 3083 * If we know that this page will be executed, because 3084 * it was in the past (PP_ISEXEC is already true), or 3085 * if the caller says it will likely be executed 3086 * (HAT_LOAD_TEXT is true), then there is no need to 3087 * dynamically detect execution with a soft exec 3088 * fault. Enable hardware execute permission now. 3089 */ 3090 if ((PP_ISEXEC(pp) || (flags & HAT_LOAD_TEXT)) && 3091 TTE_IS_SOFTEXEC(ttep)) { 3092 TTE_SET_EXEC(ttep); 3093 } 3094 3095 if (size == TTE8K) { 3096 #ifdef VAC 3097 /* 3098 * Handle VAC consistency 3099 */ 3100 if (!remap && (cache & CACHE_VAC) && !PP_ISNC(pp)) { 3101 sfmmu_vac_conflict(sfmmup, vaddr, pp); 3102 } 3103 #endif 3104 3105 if (TTE_IS_WRITABLE(ttep) && PP_ISRO(pp)) { 3106 pmtx = sfmmu_page_enter(pp); 3107 PP_CLRRO(pp); 3108 sfmmu_page_exit(pmtx); 3109 } else if (!PP_ISMAPPED(pp) && 3110 (!TTE_IS_WRITABLE(ttep)) && !(PP_ISMOD(pp))) { 3111 pmtx = sfmmu_page_enter(pp); 3112 if (!(PP_ISMOD(pp))) { 3113 PP_SETRO(pp); 3114 } 3115 sfmmu_page_exit(pmtx); 3116 } 3117 3118 if (TTE_EXECUTED(ttep)) { 3119 pmtx = sfmmu_page_enter(pp); 3120 PP_SETEXEC(pp); 3121 sfmmu_page_exit(pmtx); 3122 } 3123 3124 } else if (sfmmu_pagearray_setup(vaddr, pps, ttep, remap)) { 3125 /* 3126 * sfmmu_pagearray_setup failed so return 3127 */ 3128 sfmmu_mlist_exit(pml); 3129 return (1); 3130 } 3131 3132 } else if (TTE_IS_SOFTEXEC(ttep)) { 3133 TTE_SET_EXEC(ttep); 3134 } 3135 3136 /* 3137 * Make sure hment is not on a mapping list. 3138 */ 3139 ASSERT(remap || (sfhme->hme_page == NULL)); 3140 3141 /* if it is not a remap then hme->next better be NULL */ 3142 ASSERT((!remap) ? sfhme->hme_next == NULL : 1); 3143 3144 if (flags & HAT_LOAD_LOCK) { 3145 if ((hmeblkp->hblk_lckcnt + 1) >= MAX_HBLK_LCKCNT) { 3146 panic("too high lckcnt-hmeblk %p", 3147 (void *)hmeblkp); 3148 } 3149 atomic_add_32(&hmeblkp->hblk_lckcnt, 1); 3150 3151 HBLK_STACK_TRACE(hmeblkp, HBLK_LOCK); 3152 } 3153 3154 #ifdef VAC 3155 if (pp && PP_ISNC(pp)) { 3156 /* 3157 * If the physical page is marked to be uncacheable, like 3158 * by a vac conflict, make sure the new mapping is also 3159 * uncacheable. 
3160 */ 3161 TTE_CLR_VCACHEABLE(ttep); 3162 ASSERT(PP_GET_VCOLOR(pp) == NO_VCOLOR); 3163 } 3164 #endif 3165 ttep->tte_hmenum = hmenum; 3166 3167 #ifdef DEBUG 3168 orig_old = tteold; 3169 #endif /* DEBUG */ 3170 3171 while (sfmmu_modifytte_try(&tteold, ttep, &sfhme->hme_tte) < 0) { 3172 if ((sfmmup == KHATID) && 3173 (flags & (HAT_LOAD_LOCK | HAT_LOAD_REMAP))) { 3174 sfmmu_copytte(&sfhme->hme_tte, &tteold); 3175 } 3176 #ifdef DEBUG 3177 chk_tte(&orig_old, &tteold, ttep, hmeblkp); 3178 #endif /* DEBUG */ 3179 } 3180 ASSERT(TTE_IS_VALID(&sfhme->hme_tte)); 3181 3182 if (!TTE_IS_VALID(&tteold)) { 3183 3184 atomic_add_16(&hmeblkp->hblk_vcnt, 1); 3185 if (rid == SFMMU_INVALID_SHMERID) { 3186 atomic_add_long(&sfmmup->sfmmu_ttecnt[size], 1); 3187 } else { 3188 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 3189 sf_region_t *rgnp = srdp->srd_hmergnp[rid]; 3190 /* 3191 * We already accounted for region ttecnt's in sfmmu 3192 * during hat_join_region() processing. Here we 3193 * only update ttecnt's in region struture. 3194 */ 3195 atomic_add_long(&rgnp->rgn_ttecnt[size], 1); 3196 } 3197 } 3198 3199 myflt = (astosfmmu(curthread->t_procp->p_as) == sfmmup); 3200 if (size > TTE8K && (flags & HAT_LOAD_SHARE) == 0 && 3201 sfmmup != ksfmmup) { 3202 uchar_t tteflag = 1 << size; 3203 if (rid == SFMMU_INVALID_SHMERID) { 3204 if (!(sfmmup->sfmmu_tteflags & tteflag)) { 3205 hatlockp = sfmmu_hat_enter(sfmmup); 3206 sfmmup->sfmmu_tteflags |= tteflag; 3207 sfmmu_hat_exit(hatlockp); 3208 } 3209 } else if (!(sfmmup->sfmmu_rtteflags & tteflag)) { 3210 hatlockp = sfmmu_hat_enter(sfmmup); 3211 sfmmup->sfmmu_rtteflags |= tteflag; 3212 sfmmu_hat_exit(hatlockp); 3213 } 3214 /* 3215 * Update the current CPU tsbmiss area, so the current thread 3216 * won't need to take the tsbmiss for the new pagesize. 3217 * The other threads in the process will update their tsb 3218 * miss area lazily in sfmmu_tsbmiss_exception() when they 3219 * fail to find the translation for a newly added pagesize. 3220 */ 3221 if (size > TTE64K && myflt) { 3222 struct tsbmiss *tsbmp; 3223 kpreempt_disable(); 3224 tsbmp = &tsbmiss_area[CPU->cpu_id]; 3225 if (rid == SFMMU_INVALID_SHMERID) { 3226 if (!(tsbmp->uhat_tteflags & tteflag)) { 3227 tsbmp->uhat_tteflags |= tteflag; 3228 } 3229 } else { 3230 if (!(tsbmp->uhat_rtteflags & tteflag)) { 3231 tsbmp->uhat_rtteflags |= tteflag; 3232 } 3233 } 3234 kpreempt_enable(); 3235 } 3236 } 3237 3238 if (size >= TTE4M && (flags & HAT_LOAD_TEXT) && 3239 !SFMMU_FLAGS_ISSET(sfmmup, HAT_4MTEXT_FLAG)) { 3240 hatlockp = sfmmu_hat_enter(sfmmup); 3241 SFMMU_FLAGS_SET(sfmmup, HAT_4MTEXT_FLAG); 3242 sfmmu_hat_exit(hatlockp); 3243 } 3244 3245 flush_tte.tte_intlo = (tteold.tte_intlo ^ ttep->tte_intlo) & 3246 hw_tte.tte_intlo; 3247 flush_tte.tte_inthi = (tteold.tte_inthi ^ ttep->tte_inthi) & 3248 hw_tte.tte_inthi; 3249 3250 if (remap && (flush_tte.tte_inthi || flush_tte.tte_intlo)) { 3251 /* 3252 * If remap and new tte differs from old tte we need 3253 * to sync the mod bit and flush TLB/TSB. We don't 3254 * need to sync ref bit because we currently always set 3255 * ref bit in tteload. 3256 */ 3257 ASSERT(TTE_IS_REF(ttep)); 3258 if (TTE_IS_MOD(&tteold) || (TTE_EXECUTED(&tteold) && 3259 !TTE_IS_EXECUTABLE(ttep))) { 3260 sfmmu_ttesync(sfmmup, vaddr, &tteold, pp); 3261 } 3262 /* 3263 * hwtte bits shouldn't change for SRD hmeblks as long as SRD 3264 * hmes are only used for read only text. Adding this code for 3265 * completeness and future use of shared hmeblks with writable 3266 * mappings of VMODSORT vnodes. 
3267 */ 3268 if (hmeblkp->hblk_shared) { 3269 cpuset_t cpuset = sfmmu_rgntlb_demap(vaddr, 3270 sfmmup->sfmmu_srdp->srd_hmergnp[rid], hmeblkp, 1); 3271 xt_sync(cpuset); 3272 SFMMU_STAT_ADD(sf_region_remap_demap, 1); 3273 } else { 3274 sfmmu_tlb_demap(vaddr, sfmmup, hmeblkp, 0, 0); 3275 xt_sync(sfmmup->sfmmu_cpusran); 3276 } 3277 } 3278 3279 if ((flags & SFMMU_NO_TSBLOAD) == 0) { 3280 /* 3281 * We only preload 8K and 4M mappings into the TSB, since 3282 * 64K and 512K mappings are replicated and hence don't 3283 * have a single, unique TSB entry. Ditto for 32M/256M. 3284 */ 3285 if (size == TTE8K || size == TTE4M) { 3286 sf_scd_t *scdp; 3287 hatlockp = sfmmu_hat_enter(sfmmup); 3288 /* 3289 * Don't preload private TSB if the mapping is used 3290 * by the shctx in the SCD. 3291 */ 3292 scdp = sfmmup->sfmmu_scdp; 3293 if (rid == SFMMU_INVALID_SHMERID || scdp == NULL || 3294 !SF_RGNMAP_TEST(scdp->scd_hmeregion_map, rid)) { 3295 sfmmu_load_tsb(sfmmup, vaddr, &sfhme->hme_tte, 3296 size); 3297 } 3298 sfmmu_hat_exit(hatlockp); 3299 } 3300 } 3301 if (pp) { 3302 if (!remap) { 3303 HME_ADD(sfhme, pp); 3304 atomic_add_16(&hmeblkp->hblk_hmecnt, 1); 3305 ASSERT(hmeblkp->hblk_hmecnt > 0); 3306 3307 /* 3308 * Cannot ASSERT(hmeblkp->hblk_hmecnt <= NHMENTS) 3309 * see pageunload() for comment. 3310 */ 3311 } 3312 sfmmu_mlist_exit(pml); 3313 } 3314 3315 return (0); 3316 } 3317 /* 3318 * Function unlocks hash bucket. 3319 */ 3320 static void 3321 sfmmu_tteload_release_hashbucket(struct hmehash_bucket *hmebp) 3322 { 3323 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 3324 SFMMU_HASH_UNLOCK(hmebp); 3325 } 3326 3327 /* 3328 * function which checks and sets up page array for a large 3329 * translation. Will set p_vcolor, p_index, p_ro fields. 3330 * Assumes addr and pfnum of first page are properly aligned. 3331 * Will check for physical contiguity. If check fails it return 3332 * non null. 3333 */ 3334 static int 3335 sfmmu_pagearray_setup(caddr_t addr, page_t **pps, tte_t *ttep, int remap) 3336 { 3337 int i, index, ttesz; 3338 pfn_t pfnum; 3339 pgcnt_t npgs; 3340 page_t *pp, *pp1; 3341 kmutex_t *pmtx; 3342 #ifdef VAC 3343 int osz; 3344 int cflags = 0; 3345 int vac_err = 0; 3346 #endif 3347 int newidx = 0; 3348 3349 ttesz = TTE_CSZ(ttep); 3350 3351 ASSERT(ttesz > TTE8K); 3352 3353 npgs = TTEPAGES(ttesz); 3354 index = PAGESZ_TO_INDEX(ttesz); 3355 3356 pfnum = (*pps)->p_pagenum; 3357 ASSERT(IS_P2ALIGNED(pfnum, npgs)); 3358 3359 /* 3360 * Save the first pp so we can do HAT_TMPNC at the end. 3361 */ 3362 pp1 = *pps; 3363 #ifdef VAC 3364 osz = fnd_mapping_sz(pp1); 3365 #endif 3366 3367 for (i = 0; i < npgs; i++, pps++) { 3368 pp = *pps; 3369 ASSERT(PAGE_LOCKED(pp)); 3370 ASSERT(pp->p_szc >= ttesz); 3371 ASSERT(pp->p_szc == pp1->p_szc); 3372 ASSERT(sfmmu_mlist_held(pp)); 3373 3374 /* 3375 * XXX is it possible to maintain P_RO on the root only? 3376 */ 3377 if (TTE_IS_WRITABLE(ttep) && PP_ISRO(pp)) { 3378 pmtx = sfmmu_page_enter(pp); 3379 PP_CLRRO(pp); 3380 sfmmu_page_exit(pmtx); 3381 } else if (!PP_ISMAPPED(pp) && !TTE_IS_WRITABLE(ttep) && 3382 !PP_ISMOD(pp)) { 3383 pmtx = sfmmu_page_enter(pp); 3384 if (!(PP_ISMOD(pp))) { 3385 PP_SETRO(pp); 3386 } 3387 sfmmu_page_exit(pmtx); 3388 } 3389 3390 if (TTE_EXECUTED(ttep)) { 3391 pmtx = sfmmu_page_enter(pp); 3392 PP_SETEXEC(pp); 3393 sfmmu_page_exit(pmtx); 3394 } 3395 3396 /* 3397 * If this is a remap we skip vac & contiguity checks. 3398 */ 3399 if (remap) 3400 continue; 3401 3402 /* 3403 * set p_vcolor and detect any vac conflicts. 
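 *
 * Hedged illustration: on a machine with, say, a 64K direct-mapped
 * virtually indexed cache and 8K pages there are 8 virtual colors,
 * and the color of a mapping is essentially the low-order bits of the
 * virtual page number, roughly
 *
 *	vcolor = (addr >> MMU_PAGESHIFT) & (number_of_colors - 1)
 *
 * Two mappings of the same physical page with different colors would
 * alias in the cache; that is what sfmmu_vacconflict_array() below
 * has to detect.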
3404  */
3405 #ifdef VAC
3406         if (vac_err == 0) {
3407             vac_err = sfmmu_vacconflict_array(addr, pp, &cflags);
3408 
3409         }
3410 #endif
3411 
3412         /*
3413          * Save current index in case we need to undo it.
3414          * Note: "PAGESZ_TO_INDEX(sz) (1 << (sz))"
3415          *       "SFMMU_INDEX_SHIFT 6"
3416          *       "SFMMU_INDEX_MASK ((1 << SFMMU_INDEX_SHIFT) - 1)"
3417          *       "PP_MAPINDEX(p_index) (p_index & SFMMU_INDEX_MASK)"
3418          *
3419          * So: index = PAGESZ_TO_INDEX(ttesz);
3420          *     if ttesz == 1 then index = 0x2
3421          *                2 then index = 0x4
3422          *                3 then index = 0x8
3423          *                4 then index = 0x10
3424          *                5 then index = 0x20
3425          * The code below checks if it's a new pagesize (i.e. newidx)
3426          * in case we need to take it back out of p_index,
3427          * and then or's the new index into the existing index.
3428          */
3429         if ((PP_MAPINDEX(pp) & index) == 0)
3430             newidx = 1;
3431         pp->p_index = (PP_MAPINDEX(pp) | index);
3432 
3433         /*
3434          * contiguity check
3435          */
3436         if (pp->p_pagenum != pfnum) {
3437             /*
3438              * If we fail the contiguity test then
3439              * the only thing we need to fix is the p_index field.
3440              * We might get a few extra flushes but since this
3441              * path is rare that is ok. The p_ro field will
3442              * get automatically fixed on the next tteload to
3443              * the page. NO TNC bit is set yet.
3444              */
3445             while (i >= 0) {
3446                 pp = *pps;
3447                 if (newidx)
3448                     pp->p_index = (PP_MAPINDEX(pp) &
3449                         ~index);
3450                 pps--;
3451                 i--;
3452             }
3453             return (1);
3454         }
3455         pfnum++;
3456         addr += MMU_PAGESIZE;
3457     }
3458 
3459 #ifdef VAC
3460     if (vac_err) {
3461         if (ttesz > osz) {
3462             /*
3463              * There are some smaller mappings that cause vac
3464              * conflicts. Convert all existing small mappings to
3465              * TNC.
3466              */
3467             SFMMU_STAT_ADD(sf_uncache_conflict, npgs);
3468             sfmmu_page_cache_array(pp1, HAT_TMPNC, CACHE_FLUSH,
3469                 npgs);
3470         } else {
3471             /* EMPTY */
3472             /*
3473              * If there exists a big page mapping,
3474              * that means the whole existing big page
3475              * has the TNC setting already. No need to convert to
3476              * TNC again.
3477              */
3478             ASSERT(PP_ISTNC(pp1));
3479         }
3480     }
3481 #endif /* VAC */
3482 
3483     return (0);
3484 }
3485 
3486 #ifdef VAC
3487 /*
3488  * Routine that checks vac consistency for a large page. It also
3489  * sets the virtual color for all pp's of this big mapping.
3490  */
3491 static int
3492 sfmmu_vacconflict_array(caddr_t addr, page_t *pp, int *cflags)
3493 {
3494     int vcolor, ocolor;
3495 
3496     ASSERT(sfmmu_mlist_held(pp));
3497 
3498     if (PP_ISNC(pp)) {
3499         return (HAT_TMPNC);
3500     }
3501 
3502     vcolor = addr_to_vcolor(addr);
3503     if (PP_NEWPAGE(pp)) {
3504         PP_SET_VCOLOR(pp, vcolor);
3505         return (0);
3506     }
3507 
3508     ocolor = PP_GET_VCOLOR(pp);
3509     if (ocolor == vcolor) {
3510         return (0);
3511     }
3512 
3513     if (!PP_ISMAPPED(pp) && !PP_ISMAPPED_KPM(pp)) {
3514         /*
3515          * Previous user of page had a different color
3516          * but since there are no current users
3517          * we just flush the cache and change the color.
3518          * As an optimization for large pages we flush the
3519          * entire cache of that color and set a flag.
3520          */
3521         SFMMU_STAT(sf_pgcolor_conflict);
3522         if (!CacheColor_IsFlushed(*cflags, ocolor)) {
3523             CacheColor_SetFlushed(*cflags, ocolor);
3524             sfmmu_cache_flushcolor(ocolor, pp->p_pagenum);
3525         }
3526         PP_SET_VCOLOR(pp, vcolor);
3527         return (0);
3528     }
3529 
3530     /*
3531      * We got a real conflict with a current mapping.
3532      * Set flags to start uncaching all mappings
3533      * and return failure so we restart looping
3534      * the pp array from the beginning.
3535      */
3536     return (HAT_TMPNC);
3537 }
3538 #endif /* VAC */
3539 
3540 /*
3541  * Creates a large page shadow hmeblk for a tte.
3542  * The purpose of this routine is to allow us to do quick unloads because
3543  * the vm layer can easily pass a very large but sparsely populated range.
3544  */
3545 static struct hme_blk *
3546 sfmmu_shadow_hcreate(sfmmu_t *sfmmup, caddr_t vaddr, int ttesz, uint_t flags)
3547 {
3548     struct hmehash_bucket *hmebp;
3549     hmeblk_tag hblktag;
3550     int hmeshift, size, vshift;
3551     uint_t shw_mask, newshw_mask;
3552     struct hme_blk *hmeblkp;
3553 
3554     ASSERT(sfmmup != KHATID);
3555     if (mmu_page_sizes == max_mmu_page_sizes) {
3556         ASSERT(ttesz < TTE256M);
3557     } else {
3558         ASSERT(ttesz < TTE4M);
3559         ASSERT(sfmmup->sfmmu_ttecnt[TTE32M] == 0);
3560         ASSERT(sfmmup->sfmmu_ttecnt[TTE256M] == 0);
3561     }
3562 
3563     if (ttesz == TTE8K) {
3564         size = TTE512K;
3565     } else {
3566         size = ++ttesz;
3567     }
3568 
3569     hblktag.htag_id = sfmmup;
3570     hmeshift = HME_HASH_SHIFT(size);
3571     hblktag.htag_bspage = HME_HASH_BSPAGE(vaddr, hmeshift);
3572     hblktag.htag_rehash = HME_HASH_REHASH(size);
3573     hblktag.htag_rid = SFMMU_INVALID_SHMERID;
3574     hmebp = HME_HASH_FUNCTION(sfmmup, vaddr, hmeshift);
3575 
3576     SFMMU_HASH_LOCK(hmebp);
3577 
3578     HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp);
3579     ASSERT(hmeblkp != (struct hme_blk *)hblk_reserve);
3580     if (hmeblkp == NULL) {
3581         hmeblkp = sfmmu_hblk_alloc(sfmmup, vaddr, hmebp, size,
3582             hblktag, flags, SFMMU_INVALID_SHMERID);
3583     }
3584     ASSERT(hmeblkp);
3585     if (!hmeblkp->hblk_shw_mask) {
3586         /*
3587          * if this is an unused hblk it was just allocated or could
3588          * potentially be a previous large page hblk so we need to
3589          * set the shadow bit.
3590          */
3591         ASSERT(!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt);
3592         hmeblkp->hblk_shw_bit = 1;
3593     } else if (hmeblkp->hblk_shw_bit == 0) {
3594         panic("sfmmu_shadow_hcreate: shw bit not set in hmeblkp 0x%p",
3595             (void *)hmeblkp);
3596     }
3597     ASSERT(hmeblkp->hblk_shw_bit == 1);
3598     ASSERT(!hmeblkp->hblk_shared);
3599     vshift = vaddr_to_vshift(hblktag, vaddr, size);
3600     ASSERT(vshift < 8);
3601     /*
3602      * Atomically set shw mask bit
3603      */
3604     do {
3605         shw_mask = hmeblkp->hblk_shw_mask;
3606         newshw_mask = shw_mask | (1 << vshift);
3607         newshw_mask = cas32(&hmeblkp->hblk_shw_mask, shw_mask,
3608             newshw_mask);
3609     } while (newshw_mask != shw_mask);
3610 
3611     SFMMU_HASH_UNLOCK(hmebp);
3612 
3613     return (hmeblkp);
3614 }
3615 
3616 /*
3617  * This routine cleans up a previous shadow hmeblk and changes it to
3618  * a regular hblk. This happens rarely but it is possible
3619  * when a process wants to use large pages and there are hblks still
3620  * lying around from the previous as (address space) that used these hmeblks.
3621  * The alternative was to clean up the shadow hblks at unload time
3622  * but since so few user processes actually use large pages, it is
3623  * better to be lazy and clean up at this time.
3624 */ 3625 static void 3626 sfmmu_shadow_hcleanup(sfmmu_t *sfmmup, struct hme_blk *hmeblkp, 3627 struct hmehash_bucket *hmebp) 3628 { 3629 caddr_t addr, endaddr; 3630 int hashno, size; 3631 3632 ASSERT(hmeblkp->hblk_shw_bit); 3633 ASSERT(!hmeblkp->hblk_shared); 3634 3635 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 3636 3637 if (!hmeblkp->hblk_shw_mask) { 3638 hmeblkp->hblk_shw_bit = 0; 3639 return; 3640 } 3641 addr = (caddr_t)get_hblk_base(hmeblkp); 3642 endaddr = get_hblk_endaddr(hmeblkp); 3643 size = get_hblk_ttesz(hmeblkp); 3644 hashno = size - 1; 3645 ASSERT(hashno > 0); 3646 SFMMU_HASH_UNLOCK(hmebp); 3647 3648 sfmmu_free_hblks(sfmmup, addr, endaddr, hashno); 3649 3650 SFMMU_HASH_LOCK(hmebp); 3651 } 3652 3653 static void 3654 sfmmu_free_hblks(sfmmu_t *sfmmup, caddr_t addr, caddr_t endaddr, 3655 int hashno) 3656 { 3657 int hmeshift, shadow = 0; 3658 hmeblk_tag hblktag; 3659 struct hmehash_bucket *hmebp; 3660 struct hme_blk *hmeblkp; 3661 struct hme_blk *nx_hblk, *pr_hblk, *list = NULL; 3662 3663 ASSERT(hashno > 0); 3664 hblktag.htag_id = sfmmup; 3665 hblktag.htag_rehash = hashno; 3666 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 3667 3668 hmeshift = HME_HASH_SHIFT(hashno); 3669 3670 while (addr < endaddr) { 3671 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 3672 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift); 3673 SFMMU_HASH_LOCK(hmebp); 3674 /* inline HME_HASH_SEARCH */ 3675 hmeblkp = hmebp->hmeblkp; 3676 pr_hblk = NULL; 3677 while (hmeblkp) { 3678 if (HTAGS_EQ(hmeblkp->hblk_tag, hblktag)) { 3679 /* found hme_blk */ 3680 ASSERT(!hmeblkp->hblk_shared); 3681 if (hmeblkp->hblk_shw_bit) { 3682 if (hmeblkp->hblk_shw_mask) { 3683 shadow = 1; 3684 sfmmu_shadow_hcleanup(sfmmup, 3685 hmeblkp, hmebp); 3686 break; 3687 } else { 3688 hmeblkp->hblk_shw_bit = 0; 3689 } 3690 } 3691 3692 /* 3693 * Hblk_hmecnt and hblk_vcnt could be non zero 3694 * since hblk_unload() does not gurantee that. 3695 * 3696 * XXX - this could cause tteload() to spin 3697 * where sfmmu_shadow_hcleanup() is called. 3698 */ 3699 } 3700 3701 nx_hblk = hmeblkp->hblk_next; 3702 if (!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) { 3703 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, 3704 &list, 0); 3705 } else { 3706 pr_hblk = hmeblkp; 3707 } 3708 hmeblkp = nx_hblk; 3709 } 3710 3711 SFMMU_HASH_UNLOCK(hmebp); 3712 3713 if (shadow) { 3714 /* 3715 * We found another shadow hblk so cleaned its 3716 * children. We need to go back and cleanup 3717 * the original hblk so we don't change the 3718 * addr. 3719 */ 3720 shadow = 0; 3721 } else { 3722 addr = (caddr_t)roundup((uintptr_t)addr + 1, 3723 (1 << hmeshift)); 3724 } 3725 } 3726 sfmmu_hblks_list_purge(&list, 0); 3727 } 3728 3729 /* 3730 * This routine's job is to delete stale invalid shared hmeregions hmeblks that 3731 * may still linger on after pageunload. 
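 *
 * Shared region hmeblks are keyed by the srd and region id rather than
 * by the owning hat.  A rough sketch of the tag setup used below,
 * shown only for illustration:
 *
 *	hblktag.htag_id = srdp;
 *	hblktag.htag_rid = rid;
 *
 * Private hmeblks, by contrast, use htag_id = sfmmup and
 * htag_rid = SFMMU_INVALID_SHMERID.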
3732 */ 3733 static void 3734 sfmmu_cleanup_rhblk(sf_srd_t *srdp, caddr_t addr, uint_t rid, int ttesz) 3735 { 3736 int hmeshift; 3737 hmeblk_tag hblktag; 3738 struct hmehash_bucket *hmebp; 3739 struct hme_blk *hmeblkp; 3740 struct hme_blk *pr_hblk; 3741 struct hme_blk *list = NULL; 3742 3743 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 3744 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 3745 3746 hmeshift = HME_HASH_SHIFT(ttesz); 3747 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 3748 hblktag.htag_rehash = ttesz; 3749 hblktag.htag_rid = rid; 3750 hblktag.htag_id = srdp; 3751 hmebp = HME_HASH_FUNCTION(srdp, addr, hmeshift); 3752 3753 SFMMU_HASH_LOCK(hmebp); 3754 HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, &list); 3755 if (hmeblkp != NULL) { 3756 ASSERT(hmeblkp->hblk_shared); 3757 ASSERT(!hmeblkp->hblk_shw_bit); 3758 if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) { 3759 panic("sfmmu_cleanup_rhblk: valid hmeblk"); 3760 } 3761 ASSERT(!hmeblkp->hblk_lckcnt); 3762 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, 3763 &list, 0); 3764 } 3765 SFMMU_HASH_UNLOCK(hmebp); 3766 sfmmu_hblks_list_purge(&list, 0); 3767 } 3768 3769 /* ARGSUSED */ 3770 static void 3771 sfmmu_rgn_cb_noop(caddr_t saddr, caddr_t eaddr, caddr_t r_saddr, 3772 size_t r_size, void *r_obj, u_offset_t r_objoff) 3773 { 3774 } 3775 3776 /* 3777 * Searches for an hmeblk which maps addr, then unloads this mapping 3778 * and updates *eaddrp, if the hmeblk is found. 3779 */ 3780 static void 3781 sfmmu_unload_hmeregion_va(sf_srd_t *srdp, uint_t rid, caddr_t addr, 3782 caddr_t eaddr, int ttesz, caddr_t *eaddrp) 3783 { 3784 int hmeshift; 3785 hmeblk_tag hblktag; 3786 struct hmehash_bucket *hmebp; 3787 struct hme_blk *hmeblkp; 3788 struct hme_blk *pr_hblk; 3789 struct hme_blk *list = NULL; 3790 3791 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 3792 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 3793 ASSERT(ttesz >= HBLK_MIN_TTESZ); 3794 3795 hmeshift = HME_HASH_SHIFT(ttesz); 3796 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 3797 hblktag.htag_rehash = ttesz; 3798 hblktag.htag_rid = rid; 3799 hblktag.htag_id = srdp; 3800 hmebp = HME_HASH_FUNCTION(srdp, addr, hmeshift); 3801 3802 SFMMU_HASH_LOCK(hmebp); 3803 HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, &list); 3804 if (hmeblkp != NULL) { 3805 ASSERT(hmeblkp->hblk_shared); 3806 ASSERT(!hmeblkp->hblk_lckcnt); 3807 if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) { 3808 *eaddrp = sfmmu_hblk_unload(NULL, hmeblkp, addr, 3809 eaddr, NULL, HAT_UNLOAD); 3810 ASSERT(*eaddrp > addr); 3811 } 3812 ASSERT(!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt); 3813 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, 3814 &list, 0); 3815 } 3816 SFMMU_HASH_UNLOCK(hmebp); 3817 sfmmu_hblks_list_purge(&list, 0); 3818 } 3819 3820 static void 3821 sfmmu_unload_hmeregion(sf_srd_t *srdp, sf_region_t *rgnp) 3822 { 3823 int ttesz = rgnp->rgn_pgszc; 3824 size_t rsz = rgnp->rgn_size; 3825 caddr_t rsaddr = rgnp->rgn_saddr; 3826 caddr_t readdr = rsaddr + rsz; 3827 caddr_t rhsaddr; 3828 caddr_t va; 3829 uint_t rid = rgnp->rgn_id; 3830 caddr_t cbsaddr; 3831 caddr_t cbeaddr; 3832 hat_rgn_cb_func_t rcbfunc; 3833 ulong_t cnt; 3834 3835 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 3836 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 3837 3838 ASSERT(IS_P2ALIGNED(rsaddr, TTEBYTES(ttesz))); 3839 ASSERT(IS_P2ALIGNED(rsz, TTEBYTES(ttesz))); 3840 if (ttesz < HBLK_MIN_TTESZ) { 3841 ttesz = HBLK_MIN_TTESZ; 3842 rhsaddr = (caddr_t)P2ALIGN((uintptr_t)rsaddr, HBLK_MIN_BYTES); 3843 } else { 3844 rhsaddr = rsaddr; 3845 } 3846 3847 if ((rcbfunc = rgnp->rgn_cb_function) == NULL) 
{ 3848 rcbfunc = sfmmu_rgn_cb_noop; 3849 } 3850 3851 while (ttesz >= HBLK_MIN_TTESZ) { 3852 cbsaddr = rsaddr; 3853 cbeaddr = rsaddr; 3854 if (!(rgnp->rgn_hmeflags & (1 << ttesz))) { 3855 ttesz--; 3856 continue; 3857 } 3858 cnt = 0; 3859 va = rsaddr; 3860 while (va < readdr) { 3861 ASSERT(va >= rhsaddr); 3862 if (va != cbeaddr) { 3863 if (cbeaddr != cbsaddr) { 3864 ASSERT(cbeaddr > cbsaddr); 3865 (*rcbfunc)(cbsaddr, cbeaddr, 3866 rsaddr, rsz, rgnp->rgn_obj, 3867 rgnp->rgn_objoff); 3868 } 3869 cbsaddr = va; 3870 cbeaddr = va; 3871 } 3872 sfmmu_unload_hmeregion_va(srdp, rid, va, readdr, 3873 ttesz, &cbeaddr); 3874 cnt++; 3875 va = rhsaddr + (cnt << TTE_PAGE_SHIFT(ttesz)); 3876 } 3877 if (cbeaddr != cbsaddr) { 3878 ASSERT(cbeaddr > cbsaddr); 3879 (*rcbfunc)(cbsaddr, cbeaddr, rsaddr, 3880 rsz, rgnp->rgn_obj, 3881 rgnp->rgn_objoff); 3882 } 3883 ttesz--; 3884 } 3885 } 3886 3887 /* 3888 * Release one hardware address translation lock on the given address range. 3889 */ 3890 void 3891 hat_unlock(struct hat *sfmmup, caddr_t addr, size_t len) 3892 { 3893 struct hmehash_bucket *hmebp; 3894 hmeblk_tag hblktag; 3895 int hmeshift, hashno = 1; 3896 struct hme_blk *hmeblkp, *list = NULL; 3897 caddr_t endaddr; 3898 3899 ASSERT(sfmmup != NULL); 3900 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 3901 3902 ASSERT((sfmmup == ksfmmup) || 3903 AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 3904 ASSERT((len & MMU_PAGEOFFSET) == 0); 3905 endaddr = addr + len; 3906 hblktag.htag_id = sfmmup; 3907 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 3908 3909 /* 3910 * Spitfire supports 4 page sizes. 3911 * Most pages are expected to be of the smallest page size (8K) and 3912 * these will not need to be rehashed. 64K pages also don't need to be 3913 * rehashed because an hmeblk spans 64K of address space. 512K pages 3914 * might need 1 rehash and and 4M pages might need 2 rehashes. 3915 */ 3916 while (addr < endaddr) { 3917 hmeshift = HME_HASH_SHIFT(hashno); 3918 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 3919 hblktag.htag_rehash = hashno; 3920 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift); 3921 3922 SFMMU_HASH_LOCK(hmebp); 3923 3924 HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list); 3925 if (hmeblkp != NULL) { 3926 ASSERT(!hmeblkp->hblk_shared); 3927 /* 3928 * If we encounter a shadow hmeblk then 3929 * we know there are no valid hmeblks mapping 3930 * this address at this size or larger. 3931 * Just increment address by the smallest 3932 * page size. 3933 */ 3934 if (hmeblkp->hblk_shw_bit) { 3935 addr += MMU_PAGESIZE; 3936 } else { 3937 addr = sfmmu_hblk_unlock(hmeblkp, addr, 3938 endaddr); 3939 } 3940 SFMMU_HASH_UNLOCK(hmebp); 3941 hashno = 1; 3942 continue; 3943 } 3944 SFMMU_HASH_UNLOCK(hmebp); 3945 3946 if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) { 3947 /* 3948 * We have traversed the whole list and rehashed 3949 * if necessary without finding the address to unlock 3950 * which should never happen. 3951 */ 3952 panic("sfmmu_unlock: addr not found. 
" 3953 "addr %p hat %p", (void *)addr, (void *)sfmmup); 3954 } else { 3955 hashno++; 3956 } 3957 } 3958 3959 sfmmu_hblks_list_purge(&list, 0); 3960 } 3961 3962 void 3963 hat_unlock_region(struct hat *sfmmup, caddr_t addr, size_t len, 3964 hat_region_cookie_t rcookie) 3965 { 3966 sf_srd_t *srdp; 3967 sf_region_t *rgnp; 3968 int ttesz; 3969 uint_t rid; 3970 caddr_t eaddr; 3971 caddr_t va; 3972 int hmeshift; 3973 hmeblk_tag hblktag; 3974 struct hmehash_bucket *hmebp; 3975 struct hme_blk *hmeblkp; 3976 struct hme_blk *pr_hblk; 3977 struct hme_blk *list; 3978 3979 if (rcookie == HAT_INVALID_REGION_COOKIE) { 3980 hat_unlock(sfmmup, addr, len); 3981 return; 3982 } 3983 3984 ASSERT(sfmmup != NULL); 3985 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 3986 ASSERT(sfmmup != ksfmmup); 3987 3988 srdp = sfmmup->sfmmu_srdp; 3989 rid = (uint_t)((uint64_t)rcookie); 3990 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 3991 eaddr = addr + len; 3992 va = addr; 3993 list = NULL; 3994 rgnp = srdp->srd_hmergnp[rid]; 3995 SFMMU_VALIDATE_HMERID(sfmmup, rid, addr, len); 3996 3997 ASSERT(IS_P2ALIGNED(addr, TTEBYTES(rgnp->rgn_pgszc))); 3998 ASSERT(IS_P2ALIGNED(len, TTEBYTES(rgnp->rgn_pgszc))); 3999 if (rgnp->rgn_pgszc < HBLK_MIN_TTESZ) { 4000 ttesz = HBLK_MIN_TTESZ; 4001 } else { 4002 ttesz = rgnp->rgn_pgszc; 4003 } 4004 while (va < eaddr) { 4005 while (ttesz < rgnp->rgn_pgszc && 4006 IS_P2ALIGNED(va, TTEBYTES(ttesz + 1))) { 4007 ttesz++; 4008 } 4009 while (ttesz >= HBLK_MIN_TTESZ) { 4010 if (!(rgnp->rgn_hmeflags & (1 << ttesz))) { 4011 ttesz--; 4012 continue; 4013 } 4014 hmeshift = HME_HASH_SHIFT(ttesz); 4015 hblktag.htag_bspage = HME_HASH_BSPAGE(va, hmeshift); 4016 hblktag.htag_rehash = ttesz; 4017 hblktag.htag_rid = rid; 4018 hblktag.htag_id = srdp; 4019 hmebp = HME_HASH_FUNCTION(srdp, va, hmeshift); 4020 SFMMU_HASH_LOCK(hmebp); 4021 HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, 4022 &list); 4023 if (hmeblkp == NULL) { 4024 SFMMU_HASH_UNLOCK(hmebp); 4025 ttesz--; 4026 continue; 4027 } 4028 ASSERT(hmeblkp->hblk_shared); 4029 va = sfmmu_hblk_unlock(hmeblkp, va, eaddr); 4030 ASSERT(va >= eaddr || 4031 IS_P2ALIGNED((uintptr_t)va, TTEBYTES(ttesz))); 4032 SFMMU_HASH_UNLOCK(hmebp); 4033 break; 4034 } 4035 if (ttesz < HBLK_MIN_TTESZ) { 4036 panic("hat_unlock_region: addr not found " 4037 "addr %p hat %p", (void *)va, (void *)sfmmup); 4038 } 4039 } 4040 sfmmu_hblks_list_purge(&list, 0); 4041 } 4042 4043 /* 4044 * Function to unlock a range of addresses in an hmeblk. It returns the 4045 * next address that needs to be unlocked. 4046 * Should be called with the hash lock held. 
4047 */ 4048 static caddr_t 4049 sfmmu_hblk_unlock(struct hme_blk *hmeblkp, caddr_t addr, caddr_t endaddr) 4050 { 4051 struct sf_hment *sfhme; 4052 tte_t tteold, ttemod; 4053 int ttesz, ret; 4054 4055 ASSERT(in_hblk_range(hmeblkp, addr)); 4056 ASSERT(hmeblkp->hblk_shw_bit == 0); 4057 4058 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp)); 4059 ttesz = get_hblk_ttesz(hmeblkp); 4060 4061 HBLKTOHME(sfhme, hmeblkp, addr); 4062 while (addr < endaddr) { 4063 readtte: 4064 sfmmu_copytte(&sfhme->hme_tte, &tteold); 4065 if (TTE_IS_VALID(&tteold)) { 4066 4067 ttemod = tteold; 4068 4069 ret = sfmmu_modifytte_try(&tteold, &ttemod, 4070 &sfhme->hme_tte); 4071 4072 if (ret < 0) 4073 goto readtte; 4074 4075 if (hmeblkp->hblk_lckcnt == 0) 4076 panic("zero hblk lckcnt"); 4077 4078 if (((uintptr_t)addr + TTEBYTES(ttesz)) > 4079 (uintptr_t)endaddr) 4080 panic("can't unlock large tte"); 4081 4082 ASSERT(hmeblkp->hblk_lckcnt > 0); 4083 atomic_add_32(&hmeblkp->hblk_lckcnt, -1); 4084 HBLK_STACK_TRACE(hmeblkp, HBLK_UNLOCK); 4085 } else { 4086 panic("sfmmu_hblk_unlock: invalid tte"); 4087 } 4088 addr += TTEBYTES(ttesz); 4089 sfhme++; 4090 } 4091 return (addr); 4092 } 4093 4094 /* 4095 * Physical Address Mapping Framework 4096 * 4097 * General rules: 4098 * 4099 * (1) Applies only to seg_kmem memory pages. To make things easier, 4100 * seg_kpm addresses are also accepted by the routines, but nothing 4101 * is done with them since by definition their PA mappings are static. 4102 * (2) hat_add_callback() may only be called while holding the page lock 4103 * SE_SHARED or SE_EXCL of the underlying page (e.g., as_pagelock()), 4104 * or passing HAC_PAGELOCK flag. 4105 * (3) prehandler() and posthandler() may not call hat_add_callback() or 4106 * hat_delete_callback(), nor should they allocate memory. Post quiesce 4107 * callbacks may not sleep or acquire adaptive mutex locks. 4108 * (4) Either prehandler() or posthandler() (but not both) may be specified 4109 * as being NULL. Specifying an errhandler() is optional. 4110 * 4111 * Details of using the framework: 4112 * 4113 * registering a callback (hat_register_callback()) 4114 * 4115 * Pass prehandler, posthandler, errhandler addresses 4116 * as described below. If capture_cpus argument is nonzero, 4117 * suspend callback to the prehandler will occur with CPUs 4118 * captured and executing xc_loop() and CPUs will remain 4119 * captured until after the posthandler suspend callback 4120 * occurs. 4121 * 4122 * adding a callback (hat_add_callback()) 4123 * 4124 * as_pagelock(); 4125 * hat_add_callback(); 4126 * save returned pfn in private data structures or program registers; 4127 * as_pageunlock(); 4128 * 4129 * prehandler() 4130 * 4131 * Stop all accesses by physical address to this memory page. 4132 * Called twice: the first, PRESUSPEND, is a context safe to acquire 4133 * adaptive locks. The second, SUSPEND, is called at high PIL with 4134 * CPUs captured so adaptive locks may NOT be acquired (and all spin 4135 * locks must be XCALL_PIL or higher locks). 4136 * 4137 * May return the following errors: 4138 * EIO: A fatal error has occurred. This will result in panic. 4139 * EAGAIN: The page cannot be suspended. This will fail the 4140 * relocation. 4141 * 0: Success. 4142 * 4143 * posthandler() 4144 * 4145 * Save new pfn in private data structures or program registers; 4146 * not allowed to fail (non-zero return values will result in panic). 4147 * 4148 * errhandler() 4149 * 4150 * called when an error occurs related to the callback. 
Currently 4151 * the only such error is HAT_CB_ERR_LEAKED which indicates that 4152 * a page is being freed, but there are still outstanding callback(s) 4153 * registered on the page. 4154 * 4155 * removing a callback (hat_delete_callback(); e.g., prior to freeing memory) 4156 * 4157 * stop using physical address 4158 * hat_delete_callback(); 4159 * 4160 */ 4161 4162 /* 4163 * Register a callback class. Each subsystem should do this once and 4164 * cache the id_t returned for use in setting up and tearing down callbacks. 4165 * 4166 * There is no facility for removing callback IDs once they are created; 4167 * the "key" should be unique for each module, so in case a module is unloaded 4168 * and subsequently re-loaded, we can recycle the module's previous entry. 4169 */ 4170 id_t 4171 hat_register_callback(int key, 4172 int (*prehandler)(caddr_t, uint_t, uint_t, void *), 4173 int (*posthandler)(caddr_t, uint_t, uint_t, void *, pfn_t), 4174 int (*errhandler)(caddr_t, uint_t, uint_t, void *), 4175 int capture_cpus) 4176 { 4177 id_t id; 4178 4179 /* 4180 * Search the table for a pre-existing callback associated with 4181 * the identifier "key". If one exists, we re-use that entry in 4182 * the table for this instance, otherwise we assign the next 4183 * available table slot. 4184 */ 4185 for (id = 0; id < sfmmu_max_cb_id; id++) { 4186 if (sfmmu_cb_table[id].key == key) 4187 break; 4188 } 4189 4190 if (id == sfmmu_max_cb_id) { 4191 id = sfmmu_cb_nextid++; 4192 if (id >= sfmmu_max_cb_id) 4193 panic("hat_register_callback: out of callback IDs"); 4194 } 4195 4196 ASSERT(prehandler != NULL || posthandler != NULL); 4197 4198 sfmmu_cb_table[id].key = key; 4199 sfmmu_cb_table[id].prehandler = prehandler; 4200 sfmmu_cb_table[id].posthandler = posthandler; 4201 sfmmu_cb_table[id].errhandler = errhandler; 4202 sfmmu_cb_table[id].capture_cpus = capture_cpus; 4203 4204 return (id); 4205 } 4206 4207 #define HAC_COOKIE_NONE (void *)-1 4208 4209 /* 4210 * Add relocation callbacks to the specified addr/len which will be called 4211 * when relocating the associated page. See the description of pre and 4212 * posthandler above for more details. 4213 * 4214 * If HAC_PAGELOCK is included in flags, the underlying memory page is 4215 * locked internally so the caller must be able to deal with the callback 4216 * running even before this function has returned. If HAC_PAGELOCK is not 4217 * set, it is assumed that the underlying memory pages are locked. 4218 * 4219 * Since the caller must track the individual page boundaries anyway, 4220 * we only allow a callback to be added to a single page (large 4221 * or small). Thus [addr, addr + len) MUST be contained within a single 4222 * page. 4223 * 4224 * Registering multiple callbacks on the same [addr, addr+len) is supported, 4225 * _provided_that_ a unique parameter is specified for each callback. 4226 * If multiple callbacks are registered on the same range the callback will 4227 * be invoked with each unique parameter. Registering the same callback with 4228 * the same argument more than once will result in corrupted kernel state. 4229 * 4230 * Returns the pfn of the underlying kernel page in *rpfn 4231 * on success, or PFN_INVALID on failure. 4232 * 4233 * cookiep (if passed) provides storage space for an opaque cookie 4234 * to return later to hat_delete_callback(). This cookie makes the callback 4235 * deletion significantly quicker by avoiding a potentially lengthy hash 4236 * search. 
4237 * 4238 * Returns values: 4239 * 0: success 4240 * ENOMEM: memory allocation failure (e.g. flags was passed as HAC_NOSLEEP) 4241 * EINVAL: callback ID is not valid 4242 * ENXIO: ["vaddr", "vaddr" + len) is not mapped in the kernel's address 4243 * space 4244 * ERANGE: ["vaddr", "vaddr" + len) crosses a page boundary 4245 */ 4246 int 4247 hat_add_callback(id_t callback_id, caddr_t vaddr, uint_t len, uint_t flags, 4248 void *pvt, pfn_t *rpfn, void **cookiep) 4249 { 4250 struct hmehash_bucket *hmebp; 4251 hmeblk_tag hblktag; 4252 struct hme_blk *hmeblkp; 4253 int hmeshift, hashno; 4254 caddr_t saddr, eaddr, baseaddr; 4255 struct pa_hment *pahmep; 4256 struct sf_hment *sfhmep, *osfhmep; 4257 kmutex_t *pml; 4258 tte_t tte; 4259 page_t *pp; 4260 vnode_t *vp; 4261 u_offset_t off; 4262 pfn_t pfn; 4263 int kmflags = (flags & HAC_SLEEP)? KM_SLEEP : KM_NOSLEEP; 4264 int locked = 0; 4265 4266 /* 4267 * For KPM mappings, just return the physical address since we 4268 * don't need to register any callbacks. 4269 */ 4270 if (IS_KPM_ADDR(vaddr)) { 4271 uint64_t paddr; 4272 SFMMU_KPM_VTOP(vaddr, paddr); 4273 *rpfn = btop(paddr); 4274 if (cookiep != NULL) 4275 *cookiep = HAC_COOKIE_NONE; 4276 return (0); 4277 } 4278 4279 if (callback_id < (id_t)0 || callback_id >= sfmmu_cb_nextid) { 4280 *rpfn = PFN_INVALID; 4281 return (EINVAL); 4282 } 4283 4284 if ((pahmep = kmem_cache_alloc(pa_hment_cache, kmflags)) == NULL) { 4285 *rpfn = PFN_INVALID; 4286 return (ENOMEM); 4287 } 4288 4289 sfhmep = &pahmep->sfment; 4290 4291 saddr = (caddr_t)((uintptr_t)vaddr & MMU_PAGEMASK); 4292 eaddr = saddr + len; 4293 4294 rehash: 4295 /* Find the mapping(s) for this page */ 4296 for (hashno = TTE64K, hmeblkp = NULL; 4297 hmeblkp == NULL && hashno <= mmu_hashcnt; 4298 hashno++) { 4299 hmeshift = HME_HASH_SHIFT(hashno); 4300 hblktag.htag_id = ksfmmup; 4301 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 4302 hblktag.htag_bspage = HME_HASH_BSPAGE(saddr, hmeshift); 4303 hblktag.htag_rehash = hashno; 4304 hmebp = HME_HASH_FUNCTION(ksfmmup, saddr, hmeshift); 4305 4306 SFMMU_HASH_LOCK(hmebp); 4307 4308 HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp); 4309 4310 if (hmeblkp == NULL) 4311 SFMMU_HASH_UNLOCK(hmebp); 4312 } 4313 4314 if (hmeblkp == NULL) { 4315 kmem_cache_free(pa_hment_cache, pahmep); 4316 *rpfn = PFN_INVALID; 4317 return (ENXIO); 4318 } 4319 4320 ASSERT(!hmeblkp->hblk_shared); 4321 4322 HBLKTOHME(osfhmep, hmeblkp, saddr); 4323 sfmmu_copytte(&osfhmep->hme_tte, &tte); 4324 4325 if (!TTE_IS_VALID(&tte)) { 4326 SFMMU_HASH_UNLOCK(hmebp); 4327 kmem_cache_free(pa_hment_cache, pahmep); 4328 *rpfn = PFN_INVALID; 4329 return (ENXIO); 4330 } 4331 4332 /* 4333 * Make sure the boundaries for the callback fall within this 4334 * single mapping. 4335 */ 4336 baseaddr = (caddr_t)get_hblk_base(hmeblkp); 4337 ASSERT(saddr >= baseaddr); 4338 if (eaddr > saddr + TTEBYTES(TTE_CSZ(&tte))) { 4339 SFMMU_HASH_UNLOCK(hmebp); 4340 kmem_cache_free(pa_hment_cache, pahmep); 4341 *rpfn = PFN_INVALID; 4342 return (ERANGE); 4343 } 4344 4345 pfn = sfmmu_ttetopfn(&tte, vaddr); 4346 4347 /* 4348 * The pfn may not have a page_t underneath in which case we 4349 * just return it. This can happen if we are doing I/O to a 4350 * static portion of the kernel's address space, for instance. 
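 *
 * In that case the pfn is handed back with the cookie set to
 * HAC_COOKIE_NONE, so a later hat_delete_callback() on the range is an
 * immediate no-op (see the cookie check at the top of that routine).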
4351 */ 4352 pp = osfhmep->hme_page; 4353 if (pp == NULL) { 4354 SFMMU_HASH_UNLOCK(hmebp); 4355 kmem_cache_free(pa_hment_cache, pahmep); 4356 *rpfn = pfn; 4357 if (cookiep) 4358 *cookiep = HAC_COOKIE_NONE; 4359 return (0); 4360 } 4361 ASSERT(pp == PP_PAGEROOT(pp)); 4362 4363 vp = pp->p_vnode; 4364 off = pp->p_offset; 4365 4366 pml = sfmmu_mlist_enter(pp); 4367 4368 if (flags & HAC_PAGELOCK) { 4369 if (!page_trylock(pp, SE_SHARED)) { 4370 /* 4371 * Somebody is holding SE_EXCL lock. Might 4372 * even be hat_page_relocate(). Drop all 4373 * our locks, lookup the page in &kvp, and 4374 * retry. If it doesn't exist in &kvp and &zvp, 4375 * then we must be dealing with a kernel mapped 4376 * page which doesn't actually belong to 4377 * segkmem so we punt. 4378 */ 4379 sfmmu_mlist_exit(pml); 4380 SFMMU_HASH_UNLOCK(hmebp); 4381 pp = page_lookup(&kvp, (u_offset_t)saddr, SE_SHARED); 4382 4383 /* check zvp before giving up */ 4384 if (pp == NULL) 4385 pp = page_lookup(&zvp, (u_offset_t)saddr, 4386 SE_SHARED); 4387 4388 /* Okay, we didn't find it, give up */ 4389 if (pp == NULL) { 4390 kmem_cache_free(pa_hment_cache, pahmep); 4391 *rpfn = pfn; 4392 if (cookiep) 4393 *cookiep = HAC_COOKIE_NONE; 4394 return (0); 4395 } 4396 page_unlock(pp); 4397 goto rehash; 4398 } 4399 locked = 1; 4400 } 4401 4402 if (!PAGE_LOCKED(pp) && !panicstr) 4403 panic("hat_add_callback: page 0x%p not locked", (void *)pp); 4404 4405 if (osfhmep->hme_page != pp || pp->p_vnode != vp || 4406 pp->p_offset != off) { 4407 /* 4408 * The page moved before we got our hands on it. Drop 4409 * all the locks and try again. 4410 */ 4411 ASSERT((flags & HAC_PAGELOCK) != 0); 4412 sfmmu_mlist_exit(pml); 4413 SFMMU_HASH_UNLOCK(hmebp); 4414 page_unlock(pp); 4415 locked = 0; 4416 goto rehash; 4417 } 4418 4419 if (!VN_ISKAS(vp)) { 4420 /* 4421 * This is not a segkmem page but another page which 4422 * has been kernel mapped. It had better have at least 4423 * a share lock on it. Return the pfn. 4424 */ 4425 sfmmu_mlist_exit(pml); 4426 SFMMU_HASH_UNLOCK(hmebp); 4427 if (locked) 4428 page_unlock(pp); 4429 kmem_cache_free(pa_hment_cache, pahmep); 4430 ASSERT(PAGE_LOCKED(pp)); 4431 *rpfn = pfn; 4432 if (cookiep) 4433 *cookiep = HAC_COOKIE_NONE; 4434 return (0); 4435 } 4436 4437 /* 4438 * Setup this pa_hment and link its embedded dummy sf_hment into 4439 * the mapping list. 4440 */ 4441 pp->p_share++; 4442 pahmep->cb_id = callback_id; 4443 pahmep->addr = vaddr; 4444 pahmep->len = len; 4445 pahmep->refcnt = 1; 4446 pahmep->flags = 0; 4447 pahmep->pvt = pvt; 4448 4449 sfhmep->hme_tte.ll = 0; 4450 sfhmep->hme_data = pahmep; 4451 sfhmep->hme_prev = osfhmep; 4452 sfhmep->hme_next = osfhmep->hme_next; 4453 4454 if (osfhmep->hme_next) 4455 osfhmep->hme_next->hme_prev = sfhmep; 4456 4457 osfhmep->hme_next = sfhmep; 4458 4459 sfmmu_mlist_exit(pml); 4460 SFMMU_HASH_UNLOCK(hmebp); 4461 4462 if (locked) 4463 page_unlock(pp); 4464 4465 *rpfn = pfn; 4466 if (cookiep) 4467 *cookiep = (void *)pahmep; 4468 4469 return (0); 4470 } 4471 4472 /* 4473 * Remove the relocation callbacks from the specified addr/len. 
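 *
 * A hedged end-to-end sketch of the framework, tying together
 * hat_register_callback(), hat_add_callback() and
 * hat_delete_callback().  MYDRV_CB_KEY, pre, post, err, pvt and the
 * error handling are made-up caller-side names used only for
 * illustration:
 *
 *	id = hat_register_callback(MYDRV_CB_KEY, pre, post, err, 0);
 *	...
 *	if (hat_add_callback(id, va, len, HAC_PAGELOCK, pvt,
 *	    &pfn, &cookie) != 0)
 *		return (error);
 *	... access the page by physical address using pfn ...
 *	hat_delete_callback(va, len, pvt, HAC_PAGELOCK, cookie);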
4474 */ 4475 void 4476 hat_delete_callback(caddr_t vaddr, uint_t len, void *pvt, uint_t flags, 4477 void *cookie) 4478 { 4479 struct hmehash_bucket *hmebp; 4480 hmeblk_tag hblktag; 4481 struct hme_blk *hmeblkp; 4482 int hmeshift, hashno; 4483 caddr_t saddr; 4484 struct pa_hment *pahmep; 4485 struct sf_hment *sfhmep, *osfhmep; 4486 kmutex_t *pml; 4487 tte_t tte; 4488 page_t *pp; 4489 vnode_t *vp; 4490 u_offset_t off; 4491 int locked = 0; 4492 4493 /* 4494 * If the cookie is HAC_COOKIE_NONE then there is no pa_hment to 4495 * remove so just return. 4496 */ 4497 if (cookie == HAC_COOKIE_NONE || IS_KPM_ADDR(vaddr)) 4498 return; 4499 4500 saddr = (caddr_t)((uintptr_t)vaddr & MMU_PAGEMASK); 4501 4502 rehash: 4503 /* Find the mapping(s) for this page */ 4504 for (hashno = TTE64K, hmeblkp = NULL; 4505 hmeblkp == NULL && hashno <= mmu_hashcnt; 4506 hashno++) { 4507 hmeshift = HME_HASH_SHIFT(hashno); 4508 hblktag.htag_id = ksfmmup; 4509 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 4510 hblktag.htag_bspage = HME_HASH_BSPAGE(saddr, hmeshift); 4511 hblktag.htag_rehash = hashno; 4512 hmebp = HME_HASH_FUNCTION(ksfmmup, saddr, hmeshift); 4513 4514 SFMMU_HASH_LOCK(hmebp); 4515 4516 HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp); 4517 4518 if (hmeblkp == NULL) 4519 SFMMU_HASH_UNLOCK(hmebp); 4520 } 4521 4522 if (hmeblkp == NULL) 4523 return; 4524 4525 ASSERT(!hmeblkp->hblk_shared); 4526 4527 HBLKTOHME(osfhmep, hmeblkp, saddr); 4528 4529 sfmmu_copytte(&osfhmep->hme_tte, &tte); 4530 if (!TTE_IS_VALID(&tte)) { 4531 SFMMU_HASH_UNLOCK(hmebp); 4532 return; 4533 } 4534 4535 pp = osfhmep->hme_page; 4536 if (pp == NULL) { 4537 SFMMU_HASH_UNLOCK(hmebp); 4538 ASSERT(cookie == NULL); 4539 return; 4540 } 4541 4542 vp = pp->p_vnode; 4543 off = pp->p_offset; 4544 4545 pml = sfmmu_mlist_enter(pp); 4546 4547 if (flags & HAC_PAGELOCK) { 4548 if (!page_trylock(pp, SE_SHARED)) { 4549 /* 4550 * Somebody is holding SE_EXCL lock. Might 4551 * even be hat_page_relocate(). Drop all 4552 * our locks, lookup the page in &kvp, and 4553 * retry. If it doesn't exist in &kvp and &zvp, 4554 * then we must be dealing with a kernel mapped 4555 * page which doesn't actually belong to 4556 * segkmem so we punt. 4557 */ 4558 sfmmu_mlist_exit(pml); 4559 SFMMU_HASH_UNLOCK(hmebp); 4560 pp = page_lookup(&kvp, (u_offset_t)saddr, SE_SHARED); 4561 /* check zvp before giving up */ 4562 if (pp == NULL) 4563 pp = page_lookup(&zvp, (u_offset_t)saddr, 4564 SE_SHARED); 4565 4566 if (pp == NULL) { 4567 ASSERT(cookie == NULL); 4568 return; 4569 } 4570 page_unlock(pp); 4571 goto rehash; 4572 } 4573 locked = 1; 4574 } 4575 4576 ASSERT(PAGE_LOCKED(pp)); 4577 4578 if (osfhmep->hme_page != pp || pp->p_vnode != vp || 4579 pp->p_offset != off) { 4580 /* 4581 * The page moved before we got our hands on it. Drop 4582 * all the locks and try again. 4583 */ 4584 ASSERT((flags & HAC_PAGELOCK) != 0); 4585 sfmmu_mlist_exit(pml); 4586 SFMMU_HASH_UNLOCK(hmebp); 4587 page_unlock(pp); 4588 locked = 0; 4589 goto rehash; 4590 } 4591 4592 if (!VN_ISKAS(vp)) { 4593 /* 4594 * This is not a segkmem page but another page which 4595 * has been kernel mapped. 
4596 */ 4597 sfmmu_mlist_exit(pml); 4598 SFMMU_HASH_UNLOCK(hmebp); 4599 if (locked) 4600 page_unlock(pp); 4601 ASSERT(cookie == NULL); 4602 return; 4603 } 4604 4605 if (cookie != NULL) { 4606 pahmep = (struct pa_hment *)cookie; 4607 sfhmep = &pahmep->sfment; 4608 } else { 4609 for (sfhmep = pp->p_mapping; sfhmep != NULL; 4610 sfhmep = sfhmep->hme_next) { 4611 4612 /* 4613 * skip va<->pa mappings 4614 */ 4615 if (!IS_PAHME(sfhmep)) 4616 continue; 4617 4618 pahmep = sfhmep->hme_data; 4619 ASSERT(pahmep != NULL); 4620 4621 /* 4622 * if pa_hment matches, remove it 4623 */ 4624 if ((pahmep->pvt == pvt) && 4625 (pahmep->addr == vaddr) && 4626 (pahmep->len == len)) { 4627 break; 4628 } 4629 } 4630 } 4631 4632 if (sfhmep == NULL) { 4633 if (!panicstr) { 4634 panic("hat_delete_callback: pa_hment not found, pp %p", 4635 (void *)pp); 4636 } 4637 return; 4638 } 4639 4640 /* 4641 * Note: at this point a valid kernel mapping must still be 4642 * present on this page. 4643 */ 4644 pp->p_share--; 4645 if (pp->p_share <= 0) 4646 panic("hat_delete_callback: zero p_share"); 4647 4648 if (--pahmep->refcnt == 0) { 4649 if (pahmep->flags != 0) 4650 panic("hat_delete_callback: pa_hment is busy"); 4651 4652 /* 4653 * Remove sfhmep from the mapping list for the page. 4654 */ 4655 if (sfhmep->hme_prev) { 4656 sfhmep->hme_prev->hme_next = sfhmep->hme_next; 4657 } else { 4658 pp->p_mapping = sfhmep->hme_next; 4659 } 4660 4661 if (sfhmep->hme_next) 4662 sfhmep->hme_next->hme_prev = sfhmep->hme_prev; 4663 4664 sfmmu_mlist_exit(pml); 4665 SFMMU_HASH_UNLOCK(hmebp); 4666 4667 if (locked) 4668 page_unlock(pp); 4669 4670 kmem_cache_free(pa_hment_cache, pahmep); 4671 return; 4672 } 4673 4674 sfmmu_mlist_exit(pml); 4675 SFMMU_HASH_UNLOCK(hmebp); 4676 if (locked) 4677 page_unlock(pp); 4678 } 4679 4680 /* 4681 * hat_probe returns 1 if the translation for the address 'addr' is 4682 * loaded, zero otherwise. 4683 * 4684 * hat_probe should be used only for advisorary purposes because it may 4685 * occasionally return the wrong value. The implementation must guarantee that 4686 * returning the wrong value is a very rare event. hat_probe is used 4687 * to implement optimizations in the segment drivers. 
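 *
 * A hedged example of the advisory use described above; the caller is
 * illustrative and not taken from this file.  A segment driver might
 * skip a redundant reload when a translation already exists:
 *
 *	if (hat_probe(as->a_hat, addr) == 0)
 *		hat_memload(as->a_hat, addr, pp, prot, hat_flags);
 *
 * Because the result is only advisory, the driver must still be able
 * to handle a fault on addr in the rare case the answer was wrong.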
4688 * 4689 */ 4690 int 4691 hat_probe(struct hat *sfmmup, caddr_t addr) 4692 { 4693 pfn_t pfn; 4694 tte_t tte; 4695 4696 ASSERT(sfmmup != NULL); 4697 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 4698 4699 ASSERT((sfmmup == ksfmmup) || 4700 AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 4701 4702 if (sfmmup == ksfmmup) { 4703 while ((pfn = sfmmu_vatopfn(addr, sfmmup, &tte)) 4704 == PFN_SUSPENDED) { 4705 sfmmu_vatopfn_suspended(addr, sfmmup, &tte); 4706 } 4707 } else { 4708 pfn = sfmmu_uvatopfn(addr, sfmmup, NULL); 4709 } 4710 4711 if (pfn != PFN_INVALID) 4712 return (1); 4713 else 4714 return (0); 4715 } 4716 4717 ssize_t 4718 hat_getpagesize(struct hat *sfmmup, caddr_t addr) 4719 { 4720 tte_t tte; 4721 4722 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 4723 4724 if (sfmmup == ksfmmup) { 4725 if (sfmmu_vatopfn(addr, sfmmup, &tte) == PFN_INVALID) { 4726 return (-1); 4727 } 4728 } else { 4729 if (sfmmu_uvatopfn(addr, sfmmup, &tte) == PFN_INVALID) { 4730 return (-1); 4731 } 4732 } 4733 4734 ASSERT(TTE_IS_VALID(&tte)); 4735 return (TTEBYTES(TTE_CSZ(&tte))); 4736 } 4737 4738 uint_t 4739 hat_getattr(struct hat *sfmmup, caddr_t addr, uint_t *attr) 4740 { 4741 tte_t tte; 4742 4743 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 4744 4745 if (sfmmup == ksfmmup) { 4746 if (sfmmu_vatopfn(addr, sfmmup, &tte) == PFN_INVALID) { 4747 tte.ll = 0; 4748 } 4749 } else { 4750 if (sfmmu_uvatopfn(addr, sfmmup, &tte) == PFN_INVALID) { 4751 tte.ll = 0; 4752 } 4753 } 4754 if (TTE_IS_VALID(&tte)) { 4755 *attr = sfmmu_ptov_attr(&tte); 4756 return (0); 4757 } 4758 *attr = 0; 4759 return ((uint_t)0xffffffff); 4760 } 4761 4762 /* 4763 * Enables more attributes on specified address range (ie. logical OR) 4764 */ 4765 void 4766 hat_setattr(struct hat *hat, caddr_t addr, size_t len, uint_t attr) 4767 { 4768 if (hat->sfmmu_xhat_provider) { 4769 XHAT_SETATTR(hat, addr, len, attr); 4770 return; 4771 } else { 4772 /* 4773 * This must be a CPU HAT. If the address space has 4774 * XHATs attached, change attributes for all of them, 4775 * just in case 4776 */ 4777 ASSERT(hat->sfmmu_as != NULL); 4778 if (hat->sfmmu_as->a_xhat != NULL) 4779 xhat_setattr_all(hat->sfmmu_as, addr, len, attr); 4780 } 4781 4782 sfmmu_chgattr(hat, addr, len, attr, SFMMU_SETATTR); 4783 } 4784 4785 /* 4786 * Assigns attributes to the specified address range. All the attributes 4787 * are specified. 4788 */ 4789 void 4790 hat_chgattr(struct hat *hat, caddr_t addr, size_t len, uint_t attr) 4791 { 4792 if (hat->sfmmu_xhat_provider) { 4793 XHAT_CHGATTR(hat, addr, len, attr); 4794 return; 4795 } else { 4796 /* 4797 * This must be a CPU HAT. If the address space has 4798 * XHATs attached, change attributes for all of them, 4799 * just in case 4800 */ 4801 ASSERT(hat->sfmmu_as != NULL); 4802 if (hat->sfmmu_as->a_xhat != NULL) 4803 xhat_chgattr_all(hat->sfmmu_as, addr, len, attr); 4804 } 4805 4806 sfmmu_chgattr(hat, addr, len, attr, SFMMU_CHGATTR); 4807 } 4808 4809 /* 4810 * Remove attributes on the specified address range (ie. loginal NAND) 4811 */ 4812 void 4813 hat_clrattr(struct hat *hat, caddr_t addr, size_t len, uint_t attr) 4814 { 4815 if (hat->sfmmu_xhat_provider) { 4816 XHAT_CLRATTR(hat, addr, len, attr); 4817 return; 4818 } else { 4819 /* 4820 * This must be a CPU HAT. 
If the address space has 4821 * XHATs attached, change attributes for all of them, 4822 * just in case 4823 */ 4824 ASSERT(hat->sfmmu_as != NULL); 4825 if (hat->sfmmu_as->a_xhat != NULL) 4826 xhat_clrattr_all(hat->sfmmu_as, addr, len, attr); 4827 } 4828 4829 sfmmu_chgattr(hat, addr, len, attr, SFMMU_CLRATTR); 4830 } 4831 4832 /* 4833 * Change attributes on an address range to that specified by attr and mode. 4834 */ 4835 static void 4836 sfmmu_chgattr(struct hat *sfmmup, caddr_t addr, size_t len, uint_t attr, 4837 int mode) 4838 { 4839 struct hmehash_bucket *hmebp; 4840 hmeblk_tag hblktag; 4841 int hmeshift, hashno = 1; 4842 struct hme_blk *hmeblkp, *list = NULL; 4843 caddr_t endaddr; 4844 cpuset_t cpuset; 4845 demap_range_t dmr; 4846 4847 CPUSET_ZERO(cpuset); 4848 4849 ASSERT((sfmmup == ksfmmup) || 4850 AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 4851 ASSERT((len & MMU_PAGEOFFSET) == 0); 4852 ASSERT(((uintptr_t)addr & MMU_PAGEOFFSET) == 0); 4853 4854 if ((attr & PROT_USER) && (mode != SFMMU_CLRATTR) && 4855 ((addr + len) > (caddr_t)USERLIMIT)) { 4856 panic("user addr %p in kernel space", 4857 (void *)addr); 4858 } 4859 4860 endaddr = addr + len; 4861 hblktag.htag_id = sfmmup; 4862 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 4863 DEMAP_RANGE_INIT(sfmmup, &dmr); 4864 4865 while (addr < endaddr) { 4866 hmeshift = HME_HASH_SHIFT(hashno); 4867 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 4868 hblktag.htag_rehash = hashno; 4869 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift); 4870 4871 SFMMU_HASH_LOCK(hmebp); 4872 4873 HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list); 4874 if (hmeblkp != NULL) { 4875 ASSERT(!hmeblkp->hblk_shared); 4876 /* 4877 * We've encountered a shadow hmeblk so skip the range 4878 * of the next smaller mapping size. 4879 */ 4880 if (hmeblkp->hblk_shw_bit) { 4881 ASSERT(sfmmup != ksfmmup); 4882 ASSERT(hashno > 1); 4883 addr = (caddr_t)P2END((uintptr_t)addr, 4884 TTEBYTES(hashno - 1)); 4885 } else { 4886 addr = sfmmu_hblk_chgattr(sfmmup, 4887 hmeblkp, addr, endaddr, &dmr, attr, mode); 4888 } 4889 SFMMU_HASH_UNLOCK(hmebp); 4890 hashno = 1; 4891 continue; 4892 } 4893 SFMMU_HASH_UNLOCK(hmebp); 4894 4895 if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) { 4896 /* 4897 * We have traversed the whole list and rehashed 4898 * if necessary without finding the address to chgattr. 4899 * This is ok, so we increment the address by the 4900 * smallest hmeblk range for kernel mappings or for 4901 * user mappings with no large pages, and the largest 4902 * hmeblk range, to account for shadow hmeblks, for 4903 * user mappings with large pages and continue. 4904 */ 4905 if (sfmmup == ksfmmup) 4906 addr = (caddr_t)P2END((uintptr_t)addr, 4907 TTEBYTES(1)); 4908 else 4909 addr = (caddr_t)P2END((uintptr_t)addr, 4910 TTEBYTES(hashno)); 4911 hashno = 1; 4912 } else { 4913 hashno++; 4914 } 4915 } 4916 4917 sfmmu_hblks_list_purge(&list, 0); 4918 DEMAP_RANGE_FLUSH(&dmr); 4919 cpuset = sfmmup->sfmmu_cpusran; 4920 xt_sync(cpuset); 4921 } 4922 4923 /* 4924 * This function chgattr on a range of addresses in an hmeblk. It returns the 4925 * next addres that needs to be chgattr. 4926 * It should be called with the hash lock held. 4927 * XXX It should be possible to optimize chgattr by not flushing every time but 4928 * on the other hand: 4929 * 1. do one flush crosscall. 4930 * 2. 
only flush if we are increasing permissions (make sure this will work) 4931 */ 4932 static caddr_t 4933 sfmmu_hblk_chgattr(struct hat *sfmmup, struct hme_blk *hmeblkp, caddr_t addr, 4934 caddr_t endaddr, demap_range_t *dmrp, uint_t attr, int mode) 4935 { 4936 tte_t tte, tteattr, tteflags, ttemod; 4937 struct sf_hment *sfhmep; 4938 int ttesz; 4939 struct page *pp = NULL; 4940 kmutex_t *pml, *pmtx; 4941 int ret; 4942 int use_demap_range; 4943 #if defined(SF_ERRATA_57) 4944 int check_exec; 4945 #endif 4946 4947 ASSERT(in_hblk_range(hmeblkp, addr)); 4948 ASSERT(hmeblkp->hblk_shw_bit == 0); 4949 ASSERT(!hmeblkp->hblk_shared); 4950 4951 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp)); 4952 ttesz = get_hblk_ttesz(hmeblkp); 4953 4954 /* 4955 * Flush the current demap region if addresses have been 4956 * skipped or the page size doesn't match. 4957 */ 4958 use_demap_range = (TTEBYTES(ttesz) == DEMAP_RANGE_PGSZ(dmrp)); 4959 if (use_demap_range) { 4960 DEMAP_RANGE_CONTINUE(dmrp, addr, endaddr); 4961 } else { 4962 DEMAP_RANGE_FLUSH(dmrp); 4963 } 4964 4965 tteattr.ll = sfmmu_vtop_attr(attr, mode, &tteflags); 4966 #if defined(SF_ERRATA_57) 4967 check_exec = (sfmmup != ksfmmup) && 4968 AS_TYPE_64BIT(sfmmup->sfmmu_as) && 4969 TTE_IS_EXECUTABLE(&tteattr); 4970 #endif 4971 HBLKTOHME(sfhmep, hmeblkp, addr); 4972 while (addr < endaddr) { 4973 sfmmu_copytte(&sfhmep->hme_tte, &tte); 4974 if (TTE_IS_VALID(&tte)) { 4975 if ((tte.ll & tteflags.ll) == tteattr.ll) { 4976 /* 4977 * if the new attr is the same as old 4978 * continue 4979 */ 4980 goto next_addr; 4981 } 4982 if (!TTE_IS_WRITABLE(&tteattr)) { 4983 /* 4984 * make sure we clear hw modify bit if we 4985 * removing write protections 4986 */ 4987 tteflags.tte_intlo |= TTE_HWWR_INT; 4988 } 4989 4990 pml = NULL; 4991 pp = sfhmep->hme_page; 4992 if (pp) { 4993 pml = sfmmu_mlist_enter(pp); 4994 } 4995 4996 if (pp != sfhmep->hme_page) { 4997 /* 4998 * tte must have been unloaded. 4999 */ 5000 ASSERT(pml); 5001 sfmmu_mlist_exit(pml); 5002 continue; 5003 } 5004 5005 ASSERT(pp == NULL || sfmmu_mlist_held(pp)); 5006 5007 ttemod = tte; 5008 ttemod.ll = (ttemod.ll & ~tteflags.ll) | tteattr.ll; 5009 ASSERT(TTE_TO_TTEPFN(&ttemod) == TTE_TO_TTEPFN(&tte)); 5010 5011 #if defined(SF_ERRATA_57) 5012 if (check_exec && addr < errata57_limit) 5013 ttemod.tte_exec_perm = 0; 5014 #endif 5015 ret = sfmmu_modifytte_try(&tte, &ttemod, 5016 &sfhmep->hme_tte); 5017 5018 if (ret < 0) { 5019 /* tte changed underneath us */ 5020 if (pml) { 5021 sfmmu_mlist_exit(pml); 5022 } 5023 continue; 5024 } 5025 5026 if ((tteflags.tte_intlo & TTE_HWWR_INT) || 5027 (TTE_EXECUTED(&tte) && 5028 !TTE_IS_EXECUTABLE(&ttemod))) { 5029 /* 5030 * need to sync if clearing modify/exec bit. 5031 */ 5032 sfmmu_ttesync(sfmmup, addr, &tte, pp); 5033 } 5034 5035 if (pp && PP_ISRO(pp)) { 5036 if (tteattr.tte_intlo & TTE_WRPRM_INT) { 5037 pmtx = sfmmu_page_enter(pp); 5038 PP_CLRRO(pp); 5039 sfmmu_page_exit(pmtx); 5040 } 5041 } 5042 5043 if (ret > 0 && use_demap_range) { 5044 DEMAP_RANGE_MARKPG(dmrp, addr); 5045 } else if (ret > 0) { 5046 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0); 5047 } 5048 5049 if (pml) { 5050 sfmmu_mlist_exit(pml); 5051 } 5052 } 5053 next_addr: 5054 addr += TTEBYTES(ttesz); 5055 sfhmep++; 5056 DEMAP_RANGE_NEXTPG(dmrp); 5057 } 5058 return (addr); 5059 } 5060 5061 /* 5062 * This routine converts virtual attributes to physical ones. It will 5063 * update the tteflags field with the tte mask corresponding to the attributes 5064 * affected and it returns the new attributes. 
It will also clear the modify 5065 * bit if we are taking away write permission. This is necessary since the 5066 * modify bit is the hardware permission bit and we need to clear it in order 5067 * to detect write faults. 5068 */ 5069 static uint64_t 5070 sfmmu_vtop_attr(uint_t attr, int mode, tte_t *ttemaskp) 5071 { 5072 tte_t ttevalue; 5073 5074 ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR)); 5075 5076 switch (mode) { 5077 case SFMMU_CHGATTR: 5078 /* all attributes specified */ 5079 ttevalue.tte_inthi = MAKE_TTEATTR_INTHI(attr); 5080 ttevalue.tte_intlo = MAKE_TTEATTR_INTLO(attr); 5081 ttemaskp->tte_inthi = TTEINTHI_ATTR; 5082 ttemaskp->tte_intlo = TTEINTLO_ATTR; 5083 if (!icache_is_coherent) { 5084 if (!(attr & PROT_EXEC)) { 5085 TTE_SET_SOFTEXEC(ttemaskp); 5086 } else { 5087 TTE_CLR_EXEC(ttemaskp); 5088 TTE_SET_SOFTEXEC(&ttevalue); 5089 } 5090 } 5091 break; 5092 case SFMMU_SETATTR: 5093 ASSERT(!(attr & ~HAT_PROT_MASK)); 5094 ttemaskp->ll = 0; 5095 ttevalue.ll = 0; 5096 /* 5097 * a valid tte implies exec and read for sfmmu 5098 * so no need to do anything about them. 5099 * since priviledged access implies user access 5100 * PROT_USER doesn't make sense either. 5101 */ 5102 if (attr & PROT_WRITE) { 5103 ttemaskp->tte_intlo |= TTE_WRPRM_INT; 5104 ttevalue.tte_intlo |= TTE_WRPRM_INT; 5105 } 5106 break; 5107 case SFMMU_CLRATTR: 5108 /* attributes will be nand with current ones */ 5109 if (attr & ~(PROT_WRITE | PROT_USER)) { 5110 panic("sfmmu: attr %x not supported", attr); 5111 } 5112 ttemaskp->ll = 0; 5113 ttevalue.ll = 0; 5114 if (attr & PROT_WRITE) { 5115 /* clear both writable and modify bit */ 5116 ttemaskp->tte_intlo |= TTE_WRPRM_INT | TTE_HWWR_INT; 5117 } 5118 if (attr & PROT_USER) { 5119 ttemaskp->tte_intlo |= TTE_PRIV_INT; 5120 ttevalue.tte_intlo |= TTE_PRIV_INT; 5121 } 5122 break; 5123 default: 5124 panic("sfmmu_vtop_attr: bad mode %x", mode); 5125 } 5126 ASSERT(TTE_TO_TTEPFN(&ttevalue) == 0); 5127 return (ttevalue.ll); 5128 } 5129 5130 static uint_t 5131 sfmmu_ptov_attr(tte_t *ttep) 5132 { 5133 uint_t attr; 5134 5135 ASSERT(TTE_IS_VALID(ttep)); 5136 5137 attr = PROT_READ; 5138 5139 if (TTE_IS_WRITABLE(ttep)) { 5140 attr |= PROT_WRITE; 5141 } 5142 if (TTE_IS_EXECUTABLE(ttep)) { 5143 attr |= PROT_EXEC; 5144 } 5145 if (TTE_IS_SOFTEXEC(ttep)) { 5146 attr |= PROT_EXEC; 5147 } 5148 if (!TTE_IS_PRIVILEGED(ttep)) { 5149 attr |= PROT_USER; 5150 } 5151 if (TTE_IS_NFO(ttep)) { 5152 attr |= HAT_NOFAULT; 5153 } 5154 if (TTE_IS_NOSYNC(ttep)) { 5155 attr |= HAT_NOSYNC; 5156 } 5157 if (TTE_IS_SIDEFFECT(ttep)) { 5158 attr |= SFMMU_SIDEFFECT; 5159 } 5160 if (!TTE_IS_VCACHEABLE(ttep)) { 5161 attr |= SFMMU_UNCACHEVTTE; 5162 } 5163 if (!TTE_IS_PCACHEABLE(ttep)) { 5164 attr |= SFMMU_UNCACHEPTTE; 5165 } 5166 return (attr); 5167 } 5168 5169 /* 5170 * hat_chgprot is a deprecated hat call. New segment drivers 5171 * should store all attributes and use hat_*attr calls. 5172 * 5173 * Change the protections in the virtual address range 5174 * given to the specified virtual protection. If vprot is ~PROT_WRITE, 5175 * then remove write permission, leaving the other 5176 * permissions unchanged. If vprot is ~PROT_USER, remove user permissions. 
5177 * 5178 */ 5179 void 5180 hat_chgprot(struct hat *sfmmup, caddr_t addr, size_t len, uint_t vprot) 5181 { 5182 struct hmehash_bucket *hmebp; 5183 hmeblk_tag hblktag; 5184 int hmeshift, hashno = 1; 5185 struct hme_blk *hmeblkp, *list = NULL; 5186 caddr_t endaddr; 5187 cpuset_t cpuset; 5188 demap_range_t dmr; 5189 5190 ASSERT((len & MMU_PAGEOFFSET) == 0); 5191 ASSERT(((uintptr_t)addr & MMU_PAGEOFFSET) == 0); 5192 5193 if (sfmmup->sfmmu_xhat_provider) { 5194 XHAT_CHGPROT(sfmmup, addr, len, vprot); 5195 return; 5196 } else { 5197 /* 5198 * This must be a CPU HAT. If the address space has 5199 * XHATs attached, change attributes for all of them, 5200 * just in case 5201 */ 5202 ASSERT(sfmmup->sfmmu_as != NULL); 5203 if (sfmmup->sfmmu_as->a_xhat != NULL) 5204 xhat_chgprot_all(sfmmup->sfmmu_as, addr, len, vprot); 5205 } 5206 5207 CPUSET_ZERO(cpuset); 5208 5209 if ((vprot != (uint_t)~PROT_WRITE) && (vprot & PROT_USER) && 5210 ((addr + len) > (caddr_t)USERLIMIT)) { 5211 panic("user addr %p vprot %x in kernel space", 5212 (void *)addr, vprot); 5213 } 5214 endaddr = addr + len; 5215 hblktag.htag_id = sfmmup; 5216 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 5217 DEMAP_RANGE_INIT(sfmmup, &dmr); 5218 5219 while (addr < endaddr) { 5220 hmeshift = HME_HASH_SHIFT(hashno); 5221 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 5222 hblktag.htag_rehash = hashno; 5223 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift); 5224 5225 SFMMU_HASH_LOCK(hmebp); 5226 5227 HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list); 5228 if (hmeblkp != NULL) { 5229 ASSERT(!hmeblkp->hblk_shared); 5230 /* 5231 * We've encountered a shadow hmeblk so skip the range 5232 * of the next smaller mapping size. 5233 */ 5234 if (hmeblkp->hblk_shw_bit) { 5235 ASSERT(sfmmup != ksfmmup); 5236 ASSERT(hashno > 1); 5237 addr = (caddr_t)P2END((uintptr_t)addr, 5238 TTEBYTES(hashno - 1)); 5239 } else { 5240 addr = sfmmu_hblk_chgprot(sfmmup, hmeblkp, 5241 addr, endaddr, &dmr, vprot); 5242 } 5243 SFMMU_HASH_UNLOCK(hmebp); 5244 hashno = 1; 5245 continue; 5246 } 5247 SFMMU_HASH_UNLOCK(hmebp); 5248 5249 if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) { 5250 /* 5251 * We have traversed the whole list and rehashed 5252 * if necessary without finding the address to chgprot. 5253 * This is ok so we increment the address by the 5254 * smallest hmeblk range for kernel mappings and the 5255 * largest hmeblk range, to account for shadow hmeblks, 5256 * for user mappings and continue. 5257 */ 5258 if (sfmmup == ksfmmup) 5259 addr = (caddr_t)P2END((uintptr_t)addr, 5260 TTEBYTES(1)); 5261 else 5262 addr = (caddr_t)P2END((uintptr_t)addr, 5263 TTEBYTES(hashno)); 5264 hashno = 1; 5265 } else { 5266 hashno++; 5267 } 5268 } 5269 5270 sfmmu_hblks_list_purge(&list, 0); 5271 DEMAP_RANGE_FLUSH(&dmr); 5272 cpuset = sfmmup->sfmmu_cpusran; 5273 xt_sync(cpuset); 5274 } 5275 5276 /* 5277 * This function chgprots a range of addresses in an hmeblk. It returns the 5278 * next addres that needs to be chgprot. 5279 * It should be called with the hash lock held. 5280 * XXX It shold be possible to optimize chgprot by not flushing every time but 5281 * on the other hand: 5282 * 1. do one flush crosscall. 5283 * 2. 
only flush if we are increasing permissions (make sure this will work) 5284 */ 5285 static caddr_t 5286 sfmmu_hblk_chgprot(sfmmu_t *sfmmup, struct hme_blk *hmeblkp, caddr_t addr, 5287 caddr_t endaddr, demap_range_t *dmrp, uint_t vprot) 5288 { 5289 uint_t pprot; 5290 tte_t tte, ttemod; 5291 struct sf_hment *sfhmep; 5292 uint_t tteflags; 5293 int ttesz; 5294 struct page *pp = NULL; 5295 kmutex_t *pml, *pmtx; 5296 int ret; 5297 int use_demap_range; 5298 #if defined(SF_ERRATA_57) 5299 int check_exec; 5300 #endif 5301 5302 ASSERT(in_hblk_range(hmeblkp, addr)); 5303 ASSERT(hmeblkp->hblk_shw_bit == 0); 5304 ASSERT(!hmeblkp->hblk_shared); 5305 5306 #ifdef DEBUG 5307 if (get_hblk_ttesz(hmeblkp) != TTE8K && 5308 (endaddr < get_hblk_endaddr(hmeblkp))) { 5309 panic("sfmmu_hblk_chgprot: partial chgprot of large page"); 5310 } 5311 #endif /* DEBUG */ 5312 5313 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp)); 5314 ttesz = get_hblk_ttesz(hmeblkp); 5315 5316 pprot = sfmmu_vtop_prot(vprot, &tteflags); 5317 #if defined(SF_ERRATA_57) 5318 check_exec = (sfmmup != ksfmmup) && 5319 AS_TYPE_64BIT(sfmmup->sfmmu_as) && 5320 ((vprot & PROT_EXEC) == PROT_EXEC); 5321 #endif 5322 HBLKTOHME(sfhmep, hmeblkp, addr); 5323 5324 /* 5325 * Flush the current demap region if addresses have been 5326 * skipped or the page size doesn't match. 5327 */ 5328 use_demap_range = (TTEBYTES(ttesz) == MMU_PAGESIZE); 5329 if (use_demap_range) { 5330 DEMAP_RANGE_CONTINUE(dmrp, addr, endaddr); 5331 } else { 5332 DEMAP_RANGE_FLUSH(dmrp); 5333 } 5334 5335 while (addr < endaddr) { 5336 sfmmu_copytte(&sfhmep->hme_tte, &tte); 5337 if (TTE_IS_VALID(&tte)) { 5338 if (TTE_GET_LOFLAGS(&tte, tteflags) == pprot) { 5339 /* 5340 * if the new protection is the same as old 5341 * continue 5342 */ 5343 goto next_addr; 5344 } 5345 pml = NULL; 5346 pp = sfhmep->hme_page; 5347 if (pp) { 5348 pml = sfmmu_mlist_enter(pp); 5349 } 5350 if (pp != sfhmep->hme_page) { 5351 /* 5352 * tte most have been unloaded 5353 * underneath us. Recheck 5354 */ 5355 ASSERT(pml); 5356 sfmmu_mlist_exit(pml); 5357 continue; 5358 } 5359 5360 ASSERT(pp == NULL || sfmmu_mlist_held(pp)); 5361 5362 ttemod = tte; 5363 TTE_SET_LOFLAGS(&ttemod, tteflags, pprot); 5364 ASSERT(TTE_IS_SOFTEXEC(&tte) == 5365 TTE_IS_SOFTEXEC(&ttemod)); 5366 ASSERT(TTE_IS_EXECUTABLE(&tte) == 5367 TTE_IS_EXECUTABLE(&ttemod)); 5368 5369 #if defined(SF_ERRATA_57) 5370 if (check_exec && addr < errata57_limit) 5371 ttemod.tte_exec_perm = 0; 5372 #endif 5373 ret = sfmmu_modifytte_try(&tte, &ttemod, 5374 &sfhmep->hme_tte); 5375 5376 if (ret < 0) { 5377 /* tte changed underneath us */ 5378 if (pml) { 5379 sfmmu_mlist_exit(pml); 5380 } 5381 continue; 5382 } 5383 5384 if (tteflags & TTE_HWWR_INT) { 5385 /* 5386 * need to sync if we are clearing modify bit. 5387 */ 5388 sfmmu_ttesync(sfmmup, addr, &tte, pp); 5389 } 5390 5391 if (pp && PP_ISRO(pp)) { 5392 if (pprot & TTE_WRPRM_INT) { 5393 pmtx = sfmmu_page_enter(pp); 5394 PP_CLRRO(pp); 5395 sfmmu_page_exit(pmtx); 5396 } 5397 } 5398 5399 if (ret > 0 && use_demap_range) { 5400 DEMAP_RANGE_MARKPG(dmrp, addr); 5401 } else if (ret > 0) { 5402 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0); 5403 } 5404 5405 if (pml) { 5406 sfmmu_mlist_exit(pml); 5407 } 5408 } 5409 next_addr: 5410 addr += TTEBYTES(ttesz); 5411 sfhmep++; 5412 DEMAP_RANGE_NEXTPG(dmrp); 5413 } 5414 return (addr); 5415 } 5416 5417 /* 5418 * This routine is deprecated and should only be used by hat_chgprot. 5419 * The correct routine is sfmmu_vtop_attr. 
5420 * This routine converts virtual page protections to physical ones. It will 5421 * update the tteflags field with the tte mask corresponding to the protections 5422 * affected and it returns the new protections. It will also clear the modify 5423 * bit if we are taking away write permission. This is necessary since the 5424 * modify bit is the hardware permission bit and we need to clear it in order 5425 * to detect write faults. 5426 * It accepts the following special protections: 5427 * ~PROT_WRITE = remove write permissions. 5428 * ~PROT_USER = remove user permissions. 5429 */ 5430 static uint_t 5431 sfmmu_vtop_prot(uint_t vprot, uint_t *tteflagsp) 5432 { 5433 if (vprot == (uint_t)~PROT_WRITE) { 5434 *tteflagsp = TTE_WRPRM_INT | TTE_HWWR_INT; 5435 return (0); /* will cause wrprm to be cleared */ 5436 } 5437 if (vprot == (uint_t)~PROT_USER) { 5438 *tteflagsp = TTE_PRIV_INT; 5439 return (0); /* will cause privprm to be cleared */ 5440 } 5441 if ((vprot == 0) || (vprot == PROT_USER) || 5442 ((vprot & PROT_ALL) != vprot)) { 5443 panic("sfmmu_vtop_prot -- bad prot %x", vprot); 5444 } 5445 5446 switch (vprot) { 5447 case (PROT_READ): 5448 case (PROT_EXEC): 5449 case (PROT_EXEC | PROT_READ): 5450 *tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT | TTE_HWWR_INT; 5451 return (TTE_PRIV_INT); /* set prv and clr wrt */ 5452 case (PROT_WRITE): 5453 case (PROT_WRITE | PROT_READ): 5454 case (PROT_EXEC | PROT_WRITE): 5455 case (PROT_EXEC | PROT_WRITE | PROT_READ): 5456 *tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT; 5457 return (TTE_PRIV_INT | TTE_WRPRM_INT); /* set prv and wrt */ 5458 case (PROT_USER | PROT_READ): 5459 case (PROT_USER | PROT_EXEC): 5460 case (PROT_USER | PROT_EXEC | PROT_READ): 5461 *tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT | TTE_HWWR_INT; 5462 return (0); /* clr prv and wrt */ 5463 case (PROT_USER | PROT_WRITE): 5464 case (PROT_USER | PROT_WRITE | PROT_READ): 5465 case (PROT_USER | PROT_EXEC | PROT_WRITE): 5466 case (PROT_USER | PROT_EXEC | PROT_WRITE | PROT_READ): 5467 *tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT; 5468 return (TTE_WRPRM_INT); /* clr prv and set wrt */ 5469 default: 5470 panic("sfmmu_vtop_prot -- bad prot %x", vprot); 5471 } 5472 return (0); 5473 } 5474 5475 /* 5476 * Alternate unload for very large virtual ranges. With a true 64 bit VA, 5477 * the normal algorithm would take too long for a very large VA range with 5478 * few real mappings. This routine just walks thru all HMEs in the global 5479 * hash table to find and remove mappings. 5480 */ 5481 static void 5482 hat_unload_large_virtual( 5483 struct hat *sfmmup, 5484 caddr_t startaddr, 5485 size_t len, 5486 uint_t flags, 5487 hat_callback_t *callback) 5488 { 5489 struct hmehash_bucket *hmebp; 5490 struct hme_blk *hmeblkp; 5491 struct hme_blk *pr_hblk = NULL; 5492 struct hme_blk *nx_hblk; 5493 struct hme_blk *list = NULL; 5494 int i; 5495 demap_range_t dmr, *dmrp; 5496 cpuset_t cpuset; 5497 caddr_t endaddr = startaddr + len; 5498 caddr_t sa; 5499 caddr_t ea; 5500 caddr_t cb_sa[MAX_CB_ADDR]; 5501 caddr_t cb_ea[MAX_CB_ADDR]; 5502 int addr_cnt = 0; 5503 int a = 0; 5504 5505 if (sfmmup->sfmmu_free) { 5506 dmrp = NULL; 5507 } else { 5508 dmrp = &dmr; 5509 DEMAP_RANGE_INIT(sfmmup, dmrp); 5510 } 5511 5512 /* 5513 * Loop through all the hash buckets of HME blocks looking for matches. 
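 * (Scanning every bucket is worthwhile here because
 * hat_unload_callback() only takes this path when the range is so
 * large that probing it address by address, i.e. more than
 * UHMEHASH_SZ probes at 4M steps, would be slower.)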
5514 */ 5515 for (i = 0; i <= UHMEHASH_SZ; i++) { 5516 hmebp = &uhme_hash[i]; 5517 SFMMU_HASH_LOCK(hmebp); 5518 hmeblkp = hmebp->hmeblkp; 5519 pr_hblk = NULL; 5520 while (hmeblkp) { 5521 nx_hblk = hmeblkp->hblk_next; 5522 5523 /* 5524 * skip if not this context, if a shadow block or 5525 * if the mapping is not in the requested range 5526 */ 5527 if (hmeblkp->hblk_tag.htag_id != sfmmup || 5528 hmeblkp->hblk_shw_bit || 5529 (sa = (caddr_t)get_hblk_base(hmeblkp)) >= endaddr || 5530 (ea = get_hblk_endaddr(hmeblkp)) <= startaddr) { 5531 pr_hblk = hmeblkp; 5532 goto next_block; 5533 } 5534 5535 ASSERT(!hmeblkp->hblk_shared); 5536 /* 5537 * unload if there are any current valid mappings 5538 */ 5539 if (hmeblkp->hblk_vcnt != 0 || 5540 hmeblkp->hblk_hmecnt != 0) 5541 (void) sfmmu_hblk_unload(sfmmup, hmeblkp, 5542 sa, ea, dmrp, flags); 5543 5544 /* 5545 * on unmap we also release the HME block itself, once 5546 * all mappings are gone. 5547 */ 5548 if ((flags & HAT_UNLOAD_UNMAP) != 0 && 5549 !hmeblkp->hblk_vcnt && 5550 !hmeblkp->hblk_hmecnt) { 5551 ASSERT(!hmeblkp->hblk_lckcnt); 5552 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, 5553 &list, 0); 5554 } else { 5555 pr_hblk = hmeblkp; 5556 } 5557 5558 if (callback == NULL) 5559 goto next_block; 5560 5561 /* 5562 * HME blocks may span more than one page, but we may be 5563 * unmapping only one page, so check for a smaller range 5564 * for the callback 5565 */ 5566 if (sa < startaddr) 5567 sa = startaddr; 5568 if (--ea > endaddr) 5569 ea = endaddr - 1; 5570 5571 cb_sa[addr_cnt] = sa; 5572 cb_ea[addr_cnt] = ea; 5573 if (++addr_cnt == MAX_CB_ADDR) { 5574 if (dmrp != NULL) { 5575 DEMAP_RANGE_FLUSH(dmrp); 5576 cpuset = sfmmup->sfmmu_cpusran; 5577 xt_sync(cpuset); 5578 } 5579 5580 for (a = 0; a < MAX_CB_ADDR; ++a) { 5581 callback->hcb_start_addr = cb_sa[a]; 5582 callback->hcb_end_addr = cb_ea[a]; 5583 callback->hcb_function(callback); 5584 } 5585 addr_cnt = 0; 5586 } 5587 5588 next_block: 5589 hmeblkp = nx_hblk; 5590 } 5591 SFMMU_HASH_UNLOCK(hmebp); 5592 } 5593 5594 sfmmu_hblks_list_purge(&list, 0); 5595 if (dmrp != NULL) { 5596 DEMAP_RANGE_FLUSH(dmrp); 5597 cpuset = sfmmup->sfmmu_cpusran; 5598 xt_sync(cpuset); 5599 } 5600 5601 for (a = 0; a < addr_cnt; ++a) { 5602 callback->hcb_start_addr = cb_sa[a]; 5603 callback->hcb_end_addr = cb_ea[a]; 5604 callback->hcb_function(callback); 5605 } 5606 5607 /* 5608 * Check TSB and TLB page sizes if the process isn't exiting. 5609 */ 5610 if (!sfmmup->sfmmu_free) 5611 sfmmu_check_page_sizes(sfmmup, 0); 5612 } 5613 5614 /* 5615 * Unload all the mappings in the range [addr..addr+len). addr and len must 5616 * be MMU_PAGESIZE aligned. 
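 *
 * A hedged sketch of a caller that wants per-range notification; the
 * function my_unload_notify() is made up for illustration, while the
 * hat_callback_t fields are the ones this routine fills in:
 *
 *	hat_callback_t cb;
 *
 *	cb.hcb_function = my_unload_notify;
 *	hat_unload_callback(hat, addr, len, HAT_UNLOAD_UNMAP, &cb);
 *
 * The callback is invoked with hcb_start_addr and hcb_end_addr set to
 * each contiguous chunk of the range that was actually unloaded.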
5617 */ 5618 5619 extern struct seg *segkmap; 5620 #define ISSEGKMAP(sfmmup, addr) (sfmmup == ksfmmup && \ 5621 segkmap->s_base <= (addr) && (addr) < (segkmap->s_base + segkmap->s_size)) 5622 5623 5624 void 5625 hat_unload_callback( 5626 struct hat *sfmmup, 5627 caddr_t addr, 5628 size_t len, 5629 uint_t flags, 5630 hat_callback_t *callback) 5631 { 5632 struct hmehash_bucket *hmebp; 5633 hmeblk_tag hblktag; 5634 int hmeshift, hashno, iskernel; 5635 struct hme_blk *hmeblkp, *pr_hblk, *list = NULL; 5636 caddr_t endaddr; 5637 cpuset_t cpuset; 5638 int addr_count = 0; 5639 int a; 5640 caddr_t cb_start_addr[MAX_CB_ADDR]; 5641 caddr_t cb_end_addr[MAX_CB_ADDR]; 5642 int issegkmap = ISSEGKMAP(sfmmup, addr); 5643 demap_range_t dmr, *dmrp; 5644 5645 if (sfmmup->sfmmu_xhat_provider) { 5646 XHAT_UNLOAD_CALLBACK(sfmmup, addr, len, flags, callback); 5647 return; 5648 } else { 5649 /* 5650 * This must be a CPU HAT. If the address space has 5651 * XHATs attached, unload the mappings for all of them, 5652 * just in case 5653 */ 5654 ASSERT(sfmmup->sfmmu_as != NULL); 5655 if (sfmmup->sfmmu_as->a_xhat != NULL) 5656 xhat_unload_callback_all(sfmmup->sfmmu_as, addr, 5657 len, flags, callback); 5658 } 5659 5660 ASSERT((sfmmup == ksfmmup) || (flags & HAT_UNLOAD_OTHER) || \ 5661 AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 5662 5663 ASSERT(sfmmup != NULL); 5664 ASSERT((len & MMU_PAGEOFFSET) == 0); 5665 ASSERT(!((uintptr_t)addr & MMU_PAGEOFFSET)); 5666 5667 /* 5668 * Probing through a large VA range (say 63 bits) will be slow, even 5669 * at 4 Meg steps between the probes. So, when the virtual address range 5670 * is very large, search the HME entries for what to unload. 5671 * 5672 * len >> TTE_PAGE_SHIFT(TTE4M) is the # of 4Meg probes we'd need 5673 * 5674 * UHMEHASH_SZ is number of hash buckets to examine 5675 * 5676 */ 5677 if (sfmmup != KHATID && (len >> TTE_PAGE_SHIFT(TTE4M)) > UHMEHASH_SZ) { 5678 hat_unload_large_virtual(sfmmup, addr, len, flags, callback); 5679 return; 5680 } 5681 5682 CPUSET_ZERO(cpuset); 5683 5684 /* 5685 * If the process is exiting, we can save a lot of fuss since 5686 * we'll flush the TLB when we free the ctx anyway. 5687 */ 5688 if (sfmmup->sfmmu_free) 5689 dmrp = NULL; 5690 else 5691 dmrp = &dmr; 5692 5693 DEMAP_RANGE_INIT(sfmmup, dmrp); 5694 endaddr = addr + len; 5695 hblktag.htag_id = sfmmup; 5696 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 5697 5698 /* 5699 * It is likely for the vm to call unload over a wide range of 5700 * addresses that are actually very sparsely populated by 5701 * translations. In order to speed this up the sfmmu hat supports 5702 * the concept of shadow hmeblks. Dummy large page hmeblks that 5703 * correspond to actual small translations are allocated at tteload 5704 * time and are referred to as shadow hmeblks. Now, during unload 5705 * time, we first check if we have a shadow hmeblk for that 5706 * translation. The absence of one means the corresponding address 5707 * range is empty and can be skipped. 5708 * 5709 * The kernel is an exception to above statement and that is why 5710 * we don't use shadow hmeblks and hash starting from the smallest 5711 * page size. 
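 *
 * Concretely, and only as a restatement of the code below: the kernel
 * starts probing at TTE64K and rehashes upward, while user hats start
 * at the largest supported size (TTE256M, or TTE4M when fewer page
 * sizes are configured) and drop down one level whenever a shadow
 * hmeblk is encountered.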
	 */
	if (sfmmup == KHATID) {
		iskernel = 1;
		hashno = TTE64K;
	} else {
		iskernel = 0;
		if (mmu_page_sizes == max_mmu_page_sizes) {
			hashno = TTE256M;
		} else {
			hashno = TTE4M;
		}
	}
	while (addr < endaddr) {
		hmeshift = HME_HASH_SHIFT(hashno);
		hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
		hblktag.htag_rehash = hashno;
		hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);

		SFMMU_HASH_LOCK(hmebp);

		HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, &list);
		if (hmeblkp == NULL) {
			/*
			 * didn't find an hmeblk. skip the appropriate
			 * address range.
			 */
			SFMMU_HASH_UNLOCK(hmebp);
			if (iskernel) {
				if (hashno < mmu_hashcnt) {
					hashno++;
					continue;
				} else {
					hashno = TTE64K;
					addr = (caddr_t)roundup((uintptr_t)addr
					    + 1, MMU_PAGESIZE64K);
					continue;
				}
			}
			addr = (caddr_t)roundup((uintptr_t)addr + 1,
			    (1 << hmeshift));
			if ((uintptr_t)addr & MMU_PAGEOFFSET512K) {
				ASSERT(hashno == TTE64K);
				continue;
			}
			if ((uintptr_t)addr & MMU_PAGEOFFSET4M) {
				hashno = TTE512K;
				continue;
			}
			if (mmu_page_sizes == max_mmu_page_sizes) {
				if ((uintptr_t)addr & MMU_PAGEOFFSET32M) {
					hashno = TTE4M;
					continue;
				}
				if ((uintptr_t)addr & MMU_PAGEOFFSET256M) {
					hashno = TTE32M;
					continue;
				}
				hashno = TTE256M;
				continue;
			} else {
				hashno = TTE4M;
				continue;
			}
		}
		ASSERT(hmeblkp);
		ASSERT(!hmeblkp->hblk_shared);
		if (!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) {
			/*
			 * If the valid count is zero we can skip the range
			 * mapped by this hmeblk.
			 * We free hblks in the case of HAT_UNMAP.  HAT_UNMAP
			 * is used by segment drivers as a hint
			 * that the mapping resource won't be used any longer.
			 * The best example of this is during exit().
			 */
			addr = (caddr_t)roundup((uintptr_t)addr + 1,
			    get_hblk_span(hmeblkp));
			if ((flags & HAT_UNLOAD_UNMAP) ||
			    (iskernel && !issegkmap)) {
				sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
				    &list, 0);
			}
			SFMMU_HASH_UNLOCK(hmebp);

			if (iskernel) {
				hashno = TTE64K;
				continue;
			}
			if ((uintptr_t)addr & MMU_PAGEOFFSET512K) {
				ASSERT(hashno == TTE64K);
				continue;
			}
			if ((uintptr_t)addr & MMU_PAGEOFFSET4M) {
				hashno = TTE512K;
				continue;
			}
			if (mmu_page_sizes == max_mmu_page_sizes) {
				if ((uintptr_t)addr & MMU_PAGEOFFSET32M) {
					hashno = TTE4M;
					continue;
				}
				if ((uintptr_t)addr & MMU_PAGEOFFSET256M) {
					hashno = TTE32M;
					continue;
				}
				hashno = TTE256M;
				continue;
			} else {
				hashno = TTE4M;
				continue;
			}
		}
		if (hmeblkp->hblk_shw_bit) {
			/*
			 * If we encounter a shadow hmeblk we know there are
			 * smaller sized hmeblks mapping the same address
			 * space.  Decrement the hash size and rehash.
			 */
			ASSERT(sfmmup != KHATID);
			hashno--;
			SFMMU_HASH_UNLOCK(hmebp);
			continue;
		}

		/*
		 * track callback address ranges.
5838 * only start a new range when it's not contiguous 5839 */ 5840 if (callback != NULL) { 5841 if (addr_count > 0 && 5842 addr == cb_end_addr[addr_count - 1]) 5843 --addr_count; 5844 else 5845 cb_start_addr[addr_count] = addr; 5846 } 5847 5848 addr = sfmmu_hblk_unload(sfmmup, hmeblkp, addr, endaddr, 5849 dmrp, flags); 5850 5851 if (callback != NULL) 5852 cb_end_addr[addr_count++] = addr; 5853 5854 if (((flags & HAT_UNLOAD_UNMAP) || (iskernel && !issegkmap)) && 5855 !hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) { 5856 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, &list, 0); 5857 } 5858 SFMMU_HASH_UNLOCK(hmebp); 5859 5860 /* 5861 * Notify our caller as to exactly which pages 5862 * have been unloaded. We do these in clumps, 5863 * to minimize the number of xt_sync()s that need to occur. 5864 */ 5865 if (callback != NULL && addr_count == MAX_CB_ADDR) { 5866 DEMAP_RANGE_FLUSH(dmrp); 5867 if (dmrp != NULL) { 5868 cpuset = sfmmup->sfmmu_cpusran; 5869 xt_sync(cpuset); 5870 } 5871 5872 for (a = 0; a < MAX_CB_ADDR; ++a) { 5873 callback->hcb_start_addr = cb_start_addr[a]; 5874 callback->hcb_end_addr = cb_end_addr[a]; 5875 callback->hcb_function(callback); 5876 } 5877 addr_count = 0; 5878 } 5879 if (iskernel) { 5880 hashno = TTE64K; 5881 continue; 5882 } 5883 if ((uintptr_t)addr & MMU_PAGEOFFSET512K) { 5884 ASSERT(hashno == TTE64K); 5885 continue; 5886 } 5887 if ((uintptr_t)addr & MMU_PAGEOFFSET4M) { 5888 hashno = TTE512K; 5889 continue; 5890 } 5891 if (mmu_page_sizes == max_mmu_page_sizes) { 5892 if ((uintptr_t)addr & MMU_PAGEOFFSET32M) { 5893 hashno = TTE4M; 5894 continue; 5895 } 5896 if ((uintptr_t)addr & MMU_PAGEOFFSET256M) { 5897 hashno = TTE32M; 5898 continue; 5899 } 5900 hashno = TTE256M; 5901 } else { 5902 hashno = TTE4M; 5903 } 5904 } 5905 5906 sfmmu_hblks_list_purge(&list, 0); 5907 DEMAP_RANGE_FLUSH(dmrp); 5908 if (dmrp != NULL) { 5909 cpuset = sfmmup->sfmmu_cpusran; 5910 xt_sync(cpuset); 5911 } 5912 if (callback && addr_count != 0) { 5913 for (a = 0; a < addr_count; ++a) { 5914 callback->hcb_start_addr = cb_start_addr[a]; 5915 callback->hcb_end_addr = cb_end_addr[a]; 5916 callback->hcb_function(callback); 5917 } 5918 } 5919 5920 /* 5921 * Check TSB and TLB page sizes if the process isn't exiting. 5922 */ 5923 if (!sfmmup->sfmmu_free) 5924 sfmmu_check_page_sizes(sfmmup, 0); 5925 } 5926 5927 /* 5928 * Unload all the mappings in the range [addr..addr+len). addr and len must 5929 * be MMU_PAGESIZE aligned. 5930 */ 5931 void 5932 hat_unload(struct hat *sfmmup, caddr_t addr, size_t len, uint_t flags) 5933 { 5934 if (sfmmup->sfmmu_xhat_provider) { 5935 XHAT_UNLOAD(sfmmup, addr, len, flags); 5936 return; 5937 } 5938 hat_unload_callback(sfmmup, addr, len, flags, NULL); 5939 } 5940 5941 5942 /* 5943 * Find the largest mapping size for this page. 5944 */ 5945 int 5946 fnd_mapping_sz(page_t *pp) 5947 { 5948 int sz; 5949 int p_index; 5950 5951 p_index = PP_MAPINDEX(pp); 5952 5953 sz = 0; 5954 p_index >>= 1; /* don't care about 8K bit */ 5955 for (; p_index; p_index >>= 1) { 5956 sz++; 5957 } 5958 5959 return (sz); 5960 } 5961 5962 /* 5963 * This function unloads a range of addresses for an hmeblk. 5964 * It returns the next address to be unloaded. 5965 * It should be called with the hash lock held. 
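 *
 * Illustrative calling sketch, not part of this file (the real caller is
 * hat_unload_callback() above):
 *
 *	SFMMU_HASH_LOCK(hmebp);
 *	...find the hmeblk covering addr...
 *	addr = sfmmu_hblk_unload(sfmmup, hmeblkp, addr, endaddr,
 *	    dmrp, flags);
 *	SFMMU_HASH_UNLOCK(hmebp);
 *
 * i.e. the bucket lock is held across the call and the return value is
 * used to continue the walk.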
5966 */ 5967 static caddr_t 5968 sfmmu_hblk_unload(struct hat *sfmmup, struct hme_blk *hmeblkp, caddr_t addr, 5969 caddr_t endaddr, demap_range_t *dmrp, uint_t flags) 5970 { 5971 tte_t tte, ttemod; 5972 struct sf_hment *sfhmep; 5973 int ttesz; 5974 long ttecnt; 5975 page_t *pp; 5976 kmutex_t *pml; 5977 int ret; 5978 int use_demap_range; 5979 5980 ASSERT(in_hblk_range(hmeblkp, addr)); 5981 ASSERT(!hmeblkp->hblk_shw_bit); 5982 ASSERT(sfmmup != NULL || hmeblkp->hblk_shared); 5983 ASSERT(sfmmup == NULL || !hmeblkp->hblk_shared); 5984 ASSERT(dmrp == NULL || !hmeblkp->hblk_shared); 5985 5986 #ifdef DEBUG 5987 if (get_hblk_ttesz(hmeblkp) != TTE8K && 5988 (endaddr < get_hblk_endaddr(hmeblkp))) { 5989 panic("sfmmu_hblk_unload: partial unload of large page"); 5990 } 5991 #endif /* DEBUG */ 5992 5993 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp)); 5994 ttesz = get_hblk_ttesz(hmeblkp); 5995 5996 use_demap_range = ((dmrp == NULL) || 5997 (TTEBYTES(ttesz) == DEMAP_RANGE_PGSZ(dmrp))); 5998 5999 if (use_demap_range) { 6000 DEMAP_RANGE_CONTINUE(dmrp, addr, endaddr); 6001 } else { 6002 DEMAP_RANGE_FLUSH(dmrp); 6003 } 6004 ttecnt = 0; 6005 HBLKTOHME(sfhmep, hmeblkp, addr); 6006 6007 while (addr < endaddr) { 6008 pml = NULL; 6009 sfmmu_copytte(&sfhmep->hme_tte, &tte); 6010 if (TTE_IS_VALID(&tte)) { 6011 pp = sfhmep->hme_page; 6012 if (pp != NULL) { 6013 pml = sfmmu_mlist_enter(pp); 6014 } 6015 6016 /* 6017 * Verify if hme still points to 'pp' now that 6018 * we have p_mapping lock. 6019 */ 6020 if (sfhmep->hme_page != pp) { 6021 if (pp != NULL && sfhmep->hme_page != NULL) { 6022 ASSERT(pml != NULL); 6023 sfmmu_mlist_exit(pml); 6024 /* Re-start this iteration. */ 6025 continue; 6026 } 6027 ASSERT((pp != NULL) && 6028 (sfhmep->hme_page == NULL)); 6029 goto tte_unloaded; 6030 } 6031 6032 /* 6033 * This point on we have both HASH and p_mapping 6034 * lock. 6035 */ 6036 ASSERT(pp == sfhmep->hme_page); 6037 ASSERT(pp == NULL || sfmmu_mlist_held(pp)); 6038 6039 /* 6040 * We need to loop on modify tte because it is 6041 * possible for pagesync to come along and 6042 * change the software bits beneath us. 6043 * 6044 * Page_unload can also invalidate the tte after 6045 * we read tte outside of p_mapping lock. 6046 */ 6047 again: 6048 ttemod = tte; 6049 6050 TTE_SET_INVALID(&ttemod); 6051 ret = sfmmu_modifytte_try(&tte, &ttemod, 6052 &sfhmep->hme_tte); 6053 6054 if (ret <= 0) { 6055 if (TTE_IS_VALID(&tte)) { 6056 ASSERT(ret < 0); 6057 goto again; 6058 } 6059 if (pp != NULL) { 6060 panic("sfmmu_hblk_unload: pp = 0x%p " 6061 "tte became invalid under mlist" 6062 " lock = 0x%p", (void *)pp, 6063 (void *)pml); 6064 } 6065 continue; 6066 } 6067 6068 if (!(flags & HAT_UNLOAD_NOSYNC) || 6069 (pp != NULL && TTE_EXECUTED(&tte))) { 6070 sfmmu_ttesync(sfmmup, addr, &tte, pp); 6071 } 6072 6073 /* 6074 * Ok- we invalidated the tte. Do the rest of the job. 6075 */ 6076 ttecnt++; 6077 6078 if (flags & HAT_UNLOAD_UNLOCK) { 6079 ASSERT(hmeblkp->hblk_lckcnt > 0); 6080 atomic_add_32(&hmeblkp->hblk_lckcnt, -1); 6081 HBLK_STACK_TRACE(hmeblkp, HBLK_UNLOCK); 6082 } 6083 6084 /* 6085 * Normally we would need to flush the page 6086 * from the virtual cache at this point in 6087 * order to prevent a potential cache alias 6088 * inconsistency. 6089 * The particular scenario we need to worry 6090 * about is: 6091 * Given: va1 and va2 are two virtual address 6092 * that alias and map the same physical 6093 * address. 6094 * 1. mapping exists from va1 to pa and data 6095 * has been read into the cache. 6096 * 2. unload va1. 6097 * 3. 
load va2 and modify data using va2. 6098 * 4 unload va2. 6099 * 5. load va1 and reference data. Unless we 6100 * flush the data cache when we unload we will 6101 * get stale data. 6102 * Fortunately, page coloring eliminates the 6103 * above scenario by remembering the color a 6104 * physical page was last or is currently 6105 * mapped to. Now, we delay the flush until 6106 * the loading of translations. Only when the 6107 * new translation is of a different color 6108 * are we forced to flush. 6109 */ 6110 if (use_demap_range) { 6111 /* 6112 * Mark this page as needing a demap. 6113 */ 6114 DEMAP_RANGE_MARKPG(dmrp, addr); 6115 } else { 6116 ASSERT(sfmmup != NULL); 6117 ASSERT(!hmeblkp->hblk_shared); 6118 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 6119 sfmmup->sfmmu_free, 0); 6120 } 6121 6122 if (pp) { 6123 /* 6124 * Remove the hment from the mapping list 6125 */ 6126 ASSERT(hmeblkp->hblk_hmecnt > 0); 6127 6128 /* 6129 * Again, we cannot 6130 * ASSERT(hmeblkp->hblk_hmecnt <= NHMENTS); 6131 */ 6132 HME_SUB(sfhmep, pp); 6133 membar_stst(); 6134 atomic_add_16(&hmeblkp->hblk_hmecnt, -1); 6135 } 6136 6137 ASSERT(hmeblkp->hblk_vcnt > 0); 6138 atomic_add_16(&hmeblkp->hblk_vcnt, -1); 6139 6140 ASSERT(hmeblkp->hblk_hmecnt || hmeblkp->hblk_vcnt || 6141 !hmeblkp->hblk_lckcnt); 6142 6143 #ifdef VAC 6144 if (pp && (pp->p_nrm & (P_KPMC | P_KPMS | P_TNC))) { 6145 if (PP_ISTNC(pp)) { 6146 /* 6147 * If page was temporary 6148 * uncached, try to recache 6149 * it. Note that HME_SUB() was 6150 * called above so p_index and 6151 * mlist had been updated. 6152 */ 6153 conv_tnc(pp, ttesz); 6154 } else if (pp->p_mapping == NULL) { 6155 ASSERT(kpm_enable); 6156 /* 6157 * Page is marked to be in VAC conflict 6158 * to an existing kpm mapping and/or is 6159 * kpm mapped using only the regular 6160 * pagesize. 6161 */ 6162 sfmmu_kpm_hme_unload(pp); 6163 } 6164 } 6165 #endif /* VAC */ 6166 } else if ((pp = sfhmep->hme_page) != NULL) { 6167 /* 6168 * TTE is invalid but the hme 6169 * still exists. let pageunload 6170 * complete its job. 6171 */ 6172 ASSERT(pml == NULL); 6173 pml = sfmmu_mlist_enter(pp); 6174 if (sfhmep->hme_page != NULL) { 6175 sfmmu_mlist_exit(pml); 6176 continue; 6177 } 6178 ASSERT(sfhmep->hme_page == NULL); 6179 } else if (hmeblkp->hblk_hmecnt != 0) { 6180 /* 6181 * pageunload may have not finished decrementing 6182 * hblk_vcnt and hblk_hmecnt. Find page_t if any and 6183 * wait for pageunload to finish. Rely on pageunload 6184 * to decrement hblk_hmecnt after hblk_vcnt. 6185 */ 6186 pfn_t pfn = TTE_TO_TTEPFN(&tte); 6187 ASSERT(pml == NULL); 6188 if (pf_is_memory(pfn)) { 6189 pp = page_numtopp_nolock(pfn); 6190 if (pp != NULL) { 6191 pml = sfmmu_mlist_enter(pp); 6192 sfmmu_mlist_exit(pml); 6193 pml = NULL; 6194 } 6195 } 6196 } 6197 6198 tte_unloaded: 6199 /* 6200 * At this point, the tte we are looking at 6201 * should be unloaded, and hme has been unlinked 6202 * from page too. This is important because in 6203 * pageunload, it does ttesync() then HME_SUB. 6204 * We need to make sure HME_SUB has been completed 6205 * so we know ttesync() has been completed. Otherwise, 6206 * at exit time, after return from hat layer, VM will 6207 * release as structure which hat_setstat() (called 6208 * by ttesync()) needs. 
		 */
#ifdef DEBUG
		{
			tte_t	dtte;

			ASSERT(sfhmep->hme_page == NULL);

			sfmmu_copytte(&sfhmep->hme_tte, &dtte);
			ASSERT(!TTE_IS_VALID(&dtte));
		}
#endif

		if (pml) {
			sfmmu_mlist_exit(pml);
		}

		addr += TTEBYTES(ttesz);
		sfhmep++;
		DEMAP_RANGE_NEXTPG(dmrp);
	}
	/*
	 * For shared hmeblks this routine is only called when the region is
	 * freed and no longer referenced.  So no need to decrement ttecnt
	 * in the region structure here.
	 */
	if (ttecnt > 0 && sfmmup != NULL) {
		atomic_add_long(&sfmmup->sfmmu_ttecnt[ttesz], -ttecnt);
	}
	return (addr);
}

/*
 * Synchronize all the mappings in the range [addr..addr+len).
 * Can be called with clearflag having two states:
 * HAT_SYNC_DONTZERO means just return the rm stats
 * HAT_SYNC_ZERORM means zero rm bits in the tte and return the stats
 */
void
hat_sync(struct hat *sfmmup, caddr_t addr, size_t len, uint_t clearflag)
{
	struct hmehash_bucket *hmebp;
	hmeblk_tag hblktag;
	int hmeshift, hashno = 1;
	struct hme_blk *hmeblkp, *list = NULL;
	caddr_t endaddr;
	cpuset_t cpuset;

	ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
	ASSERT((sfmmup == ksfmmup) ||
	    AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock));
	ASSERT((len & MMU_PAGEOFFSET) == 0);
	ASSERT((clearflag == HAT_SYNC_DONTZERO) ||
	    (clearflag == HAT_SYNC_ZERORM));

	CPUSET_ZERO(cpuset);

	endaddr = addr + len;
	hblktag.htag_id = sfmmup;
	hblktag.htag_rid = SFMMU_INVALID_SHMERID;

	/*
	 * Spitfire supports 4 page sizes.
	 * Most pages are expected to be of the smallest page size (8K) and
	 * these will not need to be rehashed.  64K pages also don't need to
	 * be rehashed because an hmeblk spans 64K of address space.  512K
	 * pages might need 1 rehash and 4M pages 2 rehashes.
	 */
	while (addr < endaddr) {
		hmeshift = HME_HASH_SHIFT(hashno);
		hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
		hblktag.htag_rehash = hashno;
		hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);

		SFMMU_HASH_LOCK(hmebp);

		HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list);
		if (hmeblkp != NULL) {
			ASSERT(!hmeblkp->hblk_shared);
			/*
			 * We've encountered a shadow hmeblk so skip the range
			 * of the next smaller mapping size.
			 */
			if (hmeblkp->hblk_shw_bit) {
				ASSERT(sfmmup != ksfmmup);
				ASSERT(hashno > 1);
				addr = (caddr_t)P2END((uintptr_t)addr,
				    TTEBYTES(hashno - 1));
			} else {
				addr = sfmmu_hblk_sync(sfmmup, hmeblkp,
				    addr, endaddr, clearflag);
			}
			SFMMU_HASH_UNLOCK(hmebp);
			hashno = 1;
			continue;
		}
		SFMMU_HASH_UNLOCK(hmebp);

		if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) {
			/*
			 * We have traversed the whole list and rehashed
			 * if necessary without finding the address to sync.
			 * This is ok so we increment the address by the
			 * smallest hmeblk range for kernel mappings and the
			 * largest hmeblk range, to account for shadow hmeblks,
			 * for user mappings and continue.
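			 *
			 * Worked example, for illustration only: with
			 * hashno == TTE512K and addr sitting 0x12000 bytes
			 * into a 512K-aligned chunk,
			 * P2END(addr, TTEBYTES(hashno)) moves addr to the
			 * end of that chunk, i.e. the next 512K (0x80000)
			 * boundary above addr.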
6315 */ 6316 if (sfmmup == ksfmmup) 6317 addr = (caddr_t)P2END((uintptr_t)addr, 6318 TTEBYTES(1)); 6319 else 6320 addr = (caddr_t)P2END((uintptr_t)addr, 6321 TTEBYTES(hashno)); 6322 hashno = 1; 6323 } else { 6324 hashno++; 6325 } 6326 } 6327 sfmmu_hblks_list_purge(&list, 0); 6328 cpuset = sfmmup->sfmmu_cpusran; 6329 xt_sync(cpuset); 6330 } 6331 6332 static caddr_t 6333 sfmmu_hblk_sync(struct hat *sfmmup, struct hme_blk *hmeblkp, caddr_t addr, 6334 caddr_t endaddr, int clearflag) 6335 { 6336 tte_t tte, ttemod; 6337 struct sf_hment *sfhmep; 6338 int ttesz; 6339 struct page *pp; 6340 kmutex_t *pml; 6341 int ret; 6342 6343 ASSERT(hmeblkp->hblk_shw_bit == 0); 6344 ASSERT(!hmeblkp->hblk_shared); 6345 6346 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp)); 6347 6348 ttesz = get_hblk_ttesz(hmeblkp); 6349 HBLKTOHME(sfhmep, hmeblkp, addr); 6350 6351 while (addr < endaddr) { 6352 sfmmu_copytte(&sfhmep->hme_tte, &tte); 6353 if (TTE_IS_VALID(&tte)) { 6354 pml = NULL; 6355 pp = sfhmep->hme_page; 6356 if (pp) { 6357 pml = sfmmu_mlist_enter(pp); 6358 } 6359 if (pp != sfhmep->hme_page) { 6360 /* 6361 * tte most have been unloaded 6362 * underneath us. Recheck 6363 */ 6364 ASSERT(pml); 6365 sfmmu_mlist_exit(pml); 6366 continue; 6367 } 6368 6369 ASSERT(pp == NULL || sfmmu_mlist_held(pp)); 6370 6371 if (clearflag == HAT_SYNC_ZERORM) { 6372 ttemod = tte; 6373 TTE_CLR_RM(&ttemod); 6374 ret = sfmmu_modifytte_try(&tte, &ttemod, 6375 &sfhmep->hme_tte); 6376 if (ret < 0) { 6377 if (pml) { 6378 sfmmu_mlist_exit(pml); 6379 } 6380 continue; 6381 } 6382 6383 if (ret > 0) { 6384 sfmmu_tlb_demap(addr, sfmmup, 6385 hmeblkp, 0, 0); 6386 } 6387 } 6388 sfmmu_ttesync(sfmmup, addr, &tte, pp); 6389 if (pml) { 6390 sfmmu_mlist_exit(pml); 6391 } 6392 } 6393 addr += TTEBYTES(ttesz); 6394 sfhmep++; 6395 } 6396 return (addr); 6397 } 6398 6399 /* 6400 * This function will sync a tte to the page struct and it will 6401 * update the hat stats. Currently it allows us to pass a NULL pp 6402 * and we will simply update the stats. We may want to change this 6403 * so we only keep stats for pages backed by pp's. 6404 */ 6405 static void 6406 sfmmu_ttesync(struct hat *sfmmup, caddr_t addr, tte_t *ttep, page_t *pp) 6407 { 6408 uint_t rm = 0; 6409 int sz = TTE_CSZ(ttep); 6410 pgcnt_t npgs; 6411 6412 ASSERT(TTE_IS_VALID(ttep)); 6413 6414 if (!TTE_IS_NOSYNC(ttep)) { 6415 6416 if (TTE_IS_REF(ttep)) 6417 rm |= P_REF; 6418 6419 if (TTE_IS_MOD(ttep)) 6420 rm |= P_MOD; 6421 6422 if (rm != 0) { 6423 if (sfmmup != NULL && sfmmup->sfmmu_rmstat) { 6424 int i; 6425 caddr_t vaddr = addr; 6426 6427 for (i = 0; i < TTEPAGES(sz); i++) { 6428 hat_setstat(sfmmup->sfmmu_as, vaddr, 6429 MMU_PAGESIZE, rm); 6430 vaddr += MMU_PAGESIZE; 6431 } 6432 } 6433 } 6434 } 6435 6436 if (!pp) 6437 return; 6438 6439 /* 6440 * If software says this page is executable, and the page was 6441 * in fact executed (indicated by hardware exec permission 6442 * being enabled), then set P_EXEC on the page to remember 6443 * that it was executed. The I$ will be flushed when the page 6444 * is reassigned. 6445 */ 6446 if (TTE_EXECUTED(ttep)) { 6447 rm |= P_EXEC; 6448 } else if (rm == 0) { 6449 return; 6450 } 6451 6452 /* 6453 * XXX I want to use cas to update nrm bits but they 6454 * currently belong in common/vm and not in hat where 6455 * they should be. 6456 * The nrm bits are protected by the same mutex as 6457 * the one that protects the page's mapping list. 
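	 *
	 * Worked example, for illustration only: for a 4M mapping,
	 * sz == TTE4M and TTEPAGES(sz) == 4M / 8K == 512, so the loop
	 * below (still under the mapping list lock) applies the same
	 * ref/mod/exec bits to all 512 constituent page_t's, starting
	 * from the group leader and stepping with PP_PAGENEXT().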
6458 */ 6459 ASSERT(sfmmu_mlist_held(pp)); 6460 /* 6461 * If the tte is for a large page, we need to sync all the 6462 * pages covered by the tte. 6463 */ 6464 if (sz != TTE8K) { 6465 ASSERT(pp->p_szc != 0); 6466 pp = PP_GROUPLEADER(pp, sz); 6467 ASSERT(sfmmu_mlist_held(pp)); 6468 } 6469 6470 /* Get number of pages from tte size. */ 6471 npgs = TTEPAGES(sz); 6472 6473 do { 6474 ASSERT(pp); 6475 ASSERT(sfmmu_mlist_held(pp)); 6476 if (((rm & P_REF) != 0 && !PP_ISREF(pp)) || 6477 ((rm & P_MOD) != 0 && !PP_ISMOD(pp)) || 6478 ((rm & P_EXEC) != 0 && !PP_ISEXEC(pp))) 6479 hat_page_setattr(pp, rm); 6480 6481 /* 6482 * Are we done? If not, we must have a large mapping. 6483 * For large mappings we need to sync the rest of the pages 6484 * covered by this tte; goto the next page. 6485 */ 6486 } while (--npgs > 0 && (pp = PP_PAGENEXT(pp))); 6487 } 6488 6489 /* 6490 * Execute pre-callback handler of each pa_hment linked to pp 6491 * 6492 * Inputs: 6493 * flag: either HAT_PRESUSPEND or HAT_SUSPEND. 6494 * capture_cpus: pointer to return value (below) 6495 * 6496 * Returns: 6497 * Propagates the subsystem callback return values back to the caller; 6498 * returns 0 on success. If capture_cpus is non-NULL, the value returned 6499 * is zero if all of the pa_hments are of a type that do not require 6500 * capturing CPUs prior to suspending the mapping, else it is 1. 6501 */ 6502 static int 6503 hat_pageprocess_precallbacks(struct page *pp, uint_t flag, int *capture_cpus) 6504 { 6505 struct sf_hment *sfhmep; 6506 struct pa_hment *pahmep; 6507 int (*f)(caddr_t, uint_t, uint_t, void *); 6508 int ret; 6509 id_t id; 6510 int locked = 0; 6511 kmutex_t *pml; 6512 6513 ASSERT(PAGE_EXCL(pp)); 6514 if (!sfmmu_mlist_held(pp)) { 6515 pml = sfmmu_mlist_enter(pp); 6516 locked = 1; 6517 } 6518 6519 if (capture_cpus) 6520 *capture_cpus = 0; 6521 6522 top: 6523 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) { 6524 /* 6525 * skip sf_hments corresponding to VA<->PA mappings; 6526 * for pa_hment's, hme_tte.ll is zero 6527 */ 6528 if (!IS_PAHME(sfhmep)) 6529 continue; 6530 6531 pahmep = sfhmep->hme_data; 6532 ASSERT(pahmep != NULL); 6533 6534 /* 6535 * skip if pre-handler has been called earlier in this loop 6536 */ 6537 if (pahmep->flags & flag) 6538 continue; 6539 6540 id = pahmep->cb_id; 6541 ASSERT(id >= (id_t)0 && id < sfmmu_cb_nextid); 6542 if (capture_cpus && sfmmu_cb_table[id].capture_cpus != 0) 6543 *capture_cpus = 1; 6544 if ((f = sfmmu_cb_table[id].prehandler) == NULL) { 6545 pahmep->flags |= flag; 6546 continue; 6547 } 6548 6549 /* 6550 * Drop the mapping list lock to avoid locking order issues. 6551 */ 6552 if (locked) 6553 sfmmu_mlist_exit(pml); 6554 6555 ret = f(pahmep->addr, pahmep->len, flag, pahmep->pvt); 6556 if (ret != 0) 6557 return (ret); /* caller must do the cleanup */ 6558 6559 if (locked) { 6560 pml = sfmmu_mlist_enter(pp); 6561 pahmep->flags |= flag; 6562 goto top; 6563 } 6564 6565 pahmep->flags |= flag; 6566 } 6567 6568 if (locked) 6569 sfmmu_mlist_exit(pml); 6570 6571 return (0); 6572 } 6573 6574 /* 6575 * Execute post-callback handler of each pa_hment linked to pp 6576 * 6577 * Same overall assumptions and restrictions apply as for 6578 * hat_pageprocess_precallbacks(). 
6579 */ 6580 static void 6581 hat_pageprocess_postcallbacks(struct page *pp, uint_t flag) 6582 { 6583 pfn_t pgpfn = pp->p_pagenum; 6584 pfn_t pgmask = btop(page_get_pagesize(pp->p_szc)) - 1; 6585 pfn_t newpfn; 6586 struct sf_hment *sfhmep; 6587 struct pa_hment *pahmep; 6588 int (*f)(caddr_t, uint_t, uint_t, void *, pfn_t); 6589 id_t id; 6590 int locked = 0; 6591 kmutex_t *pml; 6592 6593 ASSERT(PAGE_EXCL(pp)); 6594 if (!sfmmu_mlist_held(pp)) { 6595 pml = sfmmu_mlist_enter(pp); 6596 locked = 1; 6597 } 6598 6599 top: 6600 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) { 6601 /* 6602 * skip sf_hments corresponding to VA<->PA mappings; 6603 * for pa_hment's, hme_tte.ll is zero 6604 */ 6605 if (!IS_PAHME(sfhmep)) 6606 continue; 6607 6608 pahmep = sfhmep->hme_data; 6609 ASSERT(pahmep != NULL); 6610 6611 if ((pahmep->flags & flag) == 0) 6612 continue; 6613 6614 pahmep->flags &= ~flag; 6615 6616 id = pahmep->cb_id; 6617 ASSERT(id >= (id_t)0 && id < sfmmu_cb_nextid); 6618 if ((f = sfmmu_cb_table[id].posthandler) == NULL) 6619 continue; 6620 6621 /* 6622 * Convert the base page PFN into the constituent PFN 6623 * which is needed by the callback handler. 6624 */ 6625 newpfn = pgpfn | (btop((uintptr_t)pahmep->addr) & pgmask); 6626 6627 /* 6628 * Drop the mapping list lock to avoid locking order issues. 6629 */ 6630 if (locked) 6631 sfmmu_mlist_exit(pml); 6632 6633 if (f(pahmep->addr, pahmep->len, flag, pahmep->pvt, newpfn) 6634 != 0) 6635 panic("sfmmu: posthandler failed"); 6636 6637 if (locked) { 6638 pml = sfmmu_mlist_enter(pp); 6639 goto top; 6640 } 6641 } 6642 6643 if (locked) 6644 sfmmu_mlist_exit(pml); 6645 } 6646 6647 /* 6648 * Suspend locked kernel mapping 6649 */ 6650 void 6651 hat_pagesuspend(struct page *pp) 6652 { 6653 struct sf_hment *sfhmep; 6654 sfmmu_t *sfmmup; 6655 tte_t tte, ttemod; 6656 struct hme_blk *hmeblkp; 6657 caddr_t addr; 6658 int index, cons; 6659 cpuset_t cpuset; 6660 6661 ASSERT(PAGE_EXCL(pp)); 6662 ASSERT(sfmmu_mlist_held(pp)); 6663 6664 mutex_enter(&kpr_suspendlock); 6665 6666 /* 6667 * We're about to suspend a kernel mapping so mark this thread as 6668 * non-traceable by DTrace. This prevents us from running into issues 6669 * with probe context trying to touch a suspended page 6670 * in the relocation codepath itself. 6671 */ 6672 curthread->t_flag |= T_DONTDTRACE; 6673 6674 index = PP_MAPINDEX(pp); 6675 cons = TTE8K; 6676 6677 retry: 6678 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) { 6679 6680 if (IS_PAHME(sfhmep)) 6681 continue; 6682 6683 if (get_hblk_ttesz(sfmmu_hmetohblk(sfhmep)) != cons) 6684 continue; 6685 6686 /* 6687 * Loop until we successfully set the suspend bit in 6688 * the TTE. 6689 */ 6690 again: 6691 sfmmu_copytte(&sfhmep->hme_tte, &tte); 6692 ASSERT(TTE_IS_VALID(&tte)); 6693 6694 ttemod = tte; 6695 TTE_SET_SUSPEND(&ttemod); 6696 if (sfmmu_modifytte_try(&tte, &ttemod, 6697 &sfhmep->hme_tte) < 0) 6698 goto again; 6699 6700 /* 6701 * Invalidate TSB entry 6702 */ 6703 hmeblkp = sfmmu_hmetohblk(sfhmep); 6704 6705 sfmmup = hblktosfmmu(hmeblkp); 6706 ASSERT(sfmmup == ksfmmup); 6707 ASSERT(!hmeblkp->hblk_shared); 6708 6709 addr = tte_to_vaddr(hmeblkp, tte); 6710 6711 /* 6712 * No need to make sure that the TSB for this sfmmu is 6713 * not being relocated since it is ksfmmup and thus it 6714 * will never be relocated. 
6715 */ 6716 SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, 0); 6717 6718 /* 6719 * Update xcall stats 6720 */ 6721 cpuset = cpu_ready_set; 6722 CPUSET_DEL(cpuset, CPU->cpu_id); 6723 6724 /* LINTED: constant in conditional context */ 6725 SFMMU_XCALL_STATS(ksfmmup); 6726 6727 /* 6728 * Flush TLB entry on remote CPU's 6729 */ 6730 xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)addr, 6731 (uint64_t)ksfmmup); 6732 xt_sync(cpuset); 6733 6734 /* 6735 * Flush TLB entry on local CPU 6736 */ 6737 vtag_flushpage(addr, (uint64_t)ksfmmup); 6738 } 6739 6740 while (index != 0) { 6741 index = index >> 1; 6742 if (index != 0) 6743 cons++; 6744 if (index & 0x1) { 6745 pp = PP_GROUPLEADER(pp, cons); 6746 goto retry; 6747 } 6748 } 6749 } 6750 6751 #ifdef DEBUG 6752 6753 #define N_PRLE 1024 6754 struct prle { 6755 page_t *targ; 6756 page_t *repl; 6757 int status; 6758 int pausecpus; 6759 hrtime_t whence; 6760 }; 6761 6762 static struct prle page_relocate_log[N_PRLE]; 6763 static int prl_entry; 6764 static kmutex_t prl_mutex; 6765 6766 #define PAGE_RELOCATE_LOG(t, r, s, p) \ 6767 mutex_enter(&prl_mutex); \ 6768 page_relocate_log[prl_entry].targ = *(t); \ 6769 page_relocate_log[prl_entry].repl = *(r); \ 6770 page_relocate_log[prl_entry].status = (s); \ 6771 page_relocate_log[prl_entry].pausecpus = (p); \ 6772 page_relocate_log[prl_entry].whence = gethrtime(); \ 6773 prl_entry = (prl_entry == (N_PRLE - 1))? 0 : prl_entry + 1; \ 6774 mutex_exit(&prl_mutex); 6775 6776 #else /* !DEBUG */ 6777 #define PAGE_RELOCATE_LOG(t, r, s, p) 6778 #endif 6779 6780 /* 6781 * Core Kernel Page Relocation Algorithm 6782 * 6783 * Input: 6784 * 6785 * target : constituent pages are SE_EXCL locked. 6786 * replacement: constituent pages are SE_EXCL locked. 6787 * 6788 * Output: 6789 * 6790 * nrelocp: number of pages relocated 6791 */ 6792 int 6793 hat_page_relocate(page_t **target, page_t **replacement, spgcnt_t *nrelocp) 6794 { 6795 page_t *targ, *repl; 6796 page_t *tpp, *rpp; 6797 kmutex_t *low, *high; 6798 spgcnt_t npages, i; 6799 page_t *pl = NULL; 6800 uint_t ppattr; 6801 int old_pil; 6802 cpuset_t cpuset; 6803 int cap_cpus; 6804 int ret; 6805 #ifdef VAC 6806 int cflags = 0; 6807 #endif 6808 6809 if (hat_kpr_enabled == 0 || !kcage_on || PP_ISNORELOC(*target)) { 6810 PAGE_RELOCATE_LOG(target, replacement, EAGAIN, -1); 6811 return (EAGAIN); 6812 } 6813 6814 mutex_enter(&kpr_mutex); 6815 kreloc_thread = curthread; 6816 6817 targ = *target; 6818 repl = *replacement; 6819 ASSERT(repl != NULL); 6820 ASSERT(targ->p_szc == repl->p_szc); 6821 6822 npages = page_get_pagecnt(targ->p_szc); 6823 6824 /* 6825 * unload VA<->PA mappings that are not locked 6826 */ 6827 tpp = targ; 6828 for (i = 0; i < npages; i++) { 6829 (void) hat_pageunload(tpp, SFMMU_KERNEL_RELOC); 6830 tpp++; 6831 } 6832 6833 /* 6834 * Do "presuspend" callbacks, in a context from which we can still 6835 * block as needed. Note that we don't hold the mapping list lock 6836 * of "targ" at this point due to potential locking order issues; 6837 * we assume that between the hat_pageunload() above and holding 6838 * the SE_EXCL lock that the mapping list *cannot* change at this 6839 * point. 6840 */ 6841 ret = hat_pageprocess_precallbacks(targ, HAT_PRESUSPEND, &cap_cpus); 6842 if (ret != 0) { 6843 /* 6844 * EIO translates to fatal error, for all others cleanup 6845 * and return EAGAIN. 
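	 *
	 * Illustrative sketch, not part of this file: a caller that can
	 * tolerate transient failure treats the EAGAIN return as
	 * retryable ("targ", "repl" and "nreloc" are hypothetical
	 * locals):
	 *
	 *	if (hat_page_relocate(&targ, &repl, &nreloc) != 0)
	 *		the relocation did not happen; the caller backs
	 *		off, frees the replacement page and may retry;
	 *
	 * EIO from a presuspend handler is the one fatal case, as noted
	 * above.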
6846 */ 6847 ASSERT(ret != EIO); 6848 hat_pageprocess_postcallbacks(targ, HAT_POSTUNSUSPEND); 6849 PAGE_RELOCATE_LOG(target, replacement, ret, -1); 6850 kreloc_thread = NULL; 6851 mutex_exit(&kpr_mutex); 6852 return (EAGAIN); 6853 } 6854 6855 /* 6856 * acquire p_mapping list lock for both the target and replacement 6857 * root pages. 6858 * 6859 * low and high refer to the need to grab the mlist locks in a 6860 * specific order in order to prevent race conditions. Thus the 6861 * lower lock must be grabbed before the higher lock. 6862 * 6863 * This will block hat_unload's accessing p_mapping list. Since 6864 * we have SE_EXCL lock, hat_memload and hat_pageunload will be 6865 * blocked. Thus, no one else will be accessing the p_mapping list 6866 * while we suspend and reload the locked mapping below. 6867 */ 6868 tpp = targ; 6869 rpp = repl; 6870 sfmmu_mlist_reloc_enter(tpp, rpp, &low, &high); 6871 6872 kpreempt_disable(); 6873 6874 /* 6875 * We raise our PIL to 13 so that we don't get captured by 6876 * another CPU or pinned by an interrupt thread. We can't go to 6877 * PIL 14 since the nexus driver(s) may need to interrupt at 6878 * that level in the case of IOMMU pseudo mappings. 6879 */ 6880 cpuset = cpu_ready_set; 6881 CPUSET_DEL(cpuset, CPU->cpu_id); 6882 if (!cap_cpus || CPUSET_ISNULL(cpuset)) { 6883 old_pil = splr(XCALL_PIL); 6884 } else { 6885 old_pil = -1; 6886 xc_attention(cpuset); 6887 } 6888 ASSERT(getpil() == XCALL_PIL); 6889 6890 /* 6891 * Now do suspend callbacks. In the case of an IOMMU mapping 6892 * this will suspend all DMA activity to the page while it is 6893 * being relocated. Since we are well above LOCK_LEVEL and CPUs 6894 * may be captured at this point we should have acquired any needed 6895 * locks in the presuspend callback. 6896 */ 6897 ret = hat_pageprocess_precallbacks(targ, HAT_SUSPEND, NULL); 6898 if (ret != 0) { 6899 repl = targ; 6900 goto suspend_fail; 6901 } 6902 6903 /* 6904 * Raise the PIL yet again, this time to block all high-level 6905 * interrupts on this CPU. This is necessary to prevent an 6906 * interrupt routine from pinning the thread which holds the 6907 * mapping suspended and then touching the suspended page. 6908 * 6909 * Once the page is suspended we also need to be careful to 6910 * avoid calling any functions which touch any seg_kmem memory 6911 * since that memory may be backed by the very page we are 6912 * relocating in here! 6913 */ 6914 hat_pagesuspend(targ); 6915 6916 /* 6917 * Now that we are confident everybody has stopped using this page, 6918 * copy the page contents. Note we use a physical copy to prevent 6919 * locking issues and to avoid fpRAS because we can't handle it in 6920 * this context. 6921 */ 6922 for (i = 0; i < npages; i++, tpp++, rpp++) { 6923 #ifdef VAC 6924 /* 6925 * If the replacement has a different vcolor than 6926 * the one being replacd, we need to handle VAC 6927 * consistency for it just as we were setting up 6928 * a new mapping to it. 6929 */ 6930 if ((PP_GET_VCOLOR(rpp) != NO_VCOLOR) && 6931 (tpp->p_vcolor != rpp->p_vcolor) && 6932 !CacheColor_IsFlushed(cflags, PP_GET_VCOLOR(rpp))) { 6933 CacheColor_SetFlushed(cflags, PP_GET_VCOLOR(rpp)); 6934 sfmmu_cache_flushcolor(PP_GET_VCOLOR(rpp), 6935 rpp->p_pagenum); 6936 } 6937 #endif 6938 /* 6939 * Copy the contents of the page. 6940 */ 6941 ppcopy_kernel(tpp, rpp); 6942 } 6943 6944 tpp = targ; 6945 rpp = repl; 6946 for (i = 0; i < npages; i++, tpp++, rpp++) { 6947 /* 6948 * Copy attributes. VAC consistency was handled above, 6949 * if required. 
6950 */ 6951 ppattr = hat_page_getattr(tpp, (P_MOD | P_REF | P_RO)); 6952 page_clr_all_props(rpp, 0); 6953 page_set_props(rpp, ppattr); 6954 rpp->p_index = tpp->p_index; 6955 tpp->p_index = 0; 6956 #ifdef VAC 6957 rpp->p_vcolor = tpp->p_vcolor; 6958 #endif 6959 } 6960 6961 /* 6962 * First, unsuspend the page, if we set the suspend bit, and transfer 6963 * the mapping list from the target page to the replacement page. 6964 * Next process postcallbacks; since pa_hment's are linked only to the 6965 * p_mapping list of root page, we don't iterate over the constituent 6966 * pages. 6967 */ 6968 hat_pagereload(targ, repl); 6969 6970 suspend_fail: 6971 hat_pageprocess_postcallbacks(repl, HAT_UNSUSPEND); 6972 6973 /* 6974 * Now lower our PIL and release any captured CPUs since we 6975 * are out of the "danger zone". After this it will again be 6976 * safe to acquire adaptive mutex locks, or to drop them... 6977 */ 6978 if (old_pil != -1) { 6979 splx(old_pil); 6980 } else { 6981 xc_dismissed(cpuset); 6982 } 6983 6984 kpreempt_enable(); 6985 6986 sfmmu_mlist_reloc_exit(low, high); 6987 6988 /* 6989 * Postsuspend callbacks should drop any locks held across 6990 * the suspend callbacks. As before, we don't hold the mapping 6991 * list lock at this point.. our assumption is that the mapping 6992 * list still can't change due to our holding SE_EXCL lock and 6993 * there being no unlocked mappings left. Hence the restriction 6994 * on calling context to hat_delete_callback() 6995 */ 6996 hat_pageprocess_postcallbacks(repl, HAT_POSTUNSUSPEND); 6997 if (ret != 0) { 6998 /* 6999 * The second presuspend call failed: we got here through 7000 * the suspend_fail label above. 7001 */ 7002 ASSERT(ret != EIO); 7003 PAGE_RELOCATE_LOG(target, replacement, ret, cap_cpus); 7004 kreloc_thread = NULL; 7005 mutex_exit(&kpr_mutex); 7006 return (EAGAIN); 7007 } 7008 7009 /* 7010 * Now that we're out of the performance critical section we can 7011 * take care of updating the hash table, since we still 7012 * hold all the pages locked SE_EXCL at this point we 7013 * needn't worry about things changing out from under us. 7014 */ 7015 tpp = targ; 7016 rpp = repl; 7017 for (i = 0; i < npages; i++, tpp++, rpp++) { 7018 7019 /* 7020 * replace targ with replacement in page_hash table 7021 */ 7022 targ = tpp; 7023 page_relocate_hash(rpp, targ); 7024 7025 /* 7026 * concatenate target; caller of platform_page_relocate() 7027 * expects target to be concatenated after returning. 7028 */ 7029 ASSERT(targ->p_next == targ); 7030 ASSERT(targ->p_prev == targ); 7031 page_list_concat(&pl, &targ); 7032 } 7033 7034 ASSERT(*target == pl); 7035 *nrelocp = npages; 7036 PAGE_RELOCATE_LOG(target, replacement, 0, cap_cpus); 7037 kreloc_thread = NULL; 7038 mutex_exit(&kpr_mutex); 7039 return (0); 7040 } 7041 7042 /* 7043 * Called when stray pa_hments are found attached to a page which is 7044 * being freed. Notify the subsystem which attached the pa_hment of 7045 * the error if it registered a suitable handler, else panic. 7046 */ 7047 static void 7048 sfmmu_pahment_leaked(struct pa_hment *pahmep) 7049 { 7050 id_t cb_id = pahmep->cb_id; 7051 7052 ASSERT(cb_id >= (id_t)0 && cb_id < sfmmu_cb_nextid); 7053 if (sfmmu_cb_table[cb_id].errhandler != NULL) { 7054 if (sfmmu_cb_table[cb_id].errhandler(pahmep->addr, pahmep->len, 7055 HAT_CB_ERR_LEAKED, pahmep->pvt) == 0) 7056 return; /* non-fatal */ 7057 } 7058 panic("pa_hment leaked: 0x%p", (void *)pahmep); 7059 } 7060 7061 /* 7062 * Remove all mappings to page 'pp'. 
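 *
 * Illustrative usage sketch, not part of this file; the page is assumed
 * to already be held SE_EXCL by the caller, as asserted below:
 *
 *	ASSERT(PAGE_EXCL(pp));
 *	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
 *
 * The SFMMU_KERNEL_RELOC value of forceflag is reserved for the kernel
 * page relocation path (hat_page_relocate() above), which must leave
 * locked kernel mappings in place so they can be suspended instead.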
7063 */ 7064 int 7065 hat_pageunload(struct page *pp, uint_t forceflag) 7066 { 7067 struct page *origpp = pp; 7068 struct sf_hment *sfhme, *tmphme; 7069 struct hme_blk *hmeblkp; 7070 kmutex_t *pml; 7071 #ifdef VAC 7072 kmutex_t *pmtx; 7073 #endif 7074 cpuset_t cpuset, tset; 7075 int index, cons; 7076 int xhme_blks; 7077 int pa_hments; 7078 7079 ASSERT(PAGE_EXCL(pp)); 7080 7081 retry_xhat: 7082 tmphme = NULL; 7083 xhme_blks = 0; 7084 pa_hments = 0; 7085 CPUSET_ZERO(cpuset); 7086 7087 pml = sfmmu_mlist_enter(pp); 7088 7089 #ifdef VAC 7090 if (pp->p_kpmref) 7091 sfmmu_kpm_pageunload(pp); 7092 ASSERT(!PP_ISMAPPED_KPM(pp)); 7093 #endif 7094 7095 index = PP_MAPINDEX(pp); 7096 cons = TTE8K; 7097 retry: 7098 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) { 7099 tmphme = sfhme->hme_next; 7100 7101 if (IS_PAHME(sfhme)) { 7102 ASSERT(sfhme->hme_data != NULL); 7103 pa_hments++; 7104 continue; 7105 } 7106 7107 hmeblkp = sfmmu_hmetohblk(sfhme); 7108 if (hmeblkp->hblk_xhat_bit) { 7109 struct xhat_hme_blk *xblk = 7110 (struct xhat_hme_blk *)hmeblkp; 7111 7112 (void) XHAT_PAGEUNLOAD(xblk->xhat_hme_blk_hat, 7113 pp, forceflag, XBLK2PROVBLK(xblk)); 7114 7115 xhme_blks = 1; 7116 continue; 7117 } 7118 7119 /* 7120 * If there are kernel mappings don't unload them, they will 7121 * be suspended. 7122 */ 7123 if (forceflag == SFMMU_KERNEL_RELOC && hmeblkp->hblk_lckcnt && 7124 hmeblkp->hblk_tag.htag_id == ksfmmup) 7125 continue; 7126 7127 tset = sfmmu_pageunload(pp, sfhme, cons); 7128 CPUSET_OR(cpuset, tset); 7129 } 7130 7131 while (index != 0) { 7132 index = index >> 1; 7133 if (index != 0) 7134 cons++; 7135 if (index & 0x1) { 7136 /* Go to leading page */ 7137 pp = PP_GROUPLEADER(pp, cons); 7138 ASSERT(sfmmu_mlist_held(pp)); 7139 goto retry; 7140 } 7141 } 7142 7143 /* 7144 * cpuset may be empty if the page was only mapped by segkpm, 7145 * in which case we won't actually cross-trap. 7146 */ 7147 xt_sync(cpuset); 7148 7149 /* 7150 * The page should have no mappings at this point, unless 7151 * we were called from hat_page_relocate() in which case we 7152 * leave the locked mappings which will be suspended later. 7153 */ 7154 ASSERT(!PP_ISMAPPED(origpp) || xhme_blks || pa_hments || 7155 (forceflag == SFMMU_KERNEL_RELOC)); 7156 7157 #ifdef VAC 7158 if (PP_ISTNC(pp)) { 7159 if (cons == TTE8K) { 7160 pmtx = sfmmu_page_enter(pp); 7161 PP_CLRTNC(pp); 7162 sfmmu_page_exit(pmtx); 7163 } else { 7164 conv_tnc(pp, cons); 7165 } 7166 } 7167 #endif /* VAC */ 7168 7169 if (pa_hments && forceflag != SFMMU_KERNEL_RELOC) { 7170 /* 7171 * Unlink any pa_hments and free them, calling back 7172 * the responsible subsystem to notify it of the error. 7173 * This can occur in situations such as drivers leaking 7174 * DMA handles: naughty, but common enough that we'd like 7175 * to keep the system running rather than bringing it 7176 * down with an obscure error like "pa_hment leaked" 7177 * which doesn't aid the user in debugging their driver. 7178 */ 7179 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) { 7180 tmphme = sfhme->hme_next; 7181 if (IS_PAHME(sfhme)) { 7182 struct pa_hment *pahmep = sfhme->hme_data; 7183 sfmmu_pahment_leaked(pahmep); 7184 HME_SUB(sfhme, pp); 7185 kmem_cache_free(pa_hment_cache, pahmep); 7186 } 7187 } 7188 7189 ASSERT(!PP_ISMAPPED(origpp) || xhme_blks); 7190 } 7191 7192 sfmmu_mlist_exit(pml); 7193 7194 /* 7195 * XHAT may not have finished unloading pages 7196 * because some other thread was waiting for 7197 * mlist lock and XHAT_PAGEUNLOAD let it do 7198 * the job. 
7199 */ 7200 if (xhme_blks) { 7201 pp = origpp; 7202 goto retry_xhat; 7203 } 7204 7205 return (0); 7206 } 7207 7208 cpuset_t 7209 sfmmu_pageunload(page_t *pp, struct sf_hment *sfhme, int cons) 7210 { 7211 struct hme_blk *hmeblkp; 7212 sfmmu_t *sfmmup; 7213 tte_t tte, ttemod; 7214 #ifdef DEBUG 7215 tte_t orig_old; 7216 #endif /* DEBUG */ 7217 caddr_t addr; 7218 int ttesz; 7219 int ret; 7220 cpuset_t cpuset; 7221 7222 ASSERT(pp != NULL); 7223 ASSERT(sfmmu_mlist_held(pp)); 7224 ASSERT(!PP_ISKAS(pp)); 7225 7226 CPUSET_ZERO(cpuset); 7227 7228 hmeblkp = sfmmu_hmetohblk(sfhme); 7229 7230 readtte: 7231 sfmmu_copytte(&sfhme->hme_tte, &tte); 7232 if (TTE_IS_VALID(&tte)) { 7233 sfmmup = hblktosfmmu(hmeblkp); 7234 ttesz = get_hblk_ttesz(hmeblkp); 7235 /* 7236 * Only unload mappings of 'cons' size. 7237 */ 7238 if (ttesz != cons) 7239 return (cpuset); 7240 7241 /* 7242 * Note that we have p_mapping lock, but no hash lock here. 7243 * hblk_unload() has to have both hash lock AND p_mapping 7244 * lock before it tries to modify tte. So, the tte could 7245 * not become invalid in the sfmmu_modifytte_try() below. 7246 */ 7247 ttemod = tte; 7248 #ifdef DEBUG 7249 orig_old = tte; 7250 #endif /* DEBUG */ 7251 7252 TTE_SET_INVALID(&ttemod); 7253 ret = sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte); 7254 if (ret < 0) { 7255 #ifdef DEBUG 7256 /* only R/M bits can change. */ 7257 chk_tte(&orig_old, &tte, &ttemod, hmeblkp); 7258 #endif /* DEBUG */ 7259 goto readtte; 7260 } 7261 7262 if (ret == 0) { 7263 panic("pageunload: cas failed?"); 7264 } 7265 7266 addr = tte_to_vaddr(hmeblkp, tte); 7267 7268 if (hmeblkp->hblk_shared) { 7269 sf_srd_t *srdp = (sf_srd_t *)sfmmup; 7270 uint_t rid = hmeblkp->hblk_tag.htag_rid; 7271 sf_region_t *rgnp; 7272 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 7273 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 7274 ASSERT(srdp != NULL); 7275 rgnp = srdp->srd_hmergnp[rid]; 7276 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid); 7277 cpuset = sfmmu_rgntlb_demap(addr, rgnp, hmeblkp, 1); 7278 sfmmu_ttesync(NULL, addr, &tte, pp); 7279 ASSERT(rgnp->rgn_ttecnt[ttesz] > 0); 7280 atomic_add_long(&rgnp->rgn_ttecnt[ttesz], -1); 7281 } else { 7282 sfmmu_ttesync(sfmmup, addr, &tte, pp); 7283 atomic_add_long(&sfmmup->sfmmu_ttecnt[ttesz], -1); 7284 7285 /* 7286 * We need to flush the page from the virtual cache 7287 * in order to prevent a virtual cache alias 7288 * inconsistency. The particular scenario we need 7289 * to worry about is: 7290 * Given: va1 and va2 are two virtual address that 7291 * alias and will map the same physical address. 7292 * 1. mapping exists from va1 to pa and data has 7293 * been read into the cache. 7294 * 2. unload va1. 7295 * 3. load va2 and modify data using va2. 7296 * 4 unload va2. 7297 * 5. load va1 and reference data. Unless we flush 7298 * the data cache when we unload we will get 7299 * stale data. 7300 * This scenario is taken care of by using virtual 7301 * page coloring. 7302 */ 7303 if (sfmmup->sfmmu_ismhat) { 7304 /* 7305 * Flush TSBs, TLBs and caches 7306 * of every process 7307 * sharing this ism segment. 
7308 */ 7309 sfmmu_hat_lock_all(); 7310 mutex_enter(&ism_mlist_lock); 7311 kpreempt_disable(); 7312 sfmmu_ismtlbcache_demap(addr, sfmmup, hmeblkp, 7313 pp->p_pagenum, CACHE_NO_FLUSH); 7314 kpreempt_enable(); 7315 mutex_exit(&ism_mlist_lock); 7316 sfmmu_hat_unlock_all(); 7317 cpuset = cpu_ready_set; 7318 } else { 7319 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0); 7320 cpuset = sfmmup->sfmmu_cpusran; 7321 } 7322 } 7323 7324 /* 7325 * Hme_sub has to run after ttesync() and a_rss update. 7326 * See hblk_unload(). 7327 */ 7328 HME_SUB(sfhme, pp); 7329 membar_stst(); 7330 7331 /* 7332 * We can not make ASSERT(hmeblkp->hblk_hmecnt <= NHMENTS) 7333 * since pteload may have done a HME_ADD() right after 7334 * we did the HME_SUB() above. Hmecnt is now maintained 7335 * by cas only. no lock guranteed its value. The only 7336 * gurantee we have is the hmecnt should not be less than 7337 * what it should be so the hblk will not be taken away. 7338 * It's also important that we decremented the hmecnt after 7339 * we are done with hmeblkp so that this hmeblk won't be 7340 * stolen. 7341 */ 7342 ASSERT(hmeblkp->hblk_hmecnt > 0); 7343 ASSERT(hmeblkp->hblk_vcnt > 0); 7344 atomic_add_16(&hmeblkp->hblk_vcnt, -1); 7345 atomic_add_16(&hmeblkp->hblk_hmecnt, -1); 7346 /* 7347 * This is bug 4063182. 7348 * XXX: fixme 7349 * ASSERT(hmeblkp->hblk_hmecnt || hmeblkp->hblk_vcnt || 7350 * !hmeblkp->hblk_lckcnt); 7351 */ 7352 } else { 7353 panic("invalid tte? pp %p &tte %p", 7354 (void *)pp, (void *)&tte); 7355 } 7356 7357 return (cpuset); 7358 } 7359 7360 /* 7361 * While relocating a kernel page, this function will move the mappings 7362 * from tpp to dpp and modify any associated data with these mappings. 7363 * It also unsuspends the suspended kernel mapping. 7364 */ 7365 static void 7366 hat_pagereload(struct page *tpp, struct page *dpp) 7367 { 7368 struct sf_hment *sfhme; 7369 tte_t tte, ttemod; 7370 int index, cons; 7371 7372 ASSERT(getpil() == PIL_MAX); 7373 ASSERT(sfmmu_mlist_held(tpp)); 7374 ASSERT(sfmmu_mlist_held(dpp)); 7375 7376 index = PP_MAPINDEX(tpp); 7377 cons = TTE8K; 7378 7379 /* Update real mappings to the page */ 7380 retry: 7381 for (sfhme = tpp->p_mapping; sfhme != NULL; sfhme = sfhme->hme_next) { 7382 if (IS_PAHME(sfhme)) 7383 continue; 7384 sfmmu_copytte(&sfhme->hme_tte, &tte); 7385 ttemod = tte; 7386 7387 /* 7388 * replace old pfn with new pfn in TTE 7389 */ 7390 PFN_TO_TTE(ttemod, dpp->p_pagenum); 7391 7392 /* 7393 * clear suspend bit 7394 */ 7395 ASSERT(TTE_IS_SUSPEND(&ttemod)); 7396 TTE_CLR_SUSPEND(&ttemod); 7397 7398 if (sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte) < 0) 7399 panic("hat_pagereload(): sfmmu_modifytte_try() failed"); 7400 7401 /* 7402 * set hme_page point to new page 7403 */ 7404 sfhme->hme_page = dpp; 7405 } 7406 7407 /* 7408 * move p_mapping list from old page to new page 7409 */ 7410 dpp->p_mapping = tpp->p_mapping; 7411 tpp->p_mapping = NULL; 7412 dpp->p_share = tpp->p_share; 7413 tpp->p_share = 0; 7414 7415 while (index != 0) { 7416 index = index >> 1; 7417 if (index != 0) 7418 cons++; 7419 if (index & 0x1) { 7420 tpp = PP_GROUPLEADER(tpp, cons); 7421 dpp = PP_GROUPLEADER(dpp, cons); 7422 goto retry; 7423 } 7424 } 7425 7426 curthread->t_flag &= ~T_DONTDTRACE; 7427 mutex_exit(&kpr_suspendlock); 7428 } 7429 7430 uint_t 7431 hat_pagesync(struct page *pp, uint_t clearflag) 7432 { 7433 struct sf_hment *sfhme, *tmphme = NULL; 7434 struct hme_blk *hmeblkp; 7435 kmutex_t *pml; 7436 cpuset_t cpuset, tset; 7437 int index, cons; 7438 extern ulong_t po_share; 7439 page_t 
*save_pp = pp; 7440 int stop_on_sh = 0; 7441 uint_t shcnt; 7442 7443 CPUSET_ZERO(cpuset); 7444 7445 if (PP_ISRO(pp) && (clearflag & HAT_SYNC_STOPON_MOD)) { 7446 return (PP_GENERIC_ATTR(pp)); 7447 } 7448 7449 if ((clearflag & HAT_SYNC_ZERORM) == 0) { 7450 if ((clearflag & HAT_SYNC_STOPON_REF) && PP_ISREF(pp)) { 7451 return (PP_GENERIC_ATTR(pp)); 7452 } 7453 if ((clearflag & HAT_SYNC_STOPON_MOD) && PP_ISMOD(pp)) { 7454 return (PP_GENERIC_ATTR(pp)); 7455 } 7456 if (clearflag & HAT_SYNC_STOPON_SHARED) { 7457 if (pp->p_share > po_share) { 7458 hat_page_setattr(pp, P_REF); 7459 return (PP_GENERIC_ATTR(pp)); 7460 } 7461 stop_on_sh = 1; 7462 shcnt = 0; 7463 } 7464 } 7465 7466 clearflag &= ~HAT_SYNC_STOPON_SHARED; 7467 pml = sfmmu_mlist_enter(pp); 7468 index = PP_MAPINDEX(pp); 7469 cons = TTE8K; 7470 retry: 7471 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) { 7472 /* 7473 * We need to save the next hment on the list since 7474 * it is possible for pagesync to remove an invalid hment 7475 * from the list. 7476 */ 7477 tmphme = sfhme->hme_next; 7478 if (IS_PAHME(sfhme)) 7479 continue; 7480 /* 7481 * If we are looking for large mappings and this hme doesn't 7482 * reach the range we are seeking, just ignore it. 7483 */ 7484 hmeblkp = sfmmu_hmetohblk(sfhme); 7485 if (hmeblkp->hblk_xhat_bit) 7486 continue; 7487 7488 if (hme_size(sfhme) < cons) 7489 continue; 7490 7491 if (stop_on_sh) { 7492 if (hmeblkp->hblk_shared) { 7493 sf_srd_t *srdp = hblktosrd(hmeblkp); 7494 uint_t rid = hmeblkp->hblk_tag.htag_rid; 7495 sf_region_t *rgnp; 7496 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 7497 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 7498 ASSERT(srdp != NULL); 7499 rgnp = srdp->srd_hmergnp[rid]; 7500 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, 7501 rgnp, rid); 7502 shcnt += rgnp->rgn_refcnt; 7503 } else { 7504 shcnt++; 7505 } 7506 if (shcnt > po_share) { 7507 /* 7508 * tell the pager to spare the page this time 7509 * around. 7510 */ 7511 hat_page_setattr(save_pp, P_REF); 7512 index = 0; 7513 break; 7514 } 7515 } 7516 tset = sfmmu_pagesync(pp, sfhme, 7517 clearflag & ~HAT_SYNC_STOPON_RM); 7518 CPUSET_OR(cpuset, tset); 7519 7520 /* 7521 * If clearflag is HAT_SYNC_DONTZERO, break out as soon 7522 * as the "ref" or "mod" is set or share cnt exceeds po_share. 
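		 *
		 * Illustrative usage sketch, not part of this file: a
		 * caller that only needs to know whether the page has
		 * been referenced can request the early exit ("attr" is
		 * a hypothetical local):
		 *
		 *	attr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
		 *	    HAT_SYNC_STOPON_REF);
		 *	if (attr & P_REF)
		 *		the page was referenced recently;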
7523 */ 7524 if ((clearflag & ~HAT_SYNC_STOPON_RM) == HAT_SYNC_DONTZERO && 7525 (((clearflag & HAT_SYNC_STOPON_MOD) && PP_ISMOD(save_pp)) || 7526 ((clearflag & HAT_SYNC_STOPON_REF) && PP_ISREF(save_pp)))) { 7527 index = 0; 7528 break; 7529 } 7530 } 7531 7532 while (index) { 7533 index = index >> 1; 7534 cons++; 7535 if (index & 0x1) { 7536 /* Go to leading page */ 7537 pp = PP_GROUPLEADER(pp, cons); 7538 goto retry; 7539 } 7540 } 7541 7542 xt_sync(cpuset); 7543 sfmmu_mlist_exit(pml); 7544 return (PP_GENERIC_ATTR(save_pp)); 7545 } 7546 7547 /* 7548 * Get all the hardware dependent attributes for a page struct 7549 */ 7550 static cpuset_t 7551 sfmmu_pagesync(struct page *pp, struct sf_hment *sfhme, 7552 uint_t clearflag) 7553 { 7554 caddr_t addr; 7555 tte_t tte, ttemod; 7556 struct hme_blk *hmeblkp; 7557 int ret; 7558 sfmmu_t *sfmmup; 7559 cpuset_t cpuset; 7560 7561 ASSERT(pp != NULL); 7562 ASSERT(sfmmu_mlist_held(pp)); 7563 ASSERT((clearflag == HAT_SYNC_DONTZERO) || 7564 (clearflag == HAT_SYNC_ZERORM)); 7565 7566 SFMMU_STAT(sf_pagesync); 7567 7568 CPUSET_ZERO(cpuset); 7569 7570 sfmmu_pagesync_retry: 7571 7572 sfmmu_copytte(&sfhme->hme_tte, &tte); 7573 if (TTE_IS_VALID(&tte)) { 7574 hmeblkp = sfmmu_hmetohblk(sfhme); 7575 sfmmup = hblktosfmmu(hmeblkp); 7576 addr = tte_to_vaddr(hmeblkp, tte); 7577 if (clearflag == HAT_SYNC_ZERORM) { 7578 ttemod = tte; 7579 TTE_CLR_RM(&ttemod); 7580 ret = sfmmu_modifytte_try(&tte, &ttemod, 7581 &sfhme->hme_tte); 7582 if (ret < 0) { 7583 /* 7584 * cas failed and the new value is not what 7585 * we want. 7586 */ 7587 goto sfmmu_pagesync_retry; 7588 } 7589 7590 if (ret > 0) { 7591 /* we win the cas */ 7592 if (hmeblkp->hblk_shared) { 7593 sf_srd_t *srdp = (sf_srd_t *)sfmmup; 7594 uint_t rid = 7595 hmeblkp->hblk_tag.htag_rid; 7596 sf_region_t *rgnp; 7597 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 7598 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 7599 ASSERT(srdp != NULL); 7600 rgnp = srdp->srd_hmergnp[rid]; 7601 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, 7602 srdp, rgnp, rid); 7603 cpuset = sfmmu_rgntlb_demap(addr, 7604 rgnp, hmeblkp, 1); 7605 } else { 7606 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 7607 0, 0); 7608 cpuset = sfmmup->sfmmu_cpusran; 7609 } 7610 } 7611 } 7612 sfmmu_ttesync(hmeblkp->hblk_shared ? NULL : sfmmup, addr, 7613 &tte, pp); 7614 } 7615 return (cpuset); 7616 } 7617 7618 /* 7619 * Remove write permission from a mappings to a page, so that 7620 * we can detect the next modification of it. This requires modifying 7621 * the TTE then invalidating (demap) any TLB entry using that TTE. 7622 * This code is similar to sfmmu_pagesync(). 7623 */ 7624 static cpuset_t 7625 sfmmu_pageclrwrt(struct page *pp, struct sf_hment *sfhme) 7626 { 7627 caddr_t addr; 7628 tte_t tte; 7629 tte_t ttemod; 7630 struct hme_blk *hmeblkp; 7631 int ret; 7632 sfmmu_t *sfmmup; 7633 cpuset_t cpuset; 7634 7635 ASSERT(pp != NULL); 7636 ASSERT(sfmmu_mlist_held(pp)); 7637 7638 CPUSET_ZERO(cpuset); 7639 SFMMU_STAT(sf_clrwrt); 7640 7641 retry: 7642 7643 sfmmu_copytte(&sfhme->hme_tte, &tte); 7644 if (TTE_IS_VALID(&tte) && TTE_IS_WRITABLE(&tte)) { 7645 hmeblkp = sfmmu_hmetohblk(sfhme); 7646 7647 /* 7648 * xhat mappings should never be to a VMODSORT page. 
7649 */ 7650 ASSERT(hmeblkp->hblk_xhat_bit == 0); 7651 7652 sfmmup = hblktosfmmu(hmeblkp); 7653 addr = tte_to_vaddr(hmeblkp, tte); 7654 7655 ttemod = tte; 7656 TTE_CLR_WRT(&ttemod); 7657 TTE_CLR_MOD(&ttemod); 7658 ret = sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte); 7659 7660 /* 7661 * if cas failed and the new value is not what 7662 * we want retry 7663 */ 7664 if (ret < 0) 7665 goto retry; 7666 7667 /* we win the cas */ 7668 if (ret > 0) { 7669 if (hmeblkp->hblk_shared) { 7670 sf_srd_t *srdp = (sf_srd_t *)sfmmup; 7671 uint_t rid = hmeblkp->hblk_tag.htag_rid; 7672 sf_region_t *rgnp; 7673 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 7674 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 7675 ASSERT(srdp != NULL); 7676 rgnp = srdp->srd_hmergnp[rid]; 7677 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, 7678 srdp, rgnp, rid); 7679 cpuset = sfmmu_rgntlb_demap(addr, 7680 rgnp, hmeblkp, 1); 7681 } else { 7682 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0); 7683 cpuset = sfmmup->sfmmu_cpusran; 7684 } 7685 } 7686 } 7687 7688 return (cpuset); 7689 } 7690 7691 /* 7692 * Walk all mappings of a page, removing write permission and clearing the 7693 * ref/mod bits. This code is similar to hat_pagesync() 7694 */ 7695 static void 7696 hat_page_clrwrt(page_t *pp) 7697 { 7698 struct sf_hment *sfhme; 7699 struct sf_hment *tmphme = NULL; 7700 kmutex_t *pml; 7701 cpuset_t cpuset; 7702 cpuset_t tset; 7703 int index; 7704 int cons; 7705 7706 CPUSET_ZERO(cpuset); 7707 7708 pml = sfmmu_mlist_enter(pp); 7709 index = PP_MAPINDEX(pp); 7710 cons = TTE8K; 7711 retry: 7712 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) { 7713 tmphme = sfhme->hme_next; 7714 7715 /* 7716 * If we are looking for large mappings and this hme doesn't 7717 * reach the range we are seeking, just ignore its. 7718 */ 7719 7720 if (hme_size(sfhme) < cons) 7721 continue; 7722 7723 tset = sfmmu_pageclrwrt(pp, sfhme); 7724 CPUSET_OR(cpuset, tset); 7725 } 7726 7727 while (index) { 7728 index = index >> 1; 7729 cons++; 7730 if (index & 0x1) { 7731 /* Go to leading page */ 7732 pp = PP_GROUPLEADER(pp, cons); 7733 goto retry; 7734 } 7735 } 7736 7737 xt_sync(cpuset); 7738 sfmmu_mlist_exit(pml); 7739 } 7740 7741 /* 7742 * Set the given REF/MOD/RO bits for the given page. 7743 * For a vnode with a sorted v_pages list, we need to change 7744 * the attributes and the v_pages list together under page_vnode_mutex. 7745 */ 7746 void 7747 hat_page_setattr(page_t *pp, uint_t flag) 7748 { 7749 vnode_t *vp = pp->p_vnode; 7750 page_t **listp; 7751 kmutex_t *pmtx; 7752 kmutex_t *vphm = NULL; 7753 int noshuffle; 7754 7755 noshuffle = flag & P_NSH; 7756 flag &= ~P_NSH; 7757 7758 ASSERT(!(flag & ~(P_MOD | P_REF | P_RO | P_EXEC))); 7759 7760 /* 7761 * nothing to do if attribute already set 7762 */ 7763 if ((pp->p_nrm & flag) == flag) 7764 return; 7765 7766 if ((flag & P_MOD) != 0 && vp != NULL && IS_VMODSORT(vp) && 7767 !noshuffle) { 7768 vphm = page_vnode_mutex(vp); 7769 mutex_enter(vphm); 7770 } 7771 7772 pmtx = sfmmu_page_enter(pp); 7773 pp->p_nrm |= flag; 7774 sfmmu_page_exit(pmtx); 7775 7776 if (vphm != NULL) { 7777 /* 7778 * Some File Systems examine v_pages for NULL w/o 7779 * grabbing the vphm mutex. Must not let it become NULL when 7780 * pp is the only page on the list. 
7781 */ 7782 if (pp->p_vpnext != pp) { 7783 page_vpsub(&vp->v_pages, pp); 7784 if (vp->v_pages != NULL) 7785 listp = &vp->v_pages->p_vpprev->p_vpnext; 7786 else 7787 listp = &vp->v_pages; 7788 page_vpadd(listp, pp); 7789 } 7790 mutex_exit(vphm); 7791 } 7792 } 7793 7794 void 7795 hat_page_clrattr(page_t *pp, uint_t flag) 7796 { 7797 vnode_t *vp = pp->p_vnode; 7798 kmutex_t *pmtx; 7799 7800 ASSERT(!(flag & ~(P_MOD | P_REF | P_RO))); 7801 7802 pmtx = sfmmu_page_enter(pp); 7803 7804 /* 7805 * Caller is expected to hold page's io lock for VMODSORT to work 7806 * correctly with pvn_vplist_dirty() and pvn_getdirty() when mod 7807 * bit is cleared. 7808 * We don't have assert to avoid tripping some existing third party 7809 * code. The dirty page is moved back to top of the v_page list 7810 * after IO is done in pvn_write_done(). 7811 */ 7812 pp->p_nrm &= ~flag; 7813 sfmmu_page_exit(pmtx); 7814 7815 if ((flag & P_MOD) != 0 && vp != NULL && IS_VMODSORT(vp)) { 7816 7817 /* 7818 * VMODSORT works by removing write permissions and getting 7819 * a fault when a page is made dirty. At this point 7820 * we need to remove write permission from all mappings 7821 * to this page. 7822 */ 7823 hat_page_clrwrt(pp); 7824 } 7825 } 7826 7827 uint_t 7828 hat_page_getattr(page_t *pp, uint_t flag) 7829 { 7830 ASSERT(!(flag & ~(P_MOD | P_REF | P_RO))); 7831 return ((uint_t)(pp->p_nrm & flag)); 7832 } 7833 7834 /* 7835 * DEBUG kernels: verify that a kernel va<->pa translation 7836 * is safe by checking the underlying page_t is in a page 7837 * relocation-safe state. 7838 */ 7839 #ifdef DEBUG 7840 void 7841 sfmmu_check_kpfn(pfn_t pfn) 7842 { 7843 page_t *pp; 7844 int index, cons; 7845 7846 if (hat_check_vtop == 0) 7847 return; 7848 7849 if (hat_kpr_enabled == 0 || kvseg.s_base == NULL || panicstr) 7850 return; 7851 7852 pp = page_numtopp_nolock(pfn); 7853 if (!pp) 7854 return; 7855 7856 if (PAGE_LOCKED(pp) || PP_ISNORELOC(pp)) 7857 return; 7858 7859 /* 7860 * Handed a large kernel page, we dig up the root page since we 7861 * know the root page might have the lock also. 7862 */ 7863 if (pp->p_szc != 0) { 7864 index = PP_MAPINDEX(pp); 7865 cons = TTE8K; 7866 again: 7867 while (index != 0) { 7868 index >>= 1; 7869 if (index != 0) 7870 cons++; 7871 if (index & 0x1) { 7872 pp = PP_GROUPLEADER(pp, cons); 7873 goto again; 7874 } 7875 } 7876 } 7877 7878 if (PAGE_LOCKED(pp) || PP_ISNORELOC(pp)) 7879 return; 7880 7881 /* 7882 * Pages need to be locked or allocated "permanent" (either from 7883 * static_arena arena or explicitly setting PG_NORELOC when calling 7884 * page_create_va()) for VA->PA translations to be valid. 7885 */ 7886 if (!PP_ISNORELOC(pp)) 7887 panic("Illegal VA->PA translation, pp 0x%p not permanent", 7888 (void *)pp); 7889 else 7890 panic("Illegal VA->PA translation, pp 0x%p not locked", 7891 (void *)pp); 7892 } 7893 #endif /* DEBUG */ 7894 7895 /* 7896 * Returns a page frame number for a given virtual address. 7897 * Returns PFN_INVALID to indicate an invalid mapping 7898 */ 7899 pfn_t 7900 hat_getpfnum(struct hat *hat, caddr_t addr) 7901 { 7902 pfn_t pfn; 7903 tte_t tte; 7904 7905 /* 7906 * We would like to 7907 * ASSERT(AS_LOCK_HELD(as, &as->a_lock)); 7908 * but we can't because the iommu driver will call this 7909 * routine at interrupt time and it can't grab the as lock 7910 * or it will deadlock: A thread could have the as lock 7911 * and be waiting for io. The io can't complete 7912 * because the interrupt thread is blocked trying to grab 7913 * the as lock. 
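	 *
	 * Because of that, callers simply check the return value.
	 * Illustrative usage sketch, not part of this file ("va" is a
	 * hypothetical caddr_t):
	 *
	 *	pfn_t pfn = hat_getpfnum(kas.a_hat, va);
	 *	if (pfn == PFN_INVALID)
	 *		the address is not currently mapped;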
7914 */ 7915 7916 ASSERT(hat->sfmmu_xhat_provider == NULL); 7917 7918 if (hat == ksfmmup) { 7919 if (IS_KMEM_VA_LARGEPAGE(addr)) { 7920 ASSERT(segkmem_lpszc > 0); 7921 pfn = sfmmu_kvaszc2pfn(addr, segkmem_lpszc); 7922 if (pfn != PFN_INVALID) { 7923 sfmmu_check_kpfn(pfn); 7924 return (pfn); 7925 } 7926 } else if (segkpm && IS_KPM_ADDR(addr)) { 7927 return (sfmmu_kpm_vatopfn(addr)); 7928 } 7929 while ((pfn = sfmmu_vatopfn(addr, ksfmmup, &tte)) 7930 == PFN_SUSPENDED) { 7931 sfmmu_vatopfn_suspended(addr, ksfmmup, &tte); 7932 } 7933 sfmmu_check_kpfn(pfn); 7934 return (pfn); 7935 } else { 7936 return (sfmmu_uvatopfn(addr, hat, NULL)); 7937 } 7938 } 7939 7940 /* 7941 * hat_getkpfnum() is an obsolete DDI routine, and its use is discouraged. 7942 * Use hat_getpfnum(kas.a_hat, ...) instead. 7943 * 7944 * We'd like to return PFN_INVALID if the mappings have underlying page_t's 7945 * but can't right now due to the fact that some software has grown to use 7946 * this interface incorrectly. So for now when the interface is misused, 7947 * return a warning to the user that in the future it won't work in the 7948 * way they're abusing it, and carry on (after disabling page relocation). 7949 */ 7950 pfn_t 7951 hat_getkpfnum(caddr_t addr) 7952 { 7953 pfn_t pfn; 7954 tte_t tte; 7955 int badcaller = 0; 7956 extern int segkmem_reloc; 7957 7958 if (segkpm && IS_KPM_ADDR(addr)) { 7959 badcaller = 1; 7960 pfn = sfmmu_kpm_vatopfn(addr); 7961 } else { 7962 while ((pfn = sfmmu_vatopfn(addr, ksfmmup, &tte)) 7963 == PFN_SUSPENDED) { 7964 sfmmu_vatopfn_suspended(addr, ksfmmup, &tte); 7965 } 7966 badcaller = pf_is_memory(pfn); 7967 } 7968 7969 if (badcaller) { 7970 /* 7971 * We can't return PFN_INVALID or the caller may panic 7972 * or corrupt the system. The only alternative is to 7973 * disable page relocation at this point for all kernel 7974 * memory. This will impact any callers of page_relocate() 7975 * such as FMA or DR. 7976 * 7977 * RFE: Add junk here to spit out an ereport so the sysadmin 7978 * can be advised that he should upgrade his device driver 7979 * so that this doesn't happen. 7980 */ 7981 hat_getkpfnum_badcall(caller()); 7982 if (hat_kpr_enabled && segkmem_reloc) { 7983 hat_kpr_enabled = 0; 7984 segkmem_reloc = 0; 7985 cmn_err(CE_WARN, "Kernel Page Relocation is DISABLED"); 7986 } 7987 } 7988 return (pfn); 7989 } 7990 7991 /* 7992 * This routine will return both pfn and tte for the vaddr. 7993 */ 7994 static pfn_t 7995 sfmmu_uvatopfn(caddr_t vaddr, struct hat *sfmmup, tte_t *ttep) 7996 { 7997 struct hmehash_bucket *hmebp; 7998 hmeblk_tag hblktag; 7999 int hmeshift, hashno = 1; 8000 struct hme_blk *hmeblkp = NULL; 8001 tte_t tte; 8002 8003 struct sf_hment *sfhmep; 8004 pfn_t pfn; 8005 8006 /* support for ISM */ 8007 ism_map_t *ism_map; 8008 ism_blk_t *ism_blkp; 8009 int i; 8010 sfmmu_t *ism_hatid = NULL; 8011 sfmmu_t *locked_hatid = NULL; 8012 sfmmu_t *sv_sfmmup = sfmmup; 8013 caddr_t sv_vaddr = vaddr; 8014 sf_srd_t *srdp; 8015 8016 if (ttep == NULL) { 8017 ttep = &tte; 8018 } else { 8019 ttep->ll = 0; 8020 } 8021 8022 ASSERT(sfmmup != ksfmmup); 8023 SFMMU_STAT(sf_user_vtop); 8024 /* 8025 * Set ism_hatid if vaddr falls in a ISM segment. 
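 *
 * Rebasing sketch (mirrors the loop below): when vaddr lies inside
 * ISM map slot i, the lookup is redirected to the shared ISM hat and
 * the address is made relative to the segment start:
 *
 *	if (vaddr >= ism_start(ism_map[i]) &&
 *	    vaddr < ism_end(ism_map[i])) {
 *		sfmmup = ism_map[i].imap_ismhat;
 *		vaddr -= ism_start(ism_map[i]);
 *	}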
8026 */ 8027 ism_blkp = sfmmup->sfmmu_iblk; 8028 if (ism_blkp != NULL) { 8029 sfmmu_ismhat_enter(sfmmup, 0); 8030 locked_hatid = sfmmup; 8031 } 8032 while (ism_blkp != NULL && ism_hatid == NULL) { 8033 ism_map = ism_blkp->iblk_maps; 8034 for (i = 0; ism_map[i].imap_ismhat && i < ISM_MAP_SLOTS; i++) { 8035 if (vaddr >= ism_start(ism_map[i]) && 8036 vaddr < ism_end(ism_map[i])) { 8037 sfmmup = ism_hatid = ism_map[i].imap_ismhat; 8038 vaddr = (caddr_t)(vaddr - 8039 ism_start(ism_map[i])); 8040 break; 8041 } 8042 } 8043 ism_blkp = ism_blkp->iblk_next; 8044 } 8045 if (locked_hatid) { 8046 sfmmu_ismhat_exit(locked_hatid, 0); 8047 } 8048 8049 hblktag.htag_id = sfmmup; 8050 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 8051 do { 8052 hmeshift = HME_HASH_SHIFT(hashno); 8053 hblktag.htag_bspage = HME_HASH_BSPAGE(vaddr, hmeshift); 8054 hblktag.htag_rehash = hashno; 8055 hmebp = HME_HASH_FUNCTION(sfmmup, vaddr, hmeshift); 8056 8057 SFMMU_HASH_LOCK(hmebp); 8058 8059 HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp); 8060 if (hmeblkp != NULL) { 8061 ASSERT(!hmeblkp->hblk_shared); 8062 HBLKTOHME(sfhmep, hmeblkp, vaddr); 8063 sfmmu_copytte(&sfhmep->hme_tte, ttep); 8064 SFMMU_HASH_UNLOCK(hmebp); 8065 if (TTE_IS_VALID(ttep)) { 8066 pfn = TTE_TO_PFN(vaddr, ttep); 8067 return (pfn); 8068 } 8069 break; 8070 } 8071 SFMMU_HASH_UNLOCK(hmebp); 8072 hashno++; 8073 } while (HME_REHASH(sfmmup) && (hashno <= mmu_hashcnt)); 8074 8075 if (SF_HMERGNMAP_ISNULL(sv_sfmmup)) { 8076 return (PFN_INVALID); 8077 } 8078 srdp = sv_sfmmup->sfmmu_srdp; 8079 ASSERT(srdp != NULL); 8080 ASSERT(srdp->srd_refcnt != 0); 8081 hblktag.htag_id = srdp; 8082 hashno = 1; 8083 do { 8084 hmeshift = HME_HASH_SHIFT(hashno); 8085 hblktag.htag_bspage = HME_HASH_BSPAGE(sv_vaddr, hmeshift); 8086 hblktag.htag_rehash = hashno; 8087 hmebp = HME_HASH_FUNCTION(srdp, sv_vaddr, hmeshift); 8088 8089 SFMMU_HASH_LOCK(hmebp); 8090 for (hmeblkp = hmebp->hmeblkp; hmeblkp != NULL; 8091 hmeblkp = hmeblkp->hblk_next) { 8092 uint_t rid; 8093 sf_region_t *rgnp; 8094 caddr_t rsaddr; 8095 caddr_t readdr; 8096 8097 if (!HTAGS_EQ_SHME(hmeblkp->hblk_tag, hblktag, 8098 sv_sfmmup->sfmmu_hmeregion_map)) { 8099 continue; 8100 } 8101 ASSERT(hmeblkp->hblk_shared); 8102 rid = hmeblkp->hblk_tag.htag_rid; 8103 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 8104 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 8105 rgnp = srdp->srd_hmergnp[rid]; 8106 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid); 8107 HBLKTOHME(sfhmep, hmeblkp, sv_vaddr); 8108 sfmmu_copytte(&sfhmep->hme_tte, ttep); 8109 rsaddr = rgnp->rgn_saddr; 8110 readdr = rsaddr + rgnp->rgn_size; 8111 #ifdef DEBUG 8112 if (TTE_IS_VALID(ttep) || 8113 get_hblk_ttesz(hmeblkp) > TTE8K) { 8114 caddr_t eva = tte_to_evaddr(hmeblkp, ttep); 8115 ASSERT(eva > sv_vaddr); 8116 ASSERT(sv_vaddr >= rsaddr); 8117 ASSERT(sv_vaddr < readdr); 8118 ASSERT(eva <= readdr); 8119 } 8120 #endif /* DEBUG */ 8121 /* 8122 * Continue the search if we 8123 * found an invalid 8K tte outside of the area 8124 * covered by this hmeblk's region. 
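 *
 * The checks below amount to the following (informal summary):
 *
 *	valid tte			return TTE_TO_PFN(sv_vaddr, ttep)
 *	invalid, ttesz > TTE8K		return PFN_INVALID
 *	invalid 8K, inside region	return PFN_INVALID
 *	invalid 8K, outside region	keep searching other hmeblks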
8125 */ 8126 if (TTE_IS_VALID(ttep)) { 8127 SFMMU_HASH_UNLOCK(hmebp); 8128 pfn = TTE_TO_PFN(sv_vaddr, ttep); 8129 return (pfn); 8130 } else if (get_hblk_ttesz(hmeblkp) > TTE8K || 8131 (sv_vaddr >= rsaddr && sv_vaddr < readdr)) { 8132 SFMMU_HASH_UNLOCK(hmebp); 8133 pfn = PFN_INVALID; 8134 return (pfn); 8135 } 8136 } 8137 SFMMU_HASH_UNLOCK(hmebp); 8138 hashno++; 8139 } while (hashno <= mmu_hashcnt); 8140 return (PFN_INVALID); 8141 } 8142 8143 8144 /* 8145 * For compatability with AT&T and later optimizations 8146 */ 8147 /* ARGSUSED */ 8148 void 8149 hat_map(struct hat *hat, caddr_t addr, size_t len, uint_t flags) 8150 { 8151 ASSERT(hat != NULL); 8152 ASSERT(hat->sfmmu_xhat_provider == NULL); 8153 } 8154 8155 /* 8156 * Return the number of mappings to a particular page. This number is an 8157 * approximation of the number of people sharing the page. 8158 * 8159 * shared hmeblks or ism hmeblks are counted as 1 mapping here. 8160 * hat_page_checkshare() can be used to compare threshold to share 8161 * count that reflects the number of region sharers albeit at higher cost. 8162 */ 8163 ulong_t 8164 hat_page_getshare(page_t *pp) 8165 { 8166 page_t *spp = pp; /* start page */ 8167 kmutex_t *pml; 8168 ulong_t cnt; 8169 int index, sz = TTE64K; 8170 8171 /* 8172 * We need to grab the mlist lock to make sure any outstanding 8173 * load/unloads complete. Otherwise we could return zero 8174 * even though the unload(s) hasn't finished yet. 8175 */ 8176 pml = sfmmu_mlist_enter(spp); 8177 cnt = spp->p_share; 8178 8179 #ifdef VAC 8180 if (kpm_enable) 8181 cnt += spp->p_kpmref; 8182 #endif 8183 8184 /* 8185 * If we have any large mappings, we count the number of 8186 * mappings that this large page is part of. 8187 */ 8188 index = PP_MAPINDEX(spp); 8189 index >>= 1; 8190 while (index) { 8191 pp = PP_GROUPLEADER(spp, sz); 8192 if ((index & 0x1) && pp != spp) { 8193 cnt += pp->p_share; 8194 spp = pp; 8195 } 8196 index >>= 1; 8197 sz++; 8198 } 8199 sfmmu_mlist_exit(pml); 8200 return (cnt); 8201 } 8202 8203 /* 8204 * Return 1 if the number of mappings exceeds sh_thresh. Return 0 8205 * otherwise. Count shared hmeblks by region's refcnt. 
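 *
 * Hypothetical caller sketch (illustrative only, names assumed): a
 * pageout-style policy that avoids heavily shared pages might do
 *
 *	if (hat_page_checkshare(pp, po_share))
 *		continue;
 *
 * where po_share is the caller's own sharing threshold.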
8206 */ 8207 int 8208 hat_page_checkshare(page_t *pp, ulong_t sh_thresh) 8209 { 8210 kmutex_t *pml; 8211 ulong_t cnt = 0; 8212 int index, sz = TTE8K; 8213 struct sf_hment *sfhme, *tmphme = NULL; 8214 struct hme_blk *hmeblkp; 8215 8216 pml = sfmmu_mlist_enter(pp); 8217 8218 if (kpm_enable) 8219 cnt = pp->p_kpmref; 8220 8221 if (pp->p_share + cnt > sh_thresh) { 8222 sfmmu_mlist_exit(pml); 8223 return (1); 8224 } 8225 8226 index = PP_MAPINDEX(pp); 8227 8228 again: 8229 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) { 8230 tmphme = sfhme->hme_next; 8231 if (IS_PAHME(sfhme)) { 8232 continue; 8233 } 8234 8235 hmeblkp = sfmmu_hmetohblk(sfhme); 8236 if (hmeblkp->hblk_xhat_bit) { 8237 cnt++; 8238 if (cnt > sh_thresh) { 8239 sfmmu_mlist_exit(pml); 8240 return (1); 8241 } 8242 continue; 8243 } 8244 if (hme_size(sfhme) != sz) { 8245 continue; 8246 } 8247 8248 if (hmeblkp->hblk_shared) { 8249 sf_srd_t *srdp = hblktosrd(hmeblkp); 8250 uint_t rid = hmeblkp->hblk_tag.htag_rid; 8251 sf_region_t *rgnp; 8252 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 8253 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 8254 ASSERT(srdp != NULL); 8255 rgnp = srdp->srd_hmergnp[rid]; 8256 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, 8257 rgnp, rid); 8258 cnt += rgnp->rgn_refcnt; 8259 } else { 8260 cnt++; 8261 } 8262 if (cnt > sh_thresh) { 8263 sfmmu_mlist_exit(pml); 8264 return (1); 8265 } 8266 } 8267 8268 index >>= 1; 8269 sz++; 8270 while (index) { 8271 pp = PP_GROUPLEADER(pp, sz); 8272 ASSERT(sfmmu_mlist_held(pp)); 8273 if (index & 0x1) { 8274 goto again; 8275 } 8276 index >>= 1; 8277 sz++; 8278 } 8279 sfmmu_mlist_exit(pml); 8280 return (0); 8281 } 8282 8283 /* 8284 * Unload all large mappings to the pp and reset the p_szc field of every 8285 * constituent page according to the remaining mappings. 8286 * 8287 * pp must be locked SE_EXCL. Even though no other constituent pages are 8288 * locked it's legal to unload the large mappings to the pp because all 8289 * constituent pages of large locked mappings have to be locked SE_SHARED. 8290 * This means if we have SE_EXCL lock on one of constituent pages none of the 8291 * large mappings to pp are locked. 8292 * 8293 * Decrease p_szc field starting from the last constituent page and ending 8294 * with the root page. This method is used because other threads rely on the 8295 * root's p_szc to find the lock to syncronize on. After a root page_t's p_szc 8296 * is demoted then other threads will succeed in sfmmu_mlspl_enter(). This 8297 * ensures that p_szc changes of the constituent pages appears atomic for all 8298 * threads that use sfmmu_mlspl_enter() to examine p_szc field. 8299 * 8300 * This mechanism is only used for file system pages where it's not always 8301 * possible to get SE_EXCL locks on all constituent pages to demote the size 8302 * code (as is done for anonymous or kernel large pages). 8303 * 8304 * See more comments in front of sfmmu_mlspl_enter(). 
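 *
 * Deliberately simplified ordering sketch (the real loop below also
 * recomputes the remaining mapping size per constituent page):
 *
 *	for each constituent page, last to first, excluding the root:
 *		pp->p_szc = new_szc;
 *	membar_producer();
 *	rootpp->p_szc = new_szc;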
8305 */ 8306 void 8307 hat_page_demote(page_t *pp) 8308 { 8309 int index; 8310 int sz; 8311 cpuset_t cpuset; 8312 int sync = 0; 8313 page_t *rootpp; 8314 struct sf_hment *sfhme; 8315 struct sf_hment *tmphme = NULL; 8316 struct hme_blk *hmeblkp; 8317 uint_t pszc; 8318 page_t *lastpp; 8319 cpuset_t tset; 8320 pgcnt_t npgs; 8321 kmutex_t *pml; 8322 kmutex_t *pmtx = NULL; 8323 8324 ASSERT(PAGE_EXCL(pp)); 8325 ASSERT(!PP_ISFREE(pp)); 8326 ASSERT(!PP_ISKAS(pp)); 8327 ASSERT(page_szc_lock_assert(pp)); 8328 pml = sfmmu_mlist_enter(pp); 8329 8330 pszc = pp->p_szc; 8331 if (pszc == 0) { 8332 goto out; 8333 } 8334 8335 index = PP_MAPINDEX(pp) >> 1; 8336 8337 if (index) { 8338 CPUSET_ZERO(cpuset); 8339 sz = TTE64K; 8340 sync = 1; 8341 } 8342 8343 while (index) { 8344 if (!(index & 0x1)) { 8345 index >>= 1; 8346 sz++; 8347 continue; 8348 } 8349 ASSERT(sz <= pszc); 8350 rootpp = PP_GROUPLEADER(pp, sz); 8351 for (sfhme = rootpp->p_mapping; sfhme; sfhme = tmphme) { 8352 tmphme = sfhme->hme_next; 8353 ASSERT(!IS_PAHME(sfhme)); 8354 hmeblkp = sfmmu_hmetohblk(sfhme); 8355 if (hme_size(sfhme) != sz) { 8356 continue; 8357 } 8358 if (hmeblkp->hblk_xhat_bit) { 8359 cmn_err(CE_PANIC, 8360 "hat_page_demote: xhat hmeblk"); 8361 } 8362 tset = sfmmu_pageunload(rootpp, sfhme, sz); 8363 CPUSET_OR(cpuset, tset); 8364 } 8365 if (index >>= 1) { 8366 sz++; 8367 } 8368 } 8369 8370 ASSERT(!PP_ISMAPPED_LARGE(pp)); 8371 8372 if (sync) { 8373 xt_sync(cpuset); 8374 #ifdef VAC 8375 if (PP_ISTNC(pp)) { 8376 conv_tnc(rootpp, sz); 8377 } 8378 #endif /* VAC */ 8379 } 8380 8381 pmtx = sfmmu_page_enter(pp); 8382 8383 ASSERT(pp->p_szc == pszc); 8384 rootpp = PP_PAGEROOT(pp); 8385 ASSERT(rootpp->p_szc == pszc); 8386 lastpp = PP_PAGENEXT_N(rootpp, TTEPAGES(pszc) - 1); 8387 8388 while (lastpp != rootpp) { 8389 sz = PP_MAPINDEX(lastpp) ? fnd_mapping_sz(lastpp) : 0; 8390 ASSERT(sz < pszc); 8391 npgs = (sz == 0) ? 1 : TTEPAGES(sz); 8392 ASSERT(P2PHASE(lastpp->p_pagenum, npgs) == npgs - 1); 8393 while (--npgs > 0) { 8394 lastpp->p_szc = (uchar_t)sz; 8395 lastpp = PP_PAGEPREV(lastpp); 8396 } 8397 if (sz) { 8398 /* 8399 * make sure before current root's pszc 8400 * is updated all updates to constituent pages pszc 8401 * fields are globally visible. 8402 */ 8403 membar_producer(); 8404 } 8405 lastpp->p_szc = sz; 8406 ASSERT(IS_P2ALIGNED(lastpp->p_pagenum, TTEPAGES(sz))); 8407 if (lastpp != rootpp) { 8408 lastpp = PP_PAGEPREV(lastpp); 8409 } 8410 } 8411 if (sz == 0) { 8412 /* the loop above doesn't cover this case */ 8413 rootpp->p_szc = 0; 8414 } 8415 out: 8416 ASSERT(pp->p_szc == 0); 8417 if (pmtx != NULL) { 8418 sfmmu_page_exit(pmtx); 8419 } 8420 sfmmu_mlist_exit(pml); 8421 } 8422 8423 /* 8424 * Refresh the HAT ismttecnt[] element for size szc. 8425 * Caller must have set ISM busy flag to prevent mapping 8426 * lists from changing while we're traversing them. 
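 *
 * Typical use in this file (see hat_share() and hat_unshare()): with
 * the ISM busy flag held, refresh every enabled size class:
 *
 *	for (i = 0; i <= ismszc; i++) {
 *		if (!(disable_ism_large_pages & (1 << i)))
 *			(void) ism_tsb_entries(sfmmup, i);
 *	}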
8427 */ 8428 pgcnt_t 8429 ism_tsb_entries(sfmmu_t *sfmmup, int szc) 8430 { 8431 ism_blk_t *ism_blkp = sfmmup->sfmmu_iblk; 8432 ism_map_t *ism_map; 8433 pgcnt_t npgs = 0; 8434 pgcnt_t npgs_scd = 0; 8435 int j; 8436 sf_scd_t *scdp; 8437 uchar_t rid; 8438 8439 ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)); 8440 scdp = sfmmup->sfmmu_scdp; 8441 8442 for (; ism_blkp != NULL; ism_blkp = ism_blkp->iblk_next) { 8443 ism_map = ism_blkp->iblk_maps; 8444 for (j = 0; ism_map[j].imap_ismhat && j < ISM_MAP_SLOTS; j++) { 8445 rid = ism_map[j].imap_rid; 8446 ASSERT(rid == SFMMU_INVALID_ISMRID || 8447 rid < sfmmup->sfmmu_srdp->srd_next_ismrid); 8448 8449 if (scdp != NULL && rid != SFMMU_INVALID_ISMRID && 8450 SF_RGNMAP_TEST(scdp->scd_ismregion_map, rid)) { 8451 /* ISM is in sfmmup's SCD */ 8452 npgs_scd += 8453 ism_map[j].imap_ismhat->sfmmu_ttecnt[szc]; 8454 } else { 8455 /* ISMs is not in SCD */ 8456 npgs += 8457 ism_map[j].imap_ismhat->sfmmu_ttecnt[szc]; 8458 } 8459 } 8460 } 8461 sfmmup->sfmmu_ismttecnt[szc] = npgs; 8462 sfmmup->sfmmu_scdismttecnt[szc] = npgs_scd; 8463 return (npgs); 8464 } 8465 8466 /* 8467 * Yield the memory claim requirement for an address space. 8468 * 8469 * This is currently implemented as the number of bytes that have active 8470 * hardware translations that have page structures. Therefore, it can 8471 * underestimate the traditional resident set size, eg, if the 8472 * physical page is present and the hardware translation is missing; 8473 * and it can overestimate the rss, eg, if there are active 8474 * translations to a frame buffer with page structs. 8475 * Also, it does not take sharing into account. 8476 * 8477 * Note that we don't acquire locks here since this function is most often 8478 * called from the clock thread. 8479 */ 8480 size_t 8481 hat_get_mapped_size(struct hat *hat) 8482 { 8483 size_t assize = 0; 8484 int i; 8485 8486 if (hat == NULL) 8487 return (0); 8488 8489 ASSERT(hat->sfmmu_xhat_provider == NULL); 8490 8491 for (i = 0; i < mmu_page_sizes; i++) 8492 assize += ((pgcnt_t)hat->sfmmu_ttecnt[i] + 8493 (pgcnt_t)hat->sfmmu_scdrttecnt[i]) * TTEBYTES(i); 8494 8495 if (hat->sfmmu_iblk == NULL) 8496 return (assize); 8497 8498 for (i = 0; i < mmu_page_sizes; i++) 8499 assize += ((pgcnt_t)hat->sfmmu_ismttecnt[i] + 8500 (pgcnt_t)hat->sfmmu_scdismttecnt[i]) * TTEBYTES(i); 8501 8502 return (assize); 8503 } 8504 8505 int 8506 hat_stats_enable(struct hat *hat) 8507 { 8508 hatlock_t *hatlockp; 8509 8510 ASSERT(hat->sfmmu_xhat_provider == NULL); 8511 8512 hatlockp = sfmmu_hat_enter(hat); 8513 hat->sfmmu_rmstat++; 8514 sfmmu_hat_exit(hatlockp); 8515 return (1); 8516 } 8517 8518 void 8519 hat_stats_disable(struct hat *hat) 8520 { 8521 hatlock_t *hatlockp; 8522 8523 ASSERT(hat->sfmmu_xhat_provider == NULL); 8524 8525 hatlockp = sfmmu_hat_enter(hat); 8526 hat->sfmmu_rmstat--; 8527 sfmmu_hat_exit(hatlockp); 8528 } 8529 8530 /* 8531 * Routines for entering or removing ourselves from the 8532 * ism_hat's mapping list. This is used for both private and 8533 * SCD hats. 
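 *
 * Both routines assume the caller already holds ism_mlist_lock, as in
 * the hat_share() path:
 *
 *	mutex_enter(&ism_mlist_lock);
 *	iment_add(ism_ment, ism_hatid);
 *	mutex_exit(&ism_mlist_lock);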
8534 */ 8535 static void 8536 iment_add(struct ism_ment *iment, struct hat *ism_hat) 8537 { 8538 ASSERT(MUTEX_HELD(&ism_mlist_lock)); 8539 8540 iment->iment_prev = NULL; 8541 iment->iment_next = ism_hat->sfmmu_iment; 8542 if (ism_hat->sfmmu_iment) { 8543 ism_hat->sfmmu_iment->iment_prev = iment; 8544 } 8545 ism_hat->sfmmu_iment = iment; 8546 } 8547 8548 static void 8549 iment_sub(struct ism_ment *iment, struct hat *ism_hat) 8550 { 8551 ASSERT(MUTEX_HELD(&ism_mlist_lock)); 8552 8553 if (ism_hat->sfmmu_iment == NULL) { 8554 panic("ism map entry remove - no entries"); 8555 } 8556 8557 if (iment->iment_prev) { 8558 ASSERT(ism_hat->sfmmu_iment != iment); 8559 iment->iment_prev->iment_next = iment->iment_next; 8560 } else { 8561 ASSERT(ism_hat->sfmmu_iment == iment); 8562 ism_hat->sfmmu_iment = iment->iment_next; 8563 } 8564 8565 if (iment->iment_next) { 8566 iment->iment_next->iment_prev = iment->iment_prev; 8567 } 8568 8569 /* 8570 * zero out the entry 8571 */ 8572 iment->iment_next = NULL; 8573 iment->iment_prev = NULL; 8574 iment->iment_hat = NULL; 8575 } 8576 8577 /* 8578 * Hat_share()/unshare() return an (non-zero) error 8579 * when saddr and daddr are not properly aligned. 8580 * 8581 * The top level mapping element determines the alignment 8582 * requirement for saddr and daddr, depending on different 8583 * architectures. 8584 * 8585 * When hat_share()/unshare() are not supported, 8586 * HATOP_SHARE()/UNSHARE() return 0 8587 */ 8588 int 8589 hat_share(struct hat *sfmmup, caddr_t addr, 8590 struct hat *ism_hatid, caddr_t sptaddr, size_t len, uint_t ismszc) 8591 { 8592 ism_blk_t *ism_blkp; 8593 ism_blk_t *new_iblk; 8594 ism_map_t *ism_map; 8595 ism_ment_t *ism_ment; 8596 int i, added; 8597 hatlock_t *hatlockp; 8598 int reload_mmu = 0; 8599 uint_t ismshift = page_get_shift(ismszc); 8600 size_t ismpgsz = page_get_pagesize(ismszc); 8601 uint_t ismmask = (uint_t)ismpgsz - 1; 8602 size_t sh_size = ISM_SHIFT(ismshift, len); 8603 ushort_t ismhatflag; 8604 hat_region_cookie_t rcookie; 8605 sf_scd_t *old_scdp; 8606 8607 #ifdef DEBUG 8608 caddr_t eaddr = addr + len; 8609 #endif /* DEBUG */ 8610 8611 ASSERT(ism_hatid != NULL && sfmmup != NULL); 8612 ASSERT(sptaddr == ISMID_STARTADDR); 8613 /* 8614 * Check the alignment. 8615 */ 8616 if (!ISM_ALIGNED(ismshift, addr) || !ISM_ALIGNED(ismshift, sptaddr)) 8617 return (EINVAL); 8618 8619 /* 8620 * Check size alignment. 8621 */ 8622 if (!ISM_ALIGNED(ismshift, len)) 8623 return (EINVAL); 8624 8625 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 8626 8627 /* 8628 * Allocate ism_ment for the ism_hat's mapping list, and an 8629 * ism map blk in case we need one. We must do our 8630 * allocations before acquiring locks to prevent a deadlock 8631 * in the kmem allocator on the mapping list lock. 8632 */ 8633 new_iblk = kmem_cache_alloc(ism_blk_cache, KM_SLEEP); 8634 ism_ment = kmem_cache_alloc(ism_ment_cache, KM_SLEEP); 8635 8636 /* 8637 * Serialize ISM mappings with the ISM busy flag, and also the 8638 * trap handlers. 8639 */ 8640 sfmmu_ismhat_enter(sfmmup, 0); 8641 8642 /* 8643 * Allocate an ism map blk if necessary. 8644 */ 8645 if (sfmmup->sfmmu_iblk == NULL) { 8646 sfmmup->sfmmu_iblk = new_iblk; 8647 bzero(new_iblk, sizeof (*new_iblk)); 8648 new_iblk->iblk_nextpa = (uint64_t)-1; 8649 membar_stst(); /* make sure next ptr visible to all CPUs */ 8650 sfmmup->sfmmu_ismblkpa = va_to_pa((caddr_t)new_iblk); 8651 reload_mmu = 1; 8652 new_iblk = NULL; 8653 } 8654 8655 #ifdef DEBUG 8656 /* 8657 * Make sure mapping does not already exist. 
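 *
 * The test below is an interval overlap check on [addr, eaddr); with
 * the implicit operator precedence made explicit it reads, for every
 * populated slot:
 *
 *	(addr  >= ism_start(map) && addr  < ism_end(map)) ||
 *	(eaddr >  ism_start(map) && eaddr <= ism_end(map))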
8658 */ 8659 ism_blkp = sfmmup->sfmmu_iblk; 8660 while (ism_blkp != NULL) { 8661 ism_map = ism_blkp->iblk_maps; 8662 for (i = 0; i < ISM_MAP_SLOTS && ism_map[i].imap_ismhat; i++) { 8663 if ((addr >= ism_start(ism_map[i]) && 8664 addr < ism_end(ism_map[i])) || 8665 eaddr > ism_start(ism_map[i]) && 8666 eaddr <= ism_end(ism_map[i])) { 8667 panic("sfmmu_share: Already mapped!"); 8668 } 8669 } 8670 ism_blkp = ism_blkp->iblk_next; 8671 } 8672 #endif /* DEBUG */ 8673 8674 ASSERT(ismszc >= TTE4M); 8675 if (ismszc == TTE4M) { 8676 ismhatflag = HAT_4M_FLAG; 8677 } else if (ismszc == TTE32M) { 8678 ismhatflag = HAT_32M_FLAG; 8679 } else if (ismszc == TTE256M) { 8680 ismhatflag = HAT_256M_FLAG; 8681 } 8682 /* 8683 * Add mapping to first available mapping slot. 8684 */ 8685 ism_blkp = sfmmup->sfmmu_iblk; 8686 added = 0; 8687 while (!added) { 8688 ism_map = ism_blkp->iblk_maps; 8689 for (i = 0; i < ISM_MAP_SLOTS; i++) { 8690 if (ism_map[i].imap_ismhat == NULL) { 8691 8692 ism_map[i].imap_ismhat = ism_hatid; 8693 ism_map[i].imap_vb_shift = (uchar_t)ismshift; 8694 ism_map[i].imap_rid = SFMMU_INVALID_ISMRID; 8695 ism_map[i].imap_hatflags = ismhatflag; 8696 ism_map[i].imap_sz_mask = ismmask; 8697 /* 8698 * imap_seg is checked in ISM_CHECK to see if 8699 * non-NULL, then other info assumed valid. 8700 */ 8701 membar_stst(); 8702 ism_map[i].imap_seg = (uintptr_t)addr | sh_size; 8703 ism_map[i].imap_ment = ism_ment; 8704 8705 /* 8706 * Now add ourselves to the ism_hat's 8707 * mapping list. 8708 */ 8709 ism_ment->iment_hat = sfmmup; 8710 ism_ment->iment_base_va = addr; 8711 ism_hatid->sfmmu_ismhat = 1; 8712 mutex_enter(&ism_mlist_lock); 8713 iment_add(ism_ment, ism_hatid); 8714 mutex_exit(&ism_mlist_lock); 8715 added = 1; 8716 break; 8717 } 8718 } 8719 if (!added && ism_blkp->iblk_next == NULL) { 8720 ism_blkp->iblk_next = new_iblk; 8721 new_iblk = NULL; 8722 bzero(ism_blkp->iblk_next, 8723 sizeof (*ism_blkp->iblk_next)); 8724 ism_blkp->iblk_next->iblk_nextpa = (uint64_t)-1; 8725 membar_stst(); 8726 ism_blkp->iblk_nextpa = 8727 va_to_pa((caddr_t)ism_blkp->iblk_next); 8728 } 8729 ism_blkp = ism_blkp->iblk_next; 8730 } 8731 8732 /* 8733 * After calling hat_join_region, sfmmup may join a new SCD or 8734 * move from the old scd to a new scd, in which case, we want to 8735 * shrink the sfmmup's private tsb size, i.e., pass shrink to 8736 * sfmmu_check_page_sizes at the end of this routine. 8737 */ 8738 old_scdp = sfmmup->sfmmu_scdp; 8739 8740 rcookie = hat_join_region(sfmmup, addr, len, (void *)ism_hatid, 0, 8741 PROT_ALL, ismszc, NULL, HAT_REGION_ISM); 8742 if (rcookie != HAT_INVALID_REGION_COOKIE) { 8743 ism_map[i].imap_rid = (uchar_t)((uint64_t)rcookie); 8744 } 8745 /* 8746 * Update our counters for this sfmmup's ism mappings. 8747 */ 8748 for (i = 0; i <= ismszc; i++) { 8749 if (!(disable_ism_large_pages & (1 << i))) 8750 (void) ism_tsb_entries(sfmmup, i); 8751 } 8752 8753 /* 8754 * For ISM and DISM we do not support 512K pages, so we only only 8755 * search the 4M and 8K/64K hashes for 4 pagesize cpus, and search the 8756 * 256M or 32M, and 4M and 8K/64K hashes for 6 pagesize cpus. 8757 * 8758 * Need to set 32M/256M ISM flags to make sure 8759 * sfmmu_check_page_sizes() enables them on Panther. 
8760 */ 8761 ASSERT((disable_ism_large_pages & (1 << TTE512K)) != 0); 8762 8763 switch (ismszc) { 8764 case TTE256M: 8765 if (!SFMMU_FLAGS_ISSET(sfmmup, HAT_256M_ISM)) { 8766 hatlockp = sfmmu_hat_enter(sfmmup); 8767 SFMMU_FLAGS_SET(sfmmup, HAT_256M_ISM); 8768 sfmmu_hat_exit(hatlockp); 8769 } 8770 break; 8771 case TTE32M: 8772 if (!SFMMU_FLAGS_ISSET(sfmmup, HAT_32M_ISM)) { 8773 hatlockp = sfmmu_hat_enter(sfmmup); 8774 SFMMU_FLAGS_SET(sfmmup, HAT_32M_ISM); 8775 sfmmu_hat_exit(hatlockp); 8776 } 8777 break; 8778 default: 8779 break; 8780 } 8781 8782 /* 8783 * If we updated the ismblkpa for this HAT we must make 8784 * sure all CPUs running this process reload their tsbmiss area. 8785 * Otherwise they will fail to load the mappings in the tsbmiss 8786 * handler and will loop calling pagefault(). 8787 */ 8788 if (reload_mmu) { 8789 hatlockp = sfmmu_hat_enter(sfmmup); 8790 sfmmu_sync_mmustate(sfmmup); 8791 sfmmu_hat_exit(hatlockp); 8792 } 8793 8794 sfmmu_ismhat_exit(sfmmup, 0); 8795 8796 /* 8797 * Free up ismblk if we didn't use it. 8798 */ 8799 if (new_iblk != NULL) 8800 kmem_cache_free(ism_blk_cache, new_iblk); 8801 8802 /* 8803 * Check TSB and TLB page sizes. 8804 */ 8805 if (sfmmup->sfmmu_scdp != NULL && old_scdp != sfmmup->sfmmu_scdp) { 8806 sfmmu_check_page_sizes(sfmmup, 0); 8807 } else { 8808 sfmmu_check_page_sizes(sfmmup, 1); 8809 } 8810 return (0); 8811 } 8812 8813 /* 8814 * hat_unshare removes exactly one ism_map from 8815 * this process's as. It expects multiple calls 8816 * to hat_unshare for multiple shm segments. 8817 */ 8818 void 8819 hat_unshare(struct hat *sfmmup, caddr_t addr, size_t len, uint_t ismszc) 8820 { 8821 ism_map_t *ism_map; 8822 ism_ment_t *free_ment = NULL; 8823 ism_blk_t *ism_blkp; 8824 struct hat *ism_hatid; 8825 int found, i; 8826 hatlock_t *hatlockp; 8827 struct tsb_info *tsbinfo; 8828 uint_t ismshift = page_get_shift(ismszc); 8829 size_t sh_size = ISM_SHIFT(ismshift, len); 8830 uchar_t ism_rid; 8831 sf_scd_t *old_scdp; 8832 8833 ASSERT(ISM_ALIGNED(ismshift, addr)); 8834 ASSERT(ISM_ALIGNED(ismshift, len)); 8835 ASSERT(sfmmup != NULL); 8836 ASSERT(sfmmup != ksfmmup); 8837 8838 if (sfmmup->sfmmu_xhat_provider) { 8839 XHAT_UNSHARE(sfmmup, addr, len); 8840 return; 8841 } else { 8842 /* 8843 * This must be a CPU HAT. If the address space has 8844 * XHATs attached, inform all XHATs that ISM segment 8845 * is going away 8846 */ 8847 ASSERT(sfmmup->sfmmu_as != NULL); 8848 if (sfmmup->sfmmu_as->a_xhat != NULL) 8849 xhat_unshare_all(sfmmup->sfmmu_as, addr, len); 8850 } 8851 8852 /* 8853 * Make sure that during the entire time ISM mappings are removed, 8854 * the trap handlers serialize behind us, and that no one else 8855 * can be mucking with ISM mappings. This also lets us get away 8856 * with not doing expensive cross calls to flush the TLB -- we 8857 * just discard the context, flush the entire TSB, and call it 8858 * a day. 8859 */ 8860 sfmmu_ismhat_enter(sfmmup, 0); 8861 8862 /* 8863 * Remove the mapping. 8864 * 8865 * We can't have any holes in the ism map. 8866 * The tsb miss code while searching the ism map will 8867 * stop on an empty map slot. So we must move 8868 * everyone past the hole up 1 if any. 8869 * 8870 * Also empty ism map blks are not freed until the 8871 * process exits. This is to prevent a MT race condition 8872 * between sfmmu_unshare() and sfmmu_tsbmiss_exception(). 
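 *
 * Compaction sketch (the real code follows): slot i is overwritten by
 * slot i + 1, and the last slot of a block is refilled from the first
 * slot of the next block (or zeroed if there is none), so the trap
 * handler never sees a hole in the middle of a live map:
 *
 *	while (i < ISM_MAP_SLOTS - 1) {
 *		ism_map[i] = ism_map[i + 1];
 *		i++;
 *	}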
8873 */ 8874 found = 0; 8875 ism_blkp = sfmmup->sfmmu_iblk; 8876 while (!found && ism_blkp != NULL) { 8877 ism_map = ism_blkp->iblk_maps; 8878 for (i = 0; i < ISM_MAP_SLOTS; i++) { 8879 if (addr == ism_start(ism_map[i]) && 8880 sh_size == (size_t)(ism_size(ism_map[i]))) { 8881 found = 1; 8882 break; 8883 } 8884 } 8885 if (!found) 8886 ism_blkp = ism_blkp->iblk_next; 8887 } 8888 8889 if (found) { 8890 ism_hatid = ism_map[i].imap_ismhat; 8891 ism_rid = ism_map[i].imap_rid; 8892 ASSERT(ism_hatid != NULL); 8893 ASSERT(ism_hatid->sfmmu_ismhat == 1); 8894 8895 /* 8896 * After hat_leave_region, the sfmmup may leave SCD, 8897 * in which case, we want to grow the private tsb size when 8898 * calling sfmmu_check_page_sizes at the end of the routine. 8899 */ 8900 old_scdp = sfmmup->sfmmu_scdp; 8901 /* 8902 * Then remove ourselves from the region. 8903 */ 8904 if (ism_rid != SFMMU_INVALID_ISMRID) { 8905 hat_leave_region(sfmmup, (void *)((uint64_t)ism_rid), 8906 HAT_REGION_ISM); 8907 } 8908 8909 /* 8910 * And now guarantee that any other cpu 8911 * that tries to process an ISM miss 8912 * will go to tl=0. 8913 */ 8914 hatlockp = sfmmu_hat_enter(sfmmup); 8915 sfmmu_invalidate_ctx(sfmmup); 8916 sfmmu_hat_exit(hatlockp); 8917 8918 /* 8919 * Remove ourselves from the ism mapping list. 8920 */ 8921 mutex_enter(&ism_mlist_lock); 8922 iment_sub(ism_map[i].imap_ment, ism_hatid); 8923 mutex_exit(&ism_mlist_lock); 8924 free_ment = ism_map[i].imap_ment; 8925 8926 /* 8927 * We delete the ism map by copying 8928 * the next map over the current one. 8929 * We will take the next one in the maps 8930 * array or from the next ism_blk. 8931 */ 8932 while (ism_blkp != NULL) { 8933 ism_map = ism_blkp->iblk_maps; 8934 while (i < (ISM_MAP_SLOTS - 1)) { 8935 ism_map[i] = ism_map[i + 1]; 8936 i++; 8937 } 8938 /* i == (ISM_MAP_SLOTS - 1) */ 8939 ism_blkp = ism_blkp->iblk_next; 8940 if (ism_blkp != NULL) { 8941 ism_map[i] = ism_blkp->iblk_maps[0]; 8942 i = 0; 8943 } else { 8944 ism_map[i].imap_seg = 0; 8945 ism_map[i].imap_vb_shift = 0; 8946 ism_map[i].imap_rid = SFMMU_INVALID_ISMRID; 8947 ism_map[i].imap_hatflags = 0; 8948 ism_map[i].imap_sz_mask = 0; 8949 ism_map[i].imap_ismhat = NULL; 8950 ism_map[i].imap_ment = NULL; 8951 } 8952 } 8953 8954 /* 8955 * Now flush entire TSB for the process, since 8956 * demapping page by page can be too expensive. 8957 * We don't have to flush the TLB here anymore 8958 * since we switch to a new TLB ctx instead. 8959 * Also, there is no need to flush if the process 8960 * is exiting since the TSB will be freed later. 8961 */ 8962 if (!sfmmup->sfmmu_free) { 8963 hatlockp = sfmmu_hat_enter(sfmmup); 8964 for (tsbinfo = sfmmup->sfmmu_tsb; tsbinfo != NULL; 8965 tsbinfo = tsbinfo->tsb_next) { 8966 if (tsbinfo->tsb_flags & TSB_SWAPPED) 8967 continue; 8968 if (tsbinfo->tsb_flags & TSB_RELOC_FLAG) { 8969 tsbinfo->tsb_flags |= 8970 TSB_FLUSH_NEEDED; 8971 continue; 8972 } 8973 8974 sfmmu_inv_tsb(tsbinfo->tsb_va, 8975 TSB_BYTES(tsbinfo->tsb_szc)); 8976 } 8977 sfmmu_hat_exit(hatlockp); 8978 } 8979 } 8980 8981 /* 8982 * Update our counters for this sfmmup's ism mappings. 8983 */ 8984 for (i = 0; i <= ismszc; i++) { 8985 if (!(disable_ism_large_pages & (1 << i))) 8986 (void) ism_tsb_entries(sfmmup, i); 8987 } 8988 8989 sfmmu_ismhat_exit(sfmmup, 0); 8990 8991 /* 8992 * We must do our freeing here after dropping locks 8993 * to prevent a deadlock in the kmem allocator on the 8994 * mapping list lock. 
8995 */ 8996 if (free_ment != NULL) 8997 kmem_cache_free(ism_ment_cache, free_ment); 8998 8999 /* 9000 * Check TSB and TLB page sizes if the process isn't exiting. 9001 */ 9002 if (!sfmmup->sfmmu_free) { 9003 if (found && old_scdp != NULL && sfmmup->sfmmu_scdp == NULL) { 9004 sfmmu_check_page_sizes(sfmmup, 1); 9005 } else { 9006 sfmmu_check_page_sizes(sfmmup, 0); 9007 } 9008 } 9009 } 9010 9011 /* ARGSUSED */ 9012 static int 9013 sfmmu_idcache_constructor(void *buf, void *cdrarg, int kmflags) 9014 { 9015 /* void *buf is sfmmu_t pointer */ 9016 bzero(buf, sizeof (sfmmu_t)); 9017 9018 return (0); 9019 } 9020 9021 /* ARGSUSED */ 9022 static void 9023 sfmmu_idcache_destructor(void *buf, void *cdrarg) 9024 { 9025 /* void *buf is sfmmu_t pointer */ 9026 } 9027 9028 /* 9029 * setup kmem hmeblks by bzeroing all members and initializing the nextpa 9030 * field to be the pa of this hmeblk 9031 */ 9032 /* ARGSUSED */ 9033 static int 9034 sfmmu_hblkcache_constructor(void *buf, void *cdrarg, int kmflags) 9035 { 9036 struct hme_blk *hmeblkp; 9037 9038 bzero(buf, (size_t)cdrarg); 9039 hmeblkp = (struct hme_blk *)buf; 9040 hmeblkp->hblk_nextpa = va_to_pa((caddr_t)hmeblkp); 9041 9042 #ifdef HBLK_TRACE 9043 mutex_init(&hmeblkp->hblk_audit_lock, NULL, MUTEX_DEFAULT, NULL); 9044 #endif /* HBLK_TRACE */ 9045 9046 return (0); 9047 } 9048 9049 /* ARGSUSED */ 9050 static void 9051 sfmmu_hblkcache_destructor(void *buf, void *cdrarg) 9052 { 9053 9054 #ifdef HBLK_TRACE 9055 9056 struct hme_blk *hmeblkp; 9057 9058 hmeblkp = (struct hme_blk *)buf; 9059 mutex_destroy(&hmeblkp->hblk_audit_lock); 9060 9061 #endif /* HBLK_TRACE */ 9062 } 9063 9064 #define SFMMU_CACHE_RECLAIM_SCAN_RATIO 8 9065 static int sfmmu_cache_reclaim_scan_ratio = SFMMU_CACHE_RECLAIM_SCAN_RATIO; 9066 /* 9067 * The kmem allocator will callback into our reclaim routine when the system 9068 * is running low in memory. We traverse the hash and free up all unused but 9069 * still cached hme_blks. We also traverse the free list and free them up 9070 * as well. 
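 *
 * This callback is registered when the hmeblk kmem caches are created
 * elsewhere in this file, roughly along these lines (exact arguments
 * elided, see the cache setup code):
 *
 *	cache = kmem_cache_create(name, size, align,
 *	    sfmmu_hblkcache_constructor, sfmmu_hblkcache_destructor,
 *	    sfmmu_hblkcache_reclaim, arg, vmp, cflags);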
9071 */ 9072 /*ARGSUSED*/ 9073 static void 9074 sfmmu_hblkcache_reclaim(void *cdrarg) 9075 { 9076 int i; 9077 struct hmehash_bucket *hmebp; 9078 struct hme_blk *hmeblkp, *nx_hblk, *pr_hblk = NULL; 9079 static struct hmehash_bucket *uhmehash_reclaim_hand; 9080 static struct hmehash_bucket *khmehash_reclaim_hand; 9081 struct hme_blk *list = NULL, *last_hmeblkp; 9082 cpuset_t cpuset = cpu_ready_set; 9083 cpu_hme_pend_t *cpuhp; 9084 9085 /* Free up hmeblks on the cpu pending lists */ 9086 for (i = 0; i < NCPU; i++) { 9087 cpuhp = &cpu_hme_pend[i]; 9088 if (cpuhp->chp_listp != NULL) { 9089 mutex_enter(&cpuhp->chp_mutex); 9090 if (cpuhp->chp_listp == NULL) { 9091 mutex_exit(&cpuhp->chp_mutex); 9092 continue; 9093 } 9094 for (last_hmeblkp = cpuhp->chp_listp; 9095 last_hmeblkp->hblk_next != NULL; 9096 last_hmeblkp = last_hmeblkp->hblk_next) 9097 ; 9098 last_hmeblkp->hblk_next = list; 9099 list = cpuhp->chp_listp; 9100 cpuhp->chp_listp = NULL; 9101 cpuhp->chp_count = 0; 9102 mutex_exit(&cpuhp->chp_mutex); 9103 } 9104 9105 } 9106 9107 if (list != NULL) { 9108 kpreempt_disable(); 9109 CPUSET_DEL(cpuset, CPU->cpu_id); 9110 xt_sync(cpuset); 9111 xt_sync(cpuset); 9112 kpreempt_enable(); 9113 sfmmu_hblk_free(&list); 9114 list = NULL; 9115 } 9116 9117 hmebp = uhmehash_reclaim_hand; 9118 if (hmebp == NULL || hmebp > &uhme_hash[UHMEHASH_SZ]) 9119 uhmehash_reclaim_hand = hmebp = uhme_hash; 9120 uhmehash_reclaim_hand += UHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio; 9121 9122 for (i = UHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio; i; i--) { 9123 if (SFMMU_HASH_LOCK_TRYENTER(hmebp) != 0) { 9124 hmeblkp = hmebp->hmeblkp; 9125 pr_hblk = NULL; 9126 while (hmeblkp) { 9127 nx_hblk = hmeblkp->hblk_next; 9128 if (!hmeblkp->hblk_vcnt && 9129 !hmeblkp->hblk_hmecnt) { 9130 sfmmu_hblk_hash_rm(hmebp, hmeblkp, 9131 pr_hblk, &list, 0); 9132 } else { 9133 pr_hblk = hmeblkp; 9134 } 9135 hmeblkp = nx_hblk; 9136 } 9137 SFMMU_HASH_UNLOCK(hmebp); 9138 } 9139 if (hmebp++ == &uhme_hash[UHMEHASH_SZ]) 9140 hmebp = uhme_hash; 9141 } 9142 9143 hmebp = khmehash_reclaim_hand; 9144 if (hmebp == NULL || hmebp > &khme_hash[KHMEHASH_SZ]) 9145 khmehash_reclaim_hand = hmebp = khme_hash; 9146 khmehash_reclaim_hand += KHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio; 9147 9148 for (i = KHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio; i; i--) { 9149 if (SFMMU_HASH_LOCK_TRYENTER(hmebp) != 0) { 9150 hmeblkp = hmebp->hmeblkp; 9151 pr_hblk = NULL; 9152 while (hmeblkp) { 9153 nx_hblk = hmeblkp->hblk_next; 9154 if (!hmeblkp->hblk_vcnt && 9155 !hmeblkp->hblk_hmecnt) { 9156 sfmmu_hblk_hash_rm(hmebp, hmeblkp, 9157 pr_hblk, &list, 0); 9158 } else { 9159 pr_hblk = hmeblkp; 9160 } 9161 hmeblkp = nx_hblk; 9162 } 9163 SFMMU_HASH_UNLOCK(hmebp); 9164 } 9165 if (hmebp++ == &khme_hash[KHMEHASH_SZ]) 9166 hmebp = khme_hash; 9167 } 9168 sfmmu_hblks_list_purge(&list, 0); 9169 } 9170 9171 /* 9172 * sfmmu_get_ppvcolor should become a vm_machdep or hatop interface. 9173 * same goes for sfmmu_get_addrvcolor(). 9174 * 9175 * This function will return the virtual color for the specified page. The 9176 * virtual color corresponds to this page current mapping or its last mapping. 9177 * It is used by memory allocators to choose addresses with the correct 9178 * alignment so vac consistency is automatically maintained. If the page 9179 * has no color it returns -1. 
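 *
 * Hypothetical allocator-side sketch (not taken from this file),
 * assuming base is a shm_alignment aligned chunk of VA:
 *
 *	color = sfmmu_get_ppvcolor(pp);
 *	va = (color == -1) ? base : base + mmu_ptob(color);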
9180 */ 9181 /*ARGSUSED*/ 9182 int 9183 sfmmu_get_ppvcolor(struct page *pp) 9184 { 9185 #ifdef VAC 9186 int color; 9187 9188 if (!(cache & CACHE_VAC) || PP_NEWPAGE(pp)) { 9189 return (-1); 9190 } 9191 color = PP_GET_VCOLOR(pp); 9192 ASSERT(color < mmu_btop(shm_alignment)); 9193 return (color); 9194 #else 9195 return (-1); 9196 #endif /* VAC */ 9197 } 9198 9199 /* 9200 * This function will return the desired alignment for vac consistency 9201 * (vac color) given a virtual address. If no vac is present it returns -1. 9202 */ 9203 /*ARGSUSED*/ 9204 int 9205 sfmmu_get_addrvcolor(caddr_t vaddr) 9206 { 9207 #ifdef VAC 9208 if (cache & CACHE_VAC) { 9209 return (addr_to_vcolor(vaddr)); 9210 } else { 9211 return (-1); 9212 } 9213 #else 9214 return (-1); 9215 #endif /* VAC */ 9216 } 9217 9218 #ifdef VAC 9219 /* 9220 * Check for conflicts. 9221 * A conflict exists if the new and existent mappings do not match in 9222 * their "shm_alignment fields. If conflicts exist, the existant mappings 9223 * are flushed unless one of them is locked. If one of them is locked, then 9224 * the mappings are flushed and converted to non-cacheable mappings. 9225 */ 9226 static void 9227 sfmmu_vac_conflict(struct hat *hat, caddr_t addr, page_t *pp) 9228 { 9229 struct hat *tmphat; 9230 struct sf_hment *sfhmep, *tmphme = NULL; 9231 struct hme_blk *hmeblkp; 9232 int vcolor; 9233 tte_t tte; 9234 9235 ASSERT(sfmmu_mlist_held(pp)); 9236 ASSERT(!PP_ISNC(pp)); /* page better be cacheable */ 9237 9238 vcolor = addr_to_vcolor(addr); 9239 if (PP_NEWPAGE(pp)) { 9240 PP_SET_VCOLOR(pp, vcolor); 9241 return; 9242 } 9243 9244 if (PP_GET_VCOLOR(pp) == vcolor) { 9245 return; 9246 } 9247 9248 if (!PP_ISMAPPED(pp) && !PP_ISMAPPED_KPM(pp)) { 9249 /* 9250 * Previous user of page had a different color 9251 * but since there are no current users 9252 * we just flush the cache and change the color. 9253 */ 9254 SFMMU_STAT(sf_pgcolor_conflict); 9255 sfmmu_cache_flush(pp->p_pagenum, PP_GET_VCOLOR(pp)); 9256 PP_SET_VCOLOR(pp, vcolor); 9257 return; 9258 } 9259 9260 /* 9261 * If we get here we have a vac conflict with a current 9262 * mapping. VAC conflict policy is as follows. 9263 * - The default is to unload the other mappings unless: 9264 * - If we have a large mapping we uncache the page. 9265 * We need to uncache the rest of the large page too. 9266 * - If any of the mappings are locked we uncache the page. 9267 * - If the requested mapping is inconsistent 9268 * with another mapping and that mapping 9269 * is in the same address space we have to 9270 * make it non-cached. The default thing 9271 * to do is unload the inconsistent mapping 9272 * but if they are in the same address space 9273 * we run the risk of unmapping the pc or the 9274 * stack which we will use as we return to the user, 9275 * in which case we can then fault on the thing 9276 * we just unloaded and get into an infinite loop. 9277 */ 9278 if (PP_ISMAPPED_LARGE(pp)) { 9279 int sz; 9280 9281 /* 9282 * Existing mapping is for big pages. We don't unload 9283 * existing big mappings to satisfy new mappings. 9284 * Always convert all mappings to TNC. 9285 */ 9286 sz = fnd_mapping_sz(pp); 9287 pp = PP_GROUPLEADER(pp, sz); 9288 SFMMU_STAT_ADD(sf_uncache_conflict, TTEPAGES(sz)); 9289 sfmmu_page_cache_array(pp, HAT_TMPNC, CACHE_FLUSH, 9290 TTEPAGES(sz)); 9291 9292 return; 9293 } 9294 9295 /* 9296 * check if any mapping is in same as or if it is locked 9297 * since in that case we need to uncache. 
9298 */ 9299 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = tmphme) { 9300 tmphme = sfhmep->hme_next; 9301 if (IS_PAHME(sfhmep)) 9302 continue; 9303 hmeblkp = sfmmu_hmetohblk(sfhmep); 9304 if (hmeblkp->hblk_xhat_bit) 9305 continue; 9306 tmphat = hblktosfmmu(hmeblkp); 9307 sfmmu_copytte(&sfhmep->hme_tte, &tte); 9308 ASSERT(TTE_IS_VALID(&tte)); 9309 if (hmeblkp->hblk_shared || tmphat == hat || 9310 hmeblkp->hblk_lckcnt) { 9311 /* 9312 * We have an uncache conflict 9313 */ 9314 SFMMU_STAT(sf_uncache_conflict); 9315 sfmmu_page_cache_array(pp, HAT_TMPNC, CACHE_FLUSH, 1); 9316 return; 9317 } 9318 } 9319 9320 /* 9321 * We have an unload conflict 9322 * We have already checked for LARGE mappings, therefore 9323 * the remaining mapping(s) must be TTE8K. 9324 */ 9325 SFMMU_STAT(sf_unload_conflict); 9326 9327 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = tmphme) { 9328 tmphme = sfhmep->hme_next; 9329 if (IS_PAHME(sfhmep)) 9330 continue; 9331 hmeblkp = sfmmu_hmetohblk(sfhmep); 9332 if (hmeblkp->hblk_xhat_bit) 9333 continue; 9334 ASSERT(!hmeblkp->hblk_shared); 9335 (void) sfmmu_pageunload(pp, sfhmep, TTE8K); 9336 } 9337 9338 if (PP_ISMAPPED_KPM(pp)) 9339 sfmmu_kpm_vac_unload(pp, addr); 9340 9341 /* 9342 * Unloads only do TLB flushes so we need to flush the 9343 * cache here. 9344 */ 9345 sfmmu_cache_flush(pp->p_pagenum, PP_GET_VCOLOR(pp)); 9346 PP_SET_VCOLOR(pp, vcolor); 9347 } 9348 9349 /* 9350 * Whenever a mapping is unloaded and the page is in TNC state, 9351 * we see if the page can be made cacheable again. 'pp' is 9352 * the page that we just unloaded a mapping from, the size 9353 * of mapping that was unloaded is 'ottesz'. 9354 * Remark: 9355 * The recache policy for mpss pages can leave a performance problem 9356 * under the following circumstances: 9357 * . A large page in uncached mode has just been unmapped. 9358 * . All constituent pages are TNC due to a conflicting small mapping. 9359 * . There are many other, non conflicting, small mappings around for 9360 * a lot of the constituent pages. 9361 * . We're called w/ the "old" groupleader page and the old ottesz, 9362 * but this is irrelevant, since we're no more "PP_ISMAPPED_LARGE", so 9363 * we end up w/ TTE8K or npages == 1. 9364 * . We call tst_tnc w/ the old groupleader only, and if there is no 9365 * conflict, we re-cache only this page. 9366 * . All other small mappings are not checked and will be left in TNC mode. 9367 * The problem is not very serious because: 9368 * . mpss is actually only defined for heap and stack, so the probability 9369 * is not very high that a large page mapping exists in parallel to a small 9370 * one (this is possible, but seems to be bad programming style in the 9371 * appl). 9372 * . The problem gets a little bit more serious, when those TNC pages 9373 * have to be mapped into kernel space, e.g. for networking. 9374 * . When VAC alias conflicts occur in applications, this is regarded 9375 * as an application bug. So if kstat's show them, the appl should 9376 * be changed anyway. 9377 */ 9378 void 9379 conv_tnc(page_t *pp, int ottesz) 9380 { 9381 int cursz, dosz; 9382 pgcnt_t curnpgs, dopgs; 9383 pgcnt_t pg64k; 9384 page_t *pp2; 9385 9386 /* 9387 * Determine how big a range we check for TNC and find 9388 * leader page. cursz is the size of the biggest 9389 * mapping that still exist on 'pp'. 
9390 */ 9391 if (PP_ISMAPPED_LARGE(pp)) { 9392 cursz = fnd_mapping_sz(pp); 9393 } else { 9394 cursz = TTE8K; 9395 } 9396 9397 if (ottesz >= cursz) { 9398 dosz = ottesz; 9399 pp2 = pp; 9400 } else { 9401 dosz = cursz; 9402 pp2 = PP_GROUPLEADER(pp, dosz); 9403 } 9404 9405 pg64k = TTEPAGES(TTE64K); 9406 dopgs = TTEPAGES(dosz); 9407 9408 ASSERT(dopgs == 1 || ((dopgs & (pg64k - 1)) == 0)); 9409 9410 while (dopgs != 0) { 9411 curnpgs = TTEPAGES(cursz); 9412 if (tst_tnc(pp2, curnpgs)) { 9413 SFMMU_STAT_ADD(sf_recache, curnpgs); 9414 sfmmu_page_cache_array(pp2, HAT_CACHE, CACHE_NO_FLUSH, 9415 curnpgs); 9416 } 9417 9418 ASSERT(dopgs >= curnpgs); 9419 dopgs -= curnpgs; 9420 9421 if (dopgs == 0) { 9422 break; 9423 } 9424 9425 pp2 = PP_PAGENEXT_N(pp2, curnpgs); 9426 if (((dopgs & (pg64k - 1)) == 0) && PP_ISMAPPED_LARGE(pp2)) { 9427 cursz = fnd_mapping_sz(pp2); 9428 } else { 9429 cursz = TTE8K; 9430 } 9431 } 9432 } 9433 9434 /* 9435 * Returns 1 if page(s) can be converted from TNC to cacheable setting, 9436 * returns 0 otherwise. Note that oaddr argument is valid for only 9437 * 8k pages. 9438 */ 9439 int 9440 tst_tnc(page_t *pp, pgcnt_t npages) 9441 { 9442 struct sf_hment *sfhme; 9443 struct hme_blk *hmeblkp; 9444 tte_t tte; 9445 caddr_t vaddr; 9446 int clr_valid = 0; 9447 int color, color1, bcolor; 9448 int i, ncolors; 9449 9450 ASSERT(pp != NULL); 9451 ASSERT(!(cache & CACHE_WRITEBACK)); 9452 9453 if (npages > 1) { 9454 ncolors = CACHE_NUM_COLOR; 9455 } 9456 9457 for (i = 0; i < npages; i++) { 9458 ASSERT(sfmmu_mlist_held(pp)); 9459 ASSERT(PP_ISTNC(pp)); 9460 ASSERT(PP_GET_VCOLOR(pp) == NO_VCOLOR); 9461 9462 if (PP_ISPNC(pp)) { 9463 return (0); 9464 } 9465 9466 clr_valid = 0; 9467 if (PP_ISMAPPED_KPM(pp)) { 9468 caddr_t kpmvaddr; 9469 9470 ASSERT(kpm_enable); 9471 kpmvaddr = hat_kpm_page2va(pp, 1); 9472 ASSERT(!(npages > 1 && IS_KPM_ALIAS_RANGE(kpmvaddr))); 9473 color1 = addr_to_vcolor(kpmvaddr); 9474 clr_valid = 1; 9475 } 9476 9477 for (sfhme = pp->p_mapping; sfhme; sfhme = sfhme->hme_next) { 9478 if (IS_PAHME(sfhme)) 9479 continue; 9480 hmeblkp = sfmmu_hmetohblk(sfhme); 9481 if (hmeblkp->hblk_xhat_bit) 9482 continue; 9483 9484 sfmmu_copytte(&sfhme->hme_tte, &tte); 9485 ASSERT(TTE_IS_VALID(&tte)); 9486 9487 vaddr = tte_to_vaddr(hmeblkp, tte); 9488 color = addr_to_vcolor(vaddr); 9489 9490 if (npages > 1) { 9491 /* 9492 * If there is a big mapping, make sure 9493 * 8K mapping is consistent with the big 9494 * mapping. 
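 *
 * Concretely, constituent page i of the large mapping occupies
 * virtual color (i % ncolors), so an 8K mapping of this page at
 * any other color is a VAC alias and the range cannot be
 * re-cached:
 *
 *	if (color != (i % ncolors))
 *		return (0);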
9495 */ 9496 bcolor = i % ncolors; 9497 if (color != bcolor) { 9498 return (0); 9499 } 9500 } 9501 if (!clr_valid) { 9502 clr_valid = 1; 9503 color1 = color; 9504 } 9505 9506 if (color1 != color) { 9507 return (0); 9508 } 9509 } 9510 9511 pp = PP_PAGENEXT(pp); 9512 } 9513 9514 return (1); 9515 } 9516 9517 void 9518 sfmmu_page_cache_array(page_t *pp, int flags, int cache_flush_flag, 9519 pgcnt_t npages) 9520 { 9521 kmutex_t *pmtx; 9522 int i, ncolors, bcolor; 9523 kpm_hlk_t *kpmp; 9524 cpuset_t cpuset; 9525 9526 ASSERT(pp != NULL); 9527 ASSERT(!(cache & CACHE_WRITEBACK)); 9528 9529 kpmp = sfmmu_kpm_kpmp_enter(pp, npages); 9530 pmtx = sfmmu_page_enter(pp); 9531 9532 /* 9533 * Fast path caching single unmapped page 9534 */ 9535 if (npages == 1 && !PP_ISMAPPED(pp) && !PP_ISMAPPED_KPM(pp) && 9536 flags == HAT_CACHE) { 9537 PP_CLRTNC(pp); 9538 PP_CLRPNC(pp); 9539 sfmmu_page_exit(pmtx); 9540 sfmmu_kpm_kpmp_exit(kpmp); 9541 return; 9542 } 9543 9544 /* 9545 * We need to capture all cpus in order to change cacheability 9546 * because we can't allow one cpu to access the same physical 9547 * page using a cacheable and a non-cachebale mapping at the same 9548 * time. Since we may end up walking the ism mapping list 9549 * have to grab it's lock now since we can't after all the 9550 * cpus have been captured. 9551 */ 9552 sfmmu_hat_lock_all(); 9553 mutex_enter(&ism_mlist_lock); 9554 kpreempt_disable(); 9555 cpuset = cpu_ready_set; 9556 xc_attention(cpuset); 9557 9558 if (npages > 1) { 9559 /* 9560 * Make sure all colors are flushed since the 9561 * sfmmu_page_cache() only flushes one color- 9562 * it does not know big pages. 9563 */ 9564 ncolors = CACHE_NUM_COLOR; 9565 if (flags & HAT_TMPNC) { 9566 for (i = 0; i < ncolors; i++) { 9567 sfmmu_cache_flushcolor(i, pp->p_pagenum); 9568 } 9569 cache_flush_flag = CACHE_NO_FLUSH; 9570 } 9571 } 9572 9573 for (i = 0; i < npages; i++) { 9574 9575 ASSERT(sfmmu_mlist_held(pp)); 9576 9577 if (!(flags == HAT_TMPNC && PP_ISTNC(pp))) { 9578 9579 if (npages > 1) { 9580 bcolor = i % ncolors; 9581 } else { 9582 bcolor = NO_VCOLOR; 9583 } 9584 9585 sfmmu_page_cache(pp, flags, cache_flush_flag, 9586 bcolor); 9587 } 9588 9589 pp = PP_PAGENEXT(pp); 9590 } 9591 9592 xt_sync(cpuset); 9593 xc_dismissed(cpuset); 9594 mutex_exit(&ism_mlist_lock); 9595 sfmmu_hat_unlock_all(); 9596 sfmmu_page_exit(pmtx); 9597 sfmmu_kpm_kpmp_exit(kpmp); 9598 kpreempt_enable(); 9599 } 9600 9601 /* 9602 * This function changes the virtual cacheability of all mappings to a 9603 * particular page. When changing from uncache to cacheable the mappings will 9604 * only be changed if all of them have the same virtual color. 9605 * We need to flush the cache in all cpus. It is possible that 9606 * a process referenced a page as cacheable but has sinced exited 9607 * and cleared the mapping list. We still to flush it but have no 9608 * state so all cpus is the only alternative. 
9609 */ 9610 static void 9611 sfmmu_page_cache(page_t *pp, int flags, int cache_flush_flag, int bcolor) 9612 { 9613 struct sf_hment *sfhme; 9614 struct hme_blk *hmeblkp; 9615 sfmmu_t *sfmmup; 9616 tte_t tte, ttemod; 9617 caddr_t vaddr; 9618 int ret, color; 9619 pfn_t pfn; 9620 9621 color = bcolor; 9622 pfn = pp->p_pagenum; 9623 9624 for (sfhme = pp->p_mapping; sfhme; sfhme = sfhme->hme_next) { 9625 9626 if (IS_PAHME(sfhme)) 9627 continue; 9628 hmeblkp = sfmmu_hmetohblk(sfhme); 9629 9630 if (hmeblkp->hblk_xhat_bit) 9631 continue; 9632 9633 sfmmu_copytte(&sfhme->hme_tte, &tte); 9634 ASSERT(TTE_IS_VALID(&tte)); 9635 vaddr = tte_to_vaddr(hmeblkp, tte); 9636 color = addr_to_vcolor(vaddr); 9637 9638 #ifdef DEBUG 9639 if ((flags & HAT_CACHE) && bcolor != NO_VCOLOR) { 9640 ASSERT(color == bcolor); 9641 } 9642 #endif 9643 9644 ASSERT(flags != HAT_TMPNC || color == PP_GET_VCOLOR(pp)); 9645 9646 ttemod = tte; 9647 if (flags & (HAT_UNCACHE | HAT_TMPNC)) { 9648 TTE_CLR_VCACHEABLE(&ttemod); 9649 } else { /* flags & HAT_CACHE */ 9650 TTE_SET_VCACHEABLE(&ttemod); 9651 } 9652 ret = sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte); 9653 if (ret < 0) { 9654 /* 9655 * Since all cpus are captured modifytte should not 9656 * fail. 9657 */ 9658 panic("sfmmu_page_cache: write to tte failed"); 9659 } 9660 9661 sfmmup = hblktosfmmu(hmeblkp); 9662 if (cache_flush_flag == CACHE_FLUSH) { 9663 /* 9664 * Flush TSBs, TLBs and caches 9665 */ 9666 if (hmeblkp->hblk_shared) { 9667 sf_srd_t *srdp = (sf_srd_t *)sfmmup; 9668 uint_t rid = hmeblkp->hblk_tag.htag_rid; 9669 sf_region_t *rgnp; 9670 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 9671 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 9672 ASSERT(srdp != NULL); 9673 rgnp = srdp->srd_hmergnp[rid]; 9674 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, 9675 srdp, rgnp, rid); 9676 (void) sfmmu_rgntlb_demap(vaddr, rgnp, 9677 hmeblkp, 0); 9678 sfmmu_cache_flush(pfn, addr_to_vcolor(vaddr)); 9679 } else if (sfmmup->sfmmu_ismhat) { 9680 if (flags & HAT_CACHE) { 9681 SFMMU_STAT(sf_ism_recache); 9682 } else { 9683 SFMMU_STAT(sf_ism_uncache); 9684 } 9685 sfmmu_ismtlbcache_demap(vaddr, sfmmup, hmeblkp, 9686 pfn, CACHE_FLUSH); 9687 } else { 9688 sfmmu_tlbcache_demap(vaddr, sfmmup, hmeblkp, 9689 pfn, 0, FLUSH_ALL_CPUS, CACHE_FLUSH, 1); 9690 } 9691 9692 /* 9693 * all cache entries belonging to this pfn are 9694 * now flushed. 9695 */ 9696 cache_flush_flag = CACHE_NO_FLUSH; 9697 } else { 9698 /* 9699 * Flush only TSBs and TLBs. 
9700 */ 9701 if (hmeblkp->hblk_shared) { 9702 sf_srd_t *srdp = (sf_srd_t *)sfmmup; 9703 uint_t rid = hmeblkp->hblk_tag.htag_rid; 9704 sf_region_t *rgnp; 9705 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 9706 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 9707 ASSERT(srdp != NULL); 9708 rgnp = srdp->srd_hmergnp[rid]; 9709 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, 9710 srdp, rgnp, rid); 9711 (void) sfmmu_rgntlb_demap(vaddr, rgnp, 9712 hmeblkp, 0); 9713 } else if (sfmmup->sfmmu_ismhat) { 9714 if (flags & HAT_CACHE) { 9715 SFMMU_STAT(sf_ism_recache); 9716 } else { 9717 SFMMU_STAT(sf_ism_uncache); 9718 } 9719 sfmmu_ismtlbcache_demap(vaddr, sfmmup, hmeblkp, 9720 pfn, CACHE_NO_FLUSH); 9721 } else { 9722 sfmmu_tlb_demap(vaddr, sfmmup, hmeblkp, 0, 1); 9723 } 9724 } 9725 } 9726 9727 if (PP_ISMAPPED_KPM(pp)) 9728 sfmmu_kpm_page_cache(pp, flags, cache_flush_flag); 9729 9730 switch (flags) { 9731 9732 default: 9733 panic("sfmmu_pagecache: unknown flags"); 9734 break; 9735 9736 case HAT_CACHE: 9737 PP_CLRTNC(pp); 9738 PP_CLRPNC(pp); 9739 PP_SET_VCOLOR(pp, color); 9740 break; 9741 9742 case HAT_TMPNC: 9743 PP_SETTNC(pp); 9744 PP_SET_VCOLOR(pp, NO_VCOLOR); 9745 break; 9746 9747 case HAT_UNCACHE: 9748 PP_SETPNC(pp); 9749 PP_CLRTNC(pp); 9750 PP_SET_VCOLOR(pp, NO_VCOLOR); 9751 break; 9752 } 9753 } 9754 #endif /* VAC */ 9755 9756 9757 /* 9758 * Wrapper routine used to return a context. 9759 * 9760 * It's the responsibility of the caller to guarantee that the 9761 * process serializes on calls here by taking the HAT lock for 9762 * the hat. 9763 * 9764 */ 9765 static void 9766 sfmmu_get_ctx(sfmmu_t *sfmmup) 9767 { 9768 mmu_ctx_t *mmu_ctxp; 9769 uint_t pstate_save; 9770 int ret; 9771 9772 ASSERT(sfmmu_hat_lock_held(sfmmup)); 9773 ASSERT(sfmmup != ksfmmup); 9774 9775 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_ALLCTX_INVALID)) { 9776 sfmmu_setup_tsbinfo(sfmmup); 9777 SFMMU_FLAGS_CLEAR(sfmmup, HAT_ALLCTX_INVALID); 9778 } 9779 9780 kpreempt_disable(); 9781 9782 mmu_ctxp = CPU_MMU_CTXP(CPU); 9783 ASSERT(mmu_ctxp); 9784 ASSERT(mmu_ctxp->mmu_idx < max_mmu_ctxdoms); 9785 ASSERT(mmu_ctxp == mmu_ctxs_tbl[mmu_ctxp->mmu_idx]); 9786 9787 /* 9788 * Do a wrap-around if cnum reaches the max # cnum supported by a MMU. 9789 */ 9790 if (mmu_ctxp->mmu_cnum == mmu_ctxp->mmu_nctxs) 9791 sfmmu_ctx_wrap_around(mmu_ctxp); 9792 9793 /* 9794 * Let the MMU set up the page sizes to use for 9795 * this context in the TLB. Don't program 2nd dtlb for ism hat. 9796 */ 9797 if ((&mmu_set_ctx_page_sizes) && (sfmmup->sfmmu_ismhat == 0)) { 9798 mmu_set_ctx_page_sizes(sfmmup); 9799 } 9800 9801 /* 9802 * sfmmu_alloc_ctx and sfmmu_load_mmustate will be performed with 9803 * interrupts disabled to prevent race condition with wrap-around 9804 * ctx invalidatation. In sun4v, ctx invalidation also involves 9805 * a HV call to set the number of TSBs to 0. If interrupts are not 9806 * disabled until after sfmmu_load_mmustate is complete TSBs may 9807 * become assigned to INVALID_CONTEXT. This is not allowed. 
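 *
 * At its core (SCD handling omitted) the protected sequence below is:
 *
 *	pstate_save = sfmmu_disable_intrs();
 *	(void) sfmmu_alloc_ctx(sfmmup, 1, CPU, SFMMU_PRIVATE);
 *	sfmmu_load_mmustate(sfmmup);
 *	sfmmu_enable_intrs(pstate_save);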
9808 */ 9809 pstate_save = sfmmu_disable_intrs(); 9810 9811 if (sfmmu_alloc_ctx(sfmmup, 1, CPU, SFMMU_PRIVATE) && 9812 sfmmup->sfmmu_scdp != NULL) { 9813 sf_scd_t *scdp = sfmmup->sfmmu_scdp; 9814 sfmmu_t *scsfmmup = scdp->scd_sfmmup; 9815 ret = sfmmu_alloc_ctx(scsfmmup, 1, CPU, SFMMU_SHARED); 9816 /* debug purpose only */ 9817 ASSERT(!ret || scsfmmup->sfmmu_ctxs[CPU_MMU_IDX(CPU)].cnum 9818 != INVALID_CONTEXT); 9819 } 9820 sfmmu_load_mmustate(sfmmup); 9821 9822 sfmmu_enable_intrs(pstate_save); 9823 9824 kpreempt_enable(); 9825 } 9826 9827 /* 9828 * When all cnums are used up in a MMU, cnum will wrap around to the 9829 * next generation and start from 2. 9830 */ 9831 static void 9832 sfmmu_ctx_wrap_around(mmu_ctx_t *mmu_ctxp) 9833 { 9834 9835 /* caller must have disabled the preemption */ 9836 ASSERT(curthread->t_preempt >= 1); 9837 ASSERT(mmu_ctxp != NULL); 9838 9839 /* acquire Per-MMU (PM) spin lock */ 9840 mutex_enter(&mmu_ctxp->mmu_lock); 9841 9842 /* re-check to see if wrap-around is needed */ 9843 if (mmu_ctxp->mmu_cnum < mmu_ctxp->mmu_nctxs) 9844 goto done; 9845 9846 SFMMU_MMU_STAT(mmu_wrap_around); 9847 9848 /* update gnum */ 9849 ASSERT(mmu_ctxp->mmu_gnum != 0); 9850 mmu_ctxp->mmu_gnum++; 9851 if (mmu_ctxp->mmu_gnum == 0 || 9852 mmu_ctxp->mmu_gnum > MAX_SFMMU_GNUM_VAL) { 9853 cmn_err(CE_PANIC, "mmu_gnum of mmu_ctx 0x%p is out of bound.", 9854 (void *)mmu_ctxp); 9855 } 9856 9857 if (mmu_ctxp->mmu_ncpus > 1) { 9858 cpuset_t cpuset; 9859 9860 membar_enter(); /* make sure updated gnum visible */ 9861 9862 SFMMU_XCALL_STATS(NULL); 9863 9864 /* xcall to others on the same MMU to invalidate ctx */ 9865 cpuset = mmu_ctxp->mmu_cpuset; 9866 ASSERT(CPU_IN_SET(cpuset, CPU->cpu_id)); 9867 CPUSET_DEL(cpuset, CPU->cpu_id); 9868 CPUSET_AND(cpuset, cpu_ready_set); 9869 9870 /* 9871 * Pass in INVALID_CONTEXT as the first parameter to 9872 * sfmmu_raise_tsb_exception, which invalidates the context 9873 * of any process running on the CPUs in the MMU. 9874 */ 9875 xt_some(cpuset, sfmmu_raise_tsb_exception, 9876 INVALID_CONTEXT, INVALID_CONTEXT); 9877 xt_sync(cpuset); 9878 9879 SFMMU_MMU_STAT(mmu_tsb_raise_exception); 9880 } 9881 9882 if (sfmmu_getctx_sec() != INVALID_CONTEXT) { 9883 sfmmu_setctx_sec(INVALID_CONTEXT); 9884 sfmmu_clear_utsbinfo(); 9885 } 9886 9887 /* 9888 * No xcall is needed here. For sun4u systems all CPUs in context 9889 * domain share a single physical MMU therefore it's enough to flush 9890 * TLB on local CPU. On sun4v systems we use 1 global context 9891 * domain and flush all remote TLBs in sfmmu_raise_tsb_exception 9892 * handler. Note that vtag_flushall_uctxs() is called 9893 * for Ultra II machine, where the equivalent flushall functionality 9894 * is implemented in SW, and only user ctx TLB entries are flushed. 9895 */ 9896 if (&vtag_flushall_uctxs != NULL) { 9897 vtag_flushall_uctxs(); 9898 } else { 9899 vtag_flushall(); 9900 } 9901 9902 /* reset mmu cnum, skips cnum 0 and 1 */ 9903 mmu_ctxp->mmu_cnum = NUM_LOCKED_CTXS; 9904 9905 done: 9906 mutex_exit(&mmu_ctxp->mmu_lock); 9907 } 9908 9909 9910 /* 9911 * For multi-threaded process, set the process context to INVALID_CONTEXT 9912 * so that it faults and reloads the MMU state from TL=0. For single-threaded 9913 * process, we can just load the MMU state directly without having to 9914 * set context invalid. Caller must hold the hat lock since we don't 9915 * acquire it here. 
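 *
 * Typical call site in this file (the reload_mmu path of hat_share()):
 *
 *	hatlockp = sfmmu_hat_enter(sfmmup);
 *	sfmmu_sync_mmustate(sfmmup);
 *	sfmmu_hat_exit(hatlockp);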
9916 */ 9917 static void 9918 sfmmu_sync_mmustate(sfmmu_t *sfmmup) 9919 { 9920 uint_t cnum; 9921 uint_t pstate_save; 9922 9923 ASSERT(sfmmup != ksfmmup); 9924 ASSERT(sfmmu_hat_lock_held(sfmmup)); 9925 9926 kpreempt_disable(); 9927 9928 /* 9929 * We check whether the passed-in sfmmup is the same as the 9930 * current running proc. This is to make sure the current proc 9931 * stays single-threaded if it already is. 9932 */ 9933 if ((sfmmup == curthread->t_procp->p_as->a_hat) && 9934 (curthread->t_procp->p_lwpcnt == 1)) { 9935 /* single-thread */ 9936 cnum = sfmmup->sfmmu_ctxs[CPU_MMU_IDX(CPU)].cnum; 9937 if (cnum != INVALID_CONTEXT) { 9938 uint_t curcnum; 9939 /* 9940 * Disable interrupts to prevent a race condition 9941 * with sfmmu_ctx_wrap_around ctx invalidation. 9942 * In sun4v, ctx invalidation involves setting 9943 * TSB to NULL, hence interrupts should be disabled 9944 * until after sfmmu_load_mmustate is completed. 9945 */ 9946 pstate_save = sfmmu_disable_intrs(); 9947 curcnum = sfmmu_getctx_sec(); 9948 if (curcnum == cnum) 9949 sfmmu_load_mmustate(sfmmup); 9950 sfmmu_enable_intrs(pstate_save); 9951 ASSERT(curcnum == cnum || curcnum == INVALID_CONTEXT); 9952 } 9953 } else { 9954 /* 9955 * multi-thread 9956 * or when sfmmup is not the same as the curproc. 9957 */ 9958 sfmmu_invalidate_ctx(sfmmup); 9959 } 9960 9961 kpreempt_enable(); 9962 } 9963 9964 9965 /* 9966 * Replace the specified TSB with a new TSB. This function gets called when 9967 * we grow, shrink or swapin a TSB. When swapping in a TSB (TSB_SWAPIN), the 9968 * TSB_FORCEALLOC flag may be used to force allocation of a minimum-sized TSB 9969 * (8K). 9970 * 9971 * Caller must hold the HAT lock, but should assume any tsb_info 9972 * pointers it has are no longer valid after calling this function. 9973 * 9974 * Return values: 9975 * TSB_ALLOCFAIL Failed to allocate a TSB, due to memory constraints 9976 * TSB_LOSTRACE HAT is busy, i.e. another thread is already doing 9977 * something to this tsbinfo/TSB 9978 * TSB_SUCCESS Operation succeeded 9979 */ 9980 static tsb_replace_rc_t 9981 sfmmu_replace_tsb(sfmmu_t *sfmmup, struct tsb_info *old_tsbinfo, uint_t szc, 9982 hatlock_t *hatlockp, uint_t flags) 9983 { 9984 struct tsb_info *new_tsbinfo = NULL; 9985 struct tsb_info *curtsb, *prevtsb; 9986 uint_t tte_sz_mask; 9987 int i; 9988 9989 ASSERT(sfmmup != ksfmmup); 9990 ASSERT(sfmmup->sfmmu_ismhat == 0); 9991 ASSERT(sfmmu_hat_lock_held(sfmmup)); 9992 ASSERT(szc <= tsb_max_growsize); 9993 9994 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_BUSY)) 9995 return (TSB_LOSTRACE); 9996 9997 /* 9998 * Find the tsb_info ahead of this one in the list, and 9999 * also make sure that the tsb_info passed in really 10000 * exists! 10001 */ 10002 for (prevtsb = NULL, curtsb = sfmmup->sfmmu_tsb; 10003 curtsb != old_tsbinfo && curtsb != NULL; 10004 prevtsb = curtsb, curtsb = curtsb->tsb_next) 10005 ; 10006 ASSERT(curtsb != NULL); 10007 10008 if (!(flags & TSB_SWAPIN) && SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) { 10009 /* 10010 * The process is swapped out, so just set the new size 10011 * code. When it swaps back in, we'll allocate a new one 10012 * of the new chosen size. 10013 */ 10014 curtsb->tsb_szc = szc; 10015 return (TSB_SUCCESS); 10016 } 10017 SFMMU_FLAGS_SET(sfmmup, HAT_BUSY); 10018 10019 tte_sz_mask = old_tsbinfo->tsb_ttesz_mask; 10020 10021 /* 10022 * All initialization is done inside of sfmmu_tsbinfo_alloc(). 10023 * If we fail to allocate a TSB, exit.
10024 * 10025 * If tsb grows with new tsb size > 4M and old tsb size < 4M, 10026 * then try 4M slab after the initial alloc fails. 10027 * 10028 * If tsb swapin with tsb size > 4M, then try 4M after the 10029 * initial alloc fails. 10030 */ 10031 sfmmu_hat_exit(hatlockp); 10032 if (sfmmu_tsbinfo_alloc(&new_tsbinfo, szc, 10033 tte_sz_mask, flags, sfmmup) && 10034 (!(flags & (TSB_GROW | TSB_SWAPIN)) || (szc <= TSB_4M_SZCODE) || 10035 (!(flags & TSB_SWAPIN) && 10036 (old_tsbinfo->tsb_szc >= TSB_4M_SZCODE)) || 10037 sfmmu_tsbinfo_alloc(&new_tsbinfo, TSB_4M_SZCODE, 10038 tte_sz_mask, flags, sfmmup))) { 10039 (void) sfmmu_hat_enter(sfmmup); 10040 if (!(flags & TSB_SWAPIN)) 10041 SFMMU_STAT(sf_tsb_resize_failures); 10042 SFMMU_FLAGS_CLEAR(sfmmup, HAT_BUSY); 10043 return (TSB_ALLOCFAIL); 10044 } 10045 (void) sfmmu_hat_enter(sfmmup); 10046 10047 /* 10048 * Re-check to make sure somebody else didn't muck with us while we 10049 * didn't hold the HAT lock. If the process swapped out, fine, just 10050 * exit; this can happen if we try to shrink the TSB from the context 10051 * of another process (such as on an ISM unmap), though it is rare. 10052 */ 10053 if (!(flags & TSB_SWAPIN) && SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) { 10054 SFMMU_STAT(sf_tsb_resize_failures); 10055 SFMMU_FLAGS_CLEAR(sfmmup, HAT_BUSY); 10056 sfmmu_hat_exit(hatlockp); 10057 sfmmu_tsbinfo_free(new_tsbinfo); 10058 (void) sfmmu_hat_enter(sfmmup); 10059 return (TSB_LOSTRACE); 10060 } 10061 10062 #ifdef DEBUG 10063 /* Reverify that the tsb_info still exists.. for debugging only */ 10064 for (prevtsb = NULL, curtsb = sfmmup->sfmmu_tsb; 10065 curtsb != old_tsbinfo && curtsb != NULL; 10066 prevtsb = curtsb, curtsb = curtsb->tsb_next) 10067 ; 10068 ASSERT(curtsb != NULL); 10069 #endif /* DEBUG */ 10070 10071 /* 10072 * Quiesce any CPUs running this process on their next TLB miss 10073 * so they atomically see the new tsb_info. We temporarily set the 10074 * context to invalid context so new threads that come on processor 10075 * after we do the xcall to cpusran will also serialize behind the 10076 * HAT lock on TLB miss and will see the new TSB. Since this short 10077 * race with a new thread coming on processor is relatively rare, 10078 * this synchronization mechanism should be cheaper than always 10079 * pausing all CPUs for the duration of the setup, which is what 10080 * the old implementation did. This is particularly true if we are 10081 * copying a huge chunk of memory around during that window. 10082 * 10083 * The memory barriers are to make sure things stay consistent 10084 * with resume() since it does not hold the HAT lock while 10085 * walking the list of tsb_info structures. 10086 */ 10087 if ((flags & TSB_SWAPIN) != TSB_SWAPIN) { 10088 /* The TSB is either growing or shrinking. */ 10089 sfmmu_invalidate_ctx(sfmmup); 10090 } else { 10091 /* 10092 * It is illegal to swap in TSBs from a process other 10093 * than a process being swapped in. This in turn 10094 * implies we do not have a valid MMU context here 10095 * since a process needs one to resolve translation 10096 * misses.
10097 */ 10098 ASSERT(curthread->t_procp->p_as->a_hat == sfmmup); 10099 } 10100 10101 #ifdef DEBUG 10102 ASSERT(max_mmu_ctxdoms > 0); 10103 10104 /* 10105 * Process should have INVALID_CONTEXT on all MMUs 10106 */ 10107 for (i = 0; i < max_mmu_ctxdoms; i++) { 10108 10109 ASSERT(sfmmup->sfmmu_ctxs[i].cnum == INVALID_CONTEXT); 10110 } 10111 #endif 10112 10113 new_tsbinfo->tsb_next = old_tsbinfo->tsb_next; 10114 membar_stst(); /* strict ordering required */ 10115 if (prevtsb) 10116 prevtsb->tsb_next = new_tsbinfo; 10117 else 10118 sfmmup->sfmmu_tsb = new_tsbinfo; 10119 membar_enter(); /* make sure new TSB globally visible */ 10120 10121 /* 10122 * We need to migrate TSB entries from the old TSB to the new TSB 10123 * if tsb_remap_ttes is set and the TSB is growing. 10124 */ 10125 if (tsb_remap_ttes && ((flags & TSB_GROW) == TSB_GROW)) 10126 sfmmu_copy_tsb(old_tsbinfo, new_tsbinfo); 10127 10128 SFMMU_FLAGS_CLEAR(sfmmup, HAT_BUSY); 10129 10130 /* 10131 * Drop the HAT lock to free our old tsb_info. 10132 */ 10133 sfmmu_hat_exit(hatlockp); 10134 10135 if ((flags & TSB_GROW) == TSB_GROW) { 10136 SFMMU_STAT(sf_tsb_grow); 10137 } else if ((flags & TSB_SHRINK) == TSB_SHRINK) { 10138 SFMMU_STAT(sf_tsb_shrink); 10139 } 10140 10141 sfmmu_tsbinfo_free(old_tsbinfo); 10142 10143 (void) sfmmu_hat_enter(sfmmup); 10144 return (TSB_SUCCESS); 10145 } 10146 10147 /* 10148 * This function will re-program hat pgsz array, and invalidate the 10149 * process' context, forcing the process to switch to another 10150 * context on the next TLB miss, and therefore start using the 10151 * TLB that is reprogrammed for the new page sizes. 10152 */ 10153 void 10154 sfmmu_reprog_pgsz_arr(sfmmu_t *sfmmup, uint8_t *tmp_pgsz) 10155 { 10156 int i; 10157 hatlock_t *hatlockp = NULL; 10158 10159 hatlockp = sfmmu_hat_enter(sfmmup); 10160 /* USIII+-IV+ optimization, requires hat lock */ 10161 if (tmp_pgsz) { 10162 for (i = 0; i < mmu_page_sizes; i++) 10163 sfmmup->sfmmu_pgsz[i] = tmp_pgsz[i]; 10164 } 10165 SFMMU_STAT(sf_tlb_reprog_pgsz); 10166 10167 sfmmu_invalidate_ctx(sfmmup); 10168 10169 sfmmu_hat_exit(hatlockp); 10170 } 10171 10172 /* 10173 * The scd_rttecnt field in the SCD must be updated to take account of the 10174 * regions which it contains. 10175 */ 10176 static void 10177 sfmmu_set_scd_rttecnt(sf_srd_t *srdp, sf_scd_t *scdp) 10178 { 10179 uint_t rid; 10180 uint_t i, j; 10181 ulong_t w; 10182 sf_region_t *rgnp; 10183 10184 ASSERT(srdp != NULL); 10185 10186 for (i = 0; i < SFMMU_HMERGNMAP_WORDS; i++) { 10187 if ((w = scdp->scd_region_map.bitmap[i]) == 0) { 10188 continue; 10189 } 10190 10191 j = 0; 10192 while (w) { 10193 if (!(w & 0x1)) { 10194 j++; 10195 w >>= 1; 10196 continue; 10197 } 10198 rid = (i << BT_ULSHIFT) | j; 10199 j++; 10200 w >>= 1; 10201 10202 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 10203 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 10204 rgnp = srdp->srd_hmergnp[rid]; 10205 ASSERT(rgnp->rgn_refcnt > 0); 10206 ASSERT(rgnp->rgn_id == rid); 10207 10208 scdp->scd_rttecnt[rgnp->rgn_pgszc] += 10209 rgnp->rgn_size >> TTE_PAGE_SHIFT(rgnp->rgn_pgszc); 10210 10211 /* 10212 * Maintain the tsb0 inflation cnt for the regions 10213 * in the SCD. 
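 *
 * (Arithmetic note, added for clarity: rgn_size >> TTE_PAGE_SHIFT(TTE8K)
 * is the number of 8K pages spanned by the region, so the extra ">> 2"
 * below accumulates one quarter of that count in sfmmu_tsb0_4minflcnt;
 * sfmmu_check_page_sizes() later adds this amount to tte8k_cnt to allow
 * for region large page allocation failure.)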
10214 */ 10215 if (rgnp->rgn_pgszc >= TTE4M) { 10216 scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt += 10217 rgnp->rgn_size >> 10218 (TTE_PAGE_SHIFT(TTE8K) + 2); 10219 } 10220 } 10221 } 10222 } 10223 10224 /* 10225 * This function assumes that there are either four or six supported page 10226 * sizes and at most two programmable TLBs, so we need to decide which 10227 * page sizes are most important and then tell the MMU layer so it 10228 * can adjust the TLB page sizes accordingly (if supported). 10229 * 10230 * If these assumptions change, this function will need to be 10231 * updated to support whatever the new limits are. 10232 * 10233 * The growing flag is nonzero if we are growing the address space, 10234 * and zero if it is shrinking. This allows us to decide whether 10235 * to grow or shrink our TSB, depending upon available memory 10236 * conditions. 10237 */ 10238 static void 10239 sfmmu_check_page_sizes(sfmmu_t *sfmmup, int growing) 10240 { 10241 uint64_t ttecnt[MMU_PAGE_SIZES]; 10242 uint64_t tte8k_cnt, tte4m_cnt; 10243 uint8_t i; 10244 int sectsb_thresh; 10245 10246 /* 10247 * Kernel threads, processes with small address spaces not using 10248 * large pages, and dummy ISM HATs need not apply. 10249 */ 10250 if (sfmmup == ksfmmup || sfmmup->sfmmu_ismhat != NULL) 10251 return; 10252 10253 if (!SFMMU_LGPGS_INUSE(sfmmup) && 10254 sfmmup->sfmmu_ttecnt[TTE8K] <= tsb_rss_factor) 10255 return; 10256 10257 for (i = 0; i < mmu_page_sizes; i++) { 10258 ttecnt[i] = sfmmup->sfmmu_ttecnt[i] + 10259 sfmmup->sfmmu_ismttecnt[i]; 10260 } 10261 10262 /* Check pagesizes in use, and possibly reprogram DTLB. */ 10263 if (&mmu_check_page_sizes) 10264 mmu_check_page_sizes(sfmmup, ttecnt); 10265 10266 /* 10267 * Calculate the number of 8k ttes to represent the span of these 10268 * pages. 10269 */ 10270 tte8k_cnt = ttecnt[TTE8K] + 10271 (ttecnt[TTE64K] << (MMU_PAGESHIFT64K - MMU_PAGESHIFT)) + 10272 (ttecnt[TTE512K] << (MMU_PAGESHIFT512K - MMU_PAGESHIFT)); 10273 if (mmu_page_sizes == max_mmu_page_sizes) { 10274 tte4m_cnt = ttecnt[TTE4M] + 10275 (ttecnt[TTE32M] << (MMU_PAGESHIFT32M - MMU_PAGESHIFT4M)) + 10276 (ttecnt[TTE256M] << (MMU_PAGESHIFT256M - MMU_PAGESHIFT4M)); 10277 } else { 10278 tte4m_cnt = ttecnt[TTE4M]; 10279 } 10280 10281 /* 10282 * Inflate tte8k_cnt to allow for region large page allocation failure. 10283 */ 10284 tte8k_cnt += sfmmup->sfmmu_tsb0_4minflcnt; 10285 10286 /* 10287 * Inflate TSB sizes by a factor of 2 if this process 10288 * uses 4M text pages to minimize extra conflict misses 10289 * in the first TSB since without counting text pages 10290 * 8K TSB may become too small. 10291 * 10292 * Also double the size of the second TSB to minimize 10293 * extra conflict misses due to competition between 4M text pages 10294 * and data pages. 10295 * 10296 * We need to adjust the second TSB allocation threshold by the 10297 * inflation factor, since there is no point in creating a second 10298 * TSB when we know all the mappings can fit in the I/D TLBs. 10299 */ 10300 sectsb_thresh = tsb_sectsb_threshold; 10301 if (sfmmup->sfmmu_flags & HAT_4MTEXT_FLAG) { 10302 tte8k_cnt <<= 1; 10303 tte4m_cnt <<= 1; 10304 sectsb_thresh <<= 1; 10305 } 10306 10307 /* 10308 * Check to see if our TSB is the right size; we may need to 10309 * grow or shrink it. If the process is small, our work is 10310 * finished at this point. 
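 *
 * (Worked example, added for illustration: with an 8K base page, each
 * 64K tte above contributes 2^(16-13) = 8 to tte8k_cnt and each 512K
 * tte contributes 64; likewise each 32M tte contributes 8 to tte4m_cnt
 * and each 256M tte contributes 64. For a process using 4M text, both
 * counts and sectsb_thresh were then doubled before the comparison
 * below.)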
10311 */ 10312 if (tte8k_cnt <= tsb_rss_factor && tte4m_cnt <= sectsb_thresh) { 10313 return; 10314 } 10315 sfmmu_size_tsb(sfmmup, growing, tte8k_cnt, tte4m_cnt, sectsb_thresh); 10316 } 10317 10318 static void 10319 sfmmu_size_tsb(sfmmu_t *sfmmup, int growing, uint64_t tte8k_cnt, 10320 uint64_t tte4m_cnt, int sectsb_thresh) 10321 { 10322 int tsb_bits; 10323 uint_t tsb_szc; 10324 struct tsb_info *tsbinfop; 10325 hatlock_t *hatlockp = NULL; 10326 10327 hatlockp = sfmmu_hat_enter(sfmmup); 10328 ASSERT(hatlockp != NULL); 10329 tsbinfop = sfmmup->sfmmu_tsb; 10330 ASSERT(tsbinfop != NULL); 10331 10332 /* 10333 * If we're growing, select the size based on RSS. If we're 10334 * shrinking, leave some room so we don't have to turn around and 10335 * grow again immediately. 10336 */ 10337 if (growing) 10338 tsb_szc = SELECT_TSB_SIZECODE(tte8k_cnt); 10339 else 10340 tsb_szc = SELECT_TSB_SIZECODE(tte8k_cnt << 1); 10341 10342 if (!growing && (tsb_szc < tsbinfop->tsb_szc) && 10343 (tsb_szc >= default_tsb_size) && TSB_OK_SHRINK()) { 10344 (void) sfmmu_replace_tsb(sfmmup, tsbinfop, tsb_szc, 10345 hatlockp, TSB_SHRINK); 10346 } else if (growing && tsb_szc > tsbinfop->tsb_szc && TSB_OK_GROW()) { 10347 (void) sfmmu_replace_tsb(sfmmup, tsbinfop, tsb_szc, 10348 hatlockp, TSB_GROW); 10349 } 10350 tsbinfop = sfmmup->sfmmu_tsb; 10351 10352 /* 10353 * With the TLB and first TSB out of the way, we need to see if 10354 * we need a second TSB for 4M pages. If we managed to reprogram 10355 * the TLB page sizes above, the process will start using this new 10356 * TSB right away; otherwise, it will start using it on the next 10357 * context switch. Either way, it's no big deal so there's no 10358 * synchronization with the trap handlers here unless we grow the 10359 * TSB (in which case it's required to prevent using the old one 10360 * after it's freed). Note: second tsb is required for 32M/256M 10361 * page sizes. 10362 */ 10363 if (tte4m_cnt > sectsb_thresh) { 10364 /* 10365 * If we're growing, select the size based on RSS. If we're 10366 * shrinking, leave some room so we don't have to turn 10367 * around and grow again immediately. 10368 */ 10369 if (growing) 10370 tsb_szc = SELECT_TSB_SIZECODE(tte4m_cnt); 10371 else 10372 tsb_szc = SELECT_TSB_SIZECODE(tte4m_cnt << 1); 10373 if (tsbinfop->tsb_next == NULL) { 10374 struct tsb_info *newtsb; 10375 int allocflags = SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)? 10376 0 : TSB_ALLOC; 10377 10378 sfmmu_hat_exit(hatlockp); 10379 10380 /* 10381 * Try to allocate a TSB for 4[32|256]M pages. If we 10382 * can't get the size we want, retry w/a minimum sized 10383 * TSB. If that still didn't work, give up; we can 10384 * still run without one. 10385 */ 10386 tsb_bits = (mmu_page_sizes == max_mmu_page_sizes)? 10387 TSB4M|TSB32M|TSB256M:TSB4M; 10388 if ((sfmmu_tsbinfo_alloc(&newtsb, tsb_szc, tsb_bits, 10389 allocflags, sfmmup)) && 10390 (tsb_szc <= TSB_4M_SZCODE || 10391 sfmmu_tsbinfo_alloc(&newtsb, TSB_4M_SZCODE, 10392 tsb_bits, allocflags, sfmmup)) && 10393 sfmmu_tsbinfo_alloc(&newtsb, TSB_MIN_SZCODE, 10394 tsb_bits, allocflags, sfmmup)) { 10395 return; 10396 } 10397 10398 hatlockp = sfmmu_hat_enter(sfmmup); 10399 10400 sfmmu_invalidate_ctx(sfmmup); 10401 10402 if (sfmmup->sfmmu_tsb->tsb_next == NULL) { 10403 sfmmup->sfmmu_tsb->tsb_next = newtsb; 10404 SFMMU_STAT(sf_tsb_sectsb_create); 10405 sfmmu_hat_exit(hatlockp); 10406 return; 10407 } else { 10408 /* 10409 * It's annoying, but possible for us 10410 * to get here.. 
we dropped the HAT lock 10411 * because of locking order in the kmem 10412 * allocator, and while we were off getting 10413 * our memory, some other thread decided to 10414 * do us a favor and won the race to get a 10415 * second TSB for this process. Sigh. 10416 */ 10417 sfmmu_hat_exit(hatlockp); 10418 sfmmu_tsbinfo_free(newtsb); 10419 return; 10420 } 10421 } 10422 10423 /* 10424 * We have a second TSB, see if it's big enough. 10425 */ 10426 tsbinfop = tsbinfop->tsb_next; 10427 10428 /* 10429 * Check to see if our second TSB is the right size; 10430 * we may need to grow or shrink it. 10431 * To prevent thrashing (e.g. growing the TSB on a 10432 * subsequent map operation), only try to shrink if 10433 * the TSB reach exceeds twice the virtual address 10434 * space size. 10435 */ 10436 if (!growing && (tsb_szc < tsbinfop->tsb_szc) && 10437 (tsb_szc >= default_tsb_size) && TSB_OK_SHRINK()) { 10438 (void) sfmmu_replace_tsb(sfmmup, tsbinfop, 10439 tsb_szc, hatlockp, TSB_SHRINK); 10440 } else if (growing && tsb_szc > tsbinfop->tsb_szc && 10441 TSB_OK_GROW()) { 10442 (void) sfmmu_replace_tsb(sfmmup, tsbinfop, 10443 tsb_szc, hatlockp, TSB_GROW); 10444 } 10445 } 10446 10447 sfmmu_hat_exit(hatlockp); 10448 } 10449 10450 /* 10451 * Free up a sfmmu 10452 * Since the sfmmu is currently embedded in the hat struct we simply zero 10453 * out our fields and free up the ism map blk list if any. 10454 */ 10455 static void 10456 sfmmu_free_sfmmu(sfmmu_t *sfmmup) 10457 { 10458 ism_blk_t *blkp, *nx_blkp; 10459 #ifdef DEBUG 10460 ism_map_t *map; 10461 int i; 10462 #endif 10463 10464 ASSERT(sfmmup->sfmmu_ttecnt[TTE8K] == 0); 10465 ASSERT(sfmmup->sfmmu_ttecnt[TTE64K] == 0); 10466 ASSERT(sfmmup->sfmmu_ttecnt[TTE512K] == 0); 10467 ASSERT(sfmmup->sfmmu_ttecnt[TTE4M] == 0); 10468 ASSERT(sfmmup->sfmmu_ttecnt[TTE32M] == 0); 10469 ASSERT(sfmmup->sfmmu_ttecnt[TTE256M] == 0); 10470 ASSERT(SF_RGNMAP_ISNULL(sfmmup)); 10471 10472 sfmmup->sfmmu_free = 0; 10473 sfmmup->sfmmu_ismhat = 0; 10474 10475 blkp = sfmmup->sfmmu_iblk; 10476 sfmmup->sfmmu_iblk = NULL; 10477 10478 while (blkp) { 10479 #ifdef DEBUG 10480 map = blkp->iblk_maps; 10481 for (i = 0; i < ISM_MAP_SLOTS; i++) { 10482 ASSERT(map[i].imap_seg == 0); 10483 ASSERT(map[i].imap_ismhat == NULL); 10484 ASSERT(map[i].imap_ment == NULL); 10485 } 10486 #endif 10487 nx_blkp = blkp->iblk_next; 10488 blkp->iblk_next = NULL; 10489 blkp->iblk_nextpa = (uint64_t)-1; 10490 kmem_cache_free(ism_blk_cache, blkp); 10491 blkp = nx_blkp; 10492 } 10493 } 10494 10495 /* 10496 * Locking primitves accessed by HATLOCK macros 10497 */ 10498 10499 #define SFMMU_SPL_MTX (0x0) 10500 #define SFMMU_ML_MTX (0x1) 10501 10502 #define SFMMU_MLSPL_MTX(type, pg) (((type) == SFMMU_SPL_MTX) ? \ 10503 SPL_HASH(pg) : MLIST_HASH(pg)) 10504 10505 kmutex_t * 10506 sfmmu_page_enter(struct page *pp) 10507 { 10508 return (sfmmu_mlspl_enter(pp, SFMMU_SPL_MTX)); 10509 } 10510 10511 void 10512 sfmmu_page_exit(kmutex_t *spl) 10513 { 10514 mutex_exit(spl); 10515 } 10516 10517 int 10518 sfmmu_page_spl_held(struct page *pp) 10519 { 10520 return (sfmmu_mlspl_held(pp, SFMMU_SPL_MTX)); 10521 } 10522 10523 kmutex_t * 10524 sfmmu_mlist_enter(struct page *pp) 10525 { 10526 return (sfmmu_mlspl_enter(pp, SFMMU_ML_MTX)); 10527 } 10528 10529 void 10530 sfmmu_mlist_exit(kmutex_t *mml) 10531 { 10532 mutex_exit(mml); 10533 } 10534 10535 int 10536 sfmmu_mlist_held(struct page *pp) 10537 { 10538 10539 return (sfmmu_mlspl_held(pp, SFMMU_ML_MTX)); 10540 } 10541 10542 /* 10543 * Common code for sfmmu_mlist_enter() and sfmmu_page_enter(). 
For 10544 * sfmmu_mlist_enter() case mml_table lock array is used and for 10545 * sfmmu_page_enter() sfmmu_page_lock lock array is used. 10546 * 10547 * The lock is taken on a root page so that it protects an operation on all 10548 * constituent pages of a large page pp belongs to. 10549 * 10550 * The routine takes a lock from the appropriate array. The lock is determined 10551 * by hashing the root page. After taking the lock this routine checks if the 10552 * root page has the same size code that was used to determine the root (i.e 10553 * that root hasn't changed). If root page has the expected p_szc field we 10554 * have the right lock and it's returned to the caller. If root's p_szc 10555 * decreased we release the lock and retry from the beginning. This case can 10556 * happen due to hat_page_demote() decreasing p_szc between our load of p_szc 10557 * value and taking the lock. The number of retries due to p_szc decrease is 10558 * limited by the maximum p_szc value. If p_szc is 0 we return the lock 10559 * determined by hashing pp itself. 10560 * 10561 * If our caller doesn't hold a SE_SHARED or SE_EXCL lock on pp it's also 10562 * possible that p_szc can increase. To increase p_szc a thread has to lock 10563 * all constituent pages EXCL and do hat_pageunload() on all of them. All the 10564 * callers that don't hold a page locked recheck if hmeblk through which pp 10565 * was found still maps this pp. If it doesn't map it anymore returned lock 10566 * is immediately dropped. Therefore if sfmmu_mlspl_enter() hits the case of 10567 * p_szc increase after taking the lock it returns this lock without further 10568 * retries because in this case the caller doesn't care about which lock was 10569 * taken. The caller will drop it right away. 10570 * 10571 * After the routine returns it's guaranteed that hat_page_demote() can't 10572 * change p_szc field of any of constituent pages of a large page pp belongs 10573 * to as long as pp was either locked at least SHARED prior to this call or 10574 * the caller finds that hment that pointed to this pp still references this 10575 * pp (this also assumes that the caller holds hme hash bucket lock so that 10576 * the same pp can't be remapped into the same hmeblk after it was unmapped by 10577 * hat_pageunload()). 10578 */ 10579 static kmutex_t * 10580 sfmmu_mlspl_enter(struct page *pp, int type) 10581 { 10582 kmutex_t *mtx; 10583 uint_t prev_rszc = UINT_MAX; 10584 page_t *rootpp; 10585 uint_t szc; 10586 uint_t rszc; 10587 uint_t pszc = pp->p_szc; 10588 10589 ASSERT(pp != NULL); 10590 10591 again: 10592 if (pszc == 0) { 10593 mtx = SFMMU_MLSPL_MTX(type, pp); 10594 mutex_enter(mtx); 10595 return (mtx); 10596 } 10597 10598 /* The lock lives in the root page */ 10599 rootpp = PP_GROUPLEADER(pp, pszc); 10600 mtx = SFMMU_MLSPL_MTX(type, rootpp); 10601 mutex_enter(mtx); 10602 10603 /* 10604 * Return mml in the following 3 cases: 10605 * 10606 * 1) If pp itself is root since if its p_szc decreased before we took 10607 * the lock pp is still the root of smaller szc page. And if its p_szc 10608 * increased it doesn't matter what lock we return (see comment in 10609 * front of this routine). 10610 * 10611 * 2) If pp's not root but rootpp is the root of a rootpp->p_szc size 10612 * large page we have the right lock since any previous potential 10613 * hat_page_demote() is done demoting from greater than current root's 10614 * p_szc because hat_page_demote() changes root's p_szc last. 
No 10615 * further hat_page_demote() can start or be in progress since it 10616 * would need the same lock we currently hold. 10617 * 10618 * 3) If rootpp's p_szc increased since previous iteration it doesn't 10619 * matter what lock we return (see comment in front of this routine). 10620 */ 10621 if (pp == rootpp || (rszc = rootpp->p_szc) == pszc || 10622 rszc >= prev_rszc) { 10623 return (mtx); 10624 } 10625 10626 /* 10627 * hat_page_demote() could have decreased root's p_szc. 10628 * In this case pp's p_szc must also be smaller than pszc. 10629 * Retry. 10630 */ 10631 if (rszc < pszc) { 10632 szc = pp->p_szc; 10633 if (szc < pszc) { 10634 mutex_exit(mtx); 10635 pszc = szc; 10636 goto again; 10637 } 10638 /* 10639 * pp's p_szc increased after it was decreased. 10640 * page cannot be mapped. Return current lock. The caller 10641 * will drop it right away. 10642 */ 10643 return (mtx); 10644 } 10645 10646 /* 10647 * root's p_szc is greater than pp's p_szc. 10648 * hat_page_demote() is not done with all pages 10649 * yet. Wait for it to complete. 10650 */ 10651 mutex_exit(mtx); 10652 rootpp = PP_GROUPLEADER(rootpp, rszc); 10653 mtx = SFMMU_MLSPL_MTX(type, rootpp); 10654 mutex_enter(mtx); 10655 mutex_exit(mtx); 10656 prev_rszc = rszc; 10657 goto again; 10658 } 10659 10660 static int 10661 sfmmu_mlspl_held(struct page *pp, int type) 10662 { 10663 kmutex_t *mtx; 10664 10665 ASSERT(pp != NULL); 10666 /* The lock lives in the root page */ 10667 pp = PP_PAGEROOT(pp); 10668 ASSERT(pp != NULL); 10669 10670 mtx = SFMMU_MLSPL_MTX(type, pp); 10671 return (MUTEX_HELD(mtx)); 10672 } 10673 10674 static uint_t 10675 sfmmu_get_free_hblk(struct hme_blk **hmeblkpp, uint_t critical) 10676 { 10677 struct hme_blk *hblkp; 10678 10679 10680 if (freehblkp != NULL) { 10681 mutex_enter(&freehblkp_lock); 10682 if (freehblkp != NULL) { 10683 /* 10684 * If the current thread owns hblk_reserve OR this is a 10685 * critical request from sfmmu_hblk_steal(), 10686 * let it succeed even if freehblkcnt is really low. 10687 */ 10688 if (freehblkcnt <= HBLK_RESERVE_MIN && !critical) { 10689 SFMMU_STAT(sf_get_free_throttle); 10690 mutex_exit(&freehblkp_lock); 10691 return (0); 10692 } 10693 freehblkcnt--; 10694 *hmeblkpp = freehblkp; 10695 hblkp = *hmeblkpp; 10696 freehblkp = hblkp->hblk_next; 10697 mutex_exit(&freehblkp_lock); 10698 hblkp->hblk_next = NULL; 10699 SFMMU_STAT(sf_get_free_success); 10700 10701 ASSERT(hblkp->hblk_hmecnt == 0); 10702 ASSERT(hblkp->hblk_vcnt == 0); 10703 ASSERT(hblkp->hblk_nextpa == va_to_pa((caddr_t)hblkp)); 10704 10705 return (1); 10706 } 10707 mutex_exit(&freehblkp_lock); 10708 } 10709 10710 /* Check cpu hblk pending queues */ 10711 if ((*hmeblkpp = sfmmu_check_pending_hblks(TTE8K)) != NULL) { 10712 hblkp = *hmeblkpp; 10713 hblkp->hblk_next = NULL; 10714 hblkp->hblk_nextpa = va_to_pa((caddr_t)hblkp); 10715 10716 ASSERT(hblkp->hblk_hmecnt == 0); 10717 ASSERT(hblkp->hblk_vcnt == 0); 10718 10719 return (1); 10720 } 10721 10722 SFMMU_STAT(sf_get_free_fail); 10723 return (0); 10724 } 10725 10726 static uint_t 10727 sfmmu_put_free_hblk(struct hme_blk *hmeblkp, uint_t critical) 10728 { 10729 struct hme_blk *hblkp; 10730 10731 ASSERT(hmeblkp->hblk_hmecnt == 0); 10732 ASSERT(hmeblkp->hblk_vcnt == 0); 10733 ASSERT(hmeblkp->hblk_nextpa == va_to_pa((caddr_t)hmeblkp)); 10734 10735 /* 10736 * If the current thread is mapping into kernel space, 10737 * let it succeed even if freehblkcnt is at its max 10738 * so that it will avoid freeing it to kmem.
10739 * This will prevent stack overflow due to 10740 * possible recursion since kmem_cache_free() 10741 * might require creation of a slab which 10742 * in turn needs an hmeblk to map that slab; 10743 * let's break this vicious chain at the first 10744 * opportunity. 10745 */ 10746 if (freehblkcnt < HBLK_RESERVE_CNT || critical) { 10747 mutex_enter(&freehblkp_lock); 10748 if (freehblkcnt < HBLK_RESERVE_CNT || critical) { 10749 SFMMU_STAT(sf_put_free_success); 10750 freehblkcnt++; 10751 hmeblkp->hblk_next = freehblkp; 10752 freehblkp = hmeblkp; 10753 mutex_exit(&freehblkp_lock); 10754 return (1); 10755 } 10756 mutex_exit(&freehblkp_lock); 10757 } 10758 10759 /* 10760 * Bring down freehblkcnt to HBLK_RESERVE_CNT. We are here 10761 * only if freehblkcnt is at least HBLK_RESERVE_CNT *and* 10762 * we are not in the process of mapping into kernel space. 10763 */ 10764 ASSERT(!critical); 10765 while (freehblkcnt > HBLK_RESERVE_CNT) { 10766 mutex_enter(&freehblkp_lock); 10767 if (freehblkcnt > HBLK_RESERVE_CNT) { 10768 freehblkcnt--; 10769 hblkp = freehblkp; 10770 freehblkp = hblkp->hblk_next; 10771 mutex_exit(&freehblkp_lock); 10772 ASSERT(get_hblk_cache(hblkp) == sfmmu8_cache); 10773 kmem_cache_free(sfmmu8_cache, hblkp); 10774 continue; 10775 } 10776 mutex_exit(&freehblkp_lock); 10777 } 10778 SFMMU_STAT(sf_put_free_fail); 10779 return (0); 10780 } 10781 10782 static void 10783 sfmmu_hblk_swap(struct hme_blk *new) 10784 { 10785 struct hme_blk *old, *hblkp, *prev; 10786 uint64_t newpa; 10787 caddr_t base, vaddr, endaddr; 10788 struct hmehash_bucket *hmebp; 10789 struct sf_hment *osfhme, *nsfhme; 10790 page_t *pp; 10791 kmutex_t *pml; 10792 tte_t tte; 10793 struct hme_blk *list = NULL; 10794 10795 #ifdef DEBUG 10796 hmeblk_tag hblktag; 10797 struct hme_blk *found; 10798 #endif 10799 old = HBLK_RESERVE; 10800 ASSERT(!old->hblk_shared); 10801 10802 /* 10803 * save pa before bcopy clobbers it 10804 */ 10805 newpa = new->hblk_nextpa; 10806 10807 base = (caddr_t)get_hblk_base(old); 10808 endaddr = base + get_hblk_span(old); 10809 10810 /* 10811 * acquire hash bucket lock. 10812 */ 10813 hmebp = sfmmu_tteload_acquire_hashbucket(ksfmmup, base, TTE8K, 10814 SFMMU_INVALID_SHMERID); 10815 10816 /* 10817 * copy contents from old to new 10818 */ 10819 bcopy((void *)old, (void *)new, HME8BLK_SZ); 10820 10821 /* 10822 * add new to hash chain 10823 */ 10824 sfmmu_hblk_hash_add(hmebp, new, newpa); 10825 10826 /* 10827 * search hash chain for hblk_reserve; this needs to be performed 10828 * after adding new, otherwise prev won't correspond to the hblk which 10829 * is prior to old in hash chain when we call sfmmu_hblk_hash_rm to 10830 * remove old later. 10831 */ 10832 for (prev = NULL, 10833 hblkp = hmebp->hmeblkp; hblkp != NULL && hblkp != old; 10834 prev = hblkp, hblkp = hblkp->hblk_next) 10835 ; 10836 10837 if (hblkp != old) 10838 panic("sfmmu_hblk_swap: hblk_reserve not found"); 10839 10840 /* 10841 * p_mapping list is still pointing to hments in hblk_reserve; 10842 * fix up p_mapping list so that they point to hments in new. 10843 * 10844 * Since all these mappings are created by hblk_reserve_thread 10845 * on the way and it's using at least one of the buffers from each of 10846 * the newly minted slabs, there is no danger of any of these 10847 * mappings getting unloaded by another thread. 10848 * 10849 * tsbmiss could only modify ref/mod bits of hments in old/new. 
10850 * Since all of these hments hold mappings established by segkmem 10851 * and mappings in segkmem are setup with HAT_NOSYNC, ref/mod bits 10852 * have no meaning for the mappings in hblk_reserve. hments in 10853 * old and new are identical except for ref/mod bits. 10854 */ 10855 for (vaddr = base; vaddr < endaddr; vaddr += TTEBYTES(TTE8K)) { 10856 10857 HBLKTOHME(osfhme, old, vaddr); 10858 sfmmu_copytte(&osfhme->hme_tte, &tte); 10859 10860 if (TTE_IS_VALID(&tte)) { 10861 if ((pp = osfhme->hme_page) == NULL) 10862 panic("sfmmu_hblk_swap: page not mapped"); 10863 10864 pml = sfmmu_mlist_enter(pp); 10865 10866 if (pp != osfhme->hme_page) 10867 panic("sfmmu_hblk_swap: mapping changed"); 10868 10869 HBLKTOHME(nsfhme, new, vaddr); 10870 10871 HME_ADD(nsfhme, pp); 10872 HME_SUB(osfhme, pp); 10873 10874 sfmmu_mlist_exit(pml); 10875 } 10876 } 10877 10878 /* 10879 * remove old from hash chain 10880 */ 10881 sfmmu_hblk_hash_rm(hmebp, old, prev, &list, 1); 10882 10883 #ifdef DEBUG 10884 10885 hblktag.htag_id = ksfmmup; 10886 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 10887 hblktag.htag_bspage = HME_HASH_BSPAGE(base, HME_HASH_SHIFT(TTE8K)); 10888 hblktag.htag_rehash = HME_HASH_REHASH(TTE8K); 10889 HME_HASH_FAST_SEARCH(hmebp, hblktag, found); 10890 10891 if (found != new) 10892 panic("sfmmu_hblk_swap: new hblk not found"); 10893 #endif 10894 10895 SFMMU_HASH_UNLOCK(hmebp); 10896 10897 /* 10898 * Reset hblk_reserve 10899 */ 10900 bzero((void *)old, HME8BLK_SZ); 10901 old->hblk_nextpa = va_to_pa((caddr_t)old); 10902 } 10903 10904 /* 10905 * Grab the mlist mutex for both pages passed in. 10906 * 10907 * low and high will be returned as pointers to the mutexes for these pages. 10908 * low refers to the mutex residing in the lower bin of the mlist hash, while 10909 * high refers to the mutex residing in the higher bin of the mlist hash. This 10910 * is due to the locking order restrictions on the same thread grabbing 10911 * multiple mlist mutexes. The low lock must be acquired before the high lock. 10912 * 10913 * If both pages hash to the same mutex, only grab that single mutex, and 10914 * high will be returned as NULL 10915 * If the pages hash to different bins in the hash, grab the lower addressed 10916 * lock first and then the higher addressed lock in order to follow the locking 10917 * rules involved with the same thread grabbing multiple mlist mutexes. 10918 * low and high will both have non-NULL values. 10919 */ 10920 static void 10921 sfmmu_mlist_reloc_enter(struct page *targ, struct page *repl, 10922 kmutex_t **low, kmutex_t **high) 10923 { 10924 kmutex_t *mml_targ, *mml_repl; 10925 10926 /* 10927 * no need to do the dance around szc as in sfmmu_mlist_enter() 10928 * because this routine is only called by hat_page_relocate() and all 10929 * targ and repl pages are already locked EXCL so szc can't change. 
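 *
 * (Added rationale: ordering the two mutex_enter() calls below by the
 * address of the hashed mlist mutex means every thread that needs both
 * locks takes the lower-addressed mutex first, so two threads relocating
 * pages that hash to the same pair of mlist buckets cannot deadlock by
 * acquiring them in opposite orders.)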
10930 */ 10931 10932 mml_targ = MLIST_HASH(PP_PAGEROOT(targ)); 10933 mml_repl = MLIST_HASH(PP_PAGEROOT(repl)); 10934 10935 if (mml_targ == mml_repl) { 10936 *low = mml_targ; 10937 *high = NULL; 10938 } else { 10939 if (mml_targ < mml_repl) { 10940 *low = mml_targ; 10941 *high = mml_repl; 10942 } else { 10943 *low = mml_repl; 10944 *high = mml_targ; 10945 } 10946 } 10947 10948 mutex_enter(*low); 10949 if (*high) 10950 mutex_enter(*high); 10951 } 10952 10953 static void 10954 sfmmu_mlist_reloc_exit(kmutex_t *low, kmutex_t *high) 10955 { 10956 if (high) 10957 mutex_exit(high); 10958 mutex_exit(low); 10959 } 10960 10961 static hatlock_t * 10962 sfmmu_hat_enter(sfmmu_t *sfmmup) 10963 { 10964 hatlock_t *hatlockp; 10965 10966 if (sfmmup != ksfmmup) { 10967 hatlockp = TSB_HASH(sfmmup); 10968 mutex_enter(HATLOCK_MUTEXP(hatlockp)); 10969 return (hatlockp); 10970 } 10971 return (NULL); 10972 } 10973 10974 static hatlock_t * 10975 sfmmu_hat_tryenter(sfmmu_t *sfmmup) 10976 { 10977 hatlock_t *hatlockp; 10978 10979 if (sfmmup != ksfmmup) { 10980 hatlockp = TSB_HASH(sfmmup); 10981 if (mutex_tryenter(HATLOCK_MUTEXP(hatlockp)) == 0) 10982 return (NULL); 10983 return (hatlockp); 10984 } 10985 return (NULL); 10986 } 10987 10988 static void 10989 sfmmu_hat_exit(hatlock_t *hatlockp) 10990 { 10991 if (hatlockp != NULL) 10992 mutex_exit(HATLOCK_MUTEXP(hatlockp)); 10993 } 10994 10995 static void 10996 sfmmu_hat_lock_all(void) 10997 { 10998 int i; 10999 for (i = 0; i < SFMMU_NUM_LOCK; i++) 11000 mutex_enter(HATLOCK_MUTEXP(&hat_lock[i])); 11001 } 11002 11003 static void 11004 sfmmu_hat_unlock_all(void) 11005 { 11006 int i; 11007 for (i = SFMMU_NUM_LOCK - 1; i >= 0; i--) 11008 mutex_exit(HATLOCK_MUTEXP(&hat_lock[i])); 11009 } 11010 11011 int 11012 sfmmu_hat_lock_held(sfmmu_t *sfmmup) 11013 { 11014 ASSERT(sfmmup != ksfmmup); 11015 return (MUTEX_HELD(HATLOCK_MUTEXP(TSB_HASH(sfmmup)))); 11016 } 11017 11018 /* 11019 * Locking primitives to provide consistency between ISM unmap 11020 * and other operations. Since ISM unmap can take a long time, we 11021 * use HAT_ISMBUSY flag (protected by the hatlock) to avoid creating 11022 * contention on the hatlock buckets while ISM segments are being 11023 * unmapped. The tradeoff is that the flags don't prevent priority 11024 * inversion from occurring, so we must request kernel priority in 11025 * case we have to sleep to keep from getting buried while holding 11026 * the HAT_ISMBUSY flag set, which in turn could block other kernel 11027 * threads from running (for example, in sfmmu_uvatopfn()). 
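 *
 * Typical usage is expected to follow this sketch (added for
 * illustration; the "..." stands for the caller's ISM work):
 *
 *     sfmmu_ismhat_enter(sfmmup, 0);   -- sets HAT_ISMBUSY, may cv_wait
 *     ... map or unmap the ISM segment ...
 *     sfmmu_ismhat_exit(sfmmup, 0);    -- clears HAT_ISMBUSY, cv_broadcast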
11028 */ 11029 static void 11030 sfmmu_ismhat_enter(sfmmu_t *sfmmup, int hatlock_held) 11031 { 11032 hatlock_t *hatlockp; 11033 11034 THREAD_KPRI_REQUEST(); 11035 if (!hatlock_held) 11036 hatlockp = sfmmu_hat_enter(sfmmup); 11037 while (SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)) 11038 cv_wait(&sfmmup->sfmmu_tsb_cv, HATLOCK_MUTEXP(hatlockp)); 11039 SFMMU_FLAGS_SET(sfmmup, HAT_ISMBUSY); 11040 if (!hatlock_held) 11041 sfmmu_hat_exit(hatlockp); 11042 } 11043 11044 static void 11045 sfmmu_ismhat_exit(sfmmu_t *sfmmup, int hatlock_held) 11046 { 11047 hatlock_t *hatlockp; 11048 11049 if (!hatlock_held) 11050 hatlockp = sfmmu_hat_enter(sfmmup); 11051 ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)); 11052 SFMMU_FLAGS_CLEAR(sfmmup, HAT_ISMBUSY); 11053 cv_broadcast(&sfmmup->sfmmu_tsb_cv); 11054 if (!hatlock_held) 11055 sfmmu_hat_exit(hatlockp); 11056 THREAD_KPRI_RELEASE(); 11057 } 11058 11059 /* 11060 * 11061 * Algorithm: 11062 * 11063 * (1) if segkmem is not ready, allocate hblk from an array of pre-alloc'ed 11064 * hblks. 11065 * 11066 * (2) if we are allocating an hblk for mapping a slab in sfmmu_cache, 11067 * 11068 * (a) try to return an hblk from reserve pool of free hblks; 11069 * (b) if the reserve pool is empty, acquire hblk_reserve_lock 11070 * and return hblk_reserve. 11071 * 11072 * (3) call kmem_cache_alloc() to allocate hblk; 11073 * 11074 * (a) if hblk_reserve_lock is held by the current thread, 11075 * atomically replace hblk_reserve by the hblk that is 11076 * returned by kmem_cache_alloc; release hblk_reserve_lock 11077 * and call kmem_cache_alloc() again. 11078 * (b) if reserve pool is not full, add the hblk that is 11079 * returned by kmem_cache_alloc to reserve pool and 11080 * call kmem_cache_alloc again. 11081 * 11082 */ 11083 static struct hme_blk * 11084 sfmmu_hblk_alloc(sfmmu_t *sfmmup, caddr_t vaddr, 11085 struct hmehash_bucket *hmebp, uint_t size, hmeblk_tag hblktag, 11086 uint_t flags, uint_t rid) 11087 { 11088 struct hme_blk *hmeblkp = NULL; 11089 struct hme_blk *newhblkp; 11090 struct hme_blk *shw_hblkp = NULL; 11091 struct kmem_cache *sfmmu_cache = NULL; 11092 uint64_t hblkpa; 11093 ulong_t index; 11094 uint_t owner; /* set to 1 if using hblk_reserve */ 11095 uint_t forcefree; 11096 int sleep; 11097 sf_srd_t *srdp; 11098 sf_region_t *rgnp; 11099 11100 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 11101 ASSERT(hblktag.htag_rid == rid); 11102 SFMMU_VALIDATE_HMERID(sfmmup, rid, vaddr, TTEBYTES(size)); 11103 ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || 11104 IS_P2ALIGNED(vaddr, TTEBYTES(size))); 11105 11106 /* 11107 * If segkmem is not created yet, allocate from static hmeblks 11108 * created at the end of startup_modules(). See the block comment 11109 * in startup_modules() describing how we estimate the number of 11110 * static hmeblks that will be needed during re-map. 11111 */ 11112 if (!hblk_alloc_dynamic) { 11113 11114 ASSERT(!SFMMU_IS_SHMERID_VALID(rid)); 11115 11116 if (size == TTE8K) { 11117 index = nucleus_hblk8.index; 11118 if (index >= nucleus_hblk8.len) { 11119 /* 11120 * If we panic here, see startup_modules() to 11121 * make sure that we are calculating the 11122 * number of hblk8's that we need correctly. 11123 */ 11124 prom_panic("no nucleus hblk8 to allocate"); 11125 } 11126 hmeblkp = 11127 (struct hme_blk *)&nucleus_hblk8.list[index]; 11128 nucleus_hblk8.index++; 11129 SFMMU_STAT(sf_hblk8_nalloc); 11130 } else { 11131 index = nucleus_hblk1.index; 11132 if (nucleus_hblk1.index >= nucleus_hblk1.len) { 11133 /* 11134 * If we panic here, see startup_modules(). 
11135 * Most likely you need to update the 11136 * calculation of the number of hblk1 elements 11137 * that the kernel needs to boot. 11138 */ 11139 prom_panic("no nucleus hblk1 to allocate"); 11140 } 11141 hmeblkp = 11142 (struct hme_blk *)&nucleus_hblk1.list[index]; 11143 nucleus_hblk1.index++; 11144 SFMMU_STAT(sf_hblk1_nalloc); 11145 } 11146 11147 goto hblk_init; 11148 } 11149 11150 SFMMU_HASH_UNLOCK(hmebp); 11151 11152 if (sfmmup != KHATID && !SFMMU_IS_SHMERID_VALID(rid)) { 11153 if (mmu_page_sizes == max_mmu_page_sizes) { 11154 if (size < TTE256M) 11155 shw_hblkp = sfmmu_shadow_hcreate(sfmmup, vaddr, 11156 size, flags); 11157 } else { 11158 if (size < TTE4M) 11159 shw_hblkp = sfmmu_shadow_hcreate(sfmmup, vaddr, 11160 size, flags); 11161 } 11162 } else if (SFMMU_IS_SHMERID_VALID(rid)) { 11163 /* 11164 * Shared hmes use per region bitmaps in rgn_hmeflag 11165 * rather than shadow hmeblks to keep track of the 11166 * mapping sizes which have been allocated for the region. 11167 * Here we cleanup old invalid hmeblks with this rid, 11168 * which may be left around by pageunload(). 11169 */ 11170 int ttesz; 11171 caddr_t va; 11172 caddr_t eva = vaddr + TTEBYTES(size); 11173 11174 ASSERT(sfmmup != KHATID); 11175 11176 srdp = sfmmup->sfmmu_srdp; 11177 ASSERT(srdp != NULL && srdp->srd_refcnt != 0); 11178 rgnp = srdp->srd_hmergnp[rid]; 11179 ASSERT(rgnp != NULL && rgnp->rgn_id == rid); 11180 ASSERT(rgnp->rgn_refcnt != 0); 11181 ASSERT(size <= rgnp->rgn_pgszc); 11182 11183 ttesz = HBLK_MIN_TTESZ; 11184 do { 11185 if (!(rgnp->rgn_hmeflags & (0x1 << ttesz))) { 11186 continue; 11187 } 11188 11189 if (ttesz > size && ttesz != HBLK_MIN_TTESZ) { 11190 sfmmu_cleanup_rhblk(srdp, vaddr, rid, ttesz); 11191 } else if (ttesz < size) { 11192 for (va = vaddr; va < eva; 11193 va += TTEBYTES(ttesz)) { 11194 sfmmu_cleanup_rhblk(srdp, va, rid, 11195 ttesz); 11196 } 11197 } 11198 } while (++ttesz <= rgnp->rgn_pgszc); 11199 } 11200 11201 fill_hblk: 11202 owner = (hblk_reserve_thread == curthread) ? 1 : 0; 11203 11204 if (owner && size == TTE8K) { 11205 11206 ASSERT(!SFMMU_IS_SHMERID_VALID(rid)); 11207 /* 11208 * We are really in a tight spot. We already own 11209 * hblk_reserve and we need another hblk. In anticipation 11210 * of this kind of scenario, we specifically set aside 11211 * HBLK_RESERVE_MIN number of hblks to be used exclusively 11212 * by owner of hblk_reserve. 11213 */ 11214 SFMMU_STAT(sf_hblk_recurse_cnt); 11215 11216 if (!sfmmu_get_free_hblk(&hmeblkp, 1)) 11217 panic("sfmmu_hblk_alloc: reserve list is empty"); 11218 11219 goto hblk_verify; 11220 } 11221 11222 ASSERT(!owner); 11223 11224 if ((flags & HAT_NO_KALLOC) == 0) { 11225 11226 sfmmu_cache = ((size == TTE8K) ? sfmmu8_cache : sfmmu1_cache); 11227 sleep = ((sfmmup == KHATID) ? KM_NOSLEEP : KM_SLEEP); 11228 11229 if ((hmeblkp = kmem_cache_alloc(sfmmu_cache, sleep)) == NULL) { 11230 hmeblkp = sfmmu_hblk_steal(size); 11231 } else { 11232 /* 11233 * if we are the owner of hblk_reserve, 11234 * swap hblk_reserve with hmeblkp and 11235 * start a fresh life. Hope things go 11236 * better this time. 
11237 */ 11238 if (hblk_reserve_thread == curthread) { 11239 ASSERT(sfmmu_cache == sfmmu8_cache); 11240 sfmmu_hblk_swap(hmeblkp); 11241 hblk_reserve_thread = NULL; 11242 mutex_exit(&hblk_reserve_lock); 11243 goto fill_hblk; 11244 } 11245 /* 11246 * let's donate this hblk to our reserve list if 11247 * we are not mapping kernel range 11248 */ 11249 if (size == TTE8K && sfmmup != KHATID) { 11250 if (sfmmu_put_free_hblk(hmeblkp, 0)) 11251 goto fill_hblk; 11252 } 11253 } 11254 } else { 11255 /* 11256 * We are here to map the slab in sfmmu8_cache; let's 11257 * check if we could tap our reserve list; if successful, 11258 * this will avoid the pain of going thru sfmmu_hblk_swap 11259 */ 11260 SFMMU_STAT(sf_hblk_slab_cnt); 11261 if (!sfmmu_get_free_hblk(&hmeblkp, 0)) { 11262 /* 11263 * let's start hblk_reserve dance 11264 */ 11265 SFMMU_STAT(sf_hblk_reserve_cnt); 11266 owner = 1; 11267 mutex_enter(&hblk_reserve_lock); 11268 hmeblkp = HBLK_RESERVE; 11269 hblk_reserve_thread = curthread; 11270 } 11271 } 11272 11273 hblk_verify: 11274 ASSERT(hmeblkp != NULL); 11275 set_hblk_sz(hmeblkp, size); 11276 ASSERT(hmeblkp->hblk_nextpa == va_to_pa((caddr_t)hmeblkp)); 11277 SFMMU_HASH_LOCK(hmebp); 11278 HME_HASH_FAST_SEARCH(hmebp, hblktag, newhblkp); 11279 if (newhblkp != NULL) { 11280 SFMMU_HASH_UNLOCK(hmebp); 11281 if (hmeblkp != HBLK_RESERVE) { 11282 /* 11283 * This is really tricky! 11284 * 11285 * vmem_alloc(vmem_seg_arena) 11286 * vmem_alloc(vmem_internal_arena) 11287 * segkmem_alloc(heap_arena) 11288 * vmem_alloc(heap_arena) 11289 * page_create() 11290 * hat_memload() 11291 * kmem_cache_free() 11292 * kmem_cache_alloc() 11293 * kmem_slab_create() 11294 * vmem_alloc(kmem_internal_arena) 11295 * segkmem_alloc(heap_arena) 11296 * vmem_alloc(heap_arena) 11297 * page_create() 11298 * hat_memload() 11299 * kmem_cache_free() 11300 * ... 11301 * 11302 * Thus, hat_memload() could call kmem_cache_free 11303 * for enough number of times that we could easily 11304 * hit the bottom of the stack or run out of reserve 11305 * list of vmem_seg structs. So, we must donate 11306 * this hblk to reserve list if it's allocated 11307 * from sfmmu8_cache *and* mapping kernel range. 11308 * We don't need to worry about freeing hmeblk1's 11309 * to kmem since they don't map any kmem slabs. 11310 * 11311 * Note: When segkmem supports largepages, we must 11312 * free hmeblk1's to reserve list as well. 11313 */ 11314 forcefree = (sfmmup == KHATID) ? 1 : 0; 11315 if (size == TTE8K && 11316 sfmmu_put_free_hblk(hmeblkp, forcefree)) { 11317 goto re_verify; 11318 } 11319 ASSERT(sfmmup != KHATID); 11320 kmem_cache_free(get_hblk_cache(hmeblkp), hmeblkp); 11321 } else { 11322 /* 11323 * Hey! we don't need hblk_reserve any more. 11324 */ 11325 ASSERT(owner); 11326 hblk_reserve_thread = NULL; 11327 mutex_exit(&hblk_reserve_lock); 11328 owner = 0; 11329 } 11330 re_verify: 11331 /* 11332 * let's check if the goodies are still present 11333 */ 11334 SFMMU_HASH_LOCK(hmebp); 11335 HME_HASH_FAST_SEARCH(hmebp, hblktag, newhblkp); 11336 if (newhblkp != NULL) { 11337 /* 11338 * return newhblkp if it's not hblk_reserve; 11339 * if newhblkp is hblk_reserve, return it 11340 * _only if_ we are the owner of hblk_reserve. 
11341 */ 11342 if (newhblkp != HBLK_RESERVE || owner) { 11343 ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || 11344 newhblkp->hblk_shared); 11345 ASSERT(SFMMU_IS_SHMERID_VALID(rid) || 11346 !newhblkp->hblk_shared); 11347 return (newhblkp); 11348 } else { 11349 /* 11350 * we just hit hblk_reserve in the hash and 11351 * we are not the owner of that; 11352 * 11353 * block until hblk_reserve_thread completes 11354 * swapping hblk_reserve and try the dance 11355 * once again. 11356 */ 11357 SFMMU_HASH_UNLOCK(hmebp); 11358 mutex_enter(&hblk_reserve_lock); 11359 mutex_exit(&hblk_reserve_lock); 11360 SFMMU_STAT(sf_hblk_reserve_hit); 11361 goto fill_hblk; 11362 } 11363 } else { 11364 /* 11365 * it's no more! try the dance once again. 11366 */ 11367 SFMMU_HASH_UNLOCK(hmebp); 11368 goto fill_hblk; 11369 } 11370 } 11371 11372 hblk_init: 11373 if (SFMMU_IS_SHMERID_VALID(rid)) { 11374 uint16_t tteflag = 0x1 << 11375 ((size < HBLK_MIN_TTESZ) ? HBLK_MIN_TTESZ : size); 11376 11377 if (!(rgnp->rgn_hmeflags & tteflag)) { 11378 atomic_or_16(&rgnp->rgn_hmeflags, tteflag); 11379 } 11380 hmeblkp->hblk_shared = 1; 11381 } else { 11382 hmeblkp->hblk_shared = 0; 11383 } 11384 set_hblk_sz(hmeblkp, size); 11385 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 11386 hmeblkp->hblk_next = (struct hme_blk *)NULL; 11387 hmeblkp->hblk_tag = hblktag; 11388 hmeblkp->hblk_shadow = shw_hblkp; 11389 hblkpa = hmeblkp->hblk_nextpa; 11390 hmeblkp->hblk_nextpa = HMEBLK_ENDPA; 11391 11392 ASSERT(get_hblk_ttesz(hmeblkp) == size); 11393 ASSERT(get_hblk_span(hmeblkp) == HMEBLK_SPAN(size)); 11394 ASSERT(hmeblkp->hblk_hmecnt == 0); 11395 ASSERT(hmeblkp->hblk_vcnt == 0); 11396 ASSERT(hmeblkp->hblk_lckcnt == 0); 11397 ASSERT(hblkpa == va_to_pa((caddr_t)hmeblkp)); 11398 sfmmu_hblk_hash_add(hmebp, hmeblkp, hblkpa); 11399 return (hmeblkp); 11400 } 11401 11402 /* 11403 * This function cleans up the hme_blk and returns it to the free list. 11404 */ 11405 /* ARGSUSED */ 11406 static void 11407 sfmmu_hblk_free(struct hme_blk **listp) 11408 { 11409 struct hme_blk *hmeblkp, *next_hmeblkp; 11410 int size; 11411 uint_t critical; 11412 uint64_t hblkpa; 11413 11414 ASSERT(*listp != NULL); 11415 11416 hmeblkp = *listp; 11417 while (hmeblkp != NULL) { 11418 next_hmeblkp = hmeblkp->hblk_next; 11419 ASSERT(!hmeblkp->hblk_hmecnt); 11420 ASSERT(!hmeblkp->hblk_vcnt); 11421 ASSERT(!hmeblkp->hblk_lckcnt); 11422 ASSERT(hmeblkp != (struct hme_blk *)hblk_reserve); 11423 ASSERT(hmeblkp->hblk_shared == 0); 11424 ASSERT(hmeblkp->hblk_shw_bit == 0); 11425 ASSERT(hmeblkp->hblk_shadow == NULL); 11426 11427 hblkpa = va_to_pa((caddr_t)hmeblkp); 11428 ASSERT(hblkpa != (uint64_t)-1); 11429 critical = (hblktosfmmu(hmeblkp) == KHATID) ? 1 : 0; 11430 11431 size = get_hblk_ttesz(hmeblkp); 11432 hmeblkp->hblk_next = NULL; 11433 hmeblkp->hblk_nextpa = hblkpa; 11434 11435 if (hmeblkp->hblk_nuc_bit == 0) { 11436 11437 if (size != TTE8K || 11438 !sfmmu_put_free_hblk(hmeblkp, critical)) 11439 kmem_cache_free(get_hblk_cache(hmeblkp), 11440 hmeblkp); 11441 } 11442 hmeblkp = next_hmeblkp; 11443 } 11444 } 11445 11446 #define BUCKETS_TO_SEARCH_BEFORE_UNLOAD 30 11447 #define SFMMU_HBLK_STEAL_THRESHOLD 5 11448 11449 static uint_t sfmmu_hblk_steal_twice; 11450 static uint_t sfmmu_hblk_steal_count, sfmmu_hblk_steal_unload_count; 11451 11452 /* 11453 * Steal a hmeblk from user or kernel hme hash lists. 11454 * For 8K tte grab one from reserve pool (freehblkp) before proceeding to 11455 * steal and if we fail to steal after SFMMU_HBLK_STEAL_THRESHOLD attempts 11456 * tap into critical reserve of freehblkp. 
11457 * Note: We remain looping in this routine until we find one. 11458 */ 11459 static struct hme_blk * 11460 sfmmu_hblk_steal(int size) 11461 { 11462 static struct hmehash_bucket *uhmehash_steal_hand = NULL; 11463 struct hmehash_bucket *hmebp; 11464 struct hme_blk *hmeblkp = NULL, *pr_hblk; 11465 uint64_t hblkpa; 11466 int i; 11467 uint_t loop_cnt = 0, critical; 11468 11469 for (;;) { 11470 /* Check cpu hblk pending queues */ 11471 if ((hmeblkp = sfmmu_check_pending_hblks(size)) != NULL) { 11472 hmeblkp->hblk_nextpa = va_to_pa((caddr_t)hmeblkp); 11473 ASSERT(hmeblkp->hblk_hmecnt == 0); 11474 ASSERT(hmeblkp->hblk_vcnt == 0); 11475 return (hmeblkp); 11476 } 11477 11478 if (size == TTE8K) { 11479 critical = 11480 (++loop_cnt > SFMMU_HBLK_STEAL_THRESHOLD) ? 1 : 0; 11481 if (sfmmu_get_free_hblk(&hmeblkp, critical)) 11482 return (hmeblkp); 11483 } 11484 11485 hmebp = (uhmehash_steal_hand == NULL) ? uhme_hash : 11486 uhmehash_steal_hand; 11487 ASSERT(hmebp >= uhme_hash && hmebp <= &uhme_hash[UHMEHASH_SZ]); 11488 11489 for (i = 0; hmeblkp == NULL && i <= UHMEHASH_SZ + 11490 BUCKETS_TO_SEARCH_BEFORE_UNLOAD; i++) { 11491 SFMMU_HASH_LOCK(hmebp); 11492 hmeblkp = hmebp->hmeblkp; 11493 hblkpa = hmebp->hmeh_nextpa; 11494 pr_hblk = NULL; 11495 while (hmeblkp) { 11496 /* 11497 * check if it is a hmeblk that is not locked 11498 * and not shared. skip shadow hmeblks with 11499 * shadow_mask set i.e valid count non zero. 11500 */ 11501 if ((get_hblk_ttesz(hmeblkp) == size) && 11502 (hmeblkp->hblk_shw_bit == 0 || 11503 hmeblkp->hblk_vcnt == 0) && 11504 (hmeblkp->hblk_lckcnt == 0)) { 11505 /* 11506 * there is a high probability that we 11507 * will find a free one. search some 11508 * buckets for a free hmeblk initially 11509 * before unloading a valid hmeblk. 11510 */ 11511 if ((hmeblkp->hblk_vcnt == 0 && 11512 hmeblkp->hblk_hmecnt == 0) || (i >= 11513 BUCKETS_TO_SEARCH_BEFORE_UNLOAD)) { 11514 if (sfmmu_steal_this_hblk(hmebp, 11515 hmeblkp, hblkpa, pr_hblk)) { 11516 /* 11517 * Hblk is unloaded 11518 * successfully 11519 */ 11520 break; 11521 } 11522 } 11523 } 11524 pr_hblk = hmeblkp; 11525 hblkpa = hmeblkp->hblk_nextpa; 11526 hmeblkp = hmeblkp->hblk_next; 11527 } 11528 11529 SFMMU_HASH_UNLOCK(hmebp); 11530 if (hmebp++ == &uhme_hash[UHMEHASH_SZ]) 11531 hmebp = uhme_hash; 11532 } 11533 uhmehash_steal_hand = hmebp; 11534 11535 if (hmeblkp != NULL) 11536 break; 11537 11538 /* 11539 * in the worst case, look for a free one in the kernel 11540 * hash table. 11541 */ 11542 for (i = 0, hmebp = khme_hash; i <= KHMEHASH_SZ; i++) { 11543 SFMMU_HASH_LOCK(hmebp); 11544 hmeblkp = hmebp->hmeblkp; 11545 hblkpa = hmebp->hmeh_nextpa; 11546 pr_hblk = NULL; 11547 while (hmeblkp) { 11548 /* 11549 * check if it is free hmeblk 11550 */ 11551 if ((get_hblk_ttesz(hmeblkp) == size) && 11552 (hmeblkp->hblk_lckcnt == 0) && 11553 (hmeblkp->hblk_vcnt == 0) && 11554 (hmeblkp->hblk_hmecnt == 0)) { 11555 if (sfmmu_steal_this_hblk(hmebp, 11556 hmeblkp, hblkpa, pr_hblk)) { 11557 break; 11558 } else { 11559 /* 11560 * Cannot fail since we have 11561 * hash lock. 
11562 */ 11563 panic("fail to steal?"); 11564 } 11565 } 11566 11567 pr_hblk = hmeblkp; 11568 hblkpa = hmeblkp->hblk_nextpa; 11569 hmeblkp = hmeblkp->hblk_next; 11570 } 11571 11572 SFMMU_HASH_UNLOCK(hmebp); 11573 if (hmebp++ == &khme_hash[KHMEHASH_SZ]) 11574 hmebp = khme_hash; 11575 } 11576 11577 if (hmeblkp != NULL) 11578 break; 11579 sfmmu_hblk_steal_twice++; 11580 } 11581 return (hmeblkp); 11582 } 11583 11584 /* 11585 * This routine does real work to prepare a hblk to be "stolen" by 11586 * unloading the mappings, updating shadow counts .... 11587 * It returns 1 if the block is ready to be reused (stolen), or 0 11588 * means the block cannot be stolen yet- pageunload is still working 11589 * on this hblk. 11590 */ 11591 static int 11592 sfmmu_steal_this_hblk(struct hmehash_bucket *hmebp, struct hme_blk *hmeblkp, 11593 uint64_t hblkpa, struct hme_blk *pr_hblk) 11594 { 11595 int shw_size, vshift; 11596 struct hme_blk *shw_hblkp; 11597 caddr_t vaddr; 11598 uint_t shw_mask, newshw_mask; 11599 struct hme_blk *list = NULL; 11600 11601 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 11602 11603 /* 11604 * check if the hmeblk is free, unload if necessary 11605 */ 11606 if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) { 11607 sfmmu_t *sfmmup; 11608 demap_range_t dmr; 11609 11610 sfmmup = hblktosfmmu(hmeblkp); 11611 if (hmeblkp->hblk_shared || sfmmup->sfmmu_ismhat) { 11612 return (0); 11613 } 11614 DEMAP_RANGE_INIT(sfmmup, &dmr); 11615 (void) sfmmu_hblk_unload(sfmmup, hmeblkp, 11616 (caddr_t)get_hblk_base(hmeblkp), 11617 get_hblk_endaddr(hmeblkp), &dmr, HAT_UNLOAD); 11618 DEMAP_RANGE_FLUSH(&dmr); 11619 if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) { 11620 /* 11621 * Pageunload is working on the same hblk. 11622 */ 11623 return (0); 11624 } 11625 11626 sfmmu_hblk_steal_unload_count++; 11627 } 11628 11629 ASSERT(hmeblkp->hblk_lckcnt == 0); 11630 ASSERT(hmeblkp->hblk_vcnt == 0 && hmeblkp->hblk_hmecnt == 0); 11631 11632 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, &list, 1); 11633 hmeblkp->hblk_nextpa = hblkpa; 11634 11635 shw_hblkp = hmeblkp->hblk_shadow; 11636 if (shw_hblkp) { 11637 ASSERT(!hmeblkp->hblk_shared); 11638 shw_size = get_hblk_ttesz(shw_hblkp); 11639 vaddr = (caddr_t)get_hblk_base(hmeblkp); 11640 vshift = vaddr_to_vshift(shw_hblkp->hblk_tag, vaddr, shw_size); 11641 ASSERT(vshift < 8); 11642 /* 11643 * Atomically clear shadow mask bit 11644 */ 11645 do { 11646 shw_mask = shw_hblkp->hblk_shw_mask; 11647 ASSERT(shw_mask & (1 << vshift)); 11648 newshw_mask = shw_mask & ~(1 << vshift); 11649 newshw_mask = cas32(&shw_hblkp->hblk_shw_mask, 11650 shw_mask, newshw_mask); 11651 } while (newshw_mask != shw_mask); 11652 hmeblkp->hblk_shadow = NULL; 11653 } 11654 11655 /* 11656 * remove shadow bit if we are stealing an unused shadow hmeblk. 11657 * sfmmu_hblk_alloc needs it that way, will set shadow bit later if 11658 * we are indeed allocating a shadow hmeblk. 
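 *
 * (Added note on the loop above: the cas32() retry is the usual
 * lock-free read-modify-write pattern; hblk_shw_mask is re-read and the
 * compare-and-swap retried until the cleared vshift bit is installed
 * without overwriting a concurrent update.)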
11659 */ 11660 hmeblkp->hblk_shw_bit = 0; 11661 11662 if (hmeblkp->hblk_shared) { 11663 sf_srd_t *srdp; 11664 sf_region_t *rgnp; 11665 uint_t rid; 11666 11667 srdp = hblktosrd(hmeblkp); 11668 ASSERT(srdp != NULL && srdp->srd_refcnt != 0); 11669 rid = hmeblkp->hblk_tag.htag_rid; 11670 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 11671 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 11672 rgnp = srdp->srd_hmergnp[rid]; 11673 ASSERT(rgnp != NULL); 11674 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid); 11675 hmeblkp->hblk_shared = 0; 11676 } 11677 11678 sfmmu_hblk_steal_count++; 11679 SFMMU_STAT(sf_steal_count); 11680 11681 return (1); 11682 } 11683 11684 struct hme_blk * 11685 sfmmu_hmetohblk(struct sf_hment *sfhme) 11686 { 11687 struct hme_blk *hmeblkp; 11688 struct sf_hment *sfhme0; 11689 struct hme_blk *hblk_dummy = 0; 11690 11691 /* 11692 * No dummy sf_hments, please. 11693 */ 11694 ASSERT(sfhme->hme_tte.ll != 0); 11695 11696 sfhme0 = sfhme - sfhme->hme_tte.tte_hmenum; 11697 hmeblkp = (struct hme_blk *)((uintptr_t)sfhme0 - 11698 (uintptr_t)&hblk_dummy->hblk_hme[0]); 11699 11700 return (hmeblkp); 11701 } 11702 11703 /* 11704 * On swapin, get appropriately sized TSB(s) and clear the HAT_SWAPPED flag. 11705 * If we can't get appropriately sized TSB(s), try for 8K TSB(s) using 11706 * KM_SLEEP allocation. 11707 * 11708 * This routine does not return a value; failure to obtain the preferred sizes is handled internally by falling back to a minimum-sized TSB. 11709 */ 11710 static void 11711 sfmmu_tsb_swapin(sfmmu_t *sfmmup, hatlock_t *hatlockp) 11712 { 11713 struct tsb_info *tsbinfop, *next; 11714 tsb_replace_rc_t rc; 11715 boolean_t gotfirst = B_FALSE; 11716 11717 ASSERT(sfmmup != ksfmmup); 11718 ASSERT(sfmmu_hat_lock_held(sfmmup)); 11719 11720 while (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPIN)) { 11721 cv_wait(&sfmmup->sfmmu_tsb_cv, HATLOCK_MUTEXP(hatlockp)); 11722 } 11723 11724 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) { 11725 SFMMU_FLAGS_SET(sfmmup, HAT_SWAPIN); 11726 } else { 11727 return; 11728 } 11729 11730 ASSERT(sfmmup->sfmmu_tsb != NULL); 11731 11732 /* 11733 * Loop over all tsbinfo's replacing them with ones that actually have 11734 * a TSB. If any of the replacements ever fail, bail out of the loop. 11735 */ 11736 for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL; tsbinfop = next) { 11737 ASSERT(tsbinfop->tsb_flags & TSB_SWAPPED); 11738 next = tsbinfop->tsb_next; 11739 rc = sfmmu_replace_tsb(sfmmup, tsbinfop, tsbinfop->tsb_szc, 11740 hatlockp, TSB_SWAPIN); 11741 if (rc != TSB_SUCCESS) { 11742 break; 11743 } 11744 gotfirst = B_TRUE; 11745 } 11746 11747 switch (rc) { 11748 case TSB_SUCCESS: 11749 SFMMU_FLAGS_CLEAR(sfmmup, HAT_SWAPPED|HAT_SWAPIN); 11750 cv_broadcast(&sfmmup->sfmmu_tsb_cv); 11751 return; 11752 case TSB_LOSTRACE: 11753 break; 11754 case TSB_ALLOCFAIL: 11755 break; 11756 default: 11757 panic("sfmmu_replace_tsb returned unrecognized failure code " 11758 "%d", rc); 11759 } 11760 11761 /* 11762 * In this case, we failed to get one of our TSBs. If we failed to 11763 * get the first TSB, get one of minimum size (8KB). Walk the list 11764 * and throw away the tsbinfos, starting where the allocation failed; 11765 * we can get by with just one TSB as long as we don't leave the 11766 * SWAPPED tsbinfo structures lying around.
11767 */ 11768 tsbinfop = sfmmup->sfmmu_tsb; 11769 next = tsbinfop->tsb_next; 11770 tsbinfop->tsb_next = NULL; 11771 11772 sfmmu_hat_exit(hatlockp); 11773 for (tsbinfop = next; tsbinfop != NULL; tsbinfop = next) { 11774 next = tsbinfop->tsb_next; 11775 sfmmu_tsbinfo_free(tsbinfop); 11776 } 11777 hatlockp = sfmmu_hat_enter(sfmmup); 11778 11779 /* 11780 * If we don't have any TSBs, get a single 8K TSB for 8K, 64K and 512K 11781 * pages. 11782 */ 11783 if (!gotfirst) { 11784 tsbinfop = sfmmup->sfmmu_tsb; 11785 rc = sfmmu_replace_tsb(sfmmup, tsbinfop, TSB_MIN_SZCODE, 11786 hatlockp, TSB_SWAPIN | TSB_FORCEALLOC); 11787 ASSERT(rc == TSB_SUCCESS); 11788 } 11789 11790 SFMMU_FLAGS_CLEAR(sfmmup, HAT_SWAPPED|HAT_SWAPIN); 11791 cv_broadcast(&sfmmup->sfmmu_tsb_cv); 11792 } 11793 11794 static int 11795 sfmmu_is_rgnva(sf_srd_t *srdp, caddr_t addr, ulong_t w, ulong_t bmw) 11796 { 11797 ulong_t bix = 0; 11798 uint_t rid; 11799 sf_region_t *rgnp; 11800 11801 ASSERT(srdp != NULL); 11802 ASSERT(srdp->srd_refcnt != 0); 11803 11804 w <<= BT_ULSHIFT; 11805 while (bmw) { 11806 if (!(bmw & 0x1)) { 11807 bix++; 11808 bmw >>= 1; 11809 continue; 11810 } 11811 rid = w | bix; 11812 rgnp = srdp->srd_hmergnp[rid]; 11813 ASSERT(rgnp->rgn_refcnt > 0); 11814 ASSERT(rgnp->rgn_id == rid); 11815 if (addr < rgnp->rgn_saddr || 11816 addr >= (rgnp->rgn_saddr + rgnp->rgn_size)) { 11817 bix++; 11818 bmw >>= 1; 11819 } else { 11820 return (1); 11821 } 11822 } 11823 return (0); 11824 } 11825 11826 /* 11827 * Handle exceptions for low level tsb_handler. 11828 * 11829 * There are many scenarios that could land us here: 11830 * 11831 * If the context is invalid we land here. The context can be invalid 11832 * for 3 reasons: 1) we couldn't allocate a new context and now need to 11833 * perform a wrap around operation in order to allocate a new context. 11834 * 2) Context was invalidated to change pagesize programming. 3) The ISM or 11835 * TSB configuration is changing for this process and we are forced in 11836 * here to do a synchronization operation. If the context is valid we can 11837 * be here from the window trap handler. In this case just call trap to handle 11838 * the fault. 11839 * 11840 * Note that the process will run in INVALID_CONTEXT before 11841 * faulting into here and subsequently loading the MMU registers 11842 * (including the TSB base register) associated with this process. 11843 * For this reason, the trap handlers must all test for 11844 * INVALID_CONTEXT before attempting to access any registers other 11845 * than the context registers. 11846 */ 11847 void 11848 sfmmu_tsbmiss_exception(struct regs *rp, uintptr_t tagaccess, uint_t traptype) 11849 { 11850 sfmmu_t *sfmmup, *shsfmmup; 11851 uint_t ctxtype; 11852 klwp_id_t lwp; 11853 char lwp_save_state; 11854 hatlock_t *hatlockp, *shatlockp; 11855 struct tsb_info *tsbinfop; 11856 struct tsbmiss *tsbmp; 11857 sf_scd_t *scdp; 11858 11859 SFMMU_STAT(sf_tsb_exceptions); 11860 SFMMU_MMU_STAT(mmu_tsb_exceptions); 11861 sfmmup = astosfmmu(curthread->t_procp->p_as); 11862 /* 11863 * note that in sun4u, the tagaccess register contains the ctxnum 11864 * while sun4v passes ctxtype in the tagaccess register.
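 * Either way, only the low TAGACC_CTX_MASK bits of the register are examined below, so the same extraction works on both platforms.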
11865 */ 11866 ctxtype = tagaccess & TAGACC_CTX_MASK; 11867 11868 ASSERT(sfmmup != ksfmmup && ctxtype != KCONTEXT); 11869 ASSERT(sfmmup->sfmmu_ismhat == 0); 11870 ASSERT(!SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED) || 11871 ctxtype == INVALID_CONTEXT); 11872 11873 if (ctxtype != INVALID_CONTEXT && traptype != T_DATA_PROT) { 11874 /* 11875 * We may land here because shme bitmap and pagesize 11876 * flags are updated lazily in tsbmiss area on other cpus. 11877 * If we detect here that tsbmiss area is out of sync with 11878 * sfmmu update it and retry the trapped instruction. 11879 * Otherwise call trap(). 11880 */ 11881 int ret = 0; 11882 uchar_t tteflag_mask = (1 << TTE64K) | (1 << TTE8K); 11883 caddr_t addr = (caddr_t)(tagaccess & TAGACC_VADDR_MASK); 11884 11885 /* 11886 * Must set lwp state to LWP_SYS before 11887 * trying to acquire any adaptive lock 11888 */ 11889 lwp = ttolwp(curthread); 11890 ASSERT(lwp); 11891 lwp_save_state = lwp->lwp_state; 11892 lwp->lwp_state = LWP_SYS; 11893 11894 hatlockp = sfmmu_hat_enter(sfmmup); 11895 kpreempt_disable(); 11896 tsbmp = &tsbmiss_area[CPU->cpu_id]; 11897 ASSERT(sfmmup == tsbmp->usfmmup); 11898 if (((tsbmp->uhat_tteflags ^ sfmmup->sfmmu_tteflags) & 11899 ~tteflag_mask) || 11900 ((tsbmp->uhat_rtteflags ^ sfmmup->sfmmu_rtteflags) & 11901 ~tteflag_mask)) { 11902 tsbmp->uhat_tteflags = sfmmup->sfmmu_tteflags; 11903 tsbmp->uhat_rtteflags = sfmmup->sfmmu_rtteflags; 11904 ret = 1; 11905 } 11906 if (sfmmup->sfmmu_srdp != NULL) { 11907 ulong_t *sm = sfmmup->sfmmu_hmeregion_map.bitmap; 11908 ulong_t *tm = tsbmp->shmermap; 11909 ulong_t i; 11910 for (i = 0; i < SFMMU_HMERGNMAP_WORDS; i++) { 11911 ulong_t d = tm[i] ^ sm[i]; 11912 if (d) { 11913 if (d & sm[i]) { 11914 if (!ret && sfmmu_is_rgnva( 11915 sfmmup->sfmmu_srdp, 11916 addr, i, d & sm[i])) { 11917 ret = 1; 11918 } 11919 } 11920 tm[i] = sm[i]; 11921 } 11922 } 11923 } 11924 kpreempt_enable(); 11925 sfmmu_hat_exit(hatlockp); 11926 lwp->lwp_state = lwp_save_state; 11927 if (ret) { 11928 return; 11929 } 11930 } else if (ctxtype == INVALID_CONTEXT) { 11931 /* 11932 * First, make sure we come out of here with a valid ctx, 11933 * since if we don't get one we'll simply loop on the 11934 * faulting instruction. 11935 * 11936 * If the ISM mappings are changing, the TSB is relocated, 11937 * the process is swapped, the process is joining SCD or 11938 * leaving SCD or shared regions we serialize behind the 11939 * controlling thread with hat lock, sfmmu_flags and 11940 * sfmmu_tsb_cv condition variable. 11941 */ 11942 11943 /* 11944 * Must set lwp state to LWP_SYS before 11945 * trying to acquire any adaptive lock 11946 */ 11947 lwp = ttolwp(curthread); 11948 ASSERT(lwp); 11949 lwp_save_state = lwp->lwp_state; 11950 lwp->lwp_state = LWP_SYS; 11951 11952 hatlockp = sfmmu_hat_enter(sfmmup); 11953 retry: 11954 if ((scdp = sfmmup->sfmmu_scdp) != NULL) { 11955 shsfmmup = scdp->scd_sfmmup; 11956 ASSERT(shsfmmup != NULL); 11957 11958 for (tsbinfop = shsfmmup->sfmmu_tsb; tsbinfop != NULL; 11959 tsbinfop = tsbinfop->tsb_next) { 11960 if (tsbinfop->tsb_flags & TSB_RELOC_FLAG) { 11961 /* drop the private hat lock */ 11962 sfmmu_hat_exit(hatlockp); 11963 /* acquire the shared hat lock */ 11964 shatlockp = sfmmu_hat_enter(shsfmmup); 11965 /* 11966 * recheck to see if anything changed 11967 * after we drop the private hat lock. 
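 * In particular, the process may have left the SCD, or the SCD's controlling sfmmu may have changed, while the private hat lock was dropped; we only wait for the shared tsbinfo relocation below if that association is still intact.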
11968 */ 11969 if (sfmmup->sfmmu_scdp == scdp && 11970 shsfmmup == scdp->scd_sfmmup) { 11971 sfmmu_tsb_chk_reloc(shsfmmup, 11972 shatlockp); 11973 } 11974 sfmmu_hat_exit(shatlockp); 11975 hatlockp = sfmmu_hat_enter(sfmmup); 11976 goto retry; 11977 } 11978 } 11979 } 11980 11981 for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL; 11982 tsbinfop = tsbinfop->tsb_next) { 11983 if (tsbinfop->tsb_flags & TSB_RELOC_FLAG) { 11984 cv_wait(&sfmmup->sfmmu_tsb_cv, 11985 HATLOCK_MUTEXP(hatlockp)); 11986 goto retry; 11987 } 11988 } 11989 11990 /* 11991 * Wait for ISM maps to be updated. 11992 */ 11993 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)) { 11994 cv_wait(&sfmmup->sfmmu_tsb_cv, 11995 HATLOCK_MUTEXP(hatlockp)); 11996 goto retry; 11997 } 11998 11999 /* Is this process joining an SCD? */ 12000 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD)) { 12001 /* 12002 * Flush private TSB and setup shared TSB. 12003 * sfmmu_finish_join_scd() does not drop the 12004 * hat lock. 12005 */ 12006 sfmmu_finish_join_scd(sfmmup); 12007 SFMMU_FLAGS_CLEAR(sfmmup, HAT_JOIN_SCD); 12008 } 12009 12010 /* 12011 * If we're swapping in, get TSB(s). Note that we must do 12012 * this before we get a ctx or load the MMU state. Once 12013 * we swap in we have to recheck to make sure the TSB(s) and 12014 * ISM mappings didn't change while we slept. 12015 */ 12016 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) { 12017 sfmmu_tsb_swapin(sfmmup, hatlockp); 12018 goto retry; 12019 } 12020 12021 sfmmu_get_ctx(sfmmup); 12022 12023 sfmmu_hat_exit(hatlockp); 12024 /* 12025 * Must restore lwp_state if not calling 12026 * trap() for further processing. Restore 12027 * it anyway. 12028 */ 12029 lwp->lwp_state = lwp_save_state; 12030 return; 12031 } 12032 trap(rp, (caddr_t)tagaccess, traptype, 0); 12033 } 12034 12035 static void 12036 sfmmu_tsb_chk_reloc(sfmmu_t *sfmmup, hatlock_t *hatlockp) 12037 { 12038 struct tsb_info *tp; 12039 12040 ASSERT(sfmmu_hat_lock_held(sfmmup)); 12041 12042 for (tp = sfmmup->sfmmu_tsb; tp != NULL; tp = tp->tsb_next) { 12043 if (tp->tsb_flags & TSB_RELOC_FLAG) { 12044 cv_wait(&sfmmup->sfmmu_tsb_cv, 12045 HATLOCK_MUTEXP(hatlockp)); 12046 break; 12047 } 12048 } 12049 } 12050 12051 /* 12052 * sfmmu_vatopfn_suspended is called from GET_TTE when TL=0 and the 12053 * TTE_SUSPENDED bit is set in the tte. We block on acquiring a page lock 12054 * rather than spinning to avoid send mondo timeouts with 12055 * interrupts enabled. When the lock is acquired it is immediately 12056 * released and we return back to sfmmu_vatopfn just after 12057 * the GET_TTE call. 12058 */ 12059 void 12060 sfmmu_vatopfn_suspended(caddr_t vaddr, sfmmu_t *sfmmu, tte_t *ttep) 12061 { 12062 struct page **pp; 12063 12064 (void) as_pagelock(sfmmu->sfmmu_as, &pp, vaddr, TTE_CSZ(ttep), S_WRITE); 12065 as_pageunlock(sfmmu->sfmmu_as, pp, vaddr, TTE_CSZ(ttep), S_WRITE); 12066 } 12067 12068 /* 12069 * sfmmu_tsbmiss_suspended is called from GET_TTE when TL>0 and the 12070 * TTE_SUSPENDED bit is set in the tte. We do this so that we can handle 12071 * cross traps which cannot be handled while spinning in the 12072 * trap handlers. Simply enter and exit the kpr_suspendlock spin 12073 * mutex, which is held by the holder of the suspend bit, and then 12074 * retry the trapped instruction after unwinding.
12075 */ 12076 /*ARGSUSED*/ 12077 void 12078 sfmmu_tsbmiss_suspended(struct regs *rp, uintptr_t tagacc, uint_t traptype) 12079 { 12080 ASSERT(curthread != kreloc_thread); 12081 mutex_enter(&kpr_suspendlock); 12082 mutex_exit(&kpr_suspendlock); 12083 } 12084 12085 /* 12086 * This routine could be optimized to reduce the number of xcalls by flushing 12087 * the entire TLBs if region reference count is above some threshold but the 12088 * tradeoff will depend on the size of the TLB. So for now flush the specific 12089 * page a context at a time. 12090 * 12091 * If uselocks is 0 then it's called after all cpus were captured and all the 12092 * hat locks were taken. In this case don't take the region lock by relying on 12093 * the order of list region update operations in hat_join_region(), 12094 * hat_leave_region() and hat_dup_region(). The ordering in those routines 12095 * guarantees that list is always forward walkable and reaches active sfmmus 12096 * regardless of where xc_attention() captures a cpu. 12097 */ 12098 cpuset_t 12099 sfmmu_rgntlb_demap(caddr_t addr, sf_region_t *rgnp, 12100 struct hme_blk *hmeblkp, int uselocks) 12101 { 12102 sfmmu_t *sfmmup; 12103 cpuset_t cpuset; 12104 cpuset_t rcpuset; 12105 hatlock_t *hatlockp; 12106 uint_t rid = rgnp->rgn_id; 12107 sf_rgn_link_t *rlink; 12108 sf_scd_t *scdp; 12109 12110 ASSERT(hmeblkp->hblk_shared); 12111 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 12112 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 12113 12114 CPUSET_ZERO(rcpuset); 12115 if (uselocks) { 12116 mutex_enter(&rgnp->rgn_mutex); 12117 } 12118 sfmmup = rgnp->rgn_sfmmu_head; 12119 while (sfmmup != NULL) { 12120 if (uselocks) { 12121 hatlockp = sfmmu_hat_enter(sfmmup); 12122 } 12123 12124 /* 12125 * When an SCD is created the SCD hat is linked on the sfmmu 12126 * region lists for each hme region which is part of the 12127 * SCD. If we find an SCD hat, when walking these lists, 12128 * then we flush the shared TSBs, if we find a private hat, 12129 * which is part of an SCD, but where the region 12130 * is not part of the SCD then we flush the private TSBs. 12131 */ 12132 if (!sfmmup->sfmmu_scdhat && sfmmup->sfmmu_scdp != NULL && 12133 !SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD)) { 12134 scdp = sfmmup->sfmmu_scdp; 12135 if (SF_RGNMAP_TEST(scdp->scd_hmeregion_map, rid)) { 12136 if (uselocks) { 12137 sfmmu_hat_exit(hatlockp); 12138 } 12139 goto next; 12140 } 12141 } 12142 12143 SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, 0); 12144 12145 kpreempt_disable(); 12146 cpuset = sfmmup->sfmmu_cpusran; 12147 CPUSET_AND(cpuset, cpu_ready_set); 12148 CPUSET_DEL(cpuset, CPU->cpu_id); 12149 SFMMU_XCALL_STATS(sfmmup); 12150 xt_some(cpuset, vtag_flushpage_tl1, 12151 (uint64_t)addr, (uint64_t)sfmmup); 12152 vtag_flushpage(addr, (uint64_t)sfmmup); 12153 if (uselocks) { 12154 sfmmu_hat_exit(hatlockp); 12155 } 12156 kpreempt_enable(); 12157 CPUSET_OR(rcpuset, cpuset); 12158 12159 next: 12160 /* LINTED: constant in conditional context */ 12161 SFMMU_HMERID2RLINKP(sfmmup, rid, rlink, 0, 0); 12162 ASSERT(rlink != NULL); 12163 sfmmup = rlink->next; 12164 } 12165 if (uselocks) { 12166 mutex_exit(&rgnp->rgn_mutex); 12167 } 12168 return (rcpuset); 12169 } 12170 12171 /* 12172 * This routine takes an sfmmu pointer and the va for an adddress in an 12173 * ISM region as input and returns the corresponding region id in ism_rid. 12174 * The return value of 1 indicates that a region has been found and ism_rid 12175 * is valid, otherwise 0 is returned. 
12176 */ 12177 static int 12178 find_ism_rid(sfmmu_t *sfmmup, sfmmu_t *ism_sfmmup, caddr_t va, uint_t *ism_rid) 12179 { 12180 ism_blk_t *ism_blkp; 12181 int i; 12182 ism_map_t *ism_map; 12183 #ifdef DEBUG 12184 struct hat *ism_hatid; 12185 #endif 12186 ASSERT(sfmmu_hat_lock_held(sfmmup)); 12187 12188 ism_blkp = sfmmup->sfmmu_iblk; 12189 while (ism_blkp != NULL) { 12190 ism_map = ism_blkp->iblk_maps; 12191 for (i = 0; i < ISM_MAP_SLOTS && ism_map[i].imap_ismhat; i++) { 12192 if ((va >= ism_start(ism_map[i])) && 12193 (va < ism_end(ism_map[i]))) { 12194 12195 *ism_rid = ism_map[i].imap_rid; 12196 #ifdef DEBUG 12197 ism_hatid = ism_map[i].imap_ismhat; 12198 ASSERT(ism_hatid == ism_sfmmup); 12199 ASSERT(ism_hatid->sfmmu_ismhat); 12200 #endif 12201 return (1); 12202 } 12203 } 12204 ism_blkp = ism_blkp->iblk_next; 12205 } 12206 return (0); 12207 } 12208 12209 /* 12210 * Special routine to flush out ism mappings- TSBs, TLBs and D-caches. 12211 * This routine may be called with all cpu's captured. Therefore, the 12212 * caller is responsible for holding all locks and disabling kernel 12213 * preemption. 12214 */ 12215 /* ARGSUSED */ 12216 static void 12217 sfmmu_ismtlbcache_demap(caddr_t addr, sfmmu_t *ism_sfmmup, 12218 struct hme_blk *hmeblkp, pfn_t pfnum, int cache_flush_flag) 12219 { 12220 cpuset_t cpuset; 12221 caddr_t va; 12222 ism_ment_t *ment; 12223 sfmmu_t *sfmmup; 12224 #ifdef VAC 12225 int vcolor; 12226 #endif 12227 12228 sf_scd_t *scdp; 12229 uint_t ism_rid; 12230 12231 ASSERT(!hmeblkp->hblk_shared); 12232 /* 12233 * Walk the ism_hat's mapping list and flush the page 12234 * from every hat sharing this ism_hat. This routine 12235 * may be called while all cpu's have been captured. 12236 * Therefore we can't attempt to grab any locks. For now 12237 * this means we will protect the ism mapping list under 12238 * a single lock which will be grabbed by the caller. 12239 * If hat_share/unshare scalibility becomes a performance 12240 * problem then we may need to re-think ism mapping list locking. 12241 */ 12242 ASSERT(ism_sfmmup->sfmmu_ismhat); 12243 ASSERT(MUTEX_HELD(&ism_mlist_lock)); 12244 addr = addr - ISMID_STARTADDR; 12245 12246 for (ment = ism_sfmmup->sfmmu_iment; ment; ment = ment->iment_next) { 12247 12248 sfmmup = ment->iment_hat; 12249 12250 va = ment->iment_base_va; 12251 va = (caddr_t)((uintptr_t)va + (uintptr_t)addr); 12252 12253 /* 12254 * When an SCD is created the SCD hat is linked on the ism 12255 * mapping lists for each ISM segment which is part of the 12256 * SCD. If we find an SCD hat, when walking these lists, 12257 * then we flush the shared TSBs, if we find a private hat, 12258 * which is part of an SCD, but where the region 12259 * corresponding to this va is not part of the SCD then we 12260 * flush the private TSBs. 
12261 */ 12262 if (!sfmmup->sfmmu_scdhat && sfmmup->sfmmu_scdp != NULL && 12263 !SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD) && 12264 !SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)) { 12265 if (!find_ism_rid(sfmmup, ism_sfmmup, va, 12266 &ism_rid)) { 12267 cmn_err(CE_PANIC, 12268 "can't find matching ISM rid!"); 12269 } 12270 12271 scdp = sfmmup->sfmmu_scdp; 12272 if (SFMMU_IS_ISMRID_VALID(ism_rid) && 12273 SF_RGNMAP_TEST(scdp->scd_ismregion_map, 12274 ism_rid)) { 12275 continue; 12276 } 12277 } 12278 SFMMU_UNLOAD_TSB(va, sfmmup, hmeblkp, 1); 12279 12280 cpuset = sfmmup->sfmmu_cpusran; 12281 CPUSET_AND(cpuset, cpu_ready_set); 12282 CPUSET_DEL(cpuset, CPU->cpu_id); 12283 SFMMU_XCALL_STATS(sfmmup); 12284 xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)va, 12285 (uint64_t)sfmmup); 12286 vtag_flushpage(va, (uint64_t)sfmmup); 12287 12288 #ifdef VAC 12289 /* 12290 * Flush D$ 12291 * When flushing D$ we must flush all 12292 * cpu's. See sfmmu_cache_flush(). 12293 */ 12294 if (cache_flush_flag == CACHE_FLUSH) { 12295 cpuset = cpu_ready_set; 12296 CPUSET_DEL(cpuset, CPU->cpu_id); 12297 12298 SFMMU_XCALL_STATS(sfmmup); 12299 vcolor = addr_to_vcolor(va); 12300 xt_some(cpuset, vac_flushpage_tl1, pfnum, vcolor); 12301 vac_flushpage(pfnum, vcolor); 12302 } 12303 #endif /* VAC */ 12304 } 12305 } 12306 12307 /* 12308 * Demaps the TSB, CPU caches, and flushes all TLBs on all CPUs of 12309 * a particular virtual address and ctx. If noflush is set we do not 12310 * flush the TLB/TSB. This function may or may not be called with the 12311 * HAT lock held. 12312 */ 12313 static void 12314 sfmmu_tlbcache_demap(caddr_t addr, sfmmu_t *sfmmup, struct hme_blk *hmeblkp, 12315 pfn_t pfnum, int tlb_noflush, int cpu_flag, int cache_flush_flag, 12316 int hat_lock_held) 12317 { 12318 #ifdef VAC 12319 int vcolor; 12320 #endif 12321 cpuset_t cpuset; 12322 hatlock_t *hatlockp; 12323 12324 ASSERT(!hmeblkp->hblk_shared); 12325 12326 #if defined(lint) && !defined(VAC) 12327 pfnum = pfnum; 12328 cpu_flag = cpu_flag; 12329 cache_flush_flag = cache_flush_flag; 12330 #endif 12331 12332 /* 12333 * There is no longer a need to protect against ctx being 12334 * stolen here since we don't store the ctx in the TSB anymore. 12335 */ 12336 #ifdef VAC 12337 vcolor = addr_to_vcolor(addr); 12338 #endif 12339 12340 /* 12341 * We must hold the hat lock during the flush of TLB, 12342 * to avoid a race with sfmmu_invalidate_ctx(), where 12343 * sfmmu_cnum on a MMU could be set to INVALID_CONTEXT, 12344 * causing TLB demap routine to skip flush on that MMU. 12345 * If the context on a MMU has already been set to 12346 * INVALID_CONTEXT, we just get an extra flush on 12347 * that MMU. 12348 */ 12349 if (!hat_lock_held && !tlb_noflush) 12350 hatlockp = sfmmu_hat_enter(sfmmup); 12351 12352 kpreempt_disable(); 12353 if (!tlb_noflush) { 12354 /* 12355 * Flush the TSB and TLB. 12356 */ 12357 SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, 0); 12358 12359 cpuset = sfmmup->sfmmu_cpusran; 12360 CPUSET_AND(cpuset, cpu_ready_set); 12361 CPUSET_DEL(cpuset, CPU->cpu_id); 12362 12363 SFMMU_XCALL_STATS(sfmmup); 12364 12365 xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)addr, 12366 (uint64_t)sfmmup); 12367 12368 vtag_flushpage(addr, (uint64_t)sfmmup); 12369 } 12370 12371 if (!hat_lock_held && !tlb_noflush) 12372 sfmmu_hat_exit(hatlockp); 12373 12374 #ifdef VAC 12375 /* 12376 * Flush the D$ 12377 * 12378 * Even if the ctx is stolen, we need to flush the 12379 * cache. Our ctx stealer only flushes the TLBs. 
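 * The D$ flush below is keyed by pfn and virtual color rather than by context; when FLUSH_ALL_CPUS is set it is cross-called to every other ready CPU, with the local CPU flushing directly.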
12380 */ 12381 if (cache_flush_flag == CACHE_FLUSH) { 12382 if (cpu_flag & FLUSH_ALL_CPUS) { 12383 cpuset = cpu_ready_set; 12384 } else { 12385 cpuset = sfmmup->sfmmu_cpusran; 12386 CPUSET_AND(cpuset, cpu_ready_set); 12387 } 12388 CPUSET_DEL(cpuset, CPU->cpu_id); 12389 SFMMU_XCALL_STATS(sfmmup); 12390 xt_some(cpuset, vac_flushpage_tl1, pfnum, vcolor); 12391 vac_flushpage(pfnum, vcolor); 12392 } 12393 #endif /* VAC */ 12394 kpreempt_enable(); 12395 } 12396 12397 /* 12398 * Demaps the TSB and flushes all TLBs on all cpus for a particular virtual 12399 * address and ctx. If noflush is set we do not currently do anything. 12400 * This function may or may not be called with the HAT lock held. 12401 */ 12402 static void 12403 sfmmu_tlb_demap(caddr_t addr, sfmmu_t *sfmmup, struct hme_blk *hmeblkp, 12404 int tlb_noflush, int hat_lock_held) 12405 { 12406 cpuset_t cpuset; 12407 hatlock_t *hatlockp; 12408 12409 ASSERT(!hmeblkp->hblk_shared); 12410 12411 /* 12412 * If the process is exiting we have nothing to do. 12413 */ 12414 if (tlb_noflush) 12415 return; 12416 12417 /* 12418 * Flush TSB. 12419 */ 12420 if (!hat_lock_held) 12421 hatlockp = sfmmu_hat_enter(sfmmup); 12422 SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, 0); 12423 12424 kpreempt_disable(); 12425 12426 cpuset = sfmmup->sfmmu_cpusran; 12427 CPUSET_AND(cpuset, cpu_ready_set); 12428 CPUSET_DEL(cpuset, CPU->cpu_id); 12429 12430 SFMMU_XCALL_STATS(sfmmup); 12431 xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)addr, (uint64_t)sfmmup); 12432 12433 vtag_flushpage(addr, (uint64_t)sfmmup); 12434 12435 if (!hat_lock_held) 12436 sfmmu_hat_exit(hatlockp); 12437 12438 kpreempt_enable(); 12439 12440 } 12441 12442 /* 12443 * Special case of sfmmu_tlb_demap for MMU_PAGESIZE hblks. Use the xcall 12444 * call handler that can flush a range of pages to save on xcalls. 12445 */ 12446 static int sfmmu_xcall_save; 12447 12448 /* 12449 * this routine is never used for demaping addresses backed by SRD hmeblks. 12450 */ 12451 static void 12452 sfmmu_tlb_range_demap(demap_range_t *dmrp) 12453 { 12454 sfmmu_t *sfmmup = dmrp->dmr_sfmmup; 12455 hatlock_t *hatlockp; 12456 cpuset_t cpuset; 12457 uint64_t sfmmu_pgcnt; 12458 pgcnt_t pgcnt = 0; 12459 int pgunload = 0; 12460 int dirtypg = 0; 12461 caddr_t addr = dmrp->dmr_addr; 12462 caddr_t eaddr; 12463 uint64_t bitvec = dmrp->dmr_bitvec; 12464 12465 ASSERT(bitvec & 1); 12466 12467 /* 12468 * Flush TSB and calculate number of pages to flush. 12469 */ 12470 while (bitvec != 0) { 12471 dirtypg = 0; 12472 /* 12473 * Find the first page to flush and then count how many 12474 * pages there are after it that also need to be flushed. 12475 * This way the number of TSB flushes is minimized. 
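 * For example, a dmr_bitvec of 0xb (binary 1011) costs only two sfmmu_unload_tsb_range() calls: one covering the first two contiguous dirty pages and one for the dirty page beyond the gap.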
12476 */ 12477 while ((bitvec & 1) == 0) { 12478 pgcnt++; 12479 addr += MMU_PAGESIZE; 12480 bitvec >>= 1; 12481 } 12482 while (bitvec & 1) { 12483 dirtypg++; 12484 bitvec >>= 1; 12485 } 12486 eaddr = addr + ptob(dirtypg); 12487 hatlockp = sfmmu_hat_enter(sfmmup); 12488 sfmmu_unload_tsb_range(sfmmup, addr, eaddr, TTE8K); 12489 sfmmu_hat_exit(hatlockp); 12490 pgunload += dirtypg; 12491 addr = eaddr; 12492 pgcnt += dirtypg; 12493 } 12494 12495 ASSERT((pgcnt<<MMU_PAGESHIFT) <= dmrp->dmr_endaddr - dmrp->dmr_addr); 12496 if (sfmmup->sfmmu_free == 0) { 12497 addr = dmrp->dmr_addr; 12498 bitvec = dmrp->dmr_bitvec; 12499 12500 /* 12501 * make sure it has SFMMU_PGCNT_SHIFT bits only, 12502 * as it will be used to pack argument for xt_some 12503 */ 12504 ASSERT((pgcnt > 0) && 12505 (pgcnt <= (1 << SFMMU_PGCNT_SHIFT))); 12506 12507 /* 12508 * Encode pgcnt as (pgcnt -1 ), and pass (pgcnt - 1) in 12509 * the low 6 bits of sfmmup. This is doable since pgcnt 12510 * always >= 1. 12511 */ 12512 ASSERT(!((uint64_t)sfmmup & SFMMU_PGCNT_MASK)); 12513 sfmmu_pgcnt = (uint64_t)sfmmup | 12514 ((pgcnt - 1) & SFMMU_PGCNT_MASK); 12515 12516 /* 12517 * We must hold the hat lock during the flush of TLB, 12518 * to avoid a race with sfmmu_invalidate_ctx(), where 12519 * sfmmu_cnum on a MMU could be set to INVALID_CONTEXT, 12520 * causing TLB demap routine to skip flush on that MMU. 12521 * If the context on a MMU has already been set to 12522 * INVALID_CONTEXT, we just get an extra flush on 12523 * that MMU. 12524 */ 12525 hatlockp = sfmmu_hat_enter(sfmmup); 12526 kpreempt_disable(); 12527 12528 cpuset = sfmmup->sfmmu_cpusran; 12529 CPUSET_AND(cpuset, cpu_ready_set); 12530 CPUSET_DEL(cpuset, CPU->cpu_id); 12531 12532 SFMMU_XCALL_STATS(sfmmup); 12533 xt_some(cpuset, vtag_flush_pgcnt_tl1, (uint64_t)addr, 12534 sfmmu_pgcnt); 12535 12536 for (; bitvec != 0; bitvec >>= 1) { 12537 if (bitvec & 1) 12538 vtag_flushpage(addr, (uint64_t)sfmmup); 12539 addr += MMU_PAGESIZE; 12540 } 12541 kpreempt_enable(); 12542 sfmmu_hat_exit(hatlockp); 12543 12544 sfmmu_xcall_save += (pgunload-1); 12545 } 12546 dmrp->dmr_bitvec = 0; 12547 } 12548 12549 /* 12550 * In cases where we need to synchronize with TLB/TSB miss trap 12551 * handlers, _and_ need to flush the TLB, it's a lot easier to 12552 * throw away the context from the process than to do a 12553 * special song and dance to keep things consistent for the 12554 * handlers. 12555 * 12556 * Since the process suddenly ends up without a context and our caller 12557 * holds the hat lock, threads that fault after this function is called 12558 * will pile up on the lock. We can then do whatever we need to 12559 * atomically from the context of the caller. The first blocked thread 12560 * to resume executing will get the process a new context, and the 12561 * process will resume executing. 12562 * 12563 * One added advantage of this approach is that on MMUs that 12564 * support a "flush all" operation, we will delay the flush until 12565 * cnum wrap-around, and then flush the TLB one time. This 12566 * is rather rare, so it's a lot less expensive than making 8000 12567 * x-calls to flush the TLB 8000 times. 12568 * 12569 * A per-process (PP) lock is used to synchronize ctx allocations in 12570 * resume() and ctx invalidations here. 
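 * Concretely, the routine below disables preemption and interrupts, takes the PP lock, marks the cnum for every MMU context domain INVALID_CONTEXT, cross-calls the CPUs in sfmmu_cpusran to raise a tsb exception, and finally invalidates the local secondary context if this CPU happens to be running the victim hat.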
12571 */ 12572 static void 12573 sfmmu_invalidate_ctx(sfmmu_t *sfmmup) 12574 { 12575 cpuset_t cpuset; 12576 int cnum, currcnum; 12577 mmu_ctx_t *mmu_ctxp; 12578 int i; 12579 uint_t pstate_save; 12580 12581 SFMMU_STAT(sf_ctx_inv); 12582 12583 ASSERT(sfmmu_hat_lock_held(sfmmup)); 12584 ASSERT(sfmmup != ksfmmup); 12585 12586 kpreempt_disable(); 12587 12588 mmu_ctxp = CPU_MMU_CTXP(CPU); 12589 ASSERT(mmu_ctxp); 12590 ASSERT(mmu_ctxp->mmu_idx < max_mmu_ctxdoms); 12591 ASSERT(mmu_ctxp == mmu_ctxs_tbl[mmu_ctxp->mmu_idx]); 12592 12593 currcnum = sfmmup->sfmmu_ctxs[mmu_ctxp->mmu_idx].cnum; 12594 12595 pstate_save = sfmmu_disable_intrs(); 12596 12597 lock_set(&sfmmup->sfmmu_ctx_lock); /* acquire PP lock */ 12598 /* set HAT cnum invalid across all context domains. */ 12599 for (i = 0; i < max_mmu_ctxdoms; i++) { 12600 12601 cnum = sfmmup->sfmmu_ctxs[i].cnum; 12602 if (cnum == INVALID_CONTEXT) { 12603 continue; 12604 } 12605 12606 sfmmup->sfmmu_ctxs[i].cnum = INVALID_CONTEXT; 12607 } 12608 membar_enter(); /* make sure globally visible to all CPUs */ 12609 lock_clear(&sfmmup->sfmmu_ctx_lock); /* release PP lock */ 12610 12611 sfmmu_enable_intrs(pstate_save); 12612 12613 cpuset = sfmmup->sfmmu_cpusran; 12614 CPUSET_DEL(cpuset, CPU->cpu_id); 12615 CPUSET_AND(cpuset, cpu_ready_set); 12616 if (!CPUSET_ISNULL(cpuset)) { 12617 SFMMU_XCALL_STATS(sfmmup); 12618 xt_some(cpuset, sfmmu_raise_tsb_exception, 12619 (uint64_t)sfmmup, INVALID_CONTEXT); 12620 xt_sync(cpuset); 12621 SFMMU_STAT(sf_tsb_raise_exception); 12622 SFMMU_MMU_STAT(mmu_tsb_raise_exception); 12623 } 12624 12625 /* 12626 * If the hat to-be-invalidated is the same as the current 12627 * process on the local CPU, we need to invalidate 12628 * this CPU's context as well. 12629 */ 12630 if ((sfmmu_getctx_sec() == currcnum) && 12631 (currcnum != INVALID_CONTEXT)) { 12632 /* sets shared context to INVALID too */ 12633 sfmmu_setctx_sec(INVALID_CONTEXT); 12634 sfmmu_clear_utsbinfo(); 12635 } 12636 12637 SFMMU_FLAGS_SET(sfmmup, HAT_ALLCTX_INVALID); 12638 12639 kpreempt_enable(); 12640 12641 /* 12642 * we hold the hat lock, so nobody should allocate a context 12643 * for us yet 12644 */ 12645 ASSERT(sfmmup->sfmmu_ctxs[mmu_ctxp->mmu_idx].cnum == INVALID_CONTEXT); 12646 } 12647 12648 #ifdef VAC 12649 /* 12650 * We need to flush the cache on all cpus. It is possible that 12651 * a process referenced a page as cacheable but has since exited 12652 * and cleared the mapping list. We still need to flush it, but with no 12653 * state left, flushing on all cpus is the only alternative. 12654 */ 12655 void 12656 sfmmu_cache_flush(pfn_t pfnum, int vcolor) 12657 { 12658 cpuset_t cpuset; 12659 12660 kpreempt_disable(); 12661 cpuset = cpu_ready_set; 12662 CPUSET_DEL(cpuset, CPU->cpu_id); 12663 SFMMU_XCALL_STATS(NULL); /* account to any ctx */ 12664 xt_some(cpuset, vac_flushpage_tl1, pfnum, vcolor); 12665 xt_sync(cpuset); 12666 vac_flushpage(pfnum, vcolor); 12667 kpreempt_enable(); 12668 } 12669 12670 void 12671 sfmmu_cache_flushcolor(int vcolor, pfn_t pfnum) 12672 { 12673 cpuset_t cpuset; 12674 12675 ASSERT(vcolor >= 0); 12676 12677 kpreempt_disable(); 12678 cpuset = cpu_ready_set; 12679 CPUSET_DEL(cpuset, CPU->cpu_id); 12680 SFMMU_XCALL_STATS(NULL); /* account to any ctx */ 12681 xt_some(cpuset, vac_flushcolor_tl1, vcolor, pfnum); 12682 xt_sync(cpuset); 12683 vac_flushcolor(vcolor, pfnum); 12684 kpreempt_enable(); 12685 } 12686 #endif /* VAC */ 12687 12688 /* 12689 * We need to prevent processes from accessing the TSB using a cached physical 12690 * address.
It's alright if they try to access the TSB via virtual address 12691 * since they will just fault on that virtual address once the mapping has 12692 * been suspended. 12693 */ 12694 #pragma weak sendmondo_in_recover 12695 12696 /* ARGSUSED */ 12697 static int 12698 sfmmu_tsb_pre_relocator(caddr_t va, uint_t tsbsz, uint_t flags, void *tsbinfo) 12699 { 12700 struct tsb_info *tsbinfop = (struct tsb_info *)tsbinfo; 12701 sfmmu_t *sfmmup = tsbinfop->tsb_sfmmu; 12702 hatlock_t *hatlockp; 12703 sf_scd_t *scdp; 12704 12705 if (flags != HAT_PRESUSPEND) 12706 return (0); 12707 12708 /* 12709 * If tsb is a shared TSB with TSB_SHAREDCTX set, sfmmup must 12710 * be a shared hat, then set SCD's tsbinfo's flag. 12711 * If tsb is not shared, sfmmup is a private hat, then set 12712 * its private tsbinfo's flag. 12713 */ 12714 hatlockp = sfmmu_hat_enter(sfmmup); 12715 tsbinfop->tsb_flags |= TSB_RELOC_FLAG; 12716 12717 if (!(tsbinfop->tsb_flags & TSB_SHAREDCTX)) { 12718 sfmmu_tsb_inv_ctx(sfmmup); 12719 sfmmu_hat_exit(hatlockp); 12720 } else { 12721 /* release lock on the shared hat */ 12722 sfmmu_hat_exit(hatlockp); 12723 /* sfmmup is a shared hat */ 12724 ASSERT(sfmmup->sfmmu_scdhat); 12725 scdp = sfmmup->sfmmu_scdp; 12726 ASSERT(scdp != NULL); 12727 /* get private hat from the scd list */ 12728 mutex_enter(&scdp->scd_mutex); 12729 sfmmup = scdp->scd_sf_list; 12730 while (sfmmup != NULL) { 12731 hatlockp = sfmmu_hat_enter(sfmmup); 12732 /* 12733 * We do not call sfmmu_tsb_inv_ctx here because 12734 * sendmondo_in_recover check is only needed for 12735 * sun4u. 12736 */ 12737 sfmmu_invalidate_ctx(sfmmup); 12738 sfmmu_hat_exit(hatlockp); 12739 sfmmup = sfmmup->sfmmu_scd_link.next; 12740 12741 } 12742 mutex_exit(&scdp->scd_mutex); 12743 } 12744 return (0); 12745 } 12746 12747 static void 12748 sfmmu_tsb_inv_ctx(sfmmu_t *sfmmup) 12749 { 12750 extern uint32_t sendmondo_in_recover; 12751 12752 ASSERT(sfmmu_hat_lock_held(sfmmup)); 12753 12754 /* 12755 * For Cheetah+ Erratum 25: 12756 * Wait for any active recovery to finish. We can't risk 12757 * relocating the TSB of the thread running mondo_recover_proc() 12758 * since, if we did that, we would deadlock. The scenario we are 12759 * trying to avoid is as follows: 12760 * 12761 * THIS CPU RECOVER CPU 12762 * -------- ----------- 12763 * Begins recovery, walking through TSB 12764 * hat_pagesuspend() TSB TTE 12765 * TLB miss on TSB TTE, spins at TL1 12766 * xt_sync() 12767 * send_mondo_timeout() 12768 * mondo_recover_proc() 12769 * ((deadlocked)) 12770 * 12771 * The second half of the workaround is that mondo_recover_proc() 12772 * checks to see if the tsb_info has the RELOC flag set, and if it 12773 * does, it skips over that TSB without ever touching tsbinfop->tsb_va 12774 * and hence avoiding the TLB miss that could result in a deadlock. 
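 * Note that sendmondo_in_recover is a weak symbol (see the #pragma weak above sfmmu_tsb_pre_relocator()), so on platforms that do not define it the address test below is false and the wait loop is skipped.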
12775 */ 12776 if (&sendmondo_in_recover) { 12777 membar_enter(); /* make sure RELOC flag visible */ 12778 while (sendmondo_in_recover) { 12779 drv_usecwait(1); 12780 membar_consumer(); 12781 } 12782 } 12783 12784 sfmmu_invalidate_ctx(sfmmup); 12785 } 12786 12787 /* ARGSUSED */ 12788 static int 12789 sfmmu_tsb_post_relocator(caddr_t va, uint_t tsbsz, uint_t flags, 12790 void *tsbinfo, pfn_t newpfn) 12791 { 12792 hatlock_t *hatlockp; 12793 struct tsb_info *tsbinfop = (struct tsb_info *)tsbinfo; 12794 sfmmu_t *sfmmup = tsbinfop->tsb_sfmmu; 12795 12796 if (flags != HAT_POSTUNSUSPEND) 12797 return (0); 12798 12799 hatlockp = sfmmu_hat_enter(sfmmup); 12800 12801 SFMMU_STAT(sf_tsb_reloc); 12802 12803 /* 12804 * The process may have swapped out while we were relocating one 12805 * of its TSBs. If so, don't bother doing the setup since the 12806 * process can't be using the memory anymore. 12807 */ 12808 if ((tsbinfop->tsb_flags & TSB_SWAPPED) == 0) { 12809 ASSERT(va == tsbinfop->tsb_va); 12810 sfmmu_tsbinfo_setup_phys(tsbinfop, newpfn); 12811 12812 if (tsbinfop->tsb_flags & TSB_FLUSH_NEEDED) { 12813 sfmmu_inv_tsb(tsbinfop->tsb_va, 12814 TSB_BYTES(tsbinfop->tsb_szc)); 12815 tsbinfop->tsb_flags &= ~TSB_FLUSH_NEEDED; 12816 } 12817 } 12818 12819 membar_exit(); 12820 tsbinfop->tsb_flags &= ~TSB_RELOC_FLAG; 12821 cv_broadcast(&sfmmup->sfmmu_tsb_cv); 12822 12823 sfmmu_hat_exit(hatlockp); 12824 12825 return (0); 12826 } 12827 12828 /* 12829 * Allocate and initialize a tsb_info structure. Note that we may or may not 12830 * allocate a TSB here, depending on the flags passed in. 12831 */ 12832 static int 12833 sfmmu_tsbinfo_alloc(struct tsb_info **tsbinfopp, int tsb_szc, int tte_sz_mask, 12834 uint_t flags, sfmmu_t *sfmmup) 12835 { 12836 int err; 12837 12838 *tsbinfopp = (struct tsb_info *)kmem_cache_alloc( 12839 sfmmu_tsbinfo_cache, KM_SLEEP); 12840 12841 if ((err = sfmmu_init_tsbinfo(*tsbinfopp, tte_sz_mask, 12842 tsb_szc, flags, sfmmup)) != 0) { 12843 kmem_cache_free(sfmmu_tsbinfo_cache, *tsbinfopp); 12844 SFMMU_STAT(sf_tsb_allocfail); 12845 *tsbinfopp = NULL; 12846 return (err); 12847 } 12848 SFMMU_STAT(sf_tsb_alloc); 12849 12850 /* 12851 * Bump the TSB size counters for this TSB size. 12852 */ 12853 (*(((int *)&sfmmu_tsbsize_stat) + tsb_szc))++; 12854 return (0); 12855 } 12856 12857 static void 12858 sfmmu_tsb_free(struct tsb_info *tsbinfo) 12859 { 12860 caddr_t tsbva = tsbinfo->tsb_va; 12861 uint_t tsb_size = TSB_BYTES(tsbinfo->tsb_szc); 12862 struct kmem_cache *kmem_cachep = tsbinfo->tsb_cache; 12863 vmem_t *vmp = tsbinfo->tsb_vmp; 12864 12865 /* 12866 * If we allocated this TSB from relocatable kernel memory, then we 12867 * need to uninstall the callback handler. 
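 * As in sfmmu_init_tsbinfo(), only the root (first 8K) page of the TSB slab is locked while the callback is deleted, since the pseudo mappings hang off the slab's root page.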
12868 */ 12869 if (tsbinfo->tsb_cache != sfmmu_tsb8k_cache) { 12870 uintptr_t slab_mask; 12871 caddr_t slab_vaddr; 12872 page_t **ppl; 12873 int ret; 12874 12875 ASSERT(tsb_size <= MMU_PAGESIZE4M || use_bigtsb_arena); 12876 if (tsb_size > MMU_PAGESIZE4M) 12877 slab_mask = ~((uintptr_t)bigtsb_slab_mask) << PAGESHIFT; 12878 else 12879 slab_mask = ~((uintptr_t)tsb_slab_mask) << PAGESHIFT; 12880 slab_vaddr = (caddr_t)((uintptr_t)tsbva & slab_mask); 12881 12882 ret = as_pagelock(&kas, &ppl, slab_vaddr, PAGESIZE, S_WRITE); 12883 ASSERT(ret == 0); 12884 hat_delete_callback(tsbva, (uint_t)tsb_size, (void *)tsbinfo, 12885 0, NULL); 12886 as_pageunlock(&kas, ppl, slab_vaddr, PAGESIZE, S_WRITE); 12887 } 12888 12889 if (kmem_cachep != NULL) { 12890 kmem_cache_free(kmem_cachep, tsbva); 12891 } else { 12892 vmem_xfree(vmp, (void *)tsbva, tsb_size); 12893 } 12894 tsbinfo->tsb_va = (caddr_t)0xbad00bad; 12895 atomic_add_64(&tsb_alloc_bytes, -(int64_t)tsb_size); 12896 } 12897 12898 static void 12899 sfmmu_tsbinfo_free(struct tsb_info *tsbinfo) 12900 { 12901 if ((tsbinfo->tsb_flags & TSB_SWAPPED) == 0) { 12902 sfmmu_tsb_free(tsbinfo); 12903 } 12904 kmem_cache_free(sfmmu_tsbinfo_cache, tsbinfo); 12905 12906 } 12907 12908 /* 12909 * Setup all the references to physical memory for this tsbinfo. 12910 * The underlying page(s) must be locked. 12911 */ 12912 static void 12913 sfmmu_tsbinfo_setup_phys(struct tsb_info *tsbinfo, pfn_t pfn) 12914 { 12915 ASSERT(pfn != PFN_INVALID); 12916 ASSERT(pfn == va_to_pfn(tsbinfo->tsb_va)); 12917 12918 #ifndef sun4v 12919 if (tsbinfo->tsb_szc == 0) { 12920 sfmmu_memtte(&tsbinfo->tsb_tte, pfn, 12921 PROT_WRITE|PROT_READ, TTE8K); 12922 } else { 12923 /* 12924 * Round down PA and use a large mapping; the handlers will 12925 * compute the TSB pointer at the correct offset into the 12926 * big virtual page. NOTE: this assumes all TSBs larger 12927 * than 8K must come from physically contiguous slabs of 12928 * size tsb_slab_size. 12929 */ 12930 sfmmu_memtte(&tsbinfo->tsb_tte, pfn & ~tsb_slab_mask, 12931 PROT_WRITE|PROT_READ, tsb_slab_ttesz); 12932 } 12933 tsbinfo->tsb_pa = ptob(pfn); 12934 12935 TTE_SET_LOCKED(&tsbinfo->tsb_tte); /* lock the tte into dtlb */ 12936 TTE_SET_MOD(&tsbinfo->tsb_tte); /* enable writes */ 12937 12938 ASSERT(TTE_IS_PRIVILEGED(&tsbinfo->tsb_tte)); 12939 ASSERT(TTE_IS_LOCKED(&tsbinfo->tsb_tte)); 12940 #else /* sun4v */ 12941 tsbinfo->tsb_pa = ptob(pfn); 12942 #endif /* sun4v */ 12943 } 12944 12945 12946 /* 12947 * Returns zero on success, ENOMEM if over the high water mark, 12948 * or EAGAIN if the caller needs to retry with a smaller TSB 12949 * size (or specify TSB_FORCEALLOC if the allocation can't fail). 12950 * 12951 * This call cannot fail to allocate a TSB if TSB_FORCEALLOC 12952 * is specified and the TSB requested is PAGESIZE, though it 12953 * may sleep waiting for memory if sufficient memory is not 12954 * available. 
12955 */ 12956 static int 12957 sfmmu_init_tsbinfo(struct tsb_info *tsbinfo, int tteszmask, 12958 int tsbcode, uint_t flags, sfmmu_t *sfmmup) 12959 { 12960 caddr_t vaddr = NULL; 12961 caddr_t slab_vaddr; 12962 uintptr_t slab_mask; 12963 int tsbbytes = TSB_BYTES(tsbcode); 12964 int lowmem = 0; 12965 struct kmem_cache *kmem_cachep = NULL; 12966 vmem_t *vmp = NULL; 12967 lgrp_id_t lgrpid = LGRP_NONE; 12968 pfn_t pfn; 12969 uint_t cbflags = HAC_SLEEP; 12970 page_t **pplist; 12971 int ret; 12972 12973 ASSERT(tsbbytes <= MMU_PAGESIZE4M || use_bigtsb_arena); 12974 if (tsbbytes > MMU_PAGESIZE4M) 12975 slab_mask = ~((uintptr_t)bigtsb_slab_mask) << PAGESHIFT; 12976 else 12977 slab_mask = ~((uintptr_t)tsb_slab_mask) << PAGESHIFT; 12978 12979 if (flags & (TSB_FORCEALLOC | TSB_SWAPIN | TSB_GROW | TSB_SHRINK)) 12980 flags |= TSB_ALLOC; 12981 12982 ASSERT((flags & TSB_FORCEALLOC) == 0 || tsbcode == TSB_MIN_SZCODE); 12983 12984 tsbinfo->tsb_sfmmu = sfmmup; 12985 12986 /* 12987 * If not allocating a TSB, set up the tsbinfo, set TSB_SWAPPED, and 12988 * return. 12989 */ 12990 if ((flags & TSB_ALLOC) == 0) { 12991 tsbinfo->tsb_szc = tsbcode; 12992 tsbinfo->tsb_ttesz_mask = tteszmask; 12993 tsbinfo->tsb_va = (caddr_t)0xbadbadbeef; 12994 tsbinfo->tsb_pa = -1; 12995 tsbinfo->tsb_tte.ll = 0; 12996 tsbinfo->tsb_next = NULL; 12997 tsbinfo->tsb_flags = TSB_SWAPPED; 12998 tsbinfo->tsb_cache = NULL; 12999 tsbinfo->tsb_vmp = NULL; 13000 return (0); 13001 } 13002 13003 #ifdef DEBUG 13004 /* 13005 * For debugging: 13006 * Randomly force allocation failures every tsb_alloc_mtbf 13007 * tries if TSB_FORCEALLOC is not specified. This will 13008 * return ENOMEM if tsb_alloc_mtbf is odd, or EAGAIN if 13009 * it is even, to allow testing of both failure paths... 13010 */ 13011 if (tsb_alloc_mtbf && ((flags & TSB_FORCEALLOC) == 0) && 13012 (tsb_alloc_count++ == tsb_alloc_mtbf)) { 13013 tsb_alloc_count = 0; 13014 tsb_alloc_fail_mtbf++; 13015 return ((tsb_alloc_mtbf & 1)? ENOMEM : EAGAIN); 13016 } 13017 #endif /* DEBUG */ 13018 13019 /* 13020 * Enforce high water mark if we are not doing a forced allocation 13021 * and are not shrinking a process' TSB. 13022 */ 13023 if ((flags & TSB_SHRINK) == 0 && 13024 (tsbbytes + tsb_alloc_bytes) > tsb_alloc_hiwater) { 13025 if ((flags & TSB_FORCEALLOC) == 0) 13026 return (ENOMEM); 13027 lowmem = 1; 13028 } 13029 13030 /* 13031 * Allocate from the correct location based upon the size of the TSB 13032 * compared to the base page size, and what memory conditions dictate. 13033 * Note we always do nonblocking allocations from the TSB arena since 13034 * we don't want memory fragmentation to cause processes to block 13035 * indefinitely waiting for memory; until the kernel algorithms that 13036 * coalesce large pages are improved this is our best option. 
13037 * 13038 * Algorithm: 13039 * If allocating a "large" TSB (>8K), allocate from the 13040 * appropriate kmem_tsb_default_arena vmem arena 13041 * else if low on memory or the TSB_FORCEALLOC flag is set or 13042 * tsb_forceheap is set 13043 * Allocate from kernel heap via sfmmu_tsb8k_cache with 13044 * KM_SLEEP (never fails) 13045 * else 13046 * Allocate from appropriate sfmmu_tsb_cache with 13047 * KM_NOSLEEP 13048 * endif 13049 */ 13050 if (tsb_lgrp_affinity) 13051 lgrpid = lgrp_home_id(curthread); 13052 if (lgrpid == LGRP_NONE) 13053 lgrpid = 0; /* use lgrp of boot CPU */ 13054 13055 if (tsbbytes > MMU_PAGESIZE) { 13056 if (tsbbytes > MMU_PAGESIZE4M) { 13057 vmp = kmem_bigtsb_default_arena[lgrpid]; 13058 vaddr = (caddr_t)vmem_xalloc(vmp, tsbbytes, tsbbytes, 13059 0, 0, NULL, NULL, VM_NOSLEEP); 13060 } else { 13061 vmp = kmem_tsb_default_arena[lgrpid]; 13062 vaddr = (caddr_t)vmem_xalloc(vmp, tsbbytes, tsbbytes, 13063 0, 0, NULL, NULL, VM_NOSLEEP); 13064 } 13065 #ifdef DEBUG 13066 } else if (lowmem || (flags & TSB_FORCEALLOC) || tsb_forceheap) { 13067 #else /* !DEBUG */ 13068 } else if (lowmem || (flags & TSB_FORCEALLOC)) { 13069 #endif /* DEBUG */ 13070 kmem_cachep = sfmmu_tsb8k_cache; 13071 vaddr = (caddr_t)kmem_cache_alloc(kmem_cachep, KM_SLEEP); 13072 ASSERT(vaddr != NULL); 13073 } else { 13074 kmem_cachep = sfmmu_tsb_cache[lgrpid]; 13075 vaddr = (caddr_t)kmem_cache_alloc(kmem_cachep, KM_NOSLEEP); 13076 } 13077 13078 tsbinfo->tsb_cache = kmem_cachep; 13079 tsbinfo->tsb_vmp = vmp; 13080 13081 if (vaddr == NULL) { 13082 return (EAGAIN); 13083 } 13084 13085 atomic_add_64(&tsb_alloc_bytes, (int64_t)tsbbytes); 13086 kmem_cachep = tsbinfo->tsb_cache; 13087 13088 /* 13089 * If we are allocating from outside the cage, then we need to 13090 * register a relocation callback handler. Note that for now 13091 * since pseudo mappings always hang off of the slab's root page, 13092 * we need only lock the first 8K of the TSB slab. This is a bit 13093 * hacky but it is good for performance. 13094 */ 13095 if (kmem_cachep != sfmmu_tsb8k_cache) { 13096 slab_vaddr = (caddr_t)((uintptr_t)vaddr & slab_mask); 13097 ret = as_pagelock(&kas, &pplist, slab_vaddr, PAGESIZE, S_WRITE); 13098 ASSERT(ret == 0); 13099 ret = hat_add_callback(sfmmu_tsb_cb_id, vaddr, (uint_t)tsbbytes, 13100 cbflags, (void *)tsbinfo, &pfn, NULL); 13101 13102 /* 13103 * Need to free up resources if we could not successfully 13104 * add the callback function and return an error condition. 13105 */ 13106 if (ret != 0) { 13107 if (kmem_cachep) { 13108 kmem_cache_free(kmem_cachep, vaddr); 13109 } else { 13110 vmem_xfree(vmp, (void *)vaddr, tsbbytes); 13111 } 13112 as_pageunlock(&kas, pplist, slab_vaddr, PAGESIZE, 13113 S_WRITE); 13114 return (EAGAIN); 13115 } 13116 } else { 13117 /* 13118 * Since allocation of 8K TSBs from heap is rare and occurs 13119 * during memory pressure we allocate them from permanent 13120 * memory rather than using callbacks to get the PFN. 
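 * hat_getpfnum() is sufficient here because this memory is never relocated, so the PFN obtained below stays valid for the life of the TSB.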
13121 */ 13122 pfn = hat_getpfnum(kas.a_hat, vaddr); 13123 } 13124 13125 tsbinfo->tsb_va = vaddr; 13126 tsbinfo->tsb_szc = tsbcode; 13127 tsbinfo->tsb_ttesz_mask = tteszmask; 13128 tsbinfo->tsb_next = NULL; 13129 tsbinfo->tsb_flags = 0; 13130 13131 sfmmu_tsbinfo_setup_phys(tsbinfo, pfn); 13132 13133 sfmmu_inv_tsb(vaddr, tsbbytes); 13134 13135 if (kmem_cachep != sfmmu_tsb8k_cache) { 13136 as_pageunlock(&kas, pplist, slab_vaddr, PAGESIZE, S_WRITE); 13137 } 13138 13139 return (0); 13140 } 13141 13142 /* 13143 * Initialize per cpu tsb and per cpu tsbmiss_area 13144 */ 13145 void 13146 sfmmu_init_tsbs(void) 13147 { 13148 int i; 13149 struct tsbmiss *tsbmissp; 13150 struct kpmtsbm *kpmtsbmp; 13151 #ifndef sun4v 13152 extern int dcache_line_mask; 13153 #endif /* sun4v */ 13154 extern uint_t vac_colors; 13155 13156 /* 13157 * Init. tsb miss area. 13158 */ 13159 tsbmissp = tsbmiss_area; 13160 13161 for (i = 0; i < NCPU; tsbmissp++, i++) { 13162 /* 13163 * initialize the tsbmiss area. 13164 * Do this for all possible CPUs as some may be added 13165 * while the system is running. There is no cost to this. 13166 */ 13167 tsbmissp->ksfmmup = ksfmmup; 13168 #ifndef sun4v 13169 tsbmissp->dcache_line_mask = (uint16_t)dcache_line_mask; 13170 #endif /* sun4v */ 13171 tsbmissp->khashstart = 13172 (struct hmehash_bucket *)va_to_pa((caddr_t)khme_hash); 13173 tsbmissp->uhashstart = 13174 (struct hmehash_bucket *)va_to_pa((caddr_t)uhme_hash); 13175 tsbmissp->khashsz = khmehash_num; 13176 tsbmissp->uhashsz = uhmehash_num; 13177 } 13178 13179 sfmmu_tsb_cb_id = hat_register_callback('T'<<16 | 'S' << 8 | 'B', 13180 sfmmu_tsb_pre_relocator, sfmmu_tsb_post_relocator, NULL, 0); 13181 13182 if (kpm_enable == 0) 13183 return; 13184 13185 /* -- Begin KPM specific init -- */ 13186 13187 if (kpm_smallpages) { 13188 /* 13189 * If we're using base pagesize pages for seg_kpm 13190 * mappings, we use the kernel TSB since we can't afford 13191 * to allocate a second huge TSB for these mappings. 13192 */ 13193 kpm_tsbbase = ktsb_phys? ktsb_pbase : (uint64_t)ktsb_base; 13194 kpm_tsbsz = ktsb_szcode; 13195 kpmsm_tsbbase = kpm_tsbbase; 13196 kpmsm_tsbsz = kpm_tsbsz; 13197 } else { 13198 /* 13199 * In VAC conflict case, just put the entries in the 13200 * kernel 8K indexed TSB for now so we can find them. 13201 * This could really be changed in the future if we feel 13202 * the need... 13203 */ 13204 kpmsm_tsbbase = ktsb_phys? ktsb_pbase : (uint64_t)ktsb_base; 13205 kpmsm_tsbsz = ktsb_szcode; 13206 kpm_tsbbase = ktsb_phys? ktsb4m_pbase : (uint64_t)ktsb4m_base; 13207 kpm_tsbsz = ktsb4m_szcode; 13208 } 13209 13210 kpmtsbmp = kpmtsbm_area; 13211 for (i = 0; i < NCPU; kpmtsbmp++, i++) { 13212 /* 13213 * Initialize the kpmtsbm area. 13214 * Do this for all possible CPUs as some may be added 13215 * while the system is running. There is no cost to this. 13216 */ 13217 kpmtsbmp->vbase = kpm_vbase; 13218 kpmtsbmp->vend = kpm_vbase + kpm_size * vac_colors; 13219 kpmtsbmp->sz_shift = kpm_size_shift; 13220 kpmtsbmp->kpmp_shift = kpmp_shift; 13221 kpmtsbmp->kpmp2pshft = (uchar_t)kpmp2pshft; 13222 if (kpm_smallpages == 0) { 13223 kpmtsbmp->kpmp_table_sz = kpmp_table_sz; 13224 kpmtsbmp->kpmp_tablepa = va_to_pa(kpmp_table); 13225 } else { 13226 kpmtsbmp->kpmp_table_sz = kpmp_stable_sz; 13227 kpmtsbmp->kpmp_tablepa = va_to_pa(kpmp_stable); 13228 } 13229 kpmtsbmp->msegphashpa = va_to_pa(memseg_phash); 13230 kpmtsbmp->flags = KPMTSBM_ENABLE_FLAG; 13231 #ifdef DEBUG 13232 kpmtsbmp->flags |= (kpm_tsbmtl) ? 
KPMTSBM_TLTSBM_FLAG : 0; 13233 #endif /* DEBUG */ 13234 if (ktsb_phys) 13235 kpmtsbmp->flags |= KPMTSBM_TSBPHYS_FLAG; 13236 } 13237 13238 /* -- End KPM specific init -- */ 13239 } 13240 13241 /* Avoid using sfmmu_tsbinfo_alloc() to avoid kmem_alloc - no real reason */ 13242 struct tsb_info ktsb_info[2]; 13243 13244 /* 13245 * Called from hat_kern_setup() to setup the tsb_info for ksfmmup. 13246 */ 13247 void 13248 sfmmu_init_ktsbinfo() 13249 { 13250 ASSERT(ksfmmup != NULL); 13251 ASSERT(ksfmmup->sfmmu_tsb == NULL); 13252 /* 13253 * Allocate tsbinfos for kernel and copy in data 13254 * to make debug easier and sun4v setup easier. 13255 */ 13256 ktsb_info[0].tsb_sfmmu = ksfmmup; 13257 ktsb_info[0].tsb_szc = ktsb_szcode; 13258 ktsb_info[0].tsb_ttesz_mask = TSB8K|TSB64K|TSB512K; 13259 ktsb_info[0].tsb_va = ktsb_base; 13260 ktsb_info[0].tsb_pa = ktsb_pbase; 13261 ktsb_info[0].tsb_flags = 0; 13262 ktsb_info[0].tsb_tte.ll = 0; 13263 ktsb_info[0].tsb_cache = NULL; 13264 13265 ktsb_info[1].tsb_sfmmu = ksfmmup; 13266 ktsb_info[1].tsb_szc = ktsb4m_szcode; 13267 ktsb_info[1].tsb_ttesz_mask = TSB4M; 13268 ktsb_info[1].tsb_va = ktsb4m_base; 13269 ktsb_info[1].tsb_pa = ktsb4m_pbase; 13270 ktsb_info[1].tsb_flags = 0; 13271 ktsb_info[1].tsb_tte.ll = 0; 13272 ktsb_info[1].tsb_cache = NULL; 13273 13274 /* Link them into ksfmmup. */ 13275 ktsb_info[0].tsb_next = &ktsb_info[1]; 13276 ktsb_info[1].tsb_next = NULL; 13277 ksfmmup->sfmmu_tsb = &ktsb_info[0]; 13278 13279 sfmmu_setup_tsbinfo(ksfmmup); 13280 } 13281 13282 /* 13283 * Cache the last value returned from va_to_pa(). If the VA specified 13284 * in the current call to cached_va_to_pa() maps to the same Page (as the 13285 * previous call to cached_va_to_pa()), then compute the PA using 13286 * cached info, else call va_to_pa(). 13287 * 13288 * Note: this function is neither MT-safe nor consistent in the presence 13289 * of multiple, interleaved threads. This function was created to enable 13290 * an optimization used during boot (at a point when there's only one thread 13291 * executing on the "boot CPU", and before startup_vm() has been called). 13292 */ 13293 static uint64_t 13294 cached_va_to_pa(void *vaddr) 13295 { 13296 static uint64_t prev_vaddr_base = 0; 13297 static uint64_t prev_pfn = 0; 13298 13299 if ((((uint64_t)vaddr) & MMU_PAGEMASK) == prev_vaddr_base) { 13300 return (prev_pfn | ((uint64_t)vaddr & MMU_PAGEOFFSET)); 13301 } else { 13302 uint64_t pa = va_to_pa(vaddr); 13303 13304 if (pa != ((uint64_t)-1)) { 13305 /* 13306 * Computed physical address is valid. Cache its 13307 * related info for the next cached_va_to_pa() call. 13308 */ 13309 prev_pfn = pa & MMU_PAGEMASK; 13310 prev_vaddr_base = ((uint64_t)vaddr) & MMU_PAGEMASK; 13311 } 13312 13313 return (pa); 13314 } 13315 } 13316 13317 /* 13318 * Carve up our nucleus hblk region. We may allocate more hblks than 13319 * asked due to rounding errors but we are guaranteed to have at least 13320 * enough space to allocate the requested number of hblk8's and hblk1's. 
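 * hblk8_bound below is computed so that the last hblk8 carved ends no later than size - (nhblk1 * hme1blk_sz), guaranteeing room for at least nhblk1 hblk1's afterwards.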
13321 */ 13322 void 13323 sfmmu_init_nucleus_hblks(caddr_t addr, size_t size, int nhblk8, int nhblk1) 13324 { 13325 struct hme_blk *hmeblkp; 13326 size_t hme8blk_sz, hme1blk_sz; 13327 size_t i; 13328 size_t hblk8_bound; 13329 ulong_t j = 0, k = 0; 13330 13331 ASSERT(addr != NULL && size != 0); 13332 13333 /* Need to use proper structure alignment */ 13334 hme8blk_sz = roundup(HME8BLK_SZ, sizeof (int64_t)); 13335 hme1blk_sz = roundup(HME1BLK_SZ, sizeof (int64_t)); 13336 13337 nucleus_hblk8.list = (void *)addr; 13338 nucleus_hblk8.index = 0; 13339 13340 /* 13341 * Use as much memory as possible for hblk8's since we 13342 * expect all bop_alloc'ed memory to be allocated in 8k chunks. 13343 * We need to hold back enough space for the hblk1's which 13344 * we'll allocate next. 13345 */ 13346 hblk8_bound = size - (nhblk1 * hme1blk_sz) - hme8blk_sz; 13347 for (i = 0; i <= hblk8_bound; i += hme8blk_sz, j++) { 13348 hmeblkp = (struct hme_blk *)addr; 13349 addr += hme8blk_sz; 13350 hmeblkp->hblk_nuc_bit = 1; 13351 hmeblkp->hblk_nextpa = cached_va_to_pa((caddr_t)hmeblkp); 13352 } 13353 nucleus_hblk8.len = j; 13354 ASSERT(j >= nhblk8); 13355 SFMMU_STAT_ADD(sf_hblk8_ncreate, j); 13356 13357 nucleus_hblk1.list = (void *)addr; 13358 nucleus_hblk1.index = 0; 13359 for (; i <= (size - hme1blk_sz); i += hme1blk_sz, k++) { 13360 hmeblkp = (struct hme_blk *)addr; 13361 addr += hme1blk_sz; 13362 hmeblkp->hblk_nuc_bit = 1; 13363 hmeblkp->hblk_nextpa = cached_va_to_pa((caddr_t)hmeblkp); 13364 } 13365 ASSERT(k >= nhblk1); 13366 nucleus_hblk1.len = k; 13367 SFMMU_STAT_ADD(sf_hblk1_ncreate, k); 13368 } 13369 13370 /* 13371 * This function is currently not supported on this platform. For what 13372 * it's supposed to do, see hat.c and hat_srmmu.c 13373 */ 13374 /* ARGSUSED */ 13375 faultcode_t 13376 hat_softlock(struct hat *hat, caddr_t addr, size_t *lenp, page_t **ppp, 13377 uint_t flags) 13378 { 13379 ASSERT(hat->sfmmu_xhat_provider == NULL); 13380 return (FC_NOSUPPORT); 13381 } 13382 13383 /* 13384 * Searchs the mapping list of the page for a mapping of the same size. If not 13385 * found the corresponding bit is cleared in the p_index field. When large 13386 * pages are more prevalent in the system, we can maintain the mapping list 13387 * in order and we don't have to traverse the list each time. Just check the 13388 * next and prev entries, and if both are of different size, we clear the bit. 13389 */ 13390 static void 13391 sfmmu_rm_large_mappings(page_t *pp, int ttesz) 13392 { 13393 struct sf_hment *sfhmep; 13394 struct hme_blk *hmeblkp; 13395 int index; 13396 pgcnt_t npgs; 13397 13398 ASSERT(ttesz > TTE8K); 13399 13400 ASSERT(sfmmu_mlist_held(pp)); 13401 13402 ASSERT(PP_ISMAPPED_LARGE(pp)); 13403 13404 /* 13405 * Traverse mapping list looking for another mapping of same size. 13406 * since we only want to clear index field if all mappings of 13407 * that size are gone. 13408 */ 13409 13410 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) { 13411 if (IS_PAHME(sfhmep)) 13412 continue; 13413 hmeblkp = sfmmu_hmetohblk(sfhmep); 13414 if (hmeblkp->hblk_xhat_bit) 13415 continue; 13416 if (hme_size(sfhmep) == ttesz) { 13417 /* 13418 * another mapping of the same size. don't clear index. 13419 */ 13420 return; 13421 } 13422 } 13423 13424 /* 13425 * Clear the p_index bit for large page. 
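 * The index bit is replicated in the p_index of every constituent 8K page, so walk all TTEPAGES(ttesz) pages and clear it in each one.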
13426 */ 13427 index = PAGESZ_TO_INDEX(ttesz); 13428 npgs = TTEPAGES(ttesz); 13429 while (npgs-- > 0) { 13430 ASSERT(pp->p_index & index); 13431 pp->p_index &= ~index; 13432 pp = PP_PAGENEXT(pp); 13433 } 13434 } 13435 13436 /* 13437 * return supported features 13438 */ 13439 /* ARGSUSED */ 13440 int 13441 hat_supported(enum hat_features feature, void *arg) 13442 { 13443 switch (feature) { 13444 case HAT_SHARED_PT: 13445 case HAT_DYNAMIC_ISM_UNMAP: 13446 case HAT_VMODSORT: 13447 return (1); 13448 case HAT_SHARED_REGIONS: 13449 if (shctx_on) 13450 return (1); 13451 else 13452 return (0); 13453 default: 13454 return (0); 13455 } 13456 } 13457 13458 void 13459 hat_enter(struct hat *hat) 13460 { 13461 hatlock_t *hatlockp; 13462 13463 if (hat != ksfmmup) { 13464 hatlockp = TSB_HASH(hat); 13465 mutex_enter(HATLOCK_MUTEXP(hatlockp)); 13466 } 13467 } 13468 13469 void 13470 hat_exit(struct hat *hat) 13471 { 13472 hatlock_t *hatlockp; 13473 13474 if (hat != ksfmmup) { 13475 hatlockp = TSB_HASH(hat); 13476 mutex_exit(HATLOCK_MUTEXP(hatlockp)); 13477 } 13478 } 13479 13480 /*ARGSUSED*/ 13481 void 13482 hat_reserve(struct as *as, caddr_t addr, size_t len) 13483 { 13484 } 13485 13486 static void 13487 hat_kstat_init(void) 13488 { 13489 kstat_t *ksp; 13490 13491 ksp = kstat_create("unix", 0, "sfmmu_global_stat", "hat", 13492 KSTAT_TYPE_RAW, sizeof (struct sfmmu_global_stat), 13493 KSTAT_FLAG_VIRTUAL); 13494 if (ksp) { 13495 ksp->ks_data = (void *) &sfmmu_global_stat; 13496 kstat_install(ksp); 13497 } 13498 ksp = kstat_create("unix", 0, "sfmmu_tsbsize_stat", "hat", 13499 KSTAT_TYPE_RAW, sizeof (struct sfmmu_tsbsize_stat), 13500 KSTAT_FLAG_VIRTUAL); 13501 if (ksp) { 13502 ksp->ks_data = (void *) &sfmmu_tsbsize_stat; 13503 kstat_install(ksp); 13504 } 13505 ksp = kstat_create("unix", 0, "sfmmu_percpu_stat", "hat", 13506 KSTAT_TYPE_RAW, sizeof (struct sfmmu_percpu_stat) * NCPU, 13507 KSTAT_FLAG_WRITABLE); 13508 if (ksp) { 13509 ksp->ks_update = sfmmu_kstat_percpu_update; 13510 kstat_install(ksp); 13511 } 13512 } 13513 13514 /* ARGSUSED */ 13515 static int 13516 sfmmu_kstat_percpu_update(kstat_t *ksp, int rw) 13517 { 13518 struct sfmmu_percpu_stat *cpu_kstat = ksp->ks_data; 13519 struct tsbmiss *tsbm = tsbmiss_area; 13520 struct kpmtsbm *kpmtsbm = kpmtsbm_area; 13521 int i; 13522 13523 ASSERT(cpu_kstat); 13524 if (rw == KSTAT_READ) { 13525 for (i = 0; i < NCPU; cpu_kstat++, tsbm++, kpmtsbm++, i++) { 13526 cpu_kstat->sf_itlb_misses = 0; 13527 cpu_kstat->sf_dtlb_misses = 0; 13528 cpu_kstat->sf_utsb_misses = tsbm->utsb_misses - 13529 tsbm->uprot_traps; 13530 cpu_kstat->sf_ktsb_misses = tsbm->ktsb_misses + 13531 kpmtsbm->kpm_tsb_misses - tsbm->kprot_traps; 13532 cpu_kstat->sf_tsb_hits = 0; 13533 cpu_kstat->sf_umod_faults = tsbm->uprot_traps; 13534 cpu_kstat->sf_kmod_faults = tsbm->kprot_traps; 13535 } 13536 } else { 13537 /* KSTAT_WRITE is used to clear stats */ 13538 for (i = 0; i < NCPU; tsbm++, kpmtsbm++, i++) { 13539 tsbm->utsb_misses = 0; 13540 tsbm->ktsb_misses = 0; 13541 tsbm->uprot_traps = 0; 13542 tsbm->kprot_traps = 0; 13543 kpmtsbm->kpm_dtlb_misses = 0; 13544 kpmtsbm->kpm_tsb_misses = 0; 13545 } 13546 } 13547 return (0); 13548 } 13549 13550 #ifdef DEBUG 13551 13552 tte_t *gorig[NCPU], *gcur[NCPU], *gnew[NCPU]; 13553 13554 /* 13555 * A tte checker. *orig_old is the value we read before cas. 13556 * *cur is the value returned by cas. 13557 * *new is the desired value when we do the cas. 13558 * 13559 * *hmeblkp is currently unused. 
13560 */ 13561 13562 /* ARGSUSED */ 13563 void 13564 chk_tte(tte_t *orig_old, tte_t *cur, tte_t *new, struct hme_blk *hmeblkp) 13565 { 13566 pfn_t i, j, k; 13567 int cpuid = CPU->cpu_id; 13568 13569 gorig[cpuid] = orig_old; 13570 gcur[cpuid] = cur; 13571 gnew[cpuid] = new; 13572 13573 #ifdef lint 13574 hmeblkp = hmeblkp; 13575 #endif 13576 13577 if (TTE_IS_VALID(orig_old)) { 13578 if (TTE_IS_VALID(cur)) { 13579 i = TTE_TO_TTEPFN(orig_old); 13580 j = TTE_TO_TTEPFN(cur); 13581 k = TTE_TO_TTEPFN(new); 13582 if (i != j) { 13583 /* remap error? */ 13584 panic("chk_tte: bad pfn, 0x%lx, 0x%lx", i, j); 13585 } 13586 13587 if (i != k) { 13588 /* remap error? */ 13589 panic("chk_tte: bad pfn2, 0x%lx, 0x%lx", i, k); 13590 } 13591 } else { 13592 if (TTE_IS_VALID(new)) { 13593 panic("chk_tte: invalid cur? "); 13594 } 13595 13596 i = TTE_TO_TTEPFN(orig_old); 13597 k = TTE_TO_TTEPFN(new); 13598 if (i != k) { 13599 panic("chk_tte: bad pfn3, 0x%lx, 0x%lx", i, k); 13600 } 13601 } 13602 } else { 13603 if (TTE_IS_VALID(cur)) { 13604 j = TTE_TO_TTEPFN(cur); 13605 if (TTE_IS_VALID(new)) { 13606 k = TTE_TO_TTEPFN(new); 13607 if (j != k) { 13608 panic("chk_tte: bad pfn4, 0x%lx, 0x%lx", 13609 j, k); 13610 } 13611 } else { 13612 panic("chk_tte: why here?"); 13613 } 13614 } else { 13615 if (!TTE_IS_VALID(new)) { 13616 panic("chk_tte: why here2 ?"); 13617 } 13618 } 13619 } 13620 } 13621 13622 #endif /* DEBUG */ 13623 13624 extern void prefetch_tsbe_read(struct tsbe *); 13625 extern void prefetch_tsbe_write(struct tsbe *); 13626 13627 13628 /* 13629 * We want to prefetch 7 cache lines ahead for our read prefetch. This gives 13630 * us optimal performance on Cheetah+. You can only have 8 outstanding 13631 * prefetches at any one time, so we opted for 7 read prefetches and 1 write 13632 * prefetch to make the most utilization of the prefetch capability. 13633 */ 13634 #define TSBE_PREFETCH_STRIDE (7) 13635 13636 void 13637 sfmmu_copy_tsb(struct tsb_info *old_tsbinfo, struct tsb_info *new_tsbinfo) 13638 { 13639 int old_bytes = TSB_BYTES(old_tsbinfo->tsb_szc); 13640 int new_bytes = TSB_BYTES(new_tsbinfo->tsb_szc); 13641 int old_entries = TSB_ENTRIES(old_tsbinfo->tsb_szc); 13642 int new_entries = TSB_ENTRIES(new_tsbinfo->tsb_szc); 13643 struct tsbe *old; 13644 struct tsbe *new; 13645 struct tsbe *new_base = (struct tsbe *)new_tsbinfo->tsb_va; 13646 uint64_t va; 13647 int new_offset; 13648 int i; 13649 int vpshift; 13650 int last_prefetch; 13651 13652 if (old_bytes == new_bytes) { 13653 bcopy(old_tsbinfo->tsb_va, new_tsbinfo->tsb_va, new_bytes); 13654 } else { 13655 13656 /* 13657 * A TSBE is 16 bytes which means there are four TSBE's per 13658 * P$ line (64 bytes), thus every 4 TSBE's we prefetch. 13659 */ 13660 old = (struct tsbe *)old_tsbinfo->tsb_va; 13661 last_prefetch = old_entries - (4*(TSBE_PREFETCH_STRIDE+1)); 13662 for (i = 0; i < old_entries; i++, old++) { 13663 if (((i & (4-1)) == 0) && (i < last_prefetch)) 13664 prefetch_tsbe_read(old); 13665 if (!old->tte_tag.tag_invalid) { 13666 /* 13667 * We have a valid TTE to remap. Check the 13668 * size. We won't remap 64K or 512K TTEs 13669 * because they span more than one TSB entry 13670 * and are indexed using an 8K virt. page. 13671 * Ditto for 32M and 256M TTEs. 
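 * Entries that are skipped are simply not copied; they will be brought back into the new TSB on the next TSB miss for those addresses.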
13672 */ 13673 if (TTE_CSZ(&old->tte_data) == TTE64K || 13674 TTE_CSZ(&old->tte_data) == TTE512K) 13675 continue; 13676 if (mmu_page_sizes == max_mmu_page_sizes) { 13677 if (TTE_CSZ(&old->tte_data) == TTE32M || 13678 TTE_CSZ(&old->tte_data) == TTE256M) 13679 continue; 13680 } 13681 13682 /* clear the lower 22 bits of the va */ 13683 va = *(uint64_t *)old << 22; 13684 /* turn va into a virtual pfn */ 13685 va >>= 22 - TSB_START_SIZE; 13686 /* 13687 * or in bits from the offset in the tsb 13688 * to get the real virtual pfn. These 13689 * correspond to bits [21:13] in the va 13690 */ 13691 vpshift = 13692 TTE_BSZS_SHIFT(TTE_CSZ(&old->tte_data)) & 13693 0x1ff; 13694 va |= (i << vpshift); 13695 va >>= vpshift; 13696 new_offset = va & (new_entries - 1); 13697 new = new_base + new_offset; 13698 prefetch_tsbe_write(new); 13699 *new = *old; 13700 } 13701 } 13702 } 13703 } 13704 13705 /* 13706 * unused in sfmmu 13707 */ 13708 void 13709 hat_dump(void) 13710 { 13711 } 13712 13713 /* 13714 * Called when a thread is exiting and we have switched to the kernel address 13715 * space. Perform the same VM initialization resume() uses when switching 13716 * processes. 13717 * 13718 * Note that sfmmu_load_mmustate() is currently a no-op for kernel threads, but 13719 * we call it anyway in case the semantics change in the future. 13720 */ 13721 /*ARGSUSED*/ 13722 void 13723 hat_thread_exit(kthread_t *thd) 13724 { 13725 uint_t pgsz_cnum; 13726 uint_t pstate_save; 13727 13728 ASSERT(thd->t_procp->p_as == &kas); 13729 13730 pgsz_cnum = KCONTEXT; 13731 #ifdef sun4u 13732 pgsz_cnum |= (ksfmmup->sfmmu_cext << CTXREG_EXT_SHIFT); 13733 #endif 13734 13735 /* 13736 * Note that sfmmu_load_mmustate() is currently a no-op for 13737 * kernel threads. We need to disable interrupts here, 13738 * simply because otherwise sfmmu_load_mmustate() would panic 13739 * if the caller does not disable interrupts. 13740 */ 13741 pstate_save = sfmmu_disable_intrs(); 13742 13743 /* Compatibility Note: hw takes care of MMU_SCONTEXT1 */ 13744 sfmmu_setctx_sec(pgsz_cnum); 13745 sfmmu_load_mmustate(ksfmmup); 13746 sfmmu_enable_intrs(pstate_save); 13747 } 13748 13749 13750 /* 13751 * SRD support 13752 */ 13753 #define SRD_HASH_FUNCTION(vp) (((((uintptr_t)(vp)) >> 4) ^ \ 13754 (((uintptr_t)(vp)) >> 11)) & \ 13755 srd_hashmask) 13756 13757 /* 13758 * Attach the process to the srd struct associated with the exec vnode 13759 * from which the process is started. 
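 *
 * A rough life cycle (the exec/exit wording is descriptive; only
 * hat_join_srd() and sfmmu_leave_srd() below are real interfaces):
 *
 *	hat_join_srd(hat, exec_vnode)		at exec time; takes a VN_HOLD
 *	hat_join_region()/hat_leave_region()	while the process runs
 *	sfmmu_leave_srd(sfmmup)			when the hat is freed; drops the
 *						srd reference and does the
 *						VN_RELE on the last detach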
13760 */ 13761 void 13762 hat_join_srd(struct hat *sfmmup, vnode_t *evp) 13763 { 13764 uint_t hash = SRD_HASH_FUNCTION(evp); 13765 sf_srd_t *srdp; 13766 sf_srd_t *newsrdp; 13767 13768 ASSERT(sfmmup != ksfmmup); 13769 ASSERT(sfmmup->sfmmu_srdp == NULL); 13770 13771 if (!shctx_on) { 13772 return; 13773 } 13774 13775 VN_HOLD(evp); 13776 13777 if (srd_buckets[hash].srdb_srdp != NULL) { 13778 mutex_enter(&srd_buckets[hash].srdb_lock); 13779 for (srdp = srd_buckets[hash].srdb_srdp; srdp != NULL; 13780 srdp = srdp->srd_hash) { 13781 if (srdp->srd_evp == evp) { 13782 ASSERT(srdp->srd_refcnt >= 0); 13783 sfmmup->sfmmu_srdp = srdp; 13784 atomic_add_32( 13785 (volatile uint_t *)&srdp->srd_refcnt, 1); 13786 mutex_exit(&srd_buckets[hash].srdb_lock); 13787 return; 13788 } 13789 } 13790 mutex_exit(&srd_buckets[hash].srdb_lock); 13791 } 13792 newsrdp = kmem_cache_alloc(srd_cache, KM_SLEEP); 13793 ASSERT(newsrdp->srd_next_ismrid == 0 && newsrdp->srd_next_hmerid == 0); 13794 13795 newsrdp->srd_evp = evp; 13796 newsrdp->srd_refcnt = 1; 13797 newsrdp->srd_hmergnfree = NULL; 13798 newsrdp->srd_ismrgnfree = NULL; 13799 13800 mutex_enter(&srd_buckets[hash].srdb_lock); 13801 for (srdp = srd_buckets[hash].srdb_srdp; srdp != NULL; 13802 srdp = srdp->srd_hash) { 13803 if (srdp->srd_evp == evp) { 13804 ASSERT(srdp->srd_refcnt >= 0); 13805 sfmmup->sfmmu_srdp = srdp; 13806 atomic_add_32((volatile uint_t *)&srdp->srd_refcnt, 1); 13807 mutex_exit(&srd_buckets[hash].srdb_lock); 13808 kmem_cache_free(srd_cache, newsrdp); 13809 return; 13810 } 13811 } 13812 newsrdp->srd_hash = srd_buckets[hash].srdb_srdp; 13813 srd_buckets[hash].srdb_srdp = newsrdp; 13814 sfmmup->sfmmu_srdp = newsrdp; 13815 13816 mutex_exit(&srd_buckets[hash].srdb_lock); 13817 13818 } 13819 13820 static void 13821 sfmmu_leave_srd(sfmmu_t *sfmmup) 13822 { 13823 vnode_t *evp; 13824 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 13825 uint_t hash; 13826 sf_srd_t **prev_srdpp; 13827 sf_region_t *rgnp; 13828 sf_region_t *nrgnp; 13829 #ifdef DEBUG 13830 int rgns = 0; 13831 #endif 13832 int i; 13833 13834 ASSERT(sfmmup != ksfmmup); 13835 ASSERT(srdp != NULL); 13836 ASSERT(srdp->srd_refcnt > 0); 13837 ASSERT(sfmmup->sfmmu_scdp == NULL); 13838 ASSERT(sfmmup->sfmmu_free == 1); 13839 13840 sfmmup->sfmmu_srdp = NULL; 13841 evp = srdp->srd_evp; 13842 ASSERT(evp != NULL); 13843 if (atomic_add_32_nv( 13844 (volatile uint_t *)&srdp->srd_refcnt, -1)) { 13845 VN_RELE(evp); 13846 return; 13847 } 13848 13849 hash = SRD_HASH_FUNCTION(evp); 13850 mutex_enter(&srd_buckets[hash].srdb_lock); 13851 for (prev_srdpp = &srd_buckets[hash].srdb_srdp; 13852 (srdp = *prev_srdpp) != NULL; prev_srdpp = &srdp->srd_hash) { 13853 if (srdp->srd_evp == evp) { 13854 break; 13855 } 13856 } 13857 if (srdp == NULL || srdp->srd_refcnt) { 13858 mutex_exit(&srd_buckets[hash].srdb_lock); 13859 VN_RELE(evp); 13860 return; 13861 } 13862 *prev_srdpp = srdp->srd_hash; 13863 mutex_exit(&srd_buckets[hash].srdb_lock); 13864 13865 ASSERT(srdp->srd_refcnt == 0); 13866 VN_RELE(evp); 13867 13868 #ifdef DEBUG 13869 for (i = 0; i < SFMMU_MAX_REGION_BUCKETS; i++) { 13870 ASSERT(srdp->srd_rgnhash[i] == NULL); 13871 } 13872 #endif /* DEBUG */ 13873 13874 /* free each hme regions in the srd */ 13875 for (rgnp = srdp->srd_hmergnfree; rgnp != NULL; rgnp = nrgnp) { 13876 nrgnp = rgnp->rgn_next; 13877 ASSERT(rgnp->rgn_id < srdp->srd_next_hmerid); 13878 ASSERT(rgnp->rgn_refcnt == 0); 13879 ASSERT(rgnp->rgn_sfmmu_head == NULL); 13880 ASSERT(rgnp->rgn_flags & SFMMU_REGION_FREE); 13881 ASSERT(rgnp->rgn_hmeflags == 0); 13882 
ASSERT(srdp->srd_hmergnp[rgnp->rgn_id] == rgnp); 13883 #ifdef DEBUG 13884 for (i = 0; i < MMU_PAGE_SIZES; i++) { 13885 ASSERT(rgnp->rgn_ttecnt[i] == 0); 13886 } 13887 rgns++; 13888 #endif /* DEBUG */ 13889 kmem_cache_free(region_cache, rgnp); 13890 } 13891 ASSERT(rgns == srdp->srd_next_hmerid); 13892 13893 #ifdef DEBUG 13894 rgns = 0; 13895 #endif 13896 /* free each ism rgns in the srd */ 13897 for (rgnp = srdp->srd_ismrgnfree; rgnp != NULL; rgnp = nrgnp) { 13898 nrgnp = rgnp->rgn_next; 13899 ASSERT(rgnp->rgn_id < srdp->srd_next_ismrid); 13900 ASSERT(rgnp->rgn_refcnt == 0); 13901 ASSERT(rgnp->rgn_sfmmu_head == NULL); 13902 ASSERT(rgnp->rgn_flags & SFMMU_REGION_FREE); 13903 ASSERT(srdp->srd_ismrgnp[rgnp->rgn_id] == rgnp); 13904 #ifdef DEBUG 13905 for (i = 0; i < MMU_PAGE_SIZES; i++) { 13906 ASSERT(rgnp->rgn_ttecnt[i] == 0); 13907 } 13908 rgns++; 13909 #endif /* DEBUG */ 13910 kmem_cache_free(region_cache, rgnp); 13911 } 13912 ASSERT(rgns == srdp->srd_next_ismrid); 13913 ASSERT(srdp->srd_ismbusyrgns == 0); 13914 ASSERT(srdp->srd_hmebusyrgns == 0); 13915 13916 srdp->srd_next_ismrid = 0; 13917 srdp->srd_next_hmerid = 0; 13918 13919 bzero((void *)srdp->srd_ismrgnp, 13920 sizeof (sf_region_t *) * SFMMU_MAX_ISM_REGIONS); 13921 bzero((void *)srdp->srd_hmergnp, 13922 sizeof (sf_region_t *) * SFMMU_MAX_HME_REGIONS); 13923 13924 ASSERT(srdp->srd_scdp == NULL); 13925 kmem_cache_free(srd_cache, srdp); 13926 } 13927 13928 /* ARGSUSED */ 13929 static int 13930 sfmmu_srdcache_constructor(void *buf, void *cdrarg, int kmflags) 13931 { 13932 sf_srd_t *srdp = (sf_srd_t *)buf; 13933 bzero(buf, sizeof (*srdp)); 13934 13935 mutex_init(&srdp->srd_mutex, NULL, MUTEX_DEFAULT, NULL); 13936 mutex_init(&srdp->srd_scd_mutex, NULL, MUTEX_DEFAULT, NULL); 13937 return (0); 13938 } 13939 13940 /* ARGSUSED */ 13941 static void 13942 sfmmu_srdcache_destructor(void *buf, void *cdrarg) 13943 { 13944 sf_srd_t *srdp = (sf_srd_t *)buf; 13945 13946 mutex_destroy(&srdp->srd_mutex); 13947 mutex_destroy(&srdp->srd_scd_mutex); 13948 } 13949 13950 /* 13951 * The caller makes sure hat_join_region()/hat_leave_region() can't be called 13952 * at the same time for the same process and address range. This is ensured by 13953 * the fact that address space is locked as writer when a process joins the 13954 * regions. Therefore there's no need to hold an srd lock during the entire 13955 * execution of hat_join_region()/hat_leave_region(). 13956 */ 13957 13958 #define RGN_HASH_FUNCTION(obj) (((((uintptr_t)(obj)) >> 4) ^ \ 13959 (((uintptr_t)(obj)) >> 11)) & \ 13960 srd_rgn_hashmask) 13961 /* 13962 * This routine implements the shared context functionality required when 13963 * attaching a segment to an address space. It must be called from 13964 * hat_share() for D(ISM) segments and from segvn_create() for segments 13965 * with the MAP_PRIVATE and MAP_TEXT flags set. It returns a region_cookie 13966 * which is saved in the private segment data for hme segments and 13967 * the ism_map structure for ism segments. 
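 *
 * Illustrative call (addr, len, vp, off, szc and cb_func are placeholders,
 * not names taken from this file):
 *
 *	cookie = hat_join_region(as->a_hat, addr, len, (void *)vp, off,
 *	    PROT_READ | PROT_EXEC, szc, cb_func, HAT_REGION_TEXT);
 *	if (cookie != HAT_INVALID_REGION_COOKIE) {
 *		remember cookie and hand it back via hat_leave_region()
 *		when the segment goes away
 *	}
 *
 * A return value of HAT_INVALID_REGION_COOKIE simply means the mapping is
 * not being shared and the caller proceeds with ordinary private mappings.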
13968 */ 13969 hat_region_cookie_t 13970 hat_join_region(struct hat *sfmmup, 13971 caddr_t r_saddr, 13972 size_t r_size, 13973 void *r_obj, 13974 u_offset_t r_objoff, 13975 uchar_t r_perm, 13976 uchar_t r_pgszc, 13977 hat_rgn_cb_func_t r_cb_function, 13978 uint_t flags) 13979 { 13980 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 13981 uint_t rhash; 13982 uint_t rid; 13983 hatlock_t *hatlockp; 13984 sf_region_t *rgnp; 13985 sf_region_t *new_rgnp = NULL; 13986 int i; 13987 uint16_t *nextidp; 13988 sf_region_t **freelistp; 13989 int maxids; 13990 sf_region_t **rarrp; 13991 uint16_t *busyrgnsp; 13992 ulong_t rttecnt; 13993 uchar_t tteflag; 13994 uchar_t r_type = flags & HAT_REGION_TYPE_MASK; 13995 int text = (r_type == HAT_REGION_TEXT); 13996 13997 if (srdp == NULL || r_size == 0) { 13998 return (HAT_INVALID_REGION_COOKIE); 13999 } 14000 14001 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 14002 ASSERT(sfmmup != ksfmmup); 14003 ASSERT(AS_WRITE_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 14004 ASSERT(srdp->srd_refcnt > 0); 14005 ASSERT(!(flags & ~HAT_REGION_TYPE_MASK)); 14006 ASSERT(flags == HAT_REGION_TEXT || flags == HAT_REGION_ISM); 14007 ASSERT(r_pgszc < mmu_page_sizes); 14008 if (!IS_P2ALIGNED(r_saddr, TTEBYTES(r_pgszc)) || 14009 !IS_P2ALIGNED(r_size, TTEBYTES(r_pgszc))) { 14010 panic("hat_join_region: region addr or size is not aligned\n"); 14011 } 14012 14013 14014 r_type = (r_type == HAT_REGION_ISM) ? SFMMU_REGION_ISM : 14015 SFMMU_REGION_HME; 14016 /* 14017 * Currently only support shared hmes for the read only main text 14018 * region. 14019 */ 14020 if (r_type == SFMMU_REGION_HME && ((r_obj != srdp->srd_evp) || 14021 (r_perm & PROT_WRITE))) { 14022 return (HAT_INVALID_REGION_COOKIE); 14023 } 14024 14025 rhash = RGN_HASH_FUNCTION(r_obj); 14026 14027 if (r_type == SFMMU_REGION_ISM) { 14028 nextidp = &srdp->srd_next_ismrid; 14029 freelistp = &srdp->srd_ismrgnfree; 14030 maxids = SFMMU_MAX_ISM_REGIONS; 14031 rarrp = srdp->srd_ismrgnp; 14032 busyrgnsp = &srdp->srd_ismbusyrgns; 14033 } else { 14034 nextidp = &srdp->srd_next_hmerid; 14035 freelistp = &srdp->srd_hmergnfree; 14036 maxids = SFMMU_MAX_HME_REGIONS; 14037 rarrp = srdp->srd_hmergnp; 14038 busyrgnsp = &srdp->srd_hmebusyrgns; 14039 } 14040 14041 mutex_enter(&srdp->srd_mutex); 14042 14043 for (rgnp = srdp->srd_rgnhash[rhash]; rgnp != NULL; 14044 rgnp = rgnp->rgn_hash) { 14045 if (rgnp->rgn_saddr == r_saddr && rgnp->rgn_size == r_size && 14046 rgnp->rgn_obj == r_obj && rgnp->rgn_objoff == r_objoff && 14047 rgnp->rgn_perm == r_perm && rgnp->rgn_pgszc == r_pgszc) { 14048 break; 14049 } 14050 } 14051 14052 rfound: 14053 if (rgnp != NULL) { 14054 ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == r_type); 14055 ASSERT(rgnp->rgn_cb_function == r_cb_function); 14056 ASSERT(rgnp->rgn_refcnt >= 0); 14057 rid = rgnp->rgn_id; 14058 ASSERT(rid < maxids); 14059 ASSERT(rarrp[rid] == rgnp); 14060 ASSERT(rid < *nextidp); 14061 atomic_add_32((volatile uint_t *)&rgnp->rgn_refcnt, 1); 14062 mutex_exit(&srdp->srd_mutex); 14063 if (new_rgnp != NULL) { 14064 kmem_cache_free(region_cache, new_rgnp); 14065 } 14066 if (r_type == SFMMU_REGION_HME) { 14067 int myjoin = 14068 (sfmmup == astosfmmu(curthread->t_procp->p_as)); 14069 14070 sfmmu_link_to_hmeregion(sfmmup, rgnp); 14071 /* 14072 * bitmap should be updated after linking sfmmu on 14073 * region list so that pageunload() doesn't skip 14074 * TSB/TLB flush. As soon as bitmap is updated another 14075 * thread in this process can already start accessing 14076 * this region. 
14077 */ 14078 /* 14079 * Normally ttecnt accounting is done as part of 14080 * pagefault handling. But a process may not take any 14081 * pagefaults on shared hmeblks created by some other 14082 * process. To compensate for this assume that the 14083 * entire region will end up faulted in using 14084 * the region's pagesize. 14085 * 14086 */ 14087 if (r_pgszc > TTE8K) { 14088 tteflag = 1 << r_pgszc; 14089 if (disable_large_pages & tteflag) { 14090 tteflag = 0; 14091 } 14092 } else { 14093 tteflag = 0; 14094 } 14095 if (tteflag && !(sfmmup->sfmmu_rtteflags & tteflag)) { 14096 hatlockp = sfmmu_hat_enter(sfmmup); 14097 sfmmup->sfmmu_rtteflags |= tteflag; 14098 sfmmu_hat_exit(hatlockp); 14099 } 14100 hatlockp = sfmmu_hat_enter(sfmmup); 14101 14102 /* 14103 * Preallocate 1/4 of ttecnt's in 8K TSB for >= 4M 14104 * region to allow for large page allocation failure. 14105 */ 14106 if (r_pgszc >= TTE4M) { 14107 sfmmup->sfmmu_tsb0_4minflcnt += 14108 r_size >> (TTE_PAGE_SHIFT(TTE8K) + 2); 14109 } 14110 14111 /* update sfmmu_ttecnt with the shme rgn ttecnt */ 14112 rttecnt = r_size >> TTE_PAGE_SHIFT(r_pgszc); 14113 atomic_add_long(&sfmmup->sfmmu_ttecnt[r_pgszc], 14114 rttecnt); 14115 14116 if (text && r_pgszc >= TTE4M && 14117 (tteflag || ((disable_large_pages >> TTE4M) & 14118 ((1 << (r_pgszc - TTE4M + 1)) - 1))) && 14119 !SFMMU_FLAGS_ISSET(sfmmup, HAT_4MTEXT_FLAG)) { 14120 SFMMU_FLAGS_SET(sfmmup, HAT_4MTEXT_FLAG); 14121 } 14122 14123 sfmmu_hat_exit(hatlockp); 14124 /* 14125 * On Panther we need to make sure TLB is programmed 14126 * to accept 32M/256M pages. Call 14127 * sfmmu_check_page_sizes() now to make sure TLB is 14128 * setup before making hmeregions visible to other 14129 * threads. 14130 */ 14131 sfmmu_check_page_sizes(sfmmup, 1); 14132 hatlockp = sfmmu_hat_enter(sfmmup); 14133 SF_RGNMAP_ADD(sfmmup->sfmmu_hmeregion_map, rid); 14134 14135 /* 14136 * if context is invalid tsb miss exception code will 14137 * call sfmmu_check_page_sizes() and update tsbmiss 14138 * area later. 14139 */ 14140 kpreempt_disable(); 14141 if (myjoin && 14142 (sfmmup->sfmmu_ctxs[CPU_MMU_IDX(CPU)].cnum 14143 != INVALID_CONTEXT)) { 14144 struct tsbmiss *tsbmp; 14145 14146 tsbmp = &tsbmiss_area[CPU->cpu_id]; 14147 ASSERT(sfmmup == tsbmp->usfmmup); 14148 BT_SET(tsbmp->shmermap, rid); 14149 if (r_pgszc > TTE64K) { 14150 tsbmp->uhat_rtteflags |= tteflag; 14151 } 14152 14153 } 14154 kpreempt_enable(); 14155 14156 sfmmu_hat_exit(hatlockp); 14157 ASSERT((hat_region_cookie_t)((uint64_t)rid) != 14158 HAT_INVALID_REGION_COOKIE); 14159 } else { 14160 hatlockp = sfmmu_hat_enter(sfmmup); 14161 SF_RGNMAP_ADD(sfmmup->sfmmu_ismregion_map, rid); 14162 sfmmu_hat_exit(hatlockp); 14163 } 14164 ASSERT(rid < maxids); 14165 14166 if (r_type == SFMMU_REGION_ISM) { 14167 sfmmu_find_scd(sfmmup); 14168 } 14169 return ((hat_region_cookie_t)((uint64_t)rid)); 14170 } 14171 14172 ASSERT(new_rgnp == NULL); 14173 14174 if (*busyrgnsp >= maxids) { 14175 mutex_exit(&srdp->srd_mutex); 14176 return (HAT_INVALID_REGION_COOKIE); 14177 } 14178 14179 ASSERT(MUTEX_HELD(&srdp->srd_mutex)); 14180 if (*freelistp != NULL) { 14181 rgnp = *freelistp; 14182 *freelistp = rgnp->rgn_next; 14183 ASSERT(rgnp->rgn_id < *nextidp); 14184 ASSERT(rgnp->rgn_id < maxids); 14185 ASSERT(rgnp->rgn_flags & SFMMU_REGION_FREE); 14186 ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) 14187 == r_type); 14188 ASSERT(rarrp[rgnp->rgn_id] == rgnp); 14189 ASSERT(rgnp->rgn_hmeflags == 0); 14190 } else { 14191 /* 14192 * release local locks before memory allocation. 
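 * The KM_SLEEP allocation below may block, so srd_mutex is dropped here
 * and the region hash is searched again once the lock is reacquired; the
 * rfound label above handles the case where another thread added the
 * region in the meantime.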
14193 */ 14194 mutex_exit(&srdp->srd_mutex); 14195 14196 new_rgnp = kmem_cache_alloc(region_cache, KM_SLEEP); 14197 14198 mutex_enter(&srdp->srd_mutex); 14199 for (rgnp = srdp->srd_rgnhash[rhash]; rgnp != NULL; 14200 rgnp = rgnp->rgn_hash) { 14201 if (rgnp->rgn_saddr == r_saddr && 14202 rgnp->rgn_size == r_size && 14203 rgnp->rgn_obj == r_obj && 14204 rgnp->rgn_objoff == r_objoff && 14205 rgnp->rgn_perm == r_perm && 14206 rgnp->rgn_pgszc == r_pgszc) { 14207 break; 14208 } 14209 } 14210 if (rgnp != NULL) { 14211 goto rfound; 14212 } 14213 14214 if (*nextidp >= maxids) { 14215 mutex_exit(&srdp->srd_mutex); 14216 goto fail; 14217 } 14218 rgnp = new_rgnp; 14219 new_rgnp = NULL; 14220 rgnp->rgn_id = (*nextidp)++; 14221 ASSERT(rgnp->rgn_id < maxids); 14222 ASSERT(rarrp[rgnp->rgn_id] == NULL); 14223 rarrp[rgnp->rgn_id] = rgnp; 14224 } 14225 14226 ASSERT(rgnp->rgn_sfmmu_head == NULL); 14227 ASSERT(rgnp->rgn_hmeflags == 0); 14228 #ifdef DEBUG 14229 for (i = 0; i < MMU_PAGE_SIZES; i++) { 14230 ASSERT(rgnp->rgn_ttecnt[i] == 0); 14231 } 14232 #endif 14233 rgnp->rgn_saddr = r_saddr; 14234 rgnp->rgn_size = r_size; 14235 rgnp->rgn_obj = r_obj; 14236 rgnp->rgn_objoff = r_objoff; 14237 rgnp->rgn_perm = r_perm; 14238 rgnp->rgn_pgszc = r_pgszc; 14239 rgnp->rgn_flags = r_type; 14240 rgnp->rgn_refcnt = 0; 14241 rgnp->rgn_cb_function = r_cb_function; 14242 rgnp->rgn_hash = srdp->srd_rgnhash[rhash]; 14243 srdp->srd_rgnhash[rhash] = rgnp; 14244 (*busyrgnsp)++; 14245 ASSERT(*busyrgnsp <= maxids); 14246 goto rfound; 14247 14248 fail: 14249 ASSERT(new_rgnp != NULL); 14250 kmem_cache_free(region_cache, new_rgnp); 14251 return (HAT_INVALID_REGION_COOKIE); 14252 } 14253 14254 /* 14255 * This function implements the shared context functionality required 14256 * when detaching a segment from an address space. It must be called 14257 * from hat_unshare() for all D(ISM) segments and from segvn_unmap(), 14258 * for segments with a valid region_cookie. 14259 * It will also be called from all seg_vn routines which change a 14260 * segment's attributes such as segvn_setprot(), segvn_setpagesize(), 14261 * segvn_clrszc() & segvn_advise(), as well as in the case of COW fault 14262 * from segvn_fault(). 14263 */ 14264 void 14265 hat_leave_region(struct hat *sfmmup, hat_region_cookie_t rcookie, uint_t flags) 14266 { 14267 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 14268 sf_scd_t *scdp; 14269 uint_t rhash; 14270 uint_t rid = (uint_t)((uint64_t)rcookie); 14271 hatlock_t *hatlockp = NULL; 14272 sf_region_t *rgnp; 14273 sf_region_t **prev_rgnpp; 14274 sf_region_t *cur_rgnp; 14275 void *r_obj; 14276 int i; 14277 caddr_t r_saddr; 14278 caddr_t r_eaddr; 14279 size_t r_size; 14280 uchar_t r_pgszc; 14281 uchar_t r_type = flags & HAT_REGION_TYPE_MASK; 14282 14283 ASSERT(sfmmup != ksfmmup); 14284 ASSERT(srdp != NULL); 14285 ASSERT(srdp->srd_refcnt > 0); 14286 ASSERT(!(flags & ~HAT_REGION_TYPE_MASK)); 14287 ASSERT(flags == HAT_REGION_TEXT || flags == HAT_REGION_ISM); 14288 ASSERT(!sfmmup->sfmmu_free || sfmmup->sfmmu_scdp == NULL); 14289 14290 r_type = (r_type == HAT_REGION_ISM) ? 
SFMMU_REGION_ISM : 14291 SFMMU_REGION_HME; 14292 14293 if (r_type == SFMMU_REGION_ISM) { 14294 ASSERT(SFMMU_IS_ISMRID_VALID(rid)); 14295 ASSERT(rid < SFMMU_MAX_ISM_REGIONS); 14296 rgnp = srdp->srd_ismrgnp[rid]; 14297 } else { 14298 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 14299 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 14300 rgnp = srdp->srd_hmergnp[rid]; 14301 } 14302 ASSERT(rgnp != NULL); 14303 ASSERT(rgnp->rgn_id == rid); 14304 ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == r_type); 14305 ASSERT(!(rgnp->rgn_flags & SFMMU_REGION_FREE)); 14306 ASSERT(AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 14307 14308 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 14309 if (r_type == SFMMU_REGION_HME && sfmmup->sfmmu_as->a_xhat != NULL) { 14310 xhat_unload_callback_all(sfmmup->sfmmu_as, rgnp->rgn_saddr, 14311 rgnp->rgn_size, 0, NULL); 14312 } 14313 14314 if (sfmmup->sfmmu_free) { 14315 ulong_t rttecnt; 14316 r_pgszc = rgnp->rgn_pgszc; 14317 r_size = rgnp->rgn_size; 14318 14319 ASSERT(sfmmup->sfmmu_scdp == NULL); 14320 if (r_type == SFMMU_REGION_ISM) { 14321 SF_RGNMAP_DEL(sfmmup->sfmmu_ismregion_map, rid); 14322 } else { 14323 /* update shme rgns ttecnt in sfmmu_ttecnt */ 14324 rttecnt = r_size >> TTE_PAGE_SHIFT(r_pgszc); 14325 ASSERT(sfmmup->sfmmu_ttecnt[r_pgszc] >= rttecnt); 14326 14327 atomic_add_long(&sfmmup->sfmmu_ttecnt[r_pgszc], 14328 -rttecnt); 14329 14330 SF_RGNMAP_DEL(sfmmup->sfmmu_hmeregion_map, rid); 14331 } 14332 } else if (r_type == SFMMU_REGION_ISM) { 14333 hatlockp = sfmmu_hat_enter(sfmmup); 14334 ASSERT(rid < srdp->srd_next_ismrid); 14335 SF_RGNMAP_DEL(sfmmup->sfmmu_ismregion_map, rid); 14336 scdp = sfmmup->sfmmu_scdp; 14337 if (scdp != NULL && 14338 SF_RGNMAP_TEST(scdp->scd_ismregion_map, rid)) { 14339 sfmmu_leave_scd(sfmmup, r_type); 14340 ASSERT(sfmmu_hat_lock_held(sfmmup)); 14341 } 14342 sfmmu_hat_exit(hatlockp); 14343 } else { 14344 ulong_t rttecnt; 14345 r_pgszc = rgnp->rgn_pgszc; 14346 r_saddr = rgnp->rgn_saddr; 14347 r_size = rgnp->rgn_size; 14348 r_eaddr = r_saddr + r_size; 14349 14350 ASSERT(r_type == SFMMU_REGION_HME); 14351 hatlockp = sfmmu_hat_enter(sfmmup); 14352 ASSERT(rid < srdp->srd_next_hmerid); 14353 SF_RGNMAP_DEL(sfmmup->sfmmu_hmeregion_map, rid); 14354 14355 /* 14356 * If region is part of an SCD call sfmmu_leave_scd(). 14357 * Otherwise if process is not exiting and has valid context 14358 * just drop the context on the floor to lose stale TLB 14359 * entries and force the update of tsb miss area to reflect 14360 * the new region map. After that clean our TSB entries. 14361 */ 14362 scdp = sfmmup->sfmmu_scdp; 14363 if (scdp != NULL && 14364 SF_RGNMAP_TEST(scdp->scd_hmeregion_map, rid)) { 14365 sfmmu_leave_scd(sfmmup, r_type); 14366 ASSERT(sfmmu_hat_lock_held(sfmmup)); 14367 } 14368 sfmmu_invalidate_ctx(sfmmup); 14369 14370 i = TTE8K; 14371 while (i < mmu_page_sizes) { 14372 if (rgnp->rgn_ttecnt[i] != 0) { 14373 sfmmu_unload_tsb_range(sfmmup, r_saddr, 14374 r_eaddr, i); 14375 if (i < TTE4M) { 14376 i = TTE4M; 14377 continue; 14378 } else { 14379 break; 14380 } 14381 } 14382 i++; 14383 } 14384 /* Remove the preallocated 1/4 8k ttecnt for 4M regions. 
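 * For example, with an 8K page shift of 13, leaving a 32MB region of 4M
 * pages removes 32MB >> (13 + 2) == 1024 entries from
 * sfmmu_tsb0_4minflcnt, matching what hat_join_region() added.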
*/ 14385 if (r_pgszc >= TTE4M) { 14386 rttecnt = r_size >> (TTE_PAGE_SHIFT(TTE8K) + 2); 14387 ASSERT(sfmmup->sfmmu_tsb0_4minflcnt >= 14388 rttecnt); 14389 sfmmup->sfmmu_tsb0_4minflcnt -= rttecnt; 14390 } 14391 14392 /* update shme rgns ttecnt in sfmmu_ttecnt */ 14393 rttecnt = r_size >> TTE_PAGE_SHIFT(r_pgszc); 14394 ASSERT(sfmmup->sfmmu_ttecnt[r_pgszc] >= rttecnt); 14395 atomic_add_long(&sfmmup->sfmmu_ttecnt[r_pgszc], -rttecnt); 14396 14397 sfmmu_hat_exit(hatlockp); 14398 if (scdp != NULL && sfmmup->sfmmu_scdp == NULL) { 14399 /* sfmmup left the scd, grow private tsb */ 14400 sfmmu_check_page_sizes(sfmmup, 1); 14401 } else { 14402 sfmmu_check_page_sizes(sfmmup, 0); 14403 } 14404 } 14405 14406 if (r_type == SFMMU_REGION_HME) { 14407 sfmmu_unlink_from_hmeregion(sfmmup, rgnp); 14408 } 14409 14410 r_obj = rgnp->rgn_obj; 14411 if (atomic_add_32_nv((volatile uint_t *)&rgnp->rgn_refcnt, -1)) { 14412 return; 14413 } 14414 14415 /* 14416 * looks like nobody uses this region anymore. Free it. 14417 */ 14418 rhash = RGN_HASH_FUNCTION(r_obj); 14419 mutex_enter(&srdp->srd_mutex); 14420 for (prev_rgnpp = &srdp->srd_rgnhash[rhash]; 14421 (cur_rgnp = *prev_rgnpp) != NULL; 14422 prev_rgnpp = &cur_rgnp->rgn_hash) { 14423 if (cur_rgnp == rgnp && cur_rgnp->rgn_refcnt == 0) { 14424 break; 14425 } 14426 } 14427 14428 if (cur_rgnp == NULL) { 14429 mutex_exit(&srdp->srd_mutex); 14430 return; 14431 } 14432 14433 ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == r_type); 14434 *prev_rgnpp = rgnp->rgn_hash; 14435 if (r_type == SFMMU_REGION_ISM) { 14436 rgnp->rgn_flags |= SFMMU_REGION_FREE; 14437 ASSERT(rid < srdp->srd_next_ismrid); 14438 rgnp->rgn_next = srdp->srd_ismrgnfree; 14439 srdp->srd_ismrgnfree = rgnp; 14440 ASSERT(srdp->srd_ismbusyrgns > 0); 14441 srdp->srd_ismbusyrgns--; 14442 mutex_exit(&srdp->srd_mutex); 14443 return; 14444 } 14445 mutex_exit(&srdp->srd_mutex); 14446 14447 /* 14448 * Destroy region's hmeblks. 14449 */ 14450 sfmmu_unload_hmeregion(srdp, rgnp); 14451 14452 rgnp->rgn_hmeflags = 0; 14453 14454 ASSERT(rgnp->rgn_sfmmu_head == NULL); 14455 ASSERT(rgnp->rgn_id == rid); 14456 for (i = 0; i < MMU_PAGE_SIZES; i++) { 14457 rgnp->rgn_ttecnt[i] = 0; 14458 } 14459 rgnp->rgn_flags |= SFMMU_REGION_FREE; 14460 mutex_enter(&srdp->srd_mutex); 14461 ASSERT(rid < srdp->srd_next_hmerid); 14462 rgnp->rgn_next = srdp->srd_hmergnfree; 14463 srdp->srd_hmergnfree = rgnp; 14464 ASSERT(srdp->srd_hmebusyrgns > 0); 14465 srdp->srd_hmebusyrgns--; 14466 mutex_exit(&srdp->srd_mutex); 14467 } 14468 14469 /* 14470 * For now only called for hmeblk regions and not for ISM regions. 
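 *
 * Illustrative use (the caller shown is an assumption, not quoted from
 * this file): when an address space is duplicated, the child hat is
 * attached to an hme region its parent already joined with
 *
 *	hat_dup_region(newhat, rcookie);
 *
 * where rcookie is the cookie the parent got back from hat_join_region().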
14471 */ 14472 void 14473 hat_dup_region(struct hat *sfmmup, hat_region_cookie_t rcookie) 14474 { 14475 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 14476 uint_t rid = (uint_t)((uint64_t)rcookie); 14477 sf_region_t *rgnp; 14478 sf_rgn_link_t *rlink; 14479 sf_rgn_link_t *hrlink; 14480 ulong_t rttecnt; 14481 14482 ASSERT(sfmmup != ksfmmup); 14483 ASSERT(srdp != NULL); 14484 ASSERT(srdp->srd_refcnt > 0); 14485 14486 ASSERT(rid < srdp->srd_next_hmerid); 14487 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 14488 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 14489 14490 rgnp = srdp->srd_hmergnp[rid]; 14491 ASSERT(rgnp->rgn_refcnt > 0); 14492 ASSERT(rgnp->rgn_id == rid); 14493 ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == SFMMU_REGION_HME); 14494 ASSERT(!(rgnp->rgn_flags & SFMMU_REGION_FREE)); 14495 14496 atomic_add_32((volatile uint_t *)&rgnp->rgn_refcnt, 1); 14497 14498 /* LINTED: constant in conditional context */ 14499 SFMMU_HMERID2RLINKP(sfmmup, rid, rlink, 1, 0); 14500 ASSERT(rlink != NULL); 14501 mutex_enter(&rgnp->rgn_mutex); 14502 ASSERT(rgnp->rgn_sfmmu_head != NULL); 14503 /* LINTED: constant in conditional context */ 14504 SFMMU_HMERID2RLINKP(rgnp->rgn_sfmmu_head, rid, hrlink, 0, 0); 14505 ASSERT(hrlink != NULL); 14506 ASSERT(hrlink->prev == NULL); 14507 rlink->next = rgnp->rgn_sfmmu_head; 14508 rlink->prev = NULL; 14509 hrlink->prev = sfmmup; 14510 /* 14511 * make sure rlink's next field is correct 14512 * before making this link visible. 14513 */ 14514 membar_stst(); 14515 rgnp->rgn_sfmmu_head = sfmmup; 14516 mutex_exit(&rgnp->rgn_mutex); 14517 14518 /* update sfmmu_ttecnt with the shme rgn ttecnt */ 14519 rttecnt = rgnp->rgn_size >> TTE_PAGE_SHIFT(rgnp->rgn_pgszc); 14520 atomic_add_long(&sfmmup->sfmmu_ttecnt[rgnp->rgn_pgszc], rttecnt); 14521 /* update tsb0 inflation count */ 14522 if (rgnp->rgn_pgszc >= TTE4M) { 14523 sfmmup->sfmmu_tsb0_4minflcnt += 14524 rgnp->rgn_size >> (TTE_PAGE_SHIFT(TTE8K) + 2); 14525 } 14526 /* 14527 * Update regionid bitmask without hat lock since no other thread 14528 * can update this region bitmask right now. 
14529 */ 14530 SF_RGNMAP_ADD(sfmmup->sfmmu_hmeregion_map, rid); 14531 } 14532 14533 /* ARGSUSED */ 14534 static int 14535 sfmmu_rgncache_constructor(void *buf, void *cdrarg, int kmflags) 14536 { 14537 sf_region_t *rgnp = (sf_region_t *)buf; 14538 bzero(buf, sizeof (*rgnp)); 14539 14540 mutex_init(&rgnp->rgn_mutex, NULL, MUTEX_DEFAULT, NULL); 14541 14542 return (0); 14543 } 14544 14545 /* ARGSUSED */ 14546 static void 14547 sfmmu_rgncache_destructor(void *buf, void *cdrarg) 14548 { 14549 sf_region_t *rgnp = (sf_region_t *)buf; 14550 mutex_destroy(&rgnp->rgn_mutex); 14551 } 14552 14553 static int 14554 sfrgnmap_isnull(sf_region_map_t *map) 14555 { 14556 int i; 14557 14558 for (i = 0; i < SFMMU_RGNMAP_WORDS; i++) { 14559 if (map->bitmap[i] != 0) { 14560 return (0); 14561 } 14562 } 14563 return (1); 14564 } 14565 14566 static int 14567 sfhmergnmap_isnull(sf_hmeregion_map_t *map) 14568 { 14569 int i; 14570 14571 for (i = 0; i < SFMMU_HMERGNMAP_WORDS; i++) { 14572 if (map->bitmap[i] != 0) { 14573 return (0); 14574 } 14575 } 14576 return (1); 14577 } 14578 14579 #ifdef DEBUG 14580 static void 14581 check_scd_sfmmu_list(sfmmu_t **headp, sfmmu_t *sfmmup, int onlist) 14582 { 14583 sfmmu_t *sp; 14584 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 14585 14586 for (sp = *headp; sp != NULL; sp = sp->sfmmu_scd_link.next) { 14587 ASSERT(srdp == sp->sfmmu_srdp); 14588 if (sp == sfmmup) { 14589 if (onlist) { 14590 return; 14591 } else { 14592 panic("shctx: sfmmu 0x%p found on scd" 14593 "list 0x%p", (void *)sfmmup, 14594 (void *)*headp); 14595 } 14596 } 14597 } 14598 if (onlist) { 14599 panic("shctx: sfmmu 0x%p not found on scd list 0x%p", 14600 (void *)sfmmup, (void *)*headp); 14601 } else { 14602 return; 14603 } 14604 } 14605 #else /* DEBUG */ 14606 #define check_scd_sfmmu_list(headp, sfmmup, onlist) 14607 #endif /* DEBUG */ 14608 14609 /* 14610 * Removes an sfmmu from the SCD sfmmu list. 14611 */ 14612 static void 14613 sfmmu_from_scd_list(sfmmu_t **headp, sfmmu_t *sfmmup) 14614 { 14615 ASSERT(sfmmup->sfmmu_srdp != NULL); 14616 check_scd_sfmmu_list(headp, sfmmup, 1); 14617 if (sfmmup->sfmmu_scd_link.prev != NULL) { 14618 ASSERT(*headp != sfmmup); 14619 sfmmup->sfmmu_scd_link.prev->sfmmu_scd_link.next = 14620 sfmmup->sfmmu_scd_link.next; 14621 } else { 14622 ASSERT(*headp == sfmmup); 14623 *headp = sfmmup->sfmmu_scd_link.next; 14624 } 14625 if (sfmmup->sfmmu_scd_link.next != NULL) { 14626 sfmmup->sfmmu_scd_link.next->sfmmu_scd_link.prev = 14627 sfmmup->sfmmu_scd_link.prev; 14628 } 14629 } 14630 14631 14632 /* 14633 * Adds an sfmmu to the start of the queue. 14634 */ 14635 static void 14636 sfmmu_to_scd_list(sfmmu_t **headp, sfmmu_t *sfmmup) 14637 { 14638 check_scd_sfmmu_list(headp, sfmmup, 0); 14639 sfmmup->sfmmu_scd_link.prev = NULL; 14640 sfmmup->sfmmu_scd_link.next = *headp; 14641 if (*headp != NULL) 14642 (*headp)->sfmmu_scd_link.prev = sfmmup; 14643 *headp = sfmmup; 14644 } 14645 14646 /* 14647 * Remove an scd from the start of the queue. 14648 */ 14649 static void 14650 sfmmu_remove_scd(sf_scd_t **headp, sf_scd_t *scdp) 14651 { 14652 if (scdp->scd_prev != NULL) { 14653 ASSERT(*headp != scdp); 14654 scdp->scd_prev->scd_next = scdp->scd_next; 14655 } else { 14656 ASSERT(*headp == scdp); 14657 *headp = scdp->scd_next; 14658 } 14659 14660 if (scdp->scd_next != NULL) { 14661 scdp->scd_next->scd_prev = scdp->scd_prev; 14662 } 14663 } 14664 14665 /* 14666 * Add an scd to the start of the queue. 
14667 */ 14668 static void 14669 sfmmu_add_scd(sf_scd_t **headp, sf_scd_t *scdp) 14670 { 14671 scdp->scd_prev = NULL; 14672 scdp->scd_next = *headp; 14673 if (*headp != NULL) { 14674 (*headp)->scd_prev = scdp; 14675 } 14676 *headp = scdp; 14677 } 14678 14679 static int 14680 sfmmu_alloc_scd_tsbs(sf_srd_t *srdp, sf_scd_t *scdp) 14681 { 14682 uint_t rid; 14683 uint_t i; 14684 uint_t j; 14685 ulong_t w; 14686 sf_region_t *rgnp; 14687 ulong_t tte8k_cnt = 0; 14688 ulong_t tte4m_cnt = 0; 14689 uint_t tsb_szc; 14690 sfmmu_t *scsfmmup = scdp->scd_sfmmup; 14691 sfmmu_t *ism_hatid; 14692 struct tsb_info *newtsb; 14693 int szc; 14694 14695 ASSERT(srdp != NULL); 14696 14697 for (i = 0; i < SFMMU_RGNMAP_WORDS; i++) { 14698 if ((w = scdp->scd_region_map.bitmap[i]) == 0) { 14699 continue; 14700 } 14701 j = 0; 14702 while (w) { 14703 if (!(w & 0x1)) { 14704 j++; 14705 w >>= 1; 14706 continue; 14707 } 14708 rid = (i << BT_ULSHIFT) | j; 14709 j++; 14710 w >>= 1; 14711 14712 if (rid < SFMMU_MAX_HME_REGIONS) { 14713 rgnp = srdp->srd_hmergnp[rid]; 14714 ASSERT(rgnp->rgn_id == rid); 14715 ASSERT(rgnp->rgn_refcnt > 0); 14716 14717 if (rgnp->rgn_pgszc < TTE4M) { 14718 tte8k_cnt += rgnp->rgn_size >> 14719 TTE_PAGE_SHIFT(TTE8K); 14720 } else { 14721 ASSERT(rgnp->rgn_pgszc >= TTE4M); 14722 tte4m_cnt += rgnp->rgn_size >> 14723 TTE_PAGE_SHIFT(TTE4M); 14724 /* 14725 * Inflate SCD tsb0 by preallocating 14726 * 1/4 8k ttecnt for 4M regions to 14727 * allow for lgpg alloc failure. 14728 */ 14729 tte8k_cnt += rgnp->rgn_size >> 14730 (TTE_PAGE_SHIFT(TTE8K) + 2); 14731 } 14732 } else { 14733 rid -= SFMMU_MAX_HME_REGIONS; 14734 rgnp = srdp->srd_ismrgnp[rid]; 14735 ASSERT(rgnp->rgn_id == rid); 14736 ASSERT(rgnp->rgn_refcnt > 0); 14737 14738 ism_hatid = (sfmmu_t *)rgnp->rgn_obj; 14739 ASSERT(ism_hatid->sfmmu_ismhat); 14740 14741 for (szc = 0; szc < TTE4M; szc++) { 14742 tte8k_cnt += 14743 ism_hatid->sfmmu_ttecnt[szc] << 14744 TTE_BSZS_SHIFT(szc); 14745 } 14746 14747 ASSERT(rgnp->rgn_pgszc >= TTE4M); 14748 if (rgnp->rgn_pgszc >= TTE4M) { 14749 tte4m_cnt += rgnp->rgn_size >> 14750 TTE_PAGE_SHIFT(TTE4M); 14751 } 14752 } 14753 } 14754 } 14755 14756 tsb_szc = SELECT_TSB_SIZECODE(tte8k_cnt); 14757 14758 /* Allocate both the SCD TSBs here. */ 14759 if (sfmmu_tsbinfo_alloc(&scsfmmup->sfmmu_tsb, 14760 tsb_szc, TSB8K|TSB64K|TSB512K, TSB_ALLOC, scsfmmup) && 14761 (tsb_szc <= TSB_4M_SZCODE || 14762 sfmmu_tsbinfo_alloc(&scsfmmup->sfmmu_tsb, 14763 TSB_4M_SZCODE, TSB8K|TSB64K|TSB512K, 14764 TSB_ALLOC, scsfmmup))) { 14765 14766 SFMMU_STAT(sf_scd_1sttsb_allocfail); 14767 return (TSB_ALLOCFAIL); 14768 } else { 14769 scsfmmup->sfmmu_tsb->tsb_flags |= TSB_SHAREDCTX; 14770 14771 if (tte4m_cnt) { 14772 tsb_szc = SELECT_TSB_SIZECODE(tte4m_cnt); 14773 if (sfmmu_tsbinfo_alloc(&newtsb, tsb_szc, 14774 TSB4M|TSB32M|TSB256M, TSB_ALLOC, scsfmmup) && 14775 (tsb_szc <= TSB_4M_SZCODE || 14776 sfmmu_tsbinfo_alloc(&newtsb, TSB_4M_SZCODE, 14777 TSB4M|TSB32M|TSB256M, 14778 TSB_ALLOC, scsfmmup))) { 14779 /* 14780 * If we fail to allocate the 2nd shared tsb, 14781 * just free the 1st tsb, return failure. 
14782 */ 14783 sfmmu_tsbinfo_free(scsfmmup->sfmmu_tsb); 14784 SFMMU_STAT(sf_scd_2ndtsb_allocfail); 14785 return (TSB_ALLOCFAIL); 14786 } else { 14787 ASSERT(scsfmmup->sfmmu_tsb->tsb_next == NULL); 14788 newtsb->tsb_flags |= TSB_SHAREDCTX; 14789 scsfmmup->sfmmu_tsb->tsb_next = newtsb; 14790 SFMMU_STAT(sf_scd_2ndtsb_alloc); 14791 } 14792 } 14793 SFMMU_STAT(sf_scd_1sttsb_alloc); 14794 } 14795 return (TSB_SUCCESS); 14796 } 14797 14798 static void 14799 sfmmu_free_scd_tsbs(sfmmu_t *scd_sfmmu) 14800 { 14801 while (scd_sfmmu->sfmmu_tsb != NULL) { 14802 struct tsb_info *next = scd_sfmmu->sfmmu_tsb->tsb_next; 14803 sfmmu_tsbinfo_free(scd_sfmmu->sfmmu_tsb); 14804 scd_sfmmu->sfmmu_tsb = next; 14805 } 14806 } 14807 14808 /* 14809 * Link the sfmmu onto the hme region list. 14810 */ 14811 void 14812 sfmmu_link_to_hmeregion(sfmmu_t *sfmmup, sf_region_t *rgnp) 14813 { 14814 uint_t rid; 14815 sf_rgn_link_t *rlink; 14816 sfmmu_t *head; 14817 sf_rgn_link_t *hrlink; 14818 14819 rid = rgnp->rgn_id; 14820 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 14821 14822 /* LINTED: constant in conditional context */ 14823 SFMMU_HMERID2RLINKP(sfmmup, rid, rlink, 1, 1); 14824 ASSERT(rlink != NULL); 14825 mutex_enter(&rgnp->rgn_mutex); 14826 if ((head = rgnp->rgn_sfmmu_head) == NULL) { 14827 rlink->next = NULL; 14828 rlink->prev = NULL; 14829 /* 14830 * make sure rlink's next field is NULL 14831 * before making this link visible. 14832 */ 14833 membar_stst(); 14834 rgnp->rgn_sfmmu_head = sfmmup; 14835 } else { 14836 /* LINTED: constant in conditional context */ 14837 SFMMU_HMERID2RLINKP(head, rid, hrlink, 0, 0); 14838 ASSERT(hrlink != NULL); 14839 ASSERT(hrlink->prev == NULL); 14840 rlink->next = head; 14841 rlink->prev = NULL; 14842 hrlink->prev = sfmmup; 14843 /* 14844 * make sure rlink's next field is correct 14845 * before making this link visible. 14846 */ 14847 membar_stst(); 14848 rgnp->rgn_sfmmu_head = sfmmup; 14849 } 14850 mutex_exit(&rgnp->rgn_mutex); 14851 } 14852 14853 /* 14854 * Unlink the sfmmu from the hme region list. 14855 */ 14856 void 14857 sfmmu_unlink_from_hmeregion(sfmmu_t *sfmmup, sf_region_t *rgnp) 14858 { 14859 uint_t rid; 14860 sf_rgn_link_t *rlink; 14861 14862 rid = rgnp->rgn_id; 14863 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 14864 14865 /* LINTED: constant in conditional context */ 14866 SFMMU_HMERID2RLINKP(sfmmup, rid, rlink, 0, 0); 14867 ASSERT(rlink != NULL); 14868 mutex_enter(&rgnp->rgn_mutex); 14869 if (rgnp->rgn_sfmmu_head == sfmmup) { 14870 sfmmu_t *next = rlink->next; 14871 rgnp->rgn_sfmmu_head = next; 14872 /* 14873 * if we are stopped by xc_attention() after this 14874 * point the forward link walking in 14875 * sfmmu_rgntlb_demap() will work correctly since the 14876 * head correctly points to the next element. 
14877 */ 14878 membar_stst(); 14879 rlink->next = NULL; 14880 ASSERT(rlink->prev == NULL); 14881 if (next != NULL) { 14882 sf_rgn_link_t *nrlink; 14883 /* LINTED: constant in conditional context */ 14884 SFMMU_HMERID2RLINKP(next, rid, nrlink, 0, 0); 14885 ASSERT(nrlink != NULL); 14886 ASSERT(nrlink->prev == sfmmup); 14887 nrlink->prev = NULL; 14888 } 14889 } else { 14890 sfmmu_t *next = rlink->next; 14891 sfmmu_t *prev = rlink->prev; 14892 sf_rgn_link_t *prlink; 14893 14894 ASSERT(prev != NULL); 14895 /* LINTED: constant in conditional context */ 14896 SFMMU_HMERID2RLINKP(prev, rid, prlink, 0, 0); 14897 ASSERT(prlink != NULL); 14898 ASSERT(prlink->next == sfmmup); 14899 prlink->next = next; 14900 /* 14901 * if we are stopped by xc_attention() 14902 * after this point the forward link walking 14903 * will work correctly since the prev element 14904 * correctly points to the next element. 14905 */ 14906 membar_stst(); 14907 rlink->next = NULL; 14908 rlink->prev = NULL; 14909 if (next != NULL) { 14910 sf_rgn_link_t *nrlink; 14911 /* LINTED: constant in conditional context */ 14912 SFMMU_HMERID2RLINKP(next, rid, nrlink, 0, 0); 14913 ASSERT(nrlink != NULL); 14914 ASSERT(nrlink->prev == sfmmup); 14915 nrlink->prev = prev; 14916 } 14917 } 14918 mutex_exit(&rgnp->rgn_mutex); 14919 } 14920 14921 /* 14922 * Link scd sfmmu onto ism or hme region list for each region in the 14923 * scd region map. 14924 */ 14925 void 14926 sfmmu_link_scd_to_regions(sf_srd_t *srdp, sf_scd_t *scdp) 14927 { 14928 uint_t rid; 14929 uint_t i; 14930 uint_t j; 14931 ulong_t w; 14932 sf_region_t *rgnp; 14933 sfmmu_t *scsfmmup; 14934 14935 scsfmmup = scdp->scd_sfmmup; 14936 ASSERT(scsfmmup->sfmmu_scdhat); 14937 for (i = 0; i < SFMMU_RGNMAP_WORDS; i++) { 14938 if ((w = scdp->scd_region_map.bitmap[i]) == 0) { 14939 continue; 14940 } 14941 j = 0; 14942 while (w) { 14943 if (!(w & 0x1)) { 14944 j++; 14945 w >>= 1; 14946 continue; 14947 } 14948 rid = (i << BT_ULSHIFT) | j; 14949 j++; 14950 w >>= 1; 14951 14952 if (rid < SFMMU_MAX_HME_REGIONS) { 14953 rgnp = srdp->srd_hmergnp[rid]; 14954 ASSERT(rgnp->rgn_id == rid); 14955 ASSERT(rgnp->rgn_refcnt > 0); 14956 sfmmu_link_to_hmeregion(scsfmmup, rgnp); 14957 } else { 14958 sfmmu_t *ism_hatid = NULL; 14959 ism_ment_t *ism_ment; 14960 rid -= SFMMU_MAX_HME_REGIONS; 14961 rgnp = srdp->srd_ismrgnp[rid]; 14962 ASSERT(rgnp->rgn_id == rid); 14963 ASSERT(rgnp->rgn_refcnt > 0); 14964 14965 ism_hatid = (sfmmu_t *)rgnp->rgn_obj; 14966 ASSERT(ism_hatid->sfmmu_ismhat); 14967 ism_ment = &scdp->scd_ism_links[rid]; 14968 ism_ment->iment_hat = scsfmmup; 14969 ism_ment->iment_base_va = rgnp->rgn_saddr; 14970 mutex_enter(&ism_mlist_lock); 14971 iment_add(ism_ment, ism_hatid); 14972 mutex_exit(&ism_mlist_lock); 14973 14974 } 14975 } 14976 } 14977 } 14978 /* 14979 * Unlink scd sfmmu from ism or hme region list for each region in the 14980 * scd region map. 
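 *
 * The bitmap walk below mirrors the one in sfmmu_link_scd_to_regions():
 * with 64-bit bitmap words (BT_ULSHIFT == 6), a set bit j in word i of
 * scd_region_map corresponds to region id
 *
 *	rid = (i << BT_ULSHIFT) | j
 *
 * e.g. bit 3 of word 1 yields rid 67. Ids below SFMMU_MAX_HME_REGIONS name
 * hme regions; anything else is an ism region once SFMMU_MAX_HME_REGIONS
 * has been subtracted from it.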
14981 */ 14982 void 14983 sfmmu_unlink_scd_from_regions(sf_srd_t *srdp, sf_scd_t *scdp) 14984 { 14985 uint_t rid; 14986 uint_t i; 14987 uint_t j; 14988 ulong_t w; 14989 sf_region_t *rgnp; 14990 sfmmu_t *scsfmmup; 14991 14992 scsfmmup = scdp->scd_sfmmup; 14993 for (i = 0; i < SFMMU_RGNMAP_WORDS; i++) { 14994 if ((w = scdp->scd_region_map.bitmap[i]) == 0) { 14995 continue; 14996 } 14997 j = 0; 14998 while (w) { 14999 if (!(w & 0x1)) { 15000 j++; 15001 w >>= 1; 15002 continue; 15003 } 15004 rid = (i << BT_ULSHIFT) | j; 15005 j++; 15006 w >>= 1; 15007 15008 if (rid < SFMMU_MAX_HME_REGIONS) { 15009 rgnp = srdp->srd_hmergnp[rid]; 15010 ASSERT(rgnp->rgn_id == rid); 15011 ASSERT(rgnp->rgn_refcnt > 0); 15012 sfmmu_unlink_from_hmeregion(scsfmmup, 15013 rgnp); 15014 15015 } else { 15016 sfmmu_t *ism_hatid = NULL; 15017 ism_ment_t *ism_ment; 15018 rid -= SFMMU_MAX_HME_REGIONS; 15019 rgnp = srdp->srd_ismrgnp[rid]; 15020 ASSERT(rgnp->rgn_id == rid); 15021 ASSERT(rgnp->rgn_refcnt > 0); 15022 15023 ism_hatid = (sfmmu_t *)rgnp->rgn_obj; 15024 ASSERT(ism_hatid->sfmmu_ismhat); 15025 ism_ment = &scdp->scd_ism_links[rid]; 15026 ASSERT(ism_ment->iment_hat == scdp->scd_sfmmup); 15027 ASSERT(ism_ment->iment_base_va == 15028 rgnp->rgn_saddr); 15029 ism_ment->iment_hat = NULL; 15030 ism_ment->iment_base_va = 0; 15031 mutex_enter(&ism_mlist_lock); 15032 iment_sub(ism_ment, ism_hatid); 15033 mutex_exit(&ism_mlist_lock); 15034 15035 } 15036 } 15037 } 15038 } 15039 /* 15040 * Allocates and initialises a new SCD structure, this is called with 15041 * the srd_scd_mutex held and returns with the reference count 15042 * initialised to 1. 15043 */ 15044 static sf_scd_t * 15045 sfmmu_alloc_scd(sf_srd_t *srdp, sf_region_map_t *new_map) 15046 { 15047 sf_scd_t *new_scdp; 15048 sfmmu_t *scsfmmup; 15049 int i; 15050 15051 ASSERT(MUTEX_HELD(&srdp->srd_scd_mutex)); 15052 new_scdp = kmem_cache_alloc(scd_cache, KM_SLEEP); 15053 15054 scsfmmup = kmem_cache_alloc(sfmmuid_cache, KM_SLEEP); 15055 new_scdp->scd_sfmmup = scsfmmup; 15056 scsfmmup->sfmmu_srdp = srdp; 15057 scsfmmup->sfmmu_scdp = new_scdp; 15058 scsfmmup->sfmmu_tsb0_4minflcnt = 0; 15059 scsfmmup->sfmmu_scdhat = 1; 15060 CPUSET_ALL(scsfmmup->sfmmu_cpusran); 15061 bzero(scsfmmup->sfmmu_hmeregion_links, SFMMU_L1_HMERLINKS_SIZE); 15062 15063 ASSERT(max_mmu_ctxdoms > 0); 15064 for (i = 0; i < max_mmu_ctxdoms; i++) { 15065 scsfmmup->sfmmu_ctxs[i].cnum = INVALID_CONTEXT; 15066 scsfmmup->sfmmu_ctxs[i].gnum = 0; 15067 } 15068 15069 for (i = 0; i < MMU_PAGE_SIZES; i++) { 15070 new_scdp->scd_rttecnt[i] = 0; 15071 } 15072 15073 new_scdp->scd_region_map = *new_map; 15074 new_scdp->scd_refcnt = 1; 15075 if (sfmmu_alloc_scd_tsbs(srdp, new_scdp) != TSB_SUCCESS) { 15076 kmem_cache_free(scd_cache, new_scdp); 15077 kmem_cache_free(sfmmuid_cache, scsfmmup); 15078 return (NULL); 15079 } 15080 if (&mmu_init_scd) { 15081 mmu_init_scd(new_scdp); 15082 } 15083 return (new_scdp); 15084 } 15085 15086 /* 15087 * The first phase of a process joining an SCD. The hat structure is 15088 * linked to the SCD queue and then the HAT_JOIN_SCD sfmmu flag is set 15089 * and a cross-call with context invalidation is used to cause the 15090 * remaining work to be carried out in the sfmmu_tsbmiss_exception() 15091 * routine. 
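 *
 * A sketch of the overall flow (all of the routines named here are the
 * ones described in the surrounding comments):
 *
 *	hat_join_region(..., HAT_REGION_ISM)
 *	    sfmmu_find_scd(sfmmup)
 *		sfmmu_join_scd(scdp, sfmmup)	links to scd_sf_list, sets
 *						HAT_JOIN_SCD, invalidates ctx
 *	...next TSB miss...
 *	    sfmmu_tsbmiss_exception()
 *		sfmmu_finish_join_scd(sfmmup)	invalidates the private TSBs
 *						and sets HAT_CTX1_FLAG on the
 *						SCD's ISM maps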
15092 */ 15093 static void 15094 sfmmu_join_scd(sf_scd_t *scdp, sfmmu_t *sfmmup) 15095 { 15096 hatlock_t *hatlockp; 15097 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 15098 int i; 15099 sf_scd_t *old_scdp; 15100 15101 ASSERT(srdp != NULL); 15102 ASSERT(scdp != NULL); 15103 ASSERT(scdp->scd_refcnt > 0); 15104 ASSERT(AS_WRITE_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 15105 15106 if ((old_scdp = sfmmup->sfmmu_scdp) != NULL) { 15107 ASSERT(old_scdp != scdp); 15108 15109 mutex_enter(&old_scdp->scd_mutex); 15110 sfmmu_from_scd_list(&old_scdp->scd_sf_list, sfmmup); 15111 mutex_exit(&old_scdp->scd_mutex); 15112 /* 15113 * sfmmup leaves the old scd. Update sfmmu_ttecnt to 15114 * include the shme rgn ttecnt for rgns that 15115 * were in the old SCD 15116 */ 15117 for (i = 0; i < mmu_page_sizes; i++) { 15118 ASSERT(sfmmup->sfmmu_scdrttecnt[i] == 15119 old_scdp->scd_rttecnt[i]); 15120 atomic_add_long(&sfmmup->sfmmu_ttecnt[i], 15121 sfmmup->sfmmu_scdrttecnt[i]); 15122 } 15123 } 15124 15125 /* 15126 * Move sfmmu to the scd lists. 15127 */ 15128 mutex_enter(&scdp->scd_mutex); 15129 sfmmu_to_scd_list(&scdp->scd_sf_list, sfmmup); 15130 mutex_exit(&scdp->scd_mutex); 15131 SF_SCD_INCR_REF(scdp); 15132 15133 hatlockp = sfmmu_hat_enter(sfmmup); 15134 /* 15135 * For a multi-thread process, we must stop 15136 * all the other threads before joining the scd. 15137 */ 15138 15139 SFMMU_FLAGS_SET(sfmmup, HAT_JOIN_SCD); 15140 15141 sfmmu_invalidate_ctx(sfmmup); 15142 sfmmup->sfmmu_scdp = scdp; 15143 15144 /* 15145 * Copy scd_rttecnt into sfmmup's sfmmu_scdrttecnt, and update 15146 * sfmmu_ttecnt to not include the rgn ttecnt just joined in SCD. 15147 */ 15148 for (i = 0; i < mmu_page_sizes; i++) { 15149 sfmmup->sfmmu_scdrttecnt[i] = scdp->scd_rttecnt[i]; 15150 ASSERT(sfmmup->sfmmu_ttecnt[i] >= scdp->scd_rttecnt[i]); 15151 atomic_add_long(&sfmmup->sfmmu_ttecnt[i], 15152 -sfmmup->sfmmu_scdrttecnt[i]); 15153 } 15154 /* update tsb0 inflation count */ 15155 if (old_scdp != NULL) { 15156 sfmmup->sfmmu_tsb0_4minflcnt += 15157 old_scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt; 15158 } 15159 ASSERT(sfmmup->sfmmu_tsb0_4minflcnt >= 15160 scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt); 15161 sfmmup->sfmmu_tsb0_4minflcnt -= scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt; 15162 15163 sfmmu_hat_exit(hatlockp); 15164 15165 if (old_scdp != NULL) { 15166 SF_SCD_DECR_REF(srdp, old_scdp); 15167 } 15168 15169 } 15170 15171 /* 15172 * This routine is called by a process to become part of an SCD. It is called 15173 * from sfmmu_tsbmiss_exception() once most of the initial work has been 15174 * done by sfmmu_join_scd(). This routine must not drop the hat lock. 
*/ 15176 static void 15177 sfmmu_finish_join_scd(sfmmu_t *sfmmup) 15178 { 15179 struct tsb_info *tsbinfop; 15180 15181 ASSERT(sfmmu_hat_lock_held(sfmmup)); 15182 ASSERT(sfmmup->sfmmu_scdp != NULL); 15183 ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD)); 15184 ASSERT(!SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)); 15185 ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_ALLCTX_INVALID)); 15186 15187 for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL; 15188 tsbinfop = tsbinfop->tsb_next) { 15189 if (tsbinfop->tsb_flags & TSB_SWAPPED) { 15190 continue; 15191 } 15192 ASSERT(!(tsbinfop->tsb_flags & TSB_RELOC_FLAG)); 15193 15194 sfmmu_inv_tsb(tsbinfop->tsb_va, 15195 TSB_BYTES(tsbinfop->tsb_szc)); 15196 } 15197 15198 /* Set HAT_CTX1_FLAG for all SCD ISMs */ 15199 sfmmu_ism_hatflags(sfmmup, 1); 15200 15201 SFMMU_STAT(sf_join_scd); 15202 } 15203 15204 /* 15205 * This routine is called in order to check if there is an SCD which matches 15206 * the process's region map; if there is not, a new SCD may be created. 15207 */ 15208 static void 15209 sfmmu_find_scd(sfmmu_t *sfmmup) 15210 { 15211 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 15212 sf_scd_t *scdp, *new_scdp; 15213 int ret; 15214 15215 ASSERT(srdp != NULL); 15216 ASSERT(AS_WRITE_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 15217 15218 mutex_enter(&srdp->srd_scd_mutex); 15219 for (scdp = srdp->srd_scdp; scdp != NULL; 15220 scdp = scdp->scd_next) { 15221 SF_RGNMAP_EQUAL(&scdp->scd_region_map, 15222 &sfmmup->sfmmu_region_map, ret); 15223 if (ret == 1) { 15224 SF_SCD_INCR_REF(scdp); 15225 mutex_exit(&srdp->srd_scd_mutex); 15226 sfmmu_join_scd(scdp, sfmmup); 15227 ASSERT(scdp->scd_refcnt >= 2); 15228 atomic_add_32((volatile uint32_t *) 15229 &scdp->scd_refcnt, -1); 15230 return; 15231 } else { 15232 /* 15233 * If the sfmmu region map is a subset of the scd 15234 * region map, then the assumption is that this process 15235 * will continue attaching to ISM segments until the 15236 * region maps are equal. 15237 */ 15238 SF_RGNMAP_IS_SUBSET(&scdp->scd_region_map, 15239 &sfmmup->sfmmu_region_map, ret); 15240 if (ret == 1) { 15241 mutex_exit(&srdp->srd_scd_mutex); 15242 return; 15243 } 15244 } 15245 } 15246 15247 ASSERT(scdp == NULL); 15248 /* 15249 * No matching SCD has been found; create a new one. 15250 */ 15251 if ((new_scdp = sfmmu_alloc_scd(srdp, &sfmmup->sfmmu_region_map)) == 15252 NULL) { 15253 mutex_exit(&srdp->srd_scd_mutex); 15254 return; 15255 } 15256 15257 /* 15258 * sfmmu_alloc_scd() returns with a ref count of 1 on the scd. 15259 */ 15260 15261 /* Set scd_rttecnt for shme rgns in SCD */ 15262 sfmmu_set_scd_rttecnt(srdp, new_scdp); 15263 15264 /* 15265 * Link scd onto srd_scdp list and scd sfmmu onto region/iment lists. 15266 */ 15267 sfmmu_link_scd_to_regions(srdp, new_scdp); 15268 sfmmu_add_scd(&srdp->srd_scdp, new_scdp); 15269 SFMMU_STAT_ADD(sf_create_scd, 1); 15270 15271 mutex_exit(&srdp->srd_scd_mutex); 15272 sfmmu_join_scd(new_scdp, sfmmup); 15273 ASSERT(new_scdp->scd_refcnt >= 2); 15274 atomic_add_32((volatile uint32_t *)&new_scdp->scd_refcnt, -1); 15275 } 15276 15277 /* 15278 * This routine is called by a process to remove itself from an SCD. It is 15279 * called either when the process has detached from a segment or from 15280 * hat_free_start() as a result of calling exit.
15281 */ 15282 static void 15283 sfmmu_leave_scd(sfmmu_t *sfmmup, uchar_t r_type) 15284 { 15285 sf_scd_t *scdp = sfmmup->sfmmu_scdp; 15286 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 15287 hatlock_t *hatlockp = TSB_HASH(sfmmup); 15288 int i; 15289 15290 ASSERT(scdp != NULL); 15291 ASSERT(srdp != NULL); 15292 15293 if (sfmmup->sfmmu_free) { 15294 /* 15295 * If the process is part of an SCD the sfmmu is unlinked 15296 * from scd_sf_list. 15297 */ 15298 mutex_enter(&scdp->scd_mutex); 15299 sfmmu_from_scd_list(&scdp->scd_sf_list, sfmmup); 15300 mutex_exit(&scdp->scd_mutex); 15301 /* 15302 * Update sfmmu_ttecnt to include the rgn ttecnt for rgns that 15303 * are about to leave the SCD 15304 */ 15305 for (i = 0; i < mmu_page_sizes; i++) { 15306 ASSERT(sfmmup->sfmmu_scdrttecnt[i] == 15307 scdp->scd_rttecnt[i]); 15308 atomic_add_long(&sfmmup->sfmmu_ttecnt[i], 15309 sfmmup->sfmmu_scdrttecnt[i]); 15310 sfmmup->sfmmu_scdrttecnt[i] = 0; 15311 } 15312 sfmmup->sfmmu_scdp = NULL; 15313 15314 SF_SCD_DECR_REF(srdp, scdp); 15315 return; 15316 } 15317 15318 ASSERT(r_type != SFMMU_REGION_ISM || 15319 SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)); 15320 ASSERT(scdp->scd_refcnt); 15321 ASSERT(!sfmmup->sfmmu_free); 15322 ASSERT(sfmmu_hat_lock_held(sfmmup)); 15323 ASSERT(AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 15324 15325 /* 15326 * Wait for ISM maps to be updated. 15327 */ 15328 if (r_type != SFMMU_REGION_ISM) { 15329 while (SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY) && 15330 sfmmup->sfmmu_scdp != NULL) { 15331 cv_wait(&sfmmup->sfmmu_tsb_cv, 15332 HATLOCK_MUTEXP(hatlockp)); 15333 } 15334 15335 if (sfmmup->sfmmu_scdp == NULL) { 15336 sfmmu_hat_exit(hatlockp); 15337 return; 15338 } 15339 SFMMU_FLAGS_SET(sfmmup, HAT_ISMBUSY); 15340 } 15341 15342 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD)) { 15343 SFMMU_FLAGS_CLEAR(sfmmup, HAT_JOIN_SCD); 15344 /* 15345 * Since HAT_JOIN_SCD was set our context 15346 * is still invalid. 15347 */ 15348 } else { 15349 /* 15350 * For a multi-thread process, we must stop 15351 * all the other threads before leaving the scd. 15352 */ 15353 15354 sfmmu_invalidate_ctx(sfmmup); 15355 } 15356 15357 /* Clear all the rid's for ISM, delete flags, etc */ 15358 ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)); 15359 sfmmu_ism_hatflags(sfmmup, 0); 15360 15361 /* 15362 * Update sfmmu_ttecnt to include the rgn ttecnt for rgns that 15363 * are in SCD before this sfmmup leaves the SCD. 15364 */ 15365 for (i = 0; i < mmu_page_sizes; i++) { 15366 ASSERT(sfmmup->sfmmu_scdrttecnt[i] == 15367 scdp->scd_rttecnt[i]); 15368 atomic_add_long(&sfmmup->sfmmu_ttecnt[i], 15369 sfmmup->sfmmu_scdrttecnt[i]); 15370 sfmmup->sfmmu_scdrttecnt[i] = 0; 15371 /* update ismttecnt to include SCD ism before hat leaves SCD */ 15372 sfmmup->sfmmu_ismttecnt[i] += sfmmup->sfmmu_scdismttecnt[i]; 15373 sfmmup->sfmmu_scdismttecnt[i] = 0; 15374 } 15375 /* update tsb0 inflation count */ 15376 sfmmup->sfmmu_tsb0_4minflcnt += scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt; 15377 15378 if (r_type != SFMMU_REGION_ISM) { 15379 SFMMU_FLAGS_CLEAR(sfmmup, HAT_ISMBUSY); 15380 } 15381 sfmmup->sfmmu_scdp = NULL; 15382 15383 sfmmu_hat_exit(hatlockp); 15384 15385 /* 15386 * Unlink sfmmu from scd_sf_list this can be done without holding 15387 * the hat lock as we hold the sfmmu_as lock which prevents 15388 * hat_join_region from adding this thread to the scd again. Other 15389 * threads check if sfmmu_scdp is NULL under hat lock and if it's NULL 15390 * they won't get here, since sfmmu_leave_scd() clears sfmmu_scdp 15391 * while holding the hat lock. 
15392 */ 15393 mutex_enter(&scdp->scd_mutex); 15394 sfmmu_from_scd_list(&scdp->scd_sf_list, sfmmup); 15395 mutex_exit(&scdp->scd_mutex); 15396 SFMMU_STAT(sf_leave_scd); 15397 15398 SF_SCD_DECR_REF(srdp, scdp); 15399 hatlockp = sfmmu_hat_enter(sfmmup); 15400 15401 } 15402 15403 /* 15404 * Unlink and free up an SCD structure with a reference count of 0. 15405 */ 15406 static void 15407 sfmmu_destroy_scd(sf_srd_t *srdp, sf_scd_t *scdp, sf_region_map_t *scd_rmap) 15408 { 15409 sfmmu_t *scsfmmup; 15410 sf_scd_t *sp; 15411 hatlock_t *shatlockp; 15412 int i, ret; 15413 15414 mutex_enter(&srdp->srd_scd_mutex); 15415 for (sp = srdp->srd_scdp; sp != NULL; sp = sp->scd_next) { 15416 if (sp == scdp) 15417 break; 15418 } 15419 if (sp == NULL || sp->scd_refcnt) { 15420 mutex_exit(&srdp->srd_scd_mutex); 15421 return; 15422 } 15423 15424 /* 15425 * It is possible that the scd has been freed and reallocated with a 15426 * different region map while we've been waiting for the srd_scd_mutex. 15427 */ 15428 SF_RGNMAP_EQUAL(scd_rmap, &sp->scd_region_map, ret); 15429 if (ret != 1) { 15430 mutex_exit(&srdp->srd_scd_mutex); 15431 return; 15432 } 15433 15434 ASSERT(scdp->scd_sf_list == NULL); 15435 /* 15436 * Unlink scd from srd_scdp list. 15437 */ 15438 sfmmu_remove_scd(&srdp->srd_scdp, scdp); 15439 mutex_exit(&srdp->srd_scd_mutex); 15440 15441 sfmmu_unlink_scd_from_regions(srdp, scdp); 15442 15443 /* Clear shared context tsb and release ctx */ 15444 scsfmmup = scdp->scd_sfmmup; 15445 15446 /* 15447 * create a barrier so that scd will not be destroyed 15448 * if other thread still holds the same shared hat lock. 15449 * E.g., sfmmu_tsbmiss_exception() needs to acquire the 15450 * shared hat lock before checking the shared tsb reloc flag. 15451 */ 15452 shatlockp = sfmmu_hat_enter(scsfmmup); 15453 sfmmu_hat_exit(shatlockp); 15454 15455 sfmmu_free_scd_tsbs(scsfmmup); 15456 15457 for (i = 0; i < SFMMU_L1_HMERLINKS; i++) { 15458 if (scsfmmup->sfmmu_hmeregion_links[i] != NULL) { 15459 kmem_free(scsfmmup->sfmmu_hmeregion_links[i], 15460 SFMMU_L2_HMERLINKS_SIZE); 15461 scsfmmup->sfmmu_hmeregion_links[i] = NULL; 15462 } 15463 } 15464 kmem_cache_free(sfmmuid_cache, scsfmmup); 15465 kmem_cache_free(scd_cache, scdp); 15466 SFMMU_STAT(sf_destroy_scd); 15467 } 15468 15469 /* 15470 * Modifies the HAT_CTX1_FLAG for each of the ISM segments which correspond to 15471 * bits which are set in the ism_region_map parameter. This flag indicates to 15472 * the tsbmiss handler that mapping for these segments should be loaded using 15473 * the shared context. 15474 */ 15475 static void 15476 sfmmu_ism_hatflags(sfmmu_t *sfmmup, int addflag) 15477 { 15478 sf_scd_t *scdp = sfmmup->sfmmu_scdp; 15479 ism_blk_t *ism_blkp; 15480 ism_map_t *ism_map; 15481 int i, rid; 15482 15483 ASSERT(sfmmup->sfmmu_iblk != NULL); 15484 ASSERT(scdp != NULL); 15485 /* 15486 * Note that the caller either set HAT_ISMBUSY flag or checked 15487 * under hat lock that HAT_ISMBUSY was not set by another thread. 
*/ 15489 ASSERT(sfmmu_hat_lock_held(sfmmup)); 15490 15491 ism_blkp = sfmmup->sfmmu_iblk; 15492 while (ism_blkp != NULL) { 15493 ism_map = ism_blkp->iblk_maps; 15494 for (i = 0; i < ISM_MAP_SLOTS && ism_map[i].imap_ismhat; i++) { 15495 rid = ism_map[i].imap_rid; 15496 if (rid == SFMMU_INVALID_ISMRID) { 15497 continue; 15498 } 15499 ASSERT(rid >= 0 && rid < SFMMU_MAX_ISM_REGIONS); 15500 if (SF_RGNMAP_TEST(scdp->scd_ismregion_map, rid) && 15501 addflag) { 15502 ism_map[i].imap_hatflags |= 15503 HAT_CTX1_FLAG; 15504 } else { 15505 ism_map[i].imap_hatflags &= 15506 ~HAT_CTX1_FLAG; 15507 } 15508 } 15509 ism_blkp = ism_blkp->iblk_next; 15510 } 15511 } 15512 15513 static int 15514 sfmmu_srd_lock_held(sf_srd_t *srdp) 15515 { 15516 return (MUTEX_HELD(&srdp->srd_mutex)); 15517 } 15518 15519 /* ARGSUSED */ 15520 static int 15521 sfmmu_scdcache_constructor(void *buf, void *cdrarg, int kmflags) 15522 { 15523 sf_scd_t *scdp = (sf_scd_t *)buf; 15524 15525 bzero(buf, sizeof (sf_scd_t)); 15526 mutex_init(&scdp->scd_mutex, NULL, MUTEX_DEFAULT, NULL); 15527 return (0); 15528 } 15529 15530 /* ARGSUSED */ 15531 static void 15532 sfmmu_scdcache_destructor(void *buf, void *cdrarg) 15533 { 15534 sf_scd_t *scdp = (sf_scd_t *)buf; 15535 15536 mutex_destroy(&scdp->scd_mutex); 15537 } 15538 15539 /* 15540 * The listp parameter is a pointer to a list of hmeblks which are partially 15541 * freed as a result of calling sfmmu_hblk_hash_rm(). The last phase of the 15542 * freeing process is to cross-call all cpus to ensure that there are no 15543 * remaining cached references. 15544 * 15545 * If the local generation number is less than the global then we can free 15546 * hmeblks which are already on the pending queue as another cpu has completed 15547 * the cross-call. 15548 * 15549 * We cross-call to make sure that there are no threads on other cpus accessing 15550 * these hmeblks and then complete the process of freeing them under the 15551 * following conditions: 15552 * The total number of pending hmeblks is greater than the threshold 15553 * The reserve list has fewer than HBLK_RESERVE_CNT hmeblks 15554 * At least 1 second has passed since the last time we cross-called 15555 * 15556 * Otherwise, we add the hmeblks to the per-cpu pending queue.
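 *
 * In rough pseudo-code (a paraphrase of the logic below, not a separate
 * interface), the decision is:
 *
 *	if (!dontfree &&
 *	    (freehblkcnt < HBLK_RESERVE_CNT ||
 *	    pending count + count > cpu_hme_pend_thresh ||
 *	    one second expired))
 *		cross-call all other cpus, then sfmmu_hblk_free(listp)
 *	else
 *		append *listp to this cpu's pending queue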
 */
static void
sfmmu_hblks_list_purge(struct hme_blk **listp, int dontfree)
{
	struct hme_blk *hblkp, *pr_hblkp = NULL;
	int count = 0;
	cpuset_t cpuset = cpu_ready_set;
	cpu_hme_pend_t *cpuhp;
	timestruc_t now;
	int one_second_expired = 0;

	gethrestime_lasttick(&now);

	for (hblkp = *listp; hblkp != NULL; hblkp = hblkp->hblk_next) {
		ASSERT(hblkp->hblk_shw_bit == 0);
		ASSERT(hblkp->hblk_shared == 0);
		count++;
		pr_hblkp = hblkp;
	}

	cpuhp = &cpu_hme_pend[CPU->cpu_seqid];
	mutex_enter(&cpuhp->chp_mutex);

	if ((cpuhp->chp_count + count) == 0) {
		mutex_exit(&cpuhp->chp_mutex);
		return;
	}

	if ((now.tv_sec - cpuhp->chp_timestamp) > 1) {
		one_second_expired = 1;
	}

	if (!dontfree && (freehblkcnt < HBLK_RESERVE_CNT ||
	    (cpuhp->chp_count + count) > cpu_hme_pend_thresh ||
	    one_second_expired)) {
		/* Append global list to local */
		if (pr_hblkp == NULL) {
			*listp = cpuhp->chp_listp;
		} else {
			pr_hblkp->hblk_next = cpuhp->chp_listp;
		}
		cpuhp->chp_listp = NULL;
		cpuhp->chp_count = 0;
		cpuhp->chp_timestamp = now.tv_sec;
		mutex_exit(&cpuhp->chp_mutex);

		kpreempt_disable();
		CPUSET_DEL(cpuset, CPU->cpu_id);
		xt_sync(cpuset);
		xt_sync(cpuset);
		kpreempt_enable();

		/*
		 * At this stage we know that no trap handlers on other
		 * cpus can have references to hmeblks on the list.
		 */
		sfmmu_hblk_free(listp);
	} else if (*listp != NULL) {
		pr_hblkp->hblk_next = cpuhp->chp_listp;
		cpuhp->chp_listp = *listp;
		cpuhp->chp_count += count;
		*listp = NULL;
		mutex_exit(&cpuhp->chp_mutex);
	} else {
		mutex_exit(&cpuhp->chp_mutex);
	}
}

/*
 * Add an hmeblk to the hash list.
 */
void
sfmmu_hblk_hash_add(struct hmehash_bucket *hmebp, struct hme_blk *hmeblkp,
    uint64_t hblkpa)
{
	ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));
#ifdef	DEBUG
	if (hmebp->hmeblkp == NULL) {
		ASSERT(hmebp->hmeh_nextpa == HMEBLK_ENDPA);
	}
#endif /* DEBUG */

	hmeblkp->hblk_nextpa = hmebp->hmeh_nextpa;
	/*
	 * Since the TSB miss handler now does not lock the hash chain before
	 * walking it, make sure that the hmeblk's nextpa is globally visible
	 * before we make the hmeblk itself globally visible by updating the
	 * chain root pointer in the hash bucket.
	 */
	membar_producer();
	hmebp->hmeh_nextpa = hblkpa;
	hmeblkp->hblk_next = hmebp->hmeblkp;
	hmebp->hmeblkp = hmeblkp;

}

/*
 * This function is the first part of a two-part process to remove an hmeblk
 * from the hash chain. In this phase we unlink the hmeblk from the hash chain
 * but leave the next physical pointer unchanged. The hmeblk is then linked
 * onto a per-cpu pending list using the virtual address pointer.
 *
 * TSB miss trap handlers that start after this phase will no longer see
 * this hmeblk. TSB miss handlers that still cache this hmeblk in a register
 * can still use it for further chain traversal because we haven't yet
 * modified the next physical pointer or freed it.
 *
 * In the second phase of hmeblk removal we'll issue a barrier xcall before
 * we reuse or free this hmeblk.
 * This will make sure all lingering references to the hmeblk after the first
 * phase disappear before we finally reclaim it.
 * This scheme eliminates the need for TSB miss handlers to lock hmeblk chains
 * during their traversal.
 *
 * The hmehash_mutex must be held when calling this function.
 *
 * Input:
 *	hmebp - hme hash bucket pointer
 *	hmeblkp - address of hmeblk to be removed
 *	pr_hblk - virtual address of previous hmeblkp
 *	listp - pointer to list of hmeblks linked by virtual address
 *	free_now flag - indicates that a complete removal from the hash chains
 *	is necessary.
 *
 * It is inefficient to use the free_now flag, since a cross-call is required
 * to remove a single hmeblk from the hash chain; it is necessary only when
 * hmeblks are in short supply.
 *
 * An illustrative (non-kernel) sketch of this two-phase scheme follows
 * sfmmu_check_pending_hblks() below.
 */
void
sfmmu_hblk_hash_rm(struct hmehash_bucket *hmebp, struct hme_blk *hmeblkp,
    struct hme_blk *pr_hblk, struct hme_blk **listp,
    int free_now)
{
	int shw_size, vshift;
	struct hme_blk *shw_hblkp;
	uint_t shw_mask, newshw_mask;
	caddr_t vaddr;
	int size;
	cpuset_t cpuset = cpu_ready_set;

	ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));

	if (hmebp->hmeblkp == hmeblkp) {
		hmebp->hmeh_nextpa = hmeblkp->hblk_nextpa;
		hmebp->hmeblkp = hmeblkp->hblk_next;
	} else {
		pr_hblk->hblk_nextpa = hmeblkp->hblk_nextpa;
		pr_hblk->hblk_next = hmeblkp->hblk_next;
	}

	size = get_hblk_ttesz(hmeblkp);
	shw_hblkp = hmeblkp->hblk_shadow;
	if (shw_hblkp) {
		ASSERT(hblktosfmmu(hmeblkp) != KHATID);
		ASSERT(!hmeblkp->hblk_shared);
#ifdef	DEBUG
		if (mmu_page_sizes == max_mmu_page_sizes) {
			ASSERT(size < TTE256M);
		} else {
			ASSERT(size < TTE4M);
		}
#endif /* DEBUG */

		shw_size = get_hblk_ttesz(shw_hblkp);
		vaddr = (caddr_t)get_hblk_base(hmeblkp);
		vshift = vaddr_to_vshift(shw_hblkp->hblk_tag, vaddr, shw_size);
		ASSERT(vshift < 8);
		/*
		 * Atomically clear shadow mask bit
		 */
		do {
			shw_mask = shw_hblkp->hblk_shw_mask;
			ASSERT(shw_mask & (1 << vshift));
			newshw_mask = shw_mask & ~(1 << vshift);
			newshw_mask = cas32(&shw_hblkp->hblk_shw_mask,
			    shw_mask, newshw_mask);
		} while (newshw_mask != shw_mask);
		hmeblkp->hblk_shadow = NULL;
	}
	hmeblkp->hblk_shw_bit = 0;

	if (hmeblkp->hblk_shared) {
#ifdef	DEBUG
		sf_srd_t	*srdp;
		sf_region_t	*rgnp;
		uint_t		rid;

		srdp = hblktosrd(hmeblkp);
		ASSERT(srdp != NULL && srdp->srd_refcnt != 0);
		rid = hmeblkp->hblk_tag.htag_rid;
		ASSERT(SFMMU_IS_SHMERID_VALID(rid));
		ASSERT(rid < SFMMU_MAX_HME_REGIONS);
		rgnp = srdp->srd_hmergnp[rid];
		ASSERT(rgnp != NULL);
		SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid);
#endif /* DEBUG */
		hmeblkp->hblk_shared = 0;
	}
	if (free_now) {
		kpreempt_disable();
		CPUSET_DEL(cpuset, CPU->cpu_id);
		xt_sync(cpuset);
		xt_sync(cpuset);
		kpreempt_enable();

		hmeblkp->hblk_nextpa = HMEBLK_ENDPA;
		hmeblkp->hblk_next = NULL;
	} else {
		/* Append hmeblkp to listp for processing later. */
		hmeblkp->hblk_next = *listp;
		*listp = hmeblkp;
	}
}

/*
 * This routine is called when memory is in short supply and returns a free
 * hmeblk of the requested size from the cpu pending lists.
 */
static struct hme_blk *
sfmmu_check_pending_hblks(int size)
{
	int i;
	struct hme_blk *hmeblkp = NULL, *last_hmeblkp;
	int found_hmeblk;
	cpuset_t cpuset = cpu_ready_set;
	cpu_hme_pend_t *cpuhp;

	/* Flush cpu hblk pending queues */
	for (i = 0; i < NCPU; i++) {
		cpuhp = &cpu_hme_pend[i];
		if (cpuhp->chp_listp != NULL) {
			mutex_enter(&cpuhp->chp_mutex);
			if (cpuhp->chp_listp == NULL) {
				mutex_exit(&cpuhp->chp_mutex);
				continue;
			}
			found_hmeblk = 0;
			last_hmeblkp = NULL;
			for (hmeblkp = cpuhp->chp_listp; hmeblkp != NULL;
			    hmeblkp = hmeblkp->hblk_next) {
				if (get_hblk_ttesz(hmeblkp) == size) {
					if (last_hmeblkp == NULL) {
						cpuhp->chp_listp =
						    hmeblkp->hblk_next;
					} else {
						last_hmeblkp->hblk_next =
						    hmeblkp->hblk_next;
					}
					ASSERT(cpuhp->chp_count > 0);
					cpuhp->chp_count--;
					found_hmeblk = 1;
					break;
				} else {
					last_hmeblkp = hmeblkp;
				}
			}
			mutex_exit(&cpuhp->chp_mutex);

			if (found_hmeblk) {
				kpreempt_disable();
				CPUSET_DEL(cpuset, CPU->cpu_id);
				xt_sync(cpuset);
				xt_sync(cpuset);
				kpreempt_enable();
				return (hmeblkp);
			}
		}
	}
	return (NULL);
}
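
/*
 * The block below is an illustrative, user-land sketch of the protocol
 * described above: release-ordered insertion into a chain that is walked
 * without locks, two-phase removal onto a pending list, and a quiescence
 * barrier before the final free.  It is guarded by #if 0 and is never
 * compiled.  The types and functions it defines (struct blk, blk_hash_add(),
 * blk_hash_rm(), blk_pending_purge(), reader_quiescent_barrier()) are
 * invented for the sketch only, and the barrier is merely a placeholder for
 * the pair of xt_sync() cross-calls used by the real code.
 */
#if 0

#include <stdatomic.h>
#include <stdlib.h>

/*
 * A minimal stand-in for an hmeblk.  "next" plays the role of the physical
 * chain pointer that lockless readers follow (hblk_nextpa); "pend_next"
 * plays the role of the virtual pointer reused for the pending list
 * (hblk_next).
 */
struct blk {
	_Atomic(struct blk *) next;
	struct blk *pend_next;
};

struct bucket {
	_Atomic(struct blk *) head;	/* like hmeh_nextpa */
};

/*
 * Pending list of retired but not yet freed blocks.  In the kernel this is
 * a per-cpu list protected by chp_mutex; a single unprotected list suffices
 * for this single-threaded sketch.
 */
static struct blk *pending;

/*
 * In the kernel the quiescence barrier is the pair of xt_sync() cross-calls:
 * once every other cpu has taken the cross-trap, no trap handler can still
 * hold a reference to a retired block.  Here it is only a placeholder.
 */
static void
reader_quiescent_barrier(void)
{
}

/*
 * Insert: initialize the new block's next pointer first, then publish the
 * block with a release store so readers never see a block whose next
 * pointer is stale (the membar_producer() in sfmmu_hblk_hash_add()).
 */
static void
blk_hash_add(struct bucket *bp, struct blk *newp)
{
	atomic_store_explicit(&newp->next,
	    atomic_load_explicit(&bp->head, memory_order_relaxed),
	    memory_order_relaxed);
	atomic_store_explicit(&bp->head, newp, memory_order_release);
}

/*
 * Phase one: unlink from the bucket (done under the bucket lock in the real
 * code) but leave blkp->next intact so a reader that already holds blkp can
 * keep walking; queue the block for later reclamation.
 */
static void
blk_hash_rm(struct bucket *bp, struct blk *blkp, struct blk *prevp)
{
	struct blk *nextp = atomic_load_explicit(&blkp->next,
	    memory_order_relaxed);

	if (prevp == NULL)
		atomic_store_explicit(&bp->head, nextp, memory_order_relaxed);
	else
		atomic_store_explicit(&prevp->next, nextp,
		    memory_order_relaxed);

	blkp->pend_next = pending;
	pending = blkp;
}

/*
 * Phase two: once no reader can still reference the retired blocks, free
 * them (the free path of sfmmu_hblks_list_purge() after its cross-calls).
 */
static void
blk_pending_purge(void)
{
	struct blk *blkp, *nextp;

	reader_quiescent_barrier();
	for (blkp = pending; blkp != NULL; blkp = nextp) {
		nextp = blkp->pend_next;
		free(blkp);
	}
	pending = NULL;
}

#endif	/* 0 */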