1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * VM - Hardware Address Translation management for Spitfire MMU. 28 * 29 * This file implements the machine specific hardware translation 30 * needed by the VM system. The machine independent interface is 31 * described in <vm/hat.h> while the machine dependent interface 32 * and data structures are described in <vm/hat_sfmmu.h>. 33 * 34 * The hat layer manages the address translation hardware as a cache 35 * driven by calls from the higher levels in the VM system. 36 */ 37 38 #include <sys/types.h> 39 #include <sys/kstat.h> 40 #include <vm/hat.h> 41 #include <vm/hat_sfmmu.h> 42 #include <vm/page.h> 43 #include <sys/pte.h> 44 #include <sys/systm.h> 45 #include <sys/mman.h> 46 #include <sys/sysmacros.h> 47 #include <sys/machparam.h> 48 #include <sys/vtrace.h> 49 #include <sys/kmem.h> 50 #include <sys/mmu.h> 51 #include <sys/cmn_err.h> 52 #include <sys/cpu.h> 53 #include <sys/cpuvar.h> 54 #include <sys/debug.h> 55 #include <sys/lgrp.h> 56 #include <sys/archsystm.h> 57 #include <sys/machsystm.h> 58 #include <sys/vmsystm.h> 59 #include <vm/as.h> 60 #include <vm/seg.h> 61 #include <vm/seg_kp.h> 62 #include <vm/seg_kmem.h> 63 #include <vm/seg_kpm.h> 64 #include <vm/rm.h> 65 #include <sys/t_lock.h> 66 #include <sys/obpdefs.h> 67 #include <sys/vm_machparam.h> 68 #include <sys/var.h> 69 #include <sys/trap.h> 70 #include <sys/machtrap.h> 71 #include <sys/scb.h> 72 #include <sys/bitmap.h> 73 #include <sys/machlock.h> 74 #include <sys/membar.h> 75 #include <sys/atomic.h> 76 #include <sys/cpu_module.h> 77 #include <sys/prom_debug.h> 78 #include <sys/ksynch.h> 79 #include <sys/mem_config.h> 80 #include <sys/mem_cage.h> 81 #include <vm/vm_dep.h> 82 #include <vm/xhat_sfmmu.h> 83 #include <sys/fpu/fpusystm.h> 84 #include <vm/mach_kpm.h> 85 #include <sys/callb.h> 86 87 #ifdef DEBUG 88 #define SFMMU_VALIDATE_HMERID(hat, rid, saddr, len) \ 89 if (SFMMU_IS_SHMERID_VALID(rid)) { \ 90 caddr_t _eaddr = (saddr) + (len); \ 91 sf_srd_t *_srdp; \ 92 sf_region_t *_rgnp; \ 93 ASSERT((rid) < SFMMU_MAX_HME_REGIONS); \ 94 ASSERT(SF_RGNMAP_TEST(hat->sfmmu_hmeregion_map, rid)); \ 95 ASSERT((hat) != ksfmmup); \ 96 _srdp = (hat)->sfmmu_srdp; \ 97 ASSERT(_srdp != NULL); \ 98 ASSERT(_srdp->srd_refcnt != 0); \ 99 _rgnp = _srdp->srd_hmergnp[(rid)]; \ 100 ASSERT(_rgnp != NULL && _rgnp->rgn_id == rid); \ 101 ASSERT(_rgnp->rgn_refcnt != 0); \ 102 ASSERT(!(_rgnp->rgn_flags & SFMMU_REGION_FREE)); \ 103 ASSERT((_rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == \ 104 SFMMU_REGION_HME); \ 105 ASSERT((saddr) >= _rgnp->rgn_saddr); \ 106 ASSERT((saddr) < _rgnp->rgn_saddr + 
_rgnp->rgn_size); \ 107 ASSERT(_eaddr > _rgnp->rgn_saddr); \ 108 ASSERT(_eaddr <= _rgnp->rgn_saddr + _rgnp->rgn_size); \ 109 } 110 111 #define SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid) \ 112 { \ 113 caddr_t _hsva; \ 114 caddr_t _heva; \ 115 caddr_t _rsva; \ 116 caddr_t _reva; \ 117 int _ttesz = get_hblk_ttesz(hmeblkp); \ 118 int _flagtte; \ 119 ASSERT((srdp)->srd_refcnt != 0); \ 120 ASSERT((rid) < SFMMU_MAX_HME_REGIONS); \ 121 ASSERT((rgnp)->rgn_id == rid); \ 122 ASSERT(!((rgnp)->rgn_flags & SFMMU_REGION_FREE)); \ 123 ASSERT(((rgnp)->rgn_flags & SFMMU_REGION_TYPE_MASK) == \ 124 SFMMU_REGION_HME); \ 125 ASSERT(_ttesz <= (rgnp)->rgn_pgszc); \ 126 _hsva = (caddr_t)get_hblk_base(hmeblkp); \ 127 _heva = get_hblk_endaddr(hmeblkp); \ 128 _rsva = (caddr_t)P2ALIGN( \ 129 (uintptr_t)(rgnp)->rgn_saddr, HBLK_MIN_BYTES); \ 130 _reva = (caddr_t)P2ROUNDUP( \ 131 (uintptr_t)((rgnp)->rgn_saddr + (rgnp)->rgn_size), \ 132 HBLK_MIN_BYTES); \ 133 ASSERT(_hsva >= _rsva); \ 134 ASSERT(_hsva < _reva); \ 135 ASSERT(_heva > _rsva); \ 136 ASSERT(_heva <= _reva); \ 137 _flagtte = (_ttesz < HBLK_MIN_TTESZ) ? HBLK_MIN_TTESZ : \ 138 _ttesz; \ 139 ASSERT(rgnp->rgn_hmeflags & (0x1 << _flagtte)); \ 140 } 141 142 #else /* DEBUG */ 143 #define SFMMU_VALIDATE_HMERID(hat, rid, addr, len) 144 #define SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid) 145 #endif /* DEBUG */ 146 147 #if defined(SF_ERRATA_57) 148 extern caddr_t errata57_limit; 149 #endif 150 151 #define HME8BLK_SZ_RND ((roundup(HME8BLK_SZ, sizeof (int64_t))) / \ 152 (sizeof (int64_t))) 153 #define HBLK_RESERVE ((struct hme_blk *)hblk_reserve) 154 155 #define HBLK_RESERVE_CNT 128 156 #define HBLK_RESERVE_MIN 20 157 158 static struct hme_blk *freehblkp; 159 static kmutex_t freehblkp_lock; 160 static int freehblkcnt; 161 162 static int64_t hblk_reserve[HME8BLK_SZ_RND]; 163 static kmutex_t hblk_reserve_lock; 164 static kthread_t *hblk_reserve_thread; 165 166 static nucleus_hblk8_info_t nucleus_hblk8; 167 static nucleus_hblk1_info_t nucleus_hblk1; 168 169 /* 170 * SFMMU specific hat functions 171 */ 172 void hat_pagecachectl(struct page *, int); 173 174 /* flags for hat_pagecachectl */ 175 #define HAT_CACHE 0x1 176 #define HAT_UNCACHE 0x2 177 #define HAT_TMPNC 0x4 178 179 /* 180 * This flag is set to 0 via the MD in platforms that do not support 181 * I-cache coherency in hardware. Used to enable "soft exec" mode. 182 * The MD "coherency" property is optional, and defaults to 1 (because 183 * coherent I-cache is the norm.) 184 */ 185 uint_t icache_is_coherent = 1; 186 187 /* 188 * Flag to allow the creation of non-cacheable translations 189 * to system memory. It is off by default. At the moment this 190 * flag is used by the ecache error injector. The error injector 191 * will turn it on when creating such a translation then shut it 192 * off when it's finished. 193 */ 194 195 int sfmmu_allow_nc_trans = 0; 196 197 /* 198 * Flag to disable large page support. 199 * value of 1 => disable all large pages. 200 * bits 1, 2, and 3 are to disable 64K, 512K and 4M pages respectively. 201 * 202 * For example, use the value 0x4 to disable 512K pages. 203 * 204 */ 205 #define LARGE_PAGES_OFF 0x1 206 207 /* 208 * The disable_large_pages and disable_ism_large_pages variables control 209 * hat_memload_array and the page sizes to be used by ISM and the kernel. 
210 * 211 * The disable_auto_data_large_pages and disable_auto_text_large_pages variables 212 * are only used to control which OOB pages to use at upper VM segment creation 213 * time, and are set in hat_init_pagesizes and used in the map_pgsz* routines. 214 * Their values may come from platform or CPU specific code to disable page 215 * sizes that should not be used. 216 * 217 * WARNING: 512K pages are currently not supported for ISM/DISM. 218 */ 219 uint_t disable_large_pages = 0; 220 uint_t disable_ism_large_pages = (1 << TTE512K); 221 uint_t disable_auto_data_large_pages = 0; 222 uint_t disable_auto_text_large_pages = 0; 223 224 /* 225 * Private sfmmu data structures for hat management 226 */ 227 static struct kmem_cache *sfmmuid_cache; 228 static struct kmem_cache *mmuctxdom_cache; 229 230 /* 231 * Private sfmmu data structures for tsb management 232 */ 233 static struct kmem_cache *sfmmu_tsbinfo_cache; 234 static struct kmem_cache *sfmmu_tsb8k_cache; 235 static struct kmem_cache *sfmmu_tsb_cache[NLGRPS_MAX]; 236 static vmem_t *kmem_bigtsb_arena; 237 static vmem_t *kmem_tsb_arena; 238 239 /* 240 * sfmmu static variables for hmeblk resource management. 241 */ 242 static vmem_t *hat_memload1_arena; /* HAT translation arena for sfmmu1_cache */ 243 static struct kmem_cache *sfmmu8_cache; 244 static struct kmem_cache *sfmmu1_cache; 245 static struct kmem_cache *pa_hment_cache; 246 247 static kmutex_t ism_mlist_lock; /* mutex for ism mapping list */ 248 /* 249 * private data for ism 250 */ 251 static struct kmem_cache *ism_blk_cache; 252 static struct kmem_cache *ism_ment_cache; 253 #define ISMID_STARTADDR NULL 254 255 /* 256 * Region management data structures and function declarations. 257 */ 258 259 static void sfmmu_leave_srd(sfmmu_t *); 260 static int sfmmu_srdcache_constructor(void *, void *, int); 261 static void sfmmu_srdcache_destructor(void *, void *); 262 static int sfmmu_rgncache_constructor(void *, void *, int); 263 static void sfmmu_rgncache_destructor(void *, void *); 264 static int sfrgnmap_isnull(sf_region_map_t *); 265 static int sfhmergnmap_isnull(sf_hmeregion_map_t *); 266 static int sfmmu_scdcache_constructor(void *, void *, int); 267 static void sfmmu_scdcache_destructor(void *, void *); 268 static void sfmmu_rgn_cb_noop(caddr_t, caddr_t, caddr_t, 269 size_t, void *, u_offset_t); 270 271 static uint_t srd_hashmask = SFMMU_MAX_SRD_BUCKETS - 1; 272 static sf_srd_bucket_t *srd_buckets; 273 static struct kmem_cache *srd_cache; 274 static uint_t srd_rgn_hashmask = SFMMU_MAX_REGION_BUCKETS - 1; 275 static struct kmem_cache *region_cache; 276 static struct kmem_cache *scd_cache; 277 278 #ifdef sun4v 279 int use_bigtsb_arena = 1; 280 #else 281 int use_bigtsb_arena = 0; 282 #endif 283 284 /* External /etc/system tunable, for turning on&off the shctx support */ 285 int disable_shctx = 0; 286 /* Internal variable, set by MD if the HW supports shctx feature */ 287 int shctx_on = 0; 288 289 #ifdef DEBUG 290 static void check_scd_sfmmu_list(sfmmu_t **, sfmmu_t *, int); 291 #endif 292 static void sfmmu_to_scd_list(sfmmu_t **, sfmmu_t *); 293 static void sfmmu_from_scd_list(sfmmu_t **, sfmmu_t *); 294 295 static sf_scd_t *sfmmu_alloc_scd(sf_srd_t *, sf_region_map_t *); 296 static void sfmmu_find_scd(sfmmu_t *); 297 static void sfmmu_join_scd(sf_scd_t *, sfmmu_t *); 298 static void sfmmu_finish_join_scd(sfmmu_t *); 299 static void sfmmu_leave_scd(sfmmu_t *, uchar_t); 300 static void sfmmu_destroy_scd(sf_srd_t *, sf_scd_t *, sf_region_map_t *); 301 static int 
sfmmu_alloc_scd_tsbs(sf_srd_t *, sf_scd_t *); 302 static void sfmmu_free_scd_tsbs(sfmmu_t *); 303 static void sfmmu_tsb_inv_ctx(sfmmu_t *); 304 static int find_ism_rid(sfmmu_t *, sfmmu_t *, caddr_t, uint_t *); 305 static void sfmmu_ism_hatflags(sfmmu_t *, int); 306 static int sfmmu_srd_lock_held(sf_srd_t *); 307 static void sfmmu_remove_scd(sf_scd_t **, sf_scd_t *); 308 static void sfmmu_add_scd(sf_scd_t **headp, sf_scd_t *); 309 static void sfmmu_link_scd_to_regions(sf_srd_t *, sf_scd_t *); 310 static void sfmmu_unlink_scd_from_regions(sf_srd_t *, sf_scd_t *); 311 static void sfmmu_link_to_hmeregion(sfmmu_t *, sf_region_t *); 312 static void sfmmu_unlink_from_hmeregion(sfmmu_t *, sf_region_t *); 313 314 /* 315 * ``hat_lock'' is a hashed mutex lock for protecting sfmmu TSB lists, 316 * HAT flags, synchronizing TLB/TSB coherency, and context management. 317 * The lock is hashed on the sfmmup since the case where we need to lock 318 * all processes is rare but does occur (e.g. we need to unload a shared 319 * mapping from all processes using the mapping). We have a lot of buckets, 320 * and each slab of sfmmu_t's can use about a quarter of them, giving us 321 * a fairly good distribution without wasting too much space and overhead 322 * when we have to grab them all. 323 */ 324 #define SFMMU_NUM_LOCK 128 /* must be power of two */ 325 hatlock_t hat_lock[SFMMU_NUM_LOCK]; 326 327 /* 328 * Hash algorithm optimized for a small number of slabs. 329 * 7 is (highbit((sizeof sfmmu_t)) - 1) 330 * This hash algorithm is based upon the knowledge that sfmmu_t's come from a 331 * kmem_cache, and thus they will be sequential within that cache. In 332 * addition, each new slab will have a different "color" up to cache_maxcolor 333 * which will skew the hashing for each successive slab which is allocated. 334 * If the size of sfmmu_t changed to a larger size, this algorithm may need 335 * to be revisited. 336 */ 337 #define TSB_HASH_SHIFT_BITS (7) 338 #define PTR_HASH(x) ((uintptr_t)x >> TSB_HASH_SHIFT_BITS) 339 340 #ifdef DEBUG 341 int tsb_hash_debug = 0; 342 #define TSB_HASH(sfmmup) \ 343 (tsb_hash_debug ? &hat_lock[0] : \ 344 &hat_lock[PTR_HASH(sfmmup) & (SFMMU_NUM_LOCK-1)]) 345 #else /* DEBUG */ 346 #define TSB_HASH(sfmmup) &hat_lock[PTR_HASH(sfmmup) & (SFMMU_NUM_LOCK-1)] 347 #endif /* DEBUG */ 348 349 350 /* sfmmu_replace_tsb() return codes. */ 351 typedef enum tsb_replace_rc { 352 TSB_SUCCESS, 353 TSB_ALLOCFAIL, 354 TSB_LOSTRACE, 355 TSB_ALREADY_SWAPPED, 356 TSB_CANTGROW 357 } tsb_replace_rc_t; 358 359 /* 360 * Flags for TSB allocation routines. 361 */ 362 #define TSB_ALLOC 0x01 363 #define TSB_FORCEALLOC 0x02 364 #define TSB_GROW 0x04 365 #define TSB_SHRINK 0x08 366 #define TSB_SWAPIN 0x10 367 368 /* 369 * Support for HAT callbacks. 370 */ 371 #define SFMMU_MAX_RELOC_CALLBACKS 10 372 int sfmmu_max_cb_id = SFMMU_MAX_RELOC_CALLBACKS; 373 static id_t sfmmu_cb_nextid = 0; 374 static id_t sfmmu_tsb_cb_id; 375 struct sfmmu_callback *sfmmu_cb_table; 376 377 /* 378 * Kernel page relocation is enabled by default for non-caged 379 * kernel pages. This has little effect unless segkmem_reloc is 380 * set, since by default kernel memory comes from inside the 381 * kernel cage. 382 */ 383 int hat_kpr_enabled = 1; 384 385 kmutex_t kpr_mutex; 386 kmutex_t kpr_suspendlock; 387 kthread_t *kreloc_thread; 388 389 /* 390 * Enable VA->PA translation sanity checking on DEBUG kernels. 391 * Disabled by default. 
This is incompatible with some 392 * drivers (error injector, RSM) so if it breaks you get 393 * to keep both pieces. 394 */ 395 int hat_check_vtop = 0; 396 397 /* 398 * Private sfmmu routines (prototypes) 399 */ 400 static struct hme_blk *sfmmu_shadow_hcreate(sfmmu_t *, caddr_t, int, uint_t); 401 static struct hme_blk *sfmmu_hblk_alloc(sfmmu_t *, caddr_t, 402 struct hmehash_bucket *, uint_t, hmeblk_tag, uint_t, 403 uint_t); 404 static caddr_t sfmmu_hblk_unload(struct hat *, struct hme_blk *, caddr_t, 405 caddr_t, demap_range_t *, uint_t); 406 static caddr_t sfmmu_hblk_sync(struct hat *, struct hme_blk *, caddr_t, 407 caddr_t, int); 408 static void sfmmu_hblk_free(struct hmehash_bucket *, struct hme_blk *, 409 uint64_t, struct hme_blk **); 410 static void sfmmu_hblks_list_purge(struct hme_blk **); 411 static uint_t sfmmu_get_free_hblk(struct hme_blk **, uint_t); 412 static uint_t sfmmu_put_free_hblk(struct hme_blk *, uint_t); 413 static struct hme_blk *sfmmu_hblk_steal(int); 414 static int sfmmu_steal_this_hblk(struct hmehash_bucket *, 415 struct hme_blk *, uint64_t, uint64_t, 416 struct hme_blk *); 417 static caddr_t sfmmu_hblk_unlock(struct hme_blk *, caddr_t, caddr_t); 418 419 static void hat_do_memload_array(struct hat *, caddr_t, size_t, 420 struct page **, uint_t, uint_t, uint_t); 421 static void hat_do_memload(struct hat *, caddr_t, struct page *, 422 uint_t, uint_t, uint_t); 423 static void sfmmu_memload_batchsmall(struct hat *, caddr_t, page_t **, 424 uint_t, uint_t, pgcnt_t, uint_t); 425 void sfmmu_tteload(struct hat *, tte_t *, caddr_t, page_t *, 426 uint_t); 427 static int sfmmu_tteload_array(sfmmu_t *, tte_t *, caddr_t, page_t **, 428 uint_t, uint_t); 429 static struct hmehash_bucket *sfmmu_tteload_acquire_hashbucket(sfmmu_t *, 430 caddr_t, int, uint_t); 431 static struct hme_blk *sfmmu_tteload_find_hmeblk(sfmmu_t *, 432 struct hmehash_bucket *, caddr_t, uint_t, uint_t, 433 uint_t); 434 static int sfmmu_tteload_addentry(sfmmu_t *, struct hme_blk *, tte_t *, 435 caddr_t, page_t **, uint_t, uint_t); 436 static void sfmmu_tteload_release_hashbucket(struct hmehash_bucket *); 437 438 static int sfmmu_pagearray_setup(caddr_t, page_t **, tte_t *, int); 439 static pfn_t sfmmu_uvatopfn(caddr_t, sfmmu_t *, tte_t *); 440 void sfmmu_memtte(tte_t *, pfn_t, uint_t, int); 441 #ifdef VAC 442 static void sfmmu_vac_conflict(struct hat *, caddr_t, page_t *); 443 static int sfmmu_vacconflict_array(caddr_t, page_t *, int *); 444 int tst_tnc(page_t *pp, pgcnt_t); 445 void conv_tnc(page_t *pp, int); 446 #endif 447 448 static void sfmmu_get_ctx(sfmmu_t *); 449 static void sfmmu_free_sfmmu(sfmmu_t *); 450 451 static void sfmmu_ttesync(struct hat *, caddr_t, tte_t *, page_t *); 452 static void sfmmu_chgattr(struct hat *, caddr_t, size_t, uint_t, int); 453 454 cpuset_t sfmmu_pageunload(page_t *, struct sf_hment *, int); 455 static void hat_pagereload(struct page *, struct page *); 456 static cpuset_t sfmmu_pagesync(page_t *, struct sf_hment *, uint_t); 457 #ifdef VAC 458 void sfmmu_page_cache_array(page_t *, int, int, pgcnt_t); 459 static void sfmmu_page_cache(page_t *, int, int, int); 460 #endif 461 462 cpuset_t sfmmu_rgntlb_demap(caddr_t, sf_region_t *, 463 struct hme_blk *, int); 464 static void sfmmu_tlbcache_demap(caddr_t, sfmmu_t *, struct hme_blk *, 465 pfn_t, int, int, int, int); 466 static void sfmmu_ismtlbcache_demap(caddr_t, sfmmu_t *, struct hme_blk *, 467 pfn_t, int); 468 static void sfmmu_tlb_demap(caddr_t, sfmmu_t *, struct hme_blk *, int, int); 469 static void 
sfmmu_tlb_range_demap(demap_range_t *); 470 static void sfmmu_invalidate_ctx(sfmmu_t *); 471 static void sfmmu_sync_mmustate(sfmmu_t *); 472 473 static void sfmmu_tsbinfo_setup_phys(struct tsb_info *, pfn_t); 474 static int sfmmu_tsbinfo_alloc(struct tsb_info **, int, int, uint_t, 475 sfmmu_t *); 476 static void sfmmu_tsb_free(struct tsb_info *); 477 static void sfmmu_tsbinfo_free(struct tsb_info *); 478 static int sfmmu_init_tsbinfo(struct tsb_info *, int, int, uint_t, 479 sfmmu_t *); 480 static void sfmmu_tsb_chk_reloc(sfmmu_t *, hatlock_t *); 481 static void sfmmu_tsb_swapin(sfmmu_t *, hatlock_t *); 482 static int sfmmu_select_tsb_szc(pgcnt_t); 483 static void sfmmu_mod_tsb(sfmmu_t *, caddr_t, tte_t *, int); 484 #define sfmmu_load_tsb(sfmmup, vaddr, tte, szc) \ 485 sfmmu_mod_tsb(sfmmup, vaddr, tte, szc) 486 #define sfmmu_unload_tsb(sfmmup, vaddr, szc) \ 487 sfmmu_mod_tsb(sfmmup, vaddr, NULL, szc) 488 static void sfmmu_copy_tsb(struct tsb_info *, struct tsb_info *); 489 static tsb_replace_rc_t sfmmu_replace_tsb(sfmmu_t *, struct tsb_info *, uint_t, 490 hatlock_t *, uint_t); 491 static void sfmmu_size_tsb(sfmmu_t *, int, uint64_t, uint64_t, int); 492 493 #ifdef VAC 494 void sfmmu_cache_flush(pfn_t, int); 495 void sfmmu_cache_flushcolor(int, pfn_t); 496 #endif 497 static caddr_t sfmmu_hblk_chgattr(sfmmu_t *, struct hme_blk *, caddr_t, 498 caddr_t, demap_range_t *, uint_t, int); 499 500 static uint64_t sfmmu_vtop_attr(uint_t, int mode, tte_t *); 501 static uint_t sfmmu_ptov_attr(tte_t *); 502 static caddr_t sfmmu_hblk_chgprot(sfmmu_t *, struct hme_blk *, caddr_t, 503 caddr_t, demap_range_t *, uint_t); 504 static uint_t sfmmu_vtop_prot(uint_t, uint_t *); 505 static int sfmmu_idcache_constructor(void *, void *, int); 506 static void sfmmu_idcache_destructor(void *, void *); 507 static int sfmmu_hblkcache_constructor(void *, void *, int); 508 static void sfmmu_hblkcache_destructor(void *, void *); 509 static void sfmmu_hblkcache_reclaim(void *); 510 static void sfmmu_shadow_hcleanup(sfmmu_t *, struct hme_blk *, 511 struct hmehash_bucket *); 512 static void sfmmu_free_hblks(sfmmu_t *, caddr_t, caddr_t, int); 513 static void sfmmu_cleanup_rhblk(sf_srd_t *, caddr_t, uint_t, int); 514 static void sfmmu_unload_hmeregion_va(sf_srd_t *, uint_t, caddr_t, caddr_t, 515 int, caddr_t *); 516 static void sfmmu_unload_hmeregion(sf_srd_t *, sf_region_t *); 517 518 static void sfmmu_rm_large_mappings(page_t *, int); 519 520 static void hat_lock_init(void); 521 static void hat_kstat_init(void); 522 static int sfmmu_kstat_percpu_update(kstat_t *ksp, int rw); 523 static void sfmmu_set_scd_rttecnt(sf_srd_t *, sf_scd_t *); 524 static int sfmmu_is_rgnva(sf_srd_t *, caddr_t, ulong_t, ulong_t); 525 static void sfmmu_check_page_sizes(sfmmu_t *, int); 526 int fnd_mapping_sz(page_t *); 527 static void iment_add(struct ism_ment *, struct hat *); 528 static void iment_sub(struct ism_ment *, struct hat *); 529 static pgcnt_t ism_tsb_entries(sfmmu_t *, int szc); 530 extern void sfmmu_setup_tsbinfo(sfmmu_t *); 531 extern void sfmmu_clear_utsbinfo(void); 532 533 static void sfmmu_ctx_wrap_around(mmu_ctx_t *); 534 535 /* kpm globals */ 536 #ifdef DEBUG 537 /* 538 * Enable trap level tsbmiss handling 539 */ 540 int kpm_tsbmtl = 1; 541 542 /* 543 * Flush the TLB on kpm mapout. Note: Xcalls are used (again) for the 544 * required TLB shootdowns in this case, so handle w/ care. Off by default. 
545 */ 546 int kpm_tlb_flush; 547 #endif /* DEBUG */ 548 549 static void *sfmmu_vmem_xalloc_aligned_wrapper(vmem_t *, size_t, int); 550 551 #ifdef DEBUG 552 static void sfmmu_check_hblk_flist(); 553 #endif 554 555 /* 556 * Semi-private sfmmu data structures. Some of them are initialize in 557 * startup or in hat_init. Some of them are private but accessed by 558 * assembly code or mach_sfmmu.c 559 */ 560 struct hmehash_bucket *uhme_hash; /* user hmeblk hash table */ 561 struct hmehash_bucket *khme_hash; /* kernel hmeblk hash table */ 562 uint64_t uhme_hash_pa; /* PA of uhme_hash */ 563 uint64_t khme_hash_pa; /* PA of khme_hash */ 564 int uhmehash_num; /* # of buckets in user hash table */ 565 int khmehash_num; /* # of buckets in kernel hash table */ 566 567 uint_t max_mmu_ctxdoms = 0; /* max context domains in the system */ 568 mmu_ctx_t **mmu_ctxs_tbl; /* global array of context domains */ 569 uint64_t mmu_saved_gnum = 0; /* to init incoming MMUs' gnums */ 570 571 #define DEFAULT_NUM_CTXS_PER_MMU 8192 572 static uint_t nctxs = DEFAULT_NUM_CTXS_PER_MMU; 573 574 int cache; /* describes system cache */ 575 576 caddr_t ktsb_base; /* kernel 8k-indexed tsb base address */ 577 uint64_t ktsb_pbase; /* kernel 8k-indexed tsb phys address */ 578 int ktsb_szcode; /* kernel 8k-indexed tsb size code */ 579 int ktsb_sz; /* kernel 8k-indexed tsb size */ 580 581 caddr_t ktsb4m_base; /* kernel 4m-indexed tsb base address */ 582 uint64_t ktsb4m_pbase; /* kernel 4m-indexed tsb phys address */ 583 int ktsb4m_szcode; /* kernel 4m-indexed tsb size code */ 584 int ktsb4m_sz; /* kernel 4m-indexed tsb size */ 585 586 uint64_t kpm_tsbbase; /* kernel seg_kpm 4M TSB base address */ 587 int kpm_tsbsz; /* kernel seg_kpm 4M TSB size code */ 588 uint64_t kpmsm_tsbbase; /* kernel seg_kpm 8K TSB base address */ 589 int kpmsm_tsbsz; /* kernel seg_kpm 8K TSB size code */ 590 591 #ifndef sun4v 592 int utsb_dtlb_ttenum = -1; /* index in TLB for utsb locked TTE */ 593 int utsb4m_dtlb_ttenum = -1; /* index in TLB for 4M TSB TTE */ 594 int dtlb_resv_ttenum; /* index in TLB of first reserved TTE */ 595 caddr_t utsb_vabase; /* reserved kernel virtual memory */ 596 caddr_t utsb4m_vabase; /* for trap handler TSB accesses */ 597 #endif /* sun4v */ 598 uint64_t tsb_alloc_bytes = 0; /* bytes allocated to TSBs */ 599 vmem_t *kmem_tsb_default_arena[NLGRPS_MAX]; /* For dynamic TSBs */ 600 vmem_t *kmem_bigtsb_default_arena[NLGRPS_MAX]; /* dynamic 256M TSBs */ 601 602 /* 603 * Size to use for TSB slabs. Future platforms that support page sizes 604 * larger than 4M may wish to change these values, and provide their own 605 * assembly macros for building and decoding the TSB base register contents. 606 * Note disable_large_pages will override the value set here. 607 */ 608 static uint_t tsb_slab_ttesz = TTE4M; 609 size_t tsb_slab_size = MMU_PAGESIZE4M; 610 uint_t tsb_slab_shift = MMU_PAGESHIFT4M; 611 /* PFN mask for TTE */ 612 size_t tsb_slab_mask = MMU_PAGEOFFSET4M >> MMU_PAGESHIFT; 613 614 /* 615 * Size to use for TSB slabs. These are used only when 256M tsb arenas 616 * exist. 617 */ 618 static uint_t bigtsb_slab_ttesz = TTE256M; 619 static size_t bigtsb_slab_size = MMU_PAGESIZE256M; 620 static uint_t bigtsb_slab_shift = MMU_PAGESHIFT256M; 621 /* 256M page alignment for 8K pfn */ 622 static size_t bigtsb_slab_mask = MMU_PAGEOFFSET256M >> MMU_PAGESHIFT; 623 624 /* largest TSB size to grow to, will be smaller on smaller memory systems */ 625 static int tsb_max_growsize = 0; 626 627 /* 628 * Tunable parameters dealing with TSB policies. 
629 */ 630 631 /* 632 * This undocumented tunable forces all 8K TSBs to be allocated from 633 * the kernel heap rather than from the kmem_tsb_default_arena arenas. 634 */ 635 #ifdef DEBUG 636 int tsb_forceheap = 0; 637 #endif /* DEBUG */ 638 639 /* 640 * Decide whether to use per-lgroup arenas, or one global set of 641 * TSB arenas. The default is not to break up per-lgroup, since 642 * most platforms don't recognize any tangible benefit from it. 643 */ 644 int tsb_lgrp_affinity = 0; 645 646 /* 647 * Used for growing the TSB based on the process RSS. 648 * tsb_rss_factor is based on the smallest TSB, and is 649 * shifted by the TSB size to determine if we need to grow. 650 * The default will grow the TSB if the number of TTEs for 651 * this page size exceeds 75% of the number of TSB entries, 652 * which should _almost_ eliminate all conflict misses 653 * (at the expense of using up lots and lots of memory). 654 */ 655 #define TSB_RSS_FACTOR (TSB_ENTRIES(TSB_MIN_SZCODE) * 0.75) 656 #define SFMMU_RSS_TSBSIZE(tsbszc) (tsb_rss_factor << tsbszc) 657 #define SELECT_TSB_SIZECODE(pgcnt) ( \ 658 (enable_tsb_rss_sizing)? sfmmu_select_tsb_szc(pgcnt) : \ 659 default_tsb_size) 660 #define TSB_OK_SHRINK() \ 661 (tsb_alloc_bytes > tsb_alloc_hiwater || freemem < desfree) 662 #define TSB_OK_GROW() \ 663 (tsb_alloc_bytes < tsb_alloc_hiwater && freemem > desfree) 664 665 int enable_tsb_rss_sizing = 1; 666 int tsb_rss_factor = (int)TSB_RSS_FACTOR; 667 668 /* which TSB size code to use for new address spaces or if rss sizing off */ 669 int default_tsb_size = TSB_8K_SZCODE; 670 671 static uint64_t tsb_alloc_hiwater; /* limit TSB reserved memory */ 672 uint64_t tsb_alloc_hiwater_factor; /* tsb_alloc_hiwater = physmem / this */ 673 #define TSB_ALLOC_HIWATER_FACTOR_DEFAULT 32 674 675 #ifdef DEBUG 676 static int tsb_random_size = 0; /* set to 1 to test random tsb sizes on alloc */ 677 static int tsb_grow_stress = 0; /* if set to 1, keep replacing TSB w/ random */ 678 static int tsb_alloc_mtbf = 0; /* fail allocation every n attempts */ 679 static int tsb_alloc_fail_mtbf = 0; 680 static int tsb_alloc_count = 0; 681 #endif /* DEBUG */ 682 683 /* if set to 1, will remap valid TTEs when growing TSB. */ 684 int tsb_remap_ttes = 1; 685 686 /* 687 * If we have more than this many mappings, allocate a second TSB. 688 * This default is chosen because the I/D fully associative TLBs are 689 * assumed to have at least 8 available entries. Platforms with a 690 * larger fully-associative TLB could probably override the default. 
691 */ 692 693 #ifdef sun4v 694 int tsb_sectsb_threshold = 0; 695 #else 696 int tsb_sectsb_threshold = 8; 697 #endif 698 699 /* 700 * kstat data 701 */ 702 struct sfmmu_global_stat sfmmu_global_stat; 703 struct sfmmu_tsbsize_stat sfmmu_tsbsize_stat; 704 705 /* 706 * Global data 707 */ 708 sfmmu_t *ksfmmup; /* kernel's hat id */ 709 710 #ifdef DEBUG 711 static void chk_tte(tte_t *, tte_t *, tte_t *, struct hme_blk *); 712 #endif 713 714 /* sfmmu locking operations */ 715 static kmutex_t *sfmmu_mlspl_enter(struct page *, int); 716 static int sfmmu_mlspl_held(struct page *, int); 717 718 kmutex_t *sfmmu_page_enter(page_t *); 719 void sfmmu_page_exit(kmutex_t *); 720 int sfmmu_page_spl_held(struct page *); 721 722 /* sfmmu internal locking operations - accessed directly */ 723 static void sfmmu_mlist_reloc_enter(page_t *, page_t *, 724 kmutex_t **, kmutex_t **); 725 static void sfmmu_mlist_reloc_exit(kmutex_t *, kmutex_t *); 726 static hatlock_t * 727 sfmmu_hat_enter(sfmmu_t *); 728 static hatlock_t * 729 sfmmu_hat_tryenter(sfmmu_t *); 730 static void sfmmu_hat_exit(hatlock_t *); 731 static void sfmmu_hat_lock_all(void); 732 static void sfmmu_hat_unlock_all(void); 733 static void sfmmu_ismhat_enter(sfmmu_t *, int); 734 static void sfmmu_ismhat_exit(sfmmu_t *, int); 735 736 /* 737 * Array of mutexes protecting a page's mapping list and p_nrm field. 738 * 739 * The hash function looks complicated, but is made up so that: 740 * 741 * "pp" not shifted, so adjacent pp values will hash to different cache lines 742 * (8 byte alignment * 8 bytes/mutes == 64 byte coherency subblock) 743 * 744 * "pp" >> mml_shift, incorporates more source bits into the hash result 745 * 746 * "& (mml_table_size - 1), should be faster than using remainder "%" 747 * 748 * Hopefully, mml_table, mml_table_size and mml_shift are all in the same 749 * cacheline, since they get declared next to each other below. We'll trust 750 * ld not to do something random. 751 */ 752 #ifdef DEBUG 753 int mlist_hash_debug = 0; 754 #define MLIST_HASH(pp) (mlist_hash_debug ? &mml_table[0] : \ 755 &mml_table[((uintptr_t)(pp) + \ 756 ((uintptr_t)(pp) >> mml_shift)) & (mml_table_sz - 1)]) 757 #else /* !DEBUG */ 758 #define MLIST_HASH(pp) &mml_table[ \ 759 ((uintptr_t)(pp) + ((uintptr_t)(pp) >> mml_shift)) & (mml_table_sz - 1)] 760 #endif /* !DEBUG */ 761 762 kmutex_t *mml_table; 763 uint_t mml_table_sz; /* must be a power of 2 */ 764 uint_t mml_shift; /* log2(mml_table_sz) + 3 for align */ 765 766 kpm_hlk_t *kpmp_table; 767 uint_t kpmp_table_sz; /* must be a power of 2 */ 768 uchar_t kpmp_shift; 769 770 kpm_shlk_t *kpmp_stable; 771 uint_t kpmp_stable_sz; /* must be a power of 2 */ 772 773 /* 774 * SPL_HASH was improved to avoid false cache line sharing 775 */ 776 #define SPL_TABLE_SIZE 128 777 #define SPL_MASK (SPL_TABLE_SIZE - 1) 778 #define SPL_SHIFT 7 /* log2(SPL_TABLE_SIZE) */ 779 780 #define SPL_INDEX(pp) \ 781 ((((uintptr_t)(pp) >> SPL_SHIFT) ^ \ 782 ((uintptr_t)(pp) >> (SPL_SHIFT << 1))) & \ 783 (SPL_TABLE_SIZE - 1)) 784 785 #define SPL_HASH(pp) \ 786 (&sfmmu_page_lock[SPL_INDEX(pp) & SPL_MASK].pad_mutex) 787 788 static pad_mutex_t sfmmu_page_lock[SPL_TABLE_SIZE]; 789 790 791 /* 792 * hat_unload_callback() will group together callbacks in order 793 * to avoid xt_sync() calls. This is the maximum size of the group. 
794 */ 795 #define MAX_CB_ADDR 32 796 797 tte_t hw_tte; 798 static ulong_t sfmmu_dmr_maxbit = DMR_MAXBIT; 799 800 static char *mmu_ctx_kstat_names[] = { 801 "mmu_ctx_tsb_exceptions", 802 "mmu_ctx_tsb_raise_exception", 803 "mmu_ctx_wrap_around", 804 }; 805 806 /* 807 * Wrapper for vmem_xalloc since vmem_create only allows limited 808 * parameters for vm_source_alloc functions. This function allows us 809 * to specify alignment consistent with the size of the object being 810 * allocated. 811 */ 812 static void * 813 sfmmu_vmem_xalloc_aligned_wrapper(vmem_t *vmp, size_t size, int vmflag) 814 { 815 return (vmem_xalloc(vmp, size, size, 0, 0, NULL, NULL, vmflag)); 816 } 817 818 /* Common code for setting tsb_alloc_hiwater. */ 819 #define SFMMU_SET_TSB_ALLOC_HIWATER(pages) tsb_alloc_hiwater = \ 820 ptob(pages) / tsb_alloc_hiwater_factor 821 822 /* 823 * Set tsb_max_growsize to allow at most all of physical memory to be mapped by 824 * a single TSB. physmem is the number of physical pages so we need physmem 8K 825 * TTEs to represent all those physical pages. We round this up by using 826 * 1<<highbit(). To figure out which size code to use, remember that the size 827 * code is just an amount to shift the smallest TSB size to get the size of 828 * this TSB. So we subtract that size, TSB_START_SIZE, from highbit() (or 829 * highbit() - 1) to get the size code for the smallest TSB that can represent 830 * all of physical memory, while erring on the side of too much. 831 * 832 * Restrict tsb_max_growsize to make sure that: 833 * 1) TSBs can't grow larger than the TSB slab size 834 * 2) TSBs can't grow larger than UTSB_MAX_SZCODE. 835 */ 836 #define SFMMU_SET_TSB_MAX_GROWSIZE(pages) { \ 837 int _i, _szc, _slabszc, _tsbszc; \ 838 \ 839 _i = highbit(pages); \ 840 if ((1 << (_i - 1)) == (pages)) \ 841 _i--; /* 2^n case, round down */ \ 842 _szc = _i - TSB_START_SIZE; \ 843 _slabszc = bigtsb_slab_shift - (TSB_START_SIZE + TSB_ENTRY_SHIFT); \ 844 _tsbszc = MIN(_szc, _slabszc); \ 845 tsb_max_growsize = MIN(_tsbszc, UTSB_MAX_SZCODE); \ 846 } 847 848 /* 849 * Given a pointer to an sfmmu and a TTE size code, return a pointer to the 850 * tsb_info which handles that TTE size. 851 */ 852 #define SFMMU_GET_TSBINFO(tsbinfop, sfmmup, tte_szc) { \ 853 (tsbinfop) = (sfmmup)->sfmmu_tsb; \ 854 ASSERT(((tsbinfop)->tsb_flags & TSB_SHAREDCTX) || \ 855 sfmmu_hat_lock_held(sfmmup)); \ 856 if ((tte_szc) >= TTE4M) { \ 857 ASSERT((tsbinfop) != NULL); \ 858 (tsbinfop) = (tsbinfop)->tsb_next; \ 859 } \ 860 } 861 862 /* 863 * Macro to use to unload entries from the TSB. 864 * It has knowledge of which page sizes get replicated in the TSB 865 * and will call the appropriate unload routine for the appropriate size. 866 */ 867 #define SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, ismhat) \ 868 { \ 869 int ttesz = get_hblk_ttesz(hmeblkp); \ 870 if (ttesz == TTE8K || ttesz == TTE4M) { \ 871 sfmmu_unload_tsb(sfmmup, addr, ttesz); \ 872 } else { \ 873 caddr_t sva = ismhat ? addr : \ 874 (caddr_t)get_hblk_base(hmeblkp); \ 875 caddr_t eva = sva + get_hblk_span(hmeblkp); \ 876 ASSERT(addr >= sva && addr < eva); \ 877 sfmmu_unload_tsb_range(sfmmup, sva, eva, ttesz); \ 878 } \ 879 } 880 881 882 /* Update tsb_alloc_hiwater after memory is configured. */ 883 /*ARGSUSED*/ 884 static void 885 sfmmu_update_post_add(void *arg, pgcnt_t delta_pages) 886 { 887 /* Assumes physmem has already been updated. 
*/ 888 SFMMU_SET_TSB_ALLOC_HIWATER(physmem); 889 SFMMU_SET_TSB_MAX_GROWSIZE(physmem); 890 } 891 892 /* 893 * Update tsb_alloc_hiwater before memory is deleted. We'll do nothing here 894 * and update tsb_alloc_hiwater and tsb_max_growsize after the memory is 895 * deleted. 896 */ 897 /*ARGSUSED*/ 898 static int 899 sfmmu_update_pre_del(void *arg, pgcnt_t delta_pages) 900 { 901 return (0); 902 } 903 904 /* Update tsb_alloc_hiwater after memory fails to be unconfigured. */ 905 /*ARGSUSED*/ 906 static void 907 sfmmu_update_post_del(void *arg, pgcnt_t delta_pages, int cancelled) 908 { 909 /* 910 * Whether the delete was cancelled or not, just go ahead and update 911 * tsb_alloc_hiwater and tsb_max_growsize. 912 */ 913 SFMMU_SET_TSB_ALLOC_HIWATER(physmem); 914 SFMMU_SET_TSB_MAX_GROWSIZE(physmem); 915 } 916 917 static kphysm_setup_vector_t sfmmu_update_vec = { 918 KPHYSM_SETUP_VECTOR_VERSION, /* version */ 919 sfmmu_update_post_add, /* post_add */ 920 sfmmu_update_pre_del, /* pre_del */ 921 sfmmu_update_post_del /* post_del */ 922 }; 923 924 925 /* 926 * HME_BLK HASH PRIMITIVES 927 */ 928 929 /* 930 * Enter a hme on the mapping list for page pp. 931 * When large pages are more prevalent in the system we might want to 932 * keep the mapping list in ascending order by the hment size. For now, 933 * small pages are more frequent, so don't slow it down. 934 */ 935 #define HME_ADD(hme, pp) \ 936 { \ 937 ASSERT(sfmmu_mlist_held(pp)); \ 938 \ 939 hme->hme_prev = NULL; \ 940 hme->hme_next = pp->p_mapping; \ 941 hme->hme_page = pp; \ 942 if (pp->p_mapping) { \ 943 ((struct sf_hment *)(pp->p_mapping))->hme_prev = hme;\ 944 ASSERT(pp->p_share > 0); \ 945 } else { \ 946 /* EMPTY */ \ 947 ASSERT(pp->p_share == 0); \ 948 } \ 949 pp->p_mapping = hme; \ 950 pp->p_share++; \ 951 } 952 953 /* 954 * Enter a hme on the mapping list for page pp. 955 * If we are unmapping a large translation, we need to make sure that the 956 * change is reflect in the corresponding bit of the p_index field. 957 */ 958 #define HME_SUB(hme, pp) \ 959 { \ 960 ASSERT(sfmmu_mlist_held(pp)); \ 961 ASSERT(hme->hme_page == pp || IS_PAHME(hme)); \ 962 \ 963 if (pp->p_mapping == NULL) { \ 964 panic("hme_remove - no mappings"); \ 965 } \ 966 \ 967 membar_stst(); /* ensure previous stores finish */ \ 968 \ 969 ASSERT(pp->p_share > 0); \ 970 pp->p_share--; \ 971 \ 972 if (hme->hme_prev) { \ 973 ASSERT(pp->p_mapping != hme); \ 974 ASSERT(hme->hme_prev->hme_page == pp || \ 975 IS_PAHME(hme->hme_prev)); \ 976 hme->hme_prev->hme_next = hme->hme_next; \ 977 } else { \ 978 ASSERT(pp->p_mapping == hme); \ 979 pp->p_mapping = hme->hme_next; \ 980 ASSERT((pp->p_mapping == NULL) ? \ 981 (pp->p_share == 0) : 1); \ 982 } \ 983 \ 984 if (hme->hme_next) { \ 985 ASSERT(hme->hme_next->hme_page == pp || \ 986 IS_PAHME(hme->hme_next)); \ 987 hme->hme_next->hme_prev = hme->hme_prev; \ 988 } \ 989 \ 990 /* zero out the entry */ \ 991 hme->hme_next = NULL; \ 992 hme->hme_prev = NULL; \ 993 hme->hme_page = NULL; \ 994 \ 995 if (hme_size(hme) > TTE8K) { \ 996 /* remove mappings for remainder of large pg */ \ 997 sfmmu_rm_large_mappings(pp, hme_size(hme)); \ 998 } \ 999 } 1000 1001 /* 1002 * This function returns the hment given the hme_blk and a vaddr. 1003 * It assumes addr has already been checked to belong to hme_blk's 1004 * range. 
1005 */ 1006 #define HBLKTOHME(hment, hmeblkp, addr) \ 1007 { \ 1008 int index; \ 1009 HBLKTOHME_IDX(hment, hmeblkp, addr, index) \ 1010 } 1011 1012 /* 1013 * Version of HBLKTOHME that also returns the index in hmeblkp 1014 * of the hment. 1015 */ 1016 #define HBLKTOHME_IDX(hment, hmeblkp, addr, idx) \ 1017 { \ 1018 ASSERT(in_hblk_range((hmeblkp), (addr))); \ 1019 \ 1020 if (get_hblk_ttesz(hmeblkp) == TTE8K) { \ 1021 idx = (((uintptr_t)(addr) >> MMU_PAGESHIFT) & (NHMENTS-1)); \ 1022 } else \ 1023 idx = 0; \ 1024 \ 1025 (hment) = &(hmeblkp)->hblk_hme[idx]; \ 1026 } 1027 1028 /* 1029 * Disable any page sizes not supported by the CPU 1030 */ 1031 void 1032 hat_init_pagesizes() 1033 { 1034 int i; 1035 1036 mmu_exported_page_sizes = 0; 1037 for (i = TTE8K; i < max_mmu_page_sizes; i++) { 1038 1039 szc_2_userszc[i] = (uint_t)-1; 1040 userszc_2_szc[i] = (uint_t)-1; 1041 1042 if ((mmu_exported_pagesize_mask & (1 << i)) == 0) { 1043 disable_large_pages |= (1 << i); 1044 } else { 1045 szc_2_userszc[i] = mmu_exported_page_sizes; 1046 userszc_2_szc[mmu_exported_page_sizes] = i; 1047 mmu_exported_page_sizes++; 1048 } 1049 } 1050 1051 disable_ism_large_pages |= disable_large_pages; 1052 disable_auto_data_large_pages = disable_large_pages; 1053 disable_auto_text_large_pages = disable_large_pages; 1054 1055 /* 1056 * Initialize mmu-specific large page sizes. 1057 */ 1058 if (&mmu_large_pages_disabled) { 1059 disable_large_pages |= mmu_large_pages_disabled(HAT_LOAD); 1060 disable_ism_large_pages |= 1061 mmu_large_pages_disabled(HAT_LOAD_SHARE); 1062 disable_auto_data_large_pages |= 1063 mmu_large_pages_disabled(HAT_AUTO_DATA); 1064 disable_auto_text_large_pages |= 1065 mmu_large_pages_disabled(HAT_AUTO_TEXT); 1066 } 1067 } 1068 1069 /* 1070 * Initialize the hardware address translation structures. 1071 */ 1072 void 1073 hat_init(void) 1074 { 1075 int i; 1076 uint_t sz; 1077 size_t size; 1078 1079 hat_lock_init(); 1080 hat_kstat_init(); 1081 1082 /* 1083 * Hardware-only bits in a TTE 1084 */ 1085 MAKE_TTE_MASK(&hw_tte); 1086 1087 hat_init_pagesizes(); 1088 1089 /* Initialize the hash locks */ 1090 for (i = 0; i < khmehash_num; i++) { 1091 mutex_init(&khme_hash[i].hmehash_mutex, NULL, 1092 MUTEX_DEFAULT, NULL); 1093 } 1094 for (i = 0; i < uhmehash_num; i++) { 1095 mutex_init(&uhme_hash[i].hmehash_mutex, NULL, 1096 MUTEX_DEFAULT, NULL); 1097 } 1098 khmehash_num--; /* make sure counter starts from 0 */ 1099 uhmehash_num--; /* make sure counter starts from 0 */ 1100 1101 /* 1102 * Allocate context domain structures. 1103 * 1104 * A platform may choose to modify max_mmu_ctxdoms in 1105 * set_platform_defaults(). If a platform does not define 1106 * a set_platform_defaults() or does not choose to modify 1107 * max_mmu_ctxdoms, it gets one MMU context domain for every CPU. 1108 * 1109 * For sun4v, there will be one global context domain, this is to 1110 * avoid the ldom cpu substitution problem. 1111 * 1112 * For all platforms that have CPUs sharing MMUs, this 1113 * value must be defined. 1114 */ 1115 if (max_mmu_ctxdoms == 0) { 1116 #ifndef sun4v 1117 max_mmu_ctxdoms = max_ncpus; 1118 #else /* sun4v */ 1119 max_mmu_ctxdoms = 1; 1120 #endif /* sun4v */ 1121 } 1122 1123 size = max_mmu_ctxdoms * sizeof (mmu_ctx_t *); 1124 mmu_ctxs_tbl = kmem_zalloc(size, KM_SLEEP); 1125 1126 /* mmu_ctx_t is 64 bytes aligned */ 1127 mmuctxdom_cache = kmem_cache_create("mmuctxdom_cache", 1128 sizeof (mmu_ctx_t), 64, NULL, NULL, NULL, NULL, NULL, 0); 1129 /* 1130 * MMU context domain initialization for the Boot CPU. 
1131 * This needs the context domains array allocated above. 1132 */ 1133 mutex_enter(&cpu_lock); 1134 sfmmu_cpu_init(CPU); 1135 mutex_exit(&cpu_lock); 1136 1137 /* 1138 * Intialize ism mapping list lock. 1139 */ 1140 1141 mutex_init(&ism_mlist_lock, NULL, MUTEX_DEFAULT, NULL); 1142 1143 /* 1144 * Each sfmmu structure carries an array of MMU context info 1145 * structures, one per context domain. The size of this array depends 1146 * on the maximum number of context domains. So, the size of the 1147 * sfmmu structure varies per platform. 1148 * 1149 * sfmmu is allocated from static arena, because trap 1150 * handler at TL > 0 is not allowed to touch kernel relocatable 1151 * memory. sfmmu's alignment is changed to 64 bytes from 1152 * default 8 bytes, as the lower 6 bits will be used to pass 1153 * pgcnt to vtag_flush_pgcnt_tl1. 1154 */ 1155 size = sizeof (sfmmu_t) + sizeof (sfmmu_ctx_t) * (max_mmu_ctxdoms - 1); 1156 1157 sfmmuid_cache = kmem_cache_create("sfmmuid_cache", size, 1158 64, sfmmu_idcache_constructor, sfmmu_idcache_destructor, 1159 NULL, NULL, static_arena, 0); 1160 1161 sfmmu_tsbinfo_cache = kmem_cache_create("sfmmu_tsbinfo_cache", 1162 sizeof (struct tsb_info), 0, NULL, NULL, NULL, NULL, NULL, 0); 1163 1164 /* 1165 * Since we only use the tsb8k cache to "borrow" pages for TSBs 1166 * from the heap when low on memory or when TSB_FORCEALLOC is 1167 * specified, don't use magazines to cache them--we want to return 1168 * them to the system as quickly as possible. 1169 */ 1170 sfmmu_tsb8k_cache = kmem_cache_create("sfmmu_tsb8k_cache", 1171 MMU_PAGESIZE, MMU_PAGESIZE, NULL, NULL, NULL, NULL, 1172 static_arena, KMC_NOMAGAZINE); 1173 1174 /* 1175 * Set tsb_alloc_hiwater to 1/tsb_alloc_hiwater_factor of physical 1176 * memory, which corresponds to the old static reserve for TSBs. 1177 * tsb_alloc_hiwater_factor defaults to 32. This caps the amount of 1178 * memory we'll allocate for TSB slabs; beyond this point TSB 1179 * allocations will be taken from the kernel heap (via 1180 * sfmmu_tsb8k_cache) and will be throttled as would any other kmem 1181 * consumer. 1182 */ 1183 if (tsb_alloc_hiwater_factor == 0) { 1184 tsb_alloc_hiwater_factor = TSB_ALLOC_HIWATER_FACTOR_DEFAULT; 1185 } 1186 SFMMU_SET_TSB_ALLOC_HIWATER(physmem); 1187 1188 for (sz = tsb_slab_ttesz; sz > 0; sz--) { 1189 if (!(disable_large_pages & (1 << sz))) 1190 break; 1191 } 1192 1193 if (sz < tsb_slab_ttesz) { 1194 tsb_slab_ttesz = sz; 1195 tsb_slab_shift = MMU_PAGESHIFT + (sz << 1) + sz; 1196 tsb_slab_size = 1 << tsb_slab_shift; 1197 tsb_slab_mask = (1 << (tsb_slab_shift - MMU_PAGESHIFT)) - 1; 1198 use_bigtsb_arena = 0; 1199 } else if (use_bigtsb_arena && 1200 (disable_large_pages & (1 << bigtsb_slab_ttesz))) { 1201 use_bigtsb_arena = 0; 1202 } 1203 1204 if (!use_bigtsb_arena) { 1205 bigtsb_slab_shift = tsb_slab_shift; 1206 } 1207 SFMMU_SET_TSB_MAX_GROWSIZE(physmem); 1208 1209 /* 1210 * On smaller memory systems, allocate TSB memory in smaller chunks 1211 * than the default 4M slab size. We also honor disable_large_pages 1212 * here. 1213 * 1214 * The trap handlers need to be patched with the final slab shift, 1215 * since they need to be able to construct the TSB pointer at runtime. 
1216 */ 1217 if ((tsb_max_growsize <= TSB_512K_SZCODE) && 1218 !(disable_large_pages & (1 << TTE512K))) { 1219 tsb_slab_ttesz = TTE512K; 1220 tsb_slab_shift = MMU_PAGESHIFT512K; 1221 tsb_slab_size = MMU_PAGESIZE512K; 1222 tsb_slab_mask = MMU_PAGEOFFSET512K >> MMU_PAGESHIFT; 1223 use_bigtsb_arena = 0; 1224 } 1225 1226 if (!use_bigtsb_arena) { 1227 bigtsb_slab_ttesz = tsb_slab_ttesz; 1228 bigtsb_slab_shift = tsb_slab_shift; 1229 bigtsb_slab_size = tsb_slab_size; 1230 bigtsb_slab_mask = tsb_slab_mask; 1231 } 1232 1233 1234 /* 1235 * Set up memory callback to update tsb_alloc_hiwater and 1236 * tsb_max_growsize. 1237 */ 1238 i = kphysm_setup_func_register(&sfmmu_update_vec, (void *) 0); 1239 ASSERT(i == 0); 1240 1241 /* 1242 * kmem_tsb_arena is the source from which large TSB slabs are 1243 * drawn. The quantum of this arena corresponds to the largest 1244 * TSB size we can dynamically allocate for user processes. 1245 * Currently it must also be a supported page size since we 1246 * use exactly one translation entry to map each slab page. 1247 * 1248 * The per-lgroup kmem_tsb_default_arena arenas are the arenas from 1249 * which most TSBs are allocated. Since most TSB allocations are 1250 * typically 8K we have a kmem cache we stack on top of each 1251 * kmem_tsb_default_arena to speed up those allocations. 1252 * 1253 * Note the two-level scheme of arenas is required only 1254 * because vmem_create doesn't allow us to specify alignment 1255 * requirements. If this ever changes the code could be 1256 * simplified to use only one level of arenas. 1257 * 1258 * If 256M page support exists on sun4v, 256MB kmem_bigtsb_arena 1259 * will be provided in addition to the 4M kmem_tsb_arena. 1260 */ 1261 if (use_bigtsb_arena) { 1262 kmem_bigtsb_arena = vmem_create("kmem_bigtsb", NULL, 0, 1263 bigtsb_slab_size, sfmmu_vmem_xalloc_aligned_wrapper, 1264 vmem_xfree, heap_arena, 0, VM_SLEEP); 1265 } 1266 1267 kmem_tsb_arena = vmem_create("kmem_tsb", NULL, 0, tsb_slab_size, 1268 sfmmu_vmem_xalloc_aligned_wrapper, 1269 vmem_xfree, heap_arena, 0, VM_SLEEP); 1270 1271 if (tsb_lgrp_affinity) { 1272 char s[50]; 1273 for (i = 0; i < NLGRPS_MAX; i++) { 1274 if (use_bigtsb_arena) { 1275 (void) sprintf(s, "kmem_bigtsb_lgrp%d", i); 1276 kmem_bigtsb_default_arena[i] = vmem_create(s, 1277 NULL, 0, 2 * tsb_slab_size, 1278 sfmmu_tsb_segkmem_alloc, 1279 sfmmu_tsb_segkmem_free, kmem_bigtsb_arena, 1280 0, VM_SLEEP | VM_BESTFIT); 1281 } 1282 1283 (void) sprintf(s, "kmem_tsb_lgrp%d", i); 1284 kmem_tsb_default_arena[i] = vmem_create(s, 1285 NULL, 0, PAGESIZE, sfmmu_tsb_segkmem_alloc, 1286 sfmmu_tsb_segkmem_free, kmem_tsb_arena, 0, 1287 VM_SLEEP | VM_BESTFIT); 1288 1289 (void) sprintf(s, "sfmmu_tsb_lgrp%d_cache", i); 1290 sfmmu_tsb_cache[i] = kmem_cache_create(s, 1291 PAGESIZE, PAGESIZE, NULL, NULL, NULL, NULL, 1292 kmem_tsb_default_arena[i], 0); 1293 } 1294 } else { 1295 if (use_bigtsb_arena) { 1296 kmem_bigtsb_default_arena[0] = 1297 vmem_create("kmem_bigtsb_default", NULL, 0, 1298 2 * tsb_slab_size, sfmmu_tsb_segkmem_alloc, 1299 sfmmu_tsb_segkmem_free, kmem_bigtsb_arena, 0, 1300 VM_SLEEP | VM_BESTFIT); 1301 } 1302 1303 kmem_tsb_default_arena[0] = vmem_create("kmem_tsb_default", 1304 NULL, 0, PAGESIZE, sfmmu_tsb_segkmem_alloc, 1305 sfmmu_tsb_segkmem_free, kmem_tsb_arena, 0, 1306 VM_SLEEP | VM_BESTFIT); 1307 sfmmu_tsb_cache[0] = kmem_cache_create("sfmmu_tsb_cache", 1308 PAGESIZE, PAGESIZE, NULL, NULL, NULL, NULL, 1309 kmem_tsb_default_arena[0], 0); 1310 } 1311 1312 sfmmu8_cache = kmem_cache_create("sfmmu8_cache", HME8BLK_SZ, 1313 
HMEBLK_ALIGN, sfmmu_hblkcache_constructor, 1314 sfmmu_hblkcache_destructor, 1315 sfmmu_hblkcache_reclaim, (void *)HME8BLK_SZ, 1316 hat_memload_arena, KMC_NOHASH); 1317 1318 hat_memload1_arena = vmem_create("hat_memload1", NULL, 0, PAGESIZE, 1319 segkmem_alloc_permanent, segkmem_free, heap_arena, 0, VM_SLEEP); 1320 1321 sfmmu1_cache = kmem_cache_create("sfmmu1_cache", HME1BLK_SZ, 1322 HMEBLK_ALIGN, sfmmu_hblkcache_constructor, 1323 sfmmu_hblkcache_destructor, 1324 NULL, (void *)HME1BLK_SZ, 1325 hat_memload1_arena, KMC_NOHASH); 1326 1327 pa_hment_cache = kmem_cache_create("pa_hment_cache", PAHME_SZ, 1328 0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH); 1329 1330 ism_blk_cache = kmem_cache_create("ism_blk_cache", 1331 sizeof (ism_blk_t), ecache_alignsize, NULL, NULL, 1332 NULL, NULL, static_arena, KMC_NOHASH); 1333 1334 ism_ment_cache = kmem_cache_create("ism_ment_cache", 1335 sizeof (ism_ment_t), 0, NULL, NULL, 1336 NULL, NULL, NULL, 0); 1337 1338 /* 1339 * We grab the first hat for the kernel, 1340 */ 1341 AS_LOCK_ENTER(&kas, &kas.a_lock, RW_WRITER); 1342 kas.a_hat = hat_alloc(&kas); 1343 AS_LOCK_EXIT(&kas, &kas.a_lock); 1344 1345 /* 1346 * Initialize hblk_reserve. 1347 */ 1348 ((struct hme_blk *)hblk_reserve)->hblk_nextpa = 1349 va_to_pa((caddr_t)hblk_reserve); 1350 1351 #ifndef UTSB_PHYS 1352 /* 1353 * Reserve some kernel virtual address space for the locked TTEs 1354 * that allow us to probe the TSB from TL>0. 1355 */ 1356 utsb_vabase = vmem_xalloc(heap_arena, tsb_slab_size, tsb_slab_size, 1357 0, 0, NULL, NULL, VM_SLEEP); 1358 utsb4m_vabase = vmem_xalloc(heap_arena, tsb_slab_size, tsb_slab_size, 1359 0, 0, NULL, NULL, VM_SLEEP); 1360 #endif 1361 1362 #ifdef VAC 1363 /* 1364 * The big page VAC handling code assumes VAC 1365 * will not be bigger than the smallest big 1366 * page- which is 64K. 1367 */ 1368 if (TTEPAGES(TTE64K) < CACHE_NUM_COLOR) { 1369 cmn_err(CE_PANIC, "VAC too big!"); 1370 } 1371 #endif 1372 1373 (void) xhat_init(); 1374 1375 uhme_hash_pa = va_to_pa(uhme_hash); 1376 khme_hash_pa = va_to_pa(khme_hash); 1377 1378 /* 1379 * Initialize relocation locks. kpr_suspendlock is held 1380 * at PIL_MAX to prevent interrupts from pinning the holder 1381 * of a suspended TTE which may access it leading to a 1382 * deadlock condition. 1383 */ 1384 mutex_init(&kpr_mutex, NULL, MUTEX_DEFAULT, NULL); 1385 mutex_init(&kpr_suspendlock, NULL, MUTEX_SPIN, (void *)PIL_MAX); 1386 1387 /* 1388 * If Shared context support is disabled via /etc/system 1389 * set shctx_on to 0 here if it was set to 1 earlier in boot 1390 * sequence by cpu module initialization code. 1391 */ 1392 if (shctx_on && disable_shctx) { 1393 shctx_on = 0; 1394 } 1395 1396 if (shctx_on) { 1397 srd_buckets = kmem_zalloc(SFMMU_MAX_SRD_BUCKETS * 1398 sizeof (srd_buckets[0]), KM_SLEEP); 1399 for (i = 0; i < SFMMU_MAX_SRD_BUCKETS; i++) { 1400 mutex_init(&srd_buckets[i].srdb_lock, NULL, 1401 MUTEX_DEFAULT, NULL); 1402 } 1403 1404 srd_cache = kmem_cache_create("srd_cache", sizeof (sf_srd_t), 1405 0, sfmmu_srdcache_constructor, sfmmu_srdcache_destructor, 1406 NULL, NULL, NULL, 0); 1407 region_cache = kmem_cache_create("region_cache", 1408 sizeof (sf_region_t), 0, sfmmu_rgncache_constructor, 1409 sfmmu_rgncache_destructor, NULL, NULL, NULL, 0); 1410 scd_cache = kmem_cache_create("scd_cache", sizeof (sf_scd_t), 1411 0, sfmmu_scdcache_constructor, sfmmu_scdcache_destructor, 1412 NULL, NULL, NULL, 0); 1413 } 1414 1415 /* 1416 * Pre-allocate hrm_hashtab before enabling the collection of 1417 * refmod statistics. 
Allocating on the fly would mean us 1418 * running the risk of suffering recursive mutex enters or 1419 * deadlocks. 1420 */ 1421 hrm_hashtab = kmem_zalloc(HRM_HASHSIZE * sizeof (struct hrmstat *), 1422 KM_SLEEP); 1423 } 1424 1425 /* 1426 * Initialize locking for the hat layer, called early during boot. 1427 */ 1428 static void 1429 hat_lock_init() 1430 { 1431 int i; 1432 1433 /* 1434 * initialize the array of mutexes protecting a page's mapping 1435 * list and p_nrm field. 1436 */ 1437 for (i = 0; i < mml_table_sz; i++) 1438 mutex_init(&mml_table[i], NULL, MUTEX_DEFAULT, NULL); 1439 1440 if (kpm_enable) { 1441 for (i = 0; i < kpmp_table_sz; i++) { 1442 mutex_init(&kpmp_table[i].khl_mutex, NULL, 1443 MUTEX_DEFAULT, NULL); 1444 } 1445 } 1446 1447 /* 1448 * Initialize array of mutex locks that protects sfmmu fields and 1449 * TSB lists. 1450 */ 1451 for (i = 0; i < SFMMU_NUM_LOCK; i++) 1452 mutex_init(HATLOCK_MUTEXP(&hat_lock[i]), NULL, MUTEX_DEFAULT, 1453 NULL); 1454 } 1455 1456 #define SFMMU_KERNEL_MAXVA \ 1457 (kmem64_base ? (uintptr_t)kmem64_end : (SYSLIMIT)) 1458 1459 /* 1460 * Allocate a hat structure. 1461 * Called when an address space first uses a hat. 1462 */ 1463 struct hat * 1464 hat_alloc(struct as *as) 1465 { 1466 sfmmu_t *sfmmup; 1467 int i; 1468 uint64_t cnum; 1469 extern uint_t get_color_start(struct as *); 1470 1471 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 1472 sfmmup = kmem_cache_alloc(sfmmuid_cache, KM_SLEEP); 1473 sfmmup->sfmmu_as = as; 1474 sfmmup->sfmmu_flags = 0; 1475 sfmmup->sfmmu_tteflags = 0; 1476 sfmmup->sfmmu_rtteflags = 0; 1477 LOCK_INIT_CLEAR(&sfmmup->sfmmu_ctx_lock); 1478 1479 if (as == &kas) { 1480 ksfmmup = sfmmup; 1481 sfmmup->sfmmu_cext = 0; 1482 cnum = KCONTEXT; 1483 1484 sfmmup->sfmmu_clrstart = 0; 1485 sfmmup->sfmmu_tsb = NULL; 1486 /* 1487 * hat_kern_setup() will call sfmmu_init_ktsbinfo() 1488 * to setup tsb_info for ksfmmup. 1489 */ 1490 } else { 1491 1492 /* 1493 * Just set to invalid ctx. When it faults, it will 1494 * get a valid ctx. This would avoid the situation 1495 * where we get a ctx, but it gets stolen and then 1496 * we fault when we try to run and so have to get 1497 * another ctx. 
1498 */ 1499 sfmmup->sfmmu_cext = 0; 1500 cnum = INVALID_CONTEXT; 1501 1502 /* initialize original physical page coloring bin */ 1503 sfmmup->sfmmu_clrstart = get_color_start(as); 1504 #ifdef DEBUG 1505 if (tsb_random_size) { 1506 uint32_t randval = (uint32_t)gettick() >> 4; 1507 int size = randval % (tsb_max_growsize + 1); 1508 1509 /* chose a random tsb size for stress testing */ 1510 (void) sfmmu_tsbinfo_alloc(&sfmmup->sfmmu_tsb, size, 1511 TSB8K|TSB64K|TSB512K, 0, sfmmup); 1512 } else 1513 #endif /* DEBUG */ 1514 (void) sfmmu_tsbinfo_alloc(&sfmmup->sfmmu_tsb, 1515 default_tsb_size, 1516 TSB8K|TSB64K|TSB512K, 0, sfmmup); 1517 sfmmup->sfmmu_flags = HAT_SWAPPED | HAT_ALLCTX_INVALID; 1518 ASSERT(sfmmup->sfmmu_tsb != NULL); 1519 } 1520 1521 ASSERT(max_mmu_ctxdoms > 0); 1522 for (i = 0; i < max_mmu_ctxdoms; i++) { 1523 sfmmup->sfmmu_ctxs[i].cnum = cnum; 1524 sfmmup->sfmmu_ctxs[i].gnum = 0; 1525 } 1526 1527 for (i = 0; i < max_mmu_page_sizes; i++) { 1528 sfmmup->sfmmu_ttecnt[i] = 0; 1529 sfmmup->sfmmu_scdrttecnt[i] = 0; 1530 sfmmup->sfmmu_ismttecnt[i] = 0; 1531 sfmmup->sfmmu_scdismttecnt[i] = 0; 1532 sfmmup->sfmmu_pgsz[i] = TTE8K; 1533 } 1534 sfmmup->sfmmu_tsb0_4minflcnt = 0; 1535 sfmmup->sfmmu_iblk = NULL; 1536 sfmmup->sfmmu_ismhat = 0; 1537 sfmmup->sfmmu_scdhat = 0; 1538 sfmmup->sfmmu_ismblkpa = (uint64_t)-1; 1539 if (sfmmup == ksfmmup) { 1540 CPUSET_ALL(sfmmup->sfmmu_cpusran); 1541 } else { 1542 CPUSET_ZERO(sfmmup->sfmmu_cpusran); 1543 } 1544 sfmmup->sfmmu_free = 0; 1545 sfmmup->sfmmu_rmstat = 0; 1546 sfmmup->sfmmu_clrbin = sfmmup->sfmmu_clrstart; 1547 sfmmup->sfmmu_xhat_provider = NULL; 1548 cv_init(&sfmmup->sfmmu_tsb_cv, NULL, CV_DEFAULT, NULL); 1549 sfmmup->sfmmu_srdp = NULL; 1550 SF_RGNMAP_ZERO(sfmmup->sfmmu_region_map); 1551 bzero(sfmmup->sfmmu_hmeregion_links, SFMMU_L1_HMERLINKS_SIZE); 1552 sfmmup->sfmmu_scdp = NULL; 1553 sfmmup->sfmmu_scd_link.next = NULL; 1554 sfmmup->sfmmu_scd_link.prev = NULL; 1555 return (sfmmup); 1556 } 1557 1558 /* 1559 * Create per-MMU context domain kstats for a given MMU ctx. 1560 */ 1561 static void 1562 sfmmu_mmu_kstat_create(mmu_ctx_t *mmu_ctxp) 1563 { 1564 mmu_ctx_stat_t stat; 1565 kstat_t *mmu_kstat; 1566 1567 ASSERT(MUTEX_HELD(&cpu_lock)); 1568 ASSERT(mmu_ctxp->mmu_kstat == NULL); 1569 1570 mmu_kstat = kstat_create("unix", mmu_ctxp->mmu_idx, "mmu_ctx", 1571 "hat", KSTAT_TYPE_NAMED, MMU_CTX_NUM_STATS, KSTAT_FLAG_VIRTUAL); 1572 1573 if (mmu_kstat == NULL) { 1574 cmn_err(CE_WARN, "kstat_create for MMU %d failed", 1575 mmu_ctxp->mmu_idx); 1576 } else { 1577 mmu_kstat->ks_data = mmu_ctxp->mmu_kstat_data; 1578 for (stat = 0; stat < MMU_CTX_NUM_STATS; stat++) 1579 kstat_named_init(&mmu_ctxp->mmu_kstat_data[stat], 1580 mmu_ctx_kstat_names[stat], KSTAT_DATA_INT64); 1581 mmu_ctxp->mmu_kstat = mmu_kstat; 1582 kstat_install(mmu_kstat); 1583 } 1584 } 1585 1586 /* 1587 * plat_cpuid_to_mmu_ctx_info() is a platform interface that returns MMU 1588 * context domain information for a given CPU. If a platform does not 1589 * specify that interface, then the function below is used instead to return 1590 * default information. The defaults are as follows: 1591 * 1592 * - For sun4u systems there's one MMU context domain per CPU. 1593 * This default is used by all sun4u systems except OPL. OPL systems 1594 * provide platform specific interface to map CPU ids to MMU ids 1595 * because on OPL more than 1 CPU shares a single MMU. 1596 * Note that on sun4v, there is one global context domain for 1597 * the entire system. 
This is to avoid running into potential problem 1598 * with ldom physical cpu substitution feature. 1599 * - The number of MMU context IDs supported on any CPU in the 1600 * system is 8K. 1601 */ 1602 /*ARGSUSED*/ 1603 static void 1604 sfmmu_cpuid_to_mmu_ctx_info(processorid_t cpuid, mmu_ctx_info_t *infop) 1605 { 1606 infop->mmu_nctxs = nctxs; 1607 #ifndef sun4v 1608 infop->mmu_idx = cpu[cpuid]->cpu_seqid; 1609 #else /* sun4v */ 1610 infop->mmu_idx = 0; 1611 #endif /* sun4v */ 1612 } 1613 1614 /* 1615 * Called during CPU initialization to set the MMU context-related information 1616 * for a CPU. 1617 * 1618 * cpu_lock serializes accesses to mmu_ctxs and mmu_saved_gnum. 1619 */ 1620 void 1621 sfmmu_cpu_init(cpu_t *cp) 1622 { 1623 mmu_ctx_info_t info; 1624 mmu_ctx_t *mmu_ctxp; 1625 1626 ASSERT(MUTEX_HELD(&cpu_lock)); 1627 1628 if (&plat_cpuid_to_mmu_ctx_info == NULL) 1629 sfmmu_cpuid_to_mmu_ctx_info(cp->cpu_id, &info); 1630 else 1631 plat_cpuid_to_mmu_ctx_info(cp->cpu_id, &info); 1632 1633 ASSERT(info.mmu_idx < max_mmu_ctxdoms); 1634 1635 if ((mmu_ctxp = mmu_ctxs_tbl[info.mmu_idx]) == NULL) { 1636 /* Each mmu_ctx is cacheline aligned. */ 1637 mmu_ctxp = kmem_cache_alloc(mmuctxdom_cache, KM_SLEEP); 1638 bzero(mmu_ctxp, sizeof (mmu_ctx_t)); 1639 1640 mutex_init(&mmu_ctxp->mmu_lock, NULL, MUTEX_SPIN, 1641 (void *)ipltospl(DISP_LEVEL)); 1642 mmu_ctxp->mmu_idx = info.mmu_idx; 1643 mmu_ctxp->mmu_nctxs = info.mmu_nctxs; 1644 /* 1645 * Globally for lifetime of a system, 1646 * gnum must always increase. 1647 * mmu_saved_gnum is protected by the cpu_lock. 1648 */ 1649 mmu_ctxp->mmu_gnum = mmu_saved_gnum + 1; 1650 mmu_ctxp->mmu_cnum = NUM_LOCKED_CTXS; 1651 1652 sfmmu_mmu_kstat_create(mmu_ctxp); 1653 1654 mmu_ctxs_tbl[info.mmu_idx] = mmu_ctxp; 1655 } else { 1656 ASSERT(mmu_ctxp->mmu_idx == info.mmu_idx); 1657 } 1658 1659 /* 1660 * The mmu_lock is acquired here to prevent races with 1661 * the wrap-around code. 1662 */ 1663 mutex_enter(&mmu_ctxp->mmu_lock); 1664 1665 1666 mmu_ctxp->mmu_ncpus++; 1667 CPUSET_ADD(mmu_ctxp->mmu_cpuset, cp->cpu_id); 1668 CPU_MMU_IDX(cp) = info.mmu_idx; 1669 CPU_MMU_CTXP(cp) = mmu_ctxp; 1670 1671 mutex_exit(&mmu_ctxp->mmu_lock); 1672 } 1673 1674 /* 1675 * Called to perform MMU context-related cleanup for a CPU. 1676 */ 1677 void 1678 sfmmu_cpu_cleanup(cpu_t *cp) 1679 { 1680 mmu_ctx_t *mmu_ctxp; 1681 1682 ASSERT(MUTEX_HELD(&cpu_lock)); 1683 1684 mmu_ctxp = CPU_MMU_CTXP(cp); 1685 ASSERT(mmu_ctxp != NULL); 1686 1687 /* 1688 * The mmu_lock is acquired here to prevent races with 1689 * the wrap-around code. 1690 */ 1691 mutex_enter(&mmu_ctxp->mmu_lock); 1692 1693 CPU_MMU_CTXP(cp) = NULL; 1694 1695 CPUSET_DEL(mmu_ctxp->mmu_cpuset, cp->cpu_id); 1696 if (--mmu_ctxp->mmu_ncpus == 0) { 1697 mmu_ctxs_tbl[mmu_ctxp->mmu_idx] = NULL; 1698 mutex_exit(&mmu_ctxp->mmu_lock); 1699 mutex_destroy(&mmu_ctxp->mmu_lock); 1700 1701 if (mmu_ctxp->mmu_kstat) 1702 kstat_delete(mmu_ctxp->mmu_kstat); 1703 1704 /* mmu_saved_gnum is protected by the cpu_lock. */ 1705 if (mmu_saved_gnum < mmu_ctxp->mmu_gnum) 1706 mmu_saved_gnum = mmu_ctxp->mmu_gnum; 1707 1708 kmem_cache_free(mmuctxdom_cache, mmu_ctxp); 1709 1710 return; 1711 } 1712 1713 mutex_exit(&mmu_ctxp->mmu_lock); 1714 } 1715 1716 /* 1717 * Hat_setup, makes an address space context the current active one. 1718 * In sfmmu this translates to setting the secondary context with the 1719 * corresponding context. 1720 */ 1721 void 1722 hat_setup(struct hat *sfmmup, int allocflag) 1723 { 1724 hatlock_t *hatlockp; 1725 1726 /* Init needs some special treatment. 
*/ 1727 if (allocflag == HAT_INIT) { 1728 /* 1729 * Make sure that we have 1730 * 1. a TSB 1731 * 2. a valid ctx that doesn't get stolen after this point. 1732 */ 1733 hatlockp = sfmmu_hat_enter(sfmmup); 1734 1735 /* 1736 * Swap in the TSB. hat_init() allocates tsbinfos without 1737 * TSBs, but we need one for init, since the kernel does some 1738 * special things to set up its stack and needs the TSB to 1739 * resolve page faults. 1740 */ 1741 sfmmu_tsb_swapin(sfmmup, hatlockp); 1742 1743 sfmmu_get_ctx(sfmmup); 1744 1745 sfmmu_hat_exit(hatlockp); 1746 } else { 1747 ASSERT(allocflag == HAT_ALLOC); 1748 1749 hatlockp = sfmmu_hat_enter(sfmmup); 1750 kpreempt_disable(); 1751 1752 CPUSET_ADD(sfmmup->sfmmu_cpusran, CPU->cpu_id); 1753 /* 1754 * sfmmu_setctx_sec takes <pgsz|cnum> as a parameter, 1755 * pagesize bits don't matter in this case since we are passing 1756 * INVALID_CONTEXT to it. 1757 * Compatibility Note: hw takes care of MMU_SCONTEXT1 1758 */ 1759 sfmmu_setctx_sec(INVALID_CONTEXT); 1760 sfmmu_clear_utsbinfo(); 1761 1762 kpreempt_enable(); 1763 sfmmu_hat_exit(hatlockp); 1764 } 1765 } 1766 1767 /* 1768 * Free all the translation resources for the specified address space. 1769 * Called from as_free when an address space is being destroyed. 1770 */ 1771 void 1772 hat_free_start(struct hat *sfmmup) 1773 { 1774 ASSERT(AS_WRITE_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 1775 ASSERT(sfmmup != ksfmmup); 1776 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 1777 1778 sfmmup->sfmmu_free = 1; 1779 if (sfmmup->sfmmu_scdp != NULL) { 1780 sfmmu_leave_scd(sfmmup, 0); 1781 } 1782 1783 ASSERT(sfmmup->sfmmu_scdp == NULL); 1784 } 1785 1786 void 1787 hat_free_end(struct hat *sfmmup) 1788 { 1789 int i; 1790 1791 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 1792 ASSERT(sfmmup->sfmmu_free == 1); 1793 ASSERT(sfmmup->sfmmu_ttecnt[TTE8K] == 0); 1794 ASSERT(sfmmup->sfmmu_ttecnt[TTE64K] == 0); 1795 ASSERT(sfmmup->sfmmu_ttecnt[TTE512K] == 0); 1796 ASSERT(sfmmup->sfmmu_ttecnt[TTE4M] == 0); 1797 ASSERT(sfmmup->sfmmu_ttecnt[TTE32M] == 0); 1798 ASSERT(sfmmup->sfmmu_ttecnt[TTE256M] == 0); 1799 1800 if (sfmmup->sfmmu_rmstat) { 1801 hat_freestat(sfmmup->sfmmu_as, NULL); 1802 } 1803 1804 while (sfmmup->sfmmu_tsb != NULL) { 1805 struct tsb_info *next = sfmmup->sfmmu_tsb->tsb_next; 1806 sfmmu_tsbinfo_free(sfmmup->sfmmu_tsb); 1807 sfmmup->sfmmu_tsb = next; 1808 } 1809 1810 if (sfmmup->sfmmu_srdp != NULL) { 1811 sfmmu_leave_srd(sfmmup); 1812 ASSERT(sfmmup->sfmmu_srdp == NULL); 1813 for (i = 0; i < SFMMU_L1_HMERLINKS; i++) { 1814 if (sfmmup->sfmmu_hmeregion_links[i] != NULL) { 1815 kmem_free(sfmmup->sfmmu_hmeregion_links[i], 1816 SFMMU_L2_HMERLINKS_SIZE); 1817 sfmmup->sfmmu_hmeregion_links[i] = NULL; 1818 } 1819 } 1820 } 1821 sfmmu_free_sfmmu(sfmmup); 1822 1823 #ifdef DEBUG 1824 for (i = 0; i < SFMMU_L1_HMERLINKS; i++) { 1825 ASSERT(sfmmup->sfmmu_hmeregion_links[i] == NULL); 1826 } 1827 #endif 1828 1829 kmem_cache_free(sfmmuid_cache, sfmmup); 1830 } 1831 1832 /* 1833 * Set up any translation structures, for the specified address space, 1834 * that are needed or preferred when the process is being swapped in. 1835 */ 1836 /* ARGSUSED */ 1837 void 1838 hat_swapin(struct hat *hat) 1839 { 1840 ASSERT(hat->sfmmu_xhat_provider == NULL); 1841 } 1842 1843 /* 1844 * Free all of the translation resources, for the specified address space, 1845 * that can be freed while the process is swapped out. Called from as_swapout. 1846 * Also, free up the ctx that this process was using. 
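 *
 * Roughly, the work below is:
 * 1) walk every uhme_hash bucket, unloading this hat's unlocked,
 *    non-shadow hmeblks and freeing any hmeblks, ours or not, that
 *    are or become empty;
 * 2) invalidate the context so others can reuse it;
 * 3) set HAT_SWAPPED, mark each tsbinfo TSB_SWAPPED, and free the
 *    TSB memory itself (the tsbinfo structures are kept).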
1847 */ 1848 void 1849 hat_swapout(struct hat *sfmmup) 1850 { 1851 struct hmehash_bucket *hmebp; 1852 struct hme_blk *hmeblkp; 1853 struct hme_blk *pr_hblk = NULL; 1854 struct hme_blk *nx_hblk; 1855 int i; 1856 uint64_t hblkpa, prevpa, nx_pa; 1857 struct hme_blk *list = NULL; 1858 hatlock_t *hatlockp; 1859 struct tsb_info *tsbinfop; 1860 struct free_tsb { 1861 struct free_tsb *next; 1862 struct tsb_info *tsbinfop; 1863 }; /* free list of TSBs */ 1864 struct free_tsb *freelist, *last, *next; 1865 1866 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 1867 SFMMU_STAT(sf_swapout); 1868 1869 /* 1870 * There is no way to go from an as to all its translations in sfmmu. 1871 * Here is one of the times when we take the big hit and traverse 1872 * the hash looking for hme_blks to free up. Not only do we free up 1873 * this as hme_blks but all those that are free. We are obviously 1874 * swapping because we need memory so let's free up as much 1875 * as we can. 1876 * 1877 * Note that we don't flush TLB/TSB here -- it's not necessary 1878 * because: 1879 * 1) we free the ctx we're using and throw away the TSB(s); 1880 * 2) processes aren't runnable while being swapped out. 1881 */ 1882 ASSERT(sfmmup != KHATID); 1883 for (i = 0; i <= UHMEHASH_SZ; i++) { 1884 hmebp = &uhme_hash[i]; 1885 SFMMU_HASH_LOCK(hmebp); 1886 hmeblkp = hmebp->hmeblkp; 1887 hblkpa = hmebp->hmeh_nextpa; 1888 prevpa = 0; 1889 pr_hblk = NULL; 1890 while (hmeblkp) { 1891 1892 ASSERT(!hmeblkp->hblk_xhat_bit); 1893 1894 if ((hmeblkp->hblk_tag.htag_id == sfmmup) && 1895 !hmeblkp->hblk_shw_bit && !hmeblkp->hblk_lckcnt) { 1896 ASSERT(!hmeblkp->hblk_shared); 1897 (void) sfmmu_hblk_unload(sfmmup, hmeblkp, 1898 (caddr_t)get_hblk_base(hmeblkp), 1899 get_hblk_endaddr(hmeblkp), 1900 NULL, HAT_UNLOAD); 1901 } 1902 nx_hblk = hmeblkp->hblk_next; 1903 nx_pa = hmeblkp->hblk_nextpa; 1904 if (!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) { 1905 ASSERT(!hmeblkp->hblk_lckcnt); 1906 sfmmu_hblk_hash_rm(hmebp, hmeblkp, 1907 prevpa, pr_hblk); 1908 sfmmu_hblk_free(hmebp, hmeblkp, hblkpa, &list); 1909 } else { 1910 pr_hblk = hmeblkp; 1911 prevpa = hblkpa; 1912 } 1913 hmeblkp = nx_hblk; 1914 hblkpa = nx_pa; 1915 } 1916 SFMMU_HASH_UNLOCK(hmebp); 1917 } 1918 1919 sfmmu_hblks_list_purge(&list); 1920 1921 /* 1922 * Now free up the ctx so that others can reuse it. 1923 */ 1924 hatlockp = sfmmu_hat_enter(sfmmup); 1925 1926 sfmmu_invalidate_ctx(sfmmup); 1927 1928 /* 1929 * Free TSBs, but not tsbinfos, and set SWAPPED flag. 1930 * If TSBs were never swapped in, just return. 1931 * This implies that we don't support partial swapping 1932 * of TSBs -- either all are swapped out, or none are. 1933 * 1934 * We must hold the HAT lock here to prevent racing with another 1935 * thread trying to unmap TTEs from the TSB or running the post- 1936 * relocator after relocating the TSB's memory. Unfortunately, we 1937 * can't free memory while holding the HAT lock or we could 1938 * deadlock, so we build a list of TSBs to be freed after marking 1939 * the tsbinfos as swapped out and free them after dropping the 1940 * lock. 1941 */ 1942 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) { 1943 sfmmu_hat_exit(hatlockp); 1944 return; 1945 } 1946 1947 SFMMU_FLAGS_SET(sfmmup, HAT_SWAPPED); 1948 last = freelist = NULL; 1949 for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL; 1950 tsbinfop = tsbinfop->tsb_next) { 1951 ASSERT((tsbinfop->tsb_flags & TSB_SWAPPED) == 0); 1952 1953 /* 1954 * Cast the TSB into a struct free_tsb and put it on the free 1955 * list. 
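 * The TSB's own memory (tsb_va) is reused as the free list node here;
 * its contents are about to be thrown away anyway, and reusing it
 * avoids having to allocate list nodes while the HAT lock is held.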
1956 */ 1957 if (freelist == NULL) { 1958 last = freelist = (struct free_tsb *)tsbinfop->tsb_va; 1959 } else { 1960 last->next = (struct free_tsb *)tsbinfop->tsb_va; 1961 last = last->next; 1962 } 1963 last->next = NULL; 1964 last->tsbinfop = tsbinfop; 1965 tsbinfop->tsb_flags |= TSB_SWAPPED; 1966 /* 1967 * Zero out the TTE to clear the valid bit. 1968 * Note we can't use a value like 0xbad because we want to 1969 * ensure diagnostic bits are NEVER set on TTEs that might 1970 * be loaded. The intent is to catch any invalid access 1971 * to the swapped TSB, such as a thread running with a valid 1972 * context without first calling sfmmu_tsb_swapin() to 1973 * allocate TSB memory. 1974 */ 1975 tsbinfop->tsb_tte.ll = 0; 1976 } 1977 1978 /* Now we can drop the lock and free the TSB memory. */ 1979 sfmmu_hat_exit(hatlockp); 1980 for (; freelist != NULL; freelist = next) { 1981 next = freelist->next; 1982 sfmmu_tsb_free(freelist->tsbinfop); 1983 } 1984 } 1985 1986 /* 1987 * Duplicate the translations of an as into another newas 1988 */ 1989 /* ARGSUSED */ 1990 int 1991 hat_dup(struct hat *hat, struct hat *newhat, caddr_t addr, size_t len, 1992 uint_t flag) 1993 { 1994 sf_srd_t *srdp; 1995 sf_scd_t *scdp; 1996 int i; 1997 extern uint_t get_color_start(struct as *); 1998 1999 ASSERT(hat->sfmmu_xhat_provider == NULL); 2000 ASSERT((flag == 0) || (flag == HAT_DUP_ALL) || (flag == HAT_DUP_COW) || 2001 (flag == HAT_DUP_SRD)); 2002 ASSERT(hat != ksfmmup); 2003 ASSERT(newhat != ksfmmup); 2004 ASSERT(flag != HAT_DUP_ALL || hat->sfmmu_srdp == newhat->sfmmu_srdp); 2005 2006 if (flag == HAT_DUP_COW) { 2007 panic("hat_dup: HAT_DUP_COW not supported"); 2008 } 2009 2010 if (flag == HAT_DUP_SRD && ((srdp = hat->sfmmu_srdp) != NULL)) { 2011 ASSERT(srdp->srd_evp != NULL); 2012 VN_HOLD(srdp->srd_evp); 2013 ASSERT(srdp->srd_refcnt > 0); 2014 newhat->sfmmu_srdp = srdp; 2015 atomic_add_32((volatile uint_t *)&srdp->srd_refcnt, 1); 2016 } 2017 2018 /* 2019 * HAT_DUP_ALL flag is used after as duplication is done. 
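 * By then the child has already joined the parent's SRD (via the
 * earlier HAT_DUP_SRD call, hence the refcnt >= 2 ASSERT below), so
 * here we only copy the region tte flags and the 4M-text hint, join
 * the parent's SCD if it has one, and recheck the page sizes in use.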
2020 */ 2021 if (flag == HAT_DUP_ALL && ((srdp = newhat->sfmmu_srdp) != NULL)) { 2022 ASSERT(newhat->sfmmu_srdp->srd_refcnt >= 2); 2023 newhat->sfmmu_rtteflags = hat->sfmmu_rtteflags; 2024 if (hat->sfmmu_flags & HAT_4MTEXT_FLAG) { 2025 newhat->sfmmu_flags |= HAT_4MTEXT_FLAG; 2026 } 2027 2028 /* check if need to join scd */ 2029 if ((scdp = hat->sfmmu_scdp) != NULL && 2030 newhat->sfmmu_scdp != scdp) { 2031 int ret; 2032 SF_RGNMAP_IS_SUBSET(&newhat->sfmmu_region_map, 2033 &scdp->scd_region_map, ret); 2034 ASSERT(ret); 2035 sfmmu_join_scd(scdp, newhat); 2036 ASSERT(newhat->sfmmu_scdp == scdp && 2037 scdp->scd_refcnt >= 2); 2038 for (i = 0; i < max_mmu_page_sizes; i++) { 2039 newhat->sfmmu_ismttecnt[i] = 2040 hat->sfmmu_ismttecnt[i]; 2041 newhat->sfmmu_scdismttecnt[i] = 2042 hat->sfmmu_scdismttecnt[i]; 2043 } 2044 } 2045 2046 sfmmu_check_page_sizes(newhat, 1); 2047 } 2048 2049 if (flag == HAT_DUP_ALL && consistent_coloring == 0 && 2050 update_proc_pgcolorbase_after_fork != 0) { 2051 hat->sfmmu_clrbin = get_color_start(hat->sfmmu_as); 2052 } 2053 return (0); 2054 } 2055 2056 void 2057 hat_memload(struct hat *hat, caddr_t addr, struct page *pp, 2058 uint_t attr, uint_t flags) 2059 { 2060 hat_do_memload(hat, addr, pp, attr, flags, 2061 SFMMU_INVALID_SHMERID); 2062 } 2063 2064 void 2065 hat_memload_region(struct hat *hat, caddr_t addr, struct page *pp, 2066 uint_t attr, uint_t flags, hat_region_cookie_t rcookie) 2067 { 2068 uint_t rid; 2069 if (rcookie == HAT_INVALID_REGION_COOKIE || 2070 hat->sfmmu_xhat_provider != NULL) { 2071 hat_do_memload(hat, addr, pp, attr, flags, 2072 SFMMU_INVALID_SHMERID); 2073 return; 2074 } 2075 rid = (uint_t)((uint64_t)rcookie); 2076 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 2077 hat_do_memload(hat, addr, pp, attr, flags, rid); 2078 } 2079 2080 /* 2081 * Set up addr to map to page pp with protection prot. 2082 * As an optimization we also load the TSB with the 2083 * corresponding tte but it is no big deal if the tte gets kicked out. 2084 */ 2085 static void 2086 hat_do_memload(struct hat *hat, caddr_t addr, struct page *pp, 2087 uint_t attr, uint_t flags, uint_t rid) 2088 { 2089 tte_t tte; 2090 2091 2092 ASSERT(hat != NULL); 2093 ASSERT(PAGE_LOCKED(pp)); 2094 ASSERT(!((uintptr_t)addr & MMU_PAGEOFFSET)); 2095 ASSERT(!(flags & ~SFMMU_LOAD_ALLFLAG)); 2096 ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR)); 2097 SFMMU_VALIDATE_HMERID(hat, rid, addr, MMU_PAGESIZE); 2098 2099 if (PP_ISFREE(pp)) { 2100 panic("hat_memload: loading a mapping to free page %p", 2101 (void *)pp); 2102 } 2103 2104 if (hat->sfmmu_xhat_provider) { 2105 /* no regions for xhats */ 2106 ASSERT(!SFMMU_IS_SHMERID_VALID(rid)); 2107 XHAT_MEMLOAD(hat, addr, pp, attr, flags); 2108 return; 2109 } 2110 2111 ASSERT((hat == ksfmmup) || 2112 AS_LOCK_HELD(hat->sfmmu_as, &hat->sfmmu_as->a_lock)); 2113 2114 if (flags & ~SFMMU_LOAD_ALLFLAG) 2115 cmn_err(CE_NOTE, "hat_memload: unsupported flags %d", 2116 flags & ~SFMMU_LOAD_ALLFLAG); 2117 2118 if (hat->sfmmu_rmstat) 2119 hat_resvstat(MMU_PAGESIZE, hat->sfmmu_as, addr); 2120 2121 #if defined(SF_ERRATA_57) 2122 if ((hat != ksfmmup) && AS_TYPE_64BIT(hat->sfmmu_as) && 2123 (addr < errata57_limit) && (attr & PROT_EXEC) && 2124 !(flags & HAT_LOAD_SHARE)) { 2125 cmn_err(CE_WARN, "hat_memload: illegal attempt to make user " 2126 " page executable"); 2127 attr &= ~PROT_EXEC; 2128 } 2129 #endif 2130 2131 sfmmu_memtte(&tte, pp->p_pagenum, attr, TTE8K); 2132 (void) sfmmu_tteload_array(hat, &tte, addr, &pp, flags, rid); 2133 2134 /* 2135 * Check TSB and TLB page sizes. 
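 * The check is skipped for HAT_LOAD_SHARE (ISM) loads; for ordinary
 * loads it lets the hat react to the page sizes now in use.
 *
 * For illustration only (a hypothetical caller, not taken from this
 * file), the hat_memload() wrapper above is typically used as:
 *
 *	ASSERT(PAGE_LOCKED(pp));
 *	hat_memload(as->a_hat, vaddr, pp, PROT_READ | PROT_WRITE,
 *	    HAT_LOAD);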
2136 */ 2137 if ((flags & HAT_LOAD_SHARE) == 0) { 2138 sfmmu_check_page_sizes(hat, 1); 2139 } 2140 } 2141 2142 /* 2143 * hat_devload can be called to map real memory (e.g. 2144 * /dev/kmem) and even though hat_devload will determine pf is 2145 * for memory, it will be unable to get a shared lock on the 2146 * page (because someone else has it exclusively) and will 2147 * pass dp = NULL. If tteload doesn't get a non-NULL 2148 * page pointer it can't cache memory. 2149 */ 2150 void 2151 hat_devload(struct hat *hat, caddr_t addr, size_t len, pfn_t pfn, 2152 uint_t attr, int flags) 2153 { 2154 tte_t tte; 2155 struct page *pp = NULL; 2156 int use_lgpg = 0; 2157 2158 ASSERT(hat != NULL); 2159 2160 if (hat->sfmmu_xhat_provider) { 2161 XHAT_DEVLOAD(hat, addr, len, pfn, attr, flags); 2162 return; 2163 } 2164 2165 ASSERT(!(flags & ~SFMMU_LOAD_ALLFLAG)); 2166 ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR)); 2167 ASSERT((hat == ksfmmup) || 2168 AS_LOCK_HELD(hat->sfmmu_as, &hat->sfmmu_as->a_lock)); 2169 if (len == 0) 2170 panic("hat_devload: zero len"); 2171 if (flags & ~SFMMU_LOAD_ALLFLAG) 2172 cmn_err(CE_NOTE, "hat_devload: unsupported flags %d", 2173 flags & ~SFMMU_LOAD_ALLFLAG); 2174 2175 #if defined(SF_ERRATA_57) 2176 if ((hat != ksfmmup) && AS_TYPE_64BIT(hat->sfmmu_as) && 2177 (addr < errata57_limit) && (attr & PROT_EXEC) && 2178 !(flags & HAT_LOAD_SHARE)) { 2179 cmn_err(CE_WARN, "hat_devload: illegal attempt to make user " 2180 " page executable"); 2181 attr &= ~PROT_EXEC; 2182 } 2183 #endif 2184 2185 /* 2186 * If it's a memory page find its pp 2187 */ 2188 if (!(flags & HAT_LOAD_NOCONSIST) && pf_is_memory(pfn)) { 2189 pp = page_numtopp_nolock(pfn); 2190 if (pp == NULL) { 2191 flags |= HAT_LOAD_NOCONSIST; 2192 } else { 2193 if (PP_ISFREE(pp)) { 2194 panic("hat_memload: loading " 2195 "a mapping to free page %p", 2196 (void *)pp); 2197 } 2198 if (!PAGE_LOCKED(pp) && !PP_ISNORELOC(pp)) { 2199 panic("hat_memload: loading a mapping " 2200 "to unlocked relocatable page %p", 2201 (void *)pp); 2202 } 2203 ASSERT(len == MMU_PAGESIZE); 2204 } 2205 } 2206 2207 if (hat->sfmmu_rmstat) 2208 hat_resvstat(len, hat->sfmmu_as, addr); 2209 2210 if (flags & HAT_LOAD_NOCONSIST) { 2211 attr |= SFMMU_UNCACHEVTTE; 2212 use_lgpg = 1; 2213 } 2214 if (!pf_is_memory(pfn)) { 2215 attr |= SFMMU_UNCACHEPTTE | HAT_NOSYNC; 2216 use_lgpg = 1; 2217 switch (attr & HAT_ORDER_MASK) { 2218 case HAT_STRICTORDER: 2219 case HAT_UNORDERED_OK: 2220 /* 2221 * we set the side effect bit for all non 2222 * memory mappings unless merging is ok 2223 */ 2224 attr |= SFMMU_SIDEFFECT; 2225 break; 2226 case HAT_MERGING_OK: 2227 case HAT_LOADCACHING_OK: 2228 case HAT_STORECACHING_OK: 2229 break; 2230 default: 2231 panic("hat_devload: bad attr"); 2232 break; 2233 } 2234 } 2235 while (len) { 2236 if (!use_lgpg) { 2237 sfmmu_memtte(&tte, pfn, attr, TTE8K); 2238 (void) sfmmu_tteload_array(hat, &tte, addr, &pp, 2239 flags, SFMMU_INVALID_SHMERID); 2240 len -= MMU_PAGESIZE; 2241 addr += MMU_PAGESIZE; 2242 pfn++; 2243 continue; 2244 } 2245 /* 2246 * try to use large pages, check va/pa alignments 2247 * Note that 32M/256M page sizes are not (yet) supported. 
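 * The checks below try 4M, then 512K, then 64K, and finally fall back
 * to 8K.  A size is used only if the remaining length is at least that
 * size, both the virtual and the physical address are aligned to it,
 * and it is not turned off in disable_large_pages.  E.g. a 4M-aligned,
 * 4M-long device mapping takes a single TTE4M instead of 512 8K TTEs.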
2248 */ 2249 if ((len >= MMU_PAGESIZE4M) && 2250 !((uintptr_t)addr & MMU_PAGEOFFSET4M) && 2251 !(disable_large_pages & (1 << TTE4M)) && 2252 !(mmu_ptob(pfn) & MMU_PAGEOFFSET4M)) { 2253 sfmmu_memtte(&tte, pfn, attr, TTE4M); 2254 (void) sfmmu_tteload_array(hat, &tte, addr, &pp, 2255 flags, SFMMU_INVALID_SHMERID); 2256 len -= MMU_PAGESIZE4M; 2257 addr += MMU_PAGESIZE4M; 2258 pfn += MMU_PAGESIZE4M / MMU_PAGESIZE; 2259 } else if ((len >= MMU_PAGESIZE512K) && 2260 !((uintptr_t)addr & MMU_PAGEOFFSET512K) && 2261 !(disable_large_pages & (1 << TTE512K)) && 2262 !(mmu_ptob(pfn) & MMU_PAGEOFFSET512K)) { 2263 sfmmu_memtte(&tte, pfn, attr, TTE512K); 2264 (void) sfmmu_tteload_array(hat, &tte, addr, &pp, 2265 flags, SFMMU_INVALID_SHMERID); 2266 len -= MMU_PAGESIZE512K; 2267 addr += MMU_PAGESIZE512K; 2268 pfn += MMU_PAGESIZE512K / MMU_PAGESIZE; 2269 } else if ((len >= MMU_PAGESIZE64K) && 2270 !((uintptr_t)addr & MMU_PAGEOFFSET64K) && 2271 !(disable_large_pages & (1 << TTE64K)) && 2272 !(mmu_ptob(pfn) & MMU_PAGEOFFSET64K)) { 2273 sfmmu_memtte(&tte, pfn, attr, TTE64K); 2274 (void) sfmmu_tteload_array(hat, &tte, addr, &pp, 2275 flags, SFMMU_INVALID_SHMERID); 2276 len -= MMU_PAGESIZE64K; 2277 addr += MMU_PAGESIZE64K; 2278 pfn += MMU_PAGESIZE64K / MMU_PAGESIZE; 2279 } else { 2280 sfmmu_memtte(&tte, pfn, attr, TTE8K); 2281 (void) sfmmu_tteload_array(hat, &tte, addr, &pp, 2282 flags, SFMMU_INVALID_SHMERID); 2283 len -= MMU_PAGESIZE; 2284 addr += MMU_PAGESIZE; 2285 pfn++; 2286 } 2287 } 2288 2289 /* 2290 * Check TSB and TLB page sizes. 2291 */ 2292 if ((flags & HAT_LOAD_SHARE) == 0) { 2293 sfmmu_check_page_sizes(hat, 1); 2294 } 2295 } 2296 2297 void 2298 hat_memload_array(struct hat *hat, caddr_t addr, size_t len, 2299 struct page **pps, uint_t attr, uint_t flags) 2300 { 2301 hat_do_memload_array(hat, addr, len, pps, attr, flags, 2302 SFMMU_INVALID_SHMERID); 2303 } 2304 2305 void 2306 hat_memload_array_region(struct hat *hat, caddr_t addr, size_t len, 2307 struct page **pps, uint_t attr, uint_t flags, 2308 hat_region_cookie_t rcookie) 2309 { 2310 uint_t rid; 2311 if (rcookie == HAT_INVALID_REGION_COOKIE || 2312 hat->sfmmu_xhat_provider != NULL) { 2313 hat_do_memload_array(hat, addr, len, pps, attr, flags, 2314 SFMMU_INVALID_SHMERID); 2315 return; 2316 } 2317 rid = (uint_t)((uint64_t)rcookie); 2318 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 2319 hat_do_memload_array(hat, addr, len, pps, attr, flags, rid); 2320 } 2321 2322 /* 2323 * Map the largest extend possible out of the page array. The array may NOT 2324 * be in order. The largest possible mapping a page can have 2325 * is specified in the p_szc field. The p_szc field 2326 * cannot change as long as there any mappings (large or small) 2327 * to any of the pages that make up the large page. (ie. any 2328 * promotion/demotion of page size is not up to the hat but up to 2329 * the page free list manager). The array 2330 * should consist of properly aligned contigous pages that are 2331 * part of a big page for a large mapping to be created. 
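 *
 * E.g. 512 contiguous, 4M-aligned 8K pages whose p_szc allows 4M can
 * be loaded with a single TTE4M; if the size, alignment or contiguity
 * checks fail, smaller sizes are tried and the code ultimately falls
 * back to batching 8K ttes, one hmeblk (NHMENTS entries) at a time,
 * via sfmmu_memload_batchsmall().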
2332 */ 2333 static void 2334 hat_do_memload_array(struct hat *hat, caddr_t addr, size_t len, 2335 struct page **pps, uint_t attr, uint_t flags, uint_t rid) 2336 { 2337 int ttesz; 2338 size_t mapsz; 2339 pgcnt_t numpg, npgs; 2340 tte_t tte; 2341 page_t *pp; 2342 uint_t large_pages_disable; 2343 2344 ASSERT(!((uintptr_t)addr & MMU_PAGEOFFSET)); 2345 SFMMU_VALIDATE_HMERID(hat, rid, addr, len); 2346 2347 if (hat->sfmmu_xhat_provider) { 2348 ASSERT(!SFMMU_IS_SHMERID_VALID(rid)); 2349 XHAT_MEMLOAD_ARRAY(hat, addr, len, pps, attr, flags); 2350 return; 2351 } 2352 2353 if (hat->sfmmu_rmstat) 2354 hat_resvstat(len, hat->sfmmu_as, addr); 2355 2356 #if defined(SF_ERRATA_57) 2357 if ((hat != ksfmmup) && AS_TYPE_64BIT(hat->sfmmu_as) && 2358 (addr < errata57_limit) && (attr & PROT_EXEC) && 2359 !(flags & HAT_LOAD_SHARE)) { 2360 cmn_err(CE_WARN, "hat_memload_array: illegal attempt to make " 2361 "user page executable"); 2362 attr &= ~PROT_EXEC; 2363 } 2364 #endif 2365 2366 /* Get number of pages */ 2367 npgs = len >> MMU_PAGESHIFT; 2368 2369 if (flags & HAT_LOAD_SHARE) { 2370 large_pages_disable = disable_ism_large_pages; 2371 } else { 2372 large_pages_disable = disable_large_pages; 2373 } 2374 2375 if (npgs < NHMENTS || large_pages_disable == LARGE_PAGES_OFF) { 2376 sfmmu_memload_batchsmall(hat, addr, pps, attr, flags, npgs, 2377 rid); 2378 return; 2379 } 2380 2381 while (npgs >= NHMENTS) { 2382 pp = *pps; 2383 for (ttesz = pp->p_szc; ttesz != TTE8K; ttesz--) { 2384 /* 2385 * Check if this page size is disabled. 2386 */ 2387 if (large_pages_disable & (1 << ttesz)) 2388 continue; 2389 2390 numpg = TTEPAGES(ttesz); 2391 mapsz = numpg << MMU_PAGESHIFT; 2392 if ((npgs >= numpg) && 2393 IS_P2ALIGNED(addr, mapsz) && 2394 IS_P2ALIGNED(pp->p_pagenum, numpg)) { 2395 /* 2396 * At this point we have enough pages and 2397 * we know the virtual address and the pfn 2398 * are properly aligned. We still need 2399 * to check for physical contiguity but since 2400 * it is very likely that this is the case 2401 * we will assume they are so and undo 2402 * the request if necessary. It would 2403 * be great if we could get a hint flag 2404 * like HAT_CONTIG which would tell us 2405 * the pages are contigous for sure. 2406 */ 2407 sfmmu_memtte(&tte, (*pps)->p_pagenum, 2408 attr, ttesz); 2409 if (!sfmmu_tteload_array(hat, &tte, addr, 2410 pps, flags, rid)) { 2411 break; 2412 } 2413 } 2414 } 2415 if (ttesz == TTE8K) { 2416 /* 2417 * We were not able to map array using a large page 2418 * batch a hmeblk or fraction at a time. 2419 */ 2420 numpg = ((uintptr_t)addr >> MMU_PAGESHIFT) 2421 & (NHMENTS-1); 2422 numpg = NHMENTS - numpg; 2423 ASSERT(numpg <= npgs); 2424 mapsz = numpg * MMU_PAGESIZE; 2425 sfmmu_memload_batchsmall(hat, addr, pps, attr, flags, 2426 numpg, rid); 2427 } 2428 addr += mapsz; 2429 npgs -= numpg; 2430 pps += numpg; 2431 } 2432 2433 if (npgs) { 2434 sfmmu_memload_batchsmall(hat, addr, pps, attr, flags, npgs, 2435 rid); 2436 } 2437 2438 /* 2439 * Check TSB and TLB page sizes. 2440 */ 2441 if ((flags & HAT_LOAD_SHARE) == 0) { 2442 sfmmu_check_page_sizes(hat, 1); 2443 } 2444 } 2445 2446 /* 2447 * Function tries to batch 8K pages into the same hme blk. 2448 */ 2449 static void 2450 sfmmu_memload_batchsmall(struct hat *hat, caddr_t vaddr, page_t **pps, 2451 uint_t attr, uint_t flags, pgcnt_t npgs, uint_t rid) 2452 { 2453 tte_t tte; 2454 page_t *pp; 2455 struct hmehash_bucket *hmebp; 2456 struct hme_blk *hmeblkp; 2457 int index; 2458 2459 while (npgs) { 2460 /* 2461 * Acquire the hash bucket. 
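 * The bucket, and the hmeblk found or allocated under it, is reused
 * for every 8K tte that falls into the same hmeblk: the inner loop
 * below stops once vaddr crosses into the next hmeblk (index wraps to
 * 0) or the pages run out, and only then is the bucket released and
 * looked up again.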
2462 */ 2463 hmebp = sfmmu_tteload_acquire_hashbucket(hat, vaddr, TTE8K, 2464 rid); 2465 ASSERT(hmebp); 2466 2467 /* 2468 * Find the hment block. 2469 */ 2470 hmeblkp = sfmmu_tteload_find_hmeblk(hat, hmebp, vaddr, 2471 TTE8K, flags, rid); 2472 ASSERT(hmeblkp); 2473 2474 do { 2475 /* 2476 * Make the tte. 2477 */ 2478 pp = *pps; 2479 sfmmu_memtte(&tte, pp->p_pagenum, attr, TTE8K); 2480 2481 /* 2482 * Add the translation. 2483 */ 2484 (void) sfmmu_tteload_addentry(hat, hmeblkp, &tte, 2485 vaddr, pps, flags, rid); 2486 2487 /* 2488 * Goto next page. 2489 */ 2490 pps++; 2491 npgs--; 2492 2493 /* 2494 * Goto next address. 2495 */ 2496 vaddr += MMU_PAGESIZE; 2497 2498 /* 2499 * Don't crossover into a different hmentblk. 2500 */ 2501 index = (int)(((uintptr_t)vaddr >> MMU_PAGESHIFT) & 2502 (NHMENTS-1)); 2503 2504 } while (index != 0 && npgs != 0); 2505 2506 /* 2507 * Release the hash bucket. 2508 */ 2509 2510 sfmmu_tteload_release_hashbucket(hmebp); 2511 } 2512 } 2513 2514 /* 2515 * Construct a tte for a page: 2516 * 2517 * tte_valid = 1 2518 * tte_size2 = size & TTE_SZ2_BITS (Panther and Olympus-C only) 2519 * tte_size = size 2520 * tte_nfo = attr & HAT_NOFAULT 2521 * tte_ie = attr & HAT_STRUCTURE_LE 2522 * tte_hmenum = hmenum 2523 * tte_pahi = pp->p_pagenum >> TTE_PASHIFT; 2524 * tte_palo = pp->p_pagenum & TTE_PALOMASK; 2525 * tte_ref = 1 (optimization) 2526 * tte_wr_perm = attr & PROT_WRITE; 2527 * tte_no_sync = attr & HAT_NOSYNC 2528 * tte_lock = attr & SFMMU_LOCKTTE 2529 * tte_cp = !(attr & SFMMU_UNCACHEPTTE) 2530 * tte_cv = !(attr & SFMMU_UNCACHEVTTE) 2531 * tte_e = attr & SFMMU_SIDEFFECT 2532 * tte_priv = !(attr & PROT_USER) 2533 * tte_hwwr = if nosync is set and it is writable we set the mod bit (opt) 2534 * tte_glb = 0 2535 */ 2536 void 2537 sfmmu_memtte(tte_t *ttep, pfn_t pfn, uint_t attr, int tte_sz) 2538 { 2539 ASSERT((attr & ~(SFMMU_LOAD_ALLATTR | HAT_ATTR_NOSOFTEXEC)) == 0); 2540 2541 ttep->tte_inthi = MAKE_TTE_INTHI(pfn, attr, tte_sz, 0 /* hmenum */); 2542 ttep->tte_intlo = MAKE_TTE_INTLO(pfn, attr, tte_sz, 0 /* hmenum */); 2543 2544 if (TTE_IS_NOSYNC(ttep)) { 2545 TTE_SET_REF(ttep); 2546 if (TTE_IS_WRITABLE(ttep)) { 2547 TTE_SET_MOD(ttep); 2548 } 2549 } 2550 if (TTE_IS_NFO(ttep) && TTE_IS_EXECUTABLE(ttep)) { 2551 panic("sfmmu_memtte: can't set both NFO and EXEC bits"); 2552 } 2553 2554 /* 2555 * Disable hardware execute permission to force a fault if 2556 * this page is executed, so we can detect the execution. Set 2557 * the soft exec bit to remember that this TTE has execute 2558 * permission. 2559 */ 2560 if (TTE_IS_EXECUTABLE(ttep) && (attr & HAT_ATTR_NOSOFTEXEC) == 0 && 2561 icache_is_coherent == 0) { 2562 TTE_CLR_EXEC(ttep); 2563 TTE_SET_SOFTEXEC(ttep); 2564 } 2565 } 2566 2567 /* 2568 * This function will add a translation to the hme_blk and allocate the 2569 * hme_blk if one does not exist. 2570 * If a page structure is specified then it will add the 2571 * corresponding hment to the mapping list. 2572 * It will also update the hmenum field for the tte. 2573 * 2574 * Currently this function is only used for kernel mappings. 2575 * So pass invalid region to sfmmu_tteload_array(). 2576 */ 2577 void 2578 sfmmu_tteload(struct hat *sfmmup, tte_t *ttep, caddr_t vaddr, page_t *pp, 2579 uint_t flags) 2580 { 2581 ASSERT(sfmmup == ksfmmup); 2582 (void) sfmmu_tteload_array(sfmmup, ttep, vaddr, &pp, flags, 2583 SFMMU_INVALID_SHMERID); 2584 } 2585 2586 /* 2587 * Load (ttep != NULL) or unload (ttep == NULL) one entry in the TSB. 
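 * For the kernel hat the 8K-indexed or 4M-indexed kernel TSB is used
 * directly; for user hats the matching tsb_info is looked up and the
 * update is skipped (or deferred via TSB_FLUSH_NEEDED) if that TSB is
 * swapped out or currently being relocated.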
2588 * Assumes that a particular page size may only be resident in one TSB. 2589 */ 2590 static void 2591 sfmmu_mod_tsb(sfmmu_t *sfmmup, caddr_t vaddr, tte_t *ttep, int ttesz) 2592 { 2593 struct tsb_info *tsbinfop = NULL; 2594 uint64_t tag; 2595 struct tsbe *tsbe_addr; 2596 uint64_t tsb_base; 2597 uint_t tsb_size; 2598 int vpshift = MMU_PAGESHIFT; 2599 int phys = 0; 2600 2601 if (sfmmup == ksfmmup) { /* No support for 32/256M ksfmmu pages */ 2602 phys = ktsb_phys; 2603 if (ttesz >= TTE4M) { 2604 #ifndef sun4v 2605 ASSERT((ttesz != TTE32M) && (ttesz != TTE256M)); 2606 #endif 2607 tsb_base = (phys)? ktsb4m_pbase : (uint64_t)ktsb4m_base; 2608 tsb_size = ktsb4m_szcode; 2609 } else { 2610 tsb_base = (phys)? ktsb_pbase : (uint64_t)ktsb_base; 2611 tsb_size = ktsb_szcode; 2612 } 2613 } else { 2614 SFMMU_GET_TSBINFO(tsbinfop, sfmmup, ttesz); 2615 2616 /* 2617 * If there isn't a TSB for this page size, or the TSB is 2618 * swapped out, there is nothing to do. Note that the latter 2619 * case seems impossible but can occur if hat_pageunload() 2620 * is called on an ISM mapping while the process is swapped 2621 * out. 2622 */ 2623 if (tsbinfop == NULL || (tsbinfop->tsb_flags & TSB_SWAPPED)) 2624 return; 2625 2626 /* 2627 * If another thread is in the middle of relocating a TSB 2628 * we can't unload the entry so set a flag so that the 2629 * TSB will be flushed before it can be accessed by the 2630 * process. 2631 */ 2632 if ((tsbinfop->tsb_flags & TSB_RELOC_FLAG) != 0) { 2633 if (ttep == NULL) 2634 tsbinfop->tsb_flags |= TSB_FLUSH_NEEDED; 2635 return; 2636 } 2637 #if defined(UTSB_PHYS) 2638 phys = 1; 2639 tsb_base = (uint64_t)tsbinfop->tsb_pa; 2640 #else 2641 tsb_base = (uint64_t)tsbinfop->tsb_va; 2642 #endif 2643 tsb_size = tsbinfop->tsb_szc; 2644 } 2645 if (ttesz >= TTE4M) 2646 vpshift = MMU_PAGESHIFT4M; 2647 2648 tsbe_addr = sfmmu_get_tsbe(tsb_base, vaddr, vpshift, tsb_size); 2649 tag = sfmmu_make_tsbtag(vaddr); 2650 2651 if (ttep == NULL) { 2652 sfmmu_unload_tsbe(tsbe_addr, tag, phys); 2653 } else { 2654 if (ttesz >= TTE4M) { 2655 SFMMU_STAT(sf_tsb_load4m); 2656 } else { 2657 SFMMU_STAT(sf_tsb_load8k); 2658 } 2659 2660 sfmmu_load_tsbe(tsbe_addr, tag, ttep, phys); 2661 } 2662 } 2663 2664 /* 2665 * Unmap all entries from [start, end) matching the given page size. 2666 * 2667 * This function is used primarily to unmap replicated 64K or 512K entries 2668 * from the TSB that are inserted using the base page size TSB pointer, but 2669 * it may also be called to unmap a range of addresses from the TSB. 2670 */ 2671 void 2672 sfmmu_unload_tsb_range(sfmmu_t *sfmmup, caddr_t start, caddr_t end, int ttesz) 2673 { 2674 struct tsb_info *tsbinfop; 2675 uint64_t tag; 2676 struct tsbe *tsbe_addr; 2677 caddr_t vaddr; 2678 uint64_t tsb_base; 2679 int vpshift, vpgsz; 2680 uint_t tsb_size; 2681 int phys = 0; 2682 2683 /* 2684 * Assumptions: 2685 * If ttesz == 8K, 64K or 512K, we walk through the range 8K 2686 * at a time shooting down any valid entries we encounter. 2687 * 2688 * If ttesz >= 4M we walk the range 4M at a time shooting 2689 * down any valid mappings we find. 2690 */ 2691 if (sfmmup == ksfmmup) { 2692 phys = ktsb_phys; 2693 if (ttesz >= TTE4M) { 2694 #ifndef sun4v 2695 ASSERT((ttesz != TTE32M) && (ttesz != TTE256M)); 2696 #endif 2697 tsb_base = (phys)? ktsb4m_pbase : (uint64_t)ktsb4m_base; 2698 tsb_size = ktsb4m_szcode; 2699 } else { 2700 tsb_base = (phys)? 
ktsb_pbase : (uint64_t)ktsb_base; 2701 tsb_size = ktsb_szcode; 2702 } 2703 } else { 2704 SFMMU_GET_TSBINFO(tsbinfop, sfmmup, ttesz); 2705 2706 /* 2707 * If there isn't a TSB for this page size, or the TSB is 2708 * swapped out, there is nothing to do. Note that the latter 2709 * case seems impossible but can occur if hat_pageunload() 2710 * is called on an ISM mapping while the process is swapped 2711 * out. 2712 */ 2713 if (tsbinfop == NULL || (tsbinfop->tsb_flags & TSB_SWAPPED)) 2714 return; 2715 2716 /* 2717 * If another thread is in the middle of relocating a TSB 2718 * we can't unload the entry so set a flag so that the 2719 * TSB will be flushed before it can be accessed by the 2720 * process. 2721 */ 2722 if ((tsbinfop->tsb_flags & TSB_RELOC_FLAG) != 0) { 2723 tsbinfop->tsb_flags |= TSB_FLUSH_NEEDED; 2724 return; 2725 } 2726 #if defined(UTSB_PHYS) 2727 phys = 1; 2728 tsb_base = (uint64_t)tsbinfop->tsb_pa; 2729 #else 2730 tsb_base = (uint64_t)tsbinfop->tsb_va; 2731 #endif 2732 tsb_size = tsbinfop->tsb_szc; 2733 } 2734 if (ttesz >= TTE4M) { 2735 vpshift = MMU_PAGESHIFT4M; 2736 vpgsz = MMU_PAGESIZE4M; 2737 } else { 2738 vpshift = MMU_PAGESHIFT; 2739 vpgsz = MMU_PAGESIZE; 2740 } 2741 2742 for (vaddr = start; vaddr < end; vaddr += vpgsz) { 2743 tag = sfmmu_make_tsbtag(vaddr); 2744 tsbe_addr = sfmmu_get_tsbe(tsb_base, vaddr, vpshift, tsb_size); 2745 sfmmu_unload_tsbe(tsbe_addr, tag, phys); 2746 } 2747 } 2748 2749 /* 2750 * Select the optimum TSB size given the number of mappings 2751 * that need to be cached. 2752 */ 2753 static int 2754 sfmmu_select_tsb_szc(pgcnt_t pgcnt) 2755 { 2756 int szc = 0; 2757 2758 #ifdef DEBUG 2759 if (tsb_grow_stress) { 2760 uint32_t randval = (uint32_t)gettick() >> 4; 2761 return (randval % (tsb_max_growsize + 1)); 2762 } 2763 #endif /* DEBUG */ 2764 2765 while ((szc < tsb_max_growsize) && (pgcnt > SFMMU_RSS_TSBSIZE(szc))) 2766 szc++; 2767 return (szc); 2768 } 2769 2770 /* 2771 * This function will add a translation to the hme_blk and allocate the 2772 * hme_blk if one does not exist. 2773 * If a page structure is specified then it will add the 2774 * corresponding hment to the mapping list. 2775 * It will also update the hmenum field for the tte. 2776 * Furthermore, it attempts to create a large page translation 2777 * for <addr,hat> at page array pps. It assumes addr and first 2778 * pp is correctly aligned. It returns 0 if successful and 1 otherwise. 2779 */ 2780 static int 2781 sfmmu_tteload_array(sfmmu_t *sfmmup, tte_t *ttep, caddr_t vaddr, 2782 page_t **pps, uint_t flags, uint_t rid) 2783 { 2784 struct hmehash_bucket *hmebp; 2785 struct hme_blk *hmeblkp; 2786 int ret; 2787 uint_t size; 2788 2789 /* 2790 * Get mapping size. 2791 */ 2792 size = TTE_CSZ(ttep); 2793 ASSERT(!((uintptr_t)vaddr & TTE_PAGE_OFFSET(size))); 2794 2795 /* 2796 * Acquire the hash bucket. 2797 */ 2798 hmebp = sfmmu_tteload_acquire_hashbucket(sfmmup, vaddr, size, rid); 2799 ASSERT(hmebp); 2800 2801 /* 2802 * Find the hment block. 2803 */ 2804 hmeblkp = sfmmu_tteload_find_hmeblk(sfmmup, hmebp, vaddr, size, flags, 2805 rid); 2806 ASSERT(hmeblkp); 2807 2808 /* 2809 * Add the translation. 2810 */ 2811 ret = sfmmu_tteload_addentry(sfmmup, hmeblkp, ttep, vaddr, pps, flags, 2812 rid); 2813 2814 /* 2815 * Release the hash bucket. 2816 */ 2817 sfmmu_tteload_release_hashbucket(hmebp); 2818 2819 return (ret); 2820 } 2821 2822 /* 2823 * Function locks and returns a pointer to the hash bucket for vaddr and size. 
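 * Callers pair it with the find/add/release helpers, as in
 * sfmmu_tteload_array() above:
 *
 *	hmebp = sfmmu_tteload_acquire_hashbucket(sfmmup, vaddr, size,
 *	    rid);
 *	hmeblkp = sfmmu_tteload_find_hmeblk(sfmmup, hmebp, vaddr, size,
 *	    flags, rid);
 *	ret = sfmmu_tteload_addentry(sfmmup, hmeblkp, ttep, vaddr, pps,
 *	    flags, rid);
 *	sfmmu_tteload_release_hashbucket(hmebp);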
2824 */ 2825 static struct hmehash_bucket * 2826 sfmmu_tteload_acquire_hashbucket(sfmmu_t *sfmmup, caddr_t vaddr, int size, 2827 uint_t rid) 2828 { 2829 struct hmehash_bucket *hmebp; 2830 int hmeshift; 2831 void *htagid = sfmmutohtagid(sfmmup, rid); 2832 2833 ASSERT(htagid != NULL); 2834 2835 hmeshift = HME_HASH_SHIFT(size); 2836 2837 hmebp = HME_HASH_FUNCTION(htagid, vaddr, hmeshift); 2838 2839 SFMMU_HASH_LOCK(hmebp); 2840 2841 return (hmebp); 2842 } 2843 2844 /* 2845 * Function returns a pointer to an hmeblk in the hash bucket, hmebp. If the 2846 * hmeblk doesn't exists for the [sfmmup, vaddr & size] signature, a hmeblk is 2847 * allocated. 2848 */ 2849 static struct hme_blk * 2850 sfmmu_tteload_find_hmeblk(sfmmu_t *sfmmup, struct hmehash_bucket *hmebp, 2851 caddr_t vaddr, uint_t size, uint_t flags, uint_t rid) 2852 { 2853 hmeblk_tag hblktag; 2854 int hmeshift; 2855 struct hme_blk *hmeblkp, *pr_hblk, *list = NULL; 2856 uint64_t hblkpa, prevpa; 2857 struct kmem_cache *sfmmu_cache; 2858 uint_t forcefree; 2859 2860 SFMMU_VALIDATE_HMERID(sfmmup, rid, vaddr, TTEBYTES(size)); 2861 2862 hblktag.htag_id = sfmmutohtagid(sfmmup, rid); 2863 ASSERT(hblktag.htag_id != NULL); 2864 hmeshift = HME_HASH_SHIFT(size); 2865 hblktag.htag_bspage = HME_HASH_BSPAGE(vaddr, hmeshift); 2866 hblktag.htag_rehash = HME_HASH_REHASH(size); 2867 hblktag.htag_rid = rid; 2868 2869 ttearray_realloc: 2870 2871 HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, hblkpa, 2872 pr_hblk, prevpa, &list); 2873 2874 /* 2875 * We block until hblk_reserve_lock is released; it's held by 2876 * the thread, temporarily using hblk_reserve, until hblk_reserve is 2877 * replaced by a hblk from sfmmu8_cache. 2878 */ 2879 if (hmeblkp == (struct hme_blk *)hblk_reserve && 2880 hblk_reserve_thread != curthread) { 2881 SFMMU_HASH_UNLOCK(hmebp); 2882 mutex_enter(&hblk_reserve_lock); 2883 mutex_exit(&hblk_reserve_lock); 2884 SFMMU_STAT(sf_hblk_reserve_hit); 2885 SFMMU_HASH_LOCK(hmebp); 2886 goto ttearray_realloc; 2887 } 2888 2889 if (hmeblkp == NULL) { 2890 hmeblkp = sfmmu_hblk_alloc(sfmmup, vaddr, hmebp, size, 2891 hblktag, flags, rid); 2892 ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || hmeblkp->hblk_shared); 2893 ASSERT(SFMMU_IS_SHMERID_VALID(rid) || !hmeblkp->hblk_shared); 2894 } else { 2895 /* 2896 * It is possible for 8k and 64k hblks to collide since they 2897 * have the same rehash value. This is because we 2898 * lazily free hblks and 8K/64K blks could be lingering. 2899 * If we find size mismatch we free the block and & try again. 2900 */ 2901 if (get_hblk_ttesz(hmeblkp) != size) { 2902 ASSERT(!hmeblkp->hblk_vcnt); 2903 ASSERT(!hmeblkp->hblk_hmecnt); 2904 sfmmu_hblk_hash_rm(hmebp, hmeblkp, prevpa, pr_hblk); 2905 sfmmu_hblk_free(hmebp, hmeblkp, hblkpa, &list); 2906 goto ttearray_realloc; 2907 } 2908 if (hmeblkp->hblk_shw_bit) { 2909 /* 2910 * if the hblk was previously used as a shadow hblk then 2911 * we will change it to a normal hblk 2912 */ 2913 ASSERT(!hmeblkp->hblk_shared); 2914 if (hmeblkp->hblk_shw_mask) { 2915 sfmmu_shadow_hcleanup(sfmmup, hmeblkp, hmebp); 2916 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 2917 goto ttearray_realloc; 2918 } else { 2919 hmeblkp->hblk_shw_bit = 0; 2920 } 2921 } 2922 SFMMU_STAT(sf_hblk_hit); 2923 } 2924 2925 /* 2926 * hat_memload() should never call kmem_cache_free(); see block 2927 * comment showing the stacktrace in sfmmu_hblk_alloc(); 2928 * enqueue each hblk in the list to reserve list if it's created 2929 * from sfmmu8_cache *and* sfmmup == KHATID. 2930 */ 2931 forcefree = (sfmmup == KHATID) ? 
1 : 0; 2932 while ((pr_hblk = list) != NULL) { 2933 list = pr_hblk->hblk_next; 2934 sfmmu_cache = get_hblk_cache(pr_hblk); 2935 if ((sfmmu_cache == sfmmu8_cache) && 2936 sfmmu_put_free_hblk(pr_hblk, forcefree)) 2937 continue; 2938 2939 ASSERT(sfmmup != KHATID); 2940 kmem_cache_free(sfmmu_cache, pr_hblk); 2941 } 2942 2943 ASSERT(get_hblk_ttesz(hmeblkp) == size); 2944 ASSERT(!hmeblkp->hblk_shw_bit); 2945 ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || hmeblkp->hblk_shared); 2946 ASSERT(SFMMU_IS_SHMERID_VALID(rid) || !hmeblkp->hblk_shared); 2947 ASSERT(hmeblkp->hblk_tag.htag_rid == rid); 2948 2949 return (hmeblkp); 2950 } 2951 2952 /* 2953 * Function adds a tte entry into the hmeblk. It returns 0 if successful and 1 2954 * otherwise. 2955 */ 2956 static int 2957 sfmmu_tteload_addentry(sfmmu_t *sfmmup, struct hme_blk *hmeblkp, tte_t *ttep, 2958 caddr_t vaddr, page_t **pps, uint_t flags, uint_t rid) 2959 { 2960 page_t *pp = *pps; 2961 int hmenum, size, remap; 2962 tte_t tteold, flush_tte; 2963 #ifdef DEBUG 2964 tte_t orig_old; 2965 #endif /* DEBUG */ 2966 struct sf_hment *sfhme; 2967 kmutex_t *pml, *pmtx; 2968 hatlock_t *hatlockp; 2969 int myflt; 2970 2971 /* 2972 * remove this panic when we decide to let user virtual address 2973 * space be >= USERLIMIT. 2974 */ 2975 if (!TTE_IS_PRIVILEGED(ttep) && vaddr >= (caddr_t)USERLIMIT) 2976 panic("user addr %p in kernel space", (void *)vaddr); 2977 #if defined(TTE_IS_GLOBAL) 2978 if (TTE_IS_GLOBAL(ttep)) 2979 panic("sfmmu_tteload: creating global tte"); 2980 #endif 2981 2982 #ifdef DEBUG 2983 if (pf_is_memory(sfmmu_ttetopfn(ttep, vaddr)) && 2984 !TTE_IS_PCACHEABLE(ttep) && !sfmmu_allow_nc_trans) 2985 panic("sfmmu_tteload: non cacheable memory tte"); 2986 #endif /* DEBUG */ 2987 2988 /* don't simulate dirty bit for writeable ISM/DISM mappings */ 2989 if ((flags & HAT_LOAD_SHARE) && TTE_IS_WRITABLE(ttep)) { 2990 TTE_SET_REF(ttep); 2991 TTE_SET_MOD(ttep); 2992 } 2993 2994 if ((flags & HAT_LOAD_SHARE) || !TTE_IS_REF(ttep) || 2995 !TTE_IS_MOD(ttep)) { 2996 /* 2997 * Don't load TSB for dummy as in ISM. Also don't preload 2998 * the TSB if the TTE isn't writable since we're likely to 2999 * fault on it again -- preloading can be fairly expensive. 3000 */ 3001 flags |= SFMMU_NO_TSBLOAD; 3002 } 3003 3004 size = TTE_CSZ(ttep); 3005 switch (size) { 3006 case TTE8K: 3007 SFMMU_STAT(sf_tteload8k); 3008 break; 3009 case TTE64K: 3010 SFMMU_STAT(sf_tteload64k); 3011 break; 3012 case TTE512K: 3013 SFMMU_STAT(sf_tteload512k); 3014 break; 3015 case TTE4M: 3016 SFMMU_STAT(sf_tteload4m); 3017 break; 3018 case (TTE32M): 3019 SFMMU_STAT(sf_tteload32m); 3020 ASSERT(mmu_page_sizes == max_mmu_page_sizes); 3021 break; 3022 case (TTE256M): 3023 SFMMU_STAT(sf_tteload256m); 3024 ASSERT(mmu_page_sizes == max_mmu_page_sizes); 3025 break; 3026 } 3027 3028 ASSERT(!((uintptr_t)vaddr & TTE_PAGE_OFFSET(size))); 3029 SFMMU_VALIDATE_HMERID(sfmmup, rid, vaddr, TTEBYTES(size)); 3030 ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || hmeblkp->hblk_shared); 3031 ASSERT(SFMMU_IS_SHMERID_VALID(rid) || !hmeblkp->hblk_shared); 3032 3033 HBLKTOHME_IDX(sfhme, hmeblkp, vaddr, hmenum); 3034 3035 /* 3036 * Need to grab mlist lock here so that pageunload 3037 * will not change tte behind us. 3038 */ 3039 if (pp) { 3040 pml = sfmmu_mlist_enter(pp); 3041 } 3042 3043 sfmmu_copytte(&sfhme->hme_tte, &tteold); 3044 /* 3045 * Look for corresponding hment and if valid verify 3046 * pfns are equal. 
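 * If the old tte is still valid this is a remap: the checks below
 * panic if the mapping switches between io and memory or if the pfn
 * changes unexpectedly.  If the old tte already had hardware exec
 * permission, a new soft-exec tte is upgraded back to hardware exec.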
3047 */ 3048 remap = TTE_IS_VALID(&tteold); 3049 if (remap) { 3050 pfn_t new_pfn, old_pfn; 3051 3052 old_pfn = TTE_TO_PFN(vaddr, &tteold); 3053 new_pfn = TTE_TO_PFN(vaddr, ttep); 3054 3055 if (flags & HAT_LOAD_REMAP) { 3056 /* make sure we are remapping same type of pages */ 3057 if (pf_is_memory(old_pfn) != pf_is_memory(new_pfn)) { 3058 panic("sfmmu_tteload - tte remap io<->memory"); 3059 } 3060 if (old_pfn != new_pfn && 3061 (pp != NULL || sfhme->hme_page != NULL)) { 3062 panic("sfmmu_tteload - tte remap pp != NULL"); 3063 } 3064 } else if (old_pfn != new_pfn) { 3065 panic("sfmmu_tteload - tte remap, hmeblkp 0x%p", 3066 (void *)hmeblkp); 3067 } 3068 ASSERT(TTE_CSZ(&tteold) == TTE_CSZ(ttep)); 3069 3070 if (TTE_IS_EXECUTABLE(&tteold) && TTE_IS_SOFTEXEC(ttep)) { 3071 TTE_SET_EXEC(ttep); 3072 } 3073 } 3074 3075 if (pp) { 3076 /* 3077 * If we know that this page will be executed, because 3078 * it was in the past (PP_ISEXEC is already true), or 3079 * if the caller says it will likely be executed 3080 * (HAT_LOAD_TEXT is true), then there is no need to 3081 * dynamically detect execution with a soft exec 3082 * fault. Enable hardware execute permission now. 3083 */ 3084 if ((PP_ISEXEC(pp) || (flags & HAT_LOAD_TEXT)) && 3085 TTE_IS_SOFTEXEC(ttep)) { 3086 TTE_SET_EXEC(ttep); 3087 } 3088 3089 if (size == TTE8K) { 3090 #ifdef VAC 3091 /* 3092 * Handle VAC consistency 3093 */ 3094 if (!remap && (cache & CACHE_VAC) && !PP_ISNC(pp)) { 3095 sfmmu_vac_conflict(sfmmup, vaddr, pp); 3096 } 3097 #endif 3098 3099 if (TTE_IS_WRITABLE(ttep) && PP_ISRO(pp)) { 3100 pmtx = sfmmu_page_enter(pp); 3101 PP_CLRRO(pp); 3102 sfmmu_page_exit(pmtx); 3103 } else if (!PP_ISMAPPED(pp) && 3104 (!TTE_IS_WRITABLE(ttep)) && !(PP_ISMOD(pp))) { 3105 pmtx = sfmmu_page_enter(pp); 3106 if (!(PP_ISMOD(pp))) { 3107 PP_SETRO(pp); 3108 } 3109 sfmmu_page_exit(pmtx); 3110 } 3111 3112 if (TTE_EXECUTED(ttep)) { 3113 pmtx = sfmmu_page_enter(pp); 3114 PP_SETEXEC(pp); 3115 sfmmu_page_exit(pmtx); 3116 } 3117 3118 } else if (sfmmu_pagearray_setup(vaddr, pps, ttep, remap)) { 3119 /* 3120 * sfmmu_pagearray_setup failed so return 3121 */ 3122 sfmmu_mlist_exit(pml); 3123 return (1); 3124 } 3125 3126 } else if (TTE_IS_SOFTEXEC(ttep)) { 3127 TTE_SET_EXEC(ttep); 3128 } 3129 3130 /* 3131 * Make sure hment is not on a mapping list. 3132 */ 3133 ASSERT(remap || (sfhme->hme_page == NULL)); 3134 3135 /* if it is not a remap then hme->next better be NULL */ 3136 ASSERT((!remap) ? sfhme->hme_next == NULL : 1); 3137 3138 if (flags & HAT_LOAD_LOCK) { 3139 if ((hmeblkp->hblk_lckcnt + 1) >= MAX_HBLK_LCKCNT) { 3140 panic("too high lckcnt-hmeblk %p", 3141 (void *)hmeblkp); 3142 } 3143 atomic_add_32(&hmeblkp->hblk_lckcnt, 1); 3144 3145 HBLK_STACK_TRACE(hmeblkp, HBLK_LOCK); 3146 } 3147 3148 #ifdef VAC 3149 if (pp && PP_ISNC(pp)) { 3150 /* 3151 * If the physical page is marked to be uncacheable, like 3152 * by a vac conflict, make sure the new mapping is also 3153 * uncacheable. 
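 * (An uncached page carries no valid virtual color, hence the
 * NO_VCOLOR ASSERT below; clearing the virtual-cacheable bit keeps
 * all mappings of the page consistent.)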
3154 */ 3155 TTE_CLR_VCACHEABLE(ttep); 3156 ASSERT(PP_GET_VCOLOR(pp) == NO_VCOLOR); 3157 } 3158 #endif 3159 ttep->tte_hmenum = hmenum; 3160 3161 #ifdef DEBUG 3162 orig_old = tteold; 3163 #endif /* DEBUG */ 3164 3165 while (sfmmu_modifytte_try(&tteold, ttep, &sfhme->hme_tte) < 0) { 3166 if ((sfmmup == KHATID) && 3167 (flags & (HAT_LOAD_LOCK | HAT_LOAD_REMAP))) { 3168 sfmmu_copytte(&sfhme->hme_tte, &tteold); 3169 } 3170 #ifdef DEBUG 3171 chk_tte(&orig_old, &tteold, ttep, hmeblkp); 3172 #endif /* DEBUG */ 3173 } 3174 ASSERT(TTE_IS_VALID(&sfhme->hme_tte)); 3175 3176 if (!TTE_IS_VALID(&tteold)) { 3177 3178 atomic_add_16(&hmeblkp->hblk_vcnt, 1); 3179 if (rid == SFMMU_INVALID_SHMERID) { 3180 atomic_add_long(&sfmmup->sfmmu_ttecnt[size], 1); 3181 } else { 3182 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 3183 sf_region_t *rgnp = srdp->srd_hmergnp[rid]; 3184 /* 3185 * We already accounted for region ttecnt's in sfmmu 3186 * during hat_join_region() processing. Here we 3187 * only update ttecnt's in region struture. 3188 */ 3189 atomic_add_long(&rgnp->rgn_ttecnt[size], 1); 3190 } 3191 } 3192 3193 myflt = (astosfmmu(curthread->t_procp->p_as) == sfmmup); 3194 if (size > TTE8K && (flags & HAT_LOAD_SHARE) == 0 && 3195 sfmmup != ksfmmup) { 3196 uchar_t tteflag = 1 << size; 3197 if (rid == SFMMU_INVALID_SHMERID) { 3198 if (!(sfmmup->sfmmu_tteflags & tteflag)) { 3199 hatlockp = sfmmu_hat_enter(sfmmup); 3200 sfmmup->sfmmu_tteflags |= tteflag; 3201 sfmmu_hat_exit(hatlockp); 3202 } 3203 } else if (!(sfmmup->sfmmu_rtteflags & tteflag)) { 3204 hatlockp = sfmmu_hat_enter(sfmmup); 3205 sfmmup->sfmmu_rtteflags |= tteflag; 3206 sfmmu_hat_exit(hatlockp); 3207 } 3208 /* 3209 * Update the current CPU tsbmiss area, so the current thread 3210 * won't need to take the tsbmiss for the new pagesize. 3211 * The other threads in the process will update their tsb 3212 * miss area lazily in sfmmu_tsbmiss_exception() when they 3213 * fail to find the translation for a newly added pagesize. 3214 */ 3215 if (size > TTE64K && myflt) { 3216 struct tsbmiss *tsbmp; 3217 kpreempt_disable(); 3218 tsbmp = &tsbmiss_area[CPU->cpu_id]; 3219 if (rid == SFMMU_INVALID_SHMERID) { 3220 if (!(tsbmp->uhat_tteflags & tteflag)) { 3221 tsbmp->uhat_tteflags |= tteflag; 3222 } 3223 } else { 3224 if (!(tsbmp->uhat_rtteflags & tteflag)) { 3225 tsbmp->uhat_rtteflags |= tteflag; 3226 } 3227 } 3228 kpreempt_enable(); 3229 } 3230 } 3231 3232 if (size >= TTE4M && (flags & HAT_LOAD_TEXT) && 3233 !SFMMU_FLAGS_ISSET(sfmmup, HAT_4MTEXT_FLAG)) { 3234 hatlockp = sfmmu_hat_enter(sfmmup); 3235 SFMMU_FLAGS_SET(sfmmup, HAT_4MTEXT_FLAG); 3236 sfmmu_hat_exit(hatlockp); 3237 } 3238 3239 flush_tte.tte_intlo = (tteold.tte_intlo ^ ttep->tte_intlo) & 3240 hw_tte.tte_intlo; 3241 flush_tte.tte_inthi = (tteold.tte_inthi ^ ttep->tte_inthi) & 3242 hw_tte.tte_inthi; 3243 3244 if (remap && (flush_tte.tte_inthi || flush_tte.tte_intlo)) { 3245 /* 3246 * If remap and new tte differs from old tte we need 3247 * to sync the mod bit and flush TLB/TSB. We don't 3248 * need to sync ref bit because we currently always set 3249 * ref bit in tteload. 3250 */ 3251 ASSERT(TTE_IS_REF(ttep)); 3252 if (TTE_IS_MOD(&tteold) || (TTE_EXECUTED(&tteold) && 3253 !TTE_IS_EXECUTABLE(ttep))) { 3254 sfmmu_ttesync(sfmmup, vaddr, &tteold, pp); 3255 } 3256 /* 3257 * hwtte bits shouldn't change for SRD hmeblks as long as SRD 3258 * hmes are only used for read only text. Adding this code for 3259 * completeness and future use of shared hmeblks with writable 3260 * mappings of VMODSORT vnodes. 
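 * flush_tte above keeps only the bits that both changed between the
 * old and the new tte and are interpreted by hardware (the hw_tte
 * mask); a demap plus xt_sync is issued below only when a remap
 * changed such a bit.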
3261 */ 3262 if (hmeblkp->hblk_shared) { 3263 cpuset_t cpuset = sfmmu_rgntlb_demap(vaddr, 3264 sfmmup->sfmmu_srdp->srd_hmergnp[rid], hmeblkp, 1); 3265 xt_sync(cpuset); 3266 SFMMU_STAT_ADD(sf_region_remap_demap, 1); 3267 } else { 3268 sfmmu_tlb_demap(vaddr, sfmmup, hmeblkp, 0, 0); 3269 xt_sync(sfmmup->sfmmu_cpusran); 3270 } 3271 } 3272 3273 if ((flags & SFMMU_NO_TSBLOAD) == 0) { 3274 /* 3275 * We only preload 8K and 4M mappings into the TSB, since 3276 * 64K and 512K mappings are replicated and hence don't 3277 * have a single, unique TSB entry. Ditto for 32M/256M. 3278 */ 3279 if (size == TTE8K || size == TTE4M) { 3280 sf_scd_t *scdp; 3281 hatlockp = sfmmu_hat_enter(sfmmup); 3282 /* 3283 * Don't preload private TSB if the mapping is used 3284 * by the shctx in the SCD. 3285 */ 3286 scdp = sfmmup->sfmmu_scdp; 3287 if (rid == SFMMU_INVALID_SHMERID || scdp == NULL || 3288 !SF_RGNMAP_TEST(scdp->scd_hmeregion_map, rid)) { 3289 sfmmu_load_tsb(sfmmup, vaddr, &sfhme->hme_tte, 3290 size); 3291 } 3292 sfmmu_hat_exit(hatlockp); 3293 } 3294 } 3295 if (pp) { 3296 if (!remap) { 3297 HME_ADD(sfhme, pp); 3298 atomic_add_16(&hmeblkp->hblk_hmecnt, 1); 3299 ASSERT(hmeblkp->hblk_hmecnt > 0); 3300 3301 /* 3302 * Cannot ASSERT(hmeblkp->hblk_hmecnt <= NHMENTS) 3303 * see pageunload() for comment. 3304 */ 3305 } 3306 sfmmu_mlist_exit(pml); 3307 } 3308 3309 return (0); 3310 } 3311 /* 3312 * Function unlocks hash bucket. 3313 */ 3314 static void 3315 sfmmu_tteload_release_hashbucket(struct hmehash_bucket *hmebp) 3316 { 3317 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 3318 SFMMU_HASH_UNLOCK(hmebp); 3319 } 3320 3321 /* 3322 * function which checks and sets up page array for a large 3323 * translation. Will set p_vcolor, p_index, p_ro fields. 3324 * Assumes addr and pfnum of first page are properly aligned. 3325 * Will check for physical contiguity. If check fails it return 3326 * non null. 3327 */ 3328 static int 3329 sfmmu_pagearray_setup(caddr_t addr, page_t **pps, tte_t *ttep, int remap) 3330 { 3331 int i, index, ttesz; 3332 pfn_t pfnum; 3333 pgcnt_t npgs; 3334 page_t *pp, *pp1; 3335 kmutex_t *pmtx; 3336 #ifdef VAC 3337 int osz; 3338 int cflags = 0; 3339 int vac_err = 0; 3340 #endif 3341 int newidx = 0; 3342 3343 ttesz = TTE_CSZ(ttep); 3344 3345 ASSERT(ttesz > TTE8K); 3346 3347 npgs = TTEPAGES(ttesz); 3348 index = PAGESZ_TO_INDEX(ttesz); 3349 3350 pfnum = (*pps)->p_pagenum; 3351 ASSERT(IS_P2ALIGNED(pfnum, npgs)); 3352 3353 /* 3354 * Save the first pp so we can do HAT_TMPNC at the end. 3355 */ 3356 pp1 = *pps; 3357 #ifdef VAC 3358 osz = fnd_mapping_sz(pp1); 3359 #endif 3360 3361 for (i = 0; i < npgs; i++, pps++) { 3362 pp = *pps; 3363 ASSERT(PAGE_LOCKED(pp)); 3364 ASSERT(pp->p_szc >= ttesz); 3365 ASSERT(pp->p_szc == pp1->p_szc); 3366 ASSERT(sfmmu_mlist_held(pp)); 3367 3368 /* 3369 * XXX is it possible to maintain P_RO on the root only? 3370 */ 3371 if (TTE_IS_WRITABLE(ttep) && PP_ISRO(pp)) { 3372 pmtx = sfmmu_page_enter(pp); 3373 PP_CLRRO(pp); 3374 sfmmu_page_exit(pmtx); 3375 } else if (!PP_ISMAPPED(pp) && !TTE_IS_WRITABLE(ttep) && 3376 !PP_ISMOD(pp)) { 3377 pmtx = sfmmu_page_enter(pp); 3378 if (!(PP_ISMOD(pp))) { 3379 PP_SETRO(pp); 3380 } 3381 sfmmu_page_exit(pmtx); 3382 } 3383 3384 if (TTE_EXECUTED(ttep)) { 3385 pmtx = sfmmu_page_enter(pp); 3386 PP_SETEXEC(pp); 3387 sfmmu_page_exit(pmtx); 3388 } 3389 3390 /* 3391 * If this is a remap we skip vac & contiguity checks. 3392 */ 3393 if (remap) 3394 continue; 3395 3396 /* 3397 * set p_vcolor and detect any vac conflicts. 
3398 */ 3399 #ifdef VAC 3400 if (vac_err == 0) { 3401 vac_err = sfmmu_vacconflict_array(addr, pp, &cflags); 3402 3403 } 3404 #endif 3405 3406 /* 3407 * Save current index in case we need to undo it. 3408 * Note: "PAGESZ_TO_INDEX(sz) (1 << (sz))" 3409 * "SFMMU_INDEX_SHIFT 6" 3410 * "SFMMU_INDEX_MASK ((1 << SFMMU_INDEX_SHIFT) - 1)" 3411 * "PP_MAPINDEX(p_index) (p_index & SFMMU_INDEX_MASK)" 3412 * 3413 * So: index = PAGESZ_TO_INDEX(ttesz); 3414 * if ttesz == 1 then index = 0x2 3415 * 2 then index = 0x4 3416 * 3 then index = 0x8 3417 * 4 then index = 0x10 3418 * 5 then index = 0x20 3419 * The code below checks if it's a new pagesize (ie, newidx) 3420 * in case we need to take it back out of p_index, 3421 * and then or's the new index into the existing index. 3422 */ 3423 if ((PP_MAPINDEX(pp) & index) == 0) 3424 newidx = 1; 3425 pp->p_index = (PP_MAPINDEX(pp) | index); 3426 3427 /* 3428 * contiguity check 3429 */ 3430 if (pp->p_pagenum != pfnum) { 3431 /* 3432 * If we fail the contiguity test then 3433 * the only thing we need to fix is the p_index field. 3434 * We might get a few extra flushes but since this 3435 * path is rare that is ok. The p_ro field will 3436 * get automatically fixed on the next tteload to 3437 * the page. NO TNC bit is set yet. 3438 */ 3439 while (i >= 0) { 3440 pp = *pps; 3441 if (newidx) 3442 pp->p_index = (PP_MAPINDEX(pp) & 3443 ~index); 3444 pps--; 3445 i--; 3446 } 3447 return (1); 3448 } 3449 pfnum++; 3450 addr += MMU_PAGESIZE; 3451 } 3452 3453 #ifdef VAC 3454 if (vac_err) { 3455 if (ttesz > osz) { 3456 /* 3457 * There are some smaller mappings that causes vac 3458 * conflicts. Convert all existing small mappings to 3459 * TNC. 3460 */ 3461 SFMMU_STAT_ADD(sf_uncache_conflict, npgs); 3462 sfmmu_page_cache_array(pp1, HAT_TMPNC, CACHE_FLUSH, 3463 npgs); 3464 } else { 3465 /* EMPTY */ 3466 /* 3467 * If there exists an big page mapping, 3468 * that means the whole existing big page 3469 * has TNC setting already. No need to covert to 3470 * TNC again. 3471 */ 3472 ASSERT(PP_ISTNC(pp1)); 3473 } 3474 } 3475 #endif /* VAC */ 3476 3477 return (0); 3478 } 3479 3480 #ifdef VAC 3481 /* 3482 * Routine that detects vac consistency for a large page. It also 3483 * sets virtual color for all pp's for this big mapping. 3484 */ 3485 static int 3486 sfmmu_vacconflict_array(caddr_t addr, page_t *pp, int *cflags) 3487 { 3488 int vcolor, ocolor; 3489 3490 ASSERT(sfmmu_mlist_held(pp)); 3491 3492 if (PP_ISNC(pp)) { 3493 return (HAT_TMPNC); 3494 } 3495 3496 vcolor = addr_to_vcolor(addr); 3497 if (PP_NEWPAGE(pp)) { 3498 PP_SET_VCOLOR(pp, vcolor); 3499 return (0); 3500 } 3501 3502 ocolor = PP_GET_VCOLOR(pp); 3503 if (ocolor == vcolor) { 3504 return (0); 3505 } 3506 3507 if (!PP_ISMAPPED(pp) && !PP_ISMAPPED_KPM(pp)) { 3508 /* 3509 * Previous user of page had a differnet color 3510 * but since there are no current users 3511 * we just flush the cache and change the color. 3512 * As an optimization for large pages we flush the 3513 * entire cache of that color and set a flag. 3514 */ 3515 SFMMU_STAT(sf_pgcolor_conflict); 3516 if (!CacheColor_IsFlushed(*cflags, ocolor)) { 3517 CacheColor_SetFlushed(*cflags, ocolor); 3518 sfmmu_cache_flushcolor(ocolor, pp->p_pagenum); 3519 } 3520 PP_SET_VCOLOR(pp, vcolor); 3521 return (0); 3522 } 3523 3524 /* 3525 * We got a real conflict with a current mapping. 3526 * set flags to start unencaching all mappings 3527 * and return failure so we restart looping 3528 * the pp array from the beginning. 
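 * (In sfmmu_pagearray_setup() above, a HAT_TMPNC return stops further
 * color checks; once the page loop finishes, the existing smaller
 * mappings are converted to temporary non-cacheable (TNC), unless the
 * large page is already TNC.)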
3529 */ 3530 return (HAT_TMPNC); 3531 } 3532 #endif /* VAC */ 3533 3534 /* 3535 * creates a large page shadow hmeblk for a tte. 3536 * The purpose of this routine is to allow us to do quick unloads because 3537 * the vm layer can easily pass a very large but sparsely populated range. 3538 */ 3539 static struct hme_blk * 3540 sfmmu_shadow_hcreate(sfmmu_t *sfmmup, caddr_t vaddr, int ttesz, uint_t flags) 3541 { 3542 struct hmehash_bucket *hmebp; 3543 hmeblk_tag hblktag; 3544 int hmeshift, size, vshift; 3545 uint_t shw_mask, newshw_mask; 3546 struct hme_blk *hmeblkp; 3547 3548 ASSERT(sfmmup != KHATID); 3549 if (mmu_page_sizes == max_mmu_page_sizes) { 3550 ASSERT(ttesz < TTE256M); 3551 } else { 3552 ASSERT(ttesz < TTE4M); 3553 ASSERT(sfmmup->sfmmu_ttecnt[TTE32M] == 0); 3554 ASSERT(sfmmup->sfmmu_ttecnt[TTE256M] == 0); 3555 } 3556 3557 if (ttesz == TTE8K) { 3558 size = TTE512K; 3559 } else { 3560 size = ++ttesz; 3561 } 3562 3563 hblktag.htag_id = sfmmup; 3564 hmeshift = HME_HASH_SHIFT(size); 3565 hblktag.htag_bspage = HME_HASH_BSPAGE(vaddr, hmeshift); 3566 hblktag.htag_rehash = HME_HASH_REHASH(size); 3567 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 3568 hmebp = HME_HASH_FUNCTION(sfmmup, vaddr, hmeshift); 3569 3570 SFMMU_HASH_LOCK(hmebp); 3571 3572 HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp); 3573 ASSERT(hmeblkp != (struct hme_blk *)hblk_reserve); 3574 if (hmeblkp == NULL) { 3575 hmeblkp = sfmmu_hblk_alloc(sfmmup, vaddr, hmebp, size, 3576 hblktag, flags, SFMMU_INVALID_SHMERID); 3577 } 3578 ASSERT(hmeblkp); 3579 if (!hmeblkp->hblk_shw_mask) { 3580 /* 3581 * if this is a unused hblk it was just allocated or could 3582 * potentially be a previous large page hblk so we need to 3583 * set the shadow bit. 3584 */ 3585 ASSERT(!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt); 3586 hmeblkp->hblk_shw_bit = 1; 3587 } else if (hmeblkp->hblk_shw_bit == 0) { 3588 panic("sfmmu_shadow_hcreate: shw bit not set in hmeblkp 0x%p", 3589 (void *)hmeblkp); 3590 } 3591 ASSERT(hmeblkp->hblk_shw_bit == 1); 3592 ASSERT(!hmeblkp->hblk_shared); 3593 vshift = vaddr_to_vshift(hblktag, vaddr, size); 3594 ASSERT(vshift < 8); 3595 /* 3596 * Atomically set shw mask bit 3597 */ 3598 do { 3599 shw_mask = hmeblkp->hblk_shw_mask; 3600 newshw_mask = shw_mask | (1 << vshift); 3601 newshw_mask = cas32(&hmeblkp->hblk_shw_mask, shw_mask, 3602 newshw_mask); 3603 } while (newshw_mask != shw_mask); 3604 3605 SFMMU_HASH_UNLOCK(hmebp); 3606 3607 return (hmeblkp); 3608 } 3609 3610 /* 3611 * This routine cleanup a previous shadow hmeblk and changes it to 3612 * a regular hblk. This happens rarely but it is possible 3613 * when a process wants to use large pages and there are hblks still 3614 * lying around from the previous as that used these hmeblks. 3615 * The alternative was to cleanup the shadow hblks at unload time 3616 * but since so few user processes actually use large pages, it is 3617 * better to be lazy and cleanup at this time. 
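 *
 * hblk_shw_mask has one bit per child range, set in
 * sfmmu_shadow_hcreate() above, recording which parts of the shadow
 * hblk's span may still have smaller hblks under them.  The cleanup
 * below walks the hblk's range one hash level down (hashno = size - 1)
 * and frees any empty hblks it finds, recursing when it meets another
 * shadow hblk.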
3618 */ 3619 static void 3620 sfmmu_shadow_hcleanup(sfmmu_t *sfmmup, struct hme_blk *hmeblkp, 3621 struct hmehash_bucket *hmebp) 3622 { 3623 caddr_t addr, endaddr; 3624 int hashno, size; 3625 3626 ASSERT(hmeblkp->hblk_shw_bit); 3627 ASSERT(!hmeblkp->hblk_shared); 3628 3629 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 3630 3631 if (!hmeblkp->hblk_shw_mask) { 3632 hmeblkp->hblk_shw_bit = 0; 3633 return; 3634 } 3635 addr = (caddr_t)get_hblk_base(hmeblkp); 3636 endaddr = get_hblk_endaddr(hmeblkp); 3637 size = get_hblk_ttesz(hmeblkp); 3638 hashno = size - 1; 3639 ASSERT(hashno > 0); 3640 SFMMU_HASH_UNLOCK(hmebp); 3641 3642 sfmmu_free_hblks(sfmmup, addr, endaddr, hashno); 3643 3644 SFMMU_HASH_LOCK(hmebp); 3645 } 3646 3647 static void 3648 sfmmu_free_hblks(sfmmu_t *sfmmup, caddr_t addr, caddr_t endaddr, 3649 int hashno) 3650 { 3651 int hmeshift, shadow = 0; 3652 hmeblk_tag hblktag; 3653 struct hmehash_bucket *hmebp; 3654 struct hme_blk *hmeblkp; 3655 struct hme_blk *nx_hblk, *pr_hblk, *list = NULL; 3656 uint64_t hblkpa, prevpa, nx_pa; 3657 3658 ASSERT(hashno > 0); 3659 hblktag.htag_id = sfmmup; 3660 hblktag.htag_rehash = hashno; 3661 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 3662 3663 hmeshift = HME_HASH_SHIFT(hashno); 3664 3665 while (addr < endaddr) { 3666 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 3667 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift); 3668 SFMMU_HASH_LOCK(hmebp); 3669 /* inline HME_HASH_SEARCH */ 3670 hmeblkp = hmebp->hmeblkp; 3671 hblkpa = hmebp->hmeh_nextpa; 3672 prevpa = 0; 3673 pr_hblk = NULL; 3674 while (hmeblkp) { 3675 ASSERT(hblkpa == va_to_pa((caddr_t)hmeblkp)); 3676 if (HTAGS_EQ(hmeblkp->hblk_tag, hblktag)) { 3677 /* found hme_blk */ 3678 ASSERT(!hmeblkp->hblk_shared); 3679 if (hmeblkp->hblk_shw_bit) { 3680 if (hmeblkp->hblk_shw_mask) { 3681 shadow = 1; 3682 sfmmu_shadow_hcleanup(sfmmup, 3683 hmeblkp, hmebp); 3684 break; 3685 } else { 3686 hmeblkp->hblk_shw_bit = 0; 3687 } 3688 } 3689 3690 /* 3691 * Hblk_hmecnt and hblk_vcnt could be non zero 3692 * since hblk_unload() does not gurantee that. 3693 * 3694 * XXX - this could cause tteload() to spin 3695 * where sfmmu_shadow_hcleanup() is called. 3696 */ 3697 } 3698 3699 nx_hblk = hmeblkp->hblk_next; 3700 nx_pa = hmeblkp->hblk_nextpa; 3701 if (!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) { 3702 sfmmu_hblk_hash_rm(hmebp, hmeblkp, prevpa, 3703 pr_hblk); 3704 sfmmu_hblk_free(hmebp, hmeblkp, hblkpa, &list); 3705 } else { 3706 pr_hblk = hmeblkp; 3707 prevpa = hblkpa; 3708 } 3709 hmeblkp = nx_hblk; 3710 hblkpa = nx_pa; 3711 } 3712 3713 SFMMU_HASH_UNLOCK(hmebp); 3714 3715 if (shadow) { 3716 /* 3717 * We found another shadow hblk so cleaned its 3718 * children. We need to go back and cleanup 3719 * the original hblk so we don't change the 3720 * addr. 3721 */ 3722 shadow = 0; 3723 } else { 3724 addr = (caddr_t)roundup((uintptr_t)addr + 1, 3725 (1 << hmeshift)); 3726 } 3727 } 3728 sfmmu_hblks_list_purge(&list); 3729 } 3730 3731 /* 3732 * This routine's job is to delete stale invalid shared hmeregions hmeblks that 3733 * may still linger on after pageunload. 
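 * Only completely empty hmeblks are expected here: a shared hmeblk
 * that still has valid mappings makes this routine panic.  Otherwise
 * the hmeblk is simply unlinked from its hash bucket and freed.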
3734 */ 3735 static void 3736 sfmmu_cleanup_rhblk(sf_srd_t *srdp, caddr_t addr, uint_t rid, int ttesz) 3737 { 3738 int hmeshift; 3739 hmeblk_tag hblktag; 3740 struct hmehash_bucket *hmebp; 3741 struct hme_blk *hmeblkp; 3742 struct hme_blk *pr_hblk; 3743 struct hme_blk *list = NULL; 3744 uint64_t hblkpa, prevpa; 3745 3746 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 3747 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 3748 3749 hmeshift = HME_HASH_SHIFT(ttesz); 3750 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 3751 hblktag.htag_rehash = ttesz; 3752 hblktag.htag_rid = rid; 3753 hblktag.htag_id = srdp; 3754 hmebp = HME_HASH_FUNCTION(srdp, addr, hmeshift); 3755 3756 SFMMU_HASH_LOCK(hmebp); 3757 HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, hblkpa, pr_hblk, 3758 prevpa, &list); 3759 if (hmeblkp != NULL) { 3760 ASSERT(hmeblkp->hblk_shared); 3761 ASSERT(!hmeblkp->hblk_shw_bit); 3762 if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) { 3763 panic("sfmmu_cleanup_rhblk: valid hmeblk"); 3764 } 3765 ASSERT(!hmeblkp->hblk_lckcnt); 3766 sfmmu_hblk_hash_rm(hmebp, hmeblkp, prevpa, pr_hblk); 3767 sfmmu_hblk_free(hmebp, hmeblkp, hblkpa, &list); 3768 } 3769 SFMMU_HASH_UNLOCK(hmebp); 3770 sfmmu_hblks_list_purge(&list); 3771 } 3772 3773 /* ARGSUSED */ 3774 static void 3775 sfmmu_rgn_cb_noop(caddr_t saddr, caddr_t eaddr, caddr_t r_saddr, 3776 size_t r_size, void *r_obj, u_offset_t r_objoff) 3777 { 3778 } 3779 3780 /* 3781 * Searches for an hmeblk which maps addr, then unloads this mapping 3782 * and updates *eaddrp, if the hmeblk is found. 3783 */ 3784 static void 3785 sfmmu_unload_hmeregion_va(sf_srd_t *srdp, uint_t rid, caddr_t addr, 3786 caddr_t eaddr, int ttesz, caddr_t *eaddrp) 3787 { 3788 int hmeshift; 3789 hmeblk_tag hblktag; 3790 struct hmehash_bucket *hmebp; 3791 struct hme_blk *hmeblkp; 3792 struct hme_blk *pr_hblk; 3793 struct hme_blk *list = NULL; 3794 uint64_t hblkpa, prevpa; 3795 3796 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 3797 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 3798 ASSERT(ttesz >= HBLK_MIN_TTESZ); 3799 3800 hmeshift = HME_HASH_SHIFT(ttesz); 3801 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 3802 hblktag.htag_rehash = ttesz; 3803 hblktag.htag_rid = rid; 3804 hblktag.htag_id = srdp; 3805 hmebp = HME_HASH_FUNCTION(srdp, addr, hmeshift); 3806 3807 SFMMU_HASH_LOCK(hmebp); 3808 HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, hblkpa, pr_hblk, 3809 prevpa, &list); 3810 if (hmeblkp != NULL) { 3811 ASSERT(hmeblkp->hblk_shared); 3812 ASSERT(!hmeblkp->hblk_lckcnt); 3813 if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) { 3814 *eaddrp = sfmmu_hblk_unload(NULL, hmeblkp, addr, 3815 eaddr, NULL, HAT_UNLOAD); 3816 ASSERT(*eaddrp > addr); 3817 } 3818 ASSERT(!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt); 3819 sfmmu_hblk_hash_rm(hmebp, hmeblkp, prevpa, pr_hblk); 3820 sfmmu_hblk_free(hmebp, hmeblkp, hblkpa, &list); 3821 } 3822 SFMMU_HASH_UNLOCK(hmebp); 3823 sfmmu_hblks_list_purge(&list); 3824 } 3825 3826 static void 3827 sfmmu_unload_hmeregion(sf_srd_t *srdp, sf_region_t *rgnp) 3828 { 3829 int ttesz = rgnp->rgn_pgszc; 3830 size_t rsz = rgnp->rgn_size; 3831 caddr_t rsaddr = rgnp->rgn_saddr; 3832 caddr_t readdr = rsaddr + rsz; 3833 caddr_t rhsaddr; 3834 caddr_t va; 3835 uint_t rid = rgnp->rgn_id; 3836 caddr_t cbsaddr; 3837 caddr_t cbeaddr; 3838 hat_rgn_cb_func_t rcbfunc; 3839 ulong_t cnt; 3840 3841 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 3842 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 3843 3844 ASSERT(IS_P2ALIGNED(rsaddr, TTEBYTES(ttesz))); 3845 ASSERT(IS_P2ALIGNED(rsz, TTEBYTES(ttesz))); 3846 if (ttesz < HBLK_MIN_TTESZ) { 3847 ttesz 
= HBLK_MIN_TTESZ; 3848 rhsaddr = (caddr_t)P2ALIGN((uintptr_t)rsaddr, HBLK_MIN_BYTES); 3849 } else { 3850 rhsaddr = rsaddr; 3851 } 3852 3853 if ((rcbfunc = rgnp->rgn_cb_function) == NULL) { 3854 rcbfunc = sfmmu_rgn_cb_noop; 3855 } 3856 3857 while (ttesz >= HBLK_MIN_TTESZ) { 3858 cbsaddr = rsaddr; 3859 cbeaddr = rsaddr; 3860 if (!(rgnp->rgn_hmeflags & (1 << ttesz))) { 3861 ttesz--; 3862 continue; 3863 } 3864 cnt = 0; 3865 va = rsaddr; 3866 while (va < readdr) { 3867 ASSERT(va >= rhsaddr); 3868 if (va != cbeaddr) { 3869 if (cbeaddr != cbsaddr) { 3870 ASSERT(cbeaddr > cbsaddr); 3871 (*rcbfunc)(cbsaddr, cbeaddr, 3872 rsaddr, rsz, rgnp->rgn_obj, 3873 rgnp->rgn_objoff); 3874 } 3875 cbsaddr = va; 3876 cbeaddr = va; 3877 } 3878 sfmmu_unload_hmeregion_va(srdp, rid, va, readdr, 3879 ttesz, &cbeaddr); 3880 cnt++; 3881 va = rhsaddr + (cnt << TTE_PAGE_SHIFT(ttesz)); 3882 } 3883 if (cbeaddr != cbsaddr) { 3884 ASSERT(cbeaddr > cbsaddr); 3885 (*rcbfunc)(cbsaddr, cbeaddr, rsaddr, 3886 rsz, rgnp->rgn_obj, 3887 rgnp->rgn_objoff); 3888 } 3889 ttesz--; 3890 } 3891 } 3892 3893 /* 3894 * Release one hardware address translation lock on the given address range. 3895 */ 3896 void 3897 hat_unlock(struct hat *sfmmup, caddr_t addr, size_t len) 3898 { 3899 struct hmehash_bucket *hmebp; 3900 hmeblk_tag hblktag; 3901 int hmeshift, hashno = 1; 3902 struct hme_blk *hmeblkp, *list = NULL; 3903 caddr_t endaddr; 3904 3905 ASSERT(sfmmup != NULL); 3906 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 3907 3908 ASSERT((sfmmup == ksfmmup) || 3909 AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 3910 ASSERT((len & MMU_PAGEOFFSET) == 0); 3911 endaddr = addr + len; 3912 hblktag.htag_id = sfmmup; 3913 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 3914 3915 /* 3916 * Spitfire supports 4 page sizes. 3917 * Most pages are expected to be of the smallest page size (8K) and 3918 * these will not need to be rehashed. 64K pages also don't need to be 3919 * rehashed because an hmeblk spans 64K of address space. 512K pages 3920 * might need 1 rehash and and 4M pages might need 2 rehashes. 3921 */ 3922 while (addr < endaddr) { 3923 hmeshift = HME_HASH_SHIFT(hashno); 3924 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 3925 hblktag.htag_rehash = hashno; 3926 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift); 3927 3928 SFMMU_HASH_LOCK(hmebp); 3929 3930 HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list); 3931 if (hmeblkp != NULL) { 3932 ASSERT(!hmeblkp->hblk_shared); 3933 /* 3934 * If we encounter a shadow hmeblk then 3935 * we know there are no valid hmeblks mapping 3936 * this address at this size or larger. 3937 * Just increment address by the smallest 3938 * page size. 3939 */ 3940 if (hmeblkp->hblk_shw_bit) { 3941 addr += MMU_PAGESIZE; 3942 } else { 3943 addr = sfmmu_hblk_unlock(hmeblkp, addr, 3944 endaddr); 3945 } 3946 SFMMU_HASH_UNLOCK(hmebp); 3947 hashno = 1; 3948 continue; 3949 } 3950 SFMMU_HASH_UNLOCK(hmebp); 3951 3952 if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) { 3953 /* 3954 * We have traversed the whole list and rehashed 3955 * if necessary without finding the address to unlock 3956 * which should never happen. 3957 */ 3958 panic("sfmmu_unlock: addr not found. 
" 3959 "addr %p hat %p", (void *)addr, (void *)sfmmup); 3960 } else { 3961 hashno++; 3962 } 3963 } 3964 3965 sfmmu_hblks_list_purge(&list); 3966 } 3967 3968 void 3969 hat_unlock_region(struct hat *sfmmup, caddr_t addr, size_t len, 3970 hat_region_cookie_t rcookie) 3971 { 3972 sf_srd_t *srdp; 3973 sf_region_t *rgnp; 3974 int ttesz; 3975 uint_t rid; 3976 caddr_t eaddr; 3977 caddr_t va; 3978 int hmeshift; 3979 hmeblk_tag hblktag; 3980 struct hmehash_bucket *hmebp; 3981 struct hme_blk *hmeblkp; 3982 struct hme_blk *pr_hblk; 3983 struct hme_blk *list; 3984 uint64_t hblkpa, prevpa; 3985 3986 if (rcookie == HAT_INVALID_REGION_COOKIE) { 3987 hat_unlock(sfmmup, addr, len); 3988 return; 3989 } 3990 3991 ASSERT(sfmmup != NULL); 3992 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 3993 ASSERT(sfmmup != ksfmmup); 3994 3995 srdp = sfmmup->sfmmu_srdp; 3996 rid = (uint_t)((uint64_t)rcookie); 3997 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 3998 eaddr = addr + len; 3999 va = addr; 4000 list = NULL; 4001 rgnp = srdp->srd_hmergnp[rid]; 4002 SFMMU_VALIDATE_HMERID(sfmmup, rid, addr, len); 4003 4004 ASSERT(IS_P2ALIGNED(addr, TTEBYTES(rgnp->rgn_pgszc))); 4005 ASSERT(IS_P2ALIGNED(len, TTEBYTES(rgnp->rgn_pgszc))); 4006 if (rgnp->rgn_pgszc < HBLK_MIN_TTESZ) { 4007 ttesz = HBLK_MIN_TTESZ; 4008 } else { 4009 ttesz = rgnp->rgn_pgszc; 4010 } 4011 while (va < eaddr) { 4012 while (ttesz < rgnp->rgn_pgszc && 4013 IS_P2ALIGNED(va, TTEBYTES(ttesz + 1))) { 4014 ttesz++; 4015 } 4016 while (ttesz >= HBLK_MIN_TTESZ) { 4017 if (!(rgnp->rgn_hmeflags & (1 << ttesz))) { 4018 ttesz--; 4019 continue; 4020 } 4021 hmeshift = HME_HASH_SHIFT(ttesz); 4022 hblktag.htag_bspage = HME_HASH_BSPAGE(va, hmeshift); 4023 hblktag.htag_rehash = ttesz; 4024 hblktag.htag_rid = rid; 4025 hblktag.htag_id = srdp; 4026 hmebp = HME_HASH_FUNCTION(srdp, va, hmeshift); 4027 SFMMU_HASH_LOCK(hmebp); 4028 HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, hblkpa, 4029 pr_hblk, prevpa, &list); 4030 if (hmeblkp == NULL) { 4031 SFMMU_HASH_UNLOCK(hmebp); 4032 ttesz--; 4033 continue; 4034 } 4035 ASSERT(hmeblkp->hblk_shared); 4036 va = sfmmu_hblk_unlock(hmeblkp, va, eaddr); 4037 ASSERT(va >= eaddr || 4038 IS_P2ALIGNED((uintptr_t)va, TTEBYTES(ttesz))); 4039 SFMMU_HASH_UNLOCK(hmebp); 4040 break; 4041 } 4042 if (ttesz < HBLK_MIN_TTESZ) { 4043 panic("hat_unlock_region: addr not found " 4044 "addr %p hat %p", (void *)va, (void *)sfmmup); 4045 } 4046 } 4047 sfmmu_hblks_list_purge(&list); 4048 } 4049 4050 /* 4051 * Function to unlock a range of addresses in an hmeblk. It returns the 4052 * next address that needs to be unlocked. 4053 * Should be called with the hash lock held. 
4054 */ 4055 static caddr_t 4056 sfmmu_hblk_unlock(struct hme_blk *hmeblkp, caddr_t addr, caddr_t endaddr) 4057 { 4058 struct sf_hment *sfhme; 4059 tte_t tteold, ttemod; 4060 int ttesz, ret; 4061 4062 ASSERT(in_hblk_range(hmeblkp, addr)); 4063 ASSERT(hmeblkp->hblk_shw_bit == 0); 4064 4065 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp)); 4066 ttesz = get_hblk_ttesz(hmeblkp); 4067 4068 HBLKTOHME(sfhme, hmeblkp, addr); 4069 while (addr < endaddr) { 4070 readtte: 4071 sfmmu_copytte(&sfhme->hme_tte, &tteold); 4072 if (TTE_IS_VALID(&tteold)) { 4073 4074 ttemod = tteold; 4075 4076 ret = sfmmu_modifytte_try(&tteold, &ttemod, 4077 &sfhme->hme_tte); 4078 4079 if (ret < 0) 4080 goto readtte; 4081 4082 if (hmeblkp->hblk_lckcnt == 0) 4083 panic("zero hblk lckcnt"); 4084 4085 if (((uintptr_t)addr + TTEBYTES(ttesz)) > 4086 (uintptr_t)endaddr) 4087 panic("can't unlock large tte"); 4088 4089 ASSERT(hmeblkp->hblk_lckcnt > 0); 4090 atomic_add_32(&hmeblkp->hblk_lckcnt, -1); 4091 HBLK_STACK_TRACE(hmeblkp, HBLK_UNLOCK); 4092 } else { 4093 panic("sfmmu_hblk_unlock: invalid tte"); 4094 } 4095 addr += TTEBYTES(ttesz); 4096 sfhme++; 4097 } 4098 return (addr); 4099 } 4100 4101 /* 4102 * Physical Address Mapping Framework 4103 * 4104 * General rules: 4105 * 4106 * (1) Applies only to seg_kmem memory pages. To make things easier, 4107 * seg_kpm addresses are also accepted by the routines, but nothing 4108 * is done with them since by definition their PA mappings are static. 4109 * (2) hat_add_callback() may only be called while holding the page lock 4110 * SE_SHARED or SE_EXCL of the underlying page (e.g., as_pagelock()), 4111 * or passing HAC_PAGELOCK flag. 4112 * (3) prehandler() and posthandler() may not call hat_add_callback() or 4113 * hat_delete_callback(), nor should they allocate memory. Post quiesce 4114 * callbacks may not sleep or acquire adaptive mutex locks. 4115 * (4) Either prehandler() or posthandler() (but not both) may be specified 4116 * as being NULL. Specifying an errhandler() is optional. 4117 * 4118 * Details of using the framework: 4119 * 4120 * registering a callback (hat_register_callback()) 4121 * 4122 * Pass prehandler, posthandler, errhandler addresses 4123 * as described below. If capture_cpus argument is nonzero, 4124 * suspend callback to the prehandler will occur with CPUs 4125 * captured and executing xc_loop() and CPUs will remain 4126 * captured until after the posthandler suspend callback 4127 * occurs. 4128 * 4129 * adding a callback (hat_add_callback()) 4130 * 4131 * as_pagelock(); 4132 * hat_add_callback(); 4133 * save returned pfn in private data structures or program registers; 4134 * as_pageunlock(); 4135 * 4136 * prehandler() 4137 * 4138 * Stop all accesses by physical address to this memory page. 4139 * Called twice: the first, PRESUSPEND, is a context safe to acquire 4140 * adaptive locks. The second, SUSPEND, is called at high PIL with 4141 * CPUs captured so adaptive locks may NOT be acquired (and all spin 4142 * locks must be XCALL_PIL or higher locks). 4143 * 4144 * May return the following errors: 4145 * EIO: A fatal error has occurred. This will result in panic. 4146 * EAGAIN: The page cannot be suspended. This will fail the 4147 * relocation. 4148 * 0: Success. 4149 * 4150 * posthandler() 4151 * 4152 * Save new pfn in private data structures or program registers; 4153 * not allowed to fail (non-zero return values will result in panic). 4154 * 4155 * errhandler() 4156 * 4157 * called when an error occurs related to the callback. 
Currently 4158 * the only such error is HAT_CB_ERR_LEAKED which indicates that 4159 * a page is being freed, but there are still outstanding callback(s) 4160 * registered on the page. 4161 * 4162 * removing a callback (hat_delete_callback(); e.g., prior to freeing memory) 4163 * 4164 * stop using physical address 4165 * hat_delete_callback(); 4166 * 4167 */ 4168 4169 /* 4170 * Register a callback class. Each subsystem should do this once and 4171 * cache the id_t returned for use in setting up and tearing down callbacks. 4172 * 4173 * There is no facility for removing callback IDs once they are created; 4174 * the "key" should be unique for each module, so in case a module is unloaded 4175 * and subsequently re-loaded, we can recycle the module's previous entry. 4176 */ 4177 id_t 4178 hat_register_callback(int key, 4179 int (*prehandler)(caddr_t, uint_t, uint_t, void *), 4180 int (*posthandler)(caddr_t, uint_t, uint_t, void *, pfn_t), 4181 int (*errhandler)(caddr_t, uint_t, uint_t, void *), 4182 int capture_cpus) 4183 { 4184 id_t id; 4185 4186 /* 4187 * Search the table for a pre-existing callback associated with 4188 * the identifier "key". If one exists, we re-use that entry in 4189 * the table for this instance, otherwise we assign the next 4190 * available table slot. 4191 */ 4192 for (id = 0; id < sfmmu_max_cb_id; id++) { 4193 if (sfmmu_cb_table[id].key == key) 4194 break; 4195 } 4196 4197 if (id == sfmmu_max_cb_id) { 4198 id = sfmmu_cb_nextid++; 4199 if (id >= sfmmu_max_cb_id) 4200 panic("hat_register_callback: out of callback IDs"); 4201 } 4202 4203 ASSERT(prehandler != NULL || posthandler != NULL); 4204 4205 sfmmu_cb_table[id].key = key; 4206 sfmmu_cb_table[id].prehandler = prehandler; 4207 sfmmu_cb_table[id].posthandler = posthandler; 4208 sfmmu_cb_table[id].errhandler = errhandler; 4209 sfmmu_cb_table[id].capture_cpus = capture_cpus; 4210 4211 return (id); 4212 } 4213 4214 #define HAC_COOKIE_NONE (void *)-1 4215 4216 /* 4217 * Add relocation callbacks to the specified addr/len which will be called 4218 * when relocating the associated page. See the description of pre and 4219 * posthandler above for more details. 4220 * 4221 * If HAC_PAGELOCK is included in flags, the underlying memory page is 4222 * locked internally so the caller must be able to deal with the callback 4223 * running even before this function has returned. If HAC_PAGELOCK is not 4224 * set, it is assumed that the underlying memory pages are locked. 4225 * 4226 * Since the caller must track the individual page boundaries anyway, 4227 * we only allow a callback to be added to a single page (large 4228 * or small). Thus [addr, addr + len) MUST be contained within a single 4229 * page. 4230 * 4231 * Registering multiple callbacks on the same [addr, addr+len) is supported, 4232 * _provided_that_ a unique parameter is specified for each callback. 4233 * If multiple callbacks are registered on the same range the callback will 4234 * be invoked with each unique parameter. Registering the same callback with 4235 * the same argument more than once will result in corrupted kernel state. 4236 * 4237 * Returns the pfn of the underlying kernel page in *rpfn 4238 * on success, or PFN_INVALID on failure. 4239 * 4240 * cookiep (if passed) provides storage space for an opaque cookie 4241 * to return later to hat_delete_callback(). This cookie makes the callback 4242 * deletion significantly quicker by avoiding a potentially lengthy hash 4243 * search. 
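 *
 * As a hedged illustration of the sequence described above (not an
 * authoritative recipe; MY_KEY, my_prehandler, my_posthandler,
 * my_errhandler and my_pvt are placeholder names):
 *
 *	id_t id = hat_register_callback(MY_KEY, my_prehandler,
 *	    my_posthandler, my_errhandler, 0);
 *	...
 *	pfn_t pfn;
 *	void *cookie;
 *	if (hat_add_callback(id, vaddr, len, HAC_SLEEP | HAC_PAGELOCK,
 *	    my_pvt, &pfn, &cookie) == 0) {
 *		... save pfn and access the page by physical address ...
 *	}
 *	...
 *	stop using the physical address, then:
 *	hat_delete_callback(vaddr, len, my_pvt, HAC_PAGELOCK, cookie);
 *
 * Passing the returned cookie back to hat_delete_callback() is what
 * avoids the hash search mentioned above.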
4244 * 4245 * Returns values: 4246 * 0: success 4247 * ENOMEM: memory allocation failure (e.g. flags was passed as HAC_NOSLEEP) 4248 * EINVAL: callback ID is not valid 4249 * ENXIO: ["vaddr", "vaddr" + len) is not mapped in the kernel's address 4250 * space 4251 * ERANGE: ["vaddr", "vaddr" + len) crosses a page boundary 4252 */ 4253 int 4254 hat_add_callback(id_t callback_id, caddr_t vaddr, uint_t len, uint_t flags, 4255 void *pvt, pfn_t *rpfn, void **cookiep) 4256 { 4257 struct hmehash_bucket *hmebp; 4258 hmeblk_tag hblktag; 4259 struct hme_blk *hmeblkp; 4260 int hmeshift, hashno; 4261 caddr_t saddr, eaddr, baseaddr; 4262 struct pa_hment *pahmep; 4263 struct sf_hment *sfhmep, *osfhmep; 4264 kmutex_t *pml; 4265 tte_t tte; 4266 page_t *pp; 4267 vnode_t *vp; 4268 u_offset_t off; 4269 pfn_t pfn; 4270 int kmflags = (flags & HAC_SLEEP)? KM_SLEEP : KM_NOSLEEP; 4271 int locked = 0; 4272 4273 /* 4274 * For KPM mappings, just return the physical address since we 4275 * don't need to register any callbacks. 4276 */ 4277 if (IS_KPM_ADDR(vaddr)) { 4278 uint64_t paddr; 4279 SFMMU_KPM_VTOP(vaddr, paddr); 4280 *rpfn = btop(paddr); 4281 if (cookiep != NULL) 4282 *cookiep = HAC_COOKIE_NONE; 4283 return (0); 4284 } 4285 4286 if (callback_id < (id_t)0 || callback_id >= sfmmu_cb_nextid) { 4287 *rpfn = PFN_INVALID; 4288 return (EINVAL); 4289 } 4290 4291 if ((pahmep = kmem_cache_alloc(pa_hment_cache, kmflags)) == NULL) { 4292 *rpfn = PFN_INVALID; 4293 return (ENOMEM); 4294 } 4295 4296 sfhmep = &pahmep->sfment; 4297 4298 saddr = (caddr_t)((uintptr_t)vaddr & MMU_PAGEMASK); 4299 eaddr = saddr + len; 4300 4301 rehash: 4302 /* Find the mapping(s) for this page */ 4303 for (hashno = TTE64K, hmeblkp = NULL; 4304 hmeblkp == NULL && hashno <= mmu_hashcnt; 4305 hashno++) { 4306 hmeshift = HME_HASH_SHIFT(hashno); 4307 hblktag.htag_id = ksfmmup; 4308 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 4309 hblktag.htag_bspage = HME_HASH_BSPAGE(saddr, hmeshift); 4310 hblktag.htag_rehash = hashno; 4311 hmebp = HME_HASH_FUNCTION(ksfmmup, saddr, hmeshift); 4312 4313 SFMMU_HASH_LOCK(hmebp); 4314 4315 HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp); 4316 4317 if (hmeblkp == NULL) 4318 SFMMU_HASH_UNLOCK(hmebp); 4319 } 4320 4321 if (hmeblkp == NULL) { 4322 kmem_cache_free(pa_hment_cache, pahmep); 4323 *rpfn = PFN_INVALID; 4324 return (ENXIO); 4325 } 4326 4327 ASSERT(!hmeblkp->hblk_shared); 4328 4329 HBLKTOHME(osfhmep, hmeblkp, saddr); 4330 sfmmu_copytte(&osfhmep->hme_tte, &tte); 4331 4332 if (!TTE_IS_VALID(&tte)) { 4333 SFMMU_HASH_UNLOCK(hmebp); 4334 kmem_cache_free(pa_hment_cache, pahmep); 4335 *rpfn = PFN_INVALID; 4336 return (ENXIO); 4337 } 4338 4339 /* 4340 * Make sure the boundaries for the callback fall within this 4341 * single mapping. 4342 */ 4343 baseaddr = (caddr_t)get_hblk_base(hmeblkp); 4344 ASSERT(saddr >= baseaddr); 4345 if (eaddr > saddr + TTEBYTES(TTE_CSZ(&tte))) { 4346 SFMMU_HASH_UNLOCK(hmebp); 4347 kmem_cache_free(pa_hment_cache, pahmep); 4348 *rpfn = PFN_INVALID; 4349 return (ERANGE); 4350 } 4351 4352 pfn = sfmmu_ttetopfn(&tte, vaddr); 4353 4354 /* 4355 * The pfn may not have a page_t underneath in which case we 4356 * just return it. This can happen if we are doing I/O to a 4357 * static portion of the kernel's address space, for instance. 
4358 */ 4359 pp = osfhmep->hme_page; 4360 if (pp == NULL) { 4361 SFMMU_HASH_UNLOCK(hmebp); 4362 kmem_cache_free(pa_hment_cache, pahmep); 4363 *rpfn = pfn; 4364 if (cookiep) 4365 *cookiep = HAC_COOKIE_NONE; 4366 return (0); 4367 } 4368 ASSERT(pp == PP_PAGEROOT(pp)); 4369 4370 vp = pp->p_vnode; 4371 off = pp->p_offset; 4372 4373 pml = sfmmu_mlist_enter(pp); 4374 4375 if (flags & HAC_PAGELOCK) { 4376 if (!page_trylock(pp, SE_SHARED)) { 4377 /* 4378 * Somebody is holding SE_EXCL lock. Might 4379 * even be hat_page_relocate(). Drop all 4380 * our locks, lookup the page in &kvp, and 4381 * retry. If it doesn't exist in &kvp and &zvp, 4382 * then we must be dealing with a kernel mapped 4383 * page which doesn't actually belong to 4384 * segkmem so we punt. 4385 */ 4386 sfmmu_mlist_exit(pml); 4387 SFMMU_HASH_UNLOCK(hmebp); 4388 pp = page_lookup(&kvp, (u_offset_t)saddr, SE_SHARED); 4389 4390 /* check zvp before giving up */ 4391 if (pp == NULL) 4392 pp = page_lookup(&zvp, (u_offset_t)saddr, 4393 SE_SHARED); 4394 4395 /* Okay, we didn't find it, give up */ 4396 if (pp == NULL) { 4397 kmem_cache_free(pa_hment_cache, pahmep); 4398 *rpfn = pfn; 4399 if (cookiep) 4400 *cookiep = HAC_COOKIE_NONE; 4401 return (0); 4402 } 4403 page_unlock(pp); 4404 goto rehash; 4405 } 4406 locked = 1; 4407 } 4408 4409 if (!PAGE_LOCKED(pp) && !panicstr) 4410 panic("hat_add_callback: page 0x%p not locked", (void *)pp); 4411 4412 if (osfhmep->hme_page != pp || pp->p_vnode != vp || 4413 pp->p_offset != off) { 4414 /* 4415 * The page moved before we got our hands on it. Drop 4416 * all the locks and try again. 4417 */ 4418 ASSERT((flags & HAC_PAGELOCK) != 0); 4419 sfmmu_mlist_exit(pml); 4420 SFMMU_HASH_UNLOCK(hmebp); 4421 page_unlock(pp); 4422 locked = 0; 4423 goto rehash; 4424 } 4425 4426 if (!VN_ISKAS(vp)) { 4427 /* 4428 * This is not a segkmem page but another page which 4429 * has been kernel mapped. It had better have at least 4430 * a share lock on it. Return the pfn. 4431 */ 4432 sfmmu_mlist_exit(pml); 4433 SFMMU_HASH_UNLOCK(hmebp); 4434 if (locked) 4435 page_unlock(pp); 4436 kmem_cache_free(pa_hment_cache, pahmep); 4437 ASSERT(PAGE_LOCKED(pp)); 4438 *rpfn = pfn; 4439 if (cookiep) 4440 *cookiep = HAC_COOKIE_NONE; 4441 return (0); 4442 } 4443 4444 /* 4445 * Setup this pa_hment and link its embedded dummy sf_hment into 4446 * the mapping list. 4447 */ 4448 pp->p_share++; 4449 pahmep->cb_id = callback_id; 4450 pahmep->addr = vaddr; 4451 pahmep->len = len; 4452 pahmep->refcnt = 1; 4453 pahmep->flags = 0; 4454 pahmep->pvt = pvt; 4455 4456 sfhmep->hme_tte.ll = 0; 4457 sfhmep->hme_data = pahmep; 4458 sfhmep->hme_prev = osfhmep; 4459 sfhmep->hme_next = osfhmep->hme_next; 4460 4461 if (osfhmep->hme_next) 4462 osfhmep->hme_next->hme_prev = sfhmep; 4463 4464 osfhmep->hme_next = sfhmep; 4465 4466 sfmmu_mlist_exit(pml); 4467 SFMMU_HASH_UNLOCK(hmebp); 4468 4469 if (locked) 4470 page_unlock(pp); 4471 4472 *rpfn = pfn; 4473 if (cookiep) 4474 *cookiep = (void *)pahmep; 4475 4476 return (0); 4477 } 4478 4479 /* 4480 * Remove the relocation callbacks from the specified addr/len. 
4481 */ 4482 void 4483 hat_delete_callback(caddr_t vaddr, uint_t len, void *pvt, uint_t flags, 4484 void *cookie) 4485 { 4486 struct hmehash_bucket *hmebp; 4487 hmeblk_tag hblktag; 4488 struct hme_blk *hmeblkp; 4489 int hmeshift, hashno; 4490 caddr_t saddr; 4491 struct pa_hment *pahmep; 4492 struct sf_hment *sfhmep, *osfhmep; 4493 kmutex_t *pml; 4494 tte_t tte; 4495 page_t *pp; 4496 vnode_t *vp; 4497 u_offset_t off; 4498 int locked = 0; 4499 4500 /* 4501 * If the cookie is HAC_COOKIE_NONE then there is no pa_hment to 4502 * remove so just return. 4503 */ 4504 if (cookie == HAC_COOKIE_NONE || IS_KPM_ADDR(vaddr)) 4505 return; 4506 4507 saddr = (caddr_t)((uintptr_t)vaddr & MMU_PAGEMASK); 4508 4509 rehash: 4510 /* Find the mapping(s) for this page */ 4511 for (hashno = TTE64K, hmeblkp = NULL; 4512 hmeblkp == NULL && hashno <= mmu_hashcnt; 4513 hashno++) { 4514 hmeshift = HME_HASH_SHIFT(hashno); 4515 hblktag.htag_id = ksfmmup; 4516 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 4517 hblktag.htag_bspage = HME_HASH_BSPAGE(saddr, hmeshift); 4518 hblktag.htag_rehash = hashno; 4519 hmebp = HME_HASH_FUNCTION(ksfmmup, saddr, hmeshift); 4520 4521 SFMMU_HASH_LOCK(hmebp); 4522 4523 HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp); 4524 4525 if (hmeblkp == NULL) 4526 SFMMU_HASH_UNLOCK(hmebp); 4527 } 4528 4529 if (hmeblkp == NULL) 4530 return; 4531 4532 ASSERT(!hmeblkp->hblk_shared); 4533 4534 HBLKTOHME(osfhmep, hmeblkp, saddr); 4535 4536 sfmmu_copytte(&osfhmep->hme_tte, &tte); 4537 if (!TTE_IS_VALID(&tte)) { 4538 SFMMU_HASH_UNLOCK(hmebp); 4539 return; 4540 } 4541 4542 pp = osfhmep->hme_page; 4543 if (pp == NULL) { 4544 SFMMU_HASH_UNLOCK(hmebp); 4545 ASSERT(cookie == NULL); 4546 return; 4547 } 4548 4549 vp = pp->p_vnode; 4550 off = pp->p_offset; 4551 4552 pml = sfmmu_mlist_enter(pp); 4553 4554 if (flags & HAC_PAGELOCK) { 4555 if (!page_trylock(pp, SE_SHARED)) { 4556 /* 4557 * Somebody is holding SE_EXCL lock. Might 4558 * even be hat_page_relocate(). Drop all 4559 * our locks, lookup the page in &kvp, and 4560 * retry. If it doesn't exist in &kvp and &zvp, 4561 * then we must be dealing with a kernel mapped 4562 * page which doesn't actually belong to 4563 * segkmem so we punt. 4564 */ 4565 sfmmu_mlist_exit(pml); 4566 SFMMU_HASH_UNLOCK(hmebp); 4567 pp = page_lookup(&kvp, (u_offset_t)saddr, SE_SHARED); 4568 /* check zvp before giving up */ 4569 if (pp == NULL) 4570 pp = page_lookup(&zvp, (u_offset_t)saddr, 4571 SE_SHARED); 4572 4573 if (pp == NULL) { 4574 ASSERT(cookie == NULL); 4575 return; 4576 } 4577 page_unlock(pp); 4578 goto rehash; 4579 } 4580 locked = 1; 4581 } 4582 4583 ASSERT(PAGE_LOCKED(pp)); 4584 4585 if (osfhmep->hme_page != pp || pp->p_vnode != vp || 4586 pp->p_offset != off) { 4587 /* 4588 * The page moved before we got our hands on it. Drop 4589 * all the locks and try again. 4590 */ 4591 ASSERT((flags & HAC_PAGELOCK) != 0); 4592 sfmmu_mlist_exit(pml); 4593 SFMMU_HASH_UNLOCK(hmebp); 4594 page_unlock(pp); 4595 locked = 0; 4596 goto rehash; 4597 } 4598 4599 if (!VN_ISKAS(vp)) { 4600 /* 4601 * This is not a segkmem page but another page which 4602 * has been kernel mapped. 
4603 */ 4604 sfmmu_mlist_exit(pml); 4605 SFMMU_HASH_UNLOCK(hmebp); 4606 if (locked) 4607 page_unlock(pp); 4608 ASSERT(cookie == NULL); 4609 return; 4610 } 4611 4612 if (cookie != NULL) { 4613 pahmep = (struct pa_hment *)cookie; 4614 sfhmep = &pahmep->sfment; 4615 } else { 4616 for (sfhmep = pp->p_mapping; sfhmep != NULL; 4617 sfhmep = sfhmep->hme_next) { 4618 4619 /* 4620 * skip va<->pa mappings 4621 */ 4622 if (!IS_PAHME(sfhmep)) 4623 continue; 4624 4625 pahmep = sfhmep->hme_data; 4626 ASSERT(pahmep != NULL); 4627 4628 /* 4629 * if pa_hment matches, remove it 4630 */ 4631 if ((pahmep->pvt == pvt) && 4632 (pahmep->addr == vaddr) && 4633 (pahmep->len == len)) { 4634 break; 4635 } 4636 } 4637 } 4638 4639 if (sfhmep == NULL) { 4640 if (!panicstr) { 4641 panic("hat_delete_callback: pa_hment not found, pp %p", 4642 (void *)pp); 4643 } 4644 return; 4645 } 4646 4647 /* 4648 * Note: at this point a valid kernel mapping must still be 4649 * present on this page. 4650 */ 4651 pp->p_share--; 4652 if (pp->p_share <= 0) 4653 panic("hat_delete_callback: zero p_share"); 4654 4655 if (--pahmep->refcnt == 0) { 4656 if (pahmep->flags != 0) 4657 panic("hat_delete_callback: pa_hment is busy"); 4658 4659 /* 4660 * Remove sfhmep from the mapping list for the page. 4661 */ 4662 if (sfhmep->hme_prev) { 4663 sfhmep->hme_prev->hme_next = sfhmep->hme_next; 4664 } else { 4665 pp->p_mapping = sfhmep->hme_next; 4666 } 4667 4668 if (sfhmep->hme_next) 4669 sfhmep->hme_next->hme_prev = sfhmep->hme_prev; 4670 4671 sfmmu_mlist_exit(pml); 4672 SFMMU_HASH_UNLOCK(hmebp); 4673 4674 if (locked) 4675 page_unlock(pp); 4676 4677 kmem_cache_free(pa_hment_cache, pahmep); 4678 return; 4679 } 4680 4681 sfmmu_mlist_exit(pml); 4682 SFMMU_HASH_UNLOCK(hmebp); 4683 if (locked) 4684 page_unlock(pp); 4685 } 4686 4687 /* 4688 * hat_probe returns 1 if the translation for the address 'addr' is 4689 * loaded, zero otherwise. 4690 * 4691 * hat_probe should be used only for advisorary purposes because it may 4692 * occasionally return the wrong value. The implementation must guarantee that 4693 * returning the wrong value is a very rare event. hat_probe is used 4694 * to implement optimizations in the segment drivers. 
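 *
 * A minimal sketch of such an advisory use (the fast/slow path split
 * shown here is hypothetical and not taken from any particular
 * segment driver):
 *
 *	if (hat_probe(hat, addr)) {
 *		... translation is very likely loaded; try the
 *		    lightweight path first ...
 *	} else {
 *		... fall back to a path that can tolerate a miss ...
 *	}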
4695 * 4696 */ 4697 int 4698 hat_probe(struct hat *sfmmup, caddr_t addr) 4699 { 4700 pfn_t pfn; 4701 tte_t tte; 4702 4703 ASSERT(sfmmup != NULL); 4704 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 4705 4706 ASSERT((sfmmup == ksfmmup) || 4707 AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 4708 4709 if (sfmmup == ksfmmup) { 4710 while ((pfn = sfmmu_vatopfn(addr, sfmmup, &tte)) 4711 == PFN_SUSPENDED) { 4712 sfmmu_vatopfn_suspended(addr, sfmmup, &tte); 4713 } 4714 } else { 4715 pfn = sfmmu_uvatopfn(addr, sfmmup, NULL); 4716 } 4717 4718 if (pfn != PFN_INVALID) 4719 return (1); 4720 else 4721 return (0); 4722 } 4723 4724 ssize_t 4725 hat_getpagesize(struct hat *sfmmup, caddr_t addr) 4726 { 4727 tte_t tte; 4728 4729 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 4730 4731 if (sfmmup == ksfmmup) { 4732 if (sfmmu_vatopfn(addr, sfmmup, &tte) == PFN_INVALID) { 4733 return (-1); 4734 } 4735 } else { 4736 if (sfmmu_uvatopfn(addr, sfmmup, &tte) == PFN_INVALID) { 4737 return (-1); 4738 } 4739 } 4740 4741 ASSERT(TTE_IS_VALID(&tte)); 4742 return (TTEBYTES(TTE_CSZ(&tte))); 4743 } 4744 4745 uint_t 4746 hat_getattr(struct hat *sfmmup, caddr_t addr, uint_t *attr) 4747 { 4748 tte_t tte; 4749 4750 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 4751 4752 if (sfmmup == ksfmmup) { 4753 if (sfmmu_vatopfn(addr, sfmmup, &tte) == PFN_INVALID) { 4754 tte.ll = 0; 4755 } 4756 } else { 4757 if (sfmmu_uvatopfn(addr, sfmmup, &tte) == PFN_INVALID) { 4758 tte.ll = 0; 4759 } 4760 } 4761 if (TTE_IS_VALID(&tte)) { 4762 *attr = sfmmu_ptov_attr(&tte); 4763 return (0); 4764 } 4765 *attr = 0; 4766 return ((uint_t)0xffffffff); 4767 } 4768 4769 /* 4770 * Enables more attributes on specified address range (ie. logical OR) 4771 */ 4772 void 4773 hat_setattr(struct hat *hat, caddr_t addr, size_t len, uint_t attr) 4774 { 4775 if (hat->sfmmu_xhat_provider) { 4776 XHAT_SETATTR(hat, addr, len, attr); 4777 return; 4778 } else { 4779 /* 4780 * This must be a CPU HAT. If the address space has 4781 * XHATs attached, change attributes for all of them, 4782 * just in case 4783 */ 4784 ASSERT(hat->sfmmu_as != NULL); 4785 if (hat->sfmmu_as->a_xhat != NULL) 4786 xhat_setattr_all(hat->sfmmu_as, addr, len, attr); 4787 } 4788 4789 sfmmu_chgattr(hat, addr, len, attr, SFMMU_SETATTR); 4790 } 4791 4792 /* 4793 * Assigns attributes to the specified address range. All the attributes 4794 * are specified. 4795 */ 4796 void 4797 hat_chgattr(struct hat *hat, caddr_t addr, size_t len, uint_t attr) 4798 { 4799 if (hat->sfmmu_xhat_provider) { 4800 XHAT_CHGATTR(hat, addr, len, attr); 4801 return; 4802 } else { 4803 /* 4804 * This must be a CPU HAT. If the address space has 4805 * XHATs attached, change attributes for all of them, 4806 * just in case 4807 */ 4808 ASSERT(hat->sfmmu_as != NULL); 4809 if (hat->sfmmu_as->a_xhat != NULL) 4810 xhat_chgattr_all(hat->sfmmu_as, addr, len, attr); 4811 } 4812 4813 sfmmu_chgattr(hat, addr, len, attr, SFMMU_CHGATTR); 4814 } 4815 4816 /* 4817 * Remove attributes on the specified address range (ie. loginal NAND) 4818 */ 4819 void 4820 hat_clrattr(struct hat *hat, caddr_t addr, size_t len, uint_t attr) 4821 { 4822 if (hat->sfmmu_xhat_provider) { 4823 XHAT_CLRATTR(hat, addr, len, attr); 4824 return; 4825 } else { 4826 /* 4827 * This must be a CPU HAT. 
If the address space has 4828 * XHATs attached, change attributes for all of them, 4829 * just in case 4830 */ 4831 ASSERT(hat->sfmmu_as != NULL); 4832 if (hat->sfmmu_as->a_xhat != NULL) 4833 xhat_clrattr_all(hat->sfmmu_as, addr, len, attr); 4834 } 4835 4836 sfmmu_chgattr(hat, addr, len, attr, SFMMU_CLRATTR); 4837 } 4838 4839 /* 4840 * Change attributes on an address range to that specified by attr and mode. 4841 */ 4842 static void 4843 sfmmu_chgattr(struct hat *sfmmup, caddr_t addr, size_t len, uint_t attr, 4844 int mode) 4845 { 4846 struct hmehash_bucket *hmebp; 4847 hmeblk_tag hblktag; 4848 int hmeshift, hashno = 1; 4849 struct hme_blk *hmeblkp, *list = NULL; 4850 caddr_t endaddr; 4851 cpuset_t cpuset; 4852 demap_range_t dmr; 4853 4854 CPUSET_ZERO(cpuset); 4855 4856 ASSERT((sfmmup == ksfmmup) || 4857 AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 4858 ASSERT((len & MMU_PAGEOFFSET) == 0); 4859 ASSERT(((uintptr_t)addr & MMU_PAGEOFFSET) == 0); 4860 4861 if ((attr & PROT_USER) && (mode != SFMMU_CLRATTR) && 4862 ((addr + len) > (caddr_t)USERLIMIT)) { 4863 panic("user addr %p in kernel space", 4864 (void *)addr); 4865 } 4866 4867 endaddr = addr + len; 4868 hblktag.htag_id = sfmmup; 4869 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 4870 DEMAP_RANGE_INIT(sfmmup, &dmr); 4871 4872 while (addr < endaddr) { 4873 hmeshift = HME_HASH_SHIFT(hashno); 4874 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 4875 hblktag.htag_rehash = hashno; 4876 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift); 4877 4878 SFMMU_HASH_LOCK(hmebp); 4879 4880 HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list); 4881 if (hmeblkp != NULL) { 4882 ASSERT(!hmeblkp->hblk_shared); 4883 /* 4884 * We've encountered a shadow hmeblk so skip the range 4885 * of the next smaller mapping size. 4886 */ 4887 if (hmeblkp->hblk_shw_bit) { 4888 ASSERT(sfmmup != ksfmmup); 4889 ASSERT(hashno > 1); 4890 addr = (caddr_t)P2END((uintptr_t)addr, 4891 TTEBYTES(hashno - 1)); 4892 } else { 4893 addr = sfmmu_hblk_chgattr(sfmmup, 4894 hmeblkp, addr, endaddr, &dmr, attr, mode); 4895 } 4896 SFMMU_HASH_UNLOCK(hmebp); 4897 hashno = 1; 4898 continue; 4899 } 4900 SFMMU_HASH_UNLOCK(hmebp); 4901 4902 if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) { 4903 /* 4904 * We have traversed the whole list and rehashed 4905 * if necessary without finding the address to chgattr. 4906 * This is ok, so we increment the address by the 4907 * smallest hmeblk range for kernel mappings or for 4908 * user mappings with no large pages, and the largest 4909 * hmeblk range, to account for shadow hmeblks, for 4910 * user mappings with large pages and continue. 4911 */ 4912 if (sfmmup == ksfmmup) 4913 addr = (caddr_t)P2END((uintptr_t)addr, 4914 TTEBYTES(1)); 4915 else 4916 addr = (caddr_t)P2END((uintptr_t)addr, 4917 TTEBYTES(hashno)); 4918 hashno = 1; 4919 } else { 4920 hashno++; 4921 } 4922 } 4923 4924 sfmmu_hblks_list_purge(&list); 4925 DEMAP_RANGE_FLUSH(&dmr); 4926 cpuset = sfmmup->sfmmu_cpusran; 4927 xt_sync(cpuset); 4928 } 4929 4930 /* 4931 * This function chgattr on a range of addresses in an hmeblk. It returns the 4932 * next addres that needs to be chgattr. 4933 * It should be called with the hash lock held. 4934 * XXX It should be possible to optimize chgattr by not flushing every time but 4935 * on the other hand: 4936 * 1. do one flush crosscall. 4937 * 2. 
only flush if we are increasing permissions (make sure this will work) 4938 */ 4939 static caddr_t 4940 sfmmu_hblk_chgattr(struct hat *sfmmup, struct hme_blk *hmeblkp, caddr_t addr, 4941 caddr_t endaddr, demap_range_t *dmrp, uint_t attr, int mode) 4942 { 4943 tte_t tte, tteattr, tteflags, ttemod; 4944 struct sf_hment *sfhmep; 4945 int ttesz; 4946 struct page *pp = NULL; 4947 kmutex_t *pml, *pmtx; 4948 int ret; 4949 int use_demap_range; 4950 #if defined(SF_ERRATA_57) 4951 int check_exec; 4952 #endif 4953 4954 ASSERT(in_hblk_range(hmeblkp, addr)); 4955 ASSERT(hmeblkp->hblk_shw_bit == 0); 4956 ASSERT(!hmeblkp->hblk_shared); 4957 4958 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp)); 4959 ttesz = get_hblk_ttesz(hmeblkp); 4960 4961 /* 4962 * Flush the current demap region if addresses have been 4963 * skipped or the page size doesn't match. 4964 */ 4965 use_demap_range = (TTEBYTES(ttesz) == DEMAP_RANGE_PGSZ(dmrp)); 4966 if (use_demap_range) { 4967 DEMAP_RANGE_CONTINUE(dmrp, addr, endaddr); 4968 } else { 4969 DEMAP_RANGE_FLUSH(dmrp); 4970 } 4971 4972 tteattr.ll = sfmmu_vtop_attr(attr, mode, &tteflags); 4973 #if defined(SF_ERRATA_57) 4974 check_exec = (sfmmup != ksfmmup) && 4975 AS_TYPE_64BIT(sfmmup->sfmmu_as) && 4976 TTE_IS_EXECUTABLE(&tteattr); 4977 #endif 4978 HBLKTOHME(sfhmep, hmeblkp, addr); 4979 while (addr < endaddr) { 4980 sfmmu_copytte(&sfhmep->hme_tte, &tte); 4981 if (TTE_IS_VALID(&tte)) { 4982 if ((tte.ll & tteflags.ll) == tteattr.ll) { 4983 /* 4984 * if the new attr is the same as old 4985 * continue 4986 */ 4987 goto next_addr; 4988 } 4989 if (!TTE_IS_WRITABLE(&tteattr)) { 4990 /* 4991 * make sure we clear hw modify bit if we 4992 * removing write protections 4993 */ 4994 tteflags.tte_intlo |= TTE_HWWR_INT; 4995 } 4996 4997 pml = NULL; 4998 pp = sfhmep->hme_page; 4999 if (pp) { 5000 pml = sfmmu_mlist_enter(pp); 5001 } 5002 5003 if (pp != sfhmep->hme_page) { 5004 /* 5005 * tte must have been unloaded. 5006 */ 5007 ASSERT(pml); 5008 sfmmu_mlist_exit(pml); 5009 continue; 5010 } 5011 5012 ASSERT(pp == NULL || sfmmu_mlist_held(pp)); 5013 5014 ttemod = tte; 5015 ttemod.ll = (ttemod.ll & ~tteflags.ll) | tteattr.ll; 5016 ASSERT(TTE_TO_TTEPFN(&ttemod) == TTE_TO_TTEPFN(&tte)); 5017 5018 #if defined(SF_ERRATA_57) 5019 if (check_exec && addr < errata57_limit) 5020 ttemod.tte_exec_perm = 0; 5021 #endif 5022 ret = sfmmu_modifytte_try(&tte, &ttemod, 5023 &sfhmep->hme_tte); 5024 5025 if (ret < 0) { 5026 /* tte changed underneath us */ 5027 if (pml) { 5028 sfmmu_mlist_exit(pml); 5029 } 5030 continue; 5031 } 5032 5033 if ((tteflags.tte_intlo & TTE_HWWR_INT) || 5034 (TTE_EXECUTED(&tte) && 5035 !TTE_IS_EXECUTABLE(&ttemod))) { 5036 /* 5037 * need to sync if clearing modify/exec bit. 5038 */ 5039 sfmmu_ttesync(sfmmup, addr, &tte, pp); 5040 } 5041 5042 if (pp && PP_ISRO(pp)) { 5043 if (tteattr.tte_intlo & TTE_WRPRM_INT) { 5044 pmtx = sfmmu_page_enter(pp); 5045 PP_CLRRO(pp); 5046 sfmmu_page_exit(pmtx); 5047 } 5048 } 5049 5050 if (ret > 0 && use_demap_range) { 5051 DEMAP_RANGE_MARKPG(dmrp, addr); 5052 } else if (ret > 0) { 5053 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0); 5054 } 5055 5056 if (pml) { 5057 sfmmu_mlist_exit(pml); 5058 } 5059 } 5060 next_addr: 5061 addr += TTEBYTES(ttesz); 5062 sfhmep++; 5063 DEMAP_RANGE_NEXTPG(dmrp); 5064 } 5065 return (addr); 5066 } 5067 5068 /* 5069 * This routine converts virtual attributes to physical ones. It will 5070 * update the tteflags field with the tte mask corresponding to the attributes 5071 * affected and it returns the new attributes. 
It will also clear the modify 5072 * bit if we are taking away write permission. This is necessary since the 5073 * modify bit is the hardware permission bit and we need to clear it in order 5074 * to detect write faults. 5075 */ 5076 static uint64_t 5077 sfmmu_vtop_attr(uint_t attr, int mode, tte_t *ttemaskp) 5078 { 5079 tte_t ttevalue; 5080 5081 ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR)); 5082 5083 switch (mode) { 5084 case SFMMU_CHGATTR: 5085 /* all attributes specified */ 5086 ttevalue.tte_inthi = MAKE_TTEATTR_INTHI(attr); 5087 ttevalue.tte_intlo = MAKE_TTEATTR_INTLO(attr); 5088 ttemaskp->tte_inthi = TTEINTHI_ATTR; 5089 ttemaskp->tte_intlo = TTEINTLO_ATTR; 5090 if (!icache_is_coherent) { 5091 if (!(attr & PROT_EXEC)) { 5092 TTE_SET_SOFTEXEC(ttemaskp); 5093 } else { 5094 TTE_CLR_EXEC(ttemaskp); 5095 TTE_SET_SOFTEXEC(&ttevalue); 5096 } 5097 } 5098 break; 5099 case SFMMU_SETATTR: 5100 ASSERT(!(attr & ~HAT_PROT_MASK)); 5101 ttemaskp->ll = 0; 5102 ttevalue.ll = 0; 5103 /* 5104 * a valid tte implies exec and read for sfmmu 5105 * so no need to do anything about them. 5106 * since priviledged access implies user access 5107 * PROT_USER doesn't make sense either. 5108 */ 5109 if (attr & PROT_WRITE) { 5110 ttemaskp->tte_intlo |= TTE_WRPRM_INT; 5111 ttevalue.tte_intlo |= TTE_WRPRM_INT; 5112 } 5113 break; 5114 case SFMMU_CLRATTR: 5115 /* attributes will be nand with current ones */ 5116 if (attr & ~(PROT_WRITE | PROT_USER)) { 5117 panic("sfmmu: attr %x not supported", attr); 5118 } 5119 ttemaskp->ll = 0; 5120 ttevalue.ll = 0; 5121 if (attr & PROT_WRITE) { 5122 /* clear both writable and modify bit */ 5123 ttemaskp->tte_intlo |= TTE_WRPRM_INT | TTE_HWWR_INT; 5124 } 5125 if (attr & PROT_USER) { 5126 ttemaskp->tte_intlo |= TTE_PRIV_INT; 5127 ttevalue.tte_intlo |= TTE_PRIV_INT; 5128 } 5129 break; 5130 default: 5131 panic("sfmmu_vtop_attr: bad mode %x", mode); 5132 } 5133 ASSERT(TTE_TO_TTEPFN(&ttevalue) == 0); 5134 return (ttevalue.ll); 5135 } 5136 5137 static uint_t 5138 sfmmu_ptov_attr(tte_t *ttep) 5139 { 5140 uint_t attr; 5141 5142 ASSERT(TTE_IS_VALID(ttep)); 5143 5144 attr = PROT_READ; 5145 5146 if (TTE_IS_WRITABLE(ttep)) { 5147 attr |= PROT_WRITE; 5148 } 5149 if (TTE_IS_EXECUTABLE(ttep)) { 5150 attr |= PROT_EXEC; 5151 } 5152 if (TTE_IS_SOFTEXEC(ttep)) { 5153 attr |= PROT_EXEC; 5154 } 5155 if (!TTE_IS_PRIVILEGED(ttep)) { 5156 attr |= PROT_USER; 5157 } 5158 if (TTE_IS_NFO(ttep)) { 5159 attr |= HAT_NOFAULT; 5160 } 5161 if (TTE_IS_NOSYNC(ttep)) { 5162 attr |= HAT_NOSYNC; 5163 } 5164 if (TTE_IS_SIDEFFECT(ttep)) { 5165 attr |= SFMMU_SIDEFFECT; 5166 } 5167 if (!TTE_IS_VCACHEABLE(ttep)) { 5168 attr |= SFMMU_UNCACHEVTTE; 5169 } 5170 if (!TTE_IS_PCACHEABLE(ttep)) { 5171 attr |= SFMMU_UNCACHEPTTE; 5172 } 5173 return (attr); 5174 } 5175 5176 /* 5177 * hat_chgprot is a deprecated hat call. New segment drivers 5178 * should store all attributes and use hat_*attr calls. 5179 * 5180 * Change the protections in the virtual address range 5181 * given to the specified virtual protection. If vprot is ~PROT_WRITE, 5182 * then remove write permission, leaving the other 5183 * permissions unchanged. If vprot is ~PROT_USER, remove user permissions. 
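 *
 * For example (illustrative only, since new code should prefer the
 * hat_*attr interfaces; hat, addr and len stand for the caller's HAT
 * handle and page-aligned range), write permission could be revoked
 * with:
 *
 *	hat_chgprot(hat, addr, len, (uint_t)~PROT_WRITE);
 *
 * leaving the remaining permissions untouched.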
5184 * 5185 */ 5186 void 5187 hat_chgprot(struct hat *sfmmup, caddr_t addr, size_t len, uint_t vprot) 5188 { 5189 struct hmehash_bucket *hmebp; 5190 hmeblk_tag hblktag; 5191 int hmeshift, hashno = 1; 5192 struct hme_blk *hmeblkp, *list = NULL; 5193 caddr_t endaddr; 5194 cpuset_t cpuset; 5195 demap_range_t dmr; 5196 5197 ASSERT((len & MMU_PAGEOFFSET) == 0); 5198 ASSERT(((uintptr_t)addr & MMU_PAGEOFFSET) == 0); 5199 5200 if (sfmmup->sfmmu_xhat_provider) { 5201 XHAT_CHGPROT(sfmmup, addr, len, vprot); 5202 return; 5203 } else { 5204 /* 5205 * This must be a CPU HAT. If the address space has 5206 * XHATs attached, change attributes for all of them, 5207 * just in case 5208 */ 5209 ASSERT(sfmmup->sfmmu_as != NULL); 5210 if (sfmmup->sfmmu_as->a_xhat != NULL) 5211 xhat_chgprot_all(sfmmup->sfmmu_as, addr, len, vprot); 5212 } 5213 5214 CPUSET_ZERO(cpuset); 5215 5216 if ((vprot != (uint_t)~PROT_WRITE) && (vprot & PROT_USER) && 5217 ((addr + len) > (caddr_t)USERLIMIT)) { 5218 panic("user addr %p vprot %x in kernel space", 5219 (void *)addr, vprot); 5220 } 5221 endaddr = addr + len; 5222 hblktag.htag_id = sfmmup; 5223 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 5224 DEMAP_RANGE_INIT(sfmmup, &dmr); 5225 5226 while (addr < endaddr) { 5227 hmeshift = HME_HASH_SHIFT(hashno); 5228 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 5229 hblktag.htag_rehash = hashno; 5230 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift); 5231 5232 SFMMU_HASH_LOCK(hmebp); 5233 5234 HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list); 5235 if (hmeblkp != NULL) { 5236 ASSERT(!hmeblkp->hblk_shared); 5237 /* 5238 * We've encountered a shadow hmeblk so skip the range 5239 * of the next smaller mapping size. 5240 */ 5241 if (hmeblkp->hblk_shw_bit) { 5242 ASSERT(sfmmup != ksfmmup); 5243 ASSERT(hashno > 1); 5244 addr = (caddr_t)P2END((uintptr_t)addr, 5245 TTEBYTES(hashno - 1)); 5246 } else { 5247 addr = sfmmu_hblk_chgprot(sfmmup, hmeblkp, 5248 addr, endaddr, &dmr, vprot); 5249 } 5250 SFMMU_HASH_UNLOCK(hmebp); 5251 hashno = 1; 5252 continue; 5253 } 5254 SFMMU_HASH_UNLOCK(hmebp); 5255 5256 if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) { 5257 /* 5258 * We have traversed the whole list and rehashed 5259 * if necessary without finding the address to chgprot. 5260 * This is ok so we increment the address by the 5261 * smallest hmeblk range for kernel mappings and the 5262 * largest hmeblk range, to account for shadow hmeblks, 5263 * for user mappings and continue. 5264 */ 5265 if (sfmmup == ksfmmup) 5266 addr = (caddr_t)P2END((uintptr_t)addr, 5267 TTEBYTES(1)); 5268 else 5269 addr = (caddr_t)P2END((uintptr_t)addr, 5270 TTEBYTES(hashno)); 5271 hashno = 1; 5272 } else { 5273 hashno++; 5274 } 5275 } 5276 5277 sfmmu_hblks_list_purge(&list); 5278 DEMAP_RANGE_FLUSH(&dmr); 5279 cpuset = sfmmup->sfmmu_cpusran; 5280 xt_sync(cpuset); 5281 } 5282 5283 /* 5284 * This function chgprots a range of addresses in an hmeblk. It returns the 5285 * next addres that needs to be chgprot. 5286 * It should be called with the hash lock held. 5287 * XXX It shold be possible to optimize chgprot by not flushing every time but 5288 * on the other hand: 5289 * 1. do one flush crosscall. 5290 * 2. 
only flush if we are increasing permissions (make sure this will work) 5291 */ 5292 static caddr_t 5293 sfmmu_hblk_chgprot(sfmmu_t *sfmmup, struct hme_blk *hmeblkp, caddr_t addr, 5294 caddr_t endaddr, demap_range_t *dmrp, uint_t vprot) 5295 { 5296 uint_t pprot; 5297 tte_t tte, ttemod; 5298 struct sf_hment *sfhmep; 5299 uint_t tteflags; 5300 int ttesz; 5301 struct page *pp = NULL; 5302 kmutex_t *pml, *pmtx; 5303 int ret; 5304 int use_demap_range; 5305 #if defined(SF_ERRATA_57) 5306 int check_exec; 5307 #endif 5308 5309 ASSERT(in_hblk_range(hmeblkp, addr)); 5310 ASSERT(hmeblkp->hblk_shw_bit == 0); 5311 ASSERT(!hmeblkp->hblk_shared); 5312 5313 #ifdef DEBUG 5314 if (get_hblk_ttesz(hmeblkp) != TTE8K && 5315 (endaddr < get_hblk_endaddr(hmeblkp))) { 5316 panic("sfmmu_hblk_chgprot: partial chgprot of large page"); 5317 } 5318 #endif /* DEBUG */ 5319 5320 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp)); 5321 ttesz = get_hblk_ttesz(hmeblkp); 5322 5323 pprot = sfmmu_vtop_prot(vprot, &tteflags); 5324 #if defined(SF_ERRATA_57) 5325 check_exec = (sfmmup != ksfmmup) && 5326 AS_TYPE_64BIT(sfmmup->sfmmu_as) && 5327 ((vprot & PROT_EXEC) == PROT_EXEC); 5328 #endif 5329 HBLKTOHME(sfhmep, hmeblkp, addr); 5330 5331 /* 5332 * Flush the current demap region if addresses have been 5333 * skipped or the page size doesn't match. 5334 */ 5335 use_demap_range = (TTEBYTES(ttesz) == MMU_PAGESIZE); 5336 if (use_demap_range) { 5337 DEMAP_RANGE_CONTINUE(dmrp, addr, endaddr); 5338 } else { 5339 DEMAP_RANGE_FLUSH(dmrp); 5340 } 5341 5342 while (addr < endaddr) { 5343 sfmmu_copytte(&sfhmep->hme_tte, &tte); 5344 if (TTE_IS_VALID(&tte)) { 5345 if (TTE_GET_LOFLAGS(&tte, tteflags) == pprot) { 5346 /* 5347 * if the new protection is the same as old 5348 * continue 5349 */ 5350 goto next_addr; 5351 } 5352 pml = NULL; 5353 pp = sfhmep->hme_page; 5354 if (pp) { 5355 pml = sfmmu_mlist_enter(pp); 5356 } 5357 if (pp != sfhmep->hme_page) { 5358 /* 5359 * tte most have been unloaded 5360 * underneath us. Recheck 5361 */ 5362 ASSERT(pml); 5363 sfmmu_mlist_exit(pml); 5364 continue; 5365 } 5366 5367 ASSERT(pp == NULL || sfmmu_mlist_held(pp)); 5368 5369 ttemod = tte; 5370 TTE_SET_LOFLAGS(&ttemod, tteflags, pprot); 5371 ASSERT(TTE_IS_SOFTEXEC(&tte) == 5372 TTE_IS_SOFTEXEC(&ttemod)); 5373 ASSERT(TTE_IS_EXECUTABLE(&tte) == 5374 TTE_IS_EXECUTABLE(&ttemod)); 5375 5376 #if defined(SF_ERRATA_57) 5377 if (check_exec && addr < errata57_limit) 5378 ttemod.tte_exec_perm = 0; 5379 #endif 5380 ret = sfmmu_modifytte_try(&tte, &ttemod, 5381 &sfhmep->hme_tte); 5382 5383 if (ret < 0) { 5384 /* tte changed underneath us */ 5385 if (pml) { 5386 sfmmu_mlist_exit(pml); 5387 } 5388 continue; 5389 } 5390 5391 if (tteflags & TTE_HWWR_INT) { 5392 /* 5393 * need to sync if we are clearing modify bit. 5394 */ 5395 sfmmu_ttesync(sfmmup, addr, &tte, pp); 5396 } 5397 5398 if (pp && PP_ISRO(pp)) { 5399 if (pprot & TTE_WRPRM_INT) { 5400 pmtx = sfmmu_page_enter(pp); 5401 PP_CLRRO(pp); 5402 sfmmu_page_exit(pmtx); 5403 } 5404 } 5405 5406 if (ret > 0 && use_demap_range) { 5407 DEMAP_RANGE_MARKPG(dmrp, addr); 5408 } else if (ret > 0) { 5409 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0); 5410 } 5411 5412 if (pml) { 5413 sfmmu_mlist_exit(pml); 5414 } 5415 } 5416 next_addr: 5417 addr += TTEBYTES(ttesz); 5418 sfhmep++; 5419 DEMAP_RANGE_NEXTPG(dmrp); 5420 } 5421 return (addr); 5422 } 5423 5424 /* 5425 * This routine is deprecated and should only be used by hat_chgprot. 5426 * The correct routine is sfmmu_vtop_attr. 
5427 * This routine converts virtual page protections to physical ones. It will 5428 * update the tteflags field with the tte mask corresponding to the protections 5429 * affected and it returns the new protections. It will also clear the modify 5430 * bit if we are taking away write permission. This is necessary since the 5431 * modify bit is the hardware permission bit and we need to clear it in order 5432 * to detect write faults. 5433 * It accepts the following special protections: 5434 * ~PROT_WRITE = remove write permissions. 5435 * ~PROT_USER = remove user permissions. 5436 */ 5437 static uint_t 5438 sfmmu_vtop_prot(uint_t vprot, uint_t *tteflagsp) 5439 { 5440 if (vprot == (uint_t)~PROT_WRITE) { 5441 *tteflagsp = TTE_WRPRM_INT | TTE_HWWR_INT; 5442 return (0); /* will cause wrprm to be cleared */ 5443 } 5444 if (vprot == (uint_t)~PROT_USER) { 5445 *tteflagsp = TTE_PRIV_INT; 5446 return (0); /* will cause privprm to be cleared */ 5447 } 5448 if ((vprot == 0) || (vprot == PROT_USER) || 5449 ((vprot & PROT_ALL) != vprot)) { 5450 panic("sfmmu_vtop_prot -- bad prot %x", vprot); 5451 } 5452 5453 switch (vprot) { 5454 case (PROT_READ): 5455 case (PROT_EXEC): 5456 case (PROT_EXEC | PROT_READ): 5457 *tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT | TTE_HWWR_INT; 5458 return (TTE_PRIV_INT); /* set prv and clr wrt */ 5459 case (PROT_WRITE): 5460 case (PROT_WRITE | PROT_READ): 5461 case (PROT_EXEC | PROT_WRITE): 5462 case (PROT_EXEC | PROT_WRITE | PROT_READ): 5463 *tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT; 5464 return (TTE_PRIV_INT | TTE_WRPRM_INT); /* set prv and wrt */ 5465 case (PROT_USER | PROT_READ): 5466 case (PROT_USER | PROT_EXEC): 5467 case (PROT_USER | PROT_EXEC | PROT_READ): 5468 *tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT | TTE_HWWR_INT; 5469 return (0); /* clr prv and wrt */ 5470 case (PROT_USER | PROT_WRITE): 5471 case (PROT_USER | PROT_WRITE | PROT_READ): 5472 case (PROT_USER | PROT_EXEC | PROT_WRITE): 5473 case (PROT_USER | PROT_EXEC | PROT_WRITE | PROT_READ): 5474 *tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT; 5475 return (TTE_WRPRM_INT); /* clr prv and set wrt */ 5476 default: 5477 panic("sfmmu_vtop_prot -- bad prot %x", vprot); 5478 } 5479 return (0); 5480 } 5481 5482 /* 5483 * Alternate unload for very large virtual ranges. With a true 64 bit VA, 5484 * the normal algorithm would take too long for a very large VA range with 5485 * few real mappings. This routine just walks thru all HMEs in the global 5486 * hash table to find and remove mappings. 5487 */ 5488 static void 5489 hat_unload_large_virtual( 5490 struct hat *sfmmup, 5491 caddr_t startaddr, 5492 size_t len, 5493 uint_t flags, 5494 hat_callback_t *callback) 5495 { 5496 struct hmehash_bucket *hmebp; 5497 struct hme_blk *hmeblkp; 5498 struct hme_blk *pr_hblk = NULL; 5499 struct hme_blk *nx_hblk; 5500 struct hme_blk *list = NULL; 5501 int i; 5502 uint64_t hblkpa, prevpa, nx_pa; 5503 demap_range_t dmr, *dmrp; 5504 cpuset_t cpuset; 5505 caddr_t endaddr = startaddr + len; 5506 caddr_t sa; 5507 caddr_t ea; 5508 caddr_t cb_sa[MAX_CB_ADDR]; 5509 caddr_t cb_ea[MAX_CB_ADDR]; 5510 int addr_cnt = 0; 5511 int a = 0; 5512 5513 if (sfmmup->sfmmu_free) { 5514 dmrp = NULL; 5515 } else { 5516 dmrp = &dmr; 5517 DEMAP_RANGE_INIT(sfmmup, dmrp); 5518 } 5519 5520 /* 5521 * Loop through all the hash buckets of HME blocks looking for matches. 
5522 */ 5523 for (i = 0; i <= UHMEHASH_SZ; i++) { 5524 hmebp = &uhme_hash[i]; 5525 SFMMU_HASH_LOCK(hmebp); 5526 hmeblkp = hmebp->hmeblkp; 5527 hblkpa = hmebp->hmeh_nextpa; 5528 prevpa = 0; 5529 pr_hblk = NULL; 5530 while (hmeblkp) { 5531 nx_hblk = hmeblkp->hblk_next; 5532 nx_pa = hmeblkp->hblk_nextpa; 5533 5534 /* 5535 * skip if not this context, if a shadow block or 5536 * if the mapping is not in the requested range 5537 */ 5538 if (hmeblkp->hblk_tag.htag_id != sfmmup || 5539 hmeblkp->hblk_shw_bit || 5540 (sa = (caddr_t)get_hblk_base(hmeblkp)) >= endaddr || 5541 (ea = get_hblk_endaddr(hmeblkp)) <= startaddr) { 5542 pr_hblk = hmeblkp; 5543 prevpa = hblkpa; 5544 goto next_block; 5545 } 5546 5547 ASSERT(!hmeblkp->hblk_shared); 5548 /* 5549 * unload if there are any current valid mappings 5550 */ 5551 if (hmeblkp->hblk_vcnt != 0 || 5552 hmeblkp->hblk_hmecnt != 0) 5553 (void) sfmmu_hblk_unload(sfmmup, hmeblkp, 5554 sa, ea, dmrp, flags); 5555 5556 /* 5557 * on unmap we also release the HME block itself, once 5558 * all mappings are gone. 5559 */ 5560 if ((flags & HAT_UNLOAD_UNMAP) != 0 && 5561 !hmeblkp->hblk_vcnt && 5562 !hmeblkp->hblk_hmecnt) { 5563 ASSERT(!hmeblkp->hblk_lckcnt); 5564 sfmmu_hblk_hash_rm(hmebp, hmeblkp, 5565 prevpa, pr_hblk); 5566 sfmmu_hblk_free(hmebp, hmeblkp, hblkpa, &list); 5567 } else { 5568 pr_hblk = hmeblkp; 5569 prevpa = hblkpa; 5570 } 5571 5572 if (callback == NULL) 5573 goto next_block; 5574 5575 /* 5576 * HME blocks may span more than one page, but we may be 5577 * unmapping only one page, so check for a smaller range 5578 * for the callback 5579 */ 5580 if (sa < startaddr) 5581 sa = startaddr; 5582 if (--ea > endaddr) 5583 ea = endaddr - 1; 5584 5585 cb_sa[addr_cnt] = sa; 5586 cb_ea[addr_cnt] = ea; 5587 if (++addr_cnt == MAX_CB_ADDR) { 5588 if (dmrp != NULL) { 5589 DEMAP_RANGE_FLUSH(dmrp); 5590 cpuset = sfmmup->sfmmu_cpusran; 5591 xt_sync(cpuset); 5592 } 5593 5594 for (a = 0; a < MAX_CB_ADDR; ++a) { 5595 callback->hcb_start_addr = cb_sa[a]; 5596 callback->hcb_end_addr = cb_ea[a]; 5597 callback->hcb_function(callback); 5598 } 5599 addr_cnt = 0; 5600 } 5601 5602 next_block: 5603 hmeblkp = nx_hblk; 5604 hblkpa = nx_pa; 5605 } 5606 SFMMU_HASH_UNLOCK(hmebp); 5607 } 5608 5609 sfmmu_hblks_list_purge(&list); 5610 if (dmrp != NULL) { 5611 DEMAP_RANGE_FLUSH(dmrp); 5612 cpuset = sfmmup->sfmmu_cpusran; 5613 xt_sync(cpuset); 5614 } 5615 5616 for (a = 0; a < addr_cnt; ++a) { 5617 callback->hcb_start_addr = cb_sa[a]; 5618 callback->hcb_end_addr = cb_ea[a]; 5619 callback->hcb_function(callback); 5620 } 5621 5622 /* 5623 * Check TSB and TLB page sizes if the process isn't exiting. 5624 */ 5625 if (!sfmmup->sfmmu_free) 5626 sfmmu_check_page_sizes(sfmmup, 0); 5627 } 5628 5629 /* 5630 * Unload all the mappings in the range [addr..addr+len). addr and len must 5631 * be MMU_PAGESIZE aligned. 
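 *
 * A hypothetical sketch of a caller that wants to know exactly which
 * subranges were unloaded (my_unload_done() is a placeholder, and any
 * other initialization hat_callback_t may require is omitted):
 *
 *	hat_callback_t cb;
 *	cb.hcb_function = my_unload_done;
 *	hat_unload_callback(hat, addr, len, HAT_UNLOAD_UNMAP, &cb);
 *
 * The framework fills in cb.hcb_start_addr and cb.hcb_end_addr for
 * each clump of unloaded pages before invoking hcb_function.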
5632 */ 5633 5634 extern struct seg *segkmap; 5635 #define ISSEGKMAP(sfmmup, addr) (sfmmup == ksfmmup && \ 5636 segkmap->s_base <= (addr) && (addr) < (segkmap->s_base + segkmap->s_size)) 5637 5638 5639 void 5640 hat_unload_callback( 5641 struct hat *sfmmup, 5642 caddr_t addr, 5643 size_t len, 5644 uint_t flags, 5645 hat_callback_t *callback) 5646 { 5647 struct hmehash_bucket *hmebp; 5648 hmeblk_tag hblktag; 5649 int hmeshift, hashno, iskernel; 5650 struct hme_blk *hmeblkp, *pr_hblk, *list = NULL; 5651 caddr_t endaddr; 5652 cpuset_t cpuset; 5653 uint64_t hblkpa, prevpa; 5654 int addr_count = 0; 5655 int a; 5656 caddr_t cb_start_addr[MAX_CB_ADDR]; 5657 caddr_t cb_end_addr[MAX_CB_ADDR]; 5658 int issegkmap = ISSEGKMAP(sfmmup, addr); 5659 demap_range_t dmr, *dmrp; 5660 5661 if (sfmmup->sfmmu_xhat_provider) { 5662 XHAT_UNLOAD_CALLBACK(sfmmup, addr, len, flags, callback); 5663 return; 5664 } else { 5665 /* 5666 * This must be a CPU HAT. If the address space has 5667 * XHATs attached, unload the mappings for all of them, 5668 * just in case 5669 */ 5670 ASSERT(sfmmup->sfmmu_as != NULL); 5671 if (sfmmup->sfmmu_as->a_xhat != NULL) 5672 xhat_unload_callback_all(sfmmup->sfmmu_as, addr, 5673 len, flags, callback); 5674 } 5675 5676 ASSERT((sfmmup == ksfmmup) || (flags & HAT_UNLOAD_OTHER) || \ 5677 AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 5678 5679 ASSERT(sfmmup != NULL); 5680 ASSERT((len & MMU_PAGEOFFSET) == 0); 5681 ASSERT(!((uintptr_t)addr & MMU_PAGEOFFSET)); 5682 5683 /* 5684 * Probing through a large VA range (say 63 bits) will be slow, even 5685 * at 4 Meg steps between the probes. So, when the virtual address range 5686 * is very large, search the HME entries for what to unload. 5687 * 5688 * len >> TTE_PAGE_SHIFT(TTE4M) is the # of 4Meg probes we'd need 5689 * 5690 * UHMEHASH_SZ is number of hash buckets to examine 5691 * 5692 */ 5693 if (sfmmup != KHATID && (len >> TTE_PAGE_SHIFT(TTE4M)) > UHMEHASH_SZ) { 5694 hat_unload_large_virtual(sfmmup, addr, len, flags, callback); 5695 return; 5696 } 5697 5698 CPUSET_ZERO(cpuset); 5699 5700 /* 5701 * If the process is exiting, we can save a lot of fuss since 5702 * we'll flush the TLB when we free the ctx anyway. 5703 */ 5704 if (sfmmup->sfmmu_free) 5705 dmrp = NULL; 5706 else 5707 dmrp = &dmr; 5708 5709 DEMAP_RANGE_INIT(sfmmup, dmrp); 5710 endaddr = addr + len; 5711 hblktag.htag_id = sfmmup; 5712 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 5713 5714 /* 5715 * It is likely for the vm to call unload over a wide range of 5716 * addresses that are actually very sparsely populated by 5717 * translations. In order to speed this up the sfmmu hat supports 5718 * the concept of shadow hmeblks. Dummy large page hmeblks that 5719 * correspond to actual small translations are allocated at tteload 5720 * time and are referred to as shadow hmeblks. Now, during unload 5721 * time, we first check if we have a shadow hmeblk for that 5722 * translation. The absence of one means the corresponding address 5723 * range is empty and can be skipped. 5724 * 5725 * The kernel is an exception to above statement and that is why 5726 * we don't use shadow hmeblks and hash starting from the smallest 5727 * page size. 
5728 */ 5729 if (sfmmup == KHATID) { 5730 iskernel = 1; 5731 hashno = TTE64K; 5732 } else { 5733 iskernel = 0; 5734 if (mmu_page_sizes == max_mmu_page_sizes) { 5735 hashno = TTE256M; 5736 } else { 5737 hashno = TTE4M; 5738 } 5739 } 5740 while (addr < endaddr) { 5741 hmeshift = HME_HASH_SHIFT(hashno); 5742 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 5743 hblktag.htag_rehash = hashno; 5744 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift); 5745 5746 SFMMU_HASH_LOCK(hmebp); 5747 5748 HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, hblkpa, pr_hblk, 5749 prevpa, &list); 5750 if (hmeblkp == NULL) { 5751 /* 5752 * didn't find an hmeblk. skip the appropiate 5753 * address range. 5754 */ 5755 SFMMU_HASH_UNLOCK(hmebp); 5756 if (iskernel) { 5757 if (hashno < mmu_hashcnt) { 5758 hashno++; 5759 continue; 5760 } else { 5761 hashno = TTE64K; 5762 addr = (caddr_t)roundup((uintptr_t)addr 5763 + 1, MMU_PAGESIZE64K); 5764 continue; 5765 } 5766 } 5767 addr = (caddr_t)roundup((uintptr_t)addr + 1, 5768 (1 << hmeshift)); 5769 if ((uintptr_t)addr & MMU_PAGEOFFSET512K) { 5770 ASSERT(hashno == TTE64K); 5771 continue; 5772 } 5773 if ((uintptr_t)addr & MMU_PAGEOFFSET4M) { 5774 hashno = TTE512K; 5775 continue; 5776 } 5777 if (mmu_page_sizes == max_mmu_page_sizes) { 5778 if ((uintptr_t)addr & MMU_PAGEOFFSET32M) { 5779 hashno = TTE4M; 5780 continue; 5781 } 5782 if ((uintptr_t)addr & MMU_PAGEOFFSET256M) { 5783 hashno = TTE32M; 5784 continue; 5785 } 5786 hashno = TTE256M; 5787 continue; 5788 } else { 5789 hashno = TTE4M; 5790 continue; 5791 } 5792 } 5793 ASSERT(hmeblkp); 5794 ASSERT(!hmeblkp->hblk_shared); 5795 if (!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) { 5796 /* 5797 * If the valid count is zero we can skip the range 5798 * mapped by this hmeblk. 5799 * We free hblks in the case of HAT_UNMAP. HAT_UNMAP 5800 * is used by segment drivers as a hint 5801 * that the mapping resource won't be used any longer. 5802 * The best example of this is during exit(). 5803 */ 5804 addr = (caddr_t)roundup((uintptr_t)addr + 1, 5805 get_hblk_span(hmeblkp)); 5806 if ((flags & HAT_UNLOAD_UNMAP) || 5807 (iskernel && !issegkmap)) { 5808 sfmmu_hblk_hash_rm(hmebp, hmeblkp, prevpa, 5809 pr_hblk); 5810 sfmmu_hblk_free(hmebp, hmeblkp, hblkpa, &list); 5811 } 5812 SFMMU_HASH_UNLOCK(hmebp); 5813 5814 if (iskernel) { 5815 hashno = TTE64K; 5816 continue; 5817 } 5818 if ((uintptr_t)addr & MMU_PAGEOFFSET512K) { 5819 ASSERT(hashno == TTE64K); 5820 continue; 5821 } 5822 if ((uintptr_t)addr & MMU_PAGEOFFSET4M) { 5823 hashno = TTE512K; 5824 continue; 5825 } 5826 if (mmu_page_sizes == max_mmu_page_sizes) { 5827 if ((uintptr_t)addr & MMU_PAGEOFFSET32M) { 5828 hashno = TTE4M; 5829 continue; 5830 } 5831 if ((uintptr_t)addr & MMU_PAGEOFFSET256M) { 5832 hashno = TTE32M; 5833 continue; 5834 } 5835 hashno = TTE256M; 5836 continue; 5837 } else { 5838 hashno = TTE4M; 5839 continue; 5840 } 5841 } 5842 if (hmeblkp->hblk_shw_bit) { 5843 /* 5844 * If we encounter a shadow hmeblk we know there is 5845 * smaller sized hmeblks mapping the same address space. 5846 * Decrement the hash size and rehash. 5847 */ 5848 ASSERT(sfmmup != KHATID); 5849 hashno--; 5850 SFMMU_HASH_UNLOCK(hmebp); 5851 continue; 5852 } 5853 5854 /* 5855 * track callback address ranges. 
5856 * only start a new range when it's not contiguous 5857 */ 5858 if (callback != NULL) { 5859 if (addr_count > 0 && 5860 addr == cb_end_addr[addr_count - 1]) 5861 --addr_count; 5862 else 5863 cb_start_addr[addr_count] = addr; 5864 } 5865 5866 addr = sfmmu_hblk_unload(sfmmup, hmeblkp, addr, endaddr, 5867 dmrp, flags); 5868 5869 if (callback != NULL) 5870 cb_end_addr[addr_count++] = addr; 5871 5872 if (((flags & HAT_UNLOAD_UNMAP) || (iskernel && !issegkmap)) && 5873 !hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) { 5874 sfmmu_hblk_hash_rm(hmebp, hmeblkp, prevpa, 5875 pr_hblk); 5876 sfmmu_hblk_free(hmebp, hmeblkp, hblkpa, &list); 5877 } 5878 SFMMU_HASH_UNLOCK(hmebp); 5879 5880 /* 5881 * Notify our caller as to exactly which pages 5882 * have been unloaded. We do these in clumps, 5883 * to minimize the number of xt_sync()s that need to occur. 5884 */ 5885 if (callback != NULL && addr_count == MAX_CB_ADDR) { 5886 DEMAP_RANGE_FLUSH(dmrp); 5887 if (dmrp != NULL) { 5888 cpuset = sfmmup->sfmmu_cpusran; 5889 xt_sync(cpuset); 5890 } 5891 5892 for (a = 0; a < MAX_CB_ADDR; ++a) { 5893 callback->hcb_start_addr = cb_start_addr[a]; 5894 callback->hcb_end_addr = cb_end_addr[a]; 5895 callback->hcb_function(callback); 5896 } 5897 addr_count = 0; 5898 } 5899 if (iskernel) { 5900 hashno = TTE64K; 5901 continue; 5902 } 5903 if ((uintptr_t)addr & MMU_PAGEOFFSET512K) { 5904 ASSERT(hashno == TTE64K); 5905 continue; 5906 } 5907 if ((uintptr_t)addr & MMU_PAGEOFFSET4M) { 5908 hashno = TTE512K; 5909 continue; 5910 } 5911 if (mmu_page_sizes == max_mmu_page_sizes) { 5912 if ((uintptr_t)addr & MMU_PAGEOFFSET32M) { 5913 hashno = TTE4M; 5914 continue; 5915 } 5916 if ((uintptr_t)addr & MMU_PAGEOFFSET256M) { 5917 hashno = TTE32M; 5918 continue; 5919 } 5920 hashno = TTE256M; 5921 } else { 5922 hashno = TTE4M; 5923 } 5924 } 5925 5926 sfmmu_hblks_list_purge(&list); 5927 DEMAP_RANGE_FLUSH(dmrp); 5928 if (dmrp != NULL) { 5929 cpuset = sfmmup->sfmmu_cpusran; 5930 xt_sync(cpuset); 5931 } 5932 if (callback && addr_count != 0) { 5933 for (a = 0; a < addr_count; ++a) { 5934 callback->hcb_start_addr = cb_start_addr[a]; 5935 callback->hcb_end_addr = cb_end_addr[a]; 5936 callback->hcb_function(callback); 5937 } 5938 } 5939 5940 /* 5941 * Check TSB and TLB page sizes if the process isn't exiting. 5942 */ 5943 if (!sfmmup->sfmmu_free) 5944 sfmmu_check_page_sizes(sfmmup, 0); 5945 } 5946 5947 /* 5948 * Unload all the mappings in the range [addr..addr+len). addr and len must 5949 * be MMU_PAGESIZE aligned. 5950 */ 5951 void 5952 hat_unload(struct hat *sfmmup, caddr_t addr, size_t len, uint_t flags) 5953 { 5954 if (sfmmup->sfmmu_xhat_provider) { 5955 XHAT_UNLOAD(sfmmup, addr, len, flags); 5956 return; 5957 } 5958 hat_unload_callback(sfmmup, addr, len, flags, NULL); 5959 } 5960 5961 5962 /* 5963 * Find the largest mapping size for this page. 5964 */ 5965 int 5966 fnd_mapping_sz(page_t *pp) 5967 { 5968 int sz; 5969 int p_index; 5970 5971 p_index = PP_MAPINDEX(pp); 5972 5973 sz = 0; 5974 p_index >>= 1; /* don't care about 8K bit */ 5975 for (; p_index; p_index >>= 1) { 5976 sz++; 5977 } 5978 5979 return (sz); 5980 } 5981 5982 /* 5983 * This function unloads a range of addresses for an hmeblk. 5984 * It returns the next address to be unloaded. 5985 * It should be called with the hash lock held. 
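 * (An illustrative aside follows.)
 */

/*
 * Illustrative sketch only -- not from the sfmmu code.  fnd_mapping_sz()
 * above derives the largest mapping size for a page by discarding the 8K
 * bit of the mapping index and then counting how many shifts it takes to
 * clear what is left.  The same walk, written over a plain integer so it
 * can be read in isolation:
 */
static int
largest_size_code(unsigned int map_index)
{
	int sz = 0;

	map_index >>= 1;		/* the 8K bit does not count */
	while (map_index != 0) {
		sz++;
		map_index >>= 1;
	}
	return (sz);
}

/*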
5986 */ 5987 static caddr_t 5988 sfmmu_hblk_unload(struct hat *sfmmup, struct hme_blk *hmeblkp, caddr_t addr, 5989 caddr_t endaddr, demap_range_t *dmrp, uint_t flags) 5990 { 5991 tte_t tte, ttemod; 5992 struct sf_hment *sfhmep; 5993 int ttesz; 5994 long ttecnt; 5995 page_t *pp; 5996 kmutex_t *pml; 5997 int ret; 5998 int use_demap_range; 5999 6000 ASSERT(in_hblk_range(hmeblkp, addr)); 6001 ASSERT(!hmeblkp->hblk_shw_bit); 6002 ASSERT(sfmmup != NULL || hmeblkp->hblk_shared); 6003 ASSERT(sfmmup == NULL || !hmeblkp->hblk_shared); 6004 ASSERT(dmrp == NULL || !hmeblkp->hblk_shared); 6005 6006 #ifdef DEBUG 6007 if (get_hblk_ttesz(hmeblkp) != TTE8K && 6008 (endaddr < get_hblk_endaddr(hmeblkp))) { 6009 panic("sfmmu_hblk_unload: partial unload of large page"); 6010 } 6011 #endif /* DEBUG */ 6012 6013 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp)); 6014 ttesz = get_hblk_ttesz(hmeblkp); 6015 6016 use_demap_range = ((dmrp == NULL) || 6017 (TTEBYTES(ttesz) == DEMAP_RANGE_PGSZ(dmrp))); 6018 6019 if (use_demap_range) { 6020 DEMAP_RANGE_CONTINUE(dmrp, addr, endaddr); 6021 } else { 6022 DEMAP_RANGE_FLUSH(dmrp); 6023 } 6024 ttecnt = 0; 6025 HBLKTOHME(sfhmep, hmeblkp, addr); 6026 6027 while (addr < endaddr) { 6028 pml = NULL; 6029 sfmmu_copytte(&sfhmep->hme_tte, &tte); 6030 if (TTE_IS_VALID(&tte)) { 6031 pp = sfhmep->hme_page; 6032 if (pp != NULL) { 6033 pml = sfmmu_mlist_enter(pp); 6034 } 6035 6036 /* 6037 * Verify if hme still points to 'pp' now that 6038 * we have p_mapping lock. 6039 */ 6040 if (sfhmep->hme_page != pp) { 6041 if (pp != NULL && sfhmep->hme_page != NULL) { 6042 ASSERT(pml != NULL); 6043 sfmmu_mlist_exit(pml); 6044 /* Re-start this iteration. */ 6045 continue; 6046 } 6047 ASSERT((pp != NULL) && 6048 (sfhmep->hme_page == NULL)); 6049 goto tte_unloaded; 6050 } 6051 6052 /* 6053 * This point on we have both HASH and p_mapping 6054 * lock. 6055 */ 6056 ASSERT(pp == sfhmep->hme_page); 6057 ASSERT(pp == NULL || sfmmu_mlist_held(pp)); 6058 6059 /* 6060 * We need to loop on modify tte because it is 6061 * possible for pagesync to come along and 6062 * change the software bits beneath us. 6063 * 6064 * Page_unload can also invalidate the tte after 6065 * we read tte outside of p_mapping lock. 6066 */ 6067 again: 6068 ttemod = tte; 6069 6070 TTE_SET_INVALID(&ttemod); 6071 ret = sfmmu_modifytte_try(&tte, &ttemod, 6072 &sfhmep->hme_tte); 6073 6074 if (ret <= 0) { 6075 if (TTE_IS_VALID(&tte)) { 6076 ASSERT(ret < 0); 6077 goto again; 6078 } 6079 if (pp != NULL) { 6080 panic("sfmmu_hblk_unload: pp = 0x%p " 6081 "tte became invalid under mlist" 6082 " lock = 0x%p", (void *)pp, 6083 (void *)pml); 6084 } 6085 continue; 6086 } 6087 6088 if (!(flags & HAT_UNLOAD_NOSYNC) || 6089 (pp != NULL && TTE_EXECUTED(&tte))) { 6090 sfmmu_ttesync(sfmmup, addr, &tte, pp); 6091 } 6092 6093 /* 6094 * Ok- we invalidated the tte. Do the rest of the job. 6095 */ 6096 ttecnt++; 6097 6098 if (flags & HAT_UNLOAD_UNLOCK) { 6099 ASSERT(hmeblkp->hblk_lckcnt > 0); 6100 atomic_add_32(&hmeblkp->hblk_lckcnt, -1); 6101 HBLK_STACK_TRACE(hmeblkp, HBLK_UNLOCK); 6102 } 6103 6104 /* 6105 * Normally we would need to flush the page 6106 * from the virtual cache at this point in 6107 * order to prevent a potential cache alias 6108 * inconsistency. 6109 * The particular scenario we need to worry 6110 * about is: 6111 * Given: va1 and va2 are two virtual address 6112 * that alias and map the same physical 6113 * address. 6114 * 1. mapping exists from va1 to pa and data 6115 * has been read into the cache. 6116 * 2. unload va1. 6117 * 3. 
load va2 and modify data using va2. 6118 * 4 unload va2. 6119 * 5. load va1 and reference data. Unless we 6120 * flush the data cache when we unload we will 6121 * get stale data. 6122 * Fortunately, page coloring eliminates the 6123 * above scenario by remembering the color a 6124 * physical page was last or is currently 6125 * mapped to. Now, we delay the flush until 6126 * the loading of translations. Only when the 6127 * new translation is of a different color 6128 * are we forced to flush. 6129 */ 6130 if (use_demap_range) { 6131 /* 6132 * Mark this page as needing a demap. 6133 */ 6134 DEMAP_RANGE_MARKPG(dmrp, addr); 6135 } else { 6136 ASSERT(sfmmup != NULL); 6137 ASSERT(!hmeblkp->hblk_shared); 6138 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 6139 sfmmup->sfmmu_free, 0); 6140 } 6141 6142 if (pp) { 6143 /* 6144 * Remove the hment from the mapping list 6145 */ 6146 ASSERT(hmeblkp->hblk_hmecnt > 0); 6147 6148 /* 6149 * Again, we cannot 6150 * ASSERT(hmeblkp->hblk_hmecnt <= NHMENTS); 6151 */ 6152 HME_SUB(sfhmep, pp); 6153 membar_stst(); 6154 atomic_add_16(&hmeblkp->hblk_hmecnt, -1); 6155 } 6156 6157 ASSERT(hmeblkp->hblk_vcnt > 0); 6158 atomic_add_16(&hmeblkp->hblk_vcnt, -1); 6159 6160 ASSERT(hmeblkp->hblk_hmecnt || hmeblkp->hblk_vcnt || 6161 !hmeblkp->hblk_lckcnt); 6162 6163 #ifdef VAC 6164 if (pp && (pp->p_nrm & (P_KPMC | P_KPMS | P_TNC))) { 6165 if (PP_ISTNC(pp)) { 6166 /* 6167 * If page was temporary 6168 * uncached, try to recache 6169 * it. Note that HME_SUB() was 6170 * called above so p_index and 6171 * mlist had been updated. 6172 */ 6173 conv_tnc(pp, ttesz); 6174 } else if (pp->p_mapping == NULL) { 6175 ASSERT(kpm_enable); 6176 /* 6177 * Page is marked to be in VAC conflict 6178 * to an existing kpm mapping and/or is 6179 * kpm mapped using only the regular 6180 * pagesize. 6181 */ 6182 sfmmu_kpm_hme_unload(pp); 6183 } 6184 } 6185 #endif /* VAC */ 6186 } else if ((pp = sfhmep->hme_page) != NULL) { 6187 /* 6188 * TTE is invalid but the hme 6189 * still exists. let pageunload 6190 * complete its job. 6191 */ 6192 ASSERT(pml == NULL); 6193 pml = sfmmu_mlist_enter(pp); 6194 if (sfhmep->hme_page != NULL) { 6195 sfmmu_mlist_exit(pml); 6196 continue; 6197 } 6198 ASSERT(sfhmep->hme_page == NULL); 6199 } else if (hmeblkp->hblk_hmecnt != 0) { 6200 /* 6201 * pageunload may have not finished decrementing 6202 * hblk_vcnt and hblk_hmecnt. Find page_t if any and 6203 * wait for pageunload to finish. Rely on pageunload 6204 * to decrement hblk_hmecnt after hblk_vcnt. 6205 */ 6206 pfn_t pfn = TTE_TO_TTEPFN(&tte); 6207 ASSERT(pml == NULL); 6208 if (pf_is_memory(pfn)) { 6209 pp = page_numtopp_nolock(pfn); 6210 if (pp != NULL) { 6211 pml = sfmmu_mlist_enter(pp); 6212 sfmmu_mlist_exit(pml); 6213 pml = NULL; 6214 } 6215 } 6216 } 6217 6218 tte_unloaded: 6219 /* 6220 * At this point, the tte we are looking at 6221 * should be unloaded, and hme has been unlinked 6222 * from page too. This is important because in 6223 * pageunload, it does ttesync() then HME_SUB. 6224 * We need to make sure HME_SUB has been completed 6225 * so we know ttesync() has been completed. Otherwise, 6226 * at exit time, after return from hat layer, VM will 6227 * release as structure which hat_setstat() (called 6228 * by ttesync()) needs. 
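 * (An aside on the modify-tte retry idiom used in this function follows,
 * fenced off with #if 0.)
 */

/*
 * Illustrative sketch only -- hypothetical code, not part of this
 * function.  The "again:" loop earlier in sfmmu_hblk_unload() (and the
 * other sfmmu_modifytte_try() callers in this file) follow the usual
 * read / modify-a-copy / compare-and-swap / retry pattern.  The same idea
 * expressed with C11 <stdatomic.h>, which the kernel code does not use:
 */
#if 0
#include <stdatomic.h>

static void
clear_bits_atomically(_Atomic unsigned long *word, unsigned long bits)
{
	unsigned long old = atomic_load(word);
	unsigned long new;

	do {
		new = old & ~bits;
		/* on failure "old" is refreshed with the current value */
	} while (!atomic_compare_exchange_weak(word, &old, new));
}
#endif

/*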
6229 */
6230 #ifdef DEBUG
6231 {
6232 tte_t dtte;
6233
6234 ASSERT(sfhmep->hme_page == NULL);
6235
6236 sfmmu_copytte(&sfhmep->hme_tte, &dtte);
6237 ASSERT(!TTE_IS_VALID(&dtte));
6238 }
6239 #endif
6240
6241 if (pml) {
6242 sfmmu_mlist_exit(pml);
6243 }
6244
6245 addr += TTEBYTES(ttesz);
6246 sfhmep++;
6247 DEMAP_RANGE_NEXTPG(dmrp);
6248 }
6249 /*
6250 * For shared hmeblks this routine is only called when region is freed
6251 * and no longer referenced. So no need to decrement ttecnt
6252 * in the region structure here.
6253 */
6254 if (ttecnt > 0 && sfmmup != NULL) {
6255 atomic_add_long(&sfmmup->sfmmu_ttecnt[ttesz], -ttecnt);
6256 }
6257 return (addr);
6258 }
6259
6260 /*
6261 * Synchronize all the mappings in the range [addr..addr+len).
6262 * Can be called with clearflag having two states:
6263 * HAT_SYNC_DONTZERO means just return the rm stats
6264 * HAT_SYNC_ZERORM means zero rm bits in the tte and return the stats
6265 */
6266 void
6267 hat_sync(struct hat *sfmmup, caddr_t addr, size_t len, uint_t clearflag)
6268 {
6269 struct hmehash_bucket *hmebp;
6270 hmeblk_tag hblktag;
6271 int hmeshift, hashno = 1;
6272 struct hme_blk *hmeblkp, *list = NULL;
6273 caddr_t endaddr;
6274 cpuset_t cpuset;
6275
6276 ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
6277 ASSERT((sfmmup == ksfmmup) ||
6278 AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock));
6279 ASSERT((len & MMU_PAGEOFFSET) == 0);
6280 ASSERT((clearflag == HAT_SYNC_DONTZERO) ||
6281 (clearflag == HAT_SYNC_ZERORM));
6282
6283 CPUSET_ZERO(cpuset);
6284
6285 endaddr = addr + len;
6286 hblktag.htag_id = sfmmup;
6287 hblktag.htag_rid = SFMMU_INVALID_SHMERID;
6288
6289 /*
6290 * Spitfire supports 4 page sizes.
6291 * Most pages are expected to be of the smallest page
6292 * size (8K) and these will not need to be rehashed. 64K
6293 * pages also don't need to be rehashed because an hmeblk
6294 * spans 64K of address space. 512K pages might need 1 rehash
6295 * and 4M pages 2 rehashes.
6296 */
6297 while (addr < endaddr) {
6298 hmeshift = HME_HASH_SHIFT(hashno);
6299 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
6300 hblktag.htag_rehash = hashno;
6301 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);
6302
6303 SFMMU_HASH_LOCK(hmebp);
6304
6305 HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list);
6306 if (hmeblkp != NULL) {
6307 ASSERT(!hmeblkp->hblk_shared);
6308 /*
6309 * We've encountered a shadow hmeblk so skip the range
6310 * of the next smaller mapping size.
6311 */
6312 if (hmeblkp->hblk_shw_bit) {
6313 ASSERT(sfmmup != ksfmmup);
6314 ASSERT(hashno > 1);
6315 addr = (caddr_t)P2END((uintptr_t)addr,
6316 TTEBYTES(hashno - 1));
6317 } else {
6318 addr = sfmmu_hblk_sync(sfmmup, hmeblkp,
6319 addr, endaddr, clearflag);
6320 }
6321 SFMMU_HASH_UNLOCK(hmebp);
6322 hashno = 1;
6323 continue;
6324 }
6325 SFMMU_HASH_UNLOCK(hmebp);
6326
6327 if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) {
6328 /*
6329 * We have traversed the whole list and rehashed
6330 * if necessary without finding the address to sync.
6331 * This is ok, so we increment the address by the
6332 * smallest hmeblk range for kernel mappings, and by the
6333 * largest hmeblk range (to account for shadow hmeblks)
6334 * for user mappings, and continue.
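 * (A small sketch of this address stepping follows, fenced off with
 * #if 0.)
 */

/*
 * Illustrative sketch only -- hypothetical code, not part of this
 * function.  For a power-of-two span, the P2END()-style stepping used
 * here appears to amount to "advance to the end of the naturally aligned
 * region containing addr":
 */
#if 0
static unsigned long
end_of_aligned_region(unsigned long addr, unsigned long span)
{
	return ((addr & ~(span - 1)) + span);	/* span: power of two */
}
#endif

/*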
6335 */ 6336 if (sfmmup == ksfmmup) 6337 addr = (caddr_t)P2END((uintptr_t)addr, 6338 TTEBYTES(1)); 6339 else 6340 addr = (caddr_t)P2END((uintptr_t)addr, 6341 TTEBYTES(hashno)); 6342 hashno = 1; 6343 } else { 6344 hashno++; 6345 } 6346 } 6347 sfmmu_hblks_list_purge(&list); 6348 cpuset = sfmmup->sfmmu_cpusran; 6349 xt_sync(cpuset); 6350 } 6351 6352 static caddr_t 6353 sfmmu_hblk_sync(struct hat *sfmmup, struct hme_blk *hmeblkp, caddr_t addr, 6354 caddr_t endaddr, int clearflag) 6355 { 6356 tte_t tte, ttemod; 6357 struct sf_hment *sfhmep; 6358 int ttesz; 6359 struct page *pp; 6360 kmutex_t *pml; 6361 int ret; 6362 6363 ASSERT(hmeblkp->hblk_shw_bit == 0); 6364 ASSERT(!hmeblkp->hblk_shared); 6365 6366 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp)); 6367 6368 ttesz = get_hblk_ttesz(hmeblkp); 6369 HBLKTOHME(sfhmep, hmeblkp, addr); 6370 6371 while (addr < endaddr) { 6372 sfmmu_copytte(&sfhmep->hme_tte, &tte); 6373 if (TTE_IS_VALID(&tte)) { 6374 pml = NULL; 6375 pp = sfhmep->hme_page; 6376 if (pp) { 6377 pml = sfmmu_mlist_enter(pp); 6378 } 6379 if (pp != sfhmep->hme_page) { 6380 /* 6381 * tte most have been unloaded 6382 * underneath us. Recheck 6383 */ 6384 ASSERT(pml); 6385 sfmmu_mlist_exit(pml); 6386 continue; 6387 } 6388 6389 ASSERT(pp == NULL || sfmmu_mlist_held(pp)); 6390 6391 if (clearflag == HAT_SYNC_ZERORM) { 6392 ttemod = tte; 6393 TTE_CLR_RM(&ttemod); 6394 ret = sfmmu_modifytte_try(&tte, &ttemod, 6395 &sfhmep->hme_tte); 6396 if (ret < 0) { 6397 if (pml) { 6398 sfmmu_mlist_exit(pml); 6399 } 6400 continue; 6401 } 6402 6403 if (ret > 0) { 6404 sfmmu_tlb_demap(addr, sfmmup, 6405 hmeblkp, 0, 0); 6406 } 6407 } 6408 sfmmu_ttesync(sfmmup, addr, &tte, pp); 6409 if (pml) { 6410 sfmmu_mlist_exit(pml); 6411 } 6412 } 6413 addr += TTEBYTES(ttesz); 6414 sfhmep++; 6415 } 6416 return (addr); 6417 } 6418 6419 /* 6420 * This function will sync a tte to the page struct and it will 6421 * update the hat stats. Currently it allows us to pass a NULL pp 6422 * and we will simply update the stats. We may want to change this 6423 * so we only keep stats for pages backed by pp's. 6424 */ 6425 static void 6426 sfmmu_ttesync(struct hat *sfmmup, caddr_t addr, tte_t *ttep, page_t *pp) 6427 { 6428 uint_t rm = 0; 6429 int sz = TTE_CSZ(ttep); 6430 pgcnt_t npgs; 6431 6432 ASSERT(TTE_IS_VALID(ttep)); 6433 6434 if (!TTE_IS_NOSYNC(ttep)) { 6435 6436 if (TTE_IS_REF(ttep)) 6437 rm |= P_REF; 6438 6439 if (TTE_IS_MOD(ttep)) 6440 rm |= P_MOD; 6441 6442 if (rm != 0) { 6443 if (sfmmup != NULL && sfmmup->sfmmu_rmstat) { 6444 int i; 6445 caddr_t vaddr = addr; 6446 6447 for (i = 0; i < TTEPAGES(sz); i++) { 6448 hat_setstat(sfmmup->sfmmu_as, vaddr, 6449 MMU_PAGESIZE, rm); 6450 vaddr += MMU_PAGESIZE; 6451 } 6452 } 6453 } 6454 } 6455 6456 if (!pp) 6457 return; 6458 6459 /* 6460 * If software says this page is executable, and the page was 6461 * in fact executed (indicated by hardware exec permission 6462 * being enabled), then set P_EXEC on the page to remember 6463 * that it was executed. The I$ will be flushed when the page 6464 * is reassigned. 6465 */ 6466 if (TTE_EXECUTED(ttep)) { 6467 rm |= P_EXEC; 6468 } else if (rm == 0) { 6469 return; 6470 } 6471 6472 /* 6473 * XXX I want to use cas to update nrm bits but they 6474 * currently belong in common/vm and not in hat where 6475 * they should be. 6476 * The nrm bits are protected by the same mutex as 6477 * the one that protects the page's mapping list. 
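 * (An outline of the constituent-page walk below follows, fenced off
 * with #if 0.)
 */

/*
 * Illustrative sketch only -- hypothetical code, not part of this
 * function.  For a large mapping, the do/while loop just below applies
 * the ref/mod/exec attributes gathered from the TTE to every constituent
 * page it covers.  "set_attrs" and "next_page" are invented stand-ins for
 * hat_page_setattr() and PP_PAGENEXT():
 */
#if 0
static void
sync_group_attrs(void *pp, unsigned long npgs, unsigned int rm,
    void (*set_attrs)(void *, unsigned int),
    void *(*next_page)(void *))
{
	while (npgs-- != 0 && pp != NULL) {
		set_attrs(pp, rm);
		pp = next_page(pp);
	}
}
#endif

/*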
6478 */ 6479 ASSERT(sfmmu_mlist_held(pp)); 6480 /* 6481 * If the tte is for a large page, we need to sync all the 6482 * pages covered by the tte. 6483 */ 6484 if (sz != TTE8K) { 6485 ASSERT(pp->p_szc != 0); 6486 pp = PP_GROUPLEADER(pp, sz); 6487 ASSERT(sfmmu_mlist_held(pp)); 6488 } 6489 6490 /* Get number of pages from tte size. */ 6491 npgs = TTEPAGES(sz); 6492 6493 do { 6494 ASSERT(pp); 6495 ASSERT(sfmmu_mlist_held(pp)); 6496 if (((rm & P_REF) != 0 && !PP_ISREF(pp)) || 6497 ((rm & P_MOD) != 0 && !PP_ISMOD(pp)) || 6498 ((rm & P_EXEC) != 0 && !PP_ISEXEC(pp))) 6499 hat_page_setattr(pp, rm); 6500 6501 /* 6502 * Are we done? If not, we must have a large mapping. 6503 * For large mappings we need to sync the rest of the pages 6504 * covered by this tte; goto the next page. 6505 */ 6506 } while (--npgs > 0 && (pp = PP_PAGENEXT(pp))); 6507 } 6508 6509 /* 6510 * Execute pre-callback handler of each pa_hment linked to pp 6511 * 6512 * Inputs: 6513 * flag: either HAT_PRESUSPEND or HAT_SUSPEND. 6514 * capture_cpus: pointer to return value (below) 6515 * 6516 * Returns: 6517 * Propagates the subsystem callback return values back to the caller; 6518 * returns 0 on success. If capture_cpus is non-NULL, the value returned 6519 * is zero if all of the pa_hments are of a type that do not require 6520 * capturing CPUs prior to suspending the mapping, else it is 1. 6521 */ 6522 static int 6523 hat_pageprocess_precallbacks(struct page *pp, uint_t flag, int *capture_cpus) 6524 { 6525 struct sf_hment *sfhmep; 6526 struct pa_hment *pahmep; 6527 int (*f)(caddr_t, uint_t, uint_t, void *); 6528 int ret; 6529 id_t id; 6530 int locked = 0; 6531 kmutex_t *pml; 6532 6533 ASSERT(PAGE_EXCL(pp)); 6534 if (!sfmmu_mlist_held(pp)) { 6535 pml = sfmmu_mlist_enter(pp); 6536 locked = 1; 6537 } 6538 6539 if (capture_cpus) 6540 *capture_cpus = 0; 6541 6542 top: 6543 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) { 6544 /* 6545 * skip sf_hments corresponding to VA<->PA mappings; 6546 * for pa_hment's, hme_tte.ll is zero 6547 */ 6548 if (!IS_PAHME(sfhmep)) 6549 continue; 6550 6551 pahmep = sfhmep->hme_data; 6552 ASSERT(pahmep != NULL); 6553 6554 /* 6555 * skip if pre-handler has been called earlier in this loop 6556 */ 6557 if (pahmep->flags & flag) 6558 continue; 6559 6560 id = pahmep->cb_id; 6561 ASSERT(id >= (id_t)0 && id < sfmmu_cb_nextid); 6562 if (capture_cpus && sfmmu_cb_table[id].capture_cpus != 0) 6563 *capture_cpus = 1; 6564 if ((f = sfmmu_cb_table[id].prehandler) == NULL) { 6565 pahmep->flags |= flag; 6566 continue; 6567 } 6568 6569 /* 6570 * Drop the mapping list lock to avoid locking order issues. 6571 */ 6572 if (locked) 6573 sfmmu_mlist_exit(pml); 6574 6575 ret = f(pahmep->addr, pahmep->len, flag, pahmep->pvt); 6576 if (ret != 0) 6577 return (ret); /* caller must do the cleanup */ 6578 6579 if (locked) { 6580 pml = sfmmu_mlist_enter(pp); 6581 pahmep->flags |= flag; 6582 goto top; 6583 } 6584 6585 pahmep->flags |= flag; 6586 } 6587 6588 if (locked) 6589 sfmmu_mlist_exit(pml); 6590 6591 return (0); 6592 } 6593 6594 /* 6595 * Execute post-callback handler of each pa_hment linked to pp 6596 * 6597 * Same overall assumptions and restrictions apply as for 6598 * hat_pageprocess_precallbacks(). 
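 * (A skeleton of the lock-dropping pattern used by these walks follows.)
 */

/*
 * Illustrative sketch only -- not from the sfmmu code.  Both
 * hat_pageprocess_precallbacks() above and the post-callback walk below
 * drop the mapping-list lock before calling into the registered handler
 * and, because the list may have changed meanwhile, restart the scan from
 * the head after re-acquiring it, skipping items already handled via a
 * per-item flag.  The types and helpers here are invented:
 */
struct cb_item {
	struct cb_item	*ci_next;
	unsigned int	ci_flags;		/* CI_DONE once handled */
	int		(*ci_handler)(struct cb_item *);
};
#define	CI_DONE	0x1

static int
walk_with_unlocked_callbacks(struct cb_item *head,
    void (*lock)(void), void (*unlock)(void))
{
	struct cb_item *ip;
	int ret;

	lock();
restart:
	for (ip = head; ip != NULL; ip = ip->ci_next) {
		if (ip->ci_flags & CI_DONE)
			continue;
		unlock();			/* avoid lock-order issues */
		ret = ip->ci_handler(ip);
		if (ret != 0)
			return (ret);		/* caller does the cleanup */
		lock();
		ip->ci_flags |= CI_DONE;
		goto restart;			/* list may have changed */
	}
	unlock();
	return (0);
}

/*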
6599 */ 6600 static void 6601 hat_pageprocess_postcallbacks(struct page *pp, uint_t flag) 6602 { 6603 pfn_t pgpfn = pp->p_pagenum; 6604 pfn_t pgmask = btop(page_get_pagesize(pp->p_szc)) - 1; 6605 pfn_t newpfn; 6606 struct sf_hment *sfhmep; 6607 struct pa_hment *pahmep; 6608 int (*f)(caddr_t, uint_t, uint_t, void *, pfn_t); 6609 id_t id; 6610 int locked = 0; 6611 kmutex_t *pml; 6612 6613 ASSERT(PAGE_EXCL(pp)); 6614 if (!sfmmu_mlist_held(pp)) { 6615 pml = sfmmu_mlist_enter(pp); 6616 locked = 1; 6617 } 6618 6619 top: 6620 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) { 6621 /* 6622 * skip sf_hments corresponding to VA<->PA mappings; 6623 * for pa_hment's, hme_tte.ll is zero 6624 */ 6625 if (!IS_PAHME(sfhmep)) 6626 continue; 6627 6628 pahmep = sfhmep->hme_data; 6629 ASSERT(pahmep != NULL); 6630 6631 if ((pahmep->flags & flag) == 0) 6632 continue; 6633 6634 pahmep->flags &= ~flag; 6635 6636 id = pahmep->cb_id; 6637 ASSERT(id >= (id_t)0 && id < sfmmu_cb_nextid); 6638 if ((f = sfmmu_cb_table[id].posthandler) == NULL) 6639 continue; 6640 6641 /* 6642 * Convert the base page PFN into the constituent PFN 6643 * which is needed by the callback handler. 6644 */ 6645 newpfn = pgpfn | (btop((uintptr_t)pahmep->addr) & pgmask); 6646 6647 /* 6648 * Drop the mapping list lock to avoid locking order issues. 6649 */ 6650 if (locked) 6651 sfmmu_mlist_exit(pml); 6652 6653 if (f(pahmep->addr, pahmep->len, flag, pahmep->pvt, newpfn) 6654 != 0) 6655 panic("sfmmu: posthandler failed"); 6656 6657 if (locked) { 6658 pml = sfmmu_mlist_enter(pp); 6659 goto top; 6660 } 6661 } 6662 6663 if (locked) 6664 sfmmu_mlist_exit(pml); 6665 } 6666 6667 /* 6668 * Suspend locked kernel mapping 6669 */ 6670 void 6671 hat_pagesuspend(struct page *pp) 6672 { 6673 struct sf_hment *sfhmep; 6674 sfmmu_t *sfmmup; 6675 tte_t tte, ttemod; 6676 struct hme_blk *hmeblkp; 6677 caddr_t addr; 6678 int index, cons; 6679 cpuset_t cpuset; 6680 6681 ASSERT(PAGE_EXCL(pp)); 6682 ASSERT(sfmmu_mlist_held(pp)); 6683 6684 mutex_enter(&kpr_suspendlock); 6685 6686 /* 6687 * We're about to suspend a kernel mapping so mark this thread as 6688 * non-traceable by DTrace. This prevents us from running into issues 6689 * with probe context trying to touch a suspended page 6690 * in the relocation codepath itself. 6691 */ 6692 curthread->t_flag |= T_DONTDTRACE; 6693 6694 index = PP_MAPINDEX(pp); 6695 cons = TTE8K; 6696 6697 retry: 6698 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) { 6699 6700 if (IS_PAHME(sfhmep)) 6701 continue; 6702 6703 if (get_hblk_ttesz(sfmmu_hmetohblk(sfhmep)) != cons) 6704 continue; 6705 6706 /* 6707 * Loop until we successfully set the suspend bit in 6708 * the TTE. 6709 */ 6710 again: 6711 sfmmu_copytte(&sfhmep->hme_tte, &tte); 6712 ASSERT(TTE_IS_VALID(&tte)); 6713 6714 ttemod = tte; 6715 TTE_SET_SUSPEND(&ttemod); 6716 if (sfmmu_modifytte_try(&tte, &ttemod, 6717 &sfhmep->hme_tte) < 0) 6718 goto again; 6719 6720 /* 6721 * Invalidate TSB entry 6722 */ 6723 hmeblkp = sfmmu_hmetohblk(sfhmep); 6724 6725 sfmmup = hblktosfmmu(hmeblkp); 6726 ASSERT(sfmmup == ksfmmup); 6727 ASSERT(!hmeblkp->hblk_shared); 6728 6729 addr = tte_to_vaddr(hmeblkp, tte); 6730 6731 /* 6732 * No need to make sure that the TSB for this sfmmu is 6733 * not being relocated since it is ksfmmup and thus it 6734 * will never be relocated. 
6735 */ 6736 SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, 0); 6737 6738 /* 6739 * Update xcall stats 6740 */ 6741 cpuset = cpu_ready_set; 6742 CPUSET_DEL(cpuset, CPU->cpu_id); 6743 6744 /* LINTED: constant in conditional context */ 6745 SFMMU_XCALL_STATS(ksfmmup); 6746 6747 /* 6748 * Flush TLB entry on remote CPU's 6749 */ 6750 xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)addr, 6751 (uint64_t)ksfmmup); 6752 xt_sync(cpuset); 6753 6754 /* 6755 * Flush TLB entry on local CPU 6756 */ 6757 vtag_flushpage(addr, (uint64_t)ksfmmup); 6758 } 6759 6760 while (index != 0) { 6761 index = index >> 1; 6762 if (index != 0) 6763 cons++; 6764 if (index & 0x1) { 6765 pp = PP_GROUPLEADER(pp, cons); 6766 goto retry; 6767 } 6768 } 6769 } 6770 6771 #ifdef DEBUG 6772 6773 #define N_PRLE 1024 6774 struct prle { 6775 page_t *targ; 6776 page_t *repl; 6777 int status; 6778 int pausecpus; 6779 hrtime_t whence; 6780 }; 6781 6782 static struct prle page_relocate_log[N_PRLE]; 6783 static int prl_entry; 6784 static kmutex_t prl_mutex; 6785 6786 #define PAGE_RELOCATE_LOG(t, r, s, p) \ 6787 mutex_enter(&prl_mutex); \ 6788 page_relocate_log[prl_entry].targ = *(t); \ 6789 page_relocate_log[prl_entry].repl = *(r); \ 6790 page_relocate_log[prl_entry].status = (s); \ 6791 page_relocate_log[prl_entry].pausecpus = (p); \ 6792 page_relocate_log[prl_entry].whence = gethrtime(); \ 6793 prl_entry = (prl_entry == (N_PRLE - 1))? 0 : prl_entry + 1; \ 6794 mutex_exit(&prl_mutex); 6795 6796 #else /* !DEBUG */ 6797 #define PAGE_RELOCATE_LOG(t, r, s, p) 6798 #endif 6799 6800 /* 6801 * Core Kernel Page Relocation Algorithm 6802 * 6803 * Input: 6804 * 6805 * target : constituent pages are SE_EXCL locked. 6806 * replacement: constituent pages are SE_EXCL locked. 6807 * 6808 * Output: 6809 * 6810 * nrelocp: number of pages relocated 6811 */ 6812 int 6813 hat_page_relocate(page_t **target, page_t **replacement, spgcnt_t *nrelocp) 6814 { 6815 page_t *targ, *repl; 6816 page_t *tpp, *rpp; 6817 kmutex_t *low, *high; 6818 spgcnt_t npages, i; 6819 page_t *pl = NULL; 6820 uint_t ppattr; 6821 int old_pil; 6822 cpuset_t cpuset; 6823 int cap_cpus; 6824 int ret; 6825 #ifdef VAC 6826 int cflags = 0; 6827 #endif 6828 6829 if (hat_kpr_enabled == 0 || !kcage_on || PP_ISNORELOC(*target)) { 6830 PAGE_RELOCATE_LOG(target, replacement, EAGAIN, -1); 6831 return (EAGAIN); 6832 } 6833 6834 mutex_enter(&kpr_mutex); 6835 kreloc_thread = curthread; 6836 6837 targ = *target; 6838 repl = *replacement; 6839 ASSERT(repl != NULL); 6840 ASSERT(targ->p_szc == repl->p_szc); 6841 6842 npages = page_get_pagecnt(targ->p_szc); 6843 6844 /* 6845 * unload VA<->PA mappings that are not locked 6846 */ 6847 tpp = targ; 6848 for (i = 0; i < npages; i++) { 6849 (void) hat_pageunload(tpp, SFMMU_KERNEL_RELOC); 6850 tpp++; 6851 } 6852 6853 /* 6854 * Do "presuspend" callbacks, in a context from which we can still 6855 * block as needed. Note that we don't hold the mapping list lock 6856 * of "targ" at this point due to potential locking order issues; 6857 * we assume that between the hat_pageunload() above and holding 6858 * the SE_EXCL lock that the mapping list *cannot* change at this 6859 * point. 6860 */ 6861 ret = hat_pageprocess_precallbacks(targ, HAT_PRESUSPEND, &cap_cpus); 6862 if (ret != 0) { 6863 /* 6864 * EIO translates to fatal error, for all others cleanup 6865 * and return EAGAIN. 
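 * (An aside on the PAGE_RELOCATE_LOG() ring buffer defined above follows,
 * fenced off with #if 0.)
 */

/*
 * Illustrative sketch only -- hypothetical code, not part of this
 * function.  The DEBUG-only PAGE_RELOCATE_LOG() macro above is a classic
 * fixed-size ring log: take a lock, fill the current slot, advance the
 * index modulo the ring size.  A userland-flavoured version, with
 * pthread_mutex_t standing in for kmutex_t and invented field names:
 */
#if 0
#include <pthread.h>

#define	RL_SLOTS	1024

struct rl_entry {
	void		*rl_targ;
	void		*rl_repl;
	int		rl_status;
	long long	rl_when;
};

static struct rl_entry	rl_ring[RL_SLOTS];
static int		rl_idx;
static pthread_mutex_t	rl_lock = PTHREAD_MUTEX_INITIALIZER;

static void
rl_log(void *targ, void *repl, int status, long long now)
{
	(void) pthread_mutex_lock(&rl_lock);
	rl_ring[rl_idx].rl_targ = targ;
	rl_ring[rl_idx].rl_repl = repl;
	rl_ring[rl_idx].rl_status = status;
	rl_ring[rl_idx].rl_when = now;
	rl_idx = (rl_idx + 1) % RL_SLOTS;
	(void) pthread_mutex_unlock(&rl_lock);
}
#endif

/*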
6866 */ 6867 ASSERT(ret != EIO); 6868 hat_pageprocess_postcallbacks(targ, HAT_POSTUNSUSPEND); 6869 PAGE_RELOCATE_LOG(target, replacement, ret, -1); 6870 kreloc_thread = NULL; 6871 mutex_exit(&kpr_mutex); 6872 return (EAGAIN); 6873 } 6874 6875 /* 6876 * acquire p_mapping list lock for both the target and replacement 6877 * root pages. 6878 * 6879 * low and high refer to the need to grab the mlist locks in a 6880 * specific order in order to prevent race conditions. Thus the 6881 * lower lock must be grabbed before the higher lock. 6882 * 6883 * This will block hat_unload's accessing p_mapping list. Since 6884 * we have SE_EXCL lock, hat_memload and hat_pageunload will be 6885 * blocked. Thus, no one else will be accessing the p_mapping list 6886 * while we suspend and reload the locked mapping below. 6887 */ 6888 tpp = targ; 6889 rpp = repl; 6890 sfmmu_mlist_reloc_enter(tpp, rpp, &low, &high); 6891 6892 kpreempt_disable(); 6893 6894 /* 6895 * We raise our PIL to 13 so that we don't get captured by 6896 * another CPU or pinned by an interrupt thread. We can't go to 6897 * PIL 14 since the nexus driver(s) may need to interrupt at 6898 * that level in the case of IOMMU pseudo mappings. 6899 */ 6900 cpuset = cpu_ready_set; 6901 CPUSET_DEL(cpuset, CPU->cpu_id); 6902 if (!cap_cpus || CPUSET_ISNULL(cpuset)) { 6903 old_pil = splr(XCALL_PIL); 6904 } else { 6905 old_pil = -1; 6906 xc_attention(cpuset); 6907 } 6908 ASSERT(getpil() == XCALL_PIL); 6909 6910 /* 6911 * Now do suspend callbacks. In the case of an IOMMU mapping 6912 * this will suspend all DMA activity to the page while it is 6913 * being relocated. Since we are well above LOCK_LEVEL and CPUs 6914 * may be captured at this point we should have acquired any needed 6915 * locks in the presuspend callback. 6916 */ 6917 ret = hat_pageprocess_precallbacks(targ, HAT_SUSPEND, NULL); 6918 if (ret != 0) { 6919 repl = targ; 6920 goto suspend_fail; 6921 } 6922 6923 /* 6924 * Raise the PIL yet again, this time to block all high-level 6925 * interrupts on this CPU. This is necessary to prevent an 6926 * interrupt routine from pinning the thread which holds the 6927 * mapping suspended and then touching the suspended page. 6928 * 6929 * Once the page is suspended we also need to be careful to 6930 * avoid calling any functions which touch any seg_kmem memory 6931 * since that memory may be backed by the very page we are 6932 * relocating in here! 6933 */ 6934 hat_pagesuspend(targ); 6935 6936 /* 6937 * Now that we are confident everybody has stopped using this page, 6938 * copy the page contents. Note we use a physical copy to prevent 6939 * locking issues and to avoid fpRAS because we can't handle it in 6940 * this context. 6941 */ 6942 for (i = 0; i < npages; i++, tpp++, rpp++) { 6943 #ifdef VAC 6944 /* 6945 * If the replacement has a different vcolor than 6946 * the one being replacd, we need to handle VAC 6947 * consistency for it just as we were setting up 6948 * a new mapping to it. 6949 */ 6950 if ((PP_GET_VCOLOR(rpp) != NO_VCOLOR) && 6951 (tpp->p_vcolor != rpp->p_vcolor) && 6952 !CacheColor_IsFlushed(cflags, PP_GET_VCOLOR(rpp))) { 6953 CacheColor_SetFlushed(cflags, PP_GET_VCOLOR(rpp)); 6954 sfmmu_cache_flushcolor(PP_GET_VCOLOR(rpp), 6955 rpp->p_pagenum); 6956 } 6957 #endif 6958 /* 6959 * Copy the contents of the page. 6960 */ 6961 ppcopy_kernel(tpp, rpp); 6962 } 6963 6964 tpp = targ; 6965 rpp = repl; 6966 for (i = 0; i < npages; i++, tpp++, rpp++) { 6967 /* 6968 * Copy attributes. VAC consistency was handled above, 6969 * if required. 
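 * (A reduced version of the flush-each-color-once bookkeeping above
 * follows, fenced off with #if 0.)
 */

/*
 * Illustrative sketch only -- hypothetical code, not part of this
 * function.  The cflags bookkeeping in the copy loop above flushes each
 * virtual cache color at most once per relocation.  The idea in
 * isolation, with invented helpers and the assumption of at most 32
 * colors:
 */
#if 0
static void
flush_page_colors_once(const int *page_color, int npages,
    void (*flush_color)(int))
{
	unsigned int flushed = 0;	/* one bit per color */
	int i;

	for (i = 0; i < npages; i++) {
		int c = page_color[i];

		if (flushed & (1u << c))
			continue;	/* this color was already flushed */
		flushed |= (1u << c);
		flush_color(c);
	}
}
#endif

/*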
6970 */ 6971 ppattr = hat_page_getattr(tpp, (P_MOD | P_REF | P_RO)); 6972 page_clr_all_props(rpp, 0); 6973 page_set_props(rpp, ppattr); 6974 rpp->p_index = tpp->p_index; 6975 tpp->p_index = 0; 6976 #ifdef VAC 6977 rpp->p_vcolor = tpp->p_vcolor; 6978 #endif 6979 } 6980 6981 /* 6982 * First, unsuspend the page, if we set the suspend bit, and transfer 6983 * the mapping list from the target page to the replacement page. 6984 * Next process postcallbacks; since pa_hment's are linked only to the 6985 * p_mapping list of root page, we don't iterate over the constituent 6986 * pages. 6987 */ 6988 hat_pagereload(targ, repl); 6989 6990 suspend_fail: 6991 hat_pageprocess_postcallbacks(repl, HAT_UNSUSPEND); 6992 6993 /* 6994 * Now lower our PIL and release any captured CPUs since we 6995 * are out of the "danger zone". After this it will again be 6996 * safe to acquire adaptive mutex locks, or to drop them... 6997 */ 6998 if (old_pil != -1) { 6999 splx(old_pil); 7000 } else { 7001 xc_dismissed(cpuset); 7002 } 7003 7004 kpreempt_enable(); 7005 7006 sfmmu_mlist_reloc_exit(low, high); 7007 7008 /* 7009 * Postsuspend callbacks should drop any locks held across 7010 * the suspend callbacks. As before, we don't hold the mapping 7011 * list lock at this point.. our assumption is that the mapping 7012 * list still can't change due to our holding SE_EXCL lock and 7013 * there being no unlocked mappings left. Hence the restriction 7014 * on calling context to hat_delete_callback() 7015 */ 7016 hat_pageprocess_postcallbacks(repl, HAT_POSTUNSUSPEND); 7017 if (ret != 0) { 7018 /* 7019 * The second presuspend call failed: we got here through 7020 * the suspend_fail label above. 7021 */ 7022 ASSERT(ret != EIO); 7023 PAGE_RELOCATE_LOG(target, replacement, ret, cap_cpus); 7024 kreloc_thread = NULL; 7025 mutex_exit(&kpr_mutex); 7026 return (EAGAIN); 7027 } 7028 7029 /* 7030 * Now that we're out of the performance critical section we can 7031 * take care of updating the hash table, since we still 7032 * hold all the pages locked SE_EXCL at this point we 7033 * needn't worry about things changing out from under us. 7034 */ 7035 tpp = targ; 7036 rpp = repl; 7037 for (i = 0; i < npages; i++, tpp++, rpp++) { 7038 7039 /* 7040 * replace targ with replacement in page_hash table 7041 */ 7042 targ = tpp; 7043 page_relocate_hash(rpp, targ); 7044 7045 /* 7046 * concatenate target; caller of platform_page_relocate() 7047 * expects target to be concatenated after returning. 7048 */ 7049 ASSERT(targ->p_next == targ); 7050 ASSERT(targ->p_prev == targ); 7051 page_list_concat(&pl, &targ); 7052 } 7053 7054 ASSERT(*target == pl); 7055 *nrelocp = npages; 7056 PAGE_RELOCATE_LOG(target, replacement, 0, cap_cpus); 7057 kreloc_thread = NULL; 7058 mutex_exit(&kpr_mutex); 7059 return (0); 7060 } 7061 7062 /* 7063 * Called when stray pa_hments are found attached to a page which is 7064 * being freed. Notify the subsystem which attached the pa_hment of 7065 * the error if it registered a suitable handler, else panic. 7066 */ 7067 static void 7068 sfmmu_pahment_leaked(struct pa_hment *pahmep) 7069 { 7070 id_t cb_id = pahmep->cb_id; 7071 7072 ASSERT(cb_id >= (id_t)0 && cb_id < sfmmu_cb_nextid); 7073 if (sfmmu_cb_table[cb_id].errhandler != NULL) { 7074 if (sfmmu_cb_table[cb_id].errhandler(pahmep->addr, pahmep->len, 7075 HAT_CB_ERR_LEAKED, pahmep->pvt) == 0) 7076 return; /* non-fatal */ 7077 } 7078 panic("pa_hment leaked: 0x%p", (void *)pahmep); 7079 } 7080 7081 /* 7082 * Remove all mappings to page 'pp'. 
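 * (The mapping-index walk used below is sketched first.)
 */

/*
 * Illustrative sketch only -- not from the sfmmu code.  hat_pageunload()
 * below (like hat_pagesuspend(), hat_pagesync() and others in this file)
 * walks the page's mapping index: each bit left after dropping the 8K bit
 * names a larger page size that also maps the page, and the walk re-runs
 * its work on that size's group-leader page.  "do_size" and
 * "group_leader" are invented stand-ins:
 */
static void
for_each_mapped_size(void *pp, unsigned int index, int cons,
    void (*do_size)(void *, int),
    void *(*group_leader)(void *, int))
{
	do_size(pp, cons);			/* the 8K pass */
	while (index != 0) {
		index >>= 1;
		if (index != 0)
			cons++;
		if (index & 0x1) {
			pp = group_leader(pp, cons);
			do_size(pp, cons);	/* redo at the larger size */
		}
	}
}

/*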
7083 */ 7084 int 7085 hat_pageunload(struct page *pp, uint_t forceflag) 7086 { 7087 struct page *origpp = pp; 7088 struct sf_hment *sfhme, *tmphme; 7089 struct hme_blk *hmeblkp; 7090 kmutex_t *pml; 7091 #ifdef VAC 7092 kmutex_t *pmtx; 7093 #endif 7094 cpuset_t cpuset, tset; 7095 int index, cons; 7096 int xhme_blks; 7097 int pa_hments; 7098 7099 ASSERT(PAGE_EXCL(pp)); 7100 7101 retry_xhat: 7102 tmphme = NULL; 7103 xhme_blks = 0; 7104 pa_hments = 0; 7105 CPUSET_ZERO(cpuset); 7106 7107 pml = sfmmu_mlist_enter(pp); 7108 7109 #ifdef VAC 7110 if (pp->p_kpmref) 7111 sfmmu_kpm_pageunload(pp); 7112 ASSERT(!PP_ISMAPPED_KPM(pp)); 7113 #endif 7114 7115 index = PP_MAPINDEX(pp); 7116 cons = TTE8K; 7117 retry: 7118 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) { 7119 tmphme = sfhme->hme_next; 7120 7121 if (IS_PAHME(sfhme)) { 7122 ASSERT(sfhme->hme_data != NULL); 7123 pa_hments++; 7124 continue; 7125 } 7126 7127 hmeblkp = sfmmu_hmetohblk(sfhme); 7128 if (hmeblkp->hblk_xhat_bit) { 7129 struct xhat_hme_blk *xblk = 7130 (struct xhat_hme_blk *)hmeblkp; 7131 7132 (void) XHAT_PAGEUNLOAD(xblk->xhat_hme_blk_hat, 7133 pp, forceflag, XBLK2PROVBLK(xblk)); 7134 7135 xhme_blks = 1; 7136 continue; 7137 } 7138 7139 /* 7140 * If there are kernel mappings don't unload them, they will 7141 * be suspended. 7142 */ 7143 if (forceflag == SFMMU_KERNEL_RELOC && hmeblkp->hblk_lckcnt && 7144 hmeblkp->hblk_tag.htag_id == ksfmmup) 7145 continue; 7146 7147 tset = sfmmu_pageunload(pp, sfhme, cons); 7148 CPUSET_OR(cpuset, tset); 7149 } 7150 7151 while (index != 0) { 7152 index = index >> 1; 7153 if (index != 0) 7154 cons++; 7155 if (index & 0x1) { 7156 /* Go to leading page */ 7157 pp = PP_GROUPLEADER(pp, cons); 7158 ASSERT(sfmmu_mlist_held(pp)); 7159 goto retry; 7160 } 7161 } 7162 7163 /* 7164 * cpuset may be empty if the page was only mapped by segkpm, 7165 * in which case we won't actually cross-trap. 7166 */ 7167 xt_sync(cpuset); 7168 7169 /* 7170 * The page should have no mappings at this point, unless 7171 * we were called from hat_page_relocate() in which case we 7172 * leave the locked mappings which will be suspended later. 7173 */ 7174 ASSERT(!PP_ISMAPPED(origpp) || xhme_blks || pa_hments || 7175 (forceflag == SFMMU_KERNEL_RELOC)); 7176 7177 #ifdef VAC 7178 if (PP_ISTNC(pp)) { 7179 if (cons == TTE8K) { 7180 pmtx = sfmmu_page_enter(pp); 7181 PP_CLRTNC(pp); 7182 sfmmu_page_exit(pmtx); 7183 } else { 7184 conv_tnc(pp, cons); 7185 } 7186 } 7187 #endif /* VAC */ 7188 7189 if (pa_hments && forceflag != SFMMU_KERNEL_RELOC) { 7190 /* 7191 * Unlink any pa_hments and free them, calling back 7192 * the responsible subsystem to notify it of the error. 7193 * This can occur in situations such as drivers leaking 7194 * DMA handles: naughty, but common enough that we'd like 7195 * to keep the system running rather than bringing it 7196 * down with an obscure error like "pa_hment leaked" 7197 * which doesn't aid the user in debugging their driver. 7198 */ 7199 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) { 7200 tmphme = sfhme->hme_next; 7201 if (IS_PAHME(sfhme)) { 7202 struct pa_hment *pahmep = sfhme->hme_data; 7203 sfmmu_pahment_leaked(pahmep); 7204 HME_SUB(sfhme, pp); 7205 kmem_cache_free(pa_hment_cache, pahmep); 7206 } 7207 } 7208 7209 ASSERT(!PP_ISMAPPED(origpp) || xhme_blks); 7210 } 7211 7212 sfmmu_mlist_exit(pml); 7213 7214 /* 7215 * XHAT may not have finished unloading pages 7216 * because some other thread was waiting for 7217 * mlist lock and XHAT_PAGEUNLOAD let it do 7218 * the job. 
7219 */ 7220 if (xhme_blks) { 7221 pp = origpp; 7222 goto retry_xhat; 7223 } 7224 7225 return (0); 7226 } 7227 7228 cpuset_t 7229 sfmmu_pageunload(page_t *pp, struct sf_hment *sfhme, int cons) 7230 { 7231 struct hme_blk *hmeblkp; 7232 sfmmu_t *sfmmup; 7233 tte_t tte, ttemod; 7234 #ifdef DEBUG 7235 tte_t orig_old; 7236 #endif /* DEBUG */ 7237 caddr_t addr; 7238 int ttesz; 7239 int ret; 7240 cpuset_t cpuset; 7241 7242 ASSERT(pp != NULL); 7243 ASSERT(sfmmu_mlist_held(pp)); 7244 ASSERT(!PP_ISKAS(pp)); 7245 7246 CPUSET_ZERO(cpuset); 7247 7248 hmeblkp = sfmmu_hmetohblk(sfhme); 7249 7250 readtte: 7251 sfmmu_copytte(&sfhme->hme_tte, &tte); 7252 if (TTE_IS_VALID(&tte)) { 7253 sfmmup = hblktosfmmu(hmeblkp); 7254 ttesz = get_hblk_ttesz(hmeblkp); 7255 /* 7256 * Only unload mappings of 'cons' size. 7257 */ 7258 if (ttesz != cons) 7259 return (cpuset); 7260 7261 /* 7262 * Note that we have p_mapping lock, but no hash lock here. 7263 * hblk_unload() has to have both hash lock AND p_mapping 7264 * lock before it tries to modify tte. So, the tte could 7265 * not become invalid in the sfmmu_modifytte_try() below. 7266 */ 7267 ttemod = tte; 7268 #ifdef DEBUG 7269 orig_old = tte; 7270 #endif /* DEBUG */ 7271 7272 TTE_SET_INVALID(&ttemod); 7273 ret = sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte); 7274 if (ret < 0) { 7275 #ifdef DEBUG 7276 /* only R/M bits can change. */ 7277 chk_tte(&orig_old, &tte, &ttemod, hmeblkp); 7278 #endif /* DEBUG */ 7279 goto readtte; 7280 } 7281 7282 if (ret == 0) { 7283 panic("pageunload: cas failed?"); 7284 } 7285 7286 addr = tte_to_vaddr(hmeblkp, tte); 7287 7288 if (hmeblkp->hblk_shared) { 7289 sf_srd_t *srdp = (sf_srd_t *)sfmmup; 7290 uint_t rid = hmeblkp->hblk_tag.htag_rid; 7291 sf_region_t *rgnp; 7292 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 7293 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 7294 ASSERT(srdp != NULL); 7295 rgnp = srdp->srd_hmergnp[rid]; 7296 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid); 7297 cpuset = sfmmu_rgntlb_demap(addr, rgnp, hmeblkp, 1); 7298 sfmmu_ttesync(NULL, addr, &tte, pp); 7299 ASSERT(rgnp->rgn_ttecnt[ttesz] > 0); 7300 atomic_add_long(&rgnp->rgn_ttecnt[ttesz], -1); 7301 } else { 7302 sfmmu_ttesync(sfmmup, addr, &tte, pp); 7303 atomic_add_long(&sfmmup->sfmmu_ttecnt[ttesz], -1); 7304 7305 /* 7306 * We need to flush the page from the virtual cache 7307 * in order to prevent a virtual cache alias 7308 * inconsistency. The particular scenario we need 7309 * to worry about is: 7310 * Given: va1 and va2 are two virtual address that 7311 * alias and will map the same physical address. 7312 * 1. mapping exists from va1 to pa and data has 7313 * been read into the cache. 7314 * 2. unload va1. 7315 * 3. load va2 and modify data using va2. 7316 * 4 unload va2. 7317 * 5. load va1 and reference data. Unless we flush 7318 * the data cache when we unload we will get 7319 * stale data. 7320 * This scenario is taken care of by using virtual 7321 * page coloring. 7322 */ 7323 if (sfmmup->sfmmu_ismhat) { 7324 /* 7325 * Flush TSBs, TLBs and caches 7326 * of every process 7327 * sharing this ism segment. 
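 * (A note on the page-coloring check mentioned above follows, fenced off
 * with #if 0.)
 */

/*
 * Illustrative sketch only -- hypothetical code, not part of this
 * function.  The comment above relies on virtual page coloring: a flush
 * is needed only when a new mapping would use a different cache color
 * than the one the physical page was last mapped with.  One common way to
 * derive a color is from the VA bits just above the page offset; the real
 * code uses its own macros and color bookkeeping, so the arithmetic below
 * is an assumption for illustration only.
 */
#if 0
static int
vac_color(unsigned long va, unsigned long pagesize, int ncolors)
{
	/* pagesize and ncolors assumed powers of two */
	return ((int)((va / pagesize) & (unsigned long)(ncolors - 1)));
}

static int
needs_vac_flush(unsigned long new_va, int last_color,
    unsigned long pagesize, int ncolors)
{
	return (vac_color(new_va, pagesize, ncolors) != last_color);
}
#endif

/*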
7328 */ 7329 sfmmu_hat_lock_all(); 7330 mutex_enter(&ism_mlist_lock); 7331 kpreempt_disable(); 7332 sfmmu_ismtlbcache_demap(addr, sfmmup, hmeblkp, 7333 pp->p_pagenum, CACHE_NO_FLUSH); 7334 kpreempt_enable(); 7335 mutex_exit(&ism_mlist_lock); 7336 sfmmu_hat_unlock_all(); 7337 cpuset = cpu_ready_set; 7338 } else { 7339 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0); 7340 cpuset = sfmmup->sfmmu_cpusran; 7341 } 7342 } 7343 7344 /* 7345 * Hme_sub has to run after ttesync() and a_rss update. 7346 * See hblk_unload(). 7347 */ 7348 HME_SUB(sfhme, pp); 7349 membar_stst(); 7350 7351 /* 7352 * We can not make ASSERT(hmeblkp->hblk_hmecnt <= NHMENTS) 7353 * since pteload may have done a HME_ADD() right after 7354 * we did the HME_SUB() above. Hmecnt is now maintained 7355 * by cas only. no lock guranteed its value. The only 7356 * gurantee we have is the hmecnt should not be less than 7357 * what it should be so the hblk will not be taken away. 7358 * It's also important that we decremented the hmecnt after 7359 * we are done with hmeblkp so that this hmeblk won't be 7360 * stolen. 7361 */ 7362 ASSERT(hmeblkp->hblk_hmecnt > 0); 7363 ASSERT(hmeblkp->hblk_vcnt > 0); 7364 atomic_add_16(&hmeblkp->hblk_vcnt, -1); 7365 atomic_add_16(&hmeblkp->hblk_hmecnt, -1); 7366 /* 7367 * This is bug 4063182. 7368 * XXX: fixme 7369 * ASSERT(hmeblkp->hblk_hmecnt || hmeblkp->hblk_vcnt || 7370 * !hmeblkp->hblk_lckcnt); 7371 */ 7372 } else { 7373 panic("invalid tte? pp %p &tte %p", 7374 (void *)pp, (void *)&tte); 7375 } 7376 7377 return (cpuset); 7378 } 7379 7380 /* 7381 * While relocating a kernel page, this function will move the mappings 7382 * from tpp to dpp and modify any associated data with these mappings. 7383 * It also unsuspends the suspended kernel mapping. 7384 */ 7385 static void 7386 hat_pagereload(struct page *tpp, struct page *dpp) 7387 { 7388 struct sf_hment *sfhme; 7389 tte_t tte, ttemod; 7390 int index, cons; 7391 7392 ASSERT(getpil() == PIL_MAX); 7393 ASSERT(sfmmu_mlist_held(tpp)); 7394 ASSERT(sfmmu_mlist_held(dpp)); 7395 7396 index = PP_MAPINDEX(tpp); 7397 cons = TTE8K; 7398 7399 /* Update real mappings to the page */ 7400 retry: 7401 for (sfhme = tpp->p_mapping; sfhme != NULL; sfhme = sfhme->hme_next) { 7402 if (IS_PAHME(sfhme)) 7403 continue; 7404 sfmmu_copytte(&sfhme->hme_tte, &tte); 7405 ttemod = tte; 7406 7407 /* 7408 * replace old pfn with new pfn in TTE 7409 */ 7410 PFN_TO_TTE(ttemod, dpp->p_pagenum); 7411 7412 /* 7413 * clear suspend bit 7414 */ 7415 ASSERT(TTE_IS_SUSPEND(&ttemod)); 7416 TTE_CLR_SUSPEND(&ttemod); 7417 7418 if (sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte) < 0) 7419 panic("hat_pagereload(): sfmmu_modifytte_try() failed"); 7420 7421 /* 7422 * set hme_page point to new page 7423 */ 7424 sfhme->hme_page = dpp; 7425 } 7426 7427 /* 7428 * move p_mapping list from old page to new page 7429 */ 7430 dpp->p_mapping = tpp->p_mapping; 7431 tpp->p_mapping = NULL; 7432 dpp->p_share = tpp->p_share; 7433 tpp->p_share = 0; 7434 7435 while (index != 0) { 7436 index = index >> 1; 7437 if (index != 0) 7438 cons++; 7439 if (index & 0x1) { 7440 tpp = PP_GROUPLEADER(tpp, cons); 7441 dpp = PP_GROUPLEADER(dpp, cons); 7442 goto retry; 7443 } 7444 } 7445 7446 curthread->t_flag &= ~T_DONTDTRACE; 7447 mutex_exit(&kpr_suspendlock); 7448 } 7449 7450 uint_t 7451 hat_pagesync(struct page *pp, uint_t clearflag) 7452 { 7453 struct sf_hment *sfhme, *tmphme = NULL; 7454 struct hme_blk *hmeblkp; 7455 kmutex_t *pml; 7456 cpuset_t cpuset, tset; 7457 int index, cons; 7458 extern ulong_t po_share; 7459 page_t 
*save_pp = pp; 7460 int stop_on_sh = 0; 7461 uint_t shcnt; 7462 7463 CPUSET_ZERO(cpuset); 7464 7465 if (PP_ISRO(pp) && (clearflag & HAT_SYNC_STOPON_MOD)) { 7466 return (PP_GENERIC_ATTR(pp)); 7467 } 7468 7469 if ((clearflag & HAT_SYNC_ZERORM) == 0) { 7470 if ((clearflag & HAT_SYNC_STOPON_REF) && PP_ISREF(pp)) { 7471 return (PP_GENERIC_ATTR(pp)); 7472 } 7473 if ((clearflag & HAT_SYNC_STOPON_MOD) && PP_ISMOD(pp)) { 7474 return (PP_GENERIC_ATTR(pp)); 7475 } 7476 if (clearflag & HAT_SYNC_STOPON_SHARED) { 7477 if (pp->p_share > po_share) { 7478 hat_page_setattr(pp, P_REF); 7479 return (PP_GENERIC_ATTR(pp)); 7480 } 7481 stop_on_sh = 1; 7482 shcnt = 0; 7483 } 7484 } 7485 7486 clearflag &= ~HAT_SYNC_STOPON_SHARED; 7487 pml = sfmmu_mlist_enter(pp); 7488 index = PP_MAPINDEX(pp); 7489 cons = TTE8K; 7490 retry: 7491 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) { 7492 /* 7493 * We need to save the next hment on the list since 7494 * it is possible for pagesync to remove an invalid hment 7495 * from the list. 7496 */ 7497 tmphme = sfhme->hme_next; 7498 if (IS_PAHME(sfhme)) 7499 continue; 7500 /* 7501 * If we are looking for large mappings and this hme doesn't 7502 * reach the range we are seeking, just ignore it. 7503 */ 7504 hmeblkp = sfmmu_hmetohblk(sfhme); 7505 if (hmeblkp->hblk_xhat_bit) 7506 continue; 7507 7508 if (hme_size(sfhme) < cons) 7509 continue; 7510 7511 if (stop_on_sh) { 7512 if (hmeblkp->hblk_shared) { 7513 sf_srd_t *srdp = hblktosrd(hmeblkp); 7514 uint_t rid = hmeblkp->hblk_tag.htag_rid; 7515 sf_region_t *rgnp; 7516 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 7517 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 7518 ASSERT(srdp != NULL); 7519 rgnp = srdp->srd_hmergnp[rid]; 7520 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, 7521 rgnp, rid); 7522 shcnt += rgnp->rgn_refcnt; 7523 } else { 7524 shcnt++; 7525 } 7526 if (shcnt > po_share) { 7527 /* 7528 * tell the pager to spare the page this time 7529 * around. 7530 */ 7531 hat_page_setattr(save_pp, P_REF); 7532 index = 0; 7533 break; 7534 } 7535 } 7536 tset = sfmmu_pagesync(pp, sfhme, 7537 clearflag & ~HAT_SYNC_STOPON_RM); 7538 CPUSET_OR(cpuset, tset); 7539 7540 /* 7541 * If clearflag is HAT_SYNC_DONTZERO, break out as soon 7542 * as the "ref" or "mod" is set or share cnt exceeds po_share. 
7543 */ 7544 if ((clearflag & ~HAT_SYNC_STOPON_RM) == HAT_SYNC_DONTZERO && 7545 (((clearflag & HAT_SYNC_STOPON_MOD) && PP_ISMOD(save_pp)) || 7546 ((clearflag & HAT_SYNC_STOPON_REF) && PP_ISREF(save_pp)))) { 7547 index = 0; 7548 break; 7549 } 7550 } 7551 7552 while (index) { 7553 index = index >> 1; 7554 cons++; 7555 if (index & 0x1) { 7556 /* Go to leading page */ 7557 pp = PP_GROUPLEADER(pp, cons); 7558 goto retry; 7559 } 7560 } 7561 7562 xt_sync(cpuset); 7563 sfmmu_mlist_exit(pml); 7564 return (PP_GENERIC_ATTR(save_pp)); 7565 } 7566 7567 /* 7568 * Get all the hardware dependent attributes for a page struct 7569 */ 7570 static cpuset_t 7571 sfmmu_pagesync(struct page *pp, struct sf_hment *sfhme, 7572 uint_t clearflag) 7573 { 7574 caddr_t addr; 7575 tte_t tte, ttemod; 7576 struct hme_blk *hmeblkp; 7577 int ret; 7578 sfmmu_t *sfmmup; 7579 cpuset_t cpuset; 7580 7581 ASSERT(pp != NULL); 7582 ASSERT(sfmmu_mlist_held(pp)); 7583 ASSERT((clearflag == HAT_SYNC_DONTZERO) || 7584 (clearflag == HAT_SYNC_ZERORM)); 7585 7586 SFMMU_STAT(sf_pagesync); 7587 7588 CPUSET_ZERO(cpuset); 7589 7590 sfmmu_pagesync_retry: 7591 7592 sfmmu_copytte(&sfhme->hme_tte, &tte); 7593 if (TTE_IS_VALID(&tte)) { 7594 hmeblkp = sfmmu_hmetohblk(sfhme); 7595 sfmmup = hblktosfmmu(hmeblkp); 7596 addr = tte_to_vaddr(hmeblkp, tte); 7597 if (clearflag == HAT_SYNC_ZERORM) { 7598 ttemod = tte; 7599 TTE_CLR_RM(&ttemod); 7600 ret = sfmmu_modifytte_try(&tte, &ttemod, 7601 &sfhme->hme_tte); 7602 if (ret < 0) { 7603 /* 7604 * cas failed and the new value is not what 7605 * we want. 7606 */ 7607 goto sfmmu_pagesync_retry; 7608 } 7609 7610 if (ret > 0) { 7611 /* we win the cas */ 7612 if (hmeblkp->hblk_shared) { 7613 sf_srd_t *srdp = (sf_srd_t *)sfmmup; 7614 uint_t rid = 7615 hmeblkp->hblk_tag.htag_rid; 7616 sf_region_t *rgnp; 7617 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 7618 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 7619 ASSERT(srdp != NULL); 7620 rgnp = srdp->srd_hmergnp[rid]; 7621 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, 7622 srdp, rgnp, rid); 7623 cpuset = sfmmu_rgntlb_demap(addr, 7624 rgnp, hmeblkp, 1); 7625 } else { 7626 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 7627 0, 0); 7628 cpuset = sfmmup->sfmmu_cpusran; 7629 } 7630 } 7631 } 7632 sfmmu_ttesync(hmeblkp->hblk_shared ? NULL : sfmmup, addr, 7633 &tte, pp); 7634 } 7635 return (cpuset); 7636 } 7637 7638 /* 7639 * Remove write permission from a mappings to a page, so that 7640 * we can detect the next modification of it. This requires modifying 7641 * the TTE then invalidating (demap) any TLB entry using that TTE. 7642 * This code is similar to sfmmu_pagesync(). 7643 */ 7644 static cpuset_t 7645 sfmmu_pageclrwrt(struct page *pp, struct sf_hment *sfhme) 7646 { 7647 caddr_t addr; 7648 tte_t tte; 7649 tte_t ttemod; 7650 struct hme_blk *hmeblkp; 7651 int ret; 7652 sfmmu_t *sfmmup; 7653 cpuset_t cpuset; 7654 7655 ASSERT(pp != NULL); 7656 ASSERT(sfmmu_mlist_held(pp)); 7657 7658 CPUSET_ZERO(cpuset); 7659 SFMMU_STAT(sf_clrwrt); 7660 7661 retry: 7662 7663 sfmmu_copytte(&sfhme->hme_tte, &tte); 7664 if (TTE_IS_VALID(&tte) && TTE_IS_WRITABLE(&tte)) { 7665 hmeblkp = sfmmu_hmetohblk(sfhme); 7666 7667 /* 7668 * xhat mappings should never be to a VMODSORT page. 
7669 */ 7670 ASSERT(hmeblkp->hblk_xhat_bit == 0); 7671 7672 sfmmup = hblktosfmmu(hmeblkp); 7673 addr = tte_to_vaddr(hmeblkp, tte); 7674 7675 ttemod = tte; 7676 TTE_CLR_WRT(&ttemod); 7677 TTE_CLR_MOD(&ttemod); 7678 ret = sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte); 7679 7680 /* 7681 * if cas failed and the new value is not what 7682 * we want retry 7683 */ 7684 if (ret < 0) 7685 goto retry; 7686 7687 /* we win the cas */ 7688 if (ret > 0) { 7689 if (hmeblkp->hblk_shared) { 7690 sf_srd_t *srdp = (sf_srd_t *)sfmmup; 7691 uint_t rid = hmeblkp->hblk_tag.htag_rid; 7692 sf_region_t *rgnp; 7693 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 7694 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 7695 ASSERT(srdp != NULL); 7696 rgnp = srdp->srd_hmergnp[rid]; 7697 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, 7698 srdp, rgnp, rid); 7699 cpuset = sfmmu_rgntlb_demap(addr, 7700 rgnp, hmeblkp, 1); 7701 } else { 7702 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0); 7703 cpuset = sfmmup->sfmmu_cpusran; 7704 } 7705 } 7706 } 7707 7708 return (cpuset); 7709 } 7710 7711 /* 7712 * Walk all mappings of a page, removing write permission and clearing the 7713 * ref/mod bits. This code is similar to hat_pagesync() 7714 */ 7715 static void 7716 hat_page_clrwrt(page_t *pp) 7717 { 7718 struct sf_hment *sfhme; 7719 struct sf_hment *tmphme = NULL; 7720 kmutex_t *pml; 7721 cpuset_t cpuset; 7722 cpuset_t tset; 7723 int index; 7724 int cons; 7725 7726 CPUSET_ZERO(cpuset); 7727 7728 pml = sfmmu_mlist_enter(pp); 7729 index = PP_MAPINDEX(pp); 7730 cons = TTE8K; 7731 retry: 7732 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) { 7733 tmphme = sfhme->hme_next; 7734 7735 /* 7736 * If we are looking for large mappings and this hme doesn't 7737 * reach the range we are seeking, just ignore its. 7738 */ 7739 7740 if (hme_size(sfhme) < cons) 7741 continue; 7742 7743 tset = sfmmu_pageclrwrt(pp, sfhme); 7744 CPUSET_OR(cpuset, tset); 7745 } 7746 7747 while (index) { 7748 index = index >> 1; 7749 cons++; 7750 if (index & 0x1) { 7751 /* Go to leading page */ 7752 pp = PP_GROUPLEADER(pp, cons); 7753 goto retry; 7754 } 7755 } 7756 7757 xt_sync(cpuset); 7758 sfmmu_mlist_exit(pml); 7759 } 7760 7761 /* 7762 * Set the given REF/MOD/RO bits for the given page. 7763 * For a vnode with a sorted v_pages list, we need to change 7764 * the attributes and the v_pages list together under page_vnode_mutex. 7765 */ 7766 void 7767 hat_page_setattr(page_t *pp, uint_t flag) 7768 { 7769 vnode_t *vp = pp->p_vnode; 7770 page_t **listp; 7771 kmutex_t *pmtx; 7772 kmutex_t *vphm = NULL; 7773 int noshuffle; 7774 7775 noshuffle = flag & P_NSH; 7776 flag &= ~P_NSH; 7777 7778 ASSERT(!(flag & ~(P_MOD | P_REF | P_RO | P_EXEC))); 7779 7780 /* 7781 * nothing to do if attribute already set 7782 */ 7783 if ((pp->p_nrm & flag) == flag) 7784 return; 7785 7786 if ((flag & P_MOD) != 0 && vp != NULL && IS_VMODSORT(vp) && 7787 !noshuffle) { 7788 vphm = page_vnode_mutex(vp); 7789 mutex_enter(vphm); 7790 } 7791 7792 pmtx = sfmmu_page_enter(pp); 7793 pp->p_nrm |= flag; 7794 sfmmu_page_exit(pmtx); 7795 7796 if (vphm != NULL) { 7797 /* 7798 * Some File Systems examine v_pages for NULL w/o 7799 * grabbing the vphm mutex. Must not let it become NULL when 7800 * pp is the only page on the list. 
7801 */ 7802 if (pp->p_vpnext != pp) { 7803 page_vpsub(&vp->v_pages, pp); 7804 if (vp->v_pages != NULL) 7805 listp = &vp->v_pages->p_vpprev->p_vpnext; 7806 else 7807 listp = &vp->v_pages; 7808 page_vpadd(listp, pp); 7809 } 7810 mutex_exit(vphm); 7811 } 7812 } 7813 7814 void 7815 hat_page_clrattr(page_t *pp, uint_t flag) 7816 { 7817 vnode_t *vp = pp->p_vnode; 7818 kmutex_t *pmtx; 7819 7820 ASSERT(!(flag & ~(P_MOD | P_REF | P_RO))); 7821 7822 pmtx = sfmmu_page_enter(pp); 7823 7824 /* 7825 * Caller is expected to hold page's io lock for VMODSORT to work 7826 * correctly with pvn_vplist_dirty() and pvn_getdirty() when mod 7827 * bit is cleared. 7828 * We don't have assert to avoid tripping some existing third party 7829 * code. The dirty page is moved back to top of the v_page list 7830 * after IO is done in pvn_write_done(). 7831 */ 7832 pp->p_nrm &= ~flag; 7833 sfmmu_page_exit(pmtx); 7834 7835 if ((flag & P_MOD) != 0 && vp != NULL && IS_VMODSORT(vp)) { 7836 7837 /* 7838 * VMODSORT works by removing write permissions and getting 7839 * a fault when a page is made dirty. At this point 7840 * we need to remove write permission from all mappings 7841 * to this page. 7842 */ 7843 hat_page_clrwrt(pp); 7844 } 7845 } 7846 7847 uint_t 7848 hat_page_getattr(page_t *pp, uint_t flag) 7849 { 7850 ASSERT(!(flag & ~(P_MOD | P_REF | P_RO))); 7851 return ((uint_t)(pp->p_nrm & flag)); 7852 } 7853 7854 /* 7855 * DEBUG kernels: verify that a kernel va<->pa translation 7856 * is safe by checking the underlying page_t is in a page 7857 * relocation-safe state. 7858 */ 7859 #ifdef DEBUG 7860 void 7861 sfmmu_check_kpfn(pfn_t pfn) 7862 { 7863 page_t *pp; 7864 int index, cons; 7865 7866 if (hat_check_vtop == 0) 7867 return; 7868 7869 if (hat_kpr_enabled == 0 || kvseg.s_base == NULL || panicstr) 7870 return; 7871 7872 pp = page_numtopp_nolock(pfn); 7873 if (!pp) 7874 return; 7875 7876 if (PAGE_LOCKED(pp) || PP_ISNORELOC(pp)) 7877 return; 7878 7879 /* 7880 * Handed a large kernel page, we dig up the root page since we 7881 * know the root page might have the lock also. 7882 */ 7883 if (pp->p_szc != 0) { 7884 index = PP_MAPINDEX(pp); 7885 cons = TTE8K; 7886 again: 7887 while (index != 0) { 7888 index >>= 1; 7889 if (index != 0) 7890 cons++; 7891 if (index & 0x1) { 7892 pp = PP_GROUPLEADER(pp, cons); 7893 goto again; 7894 } 7895 } 7896 } 7897 7898 if (PAGE_LOCKED(pp) || PP_ISNORELOC(pp)) 7899 return; 7900 7901 /* 7902 * Pages need to be locked or allocated "permanent" (either from 7903 * static_arena arena or explicitly setting PG_NORELOC when calling 7904 * page_create_va()) for VA->PA translations to be valid. 7905 */ 7906 if (!PP_ISNORELOC(pp)) 7907 panic("Illegal VA->PA translation, pp 0x%p not permanent", 7908 (void *)pp); 7909 else 7910 panic("Illegal VA->PA translation, pp 0x%p not locked", 7911 (void *)pp); 7912 } 7913 #endif /* DEBUG */ 7914 7915 /* 7916 * Returns a page frame number for a given virtual address. 7917 * Returns PFN_INVALID to indicate an invalid mapping 7918 */ 7919 pfn_t 7920 hat_getpfnum(struct hat *hat, caddr_t addr) 7921 { 7922 pfn_t pfn; 7923 tte_t tte; 7924 7925 /* 7926 * We would like to 7927 * ASSERT(AS_LOCK_HELD(as, &as->a_lock)); 7928 * but we can't because the iommu driver will call this 7929 * routine at interrupt time and it can't grab the as lock 7930 * or it will deadlock: A thread could have the as lock 7931 * and be waiting for io. The io can't complete 7932 * because the interrupt thread is blocked trying to grab 7933 * the as lock. 
7934 */ 7935 7936 ASSERT(hat->sfmmu_xhat_provider == NULL); 7937 7938 if (hat == ksfmmup) { 7939 if (IS_KMEM_VA_LARGEPAGE(addr)) { 7940 ASSERT(segkmem_lpszc > 0); 7941 pfn = sfmmu_kvaszc2pfn(addr, segkmem_lpszc); 7942 if (pfn != PFN_INVALID) { 7943 sfmmu_check_kpfn(pfn); 7944 return (pfn); 7945 } 7946 } else if (segkpm && IS_KPM_ADDR(addr)) { 7947 return (sfmmu_kpm_vatopfn(addr)); 7948 } 7949 while ((pfn = sfmmu_vatopfn(addr, ksfmmup, &tte)) 7950 == PFN_SUSPENDED) { 7951 sfmmu_vatopfn_suspended(addr, ksfmmup, &tte); 7952 } 7953 sfmmu_check_kpfn(pfn); 7954 return (pfn); 7955 } else { 7956 return (sfmmu_uvatopfn(addr, hat, NULL)); 7957 } 7958 } 7959 7960 /* 7961 * hat_getkpfnum() is an obsolete DDI routine, and its use is discouraged. 7962 * Use hat_getpfnum(kas.a_hat, ...) instead. 7963 * 7964 * We'd like to return PFN_INVALID if the mappings have underlying page_t's 7965 * but can't right now due to the fact that some software has grown to use 7966 * this interface incorrectly. So for now when the interface is misused, 7967 * return a warning to the user that in the future it won't work in the 7968 * way they're abusing it, and carry on (after disabling page relocation). 7969 */ 7970 pfn_t 7971 hat_getkpfnum(caddr_t addr) 7972 { 7973 pfn_t pfn; 7974 tte_t tte; 7975 int badcaller = 0; 7976 extern int segkmem_reloc; 7977 7978 if (segkpm && IS_KPM_ADDR(addr)) { 7979 badcaller = 1; 7980 pfn = sfmmu_kpm_vatopfn(addr); 7981 } else { 7982 while ((pfn = sfmmu_vatopfn(addr, ksfmmup, &tte)) 7983 == PFN_SUSPENDED) { 7984 sfmmu_vatopfn_suspended(addr, ksfmmup, &tte); 7985 } 7986 badcaller = pf_is_memory(pfn); 7987 } 7988 7989 if (badcaller) { 7990 /* 7991 * We can't return PFN_INVALID or the caller may panic 7992 * or corrupt the system. The only alternative is to 7993 * disable page relocation at this point for all kernel 7994 * memory. This will impact any callers of page_relocate() 7995 * such as FMA or DR. 7996 * 7997 * RFE: Add junk here to spit out an ereport so the sysadmin 7998 * can be advised that he should upgrade his device driver 7999 * so that this doesn't happen. 8000 */ 8001 hat_getkpfnum_badcall(caller()); 8002 if (hat_kpr_enabled && segkmem_reloc) { 8003 hat_kpr_enabled = 0; 8004 segkmem_reloc = 0; 8005 cmn_err(CE_WARN, "Kernel Page Relocation is DISABLED"); 8006 } 8007 } 8008 return (pfn); 8009 } 8010 8011 /* 8012 * This routine will return both pfn and tte for the vaddr. 8013 */ 8014 static pfn_t 8015 sfmmu_uvatopfn(caddr_t vaddr, struct hat *sfmmup, tte_t *ttep) 8016 { 8017 struct hmehash_bucket *hmebp; 8018 hmeblk_tag hblktag; 8019 int hmeshift, hashno = 1; 8020 struct hme_blk *hmeblkp = NULL; 8021 tte_t tte; 8022 8023 struct sf_hment *sfhmep; 8024 pfn_t pfn; 8025 8026 /* support for ISM */ 8027 ism_map_t *ism_map; 8028 ism_blk_t *ism_blkp; 8029 int i; 8030 sfmmu_t *ism_hatid = NULL; 8031 sfmmu_t *locked_hatid = NULL; 8032 sfmmu_t *sv_sfmmup = sfmmup; 8033 caddr_t sv_vaddr = vaddr; 8034 sf_srd_t *srdp; 8035 8036 if (ttep == NULL) { 8037 ttep = &tte; 8038 } else { 8039 ttep->ll = 0; 8040 } 8041 8042 ASSERT(sfmmup != ksfmmup); 8043 SFMMU_STAT(sf_user_vtop); 8044 /* 8045 * Set ism_hatid if vaddr falls in a ISM segment. 
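 * In outline, that lookup amounts to the stand-alone sketch below: walk
 * a chain of fixed-size map blocks and, when the address lands inside a
 * shared segment, hand back that segment's translation context with the
 * address rebased to an offset from the segment start.  The names here
 * (map_seg_t, map_blk_t, MAP_SLOTS) are illustrative assumptions, not
 * the kernel's types.
 *
 *    #include <stddef.h>
 *    #include <stdint.h>
 *
 *    #define MAP_SLOTS    4
 *
 *    typedef struct map_seg {
 *        uintptr_t base;        // start of the shared segment
 *        size_t len;            // length of the shared segment
 *        void *shared_ctx;      // context to continue the lookup in
 *    } map_seg_t;
 *
 *    typedef struct map_blk {
 *        map_seg_t maps[MAP_SLOTS];
 *        struct map_blk *next;
 *    } map_blk_t;
 *
 *    static void *
 *    map_lookup(map_blk_t *blk, uintptr_t *vap)
 *    {
 *        for (; blk != NULL; blk = blk->next) {
 *            for (int i = 0; i < MAP_SLOTS &&
 *                blk->maps[i].shared_ctx != NULL; i++) {
 *                map_seg_t *m = &blk->maps[i];
 *                if (*vap >= m->base && *vap < m->base + m->len) {
 *                    *vap -= m->base;        // rebase into the segment
 *                    return (m->shared_ctx);
 *                }
 *            }
 *        }
 *        return (NULL);         // address is not in a shared segment
 *    }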
8046 */ 8047 ism_blkp = sfmmup->sfmmu_iblk; 8048 if (ism_blkp != NULL) { 8049 sfmmu_ismhat_enter(sfmmup, 0); 8050 locked_hatid = sfmmup; 8051 } 8052 while (ism_blkp != NULL && ism_hatid == NULL) { 8053 ism_map = ism_blkp->iblk_maps; 8054 for (i = 0; ism_map[i].imap_ismhat && i < ISM_MAP_SLOTS; i++) { 8055 if (vaddr >= ism_start(ism_map[i]) && 8056 vaddr < ism_end(ism_map[i])) { 8057 sfmmup = ism_hatid = ism_map[i].imap_ismhat; 8058 vaddr = (caddr_t)(vaddr - 8059 ism_start(ism_map[i])); 8060 break; 8061 } 8062 } 8063 ism_blkp = ism_blkp->iblk_next; 8064 } 8065 if (locked_hatid) { 8066 sfmmu_ismhat_exit(locked_hatid, 0); 8067 } 8068 8069 hblktag.htag_id = sfmmup; 8070 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 8071 do { 8072 hmeshift = HME_HASH_SHIFT(hashno); 8073 hblktag.htag_bspage = HME_HASH_BSPAGE(vaddr, hmeshift); 8074 hblktag.htag_rehash = hashno; 8075 hmebp = HME_HASH_FUNCTION(sfmmup, vaddr, hmeshift); 8076 8077 SFMMU_HASH_LOCK(hmebp); 8078 8079 HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp); 8080 if (hmeblkp != NULL) { 8081 ASSERT(!hmeblkp->hblk_shared); 8082 HBLKTOHME(sfhmep, hmeblkp, vaddr); 8083 sfmmu_copytte(&sfhmep->hme_tte, ttep); 8084 SFMMU_HASH_UNLOCK(hmebp); 8085 if (TTE_IS_VALID(ttep)) { 8086 pfn = TTE_TO_PFN(vaddr, ttep); 8087 return (pfn); 8088 } 8089 break; 8090 } 8091 SFMMU_HASH_UNLOCK(hmebp); 8092 hashno++; 8093 } while (HME_REHASH(sfmmup) && (hashno <= mmu_hashcnt)); 8094 8095 if (SF_HMERGNMAP_ISNULL(sv_sfmmup)) { 8096 return (PFN_INVALID); 8097 } 8098 srdp = sv_sfmmup->sfmmu_srdp; 8099 ASSERT(srdp != NULL); 8100 ASSERT(srdp->srd_refcnt != 0); 8101 hblktag.htag_id = srdp; 8102 hashno = 1; 8103 do { 8104 hmeshift = HME_HASH_SHIFT(hashno); 8105 hblktag.htag_bspage = HME_HASH_BSPAGE(sv_vaddr, hmeshift); 8106 hblktag.htag_rehash = hashno; 8107 hmebp = HME_HASH_FUNCTION(srdp, sv_vaddr, hmeshift); 8108 8109 SFMMU_HASH_LOCK(hmebp); 8110 for (hmeblkp = hmebp->hmeblkp; hmeblkp != NULL; 8111 hmeblkp = hmeblkp->hblk_next) { 8112 uint_t rid; 8113 sf_region_t *rgnp; 8114 caddr_t rsaddr; 8115 caddr_t readdr; 8116 8117 if (!HTAGS_EQ_SHME(hmeblkp->hblk_tag, hblktag, 8118 sv_sfmmup->sfmmu_hmeregion_map)) { 8119 continue; 8120 } 8121 ASSERT(hmeblkp->hblk_shared); 8122 rid = hmeblkp->hblk_tag.htag_rid; 8123 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 8124 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 8125 rgnp = srdp->srd_hmergnp[rid]; 8126 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid); 8127 HBLKTOHME(sfhmep, hmeblkp, sv_vaddr); 8128 sfmmu_copytte(&sfhmep->hme_tte, ttep); 8129 rsaddr = rgnp->rgn_saddr; 8130 readdr = rsaddr + rgnp->rgn_size; 8131 #ifdef DEBUG 8132 if (TTE_IS_VALID(ttep) || 8133 get_hblk_ttesz(hmeblkp) > TTE8K) { 8134 caddr_t eva = tte_to_evaddr(hmeblkp, ttep); 8135 ASSERT(eva > sv_vaddr); 8136 ASSERT(sv_vaddr >= rsaddr); 8137 ASSERT(sv_vaddr < readdr); 8138 ASSERT(eva <= readdr); 8139 } 8140 #endif /* DEBUG */ 8141 /* 8142 * Continue the search if we 8143 * found an invalid 8K tte outside of the area 8144 * covered by this hmeblk's region. 
8145 */ 8146 if (TTE_IS_VALID(ttep)) { 8147 SFMMU_HASH_UNLOCK(hmebp); 8148 pfn = TTE_TO_PFN(sv_vaddr, ttep); 8149 return (pfn); 8150 } else if (get_hblk_ttesz(hmeblkp) > TTE8K || 8151 (sv_vaddr >= rsaddr && sv_vaddr < readdr)) { 8152 SFMMU_HASH_UNLOCK(hmebp); 8153 pfn = PFN_INVALID; 8154 return (pfn); 8155 } 8156 } 8157 SFMMU_HASH_UNLOCK(hmebp); 8158 hashno++; 8159 } while (hashno <= mmu_hashcnt); 8160 return (PFN_INVALID); 8161 } 8162 8163 8164 /* 8165 * For compatability with AT&T and later optimizations 8166 */ 8167 /* ARGSUSED */ 8168 void 8169 hat_map(struct hat *hat, caddr_t addr, size_t len, uint_t flags) 8170 { 8171 ASSERT(hat != NULL); 8172 ASSERT(hat->sfmmu_xhat_provider == NULL); 8173 } 8174 8175 /* 8176 * Return the number of mappings to a particular page. This number is an 8177 * approximation of the number of people sharing the page. 8178 * 8179 * shared hmeblks or ism hmeblks are counted as 1 mapping here. 8180 * hat_page_checkshare() can be used to compare threshold to share 8181 * count that reflects the number of region sharers albeit at higher cost. 8182 */ 8183 ulong_t 8184 hat_page_getshare(page_t *pp) 8185 { 8186 page_t *spp = pp; /* start page */ 8187 kmutex_t *pml; 8188 ulong_t cnt; 8189 int index, sz = TTE64K; 8190 8191 /* 8192 * We need to grab the mlist lock to make sure any outstanding 8193 * load/unloads complete. Otherwise we could return zero 8194 * even though the unload(s) hasn't finished yet. 8195 */ 8196 pml = sfmmu_mlist_enter(spp); 8197 cnt = spp->p_share; 8198 8199 #ifdef VAC 8200 if (kpm_enable) 8201 cnt += spp->p_kpmref; 8202 #endif 8203 8204 /* 8205 * If we have any large mappings, we count the number of 8206 * mappings that this large page is part of. 8207 */ 8208 index = PP_MAPINDEX(spp); 8209 index >>= 1; 8210 while (index) { 8211 pp = PP_GROUPLEADER(spp, sz); 8212 if ((index & 0x1) && pp != spp) { 8213 cnt += pp->p_share; 8214 spp = pp; 8215 } 8216 index >>= 1; 8217 sz++; 8218 } 8219 sfmmu_mlist_exit(pml); 8220 return (cnt); 8221 } 8222 8223 /* 8224 * Return 1 if the number of mappings exceeds sh_thresh. Return 0 8225 * otherwise. Count shared hmeblks by region's refcnt. 
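 *
 * The mapping-index walk both routines perform can be pictured with a
 * small stand-alone sketch: after discarding bit 0, each remaining set
 * bit selects the next larger page size, and the share count of that
 * size's group leader is folded in, with an early exit once the
 * threshold is crossed.  The group_share() callback below is an assumed
 * stand-in for the PP_GROUPLEADER()/p_share lookup.
 *
 *    static int
 *    share_exceeds(unsigned int index, unsigned long cnt,
 *        unsigned long thresh, unsigned long (*group_share)(int szc))
 *    {
 *        int szc = 0;                   // smallest page size
 *
 *        if (cnt > thresh)
 *            return (1);
 *        index >>= 1;                   // bit 0 is the base size itself
 *        while (index != 0) {
 *            szc++;
 *            if (index & 0x1) {
 *                cnt += group_share(szc);
 *                if (cnt > thresh)
 *                    return (1);        // early exit
 *            }
 *            index >>= 1;
 *        }
 *        return (0);
 *    }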
8226 */ 8227 int 8228 hat_page_checkshare(page_t *pp, ulong_t sh_thresh) 8229 { 8230 kmutex_t *pml; 8231 ulong_t cnt = 0; 8232 int index, sz = TTE8K; 8233 struct sf_hment *sfhme, *tmphme = NULL; 8234 struct hme_blk *hmeblkp; 8235 8236 pml = sfmmu_mlist_enter(pp); 8237 8238 if (kpm_enable) 8239 cnt = pp->p_kpmref; 8240 8241 if (pp->p_share + cnt > sh_thresh) { 8242 sfmmu_mlist_exit(pml); 8243 return (1); 8244 } 8245 8246 index = PP_MAPINDEX(pp); 8247 8248 again: 8249 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) { 8250 tmphme = sfhme->hme_next; 8251 if (IS_PAHME(sfhme)) { 8252 continue; 8253 } 8254 8255 hmeblkp = sfmmu_hmetohblk(sfhme); 8256 if (hmeblkp->hblk_xhat_bit) { 8257 cnt++; 8258 if (cnt > sh_thresh) { 8259 sfmmu_mlist_exit(pml); 8260 return (1); 8261 } 8262 continue; 8263 } 8264 if (hme_size(sfhme) != sz) { 8265 continue; 8266 } 8267 8268 if (hmeblkp->hblk_shared) { 8269 sf_srd_t *srdp = hblktosrd(hmeblkp); 8270 uint_t rid = hmeblkp->hblk_tag.htag_rid; 8271 sf_region_t *rgnp; 8272 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 8273 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 8274 ASSERT(srdp != NULL); 8275 rgnp = srdp->srd_hmergnp[rid]; 8276 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, 8277 rgnp, rid); 8278 cnt += rgnp->rgn_refcnt; 8279 } else { 8280 cnt++; 8281 } 8282 if (cnt > sh_thresh) { 8283 sfmmu_mlist_exit(pml); 8284 return (1); 8285 } 8286 } 8287 8288 index >>= 1; 8289 sz++; 8290 while (index) { 8291 pp = PP_GROUPLEADER(pp, sz); 8292 ASSERT(sfmmu_mlist_held(pp)); 8293 if (index & 0x1) { 8294 goto again; 8295 } 8296 index >>= 1; 8297 sz++; 8298 } 8299 sfmmu_mlist_exit(pml); 8300 return (0); 8301 } 8302 8303 /* 8304 * Unload all large mappings to the pp and reset the p_szc field of every 8305 * constituent page according to the remaining mappings. 8306 * 8307 * pp must be locked SE_EXCL. Even though no other constituent pages are 8308 * locked it's legal to unload the large mappings to the pp because all 8309 * constituent pages of large locked mappings have to be locked SE_SHARED. 8310 * This means if we have SE_EXCL lock on one of constituent pages none of the 8311 * large mappings to pp are locked. 8312 * 8313 * Decrease p_szc field starting from the last constituent page and ending 8314 * with the root page. This method is used because other threads rely on the 8315 * root's p_szc to find the lock to syncronize on. After a root page_t's p_szc 8316 * is demoted then other threads will succeed in sfmmu_mlspl_enter(). This 8317 * ensures that p_szc changes of the constituent pages appears atomic for all 8318 * threads that use sfmmu_mlspl_enter() to examine p_szc field. 8319 * 8320 * This mechanism is only used for file system pages where it's not always 8321 * possible to get SE_EXCL locks on all constituent pages to demote the size 8322 * code (as is done for anonymous or kernel large pages). 8323 * 8324 * See more comments in front of sfmmu_mlspl_enter(). 
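 *
 * The update order described above (constituent pages first, root last,
 * with a store barrier in between; the code below uses membar_producer()
 * for this) can be illustrated in isolation with C11 atomics.  This is a
 * minimal user-level sketch, not the kernel's implementation; the
 * release fence plays the part of the hardware store barrier.
 *
 *    #include <stdatomic.h>
 *    #include <stddef.h>
 *
 *    typedef struct cpage {
 *        _Atomic unsigned char szc;     // what readers key off of
 *    } cpage_t;
 *
 *    // demote a group of npgs (>= 1) constituent pages; grp[0] is root
 *    static void
 *    demote_group(cpage_t *grp, size_t npgs, unsigned char newszc)
 *    {
 *        // write the non-root constituents first, back to front
 *        for (size_t i = npgs - 1; i > 0; i--)
 *            atomic_store_explicit(&grp[i].szc, newszc,
 *                memory_order_relaxed);
 *
 *        // all constituent stores become visible before the root's
 *        atomic_thread_fence(memory_order_release);
 *
 *        atomic_store_explicit(&grp[0].szc, newszc,
 *            memory_order_relaxed);
 *    }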
8325 */ 8326 void 8327 hat_page_demote(page_t *pp) 8328 { 8329 int index; 8330 int sz; 8331 cpuset_t cpuset; 8332 int sync = 0; 8333 page_t *rootpp; 8334 struct sf_hment *sfhme; 8335 struct sf_hment *tmphme = NULL; 8336 struct hme_blk *hmeblkp; 8337 uint_t pszc; 8338 page_t *lastpp; 8339 cpuset_t tset; 8340 pgcnt_t npgs; 8341 kmutex_t *pml; 8342 kmutex_t *pmtx = NULL; 8343 8344 ASSERT(PAGE_EXCL(pp)); 8345 ASSERT(!PP_ISFREE(pp)); 8346 ASSERT(!PP_ISKAS(pp)); 8347 ASSERT(page_szc_lock_assert(pp)); 8348 pml = sfmmu_mlist_enter(pp); 8349 8350 pszc = pp->p_szc; 8351 if (pszc == 0) { 8352 goto out; 8353 } 8354 8355 index = PP_MAPINDEX(pp) >> 1; 8356 8357 if (index) { 8358 CPUSET_ZERO(cpuset); 8359 sz = TTE64K; 8360 sync = 1; 8361 } 8362 8363 while (index) { 8364 if (!(index & 0x1)) { 8365 index >>= 1; 8366 sz++; 8367 continue; 8368 } 8369 ASSERT(sz <= pszc); 8370 rootpp = PP_GROUPLEADER(pp, sz); 8371 for (sfhme = rootpp->p_mapping; sfhme; sfhme = tmphme) { 8372 tmphme = sfhme->hme_next; 8373 ASSERT(!IS_PAHME(sfhme)); 8374 hmeblkp = sfmmu_hmetohblk(sfhme); 8375 if (hme_size(sfhme) != sz) { 8376 continue; 8377 } 8378 if (hmeblkp->hblk_xhat_bit) { 8379 cmn_err(CE_PANIC, 8380 "hat_page_demote: xhat hmeblk"); 8381 } 8382 tset = sfmmu_pageunload(rootpp, sfhme, sz); 8383 CPUSET_OR(cpuset, tset); 8384 } 8385 if (index >>= 1) { 8386 sz++; 8387 } 8388 } 8389 8390 ASSERT(!PP_ISMAPPED_LARGE(pp)); 8391 8392 if (sync) { 8393 xt_sync(cpuset); 8394 #ifdef VAC 8395 if (PP_ISTNC(pp)) { 8396 conv_tnc(rootpp, sz); 8397 } 8398 #endif /* VAC */ 8399 } 8400 8401 pmtx = sfmmu_page_enter(pp); 8402 8403 ASSERT(pp->p_szc == pszc); 8404 rootpp = PP_PAGEROOT(pp); 8405 ASSERT(rootpp->p_szc == pszc); 8406 lastpp = PP_PAGENEXT_N(rootpp, TTEPAGES(pszc) - 1); 8407 8408 while (lastpp != rootpp) { 8409 sz = PP_MAPINDEX(lastpp) ? fnd_mapping_sz(lastpp) : 0; 8410 ASSERT(sz < pszc); 8411 npgs = (sz == 0) ? 1 : TTEPAGES(sz); 8412 ASSERT(P2PHASE(lastpp->p_pagenum, npgs) == npgs - 1); 8413 while (--npgs > 0) { 8414 lastpp->p_szc = (uchar_t)sz; 8415 lastpp = PP_PAGEPREV(lastpp); 8416 } 8417 if (sz) { 8418 /* 8419 * make sure before current root's pszc 8420 * is updated all updates to constituent pages pszc 8421 * fields are globally visible. 8422 */ 8423 membar_producer(); 8424 } 8425 lastpp->p_szc = sz; 8426 ASSERT(IS_P2ALIGNED(lastpp->p_pagenum, TTEPAGES(sz))); 8427 if (lastpp != rootpp) { 8428 lastpp = PP_PAGEPREV(lastpp); 8429 } 8430 } 8431 if (sz == 0) { 8432 /* the loop above doesn't cover this case */ 8433 rootpp->p_szc = 0; 8434 } 8435 out: 8436 ASSERT(pp->p_szc == 0); 8437 if (pmtx != NULL) { 8438 sfmmu_page_exit(pmtx); 8439 } 8440 sfmmu_mlist_exit(pml); 8441 } 8442 8443 /* 8444 * Refresh the HAT ismttecnt[] element for size szc. 8445 * Caller must have set ISM busy flag to prevent mapping 8446 * lists from changing while we're traversing them. 
8447 */ 8448 pgcnt_t 8449 ism_tsb_entries(sfmmu_t *sfmmup, int szc) 8450 { 8451 ism_blk_t *ism_blkp = sfmmup->sfmmu_iblk; 8452 ism_map_t *ism_map; 8453 pgcnt_t npgs = 0; 8454 pgcnt_t npgs_scd = 0; 8455 int j; 8456 sf_scd_t *scdp; 8457 uchar_t rid; 8458 8459 ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)); 8460 scdp = sfmmup->sfmmu_scdp; 8461 8462 for (; ism_blkp != NULL; ism_blkp = ism_blkp->iblk_next) { 8463 ism_map = ism_blkp->iblk_maps; 8464 for (j = 0; ism_map[j].imap_ismhat && j < ISM_MAP_SLOTS; j++) { 8465 rid = ism_map[j].imap_rid; 8466 ASSERT(rid == SFMMU_INVALID_ISMRID || 8467 rid < sfmmup->sfmmu_srdp->srd_next_ismrid); 8468 8469 if (scdp != NULL && rid != SFMMU_INVALID_ISMRID && 8470 SF_RGNMAP_TEST(scdp->scd_ismregion_map, rid)) { 8471 /* ISM is in sfmmup's SCD */ 8472 npgs_scd += 8473 ism_map[j].imap_ismhat->sfmmu_ttecnt[szc]; 8474 } else { 8475 /* ISMs is not in SCD */ 8476 npgs += 8477 ism_map[j].imap_ismhat->sfmmu_ttecnt[szc]; 8478 } 8479 } 8480 } 8481 sfmmup->sfmmu_ismttecnt[szc] = npgs; 8482 sfmmup->sfmmu_scdismttecnt[szc] = npgs_scd; 8483 return (npgs); 8484 } 8485 8486 /* 8487 * Yield the memory claim requirement for an address space. 8488 * 8489 * This is currently implemented as the number of bytes that have active 8490 * hardware translations that have page structures. Therefore, it can 8491 * underestimate the traditional resident set size, eg, if the 8492 * physical page is present and the hardware translation is missing; 8493 * and it can overestimate the rss, eg, if there are active 8494 * translations to a frame buffer with page structs. 8495 * Also, it does not take sharing into account. 8496 * 8497 * Note that we don't acquire locks here since this function is most often 8498 * called from the clock thread. 8499 */ 8500 size_t 8501 hat_get_mapped_size(struct hat *hat) 8502 { 8503 size_t assize = 0; 8504 int i; 8505 8506 if (hat == NULL) 8507 return (0); 8508 8509 ASSERT(hat->sfmmu_xhat_provider == NULL); 8510 8511 for (i = 0; i < mmu_page_sizes; i++) 8512 assize += ((pgcnt_t)hat->sfmmu_ttecnt[i] + 8513 (pgcnt_t)hat->sfmmu_scdrttecnt[i]) * TTEBYTES(i); 8514 8515 if (hat->sfmmu_iblk == NULL) 8516 return (assize); 8517 8518 for (i = 0; i < mmu_page_sizes; i++) 8519 assize += ((pgcnt_t)hat->sfmmu_ismttecnt[i] + 8520 (pgcnt_t)hat->sfmmu_scdismttecnt[i]) * TTEBYTES(i); 8521 8522 return (assize); 8523 } 8524 8525 int 8526 hat_stats_enable(struct hat *hat) 8527 { 8528 hatlock_t *hatlockp; 8529 8530 ASSERT(hat->sfmmu_xhat_provider == NULL); 8531 8532 hatlockp = sfmmu_hat_enter(hat); 8533 hat->sfmmu_rmstat++; 8534 sfmmu_hat_exit(hatlockp); 8535 return (1); 8536 } 8537 8538 void 8539 hat_stats_disable(struct hat *hat) 8540 { 8541 hatlock_t *hatlockp; 8542 8543 ASSERT(hat->sfmmu_xhat_provider == NULL); 8544 8545 hatlockp = sfmmu_hat_enter(hat); 8546 hat->sfmmu_rmstat--; 8547 sfmmu_hat_exit(hatlockp); 8548 } 8549 8550 /* 8551 * Routines for entering or removing ourselves from the 8552 * ism_hat's mapping list. This is used for both private and 8553 * SCD hats. 
8554 */ 8555 static void 8556 iment_add(struct ism_ment *iment, struct hat *ism_hat) 8557 { 8558 ASSERT(MUTEX_HELD(&ism_mlist_lock)); 8559 8560 iment->iment_prev = NULL; 8561 iment->iment_next = ism_hat->sfmmu_iment; 8562 if (ism_hat->sfmmu_iment) { 8563 ism_hat->sfmmu_iment->iment_prev = iment; 8564 } 8565 ism_hat->sfmmu_iment = iment; 8566 } 8567 8568 static void 8569 iment_sub(struct ism_ment *iment, struct hat *ism_hat) 8570 { 8571 ASSERT(MUTEX_HELD(&ism_mlist_lock)); 8572 8573 if (ism_hat->sfmmu_iment == NULL) { 8574 panic("ism map entry remove - no entries"); 8575 } 8576 8577 if (iment->iment_prev) { 8578 ASSERT(ism_hat->sfmmu_iment != iment); 8579 iment->iment_prev->iment_next = iment->iment_next; 8580 } else { 8581 ASSERT(ism_hat->sfmmu_iment == iment); 8582 ism_hat->sfmmu_iment = iment->iment_next; 8583 } 8584 8585 if (iment->iment_next) { 8586 iment->iment_next->iment_prev = iment->iment_prev; 8587 } 8588 8589 /* 8590 * zero out the entry 8591 */ 8592 iment->iment_next = NULL; 8593 iment->iment_prev = NULL; 8594 iment->iment_hat = NULL; 8595 } 8596 8597 /* 8598 * Hat_share()/unshare() return an (non-zero) error 8599 * when saddr and daddr are not properly aligned. 8600 * 8601 * The top level mapping element determines the alignment 8602 * requirement for saddr and daddr, depending on different 8603 * architectures. 8604 * 8605 * When hat_share()/unshare() are not supported, 8606 * HATOP_SHARE()/UNSHARE() return 0 8607 */ 8608 int 8609 hat_share(struct hat *sfmmup, caddr_t addr, 8610 struct hat *ism_hatid, caddr_t sptaddr, size_t len, uint_t ismszc) 8611 { 8612 ism_blk_t *ism_blkp; 8613 ism_blk_t *new_iblk; 8614 ism_map_t *ism_map; 8615 ism_ment_t *ism_ment; 8616 int i, added; 8617 hatlock_t *hatlockp; 8618 int reload_mmu = 0; 8619 uint_t ismshift = page_get_shift(ismszc); 8620 size_t ismpgsz = page_get_pagesize(ismszc); 8621 uint_t ismmask = (uint_t)ismpgsz - 1; 8622 size_t sh_size = ISM_SHIFT(ismshift, len); 8623 ushort_t ismhatflag; 8624 hat_region_cookie_t rcookie; 8625 sf_scd_t *old_scdp; 8626 8627 #ifdef DEBUG 8628 caddr_t eaddr = addr + len; 8629 #endif /* DEBUG */ 8630 8631 ASSERT(ism_hatid != NULL && sfmmup != NULL); 8632 ASSERT(sptaddr == ISMID_STARTADDR); 8633 /* 8634 * Check the alignment. 8635 */ 8636 if (!ISM_ALIGNED(ismshift, addr) || !ISM_ALIGNED(ismshift, sptaddr)) 8637 return (EINVAL); 8638 8639 /* 8640 * Check size alignment. 8641 */ 8642 if (!ISM_ALIGNED(ismshift, len)) 8643 return (EINVAL); 8644 8645 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 8646 8647 /* 8648 * Allocate ism_ment for the ism_hat's mapping list, and an 8649 * ism map blk in case we need one. We must do our 8650 * allocations before acquiring locks to prevent a deadlock 8651 * in the kmem allocator on the mapping list lock. 8652 */ 8653 new_iblk = kmem_cache_alloc(ism_blk_cache, KM_SLEEP); 8654 ism_ment = kmem_cache_alloc(ism_ment_cache, KM_SLEEP); 8655 8656 /* 8657 * Serialize ISM mappings with the ISM busy flag, and also the 8658 * trap handlers. 8659 */ 8660 sfmmu_ismhat_enter(sfmmup, 0); 8661 8662 /* 8663 * Allocate an ism map blk if necessary. 8664 */ 8665 if (sfmmup->sfmmu_iblk == NULL) { 8666 sfmmup->sfmmu_iblk = new_iblk; 8667 bzero(new_iblk, sizeof (*new_iblk)); 8668 new_iblk->iblk_nextpa = (uint64_t)-1; 8669 membar_stst(); /* make sure next ptr visible to all CPUs */ 8670 sfmmup->sfmmu_ismblkpa = va_to_pa((caddr_t)new_iblk); 8671 reload_mmu = 1; 8672 new_iblk = NULL; 8673 } 8674 8675 #ifdef DEBUG 8676 /* 8677 * Make sure mapping does not already exist. 
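 * The test below boils down to the classic half-open interval check,
 * shown here as a stand-alone sketch with illustrative names: two ranges
 * [a_start, a_end) and [b_start, b_end) intersect exactly when each one
 * starts before the other one ends.
 *
 *    #include <stdint.h>
 *
 *    static int
 *    ranges_overlap(uintptr_t a_start, uintptr_t a_end,
 *        uintptr_t b_start, uintptr_t b_end)
 *    {
 *        return (a_start < b_end && b_start < a_end);
 *    }
 *
 * For example, a new segment [addr, addr + len) conflicts with an
 * existing slot [ism_start, ism_end) iff
 * ranges_overlap(addr, addr + len, ism_start, ism_end) is non-zero.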
8678 */ 8679 ism_blkp = sfmmup->sfmmu_iblk; 8680 while (ism_blkp != NULL) { 8681 ism_map = ism_blkp->iblk_maps; 8682 for (i = 0; i < ISM_MAP_SLOTS && ism_map[i].imap_ismhat; i++) { 8683 if ((addr >= ism_start(ism_map[i]) && 8684 addr < ism_end(ism_map[i])) || 8685 eaddr > ism_start(ism_map[i]) && 8686 eaddr <= ism_end(ism_map[i])) { 8687 panic("sfmmu_share: Already mapped!"); 8688 } 8689 } 8690 ism_blkp = ism_blkp->iblk_next; 8691 } 8692 #endif /* DEBUG */ 8693 8694 ASSERT(ismszc >= TTE4M); 8695 if (ismszc == TTE4M) { 8696 ismhatflag = HAT_4M_FLAG; 8697 } else if (ismszc == TTE32M) { 8698 ismhatflag = HAT_32M_FLAG; 8699 } else if (ismszc == TTE256M) { 8700 ismhatflag = HAT_256M_FLAG; 8701 } 8702 /* 8703 * Add mapping to first available mapping slot. 8704 */ 8705 ism_blkp = sfmmup->sfmmu_iblk; 8706 added = 0; 8707 while (!added) { 8708 ism_map = ism_blkp->iblk_maps; 8709 for (i = 0; i < ISM_MAP_SLOTS; i++) { 8710 if (ism_map[i].imap_ismhat == NULL) { 8711 8712 ism_map[i].imap_ismhat = ism_hatid; 8713 ism_map[i].imap_vb_shift = (uchar_t)ismshift; 8714 ism_map[i].imap_rid = SFMMU_INVALID_ISMRID; 8715 ism_map[i].imap_hatflags = ismhatflag; 8716 ism_map[i].imap_sz_mask = ismmask; 8717 /* 8718 * imap_seg is checked in ISM_CHECK to see if 8719 * non-NULL, then other info assumed valid. 8720 */ 8721 membar_stst(); 8722 ism_map[i].imap_seg = (uintptr_t)addr | sh_size; 8723 ism_map[i].imap_ment = ism_ment; 8724 8725 /* 8726 * Now add ourselves to the ism_hat's 8727 * mapping list. 8728 */ 8729 ism_ment->iment_hat = sfmmup; 8730 ism_ment->iment_base_va = addr; 8731 ism_hatid->sfmmu_ismhat = 1; 8732 mutex_enter(&ism_mlist_lock); 8733 iment_add(ism_ment, ism_hatid); 8734 mutex_exit(&ism_mlist_lock); 8735 added = 1; 8736 break; 8737 } 8738 } 8739 if (!added && ism_blkp->iblk_next == NULL) { 8740 ism_blkp->iblk_next = new_iblk; 8741 new_iblk = NULL; 8742 bzero(ism_blkp->iblk_next, 8743 sizeof (*ism_blkp->iblk_next)); 8744 ism_blkp->iblk_next->iblk_nextpa = (uint64_t)-1; 8745 membar_stst(); 8746 ism_blkp->iblk_nextpa = 8747 va_to_pa((caddr_t)ism_blkp->iblk_next); 8748 } 8749 ism_blkp = ism_blkp->iblk_next; 8750 } 8751 8752 /* 8753 * After calling hat_join_region, sfmmup may join a new SCD or 8754 * move from the old scd to a new scd, in which case, we want to 8755 * shrink the sfmmup's private tsb size, i.e., pass shrink to 8756 * sfmmu_check_page_sizes at the end of this routine. 8757 */ 8758 old_scdp = sfmmup->sfmmu_scdp; 8759 8760 rcookie = hat_join_region(sfmmup, addr, len, (void *)ism_hatid, 0, 8761 PROT_ALL, ismszc, NULL, HAT_REGION_ISM); 8762 if (rcookie != HAT_INVALID_REGION_COOKIE) { 8763 ism_map[i].imap_rid = (uchar_t)((uint64_t)rcookie); 8764 } 8765 /* 8766 * Update our counters for this sfmmup's ism mappings. 8767 */ 8768 for (i = 0; i <= ismszc; i++) { 8769 if (!(disable_ism_large_pages & (1 << i))) 8770 (void) ism_tsb_entries(sfmmup, i); 8771 } 8772 8773 /* 8774 * For ISM and DISM we do not support 512K pages, so we only only 8775 * search the 4M and 8K/64K hashes for 4 pagesize cpus, and search the 8776 * 256M or 32M, and 4M and 8K/64K hashes for 6 pagesize cpus. 8777 * 8778 * Need to set 32M/256M ISM flags to make sure 8779 * sfmmu_check_page_sizes() enables them on Panther. 
8780 */ 8781 ASSERT((disable_ism_large_pages & (1 << TTE512K)) != 0); 8782 8783 switch (ismszc) { 8784 case TTE256M: 8785 if (!SFMMU_FLAGS_ISSET(sfmmup, HAT_256M_ISM)) { 8786 hatlockp = sfmmu_hat_enter(sfmmup); 8787 SFMMU_FLAGS_SET(sfmmup, HAT_256M_ISM); 8788 sfmmu_hat_exit(hatlockp); 8789 } 8790 break; 8791 case TTE32M: 8792 if (!SFMMU_FLAGS_ISSET(sfmmup, HAT_32M_ISM)) { 8793 hatlockp = sfmmu_hat_enter(sfmmup); 8794 SFMMU_FLAGS_SET(sfmmup, HAT_32M_ISM); 8795 sfmmu_hat_exit(hatlockp); 8796 } 8797 break; 8798 default: 8799 break; 8800 } 8801 8802 /* 8803 * If we updated the ismblkpa for this HAT we must make 8804 * sure all CPUs running this process reload their tsbmiss area. 8805 * Otherwise they will fail to load the mappings in the tsbmiss 8806 * handler and will loop calling pagefault(). 8807 */ 8808 if (reload_mmu) { 8809 hatlockp = sfmmu_hat_enter(sfmmup); 8810 sfmmu_sync_mmustate(sfmmup); 8811 sfmmu_hat_exit(hatlockp); 8812 } 8813 8814 sfmmu_ismhat_exit(sfmmup, 0); 8815 8816 /* 8817 * Free up ismblk if we didn't use it. 8818 */ 8819 if (new_iblk != NULL) 8820 kmem_cache_free(ism_blk_cache, new_iblk); 8821 8822 /* 8823 * Check TSB and TLB page sizes. 8824 */ 8825 if (sfmmup->sfmmu_scdp != NULL && old_scdp != sfmmup->sfmmu_scdp) { 8826 sfmmu_check_page_sizes(sfmmup, 0); 8827 } else { 8828 sfmmu_check_page_sizes(sfmmup, 1); 8829 } 8830 return (0); 8831 } 8832 8833 /* 8834 * hat_unshare removes exactly one ism_map from 8835 * this process's as. It expects multiple calls 8836 * to hat_unshare for multiple shm segments. 8837 */ 8838 void 8839 hat_unshare(struct hat *sfmmup, caddr_t addr, size_t len, uint_t ismszc) 8840 { 8841 ism_map_t *ism_map; 8842 ism_ment_t *free_ment = NULL; 8843 ism_blk_t *ism_blkp; 8844 struct hat *ism_hatid; 8845 int found, i; 8846 hatlock_t *hatlockp; 8847 struct tsb_info *tsbinfo; 8848 uint_t ismshift = page_get_shift(ismszc); 8849 size_t sh_size = ISM_SHIFT(ismshift, len); 8850 uchar_t ism_rid; 8851 sf_scd_t *old_scdp; 8852 8853 ASSERT(ISM_ALIGNED(ismshift, addr)); 8854 ASSERT(ISM_ALIGNED(ismshift, len)); 8855 ASSERT(sfmmup != NULL); 8856 ASSERT(sfmmup != ksfmmup); 8857 8858 if (sfmmup->sfmmu_xhat_provider) { 8859 XHAT_UNSHARE(sfmmup, addr, len); 8860 return; 8861 } else { 8862 /* 8863 * This must be a CPU HAT. If the address space has 8864 * XHATs attached, inform all XHATs that ISM segment 8865 * is going away 8866 */ 8867 ASSERT(sfmmup->sfmmu_as != NULL); 8868 if (sfmmup->sfmmu_as->a_xhat != NULL) 8869 xhat_unshare_all(sfmmup->sfmmu_as, addr, len); 8870 } 8871 8872 /* 8873 * Make sure that during the entire time ISM mappings are removed, 8874 * the trap handlers serialize behind us, and that no one else 8875 * can be mucking with ISM mappings. This also lets us get away 8876 * with not doing expensive cross calls to flush the TLB -- we 8877 * just discard the context, flush the entire TSB, and call it 8878 * a day. 8879 */ 8880 sfmmu_ismhat_enter(sfmmup, 0); 8881 8882 /* 8883 * Remove the mapping. 8884 * 8885 * We can't have any holes in the ism map. 8886 * The tsb miss code while searching the ism map will 8887 * stop on an empty map slot. So we must move 8888 * everyone past the hole up 1 if any. 8889 * 8890 * Also empty ism map blks are not freed until the 8891 * process exits. This is to prevent a MT race condition 8892 * between sfmmu_unshare() and sfmmu_tsbmiss_exception(). 
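 *
 * The "no holes" rule used by hat_unshare() below can be sketched on its
 * own: deleting slot i means copying every later slot down by one,
 * pulling the first slot of the next block into the last slot of the
 * current one, and only ever clearing the final slot.  A scanner that
 * stops at the first empty slot therefore never skips a live entry.
 * The types are the illustrative map_blk_t/map_seg_t from the earlier
 * sketch (a block of MAP_SLOTS entries chained by a next pointer), not
 * the kernel's.
 *
 *    static void
 *    map_delete_slot(map_blk_t *blk, int i)
 *    {
 *        while (blk != NULL) {
 *            while (i < MAP_SLOTS - 1) {
 *                blk->maps[i] = blk->maps[i + 1];
 *                i++;
 *            }
 *            // i == MAP_SLOTS - 1: refill from the next block, if any
 *            if (blk->next != NULL) {
 *                blk->maps[i] = blk->next->maps[0];
 *                blk = blk->next;
 *                i = 0;
 *            } else {
 *                blk->maps[i] = (map_seg_t){ 0 };    // clear last slot
 *                blk = NULL;
 *            }
 *        }
 *    }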
8893 */ 8894 found = 0; 8895 ism_blkp = sfmmup->sfmmu_iblk; 8896 while (!found && ism_blkp != NULL) { 8897 ism_map = ism_blkp->iblk_maps; 8898 for (i = 0; i < ISM_MAP_SLOTS; i++) { 8899 if (addr == ism_start(ism_map[i]) && 8900 sh_size == (size_t)(ism_size(ism_map[i]))) { 8901 found = 1; 8902 break; 8903 } 8904 } 8905 if (!found) 8906 ism_blkp = ism_blkp->iblk_next; 8907 } 8908 8909 if (found) { 8910 ism_hatid = ism_map[i].imap_ismhat; 8911 ism_rid = ism_map[i].imap_rid; 8912 ASSERT(ism_hatid != NULL); 8913 ASSERT(ism_hatid->sfmmu_ismhat == 1); 8914 8915 /* 8916 * After hat_leave_region, the sfmmup may leave SCD, 8917 * in which case, we want to grow the private tsb size when 8918 * calling sfmmu_check_page_sizes at the end of the routine. 8919 */ 8920 old_scdp = sfmmup->sfmmu_scdp; 8921 /* 8922 * Then remove ourselves from the region. 8923 */ 8924 if (ism_rid != SFMMU_INVALID_ISMRID) { 8925 hat_leave_region(sfmmup, (void *)((uint64_t)ism_rid), 8926 HAT_REGION_ISM); 8927 } 8928 8929 /* 8930 * And now guarantee that any other cpu 8931 * that tries to process an ISM miss 8932 * will go to tl=0. 8933 */ 8934 hatlockp = sfmmu_hat_enter(sfmmup); 8935 sfmmu_invalidate_ctx(sfmmup); 8936 sfmmu_hat_exit(hatlockp); 8937 8938 /* 8939 * Remove ourselves from the ism mapping list. 8940 */ 8941 mutex_enter(&ism_mlist_lock); 8942 iment_sub(ism_map[i].imap_ment, ism_hatid); 8943 mutex_exit(&ism_mlist_lock); 8944 free_ment = ism_map[i].imap_ment; 8945 8946 /* 8947 * We delete the ism map by copying 8948 * the next map over the current one. 8949 * We will take the next one in the maps 8950 * array or from the next ism_blk. 8951 */ 8952 while (ism_blkp != NULL) { 8953 ism_map = ism_blkp->iblk_maps; 8954 while (i < (ISM_MAP_SLOTS - 1)) { 8955 ism_map[i] = ism_map[i + 1]; 8956 i++; 8957 } 8958 /* i == (ISM_MAP_SLOTS - 1) */ 8959 ism_blkp = ism_blkp->iblk_next; 8960 if (ism_blkp != NULL) { 8961 ism_map[i] = ism_blkp->iblk_maps[0]; 8962 i = 0; 8963 } else { 8964 ism_map[i].imap_seg = 0; 8965 ism_map[i].imap_vb_shift = 0; 8966 ism_map[i].imap_rid = SFMMU_INVALID_ISMRID; 8967 ism_map[i].imap_hatflags = 0; 8968 ism_map[i].imap_sz_mask = 0; 8969 ism_map[i].imap_ismhat = NULL; 8970 ism_map[i].imap_ment = NULL; 8971 } 8972 } 8973 8974 /* 8975 * Now flush entire TSB for the process, since 8976 * demapping page by page can be too expensive. 8977 * We don't have to flush the TLB here anymore 8978 * since we switch to a new TLB ctx instead. 8979 * Also, there is no need to flush if the process 8980 * is exiting since the TSB will be freed later. 8981 */ 8982 if (!sfmmup->sfmmu_free) { 8983 hatlockp = sfmmu_hat_enter(sfmmup); 8984 for (tsbinfo = sfmmup->sfmmu_tsb; tsbinfo != NULL; 8985 tsbinfo = tsbinfo->tsb_next) { 8986 if (tsbinfo->tsb_flags & TSB_SWAPPED) 8987 continue; 8988 if (tsbinfo->tsb_flags & TSB_RELOC_FLAG) { 8989 tsbinfo->tsb_flags |= 8990 TSB_FLUSH_NEEDED; 8991 continue; 8992 } 8993 8994 sfmmu_inv_tsb(tsbinfo->tsb_va, 8995 TSB_BYTES(tsbinfo->tsb_szc)); 8996 } 8997 sfmmu_hat_exit(hatlockp); 8998 } 8999 } 9000 9001 /* 9002 * Update our counters for this sfmmup's ism mappings. 9003 */ 9004 for (i = 0; i <= ismszc; i++) { 9005 if (!(disable_ism_large_pages & (1 << i))) 9006 (void) ism_tsb_entries(sfmmup, i); 9007 } 9008 9009 sfmmu_ismhat_exit(sfmmup, 0); 9010 9011 /* 9012 * We must do our freeing here after dropping locks 9013 * to prevent a deadlock in the kmem allocator on the 9014 * mapping list lock. 
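 * The same discipline, allocate before taking the lock and defer the
 * free until after dropping it, looks like this as a generic user-level
 * sketch, with pthreads and malloc standing in for the kernel
 * facilities:
 *
 *    #include <pthread.h>
 *    #include <stdlib.h>
 *
 *    static pthread_mutex_t map_lock = PTHREAD_MUTEX_INITIALIZER;
 *
 *    static void
 *    add_entry(void **slotp, size_t sz)
 *    {
 *        void *n = malloc(sz);          // may block: do it before locking
 *
 *        pthread_mutex_lock(&map_lock);
 *        if (*slotp == NULL) {
 *            *slotp = n;
 *            n = NULL;
 *        }
 *        pthread_mutex_unlock(&map_lock);
 *
 *        free(n);       // unused pre-allocation, freed after unlocking
 *    }
 *
 *    static void
 *    remove_entry(void **slotp)
 *    {
 *        void *doomed;
 *
 *        pthread_mutex_lock(&map_lock);
 *        doomed = *slotp;               // only unlink under the lock
 *        *slotp = NULL;
 *        pthread_mutex_unlock(&map_lock);
 *
 *        free(doomed);                  // deferred until here
 *    }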
9015 */ 9016 if (free_ment != NULL) 9017 kmem_cache_free(ism_ment_cache, free_ment); 9018 9019 /* 9020 * Check TSB and TLB page sizes if the process isn't exiting. 9021 */ 9022 if (!sfmmup->sfmmu_free) { 9023 if (found && old_scdp != NULL && sfmmup->sfmmu_scdp == NULL) { 9024 sfmmu_check_page_sizes(sfmmup, 1); 9025 } else { 9026 sfmmu_check_page_sizes(sfmmup, 0); 9027 } 9028 } 9029 } 9030 9031 /* ARGSUSED */ 9032 static int 9033 sfmmu_idcache_constructor(void *buf, void *cdrarg, int kmflags) 9034 { 9035 /* void *buf is sfmmu_t pointer */ 9036 bzero(buf, sizeof (sfmmu_t)); 9037 9038 return (0); 9039 } 9040 9041 /* ARGSUSED */ 9042 static void 9043 sfmmu_idcache_destructor(void *buf, void *cdrarg) 9044 { 9045 /* void *buf is sfmmu_t pointer */ 9046 } 9047 9048 /* 9049 * setup kmem hmeblks by bzeroing all members and initializing the nextpa 9050 * field to be the pa of this hmeblk 9051 */ 9052 /* ARGSUSED */ 9053 static int 9054 sfmmu_hblkcache_constructor(void *buf, void *cdrarg, int kmflags) 9055 { 9056 struct hme_blk *hmeblkp; 9057 9058 bzero(buf, (size_t)cdrarg); 9059 hmeblkp = (struct hme_blk *)buf; 9060 hmeblkp->hblk_nextpa = va_to_pa((caddr_t)hmeblkp); 9061 9062 #ifdef HBLK_TRACE 9063 mutex_init(&hmeblkp->hblk_audit_lock, NULL, MUTEX_DEFAULT, NULL); 9064 #endif /* HBLK_TRACE */ 9065 9066 return (0); 9067 } 9068 9069 /* ARGSUSED */ 9070 static void 9071 sfmmu_hblkcache_destructor(void *buf, void *cdrarg) 9072 { 9073 9074 #ifdef HBLK_TRACE 9075 9076 struct hme_blk *hmeblkp; 9077 9078 hmeblkp = (struct hme_blk *)buf; 9079 mutex_destroy(&hmeblkp->hblk_audit_lock); 9080 9081 #endif /* HBLK_TRACE */ 9082 } 9083 9084 #define SFMMU_CACHE_RECLAIM_SCAN_RATIO 8 9085 static int sfmmu_cache_reclaim_scan_ratio = SFMMU_CACHE_RECLAIM_SCAN_RATIO; 9086 /* 9087 * The kmem allocator will callback into our reclaim routine when the system 9088 * is running low in memory. We traverse the hash and free up all unused but 9089 * still cached hme_blks. We also traverse the free list and free them up 9090 * as well. 
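 *
 * The scan policy is easiest to see in a reduced sketch: keep a
 * persistent "hand" into the bucket array, advance it by 1/ratio of the
 * table on every callback, and only trylock each bucket so the reclaim
 * never stalls behind regular lookups.  The bucket/entry layout and
 * names below are assumptions made for illustration; the code that
 * follows additionally batches the freed blocks onto a list and purges
 * them once at the end.
 *
 *    #include <pthread.h>
 *    #include <stdlib.h>
 *
 *    #define NBUCKETS    1024
 *    #define SCAN_RATIO  8
 *
 *    typedef struct entry {
 *        struct entry *next;
 *        int refcnt;
 *    } entry_t;
 *
 *    typedef struct bucket {
 *        pthread_mutex_t lock;          // assumed initialized elsewhere
 *        entry_t *head;
 *    } bucket_t;
 *
 *    static bucket_t table[NBUCKETS];
 *    static int hand;                   // persists across callbacks
 *
 *    static void
 *    cache_reclaim(void)
 *    {
 *        for (int n = NBUCKETS / SCAN_RATIO; n > 0; n--) {
 *            bucket_t *b = &table[hand];
 *            hand = (hand + 1) % NBUCKETS;
 *
 *            if (pthread_mutex_trylock(&b->lock) != 0)
 *                continue;              // bucket busy: skip, don't wait
 *            for (entry_t **pp = &b->head; *pp != NULL; ) {
 *                entry_t *e = *pp;
 *                if (e->refcnt == 0) {
 *                    *pp = e->next;     // unlink unused entry
 *                    free(e);
 *                } else {
 *                    pp = &e->next;
 *                }
 *            }
 *            pthread_mutex_unlock(&b->lock);
 *        }
 *    }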
9091 */ 9092 /*ARGSUSED*/ 9093 static void 9094 sfmmu_hblkcache_reclaim(void *cdrarg) 9095 { 9096 int i; 9097 uint64_t hblkpa, prevpa, nx_pa; 9098 struct hmehash_bucket *hmebp; 9099 struct hme_blk *hmeblkp, *nx_hblk, *pr_hblk = NULL; 9100 static struct hmehash_bucket *uhmehash_reclaim_hand; 9101 static struct hmehash_bucket *khmehash_reclaim_hand; 9102 struct hme_blk *list = NULL; 9103 9104 hmebp = uhmehash_reclaim_hand; 9105 if (hmebp == NULL || hmebp > &uhme_hash[UHMEHASH_SZ]) 9106 uhmehash_reclaim_hand = hmebp = uhme_hash; 9107 uhmehash_reclaim_hand += UHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio; 9108 9109 for (i = UHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio; i; i--) { 9110 if (SFMMU_HASH_LOCK_TRYENTER(hmebp) != 0) { 9111 hmeblkp = hmebp->hmeblkp; 9112 hblkpa = hmebp->hmeh_nextpa; 9113 prevpa = 0; 9114 pr_hblk = NULL; 9115 while (hmeblkp) { 9116 nx_hblk = hmeblkp->hblk_next; 9117 nx_pa = hmeblkp->hblk_nextpa; 9118 if (!hmeblkp->hblk_vcnt && 9119 !hmeblkp->hblk_hmecnt) { 9120 sfmmu_hblk_hash_rm(hmebp, hmeblkp, 9121 prevpa, pr_hblk); 9122 sfmmu_hblk_free(hmebp, hmeblkp, 9123 hblkpa, &list); 9124 } else { 9125 pr_hblk = hmeblkp; 9126 prevpa = hblkpa; 9127 } 9128 hmeblkp = nx_hblk; 9129 hblkpa = nx_pa; 9130 } 9131 SFMMU_HASH_UNLOCK(hmebp); 9132 } 9133 if (hmebp++ == &uhme_hash[UHMEHASH_SZ]) 9134 hmebp = uhme_hash; 9135 } 9136 9137 hmebp = khmehash_reclaim_hand; 9138 if (hmebp == NULL || hmebp > &khme_hash[KHMEHASH_SZ]) 9139 khmehash_reclaim_hand = hmebp = khme_hash; 9140 khmehash_reclaim_hand += KHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio; 9141 9142 for (i = KHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio; i; i--) { 9143 if (SFMMU_HASH_LOCK_TRYENTER(hmebp) != 0) { 9144 hmeblkp = hmebp->hmeblkp; 9145 hblkpa = hmebp->hmeh_nextpa; 9146 prevpa = 0; 9147 pr_hblk = NULL; 9148 while (hmeblkp) { 9149 nx_hblk = hmeblkp->hblk_next; 9150 nx_pa = hmeblkp->hblk_nextpa; 9151 if (!hmeblkp->hblk_vcnt && 9152 !hmeblkp->hblk_hmecnt) { 9153 sfmmu_hblk_hash_rm(hmebp, hmeblkp, 9154 prevpa, pr_hblk); 9155 sfmmu_hblk_free(hmebp, hmeblkp, 9156 hblkpa, &list); 9157 } else { 9158 pr_hblk = hmeblkp; 9159 prevpa = hblkpa; 9160 } 9161 hmeblkp = nx_hblk; 9162 hblkpa = nx_pa; 9163 } 9164 SFMMU_HASH_UNLOCK(hmebp); 9165 } 9166 if (hmebp++ == &khme_hash[KHMEHASH_SZ]) 9167 hmebp = khme_hash; 9168 } 9169 sfmmu_hblks_list_purge(&list); 9170 } 9171 9172 /* 9173 * sfmmu_get_ppvcolor should become a vm_machdep or hatop interface. 9174 * same goes for sfmmu_get_addrvcolor(). 9175 * 9176 * This function will return the virtual color for the specified page. The 9177 * virtual color corresponds to this page current mapping or its last mapping. 9178 * It is used by memory allocators to choose addresses with the correct 9179 * alignment so vac consistency is automatically maintained. If the page 9180 * has no color it returns -1. 9181 */ 9182 /*ARGSUSED*/ 9183 int 9184 sfmmu_get_ppvcolor(struct page *pp) 9185 { 9186 #ifdef VAC 9187 int color; 9188 9189 if (!(cache & CACHE_VAC) || PP_NEWPAGE(pp)) { 9190 return (-1); 9191 } 9192 color = PP_GET_VCOLOR(pp); 9193 ASSERT(color < mmu_btop(shm_alignment)); 9194 return (color); 9195 #else 9196 return (-1); 9197 #endif /* VAC */ 9198 } 9199 9200 /* 9201 * This function will return the desired alignment for vac consistency 9202 * (vac color) given a virtual address. If no vac is present it returns -1. 
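 * One common way to derive a virtual color, offered here only as an
 * illustrative sketch and not necessarily what addr_to_vcolor() does,
 * is the page-sized index of the address within the VAC alias span:
 *
 *    #include <stdint.h>
 *
 *    #define PAGESHIFT   13             // 8K pages, for example
 *
 *    static int
 *    example_vcolor(uintptr_t vaddr, uintptr_t vac_span)
 *    {
 *        // vac_span is the cache alias range (a power of two); the
 *        // color is which page slot the address occupies within it
 *        return ((int)((vaddr & (vac_span - 1)) >> PAGESHIFT));
 *    }
 *
 * Two addresses then share a color exactly when they are congruent
 * modulo the alias range, which is what lets allocators keep all
 * mappings of one page VAC-consistent.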
9203 */ 9204 /*ARGSUSED*/ 9205 int 9206 sfmmu_get_addrvcolor(caddr_t vaddr) 9207 { 9208 #ifdef VAC 9209 if (cache & CACHE_VAC) { 9210 return (addr_to_vcolor(vaddr)); 9211 } else { 9212 return (-1); 9213 } 9214 #else 9215 return (-1); 9216 #endif /* VAC */ 9217 } 9218 9219 #ifdef VAC 9220 /* 9221 * Check for conflicts. 9222 * A conflict exists if the new and existent mappings do not match in 9223 * their "shm_alignment fields. If conflicts exist, the existant mappings 9224 * are flushed unless one of them is locked. If one of them is locked, then 9225 * the mappings are flushed and converted to non-cacheable mappings. 9226 */ 9227 static void 9228 sfmmu_vac_conflict(struct hat *hat, caddr_t addr, page_t *pp) 9229 { 9230 struct hat *tmphat; 9231 struct sf_hment *sfhmep, *tmphme = NULL; 9232 struct hme_blk *hmeblkp; 9233 int vcolor; 9234 tte_t tte; 9235 9236 ASSERT(sfmmu_mlist_held(pp)); 9237 ASSERT(!PP_ISNC(pp)); /* page better be cacheable */ 9238 9239 vcolor = addr_to_vcolor(addr); 9240 if (PP_NEWPAGE(pp)) { 9241 PP_SET_VCOLOR(pp, vcolor); 9242 return; 9243 } 9244 9245 if (PP_GET_VCOLOR(pp) == vcolor) { 9246 return; 9247 } 9248 9249 if (!PP_ISMAPPED(pp) && !PP_ISMAPPED_KPM(pp)) { 9250 /* 9251 * Previous user of page had a different color 9252 * but since there are no current users 9253 * we just flush the cache and change the color. 9254 */ 9255 SFMMU_STAT(sf_pgcolor_conflict); 9256 sfmmu_cache_flush(pp->p_pagenum, PP_GET_VCOLOR(pp)); 9257 PP_SET_VCOLOR(pp, vcolor); 9258 return; 9259 } 9260 9261 /* 9262 * If we get here we have a vac conflict with a current 9263 * mapping. VAC conflict policy is as follows. 9264 * - The default is to unload the other mappings unless: 9265 * - If we have a large mapping we uncache the page. 9266 * We need to uncache the rest of the large page too. 9267 * - If any of the mappings are locked we uncache the page. 9268 * - If the requested mapping is inconsistent 9269 * with another mapping and that mapping 9270 * is in the same address space we have to 9271 * make it non-cached. The default thing 9272 * to do is unload the inconsistent mapping 9273 * but if they are in the same address space 9274 * we run the risk of unmapping the pc or the 9275 * stack which we will use as we return to the user, 9276 * in which case we can then fault on the thing 9277 * we just unloaded and get into an infinite loop. 9278 */ 9279 if (PP_ISMAPPED_LARGE(pp)) { 9280 int sz; 9281 9282 /* 9283 * Existing mapping is for big pages. We don't unload 9284 * existing big mappings to satisfy new mappings. 9285 * Always convert all mappings to TNC. 9286 */ 9287 sz = fnd_mapping_sz(pp); 9288 pp = PP_GROUPLEADER(pp, sz); 9289 SFMMU_STAT_ADD(sf_uncache_conflict, TTEPAGES(sz)); 9290 sfmmu_page_cache_array(pp, HAT_TMPNC, CACHE_FLUSH, 9291 TTEPAGES(sz)); 9292 9293 return; 9294 } 9295 9296 /* 9297 * check if any mapping is in same as or if it is locked 9298 * since in that case we need to uncache. 
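 *
 * The policy spelled out above reduces to a small decision function.
 * The sketch below encodes it with illustrative flag arguments rather
 * than the real hmeblk/page fields:
 *
 *    typedef enum { VAC_OK, VAC_RECOLOR, VAC_UNCACHE, VAC_UNLOAD }
 *        vac_action_t;
 *
 *    static vac_action_t
 *    vac_policy(int same_color, int unmapped, int has_large_mapping,
 *        int has_locked_or_shared, int same_address_space)
 *    {
 *        if (same_color)
 *            return (VAC_OK);           // no conflict at all
 *        if (unmapped)
 *            return (VAC_RECOLOR);      // flush and adopt the new color
 *        if (has_large_mapping || has_locked_or_shared ||
 *            same_address_space)
 *            return (VAC_UNCACHE);      // convert mappings to TNC
 *        return (VAC_UNLOAD);           // default: unload the others
 *    }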
9299 */ 9300 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = tmphme) { 9301 tmphme = sfhmep->hme_next; 9302 if (IS_PAHME(sfhmep)) 9303 continue; 9304 hmeblkp = sfmmu_hmetohblk(sfhmep); 9305 if (hmeblkp->hblk_xhat_bit) 9306 continue; 9307 tmphat = hblktosfmmu(hmeblkp); 9308 sfmmu_copytte(&sfhmep->hme_tte, &tte); 9309 ASSERT(TTE_IS_VALID(&tte)); 9310 if (hmeblkp->hblk_shared || tmphat == hat || 9311 hmeblkp->hblk_lckcnt) { 9312 /* 9313 * We have an uncache conflict 9314 */ 9315 SFMMU_STAT(sf_uncache_conflict); 9316 sfmmu_page_cache_array(pp, HAT_TMPNC, CACHE_FLUSH, 1); 9317 return; 9318 } 9319 } 9320 9321 /* 9322 * We have an unload conflict 9323 * We have already checked for LARGE mappings, therefore 9324 * the remaining mapping(s) must be TTE8K. 9325 */ 9326 SFMMU_STAT(sf_unload_conflict); 9327 9328 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = tmphme) { 9329 tmphme = sfhmep->hme_next; 9330 if (IS_PAHME(sfhmep)) 9331 continue; 9332 hmeblkp = sfmmu_hmetohblk(sfhmep); 9333 if (hmeblkp->hblk_xhat_bit) 9334 continue; 9335 ASSERT(!hmeblkp->hblk_shared); 9336 (void) sfmmu_pageunload(pp, sfhmep, TTE8K); 9337 } 9338 9339 if (PP_ISMAPPED_KPM(pp)) 9340 sfmmu_kpm_vac_unload(pp, addr); 9341 9342 /* 9343 * Unloads only do TLB flushes so we need to flush the 9344 * cache here. 9345 */ 9346 sfmmu_cache_flush(pp->p_pagenum, PP_GET_VCOLOR(pp)); 9347 PP_SET_VCOLOR(pp, vcolor); 9348 } 9349 9350 /* 9351 * Whenever a mapping is unloaded and the page is in TNC state, 9352 * we see if the page can be made cacheable again. 'pp' is 9353 * the page that we just unloaded a mapping from, the size 9354 * of mapping that was unloaded is 'ottesz'. 9355 * Remark: 9356 * The recache policy for mpss pages can leave a performance problem 9357 * under the following circumstances: 9358 * . A large page in uncached mode has just been unmapped. 9359 * . All constituent pages are TNC due to a conflicting small mapping. 9360 * . There are many other, non conflicting, small mappings around for 9361 * a lot of the constituent pages. 9362 * . We're called w/ the "old" groupleader page and the old ottesz, 9363 * but this is irrelevant, since we're no more "PP_ISMAPPED_LARGE", so 9364 * we end up w/ TTE8K or npages == 1. 9365 * . We call tst_tnc w/ the old groupleader only, and if there is no 9366 * conflict, we re-cache only this page. 9367 * . All other small mappings are not checked and will be left in TNC mode. 9368 * The problem is not very serious because: 9369 * . mpss is actually only defined for heap and stack, so the probability 9370 * is not very high that a large page mapping exists in parallel to a small 9371 * one (this is possible, but seems to be bad programming style in the 9372 * appl). 9373 * . The problem gets a little bit more serious, when those TNC pages 9374 * have to be mapped into kernel space, e.g. for networking. 9375 * . When VAC alias conflicts occur in applications, this is regarded 9376 * as an application bug. So if kstat's show them, the appl should 9377 * be changed anyway. 9378 */ 9379 void 9380 conv_tnc(page_t *pp, int ottesz) 9381 { 9382 int cursz, dosz; 9383 pgcnt_t curnpgs, dopgs; 9384 pgcnt_t pg64k; 9385 page_t *pp2; 9386 9387 /* 9388 * Determine how big a range we check for TNC and find 9389 * leader page. cursz is the size of the biggest 9390 * mapping that still exist on 'pp'. 
9391 */ 9392 if (PP_ISMAPPED_LARGE(pp)) { 9393 cursz = fnd_mapping_sz(pp); 9394 } else { 9395 cursz = TTE8K; 9396 } 9397 9398 if (ottesz >= cursz) { 9399 dosz = ottesz; 9400 pp2 = pp; 9401 } else { 9402 dosz = cursz; 9403 pp2 = PP_GROUPLEADER(pp, dosz); 9404 } 9405 9406 pg64k = TTEPAGES(TTE64K); 9407 dopgs = TTEPAGES(dosz); 9408 9409 ASSERT(dopgs == 1 || ((dopgs & (pg64k - 1)) == 0)); 9410 9411 while (dopgs != 0) { 9412 curnpgs = TTEPAGES(cursz); 9413 if (tst_tnc(pp2, curnpgs)) { 9414 SFMMU_STAT_ADD(sf_recache, curnpgs); 9415 sfmmu_page_cache_array(pp2, HAT_CACHE, CACHE_NO_FLUSH, 9416 curnpgs); 9417 } 9418 9419 ASSERT(dopgs >= curnpgs); 9420 dopgs -= curnpgs; 9421 9422 if (dopgs == 0) { 9423 break; 9424 } 9425 9426 pp2 = PP_PAGENEXT_N(pp2, curnpgs); 9427 if (((dopgs & (pg64k - 1)) == 0) && PP_ISMAPPED_LARGE(pp2)) { 9428 cursz = fnd_mapping_sz(pp2); 9429 } else { 9430 cursz = TTE8K; 9431 } 9432 } 9433 } 9434 9435 /* 9436 * Returns 1 if page(s) can be converted from TNC to cacheable setting, 9437 * returns 0 otherwise. Note that oaddr argument is valid for only 9438 * 8k pages. 9439 */ 9440 int 9441 tst_tnc(page_t *pp, pgcnt_t npages) 9442 { 9443 struct sf_hment *sfhme; 9444 struct hme_blk *hmeblkp; 9445 tte_t tte; 9446 caddr_t vaddr; 9447 int clr_valid = 0; 9448 int color, color1, bcolor; 9449 int i, ncolors; 9450 9451 ASSERT(pp != NULL); 9452 ASSERT(!(cache & CACHE_WRITEBACK)); 9453 9454 if (npages > 1) { 9455 ncolors = CACHE_NUM_COLOR; 9456 } 9457 9458 for (i = 0; i < npages; i++) { 9459 ASSERT(sfmmu_mlist_held(pp)); 9460 ASSERT(PP_ISTNC(pp)); 9461 ASSERT(PP_GET_VCOLOR(pp) == NO_VCOLOR); 9462 9463 if (PP_ISPNC(pp)) { 9464 return (0); 9465 } 9466 9467 clr_valid = 0; 9468 if (PP_ISMAPPED_KPM(pp)) { 9469 caddr_t kpmvaddr; 9470 9471 ASSERT(kpm_enable); 9472 kpmvaddr = hat_kpm_page2va(pp, 1); 9473 ASSERT(!(npages > 1 && IS_KPM_ALIAS_RANGE(kpmvaddr))); 9474 color1 = addr_to_vcolor(kpmvaddr); 9475 clr_valid = 1; 9476 } 9477 9478 for (sfhme = pp->p_mapping; sfhme; sfhme = sfhme->hme_next) { 9479 if (IS_PAHME(sfhme)) 9480 continue; 9481 hmeblkp = sfmmu_hmetohblk(sfhme); 9482 if (hmeblkp->hblk_xhat_bit) 9483 continue; 9484 9485 sfmmu_copytte(&sfhme->hme_tte, &tte); 9486 ASSERT(TTE_IS_VALID(&tte)); 9487 9488 vaddr = tte_to_vaddr(hmeblkp, tte); 9489 color = addr_to_vcolor(vaddr); 9490 9491 if (npages > 1) { 9492 /* 9493 * If there is a big mapping, make sure 9494 * 8K mapping is consistent with the big 9495 * mapping. 
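 *
 * Stated as a stand-alone check: within a large page the i-th
 * constituent is expected to map at color (i % ncolors), and every
 * small mapping of that constituent must agree with it.  The sketch
 * below runs that test over a flattened array of observed colors, one
 * group of entries per constituent page; the data layout is purely
 * illustrative.
 *
 *    static int
 *    colors_consistent(const int *colors, const int *nentries,
 *        int npages, int ncolors)
 *    {
 *        int k = 0;             // index into the flattened colors[]
 *
 *        for (int i = 0; i < npages; i++) {
 *            int expect = i % ncolors;
 *            for (int j = 0; j < nentries[i]; j++, k++) {
 *                if (colors[k] != expect)
 *                    return (0);        // cannot be made cacheable
 *            }
 *        }
 *        return (1);            // safe to convert back to cacheable
 *    }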
9496 */ 9497 bcolor = i % ncolors; 9498 if (color != bcolor) { 9499 return (0); 9500 } 9501 } 9502 if (!clr_valid) { 9503 clr_valid = 1; 9504 color1 = color; 9505 } 9506 9507 if (color1 != color) { 9508 return (0); 9509 } 9510 } 9511 9512 pp = PP_PAGENEXT(pp); 9513 } 9514 9515 return (1); 9516 } 9517 9518 void 9519 sfmmu_page_cache_array(page_t *pp, int flags, int cache_flush_flag, 9520 pgcnt_t npages) 9521 { 9522 kmutex_t *pmtx; 9523 int i, ncolors, bcolor; 9524 kpm_hlk_t *kpmp; 9525 cpuset_t cpuset; 9526 9527 ASSERT(pp != NULL); 9528 ASSERT(!(cache & CACHE_WRITEBACK)); 9529 9530 kpmp = sfmmu_kpm_kpmp_enter(pp, npages); 9531 pmtx = sfmmu_page_enter(pp); 9532 9533 /* 9534 * Fast path caching single unmapped page 9535 */ 9536 if (npages == 1 && !PP_ISMAPPED(pp) && !PP_ISMAPPED_KPM(pp) && 9537 flags == HAT_CACHE) { 9538 PP_CLRTNC(pp); 9539 PP_CLRPNC(pp); 9540 sfmmu_page_exit(pmtx); 9541 sfmmu_kpm_kpmp_exit(kpmp); 9542 return; 9543 } 9544 9545 /* 9546 * We need to capture all cpus in order to change cacheability 9547 * because we can't allow one cpu to access the same physical 9548 * page using a cacheable and a non-cachebale mapping at the same 9549 * time. Since we may end up walking the ism mapping list 9550 * have to grab it's lock now since we can't after all the 9551 * cpus have been captured. 9552 */ 9553 sfmmu_hat_lock_all(); 9554 mutex_enter(&ism_mlist_lock); 9555 kpreempt_disable(); 9556 cpuset = cpu_ready_set; 9557 xc_attention(cpuset); 9558 9559 if (npages > 1) { 9560 /* 9561 * Make sure all colors are flushed since the 9562 * sfmmu_page_cache() only flushes one color- 9563 * it does not know big pages. 9564 */ 9565 ncolors = CACHE_NUM_COLOR; 9566 if (flags & HAT_TMPNC) { 9567 for (i = 0; i < ncolors; i++) { 9568 sfmmu_cache_flushcolor(i, pp->p_pagenum); 9569 } 9570 cache_flush_flag = CACHE_NO_FLUSH; 9571 } 9572 } 9573 9574 for (i = 0; i < npages; i++) { 9575 9576 ASSERT(sfmmu_mlist_held(pp)); 9577 9578 if (!(flags == HAT_TMPNC && PP_ISTNC(pp))) { 9579 9580 if (npages > 1) { 9581 bcolor = i % ncolors; 9582 } else { 9583 bcolor = NO_VCOLOR; 9584 } 9585 9586 sfmmu_page_cache(pp, flags, cache_flush_flag, 9587 bcolor); 9588 } 9589 9590 pp = PP_PAGENEXT(pp); 9591 } 9592 9593 xt_sync(cpuset); 9594 xc_dismissed(cpuset); 9595 mutex_exit(&ism_mlist_lock); 9596 sfmmu_hat_unlock_all(); 9597 sfmmu_page_exit(pmtx); 9598 sfmmu_kpm_kpmp_exit(kpmp); 9599 kpreempt_enable(); 9600 } 9601 9602 /* 9603 * This function changes the virtual cacheability of all mappings to a 9604 * particular page. When changing from uncache to cacheable the mappings will 9605 * only be changed if all of them have the same virtual color. 9606 * We need to flush the cache in all cpus. It is possible that 9607 * a process referenced a page as cacheable but has sinced exited 9608 * and cleared the mapping list. We still to flush it but have no 9609 * state so all cpus is the only alternative. 
9610 */ 9611 static void 9612 sfmmu_page_cache(page_t *pp, int flags, int cache_flush_flag, int bcolor) 9613 { 9614 struct sf_hment *sfhme; 9615 struct hme_blk *hmeblkp; 9616 sfmmu_t *sfmmup; 9617 tte_t tte, ttemod; 9618 caddr_t vaddr; 9619 int ret, color; 9620 pfn_t pfn; 9621 9622 color = bcolor; 9623 pfn = pp->p_pagenum; 9624 9625 for (sfhme = pp->p_mapping; sfhme; sfhme = sfhme->hme_next) { 9626 9627 if (IS_PAHME(sfhme)) 9628 continue; 9629 hmeblkp = sfmmu_hmetohblk(sfhme); 9630 9631 if (hmeblkp->hblk_xhat_bit) 9632 continue; 9633 9634 sfmmu_copytte(&sfhme->hme_tte, &tte); 9635 ASSERT(TTE_IS_VALID(&tte)); 9636 vaddr = tte_to_vaddr(hmeblkp, tte); 9637 color = addr_to_vcolor(vaddr); 9638 9639 #ifdef DEBUG 9640 if ((flags & HAT_CACHE) && bcolor != NO_VCOLOR) { 9641 ASSERT(color == bcolor); 9642 } 9643 #endif 9644 9645 ASSERT(flags != HAT_TMPNC || color == PP_GET_VCOLOR(pp)); 9646 9647 ttemod = tte; 9648 if (flags & (HAT_UNCACHE | HAT_TMPNC)) { 9649 TTE_CLR_VCACHEABLE(&ttemod); 9650 } else { /* flags & HAT_CACHE */ 9651 TTE_SET_VCACHEABLE(&ttemod); 9652 } 9653 ret = sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte); 9654 if (ret < 0) { 9655 /* 9656 * Since all cpus are captured modifytte should not 9657 * fail. 9658 */ 9659 panic("sfmmu_page_cache: write to tte failed"); 9660 } 9661 9662 sfmmup = hblktosfmmu(hmeblkp); 9663 if (cache_flush_flag == CACHE_FLUSH) { 9664 /* 9665 * Flush TSBs, TLBs and caches 9666 */ 9667 if (hmeblkp->hblk_shared) { 9668 sf_srd_t *srdp = (sf_srd_t *)sfmmup; 9669 uint_t rid = hmeblkp->hblk_tag.htag_rid; 9670 sf_region_t *rgnp; 9671 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 9672 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 9673 ASSERT(srdp != NULL); 9674 rgnp = srdp->srd_hmergnp[rid]; 9675 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, 9676 srdp, rgnp, rid); 9677 (void) sfmmu_rgntlb_demap(vaddr, rgnp, 9678 hmeblkp, 0); 9679 sfmmu_cache_flush(pfn, addr_to_vcolor(vaddr)); 9680 } else if (sfmmup->sfmmu_ismhat) { 9681 if (flags & HAT_CACHE) { 9682 SFMMU_STAT(sf_ism_recache); 9683 } else { 9684 SFMMU_STAT(sf_ism_uncache); 9685 } 9686 sfmmu_ismtlbcache_demap(vaddr, sfmmup, hmeblkp, 9687 pfn, CACHE_FLUSH); 9688 } else { 9689 sfmmu_tlbcache_demap(vaddr, sfmmup, hmeblkp, 9690 pfn, 0, FLUSH_ALL_CPUS, CACHE_FLUSH, 1); 9691 } 9692 9693 /* 9694 * all cache entries belonging to this pfn are 9695 * now flushed. 9696 */ 9697 cache_flush_flag = CACHE_NO_FLUSH; 9698 } else { 9699 /* 9700 * Flush only TSBs and TLBs. 
9701 */ 9702 if (hmeblkp->hblk_shared) { 9703 sf_srd_t *srdp = (sf_srd_t *)sfmmup; 9704 uint_t rid = hmeblkp->hblk_tag.htag_rid; 9705 sf_region_t *rgnp; 9706 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 9707 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 9708 ASSERT(srdp != NULL); 9709 rgnp = srdp->srd_hmergnp[rid]; 9710 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, 9711 srdp, rgnp, rid); 9712 (void) sfmmu_rgntlb_demap(vaddr, rgnp, 9713 hmeblkp, 0); 9714 } else if (sfmmup->sfmmu_ismhat) { 9715 if (flags & HAT_CACHE) { 9716 SFMMU_STAT(sf_ism_recache); 9717 } else { 9718 SFMMU_STAT(sf_ism_uncache); 9719 } 9720 sfmmu_ismtlbcache_demap(vaddr, sfmmup, hmeblkp, 9721 pfn, CACHE_NO_FLUSH); 9722 } else { 9723 sfmmu_tlb_demap(vaddr, sfmmup, hmeblkp, 0, 1); 9724 } 9725 } 9726 } 9727 9728 if (PP_ISMAPPED_KPM(pp)) 9729 sfmmu_kpm_page_cache(pp, flags, cache_flush_flag); 9730 9731 switch (flags) { 9732 9733 default: 9734 panic("sfmmu_pagecache: unknown flags"); 9735 break; 9736 9737 case HAT_CACHE: 9738 PP_CLRTNC(pp); 9739 PP_CLRPNC(pp); 9740 PP_SET_VCOLOR(pp, color); 9741 break; 9742 9743 case HAT_TMPNC: 9744 PP_SETTNC(pp); 9745 PP_SET_VCOLOR(pp, NO_VCOLOR); 9746 break; 9747 9748 case HAT_UNCACHE: 9749 PP_SETPNC(pp); 9750 PP_CLRTNC(pp); 9751 PP_SET_VCOLOR(pp, NO_VCOLOR); 9752 break; 9753 } 9754 } 9755 #endif /* VAC */ 9756 9757 9758 /* 9759 * Wrapper routine used to return a context. 9760 * 9761 * It's the responsibility of the caller to guarantee that the 9762 * process serializes on calls here by taking the HAT lock for 9763 * the hat. 9764 * 9765 */ 9766 static void 9767 sfmmu_get_ctx(sfmmu_t *sfmmup) 9768 { 9769 mmu_ctx_t *mmu_ctxp; 9770 uint_t pstate_save; 9771 int ret; 9772 9773 ASSERT(sfmmu_hat_lock_held(sfmmup)); 9774 ASSERT(sfmmup != ksfmmup); 9775 9776 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_ALLCTX_INVALID)) { 9777 sfmmu_setup_tsbinfo(sfmmup); 9778 SFMMU_FLAGS_CLEAR(sfmmup, HAT_ALLCTX_INVALID); 9779 } 9780 9781 kpreempt_disable(); 9782 9783 mmu_ctxp = CPU_MMU_CTXP(CPU); 9784 ASSERT(mmu_ctxp); 9785 ASSERT(mmu_ctxp->mmu_idx < max_mmu_ctxdoms); 9786 ASSERT(mmu_ctxp == mmu_ctxs_tbl[mmu_ctxp->mmu_idx]); 9787 9788 /* 9789 * Do a wrap-around if cnum reaches the max # cnum supported by a MMU. 9790 */ 9791 if (mmu_ctxp->mmu_cnum == mmu_ctxp->mmu_nctxs) 9792 sfmmu_ctx_wrap_around(mmu_ctxp); 9793 9794 /* 9795 * Let the MMU set up the page sizes to use for 9796 * this context in the TLB. Don't program 2nd dtlb for ism hat. 9797 */ 9798 if ((&mmu_set_ctx_page_sizes) && (sfmmup->sfmmu_ismhat == 0)) { 9799 mmu_set_ctx_page_sizes(sfmmup); 9800 } 9801 9802 /* 9803 * sfmmu_alloc_ctx and sfmmu_load_mmustate will be performed with 9804 * interrupts disabled to prevent race condition with wrap-around 9805 * ctx invalidatation. In sun4v, ctx invalidation also involves 9806 * a HV call to set the number of TSBs to 0. If interrupts are not 9807 * disabled until after sfmmu_load_mmustate is complete TSBs may 9808 * become assigned to INVALID_CONTEXT. This is not allowed. 
9809 */ 9810 pstate_save = sfmmu_disable_intrs(); 9811 9812 if (sfmmu_alloc_ctx(sfmmup, 1, CPU, SFMMU_PRIVATE) && 9813 sfmmup->sfmmu_scdp != NULL) { 9814 sf_scd_t *scdp = sfmmup->sfmmu_scdp; 9815 sfmmu_t *scsfmmup = scdp->scd_sfmmup; 9816 ret = sfmmu_alloc_ctx(scsfmmup, 1, CPU, SFMMU_SHARED); 9817 /* debug purpose only */ 9818 ASSERT(!ret || scsfmmup->sfmmu_ctxs[CPU_MMU_IDX(CPU)].cnum 9819 != INVALID_CONTEXT); 9820 } 9821 sfmmu_load_mmustate(sfmmup); 9822 9823 sfmmu_enable_intrs(pstate_save); 9824 9825 kpreempt_enable(); 9826 } 9827 9828 /* 9829 * When all cnums are used up in a MMU, cnum will wrap around to the 9830 * next generation and start from 2. 9831 */ 9832 static void 9833 sfmmu_ctx_wrap_around(mmu_ctx_t *mmu_ctxp) 9834 { 9835 9836 /* caller must have disabled the preemption */ 9837 ASSERT(curthread->t_preempt >= 1); 9838 ASSERT(mmu_ctxp != NULL); 9839 9840 /* acquire Per-MMU (PM) spin lock */ 9841 mutex_enter(&mmu_ctxp->mmu_lock); 9842 9843 /* re-check to see if wrap-around is needed */ 9844 if (mmu_ctxp->mmu_cnum < mmu_ctxp->mmu_nctxs) 9845 goto done; 9846 9847 SFMMU_MMU_STAT(mmu_wrap_around); 9848 9849 /* update gnum */ 9850 ASSERT(mmu_ctxp->mmu_gnum != 0); 9851 mmu_ctxp->mmu_gnum++; 9852 if (mmu_ctxp->mmu_gnum == 0 || 9853 mmu_ctxp->mmu_gnum > MAX_SFMMU_GNUM_VAL) { 9854 cmn_err(CE_PANIC, "mmu_gnum of mmu_ctx 0x%p is out of bound.", 9855 (void *)mmu_ctxp); 9856 } 9857 9858 if (mmu_ctxp->mmu_ncpus > 1) { 9859 cpuset_t cpuset; 9860 9861 membar_enter(); /* make sure updated gnum visible */ 9862 9863 SFMMU_XCALL_STATS(NULL); 9864 9865 /* xcall to others on the same MMU to invalidate ctx */ 9866 cpuset = mmu_ctxp->mmu_cpuset; 9867 ASSERT(CPU_IN_SET(cpuset, CPU->cpu_id)); 9868 CPUSET_DEL(cpuset, CPU->cpu_id); 9869 CPUSET_AND(cpuset, cpu_ready_set); 9870 9871 /* 9872 * Pass in INVALID_CONTEXT as the first parameter to 9873 * sfmmu_raise_tsb_exception, which invalidates the context 9874 * of any process running on the CPUs in the MMU. 9875 */ 9876 xt_some(cpuset, sfmmu_raise_tsb_exception, 9877 INVALID_CONTEXT, INVALID_CONTEXT); 9878 xt_sync(cpuset); 9879 9880 SFMMU_MMU_STAT(mmu_tsb_raise_exception); 9881 } 9882 9883 if (sfmmu_getctx_sec() != INVALID_CONTEXT) { 9884 sfmmu_setctx_sec(INVALID_CONTEXT); 9885 sfmmu_clear_utsbinfo(); 9886 } 9887 9888 /* 9889 * No xcall is needed here. For sun4u systems all CPUs in context 9890 * domain share a single physical MMU therefore it's enough to flush 9891 * TLB on local CPU. On sun4v systems we use 1 global context 9892 * domain and flush all remote TLBs in sfmmu_raise_tsb_exception 9893 * handler. Note that vtag_flushall_uctxs() is called 9894 * for Ultra II machine, where the equivalent flushall functionality 9895 * is implemented in SW, and only user ctx TLB entries are flushed. 9896 */ 9897 if (&vtag_flushall_uctxs != NULL) { 9898 vtag_flushall_uctxs(); 9899 } else { 9900 vtag_flushall(); 9901 } 9902 9903 /* reset mmu cnum, skips cnum 0 and 1 */ 9904 mmu_ctxp->mmu_cnum = NUM_LOCKED_CTXS; 9905 9906 done: 9907 mutex_exit(&mmu_ctxp->mmu_lock); 9908 } 9909 9910 9911 /* 9912 * For multi-threaded process, set the process context to INVALID_CONTEXT 9913 * so that it faults and reloads the MMU state from TL=0. For single-threaded 9914 * process, we can just load the MMU state directly without having to 9915 * set context invalid. Caller must hold the hat lock since we don't 9916 * acquire it here. 
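 * Both this path and the wrap-around above hinge on tagging each private
 * context with the MMU's generation number: once the generation moves
 * on, every holder's cnum is stale at once and must be refetched via a
 * fault.  Reduced to a sketch with illustrative names (the real code
 * also reserves locked cnums and bounds gnum):
 *
 *    typedef struct mmu_dom {
 *        unsigned long gnum;            // bumped on every wrap-around
 *        unsigned int cnum;             // next context number to hand out
 *        unsigned int nctxs;            // contexts supported by this MMU
 *    } mmu_dom_t;
 *
 *    typedef struct proc_ctx {
 *        unsigned long gnum;            // generation this cnum belongs to
 *        unsigned int cnum;
 *    } proc_ctx_t;
 *
 *    #define NUM_LOCKED  2              // cnum 0 and 1 are reserved
 *
 *    static void
 *    ctx_alloc(mmu_dom_t *d, proc_ctx_t *p)
 *    {
 *        if (d->cnum == d->nctxs) {     // all cnums used: wrap around
 *            d->gnum++;                 // invalidates every holder at once
 *            d->cnum = NUM_LOCKED;
 *        }
 *        p->gnum = d->gnum;
 *        p->cnum = d->cnum++;
 *    }
 *
 *    static int
 *    ctx_valid(const mmu_dom_t *d, const proc_ctx_t *p)
 *    {
 *        return (p->gnum == d->gnum);   // stale generation => refault
 *    }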
9917 */ 9918 static void 9919 sfmmu_sync_mmustate(sfmmu_t *sfmmup) 9920 { 9921 uint_t cnum; 9922 uint_t pstate_save; 9923 9924 ASSERT(sfmmup != ksfmmup); 9925 ASSERT(sfmmu_hat_lock_held(sfmmup)); 9926 9927 kpreempt_disable(); 9928 9929 /* 9930 * We check whether the pass'ed-in sfmmup is the same as the 9931 * current running proc. This is to makes sure the current proc 9932 * stays single-threaded if it already is. 9933 */ 9934 if ((sfmmup == curthread->t_procp->p_as->a_hat) && 9935 (curthread->t_procp->p_lwpcnt == 1)) { 9936 /* single-thread */ 9937 cnum = sfmmup->sfmmu_ctxs[CPU_MMU_IDX(CPU)].cnum; 9938 if (cnum != INVALID_CONTEXT) { 9939 uint_t curcnum; 9940 /* 9941 * Disable interrupts to prevent race condition 9942 * with sfmmu_ctx_wrap_around ctx invalidation. 9943 * In sun4v, ctx invalidation involves setting 9944 * TSB to NULL, hence, interrupts should be disabled 9945 * untill after sfmmu_load_mmustate is completed. 9946 */ 9947 pstate_save = sfmmu_disable_intrs(); 9948 curcnum = sfmmu_getctx_sec(); 9949 if (curcnum == cnum) 9950 sfmmu_load_mmustate(sfmmup); 9951 sfmmu_enable_intrs(pstate_save); 9952 ASSERT(curcnum == cnum || curcnum == INVALID_CONTEXT); 9953 } 9954 } else { 9955 /* 9956 * multi-thread 9957 * or when sfmmup is not the same as the curproc. 9958 */ 9959 sfmmu_invalidate_ctx(sfmmup); 9960 } 9961 9962 kpreempt_enable(); 9963 } 9964 9965 9966 /* 9967 * Replace the specified TSB with a new TSB. This function gets called when 9968 * we grow, shrink or swapin a TSB. When swapping in a TSB (TSB_SWAPIN), the 9969 * TSB_FORCEALLOC flag may be used to force allocation of a minimum-sized TSB 9970 * (8K). 9971 * 9972 * Caller must hold the HAT lock, but should assume any tsb_info 9973 * pointers it has are no longer valid after calling this function. 9974 * 9975 * Return values: 9976 * TSB_ALLOCFAIL Failed to allocate a TSB, due to memory constraints 9977 * TSB_LOSTRACE HAT is busy, i.e. another thread is already doing 9978 * something to this tsbinfo/TSB 9979 * TSB_SUCCESS Operation succeeded 9980 */ 9981 static tsb_replace_rc_t 9982 sfmmu_replace_tsb(sfmmu_t *sfmmup, struct tsb_info *old_tsbinfo, uint_t szc, 9983 hatlock_t *hatlockp, uint_t flags) 9984 { 9985 struct tsb_info *new_tsbinfo = NULL; 9986 struct tsb_info *curtsb, *prevtsb; 9987 uint_t tte_sz_mask; 9988 int i; 9989 9990 ASSERT(sfmmup != ksfmmup); 9991 ASSERT(sfmmup->sfmmu_ismhat == 0); 9992 ASSERT(sfmmu_hat_lock_held(sfmmup)); 9993 ASSERT(szc <= tsb_max_growsize); 9994 9995 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_BUSY)) 9996 return (TSB_LOSTRACE); 9997 9998 /* 9999 * Find the tsb_info ahead of this one in the list, and 10000 * also make sure that the tsb_info passed in really 10001 * exists! 10002 */ 10003 for (prevtsb = NULL, curtsb = sfmmup->sfmmu_tsb; 10004 curtsb != old_tsbinfo && curtsb != NULL; 10005 prevtsb = curtsb, curtsb = curtsb->tsb_next) 10006 ; 10007 ASSERT(curtsb != NULL); 10008 10009 if (!(flags & TSB_SWAPIN) && SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) { 10010 /* 10011 * The process is swapped out, so just set the new size 10012 * code. When it swaps back in, we'll allocate a new one 10013 * of the new chosen size. 10014 */ 10015 curtsb->tsb_szc = szc; 10016 return (TSB_SUCCESS); 10017 } 10018 SFMMU_FLAGS_SET(sfmmup, HAT_BUSY); 10019 10020 tte_sz_mask = old_tsbinfo->tsb_ttesz_mask; 10021 10022 /* 10023 * All initialization is done inside of sfmmu_tsbinfo_alloc(). 10024 * If we fail to allocate a TSB, exit. 
10025 * 10026 * If tsb grows with new tsb size > 4M and old tsb size < 4M, 10027 * then try 4M slab after the initial alloc fails. 10028 * 10029 * If tsb swapin with tsb size > 4M, then try 4M after the 10030 * initial alloc fails. 10031 */ 10032 sfmmu_hat_exit(hatlockp); 10033 if (sfmmu_tsbinfo_alloc(&new_tsbinfo, szc, 10034 tte_sz_mask, flags, sfmmup) && 10035 (!(flags & (TSB_GROW | TSB_SWAPIN)) || (szc <= TSB_4M_SZCODE) || 10036 (!(flags & TSB_SWAPIN) && 10037 (old_tsbinfo->tsb_szc >= TSB_4M_SZCODE)) || 10038 sfmmu_tsbinfo_alloc(&new_tsbinfo, TSB_4M_SZCODE, 10039 tte_sz_mask, flags, sfmmup))) { 10040 (void) sfmmu_hat_enter(sfmmup); 10041 if (!(flags & TSB_SWAPIN)) 10042 SFMMU_STAT(sf_tsb_resize_failures); 10043 SFMMU_FLAGS_CLEAR(sfmmup, HAT_BUSY); 10044 return (TSB_ALLOCFAIL); 10045 } 10046 (void) sfmmu_hat_enter(sfmmup); 10047 10048 /* 10049 * Re-check to make sure somebody else didn't muck with us while we 10050 * didn't hold the HAT lock. If the process swapped out, fine, just 10051 * exit; this can happen if we try to shrink the TSB from the context 10052 * of another process (such as on an ISM unmap), though it is rare. 10053 */ 10054 if (!(flags & TSB_SWAPIN) && SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) { 10055 SFMMU_STAT(sf_tsb_resize_failures); 10056 SFMMU_FLAGS_CLEAR(sfmmup, HAT_BUSY); 10057 sfmmu_hat_exit(hatlockp); 10058 sfmmu_tsbinfo_free(new_tsbinfo); 10059 (void) sfmmu_hat_enter(sfmmup); 10060 return (TSB_LOSTRACE); 10061 } 10062 10063 #ifdef DEBUG 10064 /* Reverify that the tsb_info still exists.. for debugging only */ 10065 for (prevtsb = NULL, curtsb = sfmmup->sfmmu_tsb; 10066 curtsb != old_tsbinfo && curtsb != NULL; 10067 prevtsb = curtsb, curtsb = curtsb->tsb_next) 10068 ; 10069 ASSERT(curtsb != NULL); 10070 #endif /* DEBUG */ 10071 10072 /* 10073 * Quiesce any CPUs running this process on their next TLB miss 10074 * so they atomically see the new tsb_info. We temporarily set the 10075 * context to invalid context so new threads that come on processor 10076 * after we do the xcall to cpusran will also serialize behind the 10077 * HAT lock on TLB miss and will see the new TSB. Since this short 10078 * race with a new thread coming on processor is relatively rare, 10079 * this synchronization mechanism should be cheaper than always 10080 * pausing all CPUs for the duration of the setup, which is what 10081 * the old implementation did. This is particuarly true if we are 10082 * copying a huge chunk of memory around during that window. 10083 * 10084 * The memory barriers are to make sure things stay consistent 10085 * with resume() since it does not hold the HAT lock while 10086 * walking the list of tsb_info structures. 10087 */ 10088 if ((flags & TSB_SWAPIN) != TSB_SWAPIN) { 10089 /* The TSB is either growing or shrinking. */ 10090 sfmmu_invalidate_ctx(sfmmup); 10091 } else { 10092 /* 10093 * It is illegal to swap in TSBs from a process other 10094 * than a process being swapped in. This in turn 10095 * implies we do not have a valid MMU context here 10096 * since a process needs one to resolve translation 10097 * misses. 
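 * (The DEBUG check further below verifies this by asserting that the
 * process has INVALID_CONTEXT on every MMU context domain before the
 * new tsb_info is linked in.)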
10098 */ 10099 ASSERT(curthread->t_procp->p_as->a_hat == sfmmup); 10100 } 10101 10102 #ifdef DEBUG 10103 ASSERT(max_mmu_ctxdoms > 0); 10104 10105 /* 10106 * Process should have INVALID_CONTEXT on all MMUs 10107 */ 10108 for (i = 0; i < max_mmu_ctxdoms; i++) { 10109 10110 ASSERT(sfmmup->sfmmu_ctxs[i].cnum == INVALID_CONTEXT); 10111 } 10112 #endif 10113 10114 new_tsbinfo->tsb_next = old_tsbinfo->tsb_next; 10115 membar_stst(); /* strict ordering required */ 10116 if (prevtsb) 10117 prevtsb->tsb_next = new_tsbinfo; 10118 else 10119 sfmmup->sfmmu_tsb = new_tsbinfo; 10120 membar_enter(); /* make sure new TSB globally visible */ 10121 10122 /* 10123 * We need to migrate TSB entries from the old TSB to the new TSB 10124 * if tsb_remap_ttes is set and the TSB is growing. 10125 */ 10126 if (tsb_remap_ttes && ((flags & TSB_GROW) == TSB_GROW)) 10127 sfmmu_copy_tsb(old_tsbinfo, new_tsbinfo); 10128 10129 SFMMU_FLAGS_CLEAR(sfmmup, HAT_BUSY); 10130 10131 /* 10132 * Drop the HAT lock to free our old tsb_info. 10133 */ 10134 sfmmu_hat_exit(hatlockp); 10135 10136 if ((flags & TSB_GROW) == TSB_GROW) { 10137 SFMMU_STAT(sf_tsb_grow); 10138 } else if ((flags & TSB_SHRINK) == TSB_SHRINK) { 10139 SFMMU_STAT(sf_tsb_shrink); 10140 } 10141 10142 sfmmu_tsbinfo_free(old_tsbinfo); 10143 10144 (void) sfmmu_hat_enter(sfmmup); 10145 return (TSB_SUCCESS); 10146 } 10147 10148 /* 10149 * This function will re-program hat pgsz array, and invalidate the 10150 * process' context, forcing the process to switch to another 10151 * context on the next TLB miss, and therefore start using the 10152 * TLB that is reprogrammed for the new page sizes. 10153 */ 10154 void 10155 sfmmu_reprog_pgsz_arr(sfmmu_t *sfmmup, uint8_t *tmp_pgsz) 10156 { 10157 int i; 10158 hatlock_t *hatlockp = NULL; 10159 10160 hatlockp = sfmmu_hat_enter(sfmmup); 10161 /* USIII+-IV+ optimization, requires hat lock */ 10162 if (tmp_pgsz) { 10163 for (i = 0; i < mmu_page_sizes; i++) 10164 sfmmup->sfmmu_pgsz[i] = tmp_pgsz[i]; 10165 } 10166 SFMMU_STAT(sf_tlb_reprog_pgsz); 10167 10168 sfmmu_invalidate_ctx(sfmmup); 10169 10170 sfmmu_hat_exit(hatlockp); 10171 } 10172 10173 /* 10174 * The scd_rttecnt field in the SCD must be updated to take account of the 10175 * regions which it contains. 10176 */ 10177 static void 10178 sfmmu_set_scd_rttecnt(sf_srd_t *srdp, sf_scd_t *scdp) 10179 { 10180 uint_t rid; 10181 uint_t i, j; 10182 ulong_t w; 10183 sf_region_t *rgnp; 10184 10185 ASSERT(srdp != NULL); 10186 10187 for (i = 0; i < SFMMU_HMERGNMAP_WORDS; i++) { 10188 if ((w = scdp->scd_region_map.bitmap[i]) == 0) { 10189 continue; 10190 } 10191 10192 j = 0; 10193 while (w) { 10194 if (!(w & 0x1)) { 10195 j++; 10196 w >>= 1; 10197 continue; 10198 } 10199 rid = (i << BT_ULSHIFT) | j; 10200 j++; 10201 w >>= 1; 10202 10203 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 10204 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 10205 rgnp = srdp->srd_hmergnp[rid]; 10206 ASSERT(rgnp->rgn_refcnt > 0); 10207 ASSERT(rgnp->rgn_id == rid); 10208 10209 scdp->scd_rttecnt[rgnp->rgn_pgszc] += 10210 rgnp->rgn_size >> TTE_PAGE_SHIFT(rgnp->rgn_pgszc); 10211 10212 /* 10213 * Maintain the tsb0 inflation cnt for the regions 10214 * in the SCD. 
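 * The count added here is rgn_size >> (TTE_PAGE_SHIFT(TTE8K) + 2),
 * i.e. one quarter of the number of 8K pages spanned by the region;
 * it feeds the tte8k_cnt inflation in sfmmu_check_page_sizes() so the
 * 8K TSB sizing leaves room for 4M (and larger) region mappings that
 * end up backed by 8K pages when large page allocation fails.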
10215 */ 10216 if (rgnp->rgn_pgszc >= TTE4M) { 10217 scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt += 10218 rgnp->rgn_size >> 10219 (TTE_PAGE_SHIFT(TTE8K) + 2); 10220 } 10221 } 10222 } 10223 } 10224 10225 /* 10226 * This function assumes that there are either four or six supported page 10227 * sizes and at most two programmable TLBs, so we need to decide which 10228 * page sizes are most important and then tell the MMU layer so it 10229 * can adjust the TLB page sizes accordingly (if supported). 10230 * 10231 * If these assumptions change, this function will need to be 10232 * updated to support whatever the new limits are. 10233 * 10234 * The growing flag is nonzero if we are growing the address space, 10235 * and zero if it is shrinking. This allows us to decide whether 10236 * to grow or shrink our TSB, depending upon available memory 10237 * conditions. 10238 */ 10239 static void 10240 sfmmu_check_page_sizes(sfmmu_t *sfmmup, int growing) 10241 { 10242 uint64_t ttecnt[MMU_PAGE_SIZES]; 10243 uint64_t tte8k_cnt, tte4m_cnt; 10244 uint8_t i; 10245 int sectsb_thresh; 10246 10247 /* 10248 * Kernel threads, processes with small address spaces not using 10249 * large pages, and dummy ISM HATs need not apply. 10250 */ 10251 if (sfmmup == ksfmmup || sfmmup->sfmmu_ismhat != NULL) 10252 return; 10253 10254 if (!SFMMU_LGPGS_INUSE(sfmmup) && 10255 sfmmup->sfmmu_ttecnt[TTE8K] <= tsb_rss_factor) 10256 return; 10257 10258 for (i = 0; i < mmu_page_sizes; i++) { 10259 ttecnt[i] = sfmmup->sfmmu_ttecnt[i] + 10260 sfmmup->sfmmu_ismttecnt[i]; 10261 } 10262 10263 /* Check pagesizes in use, and possibly reprogram DTLB. */ 10264 if (&mmu_check_page_sizes) 10265 mmu_check_page_sizes(sfmmup, ttecnt); 10266 10267 /* 10268 * Calculate the number of 8k ttes to represent the span of these 10269 * pages. 10270 */ 10271 tte8k_cnt = ttecnt[TTE8K] + 10272 (ttecnt[TTE64K] << (MMU_PAGESHIFT64K - MMU_PAGESHIFT)) + 10273 (ttecnt[TTE512K] << (MMU_PAGESHIFT512K - MMU_PAGESHIFT)); 10274 if (mmu_page_sizes == max_mmu_page_sizes) { 10275 tte4m_cnt = ttecnt[TTE4M] + 10276 (ttecnt[TTE32M] << (MMU_PAGESHIFT32M - MMU_PAGESHIFT4M)) + 10277 (ttecnt[TTE256M] << (MMU_PAGESHIFT256M - MMU_PAGESHIFT4M)); 10278 } else { 10279 tte4m_cnt = ttecnt[TTE4M]; 10280 } 10281 10282 /* 10283 * Inflate tte8k_cnt to allow for region large page allocation failure. 10284 */ 10285 tte8k_cnt += sfmmup->sfmmu_tsb0_4minflcnt; 10286 10287 /* 10288 * Inflate TSB sizes by a factor of 2 if this process 10289 * uses 4M text pages to minimize extra conflict misses 10290 * in the first TSB since without counting text pages 10291 * 8K TSB may become too small. 10292 * 10293 * Also double the size of the second TSB to minimize 10294 * extra conflict misses due to competition between 4M text pages 10295 * and data pages. 10296 * 10297 * We need to adjust the second TSB allocation threshold by the 10298 * inflation factor, since there is no point in creating a second 10299 * TSB when we know all the mappings can fit in the I/D TLBs. 10300 */ 10301 sectsb_thresh = tsb_sectsb_threshold; 10302 if (sfmmup->sfmmu_flags & HAT_4MTEXT_FLAG) { 10303 tte8k_cnt <<= 1; 10304 tte4m_cnt <<= 1; 10305 sectsb_thresh <<= 1; 10306 } 10307 10308 /* 10309 * Check to see if our TSB is the right size; we may need to 10310 * grow or shrink it. If the process is small, our work is 10311 * finished at this point. 
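 * tte8k_cnt sizes the first TSB (8K/64K/512K pages) while tte4m_cnt,
 * compared against sectsb_thresh, decides whether a second TSB for 4M
 * (and, where supported, 32M/256M) pages is needed; the actual grow or
 * shrink work is done in sfmmu_size_tsb() below.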
10312 */ 10313 if (tte8k_cnt <= tsb_rss_factor && tte4m_cnt <= sectsb_thresh) { 10314 return; 10315 } 10316 sfmmu_size_tsb(sfmmup, growing, tte8k_cnt, tte4m_cnt, sectsb_thresh); 10317 } 10318 10319 static void 10320 sfmmu_size_tsb(sfmmu_t *sfmmup, int growing, uint64_t tte8k_cnt, 10321 uint64_t tte4m_cnt, int sectsb_thresh) 10322 { 10323 int tsb_bits; 10324 uint_t tsb_szc; 10325 struct tsb_info *tsbinfop; 10326 hatlock_t *hatlockp = NULL; 10327 10328 hatlockp = sfmmu_hat_enter(sfmmup); 10329 ASSERT(hatlockp != NULL); 10330 tsbinfop = sfmmup->sfmmu_tsb; 10331 ASSERT(tsbinfop != NULL); 10332 10333 /* 10334 * If we're growing, select the size based on RSS. If we're 10335 * shrinking, leave some room so we don't have to turn around and 10336 * grow again immediately. 10337 */ 10338 if (growing) 10339 tsb_szc = SELECT_TSB_SIZECODE(tte8k_cnt); 10340 else 10341 tsb_szc = SELECT_TSB_SIZECODE(tte8k_cnt << 1); 10342 10343 if (!growing && (tsb_szc < tsbinfop->tsb_szc) && 10344 (tsb_szc >= default_tsb_size) && TSB_OK_SHRINK()) { 10345 (void) sfmmu_replace_tsb(sfmmup, tsbinfop, tsb_szc, 10346 hatlockp, TSB_SHRINK); 10347 } else if (growing && tsb_szc > tsbinfop->tsb_szc && TSB_OK_GROW()) { 10348 (void) sfmmu_replace_tsb(sfmmup, tsbinfop, tsb_szc, 10349 hatlockp, TSB_GROW); 10350 } 10351 tsbinfop = sfmmup->sfmmu_tsb; 10352 10353 /* 10354 * With the TLB and first TSB out of the way, we need to see if 10355 * we need a second TSB for 4M pages. If we managed to reprogram 10356 * the TLB page sizes above, the process will start using this new 10357 * TSB right away; otherwise, it will start using it on the next 10358 * context switch. Either way, it's no big deal so there's no 10359 * synchronization with the trap handlers here unless we grow the 10360 * TSB (in which case it's required to prevent using the old one 10361 * after it's freed). Note: second tsb is required for 32M/256M 10362 * page sizes. 10363 */ 10364 if (tte4m_cnt > sectsb_thresh) { 10365 /* 10366 * If we're growing, select the size based on RSS. If we're 10367 * shrinking, leave some room so we don't have to turn 10368 * around and grow again immediately. 10369 */ 10370 if (growing) 10371 tsb_szc = SELECT_TSB_SIZECODE(tte4m_cnt); 10372 else 10373 tsb_szc = SELECT_TSB_SIZECODE(tte4m_cnt << 1); 10374 if (tsbinfop->tsb_next == NULL) { 10375 struct tsb_info *newtsb; 10376 int allocflags = SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)? 10377 0 : TSB_ALLOC; 10378 10379 sfmmu_hat_exit(hatlockp); 10380 10381 /* 10382 * Try to allocate a TSB for 4[32|256]M pages. If we 10383 * can't get the size we want, retry w/a minimum sized 10384 * TSB. If that still didn't work, give up; we can 10385 * still run without one. 10386 */ 10387 tsb_bits = (mmu_page_sizes == max_mmu_page_sizes)? 10388 TSB4M|TSB32M|TSB256M:TSB4M; 10389 if ((sfmmu_tsbinfo_alloc(&newtsb, tsb_szc, tsb_bits, 10390 allocflags, sfmmup)) && 10391 (tsb_szc <= TSB_4M_SZCODE || 10392 sfmmu_tsbinfo_alloc(&newtsb, TSB_4M_SZCODE, 10393 tsb_bits, allocflags, sfmmup)) && 10394 sfmmu_tsbinfo_alloc(&newtsb, TSB_MIN_SZCODE, 10395 tsb_bits, allocflags, sfmmup)) { 10396 return; 10397 } 10398 10399 hatlockp = sfmmu_hat_enter(sfmmup); 10400 10401 sfmmu_invalidate_ctx(sfmmup); 10402 10403 if (sfmmup->sfmmu_tsb->tsb_next == NULL) { 10404 sfmmup->sfmmu_tsb->tsb_next = newtsb; 10405 SFMMU_STAT(sf_tsb_sectsb_create); 10406 sfmmu_hat_exit(hatlockp); 10407 return; 10408 } else { 10409 /* 10410 * It's annoying, but possible for us 10411 * to get here.. 
we dropped the HAT lock 10412 * because of locking order in the kmem 10413 * allocator, and while we were off getting 10414 * our memory, some other thread decided to 10415 * do us a favor and won the race to get a 10416 * second TSB for this process. Sigh. 10417 */ 10418 sfmmu_hat_exit(hatlockp); 10419 sfmmu_tsbinfo_free(newtsb); 10420 return; 10421 } 10422 } 10423 10424 /* 10425 * We have a second TSB, see if it's big enough. 10426 */ 10427 tsbinfop = tsbinfop->tsb_next; 10428 10429 /* 10430 * Check to see if our second TSB is the right size; 10431 * we may need to grow or shrink it. 10432 * To prevent thrashing (e.g. growing the TSB on a 10433 * subsequent map operation), only try to shrink if 10434 * the TSB reach exceeds twice the virtual address 10435 * space size. 10436 */ 10437 if (!growing && (tsb_szc < tsbinfop->tsb_szc) && 10438 (tsb_szc >= default_tsb_size) && TSB_OK_SHRINK()) { 10439 (void) sfmmu_replace_tsb(sfmmup, tsbinfop, 10440 tsb_szc, hatlockp, TSB_SHRINK); 10441 } else if (growing && tsb_szc > tsbinfop->tsb_szc && 10442 TSB_OK_GROW()) { 10443 (void) sfmmu_replace_tsb(sfmmup, tsbinfop, 10444 tsb_szc, hatlockp, TSB_GROW); 10445 } 10446 } 10447 10448 sfmmu_hat_exit(hatlockp); 10449 } 10450 10451 /* 10452 * Free up a sfmmu 10453 * Since the sfmmu is currently embedded in the hat struct we simply zero 10454 * out our fields and free up the ism map blk list if any. 10455 */ 10456 static void 10457 sfmmu_free_sfmmu(sfmmu_t *sfmmup) 10458 { 10459 ism_blk_t *blkp, *nx_blkp; 10460 #ifdef DEBUG 10461 ism_map_t *map; 10462 int i; 10463 #endif 10464 10465 ASSERT(sfmmup->sfmmu_ttecnt[TTE8K] == 0); 10466 ASSERT(sfmmup->sfmmu_ttecnt[TTE64K] == 0); 10467 ASSERT(sfmmup->sfmmu_ttecnt[TTE512K] == 0); 10468 ASSERT(sfmmup->sfmmu_ttecnt[TTE4M] == 0); 10469 ASSERT(sfmmup->sfmmu_ttecnt[TTE32M] == 0); 10470 ASSERT(sfmmup->sfmmu_ttecnt[TTE256M] == 0); 10471 ASSERT(SF_RGNMAP_ISNULL(sfmmup)); 10472 10473 sfmmup->sfmmu_free = 0; 10474 sfmmup->sfmmu_ismhat = 0; 10475 10476 blkp = sfmmup->sfmmu_iblk; 10477 sfmmup->sfmmu_iblk = NULL; 10478 10479 while (blkp) { 10480 #ifdef DEBUG 10481 map = blkp->iblk_maps; 10482 for (i = 0; i < ISM_MAP_SLOTS; i++) { 10483 ASSERT(map[i].imap_seg == 0); 10484 ASSERT(map[i].imap_ismhat == NULL); 10485 ASSERT(map[i].imap_ment == NULL); 10486 } 10487 #endif 10488 nx_blkp = blkp->iblk_next; 10489 blkp->iblk_next = NULL; 10490 blkp->iblk_nextpa = (uint64_t)-1; 10491 kmem_cache_free(ism_blk_cache, blkp); 10492 blkp = nx_blkp; 10493 } 10494 } 10495 10496 /* 10497 * Locking primitves accessed by HATLOCK macros 10498 */ 10499 10500 #define SFMMU_SPL_MTX (0x0) 10501 #define SFMMU_ML_MTX (0x1) 10502 10503 #define SFMMU_MLSPL_MTX(type, pg) (((type) == SFMMU_SPL_MTX) ? \ 10504 SPL_HASH(pg) : MLIST_HASH(pg)) 10505 10506 kmutex_t * 10507 sfmmu_page_enter(struct page *pp) 10508 { 10509 return (sfmmu_mlspl_enter(pp, SFMMU_SPL_MTX)); 10510 } 10511 10512 void 10513 sfmmu_page_exit(kmutex_t *spl) 10514 { 10515 mutex_exit(spl); 10516 } 10517 10518 int 10519 sfmmu_page_spl_held(struct page *pp) 10520 { 10521 return (sfmmu_mlspl_held(pp, SFMMU_SPL_MTX)); 10522 } 10523 10524 kmutex_t * 10525 sfmmu_mlist_enter(struct page *pp) 10526 { 10527 return (sfmmu_mlspl_enter(pp, SFMMU_ML_MTX)); 10528 } 10529 10530 void 10531 sfmmu_mlist_exit(kmutex_t *mml) 10532 { 10533 mutex_exit(mml); 10534 } 10535 10536 int 10537 sfmmu_mlist_held(struct page *pp) 10538 { 10539 10540 return (sfmmu_mlspl_held(pp, SFMMU_ML_MTX)); 10541 } 10542 10543 /* 10544 * Common code for sfmmu_mlist_enter() and sfmmu_page_enter(). 
For 10545 * sfmmu_mlist_enter() case mml_table lock array is used and for 10546 * sfmmu_page_enter() sfmmu_page_lock lock array is used. 10547 * 10548 * The lock is taken on a root page so that it protects an operation on all 10549 * constituent pages of a large page pp belongs to. 10550 * 10551 * The routine takes a lock from the appropriate array. The lock is determined 10552 * by hashing the root page. After taking the lock this routine checks if the 10553 * root page has the same size code that was used to determine the root (i.e 10554 * that root hasn't changed). If root page has the expected p_szc field we 10555 * have the right lock and it's returned to the caller. If root's p_szc 10556 * decreased we release the lock and retry from the beginning. This case can 10557 * happen due to hat_page_demote() decreasing p_szc between our load of p_szc 10558 * value and taking the lock. The number of retries due to p_szc decrease is 10559 * limited by the maximum p_szc value. If p_szc is 0 we return the lock 10560 * determined by hashing pp itself. 10561 * 10562 * If our caller doesn't hold a SE_SHARED or SE_EXCL lock on pp it's also 10563 * possible that p_szc can increase. To increase p_szc a thread has to lock 10564 * all constituent pages EXCL and do hat_pageunload() on all of them. All the 10565 * callers that don't hold a page locked recheck if hmeblk through which pp 10566 * was found still maps this pp. If it doesn't map it anymore returned lock 10567 * is immediately dropped. Therefore if sfmmu_mlspl_enter() hits the case of 10568 * p_szc increase after taking the lock it returns this lock without further 10569 * retries because in this case the caller doesn't care about which lock was 10570 * taken. The caller will drop it right away. 10571 * 10572 * After the routine returns it's guaranteed that hat_page_demote() can't 10573 * change p_szc field of any of constituent pages of a large page pp belongs 10574 * to as long as pp was either locked at least SHARED prior to this call or 10575 * the caller finds that hment that pointed to this pp still references this 10576 * pp (this also assumes that the caller holds hme hash bucket lock so that 10577 * the same pp can't be remapped into the same hmeblk after it was unmapped by 10578 * hat_pageunload()). 10579 */ 10580 static kmutex_t * 10581 sfmmu_mlspl_enter(struct page *pp, int type) 10582 { 10583 kmutex_t *mtx; 10584 uint_t prev_rszc = UINT_MAX; 10585 page_t *rootpp; 10586 uint_t szc; 10587 uint_t rszc; 10588 uint_t pszc = pp->p_szc; 10589 10590 ASSERT(pp != NULL); 10591 10592 again: 10593 if (pszc == 0) { 10594 mtx = SFMMU_MLSPL_MTX(type, pp); 10595 mutex_enter(mtx); 10596 return (mtx); 10597 } 10598 10599 /* The lock lives in the root page */ 10600 rootpp = PP_GROUPLEADER(pp, pszc); 10601 mtx = SFMMU_MLSPL_MTX(type, rootpp); 10602 mutex_enter(mtx); 10603 10604 /* 10605 * Return mml in the following 3 cases: 10606 * 10607 * 1) If pp itself is root since if its p_szc decreased before we took 10608 * the lock pp is still the root of smaller szc page. And if its p_szc 10609 * increased it doesn't matter what lock we return (see comment in 10610 * front of this routine). 10611 * 10612 * 2) If pp's not root but rootpp is the root of a rootpp->p_szc size 10613 * large page we have the right lock since any previous potential 10614 * hat_page_demote() is done demoting from greater than current root's 10615 * p_szc because hat_page_demote() changes root's p_szc last. 
No 10616 * further hat_page_demote() can start or be in progress since it 10617 * would need the same lock we currently hold. 10618 * 10619 * 3) If rootpp's p_szc increased since previous iteration it doesn't 10620 * matter what lock we return (see comment in front of this routine). 10621 */ 10622 if (pp == rootpp || (rszc = rootpp->p_szc) == pszc || 10623 rszc >= prev_rszc) { 10624 return (mtx); 10625 } 10626 10627 /* 10628 * hat_page_demote() could have decreased root's p_szc. 10629 * In this case pp's p_szc must also be smaller than pszc. 10630 * Retry. 10631 */ 10632 if (rszc < pszc) { 10633 szc = pp->p_szc; 10634 if (szc < pszc) { 10635 mutex_exit(mtx); 10636 pszc = szc; 10637 goto again; 10638 } 10639 /* 10640 * pp's p_szc increased after it was decreased. 10641 * page cannot be mapped. Return current lock. The caller 10642 * will drop it right away. 10643 */ 10644 return (mtx); 10645 } 10646 10647 /* 10648 * root's p_szc is greater than pp's p_szc. 10649 * hat_page_demote() is not done with all pages 10650 * yet. Wait for it to complete. 10651 */ 10652 mutex_exit(mtx); 10653 rootpp = PP_GROUPLEADER(rootpp, rszc); 10654 mtx = SFMMU_MLSPL_MTX(type, rootpp); 10655 mutex_enter(mtx); 10656 mutex_exit(mtx); 10657 prev_rszc = rszc; 10658 goto again; 10659 } 10660 10661 static int 10662 sfmmu_mlspl_held(struct page *pp, int type) 10663 { 10664 kmutex_t *mtx; 10665 10666 ASSERT(pp != NULL); 10667 /* The lock lives in the root page */ 10668 pp = PP_PAGEROOT(pp); 10669 ASSERT(pp != NULL); 10670 10671 mtx = SFMMU_MLSPL_MTX(type, pp); 10672 return (MUTEX_HELD(mtx)); 10673 } 10674 10675 static uint_t 10676 sfmmu_get_free_hblk(struct hme_blk **hmeblkpp, uint_t critical) 10677 { 10678 struct hme_blk *hblkp; 10679 10680 if (freehblkp != NULL) { 10681 mutex_enter(&freehblkp_lock); 10682 if (freehblkp != NULL) { 10683 /* 10684 * If the current thread is owning hblk_reserve OR 10685 * critical request from sfmmu_hblk_steal() 10686 * let it succeed even if freehblkcnt is really low. 10687 */ 10688 if (freehblkcnt <= HBLK_RESERVE_MIN && !critical) { 10689 SFMMU_STAT(sf_get_free_throttle); 10690 mutex_exit(&freehblkp_lock); 10691 return (0); 10692 } 10693 freehblkcnt--; 10694 *hmeblkpp = freehblkp; 10695 hblkp = *hmeblkpp; 10696 freehblkp = hblkp->hblk_next; 10697 mutex_exit(&freehblkp_lock); 10698 hblkp->hblk_next = NULL; 10699 SFMMU_STAT(sf_get_free_success); 10700 return (1); 10701 } 10702 mutex_exit(&freehblkp_lock); 10703 } 10704 SFMMU_STAT(sf_get_free_fail); 10705 return (0); 10706 } 10707 10708 static uint_t 10709 sfmmu_put_free_hblk(struct hme_blk *hmeblkp, uint_t critical) 10710 { 10711 struct hme_blk *hblkp; 10712 10713 /* 10714 * If the current thread is mapping into kernel space, 10715 * let it succede even if freehblkcnt is max 10716 * so that it will avoid freeing it to kmem. 10717 * This will prevent stack overflow due to 10718 * possible recursion since kmem_cache_free() 10719 * might require creation of a slab which 10720 * in turn needs an hmeblk to map that slab; 10721 * let's break this vicious chain at the first 10722 * opportunity. 10723 */ 10724 if (freehblkcnt < HBLK_RESERVE_CNT || critical) { 10725 mutex_enter(&freehblkp_lock); 10726 if (freehblkcnt < HBLK_RESERVE_CNT || critical) { 10727 SFMMU_STAT(sf_put_free_success); 10728 freehblkcnt++; 10729 hmeblkp->hblk_next = freehblkp; 10730 freehblkp = hmeblkp; 10731 mutex_exit(&freehblkp_lock); 10732 return (1); 10733 } 10734 mutex_exit(&freehblkp_lock); 10735 } 10736 10737 /* 10738 * Bring down freehblkcnt to HBLK_RESERVE_CNT. 
We are here 10739 * only if freehblkcnt is at least HBLK_RESERVE_CNT *and* 10740 * we are not in the process of mapping into kernel space. 10741 */ 10742 ASSERT(!critical); 10743 while (freehblkcnt > HBLK_RESERVE_CNT) { 10744 mutex_enter(&freehblkp_lock); 10745 if (freehblkcnt > HBLK_RESERVE_CNT) { 10746 freehblkcnt--; 10747 hblkp = freehblkp; 10748 freehblkp = hblkp->hblk_next; 10749 mutex_exit(&freehblkp_lock); 10750 ASSERT(get_hblk_cache(hblkp) == sfmmu8_cache); 10751 kmem_cache_free(sfmmu8_cache, hblkp); 10752 continue; 10753 } 10754 mutex_exit(&freehblkp_lock); 10755 } 10756 SFMMU_STAT(sf_put_free_fail); 10757 return (0); 10758 } 10759 10760 static void 10761 sfmmu_hblk_swap(struct hme_blk *new) 10762 { 10763 struct hme_blk *old, *hblkp, *prev; 10764 uint64_t hblkpa, prevpa, newpa; 10765 caddr_t base, vaddr, endaddr; 10766 struct hmehash_bucket *hmebp; 10767 struct sf_hment *osfhme, *nsfhme; 10768 page_t *pp; 10769 kmutex_t *pml; 10770 tte_t tte; 10771 10772 #ifdef DEBUG 10773 hmeblk_tag hblktag; 10774 struct hme_blk *found; 10775 #endif 10776 old = HBLK_RESERVE; 10777 ASSERT(!old->hblk_shared); 10778 10779 /* 10780 * save pa before bcopy clobbers it 10781 */ 10782 newpa = new->hblk_nextpa; 10783 10784 base = (caddr_t)get_hblk_base(old); 10785 endaddr = base + get_hblk_span(old); 10786 10787 /* 10788 * acquire hash bucket lock. 10789 */ 10790 hmebp = sfmmu_tteload_acquire_hashbucket(ksfmmup, base, TTE8K, 10791 SFMMU_INVALID_SHMERID); 10792 10793 /* 10794 * copy contents from old to new 10795 */ 10796 bcopy((void *)old, (void *)new, HME8BLK_SZ); 10797 10798 /* 10799 * add new to hash chain 10800 */ 10801 sfmmu_hblk_hash_add(hmebp, new, newpa); 10802 10803 /* 10804 * search hash chain for hblk_reserve; this needs to be performed 10805 * after adding new, otherwise prevpa and prev won't correspond 10806 * to the hblk which is prior to old in hash chain when we call 10807 * sfmmu_hblk_hash_rm to remove old later. 10808 */ 10809 for (prevpa = 0, prev = NULL, 10810 hblkpa = hmebp->hmeh_nextpa, hblkp = hmebp->hmeblkp; 10811 hblkp != NULL && hblkp != old; 10812 prevpa = hblkpa, prev = hblkp, 10813 hblkpa = hblkp->hblk_nextpa, hblkp = hblkp->hblk_next) 10814 ; 10815 10816 if (hblkp != old) 10817 panic("sfmmu_hblk_swap: hblk_reserve not found"); 10818 10819 /* 10820 * p_mapping list is still pointing to hments in hblk_reserve; 10821 * fix up p_mapping list so that they point to hments in new. 10822 * 10823 * Since all these mappings are created by hblk_reserve_thread 10824 * on the way and it's using at least one of the buffers from each of 10825 * the newly minted slabs, there is no danger of any of these 10826 * mappings getting unloaded by another thread. 10827 * 10828 * tsbmiss could only modify ref/mod bits of hments in old/new. 10829 * Since all of these hments hold mappings established by segkmem 10830 * and mappings in segkmem are setup with HAT_NOSYNC, ref/mod bits 10831 * have no meaning for the mappings in hblk_reserve. hments in 10832 * old and new are identical except for ref/mod bits. 
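 * The loop below walks each 8K slot in the span; for every valid tte it
 * moves the corresponding hment from old to new on the page's p_mapping
 * list while holding that page's mlist lock.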
10833 */ 10834 for (vaddr = base; vaddr < endaddr; vaddr += TTEBYTES(TTE8K)) { 10835 10836 HBLKTOHME(osfhme, old, vaddr); 10837 sfmmu_copytte(&osfhme->hme_tte, &tte); 10838 10839 if (TTE_IS_VALID(&tte)) { 10840 if ((pp = osfhme->hme_page) == NULL) 10841 panic("sfmmu_hblk_swap: page not mapped"); 10842 10843 pml = sfmmu_mlist_enter(pp); 10844 10845 if (pp != osfhme->hme_page) 10846 panic("sfmmu_hblk_swap: mapping changed"); 10847 10848 HBLKTOHME(nsfhme, new, vaddr); 10849 10850 HME_ADD(nsfhme, pp); 10851 HME_SUB(osfhme, pp); 10852 10853 sfmmu_mlist_exit(pml); 10854 } 10855 } 10856 10857 /* 10858 * remove old from hash chain 10859 */ 10860 sfmmu_hblk_hash_rm(hmebp, old, prevpa, prev); 10861 10862 #ifdef DEBUG 10863 10864 hblktag.htag_id = ksfmmup; 10865 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 10866 hblktag.htag_bspage = HME_HASH_BSPAGE(base, HME_HASH_SHIFT(TTE8K)); 10867 hblktag.htag_rehash = HME_HASH_REHASH(TTE8K); 10868 HME_HASH_FAST_SEARCH(hmebp, hblktag, found); 10869 10870 if (found != new) 10871 panic("sfmmu_hblk_swap: new hblk not found"); 10872 #endif 10873 10874 SFMMU_HASH_UNLOCK(hmebp); 10875 10876 /* 10877 * Reset hblk_reserve 10878 */ 10879 bzero((void *)old, HME8BLK_SZ); 10880 old->hblk_nextpa = va_to_pa((caddr_t)old); 10881 } 10882 10883 /* 10884 * Grab the mlist mutex for both pages passed in. 10885 * 10886 * low and high will be returned as pointers to the mutexes for these pages. 10887 * low refers to the mutex residing in the lower bin of the mlist hash, while 10888 * high refers to the mutex residing in the higher bin of the mlist hash. This 10889 * is due to the locking order restrictions on the same thread grabbing 10890 * multiple mlist mutexes. The low lock must be acquired before the high lock. 10891 * 10892 * If both pages hash to the same mutex, only grab that single mutex, and 10893 * high will be returned as NULL 10894 * If the pages hash to different bins in the hash, grab the lower addressed 10895 * lock first and then the higher addressed lock in order to follow the locking 10896 * rules involved with the same thread grabbing multiple mlist mutexes. 10897 * low and high will both have non-NULL values. 10898 */ 10899 static void 10900 sfmmu_mlist_reloc_enter(struct page *targ, struct page *repl, 10901 kmutex_t **low, kmutex_t **high) 10902 { 10903 kmutex_t *mml_targ, *mml_repl; 10904 10905 /* 10906 * no need to do the dance around szc as in sfmmu_mlist_enter() 10907 * because this routine is only called by hat_page_relocate() and all 10908 * targ and repl pages are already locked EXCL so szc can't change. 
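 * Taking the two mutexes in a fixed (lower address first) order also
 * guarantees that concurrent relocations can never hold the pair in
 * opposite orders and deadlock against each other.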
10909 */ 10910 10911 mml_targ = MLIST_HASH(PP_PAGEROOT(targ)); 10912 mml_repl = MLIST_HASH(PP_PAGEROOT(repl)); 10913 10914 if (mml_targ == mml_repl) { 10915 *low = mml_targ; 10916 *high = NULL; 10917 } else { 10918 if (mml_targ < mml_repl) { 10919 *low = mml_targ; 10920 *high = mml_repl; 10921 } else { 10922 *low = mml_repl; 10923 *high = mml_targ; 10924 } 10925 } 10926 10927 mutex_enter(*low); 10928 if (*high) 10929 mutex_enter(*high); 10930 } 10931 10932 static void 10933 sfmmu_mlist_reloc_exit(kmutex_t *low, kmutex_t *high) 10934 { 10935 if (high) 10936 mutex_exit(high); 10937 mutex_exit(low); 10938 } 10939 10940 static hatlock_t * 10941 sfmmu_hat_enter(sfmmu_t *sfmmup) 10942 { 10943 hatlock_t *hatlockp; 10944 10945 if (sfmmup != ksfmmup) { 10946 hatlockp = TSB_HASH(sfmmup); 10947 mutex_enter(HATLOCK_MUTEXP(hatlockp)); 10948 return (hatlockp); 10949 } 10950 return (NULL); 10951 } 10952 10953 static hatlock_t * 10954 sfmmu_hat_tryenter(sfmmu_t *sfmmup) 10955 { 10956 hatlock_t *hatlockp; 10957 10958 if (sfmmup != ksfmmup) { 10959 hatlockp = TSB_HASH(sfmmup); 10960 if (mutex_tryenter(HATLOCK_MUTEXP(hatlockp)) == 0) 10961 return (NULL); 10962 return (hatlockp); 10963 } 10964 return (NULL); 10965 } 10966 10967 static void 10968 sfmmu_hat_exit(hatlock_t *hatlockp) 10969 { 10970 if (hatlockp != NULL) 10971 mutex_exit(HATLOCK_MUTEXP(hatlockp)); 10972 } 10973 10974 static void 10975 sfmmu_hat_lock_all(void) 10976 { 10977 int i; 10978 for (i = 0; i < SFMMU_NUM_LOCK; i++) 10979 mutex_enter(HATLOCK_MUTEXP(&hat_lock[i])); 10980 } 10981 10982 static void 10983 sfmmu_hat_unlock_all(void) 10984 { 10985 int i; 10986 for (i = SFMMU_NUM_LOCK - 1; i >= 0; i--) 10987 mutex_exit(HATLOCK_MUTEXP(&hat_lock[i])); 10988 } 10989 10990 int 10991 sfmmu_hat_lock_held(sfmmu_t *sfmmup) 10992 { 10993 ASSERT(sfmmup != ksfmmup); 10994 return (MUTEX_HELD(HATLOCK_MUTEXP(TSB_HASH(sfmmup)))); 10995 } 10996 10997 /* 10998 * Locking primitives to provide consistency between ISM unmap 10999 * and other operations. Since ISM unmap can take a long time, we 11000 * use HAT_ISMBUSY flag (protected by the hatlock) to avoid creating 11001 * contention on the hatlock buckets while ISM segments are being 11002 * unmapped. The tradeoff is that the flags don't prevent priority 11003 * inversion from occurring, so we must request kernel priority in 11004 * case we have to sleep to keep from getting buried while holding 11005 * the HAT_ISMBUSY flag set, which in turn could block other kernel 11006 * threads from running (for example, in sfmmu_uvatopfn()). 
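 *
 * A typical caller (sketch only; the ISM unmap path is the sort of
 * user this protects) looks like:
 *
 *	sfmmu_ismhat_enter(sfmmup, 0);	.. sets HAT_ISMBUSY
 *	... update the ISM mappings ...
 *	sfmmu_ismhat_exit(sfmmup, 0);	.. clears the flag and wakes
 *					   threads waiting on sfmmu_tsb_cv
 *
 * while code such as sfmmu_tsbmiss_exception() cv_waits on
 * sfmmu_tsb_cv until HAT_ISMBUSY is clear.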
11007 */ 11008 static void 11009 sfmmu_ismhat_enter(sfmmu_t *sfmmup, int hatlock_held) 11010 { 11011 hatlock_t *hatlockp; 11012 11013 THREAD_KPRI_REQUEST(); 11014 if (!hatlock_held) 11015 hatlockp = sfmmu_hat_enter(sfmmup); 11016 while (SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)) 11017 cv_wait(&sfmmup->sfmmu_tsb_cv, HATLOCK_MUTEXP(hatlockp)); 11018 SFMMU_FLAGS_SET(sfmmup, HAT_ISMBUSY); 11019 if (!hatlock_held) 11020 sfmmu_hat_exit(hatlockp); 11021 } 11022 11023 static void 11024 sfmmu_ismhat_exit(sfmmu_t *sfmmup, int hatlock_held) 11025 { 11026 hatlock_t *hatlockp; 11027 11028 if (!hatlock_held) 11029 hatlockp = sfmmu_hat_enter(sfmmup); 11030 ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)); 11031 SFMMU_FLAGS_CLEAR(sfmmup, HAT_ISMBUSY); 11032 cv_broadcast(&sfmmup->sfmmu_tsb_cv); 11033 if (!hatlock_held) 11034 sfmmu_hat_exit(hatlockp); 11035 THREAD_KPRI_RELEASE(); 11036 } 11037 11038 /* 11039 * 11040 * Algorithm: 11041 * 11042 * (1) if segkmem is not ready, allocate hblk from an array of pre-alloc'ed 11043 * hblks. 11044 * 11045 * (2) if we are allocating an hblk for mapping a slab in sfmmu_cache, 11046 * 11047 * (a) try to return an hblk from reserve pool of free hblks; 11048 * (b) if the reserve pool is empty, acquire hblk_reserve_lock 11049 * and return hblk_reserve. 11050 * 11051 * (3) call kmem_cache_alloc() to allocate hblk; 11052 * 11053 * (a) if hblk_reserve_lock is held by the current thread, 11054 * atomically replace hblk_reserve by the hblk that is 11055 * returned by kmem_cache_alloc; release hblk_reserve_lock 11056 * and call kmem_cache_alloc() again. 11057 * (b) if reserve pool is not full, add the hblk that is 11058 * returned by kmem_cache_alloc to reserve pool and 11059 * call kmem_cache_alloc again. 11060 * 11061 */ 11062 static struct hme_blk * 11063 sfmmu_hblk_alloc(sfmmu_t *sfmmup, caddr_t vaddr, 11064 struct hmehash_bucket *hmebp, uint_t size, hmeblk_tag hblktag, 11065 uint_t flags, uint_t rid) 11066 { 11067 struct hme_blk *hmeblkp = NULL; 11068 struct hme_blk *newhblkp; 11069 struct hme_blk *shw_hblkp = NULL; 11070 struct kmem_cache *sfmmu_cache = NULL; 11071 uint64_t hblkpa; 11072 ulong_t index; 11073 uint_t owner; /* set to 1 if using hblk_reserve */ 11074 uint_t forcefree; 11075 int sleep; 11076 sf_srd_t *srdp; 11077 sf_region_t *rgnp; 11078 11079 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 11080 ASSERT(hblktag.htag_rid == rid); 11081 SFMMU_VALIDATE_HMERID(sfmmup, rid, vaddr, TTEBYTES(size)); 11082 ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || 11083 IS_P2ALIGNED(vaddr, TTEBYTES(size))); 11084 11085 /* 11086 * If segkmem is not created yet, allocate from static hmeblks 11087 * created at the end of startup_modules(). See the block comment 11088 * in startup_modules() describing how we estimate the number of 11089 * static hmeblks that will be needed during re-map. 11090 */ 11091 if (!hblk_alloc_dynamic) { 11092 11093 ASSERT(!SFMMU_IS_SHMERID_VALID(rid)); 11094 11095 if (size == TTE8K) { 11096 index = nucleus_hblk8.index; 11097 if (index >= nucleus_hblk8.len) { 11098 /* 11099 * If we panic here, see startup_modules() to 11100 * make sure that we are calculating the 11101 * number of hblk8's that we need correctly. 11102 */ 11103 prom_panic("no nucleus hblk8 to allocate"); 11104 } 11105 hmeblkp = 11106 (struct hme_blk *)&nucleus_hblk8.list[index]; 11107 nucleus_hblk8.index++; 11108 SFMMU_STAT(sf_hblk8_nalloc); 11109 } else { 11110 index = nucleus_hblk1.index; 11111 if (nucleus_hblk1.index >= nucleus_hblk1.len) { 11112 /* 11113 * If we panic here, see startup_modules(). 
11114 * Most likely you need to update the 11115 * calculation of the number of hblk1 elements 11116 * that the kernel needs to boot. 11117 */ 11118 prom_panic("no nucleus hblk1 to allocate"); 11119 } 11120 hmeblkp = 11121 (struct hme_blk *)&nucleus_hblk1.list[index]; 11122 nucleus_hblk1.index++; 11123 SFMMU_STAT(sf_hblk1_nalloc); 11124 } 11125 11126 goto hblk_init; 11127 } 11128 11129 SFMMU_HASH_UNLOCK(hmebp); 11130 11131 if (sfmmup != KHATID && !SFMMU_IS_SHMERID_VALID(rid)) { 11132 if (mmu_page_sizes == max_mmu_page_sizes) { 11133 if (size < TTE256M) 11134 shw_hblkp = sfmmu_shadow_hcreate(sfmmup, vaddr, 11135 size, flags); 11136 } else { 11137 if (size < TTE4M) 11138 shw_hblkp = sfmmu_shadow_hcreate(sfmmup, vaddr, 11139 size, flags); 11140 } 11141 } else if (SFMMU_IS_SHMERID_VALID(rid)) { 11142 /* 11143 * Shared hmes use per region bitmaps in rgn_hmeflag 11144 * rather than shadow hmeblks to keep track of the 11145 * mapping sizes which have been allocated for the region. 11146 * Here we cleanup old invalid hmeblks with this rid, 11147 * which may be left around by pageunload(). 11148 */ 11149 int ttesz; 11150 caddr_t va; 11151 caddr_t eva = vaddr + TTEBYTES(size); 11152 11153 ASSERT(sfmmup != KHATID); 11154 11155 srdp = sfmmup->sfmmu_srdp; 11156 ASSERT(srdp != NULL && srdp->srd_refcnt != 0); 11157 rgnp = srdp->srd_hmergnp[rid]; 11158 ASSERT(rgnp != NULL && rgnp->rgn_id == rid); 11159 ASSERT(rgnp->rgn_refcnt != 0); 11160 ASSERT(size <= rgnp->rgn_pgszc); 11161 11162 ttesz = HBLK_MIN_TTESZ; 11163 do { 11164 if (!(rgnp->rgn_hmeflags & (0x1 << ttesz))) { 11165 continue; 11166 } 11167 11168 if (ttesz > size && ttesz != HBLK_MIN_TTESZ) { 11169 sfmmu_cleanup_rhblk(srdp, vaddr, rid, ttesz); 11170 } else if (ttesz < size) { 11171 for (va = vaddr; va < eva; 11172 va += TTEBYTES(ttesz)) { 11173 sfmmu_cleanup_rhblk(srdp, va, rid, 11174 ttesz); 11175 } 11176 } 11177 } while (++ttesz <= rgnp->rgn_pgszc); 11178 } 11179 11180 fill_hblk: 11181 owner = (hblk_reserve_thread == curthread) ? 1 : 0; 11182 11183 if (owner && size == TTE8K) { 11184 11185 ASSERT(!SFMMU_IS_SHMERID_VALID(rid)); 11186 /* 11187 * We are really in a tight spot. We already own 11188 * hblk_reserve and we need another hblk. In anticipation 11189 * of this kind of scenario, we specifically set aside 11190 * HBLK_RESERVE_MIN number of hblks to be used exclusively 11191 * by owner of hblk_reserve. 11192 */ 11193 SFMMU_STAT(sf_hblk_recurse_cnt); 11194 11195 if (!sfmmu_get_free_hblk(&hmeblkp, 1)) 11196 panic("sfmmu_hblk_alloc: reserve list is empty"); 11197 11198 goto hblk_verify; 11199 } 11200 11201 ASSERT(!owner); 11202 11203 if ((flags & HAT_NO_KALLOC) == 0) { 11204 11205 sfmmu_cache = ((size == TTE8K) ? sfmmu8_cache : sfmmu1_cache); 11206 sleep = ((sfmmup == KHATID) ? KM_NOSLEEP : KM_SLEEP); 11207 11208 if ((hmeblkp = kmem_cache_alloc(sfmmu_cache, sleep)) == NULL) { 11209 hmeblkp = sfmmu_hblk_steal(size); 11210 } else { 11211 /* 11212 * if we are the owner of hblk_reserve, 11213 * swap hblk_reserve with hmeblkp and 11214 * start a fresh life. Hope things go 11215 * better this time. 
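 * sfmmu_hblk_swap() copies the contents of hblk_reserve into the
 * freshly allocated hmeblk, installs the copy in the kernel hash in
 * hblk_reserve's place, fixes up the p_mapping lists and finally
 * zeroes hblk_reserve so it can be handed out again.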
11216 */ 11217 if (hblk_reserve_thread == curthread) { 11218 ASSERT(sfmmu_cache == sfmmu8_cache); 11219 sfmmu_hblk_swap(hmeblkp); 11220 hblk_reserve_thread = NULL; 11221 mutex_exit(&hblk_reserve_lock); 11222 goto fill_hblk; 11223 } 11224 /* 11225 * let's donate this hblk to our reserve list if 11226 * we are not mapping kernel range 11227 */ 11228 if (size == TTE8K && sfmmup != KHATID) 11229 if (sfmmu_put_free_hblk(hmeblkp, 0)) 11230 goto fill_hblk; 11231 } 11232 } else { 11233 /* 11234 * We are here to map the slab in sfmmu8_cache; let's 11235 * check if we could tap our reserve list; if successful, 11236 * this will avoid the pain of going thru sfmmu_hblk_swap 11237 */ 11238 SFMMU_STAT(sf_hblk_slab_cnt); 11239 if (!sfmmu_get_free_hblk(&hmeblkp, 0)) { 11240 /* 11241 * let's start hblk_reserve dance 11242 */ 11243 SFMMU_STAT(sf_hblk_reserve_cnt); 11244 owner = 1; 11245 mutex_enter(&hblk_reserve_lock); 11246 hmeblkp = HBLK_RESERVE; 11247 hblk_reserve_thread = curthread; 11248 } 11249 } 11250 11251 hblk_verify: 11252 ASSERT(hmeblkp != NULL); 11253 set_hblk_sz(hmeblkp, size); 11254 ASSERT(hmeblkp->hblk_nextpa == va_to_pa((caddr_t)hmeblkp)); 11255 SFMMU_HASH_LOCK(hmebp); 11256 HME_HASH_FAST_SEARCH(hmebp, hblktag, newhblkp); 11257 if (newhblkp != NULL) { 11258 SFMMU_HASH_UNLOCK(hmebp); 11259 if (hmeblkp != HBLK_RESERVE) { 11260 /* 11261 * This is really tricky! 11262 * 11263 * vmem_alloc(vmem_seg_arena) 11264 * vmem_alloc(vmem_internal_arena) 11265 * segkmem_alloc(heap_arena) 11266 * vmem_alloc(heap_arena) 11267 * page_create() 11268 * hat_memload() 11269 * kmem_cache_free() 11270 * kmem_cache_alloc() 11271 * kmem_slab_create() 11272 * vmem_alloc(kmem_internal_arena) 11273 * segkmem_alloc(heap_arena) 11274 * vmem_alloc(heap_arena) 11275 * page_create() 11276 * hat_memload() 11277 * kmem_cache_free() 11278 * ... 11279 * 11280 * Thus, hat_memload() could call kmem_cache_free 11281 * for enough number of times that we could easily 11282 * hit the bottom of the stack or run out of reserve 11283 * list of vmem_seg structs. So, we must donate 11284 * this hblk to reserve list if it's allocated 11285 * from sfmmu8_cache *and* mapping kernel range. 11286 * We don't need to worry about freeing hmeblk1's 11287 * to kmem since they don't map any kmem slabs. 11288 * 11289 * Note: When segkmem supports largepages, we must 11290 * free hmeblk1's to reserve list as well. 11291 */ 11292 forcefree = (sfmmup == KHATID) ? 1 : 0; 11293 if (size == TTE8K && 11294 sfmmu_put_free_hblk(hmeblkp, forcefree)) { 11295 goto re_verify; 11296 } 11297 ASSERT(sfmmup != KHATID); 11298 kmem_cache_free(get_hblk_cache(hmeblkp), hmeblkp); 11299 } else { 11300 /* 11301 * Hey! we don't need hblk_reserve any more. 11302 */ 11303 ASSERT(owner); 11304 hblk_reserve_thread = NULL; 11305 mutex_exit(&hblk_reserve_lock); 11306 owner = 0; 11307 } 11308 re_verify: 11309 /* 11310 * let's check if the goodies are still present 11311 */ 11312 SFMMU_HASH_LOCK(hmebp); 11313 HME_HASH_FAST_SEARCH(hmebp, hblktag, newhblkp); 11314 if (newhblkp != NULL) { 11315 /* 11316 * return newhblkp if it's not hblk_reserve; 11317 * if newhblkp is hblk_reserve, return it 11318 * _only if_ we are the owner of hblk_reserve. 
11319 */ 11320 if (newhblkp != HBLK_RESERVE || owner) { 11321 ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || 11322 newhblkp->hblk_shared); 11323 ASSERT(SFMMU_IS_SHMERID_VALID(rid) || 11324 !newhblkp->hblk_shared); 11325 return (newhblkp); 11326 } else { 11327 /* 11328 * we just hit hblk_reserve in the hash and 11329 * we are not the owner of that; 11330 * 11331 * block until hblk_reserve_thread completes 11332 * swapping hblk_reserve and try the dance 11333 * once again. 11334 */ 11335 SFMMU_HASH_UNLOCK(hmebp); 11336 mutex_enter(&hblk_reserve_lock); 11337 mutex_exit(&hblk_reserve_lock); 11338 SFMMU_STAT(sf_hblk_reserve_hit); 11339 goto fill_hblk; 11340 } 11341 } else { 11342 /* 11343 * it's no more! try the dance once again. 11344 */ 11345 SFMMU_HASH_UNLOCK(hmebp); 11346 goto fill_hblk; 11347 } 11348 } 11349 11350 hblk_init: 11351 if (SFMMU_IS_SHMERID_VALID(rid)) { 11352 uint16_t tteflag = 0x1 << 11353 ((size < HBLK_MIN_TTESZ) ? HBLK_MIN_TTESZ : size); 11354 11355 if (!(rgnp->rgn_hmeflags & tteflag)) { 11356 atomic_or_16(&rgnp->rgn_hmeflags, tteflag); 11357 } 11358 hmeblkp->hblk_shared = 1; 11359 } else { 11360 hmeblkp->hblk_shared = 0; 11361 } 11362 set_hblk_sz(hmeblkp, size); 11363 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 11364 hmeblkp->hblk_next = (struct hme_blk *)NULL; 11365 hmeblkp->hblk_tag = hblktag; 11366 hmeblkp->hblk_shadow = shw_hblkp; 11367 hblkpa = hmeblkp->hblk_nextpa; 11368 hmeblkp->hblk_nextpa = 0; 11369 11370 ASSERT(get_hblk_ttesz(hmeblkp) == size); 11371 ASSERT(get_hblk_span(hmeblkp) == HMEBLK_SPAN(size)); 11372 ASSERT(hmeblkp->hblk_hmecnt == 0); 11373 ASSERT(hmeblkp->hblk_vcnt == 0); 11374 ASSERT(hmeblkp->hblk_lckcnt == 0); 11375 ASSERT(hblkpa == va_to_pa((caddr_t)hmeblkp)); 11376 sfmmu_hblk_hash_add(hmebp, hmeblkp, hblkpa); 11377 return (hmeblkp); 11378 } 11379 11380 /* 11381 * This function performs any cleanup required on the hme_blk 11382 * and returns it to the free list. 11383 */ 11384 /* ARGSUSED */ 11385 static void 11386 sfmmu_hblk_free(struct hmehash_bucket *hmebp, struct hme_blk *hmeblkp, 11387 uint64_t hblkpa, struct hme_blk **listp) 11388 { 11389 int shw_size, vshift; 11390 struct hme_blk *shw_hblkp; 11391 uint_t shw_mask, newshw_mask; 11392 caddr_t vaddr; 11393 int size; 11394 uint_t critical; 11395 11396 ASSERT(hmeblkp); 11397 ASSERT(!hmeblkp->hblk_hmecnt); 11398 ASSERT(!hmeblkp->hblk_vcnt); 11399 ASSERT(!hmeblkp->hblk_lckcnt); 11400 ASSERT(hblkpa == va_to_pa((caddr_t)hmeblkp)); 11401 ASSERT(hmeblkp != (struct hme_blk *)hblk_reserve); 11402 11403 critical = (hblktosfmmu(hmeblkp) == KHATID) ? 
1 : 0; 11404 11405 size = get_hblk_ttesz(hmeblkp); 11406 shw_hblkp = hmeblkp->hblk_shadow; 11407 if (shw_hblkp) { 11408 ASSERT(hblktosfmmu(hmeblkp) != KHATID); 11409 ASSERT(!hmeblkp->hblk_shared); 11410 if (mmu_page_sizes == max_mmu_page_sizes) { 11411 ASSERT(size < TTE256M); 11412 } else { 11413 ASSERT(size < TTE4M); 11414 } 11415 11416 shw_size = get_hblk_ttesz(shw_hblkp); 11417 vaddr = (caddr_t)get_hblk_base(hmeblkp); 11418 vshift = vaddr_to_vshift(shw_hblkp->hblk_tag, vaddr, shw_size); 11419 ASSERT(vshift < 8); 11420 /* 11421 * Atomically clear shadow mask bit 11422 */ 11423 do { 11424 shw_mask = shw_hblkp->hblk_shw_mask; 11425 ASSERT(shw_mask & (1 << vshift)); 11426 newshw_mask = shw_mask & ~(1 << vshift); 11427 newshw_mask = cas32(&shw_hblkp->hblk_shw_mask, 11428 shw_mask, newshw_mask); 11429 } while (newshw_mask != shw_mask); 11430 hmeblkp->hblk_shadow = NULL; 11431 } 11432 hmeblkp->hblk_next = NULL; 11433 hmeblkp->hblk_nextpa = hblkpa; 11434 hmeblkp->hblk_shw_bit = 0; 11435 11436 if (hmeblkp->hblk_shared) { 11437 sf_srd_t *srdp; 11438 sf_region_t *rgnp; 11439 uint_t rid; 11440 11441 srdp = hblktosrd(hmeblkp); 11442 ASSERT(srdp != NULL && srdp->srd_refcnt != 0); 11443 rid = hmeblkp->hblk_tag.htag_rid; 11444 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 11445 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 11446 rgnp = srdp->srd_hmergnp[rid]; 11447 ASSERT(rgnp != NULL); 11448 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid); 11449 hmeblkp->hblk_shared = 0; 11450 } 11451 11452 if (hmeblkp->hblk_nuc_bit == 0) { 11453 11454 if (size == TTE8K && sfmmu_put_free_hblk(hmeblkp, critical)) 11455 return; 11456 11457 hmeblkp->hblk_next = *listp; 11458 *listp = hmeblkp; 11459 } 11460 } 11461 11462 static void 11463 sfmmu_hblks_list_purge(struct hme_blk **listp) 11464 { 11465 struct hme_blk *hmeblkp; 11466 11467 while ((hmeblkp = *listp) != NULL) { 11468 *listp = hmeblkp->hblk_next; 11469 kmem_cache_free(get_hblk_cache(hmeblkp), hmeblkp); 11470 } 11471 } 11472 11473 #define BUCKETS_TO_SEARCH_BEFORE_UNLOAD 30 11474 #define SFMMU_HBLK_STEAL_THRESHOLD 5 11475 11476 static uint_t sfmmu_hblk_steal_twice; 11477 static uint_t sfmmu_hblk_steal_count, sfmmu_hblk_steal_unload_count; 11478 11479 /* 11480 * Steal a hmeblk from user or kernel hme hash lists. 11481 * For 8K tte grab one from reserve pool (freehblkp) before proceeding to 11482 * steal and if we fail to steal after SFMMU_HBLK_STEAL_THRESHOLD attempts 11483 * tap into critical reserve of freehblkp. 11484 * Note: We remain looping in this routine until we find one. 11485 */ 11486 static struct hme_blk * 11487 sfmmu_hblk_steal(int size) 11488 { 11489 static struct hmehash_bucket *uhmehash_steal_hand = NULL; 11490 struct hmehash_bucket *hmebp; 11491 struct hme_blk *hmeblkp = NULL, *pr_hblk; 11492 uint64_t hblkpa, prevpa; 11493 int i; 11494 uint_t loop_cnt = 0, critical; 11495 11496 for (;;) { 11497 if (size == TTE8K) { 11498 critical = 11499 (++loop_cnt > SFMMU_HBLK_STEAL_THRESHOLD) ? 1 : 0; 11500 if (sfmmu_get_free_hblk(&hmeblkp, critical)) 11501 return (hmeblkp); 11502 } 11503 11504 hmebp = (uhmehash_steal_hand == NULL) ? 
uhme_hash : 11505 uhmehash_steal_hand; 11506 ASSERT(hmebp >= uhme_hash && hmebp <= &uhme_hash[UHMEHASH_SZ]); 11507 11508 for (i = 0; hmeblkp == NULL && i <= UHMEHASH_SZ + 11509 BUCKETS_TO_SEARCH_BEFORE_UNLOAD; i++) { 11510 SFMMU_HASH_LOCK(hmebp); 11511 hmeblkp = hmebp->hmeblkp; 11512 hblkpa = hmebp->hmeh_nextpa; 11513 prevpa = 0; 11514 pr_hblk = NULL; 11515 while (hmeblkp) { 11516 /* 11517 * check if it is a hmeblk that is not locked 11518 * and not shared. skip shadow hmeblks with 11519 * shadow_mask set i.e valid count non zero. 11520 */ 11521 if ((get_hblk_ttesz(hmeblkp) == size) && 11522 (hmeblkp->hblk_shw_bit == 0 || 11523 hmeblkp->hblk_vcnt == 0) && 11524 (hmeblkp->hblk_lckcnt == 0)) { 11525 /* 11526 * there is a high probability that we 11527 * will find a free one. search some 11528 * buckets for a free hmeblk initially 11529 * before unloading a valid hmeblk. 11530 */ 11531 if ((hmeblkp->hblk_vcnt == 0 && 11532 hmeblkp->hblk_hmecnt == 0) || (i >= 11533 BUCKETS_TO_SEARCH_BEFORE_UNLOAD)) { 11534 if (sfmmu_steal_this_hblk(hmebp, 11535 hmeblkp, hblkpa, prevpa, 11536 pr_hblk)) { 11537 /* 11538 * Hblk is unloaded 11539 * successfully 11540 */ 11541 break; 11542 } 11543 } 11544 } 11545 pr_hblk = hmeblkp; 11546 prevpa = hblkpa; 11547 hblkpa = hmeblkp->hblk_nextpa; 11548 hmeblkp = hmeblkp->hblk_next; 11549 } 11550 11551 SFMMU_HASH_UNLOCK(hmebp); 11552 if (hmebp++ == &uhme_hash[UHMEHASH_SZ]) 11553 hmebp = uhme_hash; 11554 } 11555 uhmehash_steal_hand = hmebp; 11556 11557 if (hmeblkp != NULL) 11558 break; 11559 11560 /* 11561 * in the worst case, look for a free one in the kernel 11562 * hash table. 11563 */ 11564 for (i = 0, hmebp = khme_hash; i <= KHMEHASH_SZ; i++) { 11565 SFMMU_HASH_LOCK(hmebp); 11566 hmeblkp = hmebp->hmeblkp; 11567 hblkpa = hmebp->hmeh_nextpa; 11568 prevpa = 0; 11569 pr_hblk = NULL; 11570 while (hmeblkp) { 11571 /* 11572 * check if it is free hmeblk 11573 */ 11574 if ((get_hblk_ttesz(hmeblkp) == size) && 11575 (hmeblkp->hblk_lckcnt == 0) && 11576 (hmeblkp->hblk_vcnt == 0) && 11577 (hmeblkp->hblk_hmecnt == 0)) { 11578 if (sfmmu_steal_this_hblk(hmebp, 11579 hmeblkp, hblkpa, prevpa, pr_hblk)) { 11580 break; 11581 } else { 11582 /* 11583 * Cannot fail since we have 11584 * hash lock. 11585 */ 11586 panic("fail to steal?"); 11587 } 11588 } 11589 11590 pr_hblk = hmeblkp; 11591 prevpa = hblkpa; 11592 hblkpa = hmeblkp->hblk_nextpa; 11593 hmeblkp = hmeblkp->hblk_next; 11594 } 11595 11596 SFMMU_HASH_UNLOCK(hmebp); 11597 if (hmebp++ == &khme_hash[KHMEHASH_SZ]) 11598 hmebp = khme_hash; 11599 } 11600 11601 if (hmeblkp != NULL) 11602 break; 11603 sfmmu_hblk_steal_twice++; 11604 } 11605 return (hmeblkp); 11606 } 11607 11608 /* 11609 * This routine does real work to prepare a hblk to be "stolen" by 11610 * unloading the mappings, updating shadow counts .... 11611 * It returns 1 if the block is ready to be reused (stolen), or 0 11612 * means the block cannot be stolen yet- pageunload is still working 11613 * on this hblk. 
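 * The caller must hold the hash bucket lock for hmebp; on success the
 * hmeblk has been unloaded and removed from its hash chain.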
11614 */ 11615 static int 11616 sfmmu_steal_this_hblk(struct hmehash_bucket *hmebp, struct hme_blk *hmeblkp, 11617 uint64_t hblkpa, uint64_t prevpa, struct hme_blk *pr_hblk) 11618 { 11619 int shw_size, vshift; 11620 struct hme_blk *shw_hblkp; 11621 caddr_t vaddr; 11622 uint_t shw_mask, newshw_mask; 11623 11624 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 11625 11626 /* 11627 * check if the hmeblk is free, unload if necessary 11628 */ 11629 if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) { 11630 sfmmu_t *sfmmup; 11631 demap_range_t dmr; 11632 11633 sfmmup = hblktosfmmu(hmeblkp); 11634 if (hmeblkp->hblk_shared || sfmmup->sfmmu_ismhat) { 11635 return (0); 11636 } 11637 DEMAP_RANGE_INIT(sfmmup, &dmr); 11638 (void) sfmmu_hblk_unload(sfmmup, hmeblkp, 11639 (caddr_t)get_hblk_base(hmeblkp), 11640 get_hblk_endaddr(hmeblkp), &dmr, HAT_UNLOAD); 11641 DEMAP_RANGE_FLUSH(&dmr); 11642 if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) { 11643 /* 11644 * Pageunload is working on the same hblk. 11645 */ 11646 return (0); 11647 } 11648 11649 sfmmu_hblk_steal_unload_count++; 11650 } 11651 11652 ASSERT(hmeblkp->hblk_lckcnt == 0); 11653 ASSERT(hmeblkp->hblk_vcnt == 0 && hmeblkp->hblk_hmecnt == 0); 11654 11655 sfmmu_hblk_hash_rm(hmebp, hmeblkp, prevpa, pr_hblk); 11656 hmeblkp->hblk_nextpa = hblkpa; 11657 11658 shw_hblkp = hmeblkp->hblk_shadow; 11659 if (shw_hblkp) { 11660 ASSERT(!hmeblkp->hblk_shared); 11661 shw_size = get_hblk_ttesz(shw_hblkp); 11662 vaddr = (caddr_t)get_hblk_base(hmeblkp); 11663 vshift = vaddr_to_vshift(shw_hblkp->hblk_tag, vaddr, shw_size); 11664 ASSERT(vshift < 8); 11665 /* 11666 * Atomically clear shadow mask bit 11667 */ 11668 do { 11669 shw_mask = shw_hblkp->hblk_shw_mask; 11670 ASSERT(shw_mask & (1 << vshift)); 11671 newshw_mask = shw_mask & ~(1 << vshift); 11672 newshw_mask = cas32(&shw_hblkp->hblk_shw_mask, 11673 shw_mask, newshw_mask); 11674 } while (newshw_mask != shw_mask); 11675 hmeblkp->hblk_shadow = NULL; 11676 } 11677 11678 /* 11679 * remove shadow bit if we are stealing an unused shadow hmeblk. 11680 * sfmmu_hblk_alloc needs it that way, will set shadow bit later if 11681 * we are indeed allocating a shadow hmeblk. 11682 */ 11683 hmeblkp->hblk_shw_bit = 0; 11684 11685 if (hmeblkp->hblk_shared) { 11686 sf_srd_t *srdp; 11687 sf_region_t *rgnp; 11688 uint_t rid; 11689 11690 srdp = hblktosrd(hmeblkp); 11691 ASSERT(srdp != NULL && srdp->srd_refcnt != 0); 11692 rid = hmeblkp->hblk_tag.htag_rid; 11693 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 11694 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 11695 rgnp = srdp->srd_hmergnp[rid]; 11696 ASSERT(rgnp != NULL); 11697 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid); 11698 hmeblkp->hblk_shared = 0; 11699 } 11700 11701 sfmmu_hblk_steal_count++; 11702 SFMMU_STAT(sf_steal_count); 11703 11704 return (1); 11705 } 11706 11707 struct hme_blk * 11708 sfmmu_hmetohblk(struct sf_hment *sfhme) 11709 { 11710 struct hme_blk *hmeblkp; 11711 struct sf_hment *sfhme0; 11712 struct hme_blk *hblk_dummy = 0; 11713 11714 /* 11715 * No dummy sf_hments, please. 11716 */ 11717 ASSERT(sfhme->hme_tte.ll != 0); 11718 11719 sfhme0 = sfhme - sfhme->hme_tte.tte_hmenum; 11720 hmeblkp = (struct hme_blk *)((uintptr_t)sfhme0 - 11721 (uintptr_t)&hblk_dummy->hblk_hme[0]); 11722 11723 return (hmeblkp); 11724 } 11725 11726 /* 11727 * On swapin, get appropriately sized TSB(s) and clear the HAT_SWAPPED flag. 11728 * If we can't get appropriately sized TSB(s), try for 8K TSB(s) using 11729 * KM_SLEEP allocation. 11730 * 11731 * Return 0 on success, -1 otherwise. 
11732 */ 11733 static void 11734 sfmmu_tsb_swapin(sfmmu_t *sfmmup, hatlock_t *hatlockp) 11735 { 11736 struct tsb_info *tsbinfop, *next; 11737 tsb_replace_rc_t rc; 11738 boolean_t gotfirst = B_FALSE; 11739 11740 ASSERT(sfmmup != ksfmmup); 11741 ASSERT(sfmmu_hat_lock_held(sfmmup)); 11742 11743 while (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPIN)) { 11744 cv_wait(&sfmmup->sfmmu_tsb_cv, HATLOCK_MUTEXP(hatlockp)); 11745 } 11746 11747 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) { 11748 SFMMU_FLAGS_SET(sfmmup, HAT_SWAPIN); 11749 } else { 11750 return; 11751 } 11752 11753 ASSERT(sfmmup->sfmmu_tsb != NULL); 11754 11755 /* 11756 * Loop over all tsbinfo's replacing them with ones that actually have 11757 * a TSB. If any of the replacements ever fail, bail out of the loop. 11758 */ 11759 for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL; tsbinfop = next) { 11760 ASSERT(tsbinfop->tsb_flags & TSB_SWAPPED); 11761 next = tsbinfop->tsb_next; 11762 rc = sfmmu_replace_tsb(sfmmup, tsbinfop, tsbinfop->tsb_szc, 11763 hatlockp, TSB_SWAPIN); 11764 if (rc != TSB_SUCCESS) { 11765 break; 11766 } 11767 gotfirst = B_TRUE; 11768 } 11769 11770 switch (rc) { 11771 case TSB_SUCCESS: 11772 SFMMU_FLAGS_CLEAR(sfmmup, HAT_SWAPPED|HAT_SWAPIN); 11773 cv_broadcast(&sfmmup->sfmmu_tsb_cv); 11774 return; 11775 case TSB_LOSTRACE: 11776 break; 11777 case TSB_ALLOCFAIL: 11778 break; 11779 default: 11780 panic("sfmmu_replace_tsb returned unrecognized failure code " 11781 "%d", rc); 11782 } 11783 11784 /* 11785 * In this case, we failed to get one of our TSBs. If we failed to 11786 * get the first TSB, get one of minimum size (8KB). Walk the list 11787 * and throw away the tsbinfos, starting where the allocation failed; 11788 * we can get by with just one TSB as long as we don't leave the 11789 * SWAPPED tsbinfo structures lying around. 11790 */ 11791 tsbinfop = sfmmup->sfmmu_tsb; 11792 next = tsbinfop->tsb_next; 11793 tsbinfop->tsb_next = NULL; 11794 11795 sfmmu_hat_exit(hatlockp); 11796 for (tsbinfop = next; tsbinfop != NULL; tsbinfop = next) { 11797 next = tsbinfop->tsb_next; 11798 sfmmu_tsbinfo_free(tsbinfop); 11799 } 11800 hatlockp = sfmmu_hat_enter(sfmmup); 11801 11802 /* 11803 * If we don't have any TSBs, get a single 8K TSB for 8K, 64K and 512K 11804 * pages. 11805 */ 11806 if (!gotfirst) { 11807 tsbinfop = sfmmup->sfmmu_tsb; 11808 rc = sfmmu_replace_tsb(sfmmup, tsbinfop, TSB_MIN_SZCODE, 11809 hatlockp, TSB_SWAPIN | TSB_FORCEALLOC); 11810 ASSERT(rc == TSB_SUCCESS); 11811 } 11812 11813 SFMMU_FLAGS_CLEAR(sfmmup, HAT_SWAPPED|HAT_SWAPIN); 11814 cv_broadcast(&sfmmup->sfmmu_tsb_cv); 11815 } 11816 11817 static int 11818 sfmmu_is_rgnva(sf_srd_t *srdp, caddr_t addr, ulong_t w, ulong_t bmw) 11819 { 11820 ulong_t bix = 0; 11821 uint_t rid; 11822 sf_region_t *rgnp; 11823 11824 ASSERT(srdp != NULL); 11825 ASSERT(srdp->srd_refcnt != 0); 11826 11827 w <<= BT_ULSHIFT; 11828 while (bmw) { 11829 if (!(bmw & 0x1)) { 11830 bix++; 11831 bmw >>= 1; 11832 continue; 11833 } 11834 rid = w | bix; 11835 rgnp = srdp->srd_hmergnp[rid]; 11836 ASSERT(rgnp->rgn_refcnt > 0); 11837 ASSERT(rgnp->rgn_id == rid); 11838 if (addr < rgnp->rgn_saddr || 11839 addr >= (rgnp->rgn_saddr + rgnp->rgn_size)) { 11840 bix++; 11841 bmw >>= 1; 11842 } else { 11843 return (1); 11844 } 11845 } 11846 return (0); 11847 } 11848 11849 /* 11850 * Handle exceptions for low level tsb_handler. 11851 * 11852 * There are many scenarios that could land us here: 11853 * 11854 * If the context is invalid we land here. 
The context can be invalid
11855 * for 3 reasons: 1) we couldn't allocate a new context and now need to
11856 * perform a wrap-around operation in order to allocate a new context;
11857 * 2) the context was invalidated to change pagesize programming; 3) the ISM
11858 * or TSB configuration is changing for this process and we are forced in
11859 * here to do a synchronization operation. If the context is valid we can
11860 * be here from the window trap handler, in which case we just call trap()
11861 * to handle the fault.
11862 *
11863 * Note that the process will run in INVALID_CONTEXT before
11864 * faulting into here and subsequently loading the MMU registers
11865 * (including the TSB base register) associated with this process.
11866 * For this reason, the trap handlers must all test for
11867 * INVALID_CONTEXT before attempting to access any registers other
11868 * than the context registers.
11869 */
11870 void
11871 sfmmu_tsbmiss_exception(struct regs *rp, uintptr_t tagaccess, uint_t traptype)
11872 {
11873 	sfmmu_t *sfmmup, *shsfmmup;
11874 	uint_t ctxtype;
11875 	klwp_id_t lwp;
11876 	char lwp_save_state;
11877 	hatlock_t *hatlockp, *shatlockp;
11878 	struct tsb_info *tsbinfop;
11879 	struct tsbmiss *tsbmp;
11880 	sf_scd_t *scdp;
11881 
11882 	SFMMU_STAT(sf_tsb_exceptions);
11883 	SFMMU_MMU_STAT(mmu_tsb_exceptions);
11884 	sfmmup = astosfmmu(curthread->t_procp->p_as);
11885 	/*
11886 	 * Note that in sun4u the tagaccess register contains the ctxnum
11887 	 * while sun4v passes the ctxtype in the tagaccess register.
11888 	 */
11889 	ctxtype = tagaccess & TAGACC_CTX_MASK;
11890 
11891 	ASSERT(sfmmup != ksfmmup && ctxtype != KCONTEXT);
11892 	ASSERT(sfmmup->sfmmu_ismhat == 0);
11893 	ASSERT(!SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED) ||
11894 	    ctxtype == INVALID_CONTEXT);
11895 
11896 	if (ctxtype != INVALID_CONTEXT && traptype != T_DATA_PROT) {
11897 		/*
11898 		 * We may land here because the shme bitmap and pagesize
11899 		 * flags are updated lazily in the tsbmiss area on other cpus.
11900 		 * If we detect here that the tsbmiss area is out of sync with
11901 		 * the sfmmu, update it and retry the trapped instruction.
11902 		 * Otherwise call trap().
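		 *
		 * The per-cpu tsbmiss area caches this hat's sfmmu_tteflags,
		 * sfmmu_rtteflags and hme-region bitmap for use by the
		 * TL>0 miss handlers; refreshing those cached copies here
		 * and retrying is what allows the miss handlers to read
		 * them without taking the hat lock.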
11903 */ 11904 int ret = 0; 11905 uchar_t tteflag_mask = (1 << TTE64K) | (1 << TTE8K); 11906 caddr_t addr = (caddr_t)(tagaccess & TAGACC_VADDR_MASK); 11907 11908 /* 11909 * Must set lwp state to LWP_SYS before 11910 * trying to acquire any adaptive lock 11911 */ 11912 lwp = ttolwp(curthread); 11913 ASSERT(lwp); 11914 lwp_save_state = lwp->lwp_state; 11915 lwp->lwp_state = LWP_SYS; 11916 11917 hatlockp = sfmmu_hat_enter(sfmmup); 11918 kpreempt_disable(); 11919 tsbmp = &tsbmiss_area[CPU->cpu_id]; 11920 ASSERT(sfmmup == tsbmp->usfmmup); 11921 if (((tsbmp->uhat_tteflags ^ sfmmup->sfmmu_tteflags) & 11922 ~tteflag_mask) || 11923 ((tsbmp->uhat_rtteflags ^ sfmmup->sfmmu_rtteflags) & 11924 ~tteflag_mask)) { 11925 tsbmp->uhat_tteflags = sfmmup->sfmmu_tteflags; 11926 tsbmp->uhat_rtteflags = sfmmup->sfmmu_rtteflags; 11927 ret = 1; 11928 } 11929 if (sfmmup->sfmmu_srdp != NULL) { 11930 ulong_t *sm = sfmmup->sfmmu_hmeregion_map.bitmap; 11931 ulong_t *tm = tsbmp->shmermap; 11932 ulong_t i; 11933 for (i = 0; i < SFMMU_HMERGNMAP_WORDS; i++) { 11934 ulong_t d = tm[i] ^ sm[i]; 11935 if (d) { 11936 if (d & sm[i]) { 11937 if (!ret && sfmmu_is_rgnva( 11938 sfmmup->sfmmu_srdp, 11939 addr, i, d & sm[i])) { 11940 ret = 1; 11941 } 11942 } 11943 tm[i] = sm[i]; 11944 } 11945 } 11946 } 11947 kpreempt_enable(); 11948 sfmmu_hat_exit(hatlockp); 11949 lwp->lwp_state = lwp_save_state; 11950 if (ret) { 11951 return; 11952 } 11953 } else if (ctxtype == INVALID_CONTEXT) { 11954 /* 11955 * First, make sure we come out of here with a valid ctx, 11956 * since if we don't get one we'll simply loop on the 11957 * faulting instruction. 11958 * 11959 * If the ISM mappings are changing, the TSB is relocated, 11960 * the process is swapped, the process is joining SCD or 11961 * leaving SCD or shared regions we serialize behind the 11962 * controlling thread with hat lock, sfmmu_flags and 11963 * sfmmu_tsb_cv condition variable. 11964 */ 11965 11966 /* 11967 * Must set lwp state to LWP_SYS before 11968 * trying to acquire any adaptive lock 11969 */ 11970 lwp = ttolwp(curthread); 11971 ASSERT(lwp); 11972 lwp_save_state = lwp->lwp_state; 11973 lwp->lwp_state = LWP_SYS; 11974 11975 hatlockp = sfmmu_hat_enter(sfmmup); 11976 retry: 11977 if ((scdp = sfmmup->sfmmu_scdp) != NULL) { 11978 shsfmmup = scdp->scd_sfmmup; 11979 ASSERT(shsfmmup != NULL); 11980 11981 for (tsbinfop = shsfmmup->sfmmu_tsb; tsbinfop != NULL; 11982 tsbinfop = tsbinfop->tsb_next) { 11983 if (tsbinfop->tsb_flags & TSB_RELOC_FLAG) { 11984 /* drop the private hat lock */ 11985 sfmmu_hat_exit(hatlockp); 11986 /* acquire the shared hat lock */ 11987 shatlockp = sfmmu_hat_enter(shsfmmup); 11988 /* 11989 * recheck to see if anything changed 11990 * after we drop the private hat lock. 11991 */ 11992 if (sfmmup->sfmmu_scdp == scdp && 11993 shsfmmup == scdp->scd_sfmmup) { 11994 sfmmu_tsb_chk_reloc(shsfmmup, 11995 shatlockp); 11996 } 11997 sfmmu_hat_exit(shatlockp); 11998 hatlockp = sfmmu_hat_enter(sfmmup); 11999 goto retry; 12000 } 12001 } 12002 } 12003 12004 for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL; 12005 tsbinfop = tsbinfop->tsb_next) { 12006 if (tsbinfop->tsb_flags & TSB_RELOC_FLAG) { 12007 cv_wait(&sfmmup->sfmmu_tsb_cv, 12008 HATLOCK_MUTEXP(hatlockp)); 12009 goto retry; 12010 } 12011 } 12012 12013 /* 12014 * Wait for ISM maps to be updated. 12015 */ 12016 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)) { 12017 cv_wait(&sfmmup->sfmmu_tsb_cv, 12018 HATLOCK_MUTEXP(hatlockp)); 12019 goto retry; 12020 } 12021 12022 /* Is this process joining an SCD? 
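		 * If so, finish the join now, before a context is set up
		 * below.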
*/ 12023 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD)) { 12024 /* 12025 * Flush private TSB and setup shared TSB. 12026 * sfmmu_finish_join_scd() does not drop the 12027 * hat lock. 12028 */ 12029 sfmmu_finish_join_scd(sfmmup); 12030 SFMMU_FLAGS_CLEAR(sfmmup, HAT_JOIN_SCD); 12031 } 12032 12033 /* 12034 * If we're swapping in, get TSB(s). Note that we must do 12035 * this before we get a ctx or load the MMU state. Once 12036 * we swap in we have to recheck to make sure the TSB(s) and 12037 * ISM mappings didn't change while we slept. 12038 */ 12039 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) { 12040 sfmmu_tsb_swapin(sfmmup, hatlockp); 12041 goto retry; 12042 } 12043 12044 sfmmu_get_ctx(sfmmup); 12045 12046 sfmmu_hat_exit(hatlockp); 12047 /* 12048 * Must restore lwp_state if not calling 12049 * trap() for further processing. Restore 12050 * it anyway. 12051 */ 12052 lwp->lwp_state = lwp_save_state; 12053 return; 12054 } 12055 trap(rp, (caddr_t)tagaccess, traptype, 0); 12056 } 12057 12058 static void 12059 sfmmu_tsb_chk_reloc(sfmmu_t *sfmmup, hatlock_t *hatlockp) 12060 { 12061 struct tsb_info *tp; 12062 12063 ASSERT(sfmmu_hat_lock_held(sfmmup)); 12064 12065 for (tp = sfmmup->sfmmu_tsb; tp != NULL; tp = tp->tsb_next) { 12066 if (tp->tsb_flags & TSB_RELOC_FLAG) { 12067 cv_wait(&sfmmup->sfmmu_tsb_cv, 12068 HATLOCK_MUTEXP(hatlockp)); 12069 break; 12070 } 12071 } 12072 } 12073 12074 /* 12075 * sfmmu_vatopfn_suspended is called from GET_TTE when TL=0 and 12076 * TTE_SUSPENDED bit set in tte we block on aquiring a page lock 12077 * rather than spinning to avoid send mondo timeouts with 12078 * interrupts enabled. When the lock is acquired it is immediately 12079 * released and we return back to sfmmu_vatopfn just after 12080 * the GET_TTE call. 12081 */ 12082 void 12083 sfmmu_vatopfn_suspended(caddr_t vaddr, sfmmu_t *sfmmu, tte_t *ttep) 12084 { 12085 struct page **pp; 12086 12087 (void) as_pagelock(sfmmu->sfmmu_as, &pp, vaddr, TTE_CSZ(ttep), S_WRITE); 12088 as_pageunlock(sfmmu->sfmmu_as, pp, vaddr, TTE_CSZ(ttep), S_WRITE); 12089 } 12090 12091 /* 12092 * sfmmu_tsbmiss_suspended is called from GET_TTE when TL>0 and 12093 * TTE_SUSPENDED bit set in tte. We do this so that we can handle 12094 * cross traps which cannot be handled while spinning in the 12095 * trap handlers. Simply enter and exit the kpr_suspendlock spin 12096 * mutex, which is held by the holder of the suspend bit, and then 12097 * retry the trapped instruction after unwinding. 12098 */ 12099 /*ARGSUSED*/ 12100 void 12101 sfmmu_tsbmiss_suspended(struct regs *rp, uintptr_t tagacc, uint_t traptype) 12102 { 12103 ASSERT(curthread != kreloc_thread); 12104 mutex_enter(&kpr_suspendlock); 12105 mutex_exit(&kpr_suspendlock); 12106 } 12107 12108 /* 12109 * This routine could be optimized to reduce the number of xcalls by flushing 12110 * the entire TLBs if region reference count is above some threshold but the 12111 * tradeoff will depend on the size of the TLB. So for now flush the specific 12112 * page a context at a time. 12113 * 12114 * If uselocks is 0 then it's called after all cpus were captured and all the 12115 * hat locks were taken. In this case don't take the region lock by relying on 12116 * the order of list region update operations in hat_join_region(), 12117 * hat_leave_region() and hat_dup_region(). The ordering in those routines 12118 * guarantees that list is always forward walkable and reaches active sfmmus 12119 * regardless of where xc_attention() captures a cpu. 
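 *
 * The cpuset handed back to the caller is the union of the cpus that
 * were actually cross-called here, so the caller can later xt_sync()
 * against exactly that set.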
12120 */ 12121 cpuset_t 12122 sfmmu_rgntlb_demap(caddr_t addr, sf_region_t *rgnp, 12123 struct hme_blk *hmeblkp, int uselocks) 12124 { 12125 sfmmu_t *sfmmup; 12126 cpuset_t cpuset; 12127 cpuset_t rcpuset; 12128 hatlock_t *hatlockp; 12129 uint_t rid = rgnp->rgn_id; 12130 sf_rgn_link_t *rlink; 12131 sf_scd_t *scdp; 12132 12133 ASSERT(hmeblkp->hblk_shared); 12134 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 12135 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 12136 12137 CPUSET_ZERO(rcpuset); 12138 if (uselocks) { 12139 mutex_enter(&rgnp->rgn_mutex); 12140 } 12141 sfmmup = rgnp->rgn_sfmmu_head; 12142 while (sfmmup != NULL) { 12143 if (uselocks) { 12144 hatlockp = sfmmu_hat_enter(sfmmup); 12145 } 12146 12147 /* 12148 * When an SCD is created the SCD hat is linked on the sfmmu 12149 * region lists for each hme region which is part of the 12150 * SCD. If we find an SCD hat, when walking these lists, 12151 * then we flush the shared TSBs, if we find a private hat, 12152 * which is part of an SCD, but where the region 12153 * is not part of the SCD then we flush the private TSBs. 12154 */ 12155 if (!sfmmup->sfmmu_scdhat && sfmmup->sfmmu_scdp != NULL && 12156 !SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD)) { 12157 scdp = sfmmup->sfmmu_scdp; 12158 if (SF_RGNMAP_TEST(scdp->scd_hmeregion_map, rid)) { 12159 if (uselocks) { 12160 sfmmu_hat_exit(hatlockp); 12161 } 12162 goto next; 12163 } 12164 } 12165 12166 SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, 0); 12167 12168 kpreempt_disable(); 12169 cpuset = sfmmup->sfmmu_cpusran; 12170 CPUSET_AND(cpuset, cpu_ready_set); 12171 CPUSET_DEL(cpuset, CPU->cpu_id); 12172 SFMMU_XCALL_STATS(sfmmup); 12173 xt_some(cpuset, vtag_flushpage_tl1, 12174 (uint64_t)addr, (uint64_t)sfmmup); 12175 vtag_flushpage(addr, (uint64_t)sfmmup); 12176 if (uselocks) { 12177 sfmmu_hat_exit(hatlockp); 12178 } 12179 kpreempt_enable(); 12180 CPUSET_OR(rcpuset, cpuset); 12181 12182 next: 12183 /* LINTED: constant in conditional context */ 12184 SFMMU_HMERID2RLINKP(sfmmup, rid, rlink, 0, 0); 12185 ASSERT(rlink != NULL); 12186 sfmmup = rlink->next; 12187 } 12188 if (uselocks) { 12189 mutex_exit(&rgnp->rgn_mutex); 12190 } 12191 return (rcpuset); 12192 } 12193 12194 /* 12195 * This routine takes an sfmmu pointer and the va for an adddress in an 12196 * ISM region as input and returns the corresponding region id in ism_rid. 12197 * The return value of 1 indicates that a region has been found and ism_rid 12198 * is valid, otherwise 0 is returned. 12199 */ 12200 static int 12201 find_ism_rid(sfmmu_t *sfmmup, sfmmu_t *ism_sfmmup, caddr_t va, uint_t *ism_rid) 12202 { 12203 ism_blk_t *ism_blkp; 12204 int i; 12205 ism_map_t *ism_map; 12206 #ifdef DEBUG 12207 struct hat *ism_hatid; 12208 #endif 12209 ASSERT(sfmmu_hat_lock_held(sfmmup)); 12210 12211 ism_blkp = sfmmup->sfmmu_iblk; 12212 while (ism_blkp != NULL) { 12213 ism_map = ism_blkp->iblk_maps; 12214 for (i = 0; i < ISM_MAP_SLOTS && ism_map[i].imap_ismhat; i++) { 12215 if ((va >= ism_start(ism_map[i])) && 12216 (va < ism_end(ism_map[i]))) { 12217 12218 *ism_rid = ism_map[i].imap_rid; 12219 #ifdef DEBUG 12220 ism_hatid = ism_map[i].imap_ismhat; 12221 ASSERT(ism_hatid == ism_sfmmup); 12222 ASSERT(ism_hatid->sfmmu_ismhat); 12223 #endif 12224 return (1); 12225 } 12226 } 12227 ism_blkp = ism_blkp->iblk_next; 12228 } 12229 return (0); 12230 } 12231 12232 /* 12233 * Special routine to flush out ism mappings- TSBs, TLBs and D-caches. 12234 * This routine may be called with all cpu's captured. 
Therefore, the 12235 * caller is responsible for holding all locks and disabling kernel 12236 * preemption. 12237 */ 12238 /* ARGSUSED */ 12239 static void 12240 sfmmu_ismtlbcache_demap(caddr_t addr, sfmmu_t *ism_sfmmup, 12241 struct hme_blk *hmeblkp, pfn_t pfnum, int cache_flush_flag) 12242 { 12243 cpuset_t cpuset; 12244 caddr_t va; 12245 ism_ment_t *ment; 12246 sfmmu_t *sfmmup; 12247 #ifdef VAC 12248 int vcolor; 12249 #endif 12250 12251 sf_scd_t *scdp; 12252 uint_t ism_rid; 12253 12254 ASSERT(!hmeblkp->hblk_shared); 12255 /* 12256 * Walk the ism_hat's mapping list and flush the page 12257 * from every hat sharing this ism_hat. This routine 12258 * may be called while all cpu's have been captured. 12259 * Therefore we can't attempt to grab any locks. For now 12260 * this means we will protect the ism mapping list under 12261 * a single lock which will be grabbed by the caller. 12262 * If hat_share/unshare scalibility becomes a performance 12263 * problem then we may need to re-think ism mapping list locking. 12264 */ 12265 ASSERT(ism_sfmmup->sfmmu_ismhat); 12266 ASSERT(MUTEX_HELD(&ism_mlist_lock)); 12267 addr = addr - ISMID_STARTADDR; 12268 12269 for (ment = ism_sfmmup->sfmmu_iment; ment; ment = ment->iment_next) { 12270 12271 sfmmup = ment->iment_hat; 12272 12273 va = ment->iment_base_va; 12274 va = (caddr_t)((uintptr_t)va + (uintptr_t)addr); 12275 12276 /* 12277 * When an SCD is created the SCD hat is linked on the ism 12278 * mapping lists for each ISM segment which is part of the 12279 * SCD. If we find an SCD hat, when walking these lists, 12280 * then we flush the shared TSBs, if we find a private hat, 12281 * which is part of an SCD, but where the region 12282 * corresponding to this va is not part of the SCD then we 12283 * flush the private TSBs. 12284 */ 12285 if (!sfmmup->sfmmu_scdhat && sfmmup->sfmmu_scdp != NULL && 12286 !SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD) && 12287 !SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)) { 12288 if (!find_ism_rid(sfmmup, ism_sfmmup, va, 12289 &ism_rid)) { 12290 cmn_err(CE_PANIC, 12291 "can't find matching ISM rid!"); 12292 } 12293 12294 scdp = sfmmup->sfmmu_scdp; 12295 if (SFMMU_IS_ISMRID_VALID(ism_rid) && 12296 SF_RGNMAP_TEST(scdp->scd_ismregion_map, 12297 ism_rid)) { 12298 continue; 12299 } 12300 } 12301 SFMMU_UNLOAD_TSB(va, sfmmup, hmeblkp, 1); 12302 12303 cpuset = sfmmup->sfmmu_cpusran; 12304 CPUSET_AND(cpuset, cpu_ready_set); 12305 CPUSET_DEL(cpuset, CPU->cpu_id); 12306 SFMMU_XCALL_STATS(sfmmup); 12307 xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)va, 12308 (uint64_t)sfmmup); 12309 vtag_flushpage(va, (uint64_t)sfmmup); 12310 12311 #ifdef VAC 12312 /* 12313 * Flush D$ 12314 * When flushing D$ we must flush all 12315 * cpu's. See sfmmu_cache_flush(). 12316 */ 12317 if (cache_flush_flag == CACHE_FLUSH) { 12318 cpuset = cpu_ready_set; 12319 CPUSET_DEL(cpuset, CPU->cpu_id); 12320 12321 SFMMU_XCALL_STATS(sfmmup); 12322 vcolor = addr_to_vcolor(va); 12323 xt_some(cpuset, vac_flushpage_tl1, pfnum, vcolor); 12324 vac_flushpage(pfnum, vcolor); 12325 } 12326 #endif /* VAC */ 12327 } 12328 } 12329 12330 /* 12331 * Demaps the TSB, CPU caches, and flushes all TLBs on all CPUs of 12332 * a particular virtual address and ctx. If noflush is set we do not 12333 * flush the TLB/TSB. This function may or may not be called with the 12334 * HAT lock held. 
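 *
 * On VAC machines, cache_flush_flag (CACHE_FLUSH) additionally requests
 * a D$ flush of the page, and cpu_flag selects whether that flush goes
 * to all ready cpus (FLUSH_ALL_CPUS) or only to the cpus in this hat's
 * sfmmu_cpusran set.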
12335 */ 12336 static void 12337 sfmmu_tlbcache_demap(caddr_t addr, sfmmu_t *sfmmup, struct hme_blk *hmeblkp, 12338 pfn_t pfnum, int tlb_noflush, int cpu_flag, int cache_flush_flag, 12339 int hat_lock_held) 12340 { 12341 #ifdef VAC 12342 int vcolor; 12343 #endif 12344 cpuset_t cpuset; 12345 hatlock_t *hatlockp; 12346 12347 ASSERT(!hmeblkp->hblk_shared); 12348 12349 #if defined(lint) && !defined(VAC) 12350 pfnum = pfnum; 12351 cpu_flag = cpu_flag; 12352 cache_flush_flag = cache_flush_flag; 12353 #endif 12354 12355 /* 12356 * There is no longer a need to protect against ctx being 12357 * stolen here since we don't store the ctx in the TSB anymore. 12358 */ 12359 #ifdef VAC 12360 vcolor = addr_to_vcolor(addr); 12361 #endif 12362 12363 /* 12364 * We must hold the hat lock during the flush of TLB, 12365 * to avoid a race with sfmmu_invalidate_ctx(), where 12366 * sfmmu_cnum on a MMU could be set to INVALID_CONTEXT, 12367 * causing TLB demap routine to skip flush on that MMU. 12368 * If the context on a MMU has already been set to 12369 * INVALID_CONTEXT, we just get an extra flush on 12370 * that MMU. 12371 */ 12372 if (!hat_lock_held && !tlb_noflush) 12373 hatlockp = sfmmu_hat_enter(sfmmup); 12374 12375 kpreempt_disable(); 12376 if (!tlb_noflush) { 12377 /* 12378 * Flush the TSB and TLB. 12379 */ 12380 SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, 0); 12381 12382 cpuset = sfmmup->sfmmu_cpusran; 12383 CPUSET_AND(cpuset, cpu_ready_set); 12384 CPUSET_DEL(cpuset, CPU->cpu_id); 12385 12386 SFMMU_XCALL_STATS(sfmmup); 12387 12388 xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)addr, 12389 (uint64_t)sfmmup); 12390 12391 vtag_flushpage(addr, (uint64_t)sfmmup); 12392 } 12393 12394 if (!hat_lock_held && !tlb_noflush) 12395 sfmmu_hat_exit(hatlockp); 12396 12397 #ifdef VAC 12398 /* 12399 * Flush the D$ 12400 * 12401 * Even if the ctx is stolen, we need to flush the 12402 * cache. Our ctx stealer only flushes the TLBs. 12403 */ 12404 if (cache_flush_flag == CACHE_FLUSH) { 12405 if (cpu_flag & FLUSH_ALL_CPUS) { 12406 cpuset = cpu_ready_set; 12407 } else { 12408 cpuset = sfmmup->sfmmu_cpusran; 12409 CPUSET_AND(cpuset, cpu_ready_set); 12410 } 12411 CPUSET_DEL(cpuset, CPU->cpu_id); 12412 SFMMU_XCALL_STATS(sfmmup); 12413 xt_some(cpuset, vac_flushpage_tl1, pfnum, vcolor); 12414 vac_flushpage(pfnum, vcolor); 12415 } 12416 #endif /* VAC */ 12417 kpreempt_enable(); 12418 } 12419 12420 /* 12421 * Demaps the TSB and flushes all TLBs on all cpus for a particular virtual 12422 * address and ctx. If noflush is set we do not currently do anything. 12423 * This function may or may not be called with the HAT lock held. 12424 */ 12425 static void 12426 sfmmu_tlb_demap(caddr_t addr, sfmmu_t *sfmmup, struct hme_blk *hmeblkp, 12427 int tlb_noflush, int hat_lock_held) 12428 { 12429 cpuset_t cpuset; 12430 hatlock_t *hatlockp; 12431 12432 ASSERT(!hmeblkp->hblk_shared); 12433 12434 /* 12435 * If the process is exiting we have nothing to do. 12436 */ 12437 if (tlb_noflush) 12438 return; 12439 12440 /* 12441 * Flush TSB. 
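	 * As in sfmmu_tlbcache_demap(), the hat lock is taken around the
	 * TSB unload and TLB cross-call to avoid racing with
	 * sfmmu_invalidate_ctx().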
12442 */ 12443 if (!hat_lock_held) 12444 hatlockp = sfmmu_hat_enter(sfmmup); 12445 SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, 0); 12446 12447 kpreempt_disable(); 12448 12449 cpuset = sfmmup->sfmmu_cpusran; 12450 CPUSET_AND(cpuset, cpu_ready_set); 12451 CPUSET_DEL(cpuset, CPU->cpu_id); 12452 12453 SFMMU_XCALL_STATS(sfmmup); 12454 xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)addr, (uint64_t)sfmmup); 12455 12456 vtag_flushpage(addr, (uint64_t)sfmmup); 12457 12458 if (!hat_lock_held) 12459 sfmmu_hat_exit(hatlockp); 12460 12461 kpreempt_enable(); 12462 12463 } 12464 12465 /* 12466 * Special case of sfmmu_tlb_demap for MMU_PAGESIZE hblks. Use the xcall 12467 * call handler that can flush a range of pages to save on xcalls. 12468 */ 12469 static int sfmmu_xcall_save; 12470 12471 /* 12472 * this routine is never used for demaping addresses backed by SRD hmeblks. 12473 */ 12474 static void 12475 sfmmu_tlb_range_demap(demap_range_t *dmrp) 12476 { 12477 sfmmu_t *sfmmup = dmrp->dmr_sfmmup; 12478 hatlock_t *hatlockp; 12479 cpuset_t cpuset; 12480 uint64_t sfmmu_pgcnt; 12481 pgcnt_t pgcnt = 0; 12482 int pgunload = 0; 12483 int dirtypg = 0; 12484 caddr_t addr = dmrp->dmr_addr; 12485 caddr_t eaddr; 12486 uint64_t bitvec = dmrp->dmr_bitvec; 12487 12488 ASSERT(bitvec & 1); 12489 12490 /* 12491 * Flush TSB and calculate number of pages to flush. 12492 */ 12493 while (bitvec != 0) { 12494 dirtypg = 0; 12495 /* 12496 * Find the first page to flush and then count how many 12497 * pages there are after it that also need to be flushed. 12498 * This way the number of TSB flushes is minimized. 12499 */ 12500 while ((bitvec & 1) == 0) { 12501 pgcnt++; 12502 addr += MMU_PAGESIZE; 12503 bitvec >>= 1; 12504 } 12505 while (bitvec & 1) { 12506 dirtypg++; 12507 bitvec >>= 1; 12508 } 12509 eaddr = addr + ptob(dirtypg); 12510 hatlockp = sfmmu_hat_enter(sfmmup); 12511 sfmmu_unload_tsb_range(sfmmup, addr, eaddr, TTE8K); 12512 sfmmu_hat_exit(hatlockp); 12513 pgunload += dirtypg; 12514 addr = eaddr; 12515 pgcnt += dirtypg; 12516 } 12517 12518 ASSERT((pgcnt<<MMU_PAGESHIFT) <= dmrp->dmr_endaddr - dmrp->dmr_addr); 12519 if (sfmmup->sfmmu_free == 0) { 12520 addr = dmrp->dmr_addr; 12521 bitvec = dmrp->dmr_bitvec; 12522 12523 /* 12524 * make sure it has SFMMU_PGCNT_SHIFT bits only, 12525 * as it will be used to pack argument for xt_some 12526 */ 12527 ASSERT((pgcnt > 0) && 12528 (pgcnt <= (1 << SFMMU_PGCNT_SHIFT))); 12529 12530 /* 12531 * Encode pgcnt as (pgcnt -1 ), and pass (pgcnt - 1) in 12532 * the low 6 bits of sfmmup. This is doable since pgcnt 12533 * always >= 1. 12534 */ 12535 ASSERT(!((uint64_t)sfmmup & SFMMU_PGCNT_MASK)); 12536 sfmmu_pgcnt = (uint64_t)sfmmup | 12537 ((pgcnt - 1) & SFMMU_PGCNT_MASK); 12538 12539 /* 12540 * We must hold the hat lock during the flush of TLB, 12541 * to avoid a race with sfmmu_invalidate_ctx(), where 12542 * sfmmu_cnum on a MMU could be set to INVALID_CONTEXT, 12543 * causing TLB demap routine to skip flush on that MMU. 12544 * If the context on a MMU has already been set to 12545 * INVALID_CONTEXT, we just get an extra flush on 12546 * that MMU. 
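		 *
		 * The packing of (pgcnt - 1) into the pointer above works
		 * because the low SFMMU_PGCNT_SHIFT bits of an sfmmu_t
		 * pointer are clear (see the ASSERT above), so the count
		 * can ride along in those bits.
		 */
#if 0
		/*
		 * Illustrative sketch only, never compiled: how the count
		 * is packed and later recovered, assuming SFMMU_PGCNT_MASK
		 * is ((1 << SFMMU_PGCNT_SHIFT) - 1), i.e. the "low 6 bits"
		 * mentioned above.
		 */
		uint64_t packed;
		pgcnt_t unpacked_cnt;
		sfmmu_t *unpacked_hat;

		packed = (uint64_t)sfmmup | ((pgcnt - 1) & SFMMU_PGCNT_MASK);
		unpacked_cnt = (packed & SFMMU_PGCNT_MASK) + 1;
		unpacked_hat = (sfmmu_t *)(packed &
		    ~(uint64_t)SFMMU_PGCNT_MASK);
#endif
		/*
		 * The cross-call below hands the packed value to the TL1
		 * flush handler as a single argument.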
12547 */ 12548 hatlockp = sfmmu_hat_enter(sfmmup); 12549 kpreempt_disable(); 12550 12551 cpuset = sfmmup->sfmmu_cpusran; 12552 CPUSET_AND(cpuset, cpu_ready_set); 12553 CPUSET_DEL(cpuset, CPU->cpu_id); 12554 12555 SFMMU_XCALL_STATS(sfmmup); 12556 xt_some(cpuset, vtag_flush_pgcnt_tl1, (uint64_t)addr, 12557 sfmmu_pgcnt); 12558 12559 for (; bitvec != 0; bitvec >>= 1) { 12560 if (bitvec & 1) 12561 vtag_flushpage(addr, (uint64_t)sfmmup); 12562 addr += MMU_PAGESIZE; 12563 } 12564 kpreempt_enable(); 12565 sfmmu_hat_exit(hatlockp); 12566 12567 sfmmu_xcall_save += (pgunload-1); 12568 } 12569 dmrp->dmr_bitvec = 0; 12570 } 12571 12572 /* 12573 * In cases where we need to synchronize with TLB/TSB miss trap 12574 * handlers, _and_ need to flush the TLB, it's a lot easier to 12575 * throw away the context from the process than to do a 12576 * special song and dance to keep things consistent for the 12577 * handlers. 12578 * 12579 * Since the process suddenly ends up without a context and our caller 12580 * holds the hat lock, threads that fault after this function is called 12581 * will pile up on the lock. We can then do whatever we need to 12582 * atomically from the context of the caller. The first blocked thread 12583 * to resume executing will get the process a new context, and the 12584 * process will resume executing. 12585 * 12586 * One added advantage of this approach is that on MMUs that 12587 * support a "flush all" operation, we will delay the flush until 12588 * cnum wrap-around, and then flush the TLB one time. This 12589 * is rather rare, so it's a lot less expensive than making 8000 12590 * x-calls to flush the TLB 8000 times. 12591 * 12592 * A per-process (PP) lock is used to synchronize ctx allocations in 12593 * resume() and ctx invalidations here. 12594 */ 12595 static void 12596 sfmmu_invalidate_ctx(sfmmu_t *sfmmup) 12597 { 12598 cpuset_t cpuset; 12599 int cnum, currcnum; 12600 mmu_ctx_t *mmu_ctxp; 12601 int i; 12602 uint_t pstate_save; 12603 12604 SFMMU_STAT(sf_ctx_inv); 12605 12606 ASSERT(sfmmu_hat_lock_held(sfmmup)); 12607 ASSERT(sfmmup != ksfmmup); 12608 12609 kpreempt_disable(); 12610 12611 mmu_ctxp = CPU_MMU_CTXP(CPU); 12612 ASSERT(mmu_ctxp); 12613 ASSERT(mmu_ctxp->mmu_idx < max_mmu_ctxdoms); 12614 ASSERT(mmu_ctxp == mmu_ctxs_tbl[mmu_ctxp->mmu_idx]); 12615 12616 currcnum = sfmmup->sfmmu_ctxs[mmu_ctxp->mmu_idx].cnum; 12617 12618 pstate_save = sfmmu_disable_intrs(); 12619 12620 lock_set(&sfmmup->sfmmu_ctx_lock); /* acquire PP lock */ 12621 /* set HAT cnum invalid across all context domains. */ 12622 for (i = 0; i < max_mmu_ctxdoms; i++) { 12623 12624 cnum = sfmmup->sfmmu_ctxs[i].cnum; 12625 if (cnum == INVALID_CONTEXT) { 12626 continue; 12627 } 12628 12629 sfmmup->sfmmu_ctxs[i].cnum = INVALID_CONTEXT; 12630 } 12631 membar_enter(); /* make sure globally visible to all CPUs */ 12632 lock_clear(&sfmmup->sfmmu_ctx_lock); /* release PP lock */ 12633 12634 sfmmu_enable_intrs(pstate_save); 12635 12636 cpuset = sfmmup->sfmmu_cpusran; 12637 CPUSET_DEL(cpuset, CPU->cpu_id); 12638 CPUSET_AND(cpuset, cpu_ready_set); 12639 if (!CPUSET_ISNULL(cpuset)) { 12640 SFMMU_XCALL_STATS(sfmmup); 12641 xt_some(cpuset, sfmmu_raise_tsb_exception, 12642 (uint64_t)sfmmup, INVALID_CONTEXT); 12643 xt_sync(cpuset); 12644 SFMMU_STAT(sf_tsb_raise_exception); 12645 SFMMU_MMU_STAT(mmu_tsb_raise_exception); 12646 } 12647 12648 /* 12649 * If the hat to-be-invalidated is the same as the current 12650 * process on local CPU we need to invalidate 12651 * this CPU context as well. 
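	 * currcnum was sampled from this cpu's context domain before the
	 * cnums were invalidated above, so comparing it against the current
	 * secondary context tells us whether this cpu itself is still
	 * running with one of the now-stale context numbers.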
12652 */ 12653 if ((sfmmu_getctx_sec() == currcnum) && 12654 (currcnum != INVALID_CONTEXT)) { 12655 /* sets shared context to INVALID too */ 12656 sfmmu_setctx_sec(INVALID_CONTEXT); 12657 sfmmu_clear_utsbinfo(); 12658 } 12659 12660 SFMMU_FLAGS_SET(sfmmup, HAT_ALLCTX_INVALID); 12661 12662 kpreempt_enable(); 12663 12664 /* 12665 * we hold the hat lock, so nobody should allocate a context 12666 * for us yet 12667 */ 12668 ASSERT(sfmmup->sfmmu_ctxs[mmu_ctxp->mmu_idx].cnum == INVALID_CONTEXT); 12669 } 12670 12671 #ifdef VAC 12672 /* 12673 * We need to flush the cache in all cpus. It is possible that 12674 * a process referenced a page as cacheable but has sinced exited 12675 * and cleared the mapping list. We still to flush it but have no 12676 * state so all cpus is the only alternative. 12677 */ 12678 void 12679 sfmmu_cache_flush(pfn_t pfnum, int vcolor) 12680 { 12681 cpuset_t cpuset; 12682 12683 kpreempt_disable(); 12684 cpuset = cpu_ready_set; 12685 CPUSET_DEL(cpuset, CPU->cpu_id); 12686 SFMMU_XCALL_STATS(NULL); /* account to any ctx */ 12687 xt_some(cpuset, vac_flushpage_tl1, pfnum, vcolor); 12688 xt_sync(cpuset); 12689 vac_flushpage(pfnum, vcolor); 12690 kpreempt_enable(); 12691 } 12692 12693 void 12694 sfmmu_cache_flushcolor(int vcolor, pfn_t pfnum) 12695 { 12696 cpuset_t cpuset; 12697 12698 ASSERT(vcolor >= 0); 12699 12700 kpreempt_disable(); 12701 cpuset = cpu_ready_set; 12702 CPUSET_DEL(cpuset, CPU->cpu_id); 12703 SFMMU_XCALL_STATS(NULL); /* account to any ctx */ 12704 xt_some(cpuset, vac_flushcolor_tl1, vcolor, pfnum); 12705 xt_sync(cpuset); 12706 vac_flushcolor(vcolor, pfnum); 12707 kpreempt_enable(); 12708 } 12709 #endif /* VAC */ 12710 12711 /* 12712 * We need to prevent processes from accessing the TSB using a cached physical 12713 * address. It's alright if they try to access the TSB via virtual address 12714 * since they will just fault on that virtual address once the mapping has 12715 * been suspended. 12716 */ 12717 #pragma weak sendmondo_in_recover 12718 12719 /* ARGSUSED */ 12720 static int 12721 sfmmu_tsb_pre_relocator(caddr_t va, uint_t tsbsz, uint_t flags, void *tsbinfo) 12722 { 12723 struct tsb_info *tsbinfop = (struct tsb_info *)tsbinfo; 12724 sfmmu_t *sfmmup = tsbinfop->tsb_sfmmu; 12725 hatlock_t *hatlockp; 12726 sf_scd_t *scdp; 12727 12728 if (flags != HAT_PRESUSPEND) 12729 return (0); 12730 12731 /* 12732 * If tsb is a shared TSB with TSB_SHAREDCTX set, sfmmup must 12733 * be a shared hat, then set SCD's tsbinfo's flag. 12734 * If tsb is not shared, sfmmup is a private hat, then set 12735 * its private tsbinfo's flag. 12736 */ 12737 hatlockp = sfmmu_hat_enter(sfmmup); 12738 tsbinfop->tsb_flags |= TSB_RELOC_FLAG; 12739 12740 if (!(tsbinfop->tsb_flags & TSB_SHAREDCTX)) { 12741 sfmmu_tsb_inv_ctx(sfmmup); 12742 sfmmu_hat_exit(hatlockp); 12743 } else { 12744 /* release lock on the shared hat */ 12745 sfmmu_hat_exit(hatlockp); 12746 /* sfmmup is a shared hat */ 12747 ASSERT(sfmmup->sfmmu_scdhat); 12748 scdp = sfmmup->sfmmu_scdp; 12749 ASSERT(scdp != NULL); 12750 /* get private hat from the scd list */ 12751 mutex_enter(&scdp->scd_mutex); 12752 sfmmup = scdp->scd_sf_list; 12753 while (sfmmup != NULL) { 12754 hatlockp = sfmmu_hat_enter(sfmmup); 12755 /* 12756 * We do not call sfmmu_tsb_inv_ctx here because 12757 * sendmondo_in_recover check is only needed for 12758 * sun4u. 
12759 */ 12760 sfmmu_invalidate_ctx(sfmmup); 12761 sfmmu_hat_exit(hatlockp); 12762 sfmmup = sfmmup->sfmmu_scd_link.next; 12763 12764 } 12765 mutex_exit(&scdp->scd_mutex); 12766 } 12767 return (0); 12768 } 12769 12770 static void 12771 sfmmu_tsb_inv_ctx(sfmmu_t *sfmmup) 12772 { 12773 extern uint32_t sendmondo_in_recover; 12774 12775 ASSERT(sfmmu_hat_lock_held(sfmmup)); 12776 12777 /* 12778 * For Cheetah+ Erratum 25: 12779 * Wait for any active recovery to finish. We can't risk 12780 * relocating the TSB of the thread running mondo_recover_proc() 12781 * since, if we did that, we would deadlock. The scenario we are 12782 * trying to avoid is as follows: 12783 * 12784 * THIS CPU RECOVER CPU 12785 * -------- ----------- 12786 * Begins recovery, walking through TSB 12787 * hat_pagesuspend() TSB TTE 12788 * TLB miss on TSB TTE, spins at TL1 12789 * xt_sync() 12790 * send_mondo_timeout() 12791 * mondo_recover_proc() 12792 * ((deadlocked)) 12793 * 12794 * The second half of the workaround is that mondo_recover_proc() 12795 * checks to see if the tsb_info has the RELOC flag set, and if it 12796 * does, it skips over that TSB without ever touching tsbinfop->tsb_va 12797 * and hence avoiding the TLB miss that could result in a deadlock. 12798 */ 12799 if (&sendmondo_in_recover) { 12800 membar_enter(); /* make sure RELOC flag visible */ 12801 while (sendmondo_in_recover) { 12802 drv_usecwait(1); 12803 membar_consumer(); 12804 } 12805 } 12806 12807 sfmmu_invalidate_ctx(sfmmup); 12808 } 12809 12810 /* ARGSUSED */ 12811 static int 12812 sfmmu_tsb_post_relocator(caddr_t va, uint_t tsbsz, uint_t flags, 12813 void *tsbinfo, pfn_t newpfn) 12814 { 12815 hatlock_t *hatlockp; 12816 struct tsb_info *tsbinfop = (struct tsb_info *)tsbinfo; 12817 sfmmu_t *sfmmup = tsbinfop->tsb_sfmmu; 12818 12819 if (flags != HAT_POSTUNSUSPEND) 12820 return (0); 12821 12822 hatlockp = sfmmu_hat_enter(sfmmup); 12823 12824 SFMMU_STAT(sf_tsb_reloc); 12825 12826 /* 12827 * The process may have swapped out while we were relocating one 12828 * of its TSBs. If so, don't bother doing the setup since the 12829 * process can't be using the memory anymore. 12830 */ 12831 if ((tsbinfop->tsb_flags & TSB_SWAPPED) == 0) { 12832 ASSERT(va == tsbinfop->tsb_va); 12833 sfmmu_tsbinfo_setup_phys(tsbinfop, newpfn); 12834 12835 if (tsbinfop->tsb_flags & TSB_FLUSH_NEEDED) { 12836 sfmmu_inv_tsb(tsbinfop->tsb_va, 12837 TSB_BYTES(tsbinfop->tsb_szc)); 12838 tsbinfop->tsb_flags &= ~TSB_FLUSH_NEEDED; 12839 } 12840 } 12841 12842 membar_exit(); 12843 tsbinfop->tsb_flags &= ~TSB_RELOC_FLAG; 12844 cv_broadcast(&sfmmup->sfmmu_tsb_cv); 12845 12846 sfmmu_hat_exit(hatlockp); 12847 12848 return (0); 12849 } 12850 12851 /* 12852 * Allocate and initialize a tsb_info structure. Note that we may or may not 12853 * allocate a TSB here, depending on the flags passed in. 12854 */ 12855 static int 12856 sfmmu_tsbinfo_alloc(struct tsb_info **tsbinfopp, int tsb_szc, int tte_sz_mask, 12857 uint_t flags, sfmmu_t *sfmmup) 12858 { 12859 int err; 12860 12861 *tsbinfopp = (struct tsb_info *)kmem_cache_alloc( 12862 sfmmu_tsbinfo_cache, KM_SLEEP); 12863 12864 if ((err = sfmmu_init_tsbinfo(*tsbinfopp, tte_sz_mask, 12865 tsb_szc, flags, sfmmup)) != 0) { 12866 kmem_cache_free(sfmmu_tsbinfo_cache, *tsbinfopp); 12867 SFMMU_STAT(sf_tsb_allocfail); 12868 *tsbinfopp = NULL; 12869 return (err); 12870 } 12871 SFMMU_STAT(sf_tsb_alloc); 12872 12873 /* 12874 * Bump the TSB size counters for this TSB size. 
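	 * (sfmmu_tsbsize_stat is treated here as an array of int counters
	 * indexed by TSB size code, hence the pointer arithmetic below.)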
12875 */ 12876 (*(((int *)&sfmmu_tsbsize_stat) + tsb_szc))++; 12877 return (0); 12878 } 12879 12880 static void 12881 sfmmu_tsb_free(struct tsb_info *tsbinfo) 12882 { 12883 caddr_t tsbva = tsbinfo->tsb_va; 12884 uint_t tsb_size = TSB_BYTES(tsbinfo->tsb_szc); 12885 struct kmem_cache *kmem_cachep = tsbinfo->tsb_cache; 12886 vmem_t *vmp = tsbinfo->tsb_vmp; 12887 12888 /* 12889 * If we allocated this TSB from relocatable kernel memory, then we 12890 * need to uninstall the callback handler. 12891 */ 12892 if (tsbinfo->tsb_cache != sfmmu_tsb8k_cache) { 12893 uintptr_t slab_mask; 12894 caddr_t slab_vaddr; 12895 page_t **ppl; 12896 int ret; 12897 12898 ASSERT(tsb_size <= MMU_PAGESIZE4M || use_bigtsb_arena); 12899 if (tsb_size > MMU_PAGESIZE4M) 12900 slab_mask = ~((uintptr_t)bigtsb_slab_mask) << PAGESHIFT; 12901 else 12902 slab_mask = ~((uintptr_t)tsb_slab_mask) << PAGESHIFT; 12903 slab_vaddr = (caddr_t)((uintptr_t)tsbva & slab_mask); 12904 12905 ret = as_pagelock(&kas, &ppl, slab_vaddr, PAGESIZE, S_WRITE); 12906 ASSERT(ret == 0); 12907 hat_delete_callback(tsbva, (uint_t)tsb_size, (void *)tsbinfo, 12908 0, NULL); 12909 as_pageunlock(&kas, ppl, slab_vaddr, PAGESIZE, S_WRITE); 12910 } 12911 12912 if (kmem_cachep != NULL) { 12913 kmem_cache_free(kmem_cachep, tsbva); 12914 } else { 12915 vmem_xfree(vmp, (void *)tsbva, tsb_size); 12916 } 12917 tsbinfo->tsb_va = (caddr_t)0xbad00bad; 12918 atomic_add_64(&tsb_alloc_bytes, -(int64_t)tsb_size); 12919 } 12920 12921 static void 12922 sfmmu_tsbinfo_free(struct tsb_info *tsbinfo) 12923 { 12924 if ((tsbinfo->tsb_flags & TSB_SWAPPED) == 0) { 12925 sfmmu_tsb_free(tsbinfo); 12926 } 12927 kmem_cache_free(sfmmu_tsbinfo_cache, tsbinfo); 12928 12929 } 12930 12931 /* 12932 * Setup all the references to physical memory for this tsbinfo. 12933 * The underlying page(s) must be locked. 12934 */ 12935 static void 12936 sfmmu_tsbinfo_setup_phys(struct tsb_info *tsbinfo, pfn_t pfn) 12937 { 12938 ASSERT(pfn != PFN_INVALID); 12939 ASSERT(pfn == va_to_pfn(tsbinfo->tsb_va)); 12940 12941 #ifndef sun4v 12942 if (tsbinfo->tsb_szc == 0) { 12943 sfmmu_memtte(&tsbinfo->tsb_tte, pfn, 12944 PROT_WRITE|PROT_READ, TTE8K); 12945 } else { 12946 /* 12947 * Round down PA and use a large mapping; the handlers will 12948 * compute the TSB pointer at the correct offset into the 12949 * big virtual page. NOTE: this assumes all TSBs larger 12950 * than 8K must come from physically contiguous slabs of 12951 * size tsb_slab_size. 12952 */ 12953 sfmmu_memtte(&tsbinfo->tsb_tte, pfn & ~tsb_slab_mask, 12954 PROT_WRITE|PROT_READ, tsb_slab_ttesz); 12955 } 12956 tsbinfo->tsb_pa = ptob(pfn); 12957 12958 TTE_SET_LOCKED(&tsbinfo->tsb_tte); /* lock the tte into dtlb */ 12959 TTE_SET_MOD(&tsbinfo->tsb_tte); /* enable writes */ 12960 12961 ASSERT(TTE_IS_PRIVILEGED(&tsbinfo->tsb_tte)); 12962 ASSERT(TTE_IS_LOCKED(&tsbinfo->tsb_tte)); 12963 #else /* sun4v */ 12964 tsbinfo->tsb_pa = ptob(pfn); 12965 #endif /* sun4v */ 12966 } 12967 12968 12969 /* 12970 * Returns zero on success, ENOMEM if over the high water mark, 12971 * or EAGAIN if the caller needs to retry with a smaller TSB 12972 * size (or specify TSB_FORCEALLOC if the allocation can't fail). 12973 * 12974 * This call cannot fail to allocate a TSB if TSB_FORCEALLOC 12975 * is specified and the TSB requested is PAGESIZE, though it 12976 * may sleep waiting for memory if sufficient memory is not 12977 * available. 
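 *
 * A rough sketch of how a caller of the allocation path is expected to
 * react to those return values follows (sfmmu_tsbinfo_alloc() above
 * simply passes them through); the wrapper name below is illustrative
 * only and not part of this file.
 */
#if 0
/*
 * Never compiled: back off to smaller TSB size codes on EAGAIN, give up
 * on ENOMEM (over the high water mark), and as a last resort force a
 * minimum-size TSB, which cannot fail.
 */
static struct tsb_info *
example_alloc_tsb(sfmmu_t *sfmmup, int szc, int tte_sz_mask)
{
	struct tsb_info *tsbinfop;
	int err;

	for (; szc > TSB_MIN_SZCODE; szc--) {
		err = sfmmu_tsbinfo_alloc(&tsbinfop, szc, tte_sz_mask,
		    TSB_ALLOC, sfmmup);
		if (err == 0)
			return (tsbinfop);
		if (err == ENOMEM)
			return (NULL);
		/* EAGAIN: retry with the next smaller size code */
	}
	(void) sfmmu_tsbinfo_alloc(&tsbinfop, TSB_MIN_SZCODE, tte_sz_mask,
	    TSB_FORCEALLOC, sfmmup);
	return (tsbinfop);
}
#endif
/*
 * sfmmu_init_tsbinfo() itself follows.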
12978 */ 12979 static int 12980 sfmmu_init_tsbinfo(struct tsb_info *tsbinfo, int tteszmask, 12981 int tsbcode, uint_t flags, sfmmu_t *sfmmup) 12982 { 12983 caddr_t vaddr = NULL; 12984 caddr_t slab_vaddr; 12985 uintptr_t slab_mask; 12986 int tsbbytes = TSB_BYTES(tsbcode); 12987 int lowmem = 0; 12988 struct kmem_cache *kmem_cachep = NULL; 12989 vmem_t *vmp = NULL; 12990 lgrp_id_t lgrpid = LGRP_NONE; 12991 pfn_t pfn; 12992 uint_t cbflags = HAC_SLEEP; 12993 page_t **pplist; 12994 int ret; 12995 12996 ASSERT(tsbbytes <= MMU_PAGESIZE4M || use_bigtsb_arena); 12997 if (tsbbytes > MMU_PAGESIZE4M) 12998 slab_mask = ~((uintptr_t)bigtsb_slab_mask) << PAGESHIFT; 12999 else 13000 slab_mask = ~((uintptr_t)tsb_slab_mask) << PAGESHIFT; 13001 13002 if (flags & (TSB_FORCEALLOC | TSB_SWAPIN | TSB_GROW | TSB_SHRINK)) 13003 flags |= TSB_ALLOC; 13004 13005 ASSERT((flags & TSB_FORCEALLOC) == 0 || tsbcode == TSB_MIN_SZCODE); 13006 13007 tsbinfo->tsb_sfmmu = sfmmup; 13008 13009 /* 13010 * If not allocating a TSB, set up the tsbinfo, set TSB_SWAPPED, and 13011 * return. 13012 */ 13013 if ((flags & TSB_ALLOC) == 0) { 13014 tsbinfo->tsb_szc = tsbcode; 13015 tsbinfo->tsb_ttesz_mask = tteszmask; 13016 tsbinfo->tsb_va = (caddr_t)0xbadbadbeef; 13017 tsbinfo->tsb_pa = -1; 13018 tsbinfo->tsb_tte.ll = 0; 13019 tsbinfo->tsb_next = NULL; 13020 tsbinfo->tsb_flags = TSB_SWAPPED; 13021 tsbinfo->tsb_cache = NULL; 13022 tsbinfo->tsb_vmp = NULL; 13023 return (0); 13024 } 13025 13026 #ifdef DEBUG 13027 /* 13028 * For debugging: 13029 * Randomly force allocation failures every tsb_alloc_mtbf 13030 * tries if TSB_FORCEALLOC is not specified. This will 13031 * return ENOMEM if tsb_alloc_mtbf is odd, or EAGAIN if 13032 * it is even, to allow testing of both failure paths... 13033 */ 13034 if (tsb_alloc_mtbf && ((flags & TSB_FORCEALLOC) == 0) && 13035 (tsb_alloc_count++ == tsb_alloc_mtbf)) { 13036 tsb_alloc_count = 0; 13037 tsb_alloc_fail_mtbf++; 13038 return ((tsb_alloc_mtbf & 1)? ENOMEM : EAGAIN); 13039 } 13040 #endif /* DEBUG */ 13041 13042 /* 13043 * Enforce high water mark if we are not doing a forced allocation 13044 * and are not shrinking a process' TSB. 13045 */ 13046 if ((flags & TSB_SHRINK) == 0 && 13047 (tsbbytes + tsb_alloc_bytes) > tsb_alloc_hiwater) { 13048 if ((flags & TSB_FORCEALLOC) == 0) 13049 return (ENOMEM); 13050 lowmem = 1; 13051 } 13052 13053 /* 13054 * Allocate from the correct location based upon the size of the TSB 13055 * compared to the base page size, and what memory conditions dictate. 13056 * Note we always do nonblocking allocations from the TSB arena since 13057 * we don't want memory fragmentation to cause processes to block 13058 * indefinitely waiting for memory; until the kernel algorithms that 13059 * coalesce large pages are improved this is our best option. 
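	 * A failed nonblocking allocation simply shows up to our caller as
	 * EAGAIN, which is the cue to retry with a smaller TSB size.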
13060 * 13061 * Algorithm: 13062 * If allocating a "large" TSB (>8K), allocate from the 13063 * appropriate kmem_tsb_default_arena vmem arena 13064 * else if low on memory or the TSB_FORCEALLOC flag is set or 13065 * tsb_forceheap is set 13066 * Allocate from kernel heap via sfmmu_tsb8k_cache with 13067 * KM_SLEEP (never fails) 13068 * else 13069 * Allocate from appropriate sfmmu_tsb_cache with 13070 * KM_NOSLEEP 13071 * endif 13072 */ 13073 if (tsb_lgrp_affinity) 13074 lgrpid = lgrp_home_id(curthread); 13075 if (lgrpid == LGRP_NONE) 13076 lgrpid = 0; /* use lgrp of boot CPU */ 13077 13078 if (tsbbytes > MMU_PAGESIZE) { 13079 if (tsbbytes > MMU_PAGESIZE4M) { 13080 vmp = kmem_bigtsb_default_arena[lgrpid]; 13081 vaddr = (caddr_t)vmem_xalloc(vmp, tsbbytes, tsbbytes, 13082 0, 0, NULL, NULL, VM_NOSLEEP); 13083 } else { 13084 vmp = kmem_tsb_default_arena[lgrpid]; 13085 vaddr = (caddr_t)vmem_xalloc(vmp, tsbbytes, tsbbytes, 13086 0, 0, NULL, NULL, VM_NOSLEEP); 13087 } 13088 #ifdef DEBUG 13089 } else if (lowmem || (flags & TSB_FORCEALLOC) || tsb_forceheap) { 13090 #else /* !DEBUG */ 13091 } else if (lowmem || (flags & TSB_FORCEALLOC)) { 13092 #endif /* DEBUG */ 13093 kmem_cachep = sfmmu_tsb8k_cache; 13094 vaddr = (caddr_t)kmem_cache_alloc(kmem_cachep, KM_SLEEP); 13095 ASSERT(vaddr != NULL); 13096 } else { 13097 kmem_cachep = sfmmu_tsb_cache[lgrpid]; 13098 vaddr = (caddr_t)kmem_cache_alloc(kmem_cachep, KM_NOSLEEP); 13099 } 13100 13101 tsbinfo->tsb_cache = kmem_cachep; 13102 tsbinfo->tsb_vmp = vmp; 13103 13104 if (vaddr == NULL) { 13105 return (EAGAIN); 13106 } 13107 13108 atomic_add_64(&tsb_alloc_bytes, (int64_t)tsbbytes); 13109 kmem_cachep = tsbinfo->tsb_cache; 13110 13111 /* 13112 * If we are allocating from outside the cage, then we need to 13113 * register a relocation callback handler. Note that for now 13114 * since pseudo mappings always hang off of the slab's root page, 13115 * we need only lock the first 8K of the TSB slab. This is a bit 13116 * hacky but it is good for performance. 13117 */ 13118 if (kmem_cachep != sfmmu_tsb8k_cache) { 13119 slab_vaddr = (caddr_t)((uintptr_t)vaddr & slab_mask); 13120 ret = as_pagelock(&kas, &pplist, slab_vaddr, PAGESIZE, S_WRITE); 13121 ASSERT(ret == 0); 13122 ret = hat_add_callback(sfmmu_tsb_cb_id, vaddr, (uint_t)tsbbytes, 13123 cbflags, (void *)tsbinfo, &pfn, NULL); 13124 13125 /* 13126 * Need to free up resources if we could not successfully 13127 * add the callback function and return an error condition. 13128 */ 13129 if (ret != 0) { 13130 if (kmem_cachep) { 13131 kmem_cache_free(kmem_cachep, vaddr); 13132 } else { 13133 vmem_xfree(vmp, (void *)vaddr, tsbbytes); 13134 } 13135 as_pageunlock(&kas, pplist, slab_vaddr, PAGESIZE, 13136 S_WRITE); 13137 return (EAGAIN); 13138 } 13139 } else { 13140 /* 13141 * Since allocation of 8K TSBs from heap is rare and occurs 13142 * during memory pressure we allocate them from permanent 13143 * memory rather than using callbacks to get the PFN. 
13144 */ 13145 pfn = hat_getpfnum(kas.a_hat, vaddr); 13146 } 13147 13148 tsbinfo->tsb_va = vaddr; 13149 tsbinfo->tsb_szc = tsbcode; 13150 tsbinfo->tsb_ttesz_mask = tteszmask; 13151 tsbinfo->tsb_next = NULL; 13152 tsbinfo->tsb_flags = 0; 13153 13154 sfmmu_tsbinfo_setup_phys(tsbinfo, pfn); 13155 13156 sfmmu_inv_tsb(vaddr, tsbbytes); 13157 13158 if (kmem_cachep != sfmmu_tsb8k_cache) { 13159 as_pageunlock(&kas, pplist, slab_vaddr, PAGESIZE, S_WRITE); 13160 } 13161 13162 return (0); 13163 } 13164 13165 /* 13166 * Initialize per cpu tsb and per cpu tsbmiss_area 13167 */ 13168 void 13169 sfmmu_init_tsbs(void) 13170 { 13171 int i; 13172 struct tsbmiss *tsbmissp; 13173 struct kpmtsbm *kpmtsbmp; 13174 #ifndef sun4v 13175 extern int dcache_line_mask; 13176 #endif /* sun4v */ 13177 extern uint_t vac_colors; 13178 13179 /* 13180 * Init. tsb miss area. 13181 */ 13182 tsbmissp = tsbmiss_area; 13183 13184 for (i = 0; i < NCPU; tsbmissp++, i++) { 13185 /* 13186 * initialize the tsbmiss area. 13187 * Do this for all possible CPUs as some may be added 13188 * while the system is running. There is no cost to this. 13189 */ 13190 tsbmissp->ksfmmup = ksfmmup; 13191 #ifndef sun4v 13192 tsbmissp->dcache_line_mask = (uint16_t)dcache_line_mask; 13193 #endif /* sun4v */ 13194 tsbmissp->khashstart = 13195 (struct hmehash_bucket *)va_to_pa((caddr_t)khme_hash); 13196 tsbmissp->uhashstart = 13197 (struct hmehash_bucket *)va_to_pa((caddr_t)uhme_hash); 13198 tsbmissp->khashsz = khmehash_num; 13199 tsbmissp->uhashsz = uhmehash_num; 13200 } 13201 13202 sfmmu_tsb_cb_id = hat_register_callback('T'<<16 | 'S' << 8 | 'B', 13203 sfmmu_tsb_pre_relocator, sfmmu_tsb_post_relocator, NULL, 0); 13204 13205 if (kpm_enable == 0) 13206 return; 13207 13208 /* -- Begin KPM specific init -- */ 13209 13210 if (kpm_smallpages) { 13211 /* 13212 * If we're using base pagesize pages for seg_kpm 13213 * mappings, we use the kernel TSB since we can't afford 13214 * to allocate a second huge TSB for these mappings. 13215 */ 13216 kpm_tsbbase = ktsb_phys? ktsb_pbase : (uint64_t)ktsb_base; 13217 kpm_tsbsz = ktsb_szcode; 13218 kpmsm_tsbbase = kpm_tsbbase; 13219 kpmsm_tsbsz = kpm_tsbsz; 13220 } else { 13221 /* 13222 * In VAC conflict case, just put the entries in the 13223 * kernel 8K indexed TSB for now so we can find them. 13224 * This could really be changed in the future if we feel 13225 * the need... 13226 */ 13227 kpmsm_tsbbase = ktsb_phys? ktsb_pbase : (uint64_t)ktsb_base; 13228 kpmsm_tsbsz = ktsb_szcode; 13229 kpm_tsbbase = ktsb_phys? ktsb4m_pbase : (uint64_t)ktsb4m_base; 13230 kpm_tsbsz = ktsb4m_szcode; 13231 } 13232 13233 kpmtsbmp = kpmtsbm_area; 13234 for (i = 0; i < NCPU; kpmtsbmp++, i++) { 13235 /* 13236 * Initialize the kpmtsbm area. 13237 * Do this for all possible CPUs as some may be added 13238 * while the system is running. There is no cost to this. 13239 */ 13240 kpmtsbmp->vbase = kpm_vbase; 13241 kpmtsbmp->vend = kpm_vbase + kpm_size * vac_colors; 13242 kpmtsbmp->sz_shift = kpm_size_shift; 13243 kpmtsbmp->kpmp_shift = kpmp_shift; 13244 kpmtsbmp->kpmp2pshft = (uchar_t)kpmp2pshft; 13245 if (kpm_smallpages == 0) { 13246 kpmtsbmp->kpmp_table_sz = kpmp_table_sz; 13247 kpmtsbmp->kpmp_tablepa = va_to_pa(kpmp_table); 13248 } else { 13249 kpmtsbmp->kpmp_table_sz = kpmp_stable_sz; 13250 kpmtsbmp->kpmp_tablepa = va_to_pa(kpmp_stable); 13251 } 13252 kpmtsbmp->msegphashpa = va_to_pa(memseg_phash); 13253 kpmtsbmp->flags = KPMTSBM_ENABLE_FLAG; 13254 #ifdef DEBUG 13255 kpmtsbmp->flags |= (kpm_tsbmtl) ? 
KPMTSBM_TLTSBM_FLAG : 0; 13256 #endif /* DEBUG */ 13257 if (ktsb_phys) 13258 kpmtsbmp->flags |= KPMTSBM_TSBPHYS_FLAG; 13259 } 13260 13261 /* -- End KPM specific init -- */ 13262 } 13263 13264 /* Avoid using sfmmu_tsbinfo_alloc() to avoid kmem_alloc - no real reason */ 13265 struct tsb_info ktsb_info[2]; 13266 13267 /* 13268 * Called from hat_kern_setup() to setup the tsb_info for ksfmmup. 13269 */ 13270 void 13271 sfmmu_init_ktsbinfo() 13272 { 13273 ASSERT(ksfmmup != NULL); 13274 ASSERT(ksfmmup->sfmmu_tsb == NULL); 13275 /* 13276 * Allocate tsbinfos for kernel and copy in data 13277 * to make debug easier and sun4v setup easier. 13278 */ 13279 ktsb_info[0].tsb_sfmmu = ksfmmup; 13280 ktsb_info[0].tsb_szc = ktsb_szcode; 13281 ktsb_info[0].tsb_ttesz_mask = TSB8K|TSB64K|TSB512K; 13282 ktsb_info[0].tsb_va = ktsb_base; 13283 ktsb_info[0].tsb_pa = ktsb_pbase; 13284 ktsb_info[0].tsb_flags = 0; 13285 ktsb_info[0].tsb_tte.ll = 0; 13286 ktsb_info[0].tsb_cache = NULL; 13287 13288 ktsb_info[1].tsb_sfmmu = ksfmmup; 13289 ktsb_info[1].tsb_szc = ktsb4m_szcode; 13290 ktsb_info[1].tsb_ttesz_mask = TSB4M; 13291 ktsb_info[1].tsb_va = ktsb4m_base; 13292 ktsb_info[1].tsb_pa = ktsb4m_pbase; 13293 ktsb_info[1].tsb_flags = 0; 13294 ktsb_info[1].tsb_tte.ll = 0; 13295 ktsb_info[1].tsb_cache = NULL; 13296 13297 /* Link them into ksfmmup. */ 13298 ktsb_info[0].tsb_next = &ktsb_info[1]; 13299 ktsb_info[1].tsb_next = NULL; 13300 ksfmmup->sfmmu_tsb = &ktsb_info[0]; 13301 13302 sfmmu_setup_tsbinfo(ksfmmup); 13303 } 13304 13305 /* 13306 * Cache the last value returned from va_to_pa(). If the VA specified 13307 * in the current call to cached_va_to_pa() maps to the same Page (as the 13308 * previous call to cached_va_to_pa()), then compute the PA using 13309 * cached info, else call va_to_pa(). 13310 * 13311 * Note: this function is neither MT-safe nor consistent in the presence 13312 * of multiple, interleaved threads. This function was created to enable 13313 * an optimization used during boot (at a point when there's only one thread 13314 * executing on the "boot CPU", and before startup_vm() has been called). 13315 */ 13316 static uint64_t 13317 cached_va_to_pa(void *vaddr) 13318 { 13319 static uint64_t prev_vaddr_base = 0; 13320 static uint64_t prev_pfn = 0; 13321 13322 if ((((uint64_t)vaddr) & MMU_PAGEMASK) == prev_vaddr_base) { 13323 return (prev_pfn | ((uint64_t)vaddr & MMU_PAGEOFFSET)); 13324 } else { 13325 uint64_t pa = va_to_pa(vaddr); 13326 13327 if (pa != ((uint64_t)-1)) { 13328 /* 13329 * Computed physical address is valid. Cache its 13330 * related info for the next cached_va_to_pa() call. 13331 */ 13332 prev_pfn = pa & MMU_PAGEMASK; 13333 prev_vaddr_base = ((uint64_t)vaddr) & MMU_PAGEMASK; 13334 } 13335 13336 return (pa); 13337 } 13338 } 13339 13340 /* 13341 * Carve up our nucleus hblk region. We may allocate more hblks than 13342 * asked due to rounding errors but we are guaranteed to have at least 13343 * enough space to allocate the requested number of hblk8's and hblk1's. 
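 *
 * The split point is hblk8_bound = size - (nhblk1 * hme1blk_sz) -
 * hme8blk_sz: hblk8's are carved from the front of the region up to that
 * bound, and the remainder (at least nhblk1 * hme1blk_sz bytes) is then
 * carved into hblk1's.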
13344 */ 13345 void 13346 sfmmu_init_nucleus_hblks(caddr_t addr, size_t size, int nhblk8, int nhblk1) 13347 { 13348 struct hme_blk *hmeblkp; 13349 size_t hme8blk_sz, hme1blk_sz; 13350 size_t i; 13351 size_t hblk8_bound; 13352 ulong_t j = 0, k = 0; 13353 13354 ASSERT(addr != NULL && size != 0); 13355 13356 /* Need to use proper structure alignment */ 13357 hme8blk_sz = roundup(HME8BLK_SZ, sizeof (int64_t)); 13358 hme1blk_sz = roundup(HME1BLK_SZ, sizeof (int64_t)); 13359 13360 nucleus_hblk8.list = (void *)addr; 13361 nucleus_hblk8.index = 0; 13362 13363 /* 13364 * Use as much memory as possible for hblk8's since we 13365 * expect all bop_alloc'ed memory to be allocated in 8k chunks. 13366 * We need to hold back enough space for the hblk1's which 13367 * we'll allocate next. 13368 */ 13369 hblk8_bound = size - (nhblk1 * hme1blk_sz) - hme8blk_sz; 13370 for (i = 0; i <= hblk8_bound; i += hme8blk_sz, j++) { 13371 hmeblkp = (struct hme_blk *)addr; 13372 addr += hme8blk_sz; 13373 hmeblkp->hblk_nuc_bit = 1; 13374 hmeblkp->hblk_nextpa = cached_va_to_pa((caddr_t)hmeblkp); 13375 } 13376 nucleus_hblk8.len = j; 13377 ASSERT(j >= nhblk8); 13378 SFMMU_STAT_ADD(sf_hblk8_ncreate, j); 13379 13380 nucleus_hblk1.list = (void *)addr; 13381 nucleus_hblk1.index = 0; 13382 for (; i <= (size - hme1blk_sz); i += hme1blk_sz, k++) { 13383 hmeblkp = (struct hme_blk *)addr; 13384 addr += hme1blk_sz; 13385 hmeblkp->hblk_nuc_bit = 1; 13386 hmeblkp->hblk_nextpa = cached_va_to_pa((caddr_t)hmeblkp); 13387 } 13388 ASSERT(k >= nhblk1); 13389 nucleus_hblk1.len = k; 13390 SFMMU_STAT_ADD(sf_hblk1_ncreate, k); 13391 } 13392 13393 /* 13394 * This function is currently not supported on this platform. For what 13395 * it's supposed to do, see hat.c and hat_srmmu.c 13396 */ 13397 /* ARGSUSED */ 13398 faultcode_t 13399 hat_softlock(struct hat *hat, caddr_t addr, size_t *lenp, page_t **ppp, 13400 uint_t flags) 13401 { 13402 ASSERT(hat->sfmmu_xhat_provider == NULL); 13403 return (FC_NOSUPPORT); 13404 } 13405 13406 /* 13407 * Searchs the mapping list of the page for a mapping of the same size. If not 13408 * found the corresponding bit is cleared in the p_index field. When large 13409 * pages are more prevalent in the system, we can maintain the mapping list 13410 * in order and we don't have to traverse the list each time. Just check the 13411 * next and prev entries, and if both are of different size, we clear the bit. 13412 */ 13413 static void 13414 sfmmu_rm_large_mappings(page_t *pp, int ttesz) 13415 { 13416 struct sf_hment *sfhmep; 13417 struct hme_blk *hmeblkp; 13418 int index; 13419 pgcnt_t npgs; 13420 13421 ASSERT(ttesz > TTE8K); 13422 13423 ASSERT(sfmmu_mlist_held(pp)); 13424 13425 ASSERT(PP_ISMAPPED_LARGE(pp)); 13426 13427 /* 13428 * Traverse mapping list looking for another mapping of same size. 13429 * since we only want to clear index field if all mappings of 13430 * that size are gone. 13431 */ 13432 13433 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) { 13434 if (IS_PAHME(sfhmep)) 13435 continue; 13436 hmeblkp = sfmmu_hmetohblk(sfhmep); 13437 if (hmeblkp->hblk_xhat_bit) 13438 continue; 13439 if (hme_size(sfhmep) == ttesz) { 13440 /* 13441 * another mapping of the same size. don't clear index. 13442 */ 13443 return; 13444 } 13445 } 13446 13447 /* 13448 * Clear the p_index bit for large page. 
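	 * The same index bit is carried in the p_index field of each
	 * constituent 8K page, so clear it in all TTEPAGES(ttesz) of them.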
13449 */ 13450 index = PAGESZ_TO_INDEX(ttesz); 13451 npgs = TTEPAGES(ttesz); 13452 while (npgs-- > 0) { 13453 ASSERT(pp->p_index & index); 13454 pp->p_index &= ~index; 13455 pp = PP_PAGENEXT(pp); 13456 } 13457 } 13458 13459 /* 13460 * return supported features 13461 */ 13462 /* ARGSUSED */ 13463 int 13464 hat_supported(enum hat_features feature, void *arg) 13465 { 13466 switch (feature) { 13467 case HAT_SHARED_PT: 13468 case HAT_DYNAMIC_ISM_UNMAP: 13469 case HAT_VMODSORT: 13470 return (1); 13471 case HAT_SHARED_REGIONS: 13472 if (shctx_on) 13473 return (1); 13474 else 13475 return (0); 13476 default: 13477 return (0); 13478 } 13479 } 13480 13481 void 13482 hat_enter(struct hat *hat) 13483 { 13484 hatlock_t *hatlockp; 13485 13486 if (hat != ksfmmup) { 13487 hatlockp = TSB_HASH(hat); 13488 mutex_enter(HATLOCK_MUTEXP(hatlockp)); 13489 } 13490 } 13491 13492 void 13493 hat_exit(struct hat *hat) 13494 { 13495 hatlock_t *hatlockp; 13496 13497 if (hat != ksfmmup) { 13498 hatlockp = TSB_HASH(hat); 13499 mutex_exit(HATLOCK_MUTEXP(hatlockp)); 13500 } 13501 } 13502 13503 /*ARGSUSED*/ 13504 void 13505 hat_reserve(struct as *as, caddr_t addr, size_t len) 13506 { 13507 } 13508 13509 static void 13510 hat_kstat_init(void) 13511 { 13512 kstat_t *ksp; 13513 13514 ksp = kstat_create("unix", 0, "sfmmu_global_stat", "hat", 13515 KSTAT_TYPE_RAW, sizeof (struct sfmmu_global_stat), 13516 KSTAT_FLAG_VIRTUAL); 13517 if (ksp) { 13518 ksp->ks_data = (void *) &sfmmu_global_stat; 13519 kstat_install(ksp); 13520 } 13521 ksp = kstat_create("unix", 0, "sfmmu_tsbsize_stat", "hat", 13522 KSTAT_TYPE_RAW, sizeof (struct sfmmu_tsbsize_stat), 13523 KSTAT_FLAG_VIRTUAL); 13524 if (ksp) { 13525 ksp->ks_data = (void *) &sfmmu_tsbsize_stat; 13526 kstat_install(ksp); 13527 } 13528 ksp = kstat_create("unix", 0, "sfmmu_percpu_stat", "hat", 13529 KSTAT_TYPE_RAW, sizeof (struct sfmmu_percpu_stat) * NCPU, 13530 KSTAT_FLAG_WRITABLE); 13531 if (ksp) { 13532 ksp->ks_update = sfmmu_kstat_percpu_update; 13533 kstat_install(ksp); 13534 } 13535 } 13536 13537 /* ARGSUSED */ 13538 static int 13539 sfmmu_kstat_percpu_update(kstat_t *ksp, int rw) 13540 { 13541 struct sfmmu_percpu_stat *cpu_kstat = ksp->ks_data; 13542 struct tsbmiss *tsbm = tsbmiss_area; 13543 struct kpmtsbm *kpmtsbm = kpmtsbm_area; 13544 int i; 13545 13546 ASSERT(cpu_kstat); 13547 if (rw == KSTAT_READ) { 13548 for (i = 0; i < NCPU; cpu_kstat++, tsbm++, kpmtsbm++, i++) { 13549 cpu_kstat->sf_itlb_misses = 0; 13550 cpu_kstat->sf_dtlb_misses = 0; 13551 cpu_kstat->sf_utsb_misses = tsbm->utsb_misses - 13552 tsbm->uprot_traps; 13553 cpu_kstat->sf_ktsb_misses = tsbm->ktsb_misses + 13554 kpmtsbm->kpm_tsb_misses - tsbm->kprot_traps; 13555 cpu_kstat->sf_tsb_hits = 0; 13556 cpu_kstat->sf_umod_faults = tsbm->uprot_traps; 13557 cpu_kstat->sf_kmod_faults = tsbm->kprot_traps; 13558 } 13559 } else { 13560 /* KSTAT_WRITE is used to clear stats */ 13561 for (i = 0; i < NCPU; tsbm++, kpmtsbm++, i++) { 13562 tsbm->utsb_misses = 0; 13563 tsbm->ktsb_misses = 0; 13564 tsbm->uprot_traps = 0; 13565 tsbm->kprot_traps = 0; 13566 kpmtsbm->kpm_dtlb_misses = 0; 13567 kpmtsbm->kpm_tsb_misses = 0; 13568 } 13569 } 13570 return (0); 13571 } 13572 13573 #ifdef DEBUG 13574 13575 tte_t *gorig[NCPU], *gcur[NCPU], *gnew[NCPU]; 13576 13577 /* 13578 * A tte checker. *orig_old is the value we read before cas. 13579 * *cur is the value returned by cas. 13580 * *new is the desired value when we do the cas. 13581 * 13582 * *hmeblkp is currently unused. 
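 *
 * The expected calling pattern is sketched below; the helper name, the
 * ttep/hmeblkp arguments and the use of cas64() as the 64-bit
 * compare-and-swap primitive are assumptions made for illustration only.
 */
#if 0
/* Never compiled: snapshot the tte, attempt the cas, then cross-check. */
static void
example_modify_tte(struct hme_blk *hmeblkp, tte_t *ttep, tte_t new)
{
	tte_t orig, ret;

	orig = *ttep;			/* value read before the cas */
	ret.ll = cas64(&ttep->ll, orig.ll, new.ll);
	chk_tte(&orig, &ret, &new, hmeblkp);
}
#endif
/*
 * chk_tte() panics if the pfn relationships between the three ttes are
 * inconsistent (i.e. an illegal remap would result).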
13583 */ 13584 13585 /* ARGSUSED */ 13586 void 13587 chk_tte(tte_t *orig_old, tte_t *cur, tte_t *new, struct hme_blk *hmeblkp) 13588 { 13589 pfn_t i, j, k; 13590 int cpuid = CPU->cpu_id; 13591 13592 gorig[cpuid] = orig_old; 13593 gcur[cpuid] = cur; 13594 gnew[cpuid] = new; 13595 13596 #ifdef lint 13597 hmeblkp = hmeblkp; 13598 #endif 13599 13600 if (TTE_IS_VALID(orig_old)) { 13601 if (TTE_IS_VALID(cur)) { 13602 i = TTE_TO_TTEPFN(orig_old); 13603 j = TTE_TO_TTEPFN(cur); 13604 k = TTE_TO_TTEPFN(new); 13605 if (i != j) { 13606 /* remap error? */ 13607 panic("chk_tte: bad pfn, 0x%lx, 0x%lx", i, j); 13608 } 13609 13610 if (i != k) { 13611 /* remap error? */ 13612 panic("chk_tte: bad pfn2, 0x%lx, 0x%lx", i, k); 13613 } 13614 } else { 13615 if (TTE_IS_VALID(new)) { 13616 panic("chk_tte: invalid cur? "); 13617 } 13618 13619 i = TTE_TO_TTEPFN(orig_old); 13620 k = TTE_TO_TTEPFN(new); 13621 if (i != k) { 13622 panic("chk_tte: bad pfn3, 0x%lx, 0x%lx", i, k); 13623 } 13624 } 13625 } else { 13626 if (TTE_IS_VALID(cur)) { 13627 j = TTE_TO_TTEPFN(cur); 13628 if (TTE_IS_VALID(new)) { 13629 k = TTE_TO_TTEPFN(new); 13630 if (j != k) { 13631 panic("chk_tte: bad pfn4, 0x%lx, 0x%lx", 13632 j, k); 13633 } 13634 } else { 13635 panic("chk_tte: why here?"); 13636 } 13637 } else { 13638 if (!TTE_IS_VALID(new)) { 13639 panic("chk_tte: why here2 ?"); 13640 } 13641 } 13642 } 13643 } 13644 13645 #endif /* DEBUG */ 13646 13647 extern void prefetch_tsbe_read(struct tsbe *); 13648 extern void prefetch_tsbe_write(struct tsbe *); 13649 13650 13651 /* 13652 * We want to prefetch 7 cache lines ahead for our read prefetch. This gives 13653 * us optimal performance on Cheetah+. You can only have 8 outstanding 13654 * prefetches at any one time, so we opted for 7 read prefetches and 1 write 13655 * prefetch to make the most utilization of the prefetch capability. 13656 */ 13657 #define TSBE_PREFETCH_STRIDE (7) 13658 13659 void 13660 sfmmu_copy_tsb(struct tsb_info *old_tsbinfo, struct tsb_info *new_tsbinfo) 13661 { 13662 int old_bytes = TSB_BYTES(old_tsbinfo->tsb_szc); 13663 int new_bytes = TSB_BYTES(new_tsbinfo->tsb_szc); 13664 int old_entries = TSB_ENTRIES(old_tsbinfo->tsb_szc); 13665 int new_entries = TSB_ENTRIES(new_tsbinfo->tsb_szc); 13666 struct tsbe *old; 13667 struct tsbe *new; 13668 struct tsbe *new_base = (struct tsbe *)new_tsbinfo->tsb_va; 13669 uint64_t va; 13670 int new_offset; 13671 int i; 13672 int vpshift; 13673 int last_prefetch; 13674 13675 if (old_bytes == new_bytes) { 13676 bcopy(old_tsbinfo->tsb_va, new_tsbinfo->tsb_va, new_bytes); 13677 } else { 13678 13679 /* 13680 * A TSBE is 16 bytes which means there are four TSBE's per 13681 * P$ line (64 bytes), thus every 4 TSBE's we prefetch. 13682 */ 13683 old = (struct tsbe *)old_tsbinfo->tsb_va; 13684 last_prefetch = old_entries - (4*(TSBE_PREFETCH_STRIDE+1)); 13685 for (i = 0; i < old_entries; i++, old++) { 13686 if (((i & (4-1)) == 0) && (i < last_prefetch)) 13687 prefetch_tsbe_read(old); 13688 if (!old->tte_tag.tag_invalid) { 13689 /* 13690 * We have a valid TTE to remap. Check the 13691 * size. We won't remap 64K or 512K TTEs 13692 * because they span more than one TSB entry 13693 * and are indexed using an 8K virt. page. 13694 * Ditto for 32M and 256M TTEs. 
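				 * Entries skipped here simply drop out of
				 * the new TSB; since the TSB is only a cache
				 * of translations, they will be reloaded
				 * from the hme blocks on their next miss.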
13695 */ 13696 if (TTE_CSZ(&old->tte_data) == TTE64K || 13697 TTE_CSZ(&old->tte_data) == TTE512K) 13698 continue; 13699 if (mmu_page_sizes == max_mmu_page_sizes) { 13700 if (TTE_CSZ(&old->tte_data) == TTE32M || 13701 TTE_CSZ(&old->tte_data) == TTE256M) 13702 continue; 13703 } 13704 13705 /* clear the lower 22 bits of the va */ 13706 va = *(uint64_t *)old << 22; 13707 /* turn va into a virtual pfn */ 13708 va >>= 22 - TSB_START_SIZE; 13709 /* 13710 * or in bits from the offset in the tsb 13711 * to get the real virtual pfn. These 13712 * correspond to bits [21:13] in the va 13713 */ 13714 vpshift = 13715 TTE_BSZS_SHIFT(TTE_CSZ(&old->tte_data)) & 13716 0x1ff; 13717 va |= (i << vpshift); 13718 va >>= vpshift; 13719 new_offset = va & (new_entries - 1); 13720 new = new_base + new_offset; 13721 prefetch_tsbe_write(new); 13722 *new = *old; 13723 } 13724 } 13725 } 13726 } 13727 13728 /* 13729 * unused in sfmmu 13730 */ 13731 void 13732 hat_dump(void) 13733 { 13734 } 13735 13736 /* 13737 * Called when a thread is exiting and we have switched to the kernel address 13738 * space. Perform the same VM initialization resume() uses when switching 13739 * processes. 13740 * 13741 * Note that sfmmu_load_mmustate() is currently a no-op for kernel threads, but 13742 * we call it anyway in case the semantics change in the future. 13743 */ 13744 /*ARGSUSED*/ 13745 void 13746 hat_thread_exit(kthread_t *thd) 13747 { 13748 uint_t pgsz_cnum; 13749 uint_t pstate_save; 13750 13751 ASSERT(thd->t_procp->p_as == &kas); 13752 13753 pgsz_cnum = KCONTEXT; 13754 #ifdef sun4u 13755 pgsz_cnum |= (ksfmmup->sfmmu_cext << CTXREG_EXT_SHIFT); 13756 #endif 13757 13758 /* 13759 * Note that sfmmu_load_mmustate() is currently a no-op for 13760 * kernel threads. We need to disable interrupts here, 13761 * simply because otherwise sfmmu_load_mmustate() would panic 13762 * if the caller does not disable interrupts. 13763 */ 13764 pstate_save = sfmmu_disable_intrs(); 13765 13766 /* Compatibility Note: hw takes care of MMU_SCONTEXT1 */ 13767 sfmmu_setctx_sec(pgsz_cnum); 13768 sfmmu_load_mmustate(ksfmmup); 13769 sfmmu_enable_intrs(pstate_save); 13770 } 13771 13772 13773 /* 13774 * SRD support 13775 */ 13776 #define SRD_HASH_FUNCTION(vp) (((((uintptr_t)(vp)) >> 4) ^ \ 13777 (((uintptr_t)(vp)) >> 11)) & \ 13778 srd_hashmask) 13779 13780 /* 13781 * Attach the process to the srd struct associated with the exec vnode 13782 * from which the process is started. 
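 *
 * The exec vnode is hashed into srd_buckets[] and looked up under the
 * bucket lock; if an SRD already exists its srd_refcnt is simply
 * bumped.  Otherwise a new SRD is allocated outside the lock and the
 * hash chain is searched again before inserting it, so a concurrent
 * exec of the same file cannot create a duplicate.  Each hat that
 * joins holds a reference on the vnode (the VN_HOLD() below), which
 * is released again in sfmmu_leave_srd().  When shared context
 * support is disabled (shctx_on == 0) the hat simply never gets an
 * SRD.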
13783 */ 13784 void 13785 hat_join_srd(struct hat *sfmmup, vnode_t *evp) 13786 { 13787 uint_t hash = SRD_HASH_FUNCTION(evp); 13788 sf_srd_t *srdp; 13789 sf_srd_t *newsrdp; 13790 13791 ASSERT(sfmmup != ksfmmup); 13792 ASSERT(sfmmup->sfmmu_srdp == NULL); 13793 13794 if (!shctx_on) { 13795 return; 13796 } 13797 13798 VN_HOLD(evp); 13799 13800 if (srd_buckets[hash].srdb_srdp != NULL) { 13801 mutex_enter(&srd_buckets[hash].srdb_lock); 13802 for (srdp = srd_buckets[hash].srdb_srdp; srdp != NULL; 13803 srdp = srdp->srd_hash) { 13804 if (srdp->srd_evp == evp) { 13805 ASSERT(srdp->srd_refcnt >= 0); 13806 sfmmup->sfmmu_srdp = srdp; 13807 atomic_add_32( 13808 (volatile uint_t *)&srdp->srd_refcnt, 1); 13809 mutex_exit(&srd_buckets[hash].srdb_lock); 13810 return; 13811 } 13812 } 13813 mutex_exit(&srd_buckets[hash].srdb_lock); 13814 } 13815 newsrdp = kmem_cache_alloc(srd_cache, KM_SLEEP); 13816 ASSERT(newsrdp->srd_next_ismrid == 0 && newsrdp->srd_next_hmerid == 0); 13817 13818 newsrdp->srd_evp = evp; 13819 newsrdp->srd_refcnt = 1; 13820 newsrdp->srd_hmergnfree = NULL; 13821 newsrdp->srd_ismrgnfree = NULL; 13822 13823 mutex_enter(&srd_buckets[hash].srdb_lock); 13824 for (srdp = srd_buckets[hash].srdb_srdp; srdp != NULL; 13825 srdp = srdp->srd_hash) { 13826 if (srdp->srd_evp == evp) { 13827 ASSERT(srdp->srd_refcnt >= 0); 13828 sfmmup->sfmmu_srdp = srdp; 13829 atomic_add_32((volatile uint_t *)&srdp->srd_refcnt, 1); 13830 mutex_exit(&srd_buckets[hash].srdb_lock); 13831 kmem_cache_free(srd_cache, newsrdp); 13832 return; 13833 } 13834 } 13835 newsrdp->srd_hash = srd_buckets[hash].srdb_srdp; 13836 srd_buckets[hash].srdb_srdp = newsrdp; 13837 sfmmup->sfmmu_srdp = newsrdp; 13838 13839 mutex_exit(&srd_buckets[hash].srdb_lock); 13840 13841 } 13842 13843 static void 13844 sfmmu_leave_srd(sfmmu_t *sfmmup) 13845 { 13846 vnode_t *evp; 13847 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 13848 uint_t hash; 13849 sf_srd_t **prev_srdpp; 13850 sf_region_t *rgnp; 13851 sf_region_t *nrgnp; 13852 #ifdef DEBUG 13853 int rgns = 0; 13854 #endif 13855 int i; 13856 13857 ASSERT(sfmmup != ksfmmup); 13858 ASSERT(srdp != NULL); 13859 ASSERT(srdp->srd_refcnt > 0); 13860 ASSERT(sfmmup->sfmmu_scdp == NULL); 13861 ASSERT(sfmmup->sfmmu_free == 1); 13862 13863 sfmmup->sfmmu_srdp = NULL; 13864 evp = srdp->srd_evp; 13865 ASSERT(evp != NULL); 13866 if (atomic_add_32_nv( 13867 (volatile uint_t *)&srdp->srd_refcnt, -1)) { 13868 VN_RELE(evp); 13869 return; 13870 } 13871 13872 hash = SRD_HASH_FUNCTION(evp); 13873 mutex_enter(&srd_buckets[hash].srdb_lock); 13874 for (prev_srdpp = &srd_buckets[hash].srdb_srdp; 13875 (srdp = *prev_srdpp) != NULL; prev_srdpp = &srdp->srd_hash) { 13876 if (srdp->srd_evp == evp) { 13877 break; 13878 } 13879 } 13880 if (srdp == NULL || srdp->srd_refcnt) { 13881 mutex_exit(&srd_buckets[hash].srdb_lock); 13882 VN_RELE(evp); 13883 return; 13884 } 13885 *prev_srdpp = srdp->srd_hash; 13886 mutex_exit(&srd_buckets[hash].srdb_lock); 13887 13888 ASSERT(srdp->srd_refcnt == 0); 13889 VN_RELE(evp); 13890 13891 #ifdef DEBUG 13892 for (i = 0; i < SFMMU_MAX_REGION_BUCKETS; i++) { 13893 ASSERT(srdp->srd_rgnhash[i] == NULL); 13894 } 13895 #endif /* DEBUG */ 13896 13897 /* free each hme regions in the srd */ 13898 for (rgnp = srdp->srd_hmergnfree; rgnp != NULL; rgnp = nrgnp) { 13899 nrgnp = rgnp->rgn_next; 13900 ASSERT(rgnp->rgn_id < srdp->srd_next_hmerid); 13901 ASSERT(rgnp->rgn_refcnt == 0); 13902 ASSERT(rgnp->rgn_sfmmu_head == NULL); 13903 ASSERT(rgnp->rgn_flags & SFMMU_REGION_FREE); 13904 ASSERT(rgnp->rgn_hmeflags == 0); 13905 
ASSERT(srdp->srd_hmergnp[rgnp->rgn_id] == rgnp); 13906 #ifdef DEBUG 13907 for (i = 0; i < MMU_PAGE_SIZES; i++) { 13908 ASSERT(rgnp->rgn_ttecnt[i] == 0); 13909 } 13910 rgns++; 13911 #endif /* DEBUG */ 13912 kmem_cache_free(region_cache, rgnp); 13913 } 13914 ASSERT(rgns == srdp->srd_next_hmerid); 13915 13916 #ifdef DEBUG 13917 rgns = 0; 13918 #endif 13919 /* free each ism rgns in the srd */ 13920 for (rgnp = srdp->srd_ismrgnfree; rgnp != NULL; rgnp = nrgnp) { 13921 nrgnp = rgnp->rgn_next; 13922 ASSERT(rgnp->rgn_id < srdp->srd_next_ismrid); 13923 ASSERT(rgnp->rgn_refcnt == 0); 13924 ASSERT(rgnp->rgn_sfmmu_head == NULL); 13925 ASSERT(rgnp->rgn_flags & SFMMU_REGION_FREE); 13926 ASSERT(srdp->srd_ismrgnp[rgnp->rgn_id] == rgnp); 13927 #ifdef DEBUG 13928 for (i = 0; i < MMU_PAGE_SIZES; i++) { 13929 ASSERT(rgnp->rgn_ttecnt[i] == 0); 13930 } 13931 rgns++; 13932 #endif /* DEBUG */ 13933 kmem_cache_free(region_cache, rgnp); 13934 } 13935 ASSERT(rgns == srdp->srd_next_ismrid); 13936 ASSERT(srdp->srd_ismbusyrgns == 0); 13937 ASSERT(srdp->srd_hmebusyrgns == 0); 13938 13939 srdp->srd_next_ismrid = 0; 13940 srdp->srd_next_hmerid = 0; 13941 13942 bzero((void *)srdp->srd_ismrgnp, 13943 sizeof (sf_region_t *) * SFMMU_MAX_ISM_REGIONS); 13944 bzero((void *)srdp->srd_hmergnp, 13945 sizeof (sf_region_t *) * SFMMU_MAX_HME_REGIONS); 13946 13947 ASSERT(srdp->srd_scdp == NULL); 13948 kmem_cache_free(srd_cache, srdp); 13949 } 13950 13951 /* ARGSUSED */ 13952 static int 13953 sfmmu_srdcache_constructor(void *buf, void *cdrarg, int kmflags) 13954 { 13955 sf_srd_t *srdp = (sf_srd_t *)buf; 13956 bzero(buf, sizeof (*srdp)); 13957 13958 mutex_init(&srdp->srd_mutex, NULL, MUTEX_DEFAULT, NULL); 13959 mutex_init(&srdp->srd_scd_mutex, NULL, MUTEX_DEFAULT, NULL); 13960 return (0); 13961 } 13962 13963 /* ARGSUSED */ 13964 static void 13965 sfmmu_srdcache_destructor(void *buf, void *cdrarg) 13966 { 13967 sf_srd_t *srdp = (sf_srd_t *)buf; 13968 13969 mutex_destroy(&srdp->srd_mutex); 13970 mutex_destroy(&srdp->srd_scd_mutex); 13971 } 13972 13973 /* 13974 * The caller makes sure hat_join_region()/hat_leave_region() can't be called 13975 * at the same time for the same process and address range. This is ensured by 13976 * the fact that address space is locked as writer when a process joins the 13977 * regions. Therefore there's no need to hold an srd lock during the entire 13978 * execution of hat_join_region()/hat_leave_region(). 13979 */ 13980 13981 #define RGN_HASH_FUNCTION(obj) (((((uintptr_t)(obj)) >> 4) ^ \ 13982 (((uintptr_t)(obj)) >> 11)) & \ 13983 srd_rgn_hashmask) 13984 /* 13985 * This routine implements the shared context functionality required when 13986 * attaching a segment to an address space. It must be called from 13987 * hat_share() for D(ISM) segments and from segvn_create() for segments 13988 * with the MAP_PRIVATE and MAP_TEXT flags set. It returns a region_cookie 13989 * which is saved in the private segment data for hme segments and 13990 * the ism_map structure for ism segments. 
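 *
 * The cookie is simply the region id handed back as an opaque value;
 * HAT_INVALID_REGION_COOKIE is returned whenever region sharing does
 * not apply (no SRD, zero length, or an hme request that is not the
 * read-only main text mapping), and a valid cookie is later passed
 * back to hat_dup_region()/hat_leave_region().  A caller-side use
 * looks roughly like this (the names here are illustrative only):
 *
 *	rcookie = hat_join_region(hat, addr, len, vp, off, prot,
 *	    pgszc, NULL, HAT_REGION_TEXT);
 *	if (rcookie == HAT_INVALID_REGION_COOKIE)
 *		fall back to ordinary private translations;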
13991 */ 13992 hat_region_cookie_t 13993 hat_join_region(struct hat *sfmmup, 13994 caddr_t r_saddr, 13995 size_t r_size, 13996 void *r_obj, 13997 u_offset_t r_objoff, 13998 uchar_t r_perm, 13999 uchar_t r_pgszc, 14000 hat_rgn_cb_func_t r_cb_function, 14001 uint_t flags) 14002 { 14003 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 14004 uint_t rhash; 14005 uint_t rid; 14006 hatlock_t *hatlockp; 14007 sf_region_t *rgnp; 14008 sf_region_t *new_rgnp = NULL; 14009 int i; 14010 uint16_t *nextidp; 14011 sf_region_t **freelistp; 14012 int maxids; 14013 sf_region_t **rarrp; 14014 uint16_t *busyrgnsp; 14015 ulong_t rttecnt; 14016 uchar_t tteflag; 14017 uchar_t r_type = flags & HAT_REGION_TYPE_MASK; 14018 int text = (r_type == HAT_REGION_TEXT); 14019 14020 if (srdp == NULL || r_size == 0) { 14021 return (HAT_INVALID_REGION_COOKIE); 14022 } 14023 14024 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 14025 ASSERT(sfmmup != ksfmmup); 14026 ASSERT(AS_WRITE_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 14027 ASSERT(srdp->srd_refcnt > 0); 14028 ASSERT(!(flags & ~HAT_REGION_TYPE_MASK)); 14029 ASSERT(flags == HAT_REGION_TEXT || flags == HAT_REGION_ISM); 14030 ASSERT(r_pgszc < mmu_page_sizes); 14031 if (!IS_P2ALIGNED(r_saddr, TTEBYTES(r_pgszc)) || 14032 !IS_P2ALIGNED(r_size, TTEBYTES(r_pgszc))) { 14033 panic("hat_join_region: region addr or size is not aligned\n"); 14034 } 14035 14036 14037 r_type = (r_type == HAT_REGION_ISM) ? SFMMU_REGION_ISM : 14038 SFMMU_REGION_HME; 14039 /* 14040 * Currently only support shared hmes for the read only main text 14041 * region. 14042 */ 14043 if (r_type == SFMMU_REGION_HME && ((r_obj != srdp->srd_evp) || 14044 (r_perm & PROT_WRITE))) { 14045 return (HAT_INVALID_REGION_COOKIE); 14046 } 14047 14048 rhash = RGN_HASH_FUNCTION(r_obj); 14049 14050 if (r_type == SFMMU_REGION_ISM) { 14051 nextidp = &srdp->srd_next_ismrid; 14052 freelistp = &srdp->srd_ismrgnfree; 14053 maxids = SFMMU_MAX_ISM_REGIONS; 14054 rarrp = srdp->srd_ismrgnp; 14055 busyrgnsp = &srdp->srd_ismbusyrgns; 14056 } else { 14057 nextidp = &srdp->srd_next_hmerid; 14058 freelistp = &srdp->srd_hmergnfree; 14059 maxids = SFMMU_MAX_HME_REGIONS; 14060 rarrp = srdp->srd_hmergnp; 14061 busyrgnsp = &srdp->srd_hmebusyrgns; 14062 } 14063 14064 mutex_enter(&srdp->srd_mutex); 14065 14066 for (rgnp = srdp->srd_rgnhash[rhash]; rgnp != NULL; 14067 rgnp = rgnp->rgn_hash) { 14068 if (rgnp->rgn_saddr == r_saddr && rgnp->rgn_size == r_size && 14069 rgnp->rgn_obj == r_obj && rgnp->rgn_objoff == r_objoff && 14070 rgnp->rgn_perm == r_perm && rgnp->rgn_pgszc == r_pgszc) { 14071 break; 14072 } 14073 } 14074 14075 rfound: 14076 if (rgnp != NULL) { 14077 ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == r_type); 14078 ASSERT(rgnp->rgn_cb_function == r_cb_function); 14079 ASSERT(rgnp->rgn_refcnt >= 0); 14080 rid = rgnp->rgn_id; 14081 ASSERT(rid < maxids); 14082 ASSERT(rarrp[rid] == rgnp); 14083 ASSERT(rid < *nextidp); 14084 atomic_add_32((volatile uint_t *)&rgnp->rgn_refcnt, 1); 14085 mutex_exit(&srdp->srd_mutex); 14086 if (new_rgnp != NULL) { 14087 kmem_cache_free(region_cache, new_rgnp); 14088 } 14089 if (r_type == SFMMU_REGION_HME) { 14090 int myjoin = 14091 (sfmmup == astosfmmu(curthread->t_procp->p_as)); 14092 14093 sfmmu_link_to_hmeregion(sfmmup, rgnp); 14094 /* 14095 * bitmap should be updated after linking sfmmu on 14096 * region list so that pageunload() doesn't skip 14097 * TSB/TLB flush. As soon as bitmap is updated another 14098 * thread in this process can already start accessing 14099 * this region. 
14100 */ 14101 /* 14102 * Normally ttecnt accounting is done as part of 14103 * pagefault handling. But a process may not take any 14104 * pagefaults on shared hmeblks created by some other 14105 * process. To compensate for this assume that the 14106 * entire region will end up faulted in using 14107 * the region's pagesize. 14108 * 14109 */ 14110 if (r_pgszc > TTE8K) { 14111 tteflag = 1 << r_pgszc; 14112 if (disable_large_pages & tteflag) { 14113 tteflag = 0; 14114 } 14115 } else { 14116 tteflag = 0; 14117 } 14118 if (tteflag && !(sfmmup->sfmmu_rtteflags & tteflag)) { 14119 hatlockp = sfmmu_hat_enter(sfmmup); 14120 sfmmup->sfmmu_rtteflags |= tteflag; 14121 sfmmu_hat_exit(hatlockp); 14122 } 14123 hatlockp = sfmmu_hat_enter(sfmmup); 14124 14125 /* 14126 * Preallocate 1/4 of ttecnt's in 8K TSB for >= 4M 14127 * region to allow for large page allocation failure. 14128 */ 14129 if (r_pgszc >= TTE4M) { 14130 sfmmup->sfmmu_tsb0_4minflcnt += 14131 r_size >> (TTE_PAGE_SHIFT(TTE8K) + 2); 14132 } 14133 14134 /* update sfmmu_ttecnt with the shme rgn ttecnt */ 14135 rttecnt = r_size >> TTE_PAGE_SHIFT(r_pgszc); 14136 atomic_add_long(&sfmmup->sfmmu_ttecnt[r_pgszc], 14137 rttecnt); 14138 14139 if (text && r_pgszc >= TTE4M && 14140 (tteflag || ((disable_large_pages >> TTE4M) & 14141 ((1 << (r_pgszc - TTE4M + 1)) - 1))) && 14142 !SFMMU_FLAGS_ISSET(sfmmup, HAT_4MTEXT_FLAG)) { 14143 SFMMU_FLAGS_SET(sfmmup, HAT_4MTEXT_FLAG); 14144 } 14145 14146 sfmmu_hat_exit(hatlockp); 14147 /* 14148 * On Panther we need to make sure TLB is programmed 14149 * to accept 32M/256M pages. Call 14150 * sfmmu_check_page_sizes() now to make sure TLB is 14151 * setup before making hmeregions visible to other 14152 * threads. 14153 */ 14154 sfmmu_check_page_sizes(sfmmup, 1); 14155 hatlockp = sfmmu_hat_enter(sfmmup); 14156 SF_RGNMAP_ADD(sfmmup->sfmmu_hmeregion_map, rid); 14157 14158 /* 14159 * if context is invalid tsb miss exception code will 14160 * call sfmmu_check_page_sizes() and update tsbmiss 14161 * area later. 14162 */ 14163 kpreempt_disable(); 14164 if (myjoin && 14165 (sfmmup->sfmmu_ctxs[CPU_MMU_IDX(CPU)].cnum 14166 != INVALID_CONTEXT)) { 14167 struct tsbmiss *tsbmp; 14168 14169 tsbmp = &tsbmiss_area[CPU->cpu_id]; 14170 ASSERT(sfmmup == tsbmp->usfmmup); 14171 BT_SET(tsbmp->shmermap, rid); 14172 if (r_pgszc > TTE64K) { 14173 tsbmp->uhat_rtteflags |= tteflag; 14174 } 14175 14176 } 14177 kpreempt_enable(); 14178 14179 sfmmu_hat_exit(hatlockp); 14180 ASSERT((hat_region_cookie_t)((uint64_t)rid) != 14181 HAT_INVALID_REGION_COOKIE); 14182 } else { 14183 hatlockp = sfmmu_hat_enter(sfmmup); 14184 SF_RGNMAP_ADD(sfmmup->sfmmu_ismregion_map, rid); 14185 sfmmu_hat_exit(hatlockp); 14186 } 14187 ASSERT(rid < maxids); 14188 14189 if (r_type == SFMMU_REGION_ISM) { 14190 sfmmu_find_scd(sfmmup); 14191 } 14192 return ((hat_region_cookie_t)((uint64_t)rid)); 14193 } 14194 14195 ASSERT(new_rgnp == NULL); 14196 14197 if (*busyrgnsp >= maxids) { 14198 mutex_exit(&srdp->srd_mutex); 14199 return (HAT_INVALID_REGION_COOKIE); 14200 } 14201 14202 ASSERT(MUTEX_HELD(&srdp->srd_mutex)); 14203 if (*freelistp != NULL) { 14204 rgnp = *freelistp; 14205 *freelistp = rgnp->rgn_next; 14206 ASSERT(rgnp->rgn_id < *nextidp); 14207 ASSERT(rgnp->rgn_id < maxids); 14208 ASSERT(rgnp->rgn_flags & SFMMU_REGION_FREE); 14209 ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) 14210 == r_type); 14211 ASSERT(rarrp[rgnp->rgn_id] == rgnp); 14212 ASSERT(rgnp->rgn_hmeflags == 0); 14213 } else { 14214 /* 14215 * release local locks before memory allocation. 
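 * kmem_cache_alloc(..., KM_SLEEP) may block, so srd_mutex is
 * dropped around it.  Once the lock is re-taken the hash chain is
 * searched again; if another thread created a matching region in
 * the meantime we jump back to rfound, where the unused allocation
 * is freed.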
14216 */ 14217 mutex_exit(&srdp->srd_mutex); 14218 14219 new_rgnp = kmem_cache_alloc(region_cache, KM_SLEEP); 14220 14221 mutex_enter(&srdp->srd_mutex); 14222 for (rgnp = srdp->srd_rgnhash[rhash]; rgnp != NULL; 14223 rgnp = rgnp->rgn_hash) { 14224 if (rgnp->rgn_saddr == r_saddr && 14225 rgnp->rgn_size == r_size && 14226 rgnp->rgn_obj == r_obj && 14227 rgnp->rgn_objoff == r_objoff && 14228 rgnp->rgn_perm == r_perm && 14229 rgnp->rgn_pgszc == r_pgszc) { 14230 break; 14231 } 14232 } 14233 if (rgnp != NULL) { 14234 goto rfound; 14235 } 14236 14237 if (*nextidp >= maxids) { 14238 mutex_exit(&srdp->srd_mutex); 14239 goto fail; 14240 } 14241 rgnp = new_rgnp; 14242 new_rgnp = NULL; 14243 rgnp->rgn_id = (*nextidp)++; 14244 ASSERT(rgnp->rgn_id < maxids); 14245 ASSERT(rarrp[rgnp->rgn_id] == NULL); 14246 rarrp[rgnp->rgn_id] = rgnp; 14247 } 14248 14249 ASSERT(rgnp->rgn_sfmmu_head == NULL); 14250 ASSERT(rgnp->rgn_hmeflags == 0); 14251 #ifdef DEBUG 14252 for (i = 0; i < MMU_PAGE_SIZES; i++) { 14253 ASSERT(rgnp->rgn_ttecnt[i] == 0); 14254 } 14255 #endif 14256 rgnp->rgn_saddr = r_saddr; 14257 rgnp->rgn_size = r_size; 14258 rgnp->rgn_obj = r_obj; 14259 rgnp->rgn_objoff = r_objoff; 14260 rgnp->rgn_perm = r_perm; 14261 rgnp->rgn_pgszc = r_pgszc; 14262 rgnp->rgn_flags = r_type; 14263 rgnp->rgn_refcnt = 0; 14264 rgnp->rgn_cb_function = r_cb_function; 14265 rgnp->rgn_hash = srdp->srd_rgnhash[rhash]; 14266 srdp->srd_rgnhash[rhash] = rgnp; 14267 (*busyrgnsp)++; 14268 ASSERT(*busyrgnsp <= maxids); 14269 goto rfound; 14270 14271 fail: 14272 ASSERT(new_rgnp != NULL); 14273 kmem_cache_free(region_cache, new_rgnp); 14274 return (HAT_INVALID_REGION_COOKIE); 14275 } 14276 14277 /* 14278 * This function implements the shared context functionality required 14279 * when detaching a segment from an address space. It must be called 14280 * from hat_unshare() for all D(ISM) segments and from segvn_unmap(), 14281 * for segments with a valid region_cookie. 14282 * It will also be called from all seg_vn routines which change a 14283 * segment's attributes such as segvn_setprot(), segvn_setpagesize(), 14284 * segvn_clrszc() & segvn_advise(), as well as in the case of COW fault 14285 * from segvn_fault(). 14286 */ 14287 void 14288 hat_leave_region(struct hat *sfmmup, hat_region_cookie_t rcookie, uint_t flags) 14289 { 14290 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 14291 sf_scd_t *scdp; 14292 uint_t rhash; 14293 uint_t rid = (uint_t)((uint64_t)rcookie); 14294 hatlock_t *hatlockp = NULL; 14295 sf_region_t *rgnp; 14296 sf_region_t **prev_rgnpp; 14297 sf_region_t *cur_rgnp; 14298 void *r_obj; 14299 int i; 14300 caddr_t r_saddr; 14301 caddr_t r_eaddr; 14302 size_t r_size; 14303 uchar_t r_pgszc; 14304 uchar_t r_type = flags & HAT_REGION_TYPE_MASK; 14305 14306 ASSERT(sfmmup != ksfmmup); 14307 ASSERT(srdp != NULL); 14308 ASSERT(srdp->srd_refcnt > 0); 14309 ASSERT(!(flags & ~HAT_REGION_TYPE_MASK)); 14310 ASSERT(flags == HAT_REGION_TEXT || flags == HAT_REGION_ISM); 14311 ASSERT(!sfmmup->sfmmu_free || sfmmup->sfmmu_scdp == NULL); 14312 14313 r_type = (r_type == HAT_REGION_ISM) ? 
SFMMU_REGION_ISM : 14314 SFMMU_REGION_HME; 14315 14316 if (r_type == SFMMU_REGION_ISM) { 14317 ASSERT(SFMMU_IS_ISMRID_VALID(rid)); 14318 ASSERT(rid < SFMMU_MAX_ISM_REGIONS); 14319 rgnp = srdp->srd_ismrgnp[rid]; 14320 } else { 14321 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 14322 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 14323 rgnp = srdp->srd_hmergnp[rid]; 14324 } 14325 ASSERT(rgnp != NULL); 14326 ASSERT(rgnp->rgn_id == rid); 14327 ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == r_type); 14328 ASSERT(!(rgnp->rgn_flags & SFMMU_REGION_FREE)); 14329 ASSERT(AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 14330 14331 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 14332 if (r_type == SFMMU_REGION_HME && sfmmup->sfmmu_as->a_xhat != NULL) { 14333 xhat_unload_callback_all(sfmmup->sfmmu_as, rgnp->rgn_saddr, 14334 rgnp->rgn_size, 0, NULL); 14335 } 14336 14337 if (sfmmup->sfmmu_free) { 14338 ulong_t rttecnt; 14339 r_pgszc = rgnp->rgn_pgszc; 14340 r_size = rgnp->rgn_size; 14341 14342 ASSERT(sfmmup->sfmmu_scdp == NULL); 14343 if (r_type == SFMMU_REGION_ISM) { 14344 SF_RGNMAP_DEL(sfmmup->sfmmu_ismregion_map, rid); 14345 } else { 14346 /* update shme rgns ttecnt in sfmmu_ttecnt */ 14347 rttecnt = r_size >> TTE_PAGE_SHIFT(r_pgszc); 14348 ASSERT(sfmmup->sfmmu_ttecnt[r_pgszc] >= rttecnt); 14349 14350 atomic_add_long(&sfmmup->sfmmu_ttecnt[r_pgszc], 14351 -rttecnt); 14352 14353 SF_RGNMAP_DEL(sfmmup->sfmmu_hmeregion_map, rid); 14354 } 14355 } else if (r_type == SFMMU_REGION_ISM) { 14356 hatlockp = sfmmu_hat_enter(sfmmup); 14357 ASSERT(rid < srdp->srd_next_ismrid); 14358 SF_RGNMAP_DEL(sfmmup->sfmmu_ismregion_map, rid); 14359 scdp = sfmmup->sfmmu_scdp; 14360 if (scdp != NULL && 14361 SF_RGNMAP_TEST(scdp->scd_ismregion_map, rid)) { 14362 sfmmu_leave_scd(sfmmup, r_type); 14363 ASSERT(sfmmu_hat_lock_held(sfmmup)); 14364 } 14365 sfmmu_hat_exit(hatlockp); 14366 } else { 14367 ulong_t rttecnt; 14368 r_pgszc = rgnp->rgn_pgszc; 14369 r_saddr = rgnp->rgn_saddr; 14370 r_size = rgnp->rgn_size; 14371 r_eaddr = r_saddr + r_size; 14372 14373 ASSERT(r_type == SFMMU_REGION_HME); 14374 hatlockp = sfmmu_hat_enter(sfmmup); 14375 ASSERT(rid < srdp->srd_next_hmerid); 14376 SF_RGNMAP_DEL(sfmmup->sfmmu_hmeregion_map, rid); 14377 14378 /* 14379 * If region is part of an SCD call sfmmu_leave_scd(). 14380 * Otherwise if process is not exiting and has valid context 14381 * just drop the context on the floor to lose stale TLB 14382 * entries and force the update of tsb miss area to reflect 14383 * the new region map. After that clean our TSB entries. 14384 */ 14385 scdp = sfmmup->sfmmu_scdp; 14386 if (scdp != NULL && 14387 SF_RGNMAP_TEST(scdp->scd_hmeregion_map, rid)) { 14388 sfmmu_leave_scd(sfmmup, r_type); 14389 ASSERT(sfmmu_hat_lock_held(sfmmup)); 14390 } 14391 sfmmu_invalidate_ctx(sfmmup); 14392 14393 i = TTE8K; 14394 while (i < mmu_page_sizes) { 14395 if (rgnp->rgn_ttecnt[i] != 0) { 14396 sfmmu_unload_tsb_range(sfmmup, r_saddr, 14397 r_eaddr, i); 14398 if (i < TTE4M) { 14399 i = TTE4M; 14400 continue; 14401 } else { 14402 break; 14403 } 14404 } 14405 i++; 14406 } 14407 /* Remove the preallocated 1/4 8k ttecnt for 4M regions. 
*/ 14408 if (r_pgszc >= TTE4M) { 14409 rttecnt = r_size >> (TTE_PAGE_SHIFT(TTE8K) + 2); 14410 ASSERT(sfmmup->sfmmu_tsb0_4minflcnt >= 14411 rttecnt); 14412 sfmmup->sfmmu_tsb0_4minflcnt -= rttecnt; 14413 } 14414 14415 /* update shme rgns ttecnt in sfmmu_ttecnt */ 14416 rttecnt = r_size >> TTE_PAGE_SHIFT(r_pgszc); 14417 ASSERT(sfmmup->sfmmu_ttecnt[r_pgszc] >= rttecnt); 14418 atomic_add_long(&sfmmup->sfmmu_ttecnt[r_pgszc], -rttecnt); 14419 14420 sfmmu_hat_exit(hatlockp); 14421 if (scdp != NULL && sfmmup->sfmmu_scdp == NULL) { 14422 /* sfmmup left the scd, grow private tsb */ 14423 sfmmu_check_page_sizes(sfmmup, 1); 14424 } else { 14425 sfmmu_check_page_sizes(sfmmup, 0); 14426 } 14427 } 14428 14429 if (r_type == SFMMU_REGION_HME) { 14430 sfmmu_unlink_from_hmeregion(sfmmup, rgnp); 14431 } 14432 14433 r_obj = rgnp->rgn_obj; 14434 if (atomic_add_32_nv((volatile uint_t *)&rgnp->rgn_refcnt, -1)) { 14435 return; 14436 } 14437 14438 /* 14439 * looks like nobody uses this region anymore. Free it. 14440 */ 14441 rhash = RGN_HASH_FUNCTION(r_obj); 14442 mutex_enter(&srdp->srd_mutex); 14443 for (prev_rgnpp = &srdp->srd_rgnhash[rhash]; 14444 (cur_rgnp = *prev_rgnpp) != NULL; 14445 prev_rgnpp = &cur_rgnp->rgn_hash) { 14446 if (cur_rgnp == rgnp && cur_rgnp->rgn_refcnt == 0) { 14447 break; 14448 } 14449 } 14450 14451 if (cur_rgnp == NULL) { 14452 mutex_exit(&srdp->srd_mutex); 14453 return; 14454 } 14455 14456 ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == r_type); 14457 *prev_rgnpp = rgnp->rgn_hash; 14458 if (r_type == SFMMU_REGION_ISM) { 14459 rgnp->rgn_flags |= SFMMU_REGION_FREE; 14460 ASSERT(rid < srdp->srd_next_ismrid); 14461 rgnp->rgn_next = srdp->srd_ismrgnfree; 14462 srdp->srd_ismrgnfree = rgnp; 14463 ASSERT(srdp->srd_ismbusyrgns > 0); 14464 srdp->srd_ismbusyrgns--; 14465 mutex_exit(&srdp->srd_mutex); 14466 return; 14467 } 14468 mutex_exit(&srdp->srd_mutex); 14469 14470 /* 14471 * Destroy region's hmeblks. 14472 */ 14473 sfmmu_unload_hmeregion(srdp, rgnp); 14474 14475 rgnp->rgn_hmeflags = 0; 14476 14477 ASSERT(rgnp->rgn_sfmmu_head == NULL); 14478 ASSERT(rgnp->rgn_id == rid); 14479 for (i = 0; i < MMU_PAGE_SIZES; i++) { 14480 rgnp->rgn_ttecnt[i] = 0; 14481 } 14482 rgnp->rgn_flags |= SFMMU_REGION_FREE; 14483 mutex_enter(&srdp->srd_mutex); 14484 ASSERT(rid < srdp->srd_next_hmerid); 14485 rgnp->rgn_next = srdp->srd_hmergnfree; 14486 srdp->srd_hmergnfree = rgnp; 14487 ASSERT(srdp->srd_hmebusyrgns > 0); 14488 srdp->srd_hmebusyrgns--; 14489 mutex_exit(&srdp->srd_mutex); 14490 } 14491 14492 /* 14493 * For now only called for hmeblk regions and not for ISM regions. 
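 * It is used when an address space is duplicated (e.g. from
 * segvn_dup() on fork) so that the child hat inherits the parent's
 * shared text region: the region reference count is bumped, the
 * child is linked onto rgn_sfmmu_head, and the region's ttecnt and
 * tsb0 inflation are pre-accounted just as hat_join_region() does
 * for a first-time joiner.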
14494 */ 14495 void 14496 hat_dup_region(struct hat *sfmmup, hat_region_cookie_t rcookie) 14497 { 14498 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 14499 uint_t rid = (uint_t)((uint64_t)rcookie); 14500 sf_region_t *rgnp; 14501 sf_rgn_link_t *rlink; 14502 sf_rgn_link_t *hrlink; 14503 ulong_t rttecnt; 14504 14505 ASSERT(sfmmup != ksfmmup); 14506 ASSERT(srdp != NULL); 14507 ASSERT(srdp->srd_refcnt > 0); 14508 14509 ASSERT(rid < srdp->srd_next_hmerid); 14510 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 14511 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 14512 14513 rgnp = srdp->srd_hmergnp[rid]; 14514 ASSERT(rgnp->rgn_refcnt > 0); 14515 ASSERT(rgnp->rgn_id == rid); 14516 ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == SFMMU_REGION_HME); 14517 ASSERT(!(rgnp->rgn_flags & SFMMU_REGION_FREE)); 14518 14519 atomic_add_32((volatile uint_t *)&rgnp->rgn_refcnt, 1); 14520 14521 /* LINTED: constant in conditional context */ 14522 SFMMU_HMERID2RLINKP(sfmmup, rid, rlink, 1, 0); 14523 ASSERT(rlink != NULL); 14524 mutex_enter(&rgnp->rgn_mutex); 14525 ASSERT(rgnp->rgn_sfmmu_head != NULL); 14526 /* LINTED: constant in conditional context */ 14527 SFMMU_HMERID2RLINKP(rgnp->rgn_sfmmu_head, rid, hrlink, 0, 0); 14528 ASSERT(hrlink != NULL); 14529 ASSERT(hrlink->prev == NULL); 14530 rlink->next = rgnp->rgn_sfmmu_head; 14531 rlink->prev = NULL; 14532 hrlink->prev = sfmmup; 14533 /* 14534 * make sure rlink's next field is correct 14535 * before making this link visible. 14536 */ 14537 membar_stst(); 14538 rgnp->rgn_sfmmu_head = sfmmup; 14539 mutex_exit(&rgnp->rgn_mutex); 14540 14541 /* update sfmmu_ttecnt with the shme rgn ttecnt */ 14542 rttecnt = rgnp->rgn_size >> TTE_PAGE_SHIFT(rgnp->rgn_pgszc); 14543 atomic_add_long(&sfmmup->sfmmu_ttecnt[rgnp->rgn_pgszc], rttecnt); 14544 /* update tsb0 inflation count */ 14545 if (rgnp->rgn_pgszc >= TTE4M) { 14546 sfmmup->sfmmu_tsb0_4minflcnt += 14547 rgnp->rgn_size >> (TTE_PAGE_SHIFT(TTE8K) + 2); 14548 } 14549 /* 14550 * Update regionid bitmask without hat lock since no other thread 14551 * can update this region bitmask right now. 
14552 */ 14553 SF_RGNMAP_ADD(sfmmup->sfmmu_hmeregion_map, rid); 14554 } 14555 14556 /* ARGSUSED */ 14557 static int 14558 sfmmu_rgncache_constructor(void *buf, void *cdrarg, int kmflags) 14559 { 14560 sf_region_t *rgnp = (sf_region_t *)buf; 14561 bzero(buf, sizeof (*rgnp)); 14562 14563 mutex_init(&rgnp->rgn_mutex, NULL, MUTEX_DEFAULT, NULL); 14564 14565 return (0); 14566 } 14567 14568 /* ARGSUSED */ 14569 static void 14570 sfmmu_rgncache_destructor(void *buf, void *cdrarg) 14571 { 14572 sf_region_t *rgnp = (sf_region_t *)buf; 14573 mutex_destroy(&rgnp->rgn_mutex); 14574 } 14575 14576 static int 14577 sfrgnmap_isnull(sf_region_map_t *map) 14578 { 14579 int i; 14580 14581 for (i = 0; i < SFMMU_RGNMAP_WORDS; i++) { 14582 if (map->bitmap[i] != 0) { 14583 return (0); 14584 } 14585 } 14586 return (1); 14587 } 14588 14589 static int 14590 sfhmergnmap_isnull(sf_hmeregion_map_t *map) 14591 { 14592 int i; 14593 14594 for (i = 0; i < SFMMU_HMERGNMAP_WORDS; i++) { 14595 if (map->bitmap[i] != 0) { 14596 return (0); 14597 } 14598 } 14599 return (1); 14600 } 14601 14602 #ifdef DEBUG 14603 static void 14604 check_scd_sfmmu_list(sfmmu_t **headp, sfmmu_t *sfmmup, int onlist) 14605 { 14606 sfmmu_t *sp; 14607 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 14608 14609 for (sp = *headp; sp != NULL; sp = sp->sfmmu_scd_link.next) { 14610 ASSERT(srdp == sp->sfmmu_srdp); 14611 if (sp == sfmmup) { 14612 if (onlist) { 14613 return; 14614 } else { 14615 panic("shctx: sfmmu 0x%p found on scd" 14616 "list 0x%p", (void *)sfmmup, 14617 (void *)*headp); 14618 } 14619 } 14620 } 14621 if (onlist) { 14622 panic("shctx: sfmmu 0x%p not found on scd list 0x%p", 14623 (void *)sfmmup, (void *)*headp); 14624 } else { 14625 return; 14626 } 14627 } 14628 #else /* DEBUG */ 14629 #define check_scd_sfmmu_list(headp, sfmmup, onlist) 14630 #endif /* DEBUG */ 14631 14632 /* 14633 * Removes an sfmmu from the SCD sfmmu list. 14634 */ 14635 static void 14636 sfmmu_from_scd_list(sfmmu_t **headp, sfmmu_t *sfmmup) 14637 { 14638 ASSERT(sfmmup->sfmmu_srdp != NULL); 14639 check_scd_sfmmu_list(headp, sfmmup, 1); 14640 if (sfmmup->sfmmu_scd_link.prev != NULL) { 14641 ASSERT(*headp != sfmmup); 14642 sfmmup->sfmmu_scd_link.prev->sfmmu_scd_link.next = 14643 sfmmup->sfmmu_scd_link.next; 14644 } else { 14645 ASSERT(*headp == sfmmup); 14646 *headp = sfmmup->sfmmu_scd_link.next; 14647 } 14648 if (sfmmup->sfmmu_scd_link.next != NULL) { 14649 sfmmup->sfmmu_scd_link.next->sfmmu_scd_link.prev = 14650 sfmmup->sfmmu_scd_link.prev; 14651 } 14652 } 14653 14654 14655 /* 14656 * Adds an sfmmu to the start of the queue. 14657 */ 14658 static void 14659 sfmmu_to_scd_list(sfmmu_t **headp, sfmmu_t *sfmmup) 14660 { 14661 check_scd_sfmmu_list(headp, sfmmup, 0); 14662 sfmmup->sfmmu_scd_link.prev = NULL; 14663 sfmmup->sfmmu_scd_link.next = *headp; 14664 if (*headp != NULL) 14665 (*headp)->sfmmu_scd_link.prev = sfmmup; 14666 *headp = sfmmup; 14667 } 14668 14669 /* 14670 * Remove an scd from the start of the queue. 14671 */ 14672 static void 14673 sfmmu_remove_scd(sf_scd_t **headp, sf_scd_t *scdp) 14674 { 14675 if (scdp->scd_prev != NULL) { 14676 ASSERT(*headp != scdp); 14677 scdp->scd_prev->scd_next = scdp->scd_next; 14678 } else { 14679 ASSERT(*headp == scdp); 14680 *headp = scdp->scd_next; 14681 } 14682 14683 if (scdp->scd_next != NULL) { 14684 scdp->scd_next->scd_prev = scdp->scd_prev; 14685 } 14686 } 14687 14688 /* 14689 * Add an scd to the start of the queue. 
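 * scd_prev/scd_next form a doubly linked list headed by
 * srdp->srd_scdp; insertion here and removal in sfmmu_remove_scd()
 * are both O(1) and are done while holding srd_scd_mutex.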
14690 */ 14691 static void 14692 sfmmu_add_scd(sf_scd_t **headp, sf_scd_t *scdp) 14693 { 14694 scdp->scd_prev = NULL; 14695 scdp->scd_next = *headp; 14696 if (*headp != NULL) { 14697 (*headp)->scd_prev = scdp; 14698 } 14699 *headp = scdp; 14700 } 14701 14702 static int 14703 sfmmu_alloc_scd_tsbs(sf_srd_t *srdp, sf_scd_t *scdp) 14704 { 14705 uint_t rid; 14706 uint_t i; 14707 uint_t j; 14708 ulong_t w; 14709 sf_region_t *rgnp; 14710 ulong_t tte8k_cnt = 0; 14711 ulong_t tte4m_cnt = 0; 14712 uint_t tsb_szc; 14713 sfmmu_t *scsfmmup = scdp->scd_sfmmup; 14714 sfmmu_t *ism_hatid; 14715 struct tsb_info *newtsb; 14716 int szc; 14717 14718 ASSERT(srdp != NULL); 14719 14720 for (i = 0; i < SFMMU_RGNMAP_WORDS; i++) { 14721 if ((w = scdp->scd_region_map.bitmap[i]) == 0) { 14722 continue; 14723 } 14724 j = 0; 14725 while (w) { 14726 if (!(w & 0x1)) { 14727 j++; 14728 w >>= 1; 14729 continue; 14730 } 14731 rid = (i << BT_ULSHIFT) | j; 14732 j++; 14733 w >>= 1; 14734 14735 if (rid < SFMMU_MAX_HME_REGIONS) { 14736 rgnp = srdp->srd_hmergnp[rid]; 14737 ASSERT(rgnp->rgn_id == rid); 14738 ASSERT(rgnp->rgn_refcnt > 0); 14739 14740 if (rgnp->rgn_pgszc < TTE4M) { 14741 tte8k_cnt += rgnp->rgn_size >> 14742 TTE_PAGE_SHIFT(TTE8K); 14743 } else { 14744 ASSERT(rgnp->rgn_pgszc >= TTE4M); 14745 tte4m_cnt += rgnp->rgn_size >> 14746 TTE_PAGE_SHIFT(TTE4M); 14747 /* 14748 * Inflate SCD tsb0 by preallocating 14749 * 1/4 8k ttecnt for 4M regions to 14750 * allow for lgpg alloc failure. 14751 */ 14752 tte8k_cnt += rgnp->rgn_size >> 14753 (TTE_PAGE_SHIFT(TTE8K) + 2); 14754 } 14755 } else { 14756 rid -= SFMMU_MAX_HME_REGIONS; 14757 rgnp = srdp->srd_ismrgnp[rid]; 14758 ASSERT(rgnp->rgn_id == rid); 14759 ASSERT(rgnp->rgn_refcnt > 0); 14760 14761 ism_hatid = (sfmmu_t *)rgnp->rgn_obj; 14762 ASSERT(ism_hatid->sfmmu_ismhat); 14763 14764 for (szc = 0; szc < TTE4M; szc++) { 14765 tte8k_cnt += 14766 ism_hatid->sfmmu_ttecnt[szc] << 14767 TTE_BSZS_SHIFT(szc); 14768 } 14769 14770 ASSERT(rgnp->rgn_pgszc >= TTE4M); 14771 if (rgnp->rgn_pgszc >= TTE4M) { 14772 tte4m_cnt += rgnp->rgn_size >> 14773 TTE_PAGE_SHIFT(TTE4M); 14774 } 14775 } 14776 } 14777 } 14778 14779 tsb_szc = SELECT_TSB_SIZECODE(tte8k_cnt); 14780 14781 /* Allocate both the SCD TSBs here. */ 14782 if (sfmmu_tsbinfo_alloc(&scsfmmup->sfmmu_tsb, 14783 tsb_szc, TSB8K|TSB64K|TSB512K, TSB_ALLOC, scsfmmup) && 14784 (tsb_szc <= TSB_4M_SZCODE || 14785 sfmmu_tsbinfo_alloc(&scsfmmup->sfmmu_tsb, 14786 TSB_4M_SZCODE, TSB8K|TSB64K|TSB512K, 14787 TSB_ALLOC, scsfmmup))) { 14788 14789 SFMMU_STAT(sf_scd_1sttsb_allocfail); 14790 return (TSB_ALLOCFAIL); 14791 } else { 14792 scsfmmup->sfmmu_tsb->tsb_flags |= TSB_SHAREDCTX; 14793 14794 if (tte4m_cnt) { 14795 tsb_szc = SELECT_TSB_SIZECODE(tte4m_cnt); 14796 if (sfmmu_tsbinfo_alloc(&newtsb, tsb_szc, 14797 TSB4M|TSB32M|TSB256M, TSB_ALLOC, scsfmmup) && 14798 (tsb_szc <= TSB_4M_SZCODE || 14799 sfmmu_tsbinfo_alloc(&newtsb, TSB_4M_SZCODE, 14800 TSB4M|TSB32M|TSB256M, 14801 TSB_ALLOC, scsfmmup))) { 14802 /* 14803 * If we fail to allocate the 2nd shared tsb, 14804 * just free the 1st tsb, return failure. 
14805 */ 14806 sfmmu_tsbinfo_free(scsfmmup->sfmmu_tsb); 14807 SFMMU_STAT(sf_scd_2ndtsb_allocfail); 14808 return (TSB_ALLOCFAIL); 14809 } else { 14810 ASSERT(scsfmmup->sfmmu_tsb->tsb_next == NULL); 14811 newtsb->tsb_flags |= TSB_SHAREDCTX; 14812 scsfmmup->sfmmu_tsb->tsb_next = newtsb; 14813 SFMMU_STAT(sf_scd_2ndtsb_alloc); 14814 } 14815 } 14816 SFMMU_STAT(sf_scd_1sttsb_alloc); 14817 } 14818 return (TSB_SUCCESS); 14819 } 14820 14821 static void 14822 sfmmu_free_scd_tsbs(sfmmu_t *scd_sfmmu) 14823 { 14824 while (scd_sfmmu->sfmmu_tsb != NULL) { 14825 struct tsb_info *next = scd_sfmmu->sfmmu_tsb->tsb_next; 14826 sfmmu_tsbinfo_free(scd_sfmmu->sfmmu_tsb); 14827 scd_sfmmu->sfmmu_tsb = next; 14828 } 14829 } 14830 14831 /* 14832 * Link the sfmmu onto the hme region list. 14833 */ 14834 void 14835 sfmmu_link_to_hmeregion(sfmmu_t *sfmmup, sf_region_t *rgnp) 14836 { 14837 uint_t rid; 14838 sf_rgn_link_t *rlink; 14839 sfmmu_t *head; 14840 sf_rgn_link_t *hrlink; 14841 14842 rid = rgnp->rgn_id; 14843 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 14844 14845 /* LINTED: constant in conditional context */ 14846 SFMMU_HMERID2RLINKP(sfmmup, rid, rlink, 1, 1); 14847 ASSERT(rlink != NULL); 14848 mutex_enter(&rgnp->rgn_mutex); 14849 if ((head = rgnp->rgn_sfmmu_head) == NULL) { 14850 rlink->next = NULL; 14851 rlink->prev = NULL; 14852 /* 14853 * make sure rlink's next field is NULL 14854 * before making this link visible. 14855 */ 14856 membar_stst(); 14857 rgnp->rgn_sfmmu_head = sfmmup; 14858 } else { 14859 /* LINTED: constant in conditional context */ 14860 SFMMU_HMERID2RLINKP(head, rid, hrlink, 0, 0); 14861 ASSERT(hrlink != NULL); 14862 ASSERT(hrlink->prev == NULL); 14863 rlink->next = head; 14864 rlink->prev = NULL; 14865 hrlink->prev = sfmmup; 14866 /* 14867 * make sure rlink's next field is correct 14868 * before making this link visible. 14869 */ 14870 membar_stst(); 14871 rgnp->rgn_sfmmu_head = sfmmup; 14872 } 14873 mutex_exit(&rgnp->rgn_mutex); 14874 } 14875 14876 /* 14877 * Unlink the sfmmu from the hme region list. 14878 */ 14879 void 14880 sfmmu_unlink_from_hmeregion(sfmmu_t *sfmmup, sf_region_t *rgnp) 14881 { 14882 uint_t rid; 14883 sf_rgn_link_t *rlink; 14884 14885 rid = rgnp->rgn_id; 14886 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 14887 14888 /* LINTED: constant in conditional context */ 14889 SFMMU_HMERID2RLINKP(sfmmup, rid, rlink, 0, 0); 14890 ASSERT(rlink != NULL); 14891 mutex_enter(&rgnp->rgn_mutex); 14892 if (rgnp->rgn_sfmmu_head == sfmmup) { 14893 sfmmu_t *next = rlink->next; 14894 rgnp->rgn_sfmmu_head = next; 14895 /* 14896 * if we are stopped by xc_attention() after this 14897 * point the forward link walking in 14898 * sfmmu_rgntlb_demap() will work correctly since the 14899 * head correctly points to the next element. 
14900 */ 14901 membar_stst(); 14902 rlink->next = NULL; 14903 ASSERT(rlink->prev == NULL); 14904 if (next != NULL) { 14905 sf_rgn_link_t *nrlink; 14906 /* LINTED: constant in conditional context */ 14907 SFMMU_HMERID2RLINKP(next, rid, nrlink, 0, 0); 14908 ASSERT(nrlink != NULL); 14909 ASSERT(nrlink->prev == sfmmup); 14910 nrlink->prev = NULL; 14911 } 14912 } else { 14913 sfmmu_t *next = rlink->next; 14914 sfmmu_t *prev = rlink->prev; 14915 sf_rgn_link_t *prlink; 14916 14917 ASSERT(prev != NULL); 14918 /* LINTED: constant in conditional context */ 14919 SFMMU_HMERID2RLINKP(prev, rid, prlink, 0, 0); 14920 ASSERT(prlink != NULL); 14921 ASSERT(prlink->next == sfmmup); 14922 prlink->next = next; 14923 /* 14924 * if we are stopped by xc_attention() 14925 * after this point the forward link walking 14926 * will work correctly since the prev element 14927 * correctly points to the next element. 14928 */ 14929 membar_stst(); 14930 rlink->next = NULL; 14931 rlink->prev = NULL; 14932 if (next != NULL) { 14933 sf_rgn_link_t *nrlink; 14934 /* LINTED: constant in conditional context */ 14935 SFMMU_HMERID2RLINKP(next, rid, nrlink, 0, 0); 14936 ASSERT(nrlink != NULL); 14937 ASSERT(nrlink->prev == sfmmup); 14938 nrlink->prev = prev; 14939 } 14940 } 14941 mutex_exit(&rgnp->rgn_mutex); 14942 } 14943 14944 /* 14945 * Link scd sfmmu onto ism or hme region list for each region in the 14946 * scd region map. 14947 */ 14948 void 14949 sfmmu_link_scd_to_regions(sf_srd_t *srdp, sf_scd_t *scdp) 14950 { 14951 uint_t rid; 14952 uint_t i; 14953 uint_t j; 14954 ulong_t w; 14955 sf_region_t *rgnp; 14956 sfmmu_t *scsfmmup; 14957 14958 scsfmmup = scdp->scd_sfmmup; 14959 ASSERT(scsfmmup->sfmmu_scdhat); 14960 for (i = 0; i < SFMMU_RGNMAP_WORDS; i++) { 14961 if ((w = scdp->scd_region_map.bitmap[i]) == 0) { 14962 continue; 14963 } 14964 j = 0; 14965 while (w) { 14966 if (!(w & 0x1)) { 14967 j++; 14968 w >>= 1; 14969 continue; 14970 } 14971 rid = (i << BT_ULSHIFT) | j; 14972 j++; 14973 w >>= 1; 14974 14975 if (rid < SFMMU_MAX_HME_REGIONS) { 14976 rgnp = srdp->srd_hmergnp[rid]; 14977 ASSERT(rgnp->rgn_id == rid); 14978 ASSERT(rgnp->rgn_refcnt > 0); 14979 sfmmu_link_to_hmeregion(scsfmmup, rgnp); 14980 } else { 14981 sfmmu_t *ism_hatid = NULL; 14982 ism_ment_t *ism_ment; 14983 rid -= SFMMU_MAX_HME_REGIONS; 14984 rgnp = srdp->srd_ismrgnp[rid]; 14985 ASSERT(rgnp->rgn_id == rid); 14986 ASSERT(rgnp->rgn_refcnt > 0); 14987 14988 ism_hatid = (sfmmu_t *)rgnp->rgn_obj; 14989 ASSERT(ism_hatid->sfmmu_ismhat); 14990 ism_ment = &scdp->scd_ism_links[rid]; 14991 ism_ment->iment_hat = scsfmmup; 14992 ism_ment->iment_base_va = rgnp->rgn_saddr; 14993 mutex_enter(&ism_mlist_lock); 14994 iment_add(ism_ment, ism_hatid); 14995 mutex_exit(&ism_mlist_lock); 14996 14997 } 14998 } 14999 } 15000 } 15001 /* 15002 * Unlink scd sfmmu from ism or hme region list for each region in the 15003 * scd region map. 
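 * Each set bit encodes a region id as rid = (i << BT_ULSHIFT) | j;
 * ids below SFMMU_MAX_HME_REGIONS denote hme regions and the rest
 * denote ISM regions once SFMMU_MAX_HME_REGIONS is subtracted.  The
 * walk mirrors sfmmu_link_scd_to_regions() above.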
15004 */ 15005 void 15006 sfmmu_unlink_scd_from_regions(sf_srd_t *srdp, sf_scd_t *scdp) 15007 { 15008 uint_t rid; 15009 uint_t i; 15010 uint_t j; 15011 ulong_t w; 15012 sf_region_t *rgnp; 15013 sfmmu_t *scsfmmup; 15014 15015 scsfmmup = scdp->scd_sfmmup; 15016 for (i = 0; i < SFMMU_RGNMAP_WORDS; i++) { 15017 if ((w = scdp->scd_region_map.bitmap[i]) == 0) { 15018 continue; 15019 } 15020 j = 0; 15021 while (w) { 15022 if (!(w & 0x1)) { 15023 j++; 15024 w >>= 1; 15025 continue; 15026 } 15027 rid = (i << BT_ULSHIFT) | j; 15028 j++; 15029 w >>= 1; 15030 15031 if (rid < SFMMU_MAX_HME_REGIONS) { 15032 rgnp = srdp->srd_hmergnp[rid]; 15033 ASSERT(rgnp->rgn_id == rid); 15034 ASSERT(rgnp->rgn_refcnt > 0); 15035 sfmmu_unlink_from_hmeregion(scsfmmup, 15036 rgnp); 15037 15038 } else { 15039 sfmmu_t *ism_hatid = NULL; 15040 ism_ment_t *ism_ment; 15041 rid -= SFMMU_MAX_HME_REGIONS; 15042 rgnp = srdp->srd_ismrgnp[rid]; 15043 ASSERT(rgnp->rgn_id == rid); 15044 ASSERT(rgnp->rgn_refcnt > 0); 15045 15046 ism_hatid = (sfmmu_t *)rgnp->rgn_obj; 15047 ASSERT(ism_hatid->sfmmu_ismhat); 15048 ism_ment = &scdp->scd_ism_links[rid]; 15049 ASSERT(ism_ment->iment_hat == scdp->scd_sfmmup); 15050 ASSERT(ism_ment->iment_base_va == 15051 rgnp->rgn_saddr); 15052 ism_ment->iment_hat = NULL; 15053 ism_ment->iment_base_va = 0; 15054 mutex_enter(&ism_mlist_lock); 15055 iment_sub(ism_ment, ism_hatid); 15056 mutex_exit(&ism_mlist_lock); 15057 15058 } 15059 } 15060 } 15061 } 15062 /* 15063 * Allocates and initialises a new SCD structure. It is called with 15064 * the srd_scd_mutex held and returns with the reference count 15065 * initialised to 1. 15066 */ 15067 static sf_scd_t * 15068 sfmmu_alloc_scd(sf_srd_t *srdp, sf_region_map_t *new_map) 15069 { 15070 sf_scd_t *new_scdp; 15071 sfmmu_t *scsfmmup; 15072 int i; 15073 15074 ASSERT(MUTEX_HELD(&srdp->srd_scd_mutex)); 15075 new_scdp = kmem_cache_alloc(scd_cache, KM_SLEEP); 15076 15077 scsfmmup = kmem_cache_alloc(sfmmuid_cache, KM_SLEEP); 15078 new_scdp->scd_sfmmup = scsfmmup; 15079 scsfmmup->sfmmu_srdp = srdp; 15080 scsfmmup->sfmmu_scdp = new_scdp; 15081 scsfmmup->sfmmu_tsb0_4minflcnt = 0; 15082 scsfmmup->sfmmu_scdhat = 1; 15083 CPUSET_ALL(scsfmmup->sfmmu_cpusran); 15084 bzero(scsfmmup->sfmmu_hmeregion_links, SFMMU_L1_HMERLINKS_SIZE); 15085 15086 ASSERT(max_mmu_ctxdoms > 0); 15087 for (i = 0; i < max_mmu_ctxdoms; i++) { 15088 scsfmmup->sfmmu_ctxs[i].cnum = INVALID_CONTEXT; 15089 scsfmmup->sfmmu_ctxs[i].gnum = 0; 15090 } 15091 15092 for (i = 0; i < MMU_PAGE_SIZES; i++) { 15093 new_scdp->scd_rttecnt[i] = 0; 15094 } 15095 15096 new_scdp->scd_region_map = *new_map; 15097 new_scdp->scd_refcnt = 1; 15098 if (sfmmu_alloc_scd_tsbs(srdp, new_scdp) != TSB_SUCCESS) { 15099 kmem_cache_free(scd_cache, new_scdp); 15100 kmem_cache_free(sfmmuid_cache, scsfmmup); 15101 return (NULL); 15102 } 15103 if (&mmu_init_scd) { 15104 mmu_init_scd(new_scdp); 15105 } 15106 return (new_scdp); 15107 } 15108 15109 /* 15110 * The first phase of a process joining an SCD. The hat structure is 15111 * linked to the SCD queue and then the HAT_JOIN_SCD sfmmu flag is set 15112 * and a cross-call with context invalidation is used to cause the 15113 * remaining work to be carried out in the sfmmu_tsbmiss_exception() 15114 * routine.
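 * The remaining work is done by sfmmu_finish_join_scd() below: the
 * private TSBs are invalidated and HAT_CTX1_FLAG is set on the SCD's
 * ISM maps so that subsequent misses are satisfied using the shared
 * context.  Apart from the list manipulation and the context
 * invalidation, this routine moves the shared-region ttecnt
 * bookkeeping from the private hat into sfmmu_scdrttecnt and adjusts
 * the tsb0 inflation counts.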
15115 */ 15116 static void 15117 sfmmu_join_scd(sf_scd_t *scdp, sfmmu_t *sfmmup) 15118 { 15119 hatlock_t *hatlockp; 15120 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 15121 int i; 15122 sf_scd_t *old_scdp; 15123 15124 ASSERT(srdp != NULL); 15125 ASSERT(scdp != NULL); 15126 ASSERT(scdp->scd_refcnt > 0); 15127 ASSERT(AS_WRITE_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 15128 15129 if ((old_scdp = sfmmup->sfmmu_scdp) != NULL) { 15130 ASSERT(old_scdp != scdp); 15131 15132 mutex_enter(&old_scdp->scd_mutex); 15133 sfmmu_from_scd_list(&old_scdp->scd_sf_list, sfmmup); 15134 mutex_exit(&old_scdp->scd_mutex); 15135 /* 15136 * sfmmup leaves the old scd. Update sfmmu_ttecnt to 15137 * include the shme rgn ttecnt for rgns that 15138 * were in the old SCD 15139 */ 15140 for (i = 0; i < mmu_page_sizes; i++) { 15141 ASSERT(sfmmup->sfmmu_scdrttecnt[i] == 15142 old_scdp->scd_rttecnt[i]); 15143 atomic_add_long(&sfmmup->sfmmu_ttecnt[i], 15144 sfmmup->sfmmu_scdrttecnt[i]); 15145 } 15146 } 15147 15148 /* 15149 * Move sfmmu to the scd lists. 15150 */ 15151 mutex_enter(&scdp->scd_mutex); 15152 sfmmu_to_scd_list(&scdp->scd_sf_list, sfmmup); 15153 mutex_exit(&scdp->scd_mutex); 15154 SF_SCD_INCR_REF(scdp); 15155 15156 hatlockp = sfmmu_hat_enter(sfmmup); 15157 /* 15158 * For a multi-thread process, we must stop 15159 * all the other threads before joining the scd. 15160 */ 15161 15162 SFMMU_FLAGS_SET(sfmmup, HAT_JOIN_SCD); 15163 15164 sfmmu_invalidate_ctx(sfmmup); 15165 sfmmup->sfmmu_scdp = scdp; 15166 15167 /* 15168 * Copy scd_rttecnt into sfmmup's sfmmu_scdrttecnt, and update 15169 * sfmmu_ttecnt to not include the rgn ttecnt just joined in SCD. 15170 */ 15171 for (i = 0; i < mmu_page_sizes; i++) { 15172 sfmmup->sfmmu_scdrttecnt[i] = scdp->scd_rttecnt[i]; 15173 ASSERT(sfmmup->sfmmu_ttecnt[i] >= scdp->scd_rttecnt[i]); 15174 atomic_add_long(&sfmmup->sfmmu_ttecnt[i], 15175 -sfmmup->sfmmu_scdrttecnt[i]); 15176 } 15177 /* update tsb0 inflation count */ 15178 if (old_scdp != NULL) { 15179 sfmmup->sfmmu_tsb0_4minflcnt += 15180 old_scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt; 15181 } 15182 ASSERT(sfmmup->sfmmu_tsb0_4minflcnt >= 15183 scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt); 15184 sfmmup->sfmmu_tsb0_4minflcnt -= scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt; 15185 15186 sfmmu_hat_exit(hatlockp); 15187 15188 if (old_scdp != NULL) { 15189 SF_SCD_DECR_REF(srdp, old_scdp); 15190 } 15191 15192 } 15193 15194 /* 15195 * This routine is called by a process to become part of an SCD. It is called 15196 * from sfmmu_tsbmiss_exception() once most of the initial work has been 15197 * done by sfmmu_join_scd(). This routine must not drop the hat lock. 
15198 */ 15199 static void 15200 sfmmu_finish_join_scd(sfmmu_t *sfmmup) 15201 { 15202 struct tsb_info *tsbinfop; 15203 15204 ASSERT(sfmmu_hat_lock_held(sfmmup)); 15205 ASSERT(sfmmup->sfmmu_scdp != NULL); 15206 ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD)); 15207 ASSERT(!SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)); 15208 ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_ALLCTX_INVALID)); 15209 15210 for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL; 15211 tsbinfop = tsbinfop->tsb_next) { 15212 if (tsbinfop->tsb_flags & TSB_SWAPPED) { 15213 continue; 15214 } 15215 ASSERT(!(tsbinfop->tsb_flags & TSB_RELOC_FLAG)); 15216 15217 sfmmu_inv_tsb(tsbinfop->tsb_va, 15218 TSB_BYTES(tsbinfop->tsb_szc)); 15219 } 15220 15221 /* Set HAT_CTX1_FLAG for all SCD ISMs */ 15222 sfmmu_ism_hatflags(sfmmup, 1); 15223 15224 SFMMU_STAT(sf_join_scd); 15225 } 15226 15227 /* 15228 * This routine is called in order to check if there is an SCD which matches 15229 * the process's region map; if not, a new SCD may be created. 15230 */ 15231 static void 15232 sfmmu_find_scd(sfmmu_t *sfmmup) 15233 { 15234 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 15235 sf_scd_t *scdp, *new_scdp; 15236 int ret; 15237 15238 ASSERT(srdp != NULL); 15239 ASSERT(AS_WRITE_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 15240 15241 mutex_enter(&srdp->srd_scd_mutex); 15242 for (scdp = srdp->srd_scdp; scdp != NULL; 15243 scdp = scdp->scd_next) { 15244 SF_RGNMAP_EQUAL(&scdp->scd_region_map, 15245 &sfmmup->sfmmu_region_map, ret); 15246 if (ret == 1) { 15247 SF_SCD_INCR_REF(scdp); 15248 mutex_exit(&srdp->srd_scd_mutex); 15249 sfmmu_join_scd(scdp, sfmmup); 15250 ASSERT(scdp->scd_refcnt >= 2); 15251 atomic_add_32((volatile uint32_t *) 15252 &scdp->scd_refcnt, -1); 15253 return; 15254 } else { 15255 /* 15256 * If the sfmmu region map is a subset of the scd 15257 * region map, then the assumption is that this process 15258 * will continue attaching to ISM segments until the 15259 * region maps are equal. 15260 */ 15261 SF_RGNMAP_IS_SUBSET(&scdp->scd_region_map, 15262 &sfmmup->sfmmu_region_map, ret); 15263 if (ret == 1) { 15264 mutex_exit(&srdp->srd_scd_mutex); 15265 return; 15266 } 15267 } 15268 } 15269 15270 ASSERT(scdp == NULL); 15271 /* 15272 * No matching SCD has been found; create a new one. 15273 */ 15274 if ((new_scdp = sfmmu_alloc_scd(srdp, &sfmmup->sfmmu_region_map)) == 15275 NULL) { 15276 mutex_exit(&srdp->srd_scd_mutex); 15277 return; 15278 } 15279 15280 /* 15281 * sfmmu_alloc_scd() returns with a ref count of 1 on the scd. 15282 */ 15283 15284 /* Set scd_rttecnt for shme rgns in SCD */ 15285 sfmmu_set_scd_rttecnt(srdp, new_scdp); 15286 15287 /* 15288 * Link scd onto srd_scdp list and scd sfmmu onto region/iment lists. 15289 */ 15290 sfmmu_link_scd_to_regions(srdp, new_scdp); 15291 sfmmu_add_scd(&srdp->srd_scdp, new_scdp); 15292 SFMMU_STAT_ADD(sf_create_scd, 1); 15293 15294 mutex_exit(&srdp->srd_scd_mutex); 15295 sfmmu_join_scd(new_scdp, sfmmup); 15296 ASSERT(new_scdp->scd_refcnt >= 2); 15297 atomic_add_32((volatile uint32_t *)&new_scdp->scd_refcnt, -1); 15298 } 15299 15300 /* 15301 * This routine is called by a process to remove itself from an SCD. It is 15302 * called either when the process has detached from a segment or from 15303 * hat_free_start() as a result of calling exit.
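 *
 * Two cases are handled: on the free/exit path (sfmmu_free is set)
 * only the scd_sf_list unlinking and the ttecnt give-back are needed,
 * since the context is being torn down anyway; otherwise the context
 * is invalidated, the ISM HAT_CTX1_FLAGs are cleared and the
 * shared-region counts are folded back into the private hat before
 * the SCD reference is dropped.  The hat lock is re-acquired before
 * returning because callers expect to still hold it.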
15304 */ 15305 static void 15306 sfmmu_leave_scd(sfmmu_t *sfmmup, uchar_t r_type) 15307 { 15308 sf_scd_t *scdp = sfmmup->sfmmu_scdp; 15309 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 15310 hatlock_t *hatlockp = TSB_HASH(sfmmup); 15311 int i; 15312 15313 ASSERT(scdp != NULL); 15314 ASSERT(srdp != NULL); 15315 15316 if (sfmmup->sfmmu_free) { 15317 /* 15318 * If the process is part of an SCD the sfmmu is unlinked 15319 * from scd_sf_list. 15320 */ 15321 mutex_enter(&scdp->scd_mutex); 15322 sfmmu_from_scd_list(&scdp->scd_sf_list, sfmmup); 15323 mutex_exit(&scdp->scd_mutex); 15324 /* 15325 * Update sfmmu_ttecnt to include the rgn ttecnt for rgns that 15326 * are about to leave the SCD 15327 */ 15328 for (i = 0; i < mmu_page_sizes; i++) { 15329 ASSERT(sfmmup->sfmmu_scdrttecnt[i] == 15330 scdp->scd_rttecnt[i]); 15331 atomic_add_long(&sfmmup->sfmmu_ttecnt[i], 15332 sfmmup->sfmmu_scdrttecnt[i]); 15333 sfmmup->sfmmu_scdrttecnt[i] = 0; 15334 } 15335 sfmmup->sfmmu_scdp = NULL; 15336 15337 SF_SCD_DECR_REF(srdp, scdp); 15338 return; 15339 } 15340 15341 ASSERT(r_type != SFMMU_REGION_ISM || 15342 SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)); 15343 ASSERT(scdp->scd_refcnt); 15344 ASSERT(!sfmmup->sfmmu_free); 15345 ASSERT(sfmmu_hat_lock_held(sfmmup)); 15346 ASSERT(AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 15347 15348 /* 15349 * Wait for ISM maps to be updated. 15350 */ 15351 if (r_type != SFMMU_REGION_ISM) { 15352 while (SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY) && 15353 sfmmup->sfmmu_scdp != NULL) { 15354 cv_wait(&sfmmup->sfmmu_tsb_cv, 15355 HATLOCK_MUTEXP(hatlockp)); 15356 } 15357 15358 if (sfmmup->sfmmu_scdp == NULL) { 15359 sfmmu_hat_exit(hatlockp); 15360 return; 15361 } 15362 SFMMU_FLAGS_SET(sfmmup, HAT_ISMBUSY); 15363 } 15364 15365 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD)) { 15366 SFMMU_FLAGS_CLEAR(sfmmup, HAT_JOIN_SCD); 15367 /* 15368 * Since HAT_JOIN_SCD was set our context 15369 * is still invalid. 15370 */ 15371 } else { 15372 /* 15373 * For a multi-thread process, we must stop 15374 * all the other threads before leaving the scd. 15375 */ 15376 15377 sfmmu_invalidate_ctx(sfmmup); 15378 } 15379 15380 /* Clear all the rid's for ISM, delete flags, etc */ 15381 ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)); 15382 sfmmu_ism_hatflags(sfmmup, 0); 15383 15384 /* 15385 * Update sfmmu_ttecnt to include the rgn ttecnt for rgns that 15386 * are in SCD before this sfmmup leaves the SCD. 15387 */ 15388 for (i = 0; i < mmu_page_sizes; i++) { 15389 ASSERT(sfmmup->sfmmu_scdrttecnt[i] == 15390 scdp->scd_rttecnt[i]); 15391 atomic_add_long(&sfmmup->sfmmu_ttecnt[i], 15392 sfmmup->sfmmu_scdrttecnt[i]); 15393 sfmmup->sfmmu_scdrttecnt[i] = 0; 15394 /* update ismttecnt to include SCD ism before hat leaves SCD */ 15395 sfmmup->sfmmu_ismttecnt[i] += sfmmup->sfmmu_scdismttecnt[i]; 15396 sfmmup->sfmmu_scdismttecnt[i] = 0; 15397 } 15398 /* update tsb0 inflation count */ 15399 sfmmup->sfmmu_tsb0_4minflcnt += scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt; 15400 15401 if (r_type != SFMMU_REGION_ISM) { 15402 SFMMU_FLAGS_CLEAR(sfmmup, HAT_ISMBUSY); 15403 } 15404 sfmmup->sfmmu_scdp = NULL; 15405 15406 sfmmu_hat_exit(hatlockp); 15407 15408 /* 15409 * Unlink sfmmu from scd_sf_list this can be done without holding 15410 * the hat lock as we hold the sfmmu_as lock which prevents 15411 * hat_join_region from adding this thread to the scd again. Other 15412 * threads check if sfmmu_scdp is NULL under hat lock and if it's NULL 15413 * they won't get here, since sfmmu_leave_scd() clears sfmmu_scdp 15414 * while holding the hat lock. 
15415 */ 15416 mutex_enter(&scdp->scd_mutex); 15417 sfmmu_from_scd_list(&scdp->scd_sf_list, sfmmup); 15418 mutex_exit(&scdp->scd_mutex); 15419 SFMMU_STAT(sf_leave_scd); 15420 15421 SF_SCD_DECR_REF(srdp, scdp); 15422 hatlockp = sfmmu_hat_enter(sfmmup); 15423 15424 } 15425 15426 /* 15427 * Unlink and free up an SCD structure with a reference count of 0. 15428 */ 15429 static void 15430 sfmmu_destroy_scd(sf_srd_t *srdp, sf_scd_t *scdp, sf_region_map_t *scd_rmap) 15431 { 15432 sfmmu_t *scsfmmup; 15433 sf_scd_t *sp; 15434 hatlock_t *shatlockp; 15435 int i, ret; 15436 15437 mutex_enter(&srdp->srd_scd_mutex); 15438 for (sp = srdp->srd_scdp; sp != NULL; sp = sp->scd_next) { 15439 if (sp == scdp) 15440 break; 15441 } 15442 if (sp == NULL || sp->scd_refcnt) { 15443 mutex_exit(&srdp->srd_scd_mutex); 15444 return; 15445 } 15446 15447 /* 15448 * It is possible that the scd has been freed and reallocated with a 15449 * different region map while we've been waiting for the srd_scd_mutex. 15450 */ 15451 SF_RGNMAP_EQUAL(scd_rmap, &sp->scd_region_map, ret); 15452 if (ret != 1) { 15453 mutex_exit(&srdp->srd_scd_mutex); 15454 return; 15455 } 15456 15457 ASSERT(scdp->scd_sf_list == NULL); 15458 /* 15459 * Unlink scd from srd_scdp list. 15460 */ 15461 sfmmu_remove_scd(&srdp->srd_scdp, scdp); 15462 mutex_exit(&srdp->srd_scd_mutex); 15463 15464 sfmmu_unlink_scd_from_regions(srdp, scdp); 15465 15466 /* Clear shared context tsb and release ctx */ 15467 scsfmmup = scdp->scd_sfmmup; 15468 15469 /* 15470 * create a barrier so that scd will not be destroyed 15471 * if other thread still holds the same shared hat lock. 15472 * E.g., sfmmu_tsbmiss_exception() needs to acquire the 15473 * shared hat lock before checking the shared tsb reloc flag. 15474 */ 15475 shatlockp = sfmmu_hat_enter(scsfmmup); 15476 sfmmu_hat_exit(shatlockp); 15477 15478 sfmmu_free_scd_tsbs(scsfmmup); 15479 15480 for (i = 0; i < SFMMU_L1_HMERLINKS; i++) { 15481 if (scsfmmup->sfmmu_hmeregion_links[i] != NULL) { 15482 kmem_free(scsfmmup->sfmmu_hmeregion_links[i], 15483 SFMMU_L2_HMERLINKS_SIZE); 15484 scsfmmup->sfmmu_hmeregion_links[i] = NULL; 15485 } 15486 } 15487 kmem_cache_free(sfmmuid_cache, scsfmmup); 15488 kmem_cache_free(scd_cache, scdp); 15489 SFMMU_STAT(sf_destroy_scd); 15490 } 15491 15492 /* 15493 * Modifies the HAT_CTX1_FLAG for each of the ISM segments which correspond to 15494 * bits which are set in the ism_region_map parameter. This flag indicates to 15495 * the tsbmiss handler that mapping for these segments should be loaded using 15496 * the shared context. 15497 */ 15498 static void 15499 sfmmu_ism_hatflags(sfmmu_t *sfmmup, int addflag) 15500 { 15501 sf_scd_t *scdp = sfmmup->sfmmu_scdp; 15502 ism_blk_t *ism_blkp; 15503 ism_map_t *ism_map; 15504 int i, rid; 15505 15506 ASSERT(sfmmup->sfmmu_iblk != NULL); 15507 ASSERT(scdp != NULL); 15508 /* 15509 * Note that the caller either set HAT_ISMBUSY flag or checked 15510 * under hat lock that HAT_ISMBUSY was not set by another thread. 
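 * With addflag set, HAT_CTX1_FLAG is turned on for every ISM map
 * whose region id appears in the SCD's scd_ismregion_map; with
 * addflag clear the flag is removed from every valid map, which is
 * how sfmmu_leave_scd() reverts the process to private-context ISM
 * lookups.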
15511 */ 15512 ASSERT(sfmmu_hat_lock_held(sfmmup)); 15513 15514 ism_blkp = sfmmup->sfmmu_iblk; 15515 while (ism_blkp != NULL) { 15516 ism_map = ism_blkp->iblk_maps; 15517 for (i = 0; i < ISM_MAP_SLOTS && ism_map[i].imap_ismhat; i++) { 15518 rid = ism_map[i].imap_rid; 15519 if (rid == SFMMU_INVALID_ISMRID) { 15520 continue; 15521 } 15522 ASSERT(rid >= 0 && rid < SFMMU_MAX_ISM_REGIONS); 15523 if (SF_RGNMAP_TEST(scdp->scd_ismregion_map, rid) && 15524 addflag) { 15525 ism_map[i].imap_hatflags |= 15526 HAT_CTX1_FLAG; 15527 } else { 15528 ism_map[i].imap_hatflags &= 15529 ~HAT_CTX1_FLAG; 15530 } 15531 } 15532 ism_blkp = ism_blkp->iblk_next; 15533 } 15534 } 15535 15536 static int 15537 sfmmu_srd_lock_held(sf_srd_t *srdp) 15538 { 15539 return (MUTEX_HELD(&srdp->srd_mutex)); 15540 } 15541 15542 /* ARGSUSED */ 15543 static int 15544 sfmmu_scdcache_constructor(void *buf, void *cdrarg, int kmflags) 15545 { 15546 sf_scd_t *scdp = (sf_scd_t *)buf; 15547 15548 bzero(buf, sizeof (sf_scd_t)); 15549 mutex_init(&scdp->scd_mutex, NULL, MUTEX_DEFAULT, NULL); 15550 return (0); 15551 } 15552 15553 /* ARGSUSED */ 15554 static void 15555 sfmmu_scdcache_destructor(void *buf, void *cdrarg) 15556 { 15557 sf_scd_t *scdp = (sf_scd_t *)buf; 15558 15559 mutex_destroy(&scdp->scd_mutex); 15560 } 15561
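/*
 * A minimal, self-contained sketch of the region-id bitmap walk used by
 * sfmmu_link_scd_to_regions() and sfmmu_unlink_scd_from_regions() above.
 * It assumes 64-bit bitmap words (BT_ULSHIFT == 6 on sparcv9); the
 * SKETCH_* constants and the callback are stand-ins rather than the real
 * kernel definitions, and the block is kept under #if 0 so it plays no
 * part in the kernel build.
 */
#if 0	/* illustrative only -- build as a standalone user program */
#include <stdint.h>
#include <stdio.h>

#define	SKETCH_BT_ULSHIFT	6	/* log2(bits per 64-bit bitmap word) */
#define	SKETCH_MAX_HME_REGIONS	64	/* stand-in for SFMMU_MAX_HME_REGIONS */

static void
walk_region_map(const uint64_t *bitmap, int nwords,
    void (*cb)(unsigned int rid, int is_ism))
{
	int i;
	unsigned int j;
	unsigned int rid;
	uint64_t w;

	for (i = 0; i < nwords; i++) {
		if ((w = bitmap[i]) == 0)
			continue;
		j = 0;
		while (w) {
			if (!(w & 0x1)) {
				j++;
				w >>= 1;
				continue;
			}
			rid = ((unsigned int)i << SKETCH_BT_ULSHIFT) | j;
			j++;
			w >>= 1;
			if (rid < SKETCH_MAX_HME_REGIONS)
				cb(rid, 0);				/* hme region */
			else
				cb(rid - SKETCH_MAX_HME_REGIONS, 1);	/* ism region */
		}
	}
}

static void
print_rid(unsigned int rid, int is_ism)
{
	(void) printf("%s region %u\n", is_ism ? "ism" : "hme", rid);
}

int
main(void)
{
	uint64_t map[2] = { 0x11, 0x2 };	/* hme rids 0 and 4, ism rid 1 */

	walk_region_map(map, 2, print_rid);
	return (0);
}
#endif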