1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * VM - Hardware Address Translation management for Spitfire MMU. 28 * 29 * This file implements the machine specific hardware translation 30 * needed by the VM system. The machine independent interface is 31 * described in <vm/hat.h> while the machine dependent interface 32 * and data structures are described in <vm/hat_sfmmu.h>. 33 * 34 * The hat layer manages the address translation hardware as a cache 35 * driven by calls from the higher levels in the VM system. 
36 */ 37 38 #include <sys/types.h> 39 #include <sys/kstat.h> 40 #include <vm/hat.h> 41 #include <vm/hat_sfmmu.h> 42 #include <vm/page.h> 43 #include <sys/pte.h> 44 #include <sys/systm.h> 45 #include <sys/mman.h> 46 #include <sys/sysmacros.h> 47 #include <sys/machparam.h> 48 #include <sys/vtrace.h> 49 #include <sys/kmem.h> 50 #include <sys/mmu.h> 51 #include <sys/cmn_err.h> 52 #include <sys/cpu.h> 53 #include <sys/cpuvar.h> 54 #include <sys/debug.h> 55 #include <sys/lgrp.h> 56 #include <sys/archsystm.h> 57 #include <sys/machsystm.h> 58 #include <sys/vmsystm.h> 59 #include <vm/as.h> 60 #include <vm/seg.h> 61 #include <vm/seg_kp.h> 62 #include <vm/seg_kmem.h> 63 #include <vm/seg_kpm.h> 64 #include <vm/rm.h> 65 #include <sys/t_lock.h> 66 #include <sys/obpdefs.h> 67 #include <sys/vm_machparam.h> 68 #include <sys/var.h> 69 #include <sys/trap.h> 70 #include <sys/machtrap.h> 71 #include <sys/scb.h> 72 #include <sys/bitmap.h> 73 #include <sys/machlock.h> 74 #include <sys/membar.h> 75 #include <sys/atomic.h> 76 #include <sys/cpu_module.h> 77 #include <sys/prom_debug.h> 78 #include <sys/ksynch.h> 79 #include <sys/mem_config.h> 80 #include <sys/mem_cage.h> 81 #include <vm/vm_dep.h> 82 #include <vm/xhat_sfmmu.h> 83 #include <sys/fpu/fpusystm.h> 84 #include <vm/mach_kpm.h> 85 #include <sys/callb.h> 86 87 #ifdef DEBUG 88 #define SFMMU_VALIDATE_HMERID(hat, rid, saddr, len) \ 89 if (SFMMU_IS_SHMERID_VALID(rid)) { \ 90 caddr_t _eaddr = (saddr) + (len); \ 91 sf_srd_t *_srdp; \ 92 sf_region_t *_rgnp; \ 93 ASSERT((rid) < SFMMU_MAX_HME_REGIONS); \ 94 ASSERT(SF_RGNMAP_TEST(hat->sfmmu_hmeregion_map, rid)); \ 95 ASSERT((hat) != ksfmmup); \ 96 _srdp = (hat)->sfmmu_srdp; \ 97 ASSERT(_srdp != NULL); \ 98 ASSERT(_srdp->srd_refcnt != 0); \ 99 _rgnp = _srdp->srd_hmergnp[(rid)]; \ 100 ASSERT(_rgnp != NULL && _rgnp->rgn_id == rid); \ 101 ASSERT(_rgnp->rgn_refcnt != 0); \ 102 ASSERT(!(_rgnp->rgn_flags & SFMMU_REGION_FREE)); \ 103 ASSERT((_rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == \ 104 
SFMMU_REGION_HME); \ 105 ASSERT((saddr) >= _rgnp->rgn_saddr); \ 106 ASSERT((saddr) < _rgnp->rgn_saddr + _rgnp->rgn_size); \ 107 ASSERT(_eaddr > _rgnp->rgn_saddr); \ 108 ASSERT(_eaddr <= _rgnp->rgn_saddr + _rgnp->rgn_size); \ 109 } 110 111 #define SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid) \ 112 { \ 113 caddr_t _hsva; \ 114 caddr_t _heva; \ 115 caddr_t _rsva; \ 116 caddr_t _reva; \ 117 int _ttesz = get_hblk_ttesz(hmeblkp); \ 118 int _flagtte; \ 119 ASSERT((srdp)->srd_refcnt != 0); \ 120 ASSERT((rid) < SFMMU_MAX_HME_REGIONS); \ 121 ASSERT((rgnp)->rgn_id == rid); \ 122 ASSERT(!((rgnp)->rgn_flags & SFMMU_REGION_FREE)); \ 123 ASSERT(((rgnp)->rgn_flags & SFMMU_REGION_TYPE_MASK) == \ 124 SFMMU_REGION_HME); \ 125 ASSERT(_ttesz <= (rgnp)->rgn_pgszc); \ 126 _hsva = (caddr_t)get_hblk_base(hmeblkp); \ 127 _heva = get_hblk_endaddr(hmeblkp); \ 128 _rsva = (caddr_t)P2ALIGN( \ 129 (uintptr_t)(rgnp)->rgn_saddr, HBLK_MIN_BYTES); \ 130 _reva = (caddr_t)P2ROUNDUP( \ 131 (uintptr_t)((rgnp)->rgn_saddr + (rgnp)->rgn_size), \ 132 HBLK_MIN_BYTES); \ 133 ASSERT(_hsva >= _rsva); \ 134 ASSERT(_hsva < _reva); \ 135 ASSERT(_heva > _rsva); \ 136 ASSERT(_heva <= _reva); \ 137 _flagtte = (_ttesz < HBLK_MIN_TTESZ) ? 
HBLK_MIN_TTESZ : \ 138 _ttesz; \ 139 ASSERT(rgnp->rgn_hmeflags & (0x1 << _flagtte)); \ 140 } 141 142 #else /* DEBUG */ 143 #define SFMMU_VALIDATE_HMERID(hat, rid, addr, len) 144 #define SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid) 145 #endif /* DEBUG */ 146 147 #if defined(SF_ERRATA_57) 148 extern caddr_t errata57_limit; 149 #endif 150 151 #define HME8BLK_SZ_RND ((roundup(HME8BLK_SZ, sizeof (int64_t))) / \ 152 (sizeof (int64_t))) 153 #define HBLK_RESERVE ((struct hme_blk *)hblk_reserve) 154 155 #define HBLK_RESERVE_CNT 128 156 #define HBLK_RESERVE_MIN 20 157 158 static struct hme_blk *freehblkp; 159 static kmutex_t freehblkp_lock; 160 static int freehblkcnt; 161 162 static int64_t hblk_reserve[HME8BLK_SZ_RND]; 163 static kmutex_t hblk_reserve_lock; 164 static kthread_t *hblk_reserve_thread; 165 166 static nucleus_hblk8_info_t nucleus_hblk8; 167 static nucleus_hblk1_info_t nucleus_hblk1; 168 169 /* 170 * Data to manage per-cpu hmeblk pending queues, hmeblks are queued here 171 * after the initial phase of removing an hmeblk from the hash chain, see 172 * the detailed comment in sfmmu_hblk_hash_rm() for further details. 173 */ 174 static cpu_hme_pend_t *cpu_hme_pend; 175 static uint_t cpu_hme_pend_thresh; 176 /* 177 * SFMMU specific hat functions 178 */ 179 void hat_pagecachectl(struct page *, int); 180 181 /* flags for hat_pagecachectl */ 182 #define HAT_CACHE 0x1 183 #define HAT_UNCACHE 0x2 184 #define HAT_TMPNC 0x4 185 186 /* 187 * Flag to allow the creation of non-cacheable translations 188 * to system memory. It is off by default. At the moment this 189 * flag is used by the ecache error injector. The error injector 190 * will turn it on when creating such a translation then shut it 191 * off when it's finished. 192 */ 193 194 int sfmmu_allow_nc_trans = 0; 195 196 /* 197 * Flag to disable large page support. 198 * value of 1 => disable all large pages. 199 * bits 1, 2, and 3 are to disable 64K, 512K and 4M pages respectively. 
 *
 * For example, use the value 0x4 to disable 512K pages.
 *
 */
#define	LARGE_PAGES_OFF		0x1

/*
 * The disable_large_pages and disable_ism_large_pages variables control
 * hat_memload_array and the page sizes to be used by ISM and the kernel.
 *
 * The disable_auto_data_large_pages and disable_auto_text_large_pages variables
 * are only used to control which OOB pages to use at upper VM segment creation
 * time, and are set in hat_init_pagesizes and used in the map_pgsz* routines.
 * Their values may come from platform or CPU specific code to disable page
 * sizes that should not be used.
 *
 * WARNING: 512K pages are currently not supported for ISM/DISM.
 */
uint_t	disable_large_pages = 0;
/* 512K is masked off for ISM by default, per the WARNING above. */
uint_t	disable_ism_large_pages = (1 << TTE512K);
uint_t	disable_auto_data_large_pages = 0;
uint_t	disable_auto_text_large_pages = 0;

/*
 * Private sfmmu data structures for hat management
 */
static struct kmem_cache *sfmmuid_cache;
static struct kmem_cache *mmuctxdom_cache;

/*
 * Private sfmmu data structures for tsb management
 */
static struct kmem_cache *sfmmu_tsbinfo_cache;
static struct kmem_cache *sfmmu_tsb8k_cache;
/* One TSB cache per lgroup slot; NLGRPS_MAX bounds the array. */
static struct kmem_cache *sfmmu_tsb_cache[NLGRPS_MAX];
static vmem_t *kmem_bigtsb_arena;
static vmem_t *kmem_tsb_arena;

/*
 * sfmmu static variables for hmeblk resource management.
 */
static vmem_t *hat_memload1_arena; /* HAT translation arena for sfmmu1_cache */
static struct kmem_cache *sfmmu8_cache;
static struct kmem_cache *sfmmu1_cache;
static struct kmem_cache *pa_hment_cache;

static kmutex_t		ism_mlist_lock;	/* mutex for ism mapping list */
/*
 * Private data for ISM
 */
static struct kmem_cache *ism_blk_cache;
static struct kmem_cache *ism_ment_cache;
#define	ISMID_STARTADDR	NULL

/*
 * Region management data structures and function declarations.
256 */ 257 258 static void sfmmu_leave_srd(sfmmu_t *); 259 static int sfmmu_srdcache_constructor(void *, void *, int); 260 static void sfmmu_srdcache_destructor(void *, void *); 261 static int sfmmu_rgncache_constructor(void *, void *, int); 262 static void sfmmu_rgncache_destructor(void *, void *); 263 static int sfrgnmap_isnull(sf_region_map_t *); 264 static int sfhmergnmap_isnull(sf_hmeregion_map_t *); 265 static int sfmmu_scdcache_constructor(void *, void *, int); 266 static void sfmmu_scdcache_destructor(void *, void *); 267 static void sfmmu_rgn_cb_noop(caddr_t, caddr_t, caddr_t, 268 size_t, void *, u_offset_t); 269 270 static uint_t srd_hashmask = SFMMU_MAX_SRD_BUCKETS - 1; 271 static sf_srd_bucket_t *srd_buckets; 272 static struct kmem_cache *srd_cache; 273 static uint_t srd_rgn_hashmask = SFMMU_MAX_REGION_BUCKETS - 1; 274 static struct kmem_cache *region_cache; 275 static struct kmem_cache *scd_cache; 276 277 #ifdef sun4v 278 int use_bigtsb_arena = 1; 279 #else 280 int use_bigtsb_arena = 0; 281 #endif 282 283 /* External /etc/system tunable, for turning on&off the shctx support */ 284 int disable_shctx = 0; 285 /* Internal variable, set by MD if the HW supports shctx feature */ 286 int shctx_on = 0; 287 288 #ifdef DEBUG 289 static void check_scd_sfmmu_list(sfmmu_t **, sfmmu_t *, int); 290 #endif 291 static void sfmmu_to_scd_list(sfmmu_t **, sfmmu_t *); 292 static void sfmmu_from_scd_list(sfmmu_t **, sfmmu_t *); 293 294 static sf_scd_t *sfmmu_alloc_scd(sf_srd_t *, sf_region_map_t *); 295 static void sfmmu_find_scd(sfmmu_t *); 296 static void sfmmu_join_scd(sf_scd_t *, sfmmu_t *); 297 static void sfmmu_finish_join_scd(sfmmu_t *); 298 static void sfmmu_leave_scd(sfmmu_t *, uchar_t); 299 static void sfmmu_destroy_scd(sf_srd_t *, sf_scd_t *, sf_region_map_t *); 300 static int sfmmu_alloc_scd_tsbs(sf_srd_t *, sf_scd_t *); 301 static void sfmmu_free_scd_tsbs(sfmmu_t *); 302 static void sfmmu_tsb_inv_ctx(sfmmu_t *); 303 static int find_ism_rid(sfmmu_t *, sfmmu_t 
*, caddr_t, uint_t *); 304 static void sfmmu_ism_hatflags(sfmmu_t *, int); 305 static int sfmmu_srd_lock_held(sf_srd_t *); 306 static void sfmmu_remove_scd(sf_scd_t **, sf_scd_t *); 307 static void sfmmu_add_scd(sf_scd_t **headp, sf_scd_t *); 308 static void sfmmu_link_scd_to_regions(sf_srd_t *, sf_scd_t *); 309 static void sfmmu_unlink_scd_from_regions(sf_srd_t *, sf_scd_t *); 310 static void sfmmu_link_to_hmeregion(sfmmu_t *, sf_region_t *); 311 static void sfmmu_unlink_from_hmeregion(sfmmu_t *, sf_region_t *); 312 313 /* 314 * ``hat_lock'' is a hashed mutex lock for protecting sfmmu TSB lists, 315 * HAT flags, synchronizing TLB/TSB coherency, and context management. 316 * The lock is hashed on the sfmmup since the case where we need to lock 317 * all processes is rare but does occur (e.g. we need to unload a shared 318 * mapping from all processes using the mapping). We have a lot of buckets, 319 * and each slab of sfmmu_t's can use about a quarter of them, giving us 320 * a fairly good distribution without wasting too much space and overhead 321 * when we have to grab them all. 322 */ 323 #define SFMMU_NUM_LOCK 128 /* must be power of two */ 324 hatlock_t hat_lock[SFMMU_NUM_LOCK]; 325 326 /* 327 * Hash algorithm optimized for a small number of slabs. 328 * 7 is (highbit((sizeof sfmmu_t)) - 1) 329 * This hash algorithm is based upon the knowledge that sfmmu_t's come from a 330 * kmem_cache, and thus they will be sequential within that cache. In 331 * addition, each new slab will have a different "color" up to cache_maxcolor 332 * which will skew the hashing for each successive slab which is allocated. 333 * If the size of sfmmu_t changed to a larger size, this algorithm may need 334 * to be revisited. 335 */ 336 #define TSB_HASH_SHIFT_BITS (7) 337 #define PTR_HASH(x) ((uintptr_t)x >> TSB_HASH_SHIFT_BITS) 338 339 #ifdef DEBUG 340 int tsb_hash_debug = 0; 341 #define TSB_HASH(sfmmup) \ 342 (tsb_hash_debug ? 
&hat_lock[0] : \ 343 &hat_lock[PTR_HASH(sfmmup) & (SFMMU_NUM_LOCK-1)]) 344 #else /* DEBUG */ 345 #define TSB_HASH(sfmmup) &hat_lock[PTR_HASH(sfmmup) & (SFMMU_NUM_LOCK-1)] 346 #endif /* DEBUG */ 347 348 349 /* sfmmu_replace_tsb() return codes. */ 350 typedef enum tsb_replace_rc { 351 TSB_SUCCESS, 352 TSB_ALLOCFAIL, 353 TSB_LOSTRACE, 354 TSB_ALREADY_SWAPPED, 355 TSB_CANTGROW 356 } tsb_replace_rc_t; 357 358 /* 359 * Flags for TSB allocation routines. 360 */ 361 #define TSB_ALLOC 0x01 362 #define TSB_FORCEALLOC 0x02 363 #define TSB_GROW 0x04 364 #define TSB_SHRINK 0x08 365 #define TSB_SWAPIN 0x10 366 367 /* 368 * Support for HAT callbacks. 369 */ 370 #define SFMMU_MAX_RELOC_CALLBACKS 10 371 int sfmmu_max_cb_id = SFMMU_MAX_RELOC_CALLBACKS; 372 static id_t sfmmu_cb_nextid = 0; 373 static id_t sfmmu_tsb_cb_id; 374 struct sfmmu_callback *sfmmu_cb_table; 375 376 /* 377 * Kernel page relocation is enabled by default for non-caged 378 * kernel pages. This has little effect unless segkmem_reloc is 379 * set, since by default kernel memory comes from inside the 380 * kernel cage. 381 */ 382 int hat_kpr_enabled = 1; 383 384 kmutex_t kpr_mutex; 385 kmutex_t kpr_suspendlock; 386 kthread_t *kreloc_thread; 387 388 /* 389 * Enable VA->PA translation sanity checking on DEBUG kernels. 390 * Disabled by default. This is incompatible with some 391 * drivers (error injector, RSM) so if it breaks you get 392 * to keep both pieces. 
393 */ 394 int hat_check_vtop = 0; 395 396 /* 397 * Private sfmmu routines (prototypes) 398 */ 399 static struct hme_blk *sfmmu_shadow_hcreate(sfmmu_t *, caddr_t, int, uint_t); 400 static struct hme_blk *sfmmu_hblk_alloc(sfmmu_t *, caddr_t, 401 struct hmehash_bucket *, uint_t, hmeblk_tag, uint_t, 402 uint_t); 403 static caddr_t sfmmu_hblk_unload(struct hat *, struct hme_blk *, caddr_t, 404 caddr_t, demap_range_t *, uint_t); 405 static caddr_t sfmmu_hblk_sync(struct hat *, struct hme_blk *, caddr_t, 406 caddr_t, int); 407 static void sfmmu_hblk_free(struct hme_blk **); 408 static void sfmmu_hblks_list_purge(struct hme_blk **, int); 409 static uint_t sfmmu_get_free_hblk(struct hme_blk **, uint_t); 410 static uint_t sfmmu_put_free_hblk(struct hme_blk *, uint_t); 411 static struct hme_blk *sfmmu_hblk_steal(int); 412 static int sfmmu_steal_this_hblk(struct hmehash_bucket *, 413 struct hme_blk *, uint64_t, struct hme_blk *); 414 static caddr_t sfmmu_hblk_unlock(struct hme_blk *, caddr_t, caddr_t); 415 416 static void hat_do_memload_array(struct hat *, caddr_t, size_t, 417 struct page **, uint_t, uint_t, uint_t); 418 static void hat_do_memload(struct hat *, caddr_t, struct page *, 419 uint_t, uint_t, uint_t); 420 static void sfmmu_memload_batchsmall(struct hat *, caddr_t, page_t **, 421 uint_t, uint_t, pgcnt_t, uint_t); 422 void sfmmu_tteload(struct hat *, tte_t *, caddr_t, page_t *, 423 uint_t); 424 static int sfmmu_tteload_array(sfmmu_t *, tte_t *, caddr_t, page_t **, 425 uint_t, uint_t); 426 static struct hmehash_bucket *sfmmu_tteload_acquire_hashbucket(sfmmu_t *, 427 caddr_t, int, uint_t); 428 static struct hme_blk *sfmmu_tteload_find_hmeblk(sfmmu_t *, 429 struct hmehash_bucket *, caddr_t, uint_t, uint_t, 430 uint_t); 431 static int sfmmu_tteload_addentry(sfmmu_t *, struct hme_blk *, tte_t *, 432 caddr_t, page_t **, uint_t, uint_t); 433 static void sfmmu_tteload_release_hashbucket(struct hmehash_bucket *); 434 435 static int sfmmu_pagearray_setup(caddr_t, page_t **, 
tte_t *, int); 436 static pfn_t sfmmu_uvatopfn(caddr_t, sfmmu_t *, tte_t *); 437 void sfmmu_memtte(tte_t *, pfn_t, uint_t, int); 438 #ifdef VAC 439 static void sfmmu_vac_conflict(struct hat *, caddr_t, page_t *); 440 static int sfmmu_vacconflict_array(caddr_t, page_t *, int *); 441 int tst_tnc(page_t *pp, pgcnt_t); 442 void conv_tnc(page_t *pp, int); 443 #endif 444 445 static void sfmmu_get_ctx(sfmmu_t *); 446 static void sfmmu_free_sfmmu(sfmmu_t *); 447 448 static void sfmmu_ttesync(struct hat *, caddr_t, tte_t *, page_t *); 449 static void sfmmu_chgattr(struct hat *, caddr_t, size_t, uint_t, int); 450 451 cpuset_t sfmmu_pageunload(page_t *, struct sf_hment *, int); 452 static void hat_pagereload(struct page *, struct page *); 453 static cpuset_t sfmmu_pagesync(page_t *, struct sf_hment *, uint_t); 454 #ifdef VAC 455 void sfmmu_page_cache_array(page_t *, int, int, pgcnt_t); 456 static void sfmmu_page_cache(page_t *, int, int, int); 457 #endif 458 459 cpuset_t sfmmu_rgntlb_demap(caddr_t, sf_region_t *, 460 struct hme_blk *, int); 461 static void sfmmu_tlbcache_demap(caddr_t, sfmmu_t *, struct hme_blk *, 462 pfn_t, int, int, int, int); 463 static void sfmmu_ismtlbcache_demap(caddr_t, sfmmu_t *, struct hme_blk *, 464 pfn_t, int); 465 static void sfmmu_tlb_demap(caddr_t, sfmmu_t *, struct hme_blk *, int, int); 466 static void sfmmu_tlb_range_demap(demap_range_t *); 467 static void sfmmu_invalidate_ctx(sfmmu_t *); 468 static void sfmmu_sync_mmustate(sfmmu_t *); 469 470 static void sfmmu_tsbinfo_setup_phys(struct tsb_info *, pfn_t); 471 static int sfmmu_tsbinfo_alloc(struct tsb_info **, int, int, uint_t, 472 sfmmu_t *); 473 static void sfmmu_tsb_free(struct tsb_info *); 474 static void sfmmu_tsbinfo_free(struct tsb_info *); 475 static int sfmmu_init_tsbinfo(struct tsb_info *, int, int, uint_t, 476 sfmmu_t *); 477 static void sfmmu_tsb_chk_reloc(sfmmu_t *, hatlock_t *); 478 static void sfmmu_tsb_swapin(sfmmu_t *, hatlock_t *); 479 static int 
sfmmu_select_tsb_szc(pgcnt_t); 480 static void sfmmu_mod_tsb(sfmmu_t *, caddr_t, tte_t *, int); 481 #define sfmmu_load_tsb(sfmmup, vaddr, tte, szc) \ 482 sfmmu_mod_tsb(sfmmup, vaddr, tte, szc) 483 #define sfmmu_unload_tsb(sfmmup, vaddr, szc) \ 484 sfmmu_mod_tsb(sfmmup, vaddr, NULL, szc) 485 static void sfmmu_copy_tsb(struct tsb_info *, struct tsb_info *); 486 static tsb_replace_rc_t sfmmu_replace_tsb(sfmmu_t *, struct tsb_info *, uint_t, 487 hatlock_t *, uint_t); 488 static void sfmmu_size_tsb(sfmmu_t *, int, uint64_t, uint64_t, int); 489 490 #ifdef VAC 491 void sfmmu_cache_flush(pfn_t, int); 492 void sfmmu_cache_flushcolor(int, pfn_t); 493 #endif 494 static caddr_t sfmmu_hblk_chgattr(sfmmu_t *, struct hme_blk *, caddr_t, 495 caddr_t, demap_range_t *, uint_t, int); 496 497 static uint64_t sfmmu_vtop_attr(uint_t, int mode, tte_t *); 498 static uint_t sfmmu_ptov_attr(tte_t *); 499 static caddr_t sfmmu_hblk_chgprot(sfmmu_t *, struct hme_blk *, caddr_t, 500 caddr_t, demap_range_t *, uint_t); 501 static uint_t sfmmu_vtop_prot(uint_t, uint_t *); 502 static int sfmmu_idcache_constructor(void *, void *, int); 503 static void sfmmu_idcache_destructor(void *, void *); 504 static int sfmmu_hblkcache_constructor(void *, void *, int); 505 static void sfmmu_hblkcache_destructor(void *, void *); 506 static void sfmmu_hblkcache_reclaim(void *); 507 static void sfmmu_shadow_hcleanup(sfmmu_t *, struct hme_blk *, 508 struct hmehash_bucket *); 509 static void sfmmu_hblk_hash_rm(struct hmehash_bucket *, struct hme_blk *, 510 struct hme_blk *, struct hme_blk **, int); 511 static void sfmmu_hblk_hash_add(struct hmehash_bucket *, struct hme_blk *, 512 uint64_t); 513 static struct hme_blk *sfmmu_check_pending_hblks(int); 514 static void sfmmu_free_hblks(sfmmu_t *, caddr_t, caddr_t, int); 515 static void sfmmu_cleanup_rhblk(sf_srd_t *, caddr_t, uint_t, int); 516 static void sfmmu_unload_hmeregion_va(sf_srd_t *, uint_t, caddr_t, caddr_t, 517 int, caddr_t *); 518 static void 
sfmmu_unload_hmeregion(sf_srd_t *, sf_region_t *); 519 520 static void sfmmu_rm_large_mappings(page_t *, int); 521 522 static void hat_lock_init(void); 523 static void hat_kstat_init(void); 524 static int sfmmu_kstat_percpu_update(kstat_t *ksp, int rw); 525 static void sfmmu_set_scd_rttecnt(sf_srd_t *, sf_scd_t *); 526 static int sfmmu_is_rgnva(sf_srd_t *, caddr_t, ulong_t, ulong_t); 527 static void sfmmu_check_page_sizes(sfmmu_t *, int); 528 int fnd_mapping_sz(page_t *); 529 static void iment_add(struct ism_ment *, struct hat *); 530 static void iment_sub(struct ism_ment *, struct hat *); 531 static pgcnt_t ism_tsb_entries(sfmmu_t *, int szc); 532 extern void sfmmu_setup_tsbinfo(sfmmu_t *); 533 extern void sfmmu_clear_utsbinfo(void); 534 535 static void sfmmu_ctx_wrap_around(mmu_ctx_t *, boolean_t); 536 537 extern int vpm_enable; 538 539 /* kpm globals */ 540 #ifdef DEBUG 541 /* 542 * Enable trap level tsbmiss handling 543 */ 544 int kpm_tsbmtl = 1; 545 546 /* 547 * Flush the TLB on kpm mapout. Note: Xcalls are used (again) for the 548 * required TLB shootdowns in this case, so handle w/ care. Off by default. 549 */ 550 int kpm_tlb_flush; 551 #endif /* DEBUG */ 552 553 static void *sfmmu_vmem_xalloc_aligned_wrapper(vmem_t *, size_t, int); 554 555 #ifdef DEBUG 556 static void sfmmu_check_hblk_flist(); 557 #endif 558 559 /* 560 * Semi-private sfmmu data structures. Some of them are initialize in 561 * startup or in hat_init. 
 * Some of them are private but accessed by
 * assembly code or mach_sfmmu.c
 */
struct hmehash_bucket *uhme_hash;	/* user hmeblk hash table */
struct hmehash_bucket *khme_hash;	/* kernel hmeblk hash table */
uint64_t	uhme_hash_pa;		/* PA of uhme_hash */
uint64_t	khme_hash_pa;		/* PA of khme_hash */
int		uhmehash_num;		/* # of buckets in user hash table */
int		khmehash_num;		/* # of buckets in kernel hash table */

uint_t		max_mmu_ctxdoms = 0;	/* max context domains in the system */
mmu_ctx_t	**mmu_ctxs_tbl;		/* global array of context domains */
uint64_t	mmu_saved_gnum = 0;	/* to init incoming MMUs' gnums */

#define	DEFAULT_NUM_CTXS_PER_MMU 8192
static uint_t	nctxs = DEFAULT_NUM_CTXS_PER_MMU;

int		cache;			/* describes system cache */

caddr_t		ktsb_base;		/* kernel 8k-indexed tsb base address */
uint64_t	ktsb_pbase;		/* kernel 8k-indexed tsb phys address */
int		ktsb_szcode;		/* kernel 8k-indexed tsb size code */
int		ktsb_sz;		/* kernel 8k-indexed tsb size */

caddr_t		ktsb4m_base;		/* kernel 4m-indexed tsb base address */
uint64_t	ktsb4m_pbase;		/* kernel 4m-indexed tsb phys address */
int		ktsb4m_szcode;		/* kernel 4m-indexed tsb size code */
int		ktsb4m_sz;		/* kernel 4m-indexed tsb size */

uint64_t	kpm_tsbbase;		/* kernel seg_kpm 4M TSB base address */
int		kpm_tsbsz;		/* kernel seg_kpm 4M TSB size code */
uint64_t	kpmsm_tsbbase;		/* kernel seg_kpm 8K TSB base address */
int		kpmsm_tsbsz;		/* kernel seg_kpm 8K TSB size code */

/* The following are only defined when not building for sun4v. */
#ifndef sun4v
int		utsb_dtlb_ttenum = -1;	/* index in TLB for utsb locked TTE */
int		utsb4m_dtlb_ttenum = -1; /* index in TLB for 4M TSB TTE */
int		dtlb_resv_ttenum;	/* index in TLB of first reserved TTE */
caddr_t		utsb_vabase;		/* reserved kernel virtual memory */
caddr_t		utsb4m_vabase;		/* for trap handler TSB accesses */
#endif /* sun4v */
uint64_t	tsb_alloc_bytes = 0;	/* bytes allocated to TSBs */
vmem_t		*kmem_tsb_default_arena[NLGRPS_MAX];	/* For dynamic TSBs */
vmem_t		*kmem_bigtsb_default_arena[NLGRPS_MAX]; /* dynamic 256M TSBs */

/*
 * Size to use for TSB slabs.  Future platforms that support page sizes
 * larger than 4M may wish to change these values, and provide their own
 * assembly macros for building and decoding the TSB base register contents.
 * Note disable_large_pages will override the value set here.
 */
static	uint_t tsb_slab_ttesz = TTE4M;
size_t	tsb_slab_size = MMU_PAGESIZE4M;
uint_t	tsb_slab_shift = MMU_PAGESHIFT4M;
/* PFN mask for TTE */
size_t	tsb_slab_mask = MMU_PAGEOFFSET4M >> MMU_PAGESHIFT;

/*
 * Size to use for TSB slabs.  These are used only when 256M tsb arenas
 * exist.
 */
static uint_t	bigtsb_slab_ttesz = TTE256M;
static size_t	bigtsb_slab_size = MMU_PAGESIZE256M;
static uint_t	bigtsb_slab_shift = MMU_PAGESHIFT256M;
/* 256M page alignment for 8K pfn */
static size_t	bigtsb_slab_mask = MMU_PAGEOFFSET256M >> MMU_PAGESHIFT;

/* largest TSB size to grow to, will be smaller on smaller memory systems */
static int	tsb_max_growsize = 0;

/*
 * Tunable parameters dealing with TSB policies.
 */

/*
 * This undocumented tunable forces all 8K TSBs to be allocated from
 * the kernel heap rather than from the kmem_tsb_default_arena arenas.
 */
#ifdef	DEBUG
int	tsb_forceheap = 0;
#endif	/* DEBUG */

/*
 * Decide whether to use per-lgroup arenas, or one global set of
 * TSB arenas.  The default is not to break up per-lgroup, since
 * most platforms don't recognize any tangible benefit from it.
 */
int	tsb_lgrp_affinity = 0;

/*
 * Used for growing the TSB based on the process RSS.
 * tsb_rss_factor is based on the smallest TSB, and is
 * shifted by the TSB size to determine if we need to grow.
654 * The default will grow the TSB if the number of TTEs for 655 * this page size exceeds 75% of the number of TSB entries, 656 * which should _almost_ eliminate all conflict misses 657 * (at the expense of using up lots and lots of memory). 658 */ 659 #define TSB_RSS_FACTOR (TSB_ENTRIES(TSB_MIN_SZCODE) * 0.75) 660 #define SFMMU_RSS_TSBSIZE(tsbszc) (tsb_rss_factor << tsbszc) 661 #define SELECT_TSB_SIZECODE(pgcnt) ( \ 662 (enable_tsb_rss_sizing)? sfmmu_select_tsb_szc(pgcnt) : \ 663 default_tsb_size) 664 #define TSB_OK_SHRINK() \ 665 (tsb_alloc_bytes > tsb_alloc_hiwater || freemem < desfree) 666 #define TSB_OK_GROW() \ 667 (tsb_alloc_bytes < tsb_alloc_hiwater && freemem > desfree) 668 669 int enable_tsb_rss_sizing = 1; 670 int tsb_rss_factor = (int)TSB_RSS_FACTOR; 671 672 /* which TSB size code to use for new address spaces or if rss sizing off */ 673 int default_tsb_size = TSB_8K_SZCODE; 674 675 static uint64_t tsb_alloc_hiwater; /* limit TSB reserved memory */ 676 uint64_t tsb_alloc_hiwater_factor; /* tsb_alloc_hiwater = physmem / this */ 677 #define TSB_ALLOC_HIWATER_FACTOR_DEFAULT 32 678 679 #ifdef DEBUG 680 static int tsb_random_size = 0; /* set to 1 to test random tsb sizes on alloc */ 681 static int tsb_grow_stress = 0; /* if set to 1, keep replacing TSB w/ random */ 682 static int tsb_alloc_mtbf = 0; /* fail allocation every n attempts */ 683 static int tsb_alloc_fail_mtbf = 0; 684 static int tsb_alloc_count = 0; 685 #endif /* DEBUG */ 686 687 /* if set to 1, will remap valid TTEs when growing TSB. */ 688 int tsb_remap_ttes = 1; 689 690 /* 691 * If we have more than this many mappings, allocate a second TSB. 692 * This default is chosen because the I/D fully associative TLBs are 693 * assumed to have at least 8 available entries. Platforms with a 694 * larger fully-associative TLB could probably override the default. 
 */

#ifdef sun4v
int tsb_sectsb_threshold = 0;
#else
int tsb_sectsb_threshold = 8;
#endif

/*
 * kstat data
 */
struct sfmmu_global_stat sfmmu_global_stat;
struct sfmmu_tsbsize_stat sfmmu_tsbsize_stat;

/*
 * Global data
 */
sfmmu_t		*ksfmmup;		/* kernel's hat id */

#ifdef DEBUG
static void	chk_tte(tte_t *, tte_t *, tte_t *, struct hme_blk *);
#endif

/* sfmmu locking operations */
static kmutex_t *sfmmu_mlspl_enter(struct page *, int);
static int	sfmmu_mlspl_held(struct page *, int);

kmutex_t *sfmmu_page_enter(page_t *);
void	sfmmu_page_exit(kmutex_t *);
int	sfmmu_page_spl_held(struct page *);

/* sfmmu internal locking operations - accessed directly */
static void	sfmmu_mlist_reloc_enter(page_t *, page_t *,
				kmutex_t **, kmutex_t **);
static void	sfmmu_mlist_reloc_exit(kmutex_t *, kmutex_t *);
static hatlock_t *
		sfmmu_hat_enter(sfmmu_t *);
static hatlock_t *
		sfmmu_hat_tryenter(sfmmu_t *);
static void	sfmmu_hat_exit(hatlock_t *);
static void	sfmmu_hat_lock_all(void);
static void	sfmmu_hat_unlock_all(void);
static void	sfmmu_ismhat_enter(sfmmu_t *, int);
static void	sfmmu_ismhat_exit(sfmmu_t *, int);

/*
 * Array of mutexes protecting a page's mapping list and p_nrm field.
 *
 * The hash function looks complicated, but is made up so that:
 *
 * "pp" not shifted, so adjacent pp values will hash to different cache lines
 *  (8 byte alignment * 8 bytes/mutex == 64 byte coherency subblock)
 *
 * "pp" >> mml_shift, incorporates more source bits into the hash result
 *
 *  "& (mml_table_sz - 1)", should be faster than using remainder "%"
 *
 * Hopefully, mml_table, mml_table_sz and mml_shift are all in the same
 * cacheline, since they get declared next to each other below. We'll trust
 * ld not to do something random.
755 */ 756 #ifdef DEBUG 757 int mlist_hash_debug = 0; 758 #define MLIST_HASH(pp) (mlist_hash_debug ? &mml_table[0] : \ 759 &mml_table[((uintptr_t)(pp) + \ 760 ((uintptr_t)(pp) >> mml_shift)) & (mml_table_sz - 1)]) 761 #else /* !DEBUG */ 762 #define MLIST_HASH(pp) &mml_table[ \ 763 ((uintptr_t)(pp) + ((uintptr_t)(pp) >> mml_shift)) & (mml_table_sz - 1)] 764 #endif /* !DEBUG */ 765 766 kmutex_t *mml_table; 767 uint_t mml_table_sz; /* must be a power of 2 */ 768 uint_t mml_shift; /* log2(mml_table_sz) + 3 for align */ 769 770 kpm_hlk_t *kpmp_table; 771 uint_t kpmp_table_sz; /* must be a power of 2 */ 772 uchar_t kpmp_shift; 773 774 kpm_shlk_t *kpmp_stable; 775 uint_t kpmp_stable_sz; /* must be a power of 2 */ 776 777 /* 778 * SPL_HASH was improved to avoid false cache line sharing 779 */ 780 #define SPL_TABLE_SIZE 128 781 #define SPL_MASK (SPL_TABLE_SIZE - 1) 782 #define SPL_SHIFT 7 /* log2(SPL_TABLE_SIZE) */ 783 784 #define SPL_INDEX(pp) \ 785 ((((uintptr_t)(pp) >> SPL_SHIFT) ^ \ 786 ((uintptr_t)(pp) >> (SPL_SHIFT << 1))) & \ 787 (SPL_TABLE_SIZE - 1)) 788 789 #define SPL_HASH(pp) \ 790 (&sfmmu_page_lock[SPL_INDEX(pp) & SPL_MASK].pad_mutex) 791 792 static pad_mutex_t sfmmu_page_lock[SPL_TABLE_SIZE]; 793 794 795 /* 796 * hat_unload_callback() will group together callbacks in order 797 * to avoid xt_sync() calls. This is the maximum size of the group. 798 */ 799 #define MAX_CB_ADDR 32 800 801 tte_t hw_tte; 802 static ulong_t sfmmu_dmr_maxbit = DMR_MAXBIT; 803 804 static char *mmu_ctx_kstat_names[] = { 805 "mmu_ctx_tsb_exceptions", 806 "mmu_ctx_tsb_raise_exception", 807 "mmu_ctx_wrap_around", 808 }; 809 810 /* 811 * Wrapper for vmem_xalloc since vmem_create only allows limited 812 * parameters for vm_source_alloc functions. This function allows us 813 * to specify alignment consistent with the size of the object being 814 * allocated. 
815 */ 816 static void * 817 sfmmu_vmem_xalloc_aligned_wrapper(vmem_t *vmp, size_t size, int vmflag) 818 { 819 return (vmem_xalloc(vmp, size, size, 0, 0, NULL, NULL, vmflag)); 820 } 821 822 /* Common code for setting tsb_alloc_hiwater. */ 823 #define SFMMU_SET_TSB_ALLOC_HIWATER(pages) tsb_alloc_hiwater = \ 824 ptob(pages) / tsb_alloc_hiwater_factor 825 826 /* 827 * Set tsb_max_growsize to allow at most all of physical memory to be mapped by 828 * a single TSB. physmem is the number of physical pages so we need physmem 8K 829 * TTEs to represent all those physical pages. We round this up by using 830 * 1<<highbit(). To figure out which size code to use, remember that the size 831 * code is just an amount to shift the smallest TSB size to get the size of 832 * this TSB. So we subtract that size, TSB_START_SIZE, from highbit() (or 833 * highbit() - 1) to get the size code for the smallest TSB that can represent 834 * all of physical memory, while erring on the side of too much. 835 * 836 * Restrict tsb_max_growsize to make sure that: 837 * 1) TSBs can't grow larger than the TSB slab size 838 * 2) TSBs can't grow larger than UTSB_MAX_SZCODE. 839 */ 840 #define SFMMU_SET_TSB_MAX_GROWSIZE(pages) { \ 841 int _i, _szc, _slabszc, _tsbszc; \ 842 \ 843 _i = highbit(pages); \ 844 if ((1 << (_i - 1)) == (pages)) \ 845 _i--; /* 2^n case, round down */ \ 846 _szc = _i - TSB_START_SIZE; \ 847 _slabszc = bigtsb_slab_shift - (TSB_START_SIZE + TSB_ENTRY_SHIFT); \ 848 _tsbszc = MIN(_szc, _slabszc); \ 849 tsb_max_growsize = MIN(_tsbszc, UTSB_MAX_SZCODE); \ 850 } 851 852 /* 853 * Given a pointer to an sfmmu and a TTE size code, return a pointer to the 854 * tsb_info which handles that TTE size. 
855 */ 856 #define SFMMU_GET_TSBINFO(tsbinfop, sfmmup, tte_szc) { \ 857 (tsbinfop) = (sfmmup)->sfmmu_tsb; \ 858 ASSERT(((tsbinfop)->tsb_flags & TSB_SHAREDCTX) || \ 859 sfmmu_hat_lock_held(sfmmup)); \ 860 if ((tte_szc) >= TTE4M) { \ 861 ASSERT((tsbinfop) != NULL); \ 862 (tsbinfop) = (tsbinfop)->tsb_next; \ 863 } \ 864 } 865 866 /* 867 * Macro to use to unload entries from the TSB. 868 * It has knowledge of which page sizes get replicated in the TSB 869 * and will call the appropriate unload routine for the appropriate size. 870 */ 871 #define SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, ismhat) \ 872 { \ 873 int ttesz = get_hblk_ttesz(hmeblkp); \ 874 if (ttesz == TTE8K || ttesz == TTE4M) { \ 875 sfmmu_unload_tsb(sfmmup, addr, ttesz); \ 876 } else { \ 877 caddr_t sva = ismhat ? addr : \ 878 (caddr_t)get_hblk_base(hmeblkp); \ 879 caddr_t eva = sva + get_hblk_span(hmeblkp); \ 880 ASSERT(addr >= sva && addr < eva); \ 881 sfmmu_unload_tsb_range(sfmmup, sva, eva, ttesz); \ 882 } \ 883 } 884 885 886 /* Update tsb_alloc_hiwater after memory is configured. */ 887 /*ARGSUSED*/ 888 static void 889 sfmmu_update_post_add(void *arg, pgcnt_t delta_pages) 890 { 891 /* Assumes physmem has already been updated. */ 892 SFMMU_SET_TSB_ALLOC_HIWATER(physmem); 893 SFMMU_SET_TSB_MAX_GROWSIZE(physmem); 894 } 895 896 /* 897 * Update tsb_alloc_hiwater before memory is deleted. We'll do nothing here 898 * and update tsb_alloc_hiwater and tsb_max_growsize after the memory is 899 * deleted. 900 */ 901 /*ARGSUSED*/ 902 static int 903 sfmmu_update_pre_del(void *arg, pgcnt_t delta_pages) 904 { 905 return (0); 906 } 907 908 /* Update tsb_alloc_hiwater after memory fails to be unconfigured. */ 909 /*ARGSUSED*/ 910 static void 911 sfmmu_update_post_del(void *arg, pgcnt_t delta_pages, int cancelled) 912 { 913 /* 914 * Whether the delete was cancelled or not, just go ahead and update 915 * tsb_alloc_hiwater and tsb_max_growsize. 
916 */ 917 SFMMU_SET_TSB_ALLOC_HIWATER(physmem); 918 SFMMU_SET_TSB_MAX_GROWSIZE(physmem); 919 } 920 921 static kphysm_setup_vector_t sfmmu_update_vec = { 922 KPHYSM_SETUP_VECTOR_VERSION, /* version */ 923 sfmmu_update_post_add, /* post_add */ 924 sfmmu_update_pre_del, /* pre_del */ 925 sfmmu_update_post_del /* post_del */ 926 }; 927 928 929 /* 930 * HME_BLK HASH PRIMITIVES 931 */ 932 933 /* 934 * Enter a hme on the mapping list for page pp. 935 * When large pages are more prevalent in the system we might want to 936 * keep the mapping list in ascending order by the hment size. For now, 937 * small pages are more frequent, so don't slow it down. 938 */ 939 #define HME_ADD(hme, pp) \ 940 { \ 941 ASSERT(sfmmu_mlist_held(pp)); \ 942 \ 943 hme->hme_prev = NULL; \ 944 hme->hme_next = pp->p_mapping; \ 945 hme->hme_page = pp; \ 946 if (pp->p_mapping) { \ 947 ((struct sf_hment *)(pp->p_mapping))->hme_prev = hme;\ 948 ASSERT(pp->p_share > 0); \ 949 } else { \ 950 /* EMPTY */ \ 951 ASSERT(pp->p_share == 0); \ 952 } \ 953 pp->p_mapping = hme; \ 954 pp->p_share++; \ 955 } 956 957 /* 958 * Enter a hme on the mapping list for page pp. 959 * If we are unmapping a large translation, we need to make sure that the 960 * change is reflect in the corresponding bit of the p_index field. 961 */ 962 #define HME_SUB(hme, pp) \ 963 { \ 964 ASSERT(sfmmu_mlist_held(pp)); \ 965 ASSERT(hme->hme_page == pp || IS_PAHME(hme)); \ 966 \ 967 if (pp->p_mapping == NULL) { \ 968 panic("hme_remove - no mappings"); \ 969 } \ 970 \ 971 membar_stst(); /* ensure previous stores finish */ \ 972 \ 973 ASSERT(pp->p_share > 0); \ 974 pp->p_share--; \ 975 \ 976 if (hme->hme_prev) { \ 977 ASSERT(pp->p_mapping != hme); \ 978 ASSERT(hme->hme_prev->hme_page == pp || \ 979 IS_PAHME(hme->hme_prev)); \ 980 hme->hme_prev->hme_next = hme->hme_next; \ 981 } else { \ 982 ASSERT(pp->p_mapping == hme); \ 983 pp->p_mapping = hme->hme_next; \ 984 ASSERT((pp->p_mapping == NULL) ? 
\ 985 (pp->p_share == 0) : 1); \ 986 } \ 987 \ 988 if (hme->hme_next) { \ 989 ASSERT(hme->hme_next->hme_page == pp || \ 990 IS_PAHME(hme->hme_next)); \ 991 hme->hme_next->hme_prev = hme->hme_prev; \ 992 } \ 993 \ 994 /* zero out the entry */ \ 995 hme->hme_next = NULL; \ 996 hme->hme_prev = NULL; \ 997 hme->hme_page = NULL; \ 998 \ 999 if (hme_size(hme) > TTE8K) { \ 1000 /* remove mappings for remainder of large pg */ \ 1001 sfmmu_rm_large_mappings(pp, hme_size(hme)); \ 1002 } \ 1003 } 1004 1005 /* 1006 * This function returns the hment given the hme_blk and a vaddr. 1007 * It assumes addr has already been checked to belong to hme_blk's 1008 * range. 1009 */ 1010 #define HBLKTOHME(hment, hmeblkp, addr) \ 1011 { \ 1012 int index; \ 1013 HBLKTOHME_IDX(hment, hmeblkp, addr, index) \ 1014 } 1015 1016 /* 1017 * Version of HBLKTOHME that also returns the index in hmeblkp 1018 * of the hment. 1019 */ 1020 #define HBLKTOHME_IDX(hment, hmeblkp, addr, idx) \ 1021 { \ 1022 ASSERT(in_hblk_range((hmeblkp), (addr))); \ 1023 \ 1024 if (get_hblk_ttesz(hmeblkp) == TTE8K) { \ 1025 idx = (((uintptr_t)(addr) >> MMU_PAGESHIFT) & (NHMENTS-1)); \ 1026 } else \ 1027 idx = 0; \ 1028 \ 1029 (hment) = &(hmeblkp)->hblk_hme[idx]; \ 1030 } 1031 1032 /* 1033 * Disable any page sizes not supported by the CPU 1034 */ 1035 void 1036 hat_init_pagesizes() 1037 { 1038 int i; 1039 1040 mmu_exported_page_sizes = 0; 1041 for (i = TTE8K; i < max_mmu_page_sizes; i++) { 1042 1043 szc_2_userszc[i] = (uint_t)-1; 1044 userszc_2_szc[i] = (uint_t)-1; 1045 1046 if ((mmu_exported_pagesize_mask & (1 << i)) == 0) { 1047 disable_large_pages |= (1 << i); 1048 } else { 1049 szc_2_userszc[i] = mmu_exported_page_sizes; 1050 userszc_2_szc[mmu_exported_page_sizes] = i; 1051 mmu_exported_page_sizes++; 1052 } 1053 } 1054 1055 disable_ism_large_pages |= disable_large_pages; 1056 disable_auto_data_large_pages = disable_large_pages; 1057 disable_auto_text_large_pages = disable_large_pages; 1058 1059 /* 1060 * Initialize 
mmu-specific large page sizes. 1061 */ 1062 if (&mmu_large_pages_disabled) { 1063 disable_large_pages |= mmu_large_pages_disabled(HAT_LOAD); 1064 disable_ism_large_pages |= 1065 mmu_large_pages_disabled(HAT_LOAD_SHARE); 1066 disable_auto_data_large_pages |= 1067 mmu_large_pages_disabled(HAT_AUTO_DATA); 1068 disable_auto_text_large_pages |= 1069 mmu_large_pages_disabled(HAT_AUTO_TEXT); 1070 } 1071 } 1072 1073 /* 1074 * Initialize the hardware address translation structures. 1075 */ 1076 void 1077 hat_init(void) 1078 { 1079 int i; 1080 uint_t sz; 1081 size_t size; 1082 1083 hat_lock_init(); 1084 hat_kstat_init(); 1085 1086 /* 1087 * Hardware-only bits in a TTE 1088 */ 1089 MAKE_TTE_MASK(&hw_tte); 1090 1091 hat_init_pagesizes(); 1092 1093 /* Initialize the hash locks */ 1094 for (i = 0; i < khmehash_num; i++) { 1095 mutex_init(&khme_hash[i].hmehash_mutex, NULL, 1096 MUTEX_DEFAULT, NULL); 1097 khme_hash[i].hmeh_nextpa = HMEBLK_ENDPA; 1098 } 1099 for (i = 0; i < uhmehash_num; i++) { 1100 mutex_init(&uhme_hash[i].hmehash_mutex, NULL, 1101 MUTEX_DEFAULT, NULL); 1102 uhme_hash[i].hmeh_nextpa = HMEBLK_ENDPA; 1103 } 1104 khmehash_num--; /* make sure counter starts from 0 */ 1105 uhmehash_num--; /* make sure counter starts from 0 */ 1106 1107 /* 1108 * Allocate context domain structures. 1109 * 1110 * A platform may choose to modify max_mmu_ctxdoms in 1111 * set_platform_defaults(). If a platform does not define 1112 * a set_platform_defaults() or does not choose to modify 1113 * max_mmu_ctxdoms, it gets one MMU context domain for every CPU. 1114 * 1115 * For all platforms that have CPUs sharing MMUs, this 1116 * value must be defined. 
1117 */ 1118 if (max_mmu_ctxdoms == 0) 1119 max_mmu_ctxdoms = max_ncpus; 1120 1121 size = max_mmu_ctxdoms * sizeof (mmu_ctx_t *); 1122 mmu_ctxs_tbl = kmem_zalloc(size, KM_SLEEP); 1123 1124 /* mmu_ctx_t is 64 bytes aligned */ 1125 mmuctxdom_cache = kmem_cache_create("mmuctxdom_cache", 1126 sizeof (mmu_ctx_t), 64, NULL, NULL, NULL, NULL, NULL, 0); 1127 /* 1128 * MMU context domain initialization for the Boot CPU. 1129 * This needs the context domains array allocated above. 1130 */ 1131 mutex_enter(&cpu_lock); 1132 sfmmu_cpu_init(CPU); 1133 mutex_exit(&cpu_lock); 1134 1135 /* 1136 * Intialize ism mapping list lock. 1137 */ 1138 1139 mutex_init(&ism_mlist_lock, NULL, MUTEX_DEFAULT, NULL); 1140 1141 /* 1142 * Each sfmmu structure carries an array of MMU context info 1143 * structures, one per context domain. The size of this array depends 1144 * on the maximum number of context domains. So, the size of the 1145 * sfmmu structure varies per platform. 1146 * 1147 * sfmmu is allocated from static arena, because trap 1148 * handler at TL > 0 is not allowed to touch kernel relocatable 1149 * memory. sfmmu's alignment is changed to 64 bytes from 1150 * default 8 bytes, as the lower 6 bits will be used to pass 1151 * pgcnt to vtag_flush_pgcnt_tl1. 1152 */ 1153 size = sizeof (sfmmu_t) + sizeof (sfmmu_ctx_t) * (max_mmu_ctxdoms - 1); 1154 1155 sfmmuid_cache = kmem_cache_create("sfmmuid_cache", size, 1156 64, sfmmu_idcache_constructor, sfmmu_idcache_destructor, 1157 NULL, NULL, static_arena, 0); 1158 1159 sfmmu_tsbinfo_cache = kmem_cache_create("sfmmu_tsbinfo_cache", 1160 sizeof (struct tsb_info), 0, NULL, NULL, NULL, NULL, NULL, 0); 1161 1162 /* 1163 * Since we only use the tsb8k cache to "borrow" pages for TSBs 1164 * from the heap when low on memory or when TSB_FORCEALLOC is 1165 * specified, don't use magazines to cache them--we want to return 1166 * them to the system as quickly as possible. 
1167 */ 1168 sfmmu_tsb8k_cache = kmem_cache_create("sfmmu_tsb8k_cache", 1169 MMU_PAGESIZE, MMU_PAGESIZE, NULL, NULL, NULL, NULL, 1170 static_arena, KMC_NOMAGAZINE); 1171 1172 /* 1173 * Set tsb_alloc_hiwater to 1/tsb_alloc_hiwater_factor of physical 1174 * memory, which corresponds to the old static reserve for TSBs. 1175 * tsb_alloc_hiwater_factor defaults to 32. This caps the amount of 1176 * memory we'll allocate for TSB slabs; beyond this point TSB 1177 * allocations will be taken from the kernel heap (via 1178 * sfmmu_tsb8k_cache) and will be throttled as would any other kmem 1179 * consumer. 1180 */ 1181 if (tsb_alloc_hiwater_factor == 0) { 1182 tsb_alloc_hiwater_factor = TSB_ALLOC_HIWATER_FACTOR_DEFAULT; 1183 } 1184 SFMMU_SET_TSB_ALLOC_HIWATER(physmem); 1185 1186 for (sz = tsb_slab_ttesz; sz > 0; sz--) { 1187 if (!(disable_large_pages & (1 << sz))) 1188 break; 1189 } 1190 1191 if (sz < tsb_slab_ttesz) { 1192 tsb_slab_ttesz = sz; 1193 tsb_slab_shift = MMU_PAGESHIFT + (sz << 1) + sz; 1194 tsb_slab_size = 1 << tsb_slab_shift; 1195 tsb_slab_mask = (1 << (tsb_slab_shift - MMU_PAGESHIFT)) - 1; 1196 use_bigtsb_arena = 0; 1197 } else if (use_bigtsb_arena && 1198 (disable_large_pages & (1 << bigtsb_slab_ttesz))) { 1199 use_bigtsb_arena = 0; 1200 } 1201 1202 if (!use_bigtsb_arena) { 1203 bigtsb_slab_shift = tsb_slab_shift; 1204 } 1205 SFMMU_SET_TSB_MAX_GROWSIZE(physmem); 1206 1207 /* 1208 * On smaller memory systems, allocate TSB memory in smaller chunks 1209 * than the default 4M slab size. We also honor disable_large_pages 1210 * here. 1211 * 1212 * The trap handlers need to be patched with the final slab shift, 1213 * since they need to be able to construct the TSB pointer at runtime. 
1214 */ 1215 if ((tsb_max_growsize <= TSB_512K_SZCODE) && 1216 !(disable_large_pages & (1 << TTE512K))) { 1217 tsb_slab_ttesz = TTE512K; 1218 tsb_slab_shift = MMU_PAGESHIFT512K; 1219 tsb_slab_size = MMU_PAGESIZE512K; 1220 tsb_slab_mask = MMU_PAGEOFFSET512K >> MMU_PAGESHIFT; 1221 use_bigtsb_arena = 0; 1222 } 1223 1224 if (!use_bigtsb_arena) { 1225 bigtsb_slab_ttesz = tsb_slab_ttesz; 1226 bigtsb_slab_shift = tsb_slab_shift; 1227 bigtsb_slab_size = tsb_slab_size; 1228 bigtsb_slab_mask = tsb_slab_mask; 1229 } 1230 1231 1232 /* 1233 * Set up memory callback to update tsb_alloc_hiwater and 1234 * tsb_max_growsize. 1235 */ 1236 i = kphysm_setup_func_register(&sfmmu_update_vec, (void *) 0); 1237 ASSERT(i == 0); 1238 1239 /* 1240 * kmem_tsb_arena is the source from which large TSB slabs are 1241 * drawn. The quantum of this arena corresponds to the largest 1242 * TSB size we can dynamically allocate for user processes. 1243 * Currently it must also be a supported page size since we 1244 * use exactly one translation entry to map each slab page. 1245 * 1246 * The per-lgroup kmem_tsb_default_arena arenas are the arenas from 1247 * which most TSBs are allocated. Since most TSB allocations are 1248 * typically 8K we have a kmem cache we stack on top of each 1249 * kmem_tsb_default_arena to speed up those allocations. 1250 * 1251 * Note the two-level scheme of arenas is required only 1252 * because vmem_create doesn't allow us to specify alignment 1253 * requirements. If this ever changes the code could be 1254 * simplified to use only one level of arenas. 1255 * 1256 * If 256M page support exists on sun4v, 256MB kmem_bigtsb_arena 1257 * will be provided in addition to the 4M kmem_tsb_arena. 
1258 */ 1259 if (use_bigtsb_arena) { 1260 kmem_bigtsb_arena = vmem_create("kmem_bigtsb", NULL, 0, 1261 bigtsb_slab_size, sfmmu_vmem_xalloc_aligned_wrapper, 1262 vmem_xfree, heap_arena, 0, VM_SLEEP); 1263 } 1264 1265 kmem_tsb_arena = vmem_create("kmem_tsb", NULL, 0, tsb_slab_size, 1266 sfmmu_vmem_xalloc_aligned_wrapper, 1267 vmem_xfree, heap_arena, 0, VM_SLEEP); 1268 1269 if (tsb_lgrp_affinity) { 1270 char s[50]; 1271 for (i = 0; i < NLGRPS_MAX; i++) { 1272 if (use_bigtsb_arena) { 1273 (void) sprintf(s, "kmem_bigtsb_lgrp%d", i); 1274 kmem_bigtsb_default_arena[i] = vmem_create(s, 1275 NULL, 0, 2 * tsb_slab_size, 1276 sfmmu_tsb_segkmem_alloc, 1277 sfmmu_tsb_segkmem_free, kmem_bigtsb_arena, 1278 0, VM_SLEEP | VM_BESTFIT); 1279 } 1280 1281 (void) sprintf(s, "kmem_tsb_lgrp%d", i); 1282 kmem_tsb_default_arena[i] = vmem_create(s, 1283 NULL, 0, PAGESIZE, sfmmu_tsb_segkmem_alloc, 1284 sfmmu_tsb_segkmem_free, kmem_tsb_arena, 0, 1285 VM_SLEEP | VM_BESTFIT); 1286 1287 (void) sprintf(s, "sfmmu_tsb_lgrp%d_cache", i); 1288 sfmmu_tsb_cache[i] = kmem_cache_create(s, 1289 PAGESIZE, PAGESIZE, NULL, NULL, NULL, NULL, 1290 kmem_tsb_default_arena[i], 0); 1291 } 1292 } else { 1293 if (use_bigtsb_arena) { 1294 kmem_bigtsb_default_arena[0] = 1295 vmem_create("kmem_bigtsb_default", NULL, 0, 1296 2 * tsb_slab_size, sfmmu_tsb_segkmem_alloc, 1297 sfmmu_tsb_segkmem_free, kmem_bigtsb_arena, 0, 1298 VM_SLEEP | VM_BESTFIT); 1299 } 1300 1301 kmem_tsb_default_arena[0] = vmem_create("kmem_tsb_default", 1302 NULL, 0, PAGESIZE, sfmmu_tsb_segkmem_alloc, 1303 sfmmu_tsb_segkmem_free, kmem_tsb_arena, 0, 1304 VM_SLEEP | VM_BESTFIT); 1305 sfmmu_tsb_cache[0] = kmem_cache_create("sfmmu_tsb_cache", 1306 PAGESIZE, PAGESIZE, NULL, NULL, NULL, NULL, 1307 kmem_tsb_default_arena[0], 0); 1308 } 1309 1310 sfmmu8_cache = kmem_cache_create("sfmmu8_cache", HME8BLK_SZ, 1311 HMEBLK_ALIGN, sfmmu_hblkcache_constructor, 1312 sfmmu_hblkcache_destructor, 1313 sfmmu_hblkcache_reclaim, (void *)HME8BLK_SZ, 1314 hat_memload_arena, 
KMC_NOHASH); 1315 1316 hat_memload1_arena = vmem_create("hat_memload1", NULL, 0, PAGESIZE, 1317 segkmem_alloc_permanent, segkmem_free, heap_arena, 0, 1318 VMC_DUMPSAFE | VM_SLEEP); 1319 1320 sfmmu1_cache = kmem_cache_create("sfmmu1_cache", HME1BLK_SZ, 1321 HMEBLK_ALIGN, sfmmu_hblkcache_constructor, 1322 sfmmu_hblkcache_destructor, 1323 NULL, (void *)HME1BLK_SZ, 1324 hat_memload1_arena, KMC_NOHASH); 1325 1326 pa_hment_cache = kmem_cache_create("pa_hment_cache", PAHME_SZ, 1327 0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH); 1328 1329 ism_blk_cache = kmem_cache_create("ism_blk_cache", 1330 sizeof (ism_blk_t), ecache_alignsize, NULL, NULL, 1331 NULL, NULL, static_arena, KMC_NOHASH); 1332 1333 ism_ment_cache = kmem_cache_create("ism_ment_cache", 1334 sizeof (ism_ment_t), 0, NULL, NULL, 1335 NULL, NULL, NULL, 0); 1336 1337 /* 1338 * We grab the first hat for the kernel, 1339 */ 1340 AS_LOCK_ENTER(&kas, &kas.a_lock, RW_WRITER); 1341 kas.a_hat = hat_alloc(&kas); 1342 AS_LOCK_EXIT(&kas, &kas.a_lock); 1343 1344 /* 1345 * Initialize hblk_reserve. 1346 */ 1347 ((struct hme_blk *)hblk_reserve)->hblk_nextpa = 1348 va_to_pa((caddr_t)hblk_reserve); 1349 1350 #ifndef UTSB_PHYS 1351 /* 1352 * Reserve some kernel virtual address space for the locked TTEs 1353 * that allow us to probe the TSB from TL>0. 1354 */ 1355 utsb_vabase = vmem_xalloc(heap_arena, tsb_slab_size, tsb_slab_size, 1356 0, 0, NULL, NULL, VM_SLEEP); 1357 utsb4m_vabase = vmem_xalloc(heap_arena, tsb_slab_size, tsb_slab_size, 1358 0, 0, NULL, NULL, VM_SLEEP); 1359 #endif 1360 1361 #ifdef VAC 1362 /* 1363 * The big page VAC handling code assumes VAC 1364 * will not be bigger than the smallest big 1365 * page- which is 64K. 1366 */ 1367 if (TTEPAGES(TTE64K) < CACHE_NUM_COLOR) { 1368 cmn_err(CE_PANIC, "VAC too big!"); 1369 } 1370 #endif 1371 1372 (void) xhat_init(); 1373 1374 uhme_hash_pa = va_to_pa(uhme_hash); 1375 khme_hash_pa = va_to_pa(khme_hash); 1376 1377 /* 1378 * Initialize relocation locks. 
kpr_suspendlock is held 1379 * at PIL_MAX to prevent interrupts from pinning the holder 1380 * of a suspended TTE which may access it leading to a 1381 * deadlock condition. 1382 */ 1383 mutex_init(&kpr_mutex, NULL, MUTEX_DEFAULT, NULL); 1384 mutex_init(&kpr_suspendlock, NULL, MUTEX_SPIN, (void *)PIL_MAX); 1385 1386 /* 1387 * If Shared context support is disabled via /etc/system 1388 * set shctx_on to 0 here if it was set to 1 earlier in boot 1389 * sequence by cpu module initialization code. 1390 */ 1391 if (shctx_on && disable_shctx) { 1392 shctx_on = 0; 1393 } 1394 1395 if (shctx_on) { 1396 srd_buckets = kmem_zalloc(SFMMU_MAX_SRD_BUCKETS * 1397 sizeof (srd_buckets[0]), KM_SLEEP); 1398 for (i = 0; i < SFMMU_MAX_SRD_BUCKETS; i++) { 1399 mutex_init(&srd_buckets[i].srdb_lock, NULL, 1400 MUTEX_DEFAULT, NULL); 1401 } 1402 1403 srd_cache = kmem_cache_create("srd_cache", sizeof (sf_srd_t), 1404 0, sfmmu_srdcache_constructor, sfmmu_srdcache_destructor, 1405 NULL, NULL, NULL, 0); 1406 region_cache = kmem_cache_create("region_cache", 1407 sizeof (sf_region_t), 0, sfmmu_rgncache_constructor, 1408 sfmmu_rgncache_destructor, NULL, NULL, NULL, 0); 1409 scd_cache = kmem_cache_create("scd_cache", sizeof (sf_scd_t), 1410 0, sfmmu_scdcache_constructor, sfmmu_scdcache_destructor, 1411 NULL, NULL, NULL, 0); 1412 } 1413 1414 /* 1415 * Pre-allocate hrm_hashtab before enabling the collection of 1416 * refmod statistics. Allocating on the fly would mean us 1417 * running the risk of suffering recursive mutex enters or 1418 * deadlocks. 
1419 */ 1420 hrm_hashtab = kmem_zalloc(HRM_HASHSIZE * sizeof (struct hrmstat *), 1421 KM_SLEEP); 1422 1423 /* Allocate per-cpu pending freelist of hmeblks */ 1424 cpu_hme_pend = kmem_zalloc((NCPU * sizeof (cpu_hme_pend_t)) + 64, 1425 KM_SLEEP); 1426 cpu_hme_pend = (cpu_hme_pend_t *)P2ROUNDUP( 1427 (uintptr_t)cpu_hme_pend, 64); 1428 1429 for (i = 0; i < NCPU; i++) { 1430 mutex_init(&cpu_hme_pend[i].chp_mutex, NULL, MUTEX_DEFAULT, 1431 NULL); 1432 } 1433 1434 if (cpu_hme_pend_thresh == 0) { 1435 cpu_hme_pend_thresh = CPU_HME_PEND_THRESH; 1436 } 1437 } 1438 1439 /* 1440 * Initialize locking for the hat layer, called early during boot. 1441 */ 1442 static void 1443 hat_lock_init() 1444 { 1445 int i; 1446 1447 /* 1448 * initialize the array of mutexes protecting a page's mapping 1449 * list and p_nrm field. 1450 */ 1451 for (i = 0; i < mml_table_sz; i++) 1452 mutex_init(&mml_table[i], NULL, MUTEX_DEFAULT, NULL); 1453 1454 if (kpm_enable) { 1455 for (i = 0; i < kpmp_table_sz; i++) { 1456 mutex_init(&kpmp_table[i].khl_mutex, NULL, 1457 MUTEX_DEFAULT, NULL); 1458 } 1459 } 1460 1461 /* 1462 * Initialize array of mutex locks that protects sfmmu fields and 1463 * TSB lists. 1464 */ 1465 for (i = 0; i < SFMMU_NUM_LOCK; i++) 1466 mutex_init(HATLOCK_MUTEXP(&hat_lock[i]), NULL, MUTEX_DEFAULT, 1467 NULL); 1468 } 1469 1470 #define SFMMU_KERNEL_MAXVA \ 1471 (kmem64_base ? (uintptr_t)kmem64_end : (SYSLIMIT)) 1472 1473 /* 1474 * Allocate a hat structure. 1475 * Called when an address space first uses a hat. 
1476 */ 1477 struct hat * 1478 hat_alloc(struct as *as) 1479 { 1480 sfmmu_t *sfmmup; 1481 int i; 1482 uint64_t cnum; 1483 extern uint_t get_color_start(struct as *); 1484 1485 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 1486 sfmmup = kmem_cache_alloc(sfmmuid_cache, KM_SLEEP); 1487 sfmmup->sfmmu_as = as; 1488 sfmmup->sfmmu_flags = 0; 1489 sfmmup->sfmmu_tteflags = 0; 1490 sfmmup->sfmmu_rtteflags = 0; 1491 LOCK_INIT_CLEAR(&sfmmup->sfmmu_ctx_lock); 1492 1493 if (as == &kas) { 1494 ksfmmup = sfmmup; 1495 sfmmup->sfmmu_cext = 0; 1496 cnum = KCONTEXT; 1497 1498 sfmmup->sfmmu_clrstart = 0; 1499 sfmmup->sfmmu_tsb = NULL; 1500 /* 1501 * hat_kern_setup() will call sfmmu_init_ktsbinfo() 1502 * to setup tsb_info for ksfmmup. 1503 */ 1504 } else { 1505 1506 /* 1507 * Just set to invalid ctx. When it faults, it will 1508 * get a valid ctx. This would avoid the situation 1509 * where we get a ctx, but it gets stolen and then 1510 * we fault when we try to run and so have to get 1511 * another ctx. 1512 */ 1513 sfmmup->sfmmu_cext = 0; 1514 cnum = INVALID_CONTEXT; 1515 1516 /* initialize original physical page coloring bin */ 1517 sfmmup->sfmmu_clrstart = get_color_start(as); 1518 #ifdef DEBUG 1519 if (tsb_random_size) { 1520 uint32_t randval = (uint32_t)gettick() >> 4; 1521 int size = randval % (tsb_max_growsize + 1); 1522 1523 /* chose a random tsb size for stress testing */ 1524 (void) sfmmu_tsbinfo_alloc(&sfmmup->sfmmu_tsb, size, 1525 TSB8K|TSB64K|TSB512K, 0, sfmmup); 1526 } else 1527 #endif /* DEBUG */ 1528 (void) sfmmu_tsbinfo_alloc(&sfmmup->sfmmu_tsb, 1529 default_tsb_size, 1530 TSB8K|TSB64K|TSB512K, 0, sfmmup); 1531 sfmmup->sfmmu_flags = HAT_SWAPPED | HAT_ALLCTX_INVALID; 1532 ASSERT(sfmmup->sfmmu_tsb != NULL); 1533 } 1534 1535 ASSERT(max_mmu_ctxdoms > 0); 1536 for (i = 0; i < max_mmu_ctxdoms; i++) { 1537 sfmmup->sfmmu_ctxs[i].cnum = cnum; 1538 sfmmup->sfmmu_ctxs[i].gnum = 0; 1539 } 1540 1541 for (i = 0; i < max_mmu_page_sizes; i++) { 1542 sfmmup->sfmmu_ttecnt[i] = 0; 1543 
sfmmup->sfmmu_scdrttecnt[i] = 0; 1544 sfmmup->sfmmu_ismttecnt[i] = 0; 1545 sfmmup->sfmmu_scdismttecnt[i] = 0; 1546 sfmmup->sfmmu_pgsz[i] = TTE8K; 1547 } 1548 sfmmup->sfmmu_tsb0_4minflcnt = 0; 1549 sfmmup->sfmmu_iblk = NULL; 1550 sfmmup->sfmmu_ismhat = 0; 1551 sfmmup->sfmmu_scdhat = 0; 1552 sfmmup->sfmmu_ismblkpa = (uint64_t)-1; 1553 if (sfmmup == ksfmmup) { 1554 CPUSET_ALL(sfmmup->sfmmu_cpusran); 1555 } else { 1556 CPUSET_ZERO(sfmmup->sfmmu_cpusran); 1557 } 1558 sfmmup->sfmmu_free = 0; 1559 sfmmup->sfmmu_rmstat = 0; 1560 sfmmup->sfmmu_clrbin = sfmmup->sfmmu_clrstart; 1561 sfmmup->sfmmu_xhat_provider = NULL; 1562 cv_init(&sfmmup->sfmmu_tsb_cv, NULL, CV_DEFAULT, NULL); 1563 sfmmup->sfmmu_srdp = NULL; 1564 SF_RGNMAP_ZERO(sfmmup->sfmmu_region_map); 1565 bzero(sfmmup->sfmmu_hmeregion_links, SFMMU_L1_HMERLINKS_SIZE); 1566 sfmmup->sfmmu_scdp = NULL; 1567 sfmmup->sfmmu_scd_link.next = NULL; 1568 sfmmup->sfmmu_scd_link.prev = NULL; 1569 return (sfmmup); 1570 } 1571 1572 /* 1573 * Create per-MMU context domain kstats for a given MMU ctx. 1574 */ 1575 static void 1576 sfmmu_mmu_kstat_create(mmu_ctx_t *mmu_ctxp) 1577 { 1578 mmu_ctx_stat_t stat; 1579 kstat_t *mmu_kstat; 1580 1581 ASSERT(MUTEX_HELD(&cpu_lock)); 1582 ASSERT(mmu_ctxp->mmu_kstat == NULL); 1583 1584 mmu_kstat = kstat_create("unix", mmu_ctxp->mmu_idx, "mmu_ctx", 1585 "hat", KSTAT_TYPE_NAMED, MMU_CTX_NUM_STATS, KSTAT_FLAG_VIRTUAL); 1586 1587 if (mmu_kstat == NULL) { 1588 cmn_err(CE_WARN, "kstat_create for MMU %d failed", 1589 mmu_ctxp->mmu_idx); 1590 } else { 1591 mmu_kstat->ks_data = mmu_ctxp->mmu_kstat_data; 1592 for (stat = 0; stat < MMU_CTX_NUM_STATS; stat++) 1593 kstat_named_init(&mmu_ctxp->mmu_kstat_data[stat], 1594 mmu_ctx_kstat_names[stat], KSTAT_DATA_INT64); 1595 mmu_ctxp->mmu_kstat = mmu_kstat; 1596 kstat_install(mmu_kstat); 1597 } 1598 } 1599 1600 /* 1601 * plat_cpuid_to_mmu_ctx_info() is a platform interface that returns MMU 1602 * context domain information for a given CPU. 
If a platform does not 1603 * specify that interface, then the function below is used instead to return 1604 * default information. The defaults are as follows: 1605 * 1606 * - The number of MMU context IDs supported on any CPU in the 1607 * system is 8K. 1608 * - There is one MMU context domain per CPU. 1609 */ 1610 /*ARGSUSED*/ 1611 static void 1612 sfmmu_cpuid_to_mmu_ctx_info(processorid_t cpuid, mmu_ctx_info_t *infop) 1613 { 1614 infop->mmu_nctxs = nctxs; 1615 infop->mmu_idx = cpu[cpuid]->cpu_seqid; 1616 } 1617 1618 /* 1619 * Called during CPU initialization to set the MMU context-related information 1620 * for a CPU. 1621 * 1622 * cpu_lock serializes accesses to mmu_ctxs and mmu_saved_gnum. 1623 */ 1624 void 1625 sfmmu_cpu_init(cpu_t *cp) 1626 { 1627 mmu_ctx_info_t info; 1628 mmu_ctx_t *mmu_ctxp; 1629 1630 ASSERT(MUTEX_HELD(&cpu_lock)); 1631 1632 if (&plat_cpuid_to_mmu_ctx_info == NULL) 1633 sfmmu_cpuid_to_mmu_ctx_info(cp->cpu_id, &info); 1634 else 1635 plat_cpuid_to_mmu_ctx_info(cp->cpu_id, &info); 1636 1637 ASSERT(info.mmu_idx < max_mmu_ctxdoms); 1638 1639 if ((mmu_ctxp = mmu_ctxs_tbl[info.mmu_idx]) == NULL) { 1640 /* Each mmu_ctx is cacheline aligned. */ 1641 mmu_ctxp = kmem_cache_alloc(mmuctxdom_cache, KM_SLEEP); 1642 bzero(mmu_ctxp, sizeof (mmu_ctx_t)); 1643 1644 mutex_init(&mmu_ctxp->mmu_lock, NULL, MUTEX_SPIN, 1645 (void *)ipltospl(DISP_LEVEL)); 1646 mmu_ctxp->mmu_idx = info.mmu_idx; 1647 mmu_ctxp->mmu_nctxs = info.mmu_nctxs; 1648 /* 1649 * Globally for lifetime of a system, 1650 * gnum must always increase. 1651 * mmu_saved_gnum is protected by the cpu_lock. 
1652 */ 1653 mmu_ctxp->mmu_gnum = mmu_saved_gnum + 1; 1654 mmu_ctxp->mmu_cnum = NUM_LOCKED_CTXS; 1655 1656 sfmmu_mmu_kstat_create(mmu_ctxp); 1657 1658 mmu_ctxs_tbl[info.mmu_idx] = mmu_ctxp; 1659 } else { 1660 ASSERT(mmu_ctxp->mmu_idx == info.mmu_idx); 1661 ASSERT(mmu_ctxp->mmu_nctxs <= info.mmu_nctxs); 1662 } 1663 1664 /* 1665 * The mmu_lock is acquired here to prevent races with 1666 * the wrap-around code. 1667 */ 1668 mutex_enter(&mmu_ctxp->mmu_lock); 1669 1670 1671 mmu_ctxp->mmu_ncpus++; 1672 CPUSET_ADD(mmu_ctxp->mmu_cpuset, cp->cpu_id); 1673 CPU_MMU_IDX(cp) = info.mmu_idx; 1674 CPU_MMU_CTXP(cp) = mmu_ctxp; 1675 1676 mutex_exit(&mmu_ctxp->mmu_lock); 1677 } 1678 1679 static void 1680 sfmmu_ctxdom_free(mmu_ctx_t *mmu_ctxp) 1681 { 1682 ASSERT(MUTEX_HELD(&cpu_lock)); 1683 ASSERT(!MUTEX_HELD(&mmu_ctxp->mmu_lock)); 1684 1685 mutex_destroy(&mmu_ctxp->mmu_lock); 1686 1687 if (mmu_ctxp->mmu_kstat) 1688 kstat_delete(mmu_ctxp->mmu_kstat); 1689 1690 /* mmu_saved_gnum is protected by the cpu_lock. */ 1691 if (mmu_saved_gnum < mmu_ctxp->mmu_gnum) 1692 mmu_saved_gnum = mmu_ctxp->mmu_gnum; 1693 1694 kmem_cache_free(mmuctxdom_cache, mmu_ctxp); 1695 } 1696 1697 /* 1698 * Called to perform MMU context-related cleanup for a CPU. 1699 */ 1700 void 1701 sfmmu_cpu_cleanup(cpu_t *cp) 1702 { 1703 mmu_ctx_t *mmu_ctxp; 1704 1705 ASSERT(MUTEX_HELD(&cpu_lock)); 1706 1707 mmu_ctxp = CPU_MMU_CTXP(cp); 1708 ASSERT(mmu_ctxp != NULL); 1709 1710 /* 1711 * The mmu_lock is acquired here to prevent races with 1712 * the wrap-around code. 
1713 */ 1714 mutex_enter(&mmu_ctxp->mmu_lock); 1715 1716 CPU_MMU_CTXP(cp) = NULL; 1717 1718 CPUSET_DEL(mmu_ctxp->mmu_cpuset, cp->cpu_id); 1719 if (--mmu_ctxp->mmu_ncpus == 0) { 1720 mmu_ctxs_tbl[mmu_ctxp->mmu_idx] = NULL; 1721 mutex_exit(&mmu_ctxp->mmu_lock); 1722 sfmmu_ctxdom_free(mmu_ctxp); 1723 return; 1724 } 1725 1726 mutex_exit(&mmu_ctxp->mmu_lock); 1727 } 1728 1729 uint_t 1730 sfmmu_ctxdom_nctxs(int idx) 1731 { 1732 return (mmu_ctxs_tbl[idx]->mmu_nctxs); 1733 } 1734 1735 #ifdef sun4v 1736 /* 1737 * sfmmu_ctxdoms_* is an interface provided to help keep context domains 1738 * consistant after suspend/resume on system that can resume on a different 1739 * hardware than it was suspended. 1740 * 1741 * sfmmu_ctxdom_lock(void) locks all context domains and prevents new contexts 1742 * from being allocated. It acquires all hat_locks, which blocks most access to 1743 * context data, except for a few cases that are handled separately or are 1744 * harmless. It wraps each domain to increment gnum and invalidate on-CPU 1745 * contexts, and forces cnum to its max. As a result of this call all user 1746 * threads that are running on CPUs trap and try to perform wrap around but 1747 * can't because hat_locks are taken. Threads that were not on CPUs but started 1748 * by scheduler go to sfmmu_alloc_ctx() to aquire context without checking 1749 * hat_lock, but fail, because cnum == nctxs, and therefore also trap and block 1750 * on hat_lock trying to wrap. sfmmu_ctxdom_lock() must be called before CPUs 1751 * are paused, else it could deadlock acquiring locks held by paused CPUs. 1752 * 1753 * sfmmu_ctxdoms_remove() removes context domains from every CPUs and records 1754 * the CPUs that had them. It must be called after CPUs have been paused. This 1755 * ensures that no threads are in sfmmu_alloc_ctx() accessing domain data, 1756 * because pause_cpus sends a mondo interrupt to every CPU, and sfmmu_alloc_ctx 1757 * runs with interrupts disabled. 
When CPUs are later resumed, they may enter 1758 * sfmmu_alloc_ctx, but it will check for CPU_MMU_CTXP = NULL and immediately 1759 * return failure. Or, they will be blocked trying to acquire hat_lock. Thus 1760 * after sfmmu_ctxdoms_remove returns, we are guaranteed that no one is 1761 * accessing the old context domains. 1762 * 1763 * sfmmu_ctxdoms_update(void) frees space used by old context domains and 1764 * allocates new context domains based on hardware layout. It initializes 1765 * every CPU that had context domain before migration to have one again. 1766 * sfmmu_ctxdoms_update must be called after CPUs are resumed, else it 1767 * could deadlock acquiring locks held by paused CPUs. 1768 * 1769 * sfmmu_ctxdoms_unlock(void) releases all hat_locks after which user threads 1770 * acquire new context ids and continue execution. 1771 * 1772 * Therefore functions should be called in the following order: 1773 * suspend_routine() 1774 * sfmmu_ctxdom_lock() 1775 * pause_cpus() 1776 * suspend() 1777 * if (suspend failed) 1778 * sfmmu_ctxdom_unlock() 1779 * ... 1780 * sfmmu_ctxdom_remove() 1781 * resume_cpus() 1782 * sfmmu_ctxdom_update() 1783 * sfmmu_ctxdom_unlock() 1784 */ 1785 static cpuset_t sfmmu_ctxdoms_pset; 1786 1787 void 1788 sfmmu_ctxdoms_remove() 1789 { 1790 processorid_t id; 1791 cpu_t *cp; 1792 1793 /* 1794 * Record the CPUs that have domains in sfmmu_ctxdoms_pset, so they can 1795 * be restored post-migration. A CPU may be powered off and not have a 1796 * domain, for example. 
1797 */ 1798 CPUSET_ZERO(sfmmu_ctxdoms_pset); 1799 1800 for (id = 0; id < NCPU; id++) { 1801 if ((cp = cpu[id]) != NULL && CPU_MMU_CTXP(cp) != NULL) { 1802 CPUSET_ADD(sfmmu_ctxdoms_pset, id); 1803 CPU_MMU_CTXP(cp) = NULL; 1804 } 1805 } 1806 } 1807 1808 void 1809 sfmmu_ctxdoms_lock(void) 1810 { 1811 int idx; 1812 mmu_ctx_t *mmu_ctxp; 1813 1814 sfmmu_hat_lock_all(); 1815 1816 /* 1817 * At this point, no thread can be in sfmmu_ctx_wrap_around, because 1818 * hat_lock is always taken before calling it. 1819 * 1820 * For each domain, set mmu_cnum to max so no more contexts can be 1821 * allocated, and wrap to flush on-CPU contexts and force threads to 1822 * acquire a new context when we later drop hat_lock after migration. 1823 * Setting mmu_cnum may race with sfmmu_alloc_ctx which also sets cnum, 1824 * but the latter uses CAS and will miscompare and not overwrite it. 1825 */ 1826 kpreempt_disable(); /* required by sfmmu_ctx_wrap_around */ 1827 for (idx = 0; idx < max_mmu_ctxdoms; idx++) { 1828 if ((mmu_ctxp = mmu_ctxs_tbl[idx]) != NULL) { 1829 mutex_enter(&mmu_ctxp->mmu_lock); 1830 mmu_ctxp->mmu_cnum = mmu_ctxp->mmu_nctxs; 1831 /* make sure updated cnum visible */ 1832 membar_enter(); 1833 mutex_exit(&mmu_ctxp->mmu_lock); 1834 sfmmu_ctx_wrap_around(mmu_ctxp, B_FALSE); 1835 } 1836 } 1837 kpreempt_enable(); 1838 } 1839 1840 void 1841 sfmmu_ctxdoms_unlock(void) 1842 { 1843 sfmmu_hat_unlock_all(); 1844 } 1845 1846 void 1847 sfmmu_ctxdoms_update(void) 1848 { 1849 processorid_t id; 1850 cpu_t *cp; 1851 uint_t idx; 1852 mmu_ctx_t *mmu_ctxp; 1853 1854 /* 1855 * Free all context domains. As side effect, this increases 1856 * mmu_saved_gnum to the maximum gnum over all domains, which is used to 1857 * init gnum in the new domains, which therefore will be larger than the 1858 * sfmmu gnum for any process, guaranteeing that every process will see 1859 * a new generation and allocate a new context regardless of what new 1860 * domain it runs in. 
1861 */ 1862 mutex_enter(&cpu_lock); 1863 1864 for (idx = 0; idx < max_mmu_ctxdoms; idx++) { 1865 if (mmu_ctxs_tbl[idx] != NULL) { 1866 mmu_ctxp = mmu_ctxs_tbl[idx]; 1867 mmu_ctxs_tbl[idx] = NULL; 1868 sfmmu_ctxdom_free(mmu_ctxp); 1869 } 1870 } 1871 1872 for (id = 0; id < NCPU; id++) { 1873 if (CPU_IN_SET(sfmmu_ctxdoms_pset, id) && 1874 (cp = cpu[id]) != NULL) 1875 sfmmu_cpu_init(cp); 1876 } 1877 mutex_exit(&cpu_lock); 1878 } 1879 #endif 1880 1881 /* 1882 * Hat_setup, makes an address space context the current active one. 1883 * In sfmmu this translates to setting the secondary context with the 1884 * corresponding context. 1885 */ 1886 void 1887 hat_setup(struct hat *sfmmup, int allocflag) 1888 { 1889 hatlock_t *hatlockp; 1890 1891 /* Init needs some special treatment. */ 1892 if (allocflag == HAT_INIT) { 1893 /* 1894 * Make sure that we have 1895 * 1. a TSB 1896 * 2. a valid ctx that doesn't get stolen after this point. 1897 */ 1898 hatlockp = sfmmu_hat_enter(sfmmup); 1899 1900 /* 1901 * Swap in the TSB. hat_init() allocates tsbinfos without 1902 * TSBs, but we need one for init, since the kernel does some 1903 * special things to set up its stack and needs the TSB to 1904 * resolve page faults. 1905 */ 1906 sfmmu_tsb_swapin(sfmmup, hatlockp); 1907 1908 sfmmu_get_ctx(sfmmup); 1909 1910 sfmmu_hat_exit(hatlockp); 1911 } else { 1912 ASSERT(allocflag == HAT_ALLOC); 1913 1914 hatlockp = sfmmu_hat_enter(sfmmup); 1915 kpreempt_disable(); 1916 1917 CPUSET_ADD(sfmmup->sfmmu_cpusran, CPU->cpu_id); 1918 /* 1919 * sfmmu_setctx_sec takes <pgsz|cnum> as a parameter, 1920 * pagesize bits don't matter in this case since we are passing 1921 * INVALID_CONTEXT to it. 1922 * Compatibility Note: hw takes care of MMU_SCONTEXT1 1923 */ 1924 sfmmu_setctx_sec(INVALID_CONTEXT); 1925 sfmmu_clear_utsbinfo(); 1926 1927 kpreempt_enable(); 1928 sfmmu_hat_exit(hatlockp); 1929 } 1930 } 1931 1932 /* 1933 * Free all the translation resources for the specified address space. 
1934 * Called from as_free when an address space is being destroyed. 1935 */ 1936 void 1937 hat_free_start(struct hat *sfmmup) 1938 { 1939 ASSERT(AS_WRITE_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 1940 ASSERT(sfmmup != ksfmmup); 1941 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 1942 1943 sfmmup->sfmmu_free = 1; 1944 if (sfmmup->sfmmu_scdp != NULL) { 1945 sfmmu_leave_scd(sfmmup, 0); 1946 } 1947 1948 ASSERT(sfmmup->sfmmu_scdp == NULL); 1949 } 1950 1951 void 1952 hat_free_end(struct hat *sfmmup) 1953 { 1954 int i; 1955 1956 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 1957 ASSERT(sfmmup->sfmmu_free == 1); 1958 ASSERT(sfmmup->sfmmu_ttecnt[TTE8K] == 0); 1959 ASSERT(sfmmup->sfmmu_ttecnt[TTE64K] == 0); 1960 ASSERT(sfmmup->sfmmu_ttecnt[TTE512K] == 0); 1961 ASSERT(sfmmup->sfmmu_ttecnt[TTE4M] == 0); 1962 ASSERT(sfmmup->sfmmu_ttecnt[TTE32M] == 0); 1963 ASSERT(sfmmup->sfmmu_ttecnt[TTE256M] == 0); 1964 1965 if (sfmmup->sfmmu_rmstat) { 1966 hat_freestat(sfmmup->sfmmu_as, NULL); 1967 } 1968 1969 while (sfmmup->sfmmu_tsb != NULL) { 1970 struct tsb_info *next = sfmmup->sfmmu_tsb->tsb_next; 1971 sfmmu_tsbinfo_free(sfmmup->sfmmu_tsb); 1972 sfmmup->sfmmu_tsb = next; 1973 } 1974 1975 if (sfmmup->sfmmu_srdp != NULL) { 1976 sfmmu_leave_srd(sfmmup); 1977 ASSERT(sfmmup->sfmmu_srdp == NULL); 1978 for (i = 0; i < SFMMU_L1_HMERLINKS; i++) { 1979 if (sfmmup->sfmmu_hmeregion_links[i] != NULL) { 1980 kmem_free(sfmmup->sfmmu_hmeregion_links[i], 1981 SFMMU_L2_HMERLINKS_SIZE); 1982 sfmmup->sfmmu_hmeregion_links[i] = NULL; 1983 } 1984 } 1985 } 1986 sfmmu_free_sfmmu(sfmmup); 1987 1988 #ifdef DEBUG 1989 for (i = 0; i < SFMMU_L1_HMERLINKS; i++) { 1990 ASSERT(sfmmup->sfmmu_hmeregion_links[i] == NULL); 1991 } 1992 #endif 1993 1994 kmem_cache_free(sfmmuid_cache, sfmmup); 1995 } 1996 1997 /* 1998 * Set up any translation structures, for the specified address space, 1999 * that are needed or preferred when the process is being swapped in. 
2000 */ 2001 /* ARGSUSED */ 2002 void 2003 hat_swapin(struct hat *hat) 2004 { 2005 ASSERT(hat->sfmmu_xhat_provider == NULL); 2006 } 2007 2008 /* 2009 * Free all of the translation resources, for the specified address space, 2010 * that can be freed while the process is swapped out. Called from as_swapout. 2011 * Also, free up the ctx that this process was using. 2012 */ 2013 void 2014 hat_swapout(struct hat *sfmmup) 2015 { 2016 struct hmehash_bucket *hmebp; 2017 struct hme_blk *hmeblkp; 2018 struct hme_blk *pr_hblk = NULL; 2019 struct hme_blk *nx_hblk; 2020 int i; 2021 struct hme_blk *list = NULL; 2022 hatlock_t *hatlockp; 2023 struct tsb_info *tsbinfop; 2024 struct free_tsb { 2025 struct free_tsb *next; 2026 struct tsb_info *tsbinfop; 2027 }; /* free list of TSBs */ 2028 struct free_tsb *freelist, *last, *next; 2029 2030 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 2031 SFMMU_STAT(sf_swapout); 2032 2033 /* 2034 * There is no way to go from an as to all its translations in sfmmu. 2035 * Here is one of the times when we take the big hit and traverse 2036 * the hash looking for hme_blks to free up. Not only do we free up 2037 * this as hme_blks but all those that are free. We are obviously 2038 * swapping because we need memory so let's free up as much 2039 * as we can. 2040 * 2041 * Note that we don't flush TLB/TSB here -- it's not necessary 2042 * because: 2043 * 1) we free the ctx we're using and throw away the TSB(s); 2044 * 2) processes aren't runnable while being swapped out. 
2045 */ 2046 ASSERT(sfmmup != KHATID); 2047 for (i = 0; i <= UHMEHASH_SZ; i++) { 2048 hmebp = &uhme_hash[i]; 2049 SFMMU_HASH_LOCK(hmebp); 2050 hmeblkp = hmebp->hmeblkp; 2051 pr_hblk = NULL; 2052 while (hmeblkp) { 2053 2054 ASSERT(!hmeblkp->hblk_xhat_bit); 2055 2056 if ((hmeblkp->hblk_tag.htag_id == sfmmup) && 2057 !hmeblkp->hblk_shw_bit && !hmeblkp->hblk_lckcnt) { 2058 ASSERT(!hmeblkp->hblk_shared); 2059 (void) sfmmu_hblk_unload(sfmmup, hmeblkp, 2060 (caddr_t)get_hblk_base(hmeblkp), 2061 get_hblk_endaddr(hmeblkp), 2062 NULL, HAT_UNLOAD); 2063 } 2064 nx_hblk = hmeblkp->hblk_next; 2065 if (!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) { 2066 ASSERT(!hmeblkp->hblk_lckcnt); 2067 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, 2068 &list, 0); 2069 } else { 2070 pr_hblk = hmeblkp; 2071 } 2072 hmeblkp = nx_hblk; 2073 } 2074 SFMMU_HASH_UNLOCK(hmebp); 2075 } 2076 2077 sfmmu_hblks_list_purge(&list, 0); 2078 2079 /* 2080 * Now free up the ctx so that others can reuse it. 2081 */ 2082 hatlockp = sfmmu_hat_enter(sfmmup); 2083 2084 sfmmu_invalidate_ctx(sfmmup); 2085 2086 /* 2087 * Free TSBs, but not tsbinfos, and set SWAPPED flag. 2088 * If TSBs were never swapped in, just return. 2089 * This implies that we don't support partial swapping 2090 * of TSBs -- either all are swapped out, or none are. 2091 * 2092 * We must hold the HAT lock here to prevent racing with another 2093 * thread trying to unmap TTEs from the TSB or running the post- 2094 * relocator after relocating the TSB's memory. Unfortunately, we 2095 * can't free memory while holding the HAT lock or we could 2096 * deadlock, so we build a list of TSBs to be freed after marking 2097 * the tsbinfos as swapped out and free them after dropping the 2098 * lock. 
2099 */ 2100 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) { 2101 sfmmu_hat_exit(hatlockp); 2102 return; 2103 } 2104 2105 SFMMU_FLAGS_SET(sfmmup, HAT_SWAPPED); 2106 last = freelist = NULL; 2107 for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL; 2108 tsbinfop = tsbinfop->tsb_next) { 2109 ASSERT((tsbinfop->tsb_flags & TSB_SWAPPED) == 0); 2110 2111 /* 2112 * Cast the TSB into a struct free_tsb and put it on the free 2113 * list. 2114 */ 2115 if (freelist == NULL) { 2116 last = freelist = (struct free_tsb *)tsbinfop->tsb_va; 2117 } else { 2118 last->next = (struct free_tsb *)tsbinfop->tsb_va; 2119 last = last->next; 2120 } 2121 last->next = NULL; 2122 last->tsbinfop = tsbinfop; 2123 tsbinfop->tsb_flags |= TSB_SWAPPED; 2124 /* 2125 * Zero out the TTE to clear the valid bit. 2126 * Note we can't use a value like 0xbad because we want to 2127 * ensure diagnostic bits are NEVER set on TTEs that might 2128 * be loaded. The intent is to catch any invalid access 2129 * to the swapped TSB, such as a thread running with a valid 2130 * context without first calling sfmmu_tsb_swapin() to 2131 * allocate TSB memory. 2132 */ 2133 tsbinfop->tsb_tte.ll = 0; 2134 } 2135 2136 /* Now we can drop the lock and free the TSB memory. 
*/ 2137 sfmmu_hat_exit(hatlockp); 2138 for (; freelist != NULL; freelist = next) { 2139 next = freelist->next; 2140 sfmmu_tsb_free(freelist->tsbinfop); 2141 } 2142 } 2143 2144 /* 2145 * Duplicate the translations of an as into another newas 2146 */ 2147 /* ARGSUSED */ 2148 int 2149 hat_dup(struct hat *hat, struct hat *newhat, caddr_t addr, size_t len, 2150 uint_t flag) 2151 { 2152 sf_srd_t *srdp; 2153 sf_scd_t *scdp; 2154 int i; 2155 extern uint_t get_color_start(struct as *); 2156 2157 ASSERT(hat->sfmmu_xhat_provider == NULL); 2158 ASSERT((flag == 0) || (flag == HAT_DUP_ALL) || (flag == HAT_DUP_COW) || 2159 (flag == HAT_DUP_SRD)); 2160 ASSERT(hat != ksfmmup); 2161 ASSERT(newhat != ksfmmup); 2162 ASSERT(flag != HAT_DUP_ALL || hat->sfmmu_srdp == newhat->sfmmu_srdp); 2163 2164 if (flag == HAT_DUP_COW) { 2165 panic("hat_dup: HAT_DUP_COW not supported"); 2166 } 2167 2168 if (flag == HAT_DUP_SRD && ((srdp = hat->sfmmu_srdp) != NULL)) { 2169 ASSERT(srdp->srd_evp != NULL); 2170 VN_HOLD(srdp->srd_evp); 2171 ASSERT(srdp->srd_refcnt > 0); 2172 newhat->sfmmu_srdp = srdp; 2173 atomic_add_32((volatile uint_t *)&srdp->srd_refcnt, 1); 2174 } 2175 2176 /* 2177 * HAT_DUP_ALL flag is used after as duplication is done. 
2178 */ 2179 if (flag == HAT_DUP_ALL && ((srdp = newhat->sfmmu_srdp) != NULL)) { 2180 ASSERT(newhat->sfmmu_srdp->srd_refcnt >= 2); 2181 newhat->sfmmu_rtteflags = hat->sfmmu_rtteflags; 2182 if (hat->sfmmu_flags & HAT_4MTEXT_FLAG) { 2183 newhat->sfmmu_flags |= HAT_4MTEXT_FLAG; 2184 } 2185 2186 /* check if need to join scd */ 2187 if ((scdp = hat->sfmmu_scdp) != NULL && 2188 newhat->sfmmu_scdp != scdp) { 2189 int ret; 2190 SF_RGNMAP_IS_SUBSET(&newhat->sfmmu_region_map, 2191 &scdp->scd_region_map, ret); 2192 ASSERT(ret); 2193 sfmmu_join_scd(scdp, newhat); 2194 ASSERT(newhat->sfmmu_scdp == scdp && 2195 scdp->scd_refcnt >= 2); 2196 for (i = 0; i < max_mmu_page_sizes; i++) { 2197 newhat->sfmmu_ismttecnt[i] = 2198 hat->sfmmu_ismttecnt[i]; 2199 newhat->sfmmu_scdismttecnt[i] = 2200 hat->sfmmu_scdismttecnt[i]; 2201 } 2202 } 2203 2204 sfmmu_check_page_sizes(newhat, 1); 2205 } 2206 2207 if (flag == HAT_DUP_ALL && consistent_coloring == 0 && 2208 update_proc_pgcolorbase_after_fork != 0) { 2209 hat->sfmmu_clrbin = get_color_start(hat->sfmmu_as); 2210 } 2211 return (0); 2212 } 2213 2214 void 2215 hat_memload(struct hat *hat, caddr_t addr, struct page *pp, 2216 uint_t attr, uint_t flags) 2217 { 2218 hat_do_memload(hat, addr, pp, attr, flags, 2219 SFMMU_INVALID_SHMERID); 2220 } 2221 2222 void 2223 hat_memload_region(struct hat *hat, caddr_t addr, struct page *pp, 2224 uint_t attr, uint_t flags, hat_region_cookie_t rcookie) 2225 { 2226 uint_t rid; 2227 if (rcookie == HAT_INVALID_REGION_COOKIE || 2228 hat->sfmmu_xhat_provider != NULL) { 2229 hat_do_memload(hat, addr, pp, attr, flags, 2230 SFMMU_INVALID_SHMERID); 2231 return; 2232 } 2233 rid = (uint_t)((uint64_t)rcookie); 2234 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 2235 hat_do_memload(hat, addr, pp, attr, flags, rid); 2236 } 2237 2238 /* 2239 * Set up addr to map to page pp with protection prot. 2240 * As an optimization we also load the TSB with the 2241 * corresponding tte but it is no big deal if the tte gets kicked out. 
2242 */ 2243 static void 2244 hat_do_memload(struct hat *hat, caddr_t addr, struct page *pp, 2245 uint_t attr, uint_t flags, uint_t rid) 2246 { 2247 tte_t tte; 2248 2249 2250 ASSERT(hat != NULL); 2251 ASSERT(PAGE_LOCKED(pp)); 2252 ASSERT(!((uintptr_t)addr & MMU_PAGEOFFSET)); 2253 ASSERT(!(flags & ~SFMMU_LOAD_ALLFLAG)); 2254 ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR)); 2255 SFMMU_VALIDATE_HMERID(hat, rid, addr, MMU_PAGESIZE); 2256 2257 if (PP_ISFREE(pp)) { 2258 panic("hat_memload: loading a mapping to free page %p", 2259 (void *)pp); 2260 } 2261 2262 if (hat->sfmmu_xhat_provider) { 2263 /* no regions for xhats */ 2264 ASSERT(!SFMMU_IS_SHMERID_VALID(rid)); 2265 XHAT_MEMLOAD(hat, addr, pp, attr, flags); 2266 return; 2267 } 2268 2269 ASSERT((hat == ksfmmup) || 2270 AS_LOCK_HELD(hat->sfmmu_as, &hat->sfmmu_as->a_lock)); 2271 2272 if (flags & ~SFMMU_LOAD_ALLFLAG) 2273 cmn_err(CE_NOTE, "hat_memload: unsupported flags %d", 2274 flags & ~SFMMU_LOAD_ALLFLAG); 2275 2276 if (hat->sfmmu_rmstat) 2277 hat_resvstat(MMU_PAGESIZE, hat->sfmmu_as, addr); 2278 2279 #if defined(SF_ERRATA_57) 2280 if ((hat != ksfmmup) && AS_TYPE_64BIT(hat->sfmmu_as) && 2281 (addr < errata57_limit) && (attr & PROT_EXEC) && 2282 !(flags & HAT_LOAD_SHARE)) { 2283 cmn_err(CE_WARN, "hat_memload: illegal attempt to make user " 2284 " page executable"); 2285 attr &= ~PROT_EXEC; 2286 } 2287 #endif 2288 2289 sfmmu_memtte(&tte, pp->p_pagenum, attr, TTE8K); 2290 (void) sfmmu_tteload_array(hat, &tte, addr, &pp, flags, rid); 2291 2292 /* 2293 * Check TSB and TLB page sizes. 2294 */ 2295 if ((flags & HAT_LOAD_SHARE) == 0) { 2296 sfmmu_check_page_sizes(hat, 1); 2297 } 2298 } 2299 2300 /* 2301 * hat_devload can be called to map real memory (e.g. 2302 * /dev/kmem) and even though hat_devload will determine pf is 2303 * for memory, it will be unable to get a shared lock on the 2304 * page (because someone else has it exclusively) and will 2305 * pass dp = NULL. 
If tteload doesn't get a non-NULL 2306 * page pointer it can't cache memory. 2307 */ 2308 void 2309 hat_devload(struct hat *hat, caddr_t addr, size_t len, pfn_t pfn, 2310 uint_t attr, int flags) 2311 { 2312 tte_t tte; 2313 struct page *pp = NULL; 2314 int use_lgpg = 0; 2315 2316 ASSERT(hat != NULL); 2317 2318 if (hat->sfmmu_xhat_provider) { 2319 XHAT_DEVLOAD(hat, addr, len, pfn, attr, flags); 2320 return; 2321 } 2322 2323 ASSERT(!(flags & ~SFMMU_LOAD_ALLFLAG)); 2324 ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR)); 2325 ASSERT((hat == ksfmmup) || 2326 AS_LOCK_HELD(hat->sfmmu_as, &hat->sfmmu_as->a_lock)); 2327 if (len == 0) 2328 panic("hat_devload: zero len"); 2329 if (flags & ~SFMMU_LOAD_ALLFLAG) 2330 cmn_err(CE_NOTE, "hat_devload: unsupported flags %d", 2331 flags & ~SFMMU_LOAD_ALLFLAG); 2332 2333 #if defined(SF_ERRATA_57) 2334 if ((hat != ksfmmup) && AS_TYPE_64BIT(hat->sfmmu_as) && 2335 (addr < errata57_limit) && (attr & PROT_EXEC) && 2336 !(flags & HAT_LOAD_SHARE)) { 2337 cmn_err(CE_WARN, "hat_devload: illegal attempt to make user " 2338 " page executable"); 2339 attr &= ~PROT_EXEC; 2340 } 2341 #endif 2342 2343 /* 2344 * If it's a memory page find its pp 2345 */ 2346 if (!(flags & HAT_LOAD_NOCONSIST) && pf_is_memory(pfn)) { 2347 pp = page_numtopp_nolock(pfn); 2348 if (pp == NULL) { 2349 flags |= HAT_LOAD_NOCONSIST; 2350 } else { 2351 if (PP_ISFREE(pp)) { 2352 panic("hat_memload: loading " 2353 "a mapping to free page %p", 2354 (void *)pp); 2355 } 2356 if (!PAGE_LOCKED(pp) && !PP_ISNORELOC(pp)) { 2357 panic("hat_memload: loading a mapping " 2358 "to unlocked relocatable page %p", 2359 (void *)pp); 2360 } 2361 ASSERT(len == MMU_PAGESIZE); 2362 } 2363 } 2364 2365 if (hat->sfmmu_rmstat) 2366 hat_resvstat(len, hat->sfmmu_as, addr); 2367 2368 if (flags & HAT_LOAD_NOCONSIST) { 2369 attr |= SFMMU_UNCACHEVTTE; 2370 use_lgpg = 1; 2371 } 2372 if (!pf_is_memory(pfn)) { 2373 attr |= SFMMU_UNCACHEPTTE | HAT_NOSYNC; 2374 use_lgpg = 1; 2375 switch (attr & HAT_ORDER_MASK) { 2376 case 
HAT_STRICTORDER: 2377 case HAT_UNORDERED_OK: 2378 /* 2379 * we set the side effect bit for all non 2380 * memory mappings unless merging is ok 2381 */ 2382 attr |= SFMMU_SIDEFFECT; 2383 break; 2384 case HAT_MERGING_OK: 2385 case HAT_LOADCACHING_OK: 2386 case HAT_STORECACHING_OK: 2387 break; 2388 default: 2389 panic("hat_devload: bad attr"); 2390 break; 2391 } 2392 } 2393 while (len) { 2394 if (!use_lgpg) { 2395 sfmmu_memtte(&tte, pfn, attr, TTE8K); 2396 (void) sfmmu_tteload_array(hat, &tte, addr, &pp, 2397 flags, SFMMU_INVALID_SHMERID); 2398 len -= MMU_PAGESIZE; 2399 addr += MMU_PAGESIZE; 2400 pfn++; 2401 continue; 2402 } 2403 /* 2404 * try to use large pages, check va/pa alignments 2405 * Note that 32M/256M page sizes are not (yet) supported. 2406 */ 2407 if ((len >= MMU_PAGESIZE4M) && 2408 !((uintptr_t)addr & MMU_PAGEOFFSET4M) && 2409 !(disable_large_pages & (1 << TTE4M)) && 2410 !(mmu_ptob(pfn) & MMU_PAGEOFFSET4M)) { 2411 sfmmu_memtte(&tte, pfn, attr, TTE4M); 2412 (void) sfmmu_tteload_array(hat, &tte, addr, &pp, 2413 flags, SFMMU_INVALID_SHMERID); 2414 len -= MMU_PAGESIZE4M; 2415 addr += MMU_PAGESIZE4M; 2416 pfn += MMU_PAGESIZE4M / MMU_PAGESIZE; 2417 } else if ((len >= MMU_PAGESIZE512K) && 2418 !((uintptr_t)addr & MMU_PAGEOFFSET512K) && 2419 !(disable_large_pages & (1 << TTE512K)) && 2420 !(mmu_ptob(pfn) & MMU_PAGEOFFSET512K)) { 2421 sfmmu_memtte(&tte, pfn, attr, TTE512K); 2422 (void) sfmmu_tteload_array(hat, &tte, addr, &pp, 2423 flags, SFMMU_INVALID_SHMERID); 2424 len -= MMU_PAGESIZE512K; 2425 addr += MMU_PAGESIZE512K; 2426 pfn += MMU_PAGESIZE512K / MMU_PAGESIZE; 2427 } else if ((len >= MMU_PAGESIZE64K) && 2428 !((uintptr_t)addr & MMU_PAGEOFFSET64K) && 2429 !(disable_large_pages & (1 << TTE64K)) && 2430 !(mmu_ptob(pfn) & MMU_PAGEOFFSET64K)) { 2431 sfmmu_memtte(&tte, pfn, attr, TTE64K); 2432 (void) sfmmu_tteload_array(hat, &tte, addr, &pp, 2433 flags, SFMMU_INVALID_SHMERID); 2434 len -= MMU_PAGESIZE64K; 2435 addr += MMU_PAGESIZE64K; 2436 pfn += MMU_PAGESIZE64K 
/ MMU_PAGESIZE; 2437 } else { 2438 sfmmu_memtte(&tte, pfn, attr, TTE8K); 2439 (void) sfmmu_tteload_array(hat, &tte, addr, &pp, 2440 flags, SFMMU_INVALID_SHMERID); 2441 len -= MMU_PAGESIZE; 2442 addr += MMU_PAGESIZE; 2443 pfn++; 2444 } 2445 } 2446 2447 /* 2448 * Check TSB and TLB page sizes. 2449 */ 2450 if ((flags & HAT_LOAD_SHARE) == 0) { 2451 sfmmu_check_page_sizes(hat, 1); 2452 } 2453 } 2454 2455 void 2456 hat_memload_array(struct hat *hat, caddr_t addr, size_t len, 2457 struct page **pps, uint_t attr, uint_t flags) 2458 { 2459 hat_do_memload_array(hat, addr, len, pps, attr, flags, 2460 SFMMU_INVALID_SHMERID); 2461 } 2462 2463 void 2464 hat_memload_array_region(struct hat *hat, caddr_t addr, size_t len, 2465 struct page **pps, uint_t attr, uint_t flags, 2466 hat_region_cookie_t rcookie) 2467 { 2468 uint_t rid; 2469 if (rcookie == HAT_INVALID_REGION_COOKIE || 2470 hat->sfmmu_xhat_provider != NULL) { 2471 hat_do_memload_array(hat, addr, len, pps, attr, flags, 2472 SFMMU_INVALID_SHMERID); 2473 return; 2474 } 2475 rid = (uint_t)((uint64_t)rcookie); 2476 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 2477 hat_do_memload_array(hat, addr, len, pps, attr, flags, rid); 2478 } 2479 2480 /* 2481 * Map the largest extend possible out of the page array. The array may NOT 2482 * be in order. The largest possible mapping a page can have 2483 * is specified in the p_szc field. The p_szc field 2484 * cannot change as long as there any mappings (large or small) 2485 * to any of the pages that make up the large page. (ie. any 2486 * promotion/demotion of page size is not up to the hat but up to 2487 * the page free list manager). The array 2488 * should consist of properly aligned contigous pages that are 2489 * part of a big page for a large mapping to be created. 
2490 */ 2491 static void 2492 hat_do_memload_array(struct hat *hat, caddr_t addr, size_t len, 2493 struct page **pps, uint_t attr, uint_t flags, uint_t rid) 2494 { 2495 int ttesz; 2496 size_t mapsz; 2497 pgcnt_t numpg, npgs; 2498 tte_t tte; 2499 page_t *pp; 2500 uint_t large_pages_disable; 2501 2502 ASSERT(!((uintptr_t)addr & MMU_PAGEOFFSET)); 2503 SFMMU_VALIDATE_HMERID(hat, rid, addr, len); 2504 2505 if (hat->sfmmu_xhat_provider) { 2506 ASSERT(!SFMMU_IS_SHMERID_VALID(rid)); 2507 XHAT_MEMLOAD_ARRAY(hat, addr, len, pps, attr, flags); 2508 return; 2509 } 2510 2511 if (hat->sfmmu_rmstat) 2512 hat_resvstat(len, hat->sfmmu_as, addr); 2513 2514 #if defined(SF_ERRATA_57) 2515 if ((hat != ksfmmup) && AS_TYPE_64BIT(hat->sfmmu_as) && 2516 (addr < errata57_limit) && (attr & PROT_EXEC) && 2517 !(flags & HAT_LOAD_SHARE)) { 2518 cmn_err(CE_WARN, "hat_memload_array: illegal attempt to make " 2519 "user page executable"); 2520 attr &= ~PROT_EXEC; 2521 } 2522 #endif 2523 2524 /* Get number of pages */ 2525 npgs = len >> MMU_PAGESHIFT; 2526 2527 if (flags & HAT_LOAD_SHARE) { 2528 large_pages_disable = disable_ism_large_pages; 2529 } else { 2530 large_pages_disable = disable_large_pages; 2531 } 2532 2533 if (npgs < NHMENTS || large_pages_disable == LARGE_PAGES_OFF) { 2534 sfmmu_memload_batchsmall(hat, addr, pps, attr, flags, npgs, 2535 rid); 2536 return; 2537 } 2538 2539 while (npgs >= NHMENTS) { 2540 pp = *pps; 2541 for (ttesz = pp->p_szc; ttesz != TTE8K; ttesz--) { 2542 /* 2543 * Check if this page size is disabled. 2544 */ 2545 if (large_pages_disable & (1 << ttesz)) 2546 continue; 2547 2548 numpg = TTEPAGES(ttesz); 2549 mapsz = numpg << MMU_PAGESHIFT; 2550 if ((npgs >= numpg) && 2551 IS_P2ALIGNED(addr, mapsz) && 2552 IS_P2ALIGNED(pp->p_pagenum, numpg)) { 2553 /* 2554 * At this point we have enough pages and 2555 * we know the virtual address and the pfn 2556 * are properly aligned. 
We still need 2557 * to check for physical contiguity but since 2558 * it is very likely that this is the case 2559 * we will assume they are so and undo 2560 * the request if necessary. It would 2561 * be great if we could get a hint flag 2562 * like HAT_CONTIG which would tell us 2563 * the pages are contigous for sure. 2564 */ 2565 sfmmu_memtte(&tte, (*pps)->p_pagenum, 2566 attr, ttesz); 2567 if (!sfmmu_tteload_array(hat, &tte, addr, 2568 pps, flags, rid)) { 2569 break; 2570 } 2571 } 2572 } 2573 if (ttesz == TTE8K) { 2574 /* 2575 * We were not able to map array using a large page 2576 * batch a hmeblk or fraction at a time. 2577 */ 2578 numpg = ((uintptr_t)addr >> MMU_PAGESHIFT) 2579 & (NHMENTS-1); 2580 numpg = NHMENTS - numpg; 2581 ASSERT(numpg <= npgs); 2582 mapsz = numpg * MMU_PAGESIZE; 2583 sfmmu_memload_batchsmall(hat, addr, pps, attr, flags, 2584 numpg, rid); 2585 } 2586 addr += mapsz; 2587 npgs -= numpg; 2588 pps += numpg; 2589 } 2590 2591 if (npgs) { 2592 sfmmu_memload_batchsmall(hat, addr, pps, attr, flags, npgs, 2593 rid); 2594 } 2595 2596 /* 2597 * Check TSB and TLB page sizes. 2598 */ 2599 if ((flags & HAT_LOAD_SHARE) == 0) { 2600 sfmmu_check_page_sizes(hat, 1); 2601 } 2602 } 2603 2604 /* 2605 * Function tries to batch 8K pages into the same hme blk. 2606 */ 2607 static void 2608 sfmmu_memload_batchsmall(struct hat *hat, caddr_t vaddr, page_t **pps, 2609 uint_t attr, uint_t flags, pgcnt_t npgs, uint_t rid) 2610 { 2611 tte_t tte; 2612 page_t *pp; 2613 struct hmehash_bucket *hmebp; 2614 struct hme_blk *hmeblkp; 2615 int index; 2616 2617 while (npgs) { 2618 /* 2619 * Acquire the hash bucket. 2620 */ 2621 hmebp = sfmmu_tteload_acquire_hashbucket(hat, vaddr, TTE8K, 2622 rid); 2623 ASSERT(hmebp); 2624 2625 /* 2626 * Find the hment block. 2627 */ 2628 hmeblkp = sfmmu_tteload_find_hmeblk(hat, hmebp, vaddr, 2629 TTE8K, flags, rid); 2630 ASSERT(hmeblkp); 2631 2632 do { 2633 /* 2634 * Make the tte. 
2635 */ 2636 pp = *pps; 2637 sfmmu_memtte(&tte, pp->p_pagenum, attr, TTE8K); 2638 2639 /* 2640 * Add the translation. 2641 */ 2642 (void) sfmmu_tteload_addentry(hat, hmeblkp, &tte, 2643 vaddr, pps, flags, rid); 2644 2645 /* 2646 * Goto next page. 2647 */ 2648 pps++; 2649 npgs--; 2650 2651 /* 2652 * Goto next address. 2653 */ 2654 vaddr += MMU_PAGESIZE; 2655 2656 /* 2657 * Don't crossover into a different hmentblk. 2658 */ 2659 index = (int)(((uintptr_t)vaddr >> MMU_PAGESHIFT) & 2660 (NHMENTS-1)); 2661 2662 } while (index != 0 && npgs != 0); 2663 2664 /* 2665 * Release the hash bucket. 2666 */ 2667 2668 sfmmu_tteload_release_hashbucket(hmebp); 2669 } 2670 } 2671 2672 /* 2673 * Construct a tte for a page: 2674 * 2675 * tte_valid = 1 2676 * tte_size2 = size & TTE_SZ2_BITS (Panther and Olympus-C only) 2677 * tte_size = size 2678 * tte_nfo = attr & HAT_NOFAULT 2679 * tte_ie = attr & HAT_STRUCTURE_LE 2680 * tte_hmenum = hmenum 2681 * tte_pahi = pp->p_pagenum >> TTE_PASHIFT; 2682 * tte_palo = pp->p_pagenum & TTE_PALOMASK; 2683 * tte_ref = 1 (optimization) 2684 * tte_wr_perm = attr & PROT_WRITE; 2685 * tte_no_sync = attr & HAT_NOSYNC 2686 * tte_lock = attr & SFMMU_LOCKTTE 2687 * tte_cp = !(attr & SFMMU_UNCACHEPTTE) 2688 * tte_cv = !(attr & SFMMU_UNCACHEVTTE) 2689 * tte_e = attr & SFMMU_SIDEFFECT 2690 * tte_priv = !(attr & PROT_USER) 2691 * tte_hwwr = if nosync is set and it is writable we set the mod bit (opt) 2692 * tte_glb = 0 2693 */ 2694 void 2695 sfmmu_memtte(tte_t *ttep, pfn_t pfn, uint_t attr, int tte_sz) 2696 { 2697 ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR)); 2698 2699 ttep->tte_inthi = MAKE_TTE_INTHI(pfn, attr, tte_sz, 0 /* hmenum */); 2700 ttep->tte_intlo = MAKE_TTE_INTLO(pfn, attr, tte_sz, 0 /* hmenum */); 2701 2702 if (TTE_IS_NOSYNC(ttep)) { 2703 TTE_SET_REF(ttep); 2704 if (TTE_IS_WRITABLE(ttep)) { 2705 TTE_SET_MOD(ttep); 2706 } 2707 } 2708 if (TTE_IS_NFO(ttep) && TTE_IS_EXECUTABLE(ttep)) { 2709 panic("sfmmu_memtte: can't set both NFO and EXEC bits"); 2710 } 2711 } 
2712 2713 /* 2714 * This function will add a translation to the hme_blk and allocate the 2715 * hme_blk if one does not exist. 2716 * If a page structure is specified then it will add the 2717 * corresponding hment to the mapping list. 2718 * It will also update the hmenum field for the tte. 2719 * 2720 * Currently this function is only used for kernel mappings. 2721 * So pass invalid region to sfmmu_tteload_array(). 2722 */ 2723 void 2724 sfmmu_tteload(struct hat *sfmmup, tte_t *ttep, caddr_t vaddr, page_t *pp, 2725 uint_t flags) 2726 { 2727 ASSERT(sfmmup == ksfmmup); 2728 (void) sfmmu_tteload_array(sfmmup, ttep, vaddr, &pp, flags, 2729 SFMMU_INVALID_SHMERID); 2730 } 2731 2732 /* 2733 * Load (ttep != NULL) or unload (ttep == NULL) one entry in the TSB. 2734 * Assumes that a particular page size may only be resident in one TSB. 2735 */ 2736 static void 2737 sfmmu_mod_tsb(sfmmu_t *sfmmup, caddr_t vaddr, tte_t *ttep, int ttesz) 2738 { 2739 struct tsb_info *tsbinfop = NULL; 2740 uint64_t tag; 2741 struct tsbe *tsbe_addr; 2742 uint64_t tsb_base; 2743 uint_t tsb_size; 2744 int vpshift = MMU_PAGESHIFT; 2745 int phys = 0; 2746 2747 if (sfmmup == ksfmmup) { /* No support for 32/256M ksfmmu pages */ 2748 phys = ktsb_phys; 2749 if (ttesz >= TTE4M) { 2750 #ifndef sun4v 2751 ASSERT((ttesz != TTE32M) && (ttesz != TTE256M)); 2752 #endif 2753 tsb_base = (phys)? ktsb4m_pbase : (uint64_t)ktsb4m_base; 2754 tsb_size = ktsb4m_szcode; 2755 } else { 2756 tsb_base = (phys)? ktsb_pbase : (uint64_t)ktsb_base; 2757 tsb_size = ktsb_szcode; 2758 } 2759 } else { 2760 SFMMU_GET_TSBINFO(tsbinfop, sfmmup, ttesz); 2761 2762 /* 2763 * If there isn't a TSB for this page size, or the TSB is 2764 * swapped out, there is nothing to do. Note that the latter 2765 * case seems impossible but can occur if hat_pageunload() 2766 * is called on an ISM mapping while the process is swapped 2767 * out. 
2768 */ 2769 if (tsbinfop == NULL || (tsbinfop->tsb_flags & TSB_SWAPPED)) 2770 return; 2771 2772 /* 2773 * If another thread is in the middle of relocating a TSB 2774 * we can't unload the entry so set a flag so that the 2775 * TSB will be flushed before it can be accessed by the 2776 * process. 2777 */ 2778 if ((tsbinfop->tsb_flags & TSB_RELOC_FLAG) != 0) { 2779 if (ttep == NULL) 2780 tsbinfop->tsb_flags |= TSB_FLUSH_NEEDED; 2781 return; 2782 } 2783 #if defined(UTSB_PHYS) 2784 phys = 1; 2785 tsb_base = (uint64_t)tsbinfop->tsb_pa; 2786 #else 2787 tsb_base = (uint64_t)tsbinfop->tsb_va; 2788 #endif 2789 tsb_size = tsbinfop->tsb_szc; 2790 } 2791 if (ttesz >= TTE4M) 2792 vpshift = MMU_PAGESHIFT4M; 2793 2794 tsbe_addr = sfmmu_get_tsbe(tsb_base, vaddr, vpshift, tsb_size); 2795 tag = sfmmu_make_tsbtag(vaddr); 2796 2797 if (ttep == NULL) { 2798 sfmmu_unload_tsbe(tsbe_addr, tag, phys); 2799 } else { 2800 if (ttesz >= TTE4M) { 2801 SFMMU_STAT(sf_tsb_load4m); 2802 } else { 2803 SFMMU_STAT(sf_tsb_load8k); 2804 } 2805 2806 sfmmu_load_tsbe(tsbe_addr, tag, ttep, phys); 2807 } 2808 } 2809 2810 /* 2811 * Unmap all entries from [start, end) matching the given page size. 2812 * 2813 * This function is used primarily to unmap replicated 64K or 512K entries 2814 * from the TSB that are inserted using the base page size TSB pointer, but 2815 * it may also be called to unmap a range of addresses from the TSB. 2816 */ 2817 void 2818 sfmmu_unload_tsb_range(sfmmu_t *sfmmup, caddr_t start, caddr_t end, int ttesz) 2819 { 2820 struct tsb_info *tsbinfop; 2821 uint64_t tag; 2822 struct tsbe *tsbe_addr; 2823 caddr_t vaddr; 2824 uint64_t tsb_base; 2825 int vpshift, vpgsz; 2826 uint_t tsb_size; 2827 int phys = 0; 2828 2829 /* 2830 * Assumptions: 2831 * If ttesz == 8K, 64K or 512K, we walk through the range 8K 2832 * at a time shooting down any valid entries we encounter. 2833 * 2834 * If ttesz >= 4M we walk the range 4M at a time shooting 2835 * down any valid mappings we find. 
2836 */ 2837 if (sfmmup == ksfmmup) { 2838 phys = ktsb_phys; 2839 if (ttesz >= TTE4M) { 2840 #ifndef sun4v 2841 ASSERT((ttesz != TTE32M) && (ttesz != TTE256M)); 2842 #endif 2843 tsb_base = (phys)? ktsb4m_pbase : (uint64_t)ktsb4m_base; 2844 tsb_size = ktsb4m_szcode; 2845 } else { 2846 tsb_base = (phys)? ktsb_pbase : (uint64_t)ktsb_base; 2847 tsb_size = ktsb_szcode; 2848 } 2849 } else { 2850 SFMMU_GET_TSBINFO(tsbinfop, sfmmup, ttesz); 2851 2852 /* 2853 * If there isn't a TSB for this page size, or the TSB is 2854 * swapped out, there is nothing to do. Note that the latter 2855 * case seems impossible but can occur if hat_pageunload() 2856 * is called on an ISM mapping while the process is swapped 2857 * out. 2858 */ 2859 if (tsbinfop == NULL || (tsbinfop->tsb_flags & TSB_SWAPPED)) 2860 return; 2861 2862 /* 2863 * If another thread is in the middle of relocating a TSB 2864 * we can't unload the entry so set a flag so that the 2865 * TSB will be flushed before it can be accessed by the 2866 * process. 2867 */ 2868 if ((tsbinfop->tsb_flags & TSB_RELOC_FLAG) != 0) { 2869 tsbinfop->tsb_flags |= TSB_FLUSH_NEEDED; 2870 return; 2871 } 2872 #if defined(UTSB_PHYS) 2873 phys = 1; 2874 tsb_base = (uint64_t)tsbinfop->tsb_pa; 2875 #else 2876 tsb_base = (uint64_t)tsbinfop->tsb_va; 2877 #endif 2878 tsb_size = tsbinfop->tsb_szc; 2879 } 2880 if (ttesz >= TTE4M) { 2881 vpshift = MMU_PAGESHIFT4M; 2882 vpgsz = MMU_PAGESIZE4M; 2883 } else { 2884 vpshift = MMU_PAGESHIFT; 2885 vpgsz = MMU_PAGESIZE; 2886 } 2887 2888 for (vaddr = start; vaddr < end; vaddr += vpgsz) { 2889 tag = sfmmu_make_tsbtag(vaddr); 2890 tsbe_addr = sfmmu_get_tsbe(tsb_base, vaddr, vpshift, tsb_size); 2891 sfmmu_unload_tsbe(tsbe_addr, tag, phys); 2892 } 2893 } 2894 2895 /* 2896 * Select the optimum TSB size given the number of mappings 2897 * that need to be cached. 
2898 */ 2899 static int 2900 sfmmu_select_tsb_szc(pgcnt_t pgcnt) 2901 { 2902 int szc = 0; 2903 2904 #ifdef DEBUG 2905 if (tsb_grow_stress) { 2906 uint32_t randval = (uint32_t)gettick() >> 4; 2907 return (randval % (tsb_max_growsize + 1)); 2908 } 2909 #endif /* DEBUG */ 2910 2911 while ((szc < tsb_max_growsize) && (pgcnt > SFMMU_RSS_TSBSIZE(szc))) 2912 szc++; 2913 return (szc); 2914 } 2915 2916 /* 2917 * This function will add a translation to the hme_blk and allocate the 2918 * hme_blk if one does not exist. 2919 * If a page structure is specified then it will add the 2920 * corresponding hment to the mapping list. 2921 * It will also update the hmenum field for the tte. 2922 * Furthermore, it attempts to create a large page translation 2923 * for <addr,hat> at page array pps. It assumes addr and first 2924 * pp is correctly aligned. It returns 0 if successful and 1 otherwise. 2925 */ 2926 static int 2927 sfmmu_tteload_array(sfmmu_t *sfmmup, tte_t *ttep, caddr_t vaddr, 2928 page_t **pps, uint_t flags, uint_t rid) 2929 { 2930 struct hmehash_bucket *hmebp; 2931 struct hme_blk *hmeblkp; 2932 int ret; 2933 uint_t size; 2934 2935 /* 2936 * Get mapping size. 2937 */ 2938 size = TTE_CSZ(ttep); 2939 ASSERT(!((uintptr_t)vaddr & TTE_PAGE_OFFSET(size))); 2940 2941 /* 2942 * Acquire the hash bucket. 2943 */ 2944 hmebp = sfmmu_tteload_acquire_hashbucket(sfmmup, vaddr, size, rid); 2945 ASSERT(hmebp); 2946 2947 /* 2948 * Find the hment block. 2949 */ 2950 hmeblkp = sfmmu_tteload_find_hmeblk(sfmmup, hmebp, vaddr, size, flags, 2951 rid); 2952 ASSERT(hmeblkp); 2953 2954 /* 2955 * Add the translation. 2956 */ 2957 ret = sfmmu_tteload_addentry(sfmmup, hmeblkp, ttep, vaddr, pps, flags, 2958 rid); 2959 2960 /* 2961 * Release the hash bucket. 2962 */ 2963 sfmmu_tteload_release_hashbucket(hmebp); 2964 2965 return (ret); 2966 } 2967 2968 /* 2969 * Function locks and returns a pointer to the hash bucket for vaddr and size. 
 */
/*
 * The hash tag id is derived from (sfmmup, rid) with sfmmutohtagid() and
 * the bucket is selected by hashing (tag id, vaddr) at the shift for size.
 * The bucket is returned with its hash lock held; sfmmu_tteload_array()
 * drops it through sfmmu_tteload_release_hashbucket().
 */
static struct hmehash_bucket *
sfmmu_tteload_acquire_hashbucket(sfmmu_t *sfmmup, caddr_t vaddr, int size,
    uint_t rid)
{
	struct hmehash_bucket *hmebp;
	int hmeshift;
	void *htagid = sfmmutohtagid(sfmmup, rid);

	ASSERT(htagid != NULL);

	hmeshift = HME_HASH_SHIFT(size);

	hmebp = HME_HASH_FUNCTION(htagid, vaddr, hmeshift);

	SFMMU_HASH_LOCK(hmebp);

	return (hmebp);
}

/*
 * Function returns a pointer to an hmeblk in the hash bucket, hmebp. If the
 * hmeblk doesn't exist for the [sfmmup, vaddr & size] signature, a hmeblk is
 * allocated.
 *
 * The bucket's hash lock is dropped and re-taken while waiting for
 * hblk_reserve to be replaced, and sfmmu_shadow_hcleanup() also drops and
 * re-takes it; after either event the search is restarted from
 * ttearray_realloc.  Stale hmeblks removed from the bucket are gathered on
 * a local list and handed to sfmmu_hblks_list_purge() before returning.
 */
static struct hme_blk *
sfmmu_tteload_find_hmeblk(sfmmu_t *sfmmup, struct hmehash_bucket *hmebp,
    caddr_t vaddr, uint_t size, uint_t flags, uint_t rid)
{
	hmeblk_tag hblktag;
	int hmeshift;
	struct hme_blk *hmeblkp, *pr_hblk, *list = NULL;

	SFMMU_VALIDATE_HMERID(sfmmup, rid, vaddr, TTEBYTES(size));

	/* Build the tag that identifies the wanted hmeblk. */
	hblktag.htag_id = sfmmutohtagid(sfmmup, rid);
	ASSERT(hblktag.htag_id != NULL);
	hmeshift = HME_HASH_SHIFT(size);
	hblktag.htag_bspage = HME_HASH_BSPAGE(vaddr, hmeshift);
	hblktag.htag_rehash = HME_HASH_REHASH(size);
	hblktag.htag_rid = rid;

ttearray_realloc:

	HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, &list);

	/*
	 * We block until hblk_reserve_lock is released; it's held by
	 * the thread, temporarily using hblk_reserve, until hblk_reserve is
	 * replaced by a hblk from sfmmu8_cache.
	 */
	if (hmeblkp == (struct hme_blk *)hblk_reserve &&
	    hblk_reserve_thread != curthread) {
		SFMMU_HASH_UNLOCK(hmebp);
		mutex_enter(&hblk_reserve_lock);
		mutex_exit(&hblk_reserve_lock);
		SFMMU_STAT(sf_hblk_reserve_hit);
		SFMMU_HASH_LOCK(hmebp);
		goto ttearray_realloc;
	}

	if (hmeblkp == NULL) {
		hmeblkp = sfmmu_hblk_alloc(sfmmup, vaddr, hmebp, size,
		    hblktag, flags, rid);
		/* hblk_shared must be set exactly when rid is a valid region id */
		ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || hmeblkp->hblk_shared);
		ASSERT(SFMMU_IS_SHMERID_VALID(rid) || !hmeblkp->hblk_shared);
	} else {
		/*
		 * It is possible for 8k and 64k hblks to collide since they
		 * have the same rehash value. This is because we
		 * lazily free hblks and 8K/64K blks could be lingering.
		 * If we find a size mismatch we free the block and try again.
		 */
		if (get_hblk_ttesz(hmeblkp) != size) {
			ASSERT(!hmeblkp->hblk_vcnt);
			ASSERT(!hmeblkp->hblk_hmecnt);
			sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
			    &list, 0);
			goto ttearray_realloc;
		}
		if (hmeblkp->hblk_shw_bit) {
			/*
			 * if the hblk was previously used as a shadow hblk then
			 * we will change it to a normal hblk
			 */
			ASSERT(!hmeblkp->hblk_shared);
			if (hmeblkp->hblk_shw_mask) {
				/*
				 * The cleanup drops and re-takes the hash
				 * lock, so the bucket has to be searched
				 * again.
				 */
				sfmmu_shadow_hcleanup(sfmmup, hmeblkp, hmebp);
				ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));
				goto ttearray_realloc;
			} else {
				hmeblkp->hblk_shw_bit = 0;
			}
		}
		SFMMU_STAT(sf_hblk_hit);
	}

	/*
	 * hat_memload() should never call kmem_cache_free() for kernel hmeblks;
	 * see block comment showing the stacktrace in sfmmu_hblk_alloc();
	 * set the flag parameter to 1 so that sfmmu_hblks_list_purge() will
	 * just add these hmeblks to the per-cpu pending queue.
	 */
	sfmmu_hblks_list_purge(&list, 1);

	ASSERT(get_hblk_ttesz(hmeblkp) == size);
	ASSERT(!hmeblkp->hblk_shw_bit);
	ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || hmeblkp->hblk_shared);
	ASSERT(SFMMU_IS_SHMERID_VALID(rid) || !hmeblkp->hblk_shared);
	ASSERT(hmeblkp->hblk_tag.htag_rid == rid);

	return (hmeblkp);
}

/*
 * Function adds a tte entry into the hmeblk. It returns 0 if successful and 1
 * otherwise.
 *
 *	sfmmup	hat the translation is loaded for
 *	hmeblkp	hme_blk that covers vaddr
 *	ttep	tte to install; it is modified here (ref/mod bits for
 *		HAT_LOAD_SHARE, the vcacheable bit under VAC, tte_hmenum)
 *	vaddr	virtual address, aligned to the tte's page size
 *	pps	page array; *pps may be NULL, in which case no mapping-list
 *		or page state handling is done
 *	flags	HAT_LOAD_* flags; SFMMU_NO_TSBLOAD may be added locally
 *	rid	shared region id, or SFMMU_INVALID_SHMERID
 *
 * The only failure return is when sfmmu_pagearray_setup() rejects the page
 * array of a large mapping; the mapping list lock is released before
 * returning 1.
 */
static int
sfmmu_tteload_addentry(sfmmu_t *sfmmup, struct hme_blk *hmeblkp, tte_t *ttep,
    caddr_t vaddr, page_t **pps, uint_t flags, uint_t rid)
{
	page_t *pp = *pps;
	int hmenum, size, remap;
	tte_t tteold, flush_tte;
#ifdef DEBUG
	tte_t orig_old;
#endif /* DEBUG */
	struct sf_hment *sfhme;
	kmutex_t *pml, *pmtx;	/* pml is only set and used when pp != NULL */
	hatlock_t *hatlockp;
	int myflt;

	/*
	 * remove this panic when we decide to let user virtual address
	 * space be >= USERLIMIT.
	 */
	if (!TTE_IS_PRIVILEGED(ttep) && vaddr >= (caddr_t)USERLIMIT)
		panic("user addr %p in kernel space", (void *)vaddr);
#if defined(TTE_IS_GLOBAL)
	if (TTE_IS_GLOBAL(ttep))
		panic("sfmmu_tteload: creating global tte");
#endif

#ifdef DEBUG
	if (pf_is_memory(sfmmu_ttetopfn(ttep, vaddr)) &&
	    !TTE_IS_PCACHEABLE(ttep) && !sfmmu_allow_nc_trans)
		panic("sfmmu_tteload: non cacheable memory tte");
#endif /* DEBUG */

	/* don't simulate dirty bit for writeable ISM/DISM mappings */
	if ((flags & HAT_LOAD_SHARE) && TTE_IS_WRITABLE(ttep)) {
		TTE_SET_REF(ttep);
		TTE_SET_MOD(ttep);
	}

	/* The test below is on the tte's ref and mod bits. */
	if ((flags & HAT_LOAD_SHARE) || !TTE_IS_REF(ttep) ||
	    !TTE_IS_MOD(ttep)) {
		/*
		 * Don't load TSB for dummy as in ISM. Also don't preload
		 * the TSB if the TTE isn't writable since we're likely to
		 * fault on it again -- preloading can be fairly expensive.
		 */
		flags |= SFMMU_NO_TSBLOAD;
	}

	/* Per-page-size load statistics. */
	size = TTE_CSZ(ttep);
	switch (size) {
	case TTE8K:
		SFMMU_STAT(sf_tteload8k);
		break;
	case TTE64K:
		SFMMU_STAT(sf_tteload64k);
		break;
	case TTE512K:
		SFMMU_STAT(sf_tteload512k);
		break;
	case TTE4M:
		SFMMU_STAT(sf_tteload4m);
		break;
	case (TTE32M):
		SFMMU_STAT(sf_tteload32m);
		ASSERT(mmu_page_sizes == max_mmu_page_sizes);
		break;
	case (TTE256M):
		SFMMU_STAT(sf_tteload256m);
		ASSERT(mmu_page_sizes == max_mmu_page_sizes);
		break;
	}

	ASSERT(!((uintptr_t)vaddr & TTE_PAGE_OFFSET(size)));
	SFMMU_VALIDATE_HMERID(sfmmup, rid, vaddr, TTEBYTES(size));
	ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || hmeblkp->hblk_shared);
	ASSERT(SFMMU_IS_SHMERID_VALID(rid) || !hmeblkp->hblk_shared);

	HBLKTOHME_IDX(sfhme, hmeblkp, vaddr, hmenum);

	/*
	 * Need to grab mlist lock here so that pageunload
	 * will not change tte behind us.
	 */
	if (pp) {
		pml = sfmmu_mlist_enter(pp);
	}

	sfmmu_copytte(&sfhme->hme_tte, &tteold);
	/*
	 * Look for corresponding hment and if valid verify
	 * pfns are equal.
	 */
	remap = TTE_IS_VALID(&tteold);
	if (remap) {
		pfn_t	new_pfn, old_pfn;

		old_pfn = TTE_TO_PFN(vaddr, &tteold);
		new_pfn = TTE_TO_PFN(vaddr, ttep);

		if (flags & HAT_LOAD_REMAP) {
			/* make sure we are remapping same type of pages */
			if (pf_is_memory(old_pfn) != pf_is_memory(new_pfn)) {
				panic("sfmmu_tteload - tte remap io<->memory");
			}
			if (old_pfn != new_pfn &&
			    (pp != NULL || sfhme->hme_page != NULL)) {
				panic("sfmmu_tteload - tte remap pp != NULL");
			}
		} else if (old_pfn != new_pfn) {
			panic("sfmmu_tteload - tte remap, hmeblkp 0x%p",
			    (void *)hmeblkp);
		}
		ASSERT(TTE_CSZ(&tteold) == TTE_CSZ(ttep));
	}

	if (pp) {
		if (size == TTE8K) {
#ifdef VAC
			/*
			 * Handle VAC consistency
			 */
			if (!remap && (cache & CACHE_VAC) && !PP_ISNC(pp)) {
				sfmmu_vac_conflict(sfmmup, vaddr, pp);
			}
#endif

			/*
			 * Keep the page's RO state in step with the new
			 * mapping: clear it for a writable tte, set it for
			 * a read-only first mapping of an unmodified page.
			 */
			if (TTE_IS_WRITABLE(ttep) && PP_ISRO(pp)) {
				pmtx = sfmmu_page_enter(pp);
				PP_CLRRO(pp);
				sfmmu_page_exit(pmtx);
			} else if (!PP_ISMAPPED(pp) &&
			    (!TTE_IS_WRITABLE(ttep)) && !(PP_ISMOD(pp))) {
				pmtx = sfmmu_page_enter(pp);
				/* re-check the mod bit under the page lock */
				if (!(PP_ISMOD(pp))) {
					PP_SETRO(pp);
				}
				sfmmu_page_exit(pmtx);
			}

		} else if (sfmmu_pagearray_setup(vaddr, pps, ttep, remap)) {
			/*
			 * sfmmu_pagearray_setup failed so return
			 */
			sfmmu_mlist_exit(pml);
			return (1);
		}
	}

	/*
	 * Make sure hment is not on a mapping list.
	 */
	ASSERT(remap || (sfhme->hme_page == NULL));

	/* if it is not a remap then hme->next better be NULL */
	ASSERT((!remap) ? sfhme->hme_next == NULL : 1);

	if (flags & HAT_LOAD_LOCK) {
		if ((hmeblkp->hblk_lckcnt + 1) >= MAX_HBLK_LCKCNT) {
			panic("too high lckcnt-hmeblk %p",
			    (void *)hmeblkp);
		}
		atomic_add_32(&hmeblkp->hblk_lckcnt, 1);

		HBLK_STACK_TRACE(hmeblkp, HBLK_LOCK);
	}

#ifdef VAC
	if (pp && PP_ISNC(pp)) {
		/*
		 * If the physical page is marked to be uncacheable, like
		 * by a vac conflict, make sure the new mapping is also
		 * uncacheable.
		 */
		TTE_CLR_VCACHEABLE(ttep);
		ASSERT(PP_GET_VCOLOR(pp) == NO_VCOLOR);
	}
#endif
	ttep->tte_hmenum = hmenum;

#ifdef DEBUG
	orig_old = tteold;
#endif /* DEBUG */

	/*
	 * Install the new tte, retrying until sfmmu_modifytte_try() stops
	 * returning a negative value.
	 */
	while (sfmmu_modifytte_try(&tteold, ttep, &sfhme->hme_tte) < 0) {
		if ((sfmmup == KHATID) &&
		    (flags & (HAT_LOAD_LOCK | HAT_LOAD_REMAP))) {
			sfmmu_copytte(&sfhme->hme_tte, &tteold);
		}
#ifdef DEBUG
		chk_tte(&orig_old, &tteold, ttep, hmeblkp);
#endif /* DEBUG */
	}
	ASSERT(TTE_IS_VALID(&sfhme->hme_tte));

	/* A previously invalid tte means a new valid mapping: count it. */
	if (!TTE_IS_VALID(&tteold)) {

		atomic_add_16(&hmeblkp->hblk_vcnt, 1);
		if (rid == SFMMU_INVALID_SHMERID) {
			atomic_add_long(&sfmmup->sfmmu_ttecnt[size], 1);
		} else {
			sf_srd_t *srdp = sfmmup->sfmmu_srdp;
			sf_region_t *rgnp = srdp->srd_hmergnp[rid];
			/*
			 * We already accounted for region ttecnt's in sfmmu
			 * during hat_join_region() processing. Here we
			 * only update ttecnt's in region structure.
			 */
			atomic_add_long(&rgnp->rgn_ttecnt[size], 1);
		}
	}

	/* myflt: the hat being loaded belongs to the current process */
	myflt = (astosfmmu(curthread->t_procp->p_as) == sfmmup);
	if (size > TTE8K && (flags & HAT_LOAD_SHARE) == 0 &&
	    sfmmup != ksfmmup) {
		uchar_t tteflag = 1 << size;
		/*
		 * Record the page size in the hat's private or region
		 * tteflags; the hat lock is only taken when the bit is
		 * not already set.
		 */
		if (rid == SFMMU_INVALID_SHMERID) {
			if (!(sfmmup->sfmmu_tteflags & tteflag)) {
				hatlockp = sfmmu_hat_enter(sfmmup);
				sfmmup->sfmmu_tteflags |= tteflag;
				sfmmu_hat_exit(hatlockp);
			}
		} else if (!(sfmmup->sfmmu_rtteflags & tteflag)) {
			hatlockp = sfmmu_hat_enter(sfmmup);
			sfmmup->sfmmu_rtteflags |= tteflag;
			sfmmu_hat_exit(hatlockp);
		}
		/*
		 * Update the current CPU tsbmiss area, so the current thread
		 * won't need to take the tsbmiss for the new pagesize.
		 * The other threads in the process will update their tsb
		 * miss area lazily in sfmmu_tsbmiss_exception() when they
		 * fail to find the translation for a newly added pagesize.
		 */
		if (size > TTE64K && myflt) {
			struct tsbmiss *tsbmp;
			kpreempt_disable();
			tsbmp = &tsbmiss_area[CPU->cpu_id];
			if (rid == SFMMU_INVALID_SHMERID) {
				if (!(tsbmp->uhat_tteflags & tteflag)) {
					tsbmp->uhat_tteflags |= tteflag;
				}
			} else {
				if (!(tsbmp->uhat_rtteflags & tteflag)) {
					tsbmp->uhat_rtteflags |= tteflag;
				}
			}
			kpreempt_enable();
		}
	}

	if (size >= TTE4M && (flags & HAT_LOAD_TEXT) &&
	    !SFMMU_FLAGS_ISSET(sfmmup, HAT_4MTEXT_FLAG)) {
		hatlockp = sfmmu_hat_enter(sfmmup);
		SFMMU_FLAGS_SET(sfmmup, HAT_4MTEXT_FLAG);
		sfmmu_hat_exit(hatlockp);
	}

	/* Bits that differ between old and new tte, masked with hw_tte. */
	flush_tte.tte_intlo = (tteold.tte_intlo ^ ttep->tte_intlo) &
	    hw_tte.tte_intlo;
	flush_tte.tte_inthi = (tteold.tte_inthi ^ ttep->tte_inthi) &
	    hw_tte.tte_inthi;

	if (remap && (flush_tte.tte_inthi || flush_tte.tte_intlo)) {
		/*
		 * If remap and new tte differs from old tte we need
		 * to sync the mod bit and flush TLB/TSB.  We don't
		 * need to sync ref bit because we currently always set
		 * ref bit in tteload.
		 */
		ASSERT(TTE_IS_REF(ttep));
		if (TTE_IS_MOD(&tteold)) {
			sfmmu_ttesync(sfmmup, vaddr, &tteold, pp);
		}
		/*
		 * hwtte bits shouldn't change for SRD hmeblks as long as SRD
		 * hmes are only used for read only text. Adding this code for
		 * completeness and future use of shared hmeblks with writable
		 * mappings of VMODSORT vnodes.
		 */
		if (hmeblkp->hblk_shared) {
			cpuset_t cpuset = sfmmu_rgntlb_demap(vaddr,
			    sfmmup->sfmmu_srdp->srd_hmergnp[rid], hmeblkp, 1);
			xt_sync(cpuset);
			SFMMU_STAT_ADD(sf_region_remap_demap, 1);
		} else {
			sfmmu_tlb_demap(vaddr, sfmmup, hmeblkp, 0, 0);
			xt_sync(sfmmup->sfmmu_cpusran);
		}
	}

	if ((flags & SFMMU_NO_TSBLOAD) == 0) {
		/*
		 * We only preload 8K and 4M mappings into the TSB, since
		 * 64K and 512K mappings are replicated and hence don't
		 * have a single, unique TSB entry. Ditto for 32M/256M.
		 */
		if (size == TTE8K || size == TTE4M) {
			sf_scd_t *scdp;
			hatlockp = sfmmu_hat_enter(sfmmup);
			/*
			 * Don't preload private TSB if the mapping is used
			 * by the shctx in the SCD.
			 */
			scdp = sfmmup->sfmmu_scdp;
			if (rid == SFMMU_INVALID_SHMERID || scdp == NULL ||
			    !SF_RGNMAP_TEST(scdp->scd_hmeregion_map, rid)) {
				sfmmu_load_tsb(sfmmup, vaddr, &sfhme->hme_tte,
				    size);
			}
			sfmmu_hat_exit(hatlockp);
		}
	}
	if (pp) {
		/* A new (non-remap) mapping is linked onto the page via HME_ADD. */
		if (!remap) {
			HME_ADD(sfhme, pp);
			atomic_add_16(&hmeblkp->hblk_hmecnt, 1);
			ASSERT(hmeblkp->hblk_hmecnt > 0);

			/*
			 * Cannot ASSERT(hmeblkp->hblk_hmecnt <= NHMENTS)
			 * see pageunload() for comment.
			 */
		}
		sfmmu_mlist_exit(pml);
	}

	return (0);
}
/*
 * Function unlocks hash bucket.
 */
/*
 * The caller must hold the hash lock on hmebp (asserted); it is the
 * counterpart of sfmmu_tteload_acquire_hashbucket().
 */
static void
sfmmu_tteload_release_hashbucket(struct hmehash_bucket *hmebp)
{
	ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));
	SFMMU_HASH_UNLOCK(hmebp);
}

/*
 * function which checks and sets up page array for a large
 * translation.  Will set p_vcolor, p_index, p_ro fields.
 * Assumes addr and pfnum of first page are properly aligned.
 * Will check for physical contiguity. If check fails it returns
 * 1, otherwise 0.
 *
 *	addr	virtual address of the first page of the mapping
 *	pps	array of TTEPAGES(ttesz) page pointers
 *	ttep	tte being loaded; supplies the page size and writability
 *	remap	non-zero when an existing valid tte is being replaced, in
 *		which case only the p_ro handling is done for each page
 */
static int
sfmmu_pagearray_setup(caddr_t addr, page_t **pps, tte_t *ttep, int remap)
{
	int	i, index, ttesz;
	pfn_t	pfnum;
	pgcnt_t	npgs;
	page_t *pp, *pp1;
	kmutex_t *pmtx;
#ifdef VAC
	int osz;
	int cflags = 0;
	int vac_err = 0;
#endif
	int newidx = 0;

	ttesz = TTE_CSZ(ttep);

	ASSERT(ttesz > TTE8K);

	npgs = TTEPAGES(ttesz);
	index = PAGESZ_TO_INDEX(ttesz);

	pfnum = (*pps)->p_pagenum;
	ASSERT(IS_P2ALIGNED(pfnum, npgs));

	/*
	 * Save the first pp so we can do HAT_TMPNC at the end.
	 */
	pp1 = *pps;
#ifdef VAC
	osz = fnd_mapping_sz(pp1);
#endif

	for (i = 0; i < npgs; i++, pps++) {
		pp = *pps;
		ASSERT(PAGE_LOCKED(pp));
		ASSERT(pp->p_szc >= ttesz);
		ASSERT(pp->p_szc == pp1->p_szc);
		ASSERT(sfmmu_mlist_held(pp));

		/*
		 * XXX is it possible to maintain P_RO on the root only?
		 */
		if (TTE_IS_WRITABLE(ttep) && PP_ISRO(pp)) {
			pmtx = sfmmu_page_enter(pp);
			PP_CLRRO(pp);
			sfmmu_page_exit(pmtx);
		} else if (!PP_ISMAPPED(pp) && !TTE_IS_WRITABLE(ttep) &&
		    !PP_ISMOD(pp)) {
			pmtx = sfmmu_page_enter(pp);
			/* re-check the mod bit under the page lock */
			if (!(PP_ISMOD(pp))) {
				PP_SETRO(pp);
			}
			sfmmu_page_exit(pmtx);
		}

		/*
		 * If this is a remap we skip vac & contiguity checks.
		 */
		if (remap)
			continue;

		/*
		 * set p_vcolor and detect any vac conflicts.
		 */
#ifdef VAC
		/* Once a conflict has been seen no further pages are checked. */
		if (vac_err == 0) {
			vac_err = sfmmu_vacconflict_array(addr, pp, &cflags);

		}
#endif

		/*
		 * Save current index in case we need to undo it.
		 * Note: "PAGESZ_TO_INDEX(sz)	(1 << (sz))"
		 *	"SFMMU_INDEX_SHIFT	6"
		 *	 "SFMMU_INDEX_MASK	((1 << SFMMU_INDEX_SHIFT) - 1)"
		 *	 "PP_MAPINDEX(p_index)	(p_index & SFMMU_INDEX_MASK)"
		 *
		 * So:	index = PAGESZ_TO_INDEX(ttesz);
		 *	if ttesz == 1 then index = 0x2
		 *		    2 then index = 0x4
		 *		    3 then index = 0x8
		 *		    4 then index = 0x10
		 *		    5 then index = 0x20
		 * The code below checks if it's a new pagesize (ie, newidx)
		 * in case we need to take it back out of p_index,
		 * and then or's the new index into the existing index.
		 */
		if ((PP_MAPINDEX(pp) & index) == 0)
			newidx = 1;
		pp->p_index = (PP_MAPINDEX(pp) | index);

		/*
		 * contiguity check
		 */
		if (pp->p_pagenum != pfnum) {
			/*
			 * If we fail the contiguity test then
			 * the only thing we need to fix is the p_index field.
			 * We might get a few extra flushes but since this
			 * path is rare that is ok.  The p_ro field will
			 * get automatically fixed on the next tteload to
			 * the page.  NO TNC bit is set yet.
			 */
			/*
			 * Walk back from the failing page to the first one.
			 * newidx is a single flag for the whole array, so
			 * when it is set the index bit is cleared on every
			 * page visited so far.
			 */
			while (i >= 0) {
				pp = *pps;
				if (newidx)
					pp->p_index = (PP_MAPINDEX(pp) &
					    ~index);
				pps--;
				i--;
			}
			return (1);
		}
		pfnum++;
		addr += MMU_PAGESIZE;
	}

#ifdef VAC
	if (vac_err) {
		if (ttesz > osz) {
			/*
			 * There are some smaller mappings that causes vac
			 * conflicts. Convert all existing small mappings to
			 * TNC.
			 */
			SFMMU_STAT_ADD(sf_uncache_conflict, npgs);
			sfmmu_page_cache_array(pp1, HAT_TMPNC, CACHE_FLUSH,
			    npgs);
		} else {
			/* EMPTY */
			/*
			 * If there exists a big page mapping,
			 * that means the whole existing big page
			 * has TNC setting already. No need to convert to
			 * TNC again.
			 */
			ASSERT(PP_ISTNC(pp1));
		}
	}
#endif	/* VAC */

	return (0);
}

#ifdef VAC
/*
 * Routine that detects vac consistency for a large page. It also
 * sets virtual color for all pp's for this big mapping.
 *
 * Returns 0 when pp can be mapped at addr's color (the page's color is
 * set or changed as needed) and HAT_TMPNC when the page is already
 * non-cacheable or conflicts with an existing mapping.  *cflags records
 * which old colors have already been flushed so that each color is
 * flushed at most once per caller-supplied flag word.
 */
static int
sfmmu_vacconflict_array(caddr_t addr, page_t *pp, int *cflags)
{
	int vcolor, ocolor;

	ASSERT(sfmmu_mlist_held(pp));

	if (PP_ISNC(pp)) {
		return (HAT_TMPNC);
	}

	vcolor = addr_to_vcolor(addr);
	if (PP_NEWPAGE(pp)) {
		PP_SET_VCOLOR(pp, vcolor);
		return (0);
	}

	ocolor = PP_GET_VCOLOR(pp);
	if (ocolor == vcolor) {
		return (0);
	}

	if (!PP_ISMAPPED(pp) && !PP_ISMAPPED_KPM(pp)) {
		/*
		 * Previous user of page had a different color
		 * but since there are no current users
		 * we just flush the cache and change the color.
		 * As an optimization for large pages we flush the
		 * entire cache of that color and set a flag.
		 */
		SFMMU_STAT(sf_pgcolor_conflict);
		if (!CacheColor_IsFlushed(*cflags, ocolor)) {
			CacheColor_SetFlushed(*cflags, ocolor);
			sfmmu_cache_flushcolor(ocolor, pp->p_pagenum);
		}
		PP_SET_VCOLOR(pp, vcolor);
		return (0);
	}

	/*
	 * We got a real conflict with a current mapping.
	 * set flags to start uncaching all mappings
	 * and return failure so we restart looping
	 * the pp array from the beginning.
	 */
	return (HAT_TMPNC);
}
#endif	/* VAC */

/*
 * creates a large page shadow hmeblk for a tte.
 * The purpose of this routine is to allow us to do quick unloads because
 * the vm layer can easily pass a very large but sparsely populated range.
 */
/*
 *	sfmmup	user hat (must not be KHATID)
 *	vaddr	address covered by the mapping being loaded
 *	ttesz	page size code of that mapping; the shadow hmeblk is
 *		created at TTE512K for an 8K mapping and at ttesz + 1
 *		otherwise
 *	flags	passed through to sfmmu_hblk_alloc()
 *
 * The shadow hmeblk is found or allocated under its bucket's hash lock,
 * marked with hblk_shw_bit, and the bit for vaddr's slot is set in
 * hblk_shw_mask.  The hash lock is dropped before the hmeblk is returned.
 */
static struct hme_blk *
sfmmu_shadow_hcreate(sfmmu_t *sfmmup, caddr_t vaddr, int ttesz, uint_t flags)
{
	struct hmehash_bucket *hmebp;
	hmeblk_tag hblktag;
	int hmeshift, size, vshift;
	uint_t shw_mask, newshw_mask;
	struct hme_blk *hmeblkp;

	ASSERT(sfmmup != KHATID);
	if (mmu_page_sizes == max_mmu_page_sizes) {
		ASSERT(ttesz < TTE256M);
	} else {
		ASSERT(ttesz < TTE4M);
		ASSERT(sfmmup->sfmmu_ttecnt[TTE32M] == 0);
		ASSERT(sfmmup->sfmmu_ttecnt[TTE256M] == 0);
	}

	/* Pick the size of the shadow hmeblk. */
	if (ttesz == TTE8K) {
		size = TTE512K;
	} else {
		size = ++ttesz;
	}

	hblktag.htag_id = sfmmup;
	hmeshift = HME_HASH_SHIFT(size);
	hblktag.htag_bspage = HME_HASH_BSPAGE(vaddr, hmeshift);
	hblktag.htag_rehash = HME_HASH_REHASH(size);
	hblktag.htag_rid = SFMMU_INVALID_SHMERID;
	hmebp = HME_HASH_FUNCTION(sfmmup, vaddr, hmeshift);

	SFMMU_HASH_LOCK(hmebp);

	HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp);
	ASSERT(hmeblkp != (struct hme_blk *)hblk_reserve);
	if (hmeblkp == NULL) {
		hmeblkp = sfmmu_hblk_alloc(sfmmup, vaddr, hmebp, size,
		    hblktag, flags, SFMMU_INVALID_SHMERID);
	}
	ASSERT(hmeblkp);
	if (!hmeblkp->hblk_shw_mask) {
		/*
		 * if this is a unused hblk it was just allocated or could
		 * potentially be a previous large page hblk so we need to
		 * set the shadow bit.
		 */
		ASSERT(!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt);
		hmeblkp->hblk_shw_bit = 1;
	} else if (hmeblkp->hblk_shw_bit == 0) {
		panic("sfmmu_shadow_hcreate: shw bit not set in hmeblkp 0x%p",
		    (void *)hmeblkp);
	}
	ASSERT(hmeblkp->hblk_shw_bit == 1);
	ASSERT(!hmeblkp->hblk_shared);
	vshift = vaddr_to_vshift(hblktag, vaddr, size);
	ASSERT(vshift < 8);
	/*
	 * Atomically set shw mask bit
	 */
	do {
		shw_mask = hmeblkp->hblk_shw_mask;
		newshw_mask = shw_mask | (1 << vshift);
		/* cas32() returns the old value; retry if it changed. */
		newshw_mask = cas32(&hmeblkp->hblk_shw_mask, shw_mask,
		    newshw_mask);
	} while (newshw_mask != shw_mask);

	SFMMU_HASH_UNLOCK(hmebp);

	return (hmeblkp);
}

/*
 * This routine cleanup a previous shadow hmeblk and changes it to
 * a regular hblk.  This happens rarely but it is possible
 * when a process wants to use large pages and there are hblks still
 * lying around from the previous as that used these hmeblks.
 * The alternative was to cleanup the shadow hblks at unload time
 * but since so few user processes actually use large pages, it is
 * better to be lazy and cleanup at this time.
 */
/*
 * Called and returns with the hash lock on hmebp held.  When the shadow
 * mask is non-zero the lock is dropped around the call to
 * sfmmu_free_hblks(), which frees the next smaller hash size over the
 * range covered by hmeblkp; callers therefore have to search the bucket
 * again afterwards.  When the mask is already zero only hblk_shw_bit is
 * cleared.
 */
static void
sfmmu_shadow_hcleanup(sfmmu_t *sfmmup, struct hme_blk *hmeblkp,
    struct hmehash_bucket *hmebp)
{
	caddr_t addr, endaddr;
	int hashno, size;

	ASSERT(hmeblkp->hblk_shw_bit);
	ASSERT(!hmeblkp->hblk_shared);

	ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));

	if (!hmeblkp->hblk_shw_mask) {
		hmeblkp->hblk_shw_bit = 0;
		return;
	}
	addr = (caddr_t)get_hblk_base(hmeblkp);
	endaddr = get_hblk_endaddr(hmeblkp);
	size = get_hblk_ttesz(hmeblkp);
	hashno = size - 1;
	ASSERT(hashno > 0);
	SFMMU_HASH_UNLOCK(hmebp);

	sfmmu_free_hblks(sfmmup, addr, endaddr, hashno);

	SFMMU_HASH_LOCK(hmebp);
}

/*
 * Walk [addr, endaddr) one hash span (1 << HME_HASH_SHIFT(hashno)) at a
 * time.  In each bucket visited, every hmeblk whose hblk_vcnt and
 * hblk_hmecnt are both zero is removed from the hash and queued on a
 * local list, which is purged at the end.  A tag-matching shadow hmeblk
 * with a non-zero mask is first cleaned up through
 * sfmmu_shadow_hcleanup(), after which the same address is visited again.
 */
static void
sfmmu_free_hblks(sfmmu_t *sfmmup, caddr_t addr, caddr_t endaddr,
    int hashno)
{
	int hmeshift, shadow = 0;
	hmeblk_tag hblktag;
	struct hmehash_bucket *hmebp;
	struct hme_blk *hmeblkp;
	struct hme_blk *nx_hblk, *pr_hblk, *list = NULL;

	ASSERT(hashno > 0);
	hblktag.htag_id = sfmmup;
	hblktag.htag_rehash = hashno;
	hblktag.htag_rid = SFMMU_INVALID_SHMERID;

	hmeshift = HME_HASH_SHIFT(hashno);

	while (addr < endaddr) {
		hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
		hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);
		SFMMU_HASH_LOCK(hmebp);
		/* inline HME_HASH_SEARCH */
		hmeblkp = hmebp->hmeblkp;
		pr_hblk = NULL;
		while (hmeblkp) {
			if (HTAGS_EQ(hmeblkp->hblk_tag, hblktag)) {
				/* found hme_blk */
				ASSERT(!hmeblkp->hblk_shared);
				if (hmeblkp->hblk_shw_bit) {
					if (hmeblkp->hblk_shw_mask) {
						shadow = 1;
						sfmmu_shadow_hcleanup(sfmmup,
						    hmeblkp, hmebp);
						break;
					} else {
						hmeblkp->hblk_shw_bit = 0;
					}
				}

				/*
				 * Hblk_hmecnt and hblk_vcnt could be non zero
				 * since hblk_unload() does not guarantee that.
				 *
				 * XXX - this could cause tteload() to spin
				 * where sfmmu_shadow_hcleanup() is called.
				 */
			}

			/* Unlink any empty hmeblk; otherwise advance pr_hblk. */
			nx_hblk = hmeblkp->hblk_next;
			if (!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) {
				sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
				    &list, 0);
			} else {
				pr_hblk = hmeblkp;
			}
			hmeblkp = nx_hblk;
		}

		SFMMU_HASH_UNLOCK(hmebp);

		if (shadow) {
			/*
			 * We found another shadow hblk so cleaned its
			 * children.  We need to go back and cleanup
			 * the original hblk so we don't change the
			 * addr.
			 */
			shadow = 0;
		} else {
			addr = (caddr_t)roundup((uintptr_t)addr + 1,
			    (1 << hmeshift));
		}
	}
	sfmmu_hblks_list_purge(&list, 0);
}

/*
 * This routine's job is to delete stale invalid shared hmeregions hmeblks that
 * may still linger on after pageunload.
 *
 * The hmeblk tagged (srdp, rid, ttesz) that covers addr is looked up and,
 * if present, removed from the hash.  It is a panic for that hmeblk to
 * still have valid or page-linked entries.
 */
static void
sfmmu_cleanup_rhblk(sf_srd_t *srdp, caddr_t addr, uint_t rid, int ttesz)
{
	int hmeshift;
	hmeblk_tag hblktag;
	struct hmehash_bucket *hmebp;
	struct hme_blk *hmeblkp;
	struct hme_blk *pr_hblk;
	struct hme_blk *list = NULL;

	ASSERT(SFMMU_IS_SHMERID_VALID(rid));
	ASSERT(rid < SFMMU_MAX_HME_REGIONS);

	hmeshift = HME_HASH_SHIFT(ttesz);
	hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
	hblktag.htag_rehash = ttesz;
	hblktag.htag_rid = rid;
	hblktag.htag_id = srdp;
	hmebp = HME_HASH_FUNCTION(srdp, addr, hmeshift);

	SFMMU_HASH_LOCK(hmebp);
	HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, &list);
	if (hmeblkp != NULL) {
		ASSERT(hmeblkp->hblk_shared);
		ASSERT(!hmeblkp->hblk_shw_bit);
		if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) {
			panic("sfmmu_cleanup_rhblk: valid hmeblk");
		}
		ASSERT(!hmeblkp->hblk_lckcnt);
		sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
		    &list, 0);
	}
	SFMMU_HASH_UNLOCK(hmebp);
	sfmmu_hblks_list_purge(&list, 0);
}

/*
 * Region callback that does nothing; sfmmu_unload_hmeregion() uses it
 * when the region has no rgn_cb_function.
 */
/* ARGSUSED */
static void
sfmmu_rgn_cb_noop(caddr_t saddr, caddr_t eaddr, caddr_t r_saddr,
    size_t r_size, void *r_obj, u_offset_t r_objoff)
{
}

/*
 * Searches for an hmeblk which maps addr, then unloads this mapping
 * and updates *eaddrp, if the hmeblk is found.
 *
 * *eaddrp is only written when the hmeblk still had valid or page-linked
 * entries; it then receives the address returned by sfmmu_hblk_unload().
 * A found hmeblk is always removed from the hash afterwards.
 */
static void
sfmmu_unload_hmeregion_va(sf_srd_t *srdp, uint_t rid, caddr_t addr,
    caddr_t eaddr, int ttesz, caddr_t *eaddrp)
{
	int hmeshift;
	hmeblk_tag hblktag;
	struct hmehash_bucket *hmebp;
	struct hme_blk *hmeblkp;
	struct hme_blk *pr_hblk;
	struct hme_blk *list = NULL;

	ASSERT(SFMMU_IS_SHMERID_VALID(rid));
	ASSERT(rid < SFMMU_MAX_HME_REGIONS);
	ASSERT(ttesz >= HBLK_MIN_TTESZ);

	hmeshift = HME_HASH_SHIFT(ttesz);
	hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
	hblktag.htag_rehash = ttesz;
	hblktag.htag_rid = rid;
	hblktag.htag_id = srdp;
	hmebp = HME_HASH_FUNCTION(srdp, addr, hmeshift);

	SFMMU_HASH_LOCK(hmebp);
	HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, &list);
	if (hmeblkp != NULL) {
		ASSERT(hmeblkp->hblk_shared);
		ASSERT(!hmeblkp->hblk_lckcnt);
		if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) {
			*eaddrp = sfmmu_hblk_unload(NULL, hmeblkp, addr,
			    eaddr, NULL, HAT_UNLOAD);
			ASSERT(*eaddrp > addr);
		}
		ASSERT(!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt);
		sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
		    &list, 0);
	}
	SFMMU_HASH_UNLOCK(hmebp);
	sfmmu_hblks_list_purge(&list, 0);
}

/*
 * Unload every hmeblk that belongs to region rgnp of srdp.
 *
 * The region is scanned once per hmeblk size, from the region's page size
 * (but at least HBLK_MIN_TTESZ) down to HBLK_MIN_TTESZ, skipping sizes
 * whose bit is not set in rgn_hmeflags.  The region callback (or the no-op
 * stand-in) is invoked for each maximal run of addresses that was
 * unloaded back to back, i.e. [cbsaddr, cbeaddr).
 */
static void
sfmmu_unload_hmeregion(sf_srd_t *srdp, sf_region_t *rgnp)
{
	int ttesz = rgnp->rgn_pgszc;
	size_t rsz = rgnp->rgn_size;
	caddr_t rsaddr = rgnp->rgn_saddr;
	caddr_t readdr = rsaddr + rsz;
	caddr_t rhsaddr;
	caddr_t va;
	uint_t rid = rgnp->rgn_id;
	caddr_t cbsaddr;
	caddr_t cbeaddr;
	hat_rgn_cb_func_t rcbfunc;
	ulong_t cnt;

	ASSERT(SFMMU_IS_SHMERID_VALID(rid));
	ASSERT(rid < SFMMU_MAX_HME_REGIONS);

	ASSERT(IS_P2ALIGNED(rsaddr, TTEBYTES(ttesz)));
	ASSERT(IS_P2ALIGNED(rsz, TTEBYTES(ttesz)));
	/*
	 * rhsaddr is the hmeblk-aligned base from which the scan addresses
	 * are stepped; it may lie below rsaddr for small page regions.
	 */
	if (ttesz < HBLK_MIN_TTESZ) {
		ttesz = HBLK_MIN_TTESZ;
		rhsaddr = (caddr_t)P2ALIGN((uintptr_t)rsaddr, HBLK_MIN_BYTES);
	} else {
		rhsaddr = rsaddr;
	}

	if ((rcbfunc = rgnp->rgn_cb_function) == NULL) {
		rcbfunc = sfmmu_rgn_cb_noop;
	}

	while (ttesz >= HBLK_MIN_TTESZ) {
		cbsaddr = rsaddr;
		cbeaddr = rsaddr;
		if (!(rgnp->rgn_hmeflags & (1 << ttesz))) {
			ttesz--;
			continue;
		}
		cnt = 0;
		va = rsaddr;
		while (va < readdr) {
			ASSERT(va >= rhsaddr);
			/*
			 * A gap since the last unloaded range: report the
			 * pending range (if any) and start a new one at va.
			 */
			if (va != cbeaddr) {
				if (cbeaddr != cbsaddr) {
					ASSERT(cbeaddr > cbsaddr);
					(*rcbfunc)(cbsaddr, cbeaddr,
					    rsaddr, rsz, rgnp->rgn_obj,
					    rgnp->rgn_objoff);
				}
				cbsaddr = va;
				cbeaddr = va;
			}
			sfmmu_unload_hmeregion_va(srdp, rid, va, readdr,
			    ttesz, &cbeaddr);
			cnt++;
			va = rhsaddr + (cnt << TTE_PAGE_SHIFT(ttesz));
		}
		/* Report the last pending range for this size. */
		if (cbeaddr != cbsaddr) {
			ASSERT(cbeaddr > cbsaddr);
			(*rcbfunc)(cbsaddr, cbeaddr, rsaddr,
			    rsz, rgnp->rgn_obj,
			    rgnp->rgn_objoff);
		}
		ttesz--;
	}
}

/*
 * Release one hardware address translation lock on the given address range.
 */
/*
 *	sfmmup	hat owning the mappings (kernel hat, or a user hat whose
 *		address space lock is held)
 *	addr	start of the range
 *	len	length of the range, a multiple of MMU_PAGESIZE
 *
 * Every address in the range must be covered by an hmeblk; not finding
 * one after all rehashes is a panic.
 */
void
hat_unlock(struct hat *sfmmup, caddr_t addr, size_t len)
{
	struct hmehash_bucket *hmebp;
	hmeblk_tag hblktag;
	int hmeshift, hashno = 1;
	struct hme_blk *hmeblkp, *list = NULL;
	caddr_t endaddr;

	ASSERT(sfmmup != NULL);
	ASSERT(sfmmup->sfmmu_xhat_provider == NULL);

	ASSERT((sfmmup == ksfmmup) ||
	    AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock));
	ASSERT((len & MMU_PAGEOFFSET) == 0);
	endaddr = addr + len;
	hblktag.htag_id = sfmmup;
	hblktag.htag_rid = SFMMU_INVALID_SHMERID;

	/*
	 * Spitfire supports 4 page sizes.
	 * Most pages are expected to be of the smallest page size (8K) and
	 * these will not need to be rehashed. 64K pages also don't need to be
	 * rehashed because an hmeblk spans 64K of address space. 512K pages
	 * might need 1 rehash and 4M pages might need 2 rehashes.
	 */
	while (addr < endaddr) {
		hmeshift = HME_HASH_SHIFT(hashno);
		hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
		hblktag.htag_rehash = hashno;
		hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);

		SFMMU_HASH_LOCK(hmebp);

		HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list);
		if (hmeblkp != NULL) {
			ASSERT(!hmeblkp->hblk_shared);
			/*
			 * If we encounter a shadow hmeblk then
			 * we know there are no valid hmeblks mapping
			 * this address at this size or larger.
			 * Just increment address by the smallest
			 * page size.
			 */
			if (hmeblkp->hblk_shw_bit) {
				addr += MMU_PAGESIZE;
			} else {
				addr = sfmmu_hblk_unlock(hmeblkp, addr,
				    endaddr);
			}
			SFMMU_HASH_UNLOCK(hmebp);
			/* Start again at the smallest hash for the new addr. */
			hashno = 1;
			continue;
		}
		SFMMU_HASH_UNLOCK(hmebp);

		if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) {
			/*
			 * We have traversed the whole list and rehashed
			 * if necessary without finding the address to unlock
			 * which should never happen.
			 */
			panic("sfmmu_unlock: addr not found. "
			    "addr %p hat %p", (void *)addr, (void *)sfmmup);
		} else {
			hashno++;
		}
	}

	sfmmu_hblks_list_purge(&list, 0);
}

/*
 * Region-aware variant of hat_unlock().
 *
 * With HAT_INVALID_REGION_COOKIE this is exactly hat_unlock().  Otherwise
 * rcookie is the region id, and the shared hmeblks tagged with the hat's
 * SRD and that id are unlocked over [addr, addr + len).  For each address
 * the search starts at the largest size that is no bigger than the
 * region's page size and to which the address is aligned, and moves down
 * to HBLK_MIN_TTESZ, skipping sizes not set in rgn_hmeflags.  Not finding
 * an hmeblk at any size is a panic.
 */
void
hat_unlock_region(struct hat *sfmmup, caddr_t addr, size_t len,
    hat_region_cookie_t rcookie)
{
	sf_srd_t *srdp;
	sf_region_t *rgnp;
	int ttesz;
	uint_t rid;
	caddr_t eaddr;
	caddr_t va;
	int hmeshift;
	hmeblk_tag hblktag;
	struct hmehash_bucket *hmebp;
	struct hme_blk *hmeblkp;
	struct hme_blk *pr_hblk;
	struct hme_blk *list;

	if (rcookie == HAT_INVALID_REGION_COOKIE) {
		hat_unlock(sfmmup, addr, len);
		return;
	}

	ASSERT(sfmmup != NULL);
	ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
	ASSERT(sfmmup != ksfmmup);

	srdp = sfmmup->sfmmu_srdp;
	rid = (uint_t)((uint64_t)rcookie);
	ASSERT(rid < SFMMU_MAX_HME_REGIONS);
	eaddr = addr + len;
	va = addr;
	list = NULL;
	rgnp = srdp->srd_hmergnp[rid];
	SFMMU_VALIDATE_HMERID(sfmmup, rid, addr, len);

	ASSERT(IS_P2ALIGNED(addr, TTEBYTES(rgnp->rgn_pgszc)));
	ASSERT(IS_P2ALIGNED(len, TTEBYTES(rgnp->rgn_pgszc)));
	if (rgnp->rgn_pgszc < HBLK_MIN_TTESZ) {
		ttesz = HBLK_MIN_TTESZ;
	} else {
		ttesz = rgnp->rgn_pgszc;
	}
	while (va < eaddr) {
		/* Grow ttesz back up as far as va's alignment allows. */
		while (ttesz < rgnp->rgn_pgszc &&
		    IS_P2ALIGNED(va, TTEBYTES(ttesz + 1))) {
			ttesz++;
		}
		while (ttesz >= HBLK_MIN_TTESZ) {
			if (!(rgnp->rgn_hmeflags & (1 << ttesz))) {
				ttesz--;
				continue;
			}
			hmeshift = HME_HASH_SHIFT(ttesz);
			hblktag.htag_bspage = HME_HASH_BSPAGE(va, hmeshift);
			hblktag.htag_rehash = ttesz;
			hblktag.htag_rid = rid;
			hblktag.htag_id = srdp;
			hmebp = HME_HASH_FUNCTION(srdp, va, hmeshift);
			SFMMU_HASH_LOCK(hmebp);
			HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk,
			    &list);
			if (hmeblkp == NULL) {
				SFMMU_HASH_UNLOCK(hmebp);
				ttesz--;
				continue;
			}
			ASSERT(hmeblkp->hblk_shared);
			va = sfmmu_hblk_unlock(hmeblkp, va, eaddr);
			ASSERT(va >= eaddr ||
			    IS_P2ALIGNED((uintptr_t)va, TTEBYTES(ttesz)));
			SFMMU_HASH_UNLOCK(hmebp);
			break;
		}
		/* The inner loop ran out of sizes without a hit. */
		if (ttesz < HBLK_MIN_TTESZ) {
			panic("hat_unlock_region: addr not found "
			    "addr %p hat %p", (void *)va, (void *)sfmmup);
		}
	}
	sfmmu_hblks_list_purge(&list, 0);
}

/*
 * Function to unlock a range of addresses in an hmeblk.  It returns the
 * next address that needs to be unlocked.
 * Should be called with the hash lock held.
 *
 * The range is clipped to the end of hmeblkp.  For each tte in the range
 * hblk_lckcnt is decremented once.  It is a panic for a tte in the range
 * to be invalid, for hblk_lckcnt to already be zero, or for a tte to
 * extend past the end of the range.
 */
static caddr_t
sfmmu_hblk_unlock(struct hme_blk *hmeblkp, caddr_t addr, caddr_t endaddr)
{
	struct sf_hment *sfhme;
	tte_t tteold, ttemod;
	int ttesz, ret;

	ASSERT(in_hblk_range(hmeblkp, addr));
	ASSERT(hmeblkp->hblk_shw_bit == 0);

	endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp));
	ttesz = get_hblk_ttesz(hmeblkp);

	HBLKTOHME(sfhme, hmeblkp, addr);
	while (addr < endaddr) {
readtte:
		sfmmu_copytte(&sfhme->hme_tte, &tteold);
		if (TTE_IS_VALID(&tteold)) {

			/*
			 * ttemod is an unmodified copy of tteold, so the
			 * tte contents are not changed here; the read is
			 * retried when sfmmu_modifytte_try() returns a
			 * negative value.
			 */
			ttemod = tteold;

			ret = sfmmu_modifytte_try(&tteold, &ttemod,
			    &sfhme->hme_tte);

			if (ret < 0)
				goto readtte;

			if (hmeblkp->hblk_lckcnt == 0)
				panic("zero hblk lckcnt");

			if (((uintptr_t)addr + TTEBYTES(ttesz)) >
			    (uintptr_t)endaddr)
				panic("can't unlock large tte");

			ASSERT(hmeblkp->hblk_lckcnt > 0);
			atomic_add_32(&hmeblkp->hblk_lckcnt, -1);
			HBLK_STACK_TRACE(hmeblkp, HBLK_UNLOCK);
		} else {
			panic("sfmmu_hblk_unlock: invalid tte");
		}
		addr += TTEBYTES(ttesz);
		sfhme++;
	}
	return (addr);
}

/*
 * Physical Address Mapping Framework
 *
 * General rules:
 *
 * (1) Applies only to seg_kmem memory pages.
To make things easier, 4193 * seg_kpm addresses are also accepted by the routines, but nothing 4194 * is done with them since by definition their PA mappings are static. 4195 * (2) hat_add_callback() may only be called while holding the page lock 4196 * SE_SHARED or SE_EXCL of the underlying page (e.g., as_pagelock()), 4197 * or passing HAC_PAGELOCK flag. 4198 * (3) prehandler() and posthandler() may not call hat_add_callback() or 4199 * hat_delete_callback(), nor should they allocate memory. Post quiesce 4200 * callbacks may not sleep or acquire adaptive mutex locks. 4201 * (4) Either prehandler() or posthandler() (but not both) may be specified 4202 * as being NULL. Specifying an errhandler() is optional. 4203 * 4204 * Details of using the framework: 4205 * 4206 * registering a callback (hat_register_callback()) 4207 * 4208 * Pass prehandler, posthandler, errhandler addresses 4209 * as described below. If capture_cpus argument is nonzero, 4210 * suspend callback to the prehandler will occur with CPUs 4211 * captured and executing xc_loop() and CPUs will remain 4212 * captured until after the posthandler suspend callback 4213 * occurs. 4214 * 4215 * adding a callback (hat_add_callback()) 4216 * 4217 * as_pagelock(); 4218 * hat_add_callback(); 4219 * save returned pfn in private data structures or program registers; 4220 * as_pageunlock(); 4221 * 4222 * prehandler() 4223 * 4224 * Stop all accesses by physical address to this memory page. 4225 * Called twice: the first, PRESUSPEND, is a context safe to acquire 4226 * adaptive locks. The second, SUSPEND, is called at high PIL with 4227 * CPUs captured so adaptive locks may NOT be acquired (and all spin 4228 * locks must be XCALL_PIL or higher locks). 4229 * 4230 * May return the following errors: 4231 * EIO: A fatal error has occurred. This will result in panic. 4232 * EAGAIN: The page cannot be suspended. This will fail the 4233 * relocation. 4234 * 0: Success. 
4235 * 4236 * posthandler() 4237 * 4238 * Save new pfn in private data structures or program registers; 4239 * not allowed to fail (non-zero return values will result in panic). 4240 * 4241 * errhandler() 4242 * 4243 * called when an error occurs related to the callback. Currently 4244 * the only such error is HAT_CB_ERR_LEAKED which indicates that 4245 * a page is being freed, but there are still outstanding callback(s) 4246 * registered on the page. 4247 * 4248 * removing a callback (hat_delete_callback(); e.g., prior to freeing memory) 4249 * 4250 * stop using physical address 4251 * hat_delete_callback(); 4252 * 4253 */ 4254 4255 /* 4256 * Register a callback class. Each subsystem should do this once and 4257 * cache the id_t returned for use in setting up and tearing down callbacks. 4258 * 4259 * There is no facility for removing callback IDs once they are created; 4260 * the "key" should be unique for each module, so in case a module is unloaded 4261 * and subsequently re-loaded, we can recycle the module's previous entry. 4262 */ 4263 id_t 4264 hat_register_callback(int key, 4265 int (*prehandler)(caddr_t, uint_t, uint_t, void *), 4266 int (*posthandler)(caddr_t, uint_t, uint_t, void *, pfn_t), 4267 int (*errhandler)(caddr_t, uint_t, uint_t, void *), 4268 int capture_cpus) 4269 { 4270 id_t id; 4271 4272 /* 4273 * Search the table for a pre-existing callback associated with 4274 * the identifier "key". If one exists, we re-use that entry in 4275 * the table for this instance, otherwise we assign the next 4276 * available table slot. 
4277 */ 4278 for (id = 0; id < sfmmu_max_cb_id; id++) { 4279 if (sfmmu_cb_table[id].key == key) 4280 break; 4281 } 4282 4283 if (id == sfmmu_max_cb_id) { 4284 id = sfmmu_cb_nextid++; 4285 if (id >= sfmmu_max_cb_id) 4286 panic("hat_register_callback: out of callback IDs"); 4287 } 4288 4289 ASSERT(prehandler != NULL || posthandler != NULL); 4290 4291 sfmmu_cb_table[id].key = key; 4292 sfmmu_cb_table[id].prehandler = prehandler; 4293 sfmmu_cb_table[id].posthandler = posthandler; 4294 sfmmu_cb_table[id].errhandler = errhandler; 4295 sfmmu_cb_table[id].capture_cpus = capture_cpus; 4296 4297 return (id); 4298 } 4299 4300 #define HAC_COOKIE_NONE (void *)-1 4301 4302 /* 4303 * Add relocation callbacks to the specified addr/len which will be called 4304 * when relocating the associated page. See the description of pre and 4305 * posthandler above for more details. 4306 * 4307 * If HAC_PAGELOCK is included in flags, the underlying memory page is 4308 * locked internally so the caller must be able to deal with the callback 4309 * running even before this function has returned. If HAC_PAGELOCK is not 4310 * set, it is assumed that the underlying memory pages are locked. 4311 * 4312 * Since the caller must track the individual page boundaries anyway, 4313 * we only allow a callback to be added to a single page (large 4314 * or small). Thus [addr, addr + len) MUST be contained within a single 4315 * page. 4316 * 4317 * Registering multiple callbacks on the same [addr, addr+len) is supported, 4318 * _provided_that_ a unique parameter is specified for each callback. 4319 * If multiple callbacks are registered on the same range the callback will 4320 * be invoked with each unique parameter. Registering the same callback with 4321 * the same argument more than once will result in corrupted kernel state. 4322 * 4323 * Returns the pfn of the underlying kernel page in *rpfn 4324 * on success, or PFN_INVALID on failure. 
4325 * 4326 * cookiep (if passed) provides storage space for an opaque cookie 4327 * to return later to hat_delete_callback(). This cookie makes the callback 4328 * deletion significantly quicker by avoiding a potentially lengthy hash 4329 * search. 4330 * 4331 * Returns values: 4332 * 0: success 4333 * ENOMEM: memory allocation failure (e.g. flags was passed as HAC_NOSLEEP) 4334 * EINVAL: callback ID is not valid 4335 * ENXIO: ["vaddr", "vaddr" + len) is not mapped in the kernel's address 4336 * space 4337 * ERANGE: ["vaddr", "vaddr" + len) crosses a page boundary 4338 */ 4339 int 4340 hat_add_callback(id_t callback_id, caddr_t vaddr, uint_t len, uint_t flags, 4341 void *pvt, pfn_t *rpfn, void **cookiep) 4342 { 4343 struct hmehash_bucket *hmebp; 4344 hmeblk_tag hblktag; 4345 struct hme_blk *hmeblkp; 4346 int hmeshift, hashno; 4347 caddr_t saddr, eaddr, baseaddr; 4348 struct pa_hment *pahmep; 4349 struct sf_hment *sfhmep, *osfhmep; 4350 kmutex_t *pml; 4351 tte_t tte; 4352 page_t *pp; 4353 vnode_t *vp; 4354 u_offset_t off; 4355 pfn_t pfn; 4356 int kmflags = (flags & HAC_SLEEP)? KM_SLEEP : KM_NOSLEEP; 4357 int locked = 0; 4358 4359 /* 4360 * For KPM mappings, just return the physical address since we 4361 * don't need to register any callbacks. 
4362 */ 4363 if (IS_KPM_ADDR(vaddr)) { 4364 uint64_t paddr; 4365 SFMMU_KPM_VTOP(vaddr, paddr); 4366 *rpfn = btop(paddr); 4367 if (cookiep != NULL) 4368 *cookiep = HAC_COOKIE_NONE; 4369 return (0); 4370 } 4371 4372 if (callback_id < (id_t)0 || callback_id >= sfmmu_cb_nextid) { 4373 *rpfn = PFN_INVALID; 4374 return (EINVAL); 4375 } 4376 4377 if ((pahmep = kmem_cache_alloc(pa_hment_cache, kmflags)) == NULL) { 4378 *rpfn = PFN_INVALID; 4379 return (ENOMEM); 4380 } 4381 4382 sfhmep = &pahmep->sfment; 4383 4384 saddr = (caddr_t)((uintptr_t)vaddr & MMU_PAGEMASK); 4385 eaddr = saddr + len; 4386 4387 rehash: 4388 /* Find the mapping(s) for this page */ 4389 for (hashno = TTE64K, hmeblkp = NULL; 4390 hmeblkp == NULL && hashno <= mmu_hashcnt; 4391 hashno++) { 4392 hmeshift = HME_HASH_SHIFT(hashno); 4393 hblktag.htag_id = ksfmmup; 4394 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 4395 hblktag.htag_bspage = HME_HASH_BSPAGE(saddr, hmeshift); 4396 hblktag.htag_rehash = hashno; 4397 hmebp = HME_HASH_FUNCTION(ksfmmup, saddr, hmeshift); 4398 4399 SFMMU_HASH_LOCK(hmebp); 4400 4401 HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp); 4402 4403 if (hmeblkp == NULL) 4404 SFMMU_HASH_UNLOCK(hmebp); 4405 } 4406 4407 if (hmeblkp == NULL) { 4408 kmem_cache_free(pa_hment_cache, pahmep); 4409 *rpfn = PFN_INVALID; 4410 return (ENXIO); 4411 } 4412 4413 ASSERT(!hmeblkp->hblk_shared); 4414 4415 HBLKTOHME(osfhmep, hmeblkp, saddr); 4416 sfmmu_copytte(&osfhmep->hme_tte, &tte); 4417 4418 if (!TTE_IS_VALID(&tte)) { 4419 SFMMU_HASH_UNLOCK(hmebp); 4420 kmem_cache_free(pa_hment_cache, pahmep); 4421 *rpfn = PFN_INVALID; 4422 return (ENXIO); 4423 } 4424 4425 /* 4426 * Make sure the boundaries for the callback fall within this 4427 * single mapping. 
4428 */ 4429 baseaddr = (caddr_t)get_hblk_base(hmeblkp); 4430 ASSERT(saddr >= baseaddr); 4431 if (eaddr > saddr + TTEBYTES(TTE_CSZ(&tte))) { 4432 SFMMU_HASH_UNLOCK(hmebp); 4433 kmem_cache_free(pa_hment_cache, pahmep); 4434 *rpfn = PFN_INVALID; 4435 return (ERANGE); 4436 } 4437 4438 pfn = sfmmu_ttetopfn(&tte, vaddr); 4439 4440 /* 4441 * The pfn may not have a page_t underneath in which case we 4442 * just return it. This can happen if we are doing I/O to a 4443 * static portion of the kernel's address space, for instance. 4444 */ 4445 pp = osfhmep->hme_page; 4446 if (pp == NULL) { 4447 SFMMU_HASH_UNLOCK(hmebp); 4448 kmem_cache_free(pa_hment_cache, pahmep); 4449 *rpfn = pfn; 4450 if (cookiep) 4451 *cookiep = HAC_COOKIE_NONE; 4452 return (0); 4453 } 4454 ASSERT(pp == PP_PAGEROOT(pp)); 4455 4456 vp = pp->p_vnode; 4457 off = pp->p_offset; 4458 4459 pml = sfmmu_mlist_enter(pp); 4460 4461 if (flags & HAC_PAGELOCK) { 4462 if (!page_trylock(pp, SE_SHARED)) { 4463 /* 4464 * Somebody is holding SE_EXCL lock. Might 4465 * even be hat_page_relocate(). Drop all 4466 * our locks, lookup the page in &kvp, and 4467 * retry. If it doesn't exist in &kvp and &zvp, 4468 * then we must be dealing with a kernel mapped 4469 * page which doesn't actually belong to 4470 * segkmem so we punt. 
4471 */ 4472 sfmmu_mlist_exit(pml); 4473 SFMMU_HASH_UNLOCK(hmebp); 4474 pp = page_lookup(&kvp, (u_offset_t)saddr, SE_SHARED); 4475 4476 /* check zvp before giving up */ 4477 if (pp == NULL) 4478 pp = page_lookup(&zvp, (u_offset_t)saddr, 4479 SE_SHARED); 4480 4481 /* Okay, we didn't find it, give up */ 4482 if (pp == NULL) { 4483 kmem_cache_free(pa_hment_cache, pahmep); 4484 *rpfn = pfn; 4485 if (cookiep) 4486 *cookiep = HAC_COOKIE_NONE; 4487 return (0); 4488 } 4489 page_unlock(pp); 4490 goto rehash; 4491 } 4492 locked = 1; 4493 } 4494 4495 if (!PAGE_LOCKED(pp) && !panicstr) 4496 panic("hat_add_callback: page 0x%p not locked", (void *)pp); 4497 4498 if (osfhmep->hme_page != pp || pp->p_vnode != vp || 4499 pp->p_offset != off) { 4500 /* 4501 * The page moved before we got our hands on it. Drop 4502 * all the locks and try again. 4503 */ 4504 ASSERT((flags & HAC_PAGELOCK) != 0); 4505 sfmmu_mlist_exit(pml); 4506 SFMMU_HASH_UNLOCK(hmebp); 4507 page_unlock(pp); 4508 locked = 0; 4509 goto rehash; 4510 } 4511 4512 if (!VN_ISKAS(vp)) { 4513 /* 4514 * This is not a segkmem page but another page which 4515 * has been kernel mapped. It had better have at least 4516 * a share lock on it. Return the pfn. 4517 */ 4518 sfmmu_mlist_exit(pml); 4519 SFMMU_HASH_UNLOCK(hmebp); 4520 if (locked) 4521 page_unlock(pp); 4522 kmem_cache_free(pa_hment_cache, pahmep); 4523 ASSERT(PAGE_LOCKED(pp)); 4524 *rpfn = pfn; 4525 if (cookiep) 4526 *cookiep = HAC_COOKIE_NONE; 4527 return (0); 4528 } 4529 4530 /* 4531 * Setup this pa_hment and link its embedded dummy sf_hment into 4532 * the mapping list. 
4533 */ 4534 pp->p_share++; 4535 pahmep->cb_id = callback_id; 4536 pahmep->addr = vaddr; 4537 pahmep->len = len; 4538 pahmep->refcnt = 1; 4539 pahmep->flags = 0; 4540 pahmep->pvt = pvt; 4541 4542 sfhmep->hme_tte.ll = 0; 4543 sfhmep->hme_data = pahmep; 4544 sfhmep->hme_prev = osfhmep; 4545 sfhmep->hme_next = osfhmep->hme_next; 4546 4547 if (osfhmep->hme_next) 4548 osfhmep->hme_next->hme_prev = sfhmep; 4549 4550 osfhmep->hme_next = sfhmep; 4551 4552 sfmmu_mlist_exit(pml); 4553 SFMMU_HASH_UNLOCK(hmebp); 4554 4555 if (locked) 4556 page_unlock(pp); 4557 4558 *rpfn = pfn; 4559 if (cookiep) 4560 *cookiep = (void *)pahmep; 4561 4562 return (0); 4563 } 4564 4565 /* 4566 * Remove the relocation callbacks from the specified addr/len. 4567 */ 4568 void 4569 hat_delete_callback(caddr_t vaddr, uint_t len, void *pvt, uint_t flags, 4570 void *cookie) 4571 { 4572 struct hmehash_bucket *hmebp; 4573 hmeblk_tag hblktag; 4574 struct hme_blk *hmeblkp; 4575 int hmeshift, hashno; 4576 caddr_t saddr; 4577 struct pa_hment *pahmep; 4578 struct sf_hment *sfhmep, *osfhmep; 4579 kmutex_t *pml; 4580 tte_t tte; 4581 page_t *pp; 4582 vnode_t *vp; 4583 u_offset_t off; 4584 int locked = 0; 4585 4586 /* 4587 * If the cookie is HAC_COOKIE_NONE then there is no pa_hment to 4588 * remove so just return. 
4589 */ 4590 if (cookie == HAC_COOKIE_NONE || IS_KPM_ADDR(vaddr)) 4591 return; 4592 4593 saddr = (caddr_t)((uintptr_t)vaddr & MMU_PAGEMASK); 4594 4595 rehash: 4596 /* Find the mapping(s) for this page */ 4597 for (hashno = TTE64K, hmeblkp = NULL; 4598 hmeblkp == NULL && hashno <= mmu_hashcnt; 4599 hashno++) { 4600 hmeshift = HME_HASH_SHIFT(hashno); 4601 hblktag.htag_id = ksfmmup; 4602 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 4603 hblktag.htag_bspage = HME_HASH_BSPAGE(saddr, hmeshift); 4604 hblktag.htag_rehash = hashno; 4605 hmebp = HME_HASH_FUNCTION(ksfmmup, saddr, hmeshift); 4606 4607 SFMMU_HASH_LOCK(hmebp); 4608 4609 HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp); 4610 4611 if (hmeblkp == NULL) 4612 SFMMU_HASH_UNLOCK(hmebp); 4613 } 4614 4615 if (hmeblkp == NULL) 4616 return; 4617 4618 ASSERT(!hmeblkp->hblk_shared); 4619 4620 HBLKTOHME(osfhmep, hmeblkp, saddr); 4621 4622 sfmmu_copytte(&osfhmep->hme_tte, &tte); 4623 if (!TTE_IS_VALID(&tte)) { 4624 SFMMU_HASH_UNLOCK(hmebp); 4625 return; 4626 } 4627 4628 pp = osfhmep->hme_page; 4629 if (pp == NULL) { 4630 SFMMU_HASH_UNLOCK(hmebp); 4631 ASSERT(cookie == NULL); 4632 return; 4633 } 4634 4635 vp = pp->p_vnode; 4636 off = pp->p_offset; 4637 4638 pml = sfmmu_mlist_enter(pp); 4639 4640 if (flags & HAC_PAGELOCK) { 4641 if (!page_trylock(pp, SE_SHARED)) { 4642 /* 4643 * Somebody is holding SE_EXCL lock. Might 4644 * even be hat_page_relocate(). Drop all 4645 * our locks, lookup the page in &kvp, and 4646 * retry. If it doesn't exist in &kvp and &zvp, 4647 * then we must be dealing with a kernel mapped 4648 * page which doesn't actually belong to 4649 * segkmem so we punt. 
4650 */ 4651 sfmmu_mlist_exit(pml); 4652 SFMMU_HASH_UNLOCK(hmebp); 4653 pp = page_lookup(&kvp, (u_offset_t)saddr, SE_SHARED); 4654 /* check zvp before giving up */ 4655 if (pp == NULL) 4656 pp = page_lookup(&zvp, (u_offset_t)saddr, 4657 SE_SHARED); 4658 4659 if (pp == NULL) { 4660 ASSERT(cookie == NULL); 4661 return; 4662 } 4663 page_unlock(pp); 4664 goto rehash; 4665 } 4666 locked = 1; 4667 } 4668 4669 ASSERT(PAGE_LOCKED(pp)); 4670 4671 if (osfhmep->hme_page != pp || pp->p_vnode != vp || 4672 pp->p_offset != off) { 4673 /* 4674 * The page moved before we got our hands on it. Drop 4675 * all the locks and try again. 4676 */ 4677 ASSERT((flags & HAC_PAGELOCK) != 0); 4678 sfmmu_mlist_exit(pml); 4679 SFMMU_HASH_UNLOCK(hmebp); 4680 page_unlock(pp); 4681 locked = 0; 4682 goto rehash; 4683 } 4684 4685 if (!VN_ISKAS(vp)) { 4686 /* 4687 * This is not a segkmem page but another page which 4688 * has been kernel mapped. 4689 */ 4690 sfmmu_mlist_exit(pml); 4691 SFMMU_HASH_UNLOCK(hmebp); 4692 if (locked) 4693 page_unlock(pp); 4694 ASSERT(cookie == NULL); 4695 return; 4696 } 4697 4698 if (cookie != NULL) { 4699 pahmep = (struct pa_hment *)cookie; 4700 sfhmep = &pahmep->sfment; 4701 } else { 4702 for (sfhmep = pp->p_mapping; sfhmep != NULL; 4703 sfhmep = sfhmep->hme_next) { 4704 4705 /* 4706 * skip va<->pa mappings 4707 */ 4708 if (!IS_PAHME(sfhmep)) 4709 continue; 4710 4711 pahmep = sfhmep->hme_data; 4712 ASSERT(pahmep != NULL); 4713 4714 /* 4715 * if pa_hment matches, remove it 4716 */ 4717 if ((pahmep->pvt == pvt) && 4718 (pahmep->addr == vaddr) && 4719 (pahmep->len == len)) { 4720 break; 4721 } 4722 } 4723 } 4724 4725 if (sfhmep == NULL) { 4726 if (!panicstr) { 4727 panic("hat_delete_callback: pa_hment not found, pp %p", 4728 (void *)pp); 4729 } 4730 return; 4731 } 4732 4733 /* 4734 * Note: at this point a valid kernel mapping must still be 4735 * present on this page. 
4736 */ 4737 pp->p_share--; 4738 if (pp->p_share <= 0) 4739 panic("hat_delete_callback: zero p_share"); 4740 4741 if (--pahmep->refcnt == 0) { 4742 if (pahmep->flags != 0) 4743 panic("hat_delete_callback: pa_hment is busy"); 4744 4745 /* 4746 * Remove sfhmep from the mapping list for the page. 4747 */ 4748 if (sfhmep->hme_prev) { 4749 sfhmep->hme_prev->hme_next = sfhmep->hme_next; 4750 } else { 4751 pp->p_mapping = sfhmep->hme_next; 4752 } 4753 4754 if (sfhmep->hme_next) 4755 sfhmep->hme_next->hme_prev = sfhmep->hme_prev; 4756 4757 sfmmu_mlist_exit(pml); 4758 SFMMU_HASH_UNLOCK(hmebp); 4759 4760 if (locked) 4761 page_unlock(pp); 4762 4763 kmem_cache_free(pa_hment_cache, pahmep); 4764 return; 4765 } 4766 4767 sfmmu_mlist_exit(pml); 4768 SFMMU_HASH_UNLOCK(hmebp); 4769 if (locked) 4770 page_unlock(pp); 4771 } 4772 4773 /* 4774 * hat_probe returns 1 if the translation for the address 'addr' is 4775 * loaded, zero otherwise. 4776 * 4777 * hat_probe should be used only for advisorary purposes because it may 4778 * occasionally return the wrong value. The implementation must guarantee that 4779 * returning the wrong value is a very rare event. hat_probe is used 4780 * to implement optimizations in the segment drivers. 
4781 * 4782 */ 4783 int 4784 hat_probe(struct hat *sfmmup, caddr_t addr) 4785 { 4786 pfn_t pfn; 4787 tte_t tte; 4788 4789 ASSERT(sfmmup != NULL); 4790 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 4791 4792 ASSERT((sfmmup == ksfmmup) || 4793 AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 4794 4795 if (sfmmup == ksfmmup) { 4796 while ((pfn = sfmmu_vatopfn(addr, sfmmup, &tte)) 4797 == PFN_SUSPENDED) { 4798 sfmmu_vatopfn_suspended(addr, sfmmup, &tte); 4799 } 4800 } else { 4801 pfn = sfmmu_uvatopfn(addr, sfmmup, NULL); 4802 } 4803 4804 if (pfn != PFN_INVALID) 4805 return (1); 4806 else 4807 return (0); 4808 } 4809 4810 ssize_t 4811 hat_getpagesize(struct hat *sfmmup, caddr_t addr) 4812 { 4813 tte_t tte; 4814 4815 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 4816 4817 if (sfmmup == ksfmmup) { 4818 if (sfmmu_vatopfn(addr, sfmmup, &tte) == PFN_INVALID) { 4819 return (-1); 4820 } 4821 } else { 4822 if (sfmmu_uvatopfn(addr, sfmmup, &tte) == PFN_INVALID) { 4823 return (-1); 4824 } 4825 } 4826 4827 ASSERT(TTE_IS_VALID(&tte)); 4828 return (TTEBYTES(TTE_CSZ(&tte))); 4829 } 4830 4831 uint_t 4832 hat_getattr(struct hat *sfmmup, caddr_t addr, uint_t *attr) 4833 { 4834 tte_t tte; 4835 4836 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 4837 4838 if (sfmmup == ksfmmup) { 4839 if (sfmmu_vatopfn(addr, sfmmup, &tte) == PFN_INVALID) { 4840 tte.ll = 0; 4841 } 4842 } else { 4843 if (sfmmu_uvatopfn(addr, sfmmup, &tte) == PFN_INVALID) { 4844 tte.ll = 0; 4845 } 4846 } 4847 if (TTE_IS_VALID(&tte)) { 4848 *attr = sfmmu_ptov_attr(&tte); 4849 return (0); 4850 } 4851 *attr = 0; 4852 return ((uint_t)0xffffffff); 4853 } 4854 4855 /* 4856 * Enables more attributes on specified address range (ie. logical OR) 4857 */ 4858 void 4859 hat_setattr(struct hat *hat, caddr_t addr, size_t len, uint_t attr) 4860 { 4861 if (hat->sfmmu_xhat_provider) { 4862 XHAT_SETATTR(hat, addr, len, attr); 4863 return; 4864 } else { 4865 /* 4866 * This must be a CPU HAT. 
If the address space has 4867 * XHATs attached, change attributes for all of them, 4868 * just in case 4869 */ 4870 ASSERT(hat->sfmmu_as != NULL); 4871 if (hat->sfmmu_as->a_xhat != NULL) 4872 xhat_setattr_all(hat->sfmmu_as, addr, len, attr); 4873 } 4874 4875 sfmmu_chgattr(hat, addr, len, attr, SFMMU_SETATTR); 4876 } 4877 4878 /* 4879 * Assigns attributes to the specified address range. All the attributes 4880 * are specified. 4881 */ 4882 void 4883 hat_chgattr(struct hat *hat, caddr_t addr, size_t len, uint_t attr) 4884 { 4885 if (hat->sfmmu_xhat_provider) { 4886 XHAT_CHGATTR(hat, addr, len, attr); 4887 return; 4888 } else { 4889 /* 4890 * This must be a CPU HAT. If the address space has 4891 * XHATs attached, change attributes for all of them, 4892 * just in case 4893 */ 4894 ASSERT(hat->sfmmu_as != NULL); 4895 if (hat->sfmmu_as->a_xhat != NULL) 4896 xhat_chgattr_all(hat->sfmmu_as, addr, len, attr); 4897 } 4898 4899 sfmmu_chgattr(hat, addr, len, attr, SFMMU_CHGATTR); 4900 } 4901 4902 /* 4903 * Remove attributes on the specified address range (ie. loginal NAND) 4904 */ 4905 void 4906 hat_clrattr(struct hat *hat, caddr_t addr, size_t len, uint_t attr) 4907 { 4908 if (hat->sfmmu_xhat_provider) { 4909 XHAT_CLRATTR(hat, addr, len, attr); 4910 return; 4911 } else { 4912 /* 4913 * This must be a CPU HAT. If the address space has 4914 * XHATs attached, change attributes for all of them, 4915 * just in case 4916 */ 4917 ASSERT(hat->sfmmu_as != NULL); 4918 if (hat->sfmmu_as->a_xhat != NULL) 4919 xhat_clrattr_all(hat->sfmmu_as, addr, len, attr); 4920 } 4921 4922 sfmmu_chgattr(hat, addr, len, attr, SFMMU_CLRATTR); 4923 } 4924 4925 /* 4926 * Change attributes on an address range to that specified by attr and mode. 
4927 */ 4928 static void 4929 sfmmu_chgattr(struct hat *sfmmup, caddr_t addr, size_t len, uint_t attr, 4930 int mode) 4931 { 4932 struct hmehash_bucket *hmebp; 4933 hmeblk_tag hblktag; 4934 int hmeshift, hashno = 1; 4935 struct hme_blk *hmeblkp, *list = NULL; 4936 caddr_t endaddr; 4937 cpuset_t cpuset; 4938 demap_range_t dmr; 4939 4940 CPUSET_ZERO(cpuset); 4941 4942 ASSERT((sfmmup == ksfmmup) || 4943 AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 4944 ASSERT((len & MMU_PAGEOFFSET) == 0); 4945 ASSERT(((uintptr_t)addr & MMU_PAGEOFFSET) == 0); 4946 4947 if ((attr & PROT_USER) && (mode != SFMMU_CLRATTR) && 4948 ((addr + len) > (caddr_t)USERLIMIT)) { 4949 panic("user addr %p in kernel space", 4950 (void *)addr); 4951 } 4952 4953 endaddr = addr + len; 4954 hblktag.htag_id = sfmmup; 4955 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 4956 DEMAP_RANGE_INIT(sfmmup, &dmr); 4957 4958 while (addr < endaddr) { 4959 hmeshift = HME_HASH_SHIFT(hashno); 4960 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 4961 hblktag.htag_rehash = hashno; 4962 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift); 4963 4964 SFMMU_HASH_LOCK(hmebp); 4965 4966 HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list); 4967 if (hmeblkp != NULL) { 4968 ASSERT(!hmeblkp->hblk_shared); 4969 /* 4970 * We've encountered a shadow hmeblk so skip the range 4971 * of the next smaller mapping size. 4972 */ 4973 if (hmeblkp->hblk_shw_bit) { 4974 ASSERT(sfmmup != ksfmmup); 4975 ASSERT(hashno > 1); 4976 addr = (caddr_t)P2END((uintptr_t)addr, 4977 TTEBYTES(hashno - 1)); 4978 } else { 4979 addr = sfmmu_hblk_chgattr(sfmmup, 4980 hmeblkp, addr, endaddr, &dmr, attr, mode); 4981 } 4982 SFMMU_HASH_UNLOCK(hmebp); 4983 hashno = 1; 4984 continue; 4985 } 4986 SFMMU_HASH_UNLOCK(hmebp); 4987 4988 if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) { 4989 /* 4990 * We have traversed the whole list and rehashed 4991 * if necessary without finding the address to chgattr. 
4992 * This is ok, so we increment the address by the 4993 * smallest hmeblk range for kernel mappings or for 4994 * user mappings with no large pages, and the largest 4995 * hmeblk range, to account for shadow hmeblks, for 4996 * user mappings with large pages and continue. 4997 */ 4998 if (sfmmup == ksfmmup) 4999 addr = (caddr_t)P2END((uintptr_t)addr, 5000 TTEBYTES(1)); 5001 else 5002 addr = (caddr_t)P2END((uintptr_t)addr, 5003 TTEBYTES(hashno)); 5004 hashno = 1; 5005 } else { 5006 hashno++; 5007 } 5008 } 5009 5010 sfmmu_hblks_list_purge(&list, 0); 5011 DEMAP_RANGE_FLUSH(&dmr); 5012 cpuset = sfmmup->sfmmu_cpusran; 5013 xt_sync(cpuset); 5014 } 5015 5016 /* 5017 * This function chgattr on a range of addresses in an hmeblk. It returns the 5018 * next addres that needs to be chgattr. 5019 * It should be called with the hash lock held. 5020 * XXX It should be possible to optimize chgattr by not flushing every time but 5021 * on the other hand: 5022 * 1. do one flush crosscall. 5023 * 2. only flush if we are increasing permissions (make sure this will work) 5024 */ 5025 static caddr_t 5026 sfmmu_hblk_chgattr(struct hat *sfmmup, struct hme_blk *hmeblkp, caddr_t addr, 5027 caddr_t endaddr, demap_range_t *dmrp, uint_t attr, int mode) 5028 { 5029 tte_t tte, tteattr, tteflags, ttemod; 5030 struct sf_hment *sfhmep; 5031 int ttesz; 5032 struct page *pp = NULL; 5033 kmutex_t *pml, *pmtx; 5034 int ret; 5035 int use_demap_range; 5036 #if defined(SF_ERRATA_57) 5037 int check_exec; 5038 #endif 5039 5040 ASSERT(in_hblk_range(hmeblkp, addr)); 5041 ASSERT(hmeblkp->hblk_shw_bit == 0); 5042 ASSERT(!hmeblkp->hblk_shared); 5043 5044 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp)); 5045 ttesz = get_hblk_ttesz(hmeblkp); 5046 5047 /* 5048 * Flush the current demap region if addresses have been 5049 * skipped or the page size doesn't match. 
5050 */ 5051 use_demap_range = (TTEBYTES(ttesz) == DEMAP_RANGE_PGSZ(dmrp)); 5052 if (use_demap_range) { 5053 DEMAP_RANGE_CONTINUE(dmrp, addr, endaddr); 5054 } else { 5055 DEMAP_RANGE_FLUSH(dmrp); 5056 } 5057 5058 tteattr.ll = sfmmu_vtop_attr(attr, mode, &tteflags); 5059 #if defined(SF_ERRATA_57) 5060 check_exec = (sfmmup != ksfmmup) && 5061 AS_TYPE_64BIT(sfmmup->sfmmu_as) && 5062 TTE_IS_EXECUTABLE(&tteattr); 5063 #endif 5064 HBLKTOHME(sfhmep, hmeblkp, addr); 5065 while (addr < endaddr) { 5066 sfmmu_copytte(&sfhmep->hme_tte, &tte); 5067 if (TTE_IS_VALID(&tte)) { 5068 if ((tte.ll & tteflags.ll) == tteattr.ll) { 5069 /* 5070 * if the new attr is the same as old 5071 * continue 5072 */ 5073 goto next_addr; 5074 } 5075 if (!TTE_IS_WRITABLE(&tteattr)) { 5076 /* 5077 * make sure we clear hw modify bit if we 5078 * removing write protections 5079 */ 5080 tteflags.tte_intlo |= TTE_HWWR_INT; 5081 } 5082 5083 pml = NULL; 5084 pp = sfhmep->hme_page; 5085 if (pp) { 5086 pml = sfmmu_mlist_enter(pp); 5087 } 5088 5089 if (pp != sfhmep->hme_page) { 5090 /* 5091 * tte must have been unloaded. 5092 */ 5093 ASSERT(pml); 5094 sfmmu_mlist_exit(pml); 5095 continue; 5096 } 5097 5098 ASSERT(pp == NULL || sfmmu_mlist_held(pp)); 5099 5100 ttemod = tte; 5101 ttemod.ll = (ttemod.ll & ~tteflags.ll) | tteattr.ll; 5102 ASSERT(TTE_TO_TTEPFN(&ttemod) == TTE_TO_TTEPFN(&tte)); 5103 5104 #if defined(SF_ERRATA_57) 5105 if (check_exec && addr < errata57_limit) 5106 ttemod.tte_exec_perm = 0; 5107 #endif 5108 ret = sfmmu_modifytte_try(&tte, &ttemod, 5109 &sfhmep->hme_tte); 5110 5111 if (ret < 0) { 5112 /* tte changed underneath us */ 5113 if (pml) { 5114 sfmmu_mlist_exit(pml); 5115 } 5116 continue; 5117 } 5118 5119 if (tteflags.tte_intlo & TTE_HWWR_INT) { 5120 /* 5121 * need to sync if we are clearing modify bit. 
5122 */ 5123 sfmmu_ttesync(sfmmup, addr, &tte, pp); 5124 } 5125 5126 if (pp && PP_ISRO(pp)) { 5127 if (tteattr.tte_intlo & TTE_WRPRM_INT) { 5128 pmtx = sfmmu_page_enter(pp); 5129 PP_CLRRO(pp); 5130 sfmmu_page_exit(pmtx); 5131 } 5132 } 5133 5134 if (ret > 0 && use_demap_range) { 5135 DEMAP_RANGE_MARKPG(dmrp, addr); 5136 } else if (ret > 0) { 5137 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0); 5138 } 5139 5140 if (pml) { 5141 sfmmu_mlist_exit(pml); 5142 } 5143 } 5144 next_addr: 5145 addr += TTEBYTES(ttesz); 5146 sfhmep++; 5147 DEMAP_RANGE_NEXTPG(dmrp); 5148 } 5149 return (addr); 5150 } 5151 5152 /* 5153 * This routine converts virtual attributes to physical ones. It will 5154 * update the tteflags field with the tte mask corresponding to the attributes 5155 * affected and it returns the new attributes. It will also clear the modify 5156 * bit if we are taking away write permission. This is necessary since the 5157 * modify bit is the hardware permission bit and we need to clear it in order 5158 * to detect write faults. 5159 */ 5160 static uint64_t 5161 sfmmu_vtop_attr(uint_t attr, int mode, tte_t *ttemaskp) 5162 { 5163 tte_t ttevalue; 5164 5165 ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR)); 5166 5167 switch (mode) { 5168 case SFMMU_CHGATTR: 5169 /* all attributes specified */ 5170 ttevalue.tte_inthi = MAKE_TTEATTR_INTHI(attr); 5171 ttevalue.tte_intlo = MAKE_TTEATTR_INTLO(attr); 5172 ttemaskp->tte_inthi = TTEINTHI_ATTR; 5173 ttemaskp->tte_intlo = TTEINTLO_ATTR; 5174 break; 5175 case SFMMU_SETATTR: 5176 ASSERT(!(attr & ~HAT_PROT_MASK)); 5177 ttemaskp->ll = 0; 5178 ttevalue.ll = 0; 5179 /* 5180 * a valid tte implies exec and read for sfmmu 5181 * so no need to do anything about them. 5182 * since priviledged access implies user access 5183 * PROT_USER doesn't make sense either. 
5184 */ 5185 if (attr & PROT_WRITE) { 5186 ttemaskp->tte_intlo |= TTE_WRPRM_INT; 5187 ttevalue.tte_intlo |= TTE_WRPRM_INT; 5188 } 5189 break; 5190 case SFMMU_CLRATTR: 5191 /* attributes will be nand with current ones */ 5192 if (attr & ~(PROT_WRITE | PROT_USER)) { 5193 panic("sfmmu: attr %x not supported", attr); 5194 } 5195 ttemaskp->ll = 0; 5196 ttevalue.ll = 0; 5197 if (attr & PROT_WRITE) { 5198 /* clear both writable and modify bit */ 5199 ttemaskp->tte_intlo |= TTE_WRPRM_INT | TTE_HWWR_INT; 5200 } 5201 if (attr & PROT_USER) { 5202 ttemaskp->tte_intlo |= TTE_PRIV_INT; 5203 ttevalue.tte_intlo |= TTE_PRIV_INT; 5204 } 5205 break; 5206 default: 5207 panic("sfmmu_vtop_attr: bad mode %x", mode); 5208 } 5209 ASSERT(TTE_TO_TTEPFN(&ttevalue) == 0); 5210 return (ttevalue.ll); 5211 } 5212 5213 static uint_t 5214 sfmmu_ptov_attr(tte_t *ttep) 5215 { 5216 uint_t attr; 5217 5218 ASSERT(TTE_IS_VALID(ttep)); 5219 5220 attr = PROT_READ; 5221 5222 if (TTE_IS_WRITABLE(ttep)) { 5223 attr |= PROT_WRITE; 5224 } 5225 if (TTE_IS_EXECUTABLE(ttep)) { 5226 attr |= PROT_EXEC; 5227 } 5228 if (!TTE_IS_PRIVILEGED(ttep)) { 5229 attr |= PROT_USER; 5230 } 5231 if (TTE_IS_NFO(ttep)) { 5232 attr |= HAT_NOFAULT; 5233 } 5234 if (TTE_IS_NOSYNC(ttep)) { 5235 attr |= HAT_NOSYNC; 5236 } 5237 if (TTE_IS_SIDEFFECT(ttep)) { 5238 attr |= SFMMU_SIDEFFECT; 5239 } 5240 if (!TTE_IS_VCACHEABLE(ttep)) { 5241 attr |= SFMMU_UNCACHEVTTE; 5242 } 5243 if (!TTE_IS_PCACHEABLE(ttep)) { 5244 attr |= SFMMU_UNCACHEPTTE; 5245 } 5246 return (attr); 5247 } 5248 5249 /* 5250 * hat_chgprot is a deprecated hat call. New segment drivers 5251 * should store all attributes and use hat_*attr calls. 5252 * 5253 * Change the protections in the virtual address range 5254 * given to the specified virtual protection. If vprot is ~PROT_WRITE, 5255 * then remove write permission, leaving the other 5256 * permissions unchanged. If vprot is ~PROT_USER, remove user permissions. 
5257 * 5258 */ 5259 void 5260 hat_chgprot(struct hat *sfmmup, caddr_t addr, size_t len, uint_t vprot) 5261 { 5262 struct hmehash_bucket *hmebp; 5263 hmeblk_tag hblktag; 5264 int hmeshift, hashno = 1; 5265 struct hme_blk *hmeblkp, *list = NULL; 5266 caddr_t endaddr; 5267 cpuset_t cpuset; 5268 demap_range_t dmr; 5269 5270 ASSERT((len & MMU_PAGEOFFSET) == 0); 5271 ASSERT(((uintptr_t)addr & MMU_PAGEOFFSET) == 0); 5272 5273 if (sfmmup->sfmmu_xhat_provider) { 5274 XHAT_CHGPROT(sfmmup, addr, len, vprot); 5275 return; 5276 } else { 5277 /* 5278 * This must be a CPU HAT. If the address space has 5279 * XHATs attached, change attributes for all of them, 5280 * just in case 5281 */ 5282 ASSERT(sfmmup->sfmmu_as != NULL); 5283 if (sfmmup->sfmmu_as->a_xhat != NULL) 5284 xhat_chgprot_all(sfmmup->sfmmu_as, addr, len, vprot); 5285 } 5286 5287 CPUSET_ZERO(cpuset); 5288 5289 if ((vprot != (uint_t)~PROT_WRITE) && (vprot & PROT_USER) && 5290 ((addr + len) > (caddr_t)USERLIMIT)) { 5291 panic("user addr %p vprot %x in kernel space", 5292 (void *)addr, vprot); 5293 } 5294 endaddr = addr + len; 5295 hblktag.htag_id = sfmmup; 5296 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 5297 DEMAP_RANGE_INIT(sfmmup, &dmr); 5298 5299 while (addr < endaddr) { 5300 hmeshift = HME_HASH_SHIFT(hashno); 5301 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 5302 hblktag.htag_rehash = hashno; 5303 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift); 5304 5305 SFMMU_HASH_LOCK(hmebp); 5306 5307 HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list); 5308 if (hmeblkp != NULL) { 5309 ASSERT(!hmeblkp->hblk_shared); 5310 /* 5311 * We've encountered a shadow hmeblk so skip the range 5312 * of the next smaller mapping size. 
5313 */ 5314 if (hmeblkp->hblk_shw_bit) { 5315 ASSERT(sfmmup != ksfmmup); 5316 ASSERT(hashno > 1); 5317 addr = (caddr_t)P2END((uintptr_t)addr, 5318 TTEBYTES(hashno - 1)); 5319 } else { 5320 addr = sfmmu_hblk_chgprot(sfmmup, hmeblkp, 5321 addr, endaddr, &dmr, vprot); 5322 } 5323 SFMMU_HASH_UNLOCK(hmebp); 5324 hashno = 1; 5325 continue; 5326 } 5327 SFMMU_HASH_UNLOCK(hmebp); 5328 5329 if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) { 5330 /* 5331 * We have traversed the whole list and rehashed 5332 * if necessary without finding the address to chgprot. 5333 * This is ok so we increment the address by the 5334 * smallest hmeblk range for kernel mappings and the 5335 * largest hmeblk range, to account for shadow hmeblks, 5336 * for user mappings and continue. 5337 */ 5338 if (sfmmup == ksfmmup) 5339 addr = (caddr_t)P2END((uintptr_t)addr, 5340 TTEBYTES(1)); 5341 else 5342 addr = (caddr_t)P2END((uintptr_t)addr, 5343 TTEBYTES(hashno)); 5344 hashno = 1; 5345 } else { 5346 hashno++; 5347 } 5348 } 5349 5350 sfmmu_hblks_list_purge(&list, 0); 5351 DEMAP_RANGE_FLUSH(&dmr); 5352 cpuset = sfmmup->sfmmu_cpusran; 5353 xt_sync(cpuset); 5354 } 5355 5356 /* 5357 * This function chgprots a range of addresses in an hmeblk. It returns the 5358 * next addres that needs to be chgprot. 5359 * It should be called with the hash lock held. 5360 * XXX It shold be possible to optimize chgprot by not flushing every time but 5361 * on the other hand: 5362 * 1. do one flush crosscall. 5363 * 2. 
only flush if we are increasing permissions (make sure this will work) 5364 */ 5365 static caddr_t 5366 sfmmu_hblk_chgprot(sfmmu_t *sfmmup, struct hme_blk *hmeblkp, caddr_t addr, 5367 caddr_t endaddr, demap_range_t *dmrp, uint_t vprot) 5368 { 5369 uint_t pprot; 5370 tte_t tte, ttemod; 5371 struct sf_hment *sfhmep; 5372 uint_t tteflags; 5373 int ttesz; 5374 struct page *pp = NULL; 5375 kmutex_t *pml, *pmtx; 5376 int ret; 5377 int use_demap_range; 5378 #if defined(SF_ERRATA_57) 5379 int check_exec; 5380 #endif 5381 5382 ASSERT(in_hblk_range(hmeblkp, addr)); 5383 ASSERT(hmeblkp->hblk_shw_bit == 0); 5384 ASSERT(!hmeblkp->hblk_shared); 5385 5386 #ifdef DEBUG 5387 if (get_hblk_ttesz(hmeblkp) != TTE8K && 5388 (endaddr < get_hblk_endaddr(hmeblkp))) { 5389 panic("sfmmu_hblk_chgprot: partial chgprot of large page"); 5390 } 5391 #endif /* DEBUG */ 5392 5393 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp)); 5394 ttesz = get_hblk_ttesz(hmeblkp); 5395 5396 pprot = sfmmu_vtop_prot(vprot, &tteflags); 5397 #if defined(SF_ERRATA_57) 5398 check_exec = (sfmmup != ksfmmup) && 5399 AS_TYPE_64BIT(sfmmup->sfmmu_as) && 5400 ((vprot & PROT_EXEC) == PROT_EXEC); 5401 #endif 5402 HBLKTOHME(sfhmep, hmeblkp, addr); 5403 5404 /* 5405 * Flush the current demap region if addresses have been 5406 * skipped or the page size doesn't match. 5407 */ 5408 use_demap_range = (TTEBYTES(ttesz) == MMU_PAGESIZE); 5409 if (use_demap_range) { 5410 DEMAP_RANGE_CONTINUE(dmrp, addr, endaddr); 5411 } else { 5412 DEMAP_RANGE_FLUSH(dmrp); 5413 } 5414 5415 while (addr < endaddr) { 5416 sfmmu_copytte(&sfhmep->hme_tte, &tte); 5417 if (TTE_IS_VALID(&tte)) { 5418 if (TTE_GET_LOFLAGS(&tte, tteflags) == pprot) { 5419 /* 5420 * if the new protection is the same as old 5421 * continue 5422 */ 5423 goto next_addr; 5424 } 5425 pml = NULL; 5426 pp = sfhmep->hme_page; 5427 if (pp) { 5428 pml = sfmmu_mlist_enter(pp); 5429 } 5430 if (pp != sfhmep->hme_page) { 5431 /* 5432 * tte most have been unloaded 5433 * underneath us. 
Recheck 5434 */ 5435 ASSERT(pml); 5436 sfmmu_mlist_exit(pml); 5437 continue; 5438 } 5439 5440 ASSERT(pp == NULL || sfmmu_mlist_held(pp)); 5441 5442 ttemod = tte; 5443 TTE_SET_LOFLAGS(&ttemod, tteflags, pprot); 5444 #if defined(SF_ERRATA_57) 5445 if (check_exec && addr < errata57_limit) 5446 ttemod.tte_exec_perm = 0; 5447 #endif 5448 ret = sfmmu_modifytte_try(&tte, &ttemod, 5449 &sfhmep->hme_tte); 5450 5451 if (ret < 0) { 5452 /* tte changed underneath us */ 5453 if (pml) { 5454 sfmmu_mlist_exit(pml); 5455 } 5456 continue; 5457 } 5458 5459 if (tteflags & TTE_HWWR_INT) { 5460 /* 5461 * need to sync if we are clearing modify bit. 5462 */ 5463 sfmmu_ttesync(sfmmup, addr, &tte, pp); 5464 } 5465 5466 if (pp && PP_ISRO(pp)) { 5467 if (pprot & TTE_WRPRM_INT) { 5468 pmtx = sfmmu_page_enter(pp); 5469 PP_CLRRO(pp); 5470 sfmmu_page_exit(pmtx); 5471 } 5472 } 5473 5474 if (ret > 0 && use_demap_range) { 5475 DEMAP_RANGE_MARKPG(dmrp, addr); 5476 } else if (ret > 0) { 5477 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0); 5478 } 5479 5480 if (pml) { 5481 sfmmu_mlist_exit(pml); 5482 } 5483 } 5484 next_addr: 5485 addr += TTEBYTES(ttesz); 5486 sfhmep++; 5487 DEMAP_RANGE_NEXTPG(dmrp); 5488 } 5489 return (addr); 5490 } 5491 5492 /* 5493 * This routine is deprecated and should only be used by hat_chgprot. 5494 * The correct routine is sfmmu_vtop_attr. 5495 * This routine converts virtual page protections to physical ones. It will 5496 * update the tteflags field with the tte mask corresponding to the protections 5497 * affected and it returns the new protections. It will also clear the modify 5498 * bit if we are taking away write permission. This is necessary since the 5499 * modify bit is the hardware permission bit and we need to clear it in order 5500 * to detect write faults. 5501 * It accepts the following special protections: 5502 * ~PROT_WRITE = remove write permissions. 5503 * ~PROT_USER = remove user permissions. 
5504 */ 5505 static uint_t 5506 sfmmu_vtop_prot(uint_t vprot, uint_t *tteflagsp) 5507 { 5508 if (vprot == (uint_t)~PROT_WRITE) { 5509 *tteflagsp = TTE_WRPRM_INT | TTE_HWWR_INT; 5510 return (0); /* will cause wrprm to be cleared */ 5511 } 5512 if (vprot == (uint_t)~PROT_USER) { 5513 *tteflagsp = TTE_PRIV_INT; 5514 return (0); /* will cause privprm to be cleared */ 5515 } 5516 if ((vprot == 0) || (vprot == PROT_USER) || 5517 ((vprot & PROT_ALL) != vprot)) { 5518 panic("sfmmu_vtop_prot -- bad prot %x", vprot); 5519 } 5520 5521 switch (vprot) { 5522 case (PROT_READ): 5523 case (PROT_EXEC): 5524 case (PROT_EXEC | PROT_READ): 5525 *tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT | TTE_HWWR_INT; 5526 return (TTE_PRIV_INT); /* set prv and clr wrt */ 5527 case (PROT_WRITE): 5528 case (PROT_WRITE | PROT_READ): 5529 case (PROT_EXEC | PROT_WRITE): 5530 case (PROT_EXEC | PROT_WRITE | PROT_READ): 5531 *tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT; 5532 return (TTE_PRIV_INT | TTE_WRPRM_INT); /* set prv and wrt */ 5533 case (PROT_USER | PROT_READ): 5534 case (PROT_USER | PROT_EXEC): 5535 case (PROT_USER | PROT_EXEC | PROT_READ): 5536 *tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT | TTE_HWWR_INT; 5537 return (0); /* clr prv and wrt */ 5538 case (PROT_USER | PROT_WRITE): 5539 case (PROT_USER | PROT_WRITE | PROT_READ): 5540 case (PROT_USER | PROT_EXEC | PROT_WRITE): 5541 case (PROT_USER | PROT_EXEC | PROT_WRITE | PROT_READ): 5542 *tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT; 5543 return (TTE_WRPRM_INT); /* clr prv and set wrt */ 5544 default: 5545 panic("sfmmu_vtop_prot -- bad prot %x", vprot); 5546 } 5547 return (0); 5548 } 5549 5550 /* 5551 * Alternate unload for very large virtual ranges. With a true 64 bit VA, 5552 * the normal algorithm would take too long for a very large VA range with 5553 * few real mappings. This routine just walks thru all HMEs in the global 5554 * hash table to find and remove mappings. 
5555 */ 5556 static void 5557 hat_unload_large_virtual( 5558 struct hat *sfmmup, 5559 caddr_t startaddr, 5560 size_t len, 5561 uint_t flags, 5562 hat_callback_t *callback) 5563 { 5564 struct hmehash_bucket *hmebp; 5565 struct hme_blk *hmeblkp; 5566 struct hme_blk *pr_hblk = NULL; 5567 struct hme_blk *nx_hblk; 5568 struct hme_blk *list = NULL; 5569 int i; 5570 demap_range_t dmr, *dmrp; 5571 cpuset_t cpuset; 5572 caddr_t endaddr = startaddr + len; 5573 caddr_t sa; 5574 caddr_t ea; 5575 caddr_t cb_sa[MAX_CB_ADDR]; 5576 caddr_t cb_ea[MAX_CB_ADDR]; 5577 int addr_cnt = 0; 5578 int a = 0; 5579 5580 if (sfmmup->sfmmu_free) { 5581 dmrp = NULL; 5582 } else { 5583 dmrp = &dmr; 5584 DEMAP_RANGE_INIT(sfmmup, dmrp); 5585 } 5586 5587 /* 5588 * Loop through all the hash buckets of HME blocks looking for matches. 5589 */ 5590 for (i = 0; i <= UHMEHASH_SZ; i++) { 5591 hmebp = &uhme_hash[i]; 5592 SFMMU_HASH_LOCK(hmebp); 5593 hmeblkp = hmebp->hmeblkp; 5594 pr_hblk = NULL; 5595 while (hmeblkp) { 5596 nx_hblk = hmeblkp->hblk_next; 5597 5598 /* 5599 * skip if not this context, if a shadow block or 5600 * if the mapping is not in the requested range 5601 */ 5602 if (hmeblkp->hblk_tag.htag_id != sfmmup || 5603 hmeblkp->hblk_shw_bit || 5604 (sa = (caddr_t)get_hblk_base(hmeblkp)) >= endaddr || 5605 (ea = get_hblk_endaddr(hmeblkp)) <= startaddr) { 5606 pr_hblk = hmeblkp; 5607 goto next_block; 5608 } 5609 5610 ASSERT(!hmeblkp->hblk_shared); 5611 /* 5612 * unload if there are any current valid mappings 5613 */ 5614 if (hmeblkp->hblk_vcnt != 0 || 5615 hmeblkp->hblk_hmecnt != 0) 5616 (void) sfmmu_hblk_unload(sfmmup, hmeblkp, 5617 sa, ea, dmrp, flags); 5618 5619 /* 5620 * on unmap we also release the HME block itself, once 5621 * all mappings are gone. 
5622 */ 5623 if ((flags & HAT_UNLOAD_UNMAP) != 0 && 5624 !hmeblkp->hblk_vcnt && 5625 !hmeblkp->hblk_hmecnt) { 5626 ASSERT(!hmeblkp->hblk_lckcnt); 5627 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, 5628 &list, 0); 5629 } else { 5630 pr_hblk = hmeblkp; 5631 } 5632 5633 if (callback == NULL) 5634 goto next_block; 5635 5636 /* 5637 * HME blocks may span more than one page, but we may be 5638 * unmapping only one page, so check for a smaller range 5639 * for the callback 5640 */ 5641 if (sa < startaddr) 5642 sa = startaddr; 5643 if (--ea > endaddr) 5644 ea = endaddr - 1; 5645 5646 cb_sa[addr_cnt] = sa; 5647 cb_ea[addr_cnt] = ea; 5648 if (++addr_cnt == MAX_CB_ADDR) { 5649 if (dmrp != NULL) { 5650 DEMAP_RANGE_FLUSH(dmrp); 5651 cpuset = sfmmup->sfmmu_cpusran; 5652 xt_sync(cpuset); 5653 } 5654 5655 for (a = 0; a < MAX_CB_ADDR; ++a) { 5656 callback->hcb_start_addr = cb_sa[a]; 5657 callback->hcb_end_addr = cb_ea[a]; 5658 callback->hcb_function(callback); 5659 } 5660 addr_cnt = 0; 5661 } 5662 5663 next_block: 5664 hmeblkp = nx_hblk; 5665 } 5666 SFMMU_HASH_UNLOCK(hmebp); 5667 } 5668 5669 sfmmu_hblks_list_purge(&list, 0); 5670 if (dmrp != NULL) { 5671 DEMAP_RANGE_FLUSH(dmrp); 5672 cpuset = sfmmup->sfmmu_cpusran; 5673 xt_sync(cpuset); 5674 } 5675 5676 for (a = 0; a < addr_cnt; ++a) { 5677 callback->hcb_start_addr = cb_sa[a]; 5678 callback->hcb_end_addr = cb_ea[a]; 5679 callback->hcb_function(callback); 5680 } 5681 5682 /* 5683 * Check TSB and TLB page sizes if the process isn't exiting. 5684 */ 5685 if (!sfmmup->sfmmu_free) 5686 sfmmu_check_page_sizes(sfmmup, 0); 5687 } 5688 5689 /* 5690 * Unload all the mappings in the range [addr..addr+len). addr and len must 5691 * be MMU_PAGESIZE aligned. 
5692 */ 5693 5694 extern struct seg *segkmap; 5695 #define ISSEGKMAP(sfmmup, addr) (sfmmup == ksfmmup && \ 5696 segkmap->s_base <= (addr) && (addr) < (segkmap->s_base + segkmap->s_size)) 5697 5698 5699 void 5700 hat_unload_callback( 5701 struct hat *sfmmup, 5702 caddr_t addr, 5703 size_t len, 5704 uint_t flags, 5705 hat_callback_t *callback) 5706 { 5707 struct hmehash_bucket *hmebp; 5708 hmeblk_tag hblktag; 5709 int hmeshift, hashno, iskernel; 5710 struct hme_blk *hmeblkp, *pr_hblk, *list = NULL; 5711 caddr_t endaddr; 5712 cpuset_t cpuset; 5713 int addr_count = 0; 5714 int a; 5715 caddr_t cb_start_addr[MAX_CB_ADDR]; 5716 caddr_t cb_end_addr[MAX_CB_ADDR]; 5717 int issegkmap = ISSEGKMAP(sfmmup, addr); 5718 demap_range_t dmr, *dmrp; 5719 5720 if (sfmmup->sfmmu_xhat_provider) { 5721 XHAT_UNLOAD_CALLBACK(sfmmup, addr, len, flags, callback); 5722 return; 5723 } else { 5724 /* 5725 * This must be a CPU HAT. If the address space has 5726 * XHATs attached, unload the mappings for all of them, 5727 * just in case 5728 */ 5729 ASSERT(sfmmup->sfmmu_as != NULL); 5730 if (sfmmup->sfmmu_as->a_xhat != NULL) 5731 xhat_unload_callback_all(sfmmup->sfmmu_as, addr, 5732 len, flags, callback); 5733 } 5734 5735 ASSERT((sfmmup == ksfmmup) || (flags & HAT_UNLOAD_OTHER) || \ 5736 AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 5737 5738 ASSERT(sfmmup != NULL); 5739 ASSERT((len & MMU_PAGEOFFSET) == 0); 5740 ASSERT(!((uintptr_t)addr & MMU_PAGEOFFSET)); 5741 5742 /* 5743 * Probing through a large VA range (say 63 bits) will be slow, even 5744 * at 4 Meg steps between the probes. So, when the virtual address range 5745 * is very large, search the HME entries for what to unload. 
5746 * 5747 * len >> TTE_PAGE_SHIFT(TTE4M) is the # of 4Meg probes we'd need 5748 * 5749 * UHMEHASH_SZ is number of hash buckets to examine 5750 * 5751 */ 5752 if (sfmmup != KHATID && (len >> TTE_PAGE_SHIFT(TTE4M)) > UHMEHASH_SZ) { 5753 hat_unload_large_virtual(sfmmup, addr, len, flags, callback); 5754 return; 5755 } 5756 5757 CPUSET_ZERO(cpuset); 5758 5759 /* 5760 * If the process is exiting, we can save a lot of fuss since 5761 * we'll flush the TLB when we free the ctx anyway. 5762 */ 5763 if (sfmmup->sfmmu_free) 5764 dmrp = NULL; 5765 else 5766 dmrp = &dmr; 5767 5768 DEMAP_RANGE_INIT(sfmmup, dmrp); 5769 endaddr = addr + len; 5770 hblktag.htag_id = sfmmup; 5771 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 5772 5773 /* 5774 * It is likely for the vm to call unload over a wide range of 5775 * addresses that are actually very sparsely populated by 5776 * translations. In order to speed this up the sfmmu hat supports 5777 * the concept of shadow hmeblks. Dummy large page hmeblks that 5778 * correspond to actual small translations are allocated at tteload 5779 * time and are referred to as shadow hmeblks. Now, during unload 5780 * time, we first check if we have a shadow hmeblk for that 5781 * translation. The absence of one means the corresponding address 5782 * range is empty and can be skipped. 5783 * 5784 * The kernel is an exception to above statement and that is why 5785 * we don't use shadow hmeblks and hash starting from the smallest 5786 * page size. 
5787 */ 5788 if (sfmmup == KHATID) { 5789 iskernel = 1; 5790 hashno = TTE64K; 5791 } else { 5792 iskernel = 0; 5793 if (mmu_page_sizes == max_mmu_page_sizes) { 5794 hashno = TTE256M; 5795 } else { 5796 hashno = TTE4M; 5797 } 5798 } 5799 while (addr < endaddr) { 5800 hmeshift = HME_HASH_SHIFT(hashno); 5801 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 5802 hblktag.htag_rehash = hashno; 5803 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift); 5804 5805 SFMMU_HASH_LOCK(hmebp); 5806 5807 HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, &list); 5808 if (hmeblkp == NULL) { 5809 /* 5810 * didn't find an hmeblk. skip the appropiate 5811 * address range. 5812 */ 5813 SFMMU_HASH_UNLOCK(hmebp); 5814 if (iskernel) { 5815 if (hashno < mmu_hashcnt) { 5816 hashno++; 5817 continue; 5818 } else { 5819 hashno = TTE64K; 5820 addr = (caddr_t)roundup((uintptr_t)addr 5821 + 1, MMU_PAGESIZE64K); 5822 continue; 5823 } 5824 } 5825 addr = (caddr_t)roundup((uintptr_t)addr + 1, 5826 (1 << hmeshift)); 5827 if ((uintptr_t)addr & MMU_PAGEOFFSET512K) { 5828 ASSERT(hashno == TTE64K); 5829 continue; 5830 } 5831 if ((uintptr_t)addr & MMU_PAGEOFFSET4M) { 5832 hashno = TTE512K; 5833 continue; 5834 } 5835 if (mmu_page_sizes == max_mmu_page_sizes) { 5836 if ((uintptr_t)addr & MMU_PAGEOFFSET32M) { 5837 hashno = TTE4M; 5838 continue; 5839 } 5840 if ((uintptr_t)addr & MMU_PAGEOFFSET256M) { 5841 hashno = TTE32M; 5842 continue; 5843 } 5844 hashno = TTE256M; 5845 continue; 5846 } else { 5847 hashno = TTE4M; 5848 continue; 5849 } 5850 } 5851 ASSERT(hmeblkp); 5852 ASSERT(!hmeblkp->hblk_shared); 5853 if (!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) { 5854 /* 5855 * If the valid count is zero we can skip the range 5856 * mapped by this hmeblk. 5857 * We free hblks in the case of HAT_UNMAP. HAT_UNMAP 5858 * is used by segment drivers as a hint 5859 * that the mapping resource won't be used any longer. 5860 * The best example of this is during exit(). 
5861 */ 5862 addr = (caddr_t)roundup((uintptr_t)addr + 1, 5863 get_hblk_span(hmeblkp)); 5864 if ((flags & HAT_UNLOAD_UNMAP) || 5865 (iskernel && !issegkmap)) { 5866 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, 5867 &list, 0); 5868 } 5869 SFMMU_HASH_UNLOCK(hmebp); 5870 5871 if (iskernel) { 5872 hashno = TTE64K; 5873 continue; 5874 } 5875 if ((uintptr_t)addr & MMU_PAGEOFFSET512K) { 5876 ASSERT(hashno == TTE64K); 5877 continue; 5878 } 5879 if ((uintptr_t)addr & MMU_PAGEOFFSET4M) { 5880 hashno = TTE512K; 5881 continue; 5882 } 5883 if (mmu_page_sizes == max_mmu_page_sizes) { 5884 if ((uintptr_t)addr & MMU_PAGEOFFSET32M) { 5885 hashno = TTE4M; 5886 continue; 5887 } 5888 if ((uintptr_t)addr & MMU_PAGEOFFSET256M) { 5889 hashno = TTE32M; 5890 continue; 5891 } 5892 hashno = TTE256M; 5893 continue; 5894 } else { 5895 hashno = TTE4M; 5896 continue; 5897 } 5898 } 5899 if (hmeblkp->hblk_shw_bit) { 5900 /* 5901 * If we encounter a shadow hmeblk we know there is 5902 * smaller sized hmeblks mapping the same address space. 5903 * Decrement the hash size and rehash. 5904 */ 5905 ASSERT(sfmmup != KHATID); 5906 hashno--; 5907 SFMMU_HASH_UNLOCK(hmebp); 5908 continue; 5909 } 5910 5911 /* 5912 * track callback address ranges. 5913 * only start a new range when it's not contiguous 5914 */ 5915 if (callback != NULL) { 5916 if (addr_count > 0 && 5917 addr == cb_end_addr[addr_count - 1]) 5918 --addr_count; 5919 else 5920 cb_start_addr[addr_count] = addr; 5921 } 5922 5923 addr = sfmmu_hblk_unload(sfmmup, hmeblkp, addr, endaddr, 5924 dmrp, flags); 5925 5926 if (callback != NULL) 5927 cb_end_addr[addr_count++] = addr; 5928 5929 if (((flags & HAT_UNLOAD_UNMAP) || (iskernel && !issegkmap)) && 5930 !hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) { 5931 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, &list, 0); 5932 } 5933 SFMMU_HASH_UNLOCK(hmebp); 5934 5935 /* 5936 * Notify our caller as to exactly which pages 5937 * have been unloaded. 
We do these in clumps, 5938 * to minimize the number of xt_sync()s that need to occur. 5939 */ 5940 if (callback != NULL && addr_count == MAX_CB_ADDR) { 5941 DEMAP_RANGE_FLUSH(dmrp); 5942 if (dmrp != NULL) { 5943 cpuset = sfmmup->sfmmu_cpusran; 5944 xt_sync(cpuset); 5945 } 5946 5947 for (a = 0; a < MAX_CB_ADDR; ++a) { 5948 callback->hcb_start_addr = cb_start_addr[a]; 5949 callback->hcb_end_addr = cb_end_addr[a]; 5950 callback->hcb_function(callback); 5951 } 5952 addr_count = 0; 5953 } 5954 if (iskernel) { 5955 hashno = TTE64K; 5956 continue; 5957 } 5958 if ((uintptr_t)addr & MMU_PAGEOFFSET512K) { 5959 ASSERT(hashno == TTE64K); 5960 continue; 5961 } 5962 if ((uintptr_t)addr & MMU_PAGEOFFSET4M) { 5963 hashno = TTE512K; 5964 continue; 5965 } 5966 if (mmu_page_sizes == max_mmu_page_sizes) { 5967 if ((uintptr_t)addr & MMU_PAGEOFFSET32M) { 5968 hashno = TTE4M; 5969 continue; 5970 } 5971 if ((uintptr_t)addr & MMU_PAGEOFFSET256M) { 5972 hashno = TTE32M; 5973 continue; 5974 } 5975 hashno = TTE256M; 5976 } else { 5977 hashno = TTE4M; 5978 } 5979 } 5980 5981 sfmmu_hblks_list_purge(&list, 0); 5982 DEMAP_RANGE_FLUSH(dmrp); 5983 if (dmrp != NULL) { 5984 cpuset = sfmmup->sfmmu_cpusran; 5985 xt_sync(cpuset); 5986 } 5987 if (callback && addr_count != 0) { 5988 for (a = 0; a < addr_count; ++a) { 5989 callback->hcb_start_addr = cb_start_addr[a]; 5990 callback->hcb_end_addr = cb_end_addr[a]; 5991 callback->hcb_function(callback); 5992 } 5993 } 5994 5995 /* 5996 * Check TSB and TLB page sizes if the process isn't exiting. 5997 */ 5998 if (!sfmmup->sfmmu_free) 5999 sfmmu_check_page_sizes(sfmmup, 0); 6000 } 6001 6002 /* 6003 * Unload all the mappings in the range [addr..addr+len). addr and len must 6004 * be MMU_PAGESIZE aligned. 
6005 */ 6006 void 6007 hat_unload(struct hat *sfmmup, caddr_t addr, size_t len, uint_t flags) 6008 { 6009 if (sfmmup->sfmmu_xhat_provider) { 6010 XHAT_UNLOAD(sfmmup, addr, len, flags); 6011 return; 6012 } 6013 hat_unload_callback(sfmmup, addr, len, flags, NULL); 6014 } 6015 6016 6017 /* 6018 * Find the largest mapping size for this page. 6019 */ 6020 int 6021 fnd_mapping_sz(page_t *pp) 6022 { 6023 int sz; 6024 int p_index; 6025 6026 p_index = PP_MAPINDEX(pp); 6027 6028 sz = 0; 6029 p_index >>= 1; /* don't care about 8K bit */ 6030 for (; p_index; p_index >>= 1) { 6031 sz++; 6032 } 6033 6034 return (sz); 6035 } 6036 6037 /* 6038 * This function unloads a range of addresses for an hmeblk. 6039 * It returns the next address to be unloaded. 6040 * It should be called with the hash lock held. 6041 */ 6042 static caddr_t 6043 sfmmu_hblk_unload(struct hat *sfmmup, struct hme_blk *hmeblkp, caddr_t addr, 6044 caddr_t endaddr, demap_range_t *dmrp, uint_t flags) 6045 { 6046 tte_t tte, ttemod; 6047 struct sf_hment *sfhmep; 6048 int ttesz; 6049 long ttecnt; 6050 page_t *pp; 6051 kmutex_t *pml; 6052 int ret; 6053 int use_demap_range; 6054 6055 ASSERT(in_hblk_range(hmeblkp, addr)); 6056 ASSERT(!hmeblkp->hblk_shw_bit); 6057 ASSERT(sfmmup != NULL || hmeblkp->hblk_shared); 6058 ASSERT(sfmmup == NULL || !hmeblkp->hblk_shared); 6059 ASSERT(dmrp == NULL || !hmeblkp->hblk_shared); 6060 6061 #ifdef DEBUG 6062 if (get_hblk_ttesz(hmeblkp) != TTE8K && 6063 (endaddr < get_hblk_endaddr(hmeblkp))) { 6064 panic("sfmmu_hblk_unload: partial unload of large page"); 6065 } 6066 #endif /* DEBUG */ 6067 6068 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp)); 6069 ttesz = get_hblk_ttesz(hmeblkp); 6070 6071 use_demap_range = ((dmrp == NULL) || 6072 (TTEBYTES(ttesz) == DEMAP_RANGE_PGSZ(dmrp))); 6073 6074 if (use_demap_range) { 6075 DEMAP_RANGE_CONTINUE(dmrp, addr, endaddr); 6076 } else { 6077 DEMAP_RANGE_FLUSH(dmrp); 6078 } 6079 ttecnt = 0; 6080 HBLKTOHME(sfhmep, hmeblkp, addr); 6081 6082 while (addr < 
endaddr) { 6083 pml = NULL; 6084 sfmmu_copytte(&sfhmep->hme_tte, &tte); 6085 if (TTE_IS_VALID(&tte)) { 6086 pp = sfhmep->hme_page; 6087 if (pp != NULL) { 6088 pml = sfmmu_mlist_enter(pp); 6089 } 6090 6091 /* 6092 * Verify if hme still points to 'pp' now that 6093 * we have p_mapping lock. 6094 */ 6095 if (sfhmep->hme_page != pp) { 6096 if (pp != NULL && sfhmep->hme_page != NULL) { 6097 ASSERT(pml != NULL); 6098 sfmmu_mlist_exit(pml); 6099 /* Re-start this iteration. */ 6100 continue; 6101 } 6102 ASSERT((pp != NULL) && 6103 (sfhmep->hme_page == NULL)); 6104 goto tte_unloaded; 6105 } 6106 6107 /* 6108 * This point on we have both HASH and p_mapping 6109 * lock. 6110 */ 6111 ASSERT(pp == sfhmep->hme_page); 6112 ASSERT(pp == NULL || sfmmu_mlist_held(pp)); 6113 6114 /* 6115 * We need to loop on modify tte because it is 6116 * possible for pagesync to come along and 6117 * change the software bits beneath us. 6118 * 6119 * Page_unload can also invalidate the tte after 6120 * we read tte outside of p_mapping lock. 6121 */ 6122 again: 6123 ttemod = tte; 6124 6125 TTE_SET_INVALID(&ttemod); 6126 ret = sfmmu_modifytte_try(&tte, &ttemod, 6127 &sfhmep->hme_tte); 6128 6129 if (ret <= 0) { 6130 if (TTE_IS_VALID(&tte)) { 6131 ASSERT(ret < 0); 6132 goto again; 6133 } 6134 if (pp != NULL) { 6135 panic("sfmmu_hblk_unload: pp = 0x%p " 6136 "tte became invalid under mlist" 6137 " lock = 0x%p", (void *)pp, 6138 (void *)pml); 6139 } 6140 continue; 6141 } 6142 6143 if (!(flags & HAT_UNLOAD_NOSYNC)) { 6144 sfmmu_ttesync(sfmmup, addr, &tte, pp); 6145 } 6146 6147 /* 6148 * Ok- we invalidated the tte. Do the rest of the job. 
6149 */ 6150 ttecnt++; 6151 6152 if (flags & HAT_UNLOAD_UNLOCK) { 6153 ASSERT(hmeblkp->hblk_lckcnt > 0); 6154 atomic_add_32(&hmeblkp->hblk_lckcnt, -1); 6155 HBLK_STACK_TRACE(hmeblkp, HBLK_UNLOCK); 6156 } 6157 6158 /* 6159 * Normally we would need to flush the page 6160 * from the virtual cache at this point in 6161 * order to prevent a potential cache alias 6162 * inconsistency. 6163 * The particular scenario we need to worry 6164 * about is: 6165 * Given: va1 and va2 are two virtual address 6166 * that alias and map the same physical 6167 * address. 6168 * 1. mapping exists from va1 to pa and data 6169 * has been read into the cache. 6170 * 2. unload va1. 6171 * 3. load va2 and modify data using va2. 6172 * 4 unload va2. 6173 * 5. load va1 and reference data. Unless we 6174 * flush the data cache when we unload we will 6175 * get stale data. 6176 * Fortunately, page coloring eliminates the 6177 * above scenario by remembering the color a 6178 * physical page was last or is currently 6179 * mapped to. Now, we delay the flush until 6180 * the loading of translations. Only when the 6181 * new translation is of a different color 6182 * are we forced to flush. 6183 */ 6184 if (use_demap_range) { 6185 /* 6186 * Mark this page as needing a demap. 
6187 */ 6188 DEMAP_RANGE_MARKPG(dmrp, addr); 6189 } else { 6190 ASSERT(sfmmup != NULL); 6191 ASSERT(!hmeblkp->hblk_shared); 6192 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 6193 sfmmup->sfmmu_free, 0); 6194 } 6195 6196 if (pp) { 6197 /* 6198 * Remove the hment from the mapping list 6199 */ 6200 ASSERT(hmeblkp->hblk_hmecnt > 0); 6201 6202 /* 6203 * Again, we cannot 6204 * ASSERT(hmeblkp->hblk_hmecnt <= NHMENTS); 6205 */ 6206 HME_SUB(sfhmep, pp); 6207 membar_stst(); 6208 atomic_add_16(&hmeblkp->hblk_hmecnt, -1); 6209 } 6210 6211 ASSERT(hmeblkp->hblk_vcnt > 0); 6212 atomic_add_16(&hmeblkp->hblk_vcnt, -1); 6213 6214 ASSERT(hmeblkp->hblk_hmecnt || hmeblkp->hblk_vcnt || 6215 !hmeblkp->hblk_lckcnt); 6216 6217 #ifdef VAC 6218 if (pp && (pp->p_nrm & (P_KPMC | P_KPMS | P_TNC))) { 6219 if (PP_ISTNC(pp)) { 6220 /* 6221 * If page was temporary 6222 * uncached, try to recache 6223 * it. Note that HME_SUB() was 6224 * called above so p_index and 6225 * mlist had been updated. 6226 */ 6227 conv_tnc(pp, ttesz); 6228 } else if (pp->p_mapping == NULL) { 6229 ASSERT(kpm_enable); 6230 /* 6231 * Page is marked to be in VAC conflict 6232 * to an existing kpm mapping and/or is 6233 * kpm mapped using only the regular 6234 * pagesize. 6235 */ 6236 sfmmu_kpm_hme_unload(pp); 6237 } 6238 } 6239 #endif /* VAC */ 6240 } else if ((pp = sfhmep->hme_page) != NULL) { 6241 /* 6242 * TTE is invalid but the hme 6243 * still exists. let pageunload 6244 * complete its job. 6245 */ 6246 ASSERT(pml == NULL); 6247 pml = sfmmu_mlist_enter(pp); 6248 if (sfhmep->hme_page != NULL) { 6249 sfmmu_mlist_exit(pml); 6250 continue; 6251 } 6252 ASSERT(sfhmep->hme_page == NULL); 6253 } else if (hmeblkp->hblk_hmecnt != 0) { 6254 /* 6255 * pageunload may have not finished decrementing 6256 * hblk_vcnt and hblk_hmecnt. Find page_t if any and 6257 * wait for pageunload to finish. Rely on pageunload 6258 * to decrement hblk_hmecnt after hblk_vcnt. 
6259 */ 6260 pfn_t pfn = TTE_TO_TTEPFN(&tte); 6261 ASSERT(pml == NULL); 6262 if (pf_is_memory(pfn)) { 6263 pp = page_numtopp_nolock(pfn); 6264 if (pp != NULL) { 6265 pml = sfmmu_mlist_enter(pp); 6266 sfmmu_mlist_exit(pml); 6267 pml = NULL; 6268 } 6269 } 6270 } 6271 6272 tte_unloaded: 6273 /* 6274 * At this point, the tte we are looking at 6275 * should be unloaded, and hme has been unlinked 6276 * from page too. This is important because in 6277 * pageunload, it does ttesync() then HME_SUB. 6278 * We need to make sure HME_SUB has been completed 6279 * so we know ttesync() has been completed. Otherwise, 6280 * at exit time, after return from hat layer, VM will 6281 * release as structure which hat_setstat() (called 6282 * by ttesync()) needs. 6283 */ 6284 #ifdef DEBUG 6285 { 6286 tte_t dtte; 6287 6288 ASSERT(sfhmep->hme_page == NULL); 6289 6290 sfmmu_copytte(&sfhmep->hme_tte, &dtte); 6291 ASSERT(!TTE_IS_VALID(&dtte)); 6292 } 6293 #endif 6294 6295 if (pml) { 6296 sfmmu_mlist_exit(pml); 6297 } 6298 6299 addr += TTEBYTES(ttesz); 6300 sfhmep++; 6301 DEMAP_RANGE_NEXTPG(dmrp); 6302 } 6303 /* 6304 * For shared hmeblks this routine is only called when region is freed 6305 * and no longer referenced. So no need to decrement ttecnt 6306 * in the region structure here. 6307 */ 6308 if (ttecnt > 0 && sfmmup != NULL) { 6309 atomic_add_long(&sfmmup->sfmmu_ttecnt[ttesz], -ttecnt); 6310 } 6311 return (addr); 6312 } 6313 6314 /* 6315 * Invalidate a virtual address range for the local CPU. 6316 * For best performance ensure that the va range is completely 6317 * mapped, otherwise the entire TLB will be flushed. 
6318 */ 6319 void 6320 hat_flush_range(struct hat *sfmmup, caddr_t va, size_t size) 6321 { 6322 ssize_t sz; 6323 caddr_t endva = va + size; 6324 6325 while (va < endva) { 6326 sz = hat_getpagesize(sfmmup, va); 6327 if (sz < 0) { 6328 vtag_flushall(); 6329 break; 6330 } 6331 vtag_flushpage(va, (uint64_t)sfmmup); 6332 va += sz; 6333 } 6334 } 6335 6336 /* 6337 * Synchronize all the mappings in the range [addr..addr+len). 6338 * Can be called with clearflag having two states: 6339 * HAT_SYNC_DONTZERO means just return the rm stats 6340 * HAT_SYNC_ZERORM means zero rm bits in the tte and return the stats 6341 */ 6342 void 6343 hat_sync(struct hat *sfmmup, caddr_t addr, size_t len, uint_t clearflag) 6344 { 6345 struct hmehash_bucket *hmebp; 6346 hmeblk_tag hblktag; 6347 int hmeshift, hashno = 1; 6348 struct hme_blk *hmeblkp, *list = NULL; 6349 caddr_t endaddr; 6350 cpuset_t cpuset; 6351 6352 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 6353 ASSERT((sfmmup == ksfmmup) || 6354 AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 6355 ASSERT((len & MMU_PAGEOFFSET) == 0); 6356 ASSERT((clearflag == HAT_SYNC_DONTZERO) || 6357 (clearflag == HAT_SYNC_ZERORM)); 6358 6359 CPUSET_ZERO(cpuset); 6360 6361 endaddr = addr + len; 6362 hblktag.htag_id = sfmmup; 6363 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 6364 6365 /* 6366 * Spitfire supports 4 page sizes. 6367 * Most pages are expected to be of the smallest page 6368 * size (8K) and these will not need to be rehashed. 64K 6369 * pages also don't need to be rehashed because the an hmeblk 6370 * spans 64K of address space. 512K pages might need 1 rehash and 6371 * and 4M pages 2 rehashes. 
6372 */ 6373 while (addr < endaddr) { 6374 hmeshift = HME_HASH_SHIFT(hashno); 6375 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 6376 hblktag.htag_rehash = hashno; 6377 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift); 6378 6379 SFMMU_HASH_LOCK(hmebp); 6380 6381 HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list); 6382 if (hmeblkp != NULL) { 6383 ASSERT(!hmeblkp->hblk_shared); 6384 /* 6385 * We've encountered a shadow hmeblk so skip the range 6386 * of the next smaller mapping size. 6387 */ 6388 if (hmeblkp->hblk_shw_bit) { 6389 ASSERT(sfmmup != ksfmmup); 6390 ASSERT(hashno > 1); 6391 addr = (caddr_t)P2END((uintptr_t)addr, 6392 TTEBYTES(hashno - 1)); 6393 } else { 6394 addr = sfmmu_hblk_sync(sfmmup, hmeblkp, 6395 addr, endaddr, clearflag); 6396 } 6397 SFMMU_HASH_UNLOCK(hmebp); 6398 hashno = 1; 6399 continue; 6400 } 6401 SFMMU_HASH_UNLOCK(hmebp); 6402 6403 if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) { 6404 /* 6405 * We have traversed the whole list and rehashed 6406 * if necessary without finding the address to sync. 6407 * This is ok so we increment the address by the 6408 * smallest hmeblk range for kernel mappings and the 6409 * largest hmeblk range, to account for shadow hmeblks, 6410 * for user mappings and continue. 
6411 */ 6412 if (sfmmup == ksfmmup) 6413 addr = (caddr_t)P2END((uintptr_t)addr, 6414 TTEBYTES(1)); 6415 else 6416 addr = (caddr_t)P2END((uintptr_t)addr, 6417 TTEBYTES(hashno)); 6418 hashno = 1; 6419 } else { 6420 hashno++; 6421 } 6422 } 6423 sfmmu_hblks_list_purge(&list, 0); 6424 cpuset = sfmmup->sfmmu_cpusran; 6425 xt_sync(cpuset); 6426 } 6427 6428 static caddr_t 6429 sfmmu_hblk_sync(struct hat *sfmmup, struct hme_blk *hmeblkp, caddr_t addr, 6430 caddr_t endaddr, int clearflag) 6431 { 6432 tte_t tte, ttemod; 6433 struct sf_hment *sfhmep; 6434 int ttesz; 6435 struct page *pp; 6436 kmutex_t *pml; 6437 int ret; 6438 6439 ASSERT(hmeblkp->hblk_shw_bit == 0); 6440 ASSERT(!hmeblkp->hblk_shared); 6441 6442 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp)); 6443 6444 ttesz = get_hblk_ttesz(hmeblkp); 6445 HBLKTOHME(sfhmep, hmeblkp, addr); 6446 6447 while (addr < endaddr) { 6448 sfmmu_copytte(&sfhmep->hme_tte, &tte); 6449 if (TTE_IS_VALID(&tte)) { 6450 pml = NULL; 6451 pp = sfhmep->hme_page; 6452 if (pp) { 6453 pml = sfmmu_mlist_enter(pp); 6454 } 6455 if (pp != sfhmep->hme_page) { 6456 /* 6457 * tte most have been unloaded 6458 * underneath us. Recheck 6459 */ 6460 ASSERT(pml); 6461 sfmmu_mlist_exit(pml); 6462 continue; 6463 } 6464 6465 ASSERT(pp == NULL || sfmmu_mlist_held(pp)); 6466 6467 if (clearflag == HAT_SYNC_ZERORM) { 6468 ttemod = tte; 6469 TTE_CLR_RM(&ttemod); 6470 ret = sfmmu_modifytte_try(&tte, &ttemod, 6471 &sfhmep->hme_tte); 6472 if (ret < 0) { 6473 if (pml) { 6474 sfmmu_mlist_exit(pml); 6475 } 6476 continue; 6477 } 6478 6479 if (ret > 0) { 6480 sfmmu_tlb_demap(addr, sfmmup, 6481 hmeblkp, 0, 0); 6482 } 6483 } 6484 sfmmu_ttesync(sfmmup, addr, &tte, pp); 6485 if (pml) { 6486 sfmmu_mlist_exit(pml); 6487 } 6488 } 6489 addr += TTEBYTES(ttesz); 6490 sfhmep++; 6491 } 6492 return (addr); 6493 } 6494 6495 /* 6496 * This function will sync a tte to the page struct and it will 6497 * update the hat stats. 
Currently it allows us to pass a NULL pp 6498 * and we will simply update the stats. We may want to change this 6499 * so we only keep stats for pages backed by pp's. 6500 */ 6501 static void 6502 sfmmu_ttesync(struct hat *sfmmup, caddr_t addr, tte_t *ttep, page_t *pp) 6503 { 6504 uint_t rm = 0; 6505 int sz; 6506 pgcnt_t npgs; 6507 6508 ASSERT(TTE_IS_VALID(ttep)); 6509 6510 if (TTE_IS_NOSYNC(ttep)) { 6511 return; 6512 } 6513 6514 if (TTE_IS_REF(ttep)) { 6515 rm = P_REF; 6516 } 6517 if (TTE_IS_MOD(ttep)) { 6518 rm |= P_MOD; 6519 } 6520 6521 if (rm == 0) { 6522 return; 6523 } 6524 6525 sz = TTE_CSZ(ttep); 6526 if (sfmmup != NULL && sfmmup->sfmmu_rmstat) { 6527 int i; 6528 caddr_t vaddr = addr; 6529 6530 for (i = 0; i < TTEPAGES(sz); i++, vaddr += MMU_PAGESIZE) { 6531 hat_setstat(sfmmup->sfmmu_as, vaddr, MMU_PAGESIZE, rm); 6532 } 6533 6534 } 6535 6536 /* 6537 * XXX I want to use cas to update nrm bits but they 6538 * currently belong in common/vm and not in hat where 6539 * they should be. 6540 * The nrm bits are protected by the same mutex as 6541 * the one that protects the page's mapping list. 6542 */ 6543 if (!pp) 6544 return; 6545 ASSERT(sfmmu_mlist_held(pp)); 6546 /* 6547 * If the tte is for a large page, we need to sync all the 6548 * pages covered by the tte. 6549 */ 6550 if (sz != TTE8K) { 6551 ASSERT(pp->p_szc != 0); 6552 pp = PP_GROUPLEADER(pp, sz); 6553 ASSERT(sfmmu_mlist_held(pp)); 6554 } 6555 6556 /* Get number of pages from tte size. */ 6557 npgs = TTEPAGES(sz); 6558 6559 do { 6560 ASSERT(pp); 6561 ASSERT(sfmmu_mlist_held(pp)); 6562 if (((rm & P_REF) != 0 && !PP_ISREF(pp)) || 6563 ((rm & P_MOD) != 0 && !PP_ISMOD(pp))) 6564 hat_page_setattr(pp, rm); 6565 6566 /* 6567 * Are we done? If not, we must have a large mapping. 6568 * For large mappings we need to sync the rest of the pages 6569 * covered by this tte; goto the next page. 
6570 */ 6571 } while (--npgs > 0 && (pp = PP_PAGENEXT(pp))); 6572 } 6573 6574 /* 6575 * Execute pre-callback handler of each pa_hment linked to pp 6576 * 6577 * Inputs: 6578 * flag: either HAT_PRESUSPEND or HAT_SUSPEND. 6579 * capture_cpus: pointer to return value (below) 6580 * 6581 * Returns: 6582 * Propagates the subsystem callback return values back to the caller; 6583 * returns 0 on success. If capture_cpus is non-NULL, the value returned 6584 * is zero if all of the pa_hments are of a type that do not require 6585 * capturing CPUs prior to suspending the mapping, else it is 1. 6586 */ 6587 static int 6588 hat_pageprocess_precallbacks(struct page *pp, uint_t flag, int *capture_cpus) 6589 { 6590 struct sf_hment *sfhmep; 6591 struct pa_hment *pahmep; 6592 int (*f)(caddr_t, uint_t, uint_t, void *); 6593 int ret; 6594 id_t id; 6595 int locked = 0; 6596 kmutex_t *pml; 6597 6598 ASSERT(PAGE_EXCL(pp)); 6599 if (!sfmmu_mlist_held(pp)) { 6600 pml = sfmmu_mlist_enter(pp); 6601 locked = 1; 6602 } 6603 6604 if (capture_cpus) 6605 *capture_cpus = 0; 6606 6607 top: 6608 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) { 6609 /* 6610 * skip sf_hments corresponding to VA<->PA mappings; 6611 * for pa_hment's, hme_tte.ll is zero 6612 */ 6613 if (!IS_PAHME(sfhmep)) 6614 continue; 6615 6616 pahmep = sfhmep->hme_data; 6617 ASSERT(pahmep != NULL); 6618 6619 /* 6620 * skip if pre-handler has been called earlier in this loop 6621 */ 6622 if (pahmep->flags & flag) 6623 continue; 6624 6625 id = pahmep->cb_id; 6626 ASSERT(id >= (id_t)0 && id < sfmmu_cb_nextid); 6627 if (capture_cpus && sfmmu_cb_table[id].capture_cpus != 0) 6628 *capture_cpus = 1; 6629 if ((f = sfmmu_cb_table[id].prehandler) == NULL) { 6630 pahmep->flags |= flag; 6631 continue; 6632 } 6633 6634 /* 6635 * Drop the mapping list lock to avoid locking order issues. 
6636 */ 6637 if (locked) 6638 sfmmu_mlist_exit(pml); 6639 6640 ret = f(pahmep->addr, pahmep->len, flag, pahmep->pvt); 6641 if (ret != 0) 6642 return (ret); /* caller must do the cleanup */ 6643 6644 if (locked) { 6645 pml = sfmmu_mlist_enter(pp); 6646 pahmep->flags |= flag; 6647 goto top; 6648 } 6649 6650 pahmep->flags |= flag; 6651 } 6652 6653 if (locked) 6654 sfmmu_mlist_exit(pml); 6655 6656 return (0); 6657 } 6658 6659 /* 6660 * Execute post-callback handler of each pa_hment linked to pp 6661 * 6662 * Same overall assumptions and restrictions apply as for 6663 * hat_pageprocess_precallbacks(). 6664 */ 6665 static void 6666 hat_pageprocess_postcallbacks(struct page *pp, uint_t flag) 6667 { 6668 pfn_t pgpfn = pp->p_pagenum; 6669 pfn_t pgmask = btop(page_get_pagesize(pp->p_szc)) - 1; 6670 pfn_t newpfn; 6671 struct sf_hment *sfhmep; 6672 struct pa_hment *pahmep; 6673 int (*f)(caddr_t, uint_t, uint_t, void *, pfn_t); 6674 id_t id; 6675 int locked = 0; 6676 kmutex_t *pml; 6677 6678 ASSERT(PAGE_EXCL(pp)); 6679 if (!sfmmu_mlist_held(pp)) { 6680 pml = sfmmu_mlist_enter(pp); 6681 locked = 1; 6682 } 6683 6684 top: 6685 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) { 6686 /* 6687 * skip sf_hments corresponding to VA<->PA mappings; 6688 * for pa_hment's, hme_tte.ll is zero 6689 */ 6690 if (!IS_PAHME(sfhmep)) 6691 continue; 6692 6693 pahmep = sfhmep->hme_data; 6694 ASSERT(pahmep != NULL); 6695 6696 if ((pahmep->flags & flag) == 0) 6697 continue; 6698 6699 pahmep->flags &= ~flag; 6700 6701 id = pahmep->cb_id; 6702 ASSERT(id >= (id_t)0 && id < sfmmu_cb_nextid); 6703 if ((f = sfmmu_cb_table[id].posthandler) == NULL) 6704 continue; 6705 6706 /* 6707 * Convert the base page PFN into the constituent PFN 6708 * which is needed by the callback handler. 6709 */ 6710 newpfn = pgpfn | (btop((uintptr_t)pahmep->addr) & pgmask); 6711 6712 /* 6713 * Drop the mapping list lock to avoid locking order issues. 
6714 */ 6715 if (locked) 6716 sfmmu_mlist_exit(pml); 6717 6718 if (f(pahmep->addr, pahmep->len, flag, pahmep->pvt, newpfn) 6719 != 0) 6720 panic("sfmmu: posthandler failed"); 6721 6722 if (locked) { 6723 pml = sfmmu_mlist_enter(pp); 6724 goto top; 6725 } 6726 } 6727 6728 if (locked) 6729 sfmmu_mlist_exit(pml); 6730 } 6731 6732 /* 6733 * Suspend locked kernel mapping 6734 */ 6735 void 6736 hat_pagesuspend(struct page *pp) 6737 { 6738 struct sf_hment *sfhmep; 6739 sfmmu_t *sfmmup; 6740 tte_t tte, ttemod; 6741 struct hme_blk *hmeblkp; 6742 caddr_t addr; 6743 int index, cons; 6744 cpuset_t cpuset; 6745 6746 ASSERT(PAGE_EXCL(pp)); 6747 ASSERT(sfmmu_mlist_held(pp)); 6748 6749 mutex_enter(&kpr_suspendlock); 6750 6751 /* 6752 * We're about to suspend a kernel mapping so mark this thread as 6753 * non-traceable by DTrace. This prevents us from running into issues 6754 * with probe context trying to touch a suspended page 6755 * in the relocation codepath itself. 6756 */ 6757 curthread->t_flag |= T_DONTDTRACE; 6758 6759 index = PP_MAPINDEX(pp); 6760 cons = TTE8K; 6761 6762 retry: 6763 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) { 6764 6765 if (IS_PAHME(sfhmep)) 6766 continue; 6767 6768 if (get_hblk_ttesz(sfmmu_hmetohblk(sfhmep)) != cons) 6769 continue; 6770 6771 /* 6772 * Loop until we successfully set the suspend bit in 6773 * the TTE. 
6774 */ 6775 again: 6776 sfmmu_copytte(&sfhmep->hme_tte, &tte); 6777 ASSERT(TTE_IS_VALID(&tte)); 6778 6779 ttemod = tte; 6780 TTE_SET_SUSPEND(&ttemod); 6781 if (sfmmu_modifytte_try(&tte, &ttemod, 6782 &sfhmep->hme_tte) < 0) 6783 goto again; 6784 6785 /* 6786 * Invalidate TSB entry 6787 */ 6788 hmeblkp = sfmmu_hmetohblk(sfhmep); 6789 6790 sfmmup = hblktosfmmu(hmeblkp); 6791 ASSERT(sfmmup == ksfmmup); 6792 ASSERT(!hmeblkp->hblk_shared); 6793 6794 addr = tte_to_vaddr(hmeblkp, tte); 6795 6796 /* 6797 * No need to make sure that the TSB for this sfmmu is 6798 * not being relocated since it is ksfmmup and thus it 6799 * will never be relocated. 6800 */ 6801 SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, 0); 6802 6803 /* 6804 * Update xcall stats 6805 */ 6806 cpuset = cpu_ready_set; 6807 CPUSET_DEL(cpuset, CPU->cpu_id); 6808 6809 /* LINTED: constant in conditional context */ 6810 SFMMU_XCALL_STATS(ksfmmup); 6811 6812 /* 6813 * Flush TLB entry on remote CPU's 6814 */ 6815 xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)addr, 6816 (uint64_t)ksfmmup); 6817 xt_sync(cpuset); 6818 6819 /* 6820 * Flush TLB entry on local CPU 6821 */ 6822 vtag_flushpage(addr, (uint64_t)ksfmmup); 6823 } 6824 6825 while (index != 0) { 6826 index = index >> 1; 6827 if (index != 0) 6828 cons++; 6829 if (index & 0x1) { 6830 pp = PP_GROUPLEADER(pp, cons); 6831 goto retry; 6832 } 6833 } 6834 } 6835 6836 #ifdef DEBUG 6837 6838 #define N_PRLE 1024 6839 struct prle { 6840 page_t *targ; 6841 page_t *repl; 6842 int status; 6843 int pausecpus; 6844 hrtime_t whence; 6845 }; 6846 6847 static struct prle page_relocate_log[N_PRLE]; 6848 static int prl_entry; 6849 static kmutex_t prl_mutex; 6850 6851 #define PAGE_RELOCATE_LOG(t, r, s, p) \ 6852 mutex_enter(&prl_mutex); \ 6853 page_relocate_log[prl_entry].targ = *(t); \ 6854 page_relocate_log[prl_entry].repl = *(r); \ 6855 page_relocate_log[prl_entry].status = (s); \ 6856 page_relocate_log[prl_entry].pausecpus = (p); \ 6857 page_relocate_log[prl_entry].whence = gethrtime(); 
\ 6858 prl_entry = (prl_entry == (N_PRLE - 1))? 0 : prl_entry + 1; \ 6859 mutex_exit(&prl_mutex); 6860 6861 #else /* !DEBUG */ 6862 #define PAGE_RELOCATE_LOG(t, r, s, p) 6863 #endif 6864 6865 /* 6866 * Core Kernel Page Relocation Algorithm 6867 * 6868 * Input: 6869 * 6870 * target : constituent pages are SE_EXCL locked. 6871 * replacement: constituent pages are SE_EXCL locked. 6872 * 6873 * Output: 6874 * 6875 * nrelocp: number of pages relocated 6876 */ 6877 int 6878 hat_page_relocate(page_t **target, page_t **replacement, spgcnt_t *nrelocp) 6879 { 6880 page_t *targ, *repl; 6881 page_t *tpp, *rpp; 6882 kmutex_t *low, *high; 6883 spgcnt_t npages, i; 6884 page_t *pl = NULL; 6885 int old_pil; 6886 cpuset_t cpuset; 6887 int cap_cpus; 6888 int ret; 6889 #ifdef VAC 6890 int cflags = 0; 6891 #endif 6892 6893 if (hat_kpr_enabled == 0 || !kcage_on || PP_ISNORELOC(*target)) { 6894 PAGE_RELOCATE_LOG(target, replacement, EAGAIN, -1); 6895 return (EAGAIN); 6896 } 6897 6898 mutex_enter(&kpr_mutex); 6899 kreloc_thread = curthread; 6900 6901 targ = *target; 6902 repl = *replacement; 6903 ASSERT(repl != NULL); 6904 ASSERT(targ->p_szc == repl->p_szc); 6905 6906 npages = page_get_pagecnt(targ->p_szc); 6907 6908 /* 6909 * unload VA<->PA mappings that are not locked 6910 */ 6911 tpp = targ; 6912 for (i = 0; i < npages; i++) { 6913 (void) hat_pageunload(tpp, SFMMU_KERNEL_RELOC); 6914 tpp++; 6915 } 6916 6917 /* 6918 * Do "presuspend" callbacks, in a context from which we can still 6919 * block as needed. Note that we don't hold the mapping list lock 6920 * of "targ" at this point due to potential locking order issues; 6921 * we assume that between the hat_pageunload() above and holding 6922 * the SE_EXCL lock that the mapping list *cannot* change at this 6923 * point. 6924 */ 6925 ret = hat_pageprocess_precallbacks(targ, HAT_PRESUSPEND, &cap_cpus); 6926 if (ret != 0) { 6927 /* 6928 * EIO translates to fatal error, for all others cleanup 6929 * and return EAGAIN. 
6930 */ 6931 ASSERT(ret != EIO); 6932 hat_pageprocess_postcallbacks(targ, HAT_POSTUNSUSPEND); 6933 PAGE_RELOCATE_LOG(target, replacement, ret, -1); 6934 kreloc_thread = NULL; 6935 mutex_exit(&kpr_mutex); 6936 return (EAGAIN); 6937 } 6938 6939 /* 6940 * acquire p_mapping list lock for both the target and replacement 6941 * root pages. 6942 * 6943 * low and high refer to the need to grab the mlist locks in a 6944 * specific order in order to prevent race conditions. Thus the 6945 * lower lock must be grabbed before the higher lock. 6946 * 6947 * This will block hat_unload's accessing p_mapping list. Since 6948 * we have SE_EXCL lock, hat_memload and hat_pageunload will be 6949 * blocked. Thus, no one else will be accessing the p_mapping list 6950 * while we suspend and reload the locked mapping below. 6951 */ 6952 tpp = targ; 6953 rpp = repl; 6954 sfmmu_mlist_reloc_enter(tpp, rpp, &low, &high); 6955 6956 kpreempt_disable(); 6957 6958 /* 6959 * We raise our PIL to 13 so that we don't get captured by 6960 * another CPU or pinned by an interrupt thread. We can't go to 6961 * PIL 14 since the nexus driver(s) may need to interrupt at 6962 * that level in the case of IOMMU pseudo mappings. 6963 */ 6964 cpuset = cpu_ready_set; 6965 CPUSET_DEL(cpuset, CPU->cpu_id); 6966 if (!cap_cpus || CPUSET_ISNULL(cpuset)) { 6967 old_pil = splr(XCALL_PIL); 6968 } else { 6969 old_pil = -1; 6970 xc_attention(cpuset); 6971 } 6972 ASSERT(getpil() == XCALL_PIL); 6973 6974 /* 6975 * Now do suspend callbacks. In the case of an IOMMU mapping 6976 * this will suspend all DMA activity to the page while it is 6977 * being relocated. Since we are well above LOCK_LEVEL and CPUs 6978 * may be captured at this point we should have acquired any needed 6979 * locks in the presuspend callback. 
6980 */ 6981 ret = hat_pageprocess_precallbacks(targ, HAT_SUSPEND, NULL); 6982 if (ret != 0) { 6983 repl = targ; 6984 goto suspend_fail; 6985 } 6986 6987 /* 6988 * Raise the PIL yet again, this time to block all high-level 6989 * interrupts on this CPU. This is necessary to prevent an 6990 * interrupt routine from pinning the thread which holds the 6991 * mapping suspended and then touching the suspended page. 6992 * 6993 * Once the page is suspended we also need to be careful to 6994 * avoid calling any functions which touch any seg_kmem memory 6995 * since that memory may be backed by the very page we are 6996 * relocating in here! 6997 */ 6998 hat_pagesuspend(targ); 6999 7000 /* 7001 * Now that we are confident everybody has stopped using this page, 7002 * copy the page contents. Note we use a physical copy to prevent 7003 * locking issues and to avoid fpRAS because we can't handle it in 7004 * this context. 7005 */ 7006 for (i = 0; i < npages; i++, tpp++, rpp++) { 7007 #ifdef VAC 7008 /* 7009 * If the replacement has a different vcolor than 7010 * the one being replacd, we need to handle VAC 7011 * consistency for it just as we were setting up 7012 * a new mapping to it. 7013 */ 7014 if ((PP_GET_VCOLOR(rpp) != NO_VCOLOR) && 7015 (tpp->p_vcolor != rpp->p_vcolor) && 7016 !CacheColor_IsFlushed(cflags, PP_GET_VCOLOR(rpp))) { 7017 CacheColor_SetFlushed(cflags, PP_GET_VCOLOR(rpp)); 7018 sfmmu_cache_flushcolor(PP_GET_VCOLOR(rpp), 7019 rpp->p_pagenum); 7020 } 7021 #endif 7022 /* 7023 * Copy the contents of the page. 7024 */ 7025 ppcopy_kernel(tpp, rpp); 7026 } 7027 7028 tpp = targ; 7029 rpp = repl; 7030 for (i = 0; i < npages; i++, tpp++, rpp++) { 7031 /* 7032 * Copy attributes. VAC consistency was handled above, 7033 * if required. 
7034 */ 7035 rpp->p_nrm = tpp->p_nrm; 7036 tpp->p_nrm = 0; 7037 rpp->p_index = tpp->p_index; 7038 tpp->p_index = 0; 7039 #ifdef VAC 7040 rpp->p_vcolor = tpp->p_vcolor; 7041 #endif 7042 } 7043 7044 /* 7045 * First, unsuspend the page, if we set the suspend bit, and transfer 7046 * the mapping list from the target page to the replacement page. 7047 * Next process postcallbacks; since pa_hment's are linked only to the 7048 * p_mapping list of root page, we don't iterate over the constituent 7049 * pages. 7050 */ 7051 hat_pagereload(targ, repl); 7052 7053 suspend_fail: 7054 hat_pageprocess_postcallbacks(repl, HAT_UNSUSPEND); 7055 7056 /* 7057 * Now lower our PIL and release any captured CPUs since we 7058 * are out of the "danger zone". After this it will again be 7059 * safe to acquire adaptive mutex locks, or to drop them... 7060 */ 7061 if (old_pil != -1) { 7062 splx(old_pil); 7063 } else { 7064 xc_dismissed(cpuset); 7065 } 7066 7067 kpreempt_enable(); 7068 7069 sfmmu_mlist_reloc_exit(low, high); 7070 7071 /* 7072 * Postsuspend callbacks should drop any locks held across 7073 * the suspend callbacks. As before, we don't hold the mapping 7074 * list lock at this point.. our assumption is that the mapping 7075 * list still can't change due to our holding SE_EXCL lock and 7076 * there being no unlocked mappings left. Hence the restriction 7077 * on calling context to hat_delete_callback() 7078 */ 7079 hat_pageprocess_postcallbacks(repl, HAT_POSTUNSUSPEND); 7080 if (ret != 0) { 7081 /* 7082 * The second presuspend call failed: we got here through 7083 * the suspend_fail label above. 
7084 */ 7085 ASSERT(ret != EIO); 7086 PAGE_RELOCATE_LOG(target, replacement, ret, cap_cpus); 7087 kreloc_thread = NULL; 7088 mutex_exit(&kpr_mutex); 7089 return (EAGAIN); 7090 } 7091 7092 /* 7093 * Now that we're out of the performance critical section we can 7094 * take care of updating the hash table, since we still 7095 * hold all the pages locked SE_EXCL at this point we 7096 * needn't worry about things changing out from under us. 7097 */ 7098 tpp = targ; 7099 rpp = repl; 7100 for (i = 0; i < npages; i++, tpp++, rpp++) { 7101 7102 /* 7103 * replace targ with replacement in page_hash table 7104 */ 7105 targ = tpp; 7106 page_relocate_hash(rpp, targ); 7107 7108 /* 7109 * concatenate target; caller of platform_page_relocate() 7110 * expects target to be concatenated after returning. 7111 */ 7112 ASSERT(targ->p_next == targ); 7113 ASSERT(targ->p_prev == targ); 7114 page_list_concat(&pl, &targ); 7115 } 7116 7117 ASSERT(*target == pl); 7118 *nrelocp = npages; 7119 PAGE_RELOCATE_LOG(target, replacement, 0, cap_cpus); 7120 kreloc_thread = NULL; 7121 mutex_exit(&kpr_mutex); 7122 return (0); 7123 } 7124 7125 /* 7126 * Called when stray pa_hments are found attached to a page which is 7127 * being freed. Notify the subsystem which attached the pa_hment of 7128 * the error if it registered a suitable handler, else panic. 7129 */ 7130 static void 7131 sfmmu_pahment_leaked(struct pa_hment *pahmep) 7132 { 7133 id_t cb_id = pahmep->cb_id; 7134 7135 ASSERT(cb_id >= (id_t)0 && cb_id < sfmmu_cb_nextid); 7136 if (sfmmu_cb_table[cb_id].errhandler != NULL) { 7137 if (sfmmu_cb_table[cb_id].errhandler(pahmep->addr, pahmep->len, 7138 HAT_CB_ERR_LEAKED, pahmep->pvt) == 0) 7139 return; /* non-fatal */ 7140 } 7141 panic("pa_hment leaked: 0x%p", (void *)pahmep); 7142 } 7143 7144 /* 7145 * Remove all mappings to page 'pp'. 
7146 */ 7147 int 7148 hat_pageunload(struct page *pp, uint_t forceflag) 7149 { 7150 struct page *origpp = pp; 7151 struct sf_hment *sfhme, *tmphme; 7152 struct hme_blk *hmeblkp; 7153 kmutex_t *pml; 7154 #ifdef VAC 7155 kmutex_t *pmtx; 7156 #endif 7157 cpuset_t cpuset, tset; 7158 int index, cons; 7159 int xhme_blks; 7160 int pa_hments; 7161 7162 ASSERT(PAGE_EXCL(pp)); 7163 7164 retry_xhat: 7165 tmphme = NULL; 7166 xhme_blks = 0; 7167 pa_hments = 0; 7168 CPUSET_ZERO(cpuset); 7169 7170 pml = sfmmu_mlist_enter(pp); 7171 7172 #ifdef VAC 7173 if (pp->p_kpmref) 7174 sfmmu_kpm_pageunload(pp); 7175 ASSERT(!PP_ISMAPPED_KPM(pp)); 7176 #endif 7177 /* 7178 * Clear vpm reference. Since the page is exclusively locked 7179 * vpm cannot be referencing it. 7180 */ 7181 if (vpm_enable) { 7182 pp->p_vpmref = 0; 7183 } 7184 7185 index = PP_MAPINDEX(pp); 7186 cons = TTE8K; 7187 retry: 7188 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) { 7189 tmphme = sfhme->hme_next; 7190 7191 if (IS_PAHME(sfhme)) { 7192 ASSERT(sfhme->hme_data != NULL); 7193 pa_hments++; 7194 continue; 7195 } 7196 7197 hmeblkp = sfmmu_hmetohblk(sfhme); 7198 if (hmeblkp->hblk_xhat_bit) { 7199 struct xhat_hme_blk *xblk = 7200 (struct xhat_hme_blk *)hmeblkp; 7201 7202 (void) XHAT_PAGEUNLOAD(xblk->xhat_hme_blk_hat, 7203 pp, forceflag, XBLK2PROVBLK(xblk)); 7204 7205 xhme_blks = 1; 7206 continue; 7207 } 7208 7209 /* 7210 * If there are kernel mappings don't unload them, they will 7211 * be suspended. 
7212 */ 7213 if (forceflag == SFMMU_KERNEL_RELOC && hmeblkp->hblk_lckcnt && 7214 hmeblkp->hblk_tag.htag_id == ksfmmup) 7215 continue; 7216 7217 tset = sfmmu_pageunload(pp, sfhme, cons); 7218 CPUSET_OR(cpuset, tset); 7219 } 7220 7221 while (index != 0) { 7222 index = index >> 1; 7223 if (index != 0) 7224 cons++; 7225 if (index & 0x1) { 7226 /* Go to leading page */ 7227 pp = PP_GROUPLEADER(pp, cons); 7228 ASSERT(sfmmu_mlist_held(pp)); 7229 goto retry; 7230 } 7231 } 7232 7233 /* 7234 * cpuset may be empty if the page was only mapped by segkpm, 7235 * in which case we won't actually cross-trap. 7236 */ 7237 xt_sync(cpuset); 7238 7239 /* 7240 * The page should have no mappings at this point, unless 7241 * we were called from hat_page_relocate() in which case we 7242 * leave the locked mappings which will be suspended later. 7243 */ 7244 ASSERT(!PP_ISMAPPED(origpp) || xhme_blks || pa_hments || 7245 (forceflag == SFMMU_KERNEL_RELOC)); 7246 7247 #ifdef VAC 7248 if (PP_ISTNC(pp)) { 7249 if (cons == TTE8K) { 7250 pmtx = sfmmu_page_enter(pp); 7251 PP_CLRTNC(pp); 7252 sfmmu_page_exit(pmtx); 7253 } else { 7254 conv_tnc(pp, cons); 7255 } 7256 } 7257 #endif /* VAC */ 7258 7259 if (pa_hments && forceflag != SFMMU_KERNEL_RELOC) { 7260 /* 7261 * Unlink any pa_hments and free them, calling back 7262 * the responsible subsystem to notify it of the error. 7263 * This can occur in situations such as drivers leaking 7264 * DMA handles: naughty, but common enough that we'd like 7265 * to keep the system running rather than bringing it 7266 * down with an obscure error like "pa_hment leaked" 7267 * which doesn't aid the user in debugging their driver. 
7268 */ 7269 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) { 7270 tmphme = sfhme->hme_next; 7271 if (IS_PAHME(sfhme)) { 7272 struct pa_hment *pahmep = sfhme->hme_data; 7273 sfmmu_pahment_leaked(pahmep); 7274 HME_SUB(sfhme, pp); 7275 kmem_cache_free(pa_hment_cache, pahmep); 7276 } 7277 } 7278 7279 ASSERT(!PP_ISMAPPED(origpp) || xhme_blks); 7280 } 7281 7282 sfmmu_mlist_exit(pml); 7283 7284 /* 7285 * XHAT may not have finished unloading pages 7286 * because some other thread was waiting for 7287 * mlist lock and XHAT_PAGEUNLOAD let it do 7288 * the job. 7289 */ 7290 if (xhme_blks) { 7291 pp = origpp; 7292 goto retry_xhat; 7293 } 7294 7295 return (0); 7296 } 7297 7298 cpuset_t 7299 sfmmu_pageunload(page_t *pp, struct sf_hment *sfhme, int cons) 7300 { 7301 struct hme_blk *hmeblkp; 7302 sfmmu_t *sfmmup; 7303 tte_t tte, ttemod; 7304 #ifdef DEBUG 7305 tte_t orig_old; 7306 #endif /* DEBUG */ 7307 caddr_t addr; 7308 int ttesz; 7309 int ret; 7310 cpuset_t cpuset; 7311 7312 ASSERT(pp != NULL); 7313 ASSERT(sfmmu_mlist_held(pp)); 7314 ASSERT(!PP_ISKAS(pp)); 7315 7316 CPUSET_ZERO(cpuset); 7317 7318 hmeblkp = sfmmu_hmetohblk(sfhme); 7319 7320 readtte: 7321 sfmmu_copytte(&sfhme->hme_tte, &tte); 7322 if (TTE_IS_VALID(&tte)) { 7323 sfmmup = hblktosfmmu(hmeblkp); 7324 ttesz = get_hblk_ttesz(hmeblkp); 7325 /* 7326 * Only unload mappings of 'cons' size. 7327 */ 7328 if (ttesz != cons) 7329 return (cpuset); 7330 7331 /* 7332 * Note that we have p_mapping lock, but no hash lock here. 7333 * hblk_unload() has to have both hash lock AND p_mapping 7334 * lock before it tries to modify tte. So, the tte could 7335 * not become invalid in the sfmmu_modifytte_try() below. 7336 */ 7337 ttemod = tte; 7338 #ifdef DEBUG 7339 orig_old = tte; 7340 #endif /* DEBUG */ 7341 7342 TTE_SET_INVALID(&ttemod); 7343 ret = sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte); 7344 if (ret < 0) { 7345 #ifdef DEBUG 7346 /* only R/M bits can change. 
*/ 7347 chk_tte(&orig_old, &tte, &ttemod, hmeblkp); 7348 #endif /* DEBUG */ 7349 goto readtte; 7350 } 7351 7352 if (ret == 0) { 7353 panic("pageunload: cas failed?"); 7354 } 7355 7356 addr = tte_to_vaddr(hmeblkp, tte); 7357 7358 if (hmeblkp->hblk_shared) { 7359 sf_srd_t *srdp = (sf_srd_t *)sfmmup; 7360 uint_t rid = hmeblkp->hblk_tag.htag_rid; 7361 sf_region_t *rgnp; 7362 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 7363 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 7364 ASSERT(srdp != NULL); 7365 rgnp = srdp->srd_hmergnp[rid]; 7366 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid); 7367 cpuset = sfmmu_rgntlb_demap(addr, rgnp, hmeblkp, 1); 7368 sfmmu_ttesync(NULL, addr, &tte, pp); 7369 ASSERT(rgnp->rgn_ttecnt[ttesz] > 0); 7370 atomic_add_long(&rgnp->rgn_ttecnt[ttesz], -1); 7371 } else { 7372 sfmmu_ttesync(sfmmup, addr, &tte, pp); 7373 atomic_add_long(&sfmmup->sfmmu_ttecnt[ttesz], -1); 7374 7375 /* 7376 * We need to flush the page from the virtual cache 7377 * in order to prevent a virtual cache alias 7378 * inconsistency. The particular scenario we need 7379 * to worry about is: 7380 * Given: va1 and va2 are two virtual address that 7381 * alias and will map the same physical address. 7382 * 1. mapping exists from va1 to pa and data has 7383 * been read into the cache. 7384 * 2. unload va1. 7385 * 3. load va2 and modify data using va2. 7386 * 4 unload va2. 7387 * 5. load va1 and reference data. Unless we flush 7388 * the data cache when we unload we will get 7389 * stale data. 7390 * This scenario is taken care of by using virtual 7391 * page coloring. 7392 */ 7393 if (sfmmup->sfmmu_ismhat) { 7394 /* 7395 * Flush TSBs, TLBs and caches 7396 * of every process 7397 * sharing this ism segment. 
7398 */ 7399 sfmmu_hat_lock_all(); 7400 mutex_enter(&ism_mlist_lock); 7401 kpreempt_disable(); 7402 sfmmu_ismtlbcache_demap(addr, sfmmup, hmeblkp, 7403 pp->p_pagenum, CACHE_NO_FLUSH); 7404 kpreempt_enable(); 7405 mutex_exit(&ism_mlist_lock); 7406 sfmmu_hat_unlock_all(); 7407 cpuset = cpu_ready_set; 7408 } else { 7409 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0); 7410 cpuset = sfmmup->sfmmu_cpusran; 7411 } 7412 } 7413 7414 /* 7415 * Hme_sub has to run after ttesync() and a_rss update. 7416 * See hblk_unload(). 7417 */ 7418 HME_SUB(sfhme, pp); 7419 membar_stst(); 7420 7421 /* 7422 * We can not make ASSERT(hmeblkp->hblk_hmecnt <= NHMENTS) 7423 * since pteload may have done a HME_ADD() right after 7424 * we did the HME_SUB() above. Hmecnt is now maintained 7425 * by cas only. no lock guranteed its value. The only 7426 * gurantee we have is the hmecnt should not be less than 7427 * what it should be so the hblk will not be taken away. 7428 * It's also important that we decremented the hmecnt after 7429 * we are done with hmeblkp so that this hmeblk won't be 7430 * stolen. 7431 */ 7432 ASSERT(hmeblkp->hblk_hmecnt > 0); 7433 ASSERT(hmeblkp->hblk_vcnt > 0); 7434 atomic_add_16(&hmeblkp->hblk_vcnt, -1); 7435 atomic_add_16(&hmeblkp->hblk_hmecnt, -1); 7436 /* 7437 * This is bug 4063182. 7438 * XXX: fixme 7439 * ASSERT(hmeblkp->hblk_hmecnt || hmeblkp->hblk_vcnt || 7440 * !hmeblkp->hblk_lckcnt); 7441 */ 7442 } else { 7443 panic("invalid tte? pp %p &tte %p", 7444 (void *)pp, (void *)&tte); 7445 } 7446 7447 return (cpuset); 7448 } 7449 7450 /* 7451 * While relocating a kernel page, this function will move the mappings 7452 * from tpp to dpp and modify any associated data with these mappings. 7453 * It also unsuspends the suspended kernel mapping. 
7454 */ 7455 static void 7456 hat_pagereload(struct page *tpp, struct page *dpp) 7457 { 7458 struct sf_hment *sfhme; 7459 tte_t tte, ttemod; 7460 int index, cons; 7461 7462 ASSERT(getpil() == PIL_MAX); 7463 ASSERT(sfmmu_mlist_held(tpp)); 7464 ASSERT(sfmmu_mlist_held(dpp)); 7465 7466 index = PP_MAPINDEX(tpp); 7467 cons = TTE8K; 7468 7469 /* Update real mappings to the page */ 7470 retry: 7471 for (sfhme = tpp->p_mapping; sfhme != NULL; sfhme = sfhme->hme_next) { 7472 if (IS_PAHME(sfhme)) 7473 continue; 7474 sfmmu_copytte(&sfhme->hme_tte, &tte); 7475 ttemod = tte; 7476 7477 /* 7478 * replace old pfn with new pfn in TTE 7479 */ 7480 PFN_TO_TTE(ttemod, dpp->p_pagenum); 7481 7482 /* 7483 * clear suspend bit 7484 */ 7485 ASSERT(TTE_IS_SUSPEND(&ttemod)); 7486 TTE_CLR_SUSPEND(&ttemod); 7487 7488 if (sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte) < 0) 7489 panic("hat_pagereload(): sfmmu_modifytte_try() failed"); 7490 7491 /* 7492 * set hme_page point to new page 7493 */ 7494 sfhme->hme_page = dpp; 7495 } 7496 7497 /* 7498 * move p_mapping list from old page to new page 7499 */ 7500 dpp->p_mapping = tpp->p_mapping; 7501 tpp->p_mapping = NULL; 7502 dpp->p_share = tpp->p_share; 7503 tpp->p_share = 0; 7504 7505 while (index != 0) { 7506 index = index >> 1; 7507 if (index != 0) 7508 cons++; 7509 if (index & 0x1) { 7510 tpp = PP_GROUPLEADER(tpp, cons); 7511 dpp = PP_GROUPLEADER(dpp, cons); 7512 goto retry; 7513 } 7514 } 7515 7516 curthread->t_flag &= ~T_DONTDTRACE; 7517 mutex_exit(&kpr_suspendlock); 7518 } 7519 7520 uint_t 7521 hat_pagesync(struct page *pp, uint_t clearflag) 7522 { 7523 struct sf_hment *sfhme, *tmphme = NULL; 7524 struct hme_blk *hmeblkp; 7525 kmutex_t *pml; 7526 cpuset_t cpuset, tset; 7527 int index, cons; 7528 extern ulong_t po_share; 7529 page_t *save_pp = pp; 7530 int stop_on_sh = 0; 7531 uint_t shcnt; 7532 7533 CPUSET_ZERO(cpuset); 7534 7535 if (PP_ISRO(pp) && (clearflag & HAT_SYNC_STOPON_MOD)) { 7536 return (PP_GENERIC_ATTR(pp)); 7537 } 7538 7539 if 
((clearflag & HAT_SYNC_ZERORM) == 0) { 7540 if ((clearflag & HAT_SYNC_STOPON_REF) && PP_ISREF(pp)) { 7541 return (PP_GENERIC_ATTR(pp)); 7542 } 7543 if ((clearflag & HAT_SYNC_STOPON_MOD) && PP_ISMOD(pp)) { 7544 return (PP_GENERIC_ATTR(pp)); 7545 } 7546 if (clearflag & HAT_SYNC_STOPON_SHARED) { 7547 if (pp->p_share > po_share) { 7548 hat_page_setattr(pp, P_REF); 7549 return (PP_GENERIC_ATTR(pp)); 7550 } 7551 stop_on_sh = 1; 7552 shcnt = 0; 7553 } 7554 } 7555 7556 clearflag &= ~HAT_SYNC_STOPON_SHARED; 7557 pml = sfmmu_mlist_enter(pp); 7558 index = PP_MAPINDEX(pp); 7559 cons = TTE8K; 7560 retry: 7561 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) { 7562 /* 7563 * We need to save the next hment on the list since 7564 * it is possible for pagesync to remove an invalid hment 7565 * from the list. 7566 */ 7567 tmphme = sfhme->hme_next; 7568 if (IS_PAHME(sfhme)) 7569 continue; 7570 /* 7571 * If we are looking for large mappings and this hme doesn't 7572 * reach the range we are seeking, just ignore it. 7573 */ 7574 hmeblkp = sfmmu_hmetohblk(sfhme); 7575 if (hmeblkp->hblk_xhat_bit) 7576 continue; 7577 7578 if (hme_size(sfhme) < cons) 7579 continue; 7580 7581 if (stop_on_sh) { 7582 if (hmeblkp->hblk_shared) { 7583 sf_srd_t *srdp = hblktosrd(hmeblkp); 7584 uint_t rid = hmeblkp->hblk_tag.htag_rid; 7585 sf_region_t *rgnp; 7586 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 7587 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 7588 ASSERT(srdp != NULL); 7589 rgnp = srdp->srd_hmergnp[rid]; 7590 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, 7591 rgnp, rid); 7592 shcnt += rgnp->rgn_refcnt; 7593 } else { 7594 shcnt++; 7595 } 7596 if (shcnt > po_share) { 7597 /* 7598 * tell the pager to spare the page this time 7599 * around. 
7600 */ 7601 hat_page_setattr(save_pp, P_REF); 7602 index = 0; 7603 break; 7604 } 7605 } 7606 tset = sfmmu_pagesync(pp, sfhme, 7607 clearflag & ~HAT_SYNC_STOPON_RM); 7608 CPUSET_OR(cpuset, tset); 7609 7610 /* 7611 * If clearflag is HAT_SYNC_DONTZERO, break out as soon 7612 * as the "ref" or "mod" is set or share cnt exceeds po_share. 7613 */ 7614 if ((clearflag & ~HAT_SYNC_STOPON_RM) == HAT_SYNC_DONTZERO && 7615 (((clearflag & HAT_SYNC_STOPON_MOD) && PP_ISMOD(save_pp)) || 7616 ((clearflag & HAT_SYNC_STOPON_REF) && PP_ISREF(save_pp)))) { 7617 index = 0; 7618 break; 7619 } 7620 } 7621 7622 while (index) { 7623 index = index >> 1; 7624 cons++; 7625 if (index & 0x1) { 7626 /* Go to leading page */ 7627 pp = PP_GROUPLEADER(pp, cons); 7628 goto retry; 7629 } 7630 } 7631 7632 xt_sync(cpuset); 7633 sfmmu_mlist_exit(pml); 7634 return (PP_GENERIC_ATTR(save_pp)); 7635 } 7636 7637 /* 7638 * Get all the hardware dependent attributes for a page struct 7639 */ 7640 static cpuset_t 7641 sfmmu_pagesync(struct page *pp, struct sf_hment *sfhme, 7642 uint_t clearflag) 7643 { 7644 caddr_t addr; 7645 tte_t tte, ttemod; 7646 struct hme_blk *hmeblkp; 7647 int ret; 7648 sfmmu_t *sfmmup; 7649 cpuset_t cpuset; 7650 7651 ASSERT(pp != NULL); 7652 ASSERT(sfmmu_mlist_held(pp)); 7653 ASSERT((clearflag == HAT_SYNC_DONTZERO) || 7654 (clearflag == HAT_SYNC_ZERORM)); 7655 7656 SFMMU_STAT(sf_pagesync); 7657 7658 CPUSET_ZERO(cpuset); 7659 7660 sfmmu_pagesync_retry: 7661 7662 sfmmu_copytte(&sfhme->hme_tte, &tte); 7663 if (TTE_IS_VALID(&tte)) { 7664 hmeblkp = sfmmu_hmetohblk(sfhme); 7665 sfmmup = hblktosfmmu(hmeblkp); 7666 addr = tte_to_vaddr(hmeblkp, tte); 7667 if (clearflag == HAT_SYNC_ZERORM) { 7668 ttemod = tte; 7669 TTE_CLR_RM(&ttemod); 7670 ret = sfmmu_modifytte_try(&tte, &ttemod, 7671 &sfhme->hme_tte); 7672 if (ret < 0) { 7673 /* 7674 * cas failed and the new value is not what 7675 * we want. 
7676 */ 7677 goto sfmmu_pagesync_retry; 7678 } 7679 7680 if (ret > 0) { 7681 /* we win the cas */ 7682 if (hmeblkp->hblk_shared) { 7683 sf_srd_t *srdp = (sf_srd_t *)sfmmup; 7684 uint_t rid = 7685 hmeblkp->hblk_tag.htag_rid; 7686 sf_region_t *rgnp; 7687 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 7688 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 7689 ASSERT(srdp != NULL); 7690 rgnp = srdp->srd_hmergnp[rid]; 7691 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, 7692 srdp, rgnp, rid); 7693 cpuset = sfmmu_rgntlb_demap(addr, 7694 rgnp, hmeblkp, 1); 7695 } else { 7696 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 7697 0, 0); 7698 cpuset = sfmmup->sfmmu_cpusran; 7699 } 7700 } 7701 } 7702 sfmmu_ttesync(hmeblkp->hblk_shared ? NULL : sfmmup, addr, 7703 &tte, pp); 7704 } 7705 return (cpuset); 7706 } 7707 7708 /* 7709 * Remove write permission from a mappings to a page, so that 7710 * we can detect the next modification of it. This requires modifying 7711 * the TTE then invalidating (demap) any TLB entry using that TTE. 7712 * This code is similar to sfmmu_pagesync(). 7713 */ 7714 static cpuset_t 7715 sfmmu_pageclrwrt(struct page *pp, struct sf_hment *sfhme) 7716 { 7717 caddr_t addr; 7718 tte_t tte; 7719 tte_t ttemod; 7720 struct hme_blk *hmeblkp; 7721 int ret; 7722 sfmmu_t *sfmmup; 7723 cpuset_t cpuset; 7724 7725 ASSERT(pp != NULL); 7726 ASSERT(sfmmu_mlist_held(pp)); 7727 7728 CPUSET_ZERO(cpuset); 7729 SFMMU_STAT(sf_clrwrt); 7730 7731 retry: 7732 7733 sfmmu_copytte(&sfhme->hme_tte, &tte); 7734 if (TTE_IS_VALID(&tte) && TTE_IS_WRITABLE(&tte)) { 7735 hmeblkp = sfmmu_hmetohblk(sfhme); 7736 7737 /* 7738 * xhat mappings should never be to a VMODSORT page. 
7739 */ 7740 ASSERT(hmeblkp->hblk_xhat_bit == 0); 7741 7742 sfmmup = hblktosfmmu(hmeblkp); 7743 addr = tte_to_vaddr(hmeblkp, tte); 7744 7745 ttemod = tte; 7746 TTE_CLR_WRT(&ttemod); 7747 TTE_CLR_MOD(&ttemod); 7748 ret = sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte); 7749 7750 /* 7751 * if cas failed and the new value is not what 7752 * we want retry 7753 */ 7754 if (ret < 0) 7755 goto retry; 7756 7757 /* we win the cas */ 7758 if (ret > 0) { 7759 if (hmeblkp->hblk_shared) { 7760 sf_srd_t *srdp = (sf_srd_t *)sfmmup; 7761 uint_t rid = hmeblkp->hblk_tag.htag_rid; 7762 sf_region_t *rgnp; 7763 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 7764 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 7765 ASSERT(srdp != NULL); 7766 rgnp = srdp->srd_hmergnp[rid]; 7767 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, 7768 srdp, rgnp, rid); 7769 cpuset = sfmmu_rgntlb_demap(addr, 7770 rgnp, hmeblkp, 1); 7771 } else { 7772 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0); 7773 cpuset = sfmmup->sfmmu_cpusran; 7774 } 7775 } 7776 } 7777 7778 return (cpuset); 7779 } 7780 7781 /* 7782 * Walk all mappings of a page, removing write permission and clearing the 7783 * ref/mod bits. This code is similar to hat_pagesync() 7784 */ 7785 static void 7786 hat_page_clrwrt(page_t *pp) 7787 { 7788 struct sf_hment *sfhme; 7789 struct sf_hment *tmphme = NULL; 7790 kmutex_t *pml; 7791 cpuset_t cpuset; 7792 cpuset_t tset; 7793 int index; 7794 int cons; 7795 7796 CPUSET_ZERO(cpuset); 7797 7798 pml = sfmmu_mlist_enter(pp); 7799 index = PP_MAPINDEX(pp); 7800 cons = TTE8K; 7801 retry: 7802 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) { 7803 tmphme = sfhme->hme_next; 7804 7805 /* 7806 * If we are looking for large mappings and this hme doesn't 7807 * reach the range we are seeking, just ignore its. 
7808 */ 7809 7810 if (hme_size(sfhme) < cons) 7811 continue; 7812 7813 tset = sfmmu_pageclrwrt(pp, sfhme); 7814 CPUSET_OR(cpuset, tset); 7815 } 7816 7817 while (index) { 7818 index = index >> 1; 7819 cons++; 7820 if (index & 0x1) { 7821 /* Go to leading page */ 7822 pp = PP_GROUPLEADER(pp, cons); 7823 goto retry; 7824 } 7825 } 7826 7827 xt_sync(cpuset); 7828 sfmmu_mlist_exit(pml); 7829 } 7830 7831 /* 7832 * Set the given REF/MOD/RO bits for the given page. 7833 * For a vnode with a sorted v_pages list, we need to change 7834 * the attributes and the v_pages list together under page_vnode_mutex. 7835 */ 7836 void 7837 hat_page_setattr(page_t *pp, uint_t flag) 7838 { 7839 vnode_t *vp = pp->p_vnode; 7840 page_t **listp; 7841 kmutex_t *pmtx; 7842 kmutex_t *vphm = NULL; 7843 int noshuffle; 7844 7845 noshuffle = flag & P_NSH; 7846 flag &= ~P_NSH; 7847 7848 ASSERT(!(flag & ~(P_MOD | P_REF | P_RO))); 7849 7850 /* 7851 * nothing to do if attribute already set 7852 */ 7853 if ((pp->p_nrm & flag) == flag) 7854 return; 7855 7856 if ((flag & P_MOD) != 0 && vp != NULL && IS_VMODSORT(vp) && 7857 !noshuffle) { 7858 vphm = page_vnode_mutex(vp); 7859 mutex_enter(vphm); 7860 } 7861 7862 pmtx = sfmmu_page_enter(pp); 7863 pp->p_nrm |= flag; 7864 sfmmu_page_exit(pmtx); 7865 7866 if (vphm != NULL) { 7867 /* 7868 * Some File Systems examine v_pages for NULL w/o 7869 * grabbing the vphm mutex. Must not let it become NULL when 7870 * pp is the only page on the list. 
7871 */ 7872 if (pp->p_vpnext != pp) { 7873 page_vpsub(&vp->v_pages, pp); 7874 if (vp->v_pages != NULL) 7875 listp = &vp->v_pages->p_vpprev->p_vpnext; 7876 else 7877 listp = &vp->v_pages; 7878 page_vpadd(listp, pp); 7879 } 7880 mutex_exit(vphm); 7881 } 7882 } 7883 7884 void 7885 hat_page_clrattr(page_t *pp, uint_t flag) 7886 { 7887 vnode_t *vp = pp->p_vnode; 7888 kmutex_t *pmtx; 7889 7890 ASSERT(!(flag & ~(P_MOD | P_REF | P_RO))); 7891 7892 pmtx = sfmmu_page_enter(pp); 7893 7894 /* 7895 * Caller is expected to hold page's io lock for VMODSORT to work 7896 * correctly with pvn_vplist_dirty() and pvn_getdirty() when mod 7897 * bit is cleared. 7898 * We don't have assert to avoid tripping some existing third party 7899 * code. The dirty page is moved back to top of the v_page list 7900 * after IO is done in pvn_write_done(). 7901 */ 7902 pp->p_nrm &= ~flag; 7903 sfmmu_page_exit(pmtx); 7904 7905 if ((flag & P_MOD) != 0 && vp != NULL && IS_VMODSORT(vp)) { 7906 7907 /* 7908 * VMODSORT works by removing write permissions and getting 7909 * a fault when a page is made dirty. At this point 7910 * we need to remove write permission from all mappings 7911 * to this page. 7912 */ 7913 hat_page_clrwrt(pp); 7914 } 7915 } 7916 7917 uint_t 7918 hat_page_getattr(page_t *pp, uint_t flag) 7919 { 7920 ASSERT(!(flag & ~(P_MOD | P_REF | P_RO))); 7921 return ((uint_t)(pp->p_nrm & flag)); 7922 } 7923 7924 /* 7925 * DEBUG kernels: verify that a kernel va<->pa translation 7926 * is safe by checking the underlying page_t is in a page 7927 * relocation-safe state. 
7928 */ 7929 #ifdef DEBUG 7930 void 7931 sfmmu_check_kpfn(pfn_t pfn) 7932 { 7933 page_t *pp; 7934 int index, cons; 7935 7936 if (hat_check_vtop == 0) 7937 return; 7938 7939 if (hat_kpr_enabled == 0 || kvseg.s_base == NULL || panicstr) 7940 return; 7941 7942 pp = page_numtopp_nolock(pfn); 7943 if (!pp) 7944 return; 7945 7946 if (PAGE_LOCKED(pp) || PP_ISNORELOC(pp)) 7947 return; 7948 7949 /* 7950 * Handed a large kernel page, we dig up the root page since we 7951 * know the root page might have the lock also. 7952 */ 7953 if (pp->p_szc != 0) { 7954 index = PP_MAPINDEX(pp); 7955 cons = TTE8K; 7956 again: 7957 while (index != 0) { 7958 index >>= 1; 7959 if (index != 0) 7960 cons++; 7961 if (index & 0x1) { 7962 pp = PP_GROUPLEADER(pp, cons); 7963 goto again; 7964 } 7965 } 7966 } 7967 7968 if (PAGE_LOCKED(pp) || PP_ISNORELOC(pp)) 7969 return; 7970 7971 /* 7972 * Pages need to be locked or allocated "permanent" (either from 7973 * static_arena arena or explicitly setting PG_NORELOC when calling 7974 * page_create_va()) for VA->PA translations to be valid. 7975 */ 7976 if (!PP_ISNORELOC(pp)) 7977 panic("Illegal VA->PA translation, pp 0x%p not permanent", 7978 (void *)pp); 7979 else 7980 panic("Illegal VA->PA translation, pp 0x%p not locked", 7981 (void *)pp); 7982 } 7983 #endif /* DEBUG */ 7984 7985 /* 7986 * Returns a page frame number for a given virtual address. 7987 * Returns PFN_INVALID to indicate an invalid mapping 7988 */ 7989 pfn_t 7990 hat_getpfnum(struct hat *hat, caddr_t addr) 7991 { 7992 pfn_t pfn; 7993 tte_t tte; 7994 7995 /* 7996 * We would like to 7997 * ASSERT(AS_LOCK_HELD(as, &as->a_lock)); 7998 * but we can't because the iommu driver will call this 7999 * routine at interrupt time and it can't grab the as lock 8000 * or it will deadlock: A thread could have the as lock 8001 * and be waiting for io. The io can't complete 8002 * because the interrupt thread is blocked trying to grab 8003 * the as lock. 
8004 */ 8005 8006 ASSERT(hat->sfmmu_xhat_provider == NULL); 8007 8008 if (hat == ksfmmup) { 8009 if (IS_KMEM_VA_LARGEPAGE(addr)) { 8010 ASSERT(segkmem_lpszc > 0); 8011 pfn = sfmmu_kvaszc2pfn(addr, segkmem_lpszc); 8012 if (pfn != PFN_INVALID) { 8013 sfmmu_check_kpfn(pfn); 8014 return (pfn); 8015 } 8016 } else if (segkpm && IS_KPM_ADDR(addr)) { 8017 return (sfmmu_kpm_vatopfn(addr)); 8018 } 8019 while ((pfn = sfmmu_vatopfn(addr, ksfmmup, &tte)) 8020 == PFN_SUSPENDED) { 8021 sfmmu_vatopfn_suspended(addr, ksfmmup, &tte); 8022 } 8023 sfmmu_check_kpfn(pfn); 8024 return (pfn); 8025 } else { 8026 return (sfmmu_uvatopfn(addr, hat, NULL)); 8027 } 8028 } 8029 8030 /* 8031 * hat_getkpfnum() is an obsolete DDI routine, and its use is discouraged. 8032 * Use hat_getpfnum(kas.a_hat, ...) instead. 8033 * 8034 * We'd like to return PFN_INVALID if the mappings have underlying page_t's 8035 * but can't right now due to the fact that some software has grown to use 8036 * this interface incorrectly. So for now when the interface is misused, 8037 * return a warning to the user that in the future it won't work in the 8038 * way they're abusing it, and carry on (after disabling page relocation). 8039 */ 8040 pfn_t 8041 hat_getkpfnum(caddr_t addr) 8042 { 8043 pfn_t pfn; 8044 tte_t tte; 8045 int badcaller = 0; 8046 extern int segkmem_reloc; 8047 8048 if (segkpm && IS_KPM_ADDR(addr)) { 8049 badcaller = 1; 8050 pfn = sfmmu_kpm_vatopfn(addr); 8051 } else { 8052 while ((pfn = sfmmu_vatopfn(addr, ksfmmup, &tte)) 8053 == PFN_SUSPENDED) { 8054 sfmmu_vatopfn_suspended(addr, ksfmmup, &tte); 8055 } 8056 badcaller = pf_is_memory(pfn); 8057 } 8058 8059 if (badcaller) { 8060 /* 8061 * We can't return PFN_INVALID or the caller may panic 8062 * or corrupt the system. The only alternative is to 8063 * disable page relocation at this point for all kernel 8064 * memory. This will impact any callers of page_relocate() 8065 * such as FMA or DR. 
8066 * 8067 * RFE: Add junk here to spit out an ereport so the sysadmin 8068 * can be advised that he should upgrade his device driver 8069 * so that this doesn't happen. 8070 */ 8071 hat_getkpfnum_badcall(caller()); 8072 if (hat_kpr_enabled && segkmem_reloc) { 8073 hat_kpr_enabled = 0; 8074 segkmem_reloc = 0; 8075 cmn_err(CE_WARN, "Kernel Page Relocation is DISABLED"); 8076 } 8077 } 8078 return (pfn); 8079 } 8080 8081 /* 8082 * This routine will return both pfn and tte for the vaddr. 8083 */ 8084 static pfn_t 8085 sfmmu_uvatopfn(caddr_t vaddr, struct hat *sfmmup, tte_t *ttep) 8086 { 8087 struct hmehash_bucket *hmebp; 8088 hmeblk_tag hblktag; 8089 int hmeshift, hashno = 1; 8090 struct hme_blk *hmeblkp = NULL; 8091 tte_t tte; 8092 8093 struct sf_hment *sfhmep; 8094 pfn_t pfn; 8095 8096 /* support for ISM */ 8097 ism_map_t *ism_map; 8098 ism_blk_t *ism_blkp; 8099 int i; 8100 sfmmu_t *ism_hatid = NULL; 8101 sfmmu_t *locked_hatid = NULL; 8102 sfmmu_t *sv_sfmmup = sfmmup; 8103 caddr_t sv_vaddr = vaddr; 8104 sf_srd_t *srdp; 8105 8106 if (ttep == NULL) { 8107 ttep = &tte; 8108 } else { 8109 ttep->ll = 0; 8110 } 8111 8112 ASSERT(sfmmup != ksfmmup); 8113 SFMMU_STAT(sf_user_vtop); 8114 /* 8115 * Set ism_hatid if vaddr falls in a ISM segment. 
8116 */ 8117 ism_blkp = sfmmup->sfmmu_iblk; 8118 if (ism_blkp != NULL) { 8119 sfmmu_ismhat_enter(sfmmup, 0); 8120 locked_hatid = sfmmup; 8121 } 8122 while (ism_blkp != NULL && ism_hatid == NULL) { 8123 ism_map = ism_blkp->iblk_maps; 8124 for (i = 0; ism_map[i].imap_ismhat && i < ISM_MAP_SLOTS; i++) { 8125 if (vaddr >= ism_start(ism_map[i]) && 8126 vaddr < ism_end(ism_map[i])) { 8127 sfmmup = ism_hatid = ism_map[i].imap_ismhat; 8128 vaddr = (caddr_t)(vaddr - 8129 ism_start(ism_map[i])); 8130 break; 8131 } 8132 } 8133 ism_blkp = ism_blkp->iblk_next; 8134 } 8135 if (locked_hatid) { 8136 sfmmu_ismhat_exit(locked_hatid, 0); 8137 } 8138 8139 hblktag.htag_id = sfmmup; 8140 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 8141 do { 8142 hmeshift = HME_HASH_SHIFT(hashno); 8143 hblktag.htag_bspage = HME_HASH_BSPAGE(vaddr, hmeshift); 8144 hblktag.htag_rehash = hashno; 8145 hmebp = HME_HASH_FUNCTION(sfmmup, vaddr, hmeshift); 8146 8147 SFMMU_HASH_LOCK(hmebp); 8148 8149 HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp); 8150 if (hmeblkp != NULL) { 8151 ASSERT(!hmeblkp->hblk_shared); 8152 HBLKTOHME(sfhmep, hmeblkp, vaddr); 8153 sfmmu_copytte(&sfhmep->hme_tte, ttep); 8154 SFMMU_HASH_UNLOCK(hmebp); 8155 if (TTE_IS_VALID(ttep)) { 8156 pfn = TTE_TO_PFN(vaddr, ttep); 8157 return (pfn); 8158 } 8159 break; 8160 } 8161 SFMMU_HASH_UNLOCK(hmebp); 8162 hashno++; 8163 } while (HME_REHASH(sfmmup) && (hashno <= mmu_hashcnt)); 8164 8165 if (SF_HMERGNMAP_ISNULL(sv_sfmmup)) { 8166 return (PFN_INVALID); 8167 } 8168 srdp = sv_sfmmup->sfmmu_srdp; 8169 ASSERT(srdp != NULL); 8170 ASSERT(srdp->srd_refcnt != 0); 8171 hblktag.htag_id = srdp; 8172 hashno = 1; 8173 do { 8174 hmeshift = HME_HASH_SHIFT(hashno); 8175 hblktag.htag_bspage = HME_HASH_BSPAGE(sv_vaddr, hmeshift); 8176 hblktag.htag_rehash = hashno; 8177 hmebp = HME_HASH_FUNCTION(srdp, sv_vaddr, hmeshift); 8178 8179 SFMMU_HASH_LOCK(hmebp); 8180 for (hmeblkp = hmebp->hmeblkp; hmeblkp != NULL; 8181 hmeblkp = hmeblkp->hblk_next) { 8182 uint_t rid; 8183 
sf_region_t *rgnp; 8184 caddr_t rsaddr; 8185 caddr_t readdr; 8186 8187 if (!HTAGS_EQ_SHME(hmeblkp->hblk_tag, hblktag, 8188 sv_sfmmup->sfmmu_hmeregion_map)) { 8189 continue; 8190 } 8191 ASSERT(hmeblkp->hblk_shared); 8192 rid = hmeblkp->hblk_tag.htag_rid; 8193 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 8194 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 8195 rgnp = srdp->srd_hmergnp[rid]; 8196 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid); 8197 HBLKTOHME(sfhmep, hmeblkp, sv_vaddr); 8198 sfmmu_copytte(&sfhmep->hme_tte, ttep); 8199 rsaddr = rgnp->rgn_saddr; 8200 readdr = rsaddr + rgnp->rgn_size; 8201 #ifdef DEBUG 8202 if (TTE_IS_VALID(ttep) || 8203 get_hblk_ttesz(hmeblkp) > TTE8K) { 8204 caddr_t eva = tte_to_evaddr(hmeblkp, ttep); 8205 ASSERT(eva > sv_vaddr); 8206 ASSERT(sv_vaddr >= rsaddr); 8207 ASSERT(sv_vaddr < readdr); 8208 ASSERT(eva <= readdr); 8209 } 8210 #endif /* DEBUG */ 8211 /* 8212 * Continue the search if we 8213 * found an invalid 8K tte outside of the area 8214 * covered by this hmeblk's region. 8215 */ 8216 if (TTE_IS_VALID(ttep)) { 8217 SFMMU_HASH_UNLOCK(hmebp); 8218 pfn = TTE_TO_PFN(sv_vaddr, ttep); 8219 return (pfn); 8220 } else if (get_hblk_ttesz(hmeblkp) > TTE8K || 8221 (sv_vaddr >= rsaddr && sv_vaddr < readdr)) { 8222 SFMMU_HASH_UNLOCK(hmebp); 8223 pfn = PFN_INVALID; 8224 return (pfn); 8225 } 8226 } 8227 SFMMU_HASH_UNLOCK(hmebp); 8228 hashno++; 8229 } while (hashno <= mmu_hashcnt); 8230 return (PFN_INVALID); 8231 } 8232 8233 8234 /* 8235 * For compatability with AT&T and later optimizations 8236 */ 8237 /* ARGSUSED */ 8238 void 8239 hat_map(struct hat *hat, caddr_t addr, size_t len, uint_t flags) 8240 { 8241 ASSERT(hat != NULL); 8242 ASSERT(hat->sfmmu_xhat_provider == NULL); 8243 } 8244 8245 /* 8246 * Return the number of mappings to a particular page. This number is an 8247 * approximation of the number of people sharing the page. 8248 * 8249 * shared hmeblks or ism hmeblks are counted as 1 mapping here. 
8250 * hat_page_checkshare() can be used to compare threshold to share 8251 * count that reflects the number of region sharers albeit at higher cost. 8252 */ 8253 ulong_t 8254 hat_page_getshare(page_t *pp) 8255 { 8256 page_t *spp = pp; /* start page */ 8257 kmutex_t *pml; 8258 ulong_t cnt; 8259 int index, sz = TTE64K; 8260 8261 /* 8262 * We need to grab the mlist lock to make sure any outstanding 8263 * load/unloads complete. Otherwise we could return zero 8264 * even though the unload(s) hasn't finished yet. 8265 */ 8266 pml = sfmmu_mlist_enter(spp); 8267 cnt = spp->p_share; 8268 8269 #ifdef VAC 8270 if (kpm_enable) 8271 cnt += spp->p_kpmref; 8272 #endif 8273 if (vpm_enable && pp->p_vpmref) { 8274 cnt += 1; 8275 } 8276 8277 /* 8278 * If we have any large mappings, we count the number of 8279 * mappings that this large page is part of. 8280 */ 8281 index = PP_MAPINDEX(spp); 8282 index >>= 1; 8283 while (index) { 8284 pp = PP_GROUPLEADER(spp, sz); 8285 if ((index & 0x1) && pp != spp) { 8286 cnt += pp->p_share; 8287 spp = pp; 8288 } 8289 index >>= 1; 8290 sz++; 8291 } 8292 sfmmu_mlist_exit(pml); 8293 return (cnt); 8294 } 8295 8296 /* 8297 * Return 1 if the number of mappings exceeds sh_thresh. Return 0 8298 * otherwise. Count shared hmeblks by region's refcnt. 
8299 */ 8300 int 8301 hat_page_checkshare(page_t *pp, ulong_t sh_thresh) 8302 { 8303 kmutex_t *pml; 8304 ulong_t cnt = 0; 8305 int index, sz = TTE8K; 8306 struct sf_hment *sfhme, *tmphme = NULL; 8307 struct hme_blk *hmeblkp; 8308 8309 pml = sfmmu_mlist_enter(pp); 8310 8311 #ifdef VAC 8312 if (kpm_enable) 8313 cnt = pp->p_kpmref; 8314 #endif 8315 8316 if (vpm_enable && pp->p_vpmref) { 8317 cnt += 1; 8318 } 8319 8320 if (pp->p_share + cnt > sh_thresh) { 8321 sfmmu_mlist_exit(pml); 8322 return (1); 8323 } 8324 8325 index = PP_MAPINDEX(pp); 8326 8327 again: 8328 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) { 8329 tmphme = sfhme->hme_next; 8330 if (IS_PAHME(sfhme)) { 8331 continue; 8332 } 8333 8334 hmeblkp = sfmmu_hmetohblk(sfhme); 8335 if (hmeblkp->hblk_xhat_bit) { 8336 cnt++; 8337 if (cnt > sh_thresh) { 8338 sfmmu_mlist_exit(pml); 8339 return (1); 8340 } 8341 continue; 8342 } 8343 if (hme_size(sfhme) != sz) { 8344 continue; 8345 } 8346 8347 if (hmeblkp->hblk_shared) { 8348 sf_srd_t *srdp = hblktosrd(hmeblkp); 8349 uint_t rid = hmeblkp->hblk_tag.htag_rid; 8350 sf_region_t *rgnp; 8351 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 8352 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 8353 ASSERT(srdp != NULL); 8354 rgnp = srdp->srd_hmergnp[rid]; 8355 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, 8356 rgnp, rid); 8357 cnt += rgnp->rgn_refcnt; 8358 } else { 8359 cnt++; 8360 } 8361 if (cnt > sh_thresh) { 8362 sfmmu_mlist_exit(pml); 8363 return (1); 8364 } 8365 } 8366 8367 index >>= 1; 8368 sz++; 8369 while (index) { 8370 pp = PP_GROUPLEADER(pp, sz); 8371 ASSERT(sfmmu_mlist_held(pp)); 8372 if (index & 0x1) { 8373 goto again; 8374 } 8375 index >>= 1; 8376 sz++; 8377 } 8378 sfmmu_mlist_exit(pml); 8379 return (0); 8380 } 8381 8382 /* 8383 * Unload all large mappings to the pp and reset the p_szc field of every 8384 * constituent page according to the remaining mappings. 8385 * 8386 * pp must be locked SE_EXCL. 
 * Even though no other constituent pages are
 * locked it's legal to unload the large mappings to the pp because all
 * constituent pages of large locked mappings have to be locked SE_SHARED.
 * This means if we have SE_EXCL lock on one of constituent pages none of the
 * large mappings to pp are locked.
 *
 * Decrease p_szc field starting from the last constituent page and ending
 * with the root page. This method is used because other threads rely on the
 * root's p_szc to find the lock to synchronize on. After a root page_t's p_szc
 * is demoted then other threads will succeed in sfmmu_mlspl_enter(). This
 * ensures that p_szc changes of the constituent pages appear atomic for all
 * threads that use sfmmu_mlspl_enter() to examine p_szc field.
 *
 * This mechanism is only used for file system pages where it's not always
 * possible to get SE_EXCL locks on all constituent pages to demote the size
 * code (as is done for anonymous or kernel large pages).
 *
 * See more comments in front of sfmmu_mlspl_enter().
8404 */ 8405 void 8406 hat_page_demote(page_t *pp) 8407 { 8408 int index; 8409 int sz; 8410 cpuset_t cpuset; 8411 int sync = 0; 8412 page_t *rootpp; 8413 struct sf_hment *sfhme; 8414 struct sf_hment *tmphme = NULL; 8415 struct hme_blk *hmeblkp; 8416 uint_t pszc; 8417 page_t *lastpp; 8418 cpuset_t tset; 8419 pgcnt_t npgs; 8420 kmutex_t *pml; 8421 kmutex_t *pmtx = NULL; 8422 8423 ASSERT(PAGE_EXCL(pp)); 8424 ASSERT(!PP_ISFREE(pp)); 8425 ASSERT(!PP_ISKAS(pp)); 8426 ASSERT(page_szc_lock_assert(pp)); 8427 pml = sfmmu_mlist_enter(pp); 8428 8429 pszc = pp->p_szc; 8430 if (pszc == 0) { 8431 goto out; 8432 } 8433 8434 index = PP_MAPINDEX(pp) >> 1; 8435 8436 if (index) { 8437 CPUSET_ZERO(cpuset); 8438 sz = TTE64K; 8439 sync = 1; 8440 } 8441 8442 while (index) { 8443 if (!(index & 0x1)) { 8444 index >>= 1; 8445 sz++; 8446 continue; 8447 } 8448 ASSERT(sz <= pszc); 8449 rootpp = PP_GROUPLEADER(pp, sz); 8450 for (sfhme = rootpp->p_mapping; sfhme; sfhme = tmphme) { 8451 tmphme = sfhme->hme_next; 8452 ASSERT(!IS_PAHME(sfhme)); 8453 hmeblkp = sfmmu_hmetohblk(sfhme); 8454 if (hme_size(sfhme) != sz) { 8455 continue; 8456 } 8457 if (hmeblkp->hblk_xhat_bit) { 8458 cmn_err(CE_PANIC, 8459 "hat_page_demote: xhat hmeblk"); 8460 } 8461 tset = sfmmu_pageunload(rootpp, sfhme, sz); 8462 CPUSET_OR(cpuset, tset); 8463 } 8464 if (index >>= 1) { 8465 sz++; 8466 } 8467 } 8468 8469 ASSERT(!PP_ISMAPPED_LARGE(pp)); 8470 8471 if (sync) { 8472 xt_sync(cpuset); 8473 #ifdef VAC 8474 if (PP_ISTNC(pp)) { 8475 conv_tnc(rootpp, sz); 8476 } 8477 #endif /* VAC */ 8478 } 8479 8480 pmtx = sfmmu_page_enter(pp); 8481 8482 ASSERT(pp->p_szc == pszc); 8483 rootpp = PP_PAGEROOT(pp); 8484 ASSERT(rootpp->p_szc == pszc); 8485 lastpp = PP_PAGENEXT_N(rootpp, TTEPAGES(pszc) - 1); 8486 8487 while (lastpp != rootpp) { 8488 sz = PP_MAPINDEX(lastpp) ? fnd_mapping_sz(lastpp) : 0; 8489 ASSERT(sz < pszc); 8490 npgs = (sz == 0) ? 
1 : TTEPAGES(sz); 8491 ASSERT(P2PHASE(lastpp->p_pagenum, npgs) == npgs - 1); 8492 while (--npgs > 0) { 8493 lastpp->p_szc = (uchar_t)sz; 8494 lastpp = PP_PAGEPREV(lastpp); 8495 } 8496 if (sz) { 8497 /* 8498 * make sure before current root's pszc 8499 * is updated all updates to constituent pages pszc 8500 * fields are globally visible. 8501 */ 8502 membar_producer(); 8503 } 8504 lastpp->p_szc = sz; 8505 ASSERT(IS_P2ALIGNED(lastpp->p_pagenum, TTEPAGES(sz))); 8506 if (lastpp != rootpp) { 8507 lastpp = PP_PAGEPREV(lastpp); 8508 } 8509 } 8510 if (sz == 0) { 8511 /* the loop above doesn't cover this case */ 8512 rootpp->p_szc = 0; 8513 } 8514 out: 8515 ASSERT(pp->p_szc == 0); 8516 if (pmtx != NULL) { 8517 sfmmu_page_exit(pmtx); 8518 } 8519 sfmmu_mlist_exit(pml); 8520 } 8521 8522 /* 8523 * Refresh the HAT ismttecnt[] element for size szc. 8524 * Caller must have set ISM busy flag to prevent mapping 8525 * lists from changing while we're traversing them. 8526 */ 8527 pgcnt_t 8528 ism_tsb_entries(sfmmu_t *sfmmup, int szc) 8529 { 8530 ism_blk_t *ism_blkp = sfmmup->sfmmu_iblk; 8531 ism_map_t *ism_map; 8532 pgcnt_t npgs = 0; 8533 pgcnt_t npgs_scd = 0; 8534 int j; 8535 sf_scd_t *scdp; 8536 uchar_t rid; 8537 8538 ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)); 8539 scdp = sfmmup->sfmmu_scdp; 8540 8541 for (; ism_blkp != NULL; ism_blkp = ism_blkp->iblk_next) { 8542 ism_map = ism_blkp->iblk_maps; 8543 for (j = 0; ism_map[j].imap_ismhat && j < ISM_MAP_SLOTS; j++) { 8544 rid = ism_map[j].imap_rid; 8545 ASSERT(rid == SFMMU_INVALID_ISMRID || 8546 rid < sfmmup->sfmmu_srdp->srd_next_ismrid); 8547 8548 if (scdp != NULL && rid != SFMMU_INVALID_ISMRID && 8549 SF_RGNMAP_TEST(scdp->scd_ismregion_map, rid)) { 8550 /* ISM is in sfmmup's SCD */ 8551 npgs_scd += 8552 ism_map[j].imap_ismhat->sfmmu_ttecnt[szc]; 8553 } else { 8554 /* ISMs is not in SCD */ 8555 npgs += 8556 ism_map[j].imap_ismhat->sfmmu_ttecnt[szc]; 8557 } 8558 } 8559 } 8560 sfmmup->sfmmu_ismttecnt[szc] = npgs; 8561 
sfmmup->sfmmu_scdismttecnt[szc] = npgs_scd; 8562 return (npgs); 8563 } 8564 8565 /* 8566 * Yield the memory claim requirement for an address space. 8567 * 8568 * This is currently implemented as the number of bytes that have active 8569 * hardware translations that have page structures. Therefore, it can 8570 * underestimate the traditional resident set size, eg, if the 8571 * physical page is present and the hardware translation is missing; 8572 * and it can overestimate the rss, eg, if there are active 8573 * translations to a frame buffer with page structs. 8574 * Also, it does not take sharing into account. 8575 * 8576 * Note that we don't acquire locks here since this function is most often 8577 * called from the clock thread. 8578 */ 8579 size_t 8580 hat_get_mapped_size(struct hat *hat) 8581 { 8582 size_t assize = 0; 8583 int i; 8584 8585 if (hat == NULL) 8586 return (0); 8587 8588 ASSERT(hat->sfmmu_xhat_provider == NULL); 8589 8590 for (i = 0; i < mmu_page_sizes; i++) 8591 assize += ((pgcnt_t)hat->sfmmu_ttecnt[i] + 8592 (pgcnt_t)hat->sfmmu_scdrttecnt[i]) * TTEBYTES(i); 8593 8594 if (hat->sfmmu_iblk == NULL) 8595 return (assize); 8596 8597 for (i = 0; i < mmu_page_sizes; i++) 8598 assize += ((pgcnt_t)hat->sfmmu_ismttecnt[i] + 8599 (pgcnt_t)hat->sfmmu_scdismttecnt[i]) * TTEBYTES(i); 8600 8601 return (assize); 8602 } 8603 8604 int 8605 hat_stats_enable(struct hat *hat) 8606 { 8607 hatlock_t *hatlockp; 8608 8609 ASSERT(hat->sfmmu_xhat_provider == NULL); 8610 8611 hatlockp = sfmmu_hat_enter(hat); 8612 hat->sfmmu_rmstat++; 8613 sfmmu_hat_exit(hatlockp); 8614 return (1); 8615 } 8616 8617 void 8618 hat_stats_disable(struct hat *hat) 8619 { 8620 hatlock_t *hatlockp; 8621 8622 ASSERT(hat->sfmmu_xhat_provider == NULL); 8623 8624 hatlockp = sfmmu_hat_enter(hat); 8625 hat->sfmmu_rmstat--; 8626 sfmmu_hat_exit(hatlockp); 8627 } 8628 8629 /* 8630 * Routines for entering or removing ourselves from the 8631 * ism_hat's mapping list. 
This is used for both private and 8632 * SCD hats. 8633 */ 8634 static void 8635 iment_add(struct ism_ment *iment, struct hat *ism_hat) 8636 { 8637 ASSERT(MUTEX_HELD(&ism_mlist_lock)); 8638 8639 iment->iment_prev = NULL; 8640 iment->iment_next = ism_hat->sfmmu_iment; 8641 if (ism_hat->sfmmu_iment) { 8642 ism_hat->sfmmu_iment->iment_prev = iment; 8643 } 8644 ism_hat->sfmmu_iment = iment; 8645 } 8646 8647 static void 8648 iment_sub(struct ism_ment *iment, struct hat *ism_hat) 8649 { 8650 ASSERT(MUTEX_HELD(&ism_mlist_lock)); 8651 8652 if (ism_hat->sfmmu_iment == NULL) { 8653 panic("ism map entry remove - no entries"); 8654 } 8655 8656 if (iment->iment_prev) { 8657 ASSERT(ism_hat->sfmmu_iment != iment); 8658 iment->iment_prev->iment_next = iment->iment_next; 8659 } else { 8660 ASSERT(ism_hat->sfmmu_iment == iment); 8661 ism_hat->sfmmu_iment = iment->iment_next; 8662 } 8663 8664 if (iment->iment_next) { 8665 iment->iment_next->iment_prev = iment->iment_prev; 8666 } 8667 8668 /* 8669 * zero out the entry 8670 */ 8671 iment->iment_next = NULL; 8672 iment->iment_prev = NULL; 8673 iment->iment_hat = NULL; 8674 iment->iment_base_va = 0; 8675 } 8676 8677 /* 8678 * Hat_share()/unshare() return an (non-zero) error 8679 * when saddr and daddr are not properly aligned. 8680 * 8681 * The top level mapping element determines the alignment 8682 * requirement for saddr and daddr, depending on different 8683 * architectures. 
8684 * 8685 * When hat_share()/unshare() are not supported, 8686 * HATOP_SHARE()/UNSHARE() return 0 8687 */ 8688 int 8689 hat_share(struct hat *sfmmup, caddr_t addr, 8690 struct hat *ism_hatid, caddr_t sptaddr, size_t len, uint_t ismszc) 8691 { 8692 ism_blk_t *ism_blkp; 8693 ism_blk_t *new_iblk; 8694 ism_map_t *ism_map; 8695 ism_ment_t *ism_ment; 8696 int i, added; 8697 hatlock_t *hatlockp; 8698 int reload_mmu = 0; 8699 uint_t ismshift = page_get_shift(ismszc); 8700 size_t ismpgsz = page_get_pagesize(ismszc); 8701 uint_t ismmask = (uint_t)ismpgsz - 1; 8702 size_t sh_size = ISM_SHIFT(ismshift, len); 8703 ushort_t ismhatflag; 8704 hat_region_cookie_t rcookie; 8705 sf_scd_t *old_scdp; 8706 8707 #ifdef DEBUG 8708 caddr_t eaddr = addr + len; 8709 #endif /* DEBUG */ 8710 8711 ASSERT(ism_hatid != NULL && sfmmup != NULL); 8712 ASSERT(sptaddr == ISMID_STARTADDR); 8713 /* 8714 * Check the alignment. 8715 */ 8716 if (!ISM_ALIGNED(ismshift, addr) || !ISM_ALIGNED(ismshift, sptaddr)) 8717 return (EINVAL); 8718 8719 /* 8720 * Check size alignment. 8721 */ 8722 if (!ISM_ALIGNED(ismshift, len)) 8723 return (EINVAL); 8724 8725 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 8726 8727 /* 8728 * Allocate ism_ment for the ism_hat's mapping list, and an 8729 * ism map blk in case we need one. We must do our 8730 * allocations before acquiring locks to prevent a deadlock 8731 * in the kmem allocator on the mapping list lock. 8732 */ 8733 new_iblk = kmem_cache_alloc(ism_blk_cache, KM_SLEEP); 8734 ism_ment = kmem_cache_alloc(ism_ment_cache, KM_SLEEP); 8735 8736 /* 8737 * Serialize ISM mappings with the ISM busy flag, and also the 8738 * trap handlers. 8739 */ 8740 sfmmu_ismhat_enter(sfmmup, 0); 8741 8742 /* 8743 * Allocate an ism map blk if necessary. 
8744 */ 8745 if (sfmmup->sfmmu_iblk == NULL) { 8746 sfmmup->sfmmu_iblk = new_iblk; 8747 bzero(new_iblk, sizeof (*new_iblk)); 8748 new_iblk->iblk_nextpa = (uint64_t)-1; 8749 membar_stst(); /* make sure next ptr visible to all CPUs */ 8750 sfmmup->sfmmu_ismblkpa = va_to_pa((caddr_t)new_iblk); 8751 reload_mmu = 1; 8752 new_iblk = NULL; 8753 } 8754 8755 #ifdef DEBUG 8756 /* 8757 * Make sure mapping does not already exist. 8758 */ 8759 ism_blkp = sfmmup->sfmmu_iblk; 8760 while (ism_blkp != NULL) { 8761 ism_map = ism_blkp->iblk_maps; 8762 for (i = 0; i < ISM_MAP_SLOTS && ism_map[i].imap_ismhat; i++) { 8763 if ((addr >= ism_start(ism_map[i]) && 8764 addr < ism_end(ism_map[i])) || 8765 eaddr > ism_start(ism_map[i]) && 8766 eaddr <= ism_end(ism_map[i])) { 8767 panic("sfmmu_share: Already mapped!"); 8768 } 8769 } 8770 ism_blkp = ism_blkp->iblk_next; 8771 } 8772 #endif /* DEBUG */ 8773 8774 ASSERT(ismszc >= TTE4M); 8775 if (ismszc == TTE4M) { 8776 ismhatflag = HAT_4M_FLAG; 8777 } else if (ismszc == TTE32M) { 8778 ismhatflag = HAT_32M_FLAG; 8779 } else if (ismszc == TTE256M) { 8780 ismhatflag = HAT_256M_FLAG; 8781 } 8782 /* 8783 * Add mapping to first available mapping slot. 8784 */ 8785 ism_blkp = sfmmup->sfmmu_iblk; 8786 added = 0; 8787 while (!added) { 8788 ism_map = ism_blkp->iblk_maps; 8789 for (i = 0; i < ISM_MAP_SLOTS; i++) { 8790 if (ism_map[i].imap_ismhat == NULL) { 8791 8792 ism_map[i].imap_ismhat = ism_hatid; 8793 ism_map[i].imap_vb_shift = (uchar_t)ismshift; 8794 ism_map[i].imap_rid = SFMMU_INVALID_ISMRID; 8795 ism_map[i].imap_hatflags = ismhatflag; 8796 ism_map[i].imap_sz_mask = ismmask; 8797 /* 8798 * imap_seg is checked in ISM_CHECK to see if 8799 * non-NULL, then other info assumed valid. 8800 */ 8801 membar_stst(); 8802 ism_map[i].imap_seg = (uintptr_t)addr | sh_size; 8803 ism_map[i].imap_ment = ism_ment; 8804 8805 /* 8806 * Now add ourselves to the ism_hat's 8807 * mapping list. 
8808 */ 8809 ism_ment->iment_hat = sfmmup; 8810 ism_ment->iment_base_va = addr; 8811 ism_hatid->sfmmu_ismhat = 1; 8812 mutex_enter(&ism_mlist_lock); 8813 iment_add(ism_ment, ism_hatid); 8814 mutex_exit(&ism_mlist_lock); 8815 added = 1; 8816 break; 8817 } 8818 } 8819 if (!added && ism_blkp->iblk_next == NULL) { 8820 ism_blkp->iblk_next = new_iblk; 8821 new_iblk = NULL; 8822 bzero(ism_blkp->iblk_next, 8823 sizeof (*ism_blkp->iblk_next)); 8824 ism_blkp->iblk_next->iblk_nextpa = (uint64_t)-1; 8825 membar_stst(); 8826 ism_blkp->iblk_nextpa = 8827 va_to_pa((caddr_t)ism_blkp->iblk_next); 8828 } 8829 ism_blkp = ism_blkp->iblk_next; 8830 } 8831 8832 /* 8833 * After calling hat_join_region, sfmmup may join a new SCD or 8834 * move from the old scd to a new scd, in which case, we want to 8835 * shrink the sfmmup's private tsb size, i.e., pass shrink to 8836 * sfmmu_check_page_sizes at the end of this routine. 8837 */ 8838 old_scdp = sfmmup->sfmmu_scdp; 8839 8840 rcookie = hat_join_region(sfmmup, addr, len, (void *)ism_hatid, 0, 8841 PROT_ALL, ismszc, NULL, HAT_REGION_ISM); 8842 if (rcookie != HAT_INVALID_REGION_COOKIE) { 8843 ism_map[i].imap_rid = (uchar_t)((uint64_t)rcookie); 8844 } 8845 /* 8846 * Update our counters for this sfmmup's ism mappings. 8847 */ 8848 for (i = 0; i <= ismszc; i++) { 8849 if (!(disable_ism_large_pages & (1 << i))) 8850 (void) ism_tsb_entries(sfmmup, i); 8851 } 8852 8853 /* 8854 * For ISM and DISM we do not support 512K pages, so we only only 8855 * search the 4M and 8K/64K hashes for 4 pagesize cpus, and search the 8856 * 256M or 32M, and 4M and 8K/64K hashes for 6 pagesize cpus. 8857 * 8858 * Need to set 32M/256M ISM flags to make sure 8859 * sfmmu_check_page_sizes() enables them on Panther. 
8860 */ 8861 ASSERT((disable_ism_large_pages & (1 << TTE512K)) != 0); 8862 8863 switch (ismszc) { 8864 case TTE256M: 8865 if (!SFMMU_FLAGS_ISSET(sfmmup, HAT_256M_ISM)) { 8866 hatlockp = sfmmu_hat_enter(sfmmup); 8867 SFMMU_FLAGS_SET(sfmmup, HAT_256M_ISM); 8868 sfmmu_hat_exit(hatlockp); 8869 } 8870 break; 8871 case TTE32M: 8872 if (!SFMMU_FLAGS_ISSET(sfmmup, HAT_32M_ISM)) { 8873 hatlockp = sfmmu_hat_enter(sfmmup); 8874 SFMMU_FLAGS_SET(sfmmup, HAT_32M_ISM); 8875 sfmmu_hat_exit(hatlockp); 8876 } 8877 break; 8878 default: 8879 break; 8880 } 8881 8882 /* 8883 * If we updated the ismblkpa for this HAT we must make 8884 * sure all CPUs running this process reload their tsbmiss area. 8885 * Otherwise they will fail to load the mappings in the tsbmiss 8886 * handler and will loop calling pagefault(). 8887 */ 8888 if (reload_mmu) { 8889 hatlockp = sfmmu_hat_enter(sfmmup); 8890 sfmmu_sync_mmustate(sfmmup); 8891 sfmmu_hat_exit(hatlockp); 8892 } 8893 8894 sfmmu_ismhat_exit(sfmmup, 0); 8895 8896 /* 8897 * Free up ismblk if we didn't use it. 8898 */ 8899 if (new_iblk != NULL) 8900 kmem_cache_free(ism_blk_cache, new_iblk); 8901 8902 /* 8903 * Check TSB and TLB page sizes. 8904 */ 8905 if (sfmmup->sfmmu_scdp != NULL && old_scdp != sfmmup->sfmmu_scdp) { 8906 sfmmu_check_page_sizes(sfmmup, 0); 8907 } else { 8908 sfmmu_check_page_sizes(sfmmup, 1); 8909 } 8910 return (0); 8911 } 8912 8913 /* 8914 * hat_unshare removes exactly one ism_map from 8915 * this process's as. It expects multiple calls 8916 * to hat_unshare for multiple shm segments. 
8917 */ 8918 void 8919 hat_unshare(struct hat *sfmmup, caddr_t addr, size_t len, uint_t ismszc) 8920 { 8921 ism_map_t *ism_map; 8922 ism_ment_t *free_ment = NULL; 8923 ism_blk_t *ism_blkp; 8924 struct hat *ism_hatid; 8925 int found, i; 8926 hatlock_t *hatlockp; 8927 struct tsb_info *tsbinfo; 8928 uint_t ismshift = page_get_shift(ismszc); 8929 size_t sh_size = ISM_SHIFT(ismshift, len); 8930 uchar_t ism_rid; 8931 sf_scd_t *old_scdp; 8932 8933 ASSERT(ISM_ALIGNED(ismshift, addr)); 8934 ASSERT(ISM_ALIGNED(ismshift, len)); 8935 ASSERT(sfmmup != NULL); 8936 ASSERT(sfmmup != ksfmmup); 8937 8938 if (sfmmup->sfmmu_xhat_provider) { 8939 XHAT_UNSHARE(sfmmup, addr, len); 8940 return; 8941 } else { 8942 /* 8943 * This must be a CPU HAT. If the address space has 8944 * XHATs attached, inform all XHATs that ISM segment 8945 * is going away 8946 */ 8947 ASSERT(sfmmup->sfmmu_as != NULL); 8948 if (sfmmup->sfmmu_as->a_xhat != NULL) 8949 xhat_unshare_all(sfmmup->sfmmu_as, addr, len); 8950 } 8951 8952 /* 8953 * Make sure that during the entire time ISM mappings are removed, 8954 * the trap handlers serialize behind us, and that no one else 8955 * can be mucking with ISM mappings. This also lets us get away 8956 * with not doing expensive cross calls to flush the TLB -- we 8957 * just discard the context, flush the entire TSB, and call it 8958 * a day. 8959 */ 8960 sfmmu_ismhat_enter(sfmmup, 0); 8961 8962 /* 8963 * Remove the mapping. 8964 * 8965 * We can't have any holes in the ism map. 8966 * The tsb miss code while searching the ism map will 8967 * stop on an empty map slot. So we must move 8968 * everyone past the hole up 1 if any. 8969 * 8970 * Also empty ism map blks are not freed until the 8971 * process exits. This is to prevent a MT race condition 8972 * between sfmmu_unshare() and sfmmu_tsbmiss_exception(). 
8973 */ 8974 found = 0; 8975 ism_blkp = sfmmup->sfmmu_iblk; 8976 while (!found && ism_blkp != NULL) { 8977 ism_map = ism_blkp->iblk_maps; 8978 for (i = 0; i < ISM_MAP_SLOTS; i++) { 8979 if (addr == ism_start(ism_map[i]) && 8980 sh_size == (size_t)(ism_size(ism_map[i]))) { 8981 found = 1; 8982 break; 8983 } 8984 } 8985 if (!found) 8986 ism_blkp = ism_blkp->iblk_next; 8987 } 8988 8989 if (found) { 8990 ism_hatid = ism_map[i].imap_ismhat; 8991 ism_rid = ism_map[i].imap_rid; 8992 ASSERT(ism_hatid != NULL); 8993 ASSERT(ism_hatid->sfmmu_ismhat == 1); 8994 8995 /* 8996 * After hat_leave_region, the sfmmup may leave SCD, 8997 * in which case, we want to grow the private tsb size when 8998 * calling sfmmu_check_page_sizes at the end of the routine. 8999 */ 9000 old_scdp = sfmmup->sfmmu_scdp; 9001 /* 9002 * Then remove ourselves from the region. 9003 */ 9004 if (ism_rid != SFMMU_INVALID_ISMRID) { 9005 hat_leave_region(sfmmup, (void *)((uint64_t)ism_rid), 9006 HAT_REGION_ISM); 9007 } 9008 9009 /* 9010 * And now guarantee that any other cpu 9011 * that tries to process an ISM miss 9012 * will go to tl=0. 9013 */ 9014 hatlockp = sfmmu_hat_enter(sfmmup); 9015 sfmmu_invalidate_ctx(sfmmup); 9016 sfmmu_hat_exit(hatlockp); 9017 9018 /* 9019 * Remove ourselves from the ism mapping list. 9020 */ 9021 mutex_enter(&ism_mlist_lock); 9022 iment_sub(ism_map[i].imap_ment, ism_hatid); 9023 mutex_exit(&ism_mlist_lock); 9024 free_ment = ism_map[i].imap_ment; 9025 9026 /* 9027 * We delete the ism map by copying 9028 * the next map over the current one. 9029 * We will take the next one in the maps 9030 * array or from the next ism_blk. 
9031 */ 9032 while (ism_blkp != NULL) { 9033 ism_map = ism_blkp->iblk_maps; 9034 while (i < (ISM_MAP_SLOTS - 1)) { 9035 ism_map[i] = ism_map[i + 1]; 9036 i++; 9037 } 9038 /* i == (ISM_MAP_SLOTS - 1) */ 9039 ism_blkp = ism_blkp->iblk_next; 9040 if (ism_blkp != NULL) { 9041 ism_map[i] = ism_blkp->iblk_maps[0]; 9042 i = 0; 9043 } else { 9044 ism_map[i].imap_seg = 0; 9045 ism_map[i].imap_vb_shift = 0; 9046 ism_map[i].imap_rid = SFMMU_INVALID_ISMRID; 9047 ism_map[i].imap_hatflags = 0; 9048 ism_map[i].imap_sz_mask = 0; 9049 ism_map[i].imap_ismhat = NULL; 9050 ism_map[i].imap_ment = NULL; 9051 } 9052 } 9053 9054 /* 9055 * Now flush entire TSB for the process, since 9056 * demapping page by page can be too expensive. 9057 * We don't have to flush the TLB here anymore 9058 * since we switch to a new TLB ctx instead. 9059 * Also, there is no need to flush if the process 9060 * is exiting since the TSB will be freed later. 9061 */ 9062 if (!sfmmup->sfmmu_free) { 9063 hatlockp = sfmmu_hat_enter(sfmmup); 9064 for (tsbinfo = sfmmup->sfmmu_tsb; tsbinfo != NULL; 9065 tsbinfo = tsbinfo->tsb_next) { 9066 if (tsbinfo->tsb_flags & TSB_SWAPPED) 9067 continue; 9068 if (tsbinfo->tsb_flags & TSB_RELOC_FLAG) { 9069 tsbinfo->tsb_flags |= 9070 TSB_FLUSH_NEEDED; 9071 continue; 9072 } 9073 9074 sfmmu_inv_tsb(tsbinfo->tsb_va, 9075 TSB_BYTES(tsbinfo->tsb_szc)); 9076 } 9077 sfmmu_hat_exit(hatlockp); 9078 } 9079 } 9080 9081 /* 9082 * Update our counters for this sfmmup's ism mappings. 9083 */ 9084 for (i = 0; i <= ismszc; i++) { 9085 if (!(disable_ism_large_pages & (1 << i))) 9086 (void) ism_tsb_entries(sfmmup, i); 9087 } 9088 9089 sfmmu_ismhat_exit(sfmmup, 0); 9090 9091 /* 9092 * We must do our freeing here after dropping locks 9093 * to prevent a deadlock in the kmem allocator on the 9094 * mapping list lock. 9095 */ 9096 if (free_ment != NULL) 9097 kmem_cache_free(ism_ment_cache, free_ment); 9098 9099 /* 9100 * Check TSB and TLB page sizes if the process isn't exiting. 
9101 */ 9102 if (!sfmmup->sfmmu_free) { 9103 if (found && old_scdp != NULL && sfmmup->sfmmu_scdp == NULL) { 9104 sfmmu_check_page_sizes(sfmmup, 1); 9105 } else { 9106 sfmmu_check_page_sizes(sfmmup, 0); 9107 } 9108 } 9109 } 9110 9111 /* ARGSUSED */ 9112 static int 9113 sfmmu_idcache_constructor(void *buf, void *cdrarg, int kmflags) 9114 { 9115 /* void *buf is sfmmu_t pointer */ 9116 bzero(buf, sizeof (sfmmu_t)); 9117 9118 return (0); 9119 } 9120 9121 /* ARGSUSED */ 9122 static void 9123 sfmmu_idcache_destructor(void *buf, void *cdrarg) 9124 { 9125 /* void *buf is sfmmu_t pointer */ 9126 } 9127 9128 /* 9129 * setup kmem hmeblks by bzeroing all members and initializing the nextpa 9130 * field to be the pa of this hmeblk 9131 */ 9132 /* ARGSUSED */ 9133 static int 9134 sfmmu_hblkcache_constructor(void *buf, void *cdrarg, int kmflags) 9135 { 9136 struct hme_blk *hmeblkp; 9137 9138 bzero(buf, (size_t)cdrarg); 9139 hmeblkp = (struct hme_blk *)buf; 9140 hmeblkp->hblk_nextpa = va_to_pa((caddr_t)hmeblkp); 9141 9142 #ifdef HBLK_TRACE 9143 mutex_init(&hmeblkp->hblk_audit_lock, NULL, MUTEX_DEFAULT, NULL); 9144 #endif /* HBLK_TRACE */ 9145 9146 return (0); 9147 } 9148 9149 /* ARGSUSED */ 9150 static void 9151 sfmmu_hblkcache_destructor(void *buf, void *cdrarg) 9152 { 9153 9154 #ifdef HBLK_TRACE 9155 9156 struct hme_blk *hmeblkp; 9157 9158 hmeblkp = (struct hme_blk *)buf; 9159 mutex_destroy(&hmeblkp->hblk_audit_lock); 9160 9161 #endif /* HBLK_TRACE */ 9162 } 9163 9164 #define SFMMU_CACHE_RECLAIM_SCAN_RATIO 8 9165 static int sfmmu_cache_reclaim_scan_ratio = SFMMU_CACHE_RECLAIM_SCAN_RATIO; 9166 /* 9167 * The kmem allocator will callback into our reclaim routine when the system 9168 * is running low in memory. We traverse the hash and free up all unused but 9169 * still cached hme_blks. We also traverse the free list and free them up 9170 * as well. 
9171 */ 9172 /*ARGSUSED*/ 9173 static void 9174 sfmmu_hblkcache_reclaim(void *cdrarg) 9175 { 9176 int i; 9177 struct hmehash_bucket *hmebp; 9178 struct hme_blk *hmeblkp, *nx_hblk, *pr_hblk = NULL; 9179 static struct hmehash_bucket *uhmehash_reclaim_hand; 9180 static struct hmehash_bucket *khmehash_reclaim_hand; 9181 struct hme_blk *list = NULL, *last_hmeblkp; 9182 cpuset_t cpuset = cpu_ready_set; 9183 cpu_hme_pend_t *cpuhp; 9184 9185 /* Free up hmeblks on the cpu pending lists */ 9186 for (i = 0; i < NCPU; i++) { 9187 cpuhp = &cpu_hme_pend[i]; 9188 if (cpuhp->chp_listp != NULL) { 9189 mutex_enter(&cpuhp->chp_mutex); 9190 if (cpuhp->chp_listp == NULL) { 9191 mutex_exit(&cpuhp->chp_mutex); 9192 continue; 9193 } 9194 for (last_hmeblkp = cpuhp->chp_listp; 9195 last_hmeblkp->hblk_next != NULL; 9196 last_hmeblkp = last_hmeblkp->hblk_next) 9197 ; 9198 last_hmeblkp->hblk_next = list; 9199 list = cpuhp->chp_listp; 9200 cpuhp->chp_listp = NULL; 9201 cpuhp->chp_count = 0; 9202 mutex_exit(&cpuhp->chp_mutex); 9203 } 9204 9205 } 9206 9207 if (list != NULL) { 9208 kpreempt_disable(); 9209 CPUSET_DEL(cpuset, CPU->cpu_id); 9210 xt_sync(cpuset); 9211 xt_sync(cpuset); 9212 kpreempt_enable(); 9213 sfmmu_hblk_free(&list); 9214 list = NULL; 9215 } 9216 9217 hmebp = uhmehash_reclaim_hand; 9218 if (hmebp == NULL || hmebp > &uhme_hash[UHMEHASH_SZ]) 9219 uhmehash_reclaim_hand = hmebp = uhme_hash; 9220 uhmehash_reclaim_hand += UHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio; 9221 9222 for (i = UHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio; i; i--) { 9223 if (SFMMU_HASH_LOCK_TRYENTER(hmebp) != 0) { 9224 hmeblkp = hmebp->hmeblkp; 9225 pr_hblk = NULL; 9226 while (hmeblkp) { 9227 nx_hblk = hmeblkp->hblk_next; 9228 if (!hmeblkp->hblk_vcnt && 9229 !hmeblkp->hblk_hmecnt) { 9230 sfmmu_hblk_hash_rm(hmebp, hmeblkp, 9231 pr_hblk, &list, 0); 9232 } else { 9233 pr_hblk = hmeblkp; 9234 } 9235 hmeblkp = nx_hblk; 9236 } 9237 SFMMU_HASH_UNLOCK(hmebp); 9238 } 9239 if (hmebp++ == &uhme_hash[UHMEHASH_SZ]) 9240 
hmebp = uhme_hash; 9241 } 9242 9243 hmebp = khmehash_reclaim_hand; 9244 if (hmebp == NULL || hmebp > &khme_hash[KHMEHASH_SZ]) 9245 khmehash_reclaim_hand = hmebp = khme_hash; 9246 khmehash_reclaim_hand += KHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio; 9247 9248 for (i = KHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio; i; i--) { 9249 if (SFMMU_HASH_LOCK_TRYENTER(hmebp) != 0) { 9250 hmeblkp = hmebp->hmeblkp; 9251 pr_hblk = NULL; 9252 while (hmeblkp) { 9253 nx_hblk = hmeblkp->hblk_next; 9254 if (!hmeblkp->hblk_vcnt && 9255 !hmeblkp->hblk_hmecnt) { 9256 sfmmu_hblk_hash_rm(hmebp, hmeblkp, 9257 pr_hblk, &list, 0); 9258 } else { 9259 pr_hblk = hmeblkp; 9260 } 9261 hmeblkp = nx_hblk; 9262 } 9263 SFMMU_HASH_UNLOCK(hmebp); 9264 } 9265 if (hmebp++ == &khme_hash[KHMEHASH_SZ]) 9266 hmebp = khme_hash; 9267 } 9268 sfmmu_hblks_list_purge(&list, 0); 9269 } 9270 9271 /* 9272 * sfmmu_get_ppvcolor should become a vm_machdep or hatop interface. 9273 * same goes for sfmmu_get_addrvcolor(). 9274 * 9275 * This function will return the virtual color for the specified page. The 9276 * virtual color corresponds to this page current mapping or its last mapping. 9277 * It is used by memory allocators to choose addresses with the correct 9278 * alignment so vac consistency is automatically maintained. If the page 9279 * has no color it returns -1. 9280 */ 9281 /*ARGSUSED*/ 9282 int 9283 sfmmu_get_ppvcolor(struct page *pp) 9284 { 9285 #ifdef VAC 9286 int color; 9287 9288 if (!(cache & CACHE_VAC) || PP_NEWPAGE(pp)) { 9289 return (-1); 9290 } 9291 color = PP_GET_VCOLOR(pp); 9292 ASSERT(color < mmu_btop(shm_alignment)); 9293 return (color); 9294 #else 9295 return (-1); 9296 #endif /* VAC */ 9297 } 9298 9299 /* 9300 * This function will return the desired alignment for vac consistency 9301 * (vac color) given a virtual address. If no vac is present it returns -1. 
9302 */ 9303 /*ARGSUSED*/ 9304 int 9305 sfmmu_get_addrvcolor(caddr_t vaddr) 9306 { 9307 #ifdef VAC 9308 if (cache & CACHE_VAC) { 9309 return (addr_to_vcolor(vaddr)); 9310 } else { 9311 return (-1); 9312 } 9313 #else 9314 return (-1); 9315 #endif /* VAC */ 9316 } 9317 9318 #ifdef VAC 9319 /* 9320 * Check for conflicts. 9321 * A conflict exists if the new and existent mappings do not match in 9322 * their "shm_alignment fields. If conflicts exist, the existant mappings 9323 * are flushed unless one of them is locked. If one of them is locked, then 9324 * the mappings are flushed and converted to non-cacheable mappings. 9325 */ 9326 static void 9327 sfmmu_vac_conflict(struct hat *hat, caddr_t addr, page_t *pp) 9328 { 9329 struct hat *tmphat; 9330 struct sf_hment *sfhmep, *tmphme = NULL; 9331 struct hme_blk *hmeblkp; 9332 int vcolor; 9333 tte_t tte; 9334 9335 ASSERT(sfmmu_mlist_held(pp)); 9336 ASSERT(!PP_ISNC(pp)); /* page better be cacheable */ 9337 9338 vcolor = addr_to_vcolor(addr); 9339 if (PP_NEWPAGE(pp)) { 9340 PP_SET_VCOLOR(pp, vcolor); 9341 return; 9342 } 9343 9344 if (PP_GET_VCOLOR(pp) == vcolor) { 9345 return; 9346 } 9347 9348 if (!PP_ISMAPPED(pp) && !PP_ISMAPPED_KPM(pp)) { 9349 /* 9350 * Previous user of page had a different color 9351 * but since there are no current users 9352 * we just flush the cache and change the color. 9353 */ 9354 SFMMU_STAT(sf_pgcolor_conflict); 9355 sfmmu_cache_flush(pp->p_pagenum, PP_GET_VCOLOR(pp)); 9356 PP_SET_VCOLOR(pp, vcolor); 9357 return; 9358 } 9359 9360 /* 9361 * If we get here we have a vac conflict with a current 9362 * mapping. VAC conflict policy is as follows. 9363 * - The default is to unload the other mappings unless: 9364 * - If we have a large mapping we uncache the page. 9365 * We need to uncache the rest of the large page too. 9366 * - If any of the mappings are locked we uncache the page. 
9367 * - If the requested mapping is inconsistent 9368 * with another mapping and that mapping 9369 * is in the same address space we have to 9370 * make it non-cached. The default thing 9371 * to do is unload the inconsistent mapping 9372 * but if they are in the same address space 9373 * we run the risk of unmapping the pc or the 9374 * stack which we will use as we return to the user, 9375 * in which case we can then fault on the thing 9376 * we just unloaded and get into an infinite loop. 9377 */ 9378 if (PP_ISMAPPED_LARGE(pp)) { 9379 int sz; 9380 9381 /* 9382 * Existing mapping is for big pages. We don't unload 9383 * existing big mappings to satisfy new mappings. 9384 * Always convert all mappings to TNC. 9385 */ 9386 sz = fnd_mapping_sz(pp); 9387 pp = PP_GROUPLEADER(pp, sz); 9388 SFMMU_STAT_ADD(sf_uncache_conflict, TTEPAGES(sz)); 9389 sfmmu_page_cache_array(pp, HAT_TMPNC, CACHE_FLUSH, 9390 TTEPAGES(sz)); 9391 9392 return; 9393 } 9394 9395 /* 9396 * check if any mapping is in same as or if it is locked 9397 * since in that case we need to uncache. 9398 */ 9399 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = tmphme) { 9400 tmphme = sfhmep->hme_next; 9401 if (IS_PAHME(sfhmep)) 9402 continue; 9403 hmeblkp = sfmmu_hmetohblk(sfhmep); 9404 if (hmeblkp->hblk_xhat_bit) 9405 continue; 9406 tmphat = hblktosfmmu(hmeblkp); 9407 sfmmu_copytte(&sfhmep->hme_tte, &tte); 9408 ASSERT(TTE_IS_VALID(&tte)); 9409 if (hmeblkp->hblk_shared || tmphat == hat || 9410 hmeblkp->hblk_lckcnt) { 9411 /* 9412 * We have an uncache conflict 9413 */ 9414 SFMMU_STAT(sf_uncache_conflict); 9415 sfmmu_page_cache_array(pp, HAT_TMPNC, CACHE_FLUSH, 1); 9416 return; 9417 } 9418 } 9419 9420 /* 9421 * We have an unload conflict 9422 * We have already checked for LARGE mappings, therefore 9423 * the remaining mapping(s) must be TTE8K. 
9424 */ 9425 SFMMU_STAT(sf_unload_conflict); 9426 9427 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = tmphme) { 9428 tmphme = sfhmep->hme_next; 9429 if (IS_PAHME(sfhmep)) 9430 continue; 9431 hmeblkp = sfmmu_hmetohblk(sfhmep); 9432 if (hmeblkp->hblk_xhat_bit) 9433 continue; 9434 ASSERT(!hmeblkp->hblk_shared); 9435 (void) sfmmu_pageunload(pp, sfhmep, TTE8K); 9436 } 9437 9438 if (PP_ISMAPPED_KPM(pp)) 9439 sfmmu_kpm_vac_unload(pp, addr); 9440 9441 /* 9442 * Unloads only do TLB flushes so we need to flush the 9443 * cache here. 9444 */ 9445 sfmmu_cache_flush(pp->p_pagenum, PP_GET_VCOLOR(pp)); 9446 PP_SET_VCOLOR(pp, vcolor); 9447 } 9448 9449 /* 9450 * Whenever a mapping is unloaded and the page is in TNC state, 9451 * we see if the page can be made cacheable again. 'pp' is 9452 * the page that we just unloaded a mapping from, the size 9453 * of mapping that was unloaded is 'ottesz'. 9454 * Remark: 9455 * The recache policy for mpss pages can leave a performance problem 9456 * under the following circumstances: 9457 * . A large page in uncached mode has just been unmapped. 9458 * . All constituent pages are TNC due to a conflicting small mapping. 9459 * . There are many other, non conflicting, small mappings around for 9460 * a lot of the constituent pages. 9461 * . We're called w/ the "old" groupleader page and the old ottesz, 9462 * but this is irrelevant, since we're no more "PP_ISMAPPED_LARGE", so 9463 * we end up w/ TTE8K or npages == 1. 9464 * . We call tst_tnc w/ the old groupleader only, and if there is no 9465 * conflict, we re-cache only this page. 9466 * . All other small mappings are not checked and will be left in TNC mode. 9467 * The problem is not very serious because: 9468 * . mpss is actually only defined for heap and stack, so the probability 9469 * is not very high that a large page mapping exists in parallel to a small 9470 * one (this is possible, but seems to be bad programming style in the 9471 * appl). 9472 * . 
The problem gets a little bit more serious, when those TNC pages 9473 * have to be mapped into kernel space, e.g. for networking. 9474 * . When VAC alias conflicts occur in applications, this is regarded 9475 * as an application bug. So if kstat's show them, the appl should 9476 * be changed anyway. 9477 */ 9478 void 9479 conv_tnc(page_t *pp, int ottesz) 9480 { 9481 int cursz, dosz; 9482 pgcnt_t curnpgs, dopgs; 9483 pgcnt_t pg64k; 9484 page_t *pp2; 9485 9486 /* 9487 * Determine how big a range we check for TNC and find 9488 * leader page. cursz is the size of the biggest 9489 * mapping that still exist on 'pp'. 9490 */ 9491 if (PP_ISMAPPED_LARGE(pp)) { 9492 cursz = fnd_mapping_sz(pp); 9493 } else { 9494 cursz = TTE8K; 9495 } 9496 9497 if (ottesz >= cursz) { 9498 dosz = ottesz; 9499 pp2 = pp; 9500 } else { 9501 dosz = cursz; 9502 pp2 = PP_GROUPLEADER(pp, dosz); 9503 } 9504 9505 pg64k = TTEPAGES(TTE64K); 9506 dopgs = TTEPAGES(dosz); 9507 9508 ASSERT(dopgs == 1 || ((dopgs & (pg64k - 1)) == 0)); 9509 9510 while (dopgs != 0) { 9511 curnpgs = TTEPAGES(cursz); 9512 if (tst_tnc(pp2, curnpgs)) { 9513 SFMMU_STAT_ADD(sf_recache, curnpgs); 9514 sfmmu_page_cache_array(pp2, HAT_CACHE, CACHE_NO_FLUSH, 9515 curnpgs); 9516 } 9517 9518 ASSERT(dopgs >= curnpgs); 9519 dopgs -= curnpgs; 9520 9521 if (dopgs == 0) { 9522 break; 9523 } 9524 9525 pp2 = PP_PAGENEXT_N(pp2, curnpgs); 9526 if (((dopgs & (pg64k - 1)) == 0) && PP_ISMAPPED_LARGE(pp2)) { 9527 cursz = fnd_mapping_sz(pp2); 9528 } else { 9529 cursz = TTE8K; 9530 } 9531 } 9532 } 9533 9534 /* 9535 * Returns 1 if page(s) can be converted from TNC to cacheable setting, 9536 * returns 0 otherwise. Note that oaddr argument is valid for only 9537 * 8k pages. 
9538 */ 9539 int 9540 tst_tnc(page_t *pp, pgcnt_t npages) 9541 { 9542 struct sf_hment *sfhme; 9543 struct hme_blk *hmeblkp; 9544 tte_t tte; 9545 caddr_t vaddr; 9546 int clr_valid = 0; 9547 int color, color1, bcolor; 9548 int i, ncolors; 9549 9550 ASSERT(pp != NULL); 9551 ASSERT(!(cache & CACHE_WRITEBACK)); 9552 9553 if (npages > 1) { 9554 ncolors = CACHE_NUM_COLOR; 9555 } 9556 9557 for (i = 0; i < npages; i++) { 9558 ASSERT(sfmmu_mlist_held(pp)); 9559 ASSERT(PP_ISTNC(pp)); 9560 ASSERT(PP_GET_VCOLOR(pp) == NO_VCOLOR); 9561 9562 if (PP_ISPNC(pp)) { 9563 return (0); 9564 } 9565 9566 clr_valid = 0; 9567 if (PP_ISMAPPED_KPM(pp)) { 9568 caddr_t kpmvaddr; 9569 9570 ASSERT(kpm_enable); 9571 kpmvaddr = hat_kpm_page2va(pp, 1); 9572 ASSERT(!(npages > 1 && IS_KPM_ALIAS_RANGE(kpmvaddr))); 9573 color1 = addr_to_vcolor(kpmvaddr); 9574 clr_valid = 1; 9575 } 9576 9577 for (sfhme = pp->p_mapping; sfhme; sfhme = sfhme->hme_next) { 9578 if (IS_PAHME(sfhme)) 9579 continue; 9580 hmeblkp = sfmmu_hmetohblk(sfhme); 9581 if (hmeblkp->hblk_xhat_bit) 9582 continue; 9583 9584 sfmmu_copytte(&sfhme->hme_tte, &tte); 9585 ASSERT(TTE_IS_VALID(&tte)); 9586 9587 vaddr = tte_to_vaddr(hmeblkp, tte); 9588 color = addr_to_vcolor(vaddr); 9589 9590 if (npages > 1) { 9591 /* 9592 * If there is a big mapping, make sure 9593 * 8K mapping is consistent with the big 9594 * mapping. 
9595 */ 9596 bcolor = i % ncolors; 9597 if (color != bcolor) { 9598 return (0); 9599 } 9600 } 9601 if (!clr_valid) { 9602 clr_valid = 1; 9603 color1 = color; 9604 } 9605 9606 if (color1 != color) { 9607 return (0); 9608 } 9609 } 9610 9611 pp = PP_PAGENEXT(pp); 9612 } 9613 9614 return (1); 9615 } 9616 9617 void 9618 sfmmu_page_cache_array(page_t *pp, int flags, int cache_flush_flag, 9619 pgcnt_t npages) 9620 { 9621 kmutex_t *pmtx; 9622 int i, ncolors, bcolor; 9623 kpm_hlk_t *kpmp; 9624 cpuset_t cpuset; 9625 9626 ASSERT(pp != NULL); 9627 ASSERT(!(cache & CACHE_WRITEBACK)); 9628 9629 kpmp = sfmmu_kpm_kpmp_enter(pp, npages); 9630 pmtx = sfmmu_page_enter(pp); 9631 9632 /* 9633 * Fast path caching single unmapped page 9634 */ 9635 if (npages == 1 && !PP_ISMAPPED(pp) && !PP_ISMAPPED_KPM(pp) && 9636 flags == HAT_CACHE) { 9637 PP_CLRTNC(pp); 9638 PP_CLRPNC(pp); 9639 sfmmu_page_exit(pmtx); 9640 sfmmu_kpm_kpmp_exit(kpmp); 9641 return; 9642 } 9643 9644 /* 9645 * We need to capture all cpus in order to change cacheability 9646 * because we can't allow one cpu to access the same physical 9647 * page using a cacheable and a non-cachebale mapping at the same 9648 * time. Since we may end up walking the ism mapping list 9649 * have to grab it's lock now since we can't after all the 9650 * cpus have been captured. 9651 */ 9652 sfmmu_hat_lock_all(); 9653 mutex_enter(&ism_mlist_lock); 9654 kpreempt_disable(); 9655 cpuset = cpu_ready_set; 9656 xc_attention(cpuset); 9657 9658 if (npages > 1) { 9659 /* 9660 * Make sure all colors are flushed since the 9661 * sfmmu_page_cache() only flushes one color- 9662 * it does not know big pages. 
9663 */ 9664 ncolors = CACHE_NUM_COLOR; 9665 if (flags & HAT_TMPNC) { 9666 for (i = 0; i < ncolors; i++) { 9667 sfmmu_cache_flushcolor(i, pp->p_pagenum); 9668 } 9669 cache_flush_flag = CACHE_NO_FLUSH; 9670 } 9671 } 9672 9673 for (i = 0; i < npages; i++) { 9674 9675 ASSERT(sfmmu_mlist_held(pp)); 9676 9677 if (!(flags == HAT_TMPNC && PP_ISTNC(pp))) { 9678 9679 if (npages > 1) { 9680 bcolor = i % ncolors; 9681 } else { 9682 bcolor = NO_VCOLOR; 9683 } 9684 9685 sfmmu_page_cache(pp, flags, cache_flush_flag, 9686 bcolor); 9687 } 9688 9689 pp = PP_PAGENEXT(pp); 9690 } 9691 9692 xt_sync(cpuset); 9693 xc_dismissed(cpuset); 9694 mutex_exit(&ism_mlist_lock); 9695 sfmmu_hat_unlock_all(); 9696 sfmmu_page_exit(pmtx); 9697 sfmmu_kpm_kpmp_exit(kpmp); 9698 kpreempt_enable(); 9699 } 9700 9701 /* 9702 * This function changes the virtual cacheability of all mappings to a 9703 * particular page. When changing from uncache to cacheable the mappings will 9704 * only be changed if all of them have the same virtual color. 9705 * We need to flush the cache in all cpus. It is possible that 9706 * a process referenced a page as cacheable but has sinced exited 9707 * and cleared the mapping list. We still to flush it but have no 9708 * state so all cpus is the only alternative. 
9709 */ 9710 static void 9711 sfmmu_page_cache(page_t *pp, int flags, int cache_flush_flag, int bcolor) 9712 { 9713 struct sf_hment *sfhme; 9714 struct hme_blk *hmeblkp; 9715 sfmmu_t *sfmmup; 9716 tte_t tte, ttemod; 9717 caddr_t vaddr; 9718 int ret, color; 9719 pfn_t pfn; 9720 9721 color = bcolor; 9722 pfn = pp->p_pagenum; 9723 9724 for (sfhme = pp->p_mapping; sfhme; sfhme = sfhme->hme_next) { 9725 9726 if (IS_PAHME(sfhme)) 9727 continue; 9728 hmeblkp = sfmmu_hmetohblk(sfhme); 9729 9730 if (hmeblkp->hblk_xhat_bit) 9731 continue; 9732 9733 sfmmu_copytte(&sfhme->hme_tte, &tte); 9734 ASSERT(TTE_IS_VALID(&tte)); 9735 vaddr = tte_to_vaddr(hmeblkp, tte); 9736 color = addr_to_vcolor(vaddr); 9737 9738 #ifdef DEBUG 9739 if ((flags & HAT_CACHE) && bcolor != NO_VCOLOR) { 9740 ASSERT(color == bcolor); 9741 } 9742 #endif 9743 9744 ASSERT(flags != HAT_TMPNC || color == PP_GET_VCOLOR(pp)); 9745 9746 ttemod = tte; 9747 if (flags & (HAT_UNCACHE | HAT_TMPNC)) { 9748 TTE_CLR_VCACHEABLE(&ttemod); 9749 } else { /* flags & HAT_CACHE */ 9750 TTE_SET_VCACHEABLE(&ttemod); 9751 } 9752 ret = sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte); 9753 if (ret < 0) { 9754 /* 9755 * Since all cpus are captured modifytte should not 9756 * fail. 
9757 */ 9758 panic("sfmmu_page_cache: write to tte failed"); 9759 } 9760 9761 sfmmup = hblktosfmmu(hmeblkp); 9762 if (cache_flush_flag == CACHE_FLUSH) { 9763 /* 9764 * Flush TSBs, TLBs and caches 9765 */ 9766 if (hmeblkp->hblk_shared) { 9767 sf_srd_t *srdp = (sf_srd_t *)sfmmup; 9768 uint_t rid = hmeblkp->hblk_tag.htag_rid; 9769 sf_region_t *rgnp; 9770 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 9771 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 9772 ASSERT(srdp != NULL); 9773 rgnp = srdp->srd_hmergnp[rid]; 9774 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, 9775 srdp, rgnp, rid); 9776 (void) sfmmu_rgntlb_demap(vaddr, rgnp, 9777 hmeblkp, 0); 9778 sfmmu_cache_flush(pfn, addr_to_vcolor(vaddr)); 9779 } else if (sfmmup->sfmmu_ismhat) { 9780 if (flags & HAT_CACHE) { 9781 SFMMU_STAT(sf_ism_recache); 9782 } else { 9783 SFMMU_STAT(sf_ism_uncache); 9784 } 9785 sfmmu_ismtlbcache_demap(vaddr, sfmmup, hmeblkp, 9786 pfn, CACHE_FLUSH); 9787 } else { 9788 sfmmu_tlbcache_demap(vaddr, sfmmup, hmeblkp, 9789 pfn, 0, FLUSH_ALL_CPUS, CACHE_FLUSH, 1); 9790 } 9791 9792 /* 9793 * all cache entries belonging to this pfn are 9794 * now flushed. 9795 */ 9796 cache_flush_flag = CACHE_NO_FLUSH; 9797 } else { 9798 /* 9799 * Flush only TSBs and TLBs. 
9800 */ 9801 if (hmeblkp->hblk_shared) { 9802 sf_srd_t *srdp = (sf_srd_t *)sfmmup; 9803 uint_t rid = hmeblkp->hblk_tag.htag_rid; 9804 sf_region_t *rgnp; 9805 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 9806 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 9807 ASSERT(srdp != NULL); 9808 rgnp = srdp->srd_hmergnp[rid]; 9809 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, 9810 srdp, rgnp, rid); 9811 (void) sfmmu_rgntlb_demap(vaddr, rgnp, 9812 hmeblkp, 0); 9813 } else if (sfmmup->sfmmu_ismhat) { 9814 if (flags & HAT_CACHE) { 9815 SFMMU_STAT(sf_ism_recache); 9816 } else { 9817 SFMMU_STAT(sf_ism_uncache); 9818 } 9819 sfmmu_ismtlbcache_demap(vaddr, sfmmup, hmeblkp, 9820 pfn, CACHE_NO_FLUSH); 9821 } else { 9822 sfmmu_tlb_demap(vaddr, sfmmup, hmeblkp, 0, 1); 9823 } 9824 } 9825 } 9826 9827 if (PP_ISMAPPED_KPM(pp)) 9828 sfmmu_kpm_page_cache(pp, flags, cache_flush_flag); 9829 9830 switch (flags) { 9831 9832 default: 9833 panic("sfmmu_pagecache: unknown flags"); 9834 break; 9835 9836 case HAT_CACHE: 9837 PP_CLRTNC(pp); 9838 PP_CLRPNC(pp); 9839 PP_SET_VCOLOR(pp, color); 9840 break; 9841 9842 case HAT_TMPNC: 9843 PP_SETTNC(pp); 9844 PP_SET_VCOLOR(pp, NO_VCOLOR); 9845 break; 9846 9847 case HAT_UNCACHE: 9848 PP_SETPNC(pp); 9849 PP_CLRTNC(pp); 9850 PP_SET_VCOLOR(pp, NO_VCOLOR); 9851 break; 9852 } 9853 } 9854 #endif /* VAC */ 9855 9856 9857 /* 9858 * Wrapper routine used to return a context. 9859 * 9860 * It's the responsibility of the caller to guarantee that the 9861 * process serializes on calls here by taking the HAT lock for 9862 * the hat. 
9863 * 9864 */ 9865 static void 9866 sfmmu_get_ctx(sfmmu_t *sfmmup) 9867 { 9868 mmu_ctx_t *mmu_ctxp; 9869 uint_t pstate_save; 9870 int ret; 9871 9872 ASSERT(sfmmu_hat_lock_held(sfmmup)); 9873 ASSERT(sfmmup != ksfmmup); 9874 9875 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_ALLCTX_INVALID)) { 9876 sfmmu_setup_tsbinfo(sfmmup); 9877 SFMMU_FLAGS_CLEAR(sfmmup, HAT_ALLCTX_INVALID); 9878 } 9879 9880 kpreempt_disable(); 9881 9882 mmu_ctxp = CPU_MMU_CTXP(CPU); 9883 ASSERT(mmu_ctxp); 9884 ASSERT(mmu_ctxp->mmu_idx < max_mmu_ctxdoms); 9885 ASSERT(mmu_ctxp == mmu_ctxs_tbl[mmu_ctxp->mmu_idx]); 9886 9887 /* 9888 * Do a wrap-around if cnum reaches the max # cnum supported by a MMU. 9889 */ 9890 if (mmu_ctxp->mmu_cnum == mmu_ctxp->mmu_nctxs) 9891 sfmmu_ctx_wrap_around(mmu_ctxp, B_TRUE); 9892 9893 /* 9894 * Let the MMU set up the page sizes to use for 9895 * this context in the TLB. Don't program 2nd dtlb for ism hat. 9896 */ 9897 if ((&mmu_set_ctx_page_sizes) && (sfmmup->sfmmu_ismhat == 0)) { 9898 mmu_set_ctx_page_sizes(sfmmup); 9899 } 9900 9901 /* 9902 * sfmmu_alloc_ctx and sfmmu_load_mmustate will be performed with 9903 * interrupts disabled to prevent race condition with wrap-around 9904 * ctx invalidatation. In sun4v, ctx invalidation also involves 9905 * a HV call to set the number of TSBs to 0. If interrupts are not 9906 * disabled until after sfmmu_load_mmustate is complete TSBs may 9907 * become assigned to INVALID_CONTEXT. This is not allowed. 
9908 */ 9909 pstate_save = sfmmu_disable_intrs(); 9910 9911 if (sfmmu_alloc_ctx(sfmmup, 1, CPU, SFMMU_PRIVATE) && 9912 sfmmup->sfmmu_scdp != NULL) { 9913 sf_scd_t *scdp = sfmmup->sfmmu_scdp; 9914 sfmmu_t *scsfmmup = scdp->scd_sfmmup; 9915 ret = sfmmu_alloc_ctx(scsfmmup, 1, CPU, SFMMU_SHARED); 9916 /* debug purpose only */ 9917 ASSERT(!ret || scsfmmup->sfmmu_ctxs[CPU_MMU_IDX(CPU)].cnum 9918 != INVALID_CONTEXT); 9919 } 9920 sfmmu_load_mmustate(sfmmup); 9921 9922 sfmmu_enable_intrs(pstate_save); 9923 9924 kpreempt_enable(); 9925 } 9926 9927 /* 9928 * When all cnums are used up in a MMU, cnum will wrap around to the 9929 * next generation and start from 2. 9930 */ 9931 static void 9932 sfmmu_ctx_wrap_around(mmu_ctx_t *mmu_ctxp, boolean_t reset_cnum) 9933 { 9934 9935 /* caller must have disabled the preemption */ 9936 ASSERT(curthread->t_preempt >= 1); 9937 ASSERT(mmu_ctxp != NULL); 9938 9939 /* acquire Per-MMU (PM) spin lock */ 9940 mutex_enter(&mmu_ctxp->mmu_lock); 9941 9942 /* re-check to see if wrap-around is needed */ 9943 if (mmu_ctxp->mmu_cnum < mmu_ctxp->mmu_nctxs) 9944 goto done; 9945 9946 SFMMU_MMU_STAT(mmu_wrap_around); 9947 9948 /* update gnum */ 9949 ASSERT(mmu_ctxp->mmu_gnum != 0); 9950 mmu_ctxp->mmu_gnum++; 9951 if (mmu_ctxp->mmu_gnum == 0 || 9952 mmu_ctxp->mmu_gnum > MAX_SFMMU_GNUM_VAL) { 9953 cmn_err(CE_PANIC, "mmu_gnum of mmu_ctx 0x%p is out of bound.", 9954 (void *)mmu_ctxp); 9955 } 9956 9957 if (mmu_ctxp->mmu_ncpus > 1) { 9958 cpuset_t cpuset; 9959 9960 membar_enter(); /* make sure updated gnum visible */ 9961 9962 SFMMU_XCALL_STATS(NULL); 9963 9964 /* xcall to others on the same MMU to invalidate ctx */ 9965 cpuset = mmu_ctxp->mmu_cpuset; 9966 ASSERT(CPU_IN_SET(cpuset, CPU->cpu_id) || !reset_cnum); 9967 CPUSET_DEL(cpuset, CPU->cpu_id); 9968 CPUSET_AND(cpuset, cpu_ready_set); 9969 9970 /* 9971 * Pass in INVALID_CONTEXT as the first parameter to 9972 * sfmmu_raise_tsb_exception, which invalidates the context 9973 * of any process running on the CPUs 
in the MMU. 9974 */ 9975 xt_some(cpuset, sfmmu_raise_tsb_exception, 9976 INVALID_CONTEXT, INVALID_CONTEXT); 9977 xt_sync(cpuset); 9978 9979 SFMMU_MMU_STAT(mmu_tsb_raise_exception); 9980 } 9981 9982 if (sfmmu_getctx_sec() != INVALID_CONTEXT) { 9983 sfmmu_setctx_sec(INVALID_CONTEXT); 9984 sfmmu_clear_utsbinfo(); 9985 } 9986 9987 /* 9988 * No xcall is needed here. For sun4u systems all CPUs in context 9989 * domain share a single physical MMU therefore it's enough to flush 9990 * TLB on local CPU. On sun4v systems we use 1 global context 9991 * domain and flush all remote TLBs in sfmmu_raise_tsb_exception 9992 * handler. Note that vtag_flushall_uctxs() is called 9993 * for Ultra II machine, where the equivalent flushall functionality 9994 * is implemented in SW, and only user ctx TLB entries are flushed. 9995 */ 9996 if (&vtag_flushall_uctxs != NULL) { 9997 vtag_flushall_uctxs(); 9998 } else { 9999 vtag_flushall(); 10000 } 10001 10002 /* reset mmu cnum, skips cnum 0 and 1 */ 10003 if (reset_cnum == B_TRUE) 10004 mmu_ctxp->mmu_cnum = NUM_LOCKED_CTXS; 10005 10006 done: 10007 mutex_exit(&mmu_ctxp->mmu_lock); 10008 } 10009 10010 10011 /* 10012 * For multi-threaded process, set the process context to INVALID_CONTEXT 10013 * so that it faults and reloads the MMU state from TL=0. For single-threaded 10014 * process, we can just load the MMU state directly without having to 10015 * set context invalid. Caller must hold the hat lock since we don't 10016 * acquire it here. 10017 */ 10018 static void 10019 sfmmu_sync_mmustate(sfmmu_t *sfmmup) 10020 { 10021 uint_t cnum; 10022 uint_t pstate_save; 10023 10024 ASSERT(sfmmup != ksfmmup); 10025 ASSERT(sfmmu_hat_lock_held(sfmmup)); 10026 10027 kpreempt_disable(); 10028 10029 /* 10030 * We check whether the pass'ed-in sfmmup is the same as the 10031 * current running proc. This is to makes sure the current proc 10032 * stays single-threaded if it already is. 
10033 */ 10034 if ((sfmmup == curthread->t_procp->p_as->a_hat) && 10035 (curthread->t_procp->p_lwpcnt == 1)) { 10036 /* single-thread */ 10037 cnum = sfmmup->sfmmu_ctxs[CPU_MMU_IDX(CPU)].cnum; 10038 if (cnum != INVALID_CONTEXT) { 10039 uint_t curcnum; 10040 /* 10041 * Disable interrupts to prevent race condition 10042 * with sfmmu_ctx_wrap_around ctx invalidation. 10043 * In sun4v, ctx invalidation involves setting 10044 * TSB to NULL, hence, interrupts should be disabled 10045 * untill after sfmmu_load_mmustate is completed. 10046 */ 10047 pstate_save = sfmmu_disable_intrs(); 10048 curcnum = sfmmu_getctx_sec(); 10049 if (curcnum == cnum) 10050 sfmmu_load_mmustate(sfmmup); 10051 sfmmu_enable_intrs(pstate_save); 10052 ASSERT(curcnum == cnum || curcnum == INVALID_CONTEXT); 10053 } 10054 } else { 10055 /* 10056 * multi-thread 10057 * or when sfmmup is not the same as the curproc. 10058 */ 10059 sfmmu_invalidate_ctx(sfmmup); 10060 } 10061 10062 kpreempt_enable(); 10063 } 10064 10065 10066 /* 10067 * Replace the specified TSB with a new TSB. This function gets called when 10068 * we grow, shrink or swapin a TSB. When swapping in a TSB (TSB_SWAPIN), the 10069 * TSB_FORCEALLOC flag may be used to force allocation of a minimum-sized TSB 10070 * (8K). 10071 * 10072 * Caller must hold the HAT lock, but should assume any tsb_info 10073 * pointers it has are no longer valid after calling this function. 10074 * 10075 * Return values: 10076 * TSB_ALLOCFAIL Failed to allocate a TSB, due to memory constraints 10077 * TSB_LOSTRACE HAT is busy, i.e. 
another thread is already doing 10078 * something to this tsbinfo/TSB 10079 * TSB_SUCCESS Operation succeeded 10080 */ 10081 static tsb_replace_rc_t 10082 sfmmu_replace_tsb(sfmmu_t *sfmmup, struct tsb_info *old_tsbinfo, uint_t szc, 10083 hatlock_t *hatlockp, uint_t flags) 10084 { 10085 struct tsb_info *new_tsbinfo = NULL; 10086 struct tsb_info *curtsb, *prevtsb; 10087 uint_t tte_sz_mask; 10088 int i; 10089 10090 ASSERT(sfmmup != ksfmmup); 10091 ASSERT(sfmmup->sfmmu_ismhat == 0); 10092 ASSERT(sfmmu_hat_lock_held(sfmmup)); 10093 ASSERT(szc <= tsb_max_growsize); 10094 10095 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_BUSY)) 10096 return (TSB_LOSTRACE); 10097 10098 /* 10099 * Find the tsb_info ahead of this one in the list, and 10100 * also make sure that the tsb_info passed in really 10101 * exists! 10102 */ 10103 for (prevtsb = NULL, curtsb = sfmmup->sfmmu_tsb; 10104 curtsb != old_tsbinfo && curtsb != NULL; 10105 prevtsb = curtsb, curtsb = curtsb->tsb_next) 10106 ; 10107 ASSERT(curtsb != NULL); 10108 10109 if (!(flags & TSB_SWAPIN) && SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) { 10110 /* 10111 * The process is swapped out, so just set the new size 10112 * code. When it swaps back in, we'll allocate a new one 10113 * of the new chosen size. 10114 */ 10115 curtsb->tsb_szc = szc; 10116 return (TSB_SUCCESS); 10117 } 10118 SFMMU_FLAGS_SET(sfmmup, HAT_BUSY); 10119 10120 tte_sz_mask = old_tsbinfo->tsb_ttesz_mask; 10121 10122 /* 10123 * All initialization is done inside of sfmmu_tsbinfo_alloc(). 10124 * If we fail to allocate a TSB, exit. 10125 * 10126 * If tsb grows with new tsb size > 4M and old tsb size < 4M, 10127 * then try 4M slab after the initial alloc fails. 10128 * 10129 * If tsb swapin with tsb size > 4M, then try 4M after the 10130 * initial alloc fails. 
10131 */ 10132 sfmmu_hat_exit(hatlockp); 10133 if (sfmmu_tsbinfo_alloc(&new_tsbinfo, szc, 10134 tte_sz_mask, flags, sfmmup) && 10135 (!(flags & (TSB_GROW | TSB_SWAPIN)) || (szc <= TSB_4M_SZCODE) || 10136 (!(flags & TSB_SWAPIN) && 10137 (old_tsbinfo->tsb_szc >= TSB_4M_SZCODE)) || 10138 sfmmu_tsbinfo_alloc(&new_tsbinfo, TSB_4M_SZCODE, 10139 tte_sz_mask, flags, sfmmup))) { 10140 (void) sfmmu_hat_enter(sfmmup); 10141 if (!(flags & TSB_SWAPIN)) 10142 SFMMU_STAT(sf_tsb_resize_failures); 10143 SFMMU_FLAGS_CLEAR(sfmmup, HAT_BUSY); 10144 return (TSB_ALLOCFAIL); 10145 } 10146 (void) sfmmu_hat_enter(sfmmup); 10147 10148 /* 10149 * Re-check to make sure somebody else didn't muck with us while we 10150 * didn't hold the HAT lock. If the process swapped out, fine, just 10151 * exit; this can happen if we try to shrink the TSB from the context 10152 * of another process (such as on an ISM unmap), though it is rare. 10153 */ 10154 if (!(flags & TSB_SWAPIN) && SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) { 10155 SFMMU_STAT(sf_tsb_resize_failures); 10156 SFMMU_FLAGS_CLEAR(sfmmup, HAT_BUSY); 10157 sfmmu_hat_exit(hatlockp); 10158 sfmmu_tsbinfo_free(new_tsbinfo); 10159 (void) sfmmu_hat_enter(sfmmup); 10160 return (TSB_LOSTRACE); 10161 } 10162 10163 #ifdef DEBUG 10164 /* Reverify that the tsb_info still exists.. for debugging only */ 10165 for (prevtsb = NULL, curtsb = sfmmup->sfmmu_tsb; 10166 curtsb != old_tsbinfo && curtsb != NULL; 10167 prevtsb = curtsb, curtsb = curtsb->tsb_next) 10168 ; 10169 ASSERT(curtsb != NULL); 10170 #endif /* DEBUG */ 10171 10172 /* 10173 * Quiesce any CPUs running this process on their next TLB miss 10174 * so they atomically see the new tsb_info. We temporarily set the 10175 * context to invalid context so new threads that come on processor 10176 * after we do the xcall to cpusran will also serialize behind the 10177 * HAT lock on TLB miss and will see the new TSB. 
Since this short 10178 * race with a new thread coming on processor is relatively rare, 10179 * this synchronization mechanism should be cheaper than always 10180 * pausing all CPUs for the duration of the setup, which is what 10181 * the old implementation did. This is particuarly true if we are 10182 * copying a huge chunk of memory around during that window. 10183 * 10184 * The memory barriers are to make sure things stay consistent 10185 * with resume() since it does not hold the HAT lock while 10186 * walking the list of tsb_info structures. 10187 */ 10188 if ((flags & TSB_SWAPIN) != TSB_SWAPIN) { 10189 /* The TSB is either growing or shrinking. */ 10190 sfmmu_invalidate_ctx(sfmmup); 10191 } else { 10192 /* 10193 * It is illegal to swap in TSBs from a process other 10194 * than a process being swapped in. This in turn 10195 * implies we do not have a valid MMU context here 10196 * since a process needs one to resolve translation 10197 * misses. 10198 */ 10199 ASSERT(curthread->t_procp->p_as->a_hat == sfmmup); 10200 } 10201 10202 #ifdef DEBUG 10203 ASSERT(max_mmu_ctxdoms > 0); 10204 10205 /* 10206 * Process should have INVALID_CONTEXT on all MMUs 10207 */ 10208 for (i = 0; i < max_mmu_ctxdoms; i++) { 10209 10210 ASSERT(sfmmup->sfmmu_ctxs[i].cnum == INVALID_CONTEXT); 10211 } 10212 #endif 10213 10214 new_tsbinfo->tsb_next = old_tsbinfo->tsb_next; 10215 membar_stst(); /* strict ordering required */ 10216 if (prevtsb) 10217 prevtsb->tsb_next = new_tsbinfo; 10218 else 10219 sfmmup->sfmmu_tsb = new_tsbinfo; 10220 membar_enter(); /* make sure new TSB globally visible */ 10221 10222 /* 10223 * We need to migrate TSB entries from the old TSB to the new TSB 10224 * if tsb_remap_ttes is set and the TSB is growing. 10225 */ 10226 if (tsb_remap_ttes && ((flags & TSB_GROW) == TSB_GROW)) 10227 sfmmu_copy_tsb(old_tsbinfo, new_tsbinfo); 10228 10229 SFMMU_FLAGS_CLEAR(sfmmup, HAT_BUSY); 10230 10231 /* 10232 * Drop the HAT lock to free our old tsb_info. 
10233 */ 10234 sfmmu_hat_exit(hatlockp); 10235 10236 if ((flags & TSB_GROW) == TSB_GROW) { 10237 SFMMU_STAT(sf_tsb_grow); 10238 } else if ((flags & TSB_SHRINK) == TSB_SHRINK) { 10239 SFMMU_STAT(sf_tsb_shrink); 10240 } 10241 10242 sfmmu_tsbinfo_free(old_tsbinfo); 10243 10244 (void) sfmmu_hat_enter(sfmmup); 10245 return (TSB_SUCCESS); 10246 } 10247 10248 /* 10249 * This function will re-program hat pgsz array, and invalidate the 10250 * process' context, forcing the process to switch to another 10251 * context on the next TLB miss, and therefore start using the 10252 * TLB that is reprogrammed for the new page sizes. 10253 */ 10254 void 10255 sfmmu_reprog_pgsz_arr(sfmmu_t *sfmmup, uint8_t *tmp_pgsz) 10256 { 10257 int i; 10258 hatlock_t *hatlockp = NULL; 10259 10260 hatlockp = sfmmu_hat_enter(sfmmup); 10261 /* USIII+-IV+ optimization, requires hat lock */ 10262 if (tmp_pgsz) { 10263 for (i = 0; i < mmu_page_sizes; i++) 10264 sfmmup->sfmmu_pgsz[i] = tmp_pgsz[i]; 10265 } 10266 SFMMU_STAT(sf_tlb_reprog_pgsz); 10267 10268 sfmmu_invalidate_ctx(sfmmup); 10269 10270 sfmmu_hat_exit(hatlockp); 10271 } 10272 10273 /* 10274 * The scd_rttecnt field in the SCD must be updated to take account of the 10275 * regions which it contains. 
10276 */ 10277 static void 10278 sfmmu_set_scd_rttecnt(sf_srd_t *srdp, sf_scd_t *scdp) 10279 { 10280 uint_t rid; 10281 uint_t i, j; 10282 ulong_t w; 10283 sf_region_t *rgnp; 10284 10285 ASSERT(srdp != NULL); 10286 10287 for (i = 0; i < SFMMU_HMERGNMAP_WORDS; i++) { 10288 if ((w = scdp->scd_region_map.bitmap[i]) == 0) { 10289 continue; 10290 } 10291 10292 j = 0; 10293 while (w) { 10294 if (!(w & 0x1)) { 10295 j++; 10296 w >>= 1; 10297 continue; 10298 } 10299 rid = (i << BT_ULSHIFT) | j; 10300 j++; 10301 w >>= 1; 10302 10303 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 10304 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 10305 rgnp = srdp->srd_hmergnp[rid]; 10306 ASSERT(rgnp->rgn_refcnt > 0); 10307 ASSERT(rgnp->rgn_id == rid); 10308 10309 scdp->scd_rttecnt[rgnp->rgn_pgszc] += 10310 rgnp->rgn_size >> TTE_PAGE_SHIFT(rgnp->rgn_pgszc); 10311 10312 /* 10313 * Maintain the tsb0 inflation cnt for the regions 10314 * in the SCD. 10315 */ 10316 if (rgnp->rgn_pgszc >= TTE4M) { 10317 scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt += 10318 rgnp->rgn_size >> 10319 (TTE_PAGE_SHIFT(TTE8K) + 2); 10320 } 10321 } 10322 } 10323 } 10324 10325 /* 10326 * This function assumes that there are either four or six supported page 10327 * sizes and at most two programmable TLBs, so we need to decide which 10328 * page sizes are most important and then tell the MMU layer so it 10329 * can adjust the TLB page sizes accordingly (if supported). 10330 * 10331 * If these assumptions change, this function will need to be 10332 * updated to support whatever the new limits are. 10333 * 10334 * The growing flag is nonzero if we are growing the address space, 10335 * and zero if it is shrinking. This allows us to decide whether 10336 * to grow or shrink our TSB, depending upon available memory 10337 * conditions. 
10338 */ 10339 static void 10340 sfmmu_check_page_sizes(sfmmu_t *sfmmup, int growing) 10341 { 10342 uint64_t ttecnt[MMU_PAGE_SIZES]; 10343 uint64_t tte8k_cnt, tte4m_cnt; 10344 uint8_t i; 10345 int sectsb_thresh; 10346 10347 /* 10348 * Kernel threads, processes with small address spaces not using 10349 * large pages, and dummy ISM HATs need not apply. 10350 */ 10351 if (sfmmup == ksfmmup || sfmmup->sfmmu_ismhat != NULL) 10352 return; 10353 10354 if (!SFMMU_LGPGS_INUSE(sfmmup) && 10355 sfmmup->sfmmu_ttecnt[TTE8K] <= tsb_rss_factor) 10356 return; 10357 10358 for (i = 0; i < mmu_page_sizes; i++) { 10359 ttecnt[i] = sfmmup->sfmmu_ttecnt[i] + 10360 sfmmup->sfmmu_ismttecnt[i]; 10361 } 10362 10363 /* Check pagesizes in use, and possibly reprogram DTLB. */ 10364 if (&mmu_check_page_sizes) 10365 mmu_check_page_sizes(sfmmup, ttecnt); 10366 10367 /* 10368 * Calculate the number of 8k ttes to represent the span of these 10369 * pages. 10370 */ 10371 tte8k_cnt = ttecnt[TTE8K] + 10372 (ttecnt[TTE64K] << (MMU_PAGESHIFT64K - MMU_PAGESHIFT)) + 10373 (ttecnt[TTE512K] << (MMU_PAGESHIFT512K - MMU_PAGESHIFT)); 10374 if (mmu_page_sizes == max_mmu_page_sizes) { 10375 tte4m_cnt = ttecnt[TTE4M] + 10376 (ttecnt[TTE32M] << (MMU_PAGESHIFT32M - MMU_PAGESHIFT4M)) + 10377 (ttecnt[TTE256M] << (MMU_PAGESHIFT256M - MMU_PAGESHIFT4M)); 10378 } else { 10379 tte4m_cnt = ttecnt[TTE4M]; 10380 } 10381 10382 /* 10383 * Inflate tte8k_cnt to allow for region large page allocation failure. 10384 */ 10385 tte8k_cnt += sfmmup->sfmmu_tsb0_4minflcnt; 10386 10387 /* 10388 * Inflate TSB sizes by a factor of 2 if this process 10389 * uses 4M text pages to minimize extra conflict misses 10390 * in the first TSB since without counting text pages 10391 * 8K TSB may become too small. 10392 * 10393 * Also double the size of the second TSB to minimize 10394 * extra conflict misses due to competition between 4M text pages 10395 * and data pages. 
10396 * 10397 * We need to adjust the second TSB allocation threshold by the 10398 * inflation factor, since there is no point in creating a second 10399 * TSB when we know all the mappings can fit in the I/D TLBs. 10400 */ 10401 sectsb_thresh = tsb_sectsb_threshold; 10402 if (sfmmup->sfmmu_flags & HAT_4MTEXT_FLAG) { 10403 tte8k_cnt <<= 1; 10404 tte4m_cnt <<= 1; 10405 sectsb_thresh <<= 1; 10406 } 10407 10408 /* 10409 * Check to see if our TSB is the right size; we may need to 10410 * grow or shrink it. If the process is small, our work is 10411 * finished at this point. 10412 */ 10413 if (tte8k_cnt <= tsb_rss_factor && tte4m_cnt <= sectsb_thresh) { 10414 return; 10415 } 10416 sfmmu_size_tsb(sfmmup, growing, tte8k_cnt, tte4m_cnt, sectsb_thresh); 10417 } 10418 10419 static void 10420 sfmmu_size_tsb(sfmmu_t *sfmmup, int growing, uint64_t tte8k_cnt, 10421 uint64_t tte4m_cnt, int sectsb_thresh) 10422 { 10423 int tsb_bits; 10424 uint_t tsb_szc; 10425 struct tsb_info *tsbinfop; 10426 hatlock_t *hatlockp = NULL; 10427 10428 hatlockp = sfmmu_hat_enter(sfmmup); 10429 ASSERT(hatlockp != NULL); 10430 tsbinfop = sfmmup->sfmmu_tsb; 10431 ASSERT(tsbinfop != NULL); 10432 10433 /* 10434 * If we're growing, select the size based on RSS. If we're 10435 * shrinking, leave some room so we don't have to turn around and 10436 * grow again immediately. 
10437 */ 10438 if (growing) 10439 tsb_szc = SELECT_TSB_SIZECODE(tte8k_cnt); 10440 else 10441 tsb_szc = SELECT_TSB_SIZECODE(tte8k_cnt << 1); 10442 10443 if (!growing && (tsb_szc < tsbinfop->tsb_szc) && 10444 (tsb_szc >= default_tsb_size) && TSB_OK_SHRINK()) { 10445 (void) sfmmu_replace_tsb(sfmmup, tsbinfop, tsb_szc, 10446 hatlockp, TSB_SHRINK); 10447 } else if (growing && tsb_szc > tsbinfop->tsb_szc && TSB_OK_GROW()) { 10448 (void) sfmmu_replace_tsb(sfmmup, tsbinfop, tsb_szc, 10449 hatlockp, TSB_GROW); 10450 } 10451 tsbinfop = sfmmup->sfmmu_tsb; 10452 10453 /* 10454 * With the TLB and first TSB out of the way, we need to see if 10455 * we need a second TSB for 4M pages. If we managed to reprogram 10456 * the TLB page sizes above, the process will start using this new 10457 * TSB right away; otherwise, it will start using it on the next 10458 * context switch. Either way, it's no big deal so there's no 10459 * synchronization with the trap handlers here unless we grow the 10460 * TSB (in which case it's required to prevent using the old one 10461 * after it's freed). Note: second tsb is required for 32M/256M 10462 * page sizes. 10463 */ 10464 if (tte4m_cnt > sectsb_thresh) { 10465 /* 10466 * If we're growing, select the size based on RSS. If we're 10467 * shrinking, leave some room so we don't have to turn 10468 * around and grow again immediately. 10469 */ 10470 if (growing) 10471 tsb_szc = SELECT_TSB_SIZECODE(tte4m_cnt); 10472 else 10473 tsb_szc = SELECT_TSB_SIZECODE(tte4m_cnt << 1); 10474 if (tsbinfop->tsb_next == NULL) { 10475 struct tsb_info *newtsb; 10476 int allocflags = SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)? 10477 0 : TSB_ALLOC; 10478 10479 sfmmu_hat_exit(hatlockp); 10480 10481 /* 10482 * Try to allocate a TSB for 4[32|256]M pages. If we 10483 * can't get the size we want, retry w/a minimum sized 10484 * TSB. If that still didn't work, give up; we can 10485 * still run without one. 10486 */ 10487 tsb_bits = (mmu_page_sizes == max_mmu_page_sizes)? 
10488 TSB4M|TSB32M|TSB256M:TSB4M; 10489 if ((sfmmu_tsbinfo_alloc(&newtsb, tsb_szc, tsb_bits, 10490 allocflags, sfmmup)) && 10491 (tsb_szc <= TSB_4M_SZCODE || 10492 sfmmu_tsbinfo_alloc(&newtsb, TSB_4M_SZCODE, 10493 tsb_bits, allocflags, sfmmup)) && 10494 sfmmu_tsbinfo_alloc(&newtsb, TSB_MIN_SZCODE, 10495 tsb_bits, allocflags, sfmmup)) { 10496 return; 10497 } 10498 10499 hatlockp = sfmmu_hat_enter(sfmmup); 10500 10501 sfmmu_invalidate_ctx(sfmmup); 10502 10503 if (sfmmup->sfmmu_tsb->tsb_next == NULL) { 10504 sfmmup->sfmmu_tsb->tsb_next = newtsb; 10505 SFMMU_STAT(sf_tsb_sectsb_create); 10506 sfmmu_hat_exit(hatlockp); 10507 return; 10508 } else { 10509 /* 10510 * It's annoying, but possible for us 10511 * to get here.. we dropped the HAT lock 10512 * because of locking order in the kmem 10513 * allocator, and while we were off getting 10514 * our memory, some other thread decided to 10515 * do us a favor and won the race to get a 10516 * second TSB for this process. Sigh. 10517 */ 10518 sfmmu_hat_exit(hatlockp); 10519 sfmmu_tsbinfo_free(newtsb); 10520 return; 10521 } 10522 } 10523 10524 /* 10525 * We have a second TSB, see if it's big enough. 10526 */ 10527 tsbinfop = tsbinfop->tsb_next; 10528 10529 /* 10530 * Check to see if our second TSB is the right size; 10531 * we may need to grow or shrink it. 10532 * To prevent thrashing (e.g. growing the TSB on a 10533 * subsequent map operation), only try to shrink if 10534 * the TSB reach exceeds twice the virtual address 10535 * space size. 
10536 */ 10537 if (!growing && (tsb_szc < tsbinfop->tsb_szc) && 10538 (tsb_szc >= default_tsb_size) && TSB_OK_SHRINK()) { 10539 (void) sfmmu_replace_tsb(sfmmup, tsbinfop, 10540 tsb_szc, hatlockp, TSB_SHRINK); 10541 } else if (growing && tsb_szc > tsbinfop->tsb_szc && 10542 TSB_OK_GROW()) { 10543 (void) sfmmu_replace_tsb(sfmmup, tsbinfop, 10544 tsb_szc, hatlockp, TSB_GROW); 10545 } 10546 } 10547 10548 sfmmu_hat_exit(hatlockp); 10549 } 10550 10551 /* 10552 * Free up a sfmmu 10553 * Since the sfmmu is currently embedded in the hat struct we simply zero 10554 * out our fields and free up the ism map blk list if any. 10555 */ 10556 static void 10557 sfmmu_free_sfmmu(sfmmu_t *sfmmup) 10558 { 10559 ism_blk_t *blkp, *nx_blkp; 10560 #ifdef DEBUG 10561 ism_map_t *map; 10562 int i; 10563 #endif 10564 10565 ASSERT(sfmmup->sfmmu_ttecnt[TTE8K] == 0); 10566 ASSERT(sfmmup->sfmmu_ttecnt[TTE64K] == 0); 10567 ASSERT(sfmmup->sfmmu_ttecnt[TTE512K] == 0); 10568 ASSERT(sfmmup->sfmmu_ttecnt[TTE4M] == 0); 10569 ASSERT(sfmmup->sfmmu_ttecnt[TTE32M] == 0); 10570 ASSERT(sfmmup->sfmmu_ttecnt[TTE256M] == 0); 10571 ASSERT(SF_RGNMAP_ISNULL(sfmmup)); 10572 10573 sfmmup->sfmmu_free = 0; 10574 sfmmup->sfmmu_ismhat = 0; 10575 10576 blkp = sfmmup->sfmmu_iblk; 10577 sfmmup->sfmmu_iblk = NULL; 10578 10579 while (blkp) { 10580 #ifdef DEBUG 10581 map = blkp->iblk_maps; 10582 for (i = 0; i < ISM_MAP_SLOTS; i++) { 10583 ASSERT(map[i].imap_seg == 0); 10584 ASSERT(map[i].imap_ismhat == NULL); 10585 ASSERT(map[i].imap_ment == NULL); 10586 } 10587 #endif 10588 nx_blkp = blkp->iblk_next; 10589 blkp->iblk_next = NULL; 10590 blkp->iblk_nextpa = (uint64_t)-1; 10591 kmem_cache_free(ism_blk_cache, blkp); 10592 blkp = nx_blkp; 10593 } 10594 } 10595 10596 /* 10597 * Locking primitves accessed by HATLOCK macros 10598 */ 10599 10600 #define SFMMU_SPL_MTX (0x0) 10601 #define SFMMU_ML_MTX (0x1) 10602 10603 #define SFMMU_MLSPL_MTX(type, pg) (((type) == SFMMU_SPL_MTX) ? 
\ 10604 SPL_HASH(pg) : MLIST_HASH(pg)) 10605 10606 kmutex_t * 10607 sfmmu_page_enter(struct page *pp) 10608 { 10609 return (sfmmu_mlspl_enter(pp, SFMMU_SPL_MTX)); 10610 } 10611 10612 void 10613 sfmmu_page_exit(kmutex_t *spl) 10614 { 10615 mutex_exit(spl); 10616 } 10617 10618 int 10619 sfmmu_page_spl_held(struct page *pp) 10620 { 10621 return (sfmmu_mlspl_held(pp, SFMMU_SPL_MTX)); 10622 } 10623 10624 kmutex_t * 10625 sfmmu_mlist_enter(struct page *pp) 10626 { 10627 return (sfmmu_mlspl_enter(pp, SFMMU_ML_MTX)); 10628 } 10629 10630 void 10631 sfmmu_mlist_exit(kmutex_t *mml) 10632 { 10633 mutex_exit(mml); 10634 } 10635 10636 int 10637 sfmmu_mlist_held(struct page *pp) 10638 { 10639 10640 return (sfmmu_mlspl_held(pp, SFMMU_ML_MTX)); 10641 } 10642 10643 /* 10644 * Common code for sfmmu_mlist_enter() and sfmmu_page_enter(). For 10645 * sfmmu_mlist_enter() case mml_table lock array is used and for 10646 * sfmmu_page_enter() sfmmu_page_lock lock array is used. 10647 * 10648 * The lock is taken on a root page so that it protects an operation on all 10649 * constituent pages of a large page pp belongs to. 10650 * 10651 * The routine takes a lock from the appropriate array. The lock is determined 10652 * by hashing the root page. After taking the lock this routine checks if the 10653 * root page has the same size code that was used to determine the root (i.e 10654 * that root hasn't changed). If root page has the expected p_szc field we 10655 * have the right lock and it's returned to the caller. If root's p_szc 10656 * decreased we release the lock and retry from the beginning. This case can 10657 * happen due to hat_page_demote() decreasing p_szc between our load of p_szc 10658 * value and taking the lock. The number of retries due to p_szc decrease is 10659 * limited by the maximum p_szc value. If p_szc is 0 we return the lock 10660 * determined by hashing pp itself. 
10661 * 10662 * If our caller doesn't hold a SE_SHARED or SE_EXCL lock on pp it's also 10663 * possible that p_szc can increase. To increase p_szc a thread has to lock 10664 * all constituent pages EXCL and do hat_pageunload() on all of them. All the 10665 * callers that don't hold a page locked recheck if hmeblk through which pp 10666 * was found still maps this pp. If it doesn't map it anymore returned lock 10667 * is immediately dropped. Therefore if sfmmu_mlspl_enter() hits the case of 10668 * p_szc increase after taking the lock it returns this lock without further 10669 * retries because in this case the caller doesn't care about which lock was 10670 * taken. The caller will drop it right away. 10671 * 10672 * After the routine returns it's guaranteed that hat_page_demote() can't 10673 * change p_szc field of any of constituent pages of a large page pp belongs 10674 * to as long as pp was either locked at least SHARED prior to this call or 10675 * the caller finds that hment that pointed to this pp still references this 10676 * pp (this also assumes that the caller holds hme hash bucket lock so that 10677 * the same pp can't be remapped into the same hmeblk after it was unmapped by 10678 * hat_pageunload()). 10679 */ 10680 static kmutex_t * 10681 sfmmu_mlspl_enter(struct page *pp, int type) 10682 { 10683 kmutex_t *mtx; 10684 uint_t prev_rszc = UINT_MAX; 10685 page_t *rootpp; 10686 uint_t szc; 10687 uint_t rszc; 10688 uint_t pszc = pp->p_szc; 10689 10690 ASSERT(pp != NULL); 10691 10692 again: 10693 if (pszc == 0) { 10694 mtx = SFMMU_MLSPL_MTX(type, pp); 10695 mutex_enter(mtx); 10696 return (mtx); 10697 } 10698 10699 /* The lock lives in the root page */ 10700 rootpp = PP_GROUPLEADER(pp, pszc); 10701 mtx = SFMMU_MLSPL_MTX(type, rootpp); 10702 mutex_enter(mtx); 10703 10704 /* 10705 * Return mml in the following 3 cases: 10706 * 10707 * 1) If pp itself is root since if its p_szc decreased before we took 10708 * the lock pp is still the root of smaller szc page. 
And if its p_szc 10709 * increased it doesn't matter what lock we return (see comment in 10710 * front of this routine). 10711 * 10712 * 2) If pp's not root but rootpp is the root of a rootpp->p_szc size 10713 * large page we have the right lock since any previous potential 10714 * hat_page_demote() is done demoting from greater than current root's 10715 * p_szc because hat_page_demote() changes root's p_szc last. No 10716 * further hat_page_demote() can start or be in progress since it 10717 * would need the same lock we currently hold. 10718 * 10719 * 3) If rootpp's p_szc increased since previous iteration it doesn't 10720 * matter what lock we return (see comment in front of this routine). 10721 */ 10722 if (pp == rootpp || (rszc = rootpp->p_szc) == pszc || 10723 rszc >= prev_rszc) { 10724 return (mtx); 10725 } 10726 10727 /* 10728 * hat_page_demote() could have decreased root's p_szc. 10729 * In this case pp's p_szc must also be smaller than pszc. 10730 * Retry. 10731 */ 10732 if (rszc < pszc) { 10733 szc = pp->p_szc; 10734 if (szc < pszc) { 10735 mutex_exit(mtx); 10736 pszc = szc; 10737 goto again; 10738 } 10739 /* 10740 * pp's p_szc increased after it was decreased. 10741 * page cannot be mapped. Return current lock. The caller 10742 * will drop it right away. 10743 */ 10744 return (mtx); 10745 } 10746 10747 /* 10748 * root's p_szc is greater than pp's p_szc. 10749 * hat_page_demote() is not done with all pages 10750 * yet. Wait for it to complete. 
10751 */ 10752 mutex_exit(mtx); 10753 rootpp = PP_GROUPLEADER(rootpp, rszc); 10754 mtx = SFMMU_MLSPL_MTX(type, rootpp); 10755 mutex_enter(mtx); 10756 mutex_exit(mtx); 10757 prev_rszc = rszc; 10758 goto again; 10759 } 10760 10761 static int 10762 sfmmu_mlspl_held(struct page *pp, int type) 10763 { 10764 kmutex_t *mtx; 10765 10766 ASSERT(pp != NULL); 10767 /* The lock lives in the root page */ 10768 pp = PP_PAGEROOT(pp); 10769 ASSERT(pp != NULL); 10770 10771 mtx = SFMMU_MLSPL_MTX(type, pp); 10772 return (MUTEX_HELD(mtx)); 10773 } 10774 10775 static uint_t 10776 sfmmu_get_free_hblk(struct hme_blk **hmeblkpp, uint_t critical) 10777 { 10778 struct hme_blk *hblkp; 10779 10780 10781 if (freehblkp != NULL) { 10782 mutex_enter(&freehblkp_lock); 10783 if (freehblkp != NULL) { 10784 /* 10785 * If the current thread is owning hblk_reserve OR 10786 * critical request from sfmmu_hblk_steal() 10787 * let it succeed even if freehblkcnt is really low. 10788 */ 10789 if (freehblkcnt <= HBLK_RESERVE_MIN && !critical) { 10790 SFMMU_STAT(sf_get_free_throttle); 10791 mutex_exit(&freehblkp_lock); 10792 return (0); 10793 } 10794 freehblkcnt--; 10795 *hmeblkpp = freehblkp; 10796 hblkp = *hmeblkpp; 10797 freehblkp = hblkp->hblk_next; 10798 mutex_exit(&freehblkp_lock); 10799 hblkp->hblk_next = NULL; 10800 SFMMU_STAT(sf_get_free_success); 10801 10802 ASSERT(hblkp->hblk_hmecnt == 0); 10803 ASSERT(hblkp->hblk_vcnt == 0); 10804 ASSERT(hblkp->hblk_nextpa == va_to_pa((caddr_t)hblkp)); 10805 10806 return (1); 10807 } 10808 mutex_exit(&freehblkp_lock); 10809 } 10810 10811 /* Check cpu hblk pending queues */ 10812 if ((*hmeblkpp = sfmmu_check_pending_hblks(TTE8K)) != NULL) { 10813 hblkp = *hmeblkpp; 10814 hblkp->hblk_next = NULL; 10815 hblkp->hblk_nextpa = va_to_pa((caddr_t)hblkp); 10816 10817 ASSERT(hblkp->hblk_hmecnt == 0); 10818 ASSERT(hblkp->hblk_vcnt == 0); 10819 10820 return (1); 10821 } 10822 10823 SFMMU_STAT(sf_get_free_fail); 10824 return (0); 10825 } 10826 10827 static uint_t 10828 
sfmmu_put_free_hblk(struct hme_blk *hmeblkp, uint_t critical) 10829 { 10830 struct hme_blk *hblkp; 10831 10832 ASSERT(hmeblkp->hblk_hmecnt == 0); 10833 ASSERT(hmeblkp->hblk_vcnt == 0); 10834 ASSERT(hmeblkp->hblk_nextpa == va_to_pa((caddr_t)hmeblkp)); 10835 10836 /* 10837 * If the current thread is mapping into kernel space, 10838 * let it succede even if freehblkcnt is max 10839 * so that it will avoid freeing it to kmem. 10840 * This will prevent stack overflow due to 10841 * possible recursion since kmem_cache_free() 10842 * might require creation of a slab which 10843 * in turn needs an hmeblk to map that slab; 10844 * let's break this vicious chain at the first 10845 * opportunity. 10846 */ 10847 if (freehblkcnt < HBLK_RESERVE_CNT || critical) { 10848 mutex_enter(&freehblkp_lock); 10849 if (freehblkcnt < HBLK_RESERVE_CNT || critical) { 10850 SFMMU_STAT(sf_put_free_success); 10851 freehblkcnt++; 10852 hmeblkp->hblk_next = freehblkp; 10853 freehblkp = hmeblkp; 10854 mutex_exit(&freehblkp_lock); 10855 return (1); 10856 } 10857 mutex_exit(&freehblkp_lock); 10858 } 10859 10860 /* 10861 * Bring down freehblkcnt to HBLK_RESERVE_CNT. We are here 10862 * only if freehblkcnt is at least HBLK_RESERVE_CNT *and* 10863 * we are not in the process of mapping into kernel space. 
10864 */ 10865 ASSERT(!critical); 10866 while (freehblkcnt > HBLK_RESERVE_CNT) { 10867 mutex_enter(&freehblkp_lock); 10868 if (freehblkcnt > HBLK_RESERVE_CNT) { 10869 freehblkcnt--; 10870 hblkp = freehblkp; 10871 freehblkp = hblkp->hblk_next; 10872 mutex_exit(&freehblkp_lock); 10873 ASSERT(get_hblk_cache(hblkp) == sfmmu8_cache); 10874 kmem_cache_free(sfmmu8_cache, hblkp); 10875 continue; 10876 } 10877 mutex_exit(&freehblkp_lock); 10878 } 10879 SFMMU_STAT(sf_put_free_fail); 10880 return (0); 10881 } 10882 10883 static void 10884 sfmmu_hblk_swap(struct hme_blk *new) 10885 { 10886 struct hme_blk *old, *hblkp, *prev; 10887 uint64_t newpa; 10888 caddr_t base, vaddr, endaddr; 10889 struct hmehash_bucket *hmebp; 10890 struct sf_hment *osfhme, *nsfhme; 10891 page_t *pp; 10892 kmutex_t *pml; 10893 tte_t tte; 10894 struct hme_blk *list = NULL; 10895 10896 #ifdef DEBUG 10897 hmeblk_tag hblktag; 10898 struct hme_blk *found; 10899 #endif 10900 old = HBLK_RESERVE; 10901 ASSERT(!old->hblk_shared); 10902 10903 /* 10904 * save pa before bcopy clobbers it 10905 */ 10906 newpa = new->hblk_nextpa; 10907 10908 base = (caddr_t)get_hblk_base(old); 10909 endaddr = base + get_hblk_span(old); 10910 10911 /* 10912 * acquire hash bucket lock. 10913 */ 10914 hmebp = sfmmu_tteload_acquire_hashbucket(ksfmmup, base, TTE8K, 10915 SFMMU_INVALID_SHMERID); 10916 10917 /* 10918 * copy contents from old to new 10919 */ 10920 bcopy((void *)old, (void *)new, HME8BLK_SZ); 10921 10922 /* 10923 * add new to hash chain 10924 */ 10925 sfmmu_hblk_hash_add(hmebp, new, newpa); 10926 10927 /* 10928 * search hash chain for hblk_reserve; this needs to be performed 10929 * after adding new, otherwise prev won't correspond to the hblk which 10930 * is prior to old in hash chain when we call sfmmu_hblk_hash_rm to 10931 * remove old later. 
10932 */ 10933 for (prev = NULL, 10934 hblkp = hmebp->hmeblkp; hblkp != NULL && hblkp != old; 10935 prev = hblkp, hblkp = hblkp->hblk_next) 10936 ; 10937 10938 if (hblkp != old) 10939 panic("sfmmu_hblk_swap: hblk_reserve not found"); 10940 10941 /* 10942 * p_mapping list is still pointing to hments in hblk_reserve; 10943 * fix up p_mapping list so that they point to hments in new. 10944 * 10945 * Since all these mappings are created by hblk_reserve_thread 10946 * on the way and it's using at least one of the buffers from each of 10947 * the newly minted slabs, there is no danger of any of these 10948 * mappings getting unloaded by another thread. 10949 * 10950 * tsbmiss could only modify ref/mod bits of hments in old/new. 10951 * Since all of these hments hold mappings established by segkmem 10952 * and mappings in segkmem are setup with HAT_NOSYNC, ref/mod bits 10953 * have no meaning for the mappings in hblk_reserve. hments in 10954 * old and new are identical except for ref/mod bits. 
10955 */ 10956 for (vaddr = base; vaddr < endaddr; vaddr += TTEBYTES(TTE8K)) { 10957 10958 HBLKTOHME(osfhme, old, vaddr); 10959 sfmmu_copytte(&osfhme->hme_tte, &tte); 10960 10961 if (TTE_IS_VALID(&tte)) { 10962 if ((pp = osfhme->hme_page) == NULL) 10963 panic("sfmmu_hblk_swap: page not mapped"); 10964 10965 pml = sfmmu_mlist_enter(pp); 10966 10967 if (pp != osfhme->hme_page) 10968 panic("sfmmu_hblk_swap: mapping changed"); 10969 10970 HBLKTOHME(nsfhme, new, vaddr); 10971 10972 HME_ADD(nsfhme, pp); 10973 HME_SUB(osfhme, pp); 10974 10975 sfmmu_mlist_exit(pml); 10976 } 10977 } 10978 10979 /* 10980 * remove old from hash chain 10981 */ 10982 sfmmu_hblk_hash_rm(hmebp, old, prev, &list, 1); 10983 10984 #ifdef DEBUG 10985 10986 hblktag.htag_id = ksfmmup; 10987 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 10988 hblktag.htag_bspage = HME_HASH_BSPAGE(base, HME_HASH_SHIFT(TTE8K)); 10989 hblktag.htag_rehash = HME_HASH_REHASH(TTE8K); 10990 HME_HASH_FAST_SEARCH(hmebp, hblktag, found); 10991 10992 if (found != new) 10993 panic("sfmmu_hblk_swap: new hblk not found"); 10994 #endif 10995 10996 SFMMU_HASH_UNLOCK(hmebp); 10997 10998 /* 10999 * Reset hblk_reserve 11000 */ 11001 bzero((void *)old, HME8BLK_SZ); 11002 old->hblk_nextpa = va_to_pa((caddr_t)old); 11003 } 11004 11005 /* 11006 * Grab the mlist mutex for both pages passed in. 11007 * 11008 * low and high will be returned as pointers to the mutexes for these pages. 11009 * low refers to the mutex residing in the lower bin of the mlist hash, while 11010 * high refers to the mutex residing in the higher bin of the mlist hash. This 11011 * is due to the locking order restrictions on the same thread grabbing 11012 * multiple mlist mutexes. The low lock must be acquired before the high lock. 
11013 * 11014 * If both pages hash to the same mutex, only grab that single mutex, and 11015 * high will be returned as NULL 11016 * If the pages hash to different bins in the hash, grab the lower addressed 11017 * lock first and then the higher addressed lock in order to follow the locking 11018 * rules involved with the same thread grabbing multiple mlist mutexes. 11019 * low and high will both have non-NULL values. 11020 */ 11021 static void 11022 sfmmu_mlist_reloc_enter(struct page *targ, struct page *repl, 11023 kmutex_t **low, kmutex_t **high) 11024 { 11025 kmutex_t *mml_targ, *mml_repl; 11026 11027 /* 11028 * no need to do the dance around szc as in sfmmu_mlist_enter() 11029 * because this routine is only called by hat_page_relocate() and all 11030 * targ and repl pages are already locked EXCL so szc can't change. 11031 */ 11032 11033 mml_targ = MLIST_HASH(PP_PAGEROOT(targ)); 11034 mml_repl = MLIST_HASH(PP_PAGEROOT(repl)); 11035 11036 if (mml_targ == mml_repl) { 11037 *low = mml_targ; 11038 *high = NULL; 11039 } else { 11040 if (mml_targ < mml_repl) { 11041 *low = mml_targ; 11042 *high = mml_repl; 11043 } else { 11044 *low = mml_repl; 11045 *high = mml_targ; 11046 } 11047 } 11048 11049 mutex_enter(*low); 11050 if (*high) 11051 mutex_enter(*high); 11052 } 11053 11054 static void 11055 sfmmu_mlist_reloc_exit(kmutex_t *low, kmutex_t *high) 11056 { 11057 if (high) 11058 mutex_exit(high); 11059 mutex_exit(low); 11060 } 11061 11062 static hatlock_t * 11063 sfmmu_hat_enter(sfmmu_t *sfmmup) 11064 { 11065 hatlock_t *hatlockp; 11066 11067 if (sfmmup != ksfmmup) { 11068 hatlockp = TSB_HASH(sfmmup); 11069 mutex_enter(HATLOCK_MUTEXP(hatlockp)); 11070 return (hatlockp); 11071 } 11072 return (NULL); 11073 } 11074 11075 static hatlock_t * 11076 sfmmu_hat_tryenter(sfmmu_t *sfmmup) 11077 { 11078 hatlock_t *hatlockp; 11079 11080 if (sfmmup != ksfmmup) { 11081 hatlockp = TSB_HASH(sfmmup); 11082 if (mutex_tryenter(HATLOCK_MUTEXP(hatlockp)) == 0) 11083 return (NULL); 11084 return 
(hatlockp); 11085 } 11086 return (NULL); 11087 } 11088 11089 static void 11090 sfmmu_hat_exit(hatlock_t *hatlockp) 11091 { 11092 if (hatlockp != NULL) 11093 mutex_exit(HATLOCK_MUTEXP(hatlockp)); 11094 } 11095 11096 static void 11097 sfmmu_hat_lock_all(void) 11098 { 11099 int i; 11100 for (i = 0; i < SFMMU_NUM_LOCK; i++) 11101 mutex_enter(HATLOCK_MUTEXP(&hat_lock[i])); 11102 } 11103 11104 static void 11105 sfmmu_hat_unlock_all(void) 11106 { 11107 int i; 11108 for (i = SFMMU_NUM_LOCK - 1; i >= 0; i--) 11109 mutex_exit(HATLOCK_MUTEXP(&hat_lock[i])); 11110 } 11111 11112 int 11113 sfmmu_hat_lock_held(sfmmu_t *sfmmup) 11114 { 11115 ASSERT(sfmmup != ksfmmup); 11116 return (MUTEX_HELD(HATLOCK_MUTEXP(TSB_HASH(sfmmup)))); 11117 } 11118 11119 /* 11120 * Locking primitives to provide consistency between ISM unmap 11121 * and other operations. Since ISM unmap can take a long time, we 11122 * use HAT_ISMBUSY flag (protected by the hatlock) to avoid creating 11123 * contention on the hatlock buckets while ISM segments are being 11124 * unmapped. The tradeoff is that the flags don't prevent priority 11125 * inversion from occurring, so we must request kernel priority in 11126 * case we have to sleep to keep from getting buried while holding 11127 * the HAT_ISMBUSY flag set, which in turn could block other kernel 11128 * threads from running (for example, in sfmmu_uvatopfn()). 
11129 */ 11130 static void 11131 sfmmu_ismhat_enter(sfmmu_t *sfmmup, int hatlock_held) 11132 { 11133 hatlock_t *hatlockp; 11134 11135 THREAD_KPRI_REQUEST(); 11136 if (!hatlock_held) 11137 hatlockp = sfmmu_hat_enter(sfmmup); 11138 while (SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)) 11139 cv_wait(&sfmmup->sfmmu_tsb_cv, HATLOCK_MUTEXP(hatlockp)); 11140 SFMMU_FLAGS_SET(sfmmup, HAT_ISMBUSY); 11141 if (!hatlock_held) 11142 sfmmu_hat_exit(hatlockp); 11143 } 11144 11145 static void 11146 sfmmu_ismhat_exit(sfmmu_t *sfmmup, int hatlock_held) 11147 { 11148 hatlock_t *hatlockp; 11149 11150 if (!hatlock_held) 11151 hatlockp = sfmmu_hat_enter(sfmmup); 11152 ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)); 11153 SFMMU_FLAGS_CLEAR(sfmmup, HAT_ISMBUSY); 11154 cv_broadcast(&sfmmup->sfmmu_tsb_cv); 11155 if (!hatlock_held) 11156 sfmmu_hat_exit(hatlockp); 11157 THREAD_KPRI_RELEASE(); 11158 } 11159 11160 /* 11161 * 11162 * Algorithm: 11163 * 11164 * (1) if segkmem is not ready, allocate hblk from an array of pre-alloc'ed 11165 * hblks. 11166 * 11167 * (2) if we are allocating an hblk for mapping a slab in sfmmu_cache, 11168 * 11169 * (a) try to return an hblk from reserve pool of free hblks; 11170 * (b) if the reserve pool is empty, acquire hblk_reserve_lock 11171 * and return hblk_reserve. 11172 * 11173 * (3) call kmem_cache_alloc() to allocate hblk; 11174 * 11175 * (a) if hblk_reserve_lock is held by the current thread, 11176 * atomically replace hblk_reserve by the hblk that is 11177 * returned by kmem_cache_alloc; release hblk_reserve_lock 11178 * and call kmem_cache_alloc() again. 11179 * (b) if reserve pool is not full, add the hblk that is 11180 * returned by kmem_cache_alloc to reserve pool and 11181 * call kmem_cache_alloc again. 
11182 * 11183 */ 11184 static struct hme_blk * 11185 sfmmu_hblk_alloc(sfmmu_t *sfmmup, caddr_t vaddr, 11186 struct hmehash_bucket *hmebp, uint_t size, hmeblk_tag hblktag, 11187 uint_t flags, uint_t rid) 11188 { 11189 struct hme_blk *hmeblkp = NULL; 11190 struct hme_blk *newhblkp; 11191 struct hme_blk *shw_hblkp = NULL; 11192 struct kmem_cache *sfmmu_cache = NULL; 11193 uint64_t hblkpa; 11194 ulong_t index; 11195 uint_t owner; /* set to 1 if using hblk_reserve */ 11196 uint_t forcefree; 11197 int sleep; 11198 sf_srd_t *srdp; 11199 sf_region_t *rgnp; 11200 11201 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 11202 ASSERT(hblktag.htag_rid == rid); 11203 SFMMU_VALIDATE_HMERID(sfmmup, rid, vaddr, TTEBYTES(size)); 11204 ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || 11205 IS_P2ALIGNED(vaddr, TTEBYTES(size))); 11206 11207 /* 11208 * If segkmem is not created yet, allocate from static hmeblks 11209 * created at the end of startup_modules(). See the block comment 11210 * in startup_modules() describing how we estimate the number of 11211 * static hmeblks that will be needed during re-map. 11212 */ 11213 if (!hblk_alloc_dynamic) { 11214 11215 ASSERT(!SFMMU_IS_SHMERID_VALID(rid)); 11216 11217 if (size == TTE8K) { 11218 index = nucleus_hblk8.index; 11219 if (index >= nucleus_hblk8.len) { 11220 /* 11221 * If we panic here, see startup_modules() to 11222 * make sure that we are calculating the 11223 * number of hblk8's that we need correctly. 11224 */ 11225 prom_panic("no nucleus hblk8 to allocate"); 11226 } 11227 hmeblkp = 11228 (struct hme_blk *)&nucleus_hblk8.list[index]; 11229 nucleus_hblk8.index++; 11230 SFMMU_STAT(sf_hblk8_nalloc); 11231 } else { 11232 index = nucleus_hblk1.index; 11233 if (nucleus_hblk1.index >= nucleus_hblk1.len) { 11234 /* 11235 * If we panic here, see startup_modules(). 11236 * Most likely you need to update the 11237 * calculation of the number of hblk1 elements 11238 * that the kernel needs to boot. 
11239 */ 11240 prom_panic("no nucleus hblk1 to allocate"); 11241 } 11242 hmeblkp = 11243 (struct hme_blk *)&nucleus_hblk1.list[index]; 11244 nucleus_hblk1.index++; 11245 SFMMU_STAT(sf_hblk1_nalloc); 11246 } 11247 11248 goto hblk_init; 11249 } 11250 11251 SFMMU_HASH_UNLOCK(hmebp); 11252 11253 if (sfmmup != KHATID && !SFMMU_IS_SHMERID_VALID(rid)) { 11254 if (mmu_page_sizes == max_mmu_page_sizes) { 11255 if (size < TTE256M) 11256 shw_hblkp = sfmmu_shadow_hcreate(sfmmup, vaddr, 11257 size, flags); 11258 } else { 11259 if (size < TTE4M) 11260 shw_hblkp = sfmmu_shadow_hcreate(sfmmup, vaddr, 11261 size, flags); 11262 } 11263 } else if (SFMMU_IS_SHMERID_VALID(rid)) { 11264 /* 11265 * Shared hmes use per region bitmaps in rgn_hmeflag 11266 * rather than shadow hmeblks to keep track of the 11267 * mapping sizes which have been allocated for the region. 11268 * Here we cleanup old invalid hmeblks with this rid, 11269 * which may be left around by pageunload(). 11270 */ 11271 int ttesz; 11272 caddr_t va; 11273 caddr_t eva = vaddr + TTEBYTES(size); 11274 11275 ASSERT(sfmmup != KHATID); 11276 11277 srdp = sfmmup->sfmmu_srdp; 11278 ASSERT(srdp != NULL && srdp->srd_refcnt != 0); 11279 rgnp = srdp->srd_hmergnp[rid]; 11280 ASSERT(rgnp != NULL && rgnp->rgn_id == rid); 11281 ASSERT(rgnp->rgn_refcnt != 0); 11282 ASSERT(size <= rgnp->rgn_pgszc); 11283 11284 ttesz = HBLK_MIN_TTESZ; 11285 do { 11286 if (!(rgnp->rgn_hmeflags & (0x1 << ttesz))) { 11287 continue; 11288 } 11289 11290 if (ttesz > size && ttesz != HBLK_MIN_TTESZ) { 11291 sfmmu_cleanup_rhblk(srdp, vaddr, rid, ttesz); 11292 } else if (ttesz < size) { 11293 for (va = vaddr; va < eva; 11294 va += TTEBYTES(ttesz)) { 11295 sfmmu_cleanup_rhblk(srdp, va, rid, 11296 ttesz); 11297 } 11298 } 11299 } while (++ttesz <= rgnp->rgn_pgszc); 11300 } 11301 11302 fill_hblk: 11303 owner = (hblk_reserve_thread == curthread) ? 
1 : 0; 11304 11305 if (owner && size == TTE8K) { 11306 11307 ASSERT(!SFMMU_IS_SHMERID_VALID(rid)); 11308 /* 11309 * We are really in a tight spot. We already own 11310 * hblk_reserve and we need another hblk. In anticipation 11311 * of this kind of scenario, we specifically set aside 11312 * HBLK_RESERVE_MIN number of hblks to be used exclusively 11313 * by owner of hblk_reserve. 11314 */ 11315 SFMMU_STAT(sf_hblk_recurse_cnt); 11316 11317 if (!sfmmu_get_free_hblk(&hmeblkp, 1)) 11318 panic("sfmmu_hblk_alloc: reserve list is empty"); 11319 11320 goto hblk_verify; 11321 } 11322 11323 ASSERT(!owner); 11324 11325 if ((flags & HAT_NO_KALLOC) == 0) { 11326 11327 sfmmu_cache = ((size == TTE8K) ? sfmmu8_cache : sfmmu1_cache); 11328 sleep = ((sfmmup == KHATID) ? KM_NOSLEEP : KM_SLEEP); 11329 11330 if ((hmeblkp = kmem_cache_alloc(sfmmu_cache, sleep)) == NULL) { 11331 hmeblkp = sfmmu_hblk_steal(size); 11332 } else { 11333 /* 11334 * if we are the owner of hblk_reserve, 11335 * swap hblk_reserve with hmeblkp and 11336 * start a fresh life. Hope things go 11337 * better this time. 
11338 */ 11339 if (hblk_reserve_thread == curthread) { 11340 ASSERT(sfmmu_cache == sfmmu8_cache); 11341 sfmmu_hblk_swap(hmeblkp); 11342 hblk_reserve_thread = NULL; 11343 mutex_exit(&hblk_reserve_lock); 11344 goto fill_hblk; 11345 } 11346 /* 11347 * let's donate this hblk to our reserve list if 11348 * we are not mapping kernel range 11349 */ 11350 if (size == TTE8K && sfmmup != KHATID) { 11351 if (sfmmu_put_free_hblk(hmeblkp, 0)) 11352 goto fill_hblk; 11353 } 11354 } 11355 } else { 11356 /* 11357 * We are here to map the slab in sfmmu8_cache; let's 11358 * check if we could tap our reserve list; if successful, 11359 * this will avoid the pain of going thru sfmmu_hblk_swap 11360 */ 11361 SFMMU_STAT(sf_hblk_slab_cnt); 11362 if (!sfmmu_get_free_hblk(&hmeblkp, 0)) { 11363 /* 11364 * let's start hblk_reserve dance 11365 */ 11366 SFMMU_STAT(sf_hblk_reserve_cnt); 11367 owner = 1; 11368 mutex_enter(&hblk_reserve_lock); 11369 hmeblkp = HBLK_RESERVE; 11370 hblk_reserve_thread = curthread; 11371 } 11372 } 11373 11374 hblk_verify: 11375 ASSERT(hmeblkp != NULL); 11376 set_hblk_sz(hmeblkp, size); 11377 ASSERT(hmeblkp->hblk_nextpa == va_to_pa((caddr_t)hmeblkp)); 11378 SFMMU_HASH_LOCK(hmebp); 11379 HME_HASH_FAST_SEARCH(hmebp, hblktag, newhblkp); 11380 if (newhblkp != NULL) { 11381 SFMMU_HASH_UNLOCK(hmebp); 11382 if (hmeblkp != HBLK_RESERVE) { 11383 /* 11384 * This is really tricky! 11385 * 11386 * vmem_alloc(vmem_seg_arena) 11387 * vmem_alloc(vmem_internal_arena) 11388 * segkmem_alloc(heap_arena) 11389 * vmem_alloc(heap_arena) 11390 * page_create() 11391 * hat_memload() 11392 * kmem_cache_free() 11393 * kmem_cache_alloc() 11394 * kmem_slab_create() 11395 * vmem_alloc(kmem_internal_arena) 11396 * segkmem_alloc(heap_arena) 11397 * vmem_alloc(heap_arena) 11398 * page_create() 11399 * hat_memload() 11400 * kmem_cache_free() 11401 * ... 
11402 * 11403 * Thus, hat_memload() could call kmem_cache_free 11404 * for enough number of times that we could easily 11405 * hit the bottom of the stack or run out of reserve 11406 * list of vmem_seg structs. So, we must donate 11407 * this hblk to reserve list if it's allocated 11408 * from sfmmu8_cache *and* mapping kernel range. 11409 * We don't need to worry about freeing hmeblk1's 11410 * to kmem since they don't map any kmem slabs. 11411 * 11412 * Note: When segkmem supports largepages, we must 11413 * free hmeblk1's to reserve list as well. 11414 */ 11415 forcefree = (sfmmup == KHATID) ? 1 : 0; 11416 if (size == TTE8K && 11417 sfmmu_put_free_hblk(hmeblkp, forcefree)) { 11418 goto re_verify; 11419 } 11420 ASSERT(sfmmup != KHATID); 11421 kmem_cache_free(get_hblk_cache(hmeblkp), hmeblkp); 11422 } else { 11423 /* 11424 * Hey! we don't need hblk_reserve any more. 11425 */ 11426 ASSERT(owner); 11427 hblk_reserve_thread = NULL; 11428 mutex_exit(&hblk_reserve_lock); 11429 owner = 0; 11430 } 11431 re_verify: 11432 /* 11433 * let's check if the goodies are still present 11434 */ 11435 SFMMU_HASH_LOCK(hmebp); 11436 HME_HASH_FAST_SEARCH(hmebp, hblktag, newhblkp); 11437 if (newhblkp != NULL) { 11438 /* 11439 * return newhblkp if it's not hblk_reserve; 11440 * if newhblkp is hblk_reserve, return it 11441 * _only if_ we are the owner of hblk_reserve. 11442 */ 11443 if (newhblkp != HBLK_RESERVE || owner) { 11444 ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || 11445 newhblkp->hblk_shared); 11446 ASSERT(SFMMU_IS_SHMERID_VALID(rid) || 11447 !newhblkp->hblk_shared); 11448 return (newhblkp); 11449 } else { 11450 /* 11451 * we just hit hblk_reserve in the hash and 11452 * we are not the owner of that; 11453 * 11454 * block until hblk_reserve_thread completes 11455 * swapping hblk_reserve and try the dance 11456 * once again. 
11457 */ 11458 SFMMU_HASH_UNLOCK(hmebp); 11459 mutex_enter(&hblk_reserve_lock); 11460 mutex_exit(&hblk_reserve_lock); 11461 SFMMU_STAT(sf_hblk_reserve_hit); 11462 goto fill_hblk; 11463 } 11464 } else { 11465 /* 11466 * it's no more! try the dance once again. 11467 */ 11468 SFMMU_HASH_UNLOCK(hmebp); 11469 goto fill_hblk; 11470 } 11471 } 11472 11473 hblk_init: 11474 if (SFMMU_IS_SHMERID_VALID(rid)) { 11475 uint16_t tteflag = 0x1 << 11476 ((size < HBLK_MIN_TTESZ) ? HBLK_MIN_TTESZ : size); 11477 11478 if (!(rgnp->rgn_hmeflags & tteflag)) { 11479 atomic_or_16(&rgnp->rgn_hmeflags, tteflag); 11480 } 11481 hmeblkp->hblk_shared = 1; 11482 } else { 11483 hmeblkp->hblk_shared = 0; 11484 } 11485 set_hblk_sz(hmeblkp, size); 11486 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 11487 hmeblkp->hblk_next = (struct hme_blk *)NULL; 11488 hmeblkp->hblk_tag = hblktag; 11489 hmeblkp->hblk_shadow = shw_hblkp; 11490 hblkpa = hmeblkp->hblk_nextpa; 11491 hmeblkp->hblk_nextpa = HMEBLK_ENDPA; 11492 11493 ASSERT(get_hblk_ttesz(hmeblkp) == size); 11494 ASSERT(get_hblk_span(hmeblkp) == HMEBLK_SPAN(size)); 11495 ASSERT(hmeblkp->hblk_hmecnt == 0); 11496 ASSERT(hmeblkp->hblk_vcnt == 0); 11497 ASSERT(hmeblkp->hblk_lckcnt == 0); 11498 ASSERT(hblkpa == va_to_pa((caddr_t)hmeblkp)); 11499 sfmmu_hblk_hash_add(hmebp, hmeblkp, hblkpa); 11500 return (hmeblkp); 11501 } 11502 11503 /* 11504 * This function cleans up the hme_blk and returns it to the free list. 
11505 */ 11506 /* ARGSUSED */ 11507 static void 11508 sfmmu_hblk_free(struct hme_blk **listp) 11509 { 11510 struct hme_blk *hmeblkp, *next_hmeblkp; 11511 int size; 11512 uint_t critical; 11513 uint64_t hblkpa; 11514 11515 ASSERT(*listp != NULL); 11516 11517 hmeblkp = *listp; 11518 while (hmeblkp != NULL) { 11519 next_hmeblkp = hmeblkp->hblk_next; 11520 ASSERT(!hmeblkp->hblk_hmecnt); 11521 ASSERT(!hmeblkp->hblk_vcnt); 11522 ASSERT(!hmeblkp->hblk_lckcnt); 11523 ASSERT(hmeblkp != (struct hme_blk *)hblk_reserve); 11524 ASSERT(hmeblkp->hblk_shared == 0); 11525 ASSERT(hmeblkp->hblk_shw_bit == 0); 11526 ASSERT(hmeblkp->hblk_shadow == NULL); 11527 11528 hblkpa = va_to_pa((caddr_t)hmeblkp); 11529 ASSERT(hblkpa != (uint64_t)-1); 11530 critical = (hblktosfmmu(hmeblkp) == KHATID) ? 1 : 0; 11531 11532 size = get_hblk_ttesz(hmeblkp); 11533 hmeblkp->hblk_next = NULL; 11534 hmeblkp->hblk_nextpa = hblkpa; 11535 11536 if (hmeblkp->hblk_nuc_bit == 0) { 11537 11538 if (size != TTE8K || 11539 !sfmmu_put_free_hblk(hmeblkp, critical)) 11540 kmem_cache_free(get_hblk_cache(hmeblkp), 11541 hmeblkp); 11542 } 11543 hmeblkp = next_hmeblkp; 11544 } 11545 } 11546 11547 #define BUCKETS_TO_SEARCH_BEFORE_UNLOAD 30 11548 #define SFMMU_HBLK_STEAL_THRESHOLD 5 11549 11550 static uint_t sfmmu_hblk_steal_twice; 11551 static uint_t sfmmu_hblk_steal_count, sfmmu_hblk_steal_unload_count; 11552 11553 /* 11554 * Steal a hmeblk from user or kernel hme hash lists. 11555 * For 8K tte grab one from reserve pool (freehblkp) before proceeding to 11556 * steal and if we fail to steal after SFMMU_HBLK_STEAL_THRESHOLD attempts 11557 * tap into critical reserve of freehblkp. 11558 * Note: We remain looping in this routine until we find one. 
11559 */ 11560 static struct hme_blk * 11561 sfmmu_hblk_steal(int size) 11562 { 11563 static struct hmehash_bucket *uhmehash_steal_hand = NULL; 11564 struct hmehash_bucket *hmebp; 11565 struct hme_blk *hmeblkp = NULL, *pr_hblk; 11566 uint64_t hblkpa; 11567 int i; 11568 uint_t loop_cnt = 0, critical; 11569 11570 for (;;) { 11571 /* Check cpu hblk pending queues */ 11572 if ((hmeblkp = sfmmu_check_pending_hblks(size)) != NULL) { 11573 hmeblkp->hblk_nextpa = va_to_pa((caddr_t)hmeblkp); 11574 ASSERT(hmeblkp->hblk_hmecnt == 0); 11575 ASSERT(hmeblkp->hblk_vcnt == 0); 11576 return (hmeblkp); 11577 } 11578 11579 if (size == TTE8K) { 11580 critical = 11581 (++loop_cnt > SFMMU_HBLK_STEAL_THRESHOLD) ? 1 : 0; 11582 if (sfmmu_get_free_hblk(&hmeblkp, critical)) 11583 return (hmeblkp); 11584 } 11585 11586 hmebp = (uhmehash_steal_hand == NULL) ? uhme_hash : 11587 uhmehash_steal_hand; 11588 ASSERT(hmebp >= uhme_hash && hmebp <= &uhme_hash[UHMEHASH_SZ]); 11589 11590 for (i = 0; hmeblkp == NULL && i <= UHMEHASH_SZ + 11591 BUCKETS_TO_SEARCH_BEFORE_UNLOAD; i++) { 11592 SFMMU_HASH_LOCK(hmebp); 11593 hmeblkp = hmebp->hmeblkp; 11594 hblkpa = hmebp->hmeh_nextpa; 11595 pr_hblk = NULL; 11596 while (hmeblkp) { 11597 /* 11598 * check if it is a hmeblk that is not locked 11599 * and not shared. skip shadow hmeblks with 11600 * shadow_mask set i.e valid count non zero. 11601 */ 11602 if ((get_hblk_ttesz(hmeblkp) == size) && 11603 (hmeblkp->hblk_shw_bit == 0 || 11604 hmeblkp->hblk_vcnt == 0) && 11605 (hmeblkp->hblk_lckcnt == 0)) { 11606 /* 11607 * there is a high probability that we 11608 * will find a free one. search some 11609 * buckets for a free hmeblk initially 11610 * before unloading a valid hmeblk. 
11611 */ 11612 if ((hmeblkp->hblk_vcnt == 0 && 11613 hmeblkp->hblk_hmecnt == 0) || (i >= 11614 BUCKETS_TO_SEARCH_BEFORE_UNLOAD)) { 11615 if (sfmmu_steal_this_hblk(hmebp, 11616 hmeblkp, hblkpa, pr_hblk)) { 11617 /* 11618 * Hblk is unloaded 11619 * successfully 11620 */ 11621 break; 11622 } 11623 } 11624 } 11625 pr_hblk = hmeblkp; 11626 hblkpa = hmeblkp->hblk_nextpa; 11627 hmeblkp = hmeblkp->hblk_next; 11628 } 11629 11630 SFMMU_HASH_UNLOCK(hmebp); 11631 if (hmebp++ == &uhme_hash[UHMEHASH_SZ]) 11632 hmebp = uhme_hash; 11633 } 11634 uhmehash_steal_hand = hmebp; 11635 11636 if (hmeblkp != NULL) 11637 break; 11638 11639 /* 11640 * in the worst case, look for a free one in the kernel 11641 * hash table. 11642 */ 11643 for (i = 0, hmebp = khme_hash; i <= KHMEHASH_SZ; i++) { 11644 SFMMU_HASH_LOCK(hmebp); 11645 hmeblkp = hmebp->hmeblkp; 11646 hblkpa = hmebp->hmeh_nextpa; 11647 pr_hblk = NULL; 11648 while (hmeblkp) { 11649 /* 11650 * check if it is free hmeblk 11651 */ 11652 if ((get_hblk_ttesz(hmeblkp) == size) && 11653 (hmeblkp->hblk_lckcnt == 0) && 11654 (hmeblkp->hblk_vcnt == 0) && 11655 (hmeblkp->hblk_hmecnt == 0)) { 11656 if (sfmmu_steal_this_hblk(hmebp, 11657 hmeblkp, hblkpa, pr_hblk)) { 11658 break; 11659 } else { 11660 /* 11661 * Cannot fail since we have 11662 * hash lock. 11663 */ 11664 panic("fail to steal?"); 11665 } 11666 } 11667 11668 pr_hblk = hmeblkp; 11669 hblkpa = hmeblkp->hblk_nextpa; 11670 hmeblkp = hmeblkp->hblk_next; 11671 } 11672 11673 SFMMU_HASH_UNLOCK(hmebp); 11674 if (hmebp++ == &khme_hash[KHMEHASH_SZ]) 11675 hmebp = khme_hash; 11676 } 11677 11678 if (hmeblkp != NULL) 11679 break; 11680 sfmmu_hblk_steal_twice++; 11681 } 11682 return (hmeblkp); 11683 } 11684 11685 /* 11686 * This routine does real work to prepare a hblk to be "stolen" by 11687 * unloading the mappings, updating shadow counts .... 
11688 * It returns 1 if the block is ready to be reused (stolen), or 0 11689 * means the block cannot be stolen yet- pageunload is still working 11690 * on this hblk. 11691 */ 11692 static int 11693 sfmmu_steal_this_hblk(struct hmehash_bucket *hmebp, struct hme_blk *hmeblkp, 11694 uint64_t hblkpa, struct hme_blk *pr_hblk) 11695 { 11696 int shw_size, vshift; 11697 struct hme_blk *shw_hblkp; 11698 caddr_t vaddr; 11699 uint_t shw_mask, newshw_mask; 11700 struct hme_blk *list = NULL; 11701 11702 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 11703 11704 /* 11705 * check if the hmeblk is free, unload if necessary 11706 */ 11707 if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) { 11708 sfmmu_t *sfmmup; 11709 demap_range_t dmr; 11710 11711 sfmmup = hblktosfmmu(hmeblkp); 11712 if (hmeblkp->hblk_shared || sfmmup->sfmmu_ismhat) { 11713 return (0); 11714 } 11715 DEMAP_RANGE_INIT(sfmmup, &dmr); 11716 (void) sfmmu_hblk_unload(sfmmup, hmeblkp, 11717 (caddr_t)get_hblk_base(hmeblkp), 11718 get_hblk_endaddr(hmeblkp), &dmr, HAT_UNLOAD); 11719 DEMAP_RANGE_FLUSH(&dmr); 11720 if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) { 11721 /* 11722 * Pageunload is working on the same hblk. 
11723 */ 11724 return (0); 11725 } 11726 11727 sfmmu_hblk_steal_unload_count++; 11728 } 11729 11730 ASSERT(hmeblkp->hblk_lckcnt == 0); 11731 ASSERT(hmeblkp->hblk_vcnt == 0 && hmeblkp->hblk_hmecnt == 0); 11732 11733 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, &list, 1); 11734 hmeblkp->hblk_nextpa = hblkpa; 11735 11736 shw_hblkp = hmeblkp->hblk_shadow; 11737 if (shw_hblkp) { 11738 ASSERT(!hmeblkp->hblk_shared); 11739 shw_size = get_hblk_ttesz(shw_hblkp); 11740 vaddr = (caddr_t)get_hblk_base(hmeblkp); 11741 vshift = vaddr_to_vshift(shw_hblkp->hblk_tag, vaddr, shw_size); 11742 ASSERT(vshift < 8); 11743 /* 11744 * Atomically clear shadow mask bit 11745 */ 11746 do { 11747 shw_mask = shw_hblkp->hblk_shw_mask; 11748 ASSERT(shw_mask & (1 << vshift)); 11749 newshw_mask = shw_mask & ~(1 << vshift); 11750 newshw_mask = cas32(&shw_hblkp->hblk_shw_mask, 11751 shw_mask, newshw_mask); 11752 } while (newshw_mask != shw_mask); 11753 hmeblkp->hblk_shadow = NULL; 11754 } 11755 11756 /* 11757 * remove shadow bit if we are stealing an unused shadow hmeblk. 11758 * sfmmu_hblk_alloc needs it that way, will set shadow bit later if 11759 * we are indeed allocating a shadow hmeblk. 
11760 */ 11761 hmeblkp->hblk_shw_bit = 0; 11762 11763 if (hmeblkp->hblk_shared) { 11764 sf_srd_t *srdp; 11765 sf_region_t *rgnp; 11766 uint_t rid; 11767 11768 srdp = hblktosrd(hmeblkp); 11769 ASSERT(srdp != NULL && srdp->srd_refcnt != 0); 11770 rid = hmeblkp->hblk_tag.htag_rid; 11771 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 11772 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 11773 rgnp = srdp->srd_hmergnp[rid]; 11774 ASSERT(rgnp != NULL); 11775 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid); 11776 hmeblkp->hblk_shared = 0; 11777 } 11778 11779 sfmmu_hblk_steal_count++; 11780 SFMMU_STAT(sf_steal_count); 11781 11782 return (1); 11783 } 11784 11785 struct hme_blk * 11786 sfmmu_hmetohblk(struct sf_hment *sfhme) 11787 { 11788 struct hme_blk *hmeblkp; 11789 struct sf_hment *sfhme0; 11790 struct hme_blk *hblk_dummy = 0; 11791 11792 /* 11793 * No dummy sf_hments, please. 11794 */ 11795 ASSERT(sfhme->hme_tte.ll != 0); 11796 11797 sfhme0 = sfhme - sfhme->hme_tte.tte_hmenum; 11798 hmeblkp = (struct hme_blk *)((uintptr_t)sfhme0 - 11799 (uintptr_t)&hblk_dummy->hblk_hme[0]); 11800 11801 return (hmeblkp); 11802 } 11803 11804 /* 11805 * On swapin, get appropriately sized TSB(s) and clear the HAT_SWAPPED flag. 11806 * If we can't get appropriately sized TSB(s), try for 8K TSB(s) using 11807 * KM_SLEEP allocation. 11808 * 11809 * Return 0 on success, -1 otherwise. 
11810 */ 11811 static void 11812 sfmmu_tsb_swapin(sfmmu_t *sfmmup, hatlock_t *hatlockp) 11813 { 11814 struct tsb_info *tsbinfop, *next; 11815 tsb_replace_rc_t rc; 11816 boolean_t gotfirst = B_FALSE; 11817 11818 ASSERT(sfmmup != ksfmmup); 11819 ASSERT(sfmmu_hat_lock_held(sfmmup)); 11820 11821 while (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPIN)) { 11822 cv_wait(&sfmmup->sfmmu_tsb_cv, HATLOCK_MUTEXP(hatlockp)); 11823 } 11824 11825 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) { 11826 SFMMU_FLAGS_SET(sfmmup, HAT_SWAPIN); 11827 } else { 11828 return; 11829 } 11830 11831 ASSERT(sfmmup->sfmmu_tsb != NULL); 11832 11833 /* 11834 * Loop over all tsbinfo's replacing them with ones that actually have 11835 * a TSB. If any of the replacements ever fail, bail out of the loop. 11836 */ 11837 for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL; tsbinfop = next) { 11838 ASSERT(tsbinfop->tsb_flags & TSB_SWAPPED); 11839 next = tsbinfop->tsb_next; 11840 rc = sfmmu_replace_tsb(sfmmup, tsbinfop, tsbinfop->tsb_szc, 11841 hatlockp, TSB_SWAPIN); 11842 if (rc != TSB_SUCCESS) { 11843 break; 11844 } 11845 gotfirst = B_TRUE; 11846 } 11847 11848 switch (rc) { 11849 case TSB_SUCCESS: 11850 SFMMU_FLAGS_CLEAR(sfmmup, HAT_SWAPPED|HAT_SWAPIN); 11851 cv_broadcast(&sfmmup->sfmmu_tsb_cv); 11852 return; 11853 case TSB_LOSTRACE: 11854 break; 11855 case TSB_ALLOCFAIL: 11856 break; 11857 default: 11858 panic("sfmmu_replace_tsb returned unrecognized failure code " 11859 "%d", rc); 11860 } 11861 11862 /* 11863 * In this case, we failed to get one of our TSBs. If we failed to 11864 * get the first TSB, get one of minimum size (8KB). Walk the list 11865 * and throw away the tsbinfos, starting where the allocation failed; 11866 * we can get by with just one TSB as long as we don't leave the 11867 * SWAPPED tsbinfo structures lying around. 
11868 */ 11869 tsbinfop = sfmmup->sfmmu_tsb; 11870 next = tsbinfop->tsb_next; 11871 tsbinfop->tsb_next = NULL; 11872 11873 sfmmu_hat_exit(hatlockp); 11874 for (tsbinfop = next; tsbinfop != NULL; tsbinfop = next) { 11875 next = tsbinfop->tsb_next; 11876 sfmmu_tsbinfo_free(tsbinfop); 11877 } 11878 hatlockp = sfmmu_hat_enter(sfmmup); 11879 11880 /* 11881 * If we don't have any TSBs, get a single 8K TSB for 8K, 64K and 512K 11882 * pages. 11883 */ 11884 if (!gotfirst) { 11885 tsbinfop = sfmmup->sfmmu_tsb; 11886 rc = sfmmu_replace_tsb(sfmmup, tsbinfop, TSB_MIN_SZCODE, 11887 hatlockp, TSB_SWAPIN | TSB_FORCEALLOC); 11888 ASSERT(rc == TSB_SUCCESS); 11889 } 11890 11891 SFMMU_FLAGS_CLEAR(sfmmup, HAT_SWAPPED|HAT_SWAPIN); 11892 cv_broadcast(&sfmmup->sfmmu_tsb_cv); 11893 } 11894 11895 static int 11896 sfmmu_is_rgnva(sf_srd_t *srdp, caddr_t addr, ulong_t w, ulong_t bmw) 11897 { 11898 ulong_t bix = 0; 11899 uint_t rid; 11900 sf_region_t *rgnp; 11901 11902 ASSERT(srdp != NULL); 11903 ASSERT(srdp->srd_refcnt != 0); 11904 11905 w <<= BT_ULSHIFT; 11906 while (bmw) { 11907 if (!(bmw & 0x1)) { 11908 bix++; 11909 bmw >>= 1; 11910 continue; 11911 } 11912 rid = w | bix; 11913 rgnp = srdp->srd_hmergnp[rid]; 11914 ASSERT(rgnp->rgn_refcnt > 0); 11915 ASSERT(rgnp->rgn_id == rid); 11916 if (addr < rgnp->rgn_saddr || 11917 addr >= (rgnp->rgn_saddr + rgnp->rgn_size)) { 11918 bix++; 11919 bmw >>= 1; 11920 } else { 11921 return (1); 11922 } 11923 } 11924 return (0); 11925 } 11926 11927 /* 11928 * Handle exceptions for low level tsb_handler. 11929 * 11930 * There are many scenarios that could land us here: 11931 * 11932 * If the context is invalid we land here. The context can be invalid 11933 * for 3 reasons: 1) we couldn't allocate a new context and now need to 11934 * perform a wrap around operation in order to allocate a new context. 
11935 * 2) Context was invalidated to change pagesize programming 3) ISMs or 11936 * TSBs configuration is changeing for this process and we are forced into 11937 * here to do a syncronization operation. If the context is valid we can 11938 * be here from window trap hanlder. In this case just call trap to handle 11939 * the fault. 11940 * 11941 * Note that the process will run in INVALID_CONTEXT before 11942 * faulting into here and subsequently loading the MMU registers 11943 * (including the TSB base register) associated with this process. 11944 * For this reason, the trap handlers must all test for 11945 * INVALID_CONTEXT before attempting to access any registers other 11946 * than the context registers. 11947 */ 11948 void 11949 sfmmu_tsbmiss_exception(struct regs *rp, uintptr_t tagaccess, uint_t traptype) 11950 { 11951 sfmmu_t *sfmmup, *shsfmmup; 11952 uint_t ctxtype; 11953 klwp_id_t lwp; 11954 char lwp_save_state; 11955 hatlock_t *hatlockp, *shatlockp; 11956 struct tsb_info *tsbinfop; 11957 struct tsbmiss *tsbmp; 11958 sf_scd_t *scdp; 11959 11960 SFMMU_STAT(sf_tsb_exceptions); 11961 SFMMU_MMU_STAT(mmu_tsb_exceptions); 11962 sfmmup = astosfmmu(curthread->t_procp->p_as); 11963 /* 11964 * note that in sun4u, tagacces register contains ctxnum 11965 * while sun4v passes ctxtype in the tagaccess register. 11966 */ 11967 ctxtype = tagaccess & TAGACC_CTX_MASK; 11968 11969 ASSERT(sfmmup != ksfmmup && ctxtype != KCONTEXT); 11970 ASSERT(sfmmup->sfmmu_ismhat == 0); 11971 ASSERT(!SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED) || 11972 ctxtype == INVALID_CONTEXT); 11973 11974 if (ctxtype != INVALID_CONTEXT && traptype != T_DATA_PROT) { 11975 /* 11976 * We may land here because shme bitmap and pagesize 11977 * flags are updated lazily in tsbmiss area on other cpus. 11978 * If we detect here that tsbmiss area is out of sync with 11979 * sfmmu update it and retry the trapped instruction. 11980 * Otherwise call trap(). 
11981 */ 11982 int ret = 0; 11983 uchar_t tteflag_mask = (1 << TTE64K) | (1 << TTE8K); 11984 caddr_t addr = (caddr_t)(tagaccess & TAGACC_VADDR_MASK); 11985 11986 /* 11987 * Must set lwp state to LWP_SYS before 11988 * trying to acquire any adaptive lock 11989 */ 11990 lwp = ttolwp(curthread); 11991 ASSERT(lwp); 11992 lwp_save_state = lwp->lwp_state; 11993 lwp->lwp_state = LWP_SYS; 11994 11995 hatlockp = sfmmu_hat_enter(sfmmup); 11996 kpreempt_disable(); 11997 tsbmp = &tsbmiss_area[CPU->cpu_id]; 11998 ASSERT(sfmmup == tsbmp->usfmmup); 11999 if (((tsbmp->uhat_tteflags ^ sfmmup->sfmmu_tteflags) & 12000 ~tteflag_mask) || 12001 ((tsbmp->uhat_rtteflags ^ sfmmup->sfmmu_rtteflags) & 12002 ~tteflag_mask)) { 12003 tsbmp->uhat_tteflags = sfmmup->sfmmu_tteflags; 12004 tsbmp->uhat_rtteflags = sfmmup->sfmmu_rtteflags; 12005 ret = 1; 12006 } 12007 if (sfmmup->sfmmu_srdp != NULL) { 12008 ulong_t *sm = sfmmup->sfmmu_hmeregion_map.bitmap; 12009 ulong_t *tm = tsbmp->shmermap; 12010 ulong_t i; 12011 for (i = 0; i < SFMMU_HMERGNMAP_WORDS; i++) { 12012 ulong_t d = tm[i] ^ sm[i]; 12013 if (d) { 12014 if (d & sm[i]) { 12015 if (!ret && sfmmu_is_rgnva( 12016 sfmmup->sfmmu_srdp, 12017 addr, i, d & sm[i])) { 12018 ret = 1; 12019 } 12020 } 12021 tm[i] = sm[i]; 12022 } 12023 } 12024 } 12025 kpreempt_enable(); 12026 sfmmu_hat_exit(hatlockp); 12027 lwp->lwp_state = lwp_save_state; 12028 if (ret) { 12029 return; 12030 } 12031 } else if (ctxtype == INVALID_CONTEXT) { 12032 /* 12033 * First, make sure we come out of here with a valid ctx, 12034 * since if we don't get one we'll simply loop on the 12035 * faulting instruction. 12036 * 12037 * If the ISM mappings are changing, the TSB is relocated, 12038 * the process is swapped, the process is joining SCD or 12039 * leaving SCD or shared regions we serialize behind the 12040 * controlling thread with hat lock, sfmmu_flags and 12041 * sfmmu_tsb_cv condition variable. 
12042 */ 12043 12044 /* 12045 * Must set lwp state to LWP_SYS before 12046 * trying to acquire any adaptive lock 12047 */ 12048 lwp = ttolwp(curthread); 12049 ASSERT(lwp); 12050 lwp_save_state = lwp->lwp_state; 12051 lwp->lwp_state = LWP_SYS; 12052 12053 hatlockp = sfmmu_hat_enter(sfmmup); 12054 retry: 12055 if ((scdp = sfmmup->sfmmu_scdp) != NULL) { 12056 shsfmmup = scdp->scd_sfmmup; 12057 ASSERT(shsfmmup != NULL); 12058 12059 for (tsbinfop = shsfmmup->sfmmu_tsb; tsbinfop != NULL; 12060 tsbinfop = tsbinfop->tsb_next) { 12061 if (tsbinfop->tsb_flags & TSB_RELOC_FLAG) { 12062 /* drop the private hat lock */ 12063 sfmmu_hat_exit(hatlockp); 12064 /* acquire the shared hat lock */ 12065 shatlockp = sfmmu_hat_enter(shsfmmup); 12066 /* 12067 * recheck to see if anything changed 12068 * after we drop the private hat lock. 12069 */ 12070 if (sfmmup->sfmmu_scdp == scdp && 12071 shsfmmup == scdp->scd_sfmmup) { 12072 sfmmu_tsb_chk_reloc(shsfmmup, 12073 shatlockp); 12074 } 12075 sfmmu_hat_exit(shatlockp); 12076 hatlockp = sfmmu_hat_enter(sfmmup); 12077 goto retry; 12078 } 12079 } 12080 } 12081 12082 for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL; 12083 tsbinfop = tsbinfop->tsb_next) { 12084 if (tsbinfop->tsb_flags & TSB_RELOC_FLAG) { 12085 cv_wait(&sfmmup->sfmmu_tsb_cv, 12086 HATLOCK_MUTEXP(hatlockp)); 12087 goto retry; 12088 } 12089 } 12090 12091 /* 12092 * Wait for ISM maps to be updated. 12093 */ 12094 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)) { 12095 cv_wait(&sfmmup->sfmmu_tsb_cv, 12096 HATLOCK_MUTEXP(hatlockp)); 12097 goto retry; 12098 } 12099 12100 /* Is this process joining an SCD? */ 12101 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD)) { 12102 /* 12103 * Flush private TSB and setup shared TSB. 12104 * sfmmu_finish_join_scd() does not drop the 12105 * hat lock. 12106 */ 12107 sfmmu_finish_join_scd(sfmmup); 12108 SFMMU_FLAGS_CLEAR(sfmmup, HAT_JOIN_SCD); 12109 } 12110 12111 /* 12112 * If we're swapping in, get TSB(s). 
Note that we must do 12113 * this before we get a ctx or load the MMU state. Once 12114 * we swap in we have to recheck to make sure the TSB(s) and 12115 * ISM mappings didn't change while we slept. 12116 */ 12117 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) { 12118 sfmmu_tsb_swapin(sfmmup, hatlockp); 12119 goto retry; 12120 } 12121 12122 sfmmu_get_ctx(sfmmup); 12123 12124 sfmmu_hat_exit(hatlockp); 12125 /* 12126 * Must restore lwp_state if not calling 12127 * trap() for further processing. Restore 12128 * it anyway. 12129 */ 12130 lwp->lwp_state = lwp_save_state; 12131 return; 12132 } 12133 trap(rp, (caddr_t)tagaccess, traptype, 0); 12134 } 12135 12136 static void 12137 sfmmu_tsb_chk_reloc(sfmmu_t *sfmmup, hatlock_t *hatlockp) 12138 { 12139 struct tsb_info *tp; 12140 12141 ASSERT(sfmmu_hat_lock_held(sfmmup)); 12142 12143 for (tp = sfmmup->sfmmu_tsb; tp != NULL; tp = tp->tsb_next) { 12144 if (tp->tsb_flags & TSB_RELOC_FLAG) { 12145 cv_wait(&sfmmup->sfmmu_tsb_cv, 12146 HATLOCK_MUTEXP(hatlockp)); 12147 break; 12148 } 12149 } 12150 } 12151 12152 /* 12153 * sfmmu_vatopfn_suspended is called from GET_TTE when TL=0 and 12154 * TTE_SUSPENDED bit set in tte we block on aquiring a page lock 12155 * rather than spinning to avoid send mondo timeouts with 12156 * interrupts enabled. When the lock is acquired it is immediately 12157 * released and we return back to sfmmu_vatopfn just after 12158 * the GET_TTE call. 12159 */ 12160 void 12161 sfmmu_vatopfn_suspended(caddr_t vaddr, sfmmu_t *sfmmu, tte_t *ttep) 12162 { 12163 struct page **pp; 12164 12165 (void) as_pagelock(sfmmu->sfmmu_as, &pp, vaddr, TTE_CSZ(ttep), S_WRITE); 12166 as_pageunlock(sfmmu->sfmmu_as, pp, vaddr, TTE_CSZ(ttep), S_WRITE); 12167 } 12168 12169 /* 12170 * sfmmu_tsbmiss_suspended is called from GET_TTE when TL>0 and 12171 * TTE_SUSPENDED bit set in tte. We do this so that we can handle 12172 * cross traps which cannot be handled while spinning in the 12173 * trap handlers. 
Simply enter and exit the kpr_suspendlock spin 12174 * mutex, which is held by the holder of the suspend bit, and then 12175 * retry the trapped instruction after unwinding. 12176 */ 12177 /*ARGSUSED*/ 12178 void 12179 sfmmu_tsbmiss_suspended(struct regs *rp, uintptr_t tagacc, uint_t traptype) 12180 { 12181 ASSERT(curthread != kreloc_thread); 12182 mutex_enter(&kpr_suspendlock); 12183 mutex_exit(&kpr_suspendlock); 12184 } 12185 12186 /* 12187 * This routine could be optimized to reduce the number of xcalls by flushing 12188 * the entire TLBs if region reference count is above some threshold but the 12189 * tradeoff will depend on the size of the TLB. So for now flush the specific 12190 * page a context at a time. 12191 * 12192 * If uselocks is 0 then it's called after all cpus were captured and all the 12193 * hat locks were taken. In this case don't take the region lock by relying on 12194 * the order of list region update operations in hat_join_region(), 12195 * hat_leave_region() and hat_dup_region(). The ordering in those routines 12196 * guarantees that list is always forward walkable and reaches active sfmmus 12197 * regardless of where xc_attention() captures a cpu. 
12198 */ 12199 cpuset_t 12200 sfmmu_rgntlb_demap(caddr_t addr, sf_region_t *rgnp, 12201 struct hme_blk *hmeblkp, int uselocks) 12202 { 12203 sfmmu_t *sfmmup; 12204 cpuset_t cpuset; 12205 cpuset_t rcpuset; 12206 hatlock_t *hatlockp; 12207 uint_t rid = rgnp->rgn_id; 12208 sf_rgn_link_t *rlink; 12209 sf_scd_t *scdp; 12210 12211 ASSERT(hmeblkp->hblk_shared); 12212 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 12213 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 12214 12215 CPUSET_ZERO(rcpuset); 12216 if (uselocks) { 12217 mutex_enter(&rgnp->rgn_mutex); 12218 } 12219 sfmmup = rgnp->rgn_sfmmu_head; 12220 while (sfmmup != NULL) { 12221 if (uselocks) { 12222 hatlockp = sfmmu_hat_enter(sfmmup); 12223 } 12224 12225 /* 12226 * When an SCD is created the SCD hat is linked on the sfmmu 12227 * region lists for each hme region which is part of the 12228 * SCD. If we find an SCD hat, when walking these lists, 12229 * then we flush the shared TSBs, if we find a private hat, 12230 * which is part of an SCD, but where the region 12231 * is not part of the SCD then we flush the private TSBs. 
12232 */ 12233 if (!sfmmup->sfmmu_scdhat && sfmmup->sfmmu_scdp != NULL && 12234 !SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD)) { 12235 scdp = sfmmup->sfmmu_scdp; 12236 if (SF_RGNMAP_TEST(scdp->scd_hmeregion_map, rid)) { 12237 if (uselocks) { 12238 sfmmu_hat_exit(hatlockp); 12239 } 12240 goto next; 12241 } 12242 } 12243 12244 SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, 0); 12245 12246 kpreempt_disable(); 12247 cpuset = sfmmup->sfmmu_cpusran; 12248 CPUSET_AND(cpuset, cpu_ready_set); 12249 CPUSET_DEL(cpuset, CPU->cpu_id); 12250 SFMMU_XCALL_STATS(sfmmup); 12251 xt_some(cpuset, vtag_flushpage_tl1, 12252 (uint64_t)addr, (uint64_t)sfmmup); 12253 vtag_flushpage(addr, (uint64_t)sfmmup); 12254 if (uselocks) { 12255 sfmmu_hat_exit(hatlockp); 12256 } 12257 kpreempt_enable(); 12258 CPUSET_OR(rcpuset, cpuset); 12259 12260 next: 12261 /* LINTED: constant in conditional context */ 12262 SFMMU_HMERID2RLINKP(sfmmup, rid, rlink, 0, 0); 12263 ASSERT(rlink != NULL); 12264 sfmmup = rlink->next; 12265 } 12266 if (uselocks) { 12267 mutex_exit(&rgnp->rgn_mutex); 12268 } 12269 return (rcpuset); 12270 } 12271 12272 /* 12273 * This routine takes an sfmmu pointer and the va for an adddress in an 12274 * ISM region as input and returns the corresponding region id in ism_rid. 12275 * The return value of 1 indicates that a region has been found and ism_rid 12276 * is valid, otherwise 0 is returned. 
12277 */ 12278 static int 12279 find_ism_rid(sfmmu_t *sfmmup, sfmmu_t *ism_sfmmup, caddr_t va, uint_t *ism_rid) 12280 { 12281 ism_blk_t *ism_blkp; 12282 int i; 12283 ism_map_t *ism_map; 12284 #ifdef DEBUG 12285 struct hat *ism_hatid; 12286 #endif 12287 ASSERT(sfmmu_hat_lock_held(sfmmup)); 12288 12289 ism_blkp = sfmmup->sfmmu_iblk; 12290 while (ism_blkp != NULL) { 12291 ism_map = ism_blkp->iblk_maps; 12292 for (i = 0; i < ISM_MAP_SLOTS && ism_map[i].imap_ismhat; i++) { 12293 if ((va >= ism_start(ism_map[i])) && 12294 (va < ism_end(ism_map[i]))) { 12295 12296 *ism_rid = ism_map[i].imap_rid; 12297 #ifdef DEBUG 12298 ism_hatid = ism_map[i].imap_ismhat; 12299 ASSERT(ism_hatid == ism_sfmmup); 12300 ASSERT(ism_hatid->sfmmu_ismhat); 12301 #endif 12302 return (1); 12303 } 12304 } 12305 ism_blkp = ism_blkp->iblk_next; 12306 } 12307 return (0); 12308 } 12309 12310 /* 12311 * Special routine to flush out ism mappings- TSBs, TLBs and D-caches. 12312 * This routine may be called with all cpu's captured. Therefore, the 12313 * caller is responsible for holding all locks and disabling kernel 12314 * preemption. 12315 */ 12316 /* ARGSUSED */ 12317 static void 12318 sfmmu_ismtlbcache_demap(caddr_t addr, sfmmu_t *ism_sfmmup, 12319 struct hme_blk *hmeblkp, pfn_t pfnum, int cache_flush_flag) 12320 { 12321 cpuset_t cpuset; 12322 caddr_t va; 12323 ism_ment_t *ment; 12324 sfmmu_t *sfmmup; 12325 #ifdef VAC 12326 int vcolor; 12327 #endif 12328 12329 sf_scd_t *scdp; 12330 uint_t ism_rid; 12331 12332 ASSERT(!hmeblkp->hblk_shared); 12333 /* 12334 * Walk the ism_hat's mapping list and flush the page 12335 * from every hat sharing this ism_hat. This routine 12336 * may be called while all cpu's have been captured. 12337 * Therefore we can't attempt to grab any locks. For now 12338 * this means we will protect the ism mapping list under 12339 * a single lock which will be grabbed by the caller. 
12340 * If hat_share/unshare scalibility becomes a performance 12341 * problem then we may need to re-think ism mapping list locking. 12342 */ 12343 ASSERT(ism_sfmmup->sfmmu_ismhat); 12344 ASSERT(MUTEX_HELD(&ism_mlist_lock)); 12345 addr = addr - ISMID_STARTADDR; 12346 12347 for (ment = ism_sfmmup->sfmmu_iment; ment; ment = ment->iment_next) { 12348 12349 sfmmup = ment->iment_hat; 12350 12351 va = ment->iment_base_va; 12352 va = (caddr_t)((uintptr_t)va + (uintptr_t)addr); 12353 12354 /* 12355 * When an SCD is created the SCD hat is linked on the ism 12356 * mapping lists for each ISM segment which is part of the 12357 * SCD. If we find an SCD hat, when walking these lists, 12358 * then we flush the shared TSBs, if we find a private hat, 12359 * which is part of an SCD, but where the region 12360 * corresponding to this va is not part of the SCD then we 12361 * flush the private TSBs. 12362 */ 12363 if (!sfmmup->sfmmu_scdhat && sfmmup->sfmmu_scdp != NULL && 12364 !SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD) && 12365 !SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)) { 12366 if (!find_ism_rid(sfmmup, ism_sfmmup, va, 12367 &ism_rid)) { 12368 cmn_err(CE_PANIC, 12369 "can't find matching ISM rid!"); 12370 } 12371 12372 scdp = sfmmup->sfmmu_scdp; 12373 if (SFMMU_IS_ISMRID_VALID(ism_rid) && 12374 SF_RGNMAP_TEST(scdp->scd_ismregion_map, 12375 ism_rid)) { 12376 continue; 12377 } 12378 } 12379 SFMMU_UNLOAD_TSB(va, sfmmup, hmeblkp, 1); 12380 12381 cpuset = sfmmup->sfmmu_cpusran; 12382 CPUSET_AND(cpuset, cpu_ready_set); 12383 CPUSET_DEL(cpuset, CPU->cpu_id); 12384 SFMMU_XCALL_STATS(sfmmup); 12385 xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)va, 12386 (uint64_t)sfmmup); 12387 vtag_flushpage(va, (uint64_t)sfmmup); 12388 12389 #ifdef VAC 12390 /* 12391 * Flush D$ 12392 * When flushing D$ we must flush all 12393 * cpu's. See sfmmu_cache_flush(). 
12394 */ 12395 if (cache_flush_flag == CACHE_FLUSH) { 12396 cpuset = cpu_ready_set; 12397 CPUSET_DEL(cpuset, CPU->cpu_id); 12398 12399 SFMMU_XCALL_STATS(sfmmup); 12400 vcolor = addr_to_vcolor(va); 12401 xt_some(cpuset, vac_flushpage_tl1, pfnum, vcolor); 12402 vac_flushpage(pfnum, vcolor); 12403 } 12404 #endif /* VAC */ 12405 } 12406 } 12407 12408 /* 12409 * Demaps the TSB, CPU caches, and flushes all TLBs on all CPUs of 12410 * a particular virtual address and ctx. If noflush is set we do not 12411 * flush the TLB/TSB. This function may or may not be called with the 12412 * HAT lock held. 12413 */ 12414 static void 12415 sfmmu_tlbcache_demap(caddr_t addr, sfmmu_t *sfmmup, struct hme_blk *hmeblkp, 12416 pfn_t pfnum, int tlb_noflush, int cpu_flag, int cache_flush_flag, 12417 int hat_lock_held) 12418 { 12419 #ifdef VAC 12420 int vcolor; 12421 #endif 12422 cpuset_t cpuset; 12423 hatlock_t *hatlockp; 12424 12425 ASSERT(!hmeblkp->hblk_shared); 12426 12427 #if defined(lint) && !defined(VAC) 12428 pfnum = pfnum; 12429 cpu_flag = cpu_flag; 12430 cache_flush_flag = cache_flush_flag; 12431 #endif 12432 12433 /* 12434 * There is no longer a need to protect against ctx being 12435 * stolen here since we don't store the ctx in the TSB anymore. 12436 */ 12437 #ifdef VAC 12438 vcolor = addr_to_vcolor(addr); 12439 #endif 12440 12441 /* 12442 * We must hold the hat lock during the flush of TLB, 12443 * to avoid a race with sfmmu_invalidate_ctx(), where 12444 * sfmmu_cnum on a MMU could be set to INVALID_CONTEXT, 12445 * causing TLB demap routine to skip flush on that MMU. 12446 * If the context on a MMU has already been set to 12447 * INVALID_CONTEXT, we just get an extra flush on 12448 * that MMU. 12449 */ 12450 if (!hat_lock_held && !tlb_noflush) 12451 hatlockp = sfmmu_hat_enter(sfmmup); 12452 12453 kpreempt_disable(); 12454 if (!tlb_noflush) { 12455 /* 12456 * Flush the TSB and TLB. 
12457 */ 12458 SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, 0); 12459 12460 cpuset = sfmmup->sfmmu_cpusran; 12461 CPUSET_AND(cpuset, cpu_ready_set); 12462 CPUSET_DEL(cpuset, CPU->cpu_id); 12463 12464 SFMMU_XCALL_STATS(sfmmup); 12465 12466 xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)addr, 12467 (uint64_t)sfmmup); 12468 12469 vtag_flushpage(addr, (uint64_t)sfmmup); 12470 } 12471 12472 if (!hat_lock_held && !tlb_noflush) 12473 sfmmu_hat_exit(hatlockp); 12474 12475 #ifdef VAC 12476 /* 12477 * Flush the D$ 12478 * 12479 * Even if the ctx is stolen, we need to flush the 12480 * cache. Our ctx stealer only flushes the TLBs. 12481 */ 12482 if (cache_flush_flag == CACHE_FLUSH) { 12483 if (cpu_flag & FLUSH_ALL_CPUS) { 12484 cpuset = cpu_ready_set; 12485 } else { 12486 cpuset = sfmmup->sfmmu_cpusran; 12487 CPUSET_AND(cpuset, cpu_ready_set); 12488 } 12489 CPUSET_DEL(cpuset, CPU->cpu_id); 12490 SFMMU_XCALL_STATS(sfmmup); 12491 xt_some(cpuset, vac_flushpage_tl1, pfnum, vcolor); 12492 vac_flushpage(pfnum, vcolor); 12493 } 12494 #endif /* VAC */ 12495 kpreempt_enable(); 12496 } 12497 12498 /* 12499 * Demaps the TSB and flushes all TLBs on all cpus for a particular virtual 12500 * address and ctx. If noflush is set we do not currently do anything. 12501 * This function may or may not be called with the HAT lock held. 12502 */ 12503 static void 12504 sfmmu_tlb_demap(caddr_t addr, sfmmu_t *sfmmup, struct hme_blk *hmeblkp, 12505 int tlb_noflush, int hat_lock_held) 12506 { 12507 cpuset_t cpuset; 12508 hatlock_t *hatlockp; 12509 12510 ASSERT(!hmeblkp->hblk_shared); 12511 12512 /* 12513 * If the process is exiting we have nothing to do. 12514 */ 12515 if (tlb_noflush) 12516 return; 12517 12518 /* 12519 * Flush TSB. 
12520 */ 12521 if (!hat_lock_held) 12522 hatlockp = sfmmu_hat_enter(sfmmup); 12523 SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, 0); 12524 12525 kpreempt_disable(); 12526 12527 cpuset = sfmmup->sfmmu_cpusran; 12528 CPUSET_AND(cpuset, cpu_ready_set); 12529 CPUSET_DEL(cpuset, CPU->cpu_id); 12530 12531 SFMMU_XCALL_STATS(sfmmup); 12532 xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)addr, (uint64_t)sfmmup); 12533 12534 vtag_flushpage(addr, (uint64_t)sfmmup); 12535 12536 if (!hat_lock_held) 12537 sfmmu_hat_exit(hatlockp); 12538 12539 kpreempt_enable(); 12540 12541 } 12542 12543 /* 12544 * Special case of sfmmu_tlb_demap for MMU_PAGESIZE hblks. Use the xcall 12545 * call handler that can flush a range of pages to save on xcalls. 12546 */ 12547 static int sfmmu_xcall_save; 12548 12549 /* 12550 * this routine is never used for demaping addresses backed by SRD hmeblks. 12551 */ 12552 static void 12553 sfmmu_tlb_range_demap(demap_range_t *dmrp) 12554 { 12555 sfmmu_t *sfmmup = dmrp->dmr_sfmmup; 12556 hatlock_t *hatlockp; 12557 cpuset_t cpuset; 12558 uint64_t sfmmu_pgcnt; 12559 pgcnt_t pgcnt = 0; 12560 int pgunload = 0; 12561 int dirtypg = 0; 12562 caddr_t addr = dmrp->dmr_addr; 12563 caddr_t eaddr; 12564 uint64_t bitvec = dmrp->dmr_bitvec; 12565 12566 ASSERT(bitvec & 1); 12567 12568 /* 12569 * Flush TSB and calculate number of pages to flush. 12570 */ 12571 while (bitvec != 0) { 12572 dirtypg = 0; 12573 /* 12574 * Find the first page to flush and then count how many 12575 * pages there are after it that also need to be flushed. 12576 * This way the number of TSB flushes is minimized. 
12577 */ 12578 while ((bitvec & 1) == 0) { 12579 pgcnt++; 12580 addr += MMU_PAGESIZE; 12581 bitvec >>= 1; 12582 } 12583 while (bitvec & 1) { 12584 dirtypg++; 12585 bitvec >>= 1; 12586 } 12587 eaddr = addr + ptob(dirtypg); 12588 hatlockp = sfmmu_hat_enter(sfmmup); 12589 sfmmu_unload_tsb_range(sfmmup, addr, eaddr, TTE8K); 12590 sfmmu_hat_exit(hatlockp); 12591 pgunload += dirtypg; 12592 addr = eaddr; 12593 pgcnt += dirtypg; 12594 } 12595 12596 ASSERT((pgcnt<<MMU_PAGESHIFT) <= dmrp->dmr_endaddr - dmrp->dmr_addr); 12597 if (sfmmup->sfmmu_free == 0) { 12598 addr = dmrp->dmr_addr; 12599 bitvec = dmrp->dmr_bitvec; 12600 12601 /* 12602 * make sure it has SFMMU_PGCNT_SHIFT bits only, 12603 * as it will be used to pack argument for xt_some 12604 */ 12605 ASSERT((pgcnt > 0) && 12606 (pgcnt <= (1 << SFMMU_PGCNT_SHIFT))); 12607 12608 /* 12609 * Encode pgcnt as (pgcnt -1 ), and pass (pgcnt - 1) in 12610 * the low 6 bits of sfmmup. This is doable since pgcnt 12611 * always >= 1. 12612 */ 12613 ASSERT(!((uint64_t)sfmmup & SFMMU_PGCNT_MASK)); 12614 sfmmu_pgcnt = (uint64_t)sfmmup | 12615 ((pgcnt - 1) & SFMMU_PGCNT_MASK); 12616 12617 /* 12618 * We must hold the hat lock during the flush of TLB, 12619 * to avoid a race with sfmmu_invalidate_ctx(), where 12620 * sfmmu_cnum on a MMU could be set to INVALID_CONTEXT, 12621 * causing TLB demap routine to skip flush on that MMU. 12622 * If the context on a MMU has already been set to 12623 * INVALID_CONTEXT, we just get an extra flush on 12624 * that MMU. 
12625 */ 12626 hatlockp = sfmmu_hat_enter(sfmmup); 12627 kpreempt_disable(); 12628 12629 cpuset = sfmmup->sfmmu_cpusran; 12630 CPUSET_AND(cpuset, cpu_ready_set); 12631 CPUSET_DEL(cpuset, CPU->cpu_id); 12632 12633 SFMMU_XCALL_STATS(sfmmup); 12634 xt_some(cpuset, vtag_flush_pgcnt_tl1, (uint64_t)addr, 12635 sfmmu_pgcnt); 12636 12637 for (; bitvec != 0; bitvec >>= 1) { 12638 if (bitvec & 1) 12639 vtag_flushpage(addr, (uint64_t)sfmmup); 12640 addr += MMU_PAGESIZE; 12641 } 12642 kpreempt_enable(); 12643 sfmmu_hat_exit(hatlockp); 12644 12645 sfmmu_xcall_save += (pgunload-1); 12646 } 12647 dmrp->dmr_bitvec = 0; 12648 } 12649 12650 /* 12651 * In cases where we need to synchronize with TLB/TSB miss trap 12652 * handlers, _and_ need to flush the TLB, it's a lot easier to 12653 * throw away the context from the process than to do a 12654 * special song and dance to keep things consistent for the 12655 * handlers. 12656 * 12657 * Since the process suddenly ends up without a context and our caller 12658 * holds the hat lock, threads that fault after this function is called 12659 * will pile up on the lock. We can then do whatever we need to 12660 * atomically from the context of the caller. The first blocked thread 12661 * to resume executing will get the process a new context, and the 12662 * process will resume executing. 12663 * 12664 * One added advantage of this approach is that on MMUs that 12665 * support a "flush all" operation, we will delay the flush until 12666 * cnum wrap-around, and then flush the TLB one time. This 12667 * is rather rare, so it's a lot less expensive than making 8000 12668 * x-calls to flush the TLB 8000 times. 12669 * 12670 * A per-process (PP) lock is used to synchronize ctx allocations in 12671 * resume() and ctx invalidations here. 
12672 */ 12673 static void 12674 sfmmu_invalidate_ctx(sfmmu_t *sfmmup) 12675 { 12676 cpuset_t cpuset; 12677 int cnum, currcnum; 12678 mmu_ctx_t *mmu_ctxp; 12679 int i; 12680 uint_t pstate_save; 12681 12682 SFMMU_STAT(sf_ctx_inv); 12683 12684 ASSERT(sfmmu_hat_lock_held(sfmmup)); 12685 ASSERT(sfmmup != ksfmmup); 12686 12687 kpreempt_disable(); 12688 12689 mmu_ctxp = CPU_MMU_CTXP(CPU); 12690 ASSERT(mmu_ctxp); 12691 ASSERT(mmu_ctxp->mmu_idx < max_mmu_ctxdoms); 12692 ASSERT(mmu_ctxp == mmu_ctxs_tbl[mmu_ctxp->mmu_idx]); 12693 12694 currcnum = sfmmup->sfmmu_ctxs[mmu_ctxp->mmu_idx].cnum; 12695 12696 pstate_save = sfmmu_disable_intrs(); 12697 12698 lock_set(&sfmmup->sfmmu_ctx_lock); /* acquire PP lock */ 12699 /* set HAT cnum invalid across all context domains. */ 12700 for (i = 0; i < max_mmu_ctxdoms; i++) { 12701 12702 cnum = sfmmup->sfmmu_ctxs[i].cnum; 12703 if (cnum == INVALID_CONTEXT) { 12704 continue; 12705 } 12706 12707 sfmmup->sfmmu_ctxs[i].cnum = INVALID_CONTEXT; 12708 } 12709 membar_enter(); /* make sure globally visible to all CPUs */ 12710 lock_clear(&sfmmup->sfmmu_ctx_lock); /* release PP lock */ 12711 12712 sfmmu_enable_intrs(pstate_save); 12713 12714 cpuset = sfmmup->sfmmu_cpusran; 12715 CPUSET_DEL(cpuset, CPU->cpu_id); 12716 CPUSET_AND(cpuset, cpu_ready_set); 12717 if (!CPUSET_ISNULL(cpuset)) { 12718 SFMMU_XCALL_STATS(sfmmup); 12719 xt_some(cpuset, sfmmu_raise_tsb_exception, 12720 (uint64_t)sfmmup, INVALID_CONTEXT); 12721 xt_sync(cpuset); 12722 SFMMU_STAT(sf_tsb_raise_exception); 12723 SFMMU_MMU_STAT(mmu_tsb_raise_exception); 12724 } 12725 12726 /* 12727 * If the hat to-be-invalidated is the same as the current 12728 * process on local CPU we need to invalidate 12729 * this CPU context as well. 
12730 */ 12731 if ((sfmmu_getctx_sec() == currcnum) && 12732 (currcnum != INVALID_CONTEXT)) { 12733 /* sets shared context to INVALID too */ 12734 sfmmu_setctx_sec(INVALID_CONTEXT); 12735 sfmmu_clear_utsbinfo(); 12736 } 12737 12738 SFMMU_FLAGS_SET(sfmmup, HAT_ALLCTX_INVALID); 12739 12740 kpreempt_enable(); 12741 12742 /* 12743 * we hold the hat lock, so nobody should allocate a context 12744 * for us yet 12745 */ 12746 ASSERT(sfmmup->sfmmu_ctxs[mmu_ctxp->mmu_idx].cnum == INVALID_CONTEXT); 12747 } 12748 12749 #ifdef VAC 12750 /* 12751 * We need to flush the cache in all cpus. It is possible that 12752 * a process referenced a page as cacheable but has sinced exited 12753 * and cleared the mapping list. We still to flush it but have no 12754 * state so all cpus is the only alternative. 12755 */ 12756 void 12757 sfmmu_cache_flush(pfn_t pfnum, int vcolor) 12758 { 12759 cpuset_t cpuset; 12760 12761 kpreempt_disable(); 12762 cpuset = cpu_ready_set; 12763 CPUSET_DEL(cpuset, CPU->cpu_id); 12764 SFMMU_XCALL_STATS(NULL); /* account to any ctx */ 12765 xt_some(cpuset, vac_flushpage_tl1, pfnum, vcolor); 12766 xt_sync(cpuset); 12767 vac_flushpage(pfnum, vcolor); 12768 kpreempt_enable(); 12769 } 12770 12771 void 12772 sfmmu_cache_flushcolor(int vcolor, pfn_t pfnum) 12773 { 12774 cpuset_t cpuset; 12775 12776 ASSERT(vcolor >= 0); 12777 12778 kpreempt_disable(); 12779 cpuset = cpu_ready_set; 12780 CPUSET_DEL(cpuset, CPU->cpu_id); 12781 SFMMU_XCALL_STATS(NULL); /* account to any ctx */ 12782 xt_some(cpuset, vac_flushcolor_tl1, vcolor, pfnum); 12783 xt_sync(cpuset); 12784 vac_flushcolor(vcolor, pfnum); 12785 kpreempt_enable(); 12786 } 12787 #endif /* VAC */ 12788 12789 /* 12790 * We need to prevent processes from accessing the TSB using a cached physical 12791 * address. It's alright if they try to access the TSB via virtual address 12792 * since they will just fault on that virtual address once the mapping has 12793 * been suspended. 
12794 */ 12795 #pragma weak sendmondo_in_recover 12796 12797 /* ARGSUSED */ 12798 static int 12799 sfmmu_tsb_pre_relocator(caddr_t va, uint_t tsbsz, uint_t flags, void *tsbinfo) 12800 { 12801 struct tsb_info *tsbinfop = (struct tsb_info *)tsbinfo; 12802 sfmmu_t *sfmmup = tsbinfop->tsb_sfmmu; 12803 hatlock_t *hatlockp; 12804 sf_scd_t *scdp; 12805 12806 if (flags != HAT_PRESUSPEND) 12807 return (0); 12808 12809 /* 12810 * If tsb is a shared TSB with TSB_SHAREDCTX set, sfmmup must 12811 * be a shared hat, then set SCD's tsbinfo's flag. 12812 * If tsb is not shared, sfmmup is a private hat, then set 12813 * its private tsbinfo's flag. 12814 */ 12815 hatlockp = sfmmu_hat_enter(sfmmup); 12816 tsbinfop->tsb_flags |= TSB_RELOC_FLAG; 12817 12818 if (!(tsbinfop->tsb_flags & TSB_SHAREDCTX)) { 12819 sfmmu_tsb_inv_ctx(sfmmup); 12820 sfmmu_hat_exit(hatlockp); 12821 } else { 12822 /* release lock on the shared hat */ 12823 sfmmu_hat_exit(hatlockp); 12824 /* sfmmup is a shared hat */ 12825 ASSERT(sfmmup->sfmmu_scdhat); 12826 scdp = sfmmup->sfmmu_scdp; 12827 ASSERT(scdp != NULL); 12828 /* get private hat from the scd list */ 12829 mutex_enter(&scdp->scd_mutex); 12830 sfmmup = scdp->scd_sf_list; 12831 while (sfmmup != NULL) { 12832 hatlockp = sfmmu_hat_enter(sfmmup); 12833 /* 12834 * We do not call sfmmu_tsb_inv_ctx here because 12835 * sendmondo_in_recover check is only needed for 12836 * sun4u. 12837 */ 12838 sfmmu_invalidate_ctx(sfmmup); 12839 sfmmu_hat_exit(hatlockp); 12840 sfmmup = sfmmup->sfmmu_scd_link.next; 12841 12842 } 12843 mutex_exit(&scdp->scd_mutex); 12844 } 12845 return (0); 12846 } 12847 12848 static void 12849 sfmmu_tsb_inv_ctx(sfmmu_t *sfmmup) 12850 { 12851 extern uint32_t sendmondo_in_recover; 12852 12853 ASSERT(sfmmu_hat_lock_held(sfmmup)); 12854 12855 /* 12856 * For Cheetah+ Erratum 25: 12857 * Wait for any active recovery to finish. 
We can't risk 12858 * relocating the TSB of the thread running mondo_recover_proc() 12859 * since, if we did that, we would deadlock. The scenario we are 12860 * trying to avoid is as follows: 12861 * 12862 * THIS CPU RECOVER CPU 12863 * -------- ----------- 12864 * Begins recovery, walking through TSB 12865 * hat_pagesuspend() TSB TTE 12866 * TLB miss on TSB TTE, spins at TL1 12867 * xt_sync() 12868 * send_mondo_timeout() 12869 * mondo_recover_proc() 12870 * ((deadlocked)) 12871 * 12872 * The second half of the workaround is that mondo_recover_proc() 12873 * checks to see if the tsb_info has the RELOC flag set, and if it 12874 * does, it skips over that TSB without ever touching tsbinfop->tsb_va 12875 * and hence avoiding the TLB miss that could result in a deadlock. 12876 */ 12877 if (&sendmondo_in_recover) { 12878 membar_enter(); /* make sure RELOC flag visible */ 12879 while (sendmondo_in_recover) { 12880 drv_usecwait(1); 12881 membar_consumer(); 12882 } 12883 } 12884 12885 sfmmu_invalidate_ctx(sfmmup); 12886 } 12887 12888 /* ARGSUSED */ 12889 static int 12890 sfmmu_tsb_post_relocator(caddr_t va, uint_t tsbsz, uint_t flags, 12891 void *tsbinfo, pfn_t newpfn) 12892 { 12893 hatlock_t *hatlockp; 12894 struct tsb_info *tsbinfop = (struct tsb_info *)tsbinfo; 12895 sfmmu_t *sfmmup = tsbinfop->tsb_sfmmu; 12896 12897 if (flags != HAT_POSTUNSUSPEND) 12898 return (0); 12899 12900 hatlockp = sfmmu_hat_enter(sfmmup); 12901 12902 SFMMU_STAT(sf_tsb_reloc); 12903 12904 /* 12905 * The process may have swapped out while we were relocating one 12906 * of its TSBs. If so, don't bother doing the setup since the 12907 * process can't be using the memory anymore. 
12908 */ 12909 if ((tsbinfop->tsb_flags & TSB_SWAPPED) == 0) { 12910 ASSERT(va == tsbinfop->tsb_va); 12911 sfmmu_tsbinfo_setup_phys(tsbinfop, newpfn); 12912 12913 if (tsbinfop->tsb_flags & TSB_FLUSH_NEEDED) { 12914 sfmmu_inv_tsb(tsbinfop->tsb_va, 12915 TSB_BYTES(tsbinfop->tsb_szc)); 12916 tsbinfop->tsb_flags &= ~TSB_FLUSH_NEEDED; 12917 } 12918 } 12919 12920 membar_exit(); 12921 tsbinfop->tsb_flags &= ~TSB_RELOC_FLAG; 12922 cv_broadcast(&sfmmup->sfmmu_tsb_cv); 12923 12924 sfmmu_hat_exit(hatlockp); 12925 12926 return (0); 12927 } 12928 12929 /* 12930 * Allocate and initialize a tsb_info structure. Note that we may or may not 12931 * allocate a TSB here, depending on the flags passed in. 12932 */ 12933 static int 12934 sfmmu_tsbinfo_alloc(struct tsb_info **tsbinfopp, int tsb_szc, int tte_sz_mask, 12935 uint_t flags, sfmmu_t *sfmmup) 12936 { 12937 int err; 12938 12939 *tsbinfopp = (struct tsb_info *)kmem_cache_alloc( 12940 sfmmu_tsbinfo_cache, KM_SLEEP); 12941 12942 if ((err = sfmmu_init_tsbinfo(*tsbinfopp, tte_sz_mask, 12943 tsb_szc, flags, sfmmup)) != 0) { 12944 kmem_cache_free(sfmmu_tsbinfo_cache, *tsbinfopp); 12945 SFMMU_STAT(sf_tsb_allocfail); 12946 *tsbinfopp = NULL; 12947 return (err); 12948 } 12949 SFMMU_STAT(sf_tsb_alloc); 12950 12951 /* 12952 * Bump the TSB size counters for this TSB size. 12953 */ 12954 (*(((int *)&sfmmu_tsbsize_stat) + tsb_szc))++; 12955 return (0); 12956 } 12957 12958 static void 12959 sfmmu_tsb_free(struct tsb_info *tsbinfo) 12960 { 12961 caddr_t tsbva = tsbinfo->tsb_va; 12962 uint_t tsb_size = TSB_BYTES(tsbinfo->tsb_szc); 12963 struct kmem_cache *kmem_cachep = tsbinfo->tsb_cache; 12964 vmem_t *vmp = tsbinfo->tsb_vmp; 12965 12966 /* 12967 * If we allocated this TSB from relocatable kernel memory, then we 12968 * need to uninstall the callback handler. 
12969 */ 12970 if (tsbinfo->tsb_cache != sfmmu_tsb8k_cache) { 12971 uintptr_t slab_mask; 12972 caddr_t slab_vaddr; 12973 page_t **ppl; 12974 int ret; 12975 12976 ASSERT(tsb_size <= MMU_PAGESIZE4M || use_bigtsb_arena); 12977 if (tsb_size > MMU_PAGESIZE4M) 12978 slab_mask = ~((uintptr_t)bigtsb_slab_mask) << PAGESHIFT; 12979 else 12980 slab_mask = ~((uintptr_t)tsb_slab_mask) << PAGESHIFT; 12981 slab_vaddr = (caddr_t)((uintptr_t)tsbva & slab_mask); 12982 12983 ret = as_pagelock(&kas, &ppl, slab_vaddr, PAGESIZE, S_WRITE); 12984 ASSERT(ret == 0); 12985 hat_delete_callback(tsbva, (uint_t)tsb_size, (void *)tsbinfo, 12986 0, NULL); 12987 as_pageunlock(&kas, ppl, slab_vaddr, PAGESIZE, S_WRITE); 12988 } 12989 12990 if (kmem_cachep != NULL) { 12991 kmem_cache_free(kmem_cachep, tsbva); 12992 } else { 12993 vmem_xfree(vmp, (void *)tsbva, tsb_size); 12994 } 12995 tsbinfo->tsb_va = (caddr_t)0xbad00bad; 12996 atomic_add_64(&tsb_alloc_bytes, -(int64_t)tsb_size); 12997 } 12998 12999 static void 13000 sfmmu_tsbinfo_free(struct tsb_info *tsbinfo) 13001 { 13002 if ((tsbinfo->tsb_flags & TSB_SWAPPED) == 0) { 13003 sfmmu_tsb_free(tsbinfo); 13004 } 13005 kmem_cache_free(sfmmu_tsbinfo_cache, tsbinfo); 13006 13007 } 13008 13009 /* 13010 * Setup all the references to physical memory for this tsbinfo. 13011 * The underlying page(s) must be locked. 13012 */ 13013 static void 13014 sfmmu_tsbinfo_setup_phys(struct tsb_info *tsbinfo, pfn_t pfn) 13015 { 13016 ASSERT(pfn != PFN_INVALID); 13017 ASSERT(pfn == va_to_pfn(tsbinfo->tsb_va)); 13018 13019 #ifndef sun4v 13020 if (tsbinfo->tsb_szc == 0) { 13021 sfmmu_memtte(&tsbinfo->tsb_tte, pfn, 13022 PROT_WRITE|PROT_READ, TTE8K); 13023 } else { 13024 /* 13025 * Round down PA and use a large mapping; the handlers will 13026 * compute the TSB pointer at the correct offset into the 13027 * big virtual page. NOTE: this assumes all TSBs larger 13028 * than 8K must come from physically contiguous slabs of 13029 * size tsb_slab_size. 
13030 */ 13031 sfmmu_memtte(&tsbinfo->tsb_tte, pfn & ~tsb_slab_mask, 13032 PROT_WRITE|PROT_READ, tsb_slab_ttesz); 13033 } 13034 tsbinfo->tsb_pa = ptob(pfn); 13035 13036 TTE_SET_LOCKED(&tsbinfo->tsb_tte); /* lock the tte into dtlb */ 13037 TTE_SET_MOD(&tsbinfo->tsb_tte); /* enable writes */ 13038 13039 ASSERT(TTE_IS_PRIVILEGED(&tsbinfo->tsb_tte)); 13040 ASSERT(TTE_IS_LOCKED(&tsbinfo->tsb_tte)); 13041 #else /* sun4v */ 13042 tsbinfo->tsb_pa = ptob(pfn); 13043 #endif /* sun4v */ 13044 } 13045 13046 13047 /* 13048 * Returns zero on success, ENOMEM if over the high water mark, 13049 * or EAGAIN if the caller needs to retry with a smaller TSB 13050 * size (or specify TSB_FORCEALLOC if the allocation can't fail). 13051 * 13052 * This call cannot fail to allocate a TSB if TSB_FORCEALLOC 13053 * is specified and the TSB requested is PAGESIZE, though it 13054 * may sleep waiting for memory if sufficient memory is not 13055 * available. 13056 */ 13057 static int 13058 sfmmu_init_tsbinfo(struct tsb_info *tsbinfo, int tteszmask, 13059 int tsbcode, uint_t flags, sfmmu_t *sfmmup) 13060 { 13061 caddr_t vaddr = NULL; 13062 caddr_t slab_vaddr; 13063 uintptr_t slab_mask; 13064 int tsbbytes = TSB_BYTES(tsbcode); 13065 int lowmem = 0; 13066 struct kmem_cache *kmem_cachep = NULL; 13067 vmem_t *vmp = NULL; 13068 lgrp_id_t lgrpid = LGRP_NONE; 13069 pfn_t pfn; 13070 uint_t cbflags = HAC_SLEEP; 13071 page_t **pplist; 13072 int ret; 13073 13074 ASSERT(tsbbytes <= MMU_PAGESIZE4M || use_bigtsb_arena); 13075 if (tsbbytes > MMU_PAGESIZE4M) 13076 slab_mask = ~((uintptr_t)bigtsb_slab_mask) << PAGESHIFT; 13077 else 13078 slab_mask = ~((uintptr_t)tsb_slab_mask) << PAGESHIFT; 13079 13080 if (flags & (TSB_FORCEALLOC | TSB_SWAPIN | TSB_GROW | TSB_SHRINK)) 13081 flags |= TSB_ALLOC; 13082 13083 ASSERT((flags & TSB_FORCEALLOC) == 0 || tsbcode == TSB_MIN_SZCODE); 13084 13085 tsbinfo->tsb_sfmmu = sfmmup; 13086 13087 /* 13088 * If not allocating a TSB, set up the tsbinfo, set TSB_SWAPPED, and 13089 * return. 
13090 */ 13091 if ((flags & TSB_ALLOC) == 0) { 13092 tsbinfo->tsb_szc = tsbcode; 13093 tsbinfo->tsb_ttesz_mask = tteszmask; 13094 tsbinfo->tsb_va = (caddr_t)0xbadbadbeef; 13095 tsbinfo->tsb_pa = -1; 13096 tsbinfo->tsb_tte.ll = 0; 13097 tsbinfo->tsb_next = NULL; 13098 tsbinfo->tsb_flags = TSB_SWAPPED; 13099 tsbinfo->tsb_cache = NULL; 13100 tsbinfo->tsb_vmp = NULL; 13101 return (0); 13102 } 13103 13104 #ifdef DEBUG 13105 /* 13106 * For debugging: 13107 * Randomly force allocation failures every tsb_alloc_mtbf 13108 * tries if TSB_FORCEALLOC is not specified. This will 13109 * return ENOMEM if tsb_alloc_mtbf is odd, or EAGAIN if 13110 * it is even, to allow testing of both failure paths... 13111 */ 13112 if (tsb_alloc_mtbf && ((flags & TSB_FORCEALLOC) == 0) && 13113 (tsb_alloc_count++ == tsb_alloc_mtbf)) { 13114 tsb_alloc_count = 0; 13115 tsb_alloc_fail_mtbf++; 13116 return ((tsb_alloc_mtbf & 1)? ENOMEM : EAGAIN); 13117 } 13118 #endif /* DEBUG */ 13119 13120 /* 13121 * Enforce high water mark if we are not doing a forced allocation 13122 * and are not shrinking a process' TSB. 13123 */ 13124 if ((flags & TSB_SHRINK) == 0 && 13125 (tsbbytes + tsb_alloc_bytes) > tsb_alloc_hiwater) { 13126 if ((flags & TSB_FORCEALLOC) == 0) 13127 return (ENOMEM); 13128 lowmem = 1; 13129 } 13130 13131 /* 13132 * Allocate from the correct location based upon the size of the TSB 13133 * compared to the base page size, and what memory conditions dictate. 13134 * Note we always do nonblocking allocations from the TSB arena since 13135 * we don't want memory fragmentation to cause processes to block 13136 * indefinitely waiting for memory; until the kernel algorithms that 13137 * coalesce large pages are improved this is our best option. 
13138 * 13139 * Algorithm: 13140 * If allocating a "large" TSB (>8K), allocate from the 13141 * appropriate kmem_tsb_default_arena vmem arena 13142 * else if low on memory or the TSB_FORCEALLOC flag is set or 13143 * tsb_forceheap is set 13144 * Allocate from kernel heap via sfmmu_tsb8k_cache with 13145 * KM_SLEEP (never fails) 13146 * else 13147 * Allocate from appropriate sfmmu_tsb_cache with 13148 * KM_NOSLEEP 13149 * endif 13150 */ 13151 if (tsb_lgrp_affinity) 13152 lgrpid = lgrp_home_id(curthread); 13153 if (lgrpid == LGRP_NONE) 13154 lgrpid = 0; /* use lgrp of boot CPU */ 13155 13156 if (tsbbytes > MMU_PAGESIZE) { 13157 if (tsbbytes > MMU_PAGESIZE4M) { 13158 vmp = kmem_bigtsb_default_arena[lgrpid]; 13159 vaddr = (caddr_t)vmem_xalloc(vmp, tsbbytes, tsbbytes, 13160 0, 0, NULL, NULL, VM_NOSLEEP); 13161 } else { 13162 vmp = kmem_tsb_default_arena[lgrpid]; 13163 vaddr = (caddr_t)vmem_xalloc(vmp, tsbbytes, tsbbytes, 13164 0, 0, NULL, NULL, VM_NOSLEEP); 13165 } 13166 #ifdef DEBUG 13167 } else if (lowmem || (flags & TSB_FORCEALLOC) || tsb_forceheap) { 13168 #else /* !DEBUG */ 13169 } else if (lowmem || (flags & TSB_FORCEALLOC)) { 13170 #endif /* DEBUG */ 13171 kmem_cachep = sfmmu_tsb8k_cache; 13172 vaddr = (caddr_t)kmem_cache_alloc(kmem_cachep, KM_SLEEP); 13173 ASSERT(vaddr != NULL); 13174 } else { 13175 kmem_cachep = sfmmu_tsb_cache[lgrpid]; 13176 vaddr = (caddr_t)kmem_cache_alloc(kmem_cachep, KM_NOSLEEP); 13177 } 13178 13179 tsbinfo->tsb_cache = kmem_cachep; 13180 tsbinfo->tsb_vmp = vmp; 13181 13182 if (vaddr == NULL) { 13183 return (EAGAIN); 13184 } 13185 13186 atomic_add_64(&tsb_alloc_bytes, (int64_t)tsbbytes); 13187 kmem_cachep = tsbinfo->tsb_cache; 13188 13189 /* 13190 * If we are allocating from outside the cage, then we need to 13191 * register a relocation callback handler. Note that for now 13192 * since pseudo mappings always hang off of the slab's root page, 13193 * we need only lock the first 8K of the TSB slab. 
This is a bit 13194 * hacky but it is good for performance. 13195 */ 13196 if (kmem_cachep != sfmmu_tsb8k_cache) { 13197 slab_vaddr = (caddr_t)((uintptr_t)vaddr & slab_mask); 13198 ret = as_pagelock(&kas, &pplist, slab_vaddr, PAGESIZE, S_WRITE); 13199 ASSERT(ret == 0); 13200 ret = hat_add_callback(sfmmu_tsb_cb_id, vaddr, (uint_t)tsbbytes, 13201 cbflags, (void *)tsbinfo, &pfn, NULL); 13202 13203 /* 13204 * Need to free up resources if we could not successfully 13205 * add the callback function and return an error condition. 13206 */ 13207 if (ret != 0) { 13208 if (kmem_cachep) { 13209 kmem_cache_free(kmem_cachep, vaddr); 13210 } else { 13211 vmem_xfree(vmp, (void *)vaddr, tsbbytes); 13212 } 13213 as_pageunlock(&kas, pplist, slab_vaddr, PAGESIZE, 13214 S_WRITE); 13215 return (EAGAIN); 13216 } 13217 } else { 13218 /* 13219 * Since allocation of 8K TSBs from heap is rare and occurs 13220 * during memory pressure we allocate them from permanent 13221 * memory rather than using callbacks to get the PFN. 13222 */ 13223 pfn = hat_getpfnum(kas.a_hat, vaddr); 13224 } 13225 13226 tsbinfo->tsb_va = vaddr; 13227 tsbinfo->tsb_szc = tsbcode; 13228 tsbinfo->tsb_ttesz_mask = tteszmask; 13229 tsbinfo->tsb_next = NULL; 13230 tsbinfo->tsb_flags = 0; 13231 13232 sfmmu_tsbinfo_setup_phys(tsbinfo, pfn); 13233 13234 sfmmu_inv_tsb(vaddr, tsbbytes); 13235 13236 if (kmem_cachep != sfmmu_tsb8k_cache) { 13237 as_pageunlock(&kas, pplist, slab_vaddr, PAGESIZE, S_WRITE); 13238 } 13239 13240 return (0); 13241 } 13242 13243 /* 13244 * Initialize per cpu tsb and per cpu tsbmiss_area 13245 */ 13246 void 13247 sfmmu_init_tsbs(void) 13248 { 13249 int i; 13250 struct tsbmiss *tsbmissp; 13251 struct kpmtsbm *kpmtsbmp; 13252 #ifndef sun4v 13253 extern int dcache_line_mask; 13254 #endif /* sun4v */ 13255 extern uint_t vac_colors; 13256 13257 /* 13258 * Init. tsb miss area. 
13259 */ 13260 tsbmissp = tsbmiss_area; 13261 13262 for (i = 0; i < NCPU; tsbmissp++, i++) { 13263 /* 13264 * initialize the tsbmiss area. 13265 * Do this for all possible CPUs as some may be added 13266 * while the system is running. There is no cost to this. 13267 */ 13268 tsbmissp->ksfmmup = ksfmmup; 13269 #ifndef sun4v 13270 tsbmissp->dcache_line_mask = (uint16_t)dcache_line_mask; 13271 #endif /* sun4v */ 13272 tsbmissp->khashstart = 13273 (struct hmehash_bucket *)va_to_pa((caddr_t)khme_hash); 13274 tsbmissp->uhashstart = 13275 (struct hmehash_bucket *)va_to_pa((caddr_t)uhme_hash); 13276 tsbmissp->khashsz = khmehash_num; 13277 tsbmissp->uhashsz = uhmehash_num; 13278 } 13279 13280 sfmmu_tsb_cb_id = hat_register_callback('T'<<16 | 'S' << 8 | 'B', 13281 sfmmu_tsb_pre_relocator, sfmmu_tsb_post_relocator, NULL, 0); 13282 13283 if (kpm_enable == 0) 13284 return; 13285 13286 /* -- Begin KPM specific init -- */ 13287 13288 if (kpm_smallpages) { 13289 /* 13290 * If we're using base pagesize pages for seg_kpm 13291 * mappings, we use the kernel TSB since we can't afford 13292 * to allocate a second huge TSB for these mappings. 13293 */ 13294 kpm_tsbbase = ktsb_phys? ktsb_pbase : (uint64_t)ktsb_base; 13295 kpm_tsbsz = ktsb_szcode; 13296 kpmsm_tsbbase = kpm_tsbbase; 13297 kpmsm_tsbsz = kpm_tsbsz; 13298 } else { 13299 /* 13300 * In VAC conflict case, just put the entries in the 13301 * kernel 8K indexed TSB for now so we can find them. 13302 * This could really be changed in the future if we feel 13303 * the need... 13304 */ 13305 kpmsm_tsbbase = ktsb_phys? ktsb_pbase : (uint64_t)ktsb_base; 13306 kpmsm_tsbsz = ktsb_szcode; 13307 kpm_tsbbase = ktsb_phys? ktsb4m_pbase : (uint64_t)ktsb4m_base; 13308 kpm_tsbsz = ktsb4m_szcode; 13309 } 13310 13311 kpmtsbmp = kpmtsbm_area; 13312 for (i = 0; i < NCPU; kpmtsbmp++, i++) { 13313 /* 13314 * Initialize the kpmtsbm area. 13315 * Do this for all possible CPUs as some may be added 13316 * while the system is running. 
There is no cost to this. 13317 */ 13318 kpmtsbmp->vbase = kpm_vbase; 13319 kpmtsbmp->vend = kpm_vbase + kpm_size * vac_colors; 13320 kpmtsbmp->sz_shift = kpm_size_shift; 13321 kpmtsbmp->kpmp_shift = kpmp_shift; 13322 kpmtsbmp->kpmp2pshft = (uchar_t)kpmp2pshft; 13323 if (kpm_smallpages == 0) { 13324 kpmtsbmp->kpmp_table_sz = kpmp_table_sz; 13325 kpmtsbmp->kpmp_tablepa = va_to_pa(kpmp_table); 13326 } else { 13327 kpmtsbmp->kpmp_table_sz = kpmp_stable_sz; 13328 kpmtsbmp->kpmp_tablepa = va_to_pa(kpmp_stable); 13329 } 13330 kpmtsbmp->msegphashpa = va_to_pa(memseg_phash); 13331 kpmtsbmp->flags = KPMTSBM_ENABLE_FLAG; 13332 #ifdef DEBUG 13333 kpmtsbmp->flags |= (kpm_tsbmtl) ? KPMTSBM_TLTSBM_FLAG : 0; 13334 #endif /* DEBUG */ 13335 if (ktsb_phys) 13336 kpmtsbmp->flags |= KPMTSBM_TSBPHYS_FLAG; 13337 } 13338 13339 /* -- End KPM specific init -- */ 13340 } 13341 13342 /* Avoid using sfmmu_tsbinfo_alloc() to avoid kmem_alloc - no real reason */ 13343 struct tsb_info ktsb_info[2]; 13344 13345 /* 13346 * Called from hat_kern_setup() to setup the tsb_info for ksfmmup. 13347 */ 13348 void 13349 sfmmu_init_ktsbinfo() 13350 { 13351 ASSERT(ksfmmup != NULL); 13352 ASSERT(ksfmmup->sfmmu_tsb == NULL); 13353 /* 13354 * Allocate tsbinfos for kernel and copy in data 13355 * to make debug easier and sun4v setup easier. 
13356 */ 13357 ktsb_info[0].tsb_sfmmu = ksfmmup; 13358 ktsb_info[0].tsb_szc = ktsb_szcode; 13359 ktsb_info[0].tsb_ttesz_mask = TSB8K|TSB64K|TSB512K; 13360 ktsb_info[0].tsb_va = ktsb_base; 13361 ktsb_info[0].tsb_pa = ktsb_pbase; 13362 ktsb_info[0].tsb_flags = 0; 13363 ktsb_info[0].tsb_tte.ll = 0; 13364 ktsb_info[0].tsb_cache = NULL; 13365 13366 ktsb_info[1].tsb_sfmmu = ksfmmup; 13367 ktsb_info[1].tsb_szc = ktsb4m_szcode; 13368 ktsb_info[1].tsb_ttesz_mask = TSB4M; 13369 ktsb_info[1].tsb_va = ktsb4m_base; 13370 ktsb_info[1].tsb_pa = ktsb4m_pbase; 13371 ktsb_info[1].tsb_flags = 0; 13372 ktsb_info[1].tsb_tte.ll = 0; 13373 ktsb_info[1].tsb_cache = NULL; 13374 13375 /* Link them into ksfmmup. */ 13376 ktsb_info[0].tsb_next = &ktsb_info[1]; 13377 ktsb_info[1].tsb_next = NULL; 13378 ksfmmup->sfmmu_tsb = &ktsb_info[0]; 13379 13380 sfmmu_setup_tsbinfo(ksfmmup); 13381 } 13382 13383 /* 13384 * Cache the last value returned from va_to_pa(). If the VA specified 13385 * in the current call to cached_va_to_pa() maps to the same Page (as the 13386 * previous call to cached_va_to_pa()), then compute the PA using 13387 * cached info, else call va_to_pa(). 13388 * 13389 * Note: this function is neither MT-safe nor consistent in the presence 13390 * of multiple, interleaved threads. This function was created to enable 13391 * an optimization used during boot (at a point when there's only one thread 13392 * executing on the "boot CPU", and before startup_vm() has been called). 13393 */ 13394 static uint64_t 13395 cached_va_to_pa(void *vaddr) 13396 { 13397 static uint64_t prev_vaddr_base = 0; 13398 static uint64_t prev_pfn = 0; 13399 13400 if ((((uint64_t)vaddr) & MMU_PAGEMASK) == prev_vaddr_base) { 13401 return (prev_pfn | ((uint64_t)vaddr & MMU_PAGEOFFSET)); 13402 } else { 13403 uint64_t pa = va_to_pa(vaddr); 13404 13405 if (pa != ((uint64_t)-1)) { 13406 /* 13407 * Computed physical address is valid. Cache its 13408 * related info for the next cached_va_to_pa() call. 
13409 */ 13410 prev_pfn = pa & MMU_PAGEMASK; 13411 prev_vaddr_base = ((uint64_t)vaddr) & MMU_PAGEMASK; 13412 } 13413 13414 return (pa); 13415 } 13416 } 13417 13418 /* 13419 * Carve up our nucleus hblk region. We may allocate more hblks than 13420 * asked due to rounding errors but we are guaranteed to have at least 13421 * enough space to allocate the requested number of hblk8's and hblk1's. 13422 */ 13423 void 13424 sfmmu_init_nucleus_hblks(caddr_t addr, size_t size, int nhblk8, int nhblk1) 13425 { 13426 struct hme_blk *hmeblkp; 13427 size_t hme8blk_sz, hme1blk_sz; 13428 size_t i; 13429 size_t hblk8_bound; 13430 ulong_t j = 0, k = 0; 13431 13432 ASSERT(addr != NULL && size != 0); 13433 13434 /* Need to use proper structure alignment */ 13435 hme8blk_sz = roundup(HME8BLK_SZ, sizeof (int64_t)); 13436 hme1blk_sz = roundup(HME1BLK_SZ, sizeof (int64_t)); 13437 13438 nucleus_hblk8.list = (void *)addr; 13439 nucleus_hblk8.index = 0; 13440 13441 /* 13442 * Use as much memory as possible for hblk8's since we 13443 * expect all bop_alloc'ed memory to be allocated in 8k chunks. 13444 * We need to hold back enough space for the hblk1's which 13445 * we'll allocate next. 
13446 */ 13447 hblk8_bound = size - (nhblk1 * hme1blk_sz) - hme8blk_sz; 13448 for (i = 0; i <= hblk8_bound; i += hme8blk_sz, j++) { 13449 hmeblkp = (struct hme_blk *)addr; 13450 addr += hme8blk_sz; 13451 hmeblkp->hblk_nuc_bit = 1; 13452 hmeblkp->hblk_nextpa = cached_va_to_pa((caddr_t)hmeblkp); 13453 } 13454 nucleus_hblk8.len = j; 13455 ASSERT(j >= nhblk8); 13456 SFMMU_STAT_ADD(sf_hblk8_ncreate, j); 13457 13458 nucleus_hblk1.list = (void *)addr; 13459 nucleus_hblk1.index = 0; 13460 for (; i <= (size - hme1blk_sz); i += hme1blk_sz, k++) { 13461 hmeblkp = (struct hme_blk *)addr; 13462 addr += hme1blk_sz; 13463 hmeblkp->hblk_nuc_bit = 1; 13464 hmeblkp->hblk_nextpa = cached_va_to_pa((caddr_t)hmeblkp); 13465 } 13466 ASSERT(k >= nhblk1); 13467 nucleus_hblk1.len = k; 13468 SFMMU_STAT_ADD(sf_hblk1_ncreate, k); 13469 } 13470 13471 /* 13472 * This function is currently not supported on this platform. For what 13473 * it's supposed to do, see hat.c and hat_srmmu.c 13474 */ 13475 /* ARGSUSED */ 13476 faultcode_t 13477 hat_softlock(struct hat *hat, caddr_t addr, size_t *lenp, page_t **ppp, 13478 uint_t flags) 13479 { 13480 ASSERT(hat->sfmmu_xhat_provider == NULL); 13481 return (FC_NOSUPPORT); 13482 } 13483 13484 /* 13485 * Searchs the mapping list of the page for a mapping of the same size. If not 13486 * found the corresponding bit is cleared in the p_index field. When large 13487 * pages are more prevalent in the system, we can maintain the mapping list 13488 * in order and we don't have to traverse the list each time. Just check the 13489 * next and prev entries, and if both are of different size, we clear the bit. 
13490 */ 13491 static void 13492 sfmmu_rm_large_mappings(page_t *pp, int ttesz) 13493 { 13494 struct sf_hment *sfhmep; 13495 struct hme_blk *hmeblkp; 13496 int index; 13497 pgcnt_t npgs; 13498 13499 ASSERT(ttesz > TTE8K); 13500 13501 ASSERT(sfmmu_mlist_held(pp)); 13502 13503 ASSERT(PP_ISMAPPED_LARGE(pp)); 13504 13505 /* 13506 * Traverse mapping list looking for another mapping of same size. 13507 * since we only want to clear index field if all mappings of 13508 * that size are gone. 13509 */ 13510 13511 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) { 13512 if (IS_PAHME(sfhmep)) 13513 continue; 13514 hmeblkp = sfmmu_hmetohblk(sfhmep); 13515 if (hmeblkp->hblk_xhat_bit) 13516 continue; 13517 if (hme_size(sfhmep) == ttesz) { 13518 /* 13519 * another mapping of the same size. don't clear index. 13520 */ 13521 return; 13522 } 13523 } 13524 13525 /* 13526 * Clear the p_index bit for large page. 13527 */ 13528 index = PAGESZ_TO_INDEX(ttesz); 13529 npgs = TTEPAGES(ttesz); 13530 while (npgs-- > 0) { 13531 ASSERT(pp->p_index & index); 13532 pp->p_index &= ~index; 13533 pp = PP_PAGENEXT(pp); 13534 } 13535 } 13536 13537 /* 13538 * return supported features 13539 */ 13540 /* ARGSUSED */ 13541 int 13542 hat_supported(enum hat_features feature, void *arg) 13543 { 13544 switch (feature) { 13545 case HAT_SHARED_PT: 13546 case HAT_DYNAMIC_ISM_UNMAP: 13547 case HAT_VMODSORT: 13548 return (1); 13549 case HAT_SHARED_REGIONS: 13550 if (shctx_on) 13551 return (1); 13552 else 13553 return (0); 13554 default: 13555 return (0); 13556 } 13557 } 13558 13559 void 13560 hat_enter(struct hat *hat) 13561 { 13562 hatlock_t *hatlockp; 13563 13564 if (hat != ksfmmup) { 13565 hatlockp = TSB_HASH(hat); 13566 mutex_enter(HATLOCK_MUTEXP(hatlockp)); 13567 } 13568 } 13569 13570 void 13571 hat_exit(struct hat *hat) 13572 { 13573 hatlock_t *hatlockp; 13574 13575 if (hat != ksfmmup) { 13576 hatlockp = TSB_HASH(hat); 13577 mutex_exit(HATLOCK_MUTEXP(hatlockp)); 13578 } 13579 } 13580 13581 
/*ARGSUSED*/ 13582 void 13583 hat_reserve(struct as *as, caddr_t addr, size_t len) 13584 { 13585 } 13586 13587 static void 13588 hat_kstat_init(void) 13589 { 13590 kstat_t *ksp; 13591 13592 ksp = kstat_create("unix", 0, "sfmmu_global_stat", "hat", 13593 KSTAT_TYPE_RAW, sizeof (struct sfmmu_global_stat), 13594 KSTAT_FLAG_VIRTUAL); 13595 if (ksp) { 13596 ksp->ks_data = (void *) &sfmmu_global_stat; 13597 kstat_install(ksp); 13598 } 13599 ksp = kstat_create("unix", 0, "sfmmu_tsbsize_stat", "hat", 13600 KSTAT_TYPE_RAW, sizeof (struct sfmmu_tsbsize_stat), 13601 KSTAT_FLAG_VIRTUAL); 13602 if (ksp) { 13603 ksp->ks_data = (void *) &sfmmu_tsbsize_stat; 13604 kstat_install(ksp); 13605 } 13606 ksp = kstat_create("unix", 0, "sfmmu_percpu_stat", "hat", 13607 KSTAT_TYPE_RAW, sizeof (struct sfmmu_percpu_stat) * NCPU, 13608 KSTAT_FLAG_WRITABLE); 13609 if (ksp) { 13610 ksp->ks_update = sfmmu_kstat_percpu_update; 13611 kstat_install(ksp); 13612 } 13613 } 13614 13615 /* ARGSUSED */ 13616 static int 13617 sfmmu_kstat_percpu_update(kstat_t *ksp, int rw) 13618 { 13619 struct sfmmu_percpu_stat *cpu_kstat = ksp->ks_data; 13620 struct tsbmiss *tsbm = tsbmiss_area; 13621 struct kpmtsbm *kpmtsbm = kpmtsbm_area; 13622 int i; 13623 13624 ASSERT(cpu_kstat); 13625 if (rw == KSTAT_READ) { 13626 for (i = 0; i < NCPU; cpu_kstat++, tsbm++, kpmtsbm++, i++) { 13627 cpu_kstat->sf_itlb_misses = 0; 13628 cpu_kstat->sf_dtlb_misses = 0; 13629 cpu_kstat->sf_utsb_misses = tsbm->utsb_misses - 13630 tsbm->uprot_traps; 13631 cpu_kstat->sf_ktsb_misses = tsbm->ktsb_misses + 13632 kpmtsbm->kpm_tsb_misses - tsbm->kprot_traps; 13633 cpu_kstat->sf_tsb_hits = 0; 13634 cpu_kstat->sf_umod_faults = tsbm->uprot_traps; 13635 cpu_kstat->sf_kmod_faults = tsbm->kprot_traps; 13636 } 13637 } else { 13638 /* KSTAT_WRITE is used to clear stats */ 13639 for (i = 0; i < NCPU; tsbm++, kpmtsbm++, i++) { 13640 tsbm->utsb_misses = 0; 13641 tsbm->ktsb_misses = 0; 13642 tsbm->uprot_traps = 0; 13643 tsbm->kprot_traps = 0; 13644 
kpmtsbm->kpm_dtlb_misses = 0; 13645 kpmtsbm->kpm_tsb_misses = 0; 13646 } 13647 } 13648 return (0); 13649 } 13650 13651 #ifdef DEBUG 13652 13653 tte_t *gorig[NCPU], *gcur[NCPU], *gnew[NCPU]; 13654 13655 /* 13656 * A tte checker. *orig_old is the value we read before cas. 13657 * *cur is the value returned by cas. 13658 * *new is the desired value when we do the cas. 13659 * 13660 * *hmeblkp is currently unused. 13661 */ 13662 13663 /* ARGSUSED */ 13664 void 13665 chk_tte(tte_t *orig_old, tte_t *cur, tte_t *new, struct hme_blk *hmeblkp) 13666 { 13667 pfn_t i, j, k; 13668 int cpuid = CPU->cpu_id; 13669 13670 gorig[cpuid] = orig_old; 13671 gcur[cpuid] = cur; 13672 gnew[cpuid] = new; 13673 13674 #ifdef lint 13675 hmeblkp = hmeblkp; 13676 #endif 13677 13678 if (TTE_IS_VALID(orig_old)) { 13679 if (TTE_IS_VALID(cur)) { 13680 i = TTE_TO_TTEPFN(orig_old); 13681 j = TTE_TO_TTEPFN(cur); 13682 k = TTE_TO_TTEPFN(new); 13683 if (i != j) { 13684 /* remap error? */ 13685 panic("chk_tte: bad pfn, 0x%lx, 0x%lx", i, j); 13686 } 13687 13688 if (i != k) { 13689 /* remap error? */ 13690 panic("chk_tte: bad pfn2, 0x%lx, 0x%lx", i, k); 13691 } 13692 } else { 13693 if (TTE_IS_VALID(new)) { 13694 panic("chk_tte: invalid cur? 
"); 13695 } 13696 13697 i = TTE_TO_TTEPFN(orig_old); 13698 k = TTE_TO_TTEPFN(new); 13699 if (i != k) { 13700 panic("chk_tte: bad pfn3, 0x%lx, 0x%lx", i, k); 13701 } 13702 } 13703 } else { 13704 if (TTE_IS_VALID(cur)) { 13705 j = TTE_TO_TTEPFN(cur); 13706 if (TTE_IS_VALID(new)) { 13707 k = TTE_TO_TTEPFN(new); 13708 if (j != k) { 13709 panic("chk_tte: bad pfn4, 0x%lx, 0x%lx", 13710 j, k); 13711 } 13712 } else { 13713 panic("chk_tte: why here?"); 13714 } 13715 } else { 13716 if (!TTE_IS_VALID(new)) { 13717 panic("chk_tte: why here2 ?"); 13718 } 13719 } 13720 } 13721 } 13722 13723 #endif /* DEBUG */ 13724 13725 extern void prefetch_tsbe_read(struct tsbe *); 13726 extern void prefetch_tsbe_write(struct tsbe *); 13727 13728 13729 /* 13730 * We want to prefetch 7 cache lines ahead for our read prefetch. This gives 13731 * us optimal performance on Cheetah+. You can only have 8 outstanding 13732 * prefetches at any one time, so we opted for 7 read prefetches and 1 write 13733 * prefetch to make the most utilization of the prefetch capability. 13734 */ 13735 #define TSBE_PREFETCH_STRIDE (7) 13736 13737 void 13738 sfmmu_copy_tsb(struct tsb_info *old_tsbinfo, struct tsb_info *new_tsbinfo) 13739 { 13740 int old_bytes = TSB_BYTES(old_tsbinfo->tsb_szc); 13741 int new_bytes = TSB_BYTES(new_tsbinfo->tsb_szc); 13742 int old_entries = TSB_ENTRIES(old_tsbinfo->tsb_szc); 13743 int new_entries = TSB_ENTRIES(new_tsbinfo->tsb_szc); 13744 struct tsbe *old; 13745 struct tsbe *new; 13746 struct tsbe *new_base = (struct tsbe *)new_tsbinfo->tsb_va; 13747 uint64_t va; 13748 int new_offset; 13749 int i; 13750 int vpshift; 13751 int last_prefetch; 13752 13753 if (old_bytes == new_bytes) { 13754 bcopy(old_tsbinfo->tsb_va, new_tsbinfo->tsb_va, new_bytes); 13755 } else { 13756 13757 /* 13758 * A TSBE is 16 bytes which means there are four TSBE's per 13759 * P$ line (64 bytes), thus every 4 TSBE's we prefetch. 
13760 */ 13761 old = (struct tsbe *)old_tsbinfo->tsb_va; 13762 last_prefetch = old_entries - (4*(TSBE_PREFETCH_STRIDE+1)); 13763 for (i = 0; i < old_entries; i++, old++) { 13764 if (((i & (4-1)) == 0) && (i < last_prefetch)) 13765 prefetch_tsbe_read(old); 13766 if (!old->tte_tag.tag_invalid) { 13767 /* 13768 * We have a valid TTE to remap. Check the 13769 * size. We won't remap 64K or 512K TTEs 13770 * because they span more than one TSB entry 13771 * and are indexed using an 8K virt. page. 13772 * Ditto for 32M and 256M TTEs. 13773 */ 13774 if (TTE_CSZ(&old->tte_data) == TTE64K || 13775 TTE_CSZ(&old->tte_data) == TTE512K) 13776 continue; 13777 if (mmu_page_sizes == max_mmu_page_sizes) { 13778 if (TTE_CSZ(&old->tte_data) == TTE32M || 13779 TTE_CSZ(&old->tte_data) == TTE256M) 13780 continue; 13781 } 13782 13783 /* clear the lower 22 bits of the va */ 13784 va = *(uint64_t *)old << 22; 13785 /* turn va into a virtual pfn */ 13786 va >>= 22 - TSB_START_SIZE; 13787 /* 13788 * or in bits from the offset in the tsb 13789 * to get the real virtual pfn. These 13790 * correspond to bits [21:13] in the va 13791 */ 13792 vpshift = 13793 TTE_BSZS_SHIFT(TTE_CSZ(&old->tte_data)) & 13794 0x1ff; 13795 va |= (i << vpshift); 13796 va >>= vpshift; 13797 new_offset = va & (new_entries - 1); 13798 new = new_base + new_offset; 13799 prefetch_tsbe_write(new); 13800 *new = *old; 13801 } 13802 } 13803 } 13804 } 13805 13806 /* 13807 * unused in sfmmu 13808 */ 13809 void 13810 hat_dump(void) 13811 { 13812 } 13813 13814 /* 13815 * Called when a thread is exiting and we have switched to the kernel address 13816 * space. Perform the same VM initialization resume() uses when switching 13817 * processes. 13818 * 13819 * Note that sfmmu_load_mmustate() is currently a no-op for kernel threads, but 13820 * we call it anyway in case the semantics change in the future. 
13821 */ 13822 /*ARGSUSED*/ 13823 void 13824 hat_thread_exit(kthread_t *thd) 13825 { 13826 uint_t pgsz_cnum; 13827 uint_t pstate_save; 13828 13829 ASSERT(thd->t_procp->p_as == &kas); 13830 13831 pgsz_cnum = KCONTEXT; 13832 #ifdef sun4u 13833 pgsz_cnum |= (ksfmmup->sfmmu_cext << CTXREG_EXT_SHIFT); 13834 #endif 13835 13836 /* 13837 * Note that sfmmu_load_mmustate() is currently a no-op for 13838 * kernel threads. We need to disable interrupts here, 13839 * simply because otherwise sfmmu_load_mmustate() would panic 13840 * if the caller does not disable interrupts. 13841 */ 13842 pstate_save = sfmmu_disable_intrs(); 13843 13844 /* Compatibility Note: hw takes care of MMU_SCONTEXT1 */ 13845 sfmmu_setctx_sec(pgsz_cnum); 13846 sfmmu_load_mmustate(ksfmmup); 13847 sfmmu_enable_intrs(pstate_save); 13848 } 13849 13850 13851 /* 13852 * SRD support 13853 */ 13854 #define SRD_HASH_FUNCTION(vp) (((((uintptr_t)(vp)) >> 4) ^ \ 13855 (((uintptr_t)(vp)) >> 11)) & \ 13856 srd_hashmask) 13857 13858 /* 13859 * Attach the process to the srd struct associated with the exec vnode 13860 * from which the process is started. 
13861 */ 13862 void 13863 hat_join_srd(struct hat *sfmmup, vnode_t *evp) 13864 { 13865 uint_t hash = SRD_HASH_FUNCTION(evp); 13866 sf_srd_t *srdp; 13867 sf_srd_t *newsrdp; 13868 13869 ASSERT(sfmmup != ksfmmup); 13870 ASSERT(sfmmup->sfmmu_srdp == NULL); 13871 13872 if (!shctx_on) { 13873 return; 13874 } 13875 13876 VN_HOLD(evp); 13877 13878 if (srd_buckets[hash].srdb_srdp != NULL) { 13879 mutex_enter(&srd_buckets[hash].srdb_lock); 13880 for (srdp = srd_buckets[hash].srdb_srdp; srdp != NULL; 13881 srdp = srdp->srd_hash) { 13882 if (srdp->srd_evp == evp) { 13883 ASSERT(srdp->srd_refcnt >= 0); 13884 sfmmup->sfmmu_srdp = srdp; 13885 atomic_add_32( 13886 (volatile uint_t *)&srdp->srd_refcnt, 1); 13887 mutex_exit(&srd_buckets[hash].srdb_lock); 13888 return; 13889 } 13890 } 13891 mutex_exit(&srd_buckets[hash].srdb_lock); 13892 } 13893 newsrdp = kmem_cache_alloc(srd_cache, KM_SLEEP); 13894 ASSERT(newsrdp->srd_next_ismrid == 0 && newsrdp->srd_next_hmerid == 0); 13895 13896 newsrdp->srd_evp = evp; 13897 newsrdp->srd_refcnt = 1; 13898 newsrdp->srd_hmergnfree = NULL; 13899 newsrdp->srd_ismrgnfree = NULL; 13900 13901 mutex_enter(&srd_buckets[hash].srdb_lock); 13902 for (srdp = srd_buckets[hash].srdb_srdp; srdp != NULL; 13903 srdp = srdp->srd_hash) { 13904 if (srdp->srd_evp == evp) { 13905 ASSERT(srdp->srd_refcnt >= 0); 13906 sfmmup->sfmmu_srdp = srdp; 13907 atomic_add_32((volatile uint_t *)&srdp->srd_refcnt, 1); 13908 mutex_exit(&srd_buckets[hash].srdb_lock); 13909 kmem_cache_free(srd_cache, newsrdp); 13910 return; 13911 } 13912 } 13913 newsrdp->srd_hash = srd_buckets[hash].srdb_srdp; 13914 srd_buckets[hash].srdb_srdp = newsrdp; 13915 sfmmup->sfmmu_srdp = newsrdp; 13916 13917 mutex_exit(&srd_buckets[hash].srdb_lock); 13918 13919 } 13920 13921 static void 13922 sfmmu_leave_srd(sfmmu_t *sfmmup) 13923 { 13924 vnode_t *evp; 13925 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 13926 uint_t hash; 13927 sf_srd_t **prev_srdpp; 13928 sf_region_t *rgnp; 13929 sf_region_t *nrgnp; 13930 #ifdef DEBUG 
13931 int rgns = 0; 13932 #endif 13933 int i; 13934 13935 ASSERT(sfmmup != ksfmmup); 13936 ASSERT(srdp != NULL); 13937 ASSERT(srdp->srd_refcnt > 0); 13938 ASSERT(sfmmup->sfmmu_scdp == NULL); 13939 ASSERT(sfmmup->sfmmu_free == 1); 13940 13941 sfmmup->sfmmu_srdp = NULL; 13942 evp = srdp->srd_evp; 13943 ASSERT(evp != NULL); 13944 if (atomic_add_32_nv( 13945 (volatile uint_t *)&srdp->srd_refcnt, -1)) { 13946 VN_RELE(evp); 13947 return; 13948 } 13949 13950 hash = SRD_HASH_FUNCTION(evp); 13951 mutex_enter(&srd_buckets[hash].srdb_lock); 13952 for (prev_srdpp = &srd_buckets[hash].srdb_srdp; 13953 (srdp = *prev_srdpp) != NULL; prev_srdpp = &srdp->srd_hash) { 13954 if (srdp->srd_evp == evp) { 13955 break; 13956 } 13957 } 13958 if (srdp == NULL || srdp->srd_refcnt) { 13959 mutex_exit(&srd_buckets[hash].srdb_lock); 13960 VN_RELE(evp); 13961 return; 13962 } 13963 *prev_srdpp = srdp->srd_hash; 13964 mutex_exit(&srd_buckets[hash].srdb_lock); 13965 13966 ASSERT(srdp->srd_refcnt == 0); 13967 VN_RELE(evp); 13968 13969 #ifdef DEBUG 13970 for (i = 0; i < SFMMU_MAX_REGION_BUCKETS; i++) { 13971 ASSERT(srdp->srd_rgnhash[i] == NULL); 13972 } 13973 #endif /* DEBUG */ 13974 13975 /* free each hme regions in the srd */ 13976 for (rgnp = srdp->srd_hmergnfree; rgnp != NULL; rgnp = nrgnp) { 13977 nrgnp = rgnp->rgn_next; 13978 ASSERT(rgnp->rgn_id < srdp->srd_next_hmerid); 13979 ASSERT(rgnp->rgn_refcnt == 0); 13980 ASSERT(rgnp->rgn_sfmmu_head == NULL); 13981 ASSERT(rgnp->rgn_flags & SFMMU_REGION_FREE); 13982 ASSERT(rgnp->rgn_hmeflags == 0); 13983 ASSERT(srdp->srd_hmergnp[rgnp->rgn_id] == rgnp); 13984 #ifdef DEBUG 13985 for (i = 0; i < MMU_PAGE_SIZES; i++) { 13986 ASSERT(rgnp->rgn_ttecnt[i] == 0); 13987 } 13988 rgns++; 13989 #endif /* DEBUG */ 13990 kmem_cache_free(region_cache, rgnp); 13991 } 13992 ASSERT(rgns == srdp->srd_next_hmerid); 13993 13994 #ifdef DEBUG 13995 rgns = 0; 13996 #endif 13997 /* free each ism rgns in the srd */ 13998 for (rgnp = srdp->srd_ismrgnfree; rgnp != NULL; rgnp = 
nrgnp) { 13999 nrgnp = rgnp->rgn_next; 14000 ASSERT(rgnp->rgn_id < srdp->srd_next_ismrid); 14001 ASSERT(rgnp->rgn_refcnt == 0); 14002 ASSERT(rgnp->rgn_sfmmu_head == NULL); 14003 ASSERT(rgnp->rgn_flags & SFMMU_REGION_FREE); 14004 ASSERT(srdp->srd_ismrgnp[rgnp->rgn_id] == rgnp); 14005 #ifdef DEBUG 14006 for (i = 0; i < MMU_PAGE_SIZES; i++) { 14007 ASSERT(rgnp->rgn_ttecnt[i] == 0); 14008 } 14009 rgns++; 14010 #endif /* DEBUG */ 14011 kmem_cache_free(region_cache, rgnp); 14012 } 14013 ASSERT(rgns == srdp->srd_next_ismrid); 14014 ASSERT(srdp->srd_ismbusyrgns == 0); 14015 ASSERT(srdp->srd_hmebusyrgns == 0); 14016 14017 srdp->srd_next_ismrid = 0; 14018 srdp->srd_next_hmerid = 0; 14019 14020 bzero((void *)srdp->srd_ismrgnp, 14021 sizeof (sf_region_t *) * SFMMU_MAX_ISM_REGIONS); 14022 bzero((void *)srdp->srd_hmergnp, 14023 sizeof (sf_region_t *) * SFMMU_MAX_HME_REGIONS); 14024 14025 ASSERT(srdp->srd_scdp == NULL); 14026 kmem_cache_free(srd_cache, srdp); 14027 } 14028 14029 /* ARGSUSED */ 14030 static int 14031 sfmmu_srdcache_constructor(void *buf, void *cdrarg, int kmflags) 14032 { 14033 sf_srd_t *srdp = (sf_srd_t *)buf; 14034 bzero(buf, sizeof (*srdp)); 14035 14036 mutex_init(&srdp->srd_mutex, NULL, MUTEX_DEFAULT, NULL); 14037 mutex_init(&srdp->srd_scd_mutex, NULL, MUTEX_DEFAULT, NULL); 14038 return (0); 14039 } 14040 14041 /* ARGSUSED */ 14042 static void 14043 sfmmu_srdcache_destructor(void *buf, void *cdrarg) 14044 { 14045 sf_srd_t *srdp = (sf_srd_t *)buf; 14046 14047 mutex_destroy(&srdp->srd_mutex); 14048 mutex_destroy(&srdp->srd_scd_mutex); 14049 } 14050 14051 /* 14052 * The caller makes sure hat_join_region()/hat_leave_region() can't be called 14053 * at the same time for the same process and address range. This is ensured by 14054 * the fact that address space is locked as writer when a process joins the 14055 * regions. Therefore there's no need to hold an srd lock during the entire 14056 * execution of hat_join_region()/hat_leave_region(). 
14057 */ 14058 14059 #define RGN_HASH_FUNCTION(obj) (((((uintptr_t)(obj)) >> 4) ^ \ 14060 (((uintptr_t)(obj)) >> 11)) & \ 14061 srd_rgn_hashmask) 14062 /* 14063 * This routine implements the shared context functionality required when 14064 * attaching a segment to an address space. It must be called from 14065 * hat_share() for D(ISM) segments and from segvn_create() for segments 14066 * with the MAP_PRIVATE and MAP_TEXT flags set. It returns a region_cookie 14067 * which is saved in the private segment data for hme segments and 14068 * the ism_map structure for ism segments. 14069 */ 14070 hat_region_cookie_t 14071 hat_join_region(struct hat *sfmmup, 14072 caddr_t r_saddr, 14073 size_t r_size, 14074 void *r_obj, 14075 u_offset_t r_objoff, 14076 uchar_t r_perm, 14077 uchar_t r_pgszc, 14078 hat_rgn_cb_func_t r_cb_function, 14079 uint_t flags) 14080 { 14081 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 14082 uint_t rhash; 14083 uint_t rid; 14084 hatlock_t *hatlockp; 14085 sf_region_t *rgnp; 14086 sf_region_t *new_rgnp = NULL; 14087 int i; 14088 uint16_t *nextidp; 14089 sf_region_t **freelistp; 14090 int maxids; 14091 sf_region_t **rarrp; 14092 uint16_t *busyrgnsp; 14093 ulong_t rttecnt; 14094 uchar_t tteflag; 14095 uchar_t r_type = flags & HAT_REGION_TYPE_MASK; 14096 int text = (r_type == HAT_REGION_TEXT); 14097 14098 if (srdp == NULL || r_size == 0) { 14099 return (HAT_INVALID_REGION_COOKIE); 14100 } 14101 14102 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 14103 ASSERT(sfmmup != ksfmmup); 14104 ASSERT(AS_WRITE_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 14105 ASSERT(srdp->srd_refcnt > 0); 14106 ASSERT(!(flags & ~HAT_REGION_TYPE_MASK)); 14107 ASSERT(flags == HAT_REGION_TEXT || flags == HAT_REGION_ISM); 14108 ASSERT(r_pgszc < mmu_page_sizes); 14109 if (!IS_P2ALIGNED(r_saddr, TTEBYTES(r_pgszc)) || 14110 !IS_P2ALIGNED(r_size, TTEBYTES(r_pgszc))) { 14111 panic("hat_join_region: region addr or size is not aligned\n"); 14112 } 14113 14114 14115 r_type = (r_type == 
HAT_REGION_ISM) ? SFMMU_REGION_ISM : 14116 SFMMU_REGION_HME; 14117 /* 14118 * Currently only support shared hmes for the read only main text 14119 * region. 14120 */ 14121 if (r_type == SFMMU_REGION_HME && ((r_obj != srdp->srd_evp) || 14122 (r_perm & PROT_WRITE))) { 14123 return (HAT_INVALID_REGION_COOKIE); 14124 } 14125 14126 rhash = RGN_HASH_FUNCTION(r_obj); 14127 14128 if (r_type == SFMMU_REGION_ISM) { 14129 nextidp = &srdp->srd_next_ismrid; 14130 freelistp = &srdp->srd_ismrgnfree; 14131 maxids = SFMMU_MAX_ISM_REGIONS; 14132 rarrp = srdp->srd_ismrgnp; 14133 busyrgnsp = &srdp->srd_ismbusyrgns; 14134 } else { 14135 nextidp = &srdp->srd_next_hmerid; 14136 freelistp = &srdp->srd_hmergnfree; 14137 maxids = SFMMU_MAX_HME_REGIONS; 14138 rarrp = srdp->srd_hmergnp; 14139 busyrgnsp = &srdp->srd_hmebusyrgns; 14140 } 14141 14142 mutex_enter(&srdp->srd_mutex); 14143 14144 for (rgnp = srdp->srd_rgnhash[rhash]; rgnp != NULL; 14145 rgnp = rgnp->rgn_hash) { 14146 if (rgnp->rgn_saddr == r_saddr && rgnp->rgn_size == r_size && 14147 rgnp->rgn_obj == r_obj && rgnp->rgn_objoff == r_objoff && 14148 rgnp->rgn_perm == r_perm && rgnp->rgn_pgszc == r_pgszc) { 14149 break; 14150 } 14151 } 14152 14153 rfound: 14154 if (rgnp != NULL) { 14155 ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == r_type); 14156 ASSERT(rgnp->rgn_cb_function == r_cb_function); 14157 ASSERT(rgnp->rgn_refcnt >= 0); 14158 rid = rgnp->rgn_id; 14159 ASSERT(rid < maxids); 14160 ASSERT(rarrp[rid] == rgnp); 14161 ASSERT(rid < *nextidp); 14162 atomic_add_32((volatile uint_t *)&rgnp->rgn_refcnt, 1); 14163 mutex_exit(&srdp->srd_mutex); 14164 if (new_rgnp != NULL) { 14165 kmem_cache_free(region_cache, new_rgnp); 14166 } 14167 if (r_type == SFMMU_REGION_HME) { 14168 int myjoin = 14169 (sfmmup == astosfmmu(curthread->t_procp->p_as)); 14170 14171 sfmmu_link_to_hmeregion(sfmmup, rgnp); 14172 /* 14173 * bitmap should be updated after linking sfmmu on 14174 * region list so that pageunload() doesn't skip 14175 * TSB/TLB flush. 
As soon as bitmap is updated another 14176 * thread in this process can already start accessing 14177 * this region. 14178 */ 14179 /* 14180 * Normally ttecnt accounting is done as part of 14181 * pagefault handling. But a process may not take any 14182 * pagefaults on shared hmeblks created by some other 14183 * process. To compensate for this assume that the 14184 * entire region will end up faulted in using 14185 * the region's pagesize. 14186 * 14187 */ 14188 if (r_pgszc > TTE8K) { 14189 tteflag = 1 << r_pgszc; 14190 if (disable_large_pages & tteflag) { 14191 tteflag = 0; 14192 } 14193 } else { 14194 tteflag = 0; 14195 } 14196 if (tteflag && !(sfmmup->sfmmu_rtteflags & tteflag)) { 14197 hatlockp = sfmmu_hat_enter(sfmmup); 14198 sfmmup->sfmmu_rtteflags |= tteflag; 14199 sfmmu_hat_exit(hatlockp); 14200 } 14201 hatlockp = sfmmu_hat_enter(sfmmup); 14202 14203 /* 14204 * Preallocate 1/4 of ttecnt's in 8K TSB for >= 4M 14205 * region to allow for large page allocation failure. 14206 */ 14207 if (r_pgszc >= TTE4M) { 14208 sfmmup->sfmmu_tsb0_4minflcnt += 14209 r_size >> (TTE_PAGE_SHIFT(TTE8K) + 2); 14210 } 14211 14212 /* update sfmmu_ttecnt with the shme rgn ttecnt */ 14213 rttecnt = r_size >> TTE_PAGE_SHIFT(r_pgszc); 14214 atomic_add_long(&sfmmup->sfmmu_ttecnt[r_pgszc], 14215 rttecnt); 14216 14217 if (text && r_pgszc >= TTE4M && 14218 (tteflag || ((disable_large_pages >> TTE4M) & 14219 ((1 << (r_pgszc - TTE4M + 1)) - 1))) && 14220 !SFMMU_FLAGS_ISSET(sfmmup, HAT_4MTEXT_FLAG)) { 14221 SFMMU_FLAGS_SET(sfmmup, HAT_4MTEXT_FLAG); 14222 } 14223 14224 sfmmu_hat_exit(hatlockp); 14225 /* 14226 * On Panther we need to make sure TLB is programmed 14227 * to accept 32M/256M pages. Call 14228 * sfmmu_check_page_sizes() now to make sure TLB is 14229 * setup before making hmeregions visible to other 14230 * threads. 
14231 */ 14232 sfmmu_check_page_sizes(sfmmup, 1); 14233 hatlockp = sfmmu_hat_enter(sfmmup); 14234 SF_RGNMAP_ADD(sfmmup->sfmmu_hmeregion_map, rid); 14235 14236 /* 14237 * if context is invalid tsb miss exception code will 14238 * call sfmmu_check_page_sizes() and update tsbmiss 14239 * area later. 14240 */ 14241 kpreempt_disable(); 14242 if (myjoin && 14243 (sfmmup->sfmmu_ctxs[CPU_MMU_IDX(CPU)].cnum 14244 != INVALID_CONTEXT)) { 14245 struct tsbmiss *tsbmp; 14246 14247 tsbmp = &tsbmiss_area[CPU->cpu_id]; 14248 ASSERT(sfmmup == tsbmp->usfmmup); 14249 BT_SET(tsbmp->shmermap, rid); 14250 if (r_pgszc > TTE64K) { 14251 tsbmp->uhat_rtteflags |= tteflag; 14252 } 14253 14254 } 14255 kpreempt_enable(); 14256 14257 sfmmu_hat_exit(hatlockp); 14258 ASSERT((hat_region_cookie_t)((uint64_t)rid) != 14259 HAT_INVALID_REGION_COOKIE); 14260 } else { 14261 hatlockp = sfmmu_hat_enter(sfmmup); 14262 SF_RGNMAP_ADD(sfmmup->sfmmu_ismregion_map, rid); 14263 sfmmu_hat_exit(hatlockp); 14264 } 14265 ASSERT(rid < maxids); 14266 14267 if (r_type == SFMMU_REGION_ISM) { 14268 sfmmu_find_scd(sfmmup); 14269 } 14270 return ((hat_region_cookie_t)((uint64_t)rid)); 14271 } 14272 14273 ASSERT(new_rgnp == NULL); 14274 14275 if (*busyrgnsp >= maxids) { 14276 mutex_exit(&srdp->srd_mutex); 14277 return (HAT_INVALID_REGION_COOKIE); 14278 } 14279 14280 ASSERT(MUTEX_HELD(&srdp->srd_mutex)); 14281 if (*freelistp != NULL) { 14282 rgnp = *freelistp; 14283 *freelistp = rgnp->rgn_next; 14284 ASSERT(rgnp->rgn_id < *nextidp); 14285 ASSERT(rgnp->rgn_id < maxids); 14286 ASSERT(rgnp->rgn_flags & SFMMU_REGION_FREE); 14287 ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) 14288 == r_type); 14289 ASSERT(rarrp[rgnp->rgn_id] == rgnp); 14290 ASSERT(rgnp->rgn_hmeflags == 0); 14291 } else { 14292 /* 14293 * release local locks before memory allocation. 
14294 */ 14295 mutex_exit(&srdp->srd_mutex); 14296 14297 new_rgnp = kmem_cache_alloc(region_cache, KM_SLEEP); 14298 14299 mutex_enter(&srdp->srd_mutex); 14300 for (rgnp = srdp->srd_rgnhash[rhash]; rgnp != NULL; 14301 rgnp = rgnp->rgn_hash) { 14302 if (rgnp->rgn_saddr == r_saddr && 14303 rgnp->rgn_size == r_size && 14304 rgnp->rgn_obj == r_obj && 14305 rgnp->rgn_objoff == r_objoff && 14306 rgnp->rgn_perm == r_perm && 14307 rgnp->rgn_pgszc == r_pgszc) { 14308 break; 14309 } 14310 } 14311 if (rgnp != NULL) { 14312 goto rfound; 14313 } 14314 14315 if (*nextidp >= maxids) { 14316 mutex_exit(&srdp->srd_mutex); 14317 goto fail; 14318 } 14319 rgnp = new_rgnp; 14320 new_rgnp = NULL; 14321 rgnp->rgn_id = (*nextidp)++; 14322 ASSERT(rgnp->rgn_id < maxids); 14323 ASSERT(rarrp[rgnp->rgn_id] == NULL); 14324 rarrp[rgnp->rgn_id] = rgnp; 14325 } 14326 14327 ASSERT(rgnp->rgn_sfmmu_head == NULL); 14328 ASSERT(rgnp->rgn_hmeflags == 0); 14329 #ifdef DEBUG 14330 for (i = 0; i < MMU_PAGE_SIZES; i++) { 14331 ASSERT(rgnp->rgn_ttecnt[i] == 0); 14332 } 14333 #endif 14334 rgnp->rgn_saddr = r_saddr; 14335 rgnp->rgn_size = r_size; 14336 rgnp->rgn_obj = r_obj; 14337 rgnp->rgn_objoff = r_objoff; 14338 rgnp->rgn_perm = r_perm; 14339 rgnp->rgn_pgszc = r_pgszc; 14340 rgnp->rgn_flags = r_type; 14341 rgnp->rgn_refcnt = 0; 14342 rgnp->rgn_cb_function = r_cb_function; 14343 rgnp->rgn_hash = srdp->srd_rgnhash[rhash]; 14344 srdp->srd_rgnhash[rhash] = rgnp; 14345 (*busyrgnsp)++; 14346 ASSERT(*busyrgnsp <= maxids); 14347 goto rfound; 14348 14349 fail: 14350 ASSERT(new_rgnp != NULL); 14351 kmem_cache_free(region_cache, new_rgnp); 14352 return (HAT_INVALID_REGION_COOKIE); 14353 } 14354 14355 /* 14356 * This function implements the shared context functionality required 14357 * when detaching a segment from an address space. It must be called 14358 * from hat_unshare() for all D(ISM) segments and from segvn_unmap(), 14359 * for segments with a valid region_cookie. 
14360 * It will also be called from all seg_vn routines which change a 14361 * segment's attributes such as segvn_setprot(), segvn_setpagesize(), 14362 * segvn_clrszc() & segvn_advise(), as well as in the case of COW fault 14363 * from segvn_fault(). 14364 */ 14365 void 14366 hat_leave_region(struct hat *sfmmup, hat_region_cookie_t rcookie, uint_t flags) 14367 { 14368 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 14369 sf_scd_t *scdp; 14370 uint_t rhash; 14371 uint_t rid = (uint_t)((uint64_t)rcookie); 14372 hatlock_t *hatlockp = NULL; 14373 sf_region_t *rgnp; 14374 sf_region_t **prev_rgnpp; 14375 sf_region_t *cur_rgnp; 14376 void *r_obj; 14377 int i; 14378 caddr_t r_saddr; 14379 caddr_t r_eaddr; 14380 size_t r_size; 14381 uchar_t r_pgszc; 14382 uchar_t r_type = flags & HAT_REGION_TYPE_MASK; 14383 14384 ASSERT(sfmmup != ksfmmup); 14385 ASSERT(srdp != NULL); 14386 ASSERT(srdp->srd_refcnt > 0); 14387 ASSERT(!(flags & ~HAT_REGION_TYPE_MASK)); 14388 ASSERT(flags == HAT_REGION_TEXT || flags == HAT_REGION_ISM); 14389 ASSERT(!sfmmup->sfmmu_free || sfmmup->sfmmu_scdp == NULL); 14390 14391 r_type = (r_type == HAT_REGION_ISM) ? 
SFMMU_REGION_ISM : 14392 SFMMU_REGION_HME; 14393 14394 if (r_type == SFMMU_REGION_ISM) { 14395 ASSERT(SFMMU_IS_ISMRID_VALID(rid)); 14396 ASSERT(rid < SFMMU_MAX_ISM_REGIONS); 14397 rgnp = srdp->srd_ismrgnp[rid]; 14398 } else { 14399 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 14400 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 14401 rgnp = srdp->srd_hmergnp[rid]; 14402 } 14403 ASSERT(rgnp != NULL); 14404 ASSERT(rgnp->rgn_id == rid); 14405 ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == r_type); 14406 ASSERT(!(rgnp->rgn_flags & SFMMU_REGION_FREE)); 14407 ASSERT(AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 14408 14409 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 14410 if (r_type == SFMMU_REGION_HME && sfmmup->sfmmu_as->a_xhat != NULL) { 14411 xhat_unload_callback_all(sfmmup->sfmmu_as, rgnp->rgn_saddr, 14412 rgnp->rgn_size, 0, NULL); 14413 } 14414 14415 if (sfmmup->sfmmu_free) { 14416 ulong_t rttecnt; 14417 r_pgszc = rgnp->rgn_pgszc; 14418 r_size = rgnp->rgn_size; 14419 14420 ASSERT(sfmmup->sfmmu_scdp == NULL); 14421 if (r_type == SFMMU_REGION_ISM) { 14422 SF_RGNMAP_DEL(sfmmup->sfmmu_ismregion_map, rid); 14423 } else { 14424 /* update shme rgns ttecnt in sfmmu_ttecnt */ 14425 rttecnt = r_size >> TTE_PAGE_SHIFT(r_pgszc); 14426 ASSERT(sfmmup->sfmmu_ttecnt[r_pgszc] >= rttecnt); 14427 14428 atomic_add_long(&sfmmup->sfmmu_ttecnt[r_pgszc], 14429 -rttecnt); 14430 14431 SF_RGNMAP_DEL(sfmmup->sfmmu_hmeregion_map, rid); 14432 } 14433 } else if (r_type == SFMMU_REGION_ISM) { 14434 hatlockp = sfmmu_hat_enter(sfmmup); 14435 ASSERT(rid < srdp->srd_next_ismrid); 14436 SF_RGNMAP_DEL(sfmmup->sfmmu_ismregion_map, rid); 14437 scdp = sfmmup->sfmmu_scdp; 14438 if (scdp != NULL && 14439 SF_RGNMAP_TEST(scdp->scd_ismregion_map, rid)) { 14440 sfmmu_leave_scd(sfmmup, r_type); 14441 ASSERT(sfmmu_hat_lock_held(sfmmup)); 14442 } 14443 sfmmu_hat_exit(hatlockp); 14444 } else { 14445 ulong_t rttecnt; 14446 r_pgszc = rgnp->rgn_pgszc; 14447 r_saddr = rgnp->rgn_saddr; 14448 r_size = rgnp->rgn_size; 
14449 r_eaddr = r_saddr + r_size; 14450 14451 ASSERT(r_type == SFMMU_REGION_HME); 14452 hatlockp = sfmmu_hat_enter(sfmmup); 14453 ASSERT(rid < srdp->srd_next_hmerid); 14454 SF_RGNMAP_DEL(sfmmup->sfmmu_hmeregion_map, rid); 14455 14456 /* 14457 * If region is part of an SCD call sfmmu_leave_scd(). 14458 * Otherwise if process is not exiting and has valid context 14459 * just drop the context on the floor to lose stale TLB 14460 * entries and force the update of tsb miss area to reflect 14461 * the new region map. After that clean our TSB entries. 14462 */ 14463 scdp = sfmmup->sfmmu_scdp; 14464 if (scdp != NULL && 14465 SF_RGNMAP_TEST(scdp->scd_hmeregion_map, rid)) { 14466 sfmmu_leave_scd(sfmmup, r_type); 14467 ASSERT(sfmmu_hat_lock_held(sfmmup)); 14468 } 14469 sfmmu_invalidate_ctx(sfmmup); 14470 14471 i = TTE8K; 14472 while (i < mmu_page_sizes) { 14473 if (rgnp->rgn_ttecnt[i] != 0) { 14474 sfmmu_unload_tsb_range(sfmmup, r_saddr, 14475 r_eaddr, i); 14476 if (i < TTE4M) { 14477 i = TTE4M; 14478 continue; 14479 } else { 14480 break; 14481 } 14482 } 14483 i++; 14484 } 14485 /* Remove the preallocated 1/4 8k ttecnt for 4M regions. 
*/ 14486 if (r_pgszc >= TTE4M) { 14487 rttecnt = r_size >> (TTE_PAGE_SHIFT(TTE8K) + 2); 14488 ASSERT(sfmmup->sfmmu_tsb0_4minflcnt >= 14489 rttecnt); 14490 sfmmup->sfmmu_tsb0_4minflcnt -= rttecnt; 14491 } 14492 14493 /* update shme rgns ttecnt in sfmmu_ttecnt */ 14494 rttecnt = r_size >> TTE_PAGE_SHIFT(r_pgszc); 14495 ASSERT(sfmmup->sfmmu_ttecnt[r_pgszc] >= rttecnt); 14496 atomic_add_long(&sfmmup->sfmmu_ttecnt[r_pgszc], -rttecnt); 14497 14498 sfmmu_hat_exit(hatlockp); 14499 if (scdp != NULL && sfmmup->sfmmu_scdp == NULL) { 14500 /* sfmmup left the scd, grow private tsb */ 14501 sfmmu_check_page_sizes(sfmmup, 1); 14502 } else { 14503 sfmmu_check_page_sizes(sfmmup, 0); 14504 } 14505 } 14506 14507 if (r_type == SFMMU_REGION_HME) { 14508 sfmmu_unlink_from_hmeregion(sfmmup, rgnp); 14509 } 14510 14511 r_obj = rgnp->rgn_obj; 14512 if (atomic_add_32_nv((volatile uint_t *)&rgnp->rgn_refcnt, -1)) { 14513 return; 14514 } 14515 14516 /* 14517 * looks like nobody uses this region anymore. Free it. 14518 */ 14519 rhash = RGN_HASH_FUNCTION(r_obj); 14520 mutex_enter(&srdp->srd_mutex); 14521 for (prev_rgnpp = &srdp->srd_rgnhash[rhash]; 14522 (cur_rgnp = *prev_rgnpp) != NULL; 14523 prev_rgnpp = &cur_rgnp->rgn_hash) { 14524 if (cur_rgnp == rgnp && cur_rgnp->rgn_refcnt == 0) { 14525 break; 14526 } 14527 } 14528 14529 if (cur_rgnp == NULL) { 14530 mutex_exit(&srdp->srd_mutex); 14531 return; 14532 } 14533 14534 ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == r_type); 14535 *prev_rgnpp = rgnp->rgn_hash; 14536 if (r_type == SFMMU_REGION_ISM) { 14537 rgnp->rgn_flags |= SFMMU_REGION_FREE; 14538 ASSERT(rid < srdp->srd_next_ismrid); 14539 rgnp->rgn_next = srdp->srd_ismrgnfree; 14540 srdp->srd_ismrgnfree = rgnp; 14541 ASSERT(srdp->srd_ismbusyrgns > 0); 14542 srdp->srd_ismbusyrgns--; 14543 mutex_exit(&srdp->srd_mutex); 14544 return; 14545 } 14546 mutex_exit(&srdp->srd_mutex); 14547 14548 /* 14549 * Destroy region's hmeblks. 
14550 */ 14551 sfmmu_unload_hmeregion(srdp, rgnp); 14552 14553 rgnp->rgn_hmeflags = 0; 14554 14555 ASSERT(rgnp->rgn_sfmmu_head == NULL); 14556 ASSERT(rgnp->rgn_id == rid); 14557 for (i = 0; i < MMU_PAGE_SIZES; i++) { 14558 rgnp->rgn_ttecnt[i] = 0; 14559 } 14560 rgnp->rgn_flags |= SFMMU_REGION_FREE; 14561 mutex_enter(&srdp->srd_mutex); 14562 ASSERT(rid < srdp->srd_next_hmerid); 14563 rgnp->rgn_next = srdp->srd_hmergnfree; 14564 srdp->srd_hmergnfree = rgnp; 14565 ASSERT(srdp->srd_hmebusyrgns > 0); 14566 srdp->srd_hmebusyrgns--; 14567 mutex_exit(&srdp->srd_mutex); 14568 } 14569 14570 /* 14571 * For now only called for hmeblk regions and not for ISM regions. 14572 */ 14573 void 14574 hat_dup_region(struct hat *sfmmup, hat_region_cookie_t rcookie) 14575 { 14576 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 14577 uint_t rid = (uint_t)((uint64_t)rcookie); 14578 sf_region_t *rgnp; 14579 sf_rgn_link_t *rlink; 14580 sf_rgn_link_t *hrlink; 14581 ulong_t rttecnt; 14582 14583 ASSERT(sfmmup != ksfmmup); 14584 ASSERT(srdp != NULL); 14585 ASSERT(srdp->srd_refcnt > 0); 14586 14587 ASSERT(rid < srdp->srd_next_hmerid); 14588 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 14589 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 14590 14591 rgnp = srdp->srd_hmergnp[rid]; 14592 ASSERT(rgnp->rgn_refcnt > 0); 14593 ASSERT(rgnp->rgn_id == rid); 14594 ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == SFMMU_REGION_HME); 14595 ASSERT(!(rgnp->rgn_flags & SFMMU_REGION_FREE)); 14596 14597 atomic_add_32((volatile uint_t *)&rgnp->rgn_refcnt, 1); 14598 14599 /* LINTED: constant in conditional context */ 14600 SFMMU_HMERID2RLINKP(sfmmup, rid, rlink, 1, 0); 14601 ASSERT(rlink != NULL); 14602 mutex_enter(&rgnp->rgn_mutex); 14603 ASSERT(rgnp->rgn_sfmmu_head != NULL); 14604 /* LINTED: constant in conditional context */ 14605 SFMMU_HMERID2RLINKP(rgnp->rgn_sfmmu_head, rid, hrlink, 0, 0); 14606 ASSERT(hrlink != NULL); 14607 ASSERT(hrlink->prev == NULL); 14608 rlink->next = rgnp->rgn_sfmmu_head; 14609 rlink->prev = NULL; 14610 
hrlink->prev = sfmmup; 14611 /* 14612 * make sure rlink's next field is correct 14613 * before making this link visible. 14614 */ 14615 membar_stst(); 14616 rgnp->rgn_sfmmu_head = sfmmup; 14617 mutex_exit(&rgnp->rgn_mutex); 14618 14619 /* update sfmmu_ttecnt with the shme rgn ttecnt */ 14620 rttecnt = rgnp->rgn_size >> TTE_PAGE_SHIFT(rgnp->rgn_pgszc); 14621 atomic_add_long(&sfmmup->sfmmu_ttecnt[rgnp->rgn_pgszc], rttecnt); 14622 /* update tsb0 inflation count */ 14623 if (rgnp->rgn_pgszc >= TTE4M) { 14624 sfmmup->sfmmu_tsb0_4minflcnt += 14625 rgnp->rgn_size >> (TTE_PAGE_SHIFT(TTE8K) + 2); 14626 } 14627 /* 14628 * Update regionid bitmask without hat lock since no other thread 14629 * can update this region bitmask right now. 14630 */ 14631 SF_RGNMAP_ADD(sfmmup->sfmmu_hmeregion_map, rid); 14632 } 14633 14634 /* ARGSUSED */ 14635 static int 14636 sfmmu_rgncache_constructor(void *buf, void *cdrarg, int kmflags) 14637 { 14638 sf_region_t *rgnp = (sf_region_t *)buf; 14639 bzero(buf, sizeof (*rgnp)); 14640 14641 mutex_init(&rgnp->rgn_mutex, NULL, MUTEX_DEFAULT, NULL); 14642 14643 return (0); 14644 } 14645 14646 /* ARGSUSED */ 14647 static void 14648 sfmmu_rgncache_destructor(void *buf, void *cdrarg) 14649 { 14650 sf_region_t *rgnp = (sf_region_t *)buf; 14651 mutex_destroy(&rgnp->rgn_mutex); 14652 } 14653 14654 static int 14655 sfrgnmap_isnull(sf_region_map_t *map) 14656 { 14657 int i; 14658 14659 for (i = 0; i < SFMMU_RGNMAP_WORDS; i++) { 14660 if (map->bitmap[i] != 0) { 14661 return (0); 14662 } 14663 } 14664 return (1); 14665 } 14666 14667 static int 14668 sfhmergnmap_isnull(sf_hmeregion_map_t *map) 14669 { 14670 int i; 14671 14672 for (i = 0; i < SFMMU_HMERGNMAP_WORDS; i++) { 14673 if (map->bitmap[i] != 0) { 14674 return (0); 14675 } 14676 } 14677 return (1); 14678 } 14679 14680 #ifdef DEBUG 14681 static void 14682 check_scd_sfmmu_list(sfmmu_t **headp, sfmmu_t *sfmmup, int onlist) 14683 { 14684 sfmmu_t *sp; 14685 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 14686 14687 for (sp 
= *headp; sp != NULL; sp = sp->sfmmu_scd_link.next) { 14688 ASSERT(srdp == sp->sfmmu_srdp); 14689 if (sp == sfmmup) { 14690 if (onlist) { 14691 return; 14692 } else { 14693 panic("shctx: sfmmu 0x%p found on scd" 14694 "list 0x%p", (void *)sfmmup, 14695 (void *)*headp); 14696 } 14697 } 14698 } 14699 if (onlist) { 14700 panic("shctx: sfmmu 0x%p not found on scd list 0x%p", 14701 (void *)sfmmup, (void *)*headp); 14702 } else { 14703 return; 14704 } 14705 } 14706 #else /* DEBUG */ 14707 #define check_scd_sfmmu_list(headp, sfmmup, onlist) 14708 #endif /* DEBUG */ 14709 14710 /* 14711 * Removes an sfmmu from the SCD sfmmu list. 14712 */ 14713 static void 14714 sfmmu_from_scd_list(sfmmu_t **headp, sfmmu_t *sfmmup) 14715 { 14716 ASSERT(sfmmup->sfmmu_srdp != NULL); 14717 check_scd_sfmmu_list(headp, sfmmup, 1); 14718 if (sfmmup->sfmmu_scd_link.prev != NULL) { 14719 ASSERT(*headp != sfmmup); 14720 sfmmup->sfmmu_scd_link.prev->sfmmu_scd_link.next = 14721 sfmmup->sfmmu_scd_link.next; 14722 } else { 14723 ASSERT(*headp == sfmmup); 14724 *headp = sfmmup->sfmmu_scd_link.next; 14725 } 14726 if (sfmmup->sfmmu_scd_link.next != NULL) { 14727 sfmmup->sfmmu_scd_link.next->sfmmu_scd_link.prev = 14728 sfmmup->sfmmu_scd_link.prev; 14729 } 14730 } 14731 14732 14733 /* 14734 * Adds an sfmmu to the start of the queue. 14735 */ 14736 static void 14737 sfmmu_to_scd_list(sfmmu_t **headp, sfmmu_t *sfmmup) 14738 { 14739 check_scd_sfmmu_list(headp, sfmmup, 0); 14740 sfmmup->sfmmu_scd_link.prev = NULL; 14741 sfmmup->sfmmu_scd_link.next = *headp; 14742 if (*headp != NULL) 14743 (*headp)->sfmmu_scd_link.prev = sfmmup; 14744 *headp = sfmmup; 14745 } 14746 14747 /* 14748 * Remove an scd from the start of the queue. 
14749 */ 14750 static void 14751 sfmmu_remove_scd(sf_scd_t **headp, sf_scd_t *scdp) 14752 { 14753 if (scdp->scd_prev != NULL) { 14754 ASSERT(*headp != scdp); 14755 scdp->scd_prev->scd_next = scdp->scd_next; 14756 } else { 14757 ASSERT(*headp == scdp); 14758 *headp = scdp->scd_next; 14759 } 14760 14761 if (scdp->scd_next != NULL) { 14762 scdp->scd_next->scd_prev = scdp->scd_prev; 14763 } 14764 } 14765 14766 /* 14767 * Add an scd to the start of the queue. 14768 */ 14769 static void 14770 sfmmu_add_scd(sf_scd_t **headp, sf_scd_t *scdp) 14771 { 14772 scdp->scd_prev = NULL; 14773 scdp->scd_next = *headp; 14774 if (*headp != NULL) { 14775 (*headp)->scd_prev = scdp; 14776 } 14777 *headp = scdp; 14778 } 14779 14780 static int 14781 sfmmu_alloc_scd_tsbs(sf_srd_t *srdp, sf_scd_t *scdp) 14782 { 14783 uint_t rid; 14784 uint_t i; 14785 uint_t j; 14786 ulong_t w; 14787 sf_region_t *rgnp; 14788 ulong_t tte8k_cnt = 0; 14789 ulong_t tte4m_cnt = 0; 14790 uint_t tsb_szc; 14791 sfmmu_t *scsfmmup = scdp->scd_sfmmup; 14792 sfmmu_t *ism_hatid; 14793 struct tsb_info *newtsb; 14794 int szc; 14795 14796 ASSERT(srdp != NULL); 14797 14798 for (i = 0; i < SFMMU_RGNMAP_WORDS; i++) { 14799 if ((w = scdp->scd_region_map.bitmap[i]) == 0) { 14800 continue; 14801 } 14802 j = 0; 14803 while (w) { 14804 if (!(w & 0x1)) { 14805 j++; 14806 w >>= 1; 14807 continue; 14808 } 14809 rid = (i << BT_ULSHIFT) | j; 14810 j++; 14811 w >>= 1; 14812 14813 if (rid < SFMMU_MAX_HME_REGIONS) { 14814 rgnp = srdp->srd_hmergnp[rid]; 14815 ASSERT(rgnp->rgn_id == rid); 14816 ASSERT(rgnp->rgn_refcnt > 0); 14817 14818 if (rgnp->rgn_pgszc < TTE4M) { 14819 tte8k_cnt += rgnp->rgn_size >> 14820 TTE_PAGE_SHIFT(TTE8K); 14821 } else { 14822 ASSERT(rgnp->rgn_pgszc >= TTE4M); 14823 tte4m_cnt += rgnp->rgn_size >> 14824 TTE_PAGE_SHIFT(TTE4M); 14825 /* 14826 * Inflate SCD tsb0 by preallocating 14827 * 1/4 8k ttecnt for 4M regions to 14828 * allow for lgpg alloc failure. 
14829 */ 14830 tte8k_cnt += rgnp->rgn_size >> 14831 (TTE_PAGE_SHIFT(TTE8K) + 2); 14832 } 14833 } else { 14834 rid -= SFMMU_MAX_HME_REGIONS; 14835 rgnp = srdp->srd_ismrgnp[rid]; 14836 ASSERT(rgnp->rgn_id == rid); 14837 ASSERT(rgnp->rgn_refcnt > 0); 14838 14839 ism_hatid = (sfmmu_t *)rgnp->rgn_obj; 14840 ASSERT(ism_hatid->sfmmu_ismhat); 14841 14842 for (szc = 0; szc < TTE4M; szc++) { 14843 tte8k_cnt += 14844 ism_hatid->sfmmu_ttecnt[szc] << 14845 TTE_BSZS_SHIFT(szc); 14846 } 14847 14848 ASSERT(rgnp->rgn_pgszc >= TTE4M); 14849 if (rgnp->rgn_pgszc >= TTE4M) { 14850 tte4m_cnt += rgnp->rgn_size >> 14851 TTE_PAGE_SHIFT(TTE4M); 14852 } 14853 } 14854 } 14855 } 14856 14857 tsb_szc = SELECT_TSB_SIZECODE(tte8k_cnt); 14858 14859 /* Allocate both the SCD TSBs here. */ 14860 if (sfmmu_tsbinfo_alloc(&scsfmmup->sfmmu_tsb, 14861 tsb_szc, TSB8K|TSB64K|TSB512K, TSB_ALLOC, scsfmmup) && 14862 (tsb_szc <= TSB_4M_SZCODE || 14863 sfmmu_tsbinfo_alloc(&scsfmmup->sfmmu_tsb, 14864 TSB_4M_SZCODE, TSB8K|TSB64K|TSB512K, 14865 TSB_ALLOC, scsfmmup))) { 14866 14867 SFMMU_STAT(sf_scd_1sttsb_allocfail); 14868 return (TSB_ALLOCFAIL); 14869 } else { 14870 scsfmmup->sfmmu_tsb->tsb_flags |= TSB_SHAREDCTX; 14871 14872 if (tte4m_cnt) { 14873 tsb_szc = SELECT_TSB_SIZECODE(tte4m_cnt); 14874 if (sfmmu_tsbinfo_alloc(&newtsb, tsb_szc, 14875 TSB4M|TSB32M|TSB256M, TSB_ALLOC, scsfmmup) && 14876 (tsb_szc <= TSB_4M_SZCODE || 14877 sfmmu_tsbinfo_alloc(&newtsb, TSB_4M_SZCODE, 14878 TSB4M|TSB32M|TSB256M, 14879 TSB_ALLOC, scsfmmup))) { 14880 /* 14881 * If we fail to allocate the 2nd shared tsb, 14882 * just free the 1st tsb, return failure. 
14883 */ 14884 sfmmu_tsbinfo_free(scsfmmup->sfmmu_tsb); 14885 SFMMU_STAT(sf_scd_2ndtsb_allocfail); 14886 return (TSB_ALLOCFAIL); 14887 } else { 14888 ASSERT(scsfmmup->sfmmu_tsb->tsb_next == NULL); 14889 newtsb->tsb_flags |= TSB_SHAREDCTX; 14890 scsfmmup->sfmmu_tsb->tsb_next = newtsb; 14891 SFMMU_STAT(sf_scd_2ndtsb_alloc); 14892 } 14893 } 14894 SFMMU_STAT(sf_scd_1sttsb_alloc); 14895 } 14896 return (TSB_SUCCESS); 14897 } 14898 14899 static void 14900 sfmmu_free_scd_tsbs(sfmmu_t *scd_sfmmu) 14901 { 14902 while (scd_sfmmu->sfmmu_tsb != NULL) { 14903 struct tsb_info *next = scd_sfmmu->sfmmu_tsb->tsb_next; 14904 sfmmu_tsbinfo_free(scd_sfmmu->sfmmu_tsb); 14905 scd_sfmmu->sfmmu_tsb = next; 14906 } 14907 } 14908 14909 /* 14910 * Link the sfmmu onto the hme region list. 14911 */ 14912 void 14913 sfmmu_link_to_hmeregion(sfmmu_t *sfmmup, sf_region_t *rgnp) 14914 { 14915 uint_t rid; 14916 sf_rgn_link_t *rlink; 14917 sfmmu_t *head; 14918 sf_rgn_link_t *hrlink; 14919 14920 rid = rgnp->rgn_id; 14921 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 14922 14923 /* LINTED: constant in conditional context */ 14924 SFMMU_HMERID2RLINKP(sfmmup, rid, rlink, 1, 1); 14925 ASSERT(rlink != NULL); 14926 mutex_enter(&rgnp->rgn_mutex); 14927 if ((head = rgnp->rgn_sfmmu_head) == NULL) { 14928 rlink->next = NULL; 14929 rlink->prev = NULL; 14930 /* 14931 * make sure rlink's next field is NULL 14932 * before making this link visible. 14933 */ 14934 membar_stst(); 14935 rgnp->rgn_sfmmu_head = sfmmup; 14936 } else { 14937 /* LINTED: constant in conditional context */ 14938 SFMMU_HMERID2RLINKP(head, rid, hrlink, 0, 0); 14939 ASSERT(hrlink != NULL); 14940 ASSERT(hrlink->prev == NULL); 14941 rlink->next = head; 14942 rlink->prev = NULL; 14943 hrlink->prev = sfmmup; 14944 /* 14945 * make sure rlink's next field is correct 14946 * before making this link visible. 
14947 */ 14948 membar_stst(); 14949 rgnp->rgn_sfmmu_head = sfmmup; 14950 } 14951 mutex_exit(&rgnp->rgn_mutex); 14952 } 14953 14954 /* 14955 * Unlink the sfmmu from the hme region list. 14956 */ 14957 void 14958 sfmmu_unlink_from_hmeregion(sfmmu_t *sfmmup, sf_region_t *rgnp) 14959 { 14960 uint_t rid; 14961 sf_rgn_link_t *rlink; 14962 14963 rid = rgnp->rgn_id; 14964 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 14965 14966 /* LINTED: constant in conditional context */ 14967 SFMMU_HMERID2RLINKP(sfmmup, rid, rlink, 0, 0); 14968 ASSERT(rlink != NULL); 14969 mutex_enter(&rgnp->rgn_mutex); 14970 if (rgnp->rgn_sfmmu_head == sfmmup) { 14971 sfmmu_t *next = rlink->next; 14972 rgnp->rgn_sfmmu_head = next; 14973 /* 14974 * if we are stopped by xc_attention() after this 14975 * point the forward link walking in 14976 * sfmmu_rgntlb_demap() will work correctly since the 14977 * head correctly points to the next element. 14978 */ 14979 membar_stst(); 14980 rlink->next = NULL; 14981 ASSERT(rlink->prev == NULL); 14982 if (next != NULL) { 14983 sf_rgn_link_t *nrlink; 14984 /* LINTED: constant in conditional context */ 14985 SFMMU_HMERID2RLINKP(next, rid, nrlink, 0, 0); 14986 ASSERT(nrlink != NULL); 14987 ASSERT(nrlink->prev == sfmmup); 14988 nrlink->prev = NULL; 14989 } 14990 } else { 14991 sfmmu_t *next = rlink->next; 14992 sfmmu_t *prev = rlink->prev; 14993 sf_rgn_link_t *prlink; 14994 14995 ASSERT(prev != NULL); 14996 /* LINTED: constant in conditional context */ 14997 SFMMU_HMERID2RLINKP(prev, rid, prlink, 0, 0); 14998 ASSERT(prlink != NULL); 14999 ASSERT(prlink->next == sfmmup); 15000 prlink->next = next; 15001 /* 15002 * if we are stopped by xc_attention() 15003 * after this point the forward link walking 15004 * will work correctly since the prev element 15005 * correctly points to the next element. 
15006 */ 15007 membar_stst(); 15008 rlink->next = NULL; 15009 rlink->prev = NULL; 15010 if (next != NULL) { 15011 sf_rgn_link_t *nrlink; 15012 /* LINTED: constant in conditional context */ 15013 SFMMU_HMERID2RLINKP(next, rid, nrlink, 0, 0); 15014 ASSERT(nrlink != NULL); 15015 ASSERT(nrlink->prev == sfmmup); 15016 nrlink->prev = prev; 15017 } 15018 } 15019 mutex_exit(&rgnp->rgn_mutex); 15020 } 15021 15022 /* 15023 * Link scd sfmmu onto ism or hme region list for each region in the 15024 * scd region map. 15025 */ 15026 void 15027 sfmmu_link_scd_to_regions(sf_srd_t *srdp, sf_scd_t *scdp) 15028 { 15029 uint_t rid; 15030 uint_t i; 15031 uint_t j; 15032 ulong_t w; 15033 sf_region_t *rgnp; 15034 sfmmu_t *scsfmmup; 15035 15036 scsfmmup = scdp->scd_sfmmup; 15037 ASSERT(scsfmmup->sfmmu_scdhat); 15038 for (i = 0; i < SFMMU_RGNMAP_WORDS; i++) { 15039 if ((w = scdp->scd_region_map.bitmap[i]) == 0) { 15040 continue; 15041 } 15042 j = 0; 15043 while (w) { 15044 if (!(w & 0x1)) { 15045 j++; 15046 w >>= 1; 15047 continue; 15048 } 15049 rid = (i << BT_ULSHIFT) | j; 15050 j++; 15051 w >>= 1; 15052 15053 if (rid < SFMMU_MAX_HME_REGIONS) { 15054 rgnp = srdp->srd_hmergnp[rid]; 15055 ASSERT(rgnp->rgn_id == rid); 15056 ASSERT(rgnp->rgn_refcnt > 0); 15057 sfmmu_link_to_hmeregion(scsfmmup, rgnp); 15058 } else { 15059 sfmmu_t *ism_hatid = NULL; 15060 ism_ment_t *ism_ment; 15061 rid -= SFMMU_MAX_HME_REGIONS; 15062 rgnp = srdp->srd_ismrgnp[rid]; 15063 ASSERT(rgnp->rgn_id == rid); 15064 ASSERT(rgnp->rgn_refcnt > 0); 15065 15066 ism_hatid = (sfmmu_t *)rgnp->rgn_obj; 15067 ASSERT(ism_hatid->sfmmu_ismhat); 15068 ism_ment = &scdp->scd_ism_links[rid]; 15069 ism_ment->iment_hat = scsfmmup; 15070 ism_ment->iment_base_va = rgnp->rgn_saddr; 15071 mutex_enter(&ism_mlist_lock); 15072 iment_add(ism_ment, ism_hatid); 15073 mutex_exit(&ism_mlist_lock); 15074 15075 } 15076 } 15077 } 15078 } 15079 /* 15080 * Unlink scd sfmmu from ism or hme region list for each region in the 15081 * scd region map. 
15082 */ 15083 void 15084 sfmmu_unlink_scd_from_regions(sf_srd_t *srdp, sf_scd_t *scdp) 15085 { 15086 uint_t rid; 15087 uint_t i; 15088 uint_t j; 15089 ulong_t w; 15090 sf_region_t *rgnp; 15091 sfmmu_t *scsfmmup; 15092 15093 scsfmmup = scdp->scd_sfmmup; 15094 for (i = 0; i < SFMMU_RGNMAP_WORDS; i++) { 15095 if ((w = scdp->scd_region_map.bitmap[i]) == 0) { 15096 continue; 15097 } 15098 j = 0; 15099 while (w) { 15100 if (!(w & 0x1)) { 15101 j++; 15102 w >>= 1; 15103 continue; 15104 } 15105 rid = (i << BT_ULSHIFT) | j; 15106 j++; 15107 w >>= 1; 15108 15109 if (rid < SFMMU_MAX_HME_REGIONS) { 15110 rgnp = srdp->srd_hmergnp[rid]; 15111 ASSERT(rgnp->rgn_id == rid); 15112 ASSERT(rgnp->rgn_refcnt > 0); 15113 sfmmu_unlink_from_hmeregion(scsfmmup, 15114 rgnp); 15115 15116 } else { 15117 sfmmu_t *ism_hatid = NULL; 15118 ism_ment_t *ism_ment; 15119 rid -= SFMMU_MAX_HME_REGIONS; 15120 rgnp = srdp->srd_ismrgnp[rid]; 15121 ASSERT(rgnp->rgn_id == rid); 15122 ASSERT(rgnp->rgn_refcnt > 0); 15123 15124 ism_hatid = (sfmmu_t *)rgnp->rgn_obj; 15125 ASSERT(ism_hatid->sfmmu_ismhat); 15126 ism_ment = &scdp->scd_ism_links[rid]; 15127 ASSERT(ism_ment->iment_hat == scdp->scd_sfmmup); 15128 ASSERT(ism_ment->iment_base_va == 15129 rgnp->rgn_saddr); 15130 mutex_enter(&ism_mlist_lock); 15131 iment_sub(ism_ment, ism_hatid); 15132 mutex_exit(&ism_mlist_lock); 15133 15134 } 15135 } 15136 } 15137 } 15138 /* 15139 * Allocates and initialises a new SCD structure, this is called with 15140 * the srd_scd_mutex held and returns with the reference count 15141 * initialised to 1. 
15142 */ 15143 static sf_scd_t * 15144 sfmmu_alloc_scd(sf_srd_t *srdp, sf_region_map_t *new_map) 15145 { 15146 sf_scd_t *new_scdp; 15147 sfmmu_t *scsfmmup; 15148 int i; 15149 15150 ASSERT(MUTEX_HELD(&srdp->srd_scd_mutex)); 15151 new_scdp = kmem_cache_alloc(scd_cache, KM_SLEEP); 15152 15153 scsfmmup = kmem_cache_alloc(sfmmuid_cache, KM_SLEEP); 15154 new_scdp->scd_sfmmup = scsfmmup; 15155 scsfmmup->sfmmu_srdp = srdp; 15156 scsfmmup->sfmmu_scdp = new_scdp; 15157 scsfmmup->sfmmu_tsb0_4minflcnt = 0; 15158 scsfmmup->sfmmu_scdhat = 1; 15159 CPUSET_ALL(scsfmmup->sfmmu_cpusran); 15160 bzero(scsfmmup->sfmmu_hmeregion_links, SFMMU_L1_HMERLINKS_SIZE); 15161 15162 ASSERT(max_mmu_ctxdoms > 0); 15163 for (i = 0; i < max_mmu_ctxdoms; i++) { 15164 scsfmmup->sfmmu_ctxs[i].cnum = INVALID_CONTEXT; 15165 scsfmmup->sfmmu_ctxs[i].gnum = 0; 15166 } 15167 15168 for (i = 0; i < MMU_PAGE_SIZES; i++) { 15169 new_scdp->scd_rttecnt[i] = 0; 15170 } 15171 15172 new_scdp->scd_region_map = *new_map; 15173 new_scdp->scd_refcnt = 1; 15174 if (sfmmu_alloc_scd_tsbs(srdp, new_scdp) != TSB_SUCCESS) { 15175 kmem_cache_free(scd_cache, new_scdp); 15176 kmem_cache_free(sfmmuid_cache, scsfmmup); 15177 return (NULL); 15178 } 15179 if (&mmu_init_scd) { 15180 mmu_init_scd(new_scdp); 15181 } 15182 return (new_scdp); 15183 } 15184 15185 /* 15186 * The first phase of a process joining an SCD. The hat structure is 15187 * linked to the SCD queue and then the HAT_JOIN_SCD sfmmu flag is set 15188 * and a cross-call with context invalidation is used to cause the 15189 * remaining work to be carried out in the sfmmu_tsbmiss_exception() 15190 * routine. 
15191 */ 15192 static void 15193 sfmmu_join_scd(sf_scd_t *scdp, sfmmu_t *sfmmup) 15194 { 15195 hatlock_t *hatlockp; 15196 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 15197 int i; 15198 sf_scd_t *old_scdp; 15199 15200 ASSERT(srdp != NULL); 15201 ASSERT(scdp != NULL); 15202 ASSERT(scdp->scd_refcnt > 0); 15203 ASSERT(AS_WRITE_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 15204 15205 if ((old_scdp = sfmmup->sfmmu_scdp) != NULL) { 15206 ASSERT(old_scdp != scdp); 15207 15208 mutex_enter(&old_scdp->scd_mutex); 15209 sfmmu_from_scd_list(&old_scdp->scd_sf_list, sfmmup); 15210 mutex_exit(&old_scdp->scd_mutex); 15211 /* 15212 * sfmmup leaves the old scd. Update sfmmu_ttecnt to 15213 * include the shme rgn ttecnt for rgns that 15214 * were in the old SCD 15215 */ 15216 for (i = 0; i < mmu_page_sizes; i++) { 15217 ASSERT(sfmmup->sfmmu_scdrttecnt[i] == 15218 old_scdp->scd_rttecnt[i]); 15219 atomic_add_long(&sfmmup->sfmmu_ttecnt[i], 15220 sfmmup->sfmmu_scdrttecnt[i]); 15221 } 15222 } 15223 15224 /* 15225 * Move sfmmu to the scd lists. 15226 */ 15227 mutex_enter(&scdp->scd_mutex); 15228 sfmmu_to_scd_list(&scdp->scd_sf_list, sfmmup); 15229 mutex_exit(&scdp->scd_mutex); 15230 SF_SCD_INCR_REF(scdp); 15231 15232 hatlockp = sfmmu_hat_enter(sfmmup); 15233 /* 15234 * For a multi-thread process, we must stop 15235 * all the other threads before joining the scd. 15236 */ 15237 15238 SFMMU_FLAGS_SET(sfmmup, HAT_JOIN_SCD); 15239 15240 sfmmu_invalidate_ctx(sfmmup); 15241 sfmmup->sfmmu_scdp = scdp; 15242 15243 /* 15244 * Copy scd_rttecnt into sfmmup's sfmmu_scdrttecnt, and update 15245 * sfmmu_ttecnt to not include the rgn ttecnt just joined in SCD. 
15246 */ 15247 for (i = 0; i < mmu_page_sizes; i++) { 15248 sfmmup->sfmmu_scdrttecnt[i] = scdp->scd_rttecnt[i]; 15249 ASSERT(sfmmup->sfmmu_ttecnt[i] >= scdp->scd_rttecnt[i]); 15250 atomic_add_long(&sfmmup->sfmmu_ttecnt[i], 15251 -sfmmup->sfmmu_scdrttecnt[i]); 15252 } 15253 /* update tsb0 inflation count */ 15254 if (old_scdp != NULL) { 15255 sfmmup->sfmmu_tsb0_4minflcnt += 15256 old_scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt; 15257 } 15258 ASSERT(sfmmup->sfmmu_tsb0_4minflcnt >= 15259 scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt); 15260 sfmmup->sfmmu_tsb0_4minflcnt -= scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt; 15261 15262 sfmmu_hat_exit(hatlockp); 15263 15264 if (old_scdp != NULL) { 15265 SF_SCD_DECR_REF(srdp, old_scdp); 15266 } 15267 15268 } 15269 15270 /* 15271 * This routine is called by a process to become part of an SCD. It is called 15272 * from sfmmu_tsbmiss_exception() once most of the initial work has been 15273 * done by sfmmu_join_scd(). This routine must not drop the hat lock. 15274 */ 15275 static void 15276 sfmmu_finish_join_scd(sfmmu_t *sfmmup) 15277 { 15278 struct tsb_info *tsbinfop; 15279 15280 ASSERT(sfmmu_hat_lock_held(sfmmup)); 15281 ASSERT(sfmmup->sfmmu_scdp != NULL); 15282 ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD)); 15283 ASSERT(!SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)); 15284 ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_ALLCTX_INVALID)); 15285 15286 for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL; 15287 tsbinfop = tsbinfop->tsb_next) { 15288 if (tsbinfop->tsb_flags & TSB_SWAPPED) { 15289 continue; 15290 } 15291 ASSERT(!(tsbinfop->tsb_flags & TSB_RELOC_FLAG)); 15292 15293 sfmmu_inv_tsb(tsbinfop->tsb_va, 15294 TSB_BYTES(tsbinfop->tsb_szc)); 15295 } 15296 15297 /* Set HAT_CTX1_FLAG for all SCD ISMs */ 15298 sfmmu_ism_hatflags(sfmmup, 1); 15299 15300 SFMMU_STAT(sf_join_scd); 15301 } 15302 15303 /* 15304 * This routine is called in order to check if there is an SCD which matches 15305 * the process's region map if not then a new SCD may be created. 
15306 */ 15307 static void 15308 sfmmu_find_scd(sfmmu_t *sfmmup) 15309 { 15310 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 15311 sf_scd_t *scdp, *new_scdp; 15312 int ret; 15313 15314 ASSERT(srdp != NULL); 15315 ASSERT(AS_WRITE_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 15316 15317 mutex_enter(&srdp->srd_scd_mutex); 15318 for (scdp = srdp->srd_scdp; scdp != NULL; 15319 scdp = scdp->scd_next) { 15320 SF_RGNMAP_EQUAL(&scdp->scd_region_map, 15321 &sfmmup->sfmmu_region_map, ret); 15322 if (ret == 1) { 15323 SF_SCD_INCR_REF(scdp); 15324 mutex_exit(&srdp->srd_scd_mutex); 15325 sfmmu_join_scd(scdp, sfmmup); 15326 ASSERT(scdp->scd_refcnt >= 2); 15327 atomic_add_32((volatile uint32_t *) 15328 &scdp->scd_refcnt, -1); 15329 return; 15330 } else { 15331 /* 15332 * If the sfmmu region map is a subset of the scd 15333 * region map, then the assumption is that this process 15334 * will continue attaching to ISM segments until the 15335 * region maps are equal. 15336 */ 15337 SF_RGNMAP_IS_SUBSET(&scdp->scd_region_map, 15338 &sfmmup->sfmmu_region_map, ret); 15339 if (ret == 1) { 15340 mutex_exit(&srdp->srd_scd_mutex); 15341 return; 15342 } 15343 } 15344 } 15345 15346 ASSERT(scdp == NULL); 15347 /* 15348 * No matching SCD has been found, create a new one. 15349 */ 15350 if ((new_scdp = sfmmu_alloc_scd(srdp, &sfmmup->sfmmu_region_map)) == 15351 NULL) { 15352 mutex_exit(&srdp->srd_scd_mutex); 15353 return; 15354 } 15355 15356 /* 15357 * sfmmu_alloc_scd() returns with a ref count of 1 on the scd. 15358 */ 15359 15360 /* Set scd_rttecnt for shme rgns in SCD */ 15361 sfmmu_set_scd_rttecnt(srdp, new_scdp); 15362 15363 /* 15364 * Link scd onto srd_scdp list and scd sfmmu onto region/iment lists. 
15365 */ 15366 sfmmu_link_scd_to_regions(srdp, new_scdp); 15367 sfmmu_add_scd(&srdp->srd_scdp, new_scdp); 15368 SFMMU_STAT_ADD(sf_create_scd, 1); 15369 15370 mutex_exit(&srdp->srd_scd_mutex); 15371 sfmmu_join_scd(new_scdp, sfmmup); 15372 ASSERT(new_scdp->scd_refcnt >= 2); 15373 atomic_add_32((volatile uint32_t *)&new_scdp->scd_refcnt, -1); 15374 } 15375 15376 /* 15377 * This routine is called by a process to remove itself from an SCD. It is 15378 * either called when the processes has detached from a segment or from 15379 * hat_free_start() as a result of calling exit. 15380 */ 15381 static void 15382 sfmmu_leave_scd(sfmmu_t *sfmmup, uchar_t r_type) 15383 { 15384 sf_scd_t *scdp = sfmmup->sfmmu_scdp; 15385 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 15386 hatlock_t *hatlockp = TSB_HASH(sfmmup); 15387 int i; 15388 15389 ASSERT(scdp != NULL); 15390 ASSERT(srdp != NULL); 15391 15392 if (sfmmup->sfmmu_free) { 15393 /* 15394 * If the process is part of an SCD the sfmmu is unlinked 15395 * from scd_sf_list. 15396 */ 15397 mutex_enter(&scdp->scd_mutex); 15398 sfmmu_from_scd_list(&scdp->scd_sf_list, sfmmup); 15399 mutex_exit(&scdp->scd_mutex); 15400 /* 15401 * Update sfmmu_ttecnt to include the rgn ttecnt for rgns that 15402 * are about to leave the SCD 15403 */ 15404 for (i = 0; i < mmu_page_sizes; i++) { 15405 ASSERT(sfmmup->sfmmu_scdrttecnt[i] == 15406 scdp->scd_rttecnt[i]); 15407 atomic_add_long(&sfmmup->sfmmu_ttecnt[i], 15408 sfmmup->sfmmu_scdrttecnt[i]); 15409 sfmmup->sfmmu_scdrttecnt[i] = 0; 15410 } 15411 sfmmup->sfmmu_scdp = NULL; 15412 15413 SF_SCD_DECR_REF(srdp, scdp); 15414 return; 15415 } 15416 15417 ASSERT(r_type != SFMMU_REGION_ISM || 15418 SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)); 15419 ASSERT(scdp->scd_refcnt); 15420 ASSERT(!sfmmup->sfmmu_free); 15421 ASSERT(sfmmu_hat_lock_held(sfmmup)); 15422 ASSERT(AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 15423 15424 /* 15425 * Wait for ISM maps to be updated. 
15426 */ 15427 if (r_type != SFMMU_REGION_ISM) { 15428 while (SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY) && 15429 sfmmup->sfmmu_scdp != NULL) { 15430 cv_wait(&sfmmup->sfmmu_tsb_cv, 15431 HATLOCK_MUTEXP(hatlockp)); 15432 } 15433 15434 if (sfmmup->sfmmu_scdp == NULL) { 15435 sfmmu_hat_exit(hatlockp); 15436 return; 15437 } 15438 SFMMU_FLAGS_SET(sfmmup, HAT_ISMBUSY); 15439 } 15440 15441 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD)) { 15442 SFMMU_FLAGS_CLEAR(sfmmup, HAT_JOIN_SCD); 15443 /* 15444 * Since HAT_JOIN_SCD was set our context 15445 * is still invalid. 15446 */ 15447 } else { 15448 /* 15449 * For a multi-thread process, we must stop 15450 * all the other threads before leaving the scd. 15451 */ 15452 15453 sfmmu_invalidate_ctx(sfmmup); 15454 } 15455 15456 /* Clear all the rid's for ISM, delete flags, etc */ 15457 ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)); 15458 sfmmu_ism_hatflags(sfmmup, 0); 15459 15460 /* 15461 * Update sfmmu_ttecnt to include the rgn ttecnt for rgns that 15462 * are in SCD before this sfmmup leaves the SCD. 15463 */ 15464 for (i = 0; i < mmu_page_sizes; i++) { 15465 ASSERT(sfmmup->sfmmu_scdrttecnt[i] == 15466 scdp->scd_rttecnt[i]); 15467 atomic_add_long(&sfmmup->sfmmu_ttecnt[i], 15468 sfmmup->sfmmu_scdrttecnt[i]); 15469 sfmmup->sfmmu_scdrttecnt[i] = 0; 15470 /* update ismttecnt to include SCD ism before hat leaves SCD */ 15471 sfmmup->sfmmu_ismttecnt[i] += sfmmup->sfmmu_scdismttecnt[i]; 15472 sfmmup->sfmmu_scdismttecnt[i] = 0; 15473 } 15474 /* update tsb0 inflation count */ 15475 sfmmup->sfmmu_tsb0_4minflcnt += scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt; 15476 15477 if (r_type != SFMMU_REGION_ISM) { 15478 SFMMU_FLAGS_CLEAR(sfmmup, HAT_ISMBUSY); 15479 } 15480 sfmmup->sfmmu_scdp = NULL; 15481 15482 sfmmu_hat_exit(hatlockp); 15483 15484 /* 15485 * Unlink sfmmu from scd_sf_list this can be done without holding 15486 * the hat lock as we hold the sfmmu_as lock which prevents 15487 * hat_join_region from adding this thread to the scd again. 
Other 15488 * threads check if sfmmu_scdp is NULL under hat lock and if it's NULL 15489 * they won't get here, since sfmmu_leave_scd() clears sfmmu_scdp 15490 * while holding the hat lock. 15491 */ 15492 mutex_enter(&scdp->scd_mutex); 15493 sfmmu_from_scd_list(&scdp->scd_sf_list, sfmmup); 15494 mutex_exit(&scdp->scd_mutex); 15495 SFMMU_STAT(sf_leave_scd); 15496 15497 SF_SCD_DECR_REF(srdp, scdp); 15498 hatlockp = sfmmu_hat_enter(sfmmup); 15499 15500 } 15501 15502 /* 15503 * Unlink and free up an SCD structure with a reference count of 0. 15504 */ 15505 static void 15506 sfmmu_destroy_scd(sf_srd_t *srdp, sf_scd_t *scdp, sf_region_map_t *scd_rmap) 15507 { 15508 sfmmu_t *scsfmmup; 15509 sf_scd_t *sp; 15510 hatlock_t *shatlockp; 15511 int i, ret; 15512 15513 mutex_enter(&srdp->srd_scd_mutex); 15514 for (sp = srdp->srd_scdp; sp != NULL; sp = sp->scd_next) { 15515 if (sp == scdp) 15516 break; 15517 } 15518 if (sp == NULL || sp->scd_refcnt) { 15519 mutex_exit(&srdp->srd_scd_mutex); 15520 return; 15521 } 15522 15523 /* 15524 * It is possible that the scd has been freed and reallocated with a 15525 * different region map while we've been waiting for the srd_scd_mutex. 15526 */ 15527 SF_RGNMAP_EQUAL(scd_rmap, &sp->scd_region_map, ret); 15528 if (ret != 1) { 15529 mutex_exit(&srdp->srd_scd_mutex); 15530 return; 15531 } 15532 15533 ASSERT(scdp->scd_sf_list == NULL); 15534 /* 15535 * Unlink scd from srd_scdp list. 15536 */ 15537 sfmmu_remove_scd(&srdp->srd_scdp, scdp); 15538 mutex_exit(&srdp->srd_scd_mutex); 15539 15540 sfmmu_unlink_scd_from_regions(srdp, scdp); 15541 15542 /* Clear shared context tsb and release ctx */ 15543 scsfmmup = scdp->scd_sfmmup; 15544 15545 /* 15546 * create a barrier so that scd will not be destroyed 15547 * if other thread still holds the same shared hat lock. 15548 * E.g., sfmmu_tsbmiss_exception() needs to acquire the 15549 * shared hat lock before checking the shared tsb reloc flag. 
15550 */ 15551 shatlockp = sfmmu_hat_enter(scsfmmup); 15552 sfmmu_hat_exit(shatlockp); 15553 15554 sfmmu_free_scd_tsbs(scsfmmup); 15555 15556 for (i = 0; i < SFMMU_L1_HMERLINKS; i++) { 15557 if (scsfmmup->sfmmu_hmeregion_links[i] != NULL) { 15558 kmem_free(scsfmmup->sfmmu_hmeregion_links[i], 15559 SFMMU_L2_HMERLINKS_SIZE); 15560 scsfmmup->sfmmu_hmeregion_links[i] = NULL; 15561 } 15562 } 15563 kmem_cache_free(sfmmuid_cache, scsfmmup); 15564 kmem_cache_free(scd_cache, scdp); 15565 SFMMU_STAT(sf_destroy_scd); 15566 } 15567 15568 /* 15569 * Modifies the HAT_CTX1_FLAG for each of the ISM segments which correspond to 15570 * bits which are set in the ism_region_map parameter. This flag indicates to 15571 * the tsbmiss handler that mapping for these segments should be loaded using 15572 * the shared context. 15573 */ 15574 static void 15575 sfmmu_ism_hatflags(sfmmu_t *sfmmup, int addflag) 15576 { 15577 sf_scd_t *scdp = sfmmup->sfmmu_scdp; 15578 ism_blk_t *ism_blkp; 15579 ism_map_t *ism_map; 15580 int i, rid; 15581 15582 ASSERT(sfmmup->sfmmu_iblk != NULL); 15583 ASSERT(scdp != NULL); 15584 /* 15585 * Note that the caller either set HAT_ISMBUSY flag or checked 15586 * under hat lock that HAT_ISMBUSY was not set by another thread. 
15587 */ 15588 ASSERT(sfmmu_hat_lock_held(sfmmup)); 15589 15590 ism_blkp = sfmmup->sfmmu_iblk; 15591 while (ism_blkp != NULL) { 15592 ism_map = ism_blkp->iblk_maps; 15593 for (i = 0; ism_map[i].imap_ismhat && i < ISM_MAP_SLOTS; i++) { 15594 rid = ism_map[i].imap_rid; 15595 if (rid == SFMMU_INVALID_ISMRID) { 15596 continue; 15597 } 15598 ASSERT(rid >= 0 && rid < SFMMU_MAX_ISM_REGIONS); 15599 if (SF_RGNMAP_TEST(scdp->scd_ismregion_map, rid) && 15600 addflag) { 15601 ism_map[i].imap_hatflags |= 15602 HAT_CTX1_FLAG; 15603 } else { 15604 ism_map[i].imap_hatflags &= 15605 ~HAT_CTX1_FLAG; 15606 } 15607 } 15608 ism_blkp = ism_blkp->iblk_next; 15609 } 15610 } 15611 15612 static int 15613 sfmmu_srd_lock_held(sf_srd_t *srdp) 15614 { 15615 return (MUTEX_HELD(&srdp->srd_mutex)); 15616 } 15617 15618 /* ARGSUSED */ 15619 static int 15620 sfmmu_scdcache_constructor(void *buf, void *cdrarg, int kmflags) 15621 { 15622 sf_scd_t *scdp = (sf_scd_t *)buf; 15623 15624 bzero(buf, sizeof (sf_scd_t)); 15625 mutex_init(&scdp->scd_mutex, NULL, MUTEX_DEFAULT, NULL); 15626 return (0); 15627 } 15628 15629 /* ARGSUSED */ 15630 static void 15631 sfmmu_scdcache_destructor(void *buf, void *cdrarg) 15632 { 15633 sf_scd_t *scdp = (sf_scd_t *)buf; 15634 15635 mutex_destroy(&scdp->scd_mutex); 15636 } 15637 15638 /* 15639 * The listp parameter is a pointer to a list of hmeblks which are partially 15640 * freed as result of calling sfmmu_hblk_hash_rm(), the last phase of the 15641 * freeing process is to cross-call all cpus to ensure that there are no 15642 * remaining cached references. 15643 * 15644 * If the local generation number is less than the global then we can free 15645 * hmeblks which are already on the pending queue as another cpu has completed 15646 * the cross-call. 
15647 * 15648 * We cross-call to make sure that there are no threads on other cpus accessing 15649 * these hmblks and then complete the process of freeing them under the 15650 * following conditions: 15651 * The total number of pending hmeblks is greater than the threshold 15652 * The reserve list has fewer than HBLK_RESERVE_CNT hmeblks 15653 * It is at least 1 second since the last time we cross-called 15654 * 15655 * Otherwise, we add the hmeblks to the per-cpu pending queue. 15656 */ 15657 static void 15658 sfmmu_hblks_list_purge(struct hme_blk **listp, int dontfree) 15659 { 15660 struct hme_blk *hblkp, *pr_hblkp = NULL; 15661 int count = 0; 15662 cpuset_t cpuset = cpu_ready_set; 15663 cpu_hme_pend_t *cpuhp; 15664 timestruc_t now; 15665 int one_second_expired = 0; 15666 15667 gethrestime_lasttick(&now); 15668 15669 for (hblkp = *listp; hblkp != NULL; hblkp = hblkp->hblk_next) { 15670 ASSERT(hblkp->hblk_shw_bit == 0); 15671 ASSERT(hblkp->hblk_shared == 0); 15672 count++; 15673 pr_hblkp = hblkp; 15674 } 15675 15676 cpuhp = &cpu_hme_pend[CPU->cpu_seqid]; 15677 mutex_enter(&cpuhp->chp_mutex); 15678 15679 if ((cpuhp->chp_count + count) == 0) { 15680 mutex_exit(&cpuhp->chp_mutex); 15681 return; 15682 } 15683 15684 if ((now.tv_sec - cpuhp->chp_timestamp) > 1) { 15685 one_second_expired = 1; 15686 } 15687 15688 if (!dontfree && (freehblkcnt < HBLK_RESERVE_CNT || 15689 (cpuhp->chp_count + count) > cpu_hme_pend_thresh || 15690 one_second_expired)) { 15691 /* Append global list to local */ 15692 if (pr_hblkp == NULL) { 15693 *listp = cpuhp->chp_listp; 15694 } else { 15695 pr_hblkp->hblk_next = cpuhp->chp_listp; 15696 } 15697 cpuhp->chp_listp = NULL; 15698 cpuhp->chp_count = 0; 15699 cpuhp->chp_timestamp = now.tv_sec; 15700 mutex_exit(&cpuhp->chp_mutex); 15701 15702 kpreempt_disable(); 15703 CPUSET_DEL(cpuset, CPU->cpu_id); 15704 xt_sync(cpuset); 15705 xt_sync(cpuset); 15706 kpreempt_enable(); 15707 15708 /* 15709 * At this stage we know that no trap handlers on other 15710 
* cpus can have references to hmeblks on the list. 15711 */ 15712 sfmmu_hblk_free(listp); 15713 } else if (*listp != NULL) { 15714 pr_hblkp->hblk_next = cpuhp->chp_listp; 15715 cpuhp->chp_listp = *listp; 15716 cpuhp->chp_count += count; 15717 *listp = NULL; 15718 mutex_exit(&cpuhp->chp_mutex); 15719 } else { 15720 mutex_exit(&cpuhp->chp_mutex); 15721 } 15722 } 15723 15724 /* 15725 * Add an hmeblk to the the hash list. 15726 */ 15727 void 15728 sfmmu_hblk_hash_add(struct hmehash_bucket *hmebp, struct hme_blk *hmeblkp, 15729 uint64_t hblkpa) 15730 { 15731 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 15732 #ifdef DEBUG 15733 if (hmebp->hmeblkp == NULL) { 15734 ASSERT(hmebp->hmeh_nextpa == HMEBLK_ENDPA); 15735 } 15736 #endif /* DEBUG */ 15737 15738 hmeblkp->hblk_nextpa = hmebp->hmeh_nextpa; 15739 /* 15740 * Since the TSB miss handler now does not lock the hash chain before 15741 * walking it, make sure that the hmeblks nextpa is globally visible 15742 * before we make the hmeblk globally visible by updating the chain root 15743 * pointer in the hash bucket. 15744 */ 15745 membar_producer(); 15746 hmebp->hmeh_nextpa = hblkpa; 15747 hmeblkp->hblk_next = hmebp->hmeblkp; 15748 hmebp->hmeblkp = hmeblkp; 15749 15750 } 15751 15752 /* 15753 * This function is the first part of a 2 part process to remove an hmeblk 15754 * from the hash chain. In this phase we unlink the hmeblk from the hash chain 15755 * but leave the next physical pointer unchanged. The hmeblk is then linked onto 15756 * a per-cpu pending list using the virtual address pointer. 15757 * 15758 * TSB miss trap handlers that start after this phase will no longer see 15759 * this hmeblk. TSB miss handlers that still cache this hmeblk in a register 15760 * can still use it for further chain traversal because we haven't yet modifed 15761 * the next physical pointer or freed it. 15762 * 15763 * In the second phase of hmeblk removal we'll issue a barrier xcall before 15764 * we reuse or free this hmeblk. 
This will make sure all lingering references to 15765 * the hmeblk after first phase disappear before we finally reclaim it. 15766 * This scheme eliminates the need for TSB miss handlers to lock hmeblk chains 15767 * during their traversal. 15768 * 15769 * The hmehash_mutex must be held when calling this function. 15770 * 15771 * Input: 15772 * hmebp - hme hash bucket pointer 15773 * hmeblkp - address of hmeblk to be removed 15774 * pr_hblk - virtual address of previous hmeblkp 15775 * listp - pointer to list of hmeblks linked by virtual address 15776 * free_now flag - indicates that a complete removal from the hash chains 15777 * is necessary. 15778 * 15779 * It is inefficient to use the free_now flag as a cross-call is required to 15780 * remove a single hmeblk from the hash chain but is necessary when hmeblks are 15781 * in short supply. 15782 */ 15783 void 15784 sfmmu_hblk_hash_rm(struct hmehash_bucket *hmebp, struct hme_blk *hmeblkp, 15785 struct hme_blk *pr_hblk, struct hme_blk **listp, 15786 int free_now) 15787 { 15788 int shw_size, vshift; 15789 struct hme_blk *shw_hblkp; 15790 uint_t shw_mask, newshw_mask; 15791 caddr_t vaddr; 15792 int size; 15793 cpuset_t cpuset = cpu_ready_set; 15794 15795 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 15796 15797 if (hmebp->hmeblkp == hmeblkp) { 15798 hmebp->hmeh_nextpa = hmeblkp->hblk_nextpa; 15799 hmebp->hmeblkp = hmeblkp->hblk_next; 15800 } else { 15801 pr_hblk->hblk_nextpa = hmeblkp->hblk_nextpa; 15802 pr_hblk->hblk_next = hmeblkp->hblk_next; 15803 } 15804 15805 size = get_hblk_ttesz(hmeblkp); 15806 shw_hblkp = hmeblkp->hblk_shadow; 15807 if (shw_hblkp) { 15808 ASSERT(hblktosfmmu(hmeblkp) != KHATID); 15809 ASSERT(!hmeblkp->hblk_shared); 15810 #ifdef DEBUG 15811 if (mmu_page_sizes == max_mmu_page_sizes) { 15812 ASSERT(size < TTE256M); 15813 } else { 15814 ASSERT(size < TTE4M); 15815 } 15816 #endif /* DEBUG */ 15817 15818 shw_size = get_hblk_ttesz(shw_hblkp); 15819 vaddr = (caddr_t)get_hblk_base(hmeblkp); 15820 vshift = 
vaddr_to_vshift(shw_hblkp->hblk_tag, vaddr, shw_size); 15821 ASSERT(vshift < 8); 15822 /* 15823 * Atomically clear shadow mask bit 15824 */ 15825 do { 15826 shw_mask = shw_hblkp->hblk_shw_mask; 15827 ASSERT(shw_mask & (1 << vshift)); 15828 newshw_mask = shw_mask & ~(1 << vshift); 15829 newshw_mask = cas32(&shw_hblkp->hblk_shw_mask, 15830 shw_mask, newshw_mask); 15831 } while (newshw_mask != shw_mask); 15832 hmeblkp->hblk_shadow = NULL; 15833 } 15834 hmeblkp->hblk_shw_bit = 0; 15835 15836 if (hmeblkp->hblk_shared) { 15837 #ifdef DEBUG 15838 sf_srd_t *srdp; 15839 sf_region_t *rgnp; 15840 uint_t rid; 15841 15842 srdp = hblktosrd(hmeblkp); 15843 ASSERT(srdp != NULL && srdp->srd_refcnt != 0); 15844 rid = hmeblkp->hblk_tag.htag_rid; 15845 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 15846 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 15847 rgnp = srdp->srd_hmergnp[rid]; 15848 ASSERT(rgnp != NULL); 15849 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid); 15850 #endif /* DEBUG */ 15851 hmeblkp->hblk_shared = 0; 15852 } 15853 if (free_now) { 15854 kpreempt_disable(); 15855 CPUSET_DEL(cpuset, CPU->cpu_id); 15856 xt_sync(cpuset); 15857 xt_sync(cpuset); 15858 kpreempt_enable(); 15859 15860 hmeblkp->hblk_nextpa = HMEBLK_ENDPA; 15861 hmeblkp->hblk_next = NULL; 15862 } else { 15863 /* Append hmeblkp to listp for processing later. */ 15864 hmeblkp->hblk_next = *listp; 15865 *listp = hmeblkp; 15866 } 15867 } 15868 15869 /* 15870 * This routine is called when memory is in short supply and returns a free 15871 * hmeblk of the requested size from the cpu pending lists. 
15872 */ 15873 static struct hme_blk * 15874 sfmmu_check_pending_hblks(int size) 15875 { 15876 int i; 15877 struct hme_blk *hmeblkp = NULL, *last_hmeblkp; 15878 int found_hmeblk; 15879 cpuset_t cpuset = cpu_ready_set; 15880 cpu_hme_pend_t *cpuhp; 15881 15882 /* Flush cpu hblk pending queues */ 15883 for (i = 0; i < NCPU; i++) { 15884 cpuhp = &cpu_hme_pend[i]; 15885 if (cpuhp->chp_listp != NULL) { 15886 mutex_enter(&cpuhp->chp_mutex); 15887 if (cpuhp->chp_listp == NULL) { 15888 mutex_exit(&cpuhp->chp_mutex); 15889 continue; 15890 } 15891 found_hmeblk = 0; 15892 last_hmeblkp = NULL; 15893 for (hmeblkp = cpuhp->chp_listp; hmeblkp != NULL; 15894 hmeblkp = hmeblkp->hblk_next) { 15895 if (get_hblk_ttesz(hmeblkp) == size) { 15896 if (last_hmeblkp == NULL) { 15897 cpuhp->chp_listp = 15898 hmeblkp->hblk_next; 15899 } else { 15900 last_hmeblkp->hblk_next = 15901 hmeblkp->hblk_next; 15902 } 15903 ASSERT(cpuhp->chp_count > 0); 15904 cpuhp->chp_count--; 15905 found_hmeblk = 1; 15906 break; 15907 } else { 15908 last_hmeblkp = hmeblkp; 15909 } 15910 } 15911 mutex_exit(&cpuhp->chp_mutex); 15912 15913 if (found_hmeblk) { 15914 kpreempt_disable(); 15915 CPUSET_DEL(cpuset, CPU->cpu_id); 15916 xt_sync(cpuset); 15917 xt_sync(cpuset); 15918 kpreempt_enable(); 15919 return (hmeblkp); 15920 } 15921 } 15922 } 15923 return (NULL); 15924 } 15925