1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * VM - Hardware Address Translation management for Spitfire MMU. 28 * 29 * This file implements the machine specific hardware translation 30 * needed by the VM system. The machine independent interface is 31 * described in <vm/hat.h> while the machine dependent interface 32 * and data structures are described in <vm/hat_sfmmu.h>. 33 * 34 * The hat layer manages the address translation hardware as a cache 35 * driven by calls from the higher levels in the VM system. 36 */ 37 38 #include <sys/types.h> 39 #include <sys/kstat.h> 40 #include <vm/hat.h> 41 #include <vm/hat_sfmmu.h> 42 #include <vm/page.h> 43 #include <sys/pte.h> 44 #include <sys/systm.h> 45 #include <sys/mman.h> 46 #include <sys/sysmacros.h> 47 #include <sys/machparam.h> 48 #include <sys/vtrace.h> 49 #include <sys/kmem.h> 50 #include <sys/mmu.h> 51 #include <sys/cmn_err.h> 52 #include <sys/cpu.h> 53 #include <sys/cpuvar.h> 54 #include <sys/debug.h> 55 #include <sys/lgrp.h> 56 #include <sys/archsystm.h> 57 #include <sys/machsystm.h> 58 #include <sys/vmsystm.h> 59 #include <vm/as.h> 60 #include <vm/seg.h> 61 #include <vm/seg_kp.h> 62 #include <vm/seg_kmem.h> 63 #include <vm/seg_kpm.h> 64 #include <vm/rm.h> 65 #include <sys/t_lock.h> 66 #include <sys/obpdefs.h> 67 #include <sys/vm_machparam.h> 68 #include <sys/var.h> 69 #include <sys/trap.h> 70 #include <sys/machtrap.h> 71 #include <sys/scb.h> 72 #include <sys/bitmap.h> 73 #include <sys/machlock.h> 74 #include <sys/membar.h> 75 #include <sys/atomic.h> 76 #include <sys/cpu_module.h> 77 #include <sys/prom_debug.h> 78 #include <sys/ksynch.h> 79 #include <sys/mem_config.h> 80 #include <sys/mem_cage.h> 81 #include <vm/vm_dep.h> 82 #include <vm/xhat_sfmmu.h> 83 #include <sys/fpu/fpusystm.h> 84 #include <vm/mach_kpm.h> 85 #include <sys/callb.h> 86 87 #ifdef DEBUG 88 #define SFMMU_VALIDATE_HMERID(hat, rid, saddr, len) \ 89 if (SFMMU_IS_SHMERID_VALID(rid)) { \ 90 caddr_t _eaddr = (saddr) + (len); \ 91 sf_srd_t *_srdp; \ 92 sf_region_t *_rgnp; \ 93 ASSERT((rid) < SFMMU_MAX_HME_REGIONS); \ 94 ASSERT(SF_RGNMAP_TEST(hat->sfmmu_hmeregion_map, rid)); \ 95 ASSERT((hat) != ksfmmup); \ 96 _srdp = (hat)->sfmmu_srdp; \ 97 ASSERT(_srdp != NULL); \ 98 ASSERT(_srdp->srd_refcnt != 0); \ 99 _rgnp = _srdp->srd_hmergnp[(rid)]; \ 100 ASSERT(_rgnp != NULL && _rgnp->rgn_id == rid); \ 101 ASSERT(_rgnp->rgn_refcnt != 0); \ 102 ASSERT(!(_rgnp->rgn_flags & SFMMU_REGION_FREE)); \ 103 ASSERT((_rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == \ 104 SFMMU_REGION_HME); \ 105 ASSERT((saddr) >= _rgnp->rgn_saddr); \ 106 ASSERT((saddr) < _rgnp->rgn_saddr + 
_rgnp->rgn_size); \ 107 ASSERT(_eaddr > _rgnp->rgn_saddr); \ 108 ASSERT(_eaddr <= _rgnp->rgn_saddr + _rgnp->rgn_size); \ 109 } 110 111 #define SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid) \ 112 { \ 113 caddr_t _hsva; \ 114 caddr_t _heva; \ 115 caddr_t _rsva; \ 116 caddr_t _reva; \ 117 int _ttesz = get_hblk_ttesz(hmeblkp); \ 118 int _flagtte; \ 119 ASSERT((srdp)->srd_refcnt != 0); \ 120 ASSERT((rid) < SFMMU_MAX_HME_REGIONS); \ 121 ASSERT((rgnp)->rgn_id == rid); \ 122 ASSERT(!((rgnp)->rgn_flags & SFMMU_REGION_FREE)); \ 123 ASSERT(((rgnp)->rgn_flags & SFMMU_REGION_TYPE_MASK) == \ 124 SFMMU_REGION_HME); \ 125 ASSERT(_ttesz <= (rgnp)->rgn_pgszc); \ 126 _hsva = (caddr_t)get_hblk_base(hmeblkp); \ 127 _heva = get_hblk_endaddr(hmeblkp); \ 128 _rsva = (caddr_t)P2ALIGN( \ 129 (uintptr_t)(rgnp)->rgn_saddr, HBLK_MIN_BYTES); \ 130 _reva = (caddr_t)P2ROUNDUP( \ 131 (uintptr_t)((rgnp)->rgn_saddr + (rgnp)->rgn_size), \ 132 HBLK_MIN_BYTES); \ 133 ASSERT(_hsva >= _rsva); \ 134 ASSERT(_hsva < _reva); \ 135 ASSERT(_heva > _rsva); \ 136 ASSERT(_heva <= _reva); \ 137 _flagtte = (_ttesz < HBLK_MIN_TTESZ) ? HBLK_MIN_TTESZ : \ 138 _ttesz; \ 139 ASSERT(rgnp->rgn_hmeflags & (0x1 << _flagtte)); \ 140 } 141 142 #else /* DEBUG */ 143 #define SFMMU_VALIDATE_HMERID(hat, rid, addr, len) 144 #define SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid) 145 #endif /* DEBUG */ 146 147 #if defined(SF_ERRATA_57) 148 extern caddr_t errata57_limit; 149 #endif 150 151 #define HME8BLK_SZ_RND ((roundup(HME8BLK_SZ, sizeof (int64_t))) / \ 152 (sizeof (int64_t))) 153 #define HBLK_RESERVE ((struct hme_blk *)hblk_reserve) 154 155 #define HBLK_RESERVE_CNT 128 156 #define HBLK_RESERVE_MIN 20 157 158 static struct hme_blk *freehblkp; 159 static kmutex_t freehblkp_lock; 160 static int freehblkcnt; 161 162 static int64_t hblk_reserve[HME8BLK_SZ_RND]; 163 static kmutex_t hblk_reserve_lock; 164 static kthread_t *hblk_reserve_thread; 165 166 static nucleus_hblk8_info_t nucleus_hblk8; 167 static nucleus_hblk1_info_t nucleus_hblk1; 168 169 /* 170 * Data to manage per-cpu hmeblk pending queues, hmeblks are queued here 171 * after the initial phase of removing an hmeblk from the hash chain, see 172 * the detailed comment in sfmmu_hblk_hash_rm() for further details. 173 */ 174 static cpu_hme_pend_t *cpu_hme_pend; 175 static uint_t cpu_hme_pend_thresh; 176 /* 177 * SFMMU specific hat functions 178 */ 179 void hat_pagecachectl(struct page *, int); 180 181 /* flags for hat_pagecachectl */ 182 #define HAT_CACHE 0x1 183 #define HAT_UNCACHE 0x2 184 #define HAT_TMPNC 0x4 185 186 /* 187 * This flag is set to 0 via the MD in platforms that do not support 188 * I-cache coherency in hardware. Used to enable "soft exec" mode. 189 * The MD "coherency" property is optional, and defaults to 1 (because 190 * coherent I-cache is the norm.) 191 */ 192 uint_t icache_is_coherent = 1; 193 194 /* 195 * Flag to allow the creation of non-cacheable translations 196 * to system memory. It is off by default. At the moment this 197 * flag is used by the ecache error injector. The error injector 198 * will turn it on when creating such a translation then shut it 199 * off when it's finished. 200 */ 201 202 int sfmmu_allow_nc_trans = 0; 203 204 /* 205 * Flag to disable large page support. 206 * value of 1 => disable all large pages. 207 * bits 1, 2, and 3 are to disable 64K, 512K and 4M pages respectively. 208 * 209 * For example, use the value 0x4 to disable 512K pages. 
210 * 211 */ 212 #define LARGE_PAGES_OFF 0x1 213 214 /* 215 * The disable_large_pages and disable_ism_large_pages variables control 216 * hat_memload_array and the page sizes to be used by ISM and the kernel. 217 * 218 * The disable_auto_data_large_pages and disable_auto_text_large_pages variables 219 * are only used to control which OOB pages to use at upper VM segment creation 220 * time, and are set in hat_init_pagesizes and used in the map_pgsz* routines. 221 * Their values may come from platform or CPU specific code to disable page 222 * sizes that should not be used. 223 * 224 * WARNING: 512K pages are currently not supported for ISM/DISM. 225 */ 226 uint_t disable_large_pages = 0; 227 uint_t disable_ism_large_pages = (1 << TTE512K); 228 uint_t disable_auto_data_large_pages = 0; 229 uint_t disable_auto_text_large_pages = 0; 230 uint_t disable_shctx_large_pages = 0; 231 232 /* 233 * Private sfmmu data structures for hat management 234 */ 235 static struct kmem_cache *sfmmuid_cache; 236 static struct kmem_cache *mmuctxdom_cache; 237 238 /* 239 * Private sfmmu data structures for tsb management 240 */ 241 static struct kmem_cache *sfmmu_tsbinfo_cache; 242 static struct kmem_cache *sfmmu_tsb8k_cache; 243 static struct kmem_cache *sfmmu_tsb_cache[NLGRPS_MAX]; 244 static vmem_t *kmem_bigtsb_arena; 245 static vmem_t *kmem_tsb_arena; 246 247 /* 248 * sfmmu static variables for hmeblk resource management. 249 */ 250 static vmem_t *hat_memload1_arena; /* HAT translation arena for sfmmu1_cache */ 251 static struct kmem_cache *sfmmu8_cache; 252 static struct kmem_cache *sfmmu1_cache; 253 static struct kmem_cache *pa_hment_cache; 254 255 static kmutex_t ism_mlist_lock; /* mutex for ism mapping list */ 256 /* 257 * private data for ism 258 */ 259 static struct kmem_cache *ism_blk_cache; 260 static struct kmem_cache *ism_ment_cache; 261 #define ISMID_STARTADDR NULL 262 263 /* 264 * Region management data structures and function declarations. 265 */ 266 267 static void sfmmu_leave_srd(sfmmu_t *); 268 static int sfmmu_srdcache_constructor(void *, void *, int); 269 static void sfmmu_srdcache_destructor(void *, void *); 270 static int sfmmu_rgncache_constructor(void *, void *, int); 271 static void sfmmu_rgncache_destructor(void *, void *); 272 static int sfrgnmap_isnull(sf_region_map_t *); 273 static int sfhmergnmap_isnull(sf_hmeregion_map_t *); 274 static int sfmmu_scdcache_constructor(void *, void *, int); 275 static void sfmmu_scdcache_destructor(void *, void *); 276 static void sfmmu_rgn_cb_noop(caddr_t, caddr_t, caddr_t, 277 size_t, void *, u_offset_t); 278 279 static uint_t srd_hashmask = SFMMU_MAX_SRD_BUCKETS - 1; 280 static sf_srd_bucket_t *srd_buckets; 281 static struct kmem_cache *srd_cache; 282 static uint_t srd_rgn_hashmask = SFMMU_MAX_REGION_BUCKETS - 1; 283 static struct kmem_cache *region_cache; 284 static struct kmem_cache *scd_cache; 285 286 #ifdef sun4v 287 int use_bigtsb_arena = 1; 288 #else 289 int use_bigtsb_arena = 0; 290 #endif 291 292 /* External /etc/system tunable, for turning on&off the shctx support */ 293 int disable_shctx = 0; 294 /* Internal variable, set by MD if the HW supports shctx feature */ 295 int shctx_on = 0; 296 297 /* Internal variable, set by MD if the HW supports the search order register */ 298 int pgsz_search_on = 0; 299 /* 300 * External /etc/system tunable, for controlling search order register 301 * support. 
302 */ 303 int disable_pgsz_search = 0; 304 305 #ifdef DEBUG 306 static void check_scd_sfmmu_list(sfmmu_t **, sfmmu_t *, int); 307 #endif 308 static void sfmmu_to_scd_list(sfmmu_t **, sfmmu_t *); 309 static void sfmmu_from_scd_list(sfmmu_t **, sfmmu_t *); 310 311 static sf_scd_t *sfmmu_alloc_scd(sf_srd_t *, sf_region_map_t *); 312 static void sfmmu_find_scd(sfmmu_t *); 313 static void sfmmu_join_scd(sf_scd_t *, sfmmu_t *); 314 static void sfmmu_finish_join_scd(sfmmu_t *); 315 static void sfmmu_leave_scd(sfmmu_t *, uchar_t); 316 static void sfmmu_destroy_scd(sf_srd_t *, sf_scd_t *, sf_region_map_t *); 317 static int sfmmu_alloc_scd_tsbs(sf_srd_t *, sf_scd_t *); 318 static void sfmmu_free_scd_tsbs(sfmmu_t *); 319 static void sfmmu_tsb_inv_ctx(sfmmu_t *); 320 static int find_ism_rid(sfmmu_t *, sfmmu_t *, caddr_t, uint_t *); 321 static void sfmmu_ism_hatflags(sfmmu_t *, int); 322 static int sfmmu_srd_lock_held(sf_srd_t *); 323 static void sfmmu_remove_scd(sf_scd_t **, sf_scd_t *); 324 static void sfmmu_add_scd(sf_scd_t **headp, sf_scd_t *); 325 static void sfmmu_link_scd_to_regions(sf_srd_t *, sf_scd_t *); 326 static void sfmmu_unlink_scd_from_regions(sf_srd_t *, sf_scd_t *); 327 static void sfmmu_link_to_hmeregion(sfmmu_t *, sf_region_t *); 328 static void sfmmu_unlink_from_hmeregion(sfmmu_t *, sf_region_t *); 329 330 /* 331 * ``hat_lock'' is a hashed mutex lock for protecting sfmmu TSB lists, 332 * HAT flags, synchronizing TLB/TSB coherency, and context management. 333 * The lock is hashed on the sfmmup since the case where we need to lock 334 * all processes is rare but does occur (e.g. we need to unload a shared 335 * mapping from all processes using the mapping). We have a lot of buckets, 336 * and each slab of sfmmu_t's can use about a quarter of them, giving us 337 * a fairly good distribution without wasting too much space and overhead 338 * when we have to grab them all. 339 */ 340 #define SFMMU_NUM_LOCK 128 /* must be power of two */ 341 hatlock_t hat_lock[SFMMU_NUM_LOCK]; 342 343 /* 344 * Hash algorithm optimized for a small number of slabs. 345 * 7 is (highbit((sizeof sfmmu_t)) - 1) 346 * This hash algorithm is based upon the knowledge that sfmmu_t's come from a 347 * kmem_cache, and thus they will be sequential within that cache. In 348 * addition, each new slab will have a different "color" up to cache_maxcolor 349 * which will skew the hashing for each successive slab which is allocated. 350 * If the size of sfmmu_t changed to a larger size, this algorithm may need 351 * to be revisited. 352 */ 353 #define TSB_HASH_SHIFT_BITS (7) 354 #define PTR_HASH(x) ((uintptr_t)x >> TSB_HASH_SHIFT_BITS) 355 356 #ifdef DEBUG 357 int tsb_hash_debug = 0; 358 #define TSB_HASH(sfmmup) \ 359 (tsb_hash_debug ? &hat_lock[0] : \ 360 &hat_lock[PTR_HASH(sfmmup) & (SFMMU_NUM_LOCK-1)]) 361 #else /* DEBUG */ 362 #define TSB_HASH(sfmmup) &hat_lock[PTR_HASH(sfmmup) & (SFMMU_NUM_LOCK-1)] 363 #endif /* DEBUG */ 364 365 366 /* sfmmu_replace_tsb() return codes. */ 367 typedef enum tsb_replace_rc { 368 TSB_SUCCESS, 369 TSB_ALLOCFAIL, 370 TSB_LOSTRACE, 371 TSB_ALREADY_SWAPPED, 372 TSB_CANTGROW 373 } tsb_replace_rc_t; 374 375 /* 376 * Flags for TSB allocation routines. 377 */ 378 #define TSB_ALLOC 0x01 379 #define TSB_FORCEALLOC 0x02 380 #define TSB_GROW 0x04 381 #define TSB_SHRINK 0x08 382 #define TSB_SWAPIN 0x10 383 384 /* 385 * Support for HAT callbacks. 
386 */ 387 #define SFMMU_MAX_RELOC_CALLBACKS 10 388 int sfmmu_max_cb_id = SFMMU_MAX_RELOC_CALLBACKS; 389 static id_t sfmmu_cb_nextid = 0; 390 static id_t sfmmu_tsb_cb_id; 391 struct sfmmu_callback *sfmmu_cb_table; 392 393 /* 394 * Kernel page relocation is enabled by default for non-caged 395 * kernel pages. This has little effect unless segkmem_reloc is 396 * set, since by default kernel memory comes from inside the 397 * kernel cage. 398 */ 399 int hat_kpr_enabled = 1; 400 401 kmutex_t kpr_mutex; 402 kmutex_t kpr_suspendlock; 403 kthread_t *kreloc_thread; 404 405 /* 406 * Enable VA->PA translation sanity checking on DEBUG kernels. 407 * Disabled by default. This is incompatible with some 408 * drivers (error injector, RSM) so if it breaks you get 409 * to keep both pieces. 410 */ 411 int hat_check_vtop = 0; 412 413 /* 414 * Private sfmmu routines (prototypes) 415 */ 416 static struct hme_blk *sfmmu_shadow_hcreate(sfmmu_t *, caddr_t, int, uint_t); 417 static struct hme_blk *sfmmu_hblk_alloc(sfmmu_t *, caddr_t, 418 struct hmehash_bucket *, uint_t, hmeblk_tag, uint_t, 419 uint_t); 420 static caddr_t sfmmu_hblk_unload(struct hat *, struct hme_blk *, caddr_t, 421 caddr_t, demap_range_t *, uint_t); 422 static caddr_t sfmmu_hblk_sync(struct hat *, struct hme_blk *, caddr_t, 423 caddr_t, int); 424 static void sfmmu_hblk_free(struct hme_blk **); 425 static void sfmmu_hblks_list_purge(struct hme_blk **, int); 426 static uint_t sfmmu_get_free_hblk(struct hme_blk **, uint_t); 427 static uint_t sfmmu_put_free_hblk(struct hme_blk *, uint_t); 428 static struct hme_blk *sfmmu_hblk_steal(int); 429 static int sfmmu_steal_this_hblk(struct hmehash_bucket *, 430 struct hme_blk *, uint64_t, struct hme_blk *); 431 static caddr_t sfmmu_hblk_unlock(struct hme_blk *, caddr_t, caddr_t); 432 433 static void hat_do_memload_array(struct hat *, caddr_t, size_t, 434 struct page **, uint_t, uint_t, uint_t); 435 static void hat_do_memload(struct hat *, caddr_t, struct page *, 436 uint_t, uint_t, uint_t); 437 static void sfmmu_memload_batchsmall(struct hat *, caddr_t, page_t **, 438 uint_t, uint_t, pgcnt_t, uint_t); 439 void sfmmu_tteload(struct hat *, tte_t *, caddr_t, page_t *, 440 uint_t); 441 static int sfmmu_tteload_array(sfmmu_t *, tte_t *, caddr_t, page_t **, 442 uint_t, uint_t); 443 static struct hmehash_bucket *sfmmu_tteload_acquire_hashbucket(sfmmu_t *, 444 caddr_t, int, uint_t); 445 static struct hme_blk *sfmmu_tteload_find_hmeblk(sfmmu_t *, 446 struct hmehash_bucket *, caddr_t, uint_t, uint_t, 447 uint_t); 448 static int sfmmu_tteload_addentry(sfmmu_t *, struct hme_blk *, tte_t *, 449 caddr_t, page_t **, uint_t, uint_t); 450 static void sfmmu_tteload_release_hashbucket(struct hmehash_bucket *); 451 452 static int sfmmu_pagearray_setup(caddr_t, page_t **, tte_t *, int); 453 static pfn_t sfmmu_uvatopfn(caddr_t, sfmmu_t *, tte_t *); 454 void sfmmu_memtte(tte_t *, pfn_t, uint_t, int); 455 #ifdef VAC 456 static void sfmmu_vac_conflict(struct hat *, caddr_t, page_t *); 457 static int sfmmu_vacconflict_array(caddr_t, page_t *, int *); 458 int tst_tnc(page_t *pp, pgcnt_t); 459 void conv_tnc(page_t *pp, int); 460 #endif 461 462 static void sfmmu_get_ctx(sfmmu_t *); 463 static void sfmmu_free_sfmmu(sfmmu_t *); 464 465 static void sfmmu_ttesync(struct hat *, caddr_t, tte_t *, page_t *); 466 static void sfmmu_chgattr(struct hat *, caddr_t, size_t, uint_t, int); 467 468 cpuset_t sfmmu_pageunload(page_t *, struct sf_hment *, int); 469 static void hat_pagereload(struct page *, struct page *); 470 static cpuset_t 
sfmmu_pagesync(page_t *, struct sf_hment *, uint_t); 471 #ifdef VAC 472 void sfmmu_page_cache_array(page_t *, int, int, pgcnt_t); 473 static void sfmmu_page_cache(page_t *, int, int, int); 474 #endif 475 476 cpuset_t sfmmu_rgntlb_demap(caddr_t, sf_region_t *, 477 struct hme_blk *, int); 478 static void sfmmu_tlbcache_demap(caddr_t, sfmmu_t *, struct hme_blk *, 479 pfn_t, int, int, int, int); 480 static void sfmmu_ismtlbcache_demap(caddr_t, sfmmu_t *, struct hme_blk *, 481 pfn_t, int); 482 static void sfmmu_tlb_demap(caddr_t, sfmmu_t *, struct hme_blk *, int, int); 483 static void sfmmu_tlb_range_demap(demap_range_t *); 484 static void sfmmu_sync_mmustate(sfmmu_t *); 485 486 static void sfmmu_tsbinfo_setup_phys(struct tsb_info *, pfn_t); 487 static int sfmmu_tsbinfo_alloc(struct tsb_info **, int, int, uint_t, 488 sfmmu_t *); 489 static void sfmmu_tsb_free(struct tsb_info *); 490 static void sfmmu_tsbinfo_free(struct tsb_info *); 491 static int sfmmu_init_tsbinfo(struct tsb_info *, int, int, uint_t, 492 sfmmu_t *); 493 static void sfmmu_tsb_chk_reloc(sfmmu_t *, hatlock_t *); 494 static void sfmmu_tsb_swapin(sfmmu_t *, hatlock_t *); 495 static int sfmmu_select_tsb_szc(pgcnt_t); 496 static void sfmmu_mod_tsb(sfmmu_t *, caddr_t, tte_t *, int); 497 #define sfmmu_load_tsb(sfmmup, vaddr, tte, szc) \ 498 sfmmu_mod_tsb(sfmmup, vaddr, tte, szc) 499 #define sfmmu_unload_tsb(sfmmup, vaddr, szc) \ 500 sfmmu_mod_tsb(sfmmup, vaddr, NULL, szc) 501 static void sfmmu_copy_tsb(struct tsb_info *, struct tsb_info *); 502 static tsb_replace_rc_t sfmmu_replace_tsb(sfmmu_t *, struct tsb_info *, uint_t, 503 hatlock_t *, uint_t); 504 static void sfmmu_size_tsb(sfmmu_t *, int, uint64_t, uint64_t, int); 505 506 #ifdef VAC 507 void sfmmu_cache_flush(pfn_t, int); 508 void sfmmu_cache_flushcolor(int, pfn_t); 509 #endif 510 static caddr_t sfmmu_hblk_chgattr(sfmmu_t *, struct hme_blk *, caddr_t, 511 caddr_t, demap_range_t *, uint_t, int); 512 513 static uint64_t sfmmu_vtop_attr(uint_t, int mode, tte_t *); 514 static uint_t sfmmu_ptov_attr(tte_t *); 515 static caddr_t sfmmu_hblk_chgprot(sfmmu_t *, struct hme_blk *, caddr_t, 516 caddr_t, demap_range_t *, uint_t); 517 static uint_t sfmmu_vtop_prot(uint_t, uint_t *); 518 static int sfmmu_idcache_constructor(void *, void *, int); 519 static void sfmmu_idcache_destructor(void *, void *); 520 static int sfmmu_hblkcache_constructor(void *, void *, int); 521 static void sfmmu_hblkcache_destructor(void *, void *); 522 static void sfmmu_hblkcache_reclaim(void *); 523 static void sfmmu_shadow_hcleanup(sfmmu_t *, struct hme_blk *, 524 struct hmehash_bucket *); 525 static void sfmmu_hblk_hash_rm(struct hmehash_bucket *, struct hme_blk *, 526 struct hme_blk *, struct hme_blk **, int); 527 static void sfmmu_hblk_hash_add(struct hmehash_bucket *, struct hme_blk *, 528 uint64_t); 529 static struct hme_blk *sfmmu_check_pending_hblks(int); 530 static void sfmmu_free_hblks(sfmmu_t *, caddr_t, caddr_t, int); 531 static void sfmmu_cleanup_rhblk(sf_srd_t *, caddr_t, uint_t, int); 532 static void sfmmu_unload_hmeregion_va(sf_srd_t *, uint_t, caddr_t, caddr_t, 533 int, caddr_t *); 534 static void sfmmu_unload_hmeregion(sf_srd_t *, sf_region_t *); 535 536 static void sfmmu_rm_large_mappings(page_t *, int); 537 538 static void hat_lock_init(void); 539 static void hat_kstat_init(void); 540 static int sfmmu_kstat_percpu_update(kstat_t *ksp, int rw); 541 static void sfmmu_set_scd_rttecnt(sf_srd_t *, sf_scd_t *); 542 static int sfmmu_is_rgnva(sf_srd_t *, caddr_t, ulong_t, ulong_t); 543 static void 
sfmmu_check_page_sizes(sfmmu_t *, int);
int fnd_mapping_sz(page_t *);
static void iment_add(struct ism_ment *, struct hat *);
static void iment_sub(struct ism_ment *, struct hat *);
static pgcnt_t ism_tsb_entries(sfmmu_t *, int szc);
extern void sfmmu_setup_tsbinfo(sfmmu_t *);
extern void sfmmu_clear_utsbinfo(void);

static void sfmmu_ctx_wrap_around(mmu_ctx_t *);

extern int vpm_enable;

/* kpm globals */
#ifdef DEBUG
/*
 * Enable trap level tsbmiss handling
 */
int kpm_tsbmtl = 1;

/*
 * Flush the TLB on kpm mapout. Note: Xcalls are used (again) for the
 * required TLB shootdowns in this case, so handle w/ care. Off by default.
 */
int kpm_tlb_flush;
#endif /* DEBUG */

static void *sfmmu_vmem_xalloc_aligned_wrapper(vmem_t *, size_t, int);

#ifdef DEBUG
static void sfmmu_check_hblk_flist();
#endif

/*
 * Semi-private sfmmu data structures. Some of them are initialized in
 * startup or in hat_init. Some of them are private but accessed by
 * assembly code or mach_sfmmu.c
 */
struct hmehash_bucket *uhme_hash;	/* user hmeblk hash table */
struct hmehash_bucket *khme_hash;	/* kernel hmeblk hash table */
uint64_t uhme_hash_pa;		/* PA of uhme_hash */
uint64_t khme_hash_pa;		/* PA of khme_hash */
int uhmehash_num;		/* # of buckets in user hash table */
int khmehash_num;		/* # of buckets in kernel hash table */

uint_t max_mmu_ctxdoms = 0;	/* max context domains in the system */
mmu_ctx_t **mmu_ctxs_tbl;	/* global array of context domains */
uint64_t mmu_saved_gnum = 0;	/* to init incoming MMUs' gnums */

#define DEFAULT_NUM_CTXS_PER_MMU 8192
uint_t nctxs = DEFAULT_NUM_CTXS_PER_MMU;

int cache;			/* describes system cache */

caddr_t ktsb_base;		/* kernel 8k-indexed tsb base address */
uint64_t ktsb_pbase;		/* kernel 8k-indexed tsb phys address */
int ktsb_szcode;		/* kernel 8k-indexed tsb size code */
int ktsb_sz;			/* kernel 8k-indexed tsb size */

caddr_t ktsb4m_base;		/* kernel 4m-indexed tsb base address */
uint64_t ktsb4m_pbase;		/* kernel 4m-indexed tsb phys address */
int ktsb4m_szcode;		/* kernel 4m-indexed tsb size code */
int ktsb4m_sz;			/* kernel 4m-indexed tsb size */

uint64_t kpm_tsbbase;		/* kernel seg_kpm 4M TSB base address */
int kpm_tsbsz;			/* kernel seg_kpm 4M TSB size code */
uint64_t kpmsm_tsbbase;		/* kernel seg_kpm 8K TSB base address */
int kpmsm_tsbsz;		/* kernel seg_kpm 8K TSB size code */

#ifndef sun4v
int utsb_dtlb_ttenum = -1;	/* index in TLB for utsb locked TTE */
int utsb4m_dtlb_ttenum = -1;	/* index in TLB for 4M TSB TTE */
int dtlb_resv_ttenum;		/* index in TLB of first reserved TTE */
caddr_t utsb_vabase;		/* reserved kernel virtual memory */
caddr_t utsb4m_vabase;		/* for trap handler TSB accesses */
#endif /* sun4v */
uint64_t tsb_alloc_bytes = 0;	/* bytes allocated to TSBs */
vmem_t *kmem_tsb_default_arena[NLGRPS_MAX];	/* For dynamic TSBs */
vmem_t *kmem_bigtsb_default_arena[NLGRPS_MAX];	/* dynamic 256M TSBs */

/*
 * Size to use for TSB slabs. Future platforms that support page sizes
 * larger than 4M may wish to change these values, and provide their own
 * assembly macros for building and decoding the TSB base register contents.
 * Note disable_large_pages will override the value set here.
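 *
 * For illustration (assuming the usual 16-byte tsbe entries): a single
 * 4M slab can back a TSB of at most 4M / 16 = 256K entries, which is
 * one reason tsb_max_growsize is later clamped so that a TSB never
 * outgrows a single slab.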
627 */ 628 static uint_t tsb_slab_ttesz = TTE4M; 629 size_t tsb_slab_size = MMU_PAGESIZE4M; 630 uint_t tsb_slab_shift = MMU_PAGESHIFT4M; 631 /* PFN mask for TTE */ 632 size_t tsb_slab_mask = MMU_PAGEOFFSET4M >> MMU_PAGESHIFT; 633 634 /* 635 * Size to use for TSB slabs. These are used only when 256M tsb arenas 636 * exist. 637 */ 638 static uint_t bigtsb_slab_ttesz = TTE256M; 639 static size_t bigtsb_slab_size = MMU_PAGESIZE256M; 640 static uint_t bigtsb_slab_shift = MMU_PAGESHIFT256M; 641 /* 256M page alignment for 8K pfn */ 642 static size_t bigtsb_slab_mask = MMU_PAGEOFFSET256M >> MMU_PAGESHIFT; 643 644 /* largest TSB size to grow to, will be smaller on smaller memory systems */ 645 static int tsb_max_growsize = 0; 646 647 /* 648 * Tunable parameters dealing with TSB policies. 649 */ 650 651 /* 652 * This undocumented tunable forces all 8K TSBs to be allocated from 653 * the kernel heap rather than from the kmem_tsb_default_arena arenas. 654 */ 655 #ifdef DEBUG 656 int tsb_forceheap = 0; 657 #endif /* DEBUG */ 658 659 /* 660 * Decide whether to use per-lgroup arenas, or one global set of 661 * TSB arenas. The default is not to break up per-lgroup, since 662 * most platforms don't recognize any tangible benefit from it. 663 */ 664 int tsb_lgrp_affinity = 0; 665 666 /* 667 * Used for growing the TSB based on the process RSS. 668 * tsb_rss_factor is based on the smallest TSB, and is 669 * shifted by the TSB size to determine if we need to grow. 670 * The default will grow the TSB if the number of TTEs for 671 * this page size exceeds 75% of the number of TSB entries, 672 * which should _almost_ eliminate all conflict misses 673 * (at the expense of using up lots and lots of memory). 674 */ 675 #define TSB_RSS_FACTOR (TSB_ENTRIES(TSB_MIN_SZCODE) * 0.75) 676 #define SFMMU_RSS_TSBSIZE(tsbszc) (tsb_rss_factor << tsbszc) 677 #define SELECT_TSB_SIZECODE(pgcnt) ( \ 678 (enable_tsb_rss_sizing)? sfmmu_select_tsb_szc(pgcnt) : \ 679 default_tsb_size) 680 #define TSB_OK_SHRINK() \ 681 (tsb_alloc_bytes > tsb_alloc_hiwater || freemem < desfree) 682 #define TSB_OK_GROW() \ 683 (tsb_alloc_bytes < tsb_alloc_hiwater && freemem > desfree) 684 685 int enable_tsb_rss_sizing = 1; 686 int tsb_rss_factor = (int)TSB_RSS_FACTOR; 687 688 /* which TSB size code to use for new address spaces or if rss sizing off */ 689 int default_tsb_size = TSB_8K_SZCODE; 690 691 static uint64_t tsb_alloc_hiwater; /* limit TSB reserved memory */ 692 uint64_t tsb_alloc_hiwater_factor; /* tsb_alloc_hiwater = physmem / this */ 693 #define TSB_ALLOC_HIWATER_FACTOR_DEFAULT 32 694 695 #ifdef DEBUG 696 static int tsb_random_size = 0; /* set to 1 to test random tsb sizes on alloc */ 697 static int tsb_grow_stress = 0; /* if set to 1, keep replacing TSB w/ random */ 698 static int tsb_alloc_mtbf = 0; /* fail allocation every n attempts */ 699 static int tsb_alloc_fail_mtbf = 0; 700 static int tsb_alloc_count = 0; 701 #endif /* DEBUG */ 702 703 /* if set to 1, will remap valid TTEs when growing TSB. */ 704 int tsb_remap_ttes = 1; 705 706 /* 707 * If we have more than this many mappings, allocate a second TSB. 708 * This default is chosen because the I/D fully associative TLBs are 709 * assumed to have at least 8 available entries. Platforms with a 710 * larger fully-associative TLB could probably override the default. 
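 * Note that the second TSB, once allocated, is dedicated to the 4M and
 * larger page sizes; 8K/64K/512K translations stay in the first TSB
 * (see SFMMU_GET_TSBINFO below).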
 */

#ifdef sun4v
int tsb_sectsb_threshold = 0;
#else
int tsb_sectsb_threshold = 8;
#endif

/*
 * kstat data
 */
struct sfmmu_global_stat sfmmu_global_stat;
struct sfmmu_tsbsize_stat sfmmu_tsbsize_stat;

/*
 * Global data
 */
sfmmu_t *ksfmmup;		/* kernel's hat id */

#ifdef DEBUG
static void chk_tte(tte_t *, tte_t *, tte_t *, struct hme_blk *);
#endif

/* sfmmu locking operations */
static kmutex_t *sfmmu_mlspl_enter(struct page *, int);
static int sfmmu_mlspl_held(struct page *, int);

kmutex_t *sfmmu_page_enter(page_t *);
void sfmmu_page_exit(kmutex_t *);
int sfmmu_page_spl_held(struct page *);

/* sfmmu internal locking operations - accessed directly */
static void sfmmu_mlist_reloc_enter(page_t *, page_t *,
    kmutex_t **, kmutex_t **);
static void sfmmu_mlist_reloc_exit(kmutex_t *, kmutex_t *);
static hatlock_t *sfmmu_hat_tryenter(sfmmu_t *);
static void sfmmu_hat_lock_all(void);
static void sfmmu_hat_unlock_all(void);
static void sfmmu_ismhat_enter(sfmmu_t *, int);
static void sfmmu_ismhat_exit(sfmmu_t *, int);

/*
 * Array of mutexes protecting a page's mapping list and p_nrm field.
 *
 * The hash function looks complicated, but is made up so that:
 *
 * "pp" not shifted, so adjacent pp values will hash to different cache lines
 * (8 byte alignment * 8 bytes/mutex == 64 byte coherency subblock)
 *
 * "pp" >> mml_shift, incorporates more source bits into the hash result
 *
 * "& (mml_table_size - 1), should be faster than using remainder "%"
 *
 * Hopefully, mml_table, mml_table_size and mml_shift are all in the same
 * cacheline, since they get declared next to each other below. We'll trust
 * ld not to do something random.
 */
#ifdef DEBUG
int mlist_hash_debug = 0;
#define MLIST_HASH(pp) (mlist_hash_debug ? &mml_table[0] : \
    &mml_table[((uintptr_t)(pp) + \
    ((uintptr_t)(pp) >> mml_shift)) & (mml_table_sz - 1)])
#else /* !DEBUG */
#define MLIST_HASH(pp) &mml_table[ \
    ((uintptr_t)(pp) + ((uintptr_t)(pp) >> mml_shift)) & (mml_table_sz - 1)]
#endif /* !DEBUG */

kmutex_t *mml_table;
uint_t mml_table_sz;	/* must be a power of 2 */
uint_t mml_shift;	/* log2(mml_table_sz) + 3 for align */

kpm_hlk_t *kpmp_table;
uint_t kpmp_table_sz;	/* must be a power of 2 */
uchar_t kpmp_shift;

kpm_shlk_t *kpmp_stable;
uint_t kpmp_stable_sz;	/* must be a power of 2 */

/*
 * SPL_HASH was improved to avoid false cache line sharing
 */
#define SPL_TABLE_SIZE 128
#define SPL_MASK (SPL_TABLE_SIZE - 1)
#define SPL_SHIFT 7	/* log2(SPL_TABLE_SIZE) */

#define SPL_INDEX(pp) \
    ((((uintptr_t)(pp) >> SPL_SHIFT) ^ \
    ((uintptr_t)(pp) >> (SPL_SHIFT << 1))) & \
    (SPL_TABLE_SIZE - 1))

#define SPL_HASH(pp) \
    (&sfmmu_page_lock[SPL_INDEX(pp) & SPL_MASK].pad_mutex)

static pad_mutex_t sfmmu_page_lock[SPL_TABLE_SIZE];


/*
 * hat_unload_callback() will group together callbacks in order
 * to avoid xt_sync() calls. This is the maximum size of the group.
810 */ 811 #define MAX_CB_ADDR 32 812 813 tte_t hw_tte; 814 static ulong_t sfmmu_dmr_maxbit = DMR_MAXBIT; 815 816 static char *mmu_ctx_kstat_names[] = { 817 "mmu_ctx_tsb_exceptions", 818 "mmu_ctx_tsb_raise_exception", 819 "mmu_ctx_wrap_around", 820 }; 821 822 /* 823 * Wrapper for vmem_xalloc since vmem_create only allows limited 824 * parameters for vm_source_alloc functions. This function allows us 825 * to specify alignment consistent with the size of the object being 826 * allocated. 827 */ 828 static void * 829 sfmmu_vmem_xalloc_aligned_wrapper(vmem_t *vmp, size_t size, int vmflag) 830 { 831 return (vmem_xalloc(vmp, size, size, 0, 0, NULL, NULL, vmflag)); 832 } 833 834 /* Common code for setting tsb_alloc_hiwater. */ 835 #define SFMMU_SET_TSB_ALLOC_HIWATER(pages) tsb_alloc_hiwater = \ 836 ptob(pages) / tsb_alloc_hiwater_factor 837 838 /* 839 * Set tsb_max_growsize to allow at most all of physical memory to be mapped by 840 * a single TSB. physmem is the number of physical pages so we need physmem 8K 841 * TTEs to represent all those physical pages. We round this up by using 842 * 1<<highbit(). To figure out which size code to use, remember that the size 843 * code is just an amount to shift the smallest TSB size to get the size of 844 * this TSB. So we subtract that size, TSB_START_SIZE, from highbit() (or 845 * highbit() - 1) to get the size code for the smallest TSB that can represent 846 * all of physical memory, while erring on the side of too much. 847 * 848 * Restrict tsb_max_growsize to make sure that: 849 * 1) TSBs can't grow larger than the TSB slab size 850 * 2) TSBs can't grow larger than UTSB_MAX_SZCODE. 851 */ 852 #define SFMMU_SET_TSB_MAX_GROWSIZE(pages) { \ 853 int _i, _szc, _slabszc, _tsbszc; \ 854 \ 855 _i = highbit(pages); \ 856 if ((1 << (_i - 1)) == (pages)) \ 857 _i--; /* 2^n case, round down */ \ 858 _szc = _i - TSB_START_SIZE; \ 859 _slabszc = bigtsb_slab_shift - (TSB_START_SIZE + TSB_ENTRY_SHIFT); \ 860 _tsbszc = MIN(_szc, _slabszc); \ 861 tsb_max_growsize = MIN(_tsbszc, UTSB_MAX_SZCODE); \ 862 } 863 864 /* 865 * Given a pointer to an sfmmu and a TTE size code, return a pointer to the 866 * tsb_info which handles that TTE size. 867 */ 868 #define SFMMU_GET_TSBINFO(tsbinfop, sfmmup, tte_szc) { \ 869 (tsbinfop) = (sfmmup)->sfmmu_tsb; \ 870 ASSERT(((tsbinfop)->tsb_flags & TSB_SHAREDCTX) || \ 871 sfmmu_hat_lock_held(sfmmup)); \ 872 if ((tte_szc) >= TTE4M) { \ 873 ASSERT((tsbinfop) != NULL); \ 874 (tsbinfop) = (tsbinfop)->tsb_next; \ 875 } \ 876 } 877 878 /* 879 * Macro to use to unload entries from the TSB. 880 * It has knowledge of which page sizes get replicated in the TSB 881 * and will call the appropriate unload routine for the appropriate size. 882 */ 883 #define SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, ismhat) \ 884 { \ 885 int ttesz = get_hblk_ttesz(hmeblkp); \ 886 if (ttesz == TTE8K || ttesz == TTE4M) { \ 887 sfmmu_unload_tsb(sfmmup, addr, ttesz); \ 888 } else { \ 889 caddr_t sva = ismhat ? addr : \ 890 (caddr_t)get_hblk_base(hmeblkp); \ 891 caddr_t eva = sva + get_hblk_span(hmeblkp); \ 892 ASSERT(addr >= sva && addr < eva); \ 893 sfmmu_unload_tsb_range(sfmmup, sva, eva, ttesz); \ 894 } \ 895 } 896 897 898 /* Update tsb_alloc_hiwater after memory is configured. */ 899 /*ARGSUSED*/ 900 static void 901 sfmmu_update_post_add(void *arg, pgcnt_t delta_pages) 902 { 903 /* Assumes physmem has already been updated. 
 */
    SFMMU_SET_TSB_ALLOC_HIWATER(physmem);
    SFMMU_SET_TSB_MAX_GROWSIZE(physmem);
}

/*
 * Update tsb_alloc_hiwater before memory is deleted. We'll do nothing here
 * and update tsb_alloc_hiwater and tsb_max_growsize after the memory is
 * deleted.
 */
/*ARGSUSED*/
static int
sfmmu_update_pre_del(void *arg, pgcnt_t delta_pages)
{
    return (0);
}

/* Update tsb_alloc_hiwater after memory fails to be unconfigured. */
/*ARGSUSED*/
static void
sfmmu_update_post_del(void *arg, pgcnt_t delta_pages, int cancelled)
{
    /*
     * Whether the delete was cancelled or not, just go ahead and update
     * tsb_alloc_hiwater and tsb_max_growsize.
     */
    SFMMU_SET_TSB_ALLOC_HIWATER(physmem);
    SFMMU_SET_TSB_MAX_GROWSIZE(physmem);
}

static kphysm_setup_vector_t sfmmu_update_vec = {
    KPHYSM_SETUP_VECTOR_VERSION,	/* version */
    sfmmu_update_post_add,		/* post_add */
    sfmmu_update_pre_del,		/* pre_del */
    sfmmu_update_post_del		/* post_del */
};


/*
 * HME_BLK HASH PRIMITIVES
 */

/*
 * Enter a hme on the mapping list for page pp.
 * When large pages are more prevalent in the system we might want to
 * keep the mapping list in ascending order by the hment size. For now,
 * small pages are more frequent, so don't slow it down.
 */
#define HME_ADD(hme, pp) \
{ \
    ASSERT(sfmmu_mlist_held(pp)); \
    \
    hme->hme_prev = NULL; \
    hme->hme_next = pp->p_mapping; \
    hme->hme_page = pp; \
    if (pp->p_mapping) { \
        ((struct sf_hment *)(pp->p_mapping))->hme_prev = hme;\
        ASSERT(pp->p_share > 0); \
    } else { \
        /* EMPTY */ \
        ASSERT(pp->p_share == 0); \
    } \
    pp->p_mapping = hme; \
    pp->p_share++; \
}

/*
 * Remove a hme from the mapping list for page pp.
 * If we are unmapping a large translation, we need to make sure that the
 * change is reflected in the corresponding bit of the p_index field.
 */
#define HME_SUB(hme, pp) \
{ \
    ASSERT(sfmmu_mlist_held(pp)); \
    ASSERT(hme->hme_page == pp || IS_PAHME(hme)); \
    \
    if (pp->p_mapping == NULL) { \
        panic("hme_remove - no mappings"); \
    } \
    \
    membar_stst();	/* ensure previous stores finish */ \
    \
    ASSERT(pp->p_share > 0); \
    pp->p_share--; \
    \
    if (hme->hme_prev) { \
        ASSERT(pp->p_mapping != hme); \
        ASSERT(hme->hme_prev->hme_page == pp || \
            IS_PAHME(hme->hme_prev)); \
        hme->hme_prev->hme_next = hme->hme_next; \
    } else { \
        ASSERT(pp->p_mapping == hme); \
        pp->p_mapping = hme->hme_next; \
        ASSERT((pp->p_mapping == NULL) ? \
            (pp->p_share == 0) : 1); \
    } \
    \
    if (hme->hme_next) { \
        ASSERT(hme->hme_next->hme_page == pp || \
            IS_PAHME(hme->hme_next)); \
        hme->hme_next->hme_prev = hme->hme_prev; \
    } \
    \
    /* zero out the entry */ \
    hme->hme_next = NULL; \
    hme->hme_prev = NULL; \
    hme->hme_page = NULL; \
    \
    if (hme_size(hme) > TTE8K) { \
        /* remove mappings for remainder of large pg */ \
        sfmmu_rm_large_mappings(pp, hme_size(hme)); \
    } \
}

/*
 * This function returns the hment given the hme_blk and a vaddr.
 * It assumes addr has already been checked to belong to hme_blk's
 * range.
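 * For example (with the usual MMU_PAGESHIFT of 13 and NHMENTS of 8),
 * a vaddr of 0x2a000 inside an 8K-tte hme_blk selects hment index
 * (0x2a000 >> 13) & 7 == 5; hme_blks with larger tte sizes always use
 * index 0.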
1021 */ 1022 #define HBLKTOHME(hment, hmeblkp, addr) \ 1023 { \ 1024 int index; \ 1025 HBLKTOHME_IDX(hment, hmeblkp, addr, index) \ 1026 } 1027 1028 /* 1029 * Version of HBLKTOHME that also returns the index in hmeblkp 1030 * of the hment. 1031 */ 1032 #define HBLKTOHME_IDX(hment, hmeblkp, addr, idx) \ 1033 { \ 1034 ASSERT(in_hblk_range((hmeblkp), (addr))); \ 1035 \ 1036 if (get_hblk_ttesz(hmeblkp) == TTE8K) { \ 1037 idx = (((uintptr_t)(addr) >> MMU_PAGESHIFT) & (NHMENTS-1)); \ 1038 } else \ 1039 idx = 0; \ 1040 \ 1041 (hment) = &(hmeblkp)->hblk_hme[idx]; \ 1042 } 1043 1044 /* 1045 * Disable any page sizes not supported by the CPU 1046 */ 1047 void 1048 hat_init_pagesizes() 1049 { 1050 int i; 1051 1052 mmu_exported_page_sizes = 0; 1053 for (i = TTE8K; i < max_mmu_page_sizes; i++) { 1054 1055 szc_2_userszc[i] = (uint_t)-1; 1056 userszc_2_szc[i] = (uint_t)-1; 1057 1058 if ((mmu_exported_pagesize_mask & (1 << i)) == 0) { 1059 disable_large_pages |= (1 << i); 1060 } else { 1061 szc_2_userszc[i] = mmu_exported_page_sizes; 1062 userszc_2_szc[mmu_exported_page_sizes] = i; 1063 mmu_exported_page_sizes++; 1064 } 1065 } 1066 1067 disable_ism_large_pages |= disable_large_pages; 1068 disable_auto_data_large_pages = disable_large_pages; 1069 disable_auto_text_large_pages = disable_large_pages; 1070 disable_shctx_large_pages |= disable_large_pages; 1071 1072 /* 1073 * Initialize mmu-specific large page sizes. 1074 */ 1075 if (&mmu_large_pages_disabled) { 1076 disable_large_pages |= mmu_large_pages_disabled(HAT_LOAD); 1077 disable_shctx_large_pages |= disable_large_pages; 1078 disable_ism_large_pages |= 1079 mmu_large_pages_disabled(HAT_LOAD_SHARE); 1080 disable_auto_data_large_pages |= 1081 mmu_large_pages_disabled(HAT_AUTO_DATA); 1082 disable_auto_text_large_pages |= 1083 mmu_large_pages_disabled(HAT_AUTO_TEXT); 1084 } 1085 } 1086 1087 /* 1088 * Initialize the hardware address translation structures. 1089 */ 1090 void 1091 hat_init(void) 1092 { 1093 int i; 1094 uint_t sz; 1095 size_t size; 1096 1097 hat_lock_init(); 1098 hat_kstat_init(); 1099 1100 /* 1101 * Hardware-only bits in a TTE 1102 */ 1103 MAKE_TTE_MASK(&hw_tte); 1104 1105 hat_init_pagesizes(); 1106 1107 /* Initialize the hash locks */ 1108 for (i = 0; i < khmehash_num; i++) { 1109 mutex_init(&khme_hash[i].hmehash_mutex, NULL, 1110 MUTEX_DEFAULT, NULL); 1111 khme_hash[i].hmeh_nextpa = HMEBLK_ENDPA; 1112 } 1113 for (i = 0; i < uhmehash_num; i++) { 1114 mutex_init(&uhme_hash[i].hmehash_mutex, NULL, 1115 MUTEX_DEFAULT, NULL); 1116 uhme_hash[i].hmeh_nextpa = HMEBLK_ENDPA; 1117 } 1118 khmehash_num--; /* make sure counter starts from 0 */ 1119 uhmehash_num--; /* make sure counter starts from 0 */ 1120 1121 /* 1122 * Allocate context domain structures. 1123 * 1124 * A platform may choose to modify max_mmu_ctxdoms in 1125 * set_platform_defaults(). If a platform does not define 1126 * a set_platform_defaults() or does not choose to modify 1127 * max_mmu_ctxdoms, it gets one MMU context domain for every CPU. 1128 * 1129 * For sun4v, there will be one global context domain, this is to 1130 * avoid the ldom cpu substitution problem. 1131 * 1132 * For all platforms that have CPUs sharing MMUs, this 1133 * value must be defined. 
 */
    if (max_mmu_ctxdoms == 0) {
#ifndef sun4v
        max_mmu_ctxdoms = max_ncpus;
#else /* sun4v */
        max_mmu_ctxdoms = 1;
#endif /* sun4v */
    }

    size = max_mmu_ctxdoms * sizeof (mmu_ctx_t *);
    mmu_ctxs_tbl = kmem_zalloc(size, KM_SLEEP);

    /* mmu_ctx_t is 64 bytes aligned */
    mmuctxdom_cache = kmem_cache_create("mmuctxdom_cache",
        sizeof (mmu_ctx_t), 64, NULL, NULL, NULL, NULL, NULL, 0);
    /*
     * MMU context domain initialization for the Boot CPU.
     * This needs the context domains array allocated above.
     */
    mutex_enter(&cpu_lock);
    sfmmu_cpu_init(CPU);
    mutex_exit(&cpu_lock);

    /*
     * Initialize ism mapping list lock.
     */

    mutex_init(&ism_mlist_lock, NULL, MUTEX_DEFAULT, NULL);

    /*
     * Each sfmmu structure carries an array of MMU context info
     * structures, one per context domain. The size of this array depends
     * on the maximum number of context domains. So, the size of the
     * sfmmu structure varies per platform.
     *
     * sfmmu is allocated from static arena, because trap
     * handler at TL > 0 is not allowed to touch kernel relocatable
     * memory. sfmmu's alignment is changed to 64 bytes from
     * default 8 bytes, as the lower 6 bits will be used to pass
     * pgcnt to vtag_flush_pgcnt_tl1.
     */
    size = sizeof (sfmmu_t) + sizeof (sfmmu_ctx_t) * (max_mmu_ctxdoms - 1);

    sfmmuid_cache = kmem_cache_create("sfmmuid_cache", size,
        64, sfmmu_idcache_constructor, sfmmu_idcache_destructor,
        NULL, NULL, static_arena, 0);

    sfmmu_tsbinfo_cache = kmem_cache_create("sfmmu_tsbinfo_cache",
        sizeof (struct tsb_info), 0, NULL, NULL, NULL, NULL, NULL, 0);

    /*
     * Since we only use the tsb8k cache to "borrow" pages for TSBs
     * from the heap when low on memory or when TSB_FORCEALLOC is
     * specified, don't use magazines to cache them--we want to return
     * them to the system as quickly as possible.
     */
    sfmmu_tsb8k_cache = kmem_cache_create("sfmmu_tsb8k_cache",
        MMU_PAGESIZE, MMU_PAGESIZE, NULL, NULL, NULL, NULL,
        static_arena, KMC_NOMAGAZINE);

    /*
     * Set tsb_alloc_hiwater to 1/tsb_alloc_hiwater_factor of physical
     * memory, which corresponds to the old static reserve for TSBs.
     * tsb_alloc_hiwater_factor defaults to 32. This caps the amount of
     * memory we'll allocate for TSB slabs; beyond this point TSB
     * allocations will be taken from the kernel heap (via
     * sfmmu_tsb8k_cache) and will be throttled as would any other kmem
     * consumer.
     */
    if (tsb_alloc_hiwater_factor == 0) {
        tsb_alloc_hiwater_factor = TSB_ALLOC_HIWATER_FACTOR_DEFAULT;
    }
    SFMMU_SET_TSB_ALLOC_HIWATER(physmem);

    for (sz = tsb_slab_ttesz; sz > 0; sz--) {
        if (!(disable_large_pages & (1 << sz)))
            break;
    }

    if (sz < tsb_slab_ttesz) {
        tsb_slab_ttesz = sz;
        tsb_slab_shift = MMU_PAGESHIFT + (sz << 1) + sz;
        tsb_slab_size = 1 << tsb_slab_shift;
        tsb_slab_mask = (1 << (tsb_slab_shift - MMU_PAGESHIFT)) - 1;
        use_bigtsb_arena = 0;
    } else if (use_bigtsb_arena &&
        (disable_large_pages & (1 << bigtsb_slab_ttesz))) {
        use_bigtsb_arena = 0;
    }

    if (!use_bigtsb_arena) {
        bigtsb_slab_shift = tsb_slab_shift;
    }
    SFMMU_SET_TSB_MAX_GROWSIZE(physmem);

    /*
     * On smaller memory systems, allocate TSB memory in smaller chunks
     * than the default 4M slab size.
We also honor disable_large_pages 1232 * here. 1233 * 1234 * The trap handlers need to be patched with the final slab shift, 1235 * since they need to be able to construct the TSB pointer at runtime. 1236 */ 1237 if ((tsb_max_growsize <= TSB_512K_SZCODE) && 1238 !(disable_large_pages & (1 << TTE512K))) { 1239 tsb_slab_ttesz = TTE512K; 1240 tsb_slab_shift = MMU_PAGESHIFT512K; 1241 tsb_slab_size = MMU_PAGESIZE512K; 1242 tsb_slab_mask = MMU_PAGEOFFSET512K >> MMU_PAGESHIFT; 1243 use_bigtsb_arena = 0; 1244 } 1245 1246 if (!use_bigtsb_arena) { 1247 bigtsb_slab_ttesz = tsb_slab_ttesz; 1248 bigtsb_slab_shift = tsb_slab_shift; 1249 bigtsb_slab_size = tsb_slab_size; 1250 bigtsb_slab_mask = tsb_slab_mask; 1251 } 1252 1253 1254 /* 1255 * Set up memory callback to update tsb_alloc_hiwater and 1256 * tsb_max_growsize. 1257 */ 1258 i = kphysm_setup_func_register(&sfmmu_update_vec, (void *) 0); 1259 ASSERT(i == 0); 1260 1261 /* 1262 * kmem_tsb_arena is the source from which large TSB slabs are 1263 * drawn. The quantum of this arena corresponds to the largest 1264 * TSB size we can dynamically allocate for user processes. 1265 * Currently it must also be a supported page size since we 1266 * use exactly one translation entry to map each slab page. 1267 * 1268 * The per-lgroup kmem_tsb_default_arena arenas are the arenas from 1269 * which most TSBs are allocated. Since most TSB allocations are 1270 * typically 8K we have a kmem cache we stack on top of each 1271 * kmem_tsb_default_arena to speed up those allocations. 1272 * 1273 * Note the two-level scheme of arenas is required only 1274 * because vmem_create doesn't allow us to specify alignment 1275 * requirements. If this ever changes the code could be 1276 * simplified to use only one level of arenas. 1277 * 1278 * If 256M page support exists on sun4v, 256MB kmem_bigtsb_arena 1279 * will be provided in addition to the 4M kmem_tsb_arena. 
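 *
 * To summarize, a typical 8K user TSB allocation is satisfied from
 * sfmmu_tsb_cache[i] -> kmem_tsb_default_arena[i] -> kmem_tsb_arena ->
 * heap_arena, while 256M-backed slabs (when enabled) come from
 * kmem_bigtsb_arena instead of kmem_tsb_arena.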
1280 */ 1281 if (use_bigtsb_arena) { 1282 kmem_bigtsb_arena = vmem_create("kmem_bigtsb", NULL, 0, 1283 bigtsb_slab_size, sfmmu_vmem_xalloc_aligned_wrapper, 1284 vmem_xfree, heap_arena, 0, VM_SLEEP); 1285 } 1286 1287 kmem_tsb_arena = vmem_create("kmem_tsb", NULL, 0, tsb_slab_size, 1288 sfmmu_vmem_xalloc_aligned_wrapper, 1289 vmem_xfree, heap_arena, 0, VM_SLEEP); 1290 1291 if (tsb_lgrp_affinity) { 1292 char s[50]; 1293 for (i = 0; i < NLGRPS_MAX; i++) { 1294 if (use_bigtsb_arena) { 1295 (void) sprintf(s, "kmem_bigtsb_lgrp%d", i); 1296 kmem_bigtsb_default_arena[i] = vmem_create(s, 1297 NULL, 0, 2 * tsb_slab_size, 1298 sfmmu_tsb_segkmem_alloc, 1299 sfmmu_tsb_segkmem_free, kmem_bigtsb_arena, 1300 0, VM_SLEEP | VM_BESTFIT); 1301 } 1302 1303 (void) sprintf(s, "kmem_tsb_lgrp%d", i); 1304 kmem_tsb_default_arena[i] = vmem_create(s, 1305 NULL, 0, PAGESIZE, sfmmu_tsb_segkmem_alloc, 1306 sfmmu_tsb_segkmem_free, kmem_tsb_arena, 0, 1307 VM_SLEEP | VM_BESTFIT); 1308 1309 (void) sprintf(s, "sfmmu_tsb_lgrp%d_cache", i); 1310 sfmmu_tsb_cache[i] = kmem_cache_create(s, 1311 PAGESIZE, PAGESIZE, NULL, NULL, NULL, NULL, 1312 kmem_tsb_default_arena[i], 0); 1313 } 1314 } else { 1315 if (use_bigtsb_arena) { 1316 kmem_bigtsb_default_arena[0] = 1317 vmem_create("kmem_bigtsb_default", NULL, 0, 1318 2 * tsb_slab_size, sfmmu_tsb_segkmem_alloc, 1319 sfmmu_tsb_segkmem_free, kmem_bigtsb_arena, 0, 1320 VM_SLEEP | VM_BESTFIT); 1321 } 1322 1323 kmem_tsb_default_arena[0] = vmem_create("kmem_tsb_default", 1324 NULL, 0, PAGESIZE, sfmmu_tsb_segkmem_alloc, 1325 sfmmu_tsb_segkmem_free, kmem_tsb_arena, 0, 1326 VM_SLEEP | VM_BESTFIT); 1327 sfmmu_tsb_cache[0] = kmem_cache_create("sfmmu_tsb_cache", 1328 PAGESIZE, PAGESIZE, NULL, NULL, NULL, NULL, 1329 kmem_tsb_default_arena[0], 0); 1330 } 1331 1332 sfmmu8_cache = kmem_cache_create("sfmmu8_cache", HME8BLK_SZ, 1333 HMEBLK_ALIGN, sfmmu_hblkcache_constructor, 1334 sfmmu_hblkcache_destructor, 1335 sfmmu_hblkcache_reclaim, (void *)HME8BLK_SZ, 1336 hat_memload_arena, KMC_NOHASH); 1337 1338 hat_memload1_arena = vmem_create("hat_memload1", NULL, 0, PAGESIZE, 1339 segkmem_alloc_permanent, segkmem_free, heap_arena, 0, VM_SLEEP); 1340 1341 sfmmu1_cache = kmem_cache_create("sfmmu1_cache", HME1BLK_SZ, 1342 HMEBLK_ALIGN, sfmmu_hblkcache_constructor, 1343 sfmmu_hblkcache_destructor, 1344 NULL, (void *)HME1BLK_SZ, 1345 hat_memload1_arena, KMC_NOHASH); 1346 1347 pa_hment_cache = kmem_cache_create("pa_hment_cache", PAHME_SZ, 1348 0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH); 1349 1350 ism_blk_cache = kmem_cache_create("ism_blk_cache", 1351 sizeof (ism_blk_t), ecache_alignsize, NULL, NULL, 1352 NULL, NULL, static_arena, KMC_NOHASH); 1353 1354 ism_ment_cache = kmem_cache_create("ism_ment_cache", 1355 sizeof (ism_ment_t), 0, NULL, NULL, 1356 NULL, NULL, NULL, 0); 1357 1358 /* 1359 * We grab the first hat for the kernel, 1360 */ 1361 AS_LOCK_ENTER(&kas, &kas.a_lock, RW_WRITER); 1362 kas.a_hat = hat_alloc(&kas); 1363 AS_LOCK_EXIT(&kas, &kas.a_lock); 1364 1365 /* 1366 * Initialize hblk_reserve. 1367 */ 1368 ((struct hme_blk *)hblk_reserve)->hblk_nextpa = 1369 va_to_pa((caddr_t)hblk_reserve); 1370 1371 #ifndef UTSB_PHYS 1372 /* 1373 * Reserve some kernel virtual address space for the locked TTEs 1374 * that allow us to probe the TSB from TL>0. 
1375 */ 1376 utsb_vabase = vmem_xalloc(heap_arena, tsb_slab_size, tsb_slab_size, 1377 0, 0, NULL, NULL, VM_SLEEP); 1378 utsb4m_vabase = vmem_xalloc(heap_arena, tsb_slab_size, tsb_slab_size, 1379 0, 0, NULL, NULL, VM_SLEEP); 1380 #endif 1381 1382 #ifdef VAC 1383 /* 1384 * The big page VAC handling code assumes VAC 1385 * will not be bigger than the smallest big 1386 * page- which is 64K. 1387 */ 1388 if (TTEPAGES(TTE64K) < CACHE_NUM_COLOR) { 1389 cmn_err(CE_PANIC, "VAC too big!"); 1390 } 1391 #endif 1392 1393 (void) xhat_init(); 1394 1395 uhme_hash_pa = va_to_pa(uhme_hash); 1396 khme_hash_pa = va_to_pa(khme_hash); 1397 1398 /* 1399 * Initialize relocation locks. kpr_suspendlock is held 1400 * at PIL_MAX to prevent interrupts from pinning the holder 1401 * of a suspended TTE which may access it leading to a 1402 * deadlock condition. 1403 */ 1404 mutex_init(&kpr_mutex, NULL, MUTEX_DEFAULT, NULL); 1405 mutex_init(&kpr_suspendlock, NULL, MUTEX_SPIN, (void *)PIL_MAX); 1406 1407 /* 1408 * If Shared context support is disabled via /etc/system 1409 * set shctx_on to 0 here if it was set to 1 earlier in boot 1410 * sequence by cpu module initialization code. 1411 */ 1412 if (shctx_on && disable_shctx) { 1413 shctx_on = 0; 1414 } 1415 1416 /* 1417 * If support for page size search is disabled via /etc/system 1418 * set pgsz_search_on to 0 here. 1419 */ 1420 if (pgsz_search_on && disable_pgsz_search) { 1421 pgsz_search_on = 0; 1422 } 1423 1424 if (shctx_on) { 1425 srd_buckets = kmem_zalloc(SFMMU_MAX_SRD_BUCKETS * 1426 sizeof (srd_buckets[0]), KM_SLEEP); 1427 for (i = 0; i < SFMMU_MAX_SRD_BUCKETS; i++) { 1428 mutex_init(&srd_buckets[i].srdb_lock, NULL, 1429 MUTEX_DEFAULT, NULL); 1430 } 1431 1432 srd_cache = kmem_cache_create("srd_cache", sizeof (sf_srd_t), 1433 0, sfmmu_srdcache_constructor, sfmmu_srdcache_destructor, 1434 NULL, NULL, NULL, 0); 1435 region_cache = kmem_cache_create("region_cache", 1436 sizeof (sf_region_t), 0, sfmmu_rgncache_constructor, 1437 sfmmu_rgncache_destructor, NULL, NULL, NULL, 0); 1438 scd_cache = kmem_cache_create("scd_cache", sizeof (sf_scd_t), 1439 0, sfmmu_scdcache_constructor, sfmmu_scdcache_destructor, 1440 NULL, NULL, NULL, 0); 1441 } 1442 1443 /* 1444 * Pre-allocate hrm_hashtab before enabling the collection of 1445 * refmod statistics. Allocating on the fly would mean us 1446 * running the risk of suffering recursive mutex enters or 1447 * deadlocks. 1448 */ 1449 hrm_hashtab = kmem_zalloc(HRM_HASHSIZE * sizeof (struct hrmstat *), 1450 KM_SLEEP); 1451 1452 /* Allocate per-cpu pending freelist of hmeblks */ 1453 cpu_hme_pend = kmem_zalloc((NCPU * sizeof (cpu_hme_pend_t)) + 64, 1454 KM_SLEEP); 1455 cpu_hme_pend = (cpu_hme_pend_t *)P2ROUNDUP( 1456 (uintptr_t)cpu_hme_pend, 64); 1457 1458 for (i = 0; i < NCPU; i++) { 1459 mutex_init(&cpu_hme_pend[i].chp_mutex, NULL, MUTEX_DEFAULT, 1460 NULL); 1461 } 1462 1463 if (cpu_hme_pend_thresh == 0) { 1464 cpu_hme_pend_thresh = CPU_HME_PEND_THRESH; 1465 } 1466 } 1467 1468 /* 1469 * Initialize locking for the hat layer, called early during boot. 1470 */ 1471 static void 1472 hat_lock_init() 1473 { 1474 int i; 1475 1476 /* 1477 * initialize the array of mutexes protecting a page's mapping 1478 * list and p_nrm field. 
1479 */ 1480 for (i = 0; i < mml_table_sz; i++) 1481 mutex_init(&mml_table[i], NULL, MUTEX_DEFAULT, NULL); 1482 1483 if (kpm_enable) { 1484 for (i = 0; i < kpmp_table_sz; i++) { 1485 mutex_init(&kpmp_table[i].khl_mutex, NULL, 1486 MUTEX_DEFAULT, NULL); 1487 } 1488 } 1489 1490 /* 1491 * Initialize array of mutex locks that protects sfmmu fields and 1492 * TSB lists. 1493 */ 1494 for (i = 0; i < SFMMU_NUM_LOCK; i++) 1495 mutex_init(HATLOCK_MUTEXP(&hat_lock[i]), NULL, MUTEX_DEFAULT, 1496 NULL); 1497 } 1498 1499 #define SFMMU_KERNEL_MAXVA \ 1500 (kmem64_base ? (uintptr_t)kmem64_end : (SYSLIMIT)) 1501 1502 /* 1503 * Allocate a hat structure. 1504 * Called when an address space first uses a hat. 1505 */ 1506 struct hat * 1507 hat_alloc(struct as *as) 1508 { 1509 sfmmu_t *sfmmup; 1510 int i; 1511 uint64_t cnum; 1512 extern uint_t get_color_start(struct as *); 1513 1514 ASSERT(AS_WRITE_HELD(as, &as->a_lock)); 1515 sfmmup = kmem_cache_alloc(sfmmuid_cache, KM_SLEEP); 1516 sfmmup->sfmmu_as = as; 1517 sfmmup->sfmmu_flags = 0; 1518 sfmmup->sfmmu_tteflags = 0; 1519 sfmmup->sfmmu_rtteflags = 0; 1520 LOCK_INIT_CLEAR(&sfmmup->sfmmu_ctx_lock); 1521 1522 if (as == &kas) { 1523 ksfmmup = sfmmup; 1524 sfmmup->sfmmu_cext = 0; 1525 cnum = KCONTEXT; 1526 1527 sfmmup->sfmmu_clrstart = 0; 1528 sfmmup->sfmmu_tsb = NULL; 1529 /* 1530 * hat_kern_setup() will call sfmmu_init_ktsbinfo() 1531 * to setup tsb_info for ksfmmup. 1532 */ 1533 } else { 1534 1535 /* 1536 * Just set to invalid ctx. When it faults, it will 1537 * get a valid ctx. This would avoid the situation 1538 * where we get a ctx, but it gets stolen and then 1539 * we fault when we try to run and so have to get 1540 * another ctx. 1541 */ 1542 sfmmup->sfmmu_cext = 0; 1543 cnum = INVALID_CONTEXT; 1544 1545 /* initialize original physical page coloring bin */ 1546 sfmmup->sfmmu_clrstart = get_color_start(as); 1547 #ifdef DEBUG 1548 if (tsb_random_size) { 1549 uint32_t randval = (uint32_t)gettick() >> 4; 1550 int size = randval % (tsb_max_growsize + 1); 1551 1552 /* chose a random tsb size for stress testing */ 1553 (void) sfmmu_tsbinfo_alloc(&sfmmup->sfmmu_tsb, size, 1554 TSB8K|TSB64K|TSB512K, 0, sfmmup); 1555 } else 1556 #endif /* DEBUG */ 1557 (void) sfmmu_tsbinfo_alloc(&sfmmup->sfmmu_tsb, 1558 default_tsb_size, 1559 TSB8K|TSB64K|TSB512K, 0, sfmmup); 1560 sfmmup->sfmmu_flags = HAT_SWAPPED | HAT_ALLCTX_INVALID; 1561 ASSERT(sfmmup->sfmmu_tsb != NULL); 1562 } 1563 1564 ASSERT(max_mmu_ctxdoms > 0); 1565 for (i = 0; i < max_mmu_ctxdoms; i++) { 1566 sfmmup->sfmmu_ctxs[i].cnum = cnum; 1567 sfmmup->sfmmu_ctxs[i].gnum = 0; 1568 } 1569 1570 for (i = 0; i < max_mmu_page_sizes; i++) { 1571 sfmmup->sfmmu_ttecnt[i] = 0; 1572 sfmmup->sfmmu_scdrttecnt[i] = 0; 1573 sfmmup->sfmmu_ismttecnt[i] = 0; 1574 sfmmup->sfmmu_scdismttecnt[i] = 0; 1575 sfmmup->sfmmu_pgsz[i] = TTE8K; 1576 } 1577 sfmmup->sfmmu_tsb0_4minflcnt = 0; 1578 sfmmup->sfmmu_iblk = NULL; 1579 sfmmup->sfmmu_ismhat = 0; 1580 sfmmup->sfmmu_scdhat = 0; 1581 sfmmup->sfmmu_ismblkpa = (uint64_t)-1; 1582 if (sfmmup == ksfmmup) { 1583 CPUSET_ALL(sfmmup->sfmmu_cpusran); 1584 } else { 1585 CPUSET_ZERO(sfmmup->sfmmu_cpusran); 1586 } 1587 sfmmup->sfmmu_free = 0; 1588 sfmmup->sfmmu_rmstat = 0; 1589 sfmmup->sfmmu_clrbin = sfmmup->sfmmu_clrstart; 1590 sfmmup->sfmmu_xhat_provider = NULL; 1591 cv_init(&sfmmup->sfmmu_tsb_cv, NULL, CV_DEFAULT, NULL); 1592 sfmmup->sfmmu_srdp = NULL; 1593 SF_RGNMAP_ZERO(sfmmup->sfmmu_region_map); 1594 bzero(sfmmup->sfmmu_hmeregion_links, SFMMU_L1_HMERLINKS_SIZE); 1595 sfmmup->sfmmu_scdp = NULL; 1596 
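    /* Not part of any SCD yet; sfmmu_find_scd() may attach one later. */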
sfmmup->sfmmu_scd_link.next = NULL; 1597 sfmmup->sfmmu_scd_link.prev = NULL; 1598 1599 if (&mmu_set_pgsz_order && sfmmup != ksfmmup) { 1600 mmu_set_pgsz_order(sfmmup, 0); 1601 sfmmu_init_pgsz_hv(sfmmup); 1602 } 1603 return (sfmmup); 1604 } 1605 1606 /* 1607 * Create per-MMU context domain kstats for a given MMU ctx. 1608 */ 1609 static void 1610 sfmmu_mmu_kstat_create(mmu_ctx_t *mmu_ctxp) 1611 { 1612 mmu_ctx_stat_t stat; 1613 kstat_t *mmu_kstat; 1614 1615 ASSERT(MUTEX_HELD(&cpu_lock)); 1616 ASSERT(mmu_ctxp->mmu_kstat == NULL); 1617 1618 mmu_kstat = kstat_create("unix", mmu_ctxp->mmu_idx, "mmu_ctx", 1619 "hat", KSTAT_TYPE_NAMED, MMU_CTX_NUM_STATS, KSTAT_FLAG_VIRTUAL); 1620 1621 if (mmu_kstat == NULL) { 1622 cmn_err(CE_WARN, "kstat_create for MMU %d failed", 1623 mmu_ctxp->mmu_idx); 1624 } else { 1625 mmu_kstat->ks_data = mmu_ctxp->mmu_kstat_data; 1626 for (stat = 0; stat < MMU_CTX_NUM_STATS; stat++) 1627 kstat_named_init(&mmu_ctxp->mmu_kstat_data[stat], 1628 mmu_ctx_kstat_names[stat], KSTAT_DATA_INT64); 1629 mmu_ctxp->mmu_kstat = mmu_kstat; 1630 kstat_install(mmu_kstat); 1631 } 1632 } 1633 1634 /* 1635 * plat_cpuid_to_mmu_ctx_info() is a platform interface that returns MMU 1636 * context domain information for a given CPU. If a platform does not 1637 * specify that interface, then the function below is used instead to return 1638 * default information. The defaults are as follows: 1639 * 1640 * - For sun4u systems there's one MMU context domain per CPU. 1641 * This default is used by all sun4u systems except OPL. OPL systems 1642 * provide platform specific interface to map CPU ids to MMU ids 1643 * because on OPL more than 1 CPU shares a single MMU. 1644 * Note that on sun4v, there is one global context domain for 1645 * the entire system. This is to avoid running into potential problem 1646 * with ldom physical cpu substitution feature. 1647 * - The number of MMU context IDs supported on any CPU in the 1648 * system is 8K. 1649 */ 1650 /*ARGSUSED*/ 1651 static void 1652 sfmmu_cpuid_to_mmu_ctx_info(processorid_t cpuid, mmu_ctx_info_t *infop) 1653 { 1654 infop->mmu_nctxs = nctxs; 1655 #ifndef sun4v 1656 infop->mmu_idx = cpu[cpuid]->cpu_seqid; 1657 #else /* sun4v */ 1658 infop->mmu_idx = 0; 1659 #endif /* sun4v */ 1660 } 1661 1662 /* 1663 * Called during CPU initialization to set the MMU context-related information 1664 * for a CPU. 1665 * 1666 * cpu_lock serializes accesses to mmu_ctxs and mmu_saved_gnum. 1667 */ 1668 void 1669 sfmmu_cpu_init(cpu_t *cp) 1670 { 1671 mmu_ctx_info_t info; 1672 mmu_ctx_t *mmu_ctxp; 1673 1674 ASSERT(MUTEX_HELD(&cpu_lock)); 1675 1676 if (&plat_cpuid_to_mmu_ctx_info == NULL) 1677 sfmmu_cpuid_to_mmu_ctx_info(cp->cpu_id, &info); 1678 else 1679 plat_cpuid_to_mmu_ctx_info(cp->cpu_id, &info); 1680 1681 ASSERT(info.mmu_idx < max_mmu_ctxdoms); 1682 1683 if ((mmu_ctxp = mmu_ctxs_tbl[info.mmu_idx]) == NULL) { 1684 /* Each mmu_ctx is cacheline aligned. */ 1685 mmu_ctxp = kmem_cache_alloc(mmuctxdom_cache, KM_SLEEP); 1686 bzero(mmu_ctxp, sizeof (mmu_ctx_t)); 1687 1688 mutex_init(&mmu_ctxp->mmu_lock, NULL, MUTEX_SPIN, 1689 (void *)ipltospl(DISP_LEVEL)); 1690 mmu_ctxp->mmu_idx = info.mmu_idx; 1691 mmu_ctxp->mmu_nctxs = info.mmu_nctxs; 1692 /* 1693 * Globally for lifetime of a system, 1694 * gnum must always increase. 1695 * mmu_saved_gnum is protected by the cpu_lock. 
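 * Effectively, a context is the (gnum, cnum) pair recorded in the
 * hat's sfmmu_ctxs[] array; a hat whose saved gnum no longer matches
 * this domain's gnum knows that its cnum may have been reused.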
1696 */ 1697 mmu_ctxp->mmu_gnum = mmu_saved_gnum + 1; 1698 mmu_ctxp->mmu_cnum = NUM_LOCKED_CTXS; 1699 1700 sfmmu_mmu_kstat_create(mmu_ctxp); 1701 1702 mmu_ctxs_tbl[info.mmu_idx] = mmu_ctxp; 1703 } else { 1704 ASSERT(mmu_ctxp->mmu_idx == info.mmu_idx); 1705 } 1706 1707 /* 1708 * The mmu_lock is acquired here to prevent races with 1709 * the wrap-around code. 1710 */ 1711 mutex_enter(&mmu_ctxp->mmu_lock); 1712 1713 1714 mmu_ctxp->mmu_ncpus++; 1715 CPUSET_ADD(mmu_ctxp->mmu_cpuset, cp->cpu_id); 1716 CPU_MMU_IDX(cp) = info.mmu_idx; 1717 CPU_MMU_CTXP(cp) = mmu_ctxp; 1718 1719 mutex_exit(&mmu_ctxp->mmu_lock); 1720 } 1721 1722 /* 1723 * Called to perform MMU context-related cleanup for a CPU. 1724 */ 1725 void 1726 sfmmu_cpu_cleanup(cpu_t *cp) 1727 { 1728 mmu_ctx_t *mmu_ctxp; 1729 1730 ASSERT(MUTEX_HELD(&cpu_lock)); 1731 1732 mmu_ctxp = CPU_MMU_CTXP(cp); 1733 ASSERT(mmu_ctxp != NULL); 1734 1735 /* 1736 * The mmu_lock is acquired here to prevent races with 1737 * the wrap-around code. 1738 */ 1739 mutex_enter(&mmu_ctxp->mmu_lock); 1740 1741 CPU_MMU_CTXP(cp) = NULL; 1742 1743 CPUSET_DEL(mmu_ctxp->mmu_cpuset, cp->cpu_id); 1744 if (--mmu_ctxp->mmu_ncpus == 0) { 1745 mmu_ctxs_tbl[mmu_ctxp->mmu_idx] = NULL; 1746 mutex_exit(&mmu_ctxp->mmu_lock); 1747 mutex_destroy(&mmu_ctxp->mmu_lock); 1748 1749 if (mmu_ctxp->mmu_kstat) 1750 kstat_delete(mmu_ctxp->mmu_kstat); 1751 1752 /* mmu_saved_gnum is protected by the cpu_lock. */ 1753 if (mmu_saved_gnum < mmu_ctxp->mmu_gnum) 1754 mmu_saved_gnum = mmu_ctxp->mmu_gnum; 1755 1756 kmem_cache_free(mmuctxdom_cache, mmu_ctxp); 1757 1758 return; 1759 } 1760 1761 mutex_exit(&mmu_ctxp->mmu_lock); 1762 } 1763 1764 /* 1765 * Hat_setup, makes an address space context the current active one. 1766 * In sfmmu this translates to setting the secondary context with the 1767 * corresponding context. 1768 */ 1769 void 1770 hat_setup(struct hat *sfmmup, int allocflag) 1771 { 1772 hatlock_t *hatlockp; 1773 1774 /* Init needs some special treatment. */ 1775 if (allocflag == HAT_INIT) { 1776 /* 1777 * Make sure that we have 1778 * 1. a TSB 1779 * 2. a valid ctx that doesn't get stolen after this point. 1780 */ 1781 hatlockp = sfmmu_hat_enter(sfmmup); 1782 1783 /* 1784 * Swap in the TSB. hat_init() allocates tsbinfos without 1785 * TSBs, but we need one for init, since the kernel does some 1786 * special things to set up its stack and needs the TSB to 1787 * resolve page faults. 1788 */ 1789 sfmmu_tsb_swapin(sfmmup, hatlockp); 1790 1791 sfmmu_get_ctx(sfmmup); 1792 1793 sfmmu_hat_exit(hatlockp); 1794 } else { 1795 ASSERT(allocflag == HAT_ALLOC); 1796 1797 hatlockp = sfmmu_hat_enter(sfmmup); 1798 kpreempt_disable(); 1799 1800 CPUSET_ADD(sfmmup->sfmmu_cpusran, CPU->cpu_id); 1801 /* 1802 * sfmmu_setctx_sec takes <pgsz|cnum> as a parameter, 1803 * pagesize bits don't matter in this case since we are passing 1804 * INVALID_CONTEXT to it. 1805 * Compatibility Note: hw takes care of MMU_SCONTEXT1 1806 */ 1807 sfmmu_setctx_sec(INVALID_CONTEXT); 1808 sfmmu_clear_utsbinfo(); 1809 1810 kpreempt_enable(); 1811 sfmmu_hat_exit(hatlockp); 1812 } 1813 } 1814 1815 /* 1816 * Free all the translation resources for the specified address space. 1817 * Called from as_free when an address space is being destroyed. 
1818 */ 1819 void 1820 hat_free_start(struct hat *sfmmup) 1821 { 1822 ASSERT(AS_WRITE_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 1823 ASSERT(sfmmup != ksfmmup); 1824 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 1825 1826 sfmmup->sfmmu_free = 1; 1827 if (sfmmup->sfmmu_scdp != NULL) { 1828 sfmmu_leave_scd(sfmmup, 0); 1829 } 1830 1831 ASSERT(sfmmup->sfmmu_scdp == NULL); 1832 } 1833 1834 void 1835 hat_free_end(struct hat *sfmmup) 1836 { 1837 int i; 1838 1839 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 1840 ASSERT(sfmmup->sfmmu_free == 1); 1841 ASSERT(sfmmup->sfmmu_ttecnt[TTE8K] == 0); 1842 ASSERT(sfmmup->sfmmu_ttecnt[TTE64K] == 0); 1843 ASSERT(sfmmup->sfmmu_ttecnt[TTE512K] == 0); 1844 ASSERT(sfmmup->sfmmu_ttecnt[TTE4M] == 0); 1845 ASSERT(sfmmup->sfmmu_ttecnt[TTE32M] == 0); 1846 ASSERT(sfmmup->sfmmu_ttecnt[TTE256M] == 0); 1847 1848 if (sfmmup->sfmmu_rmstat) { 1849 hat_freestat(sfmmup->sfmmu_as, NULL); 1850 } 1851 1852 while (sfmmup->sfmmu_tsb != NULL) { 1853 struct tsb_info *next = sfmmup->sfmmu_tsb->tsb_next; 1854 sfmmu_tsbinfo_free(sfmmup->sfmmu_tsb); 1855 sfmmup->sfmmu_tsb = next; 1856 } 1857 1858 if (sfmmup->sfmmu_srdp != NULL) { 1859 sfmmu_leave_srd(sfmmup); 1860 ASSERT(sfmmup->sfmmu_srdp == NULL); 1861 for (i = 0; i < SFMMU_L1_HMERLINKS; i++) { 1862 if (sfmmup->sfmmu_hmeregion_links[i] != NULL) { 1863 kmem_free(sfmmup->sfmmu_hmeregion_links[i], 1864 SFMMU_L2_HMERLINKS_SIZE); 1865 sfmmup->sfmmu_hmeregion_links[i] = NULL; 1866 } 1867 } 1868 } 1869 sfmmu_free_sfmmu(sfmmup); 1870 1871 #ifdef DEBUG 1872 for (i = 0; i < SFMMU_L1_HMERLINKS; i++) { 1873 ASSERT(sfmmup->sfmmu_hmeregion_links[i] == NULL); 1874 } 1875 #endif 1876 1877 kmem_cache_free(sfmmuid_cache, sfmmup); 1878 } 1879 1880 /* 1881 * Set up any translation structures, for the specified address space, 1882 * that are needed or preferred when the process is being swapped in. 1883 */ 1884 /* ARGSUSED */ 1885 void 1886 hat_swapin(struct hat *hat) 1887 { 1888 ASSERT(hat->sfmmu_xhat_provider == NULL); 1889 } 1890 1891 /* 1892 * Free all of the translation resources, for the specified address space, 1893 * that can be freed while the process is swapped out. Called from as_swapout. 1894 * Also, free up the ctx that this process was using. 1895 */ 1896 void 1897 hat_swapout(struct hat *sfmmup) 1898 { 1899 struct hmehash_bucket *hmebp; 1900 struct hme_blk *hmeblkp; 1901 struct hme_blk *pr_hblk = NULL; 1902 struct hme_blk *nx_hblk; 1903 int i; 1904 struct hme_blk *list = NULL; 1905 hatlock_t *hatlockp; 1906 struct tsb_info *tsbinfop; 1907 struct free_tsb { 1908 struct free_tsb *next; 1909 struct tsb_info *tsbinfop; 1910 }; /* free list of TSBs */ 1911 struct free_tsb *freelist, *last, *next; 1912 1913 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 1914 SFMMU_STAT(sf_swapout); 1915 1916 /* 1917 * There is no way to go from an as to all its translations in sfmmu. 1918 * Here is one of the times when we take the big hit and traverse 1919 * the hash looking for hme_blks to free up. Not only do we free up 1920 * this as hme_blks but all those that are free. We are obviously 1921 * swapping because we need memory so let's free up as much 1922 * as we can. 1923 * 1924 * Note that we don't flush TLB/TSB here -- it's not necessary 1925 * because: 1926 * 1) we free the ctx we're using and throw away the TSB(s); 1927 * 2) processes aren't runnable while being swapped out. 
1928 */ 1929 ASSERT(sfmmup != KHATID); 1930 for (i = 0; i <= UHMEHASH_SZ; i++) { 1931 hmebp = &uhme_hash[i]; 1932 SFMMU_HASH_LOCK(hmebp); 1933 hmeblkp = hmebp->hmeblkp; 1934 pr_hblk = NULL; 1935 while (hmeblkp) { 1936 1937 ASSERT(!hmeblkp->hblk_xhat_bit); 1938 1939 if ((hmeblkp->hblk_tag.htag_id == sfmmup) && 1940 !hmeblkp->hblk_shw_bit && !hmeblkp->hblk_lckcnt) { 1941 ASSERT(!hmeblkp->hblk_shared); 1942 (void) sfmmu_hblk_unload(sfmmup, hmeblkp, 1943 (caddr_t)get_hblk_base(hmeblkp), 1944 get_hblk_endaddr(hmeblkp), 1945 NULL, HAT_UNLOAD); 1946 } 1947 nx_hblk = hmeblkp->hblk_next; 1948 if (!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) { 1949 ASSERT(!hmeblkp->hblk_lckcnt); 1950 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, 1951 &list, 0); 1952 } else { 1953 pr_hblk = hmeblkp; 1954 } 1955 hmeblkp = nx_hblk; 1956 } 1957 SFMMU_HASH_UNLOCK(hmebp); 1958 } 1959 1960 sfmmu_hblks_list_purge(&list, 0); 1961 1962 /* 1963 * Now free up the ctx so that others can reuse it. 1964 */ 1965 hatlockp = sfmmu_hat_enter(sfmmup); 1966 1967 sfmmu_invalidate_ctx(sfmmup); 1968 1969 /* 1970 * Free TSBs, but not tsbinfos, and set SWAPPED flag. 1971 * If TSBs were never swapped in, just return. 1972 * This implies that we don't support partial swapping 1973 * of TSBs -- either all are swapped out, or none are. 1974 * 1975 * We must hold the HAT lock here to prevent racing with another 1976 * thread trying to unmap TTEs from the TSB or running the post- 1977 * relocator after relocating the TSB's memory. Unfortunately, we 1978 * can't free memory while holding the HAT lock or we could 1979 * deadlock, so we build a list of TSBs to be freed after marking 1980 * the tsbinfos as swapped out and free them after dropping the 1981 * lock. 1982 */ 1983 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) { 1984 sfmmu_hat_exit(hatlockp); 1985 return; 1986 } 1987 1988 SFMMU_FLAGS_SET(sfmmup, HAT_SWAPPED); 1989 last = freelist = NULL; 1990 for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL; 1991 tsbinfop = tsbinfop->tsb_next) { 1992 ASSERT((tsbinfop->tsb_flags & TSB_SWAPPED) == 0); 1993 1994 /* 1995 * Cast the TSB into a struct free_tsb and put it on the free 1996 * list. 1997 */ 1998 if (freelist == NULL) { 1999 last = freelist = (struct free_tsb *)tsbinfop->tsb_va; 2000 } else { 2001 last->next = (struct free_tsb *)tsbinfop->tsb_va; 2002 last = last->next; 2003 } 2004 last->next = NULL; 2005 last->tsbinfop = tsbinfop; 2006 tsbinfop->tsb_flags |= TSB_SWAPPED; 2007 /* 2008 * Zero out the TTE to clear the valid bit. 2009 * Note we can't use a value like 0xbad because we want to 2010 * ensure diagnostic bits are NEVER set on TTEs that might 2011 * be loaded. The intent is to catch any invalid access 2012 * to the swapped TSB, such as a thread running with a valid 2013 * context without first calling sfmmu_tsb_swapin() to 2014 * allocate TSB memory. 2015 */ 2016 tsbinfop->tsb_tte.ll = 0; 2017 } 2018 2019 /* Now we can drop the lock and free the TSB memory. 
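 * (The list built above needs no allocation while the HAT lock is held
 * because the TSB's own memory is reused as the list nodes: each
 * tsb_va is cast to a struct free_tsb carrying the next pointer and
 * the owning tsbinfop, i.e. the idiom is simply
 *
 *	last->next = (struct free_tsb *)tsbinfop->tsb_va;
 *	last = last->next;
 *	last->tsbinfop = tsbinfop;
 *
 * which is safe because the TSB contents are dead once TSB_SWAPPED is
 * set and the ttes have been zeroed.)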
*/ 2020 sfmmu_hat_exit(hatlockp); 2021 for (; freelist != NULL; freelist = next) { 2022 next = freelist->next; 2023 sfmmu_tsb_free(freelist->tsbinfop); 2024 } 2025 } 2026 2027 /* 2028 * Duplicate the translations of an as into another newas 2029 */ 2030 /* ARGSUSED */ 2031 int 2032 hat_dup(struct hat *hat, struct hat *newhat, caddr_t addr, size_t len, 2033 uint_t flag) 2034 { 2035 sf_srd_t *srdp; 2036 sf_scd_t *scdp; 2037 int i; 2038 extern uint_t get_color_start(struct as *); 2039 2040 ASSERT(hat->sfmmu_xhat_provider == NULL); 2041 ASSERT((flag == 0) || (flag == HAT_DUP_ALL) || (flag == HAT_DUP_COW) || 2042 (flag == HAT_DUP_SRD)); 2043 ASSERT(hat != ksfmmup); 2044 ASSERT(newhat != ksfmmup); 2045 ASSERT(flag != HAT_DUP_ALL || hat->sfmmu_srdp == newhat->sfmmu_srdp); 2046 2047 if (flag == HAT_DUP_COW) { 2048 panic("hat_dup: HAT_DUP_COW not supported"); 2049 } 2050 2051 if (flag == HAT_DUP_SRD && ((srdp = hat->sfmmu_srdp) != NULL)) { 2052 ASSERT(srdp->srd_evp != NULL); 2053 VN_HOLD(srdp->srd_evp); 2054 ASSERT(srdp->srd_refcnt > 0); 2055 newhat->sfmmu_srdp = srdp; 2056 atomic_add_32((volatile uint_t *)&srdp->srd_refcnt, 1); 2057 } 2058 2059 /* 2060 * HAT_DUP_ALL flag is used after as duplication is done. 2061 */ 2062 if (flag == HAT_DUP_ALL && ((srdp = newhat->sfmmu_srdp) != NULL)) { 2063 ASSERT(newhat->sfmmu_srdp->srd_refcnt >= 2); 2064 newhat->sfmmu_rtteflags = hat->sfmmu_rtteflags; 2065 if (hat->sfmmu_flags & HAT_4MTEXT_FLAG) { 2066 newhat->sfmmu_flags |= HAT_4MTEXT_FLAG; 2067 } 2068 2069 /* check if need to join scd */ 2070 if ((scdp = hat->sfmmu_scdp) != NULL && 2071 newhat->sfmmu_scdp != scdp) { 2072 int ret; 2073 SF_RGNMAP_IS_SUBSET(&newhat->sfmmu_region_map, 2074 &scdp->scd_region_map, ret); 2075 ASSERT(ret); 2076 sfmmu_join_scd(scdp, newhat); 2077 ASSERT(newhat->sfmmu_scdp == scdp && 2078 scdp->scd_refcnt >= 2); 2079 for (i = 0; i < max_mmu_page_sizes; i++) { 2080 newhat->sfmmu_ismttecnt[i] = 2081 hat->sfmmu_ismttecnt[i]; 2082 newhat->sfmmu_scdismttecnt[i] = 2083 hat->sfmmu_scdismttecnt[i]; 2084 } 2085 } else if (&mmu_set_pgsz_order) { 2086 mmu_set_pgsz_order(newhat, 0); 2087 } 2088 2089 sfmmu_check_page_sizes(newhat, 1); 2090 } 2091 2092 if (flag == HAT_DUP_ALL && consistent_coloring == 0 && 2093 update_proc_pgcolorbase_after_fork != 0) { 2094 hat->sfmmu_clrbin = get_color_start(hat->sfmmu_as); 2095 } 2096 return (0); 2097 } 2098 2099 void 2100 hat_memload(struct hat *hat, caddr_t addr, struct page *pp, 2101 uint_t attr, uint_t flags) 2102 { 2103 hat_do_memload(hat, addr, pp, attr, flags, 2104 SFMMU_INVALID_SHMERID); 2105 } 2106 2107 void 2108 hat_memload_region(struct hat *hat, caddr_t addr, struct page *pp, 2109 uint_t attr, uint_t flags, hat_region_cookie_t rcookie) 2110 { 2111 uint_t rid; 2112 if (rcookie == HAT_INVALID_REGION_COOKIE || 2113 hat->sfmmu_xhat_provider != NULL) { 2114 hat_do_memload(hat, addr, pp, attr, flags, 2115 SFMMU_INVALID_SHMERID); 2116 return; 2117 } 2118 rid = (uint_t)((uint64_t)rcookie); 2119 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 2120 hat_do_memload(hat, addr, pp, attr, flags, rid); 2121 } 2122 2123 /* 2124 * Set up addr to map to page pp with protection prot. 2125 * As an optimization we also load the TSB with the 2126 * corresponding tte but it is no big deal if the tte gets kicked out. 
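 *
 * For illustration only (a hypothetical caller, not lifted from real
 * code), the public wrapper above is typically used as
 *
 *	pp = page_lookup(vp, off, SE_SHARED);
 *	hat_memload(as->a_hat, addr, pp, PROT_READ | PROT_WRITE, HAT_LOAD);
 *
 * i.e. the page must already be locked and addr page aligned, which is
 * exactly what the ASSERTs below enforce.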
2127 */ 2128 static void 2129 hat_do_memload(struct hat *hat, caddr_t addr, struct page *pp, 2130 uint_t attr, uint_t flags, uint_t rid) 2131 { 2132 tte_t tte; 2133 2134 2135 ASSERT(hat != NULL); 2136 ASSERT(PAGE_LOCKED(pp)); 2137 ASSERT(!((uintptr_t)addr & MMU_PAGEOFFSET)); 2138 ASSERT(!(flags & ~SFMMU_LOAD_ALLFLAG)); 2139 ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR)); 2140 SFMMU_VALIDATE_HMERID(hat, rid, addr, MMU_PAGESIZE); 2141 2142 if (PP_ISFREE(pp)) { 2143 panic("hat_memload: loading a mapping to free page %p", 2144 (void *)pp); 2145 } 2146 2147 if (hat->sfmmu_xhat_provider) { 2148 /* no regions for xhats */ 2149 ASSERT(!SFMMU_IS_SHMERID_VALID(rid)); 2150 XHAT_MEMLOAD(hat, addr, pp, attr, flags); 2151 return; 2152 } 2153 2154 ASSERT((hat == ksfmmup) || 2155 AS_LOCK_HELD(hat->sfmmu_as, &hat->sfmmu_as->a_lock)); 2156 2157 if (flags & ~SFMMU_LOAD_ALLFLAG) 2158 cmn_err(CE_NOTE, "hat_memload: unsupported flags %d", 2159 flags & ~SFMMU_LOAD_ALLFLAG); 2160 2161 if (hat->sfmmu_rmstat) 2162 hat_resvstat(MMU_PAGESIZE, hat->sfmmu_as, addr); 2163 2164 #if defined(SF_ERRATA_57) 2165 if ((hat != ksfmmup) && AS_TYPE_64BIT(hat->sfmmu_as) && 2166 (addr < errata57_limit) && (attr & PROT_EXEC) && 2167 !(flags & HAT_LOAD_SHARE)) { 2168 cmn_err(CE_WARN, "hat_memload: illegal attempt to make user " 2169 " page executable"); 2170 attr &= ~PROT_EXEC; 2171 } 2172 #endif 2173 2174 sfmmu_memtte(&tte, pp->p_pagenum, attr, TTE8K); 2175 (void) sfmmu_tteload_array(hat, &tte, addr, &pp, flags, rid); 2176 2177 /* 2178 * Check TSB and TLB page sizes. 2179 */ 2180 if ((flags & HAT_LOAD_SHARE) == 0) { 2181 sfmmu_check_page_sizes(hat, 1); 2182 } 2183 } 2184 2185 /* 2186 * hat_devload can be called to map real memory (e.g. 2187 * /dev/kmem) and even though hat_devload will determine pf is 2188 * for memory, it will be unable to get a shared lock on the 2189 * page (because someone else has it exclusively) and will 2190 * pass dp = NULL. If tteload doesn't get a non-NULL 2191 * page pointer it can't cache memory. 
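 *
 * An illustrative (hypothetical) call for a single non-memory page,
 * e.g. a driver mapping device registers into the kernel:
 *
 *	hat_devload(kas.a_hat, cvaddr, MMU_PAGESIZE, pfn,
 *	    PROT_READ | PROT_WRITE | HAT_NOSYNC, HAT_LOAD_LOCK);
 *
 * Because pf_is_memory(pfn) is false in that case, the code below adds
 * the uncacheable (and, for strictly ordered mappings, side-effect)
 * attributes before loading the tte.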
2192 */ 2193 void 2194 hat_devload(struct hat *hat, caddr_t addr, size_t len, pfn_t pfn, 2195 uint_t attr, int flags) 2196 { 2197 tte_t tte; 2198 struct page *pp = NULL; 2199 int use_lgpg = 0; 2200 2201 ASSERT(hat != NULL); 2202 2203 if (hat->sfmmu_xhat_provider) { 2204 XHAT_DEVLOAD(hat, addr, len, pfn, attr, flags); 2205 return; 2206 } 2207 2208 ASSERT(!(flags & ~SFMMU_LOAD_ALLFLAG)); 2209 ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR)); 2210 ASSERT((hat == ksfmmup) || 2211 AS_LOCK_HELD(hat->sfmmu_as, &hat->sfmmu_as->a_lock)); 2212 if (len == 0) 2213 panic("hat_devload: zero len"); 2214 if (flags & ~SFMMU_LOAD_ALLFLAG) 2215 cmn_err(CE_NOTE, "hat_devload: unsupported flags %d", 2216 flags & ~SFMMU_LOAD_ALLFLAG); 2217 2218 #if defined(SF_ERRATA_57) 2219 if ((hat != ksfmmup) && AS_TYPE_64BIT(hat->sfmmu_as) && 2220 (addr < errata57_limit) && (attr & PROT_EXEC) && 2221 !(flags & HAT_LOAD_SHARE)) { 2222 cmn_err(CE_WARN, "hat_devload: illegal attempt to make user " 2223 " page executable"); 2224 attr &= ~PROT_EXEC; 2225 } 2226 #endif 2227 2228 /* 2229 * If it's a memory page find its pp 2230 */ 2231 if (!(flags & HAT_LOAD_NOCONSIST) && pf_is_memory(pfn)) { 2232 pp = page_numtopp_nolock(pfn); 2233 if (pp == NULL) { 2234 flags |= HAT_LOAD_NOCONSIST; 2235 } else { 2236 if (PP_ISFREE(pp)) { 2237 panic("hat_memload: loading " 2238 "a mapping to free page %p", 2239 (void *)pp); 2240 } 2241 if (!PAGE_LOCKED(pp) && !PP_ISNORELOC(pp)) { 2242 panic("hat_memload: loading a mapping " 2243 "to unlocked relocatable page %p", 2244 (void *)pp); 2245 } 2246 ASSERT(len == MMU_PAGESIZE); 2247 } 2248 } 2249 2250 if (hat->sfmmu_rmstat) 2251 hat_resvstat(len, hat->sfmmu_as, addr); 2252 2253 if (flags & HAT_LOAD_NOCONSIST) { 2254 attr |= SFMMU_UNCACHEVTTE; 2255 use_lgpg = 1; 2256 } 2257 if (!pf_is_memory(pfn)) { 2258 attr |= SFMMU_UNCACHEPTTE | HAT_NOSYNC; 2259 use_lgpg = 1; 2260 switch (attr & HAT_ORDER_MASK) { 2261 case HAT_STRICTORDER: 2262 case HAT_UNORDERED_OK: 2263 /* 2264 * we set the side effect bit for all non 2265 * memory mappings unless merging is ok 2266 */ 2267 attr |= SFMMU_SIDEFFECT; 2268 break; 2269 case HAT_MERGING_OK: 2270 case HAT_LOADCACHING_OK: 2271 case HAT_STORECACHING_OK: 2272 break; 2273 default: 2274 panic("hat_devload: bad attr"); 2275 break; 2276 } 2277 } 2278 while (len) { 2279 if (!use_lgpg) { 2280 sfmmu_memtte(&tte, pfn, attr, TTE8K); 2281 (void) sfmmu_tteload_array(hat, &tte, addr, &pp, 2282 flags, SFMMU_INVALID_SHMERID); 2283 len -= MMU_PAGESIZE; 2284 addr += MMU_PAGESIZE; 2285 pfn++; 2286 continue; 2287 } 2288 /* 2289 * try to use large pages, check va/pa alignments 2290 * Note that 32M/256M page sizes are not (yet) supported. 
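 * For example, a 4M tte is attempted only when at least 4M of length
 * remains and both the virtual and the physical address are 4M
 * aligned, i.e. roughly
 *
 *	len >= MMU_PAGESIZE4M &&
 *	    ((uintptr_t)addr & MMU_PAGEOFFSET4M) == 0 &&
 *	    (mmu_ptob(pfn) & MMU_PAGEOFFSET4M) == 0
 *
 * otherwise the loop falls back to 512K, then 64K, then plain 8K pages.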
2291 */ 2292 if ((len >= MMU_PAGESIZE4M) && 2293 !((uintptr_t)addr & MMU_PAGEOFFSET4M) && 2294 !(disable_large_pages & (1 << TTE4M)) && 2295 !(mmu_ptob(pfn) & MMU_PAGEOFFSET4M)) { 2296 sfmmu_memtte(&tte, pfn, attr, TTE4M); 2297 (void) sfmmu_tteload_array(hat, &tte, addr, &pp, 2298 flags, SFMMU_INVALID_SHMERID); 2299 len -= MMU_PAGESIZE4M; 2300 addr += MMU_PAGESIZE4M; 2301 pfn += MMU_PAGESIZE4M / MMU_PAGESIZE; 2302 } else if ((len >= MMU_PAGESIZE512K) && 2303 !((uintptr_t)addr & MMU_PAGEOFFSET512K) && 2304 !(disable_large_pages & (1 << TTE512K)) && 2305 !(mmu_ptob(pfn) & MMU_PAGEOFFSET512K)) { 2306 sfmmu_memtte(&tte, pfn, attr, TTE512K); 2307 (void) sfmmu_tteload_array(hat, &tte, addr, &pp, 2308 flags, SFMMU_INVALID_SHMERID); 2309 len -= MMU_PAGESIZE512K; 2310 addr += MMU_PAGESIZE512K; 2311 pfn += MMU_PAGESIZE512K / MMU_PAGESIZE; 2312 } else if ((len >= MMU_PAGESIZE64K) && 2313 !((uintptr_t)addr & MMU_PAGEOFFSET64K) && 2314 !(disable_large_pages & (1 << TTE64K)) && 2315 !(mmu_ptob(pfn) & MMU_PAGEOFFSET64K)) { 2316 sfmmu_memtte(&tte, pfn, attr, TTE64K); 2317 (void) sfmmu_tteload_array(hat, &tte, addr, &pp, 2318 flags, SFMMU_INVALID_SHMERID); 2319 len -= MMU_PAGESIZE64K; 2320 addr += MMU_PAGESIZE64K; 2321 pfn += MMU_PAGESIZE64K / MMU_PAGESIZE; 2322 } else { 2323 sfmmu_memtte(&tte, pfn, attr, TTE8K); 2324 (void) sfmmu_tteload_array(hat, &tte, addr, &pp, 2325 flags, SFMMU_INVALID_SHMERID); 2326 len -= MMU_PAGESIZE; 2327 addr += MMU_PAGESIZE; 2328 pfn++; 2329 } 2330 } 2331 2332 /* 2333 * Check TSB and TLB page sizes. 2334 */ 2335 if ((flags & HAT_LOAD_SHARE) == 0) { 2336 sfmmu_check_page_sizes(hat, 1); 2337 } 2338 } 2339 2340 void 2341 hat_memload_array(struct hat *hat, caddr_t addr, size_t len, 2342 struct page **pps, uint_t attr, uint_t flags) 2343 { 2344 hat_do_memload_array(hat, addr, len, pps, attr, flags, 2345 SFMMU_INVALID_SHMERID); 2346 } 2347 2348 void 2349 hat_memload_array_region(struct hat *hat, caddr_t addr, size_t len, 2350 struct page **pps, uint_t attr, uint_t flags, 2351 hat_region_cookie_t rcookie) 2352 { 2353 uint_t rid; 2354 if (rcookie == HAT_INVALID_REGION_COOKIE || 2355 hat->sfmmu_xhat_provider != NULL) { 2356 hat_do_memload_array(hat, addr, len, pps, attr, flags, 2357 SFMMU_INVALID_SHMERID); 2358 return; 2359 } 2360 rid = (uint_t)((uint64_t)rcookie); 2361 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 2362 hat_do_memload_array(hat, addr, len, pps, attr, flags, rid); 2363 } 2364 2365 /* 2366 * Map the largest extend possible out of the page array. The array may NOT 2367 * be in order. The largest possible mapping a page can have 2368 * is specified in the p_szc field. The p_szc field 2369 * cannot change as long as there any mappings (large or small) 2370 * to any of the pages that make up the large page. (ie. any 2371 * promotion/demotion of page size is not up to the hat but up to 2372 * the page free list manager). The array 2373 * should consist of properly aligned contigous pages that are 2374 * part of a big page for a large mapping to be created. 
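 *
 * As a concrete example with an 8K base page size: a root page with
 * p_szc == TTE4M spans TTEPAGES(TTE4M) == 512 constituent pages, so a
 * 4M tte is attempted below only when npgs >= 512 and both addr and
 * the starting pfn are 512-page (4M) aligned; failing that,
 * progressively smaller sizes are tried, down to batching plain 8K
 * ttes through sfmmu_memload_batchsmall().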
2375 */ 2376 static void 2377 hat_do_memload_array(struct hat *hat, caddr_t addr, size_t len, 2378 struct page **pps, uint_t attr, uint_t flags, uint_t rid) 2379 { 2380 int ttesz; 2381 size_t mapsz; 2382 pgcnt_t numpg, npgs; 2383 tte_t tte; 2384 page_t *pp; 2385 uint_t large_pages_disable; 2386 2387 ASSERT(!((uintptr_t)addr & MMU_PAGEOFFSET)); 2388 SFMMU_VALIDATE_HMERID(hat, rid, addr, len); 2389 2390 if (hat->sfmmu_xhat_provider) { 2391 ASSERT(!SFMMU_IS_SHMERID_VALID(rid)); 2392 XHAT_MEMLOAD_ARRAY(hat, addr, len, pps, attr, flags); 2393 return; 2394 } 2395 2396 if (hat->sfmmu_rmstat) 2397 hat_resvstat(len, hat->sfmmu_as, addr); 2398 2399 #if defined(SF_ERRATA_57) 2400 if ((hat != ksfmmup) && AS_TYPE_64BIT(hat->sfmmu_as) && 2401 (addr < errata57_limit) && (attr & PROT_EXEC) && 2402 !(flags & HAT_LOAD_SHARE)) { 2403 cmn_err(CE_WARN, "hat_memload_array: illegal attempt to make " 2404 "user page executable"); 2405 attr &= ~PROT_EXEC; 2406 } 2407 #endif 2408 2409 /* Get number of pages */ 2410 npgs = len >> MMU_PAGESHIFT; 2411 2412 if (flags & HAT_LOAD_SHARE) { 2413 large_pages_disable = disable_ism_large_pages; 2414 } else { 2415 large_pages_disable = disable_large_pages; 2416 } 2417 2418 if (npgs < NHMENTS || large_pages_disable == LARGE_PAGES_OFF) { 2419 sfmmu_memload_batchsmall(hat, addr, pps, attr, flags, npgs, 2420 rid); 2421 return; 2422 } 2423 2424 while (npgs >= NHMENTS) { 2425 pp = *pps; 2426 for (ttesz = pp->p_szc; ttesz != TTE8K; ttesz--) { 2427 /* 2428 * Check if this page size is disabled. 2429 */ 2430 if (large_pages_disable & (1 << ttesz)) 2431 continue; 2432 2433 numpg = TTEPAGES(ttesz); 2434 mapsz = numpg << MMU_PAGESHIFT; 2435 if ((npgs >= numpg) && 2436 IS_P2ALIGNED(addr, mapsz) && 2437 IS_P2ALIGNED(pp->p_pagenum, numpg)) { 2438 /* 2439 * At this point we have enough pages and 2440 * we know the virtual address and the pfn 2441 * are properly aligned. We still need 2442 * to check for physical contiguity but since 2443 * it is very likely that this is the case 2444 * we will assume they are so and undo 2445 * the request if necessary. It would 2446 * be great if we could get a hint flag 2447 * like HAT_CONTIG which would tell us 2448 * the pages are contigous for sure. 2449 */ 2450 sfmmu_memtte(&tte, (*pps)->p_pagenum, 2451 attr, ttesz); 2452 if (!sfmmu_tteload_array(hat, &tte, addr, 2453 pps, flags, rid)) { 2454 break; 2455 } 2456 } 2457 } 2458 if (ttesz == TTE8K) { 2459 /* 2460 * We were not able to map array using a large page 2461 * batch a hmeblk or fraction at a time. 2462 */ 2463 numpg = ((uintptr_t)addr >> MMU_PAGESHIFT) 2464 & (NHMENTS-1); 2465 numpg = NHMENTS - numpg; 2466 ASSERT(numpg <= npgs); 2467 mapsz = numpg * MMU_PAGESIZE; 2468 sfmmu_memload_batchsmall(hat, addr, pps, attr, flags, 2469 numpg, rid); 2470 } 2471 addr += mapsz; 2472 npgs -= numpg; 2473 pps += numpg; 2474 } 2475 2476 if (npgs) { 2477 sfmmu_memload_batchsmall(hat, addr, pps, attr, flags, npgs, 2478 rid); 2479 } 2480 2481 /* 2482 * Check TSB and TLB page sizes. 2483 */ 2484 if ((flags & HAT_LOAD_SHARE) == 0) { 2485 sfmmu_check_page_sizes(hat, 1); 2486 } 2487 } 2488 2489 /* 2490 * Function tries to batch 8K pages into the same hme blk. 2491 */ 2492 static void 2493 sfmmu_memload_batchsmall(struct hat *hat, caddr_t vaddr, page_t **pps, 2494 uint_t attr, uint_t flags, pgcnt_t npgs, uint_t rid) 2495 { 2496 tte_t tte; 2497 page_t *pp; 2498 struct hmehash_bucket *hmebp; 2499 struct hme_blk *hmeblkp; 2500 int index; 2501 2502 while (npgs) { 2503 /* 2504 * Acquire the hash bucket. 
2505 */ 2506 hmebp = sfmmu_tteload_acquire_hashbucket(hat, vaddr, TTE8K, 2507 rid); 2508 ASSERT(hmebp); 2509 2510 /* 2511 * Find the hment block. 2512 */ 2513 hmeblkp = sfmmu_tteload_find_hmeblk(hat, hmebp, vaddr, 2514 TTE8K, flags, rid); 2515 ASSERT(hmeblkp); 2516 2517 do { 2518 /* 2519 * Make the tte. 2520 */ 2521 pp = *pps; 2522 sfmmu_memtte(&tte, pp->p_pagenum, attr, TTE8K); 2523 2524 /* 2525 * Add the translation. 2526 */ 2527 (void) sfmmu_tteload_addentry(hat, hmeblkp, &tte, 2528 vaddr, pps, flags, rid); 2529 2530 /* 2531 * Goto next page. 2532 */ 2533 pps++; 2534 npgs--; 2535 2536 /* 2537 * Goto next address. 2538 */ 2539 vaddr += MMU_PAGESIZE; 2540 2541 /* 2542 * Don't crossover into a different hmentblk. 2543 */ 2544 index = (int)(((uintptr_t)vaddr >> MMU_PAGESHIFT) & 2545 (NHMENTS-1)); 2546 2547 } while (index != 0 && npgs != 0); 2548 2549 /* 2550 * Release the hash bucket. 2551 */ 2552 2553 sfmmu_tteload_release_hashbucket(hmebp); 2554 } 2555 } 2556 2557 /* 2558 * Construct a tte for a page: 2559 * 2560 * tte_valid = 1 2561 * tte_size2 = size & TTE_SZ2_BITS (Panther and Olympus-C only) 2562 * tte_size = size 2563 * tte_nfo = attr & HAT_NOFAULT 2564 * tte_ie = attr & HAT_STRUCTURE_LE 2565 * tte_hmenum = hmenum 2566 * tte_pahi = pp->p_pagenum >> TTE_PASHIFT; 2567 * tte_palo = pp->p_pagenum & TTE_PALOMASK; 2568 * tte_ref = 1 (optimization) 2569 * tte_wr_perm = attr & PROT_WRITE; 2570 * tte_no_sync = attr & HAT_NOSYNC 2571 * tte_lock = attr & SFMMU_LOCKTTE 2572 * tte_cp = !(attr & SFMMU_UNCACHEPTTE) 2573 * tte_cv = !(attr & SFMMU_UNCACHEVTTE) 2574 * tte_e = attr & SFMMU_SIDEFFECT 2575 * tte_priv = !(attr & PROT_USER) 2576 * tte_hwwr = if nosync is set and it is writable we set the mod bit (opt) 2577 * tte_glb = 0 2578 */ 2579 void 2580 sfmmu_memtte(tte_t *ttep, pfn_t pfn, uint_t attr, int tte_sz) 2581 { 2582 ASSERT((attr & ~(SFMMU_LOAD_ALLATTR | HAT_ATTR_NOSOFTEXEC)) == 0); 2583 2584 ttep->tte_inthi = MAKE_TTE_INTHI(pfn, attr, tte_sz, 0 /* hmenum */); 2585 ttep->tte_intlo = MAKE_TTE_INTLO(pfn, attr, tte_sz, 0 /* hmenum */); 2586 2587 if (TTE_IS_NOSYNC(ttep)) { 2588 TTE_SET_REF(ttep); 2589 if (TTE_IS_WRITABLE(ttep)) { 2590 TTE_SET_MOD(ttep); 2591 } 2592 } 2593 if (TTE_IS_NFO(ttep) && TTE_IS_EXECUTABLE(ttep)) { 2594 panic("sfmmu_memtte: can't set both NFO and EXEC bits"); 2595 } 2596 2597 /* 2598 * Disable hardware execute permission to force a fault if 2599 * this page is executed, so we can detect the execution. Set 2600 * the soft exec bit to remember that this TTE has execute 2601 * permission. 2602 */ 2603 if (TTE_IS_EXECUTABLE(ttep) && (attr & HAT_ATTR_NOSOFTEXEC) == 0 && 2604 icache_is_coherent == 0) { 2605 TTE_CLR_EXEC(ttep); 2606 TTE_SET_SOFTEXEC(ttep); 2607 } 2608 } 2609 2610 /* 2611 * This function will add a translation to the hme_blk and allocate the 2612 * hme_blk if one does not exist. 2613 * If a page structure is specified then it will add the 2614 * corresponding hment to the mapping list. 2615 * It will also update the hmenum field for the tte. 2616 * 2617 * Currently this function is only used for kernel mappings. 2618 * So pass invalid region to sfmmu_tteload_array(). 2619 */ 2620 void 2621 sfmmu_tteload(struct hat *sfmmup, tte_t *ttep, caddr_t vaddr, page_t *pp, 2622 uint_t flags) 2623 { 2624 ASSERT(sfmmup == ksfmmup); 2625 (void) sfmmu_tteload_array(sfmmup, ttep, vaddr, &pp, flags, 2626 SFMMU_INVALID_SHMERID); 2627 } 2628 2629 /* 2630 * Load (ttep != NULL) or unload (ttep == NULL) one entry in the TSB. 
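 * (The TSB is a direct-mapped, software-managed cache of ttes; as an
 * illustrative sketch of what sfmmu_get_tsbe() computes, assuming a
 * power-of-two number of entries:
 *
 *	tsbe = tsb_base + ((vaddr >> vpshift) & (nentries - 1)) *
 *	    sizeof (struct tsbe);
 *
 * with vpshift being MMU_PAGESHIFT or MMU_PAGESHIFT4M as chosen below.)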
2631 * Assumes that a particular page size may only be resident in one TSB. 2632 */ 2633 static void 2634 sfmmu_mod_tsb(sfmmu_t *sfmmup, caddr_t vaddr, tte_t *ttep, int ttesz) 2635 { 2636 struct tsb_info *tsbinfop = NULL; 2637 uint64_t tag; 2638 struct tsbe *tsbe_addr; 2639 uint64_t tsb_base; 2640 uint_t tsb_size; 2641 int vpshift = MMU_PAGESHIFT; 2642 int phys = 0; 2643 2644 if (sfmmup == ksfmmup) { /* No support for 32/256M ksfmmu pages */ 2645 phys = ktsb_phys; 2646 if (ttesz >= TTE4M) { 2647 #ifndef sun4v 2648 ASSERT((ttesz != TTE32M) && (ttesz != TTE256M)); 2649 #endif 2650 tsb_base = (phys)? ktsb4m_pbase : (uint64_t)ktsb4m_base; 2651 tsb_size = ktsb4m_szcode; 2652 } else { 2653 tsb_base = (phys)? ktsb_pbase : (uint64_t)ktsb_base; 2654 tsb_size = ktsb_szcode; 2655 } 2656 } else { 2657 SFMMU_GET_TSBINFO(tsbinfop, sfmmup, ttesz); 2658 2659 /* 2660 * If there isn't a TSB for this page size, or the TSB is 2661 * swapped out, there is nothing to do. Note that the latter 2662 * case seems impossible but can occur if hat_pageunload() 2663 * is called on an ISM mapping while the process is swapped 2664 * out. 2665 */ 2666 if (tsbinfop == NULL || (tsbinfop->tsb_flags & TSB_SWAPPED)) 2667 return; 2668 2669 /* 2670 * If another thread is in the middle of relocating a TSB 2671 * we can't unload the entry so set a flag so that the 2672 * TSB will be flushed before it can be accessed by the 2673 * process. 2674 */ 2675 if ((tsbinfop->tsb_flags & TSB_RELOC_FLAG) != 0) { 2676 if (ttep == NULL) 2677 tsbinfop->tsb_flags |= TSB_FLUSH_NEEDED; 2678 return; 2679 } 2680 #if defined(UTSB_PHYS) 2681 phys = 1; 2682 tsb_base = (uint64_t)tsbinfop->tsb_pa; 2683 #else 2684 tsb_base = (uint64_t)tsbinfop->tsb_va; 2685 #endif 2686 tsb_size = tsbinfop->tsb_szc; 2687 } 2688 if (ttesz >= TTE4M) 2689 vpshift = MMU_PAGESHIFT4M; 2690 2691 tsbe_addr = sfmmu_get_tsbe(tsb_base, vaddr, vpshift, tsb_size); 2692 tag = sfmmu_make_tsbtag(vaddr); 2693 2694 if (ttep == NULL) { 2695 sfmmu_unload_tsbe(tsbe_addr, tag, phys); 2696 } else { 2697 if (ttesz >= TTE4M) { 2698 SFMMU_STAT(sf_tsb_load4m); 2699 } else { 2700 SFMMU_STAT(sf_tsb_load8k); 2701 } 2702 2703 sfmmu_load_tsbe(tsbe_addr, tag, ttep, phys); 2704 } 2705 } 2706 2707 /* 2708 * Unmap all entries from [start, end) matching the given page size. 2709 * 2710 * This function is used primarily to unmap replicated 64K or 512K entries 2711 * from the TSB that are inserted using the base page size TSB pointer, but 2712 * it may also be called to unmap a range of addresses from the TSB. 2713 */ 2714 void 2715 sfmmu_unload_tsb_range(sfmmu_t *sfmmup, caddr_t start, caddr_t end, int ttesz) 2716 { 2717 struct tsb_info *tsbinfop; 2718 uint64_t tag; 2719 struct tsbe *tsbe_addr; 2720 caddr_t vaddr; 2721 uint64_t tsb_base; 2722 int vpshift, vpgsz; 2723 uint_t tsb_size; 2724 int phys = 0; 2725 2726 /* 2727 * Assumptions: 2728 * If ttesz == 8K, 64K or 512K, we walk through the range 8K 2729 * at a time shooting down any valid entries we encounter. 2730 * 2731 * If ttesz >= 4M we walk the range 4M at a time shooting 2732 * down any valid mappings we find. 2733 */ 2734 if (sfmmup == ksfmmup) { 2735 phys = ktsb_phys; 2736 if (ttesz >= TTE4M) { 2737 #ifndef sun4v 2738 ASSERT((ttesz != TTE32M) && (ttesz != TTE256M)); 2739 #endif 2740 tsb_base = (phys)? ktsb4m_pbase : (uint64_t)ktsb4m_base; 2741 tsb_size = ktsb4m_szcode; 2742 } else { 2743 tsb_base = (phys)? 
ktsb_pbase : (uint64_t)ktsb_base; 2744 tsb_size = ktsb_szcode; 2745 } 2746 } else { 2747 SFMMU_GET_TSBINFO(tsbinfop, sfmmup, ttesz); 2748 2749 /* 2750 * If there isn't a TSB for this page size, or the TSB is 2751 * swapped out, there is nothing to do. Note that the latter 2752 * case seems impossible but can occur if hat_pageunload() 2753 * is called on an ISM mapping while the process is swapped 2754 * out. 2755 */ 2756 if (tsbinfop == NULL || (tsbinfop->tsb_flags & TSB_SWAPPED)) 2757 return; 2758 2759 /* 2760 * If another thread is in the middle of relocating a TSB 2761 * we can't unload the entry so set a flag so that the 2762 * TSB will be flushed before it can be accessed by the 2763 * process. 2764 */ 2765 if ((tsbinfop->tsb_flags & TSB_RELOC_FLAG) != 0) { 2766 tsbinfop->tsb_flags |= TSB_FLUSH_NEEDED; 2767 return; 2768 } 2769 #if defined(UTSB_PHYS) 2770 phys = 1; 2771 tsb_base = (uint64_t)tsbinfop->tsb_pa; 2772 #else 2773 tsb_base = (uint64_t)tsbinfop->tsb_va; 2774 #endif 2775 tsb_size = tsbinfop->tsb_szc; 2776 } 2777 if (ttesz >= TTE4M) { 2778 vpshift = MMU_PAGESHIFT4M; 2779 vpgsz = MMU_PAGESIZE4M; 2780 } else { 2781 vpshift = MMU_PAGESHIFT; 2782 vpgsz = MMU_PAGESIZE; 2783 } 2784 2785 for (vaddr = start; vaddr < end; vaddr += vpgsz) { 2786 tag = sfmmu_make_tsbtag(vaddr); 2787 tsbe_addr = sfmmu_get_tsbe(tsb_base, vaddr, vpshift, tsb_size); 2788 sfmmu_unload_tsbe(tsbe_addr, tag, phys); 2789 } 2790 } 2791 2792 /* 2793 * Select the optimum TSB size given the number of mappings 2794 * that need to be cached. 2795 */ 2796 static int 2797 sfmmu_select_tsb_szc(pgcnt_t pgcnt) 2798 { 2799 int szc = 0; 2800 2801 #ifdef DEBUG 2802 if (tsb_grow_stress) { 2803 uint32_t randval = (uint32_t)gettick() >> 4; 2804 return (randval % (tsb_max_growsize + 1)); 2805 } 2806 #endif /* DEBUG */ 2807 2808 while ((szc < tsb_max_growsize) && (pgcnt > SFMMU_RSS_TSBSIZE(szc))) 2809 szc++; 2810 return (szc); 2811 } 2812 2813 /* 2814 * This function will add a translation to the hme_blk and allocate the 2815 * hme_blk if one does not exist. 2816 * If a page structure is specified then it will add the 2817 * corresponding hment to the mapping list. 2818 * It will also update the hmenum field for the tte. 2819 * Furthermore, it attempts to create a large page translation 2820 * for <addr,hat> at page array pps. It assumes addr and first 2821 * pp is correctly aligned. It returns 0 if successful and 1 otherwise. 2822 */ 2823 static int 2824 sfmmu_tteload_array(sfmmu_t *sfmmup, tte_t *ttep, caddr_t vaddr, 2825 page_t **pps, uint_t flags, uint_t rid) 2826 { 2827 struct hmehash_bucket *hmebp; 2828 struct hme_blk *hmeblkp; 2829 int ret; 2830 uint_t size; 2831 2832 /* 2833 * Get mapping size. 2834 */ 2835 size = TTE_CSZ(ttep); 2836 ASSERT(!((uintptr_t)vaddr & TTE_PAGE_OFFSET(size))); 2837 2838 /* 2839 * Acquire the hash bucket. 2840 */ 2841 hmebp = sfmmu_tteload_acquire_hashbucket(sfmmup, vaddr, size, rid); 2842 ASSERT(hmebp); 2843 2844 /* 2845 * Find the hment block. 2846 */ 2847 hmeblkp = sfmmu_tteload_find_hmeblk(sfmmup, hmebp, vaddr, size, flags, 2848 rid); 2849 ASSERT(hmeblkp); 2850 2851 /* 2852 * Add the translation. 2853 */ 2854 ret = sfmmu_tteload_addentry(sfmmup, hmeblkp, ttep, vaddr, pps, flags, 2855 rid); 2856 2857 /* 2858 * Release the hash bucket. 2859 */ 2860 sfmmu_tteload_release_hashbucket(hmebp); 2861 2862 return (ret); 2863 } 2864 2865 /* 2866 * Function locks and returns a pointer to the hash bucket for vaddr and size. 
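 * (Sketch of the idea rather than the literal macro expansion: the
 * bucket index is a hash of the hat id -- or the srd for a shared
 * region -- with the virtual page number at this mapping size,
 *
 *	hmebp ~ hash(htagid, vaddr >> HME_HASH_SHIFT(size));
 *
 * so the same address hashes differently for 8K, 64K, 512K, ... ttes
 * and each rehash level must be searched separately.)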
2867 */ 2868 static struct hmehash_bucket * 2869 sfmmu_tteload_acquire_hashbucket(sfmmu_t *sfmmup, caddr_t vaddr, int size, 2870 uint_t rid) 2871 { 2872 struct hmehash_bucket *hmebp; 2873 int hmeshift; 2874 void *htagid = sfmmutohtagid(sfmmup, rid); 2875 2876 ASSERT(htagid != NULL); 2877 2878 hmeshift = HME_HASH_SHIFT(size); 2879 2880 hmebp = HME_HASH_FUNCTION(htagid, vaddr, hmeshift); 2881 2882 SFMMU_HASH_LOCK(hmebp); 2883 2884 return (hmebp); 2885 } 2886 2887 /* 2888 * Function returns a pointer to an hmeblk in the hash bucket, hmebp. If the 2889 * hmeblk doesn't exists for the [sfmmup, vaddr & size] signature, a hmeblk is 2890 * allocated. 2891 */ 2892 static struct hme_blk * 2893 sfmmu_tteload_find_hmeblk(sfmmu_t *sfmmup, struct hmehash_bucket *hmebp, 2894 caddr_t vaddr, uint_t size, uint_t flags, uint_t rid) 2895 { 2896 hmeblk_tag hblktag; 2897 int hmeshift; 2898 struct hme_blk *hmeblkp, *pr_hblk, *list = NULL; 2899 2900 SFMMU_VALIDATE_HMERID(sfmmup, rid, vaddr, TTEBYTES(size)); 2901 2902 hblktag.htag_id = sfmmutohtagid(sfmmup, rid); 2903 ASSERT(hblktag.htag_id != NULL); 2904 hmeshift = HME_HASH_SHIFT(size); 2905 hblktag.htag_bspage = HME_HASH_BSPAGE(vaddr, hmeshift); 2906 hblktag.htag_rehash = HME_HASH_REHASH(size); 2907 hblktag.htag_rid = rid; 2908 2909 ttearray_realloc: 2910 2911 HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, &list); 2912 2913 /* 2914 * We block until hblk_reserve_lock is released; it's held by 2915 * the thread, temporarily using hblk_reserve, until hblk_reserve is 2916 * replaced by a hblk from sfmmu8_cache. 2917 */ 2918 if (hmeblkp == (struct hme_blk *)hblk_reserve && 2919 hblk_reserve_thread != curthread) { 2920 SFMMU_HASH_UNLOCK(hmebp); 2921 mutex_enter(&hblk_reserve_lock); 2922 mutex_exit(&hblk_reserve_lock); 2923 SFMMU_STAT(sf_hblk_reserve_hit); 2924 SFMMU_HASH_LOCK(hmebp); 2925 goto ttearray_realloc; 2926 } 2927 2928 if (hmeblkp == NULL) { 2929 hmeblkp = sfmmu_hblk_alloc(sfmmup, vaddr, hmebp, size, 2930 hblktag, flags, rid); 2931 ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || hmeblkp->hblk_shared); 2932 ASSERT(SFMMU_IS_SHMERID_VALID(rid) || !hmeblkp->hblk_shared); 2933 } else { 2934 /* 2935 * It is possible for 8k and 64k hblks to collide since they 2936 * have the same rehash value. This is because we 2937 * lazily free hblks and 8K/64K blks could be lingering. 2938 * If we find size mismatch we free the block and & try again. 2939 */ 2940 if (get_hblk_ttesz(hmeblkp) != size) { 2941 ASSERT(!hmeblkp->hblk_vcnt); 2942 ASSERT(!hmeblkp->hblk_hmecnt); 2943 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, 2944 &list, 0); 2945 goto ttearray_realloc; 2946 } 2947 if (hmeblkp->hblk_shw_bit) { 2948 /* 2949 * if the hblk was previously used as a shadow hblk then 2950 * we will change it to a normal hblk 2951 */ 2952 ASSERT(!hmeblkp->hblk_shared); 2953 if (hmeblkp->hblk_shw_mask) { 2954 sfmmu_shadow_hcleanup(sfmmup, hmeblkp, hmebp); 2955 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 2956 goto ttearray_realloc; 2957 } else { 2958 hmeblkp->hblk_shw_bit = 0; 2959 } 2960 } 2961 SFMMU_STAT(sf_hblk_hit); 2962 } 2963 2964 /* 2965 * hat_memload() should never call kmem_cache_free() for kernel hmeblks; 2966 * see block comment showing the stacktrace in sfmmu_hblk_alloc(); 2967 * set the flag parameter to 1 so that sfmmu_hblks_list_purge() will 2968 * just add these hmeblks to the per-cpu pending queue. 
2969 */ 2970 sfmmu_hblks_list_purge(&list, 1); 2971 2972 ASSERT(get_hblk_ttesz(hmeblkp) == size); 2973 ASSERT(!hmeblkp->hblk_shw_bit); 2974 ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || hmeblkp->hblk_shared); 2975 ASSERT(SFMMU_IS_SHMERID_VALID(rid) || !hmeblkp->hblk_shared); 2976 ASSERT(hmeblkp->hblk_tag.htag_rid == rid); 2977 2978 return (hmeblkp); 2979 } 2980 2981 /* 2982 * Function adds a tte entry into the hmeblk. It returns 0 if successful and 1 2983 * otherwise. 2984 */ 2985 static int 2986 sfmmu_tteload_addentry(sfmmu_t *sfmmup, struct hme_blk *hmeblkp, tte_t *ttep, 2987 caddr_t vaddr, page_t **pps, uint_t flags, uint_t rid) 2988 { 2989 page_t *pp = *pps; 2990 int hmenum, size, remap; 2991 tte_t tteold, flush_tte; 2992 #ifdef DEBUG 2993 tte_t orig_old; 2994 #endif /* DEBUG */ 2995 struct sf_hment *sfhme; 2996 kmutex_t *pml, *pmtx; 2997 hatlock_t *hatlockp; 2998 int myflt; 2999 3000 /* 3001 * remove this panic when we decide to let user virtual address 3002 * space be >= USERLIMIT. 3003 */ 3004 if (!TTE_IS_PRIVILEGED(ttep) && vaddr >= (caddr_t)USERLIMIT) 3005 panic("user addr %p in kernel space", (void *)vaddr); 3006 #if defined(TTE_IS_GLOBAL) 3007 if (TTE_IS_GLOBAL(ttep)) 3008 panic("sfmmu_tteload: creating global tte"); 3009 #endif 3010 3011 #ifdef DEBUG 3012 if (pf_is_memory(sfmmu_ttetopfn(ttep, vaddr)) && 3013 !TTE_IS_PCACHEABLE(ttep) && !sfmmu_allow_nc_trans) 3014 panic("sfmmu_tteload: non cacheable memory tte"); 3015 #endif /* DEBUG */ 3016 3017 /* don't simulate dirty bit for writeable ISM/DISM mappings */ 3018 if ((flags & HAT_LOAD_SHARE) && TTE_IS_WRITABLE(ttep)) { 3019 TTE_SET_REF(ttep); 3020 TTE_SET_MOD(ttep); 3021 } 3022 3023 if ((flags & HAT_LOAD_SHARE) || !TTE_IS_REF(ttep) || 3024 !TTE_IS_MOD(ttep)) { 3025 /* 3026 * Don't load TSB for dummy as in ISM. Also don't preload 3027 * the TSB if the TTE isn't writable since we're likely to 3028 * fault on it again -- preloading can be fairly expensive. 3029 */ 3030 flags |= SFMMU_NO_TSBLOAD; 3031 } 3032 3033 size = TTE_CSZ(ttep); 3034 switch (size) { 3035 case TTE8K: 3036 SFMMU_STAT(sf_tteload8k); 3037 break; 3038 case TTE64K: 3039 SFMMU_STAT(sf_tteload64k); 3040 break; 3041 case TTE512K: 3042 SFMMU_STAT(sf_tteload512k); 3043 break; 3044 case TTE4M: 3045 SFMMU_STAT(sf_tteload4m); 3046 break; 3047 case (TTE32M): 3048 SFMMU_STAT(sf_tteload32m); 3049 ASSERT(mmu_page_sizes == max_mmu_page_sizes); 3050 break; 3051 case (TTE256M): 3052 SFMMU_STAT(sf_tteload256m); 3053 ASSERT(mmu_page_sizes == max_mmu_page_sizes); 3054 break; 3055 } 3056 3057 ASSERT(!((uintptr_t)vaddr & TTE_PAGE_OFFSET(size))); 3058 SFMMU_VALIDATE_HMERID(sfmmup, rid, vaddr, TTEBYTES(size)); 3059 ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || hmeblkp->hblk_shared); 3060 ASSERT(SFMMU_IS_SHMERID_VALID(rid) || !hmeblkp->hblk_shared); 3061 3062 HBLKTOHME_IDX(sfhme, hmeblkp, vaddr, hmenum); 3063 3064 /* 3065 * Need to grab mlist lock here so that pageunload 3066 * will not change tte behind us. 3067 */ 3068 if (pp) { 3069 pml = sfmmu_mlist_enter(pp); 3070 } 3071 3072 sfmmu_copytte(&sfhme->hme_tte, &tteold); 3073 /* 3074 * Look for corresponding hment and if valid verify 3075 * pfns are equal. 
3076 */ 3077 remap = TTE_IS_VALID(&tteold); 3078 if (remap) { 3079 pfn_t new_pfn, old_pfn; 3080 3081 old_pfn = TTE_TO_PFN(vaddr, &tteold); 3082 new_pfn = TTE_TO_PFN(vaddr, ttep); 3083 3084 if (flags & HAT_LOAD_REMAP) { 3085 /* make sure we are remapping same type of pages */ 3086 if (pf_is_memory(old_pfn) != pf_is_memory(new_pfn)) { 3087 panic("sfmmu_tteload - tte remap io<->memory"); 3088 } 3089 if (old_pfn != new_pfn && 3090 (pp != NULL || sfhme->hme_page != NULL)) { 3091 panic("sfmmu_tteload - tte remap pp != NULL"); 3092 } 3093 } else if (old_pfn != new_pfn) { 3094 panic("sfmmu_tteload - tte remap, hmeblkp 0x%p", 3095 (void *)hmeblkp); 3096 } 3097 ASSERT(TTE_CSZ(&tteold) == TTE_CSZ(ttep)); 3098 3099 if (TTE_IS_EXECUTABLE(&tteold) && TTE_IS_SOFTEXEC(ttep)) { 3100 TTE_SET_EXEC(ttep); 3101 } 3102 } 3103 3104 if (pp) { 3105 /* 3106 * If we know that this page will be executed, because 3107 * it was in the past (PP_ISEXEC is already true), or 3108 * if the caller says it will likely be executed 3109 * (HAT_LOAD_TEXT is true), then there is no need to 3110 * dynamically detect execution with a soft exec 3111 * fault. Enable hardware execute permission now. 3112 */ 3113 if ((PP_ISEXEC(pp) || (flags & HAT_LOAD_TEXT)) && 3114 TTE_IS_SOFTEXEC(ttep)) { 3115 TTE_SET_EXEC(ttep); 3116 } 3117 3118 if (size == TTE8K) { 3119 #ifdef VAC 3120 /* 3121 * Handle VAC consistency 3122 */ 3123 if (!remap && (cache & CACHE_VAC) && !PP_ISNC(pp)) { 3124 sfmmu_vac_conflict(sfmmup, vaddr, pp); 3125 } 3126 #endif 3127 3128 if (TTE_IS_WRITABLE(ttep) && PP_ISRO(pp)) { 3129 pmtx = sfmmu_page_enter(pp); 3130 PP_CLRRO(pp); 3131 sfmmu_page_exit(pmtx); 3132 } else if (!PP_ISMAPPED(pp) && 3133 (!TTE_IS_WRITABLE(ttep)) && !(PP_ISMOD(pp))) { 3134 pmtx = sfmmu_page_enter(pp); 3135 if (!(PP_ISMOD(pp))) { 3136 PP_SETRO(pp); 3137 } 3138 sfmmu_page_exit(pmtx); 3139 } 3140 3141 if (TTE_EXECUTED(ttep)) { 3142 pmtx = sfmmu_page_enter(pp); 3143 PP_SETEXEC(pp); 3144 sfmmu_page_exit(pmtx); 3145 } 3146 3147 } else if (sfmmu_pagearray_setup(vaddr, pps, ttep, remap)) { 3148 /* 3149 * sfmmu_pagearray_setup failed so return 3150 */ 3151 sfmmu_mlist_exit(pml); 3152 return (1); 3153 } 3154 3155 } else if (TTE_IS_SOFTEXEC(ttep)) { 3156 TTE_SET_EXEC(ttep); 3157 } 3158 3159 /* 3160 * Make sure hment is not on a mapping list. 3161 */ 3162 ASSERT(remap || (sfhme->hme_page == NULL)); 3163 3164 /* if it is not a remap then hme->next better be NULL */ 3165 ASSERT((!remap) ? sfhme->hme_next == NULL : 1); 3166 3167 if (flags & HAT_LOAD_LOCK) { 3168 if ((hmeblkp->hblk_lckcnt + 1) >= MAX_HBLK_LCKCNT) { 3169 panic("too high lckcnt-hmeblk %p", 3170 (void *)hmeblkp); 3171 } 3172 atomic_add_32(&hmeblkp->hblk_lckcnt, 1); 3173 3174 HBLK_STACK_TRACE(hmeblkp, HBLK_LOCK); 3175 } 3176 3177 #ifdef VAC 3178 if (pp && PP_ISNC(pp)) { 3179 /* 3180 * If the physical page is marked to be uncacheable, like 3181 * by a vac conflict, make sure the new mapping is also 3182 * uncacheable. 
3183 */ 3184 TTE_CLR_VCACHEABLE(ttep); 3185 ASSERT(PP_GET_VCOLOR(pp) == NO_VCOLOR); 3186 } 3187 #endif 3188 ttep->tte_hmenum = hmenum; 3189 3190 #ifdef DEBUG 3191 orig_old = tteold; 3192 #endif /* DEBUG */ 3193 3194 while (sfmmu_modifytte_try(&tteold, ttep, &sfhme->hme_tte) < 0) { 3195 if ((sfmmup == KHATID) && 3196 (flags & (HAT_LOAD_LOCK | HAT_LOAD_REMAP))) { 3197 sfmmu_copytte(&sfhme->hme_tte, &tteold); 3198 } 3199 #ifdef DEBUG 3200 chk_tte(&orig_old, &tteold, ttep, hmeblkp); 3201 #endif /* DEBUG */ 3202 } 3203 ASSERT(TTE_IS_VALID(&sfhme->hme_tte)); 3204 3205 if (!TTE_IS_VALID(&tteold)) { 3206 3207 atomic_add_16(&hmeblkp->hblk_vcnt, 1); 3208 if (rid == SFMMU_INVALID_SHMERID) { 3209 atomic_add_long(&sfmmup->sfmmu_ttecnt[size], 1); 3210 } else { 3211 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 3212 sf_region_t *rgnp = srdp->srd_hmergnp[rid]; 3213 /* 3214 * We already accounted for region ttecnt's in sfmmu 3215 * during hat_join_region() processing. Here we 3216 * only update ttecnt's in region struture. 3217 */ 3218 atomic_add_long(&rgnp->rgn_ttecnt[size], 1); 3219 } 3220 } 3221 3222 myflt = (astosfmmu(curthread->t_procp->p_as) == sfmmup); 3223 if (size > TTE8K && (flags & HAT_LOAD_SHARE) == 0 && 3224 sfmmup != ksfmmup) { 3225 uchar_t tteflag = 1 << size; 3226 if (rid == SFMMU_INVALID_SHMERID) { 3227 if (!(sfmmup->sfmmu_tteflags & tteflag)) { 3228 hatlockp = sfmmu_hat_enter(sfmmup); 3229 sfmmup->sfmmu_tteflags |= tteflag; 3230 if (&mmu_set_pgsz_order) { 3231 mmu_set_pgsz_order(sfmmup, 1); 3232 } 3233 sfmmu_hat_exit(hatlockp); 3234 } 3235 } else if (!(sfmmup->sfmmu_rtteflags & tteflag)) { 3236 hatlockp = sfmmu_hat_enter(sfmmup); 3237 sfmmup->sfmmu_rtteflags |= tteflag; 3238 if (&mmu_set_pgsz_order && sfmmup != ksfmmup) { 3239 mmu_set_pgsz_order(sfmmup, 1); 3240 } 3241 sfmmu_hat_exit(hatlockp); 3242 } 3243 /* 3244 * Update the current CPU tsbmiss area, so the current thread 3245 * won't need to take the tsbmiss for the new pagesize. 3246 * The other threads in the process will update their tsb 3247 * miss area lazily in sfmmu_tsbmiss_exception() when they 3248 * fail to find the translation for a newly added pagesize. 3249 */ 3250 if (size > TTE64K && myflt) { 3251 struct tsbmiss *tsbmp; 3252 kpreempt_disable(); 3253 tsbmp = &tsbmiss_area[CPU->cpu_id]; 3254 if (rid == SFMMU_INVALID_SHMERID) { 3255 if (!(tsbmp->uhat_tteflags & tteflag)) { 3256 tsbmp->uhat_tteflags |= tteflag; 3257 } 3258 } else { 3259 if (!(tsbmp->uhat_rtteflags & tteflag)) { 3260 tsbmp->uhat_rtteflags |= tteflag; 3261 } 3262 } 3263 kpreempt_enable(); 3264 } 3265 } 3266 3267 if (size >= TTE4M && (flags & HAT_LOAD_TEXT) && 3268 !SFMMU_FLAGS_ISSET(sfmmup, HAT_4MTEXT_FLAG)) { 3269 hatlockp = sfmmu_hat_enter(sfmmup); 3270 SFMMU_FLAGS_SET(sfmmup, HAT_4MTEXT_FLAG); 3271 sfmmu_hat_exit(hatlockp); 3272 } 3273 3274 flush_tte.tte_intlo = (tteold.tte_intlo ^ ttep->tte_intlo) & 3275 hw_tte.tte_intlo; 3276 flush_tte.tte_inthi = (tteold.tte_inthi ^ ttep->tte_inthi) & 3277 hw_tte.tte_inthi; 3278 3279 if (remap && (flush_tte.tte_inthi || flush_tte.tte_intlo)) { 3280 /* 3281 * If remap and new tte differs from old tte we need 3282 * to sync the mod bit and flush TLB/TSB. We don't 3283 * need to sync ref bit because we currently always set 3284 * ref bit in tteload. 
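 * (flush_tte computed above is just the XOR of the old and new tte
 * words masked by hw_tte, so only bits the hardware actually
 * interprets count as a difference; for example a change in the
 * writable bit yields a non-zero flush_tte and forces the demap
 * below, while flipping a software-only bit does not.)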
3285 */ 3286 ASSERT(TTE_IS_REF(ttep)); 3287 if (TTE_IS_MOD(&tteold) || (TTE_EXECUTED(&tteold) && 3288 !TTE_IS_EXECUTABLE(ttep))) { 3289 sfmmu_ttesync(sfmmup, vaddr, &tteold, pp); 3290 } 3291 /* 3292 * hwtte bits shouldn't change for SRD hmeblks as long as SRD 3293 * hmes are only used for read only text. Adding this code for 3294 * completeness and future use of shared hmeblks with writable 3295 * mappings of VMODSORT vnodes. 3296 */ 3297 if (hmeblkp->hblk_shared) { 3298 cpuset_t cpuset = sfmmu_rgntlb_demap(vaddr, 3299 sfmmup->sfmmu_srdp->srd_hmergnp[rid], hmeblkp, 1); 3300 xt_sync(cpuset); 3301 SFMMU_STAT_ADD(sf_region_remap_demap, 1); 3302 } else { 3303 sfmmu_tlb_demap(vaddr, sfmmup, hmeblkp, 0, 0); 3304 xt_sync(sfmmup->sfmmu_cpusran); 3305 } 3306 } 3307 3308 if ((flags & SFMMU_NO_TSBLOAD) == 0) { 3309 /* 3310 * We only preload 8K and 4M mappings into the TSB, since 3311 * 64K and 512K mappings are replicated and hence don't 3312 * have a single, unique TSB entry. Ditto for 32M/256M. 3313 */ 3314 if (size == TTE8K || size == TTE4M) { 3315 sf_scd_t *scdp; 3316 hatlockp = sfmmu_hat_enter(sfmmup); 3317 /* 3318 * Don't preload private TSB if the mapping is used 3319 * by the shctx in the SCD. 3320 */ 3321 scdp = sfmmup->sfmmu_scdp; 3322 if (rid == SFMMU_INVALID_SHMERID || scdp == NULL || 3323 !SF_RGNMAP_TEST(scdp->scd_hmeregion_map, rid)) { 3324 sfmmu_load_tsb(sfmmup, vaddr, &sfhme->hme_tte, 3325 size); 3326 } 3327 sfmmu_hat_exit(hatlockp); 3328 } 3329 } 3330 if (pp) { 3331 if (!remap) { 3332 HME_ADD(sfhme, pp); 3333 atomic_add_16(&hmeblkp->hblk_hmecnt, 1); 3334 ASSERT(hmeblkp->hblk_hmecnt > 0); 3335 3336 /* 3337 * Cannot ASSERT(hmeblkp->hblk_hmecnt <= NHMENTS) 3338 * see pageunload() for comment. 3339 */ 3340 } 3341 sfmmu_mlist_exit(pml); 3342 } 3343 3344 return (0); 3345 } 3346 /* 3347 * Function unlocks hash bucket. 3348 */ 3349 static void 3350 sfmmu_tteload_release_hashbucket(struct hmehash_bucket *hmebp) 3351 { 3352 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 3353 SFMMU_HASH_UNLOCK(hmebp); 3354 } 3355 3356 /* 3357 * function which checks and sets up page array for a large 3358 * translation. Will set p_vcolor, p_index, p_ro fields. 3359 * Assumes addr and pfnum of first page are properly aligned. 3360 * Will check for physical contiguity. If check fails it return 3361 * non null. 3362 */ 3363 static int 3364 sfmmu_pagearray_setup(caddr_t addr, page_t **pps, tte_t *ttep, int remap) 3365 { 3366 int i, index, ttesz; 3367 pfn_t pfnum; 3368 pgcnt_t npgs; 3369 page_t *pp, *pp1; 3370 kmutex_t *pmtx; 3371 #ifdef VAC 3372 int osz; 3373 int cflags = 0; 3374 int vac_err = 0; 3375 #endif 3376 int newidx = 0; 3377 3378 ttesz = TTE_CSZ(ttep); 3379 3380 ASSERT(ttesz > TTE8K); 3381 3382 npgs = TTEPAGES(ttesz); 3383 index = PAGESZ_TO_INDEX(ttesz); 3384 3385 pfnum = (*pps)->p_pagenum; 3386 ASSERT(IS_P2ALIGNED(pfnum, npgs)); 3387 3388 /* 3389 * Save the first pp so we can do HAT_TMPNC at the end. 3390 */ 3391 pp1 = *pps; 3392 #ifdef VAC 3393 osz = fnd_mapping_sz(pp1); 3394 #endif 3395 3396 for (i = 0; i < npgs; i++, pps++) { 3397 pp = *pps; 3398 ASSERT(PAGE_LOCKED(pp)); 3399 ASSERT(pp->p_szc >= ttesz); 3400 ASSERT(pp->p_szc == pp1->p_szc); 3401 ASSERT(sfmmu_mlist_held(pp)); 3402 3403 /* 3404 * XXX is it possible to maintain P_RO on the root only? 
3405 */ 3406 if (TTE_IS_WRITABLE(ttep) && PP_ISRO(pp)) { 3407 pmtx = sfmmu_page_enter(pp); 3408 PP_CLRRO(pp); 3409 sfmmu_page_exit(pmtx); 3410 } else if (!PP_ISMAPPED(pp) && !TTE_IS_WRITABLE(ttep) && 3411 !PP_ISMOD(pp)) { 3412 pmtx = sfmmu_page_enter(pp); 3413 if (!(PP_ISMOD(pp))) { 3414 PP_SETRO(pp); 3415 } 3416 sfmmu_page_exit(pmtx); 3417 } 3418 3419 if (TTE_EXECUTED(ttep)) { 3420 pmtx = sfmmu_page_enter(pp); 3421 PP_SETEXEC(pp); 3422 sfmmu_page_exit(pmtx); 3423 } 3424 3425 /* 3426 * If this is a remap we skip vac & contiguity checks. 3427 */ 3428 if (remap) 3429 continue; 3430 3431 /* 3432 * set p_vcolor and detect any vac conflicts. 3433 */ 3434 #ifdef VAC 3435 if (vac_err == 0) { 3436 vac_err = sfmmu_vacconflict_array(addr, pp, &cflags); 3437 3438 } 3439 #endif 3440 3441 /* 3442 * Save current index in case we need to undo it. 3443 * Note: "PAGESZ_TO_INDEX(sz) (1 << (sz))" 3444 * "SFMMU_INDEX_SHIFT 6" 3445 * "SFMMU_INDEX_MASK ((1 << SFMMU_INDEX_SHIFT) - 1)" 3446 * "PP_MAPINDEX(p_index) (p_index & SFMMU_INDEX_MASK)" 3447 * 3448 * So: index = PAGESZ_TO_INDEX(ttesz); 3449 * if ttesz == 1 then index = 0x2 3450 * 2 then index = 0x4 3451 * 3 then index = 0x8 3452 * 4 then index = 0x10 3453 * 5 then index = 0x20 3454 * The code below checks if it's a new pagesize (ie, newidx) 3455 * in case we need to take it back out of p_index, 3456 * and then or's the new index into the existing index. 3457 */ 3458 if ((PP_MAPINDEX(pp) & index) == 0) 3459 newidx = 1; 3460 pp->p_index = (PP_MAPINDEX(pp) | index); 3461 3462 /* 3463 * contiguity check 3464 */ 3465 if (pp->p_pagenum != pfnum) { 3466 /* 3467 * If we fail the contiguity test then 3468 * the only thing we need to fix is the p_index field. 3469 * We might get a few extra flushes but since this 3470 * path is rare that is ok. The p_ro field will 3471 * get automatically fixed on the next tteload to 3472 * the page. NO TNC bit is set yet. 3473 */ 3474 while (i >= 0) { 3475 pp = *pps; 3476 if (newidx) 3477 pp->p_index = (PP_MAPINDEX(pp) & 3478 ~index); 3479 pps--; 3480 i--; 3481 } 3482 return (1); 3483 } 3484 pfnum++; 3485 addr += MMU_PAGESIZE; 3486 } 3487 3488 #ifdef VAC 3489 if (vac_err) { 3490 if (ttesz > osz) { 3491 /* 3492 * There are some smaller mappings that causes vac 3493 * conflicts. Convert all existing small mappings to 3494 * TNC. 3495 */ 3496 SFMMU_STAT_ADD(sf_uncache_conflict, npgs); 3497 sfmmu_page_cache_array(pp1, HAT_TMPNC, CACHE_FLUSH, 3498 npgs); 3499 } else { 3500 /* EMPTY */ 3501 /* 3502 * If there exists an big page mapping, 3503 * that means the whole existing big page 3504 * has TNC setting already. No need to covert to 3505 * TNC again. 3506 */ 3507 ASSERT(PP_ISTNC(pp1)); 3508 } 3509 } 3510 #endif /* VAC */ 3511 3512 return (0); 3513 } 3514 3515 #ifdef VAC 3516 /* 3517 * Routine that detects vac consistency for a large page. It also 3518 * sets virtual color for all pp's for this big mapping. 
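 * (Background, as a rough sketch: with a virtually indexed cache the
 * "color" of an address is the part of the VA above the page offset
 * that still selects a cache bin, conceptually
 *
 *	vcolor = (addr >> MMU_PAGESHIFT) & (number_of_colors - 1);
 *
 * two mappings of one physical page with different colors would alias
 * in the cache, which is the conflict detected here.)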
3519 */ 3520 static int 3521 sfmmu_vacconflict_array(caddr_t addr, page_t *pp, int *cflags) 3522 { 3523 int vcolor, ocolor; 3524 3525 ASSERT(sfmmu_mlist_held(pp)); 3526 3527 if (PP_ISNC(pp)) { 3528 return (HAT_TMPNC); 3529 } 3530 3531 vcolor = addr_to_vcolor(addr); 3532 if (PP_NEWPAGE(pp)) { 3533 PP_SET_VCOLOR(pp, vcolor); 3534 return (0); 3535 } 3536 3537 ocolor = PP_GET_VCOLOR(pp); 3538 if (ocolor == vcolor) { 3539 return (0); 3540 } 3541 3542 if (!PP_ISMAPPED(pp) && !PP_ISMAPPED_KPM(pp)) { 3543 /* 3544 * Previous user of page had a differnet color 3545 * but since there are no current users 3546 * we just flush the cache and change the color. 3547 * As an optimization for large pages we flush the 3548 * entire cache of that color and set a flag. 3549 */ 3550 SFMMU_STAT(sf_pgcolor_conflict); 3551 if (!CacheColor_IsFlushed(*cflags, ocolor)) { 3552 CacheColor_SetFlushed(*cflags, ocolor); 3553 sfmmu_cache_flushcolor(ocolor, pp->p_pagenum); 3554 } 3555 PP_SET_VCOLOR(pp, vcolor); 3556 return (0); 3557 } 3558 3559 /* 3560 * We got a real conflict with a current mapping. 3561 * set flags to start unencaching all mappings 3562 * and return failure so we restart looping 3563 * the pp array from the beginning. 3564 */ 3565 return (HAT_TMPNC); 3566 } 3567 #endif /* VAC */ 3568 3569 /* 3570 * creates a large page shadow hmeblk for a tte. 3571 * The purpose of this routine is to allow us to do quick unloads because 3572 * the vm layer can easily pass a very large but sparsely populated range. 3573 */ 3574 static struct hme_blk * 3575 sfmmu_shadow_hcreate(sfmmu_t *sfmmup, caddr_t vaddr, int ttesz, uint_t flags) 3576 { 3577 struct hmehash_bucket *hmebp; 3578 hmeblk_tag hblktag; 3579 int hmeshift, size, vshift; 3580 uint_t shw_mask, newshw_mask; 3581 struct hme_blk *hmeblkp; 3582 3583 ASSERT(sfmmup != KHATID); 3584 if (mmu_page_sizes == max_mmu_page_sizes) { 3585 ASSERT(ttesz < TTE256M); 3586 } else { 3587 ASSERT(ttesz < TTE4M); 3588 ASSERT(sfmmup->sfmmu_ttecnt[TTE32M] == 0); 3589 ASSERT(sfmmup->sfmmu_ttecnt[TTE256M] == 0); 3590 } 3591 3592 if (ttesz == TTE8K) { 3593 size = TTE512K; 3594 } else { 3595 size = ++ttesz; 3596 } 3597 3598 hblktag.htag_id = sfmmup; 3599 hmeshift = HME_HASH_SHIFT(size); 3600 hblktag.htag_bspage = HME_HASH_BSPAGE(vaddr, hmeshift); 3601 hblktag.htag_rehash = HME_HASH_REHASH(size); 3602 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 3603 hmebp = HME_HASH_FUNCTION(sfmmup, vaddr, hmeshift); 3604 3605 SFMMU_HASH_LOCK(hmebp); 3606 3607 HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp); 3608 ASSERT(hmeblkp != (struct hme_blk *)hblk_reserve); 3609 if (hmeblkp == NULL) { 3610 hmeblkp = sfmmu_hblk_alloc(sfmmup, vaddr, hmebp, size, 3611 hblktag, flags, SFMMU_INVALID_SHMERID); 3612 } 3613 ASSERT(hmeblkp); 3614 if (!hmeblkp->hblk_shw_mask) { 3615 /* 3616 * if this is a unused hblk it was just allocated or could 3617 * potentially be a previous large page hblk so we need to 3618 * set the shadow bit. 
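 * (The shadow hblk summarizes the next larger mapping size: each bit
 * of hblk_shw_mask, set further down via 1 << vshift, stands for one
 * of the at most eight smaller-size sub-ranges it covers, so an
 * unload of a large, sparsely populated range can skip sub-ranges
 * that never grew any hblks of their own.)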
3619 */ 3620 ASSERT(!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt); 3621 hmeblkp->hblk_shw_bit = 1; 3622 } else if (hmeblkp->hblk_shw_bit == 0) { 3623 panic("sfmmu_shadow_hcreate: shw bit not set in hmeblkp 0x%p", 3624 (void *)hmeblkp); 3625 } 3626 ASSERT(hmeblkp->hblk_shw_bit == 1); 3627 ASSERT(!hmeblkp->hblk_shared); 3628 vshift = vaddr_to_vshift(hblktag, vaddr, size); 3629 ASSERT(vshift < 8); 3630 /* 3631 * Atomically set shw mask bit 3632 */ 3633 do { 3634 shw_mask = hmeblkp->hblk_shw_mask; 3635 newshw_mask = shw_mask | (1 << vshift); 3636 newshw_mask = cas32(&hmeblkp->hblk_shw_mask, shw_mask, 3637 newshw_mask); 3638 } while (newshw_mask != shw_mask); 3639 3640 SFMMU_HASH_UNLOCK(hmebp); 3641 3642 return (hmeblkp); 3643 } 3644 3645 /* 3646 * This routine cleanup a previous shadow hmeblk and changes it to 3647 * a regular hblk. This happens rarely but it is possible 3648 * when a process wants to use large pages and there are hblks still 3649 * lying around from the previous as that used these hmeblks. 3650 * The alternative was to cleanup the shadow hblks at unload time 3651 * but since so few user processes actually use large pages, it is 3652 * better to be lazy and cleanup at this time. 3653 */ 3654 static void 3655 sfmmu_shadow_hcleanup(sfmmu_t *sfmmup, struct hme_blk *hmeblkp, 3656 struct hmehash_bucket *hmebp) 3657 { 3658 caddr_t addr, endaddr; 3659 int hashno, size; 3660 3661 ASSERT(hmeblkp->hblk_shw_bit); 3662 ASSERT(!hmeblkp->hblk_shared); 3663 3664 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 3665 3666 if (!hmeblkp->hblk_shw_mask) { 3667 hmeblkp->hblk_shw_bit = 0; 3668 return; 3669 } 3670 addr = (caddr_t)get_hblk_base(hmeblkp); 3671 endaddr = get_hblk_endaddr(hmeblkp); 3672 size = get_hblk_ttesz(hmeblkp); 3673 hashno = size - 1; 3674 ASSERT(hashno > 0); 3675 SFMMU_HASH_UNLOCK(hmebp); 3676 3677 sfmmu_free_hblks(sfmmup, addr, endaddr, hashno); 3678 3679 SFMMU_HASH_LOCK(hmebp); 3680 } 3681 3682 static void 3683 sfmmu_free_hblks(sfmmu_t *sfmmup, caddr_t addr, caddr_t endaddr, 3684 int hashno) 3685 { 3686 int hmeshift, shadow = 0; 3687 hmeblk_tag hblktag; 3688 struct hmehash_bucket *hmebp; 3689 struct hme_blk *hmeblkp; 3690 struct hme_blk *nx_hblk, *pr_hblk, *list = NULL; 3691 3692 ASSERT(hashno > 0); 3693 hblktag.htag_id = sfmmup; 3694 hblktag.htag_rehash = hashno; 3695 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 3696 3697 hmeshift = HME_HASH_SHIFT(hashno); 3698 3699 while (addr < endaddr) { 3700 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 3701 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift); 3702 SFMMU_HASH_LOCK(hmebp); 3703 /* inline HME_HASH_SEARCH */ 3704 hmeblkp = hmebp->hmeblkp; 3705 pr_hblk = NULL; 3706 while (hmeblkp) { 3707 if (HTAGS_EQ(hmeblkp->hblk_tag, hblktag)) { 3708 /* found hme_blk */ 3709 ASSERT(!hmeblkp->hblk_shared); 3710 if (hmeblkp->hblk_shw_bit) { 3711 if (hmeblkp->hblk_shw_mask) { 3712 shadow = 1; 3713 sfmmu_shadow_hcleanup(sfmmup, 3714 hmeblkp, hmebp); 3715 break; 3716 } else { 3717 hmeblkp->hblk_shw_bit = 0; 3718 } 3719 } 3720 3721 /* 3722 * Hblk_hmecnt and hblk_vcnt could be non zero 3723 * since hblk_unload() does not gurantee that. 3724 * 3725 * XXX - this could cause tteload() to spin 3726 * where sfmmu_shadow_hcleanup() is called. 
3727 */ 3728 } 3729 3730 nx_hblk = hmeblkp->hblk_next; 3731 if (!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) { 3732 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, 3733 &list, 0); 3734 } else { 3735 pr_hblk = hmeblkp; 3736 } 3737 hmeblkp = nx_hblk; 3738 } 3739 3740 SFMMU_HASH_UNLOCK(hmebp); 3741 3742 if (shadow) { 3743 /* 3744 * We found another shadow hblk so cleaned its 3745 * children. We need to go back and cleanup 3746 * the original hblk so we don't change the 3747 * addr. 3748 */ 3749 shadow = 0; 3750 } else { 3751 addr = (caddr_t)roundup((uintptr_t)addr + 1, 3752 (1 << hmeshift)); 3753 } 3754 } 3755 sfmmu_hblks_list_purge(&list, 0); 3756 } 3757 3758 /* 3759 * This routine's job is to delete stale invalid shared hmeregions hmeblks that 3760 * may still linger on after pageunload. 3761 */ 3762 static void 3763 sfmmu_cleanup_rhblk(sf_srd_t *srdp, caddr_t addr, uint_t rid, int ttesz) 3764 { 3765 int hmeshift; 3766 hmeblk_tag hblktag; 3767 struct hmehash_bucket *hmebp; 3768 struct hme_blk *hmeblkp; 3769 struct hme_blk *pr_hblk; 3770 struct hme_blk *list = NULL; 3771 3772 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 3773 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 3774 3775 hmeshift = HME_HASH_SHIFT(ttesz); 3776 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 3777 hblktag.htag_rehash = ttesz; 3778 hblktag.htag_rid = rid; 3779 hblktag.htag_id = srdp; 3780 hmebp = HME_HASH_FUNCTION(srdp, addr, hmeshift); 3781 3782 SFMMU_HASH_LOCK(hmebp); 3783 HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, &list); 3784 if (hmeblkp != NULL) { 3785 ASSERT(hmeblkp->hblk_shared); 3786 ASSERT(!hmeblkp->hblk_shw_bit); 3787 if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) { 3788 panic("sfmmu_cleanup_rhblk: valid hmeblk"); 3789 } 3790 ASSERT(!hmeblkp->hblk_lckcnt); 3791 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, 3792 &list, 0); 3793 } 3794 SFMMU_HASH_UNLOCK(hmebp); 3795 sfmmu_hblks_list_purge(&list, 0); 3796 } 3797 3798 /* ARGSUSED */ 3799 static void 3800 sfmmu_rgn_cb_noop(caddr_t saddr, caddr_t eaddr, caddr_t r_saddr, 3801 size_t r_size, void *r_obj, u_offset_t r_objoff) 3802 { 3803 } 3804 3805 /* 3806 * Searches for an hmeblk which maps addr, then unloads this mapping 3807 * and updates *eaddrp, if the hmeblk is found. 
3808 */ 3809 static void 3810 sfmmu_unload_hmeregion_va(sf_srd_t *srdp, uint_t rid, caddr_t addr, 3811 caddr_t eaddr, int ttesz, caddr_t *eaddrp) 3812 { 3813 int hmeshift; 3814 hmeblk_tag hblktag; 3815 struct hmehash_bucket *hmebp; 3816 struct hme_blk *hmeblkp; 3817 struct hme_blk *pr_hblk; 3818 struct hme_blk *list = NULL; 3819 3820 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 3821 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 3822 ASSERT(ttesz >= HBLK_MIN_TTESZ); 3823 3824 hmeshift = HME_HASH_SHIFT(ttesz); 3825 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 3826 hblktag.htag_rehash = ttesz; 3827 hblktag.htag_rid = rid; 3828 hblktag.htag_id = srdp; 3829 hmebp = HME_HASH_FUNCTION(srdp, addr, hmeshift); 3830 3831 SFMMU_HASH_LOCK(hmebp); 3832 HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, &list); 3833 if (hmeblkp != NULL) { 3834 ASSERT(hmeblkp->hblk_shared); 3835 ASSERT(!hmeblkp->hblk_lckcnt); 3836 if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) { 3837 *eaddrp = sfmmu_hblk_unload(NULL, hmeblkp, addr, 3838 eaddr, NULL, HAT_UNLOAD); 3839 ASSERT(*eaddrp > addr); 3840 } 3841 ASSERT(!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt); 3842 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, 3843 &list, 0); 3844 } 3845 SFMMU_HASH_UNLOCK(hmebp); 3846 sfmmu_hblks_list_purge(&list, 0); 3847 } 3848 3849 static void 3850 sfmmu_unload_hmeregion(sf_srd_t *srdp, sf_region_t *rgnp) 3851 { 3852 int ttesz = rgnp->rgn_pgszc; 3853 size_t rsz = rgnp->rgn_size; 3854 caddr_t rsaddr = rgnp->rgn_saddr; 3855 caddr_t readdr = rsaddr + rsz; 3856 caddr_t rhsaddr; 3857 caddr_t va; 3858 uint_t rid = rgnp->rgn_id; 3859 caddr_t cbsaddr; 3860 caddr_t cbeaddr; 3861 hat_rgn_cb_func_t rcbfunc; 3862 ulong_t cnt; 3863 3864 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 3865 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 3866 3867 ASSERT(IS_P2ALIGNED(rsaddr, TTEBYTES(ttesz))); 3868 ASSERT(IS_P2ALIGNED(rsz, TTEBYTES(ttesz))); 3869 if (ttesz < HBLK_MIN_TTESZ) { 3870 ttesz = HBLK_MIN_TTESZ; 3871 rhsaddr = (caddr_t)P2ALIGN((uintptr_t)rsaddr, HBLK_MIN_BYTES); 3872 } else { 3873 rhsaddr = rsaddr; 3874 } 3875 3876 if ((rcbfunc = rgnp->rgn_cb_function) == NULL) { 3877 rcbfunc = sfmmu_rgn_cb_noop; 3878 } 3879 3880 while (ttesz >= HBLK_MIN_TTESZ) { 3881 cbsaddr = rsaddr; 3882 cbeaddr = rsaddr; 3883 if (!(rgnp->rgn_hmeflags & (1 << ttesz))) { 3884 ttesz--; 3885 continue; 3886 } 3887 cnt = 0; 3888 va = rsaddr; 3889 while (va < readdr) { 3890 ASSERT(va >= rhsaddr); 3891 if (va != cbeaddr) { 3892 if (cbeaddr != cbsaddr) { 3893 ASSERT(cbeaddr > cbsaddr); 3894 (*rcbfunc)(cbsaddr, cbeaddr, 3895 rsaddr, rsz, rgnp->rgn_obj, 3896 rgnp->rgn_objoff); 3897 } 3898 cbsaddr = va; 3899 cbeaddr = va; 3900 } 3901 sfmmu_unload_hmeregion_va(srdp, rid, va, readdr, 3902 ttesz, &cbeaddr); 3903 cnt++; 3904 va = rhsaddr + (cnt << TTE_PAGE_SHIFT(ttesz)); 3905 } 3906 if (cbeaddr != cbsaddr) { 3907 ASSERT(cbeaddr > cbsaddr); 3908 (*rcbfunc)(cbsaddr, cbeaddr, rsaddr, 3909 rsz, rgnp->rgn_obj, 3910 rgnp->rgn_objoff); 3911 } 3912 ttesz--; 3913 } 3914 } 3915 3916 /* 3917 * Release one hardware address translation lock on the given address range. 
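 * The translations in the range must previously have been locked, for
 * example by loading them with the HAT_LOAD_LOCK flag; len must be a
 * multiple of MMU_PAGESIZE, as the ASSERT below requires. A minimal
 * illustrative call (the address space and range shown are hypothetical,
 * not taken from this file) would be:
 *
 *	hat_unlock(as->a_hat, addr, len);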
3918 */ 3919 void 3920 hat_unlock(struct hat *sfmmup, caddr_t addr, size_t len) 3921 { 3922 struct hmehash_bucket *hmebp; 3923 hmeblk_tag hblktag; 3924 int hmeshift, hashno = 1; 3925 struct hme_blk *hmeblkp, *list = NULL; 3926 caddr_t endaddr; 3927 3928 ASSERT(sfmmup != NULL); 3929 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 3930 3931 ASSERT((sfmmup == ksfmmup) || 3932 AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 3933 ASSERT((len & MMU_PAGEOFFSET) == 0); 3934 endaddr = addr + len; 3935 hblktag.htag_id = sfmmup; 3936 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 3937 3938 /* 3939 * Spitfire supports 4 page sizes. 3940 * Most pages are expected to be of the smallest page size (8K) and 3941 * these will not need to be rehashed. 64K pages also don't need to be 3942 * rehashed because an hmeblk spans 64K of address space. 512K pages 3943 * might need 1 rehash and 4M pages might need 2 rehashes. 3944 */ 3945 while (addr < endaddr) { 3946 hmeshift = HME_HASH_SHIFT(hashno); 3947 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 3948 hblktag.htag_rehash = hashno; 3949 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift); 3950 3951 SFMMU_HASH_LOCK(hmebp); 3952 3953 HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list); 3954 if (hmeblkp != NULL) { 3955 ASSERT(!hmeblkp->hblk_shared); 3956 /* 3957 * If we encounter a shadow hmeblk then 3958 * we know there are no valid hmeblks mapping 3959 * this address at this size or larger. 3960 * Just increment address by the smallest 3961 * page size. 3962 */ 3963 if (hmeblkp->hblk_shw_bit) { 3964 addr += MMU_PAGESIZE; 3965 } else { 3966 addr = sfmmu_hblk_unlock(hmeblkp, addr, 3967 endaddr); 3968 } 3969 SFMMU_HASH_UNLOCK(hmebp); 3970 hashno = 1; 3971 continue; 3972 } 3973 SFMMU_HASH_UNLOCK(hmebp); 3974 3975 if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) { 3976 /* 3977 * We have traversed the whole list and rehashed 3978 * if necessary without finding the address to unlock, 3979 * which should never happen. 3980 */ 3981 panic("sfmmu_unlock: addr not found.
" 3982 "addr %p hat %p", (void *)addr, (void *)sfmmup); 3983 } else { 3984 hashno++; 3985 } 3986 } 3987 3988 sfmmu_hblks_list_purge(&list, 0); 3989 } 3990 3991 void 3992 hat_unlock_region(struct hat *sfmmup, caddr_t addr, size_t len, 3993 hat_region_cookie_t rcookie) 3994 { 3995 sf_srd_t *srdp; 3996 sf_region_t *rgnp; 3997 int ttesz; 3998 uint_t rid; 3999 caddr_t eaddr; 4000 caddr_t va; 4001 int hmeshift; 4002 hmeblk_tag hblktag; 4003 struct hmehash_bucket *hmebp; 4004 struct hme_blk *hmeblkp; 4005 struct hme_blk *pr_hblk; 4006 struct hme_blk *list; 4007 4008 if (rcookie == HAT_INVALID_REGION_COOKIE) { 4009 hat_unlock(sfmmup, addr, len); 4010 return; 4011 } 4012 4013 ASSERT(sfmmup != NULL); 4014 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 4015 ASSERT(sfmmup != ksfmmup); 4016 4017 srdp = sfmmup->sfmmu_srdp; 4018 rid = (uint_t)((uint64_t)rcookie); 4019 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 4020 eaddr = addr + len; 4021 va = addr; 4022 list = NULL; 4023 rgnp = srdp->srd_hmergnp[rid]; 4024 SFMMU_VALIDATE_HMERID(sfmmup, rid, addr, len); 4025 4026 ASSERT(IS_P2ALIGNED(addr, TTEBYTES(rgnp->rgn_pgszc))); 4027 ASSERT(IS_P2ALIGNED(len, TTEBYTES(rgnp->rgn_pgszc))); 4028 if (rgnp->rgn_pgszc < HBLK_MIN_TTESZ) { 4029 ttesz = HBLK_MIN_TTESZ; 4030 } else { 4031 ttesz = rgnp->rgn_pgszc; 4032 } 4033 while (va < eaddr) { 4034 while (ttesz < rgnp->rgn_pgszc && 4035 IS_P2ALIGNED(va, TTEBYTES(ttesz + 1))) { 4036 ttesz++; 4037 } 4038 while (ttesz >= HBLK_MIN_TTESZ) { 4039 if (!(rgnp->rgn_hmeflags & (1 << ttesz))) { 4040 ttesz--; 4041 continue; 4042 } 4043 hmeshift = HME_HASH_SHIFT(ttesz); 4044 hblktag.htag_bspage = HME_HASH_BSPAGE(va, hmeshift); 4045 hblktag.htag_rehash = ttesz; 4046 hblktag.htag_rid = rid; 4047 hblktag.htag_id = srdp; 4048 hmebp = HME_HASH_FUNCTION(srdp, va, hmeshift); 4049 SFMMU_HASH_LOCK(hmebp); 4050 HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, 4051 &list); 4052 if (hmeblkp == NULL) { 4053 SFMMU_HASH_UNLOCK(hmebp); 4054 ttesz--; 4055 continue; 4056 } 4057 ASSERT(hmeblkp->hblk_shared); 4058 va = sfmmu_hblk_unlock(hmeblkp, va, eaddr); 4059 ASSERT(va >= eaddr || 4060 IS_P2ALIGNED((uintptr_t)va, TTEBYTES(ttesz))); 4061 SFMMU_HASH_UNLOCK(hmebp); 4062 break; 4063 } 4064 if (ttesz < HBLK_MIN_TTESZ) { 4065 panic("hat_unlock_region: addr not found " 4066 "addr %p hat %p", (void *)va, (void *)sfmmup); 4067 } 4068 } 4069 sfmmu_hblks_list_purge(&list, 0); 4070 } 4071 4072 /* 4073 * Function to unlock a range of addresses in an hmeblk. It returns the 4074 * next address that needs to be unlocked. 4075 * Should be called with the hash lock held. 
4076 */ 4077 static caddr_t 4078 sfmmu_hblk_unlock(struct hme_blk *hmeblkp, caddr_t addr, caddr_t endaddr) 4079 { 4080 struct sf_hment *sfhme; 4081 tte_t tteold, ttemod; 4082 int ttesz, ret; 4083 4084 ASSERT(in_hblk_range(hmeblkp, addr)); 4085 ASSERT(hmeblkp->hblk_shw_bit == 0); 4086 4087 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp)); 4088 ttesz = get_hblk_ttesz(hmeblkp); 4089 4090 HBLKTOHME(sfhme, hmeblkp, addr); 4091 while (addr < endaddr) { 4092 readtte: 4093 sfmmu_copytte(&sfhme->hme_tte, &tteold); 4094 if (TTE_IS_VALID(&tteold)) { 4095 4096 ttemod = tteold; 4097 4098 ret = sfmmu_modifytte_try(&tteold, &ttemod, 4099 &sfhme->hme_tte); 4100 4101 if (ret < 0) 4102 goto readtte; 4103 4104 if (hmeblkp->hblk_lckcnt == 0) 4105 panic("zero hblk lckcnt"); 4106 4107 if (((uintptr_t)addr + TTEBYTES(ttesz)) > 4108 (uintptr_t)endaddr) 4109 panic("can't unlock large tte"); 4110 4111 ASSERT(hmeblkp->hblk_lckcnt > 0); 4112 atomic_add_32(&hmeblkp->hblk_lckcnt, -1); 4113 HBLK_STACK_TRACE(hmeblkp, HBLK_UNLOCK); 4114 } else { 4115 panic("sfmmu_hblk_unlock: invalid tte"); 4116 } 4117 addr += TTEBYTES(ttesz); 4118 sfhme++; 4119 } 4120 return (addr); 4121 } 4122 4123 /* 4124 * Physical Address Mapping Framework 4125 * 4126 * General rules: 4127 * 4128 * (1) Applies only to seg_kmem memory pages. To make things easier, 4129 * seg_kpm addresses are also accepted by the routines, but nothing 4130 * is done with them since by definition their PA mappings are static. 4131 * (2) hat_add_callback() may only be called while holding the page lock 4132 * SE_SHARED or SE_EXCL of the underlying page (e.g., as_pagelock()), 4133 * or passing HAC_PAGELOCK flag. 4134 * (3) prehandler() and posthandler() may not call hat_add_callback() or 4135 * hat_delete_callback(), nor should they allocate memory. Post quiesce 4136 * callbacks may not sleep or acquire adaptive mutex locks. 4137 * (4) Either prehandler() or posthandler() (but not both) may be specified 4138 * as being NULL. Specifying an errhandler() is optional. 4139 * 4140 * Details of using the framework: 4141 * 4142 * registering a callback (hat_register_callback()) 4143 * 4144 * Pass prehandler, posthandler, errhandler addresses 4145 * as described below. If capture_cpus argument is nonzero, 4146 * suspend callback to the prehandler will occur with CPUs 4147 * captured and executing xc_loop() and CPUs will remain 4148 * captured until after the posthandler suspend callback 4149 * occurs. 4150 * 4151 * adding a callback (hat_add_callback()) 4152 * 4153 * as_pagelock(); 4154 * hat_add_callback(); 4155 * save returned pfn in private data structures or program registers; 4156 * as_pageunlock(); 4157 * 4158 * prehandler() 4159 * 4160 * Stop all accesses by physical address to this memory page. 4161 * Called twice: the first, PRESUSPEND, is a context safe to acquire 4162 * adaptive locks. The second, SUSPEND, is called at high PIL with 4163 * CPUs captured so adaptive locks may NOT be acquired (and all spin 4164 * locks must be XCALL_PIL or higher locks). 4165 * 4166 * May return the following errors: 4167 * EIO: A fatal error has occurred. This will result in panic. 4168 * EAGAIN: The page cannot be suspended. This will fail the 4169 * relocation. 4170 * 0: Success. 4171 * 4172 * posthandler() 4173 * 4174 * Save new pfn in private data structures or program registers; 4175 * not allowed to fail (non-zero return values will result in panic). 4176 * 4177 * errhandler() 4178 * 4179 * called when an error occurs related to the callback. 
Currently 4180 * the only such error is HAT_CB_ERR_LEAKED which indicates that 4181 * a page is being freed, but there are still outstanding callback(s) 4182 * registered on the page. 4183 * 4184 * removing a callback (hat_delete_callback(); e.g., prior to freeing memory) 4185 * 4186 * stop using physical address 4187 * hat_delete_callback(); 4188 * 4189 */ 4190 4191 /* 4192 * Register a callback class. Each subsystem should do this once and 4193 * cache the id_t returned for use in setting up and tearing down callbacks. 4194 * 4195 * There is no facility for removing callback IDs once they are created; 4196 * the "key" should be unique for each module, so in case a module is unloaded 4197 * and subsequently re-loaded, we can recycle the module's previous entry. 4198 */ 4199 id_t 4200 hat_register_callback(int key, 4201 int (*prehandler)(caddr_t, uint_t, uint_t, void *), 4202 int (*posthandler)(caddr_t, uint_t, uint_t, void *, pfn_t), 4203 int (*errhandler)(caddr_t, uint_t, uint_t, void *), 4204 int capture_cpus) 4205 { 4206 id_t id; 4207 4208 /* 4209 * Search the table for a pre-existing callback associated with 4210 * the identifier "key". If one exists, we re-use that entry in 4211 * the table for this instance, otherwise we assign the next 4212 * available table slot. 4213 */ 4214 for (id = 0; id < sfmmu_max_cb_id; id++) { 4215 if (sfmmu_cb_table[id].key == key) 4216 break; 4217 } 4218 4219 if (id == sfmmu_max_cb_id) { 4220 id = sfmmu_cb_nextid++; 4221 if (id >= sfmmu_max_cb_id) 4222 panic("hat_register_callback: out of callback IDs"); 4223 } 4224 4225 ASSERT(prehandler != NULL || posthandler != NULL); 4226 4227 sfmmu_cb_table[id].key = key; 4228 sfmmu_cb_table[id].prehandler = prehandler; 4229 sfmmu_cb_table[id].posthandler = posthandler; 4230 sfmmu_cb_table[id].errhandler = errhandler; 4231 sfmmu_cb_table[id].capture_cpus = capture_cpus; 4232 4233 return (id); 4234 } 4235 4236 #define HAC_COOKIE_NONE (void *)-1 4237 4238 /* 4239 * Add relocation callbacks to the specified addr/len which will be called 4240 * when relocating the associated page. See the description of pre and 4241 * posthandler above for more details. 4242 * 4243 * If HAC_PAGELOCK is included in flags, the underlying memory page is 4244 * locked internally so the caller must be able to deal with the callback 4245 * running even before this function has returned. If HAC_PAGELOCK is not 4246 * set, it is assumed that the underlying memory pages are locked. 4247 * 4248 * Since the caller must track the individual page boundaries anyway, 4249 * we only allow a callback to be added to a single page (large 4250 * or small). Thus [addr, addr + len) MUST be contained within a single 4251 * page. 4252 * 4253 * Registering multiple callbacks on the same [addr, addr+len) is supported, 4254 * _provided_that_ a unique parameter is specified for each callback. 4255 * If multiple callbacks are registered on the same range the callback will 4256 * be invoked with each unique parameter. Registering the same callback with 4257 * the same argument more than once will result in corrupted kernel state. 4258 * 4259 * Returns the pfn of the underlying kernel page in *rpfn 4260 * on success, or PFN_INVALID on failure. 4261 * 4262 * cookiep (if passed) provides storage space for an opaque cookie 4263 * to return later to hat_delete_callback(). This cookie makes the callback 4264 * deletion significantly quicker by avoiding a potentially lengthy hash 4265 * search. 
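 *
 * As an illustrative sketch only (MYDRV_KEY, the handlers and the private
 * data below are hypothetical and not defined in this file), a subsystem
 * would typically pair this routine with hat_register_callback() and
 * hat_delete_callback() roughly as follows:
 *
 *	id = hat_register_callback(MYDRV_KEY, mydrv_presuspend,
 *	    mydrv_postsuspend, mydrv_error, 1);
 *	...
 *	error = hat_add_callback(id, va, len, HAC_SLEEP | HAC_PAGELOCK,
 *	    mydrv_pvt, &pfn, &cookie);
 *	... on success, use pfn for physical-address access ...
 *	hat_delete_callback(va, len, mydrv_pvt, HAC_PAGELOCK, cookie);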
4266 * 4267 * Returns values: 4268 * 0: success 4269 * ENOMEM: memory allocation failure (e.g. flags was passed as HAC_NOSLEEP) 4270 * EINVAL: callback ID is not valid 4271 * ENXIO: ["vaddr", "vaddr" + len) is not mapped in the kernel's address 4272 * space 4273 * ERANGE: ["vaddr", "vaddr" + len) crosses a page boundary 4274 */ 4275 int 4276 hat_add_callback(id_t callback_id, caddr_t vaddr, uint_t len, uint_t flags, 4277 void *pvt, pfn_t *rpfn, void **cookiep) 4278 { 4279 struct hmehash_bucket *hmebp; 4280 hmeblk_tag hblktag; 4281 struct hme_blk *hmeblkp; 4282 int hmeshift, hashno; 4283 caddr_t saddr, eaddr, baseaddr; 4284 struct pa_hment *pahmep; 4285 struct sf_hment *sfhmep, *osfhmep; 4286 kmutex_t *pml; 4287 tte_t tte; 4288 page_t *pp; 4289 vnode_t *vp; 4290 u_offset_t off; 4291 pfn_t pfn; 4292 int kmflags = (flags & HAC_SLEEP)? KM_SLEEP : KM_NOSLEEP; 4293 int locked = 0; 4294 4295 /* 4296 * For KPM mappings, just return the physical address since we 4297 * don't need to register any callbacks. 4298 */ 4299 if (IS_KPM_ADDR(vaddr)) { 4300 uint64_t paddr; 4301 SFMMU_KPM_VTOP(vaddr, paddr); 4302 *rpfn = btop(paddr); 4303 if (cookiep != NULL) 4304 *cookiep = HAC_COOKIE_NONE; 4305 return (0); 4306 } 4307 4308 if (callback_id < (id_t)0 || callback_id >= sfmmu_cb_nextid) { 4309 *rpfn = PFN_INVALID; 4310 return (EINVAL); 4311 } 4312 4313 if ((pahmep = kmem_cache_alloc(pa_hment_cache, kmflags)) == NULL) { 4314 *rpfn = PFN_INVALID; 4315 return (ENOMEM); 4316 } 4317 4318 sfhmep = &pahmep->sfment; 4319 4320 saddr = (caddr_t)((uintptr_t)vaddr & MMU_PAGEMASK); 4321 eaddr = saddr + len; 4322 4323 rehash: 4324 /* Find the mapping(s) for this page */ 4325 for (hashno = TTE64K, hmeblkp = NULL; 4326 hmeblkp == NULL && hashno <= mmu_hashcnt; 4327 hashno++) { 4328 hmeshift = HME_HASH_SHIFT(hashno); 4329 hblktag.htag_id = ksfmmup; 4330 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 4331 hblktag.htag_bspage = HME_HASH_BSPAGE(saddr, hmeshift); 4332 hblktag.htag_rehash = hashno; 4333 hmebp = HME_HASH_FUNCTION(ksfmmup, saddr, hmeshift); 4334 4335 SFMMU_HASH_LOCK(hmebp); 4336 4337 HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp); 4338 4339 if (hmeblkp == NULL) 4340 SFMMU_HASH_UNLOCK(hmebp); 4341 } 4342 4343 if (hmeblkp == NULL) { 4344 kmem_cache_free(pa_hment_cache, pahmep); 4345 *rpfn = PFN_INVALID; 4346 return (ENXIO); 4347 } 4348 4349 ASSERT(!hmeblkp->hblk_shared); 4350 4351 HBLKTOHME(osfhmep, hmeblkp, saddr); 4352 sfmmu_copytte(&osfhmep->hme_tte, &tte); 4353 4354 if (!TTE_IS_VALID(&tte)) { 4355 SFMMU_HASH_UNLOCK(hmebp); 4356 kmem_cache_free(pa_hment_cache, pahmep); 4357 *rpfn = PFN_INVALID; 4358 return (ENXIO); 4359 } 4360 4361 /* 4362 * Make sure the boundaries for the callback fall within this 4363 * single mapping. 4364 */ 4365 baseaddr = (caddr_t)get_hblk_base(hmeblkp); 4366 ASSERT(saddr >= baseaddr); 4367 if (eaddr > saddr + TTEBYTES(TTE_CSZ(&tte))) { 4368 SFMMU_HASH_UNLOCK(hmebp); 4369 kmem_cache_free(pa_hment_cache, pahmep); 4370 *rpfn = PFN_INVALID; 4371 return (ERANGE); 4372 } 4373 4374 pfn = sfmmu_ttetopfn(&tte, vaddr); 4375 4376 /* 4377 * The pfn may not have a page_t underneath in which case we 4378 * just return it. This can happen if we are doing I/O to a 4379 * static portion of the kernel's address space, for instance. 
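 * In that case no pa_hment is installed; the caller simply gets the pfn
 * back together with a cookie of HAC_COOKIE_NONE, which turns a later
 * hat_delete_callback() into a no-op.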
4380 */ 4381 pp = osfhmep->hme_page; 4382 if (pp == NULL) { 4383 SFMMU_HASH_UNLOCK(hmebp); 4384 kmem_cache_free(pa_hment_cache, pahmep); 4385 *rpfn = pfn; 4386 if (cookiep) 4387 *cookiep = HAC_COOKIE_NONE; 4388 return (0); 4389 } 4390 ASSERT(pp == PP_PAGEROOT(pp)); 4391 4392 vp = pp->p_vnode; 4393 off = pp->p_offset; 4394 4395 pml = sfmmu_mlist_enter(pp); 4396 4397 if (flags & HAC_PAGELOCK) { 4398 if (!page_trylock(pp, SE_SHARED)) { 4399 /* 4400 * Somebody is holding SE_EXCL lock. Might 4401 * even be hat_page_relocate(). Drop all 4402 * our locks, lookup the page in &kvp, and 4403 * retry. If it doesn't exist in &kvp and &zvp, 4404 * then we must be dealing with a kernel mapped 4405 * page which doesn't actually belong to 4406 * segkmem so we punt. 4407 */ 4408 sfmmu_mlist_exit(pml); 4409 SFMMU_HASH_UNLOCK(hmebp); 4410 pp = page_lookup(&kvp, (u_offset_t)saddr, SE_SHARED); 4411 4412 /* check zvp before giving up */ 4413 if (pp == NULL) 4414 pp = page_lookup(&zvp, (u_offset_t)saddr, 4415 SE_SHARED); 4416 4417 /* Okay, we didn't find it, give up */ 4418 if (pp == NULL) { 4419 kmem_cache_free(pa_hment_cache, pahmep); 4420 *rpfn = pfn; 4421 if (cookiep) 4422 *cookiep = HAC_COOKIE_NONE; 4423 return (0); 4424 } 4425 page_unlock(pp); 4426 goto rehash; 4427 } 4428 locked = 1; 4429 } 4430 4431 if (!PAGE_LOCKED(pp) && !panicstr) 4432 panic("hat_add_callback: page 0x%p not locked", (void *)pp); 4433 4434 if (osfhmep->hme_page != pp || pp->p_vnode != vp || 4435 pp->p_offset != off) { 4436 /* 4437 * The page moved before we got our hands on it. Drop 4438 * all the locks and try again. 4439 */ 4440 ASSERT((flags & HAC_PAGELOCK) != 0); 4441 sfmmu_mlist_exit(pml); 4442 SFMMU_HASH_UNLOCK(hmebp); 4443 page_unlock(pp); 4444 locked = 0; 4445 goto rehash; 4446 } 4447 4448 if (!VN_ISKAS(vp)) { 4449 /* 4450 * This is not a segkmem page but another page which 4451 * has been kernel mapped. It had better have at least 4452 * a share lock on it. Return the pfn. 4453 */ 4454 sfmmu_mlist_exit(pml); 4455 SFMMU_HASH_UNLOCK(hmebp); 4456 if (locked) 4457 page_unlock(pp); 4458 kmem_cache_free(pa_hment_cache, pahmep); 4459 ASSERT(PAGE_LOCKED(pp)); 4460 *rpfn = pfn; 4461 if (cookiep) 4462 *cookiep = HAC_COOKIE_NONE; 4463 return (0); 4464 } 4465 4466 /* 4467 * Setup this pa_hment and link its embedded dummy sf_hment into 4468 * the mapping list. 4469 */ 4470 pp->p_share++; 4471 pahmep->cb_id = callback_id; 4472 pahmep->addr = vaddr; 4473 pahmep->len = len; 4474 pahmep->refcnt = 1; 4475 pahmep->flags = 0; 4476 pahmep->pvt = pvt; 4477 4478 sfhmep->hme_tte.ll = 0; 4479 sfhmep->hme_data = pahmep; 4480 sfhmep->hme_prev = osfhmep; 4481 sfhmep->hme_next = osfhmep->hme_next; 4482 4483 if (osfhmep->hme_next) 4484 osfhmep->hme_next->hme_prev = sfhmep; 4485 4486 osfhmep->hme_next = sfhmep; 4487 4488 sfmmu_mlist_exit(pml); 4489 SFMMU_HASH_UNLOCK(hmebp); 4490 4491 if (locked) 4492 page_unlock(pp); 4493 4494 *rpfn = pfn; 4495 if (cookiep) 4496 *cookiep = (void *)pahmep; 4497 4498 return (0); 4499 } 4500 4501 /* 4502 * Remove the relocation callbacks from the specified addr/len. 
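 * The cookie argument should normally be the value returned through
 * cookiep by hat_add_callback(); a NULL cookie is also accepted and falls
 * back to searching the page's mapping list for the pa_hment whose vaddr,
 * len and pvt all match.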
4503 */ 4504 void 4505 hat_delete_callback(caddr_t vaddr, uint_t len, void *pvt, uint_t flags, 4506 void *cookie) 4507 { 4508 struct hmehash_bucket *hmebp; 4509 hmeblk_tag hblktag; 4510 struct hme_blk *hmeblkp; 4511 int hmeshift, hashno; 4512 caddr_t saddr; 4513 struct pa_hment *pahmep; 4514 struct sf_hment *sfhmep, *osfhmep; 4515 kmutex_t *pml; 4516 tte_t tte; 4517 page_t *pp; 4518 vnode_t *vp; 4519 u_offset_t off; 4520 int locked = 0; 4521 4522 /* 4523 * If the cookie is HAC_COOKIE_NONE then there is no pa_hment to 4524 * remove so just return. 4525 */ 4526 if (cookie == HAC_COOKIE_NONE || IS_KPM_ADDR(vaddr)) 4527 return; 4528 4529 saddr = (caddr_t)((uintptr_t)vaddr & MMU_PAGEMASK); 4530 4531 rehash: 4532 /* Find the mapping(s) for this page */ 4533 for (hashno = TTE64K, hmeblkp = NULL; 4534 hmeblkp == NULL && hashno <= mmu_hashcnt; 4535 hashno++) { 4536 hmeshift = HME_HASH_SHIFT(hashno); 4537 hblktag.htag_id = ksfmmup; 4538 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 4539 hblktag.htag_bspage = HME_HASH_BSPAGE(saddr, hmeshift); 4540 hblktag.htag_rehash = hashno; 4541 hmebp = HME_HASH_FUNCTION(ksfmmup, saddr, hmeshift); 4542 4543 SFMMU_HASH_LOCK(hmebp); 4544 4545 HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp); 4546 4547 if (hmeblkp == NULL) 4548 SFMMU_HASH_UNLOCK(hmebp); 4549 } 4550 4551 if (hmeblkp == NULL) 4552 return; 4553 4554 ASSERT(!hmeblkp->hblk_shared); 4555 4556 HBLKTOHME(osfhmep, hmeblkp, saddr); 4557 4558 sfmmu_copytte(&osfhmep->hme_tte, &tte); 4559 if (!TTE_IS_VALID(&tte)) { 4560 SFMMU_HASH_UNLOCK(hmebp); 4561 return; 4562 } 4563 4564 pp = osfhmep->hme_page; 4565 if (pp == NULL) { 4566 SFMMU_HASH_UNLOCK(hmebp); 4567 ASSERT(cookie == NULL); 4568 return; 4569 } 4570 4571 vp = pp->p_vnode; 4572 off = pp->p_offset; 4573 4574 pml = sfmmu_mlist_enter(pp); 4575 4576 if (flags & HAC_PAGELOCK) { 4577 if (!page_trylock(pp, SE_SHARED)) { 4578 /* 4579 * Somebody is holding SE_EXCL lock. Might 4580 * even be hat_page_relocate(). Drop all 4581 * our locks, lookup the page in &kvp, and 4582 * retry. If it doesn't exist in &kvp and &zvp, 4583 * then we must be dealing with a kernel mapped 4584 * page which doesn't actually belong to 4585 * segkmem so we punt. 4586 */ 4587 sfmmu_mlist_exit(pml); 4588 SFMMU_HASH_UNLOCK(hmebp); 4589 pp = page_lookup(&kvp, (u_offset_t)saddr, SE_SHARED); 4590 /* check zvp before giving up */ 4591 if (pp == NULL) 4592 pp = page_lookup(&zvp, (u_offset_t)saddr, 4593 SE_SHARED); 4594 4595 if (pp == NULL) { 4596 ASSERT(cookie == NULL); 4597 return; 4598 } 4599 page_unlock(pp); 4600 goto rehash; 4601 } 4602 locked = 1; 4603 } 4604 4605 ASSERT(PAGE_LOCKED(pp)); 4606 4607 if (osfhmep->hme_page != pp || pp->p_vnode != vp || 4608 pp->p_offset != off) { 4609 /* 4610 * The page moved before we got our hands on it. Drop 4611 * all the locks and try again. 4612 */ 4613 ASSERT((flags & HAC_PAGELOCK) != 0); 4614 sfmmu_mlist_exit(pml); 4615 SFMMU_HASH_UNLOCK(hmebp); 4616 page_unlock(pp); 4617 locked = 0; 4618 goto rehash; 4619 } 4620 4621 if (!VN_ISKAS(vp)) { 4622 /* 4623 * This is not a segkmem page but another page which 4624 * has been kernel mapped. 
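 * In that case hat_add_callback() never installed a pa_hment for the
 * page (it handed back HAC_COOKIE_NONE), so there is nothing to remove.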
4625 */ 4626 sfmmu_mlist_exit(pml); 4627 SFMMU_HASH_UNLOCK(hmebp); 4628 if (locked) 4629 page_unlock(pp); 4630 ASSERT(cookie == NULL); 4631 return; 4632 } 4633 4634 if (cookie != NULL) { 4635 pahmep = (struct pa_hment *)cookie; 4636 sfhmep = &pahmep->sfment; 4637 } else { 4638 for (sfhmep = pp->p_mapping; sfhmep != NULL; 4639 sfhmep = sfhmep->hme_next) { 4640 4641 /* 4642 * skip va<->pa mappings 4643 */ 4644 if (!IS_PAHME(sfhmep)) 4645 continue; 4646 4647 pahmep = sfhmep->hme_data; 4648 ASSERT(pahmep != NULL); 4649 4650 /* 4651 * if pa_hment matches, remove it 4652 */ 4653 if ((pahmep->pvt == pvt) && 4654 (pahmep->addr == vaddr) && 4655 (pahmep->len == len)) { 4656 break; 4657 } 4658 } 4659 } 4660 4661 if (sfhmep == NULL) { 4662 if (!panicstr) { 4663 panic("hat_delete_callback: pa_hment not found, pp %p", 4664 (void *)pp); 4665 } 4666 return; 4667 } 4668 4669 /* 4670 * Note: at this point a valid kernel mapping must still be 4671 * present on this page. 4672 */ 4673 pp->p_share--; 4674 if (pp->p_share <= 0) 4675 panic("hat_delete_callback: zero p_share"); 4676 4677 if (--pahmep->refcnt == 0) { 4678 if (pahmep->flags != 0) 4679 panic("hat_delete_callback: pa_hment is busy"); 4680 4681 /* 4682 * Remove sfhmep from the mapping list for the page. 4683 */ 4684 if (sfhmep->hme_prev) { 4685 sfhmep->hme_prev->hme_next = sfhmep->hme_next; 4686 } else { 4687 pp->p_mapping = sfhmep->hme_next; 4688 } 4689 4690 if (sfhmep->hme_next) 4691 sfhmep->hme_next->hme_prev = sfhmep->hme_prev; 4692 4693 sfmmu_mlist_exit(pml); 4694 SFMMU_HASH_UNLOCK(hmebp); 4695 4696 if (locked) 4697 page_unlock(pp); 4698 4699 kmem_cache_free(pa_hment_cache, pahmep); 4700 return; 4701 } 4702 4703 sfmmu_mlist_exit(pml); 4704 SFMMU_HASH_UNLOCK(hmebp); 4705 if (locked) 4706 page_unlock(pp); 4707 } 4708 4709 /* 4710 * hat_probe returns 1 if the translation for the address 'addr' is 4711 * loaded, zero otherwise. 4712 * 4713 * hat_probe should be used only for advisorary purposes because it may 4714 * occasionally return the wrong value. The implementation must guarantee that 4715 * returning the wrong value is a very rare event. hat_probe is used 4716 * to implement optimizations in the segment drivers. 
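 *
 * A minimal illustrative use (the caller shown is hypothetical, not taken
 * from this file):
 *
 *	if (hat_probe(as->a_hat, addr) == 0) {
 *		... no translation is believed to be loaded, so take
 *		    the slower fault or preload path ...
 *	}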
4717 * 4718 */ 4719 int 4720 hat_probe(struct hat *sfmmup, caddr_t addr) 4721 { 4722 pfn_t pfn; 4723 tte_t tte; 4724 4725 ASSERT(sfmmup != NULL); 4726 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 4727 4728 ASSERT((sfmmup == ksfmmup) || 4729 AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 4730 4731 if (sfmmup == ksfmmup) { 4732 while ((pfn = sfmmu_vatopfn(addr, sfmmup, &tte)) 4733 == PFN_SUSPENDED) { 4734 sfmmu_vatopfn_suspended(addr, sfmmup, &tte); 4735 } 4736 } else { 4737 pfn = sfmmu_uvatopfn(addr, sfmmup, NULL); 4738 } 4739 4740 if (pfn != PFN_INVALID) 4741 return (1); 4742 else 4743 return (0); 4744 } 4745 4746 ssize_t 4747 hat_getpagesize(struct hat *sfmmup, caddr_t addr) 4748 { 4749 tte_t tte; 4750 4751 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 4752 4753 if (sfmmup == ksfmmup) { 4754 if (sfmmu_vatopfn(addr, sfmmup, &tte) == PFN_INVALID) { 4755 return (-1); 4756 } 4757 } else { 4758 if (sfmmu_uvatopfn(addr, sfmmup, &tte) == PFN_INVALID) { 4759 return (-1); 4760 } 4761 } 4762 4763 ASSERT(TTE_IS_VALID(&tte)); 4764 return (TTEBYTES(TTE_CSZ(&tte))); 4765 } 4766 4767 uint_t 4768 hat_getattr(struct hat *sfmmup, caddr_t addr, uint_t *attr) 4769 { 4770 tte_t tte; 4771 4772 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 4773 4774 if (sfmmup == ksfmmup) { 4775 if (sfmmu_vatopfn(addr, sfmmup, &tte) == PFN_INVALID) { 4776 tte.ll = 0; 4777 } 4778 } else { 4779 if (sfmmu_uvatopfn(addr, sfmmup, &tte) == PFN_INVALID) { 4780 tte.ll = 0; 4781 } 4782 } 4783 if (TTE_IS_VALID(&tte)) { 4784 *attr = sfmmu_ptov_attr(&tte); 4785 return (0); 4786 } 4787 *attr = 0; 4788 return ((uint_t)0xffffffff); 4789 } 4790 4791 /* 4792 * Enables more attributes on specified address range (ie. logical OR) 4793 */ 4794 void 4795 hat_setattr(struct hat *hat, caddr_t addr, size_t len, uint_t attr) 4796 { 4797 if (hat->sfmmu_xhat_provider) { 4798 XHAT_SETATTR(hat, addr, len, attr); 4799 return; 4800 } else { 4801 /* 4802 * This must be a CPU HAT. If the address space has 4803 * XHATs attached, change attributes for all of them, 4804 * just in case 4805 */ 4806 ASSERT(hat->sfmmu_as != NULL); 4807 if (hat->sfmmu_as->a_xhat != NULL) 4808 xhat_setattr_all(hat->sfmmu_as, addr, len, attr); 4809 } 4810 4811 sfmmu_chgattr(hat, addr, len, attr, SFMMU_SETATTR); 4812 } 4813 4814 /* 4815 * Assigns attributes to the specified address range. All the attributes 4816 * are specified. 4817 */ 4818 void 4819 hat_chgattr(struct hat *hat, caddr_t addr, size_t len, uint_t attr) 4820 { 4821 if (hat->sfmmu_xhat_provider) { 4822 XHAT_CHGATTR(hat, addr, len, attr); 4823 return; 4824 } else { 4825 /* 4826 * This must be a CPU HAT. If the address space has 4827 * XHATs attached, change attributes for all of them, 4828 * just in case 4829 */ 4830 ASSERT(hat->sfmmu_as != NULL); 4831 if (hat->sfmmu_as->a_xhat != NULL) 4832 xhat_chgattr_all(hat->sfmmu_as, addr, len, attr); 4833 } 4834 4835 sfmmu_chgattr(hat, addr, len, attr, SFMMU_CHGATTR); 4836 } 4837 4838 /* 4839 * Remove attributes on the specified address range (ie. loginal NAND) 4840 */ 4841 void 4842 hat_clrattr(struct hat *hat, caddr_t addr, size_t len, uint_t attr) 4843 { 4844 if (hat->sfmmu_xhat_provider) { 4845 XHAT_CLRATTR(hat, addr, len, attr); 4846 return; 4847 } else { 4848 /* 4849 * This must be a CPU HAT. 
If the address space has 4850 * XHATs attached, change attributes for all of them, 4851 * just in case 4852 */ 4853 ASSERT(hat->sfmmu_as != NULL); 4854 if (hat->sfmmu_as->a_xhat != NULL) 4855 xhat_clrattr_all(hat->sfmmu_as, addr, len, attr); 4856 } 4857 4858 sfmmu_chgattr(hat, addr, len, attr, SFMMU_CLRATTR); 4859 } 4860 4861 /* 4862 * Change attributes on an address range to that specified by attr and mode. 4863 */ 4864 static void 4865 sfmmu_chgattr(struct hat *sfmmup, caddr_t addr, size_t len, uint_t attr, 4866 int mode) 4867 { 4868 struct hmehash_bucket *hmebp; 4869 hmeblk_tag hblktag; 4870 int hmeshift, hashno = 1; 4871 struct hme_blk *hmeblkp, *list = NULL; 4872 caddr_t endaddr; 4873 cpuset_t cpuset; 4874 demap_range_t dmr; 4875 4876 CPUSET_ZERO(cpuset); 4877 4878 ASSERT((sfmmup == ksfmmup) || 4879 AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 4880 ASSERT((len & MMU_PAGEOFFSET) == 0); 4881 ASSERT(((uintptr_t)addr & MMU_PAGEOFFSET) == 0); 4882 4883 if ((attr & PROT_USER) && (mode != SFMMU_CLRATTR) && 4884 ((addr + len) > (caddr_t)USERLIMIT)) { 4885 panic("user addr %p in kernel space", 4886 (void *)addr); 4887 } 4888 4889 endaddr = addr + len; 4890 hblktag.htag_id = sfmmup; 4891 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 4892 DEMAP_RANGE_INIT(sfmmup, &dmr); 4893 4894 while (addr < endaddr) { 4895 hmeshift = HME_HASH_SHIFT(hashno); 4896 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 4897 hblktag.htag_rehash = hashno; 4898 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift); 4899 4900 SFMMU_HASH_LOCK(hmebp); 4901 4902 HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list); 4903 if (hmeblkp != NULL) { 4904 ASSERT(!hmeblkp->hblk_shared); 4905 /* 4906 * We've encountered a shadow hmeblk so skip the range 4907 * of the next smaller mapping size. 4908 */ 4909 if (hmeblkp->hblk_shw_bit) { 4910 ASSERT(sfmmup != ksfmmup); 4911 ASSERT(hashno > 1); 4912 addr = (caddr_t)P2END((uintptr_t)addr, 4913 TTEBYTES(hashno - 1)); 4914 } else { 4915 addr = sfmmu_hblk_chgattr(sfmmup, 4916 hmeblkp, addr, endaddr, &dmr, attr, mode); 4917 } 4918 SFMMU_HASH_UNLOCK(hmebp); 4919 hashno = 1; 4920 continue; 4921 } 4922 SFMMU_HASH_UNLOCK(hmebp); 4923 4924 if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) { 4925 /* 4926 * We have traversed the whole list and rehashed 4927 * if necessary without finding the address to chgattr. 4928 * This is ok, so we increment the address by the 4929 * smallest hmeblk range for kernel mappings or for 4930 * user mappings with no large pages, and the largest 4931 * hmeblk range, to account for shadow hmeblks, for 4932 * user mappings with large pages and continue. 4933 */ 4934 if (sfmmup == ksfmmup) 4935 addr = (caddr_t)P2END((uintptr_t)addr, 4936 TTEBYTES(1)); 4937 else 4938 addr = (caddr_t)P2END((uintptr_t)addr, 4939 TTEBYTES(hashno)); 4940 hashno = 1; 4941 } else { 4942 hashno++; 4943 } 4944 } 4945 4946 sfmmu_hblks_list_purge(&list, 0); 4947 DEMAP_RANGE_FLUSH(&dmr); 4948 cpuset = sfmmup->sfmmu_cpusran; 4949 xt_sync(cpuset); 4950 } 4951 4952 /* 4953 * This function chgattr on a range of addresses in an hmeblk. It returns the 4954 * next addres that needs to be chgattr. 4955 * It should be called with the hash lock held. 4956 * XXX It should be possible to optimize chgattr by not flushing every time but 4957 * on the other hand: 4958 * 1. do one flush crosscall. 4959 * 2. 
only flush if we are increasing permissions (make sure this will work) 4960 */ 4961 static caddr_t 4962 sfmmu_hblk_chgattr(struct hat *sfmmup, struct hme_blk *hmeblkp, caddr_t addr, 4963 caddr_t endaddr, demap_range_t *dmrp, uint_t attr, int mode) 4964 { 4965 tte_t tte, tteattr, tteflags, ttemod; 4966 struct sf_hment *sfhmep; 4967 int ttesz; 4968 struct page *pp = NULL; 4969 kmutex_t *pml, *pmtx; 4970 int ret; 4971 int use_demap_range; 4972 #if defined(SF_ERRATA_57) 4973 int check_exec; 4974 #endif 4975 4976 ASSERT(in_hblk_range(hmeblkp, addr)); 4977 ASSERT(hmeblkp->hblk_shw_bit == 0); 4978 ASSERT(!hmeblkp->hblk_shared); 4979 4980 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp)); 4981 ttesz = get_hblk_ttesz(hmeblkp); 4982 4983 /* 4984 * Flush the current demap region if addresses have been 4985 * skipped or the page size doesn't match. 4986 */ 4987 use_demap_range = (TTEBYTES(ttesz) == DEMAP_RANGE_PGSZ(dmrp)); 4988 if (use_demap_range) { 4989 DEMAP_RANGE_CONTINUE(dmrp, addr, endaddr); 4990 } else { 4991 DEMAP_RANGE_FLUSH(dmrp); 4992 } 4993 4994 tteattr.ll = sfmmu_vtop_attr(attr, mode, &tteflags); 4995 #if defined(SF_ERRATA_57) 4996 check_exec = (sfmmup != ksfmmup) && 4997 AS_TYPE_64BIT(sfmmup->sfmmu_as) && 4998 TTE_IS_EXECUTABLE(&tteattr); 4999 #endif 5000 HBLKTOHME(sfhmep, hmeblkp, addr); 5001 while (addr < endaddr) { 5002 sfmmu_copytte(&sfhmep->hme_tte, &tte); 5003 if (TTE_IS_VALID(&tte)) { 5004 if ((tte.ll & tteflags.ll) == tteattr.ll) { 5005 /* 5006 * if the new attr is the same as old 5007 * continue 5008 */ 5009 goto next_addr; 5010 } 5011 if (!TTE_IS_WRITABLE(&tteattr)) { 5012 /* 5013 * make sure we clear hw modify bit if we 5014 * removing write protections 5015 */ 5016 tteflags.tte_intlo |= TTE_HWWR_INT; 5017 } 5018 5019 pml = NULL; 5020 pp = sfhmep->hme_page; 5021 if (pp) { 5022 pml = sfmmu_mlist_enter(pp); 5023 } 5024 5025 if (pp != sfhmep->hme_page) { 5026 /* 5027 * tte must have been unloaded. 5028 */ 5029 ASSERT(pml); 5030 sfmmu_mlist_exit(pml); 5031 continue; 5032 } 5033 5034 ASSERT(pp == NULL || sfmmu_mlist_held(pp)); 5035 5036 ttemod = tte; 5037 ttemod.ll = (ttemod.ll & ~tteflags.ll) | tteattr.ll; 5038 ASSERT(TTE_TO_TTEPFN(&ttemod) == TTE_TO_TTEPFN(&tte)); 5039 5040 #if defined(SF_ERRATA_57) 5041 if (check_exec && addr < errata57_limit) 5042 ttemod.tte_exec_perm = 0; 5043 #endif 5044 ret = sfmmu_modifytte_try(&tte, &ttemod, 5045 &sfhmep->hme_tte); 5046 5047 if (ret < 0) { 5048 /* tte changed underneath us */ 5049 if (pml) { 5050 sfmmu_mlist_exit(pml); 5051 } 5052 continue; 5053 } 5054 5055 if ((tteflags.tte_intlo & TTE_HWWR_INT) || 5056 (TTE_EXECUTED(&tte) && 5057 !TTE_IS_EXECUTABLE(&ttemod))) { 5058 /* 5059 * need to sync if clearing modify/exec bit. 5060 */ 5061 sfmmu_ttesync(sfmmup, addr, &tte, pp); 5062 } 5063 5064 if (pp && PP_ISRO(pp)) { 5065 if (tteattr.tte_intlo & TTE_WRPRM_INT) { 5066 pmtx = sfmmu_page_enter(pp); 5067 PP_CLRRO(pp); 5068 sfmmu_page_exit(pmtx); 5069 } 5070 } 5071 5072 if (ret > 0 && use_demap_range) { 5073 DEMAP_RANGE_MARKPG(dmrp, addr); 5074 } else if (ret > 0) { 5075 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0); 5076 } 5077 5078 if (pml) { 5079 sfmmu_mlist_exit(pml); 5080 } 5081 } 5082 next_addr: 5083 addr += TTEBYTES(ttesz); 5084 sfhmep++; 5085 DEMAP_RANGE_NEXTPG(dmrp); 5086 } 5087 return (addr); 5088 } 5089 5090 /* 5091 * This routine converts virtual attributes to physical ones. It will 5092 * update the tteflags field with the tte mask corresponding to the attributes 5093 * affected and it returns the new attributes. 
It will also clear the modify 5094 * bit if we are taking away write permission. This is necessary since the 5095 * modify bit is the hardware permission bit and we need to clear it in order 5096 * to detect write faults. 5097 */ 5098 static uint64_t 5099 sfmmu_vtop_attr(uint_t attr, int mode, tte_t *ttemaskp) 5100 { 5101 tte_t ttevalue; 5102 5103 ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR)); 5104 5105 switch (mode) { 5106 case SFMMU_CHGATTR: 5107 /* all attributes specified */ 5108 ttevalue.tte_inthi = MAKE_TTEATTR_INTHI(attr); 5109 ttevalue.tte_intlo = MAKE_TTEATTR_INTLO(attr); 5110 ttemaskp->tte_inthi = TTEINTHI_ATTR; 5111 ttemaskp->tte_intlo = TTEINTLO_ATTR; 5112 if (!icache_is_coherent) { 5113 if (!(attr & PROT_EXEC)) { 5114 TTE_SET_SOFTEXEC(ttemaskp); 5115 } else { 5116 TTE_CLR_EXEC(ttemaskp); 5117 TTE_SET_SOFTEXEC(&ttevalue); 5118 } 5119 } 5120 break; 5121 case SFMMU_SETATTR: 5122 ASSERT(!(attr & ~HAT_PROT_MASK)); 5123 ttemaskp->ll = 0; 5124 ttevalue.ll = 0; 5125 /* 5126 * a valid tte implies exec and read for sfmmu 5127 * so no need to do anything about them. 5128 * since priviledged access implies user access 5129 * PROT_USER doesn't make sense either. 5130 */ 5131 if (attr & PROT_WRITE) { 5132 ttemaskp->tte_intlo |= TTE_WRPRM_INT; 5133 ttevalue.tte_intlo |= TTE_WRPRM_INT; 5134 } 5135 break; 5136 case SFMMU_CLRATTR: 5137 /* attributes will be nand with current ones */ 5138 if (attr & ~(PROT_WRITE | PROT_USER)) { 5139 panic("sfmmu: attr %x not supported", attr); 5140 } 5141 ttemaskp->ll = 0; 5142 ttevalue.ll = 0; 5143 if (attr & PROT_WRITE) { 5144 /* clear both writable and modify bit */ 5145 ttemaskp->tte_intlo |= TTE_WRPRM_INT | TTE_HWWR_INT; 5146 } 5147 if (attr & PROT_USER) { 5148 ttemaskp->tte_intlo |= TTE_PRIV_INT; 5149 ttevalue.tte_intlo |= TTE_PRIV_INT; 5150 } 5151 break; 5152 default: 5153 panic("sfmmu_vtop_attr: bad mode %x", mode); 5154 } 5155 ASSERT(TTE_TO_TTEPFN(&ttevalue) == 0); 5156 return (ttevalue.ll); 5157 } 5158 5159 static uint_t 5160 sfmmu_ptov_attr(tte_t *ttep) 5161 { 5162 uint_t attr; 5163 5164 ASSERT(TTE_IS_VALID(ttep)); 5165 5166 attr = PROT_READ; 5167 5168 if (TTE_IS_WRITABLE(ttep)) { 5169 attr |= PROT_WRITE; 5170 } 5171 if (TTE_IS_EXECUTABLE(ttep)) { 5172 attr |= PROT_EXEC; 5173 } 5174 if (TTE_IS_SOFTEXEC(ttep)) { 5175 attr |= PROT_EXEC; 5176 } 5177 if (!TTE_IS_PRIVILEGED(ttep)) { 5178 attr |= PROT_USER; 5179 } 5180 if (TTE_IS_NFO(ttep)) { 5181 attr |= HAT_NOFAULT; 5182 } 5183 if (TTE_IS_NOSYNC(ttep)) { 5184 attr |= HAT_NOSYNC; 5185 } 5186 if (TTE_IS_SIDEFFECT(ttep)) { 5187 attr |= SFMMU_SIDEFFECT; 5188 } 5189 if (!TTE_IS_VCACHEABLE(ttep)) { 5190 attr |= SFMMU_UNCACHEVTTE; 5191 } 5192 if (!TTE_IS_PCACHEABLE(ttep)) { 5193 attr |= SFMMU_UNCACHEPTTE; 5194 } 5195 return (attr); 5196 } 5197 5198 /* 5199 * hat_chgprot is a deprecated hat call. New segment drivers 5200 * should store all attributes and use hat_*attr calls. 5201 * 5202 * Change the protections in the virtual address range 5203 * given to the specified virtual protection. If vprot is ~PROT_WRITE, 5204 * then remove write permission, leaving the other 5205 * permissions unchanged. If vprot is ~PROT_USER, remove user permissions. 
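 *
 * For example (illustrative only; the hat and the page-aligned range are
 * hypothetical), write permission can be revoked from a range without
 * disturbing its other attributes with:
 *
 *	hat_chgprot(as->a_hat, addr, len, (uint_t)~PROT_WRITE);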
5206 * 5207 */ 5208 void 5209 hat_chgprot(struct hat *sfmmup, caddr_t addr, size_t len, uint_t vprot) 5210 { 5211 struct hmehash_bucket *hmebp; 5212 hmeblk_tag hblktag; 5213 int hmeshift, hashno = 1; 5214 struct hme_blk *hmeblkp, *list = NULL; 5215 caddr_t endaddr; 5216 cpuset_t cpuset; 5217 demap_range_t dmr; 5218 5219 ASSERT((len & MMU_PAGEOFFSET) == 0); 5220 ASSERT(((uintptr_t)addr & MMU_PAGEOFFSET) == 0); 5221 5222 if (sfmmup->sfmmu_xhat_provider) { 5223 XHAT_CHGPROT(sfmmup, addr, len, vprot); 5224 return; 5225 } else { 5226 /* 5227 * This must be a CPU HAT. If the address space has 5228 * XHATs attached, change attributes for all of them, 5229 * just in case 5230 */ 5231 ASSERT(sfmmup->sfmmu_as != NULL); 5232 if (sfmmup->sfmmu_as->a_xhat != NULL) 5233 xhat_chgprot_all(sfmmup->sfmmu_as, addr, len, vprot); 5234 } 5235 5236 CPUSET_ZERO(cpuset); 5237 5238 if ((vprot != (uint_t)~PROT_WRITE) && (vprot & PROT_USER) && 5239 ((addr + len) > (caddr_t)USERLIMIT)) { 5240 panic("user addr %p vprot %x in kernel space", 5241 (void *)addr, vprot); 5242 } 5243 endaddr = addr + len; 5244 hblktag.htag_id = sfmmup; 5245 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 5246 DEMAP_RANGE_INIT(sfmmup, &dmr); 5247 5248 while (addr < endaddr) { 5249 hmeshift = HME_HASH_SHIFT(hashno); 5250 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 5251 hblktag.htag_rehash = hashno; 5252 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift); 5253 5254 SFMMU_HASH_LOCK(hmebp); 5255 5256 HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list); 5257 if (hmeblkp != NULL) { 5258 ASSERT(!hmeblkp->hblk_shared); 5259 /* 5260 * We've encountered a shadow hmeblk so skip the range 5261 * of the next smaller mapping size. 5262 */ 5263 if (hmeblkp->hblk_shw_bit) { 5264 ASSERT(sfmmup != ksfmmup); 5265 ASSERT(hashno > 1); 5266 addr = (caddr_t)P2END((uintptr_t)addr, 5267 TTEBYTES(hashno - 1)); 5268 } else { 5269 addr = sfmmu_hblk_chgprot(sfmmup, hmeblkp, 5270 addr, endaddr, &dmr, vprot); 5271 } 5272 SFMMU_HASH_UNLOCK(hmebp); 5273 hashno = 1; 5274 continue; 5275 } 5276 SFMMU_HASH_UNLOCK(hmebp); 5277 5278 if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) { 5279 /* 5280 * We have traversed the whole list and rehashed 5281 * if necessary without finding the address to chgprot. 5282 * This is ok so we increment the address by the 5283 * smallest hmeblk range for kernel mappings and the 5284 * largest hmeblk range, to account for shadow hmeblks, 5285 * for user mappings and continue. 5286 */ 5287 if (sfmmup == ksfmmup) 5288 addr = (caddr_t)P2END((uintptr_t)addr, 5289 TTEBYTES(1)); 5290 else 5291 addr = (caddr_t)P2END((uintptr_t)addr, 5292 TTEBYTES(hashno)); 5293 hashno = 1; 5294 } else { 5295 hashno++; 5296 } 5297 } 5298 5299 sfmmu_hblks_list_purge(&list, 0); 5300 DEMAP_RANGE_FLUSH(&dmr); 5301 cpuset = sfmmup->sfmmu_cpusran; 5302 xt_sync(cpuset); 5303 } 5304 5305 /* 5306 * This function chgprots a range of addresses in an hmeblk. It returns the 5307 * next addres that needs to be chgprot. 5308 * It should be called with the hash lock held. 5309 * XXX It shold be possible to optimize chgprot by not flushing every time but 5310 * on the other hand: 5311 * 1. do one flush crosscall. 5312 * 2. 
only flush if we are increasing permissions (make sure this will work) 5313 */ 5314 static caddr_t 5315 sfmmu_hblk_chgprot(sfmmu_t *sfmmup, struct hme_blk *hmeblkp, caddr_t addr, 5316 caddr_t endaddr, demap_range_t *dmrp, uint_t vprot) 5317 { 5318 uint_t pprot; 5319 tte_t tte, ttemod; 5320 struct sf_hment *sfhmep; 5321 uint_t tteflags; 5322 int ttesz; 5323 struct page *pp = NULL; 5324 kmutex_t *pml, *pmtx; 5325 int ret; 5326 int use_demap_range; 5327 #if defined(SF_ERRATA_57) 5328 int check_exec; 5329 #endif 5330 5331 ASSERT(in_hblk_range(hmeblkp, addr)); 5332 ASSERT(hmeblkp->hblk_shw_bit == 0); 5333 ASSERT(!hmeblkp->hblk_shared); 5334 5335 #ifdef DEBUG 5336 if (get_hblk_ttesz(hmeblkp) != TTE8K && 5337 (endaddr < get_hblk_endaddr(hmeblkp))) { 5338 panic("sfmmu_hblk_chgprot: partial chgprot of large page"); 5339 } 5340 #endif /* DEBUG */ 5341 5342 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp)); 5343 ttesz = get_hblk_ttesz(hmeblkp); 5344 5345 pprot = sfmmu_vtop_prot(vprot, &tteflags); 5346 #if defined(SF_ERRATA_57) 5347 check_exec = (sfmmup != ksfmmup) && 5348 AS_TYPE_64BIT(sfmmup->sfmmu_as) && 5349 ((vprot & PROT_EXEC) == PROT_EXEC); 5350 #endif 5351 HBLKTOHME(sfhmep, hmeblkp, addr); 5352 5353 /* 5354 * Flush the current demap region if addresses have been 5355 * skipped or the page size doesn't match. 5356 */ 5357 use_demap_range = (TTEBYTES(ttesz) == MMU_PAGESIZE); 5358 if (use_demap_range) { 5359 DEMAP_RANGE_CONTINUE(dmrp, addr, endaddr); 5360 } else { 5361 DEMAP_RANGE_FLUSH(dmrp); 5362 } 5363 5364 while (addr < endaddr) { 5365 sfmmu_copytte(&sfhmep->hme_tte, &tte); 5366 if (TTE_IS_VALID(&tte)) { 5367 if (TTE_GET_LOFLAGS(&tte, tteflags) == pprot) { 5368 /* 5369 * if the new protection is the same as old 5370 * continue 5371 */ 5372 goto next_addr; 5373 } 5374 pml = NULL; 5375 pp = sfhmep->hme_page; 5376 if (pp) { 5377 pml = sfmmu_mlist_enter(pp); 5378 } 5379 if (pp != sfhmep->hme_page) { 5380 /* 5381 * tte most have been unloaded 5382 * underneath us. Recheck 5383 */ 5384 ASSERT(pml); 5385 sfmmu_mlist_exit(pml); 5386 continue; 5387 } 5388 5389 ASSERT(pp == NULL || sfmmu_mlist_held(pp)); 5390 5391 ttemod = tte; 5392 TTE_SET_LOFLAGS(&ttemod, tteflags, pprot); 5393 ASSERT(TTE_IS_SOFTEXEC(&tte) == 5394 TTE_IS_SOFTEXEC(&ttemod)); 5395 ASSERT(TTE_IS_EXECUTABLE(&tte) == 5396 TTE_IS_EXECUTABLE(&ttemod)); 5397 5398 #if defined(SF_ERRATA_57) 5399 if (check_exec && addr < errata57_limit) 5400 ttemod.tte_exec_perm = 0; 5401 #endif 5402 ret = sfmmu_modifytte_try(&tte, &ttemod, 5403 &sfhmep->hme_tte); 5404 5405 if (ret < 0) { 5406 /* tte changed underneath us */ 5407 if (pml) { 5408 sfmmu_mlist_exit(pml); 5409 } 5410 continue; 5411 } 5412 5413 if (tteflags & TTE_HWWR_INT) { 5414 /* 5415 * need to sync if we are clearing modify bit. 5416 */ 5417 sfmmu_ttesync(sfmmup, addr, &tte, pp); 5418 } 5419 5420 if (pp && PP_ISRO(pp)) { 5421 if (pprot & TTE_WRPRM_INT) { 5422 pmtx = sfmmu_page_enter(pp); 5423 PP_CLRRO(pp); 5424 sfmmu_page_exit(pmtx); 5425 } 5426 } 5427 5428 if (ret > 0 && use_demap_range) { 5429 DEMAP_RANGE_MARKPG(dmrp, addr); 5430 } else if (ret > 0) { 5431 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0); 5432 } 5433 5434 if (pml) { 5435 sfmmu_mlist_exit(pml); 5436 } 5437 } 5438 next_addr: 5439 addr += TTEBYTES(ttesz); 5440 sfhmep++; 5441 DEMAP_RANGE_NEXTPG(dmrp); 5442 } 5443 return (addr); 5444 } 5445 5446 /* 5447 * This routine is deprecated and should only be used by hat_chgprot. 5448 * The correct routine is sfmmu_vtop_attr. 
5449 * This routine converts virtual page protections to physical ones. It will 5450 * update the tteflags field with the tte mask corresponding to the protections 5451 * affected and it returns the new protections. It will also clear the modify 5452 * bit if we are taking away write permission. This is necessary since the 5453 * modify bit is the hardware permission bit and we need to clear it in order 5454 * to detect write faults. 5455 * It accepts the following special protections: 5456 * ~PROT_WRITE = remove write permissions. 5457 * ~PROT_USER = remove user permissions. 5458 */ 5459 static uint_t 5460 sfmmu_vtop_prot(uint_t vprot, uint_t *tteflagsp) 5461 { 5462 if (vprot == (uint_t)~PROT_WRITE) { 5463 *tteflagsp = TTE_WRPRM_INT | TTE_HWWR_INT; 5464 return (0); /* will cause wrprm to be cleared */ 5465 } 5466 if (vprot == (uint_t)~PROT_USER) { 5467 *tteflagsp = TTE_PRIV_INT; 5468 return (0); /* will cause privprm to be cleared */ 5469 } 5470 if ((vprot == 0) || (vprot == PROT_USER) || 5471 ((vprot & PROT_ALL) != vprot)) { 5472 panic("sfmmu_vtop_prot -- bad prot %x", vprot); 5473 } 5474 5475 switch (vprot) { 5476 case (PROT_READ): 5477 case (PROT_EXEC): 5478 case (PROT_EXEC | PROT_READ): 5479 *tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT | TTE_HWWR_INT; 5480 return (TTE_PRIV_INT); /* set prv and clr wrt */ 5481 case (PROT_WRITE): 5482 case (PROT_WRITE | PROT_READ): 5483 case (PROT_EXEC | PROT_WRITE): 5484 case (PROT_EXEC | PROT_WRITE | PROT_READ): 5485 *tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT; 5486 return (TTE_PRIV_INT | TTE_WRPRM_INT); /* set prv and wrt */ 5487 case (PROT_USER | PROT_READ): 5488 case (PROT_USER | PROT_EXEC): 5489 case (PROT_USER | PROT_EXEC | PROT_READ): 5490 *tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT | TTE_HWWR_INT; 5491 return (0); /* clr prv and wrt */ 5492 case (PROT_USER | PROT_WRITE): 5493 case (PROT_USER | PROT_WRITE | PROT_READ): 5494 case (PROT_USER | PROT_EXEC | PROT_WRITE): 5495 case (PROT_USER | PROT_EXEC | PROT_WRITE | PROT_READ): 5496 *tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT; 5497 return (TTE_WRPRM_INT); /* clr prv and set wrt */ 5498 default: 5499 panic("sfmmu_vtop_prot -- bad prot %x", vprot); 5500 } 5501 return (0); 5502 } 5503 5504 /* 5505 * Alternate unload for very large virtual ranges. With a true 64 bit VA, 5506 * the normal algorithm would take too long for a very large VA range with 5507 * few real mappings. This routine just walks thru all HMEs in the global 5508 * hash table to find and remove mappings. 5509 */ 5510 static void 5511 hat_unload_large_virtual( 5512 struct hat *sfmmup, 5513 caddr_t startaddr, 5514 size_t len, 5515 uint_t flags, 5516 hat_callback_t *callback) 5517 { 5518 struct hmehash_bucket *hmebp; 5519 struct hme_blk *hmeblkp; 5520 struct hme_blk *pr_hblk = NULL; 5521 struct hme_blk *nx_hblk; 5522 struct hme_blk *list = NULL; 5523 int i; 5524 demap_range_t dmr, *dmrp; 5525 cpuset_t cpuset; 5526 caddr_t endaddr = startaddr + len; 5527 caddr_t sa; 5528 caddr_t ea; 5529 caddr_t cb_sa[MAX_CB_ADDR]; 5530 caddr_t cb_ea[MAX_CB_ADDR]; 5531 int addr_cnt = 0; 5532 int a = 0; 5533 5534 if (sfmmup->sfmmu_free) { 5535 dmrp = NULL; 5536 } else { 5537 dmrp = &dmr; 5538 DEMAP_RANGE_INIT(sfmmup, dmrp); 5539 } 5540 5541 /* 5542 * Loop through all the hash buckets of HME blocks looking for matches. 
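 * The cost of this walk is proportional to the size of the user hash
 * table plus the hmeblks chained in it, not to the length of the VA
 * range, which is why it wins for huge, sparsely mapped ranges (see the
 * check in hat_unload_callback() below).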
5543 */ 5544 for (i = 0; i <= UHMEHASH_SZ; i++) { 5545 hmebp = &uhme_hash[i]; 5546 SFMMU_HASH_LOCK(hmebp); 5547 hmeblkp = hmebp->hmeblkp; 5548 pr_hblk = NULL; 5549 while (hmeblkp) { 5550 nx_hblk = hmeblkp->hblk_next; 5551 5552 /* 5553 * skip if not this context, if a shadow block or 5554 * if the mapping is not in the requested range 5555 */ 5556 if (hmeblkp->hblk_tag.htag_id != sfmmup || 5557 hmeblkp->hblk_shw_bit || 5558 (sa = (caddr_t)get_hblk_base(hmeblkp)) >= endaddr || 5559 (ea = get_hblk_endaddr(hmeblkp)) <= startaddr) { 5560 pr_hblk = hmeblkp; 5561 goto next_block; 5562 } 5563 5564 ASSERT(!hmeblkp->hblk_shared); 5565 /* 5566 * unload if there are any current valid mappings 5567 */ 5568 if (hmeblkp->hblk_vcnt != 0 || 5569 hmeblkp->hblk_hmecnt != 0) 5570 (void) sfmmu_hblk_unload(sfmmup, hmeblkp, 5571 sa, ea, dmrp, flags); 5572 5573 /* 5574 * on unmap we also release the HME block itself, once 5575 * all mappings are gone. 5576 */ 5577 if ((flags & HAT_UNLOAD_UNMAP) != 0 && 5578 !hmeblkp->hblk_vcnt && 5579 !hmeblkp->hblk_hmecnt) { 5580 ASSERT(!hmeblkp->hblk_lckcnt); 5581 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, 5582 &list, 0); 5583 } else { 5584 pr_hblk = hmeblkp; 5585 } 5586 5587 if (callback == NULL) 5588 goto next_block; 5589 5590 /* 5591 * HME blocks may span more than one page, but we may be 5592 * unmapping only one page, so check for a smaller range 5593 * for the callback 5594 */ 5595 if (sa < startaddr) 5596 sa = startaddr; 5597 if (--ea > endaddr) 5598 ea = endaddr - 1; 5599 5600 cb_sa[addr_cnt] = sa; 5601 cb_ea[addr_cnt] = ea; 5602 if (++addr_cnt == MAX_CB_ADDR) { 5603 if (dmrp != NULL) { 5604 DEMAP_RANGE_FLUSH(dmrp); 5605 cpuset = sfmmup->sfmmu_cpusran; 5606 xt_sync(cpuset); 5607 } 5608 5609 for (a = 0; a < MAX_CB_ADDR; ++a) { 5610 callback->hcb_start_addr = cb_sa[a]; 5611 callback->hcb_end_addr = cb_ea[a]; 5612 callback->hcb_function(callback); 5613 } 5614 addr_cnt = 0; 5615 } 5616 5617 next_block: 5618 hmeblkp = nx_hblk; 5619 } 5620 SFMMU_HASH_UNLOCK(hmebp); 5621 } 5622 5623 sfmmu_hblks_list_purge(&list, 0); 5624 if (dmrp != NULL) { 5625 DEMAP_RANGE_FLUSH(dmrp); 5626 cpuset = sfmmup->sfmmu_cpusran; 5627 xt_sync(cpuset); 5628 } 5629 5630 for (a = 0; a < addr_cnt; ++a) { 5631 callback->hcb_start_addr = cb_sa[a]; 5632 callback->hcb_end_addr = cb_ea[a]; 5633 callback->hcb_function(callback); 5634 } 5635 5636 /* 5637 * Check TSB and TLB page sizes if the process isn't exiting. 5638 */ 5639 if (!sfmmup->sfmmu_free) 5640 sfmmu_check_page_sizes(sfmmup, 0); 5641 } 5642 5643 /* 5644 * Unload all the mappings in the range [addr..addr+len). addr and len must 5645 * be MMU_PAGESIZE aligned. 
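 *
 * A minimal illustrative sketch of the callback variant (the callback
 * body and all names shown are hypothetical, not part of this file):
 *
 *	static void
 *	mydrv_unload_cb(hat_callback_t *cb)
 *	{
 *		... the range [cb->hcb_start_addr, cb->hcb_end_addr]
 *		    has just been unloaded ...
 *	}
 *
 *	hat_callback_t cb;
 *
 *	cb.hcb_function = mydrv_unload_cb;
 *	hat_unload_callback(as->a_hat, addr, len, HAT_UNLOAD_UNMAP, &cb);
 *
 * hat_unload() further below is equivalent to passing a NULL callback.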
5646 */ 5647 5648 extern struct seg *segkmap; 5649 #define ISSEGKMAP(sfmmup, addr) (sfmmup == ksfmmup && \ 5650 segkmap->s_base <= (addr) && (addr) < (segkmap->s_base + segkmap->s_size)) 5651 5652 5653 void 5654 hat_unload_callback( 5655 struct hat *sfmmup, 5656 caddr_t addr, 5657 size_t len, 5658 uint_t flags, 5659 hat_callback_t *callback) 5660 { 5661 struct hmehash_bucket *hmebp; 5662 hmeblk_tag hblktag; 5663 int hmeshift, hashno, iskernel; 5664 struct hme_blk *hmeblkp, *pr_hblk, *list = NULL; 5665 caddr_t endaddr; 5666 cpuset_t cpuset; 5667 int addr_count = 0; 5668 int a; 5669 caddr_t cb_start_addr[MAX_CB_ADDR]; 5670 caddr_t cb_end_addr[MAX_CB_ADDR]; 5671 int issegkmap = ISSEGKMAP(sfmmup, addr); 5672 demap_range_t dmr, *dmrp; 5673 5674 if (sfmmup->sfmmu_xhat_provider) { 5675 XHAT_UNLOAD_CALLBACK(sfmmup, addr, len, flags, callback); 5676 return; 5677 } else { 5678 /* 5679 * This must be a CPU HAT. If the address space has 5680 * XHATs attached, unload the mappings for all of them, 5681 * just in case 5682 */ 5683 ASSERT(sfmmup->sfmmu_as != NULL); 5684 if (sfmmup->sfmmu_as->a_xhat != NULL) 5685 xhat_unload_callback_all(sfmmup->sfmmu_as, addr, 5686 len, flags, callback); 5687 } 5688 5689 ASSERT((sfmmup == ksfmmup) || (flags & HAT_UNLOAD_OTHER) || \ 5690 AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 5691 5692 ASSERT(sfmmup != NULL); 5693 ASSERT((len & MMU_PAGEOFFSET) == 0); 5694 ASSERT(!((uintptr_t)addr & MMU_PAGEOFFSET)); 5695 5696 /* 5697 * Probing through a large VA range (say 63 bits) will be slow, even 5698 * at 4 Meg steps between the probes. So, when the virtual address range 5699 * is very large, search the HME entries for what to unload. 5700 * 5701 * len >> TTE_PAGE_SHIFT(TTE4M) is the # of 4Meg probes we'd need 5702 * 5703 * UHMEHASH_SZ is number of hash buckets to examine 5704 * 5705 */ 5706 if (sfmmup != KHATID && (len >> TTE_PAGE_SHIFT(TTE4M)) > UHMEHASH_SZ) { 5707 hat_unload_large_virtual(sfmmup, addr, len, flags, callback); 5708 return; 5709 } 5710 5711 CPUSET_ZERO(cpuset); 5712 5713 /* 5714 * If the process is exiting, we can save a lot of fuss since 5715 * we'll flush the TLB when we free the ctx anyway. 5716 */ 5717 if (sfmmup->sfmmu_free) 5718 dmrp = NULL; 5719 else 5720 dmrp = &dmr; 5721 5722 DEMAP_RANGE_INIT(sfmmup, dmrp); 5723 endaddr = addr + len; 5724 hblktag.htag_id = sfmmup; 5725 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 5726 5727 /* 5728 * It is likely for the vm to call unload over a wide range of 5729 * addresses that are actually very sparsely populated by 5730 * translations. In order to speed this up the sfmmu hat supports 5731 * the concept of shadow hmeblks. Dummy large page hmeblks that 5732 * correspond to actual small translations are allocated at tteload 5733 * time and are referred to as shadow hmeblks. Now, during unload 5734 * time, we first check if we have a shadow hmeblk for that 5735 * translation. The absence of one means the corresponding address 5736 * range is empty and can be skipped. 5737 * 5738 * The kernel is an exception to above statement and that is why 5739 * we don't use shadow hmeblks and hash starting from the smallest 5740 * page size. 
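 *
 * As an illustration of the saving (the numbers are an example, not a
 * measurement): a sparsely mapped 8G user range probed in 4M steps needs
 * 8G / 4M = 2048 hash lookups, and every probe that finds no shadow
 * hmeblk at the largest size lets us skip that entire 4M (or, with all
 * page sizes enabled, 256M) region without ever rehashing down to 8K.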
5741 */ 5742 if (sfmmup == KHATID) { 5743 iskernel = 1; 5744 hashno = TTE64K; 5745 } else { 5746 iskernel = 0; 5747 if (mmu_page_sizes == max_mmu_page_sizes) { 5748 hashno = TTE256M; 5749 } else { 5750 hashno = TTE4M; 5751 } 5752 } 5753 while (addr < endaddr) { 5754 hmeshift = HME_HASH_SHIFT(hashno); 5755 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 5756 hblktag.htag_rehash = hashno; 5757 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift); 5758 5759 SFMMU_HASH_LOCK(hmebp); 5760 5761 HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, &list); 5762 if (hmeblkp == NULL) { 5763 /* 5764 * didn't find an hmeblk. skip the appropiate 5765 * address range. 5766 */ 5767 SFMMU_HASH_UNLOCK(hmebp); 5768 if (iskernel) { 5769 if (hashno < mmu_hashcnt) { 5770 hashno++; 5771 continue; 5772 } else { 5773 hashno = TTE64K; 5774 addr = (caddr_t)roundup((uintptr_t)addr 5775 + 1, MMU_PAGESIZE64K); 5776 continue; 5777 } 5778 } 5779 addr = (caddr_t)roundup((uintptr_t)addr + 1, 5780 (1 << hmeshift)); 5781 if ((uintptr_t)addr & MMU_PAGEOFFSET512K) { 5782 ASSERT(hashno == TTE64K); 5783 continue; 5784 } 5785 if ((uintptr_t)addr & MMU_PAGEOFFSET4M) { 5786 hashno = TTE512K; 5787 continue; 5788 } 5789 if (mmu_page_sizes == max_mmu_page_sizes) { 5790 if ((uintptr_t)addr & MMU_PAGEOFFSET32M) { 5791 hashno = TTE4M; 5792 continue; 5793 } 5794 if ((uintptr_t)addr & MMU_PAGEOFFSET256M) { 5795 hashno = TTE32M; 5796 continue; 5797 } 5798 hashno = TTE256M; 5799 continue; 5800 } else { 5801 hashno = TTE4M; 5802 continue; 5803 } 5804 } 5805 ASSERT(hmeblkp); 5806 ASSERT(!hmeblkp->hblk_shared); 5807 if (!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) { 5808 /* 5809 * If the valid count is zero we can skip the range 5810 * mapped by this hmeblk. 5811 * We free hblks in the case of HAT_UNMAP. HAT_UNMAP 5812 * is used by segment drivers as a hint 5813 * that the mapping resource won't be used any longer. 5814 * The best example of this is during exit(). 5815 */ 5816 addr = (caddr_t)roundup((uintptr_t)addr + 1, 5817 get_hblk_span(hmeblkp)); 5818 if ((flags & HAT_UNLOAD_UNMAP) || 5819 (iskernel && !issegkmap)) { 5820 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, 5821 &list, 0); 5822 } 5823 SFMMU_HASH_UNLOCK(hmebp); 5824 5825 if (iskernel) { 5826 hashno = TTE64K; 5827 continue; 5828 } 5829 if ((uintptr_t)addr & MMU_PAGEOFFSET512K) { 5830 ASSERT(hashno == TTE64K); 5831 continue; 5832 } 5833 if ((uintptr_t)addr & MMU_PAGEOFFSET4M) { 5834 hashno = TTE512K; 5835 continue; 5836 } 5837 if (mmu_page_sizes == max_mmu_page_sizes) { 5838 if ((uintptr_t)addr & MMU_PAGEOFFSET32M) { 5839 hashno = TTE4M; 5840 continue; 5841 } 5842 if ((uintptr_t)addr & MMU_PAGEOFFSET256M) { 5843 hashno = TTE32M; 5844 continue; 5845 } 5846 hashno = TTE256M; 5847 continue; 5848 } else { 5849 hashno = TTE4M; 5850 continue; 5851 } 5852 } 5853 if (hmeblkp->hblk_shw_bit) { 5854 /* 5855 * If we encounter a shadow hmeblk we know there is 5856 * smaller sized hmeblks mapping the same address space. 5857 * Decrement the hash size and rehash. 5858 */ 5859 ASSERT(sfmmup != KHATID); 5860 hashno--; 5861 SFMMU_HASH_UNLOCK(hmebp); 5862 continue; 5863 } 5864 5865 /* 5866 * track callback address ranges. 
5867 * only start a new range when it's not contiguous 5868 */ 5869 if (callback != NULL) { 5870 if (addr_count > 0 && 5871 addr == cb_end_addr[addr_count - 1]) 5872 --addr_count; 5873 else 5874 cb_start_addr[addr_count] = addr; 5875 } 5876 5877 addr = sfmmu_hblk_unload(sfmmup, hmeblkp, addr, endaddr, 5878 dmrp, flags); 5879 5880 if (callback != NULL) 5881 cb_end_addr[addr_count++] = addr; 5882 5883 if (((flags & HAT_UNLOAD_UNMAP) || (iskernel && !issegkmap)) && 5884 !hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) { 5885 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, &list, 0); 5886 } 5887 SFMMU_HASH_UNLOCK(hmebp); 5888 5889 /* 5890 * Notify our caller as to exactly which pages 5891 * have been unloaded. We do these in clumps, 5892 * to minimize the number of xt_sync()s that need to occur. 5893 */ 5894 if (callback != NULL && addr_count == MAX_CB_ADDR) { 5895 DEMAP_RANGE_FLUSH(dmrp); 5896 if (dmrp != NULL) { 5897 cpuset = sfmmup->sfmmu_cpusran; 5898 xt_sync(cpuset); 5899 } 5900 5901 for (a = 0; a < MAX_CB_ADDR; ++a) { 5902 callback->hcb_start_addr = cb_start_addr[a]; 5903 callback->hcb_end_addr = cb_end_addr[a]; 5904 callback->hcb_function(callback); 5905 } 5906 addr_count = 0; 5907 } 5908 if (iskernel) { 5909 hashno = TTE64K; 5910 continue; 5911 } 5912 if ((uintptr_t)addr & MMU_PAGEOFFSET512K) { 5913 ASSERT(hashno == TTE64K); 5914 continue; 5915 } 5916 if ((uintptr_t)addr & MMU_PAGEOFFSET4M) { 5917 hashno = TTE512K; 5918 continue; 5919 } 5920 if (mmu_page_sizes == max_mmu_page_sizes) { 5921 if ((uintptr_t)addr & MMU_PAGEOFFSET32M) { 5922 hashno = TTE4M; 5923 continue; 5924 } 5925 if ((uintptr_t)addr & MMU_PAGEOFFSET256M) { 5926 hashno = TTE32M; 5927 continue; 5928 } 5929 hashno = TTE256M; 5930 } else { 5931 hashno = TTE4M; 5932 } 5933 } 5934 5935 sfmmu_hblks_list_purge(&list, 0); 5936 DEMAP_RANGE_FLUSH(dmrp); 5937 if (dmrp != NULL) { 5938 cpuset = sfmmup->sfmmu_cpusran; 5939 xt_sync(cpuset); 5940 } 5941 if (callback && addr_count != 0) { 5942 for (a = 0; a < addr_count; ++a) { 5943 callback->hcb_start_addr = cb_start_addr[a]; 5944 callback->hcb_end_addr = cb_end_addr[a]; 5945 callback->hcb_function(callback); 5946 } 5947 } 5948 5949 /* 5950 * Check TSB and TLB page sizes if the process isn't exiting. 5951 */ 5952 if (!sfmmup->sfmmu_free) 5953 sfmmu_check_page_sizes(sfmmup, 0); 5954 } 5955 5956 /* 5957 * Unload all the mappings in the range [addr..addr+len). addr and len must 5958 * be MMU_PAGESIZE aligned. 5959 */ 5960 void 5961 hat_unload(struct hat *sfmmup, caddr_t addr, size_t len, uint_t flags) 5962 { 5963 if (sfmmup->sfmmu_xhat_provider) { 5964 XHAT_UNLOAD(sfmmup, addr, len, flags); 5965 return; 5966 } 5967 hat_unload_callback(sfmmup, addr, len, flags, NULL); 5968 } 5969 5970 5971 /* 5972 * Find the largest mapping size for this page. 5973 */ 5974 int 5975 fnd_mapping_sz(page_t *pp) 5976 { 5977 int sz; 5978 int p_index; 5979 5980 p_index = PP_MAPINDEX(pp); 5981 5982 sz = 0; 5983 p_index >>= 1; /* don't care about 8K bit */ 5984 for (; p_index; p_index >>= 1) { 5985 sz++; 5986 } 5987 5988 return (sz); 5989 } 5990 5991 /* 5992 * This function unloads a range of addresses for an hmeblk. 5993 * It returns the next address to be unloaded. 5994 * It should be called with the hash lock held. 
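 *
 * Additional expectations, drawn from the ASSERTs and the DEBUG check
 * below rather than from a formal interface: addr must fall within the
 * hmeblk, shadow hmeblks are never passed in, and a large-page hmeblk
 * must be unloaded in its entirety (a partial unload of a large page
 * panics on DEBUG kernels).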
5995 */ 5996 static caddr_t 5997 sfmmu_hblk_unload(struct hat *sfmmup, struct hme_blk *hmeblkp, caddr_t addr, 5998 caddr_t endaddr, demap_range_t *dmrp, uint_t flags) 5999 { 6000 tte_t tte, ttemod; 6001 struct sf_hment *sfhmep; 6002 int ttesz; 6003 long ttecnt; 6004 page_t *pp; 6005 kmutex_t *pml; 6006 int ret; 6007 int use_demap_range; 6008 6009 ASSERT(in_hblk_range(hmeblkp, addr)); 6010 ASSERT(!hmeblkp->hblk_shw_bit); 6011 ASSERT(sfmmup != NULL || hmeblkp->hblk_shared); 6012 ASSERT(sfmmup == NULL || !hmeblkp->hblk_shared); 6013 ASSERT(dmrp == NULL || !hmeblkp->hblk_shared); 6014 6015 #ifdef DEBUG 6016 if (get_hblk_ttesz(hmeblkp) != TTE8K && 6017 (endaddr < get_hblk_endaddr(hmeblkp))) { 6018 panic("sfmmu_hblk_unload: partial unload of large page"); 6019 } 6020 #endif /* DEBUG */ 6021 6022 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp)); 6023 ttesz = get_hblk_ttesz(hmeblkp); 6024 6025 use_demap_range = ((dmrp == NULL) || 6026 (TTEBYTES(ttesz) == DEMAP_RANGE_PGSZ(dmrp))); 6027 6028 if (use_demap_range) { 6029 DEMAP_RANGE_CONTINUE(dmrp, addr, endaddr); 6030 } else { 6031 DEMAP_RANGE_FLUSH(dmrp); 6032 } 6033 ttecnt = 0; 6034 HBLKTOHME(sfhmep, hmeblkp, addr); 6035 6036 while (addr < endaddr) { 6037 pml = NULL; 6038 sfmmu_copytte(&sfhmep->hme_tte, &tte); 6039 if (TTE_IS_VALID(&tte)) { 6040 pp = sfhmep->hme_page; 6041 if (pp != NULL) { 6042 pml = sfmmu_mlist_enter(pp); 6043 } 6044 6045 /* 6046 * Verify if hme still points to 'pp' now that 6047 * we have p_mapping lock. 6048 */ 6049 if (sfhmep->hme_page != pp) { 6050 if (pp != NULL && sfhmep->hme_page != NULL) { 6051 ASSERT(pml != NULL); 6052 sfmmu_mlist_exit(pml); 6053 /* Re-start this iteration. */ 6054 continue; 6055 } 6056 ASSERT((pp != NULL) && 6057 (sfhmep->hme_page == NULL)); 6058 goto tte_unloaded; 6059 } 6060 6061 /* 6062 * This point on we have both HASH and p_mapping 6063 * lock. 6064 */ 6065 ASSERT(pp == sfhmep->hme_page); 6066 ASSERT(pp == NULL || sfmmu_mlist_held(pp)); 6067 6068 /* 6069 * We need to loop on modify tte because it is 6070 * possible for pagesync to come along and 6071 * change the software bits beneath us. 6072 * 6073 * Page_unload can also invalidate the tte after 6074 * we read tte outside of p_mapping lock. 6075 */ 6076 again: 6077 ttemod = tte; 6078 6079 TTE_SET_INVALID(&ttemod); 6080 ret = sfmmu_modifytte_try(&tte, &ttemod, 6081 &sfhmep->hme_tte); 6082 6083 if (ret <= 0) { 6084 if (TTE_IS_VALID(&tte)) { 6085 ASSERT(ret < 0); 6086 goto again; 6087 } 6088 if (pp != NULL) { 6089 panic("sfmmu_hblk_unload: pp = 0x%p " 6090 "tte became invalid under mlist" 6091 " lock = 0x%p", (void *)pp, 6092 (void *)pml); 6093 } 6094 continue; 6095 } 6096 6097 if (!(flags & HAT_UNLOAD_NOSYNC) || 6098 (pp != NULL && TTE_EXECUTED(&tte))) { 6099 sfmmu_ttesync(sfmmup, addr, &tte, pp); 6100 } 6101 6102 /* 6103 * Ok- we invalidated the tte. Do the rest of the job. 6104 */ 6105 ttecnt++; 6106 6107 if (flags & HAT_UNLOAD_UNLOCK) { 6108 ASSERT(hmeblkp->hblk_lckcnt > 0); 6109 atomic_add_32(&hmeblkp->hblk_lckcnt, -1); 6110 HBLK_STACK_TRACE(hmeblkp, HBLK_UNLOCK); 6111 } 6112 6113 /* 6114 * Normally we would need to flush the page 6115 * from the virtual cache at this point in 6116 * order to prevent a potential cache alias 6117 * inconsistency. 6118 * The particular scenario we need to worry 6119 * about is: 6120 * Given: va1 and va2 are two virtual address 6121 * that alias and map the same physical 6122 * address. 6123 * 1. mapping exists from va1 to pa and data 6124 * has been read into the cache. 6125 * 2. unload va1. 6126 * 3. 
load va2 and modify data using va2. 6127 * 4 unload va2. 6128 * 5. load va1 and reference data. Unless we 6129 * flush the data cache when we unload we will 6130 * get stale data. 6131 * Fortunately, page coloring eliminates the 6132 * above scenario by remembering the color a 6133 * physical page was last or is currently 6134 * mapped to. Now, we delay the flush until 6135 * the loading of translations. Only when the 6136 * new translation is of a different color 6137 * are we forced to flush. 6138 */ 6139 if (use_demap_range) { 6140 /* 6141 * Mark this page as needing a demap. 6142 */ 6143 DEMAP_RANGE_MARKPG(dmrp, addr); 6144 } else { 6145 ASSERT(sfmmup != NULL); 6146 ASSERT(!hmeblkp->hblk_shared); 6147 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 6148 sfmmup->sfmmu_free, 0); 6149 } 6150 6151 if (pp) { 6152 /* 6153 * Remove the hment from the mapping list 6154 */ 6155 ASSERT(hmeblkp->hblk_hmecnt > 0); 6156 6157 /* 6158 * Again, we cannot 6159 * ASSERT(hmeblkp->hblk_hmecnt <= NHMENTS); 6160 */ 6161 HME_SUB(sfhmep, pp); 6162 membar_stst(); 6163 atomic_add_16(&hmeblkp->hblk_hmecnt, -1); 6164 } 6165 6166 ASSERT(hmeblkp->hblk_vcnt > 0); 6167 atomic_add_16(&hmeblkp->hblk_vcnt, -1); 6168 6169 ASSERT(hmeblkp->hblk_hmecnt || hmeblkp->hblk_vcnt || 6170 !hmeblkp->hblk_lckcnt); 6171 6172 #ifdef VAC 6173 if (pp && (pp->p_nrm & (P_KPMC | P_KPMS | P_TNC))) { 6174 if (PP_ISTNC(pp)) { 6175 /* 6176 * If page was temporary 6177 * uncached, try to recache 6178 * it. Note that HME_SUB() was 6179 * called above so p_index and 6180 * mlist had been updated. 6181 */ 6182 conv_tnc(pp, ttesz); 6183 } else if (pp->p_mapping == NULL) { 6184 ASSERT(kpm_enable); 6185 /* 6186 * Page is marked to be in VAC conflict 6187 * to an existing kpm mapping and/or is 6188 * kpm mapped using only the regular 6189 * pagesize. 6190 */ 6191 sfmmu_kpm_hme_unload(pp); 6192 } 6193 } 6194 #endif /* VAC */ 6195 } else if ((pp = sfhmep->hme_page) != NULL) { 6196 /* 6197 * TTE is invalid but the hme 6198 * still exists. let pageunload 6199 * complete its job. 6200 */ 6201 ASSERT(pml == NULL); 6202 pml = sfmmu_mlist_enter(pp); 6203 if (sfhmep->hme_page != NULL) { 6204 sfmmu_mlist_exit(pml); 6205 continue; 6206 } 6207 ASSERT(sfhmep->hme_page == NULL); 6208 } else if (hmeblkp->hblk_hmecnt != 0) { 6209 /* 6210 * pageunload may have not finished decrementing 6211 * hblk_vcnt and hblk_hmecnt. Find page_t if any and 6212 * wait for pageunload to finish. Rely on pageunload 6213 * to decrement hblk_hmecnt after hblk_vcnt. 6214 */ 6215 pfn_t pfn = TTE_TO_TTEPFN(&tte); 6216 ASSERT(pml == NULL); 6217 if (pf_is_memory(pfn)) { 6218 pp = page_numtopp_nolock(pfn); 6219 if (pp != NULL) { 6220 pml = sfmmu_mlist_enter(pp); 6221 sfmmu_mlist_exit(pml); 6222 pml = NULL; 6223 } 6224 } 6225 } 6226 6227 tte_unloaded: 6228 /* 6229 * At this point, the tte we are looking at 6230 * should be unloaded, and hme has been unlinked 6231 * from page too. This is important because in 6232 * pageunload, it does ttesync() then HME_SUB. 6233 * We need to make sure HME_SUB has been completed 6234 * so we know ttesync() has been completed. Otherwise, 6235 * at exit time, after return from hat layer, VM will 6236 * release as structure which hat_setstat() (called 6237 * by ttesync()) needs. 
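 *
 * Put another way (informal restatement): once we observe that the hme
 * is unlinked here, pageunload's HME_SUB(), and therefore the ttesync()
 * that precedes it, must have completed, so the address space can
 * safely be torn down after the hat layer returns.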
6238 */ 6239 #ifdef DEBUG 6240 { 6241 tte_t dtte; 6242 6243 ASSERT(sfhmep->hme_page == NULL); 6244 6245 sfmmu_copytte(&sfhmep->hme_tte, &dtte); 6246 ASSERT(!TTE_IS_VALID(&dtte)); 6247 } 6248 #endif 6249 6250 if (pml) { 6251 sfmmu_mlist_exit(pml); 6252 } 6253 6254 addr += TTEBYTES(ttesz); 6255 sfhmep++; 6256 DEMAP_RANGE_NEXTPG(dmrp); 6257 } 6258 /* 6259 * For shared hmeblks this routine is only called when region is freed 6260 * and no longer referenced. So no need to decrement ttecnt 6261 * in the region structure here. 6262 */ 6263 if (ttecnt > 0 && sfmmup != NULL) { 6264 atomic_add_long(&sfmmup->sfmmu_ttecnt[ttesz], -ttecnt); 6265 } 6266 return (addr); 6267 } 6268 6269 /* 6270 * Synchronize all the mappings in the range [addr..addr+len). 6271 * Can be called with clearflag having two states: 6272 * HAT_SYNC_DONTZERO means just return the rm stats 6273 * HAT_SYNC_ZERORM means zero rm bits in the tte and return the stats 6274 */ 6275 void 6276 hat_sync(struct hat *sfmmup, caddr_t addr, size_t len, uint_t clearflag) 6277 { 6278 struct hmehash_bucket *hmebp; 6279 hmeblk_tag hblktag; 6280 int hmeshift, hashno = 1; 6281 struct hme_blk *hmeblkp, *list = NULL; 6282 caddr_t endaddr; 6283 cpuset_t cpuset; 6284 6285 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 6286 ASSERT((sfmmup == ksfmmup) || 6287 AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 6288 ASSERT((len & MMU_PAGEOFFSET) == 0); 6289 ASSERT((clearflag == HAT_SYNC_DONTZERO) || 6290 (clearflag == HAT_SYNC_ZERORM)); 6291 6292 CPUSET_ZERO(cpuset); 6293 6294 endaddr = addr + len; 6295 hblktag.htag_id = sfmmup; 6296 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 6297 6298 /* 6299 * Spitfire supports 4 page sizes. 6300 * Most pages are expected to be of the smallest page 6301 * size (8K) and these will not need to be rehashed. 64K 6302 * pages also don't need to be rehashed because an hmeblk 6303 * spans 64K of address space. 512K pages might need 1 rehash 6304 * and 4M pages 2 rehashes. 6305 */ 6306 while (addr < endaddr) { 6307 hmeshift = HME_HASH_SHIFT(hashno); 6308 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 6309 hblktag.htag_rehash = hashno; 6310 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift); 6311 6312 SFMMU_HASH_LOCK(hmebp); 6313 6314 HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list); 6315 if (hmeblkp != NULL) { 6316 ASSERT(!hmeblkp->hblk_shared); 6317 /* 6318 * We've encountered a shadow hmeblk so skip the range 6319 * of the next smaller mapping size. 6320 */ 6321 if (hmeblkp->hblk_shw_bit) { 6322 ASSERT(sfmmup != ksfmmup); 6323 ASSERT(hashno > 1); 6324 addr = (caddr_t)P2END((uintptr_t)addr, 6325 TTEBYTES(hashno - 1)); 6326 } else { 6327 addr = sfmmu_hblk_sync(sfmmup, hmeblkp, 6328 addr, endaddr, clearflag); 6329 } 6330 SFMMU_HASH_UNLOCK(hmebp); 6331 hashno = 1; 6332 continue; 6333 } 6334 SFMMU_HASH_UNLOCK(hmebp); 6335 6336 if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) { 6337 /* 6338 * We have traversed the whole list and rehashed 6339 * if necessary without finding the address to sync. 6340 * This is ok, so we increment the address by the 6341 * smallest hmeblk range for kernel mappings, or by the 6342 * largest hmeblk range (to account for shadow hmeblks) 6343 * for user mappings, and continue.
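 *
 * For example (illustrative arithmetic only): for the kernel,
 * TTEBYTES(1) is 64K, the span covered by an 8K hmeblk, so
 * P2END(addr, 64K) steps to the next 64K boundary; for a user hat that
 * has rehashed up to hashno == TTE4M, P2END(addr, TTEBYTES(TTE4M))
 * steps to the next 4M boundary before the search restarts at
 * hashno = 1.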
6344 */ 6345 if (sfmmup == ksfmmup) 6346 addr = (caddr_t)P2END((uintptr_t)addr, 6347 TTEBYTES(1)); 6348 else 6349 addr = (caddr_t)P2END((uintptr_t)addr, 6350 TTEBYTES(hashno)); 6351 hashno = 1; 6352 } else { 6353 hashno++; 6354 } 6355 } 6356 sfmmu_hblks_list_purge(&list, 0); 6357 cpuset = sfmmup->sfmmu_cpusran; 6358 xt_sync(cpuset); 6359 } 6360 6361 static caddr_t 6362 sfmmu_hblk_sync(struct hat *sfmmup, struct hme_blk *hmeblkp, caddr_t addr, 6363 caddr_t endaddr, int clearflag) 6364 { 6365 tte_t tte, ttemod; 6366 struct sf_hment *sfhmep; 6367 int ttesz; 6368 struct page *pp; 6369 kmutex_t *pml; 6370 int ret; 6371 6372 ASSERT(hmeblkp->hblk_shw_bit == 0); 6373 ASSERT(!hmeblkp->hblk_shared); 6374 6375 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp)); 6376 6377 ttesz = get_hblk_ttesz(hmeblkp); 6378 HBLKTOHME(sfhmep, hmeblkp, addr); 6379 6380 while (addr < endaddr) { 6381 sfmmu_copytte(&sfhmep->hme_tte, &tte); 6382 if (TTE_IS_VALID(&tte)) { 6383 pml = NULL; 6384 pp = sfhmep->hme_page; 6385 if (pp) { 6386 pml = sfmmu_mlist_enter(pp); 6387 } 6388 if (pp != sfhmep->hme_page) { 6389 /* 6390 * tte most have been unloaded 6391 * underneath us. Recheck 6392 */ 6393 ASSERT(pml); 6394 sfmmu_mlist_exit(pml); 6395 continue; 6396 } 6397 6398 ASSERT(pp == NULL || sfmmu_mlist_held(pp)); 6399 6400 if (clearflag == HAT_SYNC_ZERORM) { 6401 ttemod = tte; 6402 TTE_CLR_RM(&ttemod); 6403 ret = sfmmu_modifytte_try(&tte, &ttemod, 6404 &sfhmep->hme_tte); 6405 if (ret < 0) { 6406 if (pml) { 6407 sfmmu_mlist_exit(pml); 6408 } 6409 continue; 6410 } 6411 6412 if (ret > 0) { 6413 sfmmu_tlb_demap(addr, sfmmup, 6414 hmeblkp, 0, 0); 6415 } 6416 } 6417 sfmmu_ttesync(sfmmup, addr, &tte, pp); 6418 if (pml) { 6419 sfmmu_mlist_exit(pml); 6420 } 6421 } 6422 addr += TTEBYTES(ttesz); 6423 sfhmep++; 6424 } 6425 return (addr); 6426 } 6427 6428 /* 6429 * This function will sync a tte to the page struct and it will 6430 * update the hat stats. Currently it allows us to pass a NULL pp 6431 * and we will simply update the stats. We may want to change this 6432 * so we only keep stats for pages backed by pp's. 6433 */ 6434 static void 6435 sfmmu_ttesync(struct hat *sfmmup, caddr_t addr, tte_t *ttep, page_t *pp) 6436 { 6437 uint_t rm = 0; 6438 int sz = TTE_CSZ(ttep); 6439 pgcnt_t npgs; 6440 6441 ASSERT(TTE_IS_VALID(ttep)); 6442 6443 if (!TTE_IS_NOSYNC(ttep)) { 6444 6445 if (TTE_IS_REF(ttep)) 6446 rm |= P_REF; 6447 6448 if (TTE_IS_MOD(ttep)) 6449 rm |= P_MOD; 6450 6451 if (rm != 0) { 6452 if (sfmmup != NULL && sfmmup->sfmmu_rmstat) { 6453 int i; 6454 caddr_t vaddr = addr; 6455 6456 for (i = 0; i < TTEPAGES(sz); i++) { 6457 hat_setstat(sfmmup->sfmmu_as, vaddr, 6458 MMU_PAGESIZE, rm); 6459 vaddr += MMU_PAGESIZE; 6460 } 6461 } 6462 } 6463 } 6464 6465 if (!pp) 6466 return; 6467 6468 /* 6469 * If software says this page is executable, and the page was 6470 * in fact executed (indicated by hardware exec permission 6471 * being enabled), then set P_EXEC on the page to remember 6472 * that it was executed. The I$ will be flushed when the page 6473 * is reassigned. 6474 */ 6475 if (TTE_EXECUTED(ttep)) { 6476 rm |= P_EXEC; 6477 } else if (rm == 0) { 6478 return; 6479 } 6480 6481 /* 6482 * XXX I want to use cas to update nrm bits but they 6483 * currently belong in common/vm and not in hat where 6484 * they should be. 6485 * The nrm bits are protected by the same mutex as 6486 * the one that protects the page's mapping list. 
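 *
 * (At this point rm is some combination of P_REF and P_MOD, possibly
 * with P_EXEC added above; hat_page_setattr() below folds it into
 * p_nrm for every constituent page when the tte maps a large page.)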
6487 */ 6488 ASSERT(sfmmu_mlist_held(pp)); 6489 /* 6490 * If the tte is for a large page, we need to sync all the 6491 * pages covered by the tte. 6492 */ 6493 if (sz != TTE8K) { 6494 ASSERT(pp->p_szc != 0); 6495 pp = PP_GROUPLEADER(pp, sz); 6496 ASSERT(sfmmu_mlist_held(pp)); 6497 } 6498 6499 /* Get number of pages from tte size. */ 6500 npgs = TTEPAGES(sz); 6501 6502 do { 6503 ASSERT(pp); 6504 ASSERT(sfmmu_mlist_held(pp)); 6505 if (((rm & P_REF) != 0 && !PP_ISREF(pp)) || 6506 ((rm & P_MOD) != 0 && !PP_ISMOD(pp)) || 6507 ((rm & P_EXEC) != 0 && !PP_ISEXEC(pp))) 6508 hat_page_setattr(pp, rm); 6509 6510 /* 6511 * Are we done? If not, we must have a large mapping. 6512 * For large mappings we need to sync the rest of the pages 6513 * covered by this tte; goto the next page. 6514 */ 6515 } while (--npgs > 0 && (pp = PP_PAGENEXT(pp))); 6516 } 6517 6518 /* 6519 * Execute pre-callback handler of each pa_hment linked to pp 6520 * 6521 * Inputs: 6522 * flag: either HAT_PRESUSPEND or HAT_SUSPEND. 6523 * capture_cpus: pointer to return value (below) 6524 * 6525 * Returns: 6526 * Propagates the subsystem callback return values back to the caller; 6527 * returns 0 on success. If capture_cpus is non-NULL, the value returned 6528 * is zero if all of the pa_hments are of a type that do not require 6529 * capturing CPUs prior to suspending the mapping, else it is 1. 6530 */ 6531 static int 6532 hat_pageprocess_precallbacks(struct page *pp, uint_t flag, int *capture_cpus) 6533 { 6534 struct sf_hment *sfhmep; 6535 struct pa_hment *pahmep; 6536 int (*f)(caddr_t, uint_t, uint_t, void *); 6537 int ret; 6538 id_t id; 6539 int locked = 0; 6540 kmutex_t *pml; 6541 6542 ASSERT(PAGE_EXCL(pp)); 6543 if (!sfmmu_mlist_held(pp)) { 6544 pml = sfmmu_mlist_enter(pp); 6545 locked = 1; 6546 } 6547 6548 if (capture_cpus) 6549 *capture_cpus = 0; 6550 6551 top: 6552 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) { 6553 /* 6554 * skip sf_hments corresponding to VA<->PA mappings; 6555 * for pa_hment's, hme_tte.ll is zero 6556 */ 6557 if (!IS_PAHME(sfhmep)) 6558 continue; 6559 6560 pahmep = sfhmep->hme_data; 6561 ASSERT(pahmep != NULL); 6562 6563 /* 6564 * skip if pre-handler has been called earlier in this loop 6565 */ 6566 if (pahmep->flags & flag) 6567 continue; 6568 6569 id = pahmep->cb_id; 6570 ASSERT(id >= (id_t)0 && id < sfmmu_cb_nextid); 6571 if (capture_cpus && sfmmu_cb_table[id].capture_cpus != 0) 6572 *capture_cpus = 1; 6573 if ((f = sfmmu_cb_table[id].prehandler) == NULL) { 6574 pahmep->flags |= flag; 6575 continue; 6576 } 6577 6578 /* 6579 * Drop the mapping list lock to avoid locking order issues. 6580 */ 6581 if (locked) 6582 sfmmu_mlist_exit(pml); 6583 6584 ret = f(pahmep->addr, pahmep->len, flag, pahmep->pvt); 6585 if (ret != 0) 6586 return (ret); /* caller must do the cleanup */ 6587 6588 if (locked) { 6589 pml = sfmmu_mlist_enter(pp); 6590 pahmep->flags |= flag; 6591 goto top; 6592 } 6593 6594 pahmep->flags |= flag; 6595 } 6596 6597 if (locked) 6598 sfmmu_mlist_exit(pml); 6599 6600 return (0); 6601 } 6602 6603 /* 6604 * Execute post-callback handler of each pa_hment linked to pp 6605 * 6606 * Same overall assumptions and restrictions apply as for 6607 * hat_pageprocess_precallbacks(). 
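 *
 * Rough sketch of the overall callback protocol, pieced together from
 * this file (see hat_page_relocate() below): a subsystem registers its
 * pre/post/error handlers and a capture_cpus preference via
 * hat_register_callback() and attaches a pa_hment to the page; during a
 * relocation its prehandler runs with HAT_PRESUSPEND and then
 * HAT_SUSPEND, and its posthandler runs with HAT_UNSUSPEND and finally
 * HAT_POSTUNSUSPEND (or with HAT_POSTUNSUSPEND alone if the presuspend
 * pass fails).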
6608 */ 6609 static void 6610 hat_pageprocess_postcallbacks(struct page *pp, uint_t flag) 6611 { 6612 pfn_t pgpfn = pp->p_pagenum; 6613 pfn_t pgmask = btop(page_get_pagesize(pp->p_szc)) - 1; 6614 pfn_t newpfn; 6615 struct sf_hment *sfhmep; 6616 struct pa_hment *pahmep; 6617 int (*f)(caddr_t, uint_t, uint_t, void *, pfn_t); 6618 id_t id; 6619 int locked = 0; 6620 kmutex_t *pml; 6621 6622 ASSERT(PAGE_EXCL(pp)); 6623 if (!sfmmu_mlist_held(pp)) { 6624 pml = sfmmu_mlist_enter(pp); 6625 locked = 1; 6626 } 6627 6628 top: 6629 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) { 6630 /* 6631 * skip sf_hments corresponding to VA<->PA mappings; 6632 * for pa_hment's, hme_tte.ll is zero 6633 */ 6634 if (!IS_PAHME(sfhmep)) 6635 continue; 6636 6637 pahmep = sfhmep->hme_data; 6638 ASSERT(pahmep != NULL); 6639 6640 if ((pahmep->flags & flag) == 0) 6641 continue; 6642 6643 pahmep->flags &= ~flag; 6644 6645 id = pahmep->cb_id; 6646 ASSERT(id >= (id_t)0 && id < sfmmu_cb_nextid); 6647 if ((f = sfmmu_cb_table[id].posthandler) == NULL) 6648 continue; 6649 6650 /* 6651 * Convert the base page PFN into the constituent PFN 6652 * which is needed by the callback handler. 6653 */ 6654 newpfn = pgpfn | (btop((uintptr_t)pahmep->addr) & pgmask); 6655 6656 /* 6657 * Drop the mapping list lock to avoid locking order issues. 6658 */ 6659 if (locked) 6660 sfmmu_mlist_exit(pml); 6661 6662 if (f(pahmep->addr, pahmep->len, flag, pahmep->pvt, newpfn) 6663 != 0) 6664 panic("sfmmu: posthandler failed"); 6665 6666 if (locked) { 6667 pml = sfmmu_mlist_enter(pp); 6668 goto top; 6669 } 6670 } 6671 6672 if (locked) 6673 sfmmu_mlist_exit(pml); 6674 } 6675 6676 /* 6677 * Suspend locked kernel mapping 6678 */ 6679 void 6680 hat_pagesuspend(struct page *pp) 6681 { 6682 struct sf_hment *sfhmep; 6683 sfmmu_t *sfmmup; 6684 tte_t tte, ttemod; 6685 struct hme_blk *hmeblkp; 6686 caddr_t addr; 6687 int index, cons; 6688 cpuset_t cpuset; 6689 6690 ASSERT(PAGE_EXCL(pp)); 6691 ASSERT(sfmmu_mlist_held(pp)); 6692 6693 mutex_enter(&kpr_suspendlock); 6694 6695 /* 6696 * We're about to suspend a kernel mapping so mark this thread as 6697 * non-traceable by DTrace. This prevents us from running into issues 6698 * with probe context trying to touch a suspended page 6699 * in the relocation codepath itself. 6700 */ 6701 curthread->t_flag |= T_DONTDTRACE; 6702 6703 index = PP_MAPINDEX(pp); 6704 cons = TTE8K; 6705 6706 retry: 6707 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) { 6708 6709 if (IS_PAHME(sfhmep)) 6710 continue; 6711 6712 if (get_hblk_ttesz(sfmmu_hmetohblk(sfhmep)) != cons) 6713 continue; 6714 6715 /* 6716 * Loop until we successfully set the suspend bit in 6717 * the TTE. 6718 */ 6719 again: 6720 sfmmu_copytte(&sfhmep->hme_tte, &tte); 6721 ASSERT(TTE_IS_VALID(&tte)); 6722 6723 ttemod = tte; 6724 TTE_SET_SUSPEND(&ttemod); 6725 if (sfmmu_modifytte_try(&tte, &ttemod, 6726 &sfhmep->hme_tte) < 0) 6727 goto again; 6728 6729 /* 6730 * Invalidate TSB entry 6731 */ 6732 hmeblkp = sfmmu_hmetohblk(sfhmep); 6733 6734 sfmmup = hblktosfmmu(hmeblkp); 6735 ASSERT(sfmmup == ksfmmup); 6736 ASSERT(!hmeblkp->hblk_shared); 6737 6738 addr = tte_to_vaddr(hmeblkp, tte); 6739 6740 /* 6741 * No need to make sure that the TSB for this sfmmu is 6742 * not being relocated since it is ksfmmup and thus it 6743 * will never be relocated. 
6744 */ 6745 SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, 0); 6746 6747 /* 6748 * Update xcall stats 6749 */ 6750 cpuset = cpu_ready_set; 6751 CPUSET_DEL(cpuset, CPU->cpu_id); 6752 6753 /* LINTED: constant in conditional context */ 6754 SFMMU_XCALL_STATS(ksfmmup); 6755 6756 /* 6757 * Flush TLB entry on remote CPU's 6758 */ 6759 xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)addr, 6760 (uint64_t)ksfmmup); 6761 xt_sync(cpuset); 6762 6763 /* 6764 * Flush TLB entry on local CPU 6765 */ 6766 vtag_flushpage(addr, (uint64_t)ksfmmup); 6767 } 6768 6769 while (index != 0) { 6770 index = index >> 1; 6771 if (index != 0) 6772 cons++; 6773 if (index & 0x1) { 6774 pp = PP_GROUPLEADER(pp, cons); 6775 goto retry; 6776 } 6777 } 6778 } 6779 6780 #ifdef DEBUG 6781 6782 #define N_PRLE 1024 6783 struct prle { 6784 page_t *targ; 6785 page_t *repl; 6786 int status; 6787 int pausecpus; 6788 hrtime_t whence; 6789 }; 6790 6791 static struct prle page_relocate_log[N_PRLE]; 6792 static int prl_entry; 6793 static kmutex_t prl_mutex; 6794 6795 #define PAGE_RELOCATE_LOG(t, r, s, p) \ 6796 mutex_enter(&prl_mutex); \ 6797 page_relocate_log[prl_entry].targ = *(t); \ 6798 page_relocate_log[prl_entry].repl = *(r); \ 6799 page_relocate_log[prl_entry].status = (s); \ 6800 page_relocate_log[prl_entry].pausecpus = (p); \ 6801 page_relocate_log[prl_entry].whence = gethrtime(); \ 6802 prl_entry = (prl_entry == (N_PRLE - 1))? 0 : prl_entry + 1; \ 6803 mutex_exit(&prl_mutex); 6804 6805 #else /* !DEBUG */ 6806 #define PAGE_RELOCATE_LOG(t, r, s, p) 6807 #endif 6808 6809 /* 6810 * Core Kernel Page Relocation Algorithm 6811 * 6812 * Input: 6813 * 6814 * target : constituent pages are SE_EXCL locked. 6815 * replacement: constituent pages are SE_EXCL locked. 6816 * 6817 * Output: 6818 * 6819 * nrelocp: number of pages relocated 6820 */ 6821 int 6822 hat_page_relocate(page_t **target, page_t **replacement, spgcnt_t *nrelocp) 6823 { 6824 page_t *targ, *repl; 6825 page_t *tpp, *rpp; 6826 kmutex_t *low, *high; 6827 spgcnt_t npages, i; 6828 page_t *pl = NULL; 6829 uint_t ppattr; 6830 int old_pil; 6831 cpuset_t cpuset; 6832 int cap_cpus; 6833 int ret; 6834 #ifdef VAC 6835 int cflags = 0; 6836 #endif 6837 6838 if (hat_kpr_enabled == 0 || !kcage_on || PP_ISNORELOC(*target)) { 6839 PAGE_RELOCATE_LOG(target, replacement, EAGAIN, -1); 6840 return (EAGAIN); 6841 } 6842 6843 mutex_enter(&kpr_mutex); 6844 kreloc_thread = curthread; 6845 6846 targ = *target; 6847 repl = *replacement; 6848 ASSERT(repl != NULL); 6849 ASSERT(targ->p_szc == repl->p_szc); 6850 6851 npages = page_get_pagecnt(targ->p_szc); 6852 6853 /* 6854 * unload VA<->PA mappings that are not locked 6855 */ 6856 tpp = targ; 6857 for (i = 0; i < npages; i++) { 6858 (void) hat_pageunload(tpp, SFMMU_KERNEL_RELOC); 6859 tpp++; 6860 } 6861 6862 /* 6863 * Do "presuspend" callbacks, in a context from which we can still 6864 * block as needed. Note that we don't hold the mapping list lock 6865 * of "targ" at this point due to potential locking order issues; 6866 * we assume that between the hat_pageunload() above and holding 6867 * the SE_EXCL lock that the mapping list *cannot* change at this 6868 * point. 6869 */ 6870 ret = hat_pageprocess_precallbacks(targ, HAT_PRESUSPEND, &cap_cpus); 6871 if (ret != 0) { 6872 /* 6873 * EIO translates to fatal error, for all others cleanup 6874 * and return EAGAIN. 
6875 */ 6876 ASSERT(ret != EIO); 6877 hat_pageprocess_postcallbacks(targ, HAT_POSTUNSUSPEND); 6878 PAGE_RELOCATE_LOG(target, replacement, ret, -1); 6879 kreloc_thread = NULL; 6880 mutex_exit(&kpr_mutex); 6881 return (EAGAIN); 6882 } 6883 6884 /* 6885 * acquire p_mapping list lock for both the target and replacement 6886 * root pages. 6887 * 6888 * low and high refer to the need to grab the mlist locks in a 6889 * specific order in order to prevent race conditions. Thus the 6890 * lower lock must be grabbed before the higher lock. 6891 * 6892 * This will block hat_unload's accessing p_mapping list. Since 6893 * we have SE_EXCL lock, hat_memload and hat_pageunload will be 6894 * blocked. Thus, no one else will be accessing the p_mapping list 6895 * while we suspend and reload the locked mapping below. 6896 */ 6897 tpp = targ; 6898 rpp = repl; 6899 sfmmu_mlist_reloc_enter(tpp, rpp, &low, &high); 6900 6901 kpreempt_disable(); 6902 6903 /* 6904 * We raise our PIL to 13 so that we don't get captured by 6905 * another CPU or pinned by an interrupt thread. We can't go to 6906 * PIL 14 since the nexus driver(s) may need to interrupt at 6907 * that level in the case of IOMMU pseudo mappings. 6908 */ 6909 cpuset = cpu_ready_set; 6910 CPUSET_DEL(cpuset, CPU->cpu_id); 6911 if (!cap_cpus || CPUSET_ISNULL(cpuset)) { 6912 old_pil = splr(XCALL_PIL); 6913 } else { 6914 old_pil = -1; 6915 xc_attention(cpuset); 6916 } 6917 ASSERT(getpil() == XCALL_PIL); 6918 6919 /* 6920 * Now do suspend callbacks. In the case of an IOMMU mapping 6921 * this will suspend all DMA activity to the page while it is 6922 * being relocated. Since we are well above LOCK_LEVEL and CPUs 6923 * may be captured at this point we should have acquired any needed 6924 * locks in the presuspend callback. 6925 */ 6926 ret = hat_pageprocess_precallbacks(targ, HAT_SUSPEND, NULL); 6927 if (ret != 0) { 6928 repl = targ; 6929 goto suspend_fail; 6930 } 6931 6932 /* 6933 * Raise the PIL yet again, this time to block all high-level 6934 * interrupts on this CPU. This is necessary to prevent an 6935 * interrupt routine from pinning the thread which holds the 6936 * mapping suspended and then touching the suspended page. 6937 * 6938 * Once the page is suspended we also need to be careful to 6939 * avoid calling any functions which touch any seg_kmem memory 6940 * since that memory may be backed by the very page we are 6941 * relocating in here! 6942 */ 6943 hat_pagesuspend(targ); 6944 6945 /* 6946 * Now that we are confident everybody has stopped using this page, 6947 * copy the page contents. Note we use a physical copy to prevent 6948 * locking issues and to avoid fpRAS because we can't handle it in 6949 * this context. 6950 */ 6951 for (i = 0; i < npages; i++, tpp++, rpp++) { 6952 #ifdef VAC 6953 /* 6954 * If the replacement has a different vcolor than 6955 * the one being replacd, we need to handle VAC 6956 * consistency for it just as we were setting up 6957 * a new mapping to it. 6958 */ 6959 if ((PP_GET_VCOLOR(rpp) != NO_VCOLOR) && 6960 (tpp->p_vcolor != rpp->p_vcolor) && 6961 !CacheColor_IsFlushed(cflags, PP_GET_VCOLOR(rpp))) { 6962 CacheColor_SetFlushed(cflags, PP_GET_VCOLOR(rpp)); 6963 sfmmu_cache_flushcolor(PP_GET_VCOLOR(rpp), 6964 rpp->p_pagenum); 6965 } 6966 #endif 6967 /* 6968 * Copy the contents of the page. 6969 */ 6970 ppcopy_kernel(tpp, rpp); 6971 } 6972 6973 tpp = targ; 6974 rpp = repl; 6975 for (i = 0; i < npages; i++, tpp++, rpp++) { 6976 /* 6977 * Copy attributes. VAC consistency was handled above, 6978 * if required. 
6979 */ 6980 ppattr = hat_page_getattr(tpp, (P_MOD | P_REF | P_RO)); 6981 page_clr_all_props(rpp, 0); 6982 page_set_props(rpp, ppattr); 6983 rpp->p_index = tpp->p_index; 6984 tpp->p_index = 0; 6985 #ifdef VAC 6986 rpp->p_vcolor = tpp->p_vcolor; 6987 #endif 6988 } 6989 6990 /* 6991 * First, unsuspend the page, if we set the suspend bit, and transfer 6992 * the mapping list from the target page to the replacement page. 6993 * Next process postcallbacks; since pa_hment's are linked only to the 6994 * p_mapping list of root page, we don't iterate over the constituent 6995 * pages. 6996 */ 6997 hat_pagereload(targ, repl); 6998 6999 suspend_fail: 7000 hat_pageprocess_postcallbacks(repl, HAT_UNSUSPEND); 7001 7002 /* 7003 * Now lower our PIL and release any captured CPUs since we 7004 * are out of the "danger zone". After this it will again be 7005 * safe to acquire adaptive mutex locks, or to drop them... 7006 */ 7007 if (old_pil != -1) { 7008 splx(old_pil); 7009 } else { 7010 xc_dismissed(cpuset); 7011 } 7012 7013 kpreempt_enable(); 7014 7015 sfmmu_mlist_reloc_exit(low, high); 7016 7017 /* 7018 * Postsuspend callbacks should drop any locks held across 7019 * the suspend callbacks. As before, we don't hold the mapping 7020 * list lock at this point.. our assumption is that the mapping 7021 * list still can't change due to our holding SE_EXCL lock and 7022 * there being no unlocked mappings left. Hence the restriction 7023 * on calling context to hat_delete_callback() 7024 */ 7025 hat_pageprocess_postcallbacks(repl, HAT_POSTUNSUSPEND); 7026 if (ret != 0) { 7027 /* 7028 * The second presuspend call failed: we got here through 7029 * the suspend_fail label above. 7030 */ 7031 ASSERT(ret != EIO); 7032 PAGE_RELOCATE_LOG(target, replacement, ret, cap_cpus); 7033 kreloc_thread = NULL; 7034 mutex_exit(&kpr_mutex); 7035 return (EAGAIN); 7036 } 7037 7038 /* 7039 * Now that we're out of the performance critical section we can 7040 * take care of updating the hash table, since we still 7041 * hold all the pages locked SE_EXCL at this point we 7042 * needn't worry about things changing out from under us. 7043 */ 7044 tpp = targ; 7045 rpp = repl; 7046 for (i = 0; i < npages; i++, tpp++, rpp++) { 7047 7048 /* 7049 * replace targ with replacement in page_hash table 7050 */ 7051 targ = tpp; 7052 page_relocate_hash(rpp, targ); 7053 7054 /* 7055 * concatenate target; caller of platform_page_relocate() 7056 * expects target to be concatenated after returning. 7057 */ 7058 ASSERT(targ->p_next == targ); 7059 ASSERT(targ->p_prev == targ); 7060 page_list_concat(&pl, &targ); 7061 } 7062 7063 ASSERT(*target == pl); 7064 *nrelocp = npages; 7065 PAGE_RELOCATE_LOG(target, replacement, 0, cap_cpus); 7066 kreloc_thread = NULL; 7067 mutex_exit(&kpr_mutex); 7068 return (0); 7069 } 7070 7071 /* 7072 * Called when stray pa_hments are found attached to a page which is 7073 * being freed. Notify the subsystem which attached the pa_hment of 7074 * the error if it registered a suitable handler, else panic. 7075 */ 7076 static void 7077 sfmmu_pahment_leaked(struct pa_hment *pahmep) 7078 { 7079 id_t cb_id = pahmep->cb_id; 7080 7081 ASSERT(cb_id >= (id_t)0 && cb_id < sfmmu_cb_nextid); 7082 if (sfmmu_cb_table[cb_id].errhandler != NULL) { 7083 if (sfmmu_cb_table[cb_id].errhandler(pahmep->addr, pahmep->len, 7084 HAT_CB_ERR_LEAKED, pahmep->pvt) == 0) 7085 return; /* non-fatal */ 7086 } 7087 panic("pa_hment leaked: 0x%p", (void *)pahmep); 7088 } 7089 7090 /* 7091 * Remove all mappings to page 'pp'. 
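 *
 * forceflag distinguishes the kernel page relocation path:
 * SFMMU_KERNEL_RELOC (passed by hat_page_relocate() above) means locked
 * kernel mappings are left in place to be suspended later rather than
 * unloaded here, and stray pa_hments are likewise left alone on that
 * path.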
7092 */ 7093 int 7094 hat_pageunload(struct page *pp, uint_t forceflag) 7095 { 7096 struct page *origpp = pp; 7097 struct sf_hment *sfhme, *tmphme; 7098 struct hme_blk *hmeblkp; 7099 kmutex_t *pml; 7100 #ifdef VAC 7101 kmutex_t *pmtx; 7102 #endif 7103 cpuset_t cpuset, tset; 7104 int index, cons; 7105 int xhme_blks; 7106 int pa_hments; 7107 7108 ASSERT(PAGE_EXCL(pp)); 7109 7110 retry_xhat: 7111 tmphme = NULL; 7112 xhme_blks = 0; 7113 pa_hments = 0; 7114 CPUSET_ZERO(cpuset); 7115 7116 pml = sfmmu_mlist_enter(pp); 7117 7118 #ifdef VAC 7119 if (pp->p_kpmref) 7120 sfmmu_kpm_pageunload(pp); 7121 ASSERT(!PP_ISMAPPED_KPM(pp)); 7122 #endif 7123 /* 7124 * Clear vpm reference. Since the page is exclusively locked 7125 * vpm cannot be referencing it. 7126 */ 7127 if (vpm_enable) { 7128 pp->p_vpmref = 0; 7129 } 7130 7131 index = PP_MAPINDEX(pp); 7132 cons = TTE8K; 7133 retry: 7134 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) { 7135 tmphme = sfhme->hme_next; 7136 7137 if (IS_PAHME(sfhme)) { 7138 ASSERT(sfhme->hme_data != NULL); 7139 pa_hments++; 7140 continue; 7141 } 7142 7143 hmeblkp = sfmmu_hmetohblk(sfhme); 7144 if (hmeblkp->hblk_xhat_bit) { 7145 struct xhat_hme_blk *xblk = 7146 (struct xhat_hme_blk *)hmeblkp; 7147 7148 (void) XHAT_PAGEUNLOAD(xblk->xhat_hme_blk_hat, 7149 pp, forceflag, XBLK2PROVBLK(xblk)); 7150 7151 xhme_blks = 1; 7152 continue; 7153 } 7154 7155 /* 7156 * If there are kernel mappings don't unload them, they will 7157 * be suspended. 7158 */ 7159 if (forceflag == SFMMU_KERNEL_RELOC && hmeblkp->hblk_lckcnt && 7160 hmeblkp->hblk_tag.htag_id == ksfmmup) 7161 continue; 7162 7163 tset = sfmmu_pageunload(pp, sfhme, cons); 7164 CPUSET_OR(cpuset, tset); 7165 } 7166 7167 while (index != 0) { 7168 index = index >> 1; 7169 if (index != 0) 7170 cons++; 7171 if (index & 0x1) { 7172 /* Go to leading page */ 7173 pp = PP_GROUPLEADER(pp, cons); 7174 ASSERT(sfmmu_mlist_held(pp)); 7175 goto retry; 7176 } 7177 } 7178 7179 /* 7180 * cpuset may be empty if the page was only mapped by segkpm, 7181 * in which case we won't actually cross-trap. 7182 */ 7183 xt_sync(cpuset); 7184 7185 /* 7186 * The page should have no mappings at this point, unless 7187 * we were called from hat_page_relocate() in which case we 7188 * leave the locked mappings which will be suspended later. 7189 */ 7190 ASSERT(!PP_ISMAPPED(origpp) || xhme_blks || pa_hments || 7191 (forceflag == SFMMU_KERNEL_RELOC)); 7192 7193 #ifdef VAC 7194 if (PP_ISTNC(pp)) { 7195 if (cons == TTE8K) { 7196 pmtx = sfmmu_page_enter(pp); 7197 PP_CLRTNC(pp); 7198 sfmmu_page_exit(pmtx); 7199 } else { 7200 conv_tnc(pp, cons); 7201 } 7202 } 7203 #endif /* VAC */ 7204 7205 if (pa_hments && forceflag != SFMMU_KERNEL_RELOC) { 7206 /* 7207 * Unlink any pa_hments and free them, calling back 7208 * the responsible subsystem to notify it of the error. 7209 * This can occur in situations such as drivers leaking 7210 * DMA handles: naughty, but common enough that we'd like 7211 * to keep the system running rather than bringing it 7212 * down with an obscure error like "pa_hment leaked" 7213 * which doesn't aid the user in debugging their driver. 
7214 */ 7215 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) { 7216 tmphme = sfhme->hme_next; 7217 if (IS_PAHME(sfhme)) { 7218 struct pa_hment *pahmep = sfhme->hme_data; 7219 sfmmu_pahment_leaked(pahmep); 7220 HME_SUB(sfhme, pp); 7221 kmem_cache_free(pa_hment_cache, pahmep); 7222 } 7223 } 7224 7225 ASSERT(!PP_ISMAPPED(origpp) || xhme_blks); 7226 } 7227 7228 sfmmu_mlist_exit(pml); 7229 7230 /* 7231 * XHAT may not have finished unloading pages 7232 * because some other thread was waiting for 7233 * mlist lock and XHAT_PAGEUNLOAD let it do 7234 * the job. 7235 */ 7236 if (xhme_blks) { 7237 pp = origpp; 7238 goto retry_xhat; 7239 } 7240 7241 return (0); 7242 } 7243 7244 cpuset_t 7245 sfmmu_pageunload(page_t *pp, struct sf_hment *sfhme, int cons) 7246 { 7247 struct hme_blk *hmeblkp; 7248 sfmmu_t *sfmmup; 7249 tte_t tte, ttemod; 7250 #ifdef DEBUG 7251 tte_t orig_old; 7252 #endif /* DEBUG */ 7253 caddr_t addr; 7254 int ttesz; 7255 int ret; 7256 cpuset_t cpuset; 7257 7258 ASSERT(pp != NULL); 7259 ASSERT(sfmmu_mlist_held(pp)); 7260 ASSERT(!PP_ISKAS(pp)); 7261 7262 CPUSET_ZERO(cpuset); 7263 7264 hmeblkp = sfmmu_hmetohblk(sfhme); 7265 7266 readtte: 7267 sfmmu_copytte(&sfhme->hme_tte, &tte); 7268 if (TTE_IS_VALID(&tte)) { 7269 sfmmup = hblktosfmmu(hmeblkp); 7270 ttesz = get_hblk_ttesz(hmeblkp); 7271 /* 7272 * Only unload mappings of 'cons' size. 7273 */ 7274 if (ttesz != cons) 7275 return (cpuset); 7276 7277 /* 7278 * Note that we have p_mapping lock, but no hash lock here. 7279 * hblk_unload() has to have both hash lock AND p_mapping 7280 * lock before it tries to modify tte. So, the tte could 7281 * not become invalid in the sfmmu_modifytte_try() below. 7282 */ 7283 ttemod = tte; 7284 #ifdef DEBUG 7285 orig_old = tte; 7286 #endif /* DEBUG */ 7287 7288 TTE_SET_INVALID(&ttemod); 7289 ret = sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte); 7290 if (ret < 0) { 7291 #ifdef DEBUG 7292 /* only R/M bits can change. */ 7293 chk_tte(&orig_old, &tte, &ttemod, hmeblkp); 7294 #endif /* DEBUG */ 7295 goto readtte; 7296 } 7297 7298 if (ret == 0) { 7299 panic("pageunload: cas failed?"); 7300 } 7301 7302 addr = tte_to_vaddr(hmeblkp, tte); 7303 7304 if (hmeblkp->hblk_shared) { 7305 sf_srd_t *srdp = (sf_srd_t *)sfmmup; 7306 uint_t rid = hmeblkp->hblk_tag.htag_rid; 7307 sf_region_t *rgnp; 7308 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 7309 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 7310 ASSERT(srdp != NULL); 7311 rgnp = srdp->srd_hmergnp[rid]; 7312 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid); 7313 cpuset = sfmmu_rgntlb_demap(addr, rgnp, hmeblkp, 1); 7314 sfmmu_ttesync(NULL, addr, &tte, pp); 7315 ASSERT(rgnp->rgn_ttecnt[ttesz] > 0); 7316 atomic_add_long(&rgnp->rgn_ttecnt[ttesz], -1); 7317 } else { 7318 sfmmu_ttesync(sfmmup, addr, &tte, pp); 7319 atomic_add_long(&sfmmup->sfmmu_ttecnt[ttesz], -1); 7320 7321 /* 7322 * We need to flush the page from the virtual cache 7323 * in order to prevent a virtual cache alias 7324 * inconsistency. The particular scenario we need 7325 * to worry about is: 7326 * Given: va1 and va2 are two virtual address that 7327 * alias and will map the same physical address. 7328 * 1. mapping exists from va1 to pa and data has 7329 * been read into the cache. 7330 * 2. unload va1. 7331 * 3. load va2 and modify data using va2. 7332 * 4 unload va2. 7333 * 5. load va1 and reference data. Unless we flush 7334 * the data cache when we unload we will get 7335 * stale data. 7336 * This scenario is taken care of by using virtual 7337 * page coloring. 
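 *
 * Concretely (assuming the usual two-color virtually indexed D$ on this
 * class of hardware, i.e. a cache two 8K pages deep): va1 and va2 can
 * only produce a stale-data alias if they land on different cache
 * colors, and page coloring guarantees that all cacheable mappings of a
 * given physical page use the same color, so step 5 above cannot
 * observe stale data.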
7338 */ 7339 if (sfmmup->sfmmu_ismhat) { 7340 /* 7341 * Flush TSBs, TLBs and caches 7342 * of every process 7343 * sharing this ism segment. 7344 */ 7345 sfmmu_hat_lock_all(); 7346 mutex_enter(&ism_mlist_lock); 7347 kpreempt_disable(); 7348 sfmmu_ismtlbcache_demap(addr, sfmmup, hmeblkp, 7349 pp->p_pagenum, CACHE_NO_FLUSH); 7350 kpreempt_enable(); 7351 mutex_exit(&ism_mlist_lock); 7352 sfmmu_hat_unlock_all(); 7353 cpuset = cpu_ready_set; 7354 } else { 7355 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0); 7356 cpuset = sfmmup->sfmmu_cpusran; 7357 } 7358 } 7359 7360 /* 7361 * Hme_sub has to run after ttesync() and a_rss update. 7362 * See hblk_unload(). 7363 */ 7364 HME_SUB(sfhme, pp); 7365 membar_stst(); 7366 7367 /* 7368 * We cannot make ASSERT(hmeblkp->hblk_hmecnt <= NHMENTS) 7369 * since pteload may have done a HME_ADD() right after 7370 * we did the HME_SUB() above. Hmecnt is now maintained 7371 * by cas only; no lock guarantees its value. The only 7372 * guarantee we have is that the hmecnt should not be less than 7373 * what it should be, so the hblk will not be taken away. 7374 * It's also important that we decrement the hmecnt only after 7375 * we are done with hmeblkp, so that this hmeblk won't be 7376 * stolen. 7377 */ 7378 ASSERT(hmeblkp->hblk_hmecnt > 0); 7379 ASSERT(hmeblkp->hblk_vcnt > 0); 7380 atomic_add_16(&hmeblkp->hblk_vcnt, -1); 7381 atomic_add_16(&hmeblkp->hblk_hmecnt, -1); 7382 /* 7383 * This is bug 4063182. 7384 * XXX: fixme 7385 * ASSERT(hmeblkp->hblk_hmecnt || hmeblkp->hblk_vcnt || 7386 * !hmeblkp->hblk_lckcnt); 7387 */ 7388 } else { 7389 panic("invalid tte? pp %p &tte %p", 7390 (void *)pp, (void *)&tte); 7391 } 7392 7393 return (cpuset); 7394 } 7395 7396 /* 7397 * While relocating a kernel page, this function will move the mappings 7398 * from tpp to dpp and modify any data associated with these mappings. 7399 * It also unsuspends the suspended kernel mapping.
7400 */ 7401 static void 7402 hat_pagereload(struct page *tpp, struct page *dpp) 7403 { 7404 struct sf_hment *sfhme; 7405 tte_t tte, ttemod; 7406 int index, cons; 7407 7408 ASSERT(getpil() == PIL_MAX); 7409 ASSERT(sfmmu_mlist_held(tpp)); 7410 ASSERT(sfmmu_mlist_held(dpp)); 7411 7412 index = PP_MAPINDEX(tpp); 7413 cons = TTE8K; 7414 7415 /* Update real mappings to the page */ 7416 retry: 7417 for (sfhme = tpp->p_mapping; sfhme != NULL; sfhme = sfhme->hme_next) { 7418 if (IS_PAHME(sfhme)) 7419 continue; 7420 sfmmu_copytte(&sfhme->hme_tte, &tte); 7421 ttemod = tte; 7422 7423 /* 7424 * replace old pfn with new pfn in TTE 7425 */ 7426 PFN_TO_TTE(ttemod, dpp->p_pagenum); 7427 7428 /* 7429 * clear suspend bit 7430 */ 7431 ASSERT(TTE_IS_SUSPEND(&ttemod)); 7432 TTE_CLR_SUSPEND(&ttemod); 7433 7434 if (sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte) < 0) 7435 panic("hat_pagereload(): sfmmu_modifytte_try() failed"); 7436 7437 /* 7438 * set hme_page point to new page 7439 */ 7440 sfhme->hme_page = dpp; 7441 } 7442 7443 /* 7444 * move p_mapping list from old page to new page 7445 */ 7446 dpp->p_mapping = tpp->p_mapping; 7447 tpp->p_mapping = NULL; 7448 dpp->p_share = tpp->p_share; 7449 tpp->p_share = 0; 7450 7451 while (index != 0) { 7452 index = index >> 1; 7453 if (index != 0) 7454 cons++; 7455 if (index & 0x1) { 7456 tpp = PP_GROUPLEADER(tpp, cons); 7457 dpp = PP_GROUPLEADER(dpp, cons); 7458 goto retry; 7459 } 7460 } 7461 7462 curthread->t_flag &= ~T_DONTDTRACE; 7463 mutex_exit(&kpr_suspendlock); 7464 } 7465 7466 uint_t 7467 hat_pagesync(struct page *pp, uint_t clearflag) 7468 { 7469 struct sf_hment *sfhme, *tmphme = NULL; 7470 struct hme_blk *hmeblkp; 7471 kmutex_t *pml; 7472 cpuset_t cpuset, tset; 7473 int index, cons; 7474 extern ulong_t po_share; 7475 page_t *save_pp = pp; 7476 int stop_on_sh = 0; 7477 uint_t shcnt; 7478 7479 CPUSET_ZERO(cpuset); 7480 7481 if (PP_ISRO(pp) && (clearflag & HAT_SYNC_STOPON_MOD)) { 7482 return (PP_GENERIC_ATTR(pp)); 7483 } 7484 7485 if ((clearflag & HAT_SYNC_ZERORM) == 0) { 7486 if ((clearflag & HAT_SYNC_STOPON_REF) && PP_ISREF(pp)) { 7487 return (PP_GENERIC_ATTR(pp)); 7488 } 7489 if ((clearflag & HAT_SYNC_STOPON_MOD) && PP_ISMOD(pp)) { 7490 return (PP_GENERIC_ATTR(pp)); 7491 } 7492 if (clearflag & HAT_SYNC_STOPON_SHARED) { 7493 if (pp->p_share > po_share) { 7494 hat_page_setattr(pp, P_REF); 7495 return (PP_GENERIC_ATTR(pp)); 7496 } 7497 stop_on_sh = 1; 7498 shcnt = 0; 7499 } 7500 } 7501 7502 clearflag &= ~HAT_SYNC_STOPON_SHARED; 7503 pml = sfmmu_mlist_enter(pp); 7504 index = PP_MAPINDEX(pp); 7505 cons = TTE8K; 7506 retry: 7507 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) { 7508 /* 7509 * We need to save the next hment on the list since 7510 * it is possible for pagesync to remove an invalid hment 7511 * from the list. 7512 */ 7513 tmphme = sfhme->hme_next; 7514 if (IS_PAHME(sfhme)) 7515 continue; 7516 /* 7517 * If we are looking for large mappings and this hme doesn't 7518 * reach the range we are seeking, just ignore it. 
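 *
 * (cons starts at TTE8K and is bumped as PP_MAPINDEX bits are consumed
 * in the loop further down, so each pass over the mapping list skips
 * hments smaller than the constituent size currently being examined.)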
7519 */ 7520 hmeblkp = sfmmu_hmetohblk(sfhme); 7521 if (hmeblkp->hblk_xhat_bit) 7522 continue; 7523 7524 if (hme_size(sfhme) < cons) 7525 continue; 7526 7527 if (stop_on_sh) { 7528 if (hmeblkp->hblk_shared) { 7529 sf_srd_t *srdp = hblktosrd(hmeblkp); 7530 uint_t rid = hmeblkp->hblk_tag.htag_rid; 7531 sf_region_t *rgnp; 7532 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 7533 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 7534 ASSERT(srdp != NULL); 7535 rgnp = srdp->srd_hmergnp[rid]; 7536 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, 7537 rgnp, rid); 7538 shcnt += rgnp->rgn_refcnt; 7539 } else { 7540 shcnt++; 7541 } 7542 if (shcnt > po_share) { 7543 /* 7544 * tell the pager to spare the page this time 7545 * around. 7546 */ 7547 hat_page_setattr(save_pp, P_REF); 7548 index = 0; 7549 break; 7550 } 7551 } 7552 tset = sfmmu_pagesync(pp, sfhme, 7553 clearflag & ~HAT_SYNC_STOPON_RM); 7554 CPUSET_OR(cpuset, tset); 7555 7556 /* 7557 * If clearflag is HAT_SYNC_DONTZERO, break out as soon 7558 * as the "ref" or "mod" is set or share cnt exceeds po_share. 7559 */ 7560 if ((clearflag & ~HAT_SYNC_STOPON_RM) == HAT_SYNC_DONTZERO && 7561 (((clearflag & HAT_SYNC_STOPON_MOD) && PP_ISMOD(save_pp)) || 7562 ((clearflag & HAT_SYNC_STOPON_REF) && PP_ISREF(save_pp)))) { 7563 index = 0; 7564 break; 7565 } 7566 } 7567 7568 while (index) { 7569 index = index >> 1; 7570 cons++; 7571 if (index & 0x1) { 7572 /* Go to leading page */ 7573 pp = PP_GROUPLEADER(pp, cons); 7574 goto retry; 7575 } 7576 } 7577 7578 xt_sync(cpuset); 7579 sfmmu_mlist_exit(pml); 7580 return (PP_GENERIC_ATTR(save_pp)); 7581 } 7582 7583 /* 7584 * Get all the hardware dependent attributes for a page struct 7585 */ 7586 static cpuset_t 7587 sfmmu_pagesync(struct page *pp, struct sf_hment *sfhme, 7588 uint_t clearflag) 7589 { 7590 caddr_t addr; 7591 tte_t tte, ttemod; 7592 struct hme_blk *hmeblkp; 7593 int ret; 7594 sfmmu_t *sfmmup; 7595 cpuset_t cpuset; 7596 7597 ASSERT(pp != NULL); 7598 ASSERT(sfmmu_mlist_held(pp)); 7599 ASSERT((clearflag == HAT_SYNC_DONTZERO) || 7600 (clearflag == HAT_SYNC_ZERORM)); 7601 7602 SFMMU_STAT(sf_pagesync); 7603 7604 CPUSET_ZERO(cpuset); 7605 7606 sfmmu_pagesync_retry: 7607 7608 sfmmu_copytte(&sfhme->hme_tte, &tte); 7609 if (TTE_IS_VALID(&tte)) { 7610 hmeblkp = sfmmu_hmetohblk(sfhme); 7611 sfmmup = hblktosfmmu(hmeblkp); 7612 addr = tte_to_vaddr(hmeblkp, tte); 7613 if (clearflag == HAT_SYNC_ZERORM) { 7614 ttemod = tte; 7615 TTE_CLR_RM(&ttemod); 7616 ret = sfmmu_modifytte_try(&tte, &ttemod, 7617 &sfhme->hme_tte); 7618 if (ret < 0) { 7619 /* 7620 * cas failed and the new value is not what 7621 * we want. 7622 */ 7623 goto sfmmu_pagesync_retry; 7624 } 7625 7626 if (ret > 0) { 7627 /* we win the cas */ 7628 if (hmeblkp->hblk_shared) { 7629 sf_srd_t *srdp = (sf_srd_t *)sfmmup; 7630 uint_t rid = 7631 hmeblkp->hblk_tag.htag_rid; 7632 sf_region_t *rgnp; 7633 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 7634 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 7635 ASSERT(srdp != NULL); 7636 rgnp = srdp->srd_hmergnp[rid]; 7637 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, 7638 srdp, rgnp, rid); 7639 cpuset = sfmmu_rgntlb_demap(addr, 7640 rgnp, hmeblkp, 1); 7641 } else { 7642 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 7643 0, 0); 7644 cpuset = sfmmup->sfmmu_cpusran; 7645 } 7646 } 7647 } 7648 sfmmu_ttesync(hmeblkp->hblk_shared ? NULL : sfmmup, addr, 7649 &tte, pp); 7650 } 7651 return (cpuset); 7652 } 7653 7654 /* 7655 * Remove write permission from a mappings to a page, so that 7656 * we can detect the next modification of it. 
This requires modifying 7657 * the TTE then invalidating (demap) any TLB entry using that TTE. 7658 * This code is similar to sfmmu_pagesync(). 7659 */ 7660 static cpuset_t 7661 sfmmu_pageclrwrt(struct page *pp, struct sf_hment *sfhme) 7662 { 7663 caddr_t addr; 7664 tte_t tte; 7665 tte_t ttemod; 7666 struct hme_blk *hmeblkp; 7667 int ret; 7668 sfmmu_t *sfmmup; 7669 cpuset_t cpuset; 7670 7671 ASSERT(pp != NULL); 7672 ASSERT(sfmmu_mlist_held(pp)); 7673 7674 CPUSET_ZERO(cpuset); 7675 SFMMU_STAT(sf_clrwrt); 7676 7677 retry: 7678 7679 sfmmu_copytte(&sfhme->hme_tte, &tte); 7680 if (TTE_IS_VALID(&tte) && TTE_IS_WRITABLE(&tte)) { 7681 hmeblkp = sfmmu_hmetohblk(sfhme); 7682 7683 /* 7684 * xhat mappings should never be to a VMODSORT page. 7685 */ 7686 ASSERT(hmeblkp->hblk_xhat_bit == 0); 7687 7688 sfmmup = hblktosfmmu(hmeblkp); 7689 addr = tte_to_vaddr(hmeblkp, tte); 7690 7691 ttemod = tte; 7692 TTE_CLR_WRT(&ttemod); 7693 TTE_CLR_MOD(&ttemod); 7694 ret = sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte); 7695 7696 /* 7697 * if cas failed and the new value is not what 7698 * we want retry 7699 */ 7700 if (ret < 0) 7701 goto retry; 7702 7703 /* we win the cas */ 7704 if (ret > 0) { 7705 if (hmeblkp->hblk_shared) { 7706 sf_srd_t *srdp = (sf_srd_t *)sfmmup; 7707 uint_t rid = hmeblkp->hblk_tag.htag_rid; 7708 sf_region_t *rgnp; 7709 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 7710 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 7711 ASSERT(srdp != NULL); 7712 rgnp = srdp->srd_hmergnp[rid]; 7713 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, 7714 srdp, rgnp, rid); 7715 cpuset = sfmmu_rgntlb_demap(addr, 7716 rgnp, hmeblkp, 1); 7717 } else { 7718 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0); 7719 cpuset = sfmmup->sfmmu_cpusran; 7720 } 7721 } 7722 } 7723 7724 return (cpuset); 7725 } 7726 7727 /* 7728 * Walk all mappings of a page, removing write permission and clearing the 7729 * ref/mod bits. This code is similar to hat_pagesync() 7730 */ 7731 static void 7732 hat_page_clrwrt(page_t *pp) 7733 { 7734 struct sf_hment *sfhme; 7735 struct sf_hment *tmphme = NULL; 7736 kmutex_t *pml; 7737 cpuset_t cpuset; 7738 cpuset_t tset; 7739 int index; 7740 int cons; 7741 7742 CPUSET_ZERO(cpuset); 7743 7744 pml = sfmmu_mlist_enter(pp); 7745 index = PP_MAPINDEX(pp); 7746 cons = TTE8K; 7747 retry: 7748 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) { 7749 tmphme = sfhme->hme_next; 7750 7751 /* 7752 * If we are looking for large mappings and this hme doesn't 7753 * reach the range we are seeking, just ignore its. 7754 */ 7755 7756 if (hme_size(sfhme) < cons) 7757 continue; 7758 7759 tset = sfmmu_pageclrwrt(pp, sfhme); 7760 CPUSET_OR(cpuset, tset); 7761 } 7762 7763 while (index) { 7764 index = index >> 1; 7765 cons++; 7766 if (index & 0x1) { 7767 /* Go to leading page */ 7768 pp = PP_GROUPLEADER(pp, cons); 7769 goto retry; 7770 } 7771 } 7772 7773 xt_sync(cpuset); 7774 sfmmu_mlist_exit(pml); 7775 } 7776 7777 /* 7778 * Set the given REF/MOD/RO bits for the given page. 7779 * For a vnode with a sorted v_pages list, we need to change 7780 * the attributes and the v_pages list together under page_vnode_mutex. 
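 *
 * For example, when P_MOD is being set on a page of a VMODSORT vnode
 * (and P_NSH is not asked for), the code below re-links the page within
 * v_pages while holding the vnode's page mutex, so the modified versus
 * unmodified ordering that pvn_vplist_dirty() relies on stays intact.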
7781 */ 7782 void 7783 hat_page_setattr(page_t *pp, uint_t flag) 7784 { 7785 vnode_t *vp = pp->p_vnode; 7786 page_t **listp; 7787 kmutex_t *pmtx; 7788 kmutex_t *vphm = NULL; 7789 int noshuffle; 7790 7791 noshuffle = flag & P_NSH; 7792 flag &= ~P_NSH; 7793 7794 ASSERT(!(flag & ~(P_MOD | P_REF | P_RO | P_EXEC))); 7795 7796 /* 7797 * nothing to do if attribute already set 7798 */ 7799 if ((pp->p_nrm & flag) == flag) 7800 return; 7801 7802 if ((flag & P_MOD) != 0 && vp != NULL && IS_VMODSORT(vp) && 7803 !noshuffle) { 7804 vphm = page_vnode_mutex(vp); 7805 mutex_enter(vphm); 7806 } 7807 7808 pmtx = sfmmu_page_enter(pp); 7809 pp->p_nrm |= flag; 7810 sfmmu_page_exit(pmtx); 7811 7812 if (vphm != NULL) { 7813 /* 7814 * Some File Systems examine v_pages for NULL w/o 7815 * grabbing the vphm mutex. Must not let it become NULL when 7816 * pp is the only page on the list. 7817 */ 7818 if (pp->p_vpnext != pp) { 7819 page_vpsub(&vp->v_pages, pp); 7820 if (vp->v_pages != NULL) 7821 listp = &vp->v_pages->p_vpprev->p_vpnext; 7822 else 7823 listp = &vp->v_pages; 7824 page_vpadd(listp, pp); 7825 } 7826 mutex_exit(vphm); 7827 } 7828 } 7829 7830 void 7831 hat_page_clrattr(page_t *pp, uint_t flag) 7832 { 7833 vnode_t *vp = pp->p_vnode; 7834 kmutex_t *pmtx; 7835 7836 ASSERT(!(flag & ~(P_MOD | P_REF | P_RO))); 7837 7838 pmtx = sfmmu_page_enter(pp); 7839 7840 /* 7841 * Caller is expected to hold page's io lock for VMODSORT to work 7842 * correctly with pvn_vplist_dirty() and pvn_getdirty() when mod 7843 * bit is cleared. 7844 * We don't have assert to avoid tripping some existing third party 7845 * code. The dirty page is moved back to top of the v_page list 7846 * after IO is done in pvn_write_done(). 7847 */ 7848 pp->p_nrm &= ~flag; 7849 sfmmu_page_exit(pmtx); 7850 7851 if ((flag & P_MOD) != 0 && vp != NULL && IS_VMODSORT(vp)) { 7852 7853 /* 7854 * VMODSORT works by removing write permissions and getting 7855 * a fault when a page is made dirty. At this point 7856 * we need to remove write permission from all mappings 7857 * to this page. 7858 */ 7859 hat_page_clrwrt(pp); 7860 } 7861 } 7862 7863 uint_t 7864 hat_page_getattr(page_t *pp, uint_t flag) 7865 { 7866 ASSERT(!(flag & ~(P_MOD | P_REF | P_RO))); 7867 return ((uint_t)(pp->p_nrm & flag)); 7868 } 7869 7870 /* 7871 * DEBUG kernels: verify that a kernel va<->pa translation 7872 * is safe by checking the underlying page_t is in a page 7873 * relocation-safe state. 7874 */ 7875 #ifdef DEBUG 7876 void 7877 sfmmu_check_kpfn(pfn_t pfn) 7878 { 7879 page_t *pp; 7880 int index, cons; 7881 7882 if (hat_check_vtop == 0) 7883 return; 7884 7885 if (hat_kpr_enabled == 0 || kvseg.s_base == NULL || panicstr) 7886 return; 7887 7888 pp = page_numtopp_nolock(pfn); 7889 if (!pp) 7890 return; 7891 7892 if (PAGE_LOCKED(pp) || PP_ISNORELOC(pp)) 7893 return; 7894 7895 /* 7896 * Handed a large kernel page, we dig up the root page since we 7897 * know the root page might have the lock also. 7898 */ 7899 if (pp->p_szc != 0) { 7900 index = PP_MAPINDEX(pp); 7901 cons = TTE8K; 7902 again: 7903 while (index != 0) { 7904 index >>= 1; 7905 if (index != 0) 7906 cons++; 7907 if (index & 0x1) { 7908 pp = PP_GROUPLEADER(pp, cons); 7909 goto again; 7910 } 7911 } 7912 } 7913 7914 if (PAGE_LOCKED(pp) || PP_ISNORELOC(pp)) 7915 return; 7916 7917 /* 7918 * Pages need to be locked or allocated "permanent" (either from 7919 * static_arena arena or explicitly setting PG_NORELOC when calling 7920 * page_create_va()) for VA->PA translations to be valid. 
7921 */ 7922 if (!PP_ISNORELOC(pp)) 7923 panic("Illegal VA->PA translation, pp 0x%p not permanent", 7924 (void *)pp); 7925 else 7926 panic("Illegal VA->PA translation, pp 0x%p not locked", 7927 (void *)pp); 7928 } 7929 #endif /* DEBUG */ 7930 7931 /* 7932 * Returns a page frame number for a given virtual address. 7933 * Returns PFN_INVALID to indicate an invalid mapping 7934 */ 7935 pfn_t 7936 hat_getpfnum(struct hat *hat, caddr_t addr) 7937 { 7938 pfn_t pfn; 7939 tte_t tte; 7940 7941 /* 7942 * We would like to 7943 * ASSERT(AS_LOCK_HELD(as, &as->a_lock)); 7944 * but we can't because the iommu driver will call this 7945 * routine at interrupt time and it can't grab the as lock 7946 * or it will deadlock: A thread could have the as lock 7947 * and be waiting for io. The io can't complete 7948 * because the interrupt thread is blocked trying to grab 7949 * the as lock. 7950 */ 7951 7952 ASSERT(hat->sfmmu_xhat_provider == NULL); 7953 7954 if (hat == ksfmmup) { 7955 if (IS_KMEM_VA_LARGEPAGE(addr)) { 7956 ASSERT(segkmem_lpszc > 0); 7957 pfn = sfmmu_kvaszc2pfn(addr, segkmem_lpszc); 7958 if (pfn != PFN_INVALID) { 7959 sfmmu_check_kpfn(pfn); 7960 return (pfn); 7961 } 7962 } else if (segkpm && IS_KPM_ADDR(addr)) { 7963 return (sfmmu_kpm_vatopfn(addr)); 7964 } 7965 while ((pfn = sfmmu_vatopfn(addr, ksfmmup, &tte)) 7966 == PFN_SUSPENDED) { 7967 sfmmu_vatopfn_suspended(addr, ksfmmup, &tte); 7968 } 7969 sfmmu_check_kpfn(pfn); 7970 return (pfn); 7971 } else { 7972 return (sfmmu_uvatopfn(addr, hat, NULL)); 7973 } 7974 } 7975 7976 /* 7977 * hat_getkpfnum() is an obsolete DDI routine, and its use is discouraged. 7978 * Use hat_getpfnum(kas.a_hat, ...) instead. 7979 * 7980 * We'd like to return PFN_INVALID if the mappings have underlying page_t's 7981 * but can't right now due to the fact that some software has grown to use 7982 * this interface incorrectly. So for now when the interface is misused, 7983 * return a warning to the user that in the future it won't work in the 7984 * way they're abusing it, and carry on (after disabling page relocation). 7985 */ 7986 pfn_t 7987 hat_getkpfnum(caddr_t addr) 7988 { 7989 pfn_t pfn; 7990 tte_t tte; 7991 int badcaller = 0; 7992 extern int segkmem_reloc; 7993 7994 if (segkpm && IS_KPM_ADDR(addr)) { 7995 badcaller = 1; 7996 pfn = sfmmu_kpm_vatopfn(addr); 7997 } else { 7998 while ((pfn = sfmmu_vatopfn(addr, ksfmmup, &tte)) 7999 == PFN_SUSPENDED) { 8000 sfmmu_vatopfn_suspended(addr, ksfmmup, &tte); 8001 } 8002 badcaller = pf_is_memory(pfn); 8003 } 8004 8005 if (badcaller) { 8006 /* 8007 * We can't return PFN_INVALID or the caller may panic 8008 * or corrupt the system. The only alternative is to 8009 * disable page relocation at this point for all kernel 8010 * memory. This will impact any callers of page_relocate() 8011 * such as FMA or DR. 8012 * 8013 * RFE: Add junk here to spit out an ereport so the sysadmin 8014 * can be advised that he should upgrade his device driver 8015 * so that this doesn't happen. 8016 */ 8017 hat_getkpfnum_badcall(caller()); 8018 if (hat_kpr_enabled && segkmem_reloc) { 8019 hat_kpr_enabled = 0; 8020 segkmem_reloc = 0; 8021 cmn_err(CE_WARN, "Kernel Page Relocation is DISABLED"); 8022 } 8023 } 8024 return (pfn); 8025 } 8026 8027 /* 8028 * This routine will return both pfn and tte for the vaddr. 
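 * The lookup below proceeds in three steps: redirect vaddr into the ISM
 * hat if it falls inside an ISM segment, search the private hme hash
 * chains, and finally search the shared (SRD) hme hash chains for region
 * mappings.  PFN_INVALID is returned if no valid tte is found.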
8029 */ 8030 static pfn_t 8031 sfmmu_uvatopfn(caddr_t vaddr, struct hat *sfmmup, tte_t *ttep) 8032 { 8033 struct hmehash_bucket *hmebp; 8034 hmeblk_tag hblktag; 8035 int hmeshift, hashno = 1; 8036 struct hme_blk *hmeblkp = NULL; 8037 tte_t tte; 8038 8039 struct sf_hment *sfhmep; 8040 pfn_t pfn; 8041 8042 /* support for ISM */ 8043 ism_map_t *ism_map; 8044 ism_blk_t *ism_blkp; 8045 int i; 8046 sfmmu_t *ism_hatid = NULL; 8047 sfmmu_t *locked_hatid = NULL; 8048 sfmmu_t *sv_sfmmup = sfmmup; 8049 caddr_t sv_vaddr = vaddr; 8050 sf_srd_t *srdp; 8051 8052 if (ttep == NULL) { 8053 ttep = &tte; 8054 } else { 8055 ttep->ll = 0; 8056 } 8057 8058 ASSERT(sfmmup != ksfmmup); 8059 SFMMU_STAT(sf_user_vtop); 8060 /* 8061 * Set ism_hatid if vaddr falls in a ISM segment. 8062 */ 8063 ism_blkp = sfmmup->sfmmu_iblk; 8064 if (ism_blkp != NULL) { 8065 sfmmu_ismhat_enter(sfmmup, 0); 8066 locked_hatid = sfmmup; 8067 } 8068 while (ism_blkp != NULL && ism_hatid == NULL) { 8069 ism_map = ism_blkp->iblk_maps; 8070 for (i = 0; ism_map[i].imap_ismhat && i < ISM_MAP_SLOTS; i++) { 8071 if (vaddr >= ism_start(ism_map[i]) && 8072 vaddr < ism_end(ism_map[i])) { 8073 sfmmup = ism_hatid = ism_map[i].imap_ismhat; 8074 vaddr = (caddr_t)(vaddr - 8075 ism_start(ism_map[i])); 8076 break; 8077 } 8078 } 8079 ism_blkp = ism_blkp->iblk_next; 8080 } 8081 if (locked_hatid) { 8082 sfmmu_ismhat_exit(locked_hatid, 0); 8083 } 8084 8085 hblktag.htag_id = sfmmup; 8086 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 8087 do { 8088 hmeshift = HME_HASH_SHIFT(hashno); 8089 hblktag.htag_bspage = HME_HASH_BSPAGE(vaddr, hmeshift); 8090 hblktag.htag_rehash = hashno; 8091 hmebp = HME_HASH_FUNCTION(sfmmup, vaddr, hmeshift); 8092 8093 SFMMU_HASH_LOCK(hmebp); 8094 8095 HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp); 8096 if (hmeblkp != NULL) { 8097 ASSERT(!hmeblkp->hblk_shared); 8098 HBLKTOHME(sfhmep, hmeblkp, vaddr); 8099 sfmmu_copytte(&sfhmep->hme_tte, ttep); 8100 SFMMU_HASH_UNLOCK(hmebp); 8101 if (TTE_IS_VALID(ttep)) { 8102 pfn = TTE_TO_PFN(vaddr, ttep); 8103 return (pfn); 8104 } 8105 break; 8106 } 8107 SFMMU_HASH_UNLOCK(hmebp); 8108 hashno++; 8109 } while (HME_REHASH(sfmmup) && (hashno <= mmu_hashcnt)); 8110 8111 if (SF_HMERGNMAP_ISNULL(sv_sfmmup)) { 8112 return (PFN_INVALID); 8113 } 8114 srdp = sv_sfmmup->sfmmu_srdp; 8115 ASSERT(srdp != NULL); 8116 ASSERT(srdp->srd_refcnt != 0); 8117 hblktag.htag_id = srdp; 8118 hashno = 1; 8119 do { 8120 hmeshift = HME_HASH_SHIFT(hashno); 8121 hblktag.htag_bspage = HME_HASH_BSPAGE(sv_vaddr, hmeshift); 8122 hblktag.htag_rehash = hashno; 8123 hmebp = HME_HASH_FUNCTION(srdp, sv_vaddr, hmeshift); 8124 8125 SFMMU_HASH_LOCK(hmebp); 8126 for (hmeblkp = hmebp->hmeblkp; hmeblkp != NULL; 8127 hmeblkp = hmeblkp->hblk_next) { 8128 uint_t rid; 8129 sf_region_t *rgnp; 8130 caddr_t rsaddr; 8131 caddr_t readdr; 8132 8133 if (!HTAGS_EQ_SHME(hmeblkp->hblk_tag, hblktag, 8134 sv_sfmmup->sfmmu_hmeregion_map)) { 8135 continue; 8136 } 8137 ASSERT(hmeblkp->hblk_shared); 8138 rid = hmeblkp->hblk_tag.htag_rid; 8139 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 8140 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 8141 rgnp = srdp->srd_hmergnp[rid]; 8142 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid); 8143 HBLKTOHME(sfhmep, hmeblkp, sv_vaddr); 8144 sfmmu_copytte(&sfhmep->hme_tte, ttep); 8145 rsaddr = rgnp->rgn_saddr; 8146 readdr = rsaddr + rgnp->rgn_size; 8147 #ifdef DEBUG 8148 if (TTE_IS_VALID(ttep) || 8149 get_hblk_ttesz(hmeblkp) > TTE8K) { 8150 caddr_t eva = tte_to_evaddr(hmeblkp, ttep); 8151 ASSERT(eva > sv_vaddr); 8152 ASSERT(sv_vaddr >= rsaddr); 8153 
ASSERT(sv_vaddr < readdr); 8154 ASSERT(eva <= readdr); 8155 } 8156 #endif /* DEBUG */ 8157 /* 8158 * Continue the search if we 8159 * found an invalid 8K tte outside of the area 8160 * covered by this hmeblk's region. 8161 */ 8162 if (TTE_IS_VALID(ttep)) { 8163 SFMMU_HASH_UNLOCK(hmebp); 8164 pfn = TTE_TO_PFN(sv_vaddr, ttep); 8165 return (pfn); 8166 } else if (get_hblk_ttesz(hmeblkp) > TTE8K || 8167 (sv_vaddr >= rsaddr && sv_vaddr < readdr)) { 8168 SFMMU_HASH_UNLOCK(hmebp); 8169 pfn = PFN_INVALID; 8170 return (pfn); 8171 } 8172 } 8173 SFMMU_HASH_UNLOCK(hmebp); 8174 hashno++; 8175 } while (hashno <= mmu_hashcnt); 8176 return (PFN_INVALID); 8177 } 8178 8179 8180 /* 8181 * For compatability with AT&T and later optimizations 8182 */ 8183 /* ARGSUSED */ 8184 void 8185 hat_map(struct hat *hat, caddr_t addr, size_t len, uint_t flags) 8186 { 8187 ASSERT(hat != NULL); 8188 ASSERT(hat->sfmmu_xhat_provider == NULL); 8189 } 8190 8191 /* 8192 * Return the number of mappings to a particular page. This number is an 8193 * approximation of the number of people sharing the page. 8194 * 8195 * shared hmeblks or ism hmeblks are counted as 1 mapping here. 8196 * hat_page_checkshare() can be used to compare threshold to share 8197 * count that reflects the number of region sharers albeit at higher cost. 8198 */ 8199 ulong_t 8200 hat_page_getshare(page_t *pp) 8201 { 8202 page_t *spp = pp; /* start page */ 8203 kmutex_t *pml; 8204 ulong_t cnt; 8205 int index, sz = TTE64K; 8206 8207 /* 8208 * We need to grab the mlist lock to make sure any outstanding 8209 * load/unloads complete. Otherwise we could return zero 8210 * even though the unload(s) hasn't finished yet. 8211 */ 8212 pml = sfmmu_mlist_enter(spp); 8213 cnt = spp->p_share; 8214 8215 #ifdef VAC 8216 if (kpm_enable) 8217 cnt += spp->p_kpmref; 8218 #endif 8219 if (vpm_enable && pp->p_vpmref) { 8220 cnt += 1; 8221 } 8222 8223 /* 8224 * If we have any large mappings, we count the number of 8225 * mappings that this large page is part of. 8226 */ 8227 index = PP_MAPINDEX(spp); 8228 index >>= 1; 8229 while (index) { 8230 pp = PP_GROUPLEADER(spp, sz); 8231 if ((index & 0x1) && pp != spp) { 8232 cnt += pp->p_share; 8233 spp = pp; 8234 } 8235 index >>= 1; 8236 sz++; 8237 } 8238 sfmmu_mlist_exit(pml); 8239 return (cnt); 8240 } 8241 8242 /* 8243 * Return 1 if the number of mappings exceeds sh_thresh. Return 0 8244 * otherwise. Count shared hmeblks by region's refcnt. 
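 * Unlike hat_page_getshare() above, a shared hmeblk contributes its
 * region's rgn_refcnt rather than 1, so the count here reflects the
 * number of region sharers, at the cost of walking the mapping list
 * until sh_thresh is exceeded.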
8245 */ 8246 int 8247 hat_page_checkshare(page_t *pp, ulong_t sh_thresh) 8248 { 8249 kmutex_t *pml; 8250 ulong_t cnt = 0; 8251 int index, sz = TTE8K; 8252 struct sf_hment *sfhme, *tmphme = NULL; 8253 struct hme_blk *hmeblkp; 8254 8255 pml = sfmmu_mlist_enter(pp); 8256 8257 #ifdef VAC 8258 if (kpm_enable) 8259 cnt = pp->p_kpmref; 8260 #endif 8261 8262 if (vpm_enable && pp->p_vpmref) { 8263 cnt += 1; 8264 } 8265 8266 if (pp->p_share + cnt > sh_thresh) { 8267 sfmmu_mlist_exit(pml); 8268 return (1); 8269 } 8270 8271 index = PP_MAPINDEX(pp); 8272 8273 again: 8274 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) { 8275 tmphme = sfhme->hme_next; 8276 if (IS_PAHME(sfhme)) { 8277 continue; 8278 } 8279 8280 hmeblkp = sfmmu_hmetohblk(sfhme); 8281 if (hmeblkp->hblk_xhat_bit) { 8282 cnt++; 8283 if (cnt > sh_thresh) { 8284 sfmmu_mlist_exit(pml); 8285 return (1); 8286 } 8287 continue; 8288 } 8289 if (hme_size(sfhme) != sz) { 8290 continue; 8291 } 8292 8293 if (hmeblkp->hblk_shared) { 8294 sf_srd_t *srdp = hblktosrd(hmeblkp); 8295 uint_t rid = hmeblkp->hblk_tag.htag_rid; 8296 sf_region_t *rgnp; 8297 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 8298 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 8299 ASSERT(srdp != NULL); 8300 rgnp = srdp->srd_hmergnp[rid]; 8301 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, 8302 rgnp, rid); 8303 cnt += rgnp->rgn_refcnt; 8304 } else { 8305 cnt++; 8306 } 8307 if (cnt > sh_thresh) { 8308 sfmmu_mlist_exit(pml); 8309 return (1); 8310 } 8311 } 8312 8313 index >>= 1; 8314 sz++; 8315 while (index) { 8316 pp = PP_GROUPLEADER(pp, sz); 8317 ASSERT(sfmmu_mlist_held(pp)); 8318 if (index & 0x1) { 8319 goto again; 8320 } 8321 index >>= 1; 8322 sz++; 8323 } 8324 sfmmu_mlist_exit(pml); 8325 return (0); 8326 } 8327 8328 /* 8329 * Unload all large mappings to the pp and reset the p_szc field of every 8330 * constituent page according to the remaining mappings. 8331 * 8332 * pp must be locked SE_EXCL. Even though no other constituent pages are 8333 * locked it's legal to unload the large mappings to the pp because all 8334 * constituent pages of large locked mappings have to be locked SE_SHARED. 8335 * This means if we have SE_EXCL lock on one of constituent pages none of the 8336 * large mappings to pp are locked. 8337 * 8338 * Decrease p_szc field starting from the last constituent page and ending 8339 * with the root page. This method is used because other threads rely on the 8340 * root's p_szc to find the lock to syncronize on. After a root page_t's p_szc 8341 * is demoted then other threads will succeed in sfmmu_mlspl_enter(). This 8342 * ensures that p_szc changes of the constituent pages appears atomic for all 8343 * threads that use sfmmu_mlspl_enter() to examine p_szc field. 8344 * 8345 * This mechanism is only used for file system pages where it's not always 8346 * possible to get SE_EXCL locks on all constituent pages to demote the size 8347 * code (as is done for anonymous or kernel large pages). 8348 * 8349 * See more comments in front of sfmmu_mlspl_enter(). 
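 * Roughly: demoting a large page walks the constituent page_t's from the
 * last one backwards, lowering each p_szc to whatever mapping size still
 * remains on it, with membar_producer() making those stores visible
 * before a group leader's (and finally the root's) p_szc is lowered, so
 * sfmmu_mlspl_enter() never sees the root demoted ahead of its
 * constituents.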
8350 */ 8351 void 8352 hat_page_demote(page_t *pp) 8353 { 8354 int index; 8355 int sz; 8356 cpuset_t cpuset; 8357 int sync = 0; 8358 page_t *rootpp; 8359 struct sf_hment *sfhme; 8360 struct sf_hment *tmphme = NULL; 8361 struct hme_blk *hmeblkp; 8362 uint_t pszc; 8363 page_t *lastpp; 8364 cpuset_t tset; 8365 pgcnt_t npgs; 8366 kmutex_t *pml; 8367 kmutex_t *pmtx = NULL; 8368 8369 ASSERT(PAGE_EXCL(pp)); 8370 ASSERT(!PP_ISFREE(pp)); 8371 ASSERT(!PP_ISKAS(pp)); 8372 ASSERT(page_szc_lock_assert(pp)); 8373 pml = sfmmu_mlist_enter(pp); 8374 8375 pszc = pp->p_szc; 8376 if (pszc == 0) { 8377 goto out; 8378 } 8379 8380 index = PP_MAPINDEX(pp) >> 1; 8381 8382 if (index) { 8383 CPUSET_ZERO(cpuset); 8384 sz = TTE64K; 8385 sync = 1; 8386 } 8387 8388 while (index) { 8389 if (!(index & 0x1)) { 8390 index >>= 1; 8391 sz++; 8392 continue; 8393 } 8394 ASSERT(sz <= pszc); 8395 rootpp = PP_GROUPLEADER(pp, sz); 8396 for (sfhme = rootpp->p_mapping; sfhme; sfhme = tmphme) { 8397 tmphme = sfhme->hme_next; 8398 ASSERT(!IS_PAHME(sfhme)); 8399 hmeblkp = sfmmu_hmetohblk(sfhme); 8400 if (hme_size(sfhme) != sz) { 8401 continue; 8402 } 8403 if (hmeblkp->hblk_xhat_bit) { 8404 cmn_err(CE_PANIC, 8405 "hat_page_demote: xhat hmeblk"); 8406 } 8407 tset = sfmmu_pageunload(rootpp, sfhme, sz); 8408 CPUSET_OR(cpuset, tset); 8409 } 8410 if (index >>= 1) { 8411 sz++; 8412 } 8413 } 8414 8415 ASSERT(!PP_ISMAPPED_LARGE(pp)); 8416 8417 if (sync) { 8418 xt_sync(cpuset); 8419 #ifdef VAC 8420 if (PP_ISTNC(pp)) { 8421 conv_tnc(rootpp, sz); 8422 } 8423 #endif /* VAC */ 8424 } 8425 8426 pmtx = sfmmu_page_enter(pp); 8427 8428 ASSERT(pp->p_szc == pszc); 8429 rootpp = PP_PAGEROOT(pp); 8430 ASSERT(rootpp->p_szc == pszc); 8431 lastpp = PP_PAGENEXT_N(rootpp, TTEPAGES(pszc) - 1); 8432 8433 while (lastpp != rootpp) { 8434 sz = PP_MAPINDEX(lastpp) ? fnd_mapping_sz(lastpp) : 0; 8435 ASSERT(sz < pszc); 8436 npgs = (sz == 0) ? 1 : TTEPAGES(sz); 8437 ASSERT(P2PHASE(lastpp->p_pagenum, npgs) == npgs - 1); 8438 while (--npgs > 0) { 8439 lastpp->p_szc = (uchar_t)sz; 8440 lastpp = PP_PAGEPREV(lastpp); 8441 } 8442 if (sz) { 8443 /* 8444 * make sure before current root's pszc 8445 * is updated all updates to constituent pages pszc 8446 * fields are globally visible. 8447 */ 8448 membar_producer(); 8449 } 8450 lastpp->p_szc = sz; 8451 ASSERT(IS_P2ALIGNED(lastpp->p_pagenum, TTEPAGES(sz))); 8452 if (lastpp != rootpp) { 8453 lastpp = PP_PAGEPREV(lastpp); 8454 } 8455 } 8456 if (sz == 0) { 8457 /* the loop above doesn't cover this case */ 8458 rootpp->p_szc = 0; 8459 } 8460 out: 8461 ASSERT(pp->p_szc == 0); 8462 if (pmtx != NULL) { 8463 sfmmu_page_exit(pmtx); 8464 } 8465 sfmmu_mlist_exit(pml); 8466 } 8467 8468 /* 8469 * Refresh the HAT ismttecnt[] element for size szc. 8470 * Caller must have set ISM busy flag to prevent mapping 8471 * lists from changing while we're traversing them. 
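 * The totals are split between ISM segments that belong to the hat's SCD
 * (cached in sfmmu_scdismttecnt[szc]) and those that do not (cached in
 * sfmmu_ismttecnt[szc], which is also the return value).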
8472 */ 8473 pgcnt_t 8474 ism_tsb_entries(sfmmu_t *sfmmup, int szc) 8475 { 8476 ism_blk_t *ism_blkp = sfmmup->sfmmu_iblk; 8477 ism_map_t *ism_map; 8478 pgcnt_t npgs = 0; 8479 pgcnt_t npgs_scd = 0; 8480 int j; 8481 sf_scd_t *scdp; 8482 uchar_t rid; 8483 hatlock_t *hatlockp; 8484 int ismnotinscd = 0; 8485 8486 ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)); 8487 scdp = sfmmup->sfmmu_scdp; 8488 8489 for (; ism_blkp != NULL; ism_blkp = ism_blkp->iblk_next) { 8490 ism_map = ism_blkp->iblk_maps; 8491 for (j = 0; ism_map[j].imap_ismhat && j < ISM_MAP_SLOTS; j++) { 8492 rid = ism_map[j].imap_rid; 8493 ASSERT(rid == SFMMU_INVALID_ISMRID || 8494 rid < sfmmup->sfmmu_srdp->srd_next_ismrid); 8495 8496 if (scdp != NULL && rid != SFMMU_INVALID_ISMRID && 8497 SF_RGNMAP_TEST(scdp->scd_ismregion_map, rid)) { 8498 /* ISM is in sfmmup's SCD */ 8499 npgs_scd += 8500 ism_map[j].imap_ismhat->sfmmu_ttecnt[szc]; 8501 } else { 8502 /* ISMs is not in SCD */ 8503 npgs += 8504 ism_map[j].imap_ismhat->sfmmu_ttecnt[szc]; 8505 ismnotinscd = 1; 8506 } 8507 } 8508 } 8509 8510 if (&mmu_set_pgsz_order) { 8511 hatlockp = sfmmu_hat_enter(sfmmup); 8512 if (ismnotinscd) { 8513 SFMMU_FLAGS_SET(sfmmup, HAT_ISMNOTINSCD); 8514 } else { 8515 SFMMU_FLAGS_CLEAR(sfmmup, HAT_ISMNOTINSCD); 8516 } 8517 sfmmu_hat_exit(hatlockp); 8518 } 8519 8520 sfmmup->sfmmu_ismttecnt[szc] = npgs; 8521 sfmmup->sfmmu_scdismttecnt[szc] = npgs_scd; 8522 return (npgs); 8523 } 8524 8525 /* 8526 * Yield the memory claim requirement for an address space. 8527 * 8528 * This is currently implemented as the number of bytes that have active 8529 * hardware translations that have page structures. Therefore, it can 8530 * underestimate the traditional resident set size, eg, if the 8531 * physical page is present and the hardware translation is missing; 8532 * and it can overestimate the rss, eg, if there are active 8533 * translations to a frame buffer with page structs. 8534 * Also, it does not take sharing into account. 8535 * 8536 * Note that we don't acquire locks here since this function is most often 8537 * called from the clock thread. 8538 */ 8539 size_t 8540 hat_get_mapped_size(struct hat *hat) 8541 { 8542 size_t assize = 0; 8543 int i; 8544 8545 if (hat == NULL) 8546 return (0); 8547 8548 ASSERT(hat->sfmmu_xhat_provider == NULL); 8549 8550 for (i = 0; i < mmu_page_sizes; i++) 8551 assize += ((pgcnt_t)hat->sfmmu_ttecnt[i] + 8552 (pgcnt_t)hat->sfmmu_scdrttecnt[i]) * TTEBYTES(i); 8553 8554 if (hat->sfmmu_iblk == NULL) 8555 return (assize); 8556 8557 for (i = 0; i < mmu_page_sizes; i++) 8558 assize += ((pgcnt_t)hat->sfmmu_ismttecnt[i] + 8559 (pgcnt_t)hat->sfmmu_scdismttecnt[i]) * TTEBYTES(i); 8560 8561 return (assize); 8562 } 8563 8564 int 8565 hat_stats_enable(struct hat *hat) 8566 { 8567 hatlock_t *hatlockp; 8568 8569 ASSERT(hat->sfmmu_xhat_provider == NULL); 8570 8571 hatlockp = sfmmu_hat_enter(hat); 8572 hat->sfmmu_rmstat++; 8573 sfmmu_hat_exit(hatlockp); 8574 return (1); 8575 } 8576 8577 void 8578 hat_stats_disable(struct hat *hat) 8579 { 8580 hatlock_t *hatlockp; 8581 8582 ASSERT(hat->sfmmu_xhat_provider == NULL); 8583 8584 hatlockp = sfmmu_hat_enter(hat); 8585 hat->sfmmu_rmstat--; 8586 sfmmu_hat_exit(hatlockp); 8587 } 8588 8589 /* 8590 * Routines for entering or removing ourselves from the 8591 * ism_hat's mapping list. This is used for both private and 8592 * SCD hats. 
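 * Callers must hold ism_mlist_lock across both operations; the list is a
 * simple doubly linked list headed by ism_hat->sfmmu_iment.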
8593 */ 8594 static void 8595 iment_add(struct ism_ment *iment, struct hat *ism_hat) 8596 { 8597 ASSERT(MUTEX_HELD(&ism_mlist_lock)); 8598 8599 iment->iment_prev = NULL; 8600 iment->iment_next = ism_hat->sfmmu_iment; 8601 if (ism_hat->sfmmu_iment) { 8602 ism_hat->sfmmu_iment->iment_prev = iment; 8603 } 8604 ism_hat->sfmmu_iment = iment; 8605 } 8606 8607 static void 8608 iment_sub(struct ism_ment *iment, struct hat *ism_hat) 8609 { 8610 ASSERT(MUTEX_HELD(&ism_mlist_lock)); 8611 8612 if (ism_hat->sfmmu_iment == NULL) { 8613 panic("ism map entry remove - no entries"); 8614 } 8615 8616 if (iment->iment_prev) { 8617 ASSERT(ism_hat->sfmmu_iment != iment); 8618 iment->iment_prev->iment_next = iment->iment_next; 8619 } else { 8620 ASSERT(ism_hat->sfmmu_iment == iment); 8621 ism_hat->sfmmu_iment = iment->iment_next; 8622 } 8623 8624 if (iment->iment_next) { 8625 iment->iment_next->iment_prev = iment->iment_prev; 8626 } 8627 8628 /* 8629 * zero out the entry 8630 */ 8631 iment->iment_next = NULL; 8632 iment->iment_prev = NULL; 8633 iment->iment_hat = NULL; 8634 } 8635 8636 /* 8637 * Hat_share()/unshare() return an (non-zero) error 8638 * when saddr and daddr are not properly aligned. 8639 * 8640 * The top level mapping element determines the alignment 8641 * requirement for saddr and daddr, depending on different 8642 * architectures. 8643 * 8644 * When hat_share()/unshare() are not supported, 8645 * HATOP_SHARE()/UNSHARE() return 0 8646 */ 8647 int 8648 hat_share(struct hat *sfmmup, caddr_t addr, 8649 struct hat *ism_hatid, caddr_t sptaddr, size_t len, uint_t ismszc) 8650 { 8651 ism_blk_t *ism_blkp; 8652 ism_blk_t *new_iblk; 8653 ism_map_t *ism_map; 8654 ism_ment_t *ism_ment; 8655 int i, added; 8656 hatlock_t *hatlockp; 8657 int reload_mmu = 0; 8658 uint_t ismshift = page_get_shift(ismszc); 8659 size_t ismpgsz = page_get_pagesize(ismszc); 8660 uint_t ismmask = (uint_t)ismpgsz - 1; 8661 size_t sh_size = ISM_SHIFT(ismshift, len); 8662 ushort_t ismhatflag; 8663 hat_region_cookie_t rcookie; 8664 sf_scd_t *old_scdp; 8665 8666 #ifdef DEBUG 8667 caddr_t eaddr = addr + len; 8668 #endif /* DEBUG */ 8669 8670 ASSERT(ism_hatid != NULL && sfmmup != NULL); 8671 ASSERT(sptaddr == ISMID_STARTADDR); 8672 /* 8673 * Check the alignment. 8674 */ 8675 if (!ISM_ALIGNED(ismshift, addr) || !ISM_ALIGNED(ismshift, sptaddr)) 8676 return (EINVAL); 8677 8678 /* 8679 * Check size alignment. 8680 */ 8681 if (!ISM_ALIGNED(ismshift, len)) 8682 return (EINVAL); 8683 8684 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 8685 8686 /* 8687 * Allocate ism_ment for the ism_hat's mapping list, and an 8688 * ism map blk in case we need one. We must do our 8689 * allocations before acquiring locks to prevent a deadlock 8690 * in the kmem allocator on the mapping list lock. 8691 */ 8692 new_iblk = kmem_cache_alloc(ism_blk_cache, KM_SLEEP); 8693 ism_ment = kmem_cache_alloc(ism_ment_cache, KM_SLEEP); 8694 8695 /* 8696 * Serialize ISM mappings with the ISM busy flag, and also the 8697 * trap handlers. 8698 */ 8699 sfmmu_ismhat_enter(sfmmup, 0); 8700 8701 /* 8702 * Allocate an ism map blk if necessary. 8703 */ 8704 if (sfmmup->sfmmu_iblk == NULL) { 8705 sfmmup->sfmmu_iblk = new_iblk; 8706 bzero(new_iblk, sizeof (*new_iblk)); 8707 new_iblk->iblk_nextpa = (uint64_t)-1; 8708 membar_stst(); /* make sure next ptr visible to all CPUs */ 8709 sfmmup->sfmmu_ismblkpa = va_to_pa((caddr_t)new_iblk); 8710 reload_mmu = 1; 8711 new_iblk = NULL; 8712 } 8713 8714 #ifdef DEBUG 8715 /* 8716 * Make sure mapping does not already exist. 
8717 */ 8718 ism_blkp = sfmmup->sfmmu_iblk; 8719 while (ism_blkp != NULL) { 8720 ism_map = ism_blkp->iblk_maps; 8721 for (i = 0; i < ISM_MAP_SLOTS && ism_map[i].imap_ismhat; i++) { 8722 if ((addr >= ism_start(ism_map[i]) && 8723 addr < ism_end(ism_map[i])) || 8724 eaddr > ism_start(ism_map[i]) && 8725 eaddr <= ism_end(ism_map[i])) { 8726 panic("sfmmu_share: Already mapped!"); 8727 } 8728 } 8729 ism_blkp = ism_blkp->iblk_next; 8730 } 8731 #endif /* DEBUG */ 8732 8733 ASSERT(ismszc >= TTE4M); 8734 if (ismszc == TTE4M) { 8735 ismhatflag = HAT_4M_FLAG; 8736 } else if (ismszc == TTE32M) { 8737 ismhatflag = HAT_32M_FLAG; 8738 } else if (ismszc == TTE256M) { 8739 ismhatflag = HAT_256M_FLAG; 8740 } 8741 /* 8742 * Add mapping to first available mapping slot. 8743 */ 8744 ism_blkp = sfmmup->sfmmu_iblk; 8745 added = 0; 8746 while (!added) { 8747 ism_map = ism_blkp->iblk_maps; 8748 for (i = 0; i < ISM_MAP_SLOTS; i++) { 8749 if (ism_map[i].imap_ismhat == NULL) { 8750 8751 ism_map[i].imap_ismhat = ism_hatid; 8752 ism_map[i].imap_vb_shift = (uchar_t)ismshift; 8753 ism_map[i].imap_rid = SFMMU_INVALID_ISMRID; 8754 ism_map[i].imap_hatflags = ismhatflag; 8755 ism_map[i].imap_sz_mask = ismmask; 8756 /* 8757 * imap_seg is checked in ISM_CHECK to see if 8758 * non-NULL, then other info assumed valid. 8759 */ 8760 membar_stst(); 8761 ism_map[i].imap_seg = (uintptr_t)addr | sh_size; 8762 ism_map[i].imap_ment = ism_ment; 8763 8764 /* 8765 * Now add ourselves to the ism_hat's 8766 * mapping list. 8767 */ 8768 ism_ment->iment_hat = sfmmup; 8769 ism_ment->iment_base_va = addr; 8770 ism_hatid->sfmmu_ismhat = 1; 8771 mutex_enter(&ism_mlist_lock); 8772 iment_add(ism_ment, ism_hatid); 8773 mutex_exit(&ism_mlist_lock); 8774 added = 1; 8775 break; 8776 } 8777 } 8778 if (!added && ism_blkp->iblk_next == NULL) { 8779 ism_blkp->iblk_next = new_iblk; 8780 new_iblk = NULL; 8781 bzero(ism_blkp->iblk_next, 8782 sizeof (*ism_blkp->iblk_next)); 8783 ism_blkp->iblk_next->iblk_nextpa = (uint64_t)-1; 8784 membar_stst(); 8785 ism_blkp->iblk_nextpa = 8786 va_to_pa((caddr_t)ism_blkp->iblk_next); 8787 } 8788 ism_blkp = ism_blkp->iblk_next; 8789 } 8790 8791 /* 8792 * After calling hat_join_region, sfmmup may join a new SCD or 8793 * move from the old scd to a new scd, in which case, we want to 8794 * shrink the sfmmup's private tsb size, i.e., pass shrink to 8795 * sfmmu_check_page_sizes at the end of this routine. 8796 */ 8797 old_scdp = sfmmup->sfmmu_scdp; 8798 8799 rcookie = hat_join_region(sfmmup, addr, len, (void *)ism_hatid, 0, 8800 PROT_ALL, ismszc, NULL, HAT_REGION_ISM); 8801 if (rcookie != HAT_INVALID_REGION_COOKIE) { 8802 ism_map[i].imap_rid = (uchar_t)((uint64_t)rcookie); 8803 } 8804 /* 8805 * Update our counters for this sfmmup's ism mappings. 8806 */ 8807 for (i = 0; i <= ismszc; i++) { 8808 if (!(disable_ism_large_pages & (1 << i))) 8809 (void) ism_tsb_entries(sfmmup, i); 8810 } 8811 8812 /* 8813 * For ISM and DISM we do not support 512K pages, so we only only 8814 * search the 4M and 8K/64K hashes for 4 pagesize cpus, and search the 8815 * 256M or 32M, and 4M and 8K/64K hashes for 6 pagesize cpus. 8816 * 8817 * Need to set 32M/256M ISM flags to make sure 8818 * sfmmu_check_page_sizes() enables them on Panther. 
8819 */ 8820 ASSERT((disable_ism_large_pages & (1 << TTE512K)) != 0); 8821 8822 switch (ismszc) { 8823 case TTE256M: 8824 if (!SFMMU_FLAGS_ISSET(sfmmup, HAT_256M_ISM)) { 8825 hatlockp = sfmmu_hat_enter(sfmmup); 8826 SFMMU_FLAGS_SET(sfmmup, HAT_256M_ISM); 8827 sfmmu_hat_exit(hatlockp); 8828 } 8829 break; 8830 case TTE32M: 8831 if (!SFMMU_FLAGS_ISSET(sfmmup, HAT_32M_ISM)) { 8832 hatlockp = sfmmu_hat_enter(sfmmup); 8833 SFMMU_FLAGS_SET(sfmmup, HAT_32M_ISM); 8834 sfmmu_hat_exit(hatlockp); 8835 } 8836 break; 8837 default: 8838 break; 8839 } 8840 8841 /* 8842 * If we updated the ismblkpa for this HAT we must make 8843 * sure all CPUs running this process reload their tsbmiss area. 8844 * Otherwise they will fail to load the mappings in the tsbmiss 8845 * handler and will loop calling pagefault(). 8846 */ 8847 if (reload_mmu) { 8848 hatlockp = sfmmu_hat_enter(sfmmup); 8849 sfmmu_sync_mmustate(sfmmup); 8850 sfmmu_hat_exit(hatlockp); 8851 } 8852 8853 if (&mmu_set_pgsz_order) { 8854 hatlockp = sfmmu_hat_enter(sfmmup); 8855 mmu_set_pgsz_order(sfmmup, 1); 8856 sfmmu_hat_exit(hatlockp); 8857 } 8858 sfmmu_ismhat_exit(sfmmup, 0); 8859 8860 /* 8861 * Free up ismblk if we didn't use it. 8862 */ 8863 if (new_iblk != NULL) 8864 kmem_cache_free(ism_blk_cache, new_iblk); 8865 8866 /* 8867 * Check TSB and TLB page sizes. 8868 */ 8869 if (sfmmup->sfmmu_scdp != NULL && old_scdp != sfmmup->sfmmu_scdp) { 8870 sfmmu_check_page_sizes(sfmmup, 0); 8871 } else { 8872 sfmmu_check_page_sizes(sfmmup, 1); 8873 } 8874 return (0); 8875 } 8876 8877 /* 8878 * hat_unshare removes exactly one ism_map from 8879 * this process's as. It expects multiple calls 8880 * to hat_unshare for multiple shm segments. 8881 */ 8882 void 8883 hat_unshare(struct hat *sfmmup, caddr_t addr, size_t len, uint_t ismszc) 8884 { 8885 ism_map_t *ism_map; 8886 ism_ment_t *free_ment = NULL; 8887 ism_blk_t *ism_blkp; 8888 struct hat *ism_hatid; 8889 int found, i; 8890 hatlock_t *hatlockp; 8891 struct tsb_info *tsbinfo; 8892 uint_t ismshift = page_get_shift(ismszc); 8893 size_t sh_size = ISM_SHIFT(ismshift, len); 8894 uchar_t ism_rid; 8895 sf_scd_t *old_scdp; 8896 8897 ASSERT(ISM_ALIGNED(ismshift, addr)); 8898 ASSERT(ISM_ALIGNED(ismshift, len)); 8899 ASSERT(sfmmup != NULL); 8900 ASSERT(sfmmup != ksfmmup); 8901 8902 if (sfmmup->sfmmu_xhat_provider) { 8903 XHAT_UNSHARE(sfmmup, addr, len); 8904 return; 8905 } else { 8906 /* 8907 * This must be a CPU HAT. If the address space has 8908 * XHATs attached, inform all XHATs that ISM segment 8909 * is going away 8910 */ 8911 ASSERT(sfmmup->sfmmu_as != NULL); 8912 if (sfmmup->sfmmu_as->a_xhat != NULL) 8913 xhat_unshare_all(sfmmup->sfmmu_as, addr, len); 8914 } 8915 8916 /* 8917 * Make sure that during the entire time ISM mappings are removed, 8918 * the trap handlers serialize behind us, and that no one else 8919 * can be mucking with ISM mappings. This also lets us get away 8920 * with not doing expensive cross calls to flush the TLB -- we 8921 * just discard the context, flush the entire TSB, and call it 8922 * a day. 8923 */ 8924 sfmmu_ismhat_enter(sfmmup, 0); 8925 8926 /* 8927 * Remove the mapping. 8928 * 8929 * We can't have any holes in the ism map. 8930 * The tsb miss code while searching the ism map will 8931 * stop on an empty map slot. So we must move 8932 * everyone past the hole up 1 if any. 8933 * 8934 * Also empty ism map blks are not freed until the 8935 * process exits. This is to prevent a MT race condition 8936 * between sfmmu_unshare() and sfmmu_tsbmiss_exception(). 
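 * The compaction loop below copies each following slot over the removed
 * one, pulling the first slot of the next ism_blk into the last slot of
 * the current one, so the trap handler's scan still terminates at the
 * first empty slot.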
8937 */ 8938 found = 0; 8939 ism_blkp = sfmmup->sfmmu_iblk; 8940 while (!found && ism_blkp != NULL) { 8941 ism_map = ism_blkp->iblk_maps; 8942 for (i = 0; i < ISM_MAP_SLOTS; i++) { 8943 if (addr == ism_start(ism_map[i]) && 8944 sh_size == (size_t)(ism_size(ism_map[i]))) { 8945 found = 1; 8946 break; 8947 } 8948 } 8949 if (!found) 8950 ism_blkp = ism_blkp->iblk_next; 8951 } 8952 8953 if (found) { 8954 ism_hatid = ism_map[i].imap_ismhat; 8955 ism_rid = ism_map[i].imap_rid; 8956 ASSERT(ism_hatid != NULL); 8957 ASSERT(ism_hatid->sfmmu_ismhat == 1); 8958 8959 /* 8960 * After hat_leave_region, the sfmmup may leave SCD, 8961 * in which case, we want to grow the private tsb size when 8962 * calling sfmmu_check_page_sizes at the end of the routine. 8963 */ 8964 old_scdp = sfmmup->sfmmu_scdp; 8965 /* 8966 * Then remove ourselves from the region. 8967 */ 8968 if (ism_rid != SFMMU_INVALID_ISMRID) { 8969 hat_leave_region(sfmmup, (void *)((uint64_t)ism_rid), 8970 HAT_REGION_ISM); 8971 } 8972 8973 /* 8974 * And now guarantee that any other cpu 8975 * that tries to process an ISM miss 8976 * will go to tl=0. 8977 */ 8978 hatlockp = sfmmu_hat_enter(sfmmup); 8979 sfmmu_invalidate_ctx(sfmmup); 8980 sfmmu_hat_exit(hatlockp); 8981 8982 /* 8983 * Remove ourselves from the ism mapping list. 8984 */ 8985 mutex_enter(&ism_mlist_lock); 8986 iment_sub(ism_map[i].imap_ment, ism_hatid); 8987 mutex_exit(&ism_mlist_lock); 8988 free_ment = ism_map[i].imap_ment; 8989 8990 /* 8991 * We delete the ism map by copying 8992 * the next map over the current one. 8993 * We will take the next one in the maps 8994 * array or from the next ism_blk. 8995 */ 8996 while (ism_blkp != NULL) { 8997 ism_map = ism_blkp->iblk_maps; 8998 while (i < (ISM_MAP_SLOTS - 1)) { 8999 ism_map[i] = ism_map[i + 1]; 9000 i++; 9001 } 9002 /* i == (ISM_MAP_SLOTS - 1) */ 9003 ism_blkp = ism_blkp->iblk_next; 9004 if (ism_blkp != NULL) { 9005 ism_map[i] = ism_blkp->iblk_maps[0]; 9006 i = 0; 9007 } else { 9008 ism_map[i].imap_seg = 0; 9009 ism_map[i].imap_vb_shift = 0; 9010 ism_map[i].imap_rid = SFMMU_INVALID_ISMRID; 9011 ism_map[i].imap_hatflags = 0; 9012 ism_map[i].imap_sz_mask = 0; 9013 ism_map[i].imap_ismhat = NULL; 9014 ism_map[i].imap_ment = NULL; 9015 } 9016 } 9017 9018 /* 9019 * Now flush entire TSB for the process, since 9020 * demapping page by page can be too expensive. 9021 * We don't have to flush the TLB here anymore 9022 * since we switch to a new TLB ctx instead. 9023 * Also, there is no need to flush if the process 9024 * is exiting since the TSB will be freed later. 9025 */ 9026 if (!sfmmup->sfmmu_free) { 9027 hatlockp = sfmmu_hat_enter(sfmmup); 9028 for (tsbinfo = sfmmup->sfmmu_tsb; tsbinfo != NULL; 9029 tsbinfo = tsbinfo->tsb_next) { 9030 if (tsbinfo->tsb_flags & TSB_SWAPPED) 9031 continue; 9032 if (tsbinfo->tsb_flags & TSB_RELOC_FLAG) { 9033 tsbinfo->tsb_flags |= 9034 TSB_FLUSH_NEEDED; 9035 continue; 9036 } 9037 9038 sfmmu_inv_tsb(tsbinfo->tsb_va, 9039 TSB_BYTES(tsbinfo->tsb_szc)); 9040 } 9041 sfmmu_hat_exit(hatlockp); 9042 } 9043 } 9044 9045 /* 9046 * Update our counters for this sfmmup's ism mappings. 
9047 */ 9048 for (i = 0; i <= ismszc; i++) { 9049 if (!(disable_ism_large_pages & (1 << i))) 9050 (void) ism_tsb_entries(sfmmup, i); 9051 } 9052 9053 if (&mmu_set_pgsz_order) { 9054 hatlockp = sfmmu_hat_enter(sfmmup); 9055 mmu_set_pgsz_order(sfmmup, 1); 9056 sfmmu_hat_exit(hatlockp); 9057 } 9058 sfmmu_ismhat_exit(sfmmup, 0); 9059 9060 /* 9061 * We must do our freeing here after dropping locks 9062 * to prevent a deadlock in the kmem allocator on the 9063 * mapping list lock. 9064 */ 9065 if (free_ment != NULL) 9066 kmem_cache_free(ism_ment_cache, free_ment); 9067 9068 /* 9069 * Check TSB and TLB page sizes if the process isn't exiting. 9070 */ 9071 if (!sfmmup->sfmmu_free) { 9072 if (found && old_scdp != NULL && sfmmup->sfmmu_scdp == NULL) { 9073 sfmmu_check_page_sizes(sfmmup, 1); 9074 } else { 9075 sfmmu_check_page_sizes(sfmmup, 0); 9076 } 9077 } 9078 } 9079 9080 /* ARGSUSED */ 9081 static int 9082 sfmmu_idcache_constructor(void *buf, void *cdrarg, int kmflags) 9083 { 9084 /* void *buf is sfmmu_t pointer */ 9085 bzero(buf, sizeof (sfmmu_t)); 9086 9087 return (0); 9088 } 9089 9090 /* ARGSUSED */ 9091 static void 9092 sfmmu_idcache_destructor(void *buf, void *cdrarg) 9093 { 9094 /* void *buf is sfmmu_t pointer */ 9095 } 9096 9097 /* 9098 * setup kmem hmeblks by bzeroing all members and initializing the nextpa 9099 * field to be the pa of this hmeblk 9100 */ 9101 /* ARGSUSED */ 9102 static int 9103 sfmmu_hblkcache_constructor(void *buf, void *cdrarg, int kmflags) 9104 { 9105 struct hme_blk *hmeblkp; 9106 9107 bzero(buf, (size_t)cdrarg); 9108 hmeblkp = (struct hme_blk *)buf; 9109 hmeblkp->hblk_nextpa = va_to_pa((caddr_t)hmeblkp); 9110 9111 #ifdef HBLK_TRACE 9112 mutex_init(&hmeblkp->hblk_audit_lock, NULL, MUTEX_DEFAULT, NULL); 9113 #endif /* HBLK_TRACE */ 9114 9115 return (0); 9116 } 9117 9118 /* ARGSUSED */ 9119 static void 9120 sfmmu_hblkcache_destructor(void *buf, void *cdrarg) 9121 { 9122 9123 #ifdef HBLK_TRACE 9124 9125 struct hme_blk *hmeblkp; 9126 9127 hmeblkp = (struct hme_blk *)buf; 9128 mutex_destroy(&hmeblkp->hblk_audit_lock); 9129 9130 #endif /* HBLK_TRACE */ 9131 } 9132 9133 #define SFMMU_CACHE_RECLAIM_SCAN_RATIO 8 9134 static int sfmmu_cache_reclaim_scan_ratio = SFMMU_CACHE_RECLAIM_SCAN_RATIO; 9135 /* 9136 * The kmem allocator will callback into our reclaim routine when the system 9137 * is running low in memory. We traverse the hash and free up all unused but 9138 * still cached hme_blks. We also traverse the free list and free them up 9139 * as well. 
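 * "Unused" here means hblk_vcnt and hblk_hmecnt are both zero.  Each
 * callback first drains the per-cpu pending lists and then scans only a
 * 1/sfmmu_cache_reclaim_scan_ratio slice of the user and kernel hashes,
 * resuming where the previous scan left off.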
9140 */ 9141 /*ARGSUSED*/ 9142 static void 9143 sfmmu_hblkcache_reclaim(void *cdrarg) 9144 { 9145 int i; 9146 struct hmehash_bucket *hmebp; 9147 struct hme_blk *hmeblkp, *nx_hblk, *pr_hblk = NULL; 9148 static struct hmehash_bucket *uhmehash_reclaim_hand; 9149 static struct hmehash_bucket *khmehash_reclaim_hand; 9150 struct hme_blk *list = NULL, *last_hmeblkp; 9151 cpuset_t cpuset = cpu_ready_set; 9152 cpu_hme_pend_t *cpuhp; 9153 9154 /* Free up hmeblks on the cpu pending lists */ 9155 for (i = 0; i < NCPU; i++) { 9156 cpuhp = &cpu_hme_pend[i]; 9157 if (cpuhp->chp_listp != NULL) { 9158 mutex_enter(&cpuhp->chp_mutex); 9159 if (cpuhp->chp_listp == NULL) { 9160 mutex_exit(&cpuhp->chp_mutex); 9161 continue; 9162 } 9163 for (last_hmeblkp = cpuhp->chp_listp; 9164 last_hmeblkp->hblk_next != NULL; 9165 last_hmeblkp = last_hmeblkp->hblk_next) 9166 ; 9167 last_hmeblkp->hblk_next = list; 9168 list = cpuhp->chp_listp; 9169 cpuhp->chp_listp = NULL; 9170 cpuhp->chp_count = 0; 9171 mutex_exit(&cpuhp->chp_mutex); 9172 } 9173 9174 } 9175 9176 if (list != NULL) { 9177 kpreempt_disable(); 9178 CPUSET_DEL(cpuset, CPU->cpu_id); 9179 xt_sync(cpuset); 9180 xt_sync(cpuset); 9181 kpreempt_enable(); 9182 sfmmu_hblk_free(&list); 9183 list = NULL; 9184 } 9185 9186 hmebp = uhmehash_reclaim_hand; 9187 if (hmebp == NULL || hmebp > &uhme_hash[UHMEHASH_SZ]) 9188 uhmehash_reclaim_hand = hmebp = uhme_hash; 9189 uhmehash_reclaim_hand += UHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio; 9190 9191 for (i = UHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio; i; i--) { 9192 if (SFMMU_HASH_LOCK_TRYENTER(hmebp) != 0) { 9193 hmeblkp = hmebp->hmeblkp; 9194 pr_hblk = NULL; 9195 while (hmeblkp) { 9196 nx_hblk = hmeblkp->hblk_next; 9197 if (!hmeblkp->hblk_vcnt && 9198 !hmeblkp->hblk_hmecnt) { 9199 sfmmu_hblk_hash_rm(hmebp, hmeblkp, 9200 pr_hblk, &list, 0); 9201 } else { 9202 pr_hblk = hmeblkp; 9203 } 9204 hmeblkp = nx_hblk; 9205 } 9206 SFMMU_HASH_UNLOCK(hmebp); 9207 } 9208 if (hmebp++ == &uhme_hash[UHMEHASH_SZ]) 9209 hmebp = uhme_hash; 9210 } 9211 9212 hmebp = khmehash_reclaim_hand; 9213 if (hmebp == NULL || hmebp > &khme_hash[KHMEHASH_SZ]) 9214 khmehash_reclaim_hand = hmebp = khme_hash; 9215 khmehash_reclaim_hand += KHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio; 9216 9217 for (i = KHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio; i; i--) { 9218 if (SFMMU_HASH_LOCK_TRYENTER(hmebp) != 0) { 9219 hmeblkp = hmebp->hmeblkp; 9220 pr_hblk = NULL; 9221 while (hmeblkp) { 9222 nx_hblk = hmeblkp->hblk_next; 9223 if (!hmeblkp->hblk_vcnt && 9224 !hmeblkp->hblk_hmecnt) { 9225 sfmmu_hblk_hash_rm(hmebp, hmeblkp, 9226 pr_hblk, &list, 0); 9227 } else { 9228 pr_hblk = hmeblkp; 9229 } 9230 hmeblkp = nx_hblk; 9231 } 9232 SFMMU_HASH_UNLOCK(hmebp); 9233 } 9234 if (hmebp++ == &khme_hash[KHMEHASH_SZ]) 9235 hmebp = khme_hash; 9236 } 9237 sfmmu_hblks_list_purge(&list, 0); 9238 } 9239 9240 /* 9241 * sfmmu_get_ppvcolor should become a vm_machdep or hatop interface. 9242 * same goes for sfmmu_get_addrvcolor(). 9243 * 9244 * This function will return the virtual color for the specified page. The 9245 * virtual color corresponds to this page current mapping or its last mapping. 9246 * It is used by memory allocators to choose addresses with the correct 9247 * alignment so vac consistency is automatically maintained. If the page 9248 * has no color it returns -1. 
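 * Colors range from 0 to mmu_btop(shm_alignment) - 1; -1 is also returned
 * when the system has no virtually indexed cache.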
9249 */
9250 /*ARGSUSED*/
9251 int
9252 sfmmu_get_ppvcolor(struct page *pp)
9253 {
9254 #ifdef VAC
9255 int color;
9256
9257 if (!(cache & CACHE_VAC) || PP_NEWPAGE(pp)) {
9258 return (-1);
9259 }
9260 color = PP_GET_VCOLOR(pp);
9261 ASSERT(color < mmu_btop(shm_alignment));
9262 return (color);
9263 #else
9264 return (-1);
9265 #endif /* VAC */
9266 }
9267
9268 /*
9269 * This function will return the desired alignment for vac consistency
9270 * (vac color) given a virtual address. If no vac is present it returns -1.
9271 */
9272 /*ARGSUSED*/
9273 int
9274 sfmmu_get_addrvcolor(caddr_t vaddr)
9275 {
9276 #ifdef VAC
9277 if (cache & CACHE_VAC) {
9278 return (addr_to_vcolor(vaddr));
9279 } else {
9280 return (-1);
9281 }
9282 #else
9283 return (-1);
9284 #endif /* VAC */
9285 }
9286
9287 #ifdef VAC
9288 /*
9289 * Check for conflicts.
9290 * A conflict exists if the new and existing mappings do not match in
9291 * their "shm_alignment" fields. If conflicts exist, the existing mappings
9292 * are flushed unless one of them is locked. If one of them is locked, then
9293 * the mappings are flushed and converted to non-cacheable mappings.
9294 */
9295 static void
9296 sfmmu_vac_conflict(struct hat *hat, caddr_t addr, page_t *pp)
9297 {
9298 struct hat *tmphat;
9299 struct sf_hment *sfhmep, *tmphme = NULL;
9300 struct hme_blk *hmeblkp;
9301 int vcolor;
9302 tte_t tte;
9303
9304 ASSERT(sfmmu_mlist_held(pp));
9305 ASSERT(!PP_ISNC(pp)); /* page better be cacheable */
9306
9307 vcolor = addr_to_vcolor(addr);
9308 if (PP_NEWPAGE(pp)) {
9309 PP_SET_VCOLOR(pp, vcolor);
9310 return;
9311 }
9312
9313 if (PP_GET_VCOLOR(pp) == vcolor) {
9314 return;
9315 }
9316
9317 if (!PP_ISMAPPED(pp) && !PP_ISMAPPED_KPM(pp)) {
9318 /*
9319 * Previous user of page had a different color
9320 * but since there are no current users
9321 * we just flush the cache and change the color.
9322 */
9323 SFMMU_STAT(sf_pgcolor_conflict);
9324 sfmmu_cache_flush(pp->p_pagenum, PP_GET_VCOLOR(pp));
9325 PP_SET_VCOLOR(pp, vcolor);
9326 return;
9327 }
9328
9329 /*
9330 * If we get here we have a vac conflict with a current
9331 * mapping. VAC conflict policy is as follows.
9332 * - The default is to unload the other mappings unless:
9333 * - If we have a large mapping we uncache the page.
9334 * We need to uncache the rest of the large page too.
9335 * - If any of the mappings are locked we uncache the page.
9336 * - If the requested mapping is inconsistent
9337 * with another mapping and that mapping
9338 * is in the same address space we have to
9339 * make it non-cached. The default thing
9340 * to do is unload the inconsistent mapping
9341 * but if they are in the same address space
9342 * we run the risk of unmapping the pc or the
9343 * stack which we will use as we return to the user,
9344 * in which case we can then fault on the thing
9345 * we just unloaded and get into an infinite loop.
9346 */
9347 if (PP_ISMAPPED_LARGE(pp)) {
9348 int sz;
9349
9350 /*
9351 * Existing mapping is for big pages. We don't unload
9352 * existing big mappings to satisfy new mappings.
9353 * Always convert all mappings to TNC.
9354 */
9355 sz = fnd_mapping_sz(pp);
9356 pp = PP_GROUPLEADER(pp, sz);
9357 SFMMU_STAT_ADD(sf_uncache_conflict, TTEPAGES(sz));
9358 sfmmu_page_cache_array(pp, HAT_TMPNC, CACHE_FLUSH,
9359 TTEPAGES(sz));
9360
9361 return;
9362 }
9363
9364 /*
9365 * check if any mapping is in the same address space or if it is locked
9366 * since in that case we need to uncache.
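 * (A shared hmeblk forces uncaching as well, as checked in the loop
 * below.)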
9367 */ 9368 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = tmphme) { 9369 tmphme = sfhmep->hme_next; 9370 if (IS_PAHME(sfhmep)) 9371 continue; 9372 hmeblkp = sfmmu_hmetohblk(sfhmep); 9373 if (hmeblkp->hblk_xhat_bit) 9374 continue; 9375 tmphat = hblktosfmmu(hmeblkp); 9376 sfmmu_copytte(&sfhmep->hme_tte, &tte); 9377 ASSERT(TTE_IS_VALID(&tte)); 9378 if (hmeblkp->hblk_shared || tmphat == hat || 9379 hmeblkp->hblk_lckcnt) { 9380 /* 9381 * We have an uncache conflict 9382 */ 9383 SFMMU_STAT(sf_uncache_conflict); 9384 sfmmu_page_cache_array(pp, HAT_TMPNC, CACHE_FLUSH, 1); 9385 return; 9386 } 9387 } 9388 9389 /* 9390 * We have an unload conflict 9391 * We have already checked for LARGE mappings, therefore 9392 * the remaining mapping(s) must be TTE8K. 9393 */ 9394 SFMMU_STAT(sf_unload_conflict); 9395 9396 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = tmphme) { 9397 tmphme = sfhmep->hme_next; 9398 if (IS_PAHME(sfhmep)) 9399 continue; 9400 hmeblkp = sfmmu_hmetohblk(sfhmep); 9401 if (hmeblkp->hblk_xhat_bit) 9402 continue; 9403 ASSERT(!hmeblkp->hblk_shared); 9404 (void) sfmmu_pageunload(pp, sfhmep, TTE8K); 9405 } 9406 9407 if (PP_ISMAPPED_KPM(pp)) 9408 sfmmu_kpm_vac_unload(pp, addr); 9409 9410 /* 9411 * Unloads only do TLB flushes so we need to flush the 9412 * cache here. 9413 */ 9414 sfmmu_cache_flush(pp->p_pagenum, PP_GET_VCOLOR(pp)); 9415 PP_SET_VCOLOR(pp, vcolor); 9416 } 9417 9418 /* 9419 * Whenever a mapping is unloaded and the page is in TNC state, 9420 * we see if the page can be made cacheable again. 'pp' is 9421 * the page that we just unloaded a mapping from, the size 9422 * of mapping that was unloaded is 'ottesz'. 9423 * Remark: 9424 * The recache policy for mpss pages can leave a performance problem 9425 * under the following circumstances: 9426 * . A large page in uncached mode has just been unmapped. 9427 * . All constituent pages are TNC due to a conflicting small mapping. 9428 * . There are many other, non conflicting, small mappings around for 9429 * a lot of the constituent pages. 9430 * . We're called w/ the "old" groupleader page and the old ottesz, 9431 * but this is irrelevant, since we're no more "PP_ISMAPPED_LARGE", so 9432 * we end up w/ TTE8K or npages == 1. 9433 * . We call tst_tnc w/ the old groupleader only, and if there is no 9434 * conflict, we re-cache only this page. 9435 * . All other small mappings are not checked and will be left in TNC mode. 9436 * The problem is not very serious because: 9437 * . mpss is actually only defined for heap and stack, so the probability 9438 * is not very high that a large page mapping exists in parallel to a small 9439 * one (this is possible, but seems to be bad programming style in the 9440 * appl). 9441 * . The problem gets a little bit more serious, when those TNC pages 9442 * have to be mapped into kernel space, e.g. for networking. 9443 * . When VAC alias conflicts occur in applications, this is regarded 9444 * as an application bug. So if kstat's show them, the appl should 9445 * be changed anyway. 9446 */ 9447 void 9448 conv_tnc(page_t *pp, int ottesz) 9449 { 9450 int cursz, dosz; 9451 pgcnt_t curnpgs, dopgs; 9452 pgcnt_t pg64k; 9453 page_t *pp2; 9454 9455 /* 9456 * Determine how big a range we check for TNC and find 9457 * leader page. cursz is the size of the biggest 9458 * mapping that still exist on 'pp'. 
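 * The recheck below roughly covers TTEPAGES() of the larger of ottesz and
 * cursz, starting at that range's group leader and advancing through it
 * in chunks sized by the largest mapping still present on each chunk.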
9459 */ 9460 if (PP_ISMAPPED_LARGE(pp)) { 9461 cursz = fnd_mapping_sz(pp); 9462 } else { 9463 cursz = TTE8K; 9464 } 9465 9466 if (ottesz >= cursz) { 9467 dosz = ottesz; 9468 pp2 = pp; 9469 } else { 9470 dosz = cursz; 9471 pp2 = PP_GROUPLEADER(pp, dosz); 9472 } 9473 9474 pg64k = TTEPAGES(TTE64K); 9475 dopgs = TTEPAGES(dosz); 9476 9477 ASSERT(dopgs == 1 || ((dopgs & (pg64k - 1)) == 0)); 9478 9479 while (dopgs != 0) { 9480 curnpgs = TTEPAGES(cursz); 9481 if (tst_tnc(pp2, curnpgs)) { 9482 SFMMU_STAT_ADD(sf_recache, curnpgs); 9483 sfmmu_page_cache_array(pp2, HAT_CACHE, CACHE_NO_FLUSH, 9484 curnpgs); 9485 } 9486 9487 ASSERT(dopgs >= curnpgs); 9488 dopgs -= curnpgs; 9489 9490 if (dopgs == 0) { 9491 break; 9492 } 9493 9494 pp2 = PP_PAGENEXT_N(pp2, curnpgs); 9495 if (((dopgs & (pg64k - 1)) == 0) && PP_ISMAPPED_LARGE(pp2)) { 9496 cursz = fnd_mapping_sz(pp2); 9497 } else { 9498 cursz = TTE8K; 9499 } 9500 } 9501 } 9502 9503 /* 9504 * Returns 1 if page(s) can be converted from TNC to cacheable setting, 9505 * returns 0 otherwise. Note that oaddr argument is valid for only 9506 * 8k pages. 9507 */ 9508 int 9509 tst_tnc(page_t *pp, pgcnt_t npages) 9510 { 9511 struct sf_hment *sfhme; 9512 struct hme_blk *hmeblkp; 9513 tte_t tte; 9514 caddr_t vaddr; 9515 int clr_valid = 0; 9516 int color, color1, bcolor; 9517 int i, ncolors; 9518 9519 ASSERT(pp != NULL); 9520 ASSERT(!(cache & CACHE_WRITEBACK)); 9521 9522 if (npages > 1) { 9523 ncolors = CACHE_NUM_COLOR; 9524 } 9525 9526 for (i = 0; i < npages; i++) { 9527 ASSERT(sfmmu_mlist_held(pp)); 9528 ASSERT(PP_ISTNC(pp)); 9529 ASSERT(PP_GET_VCOLOR(pp) == NO_VCOLOR); 9530 9531 if (PP_ISPNC(pp)) { 9532 return (0); 9533 } 9534 9535 clr_valid = 0; 9536 if (PP_ISMAPPED_KPM(pp)) { 9537 caddr_t kpmvaddr; 9538 9539 ASSERT(kpm_enable); 9540 kpmvaddr = hat_kpm_page2va(pp, 1); 9541 ASSERT(!(npages > 1 && IS_KPM_ALIAS_RANGE(kpmvaddr))); 9542 color1 = addr_to_vcolor(kpmvaddr); 9543 clr_valid = 1; 9544 } 9545 9546 for (sfhme = pp->p_mapping; sfhme; sfhme = sfhme->hme_next) { 9547 if (IS_PAHME(sfhme)) 9548 continue; 9549 hmeblkp = sfmmu_hmetohblk(sfhme); 9550 if (hmeblkp->hblk_xhat_bit) 9551 continue; 9552 9553 sfmmu_copytte(&sfhme->hme_tte, &tte); 9554 ASSERT(TTE_IS_VALID(&tte)); 9555 9556 vaddr = tte_to_vaddr(hmeblkp, tte); 9557 color = addr_to_vcolor(vaddr); 9558 9559 if (npages > 1) { 9560 /* 9561 * If there is a big mapping, make sure 9562 * 8K mapping is consistent with the big 9563 * mapping. 
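 * (constituent page i of a large page is expected to map at virtual
 * color i % ncolors)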
9564 */
9565 bcolor = i % ncolors;
9566 if (color != bcolor) {
9567 return (0);
9568 }
9569 }
9570 if (!clr_valid) {
9571 clr_valid = 1;
9572 color1 = color;
9573 }
9574
9575 if (color1 != color) {
9576 return (0);
9577 }
9578 }
9579
9580 pp = PP_PAGENEXT(pp);
9581 }
9582
9583 return (1);
9584 }
9585
9586 void
9587 sfmmu_page_cache_array(page_t *pp, int flags, int cache_flush_flag,
9588 pgcnt_t npages)
9589 {
9590 kmutex_t *pmtx;
9591 int i, ncolors, bcolor;
9592 kpm_hlk_t *kpmp;
9593 cpuset_t cpuset;
9594
9595 ASSERT(pp != NULL);
9596 ASSERT(!(cache & CACHE_WRITEBACK));
9597
9598 kpmp = sfmmu_kpm_kpmp_enter(pp, npages);
9599 pmtx = sfmmu_page_enter(pp);
9600
9601 /*
9602 * Fast path caching single unmapped page
9603 */
9604 if (npages == 1 && !PP_ISMAPPED(pp) && !PP_ISMAPPED_KPM(pp) &&
9605 flags == HAT_CACHE) {
9606 PP_CLRTNC(pp);
9607 PP_CLRPNC(pp);
9608 sfmmu_page_exit(pmtx);
9609 sfmmu_kpm_kpmp_exit(kpmp);
9610 return;
9611 }
9612
9613 /*
9614 * We need to capture all cpus in order to change cacheability
9615 * because we can't allow one cpu to access the same physical
9616 * page using a cacheable and a non-cacheable mapping at the same
9617 * time. Since we may end up walking the ism mapping list
9618 * we have to grab its lock now since we can't after all the
9619 * cpus have been captured.
9620 */
9621 sfmmu_hat_lock_all();
9622 mutex_enter(&ism_mlist_lock);
9623 kpreempt_disable();
9624 cpuset = cpu_ready_set;
9625 xc_attention(cpuset);
9626
9627 if (npages > 1) {
9628 /*
9629 * Make sure all colors are flushed since
9630 * sfmmu_page_cache() only flushes one color -
9631 * it does not know about big pages.
9632 */
9633 ncolors = CACHE_NUM_COLOR;
9634 if (flags & HAT_TMPNC) {
9635 for (i = 0; i < ncolors; i++) {
9636 sfmmu_cache_flushcolor(i, pp->p_pagenum);
9637 }
9638 cache_flush_flag = CACHE_NO_FLUSH;
9639 }
9640 }
9641
9642 for (i = 0; i < npages; i++) {
9643
9644 ASSERT(sfmmu_mlist_held(pp));
9645
9646 if (!(flags == HAT_TMPNC && PP_ISTNC(pp))) {
9647
9648 if (npages > 1) {
9649 bcolor = i % ncolors;
9650 } else {
9651 bcolor = NO_VCOLOR;
9652 }
9653
9654 sfmmu_page_cache(pp, flags, cache_flush_flag,
9655 bcolor);
9656 }
9657
9658 pp = PP_PAGENEXT(pp);
9659 }
9660
9661 xt_sync(cpuset);
9662 xc_dismissed(cpuset);
9663 mutex_exit(&ism_mlist_lock);
9664 sfmmu_hat_unlock_all();
9665 sfmmu_page_exit(pmtx);
9666 sfmmu_kpm_kpmp_exit(kpmp);
9667 kpreempt_enable();
9668 }
9669
9670 /*
9671 * This function changes the virtual cacheability of all mappings to a
9672 * particular page. When changing from uncache to cacheable the mappings will
9673 * only be changed if all of them have the same virtual color.
9674 * We need to flush the cache on all cpus. It is possible that
9675 * a process referenced a page as cacheable but has since exited
9676 * and cleared the mapping list. We still need to flush it but have no
9677 * state, so flushing on all cpus is the only alternative.
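 * (By the time we get here all cpus have already been captured by the
 * caller, sfmmu_page_cache_array(), which is why the tte update below
 * panics rather than retrying if the cas fails.)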
9678 */ 9679 static void 9680 sfmmu_page_cache(page_t *pp, int flags, int cache_flush_flag, int bcolor) 9681 { 9682 struct sf_hment *sfhme; 9683 struct hme_blk *hmeblkp; 9684 sfmmu_t *sfmmup; 9685 tte_t tte, ttemod; 9686 caddr_t vaddr; 9687 int ret, color; 9688 pfn_t pfn; 9689 9690 color = bcolor; 9691 pfn = pp->p_pagenum; 9692 9693 for (sfhme = pp->p_mapping; sfhme; sfhme = sfhme->hme_next) { 9694 9695 if (IS_PAHME(sfhme)) 9696 continue; 9697 hmeblkp = sfmmu_hmetohblk(sfhme); 9698 9699 if (hmeblkp->hblk_xhat_bit) 9700 continue; 9701 9702 sfmmu_copytte(&sfhme->hme_tte, &tte); 9703 ASSERT(TTE_IS_VALID(&tte)); 9704 vaddr = tte_to_vaddr(hmeblkp, tte); 9705 color = addr_to_vcolor(vaddr); 9706 9707 #ifdef DEBUG 9708 if ((flags & HAT_CACHE) && bcolor != NO_VCOLOR) { 9709 ASSERT(color == bcolor); 9710 } 9711 #endif 9712 9713 ASSERT(flags != HAT_TMPNC || color == PP_GET_VCOLOR(pp)); 9714 9715 ttemod = tte; 9716 if (flags & (HAT_UNCACHE | HAT_TMPNC)) { 9717 TTE_CLR_VCACHEABLE(&ttemod); 9718 } else { /* flags & HAT_CACHE */ 9719 TTE_SET_VCACHEABLE(&ttemod); 9720 } 9721 ret = sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte); 9722 if (ret < 0) { 9723 /* 9724 * Since all cpus are captured modifytte should not 9725 * fail. 9726 */ 9727 panic("sfmmu_page_cache: write to tte failed"); 9728 } 9729 9730 sfmmup = hblktosfmmu(hmeblkp); 9731 if (cache_flush_flag == CACHE_FLUSH) { 9732 /* 9733 * Flush TSBs, TLBs and caches 9734 */ 9735 if (hmeblkp->hblk_shared) { 9736 sf_srd_t *srdp = (sf_srd_t *)sfmmup; 9737 uint_t rid = hmeblkp->hblk_tag.htag_rid; 9738 sf_region_t *rgnp; 9739 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 9740 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 9741 ASSERT(srdp != NULL); 9742 rgnp = srdp->srd_hmergnp[rid]; 9743 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, 9744 srdp, rgnp, rid); 9745 (void) sfmmu_rgntlb_demap(vaddr, rgnp, 9746 hmeblkp, 0); 9747 sfmmu_cache_flush(pfn, addr_to_vcolor(vaddr)); 9748 } else if (sfmmup->sfmmu_ismhat) { 9749 if (flags & HAT_CACHE) { 9750 SFMMU_STAT(sf_ism_recache); 9751 } else { 9752 SFMMU_STAT(sf_ism_uncache); 9753 } 9754 sfmmu_ismtlbcache_demap(vaddr, sfmmup, hmeblkp, 9755 pfn, CACHE_FLUSH); 9756 } else { 9757 sfmmu_tlbcache_demap(vaddr, sfmmup, hmeblkp, 9758 pfn, 0, FLUSH_ALL_CPUS, CACHE_FLUSH, 1); 9759 } 9760 9761 /* 9762 * all cache entries belonging to this pfn are 9763 * now flushed. 9764 */ 9765 cache_flush_flag = CACHE_NO_FLUSH; 9766 } else { 9767 /* 9768 * Flush only TSBs and TLBs. 
9769 */ 9770 if (hmeblkp->hblk_shared) { 9771 sf_srd_t *srdp = (sf_srd_t *)sfmmup; 9772 uint_t rid = hmeblkp->hblk_tag.htag_rid; 9773 sf_region_t *rgnp; 9774 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 9775 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 9776 ASSERT(srdp != NULL); 9777 rgnp = srdp->srd_hmergnp[rid]; 9778 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, 9779 srdp, rgnp, rid); 9780 (void) sfmmu_rgntlb_demap(vaddr, rgnp, 9781 hmeblkp, 0); 9782 } else if (sfmmup->sfmmu_ismhat) { 9783 if (flags & HAT_CACHE) { 9784 SFMMU_STAT(sf_ism_recache); 9785 } else { 9786 SFMMU_STAT(sf_ism_uncache); 9787 } 9788 sfmmu_ismtlbcache_demap(vaddr, sfmmup, hmeblkp, 9789 pfn, CACHE_NO_FLUSH); 9790 } else { 9791 sfmmu_tlb_demap(vaddr, sfmmup, hmeblkp, 0, 1); 9792 } 9793 } 9794 } 9795 9796 if (PP_ISMAPPED_KPM(pp)) 9797 sfmmu_kpm_page_cache(pp, flags, cache_flush_flag); 9798 9799 switch (flags) { 9800 9801 default: 9802 panic("sfmmu_pagecache: unknown flags"); 9803 break; 9804 9805 case HAT_CACHE: 9806 PP_CLRTNC(pp); 9807 PP_CLRPNC(pp); 9808 PP_SET_VCOLOR(pp, color); 9809 break; 9810 9811 case HAT_TMPNC: 9812 PP_SETTNC(pp); 9813 PP_SET_VCOLOR(pp, NO_VCOLOR); 9814 break; 9815 9816 case HAT_UNCACHE: 9817 PP_SETPNC(pp); 9818 PP_CLRTNC(pp); 9819 PP_SET_VCOLOR(pp, NO_VCOLOR); 9820 break; 9821 } 9822 } 9823 #endif /* VAC */ 9824 9825 9826 /* 9827 * Wrapper routine used to return a context. 9828 * 9829 * It's the responsibility of the caller to guarantee that the 9830 * process serializes on calls here by taking the HAT lock for 9831 * the hat. 9832 * 9833 */ 9834 static void 9835 sfmmu_get_ctx(sfmmu_t *sfmmup) 9836 { 9837 mmu_ctx_t *mmu_ctxp; 9838 uint_t pstate_save; 9839 int ret; 9840 9841 ASSERT(sfmmu_hat_lock_held(sfmmup)); 9842 ASSERT(sfmmup != ksfmmup); 9843 9844 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_ALLCTX_INVALID)) { 9845 sfmmu_setup_tsbinfo(sfmmup); 9846 SFMMU_FLAGS_CLEAR(sfmmup, HAT_ALLCTX_INVALID); 9847 } 9848 9849 kpreempt_disable(); 9850 9851 mmu_ctxp = CPU_MMU_CTXP(CPU); 9852 ASSERT(mmu_ctxp); 9853 ASSERT(mmu_ctxp->mmu_idx < max_mmu_ctxdoms); 9854 ASSERT(mmu_ctxp == mmu_ctxs_tbl[mmu_ctxp->mmu_idx]); 9855 9856 /* 9857 * Do a wrap-around if cnum reaches the max # cnum supported by a MMU. 9858 */ 9859 if (mmu_ctxp->mmu_cnum == mmu_ctxp->mmu_nctxs) 9860 sfmmu_ctx_wrap_around(mmu_ctxp); 9861 9862 /* 9863 * Let the MMU set up the page sizes to use for 9864 * this context in the TLB. Don't program 2nd dtlb for ism hat. 9865 */ 9866 if ((&mmu_set_ctx_page_sizes) && (sfmmup->sfmmu_ismhat == 0)) { 9867 mmu_set_ctx_page_sizes(sfmmup); 9868 } 9869 9870 /* 9871 * sfmmu_alloc_ctx and sfmmu_load_mmustate will be performed with 9872 * interrupts disabled to prevent race condition with wrap-around 9873 * ctx invalidatation. In sun4v, ctx invalidation also involves 9874 * a HV call to set the number of TSBs to 0. If interrupts are not 9875 * disabled until after sfmmu_load_mmustate is complete TSBs may 9876 * become assigned to INVALID_CONTEXT. This is not allowed. 
9877 */ 9878 pstate_save = sfmmu_disable_intrs(); 9879 9880 if (sfmmu_alloc_ctx(sfmmup, 1, CPU, SFMMU_PRIVATE) && 9881 sfmmup->sfmmu_scdp != NULL) { 9882 sf_scd_t *scdp = sfmmup->sfmmu_scdp; 9883 sfmmu_t *scsfmmup = scdp->scd_sfmmup; 9884 ret = sfmmu_alloc_ctx(scsfmmup, 1, CPU, SFMMU_SHARED); 9885 /* debug purpose only */ 9886 ASSERT(!ret || scsfmmup->sfmmu_ctxs[CPU_MMU_IDX(CPU)].cnum 9887 != INVALID_CONTEXT); 9888 } 9889 sfmmu_load_mmustate(sfmmup); 9890 9891 sfmmu_enable_intrs(pstate_save); 9892 9893 kpreempt_enable(); 9894 } 9895 9896 /* 9897 * When all cnums are used up in a MMU, cnum will wrap around to the 9898 * next generation and start from 2. 9899 */ 9900 static void 9901 sfmmu_ctx_wrap_around(mmu_ctx_t *mmu_ctxp) 9902 { 9903 9904 /* caller must have disabled the preemption */ 9905 ASSERT(curthread->t_preempt >= 1); 9906 ASSERT(mmu_ctxp != NULL); 9907 9908 /* acquire Per-MMU (PM) spin lock */ 9909 mutex_enter(&mmu_ctxp->mmu_lock); 9910 9911 /* re-check to see if wrap-around is needed */ 9912 if (mmu_ctxp->mmu_cnum < mmu_ctxp->mmu_nctxs) 9913 goto done; 9914 9915 SFMMU_MMU_STAT(mmu_wrap_around); 9916 9917 /* update gnum */ 9918 ASSERT(mmu_ctxp->mmu_gnum != 0); 9919 mmu_ctxp->mmu_gnum++; 9920 if (mmu_ctxp->mmu_gnum == 0 || 9921 mmu_ctxp->mmu_gnum > MAX_SFMMU_GNUM_VAL) { 9922 cmn_err(CE_PANIC, "mmu_gnum of mmu_ctx 0x%p is out of bound.", 9923 (void *)mmu_ctxp); 9924 } 9925 9926 if (mmu_ctxp->mmu_ncpus > 1) { 9927 cpuset_t cpuset; 9928 9929 membar_enter(); /* make sure updated gnum visible */ 9930 9931 SFMMU_XCALL_STATS(NULL); 9932 9933 /* xcall to others on the same MMU to invalidate ctx */ 9934 cpuset = mmu_ctxp->mmu_cpuset; 9935 ASSERT(CPU_IN_SET(cpuset, CPU->cpu_id)); 9936 CPUSET_DEL(cpuset, CPU->cpu_id); 9937 CPUSET_AND(cpuset, cpu_ready_set); 9938 9939 /* 9940 * Pass in INVALID_CONTEXT as the first parameter to 9941 * sfmmu_raise_tsb_exception, which invalidates the context 9942 * of any process running on the CPUs in the MMU. 9943 */ 9944 xt_some(cpuset, sfmmu_raise_tsb_exception, 9945 INVALID_CONTEXT, INVALID_CONTEXT); 9946 xt_sync(cpuset); 9947 9948 SFMMU_MMU_STAT(mmu_tsb_raise_exception); 9949 } 9950 9951 if (sfmmu_getctx_sec() != INVALID_CONTEXT) { 9952 sfmmu_setctx_sec(INVALID_CONTEXT); 9953 sfmmu_clear_utsbinfo(); 9954 } 9955 9956 /* 9957 * No xcall is needed here. For sun4u systems all CPUs in context 9958 * domain share a single physical MMU therefore it's enough to flush 9959 * TLB on local CPU. On sun4v systems we use 1 global context 9960 * domain and flush all remote TLBs in sfmmu_raise_tsb_exception 9961 * handler. Note that vtag_flushall_uctxs() is called 9962 * for Ultra II machine, where the equivalent flushall functionality 9963 * is implemented in SW, and only user ctx TLB entries are flushed. 9964 */ 9965 if (&vtag_flushall_uctxs != NULL) { 9966 vtag_flushall_uctxs(); 9967 } else { 9968 vtag_flushall(); 9969 } 9970 9971 /* reset mmu cnum, skips cnum 0 and 1 */ 9972 mmu_ctxp->mmu_cnum = NUM_LOCKED_CTXS; 9973 9974 done: 9975 mutex_exit(&mmu_ctxp->mmu_lock); 9976 } 9977 9978 9979 /* 9980 * For multi-threaded process, set the process context to INVALID_CONTEXT 9981 * so that it faults and reloads the MMU state from TL=0. For single-threaded 9982 * process, we can just load the MMU state directly without having to 9983 * set context invalid. Caller must hold the hat lock since we don't 9984 * acquire it here. 
9985 */
9986 static void
9987 sfmmu_sync_mmustate(sfmmu_t *sfmmup)
9988 {
9989 uint_t cnum;
9990 uint_t pstate_save;
9991
9992 ASSERT(sfmmup != ksfmmup);
9993 ASSERT(sfmmu_hat_lock_held(sfmmup));
9994
9995 kpreempt_disable();
9996
9997 /*
9998 * We check whether the passed-in sfmmup is the same as the
9999 * currently running proc. This is to make sure the current proc
10000 * stays single-threaded if it already is.
10001 */
10002 if ((sfmmup == curthread->t_procp->p_as->a_hat) &&
10003 (curthread->t_procp->p_lwpcnt == 1)) {
10004 /* single-thread */
10005 cnum = sfmmup->sfmmu_ctxs[CPU_MMU_IDX(CPU)].cnum;
10006 if (cnum != INVALID_CONTEXT) {
10007 uint_t curcnum;
10008 /*
10009 * Disable interrupts to prevent race condition
10010 * with sfmmu_ctx_wrap_around ctx invalidation.
10011 * In sun4v, ctx invalidation involves setting
10012 * TSB to NULL, hence, interrupts should be disabled
10013 * until after sfmmu_load_mmustate is completed.
10014 */
10015 pstate_save = sfmmu_disable_intrs();
10016 curcnum = sfmmu_getctx_sec();
10017 if (curcnum == cnum)
10018 sfmmu_load_mmustate(sfmmup);
10019 sfmmu_enable_intrs(pstate_save);
10020 ASSERT(curcnum == cnum || curcnum == INVALID_CONTEXT);
10021 }
10022 } else {
10023 /*
10024 * multi-thread
10025 * or when sfmmup is not the same as the curproc.
10026 */
10027 sfmmu_invalidate_ctx(sfmmup);
10028 }
10029
10030 kpreempt_enable();
10031 }
10032
10033
10034 /*
10035 * Replace the specified TSB with a new TSB. This function gets called when
10036 * we grow, shrink or swapin a TSB. When swapping in a TSB (TSB_SWAPIN), the
10037 * TSB_FORCEALLOC flag may be used to force allocation of a minimum-sized TSB
10038 * (8K).
10039 *
10040 * Caller must hold the HAT lock, but should assume any tsb_info
10041 * pointers it has are no longer valid after calling this function.
10042 *
10043 * Return values:
10044 * TSB_ALLOCFAIL Failed to allocate a TSB, due to memory constraints
10045 * TSB_LOSTRACE HAT is busy, i.e. another thread is already doing
10046 * something to this tsbinfo/TSB
10047 * TSB_SUCCESS Operation succeeded
10048 */
10049 static tsb_replace_rc_t
10050 sfmmu_replace_tsb(sfmmu_t *sfmmup, struct tsb_info *old_tsbinfo, uint_t szc,
10051 hatlock_t *hatlockp, uint_t flags)
10052 {
10053 struct tsb_info *new_tsbinfo = NULL;
10054 struct tsb_info *curtsb, *prevtsb;
10055 uint_t tte_sz_mask;
10056 int i;
10057
10058 ASSERT(sfmmup != ksfmmup);
10059 ASSERT(sfmmup->sfmmu_ismhat == 0);
10060 ASSERT(sfmmu_hat_lock_held(sfmmup));
10061 ASSERT(szc <= tsb_max_growsize);
10062
10063 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_BUSY))
10064 return (TSB_LOSTRACE);
10065
10066 /*
10067 * Find the tsb_info ahead of this one in the list, and
10068 * also make sure that the tsb_info passed in really
10069 * exists!
10070 */
10071 for (prevtsb = NULL, curtsb = sfmmup->sfmmu_tsb;
10072 curtsb != old_tsbinfo && curtsb != NULL;
10073 prevtsb = curtsb, curtsb = curtsb->tsb_next)
10074 ;
10075 ASSERT(curtsb != NULL);
10076
10077 if (!(flags & TSB_SWAPIN) && SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) {
10078 /*
10079 * The process is swapped out, so just set the new size
10080 * code. When it swaps back in, we'll allocate a new one
10081 * of the new chosen size.
10082 */
10083 curtsb->tsb_szc = szc;
10084 return (TSB_SUCCESS);
10085 }
10086 SFMMU_FLAGS_SET(sfmmup, HAT_BUSY);
10087
10088 tte_sz_mask = old_tsbinfo->tsb_ttesz_mask;
10089
10090 /*
10091 * All initialization is done inside of sfmmu_tsbinfo_alloc().
10092 * If we fail to allocate a TSB, exit.
10093 * 10094 * If tsb grows with new tsb size > 4M and old tsb size < 4M, 10095 * then try 4M slab after the initial alloc fails. 10096 * 10097 * If tsb swapin with tsb size > 4M, then try 4M after the 10098 * initial alloc fails. 10099 */ 10100 sfmmu_hat_exit(hatlockp); 10101 if (sfmmu_tsbinfo_alloc(&new_tsbinfo, szc, 10102 tte_sz_mask, flags, sfmmup) && 10103 (!(flags & (TSB_GROW | TSB_SWAPIN)) || (szc <= TSB_4M_SZCODE) || 10104 (!(flags & TSB_SWAPIN) && 10105 (old_tsbinfo->tsb_szc >= TSB_4M_SZCODE)) || 10106 sfmmu_tsbinfo_alloc(&new_tsbinfo, TSB_4M_SZCODE, 10107 tte_sz_mask, flags, sfmmup))) { 10108 (void) sfmmu_hat_enter(sfmmup); 10109 if (!(flags & TSB_SWAPIN)) 10110 SFMMU_STAT(sf_tsb_resize_failures); 10111 SFMMU_FLAGS_CLEAR(sfmmup, HAT_BUSY); 10112 return (TSB_ALLOCFAIL); 10113 } 10114 (void) sfmmu_hat_enter(sfmmup); 10115 10116 /* 10117 * Re-check to make sure somebody else didn't muck with us while we 10118 * didn't hold the HAT lock. If the process swapped out, fine, just 10119 * exit; this can happen if we try to shrink the TSB from the context 10120 * of another process (such as on an ISM unmap), though it is rare. 10121 */ 10122 if (!(flags & TSB_SWAPIN) && SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) { 10123 SFMMU_STAT(sf_tsb_resize_failures); 10124 SFMMU_FLAGS_CLEAR(sfmmup, HAT_BUSY); 10125 sfmmu_hat_exit(hatlockp); 10126 sfmmu_tsbinfo_free(new_tsbinfo); 10127 (void) sfmmu_hat_enter(sfmmup); 10128 return (TSB_LOSTRACE); 10129 } 10130 10131 #ifdef DEBUG 10132 /* Reverify that the tsb_info still exists.. for debugging only */ 10133 for (prevtsb = NULL, curtsb = sfmmup->sfmmu_tsb; 10134 curtsb != old_tsbinfo && curtsb != NULL; 10135 prevtsb = curtsb, curtsb = curtsb->tsb_next) 10136 ; 10137 ASSERT(curtsb != NULL); 10138 #endif /* DEBUG */ 10139 10140 /* 10141 * Quiesce any CPUs running this process on their next TLB miss 10142 * so they atomically see the new tsb_info. We temporarily set the 10143 * context to invalid context so new threads that come on processor 10144 * after we do the xcall to cpusran will also serialize behind the 10145 * HAT lock on TLB miss and will see the new TSB. Since this short 10146 * race with a new thread coming on processor is relatively rare, 10147 * this synchronization mechanism should be cheaper than always 10148 * pausing all CPUs for the duration of the setup, which is what 10149 * the old implementation did. This is particuarly true if we are 10150 * copying a huge chunk of memory around during that window. 10151 * 10152 * The memory barriers are to make sure things stay consistent 10153 * with resume() since it does not hold the HAT lock while 10154 * walking the list of tsb_info structures. 10155 */ 10156 if ((flags & TSB_SWAPIN) != TSB_SWAPIN) { 10157 /* The TSB is either growing or shrinking. */ 10158 sfmmu_invalidate_ctx(sfmmup); 10159 } else { 10160 /* 10161 * It is illegal to swap in TSBs from a process other 10162 * than a process being swapped in. This in turn 10163 * implies we do not have a valid MMU context here 10164 * since a process needs one to resolve translation 10165 * misses. 
10166 */ 10167 ASSERT(curthread->t_procp->p_as->a_hat == sfmmup); 10168 } 10169 10170 #ifdef DEBUG 10171 ASSERT(max_mmu_ctxdoms > 0); 10172 10173 /* 10174 * Process should have INVALID_CONTEXT on all MMUs 10175 */ 10176 for (i = 0; i < max_mmu_ctxdoms; i++) { 10177 10178 ASSERT(sfmmup->sfmmu_ctxs[i].cnum == INVALID_CONTEXT); 10179 } 10180 #endif 10181 10182 new_tsbinfo->tsb_next = old_tsbinfo->tsb_next; 10183 membar_stst(); /* strict ordering required */ 10184 if (prevtsb) 10185 prevtsb->tsb_next = new_tsbinfo; 10186 else 10187 sfmmup->sfmmu_tsb = new_tsbinfo; 10188 membar_enter(); /* make sure new TSB globally visible */ 10189 10190 /* 10191 * We need to migrate TSB entries from the old TSB to the new TSB 10192 * if tsb_remap_ttes is set and the TSB is growing. 10193 */ 10194 if (tsb_remap_ttes && ((flags & TSB_GROW) == TSB_GROW)) 10195 sfmmu_copy_tsb(old_tsbinfo, new_tsbinfo); 10196 10197 SFMMU_FLAGS_CLEAR(sfmmup, HAT_BUSY); 10198 10199 /* 10200 * Drop the HAT lock to free our old tsb_info. 10201 */ 10202 sfmmu_hat_exit(hatlockp); 10203 10204 if ((flags & TSB_GROW) == TSB_GROW) { 10205 SFMMU_STAT(sf_tsb_grow); 10206 } else if ((flags & TSB_SHRINK) == TSB_SHRINK) { 10207 SFMMU_STAT(sf_tsb_shrink); 10208 } 10209 10210 sfmmu_tsbinfo_free(old_tsbinfo); 10211 10212 (void) sfmmu_hat_enter(sfmmup); 10213 return (TSB_SUCCESS); 10214 } 10215 10216 /* 10217 * This function will re-program hat pgsz array, and invalidate the 10218 * process' context, forcing the process to switch to another 10219 * context on the next TLB miss, and therefore start using the 10220 * TLB that is reprogrammed for the new page sizes. 10221 */ 10222 void 10223 sfmmu_reprog_pgsz_arr(sfmmu_t *sfmmup, uint8_t *tmp_pgsz) 10224 { 10225 int i; 10226 hatlock_t *hatlockp = NULL; 10227 10228 hatlockp = sfmmu_hat_enter(sfmmup); 10229 /* USIII+-IV+ optimization, requires hat lock */ 10230 if (tmp_pgsz) { 10231 for (i = 0; i < mmu_page_sizes; i++) 10232 sfmmup->sfmmu_pgsz[i] = tmp_pgsz[i]; 10233 } 10234 SFMMU_STAT(sf_tlb_reprog_pgsz); 10235 10236 sfmmu_invalidate_ctx(sfmmup); 10237 10238 sfmmu_hat_exit(hatlockp); 10239 } 10240 10241 /* 10242 * The scd_rttecnt field in the SCD must be updated to take account of the 10243 * regions which it contains. 10244 */ 10245 static void 10246 sfmmu_set_scd_rttecnt(sf_srd_t *srdp, sf_scd_t *scdp) 10247 { 10248 uint_t rid; 10249 uint_t i, j; 10250 ulong_t w; 10251 sf_region_t *rgnp; 10252 10253 ASSERT(srdp != NULL); 10254 10255 for (i = 0; i < SFMMU_HMERGNMAP_WORDS; i++) { 10256 if ((w = scdp->scd_region_map.bitmap[i]) == 0) { 10257 continue; 10258 } 10259 10260 j = 0; 10261 while (w) { 10262 if (!(w & 0x1)) { 10263 j++; 10264 w >>= 1; 10265 continue; 10266 } 10267 rid = (i << BT_ULSHIFT) | j; 10268 j++; 10269 w >>= 1; 10270 10271 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 10272 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 10273 rgnp = srdp->srd_hmergnp[rid]; 10274 ASSERT(rgnp->rgn_refcnt > 0); 10275 ASSERT(rgnp->rgn_id == rid); 10276 10277 scdp->scd_rttecnt[rgnp->rgn_pgszc] += 10278 rgnp->rgn_size >> TTE_PAGE_SHIFT(rgnp->rgn_pgszc); 10279 10280 /* 10281 * Maintain the tsb0 inflation cnt for the regions 10282 * in the SCD. 
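 *
 * As a worked example (assuming TTE_PAGE_SHIFT(TTE8K) is 13), a 4M
 * region contributes 4M >> (13 + 2) = 128 to sfmmu_tsb0_4minflcnt,
 * i.e. one quarter of the 512 8K ttes that would be needed to map
 * the region with 8K pages.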
10283 */ 10284 if (rgnp->rgn_pgszc >= TTE4M) { 10285 scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt += 10286 rgnp->rgn_size >> 10287 (TTE_PAGE_SHIFT(TTE8K) + 2); 10288 } 10289 } 10290 } 10291 } 10292 10293 /* 10294 * This function assumes that there are either four or six supported page 10295 * sizes and at most two programmable TLBs, so we need to decide which 10296 * page sizes are most important and then tell the MMU layer so it 10297 * can adjust the TLB page sizes accordingly (if supported). 10298 * 10299 * If these assumptions change, this function will need to be 10300 * updated to support whatever the new limits are. 10301 * 10302 * The growing flag is nonzero if we are growing the address space, 10303 * and zero if it is shrinking. This allows us to decide whether 10304 * to grow or shrink our TSB, depending upon available memory 10305 * conditions. 10306 */ 10307 static void 10308 sfmmu_check_page_sizes(sfmmu_t *sfmmup, int growing) 10309 { 10310 uint64_t ttecnt[MMU_PAGE_SIZES]; 10311 uint64_t tte8k_cnt, tte4m_cnt; 10312 uint8_t i; 10313 int sectsb_thresh; 10314 10315 /* 10316 * Kernel threads, processes with small address spaces not using 10317 * large pages, and dummy ISM HATs need not apply. 10318 */ 10319 if (sfmmup == ksfmmup || sfmmup->sfmmu_ismhat != NULL) 10320 return; 10321 10322 if (!SFMMU_LGPGS_INUSE(sfmmup) && 10323 sfmmup->sfmmu_ttecnt[TTE8K] <= tsb_rss_factor) 10324 return; 10325 10326 for (i = 0; i < mmu_page_sizes; i++) { 10327 ttecnt[i] = sfmmup->sfmmu_ttecnt[i] + 10328 sfmmup->sfmmu_ismttecnt[i]; 10329 } 10330 10331 /* Check pagesizes in use, and possibly reprogram DTLB. */ 10332 if (&mmu_check_page_sizes) 10333 mmu_check_page_sizes(sfmmup, ttecnt); 10334 10335 /* 10336 * Calculate the number of 8k ttes to represent the span of these 10337 * pages. 10338 */ 10339 tte8k_cnt = ttecnt[TTE8K] + 10340 (ttecnt[TTE64K] << (MMU_PAGESHIFT64K - MMU_PAGESHIFT)) + 10341 (ttecnt[TTE512K] << (MMU_PAGESHIFT512K - MMU_PAGESHIFT)); 10342 if (mmu_page_sizes == max_mmu_page_sizes) { 10343 tte4m_cnt = ttecnt[TTE4M] + 10344 (ttecnt[TTE32M] << (MMU_PAGESHIFT32M - MMU_PAGESHIFT4M)) + 10345 (ttecnt[TTE256M] << (MMU_PAGESHIFT256M - MMU_PAGESHIFT4M)); 10346 } else { 10347 tte4m_cnt = ttecnt[TTE4M]; 10348 } 10349 10350 /* 10351 * Inflate tte8k_cnt to allow for region large page allocation failure. 10352 */ 10353 tte8k_cnt += sfmmup->sfmmu_tsb0_4minflcnt; 10354 10355 /* 10356 * Inflate TSB sizes by a factor of 2 if this process 10357 * uses 4M text pages to minimize extra conflict misses 10358 * in the first TSB since without counting text pages 10359 * 8K TSB may become too small. 10360 * 10361 * Also double the size of the second TSB to minimize 10362 * extra conflict misses due to competition between 4M text pages 10363 * and data pages. 10364 * 10365 * We need to adjust the second TSB allocation threshold by the 10366 * inflation factor, since there is no point in creating a second 10367 * TSB when we know all the mappings can fit in the I/D TLBs. 10368 */ 10369 sectsb_thresh = tsb_sectsb_threshold; 10370 if (sfmmup->sfmmu_flags & HAT_4MTEXT_FLAG) { 10371 tte8k_cnt <<= 1; 10372 tte4m_cnt <<= 1; 10373 sectsb_thresh <<= 1; 10374 } 10375 10376 /* 10377 * Check to see if our TSB is the right size; we may need to 10378 * grow or shrink it. If the process is small, our work is 10379 * finished at this point. 
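 *
 * As a worked example of the check below (the shift amounts assume an
 * 8K base page size): a process with 100 8K, 10 64K and 2 512K ttes
 * has tte8k_cnt = 100 + (10 << 3) + (2 << 6) = 308 8K-equivalent ttes,
 * so it proceeds to sfmmu_size_tsb() only if 308 exceeds
 * tsb_rss_factor (or its 4M count exceeds sectsb_thresh).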
10380 */ 10381 if (tte8k_cnt <= tsb_rss_factor && tte4m_cnt <= sectsb_thresh) { 10382 return; 10383 } 10384 sfmmu_size_tsb(sfmmup, growing, tte8k_cnt, tte4m_cnt, sectsb_thresh); 10385 } 10386 10387 static void 10388 sfmmu_size_tsb(sfmmu_t *sfmmup, int growing, uint64_t tte8k_cnt, 10389 uint64_t tte4m_cnt, int sectsb_thresh) 10390 { 10391 int tsb_bits; 10392 uint_t tsb_szc; 10393 struct tsb_info *tsbinfop; 10394 hatlock_t *hatlockp = NULL; 10395 10396 hatlockp = sfmmu_hat_enter(sfmmup); 10397 ASSERT(hatlockp != NULL); 10398 tsbinfop = sfmmup->sfmmu_tsb; 10399 ASSERT(tsbinfop != NULL); 10400 10401 /* 10402 * If we're growing, select the size based on RSS. If we're 10403 * shrinking, leave some room so we don't have to turn around and 10404 * grow again immediately. 10405 */ 10406 if (growing) 10407 tsb_szc = SELECT_TSB_SIZECODE(tte8k_cnt); 10408 else 10409 tsb_szc = SELECT_TSB_SIZECODE(tte8k_cnt << 1); 10410 10411 if (!growing && (tsb_szc < tsbinfop->tsb_szc) && 10412 (tsb_szc >= default_tsb_size) && TSB_OK_SHRINK()) { 10413 (void) sfmmu_replace_tsb(sfmmup, tsbinfop, tsb_szc, 10414 hatlockp, TSB_SHRINK); 10415 } else if (growing && tsb_szc > tsbinfop->tsb_szc && TSB_OK_GROW()) { 10416 (void) sfmmu_replace_tsb(sfmmup, tsbinfop, tsb_szc, 10417 hatlockp, TSB_GROW); 10418 } 10419 tsbinfop = sfmmup->sfmmu_tsb; 10420 10421 /* 10422 * With the TLB and first TSB out of the way, we need to see if 10423 * we need a second TSB for 4M pages. If we managed to reprogram 10424 * the TLB page sizes above, the process will start using this new 10425 * TSB right away; otherwise, it will start using it on the next 10426 * context switch. Either way, it's no big deal so there's no 10427 * synchronization with the trap handlers here unless we grow the 10428 * TSB (in which case it's required to prevent using the old one 10429 * after it's freed). Note: second tsb is required for 32M/256M 10430 * page sizes. 10431 */ 10432 if (tte4m_cnt > sectsb_thresh) { 10433 /* 10434 * If we're growing, select the size based on RSS. If we're 10435 * shrinking, leave some room so we don't have to turn 10436 * around and grow again immediately. 10437 */ 10438 if (growing) 10439 tsb_szc = SELECT_TSB_SIZECODE(tte4m_cnt); 10440 else 10441 tsb_szc = SELECT_TSB_SIZECODE(tte4m_cnt << 1); 10442 if (tsbinfop->tsb_next == NULL) { 10443 struct tsb_info *newtsb; 10444 int allocflags = SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)? 10445 0 : TSB_ALLOC; 10446 10447 sfmmu_hat_exit(hatlockp); 10448 10449 /* 10450 * Try to allocate a TSB for 4[32|256]M pages. If we 10451 * can't get the size we want, retry w/a minimum sized 10452 * TSB. If that still didn't work, give up; we can 10453 * still run without one. 10454 */ 10455 tsb_bits = (mmu_page_sizes == max_mmu_page_sizes)? 10456 TSB4M|TSB32M|TSB256M:TSB4M; 10457 if ((sfmmu_tsbinfo_alloc(&newtsb, tsb_szc, tsb_bits, 10458 allocflags, sfmmup)) && 10459 (tsb_szc <= TSB_4M_SZCODE || 10460 sfmmu_tsbinfo_alloc(&newtsb, TSB_4M_SZCODE, 10461 tsb_bits, allocflags, sfmmup)) && 10462 sfmmu_tsbinfo_alloc(&newtsb, TSB_MIN_SZCODE, 10463 tsb_bits, allocflags, sfmmup)) { 10464 return; 10465 } 10466 10467 hatlockp = sfmmu_hat_enter(sfmmup); 10468 10469 sfmmu_invalidate_ctx(sfmmup); 10470 10471 if (sfmmup->sfmmu_tsb->tsb_next == NULL) { 10472 sfmmup->sfmmu_tsb->tsb_next = newtsb; 10473 SFMMU_STAT(sf_tsb_sectsb_create); 10474 sfmmu_hat_exit(hatlockp); 10475 return; 10476 } else { 10477 /* 10478 * It's annoying, but possible for us 10479 * to get here.. 
we dropped the HAT lock 10480 * because of locking order in the kmem 10481 * allocator, and while we were off getting 10482 * our memory, some other thread decided to 10483 * do us a favor and won the race to get a 10484 * second TSB for this process. Sigh. 10485 */ 10486 sfmmu_hat_exit(hatlockp); 10487 sfmmu_tsbinfo_free(newtsb); 10488 return; 10489 } 10490 } 10491 10492 /* 10493 * We have a second TSB, see if it's big enough. 10494 */ 10495 tsbinfop = tsbinfop->tsb_next; 10496 10497 /* 10498 * Check to see if our second TSB is the right size; 10499 * we may need to grow or shrink it. 10500 * To prevent thrashing (e.g. growing the TSB on a 10501 * subsequent map operation), only try to shrink if 10502 * the TSB reach exceeds twice the virtual address 10503 * space size. 10504 */ 10505 if (!growing && (tsb_szc < tsbinfop->tsb_szc) && 10506 (tsb_szc >= default_tsb_size) && TSB_OK_SHRINK()) { 10507 (void) sfmmu_replace_tsb(sfmmup, tsbinfop, 10508 tsb_szc, hatlockp, TSB_SHRINK); 10509 } else if (growing && tsb_szc > tsbinfop->tsb_szc && 10510 TSB_OK_GROW()) { 10511 (void) sfmmu_replace_tsb(sfmmup, tsbinfop, 10512 tsb_szc, hatlockp, TSB_GROW); 10513 } 10514 } 10515 10516 sfmmu_hat_exit(hatlockp); 10517 } 10518 10519 /* 10520 * Free up a sfmmu 10521 * Since the sfmmu is currently embedded in the hat struct we simply zero 10522 * out our fields and free up the ism map blk list if any. 10523 */ 10524 static void 10525 sfmmu_free_sfmmu(sfmmu_t *sfmmup) 10526 { 10527 ism_blk_t *blkp, *nx_blkp; 10528 #ifdef DEBUG 10529 ism_map_t *map; 10530 int i; 10531 #endif 10532 10533 ASSERT(sfmmup->sfmmu_ttecnt[TTE8K] == 0); 10534 ASSERT(sfmmup->sfmmu_ttecnt[TTE64K] == 0); 10535 ASSERT(sfmmup->sfmmu_ttecnt[TTE512K] == 0); 10536 ASSERT(sfmmup->sfmmu_ttecnt[TTE4M] == 0); 10537 ASSERT(sfmmup->sfmmu_ttecnt[TTE32M] == 0); 10538 ASSERT(sfmmup->sfmmu_ttecnt[TTE256M] == 0); 10539 ASSERT(SF_RGNMAP_ISNULL(sfmmup)); 10540 10541 sfmmup->sfmmu_free = 0; 10542 sfmmup->sfmmu_ismhat = 0; 10543 10544 blkp = sfmmup->sfmmu_iblk; 10545 sfmmup->sfmmu_iblk = NULL; 10546 10547 while (blkp) { 10548 #ifdef DEBUG 10549 map = blkp->iblk_maps; 10550 for (i = 0; i < ISM_MAP_SLOTS; i++) { 10551 ASSERT(map[i].imap_seg == 0); 10552 ASSERT(map[i].imap_ismhat == NULL); 10553 ASSERT(map[i].imap_ment == NULL); 10554 } 10555 #endif 10556 nx_blkp = blkp->iblk_next; 10557 blkp->iblk_next = NULL; 10558 blkp->iblk_nextpa = (uint64_t)-1; 10559 kmem_cache_free(ism_blk_cache, blkp); 10560 blkp = nx_blkp; 10561 } 10562 } 10563 10564 /* 10565 * Locking primitves accessed by HATLOCK macros 10566 */ 10567 10568 #define SFMMU_SPL_MTX (0x0) 10569 #define SFMMU_ML_MTX (0x1) 10570 10571 #define SFMMU_MLSPL_MTX(type, pg) (((type) == SFMMU_SPL_MTX) ? \ 10572 SPL_HASH(pg) : MLIST_HASH(pg)) 10573 10574 kmutex_t * 10575 sfmmu_page_enter(struct page *pp) 10576 { 10577 return (sfmmu_mlspl_enter(pp, SFMMU_SPL_MTX)); 10578 } 10579 10580 void 10581 sfmmu_page_exit(kmutex_t *spl) 10582 { 10583 mutex_exit(spl); 10584 } 10585 10586 int 10587 sfmmu_page_spl_held(struct page *pp) 10588 { 10589 return (sfmmu_mlspl_held(pp, SFMMU_SPL_MTX)); 10590 } 10591 10592 kmutex_t * 10593 sfmmu_mlist_enter(struct page *pp) 10594 { 10595 return (sfmmu_mlspl_enter(pp, SFMMU_ML_MTX)); 10596 } 10597 10598 void 10599 sfmmu_mlist_exit(kmutex_t *mml) 10600 { 10601 mutex_exit(mml); 10602 } 10603 10604 int 10605 sfmmu_mlist_held(struct page *pp) 10606 { 10607 10608 return (sfmmu_mlspl_held(pp, SFMMU_ML_MTX)); 10609 } 10610 10611 /* 10612 * Common code for sfmmu_mlist_enter() and sfmmu_page_enter(). 
For 10613 * sfmmu_mlist_enter() case mml_table lock array is used and for 10614 * sfmmu_page_enter() sfmmu_page_lock lock array is used. 10615 * 10616 * The lock is taken on a root page so that it protects an operation on all 10617 * constituent pages of a large page pp belongs to. 10618 * 10619 * The routine takes a lock from the appropriate array. The lock is determined 10620 * by hashing the root page. After taking the lock this routine checks if the 10621 * root page has the same size code that was used to determine the root (i.e 10622 * that root hasn't changed). If root page has the expected p_szc field we 10623 * have the right lock and it's returned to the caller. If root's p_szc 10624 * decreased we release the lock and retry from the beginning. This case can 10625 * happen due to hat_page_demote() decreasing p_szc between our load of p_szc 10626 * value and taking the lock. The number of retries due to p_szc decrease is 10627 * limited by the maximum p_szc value. If p_szc is 0 we return the lock 10628 * determined by hashing pp itself. 10629 * 10630 * If our caller doesn't hold a SE_SHARED or SE_EXCL lock on pp it's also 10631 * possible that p_szc can increase. To increase p_szc a thread has to lock 10632 * all constituent pages EXCL and do hat_pageunload() on all of them. All the 10633 * callers that don't hold a page locked recheck if hmeblk through which pp 10634 * was found still maps this pp. If it doesn't map it anymore returned lock 10635 * is immediately dropped. Therefore if sfmmu_mlspl_enter() hits the case of 10636 * p_szc increase after taking the lock it returns this lock without further 10637 * retries because in this case the caller doesn't care about which lock was 10638 * taken. The caller will drop it right away. 10639 * 10640 * After the routine returns it's guaranteed that hat_page_demote() can't 10641 * change p_szc field of any of constituent pages of a large page pp belongs 10642 * to as long as pp was either locked at least SHARED prior to this call or 10643 * the caller finds that hment that pointed to this pp still references this 10644 * pp (this also assumes that the caller holds hme hash bucket lock so that 10645 * the same pp can't be remapped into the same hmeblk after it was unmapped by 10646 * hat_pageunload()). 10647 */ 10648 static kmutex_t * 10649 sfmmu_mlspl_enter(struct page *pp, int type) 10650 { 10651 kmutex_t *mtx; 10652 uint_t prev_rszc = UINT_MAX; 10653 page_t *rootpp; 10654 uint_t szc; 10655 uint_t rszc; 10656 uint_t pszc = pp->p_szc; 10657 10658 ASSERT(pp != NULL); 10659 10660 again: 10661 if (pszc == 0) { 10662 mtx = SFMMU_MLSPL_MTX(type, pp); 10663 mutex_enter(mtx); 10664 return (mtx); 10665 } 10666 10667 /* The lock lives in the root page */ 10668 rootpp = PP_GROUPLEADER(pp, pszc); 10669 mtx = SFMMU_MLSPL_MTX(type, rootpp); 10670 mutex_enter(mtx); 10671 10672 /* 10673 * Return mml in the following 3 cases: 10674 * 10675 * 1) If pp itself is root since if its p_szc decreased before we took 10676 * the lock pp is still the root of smaller szc page. And if its p_szc 10677 * increased it doesn't matter what lock we return (see comment in 10678 * front of this routine). 10679 * 10680 * 2) If pp's not root but rootpp is the root of a rootpp->p_szc size 10681 * large page we have the right lock since any previous potential 10682 * hat_page_demote() is done demoting from greater than current root's 10683 * p_szc because hat_page_demote() changes root's p_szc last. 
No 10684 * further hat_page_demote() can start or be in progress since it 10685 * would need the same lock we currently hold. 10686 * 10687 * 3) If rootpp's p_szc increased since previous iteration it doesn't 10688 * matter what lock we return (see comment in front of this routine). 10689 */ 10690 if (pp == rootpp || (rszc = rootpp->p_szc) == pszc || 10691 rszc >= prev_rszc) { 10692 return (mtx); 10693 } 10694 10695 /* 10696 * hat_page_demote() could have decreased root's p_szc. 10697 * In this case pp's p_szc must also be smaller than pszc. 10698 * Retry. 10699 */ 10700 if (rszc < pszc) { 10701 szc = pp->p_szc; 10702 if (szc < pszc) { 10703 mutex_exit(mtx); 10704 pszc = szc; 10705 goto again; 10706 } 10707 /* 10708 * pp's p_szc increased after it was decreased. 10709 * page cannot be mapped. Return current lock. The caller 10710 * will drop it right away. 10711 */ 10712 return (mtx); 10713 } 10714 10715 /* 10716 * root's p_szc is greater than pp's p_szc. 10717 * hat_page_demote() is not done with all pages 10718 * yet. Wait for it to complete. 10719 */ 10720 mutex_exit(mtx); 10721 rootpp = PP_GROUPLEADER(rootpp, rszc); 10722 mtx = SFMMU_MLSPL_MTX(type, rootpp); 10723 mutex_enter(mtx); 10724 mutex_exit(mtx); 10725 prev_rszc = rszc; 10726 goto again; 10727 } 10728 10729 static int 10730 sfmmu_mlspl_held(struct page *pp, int type) 10731 { 10732 kmutex_t *mtx; 10733 10734 ASSERT(pp != NULL); 10735 /* The lock lives in the root page */ 10736 pp = PP_PAGEROOT(pp); 10737 ASSERT(pp != NULL); 10738 10739 mtx = SFMMU_MLSPL_MTX(type, pp); 10740 return (MUTEX_HELD(mtx)); 10741 } 10742 10743 static uint_t 10744 sfmmu_get_free_hblk(struct hme_blk **hmeblkpp, uint_t critical) 10745 { 10746 struct hme_blk *hblkp; 10747 10748 10749 if (freehblkp != NULL) { 10750 mutex_enter(&freehblkp_lock); 10751 if (freehblkp != NULL) { 10752 /* 10753 * If the current thread is owning hblk_reserve OR 10754 * critical request from sfmmu_hblk_steal() 10755 * let it succeed even if freehblkcnt is really low. 10756 */ 10757 if (freehblkcnt <= HBLK_RESERVE_MIN && !critical) { 10758 SFMMU_STAT(sf_get_free_throttle); 10759 mutex_exit(&freehblkp_lock); 10760 return (0); 10761 } 10762 freehblkcnt--; 10763 *hmeblkpp = freehblkp; 10764 hblkp = *hmeblkpp; 10765 freehblkp = hblkp->hblk_next; 10766 mutex_exit(&freehblkp_lock); 10767 hblkp->hblk_next = NULL; 10768 SFMMU_STAT(sf_get_free_success); 10769 10770 ASSERT(hblkp->hblk_hmecnt == 0); 10771 ASSERT(hblkp->hblk_vcnt == 0); 10772 ASSERT(hblkp->hblk_nextpa == va_to_pa((caddr_t)hblkp)); 10773 10774 return (1); 10775 } 10776 mutex_exit(&freehblkp_lock); 10777 } 10778 10779 /* Check cpu hblk pending queues */ 10780 if ((*hmeblkpp = sfmmu_check_pending_hblks(TTE8K)) != NULL) { 10781 hblkp = *hmeblkpp; 10782 hblkp->hblk_next = NULL; 10783 hblkp->hblk_nextpa = va_to_pa((caddr_t)hblkp); 10784 10785 ASSERT(hblkp->hblk_hmecnt == 0); 10786 ASSERT(hblkp->hblk_vcnt == 0); 10787 10788 return (1); 10789 } 10790 10791 SFMMU_STAT(sf_get_free_fail); 10792 return (0); 10793 } 10794 10795 static uint_t 10796 sfmmu_put_free_hblk(struct hme_blk *hmeblkp, uint_t critical) 10797 { 10798 struct hme_blk *hblkp; 10799 10800 ASSERT(hmeblkp->hblk_hmecnt == 0); 10801 ASSERT(hmeblkp->hblk_vcnt == 0); 10802 ASSERT(hmeblkp->hblk_nextpa == va_to_pa((caddr_t)hmeblkp)); 10803 10804 /* 10805 * If the current thread is mapping into kernel space, 10806 * let it succede even if freehblkcnt is max 10807 * so that it will avoid freeing it to kmem. 
10808 * This will prevent stack overflow due to 10809 * possible recursion since kmem_cache_free() 10810 * might require creation of a slab which 10811 * in turn needs an hmeblk to map that slab; 10812 * let's break this vicious chain at the first 10813 * opportunity. 10814 */ 10815 if (freehblkcnt < HBLK_RESERVE_CNT || critical) { 10816 mutex_enter(&freehblkp_lock); 10817 if (freehblkcnt < HBLK_RESERVE_CNT || critical) { 10818 SFMMU_STAT(sf_put_free_success); 10819 freehblkcnt++; 10820 hmeblkp->hblk_next = freehblkp; 10821 freehblkp = hmeblkp; 10822 mutex_exit(&freehblkp_lock); 10823 return (1); 10824 } 10825 mutex_exit(&freehblkp_lock); 10826 } 10827 10828 /* 10829 * Bring down freehblkcnt to HBLK_RESERVE_CNT. We are here 10830 * only if freehblkcnt is at least HBLK_RESERVE_CNT *and* 10831 * we are not in the process of mapping into kernel space. 10832 */ 10833 ASSERT(!critical); 10834 while (freehblkcnt > HBLK_RESERVE_CNT) { 10835 mutex_enter(&freehblkp_lock); 10836 if (freehblkcnt > HBLK_RESERVE_CNT) { 10837 freehblkcnt--; 10838 hblkp = freehblkp; 10839 freehblkp = hblkp->hblk_next; 10840 mutex_exit(&freehblkp_lock); 10841 ASSERT(get_hblk_cache(hblkp) == sfmmu8_cache); 10842 kmem_cache_free(sfmmu8_cache, hblkp); 10843 continue; 10844 } 10845 mutex_exit(&freehblkp_lock); 10846 } 10847 SFMMU_STAT(sf_put_free_fail); 10848 return (0); 10849 } 10850 10851 static void 10852 sfmmu_hblk_swap(struct hme_blk *new) 10853 { 10854 struct hme_blk *old, *hblkp, *prev; 10855 uint64_t newpa; 10856 caddr_t base, vaddr, endaddr; 10857 struct hmehash_bucket *hmebp; 10858 struct sf_hment *osfhme, *nsfhme; 10859 page_t *pp; 10860 kmutex_t *pml; 10861 tte_t tte; 10862 struct hme_blk *list = NULL; 10863 10864 #ifdef DEBUG 10865 hmeblk_tag hblktag; 10866 struct hme_blk *found; 10867 #endif 10868 old = HBLK_RESERVE; 10869 ASSERT(!old->hblk_shared); 10870 10871 /* 10872 * save pa before bcopy clobbers it 10873 */ 10874 newpa = new->hblk_nextpa; 10875 10876 base = (caddr_t)get_hblk_base(old); 10877 endaddr = base + get_hblk_span(old); 10878 10879 /* 10880 * acquire hash bucket lock. 10881 */ 10882 hmebp = sfmmu_tteload_acquire_hashbucket(ksfmmup, base, TTE8K, 10883 SFMMU_INVALID_SHMERID); 10884 10885 /* 10886 * copy contents from old to new 10887 */ 10888 bcopy((void *)old, (void *)new, HME8BLK_SZ); 10889 10890 /* 10891 * add new to hash chain 10892 */ 10893 sfmmu_hblk_hash_add(hmebp, new, newpa); 10894 10895 /* 10896 * search hash chain for hblk_reserve; this needs to be performed 10897 * after adding new, otherwise prev won't correspond to the hblk which 10898 * is prior to old in hash chain when we call sfmmu_hblk_hash_rm to 10899 * remove old later. 10900 */ 10901 for (prev = NULL, 10902 hblkp = hmebp->hmeblkp; hblkp != NULL && hblkp != old; 10903 prev = hblkp, hblkp = hblkp->hblk_next) 10904 ; 10905 10906 if (hblkp != old) 10907 panic("sfmmu_hblk_swap: hblk_reserve not found"); 10908 10909 /* 10910 * p_mapping list is still pointing to hments in hblk_reserve; 10911 * fix up p_mapping list so that they point to hments in new. 10912 * 10913 * Since all these mappings are created by hblk_reserve_thread 10914 * on the way and it's using at least one of the buffers from each of 10915 * the newly minted slabs, there is no danger of any of these 10916 * mappings getting unloaded by another thread. 10917 * 10918 * tsbmiss could only modify ref/mod bits of hments in old/new. 
10919 * Since all of these hments hold mappings established by segkmem 10920 * and mappings in segkmem are setup with HAT_NOSYNC, ref/mod bits 10921 * have no meaning for the mappings in hblk_reserve. hments in 10922 * old and new are identical except for ref/mod bits. 10923 */ 10924 for (vaddr = base; vaddr < endaddr; vaddr += TTEBYTES(TTE8K)) { 10925 10926 HBLKTOHME(osfhme, old, vaddr); 10927 sfmmu_copytte(&osfhme->hme_tte, &tte); 10928 10929 if (TTE_IS_VALID(&tte)) { 10930 if ((pp = osfhme->hme_page) == NULL) 10931 panic("sfmmu_hblk_swap: page not mapped"); 10932 10933 pml = sfmmu_mlist_enter(pp); 10934 10935 if (pp != osfhme->hme_page) 10936 panic("sfmmu_hblk_swap: mapping changed"); 10937 10938 HBLKTOHME(nsfhme, new, vaddr); 10939 10940 HME_ADD(nsfhme, pp); 10941 HME_SUB(osfhme, pp); 10942 10943 sfmmu_mlist_exit(pml); 10944 } 10945 } 10946 10947 /* 10948 * remove old from hash chain 10949 */ 10950 sfmmu_hblk_hash_rm(hmebp, old, prev, &list, 1); 10951 10952 #ifdef DEBUG 10953 10954 hblktag.htag_id = ksfmmup; 10955 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 10956 hblktag.htag_bspage = HME_HASH_BSPAGE(base, HME_HASH_SHIFT(TTE8K)); 10957 hblktag.htag_rehash = HME_HASH_REHASH(TTE8K); 10958 HME_HASH_FAST_SEARCH(hmebp, hblktag, found); 10959 10960 if (found != new) 10961 panic("sfmmu_hblk_swap: new hblk not found"); 10962 #endif 10963 10964 SFMMU_HASH_UNLOCK(hmebp); 10965 10966 /* 10967 * Reset hblk_reserve 10968 */ 10969 bzero((void *)old, HME8BLK_SZ); 10970 old->hblk_nextpa = va_to_pa((caddr_t)old); 10971 } 10972 10973 /* 10974 * Grab the mlist mutex for both pages passed in. 10975 * 10976 * low and high will be returned as pointers to the mutexes for these pages. 10977 * low refers to the mutex residing in the lower bin of the mlist hash, while 10978 * high refers to the mutex residing in the higher bin of the mlist hash. This 10979 * is due to the locking order restrictions on the same thread grabbing 10980 * multiple mlist mutexes. The low lock must be acquired before the high lock. 10981 * 10982 * If both pages hash to the same mutex, only grab that single mutex, and 10983 * high will be returned as NULL 10984 * If the pages hash to different bins in the hash, grab the lower addressed 10985 * lock first and then the higher addressed lock in order to follow the locking 10986 * rules involved with the same thread grabbing multiple mlist mutexes. 10987 * low and high will both have non-NULL values. 10988 */ 10989 static void 10990 sfmmu_mlist_reloc_enter(struct page *targ, struct page *repl, 10991 kmutex_t **low, kmutex_t **high) 10992 { 10993 kmutex_t *mml_targ, *mml_repl; 10994 10995 /* 10996 * no need to do the dance around szc as in sfmmu_mlist_enter() 10997 * because this routine is only called by hat_page_relocate() and all 10998 * targ and repl pages are already locked EXCL so szc can't change. 
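 *
 * Illustrative usage only (hat_page_relocate() is the actual caller):
 *
 *	kmutex_t *low, *high;
 *
 *	sfmmu_mlist_reloc_enter(targ, repl, &low, &high);
 *	... relocate the mappings from targ to repl ...
 *	sfmmu_mlist_reloc_exit(low, high);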
10999 */ 11000 11001 mml_targ = MLIST_HASH(PP_PAGEROOT(targ)); 11002 mml_repl = MLIST_HASH(PP_PAGEROOT(repl)); 11003 11004 if (mml_targ == mml_repl) { 11005 *low = mml_targ; 11006 *high = NULL; 11007 } else { 11008 if (mml_targ < mml_repl) { 11009 *low = mml_targ; 11010 *high = mml_repl; 11011 } else { 11012 *low = mml_repl; 11013 *high = mml_targ; 11014 } 11015 } 11016 11017 mutex_enter(*low); 11018 if (*high) 11019 mutex_enter(*high); 11020 } 11021 11022 static void 11023 sfmmu_mlist_reloc_exit(kmutex_t *low, kmutex_t *high) 11024 { 11025 if (high) 11026 mutex_exit(high); 11027 mutex_exit(low); 11028 } 11029 11030 hatlock_t * 11031 sfmmu_hat_enter(sfmmu_t *sfmmup) 11032 { 11033 hatlock_t *hatlockp; 11034 11035 if (sfmmup != ksfmmup) { 11036 hatlockp = TSB_HASH(sfmmup); 11037 mutex_enter(HATLOCK_MUTEXP(hatlockp)); 11038 return (hatlockp); 11039 } 11040 return (NULL); 11041 } 11042 11043 static hatlock_t * 11044 sfmmu_hat_tryenter(sfmmu_t *sfmmup) 11045 { 11046 hatlock_t *hatlockp; 11047 11048 if (sfmmup != ksfmmup) { 11049 hatlockp = TSB_HASH(sfmmup); 11050 if (mutex_tryenter(HATLOCK_MUTEXP(hatlockp)) == 0) 11051 return (NULL); 11052 return (hatlockp); 11053 } 11054 return (NULL); 11055 } 11056 11057 void 11058 sfmmu_hat_exit(hatlock_t *hatlockp) 11059 { 11060 if (hatlockp != NULL) 11061 mutex_exit(HATLOCK_MUTEXP(hatlockp)); 11062 } 11063 11064 static void 11065 sfmmu_hat_lock_all(void) 11066 { 11067 int i; 11068 for (i = 0; i < SFMMU_NUM_LOCK; i++) 11069 mutex_enter(HATLOCK_MUTEXP(&hat_lock[i])); 11070 } 11071 11072 static void 11073 sfmmu_hat_unlock_all(void) 11074 { 11075 int i; 11076 for (i = SFMMU_NUM_LOCK - 1; i >= 0; i--) 11077 mutex_exit(HATLOCK_MUTEXP(&hat_lock[i])); 11078 } 11079 11080 int 11081 sfmmu_hat_lock_held(sfmmu_t *sfmmup) 11082 { 11083 ASSERT(sfmmup != ksfmmup); 11084 return (MUTEX_HELD(HATLOCK_MUTEXP(TSB_HASH(sfmmup)))); 11085 } 11086 11087 /* 11088 * Locking primitives to provide consistency between ISM unmap 11089 * and other operations. Since ISM unmap can take a long time, we 11090 * use HAT_ISMBUSY flag (protected by the hatlock) to avoid creating 11091 * contention on the hatlock buckets while ISM segments are being 11092 * unmapped. The tradeoff is that the flags don't prevent priority 11093 * inversion from occurring, so we must request kernel priority in 11094 * case we have to sleep to keep from getting buried while holding 11095 * the HAT_ISMBUSY flag set, which in turn could block other kernel 11096 * threads from running (for example, in sfmmu_uvatopfn()). 
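 *
 * The expected usage, shown here only as a sketch, is:
 *
 *	sfmmu_ismhat_enter(sfmmup, 0);	(sets HAT_ISMBUSY, requests kpri)
 *	... perform the ISM map or unmap operation ...
 *	sfmmu_ismhat_exit(sfmmup, 0);	(clears the flag, wakes any waiters)
 *
 * with the second argument nonzero when the caller already holds the
 * hat lock.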
11097 */ 11098 static void 11099 sfmmu_ismhat_enter(sfmmu_t *sfmmup, int hatlock_held) 11100 { 11101 hatlock_t *hatlockp; 11102 11103 THREAD_KPRI_REQUEST(); 11104 if (!hatlock_held) 11105 hatlockp = sfmmu_hat_enter(sfmmup); 11106 while (SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)) 11107 cv_wait(&sfmmup->sfmmu_tsb_cv, HATLOCK_MUTEXP(hatlockp)); 11108 SFMMU_FLAGS_SET(sfmmup, HAT_ISMBUSY); 11109 if (!hatlock_held) 11110 sfmmu_hat_exit(hatlockp); 11111 } 11112 11113 static void 11114 sfmmu_ismhat_exit(sfmmu_t *sfmmup, int hatlock_held) 11115 { 11116 hatlock_t *hatlockp; 11117 11118 if (!hatlock_held) 11119 hatlockp = sfmmu_hat_enter(sfmmup); 11120 ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)); 11121 SFMMU_FLAGS_CLEAR(sfmmup, HAT_ISMBUSY); 11122 cv_broadcast(&sfmmup->sfmmu_tsb_cv); 11123 if (!hatlock_held) 11124 sfmmu_hat_exit(hatlockp); 11125 THREAD_KPRI_RELEASE(); 11126 } 11127 11128 /* 11129 * 11130 * Algorithm: 11131 * 11132 * (1) if segkmem is not ready, allocate hblk from an array of pre-alloc'ed 11133 * hblks. 11134 * 11135 * (2) if we are allocating an hblk for mapping a slab in sfmmu_cache, 11136 * 11137 * (a) try to return an hblk from reserve pool of free hblks; 11138 * (b) if the reserve pool is empty, acquire hblk_reserve_lock 11139 * and return hblk_reserve. 11140 * 11141 * (3) call kmem_cache_alloc() to allocate hblk; 11142 * 11143 * (a) if hblk_reserve_lock is held by the current thread, 11144 * atomically replace hblk_reserve by the hblk that is 11145 * returned by kmem_cache_alloc; release hblk_reserve_lock 11146 * and call kmem_cache_alloc() again. 11147 * (b) if reserve pool is not full, add the hblk that is 11148 * returned by kmem_cache_alloc to reserve pool and 11149 * call kmem_cache_alloc again. 11150 * 11151 */ 11152 static struct hme_blk * 11153 sfmmu_hblk_alloc(sfmmu_t *sfmmup, caddr_t vaddr, 11154 struct hmehash_bucket *hmebp, uint_t size, hmeblk_tag hblktag, 11155 uint_t flags, uint_t rid) 11156 { 11157 struct hme_blk *hmeblkp = NULL; 11158 struct hme_blk *newhblkp; 11159 struct hme_blk *shw_hblkp = NULL; 11160 struct kmem_cache *sfmmu_cache = NULL; 11161 uint64_t hblkpa; 11162 ulong_t index; 11163 uint_t owner; /* set to 1 if using hblk_reserve */ 11164 uint_t forcefree; 11165 int sleep; 11166 sf_srd_t *srdp; 11167 sf_region_t *rgnp; 11168 11169 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 11170 ASSERT(hblktag.htag_rid == rid); 11171 SFMMU_VALIDATE_HMERID(sfmmup, rid, vaddr, TTEBYTES(size)); 11172 ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || 11173 IS_P2ALIGNED(vaddr, TTEBYTES(size))); 11174 11175 /* 11176 * If segkmem is not created yet, allocate from static hmeblks 11177 * created at the end of startup_modules(). See the block comment 11178 * in startup_modules() describing how we estimate the number of 11179 * static hmeblks that will be needed during re-map. 11180 */ 11181 if (!hblk_alloc_dynamic) { 11182 11183 ASSERT(!SFMMU_IS_SHMERID_VALID(rid)); 11184 11185 if (size == TTE8K) { 11186 index = nucleus_hblk8.index; 11187 if (index >= nucleus_hblk8.len) { 11188 /* 11189 * If we panic here, see startup_modules() to 11190 * make sure that we are calculating the 11191 * number of hblk8's that we need correctly. 11192 */ 11193 prom_panic("no nucleus hblk8 to allocate"); 11194 } 11195 hmeblkp = 11196 (struct hme_blk *)&nucleus_hblk8.list[index]; 11197 nucleus_hblk8.index++; 11198 SFMMU_STAT(sf_hblk8_nalloc); 11199 } else { 11200 index = nucleus_hblk1.index; 11201 if (nucleus_hblk1.index >= nucleus_hblk1.len) { 11202 /* 11203 * If we panic here, see startup_modules(). 
11204 * Most likely you need to update the 11205 * calculation of the number of hblk1 elements 11206 * that the kernel needs to boot. 11207 */ 11208 prom_panic("no nucleus hblk1 to allocate"); 11209 } 11210 hmeblkp = 11211 (struct hme_blk *)&nucleus_hblk1.list[index]; 11212 nucleus_hblk1.index++; 11213 SFMMU_STAT(sf_hblk1_nalloc); 11214 } 11215 11216 goto hblk_init; 11217 } 11218 11219 SFMMU_HASH_UNLOCK(hmebp); 11220 11221 if (sfmmup != KHATID && !SFMMU_IS_SHMERID_VALID(rid)) { 11222 if (mmu_page_sizes == max_mmu_page_sizes) { 11223 if (size < TTE256M) 11224 shw_hblkp = sfmmu_shadow_hcreate(sfmmup, vaddr, 11225 size, flags); 11226 } else { 11227 if (size < TTE4M) 11228 shw_hblkp = sfmmu_shadow_hcreate(sfmmup, vaddr, 11229 size, flags); 11230 } 11231 } else if (SFMMU_IS_SHMERID_VALID(rid)) { 11232 /* 11233 * Shared hmes use per region bitmaps in rgn_hmeflag 11234 * rather than shadow hmeblks to keep track of the 11235 * mapping sizes which have been allocated for the region. 11236 * Here we cleanup old invalid hmeblks with this rid, 11237 * which may be left around by pageunload(). 11238 */ 11239 int ttesz; 11240 caddr_t va; 11241 caddr_t eva = vaddr + TTEBYTES(size); 11242 11243 ASSERT(sfmmup != KHATID); 11244 11245 srdp = sfmmup->sfmmu_srdp; 11246 ASSERT(srdp != NULL && srdp->srd_refcnt != 0); 11247 rgnp = srdp->srd_hmergnp[rid]; 11248 ASSERT(rgnp != NULL && rgnp->rgn_id == rid); 11249 ASSERT(rgnp->rgn_refcnt != 0); 11250 ASSERT(size <= rgnp->rgn_pgszc); 11251 11252 ttesz = HBLK_MIN_TTESZ; 11253 do { 11254 if (!(rgnp->rgn_hmeflags & (0x1 << ttesz))) { 11255 continue; 11256 } 11257 11258 if (ttesz > size && ttesz != HBLK_MIN_TTESZ) { 11259 sfmmu_cleanup_rhblk(srdp, vaddr, rid, ttesz); 11260 } else if (ttesz < size) { 11261 for (va = vaddr; va < eva; 11262 va += TTEBYTES(ttesz)) { 11263 sfmmu_cleanup_rhblk(srdp, va, rid, 11264 ttesz); 11265 } 11266 } 11267 } while (++ttesz <= rgnp->rgn_pgszc); 11268 } 11269 11270 fill_hblk: 11271 owner = (hblk_reserve_thread == curthread) ? 1 : 0; 11272 11273 if (owner && size == TTE8K) { 11274 11275 ASSERT(!SFMMU_IS_SHMERID_VALID(rid)); 11276 /* 11277 * We are really in a tight spot. We already own 11278 * hblk_reserve and we need another hblk. In anticipation 11279 * of this kind of scenario, we specifically set aside 11280 * HBLK_RESERVE_MIN number of hblks to be used exclusively 11281 * by owner of hblk_reserve. 11282 */ 11283 SFMMU_STAT(sf_hblk_recurse_cnt); 11284 11285 if (!sfmmu_get_free_hblk(&hmeblkp, 1)) 11286 panic("sfmmu_hblk_alloc: reserve list is empty"); 11287 11288 goto hblk_verify; 11289 } 11290 11291 ASSERT(!owner); 11292 11293 if ((flags & HAT_NO_KALLOC) == 0) { 11294 11295 sfmmu_cache = ((size == TTE8K) ? sfmmu8_cache : sfmmu1_cache); 11296 sleep = ((sfmmup == KHATID) ? KM_NOSLEEP : KM_SLEEP); 11297 11298 if ((hmeblkp = kmem_cache_alloc(sfmmu_cache, sleep)) == NULL) { 11299 hmeblkp = sfmmu_hblk_steal(size); 11300 } else { 11301 /* 11302 * if we are the owner of hblk_reserve, 11303 * swap hblk_reserve with hmeblkp and 11304 * start a fresh life. Hope things go 11305 * better this time. 
11306 */ 11307 if (hblk_reserve_thread == curthread) { 11308 ASSERT(sfmmu_cache == sfmmu8_cache); 11309 sfmmu_hblk_swap(hmeblkp); 11310 hblk_reserve_thread = NULL; 11311 mutex_exit(&hblk_reserve_lock); 11312 goto fill_hblk; 11313 } 11314 /* 11315 * let's donate this hblk to our reserve list if 11316 * we are not mapping kernel range 11317 */ 11318 if (size == TTE8K && sfmmup != KHATID) { 11319 if (sfmmu_put_free_hblk(hmeblkp, 0)) 11320 goto fill_hblk; 11321 } 11322 } 11323 } else { 11324 /* 11325 * We are here to map the slab in sfmmu8_cache; let's 11326 * check if we could tap our reserve list; if successful, 11327 * this will avoid the pain of going thru sfmmu_hblk_swap 11328 */ 11329 SFMMU_STAT(sf_hblk_slab_cnt); 11330 if (!sfmmu_get_free_hblk(&hmeblkp, 0)) { 11331 /* 11332 * let's start hblk_reserve dance 11333 */ 11334 SFMMU_STAT(sf_hblk_reserve_cnt); 11335 owner = 1; 11336 mutex_enter(&hblk_reserve_lock); 11337 hmeblkp = HBLK_RESERVE; 11338 hblk_reserve_thread = curthread; 11339 } 11340 } 11341 11342 hblk_verify: 11343 ASSERT(hmeblkp != NULL); 11344 set_hblk_sz(hmeblkp, size); 11345 ASSERT(hmeblkp->hblk_nextpa == va_to_pa((caddr_t)hmeblkp)); 11346 SFMMU_HASH_LOCK(hmebp); 11347 HME_HASH_FAST_SEARCH(hmebp, hblktag, newhblkp); 11348 if (newhblkp != NULL) { 11349 SFMMU_HASH_UNLOCK(hmebp); 11350 if (hmeblkp != HBLK_RESERVE) { 11351 /* 11352 * This is really tricky! 11353 * 11354 * vmem_alloc(vmem_seg_arena) 11355 * vmem_alloc(vmem_internal_arena) 11356 * segkmem_alloc(heap_arena) 11357 * vmem_alloc(heap_arena) 11358 * page_create() 11359 * hat_memload() 11360 * kmem_cache_free() 11361 * kmem_cache_alloc() 11362 * kmem_slab_create() 11363 * vmem_alloc(kmem_internal_arena) 11364 * segkmem_alloc(heap_arena) 11365 * vmem_alloc(heap_arena) 11366 * page_create() 11367 * hat_memload() 11368 * kmem_cache_free() 11369 * ... 11370 * 11371 * Thus, hat_memload() could call kmem_cache_free 11372 * for enough number of times that we could easily 11373 * hit the bottom of the stack or run out of reserve 11374 * list of vmem_seg structs. So, we must donate 11375 * this hblk to reserve list if it's allocated 11376 * from sfmmu8_cache *and* mapping kernel range. 11377 * We don't need to worry about freeing hmeblk1's 11378 * to kmem since they don't map any kmem slabs. 11379 * 11380 * Note: When segkmem supports largepages, we must 11381 * free hmeblk1's to reserve list as well. 11382 */ 11383 forcefree = (sfmmup == KHATID) ? 1 : 0; 11384 if (size == TTE8K && 11385 sfmmu_put_free_hblk(hmeblkp, forcefree)) { 11386 goto re_verify; 11387 } 11388 ASSERT(sfmmup != KHATID); 11389 kmem_cache_free(get_hblk_cache(hmeblkp), hmeblkp); 11390 } else { 11391 /* 11392 * Hey! we don't need hblk_reserve any more. 11393 */ 11394 ASSERT(owner); 11395 hblk_reserve_thread = NULL; 11396 mutex_exit(&hblk_reserve_lock); 11397 owner = 0; 11398 } 11399 re_verify: 11400 /* 11401 * let's check if the goodies are still present 11402 */ 11403 SFMMU_HASH_LOCK(hmebp); 11404 HME_HASH_FAST_SEARCH(hmebp, hblktag, newhblkp); 11405 if (newhblkp != NULL) { 11406 /* 11407 * return newhblkp if it's not hblk_reserve; 11408 * if newhblkp is hblk_reserve, return it 11409 * _only if_ we are the owner of hblk_reserve. 
11410 */ 11411 if (newhblkp != HBLK_RESERVE || owner) { 11412 ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || 11413 newhblkp->hblk_shared); 11414 ASSERT(SFMMU_IS_SHMERID_VALID(rid) || 11415 !newhblkp->hblk_shared); 11416 return (newhblkp); 11417 } else { 11418 /* 11419 * we just hit hblk_reserve in the hash and 11420 * we are not the owner of that; 11421 * 11422 * block until hblk_reserve_thread completes 11423 * swapping hblk_reserve and try the dance 11424 * once again. 11425 */ 11426 SFMMU_HASH_UNLOCK(hmebp); 11427 mutex_enter(&hblk_reserve_lock); 11428 mutex_exit(&hblk_reserve_lock); 11429 SFMMU_STAT(sf_hblk_reserve_hit); 11430 goto fill_hblk; 11431 } 11432 } else { 11433 /* 11434 * it's no more! try the dance once again. 11435 */ 11436 SFMMU_HASH_UNLOCK(hmebp); 11437 goto fill_hblk; 11438 } 11439 } 11440 11441 hblk_init: 11442 if (SFMMU_IS_SHMERID_VALID(rid)) { 11443 uint16_t tteflag = 0x1 << 11444 ((size < HBLK_MIN_TTESZ) ? HBLK_MIN_TTESZ : size); 11445 11446 if (!(rgnp->rgn_hmeflags & tteflag)) { 11447 atomic_or_16(&rgnp->rgn_hmeflags, tteflag); 11448 } 11449 hmeblkp->hblk_shared = 1; 11450 } else { 11451 hmeblkp->hblk_shared = 0; 11452 } 11453 set_hblk_sz(hmeblkp, size); 11454 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 11455 hmeblkp->hblk_next = (struct hme_blk *)NULL; 11456 hmeblkp->hblk_tag = hblktag; 11457 hmeblkp->hblk_shadow = shw_hblkp; 11458 hblkpa = hmeblkp->hblk_nextpa; 11459 hmeblkp->hblk_nextpa = HMEBLK_ENDPA; 11460 11461 ASSERT(get_hblk_ttesz(hmeblkp) == size); 11462 ASSERT(get_hblk_span(hmeblkp) == HMEBLK_SPAN(size)); 11463 ASSERT(hmeblkp->hblk_hmecnt == 0); 11464 ASSERT(hmeblkp->hblk_vcnt == 0); 11465 ASSERT(hmeblkp->hblk_lckcnt == 0); 11466 ASSERT(hblkpa == va_to_pa((caddr_t)hmeblkp)); 11467 sfmmu_hblk_hash_add(hmebp, hmeblkp, hblkpa); 11468 return (hmeblkp); 11469 } 11470 11471 /* 11472 * This function cleans up the hme_blk and returns it to the free list. 11473 */ 11474 /* ARGSUSED */ 11475 static void 11476 sfmmu_hblk_free(struct hme_blk **listp) 11477 { 11478 struct hme_blk *hmeblkp, *next_hmeblkp; 11479 int size; 11480 uint_t critical; 11481 uint64_t hblkpa; 11482 11483 ASSERT(*listp != NULL); 11484 11485 hmeblkp = *listp; 11486 while (hmeblkp != NULL) { 11487 next_hmeblkp = hmeblkp->hblk_next; 11488 ASSERT(!hmeblkp->hblk_hmecnt); 11489 ASSERT(!hmeblkp->hblk_vcnt); 11490 ASSERT(!hmeblkp->hblk_lckcnt); 11491 ASSERT(hmeblkp != (struct hme_blk *)hblk_reserve); 11492 ASSERT(hmeblkp->hblk_shared == 0); 11493 ASSERT(hmeblkp->hblk_shw_bit == 0); 11494 ASSERT(hmeblkp->hblk_shadow == NULL); 11495 11496 hblkpa = va_to_pa((caddr_t)hmeblkp); 11497 ASSERT(hblkpa != (uint64_t)-1); 11498 critical = (hblktosfmmu(hmeblkp) == KHATID) ? 1 : 0; 11499 11500 size = get_hblk_ttesz(hmeblkp); 11501 hmeblkp->hblk_next = NULL; 11502 hmeblkp->hblk_nextpa = hblkpa; 11503 11504 if (hmeblkp->hblk_nuc_bit == 0) { 11505 11506 if (size != TTE8K || 11507 !sfmmu_put_free_hblk(hmeblkp, critical)) 11508 kmem_cache_free(get_hblk_cache(hmeblkp), 11509 hmeblkp); 11510 } 11511 hmeblkp = next_hmeblkp; 11512 } 11513 } 11514 11515 #define BUCKETS_TO_SEARCH_BEFORE_UNLOAD 30 11516 #define SFMMU_HBLK_STEAL_THRESHOLD 5 11517 11518 static uint_t sfmmu_hblk_steal_twice; 11519 static uint_t sfmmu_hblk_steal_count, sfmmu_hblk_steal_unload_count; 11520 11521 /* 11522 * Steal a hmeblk from user or kernel hme hash lists. 11523 * For 8K tte grab one from reserve pool (freehblkp) before proceeding to 11524 * steal and if we fail to steal after SFMMU_HBLK_STEAL_THRESHOLD attempts 11525 * tap into critical reserve of freehblkp. 
11526 * Note: We remain looping in this routine until we find one. 11527 */ 11528 static struct hme_blk * 11529 sfmmu_hblk_steal(int size) 11530 { 11531 static struct hmehash_bucket *uhmehash_steal_hand = NULL; 11532 struct hmehash_bucket *hmebp; 11533 struct hme_blk *hmeblkp = NULL, *pr_hblk; 11534 uint64_t hblkpa; 11535 int i; 11536 uint_t loop_cnt = 0, critical; 11537 11538 for (;;) { 11539 /* Check cpu hblk pending queues */ 11540 if ((hmeblkp = sfmmu_check_pending_hblks(size)) != NULL) { 11541 hmeblkp->hblk_nextpa = va_to_pa((caddr_t)hmeblkp); 11542 ASSERT(hmeblkp->hblk_hmecnt == 0); 11543 ASSERT(hmeblkp->hblk_vcnt == 0); 11544 return (hmeblkp); 11545 } 11546 11547 if (size == TTE8K) { 11548 critical = 11549 (++loop_cnt > SFMMU_HBLK_STEAL_THRESHOLD) ? 1 : 0; 11550 if (sfmmu_get_free_hblk(&hmeblkp, critical)) 11551 return (hmeblkp); 11552 } 11553 11554 hmebp = (uhmehash_steal_hand == NULL) ? uhme_hash : 11555 uhmehash_steal_hand; 11556 ASSERT(hmebp >= uhme_hash && hmebp <= &uhme_hash[UHMEHASH_SZ]); 11557 11558 for (i = 0; hmeblkp == NULL && i <= UHMEHASH_SZ + 11559 BUCKETS_TO_SEARCH_BEFORE_UNLOAD; i++) { 11560 SFMMU_HASH_LOCK(hmebp); 11561 hmeblkp = hmebp->hmeblkp; 11562 hblkpa = hmebp->hmeh_nextpa; 11563 pr_hblk = NULL; 11564 while (hmeblkp) { 11565 /* 11566 * check if it is a hmeblk that is not locked 11567 * and not shared. skip shadow hmeblks with 11568 * shadow_mask set i.e valid count non zero. 11569 */ 11570 if ((get_hblk_ttesz(hmeblkp) == size) && 11571 (hmeblkp->hblk_shw_bit == 0 || 11572 hmeblkp->hblk_vcnt == 0) && 11573 (hmeblkp->hblk_lckcnt == 0)) { 11574 /* 11575 * there is a high probability that we 11576 * will find a free one. search some 11577 * buckets for a free hmeblk initially 11578 * before unloading a valid hmeblk. 11579 */ 11580 if ((hmeblkp->hblk_vcnt == 0 && 11581 hmeblkp->hblk_hmecnt == 0) || (i >= 11582 BUCKETS_TO_SEARCH_BEFORE_UNLOAD)) { 11583 if (sfmmu_steal_this_hblk(hmebp, 11584 hmeblkp, hblkpa, pr_hblk)) { 11585 /* 11586 * Hblk is unloaded 11587 * successfully 11588 */ 11589 break; 11590 } 11591 } 11592 } 11593 pr_hblk = hmeblkp; 11594 hblkpa = hmeblkp->hblk_nextpa; 11595 hmeblkp = hmeblkp->hblk_next; 11596 } 11597 11598 SFMMU_HASH_UNLOCK(hmebp); 11599 if (hmebp++ == &uhme_hash[UHMEHASH_SZ]) 11600 hmebp = uhme_hash; 11601 } 11602 uhmehash_steal_hand = hmebp; 11603 11604 if (hmeblkp != NULL) 11605 break; 11606 11607 /* 11608 * in the worst case, look for a free one in the kernel 11609 * hash table. 11610 */ 11611 for (i = 0, hmebp = khme_hash; i <= KHMEHASH_SZ; i++) { 11612 SFMMU_HASH_LOCK(hmebp); 11613 hmeblkp = hmebp->hmeblkp; 11614 hblkpa = hmebp->hmeh_nextpa; 11615 pr_hblk = NULL; 11616 while (hmeblkp) { 11617 /* 11618 * check if it is free hmeblk 11619 */ 11620 if ((get_hblk_ttesz(hmeblkp) == size) && 11621 (hmeblkp->hblk_lckcnt == 0) && 11622 (hmeblkp->hblk_vcnt == 0) && 11623 (hmeblkp->hblk_hmecnt == 0)) { 11624 if (sfmmu_steal_this_hblk(hmebp, 11625 hmeblkp, hblkpa, pr_hblk)) { 11626 break; 11627 } else { 11628 /* 11629 * Cannot fail since we have 11630 * hash lock. 
11631 */ 11632 panic("fail to steal?"); 11633 } 11634 } 11635 11636 pr_hblk = hmeblkp; 11637 hblkpa = hmeblkp->hblk_nextpa; 11638 hmeblkp = hmeblkp->hblk_next; 11639 } 11640 11641 SFMMU_HASH_UNLOCK(hmebp); 11642 if (hmebp++ == &khme_hash[KHMEHASH_SZ]) 11643 hmebp = khme_hash; 11644 } 11645 11646 if (hmeblkp != NULL) 11647 break; 11648 sfmmu_hblk_steal_twice++; 11649 } 11650 return (hmeblkp); 11651 } 11652 11653 /* 11654 * This routine does real work to prepare a hblk to be "stolen" by 11655 * unloading the mappings, updating shadow counts .... 11656 * It returns 1 if the block is ready to be reused (stolen), or 0 11657 * means the block cannot be stolen yet- pageunload is still working 11658 * on this hblk. 11659 */ 11660 static int 11661 sfmmu_steal_this_hblk(struct hmehash_bucket *hmebp, struct hme_blk *hmeblkp, 11662 uint64_t hblkpa, struct hme_blk *pr_hblk) 11663 { 11664 int shw_size, vshift; 11665 struct hme_blk *shw_hblkp; 11666 caddr_t vaddr; 11667 uint_t shw_mask, newshw_mask; 11668 struct hme_blk *list = NULL; 11669 11670 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 11671 11672 /* 11673 * check if the hmeblk is free, unload if necessary 11674 */ 11675 if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) { 11676 sfmmu_t *sfmmup; 11677 demap_range_t dmr; 11678 11679 sfmmup = hblktosfmmu(hmeblkp); 11680 if (hmeblkp->hblk_shared || sfmmup->sfmmu_ismhat) { 11681 return (0); 11682 } 11683 DEMAP_RANGE_INIT(sfmmup, &dmr); 11684 (void) sfmmu_hblk_unload(sfmmup, hmeblkp, 11685 (caddr_t)get_hblk_base(hmeblkp), 11686 get_hblk_endaddr(hmeblkp), &dmr, HAT_UNLOAD); 11687 DEMAP_RANGE_FLUSH(&dmr); 11688 if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) { 11689 /* 11690 * Pageunload is working on the same hblk. 11691 */ 11692 return (0); 11693 } 11694 11695 sfmmu_hblk_steal_unload_count++; 11696 } 11697 11698 ASSERT(hmeblkp->hblk_lckcnt == 0); 11699 ASSERT(hmeblkp->hblk_vcnt == 0 && hmeblkp->hblk_hmecnt == 0); 11700 11701 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, &list, 1); 11702 hmeblkp->hblk_nextpa = hblkpa; 11703 11704 shw_hblkp = hmeblkp->hblk_shadow; 11705 if (shw_hblkp) { 11706 ASSERT(!hmeblkp->hblk_shared); 11707 shw_size = get_hblk_ttesz(shw_hblkp); 11708 vaddr = (caddr_t)get_hblk_base(hmeblkp); 11709 vshift = vaddr_to_vshift(shw_hblkp->hblk_tag, vaddr, shw_size); 11710 ASSERT(vshift < 8); 11711 /* 11712 * Atomically clear shadow mask bit 11713 */ 11714 do { 11715 shw_mask = shw_hblkp->hblk_shw_mask; 11716 ASSERT(shw_mask & (1 << vshift)); 11717 newshw_mask = shw_mask & ~(1 << vshift); 11718 newshw_mask = cas32(&shw_hblkp->hblk_shw_mask, 11719 shw_mask, newshw_mask); 11720 } while (newshw_mask != shw_mask); 11721 hmeblkp->hblk_shadow = NULL; 11722 } 11723 11724 /* 11725 * remove shadow bit if we are stealing an unused shadow hmeblk. 11726 * sfmmu_hblk_alloc needs it that way, will set shadow bit later if 11727 * we are indeed allocating a shadow hmeblk. 
11728 */ 11729 hmeblkp->hblk_shw_bit = 0; 11730 11731 if (hmeblkp->hblk_shared) { 11732 sf_srd_t *srdp; 11733 sf_region_t *rgnp; 11734 uint_t rid; 11735 11736 srdp = hblktosrd(hmeblkp); 11737 ASSERT(srdp != NULL && srdp->srd_refcnt != 0); 11738 rid = hmeblkp->hblk_tag.htag_rid; 11739 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 11740 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 11741 rgnp = srdp->srd_hmergnp[rid]; 11742 ASSERT(rgnp != NULL); 11743 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid); 11744 hmeblkp->hblk_shared = 0; 11745 } 11746 11747 sfmmu_hblk_steal_count++; 11748 SFMMU_STAT(sf_steal_count); 11749 11750 return (1); 11751 } 11752 11753 struct hme_blk * 11754 sfmmu_hmetohblk(struct sf_hment *sfhme) 11755 { 11756 struct hme_blk *hmeblkp; 11757 struct sf_hment *sfhme0; 11758 struct hme_blk *hblk_dummy = 0; 11759 11760 /* 11761 * No dummy sf_hments, please. 11762 */ 11763 ASSERT(sfhme->hme_tte.ll != 0); 11764 11765 sfhme0 = sfhme - sfhme->hme_tte.tte_hmenum; 11766 hmeblkp = (struct hme_blk *)((uintptr_t)sfhme0 - 11767 (uintptr_t)&hblk_dummy->hblk_hme[0]); 11768 11769 return (hmeblkp); 11770 } 11771 11772 /* 11773 * On swapin, get appropriately sized TSB(s) and clear the HAT_SWAPPED flag. 11774 * If we can't get appropriately sized TSB(s), try for 8K TSB(s) using 11775 * KM_SLEEP allocation. 11776 * 11777 * Return 0 on success, -1 otherwise. 11778 */ 11779 static void 11780 sfmmu_tsb_swapin(sfmmu_t *sfmmup, hatlock_t *hatlockp) 11781 { 11782 struct tsb_info *tsbinfop, *next; 11783 tsb_replace_rc_t rc; 11784 boolean_t gotfirst = B_FALSE; 11785 11786 ASSERT(sfmmup != ksfmmup); 11787 ASSERT(sfmmu_hat_lock_held(sfmmup)); 11788 11789 while (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPIN)) { 11790 cv_wait(&sfmmup->sfmmu_tsb_cv, HATLOCK_MUTEXP(hatlockp)); 11791 } 11792 11793 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) { 11794 SFMMU_FLAGS_SET(sfmmup, HAT_SWAPIN); 11795 } else { 11796 return; 11797 } 11798 11799 ASSERT(sfmmup->sfmmu_tsb != NULL); 11800 11801 /* 11802 * Loop over all tsbinfo's replacing them with ones that actually have 11803 * a TSB. If any of the replacements ever fail, bail out of the loop. 11804 */ 11805 for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL; tsbinfop = next) { 11806 ASSERT(tsbinfop->tsb_flags & TSB_SWAPPED); 11807 next = tsbinfop->tsb_next; 11808 rc = sfmmu_replace_tsb(sfmmup, tsbinfop, tsbinfop->tsb_szc, 11809 hatlockp, TSB_SWAPIN); 11810 if (rc != TSB_SUCCESS) { 11811 break; 11812 } 11813 gotfirst = B_TRUE; 11814 } 11815 11816 switch (rc) { 11817 case TSB_SUCCESS: 11818 SFMMU_FLAGS_CLEAR(sfmmup, HAT_SWAPPED|HAT_SWAPIN); 11819 cv_broadcast(&sfmmup->sfmmu_tsb_cv); 11820 return; 11821 case TSB_LOSTRACE: 11822 break; 11823 case TSB_ALLOCFAIL: 11824 break; 11825 default: 11826 panic("sfmmu_replace_tsb returned unrecognized failure code " 11827 "%d", rc); 11828 } 11829 11830 /* 11831 * In this case, we failed to get one of our TSBs. If we failed to 11832 * get the first TSB, get one of minimum size (8KB). Walk the list 11833 * and throw away the tsbinfos, starting where the allocation failed; 11834 * we can get by with just one TSB as long as we don't leave the 11835 * SWAPPED tsbinfo structures lying around. 
11836 */ 11837 tsbinfop = sfmmup->sfmmu_tsb; 11838 next = tsbinfop->tsb_next; 11839 tsbinfop->tsb_next = NULL; 11840 11841 sfmmu_hat_exit(hatlockp); 11842 for (tsbinfop = next; tsbinfop != NULL; tsbinfop = next) { 11843 next = tsbinfop->tsb_next; 11844 sfmmu_tsbinfo_free(tsbinfop); 11845 } 11846 hatlockp = sfmmu_hat_enter(sfmmup); 11847 11848 /* 11849 * If we don't have any TSBs, get a single 8K TSB for 8K, 64K and 512K 11850 * pages. 11851 */ 11852 if (!gotfirst) { 11853 tsbinfop = sfmmup->sfmmu_tsb; 11854 rc = sfmmu_replace_tsb(sfmmup, tsbinfop, TSB_MIN_SZCODE, 11855 hatlockp, TSB_SWAPIN | TSB_FORCEALLOC); 11856 ASSERT(rc == TSB_SUCCESS); 11857 } 11858 11859 SFMMU_FLAGS_CLEAR(sfmmup, HAT_SWAPPED|HAT_SWAPIN); 11860 cv_broadcast(&sfmmup->sfmmu_tsb_cv); 11861 } 11862 11863 static int 11864 sfmmu_is_rgnva(sf_srd_t *srdp, caddr_t addr, ulong_t w, ulong_t bmw) 11865 { 11866 ulong_t bix = 0; 11867 uint_t rid; 11868 sf_region_t *rgnp; 11869 11870 ASSERT(srdp != NULL); 11871 ASSERT(srdp->srd_refcnt != 0); 11872 11873 w <<= BT_ULSHIFT; 11874 while (bmw) { 11875 if (!(bmw & 0x1)) { 11876 bix++; 11877 bmw >>= 1; 11878 continue; 11879 } 11880 rid = w | bix; 11881 rgnp = srdp->srd_hmergnp[rid]; 11882 ASSERT(rgnp->rgn_refcnt > 0); 11883 ASSERT(rgnp->rgn_id == rid); 11884 if (addr < rgnp->rgn_saddr || 11885 addr >= (rgnp->rgn_saddr + rgnp->rgn_size)) { 11886 bix++; 11887 bmw >>= 1; 11888 } else { 11889 return (1); 11890 } 11891 } 11892 return (0); 11893 } 11894 11895 /* 11896 * Handle exceptions for low level tsb_handler. 11897 * 11898 * There are many scenarios that could land us here: 11899 * 11900 * If the context is invalid we land here. The context can be invalid 11901 * for 3 reasons: 1) we couldn't allocate a new context and now need to 11902 * perform a wrap around operation in order to allocate a new context. 11903 * 2) Context was invalidated to change pagesize programming. 3) ISM or 11904 * TSB configuration is changing for this process and we are forced into 11905 * here to do a synchronization operation. If the context is valid we can 11906 * be here from the window trap handler. In this case just call trap to 11907 * handle the fault. 11908 * 11909 * Note that the process will run in INVALID_CONTEXT before 11910 * faulting into here and subsequently loading the MMU registers 11911 * (including the TSB base register) associated with this process. 11912 * For this reason, the trap handlers must all test for 11913 * INVALID_CONTEXT before attempting to access any registers other 11914 * than the context registers. 11915 */ 11916 void 11917 sfmmu_tsbmiss_exception(struct regs *rp, uintptr_t tagaccess, uint_t traptype) 11918 { 11919 sfmmu_t *sfmmup, *shsfmmup; 11920 uint_t ctxtype; 11921 klwp_id_t lwp; 11922 char lwp_save_state; 11923 hatlock_t *hatlockp, *shatlockp; 11924 struct tsb_info *tsbinfop; 11925 struct tsbmiss *tsbmp; 11926 sf_scd_t *scdp; 11927 11928 SFMMU_STAT(sf_tsb_exceptions); 11929 SFMMU_MMU_STAT(mmu_tsb_exceptions); 11930 sfmmup = astosfmmu(curthread->t_procp->p_as); 11931 /* 11932 * note that in sun4u, the tagaccess register contains ctxnum 11933 * while sun4v passes ctxtype in the tagaccess register.
11934 */ 11935 ctxtype = tagaccess & TAGACC_CTX_MASK; 11936 11937 ASSERT(sfmmup != ksfmmup && ctxtype != KCONTEXT); 11938 ASSERT(sfmmup->sfmmu_ismhat == 0); 11939 ASSERT(!SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED) || 11940 ctxtype == INVALID_CONTEXT); 11941 11942 if (ctxtype != INVALID_CONTEXT && traptype != T_DATA_PROT) { 11943 /* 11944 * We may land here because shme bitmap and pagesize 11945 * flags are updated lazily in tsbmiss area on other cpus. 11946 * If we detect here that tsbmiss area is out of sync with 11947 * sfmmu update it and retry the trapped instruction. 11948 * Otherwise call trap(). 11949 */ 11950 int ret = 0; 11951 uchar_t tteflag_mask = (1 << TTE64K) | (1 << TTE8K); 11952 caddr_t addr = (caddr_t)(tagaccess & TAGACC_VADDR_MASK); 11953 11954 /* 11955 * Must set lwp state to LWP_SYS before 11956 * trying to acquire any adaptive lock 11957 */ 11958 lwp = ttolwp(curthread); 11959 ASSERT(lwp); 11960 lwp_save_state = lwp->lwp_state; 11961 lwp->lwp_state = LWP_SYS; 11962 11963 hatlockp = sfmmu_hat_enter(sfmmup); 11964 kpreempt_disable(); 11965 tsbmp = &tsbmiss_area[CPU->cpu_id]; 11966 ASSERT(sfmmup == tsbmp->usfmmup); 11967 if (((tsbmp->uhat_tteflags ^ sfmmup->sfmmu_tteflags) & 11968 ~tteflag_mask) || 11969 ((tsbmp->uhat_rtteflags ^ sfmmup->sfmmu_rtteflags) & 11970 ~tteflag_mask)) { 11971 tsbmp->uhat_tteflags = sfmmup->sfmmu_tteflags; 11972 tsbmp->uhat_rtteflags = sfmmup->sfmmu_rtteflags; 11973 ret = 1; 11974 } 11975 if (sfmmup->sfmmu_srdp != NULL) { 11976 ulong_t *sm = sfmmup->sfmmu_hmeregion_map.bitmap; 11977 ulong_t *tm = tsbmp->shmermap; 11978 ulong_t i; 11979 for (i = 0; i < SFMMU_HMERGNMAP_WORDS; i++) { 11980 ulong_t d = tm[i] ^ sm[i]; 11981 if (d) { 11982 if (d & sm[i]) { 11983 if (!ret && sfmmu_is_rgnva( 11984 sfmmup->sfmmu_srdp, 11985 addr, i, d & sm[i])) { 11986 ret = 1; 11987 } 11988 } 11989 tm[i] = sm[i]; 11990 } 11991 } 11992 } 11993 kpreempt_enable(); 11994 sfmmu_hat_exit(hatlockp); 11995 lwp->lwp_state = lwp_save_state; 11996 if (ret) { 11997 return; 11998 } 11999 } else if (ctxtype == INVALID_CONTEXT) { 12000 /* 12001 * First, make sure we come out of here with a valid ctx, 12002 * since if we don't get one we'll simply loop on the 12003 * faulting instruction. 12004 * 12005 * If the ISM mappings are changing, the TSB is relocated, 12006 * the process is swapped, the process is joining SCD or 12007 * leaving SCD or shared regions we serialize behind the 12008 * controlling thread with hat lock, sfmmu_flags and 12009 * sfmmu_tsb_cv condition variable. 12010 */ 12011 12012 /* 12013 * Must set lwp state to LWP_SYS before 12014 * trying to acquire any adaptive lock 12015 */ 12016 lwp = ttolwp(curthread); 12017 ASSERT(lwp); 12018 lwp_save_state = lwp->lwp_state; 12019 lwp->lwp_state = LWP_SYS; 12020 12021 hatlockp = sfmmu_hat_enter(sfmmup); 12022 retry: 12023 if ((scdp = sfmmup->sfmmu_scdp) != NULL) { 12024 shsfmmup = scdp->scd_sfmmup; 12025 ASSERT(shsfmmup != NULL); 12026 12027 for (tsbinfop = shsfmmup->sfmmu_tsb; tsbinfop != NULL; 12028 tsbinfop = tsbinfop->tsb_next) { 12029 if (tsbinfop->tsb_flags & TSB_RELOC_FLAG) { 12030 /* drop the private hat lock */ 12031 sfmmu_hat_exit(hatlockp); 12032 /* acquire the shared hat lock */ 12033 shatlockp = sfmmu_hat_enter(shsfmmup); 12034 /* 12035 * recheck to see if anything changed 12036 * after we drop the private hat lock. 
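 * While the private hat lock was dropped the process may have left
 * the SCD or the SCD hat may have been replaced, so both sfmmu_scdp
 * and scd_sfmmup are revalidated before waiting on the shared
 * tsbinfo relocation.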
12037 */ 12038 if (sfmmup->sfmmu_scdp == scdp && 12039 shsfmmup == scdp->scd_sfmmup) { 12040 sfmmu_tsb_chk_reloc(shsfmmup, 12041 shatlockp); 12042 } 12043 sfmmu_hat_exit(shatlockp); 12044 hatlockp = sfmmu_hat_enter(sfmmup); 12045 goto retry; 12046 } 12047 } 12048 } 12049 12050 for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL; 12051 tsbinfop = tsbinfop->tsb_next) { 12052 if (tsbinfop->tsb_flags & TSB_RELOC_FLAG) { 12053 cv_wait(&sfmmup->sfmmu_tsb_cv, 12054 HATLOCK_MUTEXP(hatlockp)); 12055 goto retry; 12056 } 12057 } 12058 12059 /* 12060 * Wait for ISM maps to be updated. 12061 */ 12062 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)) { 12063 cv_wait(&sfmmup->sfmmu_tsb_cv, 12064 HATLOCK_MUTEXP(hatlockp)); 12065 goto retry; 12066 } 12067 12068 /* Is this process joining an SCD? */ 12069 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD)) { 12070 /* 12071 * Flush private TSB and setup shared TSB. 12072 * sfmmu_finish_join_scd() does not drop the 12073 * hat lock. 12074 */ 12075 sfmmu_finish_join_scd(sfmmup); 12076 SFMMU_FLAGS_CLEAR(sfmmup, HAT_JOIN_SCD); 12077 } 12078 12079 /* 12080 * If we're swapping in, get TSB(s). Note that we must do 12081 * this before we get a ctx or load the MMU state. Once 12082 * we swap in we have to recheck to make sure the TSB(s) and 12083 * ISM mappings didn't change while we slept. 12084 */ 12085 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) { 12086 sfmmu_tsb_swapin(sfmmup, hatlockp); 12087 goto retry; 12088 } 12089 12090 sfmmu_get_ctx(sfmmup); 12091 12092 sfmmu_hat_exit(hatlockp); 12093 /* 12094 * Must restore lwp_state if not calling 12095 * trap() for further processing. Restore 12096 * it anyway. 12097 */ 12098 lwp->lwp_state = lwp_save_state; 12099 return; 12100 } 12101 trap(rp, (caddr_t)tagaccess, traptype, 0); 12102 } 12103 12104 static void 12105 sfmmu_tsb_chk_reloc(sfmmu_t *sfmmup, hatlock_t *hatlockp) 12106 { 12107 struct tsb_info *tp; 12108 12109 ASSERT(sfmmu_hat_lock_held(sfmmup)); 12110 12111 for (tp = sfmmup->sfmmu_tsb; tp != NULL; tp = tp->tsb_next) { 12112 if (tp->tsb_flags & TSB_RELOC_FLAG) { 12113 cv_wait(&sfmmup->sfmmu_tsb_cv, 12114 HATLOCK_MUTEXP(hatlockp)); 12115 break; 12116 } 12117 } 12118 } 12119 12120 /* 12121 * sfmmu_vatopfn_suspended is called from GET_TTE when TL=0 and 12122 * TTE_SUSPENDED bit set in tte we block on aquiring a page lock 12123 * rather than spinning to avoid send mondo timeouts with 12124 * interrupts enabled. When the lock is acquired it is immediately 12125 * released and we return back to sfmmu_vatopfn just after 12126 * the GET_TTE call. 12127 */ 12128 void 12129 sfmmu_vatopfn_suspended(caddr_t vaddr, sfmmu_t *sfmmu, tte_t *ttep) 12130 { 12131 struct page **pp; 12132 12133 (void) as_pagelock(sfmmu->sfmmu_as, &pp, vaddr, TTE_CSZ(ttep), S_WRITE); 12134 as_pageunlock(sfmmu->sfmmu_as, pp, vaddr, TTE_CSZ(ttep), S_WRITE); 12135 } 12136 12137 /* 12138 * sfmmu_tsbmiss_suspended is called from GET_TTE when TL>0 and 12139 * TTE_SUSPENDED bit set in tte. We do this so that we can handle 12140 * cross traps which cannot be handled while spinning in the 12141 * trap handlers. Simply enter and exit the kpr_suspendlock spin 12142 * mutex, which is held by the holder of the suspend bit, and then 12143 * retry the trapped instruction after unwinding. 
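 * Acquiring kpr_suspendlock means the holder of the suspend bit has
 * dropped it, so the enter/exit pair acts purely as a barrier; by the
 * time we return and retry the trapped instruction the suspension is
 * over.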
12144 */ 12145 /*ARGSUSED*/ 12146 void 12147 sfmmu_tsbmiss_suspended(struct regs *rp, uintptr_t tagacc, uint_t traptype) 12148 { 12149 ASSERT(curthread != kreloc_thread); 12150 mutex_enter(&kpr_suspendlock); 12151 mutex_exit(&kpr_suspendlock); 12152 } 12153 12154 /* 12155 * This routine could be optimized to reduce the number of xcalls by flushing 12156 * the entire TLBs if region reference count is above some threshold but the 12157 * tradeoff will depend on the size of the TLB. So for now flush the specific 12158 * page a context at a time. 12159 * 12160 * If uselocks is 0 then it's called after all cpus were captured and all the 12161 * hat locks were taken. In this case don't take the region lock by relying on 12162 * the order of list region update operations in hat_join_region(), 12163 * hat_leave_region() and hat_dup_region(). The ordering in those routines 12164 * guarantees that list is always forward walkable and reaches active sfmmus 12165 * regardless of where xc_attention() captures a cpu. 12166 */ 12167 cpuset_t 12168 sfmmu_rgntlb_demap(caddr_t addr, sf_region_t *rgnp, 12169 struct hme_blk *hmeblkp, int uselocks) 12170 { 12171 sfmmu_t *sfmmup; 12172 cpuset_t cpuset; 12173 cpuset_t rcpuset; 12174 hatlock_t *hatlockp; 12175 uint_t rid = rgnp->rgn_id; 12176 sf_rgn_link_t *rlink; 12177 sf_scd_t *scdp; 12178 12179 ASSERT(hmeblkp->hblk_shared); 12180 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 12181 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 12182 12183 CPUSET_ZERO(rcpuset); 12184 if (uselocks) { 12185 mutex_enter(&rgnp->rgn_mutex); 12186 } 12187 sfmmup = rgnp->rgn_sfmmu_head; 12188 while (sfmmup != NULL) { 12189 if (uselocks) { 12190 hatlockp = sfmmu_hat_enter(sfmmup); 12191 } 12192 12193 /* 12194 * When an SCD is created the SCD hat is linked on the sfmmu 12195 * region lists for each hme region which is part of the 12196 * SCD. If we find an SCD hat, when walking these lists, 12197 * then we flush the shared TSBs, if we find a private hat, 12198 * which is part of an SCD, but where the region 12199 * is not part of the SCD then we flush the private TSBs. 12200 * 12201 * If the Rock page size register is present, then SCDs 12202 * may contain both shared and private pages, so we cannot 12203 * use this optimization to avoid flushing private TSBs. 
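 * When the SCD optimization cannot be used, the region bitmap test
 * below is skipped and every hat on rgn_sfmmu_head has its TSB
 * unloaded and is sent the cross-call.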
12204 */ 12205 if (pgsz_search_on == 0 && 12206 !sfmmup->sfmmu_scdhat && sfmmup->sfmmu_scdp != NULL && 12207 !SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD)) { 12208 scdp = sfmmup->sfmmu_scdp; 12209 if (SF_RGNMAP_TEST(scdp->scd_hmeregion_map, rid)) { 12210 if (uselocks) { 12211 sfmmu_hat_exit(hatlockp); 12212 } 12213 goto next; 12214 } 12215 } 12216 12217 SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, 0); 12218 12219 kpreempt_disable(); 12220 cpuset = sfmmup->sfmmu_cpusran; 12221 CPUSET_AND(cpuset, cpu_ready_set); 12222 CPUSET_DEL(cpuset, CPU->cpu_id); 12223 SFMMU_XCALL_STATS(sfmmup); 12224 xt_some(cpuset, vtag_flushpage_tl1, 12225 (uint64_t)addr, (uint64_t)sfmmup); 12226 vtag_flushpage(addr, (uint64_t)sfmmup); 12227 if (uselocks) { 12228 sfmmu_hat_exit(hatlockp); 12229 } 12230 kpreempt_enable(); 12231 CPUSET_OR(rcpuset, cpuset); 12232 12233 next: 12234 /* LINTED: constant in conditional context */ 12235 SFMMU_HMERID2RLINKP(sfmmup, rid, rlink, 0, 0); 12236 ASSERT(rlink != NULL); 12237 sfmmup = rlink->next; 12238 } 12239 if (uselocks) { 12240 mutex_exit(&rgnp->rgn_mutex); 12241 } 12242 return (rcpuset); 12243 } 12244 12245 /* 12246 * This routine takes an sfmmu pointer and the va for an address in an 12247 * ISM region as input and returns the corresponding region id in ism_rid. 12248 * The return value of 1 indicates that a region has been found and ism_rid 12249 * is valid, otherwise 0 is returned. 12250 */ 12251 static int 12252 find_ism_rid(sfmmu_t *sfmmup, sfmmu_t *ism_sfmmup, caddr_t va, uint_t *ism_rid) 12253 { 12254 ism_blk_t *ism_blkp; 12255 int i; 12256 ism_map_t *ism_map; 12257 #ifdef DEBUG 12258 struct hat *ism_hatid; 12259 #endif 12260 ASSERT(sfmmu_hat_lock_held(sfmmup)); 12261 12262 ism_blkp = sfmmup->sfmmu_iblk; 12263 while (ism_blkp != NULL) { 12264 ism_map = ism_blkp->iblk_maps; 12265 for (i = 0; i < ISM_MAP_SLOTS && ism_map[i].imap_ismhat; i++) { 12266 if ((va >= ism_start(ism_map[i])) && 12267 (va < ism_end(ism_map[i]))) { 12268 12269 *ism_rid = ism_map[i].imap_rid; 12270 #ifdef DEBUG 12271 ism_hatid = ism_map[i].imap_ismhat; 12272 ASSERT(ism_hatid == ism_sfmmup); 12273 ASSERT(ism_hatid->sfmmu_ismhat); 12274 #endif 12275 return (1); 12276 } 12277 } 12278 ism_blkp = ism_blkp->iblk_next; 12279 } 12280 return (0); 12281 } 12282 12283 /* 12284 * Special routine to flush out ism mappings - TSBs, TLBs and D-caches. 12285 * This routine may be called with all cpu's captured. Therefore, the 12286 * caller is responsible for holding all locks and disabling kernel 12287 * preemption. 12288 */ 12289 /* ARGSUSED */ 12290 static void 12291 sfmmu_ismtlbcache_demap(caddr_t addr, sfmmu_t *ism_sfmmup, 12292 struct hme_blk *hmeblkp, pfn_t pfnum, int cache_flush_flag) 12293 { 12294 cpuset_t cpuset; 12295 caddr_t va; 12296 ism_ment_t *ment; 12297 sfmmu_t *sfmmup; 12298 #ifdef VAC 12299 int vcolor; 12300 #endif 12301 12302 sf_scd_t *scdp; 12303 uint_t ism_rid; 12304 12305 ASSERT(!hmeblkp->hblk_shared); 12306 /* 12307 * Walk the ism_hat's mapping list and flush the page 12308 * from every hat sharing this ism_hat. This routine 12309 * may be called while all cpu's have been captured. 12310 * Therefore we can't attempt to grab any locks. For now 12311 * this means we will protect the ism mapping list under 12312 * a single lock which will be grabbed by the caller. 12313 * If hat_share/unshare scalability becomes a performance 12314 * problem then we may need to re-think ism mapping list locking.
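 * The incoming addr is relative to the ism hat's address space, which
 * begins at ISMID_STARTADDR; the offset computed below is added to
 * each sharer's iment_base_va to get the va to flush in that hat.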
12315 */ 12316 ASSERT(ism_sfmmup->sfmmu_ismhat); 12317 ASSERT(MUTEX_HELD(&ism_mlist_lock)); 12318 addr = addr - ISMID_STARTADDR; 12319 12320 for (ment = ism_sfmmup->sfmmu_iment; ment; ment = ment->iment_next) { 12321 12322 sfmmup = ment->iment_hat; 12323 12324 va = ment->iment_base_va; 12325 va = (caddr_t)((uintptr_t)va + (uintptr_t)addr); 12326 12327 /* 12328 * When an SCD is created the SCD hat is linked on the ism 12329 * mapping lists for each ISM segment which is part of the 12330 * SCD. If we find an SCD hat, when walking these lists, 12331 * then we flush the shared TSBs, if we find a private hat, 12332 * which is part of an SCD, but where the region 12333 * corresponding to this va is not part of the SCD then we 12334 * flush the private TSBs. 12335 * 12336 * If the Rock page size register is present, then SCDs 12337 * may contain both shared and private pages, so we cannot 12338 * use this optimization to avoid flushing private TSBs. 12339 */ 12340 if (pgsz_search_on == 0 && 12341 !sfmmup->sfmmu_scdhat && sfmmup->sfmmu_scdp != NULL && 12342 !SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD) && 12343 !SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)) { 12344 if (!find_ism_rid(sfmmup, ism_sfmmup, va, 12345 &ism_rid)) { 12346 cmn_err(CE_PANIC, 12347 "can't find matching ISM rid!"); 12348 } 12349 12350 scdp = sfmmup->sfmmu_scdp; 12351 if (SFMMU_IS_ISMRID_VALID(ism_rid) && 12352 SF_RGNMAP_TEST(scdp->scd_ismregion_map, 12353 ism_rid)) { 12354 continue; 12355 } 12356 } 12357 SFMMU_UNLOAD_TSB(va, sfmmup, hmeblkp, 1); 12358 12359 cpuset = sfmmup->sfmmu_cpusran; 12360 CPUSET_AND(cpuset, cpu_ready_set); 12361 CPUSET_DEL(cpuset, CPU->cpu_id); 12362 SFMMU_XCALL_STATS(sfmmup); 12363 xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)va, 12364 (uint64_t)sfmmup); 12365 vtag_flushpage(va, (uint64_t)sfmmup); 12366 12367 #ifdef VAC 12368 /* 12369 * Flush D$ 12370 * When flushing D$ we must flush all 12371 * cpu's. See sfmmu_cache_flush(). 12372 */ 12373 if (cache_flush_flag == CACHE_FLUSH) { 12374 cpuset = cpu_ready_set; 12375 CPUSET_DEL(cpuset, CPU->cpu_id); 12376 12377 SFMMU_XCALL_STATS(sfmmup); 12378 vcolor = addr_to_vcolor(va); 12379 xt_some(cpuset, vac_flushpage_tl1, pfnum, vcolor); 12380 vac_flushpage(pfnum, vcolor); 12381 } 12382 #endif /* VAC */ 12383 } 12384 } 12385 12386 /* 12387 * Demaps the TSB, CPU caches, and flushes all TLBs on all CPUs of 12388 * a particular virtual address and ctx. If noflush is set we do not 12389 * flush the TLB/TSB. This function may or may not be called with the 12390 * HAT lock held. 12391 */ 12392 static void 12393 sfmmu_tlbcache_demap(caddr_t addr, sfmmu_t *sfmmup, struct hme_blk *hmeblkp, 12394 pfn_t pfnum, int tlb_noflush, int cpu_flag, int cache_flush_flag, 12395 int hat_lock_held) 12396 { 12397 #ifdef VAC 12398 int vcolor; 12399 #endif 12400 cpuset_t cpuset; 12401 hatlock_t *hatlockp; 12402 12403 ASSERT(!hmeblkp->hblk_shared); 12404 12405 #if defined(lint) && !defined(VAC) 12406 pfnum = pfnum; 12407 cpu_flag = cpu_flag; 12408 cache_flush_flag = cache_flush_flag; 12409 #endif 12410 12411 /* 12412 * There is no longer a need to protect against ctx being 12413 * stolen here since we don't store the ctx in the TSB anymore. 12414 */ 12415 #ifdef VAC 12416 vcolor = addr_to_vcolor(addr); 12417 #endif 12418 12419 /* 12420 * We must hold the hat lock during the flush of TLB, 12421 * to avoid a race with sfmmu_invalidate_ctx(), where 12422 * sfmmu_cnum on a MMU could be set to INVALID_CONTEXT, 12423 * causing TLB demap routine to skip flush on that MMU. 
12424 * If the context on a MMU has already been set to 12425 * INVALID_CONTEXT, we just get an extra flush on 12426 * that MMU. 12427 */ 12428 if (!hat_lock_held && !tlb_noflush) 12429 hatlockp = sfmmu_hat_enter(sfmmup); 12430 12431 kpreempt_disable(); 12432 if (!tlb_noflush) { 12433 /* 12434 * Flush the TSB and TLB. 12435 */ 12436 SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, 0); 12437 12438 cpuset = sfmmup->sfmmu_cpusran; 12439 CPUSET_AND(cpuset, cpu_ready_set); 12440 CPUSET_DEL(cpuset, CPU->cpu_id); 12441 12442 SFMMU_XCALL_STATS(sfmmup); 12443 12444 xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)addr, 12445 (uint64_t)sfmmup); 12446 12447 vtag_flushpage(addr, (uint64_t)sfmmup); 12448 } 12449 12450 if (!hat_lock_held && !tlb_noflush) 12451 sfmmu_hat_exit(hatlockp); 12452 12453 #ifdef VAC 12454 /* 12455 * Flush the D$ 12456 * 12457 * Even if the ctx is stolen, we need to flush the 12458 * cache. Our ctx stealer only flushes the TLBs. 12459 */ 12460 if (cache_flush_flag == CACHE_FLUSH) { 12461 if (cpu_flag & FLUSH_ALL_CPUS) { 12462 cpuset = cpu_ready_set; 12463 } else { 12464 cpuset = sfmmup->sfmmu_cpusran; 12465 CPUSET_AND(cpuset, cpu_ready_set); 12466 } 12467 CPUSET_DEL(cpuset, CPU->cpu_id); 12468 SFMMU_XCALL_STATS(sfmmup); 12469 xt_some(cpuset, vac_flushpage_tl1, pfnum, vcolor); 12470 vac_flushpage(pfnum, vcolor); 12471 } 12472 #endif /* VAC */ 12473 kpreempt_enable(); 12474 } 12475 12476 /* 12477 * Demaps the TSB and flushes all TLBs on all cpus for a particular virtual 12478 * address and ctx. If noflush is set we do not currently do anything. 12479 * This function may or may not be called with the HAT lock held. 12480 */ 12481 static void 12482 sfmmu_tlb_demap(caddr_t addr, sfmmu_t *sfmmup, struct hme_blk *hmeblkp, 12483 int tlb_noflush, int hat_lock_held) 12484 { 12485 cpuset_t cpuset; 12486 hatlock_t *hatlockp; 12487 12488 ASSERT(!hmeblkp->hblk_shared); 12489 12490 /* 12491 * If the process is exiting we have nothing to do. 12492 */ 12493 if (tlb_noflush) 12494 return; 12495 12496 /* 12497 * Flush TSB. 12498 */ 12499 if (!hat_lock_held) 12500 hatlockp = sfmmu_hat_enter(sfmmup); 12501 SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, 0); 12502 12503 kpreempt_disable(); 12504 12505 cpuset = sfmmup->sfmmu_cpusran; 12506 CPUSET_AND(cpuset, cpu_ready_set); 12507 CPUSET_DEL(cpuset, CPU->cpu_id); 12508 12509 SFMMU_XCALL_STATS(sfmmup); 12510 xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)addr, (uint64_t)sfmmup); 12511 12512 vtag_flushpage(addr, (uint64_t)sfmmup); 12513 12514 if (!hat_lock_held) 12515 sfmmu_hat_exit(hatlockp); 12516 12517 kpreempt_enable(); 12518 12519 } 12520 12521 /* 12522 * Special case of sfmmu_tlb_demap for MMU_PAGESIZE hblks. Use the xcall 12523 * call handler that can flush a range of pages to save on xcalls. 12524 */ 12525 static int sfmmu_xcall_save; 12526 12527 /* 12528 * this routine is never used for demaping addresses backed by SRD hmeblks. 12529 */ 12530 static void 12531 sfmmu_tlb_range_demap(demap_range_t *dmrp) 12532 { 12533 sfmmu_t *sfmmup = dmrp->dmr_sfmmup; 12534 hatlock_t *hatlockp; 12535 cpuset_t cpuset; 12536 uint64_t sfmmu_pgcnt; 12537 pgcnt_t pgcnt = 0; 12538 int pgunload = 0; 12539 int dirtypg = 0; 12540 caddr_t addr = dmrp->dmr_addr; 12541 caddr_t eaddr; 12542 uint64_t bitvec = dmrp->dmr_bitvec; 12543 12544 ASSERT(bitvec & 1); 12545 12546 /* 12547 * Flush TSB and calculate number of pages to flush. 
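 * dmr_bitvec has one bit per MMU_PAGESIZE page starting at dmr_addr.
 * For example, a bitvec of 0xb (binary 1011) unloads two pages at
 * dmr_addr, skips one page, then unloads one more page, for two
 * sfmmu_unload_tsb_range() calls in total.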
12548 */ 12549 while (bitvec != 0) { 12550 dirtypg = 0; 12551 /* 12552 * Find the first page to flush and then count how many 12553 * pages there are after it that also need to be flushed. 12554 * This way the number of TSB flushes is minimized. 12555 */ 12556 while ((bitvec & 1) == 0) { 12557 pgcnt++; 12558 addr += MMU_PAGESIZE; 12559 bitvec >>= 1; 12560 } 12561 while (bitvec & 1) { 12562 dirtypg++; 12563 bitvec >>= 1; 12564 } 12565 eaddr = addr + ptob(dirtypg); 12566 hatlockp = sfmmu_hat_enter(sfmmup); 12567 sfmmu_unload_tsb_range(sfmmup, addr, eaddr, TTE8K); 12568 sfmmu_hat_exit(hatlockp); 12569 pgunload += dirtypg; 12570 addr = eaddr; 12571 pgcnt += dirtypg; 12572 } 12573 12574 ASSERT((pgcnt<<MMU_PAGESHIFT) <= dmrp->dmr_endaddr - dmrp->dmr_addr); 12575 if (sfmmup->sfmmu_free == 0) { 12576 addr = dmrp->dmr_addr; 12577 bitvec = dmrp->dmr_bitvec; 12578 12579 /* 12580 * make sure it has SFMMU_PGCNT_SHIFT bits only, 12581 * as it will be used to pack argument for xt_some 12582 */ 12583 ASSERT((pgcnt > 0) && 12584 (pgcnt <= (1 << SFMMU_PGCNT_SHIFT))); 12585 12586 /* 12587 * Encode pgcnt as (pgcnt -1 ), and pass (pgcnt - 1) in 12588 * the low 6 bits of sfmmup. This is doable since pgcnt 12589 * always >= 1. 12590 */ 12591 ASSERT(!((uint64_t)sfmmup & SFMMU_PGCNT_MASK)); 12592 sfmmu_pgcnt = (uint64_t)sfmmup | 12593 ((pgcnt - 1) & SFMMU_PGCNT_MASK); 12594 12595 /* 12596 * We must hold the hat lock during the flush of TLB, 12597 * to avoid a race with sfmmu_invalidate_ctx(), where 12598 * sfmmu_cnum on a MMU could be set to INVALID_CONTEXT, 12599 * causing TLB demap routine to skip flush on that MMU. 12600 * If the context on a MMU has already been set to 12601 * INVALID_CONTEXT, we just get an extra flush on 12602 * that MMU. 12603 */ 12604 hatlockp = sfmmu_hat_enter(sfmmup); 12605 kpreempt_disable(); 12606 12607 cpuset = sfmmup->sfmmu_cpusran; 12608 CPUSET_AND(cpuset, cpu_ready_set); 12609 CPUSET_DEL(cpuset, CPU->cpu_id); 12610 12611 SFMMU_XCALL_STATS(sfmmup); 12612 xt_some(cpuset, vtag_flush_pgcnt_tl1, (uint64_t)addr, 12613 sfmmu_pgcnt); 12614 12615 for (; bitvec != 0; bitvec >>= 1) { 12616 if (bitvec & 1) 12617 vtag_flushpage(addr, (uint64_t)sfmmup); 12618 addr += MMU_PAGESIZE; 12619 } 12620 kpreempt_enable(); 12621 sfmmu_hat_exit(hatlockp); 12622 12623 sfmmu_xcall_save += (pgunload-1); 12624 } 12625 dmrp->dmr_bitvec = 0; 12626 } 12627 12628 /* 12629 * In cases where we need to synchronize with TLB/TSB miss trap 12630 * handlers, _and_ need to flush the TLB, it's a lot easier to 12631 * throw away the context from the process than to do a 12632 * special song and dance to keep things consistent for the 12633 * handlers. 12634 * 12635 * Since the process suddenly ends up without a context and our caller 12636 * holds the hat lock, threads that fault after this function is called 12637 * will pile up on the lock. We can then do whatever we need to 12638 * atomically from the context of the caller. The first blocked thread 12639 * to resume executing will get the process a new context, and the 12640 * process will resume executing. 12641 * 12642 * One added advantage of this approach is that on MMUs that 12643 * support a "flush all" operation, we will delay the flush until 12644 * cnum wrap-around, and then flush the TLB one time. This 12645 * is rather rare, so it's a lot less expensive than making 8000 12646 * x-calls to flush the TLB 8000 times. 12647 * 12648 * A per-process (PP) lock is used to synchronize ctx allocations in 12649 * resume() and ctx invalidations here. 
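 * The sequence below is: take the PP lock, mark every per-MMU cnum
 * INVALID_CONTEXT, make that globally visible with a membar, cross
 * call the CPUs in sfmmu_cpusran with sfmmu_raise_tsb_exception, and
 * finally invalidate the local secondary context if this CPU is
 * currently running the victim hat.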
12650 */ 12651 void 12652 sfmmu_invalidate_ctx(sfmmu_t *sfmmup) 12653 { 12654 cpuset_t cpuset; 12655 int cnum, currcnum; 12656 mmu_ctx_t *mmu_ctxp; 12657 int i; 12658 uint_t pstate_save; 12659 12660 SFMMU_STAT(sf_ctx_inv); 12661 12662 ASSERT(sfmmu_hat_lock_held(sfmmup)); 12663 ASSERT(sfmmup != ksfmmup); 12664 12665 kpreempt_disable(); 12666 12667 mmu_ctxp = CPU_MMU_CTXP(CPU); 12668 ASSERT(mmu_ctxp); 12669 ASSERT(mmu_ctxp->mmu_idx < max_mmu_ctxdoms); 12670 ASSERT(mmu_ctxp == mmu_ctxs_tbl[mmu_ctxp->mmu_idx]); 12671 12672 currcnum = sfmmup->sfmmu_ctxs[mmu_ctxp->mmu_idx].cnum; 12673 12674 pstate_save = sfmmu_disable_intrs(); 12675 12676 lock_set(&sfmmup->sfmmu_ctx_lock); /* acquire PP lock */ 12677 /* set HAT cnum invalid across all context domains. */ 12678 for (i = 0; i < max_mmu_ctxdoms; i++) { 12679 12680 cnum = sfmmup->sfmmu_ctxs[i].cnum; 12681 if (cnum == INVALID_CONTEXT) { 12682 continue; 12683 } 12684 12685 sfmmup->sfmmu_ctxs[i].cnum = INVALID_CONTEXT; 12686 } 12687 membar_enter(); /* make sure globally visible to all CPUs */ 12688 lock_clear(&sfmmup->sfmmu_ctx_lock); /* release PP lock */ 12689 12690 sfmmu_enable_intrs(pstate_save); 12691 12692 cpuset = sfmmup->sfmmu_cpusran; 12693 CPUSET_DEL(cpuset, CPU->cpu_id); 12694 CPUSET_AND(cpuset, cpu_ready_set); 12695 if (!CPUSET_ISNULL(cpuset)) { 12696 SFMMU_XCALL_STATS(sfmmup); 12697 xt_some(cpuset, sfmmu_raise_tsb_exception, 12698 (uint64_t)sfmmup, INVALID_CONTEXT); 12699 xt_sync(cpuset); 12700 SFMMU_STAT(sf_tsb_raise_exception); 12701 SFMMU_MMU_STAT(mmu_tsb_raise_exception); 12702 } 12703 12704 /* 12705 * If the hat to-be-invalidated is the same as the current 12706 * process on the local CPU we need to invalidate 12707 * this CPU's context as well. 12708 */ 12709 if ((sfmmu_getctx_sec() == currcnum) && 12710 (currcnum != INVALID_CONTEXT)) { 12711 /* sets shared context to INVALID too */ 12712 sfmmu_setctx_sec(INVALID_CONTEXT); 12713 sfmmu_clear_utsbinfo(); 12714 } 12715 12716 SFMMU_FLAGS_SET(sfmmup, HAT_ALLCTX_INVALID); 12717 12718 kpreempt_enable(); 12719 12720 /* 12721 * we hold the hat lock, so nobody should allocate a context 12722 * for us yet 12723 */ 12724 ASSERT(sfmmup->sfmmu_ctxs[mmu_ctxp->mmu_idx].cnum == INVALID_CONTEXT); 12725 } 12726 12727 #ifdef VAC 12728 /* 12729 * We need to flush the cache on all cpus. It is possible that 12730 * a process referenced a page as cacheable but has since exited 12731 * and cleared the mapping list. We still need to flush the page but 12732 * have no state, so flushing on all cpus is the only alternative. 12733 */ 12734 void 12735 sfmmu_cache_flush(pfn_t pfnum, int vcolor) 12736 { 12737 cpuset_t cpuset; 12738 12739 kpreempt_disable(); 12740 cpuset = cpu_ready_set; 12741 CPUSET_DEL(cpuset, CPU->cpu_id); 12742 SFMMU_XCALL_STATS(NULL); /* account to any ctx */ 12743 xt_some(cpuset, vac_flushpage_tl1, pfnum, vcolor); 12744 xt_sync(cpuset); 12745 vac_flushpage(pfnum, vcolor); 12746 kpreempt_enable(); 12747 } 12748 12749 void 12750 sfmmu_cache_flushcolor(int vcolor, pfn_t pfnum) 12751 { 12752 cpuset_t cpuset; 12753 12754 ASSERT(vcolor >= 0); 12755 12756 kpreempt_disable(); 12757 cpuset = cpu_ready_set; 12758 CPUSET_DEL(cpuset, CPU->cpu_id); 12759 SFMMU_XCALL_STATS(NULL); /* account to any ctx */ 12760 xt_some(cpuset, vac_flushcolor_tl1, vcolor, pfnum); 12761 xt_sync(cpuset); 12762 vac_flushcolor(vcolor, pfnum); 12763 kpreempt_enable(); 12764 } 12765 #endif /* VAC */ 12766 12767 /* 12768 * We need to prevent processes from accessing the TSB using a cached physical 12769 * address.
It's alright if they try to access the TSB via virtual address 12770 * since they will just fault on that virtual address once the mapping has 12771 * been suspended. 12772 */ 12773 #pragma weak sendmondo_in_recover 12774 12775 /* ARGSUSED */ 12776 static int 12777 sfmmu_tsb_pre_relocator(caddr_t va, uint_t tsbsz, uint_t flags, void *tsbinfo) 12778 { 12779 struct tsb_info *tsbinfop = (struct tsb_info *)tsbinfo; 12780 sfmmu_t *sfmmup = tsbinfop->tsb_sfmmu; 12781 hatlock_t *hatlockp; 12782 sf_scd_t *scdp; 12783 12784 if (flags != HAT_PRESUSPEND) 12785 return (0); 12786 12787 /* 12788 * If tsb is a shared TSB with TSB_SHAREDCTX set, sfmmup must 12789 * be a shared hat, then set SCD's tsbinfo's flag. 12790 * If tsb is not shared, sfmmup is a private hat, then set 12791 * its private tsbinfo's flag. 12792 */ 12793 hatlockp = sfmmu_hat_enter(sfmmup); 12794 tsbinfop->tsb_flags |= TSB_RELOC_FLAG; 12795 12796 if (!(tsbinfop->tsb_flags & TSB_SHAREDCTX)) { 12797 sfmmu_tsb_inv_ctx(sfmmup); 12798 sfmmu_hat_exit(hatlockp); 12799 } else { 12800 /* release lock on the shared hat */ 12801 sfmmu_hat_exit(hatlockp); 12802 /* sfmmup is a shared hat */ 12803 ASSERT(sfmmup->sfmmu_scdhat); 12804 scdp = sfmmup->sfmmu_scdp; 12805 ASSERT(scdp != NULL); 12806 /* get private hat from the scd list */ 12807 mutex_enter(&scdp->scd_mutex); 12808 sfmmup = scdp->scd_sf_list; 12809 while (sfmmup != NULL) { 12810 hatlockp = sfmmu_hat_enter(sfmmup); 12811 /* 12812 * We do not call sfmmu_tsb_inv_ctx here because 12813 * sendmondo_in_recover check is only needed for 12814 * sun4u. 12815 */ 12816 sfmmu_invalidate_ctx(sfmmup); 12817 sfmmu_hat_exit(hatlockp); 12818 sfmmup = sfmmup->sfmmu_scd_link.next; 12819 12820 } 12821 mutex_exit(&scdp->scd_mutex); 12822 } 12823 return (0); 12824 } 12825 12826 static void 12827 sfmmu_tsb_inv_ctx(sfmmu_t *sfmmup) 12828 { 12829 extern uint32_t sendmondo_in_recover; 12830 12831 ASSERT(sfmmu_hat_lock_held(sfmmup)); 12832 12833 /* 12834 * For Cheetah+ Erratum 25: 12835 * Wait for any active recovery to finish. We can't risk 12836 * relocating the TSB of the thread running mondo_recover_proc() 12837 * since, if we did that, we would deadlock. The scenario we are 12838 * trying to avoid is as follows: 12839 * 12840 * THIS CPU RECOVER CPU 12841 * -------- ----------- 12842 * Begins recovery, walking through TSB 12843 * hat_pagesuspend() TSB TTE 12844 * TLB miss on TSB TTE, spins at TL1 12845 * xt_sync() 12846 * send_mondo_timeout() 12847 * mondo_recover_proc() 12848 * ((deadlocked)) 12849 * 12850 * The second half of the workaround is that mondo_recover_proc() 12851 * checks to see if the tsb_info has the RELOC flag set, and if it 12852 * does, it skips over that TSB without ever touching tsbinfop->tsb_va 12853 * and hence avoiding the TLB miss that could result in a deadlock. 
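 * sendmondo_in_recover is declared with #pragma weak above, so on
 * platforms that do not define it (e.g. sun4v) its address resolves
 * to NULL and the spin below is skipped entirely.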
12854 */ 12855 if (&sendmondo_in_recover) { 12856 membar_enter(); /* make sure RELOC flag visible */ 12857 while (sendmondo_in_recover) { 12858 drv_usecwait(1); 12859 membar_consumer(); 12860 } 12861 } 12862 12863 sfmmu_invalidate_ctx(sfmmup); 12864 } 12865 12866 /* ARGSUSED */ 12867 static int 12868 sfmmu_tsb_post_relocator(caddr_t va, uint_t tsbsz, uint_t flags, 12869 void *tsbinfo, pfn_t newpfn) 12870 { 12871 hatlock_t *hatlockp; 12872 struct tsb_info *tsbinfop = (struct tsb_info *)tsbinfo; 12873 sfmmu_t *sfmmup = tsbinfop->tsb_sfmmu; 12874 12875 if (flags != HAT_POSTUNSUSPEND) 12876 return (0); 12877 12878 hatlockp = sfmmu_hat_enter(sfmmup); 12879 12880 SFMMU_STAT(sf_tsb_reloc); 12881 12882 /* 12883 * The process may have swapped out while we were relocating one 12884 * of its TSBs. If so, don't bother doing the setup since the 12885 * process can't be using the memory anymore. 12886 */ 12887 if ((tsbinfop->tsb_flags & TSB_SWAPPED) == 0) { 12888 ASSERT(va == tsbinfop->tsb_va); 12889 sfmmu_tsbinfo_setup_phys(tsbinfop, newpfn); 12890 12891 if (tsbinfop->tsb_flags & TSB_FLUSH_NEEDED) { 12892 sfmmu_inv_tsb(tsbinfop->tsb_va, 12893 TSB_BYTES(tsbinfop->tsb_szc)); 12894 tsbinfop->tsb_flags &= ~TSB_FLUSH_NEEDED; 12895 } 12896 } 12897 12898 membar_exit(); 12899 tsbinfop->tsb_flags &= ~TSB_RELOC_FLAG; 12900 cv_broadcast(&sfmmup->sfmmu_tsb_cv); 12901 12902 sfmmu_hat_exit(hatlockp); 12903 12904 return (0); 12905 } 12906 12907 /* 12908 * Allocate and initialize a tsb_info structure. Note that we may or may not 12909 * allocate a TSB here, depending on the flags passed in. 12910 */ 12911 static int 12912 sfmmu_tsbinfo_alloc(struct tsb_info **tsbinfopp, int tsb_szc, int tte_sz_mask, 12913 uint_t flags, sfmmu_t *sfmmup) 12914 { 12915 int err; 12916 12917 *tsbinfopp = (struct tsb_info *)kmem_cache_alloc( 12918 sfmmu_tsbinfo_cache, KM_SLEEP); 12919 12920 if ((err = sfmmu_init_tsbinfo(*tsbinfopp, tte_sz_mask, 12921 tsb_szc, flags, sfmmup)) != 0) { 12922 kmem_cache_free(sfmmu_tsbinfo_cache, *tsbinfopp); 12923 SFMMU_STAT(sf_tsb_allocfail); 12924 *tsbinfopp = NULL; 12925 return (err); 12926 } 12927 SFMMU_STAT(sf_tsb_alloc); 12928 12929 /* 12930 * Bump the TSB size counters for this TSB size. 12931 */ 12932 (*(((int *)&sfmmu_tsbsize_stat) + tsb_szc))++; 12933 return (0); 12934 } 12935 12936 static void 12937 sfmmu_tsb_free(struct tsb_info *tsbinfo) 12938 { 12939 caddr_t tsbva = tsbinfo->tsb_va; 12940 uint_t tsb_size = TSB_BYTES(tsbinfo->tsb_szc); 12941 struct kmem_cache *kmem_cachep = tsbinfo->tsb_cache; 12942 vmem_t *vmp = tsbinfo->tsb_vmp; 12943 12944 /* 12945 * If we allocated this TSB from relocatable kernel memory, then we 12946 * need to uninstall the callback handler. 
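 * Only memory from sfmmu_tsb8k_cache is allocated without a
 * relocation callback (see sfmmu_init_tsbinfo), hence the cache test
 * below. The slab root page is locked across hat_delete_callback()
 * just as it was across hat_add_callback() at setup time.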
12947 */ 12948 if (tsbinfo->tsb_cache != sfmmu_tsb8k_cache) { 12949 uintptr_t slab_mask; 12950 caddr_t slab_vaddr; 12951 page_t **ppl; 12952 int ret; 12953 12954 ASSERT(tsb_size <= MMU_PAGESIZE4M || use_bigtsb_arena); 12955 if (tsb_size > MMU_PAGESIZE4M) 12956 slab_mask = ~((uintptr_t)bigtsb_slab_mask) << PAGESHIFT; 12957 else 12958 slab_mask = ~((uintptr_t)tsb_slab_mask) << PAGESHIFT; 12959 slab_vaddr = (caddr_t)((uintptr_t)tsbva & slab_mask); 12960 12961 ret = as_pagelock(&kas, &ppl, slab_vaddr, PAGESIZE, S_WRITE); 12962 ASSERT(ret == 0); 12963 hat_delete_callback(tsbva, (uint_t)tsb_size, (void *)tsbinfo, 12964 0, NULL); 12965 as_pageunlock(&kas, ppl, slab_vaddr, PAGESIZE, S_WRITE); 12966 } 12967 12968 if (kmem_cachep != NULL) { 12969 kmem_cache_free(kmem_cachep, tsbva); 12970 } else { 12971 vmem_xfree(vmp, (void *)tsbva, tsb_size); 12972 } 12973 tsbinfo->tsb_va = (caddr_t)0xbad00bad; 12974 atomic_add_64(&tsb_alloc_bytes, -(int64_t)tsb_size); 12975 } 12976 12977 static void 12978 sfmmu_tsbinfo_free(struct tsb_info *tsbinfo) 12979 { 12980 if ((tsbinfo->tsb_flags & TSB_SWAPPED) == 0) { 12981 sfmmu_tsb_free(tsbinfo); 12982 } 12983 kmem_cache_free(sfmmu_tsbinfo_cache, tsbinfo); 12984 12985 } 12986 12987 /* 12988 * Setup all the references to physical memory for this tsbinfo. 12989 * The underlying page(s) must be locked. 12990 */ 12991 static void 12992 sfmmu_tsbinfo_setup_phys(struct tsb_info *tsbinfo, pfn_t pfn) 12993 { 12994 ASSERT(pfn != PFN_INVALID); 12995 ASSERT(pfn == va_to_pfn(tsbinfo->tsb_va)); 12996 12997 #ifndef sun4v 12998 if (tsbinfo->tsb_szc == 0) { 12999 sfmmu_memtte(&tsbinfo->tsb_tte, pfn, 13000 PROT_WRITE|PROT_READ, TTE8K); 13001 } else { 13002 /* 13003 * Round down PA and use a large mapping; the handlers will 13004 * compute the TSB pointer at the correct offset into the 13005 * big virtual page. NOTE: this assumes all TSBs larger 13006 * than 8K must come from physically contiguous slabs of 13007 * size tsb_slab_size. 13008 */ 13009 sfmmu_memtte(&tsbinfo->tsb_tte, pfn & ~tsb_slab_mask, 13010 PROT_WRITE|PROT_READ, tsb_slab_ttesz); 13011 } 13012 tsbinfo->tsb_pa = ptob(pfn); 13013 13014 TTE_SET_LOCKED(&tsbinfo->tsb_tte); /* lock the tte into dtlb */ 13015 TTE_SET_MOD(&tsbinfo->tsb_tte); /* enable writes */ 13016 13017 ASSERT(TTE_IS_PRIVILEGED(&tsbinfo->tsb_tte)); 13018 ASSERT(TTE_IS_LOCKED(&tsbinfo->tsb_tte)); 13019 #else /* sun4v */ 13020 tsbinfo->tsb_pa = ptob(pfn); 13021 #endif /* sun4v */ 13022 } 13023 13024 13025 /* 13026 * Returns zero on success, ENOMEM if over the high water mark, 13027 * or EAGAIN if the caller needs to retry with a smaller TSB 13028 * size (or specify TSB_FORCEALLOC if the allocation can't fail). 13029 * 13030 * This call cannot fail to allocate a TSB if TSB_FORCEALLOC 13031 * is specified and the TSB requested is PAGESIZE, though it 13032 * may sleep waiting for memory if sufficient memory is not 13033 * available. 
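 * Note that TSB_FORCEALLOC is only accepted together with
 * TSB_MIN_SZCODE (enforced by an ASSERT below), which is what allows
 * a forced allocation to fall back to the KM_SLEEP 8K kmem cache.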
13034 */ 13035 static int 13036 sfmmu_init_tsbinfo(struct tsb_info *tsbinfo, int tteszmask, 13037 int tsbcode, uint_t flags, sfmmu_t *sfmmup) 13038 { 13039 caddr_t vaddr = NULL; 13040 caddr_t slab_vaddr; 13041 uintptr_t slab_mask; 13042 int tsbbytes = TSB_BYTES(tsbcode); 13043 int lowmem = 0; 13044 struct kmem_cache *kmem_cachep = NULL; 13045 vmem_t *vmp = NULL; 13046 lgrp_id_t lgrpid = LGRP_NONE; 13047 pfn_t pfn; 13048 uint_t cbflags = HAC_SLEEP; 13049 page_t **pplist; 13050 int ret; 13051 13052 ASSERT(tsbbytes <= MMU_PAGESIZE4M || use_bigtsb_arena); 13053 if (tsbbytes > MMU_PAGESIZE4M) 13054 slab_mask = ~((uintptr_t)bigtsb_slab_mask) << PAGESHIFT; 13055 else 13056 slab_mask = ~((uintptr_t)tsb_slab_mask) << PAGESHIFT; 13057 13058 if (flags & (TSB_FORCEALLOC | TSB_SWAPIN | TSB_GROW | TSB_SHRINK)) 13059 flags |= TSB_ALLOC; 13060 13061 ASSERT((flags & TSB_FORCEALLOC) == 0 || tsbcode == TSB_MIN_SZCODE); 13062 13063 tsbinfo->tsb_sfmmu = sfmmup; 13064 13065 /* 13066 * If not allocating a TSB, set up the tsbinfo, set TSB_SWAPPED, and 13067 * return. 13068 */ 13069 if ((flags & TSB_ALLOC) == 0) { 13070 tsbinfo->tsb_szc = tsbcode; 13071 tsbinfo->tsb_ttesz_mask = tteszmask; 13072 tsbinfo->tsb_va = (caddr_t)0xbadbadbeef; 13073 tsbinfo->tsb_pa = -1; 13074 tsbinfo->tsb_tte.ll = 0; 13075 tsbinfo->tsb_next = NULL; 13076 tsbinfo->tsb_flags = TSB_SWAPPED; 13077 tsbinfo->tsb_cache = NULL; 13078 tsbinfo->tsb_vmp = NULL; 13079 return (0); 13080 } 13081 13082 #ifdef DEBUG 13083 /* 13084 * For debugging: 13085 * Randomly force allocation failures every tsb_alloc_mtbf 13086 * tries if TSB_FORCEALLOC is not specified. This will 13087 * return ENOMEM if tsb_alloc_mtbf is odd, or EAGAIN if 13088 * it is even, to allow testing of both failure paths... 13089 */ 13090 if (tsb_alloc_mtbf && ((flags & TSB_FORCEALLOC) == 0) && 13091 (tsb_alloc_count++ == tsb_alloc_mtbf)) { 13092 tsb_alloc_count = 0; 13093 tsb_alloc_fail_mtbf++; 13094 return ((tsb_alloc_mtbf & 1)? ENOMEM : EAGAIN); 13095 } 13096 #endif /* DEBUG */ 13097 13098 /* 13099 * Enforce high water mark if we are not doing a forced allocation 13100 * and are not shrinking a process' TSB. 13101 */ 13102 if ((flags & TSB_SHRINK) == 0 && 13103 (tsbbytes + tsb_alloc_bytes) > tsb_alloc_hiwater) { 13104 if ((flags & TSB_FORCEALLOC) == 0) 13105 return (ENOMEM); 13106 lowmem = 1; 13107 } 13108 13109 /* 13110 * Allocate from the correct location based upon the size of the TSB 13111 * compared to the base page size, and what memory conditions dictate. 13112 * Note we always do nonblocking allocations from the TSB arena since 13113 * we don't want memory fragmentation to cause processes to block 13114 * indefinitely waiting for memory; until the kernel algorithms that 13115 * coalesce large pages are improved this is our best option. 
13116 * 13117 * Algorithm: 13118 * If allocating a "large" TSB (>8K), allocate from the 13119 * appropriate kmem_tsb_default_arena vmem arena 13120 * else if low on memory or the TSB_FORCEALLOC flag is set or 13121 * tsb_forceheap is set 13122 * Allocate from kernel heap via sfmmu_tsb8k_cache with 13123 * KM_SLEEP (never fails) 13124 * else 13125 * Allocate from appropriate sfmmu_tsb_cache with 13126 * KM_NOSLEEP 13127 * endif 13128 */ 13129 if (tsb_lgrp_affinity) 13130 lgrpid = lgrp_home_id(curthread); 13131 if (lgrpid == LGRP_NONE) 13132 lgrpid = 0; /* use lgrp of boot CPU */ 13133 13134 if (tsbbytes > MMU_PAGESIZE) { 13135 if (tsbbytes > MMU_PAGESIZE4M) { 13136 vmp = kmem_bigtsb_default_arena[lgrpid]; 13137 vaddr = (caddr_t)vmem_xalloc(vmp, tsbbytes, tsbbytes, 13138 0, 0, NULL, NULL, VM_NOSLEEP); 13139 } else { 13140 vmp = kmem_tsb_default_arena[lgrpid]; 13141 vaddr = (caddr_t)vmem_xalloc(vmp, tsbbytes, tsbbytes, 13142 0, 0, NULL, NULL, VM_NOSLEEP); 13143 } 13144 #ifdef DEBUG 13145 } else if (lowmem || (flags & TSB_FORCEALLOC) || tsb_forceheap) { 13146 #else /* !DEBUG */ 13147 } else if (lowmem || (flags & TSB_FORCEALLOC)) { 13148 #endif /* DEBUG */ 13149 kmem_cachep = sfmmu_tsb8k_cache; 13150 vaddr = (caddr_t)kmem_cache_alloc(kmem_cachep, KM_SLEEP); 13151 ASSERT(vaddr != NULL); 13152 } else { 13153 kmem_cachep = sfmmu_tsb_cache[lgrpid]; 13154 vaddr = (caddr_t)kmem_cache_alloc(kmem_cachep, KM_NOSLEEP); 13155 } 13156 13157 tsbinfo->tsb_cache = kmem_cachep; 13158 tsbinfo->tsb_vmp = vmp; 13159 13160 if (vaddr == NULL) { 13161 return (EAGAIN); 13162 } 13163 13164 atomic_add_64(&tsb_alloc_bytes, (int64_t)tsbbytes); 13165 kmem_cachep = tsbinfo->tsb_cache; 13166 13167 /* 13168 * If we are allocating from outside the cage, then we need to 13169 * register a relocation callback handler. Note that for now 13170 * since pseudo mappings always hang off of the slab's root page, 13171 * we need only lock the first 8K of the TSB slab. This is a bit 13172 * hacky but it is good for performance. 13173 */ 13174 if (kmem_cachep != sfmmu_tsb8k_cache) { 13175 slab_vaddr = (caddr_t)((uintptr_t)vaddr & slab_mask); 13176 ret = as_pagelock(&kas, &pplist, slab_vaddr, PAGESIZE, S_WRITE); 13177 ASSERT(ret == 0); 13178 ret = hat_add_callback(sfmmu_tsb_cb_id, vaddr, (uint_t)tsbbytes, 13179 cbflags, (void *)tsbinfo, &pfn, NULL); 13180 13181 /* 13182 * Need to free up resources if we could not successfully 13183 * add the callback function and return an error condition. 13184 */ 13185 if (ret != 0) { 13186 if (kmem_cachep) { 13187 kmem_cache_free(kmem_cachep, vaddr); 13188 } else { 13189 vmem_xfree(vmp, (void *)vaddr, tsbbytes); 13190 } 13191 as_pageunlock(&kas, pplist, slab_vaddr, PAGESIZE, 13192 S_WRITE); 13193 return (EAGAIN); 13194 } 13195 } else { 13196 /* 13197 * Since allocation of 8K TSBs from heap is rare and occurs 13198 * during memory pressure we allocate them from permanent 13199 * memory rather than using callbacks to get the PFN. 
13200 */ 13201 pfn = hat_getpfnum(kas.a_hat, vaddr); 13202 } 13203 13204 tsbinfo->tsb_va = vaddr; 13205 tsbinfo->tsb_szc = tsbcode; 13206 tsbinfo->tsb_ttesz_mask = tteszmask; 13207 tsbinfo->tsb_next = NULL; 13208 tsbinfo->tsb_flags = 0; 13209 13210 sfmmu_tsbinfo_setup_phys(tsbinfo, pfn); 13211 13212 sfmmu_inv_tsb(vaddr, tsbbytes); 13213 13214 if (kmem_cachep != sfmmu_tsb8k_cache) { 13215 as_pageunlock(&kas, pplist, slab_vaddr, PAGESIZE, S_WRITE); 13216 } 13217 13218 return (0); 13219 } 13220 13221 /* 13222 * Initialize per cpu tsb and per cpu tsbmiss_area 13223 */ 13224 void 13225 sfmmu_init_tsbs(void) 13226 { 13227 int i; 13228 struct tsbmiss *tsbmissp; 13229 struct kpmtsbm *kpmtsbmp; 13230 #ifndef sun4v 13231 extern int dcache_line_mask; 13232 #endif /* sun4v */ 13233 extern uint_t vac_colors; 13234 13235 /* 13236 * Init. tsb miss area. 13237 */ 13238 tsbmissp = tsbmiss_area; 13239 13240 for (i = 0; i < NCPU; tsbmissp++, i++) { 13241 /* 13242 * initialize the tsbmiss area. 13243 * Do this for all possible CPUs as some may be added 13244 * while the system is running. There is no cost to this. 13245 */ 13246 tsbmissp->ksfmmup = ksfmmup; 13247 #ifndef sun4v 13248 tsbmissp->dcache_line_mask = (uint16_t)dcache_line_mask; 13249 #endif /* sun4v */ 13250 tsbmissp->khashstart = 13251 (struct hmehash_bucket *)va_to_pa((caddr_t)khme_hash); 13252 tsbmissp->uhashstart = 13253 (struct hmehash_bucket *)va_to_pa((caddr_t)uhme_hash); 13254 tsbmissp->khashsz = khmehash_num; 13255 tsbmissp->uhashsz = uhmehash_num; 13256 } 13257 13258 sfmmu_tsb_cb_id = hat_register_callback('T'<<16 | 'S' << 8 | 'B', 13259 sfmmu_tsb_pre_relocator, sfmmu_tsb_post_relocator, NULL, 0); 13260 13261 if (kpm_enable == 0) 13262 return; 13263 13264 /* -- Begin KPM specific init -- */ 13265 13266 if (kpm_smallpages) { 13267 /* 13268 * If we're using base pagesize pages for seg_kpm 13269 * mappings, we use the kernel TSB since we can't afford 13270 * to allocate a second huge TSB for these mappings. 13271 */ 13272 kpm_tsbbase = ktsb_phys? ktsb_pbase : (uint64_t)ktsb_base; 13273 kpm_tsbsz = ktsb_szcode; 13274 kpmsm_tsbbase = kpm_tsbbase; 13275 kpmsm_tsbsz = kpm_tsbsz; 13276 } else { 13277 /* 13278 * In VAC conflict case, just put the entries in the 13279 * kernel 8K indexed TSB for now so we can find them. 13280 * This could really be changed in the future if we feel 13281 * the need... 13282 */ 13283 kpmsm_tsbbase = ktsb_phys? ktsb_pbase : (uint64_t)ktsb_base; 13284 kpmsm_tsbsz = ktsb_szcode; 13285 kpm_tsbbase = ktsb_phys? ktsb4m_pbase : (uint64_t)ktsb4m_base; 13286 kpm_tsbsz = ktsb4m_szcode; 13287 } 13288 13289 kpmtsbmp = kpmtsbm_area; 13290 for (i = 0; i < NCPU; kpmtsbmp++, i++) { 13291 /* 13292 * Initialize the kpmtsbm area. 13293 * Do this for all possible CPUs as some may be added 13294 * while the system is running. There is no cost to this. 13295 */ 13296 kpmtsbmp->vbase = kpm_vbase; 13297 kpmtsbmp->vend = kpm_vbase + kpm_size * vac_colors; 13298 kpmtsbmp->sz_shift = kpm_size_shift; 13299 kpmtsbmp->kpmp_shift = kpmp_shift; 13300 kpmtsbmp->kpmp2pshft = (uchar_t)kpmp2pshft; 13301 if (kpm_smallpages == 0) { 13302 kpmtsbmp->kpmp_table_sz = kpmp_table_sz; 13303 kpmtsbmp->kpmp_tablepa = va_to_pa(kpmp_table); 13304 } else { 13305 kpmtsbmp->kpmp_table_sz = kpmp_stable_sz; 13306 kpmtsbmp->kpmp_tablepa = va_to_pa(kpmp_stable); 13307 } 13308 kpmtsbmp->msegphashpa = va_to_pa(memseg_phash); 13309 kpmtsbmp->flags = KPMTSBM_ENABLE_FLAG; 13310 #ifdef DEBUG 13311 kpmtsbmp->flags |= (kpm_tsbmtl) ? 
KPMTSBM_TLTSBM_FLAG : 0; 13312 #endif /* DEBUG */ 13313 if (ktsb_phys) 13314 kpmtsbmp->flags |= KPMTSBM_TSBPHYS_FLAG; 13315 } 13316 13317 /* -- End KPM specific init -- */ 13318 } 13319 13320 /* Avoid using sfmmu_tsbinfo_alloc() to avoid kmem_alloc - no real reason */ 13321 struct tsb_info ktsb_info[2]; 13322 13323 /* 13324 * Called from hat_kern_setup() to setup the tsb_info for ksfmmup. 13325 */ 13326 void 13327 sfmmu_init_ktsbinfo() 13328 { 13329 ASSERT(ksfmmup != NULL); 13330 ASSERT(ksfmmup->sfmmu_tsb == NULL); 13331 /* 13332 * Allocate tsbinfos for kernel and copy in data 13333 * to make debug easier and sun4v setup easier. 13334 */ 13335 ktsb_info[0].tsb_sfmmu = ksfmmup; 13336 ktsb_info[0].tsb_szc = ktsb_szcode; 13337 ktsb_info[0].tsb_ttesz_mask = TSB8K|TSB64K|TSB512K; 13338 ktsb_info[0].tsb_va = ktsb_base; 13339 ktsb_info[0].tsb_pa = ktsb_pbase; 13340 ktsb_info[0].tsb_flags = 0; 13341 ktsb_info[0].tsb_tte.ll = 0; 13342 ktsb_info[0].tsb_cache = NULL; 13343 13344 ktsb_info[1].tsb_sfmmu = ksfmmup; 13345 ktsb_info[1].tsb_szc = ktsb4m_szcode; 13346 ktsb_info[1].tsb_ttesz_mask = TSB4M; 13347 ktsb_info[1].tsb_va = ktsb4m_base; 13348 ktsb_info[1].tsb_pa = ktsb4m_pbase; 13349 ktsb_info[1].tsb_flags = 0; 13350 ktsb_info[1].tsb_tte.ll = 0; 13351 ktsb_info[1].tsb_cache = NULL; 13352 13353 /* Link them into ksfmmup. */ 13354 ktsb_info[0].tsb_next = &ktsb_info[1]; 13355 ktsb_info[1].tsb_next = NULL; 13356 ksfmmup->sfmmu_tsb = &ktsb_info[0]; 13357 13358 sfmmu_setup_tsbinfo(ksfmmup); 13359 } 13360 13361 /* 13362 * Cache the last value returned from va_to_pa(). If the VA specified 13363 * in the current call to cached_va_to_pa() maps to the same Page (as the 13364 * previous call to cached_va_to_pa()), then compute the PA using 13365 * cached info, else call va_to_pa(). 13366 * 13367 * Note: this function is neither MT-safe nor consistent in the presence 13368 * of multiple, interleaved threads. This function was created to enable 13369 * an optimization used during boot (at a point when there's only one thread 13370 * executing on the "boot CPU", and before startup_vm() has been called). 13371 */ 13372 static uint64_t 13373 cached_va_to_pa(void *vaddr) 13374 { 13375 static uint64_t prev_vaddr_base = 0; 13376 static uint64_t prev_pfn = 0; 13377 13378 if ((((uint64_t)vaddr) & MMU_PAGEMASK) == prev_vaddr_base) { 13379 return (prev_pfn | ((uint64_t)vaddr & MMU_PAGEOFFSET)); 13380 } else { 13381 uint64_t pa = va_to_pa(vaddr); 13382 13383 if (pa != ((uint64_t)-1)) { 13384 /* 13385 * Computed physical address is valid. Cache its 13386 * related info for the next cached_va_to_pa() call. 13387 */ 13388 prev_pfn = pa & MMU_PAGEMASK; 13389 prev_vaddr_base = ((uint64_t)vaddr) & MMU_PAGEMASK; 13390 } 13391 13392 return (pa); 13393 } 13394 } 13395 13396 /* 13397 * Carve up our nucleus hblk region. We may allocate more hblks than 13398 * asked due to rounding errors but we are guaranteed to have at least 13399 * enough space to allocate the requested number of hblk8's and hblk1's. 
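 * hblk8's are carved from the front of the region up to hblk8_bound,
 * which reserves nhblk1 * hme1blk_sz bytes at the tail for the
 * hblk1's carved next; every hmeblk gets hblk_nuc_bit set and its
 * physical address precomputed into hblk_nextpa via cached_va_to_pa().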
13400 */ 13401 void 13402 sfmmu_init_nucleus_hblks(caddr_t addr, size_t size, int nhblk8, int nhblk1) 13403 { 13404 struct hme_blk *hmeblkp; 13405 size_t hme8blk_sz, hme1blk_sz; 13406 size_t i; 13407 size_t hblk8_bound; 13408 ulong_t j = 0, k = 0; 13409 13410 ASSERT(addr != NULL && size != 0); 13411 13412 /* Need to use proper structure alignment */ 13413 hme8blk_sz = roundup(HME8BLK_SZ, sizeof (int64_t)); 13414 hme1blk_sz = roundup(HME1BLK_SZ, sizeof (int64_t)); 13415 13416 nucleus_hblk8.list = (void *)addr; 13417 nucleus_hblk8.index = 0; 13418 13419 /* 13420 * Use as much memory as possible for hblk8's since we 13421 * expect all bop_alloc'ed memory to be allocated in 8k chunks. 13422 * We need to hold back enough space for the hblk1's which 13423 * we'll allocate next. 13424 */ 13425 hblk8_bound = size - (nhblk1 * hme1blk_sz) - hme8blk_sz; 13426 for (i = 0; i <= hblk8_bound; i += hme8blk_sz, j++) { 13427 hmeblkp = (struct hme_blk *)addr; 13428 addr += hme8blk_sz; 13429 hmeblkp->hblk_nuc_bit = 1; 13430 hmeblkp->hblk_nextpa = cached_va_to_pa((caddr_t)hmeblkp); 13431 } 13432 nucleus_hblk8.len = j; 13433 ASSERT(j >= nhblk8); 13434 SFMMU_STAT_ADD(sf_hblk8_ncreate, j); 13435 13436 nucleus_hblk1.list = (void *)addr; 13437 nucleus_hblk1.index = 0; 13438 for (; i <= (size - hme1blk_sz); i += hme1blk_sz, k++) { 13439 hmeblkp = (struct hme_blk *)addr; 13440 addr += hme1blk_sz; 13441 hmeblkp->hblk_nuc_bit = 1; 13442 hmeblkp->hblk_nextpa = cached_va_to_pa((caddr_t)hmeblkp); 13443 } 13444 ASSERT(k >= nhblk1); 13445 nucleus_hblk1.len = k; 13446 SFMMU_STAT_ADD(sf_hblk1_ncreate, k); 13447 } 13448 13449 /* 13450 * This function is currently not supported on this platform. For what 13451 * it's supposed to do, see hat.c and hat_srmmu.c 13452 */ 13453 /* ARGSUSED */ 13454 faultcode_t 13455 hat_softlock(struct hat *hat, caddr_t addr, size_t *lenp, page_t **ppp, 13456 uint_t flags) 13457 { 13458 ASSERT(hat->sfmmu_xhat_provider == NULL); 13459 return (FC_NOSUPPORT); 13460 } 13461 13462 /* 13463 * Searchs the mapping list of the page for a mapping of the same size. If not 13464 * found the corresponding bit is cleared in the p_index field. When large 13465 * pages are more prevalent in the system, we can maintain the mapping list 13466 * in order and we don't have to traverse the list each time. Just check the 13467 * next and prev entries, and if both are of different size, we clear the bit. 13468 */ 13469 static void 13470 sfmmu_rm_large_mappings(page_t *pp, int ttesz) 13471 { 13472 struct sf_hment *sfhmep; 13473 struct hme_blk *hmeblkp; 13474 int index; 13475 pgcnt_t npgs; 13476 13477 ASSERT(ttesz > TTE8K); 13478 13479 ASSERT(sfmmu_mlist_held(pp)); 13480 13481 ASSERT(PP_ISMAPPED_LARGE(pp)); 13482 13483 /* 13484 * Traverse mapping list looking for another mapping of same size. 13485 * since we only want to clear index field if all mappings of 13486 * that size are gone. 13487 */ 13488 13489 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) { 13490 if (IS_PAHME(sfhmep)) 13491 continue; 13492 hmeblkp = sfmmu_hmetohblk(sfhmep); 13493 if (hmeblkp->hblk_xhat_bit) 13494 continue; 13495 if (hme_size(sfhmep) == ttesz) { 13496 /* 13497 * another mapping of the same size. don't clear index. 13498 */ 13499 return; 13500 } 13501 } 13502 13503 /* 13504 * Clear the p_index bit for large page. 
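 * PAGESZ_TO_INDEX(ttesz) gives the bit for this mapping size, and the
 * bit is kept on every constituent 8K page, so the loop below clears
 * it in all TTEPAGES(ttesz) page_t's via PP_PAGENEXT().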
13505 */ 13506 index = PAGESZ_TO_INDEX(ttesz); 13507 npgs = TTEPAGES(ttesz); 13508 while (npgs-- > 0) { 13509 ASSERT(pp->p_index & index); 13510 pp->p_index &= ~index; 13511 pp = PP_PAGENEXT(pp); 13512 } 13513 } 13514 13515 /* 13516 * return supported features 13517 */ 13518 /* ARGSUSED */ 13519 int 13520 hat_supported(enum hat_features feature, void *arg) 13521 { 13522 switch (feature) { 13523 case HAT_SHARED_PT: 13524 case HAT_DYNAMIC_ISM_UNMAP: 13525 case HAT_VMODSORT: 13526 return (1); 13527 case HAT_SHARED_REGIONS: 13528 if (shctx_on) 13529 return (1); 13530 else 13531 return (0); 13532 default: 13533 return (0); 13534 } 13535 } 13536 13537 void 13538 hat_enter(struct hat *hat) 13539 { 13540 hatlock_t *hatlockp; 13541 13542 if (hat != ksfmmup) { 13543 hatlockp = TSB_HASH(hat); 13544 mutex_enter(HATLOCK_MUTEXP(hatlockp)); 13545 } 13546 } 13547 13548 void 13549 hat_exit(struct hat *hat) 13550 { 13551 hatlock_t *hatlockp; 13552 13553 if (hat != ksfmmup) { 13554 hatlockp = TSB_HASH(hat); 13555 mutex_exit(HATLOCK_MUTEXP(hatlockp)); 13556 } 13557 } 13558 13559 /*ARGSUSED*/ 13560 void 13561 hat_reserve(struct as *as, caddr_t addr, size_t len) 13562 { 13563 } 13564 13565 static void 13566 hat_kstat_init(void) 13567 { 13568 kstat_t *ksp; 13569 13570 ksp = kstat_create("unix", 0, "sfmmu_global_stat", "hat", 13571 KSTAT_TYPE_RAW, sizeof (struct sfmmu_global_stat), 13572 KSTAT_FLAG_VIRTUAL); 13573 if (ksp) { 13574 ksp->ks_data = (void *) &sfmmu_global_stat; 13575 kstat_install(ksp); 13576 } 13577 ksp = kstat_create("unix", 0, "sfmmu_tsbsize_stat", "hat", 13578 KSTAT_TYPE_RAW, sizeof (struct sfmmu_tsbsize_stat), 13579 KSTAT_FLAG_VIRTUAL); 13580 if (ksp) { 13581 ksp->ks_data = (void *) &sfmmu_tsbsize_stat; 13582 kstat_install(ksp); 13583 } 13584 ksp = kstat_create("unix", 0, "sfmmu_percpu_stat", "hat", 13585 KSTAT_TYPE_RAW, sizeof (struct sfmmu_percpu_stat) * NCPU, 13586 KSTAT_FLAG_WRITABLE); 13587 if (ksp) { 13588 ksp->ks_update = sfmmu_kstat_percpu_update; 13589 kstat_install(ksp); 13590 } 13591 } 13592 13593 /* ARGSUSED */ 13594 static int 13595 sfmmu_kstat_percpu_update(kstat_t *ksp, int rw) 13596 { 13597 struct sfmmu_percpu_stat *cpu_kstat = ksp->ks_data; 13598 struct tsbmiss *tsbm = tsbmiss_area; 13599 struct kpmtsbm *kpmtsbm = kpmtsbm_area; 13600 int i; 13601 13602 ASSERT(cpu_kstat); 13603 if (rw == KSTAT_READ) { 13604 for (i = 0; i < NCPU; cpu_kstat++, tsbm++, kpmtsbm++, i++) { 13605 cpu_kstat->sf_itlb_misses = 0; 13606 cpu_kstat->sf_dtlb_misses = 0; 13607 cpu_kstat->sf_utsb_misses = tsbm->utsb_misses - 13608 tsbm->uprot_traps; 13609 cpu_kstat->sf_ktsb_misses = tsbm->ktsb_misses + 13610 kpmtsbm->kpm_tsb_misses - tsbm->kprot_traps; 13611 cpu_kstat->sf_tsb_hits = 0; 13612 cpu_kstat->sf_umod_faults = tsbm->uprot_traps; 13613 cpu_kstat->sf_kmod_faults = tsbm->kprot_traps; 13614 } 13615 } else { 13616 /* KSTAT_WRITE is used to clear stats */ 13617 for (i = 0; i < NCPU; tsbm++, kpmtsbm++, i++) { 13618 tsbm->utsb_misses = 0; 13619 tsbm->ktsb_misses = 0; 13620 tsbm->uprot_traps = 0; 13621 tsbm->kprot_traps = 0; 13622 kpmtsbm->kpm_dtlb_misses = 0; 13623 kpmtsbm->kpm_tsb_misses = 0; 13624 } 13625 } 13626 return (0); 13627 } 13628 13629 #ifdef DEBUG 13630 13631 tte_t *gorig[NCPU], *gcur[NCPU], *gnew[NCPU]; 13632 13633 /* 13634 * A tte checker. *orig_old is the value we read before cas. 13635 * *cur is the value returned by cas. 13636 * *new is the desired value when we do the cas. 13637 * 13638 * *hmeblkp is currently unused. 
13639 */ 13640 13641 /* ARGSUSED */ 13642 void 13643 chk_tte(tte_t *orig_old, tte_t *cur, tte_t *new, struct hme_blk *hmeblkp) 13644 { 13645 pfn_t i, j, k; 13646 int cpuid = CPU->cpu_id; 13647 13648 gorig[cpuid] = orig_old; 13649 gcur[cpuid] = cur; 13650 gnew[cpuid] = new; 13651 13652 #ifdef lint 13653 hmeblkp = hmeblkp; 13654 #endif 13655 13656 if (TTE_IS_VALID(orig_old)) { 13657 if (TTE_IS_VALID(cur)) { 13658 i = TTE_TO_TTEPFN(orig_old); 13659 j = TTE_TO_TTEPFN(cur); 13660 k = TTE_TO_TTEPFN(new); 13661 if (i != j) { 13662 /* remap error? */ 13663 panic("chk_tte: bad pfn, 0x%lx, 0x%lx", i, j); 13664 } 13665 13666 if (i != k) { 13667 /* remap error? */ 13668 panic("chk_tte: bad pfn2, 0x%lx, 0x%lx", i, k); 13669 } 13670 } else { 13671 if (TTE_IS_VALID(new)) { 13672 panic("chk_tte: invalid cur? "); 13673 } 13674 13675 i = TTE_TO_TTEPFN(orig_old); 13676 k = TTE_TO_TTEPFN(new); 13677 if (i != k) { 13678 panic("chk_tte: bad pfn3, 0x%lx, 0x%lx", i, k); 13679 } 13680 } 13681 } else { 13682 if (TTE_IS_VALID(cur)) { 13683 j = TTE_TO_TTEPFN(cur); 13684 if (TTE_IS_VALID(new)) { 13685 k = TTE_TO_TTEPFN(new); 13686 if (j != k) { 13687 panic("chk_tte: bad pfn4, 0x%lx, 0x%lx", 13688 j, k); 13689 } 13690 } else { 13691 panic("chk_tte: why here?"); 13692 } 13693 } else { 13694 if (!TTE_IS_VALID(new)) { 13695 panic("chk_tte: why here2 ?"); 13696 } 13697 } 13698 } 13699 } 13700 13701 #endif /* DEBUG */ 13702 13703 extern void prefetch_tsbe_read(struct tsbe *); 13704 extern void prefetch_tsbe_write(struct tsbe *); 13705 13706 13707 /* 13708 * We want to prefetch 7 cache lines ahead for our read prefetch. This gives 13709 * us optimal performance on Cheetah+. You can only have 8 outstanding 13710 * prefetches at any one time, so we opted for 7 read prefetches and 1 write 13711 * prefetch to make the most utilization of the prefetch capability. 13712 */ 13713 #define TSBE_PREFETCH_STRIDE (7) 13714 13715 void 13716 sfmmu_copy_tsb(struct tsb_info *old_tsbinfo, struct tsb_info *new_tsbinfo) 13717 { 13718 int old_bytes = TSB_BYTES(old_tsbinfo->tsb_szc); 13719 int new_bytes = TSB_BYTES(new_tsbinfo->tsb_szc); 13720 int old_entries = TSB_ENTRIES(old_tsbinfo->tsb_szc); 13721 int new_entries = TSB_ENTRIES(new_tsbinfo->tsb_szc); 13722 struct tsbe *old; 13723 struct tsbe *new; 13724 struct tsbe *new_base = (struct tsbe *)new_tsbinfo->tsb_va; 13725 uint64_t va; 13726 int new_offset; 13727 int i; 13728 int vpshift; 13729 int last_prefetch; 13730 13731 if (old_bytes == new_bytes) { 13732 bcopy(old_tsbinfo->tsb_va, new_tsbinfo->tsb_va, new_bytes); 13733 } else { 13734 13735 /* 13736 * A TSBE is 16 bytes which means there are four TSBE's per 13737 * P$ line (64 bytes), thus every 4 TSBE's we prefetch. 13738 */ 13739 old = (struct tsbe *)old_tsbinfo->tsb_va; 13740 last_prefetch = old_entries - (4*(TSBE_PREFETCH_STRIDE+1)); 13741 for (i = 0; i < old_entries; i++, old++) { 13742 if (((i & (4-1)) == 0) && (i < last_prefetch)) 13743 prefetch_tsbe_read(old); 13744 if (!old->tte_tag.tag_invalid) { 13745 /* 13746 * We have a valid TTE to remap. Check the 13747 * size. We won't remap 64K or 512K TTEs 13748 * because they span more than one TSB entry 13749 * and are indexed using an 8K virt. page. 13750 * Ditto for 32M and 256M TTEs. 
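 * Skipping such entries is harmless: the TSB only caches TTEs, so any
 * entry not copied here is simply refilled by the TSB miss handler the
 * next time that address is touched.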
13751 */ 13752 if (TTE_CSZ(&old->tte_data) == TTE64K || 13753 TTE_CSZ(&old->tte_data) == TTE512K) 13754 continue; 13755 if (mmu_page_sizes == max_mmu_page_sizes) { 13756 if (TTE_CSZ(&old->tte_data) == TTE32M || 13757 TTE_CSZ(&old->tte_data) == TTE256M) 13758 continue; 13759 } 13760 13761 /* clear the lower 22 bits of the va */ 13762 va = *(uint64_t *)old << 22; 13763 /* turn va into a virtual pfn */ 13764 va >>= 22 - TSB_START_SIZE; 13765 /* 13766 * or in bits from the offset in the tsb 13767 * to get the real virtual pfn. These 13768 * correspond to bits [21:13] in the va 13769 */ 13770 vpshift = 13771 TTE_BSZS_SHIFT(TTE_CSZ(&old->tte_data)) & 13772 0x1ff; 13773 va |= (i << vpshift); 13774 va >>= vpshift; 13775 new_offset = va & (new_entries - 1); 13776 new = new_base + new_offset; 13777 prefetch_tsbe_write(new); 13778 *new = *old; 13779 } 13780 } 13781 } 13782 } 13783 13784 /* 13785 * unused in sfmmu 13786 */ 13787 void 13788 hat_dump(void) 13789 { 13790 } 13791 13792 /* 13793 * Called when a thread is exiting and we have switched to the kernel address 13794 * space. Perform the same VM initialization resume() uses when switching 13795 * processes. 13796 * 13797 * Note that sfmmu_load_mmustate() is currently a no-op for kernel threads, but 13798 * we call it anyway in case the semantics change in the future. 13799 */ 13800 /*ARGSUSED*/ 13801 void 13802 hat_thread_exit(kthread_t *thd) 13803 { 13804 uint_t pgsz_cnum; 13805 uint_t pstate_save; 13806 13807 ASSERT(thd->t_procp->p_as == &kas); 13808 13809 pgsz_cnum = KCONTEXT; 13810 #ifdef sun4u 13811 pgsz_cnum |= (ksfmmup->sfmmu_cext << CTXREG_EXT_SHIFT); 13812 #endif 13813 13814 /* 13815 * Note that sfmmu_load_mmustate() is currently a no-op for 13816 * kernel threads. We need to disable interrupts here, 13817 * simply because otherwise sfmmu_load_mmustate() would panic 13818 * if the caller does not disable interrupts. 13819 */ 13820 pstate_save = sfmmu_disable_intrs(); 13821 13822 /* Compatibility Note: hw takes care of MMU_SCONTEXT1 */ 13823 sfmmu_setctx_sec(pgsz_cnum); 13824 sfmmu_load_mmustate(ksfmmup); 13825 sfmmu_enable_intrs(pstate_save); 13826 } 13827 13828 13829 /* 13830 * SRD support 13831 */ 13832 #define SRD_HASH_FUNCTION(vp) (((((uintptr_t)(vp)) >> 4) ^ \ 13833 (((uintptr_t)(vp)) >> 11)) & \ 13834 srd_hashmask) 13835 13836 /* 13837 * Attach the process to the srd struct associated with the exec vnode 13838 * from which the process is started. 
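 *
 * The lookup below follows the usual optimistic pattern: probe the hash
 * bucket under srdb_lock, and if no srd exists for this vnode yet, drop
 * the lock, allocate a new srd with KM_SLEEP, then re-check the bucket
 * under the lock and either install the new srd or free it again if
 * another thread won the race.  The VN_HOLD() taken on the exec vnode
 * here is paired with the VN_RELE() in sfmmu_leave_srd().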
13839 */ 13840 void 13841 hat_join_srd(struct hat *sfmmup, vnode_t *evp) 13842 { 13843 uint_t hash = SRD_HASH_FUNCTION(evp); 13844 sf_srd_t *srdp; 13845 sf_srd_t *newsrdp; 13846 13847 ASSERT(sfmmup != ksfmmup); 13848 ASSERT(sfmmup->sfmmu_srdp == NULL); 13849 13850 if (!shctx_on) { 13851 return; 13852 } 13853 13854 VN_HOLD(evp); 13855 13856 if (srd_buckets[hash].srdb_srdp != NULL) { 13857 mutex_enter(&srd_buckets[hash].srdb_lock); 13858 for (srdp = srd_buckets[hash].srdb_srdp; srdp != NULL; 13859 srdp = srdp->srd_hash) { 13860 if (srdp->srd_evp == evp) { 13861 ASSERT(srdp->srd_refcnt >= 0); 13862 sfmmup->sfmmu_srdp = srdp; 13863 atomic_add_32( 13864 (volatile uint_t *)&srdp->srd_refcnt, 1); 13865 mutex_exit(&srd_buckets[hash].srdb_lock); 13866 return; 13867 } 13868 } 13869 mutex_exit(&srd_buckets[hash].srdb_lock); 13870 } 13871 newsrdp = kmem_cache_alloc(srd_cache, KM_SLEEP); 13872 ASSERT(newsrdp->srd_next_ismrid == 0 && newsrdp->srd_next_hmerid == 0); 13873 13874 newsrdp->srd_evp = evp; 13875 newsrdp->srd_refcnt = 1; 13876 newsrdp->srd_hmergnfree = NULL; 13877 newsrdp->srd_ismrgnfree = NULL; 13878 13879 mutex_enter(&srd_buckets[hash].srdb_lock); 13880 for (srdp = srd_buckets[hash].srdb_srdp; srdp != NULL; 13881 srdp = srdp->srd_hash) { 13882 if (srdp->srd_evp == evp) { 13883 ASSERT(srdp->srd_refcnt >= 0); 13884 sfmmup->sfmmu_srdp = srdp; 13885 atomic_add_32((volatile uint_t *)&srdp->srd_refcnt, 1); 13886 mutex_exit(&srd_buckets[hash].srdb_lock); 13887 kmem_cache_free(srd_cache, newsrdp); 13888 return; 13889 } 13890 } 13891 newsrdp->srd_hash = srd_buckets[hash].srdb_srdp; 13892 srd_buckets[hash].srdb_srdp = newsrdp; 13893 sfmmup->sfmmu_srdp = newsrdp; 13894 13895 mutex_exit(&srd_buckets[hash].srdb_lock); 13896 13897 } 13898 13899 static void 13900 sfmmu_leave_srd(sfmmu_t *sfmmup) 13901 { 13902 vnode_t *evp; 13903 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 13904 uint_t hash; 13905 sf_srd_t **prev_srdpp; 13906 sf_region_t *rgnp; 13907 sf_region_t *nrgnp; 13908 #ifdef DEBUG 13909 int rgns = 0; 13910 #endif 13911 int i; 13912 13913 ASSERT(sfmmup != ksfmmup); 13914 ASSERT(srdp != NULL); 13915 ASSERT(srdp->srd_refcnt > 0); 13916 ASSERT(sfmmup->sfmmu_scdp == NULL); 13917 ASSERT(sfmmup->sfmmu_free == 1); 13918 13919 sfmmup->sfmmu_srdp = NULL; 13920 evp = srdp->srd_evp; 13921 ASSERT(evp != NULL); 13922 if (atomic_add_32_nv( 13923 (volatile uint_t *)&srdp->srd_refcnt, -1)) { 13924 VN_RELE(evp); 13925 return; 13926 } 13927 13928 hash = SRD_HASH_FUNCTION(evp); 13929 mutex_enter(&srd_buckets[hash].srdb_lock); 13930 for (prev_srdpp = &srd_buckets[hash].srdb_srdp; 13931 (srdp = *prev_srdpp) != NULL; prev_srdpp = &srdp->srd_hash) { 13932 if (srdp->srd_evp == evp) { 13933 break; 13934 } 13935 } 13936 if (srdp == NULL || srdp->srd_refcnt) { 13937 mutex_exit(&srd_buckets[hash].srdb_lock); 13938 VN_RELE(evp); 13939 return; 13940 } 13941 *prev_srdpp = srdp->srd_hash; 13942 mutex_exit(&srd_buckets[hash].srdb_lock); 13943 13944 ASSERT(srdp->srd_refcnt == 0); 13945 VN_RELE(evp); 13946 13947 #ifdef DEBUG 13948 for (i = 0; i < SFMMU_MAX_REGION_BUCKETS; i++) { 13949 ASSERT(srdp->srd_rgnhash[i] == NULL); 13950 } 13951 #endif /* DEBUG */ 13952 13953 /* free each hme regions in the srd */ 13954 for (rgnp = srdp->srd_hmergnfree; rgnp != NULL; rgnp = nrgnp) { 13955 nrgnp = rgnp->rgn_next; 13956 ASSERT(rgnp->rgn_id < srdp->srd_next_hmerid); 13957 ASSERT(rgnp->rgn_refcnt == 0); 13958 ASSERT(rgnp->rgn_sfmmu_head == NULL); 13959 ASSERT(rgnp->rgn_flags & SFMMU_REGION_FREE); 13960 ASSERT(rgnp->rgn_hmeflags == 0); 13961 
ASSERT(srdp->srd_hmergnp[rgnp->rgn_id] == rgnp); 13962 #ifdef DEBUG 13963 for (i = 0; i < MMU_PAGE_SIZES; i++) { 13964 ASSERT(rgnp->rgn_ttecnt[i] == 0); 13965 } 13966 rgns++; 13967 #endif /* DEBUG */ 13968 kmem_cache_free(region_cache, rgnp); 13969 } 13970 ASSERT(rgns == srdp->srd_next_hmerid); 13971 13972 #ifdef DEBUG 13973 rgns = 0; 13974 #endif 13975 /* free each ism rgns in the srd */ 13976 for (rgnp = srdp->srd_ismrgnfree; rgnp != NULL; rgnp = nrgnp) { 13977 nrgnp = rgnp->rgn_next; 13978 ASSERT(rgnp->rgn_id < srdp->srd_next_ismrid); 13979 ASSERT(rgnp->rgn_refcnt == 0); 13980 ASSERT(rgnp->rgn_sfmmu_head == NULL); 13981 ASSERT(rgnp->rgn_flags & SFMMU_REGION_FREE); 13982 ASSERT(srdp->srd_ismrgnp[rgnp->rgn_id] == rgnp); 13983 #ifdef DEBUG 13984 for (i = 0; i < MMU_PAGE_SIZES; i++) { 13985 ASSERT(rgnp->rgn_ttecnt[i] == 0); 13986 } 13987 rgns++; 13988 #endif /* DEBUG */ 13989 kmem_cache_free(region_cache, rgnp); 13990 } 13991 ASSERT(rgns == srdp->srd_next_ismrid); 13992 ASSERT(srdp->srd_ismbusyrgns == 0); 13993 ASSERT(srdp->srd_hmebusyrgns == 0); 13994 13995 srdp->srd_next_ismrid = 0; 13996 srdp->srd_next_hmerid = 0; 13997 13998 bzero((void *)srdp->srd_ismrgnp, 13999 sizeof (sf_region_t *) * SFMMU_MAX_ISM_REGIONS); 14000 bzero((void *)srdp->srd_hmergnp, 14001 sizeof (sf_region_t *) * SFMMU_MAX_HME_REGIONS); 14002 14003 ASSERT(srdp->srd_scdp == NULL); 14004 kmem_cache_free(srd_cache, srdp); 14005 } 14006 14007 /* ARGSUSED */ 14008 static int 14009 sfmmu_srdcache_constructor(void *buf, void *cdrarg, int kmflags) 14010 { 14011 sf_srd_t *srdp = (sf_srd_t *)buf; 14012 bzero(buf, sizeof (*srdp)); 14013 14014 mutex_init(&srdp->srd_mutex, NULL, MUTEX_DEFAULT, NULL); 14015 mutex_init(&srdp->srd_scd_mutex, NULL, MUTEX_DEFAULT, NULL); 14016 return (0); 14017 } 14018 14019 /* ARGSUSED */ 14020 static void 14021 sfmmu_srdcache_destructor(void *buf, void *cdrarg) 14022 { 14023 sf_srd_t *srdp = (sf_srd_t *)buf; 14024 14025 mutex_destroy(&srdp->srd_mutex); 14026 mutex_destroy(&srdp->srd_scd_mutex); 14027 } 14028 14029 /* 14030 * The caller makes sure hat_join_region()/hat_leave_region() can't be called 14031 * at the same time for the same process and address range. This is ensured by 14032 * the fact that address space is locked as writer when a process joins the 14033 * regions. Therefore there's no need to hold an srd lock during the entire 14034 * execution of hat_join_region()/hat_leave_region(). 14035 */ 14036 14037 #define RGN_HASH_FUNCTION(obj) (((((uintptr_t)(obj)) >> 4) ^ \ 14038 (((uintptr_t)(obj)) >> 11)) & \ 14039 srd_rgn_hashmask) 14040 /* 14041 * This routine implements the shared context functionality required when 14042 * attaching a segment to an address space. It must be called from 14043 * hat_share() for D(ISM) segments and from segvn_create() for segments 14044 * with the MAP_PRIVATE and MAP_TEXT flags set. It returns a region_cookie 14045 * which is saved in the private segment data for hme segments and 14046 * the ism_map structure for ism segments. 
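 *
 * Illustrative sketch of the calling convention (not from the original
 * source; the seg/svd/rgn_cb_func names are hypothetical caller-side
 * state):
 *
 *	cookie = hat_join_region(seg->s_as->a_hat, seg->s_base,
 *	    seg->s_size, (void *)vp, off, PROT_READ | PROT_EXEC, szc,
 *	    rgn_cb_func, HAT_REGION_TEXT);
 *	if (cookie != HAT_INVALID_REGION_COOKIE)
 *		svd->rcookie = cookie;
 *
 * The saved cookie is later handed back to hat_leave_region() (and,
 * where applicable, to hat_dup_region()).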
14047 */ 14048 hat_region_cookie_t 14049 hat_join_region(struct hat *sfmmup, 14050 caddr_t r_saddr, 14051 size_t r_size, 14052 void *r_obj, 14053 u_offset_t r_objoff, 14054 uchar_t r_perm, 14055 uchar_t r_pgszc, 14056 hat_rgn_cb_func_t r_cb_function, 14057 uint_t flags) 14058 { 14059 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 14060 uint_t rhash; 14061 uint_t rid; 14062 hatlock_t *hatlockp; 14063 sf_region_t *rgnp; 14064 sf_region_t *new_rgnp = NULL; 14065 int i; 14066 uint16_t *nextidp; 14067 sf_region_t **freelistp; 14068 int maxids; 14069 sf_region_t **rarrp; 14070 uint16_t *busyrgnsp; 14071 ulong_t rttecnt; 14072 uchar_t tteflag; 14073 uchar_t r_type = flags & HAT_REGION_TYPE_MASK; 14074 int text = (r_type == HAT_REGION_TEXT); 14075 14076 if (srdp == NULL || r_size == 0) { 14077 return (HAT_INVALID_REGION_COOKIE); 14078 } 14079 14080 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 14081 ASSERT(sfmmup != ksfmmup); 14082 ASSERT(AS_WRITE_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 14083 ASSERT(srdp->srd_refcnt > 0); 14084 ASSERT(!(flags & ~HAT_REGION_TYPE_MASK)); 14085 ASSERT(flags == HAT_REGION_TEXT || flags == HAT_REGION_ISM); 14086 ASSERT(r_pgszc < mmu_page_sizes); 14087 if (!IS_P2ALIGNED(r_saddr, TTEBYTES(r_pgszc)) || 14088 !IS_P2ALIGNED(r_size, TTEBYTES(r_pgszc))) { 14089 panic("hat_join_region: region addr or size is not aligned\n"); 14090 } 14091 14092 14093 r_type = (r_type == HAT_REGION_ISM) ? SFMMU_REGION_ISM : 14094 SFMMU_REGION_HME; 14095 /* 14096 * Currently only support shared hmes for the read only main text 14097 * region. 14098 */ 14099 if (r_type == SFMMU_REGION_HME && ((r_obj != srdp->srd_evp) || 14100 (r_perm & PROT_WRITE))) { 14101 return (HAT_INVALID_REGION_COOKIE); 14102 } 14103 14104 rhash = RGN_HASH_FUNCTION(r_obj); 14105 14106 if (r_type == SFMMU_REGION_ISM) { 14107 nextidp = &srdp->srd_next_ismrid; 14108 freelistp = &srdp->srd_ismrgnfree; 14109 maxids = SFMMU_MAX_ISM_REGIONS; 14110 rarrp = srdp->srd_ismrgnp; 14111 busyrgnsp = &srdp->srd_ismbusyrgns; 14112 } else { 14113 nextidp = &srdp->srd_next_hmerid; 14114 freelistp = &srdp->srd_hmergnfree; 14115 maxids = SFMMU_MAX_HME_REGIONS; 14116 rarrp = srdp->srd_hmergnp; 14117 busyrgnsp = &srdp->srd_hmebusyrgns; 14118 } 14119 14120 mutex_enter(&srdp->srd_mutex); 14121 14122 for (rgnp = srdp->srd_rgnhash[rhash]; rgnp != NULL; 14123 rgnp = rgnp->rgn_hash) { 14124 if (rgnp->rgn_saddr == r_saddr && rgnp->rgn_size == r_size && 14125 rgnp->rgn_obj == r_obj && rgnp->rgn_objoff == r_objoff && 14126 rgnp->rgn_perm == r_perm && rgnp->rgn_pgszc == r_pgszc) { 14127 break; 14128 } 14129 } 14130 14131 rfound: 14132 if (rgnp != NULL) { 14133 ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == r_type); 14134 ASSERT(rgnp->rgn_cb_function == r_cb_function); 14135 ASSERT(rgnp->rgn_refcnt >= 0); 14136 rid = rgnp->rgn_id; 14137 ASSERT(rid < maxids); 14138 ASSERT(rarrp[rid] == rgnp); 14139 ASSERT(rid < *nextidp); 14140 atomic_add_32((volatile uint_t *)&rgnp->rgn_refcnt, 1); 14141 mutex_exit(&srdp->srd_mutex); 14142 if (new_rgnp != NULL) { 14143 kmem_cache_free(region_cache, new_rgnp); 14144 } 14145 if (r_type == SFMMU_REGION_HME) { 14146 int myjoin = 14147 (sfmmup == astosfmmu(curthread->t_procp->p_as)); 14148 14149 sfmmu_link_to_hmeregion(sfmmup, rgnp); 14150 /* 14151 * bitmap should be updated after linking sfmmu on 14152 * region list so that pageunload() doesn't skip 14153 * TSB/TLB flush. As soon as bitmap is updated another 14154 * thread in this process can already start accessing 14155 * this region. 
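 * For that reason the SF_RGNMAP_ADD() that publishes this rid in
 * sfmmu_hmeregion_map is delayed until after sfmmu_link_to_hmeregion()
 * and the ttecnt/TSB sizing work below have completed.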
14156 */ 14157 /* 14158 * Normally ttecnt accounting is done as part of 14159 * pagefault handling. But a process may not take any 14160 * pagefaults on shared hmeblks created by some other 14161 * process. To compensate for this assume that the 14162 * entire region will end up faulted in using 14163 * the region's pagesize. 14164 * 14165 */ 14166 if (r_pgszc > TTE8K) { 14167 tteflag = 1 << r_pgszc; 14168 if (disable_large_pages & tteflag) { 14169 tteflag = 0; 14170 } 14171 } else { 14172 tteflag = 0; 14173 } 14174 if (tteflag && !(sfmmup->sfmmu_rtteflags & tteflag)) { 14175 hatlockp = sfmmu_hat_enter(sfmmup); 14176 sfmmup->sfmmu_rtteflags |= tteflag; 14177 if (&mmu_set_pgsz_order) { 14178 mmu_set_pgsz_order(sfmmup, 1); 14179 } 14180 sfmmu_hat_exit(hatlockp); 14181 } 14182 hatlockp = sfmmu_hat_enter(sfmmup); 14183 14184 /* 14185 * Preallocate 1/4 of ttecnt's in 8K TSB for >= 4M 14186 * region to allow for large page allocation failure. 14187 */ 14188 if (r_pgszc >= TTE4M) { 14189 sfmmup->sfmmu_tsb0_4minflcnt += 14190 r_size >> (TTE_PAGE_SHIFT(TTE8K) + 2); 14191 } 14192 14193 /* update sfmmu_ttecnt with the shme rgn ttecnt */ 14194 rttecnt = r_size >> TTE_PAGE_SHIFT(r_pgszc); 14195 atomic_add_long(&sfmmup->sfmmu_ttecnt[r_pgszc], 14196 rttecnt); 14197 14198 if (text && r_pgszc >= TTE4M && 14199 (tteflag || ((disable_large_pages >> TTE4M) & 14200 ((1 << (r_pgszc - TTE4M + 1)) - 1))) && 14201 !SFMMU_FLAGS_ISSET(sfmmup, HAT_4MTEXT_FLAG)) { 14202 SFMMU_FLAGS_SET(sfmmup, HAT_4MTEXT_FLAG); 14203 } 14204 14205 sfmmu_hat_exit(hatlockp); 14206 /* 14207 * On Panther we need to make sure TLB is programmed 14208 * to accept 32M/256M pages. Call 14209 * sfmmu_check_page_sizes() now to make sure TLB is 14210 * setup before making hmeregions visible to other 14211 * threads. 14212 */ 14213 sfmmu_check_page_sizes(sfmmup, 1); 14214 hatlockp = sfmmu_hat_enter(sfmmup); 14215 SF_RGNMAP_ADD(sfmmup->sfmmu_hmeregion_map, rid); 14216 14217 /* 14218 * if context is invalid tsb miss exception code will 14219 * call sfmmu_check_page_sizes() and update tsbmiss 14220 * area later. 
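 * Preemption is disabled below so that the thread cannot migrate
 * between reading CPU_MMU_IDX(CPU)/CPU->cpu_id and updating that
 * cpu's tsbmiss area.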
14221 */ 14222 kpreempt_disable(); 14223 if (myjoin && 14224 (sfmmup->sfmmu_ctxs[CPU_MMU_IDX(CPU)].cnum 14225 != INVALID_CONTEXT)) { 14226 struct tsbmiss *tsbmp; 14227 14228 tsbmp = &tsbmiss_area[CPU->cpu_id]; 14229 ASSERT(sfmmup == tsbmp->usfmmup); 14230 BT_SET(tsbmp->shmermap, rid); 14231 if (r_pgszc > TTE64K) { 14232 tsbmp->uhat_rtteflags |= tteflag; 14233 } 14234 14235 } 14236 kpreempt_enable(); 14237 14238 sfmmu_hat_exit(hatlockp); 14239 ASSERT((hat_region_cookie_t)((uint64_t)rid) != 14240 HAT_INVALID_REGION_COOKIE); 14241 } else { 14242 hatlockp = sfmmu_hat_enter(sfmmup); 14243 SF_RGNMAP_ADD(sfmmup->sfmmu_ismregion_map, rid); 14244 sfmmu_hat_exit(hatlockp); 14245 } 14246 ASSERT(rid < maxids); 14247 14248 if (r_type == SFMMU_REGION_ISM) { 14249 sfmmu_find_scd(sfmmup); 14250 } 14251 return ((hat_region_cookie_t)((uint64_t)rid)); 14252 } 14253 14254 ASSERT(new_rgnp == NULL); 14255 14256 if (*busyrgnsp >= maxids) { 14257 mutex_exit(&srdp->srd_mutex); 14258 return (HAT_INVALID_REGION_COOKIE); 14259 } 14260 14261 ASSERT(MUTEX_HELD(&srdp->srd_mutex)); 14262 if (*freelistp != NULL) { 14263 rgnp = *freelistp; 14264 *freelistp = rgnp->rgn_next; 14265 ASSERT(rgnp->rgn_id < *nextidp); 14266 ASSERT(rgnp->rgn_id < maxids); 14267 ASSERT(rgnp->rgn_flags & SFMMU_REGION_FREE); 14268 ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) 14269 == r_type); 14270 ASSERT(rarrp[rgnp->rgn_id] == rgnp); 14271 ASSERT(rgnp->rgn_hmeflags == 0); 14272 } else { 14273 /* 14274 * release local locks before memory allocation. 14275 */ 14276 mutex_exit(&srdp->srd_mutex); 14277 14278 new_rgnp = kmem_cache_alloc(region_cache, KM_SLEEP); 14279 14280 mutex_enter(&srdp->srd_mutex); 14281 for (rgnp = srdp->srd_rgnhash[rhash]; rgnp != NULL; 14282 rgnp = rgnp->rgn_hash) { 14283 if (rgnp->rgn_saddr == r_saddr && 14284 rgnp->rgn_size == r_size && 14285 rgnp->rgn_obj == r_obj && 14286 rgnp->rgn_objoff == r_objoff && 14287 rgnp->rgn_perm == r_perm && 14288 rgnp->rgn_pgszc == r_pgszc) { 14289 break; 14290 } 14291 } 14292 if (rgnp != NULL) { 14293 goto rfound; 14294 } 14295 14296 if (*nextidp >= maxids) { 14297 mutex_exit(&srdp->srd_mutex); 14298 goto fail; 14299 } 14300 rgnp = new_rgnp; 14301 new_rgnp = NULL; 14302 rgnp->rgn_id = (*nextidp)++; 14303 ASSERT(rgnp->rgn_id < maxids); 14304 ASSERT(rarrp[rgnp->rgn_id] == NULL); 14305 rarrp[rgnp->rgn_id] = rgnp; 14306 } 14307 14308 ASSERT(rgnp->rgn_sfmmu_head == NULL); 14309 ASSERT(rgnp->rgn_hmeflags == 0); 14310 #ifdef DEBUG 14311 for (i = 0; i < MMU_PAGE_SIZES; i++) { 14312 ASSERT(rgnp->rgn_ttecnt[i] == 0); 14313 } 14314 #endif 14315 rgnp->rgn_saddr = r_saddr; 14316 rgnp->rgn_size = r_size; 14317 rgnp->rgn_obj = r_obj; 14318 rgnp->rgn_objoff = r_objoff; 14319 rgnp->rgn_perm = r_perm; 14320 rgnp->rgn_pgszc = r_pgszc; 14321 rgnp->rgn_flags = r_type; 14322 rgnp->rgn_refcnt = 0; 14323 rgnp->rgn_cb_function = r_cb_function; 14324 rgnp->rgn_hash = srdp->srd_rgnhash[rhash]; 14325 srdp->srd_rgnhash[rhash] = rgnp; 14326 (*busyrgnsp)++; 14327 ASSERT(*busyrgnsp <= maxids); 14328 goto rfound; 14329 14330 fail: 14331 ASSERT(new_rgnp != NULL); 14332 kmem_cache_free(region_cache, new_rgnp); 14333 return (HAT_INVALID_REGION_COOKIE); 14334 } 14335 14336 /* 14337 * This function implements the shared context functionality required 14338 * when detaching a segment from an address space. It must be called 14339 * from hat_unshare() for all D(ISM) segments and from segvn_unmap(), 14340 * for segments with a valid region_cookie. 
14341 * It will also be called from all seg_vn routines which change a 14342 * segment's attributes such as segvn_setprot(), segvn_setpagesize(), 14343 * segvn_clrszc() & segvn_advise(), as well as in the case of COW fault 14344 * from segvn_fault(). 14345 */ 14346 void 14347 hat_leave_region(struct hat *sfmmup, hat_region_cookie_t rcookie, uint_t flags) 14348 { 14349 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 14350 sf_scd_t *scdp; 14351 uint_t rhash; 14352 uint_t rid = (uint_t)((uint64_t)rcookie); 14353 hatlock_t *hatlockp = NULL; 14354 sf_region_t *rgnp; 14355 sf_region_t **prev_rgnpp; 14356 sf_region_t *cur_rgnp; 14357 void *r_obj; 14358 int i; 14359 caddr_t r_saddr; 14360 caddr_t r_eaddr; 14361 size_t r_size; 14362 uchar_t r_pgszc; 14363 uchar_t r_type = flags & HAT_REGION_TYPE_MASK; 14364 14365 ASSERT(sfmmup != ksfmmup); 14366 ASSERT(srdp != NULL); 14367 ASSERT(srdp->srd_refcnt > 0); 14368 ASSERT(!(flags & ~HAT_REGION_TYPE_MASK)); 14369 ASSERT(flags == HAT_REGION_TEXT || flags == HAT_REGION_ISM); 14370 ASSERT(!sfmmup->sfmmu_free || sfmmup->sfmmu_scdp == NULL); 14371 14372 r_type = (r_type == HAT_REGION_ISM) ? SFMMU_REGION_ISM : 14373 SFMMU_REGION_HME; 14374 14375 if (r_type == SFMMU_REGION_ISM) { 14376 ASSERT(SFMMU_IS_ISMRID_VALID(rid)); 14377 ASSERT(rid < SFMMU_MAX_ISM_REGIONS); 14378 rgnp = srdp->srd_ismrgnp[rid]; 14379 } else { 14380 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 14381 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 14382 rgnp = srdp->srd_hmergnp[rid]; 14383 } 14384 ASSERT(rgnp != NULL); 14385 ASSERT(rgnp->rgn_id == rid); 14386 ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == r_type); 14387 ASSERT(!(rgnp->rgn_flags & SFMMU_REGION_FREE)); 14388 ASSERT(AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 14389 14390 ASSERT(sfmmup->sfmmu_xhat_provider == NULL); 14391 if (r_type == SFMMU_REGION_HME && sfmmup->sfmmu_as->a_xhat != NULL) { 14392 xhat_unload_callback_all(sfmmup->sfmmu_as, rgnp->rgn_saddr, 14393 rgnp->rgn_size, 0, NULL); 14394 } 14395 14396 if (sfmmup->sfmmu_free) { 14397 ulong_t rttecnt; 14398 r_pgszc = rgnp->rgn_pgszc; 14399 r_size = rgnp->rgn_size; 14400 14401 ASSERT(sfmmup->sfmmu_scdp == NULL); 14402 if (r_type == SFMMU_REGION_ISM) { 14403 SF_RGNMAP_DEL(sfmmup->sfmmu_ismregion_map, rid); 14404 } else { 14405 /* update shme rgns ttecnt in sfmmu_ttecnt */ 14406 rttecnt = r_size >> TTE_PAGE_SHIFT(r_pgszc); 14407 ASSERT(sfmmup->sfmmu_ttecnt[r_pgszc] >= rttecnt); 14408 14409 atomic_add_long(&sfmmup->sfmmu_ttecnt[r_pgszc], 14410 -rttecnt); 14411 14412 SF_RGNMAP_DEL(sfmmup->sfmmu_hmeregion_map, rid); 14413 } 14414 } else if (r_type == SFMMU_REGION_ISM) { 14415 hatlockp = sfmmu_hat_enter(sfmmup); 14416 ASSERT(rid < srdp->srd_next_ismrid); 14417 SF_RGNMAP_DEL(sfmmup->sfmmu_ismregion_map, rid); 14418 scdp = sfmmup->sfmmu_scdp; 14419 if (scdp != NULL && 14420 SF_RGNMAP_TEST(scdp->scd_ismregion_map, rid)) { 14421 sfmmu_leave_scd(sfmmup, r_type); 14422 ASSERT(sfmmu_hat_lock_held(sfmmup)); 14423 } 14424 sfmmu_hat_exit(hatlockp); 14425 } else { 14426 ulong_t rttecnt; 14427 r_pgszc = rgnp->rgn_pgszc; 14428 r_saddr = rgnp->rgn_saddr; 14429 r_size = rgnp->rgn_size; 14430 r_eaddr = r_saddr + r_size; 14431 14432 ASSERT(r_type == SFMMU_REGION_HME); 14433 hatlockp = sfmmu_hat_enter(sfmmup); 14434 ASSERT(rid < srdp->srd_next_hmerid); 14435 SF_RGNMAP_DEL(sfmmup->sfmmu_hmeregion_map, rid); 14436 14437 /* 14438 * If region is part of an SCD call sfmmu_leave_scd(). 
14439 * Otherwise if process is not exiting and has valid context 14440 * just drop the context on the floor to lose stale TLB 14441 * entries and force the update of tsb miss area to reflect 14442 * the new region map. After that clean our TSB entries. 14443 */ 14444 scdp = sfmmup->sfmmu_scdp; 14445 if (scdp != NULL && 14446 SF_RGNMAP_TEST(scdp->scd_hmeregion_map, rid)) { 14447 sfmmu_leave_scd(sfmmup, r_type); 14448 ASSERT(sfmmu_hat_lock_held(sfmmup)); 14449 } 14450 sfmmu_invalidate_ctx(sfmmup); 14451 14452 i = TTE8K; 14453 while (i < mmu_page_sizes) { 14454 if (rgnp->rgn_ttecnt[i] != 0) { 14455 sfmmu_unload_tsb_range(sfmmup, r_saddr, 14456 r_eaddr, i); 14457 if (i < TTE4M) { 14458 i = TTE4M; 14459 continue; 14460 } else { 14461 break; 14462 } 14463 } 14464 i++; 14465 } 14466 /* Remove the preallocated 1/4 8k ttecnt for 4M regions. */ 14467 if (r_pgszc >= TTE4M) { 14468 rttecnt = r_size >> (TTE_PAGE_SHIFT(TTE8K) + 2); 14469 ASSERT(sfmmup->sfmmu_tsb0_4minflcnt >= 14470 rttecnt); 14471 sfmmup->sfmmu_tsb0_4minflcnt -= rttecnt; 14472 } 14473 14474 /* update shme rgns ttecnt in sfmmu_ttecnt */ 14475 rttecnt = r_size >> TTE_PAGE_SHIFT(r_pgszc); 14476 ASSERT(sfmmup->sfmmu_ttecnt[r_pgszc] >= rttecnt); 14477 atomic_add_long(&sfmmup->sfmmu_ttecnt[r_pgszc], -rttecnt); 14478 14479 sfmmu_hat_exit(hatlockp); 14480 if (scdp != NULL && sfmmup->sfmmu_scdp == NULL) { 14481 /* sfmmup left the scd, grow private tsb */ 14482 sfmmu_check_page_sizes(sfmmup, 1); 14483 } else { 14484 sfmmu_check_page_sizes(sfmmup, 0); 14485 } 14486 } 14487 14488 if (r_type == SFMMU_REGION_HME) { 14489 sfmmu_unlink_from_hmeregion(sfmmup, rgnp); 14490 } 14491 14492 r_obj = rgnp->rgn_obj; 14493 if (atomic_add_32_nv((volatile uint_t *)&rgnp->rgn_refcnt, -1)) { 14494 return; 14495 } 14496 14497 /* 14498 * looks like nobody uses this region anymore. Free it. 14499 */ 14500 rhash = RGN_HASH_FUNCTION(r_obj); 14501 mutex_enter(&srdp->srd_mutex); 14502 for (prev_rgnpp = &srdp->srd_rgnhash[rhash]; 14503 (cur_rgnp = *prev_rgnpp) != NULL; 14504 prev_rgnpp = &cur_rgnp->rgn_hash) { 14505 if (cur_rgnp == rgnp && cur_rgnp->rgn_refcnt == 0) { 14506 break; 14507 } 14508 } 14509 14510 if (cur_rgnp == NULL) { 14511 mutex_exit(&srdp->srd_mutex); 14512 return; 14513 } 14514 14515 ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == r_type); 14516 *prev_rgnpp = rgnp->rgn_hash; 14517 if (r_type == SFMMU_REGION_ISM) { 14518 rgnp->rgn_flags |= SFMMU_REGION_FREE; 14519 ASSERT(rid < srdp->srd_next_ismrid); 14520 rgnp->rgn_next = srdp->srd_ismrgnfree; 14521 srdp->srd_ismrgnfree = rgnp; 14522 ASSERT(srdp->srd_ismbusyrgns > 0); 14523 srdp->srd_ismbusyrgns--; 14524 mutex_exit(&srdp->srd_mutex); 14525 return; 14526 } 14527 mutex_exit(&srdp->srd_mutex); 14528 14529 /* 14530 * Destroy region's hmeblks. 14531 */ 14532 sfmmu_unload_hmeregion(srdp, rgnp); 14533 14534 rgnp->rgn_hmeflags = 0; 14535 14536 ASSERT(rgnp->rgn_sfmmu_head == NULL); 14537 ASSERT(rgnp->rgn_id == rid); 14538 for (i = 0; i < MMU_PAGE_SIZES; i++) { 14539 rgnp->rgn_ttecnt[i] = 0; 14540 } 14541 rgnp->rgn_flags |= SFMMU_REGION_FREE; 14542 mutex_enter(&srdp->srd_mutex); 14543 ASSERT(rid < srdp->srd_next_hmerid); 14544 rgnp->rgn_next = srdp->srd_hmergnfree; 14545 srdp->srd_hmergnfree = rgnp; 14546 ASSERT(srdp->srd_hmebusyrgns > 0); 14547 srdp->srd_hmebusyrgns--; 14548 mutex_exit(&srdp->srd_mutex); 14549 } 14550 14551 /* 14552 * For now only called for hmeblk regions and not for ISM regions. 
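 *
 * It takes an extra reference on the region, links this hat onto the
 * region's sfmmu list and pre-adjusts sfmmu_ttecnt/sfmmu_tsb0_4minflcnt
 * the same way hat_join_region() does, so the TSBs are sized as if the
 * whole region were already faulted in.  (Presumably this is reached
 * when an address space carrying a valid region cookie is duplicated;
 * the caller is not shown here.)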
14553 */ 14554 void 14555 hat_dup_region(struct hat *sfmmup, hat_region_cookie_t rcookie) 14556 { 14557 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 14558 uint_t rid = (uint_t)((uint64_t)rcookie); 14559 sf_region_t *rgnp; 14560 sf_rgn_link_t *rlink; 14561 sf_rgn_link_t *hrlink; 14562 ulong_t rttecnt; 14563 14564 ASSERT(sfmmup != ksfmmup); 14565 ASSERT(srdp != NULL); 14566 ASSERT(srdp->srd_refcnt > 0); 14567 14568 ASSERT(rid < srdp->srd_next_hmerid); 14569 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 14570 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 14571 14572 rgnp = srdp->srd_hmergnp[rid]; 14573 ASSERT(rgnp->rgn_refcnt > 0); 14574 ASSERT(rgnp->rgn_id == rid); 14575 ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == SFMMU_REGION_HME); 14576 ASSERT(!(rgnp->rgn_flags & SFMMU_REGION_FREE)); 14577 14578 atomic_add_32((volatile uint_t *)&rgnp->rgn_refcnt, 1); 14579 14580 /* LINTED: constant in conditional context */ 14581 SFMMU_HMERID2RLINKP(sfmmup, rid, rlink, 1, 0); 14582 ASSERT(rlink != NULL); 14583 mutex_enter(&rgnp->rgn_mutex); 14584 ASSERT(rgnp->rgn_sfmmu_head != NULL); 14585 /* LINTED: constant in conditional context */ 14586 SFMMU_HMERID2RLINKP(rgnp->rgn_sfmmu_head, rid, hrlink, 0, 0); 14587 ASSERT(hrlink != NULL); 14588 ASSERT(hrlink->prev == NULL); 14589 rlink->next = rgnp->rgn_sfmmu_head; 14590 rlink->prev = NULL; 14591 hrlink->prev = sfmmup; 14592 /* 14593 * make sure rlink's next field is correct 14594 * before making this link visible. 14595 */ 14596 membar_stst(); 14597 rgnp->rgn_sfmmu_head = sfmmup; 14598 mutex_exit(&rgnp->rgn_mutex); 14599 14600 /* update sfmmu_ttecnt with the shme rgn ttecnt */ 14601 rttecnt = rgnp->rgn_size >> TTE_PAGE_SHIFT(rgnp->rgn_pgszc); 14602 atomic_add_long(&sfmmup->sfmmu_ttecnt[rgnp->rgn_pgszc], rttecnt); 14603 /* update tsb0 inflation count */ 14604 if (rgnp->rgn_pgszc >= TTE4M) { 14605 sfmmup->sfmmu_tsb0_4minflcnt += 14606 rgnp->rgn_size >> (TTE_PAGE_SHIFT(TTE8K) + 2); 14607 } 14608 /* 14609 * Update regionid bitmask without hat lock since no other thread 14610 * can update this region bitmask right now. 
14611 */ 14612 SF_RGNMAP_ADD(sfmmup->sfmmu_hmeregion_map, rid); 14613 } 14614 14615 /* ARGSUSED */ 14616 static int 14617 sfmmu_rgncache_constructor(void *buf, void *cdrarg, int kmflags) 14618 { 14619 sf_region_t *rgnp = (sf_region_t *)buf; 14620 bzero(buf, sizeof (*rgnp)); 14621 14622 mutex_init(&rgnp->rgn_mutex, NULL, MUTEX_DEFAULT, NULL); 14623 14624 return (0); 14625 } 14626 14627 /* ARGSUSED */ 14628 static void 14629 sfmmu_rgncache_destructor(void *buf, void *cdrarg) 14630 { 14631 sf_region_t *rgnp = (sf_region_t *)buf; 14632 mutex_destroy(&rgnp->rgn_mutex); 14633 } 14634 14635 static int 14636 sfrgnmap_isnull(sf_region_map_t *map) 14637 { 14638 int i; 14639 14640 for (i = 0; i < SFMMU_RGNMAP_WORDS; i++) { 14641 if (map->bitmap[i] != 0) { 14642 return (0); 14643 } 14644 } 14645 return (1); 14646 } 14647 14648 static int 14649 sfhmergnmap_isnull(sf_hmeregion_map_t *map) 14650 { 14651 int i; 14652 14653 for (i = 0; i < SFMMU_HMERGNMAP_WORDS; i++) { 14654 if (map->bitmap[i] != 0) { 14655 return (0); 14656 } 14657 } 14658 return (1); 14659 } 14660 14661 #ifdef DEBUG 14662 static void 14663 check_scd_sfmmu_list(sfmmu_t **headp, sfmmu_t *sfmmup, int onlist) 14664 { 14665 sfmmu_t *sp; 14666 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 14667 14668 for (sp = *headp; sp != NULL; sp = sp->sfmmu_scd_link.next) { 14669 ASSERT(srdp == sp->sfmmu_srdp); 14670 if (sp == sfmmup) { 14671 if (onlist) { 14672 return; 14673 } else { 14674 panic("shctx: sfmmu 0x%p found on scd" 14675 "list 0x%p", (void *)sfmmup, 14676 (void *)*headp); 14677 } 14678 } 14679 } 14680 if (onlist) { 14681 panic("shctx: sfmmu 0x%p not found on scd list 0x%p", 14682 (void *)sfmmup, (void *)*headp); 14683 } else { 14684 return; 14685 } 14686 } 14687 #else /* DEBUG */ 14688 #define check_scd_sfmmu_list(headp, sfmmup, onlist) 14689 #endif /* DEBUG */ 14690 14691 /* 14692 * Removes an sfmmu from the SCD sfmmu list. 14693 */ 14694 static void 14695 sfmmu_from_scd_list(sfmmu_t **headp, sfmmu_t *sfmmup) 14696 { 14697 ASSERT(sfmmup->sfmmu_srdp != NULL); 14698 check_scd_sfmmu_list(headp, sfmmup, 1); 14699 if (sfmmup->sfmmu_scd_link.prev != NULL) { 14700 ASSERT(*headp != sfmmup); 14701 sfmmup->sfmmu_scd_link.prev->sfmmu_scd_link.next = 14702 sfmmup->sfmmu_scd_link.next; 14703 } else { 14704 ASSERT(*headp == sfmmup); 14705 *headp = sfmmup->sfmmu_scd_link.next; 14706 } 14707 if (sfmmup->sfmmu_scd_link.next != NULL) { 14708 sfmmup->sfmmu_scd_link.next->sfmmu_scd_link.prev = 14709 sfmmup->sfmmu_scd_link.prev; 14710 } 14711 } 14712 14713 14714 /* 14715 * Adds an sfmmu to the start of the queue. 14716 */ 14717 static void 14718 sfmmu_to_scd_list(sfmmu_t **headp, sfmmu_t *sfmmup) 14719 { 14720 check_scd_sfmmu_list(headp, sfmmup, 0); 14721 sfmmup->sfmmu_scd_link.prev = NULL; 14722 sfmmup->sfmmu_scd_link.next = *headp; 14723 if (*headp != NULL) 14724 (*headp)->sfmmu_scd_link.prev = sfmmup; 14725 *headp = sfmmup; 14726 } 14727 14728 /* 14729 * Remove an scd from the start of the queue. 14730 */ 14731 static void 14732 sfmmu_remove_scd(sf_scd_t **headp, sf_scd_t *scdp) 14733 { 14734 if (scdp->scd_prev != NULL) { 14735 ASSERT(*headp != scdp); 14736 scdp->scd_prev->scd_next = scdp->scd_next; 14737 } else { 14738 ASSERT(*headp == scdp); 14739 *headp = scdp->scd_next; 14740 } 14741 14742 if (scdp->scd_next != NULL) { 14743 scdp->scd_next->scd_prev = scdp->scd_prev; 14744 } 14745 } 14746 14747 /* 14748 * Add an scd to the start of the queue. 
14749 */ 14750 static void 14751 sfmmu_add_scd(sf_scd_t **headp, sf_scd_t *scdp) 14752 { 14753 scdp->scd_prev = NULL; 14754 scdp->scd_next = *headp; 14755 if (*headp != NULL) { 14756 (*headp)->scd_prev = scdp; 14757 } 14758 *headp = scdp; 14759 } 14760 14761 static int 14762 sfmmu_alloc_scd_tsbs(sf_srd_t *srdp, sf_scd_t *scdp) 14763 { 14764 uint_t rid; 14765 uint_t i; 14766 uint_t j; 14767 ulong_t w; 14768 sf_region_t *rgnp; 14769 ulong_t tte8k_cnt = 0; 14770 ulong_t tte4m_cnt = 0; 14771 uint_t tsb_szc; 14772 sfmmu_t *scsfmmup = scdp->scd_sfmmup; 14773 sfmmu_t *ism_hatid; 14774 struct tsb_info *newtsb; 14775 int szc; 14776 14777 ASSERT(srdp != NULL); 14778 14779 for (i = 0; i < SFMMU_RGNMAP_WORDS; i++) { 14780 if ((w = scdp->scd_region_map.bitmap[i]) == 0) { 14781 continue; 14782 } 14783 j = 0; 14784 while (w) { 14785 if (!(w & 0x1)) { 14786 j++; 14787 w >>= 1; 14788 continue; 14789 } 14790 rid = (i << BT_ULSHIFT) | j; 14791 j++; 14792 w >>= 1; 14793 14794 if (rid < SFMMU_MAX_HME_REGIONS) { 14795 rgnp = srdp->srd_hmergnp[rid]; 14796 ASSERT(rgnp->rgn_id == rid); 14797 ASSERT(rgnp->rgn_refcnt > 0); 14798 14799 if (rgnp->rgn_pgszc < TTE4M) { 14800 tte8k_cnt += rgnp->rgn_size >> 14801 TTE_PAGE_SHIFT(TTE8K); 14802 } else { 14803 ASSERT(rgnp->rgn_pgszc >= TTE4M); 14804 tte4m_cnt += rgnp->rgn_size >> 14805 TTE_PAGE_SHIFT(TTE4M); 14806 /* 14807 * Inflate SCD tsb0 by preallocating 14808 * 1/4 8k ttecnt for 4M regions to 14809 * allow for lgpg alloc failure. 14810 */ 14811 tte8k_cnt += rgnp->rgn_size >> 14812 (TTE_PAGE_SHIFT(TTE8K) + 2); 14813 } 14814 } else { 14815 rid -= SFMMU_MAX_HME_REGIONS; 14816 rgnp = srdp->srd_ismrgnp[rid]; 14817 ASSERT(rgnp->rgn_id == rid); 14818 ASSERT(rgnp->rgn_refcnt > 0); 14819 14820 ism_hatid = (sfmmu_t *)rgnp->rgn_obj; 14821 ASSERT(ism_hatid->sfmmu_ismhat); 14822 14823 for (szc = 0; szc < TTE4M; szc++) { 14824 tte8k_cnt += 14825 ism_hatid->sfmmu_ttecnt[szc] << 14826 TTE_BSZS_SHIFT(szc); 14827 } 14828 14829 ASSERT(rgnp->rgn_pgszc >= TTE4M); 14830 if (rgnp->rgn_pgszc >= TTE4M) { 14831 tte4m_cnt += rgnp->rgn_size >> 14832 TTE_PAGE_SHIFT(TTE4M); 14833 } 14834 } 14835 } 14836 } 14837 14838 tsb_szc = SELECT_TSB_SIZECODE(tte8k_cnt); 14839 14840 /* Allocate both the SCD TSBs here. */ 14841 if (sfmmu_tsbinfo_alloc(&scsfmmup->sfmmu_tsb, 14842 tsb_szc, TSB8K|TSB64K|TSB512K, TSB_ALLOC, scsfmmup) && 14843 (tsb_szc <= TSB_4M_SZCODE || 14844 sfmmu_tsbinfo_alloc(&scsfmmup->sfmmu_tsb, 14845 TSB_4M_SZCODE, TSB8K|TSB64K|TSB512K, 14846 TSB_ALLOC, scsfmmup))) { 14847 14848 SFMMU_STAT(sf_scd_1sttsb_allocfail); 14849 return (TSB_ALLOCFAIL); 14850 } else { 14851 scsfmmup->sfmmu_tsb->tsb_flags |= TSB_SHAREDCTX; 14852 14853 if (tte4m_cnt) { 14854 tsb_szc = SELECT_TSB_SIZECODE(tte4m_cnt); 14855 if (sfmmu_tsbinfo_alloc(&newtsb, tsb_szc, 14856 TSB4M|TSB32M|TSB256M, TSB_ALLOC, scsfmmup) && 14857 (tsb_szc <= TSB_4M_SZCODE || 14858 sfmmu_tsbinfo_alloc(&newtsb, TSB_4M_SZCODE, 14859 TSB4M|TSB32M|TSB256M, 14860 TSB_ALLOC, scsfmmup))) { 14861 /* 14862 * If we fail to allocate the 2nd shared tsb, 14863 * just free the 1st tsb, return failure. 
14864 */ 14865 sfmmu_tsbinfo_free(scsfmmup->sfmmu_tsb); 14866 SFMMU_STAT(sf_scd_2ndtsb_allocfail); 14867 return (TSB_ALLOCFAIL); 14868 } else { 14869 ASSERT(scsfmmup->sfmmu_tsb->tsb_next == NULL); 14870 newtsb->tsb_flags |= TSB_SHAREDCTX; 14871 scsfmmup->sfmmu_tsb->tsb_next = newtsb; 14872 SFMMU_STAT(sf_scd_2ndtsb_alloc); 14873 } 14874 } 14875 SFMMU_STAT(sf_scd_1sttsb_alloc); 14876 } 14877 return (TSB_SUCCESS); 14878 } 14879 14880 static void 14881 sfmmu_free_scd_tsbs(sfmmu_t *scd_sfmmu) 14882 { 14883 while (scd_sfmmu->sfmmu_tsb != NULL) { 14884 struct tsb_info *next = scd_sfmmu->sfmmu_tsb->tsb_next; 14885 sfmmu_tsbinfo_free(scd_sfmmu->sfmmu_tsb); 14886 scd_sfmmu->sfmmu_tsb = next; 14887 } 14888 } 14889 14890 /* 14891 * Link the sfmmu onto the hme region list. 14892 */ 14893 void 14894 sfmmu_link_to_hmeregion(sfmmu_t *sfmmup, sf_region_t *rgnp) 14895 { 14896 uint_t rid; 14897 sf_rgn_link_t *rlink; 14898 sfmmu_t *head; 14899 sf_rgn_link_t *hrlink; 14900 14901 rid = rgnp->rgn_id; 14902 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 14903 14904 /* LINTED: constant in conditional context */ 14905 SFMMU_HMERID2RLINKP(sfmmup, rid, rlink, 1, 1); 14906 ASSERT(rlink != NULL); 14907 mutex_enter(&rgnp->rgn_mutex); 14908 if ((head = rgnp->rgn_sfmmu_head) == NULL) { 14909 rlink->next = NULL; 14910 rlink->prev = NULL; 14911 /* 14912 * make sure rlink's next field is NULL 14913 * before making this link visible. 14914 */ 14915 membar_stst(); 14916 rgnp->rgn_sfmmu_head = sfmmup; 14917 } else { 14918 /* LINTED: constant in conditional context */ 14919 SFMMU_HMERID2RLINKP(head, rid, hrlink, 0, 0); 14920 ASSERT(hrlink != NULL); 14921 ASSERT(hrlink->prev == NULL); 14922 rlink->next = head; 14923 rlink->prev = NULL; 14924 hrlink->prev = sfmmup; 14925 /* 14926 * make sure rlink's next field is correct 14927 * before making this link visible. 14928 */ 14929 membar_stst(); 14930 rgnp->rgn_sfmmu_head = sfmmup; 14931 } 14932 mutex_exit(&rgnp->rgn_mutex); 14933 } 14934 14935 /* 14936 * Unlink the sfmmu from the hme region list. 14937 */ 14938 void 14939 sfmmu_unlink_from_hmeregion(sfmmu_t *sfmmup, sf_region_t *rgnp) 14940 { 14941 uint_t rid; 14942 sf_rgn_link_t *rlink; 14943 14944 rid = rgnp->rgn_id; 14945 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 14946 14947 /* LINTED: constant in conditional context */ 14948 SFMMU_HMERID2RLINKP(sfmmup, rid, rlink, 0, 0); 14949 ASSERT(rlink != NULL); 14950 mutex_enter(&rgnp->rgn_mutex); 14951 if (rgnp->rgn_sfmmu_head == sfmmup) { 14952 sfmmu_t *next = rlink->next; 14953 rgnp->rgn_sfmmu_head = next; 14954 /* 14955 * if we are stopped by xc_attention() after this 14956 * point the forward link walking in 14957 * sfmmu_rgntlb_demap() will work correctly since the 14958 * head correctly points to the next element. 
14959 */ 14960 membar_stst(); 14961 rlink->next = NULL; 14962 ASSERT(rlink->prev == NULL); 14963 if (next != NULL) { 14964 sf_rgn_link_t *nrlink; 14965 /* LINTED: constant in conditional context */ 14966 SFMMU_HMERID2RLINKP(next, rid, nrlink, 0, 0); 14967 ASSERT(nrlink != NULL); 14968 ASSERT(nrlink->prev == sfmmup); 14969 nrlink->prev = NULL; 14970 } 14971 } else { 14972 sfmmu_t *next = rlink->next; 14973 sfmmu_t *prev = rlink->prev; 14974 sf_rgn_link_t *prlink; 14975 14976 ASSERT(prev != NULL); 14977 /* LINTED: constant in conditional context */ 14978 SFMMU_HMERID2RLINKP(prev, rid, prlink, 0, 0); 14979 ASSERT(prlink != NULL); 14980 ASSERT(prlink->next == sfmmup); 14981 prlink->next = next; 14982 /* 14983 * if we are stopped by xc_attention() 14984 * after this point the forward link walking 14985 * will work correctly since the prev element 14986 * correctly points to the next element. 14987 */ 14988 membar_stst(); 14989 rlink->next = NULL; 14990 rlink->prev = NULL; 14991 if (next != NULL) { 14992 sf_rgn_link_t *nrlink; 14993 /* LINTED: constant in conditional context */ 14994 SFMMU_HMERID2RLINKP(next, rid, nrlink, 0, 0); 14995 ASSERT(nrlink != NULL); 14996 ASSERT(nrlink->prev == sfmmup); 14997 nrlink->prev = prev; 14998 } 14999 } 15000 mutex_exit(&rgnp->rgn_mutex); 15001 } 15002 15003 /* 15004 * Link scd sfmmu onto ism or hme region list for each region in the 15005 * scd region map. 15006 */ 15007 void 15008 sfmmu_link_scd_to_regions(sf_srd_t *srdp, sf_scd_t *scdp) 15009 { 15010 uint_t rid; 15011 uint_t i; 15012 uint_t j; 15013 ulong_t w; 15014 sf_region_t *rgnp; 15015 sfmmu_t *scsfmmup; 15016 15017 scsfmmup = scdp->scd_sfmmup; 15018 ASSERT(scsfmmup->sfmmu_scdhat); 15019 for (i = 0; i < SFMMU_RGNMAP_WORDS; i++) { 15020 if ((w = scdp->scd_region_map.bitmap[i]) == 0) { 15021 continue; 15022 } 15023 j = 0; 15024 while (w) { 15025 if (!(w & 0x1)) { 15026 j++; 15027 w >>= 1; 15028 continue; 15029 } 15030 rid = (i << BT_ULSHIFT) | j; 15031 j++; 15032 w >>= 1; 15033 15034 if (rid < SFMMU_MAX_HME_REGIONS) { 15035 rgnp = srdp->srd_hmergnp[rid]; 15036 ASSERT(rgnp->rgn_id == rid); 15037 ASSERT(rgnp->rgn_refcnt > 0); 15038 sfmmu_link_to_hmeregion(scsfmmup, rgnp); 15039 } else { 15040 sfmmu_t *ism_hatid = NULL; 15041 ism_ment_t *ism_ment; 15042 rid -= SFMMU_MAX_HME_REGIONS; 15043 rgnp = srdp->srd_ismrgnp[rid]; 15044 ASSERT(rgnp->rgn_id == rid); 15045 ASSERT(rgnp->rgn_refcnt > 0); 15046 15047 ism_hatid = (sfmmu_t *)rgnp->rgn_obj; 15048 ASSERT(ism_hatid->sfmmu_ismhat); 15049 ism_ment = &scdp->scd_ism_links[rid]; 15050 ism_ment->iment_hat = scsfmmup; 15051 ism_ment->iment_base_va = rgnp->rgn_saddr; 15052 mutex_enter(&ism_mlist_lock); 15053 iment_add(ism_ment, ism_hatid); 15054 mutex_exit(&ism_mlist_lock); 15055 15056 } 15057 } 15058 } 15059 } 15060 /* 15061 * Unlink scd sfmmu from ism or hme region list for each region in the 15062 * scd region map. 
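 * Both region types share one scd_region_map: bits below
 * SFMMU_MAX_HME_REGIONS are hme region ids and the remaining bits are
 * ism region ids biased by SFMMU_MAX_HME_REGIONS.  The walk below
 * recovers each set bit as rid = (i << BT_ULSHIFT) | j, mirroring
 * sfmmu_link_scd_to_regions() above.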
15063 */ 15064 void 15065 sfmmu_unlink_scd_from_regions(sf_srd_t *srdp, sf_scd_t *scdp) 15066 { 15067 uint_t rid; 15068 uint_t i; 15069 uint_t j; 15070 ulong_t w; 15071 sf_region_t *rgnp; 15072 sfmmu_t *scsfmmup; 15073 15074 scsfmmup = scdp->scd_sfmmup; 15075 for (i = 0; i < SFMMU_RGNMAP_WORDS; i++) { 15076 if ((w = scdp->scd_region_map.bitmap[i]) == 0) { 15077 continue; 15078 } 15079 j = 0; 15080 while (w) { 15081 if (!(w & 0x1)) { 15082 j++; 15083 w >>= 1; 15084 continue; 15085 } 15086 rid = (i << BT_ULSHIFT) | j; 15087 j++; 15088 w >>= 1; 15089 15090 if (rid < SFMMU_MAX_HME_REGIONS) { 15091 rgnp = srdp->srd_hmergnp[rid]; 15092 ASSERT(rgnp->rgn_id == rid); 15093 ASSERT(rgnp->rgn_refcnt > 0); 15094 sfmmu_unlink_from_hmeregion(scsfmmup, 15095 rgnp); 15096 15097 } else { 15098 sfmmu_t *ism_hatid = NULL; 15099 ism_ment_t *ism_ment; 15100 rid -= SFMMU_MAX_HME_REGIONS; 15101 rgnp = srdp->srd_ismrgnp[rid]; 15102 ASSERT(rgnp->rgn_id == rid); 15103 ASSERT(rgnp->rgn_refcnt > 0); 15104 15105 ism_hatid = (sfmmu_t *)rgnp->rgn_obj; 15106 ASSERT(ism_hatid->sfmmu_ismhat); 15107 ism_ment = &scdp->scd_ism_links[rid]; 15108 ASSERT(ism_ment->iment_hat == scdp->scd_sfmmup); 15109 ASSERT(ism_ment->iment_base_va == 15110 rgnp->rgn_saddr); 15111 ism_ment->iment_hat = NULL; 15112 ism_ment->iment_base_va = 0; 15113 mutex_enter(&ism_mlist_lock); 15114 iment_sub(ism_ment, ism_hatid); 15115 mutex_exit(&ism_mlist_lock); 15116 15117 } 15118 } 15119 } 15120 } 15121 /* 15122 * Allocates and initialises a new SCD structure, this is called with 15123 * the srd_scd_mutex held and returns with the reference count 15124 * initialised to 1. 15125 */ 15126 static sf_scd_t * 15127 sfmmu_alloc_scd(sf_srd_t *srdp, sf_region_map_t *new_map) 15128 { 15129 sf_scd_t *new_scdp; 15130 sfmmu_t *scsfmmup; 15131 int i; 15132 15133 ASSERT(MUTEX_HELD(&srdp->srd_scd_mutex)); 15134 new_scdp = kmem_cache_alloc(scd_cache, KM_SLEEP); 15135 15136 scsfmmup = kmem_cache_alloc(sfmmuid_cache, KM_SLEEP); 15137 new_scdp->scd_sfmmup = scsfmmup; 15138 scsfmmup->sfmmu_srdp = srdp; 15139 scsfmmup->sfmmu_scdp = new_scdp; 15140 scsfmmup->sfmmu_tsb0_4minflcnt = 0; 15141 scsfmmup->sfmmu_scdhat = 1; 15142 CPUSET_ALL(scsfmmup->sfmmu_cpusran); 15143 bzero(scsfmmup->sfmmu_hmeregion_links, SFMMU_L1_HMERLINKS_SIZE); 15144 15145 ASSERT(max_mmu_ctxdoms > 0); 15146 for (i = 0; i < max_mmu_ctxdoms; i++) { 15147 scsfmmup->sfmmu_ctxs[i].cnum = INVALID_CONTEXT; 15148 scsfmmup->sfmmu_ctxs[i].gnum = 0; 15149 } 15150 15151 for (i = 0; i < MMU_PAGE_SIZES; i++) { 15152 new_scdp->scd_rttecnt[i] = 0; 15153 } 15154 15155 new_scdp->scd_region_map = *new_map; 15156 new_scdp->scd_refcnt = 1; 15157 if (sfmmu_alloc_scd_tsbs(srdp, new_scdp) != TSB_SUCCESS) { 15158 kmem_cache_free(scd_cache, new_scdp); 15159 kmem_cache_free(sfmmuid_cache, scsfmmup); 15160 return (NULL); 15161 } 15162 if (&mmu_init_scd) { 15163 mmu_init_scd(new_scdp); 15164 } 15165 return (new_scdp); 15166 } 15167 15168 /* 15169 * The first phase of a process joining an SCD. The hat structure is 15170 * linked to the SCD queue and then the HAT_JOIN_SCD sfmmu flag is set 15171 * and a cross-call with context invalidation is used to cause the 15172 * remaining work to be carried out in the sfmmu_tsbmiss_exception() 15173 * routine. 
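 * The second phase, sfmmu_finish_join_scd() below, runs from
 * sfmmu_tsbmiss_exception() once the invalidated context forces the
 * process back into the trap handler: it invalidates the private TSBs
 * and sets HAT_CTX1_FLAG on the SCD's ISM maps (sfmmu_ism_hatflags())
 * so those mappings are loaded with the shared context from then on.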
15174 */ 15175 static void 15176 sfmmu_join_scd(sf_scd_t *scdp, sfmmu_t *sfmmup) 15177 { 15178 hatlock_t *hatlockp; 15179 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 15180 int i; 15181 sf_scd_t *old_scdp; 15182 15183 ASSERT(srdp != NULL); 15184 ASSERT(scdp != NULL); 15185 ASSERT(scdp->scd_refcnt > 0); 15186 ASSERT(AS_WRITE_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 15187 15188 if ((old_scdp = sfmmup->sfmmu_scdp) != NULL) { 15189 ASSERT(old_scdp != scdp); 15190 15191 mutex_enter(&old_scdp->scd_mutex); 15192 sfmmu_from_scd_list(&old_scdp->scd_sf_list, sfmmup); 15193 mutex_exit(&old_scdp->scd_mutex); 15194 /* 15195 * sfmmup leaves the old scd. Update sfmmu_ttecnt to 15196 * include the shme rgn ttecnt for rgns that 15197 * were in the old SCD 15198 */ 15199 for (i = 0; i < mmu_page_sizes; i++) { 15200 ASSERT(sfmmup->sfmmu_scdrttecnt[i] == 15201 old_scdp->scd_rttecnt[i]); 15202 atomic_add_long(&sfmmup->sfmmu_ttecnt[i], 15203 sfmmup->sfmmu_scdrttecnt[i]); 15204 } 15205 } 15206 15207 /* 15208 * Move sfmmu to the scd lists. 15209 */ 15210 mutex_enter(&scdp->scd_mutex); 15211 sfmmu_to_scd_list(&scdp->scd_sf_list, sfmmup); 15212 mutex_exit(&scdp->scd_mutex); 15213 SF_SCD_INCR_REF(scdp); 15214 15215 hatlockp = sfmmu_hat_enter(sfmmup); 15216 /* 15217 * For a multi-thread process, we must stop 15218 * all the other threads before joining the scd. 15219 */ 15220 15221 SFMMU_FLAGS_SET(sfmmup, HAT_JOIN_SCD); 15222 15223 sfmmu_invalidate_ctx(sfmmup); 15224 sfmmup->sfmmu_scdp = scdp; 15225 15226 /* 15227 * Copy scd_rttecnt into sfmmup's sfmmu_scdrttecnt, and update 15228 * sfmmu_ttecnt to not include the rgn ttecnt just joined in SCD. 15229 */ 15230 for (i = 0; i < mmu_page_sizes; i++) { 15231 sfmmup->sfmmu_scdrttecnt[i] = scdp->scd_rttecnt[i]; 15232 ASSERT(sfmmup->sfmmu_ttecnt[i] >= scdp->scd_rttecnt[i]); 15233 atomic_add_long(&sfmmup->sfmmu_ttecnt[i], 15234 -sfmmup->sfmmu_scdrttecnt[i]); 15235 if (!sfmmup->sfmmu_ttecnt[i]) { 15236 sfmmup->sfmmu_tteflags &= ~(1 << i); 15237 } 15238 } 15239 /* update tsb0 inflation count */ 15240 if (old_scdp != NULL) { 15241 sfmmup->sfmmu_tsb0_4minflcnt += 15242 old_scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt; 15243 } 15244 ASSERT(sfmmup->sfmmu_tsb0_4minflcnt >= 15245 scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt); 15246 sfmmup->sfmmu_tsb0_4minflcnt -= scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt; 15247 15248 if (&mmu_set_pgsz_order) { 15249 mmu_set_pgsz_order(sfmmup, 0); 15250 } 15251 sfmmu_hat_exit(hatlockp); 15252 15253 if (old_scdp != NULL) { 15254 SF_SCD_DECR_REF(srdp, old_scdp); 15255 } 15256 15257 } 15258 15259 /* 15260 * This routine is called by a process to become part of an SCD. It is called 15261 * from sfmmu_tsbmiss_exception() once most of the initial work has been 15262 * done by sfmmu_join_scd(). This routine must not drop the hat lock. 
15263 */ 15264 static void 15265 sfmmu_finish_join_scd(sfmmu_t *sfmmup) 15266 { 15267 struct tsb_info *tsbinfop; 15268 15269 ASSERT(sfmmu_hat_lock_held(sfmmup)); 15270 ASSERT(sfmmup->sfmmu_scdp != NULL); 15271 ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD)); 15272 ASSERT(!SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)); 15273 ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_ALLCTX_INVALID)); 15274 15275 for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL; 15276 tsbinfop = tsbinfop->tsb_next) { 15277 if (tsbinfop->tsb_flags & TSB_SWAPPED) { 15278 continue; 15279 } 15280 ASSERT(!(tsbinfop->tsb_flags & TSB_RELOC_FLAG)); 15281 15282 sfmmu_inv_tsb(tsbinfop->tsb_va, 15283 TSB_BYTES(tsbinfop->tsb_szc)); 15284 } 15285 15286 /* Set HAT_CTX1_FLAG for all SCD ISMs */ 15287 sfmmu_ism_hatflags(sfmmup, 1); 15288 15289 SFMMU_STAT(sf_join_scd); 15290 } 15291 15292 /* 15293 * This routine is called in order to check if there is an SCD which matches 15294 * the process's region map if not then a new SCD may be created. 15295 */ 15296 static void 15297 sfmmu_find_scd(sfmmu_t *sfmmup) 15298 { 15299 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 15300 sf_scd_t *scdp, *new_scdp; 15301 int ret; 15302 15303 ASSERT(srdp != NULL); 15304 ASSERT(AS_WRITE_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 15305 15306 mutex_enter(&srdp->srd_scd_mutex); 15307 for (scdp = srdp->srd_scdp; scdp != NULL; 15308 scdp = scdp->scd_next) { 15309 SF_RGNMAP_EQUAL(&scdp->scd_region_map, 15310 &sfmmup->sfmmu_region_map, SFMMU_RGNMAP_WORDS, ret); 15311 if (ret == 1) { 15312 SF_SCD_INCR_REF(scdp); 15313 mutex_exit(&srdp->srd_scd_mutex); 15314 sfmmu_join_scd(scdp, sfmmup); 15315 ASSERT(scdp->scd_refcnt >= 2); 15316 atomic_add_32((volatile uint32_t *) 15317 &scdp->scd_refcnt, -1); 15318 return; 15319 } else { 15320 /* 15321 * If the sfmmu region map is a subset of the scd 15322 * region map, then the assumption is that this process 15323 * will continue attaching to ISM segments until the 15324 * region maps are equal. 15325 */ 15326 SF_RGNMAP_IS_SUBSET(&scdp->scd_region_map, 15327 &sfmmup->sfmmu_region_map, ret); 15328 if (ret == 1) { 15329 mutex_exit(&srdp->srd_scd_mutex); 15330 return; 15331 } 15332 } 15333 } 15334 15335 ASSERT(scdp == NULL); 15336 /* 15337 * No matching SCD has been found, create a new one. 15338 */ 15339 if ((new_scdp = sfmmu_alloc_scd(srdp, &sfmmup->sfmmu_region_map)) == 15340 NULL) { 15341 mutex_exit(&srdp->srd_scd_mutex); 15342 return; 15343 } 15344 15345 /* 15346 * sfmmu_alloc_scd() returns with a ref count of 1 on the scd. 15347 */ 15348 15349 /* Set scd_rttecnt for shme rgns in SCD */ 15350 sfmmu_set_scd_rttecnt(srdp, new_scdp); 15351 15352 /* 15353 * Link scd onto srd_scdp list and scd sfmmu onto region/iment lists. 15354 */ 15355 sfmmu_link_scd_to_regions(srdp, new_scdp); 15356 sfmmu_add_scd(&srdp->srd_scdp, new_scdp); 15357 SFMMU_STAT_ADD(sf_create_scd, 1); 15358 15359 mutex_exit(&srdp->srd_scd_mutex); 15360 sfmmu_join_scd(new_scdp, sfmmup); 15361 ASSERT(new_scdp->scd_refcnt >= 2); 15362 atomic_add_32((volatile uint32_t *)&new_scdp->scd_refcnt, -1); 15363 } 15364 15365 /* 15366 * This routine is called by a process to remove itself from an SCD. It is 15367 * either called when the processes has detached from a segment or from 15368 * hat_free_start() as a result of calling exit. 
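 * In the exit path (sfmmu_free is set) only the scd_sf_list unlinking
 * and the ttecnt bookkeeping are needed; otherwise the hat's context is
 * invalidated first so the remaining threads fall back to the private
 * context before the shared-context state is torn down.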
15369 */ 15370 static void 15371 sfmmu_leave_scd(sfmmu_t *sfmmup, uchar_t r_type) 15372 { 15373 sf_scd_t *scdp = sfmmup->sfmmu_scdp; 15374 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 15375 hatlock_t *hatlockp = TSB_HASH(sfmmup); 15376 int i; 15377 15378 ASSERT(scdp != NULL); 15379 ASSERT(srdp != NULL); 15380 15381 if (sfmmup->sfmmu_free) { 15382 /* 15383 * If the process is part of an SCD the sfmmu is unlinked 15384 * from scd_sf_list. 15385 */ 15386 mutex_enter(&scdp->scd_mutex); 15387 sfmmu_from_scd_list(&scdp->scd_sf_list, sfmmup); 15388 mutex_exit(&scdp->scd_mutex); 15389 /* 15390 * Update sfmmu_ttecnt to include the rgn ttecnt for rgns that 15391 * are about to leave the SCD 15392 */ 15393 for (i = 0; i < mmu_page_sizes; i++) { 15394 ASSERT(sfmmup->sfmmu_scdrttecnt[i] == 15395 scdp->scd_rttecnt[i]); 15396 atomic_add_long(&sfmmup->sfmmu_ttecnt[i], 15397 sfmmup->sfmmu_scdrttecnt[i]); 15398 sfmmup->sfmmu_scdrttecnt[i] = 0; 15399 } 15400 sfmmup->sfmmu_scdp = NULL; 15401 15402 SF_SCD_DECR_REF(srdp, scdp); 15403 return; 15404 } 15405 15406 ASSERT(r_type != SFMMU_REGION_ISM || 15407 SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)); 15408 ASSERT(scdp->scd_refcnt); 15409 ASSERT(!sfmmup->sfmmu_free); 15410 ASSERT(sfmmu_hat_lock_held(sfmmup)); 15411 ASSERT(AS_LOCK_HELD(sfmmup->sfmmu_as, &sfmmup->sfmmu_as->a_lock)); 15412 15413 /* 15414 * Wait for ISM maps to be updated. 15415 */ 15416 if (r_type != SFMMU_REGION_ISM) { 15417 while (SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY) && 15418 sfmmup->sfmmu_scdp != NULL) { 15419 cv_wait(&sfmmup->sfmmu_tsb_cv, 15420 HATLOCK_MUTEXP(hatlockp)); 15421 } 15422 15423 if (sfmmup->sfmmu_scdp == NULL) { 15424 sfmmu_hat_exit(hatlockp); 15425 return; 15426 } 15427 SFMMU_FLAGS_SET(sfmmup, HAT_ISMBUSY); 15428 } 15429 15430 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD)) { 15431 SFMMU_FLAGS_CLEAR(sfmmup, HAT_JOIN_SCD); 15432 /* 15433 * Since HAT_JOIN_SCD was set our context 15434 * is still invalid. 15435 */ 15436 } else { 15437 /* 15438 * For a multi-thread process, we must stop 15439 * all the other threads before leaving the scd. 15440 */ 15441 15442 sfmmu_invalidate_ctx(sfmmup); 15443 } 15444 15445 /* Clear all the rid's for ISM, delete flags, etc */ 15446 ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)); 15447 sfmmu_ism_hatflags(sfmmup, 0); 15448 15449 /* 15450 * Update sfmmu_ttecnt to include the rgn ttecnt for rgns that 15451 * are in SCD before this sfmmup leaves the SCD. 
15452 */ 15453 for (i = 0; i < mmu_page_sizes; i++) { 15454 ASSERT(sfmmup->sfmmu_scdrttecnt[i] == 15455 scdp->scd_rttecnt[i]); 15456 atomic_add_long(&sfmmup->sfmmu_ttecnt[i], 15457 sfmmup->sfmmu_scdrttecnt[i]); 15458 if (sfmmup->sfmmu_ttecnt[i] && 15459 (sfmmup->sfmmu_tteflags & (1 << i)) == 0) { 15460 sfmmup->sfmmu_tteflags |= (1 << i); 15461 } 15462 sfmmup->sfmmu_scdrttecnt[i] = 0; 15463 /* update ismttecnt to include SCD ism before hat leaves SCD */ 15464 sfmmup->sfmmu_ismttecnt[i] += sfmmup->sfmmu_scdismttecnt[i]; 15465 sfmmup->sfmmu_scdismttecnt[i] = 0; 15466 } 15467 /* update tsb0 inflation count */ 15468 sfmmup->sfmmu_tsb0_4minflcnt += scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt; 15469 15470 if (r_type != SFMMU_REGION_ISM) { 15471 SFMMU_FLAGS_CLEAR(sfmmup, HAT_ISMBUSY); 15472 } 15473 sfmmup->sfmmu_scdp = NULL; 15474 15475 if (&mmu_set_pgsz_order) { 15476 mmu_set_pgsz_order(sfmmup, 0); 15477 } 15478 sfmmu_hat_exit(hatlockp); 15479 15480 /* 15481 * Unlink sfmmu from scd_sf_list this can be done without holding 15482 * the hat lock as we hold the sfmmu_as lock which prevents 15483 * hat_join_region from adding this thread to the scd again. Other 15484 * threads check if sfmmu_scdp is NULL under hat lock and if it's NULL 15485 * they won't get here, since sfmmu_leave_scd() clears sfmmu_scdp 15486 * while holding the hat lock. 15487 */ 15488 mutex_enter(&scdp->scd_mutex); 15489 sfmmu_from_scd_list(&scdp->scd_sf_list, sfmmup); 15490 mutex_exit(&scdp->scd_mutex); 15491 SFMMU_STAT(sf_leave_scd); 15492 15493 SF_SCD_DECR_REF(srdp, scdp); 15494 hatlockp = sfmmu_hat_enter(sfmmup); 15495 15496 } 15497 15498 /* 15499 * Unlink and free up an SCD structure with a reference count of 0. 15500 */ 15501 static void 15502 sfmmu_destroy_scd(sf_srd_t *srdp, sf_scd_t *scdp, sf_region_map_t *scd_rmap) 15503 { 15504 sfmmu_t *scsfmmup; 15505 sf_scd_t *sp; 15506 hatlock_t *shatlockp; 15507 int i, ret; 15508 15509 mutex_enter(&srdp->srd_scd_mutex); 15510 for (sp = srdp->srd_scdp; sp != NULL; sp = sp->scd_next) { 15511 if (sp == scdp) 15512 break; 15513 } 15514 if (sp == NULL || sp->scd_refcnt) { 15515 mutex_exit(&srdp->srd_scd_mutex); 15516 return; 15517 } 15518 15519 /* 15520 * It is possible that the scd has been freed and reallocated with a 15521 * different region map while we've been waiting for the srd_scd_mutex. 15522 */ 15523 SF_RGNMAP_EQUAL(scd_rmap, &sp->scd_region_map, 15524 SFMMU_RGNMAP_WORDS, ret); 15525 if (ret != 1) { 15526 mutex_exit(&srdp->srd_scd_mutex); 15527 return; 15528 } 15529 15530 ASSERT(scdp->scd_sf_list == NULL); 15531 /* 15532 * Unlink scd from srd_scdp list. 15533 */ 15534 sfmmu_remove_scd(&srdp->srd_scdp, scdp); 15535 mutex_exit(&srdp->srd_scd_mutex); 15536 15537 sfmmu_unlink_scd_from_regions(srdp, scdp); 15538 15539 /* Clear shared context tsb and release ctx */ 15540 scsfmmup = scdp->scd_sfmmup; 15541 15542 /* 15543 * create a barrier so that scd will not be destroyed 15544 * if other thread still holds the same shared hat lock. 15545 * E.g., sfmmu_tsbmiss_exception() needs to acquire the 15546 * shared hat lock before checking the shared tsb reloc flag. 
	 */
	shatlockp = sfmmu_hat_enter(scsfmmup);
	sfmmu_hat_exit(shatlockp);

	sfmmu_free_scd_tsbs(scsfmmup);

	for (i = 0; i < SFMMU_L1_HMERLINKS; i++) {
		if (scsfmmup->sfmmu_hmeregion_links[i] != NULL) {
			kmem_free(scsfmmup->sfmmu_hmeregion_links[i],
			    SFMMU_L2_HMERLINKS_SIZE);
			scsfmmup->sfmmu_hmeregion_links[i] = NULL;
		}
	}
	kmem_cache_free(sfmmuid_cache, scsfmmup);
	kmem_cache_free(scd_cache, scdp);
	SFMMU_STAT(sf_destroy_scd);
}

/*
 * Modifies the HAT_CTX1_FLAG for each of the ISM segments which correspond to
 * bits set in the SCD's ISM region map (scd_ismregion_map). This flag
 * indicates to the tsbmiss handler that the mapping for these segments should
 * be loaded using the shared context.
 */
static void
sfmmu_ism_hatflags(sfmmu_t *sfmmup, int addflag)
{
	sf_scd_t *scdp = sfmmup->sfmmu_scdp;
	ism_blk_t *ism_blkp;
	ism_map_t *ism_map;
	int i, rid;

	ASSERT(sfmmup->sfmmu_iblk != NULL);
	ASSERT(scdp != NULL);
	/*
	 * Note that the caller either set the HAT_ISMBUSY flag or checked
	 * under hat lock that HAT_ISMBUSY was not set by another thread.
	 */
	ASSERT(sfmmu_hat_lock_held(sfmmup));

	ism_blkp = sfmmup->sfmmu_iblk;
	while (ism_blkp != NULL) {
		ism_map = ism_blkp->iblk_maps;
		for (i = 0; ism_map[i].imap_ismhat && i < ISM_MAP_SLOTS; i++) {
			rid = ism_map[i].imap_rid;
			if (rid == SFMMU_INVALID_ISMRID) {
				continue;
			}
			ASSERT(rid >= 0 && rid < SFMMU_MAX_ISM_REGIONS);
			if (SF_RGNMAP_TEST(scdp->scd_ismregion_map, rid) &&
			    addflag) {
				ism_map[i].imap_hatflags |= HAT_CTX1_FLAG;
			} else {
				ism_map[i].imap_hatflags &= ~HAT_CTX1_FLAG;
			}
		}
		ism_blkp = ism_blkp->iblk_next;
	}
}

static int
sfmmu_srd_lock_held(sf_srd_t *srdp)
{
	return (MUTEX_HELD(&srdp->srd_mutex));
}

/* ARGSUSED */
static int
sfmmu_scdcache_constructor(void *buf, void *cdrarg, int kmflags)
{
	sf_scd_t *scdp = (sf_scd_t *)buf;

	bzero(buf, sizeof (sf_scd_t));
	mutex_init(&scdp->scd_mutex, NULL, MUTEX_DEFAULT, NULL);
	return (0);
}

/* ARGSUSED */
static void
sfmmu_scdcache_destructor(void *buf, void *cdrarg)
{
	sf_scd_t *scdp = (sf_scd_t *)buf;

	mutex_destroy(&scdp->scd_mutex);
}

/*
 * The listp parameter is a pointer to a list of hmeblks which are partially
 * freed as a result of calling sfmmu_hblk_hash_rm(); the last phase of the
 * freeing process is to cross-call all cpus to ensure that there are no
 * remaining cached references.
 *
 * If the local generation number is less than the global one, then we can
 * free hmeblks which are already on the pending queue, as another cpu has
 * completed the cross-call.
 *
 * We cross-call to make sure that there are no threads on other cpus accessing
 * these hmeblks and then complete the process of freeing them under the
 * following conditions:
 *	The total number of pending hmeblks is greater than the threshold
 *	The reserve list has fewer than HBLK_RESERVE_CNT hmeblks
 *	It is at least 1 second since the last time we cross-called
 *
 * Otherwise, we add the hmeblks to the per-cpu pending queue.
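 *
 * Schematically (an illustrative recap of the checks in the function body
 * below, not a separate code path):
 *
 *	if (!dontfree && (freehblkcnt < HBLK_RESERVE_CNT ||
 *	    cpuhp->chp_count + count > cpu_hme_pend_thresh ||
 *	    one_second_expired))
 *		append pending list, cross-call, then sfmmu_hblk_free();
 *	else
 *		queue the hmeblks on the per-cpu pending list;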
 */
static void
sfmmu_hblks_list_purge(struct hme_blk **listp, int dontfree)
{
	struct hme_blk *hblkp, *pr_hblkp = NULL;
	int count = 0;
	cpuset_t cpuset = cpu_ready_set;
	cpu_hme_pend_t *cpuhp;
	timestruc_t now;
	int one_second_expired = 0;

	gethrestime_lasttick(&now);

	for (hblkp = *listp; hblkp != NULL; hblkp = hblkp->hblk_next) {
		ASSERT(hblkp->hblk_shw_bit == 0);
		ASSERT(hblkp->hblk_shared == 0);
		count++;
		pr_hblkp = hblkp;
	}

	cpuhp = &cpu_hme_pend[CPU->cpu_seqid];
	mutex_enter(&cpuhp->chp_mutex);

	if ((cpuhp->chp_count + count) == 0) {
		mutex_exit(&cpuhp->chp_mutex);
		return;
	}

	if ((now.tv_sec - cpuhp->chp_timestamp) > 1) {
		one_second_expired = 1;
	}

	if (!dontfree && (freehblkcnt < HBLK_RESERVE_CNT ||
	    (cpuhp->chp_count + count) > cpu_hme_pend_thresh ||
	    one_second_expired)) {
		/* Append global list to local */
		if (pr_hblkp == NULL) {
			*listp = cpuhp->chp_listp;
		} else {
			pr_hblkp->hblk_next = cpuhp->chp_listp;
		}
		cpuhp->chp_listp = NULL;
		cpuhp->chp_count = 0;
		cpuhp->chp_timestamp = now.tv_sec;
		mutex_exit(&cpuhp->chp_mutex);

		kpreempt_disable();
		CPUSET_DEL(cpuset, CPU->cpu_id);
		xt_sync(cpuset);
		xt_sync(cpuset);
		kpreempt_enable();

		/*
		 * At this stage we know that no trap handlers on other
		 * cpus can have references to hmeblks on the list.
		 */
		sfmmu_hblk_free(listp);
	} else if (*listp != NULL) {
		pr_hblkp->hblk_next = cpuhp->chp_listp;
		cpuhp->chp_listp = *listp;
		cpuhp->chp_count += count;
		*listp = NULL;
		mutex_exit(&cpuhp->chp_mutex);
	} else {
		mutex_exit(&cpuhp->chp_mutex);
	}
}

/*
 * Add an hmeblk to the hash list.
 */
void
sfmmu_hblk_hash_add(struct hmehash_bucket *hmebp, struct hme_blk *hmeblkp,
    uint64_t hblkpa)
{
	ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));
#ifdef	DEBUG
	if (hmebp->hmeblkp == NULL) {
		ASSERT(hmebp->hmeh_nextpa == HMEBLK_ENDPA);
	}
#endif /* DEBUG */

	hmeblkp->hblk_nextpa = hmebp->hmeh_nextpa;
	/*
	 * Since the TSB miss handler now does not lock the hash chain before
	 * walking it, make sure that the hmeblk's nextpa is globally visible
	 * before we make the hmeblk itself globally visible by updating the
	 * chain root pointer in the hash bucket.
	 */
	membar_producer();
	hmebp->hmeh_nextpa = hblkpa;
	hmeblkp->hblk_next = hmebp->hmeblkp;
	hmebp->hmeblkp = hmeblkp;

}

/*
 * This function is the first part of a two-part process to remove an hmeblk
 * from the hash chain. In this phase we unlink the hmeblk from the hash chain
 * but leave the next physical pointer unchanged. The hmeblk is then linked
 * onto a per-cpu pending list using the virtual address pointer.
 *
 * TSB miss trap handlers that start after this phase will no longer see
 * this hmeblk. TSB miss handlers that still cache this hmeblk in a register
 * can still use it for further chain traversal because we haven't yet modified
 * the next physical pointer or freed it.
 *
 * In the second phase of hmeblk removal we'll issue a barrier xcall before
 * we reuse or free this hmeblk.
 * This will make sure all lingering references to the hmeblk after the
 * first phase disappear before we finally reclaim it.
 * This scheme eliminates the need for TSB miss handlers to lock hmeblk chains
 * during their traversal.
 *
 * The hmehash_mutex must be held when calling this function.
 *
 * Input:
 *	hmebp - hme hash bucket pointer
 *	hmeblkp - address of hmeblk to be removed
 *	pr_hblk - virtual address of previous hmeblkp
 *	listp - pointer to list of hmeblks linked by virtual address
 *	free_now flag - indicates that a complete removal from the hash chains
 *	is necessary.
 *
 * It is inefficient to use the free_now flag, as a cross-call is required to
 * remove a single hmeblk from the hash chain, but it is necessary when hmeblks
 * are in short supply.
 *
 * (An illustrative caller sketch follows sfmmu_check_pending_hblks() below.)
 */
void
sfmmu_hblk_hash_rm(struct hmehash_bucket *hmebp, struct hme_blk *hmeblkp,
    struct hme_blk *pr_hblk, struct hme_blk **listp,
    int free_now)
{
	int shw_size, vshift;
	struct hme_blk *shw_hblkp;
	uint_t shw_mask, newshw_mask;
	caddr_t vaddr;
	int size;
	cpuset_t cpuset = cpu_ready_set;

	ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));

	if (hmebp->hmeblkp == hmeblkp) {
		hmebp->hmeh_nextpa = hmeblkp->hblk_nextpa;
		hmebp->hmeblkp = hmeblkp->hblk_next;
	} else {
		pr_hblk->hblk_nextpa = hmeblkp->hblk_nextpa;
		pr_hblk->hblk_next = hmeblkp->hblk_next;
	}

	size = get_hblk_ttesz(hmeblkp);
	shw_hblkp = hmeblkp->hblk_shadow;
	if (shw_hblkp) {
		ASSERT(hblktosfmmu(hmeblkp) != KHATID);
		ASSERT(!hmeblkp->hblk_shared);
#ifdef	DEBUG
		if (mmu_page_sizes == max_mmu_page_sizes) {
			ASSERT(size < TTE256M);
		} else {
			ASSERT(size < TTE4M);
		}
#endif /* DEBUG */

		shw_size = get_hblk_ttesz(shw_hblkp);
		vaddr = (caddr_t)get_hblk_base(hmeblkp);
		vshift = vaddr_to_vshift(shw_hblkp->hblk_tag, vaddr, shw_size);
		ASSERT(vshift < 8);
		/*
		 * Atomically clear shadow mask bit
		 */
		do {
			shw_mask = shw_hblkp->hblk_shw_mask;
			ASSERT(shw_mask & (1 << vshift));
			newshw_mask = shw_mask & ~(1 << vshift);
			newshw_mask = cas32(&shw_hblkp->hblk_shw_mask,
			    shw_mask, newshw_mask);
		} while (newshw_mask != shw_mask);
		hmeblkp->hblk_shadow = NULL;
	}
	hmeblkp->hblk_shw_bit = 0;

	if (hmeblkp->hblk_shared) {
#ifdef	DEBUG
		sf_srd_t	*srdp;
		sf_region_t	*rgnp;
		uint_t		rid;

		srdp = hblktosrd(hmeblkp);
		ASSERT(srdp != NULL && srdp->srd_refcnt != 0);
		rid = hmeblkp->hblk_tag.htag_rid;
		ASSERT(SFMMU_IS_SHMERID_VALID(rid));
		ASSERT(rid < SFMMU_MAX_HME_REGIONS);
		rgnp = srdp->srd_hmergnp[rid];
		ASSERT(rgnp != NULL);
		SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid);
#endif /* DEBUG */
		hmeblkp->hblk_shared = 0;
	}
	if (free_now) {
		kpreempt_disable();
		CPUSET_DEL(cpuset, CPU->cpu_id);
		xt_sync(cpuset);
		xt_sync(cpuset);
		kpreempt_enable();

		hmeblkp->hblk_nextpa = HMEBLK_ENDPA;
		hmeblkp->hblk_next = NULL;
	} else {
		/* Append hmeblkp to listp for processing later. */
		hmeblkp->hblk_next = *listp;
		*listp = hmeblkp;
	}
}

/*
 * This routine is called when memory is in short supply and returns a free
 * hmeblk of the requested size from the cpu pending lists.
 */
static struct hme_blk *
sfmmu_check_pending_hblks(int size)
{
	int i;
	struct hme_blk *hmeblkp = NULL, *last_hmeblkp;
	int found_hmeblk;
	cpuset_t cpuset = cpu_ready_set;
	cpu_hme_pend_t *cpuhp;

	/* Flush cpu hblk pending queues */
	for (i = 0; i < NCPU; i++) {
		cpuhp = &cpu_hme_pend[i];
		if (cpuhp->chp_listp != NULL) {
			mutex_enter(&cpuhp->chp_mutex);
			if (cpuhp->chp_listp == NULL) {
				mutex_exit(&cpuhp->chp_mutex);
				continue;
			}
			found_hmeblk = 0;
			last_hmeblkp = NULL;
			for (hmeblkp = cpuhp->chp_listp; hmeblkp != NULL;
			    hmeblkp = hmeblkp->hblk_next) {
				if (get_hblk_ttesz(hmeblkp) == size) {
					if (last_hmeblkp == NULL) {
						cpuhp->chp_listp =
						    hmeblkp->hblk_next;
					} else {
						last_hmeblkp->hblk_next =
						    hmeblkp->hblk_next;
					}
					ASSERT(cpuhp->chp_count > 0);
					cpuhp->chp_count--;
					found_hmeblk = 1;
					break;
				} else {
					last_hmeblkp = hmeblkp;
				}
			}
			mutex_exit(&cpuhp->chp_mutex);

			if (found_hmeblk) {
				kpreempt_disable();
				CPUSET_DEL(cpuset, CPU->cpu_id);
				xt_sync(cpuset);
				xt_sync(cpuset);
				kpreempt_enable();
				return (hmeblkp);
			}
		}
	}
	return (NULL);
}
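
/*
 * Illustrative sketch (not part of the original source and compiled out): a
 * hypothetical caller showing how the two-phase hmeblk removal above fits
 * together. It assumes the caller holds the hmehash bucket lock, as asserted
 * by sfmmu_hblk_hash_rm(), and has already located hmeblkp and its
 * predecessor pr_hblk on the hash chain; the function name is a placeholder
 * and the lock/unlock macro follows the convention used elsewhere in this
 * file.
 */
#ifdef notdef
static void
example_hblk_remove(struct hmehash_bucket *hmebp, struct hme_blk *hmeblkp,
    struct hme_blk *pr_hblk)
{
	struct hme_blk *list = NULL;

	/* Phase 1: unlink from the hash chain but defer the actual free. */
	sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, &list, 0);
	SFMMU_HASH_UNLOCK(hmebp);

	/* Phase 2: cross-call (if needed) and free or queue the hmeblks. */
	sfmmu_hblks_list_purge(&list, 0);
}
#endif	/* notdef */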