1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 /* 25 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 26 * Copyright 2016 Gary Mills 27 */ 28 29 /* 30 * VM - Hardware Address Translation management for Spitfire MMU. 31 * 32 * This file implements the machine specific hardware translation 33 * needed by the VM system. The machine independent interface is 34 * described in <vm/hat.h> while the machine dependent interface 35 * and data structures are described in <vm/hat_sfmmu.h>. 36 * 37 * The hat layer manages the address translation hardware as a cache 38 * driven by calls from the higher levels in the VM system. 39 */ 40 41 #include <sys/types.h> 42 #include <sys/kstat.h> 43 #include <vm/hat.h> 44 #include <vm/hat_sfmmu.h> 45 #include <vm/page.h> 46 #include <sys/pte.h> 47 #include <sys/systm.h> 48 #include <sys/mman.h> 49 #include <sys/sysmacros.h> 50 #include <sys/machparam.h> 51 #include <sys/vtrace.h> 52 #include <sys/kmem.h> 53 #include <sys/mmu.h> 54 #include <sys/cmn_err.h> 55 #include <sys/cpu.h> 56 #include <sys/cpuvar.h> 57 #include <sys/debug.h> 58 #include <sys/lgrp.h> 59 #include <sys/archsystm.h> 60 #include <sys/machsystm.h> 61 #include <sys/vmsystm.h> 62 #include <vm/as.h> 63 #include <vm/seg.h> 64 #include <vm/seg_kp.h> 65 #include <vm/seg_kmem.h> 66 #include <vm/seg_kpm.h> 67 #include <vm/rm.h> 68 #include <sys/t_lock.h> 69 #include <sys/obpdefs.h> 70 #include <sys/vm_machparam.h> 71 #include <sys/var.h> 72 #include <sys/trap.h> 73 #include <sys/machtrap.h> 74 #include <sys/scb.h> 75 #include <sys/bitmap.h> 76 #include <sys/machlock.h> 77 #include <sys/membar.h> 78 #include <sys/atomic.h> 79 #include <sys/cpu_module.h> 80 #include <sys/prom_debug.h> 81 #include <sys/ksynch.h> 82 #include <sys/mem_config.h> 83 #include <sys/mem_cage.h> 84 #include <vm/vm_dep.h> 85 #include <sys/fpu/fpusystm.h> 86 #include <vm/mach_kpm.h> 87 #include <sys/callb.h> 88 89 #ifdef DEBUG 90 #define SFMMU_VALIDATE_HMERID(hat, rid, saddr, len) \ 91 if (SFMMU_IS_SHMERID_VALID(rid)) { \ 92 caddr_t _eaddr = (saddr) + (len); \ 93 sf_srd_t *_srdp; \ 94 sf_region_t *_rgnp; \ 95 ASSERT((rid) < SFMMU_MAX_HME_REGIONS); \ 96 ASSERT(SF_RGNMAP_TEST(hat->sfmmu_hmeregion_map, rid)); \ 97 ASSERT((hat) != ksfmmup); \ 98 _srdp = (hat)->sfmmu_srdp; \ 99 ASSERT(_srdp != NULL); \ 100 ASSERT(_srdp->srd_refcnt != 0); \ 101 _rgnp = _srdp->srd_hmergnp[(rid)]; \ 102 ASSERT(_rgnp != NULL && _rgnp->rgn_id == rid); \ 103 ASSERT(_rgnp->rgn_refcnt != 0); \ 104 ASSERT(!(_rgnp->rgn_flags & SFMMU_REGION_FREE)); \ 105 ASSERT((_rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == \ 106 SFMMU_REGION_HME); \ 107 ASSERT((saddr) >= 
_rgnp->rgn_saddr); \ 108 ASSERT((saddr) < _rgnp->rgn_saddr + _rgnp->rgn_size); \ 109 ASSERT(_eaddr > _rgnp->rgn_saddr); \ 110 ASSERT(_eaddr <= _rgnp->rgn_saddr + _rgnp->rgn_size); \ 111 } 112 113 #define SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid) \ 114 { \ 115 caddr_t _hsva; \ 116 caddr_t _heva; \ 117 caddr_t _rsva; \ 118 caddr_t _reva; \ 119 int _ttesz = get_hblk_ttesz(hmeblkp); \ 120 int _flagtte; \ 121 ASSERT((srdp)->srd_refcnt != 0); \ 122 ASSERT((rid) < SFMMU_MAX_HME_REGIONS); \ 123 ASSERT((rgnp)->rgn_id == rid); \ 124 ASSERT(!((rgnp)->rgn_flags & SFMMU_REGION_FREE)); \ 125 ASSERT(((rgnp)->rgn_flags & SFMMU_REGION_TYPE_MASK) == \ 126 SFMMU_REGION_HME); \ 127 ASSERT(_ttesz <= (rgnp)->rgn_pgszc); \ 128 _hsva = (caddr_t)get_hblk_base(hmeblkp); \ 129 _heva = get_hblk_endaddr(hmeblkp); \ 130 _rsva = (caddr_t)P2ALIGN( \ 131 (uintptr_t)(rgnp)->rgn_saddr, HBLK_MIN_BYTES); \ 132 _reva = (caddr_t)P2ROUNDUP( \ 133 (uintptr_t)((rgnp)->rgn_saddr + (rgnp)->rgn_size), \ 134 HBLK_MIN_BYTES); \ 135 ASSERT(_hsva >= _rsva); \ 136 ASSERT(_hsva < _reva); \ 137 ASSERT(_heva > _rsva); \ 138 ASSERT(_heva <= _reva); \ 139 _flagtte = (_ttesz < HBLK_MIN_TTESZ) ? HBLK_MIN_TTESZ : \ 140 _ttesz; \ 141 ASSERT(rgnp->rgn_hmeflags & (0x1 << _flagtte)); \ 142 } 143 144 #else /* DEBUG */ 145 #define SFMMU_VALIDATE_HMERID(hat, rid, addr, len) 146 #define SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid) 147 #endif /* DEBUG */ 148 149 #if defined(SF_ERRATA_57) 150 extern caddr_t errata57_limit; 151 #endif 152 153 #define HME8BLK_SZ_RND ((roundup(HME8BLK_SZ, sizeof (int64_t))) / \ 154 (sizeof (int64_t))) 155 #define HBLK_RESERVE ((struct hme_blk *)hblk_reserve) 156 157 #define HBLK_RESERVE_CNT 128 158 #define HBLK_RESERVE_MIN 20 159 160 static struct hme_blk *freehblkp; 161 static kmutex_t freehblkp_lock; 162 static int freehblkcnt; 163 164 static int64_t hblk_reserve[HME8BLK_SZ_RND]; 165 static kmutex_t hblk_reserve_lock; 166 static kthread_t *hblk_reserve_thread; 167 168 static nucleus_hblk8_info_t nucleus_hblk8; 169 static nucleus_hblk1_info_t nucleus_hblk1; 170 171 /* 172 * Data to manage per-cpu hmeblk pending queues, hmeblks are queued here 173 * after the initial phase of removing an hmeblk from the hash chain, see 174 * the detailed comment in sfmmu_hblk_hash_rm() for further details. 175 */ 176 static cpu_hme_pend_t *cpu_hme_pend; 177 static uint_t cpu_hme_pend_thresh; 178 /* 179 * SFMMU specific hat functions 180 */ 181 void hat_pagecachectl(struct page *, int); 182 183 /* flags for hat_pagecachectl */ 184 #define HAT_CACHE 0x1 185 #define HAT_UNCACHE 0x2 186 #define HAT_TMPNC 0x4 187 188 /* 189 * Flag to allow the creation of non-cacheable translations 190 * to system memory. It is off by default. At the moment this 191 * flag is used by the ecache error injector. The error injector 192 * will turn it on when creating such a translation then shut it 193 * off when it's finished. 194 */ 195 196 int sfmmu_allow_nc_trans = 0; 197 198 /* 199 * Flag to disable large page support. 200 * value of 1 => disable all large pages. 201 * bits 1, 2, and 3 are to disable 64K, 512K and 4M pages respectively. 202 * 203 * For example, use the value 0x4 to disable 512K pages. 204 * 205 */ 206 #define LARGE_PAGES_OFF 0x1 207 208 /* 209 * The disable_large_pages and disable_ism_large_pages variables control 210 * hat_memload_array and the page sizes to be used by ISM and the kernel. 
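 *
 * As an illustration (added here, not part of the original comment): the
 * masks are indexed by TTE size code, so with TTE512K == 2 and TTE4M == 3
 * a platform that had to avoid both sizes for ISM could set
 *
 *	disable_ism_large_pages |= (1 << TTE512K) | (1 << TTE4M);
 *
 * i.e. the value 0xc, while the default below is (1 << TTE512K) == 0x4,
 * matching the WARNING later in this comment that 512K pages are not
 * supported for ISM/DISM.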
211 * 212 * The disable_auto_data_large_pages and disable_auto_text_large_pages variables 213 * are only used to control which OOB pages to use at upper VM segment creation 214 * time, and are set in hat_init_pagesizes and used in the map_pgsz* routines. 215 * Their values may come from platform or CPU specific code to disable page 216 * sizes that should not be used. 217 * 218 * WARNING: 512K pages are currently not supported for ISM/DISM. 219 */ 220 uint_t disable_large_pages = 0; 221 uint_t disable_ism_large_pages = (1 << TTE512K); 222 uint_t disable_auto_data_large_pages = 0; 223 uint_t disable_auto_text_large_pages = 0; 224 225 /* 226 * Private sfmmu data structures for hat management 227 */ 228 static struct kmem_cache *sfmmuid_cache; 229 static struct kmem_cache *mmuctxdom_cache; 230 231 /* 232 * Private sfmmu data structures for tsb management 233 */ 234 static struct kmem_cache *sfmmu_tsbinfo_cache; 235 static struct kmem_cache *sfmmu_tsb8k_cache; 236 static struct kmem_cache *sfmmu_tsb_cache[NLGRPS_MAX]; 237 static vmem_t *kmem_bigtsb_arena; 238 static vmem_t *kmem_tsb_arena; 239 240 /* 241 * sfmmu static variables for hmeblk resource management. 242 */ 243 static vmem_t *hat_memload1_arena; /* HAT translation arena for sfmmu1_cache */ 244 static struct kmem_cache *sfmmu8_cache; 245 static struct kmem_cache *sfmmu1_cache; 246 static struct kmem_cache *pa_hment_cache; 247 248 static kmutex_t ism_mlist_lock; /* mutex for ism mapping list */ 249 /* 250 * private data for ism 251 */ 252 static struct kmem_cache *ism_blk_cache; 253 static struct kmem_cache *ism_ment_cache; 254 #define ISMID_STARTADDR NULL 255 256 /* 257 * Region management data structures and function declarations. 258 */ 259 260 static void sfmmu_leave_srd(sfmmu_t *); 261 static int sfmmu_srdcache_constructor(void *, void *, int); 262 static void sfmmu_srdcache_destructor(void *, void *); 263 static int sfmmu_rgncache_constructor(void *, void *, int); 264 static void sfmmu_rgncache_destructor(void *, void *); 265 static int sfrgnmap_isnull(sf_region_map_t *); 266 static int sfhmergnmap_isnull(sf_hmeregion_map_t *); 267 static int sfmmu_scdcache_constructor(void *, void *, int); 268 static void sfmmu_scdcache_destructor(void *, void *); 269 static void sfmmu_rgn_cb_noop(caddr_t, caddr_t, caddr_t, 270 size_t, void *, u_offset_t); 271 272 static uint_t srd_hashmask = SFMMU_MAX_SRD_BUCKETS - 1; 273 static sf_srd_bucket_t *srd_buckets; 274 static struct kmem_cache *srd_cache; 275 static uint_t srd_rgn_hashmask = SFMMU_MAX_REGION_BUCKETS - 1; 276 static struct kmem_cache *region_cache; 277 static struct kmem_cache *scd_cache; 278 279 #ifdef sun4v 280 int use_bigtsb_arena = 1; 281 #else 282 int use_bigtsb_arena = 0; 283 #endif 284 285 /* External /etc/system tunable, for turning on&off the shctx support */ 286 int disable_shctx = 0; 287 /* Internal variable, set by MD if the HW supports shctx feature */ 288 int shctx_on = 0; 289 290 #ifdef DEBUG 291 static void check_scd_sfmmu_list(sfmmu_t **, sfmmu_t *, int); 292 #endif 293 static void sfmmu_to_scd_list(sfmmu_t **, sfmmu_t *); 294 static void sfmmu_from_scd_list(sfmmu_t **, sfmmu_t *); 295 296 static sf_scd_t *sfmmu_alloc_scd(sf_srd_t *, sf_region_map_t *); 297 static void sfmmu_find_scd(sfmmu_t *); 298 static void sfmmu_join_scd(sf_scd_t *, sfmmu_t *); 299 static void sfmmu_finish_join_scd(sfmmu_t *); 300 static void sfmmu_leave_scd(sfmmu_t *, uchar_t); 301 static void sfmmu_destroy_scd(sf_srd_t *, sf_scd_t *, sf_region_map_t *); 302 static int 
sfmmu_alloc_scd_tsbs(sf_srd_t *, sf_scd_t *); 303 static void sfmmu_free_scd_tsbs(sfmmu_t *); 304 static void sfmmu_tsb_inv_ctx(sfmmu_t *); 305 static int find_ism_rid(sfmmu_t *, sfmmu_t *, caddr_t, uint_t *); 306 static void sfmmu_ism_hatflags(sfmmu_t *, int); 307 static int sfmmu_srd_lock_held(sf_srd_t *); 308 static void sfmmu_remove_scd(sf_scd_t **, sf_scd_t *); 309 static void sfmmu_add_scd(sf_scd_t **headp, sf_scd_t *); 310 static void sfmmu_link_scd_to_regions(sf_srd_t *, sf_scd_t *); 311 static void sfmmu_unlink_scd_from_regions(sf_srd_t *, sf_scd_t *); 312 static void sfmmu_link_to_hmeregion(sfmmu_t *, sf_region_t *); 313 static void sfmmu_unlink_from_hmeregion(sfmmu_t *, sf_region_t *); 314 315 /* 316 * ``hat_lock'' is a hashed mutex lock for protecting sfmmu TSB lists, 317 * HAT flags, synchronizing TLB/TSB coherency, and context management. 318 * The lock is hashed on the sfmmup since the case where we need to lock 319 * all processes is rare but does occur (e.g. we need to unload a shared 320 * mapping from all processes using the mapping). We have a lot of buckets, 321 * and each slab of sfmmu_t's can use about a quarter of them, giving us 322 * a fairly good distribution without wasting too much space and overhead 323 * when we have to grab them all. 324 */ 325 #define SFMMU_NUM_LOCK 128 /* must be power of two */ 326 hatlock_t hat_lock[SFMMU_NUM_LOCK]; 327 328 /* 329 * Hash algorithm optimized for a small number of slabs. 330 * 7 is (highbit((sizeof sfmmu_t)) - 1) 331 * This hash algorithm is based upon the knowledge that sfmmu_t's come from a 332 * kmem_cache, and thus they will be sequential within that cache. In 333 * addition, each new slab will have a different "color" up to cache_maxcolor 334 * which will skew the hashing for each successive slab which is allocated. 335 * If the size of sfmmu_t changed to a larger size, this algorithm may need 336 * to be revisited. 337 */ 338 #define TSB_HASH_SHIFT_BITS (7) 339 #define PTR_HASH(x) ((uintptr_t)x >> TSB_HASH_SHIFT_BITS) 340 341 #ifdef DEBUG 342 int tsb_hash_debug = 0; 343 #define TSB_HASH(sfmmup) \ 344 (tsb_hash_debug ? &hat_lock[0] : \ 345 &hat_lock[PTR_HASH(sfmmup) & (SFMMU_NUM_LOCK-1)]) 346 #else /* DEBUG */ 347 #define TSB_HASH(sfmmup) &hat_lock[PTR_HASH(sfmmup) & (SFMMU_NUM_LOCK-1)] 348 #endif /* DEBUG */ 349 350 351 /* sfmmu_replace_tsb() return codes. */ 352 typedef enum tsb_replace_rc { 353 TSB_SUCCESS, 354 TSB_ALLOCFAIL, 355 TSB_LOSTRACE, 356 TSB_ALREADY_SWAPPED, 357 TSB_CANTGROW 358 } tsb_replace_rc_t; 359 360 /* 361 * Flags for TSB allocation routines. 362 */ 363 #define TSB_ALLOC 0x01 364 #define TSB_FORCEALLOC 0x02 365 #define TSB_GROW 0x04 366 #define TSB_SHRINK 0x08 367 #define TSB_SWAPIN 0x10 368 369 /* 370 * Support for HAT callbacks. 371 */ 372 #define SFMMU_MAX_RELOC_CALLBACKS 10 373 int sfmmu_max_cb_id = SFMMU_MAX_RELOC_CALLBACKS; 374 static id_t sfmmu_cb_nextid = 0; 375 static id_t sfmmu_tsb_cb_id; 376 struct sfmmu_callback *sfmmu_cb_table; 377 378 kmutex_t kpr_mutex; 379 kmutex_t kpr_suspendlock; 380 kthread_t *kreloc_thread; 381 382 /* 383 * Enable VA->PA translation sanity checking on DEBUG kernels. 384 * Disabled by default. This is incompatible with some 385 * drivers (error injector, RSM) so if it breaks you get 386 * to keep both pieces. 
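 *
 * Purely as an illustration (the /etc/system syntax is an assumption on my
 * part, not something this file documents): on a DEBUG kernel the check
 * could be enabled at boot with
 *
 *	set hat_check_vtop = 1
 *
 * or toggled from a debugger at run time.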
387 */ 388 int hat_check_vtop = 0; 389 390 /* 391 * Private sfmmu routines (prototypes) 392 */ 393 static struct hme_blk *sfmmu_shadow_hcreate(sfmmu_t *, caddr_t, int, uint_t); 394 static struct hme_blk *sfmmu_hblk_alloc(sfmmu_t *, caddr_t, 395 struct hmehash_bucket *, uint_t, hmeblk_tag, uint_t, 396 uint_t); 397 static caddr_t sfmmu_hblk_unload(struct hat *, struct hme_blk *, caddr_t, 398 caddr_t, demap_range_t *, uint_t); 399 static caddr_t sfmmu_hblk_sync(struct hat *, struct hme_blk *, caddr_t, 400 caddr_t, int); 401 static void sfmmu_hblk_free(struct hme_blk **); 402 static void sfmmu_hblks_list_purge(struct hme_blk **, int); 403 static uint_t sfmmu_get_free_hblk(struct hme_blk **, uint_t); 404 static uint_t sfmmu_put_free_hblk(struct hme_blk *, uint_t); 405 static struct hme_blk *sfmmu_hblk_steal(int); 406 static int sfmmu_steal_this_hblk(struct hmehash_bucket *, 407 struct hme_blk *, uint64_t, struct hme_blk *); 408 static caddr_t sfmmu_hblk_unlock(struct hme_blk *, caddr_t, caddr_t); 409 410 static void hat_do_memload_array(struct hat *, caddr_t, size_t, 411 struct page **, uint_t, uint_t, uint_t); 412 static void hat_do_memload(struct hat *, caddr_t, struct page *, 413 uint_t, uint_t, uint_t); 414 static void sfmmu_memload_batchsmall(struct hat *, caddr_t, page_t **, 415 uint_t, uint_t, pgcnt_t, uint_t); 416 void sfmmu_tteload(struct hat *, tte_t *, caddr_t, page_t *, 417 uint_t); 418 static int sfmmu_tteload_array(sfmmu_t *, tte_t *, caddr_t, page_t **, 419 uint_t, uint_t); 420 static struct hmehash_bucket *sfmmu_tteload_acquire_hashbucket(sfmmu_t *, 421 caddr_t, int, uint_t); 422 static struct hme_blk *sfmmu_tteload_find_hmeblk(sfmmu_t *, 423 struct hmehash_bucket *, caddr_t, uint_t, uint_t, 424 uint_t); 425 static int sfmmu_tteload_addentry(sfmmu_t *, struct hme_blk *, tte_t *, 426 caddr_t, page_t **, uint_t, uint_t); 427 static void sfmmu_tteload_release_hashbucket(struct hmehash_bucket *); 428 429 static int sfmmu_pagearray_setup(caddr_t, page_t **, tte_t *, int); 430 static pfn_t sfmmu_uvatopfn(caddr_t, sfmmu_t *, tte_t *); 431 void sfmmu_memtte(tte_t *, pfn_t, uint_t, int); 432 #ifdef VAC 433 static void sfmmu_vac_conflict(struct hat *, caddr_t, page_t *); 434 static int sfmmu_vacconflict_array(caddr_t, page_t *, int *); 435 int tst_tnc(page_t *pp, pgcnt_t); 436 void conv_tnc(page_t *pp, int); 437 #endif 438 439 static void sfmmu_get_ctx(sfmmu_t *); 440 static void sfmmu_free_sfmmu(sfmmu_t *); 441 442 static void sfmmu_ttesync(struct hat *, caddr_t, tte_t *, page_t *); 443 static void sfmmu_chgattr(struct hat *, caddr_t, size_t, uint_t, int); 444 445 cpuset_t sfmmu_pageunload(page_t *, struct sf_hment *, int); 446 static void hat_pagereload(struct page *, struct page *); 447 static cpuset_t sfmmu_pagesync(page_t *, struct sf_hment *, uint_t); 448 #ifdef VAC 449 void sfmmu_page_cache_array(page_t *, int, int, pgcnt_t); 450 static void sfmmu_page_cache(page_t *, int, int, int); 451 #endif 452 453 cpuset_t sfmmu_rgntlb_demap(caddr_t, sf_region_t *, 454 struct hme_blk *, int); 455 static void sfmmu_tlbcache_demap(caddr_t, sfmmu_t *, struct hme_blk *, 456 pfn_t, int, int, int, int); 457 static void sfmmu_ismtlbcache_demap(caddr_t, sfmmu_t *, struct hme_blk *, 458 pfn_t, int); 459 static void sfmmu_tlb_demap(caddr_t, sfmmu_t *, struct hme_blk *, int, int); 460 static void sfmmu_tlb_range_demap(demap_range_t *); 461 static void sfmmu_invalidate_ctx(sfmmu_t *); 462 static void sfmmu_sync_mmustate(sfmmu_t *); 463 464 static void sfmmu_tsbinfo_setup_phys(struct tsb_info *, pfn_t); 
465 static int sfmmu_tsbinfo_alloc(struct tsb_info **, int, int, uint_t, 466 sfmmu_t *); 467 static void sfmmu_tsb_free(struct tsb_info *); 468 static void sfmmu_tsbinfo_free(struct tsb_info *); 469 static int sfmmu_init_tsbinfo(struct tsb_info *, int, int, uint_t, 470 sfmmu_t *); 471 static void sfmmu_tsb_chk_reloc(sfmmu_t *, hatlock_t *); 472 static void sfmmu_tsb_swapin(sfmmu_t *, hatlock_t *); 473 static int sfmmu_select_tsb_szc(pgcnt_t); 474 static void sfmmu_mod_tsb(sfmmu_t *, caddr_t, tte_t *, int); 475 #define sfmmu_load_tsb(sfmmup, vaddr, tte, szc) \ 476 sfmmu_mod_tsb(sfmmup, vaddr, tte, szc) 477 #define sfmmu_unload_tsb(sfmmup, vaddr, szc) \ 478 sfmmu_mod_tsb(sfmmup, vaddr, NULL, szc) 479 static void sfmmu_copy_tsb(struct tsb_info *, struct tsb_info *); 480 static tsb_replace_rc_t sfmmu_replace_tsb(sfmmu_t *, struct tsb_info *, uint_t, 481 hatlock_t *, uint_t); 482 static void sfmmu_size_tsb(sfmmu_t *, int, uint64_t, uint64_t, int); 483 484 #ifdef VAC 485 void sfmmu_cache_flush(pfn_t, int); 486 void sfmmu_cache_flushcolor(int, pfn_t); 487 #endif 488 static caddr_t sfmmu_hblk_chgattr(sfmmu_t *, struct hme_blk *, caddr_t, 489 caddr_t, demap_range_t *, uint_t, int); 490 491 static uint64_t sfmmu_vtop_attr(uint_t, int mode, tte_t *); 492 static uint_t sfmmu_ptov_attr(tte_t *); 493 static caddr_t sfmmu_hblk_chgprot(sfmmu_t *, struct hme_blk *, caddr_t, 494 caddr_t, demap_range_t *, uint_t); 495 static uint_t sfmmu_vtop_prot(uint_t, uint_t *); 496 static int sfmmu_idcache_constructor(void *, void *, int); 497 static void sfmmu_idcache_destructor(void *, void *); 498 static int sfmmu_hblkcache_constructor(void *, void *, int); 499 static void sfmmu_hblkcache_destructor(void *, void *); 500 static void sfmmu_hblkcache_reclaim(void *); 501 static void sfmmu_shadow_hcleanup(sfmmu_t *, struct hme_blk *, 502 struct hmehash_bucket *); 503 static void sfmmu_hblk_hash_rm(struct hmehash_bucket *, struct hme_blk *, 504 struct hme_blk *, struct hme_blk **, int); 505 static void sfmmu_hblk_hash_add(struct hmehash_bucket *, struct hme_blk *, 506 uint64_t); 507 static struct hme_blk *sfmmu_check_pending_hblks(int); 508 static void sfmmu_free_hblks(sfmmu_t *, caddr_t, caddr_t, int); 509 static void sfmmu_cleanup_rhblk(sf_srd_t *, caddr_t, uint_t, int); 510 static void sfmmu_unload_hmeregion_va(sf_srd_t *, uint_t, caddr_t, caddr_t, 511 int, caddr_t *); 512 static void sfmmu_unload_hmeregion(sf_srd_t *, sf_region_t *); 513 514 static void sfmmu_rm_large_mappings(page_t *, int); 515 516 static void hat_lock_init(void); 517 static void hat_kstat_init(void); 518 static int sfmmu_kstat_percpu_update(kstat_t *ksp, int rw); 519 static void sfmmu_set_scd_rttecnt(sf_srd_t *, sf_scd_t *); 520 static int sfmmu_is_rgnva(sf_srd_t *, caddr_t, ulong_t, ulong_t); 521 static void sfmmu_check_page_sizes(sfmmu_t *, int); 522 int fnd_mapping_sz(page_t *); 523 static void iment_add(struct ism_ment *, struct hat *); 524 static void iment_sub(struct ism_ment *, struct hat *); 525 static pgcnt_t ism_tsb_entries(sfmmu_t *, int szc); 526 extern void sfmmu_setup_tsbinfo(sfmmu_t *); 527 extern void sfmmu_clear_utsbinfo(void); 528 529 static void sfmmu_ctx_wrap_around(mmu_ctx_t *, boolean_t); 530 531 extern int vpm_enable; 532 533 /* kpm globals */ 534 #ifdef DEBUG 535 /* 536 * Enable trap level tsbmiss handling 537 */ 538 int kpm_tsbmtl = 1; 539 540 /* 541 * Flush the TLB on kpm mapout. Note: Xcalls are used (again) for the 542 * required TLB shootdowns in this case, so handle w/ care. Off by default. 
543 */ 544 int kpm_tlb_flush; 545 #endif /* DEBUG */ 546 547 static void *sfmmu_vmem_xalloc_aligned_wrapper(vmem_t *, size_t, int); 548 549 #ifdef DEBUG 550 static void sfmmu_check_hblk_flist(); 551 #endif 552 553 /* 554 * Semi-private sfmmu data structures. Some of them are initialize in 555 * startup or in hat_init. Some of them are private but accessed by 556 * assembly code or mach_sfmmu.c 557 */ 558 struct hmehash_bucket *uhme_hash; /* user hmeblk hash table */ 559 struct hmehash_bucket *khme_hash; /* kernel hmeblk hash table */ 560 uint64_t uhme_hash_pa; /* PA of uhme_hash */ 561 uint64_t khme_hash_pa; /* PA of khme_hash */ 562 int uhmehash_num; /* # of buckets in user hash table */ 563 int khmehash_num; /* # of buckets in kernel hash table */ 564 565 uint_t max_mmu_ctxdoms = 0; /* max context domains in the system */ 566 mmu_ctx_t **mmu_ctxs_tbl; /* global array of context domains */ 567 uint64_t mmu_saved_gnum = 0; /* to init incoming MMUs' gnums */ 568 569 #define DEFAULT_NUM_CTXS_PER_MMU 8192 570 static uint_t nctxs = DEFAULT_NUM_CTXS_PER_MMU; 571 572 int cache; /* describes system cache */ 573 574 caddr_t ktsb_base; /* kernel 8k-indexed tsb base address */ 575 uint64_t ktsb_pbase; /* kernel 8k-indexed tsb phys address */ 576 int ktsb_szcode; /* kernel 8k-indexed tsb size code */ 577 int ktsb_sz; /* kernel 8k-indexed tsb size */ 578 579 caddr_t ktsb4m_base; /* kernel 4m-indexed tsb base address */ 580 uint64_t ktsb4m_pbase; /* kernel 4m-indexed tsb phys address */ 581 int ktsb4m_szcode; /* kernel 4m-indexed tsb size code */ 582 int ktsb4m_sz; /* kernel 4m-indexed tsb size */ 583 584 uint64_t kpm_tsbbase; /* kernel seg_kpm 4M TSB base address */ 585 int kpm_tsbsz; /* kernel seg_kpm 4M TSB size code */ 586 uint64_t kpmsm_tsbbase; /* kernel seg_kpm 8K TSB base address */ 587 int kpmsm_tsbsz; /* kernel seg_kpm 8K TSB size code */ 588 589 #ifndef sun4v 590 int utsb_dtlb_ttenum = -1; /* index in TLB for utsb locked TTE */ 591 int utsb4m_dtlb_ttenum = -1; /* index in TLB for 4M TSB TTE */ 592 int dtlb_resv_ttenum; /* index in TLB of first reserved TTE */ 593 caddr_t utsb_vabase; /* reserved kernel virtual memory */ 594 caddr_t utsb4m_vabase; /* for trap handler TSB accesses */ 595 #endif /* sun4v */ 596 uint64_t tsb_alloc_bytes = 0; /* bytes allocated to TSBs */ 597 vmem_t *kmem_tsb_default_arena[NLGRPS_MAX]; /* For dynamic TSBs */ 598 vmem_t *kmem_bigtsb_default_arena[NLGRPS_MAX]; /* dynamic 256M TSBs */ 599 600 /* 601 * Size to use for TSB slabs. Future platforms that support page sizes 602 * larger than 4M may wish to change these values, and provide their own 603 * assembly macros for building and decoding the TSB base register contents. 604 * Note disable_large_pages will override the value set here. 605 */ 606 static uint_t tsb_slab_ttesz = TTE4M; 607 size_t tsb_slab_size = MMU_PAGESIZE4M; 608 uint_t tsb_slab_shift = MMU_PAGESHIFT4M; 609 /* PFN mask for TTE */ 610 size_t tsb_slab_mask = MMU_PAGEOFFSET4M >> MMU_PAGESHIFT; 611 612 /* 613 * Size to use for TSB slabs. These are used only when 256M tsb arenas 614 * exist. 615 */ 616 static uint_t bigtsb_slab_ttesz = TTE256M; 617 static size_t bigtsb_slab_size = MMU_PAGESIZE256M; 618 static uint_t bigtsb_slab_shift = MMU_PAGESHIFT256M; 619 /* 256M page alignment for 8K pfn */ 620 static size_t bigtsb_slab_mask = MMU_PAGEOFFSET256M >> MMU_PAGESHIFT; 621 622 /* largest TSB size to grow to, will be smaller on smaller memory systems */ 623 static int tsb_max_growsize = 0; 624 625 /* 626 * Tunable parameters dealing with TSB policies. 
627 */ 628 629 /* 630 * This undocumented tunable forces all 8K TSBs to be allocated from 631 * the kernel heap rather than from the kmem_tsb_default_arena arenas. 632 */ 633 #ifdef DEBUG 634 int tsb_forceheap = 0; 635 #endif /* DEBUG */ 636 637 /* 638 * Decide whether to use per-lgroup arenas, or one global set of 639 * TSB arenas. The default is not to break up per-lgroup, since 640 * most platforms don't recognize any tangible benefit from it. 641 */ 642 int tsb_lgrp_affinity = 0; 643 644 /* 645 * Used for growing the TSB based on the process RSS. 646 * tsb_rss_factor is based on the smallest TSB, and is 647 * shifted by the TSB size to determine if we need to grow. 648 * The default will grow the TSB if the number of TTEs for 649 * this page size exceeds 75% of the number of TSB entries, 650 * which should _almost_ eliminate all conflict misses 651 * (at the expense of using up lots and lots of memory). 652 */ 653 #define TSB_RSS_FACTOR (TSB_ENTRIES(TSB_MIN_SZCODE) * 0.75) 654 #define SFMMU_RSS_TSBSIZE(tsbszc) (tsb_rss_factor << tsbszc) 655 #define SELECT_TSB_SIZECODE(pgcnt) ( \ 656 (enable_tsb_rss_sizing)? sfmmu_select_tsb_szc(pgcnt) : \ 657 default_tsb_size) 658 #define TSB_OK_SHRINK() \ 659 (tsb_alloc_bytes > tsb_alloc_hiwater || freemem < desfree) 660 #define TSB_OK_GROW() \ 661 (tsb_alloc_bytes < tsb_alloc_hiwater && freemem > desfree) 662 663 int enable_tsb_rss_sizing = 1; 664 int tsb_rss_factor = (int)TSB_RSS_FACTOR; 665 666 /* which TSB size code to use for new address spaces or if rss sizing off */ 667 int default_tsb_size = TSB_8K_SZCODE; 668 669 static uint64_t tsb_alloc_hiwater; /* limit TSB reserved memory */ 670 uint64_t tsb_alloc_hiwater_factor; /* tsb_alloc_hiwater = physmem / this */ 671 #define TSB_ALLOC_HIWATER_FACTOR_DEFAULT 32 672 673 #ifdef DEBUG 674 static int tsb_random_size = 0; /* set to 1 to test random tsb sizes on alloc */ 675 static int tsb_grow_stress = 0; /* if set to 1, keep replacing TSB w/ random */ 676 static int tsb_alloc_mtbf = 0; /* fail allocation every n attempts */ 677 static int tsb_alloc_fail_mtbf = 0; 678 static int tsb_alloc_count = 0; 679 #endif /* DEBUG */ 680 681 /* if set to 1, will remap valid TTEs when growing TSB. */ 682 int tsb_remap_ttes = 1; 683 684 /* 685 * If we have more than this many mappings, allocate a second TSB. 686 * This default is chosen because the I/D fully associative TLBs are 687 * assumed to have at least 8 available entries. Platforms with a 688 * larger fully-associative TLB could probably override the default. 
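 *
 * An illustrative reading (the exact use of the threshold lives elsewhere
 * in this file, so treat this as a sketch): with the sun4u default of 8
 * below, a process would be given a second, 4M-indexed TSB only once it
 * has more than eight large-page mappings, while the sun4v default of 0
 * sets one up as soon as any such mapping exists.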
689  */
690 
691 #ifdef sun4v
692 int tsb_sectsb_threshold = 0;
693 #else
694 int tsb_sectsb_threshold = 8;
695 #endif
696 
697 /*
698  * kstat data
699  */
700 struct sfmmu_global_stat sfmmu_global_stat;
701 struct sfmmu_tsbsize_stat sfmmu_tsbsize_stat;
702 
703 /*
704  * Global data
705  */
706 sfmmu_t		*ksfmmup;	/* kernel's hat id */
707 
708 #ifdef DEBUG
709 static void	chk_tte(tte_t *, tte_t *, tte_t *, struct hme_blk *);
710 #endif
711 
712 /* sfmmu locking operations */
713 static kmutex_t *sfmmu_mlspl_enter(struct page *, int);
714 static int	sfmmu_mlspl_held(struct page *, int);
715 
716 kmutex_t *sfmmu_page_enter(page_t *);
717 void	sfmmu_page_exit(kmutex_t *);
718 int	sfmmu_page_spl_held(struct page *);
719 
720 /* sfmmu internal locking operations - accessed directly */
721 static void	sfmmu_mlist_reloc_enter(page_t *, page_t *,
722 			kmutex_t **, kmutex_t **);
723 static void	sfmmu_mlist_reloc_exit(kmutex_t *, kmutex_t *);
724 static hatlock_t *
725 		sfmmu_hat_enter(sfmmu_t *);
726 static hatlock_t *
727 		sfmmu_hat_tryenter(sfmmu_t *);
728 static void	sfmmu_hat_exit(hatlock_t *);
729 static void	sfmmu_hat_lock_all(void);
730 static void	sfmmu_hat_unlock_all(void);
731 static void	sfmmu_ismhat_enter(sfmmu_t *, int);
732 static void	sfmmu_ismhat_exit(sfmmu_t *, int);
733 
734 kpm_hlk_t	*kpmp_table;
735 uint_t		kpmp_table_sz;	/* must be a power of 2 */
736 uchar_t		kpmp_shift;
737 
738 kpm_shlk_t	*kpmp_stable;
739 uint_t		kpmp_stable_sz;	/* must be a power of 2 */
740 
741 /*
742  * SPL_TABLE_SIZE is 2 * NCPU, but no smaller than 128.
743  * SPL_SHIFT is log2(SPL_TABLE_SIZE).
744  */
745 #if ((2*NCPU_P2) > 128)
746 #define	SPL_SHIFT	((unsigned)(NCPU_LOG2 + 1))
747 #else
748 #define	SPL_SHIFT	7U
749 #endif
750 #define	SPL_TABLE_SIZE	(1U << SPL_SHIFT)
751 #define	SPL_MASK	(SPL_TABLE_SIZE - 1)
752 
753 /*
754  * We shift by PP_SHIFT to take care of the low-order 0 bits of a page_t
755  * and by multiples of SPL_SHIFT to get as many varied bits as we can.
756  */
757 #define	SPL_INDEX(pp) \
758 	((((uintptr_t)(pp) >> PP_SHIFT) ^ \
759 	((uintptr_t)(pp) >> (PP_SHIFT + SPL_SHIFT)) ^ \
760 	((uintptr_t)(pp) >> (PP_SHIFT + SPL_SHIFT * 2)) ^ \
761 	((uintptr_t)(pp) >> (PP_SHIFT + SPL_SHIFT * 3))) & \
762 	SPL_MASK)
763 
764 #define	SPL_HASH(pp) \
765 	(&sfmmu_page_lock[SPL_INDEX(pp)].pad_mutex)
766 
767 static	pad_mutex_t	sfmmu_page_lock[SPL_TABLE_SIZE];
768 
769 /* Array of mutexes protecting a page's mapping list and p_nrm field. */
770 
771 #define	MML_TABLE_SIZE	SPL_TABLE_SIZE
772 #define	MLIST_HASH(pp)	(&mml_table[SPL_INDEX(pp)].pad_mutex)
773 
774 static	pad_mutex_t	mml_table[MML_TABLE_SIZE];
775 
776 /*
777  * hat_unload_callback() will group together callbacks in order
778  * to avoid xt_sync() calls. This is the maximum size of the group.
779  */
780 #define	MAX_CB_ADDR	32
781 
782 tte_t	hw_tte;
783 static ulong_t sfmmu_dmr_maxbit = DMR_MAXBIT;
784 
785 static char	*mmu_ctx_kstat_names[] = {
786 	"mmu_ctx_tsb_exceptions",
787 	"mmu_ctx_tsb_raise_exception",
788 	"mmu_ctx_wrap_around",
789 };
790 
791 /*
792  * Wrapper for vmem_xalloc since vmem_create only allows limited
793  * parameters for vm_source_alloc functions. This function allows us
794  * to specify alignment consistent with the size of the object being
795  * allocated.
796  */
797 static void *
798 sfmmu_vmem_xalloc_aligned_wrapper(vmem_t *vmp, size_t size, int vmflag)
799 {
800 	return (vmem_xalloc(vmp, size, size, 0, 0, NULL, NULL, vmflag));
801 }
802 
803 /* Common code for setting tsb_alloc_hiwater.
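 *
 * A worked example (numbers are illustrative): with the default
 * tsb_alloc_hiwater_factor of 32, a machine with 32GB of physical memory
 * gets tsb_alloc_hiwater = ptob(physmem) / 32 = 1GB, i.e. at most 1/32 of
 * physical memory is ever tied up in TSB slabs before allocations fall
 * back to the kernel heap (see the comment in hat_init below).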
*/ 804 #define SFMMU_SET_TSB_ALLOC_HIWATER(pages) tsb_alloc_hiwater = \ 805 ptob(pages) / tsb_alloc_hiwater_factor 806 807 /* 808 * Set tsb_max_growsize to allow at most all of physical memory to be mapped by 809 * a single TSB. physmem is the number of physical pages so we need physmem 8K 810 * TTEs to represent all those physical pages. We round this up by using 811 * 1<<highbit(). To figure out which size code to use, remember that the size 812 * code is just an amount to shift the smallest TSB size to get the size of 813 * this TSB. So we subtract that size, TSB_START_SIZE, from highbit() (or 814 * highbit() - 1) to get the size code for the smallest TSB that can represent 815 * all of physical memory, while erring on the side of too much. 816 * 817 * Restrict tsb_max_growsize to make sure that: 818 * 1) TSBs can't grow larger than the TSB slab size 819 * 2) TSBs can't grow larger than UTSB_MAX_SZCODE. 820 */ 821 #define SFMMU_SET_TSB_MAX_GROWSIZE(pages) { \ 822 int _i, _szc, _slabszc, _tsbszc; \ 823 \ 824 _i = highbit(pages); \ 825 if ((1 << (_i - 1)) == (pages)) \ 826 _i--; /* 2^n case, round down */ \ 827 _szc = _i - TSB_START_SIZE; \ 828 _slabszc = bigtsb_slab_shift - (TSB_START_SIZE + TSB_ENTRY_SHIFT); \ 829 _tsbszc = MIN(_szc, _slabszc); \ 830 tsb_max_growsize = MIN(_tsbszc, UTSB_MAX_SZCODE); \ 831 } 832 833 /* 834 * Given a pointer to an sfmmu and a TTE size code, return a pointer to the 835 * tsb_info which handles that TTE size. 836 */ 837 #define SFMMU_GET_TSBINFO(tsbinfop, sfmmup, tte_szc) { \ 838 (tsbinfop) = (sfmmup)->sfmmu_tsb; \ 839 ASSERT(((tsbinfop)->tsb_flags & TSB_SHAREDCTX) || \ 840 sfmmu_hat_lock_held(sfmmup)); \ 841 if ((tte_szc) >= TTE4M) { \ 842 ASSERT((tsbinfop) != NULL); \ 843 (tsbinfop) = (tsbinfop)->tsb_next; \ 844 } \ 845 } 846 847 /* 848 * Macro to use to unload entries from the TSB. 849 * It has knowledge of which page sizes get replicated in the TSB 850 * and will call the appropriate unload routine for the appropriate size. 851 */ 852 #define SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, ismhat) \ 853 { \ 854 int ttesz = get_hblk_ttesz(hmeblkp); \ 855 if (ttesz == TTE8K || ttesz == TTE4M) { \ 856 sfmmu_unload_tsb(sfmmup, addr, ttesz); \ 857 } else { \ 858 caddr_t sva = ismhat ? addr : \ 859 (caddr_t)get_hblk_base(hmeblkp); \ 860 caddr_t eva = sva + get_hblk_span(hmeblkp); \ 861 ASSERT(addr >= sva && addr < eva); \ 862 sfmmu_unload_tsb_range(sfmmup, sva, eva, ttesz); \ 863 } \ 864 } 865 866 867 /* Update tsb_alloc_hiwater after memory is configured. */ 868 /*ARGSUSED*/ 869 static void 870 sfmmu_update_post_add(void *arg, pgcnt_t delta_pages) 871 { 872 /* Assumes physmem has already been updated. */ 873 SFMMU_SET_TSB_ALLOC_HIWATER(physmem); 874 SFMMU_SET_TSB_MAX_GROWSIZE(physmem); 875 } 876 877 /* 878 * Update tsb_alloc_hiwater before memory is deleted. We'll do nothing here 879 * and update tsb_alloc_hiwater and tsb_max_growsize after the memory is 880 * deleted. 881 */ 882 /*ARGSUSED*/ 883 static int 884 sfmmu_update_pre_del(void *arg, pgcnt_t delta_pages) 885 { 886 return (0); 887 } 888 889 /* Update tsb_alloc_hiwater after memory fails to be unconfigured. */ 890 /*ARGSUSED*/ 891 static void 892 sfmmu_update_post_del(void *arg, pgcnt_t delta_pages, int cancelled) 893 { 894 /* 895 * Whether the delete was cancelled or not, just go ahead and update 896 * tsb_alloc_hiwater and tsb_max_growsize. 
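 *
 * As a rough worked example of what SFMMU_SET_TSB_MAX_GROWSIZE recomputes
 * in the calls below (the assumption that the minimum TSB holds 2^9 == 512
 * entries is mine, not taken from this file): if physmem were 2^22 8K
 * pages, highbit() - 1 is 22, so the size code would be 22 - 9 = 13 before
 * being clamped to the slab-size and UTSB_MAX_SZCODE limits described
 * above.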
897  */
898 	SFMMU_SET_TSB_ALLOC_HIWATER(physmem);
899 	SFMMU_SET_TSB_MAX_GROWSIZE(physmem);
900 }
901 
902 static kphysm_setup_vector_t sfmmu_update_vec = {
903 	KPHYSM_SETUP_VECTOR_VERSION,	/* version */
904 	sfmmu_update_post_add,		/* post_add */
905 	sfmmu_update_pre_del,		/* pre_del */
906 	sfmmu_update_post_del		/* post_del */
907 };
908 
909 
910 /*
911  * HME_BLK HASH PRIMITIVES
912  */
913 
914 /*
915  * Enter a hme on the mapping list for page pp.
916  * When large pages are more prevalent in the system we might want to
917  * keep the mapping list in ascending order by the hment size. For now,
918  * small pages are more frequent, so don't slow it down.
919  */
920 #define	HME_ADD(hme, pp) \
921 { \
922 	ASSERT(sfmmu_mlist_held(pp)); \
923  \
924 	hme->hme_prev = NULL; \
925 	hme->hme_next = pp->p_mapping; \
926 	hme->hme_page = pp; \
927 	if (pp->p_mapping) { \
928 		((struct sf_hment *)(pp->p_mapping))->hme_prev = hme;\
929 		ASSERT(pp->p_share > 0); \
930 	} else { \
931 		/* EMPTY */ \
932 		ASSERT(pp->p_share == 0); \
933 	} \
934 	pp->p_mapping = hme; \
935 	pp->p_share++; \
936 }
937 
938 /*
939  * Remove a hme from the mapping list for page pp.
940  * If we are unmapping a large translation, we need to make sure that the
941  * change is reflected in the corresponding bit of the p_index field.
942  */
943 #define	HME_SUB(hme, pp) \
944 { \
945 	ASSERT(sfmmu_mlist_held(pp)); \
946 	ASSERT(hme->hme_page == pp || IS_PAHME(hme)); \
947  \
948 	if (pp->p_mapping == NULL) { \
949 		panic("hme_remove - no mappings"); \
950 	} \
951  \
952 	membar_stst();	/* ensure previous stores finish */ \
953  \
954 	ASSERT(pp->p_share > 0); \
955 	pp->p_share--; \
956  \
957 	if (hme->hme_prev) { \
958 		ASSERT(pp->p_mapping != hme); \
959 		ASSERT(hme->hme_prev->hme_page == pp || \
960 			IS_PAHME(hme->hme_prev)); \
961 		hme->hme_prev->hme_next = hme->hme_next; \
962 	} else { \
963 		ASSERT(pp->p_mapping == hme); \
964 		pp->p_mapping = hme->hme_next; \
965 		ASSERT((pp->p_mapping == NULL) ? \
966 			(pp->p_share == 0) : 1); \
967 	} \
968  \
969 	if (hme->hme_next) { \
970 		ASSERT(hme->hme_next->hme_page == pp || \
971 			IS_PAHME(hme->hme_next)); \
972 		hme->hme_next->hme_prev = hme->hme_prev; \
973 	} \
974  \
975 	/* zero out the entry */ \
976 	hme->hme_next = NULL; \
977 	hme->hme_prev = NULL; \
978 	hme->hme_page = NULL; \
979  \
980 	if (hme_size(hme) > TTE8K) { \
981 		/* remove mappings for remainder of large pg */ \
982 		sfmmu_rm_large_mappings(pp, hme_size(hme)); \
983 	} \
984 }
985 
986 /*
987  * This function returns the hment given the hme_blk and a vaddr.
988  * It assumes addr has already been checked to belong to hme_blk's
989  * range.
990  */
991 #define	HBLKTOHME(hment, hmeblkp, addr) \
992 { \
993 	int index; \
994 	HBLKTOHME_IDX(hment, hmeblkp, addr, index) \
995 }
996 
997 /*
998  * Version of HBLKTOHME that also returns the index in hmeblkp
999  * of the hment.
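 *
 * For example (assuming the usual NHMENTS of 8 and MMU_PAGESHIFT of 13):
 * an 8K-TTE hme_blk covers eight 8K pages, so the index is just bits
 * <15:13> of the address,
 *
 *	idx = ((uintptr_t)addr >> 13) & 7;
 *
 * whereas a block holding a single larger mapping always yields
 * hblk_hme[0].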
1000  */
1001 #define	HBLKTOHME_IDX(hment, hmeblkp, addr, idx) \
1002 { \
1003 	ASSERT(in_hblk_range((hmeblkp), (addr))); \
1004  \
1005 	if (get_hblk_ttesz(hmeblkp) == TTE8K) { \
1006 		idx = (((uintptr_t)(addr) >> MMU_PAGESHIFT) & (NHMENTS-1)); \
1007 	} else \
1008 		idx = 0; \
1009  \
1010 	(hment) = &(hmeblkp)->hblk_hme[idx]; \
1011 }
1012 
1013 /*
1014  * Disable any page sizes not supported by the CPU
1015  */
1016 void
1017 hat_init_pagesizes()
1018 {
1019 	int i;
1020 
1021 	mmu_exported_page_sizes = 0;
1022 	for (i = TTE8K; i < max_mmu_page_sizes; i++) {
1023 
1024 		szc_2_userszc[i] = (uint_t)-1;
1025 		userszc_2_szc[i] = (uint_t)-1;
1026 
1027 		if ((mmu_exported_pagesize_mask & (1 << i)) == 0) {
1028 			disable_large_pages |= (1 << i);
1029 		} else {
1030 			szc_2_userszc[i] = mmu_exported_page_sizes;
1031 			userszc_2_szc[mmu_exported_page_sizes] = i;
1032 			mmu_exported_page_sizes++;
1033 		}
1034 	}
1035 
1036 	disable_ism_large_pages |= disable_large_pages;
1037 	disable_auto_data_large_pages = disable_large_pages;
1038 	disable_auto_text_large_pages = disable_large_pages;
1039 
1040 	/*
1041 	 * Initialize mmu-specific large page sizes.
1042 	 */
1043 	if (&mmu_large_pages_disabled) {
1044 		disable_large_pages |= mmu_large_pages_disabled(HAT_LOAD);
1045 		disable_ism_large_pages |=
1046 		    mmu_large_pages_disabled(HAT_LOAD_SHARE);
1047 		disable_auto_data_large_pages |=
1048 		    mmu_large_pages_disabled(HAT_AUTO_DATA);
1049 		disable_auto_text_large_pages |=
1050 		    mmu_large_pages_disabled(HAT_AUTO_TEXT);
1051 	}
1052 }
1053 
1054 /*
1055  * Initialize the hardware address translation structures.
1056  */
1057 void
1058 hat_init(void)
1059 {
1060 	int i;
1061 	uint_t sz;
1062 	size_t size;
1063 
1064 	hat_lock_init();
1065 	hat_kstat_init();
1066 
1067 	/*
1068 	 * Hardware-only bits in a TTE
1069 	 */
1070 	MAKE_TTE_MASK(&hw_tte);
1071 
1072 	hat_init_pagesizes();
1073 
1074 	/* Initialize the hash locks */
1075 	for (i = 0; i < khmehash_num; i++) {
1076 		mutex_init(&khme_hash[i].hmehash_mutex, NULL,
1077 		    MUTEX_DEFAULT, NULL);
1078 		khme_hash[i].hmeh_nextpa = HMEBLK_ENDPA;
1079 	}
1080 	for (i = 0; i < uhmehash_num; i++) {
1081 		mutex_init(&uhme_hash[i].hmehash_mutex, NULL,
1082 		    MUTEX_DEFAULT, NULL);
1083 		uhme_hash[i].hmeh_nextpa = HMEBLK_ENDPA;
1084 	}
1085 	khmehash_num--;		/* make sure counter starts from 0 */
1086 	uhmehash_num--;		/* make sure counter starts from 0 */
1087 
1088 	/*
1089 	 * Allocate context domain structures.
1090 	 *
1091 	 * A platform may choose to modify max_mmu_ctxdoms in
1092 	 * set_platform_defaults(). If a platform does not define
1093 	 * a set_platform_defaults() or does not choose to modify
1094 	 * max_mmu_ctxdoms, it gets one MMU context domain for every CPU.
1095 	 *
1096 	 * For all platforms that have CPUs sharing MMUs, this
1097 	 * value must be defined.
1098 	 */
1099 	if (max_mmu_ctxdoms == 0)
1100 		max_mmu_ctxdoms = max_ncpus;
1101 
1102 	size = max_mmu_ctxdoms * sizeof (mmu_ctx_t *);
1103 	mmu_ctxs_tbl = kmem_zalloc(size, KM_SLEEP);
1104 
1105 	/* mmu_ctx_t is 64 bytes aligned */
1106 	mmuctxdom_cache = kmem_cache_create("mmuctxdom_cache",
1107 	    sizeof (mmu_ctx_t), 64, NULL, NULL, NULL, NULL, NULL, 0);
1108 	/*
1109 	 * MMU context domain initialization for the Boot CPU.
1110 	 * This needs the context domains array allocated above.
1111 	 */
1112 	mutex_enter(&cpu_lock);
1113 	sfmmu_cpu_init(CPU);
1114 	mutex_exit(&cpu_lock);
1115 
1116 	/*
1117 	 * Initialize ism mapping list lock.
1118 */ 1119 1120 mutex_init(&ism_mlist_lock, NULL, MUTEX_DEFAULT, NULL); 1121 1122 /* 1123 * Each sfmmu structure carries an array of MMU context info 1124 * structures, one per context domain. The size of this array depends 1125 * on the maximum number of context domains. So, the size of the 1126 * sfmmu structure varies per platform. 1127 * 1128 * sfmmu is allocated from static arena, because trap 1129 * handler at TL > 0 is not allowed to touch kernel relocatable 1130 * memory. sfmmu's alignment is changed to 64 bytes from 1131 * default 8 bytes, as the lower 6 bits will be used to pass 1132 * pgcnt to vtag_flush_pgcnt_tl1. 1133 */ 1134 size = sizeof (sfmmu_t) + sizeof (sfmmu_ctx_t) * (max_mmu_ctxdoms - 1); 1135 1136 sfmmuid_cache = kmem_cache_create("sfmmuid_cache", size, 1137 64, sfmmu_idcache_constructor, sfmmu_idcache_destructor, 1138 NULL, NULL, static_arena, 0); 1139 1140 sfmmu_tsbinfo_cache = kmem_cache_create("sfmmu_tsbinfo_cache", 1141 sizeof (struct tsb_info), 0, NULL, NULL, NULL, NULL, NULL, 0); 1142 1143 /* 1144 * Since we only use the tsb8k cache to "borrow" pages for TSBs 1145 * from the heap when low on memory or when TSB_FORCEALLOC is 1146 * specified, don't use magazines to cache them--we want to return 1147 * them to the system as quickly as possible. 1148 */ 1149 sfmmu_tsb8k_cache = kmem_cache_create("sfmmu_tsb8k_cache", 1150 MMU_PAGESIZE, MMU_PAGESIZE, NULL, NULL, NULL, NULL, 1151 static_arena, KMC_NOMAGAZINE); 1152 1153 /* 1154 * Set tsb_alloc_hiwater to 1/tsb_alloc_hiwater_factor of physical 1155 * memory, which corresponds to the old static reserve for TSBs. 1156 * tsb_alloc_hiwater_factor defaults to 32. This caps the amount of 1157 * memory we'll allocate for TSB slabs; beyond this point TSB 1158 * allocations will be taken from the kernel heap (via 1159 * sfmmu_tsb8k_cache) and will be throttled as would any other kmem 1160 * consumer. 1161 */ 1162 if (tsb_alloc_hiwater_factor == 0) { 1163 tsb_alloc_hiwater_factor = TSB_ALLOC_HIWATER_FACTOR_DEFAULT; 1164 } 1165 SFMMU_SET_TSB_ALLOC_HIWATER(physmem); 1166 1167 for (sz = tsb_slab_ttesz; sz > 0; sz--) { 1168 if (!(disable_large_pages & (1 << sz))) 1169 break; 1170 } 1171 1172 if (sz < tsb_slab_ttesz) { 1173 tsb_slab_ttesz = sz; 1174 tsb_slab_shift = MMU_PAGESHIFT + (sz << 1) + sz; 1175 tsb_slab_size = 1 << tsb_slab_shift; 1176 tsb_slab_mask = (1 << (tsb_slab_shift - MMU_PAGESHIFT)) - 1; 1177 use_bigtsb_arena = 0; 1178 } else if (use_bigtsb_arena && 1179 (disable_large_pages & (1 << bigtsb_slab_ttesz))) { 1180 use_bigtsb_arena = 0; 1181 } 1182 1183 if (!use_bigtsb_arena) { 1184 bigtsb_slab_shift = tsb_slab_shift; 1185 } 1186 SFMMU_SET_TSB_MAX_GROWSIZE(physmem); 1187 1188 /* 1189 * On smaller memory systems, allocate TSB memory in smaller chunks 1190 * than the default 4M slab size. We also honor disable_large_pages 1191 * here. 1192 * 1193 * The trap handlers need to be patched with the final slab shift, 1194 * since they need to be able to construct the TSB pointer at runtime. 
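 *
 * To make the arithmetic concrete (TTE512K == 2 and MMU_PAGESHIFT == 13
 * are the usual sun4 values): the fallback above computes
 * tsb_slab_shift = MMU_PAGESHIFT + 3 * sz, so dropping to sz == TTE512K
 * gives 13 + 6 = 19, i.e. 512K slabs; the test below applies the same
 * 512K fallback when tsb_max_growsize itself is no larger than
 * TSB_512K_SZCODE.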
1195 */ 1196 if ((tsb_max_growsize <= TSB_512K_SZCODE) && 1197 !(disable_large_pages & (1 << TTE512K))) { 1198 tsb_slab_ttesz = TTE512K; 1199 tsb_slab_shift = MMU_PAGESHIFT512K; 1200 tsb_slab_size = MMU_PAGESIZE512K; 1201 tsb_slab_mask = MMU_PAGEOFFSET512K >> MMU_PAGESHIFT; 1202 use_bigtsb_arena = 0; 1203 } 1204 1205 if (!use_bigtsb_arena) { 1206 bigtsb_slab_ttesz = tsb_slab_ttesz; 1207 bigtsb_slab_shift = tsb_slab_shift; 1208 bigtsb_slab_size = tsb_slab_size; 1209 bigtsb_slab_mask = tsb_slab_mask; 1210 } 1211 1212 1213 /* 1214 * Set up memory callback to update tsb_alloc_hiwater and 1215 * tsb_max_growsize. 1216 */ 1217 i = kphysm_setup_func_register(&sfmmu_update_vec, (void *) 0); 1218 ASSERT(i == 0); 1219 1220 /* 1221 * kmem_tsb_arena is the source from which large TSB slabs are 1222 * drawn. The quantum of this arena corresponds to the largest 1223 * TSB size we can dynamically allocate for user processes. 1224 * Currently it must also be a supported page size since we 1225 * use exactly one translation entry to map each slab page. 1226 * 1227 * The per-lgroup kmem_tsb_default_arena arenas are the arenas from 1228 * which most TSBs are allocated. Since most TSB allocations are 1229 * typically 8K we have a kmem cache we stack on top of each 1230 * kmem_tsb_default_arena to speed up those allocations. 1231 * 1232 * Note the two-level scheme of arenas is required only 1233 * because vmem_create doesn't allow us to specify alignment 1234 * requirements. If this ever changes the code could be 1235 * simplified to use only one level of arenas. 1236 * 1237 * If 256M page support exists on sun4v, 256MB kmem_bigtsb_arena 1238 * will be provided in addition to the 4M kmem_tsb_arena. 1239 */ 1240 if (use_bigtsb_arena) { 1241 kmem_bigtsb_arena = vmem_create("kmem_bigtsb", NULL, 0, 1242 bigtsb_slab_size, sfmmu_vmem_xalloc_aligned_wrapper, 1243 vmem_xfree, heap_arena, 0, VM_SLEEP); 1244 } 1245 1246 kmem_tsb_arena = vmem_create("kmem_tsb", NULL, 0, tsb_slab_size, 1247 sfmmu_vmem_xalloc_aligned_wrapper, 1248 vmem_xfree, heap_arena, 0, VM_SLEEP); 1249 1250 if (tsb_lgrp_affinity) { 1251 char s[50]; 1252 for (i = 0; i < NLGRPS_MAX; i++) { 1253 if (use_bigtsb_arena) { 1254 (void) sprintf(s, "kmem_bigtsb_lgrp%d", i); 1255 kmem_bigtsb_default_arena[i] = vmem_create(s, 1256 NULL, 0, 2 * tsb_slab_size, 1257 sfmmu_tsb_segkmem_alloc, 1258 sfmmu_tsb_segkmem_free, kmem_bigtsb_arena, 1259 0, VM_SLEEP | VM_BESTFIT); 1260 } 1261 1262 (void) sprintf(s, "kmem_tsb_lgrp%d", i); 1263 kmem_tsb_default_arena[i] = vmem_create(s, 1264 NULL, 0, PAGESIZE, sfmmu_tsb_segkmem_alloc, 1265 sfmmu_tsb_segkmem_free, kmem_tsb_arena, 0, 1266 VM_SLEEP | VM_BESTFIT); 1267 1268 (void) sprintf(s, "sfmmu_tsb_lgrp%d_cache", i); 1269 sfmmu_tsb_cache[i] = kmem_cache_create(s, 1270 PAGESIZE, PAGESIZE, NULL, NULL, NULL, NULL, 1271 kmem_tsb_default_arena[i], 0); 1272 } 1273 } else { 1274 if (use_bigtsb_arena) { 1275 kmem_bigtsb_default_arena[0] = 1276 vmem_create("kmem_bigtsb_default", NULL, 0, 1277 2 * tsb_slab_size, sfmmu_tsb_segkmem_alloc, 1278 sfmmu_tsb_segkmem_free, kmem_bigtsb_arena, 0, 1279 VM_SLEEP | VM_BESTFIT); 1280 } 1281 1282 kmem_tsb_default_arena[0] = vmem_create("kmem_tsb_default", 1283 NULL, 0, PAGESIZE, sfmmu_tsb_segkmem_alloc, 1284 sfmmu_tsb_segkmem_free, kmem_tsb_arena, 0, 1285 VM_SLEEP | VM_BESTFIT); 1286 sfmmu_tsb_cache[0] = kmem_cache_create("sfmmu_tsb_cache", 1287 PAGESIZE, PAGESIZE, NULL, NULL, NULL, NULL, 1288 kmem_tsb_default_arena[0], 0); 1289 } 1290 1291 sfmmu8_cache = kmem_cache_create("sfmmu8_cache", HME8BLK_SZ, 1292 
HMEBLK_ALIGN, sfmmu_hblkcache_constructor, 1293 sfmmu_hblkcache_destructor, 1294 sfmmu_hblkcache_reclaim, (void *)HME8BLK_SZ, 1295 hat_memload_arena, KMC_NOHASH); 1296 1297 hat_memload1_arena = vmem_create("hat_memload1", NULL, 0, PAGESIZE, 1298 segkmem_alloc_permanent, segkmem_free, heap_arena, 0, 1299 VMC_DUMPSAFE | VM_SLEEP); 1300 1301 sfmmu1_cache = kmem_cache_create("sfmmu1_cache", HME1BLK_SZ, 1302 HMEBLK_ALIGN, sfmmu_hblkcache_constructor, 1303 sfmmu_hblkcache_destructor, 1304 NULL, (void *)HME1BLK_SZ, 1305 hat_memload1_arena, KMC_NOHASH); 1306 1307 pa_hment_cache = kmem_cache_create("pa_hment_cache", PAHME_SZ, 1308 0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH); 1309 1310 ism_blk_cache = kmem_cache_create("ism_blk_cache", 1311 sizeof (ism_blk_t), ecache_alignsize, NULL, NULL, 1312 NULL, NULL, static_arena, KMC_NOHASH); 1313 1314 ism_ment_cache = kmem_cache_create("ism_ment_cache", 1315 sizeof (ism_ment_t), 0, NULL, NULL, 1316 NULL, NULL, NULL, 0); 1317 1318 /* 1319 * We grab the first hat for the kernel, 1320 */ 1321 AS_LOCK_ENTER(&kas, RW_WRITER); 1322 kas.a_hat = hat_alloc(&kas); 1323 AS_LOCK_EXIT(&kas); 1324 1325 /* 1326 * Initialize hblk_reserve. 1327 */ 1328 ((struct hme_blk *)hblk_reserve)->hblk_nextpa = 1329 va_to_pa((caddr_t)hblk_reserve); 1330 1331 #ifndef UTSB_PHYS 1332 /* 1333 * Reserve some kernel virtual address space for the locked TTEs 1334 * that allow us to probe the TSB from TL>0. 1335 */ 1336 utsb_vabase = vmem_xalloc(heap_arena, tsb_slab_size, tsb_slab_size, 1337 0, 0, NULL, NULL, VM_SLEEP); 1338 utsb4m_vabase = vmem_xalloc(heap_arena, tsb_slab_size, tsb_slab_size, 1339 0, 0, NULL, NULL, VM_SLEEP); 1340 #endif 1341 1342 #ifdef VAC 1343 /* 1344 * The big page VAC handling code assumes VAC 1345 * will not be bigger than the smallest big 1346 * page- which is 64K. 1347 */ 1348 if (TTEPAGES(TTE64K) < CACHE_NUM_COLOR) { 1349 cmn_err(CE_PANIC, "VAC too big!"); 1350 } 1351 #endif 1352 1353 uhme_hash_pa = va_to_pa(uhme_hash); 1354 khme_hash_pa = va_to_pa(khme_hash); 1355 1356 /* 1357 * Initialize relocation locks. kpr_suspendlock is held 1358 * at PIL_MAX to prevent interrupts from pinning the holder 1359 * of a suspended TTE which may access it leading to a 1360 * deadlock condition. 1361 */ 1362 mutex_init(&kpr_mutex, NULL, MUTEX_DEFAULT, NULL); 1363 mutex_init(&kpr_suspendlock, NULL, MUTEX_SPIN, (void *)PIL_MAX); 1364 1365 /* 1366 * If Shared context support is disabled via /etc/system 1367 * set shctx_on to 0 here if it was set to 1 earlier in boot 1368 * sequence by cpu module initialization code. 1369 */ 1370 if (shctx_on && disable_shctx) { 1371 shctx_on = 0; 1372 } 1373 1374 if (shctx_on) { 1375 srd_buckets = kmem_zalloc(SFMMU_MAX_SRD_BUCKETS * 1376 sizeof (srd_buckets[0]), KM_SLEEP); 1377 for (i = 0; i < SFMMU_MAX_SRD_BUCKETS; i++) { 1378 mutex_init(&srd_buckets[i].srdb_lock, NULL, 1379 MUTEX_DEFAULT, NULL); 1380 } 1381 1382 srd_cache = kmem_cache_create("srd_cache", sizeof (sf_srd_t), 1383 0, sfmmu_srdcache_constructor, sfmmu_srdcache_destructor, 1384 NULL, NULL, NULL, 0); 1385 region_cache = kmem_cache_create("region_cache", 1386 sizeof (sf_region_t), 0, sfmmu_rgncache_constructor, 1387 sfmmu_rgncache_destructor, NULL, NULL, NULL, 0); 1388 scd_cache = kmem_cache_create("scd_cache", sizeof (sf_scd_t), 1389 0, sfmmu_scdcache_constructor, sfmmu_scdcache_destructor, 1390 NULL, NULL, NULL, 0); 1391 } 1392 1393 /* 1394 * Pre-allocate hrm_hashtab before enabling the collection of 1395 * refmod statistics. 
Allocating on the fly would mean us 1396 * running the risk of suffering recursive mutex enters or 1397 * deadlocks. 1398 */ 1399 hrm_hashtab = kmem_zalloc(HRM_HASHSIZE * sizeof (struct hrmstat *), 1400 KM_SLEEP); 1401 1402 /* Allocate per-cpu pending freelist of hmeblks */ 1403 cpu_hme_pend = kmem_zalloc((NCPU * sizeof (cpu_hme_pend_t)) + 64, 1404 KM_SLEEP); 1405 cpu_hme_pend = (cpu_hme_pend_t *)P2ROUNDUP( 1406 (uintptr_t)cpu_hme_pend, 64); 1407 1408 for (i = 0; i < NCPU; i++) { 1409 mutex_init(&cpu_hme_pend[i].chp_mutex, NULL, MUTEX_DEFAULT, 1410 NULL); 1411 } 1412 1413 if (cpu_hme_pend_thresh == 0) { 1414 cpu_hme_pend_thresh = CPU_HME_PEND_THRESH; 1415 } 1416 } 1417 1418 /* 1419 * Initialize locking for the hat layer, called early during boot. 1420 */ 1421 static void 1422 hat_lock_init() 1423 { 1424 int i; 1425 1426 /* 1427 * initialize the array of mutexes protecting a page's mapping 1428 * list and p_nrm field. 1429 */ 1430 for (i = 0; i < MML_TABLE_SIZE; i++) 1431 mutex_init(&mml_table[i].pad_mutex, NULL, MUTEX_DEFAULT, NULL); 1432 1433 if (kpm_enable) { 1434 for (i = 0; i < kpmp_table_sz; i++) { 1435 mutex_init(&kpmp_table[i].khl_mutex, NULL, 1436 MUTEX_DEFAULT, NULL); 1437 } 1438 } 1439 1440 /* 1441 * Initialize array of mutex locks that protects sfmmu fields and 1442 * TSB lists. 1443 */ 1444 for (i = 0; i < SFMMU_NUM_LOCK; i++) 1445 mutex_init(HATLOCK_MUTEXP(&hat_lock[i]), NULL, MUTEX_DEFAULT, 1446 NULL); 1447 } 1448 1449 #define SFMMU_KERNEL_MAXVA \ 1450 (kmem64_base ? (uintptr_t)kmem64_end : (SYSLIMIT)) 1451 1452 /* 1453 * Allocate a hat structure. 1454 * Called when an address space first uses a hat. 1455 */ 1456 struct hat * 1457 hat_alloc(struct as *as) 1458 { 1459 sfmmu_t *sfmmup; 1460 int i; 1461 uint64_t cnum; 1462 extern uint_t get_color_start(struct as *); 1463 1464 ASSERT(AS_WRITE_HELD(as)); 1465 sfmmup = kmem_cache_alloc(sfmmuid_cache, KM_SLEEP); 1466 sfmmup->sfmmu_as = as; 1467 sfmmup->sfmmu_flags = 0; 1468 sfmmup->sfmmu_tteflags = 0; 1469 sfmmup->sfmmu_rtteflags = 0; 1470 LOCK_INIT_CLEAR(&sfmmup->sfmmu_ctx_lock); 1471 1472 if (as == &kas) { 1473 ksfmmup = sfmmup; 1474 sfmmup->sfmmu_cext = 0; 1475 cnum = KCONTEXT; 1476 1477 sfmmup->sfmmu_clrstart = 0; 1478 sfmmup->sfmmu_tsb = NULL; 1479 /* 1480 * hat_kern_setup() will call sfmmu_init_ktsbinfo() 1481 * to setup tsb_info for ksfmmup. 1482 */ 1483 } else { 1484 1485 /* 1486 * Just set to invalid ctx. When it faults, it will 1487 * get a valid ctx. This would avoid the situation 1488 * where we get a ctx, but it gets stolen and then 1489 * we fault when we try to run and so have to get 1490 * another ctx. 
1491 */ 1492 sfmmup->sfmmu_cext = 0; 1493 cnum = INVALID_CONTEXT; 1494 1495 /* initialize original physical page coloring bin */ 1496 sfmmup->sfmmu_clrstart = get_color_start(as); 1497 #ifdef DEBUG 1498 if (tsb_random_size) { 1499 uint32_t randval = (uint32_t)gettick() >> 4; 1500 int size = randval % (tsb_max_growsize + 1); 1501 1502 /* chose a random tsb size for stress testing */ 1503 (void) sfmmu_tsbinfo_alloc(&sfmmup->sfmmu_tsb, size, 1504 TSB8K|TSB64K|TSB512K, 0, sfmmup); 1505 } else 1506 #endif /* DEBUG */ 1507 (void) sfmmu_tsbinfo_alloc(&sfmmup->sfmmu_tsb, 1508 default_tsb_size, 1509 TSB8K|TSB64K|TSB512K, 0, sfmmup); 1510 sfmmup->sfmmu_flags = HAT_SWAPPED | HAT_ALLCTX_INVALID; 1511 ASSERT(sfmmup->sfmmu_tsb != NULL); 1512 } 1513 1514 ASSERT(max_mmu_ctxdoms > 0); 1515 for (i = 0; i < max_mmu_ctxdoms; i++) { 1516 sfmmup->sfmmu_ctxs[i].cnum = cnum; 1517 sfmmup->sfmmu_ctxs[i].gnum = 0; 1518 } 1519 1520 for (i = 0; i < max_mmu_page_sizes; i++) { 1521 sfmmup->sfmmu_ttecnt[i] = 0; 1522 sfmmup->sfmmu_scdrttecnt[i] = 0; 1523 sfmmup->sfmmu_ismttecnt[i] = 0; 1524 sfmmup->sfmmu_scdismttecnt[i] = 0; 1525 sfmmup->sfmmu_pgsz[i] = TTE8K; 1526 } 1527 sfmmup->sfmmu_tsb0_4minflcnt = 0; 1528 sfmmup->sfmmu_iblk = NULL; 1529 sfmmup->sfmmu_ismhat = 0; 1530 sfmmup->sfmmu_scdhat = 0; 1531 sfmmup->sfmmu_ismblkpa = (uint64_t)-1; 1532 if (sfmmup == ksfmmup) { 1533 CPUSET_ALL(sfmmup->sfmmu_cpusran); 1534 } else { 1535 CPUSET_ZERO(sfmmup->sfmmu_cpusran); 1536 } 1537 sfmmup->sfmmu_free = 0; 1538 sfmmup->sfmmu_rmstat = 0; 1539 sfmmup->sfmmu_clrbin = sfmmup->sfmmu_clrstart; 1540 cv_init(&sfmmup->sfmmu_tsb_cv, NULL, CV_DEFAULT, NULL); 1541 sfmmup->sfmmu_srdp = NULL; 1542 SF_RGNMAP_ZERO(sfmmup->sfmmu_region_map); 1543 bzero(sfmmup->sfmmu_hmeregion_links, SFMMU_L1_HMERLINKS_SIZE); 1544 sfmmup->sfmmu_scdp = NULL; 1545 sfmmup->sfmmu_scd_link.next = NULL; 1546 sfmmup->sfmmu_scd_link.prev = NULL; 1547 return (sfmmup); 1548 } 1549 1550 /* 1551 * Create per-MMU context domain kstats for a given MMU ctx. 1552 */ 1553 static void 1554 sfmmu_mmu_kstat_create(mmu_ctx_t *mmu_ctxp) 1555 { 1556 mmu_ctx_stat_t stat; 1557 kstat_t *mmu_kstat; 1558 1559 ASSERT(MUTEX_HELD(&cpu_lock)); 1560 ASSERT(mmu_ctxp->mmu_kstat == NULL); 1561 1562 mmu_kstat = kstat_create("unix", mmu_ctxp->mmu_idx, "mmu_ctx", 1563 "hat", KSTAT_TYPE_NAMED, MMU_CTX_NUM_STATS, KSTAT_FLAG_VIRTUAL); 1564 1565 if (mmu_kstat == NULL) { 1566 cmn_err(CE_WARN, "kstat_create for MMU %d failed", 1567 mmu_ctxp->mmu_idx); 1568 } else { 1569 mmu_kstat->ks_data = mmu_ctxp->mmu_kstat_data; 1570 for (stat = 0; stat < MMU_CTX_NUM_STATS; stat++) 1571 kstat_named_init(&mmu_ctxp->mmu_kstat_data[stat], 1572 mmu_ctx_kstat_names[stat], KSTAT_DATA_INT64); 1573 mmu_ctxp->mmu_kstat = mmu_kstat; 1574 kstat_install(mmu_kstat); 1575 } 1576 } 1577 1578 /* 1579 * plat_cpuid_to_mmu_ctx_info() is a platform interface that returns MMU 1580 * context domain information for a given CPU. If a platform does not 1581 * specify that interface, then the function below is used instead to return 1582 * default information. The defaults are as follows: 1583 * 1584 * - The number of MMU context IDs supported on any CPU in the 1585 * system is 8K. 1586 * - There is one MMU context domain per CPU. 
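 *
 * A sketch of what a platform override might look like (the four-strands-
 * per-MMU grouping is purely illustrative, not taken from this file):
 *
 *	void
 *	plat_cpuid_to_mmu_ctx_info(processorid_t cpuid, mmu_ctx_info_t *infop)
 *	{
 *		infop->mmu_nctxs = 8192;
 *		infop->mmu_idx = cpuid / 4;	(one domain per physical MMU)
 *	}
 *
 * so that the CPUs sharing an MMU also share one context domain rather
 * than getting one domain per CPU.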
1587  */
1588 /*ARGSUSED*/
1589 static void
1590 sfmmu_cpuid_to_mmu_ctx_info(processorid_t cpuid, mmu_ctx_info_t *infop)
1591 {
1592 	infop->mmu_nctxs = nctxs;
1593 	infop->mmu_idx = cpu[cpuid]->cpu_seqid;
1594 }
1595 
1596 /*
1597  * Called during CPU initialization to set the MMU context-related information
1598  * for a CPU.
1599  *
1600  * cpu_lock serializes accesses to mmu_ctxs and mmu_saved_gnum.
1601  */
1602 void
1603 sfmmu_cpu_init(cpu_t *cp)
1604 {
1605 	mmu_ctx_info_t	info;
1606 	mmu_ctx_t	*mmu_ctxp;
1607 
1608 	ASSERT(MUTEX_HELD(&cpu_lock));
1609 
1610 	if (&plat_cpuid_to_mmu_ctx_info == NULL)
1611 		sfmmu_cpuid_to_mmu_ctx_info(cp->cpu_id, &info);
1612 	else
1613 		plat_cpuid_to_mmu_ctx_info(cp->cpu_id, &info);
1614 
1615 	ASSERT(info.mmu_idx < max_mmu_ctxdoms);
1616 
1617 	if ((mmu_ctxp = mmu_ctxs_tbl[info.mmu_idx]) == NULL) {
1618 		/* Each mmu_ctx is cacheline aligned. */
1619 		mmu_ctxp = kmem_cache_alloc(mmuctxdom_cache, KM_SLEEP);
1620 		bzero(mmu_ctxp, sizeof (mmu_ctx_t));
1621 
1622 		mutex_init(&mmu_ctxp->mmu_lock, NULL, MUTEX_SPIN,
1623 		    (void *)ipltospl(DISP_LEVEL));
1624 		mmu_ctxp->mmu_idx = info.mmu_idx;
1625 		mmu_ctxp->mmu_nctxs = info.mmu_nctxs;
1626 		/*
1627 		 * Globally for lifetime of a system,
1628 		 * gnum must always increase.
1629 		 * mmu_saved_gnum is protected by the cpu_lock.
1630 		 */
1631 		mmu_ctxp->mmu_gnum = mmu_saved_gnum + 1;
1632 		mmu_ctxp->mmu_cnum = NUM_LOCKED_CTXS;
1633 
1634 		sfmmu_mmu_kstat_create(mmu_ctxp);
1635 
1636 		mmu_ctxs_tbl[info.mmu_idx] = mmu_ctxp;
1637 	} else {
1638 		ASSERT(mmu_ctxp->mmu_idx == info.mmu_idx);
1639 		ASSERT(mmu_ctxp->mmu_nctxs <= info.mmu_nctxs);
1640 	}
1641 
1642 	/*
1643 	 * The mmu_lock is acquired here to prevent races with
1644 	 * the wrap-around code.
1645 	 */
1646 	mutex_enter(&mmu_ctxp->mmu_lock);
1647 
1648 
1649 	mmu_ctxp->mmu_ncpus++;
1650 	CPUSET_ADD(mmu_ctxp->mmu_cpuset, cp->cpu_id);
1651 	CPU_MMU_IDX(cp) = info.mmu_idx;
1652 	CPU_MMU_CTXP(cp) = mmu_ctxp;
1653 
1654 	mutex_exit(&mmu_ctxp->mmu_lock);
1655 }
1656 
1657 static void
1658 sfmmu_ctxdom_free(mmu_ctx_t *mmu_ctxp)
1659 {
1660 	ASSERT(MUTEX_HELD(&cpu_lock));
1661 	ASSERT(!MUTEX_HELD(&mmu_ctxp->mmu_lock));
1662 
1663 	mutex_destroy(&mmu_ctxp->mmu_lock);
1664 
1665 	if (mmu_ctxp->mmu_kstat)
1666 		kstat_delete(mmu_ctxp->mmu_kstat);
1667 
1668 	/* mmu_saved_gnum is protected by the cpu_lock. */
1669 	if (mmu_saved_gnum < mmu_ctxp->mmu_gnum)
1670 		mmu_saved_gnum = mmu_ctxp->mmu_gnum;
1671 
1672 	kmem_cache_free(mmuctxdom_cache, mmu_ctxp);
1673 }
1674 
1675 /*
1676  * Called to perform MMU context-related cleanup for a CPU.
1677  */
1678 void
1679 sfmmu_cpu_cleanup(cpu_t *cp)
1680 {
1681 	mmu_ctx_t	*mmu_ctxp;
1682 
1683 	ASSERT(MUTEX_HELD(&cpu_lock));
1684 
1685 	mmu_ctxp = CPU_MMU_CTXP(cp);
1686 	ASSERT(mmu_ctxp != NULL);
1687 
1688 	/*
1689 	 * The mmu_lock is acquired here to prevent races with
1690 	 * the wrap-around code.
1691 	 */
1692 	mutex_enter(&mmu_ctxp->mmu_lock);
1693 
1694 	CPU_MMU_CTXP(cp) = NULL;
1695 
1696 	CPUSET_DEL(mmu_ctxp->mmu_cpuset, cp->cpu_id);
1697 	if (--mmu_ctxp->mmu_ncpus == 0) {
1698 		mmu_ctxs_tbl[mmu_ctxp->mmu_idx] = NULL;
1699 		mutex_exit(&mmu_ctxp->mmu_lock);
1700 		sfmmu_ctxdom_free(mmu_ctxp);
1701 		return;
1702 	}
1703 
1704 	mutex_exit(&mmu_ctxp->mmu_lock);
1705 }
1706 
1707 uint_t
1708 sfmmu_ctxdom_nctxs(int idx)
1709 {
1710 	return (mmu_ctxs_tbl[idx]->mmu_nctxs);
1711 }
1712 
1713 #ifdef sun4v
1714 /*
1715  * sfmmu_ctxdoms_* is an interface provided to help keep context domains
1716  * consistent after suspend/resume on a system that can resume on different
1717  * hardware than it was suspended on.
1718 * 1719 * sfmmu_ctxdoms_lock(void) locks all context domains and prevents new contexts 1720 * from being allocated. It acquires all hat_locks, which blocks most access to 1721 * context data, except for a few cases that are handled separately or are 1722 * harmless. It wraps each domain to increment gnum and invalidate on-CPU 1723 * contexts, and forces cnum to its max. As a result of this call all user 1724 * threads that are running on CPUs trap and try to perform wrap around but 1725 * can't because hat_locks are taken. Threads that were not on CPUs but started 1726 * by scheduler go to sfmmu_alloc_ctx() to acquire a context without checking 1727 * hat_lock, but fail, because cnum == nctxs, and therefore also trap and block 1728 * on hat_lock trying to wrap. sfmmu_ctxdoms_lock() must be called before CPUs 1729 * are paused, else it could deadlock acquiring locks held by paused CPUs. 1730 * 1731 * sfmmu_ctxdoms_remove() removes context domains from all CPUs and records 1732 * the CPUs that had them. It must be called after CPUs have been paused. This 1733 * ensures that no threads are in sfmmu_alloc_ctx() accessing domain data, 1734 * because pause_cpus sends a mondo interrupt to every CPU, and sfmmu_alloc_ctx 1735 * runs with interrupts disabled. When CPUs are later resumed, they may enter 1736 * sfmmu_alloc_ctx, but it will check for CPU_MMU_CTXP = NULL and immediately 1737 * return failure. Or, they will be blocked trying to acquire hat_lock. Thus 1738 * after sfmmu_ctxdoms_remove returns, we are guaranteed that no one is 1739 * accessing the old context domains. 1740 * 1741 * sfmmu_ctxdoms_update(void) frees space used by old context domains and 1742 * allocates new context domains based on hardware layout. It initializes 1743 * every CPU that had a context domain before migration to have one again. 1744 * sfmmu_ctxdoms_update must be called after CPUs are resumed, else it 1745 * could deadlock acquiring locks held by paused CPUs. 1746 * 1747 * sfmmu_ctxdoms_unlock(void) releases all hat_locks after which user threads 1748 * acquire new context ids and continue execution. 1749 * 1750 * Therefore these functions should be called in the following order: 1751 * suspend_routine() 1752 * sfmmu_ctxdoms_lock() 1753 * pause_cpus() 1754 * suspend() 1755 * if (suspend failed) 1756 * sfmmu_ctxdoms_unlock() 1757 * ... 1758 * sfmmu_ctxdoms_remove() 1759 * resume_cpus() 1760 * sfmmu_ctxdoms_update() 1761 * sfmmu_ctxdoms_unlock() 1762 */ 1763 static cpuset_t sfmmu_ctxdoms_pset; 1764 1765 void 1766 sfmmu_ctxdoms_remove() 1767 { 1768 processorid_t id; 1769 cpu_t *cp; 1770 1771 /* 1772 * Record the CPUs that have domains in sfmmu_ctxdoms_pset, so they can 1773 * be restored post-migration. A CPU may be powered off and not have a 1774 * domain, for example. 1775 */ 1776 CPUSET_ZERO(sfmmu_ctxdoms_pset); 1777 1778 for (id = 0; id < NCPU; id++) { 1779 if ((cp = cpu[id]) != NULL && CPU_MMU_CTXP(cp) != NULL) { 1780 CPUSET_ADD(sfmmu_ctxdoms_pset, id); 1781 CPU_MMU_CTXP(cp) = NULL; 1782 } 1783 } 1784 } 1785 1786 void 1787 sfmmu_ctxdoms_lock(void) 1788 { 1789 int idx; 1790 mmu_ctx_t *mmu_ctxp; 1791 1792 sfmmu_hat_lock_all(); 1793 1794 /* 1795 * At this point, no thread can be in sfmmu_ctx_wrap_around, because 1796 * hat_lock is always taken before calling it. 1797 * 1798 * For each domain, set mmu_cnum to max so no more contexts can be 1799 * allocated, and wrap to flush on-CPU contexts and force threads to 1800 * acquire a new context when we later drop hat_lock after migration.
1801 * Setting mmu_cnum may race with sfmmu_alloc_ctx which also sets cnum, 1802 * but the latter uses CAS and will miscompare and not overwrite it. 1803 */ 1804 kpreempt_disable(); /* required by sfmmu_ctx_wrap_around */ 1805 for (idx = 0; idx < max_mmu_ctxdoms; idx++) { 1806 if ((mmu_ctxp = mmu_ctxs_tbl[idx]) != NULL) { 1807 mutex_enter(&mmu_ctxp->mmu_lock); 1808 mmu_ctxp->mmu_cnum = mmu_ctxp->mmu_nctxs; 1809 /* make sure updated cnum visible */ 1810 membar_enter(); 1811 mutex_exit(&mmu_ctxp->mmu_lock); 1812 sfmmu_ctx_wrap_around(mmu_ctxp, B_FALSE); 1813 } 1814 } 1815 kpreempt_enable(); 1816 } 1817 1818 void 1819 sfmmu_ctxdoms_unlock(void) 1820 { 1821 sfmmu_hat_unlock_all(); 1822 } 1823 1824 void 1825 sfmmu_ctxdoms_update(void) 1826 { 1827 processorid_t id; 1828 cpu_t *cp; 1829 uint_t idx; 1830 mmu_ctx_t *mmu_ctxp; 1831 1832 /* 1833 * Free all context domains. As side effect, this increases 1834 * mmu_saved_gnum to the maximum gnum over all domains, which is used to 1835 * init gnum in the new domains, which therefore will be larger than the 1836 * sfmmu gnum for any process, guaranteeing that every process will see 1837 * a new generation and allocate a new context regardless of what new 1838 * domain it runs in. 1839 */ 1840 mutex_enter(&cpu_lock); 1841 1842 for (idx = 0; idx < max_mmu_ctxdoms; idx++) { 1843 if (mmu_ctxs_tbl[idx] != NULL) { 1844 mmu_ctxp = mmu_ctxs_tbl[idx]; 1845 mmu_ctxs_tbl[idx] = NULL; 1846 sfmmu_ctxdom_free(mmu_ctxp); 1847 } 1848 } 1849 1850 for (id = 0; id < NCPU; id++) { 1851 if (CPU_IN_SET(sfmmu_ctxdoms_pset, id) && 1852 (cp = cpu[id]) != NULL) 1853 sfmmu_cpu_init(cp); 1854 } 1855 mutex_exit(&cpu_lock); 1856 } 1857 #endif 1858 1859 /* 1860 * Hat_setup, makes an address space context the current active one. 1861 * In sfmmu this translates to setting the secondary context with the 1862 * corresponding context. 1863 */ 1864 void 1865 hat_setup(struct hat *sfmmup, int allocflag) 1866 { 1867 hatlock_t *hatlockp; 1868 1869 /* Init needs some special treatment. */ 1870 if (allocflag == HAT_INIT) { 1871 /* 1872 * Make sure that we have 1873 * 1. a TSB 1874 * 2. a valid ctx that doesn't get stolen after this point. 1875 */ 1876 hatlockp = sfmmu_hat_enter(sfmmup); 1877 1878 /* 1879 * Swap in the TSB. hat_init() allocates tsbinfos without 1880 * TSBs, but we need one for init, since the kernel does some 1881 * special things to set up its stack and needs the TSB to 1882 * resolve page faults. 1883 */ 1884 sfmmu_tsb_swapin(sfmmup, hatlockp); 1885 1886 sfmmu_get_ctx(sfmmup); 1887 1888 sfmmu_hat_exit(hatlockp); 1889 } else { 1890 ASSERT(allocflag == HAT_ALLOC); 1891 1892 hatlockp = sfmmu_hat_enter(sfmmup); 1893 kpreempt_disable(); 1894 1895 CPUSET_ADD(sfmmup->sfmmu_cpusran, CPU->cpu_id); 1896 /* 1897 * sfmmu_setctx_sec takes <pgsz|cnum> as a parameter, 1898 * pagesize bits don't matter in this case since we are passing 1899 * INVALID_CONTEXT to it. 1900 * Compatibility Note: hw takes care of MMU_SCONTEXT1 1901 */ 1902 sfmmu_setctx_sec(INVALID_CONTEXT); 1903 sfmmu_clear_utsbinfo(); 1904 1905 kpreempt_enable(); 1906 sfmmu_hat_exit(hatlockp); 1907 } 1908 } 1909 1910 /* 1911 * Free all the translation resources for the specified address space. 1912 * Called from as_free when an address space is being destroyed. 
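 *
 * A rough caller-side sketch (what as_free() does is simplified here and
 * belongs to the VM layer, not this file):
 *
 *	hat_free_start(as->a_hat);
 *	...unmap and free every segment in the address space...
 *	hat_free_end(as->a_hat);
 *
 * hat_free_start() only marks the hat as being freed and detaches it from
 * any SCD; the real teardown happens in hat_free_end() once all mappings
 * are gone, which is what the ttecnt ASSERTs there depend on.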
1913 */ 1914 void 1915 hat_free_start(struct hat *sfmmup) 1916 { 1917 ASSERT(AS_WRITE_HELD(sfmmup->sfmmu_as)); 1918 ASSERT(sfmmup != ksfmmup); 1919 1920 sfmmup->sfmmu_free = 1; 1921 if (sfmmup->sfmmu_scdp != NULL) { 1922 sfmmu_leave_scd(sfmmup, 0); 1923 } 1924 1925 ASSERT(sfmmup->sfmmu_scdp == NULL); 1926 } 1927 1928 void 1929 hat_free_end(struct hat *sfmmup) 1930 { 1931 int i; 1932 1933 ASSERT(sfmmup->sfmmu_free == 1); 1934 ASSERT(sfmmup->sfmmu_ttecnt[TTE8K] == 0); 1935 ASSERT(sfmmup->sfmmu_ttecnt[TTE64K] == 0); 1936 ASSERT(sfmmup->sfmmu_ttecnt[TTE512K] == 0); 1937 ASSERT(sfmmup->sfmmu_ttecnt[TTE4M] == 0); 1938 ASSERT(sfmmup->sfmmu_ttecnt[TTE32M] == 0); 1939 ASSERT(sfmmup->sfmmu_ttecnt[TTE256M] == 0); 1940 1941 if (sfmmup->sfmmu_rmstat) { 1942 hat_freestat(sfmmup->sfmmu_as, NULL); 1943 } 1944 1945 while (sfmmup->sfmmu_tsb != NULL) { 1946 struct tsb_info *next = sfmmup->sfmmu_tsb->tsb_next; 1947 sfmmu_tsbinfo_free(sfmmup->sfmmu_tsb); 1948 sfmmup->sfmmu_tsb = next; 1949 } 1950 1951 if (sfmmup->sfmmu_srdp != NULL) { 1952 sfmmu_leave_srd(sfmmup); 1953 ASSERT(sfmmup->sfmmu_srdp == NULL); 1954 for (i = 0; i < SFMMU_L1_HMERLINKS; i++) { 1955 if (sfmmup->sfmmu_hmeregion_links[i] != NULL) { 1956 kmem_free(sfmmup->sfmmu_hmeregion_links[i], 1957 SFMMU_L2_HMERLINKS_SIZE); 1958 sfmmup->sfmmu_hmeregion_links[i] = NULL; 1959 } 1960 } 1961 } 1962 sfmmu_free_sfmmu(sfmmup); 1963 1964 #ifdef DEBUG 1965 for (i = 0; i < SFMMU_L1_HMERLINKS; i++) { 1966 ASSERT(sfmmup->sfmmu_hmeregion_links[i] == NULL); 1967 } 1968 #endif 1969 1970 kmem_cache_free(sfmmuid_cache, sfmmup); 1971 } 1972 1973 /* 1974 * Set up any translation structures, for the specified address space, 1975 * that are needed or preferred when the process is being swapped in. 1976 */ 1977 /* ARGSUSED */ 1978 void 1979 hat_swapin(struct hat *hat) 1980 { 1981 } 1982 1983 /* 1984 * Free all of the translation resources, for the specified address space, 1985 * that can be freed while the process is swapped out. Called from as_swapout. 1986 * Also, free up the ctx that this process was using. 1987 */ 1988 void 1989 hat_swapout(struct hat *sfmmup) 1990 { 1991 struct hmehash_bucket *hmebp; 1992 struct hme_blk *hmeblkp; 1993 struct hme_blk *pr_hblk = NULL; 1994 struct hme_blk *nx_hblk; 1995 int i; 1996 struct hme_blk *list = NULL; 1997 hatlock_t *hatlockp; 1998 struct tsb_info *tsbinfop; 1999 struct free_tsb { 2000 struct free_tsb *next; 2001 struct tsb_info *tsbinfop; 2002 }; /* free list of TSBs */ 2003 struct free_tsb *freelist, *last, *next; 2004 2005 SFMMU_STAT(sf_swapout); 2006 2007 /* 2008 * There is no way to go from an as to all its translations in sfmmu. 2009 * Here is one of the times when we take the big hit and traverse 2010 * the hash looking for hme_blks to free up. Not only do we free up 2011 * this as hme_blks but all those that are free. We are obviously 2012 * swapping because we need memory so let's free up as much 2013 * as we can. 2014 * 2015 * Note that we don't flush TLB/TSB here -- it's not necessary 2016 * because: 2017 * 1) we free the ctx we're using and throw away the TSB(s); 2018 * 2) processes aren't runnable while being swapped out. 
2019 */ 2020 ASSERT(sfmmup != KHATID); 2021 for (i = 0; i <= UHMEHASH_SZ; i++) { 2022 hmebp = &uhme_hash[i]; 2023 SFMMU_HASH_LOCK(hmebp); 2024 hmeblkp = hmebp->hmeblkp; 2025 pr_hblk = NULL; 2026 while (hmeblkp) { 2027 2028 if ((hmeblkp->hblk_tag.htag_id == sfmmup) && 2029 !hmeblkp->hblk_shw_bit && !hmeblkp->hblk_lckcnt) { 2030 ASSERT(!hmeblkp->hblk_shared); 2031 (void) sfmmu_hblk_unload(sfmmup, hmeblkp, 2032 (caddr_t)get_hblk_base(hmeblkp), 2033 get_hblk_endaddr(hmeblkp), 2034 NULL, HAT_UNLOAD); 2035 } 2036 nx_hblk = hmeblkp->hblk_next; 2037 if (!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) { 2038 ASSERT(!hmeblkp->hblk_lckcnt); 2039 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, 2040 &list, 0); 2041 } else { 2042 pr_hblk = hmeblkp; 2043 } 2044 hmeblkp = nx_hblk; 2045 } 2046 SFMMU_HASH_UNLOCK(hmebp); 2047 } 2048 2049 sfmmu_hblks_list_purge(&list, 0); 2050 2051 /* 2052 * Now free up the ctx so that others can reuse it. 2053 */ 2054 hatlockp = sfmmu_hat_enter(sfmmup); 2055 2056 sfmmu_invalidate_ctx(sfmmup); 2057 2058 /* 2059 * Free TSBs, but not tsbinfos, and set SWAPPED flag. 2060 * If TSBs were never swapped in, just return. 2061 * This implies that we don't support partial swapping 2062 * of TSBs -- either all are swapped out, or none are. 2063 * 2064 * We must hold the HAT lock here to prevent racing with another 2065 * thread trying to unmap TTEs from the TSB or running the post- 2066 * relocator after relocating the TSB's memory. Unfortunately, we 2067 * can't free memory while holding the HAT lock or we could 2068 * deadlock, so we build a list of TSBs to be freed after marking 2069 * the tsbinfos as swapped out and free them after dropping the 2070 * lock. 2071 */ 2072 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) { 2073 sfmmu_hat_exit(hatlockp); 2074 return; 2075 } 2076 2077 SFMMU_FLAGS_SET(sfmmup, HAT_SWAPPED); 2078 last = freelist = NULL; 2079 for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL; 2080 tsbinfop = tsbinfop->tsb_next) { 2081 ASSERT((tsbinfop->tsb_flags & TSB_SWAPPED) == 0); 2082 2083 /* 2084 * Cast the TSB into a struct free_tsb and put it on the free 2085 * list. 2086 */ 2087 if (freelist == NULL) { 2088 last = freelist = (struct free_tsb *)tsbinfop->tsb_va; 2089 } else { 2090 last->next = (struct free_tsb *)tsbinfop->tsb_va; 2091 last = last->next; 2092 } 2093 last->next = NULL; 2094 last->tsbinfop = tsbinfop; 2095 tsbinfop->tsb_flags |= TSB_SWAPPED; 2096 /* 2097 * Zero out the TTE to clear the valid bit. 2098 * Note we can't use a value like 0xbad because we want to 2099 * ensure diagnostic bits are NEVER set on TTEs that might 2100 * be loaded. The intent is to catch any invalid access 2101 * to the swapped TSB, such as a thread running with a valid 2102 * context without first calling sfmmu_tsb_swapin() to 2103 * allocate TSB memory. 2104 */ 2105 tsbinfop->tsb_tte.ll = 0; 2106 } 2107 2108 /* Now we can drop the lock and free the TSB memory. 
*/ 2109 sfmmu_hat_exit(hatlockp); 2110 for (; freelist != NULL; freelist = next) { 2111 next = freelist->next; 2112 sfmmu_tsb_free(freelist->tsbinfop); 2113 } 2114 } 2115 2116 /* 2117 * Duplicate the translations of an as into another newas 2118 */ 2119 /* ARGSUSED */ 2120 int 2121 hat_dup(struct hat *hat, struct hat *newhat, caddr_t addr, size_t len, 2122 uint_t flag) 2123 { 2124 sf_srd_t *srdp; 2125 sf_scd_t *scdp; 2126 int i; 2127 extern uint_t get_color_start(struct as *); 2128 2129 ASSERT((flag == 0) || (flag == HAT_DUP_ALL) || (flag == HAT_DUP_COW) || 2130 (flag == HAT_DUP_SRD)); 2131 ASSERT(hat != ksfmmup); 2132 ASSERT(newhat != ksfmmup); 2133 ASSERT(flag != HAT_DUP_ALL || hat->sfmmu_srdp == newhat->sfmmu_srdp); 2134 2135 if (flag == HAT_DUP_COW) { 2136 panic("hat_dup: HAT_DUP_COW not supported"); 2137 } 2138 2139 if (flag == HAT_DUP_SRD && ((srdp = hat->sfmmu_srdp) != NULL)) { 2140 ASSERT(srdp->srd_evp != NULL); 2141 VN_HOLD(srdp->srd_evp); 2142 ASSERT(srdp->srd_refcnt > 0); 2143 newhat->sfmmu_srdp = srdp; 2144 atomic_inc_32((volatile uint_t *)&srdp->srd_refcnt); 2145 } 2146 2147 /* 2148 * HAT_DUP_ALL flag is used after as duplication is done. 2149 */ 2150 if (flag == HAT_DUP_ALL && ((srdp = newhat->sfmmu_srdp) != NULL)) { 2151 ASSERT(newhat->sfmmu_srdp->srd_refcnt >= 2); 2152 newhat->sfmmu_rtteflags = hat->sfmmu_rtteflags; 2153 if (hat->sfmmu_flags & HAT_4MTEXT_FLAG) { 2154 newhat->sfmmu_flags |= HAT_4MTEXT_FLAG; 2155 } 2156 2157 /* check if need to join scd */ 2158 if ((scdp = hat->sfmmu_scdp) != NULL && 2159 newhat->sfmmu_scdp != scdp) { 2160 int ret; 2161 SF_RGNMAP_IS_SUBSET(&newhat->sfmmu_region_map, 2162 &scdp->scd_region_map, ret); 2163 ASSERT(ret); 2164 sfmmu_join_scd(scdp, newhat); 2165 ASSERT(newhat->sfmmu_scdp == scdp && 2166 scdp->scd_refcnt >= 2); 2167 for (i = 0; i < max_mmu_page_sizes; i++) { 2168 newhat->sfmmu_ismttecnt[i] = 2169 hat->sfmmu_ismttecnt[i]; 2170 newhat->sfmmu_scdismttecnt[i] = 2171 hat->sfmmu_scdismttecnt[i]; 2172 } 2173 } 2174 2175 sfmmu_check_page_sizes(newhat, 1); 2176 } 2177 2178 if (flag == HAT_DUP_ALL && consistent_coloring == 0 && 2179 update_proc_pgcolorbase_after_fork != 0) { 2180 hat->sfmmu_clrbin = get_color_start(hat->sfmmu_as); 2181 } 2182 return (0); 2183 } 2184 2185 void 2186 hat_memload(struct hat *hat, caddr_t addr, struct page *pp, 2187 uint_t attr, uint_t flags) 2188 { 2189 hat_do_memload(hat, addr, pp, attr, flags, 2190 SFMMU_INVALID_SHMERID); 2191 } 2192 2193 void 2194 hat_memload_region(struct hat *hat, caddr_t addr, struct page *pp, 2195 uint_t attr, uint_t flags, hat_region_cookie_t rcookie) 2196 { 2197 uint_t rid; 2198 if (rcookie == HAT_INVALID_REGION_COOKIE) { 2199 hat_do_memload(hat, addr, pp, attr, flags, 2200 SFMMU_INVALID_SHMERID); 2201 return; 2202 } 2203 rid = (uint_t)((uint64_t)rcookie); 2204 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 2205 hat_do_memload(hat, addr, pp, attr, flags, rid); 2206 } 2207 2208 /* 2209 * Set up addr to map to page pp with protection prot. 2210 * As an optimization we also load the TSB with the 2211 * corresponding tte but it is no big deal if the tte gets kicked out. 
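 *
 * A minimal caller-side sketch of the hat_memload() wrapper above
 * (assuming the caller already holds the page lock on pp and, for a user
 * hat, the address space lock):
 *
 *	hat_memload(seg->s_as->a_hat, addr, pp,
 *	    PROT_READ | PROT_WRITE, HAT_LOAD);
 *
 * The ASSERTs below spell out those obligations: pp must be locked, addr
 * must be 8K aligned, and only SFMMU_LOAD_ALLFLAG / SFMMU_LOAD_ALLATTR
 * bits may be passed.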
2212 */ 2213 static void 2214 hat_do_memload(struct hat *hat, caddr_t addr, struct page *pp, 2215 uint_t attr, uint_t flags, uint_t rid) 2216 { 2217 tte_t tte; 2218 2219 2220 ASSERT(hat != NULL); 2221 ASSERT(PAGE_LOCKED(pp)); 2222 ASSERT(!((uintptr_t)addr & MMU_PAGEOFFSET)); 2223 ASSERT(!(flags & ~SFMMU_LOAD_ALLFLAG)); 2224 ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR)); 2225 SFMMU_VALIDATE_HMERID(hat, rid, addr, MMU_PAGESIZE); 2226 2227 if (PP_ISFREE(pp)) { 2228 panic("hat_memload: loading a mapping to free page %p", 2229 (void *)pp); 2230 } 2231 2232 ASSERT((hat == ksfmmup) || AS_LOCK_HELD(hat->sfmmu_as)); 2233 2234 if (flags & ~SFMMU_LOAD_ALLFLAG) 2235 cmn_err(CE_NOTE, "hat_memload: unsupported flags %d", 2236 flags & ~SFMMU_LOAD_ALLFLAG); 2237 2238 if (hat->sfmmu_rmstat) 2239 hat_resvstat(MMU_PAGESIZE, hat->sfmmu_as, addr); 2240 2241 #if defined(SF_ERRATA_57) 2242 if ((hat != ksfmmup) && AS_TYPE_64BIT(hat->sfmmu_as) && 2243 (addr < errata57_limit) && (attr & PROT_EXEC) && 2244 !(flags & HAT_LOAD_SHARE)) { 2245 cmn_err(CE_WARN, "hat_memload: illegal attempt to make user " 2246 " page executable"); 2247 attr &= ~PROT_EXEC; 2248 } 2249 #endif 2250 2251 sfmmu_memtte(&tte, pp->p_pagenum, attr, TTE8K); 2252 (void) sfmmu_tteload_array(hat, &tte, addr, &pp, flags, rid); 2253 2254 /* 2255 * Check TSB and TLB page sizes. 2256 */ 2257 if ((flags & HAT_LOAD_SHARE) == 0) { 2258 sfmmu_check_page_sizes(hat, 1); 2259 } 2260 } 2261 2262 /* 2263 * hat_devload can be called to map real memory (e.g. 2264 * /dev/kmem) and even though hat_devload will determine pf is 2265 * for memory, it will be unable to get a shared lock on the 2266 * page (because someone else has it exclusively) and will 2267 * pass dp = NULL. If tteload doesn't get a non-NULL 2268 * page pointer it can't cache memory. 
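 *
 * A rough caller-side sketch for the common device-register case (the
 * pfn would normally come from the nexus register translation; it is
 * shown here only to illustrate the flag and attribute usage):
 *
 *	hat_devload(kas.a_hat, kva, MMU_PAGESIZE, pfn,
 *	    PROT_READ | PROT_WRITE | HAT_STRICTORDER,
 *	    HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
 *
 * For a non-memory pfn the code below forces the mapping uncacheable and,
 * for HAT_STRICTORDER/HAT_UNORDERED_OK, also sets the side-effect bit.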
2269 */ 2270 void 2271 hat_devload(struct hat *hat, caddr_t addr, size_t len, pfn_t pfn, 2272 uint_t attr, int flags) 2273 { 2274 tte_t tte; 2275 struct page *pp = NULL; 2276 int use_lgpg = 0; 2277 2278 ASSERT(hat != NULL); 2279 2280 ASSERT(!(flags & ~SFMMU_LOAD_ALLFLAG)); 2281 ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR)); 2282 ASSERT((hat == ksfmmup) || AS_LOCK_HELD(hat->sfmmu_as)); 2283 if (len == 0) 2284 panic("hat_devload: zero len"); 2285 if (flags & ~SFMMU_LOAD_ALLFLAG) 2286 cmn_err(CE_NOTE, "hat_devload: unsupported flags %d", 2287 flags & ~SFMMU_LOAD_ALLFLAG); 2288 2289 #if defined(SF_ERRATA_57) 2290 if ((hat != ksfmmup) && AS_TYPE_64BIT(hat->sfmmu_as) && 2291 (addr < errata57_limit) && (attr & PROT_EXEC) && 2292 !(flags & HAT_LOAD_SHARE)) { 2293 cmn_err(CE_WARN, "hat_devload: illegal attempt to make user " 2294 " page executable"); 2295 attr &= ~PROT_EXEC; 2296 } 2297 #endif 2298 2299 /* 2300 * If it's a memory page find its pp 2301 */ 2302 if (!(flags & HAT_LOAD_NOCONSIST) && pf_is_memory(pfn)) { 2303 pp = page_numtopp_nolock(pfn); 2304 if (pp == NULL) { 2305 flags |= HAT_LOAD_NOCONSIST; 2306 } else { 2307 if (PP_ISFREE(pp)) { 2308 panic("hat_memload: loading " 2309 "a mapping to free page %p", 2310 (void *)pp); 2311 } 2312 if (!PAGE_LOCKED(pp) && !PP_ISNORELOC(pp)) { 2313 panic("hat_memload: loading a mapping " 2314 "to unlocked relocatable page %p", 2315 (void *)pp); 2316 } 2317 ASSERT(len == MMU_PAGESIZE); 2318 } 2319 } 2320 2321 if (hat->sfmmu_rmstat) 2322 hat_resvstat(len, hat->sfmmu_as, addr); 2323 2324 if (flags & HAT_LOAD_NOCONSIST) { 2325 attr |= SFMMU_UNCACHEVTTE; 2326 use_lgpg = 1; 2327 } 2328 if (!pf_is_memory(pfn)) { 2329 attr |= SFMMU_UNCACHEPTTE | HAT_NOSYNC; 2330 use_lgpg = 1; 2331 switch (attr & HAT_ORDER_MASK) { 2332 case HAT_STRICTORDER: 2333 case HAT_UNORDERED_OK: 2334 /* 2335 * we set the side effect bit for all non 2336 * memory mappings unless merging is ok 2337 */ 2338 attr |= SFMMU_SIDEFFECT; 2339 break; 2340 case HAT_MERGING_OK: 2341 case HAT_LOADCACHING_OK: 2342 case HAT_STORECACHING_OK: 2343 break; 2344 default: 2345 panic("hat_devload: bad attr"); 2346 break; 2347 } 2348 } 2349 while (len) { 2350 if (!use_lgpg) { 2351 sfmmu_memtte(&tte, pfn, attr, TTE8K); 2352 (void) sfmmu_tteload_array(hat, &tte, addr, &pp, 2353 flags, SFMMU_INVALID_SHMERID); 2354 len -= MMU_PAGESIZE; 2355 addr += MMU_PAGESIZE; 2356 pfn++; 2357 continue; 2358 } 2359 /* 2360 * try to use large pages, check va/pa alignments 2361 * Note that 32M/256M page sizes are not (yet) supported. 
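 *
 * For example, assuming all of these sizes are enabled: a 6M request
 * whose va and pa are both 4M aligned is loaded as one 4M tte followed
 * by four 512K ttes; a request that is only 64K aligned falls through to
 * 64K ttes; anything left over, or unaligned, is loaded 8K at a time.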
2362 */ 2363 if ((len >= MMU_PAGESIZE4M) && 2364 !((uintptr_t)addr & MMU_PAGEOFFSET4M) && 2365 !(disable_large_pages & (1 << TTE4M)) && 2366 !(mmu_ptob(pfn) & MMU_PAGEOFFSET4M)) { 2367 sfmmu_memtte(&tte, pfn, attr, TTE4M); 2368 (void) sfmmu_tteload_array(hat, &tte, addr, &pp, 2369 flags, SFMMU_INVALID_SHMERID); 2370 len -= MMU_PAGESIZE4M; 2371 addr += MMU_PAGESIZE4M; 2372 pfn += MMU_PAGESIZE4M / MMU_PAGESIZE; 2373 } else if ((len >= MMU_PAGESIZE512K) && 2374 !((uintptr_t)addr & MMU_PAGEOFFSET512K) && 2375 !(disable_large_pages & (1 << TTE512K)) && 2376 !(mmu_ptob(pfn) & MMU_PAGEOFFSET512K)) { 2377 sfmmu_memtte(&tte, pfn, attr, TTE512K); 2378 (void) sfmmu_tteload_array(hat, &tte, addr, &pp, 2379 flags, SFMMU_INVALID_SHMERID); 2380 len -= MMU_PAGESIZE512K; 2381 addr += MMU_PAGESIZE512K; 2382 pfn += MMU_PAGESIZE512K / MMU_PAGESIZE; 2383 } else if ((len >= MMU_PAGESIZE64K) && 2384 !((uintptr_t)addr & MMU_PAGEOFFSET64K) && 2385 !(disable_large_pages & (1 << TTE64K)) && 2386 !(mmu_ptob(pfn) & MMU_PAGEOFFSET64K)) { 2387 sfmmu_memtte(&tte, pfn, attr, TTE64K); 2388 (void) sfmmu_tteload_array(hat, &tte, addr, &pp, 2389 flags, SFMMU_INVALID_SHMERID); 2390 len -= MMU_PAGESIZE64K; 2391 addr += MMU_PAGESIZE64K; 2392 pfn += MMU_PAGESIZE64K / MMU_PAGESIZE; 2393 } else { 2394 sfmmu_memtte(&tte, pfn, attr, TTE8K); 2395 (void) sfmmu_tteload_array(hat, &tte, addr, &pp, 2396 flags, SFMMU_INVALID_SHMERID); 2397 len -= MMU_PAGESIZE; 2398 addr += MMU_PAGESIZE; 2399 pfn++; 2400 } 2401 } 2402 2403 /* 2404 * Check TSB and TLB page sizes. 2405 */ 2406 if ((flags & HAT_LOAD_SHARE) == 0) { 2407 sfmmu_check_page_sizes(hat, 1); 2408 } 2409 } 2410 2411 void 2412 hat_memload_array(struct hat *hat, caddr_t addr, size_t len, 2413 struct page **pps, uint_t attr, uint_t flags) 2414 { 2415 hat_do_memload_array(hat, addr, len, pps, attr, flags, 2416 SFMMU_INVALID_SHMERID); 2417 } 2418 2419 void 2420 hat_memload_array_region(struct hat *hat, caddr_t addr, size_t len, 2421 struct page **pps, uint_t attr, uint_t flags, 2422 hat_region_cookie_t rcookie) 2423 { 2424 uint_t rid; 2425 if (rcookie == HAT_INVALID_REGION_COOKIE) { 2426 hat_do_memload_array(hat, addr, len, pps, attr, flags, 2427 SFMMU_INVALID_SHMERID); 2428 return; 2429 } 2430 rid = (uint_t)((uint64_t)rcookie); 2431 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 2432 hat_do_memload_array(hat, addr, len, pps, attr, flags, rid); 2433 } 2434 2435 /* 2436 * Map the largest extend possible out of the page array. The array may NOT 2437 * be in order. The largest possible mapping a page can have 2438 * is specified in the p_szc field. The p_szc field 2439 * cannot change as long as there any mappings (large or small) 2440 * to any of the pages that make up the large page. (ie. any 2441 * promotion/demotion of page size is not up to the hat but up to 2442 * the page free list manager). The array 2443 * should consist of properly aligned contigous pages that are 2444 * part of a big page for a large mapping to be created. 
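 *
 * For example, a properly aligned array of 64 8K pages whose root page
 * has p_szc >= TTE512K (and which really is physically contiguous) is
 * loaded with a single 512K tte. If the contiguity check in
 * sfmmu_pagearray_setup() fails, or that page size is disabled, the same
 * range is instead batched into 8K hmeblks, up to NHMENTS pages at a
 * time, by sfmmu_memload_batchsmall().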
2445 */ 2446 static void 2447 hat_do_memload_array(struct hat *hat, caddr_t addr, size_t len, 2448 struct page **pps, uint_t attr, uint_t flags, uint_t rid) 2449 { 2450 int ttesz; 2451 size_t mapsz; 2452 pgcnt_t numpg, npgs; 2453 tte_t tte; 2454 page_t *pp; 2455 uint_t large_pages_disable; 2456 2457 ASSERT(!((uintptr_t)addr & MMU_PAGEOFFSET)); 2458 SFMMU_VALIDATE_HMERID(hat, rid, addr, len); 2459 2460 if (hat->sfmmu_rmstat) 2461 hat_resvstat(len, hat->sfmmu_as, addr); 2462 2463 #if defined(SF_ERRATA_57) 2464 if ((hat != ksfmmup) && AS_TYPE_64BIT(hat->sfmmu_as) && 2465 (addr < errata57_limit) && (attr & PROT_EXEC) && 2466 !(flags & HAT_LOAD_SHARE)) { 2467 cmn_err(CE_WARN, "hat_memload_array: illegal attempt to make " 2468 "user page executable"); 2469 attr &= ~PROT_EXEC; 2470 } 2471 #endif 2472 2473 /* Get number of pages */ 2474 npgs = len >> MMU_PAGESHIFT; 2475 2476 if (flags & HAT_LOAD_SHARE) { 2477 large_pages_disable = disable_ism_large_pages; 2478 } else { 2479 large_pages_disable = disable_large_pages; 2480 } 2481 2482 if (npgs < NHMENTS || large_pages_disable == LARGE_PAGES_OFF) { 2483 sfmmu_memload_batchsmall(hat, addr, pps, attr, flags, npgs, 2484 rid); 2485 return; 2486 } 2487 2488 while (npgs >= NHMENTS) { 2489 pp = *pps; 2490 for (ttesz = pp->p_szc; ttesz != TTE8K; ttesz--) { 2491 /* 2492 * Check if this page size is disabled. 2493 */ 2494 if (large_pages_disable & (1 << ttesz)) 2495 continue; 2496 2497 numpg = TTEPAGES(ttesz); 2498 mapsz = numpg << MMU_PAGESHIFT; 2499 if ((npgs >= numpg) && 2500 IS_P2ALIGNED(addr, mapsz) && 2501 IS_P2ALIGNED(pp->p_pagenum, numpg)) { 2502 /* 2503 * At this point we have enough pages and 2504 * we know the virtual address and the pfn 2505 * are properly aligned. We still need 2506 * to check for physical contiguity but since 2507 * it is very likely that this is the case 2508 * we will assume they are so and undo 2509 * the request if necessary. It would 2510 * be great if we could get a hint flag 2511 * like HAT_CONTIG which would tell us 2512 * the pages are contigous for sure. 2513 */ 2514 sfmmu_memtte(&tte, (*pps)->p_pagenum, 2515 attr, ttesz); 2516 if (!sfmmu_tteload_array(hat, &tte, addr, 2517 pps, flags, rid)) { 2518 break; 2519 } 2520 } 2521 } 2522 if (ttesz == TTE8K) { 2523 /* 2524 * We were not able to map array using a large page 2525 * batch a hmeblk or fraction at a time. 2526 */ 2527 numpg = ((uintptr_t)addr >> MMU_PAGESHIFT) 2528 & (NHMENTS-1); 2529 numpg = NHMENTS - numpg; 2530 ASSERT(numpg <= npgs); 2531 mapsz = numpg * MMU_PAGESIZE; 2532 sfmmu_memload_batchsmall(hat, addr, pps, attr, flags, 2533 numpg, rid); 2534 } 2535 addr += mapsz; 2536 npgs -= numpg; 2537 pps += numpg; 2538 } 2539 2540 if (npgs) { 2541 sfmmu_memload_batchsmall(hat, addr, pps, attr, flags, npgs, 2542 rid); 2543 } 2544 2545 /* 2546 * Check TSB and TLB page sizes. 2547 */ 2548 if ((flags & HAT_LOAD_SHARE) == 0) { 2549 sfmmu_check_page_sizes(hat, 1); 2550 } 2551 } 2552 2553 /* 2554 * Function tries to batch 8K pages into the same hme blk. 2555 */ 2556 static void 2557 sfmmu_memload_batchsmall(struct hat *hat, caddr_t vaddr, page_t **pps, 2558 uint_t attr, uint_t flags, pgcnt_t npgs, uint_t rid) 2559 { 2560 tte_t tte; 2561 page_t *pp; 2562 struct hmehash_bucket *hmebp; 2563 struct hme_blk *hmeblkp; 2564 int index; 2565 2566 while (npgs) { 2567 /* 2568 * Acquire the hash bucket. 2569 */ 2570 hmebp = sfmmu_tteload_acquire_hashbucket(hat, vaddr, TTE8K, 2571 rid); 2572 ASSERT(hmebp); 2573 2574 /* 2575 * Find the hment block. 
2576 */ 2577 hmeblkp = sfmmu_tteload_find_hmeblk(hat, hmebp, vaddr, 2578 TTE8K, flags, rid); 2579 ASSERT(hmeblkp); 2580 2581 do { 2582 /* 2583 * Make the tte. 2584 */ 2585 pp = *pps; 2586 sfmmu_memtte(&tte, pp->p_pagenum, attr, TTE8K); 2587 2588 /* 2589 * Add the translation. 2590 */ 2591 (void) sfmmu_tteload_addentry(hat, hmeblkp, &tte, 2592 vaddr, pps, flags, rid); 2593 2594 /* 2595 * Goto next page. 2596 */ 2597 pps++; 2598 npgs--; 2599 2600 /* 2601 * Goto next address. 2602 */ 2603 vaddr += MMU_PAGESIZE; 2604 2605 /* 2606 * Don't crossover into a different hmentblk. 2607 */ 2608 index = (int)(((uintptr_t)vaddr >> MMU_PAGESHIFT) & 2609 (NHMENTS-1)); 2610 2611 } while (index != 0 && npgs != 0); 2612 2613 /* 2614 * Release the hash bucket. 2615 */ 2616 2617 sfmmu_tteload_release_hashbucket(hmebp); 2618 } 2619 } 2620 2621 /* 2622 * Construct a tte for a page: 2623 * 2624 * tte_valid = 1 2625 * tte_size2 = size & TTE_SZ2_BITS (Panther and Olympus-C only) 2626 * tte_size = size 2627 * tte_nfo = attr & HAT_NOFAULT 2628 * tte_ie = attr & HAT_STRUCTURE_LE 2629 * tte_hmenum = hmenum 2630 * tte_pahi = pp->p_pagenum >> TTE_PASHIFT; 2631 * tte_palo = pp->p_pagenum & TTE_PALOMASK; 2632 * tte_ref = 1 (optimization) 2633 * tte_wr_perm = attr & PROT_WRITE; 2634 * tte_no_sync = attr & HAT_NOSYNC 2635 * tte_lock = attr & SFMMU_LOCKTTE 2636 * tte_cp = !(attr & SFMMU_UNCACHEPTTE) 2637 * tte_cv = !(attr & SFMMU_UNCACHEVTTE) 2638 * tte_e = attr & SFMMU_SIDEFFECT 2639 * tte_priv = !(attr & PROT_USER) 2640 * tte_hwwr = if nosync is set and it is writable we set the mod bit (opt) 2641 * tte_glb = 0 2642 */ 2643 void 2644 sfmmu_memtte(tte_t *ttep, pfn_t pfn, uint_t attr, int tte_sz) 2645 { 2646 ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR)); 2647 2648 ttep->tte_inthi = MAKE_TTE_INTHI(pfn, attr, tte_sz, 0 /* hmenum */); 2649 ttep->tte_intlo = MAKE_TTE_INTLO(pfn, attr, tte_sz, 0 /* hmenum */); 2650 2651 if (TTE_IS_NOSYNC(ttep)) { 2652 TTE_SET_REF(ttep); 2653 if (TTE_IS_WRITABLE(ttep)) { 2654 TTE_SET_MOD(ttep); 2655 } 2656 } 2657 if (TTE_IS_NFO(ttep) && TTE_IS_EXECUTABLE(ttep)) { 2658 panic("sfmmu_memtte: can't set both NFO and EXEC bits"); 2659 } 2660 } 2661 2662 /* 2663 * This function will add a translation to the hme_blk and allocate the 2664 * hme_blk if one does not exist. 2665 * If a page structure is specified then it will add the 2666 * corresponding hment to the mapping list. 2667 * It will also update the hmenum field for the tte. 2668 * 2669 * Currently this function is only used for kernel mappings. 2670 * So pass invalid region to sfmmu_tteload_array(). 2671 */ 2672 void 2673 sfmmu_tteload(struct hat *sfmmup, tte_t *ttep, caddr_t vaddr, page_t *pp, 2674 uint_t flags) 2675 { 2676 ASSERT(sfmmup == ksfmmup); 2677 (void) sfmmu_tteload_array(sfmmup, ttep, vaddr, &pp, flags, 2678 SFMMU_INVALID_SHMERID); 2679 } 2680 2681 /* 2682 * Load (ttep != NULL) or unload (ttep == NULL) one entry in the TSB. 2683 * Assumes that a particular page size may only be resident in one TSB. 2684 */ 2685 static void 2686 sfmmu_mod_tsb(sfmmu_t *sfmmup, caddr_t vaddr, tte_t *ttep, int ttesz) 2687 { 2688 struct tsb_info *tsbinfop = NULL; 2689 uint64_t tag; 2690 struct tsbe *tsbe_addr; 2691 uint64_t tsb_base; 2692 uint_t tsb_size; 2693 int vpshift = MMU_PAGESHIFT; 2694 int phys = 0; 2695 2696 if (sfmmup == ksfmmup) { /* No support for 32/256M ksfmmu pages */ 2697 phys = ktsb_phys; 2698 if (ttesz >= TTE4M) { 2699 #ifndef sun4v 2700 ASSERT((ttesz != TTE32M) && (ttesz != TTE256M)); 2701 #endif 2702 tsb_base = (phys)? 
ktsb4m_pbase : (uint64_t)ktsb4m_base; 2703 tsb_size = ktsb4m_szcode; 2704 } else { 2705 tsb_base = (phys)? ktsb_pbase : (uint64_t)ktsb_base; 2706 tsb_size = ktsb_szcode; 2707 } 2708 } else { 2709 SFMMU_GET_TSBINFO(tsbinfop, sfmmup, ttesz); 2710 2711 /* 2712 * If there isn't a TSB for this page size, or the TSB is 2713 * swapped out, there is nothing to do. Note that the latter 2714 * case seems impossible but can occur if hat_pageunload() 2715 * is called on an ISM mapping while the process is swapped 2716 * out. 2717 */ 2718 if (tsbinfop == NULL || (tsbinfop->tsb_flags & TSB_SWAPPED)) 2719 return; 2720 2721 /* 2722 * If another thread is in the middle of relocating a TSB 2723 * we can't unload the entry so set a flag so that the 2724 * TSB will be flushed before it can be accessed by the 2725 * process. 2726 */ 2727 if ((tsbinfop->tsb_flags & TSB_RELOC_FLAG) != 0) { 2728 if (ttep == NULL) 2729 tsbinfop->tsb_flags |= TSB_FLUSH_NEEDED; 2730 return; 2731 } 2732 #if defined(UTSB_PHYS) 2733 phys = 1; 2734 tsb_base = (uint64_t)tsbinfop->tsb_pa; 2735 #else 2736 tsb_base = (uint64_t)tsbinfop->tsb_va; 2737 #endif 2738 tsb_size = tsbinfop->tsb_szc; 2739 } 2740 if (ttesz >= TTE4M) 2741 vpshift = MMU_PAGESHIFT4M; 2742 2743 tsbe_addr = sfmmu_get_tsbe(tsb_base, vaddr, vpshift, tsb_size); 2744 tag = sfmmu_make_tsbtag(vaddr); 2745 2746 if (ttep == NULL) { 2747 sfmmu_unload_tsbe(tsbe_addr, tag, phys); 2748 } else { 2749 if (ttesz >= TTE4M) { 2750 SFMMU_STAT(sf_tsb_load4m); 2751 } else { 2752 SFMMU_STAT(sf_tsb_load8k); 2753 } 2754 2755 sfmmu_load_tsbe(tsbe_addr, tag, ttep, phys); 2756 } 2757 } 2758 2759 /* 2760 * Unmap all entries from [start, end) matching the given page size. 2761 * 2762 * This function is used primarily to unmap replicated 64K or 512K entries 2763 * from the TSB that are inserted using the base page size TSB pointer, but 2764 * it may also be called to unmap a range of addresses from the TSB. 2765 */ 2766 void 2767 sfmmu_unload_tsb_range(sfmmu_t *sfmmup, caddr_t start, caddr_t end, int ttesz) 2768 { 2769 struct tsb_info *tsbinfop; 2770 uint64_t tag; 2771 struct tsbe *tsbe_addr; 2772 caddr_t vaddr; 2773 uint64_t tsb_base; 2774 int vpshift, vpgsz; 2775 uint_t tsb_size; 2776 int phys = 0; 2777 2778 /* 2779 * Assumptions: 2780 * If ttesz == 8K, 64K or 512K, we walk through the range 8K 2781 * at a time shooting down any valid entries we encounter. 2782 * 2783 * If ttesz >= 4M we walk the range 4M at a time shooting 2784 * down any valid mappings we find. 2785 */ 2786 if (sfmmup == ksfmmup) { 2787 phys = ktsb_phys; 2788 if (ttesz >= TTE4M) { 2789 #ifndef sun4v 2790 ASSERT((ttesz != TTE32M) && (ttesz != TTE256M)); 2791 #endif 2792 tsb_base = (phys)? ktsb4m_pbase : (uint64_t)ktsb4m_base; 2793 tsb_size = ktsb4m_szcode; 2794 } else { 2795 tsb_base = (phys)? ktsb_pbase : (uint64_t)ktsb_base; 2796 tsb_size = ktsb_szcode; 2797 } 2798 } else { 2799 SFMMU_GET_TSBINFO(tsbinfop, sfmmup, ttesz); 2800 2801 /* 2802 * If there isn't a TSB for this page size, or the TSB is 2803 * swapped out, there is nothing to do. Note that the latter 2804 * case seems impossible but can occur if hat_pageunload() 2805 * is called on an ISM mapping while the process is swapped 2806 * out. 2807 */ 2808 if (tsbinfop == NULL || (tsbinfop->tsb_flags & TSB_SWAPPED)) 2809 return; 2810 2811 /* 2812 * If another thread is in the middle of relocating a TSB 2813 * we can't unload the entry so set a flag so that the 2814 * TSB will be flushed before it can be accessed by the 2815 * process. 
2816 */ 2817 if ((tsbinfop->tsb_flags & TSB_RELOC_FLAG) != 0) { 2818 tsbinfop->tsb_flags |= TSB_FLUSH_NEEDED; 2819 return; 2820 } 2821 #if defined(UTSB_PHYS) 2822 phys = 1; 2823 tsb_base = (uint64_t)tsbinfop->tsb_pa; 2824 #else 2825 tsb_base = (uint64_t)tsbinfop->tsb_va; 2826 #endif 2827 tsb_size = tsbinfop->tsb_szc; 2828 } 2829 if (ttesz >= TTE4M) { 2830 vpshift = MMU_PAGESHIFT4M; 2831 vpgsz = MMU_PAGESIZE4M; 2832 } else { 2833 vpshift = MMU_PAGESHIFT; 2834 vpgsz = MMU_PAGESIZE; 2835 } 2836 2837 for (vaddr = start; vaddr < end; vaddr += vpgsz) { 2838 tag = sfmmu_make_tsbtag(vaddr); 2839 tsbe_addr = sfmmu_get_tsbe(tsb_base, vaddr, vpshift, tsb_size); 2840 sfmmu_unload_tsbe(tsbe_addr, tag, phys); 2841 } 2842 } 2843 2844 /* 2845 * Select the optimum TSB size given the number of mappings 2846 * that need to be cached. 2847 */ 2848 static int 2849 sfmmu_select_tsb_szc(pgcnt_t pgcnt) 2850 { 2851 int szc = 0; 2852 2853 #ifdef DEBUG 2854 if (tsb_grow_stress) { 2855 uint32_t randval = (uint32_t)gettick() >> 4; 2856 return (randval % (tsb_max_growsize + 1)); 2857 } 2858 #endif /* DEBUG */ 2859 2860 while ((szc < tsb_max_growsize) && (pgcnt > SFMMU_RSS_TSBSIZE(szc))) 2861 szc++; 2862 return (szc); 2863 } 2864 2865 /* 2866 * This function will add a translation to the hme_blk and allocate the 2867 * hme_blk if one does not exist. 2868 * If a page structure is specified then it will add the 2869 * corresponding hment to the mapping list. 2870 * It will also update the hmenum field for the tte. 2871 * Furthermore, it attempts to create a large page translation 2872 * for <addr,hat> at page array pps. It assumes addr and first 2873 * pp is correctly aligned. It returns 0 if successful and 1 otherwise. 2874 */ 2875 static int 2876 sfmmu_tteload_array(sfmmu_t *sfmmup, tte_t *ttep, caddr_t vaddr, 2877 page_t **pps, uint_t flags, uint_t rid) 2878 { 2879 struct hmehash_bucket *hmebp; 2880 struct hme_blk *hmeblkp; 2881 int ret; 2882 uint_t size; 2883 2884 /* 2885 * Get mapping size. 2886 */ 2887 size = TTE_CSZ(ttep); 2888 ASSERT(!((uintptr_t)vaddr & TTE_PAGE_OFFSET(size))); 2889 2890 /* 2891 * Acquire the hash bucket. 2892 */ 2893 hmebp = sfmmu_tteload_acquire_hashbucket(sfmmup, vaddr, size, rid); 2894 ASSERT(hmebp); 2895 2896 /* 2897 * Find the hment block. 2898 */ 2899 hmeblkp = sfmmu_tteload_find_hmeblk(sfmmup, hmebp, vaddr, size, flags, 2900 rid); 2901 ASSERT(hmeblkp); 2902 2903 /* 2904 * Add the translation. 2905 */ 2906 ret = sfmmu_tteload_addentry(sfmmup, hmeblkp, ttep, vaddr, pps, flags, 2907 rid); 2908 2909 /* 2910 * Release the hash bucket. 2911 */ 2912 sfmmu_tteload_release_hashbucket(hmebp); 2913 2914 return (ret); 2915 } 2916 2917 /* 2918 * Function locks and returns a pointer to the hash bucket for vaddr and size. 2919 */ 2920 static struct hmehash_bucket * 2921 sfmmu_tteload_acquire_hashbucket(sfmmu_t *sfmmup, caddr_t vaddr, int size, 2922 uint_t rid) 2923 { 2924 struct hmehash_bucket *hmebp; 2925 int hmeshift; 2926 void *htagid = sfmmutohtagid(sfmmup, rid); 2927 2928 ASSERT(htagid != NULL); 2929 2930 hmeshift = HME_HASH_SHIFT(size); 2931 2932 hmebp = HME_HASH_FUNCTION(htagid, vaddr, hmeshift); 2933 2934 SFMMU_HASH_LOCK(hmebp); 2935 2936 return (hmebp); 2937 } 2938 2939 /* 2940 * Function returns a pointer to an hmeblk in the hash bucket, hmebp. If the 2941 * hmeblk doesn't exists for the [sfmmup, vaddr & size] signature, a hmeblk is 2942 * allocated. 
2943 */ 2944 static struct hme_blk * 2945 sfmmu_tteload_find_hmeblk(sfmmu_t *sfmmup, struct hmehash_bucket *hmebp, 2946 caddr_t vaddr, uint_t size, uint_t flags, uint_t rid) 2947 { 2948 hmeblk_tag hblktag; 2949 int hmeshift; 2950 struct hme_blk *hmeblkp, *pr_hblk, *list = NULL; 2951 2952 SFMMU_VALIDATE_HMERID(sfmmup, rid, vaddr, TTEBYTES(size)); 2953 2954 hblktag.htag_id = sfmmutohtagid(sfmmup, rid); 2955 ASSERT(hblktag.htag_id != NULL); 2956 hmeshift = HME_HASH_SHIFT(size); 2957 hblktag.htag_bspage = HME_HASH_BSPAGE(vaddr, hmeshift); 2958 hblktag.htag_rehash = HME_HASH_REHASH(size); 2959 hblktag.htag_rid = rid; 2960 2961 ttearray_realloc: 2962 2963 HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, &list); 2964 2965 /* 2966 * We block until hblk_reserve_lock is released; it's held by 2967 * the thread, temporarily using hblk_reserve, until hblk_reserve is 2968 * replaced by a hblk from sfmmu8_cache. 2969 */ 2970 if (hmeblkp == (struct hme_blk *)hblk_reserve && 2971 hblk_reserve_thread != curthread) { 2972 SFMMU_HASH_UNLOCK(hmebp); 2973 mutex_enter(&hblk_reserve_lock); 2974 mutex_exit(&hblk_reserve_lock); 2975 SFMMU_STAT(sf_hblk_reserve_hit); 2976 SFMMU_HASH_LOCK(hmebp); 2977 goto ttearray_realloc; 2978 } 2979 2980 if (hmeblkp == NULL) { 2981 hmeblkp = sfmmu_hblk_alloc(sfmmup, vaddr, hmebp, size, 2982 hblktag, flags, rid); 2983 ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || hmeblkp->hblk_shared); 2984 ASSERT(SFMMU_IS_SHMERID_VALID(rid) || !hmeblkp->hblk_shared); 2985 } else { 2986 /* 2987 * It is possible for 8k and 64k hblks to collide since they 2988 * have the same rehash value. This is because we 2989 * lazily free hblks and 8K/64K blks could be lingering. 2990 * If we find size mismatch we free the block and & try again. 2991 */ 2992 if (get_hblk_ttesz(hmeblkp) != size) { 2993 ASSERT(!hmeblkp->hblk_vcnt); 2994 ASSERT(!hmeblkp->hblk_hmecnt); 2995 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, 2996 &list, 0); 2997 goto ttearray_realloc; 2998 } 2999 if (hmeblkp->hblk_shw_bit) { 3000 /* 3001 * if the hblk was previously used as a shadow hblk then 3002 * we will change it to a normal hblk 3003 */ 3004 ASSERT(!hmeblkp->hblk_shared); 3005 if (hmeblkp->hblk_shw_mask) { 3006 sfmmu_shadow_hcleanup(sfmmup, hmeblkp, hmebp); 3007 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 3008 goto ttearray_realloc; 3009 } else { 3010 hmeblkp->hblk_shw_bit = 0; 3011 } 3012 } 3013 SFMMU_STAT(sf_hblk_hit); 3014 } 3015 3016 /* 3017 * hat_memload() should never call kmem_cache_free() for kernel hmeblks; 3018 * see block comment showing the stacktrace in sfmmu_hblk_alloc(); 3019 * set the flag parameter to 1 so that sfmmu_hblks_list_purge() will 3020 * just add these hmeblks to the per-cpu pending queue. 3021 */ 3022 sfmmu_hblks_list_purge(&list, 1); 3023 3024 ASSERT(get_hblk_ttesz(hmeblkp) == size); 3025 ASSERT(!hmeblkp->hblk_shw_bit); 3026 ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || hmeblkp->hblk_shared); 3027 ASSERT(SFMMU_IS_SHMERID_VALID(rid) || !hmeblkp->hblk_shared); 3028 ASSERT(hmeblkp->hblk_tag.htag_rid == rid); 3029 3030 return (hmeblkp); 3031 } 3032 3033 /* 3034 * Function adds a tte entry into the hmeblk. It returns 0 if successful and 1 3035 * otherwise. 
3036 */ 3037 static int 3038 sfmmu_tteload_addentry(sfmmu_t *sfmmup, struct hme_blk *hmeblkp, tte_t *ttep, 3039 caddr_t vaddr, page_t **pps, uint_t flags, uint_t rid) 3040 { 3041 page_t *pp = *pps; 3042 int hmenum, size, remap; 3043 tte_t tteold, flush_tte; 3044 #ifdef DEBUG 3045 tte_t orig_old; 3046 #endif /* DEBUG */ 3047 struct sf_hment *sfhme; 3048 kmutex_t *pml, *pmtx; 3049 hatlock_t *hatlockp; 3050 int myflt; 3051 3052 /* 3053 * remove this panic when we decide to let user virtual address 3054 * space be >= USERLIMIT. 3055 */ 3056 if (!TTE_IS_PRIVILEGED(ttep) && vaddr >= (caddr_t)USERLIMIT) 3057 panic("user addr %p in kernel space", (void *)vaddr); 3058 #if defined(TTE_IS_GLOBAL) 3059 if (TTE_IS_GLOBAL(ttep)) 3060 panic("sfmmu_tteload: creating global tte"); 3061 #endif 3062 3063 #ifdef DEBUG 3064 if (pf_is_memory(sfmmu_ttetopfn(ttep, vaddr)) && 3065 !TTE_IS_PCACHEABLE(ttep) && !sfmmu_allow_nc_trans) 3066 panic("sfmmu_tteload: non cacheable memory tte"); 3067 #endif /* DEBUG */ 3068 3069 /* don't simulate dirty bit for writeable ISM/DISM mappings */ 3070 if ((flags & HAT_LOAD_SHARE) && TTE_IS_WRITABLE(ttep)) { 3071 TTE_SET_REF(ttep); 3072 TTE_SET_MOD(ttep); 3073 } 3074 3075 if ((flags & HAT_LOAD_SHARE) || !TTE_IS_REF(ttep) || 3076 !TTE_IS_MOD(ttep)) { 3077 /* 3078 * Don't load TSB for dummy as in ISM. Also don't preload 3079 * the TSB if the TTE isn't writable since we're likely to 3080 * fault on it again -- preloading can be fairly expensive. 3081 */ 3082 flags |= SFMMU_NO_TSBLOAD; 3083 } 3084 3085 size = TTE_CSZ(ttep); 3086 switch (size) { 3087 case TTE8K: 3088 SFMMU_STAT(sf_tteload8k); 3089 break; 3090 case TTE64K: 3091 SFMMU_STAT(sf_tteload64k); 3092 break; 3093 case TTE512K: 3094 SFMMU_STAT(sf_tteload512k); 3095 break; 3096 case TTE4M: 3097 SFMMU_STAT(sf_tteload4m); 3098 break; 3099 case (TTE32M): 3100 SFMMU_STAT(sf_tteload32m); 3101 ASSERT(mmu_page_sizes == max_mmu_page_sizes); 3102 break; 3103 case (TTE256M): 3104 SFMMU_STAT(sf_tteload256m); 3105 ASSERT(mmu_page_sizes == max_mmu_page_sizes); 3106 break; 3107 } 3108 3109 ASSERT(!((uintptr_t)vaddr & TTE_PAGE_OFFSET(size))); 3110 SFMMU_VALIDATE_HMERID(sfmmup, rid, vaddr, TTEBYTES(size)); 3111 ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || hmeblkp->hblk_shared); 3112 ASSERT(SFMMU_IS_SHMERID_VALID(rid) || !hmeblkp->hblk_shared); 3113 3114 HBLKTOHME_IDX(sfhme, hmeblkp, vaddr, hmenum); 3115 3116 /* 3117 * Need to grab mlist lock here so that pageunload 3118 * will not change tte behind us. 3119 */ 3120 if (pp) { 3121 pml = sfmmu_mlist_enter(pp); 3122 } 3123 3124 sfmmu_copytte(&sfhme->hme_tte, &tteold); 3125 /* 3126 * Look for corresponding hment and if valid verify 3127 * pfns are equal. 
3128 */ 3129 remap = TTE_IS_VALID(&tteold); 3130 if (remap) { 3131 pfn_t new_pfn, old_pfn; 3132 3133 old_pfn = TTE_TO_PFN(vaddr, &tteold); 3134 new_pfn = TTE_TO_PFN(vaddr, ttep); 3135 3136 if (flags & HAT_LOAD_REMAP) { 3137 /* make sure we are remapping same type of pages */ 3138 if (pf_is_memory(old_pfn) != pf_is_memory(new_pfn)) { 3139 panic("sfmmu_tteload - tte remap io<->memory"); 3140 } 3141 if (old_pfn != new_pfn && 3142 (pp != NULL || sfhme->hme_page != NULL)) { 3143 panic("sfmmu_tteload - tte remap pp != NULL"); 3144 } 3145 } else if (old_pfn != new_pfn) { 3146 panic("sfmmu_tteload - tte remap, hmeblkp 0x%p", 3147 (void *)hmeblkp); 3148 } 3149 ASSERT(TTE_CSZ(&tteold) == TTE_CSZ(ttep)); 3150 } 3151 3152 if (pp) { 3153 if (size == TTE8K) { 3154 #ifdef VAC 3155 /* 3156 * Handle VAC consistency 3157 */ 3158 if (!remap && (cache & CACHE_VAC) && !PP_ISNC(pp)) { 3159 sfmmu_vac_conflict(sfmmup, vaddr, pp); 3160 } 3161 #endif 3162 3163 if (TTE_IS_WRITABLE(ttep) && PP_ISRO(pp)) { 3164 pmtx = sfmmu_page_enter(pp); 3165 PP_CLRRO(pp); 3166 sfmmu_page_exit(pmtx); 3167 } else if (!PP_ISMAPPED(pp) && 3168 (!TTE_IS_WRITABLE(ttep)) && !(PP_ISMOD(pp))) { 3169 pmtx = sfmmu_page_enter(pp); 3170 if (!(PP_ISMOD(pp))) { 3171 PP_SETRO(pp); 3172 } 3173 sfmmu_page_exit(pmtx); 3174 } 3175 3176 } else if (sfmmu_pagearray_setup(vaddr, pps, ttep, remap)) { 3177 /* 3178 * sfmmu_pagearray_setup failed so return 3179 */ 3180 sfmmu_mlist_exit(pml); 3181 return (1); 3182 } 3183 } 3184 3185 /* 3186 * Make sure hment is not on a mapping list. 3187 */ 3188 ASSERT(remap || (sfhme->hme_page == NULL)); 3189 3190 /* if it is not a remap then hme->next better be NULL */ 3191 ASSERT((!remap) ? sfhme->hme_next == NULL : 1); 3192 3193 if (flags & HAT_LOAD_LOCK) { 3194 if ((hmeblkp->hblk_lckcnt + 1) >= MAX_HBLK_LCKCNT) { 3195 panic("too high lckcnt-hmeblk %p", 3196 (void *)hmeblkp); 3197 } 3198 atomic_inc_32(&hmeblkp->hblk_lckcnt); 3199 3200 HBLK_STACK_TRACE(hmeblkp, HBLK_LOCK); 3201 } 3202 3203 #ifdef VAC 3204 if (pp && PP_ISNC(pp)) { 3205 /* 3206 * If the physical page is marked to be uncacheable, like 3207 * by a vac conflict, make sure the new mapping is also 3208 * uncacheable. 3209 */ 3210 TTE_CLR_VCACHEABLE(ttep); 3211 ASSERT(PP_GET_VCOLOR(pp) == NO_VCOLOR); 3212 } 3213 #endif 3214 ttep->tte_hmenum = hmenum; 3215 3216 #ifdef DEBUG 3217 orig_old = tteold; 3218 #endif /* DEBUG */ 3219 3220 while (sfmmu_modifytte_try(&tteold, ttep, &sfhme->hme_tte) < 0) { 3221 if ((sfmmup == KHATID) && 3222 (flags & (HAT_LOAD_LOCK | HAT_LOAD_REMAP))) { 3223 sfmmu_copytte(&sfhme->hme_tte, &tteold); 3224 } 3225 #ifdef DEBUG 3226 chk_tte(&orig_old, &tteold, ttep, hmeblkp); 3227 #endif /* DEBUG */ 3228 } 3229 ASSERT(TTE_IS_VALID(&sfhme->hme_tte)); 3230 3231 if (!TTE_IS_VALID(&tteold)) { 3232 3233 atomic_inc_16(&hmeblkp->hblk_vcnt); 3234 if (rid == SFMMU_INVALID_SHMERID) { 3235 atomic_inc_ulong(&sfmmup->sfmmu_ttecnt[size]); 3236 } else { 3237 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 3238 sf_region_t *rgnp = srdp->srd_hmergnp[rid]; 3239 /* 3240 * We already accounted for region ttecnt's in sfmmu 3241 * during hat_join_region() processing. Here we 3242 * only update ttecnt's in region struture. 
3243 */ 3244 atomic_inc_ulong(&rgnp->rgn_ttecnt[size]); 3245 } 3246 } 3247 3248 myflt = (astosfmmu(curthread->t_procp->p_as) == sfmmup); 3249 if (size > TTE8K && (flags & HAT_LOAD_SHARE) == 0 && 3250 sfmmup != ksfmmup) { 3251 uchar_t tteflag = 1 << size; 3252 if (rid == SFMMU_INVALID_SHMERID) { 3253 if (!(sfmmup->sfmmu_tteflags & tteflag)) { 3254 hatlockp = sfmmu_hat_enter(sfmmup); 3255 sfmmup->sfmmu_tteflags |= tteflag; 3256 sfmmu_hat_exit(hatlockp); 3257 } 3258 } else if (!(sfmmup->sfmmu_rtteflags & tteflag)) { 3259 hatlockp = sfmmu_hat_enter(sfmmup); 3260 sfmmup->sfmmu_rtteflags |= tteflag; 3261 sfmmu_hat_exit(hatlockp); 3262 } 3263 /* 3264 * Update the current CPU tsbmiss area, so the current thread 3265 * won't need to take the tsbmiss for the new pagesize. 3266 * The other threads in the process will update their tsb 3267 * miss area lazily in sfmmu_tsbmiss_exception() when they 3268 * fail to find the translation for a newly added pagesize. 3269 */ 3270 if (size > TTE64K && myflt) { 3271 struct tsbmiss *tsbmp; 3272 kpreempt_disable(); 3273 tsbmp = &tsbmiss_area[CPU->cpu_id]; 3274 if (rid == SFMMU_INVALID_SHMERID) { 3275 if (!(tsbmp->uhat_tteflags & tteflag)) { 3276 tsbmp->uhat_tteflags |= tteflag; 3277 } 3278 } else { 3279 if (!(tsbmp->uhat_rtteflags & tteflag)) { 3280 tsbmp->uhat_rtteflags |= tteflag; 3281 } 3282 } 3283 kpreempt_enable(); 3284 } 3285 } 3286 3287 if (size >= TTE4M && (flags & HAT_LOAD_TEXT) && 3288 !SFMMU_FLAGS_ISSET(sfmmup, HAT_4MTEXT_FLAG)) { 3289 hatlockp = sfmmu_hat_enter(sfmmup); 3290 SFMMU_FLAGS_SET(sfmmup, HAT_4MTEXT_FLAG); 3291 sfmmu_hat_exit(hatlockp); 3292 } 3293 3294 flush_tte.tte_intlo = (tteold.tte_intlo ^ ttep->tte_intlo) & 3295 hw_tte.tte_intlo; 3296 flush_tte.tte_inthi = (tteold.tte_inthi ^ ttep->tte_inthi) & 3297 hw_tte.tte_inthi; 3298 3299 if (remap && (flush_tte.tte_inthi || flush_tte.tte_intlo)) { 3300 /* 3301 * If remap and new tte differs from old tte we need 3302 * to sync the mod bit and flush TLB/TSB. We don't 3303 * need to sync ref bit because we currently always set 3304 * ref bit in tteload. 3305 */ 3306 ASSERT(TTE_IS_REF(ttep)); 3307 if (TTE_IS_MOD(&tteold)) { 3308 sfmmu_ttesync(sfmmup, vaddr, &tteold, pp); 3309 } 3310 /* 3311 * hwtte bits shouldn't change for SRD hmeblks as long as SRD 3312 * hmes are only used for read only text. Adding this code for 3313 * completeness and future use of shared hmeblks with writable 3314 * mappings of VMODSORT vnodes. 3315 */ 3316 if (hmeblkp->hblk_shared) { 3317 cpuset_t cpuset = sfmmu_rgntlb_demap(vaddr, 3318 sfmmup->sfmmu_srdp->srd_hmergnp[rid], hmeblkp, 1); 3319 xt_sync(cpuset); 3320 SFMMU_STAT_ADD(sf_region_remap_demap, 1); 3321 } else { 3322 sfmmu_tlb_demap(vaddr, sfmmup, hmeblkp, 0, 0); 3323 xt_sync(sfmmup->sfmmu_cpusran); 3324 } 3325 } 3326 3327 if ((flags & SFMMU_NO_TSBLOAD) == 0) { 3328 /* 3329 * We only preload 8K and 4M mappings into the TSB, since 3330 * 64K and 512K mappings are replicated and hence don't 3331 * have a single, unique TSB entry. Ditto for 32M/256M. 3332 */ 3333 if (size == TTE8K || size == TTE4M) { 3334 sf_scd_t *scdp; 3335 hatlockp = sfmmu_hat_enter(sfmmup); 3336 /* 3337 * Don't preload private TSB if the mapping is used 3338 * by the shctx in the SCD. 
3339 */ 3340 scdp = sfmmup->sfmmu_scdp; 3341 if (rid == SFMMU_INVALID_SHMERID || scdp == NULL || 3342 !SF_RGNMAP_TEST(scdp->scd_hmeregion_map, rid)) { 3343 sfmmu_load_tsb(sfmmup, vaddr, &sfhme->hme_tte, 3344 size); 3345 } 3346 sfmmu_hat_exit(hatlockp); 3347 } 3348 } 3349 if (pp) { 3350 if (!remap) { 3351 HME_ADD(sfhme, pp); 3352 atomic_inc_16(&hmeblkp->hblk_hmecnt); 3353 ASSERT(hmeblkp->hblk_hmecnt > 0); 3354 3355 /* 3356 * Cannot ASSERT(hmeblkp->hblk_hmecnt <= NHMENTS) 3357 * see pageunload() for comment. 3358 */ 3359 } 3360 sfmmu_mlist_exit(pml); 3361 } 3362 3363 return (0); 3364 } 3365 /* 3366 * Function unlocks hash bucket. 3367 */ 3368 static void 3369 sfmmu_tteload_release_hashbucket(struct hmehash_bucket *hmebp) 3370 { 3371 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 3372 SFMMU_HASH_UNLOCK(hmebp); 3373 } 3374 3375 /* 3376 * function which checks and sets up page array for a large 3377 * translation. Will set p_vcolor, p_index, p_ro fields. 3378 * Assumes addr and pfnum of first page are properly aligned. 3379 * Will check for physical contiguity. If check fails it return 3380 * non null. 3381 */ 3382 static int 3383 sfmmu_pagearray_setup(caddr_t addr, page_t **pps, tte_t *ttep, int remap) 3384 { 3385 int i, index, ttesz; 3386 pfn_t pfnum; 3387 pgcnt_t npgs; 3388 page_t *pp, *pp1; 3389 kmutex_t *pmtx; 3390 #ifdef VAC 3391 int osz; 3392 int cflags = 0; 3393 int vac_err = 0; 3394 #endif 3395 int newidx = 0; 3396 3397 ttesz = TTE_CSZ(ttep); 3398 3399 ASSERT(ttesz > TTE8K); 3400 3401 npgs = TTEPAGES(ttesz); 3402 index = PAGESZ_TO_INDEX(ttesz); 3403 3404 pfnum = (*pps)->p_pagenum; 3405 ASSERT(IS_P2ALIGNED(pfnum, npgs)); 3406 3407 /* 3408 * Save the first pp so we can do HAT_TMPNC at the end. 3409 */ 3410 pp1 = *pps; 3411 #ifdef VAC 3412 osz = fnd_mapping_sz(pp1); 3413 #endif 3414 3415 for (i = 0; i < npgs; i++, pps++) { 3416 pp = *pps; 3417 ASSERT(PAGE_LOCKED(pp)); 3418 ASSERT(pp->p_szc >= ttesz); 3419 ASSERT(pp->p_szc == pp1->p_szc); 3420 ASSERT(sfmmu_mlist_held(pp)); 3421 3422 /* 3423 * XXX is it possible to maintain P_RO on the root only? 3424 */ 3425 if (TTE_IS_WRITABLE(ttep) && PP_ISRO(pp)) { 3426 pmtx = sfmmu_page_enter(pp); 3427 PP_CLRRO(pp); 3428 sfmmu_page_exit(pmtx); 3429 } else if (!PP_ISMAPPED(pp) && !TTE_IS_WRITABLE(ttep) && 3430 !PP_ISMOD(pp)) { 3431 pmtx = sfmmu_page_enter(pp); 3432 if (!(PP_ISMOD(pp))) { 3433 PP_SETRO(pp); 3434 } 3435 sfmmu_page_exit(pmtx); 3436 } 3437 3438 /* 3439 * If this is a remap we skip vac & contiguity checks. 3440 */ 3441 if (remap) 3442 continue; 3443 3444 /* 3445 * set p_vcolor and detect any vac conflicts. 3446 */ 3447 #ifdef VAC 3448 if (vac_err == 0) { 3449 vac_err = sfmmu_vacconflict_array(addr, pp, &cflags); 3450 3451 } 3452 #endif 3453 3454 /* 3455 * Save current index in case we need to undo it. 3456 * Note: "PAGESZ_TO_INDEX(sz) (1 << (sz))" 3457 * "SFMMU_INDEX_SHIFT 6" 3458 * "SFMMU_INDEX_MASK ((1 << SFMMU_INDEX_SHIFT) - 1)" 3459 * "PP_MAPINDEX(p_index) (p_index & SFMMU_INDEX_MASK)" 3460 * 3461 * So: index = PAGESZ_TO_INDEX(ttesz); 3462 * if ttesz == 1 then index = 0x2 3463 * 2 then index = 0x4 3464 * 3 then index = 0x8 3465 * 4 then index = 0x10 3466 * 5 then index = 0x20 3467 * The code below checks if it's a new pagesize (ie, newidx) 3468 * in case we need to take it back out of p_index, 3469 * and then or's the new index into the existing index. 
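 *
 * Concretely: a page already carrying a 64K mapping has 0x2 set in
 * p_index; adding a 4M mapping ORs in 0x8, so PP_MAPINDEX() becomes 0xa.
 * newidx remembers whether the 0x8 bit was newly added here, so that the
 * unwind path below can clear it again if the contiguity check fails.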
3470 */ 3471 if ((PP_MAPINDEX(pp) & index) == 0) 3472 newidx = 1; 3473 pp->p_index = (PP_MAPINDEX(pp) | index); 3474 3475 /* 3476 * contiguity check 3477 */ 3478 if (pp->p_pagenum != pfnum) { 3479 /* 3480 * If we fail the contiguity test then 3481 * the only thing we need to fix is the p_index field. 3482 * We might get a few extra flushes but since this 3483 * path is rare that is ok. The p_ro field will 3484 * get automatically fixed on the next tteload to 3485 * the page. NO TNC bit is set yet. 3486 */ 3487 while (i >= 0) { 3488 pp = *pps; 3489 if (newidx) 3490 pp->p_index = (PP_MAPINDEX(pp) & 3491 ~index); 3492 pps--; 3493 i--; 3494 } 3495 return (1); 3496 } 3497 pfnum++; 3498 addr += MMU_PAGESIZE; 3499 } 3500 3501 #ifdef VAC 3502 if (vac_err) { 3503 if (ttesz > osz) { 3504 /* 3505 * There are some smaller mappings that causes vac 3506 * conflicts. Convert all existing small mappings to 3507 * TNC. 3508 */ 3509 SFMMU_STAT_ADD(sf_uncache_conflict, npgs); 3510 sfmmu_page_cache_array(pp1, HAT_TMPNC, CACHE_FLUSH, 3511 npgs); 3512 } else { 3513 /* EMPTY */ 3514 /* 3515 * If there exists an big page mapping, 3516 * that means the whole existing big page 3517 * has TNC setting already. No need to covert to 3518 * TNC again. 3519 */ 3520 ASSERT(PP_ISTNC(pp1)); 3521 } 3522 } 3523 #endif /* VAC */ 3524 3525 return (0); 3526 } 3527 3528 #ifdef VAC 3529 /* 3530 * Routine that detects vac consistency for a large page. It also 3531 * sets virtual color for all pp's for this big mapping. 3532 */ 3533 static int 3534 sfmmu_vacconflict_array(caddr_t addr, page_t *pp, int *cflags) 3535 { 3536 int vcolor, ocolor; 3537 3538 ASSERT(sfmmu_mlist_held(pp)); 3539 3540 if (PP_ISNC(pp)) { 3541 return (HAT_TMPNC); 3542 } 3543 3544 vcolor = addr_to_vcolor(addr); 3545 if (PP_NEWPAGE(pp)) { 3546 PP_SET_VCOLOR(pp, vcolor); 3547 return (0); 3548 } 3549 3550 ocolor = PP_GET_VCOLOR(pp); 3551 if (ocolor == vcolor) { 3552 return (0); 3553 } 3554 3555 if (!PP_ISMAPPED(pp) && !PP_ISMAPPED_KPM(pp)) { 3556 /* 3557 * Previous user of page had a differnet color 3558 * but since there are no current users 3559 * we just flush the cache and change the color. 3560 * As an optimization for large pages we flush the 3561 * entire cache of that color and set a flag. 3562 */ 3563 SFMMU_STAT(sf_pgcolor_conflict); 3564 if (!CacheColor_IsFlushed(*cflags, ocolor)) { 3565 CacheColor_SetFlushed(*cflags, ocolor); 3566 sfmmu_cache_flushcolor(ocolor, pp->p_pagenum); 3567 } 3568 PP_SET_VCOLOR(pp, vcolor); 3569 return (0); 3570 } 3571 3572 /* 3573 * We got a real conflict with a current mapping. 3574 * set flags to start unencaching all mappings 3575 * and return failure so we restart looping 3576 * the pp array from the beginning. 3577 */ 3578 return (HAT_TMPNC); 3579 } 3580 #endif /* VAC */ 3581 3582 /* 3583 * creates a large page shadow hmeblk for a tte. 3584 * The purpose of this routine is to allow us to do quick unloads because 3585 * the vm layer can easily pass a very large but sparsely populated range. 
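 *
 * Roughly: for a user hblk smaller than the largest page size, a shadow
 * hblk is kept one hash level up (512K for an 8K hblk, otherwise the
 * next tte size), and each bit of its hblk_shw_mask records which of the
 * (at most 8) child regions below it holds mappings. For example, after
 * loading a single 8K tte, an unload of the surrounding, mostly empty
 * large range can consult the shadows at the larger hash sizes and skip
 * the empty stretches instead of probing every 8K-sized chunk.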
3586 */ 3587 static struct hme_blk * 3588 sfmmu_shadow_hcreate(sfmmu_t *sfmmup, caddr_t vaddr, int ttesz, uint_t flags) 3589 { 3590 struct hmehash_bucket *hmebp; 3591 hmeblk_tag hblktag; 3592 int hmeshift, size, vshift; 3593 uint_t shw_mask, newshw_mask; 3594 struct hme_blk *hmeblkp; 3595 3596 ASSERT(sfmmup != KHATID); 3597 if (mmu_page_sizes == max_mmu_page_sizes) { 3598 ASSERT(ttesz < TTE256M); 3599 } else { 3600 ASSERT(ttesz < TTE4M); 3601 ASSERT(sfmmup->sfmmu_ttecnt[TTE32M] == 0); 3602 ASSERT(sfmmup->sfmmu_ttecnt[TTE256M] == 0); 3603 } 3604 3605 if (ttesz == TTE8K) { 3606 size = TTE512K; 3607 } else { 3608 size = ++ttesz; 3609 } 3610 3611 hblktag.htag_id = sfmmup; 3612 hmeshift = HME_HASH_SHIFT(size); 3613 hblktag.htag_bspage = HME_HASH_BSPAGE(vaddr, hmeshift); 3614 hblktag.htag_rehash = HME_HASH_REHASH(size); 3615 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 3616 hmebp = HME_HASH_FUNCTION(sfmmup, vaddr, hmeshift); 3617 3618 SFMMU_HASH_LOCK(hmebp); 3619 3620 HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp); 3621 ASSERT(hmeblkp != (struct hme_blk *)hblk_reserve); 3622 if (hmeblkp == NULL) { 3623 hmeblkp = sfmmu_hblk_alloc(sfmmup, vaddr, hmebp, size, 3624 hblktag, flags, SFMMU_INVALID_SHMERID); 3625 } 3626 ASSERT(hmeblkp); 3627 if (!hmeblkp->hblk_shw_mask) { 3628 /* 3629 * if this is a unused hblk it was just allocated or could 3630 * potentially be a previous large page hblk so we need to 3631 * set the shadow bit. 3632 */ 3633 ASSERT(!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt); 3634 hmeblkp->hblk_shw_bit = 1; 3635 } else if (hmeblkp->hblk_shw_bit == 0) { 3636 panic("sfmmu_shadow_hcreate: shw bit not set in hmeblkp 0x%p", 3637 (void *)hmeblkp); 3638 } 3639 ASSERT(hmeblkp->hblk_shw_bit == 1); 3640 ASSERT(!hmeblkp->hblk_shared); 3641 vshift = vaddr_to_vshift(hblktag, vaddr, size); 3642 ASSERT(vshift < 8); 3643 /* 3644 * Atomically set shw mask bit 3645 */ 3646 do { 3647 shw_mask = hmeblkp->hblk_shw_mask; 3648 newshw_mask = shw_mask | (1 << vshift); 3649 newshw_mask = atomic_cas_32(&hmeblkp->hblk_shw_mask, shw_mask, 3650 newshw_mask); 3651 } while (newshw_mask != shw_mask); 3652 3653 SFMMU_HASH_UNLOCK(hmebp); 3654 3655 return (hmeblkp); 3656 } 3657 3658 /* 3659 * This routine cleanup a previous shadow hmeblk and changes it to 3660 * a regular hblk. This happens rarely but it is possible 3661 * when a process wants to use large pages and there are hblks still 3662 * lying around from the previous as that used these hmeblks. 3663 * The alternative was to cleanup the shadow hblks at unload time 3664 * but since so few user processes actually use large pages, it is 3665 * better to be lazy and cleanup at this time. 
3666 */ 3667 static void 3668 sfmmu_shadow_hcleanup(sfmmu_t *sfmmup, struct hme_blk *hmeblkp, 3669 struct hmehash_bucket *hmebp) 3670 { 3671 caddr_t addr, endaddr; 3672 int hashno, size; 3673 3674 ASSERT(hmeblkp->hblk_shw_bit); 3675 ASSERT(!hmeblkp->hblk_shared); 3676 3677 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 3678 3679 if (!hmeblkp->hblk_shw_mask) { 3680 hmeblkp->hblk_shw_bit = 0; 3681 return; 3682 } 3683 addr = (caddr_t)get_hblk_base(hmeblkp); 3684 endaddr = get_hblk_endaddr(hmeblkp); 3685 size = get_hblk_ttesz(hmeblkp); 3686 hashno = size - 1; 3687 ASSERT(hashno > 0); 3688 SFMMU_HASH_UNLOCK(hmebp); 3689 3690 sfmmu_free_hblks(sfmmup, addr, endaddr, hashno); 3691 3692 SFMMU_HASH_LOCK(hmebp); 3693 } 3694 3695 static void 3696 sfmmu_free_hblks(sfmmu_t *sfmmup, caddr_t addr, caddr_t endaddr, 3697 int hashno) 3698 { 3699 int hmeshift, shadow = 0; 3700 hmeblk_tag hblktag; 3701 struct hmehash_bucket *hmebp; 3702 struct hme_blk *hmeblkp; 3703 struct hme_blk *nx_hblk, *pr_hblk, *list = NULL; 3704 3705 ASSERT(hashno > 0); 3706 hblktag.htag_id = sfmmup; 3707 hblktag.htag_rehash = hashno; 3708 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 3709 3710 hmeshift = HME_HASH_SHIFT(hashno); 3711 3712 while (addr < endaddr) { 3713 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 3714 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift); 3715 SFMMU_HASH_LOCK(hmebp); 3716 /* inline HME_HASH_SEARCH */ 3717 hmeblkp = hmebp->hmeblkp; 3718 pr_hblk = NULL; 3719 while (hmeblkp) { 3720 if (HTAGS_EQ(hmeblkp->hblk_tag, hblktag)) { 3721 /* found hme_blk */ 3722 ASSERT(!hmeblkp->hblk_shared); 3723 if (hmeblkp->hblk_shw_bit) { 3724 if (hmeblkp->hblk_shw_mask) { 3725 shadow = 1; 3726 sfmmu_shadow_hcleanup(sfmmup, 3727 hmeblkp, hmebp); 3728 break; 3729 } else { 3730 hmeblkp->hblk_shw_bit = 0; 3731 } 3732 } 3733 3734 /* 3735 * Hblk_hmecnt and hblk_vcnt could be non zero 3736 * since hblk_unload() does not gurantee that. 3737 * 3738 * XXX - this could cause tteload() to spin 3739 * where sfmmu_shadow_hcleanup() is called. 3740 */ 3741 } 3742 3743 nx_hblk = hmeblkp->hblk_next; 3744 if (!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) { 3745 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, 3746 &list, 0); 3747 } else { 3748 pr_hblk = hmeblkp; 3749 } 3750 hmeblkp = nx_hblk; 3751 } 3752 3753 SFMMU_HASH_UNLOCK(hmebp); 3754 3755 if (shadow) { 3756 /* 3757 * We found another shadow hblk so cleaned its 3758 * children. We need to go back and cleanup 3759 * the original hblk so we don't change the 3760 * addr. 3761 */ 3762 shadow = 0; 3763 } else { 3764 addr = (caddr_t)roundup((uintptr_t)addr + 1, 3765 (1 << hmeshift)); 3766 } 3767 } 3768 sfmmu_hblks_list_purge(&list, 0); 3769 } 3770 3771 /* 3772 * This routine's job is to delete stale invalid shared hmeregions hmeblks that 3773 * may still linger on after pageunload. 
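 *
 * Shared-region hmeblks are hashed under the SRD and region id rather than
 * under a process hat; for orientation, the lookup key below is assembled
 * as (mirroring the code):
 *
 *	hblktag.htag_id = srdp;                 (not sfmmup)
 *	hblktag.htag_rid = rid;                 (not SFMMU_INVALID_SHMERID)
 *	hblktag.htag_bspage = HME_HASH_BSPAGE(addr, HME_HASH_SHIFT(ttesz));
 *	hblktag.htag_rehash = ttesz;
 *	hmebp = HME_HASH_FUNCTION(srdp, addr, hmeshift);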
3774 */ 3775 static void 3776 sfmmu_cleanup_rhblk(sf_srd_t *srdp, caddr_t addr, uint_t rid, int ttesz) 3777 { 3778 int hmeshift; 3779 hmeblk_tag hblktag; 3780 struct hmehash_bucket *hmebp; 3781 struct hme_blk *hmeblkp; 3782 struct hme_blk *pr_hblk; 3783 struct hme_blk *list = NULL; 3784 3785 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 3786 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 3787 3788 hmeshift = HME_HASH_SHIFT(ttesz); 3789 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 3790 hblktag.htag_rehash = ttesz; 3791 hblktag.htag_rid = rid; 3792 hblktag.htag_id = srdp; 3793 hmebp = HME_HASH_FUNCTION(srdp, addr, hmeshift); 3794 3795 SFMMU_HASH_LOCK(hmebp); 3796 HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, &list); 3797 if (hmeblkp != NULL) { 3798 ASSERT(hmeblkp->hblk_shared); 3799 ASSERT(!hmeblkp->hblk_shw_bit); 3800 if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) { 3801 panic("sfmmu_cleanup_rhblk: valid hmeblk"); 3802 } 3803 ASSERT(!hmeblkp->hblk_lckcnt); 3804 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, 3805 &list, 0); 3806 } 3807 SFMMU_HASH_UNLOCK(hmebp); 3808 sfmmu_hblks_list_purge(&list, 0); 3809 } 3810 3811 /* ARGSUSED */ 3812 static void 3813 sfmmu_rgn_cb_noop(caddr_t saddr, caddr_t eaddr, caddr_t r_saddr, 3814 size_t r_size, void *r_obj, u_offset_t r_objoff) 3815 { 3816 } 3817 3818 /* 3819 * Searches for an hmeblk which maps addr, then unloads this mapping 3820 * and updates *eaddrp, if the hmeblk is found. 3821 */ 3822 static void 3823 sfmmu_unload_hmeregion_va(sf_srd_t *srdp, uint_t rid, caddr_t addr, 3824 caddr_t eaddr, int ttesz, caddr_t *eaddrp) 3825 { 3826 int hmeshift; 3827 hmeblk_tag hblktag; 3828 struct hmehash_bucket *hmebp; 3829 struct hme_blk *hmeblkp; 3830 struct hme_blk *pr_hblk; 3831 struct hme_blk *list = NULL; 3832 3833 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 3834 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 3835 ASSERT(ttesz >= HBLK_MIN_TTESZ); 3836 3837 hmeshift = HME_HASH_SHIFT(ttesz); 3838 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 3839 hblktag.htag_rehash = ttesz; 3840 hblktag.htag_rid = rid; 3841 hblktag.htag_id = srdp; 3842 hmebp = HME_HASH_FUNCTION(srdp, addr, hmeshift); 3843 3844 SFMMU_HASH_LOCK(hmebp); 3845 HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, &list); 3846 if (hmeblkp != NULL) { 3847 ASSERT(hmeblkp->hblk_shared); 3848 ASSERT(!hmeblkp->hblk_lckcnt); 3849 if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) { 3850 *eaddrp = sfmmu_hblk_unload(NULL, hmeblkp, addr, 3851 eaddr, NULL, HAT_UNLOAD); 3852 ASSERT(*eaddrp > addr); 3853 } 3854 ASSERT(!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt); 3855 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, 3856 &list, 0); 3857 } 3858 SFMMU_HASH_UNLOCK(hmebp); 3859 sfmmu_hblks_list_purge(&list, 0); 3860 } 3861 3862 static void 3863 sfmmu_unload_hmeregion(sf_srd_t *srdp, sf_region_t *rgnp) 3864 { 3865 int ttesz = rgnp->rgn_pgszc; 3866 size_t rsz = rgnp->rgn_size; 3867 caddr_t rsaddr = rgnp->rgn_saddr; 3868 caddr_t readdr = rsaddr + rsz; 3869 caddr_t rhsaddr; 3870 caddr_t va; 3871 uint_t rid = rgnp->rgn_id; 3872 caddr_t cbsaddr; 3873 caddr_t cbeaddr; 3874 hat_rgn_cb_func_t rcbfunc; 3875 ulong_t cnt; 3876 3877 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 3878 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 3879 3880 ASSERT(IS_P2ALIGNED(rsaddr, TTEBYTES(ttesz))); 3881 ASSERT(IS_P2ALIGNED(rsz, TTEBYTES(ttesz))); 3882 if (ttesz < HBLK_MIN_TTESZ) { 3883 ttesz = HBLK_MIN_TTESZ; 3884 rhsaddr = (caddr_t)P2ALIGN((uintptr_t)rsaddr, HBLK_MIN_BYTES); 3885 } else { 3886 rhsaddr = rsaddr; 3887 } 3888 3889 if ((rcbfunc = rgnp->rgn_cb_function) == NULL) 
{ 3890 rcbfunc = sfmmu_rgn_cb_noop; 3891 } 3892 3893 while (ttesz >= HBLK_MIN_TTESZ) { 3894 cbsaddr = rsaddr; 3895 cbeaddr = rsaddr; 3896 if (!(rgnp->rgn_hmeflags & (1 << ttesz))) { 3897 ttesz--; 3898 continue; 3899 } 3900 cnt = 0; 3901 va = rsaddr; 3902 while (va < readdr) { 3903 ASSERT(va >= rhsaddr); 3904 if (va != cbeaddr) { 3905 if (cbeaddr != cbsaddr) { 3906 ASSERT(cbeaddr > cbsaddr); 3907 (*rcbfunc)(cbsaddr, cbeaddr, 3908 rsaddr, rsz, rgnp->rgn_obj, 3909 rgnp->rgn_objoff); 3910 } 3911 cbsaddr = va; 3912 cbeaddr = va; 3913 } 3914 sfmmu_unload_hmeregion_va(srdp, rid, va, readdr, 3915 ttesz, &cbeaddr); 3916 cnt++; 3917 va = rhsaddr + (cnt << TTE_PAGE_SHIFT(ttesz)); 3918 } 3919 if (cbeaddr != cbsaddr) { 3920 ASSERT(cbeaddr > cbsaddr); 3921 (*rcbfunc)(cbsaddr, cbeaddr, rsaddr, 3922 rsz, rgnp->rgn_obj, 3923 rgnp->rgn_objoff); 3924 } 3925 ttesz--; 3926 } 3927 } 3928 3929 /* 3930 * Release one hardware address translation lock on the given address range. 3931 */ 3932 void 3933 hat_unlock(struct hat *sfmmup, caddr_t addr, size_t len) 3934 { 3935 struct hmehash_bucket *hmebp; 3936 hmeblk_tag hblktag; 3937 int hmeshift, hashno = 1; 3938 struct hme_blk *hmeblkp, *list = NULL; 3939 caddr_t endaddr; 3940 3941 ASSERT(sfmmup != NULL); 3942 3943 ASSERT((sfmmup == ksfmmup) || AS_LOCK_HELD(sfmmup->sfmmu_as)); 3944 ASSERT((len & MMU_PAGEOFFSET) == 0); 3945 endaddr = addr + len; 3946 hblktag.htag_id = sfmmup; 3947 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 3948 3949 /* 3950 * Spitfire supports 4 page sizes. 3951 * Most pages are expected to be of the smallest page size (8K) and 3952 * these will not need to be rehashed. 64K pages also don't need to be 3953 * rehashed because an hmeblk spans 64K of address space. 512K pages 3954 * might need 1 rehash and and 4M pages might need 2 rehashes. 3955 */ 3956 while (addr < endaddr) { 3957 hmeshift = HME_HASH_SHIFT(hashno); 3958 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 3959 hblktag.htag_rehash = hashno; 3960 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift); 3961 3962 SFMMU_HASH_LOCK(hmebp); 3963 3964 HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list); 3965 if (hmeblkp != NULL) { 3966 ASSERT(!hmeblkp->hblk_shared); 3967 /* 3968 * If we encounter a shadow hmeblk then 3969 * we know there are no valid hmeblks mapping 3970 * this address at this size or larger. 3971 * Just increment address by the smallest 3972 * page size. 3973 */ 3974 if (hmeblkp->hblk_shw_bit) { 3975 addr += MMU_PAGESIZE; 3976 } else { 3977 addr = sfmmu_hblk_unlock(hmeblkp, addr, 3978 endaddr); 3979 } 3980 SFMMU_HASH_UNLOCK(hmebp); 3981 hashno = 1; 3982 continue; 3983 } 3984 SFMMU_HASH_UNLOCK(hmebp); 3985 3986 if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) { 3987 /* 3988 * We have traversed the whole list and rehashed 3989 * if necessary without finding the address to unlock 3990 * which should never happen. 3991 */ 3992 panic("sfmmu_unlock: addr not found. 
" 3993 "addr %p hat %p", (void *)addr, (void *)sfmmup); 3994 } else { 3995 hashno++; 3996 } 3997 } 3998 3999 sfmmu_hblks_list_purge(&list, 0); 4000 } 4001 4002 void 4003 hat_unlock_region(struct hat *sfmmup, caddr_t addr, size_t len, 4004 hat_region_cookie_t rcookie) 4005 { 4006 sf_srd_t *srdp; 4007 sf_region_t *rgnp; 4008 int ttesz; 4009 uint_t rid; 4010 caddr_t eaddr; 4011 caddr_t va; 4012 int hmeshift; 4013 hmeblk_tag hblktag; 4014 struct hmehash_bucket *hmebp; 4015 struct hme_blk *hmeblkp; 4016 struct hme_blk *pr_hblk; 4017 struct hme_blk *list; 4018 4019 if (rcookie == HAT_INVALID_REGION_COOKIE) { 4020 hat_unlock(sfmmup, addr, len); 4021 return; 4022 } 4023 4024 ASSERT(sfmmup != NULL); 4025 ASSERT(sfmmup != ksfmmup); 4026 4027 srdp = sfmmup->sfmmu_srdp; 4028 rid = (uint_t)((uint64_t)rcookie); 4029 VERIFY3U(rid, <, SFMMU_MAX_HME_REGIONS); 4030 eaddr = addr + len; 4031 va = addr; 4032 list = NULL; 4033 rgnp = srdp->srd_hmergnp[rid]; 4034 SFMMU_VALIDATE_HMERID(sfmmup, rid, addr, len); 4035 4036 ASSERT(IS_P2ALIGNED(addr, TTEBYTES(rgnp->rgn_pgszc))); 4037 ASSERT(IS_P2ALIGNED(len, TTEBYTES(rgnp->rgn_pgszc))); 4038 if (rgnp->rgn_pgszc < HBLK_MIN_TTESZ) { 4039 ttesz = HBLK_MIN_TTESZ; 4040 } else { 4041 ttesz = rgnp->rgn_pgszc; 4042 } 4043 while (va < eaddr) { 4044 while (ttesz < rgnp->rgn_pgszc && 4045 IS_P2ALIGNED(va, TTEBYTES(ttesz + 1))) { 4046 ttesz++; 4047 } 4048 while (ttesz >= HBLK_MIN_TTESZ) { 4049 if (!(rgnp->rgn_hmeflags & (1 << ttesz))) { 4050 ttesz--; 4051 continue; 4052 } 4053 hmeshift = HME_HASH_SHIFT(ttesz); 4054 hblktag.htag_bspage = HME_HASH_BSPAGE(va, hmeshift); 4055 hblktag.htag_rehash = ttesz; 4056 hblktag.htag_rid = rid; 4057 hblktag.htag_id = srdp; 4058 hmebp = HME_HASH_FUNCTION(srdp, va, hmeshift); 4059 SFMMU_HASH_LOCK(hmebp); 4060 HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, 4061 &list); 4062 if (hmeblkp == NULL) { 4063 SFMMU_HASH_UNLOCK(hmebp); 4064 ttesz--; 4065 continue; 4066 } 4067 ASSERT(hmeblkp->hblk_shared); 4068 va = sfmmu_hblk_unlock(hmeblkp, va, eaddr); 4069 ASSERT(va >= eaddr || 4070 IS_P2ALIGNED((uintptr_t)va, TTEBYTES(ttesz))); 4071 SFMMU_HASH_UNLOCK(hmebp); 4072 break; 4073 } 4074 if (ttesz < HBLK_MIN_TTESZ) { 4075 panic("hat_unlock_region: addr not found " 4076 "addr %p hat %p", (void *)va, (void *)sfmmup); 4077 } 4078 } 4079 sfmmu_hblks_list_purge(&list, 0); 4080 } 4081 4082 /* 4083 * Function to unlock a range of addresses in an hmeblk. It returns the 4084 * next address that needs to be unlocked. 4085 * Should be called with the hash lock held. 
4086 */ 4087 static caddr_t 4088 sfmmu_hblk_unlock(struct hme_blk *hmeblkp, caddr_t addr, caddr_t endaddr) 4089 { 4090 struct sf_hment *sfhme; 4091 tte_t tteold, ttemod; 4092 int ttesz, ret; 4093 4094 ASSERT(in_hblk_range(hmeblkp, addr)); 4095 ASSERT(hmeblkp->hblk_shw_bit == 0); 4096 4097 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp)); 4098 ttesz = get_hblk_ttesz(hmeblkp); 4099 4100 HBLKTOHME(sfhme, hmeblkp, addr); 4101 while (addr < endaddr) { 4102 readtte: 4103 sfmmu_copytte(&sfhme->hme_tte, &tteold); 4104 if (TTE_IS_VALID(&tteold)) { 4105 4106 ttemod = tteold; 4107 4108 ret = sfmmu_modifytte_try(&tteold, &ttemod, 4109 &sfhme->hme_tte); 4110 4111 if (ret < 0) 4112 goto readtte; 4113 4114 if (hmeblkp->hblk_lckcnt == 0) 4115 panic("zero hblk lckcnt"); 4116 4117 if (((uintptr_t)addr + TTEBYTES(ttesz)) > 4118 (uintptr_t)endaddr) 4119 panic("can't unlock large tte"); 4120 4121 ASSERT(hmeblkp->hblk_lckcnt > 0); 4122 atomic_dec_32(&hmeblkp->hblk_lckcnt); 4123 HBLK_STACK_TRACE(hmeblkp, HBLK_UNLOCK); 4124 } else { 4125 panic("sfmmu_hblk_unlock: invalid tte"); 4126 } 4127 addr += TTEBYTES(ttesz); 4128 sfhme++; 4129 } 4130 return (addr); 4131 } 4132 4133 /* 4134 * Physical Address Mapping Framework 4135 * 4136 * General rules: 4137 * 4138 * (1) Applies only to seg_kmem memory pages. To make things easier, 4139 * seg_kpm addresses are also accepted by the routines, but nothing 4140 * is done with them since by definition their PA mappings are static. 4141 * (2) hat_add_callback() may only be called while holding the page lock 4142 * SE_SHARED or SE_EXCL of the underlying page (e.g., as_pagelock()), 4143 * or passing HAC_PAGELOCK flag. 4144 * (3) prehandler() and posthandler() may not call hat_add_callback() or 4145 * hat_delete_callback(), nor should they allocate memory. Post quiesce 4146 * callbacks may not sleep or acquire adaptive mutex locks. 4147 * (4) Either prehandler() or posthandler() (but not both) may be specified 4148 * as being NULL. Specifying an errhandler() is optional. 4149 * 4150 * Details of using the framework: 4151 * 4152 * registering a callback (hat_register_callback()) 4153 * 4154 * Pass prehandler, posthandler, errhandler addresses 4155 * as described below. If capture_cpus argument is nonzero, 4156 * suspend callback to the prehandler will occur with CPUs 4157 * captured and executing xc_loop() and CPUs will remain 4158 * captured until after the posthandler suspend callback 4159 * occurs. 4160 * 4161 * adding a callback (hat_add_callback()) 4162 * 4163 * as_pagelock(); 4164 * hat_add_callback(); 4165 * save returned pfn in private data structures or program registers; 4166 * as_pageunlock(); 4167 * 4168 * prehandler() 4169 * 4170 * Stop all accesses by physical address to this memory page. 4171 * Called twice: the first, PRESUSPEND, is a context safe to acquire 4172 * adaptive locks. The second, SUSPEND, is called at high PIL with 4173 * CPUs captured so adaptive locks may NOT be acquired (and all spin 4174 * locks must be XCALL_PIL or higher locks). 4175 * 4176 * May return the following errors: 4177 * EIO: A fatal error has occurred. This will result in panic. 4178 * EAGAIN: The page cannot be suspended. This will fail the 4179 * relocation. 4180 * 0: Success. 4181 * 4182 * posthandler() 4183 * 4184 * Save new pfn in private data structures or program registers; 4185 * not allowed to fail (non-zero return values will result in panic). 4186 * 4187 * errhandler() 4188 * 4189 * called when an error occurs related to the callback. 
Currently 4190 * the only such error is HAT_CB_ERR_LEAKED which indicates that 4191 * a page is being freed, but there are still outstanding callback(s) 4192 * registered on the page. 4193 * 4194 * removing a callback (hat_delete_callback(); e.g., prior to freeing memory) 4195 * 4196 * stop using physical address 4197 * hat_delete_callback(); 4198 * 4199 */ 4200 4201 /* 4202 * Register a callback class. Each subsystem should do this once and 4203 * cache the id_t returned for use in setting up and tearing down callbacks. 4204 * 4205 * There is no facility for removing callback IDs once they are created; 4206 * the "key" should be unique for each module, so in case a module is unloaded 4207 * and subsequently re-loaded, we can recycle the module's previous entry. 4208 */ 4209 id_t 4210 hat_register_callback(int key, 4211 int (*prehandler)(caddr_t, uint_t, uint_t, void *), 4212 int (*posthandler)(caddr_t, uint_t, uint_t, void *, pfn_t), 4213 int (*errhandler)(caddr_t, uint_t, uint_t, void *), 4214 int capture_cpus) 4215 { 4216 id_t id; 4217 4218 /* 4219 * Search the table for a pre-existing callback associated with 4220 * the identifier "key". If one exists, we re-use that entry in 4221 * the table for this instance, otherwise we assign the next 4222 * available table slot. 4223 */ 4224 for (id = 0; id < sfmmu_max_cb_id; id++) { 4225 if (sfmmu_cb_table[id].key == key) 4226 break; 4227 } 4228 4229 if (id == sfmmu_max_cb_id) { 4230 id = sfmmu_cb_nextid++; 4231 if (id >= sfmmu_max_cb_id) 4232 panic("hat_register_callback: out of callback IDs"); 4233 } 4234 4235 ASSERT(prehandler != NULL || posthandler != NULL); 4236 4237 sfmmu_cb_table[id].key = key; 4238 sfmmu_cb_table[id].prehandler = prehandler; 4239 sfmmu_cb_table[id].posthandler = posthandler; 4240 sfmmu_cb_table[id].errhandler = errhandler; 4241 sfmmu_cb_table[id].capture_cpus = capture_cpus; 4242 4243 return (id); 4244 } 4245 4246 #define HAC_COOKIE_NONE (void *)-1 4247 4248 /* 4249 * Add relocation callbacks to the specified addr/len which will be called 4250 * when relocating the associated page. See the description of pre and 4251 * posthandler above for more details. 4252 * 4253 * If HAC_PAGELOCK is included in flags, the underlying memory page is 4254 * locked internally so the caller must be able to deal with the callback 4255 * running even before this function has returned. If HAC_PAGELOCK is not 4256 * set, it is assumed that the underlying memory pages are locked. 4257 * 4258 * Since the caller must track the individual page boundaries anyway, 4259 * we only allow a callback to be added to a single page (large 4260 * or small). Thus [addr, addr + len) MUST be contained within a single 4261 * page. 4262 * 4263 * Registering multiple callbacks on the same [addr, addr+len) is supported, 4264 * _provided_that_ a unique parameter is specified for each callback. 4265 * If multiple callbacks are registered on the same range the callback will 4266 * be invoked with each unique parameter. Registering the same callback with 4267 * the same argument more than once will result in corrupted kernel state. 4268 * 4269 * Returns the pfn of the underlying kernel page in *rpfn 4270 * on success, or PFN_INVALID on failure. 4271 * 4272 * cookiep (if passed) provides storage space for an opaque cookie 4273 * to return later to hat_delete_callback(). This cookie makes the callback 4274 * deletion significantly quicker by avoiding a potentially lengthy hash 4275 * search. 
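 *
 * A sketch of typical client use (the key, handler names and private data
 * are hypothetical; error handling omitted):
 *
 *	id = hat_register_callback(MYDRV_CB_KEY, mydrv_pre, mydrv_post,
 *	    mydrv_err, 1);
 *	...
 *	error = hat_add_callback(id, va, len, HAC_SLEEP | HAC_PAGELOCK,
 *	    mydrv_state, &pfn, &cookie);
 *	program the device with the physical address derived from pfn;
 *
 * The cookie is saved and later handed back to hat_delete_callback().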
4276 * 4277 * Returns values: 4278 * 0: success 4279 * ENOMEM: memory allocation failure (e.g. flags was passed as HAC_NOSLEEP) 4280 * EINVAL: callback ID is not valid 4281 * ENXIO: ["vaddr", "vaddr" + len) is not mapped in the kernel's address 4282 * space 4283 * ERANGE: ["vaddr", "vaddr" + len) crosses a page boundary 4284 */ 4285 int 4286 hat_add_callback(id_t callback_id, caddr_t vaddr, uint_t len, uint_t flags, 4287 void *pvt, pfn_t *rpfn, void **cookiep) 4288 { 4289 struct hmehash_bucket *hmebp; 4290 hmeblk_tag hblktag; 4291 struct hme_blk *hmeblkp; 4292 int hmeshift, hashno; 4293 caddr_t saddr, eaddr, baseaddr; 4294 struct pa_hment *pahmep; 4295 struct sf_hment *sfhmep, *osfhmep; 4296 kmutex_t *pml; 4297 tte_t tte; 4298 page_t *pp; 4299 vnode_t *vp; 4300 u_offset_t off; 4301 pfn_t pfn; 4302 int kmflags = (flags & HAC_SLEEP)? KM_SLEEP : KM_NOSLEEP; 4303 int locked = 0; 4304 4305 /* 4306 * For KPM mappings, just return the physical address since we 4307 * don't need to register any callbacks. 4308 */ 4309 if (IS_KPM_ADDR(vaddr)) { 4310 uint64_t paddr; 4311 SFMMU_KPM_VTOP(vaddr, paddr); 4312 *rpfn = btop(paddr); 4313 if (cookiep != NULL) 4314 *cookiep = HAC_COOKIE_NONE; 4315 return (0); 4316 } 4317 4318 if (callback_id < (id_t)0 || callback_id >= sfmmu_cb_nextid) { 4319 *rpfn = PFN_INVALID; 4320 return (EINVAL); 4321 } 4322 4323 if ((pahmep = kmem_cache_alloc(pa_hment_cache, kmflags)) == NULL) { 4324 *rpfn = PFN_INVALID; 4325 return (ENOMEM); 4326 } 4327 4328 sfhmep = &pahmep->sfment; 4329 4330 saddr = (caddr_t)((uintptr_t)vaddr & MMU_PAGEMASK); 4331 eaddr = saddr + len; 4332 4333 rehash: 4334 /* Find the mapping(s) for this page */ 4335 for (hashno = TTE64K, hmeblkp = NULL; 4336 hmeblkp == NULL && hashno <= mmu_hashcnt; 4337 hashno++) { 4338 hmeshift = HME_HASH_SHIFT(hashno); 4339 hblktag.htag_id = ksfmmup; 4340 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 4341 hblktag.htag_bspage = HME_HASH_BSPAGE(saddr, hmeshift); 4342 hblktag.htag_rehash = hashno; 4343 hmebp = HME_HASH_FUNCTION(ksfmmup, saddr, hmeshift); 4344 4345 SFMMU_HASH_LOCK(hmebp); 4346 4347 HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp); 4348 4349 if (hmeblkp == NULL) 4350 SFMMU_HASH_UNLOCK(hmebp); 4351 } 4352 4353 if (hmeblkp == NULL) { 4354 kmem_cache_free(pa_hment_cache, pahmep); 4355 *rpfn = PFN_INVALID; 4356 return (ENXIO); 4357 } 4358 4359 ASSERT(!hmeblkp->hblk_shared); 4360 4361 HBLKTOHME(osfhmep, hmeblkp, saddr); 4362 sfmmu_copytte(&osfhmep->hme_tte, &tte); 4363 4364 if (!TTE_IS_VALID(&tte)) { 4365 SFMMU_HASH_UNLOCK(hmebp); 4366 kmem_cache_free(pa_hment_cache, pahmep); 4367 *rpfn = PFN_INVALID; 4368 return (ENXIO); 4369 } 4370 4371 /* 4372 * Make sure the boundaries for the callback fall within this 4373 * single mapping. 4374 */ 4375 baseaddr = (caddr_t)get_hblk_base(hmeblkp); 4376 ASSERT(saddr >= baseaddr); 4377 if (eaddr > saddr + TTEBYTES(TTE_CSZ(&tte))) { 4378 SFMMU_HASH_UNLOCK(hmebp); 4379 kmem_cache_free(pa_hment_cache, pahmep); 4380 *rpfn = PFN_INVALID; 4381 return (ERANGE); 4382 } 4383 4384 pfn = sfmmu_ttetopfn(&tte, vaddr); 4385 4386 /* 4387 * The pfn may not have a page_t underneath in which case we 4388 * just return it. This can happen if we are doing I/O to a 4389 * static portion of the kernel's address space, for instance. 
4390 */ 4391 pp = osfhmep->hme_page; 4392 if (pp == NULL) { 4393 SFMMU_HASH_UNLOCK(hmebp); 4394 kmem_cache_free(pa_hment_cache, pahmep); 4395 *rpfn = pfn; 4396 if (cookiep) 4397 *cookiep = HAC_COOKIE_NONE; 4398 return (0); 4399 } 4400 ASSERT(pp == PP_PAGEROOT(pp)); 4401 4402 vp = pp->p_vnode; 4403 off = pp->p_offset; 4404 4405 pml = sfmmu_mlist_enter(pp); 4406 4407 if (flags & HAC_PAGELOCK) { 4408 if (!page_trylock(pp, SE_SHARED)) { 4409 /* 4410 * Somebody is holding SE_EXCL lock. Might 4411 * even be hat_page_relocate(). Drop all 4412 * our locks, lookup the page in &kvp, and 4413 * retry. If it doesn't exist in &kvp and &zvp, 4414 * then we must be dealing with a kernel mapped 4415 * page which doesn't actually belong to 4416 * segkmem so we punt. 4417 */ 4418 sfmmu_mlist_exit(pml); 4419 SFMMU_HASH_UNLOCK(hmebp); 4420 pp = page_lookup(&kvp, (u_offset_t)saddr, SE_SHARED); 4421 4422 /* check zvp before giving up */ 4423 if (pp == NULL) 4424 pp = page_lookup(&zvp, (u_offset_t)saddr, 4425 SE_SHARED); 4426 4427 /* Okay, we didn't find it, give up */ 4428 if (pp == NULL) { 4429 kmem_cache_free(pa_hment_cache, pahmep); 4430 *rpfn = pfn; 4431 if (cookiep) 4432 *cookiep = HAC_COOKIE_NONE; 4433 return (0); 4434 } 4435 page_unlock(pp); 4436 goto rehash; 4437 } 4438 locked = 1; 4439 } 4440 4441 if (!PAGE_LOCKED(pp) && !panicstr) 4442 panic("hat_add_callback: page 0x%p not locked", (void *)pp); 4443 4444 if (osfhmep->hme_page != pp || pp->p_vnode != vp || 4445 pp->p_offset != off) { 4446 /* 4447 * The page moved before we got our hands on it. Drop 4448 * all the locks and try again. 4449 */ 4450 ASSERT((flags & HAC_PAGELOCK) != 0); 4451 sfmmu_mlist_exit(pml); 4452 SFMMU_HASH_UNLOCK(hmebp); 4453 page_unlock(pp); 4454 locked = 0; 4455 goto rehash; 4456 } 4457 4458 if (!VN_ISKAS(vp)) { 4459 /* 4460 * This is not a segkmem page but another page which 4461 * has been kernel mapped. It had better have at least 4462 * a share lock on it. Return the pfn. 4463 */ 4464 sfmmu_mlist_exit(pml); 4465 SFMMU_HASH_UNLOCK(hmebp); 4466 if (locked) 4467 page_unlock(pp); 4468 kmem_cache_free(pa_hment_cache, pahmep); 4469 ASSERT(PAGE_LOCKED(pp)); 4470 *rpfn = pfn; 4471 if (cookiep) 4472 *cookiep = HAC_COOKIE_NONE; 4473 return (0); 4474 } 4475 4476 /* 4477 * Setup this pa_hment and link its embedded dummy sf_hment into 4478 * the mapping list. 4479 */ 4480 pp->p_share++; 4481 pahmep->cb_id = callback_id; 4482 pahmep->addr = vaddr; 4483 pahmep->len = len; 4484 pahmep->refcnt = 1; 4485 pahmep->flags = 0; 4486 pahmep->pvt = pvt; 4487 4488 sfhmep->hme_tte.ll = 0; 4489 sfhmep->hme_data = pahmep; 4490 sfhmep->hme_prev = osfhmep; 4491 sfhmep->hme_next = osfhmep->hme_next; 4492 4493 if (osfhmep->hme_next) 4494 osfhmep->hme_next->hme_prev = sfhmep; 4495 4496 osfhmep->hme_next = sfhmep; 4497 4498 sfmmu_mlist_exit(pml); 4499 SFMMU_HASH_UNLOCK(hmebp); 4500 4501 if (locked) 4502 page_unlock(pp); 4503 4504 *rpfn = pfn; 4505 if (cookiep) 4506 *cookiep = (void *)pahmep; 4507 4508 return (0); 4509 } 4510 4511 /* 4512 * Remove the relocation callbacks from the specified addr/len. 
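 *
 * A sketch of the teardown side, continuing the hypothetical example in
 * the hat_add_callback() comment above:
 *
 *	quiesce device access to the page;
 *	hat_delete_callback(va, len, mydrv_state, HAC_PAGELOCK, cookie);
 *
 * Passing back the cookie returned by hat_add_callback() lets the pa_hment
 * be found directly; with a NULL cookie the page's mapping list is walked
 * looking for a pa_hment whose pvt, addr and len all match.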
4513 */ 4514 void 4515 hat_delete_callback(caddr_t vaddr, uint_t len, void *pvt, uint_t flags, 4516 void *cookie) 4517 { 4518 struct hmehash_bucket *hmebp; 4519 hmeblk_tag hblktag; 4520 struct hme_blk *hmeblkp; 4521 int hmeshift, hashno; 4522 caddr_t saddr; 4523 struct pa_hment *pahmep; 4524 struct sf_hment *sfhmep, *osfhmep; 4525 kmutex_t *pml; 4526 tte_t tte; 4527 page_t *pp; 4528 vnode_t *vp; 4529 u_offset_t off; 4530 int locked = 0; 4531 4532 /* 4533 * If the cookie is HAC_COOKIE_NONE then there is no pa_hment to 4534 * remove so just return. 4535 */ 4536 if (cookie == HAC_COOKIE_NONE || IS_KPM_ADDR(vaddr)) 4537 return; 4538 4539 saddr = (caddr_t)((uintptr_t)vaddr & MMU_PAGEMASK); 4540 4541 rehash: 4542 /* Find the mapping(s) for this page */ 4543 for (hashno = TTE64K, hmeblkp = NULL; 4544 hmeblkp == NULL && hashno <= mmu_hashcnt; 4545 hashno++) { 4546 hmeshift = HME_HASH_SHIFT(hashno); 4547 hblktag.htag_id = ksfmmup; 4548 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 4549 hblktag.htag_bspage = HME_HASH_BSPAGE(saddr, hmeshift); 4550 hblktag.htag_rehash = hashno; 4551 hmebp = HME_HASH_FUNCTION(ksfmmup, saddr, hmeshift); 4552 4553 SFMMU_HASH_LOCK(hmebp); 4554 4555 HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp); 4556 4557 if (hmeblkp == NULL) 4558 SFMMU_HASH_UNLOCK(hmebp); 4559 } 4560 4561 if (hmeblkp == NULL) 4562 return; 4563 4564 ASSERT(!hmeblkp->hblk_shared); 4565 4566 HBLKTOHME(osfhmep, hmeblkp, saddr); 4567 4568 sfmmu_copytte(&osfhmep->hme_tte, &tte); 4569 if (!TTE_IS_VALID(&tte)) { 4570 SFMMU_HASH_UNLOCK(hmebp); 4571 return; 4572 } 4573 4574 pp = osfhmep->hme_page; 4575 if (pp == NULL) { 4576 SFMMU_HASH_UNLOCK(hmebp); 4577 ASSERT(cookie == NULL); 4578 return; 4579 } 4580 4581 vp = pp->p_vnode; 4582 off = pp->p_offset; 4583 4584 pml = sfmmu_mlist_enter(pp); 4585 4586 if (flags & HAC_PAGELOCK) { 4587 if (!page_trylock(pp, SE_SHARED)) { 4588 /* 4589 * Somebody is holding SE_EXCL lock. Might 4590 * even be hat_page_relocate(). Drop all 4591 * our locks, lookup the page in &kvp, and 4592 * retry. If it doesn't exist in &kvp and &zvp, 4593 * then we must be dealing with a kernel mapped 4594 * page which doesn't actually belong to 4595 * segkmem so we punt. 4596 */ 4597 sfmmu_mlist_exit(pml); 4598 SFMMU_HASH_UNLOCK(hmebp); 4599 pp = page_lookup(&kvp, (u_offset_t)saddr, SE_SHARED); 4600 /* check zvp before giving up */ 4601 if (pp == NULL) 4602 pp = page_lookup(&zvp, (u_offset_t)saddr, 4603 SE_SHARED); 4604 4605 if (pp == NULL) { 4606 ASSERT(cookie == NULL); 4607 return; 4608 } 4609 page_unlock(pp); 4610 goto rehash; 4611 } 4612 locked = 1; 4613 } 4614 4615 ASSERT(PAGE_LOCKED(pp)); 4616 4617 if (osfhmep->hme_page != pp || pp->p_vnode != vp || 4618 pp->p_offset != off) { 4619 /* 4620 * The page moved before we got our hands on it. Drop 4621 * all the locks and try again. 4622 */ 4623 ASSERT((flags & HAC_PAGELOCK) != 0); 4624 sfmmu_mlist_exit(pml); 4625 SFMMU_HASH_UNLOCK(hmebp); 4626 page_unlock(pp); 4627 locked = 0; 4628 goto rehash; 4629 } 4630 4631 if (!VN_ISKAS(vp)) { 4632 /* 4633 * This is not a segkmem page but another page which 4634 * has been kernel mapped. 
4635 */ 4636 sfmmu_mlist_exit(pml); 4637 SFMMU_HASH_UNLOCK(hmebp); 4638 if (locked) 4639 page_unlock(pp); 4640 ASSERT(cookie == NULL); 4641 return; 4642 } 4643 4644 if (cookie != NULL) { 4645 pahmep = (struct pa_hment *)cookie; 4646 sfhmep = &pahmep->sfment; 4647 } else { 4648 for (sfhmep = pp->p_mapping; sfhmep != NULL; 4649 sfhmep = sfhmep->hme_next) { 4650 4651 /* 4652 * skip va<->pa mappings 4653 */ 4654 if (!IS_PAHME(sfhmep)) 4655 continue; 4656 4657 pahmep = sfhmep->hme_data; 4658 ASSERT(pahmep != NULL); 4659 4660 /* 4661 * if pa_hment matches, remove it 4662 */ 4663 if ((pahmep->pvt == pvt) && 4664 (pahmep->addr == vaddr) && 4665 (pahmep->len == len)) { 4666 break; 4667 } 4668 } 4669 } 4670 4671 if (sfhmep == NULL) { 4672 if (!panicstr) { 4673 panic("hat_delete_callback: pa_hment not found, pp %p", 4674 (void *)pp); 4675 } 4676 return; 4677 } 4678 4679 /* 4680 * Note: at this point a valid kernel mapping must still be 4681 * present on this page. 4682 */ 4683 pp->p_share--; 4684 if (pp->p_share <= 0) 4685 panic("hat_delete_callback: zero p_share"); 4686 4687 if (--pahmep->refcnt == 0) { 4688 if (pahmep->flags != 0) 4689 panic("hat_delete_callback: pa_hment is busy"); 4690 4691 /* 4692 * Remove sfhmep from the mapping list for the page. 4693 */ 4694 if (sfhmep->hme_prev) { 4695 sfhmep->hme_prev->hme_next = sfhmep->hme_next; 4696 } else { 4697 pp->p_mapping = sfhmep->hme_next; 4698 } 4699 4700 if (sfhmep->hme_next) 4701 sfhmep->hme_next->hme_prev = sfhmep->hme_prev; 4702 4703 sfmmu_mlist_exit(pml); 4704 SFMMU_HASH_UNLOCK(hmebp); 4705 4706 if (locked) 4707 page_unlock(pp); 4708 4709 kmem_cache_free(pa_hment_cache, pahmep); 4710 return; 4711 } 4712 4713 sfmmu_mlist_exit(pml); 4714 SFMMU_HASH_UNLOCK(hmebp); 4715 if (locked) 4716 page_unlock(pp); 4717 } 4718 4719 /* 4720 * hat_probe returns 1 if the translation for the address 'addr' is 4721 * loaded, zero otherwise. 4722 * 4723 * hat_probe should be used only for advisorary purposes because it may 4724 * occasionally return the wrong value. The implementation must guarantee that 4725 * returning the wrong value is a very rare event. hat_probe is used 4726 * to implement optimizations in the segment drivers. 
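 *
 * For example (illustrative only, not a requirement of any particular
 * segment driver), a driver might skip redundant fault-ahead work:
 *
 *	if (hat_probe(as->a_hat, addr) == 0)
 *		(void) SEGOP_FAULTA(seg, addr);     (prefault; purely a hint)
 *
 * Because the answer is advisory, a stale result costs at most a wasted or
 * missed optimization, never correctness.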
4727 * 4728 */ 4729 int 4730 hat_probe(struct hat *sfmmup, caddr_t addr) 4731 { 4732 pfn_t pfn; 4733 tte_t tte; 4734 4735 ASSERT(sfmmup != NULL); 4736 4737 ASSERT((sfmmup == ksfmmup) || AS_LOCK_HELD(sfmmup->sfmmu_as)); 4738 4739 if (sfmmup == ksfmmup) { 4740 while ((pfn = sfmmu_vatopfn(addr, sfmmup, &tte)) 4741 == PFN_SUSPENDED) { 4742 sfmmu_vatopfn_suspended(addr, sfmmup, &tte); 4743 } 4744 } else { 4745 pfn = sfmmu_uvatopfn(addr, sfmmup, NULL); 4746 } 4747 4748 if (pfn != PFN_INVALID) 4749 return (1); 4750 else 4751 return (0); 4752 } 4753 4754 ssize_t 4755 hat_getpagesize(struct hat *sfmmup, caddr_t addr) 4756 { 4757 tte_t tte; 4758 4759 if (sfmmup == ksfmmup) { 4760 if (sfmmu_vatopfn(addr, sfmmup, &tte) == PFN_INVALID) { 4761 return (-1); 4762 } 4763 } else { 4764 if (sfmmu_uvatopfn(addr, sfmmup, &tte) == PFN_INVALID) { 4765 return (-1); 4766 } 4767 } 4768 4769 ASSERT(TTE_IS_VALID(&tte)); 4770 return (TTEBYTES(TTE_CSZ(&tte))); 4771 } 4772 4773 uint_t 4774 hat_getattr(struct hat *sfmmup, caddr_t addr, uint_t *attr) 4775 { 4776 tte_t tte; 4777 4778 if (sfmmup == ksfmmup) { 4779 if (sfmmu_vatopfn(addr, sfmmup, &tte) == PFN_INVALID) { 4780 tte.ll = 0; 4781 } 4782 } else { 4783 if (sfmmu_uvatopfn(addr, sfmmup, &tte) == PFN_INVALID) { 4784 tte.ll = 0; 4785 } 4786 } 4787 if (TTE_IS_VALID(&tte)) { 4788 *attr = sfmmu_ptov_attr(&tte); 4789 return (0); 4790 } 4791 *attr = 0; 4792 return ((uint_t)0xffffffff); 4793 } 4794 4795 /* 4796 * Enables more attributes on specified address range (ie. logical OR) 4797 */ 4798 void 4799 hat_setattr(struct hat *hat, caddr_t addr, size_t len, uint_t attr) 4800 { 4801 ASSERT(hat->sfmmu_as != NULL); 4802 4803 sfmmu_chgattr(hat, addr, len, attr, SFMMU_SETATTR); 4804 } 4805 4806 /* 4807 * Assigns attributes to the specified address range. All the attributes 4808 * are specified. 4809 */ 4810 void 4811 hat_chgattr(struct hat *hat, caddr_t addr, size_t len, uint_t attr) 4812 { 4813 ASSERT(hat->sfmmu_as != NULL); 4814 4815 sfmmu_chgattr(hat, addr, len, attr, SFMMU_CHGATTR); 4816 } 4817 4818 /* 4819 * Remove attributes on the specified address range (ie. loginal NAND) 4820 */ 4821 void 4822 hat_clrattr(struct hat *hat, caddr_t addr, size_t len, uint_t attr) 4823 { 4824 ASSERT(hat->sfmmu_as != NULL); 4825 4826 sfmmu_chgattr(hat, addr, len, attr, SFMMU_CLRATTR); 4827 } 4828 4829 /* 4830 * Change attributes on an address range to that specified by attr and mode. 
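 *
 * For orientation, the three wrappers above map onto the mode argument as
 * follows (a summary of the code, not new behavior):
 *
 *	hat_setattr(hat, addr, len, attr)   mode = SFMMU_SETATTR  (OR in)
 *	hat_chgattr(hat, addr, len, attr)   mode = SFMMU_CHGATTR  (replace)
 *	hat_clrattr(hat, addr, len, attr)   mode = SFMMU_CLRATTR  (NAND out)
 *
 * e.g. hat_clrattr(hat, addr, len, PROT_WRITE) write-protects the range;
 * sfmmu_vtop_attr() additionally clears the hw modify bit in that case so
 * that subsequent writes fault and can be detected.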
4831 */ 4832 static void 4833 sfmmu_chgattr(struct hat *sfmmup, caddr_t addr, size_t len, uint_t attr, 4834 int mode) 4835 { 4836 struct hmehash_bucket *hmebp; 4837 hmeblk_tag hblktag; 4838 int hmeshift, hashno = 1; 4839 struct hme_blk *hmeblkp, *list = NULL; 4840 caddr_t endaddr; 4841 cpuset_t cpuset; 4842 demap_range_t dmr; 4843 4844 CPUSET_ZERO(cpuset); 4845 4846 ASSERT((sfmmup == ksfmmup) || AS_LOCK_HELD(sfmmup->sfmmu_as)); 4847 ASSERT((len & MMU_PAGEOFFSET) == 0); 4848 ASSERT(((uintptr_t)addr & MMU_PAGEOFFSET) == 0); 4849 4850 if ((attr & PROT_USER) && (mode != SFMMU_CLRATTR) && 4851 ((addr + len) > (caddr_t)USERLIMIT)) { 4852 panic("user addr %p in kernel space", 4853 (void *)addr); 4854 } 4855 4856 endaddr = addr + len; 4857 hblktag.htag_id = sfmmup; 4858 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 4859 DEMAP_RANGE_INIT(sfmmup, &dmr); 4860 4861 while (addr < endaddr) { 4862 hmeshift = HME_HASH_SHIFT(hashno); 4863 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 4864 hblktag.htag_rehash = hashno; 4865 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift); 4866 4867 SFMMU_HASH_LOCK(hmebp); 4868 4869 HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list); 4870 if (hmeblkp != NULL) { 4871 ASSERT(!hmeblkp->hblk_shared); 4872 /* 4873 * We've encountered a shadow hmeblk so skip the range 4874 * of the next smaller mapping size. 4875 */ 4876 if (hmeblkp->hblk_shw_bit) { 4877 ASSERT(sfmmup != ksfmmup); 4878 ASSERT(hashno > 1); 4879 addr = (caddr_t)P2END((uintptr_t)addr, 4880 TTEBYTES(hashno - 1)); 4881 } else { 4882 addr = sfmmu_hblk_chgattr(sfmmup, 4883 hmeblkp, addr, endaddr, &dmr, attr, mode); 4884 } 4885 SFMMU_HASH_UNLOCK(hmebp); 4886 hashno = 1; 4887 continue; 4888 } 4889 SFMMU_HASH_UNLOCK(hmebp); 4890 4891 if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) { 4892 /* 4893 * We have traversed the whole list and rehashed 4894 * if necessary without finding the address to chgattr. 4895 * This is ok, so we increment the address by the 4896 * smallest hmeblk range for kernel mappings or for 4897 * user mappings with no large pages, and the largest 4898 * hmeblk range, to account for shadow hmeblks, for 4899 * user mappings with large pages and continue. 4900 */ 4901 if (sfmmup == ksfmmup) 4902 addr = (caddr_t)P2END((uintptr_t)addr, 4903 TTEBYTES(1)); 4904 else 4905 addr = (caddr_t)P2END((uintptr_t)addr, 4906 TTEBYTES(hashno)); 4907 hashno = 1; 4908 } else { 4909 hashno++; 4910 } 4911 } 4912 4913 sfmmu_hblks_list_purge(&list, 0); 4914 DEMAP_RANGE_FLUSH(&dmr); 4915 cpuset = sfmmup->sfmmu_cpusran; 4916 xt_sync(cpuset); 4917 } 4918 4919 /* 4920 * This function chgattr on a range of addresses in an hmeblk. It returns the 4921 * next addres that needs to be chgattr. 4922 * It should be called with the hash lock held. 4923 * XXX It should be possible to optimize chgattr by not flushing every time but 4924 * on the other hand: 4925 * 1. do one flush crosscall. 4926 * 2. 
only flush if we are increasing permissions (make sure this will work) 4927 */ 4928 static caddr_t 4929 sfmmu_hblk_chgattr(struct hat *sfmmup, struct hme_blk *hmeblkp, caddr_t addr, 4930 caddr_t endaddr, demap_range_t *dmrp, uint_t attr, int mode) 4931 { 4932 tte_t tte, tteattr, tteflags, ttemod; 4933 struct sf_hment *sfhmep; 4934 int ttesz; 4935 struct page *pp = NULL; 4936 kmutex_t *pml, *pmtx; 4937 int ret; 4938 int use_demap_range; 4939 #if defined(SF_ERRATA_57) 4940 int check_exec; 4941 #endif 4942 4943 ASSERT(in_hblk_range(hmeblkp, addr)); 4944 ASSERT(hmeblkp->hblk_shw_bit == 0); 4945 ASSERT(!hmeblkp->hblk_shared); 4946 4947 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp)); 4948 ttesz = get_hblk_ttesz(hmeblkp); 4949 4950 /* 4951 * Flush the current demap region if addresses have been 4952 * skipped or the page size doesn't match. 4953 */ 4954 use_demap_range = (TTEBYTES(ttesz) == DEMAP_RANGE_PGSZ(dmrp)); 4955 if (use_demap_range) { 4956 DEMAP_RANGE_CONTINUE(dmrp, addr, endaddr); 4957 } else if (dmrp != NULL) { 4958 DEMAP_RANGE_FLUSH(dmrp); 4959 } 4960 4961 tteattr.ll = sfmmu_vtop_attr(attr, mode, &tteflags); 4962 #if defined(SF_ERRATA_57) 4963 check_exec = (sfmmup != ksfmmup) && 4964 AS_TYPE_64BIT(sfmmup->sfmmu_as) && 4965 TTE_IS_EXECUTABLE(&tteattr); 4966 #endif 4967 HBLKTOHME(sfhmep, hmeblkp, addr); 4968 while (addr < endaddr) { 4969 sfmmu_copytte(&sfhmep->hme_tte, &tte); 4970 if (TTE_IS_VALID(&tte)) { 4971 if ((tte.ll & tteflags.ll) == tteattr.ll) { 4972 /* 4973 * if the new attr is the same as old 4974 * continue 4975 */ 4976 goto next_addr; 4977 } 4978 if (!TTE_IS_WRITABLE(&tteattr)) { 4979 /* 4980 * make sure we clear hw modify bit if we 4981 * removing write protections 4982 */ 4983 tteflags.tte_intlo |= TTE_HWWR_INT; 4984 } 4985 4986 pml = NULL; 4987 pp = sfhmep->hme_page; 4988 if (pp) { 4989 pml = sfmmu_mlist_enter(pp); 4990 } 4991 4992 if (pp != sfhmep->hme_page) { 4993 /* 4994 * tte must have been unloaded. 4995 */ 4996 ASSERT(pml); 4997 sfmmu_mlist_exit(pml); 4998 continue; 4999 } 5000 5001 ASSERT(pp == NULL || sfmmu_mlist_held(pp)); 5002 5003 ttemod = tte; 5004 ttemod.ll = (ttemod.ll & ~tteflags.ll) | tteattr.ll; 5005 ASSERT(TTE_TO_TTEPFN(&ttemod) == TTE_TO_TTEPFN(&tte)); 5006 5007 #if defined(SF_ERRATA_57) 5008 if (check_exec && addr < errata57_limit) 5009 ttemod.tte_exec_perm = 0; 5010 #endif 5011 ret = sfmmu_modifytte_try(&tte, &ttemod, 5012 &sfhmep->hme_tte); 5013 5014 if (ret < 0) { 5015 /* tte changed underneath us */ 5016 if (pml) { 5017 sfmmu_mlist_exit(pml); 5018 } 5019 continue; 5020 } 5021 5022 if (tteflags.tte_intlo & TTE_HWWR_INT) { 5023 /* 5024 * need to sync if we are clearing modify bit. 5025 */ 5026 sfmmu_ttesync(sfmmup, addr, &tte, pp); 5027 } 5028 5029 if (pp && PP_ISRO(pp)) { 5030 if (tteattr.tte_intlo & TTE_WRPRM_INT) { 5031 pmtx = sfmmu_page_enter(pp); 5032 PP_CLRRO(pp); 5033 sfmmu_page_exit(pmtx); 5034 } 5035 } 5036 5037 if (ret > 0 && use_demap_range) { 5038 DEMAP_RANGE_MARKPG(dmrp, addr); 5039 } else if (ret > 0) { 5040 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0); 5041 } 5042 5043 if (pml) { 5044 sfmmu_mlist_exit(pml); 5045 } 5046 } 5047 next_addr: 5048 addr += TTEBYTES(ttesz); 5049 sfhmep++; 5050 DEMAP_RANGE_NEXTPG(dmrp); 5051 } 5052 return (addr); 5053 } 5054 5055 /* 5056 * This routine converts virtual attributes to physical ones. It will 5057 * update the tteflags field with the tte mask corresponding to the attributes 5058 * affected and it returns the new attributes. 
It will also clear the modify 5059 * bit if we are taking away write permission. This is necessary since the 5060 * modify bit is the hardware permission bit and we need to clear it in order 5061 * to detect write faults. 5062 */ 5063 static uint64_t 5064 sfmmu_vtop_attr(uint_t attr, int mode, tte_t *ttemaskp) 5065 { 5066 tte_t ttevalue; 5067 5068 ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR)); 5069 5070 switch (mode) { 5071 case SFMMU_CHGATTR: 5072 /* all attributes specified */ 5073 ttevalue.tte_inthi = MAKE_TTEATTR_INTHI(attr); 5074 ttevalue.tte_intlo = MAKE_TTEATTR_INTLO(attr); 5075 ttemaskp->tte_inthi = TTEINTHI_ATTR; 5076 ttemaskp->tte_intlo = TTEINTLO_ATTR; 5077 break; 5078 case SFMMU_SETATTR: 5079 ASSERT(!(attr & ~HAT_PROT_MASK)); 5080 ttemaskp->ll = 0; 5081 ttevalue.ll = 0; 5082 /* 5083 * a valid tte implies exec and read for sfmmu 5084 * so no need to do anything about them. 5085 * since priviledged access implies user access 5086 * PROT_USER doesn't make sense either. 5087 */ 5088 if (attr & PROT_WRITE) { 5089 ttemaskp->tte_intlo |= TTE_WRPRM_INT; 5090 ttevalue.tte_intlo |= TTE_WRPRM_INT; 5091 } 5092 break; 5093 case SFMMU_CLRATTR: 5094 /* attributes will be nand with current ones */ 5095 if (attr & ~(PROT_WRITE | PROT_USER)) { 5096 panic("sfmmu: attr %x not supported", attr); 5097 } 5098 ttemaskp->ll = 0; 5099 ttevalue.ll = 0; 5100 if (attr & PROT_WRITE) { 5101 /* clear both writable and modify bit */ 5102 ttemaskp->tte_intlo |= TTE_WRPRM_INT | TTE_HWWR_INT; 5103 } 5104 if (attr & PROT_USER) { 5105 ttemaskp->tte_intlo |= TTE_PRIV_INT; 5106 ttevalue.tte_intlo |= TTE_PRIV_INT; 5107 } 5108 break; 5109 default: 5110 panic("sfmmu_vtop_attr: bad mode %x", mode); 5111 } 5112 ASSERT(TTE_TO_TTEPFN(&ttevalue) == 0); 5113 return (ttevalue.ll); 5114 } 5115 5116 static uint_t 5117 sfmmu_ptov_attr(tte_t *ttep) 5118 { 5119 uint_t attr; 5120 5121 ASSERT(TTE_IS_VALID(ttep)); 5122 5123 attr = PROT_READ; 5124 5125 if (TTE_IS_WRITABLE(ttep)) { 5126 attr |= PROT_WRITE; 5127 } 5128 if (TTE_IS_EXECUTABLE(ttep)) { 5129 attr |= PROT_EXEC; 5130 } 5131 if (!TTE_IS_PRIVILEGED(ttep)) { 5132 attr |= PROT_USER; 5133 } 5134 if (TTE_IS_NFO(ttep)) { 5135 attr |= HAT_NOFAULT; 5136 } 5137 if (TTE_IS_NOSYNC(ttep)) { 5138 attr |= HAT_NOSYNC; 5139 } 5140 if (TTE_IS_SIDEFFECT(ttep)) { 5141 attr |= SFMMU_SIDEFFECT; 5142 } 5143 if (!TTE_IS_VCACHEABLE(ttep)) { 5144 attr |= SFMMU_UNCACHEVTTE; 5145 } 5146 if (!TTE_IS_PCACHEABLE(ttep)) { 5147 attr |= SFMMU_UNCACHEPTTE; 5148 } 5149 return (attr); 5150 } 5151 5152 /* 5153 * hat_chgprot is a deprecated hat call. New segment drivers 5154 * should store all attributes and use hat_*attr calls. 5155 * 5156 * Change the protections in the virtual address range 5157 * given to the specified virtual protection. If vprot is ~PROT_WRITE, 5158 * then remove write permission, leaving the other 5159 * permissions unchanged. If vprot is ~PROT_USER, remove user permissions. 
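 *
 * For example (illustrative), the legacy call
 *
 *	hat_chgprot(hat, addr, len, (uint_t)~PROT_WRITE);
 *
 * removes write permission from the range; with the attribute interfaces
 * the equivalent effect is
 *
 *	hat_clrattr(hat, addr, len, PROT_WRITE);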
5160 * 5161 */ 5162 void 5163 hat_chgprot(struct hat *sfmmup, caddr_t addr, size_t len, uint_t vprot) 5164 { 5165 struct hmehash_bucket *hmebp; 5166 hmeblk_tag hblktag; 5167 int hmeshift, hashno = 1; 5168 struct hme_blk *hmeblkp, *list = NULL; 5169 caddr_t endaddr; 5170 cpuset_t cpuset; 5171 demap_range_t dmr; 5172 5173 ASSERT((len & MMU_PAGEOFFSET) == 0); 5174 ASSERT(((uintptr_t)addr & MMU_PAGEOFFSET) == 0); 5175 5176 ASSERT(sfmmup->sfmmu_as != NULL); 5177 5178 CPUSET_ZERO(cpuset); 5179 5180 if ((vprot != (uint_t)~PROT_WRITE) && (vprot & PROT_USER) && 5181 ((addr + len) > (caddr_t)USERLIMIT)) { 5182 panic("user addr %p vprot %x in kernel space", 5183 (void *)addr, vprot); 5184 } 5185 endaddr = addr + len; 5186 hblktag.htag_id = sfmmup; 5187 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 5188 DEMAP_RANGE_INIT(sfmmup, &dmr); 5189 5190 while (addr < endaddr) { 5191 hmeshift = HME_HASH_SHIFT(hashno); 5192 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 5193 hblktag.htag_rehash = hashno; 5194 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift); 5195 5196 SFMMU_HASH_LOCK(hmebp); 5197 5198 HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list); 5199 if (hmeblkp != NULL) { 5200 ASSERT(!hmeblkp->hblk_shared); 5201 /* 5202 * We've encountered a shadow hmeblk so skip the range 5203 * of the next smaller mapping size. 5204 */ 5205 if (hmeblkp->hblk_shw_bit) { 5206 ASSERT(sfmmup != ksfmmup); 5207 ASSERT(hashno > 1); 5208 addr = (caddr_t)P2END((uintptr_t)addr, 5209 TTEBYTES(hashno - 1)); 5210 } else { 5211 addr = sfmmu_hblk_chgprot(sfmmup, hmeblkp, 5212 addr, endaddr, &dmr, vprot); 5213 } 5214 SFMMU_HASH_UNLOCK(hmebp); 5215 hashno = 1; 5216 continue; 5217 } 5218 SFMMU_HASH_UNLOCK(hmebp); 5219 5220 if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) { 5221 /* 5222 * We have traversed the whole list and rehashed 5223 * if necessary without finding the address to chgprot. 5224 * This is ok so we increment the address by the 5225 * smallest hmeblk range for kernel mappings and the 5226 * largest hmeblk range, to account for shadow hmeblks, 5227 * for user mappings and continue. 5228 */ 5229 if (sfmmup == ksfmmup) 5230 addr = (caddr_t)P2END((uintptr_t)addr, 5231 TTEBYTES(1)); 5232 else 5233 addr = (caddr_t)P2END((uintptr_t)addr, 5234 TTEBYTES(hashno)); 5235 hashno = 1; 5236 } else { 5237 hashno++; 5238 } 5239 } 5240 5241 sfmmu_hblks_list_purge(&list, 0); 5242 DEMAP_RANGE_FLUSH(&dmr); 5243 cpuset = sfmmup->sfmmu_cpusran; 5244 xt_sync(cpuset); 5245 } 5246 5247 /* 5248 * This function chgprots a range of addresses in an hmeblk. It returns the 5249 * next addres that needs to be chgprot. 5250 * It should be called with the hash lock held. 5251 * XXX It shold be possible to optimize chgprot by not flushing every time but 5252 * on the other hand: 5253 * 1. do one flush crosscall. 5254 * 2. 
only flush if we are increasing permissions (make sure this will work) 5255 */ 5256 static caddr_t 5257 sfmmu_hblk_chgprot(sfmmu_t *sfmmup, struct hme_blk *hmeblkp, caddr_t addr, 5258 caddr_t endaddr, demap_range_t *dmrp, uint_t vprot) 5259 { 5260 uint_t pprot; 5261 tte_t tte, ttemod; 5262 struct sf_hment *sfhmep; 5263 uint_t tteflags; 5264 int ttesz; 5265 struct page *pp = NULL; 5266 kmutex_t *pml, *pmtx; 5267 int ret; 5268 int use_demap_range; 5269 #if defined(SF_ERRATA_57) 5270 int check_exec; 5271 #endif 5272 5273 ASSERT(in_hblk_range(hmeblkp, addr)); 5274 ASSERT(hmeblkp->hblk_shw_bit == 0); 5275 ASSERT(!hmeblkp->hblk_shared); 5276 5277 #ifdef DEBUG 5278 if (get_hblk_ttesz(hmeblkp) != TTE8K && 5279 (endaddr < get_hblk_endaddr(hmeblkp))) { 5280 panic("sfmmu_hblk_chgprot: partial chgprot of large page"); 5281 } 5282 #endif /* DEBUG */ 5283 5284 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp)); 5285 ttesz = get_hblk_ttesz(hmeblkp); 5286 5287 pprot = sfmmu_vtop_prot(vprot, &tteflags); 5288 #if defined(SF_ERRATA_57) 5289 check_exec = (sfmmup != ksfmmup) && 5290 AS_TYPE_64BIT(sfmmup->sfmmu_as) && 5291 ((vprot & PROT_EXEC) == PROT_EXEC); 5292 #endif 5293 HBLKTOHME(sfhmep, hmeblkp, addr); 5294 5295 /* 5296 * Flush the current demap region if addresses have been 5297 * skipped or the page size doesn't match. 5298 */ 5299 use_demap_range = (TTEBYTES(ttesz) == MMU_PAGESIZE); 5300 if (use_demap_range) { 5301 DEMAP_RANGE_CONTINUE(dmrp, addr, endaddr); 5302 } else if (dmrp != NULL) { 5303 DEMAP_RANGE_FLUSH(dmrp); 5304 } 5305 5306 while (addr < endaddr) { 5307 sfmmu_copytte(&sfhmep->hme_tte, &tte); 5308 if (TTE_IS_VALID(&tte)) { 5309 if (TTE_GET_LOFLAGS(&tte, tteflags) == pprot) { 5310 /* 5311 * if the new protection is the same as old 5312 * continue 5313 */ 5314 goto next_addr; 5315 } 5316 pml = NULL; 5317 pp = sfhmep->hme_page; 5318 if (pp) { 5319 pml = sfmmu_mlist_enter(pp); 5320 } 5321 if (pp != sfhmep->hme_page) { 5322 /* 5323 * tte most have been unloaded 5324 * underneath us. Recheck 5325 */ 5326 ASSERT(pml); 5327 sfmmu_mlist_exit(pml); 5328 continue; 5329 } 5330 5331 ASSERT(pp == NULL || sfmmu_mlist_held(pp)); 5332 5333 ttemod = tte; 5334 TTE_SET_LOFLAGS(&ttemod, tteflags, pprot); 5335 #if defined(SF_ERRATA_57) 5336 if (check_exec && addr < errata57_limit) 5337 ttemod.tte_exec_perm = 0; 5338 #endif 5339 ret = sfmmu_modifytte_try(&tte, &ttemod, 5340 &sfhmep->hme_tte); 5341 5342 if (ret < 0) { 5343 /* tte changed underneath us */ 5344 if (pml) { 5345 sfmmu_mlist_exit(pml); 5346 } 5347 continue; 5348 } 5349 5350 if (tteflags & TTE_HWWR_INT) { 5351 /* 5352 * need to sync if we are clearing modify bit. 5353 */ 5354 sfmmu_ttesync(sfmmup, addr, &tte, pp); 5355 } 5356 5357 if (pp && PP_ISRO(pp)) { 5358 if (pprot & TTE_WRPRM_INT) { 5359 pmtx = sfmmu_page_enter(pp); 5360 PP_CLRRO(pp); 5361 sfmmu_page_exit(pmtx); 5362 } 5363 } 5364 5365 if (ret > 0 && use_demap_range) { 5366 DEMAP_RANGE_MARKPG(dmrp, addr); 5367 } else if (ret > 0) { 5368 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0); 5369 } 5370 5371 if (pml) { 5372 sfmmu_mlist_exit(pml); 5373 } 5374 } 5375 next_addr: 5376 addr += TTEBYTES(ttesz); 5377 sfhmep++; 5378 DEMAP_RANGE_NEXTPG(dmrp); 5379 } 5380 return (addr); 5381 } 5382 5383 /* 5384 * This routine is deprecated and should only be used by hat_chgprot. 5385 * The correct routine is sfmmu_vtop_attr. 5386 * This routine converts virtual page protections to physical ones. 
It will 5387 * update the tteflags field with the tte mask corresponding to the protections 5388 * affected and it returns the new protections. It will also clear the modify 5389 * bit if we are taking away write permission. This is necessary since the 5390 * modify bit is the hardware permission bit and we need to clear it in order 5391 * to detect write faults. 5392 * It accepts the following special protections: 5393 * ~PROT_WRITE = remove write permissions. 5394 * ~PROT_USER = remove user permissions. 5395 */ 5396 static uint_t 5397 sfmmu_vtop_prot(uint_t vprot, uint_t *tteflagsp) 5398 { 5399 if (vprot == (uint_t)~PROT_WRITE) { 5400 *tteflagsp = TTE_WRPRM_INT | TTE_HWWR_INT; 5401 return (0); /* will cause wrprm to be cleared */ 5402 } 5403 if (vprot == (uint_t)~PROT_USER) { 5404 *tteflagsp = TTE_PRIV_INT; 5405 return (0); /* will cause privprm to be cleared */ 5406 } 5407 if ((vprot == 0) || (vprot == PROT_USER) || 5408 ((vprot & PROT_ALL) != vprot)) { 5409 panic("sfmmu_vtop_prot -- bad prot %x", vprot); 5410 } 5411 5412 switch (vprot) { 5413 case (PROT_READ): 5414 case (PROT_EXEC): 5415 case (PROT_EXEC | PROT_READ): 5416 *tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT | TTE_HWWR_INT; 5417 return (TTE_PRIV_INT); /* set prv and clr wrt */ 5418 case (PROT_WRITE): 5419 case (PROT_WRITE | PROT_READ): 5420 case (PROT_EXEC | PROT_WRITE): 5421 case (PROT_EXEC | PROT_WRITE | PROT_READ): 5422 *tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT; 5423 return (TTE_PRIV_INT | TTE_WRPRM_INT); /* set prv and wrt */ 5424 case (PROT_USER | PROT_READ): 5425 case (PROT_USER | PROT_EXEC): 5426 case (PROT_USER | PROT_EXEC | PROT_READ): 5427 *tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT | TTE_HWWR_INT; 5428 return (0); /* clr prv and wrt */ 5429 case (PROT_USER | PROT_WRITE): 5430 case (PROT_USER | PROT_WRITE | PROT_READ): 5431 case (PROT_USER | PROT_EXEC | PROT_WRITE): 5432 case (PROT_USER | PROT_EXEC | PROT_WRITE | PROT_READ): 5433 *tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT; 5434 return (TTE_WRPRM_INT); /* clr prv and set wrt */ 5435 default: 5436 panic("sfmmu_vtop_prot -- bad prot %x", vprot); 5437 } 5438 return (0); 5439 } 5440 5441 /* 5442 * Alternate unload for very large virtual ranges. With a true 64 bit VA, 5443 * the normal algorithm would take too long for a very large VA range with 5444 * few real mappings. This routine just walks thru all HMEs in the global 5445 * hash table to find and remove mappings. 5446 */ 5447 static void 5448 hat_unload_large_virtual( 5449 struct hat *sfmmup, 5450 caddr_t startaddr, 5451 size_t len, 5452 uint_t flags, 5453 hat_callback_t *callback) 5454 { 5455 struct hmehash_bucket *hmebp; 5456 struct hme_blk *hmeblkp; 5457 struct hme_blk *pr_hblk = NULL; 5458 struct hme_blk *nx_hblk; 5459 struct hme_blk *list = NULL; 5460 int i; 5461 demap_range_t dmr, *dmrp; 5462 cpuset_t cpuset; 5463 caddr_t endaddr = startaddr + len; 5464 caddr_t sa; 5465 caddr_t ea; 5466 caddr_t cb_sa[MAX_CB_ADDR]; 5467 caddr_t cb_ea[MAX_CB_ADDR]; 5468 int addr_cnt = 0; 5469 int a = 0; 5470 5471 if (sfmmup->sfmmu_free) { 5472 dmrp = NULL; 5473 } else { 5474 dmrp = &dmr; 5475 DEMAP_RANGE_INIT(sfmmup, dmrp); 5476 } 5477 5478 /* 5479 * Loop through all the hash buckets of HME blocks looking for matches. 
5480 */ 5481 for (i = 0; i <= UHMEHASH_SZ; i++) { 5482 hmebp = &uhme_hash[i]; 5483 SFMMU_HASH_LOCK(hmebp); 5484 hmeblkp = hmebp->hmeblkp; 5485 pr_hblk = NULL; 5486 while (hmeblkp) { 5487 nx_hblk = hmeblkp->hblk_next; 5488 5489 /* 5490 * skip if not this context, if a shadow block or 5491 * if the mapping is not in the requested range 5492 */ 5493 if (hmeblkp->hblk_tag.htag_id != sfmmup || 5494 hmeblkp->hblk_shw_bit || 5495 (sa = (caddr_t)get_hblk_base(hmeblkp)) >= endaddr || 5496 (ea = get_hblk_endaddr(hmeblkp)) <= startaddr) { 5497 pr_hblk = hmeblkp; 5498 goto next_block; 5499 } 5500 5501 ASSERT(!hmeblkp->hblk_shared); 5502 /* 5503 * unload if there are any current valid mappings 5504 */ 5505 if (hmeblkp->hblk_vcnt != 0 || 5506 hmeblkp->hblk_hmecnt != 0) 5507 (void) sfmmu_hblk_unload(sfmmup, hmeblkp, 5508 sa, ea, dmrp, flags); 5509 5510 /* 5511 * on unmap we also release the HME block itself, once 5512 * all mappings are gone. 5513 */ 5514 if ((flags & HAT_UNLOAD_UNMAP) != 0 && 5515 !hmeblkp->hblk_vcnt && 5516 !hmeblkp->hblk_hmecnt) { 5517 ASSERT(!hmeblkp->hblk_lckcnt); 5518 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, 5519 &list, 0); 5520 } else { 5521 pr_hblk = hmeblkp; 5522 } 5523 5524 if (callback == NULL) 5525 goto next_block; 5526 5527 /* 5528 * HME blocks may span more than one page, but we may be 5529 * unmapping only one page, so check for a smaller range 5530 * for the callback 5531 */ 5532 if (sa < startaddr) 5533 sa = startaddr; 5534 if (--ea > endaddr) 5535 ea = endaddr - 1; 5536 5537 cb_sa[addr_cnt] = sa; 5538 cb_ea[addr_cnt] = ea; 5539 if (++addr_cnt == MAX_CB_ADDR) { 5540 if (dmrp != NULL) { 5541 DEMAP_RANGE_FLUSH(dmrp); 5542 cpuset = sfmmup->sfmmu_cpusran; 5543 xt_sync(cpuset); 5544 } 5545 5546 for (a = 0; a < MAX_CB_ADDR; ++a) { 5547 callback->hcb_start_addr = cb_sa[a]; 5548 callback->hcb_end_addr = cb_ea[a]; 5549 callback->hcb_function(callback); 5550 } 5551 addr_cnt = 0; 5552 } 5553 5554 next_block: 5555 hmeblkp = nx_hblk; 5556 } 5557 SFMMU_HASH_UNLOCK(hmebp); 5558 } 5559 5560 sfmmu_hblks_list_purge(&list, 0); 5561 if (dmrp != NULL) { 5562 DEMAP_RANGE_FLUSH(dmrp); 5563 cpuset = sfmmup->sfmmu_cpusran; 5564 xt_sync(cpuset); 5565 } 5566 5567 for (a = 0; a < addr_cnt; ++a) { 5568 callback->hcb_start_addr = cb_sa[a]; 5569 callback->hcb_end_addr = cb_ea[a]; 5570 callback->hcb_function(callback); 5571 } 5572 5573 /* 5574 * Check TSB and TLB page sizes if the process isn't exiting. 5575 */ 5576 if (!sfmmup->sfmmu_free) 5577 sfmmu_check_page_sizes(sfmmup, 0); 5578 } 5579 5580 /* 5581 * Unload all the mappings in the range [addr..addr+len). addr and len must 5582 * be MMU_PAGESIZE aligned. 
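 *
 * A sketch of how a caller learns exactly which ranges were unloaded (the
 * handler name is hypothetical; the field names follow the hat_callback_t
 * usage in the code below):
 *
 *	hat_callback_t cb;
 *
 *	cb.hcb_function = mydrv_unload_notify;  (reads hcb_start_addr and
 *	                                        hcb_end_addr on each call)
 *	hat_unload_callback(hat, addr, len, HAT_UNLOAD_UNMAP, &cb);
 *
 * hat_unload() further below is simply this call with a NULL callback.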
5583 */ 5584 5585 extern struct seg *segkmap; 5586 #define ISSEGKMAP(sfmmup, addr) (sfmmup == ksfmmup && \ 5587 segkmap->s_base <= (addr) && (addr) < (segkmap->s_base + segkmap->s_size)) 5588 5589 5590 void 5591 hat_unload_callback( 5592 struct hat *sfmmup, 5593 caddr_t addr, 5594 size_t len, 5595 uint_t flags, 5596 hat_callback_t *callback) 5597 { 5598 struct hmehash_bucket *hmebp; 5599 hmeblk_tag hblktag; 5600 int hmeshift, hashno, iskernel; 5601 struct hme_blk *hmeblkp, *pr_hblk, *list = NULL; 5602 caddr_t endaddr; 5603 cpuset_t cpuset; 5604 int addr_count = 0; 5605 int a; 5606 caddr_t cb_start_addr[MAX_CB_ADDR]; 5607 caddr_t cb_end_addr[MAX_CB_ADDR]; 5608 int issegkmap = ISSEGKMAP(sfmmup, addr); 5609 demap_range_t dmr, *dmrp; 5610 5611 ASSERT(sfmmup->sfmmu_as != NULL); 5612 5613 ASSERT((sfmmup == ksfmmup) || (flags & HAT_UNLOAD_OTHER) || \ 5614 AS_LOCK_HELD(sfmmup->sfmmu_as)); 5615 5616 ASSERT(sfmmup != NULL); 5617 ASSERT((len & MMU_PAGEOFFSET) == 0); 5618 ASSERT(!((uintptr_t)addr & MMU_PAGEOFFSET)); 5619 5620 /* 5621 * Probing through a large VA range (say 63 bits) will be slow, even 5622 * at 4 Meg steps between the probes. So, when the virtual address range 5623 * is very large, search the HME entries for what to unload. 5624 * 5625 * len >> TTE_PAGE_SHIFT(TTE4M) is the # of 4Meg probes we'd need 5626 * 5627 * UHMEHASH_SZ is number of hash buckets to examine 5628 * 5629 */ 5630 if (sfmmup != KHATID && (len >> TTE_PAGE_SHIFT(TTE4M)) > UHMEHASH_SZ) { 5631 hat_unload_large_virtual(sfmmup, addr, len, flags, callback); 5632 return; 5633 } 5634 5635 CPUSET_ZERO(cpuset); 5636 5637 /* 5638 * If the process is exiting, we can save a lot of fuss since 5639 * we'll flush the TLB when we free the ctx anyway. 5640 */ 5641 if (sfmmup->sfmmu_free) { 5642 dmrp = NULL; 5643 } else { 5644 dmrp = &dmr; 5645 DEMAP_RANGE_INIT(sfmmup, dmrp); 5646 } 5647 5648 endaddr = addr + len; 5649 hblktag.htag_id = sfmmup; 5650 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 5651 5652 /* 5653 * It is likely for the vm to call unload over a wide range of 5654 * addresses that are actually very sparsely populated by 5655 * translations. In order to speed this up the sfmmu hat supports 5656 * the concept of shadow hmeblks. Dummy large page hmeblks that 5657 * correspond to actual small translations are allocated at tteload 5658 * time and are referred to as shadow hmeblks. Now, during unload 5659 * time, we first check if we have a shadow hmeblk for that 5660 * translation. The absence of one means the corresponding address 5661 * range is empty and can be skipped. 5662 * 5663 * The kernel is an exception to above statement and that is why 5664 * we don't use shadow hmeblks and hash starting from the smallest 5665 * page size. 5666 */ 5667 if (sfmmup == KHATID) { 5668 iskernel = 1; 5669 hashno = TTE64K; 5670 } else { 5671 iskernel = 0; 5672 if (mmu_page_sizes == max_mmu_page_sizes) { 5673 hashno = TTE256M; 5674 } else { 5675 hashno = TTE4M; 5676 } 5677 } 5678 while (addr < endaddr) { 5679 hmeshift = HME_HASH_SHIFT(hashno); 5680 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 5681 hblktag.htag_rehash = hashno; 5682 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift); 5683 5684 SFMMU_HASH_LOCK(hmebp); 5685 5686 HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, &list); 5687 if (hmeblkp == NULL) { 5688 /* 5689 * didn't find an hmeblk. skip the appropiate 5690 * address range. 
5691 */ 5692 SFMMU_HASH_UNLOCK(hmebp); 5693 if (iskernel) { 5694 if (hashno < mmu_hashcnt) { 5695 hashno++; 5696 continue; 5697 } else { 5698 hashno = TTE64K; 5699 addr = (caddr_t)roundup((uintptr_t)addr 5700 + 1, MMU_PAGESIZE64K); 5701 continue; 5702 } 5703 } 5704 addr = (caddr_t)roundup((uintptr_t)addr + 1, 5705 (1 << hmeshift)); 5706 if ((uintptr_t)addr & MMU_PAGEOFFSET512K) { 5707 ASSERT(hashno == TTE64K); 5708 continue; 5709 } 5710 if ((uintptr_t)addr & MMU_PAGEOFFSET4M) { 5711 hashno = TTE512K; 5712 continue; 5713 } 5714 if (mmu_page_sizes == max_mmu_page_sizes) { 5715 if ((uintptr_t)addr & MMU_PAGEOFFSET32M) { 5716 hashno = TTE4M; 5717 continue; 5718 } 5719 if ((uintptr_t)addr & MMU_PAGEOFFSET256M) { 5720 hashno = TTE32M; 5721 continue; 5722 } 5723 hashno = TTE256M; 5724 continue; 5725 } else { 5726 hashno = TTE4M; 5727 continue; 5728 } 5729 } 5730 ASSERT(hmeblkp); 5731 ASSERT(!hmeblkp->hblk_shared); 5732 if (!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) { 5733 /* 5734 * If the valid count is zero we can skip the range 5735 * mapped by this hmeblk. 5736 * We free hblks in the case of HAT_UNMAP. HAT_UNMAP 5737 * is used by segment drivers as a hint 5738 * that the mapping resource won't be used any longer. 5739 * The best example of this is during exit(). 5740 */ 5741 addr = (caddr_t)roundup((uintptr_t)addr + 1, 5742 get_hblk_span(hmeblkp)); 5743 if ((flags & HAT_UNLOAD_UNMAP) || 5744 (iskernel && !issegkmap)) { 5745 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, 5746 &list, 0); 5747 } 5748 SFMMU_HASH_UNLOCK(hmebp); 5749 5750 if (iskernel) { 5751 hashno = TTE64K; 5752 continue; 5753 } 5754 if ((uintptr_t)addr & MMU_PAGEOFFSET512K) { 5755 ASSERT(hashno == TTE64K); 5756 continue; 5757 } 5758 if ((uintptr_t)addr & MMU_PAGEOFFSET4M) { 5759 hashno = TTE512K; 5760 continue; 5761 } 5762 if (mmu_page_sizes == max_mmu_page_sizes) { 5763 if ((uintptr_t)addr & MMU_PAGEOFFSET32M) { 5764 hashno = TTE4M; 5765 continue; 5766 } 5767 if ((uintptr_t)addr & MMU_PAGEOFFSET256M) { 5768 hashno = TTE32M; 5769 continue; 5770 } 5771 hashno = TTE256M; 5772 continue; 5773 } else { 5774 hashno = TTE4M; 5775 continue; 5776 } 5777 } 5778 if (hmeblkp->hblk_shw_bit) { 5779 /* 5780 * If we encounter a shadow hmeblk we know there is 5781 * smaller sized hmeblks mapping the same address space. 5782 * Decrement the hash size and rehash. 5783 */ 5784 ASSERT(sfmmup != KHATID); 5785 hashno--; 5786 SFMMU_HASH_UNLOCK(hmebp); 5787 continue; 5788 } 5789 5790 /* 5791 * track callback address ranges. 5792 * only start a new range when it's not contiguous 5793 */ 5794 if (callback != NULL) { 5795 if (addr_count > 0 && 5796 addr == cb_end_addr[addr_count - 1]) 5797 --addr_count; 5798 else 5799 cb_start_addr[addr_count] = addr; 5800 } 5801 5802 addr = sfmmu_hblk_unload(sfmmup, hmeblkp, addr, endaddr, 5803 dmrp, flags); 5804 5805 if (callback != NULL) 5806 cb_end_addr[addr_count++] = addr; 5807 5808 if (((flags & HAT_UNLOAD_UNMAP) || (iskernel && !issegkmap)) && 5809 !hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) { 5810 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, &list, 0); 5811 } 5812 SFMMU_HASH_UNLOCK(hmebp); 5813 5814 /* 5815 * Notify our caller as to exactly which pages 5816 * have been unloaded. We do these in clumps, 5817 * to minimize the number of xt_sync()s that need to occur. 
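 *
 * Up to MAX_CB_ADDR ranges are buffered in cb_start_addr[] and
 * cb_end_addr[]; a range that begins where the previous one ended is
 * merged into it (addr_count is backed up above), and each full batch
 * is pushed out after a single DEMAP_RANGE_FLUSH()/xt_sync() pair.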
5818 */ 5819 if (callback != NULL && addr_count == MAX_CB_ADDR) { 5820 if (dmrp != NULL) { 5821 DEMAP_RANGE_FLUSH(dmrp); 5822 cpuset = sfmmup->sfmmu_cpusran; 5823 xt_sync(cpuset); 5824 } 5825 5826 for (a = 0; a < MAX_CB_ADDR; ++a) { 5827 callback->hcb_start_addr = cb_start_addr[a]; 5828 callback->hcb_end_addr = cb_end_addr[a]; 5829 callback->hcb_function(callback); 5830 } 5831 addr_count = 0; 5832 } 5833 if (iskernel) { 5834 hashno = TTE64K; 5835 continue; 5836 } 5837 if ((uintptr_t)addr & MMU_PAGEOFFSET512K) { 5838 ASSERT(hashno == TTE64K); 5839 continue; 5840 } 5841 if ((uintptr_t)addr & MMU_PAGEOFFSET4M) { 5842 hashno = TTE512K; 5843 continue; 5844 } 5845 if (mmu_page_sizes == max_mmu_page_sizes) { 5846 if ((uintptr_t)addr & MMU_PAGEOFFSET32M) { 5847 hashno = TTE4M; 5848 continue; 5849 } 5850 if ((uintptr_t)addr & MMU_PAGEOFFSET256M) { 5851 hashno = TTE32M; 5852 continue; 5853 } 5854 hashno = TTE256M; 5855 } else { 5856 hashno = TTE4M; 5857 } 5858 } 5859 5860 sfmmu_hblks_list_purge(&list, 0); 5861 if (dmrp != NULL) { 5862 DEMAP_RANGE_FLUSH(dmrp); 5863 cpuset = sfmmup->sfmmu_cpusran; 5864 xt_sync(cpuset); 5865 } 5866 if (callback && addr_count != 0) { 5867 for (a = 0; a < addr_count; ++a) { 5868 callback->hcb_start_addr = cb_start_addr[a]; 5869 callback->hcb_end_addr = cb_end_addr[a]; 5870 callback->hcb_function(callback); 5871 } 5872 } 5873 5874 /* 5875 * Check TSB and TLB page sizes if the process isn't exiting. 5876 */ 5877 if (!sfmmup->sfmmu_free) 5878 sfmmu_check_page_sizes(sfmmup, 0); 5879 } 5880 5881 /* 5882 * Unload all the mappings in the range [addr..addr+len). addr and len must 5883 * be MMU_PAGESIZE aligned. 5884 */ 5885 void 5886 hat_unload(struct hat *sfmmup, caddr_t addr, size_t len, uint_t flags) 5887 { 5888 hat_unload_callback(sfmmup, addr, len, flags, NULL); 5889 } 5890 5891 5892 /* 5893 * Find the largest mapping size for this page. 5894 */ 5895 int 5896 fnd_mapping_sz(page_t *pp) 5897 { 5898 int sz; 5899 int p_index; 5900 5901 p_index = PP_MAPINDEX(pp); 5902 5903 sz = 0; 5904 p_index >>= 1; /* don't care about 8K bit */ 5905 for (; p_index; p_index >>= 1) { 5906 sz++; 5907 } 5908 5909 return (sz); 5910 } 5911 5912 /* 5913 * This function unloads a range of addresses for an hmeblk. 5914 * It returns the next address to be unloaded. 5915 * It should be called with the hash lock held. 
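 *
 * A hmeblk holding a large page must be covered in full: a partial
 * unload of a large page panics under DEBUG, and the address returned
 * never goes past get_hblk_endaddr(hmeblkp).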
5916 */ 5917 static caddr_t 5918 sfmmu_hblk_unload(struct hat *sfmmup, struct hme_blk *hmeblkp, caddr_t addr, 5919 caddr_t endaddr, demap_range_t *dmrp, uint_t flags) 5920 { 5921 tte_t tte, ttemod; 5922 struct sf_hment *sfhmep; 5923 int ttesz; 5924 long ttecnt; 5925 page_t *pp; 5926 kmutex_t *pml; 5927 int ret; 5928 int use_demap_range; 5929 5930 ASSERT(in_hblk_range(hmeblkp, addr)); 5931 ASSERT(!hmeblkp->hblk_shw_bit); 5932 ASSERT(sfmmup != NULL || hmeblkp->hblk_shared); 5933 ASSERT(sfmmup == NULL || !hmeblkp->hblk_shared); 5934 ASSERT(dmrp == NULL || !hmeblkp->hblk_shared); 5935 5936 #ifdef DEBUG 5937 if (get_hblk_ttesz(hmeblkp) != TTE8K && 5938 (endaddr < get_hblk_endaddr(hmeblkp))) { 5939 panic("sfmmu_hblk_unload: partial unload of large page"); 5940 } 5941 #endif /* DEBUG */ 5942 5943 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp)); 5944 ttesz = get_hblk_ttesz(hmeblkp); 5945 5946 use_demap_range = ((dmrp == NULL) || 5947 (TTEBYTES(ttesz) == DEMAP_RANGE_PGSZ(dmrp))); 5948 5949 if (use_demap_range) { 5950 DEMAP_RANGE_CONTINUE(dmrp, addr, endaddr); 5951 } else if (dmrp != NULL) { 5952 DEMAP_RANGE_FLUSH(dmrp); 5953 } 5954 ttecnt = 0; 5955 HBLKTOHME(sfhmep, hmeblkp, addr); 5956 5957 while (addr < endaddr) { 5958 pml = NULL; 5959 sfmmu_copytte(&sfhmep->hme_tte, &tte); 5960 if (TTE_IS_VALID(&tte)) { 5961 pp = sfhmep->hme_page; 5962 if (pp != NULL) { 5963 pml = sfmmu_mlist_enter(pp); 5964 } 5965 5966 /* 5967 * Verify if hme still points to 'pp' now that 5968 * we have p_mapping lock. 5969 */ 5970 if (sfhmep->hme_page != pp) { 5971 if (pp != NULL && sfhmep->hme_page != NULL) { 5972 ASSERT(pml != NULL); 5973 sfmmu_mlist_exit(pml); 5974 /* Re-start this iteration. */ 5975 continue; 5976 } 5977 ASSERT((pp != NULL) && 5978 (sfhmep->hme_page == NULL)); 5979 goto tte_unloaded; 5980 } 5981 5982 /* 5983 * This point on we have both HASH and p_mapping 5984 * lock. 5985 */ 5986 ASSERT(pp == sfhmep->hme_page); 5987 ASSERT(pp == NULL || sfmmu_mlist_held(pp)); 5988 5989 /* 5990 * We need to loop on modify tte because it is 5991 * possible for pagesync to come along and 5992 * change the software bits beneath us. 5993 * 5994 * Page_unload can also invalidate the tte after 5995 * we read tte outside of p_mapping lock. 5996 */ 5997 again: 5998 ttemod = tte; 5999 6000 TTE_SET_INVALID(&ttemod); 6001 ret = sfmmu_modifytte_try(&tte, &ttemod, 6002 &sfhmep->hme_tte); 6003 6004 if (ret <= 0) { 6005 if (TTE_IS_VALID(&tte)) { 6006 ASSERT(ret < 0); 6007 goto again; 6008 } 6009 if (pp != NULL) { 6010 panic("sfmmu_hblk_unload: pp = 0x%p " 6011 "tte became invalid under mlist" 6012 " lock = 0x%p", (void *)pp, 6013 (void *)pml); 6014 } 6015 continue; 6016 } 6017 6018 if (!(flags & HAT_UNLOAD_NOSYNC)) { 6019 sfmmu_ttesync(sfmmup, addr, &tte, pp); 6020 } 6021 6022 /* 6023 * Ok- we invalidated the tte. Do the rest of the job. 6024 */ 6025 ttecnt++; 6026 6027 if (flags & HAT_UNLOAD_UNLOCK) { 6028 ASSERT(hmeblkp->hblk_lckcnt > 0); 6029 atomic_dec_32(&hmeblkp->hblk_lckcnt); 6030 HBLK_STACK_TRACE(hmeblkp, HBLK_UNLOCK); 6031 } 6032 6033 /* 6034 * Normally we would need to flush the page 6035 * from the virtual cache at this point in 6036 * order to prevent a potential cache alias 6037 * inconsistency. 6038 * The particular scenario we need to worry 6039 * about is: 6040 * Given: va1 and va2 are two virtual address 6041 * that alias and map the same physical 6042 * address. 6043 * 1. mapping exists from va1 to pa and data 6044 * has been read into the cache. 6045 * 2. unload va1. 6046 * 3. load va2 and modify data using va2. 
6047 * 4 unload va2. 6048 * 5. load va1 and reference data. Unless we 6049 * flush the data cache when we unload we will 6050 * get stale data. 6051 * Fortunately, page coloring eliminates the 6052 * above scenario by remembering the color a 6053 * physical page was last or is currently 6054 * mapped to. Now, we delay the flush until 6055 * the loading of translations. Only when the 6056 * new translation is of a different color 6057 * are we forced to flush. 6058 */ 6059 if (use_demap_range) { 6060 /* 6061 * Mark this page as needing a demap. 6062 */ 6063 DEMAP_RANGE_MARKPG(dmrp, addr); 6064 } else { 6065 ASSERT(sfmmup != NULL); 6066 ASSERT(!hmeblkp->hblk_shared); 6067 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 6068 sfmmup->sfmmu_free, 0); 6069 } 6070 6071 if (pp) { 6072 /* 6073 * Remove the hment from the mapping list 6074 */ 6075 ASSERT(hmeblkp->hblk_hmecnt > 0); 6076 6077 /* 6078 * Again, we cannot 6079 * ASSERT(hmeblkp->hblk_hmecnt <= NHMENTS); 6080 */ 6081 HME_SUB(sfhmep, pp); 6082 membar_stst(); 6083 atomic_dec_16(&hmeblkp->hblk_hmecnt); 6084 } 6085 6086 ASSERT(hmeblkp->hblk_vcnt > 0); 6087 atomic_dec_16(&hmeblkp->hblk_vcnt); 6088 6089 ASSERT(hmeblkp->hblk_hmecnt || hmeblkp->hblk_vcnt || 6090 !hmeblkp->hblk_lckcnt); 6091 6092 #ifdef VAC 6093 if (pp && (pp->p_nrm & (P_KPMC | P_KPMS | P_TNC))) { 6094 if (PP_ISTNC(pp)) { 6095 /* 6096 * If page was temporary 6097 * uncached, try to recache 6098 * it. Note that HME_SUB() was 6099 * called above so p_index and 6100 * mlist had been updated. 6101 */ 6102 conv_tnc(pp, ttesz); 6103 } else if (pp->p_mapping == NULL) { 6104 ASSERT(kpm_enable); 6105 /* 6106 * Page is marked to be in VAC conflict 6107 * to an existing kpm mapping and/or is 6108 * kpm mapped using only the regular 6109 * pagesize. 6110 */ 6111 sfmmu_kpm_hme_unload(pp); 6112 } 6113 } 6114 #endif /* VAC */ 6115 } else if ((pp = sfhmep->hme_page) != NULL) { 6116 /* 6117 * TTE is invalid but the hme 6118 * still exists. let pageunload 6119 * complete its job. 6120 */ 6121 ASSERT(pml == NULL); 6122 pml = sfmmu_mlist_enter(pp); 6123 if (sfhmep->hme_page != NULL) { 6124 sfmmu_mlist_exit(pml); 6125 continue; 6126 } 6127 ASSERT(sfhmep->hme_page == NULL); 6128 } else if (hmeblkp->hblk_hmecnt != 0) { 6129 /* 6130 * pageunload may have not finished decrementing 6131 * hblk_vcnt and hblk_hmecnt. Find page_t if any and 6132 * wait for pageunload to finish. Rely on pageunload 6133 * to decrement hblk_hmecnt after hblk_vcnt. 6134 */ 6135 pfn_t pfn = TTE_TO_TTEPFN(&tte); 6136 ASSERT(pml == NULL); 6137 if (pf_is_memory(pfn)) { 6138 pp = page_numtopp_nolock(pfn); 6139 if (pp != NULL) { 6140 pml = sfmmu_mlist_enter(pp); 6141 sfmmu_mlist_exit(pml); 6142 pml = NULL; 6143 } 6144 } 6145 } 6146 6147 tte_unloaded: 6148 /* 6149 * At this point, the tte we are looking at 6150 * should be unloaded, and hme has been unlinked 6151 * from page too. This is important because in 6152 * pageunload, it does ttesync() then HME_SUB. 6153 * We need to make sure HME_SUB has been completed 6154 * so we know ttesync() has been completed. Otherwise, 6155 * at exit time, after return from hat layer, VM will 6156 * release as structure which hat_setstat() (called 6157 * by ttesync()) needs. 
6158 */ 6159 #ifdef DEBUG 6160 { 6161 tte_t dtte; 6162 6163 ASSERT(sfhmep->hme_page == NULL); 6164 6165 sfmmu_copytte(&sfhmep->hme_tte, &dtte); 6166 ASSERT(!TTE_IS_VALID(&dtte)); 6167 } 6168 #endif 6169 6170 if (pml) { 6171 sfmmu_mlist_exit(pml); 6172 } 6173 6174 addr += TTEBYTES(ttesz); 6175 sfhmep++; 6176 DEMAP_RANGE_NEXTPG(dmrp); 6177 } 6178 /* 6179 * For shared hmeblks this routine is only called when region is freed 6180 * and no longer referenced. So no need to decrement ttecnt 6181 * in the region structure here. 6182 */ 6183 if (ttecnt > 0 && sfmmup != NULL) { 6184 atomic_add_long(&sfmmup->sfmmu_ttecnt[ttesz], -ttecnt); 6185 } 6186 return (addr); 6187 } 6188 6189 /* 6190 * Invalidate a virtual address range for the local CPU. 6191 * For best performance ensure that the va range is completely 6192 * mapped, otherwise the entire TLB will be flushed. 6193 */ 6194 void 6195 hat_flush_range(struct hat *sfmmup, caddr_t va, size_t size) 6196 { 6197 ssize_t sz; 6198 caddr_t endva = va + size; 6199 6200 while (va < endva) { 6201 sz = hat_getpagesize(sfmmup, va); 6202 if (sz < 0) { 6203 vtag_flushall(); 6204 break; 6205 } 6206 vtag_flushpage(va, (uint64_t)sfmmup); 6207 va += sz; 6208 } 6209 } 6210 6211 /* 6212 * Synchronize all the mappings in the range [addr..addr+len). 6213 * Can be called with clearflag having two states: 6214 * HAT_SYNC_DONTZERO means just return the rm stats 6215 * HAT_SYNC_ZERORM means zero rm bits in the tte and return the stats 6216 */ 6217 void 6218 hat_sync(struct hat *sfmmup, caddr_t addr, size_t len, uint_t clearflag) 6219 { 6220 struct hmehash_bucket *hmebp; 6221 hmeblk_tag hblktag; 6222 int hmeshift, hashno = 1; 6223 struct hme_blk *hmeblkp, *list = NULL; 6224 caddr_t endaddr; 6225 cpuset_t cpuset; 6226 6227 ASSERT((sfmmup == ksfmmup) || AS_LOCK_HELD(sfmmup->sfmmu_as)); 6228 ASSERT((len & MMU_PAGEOFFSET) == 0); 6229 ASSERT((clearflag == HAT_SYNC_DONTZERO) || 6230 (clearflag == HAT_SYNC_ZERORM)); 6231 6232 CPUSET_ZERO(cpuset); 6233 6234 endaddr = addr + len; 6235 hblktag.htag_id = sfmmup; 6236 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 6237 6238 /* 6239 * Spitfire supports 4 page sizes. 6240 * Most pages are expected to be of the smallest page 6241 * size (8K) and these will not need to be rehashed. 64K 6242 * pages also don't need to be rehashed because an hmeblk 6243 * spans 64K of address space. 512K pages might need 1 rehash 6244 * and 4M pages 2 rehashes. 6245 */ 6246 while (addr < endaddr) { 6247 hmeshift = HME_HASH_SHIFT(hashno); 6248 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift); 6249 hblktag.htag_rehash = hashno; 6250 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift); 6251 6252 SFMMU_HASH_LOCK(hmebp); 6253 6254 HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list); 6255 if (hmeblkp != NULL) { 6256 ASSERT(!hmeblkp->hblk_shared); 6257 /* 6258 * We've encountered a shadow hmeblk so skip the range 6259 * of the next smaller mapping size. 6260 */ 6261 if (hmeblkp->hblk_shw_bit) { 6262 ASSERT(sfmmup != ksfmmup); 6263 ASSERT(hashno > 1); 6264 addr = (caddr_t)P2END((uintptr_t)addr, 6265 TTEBYTES(hashno - 1)); 6266 } else { 6267 addr = sfmmu_hblk_sync(sfmmup, hmeblkp, 6268 addr, endaddr, clearflag); 6269 } 6270 SFMMU_HASH_UNLOCK(hmebp); 6271 hashno = 1; 6272 continue; 6273 } 6274 SFMMU_HASH_UNLOCK(hmebp); 6275 6276 if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) { 6277 /* 6278 * We have traversed the whole list and rehashed 6279 * if necessary without finding the address to sync.
6280 * This is ok so we increment the address by the 6281 * smallest hmeblk range for kernel mappings and the 6282 * largest hmeblk range, to account for shadow hmeblks, 6283 * for user mappings and continue. 6284 */ 6285 if (sfmmup == ksfmmup) 6286 addr = (caddr_t)P2END((uintptr_t)addr, 6287 TTEBYTES(1)); 6288 else 6289 addr = (caddr_t)P2END((uintptr_t)addr, 6290 TTEBYTES(hashno)); 6291 hashno = 1; 6292 } else { 6293 hashno++; 6294 } 6295 } 6296 sfmmu_hblks_list_purge(&list, 0); 6297 cpuset = sfmmup->sfmmu_cpusran; 6298 xt_sync(cpuset); 6299 } 6300 6301 static caddr_t 6302 sfmmu_hblk_sync(struct hat *sfmmup, struct hme_blk *hmeblkp, caddr_t addr, 6303 caddr_t endaddr, int clearflag) 6304 { 6305 tte_t tte, ttemod; 6306 struct sf_hment *sfhmep; 6307 int ttesz; 6308 struct page *pp; 6309 kmutex_t *pml; 6310 int ret; 6311 6312 ASSERT(hmeblkp->hblk_shw_bit == 0); 6313 ASSERT(!hmeblkp->hblk_shared); 6314 6315 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp)); 6316 6317 ttesz = get_hblk_ttesz(hmeblkp); 6318 HBLKTOHME(sfhmep, hmeblkp, addr); 6319 6320 while (addr < endaddr) { 6321 sfmmu_copytte(&sfhmep->hme_tte, &tte); 6322 if (TTE_IS_VALID(&tte)) { 6323 pml = NULL; 6324 pp = sfhmep->hme_page; 6325 if (pp) { 6326 pml = sfmmu_mlist_enter(pp); 6327 } 6328 if (pp != sfhmep->hme_page) { 6329 /* 6330 * tte most have been unloaded 6331 * underneath us. Recheck 6332 */ 6333 ASSERT(pml); 6334 sfmmu_mlist_exit(pml); 6335 continue; 6336 } 6337 6338 ASSERT(pp == NULL || sfmmu_mlist_held(pp)); 6339 6340 if (clearflag == HAT_SYNC_ZERORM) { 6341 ttemod = tte; 6342 TTE_CLR_RM(&ttemod); 6343 ret = sfmmu_modifytte_try(&tte, &ttemod, 6344 &sfhmep->hme_tte); 6345 if (ret < 0) { 6346 if (pml) { 6347 sfmmu_mlist_exit(pml); 6348 } 6349 continue; 6350 } 6351 6352 if (ret > 0) { 6353 sfmmu_tlb_demap(addr, sfmmup, 6354 hmeblkp, 0, 0); 6355 } 6356 } 6357 sfmmu_ttesync(sfmmup, addr, &tte, pp); 6358 if (pml) { 6359 sfmmu_mlist_exit(pml); 6360 } 6361 } 6362 addr += TTEBYTES(ttesz); 6363 sfhmep++; 6364 } 6365 return (addr); 6366 } 6367 6368 /* 6369 * This function will sync a tte to the page struct and it will 6370 * update the hat stats. Currently it allows us to pass a NULL pp 6371 * and we will simply update the stats. We may want to change this 6372 * so we only keep stats for pages backed by pp's. 6373 */ 6374 static void 6375 sfmmu_ttesync(struct hat *sfmmup, caddr_t addr, tte_t *ttep, page_t *pp) 6376 { 6377 uint_t rm = 0; 6378 int sz; 6379 pgcnt_t npgs; 6380 6381 ASSERT(TTE_IS_VALID(ttep)); 6382 6383 if (TTE_IS_NOSYNC(ttep)) { 6384 return; 6385 } 6386 6387 if (TTE_IS_REF(ttep)) { 6388 rm = P_REF; 6389 } 6390 if (TTE_IS_MOD(ttep)) { 6391 rm |= P_MOD; 6392 } 6393 6394 if (rm == 0) { 6395 return; 6396 } 6397 6398 sz = TTE_CSZ(ttep); 6399 if (sfmmup != NULL && sfmmup->sfmmu_rmstat) { 6400 int i; 6401 caddr_t vaddr = addr; 6402 6403 for (i = 0; i < TTEPAGES(sz); i++, vaddr += MMU_PAGESIZE) { 6404 hat_setstat(sfmmup->sfmmu_as, vaddr, MMU_PAGESIZE, rm); 6405 } 6406 6407 } 6408 6409 /* 6410 * XXX I want to use cas to update nrm bits but they 6411 * currently belong in common/vm and not in hat where 6412 * they should be. 6413 * The nrm bits are protected by the same mutex as 6414 * the one that protects the page's mapping list. 6415 */ 6416 if (!pp) 6417 return; 6418 ASSERT(sfmmu_mlist_held(pp)); 6419 /* 6420 * If the tte is for a large page, we need to sync all the 6421 * pages covered by the tte. 
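 *
 * For example, with the 8K base page size a TTE4M mapping covers
 * TTEPAGES(TTE4M) == 4M / 8K == 512 constituent pages, so the loop
 * below starts at the group leader and visits all 512 page_t's.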
6422 */ 6423 if (sz != TTE8K) { 6424 ASSERT(pp->p_szc != 0); 6425 pp = PP_GROUPLEADER(pp, sz); 6426 ASSERT(sfmmu_mlist_held(pp)); 6427 } 6428 6429 /* Get number of pages from tte size. */ 6430 npgs = TTEPAGES(sz); 6431 6432 do { 6433 ASSERT(pp); 6434 ASSERT(sfmmu_mlist_held(pp)); 6435 if (((rm & P_REF) != 0 && !PP_ISREF(pp)) || 6436 ((rm & P_MOD) != 0 && !PP_ISMOD(pp))) 6437 hat_page_setattr(pp, rm); 6438 6439 /* 6440 * Are we done? If not, we must have a large mapping. 6441 * For large mappings we need to sync the rest of the pages 6442 * covered by this tte; goto the next page. 6443 */ 6444 } while (--npgs > 0 && (pp = PP_PAGENEXT(pp))); 6445 } 6446 6447 /* 6448 * Execute pre-callback handler of each pa_hment linked to pp 6449 * 6450 * Inputs: 6451 * flag: either HAT_PRESUSPEND or HAT_SUSPEND. 6452 * capture_cpus: pointer to return value (below) 6453 * 6454 * Returns: 6455 * Propagates the subsystem callback return values back to the caller; 6456 * returns 0 on success. If capture_cpus is non-NULL, the value returned 6457 * is zero if all of the pa_hments are of a type that do not require 6458 * capturing CPUs prior to suspending the mapping, else it is 1. 6459 */ 6460 static int 6461 hat_pageprocess_precallbacks(struct page *pp, uint_t flag, int *capture_cpus) 6462 { 6463 struct sf_hment *sfhmep; 6464 struct pa_hment *pahmep; 6465 int (*f)(caddr_t, uint_t, uint_t, void *); 6466 int ret; 6467 id_t id; 6468 int locked = 0; 6469 kmutex_t *pml; 6470 6471 ASSERT(PAGE_EXCL(pp)); 6472 if (!sfmmu_mlist_held(pp)) { 6473 pml = sfmmu_mlist_enter(pp); 6474 locked = 1; 6475 } 6476 6477 if (capture_cpus) 6478 *capture_cpus = 0; 6479 6480 top: 6481 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) { 6482 /* 6483 * skip sf_hments corresponding to VA<->PA mappings; 6484 * for pa_hment's, hme_tte.ll is zero 6485 */ 6486 if (!IS_PAHME(sfhmep)) 6487 continue; 6488 6489 pahmep = sfhmep->hme_data; 6490 ASSERT(pahmep != NULL); 6491 6492 /* 6493 * skip if pre-handler has been called earlier in this loop 6494 */ 6495 if (pahmep->flags & flag) 6496 continue; 6497 6498 id = pahmep->cb_id; 6499 ASSERT(id >= (id_t)0 && id < sfmmu_cb_nextid); 6500 if (capture_cpus && sfmmu_cb_table[id].capture_cpus != 0) 6501 *capture_cpus = 1; 6502 if ((f = sfmmu_cb_table[id].prehandler) == NULL) { 6503 pahmep->flags |= flag; 6504 continue; 6505 } 6506 6507 /* 6508 * Drop the mapping list lock to avoid locking order issues. 6509 */ 6510 if (locked) 6511 sfmmu_mlist_exit(pml); 6512 6513 ret = f(pahmep->addr, pahmep->len, flag, pahmep->pvt); 6514 if (ret != 0) 6515 return (ret); /* caller must do the cleanup */ 6516 6517 if (locked) { 6518 pml = sfmmu_mlist_enter(pp); 6519 pahmep->flags |= flag; 6520 goto top; 6521 } 6522 6523 pahmep->flags |= flag; 6524 } 6525 6526 if (locked) 6527 sfmmu_mlist_exit(pml); 6528 6529 return (0); 6530 } 6531 6532 /* 6533 * Execute post-callback handler of each pa_hment linked to pp 6534 * 6535 * Same overall assumptions and restrictions apply as for 6536 * hat_pageprocess_precallbacks(). 
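 *
 * The pre-handler pass sets 'flag' in each pa_hment it has processed;
 * this pass only acts on pa_hments with that bit set and clears it
 * before calling the post-handler, so re-scanning the p_mapping list
 * after the mlist lock is dropped never runs a post-handler twice.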
6537 */ 6538 static void 6539 hat_pageprocess_postcallbacks(struct page *pp, uint_t flag) 6540 { 6541 pfn_t pgpfn = pp->p_pagenum; 6542 pfn_t pgmask = btop(page_get_pagesize(pp->p_szc)) - 1; 6543 pfn_t newpfn; 6544 struct sf_hment *sfhmep; 6545 struct pa_hment *pahmep; 6546 int (*f)(caddr_t, uint_t, uint_t, void *, pfn_t); 6547 id_t id; 6548 int locked = 0; 6549 kmutex_t *pml; 6550 6551 ASSERT(PAGE_EXCL(pp)); 6552 if (!sfmmu_mlist_held(pp)) { 6553 pml = sfmmu_mlist_enter(pp); 6554 locked = 1; 6555 } 6556 6557 top: 6558 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) { 6559 /* 6560 * skip sf_hments corresponding to VA<->PA mappings; 6561 * for pa_hment's, hme_tte.ll is zero 6562 */ 6563 if (!IS_PAHME(sfhmep)) 6564 continue; 6565 6566 pahmep = sfhmep->hme_data; 6567 ASSERT(pahmep != NULL); 6568 6569 if ((pahmep->flags & flag) == 0) 6570 continue; 6571 6572 pahmep->flags &= ~flag; 6573 6574 id = pahmep->cb_id; 6575 ASSERT(id >= (id_t)0 && id < sfmmu_cb_nextid); 6576 if ((f = sfmmu_cb_table[id].posthandler) == NULL) 6577 continue; 6578 6579 /* 6580 * Convert the base page PFN into the constituent PFN 6581 * which is needed by the callback handler. 6582 */ 6583 newpfn = pgpfn | (btop((uintptr_t)pahmep->addr) & pgmask); 6584 6585 /* 6586 * Drop the mapping list lock to avoid locking order issues. 6587 */ 6588 if (locked) 6589 sfmmu_mlist_exit(pml); 6590 6591 if (f(pahmep->addr, pahmep->len, flag, pahmep->pvt, newpfn) 6592 != 0) 6593 panic("sfmmu: posthandler failed"); 6594 6595 if (locked) { 6596 pml = sfmmu_mlist_enter(pp); 6597 goto top; 6598 } 6599 } 6600 6601 if (locked) 6602 sfmmu_mlist_exit(pml); 6603 } 6604 6605 /* 6606 * Suspend locked kernel mapping 6607 */ 6608 void 6609 hat_pagesuspend(struct page *pp) 6610 { 6611 struct sf_hment *sfhmep; 6612 sfmmu_t *sfmmup; 6613 tte_t tte, ttemod; 6614 struct hme_blk *hmeblkp; 6615 caddr_t addr; 6616 int index, cons; 6617 cpuset_t cpuset; 6618 6619 ASSERT(PAGE_EXCL(pp)); 6620 ASSERT(sfmmu_mlist_held(pp)); 6621 6622 mutex_enter(&kpr_suspendlock); 6623 6624 /* 6625 * We're about to suspend a kernel mapping so mark this thread as 6626 * non-traceable by DTrace. This prevents us from running into issues 6627 * with probe context trying to touch a suspended page 6628 * in the relocation codepath itself. 6629 */ 6630 curthread->t_flag |= T_DONTDTRACE; 6631 6632 index = PP_MAPINDEX(pp); 6633 cons = TTE8K; 6634 6635 retry: 6636 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) { 6637 6638 if (IS_PAHME(sfhmep)) 6639 continue; 6640 6641 if (get_hblk_ttesz(sfmmu_hmetohblk(sfhmep)) != cons) 6642 continue; 6643 6644 /* 6645 * Loop until we successfully set the suspend bit in 6646 * the TTE. 6647 */ 6648 again: 6649 sfmmu_copytte(&sfhmep->hme_tte, &tte); 6650 ASSERT(TTE_IS_VALID(&tte)); 6651 6652 ttemod = tte; 6653 TTE_SET_SUSPEND(&ttemod); 6654 if (sfmmu_modifytte_try(&tte, &ttemod, 6655 &sfhmep->hme_tte) < 0) 6656 goto again; 6657 6658 /* 6659 * Invalidate TSB entry 6660 */ 6661 hmeblkp = sfmmu_hmetohblk(sfhmep); 6662 6663 sfmmup = hblktosfmmu(hmeblkp); 6664 ASSERT(sfmmup == ksfmmup); 6665 ASSERT(!hmeblkp->hblk_shared); 6666 6667 addr = tte_to_vaddr(hmeblkp, tte); 6668 6669 /* 6670 * No need to make sure that the TSB for this sfmmu is 6671 * not being relocated since it is ksfmmup and thus it 6672 * will never be relocated. 
6673 */ 6674 SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, 0); 6675 6676 /* 6677 * Update xcall stats 6678 */ 6679 cpuset = cpu_ready_set; 6680 CPUSET_DEL(cpuset, CPU->cpu_id); 6681 6682 /* LINTED: constant in conditional context */ 6683 SFMMU_XCALL_STATS(ksfmmup); 6684 6685 /* 6686 * Flush TLB entry on remote CPU's 6687 */ 6688 xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)addr, 6689 (uint64_t)ksfmmup); 6690 xt_sync(cpuset); 6691 6692 /* 6693 * Flush TLB entry on local CPU 6694 */ 6695 vtag_flushpage(addr, (uint64_t)ksfmmup); 6696 } 6697 6698 while (index != 0) { 6699 index = index >> 1; 6700 if (index != 0) 6701 cons++; 6702 if (index & 0x1) { 6703 pp = PP_GROUPLEADER(pp, cons); 6704 goto retry; 6705 } 6706 } 6707 } 6708 6709 #ifdef DEBUG 6710 6711 #define N_PRLE 1024 6712 struct prle { 6713 page_t *targ; 6714 page_t *repl; 6715 int status; 6716 int pausecpus; 6717 hrtime_t whence; 6718 }; 6719 6720 static struct prle page_relocate_log[N_PRLE]; 6721 static int prl_entry; 6722 static kmutex_t prl_mutex; 6723 6724 #define PAGE_RELOCATE_LOG(t, r, s, p) \ 6725 mutex_enter(&prl_mutex); \ 6726 page_relocate_log[prl_entry].targ = *(t); \ 6727 page_relocate_log[prl_entry].repl = *(r); \ 6728 page_relocate_log[prl_entry].status = (s); \ 6729 page_relocate_log[prl_entry].pausecpus = (p); \ 6730 page_relocate_log[prl_entry].whence = gethrtime(); \ 6731 prl_entry = (prl_entry == (N_PRLE - 1))? 0 : prl_entry + 1; \ 6732 mutex_exit(&prl_mutex); 6733 6734 #else /* !DEBUG */ 6735 #define PAGE_RELOCATE_LOG(t, r, s, p) 6736 #endif 6737 6738 /* 6739 * Core Kernel Page Relocation Algorithm 6740 * 6741 * Input: 6742 * 6743 * target : constituent pages are SE_EXCL locked. 6744 * replacement: constituent pages are SE_EXCL locked. 6745 * 6746 * Output: 6747 * 6748 * nrelocp: number of pages relocated 6749 */ 6750 int 6751 hat_page_relocate(page_t **target, page_t **replacement, spgcnt_t *nrelocp) 6752 { 6753 page_t *targ, *repl; 6754 page_t *tpp, *rpp; 6755 kmutex_t *low, *high; 6756 spgcnt_t npages, i; 6757 page_t *pl = NULL; 6758 int old_pil; 6759 cpuset_t cpuset; 6760 int cap_cpus; 6761 int ret; 6762 #ifdef VAC 6763 int cflags = 0; 6764 #endif 6765 6766 if (!kcage_on || PP_ISNORELOC(*target)) { 6767 PAGE_RELOCATE_LOG(target, replacement, EAGAIN, -1); 6768 return (EAGAIN); 6769 } 6770 6771 mutex_enter(&kpr_mutex); 6772 kreloc_thread = curthread; 6773 6774 targ = *target; 6775 repl = *replacement; 6776 ASSERT(repl != NULL); 6777 ASSERT(targ->p_szc == repl->p_szc); 6778 6779 npages = page_get_pagecnt(targ->p_szc); 6780 6781 /* 6782 * unload VA<->PA mappings that are not locked 6783 */ 6784 tpp = targ; 6785 for (i = 0; i < npages; i++) { 6786 (void) hat_pageunload(tpp, SFMMU_KERNEL_RELOC); 6787 tpp++; 6788 } 6789 6790 /* 6791 * Do "presuspend" callbacks, in a context from which we can still 6792 * block as needed. Note that we don't hold the mapping list lock 6793 * of "targ" at this point due to potential locking order issues; 6794 * we assume that between the hat_pageunload() above and holding 6795 * the SE_EXCL lock that the mapping list *cannot* change at this 6796 * point. 6797 */ 6798 ret = hat_pageprocess_precallbacks(targ, HAT_PRESUSPEND, &cap_cpus); 6799 if (ret != 0) { 6800 /* 6801 * EIO translates to fatal error, for all others cleanup 6802 * and return EAGAIN. 
6803 */ 6804 ASSERT(ret != EIO); 6805 hat_pageprocess_postcallbacks(targ, HAT_POSTUNSUSPEND); 6806 PAGE_RELOCATE_LOG(target, replacement, ret, -1); 6807 kreloc_thread = NULL; 6808 mutex_exit(&kpr_mutex); 6809 return (EAGAIN); 6810 } 6811 6812 /* 6813 * acquire p_mapping list lock for both the target and replacement 6814 * root pages. 6815 * 6816 * low and high refer to the need to grab the mlist locks in a 6817 * specific order in order to prevent race conditions. Thus the 6818 * lower lock must be grabbed before the higher lock. 6819 * 6820 * This will block hat_unload's accessing p_mapping list. Since 6821 * we have SE_EXCL lock, hat_memload and hat_pageunload will be 6822 * blocked. Thus, no one else will be accessing the p_mapping list 6823 * while we suspend and reload the locked mapping below. 6824 */ 6825 tpp = targ; 6826 rpp = repl; 6827 sfmmu_mlist_reloc_enter(tpp, rpp, &low, &high); 6828 6829 kpreempt_disable(); 6830 6831 /* 6832 * We raise our PIL to 13 so that we don't get captured by 6833 * another CPU or pinned by an interrupt thread. We can't go to 6834 * PIL 14 since the nexus driver(s) may need to interrupt at 6835 * that level in the case of IOMMU pseudo mappings. 6836 */ 6837 cpuset = cpu_ready_set; 6838 CPUSET_DEL(cpuset, CPU->cpu_id); 6839 if (!cap_cpus || CPUSET_ISNULL(cpuset)) { 6840 old_pil = splr(XCALL_PIL); 6841 } else { 6842 old_pil = -1; 6843 xc_attention(cpuset); 6844 } 6845 ASSERT(getpil() == XCALL_PIL); 6846 6847 /* 6848 * Now do suspend callbacks. In the case of an IOMMU mapping 6849 * this will suspend all DMA activity to the page while it is 6850 * being relocated. Since we are well above LOCK_LEVEL and CPUs 6851 * may be captured at this point we should have acquired any needed 6852 * locks in the presuspend callback. 6853 */ 6854 ret = hat_pageprocess_precallbacks(targ, HAT_SUSPEND, NULL); 6855 if (ret != 0) { 6856 repl = targ; 6857 goto suspend_fail; 6858 } 6859 6860 /* 6861 * Raise the PIL yet again, this time to block all high-level 6862 * interrupts on this CPU. This is necessary to prevent an 6863 * interrupt routine from pinning the thread which holds the 6864 * mapping suspended and then touching the suspended page. 6865 * 6866 * Once the page is suspended we also need to be careful to 6867 * avoid calling any functions which touch any seg_kmem memory 6868 * since that memory may be backed by the very page we are 6869 * relocating in here! 6870 */ 6871 hat_pagesuspend(targ); 6872 6873 /* 6874 * Now that we are confident everybody has stopped using this page, 6875 * copy the page contents. Note we use a physical copy to prevent 6876 * locking issues and to avoid fpRAS because we can't handle it in 6877 * this context. 6878 */ 6879 for (i = 0; i < npages; i++, tpp++, rpp++) { 6880 #ifdef VAC 6881 /* 6882 * If the replacement has a different vcolor than 6883 * the one being replacd, we need to handle VAC 6884 * consistency for it just as we were setting up 6885 * a new mapping to it. 6886 */ 6887 if ((PP_GET_VCOLOR(rpp) != NO_VCOLOR) && 6888 (tpp->p_vcolor != rpp->p_vcolor) && 6889 !CacheColor_IsFlushed(cflags, PP_GET_VCOLOR(rpp))) { 6890 CacheColor_SetFlushed(cflags, PP_GET_VCOLOR(rpp)); 6891 sfmmu_cache_flushcolor(PP_GET_VCOLOR(rpp), 6892 rpp->p_pagenum); 6893 } 6894 #endif 6895 /* 6896 * Copy the contents of the page. 6897 */ 6898 ppcopy_kernel(tpp, rpp); 6899 } 6900 6901 tpp = targ; 6902 rpp = repl; 6903 for (i = 0; i < npages; i++, tpp++, rpp++) { 6904 /* 6905 * Copy attributes. VAC consistency was handled above, 6906 * if required. 
6907 */ 6908 rpp->p_nrm = tpp->p_nrm; 6909 tpp->p_nrm = 0; 6910 rpp->p_index = tpp->p_index; 6911 tpp->p_index = 0; 6912 #ifdef VAC 6913 rpp->p_vcolor = tpp->p_vcolor; 6914 #endif 6915 } 6916 6917 /* 6918 * First, unsuspend the page, if we set the suspend bit, and transfer 6919 * the mapping list from the target page to the replacement page. 6920 * Next process postcallbacks; since pa_hment's are linked only to the 6921 * p_mapping list of root page, we don't iterate over the constituent 6922 * pages. 6923 */ 6924 hat_pagereload(targ, repl); 6925 6926 suspend_fail: 6927 hat_pageprocess_postcallbacks(repl, HAT_UNSUSPEND); 6928 6929 /* 6930 * Now lower our PIL and release any captured CPUs since we 6931 * are out of the "danger zone". After this it will again be 6932 * safe to acquire adaptive mutex locks, or to drop them... 6933 */ 6934 if (old_pil != -1) { 6935 splx(old_pil); 6936 } else { 6937 xc_dismissed(cpuset); 6938 } 6939 6940 kpreempt_enable(); 6941 6942 sfmmu_mlist_reloc_exit(low, high); 6943 6944 /* 6945 * Postsuspend callbacks should drop any locks held across 6946 * the suspend callbacks. As before, we don't hold the mapping 6947 * list lock at this point.. our assumption is that the mapping 6948 * list still can't change due to our holding SE_EXCL lock and 6949 * there being no unlocked mappings left. Hence the restriction 6950 * on calling context to hat_delete_callback() 6951 */ 6952 hat_pageprocess_postcallbacks(repl, HAT_POSTUNSUSPEND); 6953 if (ret != 0) { 6954 /* 6955 * The second presuspend call failed: we got here through 6956 * the suspend_fail label above. 6957 */ 6958 ASSERT(ret != EIO); 6959 PAGE_RELOCATE_LOG(target, replacement, ret, cap_cpus); 6960 kreloc_thread = NULL; 6961 mutex_exit(&kpr_mutex); 6962 return (EAGAIN); 6963 } 6964 6965 /* 6966 * Now that we're out of the performance critical section we can 6967 * take care of updating the hash table, since we still 6968 * hold all the pages locked SE_EXCL at this point we 6969 * needn't worry about things changing out from under us. 6970 */ 6971 tpp = targ; 6972 rpp = repl; 6973 for (i = 0; i < npages; i++, tpp++, rpp++) { 6974 6975 /* 6976 * replace targ with replacement in page_hash table 6977 */ 6978 targ = tpp; 6979 page_relocate_hash(rpp, targ); 6980 6981 /* 6982 * concatenate target; caller of platform_page_relocate() 6983 * expects target to be concatenated after returning. 6984 */ 6985 ASSERT(targ->p_next == targ); 6986 ASSERT(targ->p_prev == targ); 6987 page_list_concat(&pl, &targ); 6988 } 6989 6990 ASSERT(*target == pl); 6991 *nrelocp = npages; 6992 PAGE_RELOCATE_LOG(target, replacement, 0, cap_cpus); 6993 kreloc_thread = NULL; 6994 mutex_exit(&kpr_mutex); 6995 return (0); 6996 } 6997 6998 /* 6999 * Called when stray pa_hments are found attached to a page which is 7000 * being freed. Notify the subsystem which attached the pa_hment of 7001 * the error if it registered a suitable handler, else panic. 7002 */ 7003 static void 7004 sfmmu_pahment_leaked(struct pa_hment *pahmep) 7005 { 7006 id_t cb_id = pahmep->cb_id; 7007 7008 ASSERT(cb_id >= (id_t)0 && cb_id < sfmmu_cb_nextid); 7009 if (sfmmu_cb_table[cb_id].errhandler != NULL) { 7010 if (sfmmu_cb_table[cb_id].errhandler(pahmep->addr, pahmep->len, 7011 HAT_CB_ERR_LEAKED, pahmep->pvt) == 0) 7012 return; /* non-fatal */ 7013 } 7014 panic("pa_hment leaked: 0x%p", (void *)pahmep); 7015 } 7016 7017 /* 7018 * Remove all mappings to page 'pp'. 
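 *
 * The caller must hold the page SE_EXCL.  As an illustration of the
 * forceflag, hat_page_relocate() above strips the unlocked mappings of
 * each constituent page this way, leaving locked kernel mappings in
 * place to be suspended later:
 *
 *	(void) hat_pageunload(tpp, SFMMU_KERNEL_RELOC);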
7019 */ 7020 int 7021 hat_pageunload(struct page *pp, uint_t forceflag) 7022 { 7023 struct page *origpp = pp; 7024 struct sf_hment *sfhme, *tmphme; 7025 struct hme_blk *hmeblkp; 7026 kmutex_t *pml; 7027 #ifdef VAC 7028 kmutex_t *pmtx; 7029 #endif 7030 cpuset_t cpuset, tset; 7031 int index, cons; 7032 int pa_hments; 7033 7034 ASSERT(PAGE_EXCL(pp)); 7035 7036 tmphme = NULL; 7037 pa_hments = 0; 7038 CPUSET_ZERO(cpuset); 7039 7040 pml = sfmmu_mlist_enter(pp); 7041 7042 #ifdef VAC 7043 if (pp->p_kpmref) 7044 sfmmu_kpm_pageunload(pp); 7045 ASSERT(!PP_ISMAPPED_KPM(pp)); 7046 #endif 7047 /* 7048 * Clear vpm reference. Since the page is exclusively locked 7049 * vpm cannot be referencing it. 7050 */ 7051 if (vpm_enable) { 7052 pp->p_vpmref = 0; 7053 } 7054 7055 index = PP_MAPINDEX(pp); 7056 cons = TTE8K; 7057 retry: 7058 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) { 7059 tmphme = sfhme->hme_next; 7060 7061 if (IS_PAHME(sfhme)) { 7062 ASSERT(sfhme->hme_data != NULL); 7063 pa_hments++; 7064 continue; 7065 } 7066 7067 hmeblkp = sfmmu_hmetohblk(sfhme); 7068 7069 /* 7070 * If there are kernel mappings don't unload them, they will 7071 * be suspended. 7072 */ 7073 if (forceflag == SFMMU_KERNEL_RELOC && hmeblkp->hblk_lckcnt && 7074 hmeblkp->hblk_tag.htag_id == ksfmmup) 7075 continue; 7076 7077 tset = sfmmu_pageunload(pp, sfhme, cons); 7078 CPUSET_OR(cpuset, tset); 7079 } 7080 7081 while (index != 0) { 7082 index = index >> 1; 7083 if (index != 0) 7084 cons++; 7085 if (index & 0x1) { 7086 /* Go to leading page */ 7087 pp = PP_GROUPLEADER(pp, cons); 7088 ASSERT(sfmmu_mlist_held(pp)); 7089 goto retry; 7090 } 7091 } 7092 7093 /* 7094 * cpuset may be empty if the page was only mapped by segkpm, 7095 * in which case we won't actually cross-trap. 7096 */ 7097 xt_sync(cpuset); 7098 7099 /* 7100 * The page should have no mappings at this point, unless 7101 * we were called from hat_page_relocate() in which case we 7102 * leave the locked mappings which will be suspended later. 7103 */ 7104 ASSERT(!PP_ISMAPPED(origpp) || pa_hments || 7105 (forceflag == SFMMU_KERNEL_RELOC)); 7106 7107 #ifdef VAC 7108 if (PP_ISTNC(pp)) { 7109 if (cons == TTE8K) { 7110 pmtx = sfmmu_page_enter(pp); 7111 PP_CLRTNC(pp); 7112 sfmmu_page_exit(pmtx); 7113 } else { 7114 conv_tnc(pp, cons); 7115 } 7116 } 7117 #endif /* VAC */ 7118 7119 if (pa_hments && forceflag != SFMMU_KERNEL_RELOC) { 7120 /* 7121 * Unlink any pa_hments and free them, calling back 7122 * the responsible subsystem to notify it of the error. 7123 * This can occur in situations such as drivers leaking 7124 * DMA handles: naughty, but common enough that we'd like 7125 * to keep the system running rather than bringing it 7126 * down with an obscure error like "pa_hment leaked" 7127 * which doesn't aid the user in debugging their driver. 
7128 */ 7129 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) { 7130 tmphme = sfhme->hme_next; 7131 if (IS_PAHME(sfhme)) { 7132 struct pa_hment *pahmep = sfhme->hme_data; 7133 sfmmu_pahment_leaked(pahmep); 7134 HME_SUB(sfhme, pp); 7135 kmem_cache_free(pa_hment_cache, pahmep); 7136 } 7137 } 7138 7139 ASSERT(!PP_ISMAPPED(origpp)); 7140 } 7141 7142 sfmmu_mlist_exit(pml); 7143 7144 return (0); 7145 } 7146 7147 cpuset_t 7148 sfmmu_pageunload(page_t *pp, struct sf_hment *sfhme, int cons) 7149 { 7150 struct hme_blk *hmeblkp; 7151 sfmmu_t *sfmmup; 7152 tte_t tte, ttemod; 7153 #ifdef DEBUG 7154 tte_t orig_old; 7155 #endif /* DEBUG */ 7156 caddr_t addr; 7157 int ttesz; 7158 int ret; 7159 cpuset_t cpuset; 7160 7161 ASSERT(pp != NULL); 7162 ASSERT(sfmmu_mlist_held(pp)); 7163 ASSERT(!PP_ISKAS(pp)); 7164 7165 CPUSET_ZERO(cpuset); 7166 7167 hmeblkp = sfmmu_hmetohblk(sfhme); 7168 7169 readtte: 7170 sfmmu_copytte(&sfhme->hme_tte, &tte); 7171 if (TTE_IS_VALID(&tte)) { 7172 sfmmup = hblktosfmmu(hmeblkp); 7173 ttesz = get_hblk_ttesz(hmeblkp); 7174 /* 7175 * Only unload mappings of 'cons' size. 7176 */ 7177 if (ttesz != cons) 7178 return (cpuset); 7179 7180 /* 7181 * Note that we have p_mapping lock, but no hash lock here. 7182 * hblk_unload() has to have both hash lock AND p_mapping 7183 * lock before it tries to modify tte. So, the tte could 7184 * not become invalid in the sfmmu_modifytte_try() below. 7185 */ 7186 ttemod = tte; 7187 #ifdef DEBUG 7188 orig_old = tte; 7189 #endif /* DEBUG */ 7190 7191 TTE_SET_INVALID(&ttemod); 7192 ret = sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte); 7193 if (ret < 0) { 7194 #ifdef DEBUG 7195 /* only R/M bits can change. */ 7196 chk_tte(&orig_old, &tte, &ttemod, hmeblkp); 7197 #endif /* DEBUG */ 7198 goto readtte; 7199 } 7200 7201 if (ret == 0) { 7202 panic("pageunload: cas failed?"); 7203 } 7204 7205 addr = tte_to_vaddr(hmeblkp, tte); 7206 7207 if (hmeblkp->hblk_shared) { 7208 sf_srd_t *srdp = (sf_srd_t *)sfmmup; 7209 uint_t rid = hmeblkp->hblk_tag.htag_rid; 7210 sf_region_t *rgnp; 7211 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 7212 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 7213 ASSERT(srdp != NULL); 7214 rgnp = srdp->srd_hmergnp[rid]; 7215 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid); 7216 cpuset = sfmmu_rgntlb_demap(addr, rgnp, hmeblkp, 1); 7217 sfmmu_ttesync(NULL, addr, &tte, pp); 7218 ASSERT(rgnp->rgn_ttecnt[ttesz] > 0); 7219 atomic_dec_ulong(&rgnp->rgn_ttecnt[ttesz]); 7220 } else { 7221 sfmmu_ttesync(sfmmup, addr, &tte, pp); 7222 atomic_dec_ulong(&sfmmup->sfmmu_ttecnt[ttesz]); 7223 7224 /* 7225 * We need to flush the page from the virtual cache 7226 * in order to prevent a virtual cache alias 7227 * inconsistency. The particular scenario we need 7228 * to worry about is: 7229 * Given: va1 and va2 are two virtual address that 7230 * alias and will map the same physical address. 7231 * 1. mapping exists from va1 to pa and data has 7232 * been read into the cache. 7233 * 2. unload va1. 7234 * 3. load va2 and modify data using va2. 7235 * 4 unload va2. 7236 * 5. load va1 and reference data. Unless we flush 7237 * the data cache when we unload we will get 7238 * stale data. 7239 * This scenario is taken care of by using virtual 7240 * page coloring. 7241 */ 7242 if (sfmmup->sfmmu_ismhat) { 7243 /* 7244 * Flush TSBs, TLBs and caches 7245 * of every process 7246 * sharing this ism segment. 
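 *
 * Since any process attached to this ISM segment may have the
 * translation cached, cpuset is set to cpu_ready_set below rather
 * than to this hat's sfmmu_cpusran.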
7247 */ 7248 sfmmu_hat_lock_all(); 7249 mutex_enter(&ism_mlist_lock); 7250 kpreempt_disable(); 7251 sfmmu_ismtlbcache_demap(addr, sfmmup, hmeblkp, 7252 pp->p_pagenum, CACHE_NO_FLUSH); 7253 kpreempt_enable(); 7254 mutex_exit(&ism_mlist_lock); 7255 sfmmu_hat_unlock_all(); 7256 cpuset = cpu_ready_set; 7257 } else { 7258 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0); 7259 cpuset = sfmmup->sfmmu_cpusran; 7260 } 7261 } 7262 7263 /* 7264 * Hme_sub has to run after ttesync() and a_rss update. 7265 * See hblk_unload(). 7266 */ 7267 HME_SUB(sfhme, pp); 7268 membar_stst(); 7269 7270 /* 7271 * We cannot make ASSERT(hmeblkp->hblk_hmecnt <= NHMENTS) 7272 * since pteload may have done a HME_ADD() right after 7273 * we did the HME_SUB() above. Hmecnt is now maintained 7274 * by cas only. No lock guarantees its value. The only 7275 * guarantee we have is that hmecnt never drops below its 7276 * true value, so the hblk will not be taken away. 7277 * It's also important that we decrement the hmecnt only after 7278 * we are done with hmeblkp so that this hmeblk won't be 7279 * stolen. 7280 */ 7281 ASSERT(hmeblkp->hblk_hmecnt > 0); 7282 ASSERT(hmeblkp->hblk_vcnt > 0); 7283 atomic_dec_16(&hmeblkp->hblk_vcnt); 7284 atomic_dec_16(&hmeblkp->hblk_hmecnt); 7285 /* 7286 * This is bug 4063182. 7287 * XXX: fixme 7288 * ASSERT(hmeblkp->hblk_hmecnt || hmeblkp->hblk_vcnt || 7289 * !hmeblkp->hblk_lckcnt); 7290 */ 7291 } else { 7292 panic("invalid tte? pp %p &tte %p", 7293 (void *)pp, (void *)&tte); 7294 } 7295 7296 return (cpuset); 7297 } 7298 7299 /* 7300 * While relocating a kernel page, this function will move the mappings 7301 * from tpp to dpp and modify any data associated with these mappings. 7302 * It also unsuspends the suspended kernel mapping. 7303 */ 7304 static void 7305 hat_pagereload(struct page *tpp, struct page *dpp) 7306 { 7307 struct sf_hment *sfhme; 7308 tte_t tte, ttemod; 7309 int index, cons; 7310 7311 ASSERT(getpil() == PIL_MAX); 7312 ASSERT(sfmmu_mlist_held(tpp)); 7313 ASSERT(sfmmu_mlist_held(dpp)); 7314 7315 index = PP_MAPINDEX(tpp); 7316 cons = TTE8K; 7317 7318 /* Update real mappings to the page */ 7319 retry: 7320 for (sfhme = tpp->p_mapping; sfhme != NULL; sfhme = sfhme->hme_next) { 7321 if (IS_PAHME(sfhme)) 7322 continue; 7323 sfmmu_copytte(&sfhme->hme_tte, &tte); 7324 ttemod = tte; 7325 7326 /* 7327 * replace old pfn with new pfn in TTE 7328 */ 7329 PFN_TO_TTE(ttemod, dpp->p_pagenum); 7330 7331 /* 7332 * clear suspend bit 7333 */ 7334 ASSERT(TTE_IS_SUSPEND(&ttemod)); 7335 TTE_CLR_SUSPEND(&ttemod); 7336 7337 if (sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte) < 0) 7338 panic("hat_pagereload(): sfmmu_modifytte_try() failed"); 7339 7340 /* 7341 * set hme_page to point to the new page 7342 */ 7343 sfhme->hme_page = dpp; 7344 } 7345 7346 /* 7347 * move p_mapping list from old page to new page 7348 */ 7349 dpp->p_mapping = tpp->p_mapping; 7350 tpp->p_mapping = NULL; 7351 dpp->p_share = tpp->p_share; 7352 tpp->p_share = 0; 7353 7354 while (index != 0) { 7355 index = index >> 1; 7356 if (index != 0) 7357 cons++; 7358 if (index & 0x1) { 7359 tpp = PP_GROUPLEADER(tpp, cons); 7360 dpp = PP_GROUPLEADER(dpp, cons); 7361 goto retry; 7362 } 7363 } 7364 7365 curthread->t_flag &= ~T_DONTDTRACE; 7366 mutex_exit(&kpr_suspendlock); 7367 } 7368 7369 uint_t 7370 hat_pagesync(struct page *pp, uint_t clearflag) 7371 { 7372 struct sf_hment *sfhme, *tmphme = NULL; 7373 struct hme_blk *hmeblkp; 7374 kmutex_t *pml; 7375 cpuset_t cpuset, tset; 7376 int index, cons; 7377 extern ulong_t po_share; 7378 page_t *save_pp = pp;
7379 int stop_on_sh = 0; 7380 uint_t shcnt; 7381 7382 CPUSET_ZERO(cpuset); 7383 7384 if (PP_ISRO(pp) && (clearflag & HAT_SYNC_STOPON_MOD)) { 7385 return (PP_GENERIC_ATTR(pp)); 7386 } 7387 7388 if ((clearflag & HAT_SYNC_ZERORM) == 0) { 7389 if ((clearflag & HAT_SYNC_STOPON_REF) && PP_ISREF(pp)) { 7390 return (PP_GENERIC_ATTR(pp)); 7391 } 7392 if ((clearflag & HAT_SYNC_STOPON_MOD) && PP_ISMOD(pp)) { 7393 return (PP_GENERIC_ATTR(pp)); 7394 } 7395 if (clearflag & HAT_SYNC_STOPON_SHARED) { 7396 if (pp->p_share > po_share) { 7397 hat_page_setattr(pp, P_REF); 7398 return (PP_GENERIC_ATTR(pp)); 7399 } 7400 stop_on_sh = 1; 7401 shcnt = 0; 7402 } 7403 } 7404 7405 clearflag &= ~HAT_SYNC_STOPON_SHARED; 7406 pml = sfmmu_mlist_enter(pp); 7407 index = PP_MAPINDEX(pp); 7408 cons = TTE8K; 7409 retry: 7410 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) { 7411 /* 7412 * We need to save the next hment on the list since 7413 * it is possible for pagesync to remove an invalid hment 7414 * from the list. 7415 */ 7416 tmphme = sfhme->hme_next; 7417 if (IS_PAHME(sfhme)) 7418 continue; 7419 /* 7420 * If we are looking for large mappings and this hme doesn't 7421 * reach the range we are seeking, just ignore it. 7422 */ 7423 hmeblkp = sfmmu_hmetohblk(sfhme); 7424 7425 if (hme_size(sfhme) < cons) 7426 continue; 7427 7428 if (stop_on_sh) { 7429 if (hmeblkp->hblk_shared) { 7430 sf_srd_t *srdp = hblktosrd(hmeblkp); 7431 uint_t rid = hmeblkp->hblk_tag.htag_rid; 7432 sf_region_t *rgnp; 7433 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 7434 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 7435 ASSERT(srdp != NULL); 7436 rgnp = srdp->srd_hmergnp[rid]; 7437 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, 7438 rgnp, rid); 7439 shcnt += rgnp->rgn_refcnt; 7440 } else { 7441 shcnt++; 7442 } 7443 if (shcnt > po_share) { 7444 /* 7445 * tell the pager to spare the page this time 7446 * around. 7447 */ 7448 hat_page_setattr(save_pp, P_REF); 7449 index = 0; 7450 break; 7451 } 7452 } 7453 tset = sfmmu_pagesync(pp, sfhme, 7454 clearflag & ~HAT_SYNC_STOPON_RM); 7455 CPUSET_OR(cpuset, tset); 7456 7457 /* 7458 * If clearflag is HAT_SYNC_DONTZERO, break out as soon 7459 * as the "ref" or "mod" is set or share cnt exceeds po_share. 
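 *
 * This early break is what keeps a simple dirty-page query cheap;
 * an illustrative use (not a call made in this file):
 *
 *	if (hat_pagesync(pp, HAT_SYNC_DONTZERO |
 *	    HAT_SYNC_STOPON_MOD) & P_MOD)
 *		... the page has been modified ...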
7460 */ 7461 if ((clearflag & ~HAT_SYNC_STOPON_RM) == HAT_SYNC_DONTZERO && 7462 (((clearflag & HAT_SYNC_STOPON_MOD) && PP_ISMOD(save_pp)) || 7463 ((clearflag & HAT_SYNC_STOPON_REF) && PP_ISREF(save_pp)))) { 7464 index = 0; 7465 break; 7466 } 7467 } 7468 7469 while (index) { 7470 index = index >> 1; 7471 cons++; 7472 if (index & 0x1) { 7473 /* Go to leading page */ 7474 pp = PP_GROUPLEADER(pp, cons); 7475 goto retry; 7476 } 7477 } 7478 7479 xt_sync(cpuset); 7480 sfmmu_mlist_exit(pml); 7481 return (PP_GENERIC_ATTR(save_pp)); 7482 } 7483 7484 /* 7485 * Get all the hardware dependent attributes for a page struct 7486 */ 7487 static cpuset_t 7488 sfmmu_pagesync(struct page *pp, struct sf_hment *sfhme, 7489 uint_t clearflag) 7490 { 7491 caddr_t addr; 7492 tte_t tte, ttemod; 7493 struct hme_blk *hmeblkp; 7494 int ret; 7495 sfmmu_t *sfmmup; 7496 cpuset_t cpuset; 7497 7498 ASSERT(pp != NULL); 7499 ASSERT(sfmmu_mlist_held(pp)); 7500 ASSERT((clearflag == HAT_SYNC_DONTZERO) || 7501 (clearflag == HAT_SYNC_ZERORM)); 7502 7503 SFMMU_STAT(sf_pagesync); 7504 7505 CPUSET_ZERO(cpuset); 7506 7507 sfmmu_pagesync_retry: 7508 7509 sfmmu_copytte(&sfhme->hme_tte, &tte); 7510 if (TTE_IS_VALID(&tte)) { 7511 hmeblkp = sfmmu_hmetohblk(sfhme); 7512 sfmmup = hblktosfmmu(hmeblkp); 7513 addr = tte_to_vaddr(hmeblkp, tte); 7514 if (clearflag == HAT_SYNC_ZERORM) { 7515 ttemod = tte; 7516 TTE_CLR_RM(&ttemod); 7517 ret = sfmmu_modifytte_try(&tte, &ttemod, 7518 &sfhme->hme_tte); 7519 if (ret < 0) { 7520 /* 7521 * cas failed and the new value is not what 7522 * we want. 7523 */ 7524 goto sfmmu_pagesync_retry; 7525 } 7526 7527 if (ret > 0) { 7528 /* we win the cas */ 7529 if (hmeblkp->hblk_shared) { 7530 sf_srd_t *srdp = (sf_srd_t *)sfmmup; 7531 uint_t rid = 7532 hmeblkp->hblk_tag.htag_rid; 7533 sf_region_t *rgnp; 7534 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 7535 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 7536 ASSERT(srdp != NULL); 7537 rgnp = srdp->srd_hmergnp[rid]; 7538 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, 7539 srdp, rgnp, rid); 7540 cpuset = sfmmu_rgntlb_demap(addr, 7541 rgnp, hmeblkp, 1); 7542 } else { 7543 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 7544 0, 0); 7545 cpuset = sfmmup->sfmmu_cpusran; 7546 } 7547 } 7548 } 7549 sfmmu_ttesync(hmeblkp->hblk_shared ? NULL : sfmmup, addr, 7550 &tte, pp); 7551 } 7552 return (cpuset); 7553 } 7554 7555 /* 7556 * Remove write permission from a mappings to a page, so that 7557 * we can detect the next modification of it. This requires modifying 7558 * the TTE then invalidating (demap) any TLB entry using that TTE. 7559 * This code is similar to sfmmu_pagesync(). 
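 *
 * With the write and mod bits cleared, the next store to the page must
 * take a protection fault, and servicing that fault is what lets the
 * next modification be noticed.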
7560 */ 7561 static cpuset_t 7562 sfmmu_pageclrwrt(struct page *pp, struct sf_hment *sfhme) 7563 { 7564 caddr_t addr; 7565 tte_t tte; 7566 tte_t ttemod; 7567 struct hme_blk *hmeblkp; 7568 int ret; 7569 sfmmu_t *sfmmup; 7570 cpuset_t cpuset; 7571 7572 ASSERT(pp != NULL); 7573 ASSERT(sfmmu_mlist_held(pp)); 7574 7575 CPUSET_ZERO(cpuset); 7576 SFMMU_STAT(sf_clrwrt); 7577 7578 retry: 7579 7580 sfmmu_copytte(&sfhme->hme_tte, &tte); 7581 if (TTE_IS_VALID(&tte) && TTE_IS_WRITABLE(&tte)) { 7582 hmeblkp = sfmmu_hmetohblk(sfhme); 7583 sfmmup = hblktosfmmu(hmeblkp); 7584 addr = tte_to_vaddr(hmeblkp, tte); 7585 7586 ttemod = tte; 7587 TTE_CLR_WRT(&ttemod); 7588 TTE_CLR_MOD(&ttemod); 7589 ret = sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte); 7590 7591 /* 7592 * if cas failed and the new value is not what 7593 * we want retry 7594 */ 7595 if (ret < 0) 7596 goto retry; 7597 7598 /* we win the cas */ 7599 if (ret > 0) { 7600 if (hmeblkp->hblk_shared) { 7601 sf_srd_t *srdp = (sf_srd_t *)sfmmup; 7602 uint_t rid = hmeblkp->hblk_tag.htag_rid; 7603 sf_region_t *rgnp; 7604 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 7605 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 7606 ASSERT(srdp != NULL); 7607 rgnp = srdp->srd_hmergnp[rid]; 7608 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, 7609 srdp, rgnp, rid); 7610 cpuset = sfmmu_rgntlb_demap(addr, 7611 rgnp, hmeblkp, 1); 7612 } else { 7613 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0); 7614 cpuset = sfmmup->sfmmu_cpusran; 7615 } 7616 } 7617 } 7618 7619 return (cpuset); 7620 } 7621 7622 /* 7623 * Walk all mappings of a page, removing write permission and clearing the 7624 * ref/mod bits. This code is similar to hat_pagesync() 7625 */ 7626 static void 7627 hat_page_clrwrt(page_t *pp) 7628 { 7629 struct sf_hment *sfhme; 7630 struct sf_hment *tmphme = NULL; 7631 kmutex_t *pml; 7632 cpuset_t cpuset; 7633 cpuset_t tset; 7634 int index; 7635 int cons; 7636 7637 CPUSET_ZERO(cpuset); 7638 7639 pml = sfmmu_mlist_enter(pp); 7640 index = PP_MAPINDEX(pp); 7641 cons = TTE8K; 7642 retry: 7643 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) { 7644 tmphme = sfhme->hme_next; 7645 7646 /* 7647 * If we are looking for large mappings and this hme doesn't 7648 * reach the range we are seeking, just ignore its. 7649 */ 7650 7651 if (hme_size(sfhme) < cons) 7652 continue; 7653 7654 tset = sfmmu_pageclrwrt(pp, sfhme); 7655 CPUSET_OR(cpuset, tset); 7656 } 7657 7658 while (index) { 7659 index = index >> 1; 7660 cons++; 7661 if (index & 0x1) { 7662 /* Go to leading page */ 7663 pp = PP_GROUPLEADER(pp, cons); 7664 goto retry; 7665 } 7666 } 7667 7668 xt_sync(cpuset); 7669 sfmmu_mlist_exit(pml); 7670 } 7671 7672 /* 7673 * Set the given REF/MOD/RO bits for the given page. 7674 * For a vnode with a sorted v_pages list, we need to change 7675 * the attributes and the v_pages list together under page_vnode_mutex. 
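 *
 * P_NSH may be or'ed into 'flag' to set an attribute without shuffling
 * the page to the front of a VMODSORT vnode's v_pages list; it is
 * stripped below before the attribute bits are applied.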
7676 */ 7677 void 7678 hat_page_setattr(page_t *pp, uint_t flag) 7679 { 7680 vnode_t *vp = pp->p_vnode; 7681 page_t **listp; 7682 kmutex_t *pmtx; 7683 kmutex_t *vphm = NULL; 7684 int noshuffle; 7685 7686 noshuffle = flag & P_NSH; 7687 flag &= ~P_NSH; 7688 7689 ASSERT(!(flag & ~(P_MOD | P_REF | P_RO))); 7690 7691 /* 7692 * nothing to do if attribute already set 7693 */ 7694 if ((pp->p_nrm & flag) == flag) 7695 return; 7696 7697 if ((flag & P_MOD) != 0 && vp != NULL && IS_VMODSORT(vp) && 7698 !noshuffle) { 7699 vphm = page_vnode_mutex(vp); 7700 mutex_enter(vphm); 7701 } 7702 7703 pmtx = sfmmu_page_enter(pp); 7704 pp->p_nrm |= flag; 7705 sfmmu_page_exit(pmtx); 7706 7707 if (vphm != NULL) { 7708 /* 7709 * Some File Systems examine v_pages for NULL w/o 7710 * grabbing the vphm mutex. Must not let it become NULL when 7711 * pp is the only page on the list. 7712 */ 7713 if (pp->p_vpnext != pp) { 7714 page_vpsub(&vp->v_pages, pp); 7715 if (vp->v_pages != NULL) 7716 listp = &vp->v_pages->p_vpprev->p_vpnext; 7717 else 7718 listp = &vp->v_pages; 7719 page_vpadd(listp, pp); 7720 } 7721 mutex_exit(vphm); 7722 } 7723 } 7724 7725 void 7726 hat_page_clrattr(page_t *pp, uint_t flag) 7727 { 7728 vnode_t *vp = pp->p_vnode; 7729 kmutex_t *pmtx; 7730 7731 ASSERT(!(flag & ~(P_MOD | P_REF | P_RO))); 7732 7733 pmtx = sfmmu_page_enter(pp); 7734 7735 /* 7736 * Caller is expected to hold page's io lock for VMODSORT to work 7737 * correctly with pvn_vplist_dirty() and pvn_getdirty() when mod 7738 * bit is cleared. 7739 * We don't have assert to avoid tripping some existing third party 7740 * code. The dirty page is moved back to top of the v_page list 7741 * after IO is done in pvn_write_done(). 7742 */ 7743 pp->p_nrm &= ~flag; 7744 sfmmu_page_exit(pmtx); 7745 7746 if ((flag & P_MOD) != 0 && vp != NULL && IS_VMODSORT(vp)) { 7747 7748 /* 7749 * VMODSORT works by removing write permissions and getting 7750 * a fault when a page is made dirty. At this point 7751 * we need to remove write permission from all mappings 7752 * to this page. 7753 */ 7754 hat_page_clrwrt(pp); 7755 } 7756 } 7757 7758 uint_t 7759 hat_page_getattr(page_t *pp, uint_t flag) 7760 { 7761 ASSERT(!(flag & ~(P_MOD | P_REF | P_RO))); 7762 return ((uint_t)(pp->p_nrm & flag)); 7763 } 7764 7765 /* 7766 * DEBUG kernels: verify that a kernel va<->pa translation 7767 * is safe by checking the underlying page_t is in a page 7768 * relocation-safe state. 7769 */ 7770 #ifdef DEBUG 7771 void 7772 sfmmu_check_kpfn(pfn_t pfn) 7773 { 7774 page_t *pp; 7775 int index, cons; 7776 7777 if (hat_check_vtop == 0) 7778 return; 7779 7780 if (kvseg.s_base == NULL || panicstr) 7781 return; 7782 7783 pp = page_numtopp_nolock(pfn); 7784 if (!pp) 7785 return; 7786 7787 if (PAGE_LOCKED(pp) || PP_ISNORELOC(pp)) 7788 return; 7789 7790 /* 7791 * Handed a large kernel page, we dig up the root page since we 7792 * know the root page might have the lock also. 7793 */ 7794 if (pp->p_szc != 0) { 7795 index = PP_MAPINDEX(pp); 7796 cons = TTE8K; 7797 again: 7798 while (index != 0) { 7799 index >>= 1; 7800 if (index != 0) 7801 cons++; 7802 if (index & 0x1) { 7803 pp = PP_GROUPLEADER(pp, cons); 7804 goto again; 7805 } 7806 } 7807 } 7808 7809 if (PAGE_LOCKED(pp) || PP_ISNORELOC(pp)) 7810 return; 7811 7812 /* 7813 * Pages need to be locked or allocated "permanent" (either from 7814 * static_arena arena or explicitly setting PG_NORELOC when calling 7815 * page_create_va()) for VA->PA translations to be valid. 
7816 */ 7817 if (!PP_ISNORELOC(pp)) 7818 panic("Illegal VA->PA translation, pp 0x%p not permanent", 7819 (void *)pp); 7820 else 7821 panic("Illegal VA->PA translation, pp 0x%p not locked", 7822 (void *)pp); 7823 } 7824 #endif /* DEBUG */ 7825 7826 /* 7827 * Returns a page frame number for a given virtual address. 7828 * Returns PFN_INVALID to indicate an invalid mapping 7829 */ 7830 pfn_t 7831 hat_getpfnum(struct hat *hat, caddr_t addr) 7832 { 7833 pfn_t pfn; 7834 tte_t tte; 7835 7836 /* 7837 * We would like to 7838 * ASSERT(AS_LOCK_HELD(as)); 7839 * but we can't because the iommu driver will call this 7840 * routine at interrupt time and it can't grab the as lock 7841 * or it will deadlock: A thread could have the as lock 7842 * and be waiting for io. The io can't complete 7843 * because the interrupt thread is blocked trying to grab 7844 * the as lock. 7845 */ 7846 7847 if (hat == ksfmmup) { 7848 if (IS_KMEM_VA_LARGEPAGE(addr)) { 7849 ASSERT(segkmem_lpszc > 0); 7850 pfn = sfmmu_kvaszc2pfn(addr, segkmem_lpszc); 7851 if (pfn != PFN_INVALID) { 7852 sfmmu_check_kpfn(pfn); 7853 return (pfn); 7854 } 7855 } else if (segkpm && IS_KPM_ADDR(addr)) { 7856 return (sfmmu_kpm_vatopfn(addr)); 7857 } 7858 while ((pfn = sfmmu_vatopfn(addr, ksfmmup, &tte)) 7859 == PFN_SUSPENDED) { 7860 sfmmu_vatopfn_suspended(addr, ksfmmup, &tte); 7861 } 7862 sfmmu_check_kpfn(pfn); 7863 return (pfn); 7864 } else { 7865 return (sfmmu_uvatopfn(addr, hat, NULL)); 7866 } 7867 } 7868 7869 /* 7870 * This routine will return both pfn and tte for the vaddr. 7871 */ 7872 static pfn_t 7873 sfmmu_uvatopfn(caddr_t vaddr, struct hat *sfmmup, tte_t *ttep) 7874 { 7875 struct hmehash_bucket *hmebp; 7876 hmeblk_tag hblktag; 7877 int hmeshift, hashno = 1; 7878 struct hme_blk *hmeblkp = NULL; 7879 tte_t tte; 7880 7881 struct sf_hment *sfhmep; 7882 pfn_t pfn; 7883 7884 /* support for ISM */ 7885 ism_map_t *ism_map; 7886 ism_blk_t *ism_blkp; 7887 int i; 7888 sfmmu_t *ism_hatid = NULL; 7889 sfmmu_t *locked_hatid = NULL; 7890 sfmmu_t *sv_sfmmup = sfmmup; 7891 caddr_t sv_vaddr = vaddr; 7892 sf_srd_t *srdp; 7893 7894 if (ttep == NULL) { 7895 ttep = &tte; 7896 } else { 7897 ttep->ll = 0; 7898 } 7899 7900 ASSERT(sfmmup != ksfmmup); 7901 SFMMU_STAT(sf_user_vtop); 7902 /* 7903 * Set ism_hatid if vaddr falls in a ISM segment. 
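 *
 * When it does, the lookup is redirected: sfmmup is switched to the
 * ISM hat and vaddr is rebased to its offset within the ISM segment
 * before the hash probes below.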
7904 */ 7905 ism_blkp = sfmmup->sfmmu_iblk; 7906 if (ism_blkp != NULL) { 7907 sfmmu_ismhat_enter(sfmmup, 0); 7908 locked_hatid = sfmmup; 7909 } 7910 while (ism_blkp != NULL && ism_hatid == NULL) { 7911 ism_map = ism_blkp->iblk_maps; 7912 for (i = 0; ism_map[i].imap_ismhat && i < ISM_MAP_SLOTS; i++) { 7913 if (vaddr >= ism_start(ism_map[i]) && 7914 vaddr < ism_end(ism_map[i])) { 7915 sfmmup = ism_hatid = ism_map[i].imap_ismhat; 7916 vaddr = (caddr_t)(vaddr - 7917 ism_start(ism_map[i])); 7918 break; 7919 } 7920 } 7921 ism_blkp = ism_blkp->iblk_next; 7922 } 7923 if (locked_hatid) { 7924 sfmmu_ismhat_exit(locked_hatid, 0); 7925 } 7926 7927 hblktag.htag_id = sfmmup; 7928 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 7929 do { 7930 hmeshift = HME_HASH_SHIFT(hashno); 7931 hblktag.htag_bspage = HME_HASH_BSPAGE(vaddr, hmeshift); 7932 hblktag.htag_rehash = hashno; 7933 hmebp = HME_HASH_FUNCTION(sfmmup, vaddr, hmeshift); 7934 7935 SFMMU_HASH_LOCK(hmebp); 7936 7937 HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp); 7938 if (hmeblkp != NULL) { 7939 ASSERT(!hmeblkp->hblk_shared); 7940 HBLKTOHME(sfhmep, hmeblkp, vaddr); 7941 sfmmu_copytte(&sfhmep->hme_tte, ttep); 7942 SFMMU_HASH_UNLOCK(hmebp); 7943 if (TTE_IS_VALID(ttep)) { 7944 pfn = TTE_TO_PFN(vaddr, ttep); 7945 return (pfn); 7946 } 7947 break; 7948 } 7949 SFMMU_HASH_UNLOCK(hmebp); 7950 hashno++; 7951 } while (HME_REHASH(sfmmup) && (hashno <= mmu_hashcnt)); 7952 7953 if (SF_HMERGNMAP_ISNULL(sv_sfmmup)) { 7954 return (PFN_INVALID); 7955 } 7956 srdp = sv_sfmmup->sfmmu_srdp; 7957 ASSERT(srdp != NULL); 7958 ASSERT(srdp->srd_refcnt != 0); 7959 hblktag.htag_id = srdp; 7960 hashno = 1; 7961 do { 7962 hmeshift = HME_HASH_SHIFT(hashno); 7963 hblktag.htag_bspage = HME_HASH_BSPAGE(sv_vaddr, hmeshift); 7964 hblktag.htag_rehash = hashno; 7965 hmebp = HME_HASH_FUNCTION(srdp, sv_vaddr, hmeshift); 7966 7967 SFMMU_HASH_LOCK(hmebp); 7968 for (hmeblkp = hmebp->hmeblkp; hmeblkp != NULL; 7969 hmeblkp = hmeblkp->hblk_next) { 7970 uint_t rid; 7971 sf_region_t *rgnp; 7972 caddr_t rsaddr; 7973 caddr_t readdr; 7974 7975 if (!HTAGS_EQ_SHME(hmeblkp->hblk_tag, hblktag, 7976 sv_sfmmup->sfmmu_hmeregion_map)) { 7977 continue; 7978 } 7979 ASSERT(hmeblkp->hblk_shared); 7980 rid = hmeblkp->hblk_tag.htag_rid; 7981 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 7982 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 7983 rgnp = srdp->srd_hmergnp[rid]; 7984 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid); 7985 HBLKTOHME(sfhmep, hmeblkp, sv_vaddr); 7986 sfmmu_copytte(&sfhmep->hme_tte, ttep); 7987 rsaddr = rgnp->rgn_saddr; 7988 readdr = rsaddr + rgnp->rgn_size; 7989 #ifdef DEBUG 7990 if (TTE_IS_VALID(ttep) || 7991 get_hblk_ttesz(hmeblkp) > TTE8K) { 7992 caddr_t eva = tte_to_evaddr(hmeblkp, ttep); 7993 ASSERT(eva > sv_vaddr); 7994 ASSERT(sv_vaddr >= rsaddr); 7995 ASSERT(sv_vaddr < readdr); 7996 ASSERT(eva <= readdr); 7997 } 7998 #endif /* DEBUG */ 7999 /* 8000 * Continue the search if we 8001 * found an invalid 8K tte outside of the area 8002 * covered by this hmeblk's region. 
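 *
 * The if/else chain below therefore resolves as follows (a sketch of
 * the four possible outcomes):
 *
 *	valid tte                                - return its pfn
 *	invalid tte in a hmeblk larger than 8K   - return PFN_INVALID
 *	invalid 8K tte, vaddr inside the region  - return PFN_INVALID
 *	invalid 8K tte, vaddr outside the region - keep searching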
8003 */ 8004 if (TTE_IS_VALID(ttep)) { 8005 SFMMU_HASH_UNLOCK(hmebp); 8006 pfn = TTE_TO_PFN(sv_vaddr, ttep); 8007 return (pfn); 8008 } else if (get_hblk_ttesz(hmeblkp) > TTE8K || 8009 (sv_vaddr >= rsaddr && sv_vaddr < readdr)) { 8010 SFMMU_HASH_UNLOCK(hmebp); 8011 pfn = PFN_INVALID; 8012 return (pfn); 8013 } 8014 } 8015 SFMMU_HASH_UNLOCK(hmebp); 8016 hashno++; 8017 } while (hashno <= mmu_hashcnt); 8018 return (PFN_INVALID); 8019 } 8020 8021 8022 /* 8023 * For compatability with AT&T and later optimizations 8024 */ 8025 /* ARGSUSED */ 8026 void 8027 hat_map(struct hat *hat, caddr_t addr, size_t len, uint_t flags) 8028 { 8029 ASSERT(hat != NULL); 8030 } 8031 8032 /* 8033 * Return the number of mappings to a particular page. This number is an 8034 * approximation of the number of people sharing the page. 8035 * 8036 * shared hmeblks or ism hmeblks are counted as 1 mapping here. 8037 * hat_page_checkshare() can be used to compare threshold to share 8038 * count that reflects the number of region sharers albeit at higher cost. 8039 */ 8040 ulong_t 8041 hat_page_getshare(page_t *pp) 8042 { 8043 page_t *spp = pp; /* start page */ 8044 kmutex_t *pml; 8045 ulong_t cnt; 8046 int index, sz = TTE64K; 8047 8048 /* 8049 * We need to grab the mlist lock to make sure any outstanding 8050 * load/unloads complete. Otherwise we could return zero 8051 * even though the unload(s) hasn't finished yet. 8052 */ 8053 pml = sfmmu_mlist_enter(spp); 8054 cnt = spp->p_share; 8055 8056 #ifdef VAC 8057 if (kpm_enable) 8058 cnt += spp->p_kpmref; 8059 #endif 8060 if (vpm_enable && pp->p_vpmref) { 8061 cnt += 1; 8062 } 8063 8064 /* 8065 * If we have any large mappings, we count the number of 8066 * mappings that this large page is part of. 8067 */ 8068 index = PP_MAPINDEX(spp); 8069 index >>= 1; 8070 while (index) { 8071 pp = PP_GROUPLEADER(spp, sz); 8072 if ((index & 0x1) && pp != spp) { 8073 cnt += pp->p_share; 8074 spp = pp; 8075 } 8076 index >>= 1; 8077 sz++; 8078 } 8079 sfmmu_mlist_exit(pml); 8080 return (cnt); 8081 } 8082 8083 /* 8084 * Return 1 if the number of mappings exceeds sh_thresh. Return 0 8085 * otherwise. Count shared hmeblks by region's refcnt. 
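 *
 * A minimal usage sketch, assuming a caller-chosen threshold
 * my_share_thresh (hypothetical, not defined in this file):
 *
 *	if (hat_page_checkshare(pp, my_share_thresh))
 *		... treat pp as heavily shared and back off ...
 *
 * Unlike hat_page_getshare(), which counts a shared hmeblk as a single
 * mapping, this routine adds the owning region's refcnt for shared
 * hmeblks, so the comparison reflects the number of region sharers.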
8086 */ 8087 int 8088 hat_page_checkshare(page_t *pp, ulong_t sh_thresh) 8089 { 8090 kmutex_t *pml; 8091 ulong_t cnt = 0; 8092 int index, sz = TTE8K; 8093 struct sf_hment *sfhme, *tmphme = NULL; 8094 struct hme_blk *hmeblkp; 8095 8096 pml = sfmmu_mlist_enter(pp); 8097 8098 #ifdef VAC 8099 if (kpm_enable) 8100 cnt = pp->p_kpmref; 8101 #endif 8102 8103 if (vpm_enable && pp->p_vpmref) { 8104 cnt += 1; 8105 } 8106 8107 if (pp->p_share + cnt > sh_thresh) { 8108 sfmmu_mlist_exit(pml); 8109 return (1); 8110 } 8111 8112 index = PP_MAPINDEX(pp); 8113 8114 again: 8115 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) { 8116 tmphme = sfhme->hme_next; 8117 if (IS_PAHME(sfhme)) { 8118 continue; 8119 } 8120 8121 hmeblkp = sfmmu_hmetohblk(sfhme); 8122 if (hme_size(sfhme) != sz) { 8123 continue; 8124 } 8125 8126 if (hmeblkp->hblk_shared) { 8127 sf_srd_t *srdp = hblktosrd(hmeblkp); 8128 uint_t rid = hmeblkp->hblk_tag.htag_rid; 8129 sf_region_t *rgnp; 8130 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 8131 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 8132 ASSERT(srdp != NULL); 8133 rgnp = srdp->srd_hmergnp[rid]; 8134 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, 8135 rgnp, rid); 8136 cnt += rgnp->rgn_refcnt; 8137 } else { 8138 cnt++; 8139 } 8140 if (cnt > sh_thresh) { 8141 sfmmu_mlist_exit(pml); 8142 return (1); 8143 } 8144 } 8145 8146 index >>= 1; 8147 sz++; 8148 while (index) { 8149 pp = PP_GROUPLEADER(pp, sz); 8150 ASSERT(sfmmu_mlist_held(pp)); 8151 if (index & 0x1) { 8152 goto again; 8153 } 8154 index >>= 1; 8155 sz++; 8156 } 8157 sfmmu_mlist_exit(pml); 8158 return (0); 8159 } 8160 8161 /* 8162 * Unload all large mappings to the pp and reset the p_szc field of every 8163 * constituent page according to the remaining mappings. 8164 * 8165 * pp must be locked SE_EXCL. Even though no other constituent pages are 8166 * locked it's legal to unload the large mappings to the pp because all 8167 * constituent pages of large locked mappings have to be locked SE_SHARED. 8168 * This means if we have SE_EXCL lock on one of constituent pages none of the 8169 * large mappings to pp are locked. 8170 * 8171 * Decrease p_szc field starting from the last constituent page and ending 8172 * with the root page. This method is used because other threads rely on the 8173 * root's p_szc to find the lock to syncronize on. After a root page_t's p_szc 8174 * is demoted then other threads will succeed in sfmmu_mlspl_enter(). This 8175 * ensures that p_szc changes of the constituent pages appears atomic for all 8176 * threads that use sfmmu_mlspl_enter() to examine p_szc field. 8177 * 8178 * This mechanism is only used for file system pages where it's not always 8179 * possible to get SE_EXCL locks on all constituent pages to demote the size 8180 * code (as is done for anonymous or kernel large pages). 8181 * 8182 * See more comments in front of sfmmu_mlspl_enter(). 
8183 */ 8184 void 8185 hat_page_demote(page_t *pp) 8186 { 8187 int index; 8188 int sz; 8189 cpuset_t cpuset; 8190 int sync = 0; 8191 page_t *rootpp; 8192 struct sf_hment *sfhme; 8193 struct sf_hment *tmphme = NULL; 8194 uint_t pszc; 8195 page_t *lastpp; 8196 cpuset_t tset; 8197 pgcnt_t npgs; 8198 kmutex_t *pml; 8199 kmutex_t *pmtx = NULL; 8200 8201 ASSERT(PAGE_EXCL(pp)); 8202 ASSERT(!PP_ISFREE(pp)); 8203 ASSERT(!PP_ISKAS(pp)); 8204 ASSERT(page_szc_lock_assert(pp)); 8205 pml = sfmmu_mlist_enter(pp); 8206 8207 pszc = pp->p_szc; 8208 if (pszc == 0) { 8209 goto out; 8210 } 8211 8212 index = PP_MAPINDEX(pp) >> 1; 8213 8214 if (index) { 8215 CPUSET_ZERO(cpuset); 8216 sz = TTE64K; 8217 sync = 1; 8218 } 8219 8220 while (index) { 8221 if (!(index & 0x1)) { 8222 index >>= 1; 8223 sz++; 8224 continue; 8225 } 8226 ASSERT(sz <= pszc); 8227 rootpp = PP_GROUPLEADER(pp, sz); 8228 for (sfhme = rootpp->p_mapping; sfhme; sfhme = tmphme) { 8229 tmphme = sfhme->hme_next; 8230 ASSERT(!IS_PAHME(sfhme)); 8231 if (hme_size(sfhme) != sz) { 8232 continue; 8233 } 8234 tset = sfmmu_pageunload(rootpp, sfhme, sz); 8235 CPUSET_OR(cpuset, tset); 8236 } 8237 if (index >>= 1) { 8238 sz++; 8239 } 8240 } 8241 8242 ASSERT(!PP_ISMAPPED_LARGE(pp)); 8243 8244 if (sync) { 8245 xt_sync(cpuset); 8246 #ifdef VAC 8247 if (PP_ISTNC(pp)) { 8248 conv_tnc(rootpp, sz); 8249 } 8250 #endif /* VAC */ 8251 } 8252 8253 pmtx = sfmmu_page_enter(pp); 8254 8255 ASSERT(pp->p_szc == pszc); 8256 rootpp = PP_PAGEROOT(pp); 8257 ASSERT(rootpp->p_szc == pszc); 8258 lastpp = PP_PAGENEXT_N(rootpp, TTEPAGES(pszc) - 1); 8259 8260 while (lastpp != rootpp) { 8261 sz = PP_MAPINDEX(lastpp) ? fnd_mapping_sz(lastpp) : 0; 8262 ASSERT(sz < pszc); 8263 npgs = (sz == 0) ? 1 : TTEPAGES(sz); 8264 ASSERT(P2PHASE(lastpp->p_pagenum, npgs) == npgs - 1); 8265 while (--npgs > 0) { 8266 lastpp->p_szc = (uchar_t)sz; 8267 lastpp = PP_PAGEPREV(lastpp); 8268 } 8269 if (sz) { 8270 /* 8271 * make sure before current root's pszc 8272 * is updated all updates to constituent pages pszc 8273 * fields are globally visible. 8274 */ 8275 membar_producer(); 8276 } 8277 lastpp->p_szc = sz; 8278 ASSERT(IS_P2ALIGNED(lastpp->p_pagenum, TTEPAGES(sz))); 8279 if (lastpp != rootpp) { 8280 lastpp = PP_PAGEPREV(lastpp); 8281 } 8282 } 8283 if (sz == 0) { 8284 /* the loop above doesn't cover this case */ 8285 rootpp->p_szc = 0; 8286 } 8287 out: 8288 ASSERT(pp->p_szc == 0); 8289 if (pmtx != NULL) { 8290 sfmmu_page_exit(pmtx); 8291 } 8292 sfmmu_mlist_exit(pml); 8293 } 8294 8295 /* 8296 * Refresh the HAT ismttecnt[] element for size szc. 8297 * Caller must have set ISM busy flag to prevent mapping 8298 * lists from changing while we're traversing them. 
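 *
 * The per-pagesize counts are split into two running totals (a sketch
 * of the bookkeeping done below):
 *
 *	sfmmu_ismttecnt[szc]    - ISM segments not in the process' SCD
 *	sfmmu_scdismttecnt[szc] - ISM segments that are in the SCD
 *
 * Only the non-SCD total is returned to the caller.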
8299 */ 8300 pgcnt_t 8301 ism_tsb_entries(sfmmu_t *sfmmup, int szc) 8302 { 8303 ism_blk_t *ism_blkp = sfmmup->sfmmu_iblk; 8304 ism_map_t *ism_map; 8305 pgcnt_t npgs = 0; 8306 pgcnt_t npgs_scd = 0; 8307 int j; 8308 sf_scd_t *scdp; 8309 uchar_t rid; 8310 8311 ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)); 8312 scdp = sfmmup->sfmmu_scdp; 8313 8314 for (; ism_blkp != NULL; ism_blkp = ism_blkp->iblk_next) { 8315 ism_map = ism_blkp->iblk_maps; 8316 for (j = 0; ism_map[j].imap_ismhat && j < ISM_MAP_SLOTS; j++) { 8317 rid = ism_map[j].imap_rid; 8318 ASSERT(rid == SFMMU_INVALID_ISMRID || 8319 rid < sfmmup->sfmmu_srdp->srd_next_ismrid); 8320 8321 if (scdp != NULL && rid != SFMMU_INVALID_ISMRID && 8322 SF_RGNMAP_TEST(scdp->scd_ismregion_map, rid)) { 8323 /* ISM is in sfmmup's SCD */ 8324 npgs_scd += 8325 ism_map[j].imap_ismhat->sfmmu_ttecnt[szc]; 8326 } else { 8327 /* ISMs is not in SCD */ 8328 npgs += 8329 ism_map[j].imap_ismhat->sfmmu_ttecnt[szc]; 8330 } 8331 } 8332 } 8333 sfmmup->sfmmu_ismttecnt[szc] = npgs; 8334 sfmmup->sfmmu_scdismttecnt[szc] = npgs_scd; 8335 return (npgs); 8336 } 8337 8338 /* 8339 * Yield the memory claim requirement for an address space. 8340 * 8341 * This is currently implemented as the number of bytes that have active 8342 * hardware translations that have page structures. Therefore, it can 8343 * underestimate the traditional resident set size, eg, if the 8344 * physical page is present and the hardware translation is missing; 8345 * and it can overestimate the rss, eg, if there are active 8346 * translations to a frame buffer with page structs. 8347 * Also, it does not take sharing into account. 8348 * 8349 * Note that we don't acquire locks here since this function is most often 8350 * called from the clock thread. 8351 */ 8352 size_t 8353 hat_get_mapped_size(struct hat *hat) 8354 { 8355 size_t assize = 0; 8356 int i; 8357 8358 if (hat == NULL) 8359 return (0); 8360 8361 for (i = 0; i < mmu_page_sizes; i++) 8362 assize += ((pgcnt_t)hat->sfmmu_ttecnt[i] + 8363 (pgcnt_t)hat->sfmmu_scdrttecnt[i]) * TTEBYTES(i); 8364 8365 if (hat->sfmmu_iblk == NULL) 8366 return (assize); 8367 8368 for (i = 0; i < mmu_page_sizes; i++) 8369 assize += ((pgcnt_t)hat->sfmmu_ismttecnt[i] + 8370 (pgcnt_t)hat->sfmmu_scdismttecnt[i]) * TTEBYTES(i); 8371 8372 return (assize); 8373 } 8374 8375 int 8376 hat_stats_enable(struct hat *hat) 8377 { 8378 hatlock_t *hatlockp; 8379 8380 hatlockp = sfmmu_hat_enter(hat); 8381 hat->sfmmu_rmstat++; 8382 sfmmu_hat_exit(hatlockp); 8383 return (1); 8384 } 8385 8386 void 8387 hat_stats_disable(struct hat *hat) 8388 { 8389 hatlock_t *hatlockp; 8390 8391 hatlockp = sfmmu_hat_enter(hat); 8392 hat->sfmmu_rmstat--; 8393 sfmmu_hat_exit(hatlockp); 8394 } 8395 8396 /* 8397 * Routines for entering or removing ourselves from the 8398 * ism_hat's mapping list. This is used for both private and 8399 * SCD hats. 
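 *
 * The list is doubly linked, headed at ism_hat->sfmmu_iment and
 * protected by ism_mlist_lock, so callers follow the pattern:
 *
 *	mutex_enter(&ism_mlist_lock);
 *	iment_add(ism_ment, ism_hatid);		(or iment_sub())
 *	mutex_exit(&ism_mlist_lock);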
8400 */ 8401 static void 8402 iment_add(struct ism_ment *iment, struct hat *ism_hat) 8403 { 8404 ASSERT(MUTEX_HELD(&ism_mlist_lock)); 8405 8406 iment->iment_prev = NULL; 8407 iment->iment_next = ism_hat->sfmmu_iment; 8408 if (ism_hat->sfmmu_iment) { 8409 ism_hat->sfmmu_iment->iment_prev = iment; 8410 } 8411 ism_hat->sfmmu_iment = iment; 8412 } 8413 8414 static void 8415 iment_sub(struct ism_ment *iment, struct hat *ism_hat) 8416 { 8417 ASSERT(MUTEX_HELD(&ism_mlist_lock)); 8418 8419 if (ism_hat->sfmmu_iment == NULL) { 8420 panic("ism map entry remove - no entries"); 8421 } 8422 8423 if (iment->iment_prev) { 8424 ASSERT(ism_hat->sfmmu_iment != iment); 8425 iment->iment_prev->iment_next = iment->iment_next; 8426 } else { 8427 ASSERT(ism_hat->sfmmu_iment == iment); 8428 ism_hat->sfmmu_iment = iment->iment_next; 8429 } 8430 8431 if (iment->iment_next) { 8432 iment->iment_next->iment_prev = iment->iment_prev; 8433 } 8434 8435 /* 8436 * zero out the entry 8437 */ 8438 iment->iment_next = NULL; 8439 iment->iment_prev = NULL; 8440 iment->iment_hat = NULL; 8441 iment->iment_base_va = 0; 8442 } 8443 8444 /* 8445 * Hat_share()/unshare() return an (non-zero) error 8446 * when saddr and daddr are not properly aligned. 8447 * 8448 * The top level mapping element determines the alignment 8449 * requirement for saddr and daddr, depending on different 8450 * architectures. 8451 * 8452 * When hat_share()/unshare() are not supported, 8453 * HATOP_SHARE()/UNSHARE() return 0 8454 */ 8455 int 8456 hat_share(struct hat *sfmmup, caddr_t addr, 8457 struct hat *ism_hatid, caddr_t sptaddr, size_t len, uint_t ismszc) 8458 { 8459 ism_blk_t *ism_blkp; 8460 ism_blk_t *new_iblk; 8461 ism_map_t *ism_map; 8462 ism_ment_t *ism_ment; 8463 int i, added; 8464 hatlock_t *hatlockp; 8465 int reload_mmu = 0; 8466 uint_t ismshift = page_get_shift(ismszc); 8467 size_t ismpgsz = page_get_pagesize(ismszc); 8468 uint_t ismmask = (uint_t)ismpgsz - 1; 8469 size_t sh_size = ISM_SHIFT(ismshift, len); 8470 ushort_t ismhatflag; 8471 hat_region_cookie_t rcookie; 8472 sf_scd_t *old_scdp; 8473 8474 #ifdef DEBUG 8475 caddr_t eaddr = addr + len; 8476 #endif /* DEBUG */ 8477 8478 ASSERT(ism_hatid != NULL && sfmmup != NULL); 8479 ASSERT(sptaddr == ISMID_STARTADDR); 8480 /* 8481 * Check the alignment. 8482 */ 8483 if (!ISM_ALIGNED(ismshift, addr) || !ISM_ALIGNED(ismshift, sptaddr)) 8484 return (EINVAL); 8485 8486 /* 8487 * Check size alignment. 8488 */ 8489 if (!ISM_ALIGNED(ismshift, len)) 8490 return (EINVAL); 8491 8492 /* 8493 * Allocate ism_ment for the ism_hat's mapping list, and an 8494 * ism map blk in case we need one. We must do our 8495 * allocations before acquiring locks to prevent a deadlock 8496 * in the kmem allocator on the mapping list lock. 8497 */ 8498 new_iblk = kmem_cache_alloc(ism_blk_cache, KM_SLEEP); 8499 ism_ment = kmem_cache_alloc(ism_ment_cache, KM_SLEEP); 8500 8501 /* 8502 * Serialize ISM mappings with the ISM busy flag, and also the 8503 * trap handlers. 8504 */ 8505 sfmmu_ismhat_enter(sfmmup, 0); 8506 8507 /* 8508 * Allocate an ism map blk if necessary. 8509 */ 8510 if (sfmmup->sfmmu_iblk == NULL) { 8511 sfmmup->sfmmu_iblk = new_iblk; 8512 bzero(new_iblk, sizeof (*new_iblk)); 8513 new_iblk->iblk_nextpa = (uint64_t)-1; 8514 membar_stst(); /* make sure next ptr visible to all CPUs */ 8515 sfmmup->sfmmu_ismblkpa = va_to_pa((caddr_t)new_iblk); 8516 reload_mmu = 1; 8517 new_iblk = NULL; 8518 } 8519 8520 #ifdef DEBUG 8521 /* 8522 * Make sure mapping does not already exist. 
8523 */ 8524 ism_blkp = sfmmup->sfmmu_iblk; 8525 while (ism_blkp != NULL) { 8526 ism_map = ism_blkp->iblk_maps; 8527 for (i = 0; i < ISM_MAP_SLOTS && ism_map[i].imap_ismhat; i++) { 8528 if ((addr >= ism_start(ism_map[i]) && 8529 addr < ism_end(ism_map[i])) || 8530 eaddr > ism_start(ism_map[i]) && 8531 eaddr <= ism_end(ism_map[i])) { 8532 panic("sfmmu_share: Already mapped!"); 8533 } 8534 } 8535 ism_blkp = ism_blkp->iblk_next; 8536 } 8537 #endif /* DEBUG */ 8538 8539 ASSERT(ismszc >= TTE4M); 8540 if (ismszc == TTE4M) { 8541 ismhatflag = HAT_4M_FLAG; 8542 } else if (ismszc == TTE32M) { 8543 ismhatflag = HAT_32M_FLAG; 8544 } else if (ismszc == TTE256M) { 8545 ismhatflag = HAT_256M_FLAG; 8546 } 8547 /* 8548 * Add mapping to first available mapping slot. 8549 */ 8550 ism_blkp = sfmmup->sfmmu_iblk; 8551 added = 0; 8552 while (!added) { 8553 ism_map = ism_blkp->iblk_maps; 8554 for (i = 0; i < ISM_MAP_SLOTS; i++) { 8555 if (ism_map[i].imap_ismhat == NULL) { 8556 8557 ism_map[i].imap_ismhat = ism_hatid; 8558 ism_map[i].imap_vb_shift = (uchar_t)ismshift; 8559 ism_map[i].imap_rid = SFMMU_INVALID_ISMRID; 8560 ism_map[i].imap_hatflags = ismhatflag; 8561 ism_map[i].imap_sz_mask = ismmask; 8562 /* 8563 * imap_seg is checked in ISM_CHECK to see if 8564 * non-NULL, then other info assumed valid. 8565 */ 8566 membar_stst(); 8567 ism_map[i].imap_seg = (uintptr_t)addr | sh_size; 8568 ism_map[i].imap_ment = ism_ment; 8569 8570 /* 8571 * Now add ourselves to the ism_hat's 8572 * mapping list. 8573 */ 8574 ism_ment->iment_hat = sfmmup; 8575 ism_ment->iment_base_va = addr; 8576 ism_hatid->sfmmu_ismhat = 1; 8577 mutex_enter(&ism_mlist_lock); 8578 iment_add(ism_ment, ism_hatid); 8579 mutex_exit(&ism_mlist_lock); 8580 added = 1; 8581 break; 8582 } 8583 } 8584 if (!added && ism_blkp->iblk_next == NULL) { 8585 ism_blkp->iblk_next = new_iblk; 8586 new_iblk = NULL; 8587 bzero(ism_blkp->iblk_next, 8588 sizeof (*ism_blkp->iblk_next)); 8589 ism_blkp->iblk_next->iblk_nextpa = (uint64_t)-1; 8590 membar_stst(); 8591 ism_blkp->iblk_nextpa = 8592 va_to_pa((caddr_t)ism_blkp->iblk_next); 8593 } 8594 ism_blkp = ism_blkp->iblk_next; 8595 } 8596 8597 /* 8598 * After calling hat_join_region, sfmmup may join a new SCD or 8599 * move from the old scd to a new scd, in which case, we want to 8600 * shrink the sfmmup's private tsb size, i.e., pass shrink to 8601 * sfmmu_check_page_sizes at the end of this routine. 8602 */ 8603 old_scdp = sfmmup->sfmmu_scdp; 8604 8605 rcookie = hat_join_region(sfmmup, addr, len, (void *)ism_hatid, 0, 8606 PROT_ALL, ismszc, NULL, HAT_REGION_ISM); 8607 if (rcookie != HAT_INVALID_REGION_COOKIE) { 8608 ism_map[i].imap_rid = (uchar_t)((uint64_t)rcookie); 8609 } 8610 /* 8611 * Update our counters for this sfmmup's ism mappings. 8612 */ 8613 for (i = 0; i <= ismszc; i++) { 8614 if (!(disable_ism_large_pages & (1 << i))) 8615 (void) ism_tsb_entries(sfmmup, i); 8616 } 8617 8618 /* 8619 * For ISM and DISM we do not support 512K pages, so we only only 8620 * search the 4M and 8K/64K hashes for 4 pagesize cpus, and search the 8621 * 256M or 32M, and 4M and 8K/64K hashes for 6 pagesize cpus. 8622 * 8623 * Need to set 32M/256M ISM flags to make sure 8624 * sfmmu_check_page_sizes() enables them on Panther. 
8625 */ 8626 ASSERT((disable_ism_large_pages & (1 << TTE512K)) != 0); 8627 8628 switch (ismszc) { 8629 case TTE256M: 8630 if (!SFMMU_FLAGS_ISSET(sfmmup, HAT_256M_ISM)) { 8631 hatlockp = sfmmu_hat_enter(sfmmup); 8632 SFMMU_FLAGS_SET(sfmmup, HAT_256M_ISM); 8633 sfmmu_hat_exit(hatlockp); 8634 } 8635 break; 8636 case TTE32M: 8637 if (!SFMMU_FLAGS_ISSET(sfmmup, HAT_32M_ISM)) { 8638 hatlockp = sfmmu_hat_enter(sfmmup); 8639 SFMMU_FLAGS_SET(sfmmup, HAT_32M_ISM); 8640 sfmmu_hat_exit(hatlockp); 8641 } 8642 break; 8643 default: 8644 break; 8645 } 8646 8647 /* 8648 * If we updated the ismblkpa for this HAT we must make 8649 * sure all CPUs running this process reload their tsbmiss area. 8650 * Otherwise they will fail to load the mappings in the tsbmiss 8651 * handler and will loop calling pagefault(). 8652 */ 8653 if (reload_mmu) { 8654 hatlockp = sfmmu_hat_enter(sfmmup); 8655 sfmmu_sync_mmustate(sfmmup); 8656 sfmmu_hat_exit(hatlockp); 8657 } 8658 8659 sfmmu_ismhat_exit(sfmmup, 0); 8660 8661 /* 8662 * Free up ismblk if we didn't use it. 8663 */ 8664 if (new_iblk != NULL) 8665 kmem_cache_free(ism_blk_cache, new_iblk); 8666 8667 /* 8668 * Check TSB and TLB page sizes. 8669 */ 8670 if (sfmmup->sfmmu_scdp != NULL && old_scdp != sfmmup->sfmmu_scdp) { 8671 sfmmu_check_page_sizes(sfmmup, 0); 8672 } else { 8673 sfmmu_check_page_sizes(sfmmup, 1); 8674 } 8675 return (0); 8676 } 8677 8678 /* 8679 * hat_unshare removes exactly one ism_map from 8680 * this process's as. It expects multiple calls 8681 * to hat_unshare for multiple shm segments. 8682 */ 8683 void 8684 hat_unshare(struct hat *sfmmup, caddr_t addr, size_t len, uint_t ismszc) 8685 { 8686 ism_map_t *ism_map; 8687 ism_ment_t *free_ment = NULL; 8688 ism_blk_t *ism_blkp; 8689 struct hat *ism_hatid; 8690 int found, i; 8691 hatlock_t *hatlockp; 8692 struct tsb_info *tsbinfo; 8693 uint_t ismshift = page_get_shift(ismszc); 8694 size_t sh_size = ISM_SHIFT(ismshift, len); 8695 uchar_t ism_rid; 8696 sf_scd_t *old_scdp; 8697 8698 ASSERT(ISM_ALIGNED(ismshift, addr)); 8699 ASSERT(ISM_ALIGNED(ismshift, len)); 8700 ASSERT(sfmmup != NULL); 8701 ASSERT(sfmmup != ksfmmup); 8702 8703 ASSERT(sfmmup->sfmmu_as != NULL); 8704 8705 /* 8706 * Make sure that during the entire time ISM mappings are removed, 8707 * the trap handlers serialize behind us, and that no one else 8708 * can be mucking with ISM mappings. This also lets us get away 8709 * with not doing expensive cross calls to flush the TLB -- we 8710 * just discard the context, flush the entire TSB, and call it 8711 * a day. 8712 */ 8713 sfmmu_ismhat_enter(sfmmup, 0); 8714 8715 /* 8716 * Remove the mapping. 8717 * 8718 * We can't have any holes in the ism map. 8719 * The tsb miss code while searching the ism map will 8720 * stop on an empty map slot. So we must move 8721 * everyone past the hole up 1 if any. 8722 * 8723 * Also empty ism map blks are not freed until the 8724 * process exits. This is to prevent a MT race condition 8725 * between sfmmu_unshare() and sfmmu_tsbmiss_exception(). 
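 *
 * A sketch of the compaction done below once the matching slot i has
 * been found:
 *
 *	copy ism_map[i + 1] ... over ism_map[i] ... within this block;
 *	pull the first slot of the next ism_blk (if any) into the last
 *	    slot of this block and continue shifting there;
 *	zero the final slot so the tsb miss code stops on it.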
8726 */ 8727 found = 0; 8728 ism_blkp = sfmmup->sfmmu_iblk; 8729 while (!found && ism_blkp != NULL) { 8730 ism_map = ism_blkp->iblk_maps; 8731 for (i = 0; i < ISM_MAP_SLOTS; i++) { 8732 if (addr == ism_start(ism_map[i]) && 8733 sh_size == (size_t)(ism_size(ism_map[i]))) { 8734 found = 1; 8735 break; 8736 } 8737 } 8738 if (!found) 8739 ism_blkp = ism_blkp->iblk_next; 8740 } 8741 8742 if (found) { 8743 ism_hatid = ism_map[i].imap_ismhat; 8744 ism_rid = ism_map[i].imap_rid; 8745 ASSERT(ism_hatid != NULL); 8746 ASSERT(ism_hatid->sfmmu_ismhat == 1); 8747 8748 /* 8749 * After hat_leave_region, the sfmmup may leave SCD, 8750 * in which case, we want to grow the private tsb size when 8751 * calling sfmmu_check_page_sizes at the end of the routine. 8752 */ 8753 old_scdp = sfmmup->sfmmu_scdp; 8754 /* 8755 * Then remove ourselves from the region. 8756 */ 8757 if (ism_rid != SFMMU_INVALID_ISMRID) { 8758 hat_leave_region(sfmmup, (void *)((uint64_t)ism_rid), 8759 HAT_REGION_ISM); 8760 } 8761 8762 /* 8763 * And now guarantee that any other cpu 8764 * that tries to process an ISM miss 8765 * will go to tl=0. 8766 */ 8767 hatlockp = sfmmu_hat_enter(sfmmup); 8768 sfmmu_invalidate_ctx(sfmmup); 8769 sfmmu_hat_exit(hatlockp); 8770 8771 /* 8772 * Remove ourselves from the ism mapping list. 8773 */ 8774 mutex_enter(&ism_mlist_lock); 8775 iment_sub(ism_map[i].imap_ment, ism_hatid); 8776 mutex_exit(&ism_mlist_lock); 8777 free_ment = ism_map[i].imap_ment; 8778 8779 /* 8780 * We delete the ism map by copying 8781 * the next map over the current one. 8782 * We will take the next one in the maps 8783 * array or from the next ism_blk. 8784 */ 8785 while (ism_blkp != NULL) { 8786 ism_map = ism_blkp->iblk_maps; 8787 while (i < (ISM_MAP_SLOTS - 1)) { 8788 ism_map[i] = ism_map[i + 1]; 8789 i++; 8790 } 8791 /* i == (ISM_MAP_SLOTS - 1) */ 8792 ism_blkp = ism_blkp->iblk_next; 8793 if (ism_blkp != NULL) { 8794 ism_map[i] = ism_blkp->iblk_maps[0]; 8795 i = 0; 8796 } else { 8797 ism_map[i].imap_seg = 0; 8798 ism_map[i].imap_vb_shift = 0; 8799 ism_map[i].imap_rid = SFMMU_INVALID_ISMRID; 8800 ism_map[i].imap_hatflags = 0; 8801 ism_map[i].imap_sz_mask = 0; 8802 ism_map[i].imap_ismhat = NULL; 8803 ism_map[i].imap_ment = NULL; 8804 } 8805 } 8806 8807 /* 8808 * Now flush entire TSB for the process, since 8809 * demapping page by page can be too expensive. 8810 * We don't have to flush the TLB here anymore 8811 * since we switch to a new TLB ctx instead. 8812 * Also, there is no need to flush if the process 8813 * is exiting since the TSB will be freed later. 8814 */ 8815 if (!sfmmup->sfmmu_free) { 8816 hatlockp = sfmmu_hat_enter(sfmmup); 8817 for (tsbinfo = sfmmup->sfmmu_tsb; tsbinfo != NULL; 8818 tsbinfo = tsbinfo->tsb_next) { 8819 if (tsbinfo->tsb_flags & TSB_SWAPPED) 8820 continue; 8821 if (tsbinfo->tsb_flags & TSB_RELOC_FLAG) { 8822 tsbinfo->tsb_flags |= 8823 TSB_FLUSH_NEEDED; 8824 continue; 8825 } 8826 8827 sfmmu_inv_tsb(tsbinfo->tsb_va, 8828 TSB_BYTES(tsbinfo->tsb_szc)); 8829 } 8830 sfmmu_hat_exit(hatlockp); 8831 } 8832 } 8833 8834 /* 8835 * Update our counters for this sfmmup's ism mappings. 8836 */ 8837 for (i = 0; i <= ismszc; i++) { 8838 if (!(disable_ism_large_pages & (1 << i))) 8839 (void) ism_tsb_entries(sfmmup, i); 8840 } 8841 8842 sfmmu_ismhat_exit(sfmmup, 0); 8843 8844 /* 8845 * We must do our freeing here after dropping locks 8846 * to prevent a deadlock in the kmem allocator on the 8847 * mapping list lock. 
8848 */ 8849 if (free_ment != NULL) 8850 kmem_cache_free(ism_ment_cache, free_ment); 8851 8852 /* 8853 * Check TSB and TLB page sizes if the process isn't exiting. 8854 */ 8855 if (!sfmmup->sfmmu_free) { 8856 if (found && old_scdp != NULL && sfmmup->sfmmu_scdp == NULL) { 8857 sfmmu_check_page_sizes(sfmmup, 1); 8858 } else { 8859 sfmmu_check_page_sizes(sfmmup, 0); 8860 } 8861 } 8862 } 8863 8864 /* ARGSUSED */ 8865 static int 8866 sfmmu_idcache_constructor(void *buf, void *cdrarg, int kmflags) 8867 { 8868 /* void *buf is sfmmu_t pointer */ 8869 bzero(buf, sizeof (sfmmu_t)); 8870 8871 return (0); 8872 } 8873 8874 /* ARGSUSED */ 8875 static void 8876 sfmmu_idcache_destructor(void *buf, void *cdrarg) 8877 { 8878 /* void *buf is sfmmu_t pointer */ 8879 } 8880 8881 /* 8882 * setup kmem hmeblks by bzeroing all members and initializing the nextpa 8883 * field to be the pa of this hmeblk 8884 */ 8885 /* ARGSUSED */ 8886 static int 8887 sfmmu_hblkcache_constructor(void *buf, void *cdrarg, int kmflags) 8888 { 8889 struct hme_blk *hmeblkp; 8890 8891 bzero(buf, (size_t)cdrarg); 8892 hmeblkp = (struct hme_blk *)buf; 8893 hmeblkp->hblk_nextpa = va_to_pa((caddr_t)hmeblkp); 8894 8895 #ifdef HBLK_TRACE 8896 mutex_init(&hmeblkp->hblk_audit_lock, NULL, MUTEX_DEFAULT, NULL); 8897 #endif /* HBLK_TRACE */ 8898 8899 return (0); 8900 } 8901 8902 /* ARGSUSED */ 8903 static void 8904 sfmmu_hblkcache_destructor(void *buf, void *cdrarg) 8905 { 8906 8907 #ifdef HBLK_TRACE 8908 8909 struct hme_blk *hmeblkp; 8910 8911 hmeblkp = (struct hme_blk *)buf; 8912 mutex_destroy(&hmeblkp->hblk_audit_lock); 8913 8914 #endif /* HBLK_TRACE */ 8915 } 8916 8917 #define SFMMU_CACHE_RECLAIM_SCAN_RATIO 8 8918 static int sfmmu_cache_reclaim_scan_ratio = SFMMU_CACHE_RECLAIM_SCAN_RATIO; 8919 /* 8920 * The kmem allocator will callback into our reclaim routine when the system 8921 * is running low in memory. We traverse the hash and free up all unused but 8922 * still cached hme_blks. We also traverse the free list and free them up 8923 * as well. 
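 *
 * Each callback first drains the per-cpu pending lists and then scans
 * only 1/sfmmu_cache_reclaim_scan_ratio (1/8 by default) of each hash
 * table, resuming from a persistent "reclaim hand" so that successive
 * callbacks eventually cover the whole hash:
 *
 *	hmebp = uhmehash_reclaim_hand;		resume point
 *	uhmehash_reclaim_hand += UHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio;
 *	scan that many buckets (trylock only, skipping contended ones),
 *	    unlinking hmeblks with hblk_vcnt == 0 and hblk_hmecnt == 0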
8924 */ 8925 /*ARGSUSED*/ 8926 static void 8927 sfmmu_hblkcache_reclaim(void *cdrarg) 8928 { 8929 int i; 8930 struct hmehash_bucket *hmebp; 8931 struct hme_blk *hmeblkp, *nx_hblk, *pr_hblk = NULL; 8932 static struct hmehash_bucket *uhmehash_reclaim_hand; 8933 static struct hmehash_bucket *khmehash_reclaim_hand; 8934 struct hme_blk *list = NULL, *last_hmeblkp; 8935 cpuset_t cpuset = cpu_ready_set; 8936 cpu_hme_pend_t *cpuhp; 8937 8938 /* Free up hmeblks on the cpu pending lists */ 8939 for (i = 0; i < NCPU; i++) { 8940 cpuhp = &cpu_hme_pend[i]; 8941 if (cpuhp->chp_listp != NULL) { 8942 mutex_enter(&cpuhp->chp_mutex); 8943 if (cpuhp->chp_listp == NULL) { 8944 mutex_exit(&cpuhp->chp_mutex); 8945 continue; 8946 } 8947 for (last_hmeblkp = cpuhp->chp_listp; 8948 last_hmeblkp->hblk_next != NULL; 8949 last_hmeblkp = last_hmeblkp->hblk_next) 8950 ; 8951 last_hmeblkp->hblk_next = list; 8952 list = cpuhp->chp_listp; 8953 cpuhp->chp_listp = NULL; 8954 cpuhp->chp_count = 0; 8955 mutex_exit(&cpuhp->chp_mutex); 8956 } 8957 8958 } 8959 8960 if (list != NULL) { 8961 kpreempt_disable(); 8962 CPUSET_DEL(cpuset, CPU->cpu_id); 8963 xt_sync(cpuset); 8964 xt_sync(cpuset); 8965 kpreempt_enable(); 8966 sfmmu_hblk_free(&list); 8967 list = NULL; 8968 } 8969 8970 hmebp = uhmehash_reclaim_hand; 8971 if (hmebp == NULL || hmebp > &uhme_hash[UHMEHASH_SZ]) 8972 uhmehash_reclaim_hand = hmebp = uhme_hash; 8973 uhmehash_reclaim_hand += UHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio; 8974 8975 for (i = UHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio; i; i--) { 8976 if (SFMMU_HASH_LOCK_TRYENTER(hmebp) != 0) { 8977 hmeblkp = hmebp->hmeblkp; 8978 pr_hblk = NULL; 8979 while (hmeblkp) { 8980 nx_hblk = hmeblkp->hblk_next; 8981 if (!hmeblkp->hblk_vcnt && 8982 !hmeblkp->hblk_hmecnt) { 8983 sfmmu_hblk_hash_rm(hmebp, hmeblkp, 8984 pr_hblk, &list, 0); 8985 } else { 8986 pr_hblk = hmeblkp; 8987 } 8988 hmeblkp = nx_hblk; 8989 } 8990 SFMMU_HASH_UNLOCK(hmebp); 8991 } 8992 if (hmebp++ == &uhme_hash[UHMEHASH_SZ]) 8993 hmebp = uhme_hash; 8994 } 8995 8996 hmebp = khmehash_reclaim_hand; 8997 if (hmebp == NULL || hmebp > &khme_hash[KHMEHASH_SZ]) 8998 khmehash_reclaim_hand = hmebp = khme_hash; 8999 khmehash_reclaim_hand += KHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio; 9000 9001 for (i = KHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio; i; i--) { 9002 if (SFMMU_HASH_LOCK_TRYENTER(hmebp) != 0) { 9003 hmeblkp = hmebp->hmeblkp; 9004 pr_hblk = NULL; 9005 while (hmeblkp) { 9006 nx_hblk = hmeblkp->hblk_next; 9007 if (!hmeblkp->hblk_vcnt && 9008 !hmeblkp->hblk_hmecnt) { 9009 sfmmu_hblk_hash_rm(hmebp, hmeblkp, 9010 pr_hblk, &list, 0); 9011 } else { 9012 pr_hblk = hmeblkp; 9013 } 9014 hmeblkp = nx_hblk; 9015 } 9016 SFMMU_HASH_UNLOCK(hmebp); 9017 } 9018 if (hmebp++ == &khme_hash[KHMEHASH_SZ]) 9019 hmebp = khme_hash; 9020 } 9021 sfmmu_hblks_list_purge(&list, 0); 9022 } 9023 9024 /* 9025 * sfmmu_get_ppvcolor should become a vm_machdep or hatop interface. 9026 * same goes for sfmmu_get_addrvcolor(). 9027 * 9028 * This function will return the virtual color for the specified page. The 9029 * virtual color corresponds to this page current mapping or its last mapping. 9030 * It is used by memory allocators to choose addresses with the correct 9031 * alignment so vac consistency is automatically maintained. If the page 9032 * has no color it returns -1. 
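 *
 * A minimal sketch of how an allocator might use it; choose_va() and
 * choose_va_with_color() are hypothetical helpers, not functions in
 * this file:
 *
 *	int color = sfmmu_get_ppvcolor(pp);
 *	va = (color == -1) ? choose_va() : choose_va_with_color(color);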
9033 */ 9034 /*ARGSUSED*/ 9035 int 9036 sfmmu_get_ppvcolor(struct page *pp) 9037 { 9038 #ifdef VAC 9039 int color; 9040 9041 if (!(cache & CACHE_VAC) || PP_NEWPAGE(pp)) { 9042 return (-1); 9043 } 9044 color = PP_GET_VCOLOR(pp); 9045 ASSERT(color < mmu_btop(shm_alignment)); 9046 return (color); 9047 #else 9048 return (-1); 9049 #endif /* VAC */ 9050 } 9051 9052 /* 9053 * This function will return the desired alignment for vac consistency 9054 * (vac color) given a virtual address. If no vac is present it returns -1. 9055 */ 9056 /*ARGSUSED*/ 9057 int 9058 sfmmu_get_addrvcolor(caddr_t vaddr) 9059 { 9060 #ifdef VAC 9061 if (cache & CACHE_VAC) { 9062 return (addr_to_vcolor(vaddr)); 9063 } else { 9064 return (-1); 9065 } 9066 #else 9067 return (-1); 9068 #endif /* VAC */ 9069 } 9070 9071 #ifdef VAC 9072 /* 9073 * Check for conflicts. 9074 * A conflict exists if the new and existent mappings do not match in 9075 * their "shm_alignment fields. If conflicts exist, the existant mappings 9076 * are flushed unless one of them is locked. If one of them is locked, then 9077 * the mappings are flushed and converted to non-cacheable mappings. 9078 */ 9079 static void 9080 sfmmu_vac_conflict(struct hat *hat, caddr_t addr, page_t *pp) 9081 { 9082 struct hat *tmphat; 9083 struct sf_hment *sfhmep, *tmphme = NULL; 9084 struct hme_blk *hmeblkp; 9085 int vcolor; 9086 tte_t tte; 9087 9088 ASSERT(sfmmu_mlist_held(pp)); 9089 ASSERT(!PP_ISNC(pp)); /* page better be cacheable */ 9090 9091 vcolor = addr_to_vcolor(addr); 9092 if (PP_NEWPAGE(pp)) { 9093 PP_SET_VCOLOR(pp, vcolor); 9094 return; 9095 } 9096 9097 if (PP_GET_VCOLOR(pp) == vcolor) { 9098 return; 9099 } 9100 9101 if (!PP_ISMAPPED(pp) && !PP_ISMAPPED_KPM(pp)) { 9102 /* 9103 * Previous user of page had a different color 9104 * but since there are no current users 9105 * we just flush the cache and change the color. 9106 */ 9107 SFMMU_STAT(sf_pgcolor_conflict); 9108 sfmmu_cache_flush(pp->p_pagenum, PP_GET_VCOLOR(pp)); 9109 PP_SET_VCOLOR(pp, vcolor); 9110 return; 9111 } 9112 9113 /* 9114 * If we get here we have a vac conflict with a current 9115 * mapping. VAC conflict policy is as follows. 9116 * - The default is to unload the other mappings unless: 9117 * - If we have a large mapping we uncache the page. 9118 * We need to uncache the rest of the large page too. 9119 * - If any of the mappings are locked we uncache the page. 9120 * - If the requested mapping is inconsistent 9121 * with another mapping and that mapping 9122 * is in the same address space we have to 9123 * make it non-cached. The default thing 9124 * to do is unload the inconsistent mapping 9125 * but if they are in the same address space 9126 * we run the risk of unmapping the pc or the 9127 * stack which we will use as we return to the user, 9128 * in which case we can then fault on the thing 9129 * we just unloaded and get into an infinite loop. 9130 */ 9131 if (PP_ISMAPPED_LARGE(pp)) { 9132 int sz; 9133 9134 /* 9135 * Existing mapping is for big pages. We don't unload 9136 * existing big mappings to satisfy new mappings. 9137 * Always convert all mappings to TNC. 9138 */ 9139 sz = fnd_mapping_sz(pp); 9140 pp = PP_GROUPLEADER(pp, sz); 9141 SFMMU_STAT_ADD(sf_uncache_conflict, TTEPAGES(sz)); 9142 sfmmu_page_cache_array(pp, HAT_TMPNC, CACHE_FLUSH, 9143 TTEPAGES(sz)); 9144 9145 return; 9146 } 9147 9148 /* 9149 * check if any mapping is in same as or if it is locked 9150 * since in that case we need to uncache. 
9151 */ 9152 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = tmphme) { 9153 tmphme = sfhmep->hme_next; 9154 if (IS_PAHME(sfhmep)) 9155 continue; 9156 hmeblkp = sfmmu_hmetohblk(sfhmep); 9157 tmphat = hblktosfmmu(hmeblkp); 9158 sfmmu_copytte(&sfhmep->hme_tte, &tte); 9159 ASSERT(TTE_IS_VALID(&tte)); 9160 if (hmeblkp->hblk_shared || tmphat == hat || 9161 hmeblkp->hblk_lckcnt) { 9162 /* 9163 * We have an uncache conflict 9164 */ 9165 SFMMU_STAT(sf_uncache_conflict); 9166 sfmmu_page_cache_array(pp, HAT_TMPNC, CACHE_FLUSH, 1); 9167 return; 9168 } 9169 } 9170 9171 /* 9172 * We have an unload conflict 9173 * We have already checked for LARGE mappings, therefore 9174 * the remaining mapping(s) must be TTE8K. 9175 */ 9176 SFMMU_STAT(sf_unload_conflict); 9177 9178 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = tmphme) { 9179 tmphme = sfhmep->hme_next; 9180 if (IS_PAHME(sfhmep)) 9181 continue; 9182 hmeblkp = sfmmu_hmetohblk(sfhmep); 9183 ASSERT(!hmeblkp->hblk_shared); 9184 (void) sfmmu_pageunload(pp, sfhmep, TTE8K); 9185 } 9186 9187 if (PP_ISMAPPED_KPM(pp)) 9188 sfmmu_kpm_vac_unload(pp, addr); 9189 9190 /* 9191 * Unloads only do TLB flushes so we need to flush the 9192 * cache here. 9193 */ 9194 sfmmu_cache_flush(pp->p_pagenum, PP_GET_VCOLOR(pp)); 9195 PP_SET_VCOLOR(pp, vcolor); 9196 } 9197 9198 /* 9199 * Whenever a mapping is unloaded and the page is in TNC state, 9200 * we see if the page can be made cacheable again. 'pp' is 9201 * the page that we just unloaded a mapping from, the size 9202 * of mapping that was unloaded is 'ottesz'. 9203 * Remark: 9204 * The recache policy for mpss pages can leave a performance problem 9205 * under the following circumstances: 9206 * . A large page in uncached mode has just been unmapped. 9207 * . All constituent pages are TNC due to a conflicting small mapping. 9208 * . There are many other, non conflicting, small mappings around for 9209 * a lot of the constituent pages. 9210 * . We're called w/ the "old" groupleader page and the old ottesz, 9211 * but this is irrelevant, since we're no more "PP_ISMAPPED_LARGE", so 9212 * we end up w/ TTE8K or npages == 1. 9213 * . We call tst_tnc w/ the old groupleader only, and if there is no 9214 * conflict, we re-cache only this page. 9215 * . All other small mappings are not checked and will be left in TNC mode. 9216 * The problem is not very serious because: 9217 * . mpss is actually only defined for heap and stack, so the probability 9218 * is not very high that a large page mapping exists in parallel to a small 9219 * one (this is possible, but seems to be bad programming style in the 9220 * appl). 9221 * . The problem gets a little bit more serious, when those TNC pages 9222 * have to be mapped into kernel space, e.g. for networking. 9223 * . When VAC alias conflicts occur in applications, this is regarded 9224 * as an application bug. So if kstat's show them, the appl should 9225 * be changed anyway. 9226 */ 9227 void 9228 conv_tnc(page_t *pp, int ottesz) 9229 { 9230 int cursz, dosz; 9231 pgcnt_t curnpgs, dopgs; 9232 pgcnt_t pg64k; 9233 page_t *pp2; 9234 9235 /* 9236 * Determine how big a range we check for TNC and find 9237 * leader page. cursz is the size of the biggest 9238 * mapping that still exist on 'pp'. 
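 *
 * In other words, the range re-checked below spans the larger of the
 * mapping just unloaded (ottesz) and the largest mapping still present
 * (cursz), rooted at that range's group leader, conceptually:
 *
 *	dosz = MAX(ottesz, cursz);
 *	pp2  = (ottesz >= cursz) ? pp : PP_GROUPLEADER(pp, dosz);
 *
 * and it is then walked in chunks whose size is re-derived from the
 * mappings found on each chunk's leader page.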
9239 */ 9240 if (PP_ISMAPPED_LARGE(pp)) { 9241 cursz = fnd_mapping_sz(pp); 9242 } else { 9243 cursz = TTE8K; 9244 } 9245 9246 if (ottesz >= cursz) { 9247 dosz = ottesz; 9248 pp2 = pp; 9249 } else { 9250 dosz = cursz; 9251 pp2 = PP_GROUPLEADER(pp, dosz); 9252 } 9253 9254 pg64k = TTEPAGES(TTE64K); 9255 dopgs = TTEPAGES(dosz); 9256 9257 ASSERT(dopgs == 1 || ((dopgs & (pg64k - 1)) == 0)); 9258 9259 while (dopgs != 0) { 9260 curnpgs = TTEPAGES(cursz); 9261 if (tst_tnc(pp2, curnpgs)) { 9262 SFMMU_STAT_ADD(sf_recache, curnpgs); 9263 sfmmu_page_cache_array(pp2, HAT_CACHE, CACHE_NO_FLUSH, 9264 curnpgs); 9265 } 9266 9267 ASSERT(dopgs >= curnpgs); 9268 dopgs -= curnpgs; 9269 9270 if (dopgs == 0) { 9271 break; 9272 } 9273 9274 pp2 = PP_PAGENEXT_N(pp2, curnpgs); 9275 if (((dopgs & (pg64k - 1)) == 0) && PP_ISMAPPED_LARGE(pp2)) { 9276 cursz = fnd_mapping_sz(pp2); 9277 } else { 9278 cursz = TTE8K; 9279 } 9280 } 9281 } 9282 9283 /* 9284 * Returns 1 if page(s) can be converted from TNC to cacheable setting, 9285 * returns 0 otherwise. Note that oaddr argument is valid for only 9286 * 8k pages. 9287 */ 9288 int 9289 tst_tnc(page_t *pp, pgcnt_t npages) 9290 { 9291 struct sf_hment *sfhme; 9292 struct hme_blk *hmeblkp; 9293 tte_t tte; 9294 caddr_t vaddr; 9295 int clr_valid = 0; 9296 int color, color1, bcolor; 9297 int i, ncolors; 9298 9299 ASSERT(pp != NULL); 9300 ASSERT(!(cache & CACHE_WRITEBACK)); 9301 9302 if (npages > 1) { 9303 ncolors = CACHE_NUM_COLOR; 9304 } 9305 9306 for (i = 0; i < npages; i++) { 9307 ASSERT(sfmmu_mlist_held(pp)); 9308 ASSERT(PP_ISTNC(pp)); 9309 ASSERT(PP_GET_VCOLOR(pp) == NO_VCOLOR); 9310 9311 if (PP_ISPNC(pp)) { 9312 return (0); 9313 } 9314 9315 clr_valid = 0; 9316 if (PP_ISMAPPED_KPM(pp)) { 9317 caddr_t kpmvaddr; 9318 9319 ASSERT(kpm_enable); 9320 kpmvaddr = hat_kpm_page2va(pp, 1); 9321 ASSERT(!(npages > 1 && IS_KPM_ALIAS_RANGE(kpmvaddr))); 9322 color1 = addr_to_vcolor(kpmvaddr); 9323 clr_valid = 1; 9324 } 9325 9326 for (sfhme = pp->p_mapping; sfhme; sfhme = sfhme->hme_next) { 9327 if (IS_PAHME(sfhme)) 9328 continue; 9329 hmeblkp = sfmmu_hmetohblk(sfhme); 9330 9331 sfmmu_copytte(&sfhme->hme_tte, &tte); 9332 ASSERT(TTE_IS_VALID(&tte)); 9333 9334 vaddr = tte_to_vaddr(hmeblkp, tte); 9335 color = addr_to_vcolor(vaddr); 9336 9337 if (npages > 1) { 9338 /* 9339 * If there is a big mapping, make sure 9340 * 8K mapping is consistent with the big 9341 * mapping. 
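 *
 * For a large page, the constituent page at index i can only be made
 * cacheable if all of its 8K mappings use virtual color
 * (i % CACHE_NUM_COLOR), the color that page would have inside any
 * properly aligned large mapping; any other color is a VAC alias and
 * keeps the page TNC.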
9342 */ 9343 bcolor = i % ncolors; 9344 if (color != bcolor) { 9345 return (0); 9346 } 9347 } 9348 if (!clr_valid) { 9349 clr_valid = 1; 9350 color1 = color; 9351 } 9352 9353 if (color1 != color) { 9354 return (0); 9355 } 9356 } 9357 9358 pp = PP_PAGENEXT(pp); 9359 } 9360 9361 return (1); 9362 } 9363 9364 void 9365 sfmmu_page_cache_array(page_t *pp, int flags, int cache_flush_flag, 9366 pgcnt_t npages) 9367 { 9368 kmutex_t *pmtx; 9369 int i, ncolors, bcolor; 9370 kpm_hlk_t *kpmp; 9371 cpuset_t cpuset; 9372 9373 ASSERT(pp != NULL); 9374 ASSERT(!(cache & CACHE_WRITEBACK)); 9375 9376 kpmp = sfmmu_kpm_kpmp_enter(pp, npages); 9377 pmtx = sfmmu_page_enter(pp); 9378 9379 /* 9380 * Fast path caching single unmapped page 9381 */ 9382 if (npages == 1 && !PP_ISMAPPED(pp) && !PP_ISMAPPED_KPM(pp) && 9383 flags == HAT_CACHE) { 9384 PP_CLRTNC(pp); 9385 PP_CLRPNC(pp); 9386 sfmmu_page_exit(pmtx); 9387 sfmmu_kpm_kpmp_exit(kpmp); 9388 return; 9389 } 9390 9391 /* 9392 * We need to capture all cpus in order to change cacheability 9393 * because we can't allow one cpu to access the same physical 9394 * page using a cacheable and a non-cachebale mapping at the same 9395 * time. Since we may end up walking the ism mapping list 9396 * have to grab it's lock now since we can't after all the 9397 * cpus have been captured. 9398 */ 9399 sfmmu_hat_lock_all(); 9400 mutex_enter(&ism_mlist_lock); 9401 kpreempt_disable(); 9402 cpuset = cpu_ready_set; 9403 xc_attention(cpuset); 9404 9405 if (npages > 1) { 9406 /* 9407 * Make sure all colors are flushed since the 9408 * sfmmu_page_cache() only flushes one color- 9409 * it does not know big pages. 9410 */ 9411 ncolors = CACHE_NUM_COLOR; 9412 if (flags & HAT_TMPNC) { 9413 for (i = 0; i < ncolors; i++) { 9414 sfmmu_cache_flushcolor(i, pp->p_pagenum); 9415 } 9416 cache_flush_flag = CACHE_NO_FLUSH; 9417 } 9418 } 9419 9420 for (i = 0; i < npages; i++) { 9421 9422 ASSERT(sfmmu_mlist_held(pp)); 9423 9424 if (!(flags == HAT_TMPNC && PP_ISTNC(pp))) { 9425 9426 if (npages > 1) { 9427 bcolor = i % ncolors; 9428 } else { 9429 bcolor = NO_VCOLOR; 9430 } 9431 9432 sfmmu_page_cache(pp, flags, cache_flush_flag, 9433 bcolor); 9434 } 9435 9436 pp = PP_PAGENEXT(pp); 9437 } 9438 9439 xt_sync(cpuset); 9440 xc_dismissed(cpuset); 9441 mutex_exit(&ism_mlist_lock); 9442 sfmmu_hat_unlock_all(); 9443 sfmmu_page_exit(pmtx); 9444 sfmmu_kpm_kpmp_exit(kpmp); 9445 kpreempt_enable(); 9446 } 9447 9448 /* 9449 * This function changes the virtual cacheability of all mappings to a 9450 * particular page. When changing from uncache to cacheable the mappings will 9451 * only be changed if all of them have the same virtual color. 9452 * We need to flush the cache in all cpus. It is possible that 9453 * a process referenced a page as cacheable but has sinced exited 9454 * and cleared the mapping list. We still to flush it but have no 9455 * state so all cpus is the only alternative. 
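 *
 * The caller (sfmmu_page_cache_array()) has already captured all CPUs
 * with xc_attention(), which is what makes the tte modification below
 * safe: no other CPU can race with the update, so sfmmu_modifytte_try()
 * is expected to succeed and a failure is treated as a panic.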
9456 */ 9457 static void 9458 sfmmu_page_cache(page_t *pp, int flags, int cache_flush_flag, int bcolor) 9459 { 9460 struct sf_hment *sfhme; 9461 struct hme_blk *hmeblkp; 9462 sfmmu_t *sfmmup; 9463 tte_t tte, ttemod; 9464 caddr_t vaddr; 9465 int ret, color; 9466 pfn_t pfn; 9467 9468 color = bcolor; 9469 pfn = pp->p_pagenum; 9470 9471 for (sfhme = pp->p_mapping; sfhme; sfhme = sfhme->hme_next) { 9472 9473 if (IS_PAHME(sfhme)) 9474 continue; 9475 hmeblkp = sfmmu_hmetohblk(sfhme); 9476 9477 sfmmu_copytte(&sfhme->hme_tte, &tte); 9478 ASSERT(TTE_IS_VALID(&tte)); 9479 vaddr = tte_to_vaddr(hmeblkp, tte); 9480 color = addr_to_vcolor(vaddr); 9481 9482 #ifdef DEBUG 9483 if ((flags & HAT_CACHE) && bcolor != NO_VCOLOR) { 9484 ASSERT(color == bcolor); 9485 } 9486 #endif 9487 9488 ASSERT(flags != HAT_TMPNC || color == PP_GET_VCOLOR(pp)); 9489 9490 ttemod = tte; 9491 if (flags & (HAT_UNCACHE | HAT_TMPNC)) { 9492 TTE_CLR_VCACHEABLE(&ttemod); 9493 } else { /* flags & HAT_CACHE */ 9494 TTE_SET_VCACHEABLE(&ttemod); 9495 } 9496 ret = sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte); 9497 if (ret < 0) { 9498 /* 9499 * Since all cpus are captured modifytte should not 9500 * fail. 9501 */ 9502 panic("sfmmu_page_cache: write to tte failed"); 9503 } 9504 9505 sfmmup = hblktosfmmu(hmeblkp); 9506 if (cache_flush_flag == CACHE_FLUSH) { 9507 /* 9508 * Flush TSBs, TLBs and caches 9509 */ 9510 if (hmeblkp->hblk_shared) { 9511 sf_srd_t *srdp = (sf_srd_t *)sfmmup; 9512 uint_t rid = hmeblkp->hblk_tag.htag_rid; 9513 sf_region_t *rgnp; 9514 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 9515 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 9516 ASSERT(srdp != NULL); 9517 rgnp = srdp->srd_hmergnp[rid]; 9518 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, 9519 srdp, rgnp, rid); 9520 (void) sfmmu_rgntlb_demap(vaddr, rgnp, 9521 hmeblkp, 0); 9522 sfmmu_cache_flush(pfn, addr_to_vcolor(vaddr)); 9523 } else if (sfmmup->sfmmu_ismhat) { 9524 if (flags & HAT_CACHE) { 9525 SFMMU_STAT(sf_ism_recache); 9526 } else { 9527 SFMMU_STAT(sf_ism_uncache); 9528 } 9529 sfmmu_ismtlbcache_demap(vaddr, sfmmup, hmeblkp, 9530 pfn, CACHE_FLUSH); 9531 } else { 9532 sfmmu_tlbcache_demap(vaddr, sfmmup, hmeblkp, 9533 pfn, 0, FLUSH_ALL_CPUS, CACHE_FLUSH, 1); 9534 } 9535 9536 /* 9537 * all cache entries belonging to this pfn are 9538 * now flushed. 9539 */ 9540 cache_flush_flag = CACHE_NO_FLUSH; 9541 } else { 9542 /* 9543 * Flush only TSBs and TLBs. 
9544 */ 9545 if (hmeblkp->hblk_shared) { 9546 sf_srd_t *srdp = (sf_srd_t *)sfmmup; 9547 uint_t rid = hmeblkp->hblk_tag.htag_rid; 9548 sf_region_t *rgnp; 9549 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 9550 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 9551 ASSERT(srdp != NULL); 9552 rgnp = srdp->srd_hmergnp[rid]; 9553 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, 9554 srdp, rgnp, rid); 9555 (void) sfmmu_rgntlb_demap(vaddr, rgnp, 9556 hmeblkp, 0); 9557 } else if (sfmmup->sfmmu_ismhat) { 9558 if (flags & HAT_CACHE) { 9559 SFMMU_STAT(sf_ism_recache); 9560 } else { 9561 SFMMU_STAT(sf_ism_uncache); 9562 } 9563 sfmmu_ismtlbcache_demap(vaddr, sfmmup, hmeblkp, 9564 pfn, CACHE_NO_FLUSH); 9565 } else { 9566 sfmmu_tlb_demap(vaddr, sfmmup, hmeblkp, 0, 1); 9567 } 9568 } 9569 } 9570 9571 if (PP_ISMAPPED_KPM(pp)) 9572 sfmmu_kpm_page_cache(pp, flags, cache_flush_flag); 9573 9574 switch (flags) { 9575 9576 default: 9577 panic("sfmmu_pagecache: unknown flags"); 9578 break; 9579 9580 case HAT_CACHE: 9581 PP_CLRTNC(pp); 9582 PP_CLRPNC(pp); 9583 PP_SET_VCOLOR(pp, color); 9584 break; 9585 9586 case HAT_TMPNC: 9587 PP_SETTNC(pp); 9588 PP_SET_VCOLOR(pp, NO_VCOLOR); 9589 break; 9590 9591 case HAT_UNCACHE: 9592 PP_SETPNC(pp); 9593 PP_CLRTNC(pp); 9594 PP_SET_VCOLOR(pp, NO_VCOLOR); 9595 break; 9596 } 9597 } 9598 #endif /* VAC */ 9599 9600 9601 /* 9602 * Wrapper routine used to return a context. 9603 * 9604 * It's the responsibility of the caller to guarantee that the 9605 * process serializes on calls here by taking the HAT lock for 9606 * the hat. 9607 * 9608 */ 9609 static void 9610 sfmmu_get_ctx(sfmmu_t *sfmmup) 9611 { 9612 mmu_ctx_t *mmu_ctxp; 9613 uint_t pstate_save; 9614 int ret; 9615 9616 ASSERT(sfmmu_hat_lock_held(sfmmup)); 9617 ASSERT(sfmmup != ksfmmup); 9618 9619 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_ALLCTX_INVALID)) { 9620 sfmmu_setup_tsbinfo(sfmmup); 9621 SFMMU_FLAGS_CLEAR(sfmmup, HAT_ALLCTX_INVALID); 9622 } 9623 9624 kpreempt_disable(); 9625 9626 mmu_ctxp = CPU_MMU_CTXP(CPU); 9627 ASSERT(mmu_ctxp); 9628 ASSERT(mmu_ctxp->mmu_idx < max_mmu_ctxdoms); 9629 ASSERT(mmu_ctxp == mmu_ctxs_tbl[mmu_ctxp->mmu_idx]); 9630 9631 /* 9632 * Do a wrap-around if cnum reaches the max # cnum supported by a MMU. 9633 */ 9634 if (mmu_ctxp->mmu_cnum == mmu_ctxp->mmu_nctxs) 9635 sfmmu_ctx_wrap_around(mmu_ctxp, B_TRUE); 9636 9637 /* 9638 * Let the MMU set up the page sizes to use for 9639 * this context in the TLB. Don't program 2nd dtlb for ism hat. 9640 */ 9641 if ((&mmu_set_ctx_page_sizes) && (sfmmup->sfmmu_ismhat == 0)) { 9642 mmu_set_ctx_page_sizes(sfmmup); 9643 } 9644 9645 /* 9646 * sfmmu_alloc_ctx and sfmmu_load_mmustate will be performed with 9647 * interrupts disabled to prevent race condition with wrap-around 9648 * ctx invalidatation. In sun4v, ctx invalidation also involves 9649 * a HV call to set the number of TSBs to 0. If interrupts are not 9650 * disabled until after sfmmu_load_mmustate is complete TSBs may 9651 * become assigned to INVALID_CONTEXT. This is not allowed. 
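 *
 * The resulting sequence is, in sketch form:
 *
 *	pstate_save = sfmmu_disable_intrs();
 *	sfmmu_alloc_ctx(sfmmup, ...);		private context
 *	sfmmu_alloc_ctx(scsfmmup, ...);		shared SCD context, if any
 *	sfmmu_load_mmustate(sfmmup);
 *	sfmmu_enable_intrs(pstate_save);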
9652 */ 9653 pstate_save = sfmmu_disable_intrs(); 9654 9655 if (sfmmu_alloc_ctx(sfmmup, 1, CPU, SFMMU_PRIVATE) && 9656 sfmmup->sfmmu_scdp != NULL) { 9657 sf_scd_t *scdp = sfmmup->sfmmu_scdp; 9658 sfmmu_t *scsfmmup = scdp->scd_sfmmup; 9659 ret = sfmmu_alloc_ctx(scsfmmup, 1, CPU, SFMMU_SHARED); 9660 /* debug purpose only */ 9661 ASSERT(!ret || scsfmmup->sfmmu_ctxs[CPU_MMU_IDX(CPU)].cnum 9662 != INVALID_CONTEXT); 9663 } 9664 sfmmu_load_mmustate(sfmmup); 9665 9666 sfmmu_enable_intrs(pstate_save); 9667 9668 kpreempt_enable(); 9669 } 9670 9671 /* 9672 * When all cnums are used up in a MMU, cnum will wrap around to the 9673 * next generation and start from 2. 9674 */ 9675 static void 9676 sfmmu_ctx_wrap_around(mmu_ctx_t *mmu_ctxp, boolean_t reset_cnum) 9677 { 9678 9679 /* caller must have disabled the preemption */ 9680 ASSERT(curthread->t_preempt >= 1); 9681 ASSERT(mmu_ctxp != NULL); 9682 9683 /* acquire Per-MMU (PM) spin lock */ 9684 mutex_enter(&mmu_ctxp->mmu_lock); 9685 9686 /* re-check to see if wrap-around is needed */ 9687 if (mmu_ctxp->mmu_cnum < mmu_ctxp->mmu_nctxs) 9688 goto done; 9689 9690 SFMMU_MMU_STAT(mmu_wrap_around); 9691 9692 /* update gnum */ 9693 ASSERT(mmu_ctxp->mmu_gnum != 0); 9694 mmu_ctxp->mmu_gnum++; 9695 if (mmu_ctxp->mmu_gnum == 0 || 9696 mmu_ctxp->mmu_gnum > MAX_SFMMU_GNUM_VAL) { 9697 cmn_err(CE_PANIC, "mmu_gnum of mmu_ctx 0x%p is out of bound.", 9698 (void *)mmu_ctxp); 9699 } 9700 9701 if (mmu_ctxp->mmu_ncpus > 1) { 9702 cpuset_t cpuset; 9703 9704 membar_enter(); /* make sure updated gnum visible */ 9705 9706 SFMMU_XCALL_STATS(NULL); 9707 9708 /* xcall to others on the same MMU to invalidate ctx */ 9709 cpuset = mmu_ctxp->mmu_cpuset; 9710 ASSERT(CPU_IN_SET(cpuset, CPU->cpu_id) || !reset_cnum); 9711 CPUSET_DEL(cpuset, CPU->cpu_id); 9712 CPUSET_AND(cpuset, cpu_ready_set); 9713 9714 /* 9715 * Pass in INVALID_CONTEXT as the first parameter to 9716 * sfmmu_raise_tsb_exception, which invalidates the context 9717 * of any process running on the CPUs in the MMU. 9718 */ 9719 xt_some(cpuset, sfmmu_raise_tsb_exception, 9720 INVALID_CONTEXT, INVALID_CONTEXT); 9721 xt_sync(cpuset); 9722 9723 SFMMU_MMU_STAT(mmu_tsb_raise_exception); 9724 } 9725 9726 if (sfmmu_getctx_sec() != INVALID_CONTEXT) { 9727 sfmmu_setctx_sec(INVALID_CONTEXT); 9728 sfmmu_clear_utsbinfo(); 9729 } 9730 9731 /* 9732 * No xcall is needed here. For sun4u systems all CPUs in context 9733 * domain share a single physical MMU therefore it's enough to flush 9734 * TLB on local CPU. On sun4v systems we use 1 global context 9735 * domain and flush all remote TLBs in sfmmu_raise_tsb_exception 9736 * handler. Note that vtag_flushall_uctxs() is called 9737 * for Ultra II machine, where the equivalent flushall functionality 9738 * is implemented in SW, and only user ctx TLB entries are flushed. 9739 */ 9740 if (&vtag_flushall_uctxs != NULL) { 9741 vtag_flushall_uctxs(); 9742 } else { 9743 vtag_flushall(); 9744 } 9745 9746 /* reset mmu cnum, skips cnum 0 and 1 */ 9747 if (reset_cnum == B_TRUE) 9748 mmu_ctxp->mmu_cnum = NUM_LOCKED_CTXS; 9749 9750 done: 9751 mutex_exit(&mmu_ctxp->mmu_lock); 9752 } 9753 9754 9755 /* 9756 * For multi-threaded process, set the process context to INVALID_CONTEXT 9757 * so that it faults and reloads the MMU state from TL=0. For single-threaded 9758 * process, we can just load the MMU state directly without having to 9759 * set context invalid. Caller must hold the hat lock since we don't 9760 * acquire it here. 
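 *
 * Roughly:
 *
 *	if (sfmmup is the current process' hat and p_lwpcnt == 1)
 *		reload the MMU state in place, interrupts disabled
 *	else
 *		sfmmu_invalidate_ctx(sfmmup)	force a reload at the
 *						next TLB miss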
9761 */ 9762 static void 9763 sfmmu_sync_mmustate(sfmmu_t *sfmmup) 9764 { 9765 uint_t cnum; 9766 uint_t pstate_save; 9767 9768 ASSERT(sfmmup != ksfmmup); 9769 ASSERT(sfmmu_hat_lock_held(sfmmup)); 9770 9771 kpreempt_disable(); 9772 9773 /* 9774 * We check whether the pass'ed-in sfmmup is the same as the 9775 * current running proc. This is to makes sure the current proc 9776 * stays single-threaded if it already is. 9777 */ 9778 if ((sfmmup == curthread->t_procp->p_as->a_hat) && 9779 (curthread->t_procp->p_lwpcnt == 1)) { 9780 /* single-thread */ 9781 cnum = sfmmup->sfmmu_ctxs[CPU_MMU_IDX(CPU)].cnum; 9782 if (cnum != INVALID_CONTEXT) { 9783 uint_t curcnum; 9784 /* 9785 * Disable interrupts to prevent race condition 9786 * with sfmmu_ctx_wrap_around ctx invalidation. 9787 * In sun4v, ctx invalidation involves setting 9788 * TSB to NULL, hence, interrupts should be disabled 9789 * untill after sfmmu_load_mmustate is completed. 9790 */ 9791 pstate_save = sfmmu_disable_intrs(); 9792 curcnum = sfmmu_getctx_sec(); 9793 if (curcnum == cnum) 9794 sfmmu_load_mmustate(sfmmup); 9795 sfmmu_enable_intrs(pstate_save); 9796 ASSERT(curcnum == cnum || curcnum == INVALID_CONTEXT); 9797 } 9798 } else { 9799 /* 9800 * multi-thread 9801 * or when sfmmup is not the same as the curproc. 9802 */ 9803 sfmmu_invalidate_ctx(sfmmup); 9804 } 9805 9806 kpreempt_enable(); 9807 } 9808 9809 9810 /* 9811 * Replace the specified TSB with a new TSB. This function gets called when 9812 * we grow, shrink or swapin a TSB. When swapping in a TSB (TSB_SWAPIN), the 9813 * TSB_FORCEALLOC flag may be used to force allocation of a minimum-sized TSB 9814 * (8K). 9815 * 9816 * Caller must hold the HAT lock, but should assume any tsb_info 9817 * pointers it has are no longer valid after calling this function. 9818 * 9819 * Return values: 9820 * TSB_ALLOCFAIL Failed to allocate a TSB, due to memory constraints 9821 * TSB_LOSTRACE HAT is busy, i.e. another thread is already doing 9822 * something to this tsbinfo/TSB 9823 * TSB_SUCCESS Operation succeeded 9824 */ 9825 static tsb_replace_rc_t 9826 sfmmu_replace_tsb(sfmmu_t *sfmmup, struct tsb_info *old_tsbinfo, uint_t szc, 9827 hatlock_t *hatlockp, uint_t flags) 9828 { 9829 struct tsb_info *new_tsbinfo = NULL; 9830 struct tsb_info *curtsb, *prevtsb; 9831 uint_t tte_sz_mask; 9832 int i; 9833 9834 ASSERT(sfmmup != ksfmmup); 9835 ASSERT(sfmmup->sfmmu_ismhat == 0); 9836 ASSERT(sfmmu_hat_lock_held(sfmmup)); 9837 ASSERT(szc <= tsb_max_growsize); 9838 9839 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_BUSY)) 9840 return (TSB_LOSTRACE); 9841 9842 /* 9843 * Find the tsb_info ahead of this one in the list, and 9844 * also make sure that the tsb_info passed in really 9845 * exists! 9846 */ 9847 for (prevtsb = NULL, curtsb = sfmmup->sfmmu_tsb; 9848 curtsb != old_tsbinfo && curtsb != NULL; 9849 prevtsb = curtsb, curtsb = curtsb->tsb_next) 9850 ; 9851 ASSERT(curtsb != NULL); 9852 9853 if (!(flags & TSB_SWAPIN) && SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) { 9854 /* 9855 * The process is swapped out, so just set the new size 9856 * code. When it swaps back in, we'll allocate a new one 9857 * of the new chosen size. 9858 */ 9859 curtsb->tsb_szc = szc; 9860 return (TSB_SUCCESS); 9861 } 9862 SFMMU_FLAGS_SET(sfmmup, HAT_BUSY); 9863 9864 tte_sz_mask = old_tsbinfo->tsb_ttesz_mask; 9865 9866 /* 9867 * All initialization is done inside of sfmmu_tsbinfo_alloc(). 9868 * If we fail to allocate a TSB, exit. 9869 * 9870 * If tsb grows with new tsb size > 4M and old tsb size < 4M, 9871 * then try 4M slab after the initial alloc fails. 
9872 * 9873 * If tsb swapin with tsb size > 4M, then try 4M after the 9874 * initial alloc fails. 9875 */ 9876 sfmmu_hat_exit(hatlockp); 9877 if (sfmmu_tsbinfo_alloc(&new_tsbinfo, szc, 9878 tte_sz_mask, flags, sfmmup) && 9879 (!(flags & (TSB_GROW | TSB_SWAPIN)) || (szc <= TSB_4M_SZCODE) || 9880 (!(flags & TSB_SWAPIN) && 9881 (old_tsbinfo->tsb_szc >= TSB_4M_SZCODE)) || 9882 sfmmu_tsbinfo_alloc(&new_tsbinfo, TSB_4M_SZCODE, 9883 tte_sz_mask, flags, sfmmup))) { 9884 (void) sfmmu_hat_enter(sfmmup); 9885 if (!(flags & TSB_SWAPIN)) 9886 SFMMU_STAT(sf_tsb_resize_failures); 9887 SFMMU_FLAGS_CLEAR(sfmmup, HAT_BUSY); 9888 return (TSB_ALLOCFAIL); 9889 } 9890 (void) sfmmu_hat_enter(sfmmup); 9891 9892 /* 9893 * Re-check to make sure somebody else didn't muck with us while we 9894 * didn't hold the HAT lock. If the process swapped out, fine, just 9895 * exit; this can happen if we try to shrink the TSB from the context 9896 * of another process (such as on an ISM unmap), though it is rare. 9897 */ 9898 if (!(flags & TSB_SWAPIN) && SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) { 9899 SFMMU_STAT(sf_tsb_resize_failures); 9900 SFMMU_FLAGS_CLEAR(sfmmup, HAT_BUSY); 9901 sfmmu_hat_exit(hatlockp); 9902 sfmmu_tsbinfo_free(new_tsbinfo); 9903 (void) sfmmu_hat_enter(sfmmup); 9904 return (TSB_LOSTRACE); 9905 } 9906 9907 #ifdef DEBUG 9908 /* Reverify that the tsb_info still exists.. for debugging only */ 9909 for (prevtsb = NULL, curtsb = sfmmup->sfmmu_tsb; 9910 curtsb != old_tsbinfo && curtsb != NULL; 9911 prevtsb = curtsb, curtsb = curtsb->tsb_next) 9912 ; 9913 ASSERT(curtsb != NULL); 9914 #endif /* DEBUG */ 9915 9916 /* 9917 * Quiesce any CPUs running this process on their next TLB miss 9918 * so they atomically see the new tsb_info. We temporarily set the 9919 * context to invalid context so new threads that come on processor 9920 * after we do the xcall to cpusran will also serialize behind the 9921 * HAT lock on TLB miss and will see the new TSB. Since this short 9922 * race with a new thread coming on processor is relatively rare, 9923 * this synchronization mechanism should be cheaper than always 9924 * pausing all CPUs for the duration of the setup, which is what 9925 * the old implementation did. This is particuarly true if we are 9926 * copying a huge chunk of memory around during that window. 9927 * 9928 * The memory barriers are to make sure things stay consistent 9929 * with resume() since it does not hold the HAT lock while 9930 * walking the list of tsb_info structures. 9931 */ 9932 if ((flags & TSB_SWAPIN) != TSB_SWAPIN) { 9933 /* The TSB is either growing or shrinking. */ 9934 sfmmu_invalidate_ctx(sfmmup); 9935 } else { 9936 /* 9937 * It is illegal to swap in TSBs from a process other 9938 * than a process being swapped in. This in turn 9939 * implies we do not have a valid MMU context here 9940 * since a process needs one to resolve translation 9941 * misses. 
9942 */ 9943 ASSERT(curthread->t_procp->p_as->a_hat == sfmmup); 9944 } 9945 9946 #ifdef DEBUG 9947 ASSERT(max_mmu_ctxdoms > 0); 9948 9949 /* 9950 * Process should have INVALID_CONTEXT on all MMUs 9951 */ 9952 for (i = 0; i < max_mmu_ctxdoms; i++) { 9953 9954 ASSERT(sfmmup->sfmmu_ctxs[i].cnum == INVALID_CONTEXT); 9955 } 9956 #endif 9957 9958 new_tsbinfo->tsb_next = old_tsbinfo->tsb_next; 9959 membar_stst(); /* strict ordering required */ 9960 if (prevtsb) 9961 prevtsb->tsb_next = new_tsbinfo; 9962 else 9963 sfmmup->sfmmu_tsb = new_tsbinfo; 9964 membar_enter(); /* make sure new TSB globally visible */ 9965 9966 /* 9967 * We need to migrate TSB entries from the old TSB to the new TSB 9968 * if tsb_remap_ttes is set and the TSB is growing. 9969 */ 9970 if (tsb_remap_ttes && ((flags & TSB_GROW) == TSB_GROW)) 9971 sfmmu_copy_tsb(old_tsbinfo, new_tsbinfo); 9972 9973 SFMMU_FLAGS_CLEAR(sfmmup, HAT_BUSY); 9974 9975 /* 9976 * Drop the HAT lock to free our old tsb_info. 9977 */ 9978 sfmmu_hat_exit(hatlockp); 9979 9980 if ((flags & TSB_GROW) == TSB_GROW) { 9981 SFMMU_STAT(sf_tsb_grow); 9982 } else if ((flags & TSB_SHRINK) == TSB_SHRINK) { 9983 SFMMU_STAT(sf_tsb_shrink); 9984 } 9985 9986 sfmmu_tsbinfo_free(old_tsbinfo); 9987 9988 (void) sfmmu_hat_enter(sfmmup); 9989 return (TSB_SUCCESS); 9990 } 9991 9992 /* 9993 * This function will re-program hat pgsz array, and invalidate the 9994 * process' context, forcing the process to switch to another 9995 * context on the next TLB miss, and therefore start using the 9996 * TLB that is reprogrammed for the new page sizes. 9997 */ 9998 void 9999 sfmmu_reprog_pgsz_arr(sfmmu_t *sfmmup, uint8_t *tmp_pgsz) 10000 { 10001 int i; 10002 hatlock_t *hatlockp = NULL; 10003 10004 hatlockp = sfmmu_hat_enter(sfmmup); 10005 /* USIII+-IV+ optimization, requires hat lock */ 10006 if (tmp_pgsz) { 10007 for (i = 0; i < mmu_page_sizes; i++) 10008 sfmmup->sfmmu_pgsz[i] = tmp_pgsz[i]; 10009 } 10010 SFMMU_STAT(sf_tlb_reprog_pgsz); 10011 10012 sfmmu_invalidate_ctx(sfmmup); 10013 10014 sfmmu_hat_exit(hatlockp); 10015 } 10016 10017 /* 10018 * The scd_rttecnt field in the SCD must be updated to take account of the 10019 * regions which it contains. 10020 */ 10021 static void 10022 sfmmu_set_scd_rttecnt(sf_srd_t *srdp, sf_scd_t *scdp) 10023 { 10024 uint_t rid; 10025 uint_t i, j; 10026 ulong_t w; 10027 sf_region_t *rgnp; 10028 10029 ASSERT(srdp != NULL); 10030 10031 for (i = 0; i < SFMMU_HMERGNMAP_WORDS; i++) { 10032 if ((w = scdp->scd_region_map.bitmap[i]) == 0) { 10033 continue; 10034 } 10035 10036 j = 0; 10037 while (w) { 10038 if (!(w & 0x1)) { 10039 j++; 10040 w >>= 1; 10041 continue; 10042 } 10043 rid = (i << BT_ULSHIFT) | j; 10044 j++; 10045 w >>= 1; 10046 10047 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 10048 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 10049 rgnp = srdp->srd_hmergnp[rid]; 10050 ASSERT(rgnp->rgn_refcnt > 0); 10051 ASSERT(rgnp->rgn_id == rid); 10052 10053 scdp->scd_rttecnt[rgnp->rgn_pgszc] += 10054 rgnp->rgn_size >> TTE_PAGE_SHIFT(rgnp->rgn_pgszc); 10055 10056 /* 10057 * Maintain the tsb0 inflation cnt for the regions 10058 * in the SCD. 
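 *
 * Illustrative example (hypothetical numbers, not taken from this
 * file): for a region of rgn_size = 4M backed by 4M pages, the
 * scd_rttecnt update above adds rgn_size >> TTE_PAGE_SHIFT(TTE4M),
 * i.e. one TTE, while the inflation term below charges
 * rgn_size >> (TTE_PAGE_SHIFT(TTE8K) + 2) = 4M / (8K * 4) = 128
 * 8K-TTE equivalents against the first TSB, i.e. a quarter of the
 * 8K mappings the region would otherwise need if its large pages
 * could not be allocated.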
10059 */ 10060 if (rgnp->rgn_pgszc >= TTE4M) { 10061 scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt += 10062 rgnp->rgn_size >> 10063 (TTE_PAGE_SHIFT(TTE8K) + 2); 10064 } 10065 } 10066 } 10067 } 10068 10069 /* 10070 * This function assumes that there are either four or six supported page 10071 * sizes and at most two programmable TLBs, so we need to decide which 10072 * page sizes are most important and then tell the MMU layer so it 10073 * can adjust the TLB page sizes accordingly (if supported). 10074 * 10075 * If these assumptions change, this function will need to be 10076 * updated to support whatever the new limits are. 10077 * 10078 * The growing flag is nonzero if we are growing the address space, 10079 * and zero if it is shrinking. This allows us to decide whether 10080 * to grow or shrink our TSB, depending upon available memory 10081 * conditions. 10082 */ 10083 static void 10084 sfmmu_check_page_sizes(sfmmu_t *sfmmup, int growing) 10085 { 10086 uint64_t ttecnt[MMU_PAGE_SIZES]; 10087 uint64_t tte8k_cnt, tte4m_cnt; 10088 uint8_t i; 10089 int sectsb_thresh; 10090 10091 /* 10092 * Kernel threads, processes with small address spaces not using 10093 * large pages, and dummy ISM HATs need not apply. 10094 */ 10095 if (sfmmup == ksfmmup || sfmmup->sfmmu_ismhat != NULL) 10096 return; 10097 10098 if (!SFMMU_LGPGS_INUSE(sfmmup) && 10099 sfmmup->sfmmu_ttecnt[TTE8K] <= tsb_rss_factor) 10100 return; 10101 10102 for (i = 0; i < mmu_page_sizes; i++) { 10103 ttecnt[i] = sfmmup->sfmmu_ttecnt[i] + 10104 sfmmup->sfmmu_ismttecnt[i]; 10105 } 10106 10107 /* Check pagesizes in use, and possibly reprogram DTLB. */ 10108 if (&mmu_check_page_sizes) 10109 mmu_check_page_sizes(sfmmup, ttecnt); 10110 10111 /* 10112 * Calculate the number of 8k ttes to represent the span of these 10113 * pages. 10114 */ 10115 tte8k_cnt = ttecnt[TTE8K] + 10116 (ttecnt[TTE64K] << (MMU_PAGESHIFT64K - MMU_PAGESHIFT)) + 10117 (ttecnt[TTE512K] << (MMU_PAGESHIFT512K - MMU_PAGESHIFT)); 10118 if (mmu_page_sizes == max_mmu_page_sizes) { 10119 tte4m_cnt = ttecnt[TTE4M] + 10120 (ttecnt[TTE32M] << (MMU_PAGESHIFT32M - MMU_PAGESHIFT4M)) + 10121 (ttecnt[TTE256M] << (MMU_PAGESHIFT256M - MMU_PAGESHIFT4M)); 10122 } else { 10123 tte4m_cnt = ttecnt[TTE4M]; 10124 } 10125 10126 /* 10127 * Inflate tte8k_cnt to allow for region large page allocation failure. 10128 */ 10129 tte8k_cnt += sfmmup->sfmmu_tsb0_4minflcnt; 10130 10131 /* 10132 * Inflate TSB sizes by a factor of 2 if this process 10133 * uses 4M text pages to minimize extra conflict misses 10134 * in the first TSB since without counting text pages 10135 * 8K TSB may become too small. 10136 * 10137 * Also double the size of the second TSB to minimize 10138 * extra conflict misses due to competition between 4M text pages 10139 * and data pages. 10140 * 10141 * We need to adjust the second TSB allocation threshold by the 10142 * inflation factor, since there is no point in creating a second 10143 * TSB when we know all the mappings can fit in the I/D TLBs. 10144 */ 10145 sectsb_thresh = tsb_sectsb_threshold; 10146 if (sfmmup->sfmmu_flags & HAT_4MTEXT_FLAG) { 10147 tte8k_cnt <<= 1; 10148 tte4m_cnt <<= 1; 10149 sectsb_thresh <<= 1; 10150 } 10151 10152 /* 10153 * Check to see if our TSB is the right size; we may need to 10154 * grow or shrink it. If the process is small, our work is 10155 * finished at this point. 
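 *
 * Worked example (hypothetical counts, not from this file): with
 * ttecnt[TTE8K] = 1000, ttecnt[TTE64K] = 10 and ttecnt[TTE512K] = 2,
 *
 *	tte8k_cnt = 1000 + (10 << 3) + (2 << 6) = 1208
 *
 * since a 64K page spans 8 and a 512K page spans 64 8K pages. If the
 * process uses 4M text (HAT_4MTEXT_FLAG), tte8k_cnt, tte4m_cnt and
 * sectsb_thresh are all doubled before the comparison below.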
10156 */ 10157 if (tte8k_cnt <= tsb_rss_factor && tte4m_cnt <= sectsb_thresh) { 10158 return; 10159 } 10160 sfmmu_size_tsb(sfmmup, growing, tte8k_cnt, tte4m_cnt, sectsb_thresh); 10161 } 10162 10163 static void 10164 sfmmu_size_tsb(sfmmu_t *sfmmup, int growing, uint64_t tte8k_cnt, 10165 uint64_t tte4m_cnt, int sectsb_thresh) 10166 { 10167 int tsb_bits; 10168 uint_t tsb_szc; 10169 struct tsb_info *tsbinfop; 10170 hatlock_t *hatlockp = NULL; 10171 10172 hatlockp = sfmmu_hat_enter(sfmmup); 10173 ASSERT(hatlockp != NULL); 10174 tsbinfop = sfmmup->sfmmu_tsb; 10175 ASSERT(tsbinfop != NULL); 10176 10177 /* 10178 * If we're growing, select the size based on RSS. If we're 10179 * shrinking, leave some room so we don't have to turn around and 10180 * grow again immediately. 10181 */ 10182 if (growing) 10183 tsb_szc = SELECT_TSB_SIZECODE(tte8k_cnt); 10184 else 10185 tsb_szc = SELECT_TSB_SIZECODE(tte8k_cnt << 1); 10186 10187 if (!growing && (tsb_szc < tsbinfop->tsb_szc) && 10188 (tsb_szc >= default_tsb_size) && TSB_OK_SHRINK()) { 10189 (void) sfmmu_replace_tsb(sfmmup, tsbinfop, tsb_szc, 10190 hatlockp, TSB_SHRINK); 10191 } else if (growing && tsb_szc > tsbinfop->tsb_szc && TSB_OK_GROW()) { 10192 (void) sfmmu_replace_tsb(sfmmup, tsbinfop, tsb_szc, 10193 hatlockp, TSB_GROW); 10194 } 10195 tsbinfop = sfmmup->sfmmu_tsb; 10196 10197 /* 10198 * With the TLB and first TSB out of the way, we need to see if 10199 * we need a second TSB for 4M pages. If we managed to reprogram 10200 * the TLB page sizes above, the process will start using this new 10201 * TSB right away; otherwise, it will start using it on the next 10202 * context switch. Either way, it's no big deal so there's no 10203 * synchronization with the trap handlers here unless we grow the 10204 * TSB (in which case it's required to prevent using the old one 10205 * after it's freed). Note: second tsb is required for 32M/256M 10206 * page sizes. 10207 */ 10208 if (tte4m_cnt > sectsb_thresh) { 10209 /* 10210 * If we're growing, select the size based on RSS. If we're 10211 * shrinking, leave some room so we don't have to turn 10212 * around and grow again immediately. 10213 */ 10214 if (growing) 10215 tsb_szc = SELECT_TSB_SIZECODE(tte4m_cnt); 10216 else 10217 tsb_szc = SELECT_TSB_SIZECODE(tte4m_cnt << 1); 10218 if (tsbinfop->tsb_next == NULL) { 10219 struct tsb_info *newtsb; 10220 int allocflags = SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)? 10221 0 : TSB_ALLOC; 10222 10223 sfmmu_hat_exit(hatlockp); 10224 10225 /* 10226 * Try to allocate a TSB for 4[32|256]M pages. If we 10227 * can't get the size we want, retry w/a minimum sized 10228 * TSB. If that still didn't work, give up; we can 10229 * still run without one. 10230 */ 10231 tsb_bits = (mmu_page_sizes == max_mmu_page_sizes)? 10232 TSB4M|TSB32M|TSB256M:TSB4M; 10233 if ((sfmmu_tsbinfo_alloc(&newtsb, tsb_szc, tsb_bits, 10234 allocflags, sfmmup)) && 10235 (tsb_szc <= TSB_4M_SZCODE || 10236 sfmmu_tsbinfo_alloc(&newtsb, TSB_4M_SZCODE, 10237 tsb_bits, allocflags, sfmmup)) && 10238 sfmmu_tsbinfo_alloc(&newtsb, TSB_MIN_SZCODE, 10239 tsb_bits, allocflags, sfmmup)) { 10240 return; 10241 } 10242 10243 hatlockp = sfmmu_hat_enter(sfmmup); 10244 10245 sfmmu_invalidate_ctx(sfmmup); 10246 10247 if (sfmmup->sfmmu_tsb->tsb_next == NULL) { 10248 sfmmup->sfmmu_tsb->tsb_next = newtsb; 10249 SFMMU_STAT(sf_tsb_sectsb_create); 10250 sfmmu_hat_exit(hatlockp); 10251 return; 10252 } else { 10253 /* 10254 * It's annoying, but possible for us 10255 * to get here.. 
we dropped the HAT lock 10256 * because of locking order in the kmem 10257 * allocator, and while we were off getting 10258 * our memory, some other thread decided to 10259 * do us a favor and won the race to get a 10260 * second TSB for this process. Sigh. 10261 */ 10262 sfmmu_hat_exit(hatlockp); 10263 sfmmu_tsbinfo_free(newtsb); 10264 return; 10265 } 10266 } 10267 10268 /* 10269 * We have a second TSB, see if it's big enough. 10270 */ 10271 tsbinfop = tsbinfop->tsb_next; 10272 10273 /* 10274 * Check to see if our second TSB is the right size; 10275 * we may need to grow or shrink it. 10276 * To prevent thrashing (e.g. growing the TSB on a 10277 * subsequent map operation), only try to shrink if 10278 * the TSB reach exceeds twice the virtual address 10279 * space size. 10280 */ 10281 if (!growing && (tsb_szc < tsbinfop->tsb_szc) && 10282 (tsb_szc >= default_tsb_size) && TSB_OK_SHRINK()) { 10283 (void) sfmmu_replace_tsb(sfmmup, tsbinfop, 10284 tsb_szc, hatlockp, TSB_SHRINK); 10285 } else if (growing && tsb_szc > tsbinfop->tsb_szc && 10286 TSB_OK_GROW()) { 10287 (void) sfmmu_replace_tsb(sfmmup, tsbinfop, 10288 tsb_szc, hatlockp, TSB_GROW); 10289 } 10290 } 10291 10292 sfmmu_hat_exit(hatlockp); 10293 } 10294 10295 /* 10296 * Free up a sfmmu 10297 * Since the sfmmu is currently embedded in the hat struct we simply zero 10298 * out our fields and free up the ism map blk list if any. 10299 */ 10300 static void 10301 sfmmu_free_sfmmu(sfmmu_t *sfmmup) 10302 { 10303 ism_blk_t *blkp, *nx_blkp; 10304 #ifdef DEBUG 10305 ism_map_t *map; 10306 int i; 10307 #endif 10308 10309 ASSERT(sfmmup->sfmmu_ttecnt[TTE8K] == 0); 10310 ASSERT(sfmmup->sfmmu_ttecnt[TTE64K] == 0); 10311 ASSERT(sfmmup->sfmmu_ttecnt[TTE512K] == 0); 10312 ASSERT(sfmmup->sfmmu_ttecnt[TTE4M] == 0); 10313 ASSERT(sfmmup->sfmmu_ttecnt[TTE32M] == 0); 10314 ASSERT(sfmmup->sfmmu_ttecnt[TTE256M] == 0); 10315 ASSERT(SF_RGNMAP_ISNULL(sfmmup)); 10316 10317 sfmmup->sfmmu_free = 0; 10318 sfmmup->sfmmu_ismhat = 0; 10319 10320 blkp = sfmmup->sfmmu_iblk; 10321 sfmmup->sfmmu_iblk = NULL; 10322 10323 while (blkp) { 10324 #ifdef DEBUG 10325 map = blkp->iblk_maps; 10326 for (i = 0; i < ISM_MAP_SLOTS; i++) { 10327 ASSERT(map[i].imap_seg == 0); 10328 ASSERT(map[i].imap_ismhat == NULL); 10329 ASSERT(map[i].imap_ment == NULL); 10330 } 10331 #endif 10332 nx_blkp = blkp->iblk_next; 10333 blkp->iblk_next = NULL; 10334 blkp->iblk_nextpa = (uint64_t)-1; 10335 kmem_cache_free(ism_blk_cache, blkp); 10336 blkp = nx_blkp; 10337 } 10338 } 10339 10340 /* 10341 * Locking primitves accessed by HATLOCK macros 10342 */ 10343 10344 #define SFMMU_SPL_MTX (0x0) 10345 #define SFMMU_ML_MTX (0x1) 10346 10347 #define SFMMU_MLSPL_MTX(type, pg) (((type) == SFMMU_SPL_MTX) ? \ 10348 SPL_HASH(pg) : MLIST_HASH(pg)) 10349 10350 kmutex_t * 10351 sfmmu_page_enter(struct page *pp) 10352 { 10353 return (sfmmu_mlspl_enter(pp, SFMMU_SPL_MTX)); 10354 } 10355 10356 void 10357 sfmmu_page_exit(kmutex_t *spl) 10358 { 10359 mutex_exit(spl); 10360 } 10361 10362 int 10363 sfmmu_page_spl_held(struct page *pp) 10364 { 10365 return (sfmmu_mlspl_held(pp, SFMMU_SPL_MTX)); 10366 } 10367 10368 kmutex_t * 10369 sfmmu_mlist_enter(struct page *pp) 10370 { 10371 return (sfmmu_mlspl_enter(pp, SFMMU_ML_MTX)); 10372 } 10373 10374 void 10375 sfmmu_mlist_exit(kmutex_t *mml) 10376 { 10377 mutex_exit(mml); 10378 } 10379 10380 int 10381 sfmmu_mlist_held(struct page *pp) 10382 { 10383 10384 return (sfmmu_mlspl_held(pp, SFMMU_ML_MTX)); 10385 } 10386 10387 /* 10388 * Common code for sfmmu_mlist_enter() and sfmmu_page_enter(). 
For 10389 * sfmmu_mlist_enter() case mml_table lock array is used and for 10390 * sfmmu_page_enter() sfmmu_page_lock lock array is used. 10391 * 10392 * The lock is taken on a root page so that it protects an operation on all 10393 * constituent pages of a large page pp belongs to. 10394 * 10395 * The routine takes a lock from the appropriate array. The lock is determined 10396 * by hashing the root page. After taking the lock this routine checks if the 10397 * root page has the same size code that was used to determine the root (i.e 10398 * that root hasn't changed). If root page has the expected p_szc field we 10399 * have the right lock and it's returned to the caller. If root's p_szc 10400 * decreased we release the lock and retry from the beginning. This case can 10401 * happen due to hat_page_demote() decreasing p_szc between our load of p_szc 10402 * value and taking the lock. The number of retries due to p_szc decrease is 10403 * limited by the maximum p_szc value. If p_szc is 0 we return the lock 10404 * determined by hashing pp itself. 10405 * 10406 * If our caller doesn't hold a SE_SHARED or SE_EXCL lock on pp it's also 10407 * possible that p_szc can increase. To increase p_szc a thread has to lock 10408 * all constituent pages EXCL and do hat_pageunload() on all of them. All the 10409 * callers that don't hold a page locked recheck if hmeblk through which pp 10410 * was found still maps this pp. If it doesn't map it anymore returned lock 10411 * is immediately dropped. Therefore if sfmmu_mlspl_enter() hits the case of 10412 * p_szc increase after taking the lock it returns this lock without further 10413 * retries because in this case the caller doesn't care about which lock was 10414 * taken. The caller will drop it right away. 10415 * 10416 * After the routine returns it's guaranteed that hat_page_demote() can't 10417 * change p_szc field of any of constituent pages of a large page pp belongs 10418 * to as long as pp was either locked at least SHARED prior to this call or 10419 * the caller finds that hment that pointed to this pp still references this 10420 * pp (this also assumes that the caller holds hme hash bucket lock so that 10421 * the same pp can't be remapped into the same hmeblk after it was unmapped by 10422 * hat_pageunload()). 10423 */ 10424 static kmutex_t * 10425 sfmmu_mlspl_enter(struct page *pp, int type) 10426 { 10427 kmutex_t *mtx; 10428 uint_t prev_rszc = UINT_MAX; 10429 page_t *rootpp; 10430 uint_t szc; 10431 uint_t rszc; 10432 uint_t pszc = pp->p_szc; 10433 10434 ASSERT(pp != NULL); 10435 10436 again: 10437 if (pszc == 0) { 10438 mtx = SFMMU_MLSPL_MTX(type, pp); 10439 mutex_enter(mtx); 10440 return (mtx); 10441 } 10442 10443 /* The lock lives in the root page */ 10444 rootpp = PP_GROUPLEADER(pp, pszc); 10445 mtx = SFMMU_MLSPL_MTX(type, rootpp); 10446 mutex_enter(mtx); 10447 10448 /* 10449 * Return mml in the following 3 cases: 10450 * 10451 * 1) If pp itself is root since if its p_szc decreased before we took 10452 * the lock pp is still the root of smaller szc page. And if its p_szc 10453 * increased it doesn't matter what lock we return (see comment in 10454 * front of this routine). 10455 * 10456 * 2) If pp's not root but rootpp is the root of a rootpp->p_szc size 10457 * large page we have the right lock since any previous potential 10458 * hat_page_demote() is done demoting from greater than current root's 10459 * p_szc because hat_page_demote() changes root's p_szc last. 
No 10460 * further hat_page_demote() can start or be in progress since it 10461 * would need the same lock we currently hold. 10462 * 10463 * 3) If rootpp's p_szc increased since previous iteration it doesn't 10464 * matter what lock we return (see comment in front of this routine). 10465 */ 10466 if (pp == rootpp || (rszc = rootpp->p_szc) == pszc || 10467 rszc >= prev_rszc) { 10468 return (mtx); 10469 } 10470 10471 /* 10472 * hat_page_demote() could have decreased root's p_szc. 10473 * In this case pp's p_szc must also be smaller than pszc. 10474 * Retry. 10475 */ 10476 if (rszc < pszc) { 10477 szc = pp->p_szc; 10478 if (szc < pszc) { 10479 mutex_exit(mtx); 10480 pszc = szc; 10481 goto again; 10482 } 10483 /* 10484 * pp's p_szc increased after it was decreased. 10485 * page cannot be mapped. Return current lock. The caller 10486 * will drop it right away. 10487 */ 10488 return (mtx); 10489 } 10490 10491 /* 10492 * root's p_szc is greater than pp's p_szc. 10493 * hat_page_demote() is not done with all pages 10494 * yet. Wait for it to complete. 10495 */ 10496 mutex_exit(mtx); 10497 rootpp = PP_GROUPLEADER(rootpp, rszc); 10498 mtx = SFMMU_MLSPL_MTX(type, rootpp); 10499 mutex_enter(mtx); 10500 mutex_exit(mtx); 10501 prev_rszc = rszc; 10502 goto again; 10503 } 10504 10505 static int 10506 sfmmu_mlspl_held(struct page *pp, int type) 10507 { 10508 kmutex_t *mtx; 10509 10510 ASSERT(pp != NULL); 10511 /* The lock lives in the root page */ 10512 pp = PP_PAGEROOT(pp); 10513 ASSERT(pp != NULL); 10514 10515 mtx = SFMMU_MLSPL_MTX(type, pp); 10516 return (MUTEX_HELD(mtx)); 10517 } 10518 10519 static uint_t 10520 sfmmu_get_free_hblk(struct hme_blk **hmeblkpp, uint_t critical) 10521 { 10522 struct hme_blk *hblkp; 10523 10524 10525 if (freehblkp != NULL) { 10526 mutex_enter(&freehblkp_lock); 10527 if (freehblkp != NULL) { 10528 /* 10529 * If the current thread is owning hblk_reserve OR 10530 * critical request from sfmmu_hblk_steal() 10531 * let it succeed even if freehblkcnt is really low. 10532 */ 10533 if (freehblkcnt <= HBLK_RESERVE_MIN && !critical) { 10534 SFMMU_STAT(sf_get_free_throttle); 10535 mutex_exit(&freehblkp_lock); 10536 return (0); 10537 } 10538 freehblkcnt--; 10539 *hmeblkpp = freehblkp; 10540 hblkp = *hmeblkpp; 10541 freehblkp = hblkp->hblk_next; 10542 mutex_exit(&freehblkp_lock); 10543 hblkp->hblk_next = NULL; 10544 SFMMU_STAT(sf_get_free_success); 10545 10546 ASSERT(hblkp->hblk_hmecnt == 0); 10547 ASSERT(hblkp->hblk_vcnt == 0); 10548 ASSERT(hblkp->hblk_nextpa == va_to_pa((caddr_t)hblkp)); 10549 10550 return (1); 10551 } 10552 mutex_exit(&freehblkp_lock); 10553 } 10554 10555 /* Check cpu hblk pending queues */ 10556 if ((*hmeblkpp = sfmmu_check_pending_hblks(TTE8K)) != NULL) { 10557 hblkp = *hmeblkpp; 10558 hblkp->hblk_next = NULL; 10559 hblkp->hblk_nextpa = va_to_pa((caddr_t)hblkp); 10560 10561 ASSERT(hblkp->hblk_hmecnt == 0); 10562 ASSERT(hblkp->hblk_vcnt == 0); 10563 10564 return (1); 10565 } 10566 10567 SFMMU_STAT(sf_get_free_fail); 10568 return (0); 10569 } 10570 10571 static uint_t 10572 sfmmu_put_free_hblk(struct hme_blk *hmeblkp, uint_t critical) 10573 { 10574 struct hme_blk *hblkp; 10575 10576 ASSERT(hmeblkp->hblk_hmecnt == 0); 10577 ASSERT(hmeblkp->hblk_vcnt == 0); 10578 ASSERT(hmeblkp->hblk_nextpa == va_to_pa((caddr_t)hmeblkp)); 10579 10580 /* 10581 * If the current thread is mapping into kernel space, 10582 * let it succede even if freehblkcnt is max 10583 * so that it will avoid freeing it to kmem. 
10584 * This will prevent stack overflow due to 10585 * possible recursion since kmem_cache_free() 10586 * might require creation of a slab which 10587 * in turn needs an hmeblk to map that slab; 10588 * let's break this vicious chain at the first 10589 * opportunity. 10590 */ 10591 if (freehblkcnt < HBLK_RESERVE_CNT || critical) { 10592 mutex_enter(&freehblkp_lock); 10593 if (freehblkcnt < HBLK_RESERVE_CNT || critical) { 10594 SFMMU_STAT(sf_put_free_success); 10595 freehblkcnt++; 10596 hmeblkp->hblk_next = freehblkp; 10597 freehblkp = hmeblkp; 10598 mutex_exit(&freehblkp_lock); 10599 return (1); 10600 } 10601 mutex_exit(&freehblkp_lock); 10602 } 10603 10604 /* 10605 * Bring down freehblkcnt to HBLK_RESERVE_CNT. We are here 10606 * only if freehblkcnt is at least HBLK_RESERVE_CNT *and* 10607 * we are not in the process of mapping into kernel space. 10608 */ 10609 ASSERT(!critical); 10610 while (freehblkcnt > HBLK_RESERVE_CNT) { 10611 mutex_enter(&freehblkp_lock); 10612 if (freehblkcnt > HBLK_RESERVE_CNT) { 10613 freehblkcnt--; 10614 hblkp = freehblkp; 10615 freehblkp = hblkp->hblk_next; 10616 mutex_exit(&freehblkp_lock); 10617 ASSERT(get_hblk_cache(hblkp) == sfmmu8_cache); 10618 kmem_cache_free(sfmmu8_cache, hblkp); 10619 continue; 10620 } 10621 mutex_exit(&freehblkp_lock); 10622 } 10623 SFMMU_STAT(sf_put_free_fail); 10624 return (0); 10625 } 10626 10627 static void 10628 sfmmu_hblk_swap(struct hme_blk *new) 10629 { 10630 struct hme_blk *old, *hblkp, *prev; 10631 uint64_t newpa; 10632 caddr_t base, vaddr, endaddr; 10633 struct hmehash_bucket *hmebp; 10634 struct sf_hment *osfhme, *nsfhme; 10635 page_t *pp; 10636 kmutex_t *pml; 10637 tte_t tte; 10638 struct hme_blk *list = NULL; 10639 10640 #ifdef DEBUG 10641 hmeblk_tag hblktag; 10642 struct hme_blk *found; 10643 #endif 10644 old = HBLK_RESERVE; 10645 ASSERT(!old->hblk_shared); 10646 10647 /* 10648 * save pa before bcopy clobbers it 10649 */ 10650 newpa = new->hblk_nextpa; 10651 10652 base = (caddr_t)get_hblk_base(old); 10653 endaddr = base + get_hblk_span(old); 10654 10655 /* 10656 * acquire hash bucket lock. 10657 */ 10658 hmebp = sfmmu_tteload_acquire_hashbucket(ksfmmup, base, TTE8K, 10659 SFMMU_INVALID_SHMERID); 10660 10661 /* 10662 * copy contents from old to new 10663 */ 10664 bcopy((void *)old, (void *)new, HME8BLK_SZ); 10665 10666 /* 10667 * add new to hash chain 10668 */ 10669 sfmmu_hblk_hash_add(hmebp, new, newpa); 10670 10671 /* 10672 * search hash chain for hblk_reserve; this needs to be performed 10673 * after adding new, otherwise prev won't correspond to the hblk which 10674 * is prior to old in hash chain when we call sfmmu_hblk_hash_rm to 10675 * remove old later. 10676 */ 10677 for (prev = NULL, 10678 hblkp = hmebp->hmeblkp; hblkp != NULL && hblkp != old; 10679 prev = hblkp, hblkp = hblkp->hblk_next) 10680 ; 10681 10682 if (hblkp != old) 10683 panic("sfmmu_hblk_swap: hblk_reserve not found"); 10684 10685 /* 10686 * p_mapping list is still pointing to hments in hblk_reserve; 10687 * fix up p_mapping list so that they point to hments in new. 10688 * 10689 * Since all these mappings are created by hblk_reserve_thread 10690 * on the way and it's using at least one of the buffers from each of 10691 * the newly minted slabs, there is no danger of any of these 10692 * mappings getting unloaded by another thread. 10693 * 10694 * tsbmiss could only modify ref/mod bits of hments in old/new. 
10695 * Since all of these hments hold mappings established by segkmem 10696 * and mappings in segkmem are setup with HAT_NOSYNC, ref/mod bits 10697 * have no meaning for the mappings in hblk_reserve. hments in 10698 * old and new are identical except for ref/mod bits. 10699 */ 10700 for (vaddr = base; vaddr < endaddr; vaddr += TTEBYTES(TTE8K)) { 10701 10702 HBLKTOHME(osfhme, old, vaddr); 10703 sfmmu_copytte(&osfhme->hme_tte, &tte); 10704 10705 if (TTE_IS_VALID(&tte)) { 10706 if ((pp = osfhme->hme_page) == NULL) 10707 panic("sfmmu_hblk_swap: page not mapped"); 10708 10709 pml = sfmmu_mlist_enter(pp); 10710 10711 if (pp != osfhme->hme_page) 10712 panic("sfmmu_hblk_swap: mapping changed"); 10713 10714 HBLKTOHME(nsfhme, new, vaddr); 10715 10716 HME_ADD(nsfhme, pp); 10717 HME_SUB(osfhme, pp); 10718 10719 sfmmu_mlist_exit(pml); 10720 } 10721 } 10722 10723 /* 10724 * remove old from hash chain 10725 */ 10726 sfmmu_hblk_hash_rm(hmebp, old, prev, &list, 1); 10727 10728 #ifdef DEBUG 10729 10730 hblktag.htag_id = ksfmmup; 10731 hblktag.htag_rid = SFMMU_INVALID_SHMERID; 10732 hblktag.htag_bspage = HME_HASH_BSPAGE(base, HME_HASH_SHIFT(TTE8K)); 10733 hblktag.htag_rehash = HME_HASH_REHASH(TTE8K); 10734 HME_HASH_FAST_SEARCH(hmebp, hblktag, found); 10735 10736 if (found != new) 10737 panic("sfmmu_hblk_swap: new hblk not found"); 10738 #endif 10739 10740 SFMMU_HASH_UNLOCK(hmebp); 10741 10742 /* 10743 * Reset hblk_reserve 10744 */ 10745 bzero((void *)old, HME8BLK_SZ); 10746 old->hblk_nextpa = va_to_pa((caddr_t)old); 10747 } 10748 10749 /* 10750 * Grab the mlist mutex for both pages passed in. 10751 * 10752 * low and high will be returned as pointers to the mutexes for these pages. 10753 * low refers to the mutex residing in the lower bin of the mlist hash, while 10754 * high refers to the mutex residing in the higher bin of the mlist hash. This 10755 * is due to the locking order restrictions on the same thread grabbing 10756 * multiple mlist mutexes. The low lock must be acquired before the high lock. 10757 * 10758 * If both pages hash to the same mutex, only grab that single mutex, and 10759 * high will be returned as NULL 10760 * If the pages hash to different bins in the hash, grab the lower addressed 10761 * lock first and then the higher addressed lock in order to follow the locking 10762 * rules involved with the same thread grabbing multiple mlist mutexes. 10763 * low and high will both have non-NULL values. 10764 */ 10765 static void 10766 sfmmu_mlist_reloc_enter(struct page *targ, struct page *repl, 10767 kmutex_t **low, kmutex_t **high) 10768 { 10769 kmutex_t *mml_targ, *mml_repl; 10770 10771 /* 10772 * no need to do the dance around szc as in sfmmu_mlist_enter() 10773 * because this routine is only called by hat_page_relocate() and all 10774 * targ and repl pages are already locked EXCL so szc can't change. 
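 *
 * The low/high ordering described in the block comment above is the
 * usual address-ordered acquisition of two hashed locks. A minimal
 * sketch of that idiom (illustration only; lock_two() is a
 * hypothetical name, not part of this file):
 *
 *	static void
 *	lock_two(kmutex_t *a, kmutex_t *b, kmutex_t **low, kmutex_t **high)
 *	{
 *		if (a == b) {
 *			*low = a;
 *			*high = NULL;
 *		} else if (a < b) {
 *			*low = a;
 *			*high = b;
 *		} else {
 *			*low = b;
 *			*high = a;
 *		}
 *		mutex_enter(*low);
 *		if (*high != NULL)
 *			mutex_enter(*high);
 *	}
 *
 * which is the shape of the code that follows.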
10775 */ 10776 10777 mml_targ = MLIST_HASH(PP_PAGEROOT(targ)); 10778 mml_repl = MLIST_HASH(PP_PAGEROOT(repl)); 10779 10780 if (mml_targ == mml_repl) { 10781 *low = mml_targ; 10782 *high = NULL; 10783 } else { 10784 if (mml_targ < mml_repl) { 10785 *low = mml_targ; 10786 *high = mml_repl; 10787 } else { 10788 *low = mml_repl; 10789 *high = mml_targ; 10790 } 10791 } 10792 10793 mutex_enter(*low); 10794 if (*high) 10795 mutex_enter(*high); 10796 } 10797 10798 static void 10799 sfmmu_mlist_reloc_exit(kmutex_t *low, kmutex_t *high) 10800 { 10801 if (high) 10802 mutex_exit(high); 10803 mutex_exit(low); 10804 } 10805 10806 static hatlock_t * 10807 sfmmu_hat_enter(sfmmu_t *sfmmup) 10808 { 10809 hatlock_t *hatlockp; 10810 10811 if (sfmmup != ksfmmup) { 10812 hatlockp = TSB_HASH(sfmmup); 10813 mutex_enter(HATLOCK_MUTEXP(hatlockp)); 10814 return (hatlockp); 10815 } 10816 return (NULL); 10817 } 10818 10819 static hatlock_t * 10820 sfmmu_hat_tryenter(sfmmu_t *sfmmup) 10821 { 10822 hatlock_t *hatlockp; 10823 10824 if (sfmmup != ksfmmup) { 10825 hatlockp = TSB_HASH(sfmmup); 10826 if (mutex_tryenter(HATLOCK_MUTEXP(hatlockp)) == 0) 10827 return (NULL); 10828 return (hatlockp); 10829 } 10830 return (NULL); 10831 } 10832 10833 static void 10834 sfmmu_hat_exit(hatlock_t *hatlockp) 10835 { 10836 if (hatlockp != NULL) 10837 mutex_exit(HATLOCK_MUTEXP(hatlockp)); 10838 } 10839 10840 static void 10841 sfmmu_hat_lock_all(void) 10842 { 10843 int i; 10844 for (i = 0; i < SFMMU_NUM_LOCK; i++) 10845 mutex_enter(HATLOCK_MUTEXP(&hat_lock[i])); 10846 } 10847 10848 static void 10849 sfmmu_hat_unlock_all(void) 10850 { 10851 int i; 10852 for (i = SFMMU_NUM_LOCK - 1; i >= 0; i--) 10853 mutex_exit(HATLOCK_MUTEXP(&hat_lock[i])); 10854 } 10855 10856 int 10857 sfmmu_hat_lock_held(sfmmu_t *sfmmup) 10858 { 10859 ASSERT(sfmmup != ksfmmup); 10860 return (MUTEX_HELD(HATLOCK_MUTEXP(TSB_HASH(sfmmup)))); 10861 } 10862 10863 /* 10864 * Locking primitives to provide consistency between ISM unmap 10865 * and other operations. Since ISM unmap can take a long time, we 10866 * use HAT_ISMBUSY flag (protected by the hatlock) to avoid creating 10867 * contention on the hatlock buckets while ISM segments are being 10868 * unmapped. The tradeoff is that the flags don't prevent priority 10869 * inversion from occurring, so we must request kernel priority in 10870 * case we have to sleep to keep from getting buried while holding 10871 * the HAT_ISMBUSY flag set, which in turn could block other kernel 10872 * threads from running (for example, in sfmmu_uvatopfn()). 
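 *
 * In outline, the pair of routines below is the usual busy-flag plus
 * condition-variable protocol (sketch only; the real code also deals
 * with kernel priority and with a caller that may already hold the
 * hat lock):
 *
 *	enter:	while (SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY))
 *			cv_wait(&sfmmup->sfmmu_tsb_cv,
 *			    HATLOCK_MUTEXP(hatlockp));
 *		SFMMU_FLAGS_SET(sfmmup, HAT_ISMBUSY);
 *
 *	exit:	SFMMU_FLAGS_CLEAR(sfmmup, HAT_ISMBUSY);
 *		cv_broadcast(&sfmmup->sfmmu_tsb_cv);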
10873 */ 10874 static void 10875 sfmmu_ismhat_enter(sfmmu_t *sfmmup, int hatlock_held) 10876 { 10877 hatlock_t *hatlockp; 10878 10879 THREAD_KPRI_REQUEST(); 10880 if (!hatlock_held) 10881 hatlockp = sfmmu_hat_enter(sfmmup); 10882 while (SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)) 10883 cv_wait(&sfmmup->sfmmu_tsb_cv, HATLOCK_MUTEXP(hatlockp)); 10884 SFMMU_FLAGS_SET(sfmmup, HAT_ISMBUSY); 10885 if (!hatlock_held) 10886 sfmmu_hat_exit(hatlockp); 10887 } 10888 10889 static void 10890 sfmmu_ismhat_exit(sfmmu_t *sfmmup, int hatlock_held) 10891 { 10892 hatlock_t *hatlockp; 10893 10894 if (!hatlock_held) 10895 hatlockp = sfmmu_hat_enter(sfmmup); 10896 ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)); 10897 SFMMU_FLAGS_CLEAR(sfmmup, HAT_ISMBUSY); 10898 cv_broadcast(&sfmmup->sfmmu_tsb_cv); 10899 if (!hatlock_held) 10900 sfmmu_hat_exit(hatlockp); 10901 THREAD_KPRI_RELEASE(); 10902 } 10903 10904 /* 10905 * 10906 * Algorithm: 10907 * 10908 * (1) if segkmem is not ready, allocate hblk from an array of pre-alloc'ed 10909 * hblks. 10910 * 10911 * (2) if we are allocating an hblk for mapping a slab in sfmmu_cache, 10912 * 10913 * (a) try to return an hblk from reserve pool of free hblks; 10914 * (b) if the reserve pool is empty, acquire hblk_reserve_lock 10915 * and return hblk_reserve. 10916 * 10917 * (3) call kmem_cache_alloc() to allocate hblk; 10918 * 10919 * (a) if hblk_reserve_lock is held by the current thread, 10920 * atomically replace hblk_reserve by the hblk that is 10921 * returned by kmem_cache_alloc; release hblk_reserve_lock 10922 * and call kmem_cache_alloc() again. 10923 * (b) if reserve pool is not full, add the hblk that is 10924 * returned by kmem_cache_alloc to reserve pool and 10925 * call kmem_cache_alloc again. 10926 * 10927 */ 10928 static struct hme_blk * 10929 sfmmu_hblk_alloc(sfmmu_t *sfmmup, caddr_t vaddr, 10930 struct hmehash_bucket *hmebp, uint_t size, hmeblk_tag hblktag, 10931 uint_t flags, uint_t rid) 10932 { 10933 struct hme_blk *hmeblkp = NULL; 10934 struct hme_blk *newhblkp; 10935 struct hme_blk *shw_hblkp = NULL; 10936 struct kmem_cache *sfmmu_cache = NULL; 10937 uint64_t hblkpa; 10938 ulong_t index; 10939 uint_t owner; /* set to 1 if using hblk_reserve */ 10940 uint_t forcefree; 10941 int sleep; 10942 sf_srd_t *srdp; 10943 sf_region_t *rgnp; 10944 10945 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 10946 ASSERT(hblktag.htag_rid == rid); 10947 SFMMU_VALIDATE_HMERID(sfmmup, rid, vaddr, TTEBYTES(size)); 10948 ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || 10949 IS_P2ALIGNED(vaddr, TTEBYTES(size))); 10950 10951 /* 10952 * If segkmem is not created yet, allocate from static hmeblks 10953 * created at the end of startup_modules(). See the block comment 10954 * in startup_modules() describing how we estimate the number of 10955 * static hmeblks that will be needed during re-map. 10956 */ 10957 if (!hblk_alloc_dynamic) { 10958 10959 ASSERT(!SFMMU_IS_SHMERID_VALID(rid)); 10960 10961 if (size == TTE8K) { 10962 index = nucleus_hblk8.index; 10963 if (index >= nucleus_hblk8.len) { 10964 /* 10965 * If we panic here, see startup_modules() to 10966 * make sure that we are calculating the 10967 * number of hblk8's that we need correctly. 10968 */ 10969 prom_panic("no nucleus hblk8 to allocate"); 10970 } 10971 hmeblkp = 10972 (struct hme_blk *)&nucleus_hblk8.list[index]; 10973 nucleus_hblk8.index++; 10974 SFMMU_STAT(sf_hblk8_nalloc); 10975 } else { 10976 index = nucleus_hblk1.index; 10977 if (nucleus_hblk1.index >= nucleus_hblk1.len) { 10978 /* 10979 * If we panic here, see startup_modules(). 
10980 * Most likely you need to update the 10981 * calculation of the number of hblk1 elements 10982 * that the kernel needs to boot. 10983 */ 10984 prom_panic("no nucleus hblk1 to allocate"); 10985 } 10986 hmeblkp = 10987 (struct hme_blk *)&nucleus_hblk1.list[index]; 10988 nucleus_hblk1.index++; 10989 SFMMU_STAT(sf_hblk1_nalloc); 10990 } 10991 10992 goto hblk_init; 10993 } 10994 10995 SFMMU_HASH_UNLOCK(hmebp); 10996 10997 if (sfmmup != KHATID && !SFMMU_IS_SHMERID_VALID(rid)) { 10998 if (mmu_page_sizes == max_mmu_page_sizes) { 10999 if (size < TTE256M) 11000 shw_hblkp = sfmmu_shadow_hcreate(sfmmup, vaddr, 11001 size, flags); 11002 } else { 11003 if (size < TTE4M) 11004 shw_hblkp = sfmmu_shadow_hcreate(sfmmup, vaddr, 11005 size, flags); 11006 } 11007 } else if (SFMMU_IS_SHMERID_VALID(rid)) { 11008 /* 11009 * Shared hmes use per region bitmaps in rgn_hmeflag 11010 * rather than shadow hmeblks to keep track of the 11011 * mapping sizes which have been allocated for the region. 11012 * Here we cleanup old invalid hmeblks with this rid, 11013 * which may be left around by pageunload(). 11014 */ 11015 int ttesz; 11016 caddr_t va; 11017 caddr_t eva = vaddr + TTEBYTES(size); 11018 11019 ASSERT(sfmmup != KHATID); 11020 11021 srdp = sfmmup->sfmmu_srdp; 11022 ASSERT(srdp != NULL && srdp->srd_refcnt != 0); 11023 rgnp = srdp->srd_hmergnp[rid]; 11024 ASSERT(rgnp != NULL && rgnp->rgn_id == rid); 11025 ASSERT(rgnp->rgn_refcnt != 0); 11026 ASSERT(size <= rgnp->rgn_pgszc); 11027 11028 ttesz = HBLK_MIN_TTESZ; 11029 do { 11030 if (!(rgnp->rgn_hmeflags & (0x1 << ttesz))) { 11031 continue; 11032 } 11033 11034 if (ttesz > size && ttesz != HBLK_MIN_TTESZ) { 11035 sfmmu_cleanup_rhblk(srdp, vaddr, rid, ttesz); 11036 } else if (ttesz < size) { 11037 for (va = vaddr; va < eva; 11038 va += TTEBYTES(ttesz)) { 11039 sfmmu_cleanup_rhblk(srdp, va, rid, 11040 ttesz); 11041 } 11042 } 11043 } while (++ttesz <= rgnp->rgn_pgszc); 11044 } 11045 11046 fill_hblk: 11047 owner = (hblk_reserve_thread == curthread) ? 1 : 0; 11048 11049 if (owner && size == TTE8K) { 11050 11051 ASSERT(!SFMMU_IS_SHMERID_VALID(rid)); 11052 /* 11053 * We are really in a tight spot. We already own 11054 * hblk_reserve and we need another hblk. In anticipation 11055 * of this kind of scenario, we specifically set aside 11056 * HBLK_RESERVE_MIN number of hblks to be used exclusively 11057 * by owner of hblk_reserve. 11058 */ 11059 SFMMU_STAT(sf_hblk_recurse_cnt); 11060 11061 if (!sfmmu_get_free_hblk(&hmeblkp, 1)) 11062 panic("sfmmu_hblk_alloc: reserve list is empty"); 11063 11064 goto hblk_verify; 11065 } 11066 11067 ASSERT(!owner); 11068 11069 if ((flags & HAT_NO_KALLOC) == 0) { 11070 11071 sfmmu_cache = ((size == TTE8K) ? sfmmu8_cache : sfmmu1_cache); 11072 sleep = ((sfmmup == KHATID) ? KM_NOSLEEP : KM_SLEEP); 11073 11074 if ((hmeblkp = kmem_cache_alloc(sfmmu_cache, sleep)) == NULL) { 11075 hmeblkp = sfmmu_hblk_steal(size); 11076 } else { 11077 /* 11078 * if we are the owner of hblk_reserve, 11079 * swap hblk_reserve with hmeblkp and 11080 * start a fresh life. Hope things go 11081 * better this time. 
11082 */ 11083 if (hblk_reserve_thread == curthread) { 11084 ASSERT(sfmmu_cache == sfmmu8_cache); 11085 sfmmu_hblk_swap(hmeblkp); 11086 hblk_reserve_thread = NULL; 11087 mutex_exit(&hblk_reserve_lock); 11088 goto fill_hblk; 11089 } 11090 /* 11091 * let's donate this hblk to our reserve list if 11092 * we are not mapping kernel range 11093 */ 11094 if (size == TTE8K && sfmmup != KHATID) { 11095 if (sfmmu_put_free_hblk(hmeblkp, 0)) 11096 goto fill_hblk; 11097 } 11098 } 11099 } else { 11100 /* 11101 * We are here to map the slab in sfmmu8_cache; let's 11102 * check if we could tap our reserve list; if successful, 11103 * this will avoid the pain of going thru sfmmu_hblk_swap 11104 */ 11105 SFMMU_STAT(sf_hblk_slab_cnt); 11106 if (!sfmmu_get_free_hblk(&hmeblkp, 0)) { 11107 /* 11108 * let's start hblk_reserve dance 11109 */ 11110 SFMMU_STAT(sf_hblk_reserve_cnt); 11111 owner = 1; 11112 mutex_enter(&hblk_reserve_lock); 11113 hmeblkp = HBLK_RESERVE; 11114 hblk_reserve_thread = curthread; 11115 } 11116 } 11117 11118 hblk_verify: 11119 ASSERT(hmeblkp != NULL); 11120 set_hblk_sz(hmeblkp, size); 11121 ASSERT(hmeblkp->hblk_nextpa == va_to_pa((caddr_t)hmeblkp)); 11122 SFMMU_HASH_LOCK(hmebp); 11123 HME_HASH_FAST_SEARCH(hmebp, hblktag, newhblkp); 11124 if (newhblkp != NULL) { 11125 SFMMU_HASH_UNLOCK(hmebp); 11126 if (hmeblkp != HBLK_RESERVE) { 11127 /* 11128 * This is really tricky! 11129 * 11130 * vmem_alloc(vmem_seg_arena) 11131 * vmem_alloc(vmem_internal_arena) 11132 * segkmem_alloc(heap_arena) 11133 * vmem_alloc(heap_arena) 11134 * page_create() 11135 * hat_memload() 11136 * kmem_cache_free() 11137 * kmem_cache_alloc() 11138 * kmem_slab_create() 11139 * vmem_alloc(kmem_internal_arena) 11140 * segkmem_alloc(heap_arena) 11141 * vmem_alloc(heap_arena) 11142 * page_create() 11143 * hat_memload() 11144 * kmem_cache_free() 11145 * ... 11146 * 11147 * Thus, hat_memload() could call kmem_cache_free 11148 * for enough number of times that we could easily 11149 * hit the bottom of the stack or run out of reserve 11150 * list of vmem_seg structs. So, we must donate 11151 * this hblk to reserve list if it's allocated 11152 * from sfmmu8_cache *and* mapping kernel range. 11153 * We don't need to worry about freeing hmeblk1's 11154 * to kmem since they don't map any kmem slabs. 11155 * 11156 * Note: When segkmem supports largepages, we must 11157 * free hmeblk1's to reserve list as well. 11158 */ 11159 forcefree = (sfmmup == KHATID) ? 1 : 0; 11160 if (size == TTE8K && 11161 sfmmu_put_free_hblk(hmeblkp, forcefree)) { 11162 goto re_verify; 11163 } 11164 ASSERT(sfmmup != KHATID); 11165 kmem_cache_free(get_hblk_cache(hmeblkp), hmeblkp); 11166 } else { 11167 /* 11168 * Hey! we don't need hblk_reserve any more. 11169 */ 11170 ASSERT(owner); 11171 hblk_reserve_thread = NULL; 11172 mutex_exit(&hblk_reserve_lock); 11173 owner = 0; 11174 } 11175 re_verify: 11176 /* 11177 * let's check if the goodies are still present 11178 */ 11179 SFMMU_HASH_LOCK(hmebp); 11180 HME_HASH_FAST_SEARCH(hmebp, hblktag, newhblkp); 11181 if (newhblkp != NULL) { 11182 /* 11183 * return newhblkp if it's not hblk_reserve; 11184 * if newhblkp is hblk_reserve, return it 11185 * _only if_ we are the owner of hblk_reserve. 
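 *
 * In boolean form (an illustrative restatement, not additional
 * logic):
 *
 *	can_use = (newhblkp != HBLK_RESERVE) || owner;
 *
 * otherwise another thread is still in the middle of swapping
 * hblk_reserve out of the hash, so we wait for it by bouncing
 * through hblk_reserve_lock and then redo the allocation dance.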
11186 */ 11187 if (newhblkp != HBLK_RESERVE || owner) { 11188 ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || 11189 newhblkp->hblk_shared); 11190 ASSERT(SFMMU_IS_SHMERID_VALID(rid) || 11191 !newhblkp->hblk_shared); 11192 return (newhblkp); 11193 } else { 11194 /* 11195 * we just hit hblk_reserve in the hash and 11196 * we are not the owner of that; 11197 * 11198 * block until hblk_reserve_thread completes 11199 * swapping hblk_reserve and try the dance 11200 * once again. 11201 */ 11202 SFMMU_HASH_UNLOCK(hmebp); 11203 mutex_enter(&hblk_reserve_lock); 11204 mutex_exit(&hblk_reserve_lock); 11205 SFMMU_STAT(sf_hblk_reserve_hit); 11206 goto fill_hblk; 11207 } 11208 } else { 11209 /* 11210 * it's no more! try the dance once again. 11211 */ 11212 SFMMU_HASH_UNLOCK(hmebp); 11213 goto fill_hblk; 11214 } 11215 } 11216 11217 hblk_init: 11218 if (SFMMU_IS_SHMERID_VALID(rid)) { 11219 uint16_t tteflag = 0x1 << 11220 ((size < HBLK_MIN_TTESZ) ? HBLK_MIN_TTESZ : size); 11221 11222 if (!(rgnp->rgn_hmeflags & tteflag)) { 11223 atomic_or_16(&rgnp->rgn_hmeflags, tteflag); 11224 } 11225 hmeblkp->hblk_shared = 1; 11226 } else { 11227 hmeblkp->hblk_shared = 0; 11228 } 11229 set_hblk_sz(hmeblkp, size); 11230 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 11231 hmeblkp->hblk_next = (struct hme_blk *)NULL; 11232 hmeblkp->hblk_tag = hblktag; 11233 hmeblkp->hblk_shadow = shw_hblkp; 11234 hblkpa = hmeblkp->hblk_nextpa; 11235 hmeblkp->hblk_nextpa = HMEBLK_ENDPA; 11236 11237 ASSERT(get_hblk_ttesz(hmeblkp) == size); 11238 ASSERT(get_hblk_span(hmeblkp) == HMEBLK_SPAN(size)); 11239 ASSERT(hmeblkp->hblk_hmecnt == 0); 11240 ASSERT(hmeblkp->hblk_vcnt == 0); 11241 ASSERT(hmeblkp->hblk_lckcnt == 0); 11242 ASSERT(hblkpa == va_to_pa((caddr_t)hmeblkp)); 11243 sfmmu_hblk_hash_add(hmebp, hmeblkp, hblkpa); 11244 return (hmeblkp); 11245 } 11246 11247 /* 11248 * This function cleans up the hme_blk and returns it to the free list. 11249 */ 11250 /* ARGSUSED */ 11251 static void 11252 sfmmu_hblk_free(struct hme_blk **listp) 11253 { 11254 struct hme_blk *hmeblkp, *next_hmeblkp; 11255 int size; 11256 uint_t critical; 11257 uint64_t hblkpa; 11258 11259 ASSERT(*listp != NULL); 11260 11261 hmeblkp = *listp; 11262 while (hmeblkp != NULL) { 11263 next_hmeblkp = hmeblkp->hblk_next; 11264 ASSERT(!hmeblkp->hblk_hmecnt); 11265 ASSERT(!hmeblkp->hblk_vcnt); 11266 ASSERT(!hmeblkp->hblk_lckcnt); 11267 ASSERT(hmeblkp != (struct hme_blk *)hblk_reserve); 11268 ASSERT(hmeblkp->hblk_shared == 0); 11269 ASSERT(hmeblkp->hblk_shw_bit == 0); 11270 ASSERT(hmeblkp->hblk_shadow == NULL); 11271 11272 hblkpa = va_to_pa((caddr_t)hmeblkp); 11273 ASSERT(hblkpa != (uint64_t)-1); 11274 critical = (hblktosfmmu(hmeblkp) == KHATID) ? 1 : 0; 11275 11276 size = get_hblk_ttesz(hmeblkp); 11277 hmeblkp->hblk_next = NULL; 11278 hmeblkp->hblk_nextpa = hblkpa; 11279 11280 if (hmeblkp->hblk_nuc_bit == 0) { 11281 11282 if (size != TTE8K || 11283 !sfmmu_put_free_hblk(hmeblkp, critical)) 11284 kmem_cache_free(get_hblk_cache(hmeblkp), 11285 hmeblkp); 11286 } 11287 hmeblkp = next_hmeblkp; 11288 } 11289 } 11290 11291 #define BUCKETS_TO_SEARCH_BEFORE_UNLOAD 30 11292 #define SFMMU_HBLK_STEAL_THRESHOLD 5 11293 11294 static uint_t sfmmu_hblk_steal_twice; 11295 static uint_t sfmmu_hblk_steal_count, sfmmu_hblk_steal_unload_count; 11296 11297 /* 11298 * Steal a hmeblk from user or kernel hme hash lists. 11299 * For 8K tte grab one from reserve pool (freehblkp) before proceeding to 11300 * steal and if we fail to steal after SFMMU_HBLK_STEAL_THRESHOLD attempts 11301 * tap into critical reserve of freehblkp. 
11302 * Note: We remain looping in this routine until we find one. 11303 */ 11304 static struct hme_blk * 11305 sfmmu_hblk_steal(int size) 11306 { 11307 static struct hmehash_bucket *uhmehash_steal_hand = NULL; 11308 struct hmehash_bucket *hmebp; 11309 struct hme_blk *hmeblkp = NULL, *pr_hblk; 11310 uint64_t hblkpa; 11311 int i; 11312 uint_t loop_cnt = 0, critical; 11313 11314 for (;;) { 11315 /* Check cpu hblk pending queues */ 11316 if ((hmeblkp = sfmmu_check_pending_hblks(size)) != NULL) { 11317 hmeblkp->hblk_nextpa = va_to_pa((caddr_t)hmeblkp); 11318 ASSERT(hmeblkp->hblk_hmecnt == 0); 11319 ASSERT(hmeblkp->hblk_vcnt == 0); 11320 return (hmeblkp); 11321 } 11322 11323 if (size == TTE8K) { 11324 critical = 11325 (++loop_cnt > SFMMU_HBLK_STEAL_THRESHOLD) ? 1 : 0; 11326 if (sfmmu_get_free_hblk(&hmeblkp, critical)) 11327 return (hmeblkp); 11328 } 11329 11330 hmebp = (uhmehash_steal_hand == NULL) ? uhme_hash : 11331 uhmehash_steal_hand; 11332 ASSERT(hmebp >= uhme_hash && hmebp <= &uhme_hash[UHMEHASH_SZ]); 11333 11334 for (i = 0; hmeblkp == NULL && i <= UHMEHASH_SZ + 11335 BUCKETS_TO_SEARCH_BEFORE_UNLOAD; i++) { 11336 SFMMU_HASH_LOCK(hmebp); 11337 hmeblkp = hmebp->hmeblkp; 11338 hblkpa = hmebp->hmeh_nextpa; 11339 pr_hblk = NULL; 11340 while (hmeblkp) { 11341 /* 11342 * check if it is a hmeblk that is not locked 11343 * and not shared. skip shadow hmeblks with 11344 * shadow_mask set i.e valid count non zero. 11345 */ 11346 if ((get_hblk_ttesz(hmeblkp) == size) && 11347 (hmeblkp->hblk_shw_bit == 0 || 11348 hmeblkp->hblk_vcnt == 0) && 11349 (hmeblkp->hblk_lckcnt == 0)) { 11350 /* 11351 * there is a high probability that we 11352 * will find a free one. search some 11353 * buckets for a free hmeblk initially 11354 * before unloading a valid hmeblk. 11355 */ 11356 if ((hmeblkp->hblk_vcnt == 0 && 11357 hmeblkp->hblk_hmecnt == 0) || (i >= 11358 BUCKETS_TO_SEARCH_BEFORE_UNLOAD)) { 11359 if (sfmmu_steal_this_hblk(hmebp, 11360 hmeblkp, hblkpa, pr_hblk)) { 11361 /* 11362 * Hblk is unloaded 11363 * successfully 11364 */ 11365 break; 11366 } 11367 } 11368 } 11369 pr_hblk = hmeblkp; 11370 hblkpa = hmeblkp->hblk_nextpa; 11371 hmeblkp = hmeblkp->hblk_next; 11372 } 11373 11374 SFMMU_HASH_UNLOCK(hmebp); 11375 if (hmebp++ == &uhme_hash[UHMEHASH_SZ]) 11376 hmebp = uhme_hash; 11377 } 11378 uhmehash_steal_hand = hmebp; 11379 11380 if (hmeblkp != NULL) 11381 break; 11382 11383 /* 11384 * in the worst case, look for a free one in the kernel 11385 * hash table. 11386 */ 11387 for (i = 0, hmebp = khme_hash; i <= KHMEHASH_SZ; i++) { 11388 SFMMU_HASH_LOCK(hmebp); 11389 hmeblkp = hmebp->hmeblkp; 11390 hblkpa = hmebp->hmeh_nextpa; 11391 pr_hblk = NULL; 11392 while (hmeblkp) { 11393 /* 11394 * check if it is free hmeblk 11395 */ 11396 if ((get_hblk_ttesz(hmeblkp) == size) && 11397 (hmeblkp->hblk_lckcnt == 0) && 11398 (hmeblkp->hblk_vcnt == 0) && 11399 (hmeblkp->hblk_hmecnt == 0)) { 11400 if (sfmmu_steal_this_hblk(hmebp, 11401 hmeblkp, hblkpa, pr_hblk)) { 11402 break; 11403 } else { 11404 /* 11405 * Cannot fail since we have 11406 * hash lock. 
11407 */ 11408 panic("fail to steal?"); 11409 } 11410 } 11411 11412 pr_hblk = hmeblkp; 11413 hblkpa = hmeblkp->hblk_nextpa; 11414 hmeblkp = hmeblkp->hblk_next; 11415 } 11416 11417 SFMMU_HASH_UNLOCK(hmebp); 11418 if (hmebp++ == &khme_hash[KHMEHASH_SZ]) 11419 hmebp = khme_hash; 11420 } 11421 11422 if (hmeblkp != NULL) 11423 break; 11424 sfmmu_hblk_steal_twice++; 11425 } 11426 return (hmeblkp); 11427 } 11428 11429 /* 11430 * This routine does real work to prepare a hblk to be "stolen" by 11431 * unloading the mappings, updating shadow counts .... 11432 * It returns 1 if the block is ready to be reused (stolen), or 0 11433 * means the block cannot be stolen yet- pageunload is still working 11434 * on this hblk. 11435 */ 11436 static int 11437 sfmmu_steal_this_hblk(struct hmehash_bucket *hmebp, struct hme_blk *hmeblkp, 11438 uint64_t hblkpa, struct hme_blk *pr_hblk) 11439 { 11440 int shw_size, vshift; 11441 struct hme_blk *shw_hblkp; 11442 caddr_t vaddr; 11443 uint_t shw_mask, newshw_mask; 11444 struct hme_blk *list = NULL; 11445 11446 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 11447 11448 /* 11449 * check if the hmeblk is free, unload if necessary 11450 */ 11451 if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) { 11452 sfmmu_t *sfmmup; 11453 demap_range_t dmr; 11454 11455 sfmmup = hblktosfmmu(hmeblkp); 11456 if (hmeblkp->hblk_shared || sfmmup->sfmmu_ismhat) { 11457 return (0); 11458 } 11459 DEMAP_RANGE_INIT(sfmmup, &dmr); 11460 (void) sfmmu_hblk_unload(sfmmup, hmeblkp, 11461 (caddr_t)get_hblk_base(hmeblkp), 11462 get_hblk_endaddr(hmeblkp), &dmr, HAT_UNLOAD); 11463 DEMAP_RANGE_FLUSH(&dmr); 11464 if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) { 11465 /* 11466 * Pageunload is working on the same hblk. 11467 */ 11468 return (0); 11469 } 11470 11471 sfmmu_hblk_steal_unload_count++; 11472 } 11473 11474 ASSERT(hmeblkp->hblk_lckcnt == 0); 11475 ASSERT(hmeblkp->hblk_vcnt == 0 && hmeblkp->hblk_hmecnt == 0); 11476 11477 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, &list, 1); 11478 hmeblkp->hblk_nextpa = hblkpa; 11479 11480 shw_hblkp = hmeblkp->hblk_shadow; 11481 if (shw_hblkp) { 11482 ASSERT(!hmeblkp->hblk_shared); 11483 shw_size = get_hblk_ttesz(shw_hblkp); 11484 vaddr = (caddr_t)get_hblk_base(hmeblkp); 11485 vshift = vaddr_to_vshift(shw_hblkp->hblk_tag, vaddr, shw_size); 11486 ASSERT(vshift < 8); 11487 /* 11488 * Atomically clear shadow mask bit 11489 */ 11490 do { 11491 shw_mask = shw_hblkp->hblk_shw_mask; 11492 ASSERT(shw_mask & (1 << vshift)); 11493 newshw_mask = shw_mask & ~(1 << vshift); 11494 newshw_mask = atomic_cas_32(&shw_hblkp->hblk_shw_mask, 11495 shw_mask, newshw_mask); 11496 } while (newshw_mask != shw_mask); 11497 hmeblkp->hblk_shadow = NULL; 11498 } 11499 11500 /* 11501 * remove shadow bit if we are stealing an unused shadow hmeblk. 11502 * sfmmu_hblk_alloc needs it that way, will set shadow bit later if 11503 * we are indeed allocating a shadow hmeblk. 
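 *
 * (The shadow-mask update above is the standard compare-and-swap
 * retry loop; stripped of this file's context, the idiom is simply
 * the following, with maskp and bit as placeholder names:
 *
 *	uint32_t old, new;
 *	do {
 *		old = *maskp;
 *		new = old & ~(1U << bit);
 *	} while (atomic_cas_32(maskp, old, new) != old);
 *
 * i.e. retry until no other thread changed the word between the load
 * and the cas.)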
11504 */ 11505 hmeblkp->hblk_shw_bit = 0; 11506 11507 if (hmeblkp->hblk_shared) { 11508 sf_srd_t *srdp; 11509 sf_region_t *rgnp; 11510 uint_t rid; 11511 11512 srdp = hblktosrd(hmeblkp); 11513 ASSERT(srdp != NULL && srdp->srd_refcnt != 0); 11514 rid = hmeblkp->hblk_tag.htag_rid; 11515 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 11516 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 11517 rgnp = srdp->srd_hmergnp[rid]; 11518 ASSERT(rgnp != NULL); 11519 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid); 11520 hmeblkp->hblk_shared = 0; 11521 } 11522 11523 sfmmu_hblk_steal_count++; 11524 SFMMU_STAT(sf_steal_count); 11525 11526 return (1); 11527 } 11528 11529 struct hme_blk * 11530 sfmmu_hmetohblk(struct sf_hment *sfhme) 11531 { 11532 struct hme_blk *hmeblkp; 11533 struct sf_hment *sfhme0; 11534 struct hme_blk *hblk_dummy = 0; 11535 11536 /* 11537 * No dummy sf_hments, please. 11538 */ 11539 ASSERT(sfhme->hme_tte.ll != 0); 11540 11541 sfhme0 = sfhme - sfhme->hme_tte.tte_hmenum; 11542 hmeblkp = (struct hme_blk *)((uintptr_t)sfhme0 - 11543 (uintptr_t)&hblk_dummy->hblk_hme[0]); 11544 11545 return (hmeblkp); 11546 } 11547 11548 /* 11549 * On swapin, get appropriately sized TSB(s) and clear the HAT_SWAPPED flag. 11550 * If we can't get appropriately sized TSB(s), try for 8K TSB(s) using 11551 * KM_SLEEP allocation. 11552 * 11553 * This routine does not return a value. 11554 */ 11555 static void 11556 sfmmu_tsb_swapin(sfmmu_t *sfmmup, hatlock_t *hatlockp) 11557 { 11558 struct tsb_info *tsbinfop, *next; 11559 tsb_replace_rc_t rc; 11560 boolean_t gotfirst = B_FALSE; 11561 11562 ASSERT(sfmmup != ksfmmup); 11563 ASSERT(sfmmu_hat_lock_held(sfmmup)); 11564 11565 while (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPIN)) { 11566 cv_wait(&sfmmup->sfmmu_tsb_cv, HATLOCK_MUTEXP(hatlockp)); 11567 } 11568 11569 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) { 11570 SFMMU_FLAGS_SET(sfmmup, HAT_SWAPIN); 11571 } else { 11572 return; 11573 } 11574 11575 ASSERT(sfmmup->sfmmu_tsb != NULL); 11576 11577 /* 11578 * Loop over all tsbinfo's replacing them with ones that actually have 11579 * a TSB. If any of the replacements ever fail, bail out of the loop. 11580 */ 11581 for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL; tsbinfop = next) { 11582 ASSERT(tsbinfop->tsb_flags & TSB_SWAPPED); 11583 next = tsbinfop->tsb_next; 11584 rc = sfmmu_replace_tsb(sfmmup, tsbinfop, tsbinfop->tsb_szc, 11585 hatlockp, TSB_SWAPIN); 11586 if (rc != TSB_SUCCESS) { 11587 break; 11588 } 11589 gotfirst = B_TRUE; 11590 } 11591 11592 switch (rc) { 11593 case TSB_SUCCESS: 11594 SFMMU_FLAGS_CLEAR(sfmmup, HAT_SWAPPED|HAT_SWAPIN); 11595 cv_broadcast(&sfmmup->sfmmu_tsb_cv); 11596 return; 11597 case TSB_LOSTRACE: 11598 break; 11599 case TSB_ALLOCFAIL: 11600 break; 11601 default: 11602 panic("sfmmu_replace_tsb returned unrecognized failure code " 11603 "%d", rc); 11604 } 11605 11606 /* 11607 * In this case, we failed to get one of our TSBs. If we failed to 11608 * get the first TSB, get one of minimum size (8KB). Walk the list 11609 * and throw away the tsbinfos, starting where the allocation failed; 11610 * we can get by with just one TSB as long as we don't leave the 11611 * SWAPPED tsbinfo structures lying around.
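 *
 * The cleanup below is the common detach-then-free pattern: the tail
 * of the tsbinfo list is unhooked while the HAT lock is held, and the
 * detached entries are freed only after the lock has been dropped so
 * that sfmmu_tsbinfo_free() never runs under the hat lock. A generic
 * sketch of the idiom (illustration only, placeholder names):
 *
 *	mutex_enter(lock);
 *	tail = head->next;
 *	head->next = NULL;
 *	mutex_exit(lock);
 *	while (tail != NULL) {
 *		next = tail->next;
 *		free_one(tail);
 *		tail = next;
 *	}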
11612 */ 11613 tsbinfop = sfmmup->sfmmu_tsb; 11614 next = tsbinfop->tsb_next; 11615 tsbinfop->tsb_next = NULL; 11616 11617 sfmmu_hat_exit(hatlockp); 11618 for (tsbinfop = next; tsbinfop != NULL; tsbinfop = next) { 11619 next = tsbinfop->tsb_next; 11620 sfmmu_tsbinfo_free(tsbinfop); 11621 } 11622 hatlockp = sfmmu_hat_enter(sfmmup); 11623 11624 /* 11625 * If we don't have any TSBs, get a single 8K TSB for 8K, 64K and 512K 11626 * pages. 11627 */ 11628 if (!gotfirst) { 11629 tsbinfop = sfmmup->sfmmu_tsb; 11630 rc = sfmmu_replace_tsb(sfmmup, tsbinfop, TSB_MIN_SZCODE, 11631 hatlockp, TSB_SWAPIN | TSB_FORCEALLOC); 11632 ASSERT(rc == TSB_SUCCESS); 11633 } 11634 11635 SFMMU_FLAGS_CLEAR(sfmmup, HAT_SWAPPED|HAT_SWAPIN); 11636 cv_broadcast(&sfmmup->sfmmu_tsb_cv); 11637 } 11638 11639 static int 11640 sfmmu_is_rgnva(sf_srd_t *srdp, caddr_t addr, ulong_t w, ulong_t bmw) 11641 { 11642 ulong_t bix = 0; 11643 uint_t rid; 11644 sf_region_t *rgnp; 11645 11646 ASSERT(srdp != NULL); 11647 ASSERT(srdp->srd_refcnt != 0); 11648 11649 w <<= BT_ULSHIFT; 11650 while (bmw) { 11651 if (!(bmw & 0x1)) { 11652 bix++; 11653 bmw >>= 1; 11654 continue; 11655 } 11656 rid = w | bix; 11657 rgnp = srdp->srd_hmergnp[rid]; 11658 ASSERT(rgnp->rgn_refcnt > 0); 11659 ASSERT(rgnp->rgn_id == rid); 11660 if (addr < rgnp->rgn_saddr || 11661 addr >= (rgnp->rgn_saddr + rgnp->rgn_size)) { 11662 bix++; 11663 bmw >>= 1; 11664 } else { 11665 return (1); 11666 } 11667 } 11668 return (0); 11669 } 11670 11671 /* 11672 * Handle exceptions for low level tsb_handler. 11673 * 11674 * There are many scenarios that could land us here: 11675 * 11676 * If the context is invalid we land here. The context can be invalid 11677 * for 3 reasons: 1) we couldn't allocate a new context and now need to 11678 * perform a wrap around operation in order to allocate a new context. 11679 * 2) Context was invalidated to change pagesize programming 3) ISMs or 11680 * TSBs configuration is changing for this process and we are forced into 11681 * here to do a synchronization operation. If the context is valid we can 11682 * be here from the window trap handler. In this case just call trap to handle 11683 * the fault. 11684 * 11685 * Note that the process will run in INVALID_CONTEXT before 11686 * faulting into here and subsequently loading the MMU registers 11687 * (including the TSB base register) associated with this process. 11688 * For this reason, the trap handlers must all test for 11689 * INVALID_CONTEXT before attempting to access any registers other 11690 * than the context registers. 11691 */ 11692 void 11693 sfmmu_tsbmiss_exception(struct regs *rp, uintptr_t tagaccess, uint_t traptype) 11694 { 11695 sfmmu_t *sfmmup, *shsfmmup; 11696 uint_t ctxtype; 11697 klwp_id_t lwp; 11698 char lwp_save_state; 11699 hatlock_t *hatlockp, *shatlockp; 11700 struct tsb_info *tsbinfop; 11701 struct tsbmiss *tsbmp; 11702 sf_scd_t *scdp; 11703 11704 SFMMU_STAT(sf_tsb_exceptions); 11705 SFMMU_MMU_STAT(mmu_tsb_exceptions); 11706 sfmmup = astosfmmu(curthread->t_procp->p_as); 11707 /* 11708 * note that in sun4u, the tagaccess register contains ctxnum 11709 * while sun4v passes ctxtype in the tagaccess register.
11710 */ 11711 ctxtype = tagaccess & TAGACC_CTX_MASK; 11712 11713 ASSERT(sfmmup != ksfmmup && ctxtype != KCONTEXT); 11714 ASSERT(sfmmup->sfmmu_ismhat == 0); 11715 ASSERT(!SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED) || 11716 ctxtype == INVALID_CONTEXT); 11717 11718 if (ctxtype != INVALID_CONTEXT && traptype != T_DATA_PROT) { 11719 /* 11720 * We may land here because shme bitmap and pagesize 11721 * flags are updated lazily in tsbmiss area on other cpus. 11722 * If we detect here that tsbmiss area is out of sync with 11723 * sfmmu update it and retry the trapped instruction. 11724 * Otherwise call trap(). 11725 */ 11726 int ret = 0; 11727 uchar_t tteflag_mask = (1 << TTE64K) | (1 << TTE8K); 11728 caddr_t addr = (caddr_t)(tagaccess & TAGACC_VADDR_MASK); 11729 11730 /* 11731 * Must set lwp state to LWP_SYS before 11732 * trying to acquire any adaptive lock 11733 */ 11734 lwp = ttolwp(curthread); 11735 ASSERT(lwp); 11736 lwp_save_state = lwp->lwp_state; 11737 lwp->lwp_state = LWP_SYS; 11738 11739 hatlockp = sfmmu_hat_enter(sfmmup); 11740 kpreempt_disable(); 11741 tsbmp = &tsbmiss_area[CPU->cpu_id]; 11742 ASSERT(sfmmup == tsbmp->usfmmup); 11743 if (((tsbmp->uhat_tteflags ^ sfmmup->sfmmu_tteflags) & 11744 ~tteflag_mask) || 11745 ((tsbmp->uhat_rtteflags ^ sfmmup->sfmmu_rtteflags) & 11746 ~tteflag_mask)) { 11747 tsbmp->uhat_tteflags = sfmmup->sfmmu_tteflags; 11748 tsbmp->uhat_rtteflags = sfmmup->sfmmu_rtteflags; 11749 ret = 1; 11750 } 11751 if (sfmmup->sfmmu_srdp != NULL) { 11752 ulong_t *sm = sfmmup->sfmmu_hmeregion_map.bitmap; 11753 ulong_t *tm = tsbmp->shmermap; 11754 ulong_t i; 11755 for (i = 0; i < SFMMU_HMERGNMAP_WORDS; i++) { 11756 ulong_t d = tm[i] ^ sm[i]; 11757 if (d) { 11758 if (d & sm[i]) { 11759 if (!ret && sfmmu_is_rgnva( 11760 sfmmup->sfmmu_srdp, 11761 addr, i, d & sm[i])) { 11762 ret = 1; 11763 } 11764 } 11765 tm[i] = sm[i]; 11766 } 11767 } 11768 } 11769 kpreempt_enable(); 11770 sfmmu_hat_exit(hatlockp); 11771 lwp->lwp_state = lwp_save_state; 11772 if (ret) { 11773 return; 11774 } 11775 } else if (ctxtype == INVALID_CONTEXT) { 11776 /* 11777 * First, make sure we come out of here with a valid ctx, 11778 * since if we don't get one we'll simply loop on the 11779 * faulting instruction. 11780 * 11781 * If the ISM mappings are changing, the TSB is relocated, 11782 * the process is swapped, the process is joining SCD or 11783 * leaving SCD or shared regions we serialize behind the 11784 * controlling thread with hat lock, sfmmu_flags and 11785 * sfmmu_tsb_cv condition variable. 11786 */ 11787 11788 /* 11789 * Must set lwp state to LWP_SYS before 11790 * trying to acquire any adaptive lock 11791 */ 11792 lwp = ttolwp(curthread); 11793 ASSERT(lwp); 11794 lwp_save_state = lwp->lwp_state; 11795 lwp->lwp_state = LWP_SYS; 11796 11797 hatlockp = sfmmu_hat_enter(sfmmup); 11798 retry: 11799 if ((scdp = sfmmup->sfmmu_scdp) != NULL) { 11800 shsfmmup = scdp->scd_sfmmup; 11801 ASSERT(shsfmmup != NULL); 11802 11803 for (tsbinfop = shsfmmup->sfmmu_tsb; tsbinfop != NULL; 11804 tsbinfop = tsbinfop->tsb_next) { 11805 if (tsbinfop->tsb_flags & TSB_RELOC_FLAG) { 11806 /* drop the private hat lock */ 11807 sfmmu_hat_exit(hatlockp); 11808 /* acquire the shared hat lock */ 11809 shatlockp = sfmmu_hat_enter(shsfmmup); 11810 /* 11811 * recheck to see if anything changed 11812 * after we drop the private hat lock. 
11813 */ 11814 if (sfmmup->sfmmu_scdp == scdp && 11815 shsfmmup == scdp->scd_sfmmup) { 11816 sfmmu_tsb_chk_reloc(shsfmmup, 11817 shatlockp); 11818 } 11819 sfmmu_hat_exit(shatlockp); 11820 hatlockp = sfmmu_hat_enter(sfmmup); 11821 goto retry; 11822 } 11823 } 11824 } 11825 11826 for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL; 11827 tsbinfop = tsbinfop->tsb_next) { 11828 if (tsbinfop->tsb_flags & TSB_RELOC_FLAG) { 11829 cv_wait(&sfmmup->sfmmu_tsb_cv, 11830 HATLOCK_MUTEXP(hatlockp)); 11831 goto retry; 11832 } 11833 } 11834 11835 /* 11836 * Wait for ISM maps to be updated. 11837 */ 11838 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)) { 11839 cv_wait(&sfmmup->sfmmu_tsb_cv, 11840 HATLOCK_MUTEXP(hatlockp)); 11841 goto retry; 11842 } 11843 11844 /* Is this process joining an SCD? */ 11845 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD)) { 11846 /* 11847 * Flush private TSB and setup shared TSB. 11848 * sfmmu_finish_join_scd() does not drop the 11849 * hat lock. 11850 */ 11851 sfmmu_finish_join_scd(sfmmup); 11852 SFMMU_FLAGS_CLEAR(sfmmup, HAT_JOIN_SCD); 11853 } 11854 11855 /* 11856 * If we're swapping in, get TSB(s). Note that we must do 11857 * this before we get a ctx or load the MMU state. Once 11858 * we swap in we have to recheck to make sure the TSB(s) and 11859 * ISM mappings didn't change while we slept. 11860 */ 11861 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) { 11862 sfmmu_tsb_swapin(sfmmup, hatlockp); 11863 goto retry; 11864 } 11865 11866 sfmmu_get_ctx(sfmmup); 11867 11868 sfmmu_hat_exit(hatlockp); 11869 /* 11870 * Must restore lwp_state if not calling 11871 * trap() for further processing. Restore 11872 * it anyway. 11873 */ 11874 lwp->lwp_state = lwp_save_state; 11875 return; 11876 } 11877 trap(rp, (caddr_t)tagaccess, traptype, 0); 11878 } 11879 11880 static void 11881 sfmmu_tsb_chk_reloc(sfmmu_t *sfmmup, hatlock_t *hatlockp) 11882 { 11883 struct tsb_info *tp; 11884 11885 ASSERT(sfmmu_hat_lock_held(sfmmup)); 11886 11887 for (tp = sfmmup->sfmmu_tsb; tp != NULL; tp = tp->tsb_next) { 11888 if (tp->tsb_flags & TSB_RELOC_FLAG) { 11889 cv_wait(&sfmmup->sfmmu_tsb_cv, 11890 HATLOCK_MUTEXP(hatlockp)); 11891 break; 11892 } 11893 } 11894 } 11895 11896 /* 11897 * sfmmu_vatopfn_suspended is called from GET_TTE when TL=0 and the 11898 * TTE_SUSPENDED bit is set in the tte. We block on acquiring a page lock 11899 * rather than spinning, to avoid send mondo timeouts with 11900 * interrupts enabled. When the lock is acquired it is immediately 11901 * released and we return back to sfmmu_vatopfn just after 11902 * the GET_TTE call. 11903 */ 11904 void 11905 sfmmu_vatopfn_suspended(caddr_t vaddr, sfmmu_t *sfmmu, tte_t *ttep) 11906 { 11907 struct page **pp; 11908 11909 (void) as_pagelock(sfmmu->sfmmu_as, &pp, vaddr, TTE_CSZ(ttep), S_WRITE); 11910 as_pageunlock(sfmmu->sfmmu_as, pp, vaddr, TTE_CSZ(ttep), S_WRITE); 11911 } 11912 11913 /* 11914 * sfmmu_tsbmiss_suspended is called from GET_TTE when TL>0 and the 11915 * TTE_SUSPENDED bit is set in the tte. We do this so that we can handle 11916 * cross traps which cannot be handled while spinning in the 11917 * trap handlers. Simply enter and exit the kpr_suspendlock spin 11918 * mutex, which is held by the holder of the suspend bit, and then 11919 * retry the trapped instruction after unwinding.
11920 */ 11921 /*ARGSUSED*/ 11922 void 11923 sfmmu_tsbmiss_suspended(struct regs *rp, uintptr_t tagacc, uint_t traptype) 11924 { 11925 ASSERT(curthread != kreloc_thread); 11926 mutex_enter(&kpr_suspendlock); 11927 mutex_exit(&kpr_suspendlock); 11928 } 11929 11930 /* 11931 * This routine could be optimized to reduce the number of xcalls by flushing 11932 * the entire TLBs if the region reference count is above some threshold, but the 11933 * tradeoff will depend on the size of the TLB. So for now flush the specific 11934 * page a context at a time. 11935 * 11936 * If uselocks is 0 then it's called after all cpus were captured and all the 11937 * hat locks were taken. In this case don't take the region lock by relying on 11938 * the order of list region update operations in hat_join_region(), 11939 * hat_leave_region() and hat_dup_region(). The ordering in those routines 11940 * guarantees that the list is always forward walkable and reaches active sfmmus 11941 * regardless of where xc_attention() captures a cpu. 11942 */ 11943 cpuset_t 11944 sfmmu_rgntlb_demap(caddr_t addr, sf_region_t *rgnp, 11945 struct hme_blk *hmeblkp, int uselocks) 11946 { 11947 sfmmu_t *sfmmup; 11948 cpuset_t cpuset; 11949 cpuset_t rcpuset; 11950 hatlock_t *hatlockp; 11951 uint_t rid = rgnp->rgn_id; 11952 sf_rgn_link_t *rlink; 11953 sf_scd_t *scdp; 11954 11955 ASSERT(hmeblkp->hblk_shared); 11956 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 11957 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 11958 11959 CPUSET_ZERO(rcpuset); 11960 if (uselocks) { 11961 mutex_enter(&rgnp->rgn_mutex); 11962 } 11963 sfmmup = rgnp->rgn_sfmmu_head; 11964 while (sfmmup != NULL) { 11965 if (uselocks) { 11966 hatlockp = sfmmu_hat_enter(sfmmup); 11967 } 11968 11969 /* 11970 * When an SCD is created the SCD hat is linked on the sfmmu 11971 * region lists for each hme region which is part of the 11972 * SCD. If we find an SCD hat, when walking these lists, 11973 * then we flush the shared TSBs; if we find a private hat, 11974 * which is part of an SCD, but where the region 11975 * is not part of the SCD, then we flush the private TSBs. 11976 */ 11977 if (!sfmmup->sfmmu_scdhat && sfmmup->sfmmu_scdp != NULL && 11978 !SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD)) { 11979 scdp = sfmmup->sfmmu_scdp; 11980 if (SF_RGNMAP_TEST(scdp->scd_hmeregion_map, rid)) { 11981 if (uselocks) { 11982 sfmmu_hat_exit(hatlockp); 11983 } 11984 goto next; 11985 } 11986 } 11987 11988 SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, 0); 11989 11990 kpreempt_disable(); 11991 cpuset = sfmmup->sfmmu_cpusran; 11992 CPUSET_AND(cpuset, cpu_ready_set); 11993 CPUSET_DEL(cpuset, CPU->cpu_id); 11994 SFMMU_XCALL_STATS(sfmmup); 11995 xt_some(cpuset, vtag_flushpage_tl1, 11996 (uint64_t)addr, (uint64_t)sfmmup); 11997 vtag_flushpage(addr, (uint64_t)sfmmup); 11998 if (uselocks) { 11999 sfmmu_hat_exit(hatlockp); 12000 } 12001 kpreempt_enable(); 12002 CPUSET_OR(rcpuset, cpuset); 12003 12004 next: 12005 /* LINTED: constant in conditional context */ 12006 SFMMU_HMERID2RLINKP(sfmmup, rid, rlink, 0, 0); 12007 ASSERT(rlink != NULL); 12008 sfmmup = rlink->next; 12009 } 12010 if (uselocks) { 12011 mutex_exit(&rgnp->rgn_mutex); 12012 } 12013 return (rcpuset); 12014 } 12015 12016 /* 12017 * This routine takes an sfmmu pointer and the va for an address in an 12018 * ISM region as input and returns the corresponding region id in ism_rid. 12019 * The return value of 1 indicates that a region has been found and ism_rid 12020 * is valid, otherwise 0 is returned.
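 * A minimal usage sketch (hypothetical caller shown purely for illustration; the real caller below is sfmmu_ismtlbcache_demap()): declare uint_t rid; then if (find_ism_rid(sfmmup, ism_sfmmup, va, &rid)) succeeds, rid names the ISM region of sfmmup that covers va and can be checked with SFMMU_IS_ISMRID_VALID(rid). The routine asserts that the hat lock for sfmmup is held.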
12021 */ 12022 static int 12023 find_ism_rid(sfmmu_t *sfmmup, sfmmu_t *ism_sfmmup, caddr_t va, uint_t *ism_rid) 12024 { 12025 ism_blk_t *ism_blkp; 12026 int i; 12027 ism_map_t *ism_map; 12028 #ifdef DEBUG 12029 struct hat *ism_hatid; 12030 #endif 12031 ASSERT(sfmmu_hat_lock_held(sfmmup)); 12032 12033 ism_blkp = sfmmup->sfmmu_iblk; 12034 while (ism_blkp != NULL) { 12035 ism_map = ism_blkp->iblk_maps; 12036 for (i = 0; i < ISM_MAP_SLOTS && ism_map[i].imap_ismhat; i++) { 12037 if ((va >= ism_start(ism_map[i])) && 12038 (va < ism_end(ism_map[i]))) { 12039 12040 *ism_rid = ism_map[i].imap_rid; 12041 #ifdef DEBUG 12042 ism_hatid = ism_map[i].imap_ismhat; 12043 ASSERT(ism_hatid == ism_sfmmup); 12044 ASSERT(ism_hatid->sfmmu_ismhat); 12045 #endif 12046 return (1); 12047 } 12048 } 12049 ism_blkp = ism_blkp->iblk_next; 12050 } 12051 return (0); 12052 } 12053 12054 /* 12055 * Special routine to flush out ism mappings - TSBs, TLBs and D-caches. 12056 * This routine may be called with all cpu's captured. Therefore, the 12057 * caller is responsible for holding all locks and disabling kernel 12058 * preemption. 12059 */ 12060 /* ARGSUSED */ 12061 static void 12062 sfmmu_ismtlbcache_demap(caddr_t addr, sfmmu_t *ism_sfmmup, 12063 struct hme_blk *hmeblkp, pfn_t pfnum, int cache_flush_flag) 12064 { 12065 cpuset_t cpuset; 12066 caddr_t va; 12067 ism_ment_t *ment; 12068 sfmmu_t *sfmmup; 12069 #ifdef VAC 12070 int vcolor; 12071 #endif 12072 12073 sf_scd_t *scdp; 12074 uint_t ism_rid; 12075 12076 ASSERT(!hmeblkp->hblk_shared); 12077 /* 12078 * Walk the ism_hat's mapping list and flush the page 12079 * from every hat sharing this ism_hat. This routine 12080 * may be called while all cpu's have been captured. 12081 * Therefore we can't attempt to grab any locks. For now 12082 * this means we will protect the ism mapping list under 12083 * a single lock which will be grabbed by the caller. 12084 * If hat_share/unshare scalability becomes a performance 12085 * problem then we may need to re-think ism mapping list locking. 12086 */ 12087 ASSERT(ism_sfmmup->sfmmu_ismhat); 12088 ASSERT(MUTEX_HELD(&ism_mlist_lock)); 12089 addr = addr - ISMID_STARTADDR; 12090 12091 for (ment = ism_sfmmup->sfmmu_iment; ment; ment = ment->iment_next) { 12092 12093 sfmmup = ment->iment_hat; 12094 12095 va = ment->iment_base_va; 12096 va = (caddr_t)((uintptr_t)va + (uintptr_t)addr); 12097 12098 /* 12099 * When an SCD is created the SCD hat is linked on the ism 12100 * mapping lists for each ISM segment which is part of the 12101 * SCD. If we find an SCD hat, when walking these lists, 12102 * then we flush the shared TSBs; if we find a private hat, 12103 * which is part of an SCD, but where the region 12104 * corresponding to this va is not part of the SCD, then we 12105 * flush the private TSBs.
12106 */ 12107 if (!sfmmup->sfmmu_scdhat && sfmmup->sfmmu_scdp != NULL && 12108 !SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD) && 12109 !SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)) { 12110 if (!find_ism_rid(sfmmup, ism_sfmmup, va, 12111 &ism_rid)) { 12112 cmn_err(CE_PANIC, 12113 "can't find matching ISM rid!"); 12114 } 12115 12116 scdp = sfmmup->sfmmu_scdp; 12117 if (SFMMU_IS_ISMRID_VALID(ism_rid) && 12118 SF_RGNMAP_TEST(scdp->scd_ismregion_map, 12119 ism_rid)) { 12120 continue; 12121 } 12122 } 12123 SFMMU_UNLOAD_TSB(va, sfmmup, hmeblkp, 1); 12124 12125 cpuset = sfmmup->sfmmu_cpusran; 12126 CPUSET_AND(cpuset, cpu_ready_set); 12127 CPUSET_DEL(cpuset, CPU->cpu_id); 12128 SFMMU_XCALL_STATS(sfmmup); 12129 xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)va, 12130 (uint64_t)sfmmup); 12131 vtag_flushpage(va, (uint64_t)sfmmup); 12132 12133 #ifdef VAC 12134 /* 12135 * Flush D$ 12136 * When flushing D$ we must flush all 12137 * cpu's. See sfmmu_cache_flush(). 12138 */ 12139 if (cache_flush_flag == CACHE_FLUSH) { 12140 cpuset = cpu_ready_set; 12141 CPUSET_DEL(cpuset, CPU->cpu_id); 12142 12143 SFMMU_XCALL_STATS(sfmmup); 12144 vcolor = addr_to_vcolor(va); 12145 xt_some(cpuset, vac_flushpage_tl1, pfnum, vcolor); 12146 vac_flushpage(pfnum, vcolor); 12147 } 12148 #endif /* VAC */ 12149 } 12150 } 12151 12152 /* 12153 * Demaps the TSB, CPU caches, and flushes all TLBs on all CPUs of 12154 * a particular virtual address and ctx. If noflush is set we do not 12155 * flush the TLB/TSB. This function may or may not be called with the 12156 * HAT lock held. 12157 */ 12158 static void 12159 sfmmu_tlbcache_demap(caddr_t addr, sfmmu_t *sfmmup, struct hme_blk *hmeblkp, 12160 pfn_t pfnum, int tlb_noflush, int cpu_flag, int cache_flush_flag, 12161 int hat_lock_held) 12162 { 12163 #ifdef VAC 12164 int vcolor; 12165 #endif 12166 cpuset_t cpuset; 12167 hatlock_t *hatlockp; 12168 12169 ASSERT(!hmeblkp->hblk_shared); 12170 12171 #if defined(lint) && !defined(VAC) 12172 pfnum = pfnum; 12173 cpu_flag = cpu_flag; 12174 cache_flush_flag = cache_flush_flag; 12175 #endif 12176 12177 /* 12178 * There is no longer a need to protect against ctx being 12179 * stolen here since we don't store the ctx in the TSB anymore. 12180 */ 12181 #ifdef VAC 12182 vcolor = addr_to_vcolor(addr); 12183 #endif 12184 12185 /* 12186 * We must hold the hat lock during the flush of TLB, 12187 * to avoid a race with sfmmu_invalidate_ctx(), where 12188 * sfmmu_cnum on a MMU could be set to INVALID_CONTEXT, 12189 * causing TLB demap routine to skip flush on that MMU. 12190 * If the context on a MMU has already been set to 12191 * INVALID_CONTEXT, we just get an extra flush on 12192 * that MMU. 12193 */ 12194 if (!hat_lock_held && !tlb_noflush) 12195 hatlockp = sfmmu_hat_enter(sfmmup); 12196 12197 kpreempt_disable(); 12198 if (!tlb_noflush) { 12199 /* 12200 * Flush the TSB and TLB. 12201 */ 12202 SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, 0); 12203 12204 cpuset = sfmmup->sfmmu_cpusran; 12205 CPUSET_AND(cpuset, cpu_ready_set); 12206 CPUSET_DEL(cpuset, CPU->cpu_id); 12207 12208 SFMMU_XCALL_STATS(sfmmup); 12209 12210 xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)addr, 12211 (uint64_t)sfmmup); 12212 12213 vtag_flushpage(addr, (uint64_t)sfmmup); 12214 } 12215 12216 if (!hat_lock_held && !tlb_noflush) 12217 sfmmu_hat_exit(hatlockp); 12218 12219 #ifdef VAC 12220 /* 12221 * Flush the D$ 12222 * 12223 * Even if the ctx is stolen, we need to flush the 12224 * cache. Our ctx stealer only flushes the TLBs. 
12225 */ 12226 if (cache_flush_flag == CACHE_FLUSH) { 12227 if (cpu_flag & FLUSH_ALL_CPUS) { 12228 cpuset = cpu_ready_set; 12229 } else { 12230 cpuset = sfmmup->sfmmu_cpusran; 12231 CPUSET_AND(cpuset, cpu_ready_set); 12232 } 12233 CPUSET_DEL(cpuset, CPU->cpu_id); 12234 SFMMU_XCALL_STATS(sfmmup); 12235 xt_some(cpuset, vac_flushpage_tl1, pfnum, vcolor); 12236 vac_flushpage(pfnum, vcolor); 12237 } 12238 #endif /* VAC */ 12239 kpreempt_enable(); 12240 } 12241 12242 /* 12243 * Demaps the TSB and flushes all TLBs on all cpus for a particular virtual 12244 * address and ctx. If noflush is set we do not currently do anything. 12245 * This function may or may not be called with the HAT lock held. 12246 */ 12247 static void 12248 sfmmu_tlb_demap(caddr_t addr, sfmmu_t *sfmmup, struct hme_blk *hmeblkp, 12249 int tlb_noflush, int hat_lock_held) 12250 { 12251 cpuset_t cpuset; 12252 hatlock_t *hatlockp; 12253 12254 ASSERT(!hmeblkp->hblk_shared); 12255 12256 /* 12257 * If the process is exiting we have nothing to do. 12258 */ 12259 if (tlb_noflush) 12260 return; 12261 12262 /* 12263 * Flush TSB. 12264 */ 12265 if (!hat_lock_held) 12266 hatlockp = sfmmu_hat_enter(sfmmup); 12267 SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, 0); 12268 12269 kpreempt_disable(); 12270 12271 cpuset = sfmmup->sfmmu_cpusran; 12272 CPUSET_AND(cpuset, cpu_ready_set); 12273 CPUSET_DEL(cpuset, CPU->cpu_id); 12274 12275 SFMMU_XCALL_STATS(sfmmup); 12276 xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)addr, (uint64_t)sfmmup); 12277 12278 vtag_flushpage(addr, (uint64_t)sfmmup); 12279 12280 if (!hat_lock_held) 12281 sfmmu_hat_exit(hatlockp); 12282 12283 kpreempt_enable(); 12284 12285 } 12286 12287 /* 12288 * Special case of sfmmu_tlb_demap for MMU_PAGESIZE hblks. Use the xcall 12289 * call handler that can flush a range of pages to save on xcalls. 12290 */ 12291 static int sfmmu_xcall_save; 12292 12293 /* 12294 * This routine is never used for demapping addresses backed by SRD hmeblks. 12295 */ 12296 static void 12297 sfmmu_tlb_range_demap(demap_range_t *dmrp) 12298 { 12299 sfmmu_t *sfmmup = dmrp->dmr_sfmmup; 12300 hatlock_t *hatlockp; 12301 cpuset_t cpuset; 12302 uint64_t sfmmu_pgcnt; 12303 pgcnt_t pgcnt = 0; 12304 int pgunload = 0; 12305 int dirtypg = 0; 12306 caddr_t addr = dmrp->dmr_addr; 12307 caddr_t eaddr; 12308 uint64_t bitvec = dmrp->dmr_bitvec; 12309 12310 ASSERT(bitvec & 1); 12311 12312 /* 12313 * Flush TSB and calculate number of pages to flush. 12314 */ 12315 while (bitvec != 0) { 12316 dirtypg = 0; 12317 /* 12318 * Find the first page to flush and then count how many 12319 * pages there are after it that also need to be flushed. 12320 * This way the number of TSB flushes is minimized.
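 * Illustrative example (the address and bitmap value are assumed, not taken from any caller): with dmr_addr = A and dmr_bitvec = 0b1011, i.e. pages 0, 1 and 3 dirty, the loops below issue sfmmu_unload_tsb_range(sfmmup, A, A + 2 * MMU_PAGESIZE, TTE8K) for the first run of dirty pages and sfmmu_unload_tsb_range(sfmmup, A + 3 * MMU_PAGESIZE, A + 4 * MMU_PAGESIZE, TTE8K) for the second, instead of three single-page unloads.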
12321 */ 12322 while ((bitvec & 1) == 0) { 12323 pgcnt++; 12324 addr += MMU_PAGESIZE; 12325 bitvec >>= 1; 12326 } 12327 while (bitvec & 1) { 12328 dirtypg++; 12329 bitvec >>= 1; 12330 } 12331 eaddr = addr + ptob(dirtypg); 12332 hatlockp = sfmmu_hat_enter(sfmmup); 12333 sfmmu_unload_tsb_range(sfmmup, addr, eaddr, TTE8K); 12334 sfmmu_hat_exit(hatlockp); 12335 pgunload += dirtypg; 12336 addr = eaddr; 12337 pgcnt += dirtypg; 12338 } 12339 12340 ASSERT((pgcnt<<MMU_PAGESHIFT) <= dmrp->dmr_endaddr - dmrp->dmr_addr); 12341 if (sfmmup->sfmmu_free == 0) { 12342 addr = dmrp->dmr_addr; 12343 bitvec = dmrp->dmr_bitvec; 12344 12345 /* 12346 * make sure it has SFMMU_PGCNT_SHIFT bits only, 12347 * as it will be used to pack argument for xt_some 12348 */ 12349 ASSERT((pgcnt > 0) && 12350 (pgcnt <= (1 << SFMMU_PGCNT_SHIFT))); 12351 12352 /* 12353 * Encode pgcnt as (pgcnt -1 ), and pass (pgcnt - 1) in 12354 * the low 6 bits of sfmmup. This is doable since pgcnt 12355 * always >= 1. 12356 */ 12357 ASSERT(!((uint64_t)sfmmup & SFMMU_PGCNT_MASK)); 12358 sfmmu_pgcnt = (uint64_t)sfmmup | 12359 ((pgcnt - 1) & SFMMU_PGCNT_MASK); 12360 12361 /* 12362 * We must hold the hat lock during the flush of TLB, 12363 * to avoid a race with sfmmu_invalidate_ctx(), where 12364 * sfmmu_cnum on a MMU could be set to INVALID_CONTEXT, 12365 * causing TLB demap routine to skip flush on that MMU. 12366 * If the context on a MMU has already been set to 12367 * INVALID_CONTEXT, we just get an extra flush on 12368 * that MMU. 12369 */ 12370 hatlockp = sfmmu_hat_enter(sfmmup); 12371 kpreempt_disable(); 12372 12373 cpuset = sfmmup->sfmmu_cpusran; 12374 CPUSET_AND(cpuset, cpu_ready_set); 12375 CPUSET_DEL(cpuset, CPU->cpu_id); 12376 12377 SFMMU_XCALL_STATS(sfmmup); 12378 xt_some(cpuset, vtag_flush_pgcnt_tl1, (uint64_t)addr, 12379 sfmmu_pgcnt); 12380 12381 for (; bitvec != 0; bitvec >>= 1) { 12382 if (bitvec & 1) 12383 vtag_flushpage(addr, (uint64_t)sfmmup); 12384 addr += MMU_PAGESIZE; 12385 } 12386 kpreempt_enable(); 12387 sfmmu_hat_exit(hatlockp); 12388 12389 sfmmu_xcall_save += (pgunload-1); 12390 } 12391 dmrp->dmr_bitvec = 0; 12392 } 12393 12394 /* 12395 * In cases where we need to synchronize with TLB/TSB miss trap 12396 * handlers, _and_ need to flush the TLB, it's a lot easier to 12397 * throw away the context from the process than to do a 12398 * special song and dance to keep things consistent for the 12399 * handlers. 12400 * 12401 * Since the process suddenly ends up without a context and our caller 12402 * holds the hat lock, threads that fault after this function is called 12403 * will pile up on the lock. We can then do whatever we need to 12404 * atomically from the context of the caller. The first blocked thread 12405 * to resume executing will get the process a new context, and the 12406 * process will resume executing. 12407 * 12408 * One added advantage of this approach is that on MMUs that 12409 * support a "flush all" operation, we will delay the flush until 12410 * cnum wrap-around, and then flush the TLB one time. This 12411 * is rather rare, so it's a lot less expensive than making 8000 12412 * x-calls to flush the TLB 8000 times. 12413 * 12414 * A per-process (PP) lock is used to synchronize ctx allocations in 12415 * resume() and ctx invalidations here. 
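 * In outline (a descriptive note only, no new behaviour): the code below takes sfmmu_ctx_lock, writes INVALID_CONTEXT into every sfmmu_ctxs[] slot, and drops the lock; the allocation side in resume() takes the same PP lock before installing a new cnum, so neither side can observe a half-updated set of context numbers.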
12416 */ 12417 static void 12418 sfmmu_invalidate_ctx(sfmmu_t *sfmmup) 12419 { 12420 cpuset_t cpuset; 12421 int cnum, currcnum; 12422 mmu_ctx_t *mmu_ctxp; 12423 int i; 12424 uint_t pstate_save; 12425 12426 SFMMU_STAT(sf_ctx_inv); 12427 12428 ASSERT(sfmmu_hat_lock_held(sfmmup)); 12429 ASSERT(sfmmup != ksfmmup); 12430 12431 kpreempt_disable(); 12432 12433 mmu_ctxp = CPU_MMU_CTXP(CPU); 12434 ASSERT(mmu_ctxp); 12435 ASSERT(mmu_ctxp->mmu_idx < max_mmu_ctxdoms); 12436 ASSERT(mmu_ctxp == mmu_ctxs_tbl[mmu_ctxp->mmu_idx]); 12437 12438 currcnum = sfmmup->sfmmu_ctxs[mmu_ctxp->mmu_idx].cnum; 12439 12440 pstate_save = sfmmu_disable_intrs(); 12441 12442 lock_set(&sfmmup->sfmmu_ctx_lock); /* acquire PP lock */ 12443 /* set HAT cnum invalid across all context domains. */ 12444 for (i = 0; i < max_mmu_ctxdoms; i++) { 12445 12446 cnum = sfmmup->sfmmu_ctxs[i].cnum; 12447 if (cnum == INVALID_CONTEXT) { 12448 continue; 12449 } 12450 12451 sfmmup->sfmmu_ctxs[i].cnum = INVALID_CONTEXT; 12452 } 12453 membar_enter(); /* make sure globally visible to all CPUs */ 12454 lock_clear(&sfmmup->sfmmu_ctx_lock); /* release PP lock */ 12455 12456 sfmmu_enable_intrs(pstate_save); 12457 12458 cpuset = sfmmup->sfmmu_cpusran; 12459 CPUSET_DEL(cpuset, CPU->cpu_id); 12460 CPUSET_AND(cpuset, cpu_ready_set); 12461 if (!CPUSET_ISNULL(cpuset)) { 12462 SFMMU_XCALL_STATS(sfmmup); 12463 xt_some(cpuset, sfmmu_raise_tsb_exception, 12464 (uint64_t)sfmmup, INVALID_CONTEXT); 12465 xt_sync(cpuset); 12466 SFMMU_STAT(sf_tsb_raise_exception); 12467 SFMMU_MMU_STAT(mmu_tsb_raise_exception); 12468 } 12469 12470 /* 12471 * If the hat to-be-invalidated is the same as the current 12472 * process on the local CPU we need to invalidate 12473 * this CPU's context as well. 12474 */ 12475 if ((sfmmu_getctx_sec() == currcnum) && 12476 (currcnum != INVALID_CONTEXT)) { 12477 /* sets shared context to INVALID too */ 12478 sfmmu_setctx_sec(INVALID_CONTEXT); 12479 sfmmu_clear_utsbinfo(); 12480 } 12481 12482 SFMMU_FLAGS_SET(sfmmup, HAT_ALLCTX_INVALID); 12483 12484 kpreempt_enable(); 12485 12486 /* 12487 * we hold the hat lock, so nobody should allocate a context 12488 * for us yet 12489 */ 12490 ASSERT(sfmmup->sfmmu_ctxs[mmu_ctxp->mmu_idx].cnum == INVALID_CONTEXT); 12491 } 12492 12493 #ifdef VAC 12494 /* 12495 * We need to flush the cache in all cpus. It is possible that 12496 * a process referenced a page as cacheable but has since exited 12497 * and cleared the mapping list. We still need to flush it but have no 12498 * state, so flushing all cpus is the only alternative. 12499 */ 12500 void 12501 sfmmu_cache_flush(pfn_t pfnum, int vcolor) 12502 { 12503 cpuset_t cpuset; 12504 12505 kpreempt_disable(); 12506 cpuset = cpu_ready_set; 12507 CPUSET_DEL(cpuset, CPU->cpu_id); 12508 SFMMU_XCALL_STATS(NULL); /* account to any ctx */ 12509 xt_some(cpuset, vac_flushpage_tl1, pfnum, vcolor); 12510 xt_sync(cpuset); 12511 vac_flushpage(pfnum, vcolor); 12512 kpreempt_enable(); 12513 } 12514 12515 void 12516 sfmmu_cache_flushcolor(int vcolor, pfn_t pfnum) 12517 { 12518 cpuset_t cpuset; 12519 12520 ASSERT(vcolor >= 0); 12521 12522 kpreempt_disable(); 12523 cpuset = cpu_ready_set; 12524 CPUSET_DEL(cpuset, CPU->cpu_id); 12525 SFMMU_XCALL_STATS(NULL); /* account to any ctx */ 12526 xt_some(cpuset, vac_flushcolor_tl1, vcolor, pfnum); 12527 xt_sync(cpuset); 12528 vac_flushcolor(vcolor, pfnum); 12529 kpreempt_enable(); 12530 } 12531 #endif /* VAC */ 12532 12533 /* 12534 * We need to prevent processes from accessing the TSB using a cached physical 12535 * address.
It's alright if they try to access the TSB via virtual address 12536 * since they will just fault on that virtual address once the mapping has 12537 * been suspended. 12538 */ 12539 #pragma weak sendmondo_in_recover 12540 12541 /* ARGSUSED */ 12542 static int 12543 sfmmu_tsb_pre_relocator(caddr_t va, uint_t tsbsz, uint_t flags, void *tsbinfo) 12544 { 12545 struct tsb_info *tsbinfop = (struct tsb_info *)tsbinfo; 12546 sfmmu_t *sfmmup = tsbinfop->tsb_sfmmu; 12547 hatlock_t *hatlockp; 12548 sf_scd_t *scdp; 12549 12550 if (flags != HAT_PRESUSPEND) 12551 return (0); 12552 12553 /* 12554 * If tsb is a shared TSB with TSB_SHAREDCTX set, sfmmup must 12555 * be a shared hat, then set SCD's tsbinfo's flag. 12556 * If tsb is not shared, sfmmup is a private hat, then set 12557 * its private tsbinfo's flag. 12558 */ 12559 hatlockp = sfmmu_hat_enter(sfmmup); 12560 tsbinfop->tsb_flags |= TSB_RELOC_FLAG; 12561 12562 if (!(tsbinfop->tsb_flags & TSB_SHAREDCTX)) { 12563 sfmmu_tsb_inv_ctx(sfmmup); 12564 sfmmu_hat_exit(hatlockp); 12565 } else { 12566 /* release lock on the shared hat */ 12567 sfmmu_hat_exit(hatlockp); 12568 /* sfmmup is a shared hat */ 12569 ASSERT(sfmmup->sfmmu_scdhat); 12570 scdp = sfmmup->sfmmu_scdp; 12571 ASSERT(scdp != NULL); 12572 /* get private hat from the scd list */ 12573 mutex_enter(&scdp->scd_mutex); 12574 sfmmup = scdp->scd_sf_list; 12575 while (sfmmup != NULL) { 12576 hatlockp = sfmmu_hat_enter(sfmmup); 12577 /* 12578 * We do not call sfmmu_tsb_inv_ctx here because 12579 * sendmondo_in_recover check is only needed for 12580 * sun4u. 12581 */ 12582 sfmmu_invalidate_ctx(sfmmup); 12583 sfmmu_hat_exit(hatlockp); 12584 sfmmup = sfmmup->sfmmu_scd_link.next; 12585 12586 } 12587 mutex_exit(&scdp->scd_mutex); 12588 } 12589 return (0); 12590 } 12591 12592 static void 12593 sfmmu_tsb_inv_ctx(sfmmu_t *sfmmup) 12594 { 12595 extern uint32_t sendmondo_in_recover; 12596 12597 ASSERT(sfmmu_hat_lock_held(sfmmup)); 12598 12599 /* 12600 * For Cheetah+ Erratum 25: 12601 * Wait for any active recovery to finish. We can't risk 12602 * relocating the TSB of the thread running mondo_recover_proc() 12603 * since, if we did that, we would deadlock. The scenario we are 12604 * trying to avoid is as follows: 12605 * 12606 * THIS CPU RECOVER CPU 12607 * -------- ----------- 12608 * Begins recovery, walking through TSB 12609 * hat_pagesuspend() TSB TTE 12610 * TLB miss on TSB TTE, spins at TL1 12611 * xt_sync() 12612 * send_mondo_timeout() 12613 * mondo_recover_proc() 12614 * ((deadlocked)) 12615 * 12616 * The second half of the workaround is that mondo_recover_proc() 12617 * checks to see if the tsb_info has the RELOC flag set, and if it 12618 * does, it skips over that TSB without ever touching tsbinfop->tsb_va 12619 * and hence avoiding the TLB miss that could result in a deadlock. 
12620 */ 12621 if (&sendmondo_in_recover) { 12622 membar_enter(); /* make sure RELOC flag visible */ 12623 while (sendmondo_in_recover) { 12624 drv_usecwait(1); 12625 membar_consumer(); 12626 } 12627 } 12628 12629 sfmmu_invalidate_ctx(sfmmup); 12630 } 12631 12632 /* ARGSUSED */ 12633 static int 12634 sfmmu_tsb_post_relocator(caddr_t va, uint_t tsbsz, uint_t flags, 12635 void *tsbinfo, pfn_t newpfn) 12636 { 12637 hatlock_t *hatlockp; 12638 struct tsb_info *tsbinfop = (struct tsb_info *)tsbinfo; 12639 sfmmu_t *sfmmup = tsbinfop->tsb_sfmmu; 12640 12641 if (flags != HAT_POSTUNSUSPEND) 12642 return (0); 12643 12644 hatlockp = sfmmu_hat_enter(sfmmup); 12645 12646 SFMMU_STAT(sf_tsb_reloc); 12647 12648 /* 12649 * The process may have swapped out while we were relocating one 12650 * of its TSBs. If so, don't bother doing the setup since the 12651 * process can't be using the memory anymore. 12652 */ 12653 if ((tsbinfop->tsb_flags & TSB_SWAPPED) == 0) { 12654 ASSERT(va == tsbinfop->tsb_va); 12655 sfmmu_tsbinfo_setup_phys(tsbinfop, newpfn); 12656 12657 if (tsbinfop->tsb_flags & TSB_FLUSH_NEEDED) { 12658 sfmmu_inv_tsb(tsbinfop->tsb_va, 12659 TSB_BYTES(tsbinfop->tsb_szc)); 12660 tsbinfop->tsb_flags &= ~TSB_FLUSH_NEEDED; 12661 } 12662 } 12663 12664 membar_exit(); 12665 tsbinfop->tsb_flags &= ~TSB_RELOC_FLAG; 12666 cv_broadcast(&sfmmup->sfmmu_tsb_cv); 12667 12668 sfmmu_hat_exit(hatlockp); 12669 12670 return (0); 12671 } 12672 12673 /* 12674 * Allocate and initialize a tsb_info structure. Note that we may or may not 12675 * allocate a TSB here, depending on the flags passed in. 12676 */ 12677 static int 12678 sfmmu_tsbinfo_alloc(struct tsb_info **tsbinfopp, int tsb_szc, int tte_sz_mask, 12679 uint_t flags, sfmmu_t *sfmmup) 12680 { 12681 int err; 12682 12683 *tsbinfopp = (struct tsb_info *)kmem_cache_alloc( 12684 sfmmu_tsbinfo_cache, KM_SLEEP); 12685 12686 if ((err = sfmmu_init_tsbinfo(*tsbinfopp, tte_sz_mask, 12687 tsb_szc, flags, sfmmup)) != 0) { 12688 kmem_cache_free(sfmmu_tsbinfo_cache, *tsbinfopp); 12689 SFMMU_STAT(sf_tsb_allocfail); 12690 *tsbinfopp = NULL; 12691 return (err); 12692 } 12693 SFMMU_STAT(sf_tsb_alloc); 12694 12695 /* 12696 * Bump the TSB size counters for this TSB size. 12697 */ 12698 (*(((int *)&sfmmu_tsbsize_stat) + tsb_szc))++; 12699 return (0); 12700 } 12701 12702 static void 12703 sfmmu_tsb_free(struct tsb_info *tsbinfo) 12704 { 12705 caddr_t tsbva = tsbinfo->tsb_va; 12706 uint_t tsb_size = TSB_BYTES(tsbinfo->tsb_szc); 12707 struct kmem_cache *kmem_cachep = tsbinfo->tsb_cache; 12708 vmem_t *vmp = tsbinfo->tsb_vmp; 12709 12710 /* 12711 * If we allocated this TSB from relocatable kernel memory, then we 12712 * need to uninstall the callback handler. 
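 * (Descriptive note: this undoes the hat_add_callback() registration performed in sfmmu_init_tsbinfo() for TSBs not backed by sfmmu_tsb8k_cache; the same slab_mask computation is used to locate the slab's root page, which is locked around the callback removal.)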
12713 */ 12714 if (tsbinfo->tsb_cache != sfmmu_tsb8k_cache) { 12715 uintptr_t slab_mask; 12716 caddr_t slab_vaddr; 12717 page_t **ppl; 12718 int ret; 12719 12720 ASSERT(tsb_size <= MMU_PAGESIZE4M || use_bigtsb_arena); 12721 if (tsb_size > MMU_PAGESIZE4M) 12722 slab_mask = ~((uintptr_t)bigtsb_slab_mask) << PAGESHIFT; 12723 else 12724 slab_mask = ~((uintptr_t)tsb_slab_mask) << PAGESHIFT; 12725 slab_vaddr = (caddr_t)((uintptr_t)tsbva & slab_mask); 12726 12727 ret = as_pagelock(&kas, &ppl, slab_vaddr, PAGESIZE, S_WRITE); 12728 ASSERT(ret == 0); 12729 hat_delete_callback(tsbva, (uint_t)tsb_size, (void *)tsbinfo, 12730 0, NULL); 12731 as_pageunlock(&kas, ppl, slab_vaddr, PAGESIZE, S_WRITE); 12732 } 12733 12734 if (kmem_cachep != NULL) { 12735 kmem_cache_free(kmem_cachep, tsbva); 12736 } else { 12737 vmem_xfree(vmp, (void *)tsbva, tsb_size); 12738 } 12739 tsbinfo->tsb_va = (caddr_t)0xbad00bad; 12740 atomic_add_64(&tsb_alloc_bytes, -(int64_t)tsb_size); 12741 } 12742 12743 static void 12744 sfmmu_tsbinfo_free(struct tsb_info *tsbinfo) 12745 { 12746 if ((tsbinfo->tsb_flags & TSB_SWAPPED) == 0) { 12747 sfmmu_tsb_free(tsbinfo); 12748 } 12749 kmem_cache_free(sfmmu_tsbinfo_cache, tsbinfo); 12750 12751 } 12752 12753 /* 12754 * Setup all the references to physical memory for this tsbinfo. 12755 * The underlying page(s) must be locked. 12756 */ 12757 static void 12758 sfmmu_tsbinfo_setup_phys(struct tsb_info *tsbinfo, pfn_t pfn) 12759 { 12760 ASSERT(pfn != PFN_INVALID); 12761 ASSERT(pfn == va_to_pfn(tsbinfo->tsb_va)); 12762 12763 #ifndef sun4v 12764 if (tsbinfo->tsb_szc == 0) { 12765 sfmmu_memtte(&tsbinfo->tsb_tte, pfn, 12766 PROT_WRITE|PROT_READ, TTE8K); 12767 } else { 12768 /* 12769 * Round down PA and use a large mapping; the handlers will 12770 * compute the TSB pointer at the correct offset into the 12771 * big virtual page. NOTE: this assumes all TSBs larger 12772 * than 8K must come from physically contiguous slabs of 12773 * size tsb_slab_size. 12774 */ 12775 sfmmu_memtte(&tsbinfo->tsb_tte, pfn & ~tsb_slab_mask, 12776 PROT_WRITE|PROT_READ, tsb_slab_ttesz); 12777 } 12778 tsbinfo->tsb_pa = ptob(pfn); 12779 12780 TTE_SET_LOCKED(&tsbinfo->tsb_tte); /* lock the tte into dtlb */ 12781 TTE_SET_MOD(&tsbinfo->tsb_tte); /* enable writes */ 12782 12783 ASSERT(TTE_IS_PRIVILEGED(&tsbinfo->tsb_tte)); 12784 ASSERT(TTE_IS_LOCKED(&tsbinfo->tsb_tte)); 12785 #else /* sun4v */ 12786 tsbinfo->tsb_pa = ptob(pfn); 12787 #endif /* sun4v */ 12788 } 12789 12790 12791 /* 12792 * Returns zero on success, ENOMEM if over the high water mark, 12793 * or EAGAIN if the caller needs to retry with a smaller TSB 12794 * size (or specify TSB_FORCEALLOC if the allocation can't fail). 12795 * 12796 * This call cannot fail to allocate a TSB if TSB_FORCEALLOC 12797 * is specified and the TSB requested is PAGESIZE, though it 12798 * may sleep waiting for memory if sufficient memory is not 12799 * available. 
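 * A hypothetical retry sketch (illustration only; the real policy lives in the callers of sfmmu_tsbinfo_alloc()): while ((err = sfmmu_init_tsbinfo(tsbinfo, mask, szc, flags, sfmmup)) == EAGAIN && szc > TSB_MIN_SZCODE) szc--; i.e. EAGAIN means try again with the next smaller size code, ENOMEM means the high water mark was hit, and 0 means the tsbinfo is ready.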
12800 */ 12801 static int 12802 sfmmu_init_tsbinfo(struct tsb_info *tsbinfo, int tteszmask, 12803 int tsbcode, uint_t flags, sfmmu_t *sfmmup) 12804 { 12805 caddr_t vaddr = NULL; 12806 caddr_t slab_vaddr; 12807 uintptr_t slab_mask; 12808 int tsbbytes = TSB_BYTES(tsbcode); 12809 int lowmem = 0; 12810 struct kmem_cache *kmem_cachep = NULL; 12811 vmem_t *vmp = NULL; 12812 lgrp_id_t lgrpid = LGRP_NONE; 12813 pfn_t pfn; 12814 uint_t cbflags = HAC_SLEEP; 12815 page_t **pplist; 12816 int ret; 12817 12818 ASSERT(tsbbytes <= MMU_PAGESIZE4M || use_bigtsb_arena); 12819 if (tsbbytes > MMU_PAGESIZE4M) 12820 slab_mask = ~((uintptr_t)bigtsb_slab_mask) << PAGESHIFT; 12821 else 12822 slab_mask = ~((uintptr_t)tsb_slab_mask) << PAGESHIFT; 12823 12824 if (flags & (TSB_FORCEALLOC | TSB_SWAPIN | TSB_GROW | TSB_SHRINK)) 12825 flags |= TSB_ALLOC; 12826 12827 ASSERT((flags & TSB_FORCEALLOC) == 0 || tsbcode == TSB_MIN_SZCODE); 12828 12829 tsbinfo->tsb_sfmmu = sfmmup; 12830 12831 /* 12832 * If not allocating a TSB, set up the tsbinfo, set TSB_SWAPPED, and 12833 * return. 12834 */ 12835 if ((flags & TSB_ALLOC) == 0) { 12836 tsbinfo->tsb_szc = tsbcode; 12837 tsbinfo->tsb_ttesz_mask = tteszmask; 12838 tsbinfo->tsb_va = (caddr_t)0xbadbadbeef; 12839 tsbinfo->tsb_pa = -1; 12840 tsbinfo->tsb_tte.ll = 0; 12841 tsbinfo->tsb_next = NULL; 12842 tsbinfo->tsb_flags = TSB_SWAPPED; 12843 tsbinfo->tsb_cache = NULL; 12844 tsbinfo->tsb_vmp = NULL; 12845 return (0); 12846 } 12847 12848 #ifdef DEBUG 12849 /* 12850 * For debugging: 12851 * Randomly force allocation failures every tsb_alloc_mtbf 12852 * tries if TSB_FORCEALLOC is not specified. This will 12853 * return ENOMEM if tsb_alloc_mtbf is odd, or EAGAIN if 12854 * it is even, to allow testing of both failure paths... 12855 */ 12856 if (tsb_alloc_mtbf && ((flags & TSB_FORCEALLOC) == 0) && 12857 (tsb_alloc_count++ == tsb_alloc_mtbf)) { 12858 tsb_alloc_count = 0; 12859 tsb_alloc_fail_mtbf++; 12860 return ((tsb_alloc_mtbf & 1)? ENOMEM : EAGAIN); 12861 } 12862 #endif /* DEBUG */ 12863 12864 /* 12865 * Enforce high water mark if we are not doing a forced allocation 12866 * and are not shrinking a process' TSB. 12867 */ 12868 if ((flags & TSB_SHRINK) == 0 && 12869 (tsbbytes + tsb_alloc_bytes) > tsb_alloc_hiwater) { 12870 if ((flags & TSB_FORCEALLOC) == 0) 12871 return (ENOMEM); 12872 lowmem = 1; 12873 } 12874 12875 /* 12876 * Allocate from the correct location based upon the size of the TSB 12877 * compared to the base page size, and what memory conditions dictate. 12878 * Note we always do nonblocking allocations from the TSB arena since 12879 * we don't want memory fragmentation to cause processes to block 12880 * indefinitely waiting for memory; until the kernel algorithms that 12881 * coalesce large pages are improved this is our best option. 
12882 * 12883 * Algorithm: 12884 * If allocating a "large" TSB (>8K), allocate from the 12885 * appropriate kmem_tsb_default_arena vmem arena 12886 * else if low on memory or the TSB_FORCEALLOC flag is set or 12887 * tsb_forceheap is set 12888 * Allocate from kernel heap via sfmmu_tsb8k_cache with 12889 * KM_SLEEP (never fails) 12890 * else 12891 * Allocate from appropriate sfmmu_tsb_cache with 12892 * KM_NOSLEEP 12893 * endif 12894 */ 12895 if (tsb_lgrp_affinity) 12896 lgrpid = lgrp_home_id(curthread); 12897 if (lgrpid == LGRP_NONE) 12898 lgrpid = 0; /* use lgrp of boot CPU */ 12899 12900 if (tsbbytes > MMU_PAGESIZE) { 12901 if (tsbbytes > MMU_PAGESIZE4M) { 12902 vmp = kmem_bigtsb_default_arena[lgrpid]; 12903 vaddr = (caddr_t)vmem_xalloc(vmp, tsbbytes, tsbbytes, 12904 0, 0, NULL, NULL, VM_NOSLEEP); 12905 } else { 12906 vmp = kmem_tsb_default_arena[lgrpid]; 12907 vaddr = (caddr_t)vmem_xalloc(vmp, tsbbytes, tsbbytes, 12908 0, 0, NULL, NULL, VM_NOSLEEP); 12909 } 12910 #ifdef DEBUG 12911 } else if (lowmem || (flags & TSB_FORCEALLOC) || tsb_forceheap) { 12912 #else /* !DEBUG */ 12913 } else if (lowmem || (flags & TSB_FORCEALLOC)) { 12914 #endif /* DEBUG */ 12915 kmem_cachep = sfmmu_tsb8k_cache; 12916 vaddr = (caddr_t)kmem_cache_alloc(kmem_cachep, KM_SLEEP); 12917 ASSERT(vaddr != NULL); 12918 } else { 12919 kmem_cachep = sfmmu_tsb_cache[lgrpid]; 12920 vaddr = (caddr_t)kmem_cache_alloc(kmem_cachep, KM_NOSLEEP); 12921 } 12922 12923 tsbinfo->tsb_cache = kmem_cachep; 12924 tsbinfo->tsb_vmp = vmp; 12925 12926 if (vaddr == NULL) { 12927 return (EAGAIN); 12928 } 12929 12930 atomic_add_64(&tsb_alloc_bytes, (int64_t)tsbbytes); 12931 kmem_cachep = tsbinfo->tsb_cache; 12932 12933 /* 12934 * If we are allocating from outside the cage, then we need to 12935 * register a relocation callback handler. Note that for now 12936 * since pseudo mappings always hang off of the slab's root page, 12937 * we need only lock the first 8K of the TSB slab. This is a bit 12938 * hacky but it is good for performance. 12939 */ 12940 if (kmem_cachep != sfmmu_tsb8k_cache) { 12941 slab_vaddr = (caddr_t)((uintptr_t)vaddr & slab_mask); 12942 ret = as_pagelock(&kas, &pplist, slab_vaddr, PAGESIZE, S_WRITE); 12943 ASSERT(ret == 0); 12944 ret = hat_add_callback(sfmmu_tsb_cb_id, vaddr, (uint_t)tsbbytes, 12945 cbflags, (void *)tsbinfo, &pfn, NULL); 12946 12947 /* 12948 * Need to free up resources if we could not successfully 12949 * add the callback function and return an error condition. 12950 */ 12951 if (ret != 0) { 12952 if (kmem_cachep) { 12953 kmem_cache_free(kmem_cachep, vaddr); 12954 } else { 12955 vmem_xfree(vmp, (void *)vaddr, tsbbytes); 12956 } 12957 as_pageunlock(&kas, pplist, slab_vaddr, PAGESIZE, 12958 S_WRITE); 12959 return (EAGAIN); 12960 } 12961 } else { 12962 /* 12963 * Since allocation of 8K TSBs from heap is rare and occurs 12964 * during memory pressure we allocate them from permanent 12965 * memory rather than using callbacks to get the PFN. 
12966 */ 12967 pfn = hat_getpfnum(kas.a_hat, vaddr); 12968 } 12969 12970 tsbinfo->tsb_va = vaddr; 12971 tsbinfo->tsb_szc = tsbcode; 12972 tsbinfo->tsb_ttesz_mask = tteszmask; 12973 tsbinfo->tsb_next = NULL; 12974 tsbinfo->tsb_flags = 0; 12975 12976 sfmmu_tsbinfo_setup_phys(tsbinfo, pfn); 12977 12978 sfmmu_inv_tsb(vaddr, tsbbytes); 12979 12980 if (kmem_cachep != sfmmu_tsb8k_cache) { 12981 as_pageunlock(&kas, pplist, slab_vaddr, PAGESIZE, S_WRITE); 12982 } 12983 12984 return (0); 12985 } 12986 12987 /* 12988 * Initialize per cpu tsb and per cpu tsbmiss_area 12989 */ 12990 void 12991 sfmmu_init_tsbs(void) 12992 { 12993 int i; 12994 struct tsbmiss *tsbmissp; 12995 struct kpmtsbm *kpmtsbmp; 12996 #ifndef sun4v 12997 extern int dcache_line_mask; 12998 #endif /* sun4v */ 12999 extern uint_t vac_colors; 13000 13001 /* 13002 * Init. tsb miss area. 13003 */ 13004 tsbmissp = tsbmiss_area; 13005 13006 for (i = 0; i < NCPU; tsbmissp++, i++) { 13007 /* 13008 * initialize the tsbmiss area. 13009 * Do this for all possible CPUs as some may be added 13010 * while the system is running. There is no cost to this. 13011 */ 13012 tsbmissp->ksfmmup = ksfmmup; 13013 #ifndef sun4v 13014 tsbmissp->dcache_line_mask = (uint16_t)dcache_line_mask; 13015 #endif /* sun4v */ 13016 tsbmissp->khashstart = 13017 (struct hmehash_bucket *)va_to_pa((caddr_t)khme_hash); 13018 tsbmissp->uhashstart = 13019 (struct hmehash_bucket *)va_to_pa((caddr_t)uhme_hash); 13020 tsbmissp->khashsz = khmehash_num; 13021 tsbmissp->uhashsz = uhmehash_num; 13022 } 13023 13024 sfmmu_tsb_cb_id = hat_register_callback('T'<<16 | 'S' << 8 | 'B', 13025 sfmmu_tsb_pre_relocator, sfmmu_tsb_post_relocator, NULL, 0); 13026 13027 if (kpm_enable == 0) 13028 return; 13029 13030 /* -- Begin KPM specific init -- */ 13031 13032 if (kpm_smallpages) { 13033 /* 13034 * If we're using base pagesize pages for seg_kpm 13035 * mappings, we use the kernel TSB since we can't afford 13036 * to allocate a second huge TSB for these mappings. 13037 */ 13038 kpm_tsbbase = ktsb_phys? ktsb_pbase : (uint64_t)ktsb_base; 13039 kpm_tsbsz = ktsb_szcode; 13040 kpmsm_tsbbase = kpm_tsbbase; 13041 kpmsm_tsbsz = kpm_tsbsz; 13042 } else { 13043 /* 13044 * In VAC conflict case, just put the entries in the 13045 * kernel 8K indexed TSB for now so we can find them. 13046 * This could really be changed in the future if we feel 13047 * the need... 13048 */ 13049 kpmsm_tsbbase = ktsb_phys? ktsb_pbase : (uint64_t)ktsb_base; 13050 kpmsm_tsbsz = ktsb_szcode; 13051 kpm_tsbbase = ktsb_phys? ktsb4m_pbase : (uint64_t)ktsb4m_base; 13052 kpm_tsbsz = ktsb4m_szcode; 13053 } 13054 13055 kpmtsbmp = kpmtsbm_area; 13056 for (i = 0; i < NCPU; kpmtsbmp++, i++) { 13057 /* 13058 * Initialize the kpmtsbm area. 13059 * Do this for all possible CPUs as some may be added 13060 * while the system is running. There is no cost to this. 13061 */ 13062 kpmtsbmp->vbase = kpm_vbase; 13063 kpmtsbmp->vend = kpm_vbase + kpm_size * vac_colors; 13064 kpmtsbmp->sz_shift = kpm_size_shift; 13065 kpmtsbmp->kpmp_shift = kpmp_shift; 13066 kpmtsbmp->kpmp2pshft = (uchar_t)kpmp2pshft; 13067 if (kpm_smallpages == 0) { 13068 kpmtsbmp->kpmp_table_sz = kpmp_table_sz; 13069 kpmtsbmp->kpmp_tablepa = va_to_pa(kpmp_table); 13070 } else { 13071 kpmtsbmp->kpmp_table_sz = kpmp_stable_sz; 13072 kpmtsbmp->kpmp_tablepa = va_to_pa(kpmp_stable); 13073 } 13074 kpmtsbmp->msegphashpa = va_to_pa(memseg_phash); 13075 kpmtsbmp->flags = KPMTSBM_ENABLE_FLAG; 13076 #ifdef DEBUG 13077 kpmtsbmp->flags |= (kpm_tsbmtl) ? 
KPMTSBM_TLTSBM_FLAG : 0; 13078 #endif /* DEBUG */ 13079 if (ktsb_phys) 13080 kpmtsbmp->flags |= KPMTSBM_TSBPHYS_FLAG; 13081 } 13082 13083 /* -- End KPM specific init -- */ 13084 } 13085 13086 /* Avoid using sfmmu_tsbinfo_alloc() to avoid kmem_alloc - no real reason */ 13087 struct tsb_info ktsb_info[2]; 13088 13089 /* 13090 * Called from hat_kern_setup() to setup the tsb_info for ksfmmup. 13091 */ 13092 void 13093 sfmmu_init_ktsbinfo() 13094 { 13095 ASSERT(ksfmmup != NULL); 13096 ASSERT(ksfmmup->sfmmu_tsb == NULL); 13097 /* 13098 * Allocate tsbinfos for kernel and copy in data 13099 * to make debug easier and sun4v setup easier. 13100 */ 13101 ktsb_info[0].tsb_sfmmu = ksfmmup; 13102 ktsb_info[0].tsb_szc = ktsb_szcode; 13103 ktsb_info[0].tsb_ttesz_mask = TSB8K|TSB64K|TSB512K; 13104 ktsb_info[0].tsb_va = ktsb_base; 13105 ktsb_info[0].tsb_pa = ktsb_pbase; 13106 ktsb_info[0].tsb_flags = 0; 13107 ktsb_info[0].tsb_tte.ll = 0; 13108 ktsb_info[0].tsb_cache = NULL; 13109 13110 ktsb_info[1].tsb_sfmmu = ksfmmup; 13111 ktsb_info[1].tsb_szc = ktsb4m_szcode; 13112 ktsb_info[1].tsb_ttesz_mask = TSB4M; 13113 ktsb_info[1].tsb_va = ktsb4m_base; 13114 ktsb_info[1].tsb_pa = ktsb4m_pbase; 13115 ktsb_info[1].tsb_flags = 0; 13116 ktsb_info[1].tsb_tte.ll = 0; 13117 ktsb_info[1].tsb_cache = NULL; 13118 13119 /* Link them into ksfmmup. */ 13120 ktsb_info[0].tsb_next = &ktsb_info[1]; 13121 ktsb_info[1].tsb_next = NULL; 13122 ksfmmup->sfmmu_tsb = &ktsb_info[0]; 13123 13124 sfmmu_setup_tsbinfo(ksfmmup); 13125 } 13126 13127 /* 13128 * Cache the last value returned from va_to_pa(). If the VA specified 13129 * in the current call to cached_va_to_pa() maps to the same Page (as the 13130 * previous call to cached_va_to_pa()), then compute the PA using 13131 * cached info, else call va_to_pa(). 13132 * 13133 * Note: this function is neither MT-safe nor consistent in the presence 13134 * of multiple, interleaved threads. This function was created to enable 13135 * an optimization used during boot (at a point when there's only one thread 13136 * executing on the "boot CPU", and before startup_vm() has been called). 13137 */ 13138 static uint64_t 13139 cached_va_to_pa(void *vaddr) 13140 { 13141 static uint64_t prev_vaddr_base = 0; 13142 static uint64_t prev_pfn = 0; 13143 13144 if ((((uint64_t)vaddr) & MMU_PAGEMASK) == prev_vaddr_base) { 13145 return (prev_pfn | ((uint64_t)vaddr & MMU_PAGEOFFSET)); 13146 } else { 13147 uint64_t pa = va_to_pa(vaddr); 13148 13149 if (pa != ((uint64_t)-1)) { 13150 /* 13151 * Computed physical address is valid. Cache its 13152 * related info for the next cached_va_to_pa() call. 13153 */ 13154 prev_pfn = pa & MMU_PAGEMASK; 13155 prev_vaddr_base = ((uint64_t)vaddr) & MMU_PAGEMASK; 13156 } 13157 13158 return (pa); 13159 } 13160 } 13161 13162 /* 13163 * Carve up our nucleus hblk region. We may allocate more hblks than 13164 * asked due to rounding errors but we are guaranteed to have at least 13165 * enough space to allocate the requested number of hblk8's and hblk1's. 
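 * For example (sizes assumed purely for illustration): if the region is S bytes, nhblk1 * hme1blk_sz bytes are held back at the tail for the hblk1's and everything below hblk8_bound is carved into hme8blk_sz pieces, so the resulting counts j and k may exceed nhblk8 and nhblk1 but, as the ASSERTs below check, never fall short.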
13166 */ 13167 void 13168 sfmmu_init_nucleus_hblks(caddr_t addr, size_t size, int nhblk8, int nhblk1) 13169 { 13170 struct hme_blk *hmeblkp; 13171 size_t hme8blk_sz, hme1blk_sz; 13172 size_t i; 13173 size_t hblk8_bound; 13174 ulong_t j = 0, k = 0; 13175 13176 ASSERT(addr != NULL && size != 0); 13177 13178 /* Need to use proper structure alignment */ 13179 hme8blk_sz = roundup(HME8BLK_SZ, sizeof (int64_t)); 13180 hme1blk_sz = roundup(HME1BLK_SZ, sizeof (int64_t)); 13181 13182 nucleus_hblk8.list = (void *)addr; 13183 nucleus_hblk8.index = 0; 13184 13185 /* 13186 * Use as much memory as possible for hblk8's since we 13187 * expect all bop_alloc'ed memory to be allocated in 8k chunks. 13188 * We need to hold back enough space for the hblk1's which 13189 * we'll allocate next. 13190 */ 13191 hblk8_bound = size - (nhblk1 * hme1blk_sz) - hme8blk_sz; 13192 for (i = 0; i <= hblk8_bound; i += hme8blk_sz, j++) { 13193 hmeblkp = (struct hme_blk *)addr; 13194 addr += hme8blk_sz; 13195 hmeblkp->hblk_nuc_bit = 1; 13196 hmeblkp->hblk_nextpa = cached_va_to_pa((caddr_t)hmeblkp); 13197 } 13198 nucleus_hblk8.len = j; 13199 ASSERT(j >= nhblk8); 13200 SFMMU_STAT_ADD(sf_hblk8_ncreate, j); 13201 13202 nucleus_hblk1.list = (void *)addr; 13203 nucleus_hblk1.index = 0; 13204 for (; i <= (size - hme1blk_sz); i += hme1blk_sz, k++) { 13205 hmeblkp = (struct hme_blk *)addr; 13206 addr += hme1blk_sz; 13207 hmeblkp->hblk_nuc_bit = 1; 13208 hmeblkp->hblk_nextpa = cached_va_to_pa((caddr_t)hmeblkp); 13209 } 13210 ASSERT(k >= nhblk1); 13211 nucleus_hblk1.len = k; 13212 SFMMU_STAT_ADD(sf_hblk1_ncreate, k); 13213 } 13214 13215 /* 13216 * This function is currently not supported on this platform. For what 13217 * it's supposed to do, see hat.c and hat_srmmu.c. 13218 */ 13219 /* ARGSUSED */ 13220 faultcode_t 13221 hat_softlock(struct hat *hat, caddr_t addr, size_t *lenp, page_t **ppp, 13222 uint_t flags) 13223 { 13224 return (FC_NOSUPPORT); 13225 } 13226 13227 /* 13228 * Searches the mapping list of the page for a mapping of the same size. If not 13229 * found, the corresponding bit is cleared in the p_index field. When large 13230 * pages are more prevalent in the system, we can maintain the mapping list 13231 * in order and we don't have to traverse the list each time. Just check the 13232 * next and prev entries, and if both are of different size, we clear the bit. 13233 */ 13234 static void 13235 sfmmu_rm_large_mappings(page_t *pp, int ttesz) 13236 { 13237 struct sf_hment *sfhmep; 13238 int index; 13239 pgcnt_t npgs; 13240 13241 ASSERT(ttesz > TTE8K); 13242 13243 ASSERT(sfmmu_mlist_held(pp)); 13244 13245 ASSERT(PP_ISMAPPED_LARGE(pp)); 13246 13247 /* 13248 * Traverse the mapping list looking for another mapping of the same size, 13249 * since we only want to clear the index field if all mappings of 13250 * that size are gone. 13251 */ 13252 13253 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) { 13254 if (IS_PAHME(sfhmep)) 13255 continue; 13256 if (hme_size(sfhmep) == ttesz) { 13257 /* 13258 * Another mapping of the same size; don't clear index. 13259 */ 13260 return; 13261 } 13262 } 13263 13264 /* 13265 * Clear the p_index bit for large page.
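 * Worked example (sizes assumed for illustration, 8K base pages): for ttesz == TTE4M, index is PAGESZ_TO_INDEX(TTE4M) and npgs is TTEPAGES(TTE4M), i.e. 512, so the loop below clears that index bit in each of the 512 constituent page_t's of the large page.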
13266 */ 13267 index = PAGESZ_TO_INDEX(ttesz); 13268 npgs = TTEPAGES(ttesz); 13269 while (npgs-- > 0) { 13270 ASSERT(pp->p_index & index); 13271 pp->p_index &= ~index; 13272 pp = PP_PAGENEXT(pp); 13273 } 13274 } 13275 13276 /* 13277 * return supported features 13278 */ 13279 /* ARGSUSED */ 13280 int 13281 hat_supported(enum hat_features feature, void *arg) 13282 { 13283 switch (feature) { 13284 case HAT_SHARED_PT: 13285 case HAT_DYNAMIC_ISM_UNMAP: 13286 case HAT_VMODSORT: 13287 return (1); 13288 case HAT_SHARED_REGIONS: 13289 if (shctx_on) 13290 return (1); 13291 else 13292 return (0); 13293 default: 13294 return (0); 13295 } 13296 } 13297 13298 void 13299 hat_enter(struct hat *hat) 13300 { 13301 hatlock_t *hatlockp; 13302 13303 if (hat != ksfmmup) { 13304 hatlockp = TSB_HASH(hat); 13305 mutex_enter(HATLOCK_MUTEXP(hatlockp)); 13306 } 13307 } 13308 13309 void 13310 hat_exit(struct hat *hat) 13311 { 13312 hatlock_t *hatlockp; 13313 13314 if (hat != ksfmmup) { 13315 hatlockp = TSB_HASH(hat); 13316 mutex_exit(HATLOCK_MUTEXP(hatlockp)); 13317 } 13318 } 13319 13320 /*ARGSUSED*/ 13321 void 13322 hat_reserve(struct as *as, caddr_t addr, size_t len) 13323 { 13324 } 13325 13326 static void 13327 hat_kstat_init(void) 13328 { 13329 kstat_t *ksp; 13330 13331 ksp = kstat_create("unix", 0, "sfmmu_global_stat", "hat", 13332 KSTAT_TYPE_RAW, sizeof (struct sfmmu_global_stat), 13333 KSTAT_FLAG_VIRTUAL); 13334 if (ksp) { 13335 ksp->ks_data = (void *) &sfmmu_global_stat; 13336 kstat_install(ksp); 13337 } 13338 ksp = kstat_create("unix", 0, "sfmmu_tsbsize_stat", "hat", 13339 KSTAT_TYPE_RAW, sizeof (struct sfmmu_tsbsize_stat), 13340 KSTAT_FLAG_VIRTUAL); 13341 if (ksp) { 13342 ksp->ks_data = (void *) &sfmmu_tsbsize_stat; 13343 kstat_install(ksp); 13344 } 13345 ksp = kstat_create("unix", 0, "sfmmu_percpu_stat", "hat", 13346 KSTAT_TYPE_RAW, sizeof (struct sfmmu_percpu_stat) * NCPU, 13347 KSTAT_FLAG_WRITABLE); 13348 if (ksp) { 13349 ksp->ks_update = sfmmu_kstat_percpu_update; 13350 kstat_install(ksp); 13351 } 13352 } 13353 13354 /* ARGSUSED */ 13355 static int 13356 sfmmu_kstat_percpu_update(kstat_t *ksp, int rw) 13357 { 13358 struct sfmmu_percpu_stat *cpu_kstat = ksp->ks_data; 13359 struct tsbmiss *tsbm = tsbmiss_area; 13360 struct kpmtsbm *kpmtsbm = kpmtsbm_area; 13361 int i; 13362 13363 ASSERT(cpu_kstat); 13364 if (rw == KSTAT_READ) { 13365 for (i = 0; i < NCPU; cpu_kstat++, tsbm++, kpmtsbm++, i++) { 13366 cpu_kstat->sf_itlb_misses = 0; 13367 cpu_kstat->sf_dtlb_misses = 0; 13368 cpu_kstat->sf_utsb_misses = tsbm->utsb_misses - 13369 tsbm->uprot_traps; 13370 cpu_kstat->sf_ktsb_misses = tsbm->ktsb_misses + 13371 kpmtsbm->kpm_tsb_misses - tsbm->kprot_traps; 13372 cpu_kstat->sf_tsb_hits = 0; 13373 cpu_kstat->sf_umod_faults = tsbm->uprot_traps; 13374 cpu_kstat->sf_kmod_faults = tsbm->kprot_traps; 13375 } 13376 } else { 13377 /* KSTAT_WRITE is used to clear stats */ 13378 for (i = 0; i < NCPU; tsbm++, kpmtsbm++, i++) { 13379 tsbm->utsb_misses = 0; 13380 tsbm->ktsb_misses = 0; 13381 tsbm->uprot_traps = 0; 13382 tsbm->kprot_traps = 0; 13383 kpmtsbm->kpm_dtlb_misses = 0; 13384 kpmtsbm->kpm_tsb_misses = 0; 13385 } 13386 } 13387 return (0); 13388 } 13389 13390 #ifdef DEBUG 13391 13392 tte_t *gorig[NCPU], *gcur[NCPU], *gnew[NCPU]; 13393 13394 /* 13395 * A tte checker. *orig_old is the value we read before cas. 13396 * *cur is the value returned by cas. 13397 * *new is the desired value when we do the cas. 13398 * 13399 * *hmeblkp is currently unused. 
13400 */ 13401 13402 /* ARGSUSED */ 13403 void 13404 chk_tte(tte_t *orig_old, tte_t *cur, tte_t *new, struct hme_blk *hmeblkp) 13405 { 13406 pfn_t i, j, k; 13407 int cpuid = CPU->cpu_id; 13408 13409 gorig[cpuid] = orig_old; 13410 gcur[cpuid] = cur; 13411 gnew[cpuid] = new; 13412 13413 #ifdef lint 13414 hmeblkp = hmeblkp; 13415 #endif 13416 13417 if (TTE_IS_VALID(orig_old)) { 13418 if (TTE_IS_VALID(cur)) { 13419 i = TTE_TO_TTEPFN(orig_old); 13420 j = TTE_TO_TTEPFN(cur); 13421 k = TTE_TO_TTEPFN(new); 13422 if (i != j) { 13423 /* remap error? */ 13424 panic("chk_tte: bad pfn, 0x%lx, 0x%lx", i, j); 13425 } 13426 13427 if (i != k) { 13428 /* remap error? */ 13429 panic("chk_tte: bad pfn2, 0x%lx, 0x%lx", i, k); 13430 } 13431 } else { 13432 if (TTE_IS_VALID(new)) { 13433 panic("chk_tte: invalid cur? "); 13434 } 13435 13436 i = TTE_TO_TTEPFN(orig_old); 13437 k = TTE_TO_TTEPFN(new); 13438 if (i != k) { 13439 panic("chk_tte: bad pfn3, 0x%lx, 0x%lx", i, k); 13440 } 13441 } 13442 } else { 13443 if (TTE_IS_VALID(cur)) { 13444 j = TTE_TO_TTEPFN(cur); 13445 if (TTE_IS_VALID(new)) { 13446 k = TTE_TO_TTEPFN(new); 13447 if (j != k) { 13448 panic("chk_tte: bad pfn4, 0x%lx, 0x%lx", 13449 j, k); 13450 } 13451 } else { 13452 panic("chk_tte: why here?"); 13453 } 13454 } else { 13455 if (!TTE_IS_VALID(new)) { 13456 panic("chk_tte: why here2 ?"); 13457 } 13458 } 13459 } 13460 } 13461 13462 #endif /* DEBUG */ 13463 13464 extern void prefetch_tsbe_read(struct tsbe *); 13465 extern void prefetch_tsbe_write(struct tsbe *); 13466 13467 13468 /* 13469 * We want to prefetch 7 cache lines ahead for our read prefetch. This gives 13470 * us optimal performance on Cheetah+. You can only have 8 outstanding 13471 * prefetches at any one time, so we opted for 7 read prefetches and 1 write 13472 * prefetch to make the most utilization of the prefetch capability. 13473 */ 13474 #define TSBE_PREFETCH_STRIDE (7) 13475 13476 void 13477 sfmmu_copy_tsb(struct tsb_info *old_tsbinfo, struct tsb_info *new_tsbinfo) 13478 { 13479 int old_bytes = TSB_BYTES(old_tsbinfo->tsb_szc); 13480 int new_bytes = TSB_BYTES(new_tsbinfo->tsb_szc); 13481 int old_entries = TSB_ENTRIES(old_tsbinfo->tsb_szc); 13482 int new_entries = TSB_ENTRIES(new_tsbinfo->tsb_szc); 13483 struct tsbe *old; 13484 struct tsbe *new; 13485 struct tsbe *new_base = (struct tsbe *)new_tsbinfo->tsb_va; 13486 uint64_t va; 13487 int new_offset; 13488 int i; 13489 int vpshift; 13490 int last_prefetch; 13491 13492 if (old_bytes == new_bytes) { 13493 bcopy(old_tsbinfo->tsb_va, new_tsbinfo->tsb_va, new_bytes); 13494 } else { 13495 13496 /* 13497 * A TSBE is 16 bytes which means there are four TSBE's per 13498 * P$ line (64 bytes), thus every 4 TSBE's we prefetch. 13499 */ 13500 old = (struct tsbe *)old_tsbinfo->tsb_va; 13501 last_prefetch = old_entries - (4*(TSBE_PREFETCH_STRIDE+1)); 13502 for (i = 0; i < old_entries; i++, old++) { 13503 if (((i & (4-1)) == 0) && (i < last_prefetch)) 13504 prefetch_tsbe_read(old); 13505 if (!old->tte_tag.tag_invalid) { 13506 /* 13507 * We have a valid TTE to remap. Check the 13508 * size. We won't remap 64K or 512K TTEs 13509 * because they span more than one TSB entry 13510 * and are indexed using an 8K virt. page. 13511 * Ditto for 32M and 256M TTEs. 
13512 */ 13513 if (TTE_CSZ(&old->tte_data) == TTE64K || 13514 TTE_CSZ(&old->tte_data) == TTE512K) 13515 continue; 13516 if (mmu_page_sizes == max_mmu_page_sizes) { 13517 if (TTE_CSZ(&old->tte_data) == TTE32M || 13518 TTE_CSZ(&old->tte_data) == TTE256M) 13519 continue; 13520 } 13521 13522 /* clear the lower 22 bits of the va */ 13523 va = *(uint64_t *)old << 22; 13524 /* turn va into a virtual pfn */ 13525 va >>= 22 - TSB_START_SIZE; 13526 /* 13527 * or in bits from the offset in the tsb 13528 * to get the real virtual pfn. These 13529 * correspond to bits [21:13] in the va 13530 */ 13531 vpshift = 13532 TTE_BSZS_SHIFT(TTE_CSZ(&old->tte_data)) & 13533 0x1ff; 13534 va |= (i << vpshift); 13535 va >>= vpshift; 13536 new_offset = va & (new_entries - 1); 13537 new = new_base + new_offset; 13538 prefetch_tsbe_write(new); 13539 *new = *old; 13540 } 13541 } 13542 } 13543 } 13544 13545 /* 13546 * unused in sfmmu 13547 */ 13548 void 13549 hat_dump(void) 13550 { 13551 } 13552 13553 /* 13554 * Called when a thread is exiting and we have switched to the kernel address 13555 * space. Perform the same VM initialization resume() uses when switching 13556 * processes. 13557 * 13558 * Note that sfmmu_load_mmustate() is currently a no-op for kernel threads, but 13559 * we call it anyway in case the semantics change in the future. 13560 */ 13561 /*ARGSUSED*/ 13562 void 13563 hat_thread_exit(kthread_t *thd) 13564 { 13565 uint_t pgsz_cnum; 13566 uint_t pstate_save; 13567 13568 ASSERT(thd->t_procp->p_as == &kas); 13569 13570 pgsz_cnum = KCONTEXT; 13571 #ifdef sun4u 13572 pgsz_cnum |= (ksfmmup->sfmmu_cext << CTXREG_EXT_SHIFT); 13573 #endif 13574 13575 /* 13576 * Note that sfmmu_load_mmustate() is currently a no-op for 13577 * kernel threads. We need to disable interrupts here, 13578 * simply because otherwise sfmmu_load_mmustate() would panic 13579 * if the caller does not disable interrupts. 13580 */ 13581 pstate_save = sfmmu_disable_intrs(); 13582 13583 /* Compatibility Note: hw takes care of MMU_SCONTEXT1 */ 13584 sfmmu_setctx_sec(pgsz_cnum); 13585 sfmmu_load_mmustate(ksfmmup); 13586 sfmmu_enable_intrs(pstate_save); 13587 } 13588 13589 13590 /* 13591 * SRD support 13592 */ 13593 #define SRD_HASH_FUNCTION(vp) (((((uintptr_t)(vp)) >> 4) ^ \ 13594 (((uintptr_t)(vp)) >> 11)) & \ 13595 srd_hashmask) 13596 13597 /* 13598 * Attach the process to the srd struct associated with the exec vnode 13599 * from which the process is started. 
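 * In outline (descriptive only): hat_join_srd(sfmmup, evp) below holds the vnode, hashes it with SRD_HASH_FUNCTION(evp), and either takes a reference on an existing srd whose srd_evp matches or allocates and inserts a new one; the reference and the vnode hold are dropped again in sfmmu_leave_srd() when the hat is being freed.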
13600 */ 13601 void 13602 hat_join_srd(struct hat *sfmmup, vnode_t *evp) 13603 { 13604 uint_t hash = SRD_HASH_FUNCTION(evp); 13605 sf_srd_t *srdp; 13606 sf_srd_t *newsrdp; 13607 13608 ASSERT(sfmmup != ksfmmup); 13609 ASSERT(sfmmup->sfmmu_srdp == NULL); 13610 13611 if (!shctx_on) { 13612 return; 13613 } 13614 13615 VN_HOLD(evp); 13616 13617 if (srd_buckets[hash].srdb_srdp != NULL) { 13618 mutex_enter(&srd_buckets[hash].srdb_lock); 13619 for (srdp = srd_buckets[hash].srdb_srdp; srdp != NULL; 13620 srdp = srdp->srd_hash) { 13621 if (srdp->srd_evp == evp) { 13622 ASSERT(srdp->srd_refcnt >= 0); 13623 sfmmup->sfmmu_srdp = srdp; 13624 atomic_inc_32( 13625 (volatile uint_t *)&srdp->srd_refcnt); 13626 mutex_exit(&srd_buckets[hash].srdb_lock); 13627 return; 13628 } 13629 } 13630 mutex_exit(&srd_buckets[hash].srdb_lock); 13631 } 13632 newsrdp = kmem_cache_alloc(srd_cache, KM_SLEEP); 13633 ASSERT(newsrdp->srd_next_ismrid == 0 && newsrdp->srd_next_hmerid == 0); 13634 13635 newsrdp->srd_evp = evp; 13636 newsrdp->srd_refcnt = 1; 13637 newsrdp->srd_hmergnfree = NULL; 13638 newsrdp->srd_ismrgnfree = NULL; 13639 13640 mutex_enter(&srd_buckets[hash].srdb_lock); 13641 for (srdp = srd_buckets[hash].srdb_srdp; srdp != NULL; 13642 srdp = srdp->srd_hash) { 13643 if (srdp->srd_evp == evp) { 13644 ASSERT(srdp->srd_refcnt >= 0); 13645 sfmmup->sfmmu_srdp = srdp; 13646 atomic_inc_32((volatile uint_t *)&srdp->srd_refcnt); 13647 mutex_exit(&srd_buckets[hash].srdb_lock); 13648 kmem_cache_free(srd_cache, newsrdp); 13649 return; 13650 } 13651 } 13652 newsrdp->srd_hash = srd_buckets[hash].srdb_srdp; 13653 srd_buckets[hash].srdb_srdp = newsrdp; 13654 sfmmup->sfmmu_srdp = newsrdp; 13655 13656 mutex_exit(&srd_buckets[hash].srdb_lock); 13657 13658 } 13659 13660 static void 13661 sfmmu_leave_srd(sfmmu_t *sfmmup) 13662 { 13663 vnode_t *evp; 13664 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 13665 uint_t hash; 13666 sf_srd_t **prev_srdpp; 13667 sf_region_t *rgnp; 13668 sf_region_t *nrgnp; 13669 #ifdef DEBUG 13670 int rgns = 0; 13671 #endif 13672 int i; 13673 13674 ASSERT(sfmmup != ksfmmup); 13675 ASSERT(srdp != NULL); 13676 ASSERT(srdp->srd_refcnt > 0); 13677 ASSERT(sfmmup->sfmmu_scdp == NULL); 13678 ASSERT(sfmmup->sfmmu_free == 1); 13679 13680 sfmmup->sfmmu_srdp = NULL; 13681 evp = srdp->srd_evp; 13682 ASSERT(evp != NULL); 13683 if (atomic_dec_32_nv((volatile uint_t *)&srdp->srd_refcnt)) { 13684 VN_RELE(evp); 13685 return; 13686 } 13687 13688 hash = SRD_HASH_FUNCTION(evp); 13689 mutex_enter(&srd_buckets[hash].srdb_lock); 13690 for (prev_srdpp = &srd_buckets[hash].srdb_srdp; 13691 (srdp = *prev_srdpp) != NULL; prev_srdpp = &srdp->srd_hash) { 13692 if (srdp->srd_evp == evp) { 13693 break; 13694 } 13695 } 13696 if (srdp == NULL || srdp->srd_refcnt) { 13697 mutex_exit(&srd_buckets[hash].srdb_lock); 13698 VN_RELE(evp); 13699 return; 13700 } 13701 *prev_srdpp = srdp->srd_hash; 13702 mutex_exit(&srd_buckets[hash].srdb_lock); 13703 13704 ASSERT(srdp->srd_refcnt == 0); 13705 VN_RELE(evp); 13706 13707 #ifdef DEBUG 13708 for (i = 0; i < SFMMU_MAX_REGION_BUCKETS; i++) { 13709 ASSERT(srdp->srd_rgnhash[i] == NULL); 13710 } 13711 #endif /* DEBUG */ 13712 13713 /* free each hme regions in the srd */ 13714 for (rgnp = srdp->srd_hmergnfree; rgnp != NULL; rgnp = nrgnp) { 13715 nrgnp = rgnp->rgn_next; 13716 ASSERT(rgnp->rgn_id < srdp->srd_next_hmerid); 13717 ASSERT(rgnp->rgn_refcnt == 0); 13718 ASSERT(rgnp->rgn_sfmmu_head == NULL); 13719 ASSERT(rgnp->rgn_flags & SFMMU_REGION_FREE); 13720 ASSERT(rgnp->rgn_hmeflags == 0); 13721 
ASSERT(srdp->srd_hmergnp[rgnp->rgn_id] == rgnp); 13722 #ifdef DEBUG 13723 for (i = 0; i < MMU_PAGE_SIZES; i++) { 13724 ASSERT(rgnp->rgn_ttecnt[i] == 0); 13725 } 13726 rgns++; 13727 #endif /* DEBUG */ 13728 kmem_cache_free(region_cache, rgnp); 13729 } 13730 ASSERT(rgns == srdp->srd_next_hmerid); 13731 13732 #ifdef DEBUG 13733 rgns = 0; 13734 #endif 13735 /* free each ism rgns in the srd */ 13736 for (rgnp = srdp->srd_ismrgnfree; rgnp != NULL; rgnp = nrgnp) { 13737 nrgnp = rgnp->rgn_next; 13738 ASSERT(rgnp->rgn_id < srdp->srd_next_ismrid); 13739 ASSERT(rgnp->rgn_refcnt == 0); 13740 ASSERT(rgnp->rgn_sfmmu_head == NULL); 13741 ASSERT(rgnp->rgn_flags & SFMMU_REGION_FREE); 13742 ASSERT(srdp->srd_ismrgnp[rgnp->rgn_id] == rgnp); 13743 #ifdef DEBUG 13744 for (i = 0; i < MMU_PAGE_SIZES; i++) { 13745 ASSERT(rgnp->rgn_ttecnt[i] == 0); 13746 } 13747 rgns++; 13748 #endif /* DEBUG */ 13749 kmem_cache_free(region_cache, rgnp); 13750 } 13751 ASSERT(rgns == srdp->srd_next_ismrid); 13752 ASSERT(srdp->srd_ismbusyrgns == 0); 13753 ASSERT(srdp->srd_hmebusyrgns == 0); 13754 13755 srdp->srd_next_ismrid = 0; 13756 srdp->srd_next_hmerid = 0; 13757 13758 bzero((void *)srdp->srd_ismrgnp, 13759 sizeof (sf_region_t *) * SFMMU_MAX_ISM_REGIONS); 13760 bzero((void *)srdp->srd_hmergnp, 13761 sizeof (sf_region_t *) * SFMMU_MAX_HME_REGIONS); 13762 13763 ASSERT(srdp->srd_scdp == NULL); 13764 kmem_cache_free(srd_cache, srdp); 13765 } 13766 13767 /* ARGSUSED */ 13768 static int 13769 sfmmu_srdcache_constructor(void *buf, void *cdrarg, int kmflags) 13770 { 13771 sf_srd_t *srdp = (sf_srd_t *)buf; 13772 bzero(buf, sizeof (*srdp)); 13773 13774 mutex_init(&srdp->srd_mutex, NULL, MUTEX_DEFAULT, NULL); 13775 mutex_init(&srdp->srd_scd_mutex, NULL, MUTEX_DEFAULT, NULL); 13776 return (0); 13777 } 13778 13779 /* ARGSUSED */ 13780 static void 13781 sfmmu_srdcache_destructor(void *buf, void *cdrarg) 13782 { 13783 sf_srd_t *srdp = (sf_srd_t *)buf; 13784 13785 mutex_destroy(&srdp->srd_mutex); 13786 mutex_destroy(&srdp->srd_scd_mutex); 13787 } 13788 13789 /* 13790 * The caller makes sure hat_join_region()/hat_leave_region() can't be called 13791 * at the same time for the same process and address range. This is ensured by 13792 * the fact that address space is locked as writer when a process joins the 13793 * regions. Therefore there's no need to hold an srd lock during the entire 13794 * execution of hat_join_region()/hat_leave_region(). 13795 */ 13796 13797 #define RGN_HASH_FUNCTION(obj) (((((uintptr_t)(obj)) >> 4) ^ \ 13798 (((uintptr_t)(obj)) >> 11)) & \ 13799 srd_rgn_hashmask) 13800 /* 13801 * This routine implements the shared context functionality required when 13802 * attaching a segment to an address space. It must be called from 13803 * hat_share() for D(ISM) segments and from segvn_create() for segments 13804 * with the MAP_PRIVATE and MAP_TEXT flags set. It returns a region_cookie 13805 * which is saved in the private segment data for hme segments and 13806 * the ism_map structure for ism segments. 
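 *
 * A minimal caller-side sketch (simplified; the segment fields and the
 * callback name shown here are illustrative, not a verbatim quote of the
 * segvn code):
 *
 *	svd->rcookie = hat_join_region(seg->s_as->a_hat, seg->s_base,
 *	    seg->s_size, svd->vp, svd->offset, prot, szc,
 *	    segvn_rgn_unload_callback, HAT_REGION_TEXT);
 *	if (svd->rcookie == HAT_INVALID_REGION_COOKIE)
 *		... fall back to ordinary private translations ...
 *
 * and the matching hat_leave_region(hat, svd->rcookie, HAT_REGION_TEXT)
 * call is made with the same cookie when the segment is torn down.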
13807 */ 13808 hat_region_cookie_t 13809 hat_join_region(struct hat *sfmmup, 13810 caddr_t r_saddr, 13811 size_t r_size, 13812 void *r_obj, 13813 u_offset_t r_objoff, 13814 uchar_t r_perm, 13815 uchar_t r_pgszc, 13816 hat_rgn_cb_func_t r_cb_function, 13817 uint_t flags) 13818 { 13819 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 13820 uint_t rhash; 13821 uint_t rid; 13822 hatlock_t *hatlockp; 13823 sf_region_t *rgnp; 13824 sf_region_t *new_rgnp = NULL; 13825 int i; 13826 uint16_t *nextidp; 13827 sf_region_t **freelistp; 13828 int maxids; 13829 sf_region_t **rarrp; 13830 uint16_t *busyrgnsp; 13831 ulong_t rttecnt; 13832 uchar_t tteflag; 13833 uchar_t r_type = flags & HAT_REGION_TYPE_MASK; 13834 int text = (r_type == HAT_REGION_TEXT); 13835 13836 if (srdp == NULL || r_size == 0) { 13837 return (HAT_INVALID_REGION_COOKIE); 13838 } 13839 13840 ASSERT(sfmmup != ksfmmup); 13841 ASSERT(AS_WRITE_HELD(sfmmup->sfmmu_as)); 13842 ASSERT(srdp->srd_refcnt > 0); 13843 ASSERT(!(flags & ~HAT_REGION_TYPE_MASK)); 13844 ASSERT(flags == HAT_REGION_TEXT || flags == HAT_REGION_ISM); 13845 ASSERT(r_pgszc < mmu_page_sizes); 13846 if (!IS_P2ALIGNED(r_saddr, TTEBYTES(r_pgszc)) || 13847 !IS_P2ALIGNED(r_size, TTEBYTES(r_pgszc))) { 13848 panic("hat_join_region: region addr or size is not aligned\n"); 13849 } 13850 13851 13852 r_type = (r_type == HAT_REGION_ISM) ? SFMMU_REGION_ISM : 13853 SFMMU_REGION_HME; 13854 /* 13855 * Currently only support shared hmes for the read only main text 13856 * region. 13857 */ 13858 if (r_type == SFMMU_REGION_HME && ((r_obj != srdp->srd_evp) || 13859 (r_perm & PROT_WRITE))) { 13860 return (HAT_INVALID_REGION_COOKIE); 13861 } 13862 13863 rhash = RGN_HASH_FUNCTION(r_obj); 13864 13865 if (r_type == SFMMU_REGION_ISM) { 13866 nextidp = &srdp->srd_next_ismrid; 13867 freelistp = &srdp->srd_ismrgnfree; 13868 maxids = SFMMU_MAX_ISM_REGIONS; 13869 rarrp = srdp->srd_ismrgnp; 13870 busyrgnsp = &srdp->srd_ismbusyrgns; 13871 } else { 13872 nextidp = &srdp->srd_next_hmerid; 13873 freelistp = &srdp->srd_hmergnfree; 13874 maxids = SFMMU_MAX_HME_REGIONS; 13875 rarrp = srdp->srd_hmergnp; 13876 busyrgnsp = &srdp->srd_hmebusyrgns; 13877 } 13878 13879 mutex_enter(&srdp->srd_mutex); 13880 13881 for (rgnp = srdp->srd_rgnhash[rhash]; rgnp != NULL; 13882 rgnp = rgnp->rgn_hash) { 13883 if (rgnp->rgn_saddr == r_saddr && rgnp->rgn_size == r_size && 13884 rgnp->rgn_obj == r_obj && rgnp->rgn_objoff == r_objoff && 13885 rgnp->rgn_perm == r_perm && rgnp->rgn_pgszc == r_pgszc) { 13886 break; 13887 } 13888 } 13889 13890 rfound: 13891 if (rgnp != NULL) { 13892 ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == r_type); 13893 ASSERT(rgnp->rgn_cb_function == r_cb_function); 13894 ASSERT(rgnp->rgn_refcnt >= 0); 13895 rid = rgnp->rgn_id; 13896 ASSERT(rid < maxids); 13897 ASSERT(rarrp[rid] == rgnp); 13898 ASSERT(rid < *nextidp); 13899 atomic_inc_32((volatile uint_t *)&rgnp->rgn_refcnt); 13900 mutex_exit(&srdp->srd_mutex); 13901 if (new_rgnp != NULL) { 13902 kmem_cache_free(region_cache, new_rgnp); 13903 } 13904 if (r_type == SFMMU_REGION_HME) { 13905 int myjoin = 13906 (sfmmup == astosfmmu(curthread->t_procp->p_as)); 13907 13908 sfmmu_link_to_hmeregion(sfmmup, rgnp); 13909 /* 13910 * bitmap should be updated after linking sfmmu on 13911 * region list so that pageunload() doesn't skip 13912 * TSB/TLB flush. As soon as bitmap is updated another 13913 * thread in this process can already start accessing 13914 * this region. 13915 */ 13916 /* 13917 * Normally ttecnt accounting is done as part of 13918 * pagefault handling. 
But a process may not take any 13919 * pagefaults on shared hmeblks created by some other 13920 * process. To compensate for this assume that the 13921 * entire region will end up faulted in using 13922 * the region's pagesize. 13923 * 13924 */ 13925 if (r_pgszc > TTE8K) { 13926 tteflag = 1 << r_pgszc; 13927 if (disable_large_pages & tteflag) { 13928 tteflag = 0; 13929 } 13930 } else { 13931 tteflag = 0; 13932 } 13933 if (tteflag && !(sfmmup->sfmmu_rtteflags & tteflag)) { 13934 hatlockp = sfmmu_hat_enter(sfmmup); 13935 sfmmup->sfmmu_rtteflags |= tteflag; 13936 sfmmu_hat_exit(hatlockp); 13937 } 13938 hatlockp = sfmmu_hat_enter(sfmmup); 13939 13940 /* 13941 * Preallocate 1/4 of ttecnt's in 8K TSB for >= 4M 13942 * region to allow for large page allocation failure. 13943 */ 13944 if (r_pgszc >= TTE4M) { 13945 sfmmup->sfmmu_tsb0_4minflcnt += 13946 r_size >> (TTE_PAGE_SHIFT(TTE8K) + 2); 13947 } 13948 13949 /* update sfmmu_ttecnt with the shme rgn ttecnt */ 13950 rttecnt = r_size >> TTE_PAGE_SHIFT(r_pgszc); 13951 atomic_add_long(&sfmmup->sfmmu_ttecnt[r_pgszc], 13952 rttecnt); 13953 13954 if (text && r_pgszc >= TTE4M && 13955 (tteflag || ((disable_large_pages >> TTE4M) & 13956 ((1 << (r_pgszc - TTE4M + 1)) - 1))) && 13957 !SFMMU_FLAGS_ISSET(sfmmup, HAT_4MTEXT_FLAG)) { 13958 SFMMU_FLAGS_SET(sfmmup, HAT_4MTEXT_FLAG); 13959 } 13960 13961 sfmmu_hat_exit(hatlockp); 13962 /* 13963 * On Panther we need to make sure TLB is programmed 13964 * to accept 32M/256M pages. Call 13965 * sfmmu_check_page_sizes() now to make sure TLB is 13966 * setup before making hmeregions visible to other 13967 * threads. 13968 */ 13969 sfmmu_check_page_sizes(sfmmup, 1); 13970 hatlockp = sfmmu_hat_enter(sfmmup); 13971 SF_RGNMAP_ADD(sfmmup->sfmmu_hmeregion_map, rid); 13972 13973 /* 13974 * if context is invalid tsb miss exception code will 13975 * call sfmmu_check_page_sizes() and update tsbmiss 13976 * area later. 13977 */ 13978 kpreempt_disable(); 13979 if (myjoin && 13980 (sfmmup->sfmmu_ctxs[CPU_MMU_IDX(CPU)].cnum 13981 != INVALID_CONTEXT)) { 13982 struct tsbmiss *tsbmp; 13983 13984 tsbmp = &tsbmiss_area[CPU->cpu_id]; 13985 ASSERT(sfmmup == tsbmp->usfmmup); 13986 BT_SET(tsbmp->shmermap, rid); 13987 if (r_pgszc > TTE64K) { 13988 tsbmp->uhat_rtteflags |= tteflag; 13989 } 13990 13991 } 13992 kpreempt_enable(); 13993 13994 sfmmu_hat_exit(hatlockp); 13995 ASSERT((hat_region_cookie_t)((uint64_t)rid) != 13996 HAT_INVALID_REGION_COOKIE); 13997 } else { 13998 hatlockp = sfmmu_hat_enter(sfmmup); 13999 SF_RGNMAP_ADD(sfmmup->sfmmu_ismregion_map, rid); 14000 sfmmu_hat_exit(hatlockp); 14001 } 14002 ASSERT(rid < maxids); 14003 14004 if (r_type == SFMMU_REGION_ISM) { 14005 sfmmu_find_scd(sfmmup); 14006 } 14007 return ((hat_region_cookie_t)((uint64_t)rid)); 14008 } 14009 14010 ASSERT(new_rgnp == NULL); 14011 14012 if (*busyrgnsp >= maxids) { 14013 mutex_exit(&srdp->srd_mutex); 14014 return (HAT_INVALID_REGION_COOKIE); 14015 } 14016 14017 ASSERT(MUTEX_HELD(&srdp->srd_mutex)); 14018 if (*freelistp != NULL) { 14019 rgnp = *freelistp; 14020 *freelistp = rgnp->rgn_next; 14021 ASSERT(rgnp->rgn_id < *nextidp); 14022 ASSERT(rgnp->rgn_id < maxids); 14023 ASSERT(rgnp->rgn_flags & SFMMU_REGION_FREE); 14024 ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) 14025 == r_type); 14026 ASSERT(rarrp[rgnp->rgn_id] == rgnp); 14027 ASSERT(rgnp->rgn_hmeflags == 0); 14028 } else { 14029 /* 14030 * release local locks before memory allocation. 
14031 */ 14032 mutex_exit(&srdp->srd_mutex); 14033 14034 new_rgnp = kmem_cache_alloc(region_cache, KM_SLEEP); 14035 14036 mutex_enter(&srdp->srd_mutex); 14037 for (rgnp = srdp->srd_rgnhash[rhash]; rgnp != NULL; 14038 rgnp = rgnp->rgn_hash) { 14039 if (rgnp->rgn_saddr == r_saddr && 14040 rgnp->rgn_size == r_size && 14041 rgnp->rgn_obj == r_obj && 14042 rgnp->rgn_objoff == r_objoff && 14043 rgnp->rgn_perm == r_perm && 14044 rgnp->rgn_pgszc == r_pgszc) { 14045 break; 14046 } 14047 } 14048 if (rgnp != NULL) { 14049 goto rfound; 14050 } 14051 14052 if (*nextidp >= maxids) { 14053 mutex_exit(&srdp->srd_mutex); 14054 goto fail; 14055 } 14056 rgnp = new_rgnp; 14057 new_rgnp = NULL; 14058 rgnp->rgn_id = (*nextidp)++; 14059 ASSERT(rgnp->rgn_id < maxids); 14060 ASSERT(rarrp[rgnp->rgn_id] == NULL); 14061 rarrp[rgnp->rgn_id] = rgnp; 14062 } 14063 14064 ASSERT(rgnp->rgn_sfmmu_head == NULL); 14065 ASSERT(rgnp->rgn_hmeflags == 0); 14066 #ifdef DEBUG 14067 for (i = 0; i < MMU_PAGE_SIZES; i++) { 14068 ASSERT(rgnp->rgn_ttecnt[i] == 0); 14069 } 14070 #endif 14071 rgnp->rgn_saddr = r_saddr; 14072 rgnp->rgn_size = r_size; 14073 rgnp->rgn_obj = r_obj; 14074 rgnp->rgn_objoff = r_objoff; 14075 rgnp->rgn_perm = r_perm; 14076 rgnp->rgn_pgszc = r_pgszc; 14077 rgnp->rgn_flags = r_type; 14078 rgnp->rgn_refcnt = 0; 14079 rgnp->rgn_cb_function = r_cb_function; 14080 rgnp->rgn_hash = srdp->srd_rgnhash[rhash]; 14081 srdp->srd_rgnhash[rhash] = rgnp; 14082 (*busyrgnsp)++; 14083 ASSERT(*busyrgnsp <= maxids); 14084 goto rfound; 14085 14086 fail: 14087 ASSERT(new_rgnp != NULL); 14088 kmem_cache_free(region_cache, new_rgnp); 14089 return (HAT_INVALID_REGION_COOKIE); 14090 } 14091 14092 /* 14093 * This function implements the shared context functionality required 14094 * when detaching a segment from an address space. It must be called 14095 * from hat_unshare() for all D(ISM) segments and from segvn_unmap(), 14096 * for segments with a valid region_cookie. 14097 * It will also be called from all seg_vn routines which change a 14098 * segment's attributes such as segvn_setprot(), segvn_setpagesize(), 14099 * segvn_clrszc() & segvn_advise(), as well as in the case of COW fault 14100 * from segvn_fault(). 14101 */ 14102 void 14103 hat_leave_region(struct hat *sfmmup, hat_region_cookie_t rcookie, uint_t flags) 14104 { 14105 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 14106 sf_scd_t *scdp; 14107 uint_t rhash; 14108 uint_t rid = (uint_t)((uint64_t)rcookie); 14109 hatlock_t *hatlockp = NULL; 14110 sf_region_t *rgnp; 14111 sf_region_t **prev_rgnpp; 14112 sf_region_t *cur_rgnp; 14113 void *r_obj; 14114 int i; 14115 caddr_t r_saddr; 14116 caddr_t r_eaddr; 14117 size_t r_size; 14118 uchar_t r_pgszc; 14119 uchar_t r_type = flags & HAT_REGION_TYPE_MASK; 14120 14121 ASSERT(sfmmup != ksfmmup); 14122 ASSERT(srdp != NULL); 14123 ASSERT(srdp->srd_refcnt > 0); 14124 ASSERT(!(flags & ~HAT_REGION_TYPE_MASK)); 14125 ASSERT(flags == HAT_REGION_TEXT || flags == HAT_REGION_ISM); 14126 ASSERT(!sfmmup->sfmmu_free || sfmmup->sfmmu_scdp == NULL); 14127 14128 r_type = (r_type == HAT_REGION_ISM) ? 
SFMMU_REGION_ISM : 14129 SFMMU_REGION_HME; 14130 14131 if (r_type == SFMMU_REGION_ISM) { 14132 ASSERT(SFMMU_IS_ISMRID_VALID(rid)); 14133 ASSERT(rid < SFMMU_MAX_ISM_REGIONS); 14134 rgnp = srdp->srd_ismrgnp[rid]; 14135 } else { 14136 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 14137 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 14138 rgnp = srdp->srd_hmergnp[rid]; 14139 } 14140 ASSERT(rgnp != NULL); 14141 ASSERT(rgnp->rgn_id == rid); 14142 ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == r_type); 14143 ASSERT(!(rgnp->rgn_flags & SFMMU_REGION_FREE)); 14144 ASSERT(AS_LOCK_HELD(sfmmup->sfmmu_as)); 14145 14146 if (sfmmup->sfmmu_free) { 14147 ulong_t rttecnt; 14148 r_pgszc = rgnp->rgn_pgszc; 14149 r_size = rgnp->rgn_size; 14150 14151 ASSERT(sfmmup->sfmmu_scdp == NULL); 14152 if (r_type == SFMMU_REGION_ISM) { 14153 SF_RGNMAP_DEL(sfmmup->sfmmu_ismregion_map, rid); 14154 } else { 14155 /* update shme rgns ttecnt in sfmmu_ttecnt */ 14156 rttecnt = r_size >> TTE_PAGE_SHIFT(r_pgszc); 14157 ASSERT(sfmmup->sfmmu_ttecnt[r_pgszc] >= rttecnt); 14158 14159 atomic_add_long(&sfmmup->sfmmu_ttecnt[r_pgszc], 14160 -rttecnt); 14161 14162 SF_RGNMAP_DEL(sfmmup->sfmmu_hmeregion_map, rid); 14163 } 14164 } else if (r_type == SFMMU_REGION_ISM) { 14165 hatlockp = sfmmu_hat_enter(sfmmup); 14166 ASSERT(rid < srdp->srd_next_ismrid); 14167 SF_RGNMAP_DEL(sfmmup->sfmmu_ismregion_map, rid); 14168 scdp = sfmmup->sfmmu_scdp; 14169 if (scdp != NULL && 14170 SF_RGNMAP_TEST(scdp->scd_ismregion_map, rid)) { 14171 sfmmu_leave_scd(sfmmup, r_type); 14172 ASSERT(sfmmu_hat_lock_held(sfmmup)); 14173 } 14174 sfmmu_hat_exit(hatlockp); 14175 } else { 14176 ulong_t rttecnt; 14177 r_pgszc = rgnp->rgn_pgszc; 14178 r_saddr = rgnp->rgn_saddr; 14179 r_size = rgnp->rgn_size; 14180 r_eaddr = r_saddr + r_size; 14181 14182 ASSERT(r_type == SFMMU_REGION_HME); 14183 hatlockp = sfmmu_hat_enter(sfmmup); 14184 ASSERT(rid < srdp->srd_next_hmerid); 14185 SF_RGNMAP_DEL(sfmmup->sfmmu_hmeregion_map, rid); 14186 14187 /* 14188 * If region is part of an SCD call sfmmu_leave_scd(). 14189 * Otherwise if process is not exiting and has valid context 14190 * just drop the context on the floor to lose stale TLB 14191 * entries and force the update of tsb miss area to reflect 14192 * the new region map. After that clean our TSB entries. 14193 */ 14194 scdp = sfmmup->sfmmu_scdp; 14195 if (scdp != NULL && 14196 SF_RGNMAP_TEST(scdp->scd_hmeregion_map, rid)) { 14197 sfmmu_leave_scd(sfmmup, r_type); 14198 ASSERT(sfmmu_hat_lock_held(sfmmup)); 14199 } 14200 sfmmu_invalidate_ctx(sfmmup); 14201 14202 i = TTE8K; 14203 while (i < mmu_page_sizes) { 14204 if (rgnp->rgn_ttecnt[i] != 0) { 14205 sfmmu_unload_tsb_range(sfmmup, r_saddr, 14206 r_eaddr, i); 14207 if (i < TTE4M) { 14208 i = TTE4M; 14209 continue; 14210 } else { 14211 break; 14212 } 14213 } 14214 i++; 14215 } 14216 /* Remove the preallocated 1/4 8k ttecnt for 4M regions. 
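 * As a worked example of the arithmetic (hypothetical numbers): for a
 * 32MB region mapped with 4M pages, r_size >> TTE_PAGE_SHIFT(TTE8K) is
 * 4096 8K entries, so r_size >> (TTE_PAGE_SHIFT(TTE8K) + 2) removes the
 * 1024 entries that hat_join_region() added to sfmmu_tsb0_4minflcnt when
 * the region was joined.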
*/ 14217 if (r_pgszc >= TTE4M) { 14218 rttecnt = r_size >> (TTE_PAGE_SHIFT(TTE8K) + 2); 14219 ASSERT(sfmmup->sfmmu_tsb0_4minflcnt >= 14220 rttecnt); 14221 sfmmup->sfmmu_tsb0_4minflcnt -= rttecnt; 14222 } 14223 14224 /* update shme rgns ttecnt in sfmmu_ttecnt */ 14225 rttecnt = r_size >> TTE_PAGE_SHIFT(r_pgszc); 14226 ASSERT(sfmmup->sfmmu_ttecnt[r_pgszc] >= rttecnt); 14227 atomic_add_long(&sfmmup->sfmmu_ttecnt[r_pgszc], -rttecnt); 14228 14229 sfmmu_hat_exit(hatlockp); 14230 if (scdp != NULL && sfmmup->sfmmu_scdp == NULL) { 14231 /* sfmmup left the scd, grow private tsb */ 14232 sfmmu_check_page_sizes(sfmmup, 1); 14233 } else { 14234 sfmmu_check_page_sizes(sfmmup, 0); 14235 } 14236 } 14237 14238 if (r_type == SFMMU_REGION_HME) { 14239 sfmmu_unlink_from_hmeregion(sfmmup, rgnp); 14240 } 14241 14242 r_obj = rgnp->rgn_obj; 14243 if (atomic_dec_32_nv((volatile uint_t *)&rgnp->rgn_refcnt)) { 14244 return; 14245 } 14246 14247 /* 14248 * looks like nobody uses this region anymore. Free it. 14249 */ 14250 rhash = RGN_HASH_FUNCTION(r_obj); 14251 mutex_enter(&srdp->srd_mutex); 14252 for (prev_rgnpp = &srdp->srd_rgnhash[rhash]; 14253 (cur_rgnp = *prev_rgnpp) != NULL; 14254 prev_rgnpp = &cur_rgnp->rgn_hash) { 14255 if (cur_rgnp == rgnp && cur_rgnp->rgn_refcnt == 0) { 14256 break; 14257 } 14258 } 14259 14260 if (cur_rgnp == NULL) { 14261 mutex_exit(&srdp->srd_mutex); 14262 return; 14263 } 14264 14265 ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == r_type); 14266 *prev_rgnpp = rgnp->rgn_hash; 14267 if (r_type == SFMMU_REGION_ISM) { 14268 rgnp->rgn_flags |= SFMMU_REGION_FREE; 14269 ASSERT(rid < srdp->srd_next_ismrid); 14270 rgnp->rgn_next = srdp->srd_ismrgnfree; 14271 srdp->srd_ismrgnfree = rgnp; 14272 ASSERT(srdp->srd_ismbusyrgns > 0); 14273 srdp->srd_ismbusyrgns--; 14274 mutex_exit(&srdp->srd_mutex); 14275 return; 14276 } 14277 mutex_exit(&srdp->srd_mutex); 14278 14279 /* 14280 * Destroy region's hmeblks. 14281 */ 14282 sfmmu_unload_hmeregion(srdp, rgnp); 14283 14284 rgnp->rgn_hmeflags = 0; 14285 14286 ASSERT(rgnp->rgn_sfmmu_head == NULL); 14287 ASSERT(rgnp->rgn_id == rid); 14288 for (i = 0; i < MMU_PAGE_SIZES; i++) { 14289 rgnp->rgn_ttecnt[i] = 0; 14290 } 14291 rgnp->rgn_flags |= SFMMU_REGION_FREE; 14292 mutex_enter(&srdp->srd_mutex); 14293 ASSERT(rid < srdp->srd_next_hmerid); 14294 rgnp->rgn_next = srdp->srd_hmergnfree; 14295 srdp->srd_hmergnfree = rgnp; 14296 ASSERT(srdp->srd_hmebusyrgns > 0); 14297 srdp->srd_hmebusyrgns--; 14298 mutex_exit(&srdp->srd_mutex); 14299 } 14300 14301 /* 14302 * For now only called for hmeblk regions and not for ISM regions. 
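 *
 * Illustrative call site (a simplified sketch of a fork-time segment
 * duplication path; not a verbatim quote of segvn_dup()):
 *
 *	if (svd->rcookie != HAT_INVALID_REGION_COOKIE) {
 *		newsvd->rcookie = svd->rcookie;
 *		hat_dup_region(newseg->s_as->a_hat, newsvd->rcookie);
 *	}
 *
 * i.e. the child inherits the parent's cookie, and only the per-hat
 * bookkeeping (refcount, region bitmap, ttecnt, tsb0 inflation) is
 * replicated below.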
14303 */ 14304 void 14305 hat_dup_region(struct hat *sfmmup, hat_region_cookie_t rcookie) 14306 { 14307 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 14308 uint_t rid = (uint_t)((uint64_t)rcookie); 14309 sf_region_t *rgnp; 14310 sf_rgn_link_t *rlink; 14311 sf_rgn_link_t *hrlink; 14312 ulong_t rttecnt; 14313 14314 ASSERT(sfmmup != ksfmmup); 14315 ASSERT(srdp != NULL); 14316 ASSERT(srdp->srd_refcnt > 0); 14317 14318 ASSERT(rid < srdp->srd_next_hmerid); 14319 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 14320 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 14321 14322 rgnp = srdp->srd_hmergnp[rid]; 14323 ASSERT(rgnp->rgn_refcnt > 0); 14324 ASSERT(rgnp->rgn_id == rid); 14325 ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == SFMMU_REGION_HME); 14326 ASSERT(!(rgnp->rgn_flags & SFMMU_REGION_FREE)); 14327 14328 atomic_inc_32((volatile uint_t *)&rgnp->rgn_refcnt); 14329 14330 /* LINTED: constant in conditional context */ 14331 SFMMU_HMERID2RLINKP(sfmmup, rid, rlink, 1, 0); 14332 ASSERT(rlink != NULL); 14333 mutex_enter(&rgnp->rgn_mutex); 14334 ASSERT(rgnp->rgn_sfmmu_head != NULL); 14335 /* LINTED: constant in conditional context */ 14336 SFMMU_HMERID2RLINKP(rgnp->rgn_sfmmu_head, rid, hrlink, 0, 0); 14337 ASSERT(hrlink != NULL); 14338 ASSERT(hrlink->prev == NULL); 14339 rlink->next = rgnp->rgn_sfmmu_head; 14340 rlink->prev = NULL; 14341 hrlink->prev = sfmmup; 14342 /* 14343 * make sure rlink's next field is correct 14344 * before making this link visible. 14345 */ 14346 membar_stst(); 14347 rgnp->rgn_sfmmu_head = sfmmup; 14348 mutex_exit(&rgnp->rgn_mutex); 14349 14350 /* update sfmmu_ttecnt with the shme rgn ttecnt */ 14351 rttecnt = rgnp->rgn_size >> TTE_PAGE_SHIFT(rgnp->rgn_pgszc); 14352 atomic_add_long(&sfmmup->sfmmu_ttecnt[rgnp->rgn_pgszc], rttecnt); 14353 /* update tsb0 inflation count */ 14354 if (rgnp->rgn_pgszc >= TTE4M) { 14355 sfmmup->sfmmu_tsb0_4minflcnt += 14356 rgnp->rgn_size >> (TTE_PAGE_SHIFT(TTE8K) + 2); 14357 } 14358 /* 14359 * Update regionid bitmask without hat lock since no other thread 14360 * can update this region bitmask right now. 
14361 */ 14362 SF_RGNMAP_ADD(sfmmup->sfmmu_hmeregion_map, rid); 14363 } 14364 14365 /* ARGSUSED */ 14366 static int 14367 sfmmu_rgncache_constructor(void *buf, void *cdrarg, int kmflags) 14368 { 14369 sf_region_t *rgnp = (sf_region_t *)buf; 14370 bzero(buf, sizeof (*rgnp)); 14371 14372 mutex_init(&rgnp->rgn_mutex, NULL, MUTEX_DEFAULT, NULL); 14373 14374 return (0); 14375 } 14376 14377 /* ARGSUSED */ 14378 static void 14379 sfmmu_rgncache_destructor(void *buf, void *cdrarg) 14380 { 14381 sf_region_t *rgnp = (sf_region_t *)buf; 14382 mutex_destroy(&rgnp->rgn_mutex); 14383 } 14384 14385 static int 14386 sfrgnmap_isnull(sf_region_map_t *map) 14387 { 14388 int i; 14389 14390 for (i = 0; i < SFMMU_RGNMAP_WORDS; i++) { 14391 if (map->bitmap[i] != 0) { 14392 return (0); 14393 } 14394 } 14395 return (1); 14396 } 14397 14398 static int 14399 sfhmergnmap_isnull(sf_hmeregion_map_t *map) 14400 { 14401 int i; 14402 14403 for (i = 0; i < SFMMU_HMERGNMAP_WORDS; i++) { 14404 if (map->bitmap[i] != 0) { 14405 return (0); 14406 } 14407 } 14408 return (1); 14409 } 14410 14411 #ifdef DEBUG 14412 static void 14413 check_scd_sfmmu_list(sfmmu_t **headp, sfmmu_t *sfmmup, int onlist) 14414 { 14415 sfmmu_t *sp; 14416 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 14417 14418 for (sp = *headp; sp != NULL; sp = sp->sfmmu_scd_link.next) { 14419 ASSERT(srdp == sp->sfmmu_srdp); 14420 if (sp == sfmmup) { 14421 if (onlist) { 14422 return; 14423 } else { 14424 panic("shctx: sfmmu 0x%p found on scd" 14425 "list 0x%p", (void *)sfmmup, 14426 (void *)*headp); 14427 } 14428 } 14429 } 14430 if (onlist) { 14431 panic("shctx: sfmmu 0x%p not found on scd list 0x%p", 14432 (void *)sfmmup, (void *)*headp); 14433 } else { 14434 return; 14435 } 14436 } 14437 #else /* DEBUG */ 14438 #define check_scd_sfmmu_list(headp, sfmmup, onlist) 14439 #endif /* DEBUG */ 14440 14441 /* 14442 * Removes an sfmmu from the SCD sfmmu list. 14443 */ 14444 static void 14445 sfmmu_from_scd_list(sfmmu_t **headp, sfmmu_t *sfmmup) 14446 { 14447 ASSERT(sfmmup->sfmmu_srdp != NULL); 14448 check_scd_sfmmu_list(headp, sfmmup, 1); 14449 if (sfmmup->sfmmu_scd_link.prev != NULL) { 14450 ASSERT(*headp != sfmmup); 14451 sfmmup->sfmmu_scd_link.prev->sfmmu_scd_link.next = 14452 sfmmup->sfmmu_scd_link.next; 14453 } else { 14454 ASSERT(*headp == sfmmup); 14455 *headp = sfmmup->sfmmu_scd_link.next; 14456 } 14457 if (sfmmup->sfmmu_scd_link.next != NULL) { 14458 sfmmup->sfmmu_scd_link.next->sfmmu_scd_link.prev = 14459 sfmmup->sfmmu_scd_link.prev; 14460 } 14461 } 14462 14463 14464 /* 14465 * Adds an sfmmu to the start of the queue. 14466 */ 14467 static void 14468 sfmmu_to_scd_list(sfmmu_t **headp, sfmmu_t *sfmmup) 14469 { 14470 check_scd_sfmmu_list(headp, sfmmup, 0); 14471 sfmmup->sfmmu_scd_link.prev = NULL; 14472 sfmmup->sfmmu_scd_link.next = *headp; 14473 if (*headp != NULL) 14474 (*headp)->sfmmu_scd_link.prev = sfmmup; 14475 *headp = sfmmup; 14476 } 14477 14478 /* 14479 * Remove an scd from the start of the queue. 14480 */ 14481 static void 14482 sfmmu_remove_scd(sf_scd_t **headp, sf_scd_t *scdp) 14483 { 14484 if (scdp->scd_prev != NULL) { 14485 ASSERT(*headp != scdp); 14486 scdp->scd_prev->scd_next = scdp->scd_next; 14487 } else { 14488 ASSERT(*headp == scdp); 14489 *headp = scdp->scd_next; 14490 } 14491 14492 if (scdp->scd_next != NULL) { 14493 scdp->scd_next->scd_prev = scdp->scd_prev; 14494 } 14495 } 14496 14497 /* 14498 * Add an scd to the start of the queue. 
14499 */ 14500 static void 14501 sfmmu_add_scd(sf_scd_t **headp, sf_scd_t *scdp) 14502 { 14503 scdp->scd_prev = NULL; 14504 scdp->scd_next = *headp; 14505 if (*headp != NULL) { 14506 (*headp)->scd_prev = scdp; 14507 } 14508 *headp = scdp; 14509 } 14510 14511 static int 14512 sfmmu_alloc_scd_tsbs(sf_srd_t *srdp, sf_scd_t *scdp) 14513 { 14514 uint_t rid; 14515 uint_t i; 14516 uint_t j; 14517 ulong_t w; 14518 sf_region_t *rgnp; 14519 ulong_t tte8k_cnt = 0; 14520 ulong_t tte4m_cnt = 0; 14521 uint_t tsb_szc; 14522 sfmmu_t *scsfmmup = scdp->scd_sfmmup; 14523 sfmmu_t *ism_hatid; 14524 struct tsb_info *newtsb; 14525 int szc; 14526 14527 ASSERT(srdp != NULL); 14528 14529 for (i = 0; i < SFMMU_RGNMAP_WORDS; i++) { 14530 if ((w = scdp->scd_region_map.bitmap[i]) == 0) { 14531 continue; 14532 } 14533 j = 0; 14534 while (w) { 14535 if (!(w & 0x1)) { 14536 j++; 14537 w >>= 1; 14538 continue; 14539 } 14540 rid = (i << BT_ULSHIFT) | j; 14541 j++; 14542 w >>= 1; 14543 14544 if (rid < SFMMU_MAX_HME_REGIONS) { 14545 rgnp = srdp->srd_hmergnp[rid]; 14546 ASSERT(rgnp->rgn_id == rid); 14547 ASSERT(rgnp->rgn_refcnt > 0); 14548 14549 if (rgnp->rgn_pgszc < TTE4M) { 14550 tte8k_cnt += rgnp->rgn_size >> 14551 TTE_PAGE_SHIFT(TTE8K); 14552 } else { 14553 ASSERT(rgnp->rgn_pgszc >= TTE4M); 14554 tte4m_cnt += rgnp->rgn_size >> 14555 TTE_PAGE_SHIFT(TTE4M); 14556 /* 14557 * Inflate SCD tsb0 by preallocating 14558 * 1/4 8k ttecnt for 4M regions to 14559 * allow for lgpg alloc failure. 14560 */ 14561 tte8k_cnt += rgnp->rgn_size >> 14562 (TTE_PAGE_SHIFT(TTE8K) + 2); 14563 } 14564 } else { 14565 rid -= SFMMU_MAX_HME_REGIONS; 14566 rgnp = srdp->srd_ismrgnp[rid]; 14567 ASSERT(rgnp->rgn_id == rid); 14568 ASSERT(rgnp->rgn_refcnt > 0); 14569 14570 ism_hatid = (sfmmu_t *)rgnp->rgn_obj; 14571 ASSERT(ism_hatid->sfmmu_ismhat); 14572 14573 for (szc = 0; szc < TTE4M; szc++) { 14574 tte8k_cnt += 14575 ism_hatid->sfmmu_ttecnt[szc] << 14576 TTE_BSZS_SHIFT(szc); 14577 } 14578 14579 ASSERT(rgnp->rgn_pgszc >= TTE4M); 14580 if (rgnp->rgn_pgszc >= TTE4M) { 14581 tte4m_cnt += rgnp->rgn_size >> 14582 TTE_PAGE_SHIFT(TTE4M); 14583 } 14584 } 14585 } 14586 } 14587 14588 tsb_szc = SELECT_TSB_SIZECODE(tte8k_cnt); 14589 14590 /* Allocate both the SCD TSBs here. */ 14591 if (sfmmu_tsbinfo_alloc(&scsfmmup->sfmmu_tsb, 14592 tsb_szc, TSB8K|TSB64K|TSB512K, TSB_ALLOC, scsfmmup) && 14593 (tsb_szc <= TSB_4M_SZCODE || 14594 sfmmu_tsbinfo_alloc(&scsfmmup->sfmmu_tsb, 14595 TSB_4M_SZCODE, TSB8K|TSB64K|TSB512K, 14596 TSB_ALLOC, scsfmmup))) { 14597 14598 SFMMU_STAT(sf_scd_1sttsb_allocfail); 14599 return (TSB_ALLOCFAIL); 14600 } else { 14601 scsfmmup->sfmmu_tsb->tsb_flags |= TSB_SHAREDCTX; 14602 14603 if (tte4m_cnt) { 14604 tsb_szc = SELECT_TSB_SIZECODE(tte4m_cnt); 14605 if (sfmmu_tsbinfo_alloc(&newtsb, tsb_szc, 14606 TSB4M|TSB32M|TSB256M, TSB_ALLOC, scsfmmup) && 14607 (tsb_szc <= TSB_4M_SZCODE || 14608 sfmmu_tsbinfo_alloc(&newtsb, TSB_4M_SZCODE, 14609 TSB4M|TSB32M|TSB256M, 14610 TSB_ALLOC, scsfmmup))) { 14611 /* 14612 * If we fail to allocate the 2nd shared tsb, 14613 * just free the 1st tsb, return failure. 
14614 */ 14615 sfmmu_tsbinfo_free(scsfmmup->sfmmu_tsb); 14616 SFMMU_STAT(sf_scd_2ndtsb_allocfail); 14617 return (TSB_ALLOCFAIL); 14618 } else { 14619 ASSERT(scsfmmup->sfmmu_tsb->tsb_next == NULL); 14620 newtsb->tsb_flags |= TSB_SHAREDCTX; 14621 scsfmmup->sfmmu_tsb->tsb_next = newtsb; 14622 SFMMU_STAT(sf_scd_2ndtsb_alloc); 14623 } 14624 } 14625 SFMMU_STAT(sf_scd_1sttsb_alloc); 14626 } 14627 return (TSB_SUCCESS); 14628 } 14629 14630 static void 14631 sfmmu_free_scd_tsbs(sfmmu_t *scd_sfmmu) 14632 { 14633 while (scd_sfmmu->sfmmu_tsb != NULL) { 14634 struct tsb_info *next = scd_sfmmu->sfmmu_tsb->tsb_next; 14635 sfmmu_tsbinfo_free(scd_sfmmu->sfmmu_tsb); 14636 scd_sfmmu->sfmmu_tsb = next; 14637 } 14638 } 14639 14640 /* 14641 * Link the sfmmu onto the hme region list. 14642 */ 14643 void 14644 sfmmu_link_to_hmeregion(sfmmu_t *sfmmup, sf_region_t *rgnp) 14645 { 14646 uint_t rid; 14647 sf_rgn_link_t *rlink; 14648 sfmmu_t *head; 14649 sf_rgn_link_t *hrlink; 14650 14651 rid = rgnp->rgn_id; 14652 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 14653 14654 /* LINTED: constant in conditional context */ 14655 SFMMU_HMERID2RLINKP(sfmmup, rid, rlink, 1, 1); 14656 ASSERT(rlink != NULL); 14657 mutex_enter(&rgnp->rgn_mutex); 14658 if ((head = rgnp->rgn_sfmmu_head) == NULL) { 14659 rlink->next = NULL; 14660 rlink->prev = NULL; 14661 /* 14662 * make sure rlink's next field is NULL 14663 * before making this link visible. 14664 */ 14665 membar_stst(); 14666 rgnp->rgn_sfmmu_head = sfmmup; 14667 } else { 14668 /* LINTED: constant in conditional context */ 14669 SFMMU_HMERID2RLINKP(head, rid, hrlink, 0, 0); 14670 ASSERT(hrlink != NULL); 14671 ASSERT(hrlink->prev == NULL); 14672 rlink->next = head; 14673 rlink->prev = NULL; 14674 hrlink->prev = sfmmup; 14675 /* 14676 * make sure rlink's next field is correct 14677 * before making this link visible. 14678 */ 14679 membar_stst(); 14680 rgnp->rgn_sfmmu_head = sfmmup; 14681 } 14682 mutex_exit(&rgnp->rgn_mutex); 14683 } 14684 14685 /* 14686 * Unlink the sfmmu from the hme region list. 14687 */ 14688 void 14689 sfmmu_unlink_from_hmeregion(sfmmu_t *sfmmup, sf_region_t *rgnp) 14690 { 14691 uint_t rid; 14692 sf_rgn_link_t *rlink; 14693 14694 rid = rgnp->rgn_id; 14695 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 14696 14697 /* LINTED: constant in conditional context */ 14698 SFMMU_HMERID2RLINKP(sfmmup, rid, rlink, 0, 0); 14699 ASSERT(rlink != NULL); 14700 mutex_enter(&rgnp->rgn_mutex); 14701 if (rgnp->rgn_sfmmu_head == sfmmup) { 14702 sfmmu_t *next = rlink->next; 14703 rgnp->rgn_sfmmu_head = next; 14704 /* 14705 * if we are stopped by xc_attention() after this 14706 * point the forward link walking in 14707 * sfmmu_rgntlb_demap() will work correctly since the 14708 * head correctly points to the next element. 
14709 */ 14710 membar_stst(); 14711 rlink->next = NULL; 14712 ASSERT(rlink->prev == NULL); 14713 if (next != NULL) { 14714 sf_rgn_link_t *nrlink; 14715 /* LINTED: constant in conditional context */ 14716 SFMMU_HMERID2RLINKP(next, rid, nrlink, 0, 0); 14717 ASSERT(nrlink != NULL); 14718 ASSERT(nrlink->prev == sfmmup); 14719 nrlink->prev = NULL; 14720 } 14721 } else { 14722 sfmmu_t *next = rlink->next; 14723 sfmmu_t *prev = rlink->prev; 14724 sf_rgn_link_t *prlink; 14725 14726 ASSERT(prev != NULL); 14727 /* LINTED: constant in conditional context */ 14728 SFMMU_HMERID2RLINKP(prev, rid, prlink, 0, 0); 14729 ASSERT(prlink != NULL); 14730 ASSERT(prlink->next == sfmmup); 14731 prlink->next = next; 14732 /* 14733 * if we are stopped by xc_attention() 14734 * after this point the forward link walking 14735 * will work correctly since the prev element 14736 * correctly points to the next element. 14737 */ 14738 membar_stst(); 14739 rlink->next = NULL; 14740 rlink->prev = NULL; 14741 if (next != NULL) { 14742 sf_rgn_link_t *nrlink; 14743 /* LINTED: constant in conditional context */ 14744 SFMMU_HMERID2RLINKP(next, rid, nrlink, 0, 0); 14745 ASSERT(nrlink != NULL); 14746 ASSERT(nrlink->prev == sfmmup); 14747 nrlink->prev = prev; 14748 } 14749 } 14750 mutex_exit(&rgnp->rgn_mutex); 14751 } 14752 14753 /* 14754 * Link scd sfmmu onto ism or hme region list for each region in the 14755 * scd region map. 14756 */ 14757 void 14758 sfmmu_link_scd_to_regions(sf_srd_t *srdp, sf_scd_t *scdp) 14759 { 14760 uint_t rid; 14761 uint_t i; 14762 uint_t j; 14763 ulong_t w; 14764 sf_region_t *rgnp; 14765 sfmmu_t *scsfmmup; 14766 14767 scsfmmup = scdp->scd_sfmmup; 14768 ASSERT(scsfmmup->sfmmu_scdhat); 14769 for (i = 0; i < SFMMU_RGNMAP_WORDS; i++) { 14770 if ((w = scdp->scd_region_map.bitmap[i]) == 0) { 14771 continue; 14772 } 14773 j = 0; 14774 while (w) { 14775 if (!(w & 0x1)) { 14776 j++; 14777 w >>= 1; 14778 continue; 14779 } 14780 rid = (i << BT_ULSHIFT) | j; 14781 j++; 14782 w >>= 1; 14783 14784 if (rid < SFMMU_MAX_HME_REGIONS) { 14785 rgnp = srdp->srd_hmergnp[rid]; 14786 ASSERT(rgnp->rgn_id == rid); 14787 ASSERT(rgnp->rgn_refcnt > 0); 14788 sfmmu_link_to_hmeregion(scsfmmup, rgnp); 14789 } else { 14790 sfmmu_t *ism_hatid = NULL; 14791 ism_ment_t *ism_ment; 14792 rid -= SFMMU_MAX_HME_REGIONS; 14793 rgnp = srdp->srd_ismrgnp[rid]; 14794 ASSERT(rgnp->rgn_id == rid); 14795 ASSERT(rgnp->rgn_refcnt > 0); 14796 14797 ism_hatid = (sfmmu_t *)rgnp->rgn_obj; 14798 ASSERT(ism_hatid->sfmmu_ismhat); 14799 ism_ment = &scdp->scd_ism_links[rid]; 14800 ism_ment->iment_hat = scsfmmup; 14801 ism_ment->iment_base_va = rgnp->rgn_saddr; 14802 mutex_enter(&ism_mlist_lock); 14803 iment_add(ism_ment, ism_hatid); 14804 mutex_exit(&ism_mlist_lock); 14805 14806 } 14807 } 14808 } 14809 } 14810 /* 14811 * Unlink scd sfmmu from ism or hme region list for each region in the 14812 * scd region map. 
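 *
 * The walk below mirrors sfmmu_link_scd_to_regions() above: each set bit
 * in scd_region_map encodes a region id as
 *
 *	rid = (word_index << BT_ULSHIFT) | bit_index;
 *
 * ids below SFMMU_MAX_HME_REGIONS name hme regions, and the remainder,
 * after subtracting SFMMU_MAX_HME_REGIONS, index srd_ismrgnp[]. For
 * example, with 64-bit map words, bit 3 of word 2 is rid (2 << 6) | 3,
 * i.e. 131.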
14813 */ 14814 void 14815 sfmmu_unlink_scd_from_regions(sf_srd_t *srdp, sf_scd_t *scdp) 14816 { 14817 uint_t rid; 14818 uint_t i; 14819 uint_t j; 14820 ulong_t w; 14821 sf_region_t *rgnp; 14822 sfmmu_t *scsfmmup; 14823 14824 scsfmmup = scdp->scd_sfmmup; 14825 for (i = 0; i < SFMMU_RGNMAP_WORDS; i++) { 14826 if ((w = scdp->scd_region_map.bitmap[i]) == 0) { 14827 continue; 14828 } 14829 j = 0; 14830 while (w) { 14831 if (!(w & 0x1)) { 14832 j++; 14833 w >>= 1; 14834 continue; 14835 } 14836 rid = (i << BT_ULSHIFT) | j; 14837 j++; 14838 w >>= 1; 14839 14840 if (rid < SFMMU_MAX_HME_REGIONS) { 14841 rgnp = srdp->srd_hmergnp[rid]; 14842 ASSERT(rgnp->rgn_id == rid); 14843 ASSERT(rgnp->rgn_refcnt > 0); 14844 sfmmu_unlink_from_hmeregion(scsfmmup, 14845 rgnp); 14846 14847 } else { 14848 sfmmu_t *ism_hatid = NULL; 14849 ism_ment_t *ism_ment; 14850 rid -= SFMMU_MAX_HME_REGIONS; 14851 rgnp = srdp->srd_ismrgnp[rid]; 14852 ASSERT(rgnp->rgn_id == rid); 14853 ASSERT(rgnp->rgn_refcnt > 0); 14854 14855 ism_hatid = (sfmmu_t *)rgnp->rgn_obj; 14856 ASSERT(ism_hatid->sfmmu_ismhat); 14857 ism_ment = &scdp->scd_ism_links[rid]; 14858 ASSERT(ism_ment->iment_hat == scdp->scd_sfmmup); 14859 ASSERT(ism_ment->iment_base_va == 14860 rgnp->rgn_saddr); 14861 mutex_enter(&ism_mlist_lock); 14862 iment_sub(ism_ment, ism_hatid); 14863 mutex_exit(&ism_mlist_lock); 14864 14865 } 14866 } 14867 } 14868 } 14869 /* 14870 * Allocates and initialises a new SCD structure, this is called with 14871 * the srd_scd_mutex held and returns with the reference count 14872 * initialised to 1. 14873 */ 14874 static sf_scd_t * 14875 sfmmu_alloc_scd(sf_srd_t *srdp, sf_region_map_t *new_map) 14876 { 14877 sf_scd_t *new_scdp; 14878 sfmmu_t *scsfmmup; 14879 int i; 14880 14881 ASSERT(MUTEX_HELD(&srdp->srd_scd_mutex)); 14882 new_scdp = kmem_cache_alloc(scd_cache, KM_SLEEP); 14883 14884 scsfmmup = kmem_cache_alloc(sfmmuid_cache, KM_SLEEP); 14885 new_scdp->scd_sfmmup = scsfmmup; 14886 scsfmmup->sfmmu_srdp = srdp; 14887 scsfmmup->sfmmu_scdp = new_scdp; 14888 scsfmmup->sfmmu_tsb0_4minflcnt = 0; 14889 scsfmmup->sfmmu_scdhat = 1; 14890 CPUSET_ALL(scsfmmup->sfmmu_cpusran); 14891 bzero(scsfmmup->sfmmu_hmeregion_links, SFMMU_L1_HMERLINKS_SIZE); 14892 14893 ASSERT(max_mmu_ctxdoms > 0); 14894 for (i = 0; i < max_mmu_ctxdoms; i++) { 14895 scsfmmup->sfmmu_ctxs[i].cnum = INVALID_CONTEXT; 14896 scsfmmup->sfmmu_ctxs[i].gnum = 0; 14897 } 14898 14899 for (i = 0; i < MMU_PAGE_SIZES; i++) { 14900 new_scdp->scd_rttecnt[i] = 0; 14901 } 14902 14903 new_scdp->scd_region_map = *new_map; 14904 new_scdp->scd_refcnt = 1; 14905 if (sfmmu_alloc_scd_tsbs(srdp, new_scdp) != TSB_SUCCESS) { 14906 kmem_cache_free(scd_cache, new_scdp); 14907 kmem_cache_free(sfmmuid_cache, scsfmmup); 14908 return (NULL); 14909 } 14910 if (&mmu_init_scd) { 14911 mmu_init_scd(new_scdp); 14912 } 14913 return (new_scdp); 14914 } 14915 14916 /* 14917 * The first phase of a process joining an SCD. The hat structure is 14918 * linked to the SCD queue and then the HAT_JOIN_SCD sfmmu flag is set 14919 * and a cross-call with context invalidation is used to cause the 14920 * remaining work to be carried out in the sfmmu_tsbmiss_exception() 14921 * routine. 
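 *
 * Schematically (an informal sketch of the flow, not additional code):
 *
 *	sfmmu_join_scd()                  - here, with the as lock held
 *	    move sfmmup onto scdp->scd_sf_list
 *	    set HAT_JOIN_SCD, sfmmu_invalidate_ctx(sfmmup)
 *	    fold the shared-region ttecnt into sfmmu_scdrttecnt[]
 *
 *	sfmmu_finish_join_scd()           - later, from the TSB-miss path
 *	    invalidate the private TSB contents
 *	    set HAT_CTX1_FLAG on the SCD's ISM maps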
14922 */ 14923 static void 14924 sfmmu_join_scd(sf_scd_t *scdp, sfmmu_t *sfmmup) 14925 { 14926 hatlock_t *hatlockp; 14927 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 14928 int i; 14929 sf_scd_t *old_scdp; 14930 14931 ASSERT(srdp != NULL); 14932 ASSERT(scdp != NULL); 14933 ASSERT(scdp->scd_refcnt > 0); 14934 ASSERT(AS_WRITE_HELD(sfmmup->sfmmu_as)); 14935 14936 if ((old_scdp = sfmmup->sfmmu_scdp) != NULL) { 14937 ASSERT(old_scdp != scdp); 14938 14939 mutex_enter(&old_scdp->scd_mutex); 14940 sfmmu_from_scd_list(&old_scdp->scd_sf_list, sfmmup); 14941 mutex_exit(&old_scdp->scd_mutex); 14942 /* 14943 * sfmmup leaves the old scd. Update sfmmu_ttecnt to 14944 * include the shme rgn ttecnt for rgns that 14945 * were in the old SCD 14946 */ 14947 for (i = 0; i < mmu_page_sizes; i++) { 14948 ASSERT(sfmmup->sfmmu_scdrttecnt[i] == 14949 old_scdp->scd_rttecnt[i]); 14950 atomic_add_long(&sfmmup->sfmmu_ttecnt[i], 14951 sfmmup->sfmmu_scdrttecnt[i]); 14952 } 14953 } 14954 14955 /* 14956 * Move sfmmu to the scd lists. 14957 */ 14958 mutex_enter(&scdp->scd_mutex); 14959 sfmmu_to_scd_list(&scdp->scd_sf_list, sfmmup); 14960 mutex_exit(&scdp->scd_mutex); 14961 SF_SCD_INCR_REF(scdp); 14962 14963 hatlockp = sfmmu_hat_enter(sfmmup); 14964 /* 14965 * For a multi-thread process, we must stop 14966 * all the other threads before joining the scd. 14967 */ 14968 14969 SFMMU_FLAGS_SET(sfmmup, HAT_JOIN_SCD); 14970 14971 sfmmu_invalidate_ctx(sfmmup); 14972 sfmmup->sfmmu_scdp = scdp; 14973 14974 /* 14975 * Copy scd_rttecnt into sfmmup's sfmmu_scdrttecnt, and update 14976 * sfmmu_ttecnt to not include the rgn ttecnt just joined in SCD. 14977 */ 14978 for (i = 0; i < mmu_page_sizes; i++) { 14979 sfmmup->sfmmu_scdrttecnt[i] = scdp->scd_rttecnt[i]; 14980 ASSERT(sfmmup->sfmmu_ttecnt[i] >= scdp->scd_rttecnt[i]); 14981 atomic_add_long(&sfmmup->sfmmu_ttecnt[i], 14982 -sfmmup->sfmmu_scdrttecnt[i]); 14983 } 14984 /* update tsb0 inflation count */ 14985 if (old_scdp != NULL) { 14986 sfmmup->sfmmu_tsb0_4minflcnt += 14987 old_scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt; 14988 } 14989 ASSERT(sfmmup->sfmmu_tsb0_4minflcnt >= 14990 scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt); 14991 sfmmup->sfmmu_tsb0_4minflcnt -= scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt; 14992 14993 sfmmu_hat_exit(hatlockp); 14994 14995 if (old_scdp != NULL) { 14996 SF_SCD_DECR_REF(srdp, old_scdp); 14997 } 14998 14999 } 15000 15001 /* 15002 * This routine is called by a process to become part of an SCD. It is called 15003 * from sfmmu_tsbmiss_exception() once most of the initial work has been 15004 * done by sfmmu_join_scd(). This routine must not drop the hat lock. 
15005  */
15006 static void
15007 sfmmu_finish_join_scd(sfmmu_t *sfmmup)
15008 {
15009 	struct tsb_info	*tsbinfop;
15010 
15011 	ASSERT(sfmmu_hat_lock_held(sfmmup));
15012 	ASSERT(sfmmup->sfmmu_scdp != NULL);
15013 	ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD));
15014 	ASSERT(!SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY));
15015 	ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_ALLCTX_INVALID));
15016 
15017 	for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL;
15018 	    tsbinfop = tsbinfop->tsb_next) {
15019 		if (tsbinfop->tsb_flags & TSB_SWAPPED) {
15020 			continue;
15021 		}
15022 		ASSERT(!(tsbinfop->tsb_flags & TSB_RELOC_FLAG));
15023 
15024 		sfmmu_inv_tsb(tsbinfop->tsb_va,
15025 		    TSB_BYTES(tsbinfop->tsb_szc));
15026 	}
15027 
15028 	/* Set HAT_CTX1_FLAG for all SCD ISMs */
15029 	sfmmu_ism_hatflags(sfmmup, 1);
15030 
15031 	SFMMU_STAT(sf_join_scd);
15032 }
15033 
15034 /*
15035  * This routine is called in order to check if there is an SCD which matches
15036  * the process's region map; if not, a new SCD may be created.
15037  */
15038 static void
15039 sfmmu_find_scd(sfmmu_t *sfmmup)
15040 {
15041 	sf_srd_t *srdp = sfmmup->sfmmu_srdp;
15042 	sf_scd_t *scdp, *new_scdp;
15043 	int ret;
15044 
15045 	ASSERT(srdp != NULL);
15046 	ASSERT(AS_WRITE_HELD(sfmmup->sfmmu_as));
15047 
15048 	mutex_enter(&srdp->srd_scd_mutex);
15049 	for (scdp = srdp->srd_scdp; scdp != NULL;
15050 	    scdp = scdp->scd_next) {
15051 		SF_RGNMAP_EQUAL(&scdp->scd_region_map,
15052 		    &sfmmup->sfmmu_region_map, ret);
15053 		if (ret == 1) {
15054 			SF_SCD_INCR_REF(scdp);
15055 			mutex_exit(&srdp->srd_scd_mutex);
15056 			sfmmu_join_scd(scdp, sfmmup);
15057 			ASSERT(scdp->scd_refcnt >= 2);
15058 			atomic_dec_32((volatile uint32_t *)&scdp->scd_refcnt);
15059 			return;
15060 		} else {
15061 			/*
15062 			 * If the sfmmu region map is a subset of the scd
15063 			 * region map, then the assumption is that this process
15064 			 * will continue attaching to ISM segments until the
15065 			 * region maps are equal.
15066 			 */
15067 			SF_RGNMAP_IS_SUBSET(&scdp->scd_region_map,
15068 			    &sfmmup->sfmmu_region_map, ret);
15069 			if (ret == 1) {
15070 				mutex_exit(&srdp->srd_scd_mutex);
15071 				return;
15072 			}
15073 		}
15074 	}
15075 
15076 	ASSERT(scdp == NULL);
15077 	/*
15078 	 * No matching SCD has been found; create a new one.
15079 	 */
15080 	if ((new_scdp = sfmmu_alloc_scd(srdp, &sfmmup->sfmmu_region_map)) ==
15081 	    NULL) {
15082 		mutex_exit(&srdp->srd_scd_mutex);
15083 		return;
15084 	}
15085 
15086 	/*
15087 	 * sfmmu_alloc_scd() returns with a ref count of 1 on the scd.
15088 	 */
15089 
15090 	/* Set scd_rttecnt for shme rgns in SCD */
15091 	sfmmu_set_scd_rttecnt(srdp, new_scdp);
15092 
15093 	/*
15094 	 * Link scd onto srd_scdp list and scd sfmmu onto region/iment lists.
15095 	 */
15096 	sfmmu_link_scd_to_regions(srdp, new_scdp);
15097 	sfmmu_add_scd(&srdp->srd_scdp, new_scdp);
15098 	SFMMU_STAT_ADD(sf_create_scd, 1);
15099 
15100 	mutex_exit(&srdp->srd_scd_mutex);
15101 	sfmmu_join_scd(new_scdp, sfmmup);
15102 	ASSERT(new_scdp->scd_refcnt >= 2);
15103 	atomic_dec_32((volatile uint32_t *)&new_scdp->scd_refcnt);
15104 }
15105 
15106 /*
15107  * This routine is called by a process to remove itself from an SCD. It is
15108  * either called when the process has detached from a segment or from
15109  * hat_free_start() as a result of calling exit.
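 *
 * Informally, the accounting that joining and leaving keep balanced for
 * every page size szc is
 *
 *	sfmmu_ttecnt[szc] + sfmmu_scdrttecnt[szc]
 *	    == the ttecnt this hat would carry were it not in an SCD
 *
 * so leaving folds sfmmu_scdrttecnt[] (and, when the process is not
 * exiting, sfmmu_scdismttecnt[] into sfmmu_ismttecnt[]) back into the
 * private counters and zeroes the SCD copies, as the loops below do.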
15110 */ 15111 static void 15112 sfmmu_leave_scd(sfmmu_t *sfmmup, uchar_t r_type) 15113 { 15114 sf_scd_t *scdp = sfmmup->sfmmu_scdp; 15115 sf_srd_t *srdp = sfmmup->sfmmu_srdp; 15116 hatlock_t *hatlockp = TSB_HASH(sfmmup); 15117 int i; 15118 15119 ASSERT(scdp != NULL); 15120 ASSERT(srdp != NULL); 15121 15122 if (sfmmup->sfmmu_free) { 15123 /* 15124 * If the process is part of an SCD the sfmmu is unlinked 15125 * from scd_sf_list. 15126 */ 15127 mutex_enter(&scdp->scd_mutex); 15128 sfmmu_from_scd_list(&scdp->scd_sf_list, sfmmup); 15129 mutex_exit(&scdp->scd_mutex); 15130 /* 15131 * Update sfmmu_ttecnt to include the rgn ttecnt for rgns that 15132 * are about to leave the SCD 15133 */ 15134 for (i = 0; i < mmu_page_sizes; i++) { 15135 ASSERT(sfmmup->sfmmu_scdrttecnt[i] == 15136 scdp->scd_rttecnt[i]); 15137 atomic_add_long(&sfmmup->sfmmu_ttecnt[i], 15138 sfmmup->sfmmu_scdrttecnt[i]); 15139 sfmmup->sfmmu_scdrttecnt[i] = 0; 15140 } 15141 sfmmup->sfmmu_scdp = NULL; 15142 15143 SF_SCD_DECR_REF(srdp, scdp); 15144 return; 15145 } 15146 15147 ASSERT(r_type != SFMMU_REGION_ISM || 15148 SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)); 15149 ASSERT(scdp->scd_refcnt); 15150 ASSERT(!sfmmup->sfmmu_free); 15151 ASSERT(sfmmu_hat_lock_held(sfmmup)); 15152 ASSERT(AS_LOCK_HELD(sfmmup->sfmmu_as)); 15153 15154 /* 15155 * Wait for ISM maps to be updated. 15156 */ 15157 if (r_type != SFMMU_REGION_ISM) { 15158 while (SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY) && 15159 sfmmup->sfmmu_scdp != NULL) { 15160 cv_wait(&sfmmup->sfmmu_tsb_cv, 15161 HATLOCK_MUTEXP(hatlockp)); 15162 } 15163 15164 if (sfmmup->sfmmu_scdp == NULL) { 15165 sfmmu_hat_exit(hatlockp); 15166 return; 15167 } 15168 SFMMU_FLAGS_SET(sfmmup, HAT_ISMBUSY); 15169 } 15170 15171 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD)) { 15172 SFMMU_FLAGS_CLEAR(sfmmup, HAT_JOIN_SCD); 15173 /* 15174 * Since HAT_JOIN_SCD was set our context 15175 * is still invalid. 15176 */ 15177 } else { 15178 /* 15179 * For a multi-thread process, we must stop 15180 * all the other threads before leaving the scd. 15181 */ 15182 15183 sfmmu_invalidate_ctx(sfmmup); 15184 } 15185 15186 /* Clear all the rid's for ISM, delete flags, etc */ 15187 ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)); 15188 sfmmu_ism_hatflags(sfmmup, 0); 15189 15190 /* 15191 * Update sfmmu_ttecnt to include the rgn ttecnt for rgns that 15192 * are in SCD before this sfmmup leaves the SCD. 15193 */ 15194 for (i = 0; i < mmu_page_sizes; i++) { 15195 ASSERT(sfmmup->sfmmu_scdrttecnt[i] == 15196 scdp->scd_rttecnt[i]); 15197 atomic_add_long(&sfmmup->sfmmu_ttecnt[i], 15198 sfmmup->sfmmu_scdrttecnt[i]); 15199 sfmmup->sfmmu_scdrttecnt[i] = 0; 15200 /* update ismttecnt to include SCD ism before hat leaves SCD */ 15201 sfmmup->sfmmu_ismttecnt[i] += sfmmup->sfmmu_scdismttecnt[i]; 15202 sfmmup->sfmmu_scdismttecnt[i] = 0; 15203 } 15204 /* update tsb0 inflation count */ 15205 sfmmup->sfmmu_tsb0_4minflcnt += scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt; 15206 15207 if (r_type != SFMMU_REGION_ISM) { 15208 SFMMU_FLAGS_CLEAR(sfmmup, HAT_ISMBUSY); 15209 } 15210 sfmmup->sfmmu_scdp = NULL; 15211 15212 sfmmu_hat_exit(hatlockp); 15213 15214 /* 15215 * Unlink sfmmu from scd_sf_list this can be done without holding 15216 * the hat lock as we hold the sfmmu_as lock which prevents 15217 * hat_join_region from adding this thread to the scd again. Other 15218 * threads check if sfmmu_scdp is NULL under hat lock and if it's NULL 15219 * they won't get here, since sfmmu_leave_scd() clears sfmmu_scdp 15220 * while holding the hat lock. 
15221 */ 15222 mutex_enter(&scdp->scd_mutex); 15223 sfmmu_from_scd_list(&scdp->scd_sf_list, sfmmup); 15224 mutex_exit(&scdp->scd_mutex); 15225 SFMMU_STAT(sf_leave_scd); 15226 15227 SF_SCD_DECR_REF(srdp, scdp); 15228 hatlockp = sfmmu_hat_enter(sfmmup); 15229 15230 } 15231 15232 /* 15233 * Unlink and free up an SCD structure with a reference count of 0. 15234 */ 15235 static void 15236 sfmmu_destroy_scd(sf_srd_t *srdp, sf_scd_t *scdp, sf_region_map_t *scd_rmap) 15237 { 15238 sfmmu_t *scsfmmup; 15239 sf_scd_t *sp; 15240 hatlock_t *shatlockp; 15241 int i, ret; 15242 15243 mutex_enter(&srdp->srd_scd_mutex); 15244 for (sp = srdp->srd_scdp; sp != NULL; sp = sp->scd_next) { 15245 if (sp == scdp) 15246 break; 15247 } 15248 if (sp == NULL || sp->scd_refcnt) { 15249 mutex_exit(&srdp->srd_scd_mutex); 15250 return; 15251 } 15252 15253 /* 15254 * It is possible that the scd has been freed and reallocated with a 15255 * different region map while we've been waiting for the srd_scd_mutex. 15256 */ 15257 SF_RGNMAP_EQUAL(scd_rmap, &sp->scd_region_map, ret); 15258 if (ret != 1) { 15259 mutex_exit(&srdp->srd_scd_mutex); 15260 return; 15261 } 15262 15263 ASSERT(scdp->scd_sf_list == NULL); 15264 /* 15265 * Unlink scd from srd_scdp list. 15266 */ 15267 sfmmu_remove_scd(&srdp->srd_scdp, scdp); 15268 mutex_exit(&srdp->srd_scd_mutex); 15269 15270 sfmmu_unlink_scd_from_regions(srdp, scdp); 15271 15272 /* Clear shared context tsb and release ctx */ 15273 scsfmmup = scdp->scd_sfmmup; 15274 15275 /* 15276 * create a barrier so that scd will not be destroyed 15277 * if other thread still holds the same shared hat lock. 15278 * E.g., sfmmu_tsbmiss_exception() needs to acquire the 15279 * shared hat lock before checking the shared tsb reloc flag. 15280 */ 15281 shatlockp = sfmmu_hat_enter(scsfmmup); 15282 sfmmu_hat_exit(shatlockp); 15283 15284 sfmmu_free_scd_tsbs(scsfmmup); 15285 15286 for (i = 0; i < SFMMU_L1_HMERLINKS; i++) { 15287 if (scsfmmup->sfmmu_hmeregion_links[i] != NULL) { 15288 kmem_free(scsfmmup->sfmmu_hmeregion_links[i], 15289 SFMMU_L2_HMERLINKS_SIZE); 15290 scsfmmup->sfmmu_hmeregion_links[i] = NULL; 15291 } 15292 } 15293 kmem_cache_free(sfmmuid_cache, scsfmmup); 15294 kmem_cache_free(scd_cache, scdp); 15295 SFMMU_STAT(sf_destroy_scd); 15296 } 15297 15298 /* 15299 * Modifies the HAT_CTX1_FLAG for each of the ISM segments which correspond to 15300 * bits which are set in the ism_region_map parameter. This flag indicates to 15301 * the tsbmiss handler that mapping for these segments should be loaded using 15302 * the shared context. 15303 */ 15304 static void 15305 sfmmu_ism_hatflags(sfmmu_t *sfmmup, int addflag) 15306 { 15307 sf_scd_t *scdp = sfmmup->sfmmu_scdp; 15308 ism_blk_t *ism_blkp; 15309 ism_map_t *ism_map; 15310 int i, rid; 15311 15312 ASSERT(sfmmup->sfmmu_iblk != NULL); 15313 ASSERT(scdp != NULL); 15314 /* 15315 * Note that the caller either set HAT_ISMBUSY flag or checked 15316 * under hat lock that HAT_ISMBUSY was not set by another thread. 
15317  */
15318 	ASSERT(sfmmu_hat_lock_held(sfmmup));
15319 
15320 	ism_blkp = sfmmup->sfmmu_iblk;
15321 	while (ism_blkp != NULL) {
15322 		ism_map = ism_blkp->iblk_maps;
15323 		for (i = 0; i < ISM_MAP_SLOTS && ism_map[i].imap_ismhat; i++) {
15324 			rid = ism_map[i].imap_rid;
15325 			if (rid == SFMMU_INVALID_ISMRID) {
15326 				continue;
15327 			}
15328 			ASSERT(rid >= 0 && rid < SFMMU_MAX_ISM_REGIONS);
15329 			if (SF_RGNMAP_TEST(scdp->scd_ismregion_map, rid) &&
15330 			    addflag) {
15331 				ism_map[i].imap_hatflags |=
15332 				    HAT_CTX1_FLAG;
15333 			} else {
15334 				ism_map[i].imap_hatflags &=
15335 				    ~HAT_CTX1_FLAG;
15336 			}
15337 		}
15338 		ism_blkp = ism_blkp->iblk_next;
15339 	}
15340 }
15341 
15342 static int
15343 sfmmu_srd_lock_held(sf_srd_t *srdp)
15344 {
15345 	return (MUTEX_HELD(&srdp->srd_mutex));
15346 }
15347 
15348 /* ARGSUSED */
15349 static int
15350 sfmmu_scdcache_constructor(void *buf, void *cdrarg, int kmflags)
15351 {
15352 	sf_scd_t *scdp = (sf_scd_t *)buf;
15353 
15354 	bzero(buf, sizeof (sf_scd_t));
15355 	mutex_init(&scdp->scd_mutex, NULL, MUTEX_DEFAULT, NULL);
15356 	return (0);
15357 }
15358 
15359 /* ARGSUSED */
15360 static void
15361 sfmmu_scdcache_destructor(void *buf, void *cdrarg)
15362 {
15363 	sf_scd_t *scdp = (sf_scd_t *)buf;
15364 
15365 	mutex_destroy(&scdp->scd_mutex);
15366 }
15367 
15368 /*
15369  * The listp parameter is a pointer to a list of hmeblks which are partially
15370  * freed as a result of calling sfmmu_hblk_hash_rm(). The last phase of the
15371  * freeing process is to cross-call all cpus to ensure that there are no
15372  * remaining cached references.
15373  *
15374  * If the local generation number is less than the global one, then we can free
15375  * hmeblks which are already on the pending queue, as another cpu has completed
15376  * the cross-call.
15377  *
15378  * We cross-call to make sure that there are no threads on other cpus accessing
15379  * these hmeblks and then complete the process of freeing them under the
15380  * following conditions:
15381  *	The total number of pending hmeblks is greater than the threshold
15382  *	The reserve list has fewer than HBLK_RESERVE_CNT hmeblks
15383  *	It is at least 1 second since the last time we cross-called
15384  *
15385  * Otherwise, we add the hmeblks to the per-cpu pending queue.
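 *
 * Expressed as an illustrative predicate (using the names of the function
 * below), the collected hmeblks are cross-called and freed immediately,
 * rather than queued, when
 *
 *	!dontfree && (freehblkcnt < HBLK_RESERVE_CNT ||
 *	    cpuhp->chp_count + count > cpu_hme_pend_thresh ||
 *	    now.tv_sec - cpuhp->chp_timestamp > 1)
 *
 * which is the test made once the per-cpu chp_mutex is held.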
15386  */
15387 static void
15388 sfmmu_hblks_list_purge(struct hme_blk **listp, int dontfree)
15389 {
15390 	struct hme_blk *hblkp, *pr_hblkp = NULL;
15391 	int		count = 0;
15392 	cpuset_t	cpuset = cpu_ready_set;
15393 	cpu_hme_pend_t	*cpuhp;
15394 	timestruc_t	now;
15395 	int		one_second_expired = 0;
15396 
15397 	gethrestime_lasttick(&now);
15398 
15399 	for (hblkp = *listp; hblkp != NULL; hblkp = hblkp->hblk_next) {
15400 		ASSERT(hblkp->hblk_shw_bit == 0);
15401 		ASSERT(hblkp->hblk_shared == 0);
15402 		count++;
15403 		pr_hblkp = hblkp;
15404 	}
15405 
15406 	cpuhp = &cpu_hme_pend[CPU->cpu_seqid];
15407 	mutex_enter(&cpuhp->chp_mutex);
15408 
15409 	if ((cpuhp->chp_count + count) == 0) {
15410 		mutex_exit(&cpuhp->chp_mutex);
15411 		return;
15412 	}
15413 
15414 	if ((now.tv_sec - cpuhp->chp_timestamp) > 1) {
15415 		one_second_expired = 1;
15416 	}
15417 
15418 	if (!dontfree && (freehblkcnt < HBLK_RESERVE_CNT ||
15419 	    (cpuhp->chp_count + count) > cpu_hme_pend_thresh ||
15420 	    one_second_expired)) {
15421 		/* Append global list to local */
15422 		if (pr_hblkp == NULL) {
15423 			*listp = cpuhp->chp_listp;
15424 		} else {
15425 			pr_hblkp->hblk_next = cpuhp->chp_listp;
15426 		}
15427 		cpuhp->chp_listp = NULL;
15428 		cpuhp->chp_count = 0;
15429 		cpuhp->chp_timestamp = now.tv_sec;
15430 		mutex_exit(&cpuhp->chp_mutex);
15431 
15432 		kpreempt_disable();
15433 		CPUSET_DEL(cpuset, CPU->cpu_id);
15434 		xt_sync(cpuset);
15435 		xt_sync(cpuset);
15436 		kpreempt_enable();
15437 
15438 		/*
15439 		 * At this stage we know that no trap handlers on other
15440 		 * cpus can have references to hmeblks on the list.
15441 		 */
15442 		sfmmu_hblk_free(listp);
15443 	} else if (*listp != NULL) {
15444 		pr_hblkp->hblk_next = cpuhp->chp_listp;
15445 		cpuhp->chp_listp = *listp;
15446 		cpuhp->chp_count += count;
15447 		*listp = NULL;
15448 		mutex_exit(&cpuhp->chp_mutex);
15449 	} else {
15450 		mutex_exit(&cpuhp->chp_mutex);
15451 	}
15452 }
15453 
15454 /*
15455  * Add an hmeblk to the hash list.
15456  */
15457 void
15458 sfmmu_hblk_hash_add(struct hmehash_bucket *hmebp, struct hme_blk *hmeblkp,
15459 	uint64_t hblkpa)
15460 {
15461 	ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));
15462 #ifdef	DEBUG
15463 	if (hmebp->hmeblkp == NULL) {
15464 		ASSERT(hmebp->hmeh_nextpa == HMEBLK_ENDPA);
15465 	}
15466 #endif /* DEBUG */
15467 
15468 	hmeblkp->hblk_nextpa = hmebp->hmeh_nextpa;
15469 	/*
15470 	 * Since the TSB miss handler now does not lock the hash chain before
15471 	 * walking it, make sure that the hmeblk's nextpa is globally visible
15472 	 * before we make the hmeblk globally visible by updating the chain root
15473 	 * pointer in the hash bucket.
15474 	 */
15475 	membar_producer();
15476 	hmebp->hmeh_nextpa = hblkpa;
15477 	hmeblkp->hblk_next = hmebp->hmeblkp;
15478 	hmebp->hmeblkp = hmeblkp;
15479 
15480 }
15481 
15482 /*
15483  * This function is the first part of a two-part process to remove an hmeblk
15484  * from the hash chain. In this phase we unlink the hmeblk from the hash chain
15485  * but leave the next physical pointer unchanged. The hmeblk is then linked onto
15486  * a per-cpu pending list using the virtual address pointer.
15487  *
15488  * TSB miss trap handlers that start after this phase will no longer see
15489  * this hmeblk. TSB miss handlers that still cache this hmeblk in a register
15490  * can still use it for further chain traversal because we haven't yet modified
15491  * the next physical pointer or freed it.
15492  *
15493  * In the second phase of hmeblk removal we'll issue a barrier xcall before
15494  * we reuse or free this hmeblk.
This will make sure all lingering references to 15495 * the hmeblk after first phase disappear before we finally reclaim it. 15496 * This scheme eliminates the need for TSB miss handlers to lock hmeblk chains 15497 * during their traversal. 15498 * 15499 * The hmehash_mutex must be held when calling this function. 15500 * 15501 * Input: 15502 * hmebp - hme hash bucket pointer 15503 * hmeblkp - address of hmeblk to be removed 15504 * pr_hblk - virtual address of previous hmeblkp 15505 * listp - pointer to list of hmeblks linked by virtual address 15506 * free_now flag - indicates that a complete removal from the hash chains 15507 * is necessary. 15508 * 15509 * It is inefficient to use the free_now flag as a cross-call is required to 15510 * remove a single hmeblk from the hash chain but is necessary when hmeblks are 15511 * in short supply. 15512 */ 15513 void 15514 sfmmu_hblk_hash_rm(struct hmehash_bucket *hmebp, struct hme_blk *hmeblkp, 15515 struct hme_blk *pr_hblk, struct hme_blk **listp, 15516 int free_now) 15517 { 15518 int shw_size, vshift; 15519 struct hme_blk *shw_hblkp; 15520 uint_t shw_mask, newshw_mask; 15521 caddr_t vaddr; 15522 int size; 15523 cpuset_t cpuset = cpu_ready_set; 15524 15525 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp)); 15526 15527 if (hmebp->hmeblkp == hmeblkp) { 15528 hmebp->hmeh_nextpa = hmeblkp->hblk_nextpa; 15529 hmebp->hmeblkp = hmeblkp->hblk_next; 15530 } else { 15531 pr_hblk->hblk_nextpa = hmeblkp->hblk_nextpa; 15532 pr_hblk->hblk_next = hmeblkp->hblk_next; 15533 } 15534 15535 size = get_hblk_ttesz(hmeblkp); 15536 shw_hblkp = hmeblkp->hblk_shadow; 15537 if (shw_hblkp) { 15538 ASSERT(hblktosfmmu(hmeblkp) != KHATID); 15539 ASSERT(!hmeblkp->hblk_shared); 15540 #ifdef DEBUG 15541 if (mmu_page_sizes == max_mmu_page_sizes) { 15542 ASSERT(size < TTE256M); 15543 } else { 15544 ASSERT(size < TTE4M); 15545 } 15546 #endif /* DEBUG */ 15547 15548 shw_size = get_hblk_ttesz(shw_hblkp); 15549 vaddr = (caddr_t)get_hblk_base(hmeblkp); 15550 vshift = vaddr_to_vshift(shw_hblkp->hblk_tag, vaddr, shw_size); 15551 ASSERT(vshift < 8); 15552 /* 15553 * Atomically clear shadow mask bit 15554 */ 15555 do { 15556 shw_mask = shw_hblkp->hblk_shw_mask; 15557 ASSERT(shw_mask & (1 << vshift)); 15558 newshw_mask = shw_mask & ~(1 << vshift); 15559 newshw_mask = atomic_cas_32(&shw_hblkp->hblk_shw_mask, 15560 shw_mask, newshw_mask); 15561 } while (newshw_mask != shw_mask); 15562 hmeblkp->hblk_shadow = NULL; 15563 } 15564 hmeblkp->hblk_shw_bit = 0; 15565 15566 if (hmeblkp->hblk_shared) { 15567 #ifdef DEBUG 15568 sf_srd_t *srdp; 15569 sf_region_t *rgnp; 15570 uint_t rid; 15571 15572 srdp = hblktosrd(hmeblkp); 15573 ASSERT(srdp != NULL && srdp->srd_refcnt != 0); 15574 rid = hmeblkp->hblk_tag.htag_rid; 15575 ASSERT(SFMMU_IS_SHMERID_VALID(rid)); 15576 ASSERT(rid < SFMMU_MAX_HME_REGIONS); 15577 rgnp = srdp->srd_hmergnp[rid]; 15578 ASSERT(rgnp != NULL); 15579 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid); 15580 #endif /* DEBUG */ 15581 hmeblkp->hblk_shared = 0; 15582 } 15583 if (free_now) { 15584 kpreempt_disable(); 15585 CPUSET_DEL(cpuset, CPU->cpu_id); 15586 xt_sync(cpuset); 15587 xt_sync(cpuset); 15588 kpreempt_enable(); 15589 15590 hmeblkp->hblk_nextpa = HMEBLK_ENDPA; 15591 hmeblkp->hblk_next = NULL; 15592 } else { 15593 /* Append hmeblkp to listp for processing later. 
*/ 15594 hmeblkp->hblk_next = *listp; 15595 *listp = hmeblkp; 15596 } 15597 } 15598 15599 /* 15600 * This routine is called when memory is in short supply and returns a free 15601 * hmeblk of the requested size from the cpu pending lists. 15602 */ 15603 static struct hme_blk * 15604 sfmmu_check_pending_hblks(int size) 15605 { 15606 int i; 15607 struct hme_blk *hmeblkp = NULL, *last_hmeblkp; 15608 int found_hmeblk; 15609 cpuset_t cpuset = cpu_ready_set; 15610 cpu_hme_pend_t *cpuhp; 15611 15612 /* Flush cpu hblk pending queues */ 15613 for (i = 0; i < NCPU; i++) { 15614 cpuhp = &cpu_hme_pend[i]; 15615 if (cpuhp->chp_listp != NULL) { 15616 mutex_enter(&cpuhp->chp_mutex); 15617 if (cpuhp->chp_listp == NULL) { 15618 mutex_exit(&cpuhp->chp_mutex); 15619 continue; 15620 } 15621 found_hmeblk = 0; 15622 last_hmeblkp = NULL; 15623 for (hmeblkp = cpuhp->chp_listp; hmeblkp != NULL; 15624 hmeblkp = hmeblkp->hblk_next) { 15625 if (get_hblk_ttesz(hmeblkp) == size) { 15626 if (last_hmeblkp == NULL) { 15627 cpuhp->chp_listp = 15628 hmeblkp->hblk_next; 15629 } else { 15630 last_hmeblkp->hblk_next = 15631 hmeblkp->hblk_next; 15632 } 15633 ASSERT(cpuhp->chp_count > 0); 15634 cpuhp->chp_count--; 15635 found_hmeblk = 1; 15636 break; 15637 } else { 15638 last_hmeblkp = hmeblkp; 15639 } 15640 } 15641 mutex_exit(&cpuhp->chp_mutex); 15642 15643 if (found_hmeblk) { 15644 kpreempt_disable(); 15645 CPUSET_DEL(cpuset, CPU->cpu_id); 15646 xt_sync(cpuset); 15647 xt_sync(cpuset); 15648 kpreempt_enable(); 15649 return (hmeblkp); 15650 } 15651 } 15652 } 15653 return (NULL); 15654 } 15655
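
/*
 * Illustrative (never compiled) sketch of how the two-phase removal
 * primitives above are paired by their callers in this file.  The hash
 * bucket search is elided and the variable names are only examples of
 * the pattern; they are not a verbatim quote of any one call site.
 */
#if 0
	struct hme_blk *list = NULL;

	SFMMU_HASH_LOCK(hmebp);
	/* phase 1: unlink from the hash chain, defer the actual free */
	sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, &list, 0);
	SFMMU_HASH_UNLOCK(hmebp);

	/*
	 * phase 2: once the hash walk is finished, either queue the
	 * collected hmeblks on the per-cpu pending list or cross-call
	 * and free them right away.
	 */
	sfmmu_hblks_list_purge(&list, 0);
#endif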