1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24 /*
25 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
26 */
27
28 /*
29 * VM - Hardware Address Translation management for Spitfire MMU.
30 *
31 * This file implements the machine specific hardware translation
32 * needed by the VM system. The machine independent interface is
33 * described in <vm/hat.h> while the machine dependent interface
34 * and data structures are described in <vm/hat_sfmmu.h>.
35 *
36 * The hat layer manages the address translation hardware as a cache
37 * driven by calls from the higher levels in the VM system.
38 */
39
40 #include <sys/types.h>
41 #include <sys/kstat.h>
42 #include <vm/hat.h>
43 #include <vm/hat_sfmmu.h>
44 #include <vm/page.h>
45 #include <sys/pte.h>
46 #include <sys/systm.h>
47 #include <sys/mman.h>
48 #include <sys/sysmacros.h>
49 #include <sys/machparam.h>
50 #include <sys/vtrace.h>
51 #include <sys/kmem.h>
52 #include <sys/mmu.h>
53 #include <sys/cmn_err.h>
54 #include <sys/cpu.h>
55 #include <sys/cpuvar.h>
56 #include <sys/debug.h>
57 #include <sys/lgrp.h>
58 #include <sys/archsystm.h>
59 #include <sys/machsystm.h>
60 #include <sys/vmsystm.h>
61 #include <vm/as.h>
62 #include <vm/seg.h>
63 #include <vm/seg_kp.h>
64 #include <vm/seg_kmem.h>
65 #include <vm/seg_kpm.h>
66 #include <vm/rm.h>
67 #include <sys/t_lock.h>
68 #include <sys/obpdefs.h>
69 #include <sys/vm_machparam.h>
70 #include <sys/var.h>
71 #include <sys/trap.h>
72 #include <sys/machtrap.h>
73 #include <sys/scb.h>
74 #include <sys/bitmap.h>
75 #include <sys/machlock.h>
76 #include <sys/membar.h>
77 #include <sys/atomic.h>
78 #include <sys/cpu_module.h>
79 #include <sys/prom_debug.h>
80 #include <sys/ksynch.h>
81 #include <sys/mem_config.h>
82 #include <sys/mem_cage.h>
83 #include <vm/vm_dep.h>
84 #include <vm/xhat_sfmmu.h>
85 #include <sys/fpu/fpusystm.h>
86 #include <vm/mach_kpm.h>
87 #include <sys/callb.h>
88
89 #ifdef DEBUG
90 #define SFMMU_VALIDATE_HMERID(hat, rid, saddr, len) \
91 if (SFMMU_IS_SHMERID_VALID(rid)) { \
92 caddr_t _eaddr = (saddr) + (len); \
93 sf_srd_t *_srdp; \
94 sf_region_t *_rgnp; \
95 ASSERT((rid) < SFMMU_MAX_HME_REGIONS); \
96 ASSERT(SF_RGNMAP_TEST(hat->sfmmu_hmeregion_map, rid)); \
97 ASSERT((hat) != ksfmmup); \
98 _srdp = (hat)->sfmmu_srdp; \
99 ASSERT(_srdp != NULL); \
100 ASSERT(_srdp->srd_refcnt != 0); \
101 _rgnp = _srdp->srd_hmergnp[(rid)]; \
102 ASSERT(_rgnp != NULL && _rgnp->rgn_id == rid); \
103 ASSERT(_rgnp->rgn_refcnt != 0); \
104 ASSERT(!(_rgnp->rgn_flags & SFMMU_REGION_FREE)); \
105 ASSERT((_rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == \
106 SFMMU_REGION_HME); \
107 ASSERT((saddr) >= _rgnp->rgn_saddr); \
108 ASSERT((saddr) < _rgnp->rgn_saddr + _rgnp->rgn_size); \
109 ASSERT(_eaddr > _rgnp->rgn_saddr); \
110 ASSERT(_eaddr <= _rgnp->rgn_saddr + _rgnp->rgn_size); \
111 }
112
113 #define SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid) \
114 { \
115 caddr_t _hsva; \
116 caddr_t _heva; \
117 caddr_t _rsva; \
118 caddr_t _reva; \
119 int _ttesz = get_hblk_ttesz(hmeblkp); \
120 int _flagtte; \
121 ASSERT((srdp)->srd_refcnt != 0); \
122 ASSERT((rid) < SFMMU_MAX_HME_REGIONS); \
123 ASSERT((rgnp)->rgn_id == rid); \
124 ASSERT(!((rgnp)->rgn_flags & SFMMU_REGION_FREE)); \
125 ASSERT(((rgnp)->rgn_flags & SFMMU_REGION_TYPE_MASK) == \
126 SFMMU_REGION_HME); \
127 ASSERT(_ttesz <= (rgnp)->rgn_pgszc); \
128 _hsva = (caddr_t)get_hblk_base(hmeblkp); \
129 _heva = get_hblk_endaddr(hmeblkp); \
130 _rsva = (caddr_t)P2ALIGN( \
131 (uintptr_t)(rgnp)->rgn_saddr, HBLK_MIN_BYTES); \
132 _reva = (caddr_t)P2ROUNDUP( \
133 (uintptr_t)((rgnp)->rgn_saddr + (rgnp)->rgn_size), \
134 HBLK_MIN_BYTES); \
135 ASSERT(_hsva >= _rsva); \
136 ASSERT(_hsva < _reva); \
137 ASSERT(_heva > _rsva); \
138 ASSERT(_heva <= _reva); \
139 _flagtte = (_ttesz < HBLK_MIN_TTESZ) ? HBLK_MIN_TTESZ : \
140 _ttesz; \
141 ASSERT(rgnp->rgn_hmeflags & (0x1 << _flagtte)); \
142 }
143
144 #else /* DEBUG */
145 #define SFMMU_VALIDATE_HMERID(hat, rid, addr, len)
146 #define SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid)
147 #endif /* DEBUG */
148
149 #if defined(SF_ERRATA_57)
150 extern caddr_t errata57_limit;
151 #endif
152
153 #define HME8BLK_SZ_RND ((roundup(HME8BLK_SZ, sizeof (int64_t))) / \
154 (sizeof (int64_t)))
155 #define HBLK_RESERVE ((struct hme_blk *)hblk_reserve)
156
157 #define HBLK_RESERVE_CNT 128
158 #define HBLK_RESERVE_MIN 20
159
160 static struct hme_blk *freehblkp;
161 static kmutex_t freehblkp_lock;
162 static int freehblkcnt;
163
164 static int64_t hblk_reserve[HME8BLK_SZ_RND];
165 static kmutex_t hblk_reserve_lock;
166 static kthread_t *hblk_reserve_thread;
167
168 static nucleus_hblk8_info_t nucleus_hblk8;
169 static nucleus_hblk1_info_t nucleus_hblk1;
170
171 /*
172 * Data to manage per-cpu hmeblk pending queues, hmeblks are queued here
173 * after the initial phase of removing an hmeblk from the hash chain, see
174 * the detailed comment in sfmmu_hblk_hash_rm() for further details.
175 */
176 static cpu_hme_pend_t *cpu_hme_pend;
177 static uint_t cpu_hme_pend_thresh;
178 /*
179 * SFMMU specific hat functions
180 */
181 void hat_pagecachectl(struct page *, int);
182
183 /* flags for hat_pagecachectl */
184 #define HAT_CACHE 0x1
185 #define HAT_UNCACHE 0x2
186 #define HAT_TMPNC 0x4
187
188 /*
189 * Flag to allow the creation of non-cacheable translations
190 * to system memory. It is off by default. At the moment this
191 * flag is used by the ecache error injector. The error injector
192 * will turn it on when creating such a translation then shut it
193 * off when it's finished.
194 */
195
196 int sfmmu_allow_nc_trans = 0;
197
198 /*
199 * Flag to disable large page support.
200 * value of 1 => disable all large pages.
201 * bits 1, 2, and 3 are to disable 64K, 512K and 4M pages respectively.
202 *
203 * For example, use the value 0x4 to disable 512K pages.
204 *
205 */
206 #define LARGE_PAGES_OFF 0x1
207
208 /*
209 * The disable_large_pages and disable_ism_large_pages variables control
210 * hat_memload_array and the page sizes to be used by ISM and the kernel.
211 *
212 * The disable_auto_data_large_pages and disable_auto_text_large_pages variables
213 * are only used to control which OOB pages to use at upper VM segment creation
214 * time, and are set in hat_init_pagesizes and used in the map_pgsz* routines.
215 * Their values may come from platform or CPU specific code to disable page
216 * sizes that should not be used.
217 *
218 * WARNING: 512K pages are currently not supported for ISM/DISM.
219 */
220 uint_t disable_large_pages = 0;
221 uint_t disable_ism_large_pages = (1 << TTE512K);
222 uint_t disable_auto_data_large_pages = 0;
223 uint_t disable_auto_text_large_pages = 0;
224
225 /*
226 * Private sfmmu data structures for hat management
227 */
228 static struct kmem_cache *sfmmuid_cache;
229 static struct kmem_cache *mmuctxdom_cache;
230
231 /*
232 * Private sfmmu data structures for tsb management
233 */
234 static struct kmem_cache *sfmmu_tsbinfo_cache;
235 static struct kmem_cache *sfmmu_tsb8k_cache;
236 static struct kmem_cache *sfmmu_tsb_cache[NLGRPS_MAX];
237 static vmem_t *kmem_bigtsb_arena;
238 static vmem_t *kmem_tsb_arena;
239
240 /*
241 * sfmmu static variables for hmeblk resource management.
242 */
243 static vmem_t *hat_memload1_arena; /* HAT translation arena for sfmmu1_cache */
244 static struct kmem_cache *sfmmu8_cache;
245 static struct kmem_cache *sfmmu1_cache;
246 static struct kmem_cache *pa_hment_cache;
247
248 static kmutex_t ism_mlist_lock; /* mutex for ism mapping list */
249 /*
250 * private data for ism
251 */
252 static struct kmem_cache *ism_blk_cache;
253 static struct kmem_cache *ism_ment_cache;
254 #define ISMID_STARTADDR NULL
255
256 /*
257 * Region management data structures and function declarations.
258 */
259
260 static void sfmmu_leave_srd(sfmmu_t *);
261 static int sfmmu_srdcache_constructor(void *, void *, int);
262 static void sfmmu_srdcache_destructor(void *, void *);
263 static int sfmmu_rgncache_constructor(void *, void *, int);
264 static void sfmmu_rgncache_destructor(void *, void *);
265 static int sfrgnmap_isnull(sf_region_map_t *);
266 static int sfhmergnmap_isnull(sf_hmeregion_map_t *);
267 static int sfmmu_scdcache_constructor(void *, void *, int);
268 static void sfmmu_scdcache_destructor(void *, void *);
269 static void sfmmu_rgn_cb_noop(caddr_t, caddr_t, caddr_t,
270 size_t, void *, u_offset_t);
271
272 static uint_t srd_hashmask = SFMMU_MAX_SRD_BUCKETS - 1;
273 static sf_srd_bucket_t *srd_buckets;
274 static struct kmem_cache *srd_cache;
275 static uint_t srd_rgn_hashmask = SFMMU_MAX_REGION_BUCKETS - 1;
276 static struct kmem_cache *region_cache;
277 static struct kmem_cache *scd_cache;
278
279 #ifdef sun4v
280 int use_bigtsb_arena = 1;
281 #else
282 int use_bigtsb_arena = 0;
283 #endif
284
285 /* External /etc/system tunable, for turning on&off the shctx support */
286 int disable_shctx = 0;
287 /* Internal variable, set by MD if the HW supports shctx feature */
288 int shctx_on = 0;
289
290 #ifdef DEBUG
291 static void check_scd_sfmmu_list(sfmmu_t **, sfmmu_t *, int);
292 #endif
293 static void sfmmu_to_scd_list(sfmmu_t **, sfmmu_t *);
294 static void sfmmu_from_scd_list(sfmmu_t **, sfmmu_t *);
295
296 static sf_scd_t *sfmmu_alloc_scd(sf_srd_t *, sf_region_map_t *);
297 static void sfmmu_find_scd(sfmmu_t *);
298 static void sfmmu_join_scd(sf_scd_t *, sfmmu_t *);
299 static void sfmmu_finish_join_scd(sfmmu_t *);
300 static void sfmmu_leave_scd(sfmmu_t *, uchar_t);
301 static void sfmmu_destroy_scd(sf_srd_t *, sf_scd_t *, sf_region_map_t *);
302 static int sfmmu_alloc_scd_tsbs(sf_srd_t *, sf_scd_t *);
303 static void sfmmu_free_scd_tsbs(sfmmu_t *);
304 static void sfmmu_tsb_inv_ctx(sfmmu_t *);
305 static int find_ism_rid(sfmmu_t *, sfmmu_t *, caddr_t, uint_t *);
306 static void sfmmu_ism_hatflags(sfmmu_t *, int);
307 static int sfmmu_srd_lock_held(sf_srd_t *);
308 static void sfmmu_remove_scd(sf_scd_t **, sf_scd_t *);
309 static void sfmmu_add_scd(sf_scd_t **headp, sf_scd_t *);
310 static void sfmmu_link_scd_to_regions(sf_srd_t *, sf_scd_t *);
311 static void sfmmu_unlink_scd_from_regions(sf_srd_t *, sf_scd_t *);
312 static void sfmmu_link_to_hmeregion(sfmmu_t *, sf_region_t *);
313 static void sfmmu_unlink_from_hmeregion(sfmmu_t *, sf_region_t *);
314
315 /*
316 * ``hat_lock'' is a hashed mutex lock for protecting sfmmu TSB lists,
317 * HAT flags, synchronizing TLB/TSB coherency, and context management.
318 * The lock is hashed on the sfmmup since the case where we need to lock
319 * all processes is rare but does occur (e.g. we need to unload a shared
320 * mapping from all processes using the mapping). We have a lot of buckets,
321 * and each slab of sfmmu_t's can use about a quarter of them, giving us
322 * a fairly good distribution without wasting too much space and overhead
323 * when we have to grab them all.
324 */
325 #define SFMMU_NUM_LOCK 128 /* must be power of two */
326 hatlock_t hat_lock[SFMMU_NUM_LOCK];
327
328 /*
329 * Hash algorithm optimized for a small number of slabs.
330 * 7 is (highbit((sizeof sfmmu_t)) - 1)
331 * This hash algorithm is based upon the knowledge that sfmmu_t's come from a
332 * kmem_cache, and thus they will be sequential within that cache. In
333 * addition, each new slab will have a different "color" up to cache_maxcolor
334 * which will skew the hashing for each successive slab which is allocated.
335 * If the size of sfmmu_t changed to a larger size, this algorithm may need
336 * to be revisited.
337 */
338 #define TSB_HASH_SHIFT_BITS (7)
339 #define PTR_HASH(x) ((uintptr_t)x >> TSB_HASH_SHIFT_BITS)
340
341 #ifdef DEBUG
342 int tsb_hash_debug = 0;
343 #define TSB_HASH(sfmmup) \
344 (tsb_hash_debug ? &hat_lock[0] : \
345 &hat_lock[PTR_HASH(sfmmup) & (SFMMU_NUM_LOCK-1)])
346 #else /* DEBUG */
347 #define TSB_HASH(sfmmup) &hat_lock[PTR_HASH(sfmmup) & (SFMMU_NUM_LOCK-1)]
348 #endif /* DEBUG */
349
350
351 /* sfmmu_replace_tsb() return codes. */
352 typedef enum tsb_replace_rc {
353 TSB_SUCCESS,
354 TSB_ALLOCFAIL,
355 TSB_LOSTRACE,
356 TSB_ALREADY_SWAPPED,
357 TSB_CANTGROW
358 } tsb_replace_rc_t;
359
360 /*
361 * Flags for TSB allocation routines.
362 */
363 #define TSB_ALLOC 0x01
364 #define TSB_FORCEALLOC 0x02
365 #define TSB_GROW 0x04
366 #define TSB_SHRINK 0x08
367 #define TSB_SWAPIN 0x10
368
369 /*
370 * Support for HAT callbacks.
371 */
372 #define SFMMU_MAX_RELOC_CALLBACKS 10
373 int sfmmu_max_cb_id = SFMMU_MAX_RELOC_CALLBACKS;
374 static id_t sfmmu_cb_nextid = 0;
375 static id_t sfmmu_tsb_cb_id;
376 struct sfmmu_callback *sfmmu_cb_table;
377
378 kmutex_t kpr_mutex;
379 kmutex_t kpr_suspendlock;
380 kthread_t *kreloc_thread;
381
382 /*
383 * Enable VA->PA translation sanity checking on DEBUG kernels.
384 * Disabled by default. This is incompatible with some
385 * drivers (error injector, RSM) so if it breaks you get
386 * to keep both pieces.
387 */
388 int hat_check_vtop = 0;
389
390 /*
391 * Private sfmmu routines (prototypes)
392 */
393 static struct hme_blk *sfmmu_shadow_hcreate(sfmmu_t *, caddr_t, int, uint_t);
394 static struct hme_blk *sfmmu_hblk_alloc(sfmmu_t *, caddr_t,
395 struct hmehash_bucket *, uint_t, hmeblk_tag, uint_t,
396 uint_t);
397 static caddr_t sfmmu_hblk_unload(struct hat *, struct hme_blk *, caddr_t,
398 caddr_t, demap_range_t *, uint_t);
399 static caddr_t sfmmu_hblk_sync(struct hat *, struct hme_blk *, caddr_t,
400 caddr_t, int);
401 static void sfmmu_hblk_free(struct hme_blk **);
402 static void sfmmu_hblks_list_purge(struct hme_blk **, int);
403 static uint_t sfmmu_get_free_hblk(struct hme_blk **, uint_t);
404 static uint_t sfmmu_put_free_hblk(struct hme_blk *, uint_t);
405 static struct hme_blk *sfmmu_hblk_steal(int);
406 static int sfmmu_steal_this_hblk(struct hmehash_bucket *,
407 struct hme_blk *, uint64_t, struct hme_blk *);
408 static caddr_t sfmmu_hblk_unlock(struct hme_blk *, caddr_t, caddr_t);
409
410 static void hat_do_memload_array(struct hat *, caddr_t, size_t,
411 struct page **, uint_t, uint_t, uint_t);
412 static void hat_do_memload(struct hat *, caddr_t, struct page *,
413 uint_t, uint_t, uint_t);
414 static void sfmmu_memload_batchsmall(struct hat *, caddr_t, page_t **,
415 uint_t, uint_t, pgcnt_t, uint_t);
416 void sfmmu_tteload(struct hat *, tte_t *, caddr_t, page_t *,
417 uint_t);
418 static int sfmmu_tteload_array(sfmmu_t *, tte_t *, caddr_t, page_t **,
419 uint_t, uint_t);
420 static struct hmehash_bucket *sfmmu_tteload_acquire_hashbucket(sfmmu_t *,
421 caddr_t, int, uint_t);
422 static struct hme_blk *sfmmu_tteload_find_hmeblk(sfmmu_t *,
423 struct hmehash_bucket *, caddr_t, uint_t, uint_t,
424 uint_t);
425 static int sfmmu_tteload_addentry(sfmmu_t *, struct hme_blk *, tte_t *,
426 caddr_t, page_t **, uint_t, uint_t);
427 static void sfmmu_tteload_release_hashbucket(struct hmehash_bucket *);
428
429 static int sfmmu_pagearray_setup(caddr_t, page_t **, tte_t *, int);
430 static pfn_t sfmmu_uvatopfn(caddr_t, sfmmu_t *, tte_t *);
431 void sfmmu_memtte(tte_t *, pfn_t, uint_t, int);
432 #ifdef VAC
433 static void sfmmu_vac_conflict(struct hat *, caddr_t, page_t *);
434 static int sfmmu_vacconflict_array(caddr_t, page_t *, int *);
435 int tst_tnc(page_t *pp, pgcnt_t);
436 void conv_tnc(page_t *pp, int);
437 #endif
438
439 static void sfmmu_get_ctx(sfmmu_t *);
440 static void sfmmu_free_sfmmu(sfmmu_t *);
441
442 static void sfmmu_ttesync(struct hat *, caddr_t, tte_t *, page_t *);
443 static void sfmmu_chgattr(struct hat *, caddr_t, size_t, uint_t, int);
444
445 cpuset_t sfmmu_pageunload(page_t *, struct sf_hment *, int);
446 static void hat_pagereload(struct page *, struct page *);
447 static cpuset_t sfmmu_pagesync(page_t *, struct sf_hment *, uint_t);
448 #ifdef VAC
449 void sfmmu_page_cache_array(page_t *, int, int, pgcnt_t);
450 static void sfmmu_page_cache(page_t *, int, int, int);
451 #endif
452
453 cpuset_t sfmmu_rgntlb_demap(caddr_t, sf_region_t *,
454 struct hme_blk *, int);
455 static void sfmmu_tlbcache_demap(caddr_t, sfmmu_t *, struct hme_blk *,
456 pfn_t, int, int, int, int);
457 static void sfmmu_ismtlbcache_demap(caddr_t, sfmmu_t *, struct hme_blk *,
458 pfn_t, int);
459 static void sfmmu_tlb_demap(caddr_t, sfmmu_t *, struct hme_blk *, int, int);
460 static void sfmmu_tlb_range_demap(demap_range_t *);
461 static void sfmmu_invalidate_ctx(sfmmu_t *);
462 static void sfmmu_sync_mmustate(sfmmu_t *);
463
464 static void sfmmu_tsbinfo_setup_phys(struct tsb_info *, pfn_t);
465 static int sfmmu_tsbinfo_alloc(struct tsb_info **, int, int, uint_t,
466 sfmmu_t *);
467 static void sfmmu_tsb_free(struct tsb_info *);
468 static void sfmmu_tsbinfo_free(struct tsb_info *);
469 static int sfmmu_init_tsbinfo(struct tsb_info *, int, int, uint_t,
470 sfmmu_t *);
471 static void sfmmu_tsb_chk_reloc(sfmmu_t *, hatlock_t *);
472 static void sfmmu_tsb_swapin(sfmmu_t *, hatlock_t *);
473 static int sfmmu_select_tsb_szc(pgcnt_t);
474 static void sfmmu_mod_tsb(sfmmu_t *, caddr_t, tte_t *, int);
475 #define sfmmu_load_tsb(sfmmup, vaddr, tte, szc) \
476 sfmmu_mod_tsb(sfmmup, vaddr, tte, szc)
477 #define sfmmu_unload_tsb(sfmmup, vaddr, szc) \
478 sfmmu_mod_tsb(sfmmup, vaddr, NULL, szc)
479 static void sfmmu_copy_tsb(struct tsb_info *, struct tsb_info *);
480 static tsb_replace_rc_t sfmmu_replace_tsb(sfmmu_t *, struct tsb_info *, uint_t,
481 hatlock_t *, uint_t);
482 static void sfmmu_size_tsb(sfmmu_t *, int, uint64_t, uint64_t, int);
483
484 #ifdef VAC
485 void sfmmu_cache_flush(pfn_t, int);
486 void sfmmu_cache_flushcolor(int, pfn_t);
487 #endif
488 static caddr_t sfmmu_hblk_chgattr(sfmmu_t *, struct hme_blk *, caddr_t,
489 caddr_t, demap_range_t *, uint_t, int);
490
491 static uint64_t sfmmu_vtop_attr(uint_t, int mode, tte_t *);
492 static uint_t sfmmu_ptov_attr(tte_t *);
493 static caddr_t sfmmu_hblk_chgprot(sfmmu_t *, struct hme_blk *, caddr_t,
494 caddr_t, demap_range_t *, uint_t);
495 static uint_t sfmmu_vtop_prot(uint_t, uint_t *);
496 static int sfmmu_idcache_constructor(void *, void *, int);
497 static void sfmmu_idcache_destructor(void *, void *);
498 static int sfmmu_hblkcache_constructor(void *, void *, int);
499 static void sfmmu_hblkcache_destructor(void *, void *);
500 static void sfmmu_hblkcache_reclaim(void *);
501 static void sfmmu_shadow_hcleanup(sfmmu_t *, struct hme_blk *,
502 struct hmehash_bucket *);
503 static void sfmmu_hblk_hash_rm(struct hmehash_bucket *, struct hme_blk *,
504 struct hme_blk *, struct hme_blk **, int);
505 static void sfmmu_hblk_hash_add(struct hmehash_bucket *, struct hme_blk *,
506 uint64_t);
507 static struct hme_blk *sfmmu_check_pending_hblks(int);
508 static void sfmmu_free_hblks(sfmmu_t *, caddr_t, caddr_t, int);
509 static void sfmmu_cleanup_rhblk(sf_srd_t *, caddr_t, uint_t, int);
510 static void sfmmu_unload_hmeregion_va(sf_srd_t *, uint_t, caddr_t, caddr_t,
511 int, caddr_t *);
512 static void sfmmu_unload_hmeregion(sf_srd_t *, sf_region_t *);
513
514 static void sfmmu_rm_large_mappings(page_t *, int);
515
516 static void hat_lock_init(void);
517 static void hat_kstat_init(void);
518 static int sfmmu_kstat_percpu_update(kstat_t *ksp, int rw);
519 static void sfmmu_set_scd_rttecnt(sf_srd_t *, sf_scd_t *);
520 static int sfmmu_is_rgnva(sf_srd_t *, caddr_t, ulong_t, ulong_t);
521 static void sfmmu_check_page_sizes(sfmmu_t *, int);
522 int fnd_mapping_sz(page_t *);
523 static void iment_add(struct ism_ment *, struct hat *);
524 static void iment_sub(struct ism_ment *, struct hat *);
525 static pgcnt_t ism_tsb_entries(sfmmu_t *, int szc);
526 extern void sfmmu_setup_tsbinfo(sfmmu_t *);
527 extern void sfmmu_clear_utsbinfo(void);
528
529 static void sfmmu_ctx_wrap_around(mmu_ctx_t *, boolean_t);
530
531 extern int vpm_enable;
532
533 /* kpm globals */
534 #ifdef DEBUG
535 /*
536 * Enable trap level tsbmiss handling
537 */
538 int kpm_tsbmtl = 1;
539
540 /*
541 * Flush the TLB on kpm mapout. Note: Xcalls are used (again) for the
542 * required TLB shootdowns in this case, so handle w/ care. Off by default.
543 */
544 int kpm_tlb_flush;
545 #endif /* DEBUG */
546
547 static void *sfmmu_vmem_xalloc_aligned_wrapper(vmem_t *, size_t, int);
548
549 #ifdef DEBUG
550 static void sfmmu_check_hblk_flist();
551 #endif
552
553 /*
554 * Semi-private sfmmu data structures. Some of them are initialize in
555 * startup or in hat_init. Some of them are private but accessed by
556 * assembly code or mach_sfmmu.c
557 */
558 struct hmehash_bucket *uhme_hash; /* user hmeblk hash table */
559 struct hmehash_bucket *khme_hash; /* kernel hmeblk hash table */
560 uint64_t uhme_hash_pa; /* PA of uhme_hash */
561 uint64_t khme_hash_pa; /* PA of khme_hash */
562 int uhmehash_num; /* # of buckets in user hash table */
563 int khmehash_num; /* # of buckets in kernel hash table */
564
565 uint_t max_mmu_ctxdoms = 0; /* max context domains in the system */
566 mmu_ctx_t **mmu_ctxs_tbl; /* global array of context domains */
567 uint64_t mmu_saved_gnum = 0; /* to init incoming MMUs' gnums */
568
569 #define DEFAULT_NUM_CTXS_PER_MMU 8192
570 static uint_t nctxs = DEFAULT_NUM_CTXS_PER_MMU;
571
572 int cache; /* describes system cache */
573
574 caddr_t ktsb_base; /* kernel 8k-indexed tsb base address */
575 uint64_t ktsb_pbase; /* kernel 8k-indexed tsb phys address */
576 int ktsb_szcode; /* kernel 8k-indexed tsb size code */
577 int ktsb_sz; /* kernel 8k-indexed tsb size */
578
579 caddr_t ktsb4m_base; /* kernel 4m-indexed tsb base address */
580 uint64_t ktsb4m_pbase; /* kernel 4m-indexed tsb phys address */
581 int ktsb4m_szcode; /* kernel 4m-indexed tsb size code */
582 int ktsb4m_sz; /* kernel 4m-indexed tsb size */
583
584 uint64_t kpm_tsbbase; /* kernel seg_kpm 4M TSB base address */
585 int kpm_tsbsz; /* kernel seg_kpm 4M TSB size code */
586 uint64_t kpmsm_tsbbase; /* kernel seg_kpm 8K TSB base address */
587 int kpmsm_tsbsz; /* kernel seg_kpm 8K TSB size code */
588
589 #ifndef sun4v
590 int utsb_dtlb_ttenum = -1; /* index in TLB for utsb locked TTE */
591 int utsb4m_dtlb_ttenum = -1; /* index in TLB for 4M TSB TTE */
592 int dtlb_resv_ttenum; /* index in TLB of first reserved TTE */
593 caddr_t utsb_vabase; /* reserved kernel virtual memory */
594 caddr_t utsb4m_vabase; /* for trap handler TSB accesses */
595 #endif /* sun4v */
596 uint64_t tsb_alloc_bytes = 0; /* bytes allocated to TSBs */
597 vmem_t *kmem_tsb_default_arena[NLGRPS_MAX]; /* For dynamic TSBs */
598 vmem_t *kmem_bigtsb_default_arena[NLGRPS_MAX]; /* dynamic 256M TSBs */
599
600 /*
601 * Size to use for TSB slabs. Future platforms that support page sizes
602 * larger than 4M may wish to change these values, and provide their own
603 * assembly macros for building and decoding the TSB base register contents.
604 * Note disable_large_pages will override the value set here.
605 */
606 static uint_t tsb_slab_ttesz = TTE4M;
607 size_t tsb_slab_size = MMU_PAGESIZE4M;
608 uint_t tsb_slab_shift = MMU_PAGESHIFT4M;
609 /* PFN mask for TTE */
610 size_t tsb_slab_mask = MMU_PAGEOFFSET4M >> MMU_PAGESHIFT;
611
612 /*
613 * Size to use for TSB slabs. These are used only when 256M tsb arenas
614 * exist.
615 */
616 static uint_t bigtsb_slab_ttesz = TTE256M;
617 static size_t bigtsb_slab_size = MMU_PAGESIZE256M;
618 static uint_t bigtsb_slab_shift = MMU_PAGESHIFT256M;
619 /* 256M page alignment for 8K pfn */
620 static size_t bigtsb_slab_mask = MMU_PAGEOFFSET256M >> MMU_PAGESHIFT;
621
622 /* largest TSB size to grow to, will be smaller on smaller memory systems */
623 static int tsb_max_growsize = 0;
624
625 /*
626 * Tunable parameters dealing with TSB policies.
627 */
628
629 /*
630 * This undocumented tunable forces all 8K TSBs to be allocated from
631 * the kernel heap rather than from the kmem_tsb_default_arena arenas.
632 */
633 #ifdef DEBUG
634 int tsb_forceheap = 0;
635 #endif /* DEBUG */
636
637 /*
638 * Decide whether to use per-lgroup arenas, or one global set of
639 * TSB arenas. The default is not to break up per-lgroup, since
640 * most platforms don't recognize any tangible benefit from it.
641 */
642 int tsb_lgrp_affinity = 0;
643
644 /*
645 * Used for growing the TSB based on the process RSS.
646 * tsb_rss_factor is based on the smallest TSB, and is
647 * shifted by the TSB size to determine if we need to grow.
648 * The default will grow the TSB if the number of TTEs for
649 * this page size exceeds 75% of the number of TSB entries,
650 * which should _almost_ eliminate all conflict misses
651 * (at the expense of using up lots and lots of memory).
652 */
653 #define TSB_RSS_FACTOR (TSB_ENTRIES(TSB_MIN_SZCODE) * 0.75)
654 #define SFMMU_RSS_TSBSIZE(tsbszc) (tsb_rss_factor << tsbszc)
655 #define SELECT_TSB_SIZECODE(pgcnt) ( \
656 (enable_tsb_rss_sizing)? sfmmu_select_tsb_szc(pgcnt) : \
657 default_tsb_size)
658 #define TSB_OK_SHRINK() \
659 (tsb_alloc_bytes > tsb_alloc_hiwater || freemem < desfree)
660 #define TSB_OK_GROW() \
661 (tsb_alloc_bytes < tsb_alloc_hiwater && freemem > desfree)
662
663 int enable_tsb_rss_sizing = 1;
664 int tsb_rss_factor = (int)TSB_RSS_FACTOR;
665
666 /* which TSB size code to use for new address spaces or if rss sizing off */
667 int default_tsb_size = TSB_8K_SZCODE;
668
669 static uint64_t tsb_alloc_hiwater; /* limit TSB reserved memory */
670 uint64_t tsb_alloc_hiwater_factor; /* tsb_alloc_hiwater = physmem / this */
671 #define TSB_ALLOC_HIWATER_FACTOR_DEFAULT 32
672
673 #ifdef DEBUG
674 static int tsb_random_size = 0; /* set to 1 to test random tsb sizes on alloc */
675 static int tsb_grow_stress = 0; /* if set to 1, keep replacing TSB w/ random */
676 static int tsb_alloc_mtbf = 0; /* fail allocation every n attempts */
677 static int tsb_alloc_fail_mtbf = 0;
678 static int tsb_alloc_count = 0;
679 #endif /* DEBUG */
680
681 /* if set to 1, will remap valid TTEs when growing TSB. */
682 int tsb_remap_ttes = 1;
683
684 /*
685 * If we have more than this many mappings, allocate a second TSB.
686 * This default is chosen because the I/D fully associative TLBs are
687 * assumed to have at least 8 available entries. Platforms with a
688 * larger fully-associative TLB could probably override the default.
689 */
690
691 #ifdef sun4v
692 int tsb_sectsb_threshold = 0;
693 #else
694 int tsb_sectsb_threshold = 8;
695 #endif
696
697 /*
698 * kstat data
699 */
700 struct sfmmu_global_stat sfmmu_global_stat;
701 struct sfmmu_tsbsize_stat sfmmu_tsbsize_stat;
702
703 /*
704 * Global data
705 */
706 sfmmu_t *ksfmmup; /* kernel's hat id */
707
708 #ifdef DEBUG
709 static void chk_tte(tte_t *, tte_t *, tte_t *, struct hme_blk *);
710 #endif
711
712 /* sfmmu locking operations */
713 static kmutex_t *sfmmu_mlspl_enter(struct page *, int);
714 static int sfmmu_mlspl_held(struct page *, int);
715
716 kmutex_t *sfmmu_page_enter(page_t *);
717 void sfmmu_page_exit(kmutex_t *);
718 int sfmmu_page_spl_held(struct page *);
719
720 /* sfmmu internal locking operations - accessed directly */
721 static void sfmmu_mlist_reloc_enter(page_t *, page_t *,
722 kmutex_t **, kmutex_t **);
723 static void sfmmu_mlist_reloc_exit(kmutex_t *, kmutex_t *);
724 static hatlock_t *
725 sfmmu_hat_enter(sfmmu_t *);
726 static hatlock_t *
727 sfmmu_hat_tryenter(sfmmu_t *);
728 static void sfmmu_hat_exit(hatlock_t *);
729 static void sfmmu_hat_lock_all(void);
730 static void sfmmu_hat_unlock_all(void);
731 static void sfmmu_ismhat_enter(sfmmu_t *, int);
732 static void sfmmu_ismhat_exit(sfmmu_t *, int);
733
734 kpm_hlk_t *kpmp_table;
735 uint_t kpmp_table_sz; /* must be a power of 2 */
736 uchar_t kpmp_shift;
737
738 kpm_shlk_t *kpmp_stable;
739 uint_t kpmp_stable_sz; /* must be a power of 2 */
740
741 /*
742 * SPL_TABLE_SIZE is 2 * NCPU, but no smaller than 128.
743 * SPL_SHIFT is log2(SPL_TABLE_SIZE).
744 */
745 #if ((2*NCPU_P2) > 128)
746 #define SPL_SHIFT ((unsigned)(NCPU_LOG2 + 1))
747 #else
748 #define SPL_SHIFT 7U
749 #endif
750 #define SPL_TABLE_SIZE (1U << SPL_SHIFT)
751 #define SPL_MASK (SPL_TABLE_SIZE - 1)
752
753 /*
754 * We shift by PP_SHIFT to take care of the low-order 0 bits of a page_t
755 * and by multiples of SPL_SHIFT to get as many varied bits as we can.
756 */
757 #define SPL_INDEX(pp) \
758 ((((uintptr_t)(pp) >> PP_SHIFT) ^ \
759 ((uintptr_t)(pp) >> (PP_SHIFT + SPL_SHIFT)) ^ \
760 ((uintptr_t)(pp) >> (PP_SHIFT + SPL_SHIFT * 2)) ^ \
761 ((uintptr_t)(pp) >> (PP_SHIFT + SPL_SHIFT * 3))) & \
762 SPL_MASK)
763
764 #define SPL_HASH(pp) \
765 (&sfmmu_page_lock[SPL_INDEX(pp)].pad_mutex)
766
767 static pad_mutex_t sfmmu_page_lock[SPL_TABLE_SIZE];
768
769 /* Array of mutexes protecting a page's mapping list and p_nrm field. */
770
771 #define MML_TABLE_SIZE SPL_TABLE_SIZE
772 #define MLIST_HASH(pp) (&mml_table[SPL_INDEX(pp)].pad_mutex)
773
774 static pad_mutex_t mml_table[MML_TABLE_SIZE];
775
776 /*
777 * hat_unload_callback() will group together callbacks in order
778 * to avoid xt_sync() calls. This is the maximum size of the group.
779 */
780 #define MAX_CB_ADDR 32
781
782 tte_t hw_tte;
783 static ulong_t sfmmu_dmr_maxbit = DMR_MAXBIT;
784
785 static char *mmu_ctx_kstat_names[] = {
786 "mmu_ctx_tsb_exceptions",
787 "mmu_ctx_tsb_raise_exception",
788 "mmu_ctx_wrap_around",
789 };
790
791 /*
792 * Wrapper for vmem_xalloc since vmem_create only allows limited
793 * parameters for vm_source_alloc functions. This function allows us
794 * to specify alignment consistent with the size of the object being
795 * allocated.
796 */
797 static void *
sfmmu_vmem_xalloc_aligned_wrapper(vmem_t * vmp,size_t size,int vmflag)798 sfmmu_vmem_xalloc_aligned_wrapper(vmem_t *vmp, size_t size, int vmflag)
799 {
800 return (vmem_xalloc(vmp, size, size, 0, 0, NULL, NULL, vmflag));
801 }
802
803 /* Common code for setting tsb_alloc_hiwater. */
804 #define SFMMU_SET_TSB_ALLOC_HIWATER(pages) tsb_alloc_hiwater = \
805 ptob(pages) / tsb_alloc_hiwater_factor
806
807 /*
808 * Set tsb_max_growsize to allow at most all of physical memory to be mapped by
809 * a single TSB. physmem is the number of physical pages so we need physmem 8K
810 * TTEs to represent all those physical pages. We round this up by using
811 * 1<<highbit(). To figure out which size code to use, remember that the size
812 * code is just an amount to shift the smallest TSB size to get the size of
813 * this TSB. So we subtract that size, TSB_START_SIZE, from highbit() (or
814 * highbit() - 1) to get the size code for the smallest TSB that can represent
815 * all of physical memory, while erring on the side of too much.
816 *
817 * Restrict tsb_max_growsize to make sure that:
818 * 1) TSBs can't grow larger than the TSB slab size
819 * 2) TSBs can't grow larger than UTSB_MAX_SZCODE.
820 */
821 #define SFMMU_SET_TSB_MAX_GROWSIZE(pages) { \
822 int _i, _szc, _slabszc, _tsbszc; \
823 \
824 _i = highbit(pages); \
825 if ((1 << (_i - 1)) == (pages)) \
826 _i--; /* 2^n case, round down */ \
827 _szc = _i - TSB_START_SIZE; \
828 _slabszc = bigtsb_slab_shift - (TSB_START_SIZE + TSB_ENTRY_SHIFT); \
829 _tsbszc = MIN(_szc, _slabszc); \
830 tsb_max_growsize = MIN(_tsbszc, UTSB_MAX_SZCODE); \
831 }
832
833 /*
834 * Given a pointer to an sfmmu and a TTE size code, return a pointer to the
835 * tsb_info which handles that TTE size.
836 */
837 #define SFMMU_GET_TSBINFO(tsbinfop, sfmmup, tte_szc) { \
838 (tsbinfop) = (sfmmup)->sfmmu_tsb; \
839 ASSERT(((tsbinfop)->tsb_flags & TSB_SHAREDCTX) || \
840 sfmmu_hat_lock_held(sfmmup)); \
841 if ((tte_szc) >= TTE4M) { \
842 ASSERT((tsbinfop) != NULL); \
843 (tsbinfop) = (tsbinfop)->tsb_next; \
844 } \
845 }
846
847 /*
848 * Macro to use to unload entries from the TSB.
849 * It has knowledge of which page sizes get replicated in the TSB
850 * and will call the appropriate unload routine for the appropriate size.
851 */
852 #define SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, ismhat) \
853 { \
854 int ttesz = get_hblk_ttesz(hmeblkp); \
855 if (ttesz == TTE8K || ttesz == TTE4M) { \
856 sfmmu_unload_tsb(sfmmup, addr, ttesz); \
857 } else { \
858 caddr_t sva = ismhat ? addr : \
859 (caddr_t)get_hblk_base(hmeblkp); \
860 caddr_t eva = sva + get_hblk_span(hmeblkp); \
861 ASSERT(addr >= sva && addr < eva); \
862 sfmmu_unload_tsb_range(sfmmup, sva, eva, ttesz); \
863 } \
864 }
865
866
867 /* Update tsb_alloc_hiwater after memory is configured. */
868 /*ARGSUSED*/
869 static void
sfmmu_update_post_add(void * arg,pgcnt_t delta_pages)870 sfmmu_update_post_add(void *arg, pgcnt_t delta_pages)
871 {
872 /* Assumes physmem has already been updated. */
873 SFMMU_SET_TSB_ALLOC_HIWATER(physmem);
874 SFMMU_SET_TSB_MAX_GROWSIZE(physmem);
875 }
876
877 /*
878 * Update tsb_alloc_hiwater before memory is deleted. We'll do nothing here
879 * and update tsb_alloc_hiwater and tsb_max_growsize after the memory is
880 * deleted.
881 */
882 /*ARGSUSED*/
883 static int
sfmmu_update_pre_del(void * arg,pgcnt_t delta_pages)884 sfmmu_update_pre_del(void *arg, pgcnt_t delta_pages)
885 {
886 return (0);
887 }
888
889 /* Update tsb_alloc_hiwater after memory fails to be unconfigured. */
890 /*ARGSUSED*/
891 static void
sfmmu_update_post_del(void * arg,pgcnt_t delta_pages,int cancelled)892 sfmmu_update_post_del(void *arg, pgcnt_t delta_pages, int cancelled)
893 {
894 /*
895 * Whether the delete was cancelled or not, just go ahead and update
896 * tsb_alloc_hiwater and tsb_max_growsize.
897 */
898 SFMMU_SET_TSB_ALLOC_HIWATER(physmem);
899 SFMMU_SET_TSB_MAX_GROWSIZE(physmem);
900 }
901
902 static kphysm_setup_vector_t sfmmu_update_vec = {
903 KPHYSM_SETUP_VECTOR_VERSION, /* version */
904 sfmmu_update_post_add, /* post_add */
905 sfmmu_update_pre_del, /* pre_del */
906 sfmmu_update_post_del /* post_del */
907 };
908
909
910 /*
911 * HME_BLK HASH PRIMITIVES
912 */
913
914 /*
915 * Enter a hme on the mapping list for page pp.
916 * When large pages are more prevalent in the system we might want to
917 * keep the mapping list in ascending order by the hment size. For now,
918 * small pages are more frequent, so don't slow it down.
919 */
920 #define HME_ADD(hme, pp) \
921 { \
922 ASSERT(sfmmu_mlist_held(pp)); \
923 \
924 hme->hme_prev = NULL; \
925 hme->hme_next = pp->p_mapping; \
926 hme->hme_page = pp; \
927 if (pp->p_mapping) { \
928 ((struct sf_hment *)(pp->p_mapping))->hme_prev = hme;\
929 ASSERT(pp->p_share > 0); \
930 } else { \
931 /* EMPTY */ \
932 ASSERT(pp->p_share == 0); \
933 } \
934 pp->p_mapping = hme; \
935 pp->p_share++; \
936 }
937
938 /*
939 * Enter a hme on the mapping list for page pp.
940 * If we are unmapping a large translation, we need to make sure that the
941 * change is reflect in the corresponding bit of the p_index field.
942 */
943 #define HME_SUB(hme, pp) \
944 { \
945 ASSERT(sfmmu_mlist_held(pp)); \
946 ASSERT(hme->hme_page == pp || IS_PAHME(hme)); \
947 \
948 if (pp->p_mapping == NULL) { \
949 panic("hme_remove - no mappings"); \
950 } \
951 \
952 membar_stst(); /* ensure previous stores finish */ \
953 \
954 ASSERT(pp->p_share > 0); \
955 pp->p_share--; \
956 \
957 if (hme->hme_prev) { \
958 ASSERT(pp->p_mapping != hme); \
959 ASSERT(hme->hme_prev->hme_page == pp || \
960 IS_PAHME(hme->hme_prev)); \
961 hme->hme_prev->hme_next = hme->hme_next; \
962 } else { \
963 ASSERT(pp->p_mapping == hme); \
964 pp->p_mapping = hme->hme_next; \
965 ASSERT((pp->p_mapping == NULL) ? \
966 (pp->p_share == 0) : 1); \
967 } \
968 \
969 if (hme->hme_next) { \
970 ASSERT(hme->hme_next->hme_page == pp || \
971 IS_PAHME(hme->hme_next)); \
972 hme->hme_next->hme_prev = hme->hme_prev; \
973 } \
974 \
975 /* zero out the entry */ \
976 hme->hme_next = NULL; \
977 hme->hme_prev = NULL; \
978 hme->hme_page = NULL; \
979 \
980 if (hme_size(hme) > TTE8K) { \
981 /* remove mappings for remainder of large pg */ \
982 sfmmu_rm_large_mappings(pp, hme_size(hme)); \
983 } \
984 }
985
986 /*
987 * This function returns the hment given the hme_blk and a vaddr.
988 * It assumes addr has already been checked to belong to hme_blk's
989 * range.
990 */
991 #define HBLKTOHME(hment, hmeblkp, addr) \
992 { \
993 int index; \
994 HBLKTOHME_IDX(hment, hmeblkp, addr, index) \
995 }
996
997 /*
998 * Version of HBLKTOHME that also returns the index in hmeblkp
999 * of the hment.
1000 */
1001 #define HBLKTOHME_IDX(hment, hmeblkp, addr, idx) \
1002 { \
1003 ASSERT(in_hblk_range((hmeblkp), (addr))); \
1004 \
1005 if (get_hblk_ttesz(hmeblkp) == TTE8K) { \
1006 idx = (((uintptr_t)(addr) >> MMU_PAGESHIFT) & (NHMENTS-1)); \
1007 } else \
1008 idx = 0; \
1009 \
1010 (hment) = &(hmeblkp)->hblk_hme[idx]; \
1011 }
1012
1013 /*
1014 * Disable any page sizes not supported by the CPU
1015 */
1016 void
hat_init_pagesizes()1017 hat_init_pagesizes()
1018 {
1019 int i;
1020
1021 mmu_exported_page_sizes = 0;
1022 for (i = TTE8K; i < max_mmu_page_sizes; i++) {
1023
1024 szc_2_userszc[i] = (uint_t)-1;
1025 userszc_2_szc[i] = (uint_t)-1;
1026
1027 if ((mmu_exported_pagesize_mask & (1 << i)) == 0) {
1028 disable_large_pages |= (1 << i);
1029 } else {
1030 szc_2_userszc[i] = mmu_exported_page_sizes;
1031 userszc_2_szc[mmu_exported_page_sizes] = i;
1032 mmu_exported_page_sizes++;
1033 }
1034 }
1035
1036 disable_ism_large_pages |= disable_large_pages;
1037 disable_auto_data_large_pages = disable_large_pages;
1038 disable_auto_text_large_pages = disable_large_pages;
1039
1040 /*
1041 * Initialize mmu-specific large page sizes.
1042 */
1043 if (&mmu_large_pages_disabled) {
1044 disable_large_pages |= mmu_large_pages_disabled(HAT_LOAD);
1045 disable_ism_large_pages |=
1046 mmu_large_pages_disabled(HAT_LOAD_SHARE);
1047 disable_auto_data_large_pages |=
1048 mmu_large_pages_disabled(HAT_AUTO_DATA);
1049 disable_auto_text_large_pages |=
1050 mmu_large_pages_disabled(HAT_AUTO_TEXT);
1051 }
1052 }
1053
1054 /*
1055 * Initialize the hardware address translation structures.
1056 */
1057 void
hat_init(void)1058 hat_init(void)
1059 {
1060 int i;
1061 uint_t sz;
1062 size_t size;
1063
1064 hat_lock_init();
1065 hat_kstat_init();
1066
1067 /*
1068 * Hardware-only bits in a TTE
1069 */
1070 MAKE_TTE_MASK(&hw_tte);
1071
1072 hat_init_pagesizes();
1073
1074 /* Initialize the hash locks */
1075 for (i = 0; i < khmehash_num; i++) {
1076 mutex_init(&khme_hash[i].hmehash_mutex, NULL,
1077 MUTEX_DEFAULT, NULL);
1078 khme_hash[i].hmeh_nextpa = HMEBLK_ENDPA;
1079 }
1080 for (i = 0; i < uhmehash_num; i++) {
1081 mutex_init(&uhme_hash[i].hmehash_mutex, NULL,
1082 MUTEX_DEFAULT, NULL);
1083 uhme_hash[i].hmeh_nextpa = HMEBLK_ENDPA;
1084 }
1085 khmehash_num--; /* make sure counter starts from 0 */
1086 uhmehash_num--; /* make sure counter starts from 0 */
1087
1088 /*
1089 * Allocate context domain structures.
1090 *
1091 * A platform may choose to modify max_mmu_ctxdoms in
1092 * set_platform_defaults(). If a platform does not define
1093 * a set_platform_defaults() or does not choose to modify
1094 * max_mmu_ctxdoms, it gets one MMU context domain for every CPU.
1095 *
1096 * For all platforms that have CPUs sharing MMUs, this
1097 * value must be defined.
1098 */
1099 if (max_mmu_ctxdoms == 0)
1100 max_mmu_ctxdoms = max_ncpus;
1101
1102 size = max_mmu_ctxdoms * sizeof (mmu_ctx_t *);
1103 mmu_ctxs_tbl = kmem_zalloc(size, KM_SLEEP);
1104
1105 /* mmu_ctx_t is 64 bytes aligned */
1106 mmuctxdom_cache = kmem_cache_create("mmuctxdom_cache",
1107 sizeof (mmu_ctx_t), 64, NULL, NULL, NULL, NULL, NULL, 0);
1108 /*
1109 * MMU context domain initialization for the Boot CPU.
1110 * This needs the context domains array allocated above.
1111 */
1112 mutex_enter(&cpu_lock);
1113 sfmmu_cpu_init(CPU);
1114 mutex_exit(&cpu_lock);
1115
1116 /*
1117 * Intialize ism mapping list lock.
1118 */
1119
1120 mutex_init(&ism_mlist_lock, NULL, MUTEX_DEFAULT, NULL);
1121
1122 /*
1123 * Each sfmmu structure carries an array of MMU context info
1124 * structures, one per context domain. The size of this array depends
1125 * on the maximum number of context domains. So, the size of the
1126 * sfmmu structure varies per platform.
1127 *
1128 * sfmmu is allocated from static arena, because trap
1129 * handler at TL > 0 is not allowed to touch kernel relocatable
1130 * memory. sfmmu's alignment is changed to 64 bytes from
1131 * default 8 bytes, as the lower 6 bits will be used to pass
1132 * pgcnt to vtag_flush_pgcnt_tl1.
1133 */
1134 size = sizeof (sfmmu_t) + sizeof (sfmmu_ctx_t) * (max_mmu_ctxdoms - 1);
1135
1136 sfmmuid_cache = kmem_cache_create("sfmmuid_cache", size,
1137 64, sfmmu_idcache_constructor, sfmmu_idcache_destructor,
1138 NULL, NULL, static_arena, 0);
1139
1140 sfmmu_tsbinfo_cache = kmem_cache_create("sfmmu_tsbinfo_cache",
1141 sizeof (struct tsb_info), 0, NULL, NULL, NULL, NULL, NULL, 0);
1142
1143 /*
1144 * Since we only use the tsb8k cache to "borrow" pages for TSBs
1145 * from the heap when low on memory or when TSB_FORCEALLOC is
1146 * specified, don't use magazines to cache them--we want to return
1147 * them to the system as quickly as possible.
1148 */
1149 sfmmu_tsb8k_cache = kmem_cache_create("sfmmu_tsb8k_cache",
1150 MMU_PAGESIZE, MMU_PAGESIZE, NULL, NULL, NULL, NULL,
1151 static_arena, KMC_NOMAGAZINE);
1152
1153 /*
1154 * Set tsb_alloc_hiwater to 1/tsb_alloc_hiwater_factor of physical
1155 * memory, which corresponds to the old static reserve for TSBs.
1156 * tsb_alloc_hiwater_factor defaults to 32. This caps the amount of
1157 * memory we'll allocate for TSB slabs; beyond this point TSB
1158 * allocations will be taken from the kernel heap (via
1159 * sfmmu_tsb8k_cache) and will be throttled as would any other kmem
1160 * consumer.
1161 */
1162 if (tsb_alloc_hiwater_factor == 0) {
1163 tsb_alloc_hiwater_factor = TSB_ALLOC_HIWATER_FACTOR_DEFAULT;
1164 }
1165 SFMMU_SET_TSB_ALLOC_HIWATER(physmem);
1166
1167 for (sz = tsb_slab_ttesz; sz > 0; sz--) {
1168 if (!(disable_large_pages & (1 << sz)))
1169 break;
1170 }
1171
1172 if (sz < tsb_slab_ttesz) {
1173 tsb_slab_ttesz = sz;
1174 tsb_slab_shift = MMU_PAGESHIFT + (sz << 1) + sz;
1175 tsb_slab_size = 1 << tsb_slab_shift;
1176 tsb_slab_mask = (1 << (tsb_slab_shift - MMU_PAGESHIFT)) - 1;
1177 use_bigtsb_arena = 0;
1178 } else if (use_bigtsb_arena &&
1179 (disable_large_pages & (1 << bigtsb_slab_ttesz))) {
1180 use_bigtsb_arena = 0;
1181 }
1182
1183 if (!use_bigtsb_arena) {
1184 bigtsb_slab_shift = tsb_slab_shift;
1185 }
1186 SFMMU_SET_TSB_MAX_GROWSIZE(physmem);
1187
1188 /*
1189 * On smaller memory systems, allocate TSB memory in smaller chunks
1190 * than the default 4M slab size. We also honor disable_large_pages
1191 * here.
1192 *
1193 * The trap handlers need to be patched with the final slab shift,
1194 * since they need to be able to construct the TSB pointer at runtime.
1195 */
1196 if ((tsb_max_growsize <= TSB_512K_SZCODE) &&
1197 !(disable_large_pages & (1 << TTE512K))) {
1198 tsb_slab_ttesz = TTE512K;
1199 tsb_slab_shift = MMU_PAGESHIFT512K;
1200 tsb_slab_size = MMU_PAGESIZE512K;
1201 tsb_slab_mask = MMU_PAGEOFFSET512K >> MMU_PAGESHIFT;
1202 use_bigtsb_arena = 0;
1203 }
1204
1205 if (!use_bigtsb_arena) {
1206 bigtsb_slab_ttesz = tsb_slab_ttesz;
1207 bigtsb_slab_shift = tsb_slab_shift;
1208 bigtsb_slab_size = tsb_slab_size;
1209 bigtsb_slab_mask = tsb_slab_mask;
1210 }
1211
1212
1213 /*
1214 * Set up memory callback to update tsb_alloc_hiwater and
1215 * tsb_max_growsize.
1216 */
1217 i = kphysm_setup_func_register(&sfmmu_update_vec, (void *) 0);
1218 ASSERT(i == 0);
1219
1220 /*
1221 * kmem_tsb_arena is the source from which large TSB slabs are
1222 * drawn. The quantum of this arena corresponds to the largest
1223 * TSB size we can dynamically allocate for user processes.
1224 * Currently it must also be a supported page size since we
1225 * use exactly one translation entry to map each slab page.
1226 *
1227 * The per-lgroup kmem_tsb_default_arena arenas are the arenas from
1228 * which most TSBs are allocated. Since most TSB allocations are
1229 * typically 8K we have a kmem cache we stack on top of each
1230 * kmem_tsb_default_arena to speed up those allocations.
1231 *
1232 * Note the two-level scheme of arenas is required only
1233 * because vmem_create doesn't allow us to specify alignment
1234 * requirements. If this ever changes the code could be
1235 * simplified to use only one level of arenas.
1236 *
1237 * If 256M page support exists on sun4v, 256MB kmem_bigtsb_arena
1238 * will be provided in addition to the 4M kmem_tsb_arena.
1239 */
1240 if (use_bigtsb_arena) {
1241 kmem_bigtsb_arena = vmem_create("kmem_bigtsb", NULL, 0,
1242 bigtsb_slab_size, sfmmu_vmem_xalloc_aligned_wrapper,
1243 vmem_xfree, heap_arena, 0, VM_SLEEP);
1244 }
1245
1246 kmem_tsb_arena = vmem_create("kmem_tsb", NULL, 0, tsb_slab_size,
1247 sfmmu_vmem_xalloc_aligned_wrapper,
1248 vmem_xfree, heap_arena, 0, VM_SLEEP);
1249
1250 if (tsb_lgrp_affinity) {
1251 char s[50];
1252 for (i = 0; i < NLGRPS_MAX; i++) {
1253 if (use_bigtsb_arena) {
1254 (void) sprintf(s, "kmem_bigtsb_lgrp%d", i);
1255 kmem_bigtsb_default_arena[i] = vmem_create(s,
1256 NULL, 0, 2 * tsb_slab_size,
1257 sfmmu_tsb_segkmem_alloc,
1258 sfmmu_tsb_segkmem_free, kmem_bigtsb_arena,
1259 0, VM_SLEEP | VM_BESTFIT);
1260 }
1261
1262 (void) sprintf(s, "kmem_tsb_lgrp%d", i);
1263 kmem_tsb_default_arena[i] = vmem_create(s,
1264 NULL, 0, PAGESIZE, sfmmu_tsb_segkmem_alloc,
1265 sfmmu_tsb_segkmem_free, kmem_tsb_arena, 0,
1266 VM_SLEEP | VM_BESTFIT);
1267
1268 (void) sprintf(s, "sfmmu_tsb_lgrp%d_cache", i);
1269 sfmmu_tsb_cache[i] = kmem_cache_create(s,
1270 PAGESIZE, PAGESIZE, NULL, NULL, NULL, NULL,
1271 kmem_tsb_default_arena[i], 0);
1272 }
1273 } else {
1274 if (use_bigtsb_arena) {
1275 kmem_bigtsb_default_arena[0] =
1276 vmem_create("kmem_bigtsb_default", NULL, 0,
1277 2 * tsb_slab_size, sfmmu_tsb_segkmem_alloc,
1278 sfmmu_tsb_segkmem_free, kmem_bigtsb_arena, 0,
1279 VM_SLEEP | VM_BESTFIT);
1280 }
1281
1282 kmem_tsb_default_arena[0] = vmem_create("kmem_tsb_default",
1283 NULL, 0, PAGESIZE, sfmmu_tsb_segkmem_alloc,
1284 sfmmu_tsb_segkmem_free, kmem_tsb_arena, 0,
1285 VM_SLEEP | VM_BESTFIT);
1286 sfmmu_tsb_cache[0] = kmem_cache_create("sfmmu_tsb_cache",
1287 PAGESIZE, PAGESIZE, NULL, NULL, NULL, NULL,
1288 kmem_tsb_default_arena[0], 0);
1289 }
1290
1291 sfmmu8_cache = kmem_cache_create("sfmmu8_cache", HME8BLK_SZ,
1292 HMEBLK_ALIGN, sfmmu_hblkcache_constructor,
1293 sfmmu_hblkcache_destructor,
1294 sfmmu_hblkcache_reclaim, (void *)HME8BLK_SZ,
1295 hat_memload_arena, KMC_NOHASH);
1296
1297 hat_memload1_arena = vmem_create("hat_memload1", NULL, 0, PAGESIZE,
1298 segkmem_alloc_permanent, segkmem_free, heap_arena, 0,
1299 VMC_DUMPSAFE | VM_SLEEP);
1300
1301 sfmmu1_cache = kmem_cache_create("sfmmu1_cache", HME1BLK_SZ,
1302 HMEBLK_ALIGN, sfmmu_hblkcache_constructor,
1303 sfmmu_hblkcache_destructor,
1304 NULL, (void *)HME1BLK_SZ,
1305 hat_memload1_arena, KMC_NOHASH);
1306
1307 pa_hment_cache = kmem_cache_create("pa_hment_cache", PAHME_SZ,
1308 0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH);
1309
1310 ism_blk_cache = kmem_cache_create("ism_blk_cache",
1311 sizeof (ism_blk_t), ecache_alignsize, NULL, NULL,
1312 NULL, NULL, static_arena, KMC_NOHASH);
1313
1314 ism_ment_cache = kmem_cache_create("ism_ment_cache",
1315 sizeof (ism_ment_t), 0, NULL, NULL,
1316 NULL, NULL, NULL, 0);
1317
1318 /*
1319 * We grab the first hat for the kernel,
1320 */
1321 AS_LOCK_ENTER(&kas, RW_WRITER);
1322 kas.a_hat = hat_alloc(&kas);
1323 AS_LOCK_EXIT(&kas);
1324
1325 /*
1326 * Initialize hblk_reserve.
1327 */
1328 ((struct hme_blk *)hblk_reserve)->hblk_nextpa =
1329 va_to_pa((caddr_t)hblk_reserve);
1330
1331 #ifndef UTSB_PHYS
1332 /*
1333 * Reserve some kernel virtual address space for the locked TTEs
1334 * that allow us to probe the TSB from TL>0.
1335 */
1336 utsb_vabase = vmem_xalloc(heap_arena, tsb_slab_size, tsb_slab_size,
1337 0, 0, NULL, NULL, VM_SLEEP);
1338 utsb4m_vabase = vmem_xalloc(heap_arena, tsb_slab_size, tsb_slab_size,
1339 0, 0, NULL, NULL, VM_SLEEP);
1340 #endif
1341
1342 #ifdef VAC
1343 /*
1344 * The big page VAC handling code assumes VAC
1345 * will not be bigger than the smallest big
1346 * page- which is 64K.
1347 */
1348 if (TTEPAGES(TTE64K) < CACHE_NUM_COLOR) {
1349 cmn_err(CE_PANIC, "VAC too big!");
1350 }
1351 #endif
1352
1353 (void) xhat_init();
1354
1355 uhme_hash_pa = va_to_pa(uhme_hash);
1356 khme_hash_pa = va_to_pa(khme_hash);
1357
1358 /*
1359 * Initialize relocation locks. kpr_suspendlock is held
1360 * at PIL_MAX to prevent interrupts from pinning the holder
1361 * of a suspended TTE which may access it leading to a
1362 * deadlock condition.
1363 */
1364 mutex_init(&kpr_mutex, NULL, MUTEX_DEFAULT, NULL);
1365 mutex_init(&kpr_suspendlock, NULL, MUTEX_SPIN, (void *)PIL_MAX);
1366
1367 /*
1368 * If Shared context support is disabled via /etc/system
1369 * set shctx_on to 0 here if it was set to 1 earlier in boot
1370 * sequence by cpu module initialization code.
1371 */
1372 if (shctx_on && disable_shctx) {
1373 shctx_on = 0;
1374 }
1375
1376 if (shctx_on) {
1377 srd_buckets = kmem_zalloc(SFMMU_MAX_SRD_BUCKETS *
1378 sizeof (srd_buckets[0]), KM_SLEEP);
1379 for (i = 0; i < SFMMU_MAX_SRD_BUCKETS; i++) {
1380 mutex_init(&srd_buckets[i].srdb_lock, NULL,
1381 MUTEX_DEFAULT, NULL);
1382 }
1383
1384 srd_cache = kmem_cache_create("srd_cache", sizeof (sf_srd_t),
1385 0, sfmmu_srdcache_constructor, sfmmu_srdcache_destructor,
1386 NULL, NULL, NULL, 0);
1387 region_cache = kmem_cache_create("region_cache",
1388 sizeof (sf_region_t), 0, sfmmu_rgncache_constructor,
1389 sfmmu_rgncache_destructor, NULL, NULL, NULL, 0);
1390 scd_cache = kmem_cache_create("scd_cache", sizeof (sf_scd_t),
1391 0, sfmmu_scdcache_constructor, sfmmu_scdcache_destructor,
1392 NULL, NULL, NULL, 0);
1393 }
1394
1395 /*
1396 * Pre-allocate hrm_hashtab before enabling the collection of
1397 * refmod statistics. Allocating on the fly would mean us
1398 * running the risk of suffering recursive mutex enters or
1399 * deadlocks.
1400 */
1401 hrm_hashtab = kmem_zalloc(HRM_HASHSIZE * sizeof (struct hrmstat *),
1402 KM_SLEEP);
1403
1404 /* Allocate per-cpu pending freelist of hmeblks */
1405 cpu_hme_pend = kmem_zalloc((NCPU * sizeof (cpu_hme_pend_t)) + 64,
1406 KM_SLEEP);
1407 cpu_hme_pend = (cpu_hme_pend_t *)P2ROUNDUP(
1408 (uintptr_t)cpu_hme_pend, 64);
1409
1410 for (i = 0; i < NCPU; i++) {
1411 mutex_init(&cpu_hme_pend[i].chp_mutex, NULL, MUTEX_DEFAULT,
1412 NULL);
1413 }
1414
1415 if (cpu_hme_pend_thresh == 0) {
1416 cpu_hme_pend_thresh = CPU_HME_PEND_THRESH;
1417 }
1418 }
1419
1420 /*
1421 * Initialize locking for the hat layer, called early during boot.
1422 */
1423 static void
hat_lock_init()1424 hat_lock_init()
1425 {
1426 int i;
1427
1428 /*
1429 * initialize the array of mutexes protecting a page's mapping
1430 * list and p_nrm field.
1431 */
1432 for (i = 0; i < MML_TABLE_SIZE; i++)
1433 mutex_init(&mml_table[i].pad_mutex, NULL, MUTEX_DEFAULT, NULL);
1434
1435 if (kpm_enable) {
1436 for (i = 0; i < kpmp_table_sz; i++) {
1437 mutex_init(&kpmp_table[i].khl_mutex, NULL,
1438 MUTEX_DEFAULT, NULL);
1439 }
1440 }
1441
1442 /*
1443 * Initialize array of mutex locks that protects sfmmu fields and
1444 * TSB lists.
1445 */
1446 for (i = 0; i < SFMMU_NUM_LOCK; i++)
1447 mutex_init(HATLOCK_MUTEXP(&hat_lock[i]), NULL, MUTEX_DEFAULT,
1448 NULL);
1449 }
1450
1451 #define SFMMU_KERNEL_MAXVA \
1452 (kmem64_base ? (uintptr_t)kmem64_end : (SYSLIMIT))
1453
1454 /*
1455 * Allocate a hat structure.
1456 * Called when an address space first uses a hat.
1457 */
1458 struct hat *
hat_alloc(struct as * as)1459 hat_alloc(struct as *as)
1460 {
1461 sfmmu_t *sfmmup;
1462 int i;
1463 uint64_t cnum;
1464 extern uint_t get_color_start(struct as *);
1465
1466 ASSERT(AS_WRITE_HELD(as));
1467 sfmmup = kmem_cache_alloc(sfmmuid_cache, KM_SLEEP);
1468 sfmmup->sfmmu_as = as;
1469 sfmmup->sfmmu_flags = 0;
1470 sfmmup->sfmmu_tteflags = 0;
1471 sfmmup->sfmmu_rtteflags = 0;
1472 LOCK_INIT_CLEAR(&sfmmup->sfmmu_ctx_lock);
1473
1474 if (as == &kas) {
1475 ksfmmup = sfmmup;
1476 sfmmup->sfmmu_cext = 0;
1477 cnum = KCONTEXT;
1478
1479 sfmmup->sfmmu_clrstart = 0;
1480 sfmmup->sfmmu_tsb = NULL;
1481 /*
1482 * hat_kern_setup() will call sfmmu_init_ktsbinfo()
1483 * to setup tsb_info for ksfmmup.
1484 */
1485 } else {
1486
1487 /*
1488 * Just set to invalid ctx. When it faults, it will
1489 * get a valid ctx. This would avoid the situation
1490 * where we get a ctx, but it gets stolen and then
1491 * we fault when we try to run and so have to get
1492 * another ctx.
1493 */
1494 sfmmup->sfmmu_cext = 0;
1495 cnum = INVALID_CONTEXT;
1496
1497 /* initialize original physical page coloring bin */
1498 sfmmup->sfmmu_clrstart = get_color_start(as);
1499 #ifdef DEBUG
1500 if (tsb_random_size) {
1501 uint32_t randval = (uint32_t)gettick() >> 4;
1502 int size = randval % (tsb_max_growsize + 1);
1503
1504 /* chose a random tsb size for stress testing */
1505 (void) sfmmu_tsbinfo_alloc(&sfmmup->sfmmu_tsb, size,
1506 TSB8K|TSB64K|TSB512K, 0, sfmmup);
1507 } else
1508 #endif /* DEBUG */
1509 (void) sfmmu_tsbinfo_alloc(&sfmmup->sfmmu_tsb,
1510 default_tsb_size,
1511 TSB8K|TSB64K|TSB512K, 0, sfmmup);
1512 sfmmup->sfmmu_flags = HAT_SWAPPED | HAT_ALLCTX_INVALID;
1513 ASSERT(sfmmup->sfmmu_tsb != NULL);
1514 }
1515
1516 ASSERT(max_mmu_ctxdoms > 0);
1517 for (i = 0; i < max_mmu_ctxdoms; i++) {
1518 sfmmup->sfmmu_ctxs[i].cnum = cnum;
1519 sfmmup->sfmmu_ctxs[i].gnum = 0;
1520 }
1521
1522 for (i = 0; i < max_mmu_page_sizes; i++) {
1523 sfmmup->sfmmu_ttecnt[i] = 0;
1524 sfmmup->sfmmu_scdrttecnt[i] = 0;
1525 sfmmup->sfmmu_ismttecnt[i] = 0;
1526 sfmmup->sfmmu_scdismttecnt[i] = 0;
1527 sfmmup->sfmmu_pgsz[i] = TTE8K;
1528 }
1529 sfmmup->sfmmu_tsb0_4minflcnt = 0;
1530 sfmmup->sfmmu_iblk = NULL;
1531 sfmmup->sfmmu_ismhat = 0;
1532 sfmmup->sfmmu_scdhat = 0;
1533 sfmmup->sfmmu_ismblkpa = (uint64_t)-1;
1534 if (sfmmup == ksfmmup) {
1535 CPUSET_ALL(sfmmup->sfmmu_cpusran);
1536 } else {
1537 CPUSET_ZERO(sfmmup->sfmmu_cpusran);
1538 }
1539 sfmmup->sfmmu_free = 0;
1540 sfmmup->sfmmu_rmstat = 0;
1541 sfmmup->sfmmu_clrbin = sfmmup->sfmmu_clrstart;
1542 sfmmup->sfmmu_xhat_provider = NULL;
1543 cv_init(&sfmmup->sfmmu_tsb_cv, NULL, CV_DEFAULT, NULL);
1544 sfmmup->sfmmu_srdp = NULL;
1545 SF_RGNMAP_ZERO(sfmmup->sfmmu_region_map);
1546 bzero(sfmmup->sfmmu_hmeregion_links, SFMMU_L1_HMERLINKS_SIZE);
1547 sfmmup->sfmmu_scdp = NULL;
1548 sfmmup->sfmmu_scd_link.next = NULL;
1549 sfmmup->sfmmu_scd_link.prev = NULL;
1550 return (sfmmup);
1551 }
1552
1553 /*
1554 * Create per-MMU context domain kstats for a given MMU ctx.
1555 */
1556 static void
sfmmu_mmu_kstat_create(mmu_ctx_t * mmu_ctxp)1557 sfmmu_mmu_kstat_create(mmu_ctx_t *mmu_ctxp)
1558 {
1559 mmu_ctx_stat_t stat;
1560 kstat_t *mmu_kstat;
1561
1562 ASSERT(MUTEX_HELD(&cpu_lock));
1563 ASSERT(mmu_ctxp->mmu_kstat == NULL);
1564
1565 mmu_kstat = kstat_create("unix", mmu_ctxp->mmu_idx, "mmu_ctx",
1566 "hat", KSTAT_TYPE_NAMED, MMU_CTX_NUM_STATS, KSTAT_FLAG_VIRTUAL);
1567
1568 if (mmu_kstat == NULL) {
1569 cmn_err(CE_WARN, "kstat_create for MMU %d failed",
1570 mmu_ctxp->mmu_idx);
1571 } else {
1572 mmu_kstat->ks_data = mmu_ctxp->mmu_kstat_data;
1573 for (stat = 0; stat < MMU_CTX_NUM_STATS; stat++)
1574 kstat_named_init(&mmu_ctxp->mmu_kstat_data[stat],
1575 mmu_ctx_kstat_names[stat], KSTAT_DATA_INT64);
1576 mmu_ctxp->mmu_kstat = mmu_kstat;
1577 kstat_install(mmu_kstat);
1578 }
1579 }
1580
1581 /*
1582 * plat_cpuid_to_mmu_ctx_info() is a platform interface that returns MMU
1583 * context domain information for a given CPU. If a platform does not
1584 * specify that interface, then the function below is used instead to return
1585 * default information. The defaults are as follows:
1586 *
1587 * - The number of MMU context IDs supported on any CPU in the
1588 * system is 8K.
1589 * - There is one MMU context domain per CPU.
1590 */
1591 /*ARGSUSED*/
1592 static void
sfmmu_cpuid_to_mmu_ctx_info(processorid_t cpuid,mmu_ctx_info_t * infop)1593 sfmmu_cpuid_to_mmu_ctx_info(processorid_t cpuid, mmu_ctx_info_t *infop)
1594 {
1595 infop->mmu_nctxs = nctxs;
1596 infop->mmu_idx = cpu[cpuid]->cpu_seqid;
1597 }
1598
1599 /*
1600 * Called during CPU initialization to set the MMU context-related information
1601 * for a CPU.
1602 *
1603 * cpu_lock serializes accesses to mmu_ctxs and mmu_saved_gnum.
1604 */
1605 void
sfmmu_cpu_init(cpu_t * cp)1606 sfmmu_cpu_init(cpu_t *cp)
1607 {
1608 mmu_ctx_info_t info;
1609 mmu_ctx_t *mmu_ctxp;
1610
1611 ASSERT(MUTEX_HELD(&cpu_lock));
1612
1613 if (&plat_cpuid_to_mmu_ctx_info == NULL)
1614 sfmmu_cpuid_to_mmu_ctx_info(cp->cpu_id, &info);
1615 else
1616 plat_cpuid_to_mmu_ctx_info(cp->cpu_id, &info);
1617
1618 ASSERT(info.mmu_idx < max_mmu_ctxdoms);
1619
1620 if ((mmu_ctxp = mmu_ctxs_tbl[info.mmu_idx]) == NULL) {
1621 /* Each mmu_ctx is cacheline aligned. */
1622 mmu_ctxp = kmem_cache_alloc(mmuctxdom_cache, KM_SLEEP);
1623 bzero(mmu_ctxp, sizeof (mmu_ctx_t));
1624
1625 mutex_init(&mmu_ctxp->mmu_lock, NULL, MUTEX_SPIN,
1626 (void *)ipltospl(DISP_LEVEL));
1627 mmu_ctxp->mmu_idx = info.mmu_idx;
1628 mmu_ctxp->mmu_nctxs = info.mmu_nctxs;
1629 /*
1630 * Globally for lifetime of a system,
1631 * gnum must always increase.
1632 * mmu_saved_gnum is protected by the cpu_lock.
1633 */
1634 mmu_ctxp->mmu_gnum = mmu_saved_gnum + 1;
1635 mmu_ctxp->mmu_cnum = NUM_LOCKED_CTXS;
1636
1637 sfmmu_mmu_kstat_create(mmu_ctxp);
1638
1639 mmu_ctxs_tbl[info.mmu_idx] = mmu_ctxp;
1640 } else {
1641 ASSERT(mmu_ctxp->mmu_idx == info.mmu_idx);
1642 ASSERT(mmu_ctxp->mmu_nctxs <= info.mmu_nctxs);
1643 }
1644
1645 /*
1646 * The mmu_lock is acquired here to prevent races with
1647 * the wrap-around code.
1648 */
1649 mutex_enter(&mmu_ctxp->mmu_lock);
1650
1651
1652 mmu_ctxp->mmu_ncpus++;
1653 CPUSET_ADD(mmu_ctxp->mmu_cpuset, cp->cpu_id);
1654 CPU_MMU_IDX(cp) = info.mmu_idx;
1655 CPU_MMU_CTXP(cp) = mmu_ctxp;
1656
1657 mutex_exit(&mmu_ctxp->mmu_lock);
1658 }
1659
1660 static void
sfmmu_ctxdom_free(mmu_ctx_t * mmu_ctxp)1661 sfmmu_ctxdom_free(mmu_ctx_t *mmu_ctxp)
1662 {
1663 ASSERT(MUTEX_HELD(&cpu_lock));
1664 ASSERT(!MUTEX_HELD(&mmu_ctxp->mmu_lock));
1665
1666 mutex_destroy(&mmu_ctxp->mmu_lock);
1667
1668 if (mmu_ctxp->mmu_kstat)
1669 kstat_delete(mmu_ctxp->mmu_kstat);
1670
1671 /* mmu_saved_gnum is protected by the cpu_lock. */
1672 if (mmu_saved_gnum < mmu_ctxp->mmu_gnum)
1673 mmu_saved_gnum = mmu_ctxp->mmu_gnum;
1674
1675 kmem_cache_free(mmuctxdom_cache, mmu_ctxp);
1676 }
1677
1678 /*
1679 * Called to perform MMU context-related cleanup for a CPU.
1680 */
1681 void
sfmmu_cpu_cleanup(cpu_t * cp)1682 sfmmu_cpu_cleanup(cpu_t *cp)
1683 {
1684 mmu_ctx_t *mmu_ctxp;
1685
1686 ASSERT(MUTEX_HELD(&cpu_lock));
1687
1688 mmu_ctxp = CPU_MMU_CTXP(cp);
1689 ASSERT(mmu_ctxp != NULL);
1690
1691 /*
1692 * The mmu_lock is acquired here to prevent races with
1693 * the wrap-around code.
1694 */
1695 mutex_enter(&mmu_ctxp->mmu_lock);
1696
1697 CPU_MMU_CTXP(cp) = NULL;
1698
1699 CPUSET_DEL(mmu_ctxp->mmu_cpuset, cp->cpu_id);
1700 if (--mmu_ctxp->mmu_ncpus == 0) {
1701 mmu_ctxs_tbl[mmu_ctxp->mmu_idx] = NULL;
1702 mutex_exit(&mmu_ctxp->mmu_lock);
1703 sfmmu_ctxdom_free(mmu_ctxp);
1704 return;
1705 }
1706
1707 mutex_exit(&mmu_ctxp->mmu_lock);
1708 }
1709
1710 uint_t
sfmmu_ctxdom_nctxs(int idx)1711 sfmmu_ctxdom_nctxs(int idx)
1712 {
1713 return (mmu_ctxs_tbl[idx]->mmu_nctxs);
1714 }
1715
1716 #ifdef sun4v
1717 /*
1718 * sfmmu_ctxdoms_* is an interface provided to help keep context domains
1719 * consistant after suspend/resume on system that can resume on a different
1720 * hardware than it was suspended.
1721 *
1722 * sfmmu_ctxdom_lock(void) locks all context domains and prevents new contexts
1723 * from being allocated. It acquires all hat_locks, which blocks most access to
1724 * context data, except for a few cases that are handled separately or are
1725 * harmless. It wraps each domain to increment gnum and invalidate on-CPU
1726 * contexts, and forces cnum to its max. As a result of this call all user
1727 * threads that are running on CPUs trap and try to perform wrap around but
1728 * can't because hat_locks are taken. Threads that were not on CPUs but started
1729 * by scheduler go to sfmmu_alloc_ctx() to aquire context without checking
1730 * hat_lock, but fail, because cnum == nctxs, and therefore also trap and block
1731 * on hat_lock trying to wrap. sfmmu_ctxdom_lock() must be called before CPUs
1732 * are paused, else it could deadlock acquiring locks held by paused CPUs.
1733 *
1734 * sfmmu_ctxdoms_remove() removes context domains from every CPUs and records
1735 * the CPUs that had them. It must be called after CPUs have been paused. This
1736 * ensures that no threads are in sfmmu_alloc_ctx() accessing domain data,
1737 * because pause_cpus sends a mondo interrupt to every CPU, and sfmmu_alloc_ctx
1738 * runs with interrupts disabled. When CPUs are later resumed, they may enter
1739 * sfmmu_alloc_ctx, but it will check for CPU_MMU_CTXP = NULL and immediately
1740 * return failure. Or, they will be blocked trying to acquire hat_lock. Thus
1741 * after sfmmu_ctxdoms_remove returns, we are guaranteed that no one is
1742 * accessing the old context domains.
1743 *
1744 * sfmmu_ctxdoms_update(void) frees space used by old context domains and
1745 * allocates new context domains based on hardware layout. It initializes
1746 * every CPU that had context domain before migration to have one again.
1747 * sfmmu_ctxdoms_update must be called after CPUs are resumed, else it
1748 * could deadlock acquiring locks held by paused CPUs.
1749 *
1750 * sfmmu_ctxdoms_unlock(void) releases all hat_locks after which user threads
1751 * acquire new context ids and continue execution.
1752 *
1753 * Therefore functions should be called in the following order:
1754 * suspend_routine()
1755 * sfmmu_ctxdom_lock()
1756 * pause_cpus()
1757 * suspend()
1758 * if (suspend failed)
1759 * sfmmu_ctxdom_unlock()
1760 * ...
1761 * sfmmu_ctxdom_remove()
1762 * resume_cpus()
1763 * sfmmu_ctxdom_update()
1764 * sfmmu_ctxdom_unlock()
1765 */
1766 static cpuset_t sfmmu_ctxdoms_pset;
1767
1768 void
sfmmu_ctxdoms_remove()1769 sfmmu_ctxdoms_remove()
1770 {
1771 processorid_t id;
1772 cpu_t *cp;
1773
1774 /*
1775 * Record the CPUs that have domains in sfmmu_ctxdoms_pset, so they can
1776 * be restored post-migration. A CPU may be powered off and not have a
1777 * domain, for example.
1778 */
1779 CPUSET_ZERO(sfmmu_ctxdoms_pset);
1780
1781 for (id = 0; id < NCPU; id++) {
1782 if ((cp = cpu[id]) != NULL && CPU_MMU_CTXP(cp) != NULL) {
1783 CPUSET_ADD(sfmmu_ctxdoms_pset, id);
1784 CPU_MMU_CTXP(cp) = NULL;
1785 }
1786 }
1787 }
1788
1789 void
sfmmu_ctxdoms_lock(void)1790 sfmmu_ctxdoms_lock(void)
1791 {
1792 int idx;
1793 mmu_ctx_t *mmu_ctxp;
1794
1795 sfmmu_hat_lock_all();
1796
1797 /*
1798 * At this point, no thread can be in sfmmu_ctx_wrap_around, because
1799 * hat_lock is always taken before calling it.
1800 *
1801 * For each domain, set mmu_cnum to max so no more contexts can be
1802 * allocated, and wrap to flush on-CPU contexts and force threads to
1803 * acquire a new context when we later drop hat_lock after migration.
1804 * Setting mmu_cnum may race with sfmmu_alloc_ctx which also sets cnum,
1805 * but the latter uses CAS and will miscompare and not overwrite it.
1806 */
1807 kpreempt_disable(); /* required by sfmmu_ctx_wrap_around */
1808 for (idx = 0; idx < max_mmu_ctxdoms; idx++) {
1809 if ((mmu_ctxp = mmu_ctxs_tbl[idx]) != NULL) {
1810 mutex_enter(&mmu_ctxp->mmu_lock);
1811 mmu_ctxp->mmu_cnum = mmu_ctxp->mmu_nctxs;
1812 /* make sure updated cnum visible */
1813 membar_enter();
1814 mutex_exit(&mmu_ctxp->mmu_lock);
1815 sfmmu_ctx_wrap_around(mmu_ctxp, B_FALSE);
1816 }
1817 }
1818 kpreempt_enable();
1819 }
1820
1821 void
sfmmu_ctxdoms_unlock(void)1822 sfmmu_ctxdoms_unlock(void)
1823 {
1824 sfmmu_hat_unlock_all();
1825 }
1826
1827 void
sfmmu_ctxdoms_update(void)1828 sfmmu_ctxdoms_update(void)
1829 {
1830 processorid_t id;
1831 cpu_t *cp;
1832 uint_t idx;
1833 mmu_ctx_t *mmu_ctxp;
1834
1835 /*
1836 * Free all context domains. As side effect, this increases
1837 * mmu_saved_gnum to the maximum gnum over all domains, which is used to
1838 * init gnum in the new domains, which therefore will be larger than the
1839 * sfmmu gnum for any process, guaranteeing that every process will see
1840 * a new generation and allocate a new context regardless of what new
1841 * domain it runs in.
1842 */
1843 mutex_enter(&cpu_lock);
1844
1845 for (idx = 0; idx < max_mmu_ctxdoms; idx++) {
1846 if (mmu_ctxs_tbl[idx] != NULL) {
1847 mmu_ctxp = mmu_ctxs_tbl[idx];
1848 mmu_ctxs_tbl[idx] = NULL;
1849 sfmmu_ctxdom_free(mmu_ctxp);
1850 }
1851 }
1852
1853 for (id = 0; id < NCPU; id++) {
1854 if (CPU_IN_SET(sfmmu_ctxdoms_pset, id) &&
1855 (cp = cpu[id]) != NULL)
1856 sfmmu_cpu_init(cp);
1857 }
1858 mutex_exit(&cpu_lock);
1859 }
1860 #endif
1861
1862 /*
1863 * Hat_setup, makes an address space context the current active one.
1864 * In sfmmu this translates to setting the secondary context with the
1865 * corresponding context.
1866 */
1867 void
hat_setup(struct hat * sfmmup,int allocflag)1868 hat_setup(struct hat *sfmmup, int allocflag)
1869 {
1870 hatlock_t *hatlockp;
1871
1872 /* Init needs some special treatment. */
1873 if (allocflag == HAT_INIT) {
1874 /*
1875 * Make sure that we have
1876 * 1. a TSB
1877 * 2. a valid ctx that doesn't get stolen after this point.
1878 */
1879 hatlockp = sfmmu_hat_enter(sfmmup);
1880
1881 /*
1882 * Swap in the TSB. hat_init() allocates tsbinfos without
1883 * TSBs, but we need one for init, since the kernel does some
1884 * special things to set up its stack and needs the TSB to
1885 * resolve page faults.
1886 */
1887 sfmmu_tsb_swapin(sfmmup, hatlockp);
1888
1889 sfmmu_get_ctx(sfmmup);
1890
1891 sfmmu_hat_exit(hatlockp);
1892 } else {
1893 ASSERT(allocflag == HAT_ALLOC);
1894
1895 hatlockp = sfmmu_hat_enter(sfmmup);
1896 kpreempt_disable();
1897
1898 CPUSET_ADD(sfmmup->sfmmu_cpusran, CPU->cpu_id);
1899 /*
1900 * sfmmu_setctx_sec takes <pgsz|cnum> as a parameter,
1901 * pagesize bits don't matter in this case since we are passing
1902 * INVALID_CONTEXT to it.
1903 * Compatibility Note: hw takes care of MMU_SCONTEXT1
1904 */
1905 sfmmu_setctx_sec(INVALID_CONTEXT);
1906 sfmmu_clear_utsbinfo();
1907
1908 kpreempt_enable();
1909 sfmmu_hat_exit(hatlockp);
1910 }
1911 }
1912
1913 /*
1914 * Free all the translation resources for the specified address space.
1915 * Called from as_free when an address space is being destroyed.
1916 */
1917 void
hat_free_start(struct hat * sfmmup)1918 hat_free_start(struct hat *sfmmup)
1919 {
1920 ASSERT(AS_WRITE_HELD(sfmmup->sfmmu_as));
1921 ASSERT(sfmmup != ksfmmup);
1922 ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
1923
1924 sfmmup->sfmmu_free = 1;
1925 if (sfmmup->sfmmu_scdp != NULL) {
1926 sfmmu_leave_scd(sfmmup, 0);
1927 }
1928
1929 ASSERT(sfmmup->sfmmu_scdp == NULL);
1930 }
1931
1932 void
hat_free_end(struct hat * sfmmup)1933 hat_free_end(struct hat *sfmmup)
1934 {
1935 int i;
1936
1937 ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
1938 ASSERT(sfmmup->sfmmu_free == 1);
1939 ASSERT(sfmmup->sfmmu_ttecnt[TTE8K] == 0);
1940 ASSERT(sfmmup->sfmmu_ttecnt[TTE64K] == 0);
1941 ASSERT(sfmmup->sfmmu_ttecnt[TTE512K] == 0);
1942 ASSERT(sfmmup->sfmmu_ttecnt[TTE4M] == 0);
1943 ASSERT(sfmmup->sfmmu_ttecnt[TTE32M] == 0);
1944 ASSERT(sfmmup->sfmmu_ttecnt[TTE256M] == 0);
1945
1946 if (sfmmup->sfmmu_rmstat) {
1947 hat_freestat(sfmmup->sfmmu_as, NULL);
1948 }
1949
1950 while (sfmmup->sfmmu_tsb != NULL) {
1951 struct tsb_info *next = sfmmup->sfmmu_tsb->tsb_next;
1952 sfmmu_tsbinfo_free(sfmmup->sfmmu_tsb);
1953 sfmmup->sfmmu_tsb = next;
1954 }
1955
1956 if (sfmmup->sfmmu_srdp != NULL) {
1957 sfmmu_leave_srd(sfmmup);
1958 ASSERT(sfmmup->sfmmu_srdp == NULL);
1959 for (i = 0; i < SFMMU_L1_HMERLINKS; i++) {
1960 if (sfmmup->sfmmu_hmeregion_links[i] != NULL) {
1961 kmem_free(sfmmup->sfmmu_hmeregion_links[i],
1962 SFMMU_L2_HMERLINKS_SIZE);
1963 sfmmup->sfmmu_hmeregion_links[i] = NULL;
1964 }
1965 }
1966 }
1967 sfmmu_free_sfmmu(sfmmup);
1968
1969 #ifdef DEBUG
1970 for (i = 0; i < SFMMU_L1_HMERLINKS; i++) {
1971 ASSERT(sfmmup->sfmmu_hmeregion_links[i] == NULL);
1972 }
1973 #endif
1974
1975 kmem_cache_free(sfmmuid_cache, sfmmup);
1976 }
1977
1978 /*
1979 * Set up any translation structures, for the specified address space,
1980 * that are needed or preferred when the process is being swapped in.
1981 */
1982 /* ARGSUSED */
1983 void
hat_swapin(struct hat * hat)1984 hat_swapin(struct hat *hat)
1985 {
1986 ASSERT(hat->sfmmu_xhat_provider == NULL);
1987 }
1988
1989 /*
1990 * Free all of the translation resources, for the specified address space,
1991 * that can be freed while the process is swapped out. Called from as_swapout.
1992 * Also, free up the ctx that this process was using.
1993 */
1994 void
hat_swapout(struct hat * sfmmup)1995 hat_swapout(struct hat *sfmmup)
1996 {
1997 struct hmehash_bucket *hmebp;
1998 struct hme_blk *hmeblkp;
1999 struct hme_blk *pr_hblk = NULL;
2000 struct hme_blk *nx_hblk;
2001 int i;
2002 struct hme_blk *list = NULL;
2003 hatlock_t *hatlockp;
2004 struct tsb_info *tsbinfop;
2005 struct free_tsb {
2006 struct free_tsb *next;
2007 struct tsb_info *tsbinfop;
2008 }; /* free list of TSBs */
2009 struct free_tsb *freelist, *last, *next;
2010
2011 ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
2012 SFMMU_STAT(sf_swapout);
2013
2014 /*
2015 * There is no way to go from an as to all its translations in sfmmu.
2016 * Here is one of the times when we take the big hit and traverse
2017 * the hash looking for hme_blks to free up. Not only do we free up
2018 * this as hme_blks but all those that are free. We are obviously
2019 * swapping because we need memory so let's free up as much
2020 * as we can.
2021 *
2022 * Note that we don't flush TLB/TSB here -- it's not necessary
2023 * because:
2024 * 1) we free the ctx we're using and throw away the TSB(s);
2025 * 2) processes aren't runnable while being swapped out.
2026 */
2027 ASSERT(sfmmup != KHATID);
2028 for (i = 0; i <= UHMEHASH_SZ; i++) {
2029 hmebp = &uhme_hash[i];
2030 SFMMU_HASH_LOCK(hmebp);
2031 hmeblkp = hmebp->hmeblkp;
2032 pr_hblk = NULL;
2033 while (hmeblkp) {
2034
2035 ASSERT(!hmeblkp->hblk_xhat_bit);
2036
2037 if ((hmeblkp->hblk_tag.htag_id == sfmmup) &&
2038 !hmeblkp->hblk_shw_bit && !hmeblkp->hblk_lckcnt) {
2039 ASSERT(!hmeblkp->hblk_shared);
2040 (void) sfmmu_hblk_unload(sfmmup, hmeblkp,
2041 (caddr_t)get_hblk_base(hmeblkp),
2042 get_hblk_endaddr(hmeblkp),
2043 NULL, HAT_UNLOAD);
2044 }
2045 nx_hblk = hmeblkp->hblk_next;
2046 if (!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) {
2047 ASSERT(!hmeblkp->hblk_lckcnt);
2048 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
2049 &list, 0);
2050 } else {
2051 pr_hblk = hmeblkp;
2052 }
2053 hmeblkp = nx_hblk;
2054 }
2055 SFMMU_HASH_UNLOCK(hmebp);
2056 }
2057
2058 sfmmu_hblks_list_purge(&list, 0);
2059
2060 /*
2061 * Now free up the ctx so that others can reuse it.
2062 */
2063 hatlockp = sfmmu_hat_enter(sfmmup);
2064
2065 sfmmu_invalidate_ctx(sfmmup);
2066
2067 /*
2068 * Free TSBs, but not tsbinfos, and set SWAPPED flag.
2069 * If TSBs were never swapped in, just return.
2070 * This implies that we don't support partial swapping
2071 * of TSBs -- either all are swapped out, or none are.
2072 *
2073 * We must hold the HAT lock here to prevent racing with another
2074 * thread trying to unmap TTEs from the TSB or running the post-
2075 * relocator after relocating the TSB's memory. Unfortunately, we
2076 * can't free memory while holding the HAT lock or we could
2077 * deadlock, so we build a list of TSBs to be freed after marking
2078 * the tsbinfos as swapped out and free them after dropping the
2079 * lock.
2080 */
2081 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) {
2082 sfmmu_hat_exit(hatlockp);
2083 return;
2084 }
2085
2086 SFMMU_FLAGS_SET(sfmmup, HAT_SWAPPED);
2087 last = freelist = NULL;
2088 for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL;
2089 tsbinfop = tsbinfop->tsb_next) {
2090 ASSERT((tsbinfop->tsb_flags & TSB_SWAPPED) == 0);
2091
2092 /*
2093 * Cast the TSB into a struct free_tsb and put it on the free
2094 * list.
2095 */
2096 if (freelist == NULL) {
2097 last = freelist = (struct free_tsb *)tsbinfop->tsb_va;
2098 } else {
2099 last->next = (struct free_tsb *)tsbinfop->tsb_va;
2100 last = last->next;
2101 }
2102 last->next = NULL;
2103 last->tsbinfop = tsbinfop;
2104 tsbinfop->tsb_flags |= TSB_SWAPPED;
2105 /*
2106 * Zero out the TTE to clear the valid bit.
2107 * Note we can't use a value like 0xbad because we want to
2108 * ensure diagnostic bits are NEVER set on TTEs that might
2109 * be loaded. The intent is to catch any invalid access
2110 * to the swapped TSB, such as a thread running with a valid
2111 * context without first calling sfmmu_tsb_swapin() to
2112 * allocate TSB memory.
2113 */
2114 tsbinfop->tsb_tte.ll = 0;
2115 }
2116
2117 /* Now we can drop the lock and free the TSB memory. */
2118 sfmmu_hat_exit(hatlockp);
2119 for (; freelist != NULL; freelist = next) {
2120 next = freelist->next;
2121 sfmmu_tsb_free(freelist->tsbinfop);
2122 }
2123 }
2124
2125 /*
2126 * Duplicate the translations of an as into another newas
2127 */
2128 /* ARGSUSED */
2129 int
hat_dup(struct hat * hat,struct hat * newhat,caddr_t addr,size_t len,uint_t flag)2130 hat_dup(struct hat *hat, struct hat *newhat, caddr_t addr, size_t len,
2131 uint_t flag)
2132 {
2133 sf_srd_t *srdp;
2134 sf_scd_t *scdp;
2135 int i;
2136 extern uint_t get_color_start(struct as *);
2137
2138 ASSERT(hat->sfmmu_xhat_provider == NULL);
2139 ASSERT((flag == 0) || (flag == HAT_DUP_ALL) || (flag == HAT_DUP_COW) ||
2140 (flag == HAT_DUP_SRD));
2141 ASSERT(hat != ksfmmup);
2142 ASSERT(newhat != ksfmmup);
2143 ASSERT(flag != HAT_DUP_ALL || hat->sfmmu_srdp == newhat->sfmmu_srdp);
2144
2145 if (flag == HAT_DUP_COW) {
2146 panic("hat_dup: HAT_DUP_COW not supported");
2147 }
2148
2149 if (flag == HAT_DUP_SRD && ((srdp = hat->sfmmu_srdp) != NULL)) {
2150 ASSERT(srdp->srd_evp != NULL);
2151 VN_HOLD(srdp->srd_evp);
2152 ASSERT(srdp->srd_refcnt > 0);
2153 newhat->sfmmu_srdp = srdp;
2154 atomic_inc_32((volatile uint_t *)&srdp->srd_refcnt);
2155 }
2156
2157 /*
2158 * HAT_DUP_ALL flag is used after as duplication is done.
2159 */
2160 if (flag == HAT_DUP_ALL && ((srdp = newhat->sfmmu_srdp) != NULL)) {
2161 ASSERT(newhat->sfmmu_srdp->srd_refcnt >= 2);
2162 newhat->sfmmu_rtteflags = hat->sfmmu_rtteflags;
2163 if (hat->sfmmu_flags & HAT_4MTEXT_FLAG) {
2164 newhat->sfmmu_flags |= HAT_4MTEXT_FLAG;
2165 }
2166
2167 /* check if need to join scd */
2168 if ((scdp = hat->sfmmu_scdp) != NULL &&
2169 newhat->sfmmu_scdp != scdp) {
2170 int ret;
2171 SF_RGNMAP_IS_SUBSET(&newhat->sfmmu_region_map,
2172 &scdp->scd_region_map, ret);
2173 ASSERT(ret);
2174 sfmmu_join_scd(scdp, newhat);
2175 ASSERT(newhat->sfmmu_scdp == scdp &&
2176 scdp->scd_refcnt >= 2);
2177 for (i = 0; i < max_mmu_page_sizes; i++) {
2178 newhat->sfmmu_ismttecnt[i] =
2179 hat->sfmmu_ismttecnt[i];
2180 newhat->sfmmu_scdismttecnt[i] =
2181 hat->sfmmu_scdismttecnt[i];
2182 }
2183 }
2184
2185 sfmmu_check_page_sizes(newhat, 1);
2186 }
2187
2188 if (flag == HAT_DUP_ALL && consistent_coloring == 0 &&
2189 update_proc_pgcolorbase_after_fork != 0) {
2190 hat->sfmmu_clrbin = get_color_start(hat->sfmmu_as);
2191 }
2192 return (0);
2193 }
2194
2195 void
hat_memload(struct hat * hat,caddr_t addr,struct page * pp,uint_t attr,uint_t flags)2196 hat_memload(struct hat *hat, caddr_t addr, struct page *pp,
2197 uint_t attr, uint_t flags)
2198 {
2199 hat_do_memload(hat, addr, pp, attr, flags,
2200 SFMMU_INVALID_SHMERID);
2201 }
2202
2203 void
hat_memload_region(struct hat * hat,caddr_t addr,struct page * pp,uint_t attr,uint_t flags,hat_region_cookie_t rcookie)2204 hat_memload_region(struct hat *hat, caddr_t addr, struct page *pp,
2205 uint_t attr, uint_t flags, hat_region_cookie_t rcookie)
2206 {
2207 uint_t rid;
2208 if (rcookie == HAT_INVALID_REGION_COOKIE ||
2209 hat->sfmmu_xhat_provider != NULL) {
2210 hat_do_memload(hat, addr, pp, attr, flags,
2211 SFMMU_INVALID_SHMERID);
2212 return;
2213 }
2214 rid = (uint_t)((uint64_t)rcookie);
2215 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
2216 hat_do_memload(hat, addr, pp, attr, flags, rid);
2217 }
2218
2219 /*
2220 * Set up addr to map to page pp with protection prot.
2221 * As an optimization we also load the TSB with the
2222 * corresponding tte but it is no big deal if the tte gets kicked out.
2223 */
2224 static void
hat_do_memload(struct hat * hat,caddr_t addr,struct page * pp,uint_t attr,uint_t flags,uint_t rid)2225 hat_do_memload(struct hat *hat, caddr_t addr, struct page *pp,
2226 uint_t attr, uint_t flags, uint_t rid)
2227 {
2228 tte_t tte;
2229
2230
2231 ASSERT(hat != NULL);
2232 ASSERT(PAGE_LOCKED(pp));
2233 ASSERT(!((uintptr_t)addr & MMU_PAGEOFFSET));
2234 ASSERT(!(flags & ~SFMMU_LOAD_ALLFLAG));
2235 ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR));
2236 SFMMU_VALIDATE_HMERID(hat, rid, addr, MMU_PAGESIZE);
2237
2238 if (PP_ISFREE(pp)) {
2239 panic("hat_memload: loading a mapping to free page %p",
2240 (void *)pp);
2241 }
2242
2243 if (hat->sfmmu_xhat_provider) {
2244 /* no regions for xhats */
2245 ASSERT(!SFMMU_IS_SHMERID_VALID(rid));
2246 XHAT_MEMLOAD(hat, addr, pp, attr, flags);
2247 return;
2248 }
2249
2250 ASSERT((hat == ksfmmup) || AS_LOCK_HELD(hat->sfmmu_as));
2251
2252 if (flags & ~SFMMU_LOAD_ALLFLAG)
2253 cmn_err(CE_NOTE, "hat_memload: unsupported flags %d",
2254 flags & ~SFMMU_LOAD_ALLFLAG);
2255
2256 if (hat->sfmmu_rmstat)
2257 hat_resvstat(MMU_PAGESIZE, hat->sfmmu_as, addr);
2258
2259 #if defined(SF_ERRATA_57)
2260 if ((hat != ksfmmup) && AS_TYPE_64BIT(hat->sfmmu_as) &&
2261 (addr < errata57_limit) && (attr & PROT_EXEC) &&
2262 !(flags & HAT_LOAD_SHARE)) {
2263 cmn_err(CE_WARN, "hat_memload: illegal attempt to make user "
2264 " page executable");
2265 attr &= ~PROT_EXEC;
2266 }
2267 #endif
2268
2269 sfmmu_memtte(&tte, pp->p_pagenum, attr, TTE8K);
2270 (void) sfmmu_tteload_array(hat, &tte, addr, &pp, flags, rid);
2271
2272 /*
2273 * Check TSB and TLB page sizes.
2274 */
2275 if ((flags & HAT_LOAD_SHARE) == 0) {
2276 sfmmu_check_page_sizes(hat, 1);
2277 }
2278 }
2279
2280 /*
2281 * hat_devload can be called to map real memory (e.g.
2282 * /dev/kmem) and even though hat_devload will determine pf is
2283 * for memory, it will be unable to get a shared lock on the
2284 * page (because someone else has it exclusively) and will
2285 * pass dp = NULL. If tteload doesn't get a non-NULL
2286 * page pointer it can't cache memory.
2287 */
2288 void
hat_devload(struct hat * hat,caddr_t addr,size_t len,pfn_t pfn,uint_t attr,int flags)2289 hat_devload(struct hat *hat, caddr_t addr, size_t len, pfn_t pfn,
2290 uint_t attr, int flags)
2291 {
2292 tte_t tte;
2293 struct page *pp = NULL;
2294 int use_lgpg = 0;
2295
2296 ASSERT(hat != NULL);
2297
2298 if (hat->sfmmu_xhat_provider) {
2299 XHAT_DEVLOAD(hat, addr, len, pfn, attr, flags);
2300 return;
2301 }
2302
2303 ASSERT(!(flags & ~SFMMU_LOAD_ALLFLAG));
2304 ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR));
2305 ASSERT((hat == ksfmmup) || AS_LOCK_HELD(hat->sfmmu_as));
2306 if (len == 0)
2307 panic("hat_devload: zero len");
2308 if (flags & ~SFMMU_LOAD_ALLFLAG)
2309 cmn_err(CE_NOTE, "hat_devload: unsupported flags %d",
2310 flags & ~SFMMU_LOAD_ALLFLAG);
2311
2312 #if defined(SF_ERRATA_57)
2313 if ((hat != ksfmmup) && AS_TYPE_64BIT(hat->sfmmu_as) &&
2314 (addr < errata57_limit) && (attr & PROT_EXEC) &&
2315 !(flags & HAT_LOAD_SHARE)) {
2316 cmn_err(CE_WARN, "hat_devload: illegal attempt to make user "
2317 " page executable");
2318 attr &= ~PROT_EXEC;
2319 }
2320 #endif
2321
2322 /*
2323 * If it's a memory page find its pp
2324 */
2325 if (!(flags & HAT_LOAD_NOCONSIST) && pf_is_memory(pfn)) {
2326 pp = page_numtopp_nolock(pfn);
2327 if (pp == NULL) {
2328 flags |= HAT_LOAD_NOCONSIST;
2329 } else {
2330 if (PP_ISFREE(pp)) {
2331 panic("hat_memload: loading "
2332 "a mapping to free page %p",
2333 (void *)pp);
2334 }
2335 if (!PAGE_LOCKED(pp) && !PP_ISNORELOC(pp)) {
2336 panic("hat_memload: loading a mapping "
2337 "to unlocked relocatable page %p",
2338 (void *)pp);
2339 }
2340 ASSERT(len == MMU_PAGESIZE);
2341 }
2342 }
2343
2344 if (hat->sfmmu_rmstat)
2345 hat_resvstat(len, hat->sfmmu_as, addr);
2346
2347 if (flags & HAT_LOAD_NOCONSIST) {
2348 attr |= SFMMU_UNCACHEVTTE;
2349 use_lgpg = 1;
2350 }
2351 if (!pf_is_memory(pfn)) {
2352 attr |= SFMMU_UNCACHEPTTE | HAT_NOSYNC;
2353 use_lgpg = 1;
2354 switch (attr & HAT_ORDER_MASK) {
2355 case HAT_STRICTORDER:
2356 case HAT_UNORDERED_OK:
2357 /*
2358 * we set the side effect bit for all non
2359 * memory mappings unless merging is ok
2360 */
2361 attr |= SFMMU_SIDEFFECT;
2362 break;
2363 case HAT_MERGING_OK:
2364 case HAT_LOADCACHING_OK:
2365 case HAT_STORECACHING_OK:
2366 break;
2367 default:
2368 panic("hat_devload: bad attr");
2369 break;
2370 }
2371 }
2372 while (len) {
2373 if (!use_lgpg) {
2374 sfmmu_memtte(&tte, pfn, attr, TTE8K);
2375 (void) sfmmu_tteload_array(hat, &tte, addr, &pp,
2376 flags, SFMMU_INVALID_SHMERID);
2377 len -= MMU_PAGESIZE;
2378 addr += MMU_PAGESIZE;
2379 pfn++;
2380 continue;
2381 }
2382 /*
2383 * try to use large pages, check va/pa alignments
2384 * Note that 32M/256M page sizes are not (yet) supported.
2385 */
2386 if ((len >= MMU_PAGESIZE4M) &&
2387 !((uintptr_t)addr & MMU_PAGEOFFSET4M) &&
2388 !(disable_large_pages & (1 << TTE4M)) &&
2389 !(mmu_ptob(pfn) & MMU_PAGEOFFSET4M)) {
2390 sfmmu_memtte(&tte, pfn, attr, TTE4M);
2391 (void) sfmmu_tteload_array(hat, &tte, addr, &pp,
2392 flags, SFMMU_INVALID_SHMERID);
2393 len -= MMU_PAGESIZE4M;
2394 addr += MMU_PAGESIZE4M;
2395 pfn += MMU_PAGESIZE4M / MMU_PAGESIZE;
2396 } else if ((len >= MMU_PAGESIZE512K) &&
2397 !((uintptr_t)addr & MMU_PAGEOFFSET512K) &&
2398 !(disable_large_pages & (1 << TTE512K)) &&
2399 !(mmu_ptob(pfn) & MMU_PAGEOFFSET512K)) {
2400 sfmmu_memtte(&tte, pfn, attr, TTE512K);
2401 (void) sfmmu_tteload_array(hat, &tte, addr, &pp,
2402 flags, SFMMU_INVALID_SHMERID);
2403 len -= MMU_PAGESIZE512K;
2404 addr += MMU_PAGESIZE512K;
2405 pfn += MMU_PAGESIZE512K / MMU_PAGESIZE;
2406 } else if ((len >= MMU_PAGESIZE64K) &&
2407 !((uintptr_t)addr & MMU_PAGEOFFSET64K) &&
2408 !(disable_large_pages & (1 << TTE64K)) &&
2409 !(mmu_ptob(pfn) & MMU_PAGEOFFSET64K)) {
2410 sfmmu_memtte(&tte, pfn, attr, TTE64K);
2411 (void) sfmmu_tteload_array(hat, &tte, addr, &pp,
2412 flags, SFMMU_INVALID_SHMERID);
2413 len -= MMU_PAGESIZE64K;
2414 addr += MMU_PAGESIZE64K;
2415 pfn += MMU_PAGESIZE64K / MMU_PAGESIZE;
2416 } else {
2417 sfmmu_memtte(&tte, pfn, attr, TTE8K);
2418 (void) sfmmu_tteload_array(hat, &tte, addr, &pp,
2419 flags, SFMMU_INVALID_SHMERID);
2420 len -= MMU_PAGESIZE;
2421 addr += MMU_PAGESIZE;
2422 pfn++;
2423 }
2424 }
2425
2426 /*
2427 * Check TSB and TLB page sizes.
2428 */
2429 if ((flags & HAT_LOAD_SHARE) == 0) {
2430 sfmmu_check_page_sizes(hat, 1);
2431 }
2432 }
2433
2434 void
hat_memload_array(struct hat * hat,caddr_t addr,size_t len,struct page ** pps,uint_t attr,uint_t flags)2435 hat_memload_array(struct hat *hat, caddr_t addr, size_t len,
2436 struct page **pps, uint_t attr, uint_t flags)
2437 {
2438 hat_do_memload_array(hat, addr, len, pps, attr, flags,
2439 SFMMU_INVALID_SHMERID);
2440 }
2441
2442 void
hat_memload_array_region(struct hat * hat,caddr_t addr,size_t len,struct page ** pps,uint_t attr,uint_t flags,hat_region_cookie_t rcookie)2443 hat_memload_array_region(struct hat *hat, caddr_t addr, size_t len,
2444 struct page **pps, uint_t attr, uint_t flags,
2445 hat_region_cookie_t rcookie)
2446 {
2447 uint_t rid;
2448 if (rcookie == HAT_INVALID_REGION_COOKIE ||
2449 hat->sfmmu_xhat_provider != NULL) {
2450 hat_do_memload_array(hat, addr, len, pps, attr, flags,
2451 SFMMU_INVALID_SHMERID);
2452 return;
2453 }
2454 rid = (uint_t)((uint64_t)rcookie);
2455 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
2456 hat_do_memload_array(hat, addr, len, pps, attr, flags, rid);
2457 }
2458
2459 /*
2460 * Map the largest extend possible out of the page array. The array may NOT
2461 * be in order. The largest possible mapping a page can have
2462 * is specified in the p_szc field. The p_szc field
2463 * cannot change as long as there any mappings (large or small)
2464 * to any of the pages that make up the large page. (ie. any
2465 * promotion/demotion of page size is not up to the hat but up to
2466 * the page free list manager). The array
2467 * should consist of properly aligned contigous pages that are
2468 * part of a big page for a large mapping to be created.
2469 */
2470 static void
hat_do_memload_array(struct hat * hat,caddr_t addr,size_t len,struct page ** pps,uint_t attr,uint_t flags,uint_t rid)2471 hat_do_memload_array(struct hat *hat, caddr_t addr, size_t len,
2472 struct page **pps, uint_t attr, uint_t flags, uint_t rid)
2473 {
2474 int ttesz;
2475 size_t mapsz;
2476 pgcnt_t numpg, npgs;
2477 tte_t tte;
2478 page_t *pp;
2479 uint_t large_pages_disable;
2480
2481 ASSERT(!((uintptr_t)addr & MMU_PAGEOFFSET));
2482 SFMMU_VALIDATE_HMERID(hat, rid, addr, len);
2483
2484 if (hat->sfmmu_xhat_provider) {
2485 ASSERT(!SFMMU_IS_SHMERID_VALID(rid));
2486 XHAT_MEMLOAD_ARRAY(hat, addr, len, pps, attr, flags);
2487 return;
2488 }
2489
2490 if (hat->sfmmu_rmstat)
2491 hat_resvstat(len, hat->sfmmu_as, addr);
2492
2493 #if defined(SF_ERRATA_57)
2494 if ((hat != ksfmmup) && AS_TYPE_64BIT(hat->sfmmu_as) &&
2495 (addr < errata57_limit) && (attr & PROT_EXEC) &&
2496 !(flags & HAT_LOAD_SHARE)) {
2497 cmn_err(CE_WARN, "hat_memload_array: illegal attempt to make "
2498 "user page executable");
2499 attr &= ~PROT_EXEC;
2500 }
2501 #endif
2502
2503 /* Get number of pages */
2504 npgs = len >> MMU_PAGESHIFT;
2505
2506 if (flags & HAT_LOAD_SHARE) {
2507 large_pages_disable = disable_ism_large_pages;
2508 } else {
2509 large_pages_disable = disable_large_pages;
2510 }
2511
2512 if (npgs < NHMENTS || large_pages_disable == LARGE_PAGES_OFF) {
2513 sfmmu_memload_batchsmall(hat, addr, pps, attr, flags, npgs,
2514 rid);
2515 return;
2516 }
2517
2518 while (npgs >= NHMENTS) {
2519 pp = *pps;
2520 for (ttesz = pp->p_szc; ttesz != TTE8K; ttesz--) {
2521 /*
2522 * Check if this page size is disabled.
2523 */
2524 if (large_pages_disable & (1 << ttesz))
2525 continue;
2526
2527 numpg = TTEPAGES(ttesz);
2528 mapsz = numpg << MMU_PAGESHIFT;
2529 if ((npgs >= numpg) &&
2530 IS_P2ALIGNED(addr, mapsz) &&
2531 IS_P2ALIGNED(pp->p_pagenum, numpg)) {
2532 /*
2533 * At this point we have enough pages and
2534 * we know the virtual address and the pfn
2535 * are properly aligned. We still need
2536 * to check for physical contiguity but since
2537 * it is very likely that this is the case
2538 * we will assume they are so and undo
2539 * the request if necessary. It would
2540 * be great if we could get a hint flag
2541 * like HAT_CONTIG which would tell us
2542 * the pages are contigous for sure.
2543 */
2544 sfmmu_memtte(&tte, (*pps)->p_pagenum,
2545 attr, ttesz);
2546 if (!sfmmu_tteload_array(hat, &tte, addr,
2547 pps, flags, rid)) {
2548 break;
2549 }
2550 }
2551 }
2552 if (ttesz == TTE8K) {
2553 /*
2554 * We were not able to map array using a large page
2555 * batch a hmeblk or fraction at a time.
2556 */
2557 numpg = ((uintptr_t)addr >> MMU_PAGESHIFT)
2558 & (NHMENTS-1);
2559 numpg = NHMENTS - numpg;
2560 ASSERT(numpg <= npgs);
2561 mapsz = numpg * MMU_PAGESIZE;
2562 sfmmu_memload_batchsmall(hat, addr, pps, attr, flags,
2563 numpg, rid);
2564 }
2565 addr += mapsz;
2566 npgs -= numpg;
2567 pps += numpg;
2568 }
2569
2570 if (npgs) {
2571 sfmmu_memload_batchsmall(hat, addr, pps, attr, flags, npgs,
2572 rid);
2573 }
2574
2575 /*
2576 * Check TSB and TLB page sizes.
2577 */
2578 if ((flags & HAT_LOAD_SHARE) == 0) {
2579 sfmmu_check_page_sizes(hat, 1);
2580 }
2581 }
2582
2583 /*
2584 * Function tries to batch 8K pages into the same hme blk.
2585 */
2586 static void
sfmmu_memload_batchsmall(struct hat * hat,caddr_t vaddr,page_t ** pps,uint_t attr,uint_t flags,pgcnt_t npgs,uint_t rid)2587 sfmmu_memload_batchsmall(struct hat *hat, caddr_t vaddr, page_t **pps,
2588 uint_t attr, uint_t flags, pgcnt_t npgs, uint_t rid)
2589 {
2590 tte_t tte;
2591 page_t *pp;
2592 struct hmehash_bucket *hmebp;
2593 struct hme_blk *hmeblkp;
2594 int index;
2595
2596 while (npgs) {
2597 /*
2598 * Acquire the hash bucket.
2599 */
2600 hmebp = sfmmu_tteload_acquire_hashbucket(hat, vaddr, TTE8K,
2601 rid);
2602 ASSERT(hmebp);
2603
2604 /*
2605 * Find the hment block.
2606 */
2607 hmeblkp = sfmmu_tteload_find_hmeblk(hat, hmebp, vaddr,
2608 TTE8K, flags, rid);
2609 ASSERT(hmeblkp);
2610
2611 do {
2612 /*
2613 * Make the tte.
2614 */
2615 pp = *pps;
2616 sfmmu_memtte(&tte, pp->p_pagenum, attr, TTE8K);
2617
2618 /*
2619 * Add the translation.
2620 */
2621 (void) sfmmu_tteload_addentry(hat, hmeblkp, &tte,
2622 vaddr, pps, flags, rid);
2623
2624 /*
2625 * Goto next page.
2626 */
2627 pps++;
2628 npgs--;
2629
2630 /*
2631 * Goto next address.
2632 */
2633 vaddr += MMU_PAGESIZE;
2634
2635 /*
2636 * Don't crossover into a different hmentblk.
2637 */
2638 index = (int)(((uintptr_t)vaddr >> MMU_PAGESHIFT) &
2639 (NHMENTS-1));
2640
2641 } while (index != 0 && npgs != 0);
2642
2643 /*
2644 * Release the hash bucket.
2645 */
2646
2647 sfmmu_tteload_release_hashbucket(hmebp);
2648 }
2649 }
2650
2651 /*
2652 * Construct a tte for a page:
2653 *
2654 * tte_valid = 1
2655 * tte_size2 = size & TTE_SZ2_BITS (Panther and Olympus-C only)
2656 * tte_size = size
2657 * tte_nfo = attr & HAT_NOFAULT
2658 * tte_ie = attr & HAT_STRUCTURE_LE
2659 * tte_hmenum = hmenum
2660 * tte_pahi = pp->p_pagenum >> TTE_PASHIFT;
2661 * tte_palo = pp->p_pagenum & TTE_PALOMASK;
2662 * tte_ref = 1 (optimization)
2663 * tte_wr_perm = attr & PROT_WRITE;
2664 * tte_no_sync = attr & HAT_NOSYNC
2665 * tte_lock = attr & SFMMU_LOCKTTE
2666 * tte_cp = !(attr & SFMMU_UNCACHEPTTE)
2667 * tte_cv = !(attr & SFMMU_UNCACHEVTTE)
2668 * tte_e = attr & SFMMU_SIDEFFECT
2669 * tte_priv = !(attr & PROT_USER)
2670 * tte_hwwr = if nosync is set and it is writable we set the mod bit (opt)
2671 * tte_glb = 0
2672 */
2673 void
sfmmu_memtte(tte_t * ttep,pfn_t pfn,uint_t attr,int tte_sz)2674 sfmmu_memtte(tte_t *ttep, pfn_t pfn, uint_t attr, int tte_sz)
2675 {
2676 ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR));
2677
2678 ttep->tte_inthi = MAKE_TTE_INTHI(pfn, attr, tte_sz, 0 /* hmenum */);
2679 ttep->tte_intlo = MAKE_TTE_INTLO(pfn, attr, tte_sz, 0 /* hmenum */);
2680
2681 if (TTE_IS_NOSYNC(ttep)) {
2682 TTE_SET_REF(ttep);
2683 if (TTE_IS_WRITABLE(ttep)) {
2684 TTE_SET_MOD(ttep);
2685 }
2686 }
2687 if (TTE_IS_NFO(ttep) && TTE_IS_EXECUTABLE(ttep)) {
2688 panic("sfmmu_memtte: can't set both NFO and EXEC bits");
2689 }
2690 }
2691
2692 /*
2693 * This function will add a translation to the hme_blk and allocate the
2694 * hme_blk if one does not exist.
2695 * If a page structure is specified then it will add the
2696 * corresponding hment to the mapping list.
2697 * It will also update the hmenum field for the tte.
2698 *
2699 * Currently this function is only used for kernel mappings.
2700 * So pass invalid region to sfmmu_tteload_array().
2701 */
2702 void
sfmmu_tteload(struct hat * sfmmup,tte_t * ttep,caddr_t vaddr,page_t * pp,uint_t flags)2703 sfmmu_tteload(struct hat *sfmmup, tte_t *ttep, caddr_t vaddr, page_t *pp,
2704 uint_t flags)
2705 {
2706 ASSERT(sfmmup == ksfmmup);
2707 (void) sfmmu_tteload_array(sfmmup, ttep, vaddr, &pp, flags,
2708 SFMMU_INVALID_SHMERID);
2709 }
2710
2711 /*
2712 * Load (ttep != NULL) or unload (ttep == NULL) one entry in the TSB.
2713 * Assumes that a particular page size may only be resident in one TSB.
2714 */
2715 static void
sfmmu_mod_tsb(sfmmu_t * sfmmup,caddr_t vaddr,tte_t * ttep,int ttesz)2716 sfmmu_mod_tsb(sfmmu_t *sfmmup, caddr_t vaddr, tte_t *ttep, int ttesz)
2717 {
2718 struct tsb_info *tsbinfop = NULL;
2719 uint64_t tag;
2720 struct tsbe *tsbe_addr;
2721 uint64_t tsb_base;
2722 uint_t tsb_size;
2723 int vpshift = MMU_PAGESHIFT;
2724 int phys = 0;
2725
2726 if (sfmmup == ksfmmup) { /* No support for 32/256M ksfmmu pages */
2727 phys = ktsb_phys;
2728 if (ttesz >= TTE4M) {
2729 #ifndef sun4v
2730 ASSERT((ttesz != TTE32M) && (ttesz != TTE256M));
2731 #endif
2732 tsb_base = (phys)? ktsb4m_pbase : (uint64_t)ktsb4m_base;
2733 tsb_size = ktsb4m_szcode;
2734 } else {
2735 tsb_base = (phys)? ktsb_pbase : (uint64_t)ktsb_base;
2736 tsb_size = ktsb_szcode;
2737 }
2738 } else {
2739 SFMMU_GET_TSBINFO(tsbinfop, sfmmup, ttesz);
2740
2741 /*
2742 * If there isn't a TSB for this page size, or the TSB is
2743 * swapped out, there is nothing to do. Note that the latter
2744 * case seems impossible but can occur if hat_pageunload()
2745 * is called on an ISM mapping while the process is swapped
2746 * out.
2747 */
2748 if (tsbinfop == NULL || (tsbinfop->tsb_flags & TSB_SWAPPED))
2749 return;
2750
2751 /*
2752 * If another thread is in the middle of relocating a TSB
2753 * we can't unload the entry so set a flag so that the
2754 * TSB will be flushed before it can be accessed by the
2755 * process.
2756 */
2757 if ((tsbinfop->tsb_flags & TSB_RELOC_FLAG) != 0) {
2758 if (ttep == NULL)
2759 tsbinfop->tsb_flags |= TSB_FLUSH_NEEDED;
2760 return;
2761 }
2762 #if defined(UTSB_PHYS)
2763 phys = 1;
2764 tsb_base = (uint64_t)tsbinfop->tsb_pa;
2765 #else
2766 tsb_base = (uint64_t)tsbinfop->tsb_va;
2767 #endif
2768 tsb_size = tsbinfop->tsb_szc;
2769 }
2770 if (ttesz >= TTE4M)
2771 vpshift = MMU_PAGESHIFT4M;
2772
2773 tsbe_addr = sfmmu_get_tsbe(tsb_base, vaddr, vpshift, tsb_size);
2774 tag = sfmmu_make_tsbtag(vaddr);
2775
2776 if (ttep == NULL) {
2777 sfmmu_unload_tsbe(tsbe_addr, tag, phys);
2778 } else {
2779 if (ttesz >= TTE4M) {
2780 SFMMU_STAT(sf_tsb_load4m);
2781 } else {
2782 SFMMU_STAT(sf_tsb_load8k);
2783 }
2784
2785 sfmmu_load_tsbe(tsbe_addr, tag, ttep, phys);
2786 }
2787 }
2788
2789 /*
2790 * Unmap all entries from [start, end) matching the given page size.
2791 *
2792 * This function is used primarily to unmap replicated 64K or 512K entries
2793 * from the TSB that are inserted using the base page size TSB pointer, but
2794 * it may also be called to unmap a range of addresses from the TSB.
2795 */
2796 void
sfmmu_unload_tsb_range(sfmmu_t * sfmmup,caddr_t start,caddr_t end,int ttesz)2797 sfmmu_unload_tsb_range(sfmmu_t *sfmmup, caddr_t start, caddr_t end, int ttesz)
2798 {
2799 struct tsb_info *tsbinfop;
2800 uint64_t tag;
2801 struct tsbe *tsbe_addr;
2802 caddr_t vaddr;
2803 uint64_t tsb_base;
2804 int vpshift, vpgsz;
2805 uint_t tsb_size;
2806 int phys = 0;
2807
2808 /*
2809 * Assumptions:
2810 * If ttesz == 8K, 64K or 512K, we walk through the range 8K
2811 * at a time shooting down any valid entries we encounter.
2812 *
2813 * If ttesz >= 4M we walk the range 4M at a time shooting
2814 * down any valid mappings we find.
2815 */
2816 if (sfmmup == ksfmmup) {
2817 phys = ktsb_phys;
2818 if (ttesz >= TTE4M) {
2819 #ifndef sun4v
2820 ASSERT((ttesz != TTE32M) && (ttesz != TTE256M));
2821 #endif
2822 tsb_base = (phys)? ktsb4m_pbase : (uint64_t)ktsb4m_base;
2823 tsb_size = ktsb4m_szcode;
2824 } else {
2825 tsb_base = (phys)? ktsb_pbase : (uint64_t)ktsb_base;
2826 tsb_size = ktsb_szcode;
2827 }
2828 } else {
2829 SFMMU_GET_TSBINFO(tsbinfop, sfmmup, ttesz);
2830
2831 /*
2832 * If there isn't a TSB for this page size, or the TSB is
2833 * swapped out, there is nothing to do. Note that the latter
2834 * case seems impossible but can occur if hat_pageunload()
2835 * is called on an ISM mapping while the process is swapped
2836 * out.
2837 */
2838 if (tsbinfop == NULL || (tsbinfop->tsb_flags & TSB_SWAPPED))
2839 return;
2840
2841 /*
2842 * If another thread is in the middle of relocating a TSB
2843 * we can't unload the entry so set a flag so that the
2844 * TSB will be flushed before it can be accessed by the
2845 * process.
2846 */
2847 if ((tsbinfop->tsb_flags & TSB_RELOC_FLAG) != 0) {
2848 tsbinfop->tsb_flags |= TSB_FLUSH_NEEDED;
2849 return;
2850 }
2851 #if defined(UTSB_PHYS)
2852 phys = 1;
2853 tsb_base = (uint64_t)tsbinfop->tsb_pa;
2854 #else
2855 tsb_base = (uint64_t)tsbinfop->tsb_va;
2856 #endif
2857 tsb_size = tsbinfop->tsb_szc;
2858 }
2859 if (ttesz >= TTE4M) {
2860 vpshift = MMU_PAGESHIFT4M;
2861 vpgsz = MMU_PAGESIZE4M;
2862 } else {
2863 vpshift = MMU_PAGESHIFT;
2864 vpgsz = MMU_PAGESIZE;
2865 }
2866
2867 for (vaddr = start; vaddr < end; vaddr += vpgsz) {
2868 tag = sfmmu_make_tsbtag(vaddr);
2869 tsbe_addr = sfmmu_get_tsbe(tsb_base, vaddr, vpshift, tsb_size);
2870 sfmmu_unload_tsbe(tsbe_addr, tag, phys);
2871 }
2872 }
2873
2874 /*
2875 * Select the optimum TSB size given the number of mappings
2876 * that need to be cached.
2877 */
2878 static int
sfmmu_select_tsb_szc(pgcnt_t pgcnt)2879 sfmmu_select_tsb_szc(pgcnt_t pgcnt)
2880 {
2881 int szc = 0;
2882
2883 #ifdef DEBUG
2884 if (tsb_grow_stress) {
2885 uint32_t randval = (uint32_t)gettick() >> 4;
2886 return (randval % (tsb_max_growsize + 1));
2887 }
2888 #endif /* DEBUG */
2889
2890 while ((szc < tsb_max_growsize) && (pgcnt > SFMMU_RSS_TSBSIZE(szc)))
2891 szc++;
2892 return (szc);
2893 }
2894
2895 /*
2896 * This function will add a translation to the hme_blk and allocate the
2897 * hme_blk if one does not exist.
2898 * If a page structure is specified then it will add the
2899 * corresponding hment to the mapping list.
2900 * It will also update the hmenum field for the tte.
2901 * Furthermore, it attempts to create a large page translation
2902 * for <addr,hat> at page array pps. It assumes addr and first
2903 * pp is correctly aligned. It returns 0 if successful and 1 otherwise.
2904 */
2905 static int
sfmmu_tteload_array(sfmmu_t * sfmmup,tte_t * ttep,caddr_t vaddr,page_t ** pps,uint_t flags,uint_t rid)2906 sfmmu_tteload_array(sfmmu_t *sfmmup, tte_t *ttep, caddr_t vaddr,
2907 page_t **pps, uint_t flags, uint_t rid)
2908 {
2909 struct hmehash_bucket *hmebp;
2910 struct hme_blk *hmeblkp;
2911 int ret;
2912 uint_t size;
2913
2914 /*
2915 * Get mapping size.
2916 */
2917 size = TTE_CSZ(ttep);
2918 ASSERT(!((uintptr_t)vaddr & TTE_PAGE_OFFSET(size)));
2919
2920 /*
2921 * Acquire the hash bucket.
2922 */
2923 hmebp = sfmmu_tteload_acquire_hashbucket(sfmmup, vaddr, size, rid);
2924 ASSERT(hmebp);
2925
2926 /*
2927 * Find the hment block.
2928 */
2929 hmeblkp = sfmmu_tteload_find_hmeblk(sfmmup, hmebp, vaddr, size, flags,
2930 rid);
2931 ASSERT(hmeblkp);
2932
2933 /*
2934 * Add the translation.
2935 */
2936 ret = sfmmu_tteload_addentry(sfmmup, hmeblkp, ttep, vaddr, pps, flags,
2937 rid);
2938
2939 /*
2940 * Release the hash bucket.
2941 */
2942 sfmmu_tteload_release_hashbucket(hmebp);
2943
2944 return (ret);
2945 }
2946
2947 /*
2948 * Function locks and returns a pointer to the hash bucket for vaddr and size.
2949 */
2950 static struct hmehash_bucket *
sfmmu_tteload_acquire_hashbucket(sfmmu_t * sfmmup,caddr_t vaddr,int size,uint_t rid)2951 sfmmu_tteload_acquire_hashbucket(sfmmu_t *sfmmup, caddr_t vaddr, int size,
2952 uint_t rid)
2953 {
2954 struct hmehash_bucket *hmebp;
2955 int hmeshift;
2956 void *htagid = sfmmutohtagid(sfmmup, rid);
2957
2958 ASSERT(htagid != NULL);
2959
2960 hmeshift = HME_HASH_SHIFT(size);
2961
2962 hmebp = HME_HASH_FUNCTION(htagid, vaddr, hmeshift);
2963
2964 SFMMU_HASH_LOCK(hmebp);
2965
2966 return (hmebp);
2967 }
2968
2969 /*
2970 * Function returns a pointer to an hmeblk in the hash bucket, hmebp. If the
2971 * hmeblk doesn't exists for the [sfmmup, vaddr & size] signature, a hmeblk is
2972 * allocated.
2973 */
2974 static struct hme_blk *
sfmmu_tteload_find_hmeblk(sfmmu_t * sfmmup,struct hmehash_bucket * hmebp,caddr_t vaddr,uint_t size,uint_t flags,uint_t rid)2975 sfmmu_tteload_find_hmeblk(sfmmu_t *sfmmup, struct hmehash_bucket *hmebp,
2976 caddr_t vaddr, uint_t size, uint_t flags, uint_t rid)
2977 {
2978 hmeblk_tag hblktag;
2979 int hmeshift;
2980 struct hme_blk *hmeblkp, *pr_hblk, *list = NULL;
2981
2982 SFMMU_VALIDATE_HMERID(sfmmup, rid, vaddr, TTEBYTES(size));
2983
2984 hblktag.htag_id = sfmmutohtagid(sfmmup, rid);
2985 ASSERT(hblktag.htag_id != NULL);
2986 hmeshift = HME_HASH_SHIFT(size);
2987 hblktag.htag_bspage = HME_HASH_BSPAGE(vaddr, hmeshift);
2988 hblktag.htag_rehash = HME_HASH_REHASH(size);
2989 hblktag.htag_rid = rid;
2990
2991 ttearray_realloc:
2992
2993 HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, &list);
2994
2995 /*
2996 * We block until hblk_reserve_lock is released; it's held by
2997 * the thread, temporarily using hblk_reserve, until hblk_reserve is
2998 * replaced by a hblk from sfmmu8_cache.
2999 */
3000 if (hmeblkp == (struct hme_blk *)hblk_reserve &&
3001 hblk_reserve_thread != curthread) {
3002 SFMMU_HASH_UNLOCK(hmebp);
3003 mutex_enter(&hblk_reserve_lock);
3004 mutex_exit(&hblk_reserve_lock);
3005 SFMMU_STAT(sf_hblk_reserve_hit);
3006 SFMMU_HASH_LOCK(hmebp);
3007 goto ttearray_realloc;
3008 }
3009
3010 if (hmeblkp == NULL) {
3011 hmeblkp = sfmmu_hblk_alloc(sfmmup, vaddr, hmebp, size,
3012 hblktag, flags, rid);
3013 ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || hmeblkp->hblk_shared);
3014 ASSERT(SFMMU_IS_SHMERID_VALID(rid) || !hmeblkp->hblk_shared);
3015 } else {
3016 /*
3017 * It is possible for 8k and 64k hblks to collide since they
3018 * have the same rehash value. This is because we
3019 * lazily free hblks and 8K/64K blks could be lingering.
3020 * If we find size mismatch we free the block and & try again.
3021 */
3022 if (get_hblk_ttesz(hmeblkp) != size) {
3023 ASSERT(!hmeblkp->hblk_vcnt);
3024 ASSERT(!hmeblkp->hblk_hmecnt);
3025 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
3026 &list, 0);
3027 goto ttearray_realloc;
3028 }
3029 if (hmeblkp->hblk_shw_bit) {
3030 /*
3031 * if the hblk was previously used as a shadow hblk then
3032 * we will change it to a normal hblk
3033 */
3034 ASSERT(!hmeblkp->hblk_shared);
3035 if (hmeblkp->hblk_shw_mask) {
3036 sfmmu_shadow_hcleanup(sfmmup, hmeblkp, hmebp);
3037 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));
3038 goto ttearray_realloc;
3039 } else {
3040 hmeblkp->hblk_shw_bit = 0;
3041 }
3042 }
3043 SFMMU_STAT(sf_hblk_hit);
3044 }
3045
3046 /*
3047 * hat_memload() should never call kmem_cache_free() for kernel hmeblks;
3048 * see block comment showing the stacktrace in sfmmu_hblk_alloc();
3049 * set the flag parameter to 1 so that sfmmu_hblks_list_purge() will
3050 * just add these hmeblks to the per-cpu pending queue.
3051 */
3052 sfmmu_hblks_list_purge(&list, 1);
3053
3054 ASSERT(get_hblk_ttesz(hmeblkp) == size);
3055 ASSERT(!hmeblkp->hblk_shw_bit);
3056 ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || hmeblkp->hblk_shared);
3057 ASSERT(SFMMU_IS_SHMERID_VALID(rid) || !hmeblkp->hblk_shared);
3058 ASSERT(hmeblkp->hblk_tag.htag_rid == rid);
3059
3060 return (hmeblkp);
3061 }
3062
3063 /*
3064 * Function adds a tte entry into the hmeblk. It returns 0 if successful and 1
3065 * otherwise.
3066 */
3067 static int
sfmmu_tteload_addentry(sfmmu_t * sfmmup,struct hme_blk * hmeblkp,tte_t * ttep,caddr_t vaddr,page_t ** pps,uint_t flags,uint_t rid)3068 sfmmu_tteload_addentry(sfmmu_t *sfmmup, struct hme_blk *hmeblkp, tte_t *ttep,
3069 caddr_t vaddr, page_t **pps, uint_t flags, uint_t rid)
3070 {
3071 page_t *pp = *pps;
3072 int hmenum, size, remap;
3073 tte_t tteold, flush_tte;
3074 #ifdef DEBUG
3075 tte_t orig_old;
3076 #endif /* DEBUG */
3077 struct sf_hment *sfhme;
3078 kmutex_t *pml, *pmtx;
3079 hatlock_t *hatlockp;
3080 int myflt;
3081
3082 /*
3083 * remove this panic when we decide to let user virtual address
3084 * space be >= USERLIMIT.
3085 */
3086 if (!TTE_IS_PRIVILEGED(ttep) && vaddr >= (caddr_t)USERLIMIT)
3087 panic("user addr %p in kernel space", (void *)vaddr);
3088 #if defined(TTE_IS_GLOBAL)
3089 if (TTE_IS_GLOBAL(ttep))
3090 panic("sfmmu_tteload: creating global tte");
3091 #endif
3092
3093 #ifdef DEBUG
3094 if (pf_is_memory(sfmmu_ttetopfn(ttep, vaddr)) &&
3095 !TTE_IS_PCACHEABLE(ttep) && !sfmmu_allow_nc_trans)
3096 panic("sfmmu_tteload: non cacheable memory tte");
3097 #endif /* DEBUG */
3098
3099 /* don't simulate dirty bit for writeable ISM/DISM mappings */
3100 if ((flags & HAT_LOAD_SHARE) && TTE_IS_WRITABLE(ttep)) {
3101 TTE_SET_REF(ttep);
3102 TTE_SET_MOD(ttep);
3103 }
3104
3105 if ((flags & HAT_LOAD_SHARE) || !TTE_IS_REF(ttep) ||
3106 !TTE_IS_MOD(ttep)) {
3107 /*
3108 * Don't load TSB for dummy as in ISM. Also don't preload
3109 * the TSB if the TTE isn't writable since we're likely to
3110 * fault on it again -- preloading can be fairly expensive.
3111 */
3112 flags |= SFMMU_NO_TSBLOAD;
3113 }
3114
3115 size = TTE_CSZ(ttep);
3116 switch (size) {
3117 case TTE8K:
3118 SFMMU_STAT(sf_tteload8k);
3119 break;
3120 case TTE64K:
3121 SFMMU_STAT(sf_tteload64k);
3122 break;
3123 case TTE512K:
3124 SFMMU_STAT(sf_tteload512k);
3125 break;
3126 case TTE4M:
3127 SFMMU_STAT(sf_tteload4m);
3128 break;
3129 case (TTE32M):
3130 SFMMU_STAT(sf_tteload32m);
3131 ASSERT(mmu_page_sizes == max_mmu_page_sizes);
3132 break;
3133 case (TTE256M):
3134 SFMMU_STAT(sf_tteload256m);
3135 ASSERT(mmu_page_sizes == max_mmu_page_sizes);
3136 break;
3137 }
3138
3139 ASSERT(!((uintptr_t)vaddr & TTE_PAGE_OFFSET(size)));
3140 SFMMU_VALIDATE_HMERID(sfmmup, rid, vaddr, TTEBYTES(size));
3141 ASSERT(!SFMMU_IS_SHMERID_VALID(rid) || hmeblkp->hblk_shared);
3142 ASSERT(SFMMU_IS_SHMERID_VALID(rid) || !hmeblkp->hblk_shared);
3143
3144 HBLKTOHME_IDX(sfhme, hmeblkp, vaddr, hmenum);
3145
3146 /*
3147 * Need to grab mlist lock here so that pageunload
3148 * will not change tte behind us.
3149 */
3150 if (pp) {
3151 pml = sfmmu_mlist_enter(pp);
3152 }
3153
3154 sfmmu_copytte(&sfhme->hme_tte, &tteold);
3155 /*
3156 * Look for corresponding hment and if valid verify
3157 * pfns are equal.
3158 */
3159 remap = TTE_IS_VALID(&tteold);
3160 if (remap) {
3161 pfn_t new_pfn, old_pfn;
3162
3163 old_pfn = TTE_TO_PFN(vaddr, &tteold);
3164 new_pfn = TTE_TO_PFN(vaddr, ttep);
3165
3166 if (flags & HAT_LOAD_REMAP) {
3167 /* make sure we are remapping same type of pages */
3168 if (pf_is_memory(old_pfn) != pf_is_memory(new_pfn)) {
3169 panic("sfmmu_tteload - tte remap io<->memory");
3170 }
3171 if (old_pfn != new_pfn &&
3172 (pp != NULL || sfhme->hme_page != NULL)) {
3173 panic("sfmmu_tteload - tte remap pp != NULL");
3174 }
3175 } else if (old_pfn != new_pfn) {
3176 panic("sfmmu_tteload - tte remap, hmeblkp 0x%p",
3177 (void *)hmeblkp);
3178 }
3179 ASSERT(TTE_CSZ(&tteold) == TTE_CSZ(ttep));
3180 }
3181
3182 if (pp) {
3183 if (size == TTE8K) {
3184 #ifdef VAC
3185 /*
3186 * Handle VAC consistency
3187 */
3188 if (!remap && (cache & CACHE_VAC) && !PP_ISNC(pp)) {
3189 sfmmu_vac_conflict(sfmmup, vaddr, pp);
3190 }
3191 #endif
3192
3193 if (TTE_IS_WRITABLE(ttep) && PP_ISRO(pp)) {
3194 pmtx = sfmmu_page_enter(pp);
3195 PP_CLRRO(pp);
3196 sfmmu_page_exit(pmtx);
3197 } else if (!PP_ISMAPPED(pp) &&
3198 (!TTE_IS_WRITABLE(ttep)) && !(PP_ISMOD(pp))) {
3199 pmtx = sfmmu_page_enter(pp);
3200 if (!(PP_ISMOD(pp))) {
3201 PP_SETRO(pp);
3202 }
3203 sfmmu_page_exit(pmtx);
3204 }
3205
3206 } else if (sfmmu_pagearray_setup(vaddr, pps, ttep, remap)) {
3207 /*
3208 * sfmmu_pagearray_setup failed so return
3209 */
3210 sfmmu_mlist_exit(pml);
3211 return (1);
3212 }
3213 }
3214
3215 /*
3216 * Make sure hment is not on a mapping list.
3217 */
3218 ASSERT(remap || (sfhme->hme_page == NULL));
3219
3220 /* if it is not a remap then hme->next better be NULL */
3221 ASSERT((!remap) ? sfhme->hme_next == NULL : 1);
3222
3223 if (flags & HAT_LOAD_LOCK) {
3224 if ((hmeblkp->hblk_lckcnt + 1) >= MAX_HBLK_LCKCNT) {
3225 panic("too high lckcnt-hmeblk %p",
3226 (void *)hmeblkp);
3227 }
3228 atomic_inc_32(&hmeblkp->hblk_lckcnt);
3229
3230 HBLK_STACK_TRACE(hmeblkp, HBLK_LOCK);
3231 }
3232
3233 #ifdef VAC
3234 if (pp && PP_ISNC(pp)) {
3235 /*
3236 * If the physical page is marked to be uncacheable, like
3237 * by a vac conflict, make sure the new mapping is also
3238 * uncacheable.
3239 */
3240 TTE_CLR_VCACHEABLE(ttep);
3241 ASSERT(PP_GET_VCOLOR(pp) == NO_VCOLOR);
3242 }
3243 #endif
3244 ttep->tte_hmenum = hmenum;
3245
3246 #ifdef DEBUG
3247 orig_old = tteold;
3248 #endif /* DEBUG */
3249
3250 while (sfmmu_modifytte_try(&tteold, ttep, &sfhme->hme_tte) < 0) {
3251 if ((sfmmup == KHATID) &&
3252 (flags & (HAT_LOAD_LOCK | HAT_LOAD_REMAP))) {
3253 sfmmu_copytte(&sfhme->hme_tte, &tteold);
3254 }
3255 #ifdef DEBUG
3256 chk_tte(&orig_old, &tteold, ttep, hmeblkp);
3257 #endif /* DEBUG */
3258 }
3259 ASSERT(TTE_IS_VALID(&sfhme->hme_tte));
3260
3261 if (!TTE_IS_VALID(&tteold)) {
3262
3263 atomic_inc_16(&hmeblkp->hblk_vcnt);
3264 if (rid == SFMMU_INVALID_SHMERID) {
3265 atomic_inc_ulong(&sfmmup->sfmmu_ttecnt[size]);
3266 } else {
3267 sf_srd_t *srdp = sfmmup->sfmmu_srdp;
3268 sf_region_t *rgnp = srdp->srd_hmergnp[rid];
3269 /*
3270 * We already accounted for region ttecnt's in sfmmu
3271 * during hat_join_region() processing. Here we
3272 * only update ttecnt's in region struture.
3273 */
3274 atomic_inc_ulong(&rgnp->rgn_ttecnt[size]);
3275 }
3276 }
3277
3278 myflt = (astosfmmu(curthread->t_procp->p_as) == sfmmup);
3279 if (size > TTE8K && (flags & HAT_LOAD_SHARE) == 0 &&
3280 sfmmup != ksfmmup) {
3281 uchar_t tteflag = 1 << size;
3282 if (rid == SFMMU_INVALID_SHMERID) {
3283 if (!(sfmmup->sfmmu_tteflags & tteflag)) {
3284 hatlockp = sfmmu_hat_enter(sfmmup);
3285 sfmmup->sfmmu_tteflags |= tteflag;
3286 sfmmu_hat_exit(hatlockp);
3287 }
3288 } else if (!(sfmmup->sfmmu_rtteflags & tteflag)) {
3289 hatlockp = sfmmu_hat_enter(sfmmup);
3290 sfmmup->sfmmu_rtteflags |= tteflag;
3291 sfmmu_hat_exit(hatlockp);
3292 }
3293 /*
3294 * Update the current CPU tsbmiss area, so the current thread
3295 * won't need to take the tsbmiss for the new pagesize.
3296 * The other threads in the process will update their tsb
3297 * miss area lazily in sfmmu_tsbmiss_exception() when they
3298 * fail to find the translation for a newly added pagesize.
3299 */
3300 if (size > TTE64K && myflt) {
3301 struct tsbmiss *tsbmp;
3302 kpreempt_disable();
3303 tsbmp = &tsbmiss_area[CPU->cpu_id];
3304 if (rid == SFMMU_INVALID_SHMERID) {
3305 if (!(tsbmp->uhat_tteflags & tteflag)) {
3306 tsbmp->uhat_tteflags |= tteflag;
3307 }
3308 } else {
3309 if (!(tsbmp->uhat_rtteflags & tteflag)) {
3310 tsbmp->uhat_rtteflags |= tteflag;
3311 }
3312 }
3313 kpreempt_enable();
3314 }
3315 }
3316
3317 if (size >= TTE4M && (flags & HAT_LOAD_TEXT) &&
3318 !SFMMU_FLAGS_ISSET(sfmmup, HAT_4MTEXT_FLAG)) {
3319 hatlockp = sfmmu_hat_enter(sfmmup);
3320 SFMMU_FLAGS_SET(sfmmup, HAT_4MTEXT_FLAG);
3321 sfmmu_hat_exit(hatlockp);
3322 }
3323
3324 flush_tte.tte_intlo = (tteold.tte_intlo ^ ttep->tte_intlo) &
3325 hw_tte.tte_intlo;
3326 flush_tte.tte_inthi = (tteold.tte_inthi ^ ttep->tte_inthi) &
3327 hw_tte.tte_inthi;
3328
3329 if (remap && (flush_tte.tte_inthi || flush_tte.tte_intlo)) {
3330 /*
3331 * If remap and new tte differs from old tte we need
3332 * to sync the mod bit and flush TLB/TSB. We don't
3333 * need to sync ref bit because we currently always set
3334 * ref bit in tteload.
3335 */
3336 ASSERT(TTE_IS_REF(ttep));
3337 if (TTE_IS_MOD(&tteold)) {
3338 sfmmu_ttesync(sfmmup, vaddr, &tteold, pp);
3339 }
3340 /*
3341 * hwtte bits shouldn't change for SRD hmeblks as long as SRD
3342 * hmes are only used for read only text. Adding this code for
3343 * completeness and future use of shared hmeblks with writable
3344 * mappings of VMODSORT vnodes.
3345 */
3346 if (hmeblkp->hblk_shared) {
3347 cpuset_t cpuset = sfmmu_rgntlb_demap(vaddr,
3348 sfmmup->sfmmu_srdp->srd_hmergnp[rid], hmeblkp, 1);
3349 xt_sync(cpuset);
3350 SFMMU_STAT_ADD(sf_region_remap_demap, 1);
3351 } else {
3352 sfmmu_tlb_demap(vaddr, sfmmup, hmeblkp, 0, 0);
3353 xt_sync(sfmmup->sfmmu_cpusran);
3354 }
3355 }
3356
3357 if ((flags & SFMMU_NO_TSBLOAD) == 0) {
3358 /*
3359 * We only preload 8K and 4M mappings into the TSB, since
3360 * 64K and 512K mappings are replicated and hence don't
3361 * have a single, unique TSB entry. Ditto for 32M/256M.
3362 */
3363 if (size == TTE8K || size == TTE4M) {
3364 sf_scd_t *scdp;
3365 hatlockp = sfmmu_hat_enter(sfmmup);
3366 /*
3367 * Don't preload private TSB if the mapping is used
3368 * by the shctx in the SCD.
3369 */
3370 scdp = sfmmup->sfmmu_scdp;
3371 if (rid == SFMMU_INVALID_SHMERID || scdp == NULL ||
3372 !SF_RGNMAP_TEST(scdp->scd_hmeregion_map, rid)) {
3373 sfmmu_load_tsb(sfmmup, vaddr, &sfhme->hme_tte,
3374 size);
3375 }
3376 sfmmu_hat_exit(hatlockp);
3377 }
3378 }
3379 if (pp) {
3380 if (!remap) {
3381 HME_ADD(sfhme, pp);
3382 atomic_inc_16(&hmeblkp->hblk_hmecnt);
3383 ASSERT(hmeblkp->hblk_hmecnt > 0);
3384
3385 /*
3386 * Cannot ASSERT(hmeblkp->hblk_hmecnt <= NHMENTS)
3387 * see pageunload() for comment.
3388 */
3389 }
3390 sfmmu_mlist_exit(pml);
3391 }
3392
3393 return (0);
3394 }
3395 /*
3396 * Function unlocks hash bucket.
3397 */
3398 static void
sfmmu_tteload_release_hashbucket(struct hmehash_bucket * hmebp)3399 sfmmu_tteload_release_hashbucket(struct hmehash_bucket *hmebp)
3400 {
3401 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));
3402 SFMMU_HASH_UNLOCK(hmebp);
3403 }
3404
3405 /*
3406 * function which checks and sets up page array for a large
3407 * translation. Will set p_vcolor, p_index, p_ro fields.
3408 * Assumes addr and pfnum of first page are properly aligned.
3409 * Will check for physical contiguity. If check fails it return
3410 * non null.
3411 */
3412 static int
sfmmu_pagearray_setup(caddr_t addr,page_t ** pps,tte_t * ttep,int remap)3413 sfmmu_pagearray_setup(caddr_t addr, page_t **pps, tte_t *ttep, int remap)
3414 {
3415 int i, index, ttesz;
3416 pfn_t pfnum;
3417 pgcnt_t npgs;
3418 page_t *pp, *pp1;
3419 kmutex_t *pmtx;
3420 #ifdef VAC
3421 int osz;
3422 int cflags = 0;
3423 int vac_err = 0;
3424 #endif
3425 int newidx = 0;
3426
3427 ttesz = TTE_CSZ(ttep);
3428
3429 ASSERT(ttesz > TTE8K);
3430
3431 npgs = TTEPAGES(ttesz);
3432 index = PAGESZ_TO_INDEX(ttesz);
3433
3434 pfnum = (*pps)->p_pagenum;
3435 ASSERT(IS_P2ALIGNED(pfnum, npgs));
3436
3437 /*
3438 * Save the first pp so we can do HAT_TMPNC at the end.
3439 */
3440 pp1 = *pps;
3441 #ifdef VAC
3442 osz = fnd_mapping_sz(pp1);
3443 #endif
3444
3445 for (i = 0; i < npgs; i++, pps++) {
3446 pp = *pps;
3447 ASSERT(PAGE_LOCKED(pp));
3448 ASSERT(pp->p_szc >= ttesz);
3449 ASSERT(pp->p_szc == pp1->p_szc);
3450 ASSERT(sfmmu_mlist_held(pp));
3451
3452 /*
3453 * XXX is it possible to maintain P_RO on the root only?
3454 */
3455 if (TTE_IS_WRITABLE(ttep) && PP_ISRO(pp)) {
3456 pmtx = sfmmu_page_enter(pp);
3457 PP_CLRRO(pp);
3458 sfmmu_page_exit(pmtx);
3459 } else if (!PP_ISMAPPED(pp) && !TTE_IS_WRITABLE(ttep) &&
3460 !PP_ISMOD(pp)) {
3461 pmtx = sfmmu_page_enter(pp);
3462 if (!(PP_ISMOD(pp))) {
3463 PP_SETRO(pp);
3464 }
3465 sfmmu_page_exit(pmtx);
3466 }
3467
3468 /*
3469 * If this is a remap we skip vac & contiguity checks.
3470 */
3471 if (remap)
3472 continue;
3473
3474 /*
3475 * set p_vcolor and detect any vac conflicts.
3476 */
3477 #ifdef VAC
3478 if (vac_err == 0) {
3479 vac_err = sfmmu_vacconflict_array(addr, pp, &cflags);
3480
3481 }
3482 #endif
3483
3484 /*
3485 * Save current index in case we need to undo it.
3486 * Note: "PAGESZ_TO_INDEX(sz) (1 << (sz))"
3487 * "SFMMU_INDEX_SHIFT 6"
3488 * "SFMMU_INDEX_MASK ((1 << SFMMU_INDEX_SHIFT) - 1)"
3489 * "PP_MAPINDEX(p_index) (p_index & SFMMU_INDEX_MASK)"
3490 *
3491 * So: index = PAGESZ_TO_INDEX(ttesz);
3492 * if ttesz == 1 then index = 0x2
3493 * 2 then index = 0x4
3494 * 3 then index = 0x8
3495 * 4 then index = 0x10
3496 * 5 then index = 0x20
3497 * The code below checks if it's a new pagesize (ie, newidx)
3498 * in case we need to take it back out of p_index,
3499 * and then or's the new index into the existing index.
3500 */
3501 if ((PP_MAPINDEX(pp) & index) == 0)
3502 newidx = 1;
3503 pp->p_index = (PP_MAPINDEX(pp) | index);
3504
3505 /*
3506 * contiguity check
3507 */
3508 if (pp->p_pagenum != pfnum) {
3509 /*
3510 * If we fail the contiguity test then
3511 * the only thing we need to fix is the p_index field.
3512 * We might get a few extra flushes but since this
3513 * path is rare that is ok. The p_ro field will
3514 * get automatically fixed on the next tteload to
3515 * the page. NO TNC bit is set yet.
3516 */
3517 while (i >= 0) {
3518 pp = *pps;
3519 if (newidx)
3520 pp->p_index = (PP_MAPINDEX(pp) &
3521 ~index);
3522 pps--;
3523 i--;
3524 }
3525 return (1);
3526 }
3527 pfnum++;
3528 addr += MMU_PAGESIZE;
3529 }
3530
3531 #ifdef VAC
3532 if (vac_err) {
3533 if (ttesz > osz) {
3534 /*
3535 * There are some smaller mappings that causes vac
3536 * conflicts. Convert all existing small mappings to
3537 * TNC.
3538 */
3539 SFMMU_STAT_ADD(sf_uncache_conflict, npgs);
3540 sfmmu_page_cache_array(pp1, HAT_TMPNC, CACHE_FLUSH,
3541 npgs);
3542 } else {
3543 /* EMPTY */
3544 /*
3545 * If there exists an big page mapping,
3546 * that means the whole existing big page
3547 * has TNC setting already. No need to covert to
3548 * TNC again.
3549 */
3550 ASSERT(PP_ISTNC(pp1));
3551 }
3552 }
3553 #endif /* VAC */
3554
3555 return (0);
3556 }
3557
3558 #ifdef VAC
3559 /*
3560 * Routine that detects vac consistency for a large page. It also
3561 * sets virtual color for all pp's for this big mapping.
3562 */
3563 static int
sfmmu_vacconflict_array(caddr_t addr,page_t * pp,int * cflags)3564 sfmmu_vacconflict_array(caddr_t addr, page_t *pp, int *cflags)
3565 {
3566 int vcolor, ocolor;
3567
3568 ASSERT(sfmmu_mlist_held(pp));
3569
3570 if (PP_ISNC(pp)) {
3571 return (HAT_TMPNC);
3572 }
3573
3574 vcolor = addr_to_vcolor(addr);
3575 if (PP_NEWPAGE(pp)) {
3576 PP_SET_VCOLOR(pp, vcolor);
3577 return (0);
3578 }
3579
3580 ocolor = PP_GET_VCOLOR(pp);
3581 if (ocolor == vcolor) {
3582 return (0);
3583 }
3584
3585 if (!PP_ISMAPPED(pp) && !PP_ISMAPPED_KPM(pp)) {
3586 /*
3587 * Previous user of page had a differnet color
3588 * but since there are no current users
3589 * we just flush the cache and change the color.
3590 * As an optimization for large pages we flush the
3591 * entire cache of that color and set a flag.
3592 */
3593 SFMMU_STAT(sf_pgcolor_conflict);
3594 if (!CacheColor_IsFlushed(*cflags, ocolor)) {
3595 CacheColor_SetFlushed(*cflags, ocolor);
3596 sfmmu_cache_flushcolor(ocolor, pp->p_pagenum);
3597 }
3598 PP_SET_VCOLOR(pp, vcolor);
3599 return (0);
3600 }
3601
3602 /*
3603 * We got a real conflict with a current mapping.
3604 * set flags to start unencaching all mappings
3605 * and return failure so we restart looping
3606 * the pp array from the beginning.
3607 */
3608 return (HAT_TMPNC);
3609 }
3610 #endif /* VAC */
3611
3612 /*
3613 * creates a large page shadow hmeblk for a tte.
3614 * The purpose of this routine is to allow us to do quick unloads because
3615 * the vm layer can easily pass a very large but sparsely populated range.
3616 */
3617 static struct hme_blk *
sfmmu_shadow_hcreate(sfmmu_t * sfmmup,caddr_t vaddr,int ttesz,uint_t flags)3618 sfmmu_shadow_hcreate(sfmmu_t *sfmmup, caddr_t vaddr, int ttesz, uint_t flags)
3619 {
3620 struct hmehash_bucket *hmebp;
3621 hmeblk_tag hblktag;
3622 int hmeshift, size, vshift;
3623 uint_t shw_mask, newshw_mask;
3624 struct hme_blk *hmeblkp;
3625
3626 ASSERT(sfmmup != KHATID);
3627 if (mmu_page_sizes == max_mmu_page_sizes) {
3628 ASSERT(ttesz < TTE256M);
3629 } else {
3630 ASSERT(ttesz < TTE4M);
3631 ASSERT(sfmmup->sfmmu_ttecnt[TTE32M] == 0);
3632 ASSERT(sfmmup->sfmmu_ttecnt[TTE256M] == 0);
3633 }
3634
3635 if (ttesz == TTE8K) {
3636 size = TTE512K;
3637 } else {
3638 size = ++ttesz;
3639 }
3640
3641 hblktag.htag_id = sfmmup;
3642 hmeshift = HME_HASH_SHIFT(size);
3643 hblktag.htag_bspage = HME_HASH_BSPAGE(vaddr, hmeshift);
3644 hblktag.htag_rehash = HME_HASH_REHASH(size);
3645 hblktag.htag_rid = SFMMU_INVALID_SHMERID;
3646 hmebp = HME_HASH_FUNCTION(sfmmup, vaddr, hmeshift);
3647
3648 SFMMU_HASH_LOCK(hmebp);
3649
3650 HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp);
3651 ASSERT(hmeblkp != (struct hme_blk *)hblk_reserve);
3652 if (hmeblkp == NULL) {
3653 hmeblkp = sfmmu_hblk_alloc(sfmmup, vaddr, hmebp, size,
3654 hblktag, flags, SFMMU_INVALID_SHMERID);
3655 }
3656 ASSERT(hmeblkp);
3657 if (!hmeblkp->hblk_shw_mask) {
3658 /*
3659 * if this is a unused hblk it was just allocated or could
3660 * potentially be a previous large page hblk so we need to
3661 * set the shadow bit.
3662 */
3663 ASSERT(!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt);
3664 hmeblkp->hblk_shw_bit = 1;
3665 } else if (hmeblkp->hblk_shw_bit == 0) {
3666 panic("sfmmu_shadow_hcreate: shw bit not set in hmeblkp 0x%p",
3667 (void *)hmeblkp);
3668 }
3669 ASSERT(hmeblkp->hblk_shw_bit == 1);
3670 ASSERT(!hmeblkp->hblk_shared);
3671 vshift = vaddr_to_vshift(hblktag, vaddr, size);
3672 ASSERT(vshift < 8);
3673 /*
3674 * Atomically set shw mask bit
3675 */
3676 do {
3677 shw_mask = hmeblkp->hblk_shw_mask;
3678 newshw_mask = shw_mask | (1 << vshift);
3679 newshw_mask = atomic_cas_32(&hmeblkp->hblk_shw_mask, shw_mask,
3680 newshw_mask);
3681 } while (newshw_mask != shw_mask);
3682
3683 SFMMU_HASH_UNLOCK(hmebp);
3684
3685 return (hmeblkp);
3686 }
3687
3688 /*
3689 * This routine cleanup a previous shadow hmeblk and changes it to
3690 * a regular hblk. This happens rarely but it is possible
3691 * when a process wants to use large pages and there are hblks still
3692 * lying around from the previous as that used these hmeblks.
3693 * The alternative was to cleanup the shadow hblks at unload time
3694 * but since so few user processes actually use large pages, it is
3695 * better to be lazy and cleanup at this time.
3696 */
3697 static void
sfmmu_shadow_hcleanup(sfmmu_t * sfmmup,struct hme_blk * hmeblkp,struct hmehash_bucket * hmebp)3698 sfmmu_shadow_hcleanup(sfmmu_t *sfmmup, struct hme_blk *hmeblkp,
3699 struct hmehash_bucket *hmebp)
3700 {
3701 caddr_t addr, endaddr;
3702 int hashno, size;
3703
3704 ASSERT(hmeblkp->hblk_shw_bit);
3705 ASSERT(!hmeblkp->hblk_shared);
3706
3707 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));
3708
3709 if (!hmeblkp->hblk_shw_mask) {
3710 hmeblkp->hblk_shw_bit = 0;
3711 return;
3712 }
3713 addr = (caddr_t)get_hblk_base(hmeblkp);
3714 endaddr = get_hblk_endaddr(hmeblkp);
3715 size = get_hblk_ttesz(hmeblkp);
3716 hashno = size - 1;
3717 ASSERT(hashno > 0);
3718 SFMMU_HASH_UNLOCK(hmebp);
3719
3720 sfmmu_free_hblks(sfmmup, addr, endaddr, hashno);
3721
3722 SFMMU_HASH_LOCK(hmebp);
3723 }
3724
3725 static void
sfmmu_free_hblks(sfmmu_t * sfmmup,caddr_t addr,caddr_t endaddr,int hashno)3726 sfmmu_free_hblks(sfmmu_t *sfmmup, caddr_t addr, caddr_t endaddr,
3727 int hashno)
3728 {
3729 int hmeshift, shadow = 0;
3730 hmeblk_tag hblktag;
3731 struct hmehash_bucket *hmebp;
3732 struct hme_blk *hmeblkp;
3733 struct hme_blk *nx_hblk, *pr_hblk, *list = NULL;
3734
3735 ASSERT(hashno > 0);
3736 hblktag.htag_id = sfmmup;
3737 hblktag.htag_rehash = hashno;
3738 hblktag.htag_rid = SFMMU_INVALID_SHMERID;
3739
3740 hmeshift = HME_HASH_SHIFT(hashno);
3741
3742 while (addr < endaddr) {
3743 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
3744 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);
3745 SFMMU_HASH_LOCK(hmebp);
3746 /* inline HME_HASH_SEARCH */
3747 hmeblkp = hmebp->hmeblkp;
3748 pr_hblk = NULL;
3749 while (hmeblkp) {
3750 if (HTAGS_EQ(hmeblkp->hblk_tag, hblktag)) {
3751 /* found hme_blk */
3752 ASSERT(!hmeblkp->hblk_shared);
3753 if (hmeblkp->hblk_shw_bit) {
3754 if (hmeblkp->hblk_shw_mask) {
3755 shadow = 1;
3756 sfmmu_shadow_hcleanup(sfmmup,
3757 hmeblkp, hmebp);
3758 break;
3759 } else {
3760 hmeblkp->hblk_shw_bit = 0;
3761 }
3762 }
3763
3764 /*
3765 * Hblk_hmecnt and hblk_vcnt could be non zero
3766 * since hblk_unload() does not gurantee that.
3767 *
3768 * XXX - this could cause tteload() to spin
3769 * where sfmmu_shadow_hcleanup() is called.
3770 */
3771 }
3772
3773 nx_hblk = hmeblkp->hblk_next;
3774 if (!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) {
3775 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
3776 &list, 0);
3777 } else {
3778 pr_hblk = hmeblkp;
3779 }
3780 hmeblkp = nx_hblk;
3781 }
3782
3783 SFMMU_HASH_UNLOCK(hmebp);
3784
3785 if (shadow) {
3786 /*
3787 * We found another shadow hblk so cleaned its
3788 * children. We need to go back and cleanup
3789 * the original hblk so we don't change the
3790 * addr.
3791 */
3792 shadow = 0;
3793 } else {
3794 addr = (caddr_t)roundup((uintptr_t)addr + 1,
3795 (1 << hmeshift));
3796 }
3797 }
3798 sfmmu_hblks_list_purge(&list, 0);
3799 }
3800
3801 /*
3802 * This routine's job is to delete stale invalid shared hmeregions hmeblks that
3803 * may still linger on after pageunload.
3804 */
3805 static void
sfmmu_cleanup_rhblk(sf_srd_t * srdp,caddr_t addr,uint_t rid,int ttesz)3806 sfmmu_cleanup_rhblk(sf_srd_t *srdp, caddr_t addr, uint_t rid, int ttesz)
3807 {
3808 int hmeshift;
3809 hmeblk_tag hblktag;
3810 struct hmehash_bucket *hmebp;
3811 struct hme_blk *hmeblkp;
3812 struct hme_blk *pr_hblk;
3813 struct hme_blk *list = NULL;
3814
3815 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
3816 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
3817
3818 hmeshift = HME_HASH_SHIFT(ttesz);
3819 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
3820 hblktag.htag_rehash = ttesz;
3821 hblktag.htag_rid = rid;
3822 hblktag.htag_id = srdp;
3823 hmebp = HME_HASH_FUNCTION(srdp, addr, hmeshift);
3824
3825 SFMMU_HASH_LOCK(hmebp);
3826 HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, &list);
3827 if (hmeblkp != NULL) {
3828 ASSERT(hmeblkp->hblk_shared);
3829 ASSERT(!hmeblkp->hblk_shw_bit);
3830 if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) {
3831 panic("sfmmu_cleanup_rhblk: valid hmeblk");
3832 }
3833 ASSERT(!hmeblkp->hblk_lckcnt);
3834 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
3835 &list, 0);
3836 }
3837 SFMMU_HASH_UNLOCK(hmebp);
3838 sfmmu_hblks_list_purge(&list, 0);
3839 }
3840
3841 /* ARGSUSED */
3842 static void
sfmmu_rgn_cb_noop(caddr_t saddr,caddr_t eaddr,caddr_t r_saddr,size_t r_size,void * r_obj,u_offset_t r_objoff)3843 sfmmu_rgn_cb_noop(caddr_t saddr, caddr_t eaddr, caddr_t r_saddr,
3844 size_t r_size, void *r_obj, u_offset_t r_objoff)
3845 {
3846 }
3847
3848 /*
3849 * Searches for an hmeblk which maps addr, then unloads this mapping
3850 * and updates *eaddrp, if the hmeblk is found.
3851 */
3852 static void
sfmmu_unload_hmeregion_va(sf_srd_t * srdp,uint_t rid,caddr_t addr,caddr_t eaddr,int ttesz,caddr_t * eaddrp)3853 sfmmu_unload_hmeregion_va(sf_srd_t *srdp, uint_t rid, caddr_t addr,
3854 caddr_t eaddr, int ttesz, caddr_t *eaddrp)
3855 {
3856 int hmeshift;
3857 hmeblk_tag hblktag;
3858 struct hmehash_bucket *hmebp;
3859 struct hme_blk *hmeblkp;
3860 struct hme_blk *pr_hblk;
3861 struct hme_blk *list = NULL;
3862
3863 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
3864 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
3865 ASSERT(ttesz >= HBLK_MIN_TTESZ);
3866
3867 hmeshift = HME_HASH_SHIFT(ttesz);
3868 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
3869 hblktag.htag_rehash = ttesz;
3870 hblktag.htag_rid = rid;
3871 hblktag.htag_id = srdp;
3872 hmebp = HME_HASH_FUNCTION(srdp, addr, hmeshift);
3873
3874 SFMMU_HASH_LOCK(hmebp);
3875 HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, &list);
3876 if (hmeblkp != NULL) {
3877 ASSERT(hmeblkp->hblk_shared);
3878 ASSERT(!hmeblkp->hblk_lckcnt);
3879 if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) {
3880 *eaddrp = sfmmu_hblk_unload(NULL, hmeblkp, addr,
3881 eaddr, NULL, HAT_UNLOAD);
3882 ASSERT(*eaddrp > addr);
3883 }
3884 ASSERT(!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt);
3885 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
3886 &list, 0);
3887 }
3888 SFMMU_HASH_UNLOCK(hmebp);
3889 sfmmu_hblks_list_purge(&list, 0);
3890 }
3891
3892 static void
sfmmu_unload_hmeregion(sf_srd_t * srdp,sf_region_t * rgnp)3893 sfmmu_unload_hmeregion(sf_srd_t *srdp, sf_region_t *rgnp)
3894 {
3895 int ttesz = rgnp->rgn_pgszc;
3896 size_t rsz = rgnp->rgn_size;
3897 caddr_t rsaddr = rgnp->rgn_saddr;
3898 caddr_t readdr = rsaddr + rsz;
3899 caddr_t rhsaddr;
3900 caddr_t va;
3901 uint_t rid = rgnp->rgn_id;
3902 caddr_t cbsaddr;
3903 caddr_t cbeaddr;
3904 hat_rgn_cb_func_t rcbfunc;
3905 ulong_t cnt;
3906
3907 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
3908 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
3909
3910 ASSERT(IS_P2ALIGNED(rsaddr, TTEBYTES(ttesz)));
3911 ASSERT(IS_P2ALIGNED(rsz, TTEBYTES(ttesz)));
3912 if (ttesz < HBLK_MIN_TTESZ) {
3913 ttesz = HBLK_MIN_TTESZ;
3914 rhsaddr = (caddr_t)P2ALIGN((uintptr_t)rsaddr, HBLK_MIN_BYTES);
3915 } else {
3916 rhsaddr = rsaddr;
3917 }
3918
3919 if ((rcbfunc = rgnp->rgn_cb_function) == NULL) {
3920 rcbfunc = sfmmu_rgn_cb_noop;
3921 }
3922
3923 while (ttesz >= HBLK_MIN_TTESZ) {
3924 cbsaddr = rsaddr;
3925 cbeaddr = rsaddr;
3926 if (!(rgnp->rgn_hmeflags & (1 << ttesz))) {
3927 ttesz--;
3928 continue;
3929 }
3930 cnt = 0;
3931 va = rsaddr;
3932 while (va < readdr) {
3933 ASSERT(va >= rhsaddr);
3934 if (va != cbeaddr) {
3935 if (cbeaddr != cbsaddr) {
3936 ASSERT(cbeaddr > cbsaddr);
3937 (*rcbfunc)(cbsaddr, cbeaddr,
3938 rsaddr, rsz, rgnp->rgn_obj,
3939 rgnp->rgn_objoff);
3940 }
3941 cbsaddr = va;
3942 cbeaddr = va;
3943 }
3944 sfmmu_unload_hmeregion_va(srdp, rid, va, readdr,
3945 ttesz, &cbeaddr);
3946 cnt++;
3947 va = rhsaddr + (cnt << TTE_PAGE_SHIFT(ttesz));
3948 }
3949 if (cbeaddr != cbsaddr) {
3950 ASSERT(cbeaddr > cbsaddr);
3951 (*rcbfunc)(cbsaddr, cbeaddr, rsaddr,
3952 rsz, rgnp->rgn_obj,
3953 rgnp->rgn_objoff);
3954 }
3955 ttesz--;
3956 }
3957 }
3958
3959 /*
3960 * Release one hardware address translation lock on the given address range.
3961 */
3962 void
hat_unlock(struct hat * sfmmup,caddr_t addr,size_t len)3963 hat_unlock(struct hat *sfmmup, caddr_t addr, size_t len)
3964 {
3965 struct hmehash_bucket *hmebp;
3966 hmeblk_tag hblktag;
3967 int hmeshift, hashno = 1;
3968 struct hme_blk *hmeblkp, *list = NULL;
3969 caddr_t endaddr;
3970
3971 ASSERT(sfmmup != NULL);
3972 ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
3973
3974 ASSERT((sfmmup == ksfmmup) || AS_LOCK_HELD(sfmmup->sfmmu_as));
3975 ASSERT((len & MMU_PAGEOFFSET) == 0);
3976 endaddr = addr + len;
3977 hblktag.htag_id = sfmmup;
3978 hblktag.htag_rid = SFMMU_INVALID_SHMERID;
3979
3980 /*
3981 * Spitfire supports 4 page sizes.
3982 * Most pages are expected to be of the smallest page size (8K) and
3983 * these will not need to be rehashed. 64K pages also don't need to be
3984 * rehashed because an hmeblk spans 64K of address space. 512K pages
3985 * might need 1 rehash and and 4M pages might need 2 rehashes.
3986 */
3987 while (addr < endaddr) {
3988 hmeshift = HME_HASH_SHIFT(hashno);
3989 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
3990 hblktag.htag_rehash = hashno;
3991 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);
3992
3993 SFMMU_HASH_LOCK(hmebp);
3994
3995 HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list);
3996 if (hmeblkp != NULL) {
3997 ASSERT(!hmeblkp->hblk_shared);
3998 /*
3999 * If we encounter a shadow hmeblk then
4000 * we know there are no valid hmeblks mapping
4001 * this address at this size or larger.
4002 * Just increment address by the smallest
4003 * page size.
4004 */
4005 if (hmeblkp->hblk_shw_bit) {
4006 addr += MMU_PAGESIZE;
4007 } else {
4008 addr = sfmmu_hblk_unlock(hmeblkp, addr,
4009 endaddr);
4010 }
4011 SFMMU_HASH_UNLOCK(hmebp);
4012 hashno = 1;
4013 continue;
4014 }
4015 SFMMU_HASH_UNLOCK(hmebp);
4016
4017 if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) {
4018 /*
4019 * We have traversed the whole list and rehashed
4020 * if necessary without finding the address to unlock
4021 * which should never happen.
4022 */
4023 panic("sfmmu_unlock: addr not found. "
4024 "addr %p hat %p", (void *)addr, (void *)sfmmup);
4025 } else {
4026 hashno++;
4027 }
4028 }
4029
4030 sfmmu_hblks_list_purge(&list, 0);
4031 }
4032
4033 void
hat_unlock_region(struct hat * sfmmup,caddr_t addr,size_t len,hat_region_cookie_t rcookie)4034 hat_unlock_region(struct hat *sfmmup, caddr_t addr, size_t len,
4035 hat_region_cookie_t rcookie)
4036 {
4037 sf_srd_t *srdp;
4038 sf_region_t *rgnp;
4039 int ttesz;
4040 uint_t rid;
4041 caddr_t eaddr;
4042 caddr_t va;
4043 int hmeshift;
4044 hmeblk_tag hblktag;
4045 struct hmehash_bucket *hmebp;
4046 struct hme_blk *hmeblkp;
4047 struct hme_blk *pr_hblk;
4048 struct hme_blk *list;
4049
4050 if (rcookie == HAT_INVALID_REGION_COOKIE) {
4051 hat_unlock(sfmmup, addr, len);
4052 return;
4053 }
4054
4055 ASSERT(sfmmup != NULL);
4056 ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
4057 ASSERT(sfmmup != ksfmmup);
4058
4059 srdp = sfmmup->sfmmu_srdp;
4060 rid = (uint_t)((uint64_t)rcookie);
4061 VERIFY3U(rid, <, SFMMU_MAX_HME_REGIONS);
4062 eaddr = addr + len;
4063 va = addr;
4064 list = NULL;
4065 rgnp = srdp->srd_hmergnp[rid];
4066 SFMMU_VALIDATE_HMERID(sfmmup, rid, addr, len);
4067
4068 ASSERT(IS_P2ALIGNED(addr, TTEBYTES(rgnp->rgn_pgszc)));
4069 ASSERT(IS_P2ALIGNED(len, TTEBYTES(rgnp->rgn_pgszc)));
4070 if (rgnp->rgn_pgszc < HBLK_MIN_TTESZ) {
4071 ttesz = HBLK_MIN_TTESZ;
4072 } else {
4073 ttesz = rgnp->rgn_pgszc;
4074 }
4075 while (va < eaddr) {
4076 while (ttesz < rgnp->rgn_pgszc &&
4077 IS_P2ALIGNED(va, TTEBYTES(ttesz + 1))) {
4078 ttesz++;
4079 }
4080 while (ttesz >= HBLK_MIN_TTESZ) {
4081 if (!(rgnp->rgn_hmeflags & (1 << ttesz))) {
4082 ttesz--;
4083 continue;
4084 }
4085 hmeshift = HME_HASH_SHIFT(ttesz);
4086 hblktag.htag_bspage = HME_HASH_BSPAGE(va, hmeshift);
4087 hblktag.htag_rehash = ttesz;
4088 hblktag.htag_rid = rid;
4089 hblktag.htag_id = srdp;
4090 hmebp = HME_HASH_FUNCTION(srdp, va, hmeshift);
4091 SFMMU_HASH_LOCK(hmebp);
4092 HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk,
4093 &list);
4094 if (hmeblkp == NULL) {
4095 SFMMU_HASH_UNLOCK(hmebp);
4096 ttesz--;
4097 continue;
4098 }
4099 ASSERT(hmeblkp->hblk_shared);
4100 va = sfmmu_hblk_unlock(hmeblkp, va, eaddr);
4101 ASSERT(va >= eaddr ||
4102 IS_P2ALIGNED((uintptr_t)va, TTEBYTES(ttesz)));
4103 SFMMU_HASH_UNLOCK(hmebp);
4104 break;
4105 }
4106 if (ttesz < HBLK_MIN_TTESZ) {
4107 panic("hat_unlock_region: addr not found "
4108 "addr %p hat %p", (void *)va, (void *)sfmmup);
4109 }
4110 }
4111 sfmmu_hblks_list_purge(&list, 0);
4112 }
4113
4114 /*
4115 * Function to unlock a range of addresses in an hmeblk. It returns the
4116 * next address that needs to be unlocked.
4117 * Should be called with the hash lock held.
4118 */
4119 static caddr_t
sfmmu_hblk_unlock(struct hme_blk * hmeblkp,caddr_t addr,caddr_t endaddr)4120 sfmmu_hblk_unlock(struct hme_blk *hmeblkp, caddr_t addr, caddr_t endaddr)
4121 {
4122 struct sf_hment *sfhme;
4123 tte_t tteold, ttemod;
4124 int ttesz, ret;
4125
4126 ASSERT(in_hblk_range(hmeblkp, addr));
4127 ASSERT(hmeblkp->hblk_shw_bit == 0);
4128
4129 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp));
4130 ttesz = get_hblk_ttesz(hmeblkp);
4131
4132 HBLKTOHME(sfhme, hmeblkp, addr);
4133 while (addr < endaddr) {
4134 readtte:
4135 sfmmu_copytte(&sfhme->hme_tte, &tteold);
4136 if (TTE_IS_VALID(&tteold)) {
4137
4138 ttemod = tteold;
4139
4140 ret = sfmmu_modifytte_try(&tteold, &ttemod,
4141 &sfhme->hme_tte);
4142
4143 if (ret < 0)
4144 goto readtte;
4145
4146 if (hmeblkp->hblk_lckcnt == 0)
4147 panic("zero hblk lckcnt");
4148
4149 if (((uintptr_t)addr + TTEBYTES(ttesz)) >
4150 (uintptr_t)endaddr)
4151 panic("can't unlock large tte");
4152
4153 ASSERT(hmeblkp->hblk_lckcnt > 0);
4154 atomic_dec_32(&hmeblkp->hblk_lckcnt);
4155 HBLK_STACK_TRACE(hmeblkp, HBLK_UNLOCK);
4156 } else {
4157 panic("sfmmu_hblk_unlock: invalid tte");
4158 }
4159 addr += TTEBYTES(ttesz);
4160 sfhme++;
4161 }
4162 return (addr);
4163 }
4164
4165 /*
4166 * Physical Address Mapping Framework
4167 *
4168 * General rules:
4169 *
4170 * (1) Applies only to seg_kmem memory pages. To make things easier,
4171 * seg_kpm addresses are also accepted by the routines, but nothing
4172 * is done with them since by definition their PA mappings are static.
4173 * (2) hat_add_callback() may only be called while holding the page lock
4174 * SE_SHARED or SE_EXCL of the underlying page (e.g., as_pagelock()),
4175 * or passing HAC_PAGELOCK flag.
4176 * (3) prehandler() and posthandler() may not call hat_add_callback() or
4177 * hat_delete_callback(), nor should they allocate memory. Post quiesce
4178 * callbacks may not sleep or acquire adaptive mutex locks.
4179 * (4) Either prehandler() or posthandler() (but not both) may be specified
4180 * as being NULL. Specifying an errhandler() is optional.
4181 *
4182 * Details of using the framework:
4183 *
4184 * registering a callback (hat_register_callback())
4185 *
4186 * Pass prehandler, posthandler, errhandler addresses
4187 * as described below. If capture_cpus argument is nonzero,
4188 * suspend callback to the prehandler will occur with CPUs
4189 * captured and executing xc_loop() and CPUs will remain
4190 * captured until after the posthandler suspend callback
4191 * occurs.
4192 *
4193 * adding a callback (hat_add_callback())
4194 *
4195 * as_pagelock();
4196 * hat_add_callback();
4197 * save returned pfn in private data structures or program registers;
4198 * as_pageunlock();
4199 *
4200 * prehandler()
4201 *
4202 * Stop all accesses by physical address to this memory page.
4203 * Called twice: the first, PRESUSPEND, is a context safe to acquire
4204 * adaptive locks. The second, SUSPEND, is called at high PIL with
4205 * CPUs captured so adaptive locks may NOT be acquired (and all spin
4206 * locks must be XCALL_PIL or higher locks).
4207 *
4208 * May return the following errors:
4209 * EIO: A fatal error has occurred. This will result in panic.
4210 * EAGAIN: The page cannot be suspended. This will fail the
4211 * relocation.
4212 * 0: Success.
4213 *
4214 * posthandler()
4215 *
4216 * Save new pfn in private data structures or program registers;
4217 * not allowed to fail (non-zero return values will result in panic).
4218 *
4219 * errhandler()
4220 *
4221 * called when an error occurs related to the callback. Currently
4222 * the only such error is HAT_CB_ERR_LEAKED which indicates that
4223 * a page is being freed, but there are still outstanding callback(s)
4224 * registered on the page.
4225 *
4226 * removing a callback (hat_delete_callback(); e.g., prior to freeing memory)
4227 *
4228 * stop using physical address
4229 * hat_delete_callback();
4230 *
4231 */
4232
4233 /*
4234 * Register a callback class. Each subsystem should do this once and
4235 * cache the id_t returned for use in setting up and tearing down callbacks.
4236 *
4237 * There is no facility for removing callback IDs once they are created;
4238 * the "key" should be unique for each module, so in case a module is unloaded
4239 * and subsequently re-loaded, we can recycle the module's previous entry.
4240 */
4241 id_t
hat_register_callback(int key,int (* prehandler)(caddr_t,uint_t,uint_t,void *),int (* posthandler)(caddr_t,uint_t,uint_t,void *,pfn_t),int (* errhandler)(caddr_t,uint_t,uint_t,void *),int capture_cpus)4242 hat_register_callback(int key,
4243 int (*prehandler)(caddr_t, uint_t, uint_t, void *),
4244 int (*posthandler)(caddr_t, uint_t, uint_t, void *, pfn_t),
4245 int (*errhandler)(caddr_t, uint_t, uint_t, void *),
4246 int capture_cpus)
4247 {
4248 id_t id;
4249
4250 /*
4251 * Search the table for a pre-existing callback associated with
4252 * the identifier "key". If one exists, we re-use that entry in
4253 * the table for this instance, otherwise we assign the next
4254 * available table slot.
4255 */
4256 for (id = 0; id < sfmmu_max_cb_id; id++) {
4257 if (sfmmu_cb_table[id].key == key)
4258 break;
4259 }
4260
4261 if (id == sfmmu_max_cb_id) {
4262 id = sfmmu_cb_nextid++;
4263 if (id >= sfmmu_max_cb_id)
4264 panic("hat_register_callback: out of callback IDs");
4265 }
4266
4267 ASSERT(prehandler != NULL || posthandler != NULL);
4268
4269 sfmmu_cb_table[id].key = key;
4270 sfmmu_cb_table[id].prehandler = prehandler;
4271 sfmmu_cb_table[id].posthandler = posthandler;
4272 sfmmu_cb_table[id].errhandler = errhandler;
4273 sfmmu_cb_table[id].capture_cpus = capture_cpus;
4274
4275 return (id);
4276 }
4277
4278 #define HAC_COOKIE_NONE (void *)-1
4279
4280 /*
4281 * Add relocation callbacks to the specified addr/len which will be called
4282 * when relocating the associated page. See the description of pre and
4283 * posthandler above for more details.
4284 *
4285 * If HAC_PAGELOCK is included in flags, the underlying memory page is
4286 * locked internally so the caller must be able to deal with the callback
4287 * running even before this function has returned. If HAC_PAGELOCK is not
4288 * set, it is assumed that the underlying memory pages are locked.
4289 *
4290 * Since the caller must track the individual page boundaries anyway,
4291 * we only allow a callback to be added to a single page (large
4292 * or small). Thus [addr, addr + len) MUST be contained within a single
4293 * page.
4294 *
4295 * Registering multiple callbacks on the same [addr, addr+len) is supported,
4296 * _provided_that_ a unique parameter is specified for each callback.
4297 * If multiple callbacks are registered on the same range the callback will
4298 * be invoked with each unique parameter. Registering the same callback with
4299 * the same argument more than once will result in corrupted kernel state.
4300 *
4301 * Returns the pfn of the underlying kernel page in *rpfn
4302 * on success, or PFN_INVALID on failure.
4303 *
4304 * cookiep (if passed) provides storage space for an opaque cookie
4305 * to return later to hat_delete_callback(). This cookie makes the callback
4306 * deletion significantly quicker by avoiding a potentially lengthy hash
4307 * search.
4308 *
4309 * Returns values:
4310 * 0: success
4311 * ENOMEM: memory allocation failure (e.g. flags was passed as HAC_NOSLEEP)
4312 * EINVAL: callback ID is not valid
4313 * ENXIO: ["vaddr", "vaddr" + len) is not mapped in the kernel's address
4314 * space
4315 * ERANGE: ["vaddr", "vaddr" + len) crosses a page boundary
4316 */
4317 int
hat_add_callback(id_t callback_id,caddr_t vaddr,uint_t len,uint_t flags,void * pvt,pfn_t * rpfn,void ** cookiep)4318 hat_add_callback(id_t callback_id, caddr_t vaddr, uint_t len, uint_t flags,
4319 void *pvt, pfn_t *rpfn, void **cookiep)
4320 {
4321 struct hmehash_bucket *hmebp;
4322 hmeblk_tag hblktag;
4323 struct hme_blk *hmeblkp;
4324 int hmeshift, hashno;
4325 caddr_t saddr, eaddr, baseaddr;
4326 struct pa_hment *pahmep;
4327 struct sf_hment *sfhmep, *osfhmep;
4328 kmutex_t *pml;
4329 tte_t tte;
4330 page_t *pp;
4331 vnode_t *vp;
4332 u_offset_t off;
4333 pfn_t pfn;
4334 int kmflags = (flags & HAC_SLEEP)? KM_SLEEP : KM_NOSLEEP;
4335 int locked = 0;
4336
4337 /*
4338 * For KPM mappings, just return the physical address since we
4339 * don't need to register any callbacks.
4340 */
4341 if (IS_KPM_ADDR(vaddr)) {
4342 uint64_t paddr;
4343 SFMMU_KPM_VTOP(vaddr, paddr);
4344 *rpfn = btop(paddr);
4345 if (cookiep != NULL)
4346 *cookiep = HAC_COOKIE_NONE;
4347 return (0);
4348 }
4349
4350 if (callback_id < (id_t)0 || callback_id >= sfmmu_cb_nextid) {
4351 *rpfn = PFN_INVALID;
4352 return (EINVAL);
4353 }
4354
4355 if ((pahmep = kmem_cache_alloc(pa_hment_cache, kmflags)) == NULL) {
4356 *rpfn = PFN_INVALID;
4357 return (ENOMEM);
4358 }
4359
4360 sfhmep = &pahmep->sfment;
4361
4362 saddr = (caddr_t)((uintptr_t)vaddr & MMU_PAGEMASK);
4363 eaddr = saddr + len;
4364
4365 rehash:
4366 /* Find the mapping(s) for this page */
4367 for (hashno = TTE64K, hmeblkp = NULL;
4368 hmeblkp == NULL && hashno <= mmu_hashcnt;
4369 hashno++) {
4370 hmeshift = HME_HASH_SHIFT(hashno);
4371 hblktag.htag_id = ksfmmup;
4372 hblktag.htag_rid = SFMMU_INVALID_SHMERID;
4373 hblktag.htag_bspage = HME_HASH_BSPAGE(saddr, hmeshift);
4374 hblktag.htag_rehash = hashno;
4375 hmebp = HME_HASH_FUNCTION(ksfmmup, saddr, hmeshift);
4376
4377 SFMMU_HASH_LOCK(hmebp);
4378
4379 HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp);
4380
4381 if (hmeblkp == NULL)
4382 SFMMU_HASH_UNLOCK(hmebp);
4383 }
4384
4385 if (hmeblkp == NULL) {
4386 kmem_cache_free(pa_hment_cache, pahmep);
4387 *rpfn = PFN_INVALID;
4388 return (ENXIO);
4389 }
4390
4391 ASSERT(!hmeblkp->hblk_shared);
4392
4393 HBLKTOHME(osfhmep, hmeblkp, saddr);
4394 sfmmu_copytte(&osfhmep->hme_tte, &tte);
4395
4396 if (!TTE_IS_VALID(&tte)) {
4397 SFMMU_HASH_UNLOCK(hmebp);
4398 kmem_cache_free(pa_hment_cache, pahmep);
4399 *rpfn = PFN_INVALID;
4400 return (ENXIO);
4401 }
4402
4403 /*
4404 * Make sure the boundaries for the callback fall within this
4405 * single mapping.
4406 */
4407 baseaddr = (caddr_t)get_hblk_base(hmeblkp);
4408 ASSERT(saddr >= baseaddr);
4409 if (eaddr > saddr + TTEBYTES(TTE_CSZ(&tte))) {
4410 SFMMU_HASH_UNLOCK(hmebp);
4411 kmem_cache_free(pa_hment_cache, pahmep);
4412 *rpfn = PFN_INVALID;
4413 return (ERANGE);
4414 }
4415
4416 pfn = sfmmu_ttetopfn(&tte, vaddr);
4417
4418 /*
4419 * The pfn may not have a page_t underneath in which case we
4420 * just return it. This can happen if we are doing I/O to a
4421 * static portion of the kernel's address space, for instance.
4422 */
4423 pp = osfhmep->hme_page;
4424 if (pp == NULL) {
4425 SFMMU_HASH_UNLOCK(hmebp);
4426 kmem_cache_free(pa_hment_cache, pahmep);
4427 *rpfn = pfn;
4428 if (cookiep)
4429 *cookiep = HAC_COOKIE_NONE;
4430 return (0);
4431 }
4432 ASSERT(pp == PP_PAGEROOT(pp));
4433
4434 vp = pp->p_vnode;
4435 off = pp->p_offset;
4436
4437 pml = sfmmu_mlist_enter(pp);
4438
4439 if (flags & HAC_PAGELOCK) {
4440 if (!page_trylock(pp, SE_SHARED)) {
4441 /*
4442 * Somebody is holding SE_EXCL lock. Might
4443 * even be hat_page_relocate(). Drop all
4444 * our locks, lookup the page in &kvp, and
4445 * retry. If it doesn't exist in &kvp and &zvp,
4446 * then we must be dealing with a kernel mapped
4447 * page which doesn't actually belong to
4448 * segkmem so we punt.
4449 */
4450 sfmmu_mlist_exit(pml);
4451 SFMMU_HASH_UNLOCK(hmebp);
4452 pp = page_lookup(&kvp, (u_offset_t)saddr, SE_SHARED);
4453
4454 /* check zvp before giving up */
4455 if (pp == NULL)
4456 pp = page_lookup(&zvp, (u_offset_t)saddr,
4457 SE_SHARED);
4458
4459 /* Okay, we didn't find it, give up */
4460 if (pp == NULL) {
4461 kmem_cache_free(pa_hment_cache, pahmep);
4462 *rpfn = pfn;
4463 if (cookiep)
4464 *cookiep = HAC_COOKIE_NONE;
4465 return (0);
4466 }
4467 page_unlock(pp);
4468 goto rehash;
4469 }
4470 locked = 1;
4471 }
4472
4473 if (!PAGE_LOCKED(pp) && !panicstr)
4474 panic("hat_add_callback: page 0x%p not locked", (void *)pp);
4475
4476 if (osfhmep->hme_page != pp || pp->p_vnode != vp ||
4477 pp->p_offset != off) {
4478 /*
4479 * The page moved before we got our hands on it. Drop
4480 * all the locks and try again.
4481 */
4482 ASSERT((flags & HAC_PAGELOCK) != 0);
4483 sfmmu_mlist_exit(pml);
4484 SFMMU_HASH_UNLOCK(hmebp);
4485 page_unlock(pp);
4486 locked = 0;
4487 goto rehash;
4488 }
4489
4490 if (!VN_ISKAS(vp)) {
4491 /*
4492 * This is not a segkmem page but another page which
4493 * has been kernel mapped. It had better have at least
4494 * a share lock on it. Return the pfn.
4495 */
4496 sfmmu_mlist_exit(pml);
4497 SFMMU_HASH_UNLOCK(hmebp);
4498 if (locked)
4499 page_unlock(pp);
4500 kmem_cache_free(pa_hment_cache, pahmep);
4501 ASSERT(PAGE_LOCKED(pp));
4502 *rpfn = pfn;
4503 if (cookiep)
4504 *cookiep = HAC_COOKIE_NONE;
4505 return (0);
4506 }
4507
4508 /*
4509 * Setup this pa_hment and link its embedded dummy sf_hment into
4510 * the mapping list.
4511 */
4512 pp->p_share++;
4513 pahmep->cb_id = callback_id;
4514 pahmep->addr = vaddr;
4515 pahmep->len = len;
4516 pahmep->refcnt = 1;
4517 pahmep->flags = 0;
4518 pahmep->pvt = pvt;
4519
4520 sfhmep->hme_tte.ll = 0;
4521 sfhmep->hme_data = pahmep;
4522 sfhmep->hme_prev = osfhmep;
4523 sfhmep->hme_next = osfhmep->hme_next;
4524
4525 if (osfhmep->hme_next)
4526 osfhmep->hme_next->hme_prev = sfhmep;
4527
4528 osfhmep->hme_next = sfhmep;
4529
4530 sfmmu_mlist_exit(pml);
4531 SFMMU_HASH_UNLOCK(hmebp);
4532
4533 if (locked)
4534 page_unlock(pp);
4535
4536 *rpfn = pfn;
4537 if (cookiep)
4538 *cookiep = (void *)pahmep;
4539
4540 return (0);
4541 }
4542
4543 /*
4544 * Remove the relocation callbacks from the specified addr/len.
4545 */
4546 void
hat_delete_callback(caddr_t vaddr,uint_t len,void * pvt,uint_t flags,void * cookie)4547 hat_delete_callback(caddr_t vaddr, uint_t len, void *pvt, uint_t flags,
4548 void *cookie)
4549 {
4550 struct hmehash_bucket *hmebp;
4551 hmeblk_tag hblktag;
4552 struct hme_blk *hmeblkp;
4553 int hmeshift, hashno;
4554 caddr_t saddr;
4555 struct pa_hment *pahmep;
4556 struct sf_hment *sfhmep, *osfhmep;
4557 kmutex_t *pml;
4558 tte_t tte;
4559 page_t *pp;
4560 vnode_t *vp;
4561 u_offset_t off;
4562 int locked = 0;
4563
4564 /*
4565 * If the cookie is HAC_COOKIE_NONE then there is no pa_hment to
4566 * remove so just return.
4567 */
4568 if (cookie == HAC_COOKIE_NONE || IS_KPM_ADDR(vaddr))
4569 return;
4570
4571 saddr = (caddr_t)((uintptr_t)vaddr & MMU_PAGEMASK);
4572
4573 rehash:
4574 /* Find the mapping(s) for this page */
4575 for (hashno = TTE64K, hmeblkp = NULL;
4576 hmeblkp == NULL && hashno <= mmu_hashcnt;
4577 hashno++) {
4578 hmeshift = HME_HASH_SHIFT(hashno);
4579 hblktag.htag_id = ksfmmup;
4580 hblktag.htag_rid = SFMMU_INVALID_SHMERID;
4581 hblktag.htag_bspage = HME_HASH_BSPAGE(saddr, hmeshift);
4582 hblktag.htag_rehash = hashno;
4583 hmebp = HME_HASH_FUNCTION(ksfmmup, saddr, hmeshift);
4584
4585 SFMMU_HASH_LOCK(hmebp);
4586
4587 HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp);
4588
4589 if (hmeblkp == NULL)
4590 SFMMU_HASH_UNLOCK(hmebp);
4591 }
4592
4593 if (hmeblkp == NULL)
4594 return;
4595
4596 ASSERT(!hmeblkp->hblk_shared);
4597
4598 HBLKTOHME(osfhmep, hmeblkp, saddr);
4599
4600 sfmmu_copytte(&osfhmep->hme_tte, &tte);
4601 if (!TTE_IS_VALID(&tte)) {
4602 SFMMU_HASH_UNLOCK(hmebp);
4603 return;
4604 }
4605
4606 pp = osfhmep->hme_page;
4607 if (pp == NULL) {
4608 SFMMU_HASH_UNLOCK(hmebp);
4609 ASSERT(cookie == NULL);
4610 return;
4611 }
4612
4613 vp = pp->p_vnode;
4614 off = pp->p_offset;
4615
4616 pml = sfmmu_mlist_enter(pp);
4617
4618 if (flags & HAC_PAGELOCK) {
4619 if (!page_trylock(pp, SE_SHARED)) {
4620 /*
4621 * Somebody is holding SE_EXCL lock. Might
4622 * even be hat_page_relocate(). Drop all
4623 * our locks, lookup the page in &kvp, and
4624 * retry. If it doesn't exist in &kvp and &zvp,
4625 * then we must be dealing with a kernel mapped
4626 * page which doesn't actually belong to
4627 * segkmem so we punt.
4628 */
4629 sfmmu_mlist_exit(pml);
4630 SFMMU_HASH_UNLOCK(hmebp);
4631 pp = page_lookup(&kvp, (u_offset_t)saddr, SE_SHARED);
4632 /* check zvp before giving up */
4633 if (pp == NULL)
4634 pp = page_lookup(&zvp, (u_offset_t)saddr,
4635 SE_SHARED);
4636
4637 if (pp == NULL) {
4638 ASSERT(cookie == NULL);
4639 return;
4640 }
4641 page_unlock(pp);
4642 goto rehash;
4643 }
4644 locked = 1;
4645 }
4646
4647 ASSERT(PAGE_LOCKED(pp));
4648
4649 if (osfhmep->hme_page != pp || pp->p_vnode != vp ||
4650 pp->p_offset != off) {
4651 /*
4652 * The page moved before we got our hands on it. Drop
4653 * all the locks and try again.
4654 */
4655 ASSERT((flags & HAC_PAGELOCK) != 0);
4656 sfmmu_mlist_exit(pml);
4657 SFMMU_HASH_UNLOCK(hmebp);
4658 page_unlock(pp);
4659 locked = 0;
4660 goto rehash;
4661 }
4662
4663 if (!VN_ISKAS(vp)) {
4664 /*
4665 * This is not a segkmem page but another page which
4666 * has been kernel mapped.
4667 */
4668 sfmmu_mlist_exit(pml);
4669 SFMMU_HASH_UNLOCK(hmebp);
4670 if (locked)
4671 page_unlock(pp);
4672 ASSERT(cookie == NULL);
4673 return;
4674 }
4675
4676 if (cookie != NULL) {
4677 pahmep = (struct pa_hment *)cookie;
4678 sfhmep = &pahmep->sfment;
4679 } else {
4680 for (sfhmep = pp->p_mapping; sfhmep != NULL;
4681 sfhmep = sfhmep->hme_next) {
4682
4683 /*
4684 * skip va<->pa mappings
4685 */
4686 if (!IS_PAHME(sfhmep))
4687 continue;
4688
4689 pahmep = sfhmep->hme_data;
4690 ASSERT(pahmep != NULL);
4691
4692 /*
4693 * if pa_hment matches, remove it
4694 */
4695 if ((pahmep->pvt == pvt) &&
4696 (pahmep->addr == vaddr) &&
4697 (pahmep->len == len)) {
4698 break;
4699 }
4700 }
4701 }
4702
4703 if (sfhmep == NULL) {
4704 if (!panicstr) {
4705 panic("hat_delete_callback: pa_hment not found, pp %p",
4706 (void *)pp);
4707 }
4708 return;
4709 }
4710
4711 /*
4712 * Note: at this point a valid kernel mapping must still be
4713 * present on this page.
4714 */
4715 pp->p_share--;
4716 if (pp->p_share <= 0)
4717 panic("hat_delete_callback: zero p_share");
4718
4719 if (--pahmep->refcnt == 0) {
4720 if (pahmep->flags != 0)
4721 panic("hat_delete_callback: pa_hment is busy");
4722
4723 /*
4724 * Remove sfhmep from the mapping list for the page.
4725 */
4726 if (sfhmep->hme_prev) {
4727 sfhmep->hme_prev->hme_next = sfhmep->hme_next;
4728 } else {
4729 pp->p_mapping = sfhmep->hme_next;
4730 }
4731
4732 if (sfhmep->hme_next)
4733 sfhmep->hme_next->hme_prev = sfhmep->hme_prev;
4734
4735 sfmmu_mlist_exit(pml);
4736 SFMMU_HASH_UNLOCK(hmebp);
4737
4738 if (locked)
4739 page_unlock(pp);
4740
4741 kmem_cache_free(pa_hment_cache, pahmep);
4742 return;
4743 }
4744
4745 sfmmu_mlist_exit(pml);
4746 SFMMU_HASH_UNLOCK(hmebp);
4747 if (locked)
4748 page_unlock(pp);
4749 }
4750
4751 /*
4752 * hat_probe returns 1 if the translation for the address 'addr' is
4753 * loaded, zero otherwise.
4754 *
4755 * hat_probe should be used only for advisorary purposes because it may
4756 * occasionally return the wrong value. The implementation must guarantee that
4757 * returning the wrong value is a very rare event. hat_probe is used
4758 * to implement optimizations in the segment drivers.
4759 *
4760 */
4761 int
hat_probe(struct hat * sfmmup,caddr_t addr)4762 hat_probe(struct hat *sfmmup, caddr_t addr)
4763 {
4764 pfn_t pfn;
4765 tte_t tte;
4766
4767 ASSERT(sfmmup != NULL);
4768 ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
4769
4770 ASSERT((sfmmup == ksfmmup) || AS_LOCK_HELD(sfmmup->sfmmu_as));
4771
4772 if (sfmmup == ksfmmup) {
4773 while ((pfn = sfmmu_vatopfn(addr, sfmmup, &tte))
4774 == PFN_SUSPENDED) {
4775 sfmmu_vatopfn_suspended(addr, sfmmup, &tte);
4776 }
4777 } else {
4778 pfn = sfmmu_uvatopfn(addr, sfmmup, NULL);
4779 }
4780
4781 if (pfn != PFN_INVALID)
4782 return (1);
4783 else
4784 return (0);
4785 }
4786
4787 ssize_t
hat_getpagesize(struct hat * sfmmup,caddr_t addr)4788 hat_getpagesize(struct hat *sfmmup, caddr_t addr)
4789 {
4790 tte_t tte;
4791
4792 ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
4793
4794 if (sfmmup == ksfmmup) {
4795 if (sfmmu_vatopfn(addr, sfmmup, &tte) == PFN_INVALID) {
4796 return (-1);
4797 }
4798 } else {
4799 if (sfmmu_uvatopfn(addr, sfmmup, &tte) == PFN_INVALID) {
4800 return (-1);
4801 }
4802 }
4803
4804 ASSERT(TTE_IS_VALID(&tte));
4805 return (TTEBYTES(TTE_CSZ(&tte)));
4806 }
4807
4808 uint_t
hat_getattr(struct hat * sfmmup,caddr_t addr,uint_t * attr)4809 hat_getattr(struct hat *sfmmup, caddr_t addr, uint_t *attr)
4810 {
4811 tte_t tte;
4812
4813 ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
4814
4815 if (sfmmup == ksfmmup) {
4816 if (sfmmu_vatopfn(addr, sfmmup, &tte) == PFN_INVALID) {
4817 tte.ll = 0;
4818 }
4819 } else {
4820 if (sfmmu_uvatopfn(addr, sfmmup, &tte) == PFN_INVALID) {
4821 tte.ll = 0;
4822 }
4823 }
4824 if (TTE_IS_VALID(&tte)) {
4825 *attr = sfmmu_ptov_attr(&tte);
4826 return (0);
4827 }
4828 *attr = 0;
4829 return ((uint_t)0xffffffff);
4830 }
4831
4832 /*
4833 * Enables more attributes on specified address range (ie. logical OR)
4834 */
4835 void
hat_setattr(struct hat * hat,caddr_t addr,size_t len,uint_t attr)4836 hat_setattr(struct hat *hat, caddr_t addr, size_t len, uint_t attr)
4837 {
4838 if (hat->sfmmu_xhat_provider) {
4839 XHAT_SETATTR(hat, addr, len, attr);
4840 return;
4841 } else {
4842 /*
4843 * This must be a CPU HAT. If the address space has
4844 * XHATs attached, change attributes for all of them,
4845 * just in case
4846 */
4847 ASSERT(hat->sfmmu_as != NULL);
4848 if (hat->sfmmu_as->a_xhat != NULL)
4849 xhat_setattr_all(hat->sfmmu_as, addr, len, attr);
4850 }
4851
4852 sfmmu_chgattr(hat, addr, len, attr, SFMMU_SETATTR);
4853 }
4854
4855 /*
4856 * Assigns attributes to the specified address range. All the attributes
4857 * are specified.
4858 */
4859 void
hat_chgattr(struct hat * hat,caddr_t addr,size_t len,uint_t attr)4860 hat_chgattr(struct hat *hat, caddr_t addr, size_t len, uint_t attr)
4861 {
4862 if (hat->sfmmu_xhat_provider) {
4863 XHAT_CHGATTR(hat, addr, len, attr);
4864 return;
4865 } else {
4866 /*
4867 * This must be a CPU HAT. If the address space has
4868 * XHATs attached, change attributes for all of them,
4869 * just in case
4870 */
4871 ASSERT(hat->sfmmu_as != NULL);
4872 if (hat->sfmmu_as->a_xhat != NULL)
4873 xhat_chgattr_all(hat->sfmmu_as, addr, len, attr);
4874 }
4875
4876 sfmmu_chgattr(hat, addr, len, attr, SFMMU_CHGATTR);
4877 }
4878
4879 /*
4880 * Remove attributes on the specified address range (ie. loginal NAND)
4881 */
4882 void
hat_clrattr(struct hat * hat,caddr_t addr,size_t len,uint_t attr)4883 hat_clrattr(struct hat *hat, caddr_t addr, size_t len, uint_t attr)
4884 {
4885 if (hat->sfmmu_xhat_provider) {
4886 XHAT_CLRATTR(hat, addr, len, attr);
4887 return;
4888 } else {
4889 /*
4890 * This must be a CPU HAT. If the address space has
4891 * XHATs attached, change attributes for all of them,
4892 * just in case
4893 */
4894 ASSERT(hat->sfmmu_as != NULL);
4895 if (hat->sfmmu_as->a_xhat != NULL)
4896 xhat_clrattr_all(hat->sfmmu_as, addr, len, attr);
4897 }
4898
4899 sfmmu_chgattr(hat, addr, len, attr, SFMMU_CLRATTR);
4900 }
4901
4902 /*
4903 * Change attributes on an address range to that specified by attr and mode.
4904 */
4905 static void
sfmmu_chgattr(struct hat * sfmmup,caddr_t addr,size_t len,uint_t attr,int mode)4906 sfmmu_chgattr(struct hat *sfmmup, caddr_t addr, size_t len, uint_t attr,
4907 int mode)
4908 {
4909 struct hmehash_bucket *hmebp;
4910 hmeblk_tag hblktag;
4911 int hmeshift, hashno = 1;
4912 struct hme_blk *hmeblkp, *list = NULL;
4913 caddr_t endaddr;
4914 cpuset_t cpuset;
4915 demap_range_t dmr;
4916
4917 CPUSET_ZERO(cpuset);
4918
4919 ASSERT((sfmmup == ksfmmup) || AS_LOCK_HELD(sfmmup->sfmmu_as));
4920 ASSERT((len & MMU_PAGEOFFSET) == 0);
4921 ASSERT(((uintptr_t)addr & MMU_PAGEOFFSET) == 0);
4922
4923 if ((attr & PROT_USER) && (mode != SFMMU_CLRATTR) &&
4924 ((addr + len) > (caddr_t)USERLIMIT)) {
4925 panic("user addr %p in kernel space",
4926 (void *)addr);
4927 }
4928
4929 endaddr = addr + len;
4930 hblktag.htag_id = sfmmup;
4931 hblktag.htag_rid = SFMMU_INVALID_SHMERID;
4932 DEMAP_RANGE_INIT(sfmmup, &dmr);
4933
4934 while (addr < endaddr) {
4935 hmeshift = HME_HASH_SHIFT(hashno);
4936 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
4937 hblktag.htag_rehash = hashno;
4938 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);
4939
4940 SFMMU_HASH_LOCK(hmebp);
4941
4942 HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list);
4943 if (hmeblkp != NULL) {
4944 ASSERT(!hmeblkp->hblk_shared);
4945 /*
4946 * We've encountered a shadow hmeblk so skip the range
4947 * of the next smaller mapping size.
4948 */
4949 if (hmeblkp->hblk_shw_bit) {
4950 ASSERT(sfmmup != ksfmmup);
4951 ASSERT(hashno > 1);
4952 addr = (caddr_t)P2END((uintptr_t)addr,
4953 TTEBYTES(hashno - 1));
4954 } else {
4955 addr = sfmmu_hblk_chgattr(sfmmup,
4956 hmeblkp, addr, endaddr, &dmr, attr, mode);
4957 }
4958 SFMMU_HASH_UNLOCK(hmebp);
4959 hashno = 1;
4960 continue;
4961 }
4962 SFMMU_HASH_UNLOCK(hmebp);
4963
4964 if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) {
4965 /*
4966 * We have traversed the whole list and rehashed
4967 * if necessary without finding the address to chgattr.
4968 * This is ok, so we increment the address by the
4969 * smallest hmeblk range for kernel mappings or for
4970 * user mappings with no large pages, and the largest
4971 * hmeblk range, to account for shadow hmeblks, for
4972 * user mappings with large pages and continue.
4973 */
4974 if (sfmmup == ksfmmup)
4975 addr = (caddr_t)P2END((uintptr_t)addr,
4976 TTEBYTES(1));
4977 else
4978 addr = (caddr_t)P2END((uintptr_t)addr,
4979 TTEBYTES(hashno));
4980 hashno = 1;
4981 } else {
4982 hashno++;
4983 }
4984 }
4985
4986 sfmmu_hblks_list_purge(&list, 0);
4987 DEMAP_RANGE_FLUSH(&dmr);
4988 cpuset = sfmmup->sfmmu_cpusran;
4989 xt_sync(cpuset);
4990 }
4991
4992 /*
4993 * This function chgattr on a range of addresses in an hmeblk. It returns the
4994 * next addres that needs to be chgattr.
4995 * It should be called with the hash lock held.
4996 * XXX It should be possible to optimize chgattr by not flushing every time but
4997 * on the other hand:
4998 * 1. do one flush crosscall.
4999 * 2. only flush if we are increasing permissions (make sure this will work)
5000 */
5001 static caddr_t
sfmmu_hblk_chgattr(struct hat * sfmmup,struct hme_blk * hmeblkp,caddr_t addr,caddr_t endaddr,demap_range_t * dmrp,uint_t attr,int mode)5002 sfmmu_hblk_chgattr(struct hat *sfmmup, struct hme_blk *hmeblkp, caddr_t addr,
5003 caddr_t endaddr, demap_range_t *dmrp, uint_t attr, int mode)
5004 {
5005 tte_t tte, tteattr, tteflags, ttemod;
5006 struct sf_hment *sfhmep;
5007 int ttesz;
5008 struct page *pp = NULL;
5009 kmutex_t *pml, *pmtx;
5010 int ret;
5011 int use_demap_range;
5012 #if defined(SF_ERRATA_57)
5013 int check_exec;
5014 #endif
5015
5016 ASSERT(in_hblk_range(hmeblkp, addr));
5017 ASSERT(hmeblkp->hblk_shw_bit == 0);
5018 ASSERT(!hmeblkp->hblk_shared);
5019
5020 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp));
5021 ttesz = get_hblk_ttesz(hmeblkp);
5022
5023 /*
5024 * Flush the current demap region if addresses have been
5025 * skipped or the page size doesn't match.
5026 */
5027 use_demap_range = (TTEBYTES(ttesz) == DEMAP_RANGE_PGSZ(dmrp));
5028 if (use_demap_range) {
5029 DEMAP_RANGE_CONTINUE(dmrp, addr, endaddr);
5030 } else if (dmrp != NULL) {
5031 DEMAP_RANGE_FLUSH(dmrp);
5032 }
5033
5034 tteattr.ll = sfmmu_vtop_attr(attr, mode, &tteflags);
5035 #if defined(SF_ERRATA_57)
5036 check_exec = (sfmmup != ksfmmup) &&
5037 AS_TYPE_64BIT(sfmmup->sfmmu_as) &&
5038 TTE_IS_EXECUTABLE(&tteattr);
5039 #endif
5040 HBLKTOHME(sfhmep, hmeblkp, addr);
5041 while (addr < endaddr) {
5042 sfmmu_copytte(&sfhmep->hme_tte, &tte);
5043 if (TTE_IS_VALID(&tte)) {
5044 if ((tte.ll & tteflags.ll) == tteattr.ll) {
5045 /*
5046 * if the new attr is the same as old
5047 * continue
5048 */
5049 goto next_addr;
5050 }
5051 if (!TTE_IS_WRITABLE(&tteattr)) {
5052 /*
5053 * make sure we clear hw modify bit if we
5054 * removing write protections
5055 */
5056 tteflags.tte_intlo |= TTE_HWWR_INT;
5057 }
5058
5059 pml = NULL;
5060 pp = sfhmep->hme_page;
5061 if (pp) {
5062 pml = sfmmu_mlist_enter(pp);
5063 }
5064
5065 if (pp != sfhmep->hme_page) {
5066 /*
5067 * tte must have been unloaded.
5068 */
5069 ASSERT(pml);
5070 sfmmu_mlist_exit(pml);
5071 continue;
5072 }
5073
5074 ASSERT(pp == NULL || sfmmu_mlist_held(pp));
5075
5076 ttemod = tte;
5077 ttemod.ll = (ttemod.ll & ~tteflags.ll) | tteattr.ll;
5078 ASSERT(TTE_TO_TTEPFN(&ttemod) == TTE_TO_TTEPFN(&tte));
5079
5080 #if defined(SF_ERRATA_57)
5081 if (check_exec && addr < errata57_limit)
5082 ttemod.tte_exec_perm = 0;
5083 #endif
5084 ret = sfmmu_modifytte_try(&tte, &ttemod,
5085 &sfhmep->hme_tte);
5086
5087 if (ret < 0) {
5088 /* tte changed underneath us */
5089 if (pml) {
5090 sfmmu_mlist_exit(pml);
5091 }
5092 continue;
5093 }
5094
5095 if (tteflags.tte_intlo & TTE_HWWR_INT) {
5096 /*
5097 * need to sync if we are clearing modify bit.
5098 */
5099 sfmmu_ttesync(sfmmup, addr, &tte, pp);
5100 }
5101
5102 if (pp && PP_ISRO(pp)) {
5103 if (tteattr.tte_intlo & TTE_WRPRM_INT) {
5104 pmtx = sfmmu_page_enter(pp);
5105 PP_CLRRO(pp);
5106 sfmmu_page_exit(pmtx);
5107 }
5108 }
5109
5110 if (ret > 0 && use_demap_range) {
5111 DEMAP_RANGE_MARKPG(dmrp, addr);
5112 } else if (ret > 0) {
5113 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0);
5114 }
5115
5116 if (pml) {
5117 sfmmu_mlist_exit(pml);
5118 }
5119 }
5120 next_addr:
5121 addr += TTEBYTES(ttesz);
5122 sfhmep++;
5123 DEMAP_RANGE_NEXTPG(dmrp);
5124 }
5125 return (addr);
5126 }
5127
5128 /*
5129 * This routine converts virtual attributes to physical ones. It will
5130 * update the tteflags field with the tte mask corresponding to the attributes
5131 * affected and it returns the new attributes. It will also clear the modify
5132 * bit if we are taking away write permission. This is necessary since the
5133 * modify bit is the hardware permission bit and we need to clear it in order
5134 * to detect write faults.
5135 */
5136 static uint64_t
sfmmu_vtop_attr(uint_t attr,int mode,tte_t * ttemaskp)5137 sfmmu_vtop_attr(uint_t attr, int mode, tte_t *ttemaskp)
5138 {
5139 tte_t ttevalue;
5140
5141 ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR));
5142
5143 switch (mode) {
5144 case SFMMU_CHGATTR:
5145 /* all attributes specified */
5146 ttevalue.tte_inthi = MAKE_TTEATTR_INTHI(attr);
5147 ttevalue.tte_intlo = MAKE_TTEATTR_INTLO(attr);
5148 ttemaskp->tte_inthi = TTEINTHI_ATTR;
5149 ttemaskp->tte_intlo = TTEINTLO_ATTR;
5150 break;
5151 case SFMMU_SETATTR:
5152 ASSERT(!(attr & ~HAT_PROT_MASK));
5153 ttemaskp->ll = 0;
5154 ttevalue.ll = 0;
5155 /*
5156 * a valid tte implies exec and read for sfmmu
5157 * so no need to do anything about them.
5158 * since priviledged access implies user access
5159 * PROT_USER doesn't make sense either.
5160 */
5161 if (attr & PROT_WRITE) {
5162 ttemaskp->tte_intlo |= TTE_WRPRM_INT;
5163 ttevalue.tte_intlo |= TTE_WRPRM_INT;
5164 }
5165 break;
5166 case SFMMU_CLRATTR:
5167 /* attributes will be nand with current ones */
5168 if (attr & ~(PROT_WRITE | PROT_USER)) {
5169 panic("sfmmu: attr %x not supported", attr);
5170 }
5171 ttemaskp->ll = 0;
5172 ttevalue.ll = 0;
5173 if (attr & PROT_WRITE) {
5174 /* clear both writable and modify bit */
5175 ttemaskp->tte_intlo |= TTE_WRPRM_INT | TTE_HWWR_INT;
5176 }
5177 if (attr & PROT_USER) {
5178 ttemaskp->tte_intlo |= TTE_PRIV_INT;
5179 ttevalue.tte_intlo |= TTE_PRIV_INT;
5180 }
5181 break;
5182 default:
5183 panic("sfmmu_vtop_attr: bad mode %x", mode);
5184 }
5185 ASSERT(TTE_TO_TTEPFN(&ttevalue) == 0);
5186 return (ttevalue.ll);
5187 }
5188
5189 static uint_t
sfmmu_ptov_attr(tte_t * ttep)5190 sfmmu_ptov_attr(tte_t *ttep)
5191 {
5192 uint_t attr;
5193
5194 ASSERT(TTE_IS_VALID(ttep));
5195
5196 attr = PROT_READ;
5197
5198 if (TTE_IS_WRITABLE(ttep)) {
5199 attr |= PROT_WRITE;
5200 }
5201 if (TTE_IS_EXECUTABLE(ttep)) {
5202 attr |= PROT_EXEC;
5203 }
5204 if (!TTE_IS_PRIVILEGED(ttep)) {
5205 attr |= PROT_USER;
5206 }
5207 if (TTE_IS_NFO(ttep)) {
5208 attr |= HAT_NOFAULT;
5209 }
5210 if (TTE_IS_NOSYNC(ttep)) {
5211 attr |= HAT_NOSYNC;
5212 }
5213 if (TTE_IS_SIDEFFECT(ttep)) {
5214 attr |= SFMMU_SIDEFFECT;
5215 }
5216 if (!TTE_IS_VCACHEABLE(ttep)) {
5217 attr |= SFMMU_UNCACHEVTTE;
5218 }
5219 if (!TTE_IS_PCACHEABLE(ttep)) {
5220 attr |= SFMMU_UNCACHEPTTE;
5221 }
5222 return (attr);
5223 }
5224
5225 /*
5226 * hat_chgprot is a deprecated hat call. New segment drivers
5227 * should store all attributes and use hat_*attr calls.
5228 *
5229 * Change the protections in the virtual address range
5230 * given to the specified virtual protection. If vprot is ~PROT_WRITE,
5231 * then remove write permission, leaving the other
5232 * permissions unchanged. If vprot is ~PROT_USER, remove user permissions.
5233 *
5234 */
5235 void
hat_chgprot(struct hat * sfmmup,caddr_t addr,size_t len,uint_t vprot)5236 hat_chgprot(struct hat *sfmmup, caddr_t addr, size_t len, uint_t vprot)
5237 {
5238 struct hmehash_bucket *hmebp;
5239 hmeblk_tag hblktag;
5240 int hmeshift, hashno = 1;
5241 struct hme_blk *hmeblkp, *list = NULL;
5242 caddr_t endaddr;
5243 cpuset_t cpuset;
5244 demap_range_t dmr;
5245
5246 ASSERT((len & MMU_PAGEOFFSET) == 0);
5247 ASSERT(((uintptr_t)addr & MMU_PAGEOFFSET) == 0);
5248
5249 if (sfmmup->sfmmu_xhat_provider) {
5250 XHAT_CHGPROT(sfmmup, addr, len, vprot);
5251 return;
5252 } else {
5253 /*
5254 * This must be a CPU HAT. If the address space has
5255 * XHATs attached, change attributes for all of them,
5256 * just in case
5257 */
5258 ASSERT(sfmmup->sfmmu_as != NULL);
5259 if (sfmmup->sfmmu_as->a_xhat != NULL)
5260 xhat_chgprot_all(sfmmup->sfmmu_as, addr, len, vprot);
5261 }
5262
5263 CPUSET_ZERO(cpuset);
5264
5265 if ((vprot != (uint_t)~PROT_WRITE) && (vprot & PROT_USER) &&
5266 ((addr + len) > (caddr_t)USERLIMIT)) {
5267 panic("user addr %p vprot %x in kernel space",
5268 (void *)addr, vprot);
5269 }
5270 endaddr = addr + len;
5271 hblktag.htag_id = sfmmup;
5272 hblktag.htag_rid = SFMMU_INVALID_SHMERID;
5273 DEMAP_RANGE_INIT(sfmmup, &dmr);
5274
5275 while (addr < endaddr) {
5276 hmeshift = HME_HASH_SHIFT(hashno);
5277 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
5278 hblktag.htag_rehash = hashno;
5279 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);
5280
5281 SFMMU_HASH_LOCK(hmebp);
5282
5283 HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list);
5284 if (hmeblkp != NULL) {
5285 ASSERT(!hmeblkp->hblk_shared);
5286 /*
5287 * We've encountered a shadow hmeblk so skip the range
5288 * of the next smaller mapping size.
5289 */
5290 if (hmeblkp->hblk_shw_bit) {
5291 ASSERT(sfmmup != ksfmmup);
5292 ASSERT(hashno > 1);
5293 addr = (caddr_t)P2END((uintptr_t)addr,
5294 TTEBYTES(hashno - 1));
5295 } else {
5296 addr = sfmmu_hblk_chgprot(sfmmup, hmeblkp,
5297 addr, endaddr, &dmr, vprot);
5298 }
5299 SFMMU_HASH_UNLOCK(hmebp);
5300 hashno = 1;
5301 continue;
5302 }
5303 SFMMU_HASH_UNLOCK(hmebp);
5304
5305 if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) {
5306 /*
5307 * We have traversed the whole list and rehashed
5308 * if necessary without finding the address to chgprot.
5309 * This is ok so we increment the address by the
5310 * smallest hmeblk range for kernel mappings and the
5311 * largest hmeblk range, to account for shadow hmeblks,
5312 * for user mappings and continue.
5313 */
5314 if (sfmmup == ksfmmup)
5315 addr = (caddr_t)P2END((uintptr_t)addr,
5316 TTEBYTES(1));
5317 else
5318 addr = (caddr_t)P2END((uintptr_t)addr,
5319 TTEBYTES(hashno));
5320 hashno = 1;
5321 } else {
5322 hashno++;
5323 }
5324 }
5325
5326 sfmmu_hblks_list_purge(&list, 0);
5327 DEMAP_RANGE_FLUSH(&dmr);
5328 cpuset = sfmmup->sfmmu_cpusran;
5329 xt_sync(cpuset);
5330 }
5331
5332 /*
5333 * This function chgprots a range of addresses in an hmeblk. It returns the
5334 * next addres that needs to be chgprot.
5335 * It should be called with the hash lock held.
5336 * XXX It shold be possible to optimize chgprot by not flushing every time but
5337 * on the other hand:
5338 * 1. do one flush crosscall.
5339 * 2. only flush if we are increasing permissions (make sure this will work)
5340 */
5341 static caddr_t
sfmmu_hblk_chgprot(sfmmu_t * sfmmup,struct hme_blk * hmeblkp,caddr_t addr,caddr_t endaddr,demap_range_t * dmrp,uint_t vprot)5342 sfmmu_hblk_chgprot(sfmmu_t *sfmmup, struct hme_blk *hmeblkp, caddr_t addr,
5343 caddr_t endaddr, demap_range_t *dmrp, uint_t vprot)
5344 {
5345 uint_t pprot;
5346 tte_t tte, ttemod;
5347 struct sf_hment *sfhmep;
5348 uint_t tteflags;
5349 int ttesz;
5350 struct page *pp = NULL;
5351 kmutex_t *pml, *pmtx;
5352 int ret;
5353 int use_demap_range;
5354 #if defined(SF_ERRATA_57)
5355 int check_exec;
5356 #endif
5357
5358 ASSERT(in_hblk_range(hmeblkp, addr));
5359 ASSERT(hmeblkp->hblk_shw_bit == 0);
5360 ASSERT(!hmeblkp->hblk_shared);
5361
5362 #ifdef DEBUG
5363 if (get_hblk_ttesz(hmeblkp) != TTE8K &&
5364 (endaddr < get_hblk_endaddr(hmeblkp))) {
5365 panic("sfmmu_hblk_chgprot: partial chgprot of large page");
5366 }
5367 #endif /* DEBUG */
5368
5369 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp));
5370 ttesz = get_hblk_ttesz(hmeblkp);
5371
5372 pprot = sfmmu_vtop_prot(vprot, &tteflags);
5373 #if defined(SF_ERRATA_57)
5374 check_exec = (sfmmup != ksfmmup) &&
5375 AS_TYPE_64BIT(sfmmup->sfmmu_as) &&
5376 ((vprot & PROT_EXEC) == PROT_EXEC);
5377 #endif
5378 HBLKTOHME(sfhmep, hmeblkp, addr);
5379
5380 /*
5381 * Flush the current demap region if addresses have been
5382 * skipped or the page size doesn't match.
5383 */
5384 use_demap_range = (TTEBYTES(ttesz) == MMU_PAGESIZE);
5385 if (use_demap_range) {
5386 DEMAP_RANGE_CONTINUE(dmrp, addr, endaddr);
5387 } else if (dmrp != NULL) {
5388 DEMAP_RANGE_FLUSH(dmrp);
5389 }
5390
5391 while (addr < endaddr) {
5392 sfmmu_copytte(&sfhmep->hme_tte, &tte);
5393 if (TTE_IS_VALID(&tte)) {
5394 if (TTE_GET_LOFLAGS(&tte, tteflags) == pprot) {
5395 /*
5396 * if the new protection is the same as old
5397 * continue
5398 */
5399 goto next_addr;
5400 }
5401 pml = NULL;
5402 pp = sfhmep->hme_page;
5403 if (pp) {
5404 pml = sfmmu_mlist_enter(pp);
5405 }
5406 if (pp != sfhmep->hme_page) {
5407 /*
5408 * tte most have been unloaded
5409 * underneath us. Recheck
5410 */
5411 ASSERT(pml);
5412 sfmmu_mlist_exit(pml);
5413 continue;
5414 }
5415
5416 ASSERT(pp == NULL || sfmmu_mlist_held(pp));
5417
5418 ttemod = tte;
5419 TTE_SET_LOFLAGS(&ttemod, tteflags, pprot);
5420 #if defined(SF_ERRATA_57)
5421 if (check_exec && addr < errata57_limit)
5422 ttemod.tte_exec_perm = 0;
5423 #endif
5424 ret = sfmmu_modifytte_try(&tte, &ttemod,
5425 &sfhmep->hme_tte);
5426
5427 if (ret < 0) {
5428 /* tte changed underneath us */
5429 if (pml) {
5430 sfmmu_mlist_exit(pml);
5431 }
5432 continue;
5433 }
5434
5435 if (tteflags & TTE_HWWR_INT) {
5436 /*
5437 * need to sync if we are clearing modify bit.
5438 */
5439 sfmmu_ttesync(sfmmup, addr, &tte, pp);
5440 }
5441
5442 if (pp && PP_ISRO(pp)) {
5443 if (pprot & TTE_WRPRM_INT) {
5444 pmtx = sfmmu_page_enter(pp);
5445 PP_CLRRO(pp);
5446 sfmmu_page_exit(pmtx);
5447 }
5448 }
5449
5450 if (ret > 0 && use_demap_range) {
5451 DEMAP_RANGE_MARKPG(dmrp, addr);
5452 } else if (ret > 0) {
5453 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0);
5454 }
5455
5456 if (pml) {
5457 sfmmu_mlist_exit(pml);
5458 }
5459 }
5460 next_addr:
5461 addr += TTEBYTES(ttesz);
5462 sfhmep++;
5463 DEMAP_RANGE_NEXTPG(dmrp);
5464 }
5465 return (addr);
5466 }
5467
5468 /*
5469 * This routine is deprecated and should only be used by hat_chgprot.
5470 * The correct routine is sfmmu_vtop_attr.
5471 * This routine converts virtual page protections to physical ones. It will
5472 * update the tteflags field with the tte mask corresponding to the protections
5473 * affected and it returns the new protections. It will also clear the modify
5474 * bit if we are taking away write permission. This is necessary since the
5475 * modify bit is the hardware permission bit and we need to clear it in order
5476 * to detect write faults.
5477 * It accepts the following special protections:
5478 * ~PROT_WRITE = remove write permissions.
5479 * ~PROT_USER = remove user permissions.
5480 */
5481 static uint_t
sfmmu_vtop_prot(uint_t vprot,uint_t * tteflagsp)5482 sfmmu_vtop_prot(uint_t vprot, uint_t *tteflagsp)
5483 {
5484 if (vprot == (uint_t)~PROT_WRITE) {
5485 *tteflagsp = TTE_WRPRM_INT | TTE_HWWR_INT;
5486 return (0); /* will cause wrprm to be cleared */
5487 }
5488 if (vprot == (uint_t)~PROT_USER) {
5489 *tteflagsp = TTE_PRIV_INT;
5490 return (0); /* will cause privprm to be cleared */
5491 }
5492 if ((vprot == 0) || (vprot == PROT_USER) ||
5493 ((vprot & PROT_ALL) != vprot)) {
5494 panic("sfmmu_vtop_prot -- bad prot %x", vprot);
5495 }
5496
5497 switch (vprot) {
5498 case (PROT_READ):
5499 case (PROT_EXEC):
5500 case (PROT_EXEC | PROT_READ):
5501 *tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT | TTE_HWWR_INT;
5502 return (TTE_PRIV_INT); /* set prv and clr wrt */
5503 case (PROT_WRITE):
5504 case (PROT_WRITE | PROT_READ):
5505 case (PROT_EXEC | PROT_WRITE):
5506 case (PROT_EXEC | PROT_WRITE | PROT_READ):
5507 *tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT;
5508 return (TTE_PRIV_INT | TTE_WRPRM_INT); /* set prv and wrt */
5509 case (PROT_USER | PROT_READ):
5510 case (PROT_USER | PROT_EXEC):
5511 case (PROT_USER | PROT_EXEC | PROT_READ):
5512 *tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT | TTE_HWWR_INT;
5513 return (0); /* clr prv and wrt */
5514 case (PROT_USER | PROT_WRITE):
5515 case (PROT_USER | PROT_WRITE | PROT_READ):
5516 case (PROT_USER | PROT_EXEC | PROT_WRITE):
5517 case (PROT_USER | PROT_EXEC | PROT_WRITE | PROT_READ):
5518 *tteflagsp = TTE_PRIV_INT | TTE_WRPRM_INT;
5519 return (TTE_WRPRM_INT); /* clr prv and set wrt */
5520 default:
5521 panic("sfmmu_vtop_prot -- bad prot %x", vprot);
5522 }
5523 return (0);
5524 }
5525
5526 /*
5527 * Alternate unload for very large virtual ranges. With a true 64 bit VA,
5528 * the normal algorithm would take too long for a very large VA range with
5529 * few real mappings. This routine just walks thru all HMEs in the global
5530 * hash table to find and remove mappings.
5531 */
5532 static void
hat_unload_large_virtual(struct hat * sfmmup,caddr_t startaddr,size_t len,uint_t flags,hat_callback_t * callback)5533 hat_unload_large_virtual(
5534 struct hat *sfmmup,
5535 caddr_t startaddr,
5536 size_t len,
5537 uint_t flags,
5538 hat_callback_t *callback)
5539 {
5540 struct hmehash_bucket *hmebp;
5541 struct hme_blk *hmeblkp;
5542 struct hme_blk *pr_hblk = NULL;
5543 struct hme_blk *nx_hblk;
5544 struct hme_blk *list = NULL;
5545 int i;
5546 demap_range_t dmr, *dmrp;
5547 cpuset_t cpuset;
5548 caddr_t endaddr = startaddr + len;
5549 caddr_t sa;
5550 caddr_t ea;
5551 caddr_t cb_sa[MAX_CB_ADDR];
5552 caddr_t cb_ea[MAX_CB_ADDR];
5553 int addr_cnt = 0;
5554 int a = 0;
5555
5556 if (sfmmup->sfmmu_free) {
5557 dmrp = NULL;
5558 } else {
5559 dmrp = &dmr;
5560 DEMAP_RANGE_INIT(sfmmup, dmrp);
5561 }
5562
5563 /*
5564 * Loop through all the hash buckets of HME blocks looking for matches.
5565 */
5566 for (i = 0; i <= UHMEHASH_SZ; i++) {
5567 hmebp = &uhme_hash[i];
5568 SFMMU_HASH_LOCK(hmebp);
5569 hmeblkp = hmebp->hmeblkp;
5570 pr_hblk = NULL;
5571 while (hmeblkp) {
5572 nx_hblk = hmeblkp->hblk_next;
5573
5574 /*
5575 * skip if not this context, if a shadow block or
5576 * if the mapping is not in the requested range
5577 */
5578 if (hmeblkp->hblk_tag.htag_id != sfmmup ||
5579 hmeblkp->hblk_shw_bit ||
5580 (sa = (caddr_t)get_hblk_base(hmeblkp)) >= endaddr ||
5581 (ea = get_hblk_endaddr(hmeblkp)) <= startaddr) {
5582 pr_hblk = hmeblkp;
5583 goto next_block;
5584 }
5585
5586 ASSERT(!hmeblkp->hblk_shared);
5587 /*
5588 * unload if there are any current valid mappings
5589 */
5590 if (hmeblkp->hblk_vcnt != 0 ||
5591 hmeblkp->hblk_hmecnt != 0)
5592 (void) sfmmu_hblk_unload(sfmmup, hmeblkp,
5593 sa, ea, dmrp, flags);
5594
5595 /*
5596 * on unmap we also release the HME block itself, once
5597 * all mappings are gone.
5598 */
5599 if ((flags & HAT_UNLOAD_UNMAP) != 0 &&
5600 !hmeblkp->hblk_vcnt &&
5601 !hmeblkp->hblk_hmecnt) {
5602 ASSERT(!hmeblkp->hblk_lckcnt);
5603 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
5604 &list, 0);
5605 } else {
5606 pr_hblk = hmeblkp;
5607 }
5608
5609 if (callback == NULL)
5610 goto next_block;
5611
5612 /*
5613 * HME blocks may span more than one page, but we may be
5614 * unmapping only one page, so check for a smaller range
5615 * for the callback
5616 */
5617 if (sa < startaddr)
5618 sa = startaddr;
5619 if (--ea > endaddr)
5620 ea = endaddr - 1;
5621
5622 cb_sa[addr_cnt] = sa;
5623 cb_ea[addr_cnt] = ea;
5624 if (++addr_cnt == MAX_CB_ADDR) {
5625 if (dmrp != NULL) {
5626 DEMAP_RANGE_FLUSH(dmrp);
5627 cpuset = sfmmup->sfmmu_cpusran;
5628 xt_sync(cpuset);
5629 }
5630
5631 for (a = 0; a < MAX_CB_ADDR; ++a) {
5632 callback->hcb_start_addr = cb_sa[a];
5633 callback->hcb_end_addr = cb_ea[a];
5634 callback->hcb_function(callback);
5635 }
5636 addr_cnt = 0;
5637 }
5638
5639 next_block:
5640 hmeblkp = nx_hblk;
5641 }
5642 SFMMU_HASH_UNLOCK(hmebp);
5643 }
5644
5645 sfmmu_hblks_list_purge(&list, 0);
5646 if (dmrp != NULL) {
5647 DEMAP_RANGE_FLUSH(dmrp);
5648 cpuset = sfmmup->sfmmu_cpusran;
5649 xt_sync(cpuset);
5650 }
5651
5652 for (a = 0; a < addr_cnt; ++a) {
5653 callback->hcb_start_addr = cb_sa[a];
5654 callback->hcb_end_addr = cb_ea[a];
5655 callback->hcb_function(callback);
5656 }
5657
5658 /*
5659 * Check TSB and TLB page sizes if the process isn't exiting.
5660 */
5661 if (!sfmmup->sfmmu_free)
5662 sfmmu_check_page_sizes(sfmmup, 0);
5663 }
5664
5665 /*
5666 * Unload all the mappings in the range [addr..addr+len). addr and len must
5667 * be MMU_PAGESIZE aligned.
5668 */
5669
5670 extern struct seg *segkmap;
5671 #define ISSEGKMAP(sfmmup, addr) (sfmmup == ksfmmup && \
5672 segkmap->s_base <= (addr) && (addr) < (segkmap->s_base + segkmap->s_size))
5673
5674
5675 void
hat_unload_callback(struct hat * sfmmup,caddr_t addr,size_t len,uint_t flags,hat_callback_t * callback)5676 hat_unload_callback(
5677 struct hat *sfmmup,
5678 caddr_t addr,
5679 size_t len,
5680 uint_t flags,
5681 hat_callback_t *callback)
5682 {
5683 struct hmehash_bucket *hmebp;
5684 hmeblk_tag hblktag;
5685 int hmeshift, hashno, iskernel;
5686 struct hme_blk *hmeblkp, *pr_hblk, *list = NULL;
5687 caddr_t endaddr;
5688 cpuset_t cpuset;
5689 int addr_count = 0;
5690 int a;
5691 caddr_t cb_start_addr[MAX_CB_ADDR];
5692 caddr_t cb_end_addr[MAX_CB_ADDR];
5693 int issegkmap = ISSEGKMAP(sfmmup, addr);
5694 demap_range_t dmr, *dmrp;
5695
5696 if (sfmmup->sfmmu_xhat_provider) {
5697 XHAT_UNLOAD_CALLBACK(sfmmup, addr, len, flags, callback);
5698 return;
5699 } else {
5700 /*
5701 * This must be a CPU HAT. If the address space has
5702 * XHATs attached, unload the mappings for all of them,
5703 * just in case
5704 */
5705 ASSERT(sfmmup->sfmmu_as != NULL);
5706 if (sfmmup->sfmmu_as->a_xhat != NULL)
5707 xhat_unload_callback_all(sfmmup->sfmmu_as, addr,
5708 len, flags, callback);
5709 }
5710
5711 ASSERT((sfmmup == ksfmmup) || (flags & HAT_UNLOAD_OTHER) || \
5712 AS_LOCK_HELD(sfmmup->sfmmu_as));
5713
5714 ASSERT(sfmmup != NULL);
5715 ASSERT((len & MMU_PAGEOFFSET) == 0);
5716 ASSERT(!((uintptr_t)addr & MMU_PAGEOFFSET));
5717
5718 /*
5719 * Probing through a large VA range (say 63 bits) will be slow, even
5720 * at 4 Meg steps between the probes. So, when the virtual address range
5721 * is very large, search the HME entries for what to unload.
5722 *
5723 * len >> TTE_PAGE_SHIFT(TTE4M) is the # of 4Meg probes we'd need
5724 *
5725 * UHMEHASH_SZ is number of hash buckets to examine
5726 *
5727 */
5728 if (sfmmup != KHATID && (len >> TTE_PAGE_SHIFT(TTE4M)) > UHMEHASH_SZ) {
5729 hat_unload_large_virtual(sfmmup, addr, len, flags, callback);
5730 return;
5731 }
5732
5733 CPUSET_ZERO(cpuset);
5734
5735 /*
5736 * If the process is exiting, we can save a lot of fuss since
5737 * we'll flush the TLB when we free the ctx anyway.
5738 */
5739 if (sfmmup->sfmmu_free) {
5740 dmrp = NULL;
5741 } else {
5742 dmrp = &dmr;
5743 DEMAP_RANGE_INIT(sfmmup, dmrp);
5744 }
5745
5746 endaddr = addr + len;
5747 hblktag.htag_id = sfmmup;
5748 hblktag.htag_rid = SFMMU_INVALID_SHMERID;
5749
5750 /*
5751 * It is likely for the vm to call unload over a wide range of
5752 * addresses that are actually very sparsely populated by
5753 * translations. In order to speed this up the sfmmu hat supports
5754 * the concept of shadow hmeblks. Dummy large page hmeblks that
5755 * correspond to actual small translations are allocated at tteload
5756 * time and are referred to as shadow hmeblks. Now, during unload
5757 * time, we first check if we have a shadow hmeblk for that
5758 * translation. The absence of one means the corresponding address
5759 * range is empty and can be skipped.
5760 *
5761 * The kernel is an exception to above statement and that is why
5762 * we don't use shadow hmeblks and hash starting from the smallest
5763 * page size.
5764 */
5765 if (sfmmup == KHATID) {
5766 iskernel = 1;
5767 hashno = TTE64K;
5768 } else {
5769 iskernel = 0;
5770 if (mmu_page_sizes == max_mmu_page_sizes) {
5771 hashno = TTE256M;
5772 } else {
5773 hashno = TTE4M;
5774 }
5775 }
5776 while (addr < endaddr) {
5777 hmeshift = HME_HASH_SHIFT(hashno);
5778 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
5779 hblktag.htag_rehash = hashno;
5780 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);
5781
5782 SFMMU_HASH_LOCK(hmebp);
5783
5784 HME_HASH_SEARCH_PREV(hmebp, hblktag, hmeblkp, pr_hblk, &list);
5785 if (hmeblkp == NULL) {
5786 /*
5787 * didn't find an hmeblk. skip the appropiate
5788 * address range.
5789 */
5790 SFMMU_HASH_UNLOCK(hmebp);
5791 if (iskernel) {
5792 if (hashno < mmu_hashcnt) {
5793 hashno++;
5794 continue;
5795 } else {
5796 hashno = TTE64K;
5797 addr = (caddr_t)roundup((uintptr_t)addr
5798 + 1, MMU_PAGESIZE64K);
5799 continue;
5800 }
5801 }
5802 addr = (caddr_t)roundup((uintptr_t)addr + 1,
5803 (1 << hmeshift));
5804 if ((uintptr_t)addr & MMU_PAGEOFFSET512K) {
5805 ASSERT(hashno == TTE64K);
5806 continue;
5807 }
5808 if ((uintptr_t)addr & MMU_PAGEOFFSET4M) {
5809 hashno = TTE512K;
5810 continue;
5811 }
5812 if (mmu_page_sizes == max_mmu_page_sizes) {
5813 if ((uintptr_t)addr & MMU_PAGEOFFSET32M) {
5814 hashno = TTE4M;
5815 continue;
5816 }
5817 if ((uintptr_t)addr & MMU_PAGEOFFSET256M) {
5818 hashno = TTE32M;
5819 continue;
5820 }
5821 hashno = TTE256M;
5822 continue;
5823 } else {
5824 hashno = TTE4M;
5825 continue;
5826 }
5827 }
5828 ASSERT(hmeblkp);
5829 ASSERT(!hmeblkp->hblk_shared);
5830 if (!hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) {
5831 /*
5832 * If the valid count is zero we can skip the range
5833 * mapped by this hmeblk.
5834 * We free hblks in the case of HAT_UNMAP. HAT_UNMAP
5835 * is used by segment drivers as a hint
5836 * that the mapping resource won't be used any longer.
5837 * The best example of this is during exit().
5838 */
5839 addr = (caddr_t)roundup((uintptr_t)addr + 1,
5840 get_hblk_span(hmeblkp));
5841 if ((flags & HAT_UNLOAD_UNMAP) ||
5842 (iskernel && !issegkmap)) {
5843 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk,
5844 &list, 0);
5845 }
5846 SFMMU_HASH_UNLOCK(hmebp);
5847
5848 if (iskernel) {
5849 hashno = TTE64K;
5850 continue;
5851 }
5852 if ((uintptr_t)addr & MMU_PAGEOFFSET512K) {
5853 ASSERT(hashno == TTE64K);
5854 continue;
5855 }
5856 if ((uintptr_t)addr & MMU_PAGEOFFSET4M) {
5857 hashno = TTE512K;
5858 continue;
5859 }
5860 if (mmu_page_sizes == max_mmu_page_sizes) {
5861 if ((uintptr_t)addr & MMU_PAGEOFFSET32M) {
5862 hashno = TTE4M;
5863 continue;
5864 }
5865 if ((uintptr_t)addr & MMU_PAGEOFFSET256M) {
5866 hashno = TTE32M;
5867 continue;
5868 }
5869 hashno = TTE256M;
5870 continue;
5871 } else {
5872 hashno = TTE4M;
5873 continue;
5874 }
5875 }
5876 if (hmeblkp->hblk_shw_bit) {
5877 /*
5878 * If we encounter a shadow hmeblk we know there is
5879 * smaller sized hmeblks mapping the same address space.
5880 * Decrement the hash size and rehash.
5881 */
5882 ASSERT(sfmmup != KHATID);
5883 hashno--;
5884 SFMMU_HASH_UNLOCK(hmebp);
5885 continue;
5886 }
5887
5888 /*
5889 * track callback address ranges.
5890 * only start a new range when it's not contiguous
5891 */
5892 if (callback != NULL) {
5893 if (addr_count > 0 &&
5894 addr == cb_end_addr[addr_count - 1])
5895 --addr_count;
5896 else
5897 cb_start_addr[addr_count] = addr;
5898 }
5899
5900 addr = sfmmu_hblk_unload(sfmmup, hmeblkp, addr, endaddr,
5901 dmrp, flags);
5902
5903 if (callback != NULL)
5904 cb_end_addr[addr_count++] = addr;
5905
5906 if (((flags & HAT_UNLOAD_UNMAP) || (iskernel && !issegkmap)) &&
5907 !hmeblkp->hblk_vcnt && !hmeblkp->hblk_hmecnt) {
5908 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, &list, 0);
5909 }
5910 SFMMU_HASH_UNLOCK(hmebp);
5911
5912 /*
5913 * Notify our caller as to exactly which pages
5914 * have been unloaded. We do these in clumps,
5915 * to minimize the number of xt_sync()s that need to occur.
5916 */
5917 if (callback != NULL && addr_count == MAX_CB_ADDR) {
5918 if (dmrp != NULL) {
5919 DEMAP_RANGE_FLUSH(dmrp);
5920 cpuset = sfmmup->sfmmu_cpusran;
5921 xt_sync(cpuset);
5922 }
5923
5924 for (a = 0; a < MAX_CB_ADDR; ++a) {
5925 callback->hcb_start_addr = cb_start_addr[a];
5926 callback->hcb_end_addr = cb_end_addr[a];
5927 callback->hcb_function(callback);
5928 }
5929 addr_count = 0;
5930 }
5931 if (iskernel) {
5932 hashno = TTE64K;
5933 continue;
5934 }
5935 if ((uintptr_t)addr & MMU_PAGEOFFSET512K) {
5936 ASSERT(hashno == TTE64K);
5937 continue;
5938 }
5939 if ((uintptr_t)addr & MMU_PAGEOFFSET4M) {
5940 hashno = TTE512K;
5941 continue;
5942 }
5943 if (mmu_page_sizes == max_mmu_page_sizes) {
5944 if ((uintptr_t)addr & MMU_PAGEOFFSET32M) {
5945 hashno = TTE4M;
5946 continue;
5947 }
5948 if ((uintptr_t)addr & MMU_PAGEOFFSET256M) {
5949 hashno = TTE32M;
5950 continue;
5951 }
5952 hashno = TTE256M;
5953 } else {
5954 hashno = TTE4M;
5955 }
5956 }
5957
5958 sfmmu_hblks_list_purge(&list, 0);
5959 if (dmrp != NULL) {
5960 DEMAP_RANGE_FLUSH(dmrp);
5961 cpuset = sfmmup->sfmmu_cpusran;
5962 xt_sync(cpuset);
5963 }
5964 if (callback && addr_count != 0) {
5965 for (a = 0; a < addr_count; ++a) {
5966 callback->hcb_start_addr = cb_start_addr[a];
5967 callback->hcb_end_addr = cb_end_addr[a];
5968 callback->hcb_function(callback);
5969 }
5970 }
5971
5972 /*
5973 * Check TSB and TLB page sizes if the process isn't exiting.
5974 */
5975 if (!sfmmup->sfmmu_free)
5976 sfmmu_check_page_sizes(sfmmup, 0);
5977 }
5978
5979 /*
5980 * Unload all the mappings in the range [addr..addr+len). addr and len must
5981 * be MMU_PAGESIZE aligned.
5982 */
5983 void
hat_unload(struct hat * sfmmup,caddr_t addr,size_t len,uint_t flags)5984 hat_unload(struct hat *sfmmup, caddr_t addr, size_t len, uint_t flags)
5985 {
5986 if (sfmmup->sfmmu_xhat_provider) {
5987 XHAT_UNLOAD(sfmmup, addr, len, flags);
5988 return;
5989 }
5990 hat_unload_callback(sfmmup, addr, len, flags, NULL);
5991 }
5992
5993
5994 /*
5995 * Find the largest mapping size for this page.
5996 */
5997 int
fnd_mapping_sz(page_t * pp)5998 fnd_mapping_sz(page_t *pp)
5999 {
6000 int sz;
6001 int p_index;
6002
6003 p_index = PP_MAPINDEX(pp);
6004
6005 sz = 0;
6006 p_index >>= 1; /* don't care about 8K bit */
6007 for (; p_index; p_index >>= 1) {
6008 sz++;
6009 }
6010
6011 return (sz);
6012 }
6013
6014 /*
6015 * This function unloads a range of addresses for an hmeblk.
6016 * It returns the next address to be unloaded.
6017 * It should be called with the hash lock held.
6018 */
6019 static caddr_t
sfmmu_hblk_unload(struct hat * sfmmup,struct hme_blk * hmeblkp,caddr_t addr,caddr_t endaddr,demap_range_t * dmrp,uint_t flags)6020 sfmmu_hblk_unload(struct hat *sfmmup, struct hme_blk *hmeblkp, caddr_t addr,
6021 caddr_t endaddr, demap_range_t *dmrp, uint_t flags)
6022 {
6023 tte_t tte, ttemod;
6024 struct sf_hment *sfhmep;
6025 int ttesz;
6026 long ttecnt;
6027 page_t *pp;
6028 kmutex_t *pml;
6029 int ret;
6030 int use_demap_range;
6031
6032 ASSERT(in_hblk_range(hmeblkp, addr));
6033 ASSERT(!hmeblkp->hblk_shw_bit);
6034 ASSERT(sfmmup != NULL || hmeblkp->hblk_shared);
6035 ASSERT(sfmmup == NULL || !hmeblkp->hblk_shared);
6036 ASSERT(dmrp == NULL || !hmeblkp->hblk_shared);
6037
6038 #ifdef DEBUG
6039 if (get_hblk_ttesz(hmeblkp) != TTE8K &&
6040 (endaddr < get_hblk_endaddr(hmeblkp))) {
6041 panic("sfmmu_hblk_unload: partial unload of large page");
6042 }
6043 #endif /* DEBUG */
6044
6045 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp));
6046 ttesz = get_hblk_ttesz(hmeblkp);
6047
6048 use_demap_range = ((dmrp == NULL) ||
6049 (TTEBYTES(ttesz) == DEMAP_RANGE_PGSZ(dmrp)));
6050
6051 if (use_demap_range) {
6052 DEMAP_RANGE_CONTINUE(dmrp, addr, endaddr);
6053 } else if (dmrp != NULL) {
6054 DEMAP_RANGE_FLUSH(dmrp);
6055 }
6056 ttecnt = 0;
6057 HBLKTOHME(sfhmep, hmeblkp, addr);
6058
6059 while (addr < endaddr) {
6060 pml = NULL;
6061 sfmmu_copytte(&sfhmep->hme_tte, &tte);
6062 if (TTE_IS_VALID(&tte)) {
6063 pp = sfhmep->hme_page;
6064 if (pp != NULL) {
6065 pml = sfmmu_mlist_enter(pp);
6066 }
6067
6068 /*
6069 * Verify if hme still points to 'pp' now that
6070 * we have p_mapping lock.
6071 */
6072 if (sfhmep->hme_page != pp) {
6073 if (pp != NULL && sfhmep->hme_page != NULL) {
6074 ASSERT(pml != NULL);
6075 sfmmu_mlist_exit(pml);
6076 /* Re-start this iteration. */
6077 continue;
6078 }
6079 ASSERT((pp != NULL) &&
6080 (sfhmep->hme_page == NULL));
6081 goto tte_unloaded;
6082 }
6083
6084 /*
6085 * This point on we have both HASH and p_mapping
6086 * lock.
6087 */
6088 ASSERT(pp == sfhmep->hme_page);
6089 ASSERT(pp == NULL || sfmmu_mlist_held(pp));
6090
6091 /*
6092 * We need to loop on modify tte because it is
6093 * possible for pagesync to come along and
6094 * change the software bits beneath us.
6095 *
6096 * Page_unload can also invalidate the tte after
6097 * we read tte outside of p_mapping lock.
6098 */
6099 again:
6100 ttemod = tte;
6101
6102 TTE_SET_INVALID(&ttemod);
6103 ret = sfmmu_modifytte_try(&tte, &ttemod,
6104 &sfhmep->hme_tte);
6105
6106 if (ret <= 0) {
6107 if (TTE_IS_VALID(&tte)) {
6108 ASSERT(ret < 0);
6109 goto again;
6110 }
6111 if (pp != NULL) {
6112 panic("sfmmu_hblk_unload: pp = 0x%p "
6113 "tte became invalid under mlist"
6114 " lock = 0x%p", (void *)pp,
6115 (void *)pml);
6116 }
6117 continue;
6118 }
6119
6120 if (!(flags & HAT_UNLOAD_NOSYNC)) {
6121 sfmmu_ttesync(sfmmup, addr, &tte, pp);
6122 }
6123
6124 /*
6125 * Ok- we invalidated the tte. Do the rest of the job.
6126 */
6127 ttecnt++;
6128
6129 if (flags & HAT_UNLOAD_UNLOCK) {
6130 ASSERT(hmeblkp->hblk_lckcnt > 0);
6131 atomic_dec_32(&hmeblkp->hblk_lckcnt);
6132 HBLK_STACK_TRACE(hmeblkp, HBLK_UNLOCK);
6133 }
6134
6135 /*
6136 * Normally we would need to flush the page
6137 * from the virtual cache at this point in
6138 * order to prevent a potential cache alias
6139 * inconsistency.
6140 * The particular scenario we need to worry
6141 * about is:
6142 * Given: va1 and va2 are two virtual address
6143 * that alias and map the same physical
6144 * address.
6145 * 1. mapping exists from va1 to pa and data
6146 * has been read into the cache.
6147 * 2. unload va1.
6148 * 3. load va2 and modify data using va2.
6149 * 4 unload va2.
6150 * 5. load va1 and reference data. Unless we
6151 * flush the data cache when we unload we will
6152 * get stale data.
6153 * Fortunately, page coloring eliminates the
6154 * above scenario by remembering the color a
6155 * physical page was last or is currently
6156 * mapped to. Now, we delay the flush until
6157 * the loading of translations. Only when the
6158 * new translation is of a different color
6159 * are we forced to flush.
6160 */
6161 if (use_demap_range) {
6162 /*
6163 * Mark this page as needing a demap.
6164 */
6165 DEMAP_RANGE_MARKPG(dmrp, addr);
6166 } else {
6167 ASSERT(sfmmup != NULL);
6168 ASSERT(!hmeblkp->hblk_shared);
6169 sfmmu_tlb_demap(addr, sfmmup, hmeblkp,
6170 sfmmup->sfmmu_free, 0);
6171 }
6172
6173 if (pp) {
6174 /*
6175 * Remove the hment from the mapping list
6176 */
6177 ASSERT(hmeblkp->hblk_hmecnt > 0);
6178
6179 /*
6180 * Again, we cannot
6181 * ASSERT(hmeblkp->hblk_hmecnt <= NHMENTS);
6182 */
6183 HME_SUB(sfhmep, pp);
6184 membar_stst();
6185 atomic_dec_16(&hmeblkp->hblk_hmecnt);
6186 }
6187
6188 ASSERT(hmeblkp->hblk_vcnt > 0);
6189 atomic_dec_16(&hmeblkp->hblk_vcnt);
6190
6191 ASSERT(hmeblkp->hblk_hmecnt || hmeblkp->hblk_vcnt ||
6192 !hmeblkp->hblk_lckcnt);
6193
6194 #ifdef VAC
6195 if (pp && (pp->p_nrm & (P_KPMC | P_KPMS | P_TNC))) {
6196 if (PP_ISTNC(pp)) {
6197 /*
6198 * If page was temporary
6199 * uncached, try to recache
6200 * it. Note that HME_SUB() was
6201 * called above so p_index and
6202 * mlist had been updated.
6203 */
6204 conv_tnc(pp, ttesz);
6205 } else if (pp->p_mapping == NULL) {
6206 ASSERT(kpm_enable);
6207 /*
6208 * Page is marked to be in VAC conflict
6209 * to an existing kpm mapping and/or is
6210 * kpm mapped using only the regular
6211 * pagesize.
6212 */
6213 sfmmu_kpm_hme_unload(pp);
6214 }
6215 }
6216 #endif /* VAC */
6217 } else if ((pp = sfhmep->hme_page) != NULL) {
6218 /*
6219 * TTE is invalid but the hme
6220 * still exists. let pageunload
6221 * complete its job.
6222 */
6223 ASSERT(pml == NULL);
6224 pml = sfmmu_mlist_enter(pp);
6225 if (sfhmep->hme_page != NULL) {
6226 sfmmu_mlist_exit(pml);
6227 continue;
6228 }
6229 ASSERT(sfhmep->hme_page == NULL);
6230 } else if (hmeblkp->hblk_hmecnt != 0) {
6231 /*
6232 * pageunload may have not finished decrementing
6233 * hblk_vcnt and hblk_hmecnt. Find page_t if any and
6234 * wait for pageunload to finish. Rely on pageunload
6235 * to decrement hblk_hmecnt after hblk_vcnt.
6236 */
6237 pfn_t pfn = TTE_TO_TTEPFN(&tte);
6238 ASSERT(pml == NULL);
6239 if (pf_is_memory(pfn)) {
6240 pp = page_numtopp_nolock(pfn);
6241 if (pp != NULL) {
6242 pml = sfmmu_mlist_enter(pp);
6243 sfmmu_mlist_exit(pml);
6244 pml = NULL;
6245 }
6246 }
6247 }
6248
6249 tte_unloaded:
6250 /*
6251 * At this point, the tte we are looking at
6252 * should be unloaded, and hme has been unlinked
6253 * from page too. This is important because in
6254 * pageunload, it does ttesync() then HME_SUB.
6255 * We need to make sure HME_SUB has been completed
6256 * so we know ttesync() has been completed. Otherwise,
6257 * at exit time, after return from hat layer, VM will
6258 * release as structure which hat_setstat() (called
6259 * by ttesync()) needs.
6260 */
6261 #ifdef DEBUG
6262 {
6263 tte_t dtte;
6264
6265 ASSERT(sfhmep->hme_page == NULL);
6266
6267 sfmmu_copytte(&sfhmep->hme_tte, &dtte);
6268 ASSERT(!TTE_IS_VALID(&dtte));
6269 }
6270 #endif
6271
6272 if (pml) {
6273 sfmmu_mlist_exit(pml);
6274 }
6275
6276 addr += TTEBYTES(ttesz);
6277 sfhmep++;
6278 DEMAP_RANGE_NEXTPG(dmrp);
6279 }
6280 /*
6281 * For shared hmeblks this routine is only called when region is freed
6282 * and no longer referenced. So no need to decrement ttecnt
6283 * in the region structure here.
6284 */
6285 if (ttecnt > 0 && sfmmup != NULL) {
6286 atomic_add_long(&sfmmup->sfmmu_ttecnt[ttesz], -ttecnt);
6287 }
6288 return (addr);
6289 }
6290
6291 /*
6292 * Invalidate a virtual address range for the local CPU.
6293 * For best performance ensure that the va range is completely
6294 * mapped, otherwise the entire TLB will be flushed.
6295 */
6296 void
hat_flush_range(struct hat * sfmmup,caddr_t va,size_t size)6297 hat_flush_range(struct hat *sfmmup, caddr_t va, size_t size)
6298 {
6299 ssize_t sz;
6300 caddr_t endva = va + size;
6301
6302 while (va < endva) {
6303 sz = hat_getpagesize(sfmmup, va);
6304 if (sz < 0) {
6305 vtag_flushall();
6306 break;
6307 }
6308 vtag_flushpage(va, (uint64_t)sfmmup);
6309 va += sz;
6310 }
6311 }
6312
6313 /*
6314 * Synchronize all the mappings in the range [addr..addr+len).
6315 * Can be called with clearflag having two states:
6316 * HAT_SYNC_DONTZERO means just return the rm stats
6317 * HAT_SYNC_ZERORM means zero rm bits in the tte and return the stats
6318 */
6319 void
hat_sync(struct hat * sfmmup,caddr_t addr,size_t len,uint_t clearflag)6320 hat_sync(struct hat *sfmmup, caddr_t addr, size_t len, uint_t clearflag)
6321 {
6322 struct hmehash_bucket *hmebp;
6323 hmeblk_tag hblktag;
6324 int hmeshift, hashno = 1;
6325 struct hme_blk *hmeblkp, *list = NULL;
6326 caddr_t endaddr;
6327 cpuset_t cpuset;
6328
6329 ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
6330 ASSERT((sfmmup == ksfmmup) || AS_LOCK_HELD(sfmmup->sfmmu_as));
6331 ASSERT((len & MMU_PAGEOFFSET) == 0);
6332 ASSERT((clearflag == HAT_SYNC_DONTZERO) ||
6333 (clearflag == HAT_SYNC_ZERORM));
6334
6335 CPUSET_ZERO(cpuset);
6336
6337 endaddr = addr + len;
6338 hblktag.htag_id = sfmmup;
6339 hblktag.htag_rid = SFMMU_INVALID_SHMERID;
6340
6341 /*
6342 * Spitfire supports 4 page sizes.
6343 * Most pages are expected to be of the smallest page
6344 * size (8K) and these will not need to be rehashed. 64K
6345 * pages also don't need to be rehashed because the an hmeblk
6346 * spans 64K of address space. 512K pages might need 1 rehash and
6347 * and 4M pages 2 rehashes.
6348 */
6349 while (addr < endaddr) {
6350 hmeshift = HME_HASH_SHIFT(hashno);
6351 hblktag.htag_bspage = HME_HASH_BSPAGE(addr, hmeshift);
6352 hblktag.htag_rehash = hashno;
6353 hmebp = HME_HASH_FUNCTION(sfmmup, addr, hmeshift);
6354
6355 SFMMU_HASH_LOCK(hmebp);
6356
6357 HME_HASH_SEARCH(hmebp, hblktag, hmeblkp, &list);
6358 if (hmeblkp != NULL) {
6359 ASSERT(!hmeblkp->hblk_shared);
6360 /*
6361 * We've encountered a shadow hmeblk so skip the range
6362 * of the next smaller mapping size.
6363 */
6364 if (hmeblkp->hblk_shw_bit) {
6365 ASSERT(sfmmup != ksfmmup);
6366 ASSERT(hashno > 1);
6367 addr = (caddr_t)P2END((uintptr_t)addr,
6368 TTEBYTES(hashno - 1));
6369 } else {
6370 addr = sfmmu_hblk_sync(sfmmup, hmeblkp,
6371 addr, endaddr, clearflag);
6372 }
6373 SFMMU_HASH_UNLOCK(hmebp);
6374 hashno = 1;
6375 continue;
6376 }
6377 SFMMU_HASH_UNLOCK(hmebp);
6378
6379 if (!HME_REHASH(sfmmup) || (hashno >= mmu_hashcnt)) {
6380 /*
6381 * We have traversed the whole list and rehashed
6382 * if necessary without finding the address to sync.
6383 * This is ok so we increment the address by the
6384 * smallest hmeblk range for kernel mappings and the
6385 * largest hmeblk range, to account for shadow hmeblks,
6386 * for user mappings and continue.
6387 */
6388 if (sfmmup == ksfmmup)
6389 addr = (caddr_t)P2END((uintptr_t)addr,
6390 TTEBYTES(1));
6391 else
6392 addr = (caddr_t)P2END((uintptr_t)addr,
6393 TTEBYTES(hashno));
6394 hashno = 1;
6395 } else {
6396 hashno++;
6397 }
6398 }
6399 sfmmu_hblks_list_purge(&list, 0);
6400 cpuset = sfmmup->sfmmu_cpusran;
6401 xt_sync(cpuset);
6402 }
6403
6404 static caddr_t
sfmmu_hblk_sync(struct hat * sfmmup,struct hme_blk * hmeblkp,caddr_t addr,caddr_t endaddr,int clearflag)6405 sfmmu_hblk_sync(struct hat *sfmmup, struct hme_blk *hmeblkp, caddr_t addr,
6406 caddr_t endaddr, int clearflag)
6407 {
6408 tte_t tte, ttemod;
6409 struct sf_hment *sfhmep;
6410 int ttesz;
6411 struct page *pp;
6412 kmutex_t *pml;
6413 int ret;
6414
6415 ASSERT(hmeblkp->hblk_shw_bit == 0);
6416 ASSERT(!hmeblkp->hblk_shared);
6417
6418 endaddr = MIN(endaddr, get_hblk_endaddr(hmeblkp));
6419
6420 ttesz = get_hblk_ttesz(hmeblkp);
6421 HBLKTOHME(sfhmep, hmeblkp, addr);
6422
6423 while (addr < endaddr) {
6424 sfmmu_copytte(&sfhmep->hme_tte, &tte);
6425 if (TTE_IS_VALID(&tte)) {
6426 pml = NULL;
6427 pp = sfhmep->hme_page;
6428 if (pp) {
6429 pml = sfmmu_mlist_enter(pp);
6430 }
6431 if (pp != sfhmep->hme_page) {
6432 /*
6433 * tte most have been unloaded
6434 * underneath us. Recheck
6435 */
6436 ASSERT(pml);
6437 sfmmu_mlist_exit(pml);
6438 continue;
6439 }
6440
6441 ASSERT(pp == NULL || sfmmu_mlist_held(pp));
6442
6443 if (clearflag == HAT_SYNC_ZERORM) {
6444 ttemod = tte;
6445 TTE_CLR_RM(&ttemod);
6446 ret = sfmmu_modifytte_try(&tte, &ttemod,
6447 &sfhmep->hme_tte);
6448 if (ret < 0) {
6449 if (pml) {
6450 sfmmu_mlist_exit(pml);
6451 }
6452 continue;
6453 }
6454
6455 if (ret > 0) {
6456 sfmmu_tlb_demap(addr, sfmmup,
6457 hmeblkp, 0, 0);
6458 }
6459 }
6460 sfmmu_ttesync(sfmmup, addr, &tte, pp);
6461 if (pml) {
6462 sfmmu_mlist_exit(pml);
6463 }
6464 }
6465 addr += TTEBYTES(ttesz);
6466 sfhmep++;
6467 }
6468 return (addr);
6469 }
6470
6471 /*
6472 * This function will sync a tte to the page struct and it will
6473 * update the hat stats. Currently it allows us to pass a NULL pp
6474 * and we will simply update the stats. We may want to change this
6475 * so we only keep stats for pages backed by pp's.
6476 */
6477 static void
sfmmu_ttesync(struct hat * sfmmup,caddr_t addr,tte_t * ttep,page_t * pp)6478 sfmmu_ttesync(struct hat *sfmmup, caddr_t addr, tte_t *ttep, page_t *pp)
6479 {
6480 uint_t rm = 0;
6481 int sz;
6482 pgcnt_t npgs;
6483
6484 ASSERT(TTE_IS_VALID(ttep));
6485
6486 if (TTE_IS_NOSYNC(ttep)) {
6487 return;
6488 }
6489
6490 if (TTE_IS_REF(ttep)) {
6491 rm = P_REF;
6492 }
6493 if (TTE_IS_MOD(ttep)) {
6494 rm |= P_MOD;
6495 }
6496
6497 if (rm == 0) {
6498 return;
6499 }
6500
6501 sz = TTE_CSZ(ttep);
6502 if (sfmmup != NULL && sfmmup->sfmmu_rmstat) {
6503 int i;
6504 caddr_t vaddr = addr;
6505
6506 for (i = 0; i < TTEPAGES(sz); i++, vaddr += MMU_PAGESIZE) {
6507 hat_setstat(sfmmup->sfmmu_as, vaddr, MMU_PAGESIZE, rm);
6508 }
6509
6510 }
6511
6512 /*
6513 * XXX I want to use cas to update nrm bits but they
6514 * currently belong in common/vm and not in hat where
6515 * they should be.
6516 * The nrm bits are protected by the same mutex as
6517 * the one that protects the page's mapping list.
6518 */
6519 if (!pp)
6520 return;
6521 ASSERT(sfmmu_mlist_held(pp));
6522 /*
6523 * If the tte is for a large page, we need to sync all the
6524 * pages covered by the tte.
6525 */
6526 if (sz != TTE8K) {
6527 ASSERT(pp->p_szc != 0);
6528 pp = PP_GROUPLEADER(pp, sz);
6529 ASSERT(sfmmu_mlist_held(pp));
6530 }
6531
6532 /* Get number of pages from tte size. */
6533 npgs = TTEPAGES(sz);
6534
6535 do {
6536 ASSERT(pp);
6537 ASSERT(sfmmu_mlist_held(pp));
6538 if (((rm & P_REF) != 0 && !PP_ISREF(pp)) ||
6539 ((rm & P_MOD) != 0 && !PP_ISMOD(pp)))
6540 hat_page_setattr(pp, rm);
6541
6542 /*
6543 * Are we done? If not, we must have a large mapping.
6544 * For large mappings we need to sync the rest of the pages
6545 * covered by this tte; goto the next page.
6546 */
6547 } while (--npgs > 0 && (pp = PP_PAGENEXT(pp)));
6548 }
6549
6550 /*
6551 * Execute pre-callback handler of each pa_hment linked to pp
6552 *
6553 * Inputs:
6554 * flag: either HAT_PRESUSPEND or HAT_SUSPEND.
6555 * capture_cpus: pointer to return value (below)
6556 *
6557 * Returns:
6558 * Propagates the subsystem callback return values back to the caller;
6559 * returns 0 on success. If capture_cpus is non-NULL, the value returned
6560 * is zero if all of the pa_hments are of a type that do not require
6561 * capturing CPUs prior to suspending the mapping, else it is 1.
6562 */
6563 static int
hat_pageprocess_precallbacks(struct page * pp,uint_t flag,int * capture_cpus)6564 hat_pageprocess_precallbacks(struct page *pp, uint_t flag, int *capture_cpus)
6565 {
6566 struct sf_hment *sfhmep;
6567 struct pa_hment *pahmep;
6568 int (*f)(caddr_t, uint_t, uint_t, void *);
6569 int ret;
6570 id_t id;
6571 int locked = 0;
6572 kmutex_t *pml;
6573
6574 ASSERT(PAGE_EXCL(pp));
6575 if (!sfmmu_mlist_held(pp)) {
6576 pml = sfmmu_mlist_enter(pp);
6577 locked = 1;
6578 }
6579
6580 if (capture_cpus)
6581 *capture_cpus = 0;
6582
6583 top:
6584 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) {
6585 /*
6586 * skip sf_hments corresponding to VA<->PA mappings;
6587 * for pa_hment's, hme_tte.ll is zero
6588 */
6589 if (!IS_PAHME(sfhmep))
6590 continue;
6591
6592 pahmep = sfhmep->hme_data;
6593 ASSERT(pahmep != NULL);
6594
6595 /*
6596 * skip if pre-handler has been called earlier in this loop
6597 */
6598 if (pahmep->flags & flag)
6599 continue;
6600
6601 id = pahmep->cb_id;
6602 ASSERT(id >= (id_t)0 && id < sfmmu_cb_nextid);
6603 if (capture_cpus && sfmmu_cb_table[id].capture_cpus != 0)
6604 *capture_cpus = 1;
6605 if ((f = sfmmu_cb_table[id].prehandler) == NULL) {
6606 pahmep->flags |= flag;
6607 continue;
6608 }
6609
6610 /*
6611 * Drop the mapping list lock to avoid locking order issues.
6612 */
6613 if (locked)
6614 sfmmu_mlist_exit(pml);
6615
6616 ret = f(pahmep->addr, pahmep->len, flag, pahmep->pvt);
6617 if (ret != 0)
6618 return (ret); /* caller must do the cleanup */
6619
6620 if (locked) {
6621 pml = sfmmu_mlist_enter(pp);
6622 pahmep->flags |= flag;
6623 goto top;
6624 }
6625
6626 pahmep->flags |= flag;
6627 }
6628
6629 if (locked)
6630 sfmmu_mlist_exit(pml);
6631
6632 return (0);
6633 }
6634
6635 /*
6636 * Execute post-callback handler of each pa_hment linked to pp
6637 *
6638 * Same overall assumptions and restrictions apply as for
6639 * hat_pageprocess_precallbacks().
6640 */
6641 static void
hat_pageprocess_postcallbacks(struct page * pp,uint_t flag)6642 hat_pageprocess_postcallbacks(struct page *pp, uint_t flag)
6643 {
6644 pfn_t pgpfn = pp->p_pagenum;
6645 pfn_t pgmask = btop(page_get_pagesize(pp->p_szc)) - 1;
6646 pfn_t newpfn;
6647 struct sf_hment *sfhmep;
6648 struct pa_hment *pahmep;
6649 int (*f)(caddr_t, uint_t, uint_t, void *, pfn_t);
6650 id_t id;
6651 int locked = 0;
6652 kmutex_t *pml;
6653
6654 ASSERT(PAGE_EXCL(pp));
6655 if (!sfmmu_mlist_held(pp)) {
6656 pml = sfmmu_mlist_enter(pp);
6657 locked = 1;
6658 }
6659
6660 top:
6661 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) {
6662 /*
6663 * skip sf_hments corresponding to VA<->PA mappings;
6664 * for pa_hment's, hme_tte.ll is zero
6665 */
6666 if (!IS_PAHME(sfhmep))
6667 continue;
6668
6669 pahmep = sfhmep->hme_data;
6670 ASSERT(pahmep != NULL);
6671
6672 if ((pahmep->flags & flag) == 0)
6673 continue;
6674
6675 pahmep->flags &= ~flag;
6676
6677 id = pahmep->cb_id;
6678 ASSERT(id >= (id_t)0 && id < sfmmu_cb_nextid);
6679 if ((f = sfmmu_cb_table[id].posthandler) == NULL)
6680 continue;
6681
6682 /*
6683 * Convert the base page PFN into the constituent PFN
6684 * which is needed by the callback handler.
6685 */
6686 newpfn = pgpfn | (btop((uintptr_t)pahmep->addr) & pgmask);
6687
6688 /*
6689 * Drop the mapping list lock to avoid locking order issues.
6690 */
6691 if (locked)
6692 sfmmu_mlist_exit(pml);
6693
6694 if (f(pahmep->addr, pahmep->len, flag, pahmep->pvt, newpfn)
6695 != 0)
6696 panic("sfmmu: posthandler failed");
6697
6698 if (locked) {
6699 pml = sfmmu_mlist_enter(pp);
6700 goto top;
6701 }
6702 }
6703
6704 if (locked)
6705 sfmmu_mlist_exit(pml);
6706 }
6707
6708 /*
6709 * Suspend locked kernel mapping
6710 */
6711 void
hat_pagesuspend(struct page * pp)6712 hat_pagesuspend(struct page *pp)
6713 {
6714 struct sf_hment *sfhmep;
6715 sfmmu_t *sfmmup;
6716 tte_t tte, ttemod;
6717 struct hme_blk *hmeblkp;
6718 caddr_t addr;
6719 int index, cons;
6720 cpuset_t cpuset;
6721
6722 ASSERT(PAGE_EXCL(pp));
6723 ASSERT(sfmmu_mlist_held(pp));
6724
6725 mutex_enter(&kpr_suspendlock);
6726
6727 /*
6728 * We're about to suspend a kernel mapping so mark this thread as
6729 * non-traceable by DTrace. This prevents us from running into issues
6730 * with probe context trying to touch a suspended page
6731 * in the relocation codepath itself.
6732 */
6733 curthread->t_flag |= T_DONTDTRACE;
6734
6735 index = PP_MAPINDEX(pp);
6736 cons = TTE8K;
6737
6738 retry:
6739 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) {
6740
6741 if (IS_PAHME(sfhmep))
6742 continue;
6743
6744 if (get_hblk_ttesz(sfmmu_hmetohblk(sfhmep)) != cons)
6745 continue;
6746
6747 /*
6748 * Loop until we successfully set the suspend bit in
6749 * the TTE.
6750 */
6751 again:
6752 sfmmu_copytte(&sfhmep->hme_tte, &tte);
6753 ASSERT(TTE_IS_VALID(&tte));
6754
6755 ttemod = tte;
6756 TTE_SET_SUSPEND(&ttemod);
6757 if (sfmmu_modifytte_try(&tte, &ttemod,
6758 &sfhmep->hme_tte) < 0)
6759 goto again;
6760
6761 /*
6762 * Invalidate TSB entry
6763 */
6764 hmeblkp = sfmmu_hmetohblk(sfhmep);
6765
6766 sfmmup = hblktosfmmu(hmeblkp);
6767 ASSERT(sfmmup == ksfmmup);
6768 ASSERT(!hmeblkp->hblk_shared);
6769
6770 addr = tte_to_vaddr(hmeblkp, tte);
6771
6772 /*
6773 * No need to make sure that the TSB for this sfmmu is
6774 * not being relocated since it is ksfmmup and thus it
6775 * will never be relocated.
6776 */
6777 SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, 0);
6778
6779 /*
6780 * Update xcall stats
6781 */
6782 cpuset = cpu_ready_set;
6783 CPUSET_DEL(cpuset, CPU->cpu_id);
6784
6785 /* LINTED: constant in conditional context */
6786 SFMMU_XCALL_STATS(ksfmmup);
6787
6788 /*
6789 * Flush TLB entry on remote CPU's
6790 */
6791 xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)addr,
6792 (uint64_t)ksfmmup);
6793 xt_sync(cpuset);
6794
6795 /*
6796 * Flush TLB entry on local CPU
6797 */
6798 vtag_flushpage(addr, (uint64_t)ksfmmup);
6799 }
6800
6801 while (index != 0) {
6802 index = index >> 1;
6803 if (index != 0)
6804 cons++;
6805 if (index & 0x1) {
6806 pp = PP_GROUPLEADER(pp, cons);
6807 goto retry;
6808 }
6809 }
6810 }
6811
6812 #ifdef DEBUG
6813
6814 #define N_PRLE 1024
6815 struct prle {
6816 page_t *targ;
6817 page_t *repl;
6818 int status;
6819 int pausecpus;
6820 hrtime_t whence;
6821 };
6822
6823 static struct prle page_relocate_log[N_PRLE];
6824 static int prl_entry;
6825 static kmutex_t prl_mutex;
6826
6827 #define PAGE_RELOCATE_LOG(t, r, s, p) \
6828 mutex_enter(&prl_mutex); \
6829 page_relocate_log[prl_entry].targ = *(t); \
6830 page_relocate_log[prl_entry].repl = *(r); \
6831 page_relocate_log[prl_entry].status = (s); \
6832 page_relocate_log[prl_entry].pausecpus = (p); \
6833 page_relocate_log[prl_entry].whence = gethrtime(); \
6834 prl_entry = (prl_entry == (N_PRLE - 1))? 0 : prl_entry + 1; \
6835 mutex_exit(&prl_mutex);
6836
6837 #else /* !DEBUG */
6838 #define PAGE_RELOCATE_LOG(t, r, s, p)
6839 #endif
6840
6841 /*
6842 * Core Kernel Page Relocation Algorithm
6843 *
6844 * Input:
6845 *
6846 * target : constituent pages are SE_EXCL locked.
6847 * replacement: constituent pages are SE_EXCL locked.
6848 *
6849 * Output:
6850 *
6851 * nrelocp: number of pages relocated
6852 */
6853 int
hat_page_relocate(page_t ** target,page_t ** replacement,spgcnt_t * nrelocp)6854 hat_page_relocate(page_t **target, page_t **replacement, spgcnt_t *nrelocp)
6855 {
6856 page_t *targ, *repl;
6857 page_t *tpp, *rpp;
6858 kmutex_t *low, *high;
6859 spgcnt_t npages, i;
6860 page_t *pl = NULL;
6861 int old_pil;
6862 cpuset_t cpuset;
6863 int cap_cpus;
6864 int ret;
6865 #ifdef VAC
6866 int cflags = 0;
6867 #endif
6868
6869 if (!kcage_on || PP_ISNORELOC(*target)) {
6870 PAGE_RELOCATE_LOG(target, replacement, EAGAIN, -1);
6871 return (EAGAIN);
6872 }
6873
6874 mutex_enter(&kpr_mutex);
6875 kreloc_thread = curthread;
6876
6877 targ = *target;
6878 repl = *replacement;
6879 ASSERT(repl != NULL);
6880 ASSERT(targ->p_szc == repl->p_szc);
6881
6882 npages = page_get_pagecnt(targ->p_szc);
6883
6884 /*
6885 * unload VA<->PA mappings that are not locked
6886 */
6887 tpp = targ;
6888 for (i = 0; i < npages; i++) {
6889 (void) hat_pageunload(tpp, SFMMU_KERNEL_RELOC);
6890 tpp++;
6891 }
6892
6893 /*
6894 * Do "presuspend" callbacks, in a context from which we can still
6895 * block as needed. Note that we don't hold the mapping list lock
6896 * of "targ" at this point due to potential locking order issues;
6897 * we assume that between the hat_pageunload() above and holding
6898 * the SE_EXCL lock that the mapping list *cannot* change at this
6899 * point.
6900 */
6901 ret = hat_pageprocess_precallbacks(targ, HAT_PRESUSPEND, &cap_cpus);
6902 if (ret != 0) {
6903 /*
6904 * EIO translates to fatal error, for all others cleanup
6905 * and return EAGAIN.
6906 */
6907 ASSERT(ret != EIO);
6908 hat_pageprocess_postcallbacks(targ, HAT_POSTUNSUSPEND);
6909 PAGE_RELOCATE_LOG(target, replacement, ret, -1);
6910 kreloc_thread = NULL;
6911 mutex_exit(&kpr_mutex);
6912 return (EAGAIN);
6913 }
6914
6915 /*
6916 * acquire p_mapping list lock for both the target and replacement
6917 * root pages.
6918 *
6919 * low and high refer to the need to grab the mlist locks in a
6920 * specific order in order to prevent race conditions. Thus the
6921 * lower lock must be grabbed before the higher lock.
6922 *
6923 * This will block hat_unload's accessing p_mapping list. Since
6924 * we have SE_EXCL lock, hat_memload and hat_pageunload will be
6925 * blocked. Thus, no one else will be accessing the p_mapping list
6926 * while we suspend and reload the locked mapping below.
6927 */
6928 tpp = targ;
6929 rpp = repl;
6930 sfmmu_mlist_reloc_enter(tpp, rpp, &low, &high);
6931
6932 kpreempt_disable();
6933
6934 /*
6935 * We raise our PIL to 13 so that we don't get captured by
6936 * another CPU or pinned by an interrupt thread. We can't go to
6937 * PIL 14 since the nexus driver(s) may need to interrupt at
6938 * that level in the case of IOMMU pseudo mappings.
6939 */
6940 cpuset = cpu_ready_set;
6941 CPUSET_DEL(cpuset, CPU->cpu_id);
6942 if (!cap_cpus || CPUSET_ISNULL(cpuset)) {
6943 old_pil = splr(XCALL_PIL);
6944 } else {
6945 old_pil = -1;
6946 xc_attention(cpuset);
6947 }
6948 ASSERT(getpil() == XCALL_PIL);
6949
6950 /*
6951 * Now do suspend callbacks. In the case of an IOMMU mapping
6952 * this will suspend all DMA activity to the page while it is
6953 * being relocated. Since we are well above LOCK_LEVEL and CPUs
6954 * may be captured at this point we should have acquired any needed
6955 * locks in the presuspend callback.
6956 */
6957 ret = hat_pageprocess_precallbacks(targ, HAT_SUSPEND, NULL);
6958 if (ret != 0) {
6959 repl = targ;
6960 goto suspend_fail;
6961 }
6962
6963 /*
6964 * Raise the PIL yet again, this time to block all high-level
6965 * interrupts on this CPU. This is necessary to prevent an
6966 * interrupt routine from pinning the thread which holds the
6967 * mapping suspended and then touching the suspended page.
6968 *
6969 * Once the page is suspended we also need to be careful to
6970 * avoid calling any functions which touch any seg_kmem memory
6971 * since that memory may be backed by the very page we are
6972 * relocating in here!
6973 */
6974 hat_pagesuspend(targ);
6975
6976 /*
6977 * Now that we are confident everybody has stopped using this page,
6978 * copy the page contents. Note we use a physical copy to prevent
6979 * locking issues and to avoid fpRAS because we can't handle it in
6980 * this context.
6981 */
6982 for (i = 0; i < npages; i++, tpp++, rpp++) {
6983 #ifdef VAC
6984 /*
6985 * If the replacement has a different vcolor than
6986 * the one being replacd, we need to handle VAC
6987 * consistency for it just as we were setting up
6988 * a new mapping to it.
6989 */
6990 if ((PP_GET_VCOLOR(rpp) != NO_VCOLOR) &&
6991 (tpp->p_vcolor != rpp->p_vcolor) &&
6992 !CacheColor_IsFlushed(cflags, PP_GET_VCOLOR(rpp))) {
6993 CacheColor_SetFlushed(cflags, PP_GET_VCOLOR(rpp));
6994 sfmmu_cache_flushcolor(PP_GET_VCOLOR(rpp),
6995 rpp->p_pagenum);
6996 }
6997 #endif
6998 /*
6999 * Copy the contents of the page.
7000 */
7001 ppcopy_kernel(tpp, rpp);
7002 }
7003
7004 tpp = targ;
7005 rpp = repl;
7006 for (i = 0; i < npages; i++, tpp++, rpp++) {
7007 /*
7008 * Copy attributes. VAC consistency was handled above,
7009 * if required.
7010 */
7011 rpp->p_nrm = tpp->p_nrm;
7012 tpp->p_nrm = 0;
7013 rpp->p_index = tpp->p_index;
7014 tpp->p_index = 0;
7015 #ifdef VAC
7016 rpp->p_vcolor = tpp->p_vcolor;
7017 #endif
7018 }
7019
7020 /*
7021 * First, unsuspend the page, if we set the suspend bit, and transfer
7022 * the mapping list from the target page to the replacement page.
7023 * Next process postcallbacks; since pa_hment's are linked only to the
7024 * p_mapping list of root page, we don't iterate over the constituent
7025 * pages.
7026 */
7027 hat_pagereload(targ, repl);
7028
7029 suspend_fail:
7030 hat_pageprocess_postcallbacks(repl, HAT_UNSUSPEND);
7031
7032 /*
7033 * Now lower our PIL and release any captured CPUs since we
7034 * are out of the "danger zone". After this it will again be
7035 * safe to acquire adaptive mutex locks, or to drop them...
7036 */
7037 if (old_pil != -1) {
7038 splx(old_pil);
7039 } else {
7040 xc_dismissed(cpuset);
7041 }
7042
7043 kpreempt_enable();
7044
7045 sfmmu_mlist_reloc_exit(low, high);
7046
7047 /*
7048 * Postsuspend callbacks should drop any locks held across
7049 * the suspend callbacks. As before, we don't hold the mapping
7050 * list lock at this point.. our assumption is that the mapping
7051 * list still can't change due to our holding SE_EXCL lock and
7052 * there being no unlocked mappings left. Hence the restriction
7053 * on calling context to hat_delete_callback()
7054 */
7055 hat_pageprocess_postcallbacks(repl, HAT_POSTUNSUSPEND);
7056 if (ret != 0) {
7057 /*
7058 * The second presuspend call failed: we got here through
7059 * the suspend_fail label above.
7060 */
7061 ASSERT(ret != EIO);
7062 PAGE_RELOCATE_LOG(target, replacement, ret, cap_cpus);
7063 kreloc_thread = NULL;
7064 mutex_exit(&kpr_mutex);
7065 return (EAGAIN);
7066 }
7067
7068 /*
7069 * Now that we're out of the performance critical section we can
7070 * take care of updating the hash table, since we still
7071 * hold all the pages locked SE_EXCL at this point we
7072 * needn't worry about things changing out from under us.
7073 */
7074 tpp = targ;
7075 rpp = repl;
7076 for (i = 0; i < npages; i++, tpp++, rpp++) {
7077
7078 /*
7079 * replace targ with replacement in page_hash table
7080 */
7081 targ = tpp;
7082 page_relocate_hash(rpp, targ);
7083
7084 /*
7085 * concatenate target; caller of platform_page_relocate()
7086 * expects target to be concatenated after returning.
7087 */
7088 ASSERT(targ->p_next == targ);
7089 ASSERT(targ->p_prev == targ);
7090 page_list_concat(&pl, &targ);
7091 }
7092
7093 ASSERT(*target == pl);
7094 *nrelocp = npages;
7095 PAGE_RELOCATE_LOG(target, replacement, 0, cap_cpus);
7096 kreloc_thread = NULL;
7097 mutex_exit(&kpr_mutex);
7098 return (0);
7099 }
7100
7101 /*
7102 * Called when stray pa_hments are found attached to a page which is
7103 * being freed. Notify the subsystem which attached the pa_hment of
7104 * the error if it registered a suitable handler, else panic.
7105 */
7106 static void
sfmmu_pahment_leaked(struct pa_hment * pahmep)7107 sfmmu_pahment_leaked(struct pa_hment *pahmep)
7108 {
7109 id_t cb_id = pahmep->cb_id;
7110
7111 ASSERT(cb_id >= (id_t)0 && cb_id < sfmmu_cb_nextid);
7112 if (sfmmu_cb_table[cb_id].errhandler != NULL) {
7113 if (sfmmu_cb_table[cb_id].errhandler(pahmep->addr, pahmep->len,
7114 HAT_CB_ERR_LEAKED, pahmep->pvt) == 0)
7115 return; /* non-fatal */
7116 }
7117 panic("pa_hment leaked: 0x%p", (void *)pahmep);
7118 }
7119
7120 /*
7121 * Remove all mappings to page 'pp'.
7122 */
7123 int
hat_pageunload(struct page * pp,uint_t forceflag)7124 hat_pageunload(struct page *pp, uint_t forceflag)
7125 {
7126 struct page *origpp = pp;
7127 struct sf_hment *sfhme, *tmphme;
7128 struct hme_blk *hmeblkp;
7129 kmutex_t *pml;
7130 #ifdef VAC
7131 kmutex_t *pmtx;
7132 #endif
7133 cpuset_t cpuset, tset;
7134 int index, cons;
7135 int xhme_blks;
7136 int pa_hments;
7137
7138 ASSERT(PAGE_EXCL(pp));
7139
7140 retry_xhat:
7141 tmphme = NULL;
7142 xhme_blks = 0;
7143 pa_hments = 0;
7144 CPUSET_ZERO(cpuset);
7145
7146 pml = sfmmu_mlist_enter(pp);
7147
7148 #ifdef VAC
7149 if (pp->p_kpmref)
7150 sfmmu_kpm_pageunload(pp);
7151 ASSERT(!PP_ISMAPPED_KPM(pp));
7152 #endif
7153 /*
7154 * Clear vpm reference. Since the page is exclusively locked
7155 * vpm cannot be referencing it.
7156 */
7157 if (vpm_enable) {
7158 pp->p_vpmref = 0;
7159 }
7160
7161 index = PP_MAPINDEX(pp);
7162 cons = TTE8K;
7163 retry:
7164 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) {
7165 tmphme = sfhme->hme_next;
7166
7167 if (IS_PAHME(sfhme)) {
7168 ASSERT(sfhme->hme_data != NULL);
7169 pa_hments++;
7170 continue;
7171 }
7172
7173 hmeblkp = sfmmu_hmetohblk(sfhme);
7174 if (hmeblkp->hblk_xhat_bit) {
7175 struct xhat_hme_blk *xblk =
7176 (struct xhat_hme_blk *)hmeblkp;
7177
7178 (void) XHAT_PAGEUNLOAD(xblk->xhat_hme_blk_hat,
7179 pp, forceflag, XBLK2PROVBLK(xblk));
7180
7181 xhme_blks = 1;
7182 continue;
7183 }
7184
7185 /*
7186 * If there are kernel mappings don't unload them, they will
7187 * be suspended.
7188 */
7189 if (forceflag == SFMMU_KERNEL_RELOC && hmeblkp->hblk_lckcnt &&
7190 hmeblkp->hblk_tag.htag_id == ksfmmup)
7191 continue;
7192
7193 tset = sfmmu_pageunload(pp, sfhme, cons);
7194 CPUSET_OR(cpuset, tset);
7195 }
7196
7197 while (index != 0) {
7198 index = index >> 1;
7199 if (index != 0)
7200 cons++;
7201 if (index & 0x1) {
7202 /* Go to leading page */
7203 pp = PP_GROUPLEADER(pp, cons);
7204 ASSERT(sfmmu_mlist_held(pp));
7205 goto retry;
7206 }
7207 }
7208
7209 /*
7210 * cpuset may be empty if the page was only mapped by segkpm,
7211 * in which case we won't actually cross-trap.
7212 */
7213 xt_sync(cpuset);
7214
7215 /*
7216 * The page should have no mappings at this point, unless
7217 * we were called from hat_page_relocate() in which case we
7218 * leave the locked mappings which will be suspended later.
7219 */
7220 ASSERT(!PP_ISMAPPED(origpp) || xhme_blks || pa_hments ||
7221 (forceflag == SFMMU_KERNEL_RELOC));
7222
7223 #ifdef VAC
7224 if (PP_ISTNC(pp)) {
7225 if (cons == TTE8K) {
7226 pmtx = sfmmu_page_enter(pp);
7227 PP_CLRTNC(pp);
7228 sfmmu_page_exit(pmtx);
7229 } else {
7230 conv_tnc(pp, cons);
7231 }
7232 }
7233 #endif /* VAC */
7234
7235 if (pa_hments && forceflag != SFMMU_KERNEL_RELOC) {
7236 /*
7237 * Unlink any pa_hments and free them, calling back
7238 * the responsible subsystem to notify it of the error.
7239 * This can occur in situations such as drivers leaking
7240 * DMA handles: naughty, but common enough that we'd like
7241 * to keep the system running rather than bringing it
7242 * down with an obscure error like "pa_hment leaked"
7243 * which doesn't aid the user in debugging their driver.
7244 */
7245 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) {
7246 tmphme = sfhme->hme_next;
7247 if (IS_PAHME(sfhme)) {
7248 struct pa_hment *pahmep = sfhme->hme_data;
7249 sfmmu_pahment_leaked(pahmep);
7250 HME_SUB(sfhme, pp);
7251 kmem_cache_free(pa_hment_cache, pahmep);
7252 }
7253 }
7254
7255 ASSERT(!PP_ISMAPPED(origpp) || xhme_blks);
7256 }
7257
7258 sfmmu_mlist_exit(pml);
7259
7260 /*
7261 * XHAT may not have finished unloading pages
7262 * because some other thread was waiting for
7263 * mlist lock and XHAT_PAGEUNLOAD let it do
7264 * the job.
7265 */
7266 if (xhme_blks) {
7267 pp = origpp;
7268 goto retry_xhat;
7269 }
7270
7271 return (0);
7272 }
7273
7274 cpuset_t
sfmmu_pageunload(page_t * pp,struct sf_hment * sfhme,int cons)7275 sfmmu_pageunload(page_t *pp, struct sf_hment *sfhme, int cons)
7276 {
7277 struct hme_blk *hmeblkp;
7278 sfmmu_t *sfmmup;
7279 tte_t tte, ttemod;
7280 #ifdef DEBUG
7281 tte_t orig_old;
7282 #endif /* DEBUG */
7283 caddr_t addr;
7284 int ttesz;
7285 int ret;
7286 cpuset_t cpuset;
7287
7288 ASSERT(pp != NULL);
7289 ASSERT(sfmmu_mlist_held(pp));
7290 ASSERT(!PP_ISKAS(pp));
7291
7292 CPUSET_ZERO(cpuset);
7293
7294 hmeblkp = sfmmu_hmetohblk(sfhme);
7295
7296 readtte:
7297 sfmmu_copytte(&sfhme->hme_tte, &tte);
7298 if (TTE_IS_VALID(&tte)) {
7299 sfmmup = hblktosfmmu(hmeblkp);
7300 ttesz = get_hblk_ttesz(hmeblkp);
7301 /*
7302 * Only unload mappings of 'cons' size.
7303 */
7304 if (ttesz != cons)
7305 return (cpuset);
7306
7307 /*
7308 * Note that we have p_mapping lock, but no hash lock here.
7309 * hblk_unload() has to have both hash lock AND p_mapping
7310 * lock before it tries to modify tte. So, the tte could
7311 * not become invalid in the sfmmu_modifytte_try() below.
7312 */
7313 ttemod = tte;
7314 #ifdef DEBUG
7315 orig_old = tte;
7316 #endif /* DEBUG */
7317
7318 TTE_SET_INVALID(&ttemod);
7319 ret = sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte);
7320 if (ret < 0) {
7321 #ifdef DEBUG
7322 /* only R/M bits can change. */
7323 chk_tte(&orig_old, &tte, &ttemod, hmeblkp);
7324 #endif /* DEBUG */
7325 goto readtte;
7326 }
7327
7328 if (ret == 0) {
7329 panic("pageunload: cas failed?");
7330 }
7331
7332 addr = tte_to_vaddr(hmeblkp, tte);
7333
7334 if (hmeblkp->hblk_shared) {
7335 sf_srd_t *srdp = (sf_srd_t *)sfmmup;
7336 uint_t rid = hmeblkp->hblk_tag.htag_rid;
7337 sf_region_t *rgnp;
7338 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
7339 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
7340 ASSERT(srdp != NULL);
7341 rgnp = srdp->srd_hmergnp[rid];
7342 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid);
7343 cpuset = sfmmu_rgntlb_demap(addr, rgnp, hmeblkp, 1);
7344 sfmmu_ttesync(NULL, addr, &tte, pp);
7345 ASSERT(rgnp->rgn_ttecnt[ttesz] > 0);
7346 atomic_dec_ulong(&rgnp->rgn_ttecnt[ttesz]);
7347 } else {
7348 sfmmu_ttesync(sfmmup, addr, &tte, pp);
7349 atomic_dec_ulong(&sfmmup->sfmmu_ttecnt[ttesz]);
7350
7351 /*
7352 * We need to flush the page from the virtual cache
7353 * in order to prevent a virtual cache alias
7354 * inconsistency. The particular scenario we need
7355 * to worry about is:
7356 * Given: va1 and va2 are two virtual address that
7357 * alias and will map the same physical address.
7358 * 1. mapping exists from va1 to pa and data has
7359 * been read into the cache.
7360 * 2. unload va1.
7361 * 3. load va2 and modify data using va2.
7362 * 4 unload va2.
7363 * 5. load va1 and reference data. Unless we flush
7364 * the data cache when we unload we will get
7365 * stale data.
7366 * This scenario is taken care of by using virtual
7367 * page coloring.
7368 */
7369 if (sfmmup->sfmmu_ismhat) {
7370 /*
7371 * Flush TSBs, TLBs and caches
7372 * of every process
7373 * sharing this ism segment.
7374 */
7375 sfmmu_hat_lock_all();
7376 mutex_enter(&ism_mlist_lock);
7377 kpreempt_disable();
7378 sfmmu_ismtlbcache_demap(addr, sfmmup, hmeblkp,
7379 pp->p_pagenum, CACHE_NO_FLUSH);
7380 kpreempt_enable();
7381 mutex_exit(&ism_mlist_lock);
7382 sfmmu_hat_unlock_all();
7383 cpuset = cpu_ready_set;
7384 } else {
7385 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0);
7386 cpuset = sfmmup->sfmmu_cpusran;
7387 }
7388 }
7389
7390 /*
7391 * Hme_sub has to run after ttesync() and a_rss update.
7392 * See hblk_unload().
7393 */
7394 HME_SUB(sfhme, pp);
7395 membar_stst();
7396
7397 /*
7398 * We can not make ASSERT(hmeblkp->hblk_hmecnt <= NHMENTS)
7399 * since pteload may have done a HME_ADD() right after
7400 * we did the HME_SUB() above. Hmecnt is now maintained
7401 * by cas only. no lock guranteed its value. The only
7402 * gurantee we have is the hmecnt should not be less than
7403 * what it should be so the hblk will not be taken away.
7404 * It's also important that we decremented the hmecnt after
7405 * we are done with hmeblkp so that this hmeblk won't be
7406 * stolen.
7407 */
7408 ASSERT(hmeblkp->hblk_hmecnt > 0);
7409 ASSERT(hmeblkp->hblk_vcnt > 0);
7410 atomic_dec_16(&hmeblkp->hblk_vcnt);
7411 atomic_dec_16(&hmeblkp->hblk_hmecnt);
7412 /*
7413 * This is bug 4063182.
7414 * XXX: fixme
7415 * ASSERT(hmeblkp->hblk_hmecnt || hmeblkp->hblk_vcnt ||
7416 * !hmeblkp->hblk_lckcnt);
7417 */
7418 } else {
7419 panic("invalid tte? pp %p &tte %p",
7420 (void *)pp, (void *)&tte);
7421 }
7422
7423 return (cpuset);
7424 }
7425
7426 /*
7427 * While relocating a kernel page, this function will move the mappings
7428 * from tpp to dpp and modify any associated data with these mappings.
7429 * It also unsuspends the suspended kernel mapping.
7430 */
7431 static void
hat_pagereload(struct page * tpp,struct page * dpp)7432 hat_pagereload(struct page *tpp, struct page *dpp)
7433 {
7434 struct sf_hment *sfhme;
7435 tte_t tte, ttemod;
7436 int index, cons;
7437
7438 ASSERT(getpil() == PIL_MAX);
7439 ASSERT(sfmmu_mlist_held(tpp));
7440 ASSERT(sfmmu_mlist_held(dpp));
7441
7442 index = PP_MAPINDEX(tpp);
7443 cons = TTE8K;
7444
7445 /* Update real mappings to the page */
7446 retry:
7447 for (sfhme = tpp->p_mapping; sfhme != NULL; sfhme = sfhme->hme_next) {
7448 if (IS_PAHME(sfhme))
7449 continue;
7450 sfmmu_copytte(&sfhme->hme_tte, &tte);
7451 ttemod = tte;
7452
7453 /*
7454 * replace old pfn with new pfn in TTE
7455 */
7456 PFN_TO_TTE(ttemod, dpp->p_pagenum);
7457
7458 /*
7459 * clear suspend bit
7460 */
7461 ASSERT(TTE_IS_SUSPEND(&ttemod));
7462 TTE_CLR_SUSPEND(&ttemod);
7463
7464 if (sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte) < 0)
7465 panic("hat_pagereload(): sfmmu_modifytte_try() failed");
7466
7467 /*
7468 * set hme_page point to new page
7469 */
7470 sfhme->hme_page = dpp;
7471 }
7472
7473 /*
7474 * move p_mapping list from old page to new page
7475 */
7476 dpp->p_mapping = tpp->p_mapping;
7477 tpp->p_mapping = NULL;
7478 dpp->p_share = tpp->p_share;
7479 tpp->p_share = 0;
7480
7481 while (index != 0) {
7482 index = index >> 1;
7483 if (index != 0)
7484 cons++;
7485 if (index & 0x1) {
7486 tpp = PP_GROUPLEADER(tpp, cons);
7487 dpp = PP_GROUPLEADER(dpp, cons);
7488 goto retry;
7489 }
7490 }
7491
7492 curthread->t_flag &= ~T_DONTDTRACE;
7493 mutex_exit(&kpr_suspendlock);
7494 }
7495
7496 uint_t
hat_pagesync(struct page * pp,uint_t clearflag)7497 hat_pagesync(struct page *pp, uint_t clearflag)
7498 {
7499 struct sf_hment *sfhme, *tmphme = NULL;
7500 struct hme_blk *hmeblkp;
7501 kmutex_t *pml;
7502 cpuset_t cpuset, tset;
7503 int index, cons;
7504 extern ulong_t po_share;
7505 page_t *save_pp = pp;
7506 int stop_on_sh = 0;
7507 uint_t shcnt;
7508
7509 CPUSET_ZERO(cpuset);
7510
7511 if (PP_ISRO(pp) && (clearflag & HAT_SYNC_STOPON_MOD)) {
7512 return (PP_GENERIC_ATTR(pp));
7513 }
7514
7515 if ((clearflag & HAT_SYNC_ZERORM) == 0) {
7516 if ((clearflag & HAT_SYNC_STOPON_REF) && PP_ISREF(pp)) {
7517 return (PP_GENERIC_ATTR(pp));
7518 }
7519 if ((clearflag & HAT_SYNC_STOPON_MOD) && PP_ISMOD(pp)) {
7520 return (PP_GENERIC_ATTR(pp));
7521 }
7522 if (clearflag & HAT_SYNC_STOPON_SHARED) {
7523 if (pp->p_share > po_share) {
7524 hat_page_setattr(pp, P_REF);
7525 return (PP_GENERIC_ATTR(pp));
7526 }
7527 stop_on_sh = 1;
7528 shcnt = 0;
7529 }
7530 }
7531
7532 clearflag &= ~HAT_SYNC_STOPON_SHARED;
7533 pml = sfmmu_mlist_enter(pp);
7534 index = PP_MAPINDEX(pp);
7535 cons = TTE8K;
7536 retry:
7537 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) {
7538 /*
7539 * We need to save the next hment on the list since
7540 * it is possible for pagesync to remove an invalid hment
7541 * from the list.
7542 */
7543 tmphme = sfhme->hme_next;
7544 if (IS_PAHME(sfhme))
7545 continue;
7546 /*
7547 * If we are looking for large mappings and this hme doesn't
7548 * reach the range we are seeking, just ignore it.
7549 */
7550 hmeblkp = sfmmu_hmetohblk(sfhme);
7551 if (hmeblkp->hblk_xhat_bit)
7552 continue;
7553
7554 if (hme_size(sfhme) < cons)
7555 continue;
7556
7557 if (stop_on_sh) {
7558 if (hmeblkp->hblk_shared) {
7559 sf_srd_t *srdp = hblktosrd(hmeblkp);
7560 uint_t rid = hmeblkp->hblk_tag.htag_rid;
7561 sf_region_t *rgnp;
7562 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
7563 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
7564 ASSERT(srdp != NULL);
7565 rgnp = srdp->srd_hmergnp[rid];
7566 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp,
7567 rgnp, rid);
7568 shcnt += rgnp->rgn_refcnt;
7569 } else {
7570 shcnt++;
7571 }
7572 if (shcnt > po_share) {
7573 /*
7574 * tell the pager to spare the page this time
7575 * around.
7576 */
7577 hat_page_setattr(save_pp, P_REF);
7578 index = 0;
7579 break;
7580 }
7581 }
7582 tset = sfmmu_pagesync(pp, sfhme,
7583 clearflag & ~HAT_SYNC_STOPON_RM);
7584 CPUSET_OR(cpuset, tset);
7585
7586 /*
7587 * If clearflag is HAT_SYNC_DONTZERO, break out as soon
7588 * as the "ref" or "mod" is set or share cnt exceeds po_share.
7589 */
7590 if ((clearflag & ~HAT_SYNC_STOPON_RM) == HAT_SYNC_DONTZERO &&
7591 (((clearflag & HAT_SYNC_STOPON_MOD) && PP_ISMOD(save_pp)) ||
7592 ((clearflag & HAT_SYNC_STOPON_REF) && PP_ISREF(save_pp)))) {
7593 index = 0;
7594 break;
7595 }
7596 }
7597
7598 while (index) {
7599 index = index >> 1;
7600 cons++;
7601 if (index & 0x1) {
7602 /* Go to leading page */
7603 pp = PP_GROUPLEADER(pp, cons);
7604 goto retry;
7605 }
7606 }
7607
7608 xt_sync(cpuset);
7609 sfmmu_mlist_exit(pml);
7610 return (PP_GENERIC_ATTR(save_pp));
7611 }
7612
7613 /*
7614 * Get all the hardware dependent attributes for a page struct
7615 */
7616 static cpuset_t
sfmmu_pagesync(struct page * pp,struct sf_hment * sfhme,uint_t clearflag)7617 sfmmu_pagesync(struct page *pp, struct sf_hment *sfhme,
7618 uint_t clearflag)
7619 {
7620 caddr_t addr;
7621 tte_t tte, ttemod;
7622 struct hme_blk *hmeblkp;
7623 int ret;
7624 sfmmu_t *sfmmup;
7625 cpuset_t cpuset;
7626
7627 ASSERT(pp != NULL);
7628 ASSERT(sfmmu_mlist_held(pp));
7629 ASSERT((clearflag == HAT_SYNC_DONTZERO) ||
7630 (clearflag == HAT_SYNC_ZERORM));
7631
7632 SFMMU_STAT(sf_pagesync);
7633
7634 CPUSET_ZERO(cpuset);
7635
7636 sfmmu_pagesync_retry:
7637
7638 sfmmu_copytte(&sfhme->hme_tte, &tte);
7639 if (TTE_IS_VALID(&tte)) {
7640 hmeblkp = sfmmu_hmetohblk(sfhme);
7641 sfmmup = hblktosfmmu(hmeblkp);
7642 addr = tte_to_vaddr(hmeblkp, tte);
7643 if (clearflag == HAT_SYNC_ZERORM) {
7644 ttemod = tte;
7645 TTE_CLR_RM(&ttemod);
7646 ret = sfmmu_modifytte_try(&tte, &ttemod,
7647 &sfhme->hme_tte);
7648 if (ret < 0) {
7649 /*
7650 * cas failed and the new value is not what
7651 * we want.
7652 */
7653 goto sfmmu_pagesync_retry;
7654 }
7655
7656 if (ret > 0) {
7657 /* we win the cas */
7658 if (hmeblkp->hblk_shared) {
7659 sf_srd_t *srdp = (sf_srd_t *)sfmmup;
7660 uint_t rid =
7661 hmeblkp->hblk_tag.htag_rid;
7662 sf_region_t *rgnp;
7663 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
7664 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
7665 ASSERT(srdp != NULL);
7666 rgnp = srdp->srd_hmergnp[rid];
7667 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp,
7668 srdp, rgnp, rid);
7669 cpuset = sfmmu_rgntlb_demap(addr,
7670 rgnp, hmeblkp, 1);
7671 } else {
7672 sfmmu_tlb_demap(addr, sfmmup, hmeblkp,
7673 0, 0);
7674 cpuset = sfmmup->sfmmu_cpusran;
7675 }
7676 }
7677 }
7678 sfmmu_ttesync(hmeblkp->hblk_shared ? NULL : sfmmup, addr,
7679 &tte, pp);
7680 }
7681 return (cpuset);
7682 }
7683
7684 /*
7685 * Remove write permission from a mappings to a page, so that
7686 * we can detect the next modification of it. This requires modifying
7687 * the TTE then invalidating (demap) any TLB entry using that TTE.
7688 * This code is similar to sfmmu_pagesync().
7689 */
7690 static cpuset_t
sfmmu_pageclrwrt(struct page * pp,struct sf_hment * sfhme)7691 sfmmu_pageclrwrt(struct page *pp, struct sf_hment *sfhme)
7692 {
7693 caddr_t addr;
7694 tte_t tte;
7695 tte_t ttemod;
7696 struct hme_blk *hmeblkp;
7697 int ret;
7698 sfmmu_t *sfmmup;
7699 cpuset_t cpuset;
7700
7701 ASSERT(pp != NULL);
7702 ASSERT(sfmmu_mlist_held(pp));
7703
7704 CPUSET_ZERO(cpuset);
7705 SFMMU_STAT(sf_clrwrt);
7706
7707 retry:
7708
7709 sfmmu_copytte(&sfhme->hme_tte, &tte);
7710 if (TTE_IS_VALID(&tte) && TTE_IS_WRITABLE(&tte)) {
7711 hmeblkp = sfmmu_hmetohblk(sfhme);
7712
7713 /*
7714 * xhat mappings should never be to a VMODSORT page.
7715 */
7716 ASSERT(hmeblkp->hblk_xhat_bit == 0);
7717
7718 sfmmup = hblktosfmmu(hmeblkp);
7719 addr = tte_to_vaddr(hmeblkp, tte);
7720
7721 ttemod = tte;
7722 TTE_CLR_WRT(&ttemod);
7723 TTE_CLR_MOD(&ttemod);
7724 ret = sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte);
7725
7726 /*
7727 * if cas failed and the new value is not what
7728 * we want retry
7729 */
7730 if (ret < 0)
7731 goto retry;
7732
7733 /* we win the cas */
7734 if (ret > 0) {
7735 if (hmeblkp->hblk_shared) {
7736 sf_srd_t *srdp = (sf_srd_t *)sfmmup;
7737 uint_t rid = hmeblkp->hblk_tag.htag_rid;
7738 sf_region_t *rgnp;
7739 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
7740 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
7741 ASSERT(srdp != NULL);
7742 rgnp = srdp->srd_hmergnp[rid];
7743 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp,
7744 srdp, rgnp, rid);
7745 cpuset = sfmmu_rgntlb_demap(addr,
7746 rgnp, hmeblkp, 1);
7747 } else {
7748 sfmmu_tlb_demap(addr, sfmmup, hmeblkp, 0, 0);
7749 cpuset = sfmmup->sfmmu_cpusran;
7750 }
7751 }
7752 }
7753
7754 return (cpuset);
7755 }
7756
7757 /*
7758 * Walk all mappings of a page, removing write permission and clearing the
7759 * ref/mod bits. This code is similar to hat_pagesync()
7760 */
7761 static void
hat_page_clrwrt(page_t * pp)7762 hat_page_clrwrt(page_t *pp)
7763 {
7764 struct sf_hment *sfhme;
7765 struct sf_hment *tmphme = NULL;
7766 kmutex_t *pml;
7767 cpuset_t cpuset;
7768 cpuset_t tset;
7769 int index;
7770 int cons;
7771
7772 CPUSET_ZERO(cpuset);
7773
7774 pml = sfmmu_mlist_enter(pp);
7775 index = PP_MAPINDEX(pp);
7776 cons = TTE8K;
7777 retry:
7778 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) {
7779 tmphme = sfhme->hme_next;
7780
7781 /*
7782 * If we are looking for large mappings and this hme doesn't
7783 * reach the range we are seeking, just ignore its.
7784 */
7785
7786 if (hme_size(sfhme) < cons)
7787 continue;
7788
7789 tset = sfmmu_pageclrwrt(pp, sfhme);
7790 CPUSET_OR(cpuset, tset);
7791 }
7792
7793 while (index) {
7794 index = index >> 1;
7795 cons++;
7796 if (index & 0x1) {
7797 /* Go to leading page */
7798 pp = PP_GROUPLEADER(pp, cons);
7799 goto retry;
7800 }
7801 }
7802
7803 xt_sync(cpuset);
7804 sfmmu_mlist_exit(pml);
7805 }
7806
7807 /*
7808 * Set the given REF/MOD/RO bits for the given page.
7809 * For a vnode with a sorted v_pages list, we need to change
7810 * the attributes and the v_pages list together under page_vnode_mutex.
7811 */
7812 void
hat_page_setattr(page_t * pp,uint_t flag)7813 hat_page_setattr(page_t *pp, uint_t flag)
7814 {
7815 vnode_t *vp = pp->p_vnode;
7816 page_t **listp;
7817 kmutex_t *pmtx;
7818 kmutex_t *vphm = NULL;
7819 int noshuffle;
7820
7821 noshuffle = flag & P_NSH;
7822 flag &= ~P_NSH;
7823
7824 ASSERT(!(flag & ~(P_MOD | P_REF | P_RO)));
7825
7826 /*
7827 * nothing to do if attribute already set
7828 */
7829 if ((pp->p_nrm & flag) == flag)
7830 return;
7831
7832 if ((flag & P_MOD) != 0 && vp != NULL && IS_VMODSORT(vp) &&
7833 !noshuffle) {
7834 vphm = page_vnode_mutex(vp);
7835 mutex_enter(vphm);
7836 }
7837
7838 pmtx = sfmmu_page_enter(pp);
7839 pp->p_nrm |= flag;
7840 sfmmu_page_exit(pmtx);
7841
7842 if (vphm != NULL) {
7843 /*
7844 * Some File Systems examine v_pages for NULL w/o
7845 * grabbing the vphm mutex. Must not let it become NULL when
7846 * pp is the only page on the list.
7847 */
7848 if (pp->p_vpnext != pp) {
7849 page_vpsub(&vp->v_pages, pp);
7850 if (vp->v_pages != NULL)
7851 listp = &vp->v_pages->p_vpprev->p_vpnext;
7852 else
7853 listp = &vp->v_pages;
7854 page_vpadd(listp, pp);
7855 }
7856 mutex_exit(vphm);
7857 }
7858 }
7859
7860 void
hat_page_clrattr(page_t * pp,uint_t flag)7861 hat_page_clrattr(page_t *pp, uint_t flag)
7862 {
7863 vnode_t *vp = pp->p_vnode;
7864 kmutex_t *pmtx;
7865
7866 ASSERT(!(flag & ~(P_MOD | P_REF | P_RO)));
7867
7868 pmtx = sfmmu_page_enter(pp);
7869
7870 /*
7871 * Caller is expected to hold page's io lock for VMODSORT to work
7872 * correctly with pvn_vplist_dirty() and pvn_getdirty() when mod
7873 * bit is cleared.
7874 * We don't have assert to avoid tripping some existing third party
7875 * code. The dirty page is moved back to top of the v_page list
7876 * after IO is done in pvn_write_done().
7877 */
7878 pp->p_nrm &= ~flag;
7879 sfmmu_page_exit(pmtx);
7880
7881 if ((flag & P_MOD) != 0 && vp != NULL && IS_VMODSORT(vp)) {
7882
7883 /*
7884 * VMODSORT works by removing write permissions and getting
7885 * a fault when a page is made dirty. At this point
7886 * we need to remove write permission from all mappings
7887 * to this page.
7888 */
7889 hat_page_clrwrt(pp);
7890 }
7891 }
7892
7893 uint_t
hat_page_getattr(page_t * pp,uint_t flag)7894 hat_page_getattr(page_t *pp, uint_t flag)
7895 {
7896 ASSERT(!(flag & ~(P_MOD | P_REF | P_RO)));
7897 return ((uint_t)(pp->p_nrm & flag));
7898 }
7899
7900 /*
7901 * DEBUG kernels: verify that a kernel va<->pa translation
7902 * is safe by checking the underlying page_t is in a page
7903 * relocation-safe state.
7904 */
7905 #ifdef DEBUG
7906 void
sfmmu_check_kpfn(pfn_t pfn)7907 sfmmu_check_kpfn(pfn_t pfn)
7908 {
7909 page_t *pp;
7910 int index, cons;
7911
7912 if (hat_check_vtop == 0)
7913 return;
7914
7915 if (kvseg.s_base == NULL || panicstr)
7916 return;
7917
7918 pp = page_numtopp_nolock(pfn);
7919 if (!pp)
7920 return;
7921
7922 if (PAGE_LOCKED(pp) || PP_ISNORELOC(pp))
7923 return;
7924
7925 /*
7926 * Handed a large kernel page, we dig up the root page since we
7927 * know the root page might have the lock also.
7928 */
7929 if (pp->p_szc != 0) {
7930 index = PP_MAPINDEX(pp);
7931 cons = TTE8K;
7932 again:
7933 while (index != 0) {
7934 index >>= 1;
7935 if (index != 0)
7936 cons++;
7937 if (index & 0x1) {
7938 pp = PP_GROUPLEADER(pp, cons);
7939 goto again;
7940 }
7941 }
7942 }
7943
7944 if (PAGE_LOCKED(pp) || PP_ISNORELOC(pp))
7945 return;
7946
7947 /*
7948 * Pages need to be locked or allocated "permanent" (either from
7949 * static_arena arena or explicitly setting PG_NORELOC when calling
7950 * page_create_va()) for VA->PA translations to be valid.
7951 */
7952 if (!PP_ISNORELOC(pp))
7953 panic("Illegal VA->PA translation, pp 0x%p not permanent",
7954 (void *)pp);
7955 else
7956 panic("Illegal VA->PA translation, pp 0x%p not locked",
7957 (void *)pp);
7958 }
7959 #endif /* DEBUG */
7960
7961 /*
7962 * Returns a page frame number for a given virtual address.
7963 * Returns PFN_INVALID to indicate an invalid mapping
7964 */
7965 pfn_t
hat_getpfnum(struct hat * hat,caddr_t addr)7966 hat_getpfnum(struct hat *hat, caddr_t addr)
7967 {
7968 pfn_t pfn;
7969 tte_t tte;
7970
7971 /*
7972 * We would like to
7973 * ASSERT(AS_LOCK_HELD(as));
7974 * but we can't because the iommu driver will call this
7975 * routine at interrupt time and it can't grab the as lock
7976 * or it will deadlock: A thread could have the as lock
7977 * and be waiting for io. The io can't complete
7978 * because the interrupt thread is blocked trying to grab
7979 * the as lock.
7980 */
7981
7982 ASSERT(hat->sfmmu_xhat_provider == NULL);
7983
7984 if (hat == ksfmmup) {
7985 if (IS_KMEM_VA_LARGEPAGE(addr)) {
7986 ASSERT(segkmem_lpszc > 0);
7987 pfn = sfmmu_kvaszc2pfn(addr, segkmem_lpszc);
7988 if (pfn != PFN_INVALID) {
7989 sfmmu_check_kpfn(pfn);
7990 return (pfn);
7991 }
7992 } else if (segkpm && IS_KPM_ADDR(addr)) {
7993 return (sfmmu_kpm_vatopfn(addr));
7994 }
7995 while ((pfn = sfmmu_vatopfn(addr, ksfmmup, &tte))
7996 == PFN_SUSPENDED) {
7997 sfmmu_vatopfn_suspended(addr, ksfmmup, &tte);
7998 }
7999 sfmmu_check_kpfn(pfn);
8000 return (pfn);
8001 } else {
8002 return (sfmmu_uvatopfn(addr, hat, NULL));
8003 }
8004 }
8005
8006 /*
8007 * This routine will return both pfn and tte for the vaddr.
8008 */
8009 static pfn_t
sfmmu_uvatopfn(caddr_t vaddr,struct hat * sfmmup,tte_t * ttep)8010 sfmmu_uvatopfn(caddr_t vaddr, struct hat *sfmmup, tte_t *ttep)
8011 {
8012 struct hmehash_bucket *hmebp;
8013 hmeblk_tag hblktag;
8014 int hmeshift, hashno = 1;
8015 struct hme_blk *hmeblkp = NULL;
8016 tte_t tte;
8017
8018 struct sf_hment *sfhmep;
8019 pfn_t pfn;
8020
8021 /* support for ISM */
8022 ism_map_t *ism_map;
8023 ism_blk_t *ism_blkp;
8024 int i;
8025 sfmmu_t *ism_hatid = NULL;
8026 sfmmu_t *locked_hatid = NULL;
8027 sfmmu_t *sv_sfmmup = sfmmup;
8028 caddr_t sv_vaddr = vaddr;
8029 sf_srd_t *srdp;
8030
8031 if (ttep == NULL) {
8032 ttep = &tte;
8033 } else {
8034 ttep->ll = 0;
8035 }
8036
8037 ASSERT(sfmmup != ksfmmup);
8038 SFMMU_STAT(sf_user_vtop);
8039 /*
8040 * Set ism_hatid if vaddr falls in a ISM segment.
8041 */
8042 ism_blkp = sfmmup->sfmmu_iblk;
8043 if (ism_blkp != NULL) {
8044 sfmmu_ismhat_enter(sfmmup, 0);
8045 locked_hatid = sfmmup;
8046 }
8047 while (ism_blkp != NULL && ism_hatid == NULL) {
8048 ism_map = ism_blkp->iblk_maps;
8049 for (i = 0; ism_map[i].imap_ismhat && i < ISM_MAP_SLOTS; i++) {
8050 if (vaddr >= ism_start(ism_map[i]) &&
8051 vaddr < ism_end(ism_map[i])) {
8052 sfmmup = ism_hatid = ism_map[i].imap_ismhat;
8053 vaddr = (caddr_t)(vaddr -
8054 ism_start(ism_map[i]));
8055 break;
8056 }
8057 }
8058 ism_blkp = ism_blkp->iblk_next;
8059 }
8060 if (locked_hatid) {
8061 sfmmu_ismhat_exit(locked_hatid, 0);
8062 }
8063
8064 hblktag.htag_id = sfmmup;
8065 hblktag.htag_rid = SFMMU_INVALID_SHMERID;
8066 do {
8067 hmeshift = HME_HASH_SHIFT(hashno);
8068 hblktag.htag_bspage = HME_HASH_BSPAGE(vaddr, hmeshift);
8069 hblktag.htag_rehash = hashno;
8070 hmebp = HME_HASH_FUNCTION(sfmmup, vaddr, hmeshift);
8071
8072 SFMMU_HASH_LOCK(hmebp);
8073
8074 HME_HASH_FAST_SEARCH(hmebp, hblktag, hmeblkp);
8075 if (hmeblkp != NULL) {
8076 ASSERT(!hmeblkp->hblk_shared);
8077 HBLKTOHME(sfhmep, hmeblkp, vaddr);
8078 sfmmu_copytte(&sfhmep->hme_tte, ttep);
8079 SFMMU_HASH_UNLOCK(hmebp);
8080 if (TTE_IS_VALID(ttep)) {
8081 pfn = TTE_TO_PFN(vaddr, ttep);
8082 return (pfn);
8083 }
8084 break;
8085 }
8086 SFMMU_HASH_UNLOCK(hmebp);
8087 hashno++;
8088 } while (HME_REHASH(sfmmup) && (hashno <= mmu_hashcnt));
8089
8090 if (SF_HMERGNMAP_ISNULL(sv_sfmmup)) {
8091 return (PFN_INVALID);
8092 }
8093 srdp = sv_sfmmup->sfmmu_srdp;
8094 ASSERT(srdp != NULL);
8095 ASSERT(srdp->srd_refcnt != 0);
8096 hblktag.htag_id = srdp;
8097 hashno = 1;
8098 do {
8099 hmeshift = HME_HASH_SHIFT(hashno);
8100 hblktag.htag_bspage = HME_HASH_BSPAGE(sv_vaddr, hmeshift);
8101 hblktag.htag_rehash = hashno;
8102 hmebp = HME_HASH_FUNCTION(srdp, sv_vaddr, hmeshift);
8103
8104 SFMMU_HASH_LOCK(hmebp);
8105 for (hmeblkp = hmebp->hmeblkp; hmeblkp != NULL;
8106 hmeblkp = hmeblkp->hblk_next) {
8107 uint_t rid;
8108 sf_region_t *rgnp;
8109 caddr_t rsaddr;
8110 caddr_t readdr;
8111
8112 if (!HTAGS_EQ_SHME(hmeblkp->hblk_tag, hblktag,
8113 sv_sfmmup->sfmmu_hmeregion_map)) {
8114 continue;
8115 }
8116 ASSERT(hmeblkp->hblk_shared);
8117 rid = hmeblkp->hblk_tag.htag_rid;
8118 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
8119 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
8120 rgnp = srdp->srd_hmergnp[rid];
8121 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid);
8122 HBLKTOHME(sfhmep, hmeblkp, sv_vaddr);
8123 sfmmu_copytte(&sfhmep->hme_tte, ttep);
8124 rsaddr = rgnp->rgn_saddr;
8125 readdr = rsaddr + rgnp->rgn_size;
8126 #ifdef DEBUG
8127 if (TTE_IS_VALID(ttep) ||
8128 get_hblk_ttesz(hmeblkp) > TTE8K) {
8129 caddr_t eva = tte_to_evaddr(hmeblkp, ttep);
8130 ASSERT(eva > sv_vaddr);
8131 ASSERT(sv_vaddr >= rsaddr);
8132 ASSERT(sv_vaddr < readdr);
8133 ASSERT(eva <= readdr);
8134 }
8135 #endif /* DEBUG */
8136 /*
8137 * Continue the search if we
8138 * found an invalid 8K tte outside of the area
8139 * covered by this hmeblk's region.
8140 */
8141 if (TTE_IS_VALID(ttep)) {
8142 SFMMU_HASH_UNLOCK(hmebp);
8143 pfn = TTE_TO_PFN(sv_vaddr, ttep);
8144 return (pfn);
8145 } else if (get_hblk_ttesz(hmeblkp) > TTE8K ||
8146 (sv_vaddr >= rsaddr && sv_vaddr < readdr)) {
8147 SFMMU_HASH_UNLOCK(hmebp);
8148 pfn = PFN_INVALID;
8149 return (pfn);
8150 }
8151 }
8152 SFMMU_HASH_UNLOCK(hmebp);
8153 hashno++;
8154 } while (hashno <= mmu_hashcnt);
8155 return (PFN_INVALID);
8156 }
8157
8158
8159 /*
8160 * For compatability with AT&T and later optimizations
8161 */
8162 /* ARGSUSED */
8163 void
hat_map(struct hat * hat,caddr_t addr,size_t len,uint_t flags)8164 hat_map(struct hat *hat, caddr_t addr, size_t len, uint_t flags)
8165 {
8166 ASSERT(hat != NULL);
8167 ASSERT(hat->sfmmu_xhat_provider == NULL);
8168 }
8169
8170 /*
8171 * Return the number of mappings to a particular page. This number is an
8172 * approximation of the number of people sharing the page.
8173 *
8174 * shared hmeblks or ism hmeblks are counted as 1 mapping here.
8175 * hat_page_checkshare() can be used to compare threshold to share
8176 * count that reflects the number of region sharers albeit at higher cost.
8177 */
8178 ulong_t
hat_page_getshare(page_t * pp)8179 hat_page_getshare(page_t *pp)
8180 {
8181 page_t *spp = pp; /* start page */
8182 kmutex_t *pml;
8183 ulong_t cnt;
8184 int index, sz = TTE64K;
8185
8186 /*
8187 * We need to grab the mlist lock to make sure any outstanding
8188 * load/unloads complete. Otherwise we could return zero
8189 * even though the unload(s) hasn't finished yet.
8190 */
8191 pml = sfmmu_mlist_enter(spp);
8192 cnt = spp->p_share;
8193
8194 #ifdef VAC
8195 if (kpm_enable)
8196 cnt += spp->p_kpmref;
8197 #endif
8198 if (vpm_enable && pp->p_vpmref) {
8199 cnt += 1;
8200 }
8201
8202 /*
8203 * If we have any large mappings, we count the number of
8204 * mappings that this large page is part of.
8205 */
8206 index = PP_MAPINDEX(spp);
8207 index >>= 1;
8208 while (index) {
8209 pp = PP_GROUPLEADER(spp, sz);
8210 if ((index & 0x1) && pp != spp) {
8211 cnt += pp->p_share;
8212 spp = pp;
8213 }
8214 index >>= 1;
8215 sz++;
8216 }
8217 sfmmu_mlist_exit(pml);
8218 return (cnt);
8219 }
8220
8221 /*
8222 * Return 1 if the number of mappings exceeds sh_thresh. Return 0
8223 * otherwise. Count shared hmeblks by region's refcnt.
8224 */
8225 int
hat_page_checkshare(page_t * pp,ulong_t sh_thresh)8226 hat_page_checkshare(page_t *pp, ulong_t sh_thresh)
8227 {
8228 kmutex_t *pml;
8229 ulong_t cnt = 0;
8230 int index, sz = TTE8K;
8231 struct sf_hment *sfhme, *tmphme = NULL;
8232 struct hme_blk *hmeblkp;
8233
8234 pml = sfmmu_mlist_enter(pp);
8235
8236 #ifdef VAC
8237 if (kpm_enable)
8238 cnt = pp->p_kpmref;
8239 #endif
8240
8241 if (vpm_enable && pp->p_vpmref) {
8242 cnt += 1;
8243 }
8244
8245 if (pp->p_share + cnt > sh_thresh) {
8246 sfmmu_mlist_exit(pml);
8247 return (1);
8248 }
8249
8250 index = PP_MAPINDEX(pp);
8251
8252 again:
8253 for (sfhme = pp->p_mapping; sfhme; sfhme = tmphme) {
8254 tmphme = sfhme->hme_next;
8255 if (IS_PAHME(sfhme)) {
8256 continue;
8257 }
8258
8259 hmeblkp = sfmmu_hmetohblk(sfhme);
8260 if (hmeblkp->hblk_xhat_bit) {
8261 cnt++;
8262 if (cnt > sh_thresh) {
8263 sfmmu_mlist_exit(pml);
8264 return (1);
8265 }
8266 continue;
8267 }
8268 if (hme_size(sfhme) != sz) {
8269 continue;
8270 }
8271
8272 if (hmeblkp->hblk_shared) {
8273 sf_srd_t *srdp = hblktosrd(hmeblkp);
8274 uint_t rid = hmeblkp->hblk_tag.htag_rid;
8275 sf_region_t *rgnp;
8276 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
8277 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
8278 ASSERT(srdp != NULL);
8279 rgnp = srdp->srd_hmergnp[rid];
8280 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp,
8281 rgnp, rid);
8282 cnt += rgnp->rgn_refcnt;
8283 } else {
8284 cnt++;
8285 }
8286 if (cnt > sh_thresh) {
8287 sfmmu_mlist_exit(pml);
8288 return (1);
8289 }
8290 }
8291
8292 index >>= 1;
8293 sz++;
8294 while (index) {
8295 pp = PP_GROUPLEADER(pp, sz);
8296 ASSERT(sfmmu_mlist_held(pp));
8297 if (index & 0x1) {
8298 goto again;
8299 }
8300 index >>= 1;
8301 sz++;
8302 }
8303 sfmmu_mlist_exit(pml);
8304 return (0);
8305 }
8306
8307 /*
8308 * Unload all large mappings to the pp and reset the p_szc field of every
8309 * constituent page according to the remaining mappings.
8310 *
8311 * pp must be locked SE_EXCL. Even though no other constituent pages are
8312 * locked it's legal to unload the large mappings to the pp because all
8313 * constituent pages of large locked mappings have to be locked SE_SHARED.
8314 * This means if we have SE_EXCL lock on one of constituent pages none of the
8315 * large mappings to pp are locked.
8316 *
8317 * Decrease p_szc field starting from the last constituent page and ending
8318 * with the root page. This method is used because other threads rely on the
8319 * root's p_szc to find the lock to syncronize on. After a root page_t's p_szc
8320 * is demoted then other threads will succeed in sfmmu_mlspl_enter(). This
8321 * ensures that p_szc changes of the constituent pages appears atomic for all
8322 * threads that use sfmmu_mlspl_enter() to examine p_szc field.
8323 *
8324 * This mechanism is only used for file system pages where it's not always
8325 * possible to get SE_EXCL locks on all constituent pages to demote the size
8326 * code (as is done for anonymous or kernel large pages).
8327 *
8328 * See more comments in front of sfmmu_mlspl_enter().
8329 */
8330 void
hat_page_demote(page_t * pp)8331 hat_page_demote(page_t *pp)
8332 {
8333 int index;
8334 int sz;
8335 cpuset_t cpuset;
8336 int sync = 0;
8337 page_t *rootpp;
8338 struct sf_hment *sfhme;
8339 struct sf_hment *tmphme = NULL;
8340 struct hme_blk *hmeblkp;
8341 uint_t pszc;
8342 page_t *lastpp;
8343 cpuset_t tset;
8344 pgcnt_t npgs;
8345 kmutex_t *pml;
8346 kmutex_t *pmtx = NULL;
8347
8348 ASSERT(PAGE_EXCL(pp));
8349 ASSERT(!PP_ISFREE(pp));
8350 ASSERT(!PP_ISKAS(pp));
8351 ASSERT(page_szc_lock_assert(pp));
8352 pml = sfmmu_mlist_enter(pp);
8353
8354 pszc = pp->p_szc;
8355 if (pszc == 0) {
8356 goto out;
8357 }
8358
8359 index = PP_MAPINDEX(pp) >> 1;
8360
8361 if (index) {
8362 CPUSET_ZERO(cpuset);
8363 sz = TTE64K;
8364 sync = 1;
8365 }
8366
8367 while (index) {
8368 if (!(index & 0x1)) {
8369 index >>= 1;
8370 sz++;
8371 continue;
8372 }
8373 ASSERT(sz <= pszc);
8374 rootpp = PP_GROUPLEADER(pp, sz);
8375 for (sfhme = rootpp->p_mapping; sfhme; sfhme = tmphme) {
8376 tmphme = sfhme->hme_next;
8377 ASSERT(!IS_PAHME(sfhme));
8378 hmeblkp = sfmmu_hmetohblk(sfhme);
8379 if (hme_size(sfhme) != sz) {
8380 continue;
8381 }
8382 if (hmeblkp->hblk_xhat_bit) {
8383 cmn_err(CE_PANIC,
8384 "hat_page_demote: xhat hmeblk");
8385 }
8386 tset = sfmmu_pageunload(rootpp, sfhme, sz);
8387 CPUSET_OR(cpuset, tset);
8388 }
8389 if (index >>= 1) {
8390 sz++;
8391 }
8392 }
8393
8394 ASSERT(!PP_ISMAPPED_LARGE(pp));
8395
8396 if (sync) {
8397 xt_sync(cpuset);
8398 #ifdef VAC
8399 if (PP_ISTNC(pp)) {
8400 conv_tnc(rootpp, sz);
8401 }
8402 #endif /* VAC */
8403 }
8404
8405 pmtx = sfmmu_page_enter(pp);
8406
8407 ASSERT(pp->p_szc == pszc);
8408 rootpp = PP_PAGEROOT(pp);
8409 ASSERT(rootpp->p_szc == pszc);
8410 lastpp = PP_PAGENEXT_N(rootpp, TTEPAGES(pszc) - 1);
8411
8412 while (lastpp != rootpp) {
8413 sz = PP_MAPINDEX(lastpp) ? fnd_mapping_sz(lastpp) : 0;
8414 ASSERT(sz < pszc);
8415 npgs = (sz == 0) ? 1 : TTEPAGES(sz);
8416 ASSERT(P2PHASE(lastpp->p_pagenum, npgs) == npgs - 1);
8417 while (--npgs > 0) {
8418 lastpp->p_szc = (uchar_t)sz;
8419 lastpp = PP_PAGEPREV(lastpp);
8420 }
8421 if (sz) {
8422 /*
8423 * make sure before current root's pszc
8424 * is updated all updates to constituent pages pszc
8425 * fields are globally visible.
8426 */
8427 membar_producer();
8428 }
8429 lastpp->p_szc = sz;
8430 ASSERT(IS_P2ALIGNED(lastpp->p_pagenum, TTEPAGES(sz)));
8431 if (lastpp != rootpp) {
8432 lastpp = PP_PAGEPREV(lastpp);
8433 }
8434 }
8435 if (sz == 0) {
8436 /* the loop above doesn't cover this case */
8437 rootpp->p_szc = 0;
8438 }
8439 out:
8440 ASSERT(pp->p_szc == 0);
8441 if (pmtx != NULL) {
8442 sfmmu_page_exit(pmtx);
8443 }
8444 sfmmu_mlist_exit(pml);
8445 }
8446
8447 /*
8448 * Refresh the HAT ismttecnt[] element for size szc.
8449 * Caller must have set ISM busy flag to prevent mapping
8450 * lists from changing while we're traversing them.
8451 */
8452 pgcnt_t
ism_tsb_entries(sfmmu_t * sfmmup,int szc)8453 ism_tsb_entries(sfmmu_t *sfmmup, int szc)
8454 {
8455 ism_blk_t *ism_blkp = sfmmup->sfmmu_iblk;
8456 ism_map_t *ism_map;
8457 pgcnt_t npgs = 0;
8458 pgcnt_t npgs_scd = 0;
8459 int j;
8460 sf_scd_t *scdp;
8461 uchar_t rid;
8462
8463 ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY));
8464 scdp = sfmmup->sfmmu_scdp;
8465
8466 for (; ism_blkp != NULL; ism_blkp = ism_blkp->iblk_next) {
8467 ism_map = ism_blkp->iblk_maps;
8468 for (j = 0; ism_map[j].imap_ismhat && j < ISM_MAP_SLOTS; j++) {
8469 rid = ism_map[j].imap_rid;
8470 ASSERT(rid == SFMMU_INVALID_ISMRID ||
8471 rid < sfmmup->sfmmu_srdp->srd_next_ismrid);
8472
8473 if (scdp != NULL && rid != SFMMU_INVALID_ISMRID &&
8474 SF_RGNMAP_TEST(scdp->scd_ismregion_map, rid)) {
8475 /* ISM is in sfmmup's SCD */
8476 npgs_scd +=
8477 ism_map[j].imap_ismhat->sfmmu_ttecnt[szc];
8478 } else {
8479 /* ISMs is not in SCD */
8480 npgs +=
8481 ism_map[j].imap_ismhat->sfmmu_ttecnt[szc];
8482 }
8483 }
8484 }
8485 sfmmup->sfmmu_ismttecnt[szc] = npgs;
8486 sfmmup->sfmmu_scdismttecnt[szc] = npgs_scd;
8487 return (npgs);
8488 }
8489
8490 /*
8491 * Yield the memory claim requirement for an address space.
8492 *
8493 * This is currently implemented as the number of bytes that have active
8494 * hardware translations that have page structures. Therefore, it can
8495 * underestimate the traditional resident set size, eg, if the
8496 * physical page is present and the hardware translation is missing;
8497 * and it can overestimate the rss, eg, if there are active
8498 * translations to a frame buffer with page structs.
8499 * Also, it does not take sharing into account.
8500 *
8501 * Note that we don't acquire locks here since this function is most often
8502 * called from the clock thread.
8503 */
8504 size_t
hat_get_mapped_size(struct hat * hat)8505 hat_get_mapped_size(struct hat *hat)
8506 {
8507 size_t assize = 0;
8508 int i;
8509
8510 if (hat == NULL)
8511 return (0);
8512
8513 ASSERT(hat->sfmmu_xhat_provider == NULL);
8514
8515 for (i = 0; i < mmu_page_sizes; i++)
8516 assize += ((pgcnt_t)hat->sfmmu_ttecnt[i] +
8517 (pgcnt_t)hat->sfmmu_scdrttecnt[i]) * TTEBYTES(i);
8518
8519 if (hat->sfmmu_iblk == NULL)
8520 return (assize);
8521
8522 for (i = 0; i < mmu_page_sizes; i++)
8523 assize += ((pgcnt_t)hat->sfmmu_ismttecnt[i] +
8524 (pgcnt_t)hat->sfmmu_scdismttecnt[i]) * TTEBYTES(i);
8525
8526 return (assize);
8527 }
8528
8529 int
hat_stats_enable(struct hat * hat)8530 hat_stats_enable(struct hat *hat)
8531 {
8532 hatlock_t *hatlockp;
8533
8534 ASSERT(hat->sfmmu_xhat_provider == NULL);
8535
8536 hatlockp = sfmmu_hat_enter(hat);
8537 hat->sfmmu_rmstat++;
8538 sfmmu_hat_exit(hatlockp);
8539 return (1);
8540 }
8541
8542 void
hat_stats_disable(struct hat * hat)8543 hat_stats_disable(struct hat *hat)
8544 {
8545 hatlock_t *hatlockp;
8546
8547 ASSERT(hat->sfmmu_xhat_provider == NULL);
8548
8549 hatlockp = sfmmu_hat_enter(hat);
8550 hat->sfmmu_rmstat--;
8551 sfmmu_hat_exit(hatlockp);
8552 }
8553
8554 /*
8555 * Routines for entering or removing ourselves from the
8556 * ism_hat's mapping list. This is used for both private and
8557 * SCD hats.
8558 */
8559 static void
iment_add(struct ism_ment * iment,struct hat * ism_hat)8560 iment_add(struct ism_ment *iment, struct hat *ism_hat)
8561 {
8562 ASSERT(MUTEX_HELD(&ism_mlist_lock));
8563
8564 iment->iment_prev = NULL;
8565 iment->iment_next = ism_hat->sfmmu_iment;
8566 if (ism_hat->sfmmu_iment) {
8567 ism_hat->sfmmu_iment->iment_prev = iment;
8568 }
8569 ism_hat->sfmmu_iment = iment;
8570 }
8571
8572 static void
iment_sub(struct ism_ment * iment,struct hat * ism_hat)8573 iment_sub(struct ism_ment *iment, struct hat *ism_hat)
8574 {
8575 ASSERT(MUTEX_HELD(&ism_mlist_lock));
8576
8577 if (ism_hat->sfmmu_iment == NULL) {
8578 panic("ism map entry remove - no entries");
8579 }
8580
8581 if (iment->iment_prev) {
8582 ASSERT(ism_hat->sfmmu_iment != iment);
8583 iment->iment_prev->iment_next = iment->iment_next;
8584 } else {
8585 ASSERT(ism_hat->sfmmu_iment == iment);
8586 ism_hat->sfmmu_iment = iment->iment_next;
8587 }
8588
8589 if (iment->iment_next) {
8590 iment->iment_next->iment_prev = iment->iment_prev;
8591 }
8592
8593 /*
8594 * zero out the entry
8595 */
8596 iment->iment_next = NULL;
8597 iment->iment_prev = NULL;
8598 iment->iment_hat = NULL;
8599 iment->iment_base_va = 0;
8600 }
8601
8602 /*
8603 * Hat_share()/unshare() return an (non-zero) error
8604 * when saddr and daddr are not properly aligned.
8605 *
8606 * The top level mapping element determines the alignment
8607 * requirement for saddr and daddr, depending on different
8608 * architectures.
8609 *
8610 * When hat_share()/unshare() are not supported,
8611 * HATOP_SHARE()/UNSHARE() return 0
8612 */
8613 int
hat_share(struct hat * sfmmup,caddr_t addr,struct hat * ism_hatid,caddr_t sptaddr,size_t len,uint_t ismszc)8614 hat_share(struct hat *sfmmup, caddr_t addr,
8615 struct hat *ism_hatid, caddr_t sptaddr, size_t len, uint_t ismszc)
8616 {
8617 ism_blk_t *ism_blkp;
8618 ism_blk_t *new_iblk;
8619 ism_map_t *ism_map;
8620 ism_ment_t *ism_ment;
8621 int i, added;
8622 hatlock_t *hatlockp;
8623 int reload_mmu = 0;
8624 uint_t ismshift = page_get_shift(ismszc);
8625 size_t ismpgsz = page_get_pagesize(ismszc);
8626 uint_t ismmask = (uint_t)ismpgsz - 1;
8627 size_t sh_size = ISM_SHIFT(ismshift, len);
8628 ushort_t ismhatflag;
8629 hat_region_cookie_t rcookie;
8630 sf_scd_t *old_scdp;
8631
8632 #ifdef DEBUG
8633 caddr_t eaddr = addr + len;
8634 #endif /* DEBUG */
8635
8636 ASSERT(ism_hatid != NULL && sfmmup != NULL);
8637 ASSERT(sptaddr == ISMID_STARTADDR);
8638 /*
8639 * Check the alignment.
8640 */
8641 if (!ISM_ALIGNED(ismshift, addr) || !ISM_ALIGNED(ismshift, sptaddr))
8642 return (EINVAL);
8643
8644 /*
8645 * Check size alignment.
8646 */
8647 if (!ISM_ALIGNED(ismshift, len))
8648 return (EINVAL);
8649
8650 ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
8651
8652 /*
8653 * Allocate ism_ment for the ism_hat's mapping list, and an
8654 * ism map blk in case we need one. We must do our
8655 * allocations before acquiring locks to prevent a deadlock
8656 * in the kmem allocator on the mapping list lock.
8657 */
8658 new_iblk = kmem_cache_alloc(ism_blk_cache, KM_SLEEP);
8659 ism_ment = kmem_cache_alloc(ism_ment_cache, KM_SLEEP);
8660
8661 /*
8662 * Serialize ISM mappings with the ISM busy flag, and also the
8663 * trap handlers.
8664 */
8665 sfmmu_ismhat_enter(sfmmup, 0);
8666
8667 /*
8668 * Allocate an ism map blk if necessary.
8669 */
8670 if (sfmmup->sfmmu_iblk == NULL) {
8671 sfmmup->sfmmu_iblk = new_iblk;
8672 bzero(new_iblk, sizeof (*new_iblk));
8673 new_iblk->iblk_nextpa = (uint64_t)-1;
8674 membar_stst(); /* make sure next ptr visible to all CPUs */
8675 sfmmup->sfmmu_ismblkpa = va_to_pa((caddr_t)new_iblk);
8676 reload_mmu = 1;
8677 new_iblk = NULL;
8678 }
8679
8680 #ifdef DEBUG
8681 /*
8682 * Make sure mapping does not already exist.
8683 */
8684 ism_blkp = sfmmup->sfmmu_iblk;
8685 while (ism_blkp != NULL) {
8686 ism_map = ism_blkp->iblk_maps;
8687 for (i = 0; i < ISM_MAP_SLOTS && ism_map[i].imap_ismhat; i++) {
8688 if ((addr >= ism_start(ism_map[i]) &&
8689 addr < ism_end(ism_map[i])) ||
8690 eaddr > ism_start(ism_map[i]) &&
8691 eaddr <= ism_end(ism_map[i])) {
8692 panic("sfmmu_share: Already mapped!");
8693 }
8694 }
8695 ism_blkp = ism_blkp->iblk_next;
8696 }
8697 #endif /* DEBUG */
8698
8699 ASSERT(ismszc >= TTE4M);
8700 if (ismszc == TTE4M) {
8701 ismhatflag = HAT_4M_FLAG;
8702 } else if (ismszc == TTE32M) {
8703 ismhatflag = HAT_32M_FLAG;
8704 } else if (ismszc == TTE256M) {
8705 ismhatflag = HAT_256M_FLAG;
8706 }
8707 /*
8708 * Add mapping to first available mapping slot.
8709 */
8710 ism_blkp = sfmmup->sfmmu_iblk;
8711 added = 0;
8712 while (!added) {
8713 ism_map = ism_blkp->iblk_maps;
8714 for (i = 0; i < ISM_MAP_SLOTS; i++) {
8715 if (ism_map[i].imap_ismhat == NULL) {
8716
8717 ism_map[i].imap_ismhat = ism_hatid;
8718 ism_map[i].imap_vb_shift = (uchar_t)ismshift;
8719 ism_map[i].imap_rid = SFMMU_INVALID_ISMRID;
8720 ism_map[i].imap_hatflags = ismhatflag;
8721 ism_map[i].imap_sz_mask = ismmask;
8722 /*
8723 * imap_seg is checked in ISM_CHECK to see if
8724 * non-NULL, then other info assumed valid.
8725 */
8726 membar_stst();
8727 ism_map[i].imap_seg = (uintptr_t)addr | sh_size;
8728 ism_map[i].imap_ment = ism_ment;
8729
8730 /*
8731 * Now add ourselves to the ism_hat's
8732 * mapping list.
8733 */
8734 ism_ment->iment_hat = sfmmup;
8735 ism_ment->iment_base_va = addr;
8736 ism_hatid->sfmmu_ismhat = 1;
8737 mutex_enter(&ism_mlist_lock);
8738 iment_add(ism_ment, ism_hatid);
8739 mutex_exit(&ism_mlist_lock);
8740 added = 1;
8741 break;
8742 }
8743 }
8744 if (!added && ism_blkp->iblk_next == NULL) {
8745 ism_blkp->iblk_next = new_iblk;
8746 new_iblk = NULL;
8747 bzero(ism_blkp->iblk_next,
8748 sizeof (*ism_blkp->iblk_next));
8749 ism_blkp->iblk_next->iblk_nextpa = (uint64_t)-1;
8750 membar_stst();
8751 ism_blkp->iblk_nextpa =
8752 va_to_pa((caddr_t)ism_blkp->iblk_next);
8753 }
8754 ism_blkp = ism_blkp->iblk_next;
8755 }
8756
8757 /*
8758 * After calling hat_join_region, sfmmup may join a new SCD or
8759 * move from the old scd to a new scd, in which case, we want to
8760 * shrink the sfmmup's private tsb size, i.e., pass shrink to
8761 * sfmmu_check_page_sizes at the end of this routine.
8762 */
8763 old_scdp = sfmmup->sfmmu_scdp;
8764
8765 rcookie = hat_join_region(sfmmup, addr, len, (void *)ism_hatid, 0,
8766 PROT_ALL, ismszc, NULL, HAT_REGION_ISM);
8767 if (rcookie != HAT_INVALID_REGION_COOKIE) {
8768 ism_map[i].imap_rid = (uchar_t)((uint64_t)rcookie);
8769 }
8770 /*
8771 * Update our counters for this sfmmup's ism mappings.
8772 */
8773 for (i = 0; i <= ismszc; i++) {
8774 if (!(disable_ism_large_pages & (1 << i)))
8775 (void) ism_tsb_entries(sfmmup, i);
8776 }
8777
8778 /*
8779 * For ISM and DISM we do not support 512K pages, so we only only
8780 * search the 4M and 8K/64K hashes for 4 pagesize cpus, and search the
8781 * 256M or 32M, and 4M and 8K/64K hashes for 6 pagesize cpus.
8782 *
8783 * Need to set 32M/256M ISM flags to make sure
8784 * sfmmu_check_page_sizes() enables them on Panther.
8785 */
8786 ASSERT((disable_ism_large_pages & (1 << TTE512K)) != 0);
8787
8788 switch (ismszc) {
8789 case TTE256M:
8790 if (!SFMMU_FLAGS_ISSET(sfmmup, HAT_256M_ISM)) {
8791 hatlockp = sfmmu_hat_enter(sfmmup);
8792 SFMMU_FLAGS_SET(sfmmup, HAT_256M_ISM);
8793 sfmmu_hat_exit(hatlockp);
8794 }
8795 break;
8796 case TTE32M:
8797 if (!SFMMU_FLAGS_ISSET(sfmmup, HAT_32M_ISM)) {
8798 hatlockp = sfmmu_hat_enter(sfmmup);
8799 SFMMU_FLAGS_SET(sfmmup, HAT_32M_ISM);
8800 sfmmu_hat_exit(hatlockp);
8801 }
8802 break;
8803 default:
8804 break;
8805 }
8806
8807 /*
8808 * If we updated the ismblkpa for this HAT we must make
8809 * sure all CPUs running this process reload their tsbmiss area.
8810 * Otherwise they will fail to load the mappings in the tsbmiss
8811 * handler and will loop calling pagefault().
8812 */
8813 if (reload_mmu) {
8814 hatlockp = sfmmu_hat_enter(sfmmup);
8815 sfmmu_sync_mmustate(sfmmup);
8816 sfmmu_hat_exit(hatlockp);
8817 }
8818
8819 sfmmu_ismhat_exit(sfmmup, 0);
8820
8821 /*
8822 * Free up ismblk if we didn't use it.
8823 */
8824 if (new_iblk != NULL)
8825 kmem_cache_free(ism_blk_cache, new_iblk);
8826
8827 /*
8828 * Check TSB and TLB page sizes.
8829 */
8830 if (sfmmup->sfmmu_scdp != NULL && old_scdp != sfmmup->sfmmu_scdp) {
8831 sfmmu_check_page_sizes(sfmmup, 0);
8832 } else {
8833 sfmmu_check_page_sizes(sfmmup, 1);
8834 }
8835 return (0);
8836 }
8837
8838 /*
8839 * hat_unshare removes exactly one ism_map from
8840 * this process's as. It expects multiple calls
8841 * to hat_unshare for multiple shm segments.
8842 */
8843 void
hat_unshare(struct hat * sfmmup,caddr_t addr,size_t len,uint_t ismszc)8844 hat_unshare(struct hat *sfmmup, caddr_t addr, size_t len, uint_t ismszc)
8845 {
8846 ism_map_t *ism_map;
8847 ism_ment_t *free_ment = NULL;
8848 ism_blk_t *ism_blkp;
8849 struct hat *ism_hatid;
8850 int found, i;
8851 hatlock_t *hatlockp;
8852 struct tsb_info *tsbinfo;
8853 uint_t ismshift = page_get_shift(ismszc);
8854 size_t sh_size = ISM_SHIFT(ismshift, len);
8855 uchar_t ism_rid;
8856 sf_scd_t *old_scdp;
8857
8858 ASSERT(ISM_ALIGNED(ismshift, addr));
8859 ASSERT(ISM_ALIGNED(ismshift, len));
8860 ASSERT(sfmmup != NULL);
8861 ASSERT(sfmmup != ksfmmup);
8862
8863 if (sfmmup->sfmmu_xhat_provider) {
8864 XHAT_UNSHARE(sfmmup, addr, len);
8865 return;
8866 } else {
8867 /*
8868 * This must be a CPU HAT. If the address space has
8869 * XHATs attached, inform all XHATs that ISM segment
8870 * is going away
8871 */
8872 ASSERT(sfmmup->sfmmu_as != NULL);
8873 if (sfmmup->sfmmu_as->a_xhat != NULL)
8874 xhat_unshare_all(sfmmup->sfmmu_as, addr, len);
8875 }
8876
8877 /*
8878 * Make sure that during the entire time ISM mappings are removed,
8879 * the trap handlers serialize behind us, and that no one else
8880 * can be mucking with ISM mappings. This also lets us get away
8881 * with not doing expensive cross calls to flush the TLB -- we
8882 * just discard the context, flush the entire TSB, and call it
8883 * a day.
8884 */
8885 sfmmu_ismhat_enter(sfmmup, 0);
8886
8887 /*
8888 * Remove the mapping.
8889 *
8890 * We can't have any holes in the ism map.
8891 * The tsb miss code while searching the ism map will
8892 * stop on an empty map slot. So we must move
8893 * everyone past the hole up 1 if any.
8894 *
8895 * Also empty ism map blks are not freed until the
8896 * process exits. This is to prevent a MT race condition
8897 * between sfmmu_unshare() and sfmmu_tsbmiss_exception().
8898 */
8899 found = 0;
8900 ism_blkp = sfmmup->sfmmu_iblk;
8901 while (!found && ism_blkp != NULL) {
8902 ism_map = ism_blkp->iblk_maps;
8903 for (i = 0; i < ISM_MAP_SLOTS; i++) {
8904 if (addr == ism_start(ism_map[i]) &&
8905 sh_size == (size_t)(ism_size(ism_map[i]))) {
8906 found = 1;
8907 break;
8908 }
8909 }
8910 if (!found)
8911 ism_blkp = ism_blkp->iblk_next;
8912 }
8913
8914 if (found) {
8915 ism_hatid = ism_map[i].imap_ismhat;
8916 ism_rid = ism_map[i].imap_rid;
8917 ASSERT(ism_hatid != NULL);
8918 ASSERT(ism_hatid->sfmmu_ismhat == 1);
8919
8920 /*
8921 * After hat_leave_region, the sfmmup may leave SCD,
8922 * in which case, we want to grow the private tsb size when
8923 * calling sfmmu_check_page_sizes at the end of the routine.
8924 */
8925 old_scdp = sfmmup->sfmmu_scdp;
8926 /*
8927 * Then remove ourselves from the region.
8928 */
8929 if (ism_rid != SFMMU_INVALID_ISMRID) {
8930 hat_leave_region(sfmmup, (void *)((uint64_t)ism_rid),
8931 HAT_REGION_ISM);
8932 }
8933
8934 /*
8935 * And now guarantee that any other cpu
8936 * that tries to process an ISM miss
8937 * will go to tl=0.
8938 */
8939 hatlockp = sfmmu_hat_enter(sfmmup);
8940 sfmmu_invalidate_ctx(sfmmup);
8941 sfmmu_hat_exit(hatlockp);
8942
8943 /*
8944 * Remove ourselves from the ism mapping list.
8945 */
8946 mutex_enter(&ism_mlist_lock);
8947 iment_sub(ism_map[i].imap_ment, ism_hatid);
8948 mutex_exit(&ism_mlist_lock);
8949 free_ment = ism_map[i].imap_ment;
8950
8951 /*
8952 * We delete the ism map by copying
8953 * the next map over the current one.
8954 * We will take the next one in the maps
8955 * array or from the next ism_blk.
8956 */
8957 while (ism_blkp != NULL) {
8958 ism_map = ism_blkp->iblk_maps;
8959 while (i < (ISM_MAP_SLOTS - 1)) {
8960 ism_map[i] = ism_map[i + 1];
8961 i++;
8962 }
8963 /* i == (ISM_MAP_SLOTS - 1) */
8964 ism_blkp = ism_blkp->iblk_next;
8965 if (ism_blkp != NULL) {
8966 ism_map[i] = ism_blkp->iblk_maps[0];
8967 i = 0;
8968 } else {
8969 ism_map[i].imap_seg = 0;
8970 ism_map[i].imap_vb_shift = 0;
8971 ism_map[i].imap_rid = SFMMU_INVALID_ISMRID;
8972 ism_map[i].imap_hatflags = 0;
8973 ism_map[i].imap_sz_mask = 0;
8974 ism_map[i].imap_ismhat = NULL;
8975 ism_map[i].imap_ment = NULL;
8976 }
8977 }
8978
8979 /*
8980 * Now flush entire TSB for the process, since
8981 * demapping page by page can be too expensive.
8982 * We don't have to flush the TLB here anymore
8983 * since we switch to a new TLB ctx instead.
8984 * Also, there is no need to flush if the process
8985 * is exiting since the TSB will be freed later.
8986 */
8987 if (!sfmmup->sfmmu_free) {
8988 hatlockp = sfmmu_hat_enter(sfmmup);
8989 for (tsbinfo = sfmmup->sfmmu_tsb; tsbinfo != NULL;
8990 tsbinfo = tsbinfo->tsb_next) {
8991 if (tsbinfo->tsb_flags & TSB_SWAPPED)
8992 continue;
8993 if (tsbinfo->tsb_flags & TSB_RELOC_FLAG) {
8994 tsbinfo->tsb_flags |=
8995 TSB_FLUSH_NEEDED;
8996 continue;
8997 }
8998
8999 sfmmu_inv_tsb(tsbinfo->tsb_va,
9000 TSB_BYTES(tsbinfo->tsb_szc));
9001 }
9002 sfmmu_hat_exit(hatlockp);
9003 }
9004 }
9005
9006 /*
9007 * Update our counters for this sfmmup's ism mappings.
9008 */
9009 for (i = 0; i <= ismszc; i++) {
9010 if (!(disable_ism_large_pages & (1 << i)))
9011 (void) ism_tsb_entries(sfmmup, i);
9012 }
9013
9014 sfmmu_ismhat_exit(sfmmup, 0);
9015
9016 /*
9017 * We must do our freeing here after dropping locks
9018 * to prevent a deadlock in the kmem allocator on the
9019 * mapping list lock.
9020 */
9021 if (free_ment != NULL)
9022 kmem_cache_free(ism_ment_cache, free_ment);
9023
9024 /*
9025 * Check TSB and TLB page sizes if the process isn't exiting.
9026 */
9027 if (!sfmmup->sfmmu_free) {
9028 if (found && old_scdp != NULL && sfmmup->sfmmu_scdp == NULL) {
9029 sfmmu_check_page_sizes(sfmmup, 1);
9030 } else {
9031 sfmmu_check_page_sizes(sfmmup, 0);
9032 }
9033 }
9034 }
9035
9036 /* ARGSUSED */
9037 static int
sfmmu_idcache_constructor(void * buf,void * cdrarg,int kmflags)9038 sfmmu_idcache_constructor(void *buf, void *cdrarg, int kmflags)
9039 {
9040 /* void *buf is sfmmu_t pointer */
9041 bzero(buf, sizeof (sfmmu_t));
9042
9043 return (0);
9044 }
9045
9046 /* ARGSUSED */
9047 static void
sfmmu_idcache_destructor(void * buf,void * cdrarg)9048 sfmmu_idcache_destructor(void *buf, void *cdrarg)
9049 {
9050 /* void *buf is sfmmu_t pointer */
9051 }
9052
9053 /*
9054 * setup kmem hmeblks by bzeroing all members and initializing the nextpa
9055 * field to be the pa of this hmeblk
9056 */
9057 /* ARGSUSED */
9058 static int
sfmmu_hblkcache_constructor(void * buf,void * cdrarg,int kmflags)9059 sfmmu_hblkcache_constructor(void *buf, void *cdrarg, int kmflags)
9060 {
9061 struct hme_blk *hmeblkp;
9062
9063 bzero(buf, (size_t)cdrarg);
9064 hmeblkp = (struct hme_blk *)buf;
9065 hmeblkp->hblk_nextpa = va_to_pa((caddr_t)hmeblkp);
9066
9067 #ifdef HBLK_TRACE
9068 mutex_init(&hmeblkp->hblk_audit_lock, NULL, MUTEX_DEFAULT, NULL);
9069 #endif /* HBLK_TRACE */
9070
9071 return (0);
9072 }
9073
9074 /* ARGSUSED */
9075 static void
sfmmu_hblkcache_destructor(void * buf,void * cdrarg)9076 sfmmu_hblkcache_destructor(void *buf, void *cdrarg)
9077 {
9078
9079 #ifdef HBLK_TRACE
9080
9081 struct hme_blk *hmeblkp;
9082
9083 hmeblkp = (struct hme_blk *)buf;
9084 mutex_destroy(&hmeblkp->hblk_audit_lock);
9085
9086 #endif /* HBLK_TRACE */
9087 }
9088
9089 #define SFMMU_CACHE_RECLAIM_SCAN_RATIO 8
9090 static int sfmmu_cache_reclaim_scan_ratio = SFMMU_CACHE_RECLAIM_SCAN_RATIO;
9091 /*
9092 * The kmem allocator will callback into our reclaim routine when the system
9093 * is running low in memory. We traverse the hash and free up all unused but
9094 * still cached hme_blks. We also traverse the free list and free them up
9095 * as well.
9096 */
9097 /*ARGSUSED*/
9098 static void
sfmmu_hblkcache_reclaim(void * cdrarg)9099 sfmmu_hblkcache_reclaim(void *cdrarg)
9100 {
9101 int i;
9102 struct hmehash_bucket *hmebp;
9103 struct hme_blk *hmeblkp, *nx_hblk, *pr_hblk = NULL;
9104 static struct hmehash_bucket *uhmehash_reclaim_hand;
9105 static struct hmehash_bucket *khmehash_reclaim_hand;
9106 struct hme_blk *list = NULL, *last_hmeblkp;
9107 cpuset_t cpuset = cpu_ready_set;
9108 cpu_hme_pend_t *cpuhp;
9109
9110 /* Free up hmeblks on the cpu pending lists */
9111 for (i = 0; i < NCPU; i++) {
9112 cpuhp = &cpu_hme_pend[i];
9113 if (cpuhp->chp_listp != NULL) {
9114 mutex_enter(&cpuhp->chp_mutex);
9115 if (cpuhp->chp_listp == NULL) {
9116 mutex_exit(&cpuhp->chp_mutex);
9117 continue;
9118 }
9119 for (last_hmeblkp = cpuhp->chp_listp;
9120 last_hmeblkp->hblk_next != NULL;
9121 last_hmeblkp = last_hmeblkp->hblk_next)
9122 ;
9123 last_hmeblkp->hblk_next = list;
9124 list = cpuhp->chp_listp;
9125 cpuhp->chp_listp = NULL;
9126 cpuhp->chp_count = 0;
9127 mutex_exit(&cpuhp->chp_mutex);
9128 }
9129
9130 }
9131
9132 if (list != NULL) {
9133 kpreempt_disable();
9134 CPUSET_DEL(cpuset, CPU->cpu_id);
9135 xt_sync(cpuset);
9136 xt_sync(cpuset);
9137 kpreempt_enable();
9138 sfmmu_hblk_free(&list);
9139 list = NULL;
9140 }
9141
9142 hmebp = uhmehash_reclaim_hand;
9143 if (hmebp == NULL || hmebp > &uhme_hash[UHMEHASH_SZ])
9144 uhmehash_reclaim_hand = hmebp = uhme_hash;
9145 uhmehash_reclaim_hand += UHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio;
9146
9147 for (i = UHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio; i; i--) {
9148 if (SFMMU_HASH_LOCK_TRYENTER(hmebp) != 0) {
9149 hmeblkp = hmebp->hmeblkp;
9150 pr_hblk = NULL;
9151 while (hmeblkp) {
9152 nx_hblk = hmeblkp->hblk_next;
9153 if (!hmeblkp->hblk_vcnt &&
9154 !hmeblkp->hblk_hmecnt) {
9155 sfmmu_hblk_hash_rm(hmebp, hmeblkp,
9156 pr_hblk, &list, 0);
9157 } else {
9158 pr_hblk = hmeblkp;
9159 }
9160 hmeblkp = nx_hblk;
9161 }
9162 SFMMU_HASH_UNLOCK(hmebp);
9163 }
9164 if (hmebp++ == &uhme_hash[UHMEHASH_SZ])
9165 hmebp = uhme_hash;
9166 }
9167
9168 hmebp = khmehash_reclaim_hand;
9169 if (hmebp == NULL || hmebp > &khme_hash[KHMEHASH_SZ])
9170 khmehash_reclaim_hand = hmebp = khme_hash;
9171 khmehash_reclaim_hand += KHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio;
9172
9173 for (i = KHMEHASH_SZ / sfmmu_cache_reclaim_scan_ratio; i; i--) {
9174 if (SFMMU_HASH_LOCK_TRYENTER(hmebp) != 0) {
9175 hmeblkp = hmebp->hmeblkp;
9176 pr_hblk = NULL;
9177 while (hmeblkp) {
9178 nx_hblk = hmeblkp->hblk_next;
9179 if (!hmeblkp->hblk_vcnt &&
9180 !hmeblkp->hblk_hmecnt) {
9181 sfmmu_hblk_hash_rm(hmebp, hmeblkp,
9182 pr_hblk, &list, 0);
9183 } else {
9184 pr_hblk = hmeblkp;
9185 }
9186 hmeblkp = nx_hblk;
9187 }
9188 SFMMU_HASH_UNLOCK(hmebp);
9189 }
9190 if (hmebp++ == &khme_hash[KHMEHASH_SZ])
9191 hmebp = khme_hash;
9192 }
9193 sfmmu_hblks_list_purge(&list, 0);
9194 }
9195
9196 /*
9197 * sfmmu_get_ppvcolor should become a vm_machdep or hatop interface.
9198 * same goes for sfmmu_get_addrvcolor().
9199 *
9200 * This function will return the virtual color for the specified page. The
9201 * virtual color corresponds to this page current mapping or its last mapping.
9202 * It is used by memory allocators to choose addresses with the correct
9203 * alignment so vac consistency is automatically maintained. If the page
9204 * has no color it returns -1.
9205 */
9206 /*ARGSUSED*/
9207 int
sfmmu_get_ppvcolor(struct page * pp)9208 sfmmu_get_ppvcolor(struct page *pp)
9209 {
9210 #ifdef VAC
9211 int color;
9212
9213 if (!(cache & CACHE_VAC) || PP_NEWPAGE(pp)) {
9214 return (-1);
9215 }
9216 color = PP_GET_VCOLOR(pp);
9217 ASSERT(color < mmu_btop(shm_alignment));
9218 return (color);
9219 #else
9220 return (-1);
9221 #endif /* VAC */
9222 }
9223
9224 /*
9225 * This function will return the desired alignment for vac consistency
9226 * (vac color) given a virtual address. If no vac is present it returns -1.
9227 */
9228 /*ARGSUSED*/
9229 int
sfmmu_get_addrvcolor(caddr_t vaddr)9230 sfmmu_get_addrvcolor(caddr_t vaddr)
9231 {
9232 #ifdef VAC
9233 if (cache & CACHE_VAC) {
9234 return (addr_to_vcolor(vaddr));
9235 } else {
9236 return (-1);
9237 }
9238 #else
9239 return (-1);
9240 #endif /* VAC */
9241 }
9242
9243 #ifdef VAC
9244 /*
9245 * Check for conflicts.
9246 * A conflict exists if the new and existent mappings do not match in
9247 * their "shm_alignment fields. If conflicts exist, the existant mappings
9248 * are flushed unless one of them is locked. If one of them is locked, then
9249 * the mappings are flushed and converted to non-cacheable mappings.
9250 */
9251 static void
sfmmu_vac_conflict(struct hat * hat,caddr_t addr,page_t * pp)9252 sfmmu_vac_conflict(struct hat *hat, caddr_t addr, page_t *pp)
9253 {
9254 struct hat *tmphat;
9255 struct sf_hment *sfhmep, *tmphme = NULL;
9256 struct hme_blk *hmeblkp;
9257 int vcolor;
9258 tte_t tte;
9259
9260 ASSERT(sfmmu_mlist_held(pp));
9261 ASSERT(!PP_ISNC(pp)); /* page better be cacheable */
9262
9263 vcolor = addr_to_vcolor(addr);
9264 if (PP_NEWPAGE(pp)) {
9265 PP_SET_VCOLOR(pp, vcolor);
9266 return;
9267 }
9268
9269 if (PP_GET_VCOLOR(pp) == vcolor) {
9270 return;
9271 }
9272
9273 if (!PP_ISMAPPED(pp) && !PP_ISMAPPED_KPM(pp)) {
9274 /*
9275 * Previous user of page had a different color
9276 * but since there are no current users
9277 * we just flush the cache and change the color.
9278 */
9279 SFMMU_STAT(sf_pgcolor_conflict);
9280 sfmmu_cache_flush(pp->p_pagenum, PP_GET_VCOLOR(pp));
9281 PP_SET_VCOLOR(pp, vcolor);
9282 return;
9283 }
9284
9285 /*
9286 * If we get here we have a vac conflict with a current
9287 * mapping. VAC conflict policy is as follows.
9288 * - The default is to unload the other mappings unless:
9289 * - If we have a large mapping we uncache the page.
9290 * We need to uncache the rest of the large page too.
9291 * - If any of the mappings are locked we uncache the page.
9292 * - If the requested mapping is inconsistent
9293 * with another mapping and that mapping
9294 * is in the same address space we have to
9295 * make it non-cached. The default thing
9296 * to do is unload the inconsistent mapping
9297 * but if they are in the same address space
9298 * we run the risk of unmapping the pc or the
9299 * stack which we will use as we return to the user,
9300 * in which case we can then fault on the thing
9301 * we just unloaded and get into an infinite loop.
9302 */
9303 if (PP_ISMAPPED_LARGE(pp)) {
9304 int sz;
9305
9306 /*
9307 * Existing mapping is for big pages. We don't unload
9308 * existing big mappings to satisfy new mappings.
9309 * Always convert all mappings to TNC.
9310 */
9311 sz = fnd_mapping_sz(pp);
9312 pp = PP_GROUPLEADER(pp, sz);
9313 SFMMU_STAT_ADD(sf_uncache_conflict, TTEPAGES(sz));
9314 sfmmu_page_cache_array(pp, HAT_TMPNC, CACHE_FLUSH,
9315 TTEPAGES(sz));
9316
9317 return;
9318 }
9319
9320 /*
9321 * check if any mapping is in same as or if it is locked
9322 * since in that case we need to uncache.
9323 */
9324 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = tmphme) {
9325 tmphme = sfhmep->hme_next;
9326 if (IS_PAHME(sfhmep))
9327 continue;
9328 hmeblkp = sfmmu_hmetohblk(sfhmep);
9329 if (hmeblkp->hblk_xhat_bit)
9330 continue;
9331 tmphat = hblktosfmmu(hmeblkp);
9332 sfmmu_copytte(&sfhmep->hme_tte, &tte);
9333 ASSERT(TTE_IS_VALID(&tte));
9334 if (hmeblkp->hblk_shared || tmphat == hat ||
9335 hmeblkp->hblk_lckcnt) {
9336 /*
9337 * We have an uncache conflict
9338 */
9339 SFMMU_STAT(sf_uncache_conflict);
9340 sfmmu_page_cache_array(pp, HAT_TMPNC, CACHE_FLUSH, 1);
9341 return;
9342 }
9343 }
9344
9345 /*
9346 * We have an unload conflict
9347 * We have already checked for LARGE mappings, therefore
9348 * the remaining mapping(s) must be TTE8K.
9349 */
9350 SFMMU_STAT(sf_unload_conflict);
9351
9352 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = tmphme) {
9353 tmphme = sfhmep->hme_next;
9354 if (IS_PAHME(sfhmep))
9355 continue;
9356 hmeblkp = sfmmu_hmetohblk(sfhmep);
9357 if (hmeblkp->hblk_xhat_bit)
9358 continue;
9359 ASSERT(!hmeblkp->hblk_shared);
9360 (void) sfmmu_pageunload(pp, sfhmep, TTE8K);
9361 }
9362
9363 if (PP_ISMAPPED_KPM(pp))
9364 sfmmu_kpm_vac_unload(pp, addr);
9365
9366 /*
9367 * Unloads only do TLB flushes so we need to flush the
9368 * cache here.
9369 */
9370 sfmmu_cache_flush(pp->p_pagenum, PP_GET_VCOLOR(pp));
9371 PP_SET_VCOLOR(pp, vcolor);
9372 }
9373
9374 /*
9375 * Whenever a mapping is unloaded and the page is in TNC state,
9376 * we see if the page can be made cacheable again. 'pp' is
9377 * the page that we just unloaded a mapping from, the size
9378 * of mapping that was unloaded is 'ottesz'.
9379 * Remark:
9380 * The recache policy for mpss pages can leave a performance problem
9381 * under the following circumstances:
9382 * . A large page in uncached mode has just been unmapped.
9383 * . All constituent pages are TNC due to a conflicting small mapping.
9384 * . There are many other, non conflicting, small mappings around for
9385 * a lot of the constituent pages.
9386 * . We're called w/ the "old" groupleader page and the old ottesz,
9387 * but this is irrelevant, since we're no more "PP_ISMAPPED_LARGE", so
9388 * we end up w/ TTE8K or npages == 1.
9389 * . We call tst_tnc w/ the old groupleader only, and if there is no
9390 * conflict, we re-cache only this page.
9391 * . All other small mappings are not checked and will be left in TNC mode.
9392 * The problem is not very serious because:
9393 * . mpss is actually only defined for heap and stack, so the probability
9394 * is not very high that a large page mapping exists in parallel to a small
9395 * one (this is possible, but seems to be bad programming style in the
9396 * appl).
9397 * . The problem gets a little bit more serious, when those TNC pages
9398 * have to be mapped into kernel space, e.g. for networking.
9399 * . When VAC alias conflicts occur in applications, this is regarded
9400 * as an application bug. So if kstat's show them, the appl should
9401 * be changed anyway.
9402 */
9403 void
conv_tnc(page_t * pp,int ottesz)9404 conv_tnc(page_t *pp, int ottesz)
9405 {
9406 int cursz, dosz;
9407 pgcnt_t curnpgs, dopgs;
9408 pgcnt_t pg64k;
9409 page_t *pp2;
9410
9411 /*
9412 * Determine how big a range we check for TNC and find
9413 * leader page. cursz is the size of the biggest
9414 * mapping that still exist on 'pp'.
9415 */
9416 if (PP_ISMAPPED_LARGE(pp)) {
9417 cursz = fnd_mapping_sz(pp);
9418 } else {
9419 cursz = TTE8K;
9420 }
9421
9422 if (ottesz >= cursz) {
9423 dosz = ottesz;
9424 pp2 = pp;
9425 } else {
9426 dosz = cursz;
9427 pp2 = PP_GROUPLEADER(pp, dosz);
9428 }
9429
9430 pg64k = TTEPAGES(TTE64K);
9431 dopgs = TTEPAGES(dosz);
9432
9433 ASSERT(dopgs == 1 || ((dopgs & (pg64k - 1)) == 0));
9434
9435 while (dopgs != 0) {
9436 curnpgs = TTEPAGES(cursz);
9437 if (tst_tnc(pp2, curnpgs)) {
9438 SFMMU_STAT_ADD(sf_recache, curnpgs);
9439 sfmmu_page_cache_array(pp2, HAT_CACHE, CACHE_NO_FLUSH,
9440 curnpgs);
9441 }
9442
9443 ASSERT(dopgs >= curnpgs);
9444 dopgs -= curnpgs;
9445
9446 if (dopgs == 0) {
9447 break;
9448 }
9449
9450 pp2 = PP_PAGENEXT_N(pp2, curnpgs);
9451 if (((dopgs & (pg64k - 1)) == 0) && PP_ISMAPPED_LARGE(pp2)) {
9452 cursz = fnd_mapping_sz(pp2);
9453 } else {
9454 cursz = TTE8K;
9455 }
9456 }
9457 }
9458
9459 /*
9460 * Returns 1 if page(s) can be converted from TNC to cacheable setting,
9461 * returns 0 otherwise. Note that oaddr argument is valid for only
9462 * 8k pages.
9463 */
9464 int
tst_tnc(page_t * pp,pgcnt_t npages)9465 tst_tnc(page_t *pp, pgcnt_t npages)
9466 {
9467 struct sf_hment *sfhme;
9468 struct hme_blk *hmeblkp;
9469 tte_t tte;
9470 caddr_t vaddr;
9471 int clr_valid = 0;
9472 int color, color1, bcolor;
9473 int i, ncolors;
9474
9475 ASSERT(pp != NULL);
9476 ASSERT(!(cache & CACHE_WRITEBACK));
9477
9478 if (npages > 1) {
9479 ncolors = CACHE_NUM_COLOR;
9480 }
9481
9482 for (i = 0; i < npages; i++) {
9483 ASSERT(sfmmu_mlist_held(pp));
9484 ASSERT(PP_ISTNC(pp));
9485 ASSERT(PP_GET_VCOLOR(pp) == NO_VCOLOR);
9486
9487 if (PP_ISPNC(pp)) {
9488 return (0);
9489 }
9490
9491 clr_valid = 0;
9492 if (PP_ISMAPPED_KPM(pp)) {
9493 caddr_t kpmvaddr;
9494
9495 ASSERT(kpm_enable);
9496 kpmvaddr = hat_kpm_page2va(pp, 1);
9497 ASSERT(!(npages > 1 && IS_KPM_ALIAS_RANGE(kpmvaddr)));
9498 color1 = addr_to_vcolor(kpmvaddr);
9499 clr_valid = 1;
9500 }
9501
9502 for (sfhme = pp->p_mapping; sfhme; sfhme = sfhme->hme_next) {
9503 if (IS_PAHME(sfhme))
9504 continue;
9505 hmeblkp = sfmmu_hmetohblk(sfhme);
9506 if (hmeblkp->hblk_xhat_bit)
9507 continue;
9508
9509 sfmmu_copytte(&sfhme->hme_tte, &tte);
9510 ASSERT(TTE_IS_VALID(&tte));
9511
9512 vaddr = tte_to_vaddr(hmeblkp, tte);
9513 color = addr_to_vcolor(vaddr);
9514
9515 if (npages > 1) {
9516 /*
9517 * If there is a big mapping, make sure
9518 * 8K mapping is consistent with the big
9519 * mapping.
9520 */
9521 bcolor = i % ncolors;
9522 if (color != bcolor) {
9523 return (0);
9524 }
9525 }
9526 if (!clr_valid) {
9527 clr_valid = 1;
9528 color1 = color;
9529 }
9530
9531 if (color1 != color) {
9532 return (0);
9533 }
9534 }
9535
9536 pp = PP_PAGENEXT(pp);
9537 }
9538
9539 return (1);
9540 }
9541
9542 void
sfmmu_page_cache_array(page_t * pp,int flags,int cache_flush_flag,pgcnt_t npages)9543 sfmmu_page_cache_array(page_t *pp, int flags, int cache_flush_flag,
9544 pgcnt_t npages)
9545 {
9546 kmutex_t *pmtx;
9547 int i, ncolors, bcolor;
9548 kpm_hlk_t *kpmp;
9549 cpuset_t cpuset;
9550
9551 ASSERT(pp != NULL);
9552 ASSERT(!(cache & CACHE_WRITEBACK));
9553
9554 kpmp = sfmmu_kpm_kpmp_enter(pp, npages);
9555 pmtx = sfmmu_page_enter(pp);
9556
9557 /*
9558 * Fast path caching single unmapped page
9559 */
9560 if (npages == 1 && !PP_ISMAPPED(pp) && !PP_ISMAPPED_KPM(pp) &&
9561 flags == HAT_CACHE) {
9562 PP_CLRTNC(pp);
9563 PP_CLRPNC(pp);
9564 sfmmu_page_exit(pmtx);
9565 sfmmu_kpm_kpmp_exit(kpmp);
9566 return;
9567 }
9568
9569 /*
9570 * We need to capture all cpus in order to change cacheability
9571 * because we can't allow one cpu to access the same physical
9572 * page using a cacheable and a non-cachebale mapping at the same
9573 * time. Since we may end up walking the ism mapping list
9574 * have to grab it's lock now since we can't after all the
9575 * cpus have been captured.
9576 */
9577 sfmmu_hat_lock_all();
9578 mutex_enter(&ism_mlist_lock);
9579 kpreempt_disable();
9580 cpuset = cpu_ready_set;
9581 xc_attention(cpuset);
9582
9583 if (npages > 1) {
9584 /*
9585 * Make sure all colors are flushed since the
9586 * sfmmu_page_cache() only flushes one color-
9587 * it does not know big pages.
9588 */
9589 ncolors = CACHE_NUM_COLOR;
9590 if (flags & HAT_TMPNC) {
9591 for (i = 0; i < ncolors; i++) {
9592 sfmmu_cache_flushcolor(i, pp->p_pagenum);
9593 }
9594 cache_flush_flag = CACHE_NO_FLUSH;
9595 }
9596 }
9597
9598 for (i = 0; i < npages; i++) {
9599
9600 ASSERT(sfmmu_mlist_held(pp));
9601
9602 if (!(flags == HAT_TMPNC && PP_ISTNC(pp))) {
9603
9604 if (npages > 1) {
9605 bcolor = i % ncolors;
9606 } else {
9607 bcolor = NO_VCOLOR;
9608 }
9609
9610 sfmmu_page_cache(pp, flags, cache_flush_flag,
9611 bcolor);
9612 }
9613
9614 pp = PP_PAGENEXT(pp);
9615 }
9616
9617 xt_sync(cpuset);
9618 xc_dismissed(cpuset);
9619 mutex_exit(&ism_mlist_lock);
9620 sfmmu_hat_unlock_all();
9621 sfmmu_page_exit(pmtx);
9622 sfmmu_kpm_kpmp_exit(kpmp);
9623 kpreempt_enable();
9624 }
9625
9626 /*
9627 * This function changes the virtual cacheability of all mappings to a
9628 * particular page. When changing from uncache to cacheable the mappings will
9629 * only be changed if all of them have the same virtual color.
9630 * We need to flush the cache in all cpus. It is possible that
9631 * a process referenced a page as cacheable but has sinced exited
9632 * and cleared the mapping list. We still to flush it but have no
9633 * state so all cpus is the only alternative.
9634 */
9635 static void
sfmmu_page_cache(page_t * pp,int flags,int cache_flush_flag,int bcolor)9636 sfmmu_page_cache(page_t *pp, int flags, int cache_flush_flag, int bcolor)
9637 {
9638 struct sf_hment *sfhme;
9639 struct hme_blk *hmeblkp;
9640 sfmmu_t *sfmmup;
9641 tte_t tte, ttemod;
9642 caddr_t vaddr;
9643 int ret, color;
9644 pfn_t pfn;
9645
9646 color = bcolor;
9647 pfn = pp->p_pagenum;
9648
9649 for (sfhme = pp->p_mapping; sfhme; sfhme = sfhme->hme_next) {
9650
9651 if (IS_PAHME(sfhme))
9652 continue;
9653 hmeblkp = sfmmu_hmetohblk(sfhme);
9654
9655 if (hmeblkp->hblk_xhat_bit)
9656 continue;
9657
9658 sfmmu_copytte(&sfhme->hme_tte, &tte);
9659 ASSERT(TTE_IS_VALID(&tte));
9660 vaddr = tte_to_vaddr(hmeblkp, tte);
9661 color = addr_to_vcolor(vaddr);
9662
9663 #ifdef DEBUG
9664 if ((flags & HAT_CACHE) && bcolor != NO_VCOLOR) {
9665 ASSERT(color == bcolor);
9666 }
9667 #endif
9668
9669 ASSERT(flags != HAT_TMPNC || color == PP_GET_VCOLOR(pp));
9670
9671 ttemod = tte;
9672 if (flags & (HAT_UNCACHE | HAT_TMPNC)) {
9673 TTE_CLR_VCACHEABLE(&ttemod);
9674 } else { /* flags & HAT_CACHE */
9675 TTE_SET_VCACHEABLE(&ttemod);
9676 }
9677 ret = sfmmu_modifytte_try(&tte, &ttemod, &sfhme->hme_tte);
9678 if (ret < 0) {
9679 /*
9680 * Since all cpus are captured modifytte should not
9681 * fail.
9682 */
9683 panic("sfmmu_page_cache: write to tte failed");
9684 }
9685
9686 sfmmup = hblktosfmmu(hmeblkp);
9687 if (cache_flush_flag == CACHE_FLUSH) {
9688 /*
9689 * Flush TSBs, TLBs and caches
9690 */
9691 if (hmeblkp->hblk_shared) {
9692 sf_srd_t *srdp = (sf_srd_t *)sfmmup;
9693 uint_t rid = hmeblkp->hblk_tag.htag_rid;
9694 sf_region_t *rgnp;
9695 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
9696 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
9697 ASSERT(srdp != NULL);
9698 rgnp = srdp->srd_hmergnp[rid];
9699 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp,
9700 srdp, rgnp, rid);
9701 (void) sfmmu_rgntlb_demap(vaddr, rgnp,
9702 hmeblkp, 0);
9703 sfmmu_cache_flush(pfn, addr_to_vcolor(vaddr));
9704 } else if (sfmmup->sfmmu_ismhat) {
9705 if (flags & HAT_CACHE) {
9706 SFMMU_STAT(sf_ism_recache);
9707 } else {
9708 SFMMU_STAT(sf_ism_uncache);
9709 }
9710 sfmmu_ismtlbcache_demap(vaddr, sfmmup, hmeblkp,
9711 pfn, CACHE_FLUSH);
9712 } else {
9713 sfmmu_tlbcache_demap(vaddr, sfmmup, hmeblkp,
9714 pfn, 0, FLUSH_ALL_CPUS, CACHE_FLUSH, 1);
9715 }
9716
9717 /*
9718 * all cache entries belonging to this pfn are
9719 * now flushed.
9720 */
9721 cache_flush_flag = CACHE_NO_FLUSH;
9722 } else {
9723 /*
9724 * Flush only TSBs and TLBs.
9725 */
9726 if (hmeblkp->hblk_shared) {
9727 sf_srd_t *srdp = (sf_srd_t *)sfmmup;
9728 uint_t rid = hmeblkp->hblk_tag.htag_rid;
9729 sf_region_t *rgnp;
9730 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
9731 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
9732 ASSERT(srdp != NULL);
9733 rgnp = srdp->srd_hmergnp[rid];
9734 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp,
9735 srdp, rgnp, rid);
9736 (void) sfmmu_rgntlb_demap(vaddr, rgnp,
9737 hmeblkp, 0);
9738 } else if (sfmmup->sfmmu_ismhat) {
9739 if (flags & HAT_CACHE) {
9740 SFMMU_STAT(sf_ism_recache);
9741 } else {
9742 SFMMU_STAT(sf_ism_uncache);
9743 }
9744 sfmmu_ismtlbcache_demap(vaddr, sfmmup, hmeblkp,
9745 pfn, CACHE_NO_FLUSH);
9746 } else {
9747 sfmmu_tlb_demap(vaddr, sfmmup, hmeblkp, 0, 1);
9748 }
9749 }
9750 }
9751
9752 if (PP_ISMAPPED_KPM(pp))
9753 sfmmu_kpm_page_cache(pp, flags, cache_flush_flag);
9754
9755 switch (flags) {
9756
9757 default:
9758 panic("sfmmu_pagecache: unknown flags");
9759 break;
9760
9761 case HAT_CACHE:
9762 PP_CLRTNC(pp);
9763 PP_CLRPNC(pp);
9764 PP_SET_VCOLOR(pp, color);
9765 break;
9766
9767 case HAT_TMPNC:
9768 PP_SETTNC(pp);
9769 PP_SET_VCOLOR(pp, NO_VCOLOR);
9770 break;
9771
9772 case HAT_UNCACHE:
9773 PP_SETPNC(pp);
9774 PP_CLRTNC(pp);
9775 PP_SET_VCOLOR(pp, NO_VCOLOR);
9776 break;
9777 }
9778 }
9779 #endif /* VAC */
9780
9781
9782 /*
9783 * Wrapper routine used to return a context.
9784 *
9785 * It's the responsibility of the caller to guarantee that the
9786 * process serializes on calls here by taking the HAT lock for
9787 * the hat.
9788 *
9789 */
9790 static void
sfmmu_get_ctx(sfmmu_t * sfmmup)9791 sfmmu_get_ctx(sfmmu_t *sfmmup)
9792 {
9793 mmu_ctx_t *mmu_ctxp;
9794 uint_t pstate_save;
9795 int ret;
9796
9797 ASSERT(sfmmu_hat_lock_held(sfmmup));
9798 ASSERT(sfmmup != ksfmmup);
9799
9800 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_ALLCTX_INVALID)) {
9801 sfmmu_setup_tsbinfo(sfmmup);
9802 SFMMU_FLAGS_CLEAR(sfmmup, HAT_ALLCTX_INVALID);
9803 }
9804
9805 kpreempt_disable();
9806
9807 mmu_ctxp = CPU_MMU_CTXP(CPU);
9808 ASSERT(mmu_ctxp);
9809 ASSERT(mmu_ctxp->mmu_idx < max_mmu_ctxdoms);
9810 ASSERT(mmu_ctxp == mmu_ctxs_tbl[mmu_ctxp->mmu_idx]);
9811
9812 /*
9813 * Do a wrap-around if cnum reaches the max # cnum supported by a MMU.
9814 */
9815 if (mmu_ctxp->mmu_cnum == mmu_ctxp->mmu_nctxs)
9816 sfmmu_ctx_wrap_around(mmu_ctxp, B_TRUE);
9817
9818 /*
9819 * Let the MMU set up the page sizes to use for
9820 * this context in the TLB. Don't program 2nd dtlb for ism hat.
9821 */
9822 if ((&mmu_set_ctx_page_sizes) && (sfmmup->sfmmu_ismhat == 0)) {
9823 mmu_set_ctx_page_sizes(sfmmup);
9824 }
9825
9826 /*
9827 * sfmmu_alloc_ctx and sfmmu_load_mmustate will be performed with
9828 * interrupts disabled to prevent race condition with wrap-around
9829 * ctx invalidatation. In sun4v, ctx invalidation also involves
9830 * a HV call to set the number of TSBs to 0. If interrupts are not
9831 * disabled until after sfmmu_load_mmustate is complete TSBs may
9832 * become assigned to INVALID_CONTEXT. This is not allowed.
9833 */
9834 pstate_save = sfmmu_disable_intrs();
9835
9836 if (sfmmu_alloc_ctx(sfmmup, 1, CPU, SFMMU_PRIVATE) &&
9837 sfmmup->sfmmu_scdp != NULL) {
9838 sf_scd_t *scdp = sfmmup->sfmmu_scdp;
9839 sfmmu_t *scsfmmup = scdp->scd_sfmmup;
9840 ret = sfmmu_alloc_ctx(scsfmmup, 1, CPU, SFMMU_SHARED);
9841 /* debug purpose only */
9842 ASSERT(!ret || scsfmmup->sfmmu_ctxs[CPU_MMU_IDX(CPU)].cnum
9843 != INVALID_CONTEXT);
9844 }
9845 sfmmu_load_mmustate(sfmmup);
9846
9847 sfmmu_enable_intrs(pstate_save);
9848
9849 kpreempt_enable();
9850 }
9851
9852 /*
9853 * When all cnums are used up in a MMU, cnum will wrap around to the
9854 * next generation and start from 2.
9855 */
9856 static void
sfmmu_ctx_wrap_around(mmu_ctx_t * mmu_ctxp,boolean_t reset_cnum)9857 sfmmu_ctx_wrap_around(mmu_ctx_t *mmu_ctxp, boolean_t reset_cnum)
9858 {
9859
9860 /* caller must have disabled the preemption */
9861 ASSERT(curthread->t_preempt >= 1);
9862 ASSERT(mmu_ctxp != NULL);
9863
9864 /* acquire Per-MMU (PM) spin lock */
9865 mutex_enter(&mmu_ctxp->mmu_lock);
9866
9867 /* re-check to see if wrap-around is needed */
9868 if (mmu_ctxp->mmu_cnum < mmu_ctxp->mmu_nctxs)
9869 goto done;
9870
9871 SFMMU_MMU_STAT(mmu_wrap_around);
9872
9873 /* update gnum */
9874 ASSERT(mmu_ctxp->mmu_gnum != 0);
9875 mmu_ctxp->mmu_gnum++;
9876 if (mmu_ctxp->mmu_gnum == 0 ||
9877 mmu_ctxp->mmu_gnum > MAX_SFMMU_GNUM_VAL) {
9878 cmn_err(CE_PANIC, "mmu_gnum of mmu_ctx 0x%p is out of bound.",
9879 (void *)mmu_ctxp);
9880 }
9881
9882 if (mmu_ctxp->mmu_ncpus > 1) {
9883 cpuset_t cpuset;
9884
9885 membar_enter(); /* make sure updated gnum visible */
9886
9887 SFMMU_XCALL_STATS(NULL);
9888
9889 /* xcall to others on the same MMU to invalidate ctx */
9890 cpuset = mmu_ctxp->mmu_cpuset;
9891 ASSERT(CPU_IN_SET(cpuset, CPU->cpu_id) || !reset_cnum);
9892 CPUSET_DEL(cpuset, CPU->cpu_id);
9893 CPUSET_AND(cpuset, cpu_ready_set);
9894
9895 /*
9896 * Pass in INVALID_CONTEXT as the first parameter to
9897 * sfmmu_raise_tsb_exception, which invalidates the context
9898 * of any process running on the CPUs in the MMU.
9899 */
9900 xt_some(cpuset, sfmmu_raise_tsb_exception,
9901 INVALID_CONTEXT, INVALID_CONTEXT);
9902 xt_sync(cpuset);
9903
9904 SFMMU_MMU_STAT(mmu_tsb_raise_exception);
9905 }
9906
9907 if (sfmmu_getctx_sec() != INVALID_CONTEXT) {
9908 sfmmu_setctx_sec(INVALID_CONTEXT);
9909 sfmmu_clear_utsbinfo();
9910 }
9911
9912 /*
9913 * No xcall is needed here. For sun4u systems all CPUs in context
9914 * domain share a single physical MMU therefore it's enough to flush
9915 * TLB on local CPU. On sun4v systems we use 1 global context
9916 * domain and flush all remote TLBs in sfmmu_raise_tsb_exception
9917 * handler. Note that vtag_flushall_uctxs() is called
9918 * for Ultra II machine, where the equivalent flushall functionality
9919 * is implemented in SW, and only user ctx TLB entries are flushed.
9920 */
9921 if (&vtag_flushall_uctxs != NULL) {
9922 vtag_flushall_uctxs();
9923 } else {
9924 vtag_flushall();
9925 }
9926
9927 /* reset mmu cnum, skips cnum 0 and 1 */
9928 if (reset_cnum == B_TRUE)
9929 mmu_ctxp->mmu_cnum = NUM_LOCKED_CTXS;
9930
9931 done:
9932 mutex_exit(&mmu_ctxp->mmu_lock);
9933 }
9934
9935
9936 /*
9937 * For multi-threaded process, set the process context to INVALID_CONTEXT
9938 * so that it faults and reloads the MMU state from TL=0. For single-threaded
9939 * process, we can just load the MMU state directly without having to
9940 * set context invalid. Caller must hold the hat lock since we don't
9941 * acquire it here.
9942 */
9943 static void
sfmmu_sync_mmustate(sfmmu_t * sfmmup)9944 sfmmu_sync_mmustate(sfmmu_t *sfmmup)
9945 {
9946 uint_t cnum;
9947 uint_t pstate_save;
9948
9949 ASSERT(sfmmup != ksfmmup);
9950 ASSERT(sfmmu_hat_lock_held(sfmmup));
9951
9952 kpreempt_disable();
9953
9954 /*
9955 * We check whether the pass'ed-in sfmmup is the same as the
9956 * current running proc. This is to makes sure the current proc
9957 * stays single-threaded if it already is.
9958 */
9959 if ((sfmmup == curthread->t_procp->p_as->a_hat) &&
9960 (curthread->t_procp->p_lwpcnt == 1)) {
9961 /* single-thread */
9962 cnum = sfmmup->sfmmu_ctxs[CPU_MMU_IDX(CPU)].cnum;
9963 if (cnum != INVALID_CONTEXT) {
9964 uint_t curcnum;
9965 /*
9966 * Disable interrupts to prevent race condition
9967 * with sfmmu_ctx_wrap_around ctx invalidation.
9968 * In sun4v, ctx invalidation involves setting
9969 * TSB to NULL, hence, interrupts should be disabled
9970 * untill after sfmmu_load_mmustate is completed.
9971 */
9972 pstate_save = sfmmu_disable_intrs();
9973 curcnum = sfmmu_getctx_sec();
9974 if (curcnum == cnum)
9975 sfmmu_load_mmustate(sfmmup);
9976 sfmmu_enable_intrs(pstate_save);
9977 ASSERT(curcnum == cnum || curcnum == INVALID_CONTEXT);
9978 }
9979 } else {
9980 /*
9981 * multi-thread
9982 * or when sfmmup is not the same as the curproc.
9983 */
9984 sfmmu_invalidate_ctx(sfmmup);
9985 }
9986
9987 kpreempt_enable();
9988 }
9989
9990
9991 /*
9992 * Replace the specified TSB with a new TSB. This function gets called when
9993 * we grow, shrink or swapin a TSB. When swapping in a TSB (TSB_SWAPIN), the
9994 * TSB_FORCEALLOC flag may be used to force allocation of a minimum-sized TSB
9995 * (8K).
9996 *
9997 * Caller must hold the HAT lock, but should assume any tsb_info
9998 * pointers it has are no longer valid after calling this function.
9999 *
10000 * Return values:
10001 * TSB_ALLOCFAIL Failed to allocate a TSB, due to memory constraints
10002 * TSB_LOSTRACE HAT is busy, i.e. another thread is already doing
10003 * something to this tsbinfo/TSB
10004 * TSB_SUCCESS Operation succeeded
10005 */
10006 static tsb_replace_rc_t
sfmmu_replace_tsb(sfmmu_t * sfmmup,struct tsb_info * old_tsbinfo,uint_t szc,hatlock_t * hatlockp,uint_t flags)10007 sfmmu_replace_tsb(sfmmu_t *sfmmup, struct tsb_info *old_tsbinfo, uint_t szc,
10008 hatlock_t *hatlockp, uint_t flags)
10009 {
10010 struct tsb_info *new_tsbinfo = NULL;
10011 struct tsb_info *curtsb, *prevtsb;
10012 uint_t tte_sz_mask;
10013 int i;
10014
10015 ASSERT(sfmmup != ksfmmup);
10016 ASSERT(sfmmup->sfmmu_ismhat == 0);
10017 ASSERT(sfmmu_hat_lock_held(sfmmup));
10018 ASSERT(szc <= tsb_max_growsize);
10019
10020 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_BUSY))
10021 return (TSB_LOSTRACE);
10022
10023 /*
10024 * Find the tsb_info ahead of this one in the list, and
10025 * also make sure that the tsb_info passed in really
10026 * exists!
10027 */
10028 for (prevtsb = NULL, curtsb = sfmmup->sfmmu_tsb;
10029 curtsb != old_tsbinfo && curtsb != NULL;
10030 prevtsb = curtsb, curtsb = curtsb->tsb_next)
10031 ;
10032 ASSERT(curtsb != NULL);
10033
10034 if (!(flags & TSB_SWAPIN) && SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) {
10035 /*
10036 * The process is swapped out, so just set the new size
10037 * code. When it swaps back in, we'll allocate a new one
10038 * of the new chosen size.
10039 */
10040 curtsb->tsb_szc = szc;
10041 return (TSB_SUCCESS);
10042 }
10043 SFMMU_FLAGS_SET(sfmmup, HAT_BUSY);
10044
10045 tte_sz_mask = old_tsbinfo->tsb_ttesz_mask;
10046
10047 /*
10048 * All initialization is done inside of sfmmu_tsbinfo_alloc().
10049 * If we fail to allocate a TSB, exit.
10050 *
10051 * If tsb grows with new tsb size > 4M and old tsb size < 4M,
10052 * then try 4M slab after the initial alloc fails.
10053 *
10054 * If tsb swapin with tsb size > 4M, then try 4M after the
10055 * initial alloc fails.
10056 */
10057 sfmmu_hat_exit(hatlockp);
10058 if (sfmmu_tsbinfo_alloc(&new_tsbinfo, szc,
10059 tte_sz_mask, flags, sfmmup) &&
10060 (!(flags & (TSB_GROW | TSB_SWAPIN)) || (szc <= TSB_4M_SZCODE) ||
10061 (!(flags & TSB_SWAPIN) &&
10062 (old_tsbinfo->tsb_szc >= TSB_4M_SZCODE)) ||
10063 sfmmu_tsbinfo_alloc(&new_tsbinfo, TSB_4M_SZCODE,
10064 tte_sz_mask, flags, sfmmup))) {
10065 (void) sfmmu_hat_enter(sfmmup);
10066 if (!(flags & TSB_SWAPIN))
10067 SFMMU_STAT(sf_tsb_resize_failures);
10068 SFMMU_FLAGS_CLEAR(sfmmup, HAT_BUSY);
10069 return (TSB_ALLOCFAIL);
10070 }
10071 (void) sfmmu_hat_enter(sfmmup);
10072
10073 /*
10074 * Re-check to make sure somebody else didn't muck with us while we
10075 * didn't hold the HAT lock. If the process swapped out, fine, just
10076 * exit; this can happen if we try to shrink the TSB from the context
10077 * of another process (such as on an ISM unmap), though it is rare.
10078 */
10079 if (!(flags & TSB_SWAPIN) && SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) {
10080 SFMMU_STAT(sf_tsb_resize_failures);
10081 SFMMU_FLAGS_CLEAR(sfmmup, HAT_BUSY);
10082 sfmmu_hat_exit(hatlockp);
10083 sfmmu_tsbinfo_free(new_tsbinfo);
10084 (void) sfmmu_hat_enter(sfmmup);
10085 return (TSB_LOSTRACE);
10086 }
10087
10088 #ifdef DEBUG
10089 /* Reverify that the tsb_info still exists.. for debugging only */
10090 for (prevtsb = NULL, curtsb = sfmmup->sfmmu_tsb;
10091 curtsb != old_tsbinfo && curtsb != NULL;
10092 prevtsb = curtsb, curtsb = curtsb->tsb_next)
10093 ;
10094 ASSERT(curtsb != NULL);
10095 #endif /* DEBUG */
10096
10097 /*
10098 * Quiesce any CPUs running this process on their next TLB miss
10099 * so they atomically see the new tsb_info. We temporarily set the
10100 * context to invalid context so new threads that come on processor
10101 * after we do the xcall to cpusran will also serialize behind the
10102 * HAT lock on TLB miss and will see the new TSB. Since this short
10103 * race with a new thread coming on processor is relatively rare,
10104 * this synchronization mechanism should be cheaper than always
10105 * pausing all CPUs for the duration of the setup, which is what
10106 * the old implementation did. This is particuarly true if we are
10107 * copying a huge chunk of memory around during that window.
10108 *
10109 * The memory barriers are to make sure things stay consistent
10110 * with resume() since it does not hold the HAT lock while
10111 * walking the list of tsb_info structures.
10112 */
10113 if ((flags & TSB_SWAPIN) != TSB_SWAPIN) {
10114 /* The TSB is either growing or shrinking. */
10115 sfmmu_invalidate_ctx(sfmmup);
10116 } else {
10117 /*
10118 * It is illegal to swap in TSBs from a process other
10119 * than a process being swapped in. This in turn
10120 * implies we do not have a valid MMU context here
10121 * since a process needs one to resolve translation
10122 * misses.
10123 */
10124 ASSERT(curthread->t_procp->p_as->a_hat == sfmmup);
10125 }
10126
10127 #ifdef DEBUG
10128 ASSERT(max_mmu_ctxdoms > 0);
10129
10130 /*
10131 * Process should have INVALID_CONTEXT on all MMUs
10132 */
10133 for (i = 0; i < max_mmu_ctxdoms; i++) {
10134
10135 ASSERT(sfmmup->sfmmu_ctxs[i].cnum == INVALID_CONTEXT);
10136 }
10137 #endif
10138
10139 new_tsbinfo->tsb_next = old_tsbinfo->tsb_next;
10140 membar_stst(); /* strict ordering required */
10141 if (prevtsb)
10142 prevtsb->tsb_next = new_tsbinfo;
10143 else
10144 sfmmup->sfmmu_tsb = new_tsbinfo;
10145 membar_enter(); /* make sure new TSB globally visible */
10146
10147 /*
10148 * We need to migrate TSB entries from the old TSB to the new TSB
10149 * if tsb_remap_ttes is set and the TSB is growing.
10150 */
10151 if (tsb_remap_ttes && ((flags & TSB_GROW) == TSB_GROW))
10152 sfmmu_copy_tsb(old_tsbinfo, new_tsbinfo);
10153
10154 SFMMU_FLAGS_CLEAR(sfmmup, HAT_BUSY);
10155
10156 /*
10157 * Drop the HAT lock to free our old tsb_info.
10158 */
10159 sfmmu_hat_exit(hatlockp);
10160
10161 if ((flags & TSB_GROW) == TSB_GROW) {
10162 SFMMU_STAT(sf_tsb_grow);
10163 } else if ((flags & TSB_SHRINK) == TSB_SHRINK) {
10164 SFMMU_STAT(sf_tsb_shrink);
10165 }
10166
10167 sfmmu_tsbinfo_free(old_tsbinfo);
10168
10169 (void) sfmmu_hat_enter(sfmmup);
10170 return (TSB_SUCCESS);
10171 }
10172
10173 /*
10174 * This function will re-program hat pgsz array, and invalidate the
10175 * process' context, forcing the process to switch to another
10176 * context on the next TLB miss, and therefore start using the
10177 * TLB that is reprogrammed for the new page sizes.
10178 */
10179 void
sfmmu_reprog_pgsz_arr(sfmmu_t * sfmmup,uint8_t * tmp_pgsz)10180 sfmmu_reprog_pgsz_arr(sfmmu_t *sfmmup, uint8_t *tmp_pgsz)
10181 {
10182 int i;
10183 hatlock_t *hatlockp = NULL;
10184
10185 hatlockp = sfmmu_hat_enter(sfmmup);
10186 /* USIII+-IV+ optimization, requires hat lock */
10187 if (tmp_pgsz) {
10188 for (i = 0; i < mmu_page_sizes; i++)
10189 sfmmup->sfmmu_pgsz[i] = tmp_pgsz[i];
10190 }
10191 SFMMU_STAT(sf_tlb_reprog_pgsz);
10192
10193 sfmmu_invalidate_ctx(sfmmup);
10194
10195 sfmmu_hat_exit(hatlockp);
10196 }
10197
10198 /*
10199 * The scd_rttecnt field in the SCD must be updated to take account of the
10200 * regions which it contains.
10201 */
10202 static void
sfmmu_set_scd_rttecnt(sf_srd_t * srdp,sf_scd_t * scdp)10203 sfmmu_set_scd_rttecnt(sf_srd_t *srdp, sf_scd_t *scdp)
10204 {
10205 uint_t rid;
10206 uint_t i, j;
10207 ulong_t w;
10208 sf_region_t *rgnp;
10209
10210 ASSERT(srdp != NULL);
10211
10212 for (i = 0; i < SFMMU_HMERGNMAP_WORDS; i++) {
10213 if ((w = scdp->scd_region_map.bitmap[i]) == 0) {
10214 continue;
10215 }
10216
10217 j = 0;
10218 while (w) {
10219 if (!(w & 0x1)) {
10220 j++;
10221 w >>= 1;
10222 continue;
10223 }
10224 rid = (i << BT_ULSHIFT) | j;
10225 j++;
10226 w >>= 1;
10227
10228 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
10229 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
10230 rgnp = srdp->srd_hmergnp[rid];
10231 ASSERT(rgnp->rgn_refcnt > 0);
10232 ASSERT(rgnp->rgn_id == rid);
10233
10234 scdp->scd_rttecnt[rgnp->rgn_pgszc] +=
10235 rgnp->rgn_size >> TTE_PAGE_SHIFT(rgnp->rgn_pgszc);
10236
10237 /*
10238 * Maintain the tsb0 inflation cnt for the regions
10239 * in the SCD.
10240 */
10241 if (rgnp->rgn_pgszc >= TTE4M) {
10242 scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt +=
10243 rgnp->rgn_size >>
10244 (TTE_PAGE_SHIFT(TTE8K) + 2);
10245 }
10246 }
10247 }
10248 }
10249
10250 /*
10251 * This function assumes that there are either four or six supported page
10252 * sizes and at most two programmable TLBs, so we need to decide which
10253 * page sizes are most important and then tell the MMU layer so it
10254 * can adjust the TLB page sizes accordingly (if supported).
10255 *
10256 * If these assumptions change, this function will need to be
10257 * updated to support whatever the new limits are.
10258 *
10259 * The growing flag is nonzero if we are growing the address space,
10260 * and zero if it is shrinking. This allows us to decide whether
10261 * to grow or shrink our TSB, depending upon available memory
10262 * conditions.
10263 */
10264 static void
sfmmu_check_page_sizes(sfmmu_t * sfmmup,int growing)10265 sfmmu_check_page_sizes(sfmmu_t *sfmmup, int growing)
10266 {
10267 uint64_t ttecnt[MMU_PAGE_SIZES];
10268 uint64_t tte8k_cnt, tte4m_cnt;
10269 uint8_t i;
10270 int sectsb_thresh;
10271
10272 /*
10273 * Kernel threads, processes with small address spaces not using
10274 * large pages, and dummy ISM HATs need not apply.
10275 */
10276 if (sfmmup == ksfmmup || sfmmup->sfmmu_ismhat != NULL)
10277 return;
10278
10279 if (!SFMMU_LGPGS_INUSE(sfmmup) &&
10280 sfmmup->sfmmu_ttecnt[TTE8K] <= tsb_rss_factor)
10281 return;
10282
10283 for (i = 0; i < mmu_page_sizes; i++) {
10284 ttecnt[i] = sfmmup->sfmmu_ttecnt[i] +
10285 sfmmup->sfmmu_ismttecnt[i];
10286 }
10287
10288 /* Check pagesizes in use, and possibly reprogram DTLB. */
10289 if (&mmu_check_page_sizes)
10290 mmu_check_page_sizes(sfmmup, ttecnt);
10291
10292 /*
10293 * Calculate the number of 8k ttes to represent the span of these
10294 * pages.
10295 */
10296 tte8k_cnt = ttecnt[TTE8K] +
10297 (ttecnt[TTE64K] << (MMU_PAGESHIFT64K - MMU_PAGESHIFT)) +
10298 (ttecnt[TTE512K] << (MMU_PAGESHIFT512K - MMU_PAGESHIFT));
10299 if (mmu_page_sizes == max_mmu_page_sizes) {
10300 tte4m_cnt = ttecnt[TTE4M] +
10301 (ttecnt[TTE32M] << (MMU_PAGESHIFT32M - MMU_PAGESHIFT4M)) +
10302 (ttecnt[TTE256M] << (MMU_PAGESHIFT256M - MMU_PAGESHIFT4M));
10303 } else {
10304 tte4m_cnt = ttecnt[TTE4M];
10305 }
10306
10307 /*
10308 * Inflate tte8k_cnt to allow for region large page allocation failure.
10309 */
10310 tte8k_cnt += sfmmup->sfmmu_tsb0_4minflcnt;
10311
10312 /*
10313 * Inflate TSB sizes by a factor of 2 if this process
10314 * uses 4M text pages to minimize extra conflict misses
10315 * in the first TSB since without counting text pages
10316 * 8K TSB may become too small.
10317 *
10318 * Also double the size of the second TSB to minimize
10319 * extra conflict misses due to competition between 4M text pages
10320 * and data pages.
10321 *
10322 * We need to adjust the second TSB allocation threshold by the
10323 * inflation factor, since there is no point in creating a second
10324 * TSB when we know all the mappings can fit in the I/D TLBs.
10325 */
10326 sectsb_thresh = tsb_sectsb_threshold;
10327 if (sfmmup->sfmmu_flags & HAT_4MTEXT_FLAG) {
10328 tte8k_cnt <<= 1;
10329 tte4m_cnt <<= 1;
10330 sectsb_thresh <<= 1;
10331 }
10332
10333 /*
10334 * Check to see if our TSB is the right size; we may need to
10335 * grow or shrink it. If the process is small, our work is
10336 * finished at this point.
10337 */
10338 if (tte8k_cnt <= tsb_rss_factor && tte4m_cnt <= sectsb_thresh) {
10339 return;
10340 }
10341 sfmmu_size_tsb(sfmmup, growing, tte8k_cnt, tte4m_cnt, sectsb_thresh);
10342 }
10343
10344 static void
sfmmu_size_tsb(sfmmu_t * sfmmup,int growing,uint64_t tte8k_cnt,uint64_t tte4m_cnt,int sectsb_thresh)10345 sfmmu_size_tsb(sfmmu_t *sfmmup, int growing, uint64_t tte8k_cnt,
10346 uint64_t tte4m_cnt, int sectsb_thresh)
10347 {
10348 int tsb_bits;
10349 uint_t tsb_szc;
10350 struct tsb_info *tsbinfop;
10351 hatlock_t *hatlockp = NULL;
10352
10353 hatlockp = sfmmu_hat_enter(sfmmup);
10354 ASSERT(hatlockp != NULL);
10355 tsbinfop = sfmmup->sfmmu_tsb;
10356 ASSERT(tsbinfop != NULL);
10357
10358 /*
10359 * If we're growing, select the size based on RSS. If we're
10360 * shrinking, leave some room so we don't have to turn around and
10361 * grow again immediately.
10362 */
10363 if (growing)
10364 tsb_szc = SELECT_TSB_SIZECODE(tte8k_cnt);
10365 else
10366 tsb_szc = SELECT_TSB_SIZECODE(tte8k_cnt << 1);
10367
10368 if (!growing && (tsb_szc < tsbinfop->tsb_szc) &&
10369 (tsb_szc >= default_tsb_size) && TSB_OK_SHRINK()) {
10370 (void) sfmmu_replace_tsb(sfmmup, tsbinfop, tsb_szc,
10371 hatlockp, TSB_SHRINK);
10372 } else if (growing && tsb_szc > tsbinfop->tsb_szc && TSB_OK_GROW()) {
10373 (void) sfmmu_replace_tsb(sfmmup, tsbinfop, tsb_szc,
10374 hatlockp, TSB_GROW);
10375 }
10376 tsbinfop = sfmmup->sfmmu_tsb;
10377
10378 /*
10379 * With the TLB and first TSB out of the way, we need to see if
10380 * we need a second TSB for 4M pages. If we managed to reprogram
10381 * the TLB page sizes above, the process will start using this new
10382 * TSB right away; otherwise, it will start using it on the next
10383 * context switch. Either way, it's no big deal so there's no
10384 * synchronization with the trap handlers here unless we grow the
10385 * TSB (in which case it's required to prevent using the old one
10386 * after it's freed). Note: second tsb is required for 32M/256M
10387 * page sizes.
10388 */
10389 if (tte4m_cnt > sectsb_thresh) {
10390 /*
10391 * If we're growing, select the size based on RSS. If we're
10392 * shrinking, leave some room so we don't have to turn
10393 * around and grow again immediately.
10394 */
10395 if (growing)
10396 tsb_szc = SELECT_TSB_SIZECODE(tte4m_cnt);
10397 else
10398 tsb_szc = SELECT_TSB_SIZECODE(tte4m_cnt << 1);
10399 if (tsbinfop->tsb_next == NULL) {
10400 struct tsb_info *newtsb;
10401 int allocflags = SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)?
10402 0 : TSB_ALLOC;
10403
10404 sfmmu_hat_exit(hatlockp);
10405
10406 /*
10407 * Try to allocate a TSB for 4[32|256]M pages. If we
10408 * can't get the size we want, retry w/a minimum sized
10409 * TSB. If that still didn't work, give up; we can
10410 * still run without one.
10411 */
10412 tsb_bits = (mmu_page_sizes == max_mmu_page_sizes)?
10413 TSB4M|TSB32M|TSB256M:TSB4M;
10414 if ((sfmmu_tsbinfo_alloc(&newtsb, tsb_szc, tsb_bits,
10415 allocflags, sfmmup)) &&
10416 (tsb_szc <= TSB_4M_SZCODE ||
10417 sfmmu_tsbinfo_alloc(&newtsb, TSB_4M_SZCODE,
10418 tsb_bits, allocflags, sfmmup)) &&
10419 sfmmu_tsbinfo_alloc(&newtsb, TSB_MIN_SZCODE,
10420 tsb_bits, allocflags, sfmmup)) {
10421 return;
10422 }
10423
10424 hatlockp = sfmmu_hat_enter(sfmmup);
10425
10426 sfmmu_invalidate_ctx(sfmmup);
10427
10428 if (sfmmup->sfmmu_tsb->tsb_next == NULL) {
10429 sfmmup->sfmmu_tsb->tsb_next = newtsb;
10430 SFMMU_STAT(sf_tsb_sectsb_create);
10431 sfmmu_hat_exit(hatlockp);
10432 return;
10433 } else {
10434 /*
10435 * It's annoying, but possible for us
10436 * to get here.. we dropped the HAT lock
10437 * because of locking order in the kmem
10438 * allocator, and while we were off getting
10439 * our memory, some other thread decided to
10440 * do us a favor and won the race to get a
10441 * second TSB for this process. Sigh.
10442 */
10443 sfmmu_hat_exit(hatlockp);
10444 sfmmu_tsbinfo_free(newtsb);
10445 return;
10446 }
10447 }
10448
10449 /*
10450 * We have a second TSB, see if it's big enough.
10451 */
10452 tsbinfop = tsbinfop->tsb_next;
10453
10454 /*
10455 * Check to see if our second TSB is the right size;
10456 * we may need to grow or shrink it.
10457 * To prevent thrashing (e.g. growing the TSB on a
10458 * subsequent map operation), only try to shrink if
10459 * the TSB reach exceeds twice the virtual address
10460 * space size.
10461 */
10462 if (!growing && (tsb_szc < tsbinfop->tsb_szc) &&
10463 (tsb_szc >= default_tsb_size) && TSB_OK_SHRINK()) {
10464 (void) sfmmu_replace_tsb(sfmmup, tsbinfop,
10465 tsb_szc, hatlockp, TSB_SHRINK);
10466 } else if (growing && tsb_szc > tsbinfop->tsb_szc &&
10467 TSB_OK_GROW()) {
10468 (void) sfmmu_replace_tsb(sfmmup, tsbinfop,
10469 tsb_szc, hatlockp, TSB_GROW);
10470 }
10471 }
10472
10473 sfmmu_hat_exit(hatlockp);
10474 }
10475
10476 /*
10477 * Free up a sfmmu
10478 * Since the sfmmu is currently embedded in the hat struct we simply zero
10479 * out our fields and free up the ism map blk list if any.
10480 */
10481 static void
sfmmu_free_sfmmu(sfmmu_t * sfmmup)10482 sfmmu_free_sfmmu(sfmmu_t *sfmmup)
10483 {
10484 ism_blk_t *blkp, *nx_blkp;
10485 #ifdef DEBUG
10486 ism_map_t *map;
10487 int i;
10488 #endif
10489
10490 ASSERT(sfmmup->sfmmu_ttecnt[TTE8K] == 0);
10491 ASSERT(sfmmup->sfmmu_ttecnt[TTE64K] == 0);
10492 ASSERT(sfmmup->sfmmu_ttecnt[TTE512K] == 0);
10493 ASSERT(sfmmup->sfmmu_ttecnt[TTE4M] == 0);
10494 ASSERT(sfmmup->sfmmu_ttecnt[TTE32M] == 0);
10495 ASSERT(sfmmup->sfmmu_ttecnt[TTE256M] == 0);
10496 ASSERT(SF_RGNMAP_ISNULL(sfmmup));
10497
10498 sfmmup->sfmmu_free = 0;
10499 sfmmup->sfmmu_ismhat = 0;
10500
10501 blkp = sfmmup->sfmmu_iblk;
10502 sfmmup->sfmmu_iblk = NULL;
10503
10504 while (blkp) {
10505 #ifdef DEBUG
10506 map = blkp->iblk_maps;
10507 for (i = 0; i < ISM_MAP_SLOTS; i++) {
10508 ASSERT(map[i].imap_seg == 0);
10509 ASSERT(map[i].imap_ismhat == NULL);
10510 ASSERT(map[i].imap_ment == NULL);
10511 }
10512 #endif
10513 nx_blkp = blkp->iblk_next;
10514 blkp->iblk_next = NULL;
10515 blkp->iblk_nextpa = (uint64_t)-1;
10516 kmem_cache_free(ism_blk_cache, blkp);
10517 blkp = nx_blkp;
10518 }
10519 }
10520
10521 /*
10522 * Locking primitves accessed by HATLOCK macros
10523 */
10524
10525 #define SFMMU_SPL_MTX (0x0)
10526 #define SFMMU_ML_MTX (0x1)
10527
10528 #define SFMMU_MLSPL_MTX(type, pg) (((type) == SFMMU_SPL_MTX) ? \
10529 SPL_HASH(pg) : MLIST_HASH(pg))
10530
10531 kmutex_t *
sfmmu_page_enter(struct page * pp)10532 sfmmu_page_enter(struct page *pp)
10533 {
10534 return (sfmmu_mlspl_enter(pp, SFMMU_SPL_MTX));
10535 }
10536
10537 void
sfmmu_page_exit(kmutex_t * spl)10538 sfmmu_page_exit(kmutex_t *spl)
10539 {
10540 mutex_exit(spl);
10541 }
10542
10543 int
sfmmu_page_spl_held(struct page * pp)10544 sfmmu_page_spl_held(struct page *pp)
10545 {
10546 return (sfmmu_mlspl_held(pp, SFMMU_SPL_MTX));
10547 }
10548
10549 kmutex_t *
sfmmu_mlist_enter(struct page * pp)10550 sfmmu_mlist_enter(struct page *pp)
10551 {
10552 return (sfmmu_mlspl_enter(pp, SFMMU_ML_MTX));
10553 }
10554
10555 void
sfmmu_mlist_exit(kmutex_t * mml)10556 sfmmu_mlist_exit(kmutex_t *mml)
10557 {
10558 mutex_exit(mml);
10559 }
10560
10561 int
sfmmu_mlist_held(struct page * pp)10562 sfmmu_mlist_held(struct page *pp)
10563 {
10564
10565 return (sfmmu_mlspl_held(pp, SFMMU_ML_MTX));
10566 }
10567
10568 /*
10569 * Common code for sfmmu_mlist_enter() and sfmmu_page_enter(). For
10570 * sfmmu_mlist_enter() case mml_table lock array is used and for
10571 * sfmmu_page_enter() sfmmu_page_lock lock array is used.
10572 *
10573 * The lock is taken on a root page so that it protects an operation on all
10574 * constituent pages of a large page pp belongs to.
10575 *
10576 * The routine takes a lock from the appropriate array. The lock is determined
10577 * by hashing the root page. After taking the lock this routine checks if the
10578 * root page has the same size code that was used to determine the root (i.e
10579 * that root hasn't changed). If root page has the expected p_szc field we
10580 * have the right lock and it's returned to the caller. If root's p_szc
10581 * decreased we release the lock and retry from the beginning. This case can
10582 * happen due to hat_page_demote() decreasing p_szc between our load of p_szc
10583 * value and taking the lock. The number of retries due to p_szc decrease is
10584 * limited by the maximum p_szc value. If p_szc is 0 we return the lock
10585 * determined by hashing pp itself.
10586 *
10587 * If our caller doesn't hold a SE_SHARED or SE_EXCL lock on pp it's also
10588 * possible that p_szc can increase. To increase p_szc a thread has to lock
10589 * all constituent pages EXCL and do hat_pageunload() on all of them. All the
10590 * callers that don't hold a page locked recheck if hmeblk through which pp
10591 * was found still maps this pp. If it doesn't map it anymore returned lock
10592 * is immediately dropped. Therefore if sfmmu_mlspl_enter() hits the case of
10593 * p_szc increase after taking the lock it returns this lock without further
10594 * retries because in this case the caller doesn't care about which lock was
10595 * taken. The caller will drop it right away.
10596 *
10597 * After the routine returns it's guaranteed that hat_page_demote() can't
10598 * change p_szc field of any of constituent pages of a large page pp belongs
10599 * to as long as pp was either locked at least SHARED prior to this call or
10600 * the caller finds that hment that pointed to this pp still references this
10601 * pp (this also assumes that the caller holds hme hash bucket lock so that
10602 * the same pp can't be remapped into the same hmeblk after it was unmapped by
10603 * hat_pageunload()).
10604 */
10605 static kmutex_t *
sfmmu_mlspl_enter(struct page * pp,int type)10606 sfmmu_mlspl_enter(struct page *pp, int type)
10607 {
10608 kmutex_t *mtx;
10609 uint_t prev_rszc = UINT_MAX;
10610 page_t *rootpp;
10611 uint_t szc;
10612 uint_t rszc;
10613 uint_t pszc = pp->p_szc;
10614
10615 ASSERT(pp != NULL);
10616
10617 again:
10618 if (pszc == 0) {
10619 mtx = SFMMU_MLSPL_MTX(type, pp);
10620 mutex_enter(mtx);
10621 return (mtx);
10622 }
10623
10624 /* The lock lives in the root page */
10625 rootpp = PP_GROUPLEADER(pp, pszc);
10626 mtx = SFMMU_MLSPL_MTX(type, rootpp);
10627 mutex_enter(mtx);
10628
10629 /*
10630 * Return mml in the following 3 cases:
10631 *
10632 * 1) If pp itself is root since if its p_szc decreased before we took
10633 * the lock pp is still the root of smaller szc page. And if its p_szc
10634 * increased it doesn't matter what lock we return (see comment in
10635 * front of this routine).
10636 *
10637 * 2) If pp's not root but rootpp is the root of a rootpp->p_szc size
10638 * large page we have the right lock since any previous potential
10639 * hat_page_demote() is done demoting from greater than current root's
10640 * p_szc because hat_page_demote() changes root's p_szc last. No
10641 * further hat_page_demote() can start or be in progress since it
10642 * would need the same lock we currently hold.
10643 *
10644 * 3) If rootpp's p_szc increased since previous iteration it doesn't
10645 * matter what lock we return (see comment in front of this routine).
10646 */
10647 if (pp == rootpp || (rszc = rootpp->p_szc) == pszc ||
10648 rszc >= prev_rszc) {
10649 return (mtx);
10650 }
10651
10652 /*
10653 * hat_page_demote() could have decreased root's p_szc.
10654 * In this case pp's p_szc must also be smaller than pszc.
10655 * Retry.
10656 */
10657 if (rszc < pszc) {
10658 szc = pp->p_szc;
10659 if (szc < pszc) {
10660 mutex_exit(mtx);
10661 pszc = szc;
10662 goto again;
10663 }
10664 /*
10665 * pp's p_szc increased after it was decreased.
10666 * page cannot be mapped. Return current lock. The caller
10667 * will drop it right away.
10668 */
10669 return (mtx);
10670 }
10671
10672 /*
10673 * root's p_szc is greater than pp's p_szc.
10674 * hat_page_demote() is not done with all pages
10675 * yet. Wait for it to complete.
10676 */
10677 mutex_exit(mtx);
10678 rootpp = PP_GROUPLEADER(rootpp, rszc);
10679 mtx = SFMMU_MLSPL_MTX(type, rootpp);
10680 mutex_enter(mtx);
10681 mutex_exit(mtx);
10682 prev_rszc = rszc;
10683 goto again;
10684 }
10685
10686 static int
sfmmu_mlspl_held(struct page * pp,int type)10687 sfmmu_mlspl_held(struct page *pp, int type)
10688 {
10689 kmutex_t *mtx;
10690
10691 ASSERT(pp != NULL);
10692 /* The lock lives in the root page */
10693 pp = PP_PAGEROOT(pp);
10694 ASSERT(pp != NULL);
10695
10696 mtx = SFMMU_MLSPL_MTX(type, pp);
10697 return (MUTEX_HELD(mtx));
10698 }
10699
10700 static uint_t
sfmmu_get_free_hblk(struct hme_blk ** hmeblkpp,uint_t critical)10701 sfmmu_get_free_hblk(struct hme_blk **hmeblkpp, uint_t critical)
10702 {
10703 struct hme_blk *hblkp;
10704
10705
10706 if (freehblkp != NULL) {
10707 mutex_enter(&freehblkp_lock);
10708 if (freehblkp != NULL) {
10709 /*
10710 * If the current thread is owning hblk_reserve OR
10711 * critical request from sfmmu_hblk_steal()
10712 * let it succeed even if freehblkcnt is really low.
10713 */
10714 if (freehblkcnt <= HBLK_RESERVE_MIN && !critical) {
10715 SFMMU_STAT(sf_get_free_throttle);
10716 mutex_exit(&freehblkp_lock);
10717 return (0);
10718 }
10719 freehblkcnt--;
10720 *hmeblkpp = freehblkp;
10721 hblkp = *hmeblkpp;
10722 freehblkp = hblkp->hblk_next;
10723 mutex_exit(&freehblkp_lock);
10724 hblkp->hblk_next = NULL;
10725 SFMMU_STAT(sf_get_free_success);
10726
10727 ASSERT(hblkp->hblk_hmecnt == 0);
10728 ASSERT(hblkp->hblk_vcnt == 0);
10729 ASSERT(hblkp->hblk_nextpa == va_to_pa((caddr_t)hblkp));
10730
10731 return (1);
10732 }
10733 mutex_exit(&freehblkp_lock);
10734 }
10735
10736 /* Check cpu hblk pending queues */
10737 if ((*hmeblkpp = sfmmu_check_pending_hblks(TTE8K)) != NULL) {
10738 hblkp = *hmeblkpp;
10739 hblkp->hblk_next = NULL;
10740 hblkp->hblk_nextpa = va_to_pa((caddr_t)hblkp);
10741
10742 ASSERT(hblkp->hblk_hmecnt == 0);
10743 ASSERT(hblkp->hblk_vcnt == 0);
10744
10745 return (1);
10746 }
10747
10748 SFMMU_STAT(sf_get_free_fail);
10749 return (0);
10750 }
10751
10752 static uint_t
sfmmu_put_free_hblk(struct hme_blk * hmeblkp,uint_t critical)10753 sfmmu_put_free_hblk(struct hme_blk *hmeblkp, uint_t critical)
10754 {
10755 struct hme_blk *hblkp;
10756
10757 ASSERT(hmeblkp->hblk_hmecnt == 0);
10758 ASSERT(hmeblkp->hblk_vcnt == 0);
10759 ASSERT(hmeblkp->hblk_nextpa == va_to_pa((caddr_t)hmeblkp));
10760
10761 /*
10762 * If the current thread is mapping into kernel space,
10763 * let it succede even if freehblkcnt is max
10764 * so that it will avoid freeing it to kmem.
10765 * This will prevent stack overflow due to
10766 * possible recursion since kmem_cache_free()
10767 * might require creation of a slab which
10768 * in turn needs an hmeblk to map that slab;
10769 * let's break this vicious chain at the first
10770 * opportunity.
10771 */
10772 if (freehblkcnt < HBLK_RESERVE_CNT || critical) {
10773 mutex_enter(&freehblkp_lock);
10774 if (freehblkcnt < HBLK_RESERVE_CNT || critical) {
10775 SFMMU_STAT(sf_put_free_success);
10776 freehblkcnt++;
10777 hmeblkp->hblk_next = freehblkp;
10778 freehblkp = hmeblkp;
10779 mutex_exit(&freehblkp_lock);
10780 return (1);
10781 }
10782 mutex_exit(&freehblkp_lock);
10783 }
10784
10785 /*
10786 * Bring down freehblkcnt to HBLK_RESERVE_CNT. We are here
10787 * only if freehblkcnt is at least HBLK_RESERVE_CNT *and*
10788 * we are not in the process of mapping into kernel space.
10789 */
10790 ASSERT(!critical);
10791 while (freehblkcnt > HBLK_RESERVE_CNT) {
10792 mutex_enter(&freehblkp_lock);
10793 if (freehblkcnt > HBLK_RESERVE_CNT) {
10794 freehblkcnt--;
10795 hblkp = freehblkp;
10796 freehblkp = hblkp->hblk_next;
10797 mutex_exit(&freehblkp_lock);
10798 ASSERT(get_hblk_cache(hblkp) == sfmmu8_cache);
10799 kmem_cache_free(sfmmu8_cache, hblkp);
10800 continue;
10801 }
10802 mutex_exit(&freehblkp_lock);
10803 }
10804 SFMMU_STAT(sf_put_free_fail);
10805 return (0);
10806 }
10807
10808 static void
sfmmu_hblk_swap(struct hme_blk * new)10809 sfmmu_hblk_swap(struct hme_blk *new)
10810 {
10811 struct hme_blk *old, *hblkp, *prev;
10812 uint64_t newpa;
10813 caddr_t base, vaddr, endaddr;
10814 struct hmehash_bucket *hmebp;
10815 struct sf_hment *osfhme, *nsfhme;
10816 page_t *pp;
10817 kmutex_t *pml;
10818 tte_t tte;
10819 struct hme_blk *list = NULL;
10820
10821 #ifdef DEBUG
10822 hmeblk_tag hblktag;
10823 struct hme_blk *found;
10824 #endif
10825 old = HBLK_RESERVE;
10826 ASSERT(!old->hblk_shared);
10827
10828 /*
10829 * save pa before bcopy clobbers it
10830 */
10831 newpa = new->hblk_nextpa;
10832
10833 base = (caddr_t)get_hblk_base(old);
10834 endaddr = base + get_hblk_span(old);
10835
10836 /*
10837 * acquire hash bucket lock.
10838 */
10839 hmebp = sfmmu_tteload_acquire_hashbucket(ksfmmup, base, TTE8K,
10840 SFMMU_INVALID_SHMERID);
10841
10842 /*
10843 * copy contents from old to new
10844 */
10845 bcopy((void *)old, (void *)new, HME8BLK_SZ);
10846
10847 /*
10848 * add new to hash chain
10849 */
10850 sfmmu_hblk_hash_add(hmebp, new, newpa);
10851
10852 /*
10853 * search hash chain for hblk_reserve; this needs to be performed
10854 * after adding new, otherwise prev won't correspond to the hblk which
10855 * is prior to old in hash chain when we call sfmmu_hblk_hash_rm to
10856 * remove old later.
10857 */
10858 for (prev = NULL,
10859 hblkp = hmebp->hmeblkp; hblkp != NULL && hblkp != old;
10860 prev = hblkp, hblkp = hblkp->hblk_next)
10861 ;
10862
10863 if (hblkp != old)
10864 panic("sfmmu_hblk_swap: hblk_reserve not found");
10865
10866 /*
10867 * p_mapping list is still pointing to hments in hblk_reserve;
10868 * fix up p_mapping list so that they point to hments in new.
10869 *
10870 * Since all these mappings are created by hblk_reserve_thread
10871 * on the way and it's using at least one of the buffers from each of
10872 * the newly minted slabs, there is no danger of any of these
10873 * mappings getting unloaded by another thread.
10874 *
10875 * tsbmiss could only modify ref/mod bits of hments in old/new.
10876 * Since all of these hments hold mappings established by segkmem
10877 * and mappings in segkmem are setup with HAT_NOSYNC, ref/mod bits
10878 * have no meaning for the mappings in hblk_reserve. hments in
10879 * old and new are identical except for ref/mod bits.
10880 */
10881 for (vaddr = base; vaddr < endaddr; vaddr += TTEBYTES(TTE8K)) {
10882
10883 HBLKTOHME(osfhme, old, vaddr);
10884 sfmmu_copytte(&osfhme->hme_tte, &tte);
10885
10886 if (TTE_IS_VALID(&tte)) {
10887 if ((pp = osfhme->hme_page) == NULL)
10888 panic("sfmmu_hblk_swap: page not mapped");
10889
10890 pml = sfmmu_mlist_enter(pp);
10891
10892 if (pp != osfhme->hme_page)
10893 panic("sfmmu_hblk_swap: mapping changed");
10894
10895 HBLKTOHME(nsfhme, new, vaddr);
10896
10897 HME_ADD(nsfhme, pp);
10898 HME_SUB(osfhme, pp);
10899
10900 sfmmu_mlist_exit(pml);
10901 }
10902 }
10903
10904 /*
10905 * remove old from hash chain
10906 */
10907 sfmmu_hblk_hash_rm(hmebp, old, prev, &list, 1);
10908
10909 #ifdef DEBUG
10910
10911 hblktag.htag_id = ksfmmup;
10912 hblktag.htag_rid = SFMMU_INVALID_SHMERID;
10913 hblktag.htag_bspage = HME_HASH_BSPAGE(base, HME_HASH_SHIFT(TTE8K));
10914 hblktag.htag_rehash = HME_HASH_REHASH(TTE8K);
10915 HME_HASH_FAST_SEARCH(hmebp, hblktag, found);
10916
10917 if (found != new)
10918 panic("sfmmu_hblk_swap: new hblk not found");
10919 #endif
10920
10921 SFMMU_HASH_UNLOCK(hmebp);
10922
10923 /*
10924 * Reset hblk_reserve
10925 */
10926 bzero((void *)old, HME8BLK_SZ);
10927 old->hblk_nextpa = va_to_pa((caddr_t)old);
10928 }
10929
10930 /*
10931 * Grab the mlist mutex for both pages passed in.
10932 *
10933 * low and high will be returned as pointers to the mutexes for these pages.
10934 * low refers to the mutex residing in the lower bin of the mlist hash, while
10935 * high refers to the mutex residing in the higher bin of the mlist hash. This
10936 * is due to the locking order restrictions on the same thread grabbing
10937 * multiple mlist mutexes. The low lock must be acquired before the high lock.
10938 *
10939 * If both pages hash to the same mutex, only grab that single mutex, and
10940 * high will be returned as NULL
10941 * If the pages hash to different bins in the hash, grab the lower addressed
10942 * lock first and then the higher addressed lock in order to follow the locking
10943 * rules involved with the same thread grabbing multiple mlist mutexes.
10944 * low and high will both have non-NULL values.
10945 */
10946 static void
sfmmu_mlist_reloc_enter(struct page * targ,struct page * repl,kmutex_t ** low,kmutex_t ** high)10947 sfmmu_mlist_reloc_enter(struct page *targ, struct page *repl,
10948 kmutex_t **low, kmutex_t **high)
10949 {
10950 kmutex_t *mml_targ, *mml_repl;
10951
10952 /*
10953 * no need to do the dance around szc as in sfmmu_mlist_enter()
10954 * because this routine is only called by hat_page_relocate() and all
10955 * targ and repl pages are already locked EXCL so szc can't change.
10956 */
10957
10958 mml_targ = MLIST_HASH(PP_PAGEROOT(targ));
10959 mml_repl = MLIST_HASH(PP_PAGEROOT(repl));
10960
10961 if (mml_targ == mml_repl) {
10962 *low = mml_targ;
10963 *high = NULL;
10964 } else {
10965 if (mml_targ < mml_repl) {
10966 *low = mml_targ;
10967 *high = mml_repl;
10968 } else {
10969 *low = mml_repl;
10970 *high = mml_targ;
10971 }
10972 }
10973
10974 mutex_enter(*low);
10975 if (*high)
10976 mutex_enter(*high);
10977 }
10978
10979 static void
sfmmu_mlist_reloc_exit(kmutex_t * low,kmutex_t * high)10980 sfmmu_mlist_reloc_exit(kmutex_t *low, kmutex_t *high)
10981 {
10982 if (high)
10983 mutex_exit(high);
10984 mutex_exit(low);
10985 }
10986
10987 static hatlock_t *
sfmmu_hat_enter(sfmmu_t * sfmmup)10988 sfmmu_hat_enter(sfmmu_t *sfmmup)
10989 {
10990 hatlock_t *hatlockp;
10991
10992 if (sfmmup != ksfmmup) {
10993 hatlockp = TSB_HASH(sfmmup);
10994 mutex_enter(HATLOCK_MUTEXP(hatlockp));
10995 return (hatlockp);
10996 }
10997 return (NULL);
10998 }
10999
11000 static hatlock_t *
sfmmu_hat_tryenter(sfmmu_t * sfmmup)11001 sfmmu_hat_tryenter(sfmmu_t *sfmmup)
11002 {
11003 hatlock_t *hatlockp;
11004
11005 if (sfmmup != ksfmmup) {
11006 hatlockp = TSB_HASH(sfmmup);
11007 if (mutex_tryenter(HATLOCK_MUTEXP(hatlockp)) == 0)
11008 return (NULL);
11009 return (hatlockp);
11010 }
11011 return (NULL);
11012 }
11013
11014 static void
sfmmu_hat_exit(hatlock_t * hatlockp)11015 sfmmu_hat_exit(hatlock_t *hatlockp)
11016 {
11017 if (hatlockp != NULL)
11018 mutex_exit(HATLOCK_MUTEXP(hatlockp));
11019 }
11020
11021 static void
sfmmu_hat_lock_all(void)11022 sfmmu_hat_lock_all(void)
11023 {
11024 int i;
11025 for (i = 0; i < SFMMU_NUM_LOCK; i++)
11026 mutex_enter(HATLOCK_MUTEXP(&hat_lock[i]));
11027 }
11028
11029 static void
sfmmu_hat_unlock_all(void)11030 sfmmu_hat_unlock_all(void)
11031 {
11032 int i;
11033 for (i = SFMMU_NUM_LOCK - 1; i >= 0; i--)
11034 mutex_exit(HATLOCK_MUTEXP(&hat_lock[i]));
11035 }
11036
11037 int
sfmmu_hat_lock_held(sfmmu_t * sfmmup)11038 sfmmu_hat_lock_held(sfmmu_t *sfmmup)
11039 {
11040 ASSERT(sfmmup != ksfmmup);
11041 return (MUTEX_HELD(HATLOCK_MUTEXP(TSB_HASH(sfmmup))));
11042 }
11043
11044 /*
11045 * Locking primitives to provide consistency between ISM unmap
11046 * and other operations. Since ISM unmap can take a long time, we
11047 * use HAT_ISMBUSY flag (protected by the hatlock) to avoid creating
11048 * contention on the hatlock buckets while ISM segments are being
11049 * unmapped. The tradeoff is that the flags don't prevent priority
11050 * inversion from occurring, so we must request kernel priority in
11051 * case we have to sleep to keep from getting buried while holding
11052 * the HAT_ISMBUSY flag set, which in turn could block other kernel
11053 * threads from running (for example, in sfmmu_uvatopfn()).
11054 */
11055 static void
sfmmu_ismhat_enter(sfmmu_t * sfmmup,int hatlock_held)11056 sfmmu_ismhat_enter(sfmmu_t *sfmmup, int hatlock_held)
11057 {
11058 hatlock_t *hatlockp;
11059
11060 THREAD_KPRI_REQUEST();
11061 if (!hatlock_held)
11062 hatlockp = sfmmu_hat_enter(sfmmup);
11063 while (SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY))
11064 cv_wait(&sfmmup->sfmmu_tsb_cv, HATLOCK_MUTEXP(hatlockp));
11065 SFMMU_FLAGS_SET(sfmmup, HAT_ISMBUSY);
11066 if (!hatlock_held)
11067 sfmmu_hat_exit(hatlockp);
11068 }
11069
11070 static void
sfmmu_ismhat_exit(sfmmu_t * sfmmup,int hatlock_held)11071 sfmmu_ismhat_exit(sfmmu_t *sfmmup, int hatlock_held)
11072 {
11073 hatlock_t *hatlockp;
11074
11075 if (!hatlock_held)
11076 hatlockp = sfmmu_hat_enter(sfmmup);
11077 ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY));
11078 SFMMU_FLAGS_CLEAR(sfmmup, HAT_ISMBUSY);
11079 cv_broadcast(&sfmmup->sfmmu_tsb_cv);
11080 if (!hatlock_held)
11081 sfmmu_hat_exit(hatlockp);
11082 THREAD_KPRI_RELEASE();
11083 }
11084
11085 /*
11086 *
11087 * Algorithm:
11088 *
11089 * (1) if segkmem is not ready, allocate hblk from an array of pre-alloc'ed
11090 * hblks.
11091 *
11092 * (2) if we are allocating an hblk for mapping a slab in sfmmu_cache,
11093 *
11094 * (a) try to return an hblk from reserve pool of free hblks;
11095 * (b) if the reserve pool is empty, acquire hblk_reserve_lock
11096 * and return hblk_reserve.
11097 *
11098 * (3) call kmem_cache_alloc() to allocate hblk;
11099 *
11100 * (a) if hblk_reserve_lock is held by the current thread,
11101 * atomically replace hblk_reserve by the hblk that is
11102 * returned by kmem_cache_alloc; release hblk_reserve_lock
11103 * and call kmem_cache_alloc() again.
11104 * (b) if reserve pool is not full, add the hblk that is
11105 * returned by kmem_cache_alloc to reserve pool and
11106 * call kmem_cache_alloc again.
11107 *
11108 */
11109 static struct hme_blk *
sfmmu_hblk_alloc(sfmmu_t * sfmmup,caddr_t vaddr,struct hmehash_bucket * hmebp,uint_t size,hmeblk_tag hblktag,uint_t flags,uint_t rid)11110 sfmmu_hblk_alloc(sfmmu_t *sfmmup, caddr_t vaddr,
11111 struct hmehash_bucket *hmebp, uint_t size, hmeblk_tag hblktag,
11112 uint_t flags, uint_t rid)
11113 {
11114 struct hme_blk *hmeblkp = NULL;
11115 struct hme_blk *newhblkp;
11116 struct hme_blk *shw_hblkp = NULL;
11117 struct kmem_cache *sfmmu_cache = NULL;
11118 uint64_t hblkpa;
11119 ulong_t index;
11120 uint_t owner; /* set to 1 if using hblk_reserve */
11121 uint_t forcefree;
11122 int sleep;
11123 sf_srd_t *srdp;
11124 sf_region_t *rgnp;
11125
11126 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));
11127 ASSERT(hblktag.htag_rid == rid);
11128 SFMMU_VALIDATE_HMERID(sfmmup, rid, vaddr, TTEBYTES(size));
11129 ASSERT(!SFMMU_IS_SHMERID_VALID(rid) ||
11130 IS_P2ALIGNED(vaddr, TTEBYTES(size)));
11131
11132 /*
11133 * If segkmem is not created yet, allocate from static hmeblks
11134 * created at the end of startup_modules(). See the block comment
11135 * in startup_modules() describing how we estimate the number of
11136 * static hmeblks that will be needed during re-map.
11137 */
11138 if (!hblk_alloc_dynamic) {
11139
11140 ASSERT(!SFMMU_IS_SHMERID_VALID(rid));
11141
11142 if (size == TTE8K) {
11143 index = nucleus_hblk8.index;
11144 if (index >= nucleus_hblk8.len) {
11145 /*
11146 * If we panic here, see startup_modules() to
11147 * make sure that we are calculating the
11148 * number of hblk8's that we need correctly.
11149 */
11150 prom_panic("no nucleus hblk8 to allocate");
11151 }
11152 hmeblkp =
11153 (struct hme_blk *)&nucleus_hblk8.list[index];
11154 nucleus_hblk8.index++;
11155 SFMMU_STAT(sf_hblk8_nalloc);
11156 } else {
11157 index = nucleus_hblk1.index;
11158 if (nucleus_hblk1.index >= nucleus_hblk1.len) {
11159 /*
11160 * If we panic here, see startup_modules().
11161 * Most likely you need to update the
11162 * calculation of the number of hblk1 elements
11163 * that the kernel needs to boot.
11164 */
11165 prom_panic("no nucleus hblk1 to allocate");
11166 }
11167 hmeblkp =
11168 (struct hme_blk *)&nucleus_hblk1.list[index];
11169 nucleus_hblk1.index++;
11170 SFMMU_STAT(sf_hblk1_nalloc);
11171 }
11172
11173 goto hblk_init;
11174 }
11175
11176 SFMMU_HASH_UNLOCK(hmebp);
11177
11178 if (sfmmup != KHATID && !SFMMU_IS_SHMERID_VALID(rid)) {
11179 if (mmu_page_sizes == max_mmu_page_sizes) {
11180 if (size < TTE256M)
11181 shw_hblkp = sfmmu_shadow_hcreate(sfmmup, vaddr,
11182 size, flags);
11183 } else {
11184 if (size < TTE4M)
11185 shw_hblkp = sfmmu_shadow_hcreate(sfmmup, vaddr,
11186 size, flags);
11187 }
11188 } else if (SFMMU_IS_SHMERID_VALID(rid)) {
11189 /*
11190 * Shared hmes use per region bitmaps in rgn_hmeflag
11191 * rather than shadow hmeblks to keep track of the
11192 * mapping sizes which have been allocated for the region.
11193 * Here we cleanup old invalid hmeblks with this rid,
11194 * which may be left around by pageunload().
11195 */
11196 int ttesz;
11197 caddr_t va;
11198 caddr_t eva = vaddr + TTEBYTES(size);
11199
11200 ASSERT(sfmmup != KHATID);
11201
11202 srdp = sfmmup->sfmmu_srdp;
11203 ASSERT(srdp != NULL && srdp->srd_refcnt != 0);
11204 rgnp = srdp->srd_hmergnp[rid];
11205 ASSERT(rgnp != NULL && rgnp->rgn_id == rid);
11206 ASSERT(rgnp->rgn_refcnt != 0);
11207 ASSERT(size <= rgnp->rgn_pgszc);
11208
11209 ttesz = HBLK_MIN_TTESZ;
11210 do {
11211 if (!(rgnp->rgn_hmeflags & (0x1 << ttesz))) {
11212 continue;
11213 }
11214
11215 if (ttesz > size && ttesz != HBLK_MIN_TTESZ) {
11216 sfmmu_cleanup_rhblk(srdp, vaddr, rid, ttesz);
11217 } else if (ttesz < size) {
11218 for (va = vaddr; va < eva;
11219 va += TTEBYTES(ttesz)) {
11220 sfmmu_cleanup_rhblk(srdp, va, rid,
11221 ttesz);
11222 }
11223 }
11224 } while (++ttesz <= rgnp->rgn_pgszc);
11225 }
11226
11227 fill_hblk:
11228 owner = (hblk_reserve_thread == curthread) ? 1 : 0;
11229
11230 if (owner && size == TTE8K) {
11231
11232 ASSERT(!SFMMU_IS_SHMERID_VALID(rid));
11233 /*
11234 * We are really in a tight spot. We already own
11235 * hblk_reserve and we need another hblk. In anticipation
11236 * of this kind of scenario, we specifically set aside
11237 * HBLK_RESERVE_MIN number of hblks to be used exclusively
11238 * by owner of hblk_reserve.
11239 */
11240 SFMMU_STAT(sf_hblk_recurse_cnt);
11241
11242 if (!sfmmu_get_free_hblk(&hmeblkp, 1))
11243 panic("sfmmu_hblk_alloc: reserve list is empty");
11244
11245 goto hblk_verify;
11246 }
11247
11248 ASSERT(!owner);
11249
11250 if ((flags & HAT_NO_KALLOC) == 0) {
11251
11252 sfmmu_cache = ((size == TTE8K) ? sfmmu8_cache : sfmmu1_cache);
11253 sleep = ((sfmmup == KHATID) ? KM_NOSLEEP : KM_SLEEP);
11254
11255 if ((hmeblkp = kmem_cache_alloc(sfmmu_cache, sleep)) == NULL) {
11256 hmeblkp = sfmmu_hblk_steal(size);
11257 } else {
11258 /*
11259 * if we are the owner of hblk_reserve,
11260 * swap hblk_reserve with hmeblkp and
11261 * start a fresh life. Hope things go
11262 * better this time.
11263 */
11264 if (hblk_reserve_thread == curthread) {
11265 ASSERT(sfmmu_cache == sfmmu8_cache);
11266 sfmmu_hblk_swap(hmeblkp);
11267 hblk_reserve_thread = NULL;
11268 mutex_exit(&hblk_reserve_lock);
11269 goto fill_hblk;
11270 }
11271 /*
11272 * let's donate this hblk to our reserve list if
11273 * we are not mapping kernel range
11274 */
11275 if (size == TTE8K && sfmmup != KHATID) {
11276 if (sfmmu_put_free_hblk(hmeblkp, 0))
11277 goto fill_hblk;
11278 }
11279 }
11280 } else {
11281 /*
11282 * We are here to map the slab in sfmmu8_cache; let's
11283 * check if we could tap our reserve list; if successful,
11284 * this will avoid the pain of going thru sfmmu_hblk_swap
11285 */
11286 SFMMU_STAT(sf_hblk_slab_cnt);
11287 if (!sfmmu_get_free_hblk(&hmeblkp, 0)) {
11288 /*
11289 * let's start hblk_reserve dance
11290 */
11291 SFMMU_STAT(sf_hblk_reserve_cnt);
11292 owner = 1;
11293 mutex_enter(&hblk_reserve_lock);
11294 hmeblkp = HBLK_RESERVE;
11295 hblk_reserve_thread = curthread;
11296 }
11297 }
11298
11299 hblk_verify:
11300 ASSERT(hmeblkp != NULL);
11301 set_hblk_sz(hmeblkp, size);
11302 ASSERT(hmeblkp->hblk_nextpa == va_to_pa((caddr_t)hmeblkp));
11303 SFMMU_HASH_LOCK(hmebp);
11304 HME_HASH_FAST_SEARCH(hmebp, hblktag, newhblkp);
11305 if (newhblkp != NULL) {
11306 SFMMU_HASH_UNLOCK(hmebp);
11307 if (hmeblkp != HBLK_RESERVE) {
11308 /*
11309 * This is really tricky!
11310 *
11311 * vmem_alloc(vmem_seg_arena)
11312 * vmem_alloc(vmem_internal_arena)
11313 * segkmem_alloc(heap_arena)
11314 * vmem_alloc(heap_arena)
11315 * page_create()
11316 * hat_memload()
11317 * kmem_cache_free()
11318 * kmem_cache_alloc()
11319 * kmem_slab_create()
11320 * vmem_alloc(kmem_internal_arena)
11321 * segkmem_alloc(heap_arena)
11322 * vmem_alloc(heap_arena)
11323 * page_create()
11324 * hat_memload()
11325 * kmem_cache_free()
11326 * ...
11327 *
11328 * Thus, hat_memload() could call kmem_cache_free
11329 * for enough number of times that we could easily
11330 * hit the bottom of the stack or run out of reserve
11331 * list of vmem_seg structs. So, we must donate
11332 * this hblk to reserve list if it's allocated
11333 * from sfmmu8_cache *and* mapping kernel range.
11334 * We don't need to worry about freeing hmeblk1's
11335 * to kmem since they don't map any kmem slabs.
11336 *
11337 * Note: When segkmem supports largepages, we must
11338 * free hmeblk1's to reserve list as well.
11339 */
11340 forcefree = (sfmmup == KHATID) ? 1 : 0;
11341 if (size == TTE8K &&
11342 sfmmu_put_free_hblk(hmeblkp, forcefree)) {
11343 goto re_verify;
11344 }
11345 ASSERT(sfmmup != KHATID);
11346 kmem_cache_free(get_hblk_cache(hmeblkp), hmeblkp);
11347 } else {
11348 /*
11349 * Hey! we don't need hblk_reserve any more.
11350 */
11351 ASSERT(owner);
11352 hblk_reserve_thread = NULL;
11353 mutex_exit(&hblk_reserve_lock);
11354 owner = 0;
11355 }
11356 re_verify:
11357 /*
11358 * let's check if the goodies are still present
11359 */
11360 SFMMU_HASH_LOCK(hmebp);
11361 HME_HASH_FAST_SEARCH(hmebp, hblktag, newhblkp);
11362 if (newhblkp != NULL) {
11363 /*
11364 * return newhblkp if it's not hblk_reserve;
11365 * if newhblkp is hblk_reserve, return it
11366 * _only if_ we are the owner of hblk_reserve.
11367 */
11368 if (newhblkp != HBLK_RESERVE || owner) {
11369 ASSERT(!SFMMU_IS_SHMERID_VALID(rid) ||
11370 newhblkp->hblk_shared);
11371 ASSERT(SFMMU_IS_SHMERID_VALID(rid) ||
11372 !newhblkp->hblk_shared);
11373 return (newhblkp);
11374 } else {
11375 /*
11376 * we just hit hblk_reserve in the hash and
11377 * we are not the owner of that;
11378 *
11379 * block until hblk_reserve_thread completes
11380 * swapping hblk_reserve and try the dance
11381 * once again.
11382 */
11383 SFMMU_HASH_UNLOCK(hmebp);
11384 mutex_enter(&hblk_reserve_lock);
11385 mutex_exit(&hblk_reserve_lock);
11386 SFMMU_STAT(sf_hblk_reserve_hit);
11387 goto fill_hblk;
11388 }
11389 } else {
11390 /*
11391 * it's no more! try the dance once again.
11392 */
11393 SFMMU_HASH_UNLOCK(hmebp);
11394 goto fill_hblk;
11395 }
11396 }
11397
11398 hblk_init:
11399 if (SFMMU_IS_SHMERID_VALID(rid)) {
11400 uint16_t tteflag = 0x1 <<
11401 ((size < HBLK_MIN_TTESZ) ? HBLK_MIN_TTESZ : size);
11402
11403 if (!(rgnp->rgn_hmeflags & tteflag)) {
11404 atomic_or_16(&rgnp->rgn_hmeflags, tteflag);
11405 }
11406 hmeblkp->hblk_shared = 1;
11407 } else {
11408 hmeblkp->hblk_shared = 0;
11409 }
11410 set_hblk_sz(hmeblkp, size);
11411 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));
11412 hmeblkp->hblk_next = (struct hme_blk *)NULL;
11413 hmeblkp->hblk_tag = hblktag;
11414 hmeblkp->hblk_shadow = shw_hblkp;
11415 hblkpa = hmeblkp->hblk_nextpa;
11416 hmeblkp->hblk_nextpa = HMEBLK_ENDPA;
11417
11418 ASSERT(get_hblk_ttesz(hmeblkp) == size);
11419 ASSERT(get_hblk_span(hmeblkp) == HMEBLK_SPAN(size));
11420 ASSERT(hmeblkp->hblk_hmecnt == 0);
11421 ASSERT(hmeblkp->hblk_vcnt == 0);
11422 ASSERT(hmeblkp->hblk_lckcnt == 0);
11423 ASSERT(hblkpa == va_to_pa((caddr_t)hmeblkp));
11424 sfmmu_hblk_hash_add(hmebp, hmeblkp, hblkpa);
11425 return (hmeblkp);
11426 }
11427
11428 /*
11429 * This function cleans up the hme_blk and returns it to the free list.
11430 */
11431 /* ARGSUSED */
11432 static void
sfmmu_hblk_free(struct hme_blk ** listp)11433 sfmmu_hblk_free(struct hme_blk **listp)
11434 {
11435 struct hme_blk *hmeblkp, *next_hmeblkp;
11436 int size;
11437 uint_t critical;
11438 uint64_t hblkpa;
11439
11440 ASSERT(*listp != NULL);
11441
11442 hmeblkp = *listp;
11443 while (hmeblkp != NULL) {
11444 next_hmeblkp = hmeblkp->hblk_next;
11445 ASSERT(!hmeblkp->hblk_hmecnt);
11446 ASSERT(!hmeblkp->hblk_vcnt);
11447 ASSERT(!hmeblkp->hblk_lckcnt);
11448 ASSERT(hmeblkp != (struct hme_blk *)hblk_reserve);
11449 ASSERT(hmeblkp->hblk_shared == 0);
11450 ASSERT(hmeblkp->hblk_shw_bit == 0);
11451 ASSERT(hmeblkp->hblk_shadow == NULL);
11452
11453 hblkpa = va_to_pa((caddr_t)hmeblkp);
11454 ASSERT(hblkpa != (uint64_t)-1);
11455 critical = (hblktosfmmu(hmeblkp) == KHATID) ? 1 : 0;
11456
11457 size = get_hblk_ttesz(hmeblkp);
11458 hmeblkp->hblk_next = NULL;
11459 hmeblkp->hblk_nextpa = hblkpa;
11460
11461 if (hmeblkp->hblk_nuc_bit == 0) {
11462
11463 if (size != TTE8K ||
11464 !sfmmu_put_free_hblk(hmeblkp, critical))
11465 kmem_cache_free(get_hblk_cache(hmeblkp),
11466 hmeblkp);
11467 }
11468 hmeblkp = next_hmeblkp;
11469 }
11470 }
11471
11472 #define BUCKETS_TO_SEARCH_BEFORE_UNLOAD 30
11473 #define SFMMU_HBLK_STEAL_THRESHOLD 5
11474
11475 static uint_t sfmmu_hblk_steal_twice;
11476 static uint_t sfmmu_hblk_steal_count, sfmmu_hblk_steal_unload_count;
11477
11478 /*
11479 * Steal a hmeblk from user or kernel hme hash lists.
11480 * For 8K tte grab one from reserve pool (freehblkp) before proceeding to
11481 * steal and if we fail to steal after SFMMU_HBLK_STEAL_THRESHOLD attempts
11482 * tap into critical reserve of freehblkp.
11483 * Note: We remain looping in this routine until we find one.
11484 */
11485 static struct hme_blk *
sfmmu_hblk_steal(int size)11486 sfmmu_hblk_steal(int size)
11487 {
11488 static struct hmehash_bucket *uhmehash_steal_hand = NULL;
11489 struct hmehash_bucket *hmebp;
11490 struct hme_blk *hmeblkp = NULL, *pr_hblk;
11491 uint64_t hblkpa;
11492 int i;
11493 uint_t loop_cnt = 0, critical;
11494
11495 for (;;) {
11496 /* Check cpu hblk pending queues */
11497 if ((hmeblkp = sfmmu_check_pending_hblks(size)) != NULL) {
11498 hmeblkp->hblk_nextpa = va_to_pa((caddr_t)hmeblkp);
11499 ASSERT(hmeblkp->hblk_hmecnt == 0);
11500 ASSERT(hmeblkp->hblk_vcnt == 0);
11501 return (hmeblkp);
11502 }
11503
11504 if (size == TTE8K) {
11505 critical =
11506 (++loop_cnt > SFMMU_HBLK_STEAL_THRESHOLD) ? 1 : 0;
11507 if (sfmmu_get_free_hblk(&hmeblkp, critical))
11508 return (hmeblkp);
11509 }
11510
11511 hmebp = (uhmehash_steal_hand == NULL) ? uhme_hash :
11512 uhmehash_steal_hand;
11513 ASSERT(hmebp >= uhme_hash && hmebp <= &uhme_hash[UHMEHASH_SZ]);
11514
11515 for (i = 0; hmeblkp == NULL && i <= UHMEHASH_SZ +
11516 BUCKETS_TO_SEARCH_BEFORE_UNLOAD; i++) {
11517 SFMMU_HASH_LOCK(hmebp);
11518 hmeblkp = hmebp->hmeblkp;
11519 hblkpa = hmebp->hmeh_nextpa;
11520 pr_hblk = NULL;
11521 while (hmeblkp) {
11522 /*
11523 * check if it is a hmeblk that is not locked
11524 * and not shared. skip shadow hmeblks with
11525 * shadow_mask set i.e valid count non zero.
11526 */
11527 if ((get_hblk_ttesz(hmeblkp) == size) &&
11528 (hmeblkp->hblk_shw_bit == 0 ||
11529 hmeblkp->hblk_vcnt == 0) &&
11530 (hmeblkp->hblk_lckcnt == 0)) {
11531 /*
11532 * there is a high probability that we
11533 * will find a free one. search some
11534 * buckets for a free hmeblk initially
11535 * before unloading a valid hmeblk.
11536 */
11537 if ((hmeblkp->hblk_vcnt == 0 &&
11538 hmeblkp->hblk_hmecnt == 0) || (i >=
11539 BUCKETS_TO_SEARCH_BEFORE_UNLOAD)) {
11540 if (sfmmu_steal_this_hblk(hmebp,
11541 hmeblkp, hblkpa, pr_hblk)) {
11542 /*
11543 * Hblk is unloaded
11544 * successfully
11545 */
11546 break;
11547 }
11548 }
11549 }
11550 pr_hblk = hmeblkp;
11551 hblkpa = hmeblkp->hblk_nextpa;
11552 hmeblkp = hmeblkp->hblk_next;
11553 }
11554
11555 SFMMU_HASH_UNLOCK(hmebp);
11556 if (hmebp++ == &uhme_hash[UHMEHASH_SZ])
11557 hmebp = uhme_hash;
11558 }
11559 uhmehash_steal_hand = hmebp;
11560
11561 if (hmeblkp != NULL)
11562 break;
11563
11564 /*
11565 * in the worst case, look for a free one in the kernel
11566 * hash table.
11567 */
11568 for (i = 0, hmebp = khme_hash; i <= KHMEHASH_SZ; i++) {
11569 SFMMU_HASH_LOCK(hmebp);
11570 hmeblkp = hmebp->hmeblkp;
11571 hblkpa = hmebp->hmeh_nextpa;
11572 pr_hblk = NULL;
11573 while (hmeblkp) {
11574 /*
11575 * check if it is free hmeblk
11576 */
11577 if ((get_hblk_ttesz(hmeblkp) == size) &&
11578 (hmeblkp->hblk_lckcnt == 0) &&
11579 (hmeblkp->hblk_vcnt == 0) &&
11580 (hmeblkp->hblk_hmecnt == 0)) {
11581 if (sfmmu_steal_this_hblk(hmebp,
11582 hmeblkp, hblkpa, pr_hblk)) {
11583 break;
11584 } else {
11585 /*
11586 * Cannot fail since we have
11587 * hash lock.
11588 */
11589 panic("fail to steal?");
11590 }
11591 }
11592
11593 pr_hblk = hmeblkp;
11594 hblkpa = hmeblkp->hblk_nextpa;
11595 hmeblkp = hmeblkp->hblk_next;
11596 }
11597
11598 SFMMU_HASH_UNLOCK(hmebp);
11599 if (hmebp++ == &khme_hash[KHMEHASH_SZ])
11600 hmebp = khme_hash;
11601 }
11602
11603 if (hmeblkp != NULL)
11604 break;
11605 sfmmu_hblk_steal_twice++;
11606 }
11607 return (hmeblkp);
11608 }
11609
11610 /*
11611 * This routine does real work to prepare a hblk to be "stolen" by
11612 * unloading the mappings, updating shadow counts ....
11613 * It returns 1 if the block is ready to be reused (stolen), or 0
11614 * means the block cannot be stolen yet- pageunload is still working
11615 * on this hblk.
11616 */
11617 static int
sfmmu_steal_this_hblk(struct hmehash_bucket * hmebp,struct hme_blk * hmeblkp,uint64_t hblkpa,struct hme_blk * pr_hblk)11618 sfmmu_steal_this_hblk(struct hmehash_bucket *hmebp, struct hme_blk *hmeblkp,
11619 uint64_t hblkpa, struct hme_blk *pr_hblk)
11620 {
11621 int shw_size, vshift;
11622 struct hme_blk *shw_hblkp;
11623 caddr_t vaddr;
11624 uint_t shw_mask, newshw_mask;
11625 struct hme_blk *list = NULL;
11626
11627 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));
11628
11629 /*
11630 * check if the hmeblk is free, unload if necessary
11631 */
11632 if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) {
11633 sfmmu_t *sfmmup;
11634 demap_range_t dmr;
11635
11636 sfmmup = hblktosfmmu(hmeblkp);
11637 if (hmeblkp->hblk_shared || sfmmup->sfmmu_ismhat) {
11638 return (0);
11639 }
11640 DEMAP_RANGE_INIT(sfmmup, &dmr);
11641 (void) sfmmu_hblk_unload(sfmmup, hmeblkp,
11642 (caddr_t)get_hblk_base(hmeblkp),
11643 get_hblk_endaddr(hmeblkp), &dmr, HAT_UNLOAD);
11644 DEMAP_RANGE_FLUSH(&dmr);
11645 if (hmeblkp->hblk_vcnt || hmeblkp->hblk_hmecnt) {
11646 /*
11647 * Pageunload is working on the same hblk.
11648 */
11649 return (0);
11650 }
11651
11652 sfmmu_hblk_steal_unload_count++;
11653 }
11654
11655 ASSERT(hmeblkp->hblk_lckcnt == 0);
11656 ASSERT(hmeblkp->hblk_vcnt == 0 && hmeblkp->hblk_hmecnt == 0);
11657
11658 sfmmu_hblk_hash_rm(hmebp, hmeblkp, pr_hblk, &list, 1);
11659 hmeblkp->hblk_nextpa = hblkpa;
11660
11661 shw_hblkp = hmeblkp->hblk_shadow;
11662 if (shw_hblkp) {
11663 ASSERT(!hmeblkp->hblk_shared);
11664 shw_size = get_hblk_ttesz(shw_hblkp);
11665 vaddr = (caddr_t)get_hblk_base(hmeblkp);
11666 vshift = vaddr_to_vshift(shw_hblkp->hblk_tag, vaddr, shw_size);
11667 ASSERT(vshift < 8);
11668 /*
11669 * Atomically clear shadow mask bit
11670 */
11671 do {
11672 shw_mask = shw_hblkp->hblk_shw_mask;
11673 ASSERT(shw_mask & (1 << vshift));
11674 newshw_mask = shw_mask & ~(1 << vshift);
11675 newshw_mask = atomic_cas_32(&shw_hblkp->hblk_shw_mask,
11676 shw_mask, newshw_mask);
11677 } while (newshw_mask != shw_mask);
11678 hmeblkp->hblk_shadow = NULL;
11679 }
11680
11681 /*
11682 * remove shadow bit if we are stealing an unused shadow hmeblk.
11683 * sfmmu_hblk_alloc needs it that way, will set shadow bit later if
11684 * we are indeed allocating a shadow hmeblk.
11685 */
11686 hmeblkp->hblk_shw_bit = 0;
11687
11688 if (hmeblkp->hblk_shared) {
11689 sf_srd_t *srdp;
11690 sf_region_t *rgnp;
11691 uint_t rid;
11692
11693 srdp = hblktosrd(hmeblkp);
11694 ASSERT(srdp != NULL && srdp->srd_refcnt != 0);
11695 rid = hmeblkp->hblk_tag.htag_rid;
11696 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
11697 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
11698 rgnp = srdp->srd_hmergnp[rid];
11699 ASSERT(rgnp != NULL);
11700 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid);
11701 hmeblkp->hblk_shared = 0;
11702 }
11703
11704 sfmmu_hblk_steal_count++;
11705 SFMMU_STAT(sf_steal_count);
11706
11707 return (1);
11708 }
11709
11710 struct hme_blk *
sfmmu_hmetohblk(struct sf_hment * sfhme)11711 sfmmu_hmetohblk(struct sf_hment *sfhme)
11712 {
11713 struct hme_blk *hmeblkp;
11714 struct sf_hment *sfhme0;
11715 struct hme_blk *hblk_dummy = 0;
11716
11717 /*
11718 * No dummy sf_hments, please.
11719 */
11720 ASSERT(sfhme->hme_tte.ll != 0);
11721
11722 sfhme0 = sfhme - sfhme->hme_tte.tte_hmenum;
11723 hmeblkp = (struct hme_blk *)((uintptr_t)sfhme0 -
11724 (uintptr_t)&hblk_dummy->hblk_hme[0]);
11725
11726 return (hmeblkp);
11727 }
11728
11729 /*
11730 * On swapin, get appropriately sized TSB(s) and clear the HAT_SWAPPED flag.
11731 * If we can't get appropriately sized TSB(s), try for 8K TSB(s) using
11732 * KM_SLEEP allocation.
11733 *
11734 * Return 0 on success, -1 otherwise.
11735 */
11736 static void
sfmmu_tsb_swapin(sfmmu_t * sfmmup,hatlock_t * hatlockp)11737 sfmmu_tsb_swapin(sfmmu_t *sfmmup, hatlock_t *hatlockp)
11738 {
11739 struct tsb_info *tsbinfop, *next;
11740 tsb_replace_rc_t rc;
11741 boolean_t gotfirst = B_FALSE;
11742
11743 ASSERT(sfmmup != ksfmmup);
11744 ASSERT(sfmmu_hat_lock_held(sfmmup));
11745
11746 while (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPIN)) {
11747 cv_wait(&sfmmup->sfmmu_tsb_cv, HATLOCK_MUTEXP(hatlockp));
11748 }
11749
11750 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) {
11751 SFMMU_FLAGS_SET(sfmmup, HAT_SWAPIN);
11752 } else {
11753 return;
11754 }
11755
11756 ASSERT(sfmmup->sfmmu_tsb != NULL);
11757
11758 /*
11759 * Loop over all tsbinfo's replacing them with ones that actually have
11760 * a TSB. If any of the replacements ever fail, bail out of the loop.
11761 */
11762 for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL; tsbinfop = next) {
11763 ASSERT(tsbinfop->tsb_flags & TSB_SWAPPED);
11764 next = tsbinfop->tsb_next;
11765 rc = sfmmu_replace_tsb(sfmmup, tsbinfop, tsbinfop->tsb_szc,
11766 hatlockp, TSB_SWAPIN);
11767 if (rc != TSB_SUCCESS) {
11768 break;
11769 }
11770 gotfirst = B_TRUE;
11771 }
11772
11773 switch (rc) {
11774 case TSB_SUCCESS:
11775 SFMMU_FLAGS_CLEAR(sfmmup, HAT_SWAPPED|HAT_SWAPIN);
11776 cv_broadcast(&sfmmup->sfmmu_tsb_cv);
11777 return;
11778 case TSB_LOSTRACE:
11779 break;
11780 case TSB_ALLOCFAIL:
11781 break;
11782 default:
11783 panic("sfmmu_replace_tsb returned unrecognized failure code "
11784 "%d", rc);
11785 }
11786
11787 /*
11788 * In this case, we failed to get one of our TSBs. If we failed to
11789 * get the first TSB, get one of minimum size (8KB). Walk the list
11790 * and throw away the tsbinfos, starting where the allocation failed;
11791 * we can get by with just one TSB as long as we don't leave the
11792 * SWAPPED tsbinfo structures lying around.
11793 */
11794 tsbinfop = sfmmup->sfmmu_tsb;
11795 next = tsbinfop->tsb_next;
11796 tsbinfop->tsb_next = NULL;
11797
11798 sfmmu_hat_exit(hatlockp);
11799 for (tsbinfop = next; tsbinfop != NULL; tsbinfop = next) {
11800 next = tsbinfop->tsb_next;
11801 sfmmu_tsbinfo_free(tsbinfop);
11802 }
11803 hatlockp = sfmmu_hat_enter(sfmmup);
11804
11805 /*
11806 * If we don't have any TSBs, get a single 8K TSB for 8K, 64K and 512K
11807 * pages.
11808 */
11809 if (!gotfirst) {
11810 tsbinfop = sfmmup->sfmmu_tsb;
11811 rc = sfmmu_replace_tsb(sfmmup, tsbinfop, TSB_MIN_SZCODE,
11812 hatlockp, TSB_SWAPIN | TSB_FORCEALLOC);
11813 ASSERT(rc == TSB_SUCCESS);
11814 }
11815
11816 SFMMU_FLAGS_CLEAR(sfmmup, HAT_SWAPPED|HAT_SWAPIN);
11817 cv_broadcast(&sfmmup->sfmmu_tsb_cv);
11818 }
11819
11820 static int
sfmmu_is_rgnva(sf_srd_t * srdp,caddr_t addr,ulong_t w,ulong_t bmw)11821 sfmmu_is_rgnva(sf_srd_t *srdp, caddr_t addr, ulong_t w, ulong_t bmw)
11822 {
11823 ulong_t bix = 0;
11824 uint_t rid;
11825 sf_region_t *rgnp;
11826
11827 ASSERT(srdp != NULL);
11828 ASSERT(srdp->srd_refcnt != 0);
11829
11830 w <<= BT_ULSHIFT;
11831 while (bmw) {
11832 if (!(bmw & 0x1)) {
11833 bix++;
11834 bmw >>= 1;
11835 continue;
11836 }
11837 rid = w | bix;
11838 rgnp = srdp->srd_hmergnp[rid];
11839 ASSERT(rgnp->rgn_refcnt > 0);
11840 ASSERT(rgnp->rgn_id == rid);
11841 if (addr < rgnp->rgn_saddr ||
11842 addr >= (rgnp->rgn_saddr + rgnp->rgn_size)) {
11843 bix++;
11844 bmw >>= 1;
11845 } else {
11846 return (1);
11847 }
11848 }
11849 return (0);
11850 }
11851
11852 /*
11853 * Handle exceptions for low level tsb_handler.
11854 *
11855 * There are many scenarios that could land us here:
11856 *
11857 * If the context is invalid we land here. The context can be invalid
11858 * for 3 reasons: 1) we couldn't allocate a new context and now need to
11859 * perform a wrap around operation in order to allocate a new context.
11860 * 2) Context was invalidated to change pagesize programming 3) ISMs or
11861 * TSBs configuration is changeing for this process and we are forced into
11862 * here to do a syncronization operation. If the context is valid we can
11863 * be here from window trap hanlder. In this case just call trap to handle
11864 * the fault.
11865 *
11866 * Note that the process will run in INVALID_CONTEXT before
11867 * faulting into here and subsequently loading the MMU registers
11868 * (including the TSB base register) associated with this process.
11869 * For this reason, the trap handlers must all test for
11870 * INVALID_CONTEXT before attempting to access any registers other
11871 * than the context registers.
11872 */
11873 void
sfmmu_tsbmiss_exception(struct regs * rp,uintptr_t tagaccess,uint_t traptype)11874 sfmmu_tsbmiss_exception(struct regs *rp, uintptr_t tagaccess, uint_t traptype)
11875 {
11876 sfmmu_t *sfmmup, *shsfmmup;
11877 uint_t ctxtype;
11878 klwp_id_t lwp;
11879 char lwp_save_state;
11880 hatlock_t *hatlockp, *shatlockp;
11881 struct tsb_info *tsbinfop;
11882 struct tsbmiss *tsbmp;
11883 sf_scd_t *scdp;
11884
11885 SFMMU_STAT(sf_tsb_exceptions);
11886 SFMMU_MMU_STAT(mmu_tsb_exceptions);
11887 sfmmup = astosfmmu(curthread->t_procp->p_as);
11888 /*
11889 * note that in sun4u, tagacces register contains ctxnum
11890 * while sun4v passes ctxtype in the tagaccess register.
11891 */
11892 ctxtype = tagaccess & TAGACC_CTX_MASK;
11893
11894 ASSERT(sfmmup != ksfmmup && ctxtype != KCONTEXT);
11895 ASSERT(sfmmup->sfmmu_ismhat == 0);
11896 ASSERT(!SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED) ||
11897 ctxtype == INVALID_CONTEXT);
11898
11899 if (ctxtype != INVALID_CONTEXT && traptype != T_DATA_PROT) {
11900 /*
11901 * We may land here because shme bitmap and pagesize
11902 * flags are updated lazily in tsbmiss area on other cpus.
11903 * If we detect here that tsbmiss area is out of sync with
11904 * sfmmu update it and retry the trapped instruction.
11905 * Otherwise call trap().
11906 */
11907 int ret = 0;
11908 uchar_t tteflag_mask = (1 << TTE64K) | (1 << TTE8K);
11909 caddr_t addr = (caddr_t)(tagaccess & TAGACC_VADDR_MASK);
11910
11911 /*
11912 * Must set lwp state to LWP_SYS before
11913 * trying to acquire any adaptive lock
11914 */
11915 lwp = ttolwp(curthread);
11916 ASSERT(lwp);
11917 lwp_save_state = lwp->lwp_state;
11918 lwp->lwp_state = LWP_SYS;
11919
11920 hatlockp = sfmmu_hat_enter(sfmmup);
11921 kpreempt_disable();
11922 tsbmp = &tsbmiss_area[CPU->cpu_id];
11923 ASSERT(sfmmup == tsbmp->usfmmup);
11924 if (((tsbmp->uhat_tteflags ^ sfmmup->sfmmu_tteflags) &
11925 ~tteflag_mask) ||
11926 ((tsbmp->uhat_rtteflags ^ sfmmup->sfmmu_rtteflags) &
11927 ~tteflag_mask)) {
11928 tsbmp->uhat_tteflags = sfmmup->sfmmu_tteflags;
11929 tsbmp->uhat_rtteflags = sfmmup->sfmmu_rtteflags;
11930 ret = 1;
11931 }
11932 if (sfmmup->sfmmu_srdp != NULL) {
11933 ulong_t *sm = sfmmup->sfmmu_hmeregion_map.bitmap;
11934 ulong_t *tm = tsbmp->shmermap;
11935 ulong_t i;
11936 for (i = 0; i < SFMMU_HMERGNMAP_WORDS; i++) {
11937 ulong_t d = tm[i] ^ sm[i];
11938 if (d) {
11939 if (d & sm[i]) {
11940 if (!ret && sfmmu_is_rgnva(
11941 sfmmup->sfmmu_srdp,
11942 addr, i, d & sm[i])) {
11943 ret = 1;
11944 }
11945 }
11946 tm[i] = sm[i];
11947 }
11948 }
11949 }
11950 kpreempt_enable();
11951 sfmmu_hat_exit(hatlockp);
11952 lwp->lwp_state = lwp_save_state;
11953 if (ret) {
11954 return;
11955 }
11956 } else if (ctxtype == INVALID_CONTEXT) {
11957 /*
11958 * First, make sure we come out of here with a valid ctx,
11959 * since if we don't get one we'll simply loop on the
11960 * faulting instruction.
11961 *
11962 * If the ISM mappings are changing, the TSB is relocated,
11963 * the process is swapped, the process is joining SCD or
11964 * leaving SCD or shared regions we serialize behind the
11965 * controlling thread with hat lock, sfmmu_flags and
11966 * sfmmu_tsb_cv condition variable.
11967 */
11968
11969 /*
11970 * Must set lwp state to LWP_SYS before
11971 * trying to acquire any adaptive lock
11972 */
11973 lwp = ttolwp(curthread);
11974 ASSERT(lwp);
11975 lwp_save_state = lwp->lwp_state;
11976 lwp->lwp_state = LWP_SYS;
11977
11978 hatlockp = sfmmu_hat_enter(sfmmup);
11979 retry:
11980 if ((scdp = sfmmup->sfmmu_scdp) != NULL) {
11981 shsfmmup = scdp->scd_sfmmup;
11982 ASSERT(shsfmmup != NULL);
11983
11984 for (tsbinfop = shsfmmup->sfmmu_tsb; tsbinfop != NULL;
11985 tsbinfop = tsbinfop->tsb_next) {
11986 if (tsbinfop->tsb_flags & TSB_RELOC_FLAG) {
11987 /* drop the private hat lock */
11988 sfmmu_hat_exit(hatlockp);
11989 /* acquire the shared hat lock */
11990 shatlockp = sfmmu_hat_enter(shsfmmup);
11991 /*
11992 * recheck to see if anything changed
11993 * after we drop the private hat lock.
11994 */
11995 if (sfmmup->sfmmu_scdp == scdp &&
11996 shsfmmup == scdp->scd_sfmmup) {
11997 sfmmu_tsb_chk_reloc(shsfmmup,
11998 shatlockp);
11999 }
12000 sfmmu_hat_exit(shatlockp);
12001 hatlockp = sfmmu_hat_enter(sfmmup);
12002 goto retry;
12003 }
12004 }
12005 }
12006
12007 for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL;
12008 tsbinfop = tsbinfop->tsb_next) {
12009 if (tsbinfop->tsb_flags & TSB_RELOC_FLAG) {
12010 cv_wait(&sfmmup->sfmmu_tsb_cv,
12011 HATLOCK_MUTEXP(hatlockp));
12012 goto retry;
12013 }
12014 }
12015
12016 /*
12017 * Wait for ISM maps to be updated.
12018 */
12019 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)) {
12020 cv_wait(&sfmmup->sfmmu_tsb_cv,
12021 HATLOCK_MUTEXP(hatlockp));
12022 goto retry;
12023 }
12024
12025 /* Is this process joining an SCD? */
12026 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD)) {
12027 /*
12028 * Flush private TSB and setup shared TSB.
12029 * sfmmu_finish_join_scd() does not drop the
12030 * hat lock.
12031 */
12032 sfmmu_finish_join_scd(sfmmup);
12033 SFMMU_FLAGS_CLEAR(sfmmup, HAT_JOIN_SCD);
12034 }
12035
12036 /*
12037 * If we're swapping in, get TSB(s). Note that we must do
12038 * this before we get a ctx or load the MMU state. Once
12039 * we swap in we have to recheck to make sure the TSB(s) and
12040 * ISM mappings didn't change while we slept.
12041 */
12042 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_SWAPPED)) {
12043 sfmmu_tsb_swapin(sfmmup, hatlockp);
12044 goto retry;
12045 }
12046
12047 sfmmu_get_ctx(sfmmup);
12048
12049 sfmmu_hat_exit(hatlockp);
12050 /*
12051 * Must restore lwp_state if not calling
12052 * trap() for further processing. Restore
12053 * it anyway.
12054 */
12055 lwp->lwp_state = lwp_save_state;
12056 return;
12057 }
12058 trap(rp, (caddr_t)tagaccess, traptype, 0);
12059 }
12060
12061 static void
sfmmu_tsb_chk_reloc(sfmmu_t * sfmmup,hatlock_t * hatlockp)12062 sfmmu_tsb_chk_reloc(sfmmu_t *sfmmup, hatlock_t *hatlockp)
12063 {
12064 struct tsb_info *tp;
12065
12066 ASSERT(sfmmu_hat_lock_held(sfmmup));
12067
12068 for (tp = sfmmup->sfmmu_tsb; tp != NULL; tp = tp->tsb_next) {
12069 if (tp->tsb_flags & TSB_RELOC_FLAG) {
12070 cv_wait(&sfmmup->sfmmu_tsb_cv,
12071 HATLOCK_MUTEXP(hatlockp));
12072 break;
12073 }
12074 }
12075 }
12076
12077 /*
12078 * sfmmu_vatopfn_suspended is called from GET_TTE when TL=0 and
12079 * TTE_SUSPENDED bit set in tte we block on aquiring a page lock
12080 * rather than spinning to avoid send mondo timeouts with
12081 * interrupts enabled. When the lock is acquired it is immediately
12082 * released and we return back to sfmmu_vatopfn just after
12083 * the GET_TTE call.
12084 */
12085 void
sfmmu_vatopfn_suspended(caddr_t vaddr,sfmmu_t * sfmmu,tte_t * ttep)12086 sfmmu_vatopfn_suspended(caddr_t vaddr, sfmmu_t *sfmmu, tte_t *ttep)
12087 {
12088 struct page **pp;
12089
12090 (void) as_pagelock(sfmmu->sfmmu_as, &pp, vaddr, TTE_CSZ(ttep), S_WRITE);
12091 as_pageunlock(sfmmu->sfmmu_as, pp, vaddr, TTE_CSZ(ttep), S_WRITE);
12092 }
12093
12094 /*
12095 * sfmmu_tsbmiss_suspended is called from GET_TTE when TL>0 and
12096 * TTE_SUSPENDED bit set in tte. We do this so that we can handle
12097 * cross traps which cannot be handled while spinning in the
12098 * trap handlers. Simply enter and exit the kpr_suspendlock spin
12099 * mutex, which is held by the holder of the suspend bit, and then
12100 * retry the trapped instruction after unwinding.
12101 */
12102 /*ARGSUSED*/
12103 void
sfmmu_tsbmiss_suspended(struct regs * rp,uintptr_t tagacc,uint_t traptype)12104 sfmmu_tsbmiss_suspended(struct regs *rp, uintptr_t tagacc, uint_t traptype)
12105 {
12106 ASSERT(curthread != kreloc_thread);
12107 mutex_enter(&kpr_suspendlock);
12108 mutex_exit(&kpr_suspendlock);
12109 }
12110
12111 /*
12112 * This routine could be optimized to reduce the number of xcalls by flushing
12113 * the entire TLBs if region reference count is above some threshold but the
12114 * tradeoff will depend on the size of the TLB. So for now flush the specific
12115 * page a context at a time.
12116 *
12117 * If uselocks is 0 then it's called after all cpus were captured and all the
12118 * hat locks were taken. In this case don't take the region lock by relying on
12119 * the order of list region update operations in hat_join_region(),
12120 * hat_leave_region() and hat_dup_region(). The ordering in those routines
12121 * guarantees that list is always forward walkable and reaches active sfmmus
12122 * regardless of where xc_attention() captures a cpu.
12123 */
12124 cpuset_t
sfmmu_rgntlb_demap(caddr_t addr,sf_region_t * rgnp,struct hme_blk * hmeblkp,int uselocks)12125 sfmmu_rgntlb_demap(caddr_t addr, sf_region_t *rgnp,
12126 struct hme_blk *hmeblkp, int uselocks)
12127 {
12128 sfmmu_t *sfmmup;
12129 cpuset_t cpuset;
12130 cpuset_t rcpuset;
12131 hatlock_t *hatlockp;
12132 uint_t rid = rgnp->rgn_id;
12133 sf_rgn_link_t *rlink;
12134 sf_scd_t *scdp;
12135
12136 ASSERT(hmeblkp->hblk_shared);
12137 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
12138 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
12139
12140 CPUSET_ZERO(rcpuset);
12141 if (uselocks) {
12142 mutex_enter(&rgnp->rgn_mutex);
12143 }
12144 sfmmup = rgnp->rgn_sfmmu_head;
12145 while (sfmmup != NULL) {
12146 if (uselocks) {
12147 hatlockp = sfmmu_hat_enter(sfmmup);
12148 }
12149
12150 /*
12151 * When an SCD is created the SCD hat is linked on the sfmmu
12152 * region lists for each hme region which is part of the
12153 * SCD. If we find an SCD hat, when walking these lists,
12154 * then we flush the shared TSBs, if we find a private hat,
12155 * which is part of an SCD, but where the region
12156 * is not part of the SCD then we flush the private TSBs.
12157 */
12158 if (!sfmmup->sfmmu_scdhat && sfmmup->sfmmu_scdp != NULL &&
12159 !SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD)) {
12160 scdp = sfmmup->sfmmu_scdp;
12161 if (SF_RGNMAP_TEST(scdp->scd_hmeregion_map, rid)) {
12162 if (uselocks) {
12163 sfmmu_hat_exit(hatlockp);
12164 }
12165 goto next;
12166 }
12167 }
12168
12169 SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, 0);
12170
12171 kpreempt_disable();
12172 cpuset = sfmmup->sfmmu_cpusran;
12173 CPUSET_AND(cpuset, cpu_ready_set);
12174 CPUSET_DEL(cpuset, CPU->cpu_id);
12175 SFMMU_XCALL_STATS(sfmmup);
12176 xt_some(cpuset, vtag_flushpage_tl1,
12177 (uint64_t)addr, (uint64_t)sfmmup);
12178 vtag_flushpage(addr, (uint64_t)sfmmup);
12179 if (uselocks) {
12180 sfmmu_hat_exit(hatlockp);
12181 }
12182 kpreempt_enable();
12183 CPUSET_OR(rcpuset, cpuset);
12184
12185 next:
12186 /* LINTED: constant in conditional context */
12187 SFMMU_HMERID2RLINKP(sfmmup, rid, rlink, 0, 0);
12188 ASSERT(rlink != NULL);
12189 sfmmup = rlink->next;
12190 }
12191 if (uselocks) {
12192 mutex_exit(&rgnp->rgn_mutex);
12193 }
12194 return (rcpuset);
12195 }
12196
12197 /*
12198 * This routine takes an sfmmu pointer and the va for an adddress in an
12199 * ISM region as input and returns the corresponding region id in ism_rid.
12200 * The return value of 1 indicates that a region has been found and ism_rid
12201 * is valid, otherwise 0 is returned.
12202 */
12203 static int
find_ism_rid(sfmmu_t * sfmmup,sfmmu_t * ism_sfmmup,caddr_t va,uint_t * ism_rid)12204 find_ism_rid(sfmmu_t *sfmmup, sfmmu_t *ism_sfmmup, caddr_t va, uint_t *ism_rid)
12205 {
12206 ism_blk_t *ism_blkp;
12207 int i;
12208 ism_map_t *ism_map;
12209 #ifdef DEBUG
12210 struct hat *ism_hatid;
12211 #endif
12212 ASSERT(sfmmu_hat_lock_held(sfmmup));
12213
12214 ism_blkp = sfmmup->sfmmu_iblk;
12215 while (ism_blkp != NULL) {
12216 ism_map = ism_blkp->iblk_maps;
12217 for (i = 0; i < ISM_MAP_SLOTS && ism_map[i].imap_ismhat; i++) {
12218 if ((va >= ism_start(ism_map[i])) &&
12219 (va < ism_end(ism_map[i]))) {
12220
12221 *ism_rid = ism_map[i].imap_rid;
12222 #ifdef DEBUG
12223 ism_hatid = ism_map[i].imap_ismhat;
12224 ASSERT(ism_hatid == ism_sfmmup);
12225 ASSERT(ism_hatid->sfmmu_ismhat);
12226 #endif
12227 return (1);
12228 }
12229 }
12230 ism_blkp = ism_blkp->iblk_next;
12231 }
12232 return (0);
12233 }
12234
12235 /*
12236 * Special routine to flush out ism mappings- TSBs, TLBs and D-caches.
12237 * This routine may be called with all cpu's captured. Therefore, the
12238 * caller is responsible for holding all locks and disabling kernel
12239 * preemption.
12240 */
12241 /* ARGSUSED */
12242 static void
sfmmu_ismtlbcache_demap(caddr_t addr,sfmmu_t * ism_sfmmup,struct hme_blk * hmeblkp,pfn_t pfnum,int cache_flush_flag)12243 sfmmu_ismtlbcache_demap(caddr_t addr, sfmmu_t *ism_sfmmup,
12244 struct hme_blk *hmeblkp, pfn_t pfnum, int cache_flush_flag)
12245 {
12246 cpuset_t cpuset;
12247 caddr_t va;
12248 ism_ment_t *ment;
12249 sfmmu_t *sfmmup;
12250 #ifdef VAC
12251 int vcolor;
12252 #endif
12253
12254 sf_scd_t *scdp;
12255 uint_t ism_rid;
12256
12257 ASSERT(!hmeblkp->hblk_shared);
12258 /*
12259 * Walk the ism_hat's mapping list and flush the page
12260 * from every hat sharing this ism_hat. This routine
12261 * may be called while all cpu's have been captured.
12262 * Therefore we can't attempt to grab any locks. For now
12263 * this means we will protect the ism mapping list under
12264 * a single lock which will be grabbed by the caller.
12265 * If hat_share/unshare scalibility becomes a performance
12266 * problem then we may need to re-think ism mapping list locking.
12267 */
12268 ASSERT(ism_sfmmup->sfmmu_ismhat);
12269 ASSERT(MUTEX_HELD(&ism_mlist_lock));
12270 addr = addr - ISMID_STARTADDR;
12271
12272 for (ment = ism_sfmmup->sfmmu_iment; ment; ment = ment->iment_next) {
12273
12274 sfmmup = ment->iment_hat;
12275
12276 va = ment->iment_base_va;
12277 va = (caddr_t)((uintptr_t)va + (uintptr_t)addr);
12278
12279 /*
12280 * When an SCD is created the SCD hat is linked on the ism
12281 * mapping lists for each ISM segment which is part of the
12282 * SCD. If we find an SCD hat, when walking these lists,
12283 * then we flush the shared TSBs, if we find a private hat,
12284 * which is part of an SCD, but where the region
12285 * corresponding to this va is not part of the SCD then we
12286 * flush the private TSBs.
12287 */
12288 if (!sfmmup->sfmmu_scdhat && sfmmup->sfmmu_scdp != NULL &&
12289 !SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD) &&
12290 !SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)) {
12291 if (!find_ism_rid(sfmmup, ism_sfmmup, va,
12292 &ism_rid)) {
12293 cmn_err(CE_PANIC,
12294 "can't find matching ISM rid!");
12295 }
12296
12297 scdp = sfmmup->sfmmu_scdp;
12298 if (SFMMU_IS_ISMRID_VALID(ism_rid) &&
12299 SF_RGNMAP_TEST(scdp->scd_ismregion_map,
12300 ism_rid)) {
12301 continue;
12302 }
12303 }
12304 SFMMU_UNLOAD_TSB(va, sfmmup, hmeblkp, 1);
12305
12306 cpuset = sfmmup->sfmmu_cpusran;
12307 CPUSET_AND(cpuset, cpu_ready_set);
12308 CPUSET_DEL(cpuset, CPU->cpu_id);
12309 SFMMU_XCALL_STATS(sfmmup);
12310 xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)va,
12311 (uint64_t)sfmmup);
12312 vtag_flushpage(va, (uint64_t)sfmmup);
12313
12314 #ifdef VAC
12315 /*
12316 * Flush D$
12317 * When flushing D$ we must flush all
12318 * cpu's. See sfmmu_cache_flush().
12319 */
12320 if (cache_flush_flag == CACHE_FLUSH) {
12321 cpuset = cpu_ready_set;
12322 CPUSET_DEL(cpuset, CPU->cpu_id);
12323
12324 SFMMU_XCALL_STATS(sfmmup);
12325 vcolor = addr_to_vcolor(va);
12326 xt_some(cpuset, vac_flushpage_tl1, pfnum, vcolor);
12327 vac_flushpage(pfnum, vcolor);
12328 }
12329 #endif /* VAC */
12330 }
12331 }
12332
12333 /*
12334 * Demaps the TSB, CPU caches, and flushes all TLBs on all CPUs of
12335 * a particular virtual address and ctx. If noflush is set we do not
12336 * flush the TLB/TSB. This function may or may not be called with the
12337 * HAT lock held.
12338 */
12339 static void
sfmmu_tlbcache_demap(caddr_t addr,sfmmu_t * sfmmup,struct hme_blk * hmeblkp,pfn_t pfnum,int tlb_noflush,int cpu_flag,int cache_flush_flag,int hat_lock_held)12340 sfmmu_tlbcache_demap(caddr_t addr, sfmmu_t *sfmmup, struct hme_blk *hmeblkp,
12341 pfn_t pfnum, int tlb_noflush, int cpu_flag, int cache_flush_flag,
12342 int hat_lock_held)
12343 {
12344 #ifdef VAC
12345 int vcolor;
12346 #endif
12347 cpuset_t cpuset;
12348 hatlock_t *hatlockp;
12349
12350 ASSERT(!hmeblkp->hblk_shared);
12351
12352 #if defined(lint) && !defined(VAC)
12353 pfnum = pfnum;
12354 cpu_flag = cpu_flag;
12355 cache_flush_flag = cache_flush_flag;
12356 #endif
12357
12358 /*
12359 * There is no longer a need to protect against ctx being
12360 * stolen here since we don't store the ctx in the TSB anymore.
12361 */
12362 #ifdef VAC
12363 vcolor = addr_to_vcolor(addr);
12364 #endif
12365
12366 /*
12367 * We must hold the hat lock during the flush of TLB,
12368 * to avoid a race with sfmmu_invalidate_ctx(), where
12369 * sfmmu_cnum on a MMU could be set to INVALID_CONTEXT,
12370 * causing TLB demap routine to skip flush on that MMU.
12371 * If the context on a MMU has already been set to
12372 * INVALID_CONTEXT, we just get an extra flush on
12373 * that MMU.
12374 */
12375 if (!hat_lock_held && !tlb_noflush)
12376 hatlockp = sfmmu_hat_enter(sfmmup);
12377
12378 kpreempt_disable();
12379 if (!tlb_noflush) {
12380 /*
12381 * Flush the TSB and TLB.
12382 */
12383 SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, 0);
12384
12385 cpuset = sfmmup->sfmmu_cpusran;
12386 CPUSET_AND(cpuset, cpu_ready_set);
12387 CPUSET_DEL(cpuset, CPU->cpu_id);
12388
12389 SFMMU_XCALL_STATS(sfmmup);
12390
12391 xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)addr,
12392 (uint64_t)sfmmup);
12393
12394 vtag_flushpage(addr, (uint64_t)sfmmup);
12395 }
12396
12397 if (!hat_lock_held && !tlb_noflush)
12398 sfmmu_hat_exit(hatlockp);
12399
12400 #ifdef VAC
12401 /*
12402 * Flush the D$
12403 *
12404 * Even if the ctx is stolen, we need to flush the
12405 * cache. Our ctx stealer only flushes the TLBs.
12406 */
12407 if (cache_flush_flag == CACHE_FLUSH) {
12408 if (cpu_flag & FLUSH_ALL_CPUS) {
12409 cpuset = cpu_ready_set;
12410 } else {
12411 cpuset = sfmmup->sfmmu_cpusran;
12412 CPUSET_AND(cpuset, cpu_ready_set);
12413 }
12414 CPUSET_DEL(cpuset, CPU->cpu_id);
12415 SFMMU_XCALL_STATS(sfmmup);
12416 xt_some(cpuset, vac_flushpage_tl1, pfnum, vcolor);
12417 vac_flushpage(pfnum, vcolor);
12418 }
12419 #endif /* VAC */
12420 kpreempt_enable();
12421 }
12422
12423 /*
12424 * Demaps the TSB and flushes all TLBs on all cpus for a particular virtual
12425 * address and ctx. If noflush is set we do not currently do anything.
12426 * This function may or may not be called with the HAT lock held.
12427 */
12428 static void
sfmmu_tlb_demap(caddr_t addr,sfmmu_t * sfmmup,struct hme_blk * hmeblkp,int tlb_noflush,int hat_lock_held)12429 sfmmu_tlb_demap(caddr_t addr, sfmmu_t *sfmmup, struct hme_blk *hmeblkp,
12430 int tlb_noflush, int hat_lock_held)
12431 {
12432 cpuset_t cpuset;
12433 hatlock_t *hatlockp;
12434
12435 ASSERT(!hmeblkp->hblk_shared);
12436
12437 /*
12438 * If the process is exiting we have nothing to do.
12439 */
12440 if (tlb_noflush)
12441 return;
12442
12443 /*
12444 * Flush TSB.
12445 */
12446 if (!hat_lock_held)
12447 hatlockp = sfmmu_hat_enter(sfmmup);
12448 SFMMU_UNLOAD_TSB(addr, sfmmup, hmeblkp, 0);
12449
12450 kpreempt_disable();
12451
12452 cpuset = sfmmup->sfmmu_cpusran;
12453 CPUSET_AND(cpuset, cpu_ready_set);
12454 CPUSET_DEL(cpuset, CPU->cpu_id);
12455
12456 SFMMU_XCALL_STATS(sfmmup);
12457 xt_some(cpuset, vtag_flushpage_tl1, (uint64_t)addr, (uint64_t)sfmmup);
12458
12459 vtag_flushpage(addr, (uint64_t)sfmmup);
12460
12461 if (!hat_lock_held)
12462 sfmmu_hat_exit(hatlockp);
12463
12464 kpreempt_enable();
12465
12466 }
12467
12468 /*
12469 * Special case of sfmmu_tlb_demap for MMU_PAGESIZE hblks. Use the xcall
12470 * call handler that can flush a range of pages to save on xcalls.
12471 */
12472 static int sfmmu_xcall_save;
12473
12474 /*
12475 * this routine is never used for demaping addresses backed by SRD hmeblks.
12476 */
12477 static void
sfmmu_tlb_range_demap(demap_range_t * dmrp)12478 sfmmu_tlb_range_demap(demap_range_t *dmrp)
12479 {
12480 sfmmu_t *sfmmup = dmrp->dmr_sfmmup;
12481 hatlock_t *hatlockp;
12482 cpuset_t cpuset;
12483 uint64_t sfmmu_pgcnt;
12484 pgcnt_t pgcnt = 0;
12485 int pgunload = 0;
12486 int dirtypg = 0;
12487 caddr_t addr = dmrp->dmr_addr;
12488 caddr_t eaddr;
12489 uint64_t bitvec = dmrp->dmr_bitvec;
12490
12491 ASSERT(bitvec & 1);
12492
12493 /*
12494 * Flush TSB and calculate number of pages to flush.
12495 */
12496 while (bitvec != 0) {
12497 dirtypg = 0;
12498 /*
12499 * Find the first page to flush and then count how many
12500 * pages there are after it that also need to be flushed.
12501 * This way the number of TSB flushes is minimized.
12502 */
12503 while ((bitvec & 1) == 0) {
12504 pgcnt++;
12505 addr += MMU_PAGESIZE;
12506 bitvec >>= 1;
12507 }
12508 while (bitvec & 1) {
12509 dirtypg++;
12510 bitvec >>= 1;
12511 }
12512 eaddr = addr + ptob(dirtypg);
12513 hatlockp = sfmmu_hat_enter(sfmmup);
12514 sfmmu_unload_tsb_range(sfmmup, addr, eaddr, TTE8K);
12515 sfmmu_hat_exit(hatlockp);
12516 pgunload += dirtypg;
12517 addr = eaddr;
12518 pgcnt += dirtypg;
12519 }
12520
12521 ASSERT((pgcnt<<MMU_PAGESHIFT) <= dmrp->dmr_endaddr - dmrp->dmr_addr);
12522 if (sfmmup->sfmmu_free == 0) {
12523 addr = dmrp->dmr_addr;
12524 bitvec = dmrp->dmr_bitvec;
12525
12526 /*
12527 * make sure it has SFMMU_PGCNT_SHIFT bits only,
12528 * as it will be used to pack argument for xt_some
12529 */
12530 ASSERT((pgcnt > 0) &&
12531 (pgcnt <= (1 << SFMMU_PGCNT_SHIFT)));
12532
12533 /*
12534 * Encode pgcnt as (pgcnt -1 ), and pass (pgcnt - 1) in
12535 * the low 6 bits of sfmmup. This is doable since pgcnt
12536 * always >= 1.
12537 */
12538 ASSERT(!((uint64_t)sfmmup & SFMMU_PGCNT_MASK));
12539 sfmmu_pgcnt = (uint64_t)sfmmup |
12540 ((pgcnt - 1) & SFMMU_PGCNT_MASK);
12541
12542 /*
12543 * We must hold the hat lock during the flush of TLB,
12544 * to avoid a race with sfmmu_invalidate_ctx(), where
12545 * sfmmu_cnum on a MMU could be set to INVALID_CONTEXT,
12546 * causing TLB demap routine to skip flush on that MMU.
12547 * If the context on a MMU has already been set to
12548 * INVALID_CONTEXT, we just get an extra flush on
12549 * that MMU.
12550 */
12551 hatlockp = sfmmu_hat_enter(sfmmup);
12552 kpreempt_disable();
12553
12554 cpuset = sfmmup->sfmmu_cpusran;
12555 CPUSET_AND(cpuset, cpu_ready_set);
12556 CPUSET_DEL(cpuset, CPU->cpu_id);
12557
12558 SFMMU_XCALL_STATS(sfmmup);
12559 xt_some(cpuset, vtag_flush_pgcnt_tl1, (uint64_t)addr,
12560 sfmmu_pgcnt);
12561
12562 for (; bitvec != 0; bitvec >>= 1) {
12563 if (bitvec & 1)
12564 vtag_flushpage(addr, (uint64_t)sfmmup);
12565 addr += MMU_PAGESIZE;
12566 }
12567 kpreempt_enable();
12568 sfmmu_hat_exit(hatlockp);
12569
12570 sfmmu_xcall_save += (pgunload-1);
12571 }
12572 dmrp->dmr_bitvec = 0;
12573 }
12574
12575 /*
12576 * In cases where we need to synchronize with TLB/TSB miss trap
12577 * handlers, _and_ need to flush the TLB, it's a lot easier to
12578 * throw away the context from the process than to do a
12579 * special song and dance to keep things consistent for the
12580 * handlers.
12581 *
12582 * Since the process suddenly ends up without a context and our caller
12583 * holds the hat lock, threads that fault after this function is called
12584 * will pile up on the lock. We can then do whatever we need to
12585 * atomically from the context of the caller. The first blocked thread
12586 * to resume executing will get the process a new context, and the
12587 * process will resume executing.
12588 *
12589 * One added advantage of this approach is that on MMUs that
12590 * support a "flush all" operation, we will delay the flush until
12591 * cnum wrap-around, and then flush the TLB one time. This
12592 * is rather rare, so it's a lot less expensive than making 8000
12593 * x-calls to flush the TLB 8000 times.
12594 *
12595 * A per-process (PP) lock is used to synchronize ctx allocations in
12596 * resume() and ctx invalidations here.
12597 */
12598 static void
sfmmu_invalidate_ctx(sfmmu_t * sfmmup)12599 sfmmu_invalidate_ctx(sfmmu_t *sfmmup)
12600 {
12601 cpuset_t cpuset;
12602 int cnum, currcnum;
12603 mmu_ctx_t *mmu_ctxp;
12604 int i;
12605 uint_t pstate_save;
12606
12607 SFMMU_STAT(sf_ctx_inv);
12608
12609 ASSERT(sfmmu_hat_lock_held(sfmmup));
12610 ASSERT(sfmmup != ksfmmup);
12611
12612 kpreempt_disable();
12613
12614 mmu_ctxp = CPU_MMU_CTXP(CPU);
12615 ASSERT(mmu_ctxp);
12616 ASSERT(mmu_ctxp->mmu_idx < max_mmu_ctxdoms);
12617 ASSERT(mmu_ctxp == mmu_ctxs_tbl[mmu_ctxp->mmu_idx]);
12618
12619 currcnum = sfmmup->sfmmu_ctxs[mmu_ctxp->mmu_idx].cnum;
12620
12621 pstate_save = sfmmu_disable_intrs();
12622
12623 lock_set(&sfmmup->sfmmu_ctx_lock); /* acquire PP lock */
12624 /* set HAT cnum invalid across all context domains. */
12625 for (i = 0; i < max_mmu_ctxdoms; i++) {
12626
12627 cnum = sfmmup->sfmmu_ctxs[i].cnum;
12628 if (cnum == INVALID_CONTEXT) {
12629 continue;
12630 }
12631
12632 sfmmup->sfmmu_ctxs[i].cnum = INVALID_CONTEXT;
12633 }
12634 membar_enter(); /* make sure globally visible to all CPUs */
12635 lock_clear(&sfmmup->sfmmu_ctx_lock); /* release PP lock */
12636
12637 sfmmu_enable_intrs(pstate_save);
12638
12639 cpuset = sfmmup->sfmmu_cpusran;
12640 CPUSET_DEL(cpuset, CPU->cpu_id);
12641 CPUSET_AND(cpuset, cpu_ready_set);
12642 if (!CPUSET_ISNULL(cpuset)) {
12643 SFMMU_XCALL_STATS(sfmmup);
12644 xt_some(cpuset, sfmmu_raise_tsb_exception,
12645 (uint64_t)sfmmup, INVALID_CONTEXT);
12646 xt_sync(cpuset);
12647 SFMMU_STAT(sf_tsb_raise_exception);
12648 SFMMU_MMU_STAT(mmu_tsb_raise_exception);
12649 }
12650
12651 /*
12652 * If the hat to-be-invalidated is the same as the current
12653 * process on local CPU we need to invalidate
12654 * this CPU context as well.
12655 */
12656 if ((sfmmu_getctx_sec() == currcnum) &&
12657 (currcnum != INVALID_CONTEXT)) {
12658 /* sets shared context to INVALID too */
12659 sfmmu_setctx_sec(INVALID_CONTEXT);
12660 sfmmu_clear_utsbinfo();
12661 }
12662
12663 SFMMU_FLAGS_SET(sfmmup, HAT_ALLCTX_INVALID);
12664
12665 kpreempt_enable();
12666
12667 /*
12668 * we hold the hat lock, so nobody should allocate a context
12669 * for us yet
12670 */
12671 ASSERT(sfmmup->sfmmu_ctxs[mmu_ctxp->mmu_idx].cnum == INVALID_CONTEXT);
12672 }
12673
12674 #ifdef VAC
12675 /*
12676 * We need to flush the cache in all cpus. It is possible that
12677 * a process referenced a page as cacheable but has sinced exited
12678 * and cleared the mapping list. We still to flush it but have no
12679 * state so all cpus is the only alternative.
12680 */
12681 void
sfmmu_cache_flush(pfn_t pfnum,int vcolor)12682 sfmmu_cache_flush(pfn_t pfnum, int vcolor)
12683 {
12684 cpuset_t cpuset;
12685
12686 kpreempt_disable();
12687 cpuset = cpu_ready_set;
12688 CPUSET_DEL(cpuset, CPU->cpu_id);
12689 SFMMU_XCALL_STATS(NULL); /* account to any ctx */
12690 xt_some(cpuset, vac_flushpage_tl1, pfnum, vcolor);
12691 xt_sync(cpuset);
12692 vac_flushpage(pfnum, vcolor);
12693 kpreempt_enable();
12694 }
12695
12696 void
sfmmu_cache_flushcolor(int vcolor,pfn_t pfnum)12697 sfmmu_cache_flushcolor(int vcolor, pfn_t pfnum)
12698 {
12699 cpuset_t cpuset;
12700
12701 ASSERT(vcolor >= 0);
12702
12703 kpreempt_disable();
12704 cpuset = cpu_ready_set;
12705 CPUSET_DEL(cpuset, CPU->cpu_id);
12706 SFMMU_XCALL_STATS(NULL); /* account to any ctx */
12707 xt_some(cpuset, vac_flushcolor_tl1, vcolor, pfnum);
12708 xt_sync(cpuset);
12709 vac_flushcolor(vcolor, pfnum);
12710 kpreempt_enable();
12711 }
12712 #endif /* VAC */
12713
12714 /*
12715 * We need to prevent processes from accessing the TSB using a cached physical
12716 * address. It's alright if they try to access the TSB via virtual address
12717 * since they will just fault on that virtual address once the mapping has
12718 * been suspended.
12719 */
12720 #pragma weak sendmondo_in_recover
12721
12722 /* ARGSUSED */
12723 static int
sfmmu_tsb_pre_relocator(caddr_t va,uint_t tsbsz,uint_t flags,void * tsbinfo)12724 sfmmu_tsb_pre_relocator(caddr_t va, uint_t tsbsz, uint_t flags, void *tsbinfo)
12725 {
12726 struct tsb_info *tsbinfop = (struct tsb_info *)tsbinfo;
12727 sfmmu_t *sfmmup = tsbinfop->tsb_sfmmu;
12728 hatlock_t *hatlockp;
12729 sf_scd_t *scdp;
12730
12731 if (flags != HAT_PRESUSPEND)
12732 return (0);
12733
12734 /*
12735 * If tsb is a shared TSB with TSB_SHAREDCTX set, sfmmup must
12736 * be a shared hat, then set SCD's tsbinfo's flag.
12737 * If tsb is not shared, sfmmup is a private hat, then set
12738 * its private tsbinfo's flag.
12739 */
12740 hatlockp = sfmmu_hat_enter(sfmmup);
12741 tsbinfop->tsb_flags |= TSB_RELOC_FLAG;
12742
12743 if (!(tsbinfop->tsb_flags & TSB_SHAREDCTX)) {
12744 sfmmu_tsb_inv_ctx(sfmmup);
12745 sfmmu_hat_exit(hatlockp);
12746 } else {
12747 /* release lock on the shared hat */
12748 sfmmu_hat_exit(hatlockp);
12749 /* sfmmup is a shared hat */
12750 ASSERT(sfmmup->sfmmu_scdhat);
12751 scdp = sfmmup->sfmmu_scdp;
12752 ASSERT(scdp != NULL);
12753 /* get private hat from the scd list */
12754 mutex_enter(&scdp->scd_mutex);
12755 sfmmup = scdp->scd_sf_list;
12756 while (sfmmup != NULL) {
12757 hatlockp = sfmmu_hat_enter(sfmmup);
12758 /*
12759 * We do not call sfmmu_tsb_inv_ctx here because
12760 * sendmondo_in_recover check is only needed for
12761 * sun4u.
12762 */
12763 sfmmu_invalidate_ctx(sfmmup);
12764 sfmmu_hat_exit(hatlockp);
12765 sfmmup = sfmmup->sfmmu_scd_link.next;
12766
12767 }
12768 mutex_exit(&scdp->scd_mutex);
12769 }
12770 return (0);
12771 }
12772
12773 static void
sfmmu_tsb_inv_ctx(sfmmu_t * sfmmup)12774 sfmmu_tsb_inv_ctx(sfmmu_t *sfmmup)
12775 {
12776 extern uint32_t sendmondo_in_recover;
12777
12778 ASSERT(sfmmu_hat_lock_held(sfmmup));
12779
12780 /*
12781 * For Cheetah+ Erratum 25:
12782 * Wait for any active recovery to finish. We can't risk
12783 * relocating the TSB of the thread running mondo_recover_proc()
12784 * since, if we did that, we would deadlock. The scenario we are
12785 * trying to avoid is as follows:
12786 *
12787 * THIS CPU RECOVER CPU
12788 * -------- -----------
12789 * Begins recovery, walking through TSB
12790 * hat_pagesuspend() TSB TTE
12791 * TLB miss on TSB TTE, spins at TL1
12792 * xt_sync()
12793 * send_mondo_timeout()
12794 * mondo_recover_proc()
12795 * ((deadlocked))
12796 *
12797 * The second half of the workaround is that mondo_recover_proc()
12798 * checks to see if the tsb_info has the RELOC flag set, and if it
12799 * does, it skips over that TSB without ever touching tsbinfop->tsb_va
12800 * and hence avoiding the TLB miss that could result in a deadlock.
12801 */
12802 if (&sendmondo_in_recover) {
12803 membar_enter(); /* make sure RELOC flag visible */
12804 while (sendmondo_in_recover) {
12805 drv_usecwait(1);
12806 membar_consumer();
12807 }
12808 }
12809
12810 sfmmu_invalidate_ctx(sfmmup);
12811 }
12812
12813 /* ARGSUSED */
12814 static int
sfmmu_tsb_post_relocator(caddr_t va,uint_t tsbsz,uint_t flags,void * tsbinfo,pfn_t newpfn)12815 sfmmu_tsb_post_relocator(caddr_t va, uint_t tsbsz, uint_t flags,
12816 void *tsbinfo, pfn_t newpfn)
12817 {
12818 hatlock_t *hatlockp;
12819 struct tsb_info *tsbinfop = (struct tsb_info *)tsbinfo;
12820 sfmmu_t *sfmmup = tsbinfop->tsb_sfmmu;
12821
12822 if (flags != HAT_POSTUNSUSPEND)
12823 return (0);
12824
12825 hatlockp = sfmmu_hat_enter(sfmmup);
12826
12827 SFMMU_STAT(sf_tsb_reloc);
12828
12829 /*
12830 * The process may have swapped out while we were relocating one
12831 * of its TSBs. If so, don't bother doing the setup since the
12832 * process can't be using the memory anymore.
12833 */
12834 if ((tsbinfop->tsb_flags & TSB_SWAPPED) == 0) {
12835 ASSERT(va == tsbinfop->tsb_va);
12836 sfmmu_tsbinfo_setup_phys(tsbinfop, newpfn);
12837
12838 if (tsbinfop->tsb_flags & TSB_FLUSH_NEEDED) {
12839 sfmmu_inv_tsb(tsbinfop->tsb_va,
12840 TSB_BYTES(tsbinfop->tsb_szc));
12841 tsbinfop->tsb_flags &= ~TSB_FLUSH_NEEDED;
12842 }
12843 }
12844
12845 membar_exit();
12846 tsbinfop->tsb_flags &= ~TSB_RELOC_FLAG;
12847 cv_broadcast(&sfmmup->sfmmu_tsb_cv);
12848
12849 sfmmu_hat_exit(hatlockp);
12850
12851 return (0);
12852 }
12853
12854 /*
12855 * Allocate and initialize a tsb_info structure. Note that we may or may not
12856 * allocate a TSB here, depending on the flags passed in.
12857 */
12858 static int
sfmmu_tsbinfo_alloc(struct tsb_info ** tsbinfopp,int tsb_szc,int tte_sz_mask,uint_t flags,sfmmu_t * sfmmup)12859 sfmmu_tsbinfo_alloc(struct tsb_info **tsbinfopp, int tsb_szc, int tte_sz_mask,
12860 uint_t flags, sfmmu_t *sfmmup)
12861 {
12862 int err;
12863
12864 *tsbinfopp = (struct tsb_info *)kmem_cache_alloc(
12865 sfmmu_tsbinfo_cache, KM_SLEEP);
12866
12867 if ((err = sfmmu_init_tsbinfo(*tsbinfopp, tte_sz_mask,
12868 tsb_szc, flags, sfmmup)) != 0) {
12869 kmem_cache_free(sfmmu_tsbinfo_cache, *tsbinfopp);
12870 SFMMU_STAT(sf_tsb_allocfail);
12871 *tsbinfopp = NULL;
12872 return (err);
12873 }
12874 SFMMU_STAT(sf_tsb_alloc);
12875
12876 /*
12877 * Bump the TSB size counters for this TSB size.
12878 */
12879 (*(((int *)&sfmmu_tsbsize_stat) + tsb_szc))++;
12880 return (0);
12881 }
12882
12883 static void
sfmmu_tsb_free(struct tsb_info * tsbinfo)12884 sfmmu_tsb_free(struct tsb_info *tsbinfo)
12885 {
12886 caddr_t tsbva = tsbinfo->tsb_va;
12887 uint_t tsb_size = TSB_BYTES(tsbinfo->tsb_szc);
12888 struct kmem_cache *kmem_cachep = tsbinfo->tsb_cache;
12889 vmem_t *vmp = tsbinfo->tsb_vmp;
12890
12891 /*
12892 * If we allocated this TSB from relocatable kernel memory, then we
12893 * need to uninstall the callback handler.
12894 */
12895 if (tsbinfo->tsb_cache != sfmmu_tsb8k_cache) {
12896 uintptr_t slab_mask;
12897 caddr_t slab_vaddr;
12898 page_t **ppl;
12899 int ret;
12900
12901 ASSERT(tsb_size <= MMU_PAGESIZE4M || use_bigtsb_arena);
12902 if (tsb_size > MMU_PAGESIZE4M)
12903 slab_mask = ~((uintptr_t)bigtsb_slab_mask) << PAGESHIFT;
12904 else
12905 slab_mask = ~((uintptr_t)tsb_slab_mask) << PAGESHIFT;
12906 slab_vaddr = (caddr_t)((uintptr_t)tsbva & slab_mask);
12907
12908 ret = as_pagelock(&kas, &ppl, slab_vaddr, PAGESIZE, S_WRITE);
12909 ASSERT(ret == 0);
12910 hat_delete_callback(tsbva, (uint_t)tsb_size, (void *)tsbinfo,
12911 0, NULL);
12912 as_pageunlock(&kas, ppl, slab_vaddr, PAGESIZE, S_WRITE);
12913 }
12914
12915 if (kmem_cachep != NULL) {
12916 kmem_cache_free(kmem_cachep, tsbva);
12917 } else {
12918 vmem_xfree(vmp, (void *)tsbva, tsb_size);
12919 }
12920 tsbinfo->tsb_va = (caddr_t)0xbad00bad;
12921 atomic_add_64(&tsb_alloc_bytes, -(int64_t)tsb_size);
12922 }
12923
12924 static void
sfmmu_tsbinfo_free(struct tsb_info * tsbinfo)12925 sfmmu_tsbinfo_free(struct tsb_info *tsbinfo)
12926 {
12927 if ((tsbinfo->tsb_flags & TSB_SWAPPED) == 0) {
12928 sfmmu_tsb_free(tsbinfo);
12929 }
12930 kmem_cache_free(sfmmu_tsbinfo_cache, tsbinfo);
12931
12932 }
12933
12934 /*
12935 * Setup all the references to physical memory for this tsbinfo.
12936 * The underlying page(s) must be locked.
12937 */
12938 static void
sfmmu_tsbinfo_setup_phys(struct tsb_info * tsbinfo,pfn_t pfn)12939 sfmmu_tsbinfo_setup_phys(struct tsb_info *tsbinfo, pfn_t pfn)
12940 {
12941 ASSERT(pfn != PFN_INVALID);
12942 ASSERT(pfn == va_to_pfn(tsbinfo->tsb_va));
12943
12944 #ifndef sun4v
12945 if (tsbinfo->tsb_szc == 0) {
12946 sfmmu_memtte(&tsbinfo->tsb_tte, pfn,
12947 PROT_WRITE|PROT_READ, TTE8K);
12948 } else {
12949 /*
12950 * Round down PA and use a large mapping; the handlers will
12951 * compute the TSB pointer at the correct offset into the
12952 * big virtual page. NOTE: this assumes all TSBs larger
12953 * than 8K must come from physically contiguous slabs of
12954 * size tsb_slab_size.
12955 */
12956 sfmmu_memtte(&tsbinfo->tsb_tte, pfn & ~tsb_slab_mask,
12957 PROT_WRITE|PROT_READ, tsb_slab_ttesz);
12958 }
12959 tsbinfo->tsb_pa = ptob(pfn);
12960
12961 TTE_SET_LOCKED(&tsbinfo->tsb_tte); /* lock the tte into dtlb */
12962 TTE_SET_MOD(&tsbinfo->tsb_tte); /* enable writes */
12963
12964 ASSERT(TTE_IS_PRIVILEGED(&tsbinfo->tsb_tte));
12965 ASSERT(TTE_IS_LOCKED(&tsbinfo->tsb_tte));
12966 #else /* sun4v */
12967 tsbinfo->tsb_pa = ptob(pfn);
12968 #endif /* sun4v */
12969 }
12970
12971
12972 /*
12973 * Returns zero on success, ENOMEM if over the high water mark,
12974 * or EAGAIN if the caller needs to retry with a smaller TSB
12975 * size (or specify TSB_FORCEALLOC if the allocation can't fail).
12976 *
12977 * This call cannot fail to allocate a TSB if TSB_FORCEALLOC
12978 * is specified and the TSB requested is PAGESIZE, though it
12979 * may sleep waiting for memory if sufficient memory is not
12980 * available.
12981 */
12982 static int
sfmmu_init_tsbinfo(struct tsb_info * tsbinfo,int tteszmask,int tsbcode,uint_t flags,sfmmu_t * sfmmup)12983 sfmmu_init_tsbinfo(struct tsb_info *tsbinfo, int tteszmask,
12984 int tsbcode, uint_t flags, sfmmu_t *sfmmup)
12985 {
12986 caddr_t vaddr = NULL;
12987 caddr_t slab_vaddr;
12988 uintptr_t slab_mask;
12989 int tsbbytes = TSB_BYTES(tsbcode);
12990 int lowmem = 0;
12991 struct kmem_cache *kmem_cachep = NULL;
12992 vmem_t *vmp = NULL;
12993 lgrp_id_t lgrpid = LGRP_NONE;
12994 pfn_t pfn;
12995 uint_t cbflags = HAC_SLEEP;
12996 page_t **pplist;
12997 int ret;
12998
12999 ASSERT(tsbbytes <= MMU_PAGESIZE4M || use_bigtsb_arena);
13000 if (tsbbytes > MMU_PAGESIZE4M)
13001 slab_mask = ~((uintptr_t)bigtsb_slab_mask) << PAGESHIFT;
13002 else
13003 slab_mask = ~((uintptr_t)tsb_slab_mask) << PAGESHIFT;
13004
13005 if (flags & (TSB_FORCEALLOC | TSB_SWAPIN | TSB_GROW | TSB_SHRINK))
13006 flags |= TSB_ALLOC;
13007
13008 ASSERT((flags & TSB_FORCEALLOC) == 0 || tsbcode == TSB_MIN_SZCODE);
13009
13010 tsbinfo->tsb_sfmmu = sfmmup;
13011
13012 /*
13013 * If not allocating a TSB, set up the tsbinfo, set TSB_SWAPPED, and
13014 * return.
13015 */
13016 if ((flags & TSB_ALLOC) == 0) {
13017 tsbinfo->tsb_szc = tsbcode;
13018 tsbinfo->tsb_ttesz_mask = tteszmask;
13019 tsbinfo->tsb_va = (caddr_t)0xbadbadbeef;
13020 tsbinfo->tsb_pa = -1;
13021 tsbinfo->tsb_tte.ll = 0;
13022 tsbinfo->tsb_next = NULL;
13023 tsbinfo->tsb_flags = TSB_SWAPPED;
13024 tsbinfo->tsb_cache = NULL;
13025 tsbinfo->tsb_vmp = NULL;
13026 return (0);
13027 }
13028
13029 #ifdef DEBUG
13030 /*
13031 * For debugging:
13032 * Randomly force allocation failures every tsb_alloc_mtbf
13033 * tries if TSB_FORCEALLOC is not specified. This will
13034 * return ENOMEM if tsb_alloc_mtbf is odd, or EAGAIN if
13035 * it is even, to allow testing of both failure paths...
13036 */
13037 if (tsb_alloc_mtbf && ((flags & TSB_FORCEALLOC) == 0) &&
13038 (tsb_alloc_count++ == tsb_alloc_mtbf)) {
13039 tsb_alloc_count = 0;
13040 tsb_alloc_fail_mtbf++;
13041 return ((tsb_alloc_mtbf & 1)? ENOMEM : EAGAIN);
13042 }
13043 #endif /* DEBUG */
13044
13045 /*
13046 * Enforce high water mark if we are not doing a forced allocation
13047 * and are not shrinking a process' TSB.
13048 */
13049 if ((flags & TSB_SHRINK) == 0 &&
13050 (tsbbytes + tsb_alloc_bytes) > tsb_alloc_hiwater) {
13051 if ((flags & TSB_FORCEALLOC) == 0)
13052 return (ENOMEM);
13053 lowmem = 1;
13054 }
13055
13056 /*
13057 * Allocate from the correct location based upon the size of the TSB
13058 * compared to the base page size, and what memory conditions dictate.
13059 * Note we always do nonblocking allocations from the TSB arena since
13060 * we don't want memory fragmentation to cause processes to block
13061 * indefinitely waiting for memory; until the kernel algorithms that
13062 * coalesce large pages are improved this is our best option.
13063 *
13064 * Algorithm:
13065 * If allocating a "large" TSB (>8K), allocate from the
13066 * appropriate kmem_tsb_default_arena vmem arena
13067 * else if low on memory or the TSB_FORCEALLOC flag is set or
13068 * tsb_forceheap is set
13069 * Allocate from kernel heap via sfmmu_tsb8k_cache with
13070 * KM_SLEEP (never fails)
13071 * else
13072 * Allocate from appropriate sfmmu_tsb_cache with
13073 * KM_NOSLEEP
13074 * endif
13075 */
13076 if (tsb_lgrp_affinity)
13077 lgrpid = lgrp_home_id(curthread);
13078 if (lgrpid == LGRP_NONE)
13079 lgrpid = 0; /* use lgrp of boot CPU */
13080
13081 if (tsbbytes > MMU_PAGESIZE) {
13082 if (tsbbytes > MMU_PAGESIZE4M) {
13083 vmp = kmem_bigtsb_default_arena[lgrpid];
13084 vaddr = (caddr_t)vmem_xalloc(vmp, tsbbytes, tsbbytes,
13085 0, 0, NULL, NULL, VM_NOSLEEP);
13086 } else {
13087 vmp = kmem_tsb_default_arena[lgrpid];
13088 vaddr = (caddr_t)vmem_xalloc(vmp, tsbbytes, tsbbytes,
13089 0, 0, NULL, NULL, VM_NOSLEEP);
13090 }
13091 #ifdef DEBUG
13092 } else if (lowmem || (flags & TSB_FORCEALLOC) || tsb_forceheap) {
13093 #else /* !DEBUG */
13094 } else if (lowmem || (flags & TSB_FORCEALLOC)) {
13095 #endif /* DEBUG */
13096 kmem_cachep = sfmmu_tsb8k_cache;
13097 vaddr = (caddr_t)kmem_cache_alloc(kmem_cachep, KM_SLEEP);
13098 ASSERT(vaddr != NULL);
13099 } else {
13100 kmem_cachep = sfmmu_tsb_cache[lgrpid];
13101 vaddr = (caddr_t)kmem_cache_alloc(kmem_cachep, KM_NOSLEEP);
13102 }
13103
13104 tsbinfo->tsb_cache = kmem_cachep;
13105 tsbinfo->tsb_vmp = vmp;
13106
13107 if (vaddr == NULL) {
13108 return (EAGAIN);
13109 }
13110
13111 atomic_add_64(&tsb_alloc_bytes, (int64_t)tsbbytes);
13112 kmem_cachep = tsbinfo->tsb_cache;
13113
13114 /*
13115 * If we are allocating from outside the cage, then we need to
13116 * register a relocation callback handler. Note that for now
13117 * since pseudo mappings always hang off of the slab's root page,
13118 * we need only lock the first 8K of the TSB slab. This is a bit
13119 * hacky but it is good for performance.
13120 */
13121 if (kmem_cachep != sfmmu_tsb8k_cache) {
13122 slab_vaddr = (caddr_t)((uintptr_t)vaddr & slab_mask);
13123 ret = as_pagelock(&kas, &pplist, slab_vaddr, PAGESIZE, S_WRITE);
13124 ASSERT(ret == 0);
13125 ret = hat_add_callback(sfmmu_tsb_cb_id, vaddr, (uint_t)tsbbytes,
13126 cbflags, (void *)tsbinfo, &pfn, NULL);
13127
13128 /*
13129 * Need to free up resources if we could not successfully
13130 * add the callback function and return an error condition.
13131 */
13132 if (ret != 0) {
13133 if (kmem_cachep) {
13134 kmem_cache_free(kmem_cachep, vaddr);
13135 } else {
13136 vmem_xfree(vmp, (void *)vaddr, tsbbytes);
13137 }
13138 as_pageunlock(&kas, pplist, slab_vaddr, PAGESIZE,
13139 S_WRITE);
13140 return (EAGAIN);
13141 }
13142 } else {
13143 /*
13144 * Since allocation of 8K TSBs from heap is rare and occurs
13145 * during memory pressure we allocate them from permanent
13146 * memory rather than using callbacks to get the PFN.
13147 */
13148 pfn = hat_getpfnum(kas.a_hat, vaddr);
13149 }
13150
13151 tsbinfo->tsb_va = vaddr;
13152 tsbinfo->tsb_szc = tsbcode;
13153 tsbinfo->tsb_ttesz_mask = tteszmask;
13154 tsbinfo->tsb_next = NULL;
13155 tsbinfo->tsb_flags = 0;
13156
13157 sfmmu_tsbinfo_setup_phys(tsbinfo, pfn);
13158
13159 sfmmu_inv_tsb(vaddr, tsbbytes);
13160
13161 if (kmem_cachep != sfmmu_tsb8k_cache) {
13162 as_pageunlock(&kas, pplist, slab_vaddr, PAGESIZE, S_WRITE);
13163 }
13164
13165 return (0);
13166 }
13167
13168 /*
13169 * Initialize per cpu tsb and per cpu tsbmiss_area
13170 */
13171 void
sfmmu_init_tsbs(void)13172 sfmmu_init_tsbs(void)
13173 {
13174 int i;
13175 struct tsbmiss *tsbmissp;
13176 struct kpmtsbm *kpmtsbmp;
13177 #ifndef sun4v
13178 extern int dcache_line_mask;
13179 #endif /* sun4v */
13180 extern uint_t vac_colors;
13181
13182 /*
13183 * Init. tsb miss area.
13184 */
13185 tsbmissp = tsbmiss_area;
13186
13187 for (i = 0; i < NCPU; tsbmissp++, i++) {
13188 /*
13189 * initialize the tsbmiss area.
13190 * Do this for all possible CPUs as some may be added
13191 * while the system is running. There is no cost to this.
13192 */
13193 tsbmissp->ksfmmup = ksfmmup;
13194 #ifndef sun4v
13195 tsbmissp->dcache_line_mask = (uint16_t)dcache_line_mask;
13196 #endif /* sun4v */
13197 tsbmissp->khashstart =
13198 (struct hmehash_bucket *)va_to_pa((caddr_t)khme_hash);
13199 tsbmissp->uhashstart =
13200 (struct hmehash_bucket *)va_to_pa((caddr_t)uhme_hash);
13201 tsbmissp->khashsz = khmehash_num;
13202 tsbmissp->uhashsz = uhmehash_num;
13203 }
13204
13205 sfmmu_tsb_cb_id = hat_register_callback('T'<<16 | 'S' << 8 | 'B',
13206 sfmmu_tsb_pre_relocator, sfmmu_tsb_post_relocator, NULL, 0);
13207
13208 if (kpm_enable == 0)
13209 return;
13210
13211 /* -- Begin KPM specific init -- */
13212
13213 if (kpm_smallpages) {
13214 /*
13215 * If we're using base pagesize pages for seg_kpm
13216 * mappings, we use the kernel TSB since we can't afford
13217 * to allocate a second huge TSB for these mappings.
13218 */
13219 kpm_tsbbase = ktsb_phys? ktsb_pbase : (uint64_t)ktsb_base;
13220 kpm_tsbsz = ktsb_szcode;
13221 kpmsm_tsbbase = kpm_tsbbase;
13222 kpmsm_tsbsz = kpm_tsbsz;
13223 } else {
13224 /*
13225 * In VAC conflict case, just put the entries in the
13226 * kernel 8K indexed TSB for now so we can find them.
13227 * This could really be changed in the future if we feel
13228 * the need...
13229 */
13230 kpmsm_tsbbase = ktsb_phys? ktsb_pbase : (uint64_t)ktsb_base;
13231 kpmsm_tsbsz = ktsb_szcode;
13232 kpm_tsbbase = ktsb_phys? ktsb4m_pbase : (uint64_t)ktsb4m_base;
13233 kpm_tsbsz = ktsb4m_szcode;
13234 }
13235
13236 kpmtsbmp = kpmtsbm_area;
13237 for (i = 0; i < NCPU; kpmtsbmp++, i++) {
13238 /*
13239 * Initialize the kpmtsbm area.
13240 * Do this for all possible CPUs as some may be added
13241 * while the system is running. There is no cost to this.
13242 */
13243 kpmtsbmp->vbase = kpm_vbase;
13244 kpmtsbmp->vend = kpm_vbase + kpm_size * vac_colors;
13245 kpmtsbmp->sz_shift = kpm_size_shift;
13246 kpmtsbmp->kpmp_shift = kpmp_shift;
13247 kpmtsbmp->kpmp2pshft = (uchar_t)kpmp2pshft;
13248 if (kpm_smallpages == 0) {
13249 kpmtsbmp->kpmp_table_sz = kpmp_table_sz;
13250 kpmtsbmp->kpmp_tablepa = va_to_pa(kpmp_table);
13251 } else {
13252 kpmtsbmp->kpmp_table_sz = kpmp_stable_sz;
13253 kpmtsbmp->kpmp_tablepa = va_to_pa(kpmp_stable);
13254 }
13255 kpmtsbmp->msegphashpa = va_to_pa(memseg_phash);
13256 kpmtsbmp->flags = KPMTSBM_ENABLE_FLAG;
13257 #ifdef DEBUG
13258 kpmtsbmp->flags |= (kpm_tsbmtl) ? KPMTSBM_TLTSBM_FLAG : 0;
13259 #endif /* DEBUG */
13260 if (ktsb_phys)
13261 kpmtsbmp->flags |= KPMTSBM_TSBPHYS_FLAG;
13262 }
13263
13264 /* -- End KPM specific init -- */
13265 }
13266
13267 /* Avoid using sfmmu_tsbinfo_alloc() to avoid kmem_alloc - no real reason */
13268 struct tsb_info ktsb_info[2];
13269
13270 /*
13271 * Called from hat_kern_setup() to setup the tsb_info for ksfmmup.
13272 */
13273 void
sfmmu_init_ktsbinfo()13274 sfmmu_init_ktsbinfo()
13275 {
13276 ASSERT(ksfmmup != NULL);
13277 ASSERT(ksfmmup->sfmmu_tsb == NULL);
13278 /*
13279 * Allocate tsbinfos for kernel and copy in data
13280 * to make debug easier and sun4v setup easier.
13281 */
13282 ktsb_info[0].tsb_sfmmu = ksfmmup;
13283 ktsb_info[0].tsb_szc = ktsb_szcode;
13284 ktsb_info[0].tsb_ttesz_mask = TSB8K|TSB64K|TSB512K;
13285 ktsb_info[0].tsb_va = ktsb_base;
13286 ktsb_info[0].tsb_pa = ktsb_pbase;
13287 ktsb_info[0].tsb_flags = 0;
13288 ktsb_info[0].tsb_tte.ll = 0;
13289 ktsb_info[0].tsb_cache = NULL;
13290
13291 ktsb_info[1].tsb_sfmmu = ksfmmup;
13292 ktsb_info[1].tsb_szc = ktsb4m_szcode;
13293 ktsb_info[1].tsb_ttesz_mask = TSB4M;
13294 ktsb_info[1].tsb_va = ktsb4m_base;
13295 ktsb_info[1].tsb_pa = ktsb4m_pbase;
13296 ktsb_info[1].tsb_flags = 0;
13297 ktsb_info[1].tsb_tte.ll = 0;
13298 ktsb_info[1].tsb_cache = NULL;
13299
13300 /* Link them into ksfmmup. */
13301 ktsb_info[0].tsb_next = &ktsb_info[1];
13302 ktsb_info[1].tsb_next = NULL;
13303 ksfmmup->sfmmu_tsb = &ktsb_info[0];
13304
13305 sfmmu_setup_tsbinfo(ksfmmup);
13306 }
13307
13308 /*
13309 * Cache the last value returned from va_to_pa(). If the VA specified
13310 * in the current call to cached_va_to_pa() maps to the same Page (as the
13311 * previous call to cached_va_to_pa()), then compute the PA using
13312 * cached info, else call va_to_pa().
13313 *
13314 * Note: this function is neither MT-safe nor consistent in the presence
13315 * of multiple, interleaved threads. This function was created to enable
13316 * an optimization used during boot (at a point when there's only one thread
13317 * executing on the "boot CPU", and before startup_vm() has been called).
13318 */
13319 static uint64_t
cached_va_to_pa(void * vaddr)13320 cached_va_to_pa(void *vaddr)
13321 {
13322 static uint64_t prev_vaddr_base = 0;
13323 static uint64_t prev_pfn = 0;
13324
13325 if ((((uint64_t)vaddr) & MMU_PAGEMASK) == prev_vaddr_base) {
13326 return (prev_pfn | ((uint64_t)vaddr & MMU_PAGEOFFSET));
13327 } else {
13328 uint64_t pa = va_to_pa(vaddr);
13329
13330 if (pa != ((uint64_t)-1)) {
13331 /*
13332 * Computed physical address is valid. Cache its
13333 * related info for the next cached_va_to_pa() call.
13334 */
13335 prev_pfn = pa & MMU_PAGEMASK;
13336 prev_vaddr_base = ((uint64_t)vaddr) & MMU_PAGEMASK;
13337 }
13338
13339 return (pa);
13340 }
13341 }
13342
13343 /*
13344 * Carve up our nucleus hblk region. We may allocate more hblks than
13345 * asked due to rounding errors but we are guaranteed to have at least
13346 * enough space to allocate the requested number of hblk8's and hblk1's.
13347 */
13348 void
sfmmu_init_nucleus_hblks(caddr_t addr,size_t size,int nhblk8,int nhblk1)13349 sfmmu_init_nucleus_hblks(caddr_t addr, size_t size, int nhblk8, int nhblk1)
13350 {
13351 struct hme_blk *hmeblkp;
13352 size_t hme8blk_sz, hme1blk_sz;
13353 size_t i;
13354 size_t hblk8_bound;
13355 ulong_t j = 0, k = 0;
13356
13357 ASSERT(addr != NULL && size != 0);
13358
13359 /* Need to use proper structure alignment */
13360 hme8blk_sz = roundup(HME8BLK_SZ, sizeof (int64_t));
13361 hme1blk_sz = roundup(HME1BLK_SZ, sizeof (int64_t));
13362
13363 nucleus_hblk8.list = (void *)addr;
13364 nucleus_hblk8.index = 0;
13365
13366 /*
13367 * Use as much memory as possible for hblk8's since we
13368 * expect all bop_alloc'ed memory to be allocated in 8k chunks.
13369 * We need to hold back enough space for the hblk1's which
13370 * we'll allocate next.
13371 */
13372 hblk8_bound = size - (nhblk1 * hme1blk_sz) - hme8blk_sz;
13373 for (i = 0; i <= hblk8_bound; i += hme8blk_sz, j++) {
13374 hmeblkp = (struct hme_blk *)addr;
13375 addr += hme8blk_sz;
13376 hmeblkp->hblk_nuc_bit = 1;
13377 hmeblkp->hblk_nextpa = cached_va_to_pa((caddr_t)hmeblkp);
13378 }
13379 nucleus_hblk8.len = j;
13380 ASSERT(j >= nhblk8);
13381 SFMMU_STAT_ADD(sf_hblk8_ncreate, j);
13382
13383 nucleus_hblk1.list = (void *)addr;
13384 nucleus_hblk1.index = 0;
13385 for (; i <= (size - hme1blk_sz); i += hme1blk_sz, k++) {
13386 hmeblkp = (struct hme_blk *)addr;
13387 addr += hme1blk_sz;
13388 hmeblkp->hblk_nuc_bit = 1;
13389 hmeblkp->hblk_nextpa = cached_va_to_pa((caddr_t)hmeblkp);
13390 }
13391 ASSERT(k >= nhblk1);
13392 nucleus_hblk1.len = k;
13393 SFMMU_STAT_ADD(sf_hblk1_ncreate, k);
13394 }
13395
13396 /*
13397 * This function is currently not supported on this platform. For what
13398 * it's supposed to do, see hat.c and hat_srmmu.c
13399 */
13400 /* ARGSUSED */
13401 faultcode_t
hat_softlock(struct hat * hat,caddr_t addr,size_t * lenp,page_t ** ppp,uint_t flags)13402 hat_softlock(struct hat *hat, caddr_t addr, size_t *lenp, page_t **ppp,
13403 uint_t flags)
13404 {
13405 ASSERT(hat->sfmmu_xhat_provider == NULL);
13406 return (FC_NOSUPPORT);
13407 }
13408
13409 /*
13410 * Searchs the mapping list of the page for a mapping of the same size. If not
13411 * found the corresponding bit is cleared in the p_index field. When large
13412 * pages are more prevalent in the system, we can maintain the mapping list
13413 * in order and we don't have to traverse the list each time. Just check the
13414 * next and prev entries, and if both are of different size, we clear the bit.
13415 */
13416 static void
sfmmu_rm_large_mappings(page_t * pp,int ttesz)13417 sfmmu_rm_large_mappings(page_t *pp, int ttesz)
13418 {
13419 struct sf_hment *sfhmep;
13420 struct hme_blk *hmeblkp;
13421 int index;
13422 pgcnt_t npgs;
13423
13424 ASSERT(ttesz > TTE8K);
13425
13426 ASSERT(sfmmu_mlist_held(pp));
13427
13428 ASSERT(PP_ISMAPPED_LARGE(pp));
13429
13430 /*
13431 * Traverse mapping list looking for another mapping of same size.
13432 * since we only want to clear index field if all mappings of
13433 * that size are gone.
13434 */
13435
13436 for (sfhmep = pp->p_mapping; sfhmep; sfhmep = sfhmep->hme_next) {
13437 if (IS_PAHME(sfhmep))
13438 continue;
13439 hmeblkp = sfmmu_hmetohblk(sfhmep);
13440 if (hmeblkp->hblk_xhat_bit)
13441 continue;
13442 if (hme_size(sfhmep) == ttesz) {
13443 /*
13444 * another mapping of the same size. don't clear index.
13445 */
13446 return;
13447 }
13448 }
13449
13450 /*
13451 * Clear the p_index bit for large page.
13452 */
13453 index = PAGESZ_TO_INDEX(ttesz);
13454 npgs = TTEPAGES(ttesz);
13455 while (npgs-- > 0) {
13456 ASSERT(pp->p_index & index);
13457 pp->p_index &= ~index;
13458 pp = PP_PAGENEXT(pp);
13459 }
13460 }
13461
13462 /*
13463 * return supported features
13464 */
13465 /* ARGSUSED */
13466 int
hat_supported(enum hat_features feature,void * arg)13467 hat_supported(enum hat_features feature, void *arg)
13468 {
13469 switch (feature) {
13470 case HAT_SHARED_PT:
13471 case HAT_DYNAMIC_ISM_UNMAP:
13472 case HAT_VMODSORT:
13473 return (1);
13474 case HAT_SHARED_REGIONS:
13475 if (shctx_on)
13476 return (1);
13477 else
13478 return (0);
13479 default:
13480 return (0);
13481 }
13482 }
13483
13484 void
hat_enter(struct hat * hat)13485 hat_enter(struct hat *hat)
13486 {
13487 hatlock_t *hatlockp;
13488
13489 if (hat != ksfmmup) {
13490 hatlockp = TSB_HASH(hat);
13491 mutex_enter(HATLOCK_MUTEXP(hatlockp));
13492 }
13493 }
13494
13495 void
hat_exit(struct hat * hat)13496 hat_exit(struct hat *hat)
13497 {
13498 hatlock_t *hatlockp;
13499
13500 if (hat != ksfmmup) {
13501 hatlockp = TSB_HASH(hat);
13502 mutex_exit(HATLOCK_MUTEXP(hatlockp));
13503 }
13504 }
13505
13506 /*ARGSUSED*/
13507 void
hat_reserve(struct as * as,caddr_t addr,size_t len)13508 hat_reserve(struct as *as, caddr_t addr, size_t len)
13509 {
13510 }
13511
13512 static void
hat_kstat_init(void)13513 hat_kstat_init(void)
13514 {
13515 kstat_t *ksp;
13516
13517 ksp = kstat_create("unix", 0, "sfmmu_global_stat", "hat",
13518 KSTAT_TYPE_RAW, sizeof (struct sfmmu_global_stat),
13519 KSTAT_FLAG_VIRTUAL);
13520 if (ksp) {
13521 ksp->ks_data = (void *) &sfmmu_global_stat;
13522 kstat_install(ksp);
13523 }
13524 ksp = kstat_create("unix", 0, "sfmmu_tsbsize_stat", "hat",
13525 KSTAT_TYPE_RAW, sizeof (struct sfmmu_tsbsize_stat),
13526 KSTAT_FLAG_VIRTUAL);
13527 if (ksp) {
13528 ksp->ks_data = (void *) &sfmmu_tsbsize_stat;
13529 kstat_install(ksp);
13530 }
13531 ksp = kstat_create("unix", 0, "sfmmu_percpu_stat", "hat",
13532 KSTAT_TYPE_RAW, sizeof (struct sfmmu_percpu_stat) * NCPU,
13533 KSTAT_FLAG_WRITABLE);
13534 if (ksp) {
13535 ksp->ks_update = sfmmu_kstat_percpu_update;
13536 kstat_install(ksp);
13537 }
13538 }
13539
13540 /* ARGSUSED */
13541 static int
sfmmu_kstat_percpu_update(kstat_t * ksp,int rw)13542 sfmmu_kstat_percpu_update(kstat_t *ksp, int rw)
13543 {
13544 struct sfmmu_percpu_stat *cpu_kstat = ksp->ks_data;
13545 struct tsbmiss *tsbm = tsbmiss_area;
13546 struct kpmtsbm *kpmtsbm = kpmtsbm_area;
13547 int i;
13548
13549 ASSERT(cpu_kstat);
13550 if (rw == KSTAT_READ) {
13551 for (i = 0; i < NCPU; cpu_kstat++, tsbm++, kpmtsbm++, i++) {
13552 cpu_kstat->sf_itlb_misses = 0;
13553 cpu_kstat->sf_dtlb_misses = 0;
13554 cpu_kstat->sf_utsb_misses = tsbm->utsb_misses -
13555 tsbm->uprot_traps;
13556 cpu_kstat->sf_ktsb_misses = tsbm->ktsb_misses +
13557 kpmtsbm->kpm_tsb_misses - tsbm->kprot_traps;
13558 cpu_kstat->sf_tsb_hits = 0;
13559 cpu_kstat->sf_umod_faults = tsbm->uprot_traps;
13560 cpu_kstat->sf_kmod_faults = tsbm->kprot_traps;
13561 }
13562 } else {
13563 /* KSTAT_WRITE is used to clear stats */
13564 for (i = 0; i < NCPU; tsbm++, kpmtsbm++, i++) {
13565 tsbm->utsb_misses = 0;
13566 tsbm->ktsb_misses = 0;
13567 tsbm->uprot_traps = 0;
13568 tsbm->kprot_traps = 0;
13569 kpmtsbm->kpm_dtlb_misses = 0;
13570 kpmtsbm->kpm_tsb_misses = 0;
13571 }
13572 }
13573 return (0);
13574 }
13575
13576 #ifdef DEBUG
13577
13578 tte_t *gorig[NCPU], *gcur[NCPU], *gnew[NCPU];
13579
13580 /*
13581 * A tte checker. *orig_old is the value we read before cas.
13582 * *cur is the value returned by cas.
13583 * *new is the desired value when we do the cas.
13584 *
13585 * *hmeblkp is currently unused.
13586 */
13587
13588 /* ARGSUSED */
13589 void
chk_tte(tte_t * orig_old,tte_t * cur,tte_t * new,struct hme_blk * hmeblkp)13590 chk_tte(tte_t *orig_old, tte_t *cur, tte_t *new, struct hme_blk *hmeblkp)
13591 {
13592 pfn_t i, j, k;
13593 int cpuid = CPU->cpu_id;
13594
13595 gorig[cpuid] = orig_old;
13596 gcur[cpuid] = cur;
13597 gnew[cpuid] = new;
13598
13599 #ifdef lint
13600 hmeblkp = hmeblkp;
13601 #endif
13602
13603 if (TTE_IS_VALID(orig_old)) {
13604 if (TTE_IS_VALID(cur)) {
13605 i = TTE_TO_TTEPFN(orig_old);
13606 j = TTE_TO_TTEPFN(cur);
13607 k = TTE_TO_TTEPFN(new);
13608 if (i != j) {
13609 /* remap error? */
13610 panic("chk_tte: bad pfn, 0x%lx, 0x%lx", i, j);
13611 }
13612
13613 if (i != k) {
13614 /* remap error? */
13615 panic("chk_tte: bad pfn2, 0x%lx, 0x%lx", i, k);
13616 }
13617 } else {
13618 if (TTE_IS_VALID(new)) {
13619 panic("chk_tte: invalid cur? ");
13620 }
13621
13622 i = TTE_TO_TTEPFN(orig_old);
13623 k = TTE_TO_TTEPFN(new);
13624 if (i != k) {
13625 panic("chk_tte: bad pfn3, 0x%lx, 0x%lx", i, k);
13626 }
13627 }
13628 } else {
13629 if (TTE_IS_VALID(cur)) {
13630 j = TTE_TO_TTEPFN(cur);
13631 if (TTE_IS_VALID(new)) {
13632 k = TTE_TO_TTEPFN(new);
13633 if (j != k) {
13634 panic("chk_tte: bad pfn4, 0x%lx, 0x%lx",
13635 j, k);
13636 }
13637 } else {
13638 panic("chk_tte: why here?");
13639 }
13640 } else {
13641 if (!TTE_IS_VALID(new)) {
13642 panic("chk_tte: why here2 ?");
13643 }
13644 }
13645 }
13646 }
13647
13648 #endif /* DEBUG */
13649
13650 extern void prefetch_tsbe_read(struct tsbe *);
13651 extern void prefetch_tsbe_write(struct tsbe *);
13652
13653
13654 /*
13655 * We want to prefetch 7 cache lines ahead for our read prefetch. This gives
13656 * us optimal performance on Cheetah+. You can only have 8 outstanding
13657 * prefetches at any one time, so we opted for 7 read prefetches and 1 write
13658 * prefetch to make the most utilization of the prefetch capability.
13659 */
13660 #define TSBE_PREFETCH_STRIDE (7)
13661
13662 void
sfmmu_copy_tsb(struct tsb_info * old_tsbinfo,struct tsb_info * new_tsbinfo)13663 sfmmu_copy_tsb(struct tsb_info *old_tsbinfo, struct tsb_info *new_tsbinfo)
13664 {
13665 int old_bytes = TSB_BYTES(old_tsbinfo->tsb_szc);
13666 int new_bytes = TSB_BYTES(new_tsbinfo->tsb_szc);
13667 int old_entries = TSB_ENTRIES(old_tsbinfo->tsb_szc);
13668 int new_entries = TSB_ENTRIES(new_tsbinfo->tsb_szc);
13669 struct tsbe *old;
13670 struct tsbe *new;
13671 struct tsbe *new_base = (struct tsbe *)new_tsbinfo->tsb_va;
13672 uint64_t va;
13673 int new_offset;
13674 int i;
13675 int vpshift;
13676 int last_prefetch;
13677
13678 if (old_bytes == new_bytes) {
13679 bcopy(old_tsbinfo->tsb_va, new_tsbinfo->tsb_va, new_bytes);
13680 } else {
13681
13682 /*
13683 * A TSBE is 16 bytes which means there are four TSBE's per
13684 * P$ line (64 bytes), thus every 4 TSBE's we prefetch.
13685 */
13686 old = (struct tsbe *)old_tsbinfo->tsb_va;
13687 last_prefetch = old_entries - (4*(TSBE_PREFETCH_STRIDE+1));
13688 for (i = 0; i < old_entries; i++, old++) {
13689 if (((i & (4-1)) == 0) && (i < last_prefetch))
13690 prefetch_tsbe_read(old);
13691 if (!old->tte_tag.tag_invalid) {
13692 /*
13693 * We have a valid TTE to remap. Check the
13694 * size. We won't remap 64K or 512K TTEs
13695 * because they span more than one TSB entry
13696 * and are indexed using an 8K virt. page.
13697 * Ditto for 32M and 256M TTEs.
13698 */
13699 if (TTE_CSZ(&old->tte_data) == TTE64K ||
13700 TTE_CSZ(&old->tte_data) == TTE512K)
13701 continue;
13702 if (mmu_page_sizes == max_mmu_page_sizes) {
13703 if (TTE_CSZ(&old->tte_data) == TTE32M ||
13704 TTE_CSZ(&old->tte_data) == TTE256M)
13705 continue;
13706 }
13707
13708 /* clear the lower 22 bits of the va */
13709 va = *(uint64_t *)old << 22;
13710 /* turn va into a virtual pfn */
13711 va >>= 22 - TSB_START_SIZE;
13712 /*
13713 * or in bits from the offset in the tsb
13714 * to get the real virtual pfn. These
13715 * correspond to bits [21:13] in the va
13716 */
13717 vpshift =
13718 TTE_BSZS_SHIFT(TTE_CSZ(&old->tte_data)) &
13719 0x1ff;
13720 va |= (i << vpshift);
13721 va >>= vpshift;
13722 new_offset = va & (new_entries - 1);
13723 new = new_base + new_offset;
13724 prefetch_tsbe_write(new);
13725 *new = *old;
13726 }
13727 }
13728 }
13729 }
13730
13731 /*
13732 * unused in sfmmu
13733 */
13734 void
hat_dump(void)13735 hat_dump(void)
13736 {
13737 }
13738
13739 /*
13740 * Called when a thread is exiting and we have switched to the kernel address
13741 * space. Perform the same VM initialization resume() uses when switching
13742 * processes.
13743 *
13744 * Note that sfmmu_load_mmustate() is currently a no-op for kernel threads, but
13745 * we call it anyway in case the semantics change in the future.
13746 */
13747 /*ARGSUSED*/
13748 void
hat_thread_exit(kthread_t * thd)13749 hat_thread_exit(kthread_t *thd)
13750 {
13751 uint_t pgsz_cnum;
13752 uint_t pstate_save;
13753
13754 ASSERT(thd->t_procp->p_as == &kas);
13755
13756 pgsz_cnum = KCONTEXT;
13757 #ifdef sun4u
13758 pgsz_cnum |= (ksfmmup->sfmmu_cext << CTXREG_EXT_SHIFT);
13759 #endif
13760
13761 /*
13762 * Note that sfmmu_load_mmustate() is currently a no-op for
13763 * kernel threads. We need to disable interrupts here,
13764 * simply because otherwise sfmmu_load_mmustate() would panic
13765 * if the caller does not disable interrupts.
13766 */
13767 pstate_save = sfmmu_disable_intrs();
13768
13769 /* Compatibility Note: hw takes care of MMU_SCONTEXT1 */
13770 sfmmu_setctx_sec(pgsz_cnum);
13771 sfmmu_load_mmustate(ksfmmup);
13772 sfmmu_enable_intrs(pstate_save);
13773 }
13774
13775
13776 /*
13777 * SRD support
13778 */
13779 #define SRD_HASH_FUNCTION(vp) (((((uintptr_t)(vp)) >> 4) ^ \
13780 (((uintptr_t)(vp)) >> 11)) & \
13781 srd_hashmask)
13782
13783 /*
13784 * Attach the process to the srd struct associated with the exec vnode
13785 * from which the process is started.
13786 */
13787 void
hat_join_srd(struct hat * sfmmup,vnode_t * evp)13788 hat_join_srd(struct hat *sfmmup, vnode_t *evp)
13789 {
13790 uint_t hash = SRD_HASH_FUNCTION(evp);
13791 sf_srd_t *srdp;
13792 sf_srd_t *newsrdp;
13793
13794 ASSERT(sfmmup != ksfmmup);
13795 ASSERT(sfmmup->sfmmu_srdp == NULL);
13796
13797 if (!shctx_on) {
13798 return;
13799 }
13800
13801 VN_HOLD(evp);
13802
13803 if (srd_buckets[hash].srdb_srdp != NULL) {
13804 mutex_enter(&srd_buckets[hash].srdb_lock);
13805 for (srdp = srd_buckets[hash].srdb_srdp; srdp != NULL;
13806 srdp = srdp->srd_hash) {
13807 if (srdp->srd_evp == evp) {
13808 ASSERT(srdp->srd_refcnt >= 0);
13809 sfmmup->sfmmu_srdp = srdp;
13810 atomic_inc_32(
13811 (volatile uint_t *)&srdp->srd_refcnt);
13812 mutex_exit(&srd_buckets[hash].srdb_lock);
13813 return;
13814 }
13815 }
13816 mutex_exit(&srd_buckets[hash].srdb_lock);
13817 }
13818 newsrdp = kmem_cache_alloc(srd_cache, KM_SLEEP);
13819 ASSERT(newsrdp->srd_next_ismrid == 0 && newsrdp->srd_next_hmerid == 0);
13820
13821 newsrdp->srd_evp = evp;
13822 newsrdp->srd_refcnt = 1;
13823 newsrdp->srd_hmergnfree = NULL;
13824 newsrdp->srd_ismrgnfree = NULL;
13825
13826 mutex_enter(&srd_buckets[hash].srdb_lock);
13827 for (srdp = srd_buckets[hash].srdb_srdp; srdp != NULL;
13828 srdp = srdp->srd_hash) {
13829 if (srdp->srd_evp == evp) {
13830 ASSERT(srdp->srd_refcnt >= 0);
13831 sfmmup->sfmmu_srdp = srdp;
13832 atomic_inc_32((volatile uint_t *)&srdp->srd_refcnt);
13833 mutex_exit(&srd_buckets[hash].srdb_lock);
13834 kmem_cache_free(srd_cache, newsrdp);
13835 return;
13836 }
13837 }
13838 newsrdp->srd_hash = srd_buckets[hash].srdb_srdp;
13839 srd_buckets[hash].srdb_srdp = newsrdp;
13840 sfmmup->sfmmu_srdp = newsrdp;
13841
13842 mutex_exit(&srd_buckets[hash].srdb_lock);
13843
13844 }
13845
13846 static void
sfmmu_leave_srd(sfmmu_t * sfmmup)13847 sfmmu_leave_srd(sfmmu_t *sfmmup)
13848 {
13849 vnode_t *evp;
13850 sf_srd_t *srdp = sfmmup->sfmmu_srdp;
13851 uint_t hash;
13852 sf_srd_t **prev_srdpp;
13853 sf_region_t *rgnp;
13854 sf_region_t *nrgnp;
13855 #ifdef DEBUG
13856 int rgns = 0;
13857 #endif
13858 int i;
13859
13860 ASSERT(sfmmup != ksfmmup);
13861 ASSERT(srdp != NULL);
13862 ASSERT(srdp->srd_refcnt > 0);
13863 ASSERT(sfmmup->sfmmu_scdp == NULL);
13864 ASSERT(sfmmup->sfmmu_free == 1);
13865
13866 sfmmup->sfmmu_srdp = NULL;
13867 evp = srdp->srd_evp;
13868 ASSERT(evp != NULL);
13869 if (atomic_dec_32_nv((volatile uint_t *)&srdp->srd_refcnt)) {
13870 VN_RELE(evp);
13871 return;
13872 }
13873
13874 hash = SRD_HASH_FUNCTION(evp);
13875 mutex_enter(&srd_buckets[hash].srdb_lock);
13876 for (prev_srdpp = &srd_buckets[hash].srdb_srdp;
13877 (srdp = *prev_srdpp) != NULL; prev_srdpp = &srdp->srd_hash) {
13878 if (srdp->srd_evp == evp) {
13879 break;
13880 }
13881 }
13882 if (srdp == NULL || srdp->srd_refcnt) {
13883 mutex_exit(&srd_buckets[hash].srdb_lock);
13884 VN_RELE(evp);
13885 return;
13886 }
13887 *prev_srdpp = srdp->srd_hash;
13888 mutex_exit(&srd_buckets[hash].srdb_lock);
13889
13890 ASSERT(srdp->srd_refcnt == 0);
13891 VN_RELE(evp);
13892
13893 #ifdef DEBUG
13894 for (i = 0; i < SFMMU_MAX_REGION_BUCKETS; i++) {
13895 ASSERT(srdp->srd_rgnhash[i] == NULL);
13896 }
13897 #endif /* DEBUG */
13898
13899 /* free each hme regions in the srd */
13900 for (rgnp = srdp->srd_hmergnfree; rgnp != NULL; rgnp = nrgnp) {
13901 nrgnp = rgnp->rgn_next;
13902 ASSERT(rgnp->rgn_id < srdp->srd_next_hmerid);
13903 ASSERT(rgnp->rgn_refcnt == 0);
13904 ASSERT(rgnp->rgn_sfmmu_head == NULL);
13905 ASSERT(rgnp->rgn_flags & SFMMU_REGION_FREE);
13906 ASSERT(rgnp->rgn_hmeflags == 0);
13907 ASSERT(srdp->srd_hmergnp[rgnp->rgn_id] == rgnp);
13908 #ifdef DEBUG
13909 for (i = 0; i < MMU_PAGE_SIZES; i++) {
13910 ASSERT(rgnp->rgn_ttecnt[i] == 0);
13911 }
13912 rgns++;
13913 #endif /* DEBUG */
13914 kmem_cache_free(region_cache, rgnp);
13915 }
13916 ASSERT(rgns == srdp->srd_next_hmerid);
13917
13918 #ifdef DEBUG
13919 rgns = 0;
13920 #endif
13921 /* free each ism rgns in the srd */
13922 for (rgnp = srdp->srd_ismrgnfree; rgnp != NULL; rgnp = nrgnp) {
13923 nrgnp = rgnp->rgn_next;
13924 ASSERT(rgnp->rgn_id < srdp->srd_next_ismrid);
13925 ASSERT(rgnp->rgn_refcnt == 0);
13926 ASSERT(rgnp->rgn_sfmmu_head == NULL);
13927 ASSERT(rgnp->rgn_flags & SFMMU_REGION_FREE);
13928 ASSERT(srdp->srd_ismrgnp[rgnp->rgn_id] == rgnp);
13929 #ifdef DEBUG
13930 for (i = 0; i < MMU_PAGE_SIZES; i++) {
13931 ASSERT(rgnp->rgn_ttecnt[i] == 0);
13932 }
13933 rgns++;
13934 #endif /* DEBUG */
13935 kmem_cache_free(region_cache, rgnp);
13936 }
13937 ASSERT(rgns == srdp->srd_next_ismrid);
13938 ASSERT(srdp->srd_ismbusyrgns == 0);
13939 ASSERT(srdp->srd_hmebusyrgns == 0);
13940
13941 srdp->srd_next_ismrid = 0;
13942 srdp->srd_next_hmerid = 0;
13943
13944 bzero((void *)srdp->srd_ismrgnp,
13945 sizeof (sf_region_t *) * SFMMU_MAX_ISM_REGIONS);
13946 bzero((void *)srdp->srd_hmergnp,
13947 sizeof (sf_region_t *) * SFMMU_MAX_HME_REGIONS);
13948
13949 ASSERT(srdp->srd_scdp == NULL);
13950 kmem_cache_free(srd_cache, srdp);
13951 }
13952
13953 /* ARGSUSED */
13954 static int
sfmmu_srdcache_constructor(void * buf,void * cdrarg,int kmflags)13955 sfmmu_srdcache_constructor(void *buf, void *cdrarg, int kmflags)
13956 {
13957 sf_srd_t *srdp = (sf_srd_t *)buf;
13958 bzero(buf, sizeof (*srdp));
13959
13960 mutex_init(&srdp->srd_mutex, NULL, MUTEX_DEFAULT, NULL);
13961 mutex_init(&srdp->srd_scd_mutex, NULL, MUTEX_DEFAULT, NULL);
13962 return (0);
13963 }
13964
13965 /* ARGSUSED */
13966 static void
sfmmu_srdcache_destructor(void * buf,void * cdrarg)13967 sfmmu_srdcache_destructor(void *buf, void *cdrarg)
13968 {
13969 sf_srd_t *srdp = (sf_srd_t *)buf;
13970
13971 mutex_destroy(&srdp->srd_mutex);
13972 mutex_destroy(&srdp->srd_scd_mutex);
13973 }
13974
13975 /*
13976 * The caller makes sure hat_join_region()/hat_leave_region() can't be called
13977 * at the same time for the same process and address range. This is ensured by
13978 * the fact that address space is locked as writer when a process joins the
13979 * regions. Therefore there's no need to hold an srd lock during the entire
13980 * execution of hat_join_region()/hat_leave_region().
13981 */
13982
13983 #define RGN_HASH_FUNCTION(obj) (((((uintptr_t)(obj)) >> 4) ^ \
13984 (((uintptr_t)(obj)) >> 11)) & \
13985 srd_rgn_hashmask)
13986 /*
13987 * This routine implements the shared context functionality required when
13988 * attaching a segment to an address space. It must be called from
13989 * hat_share() for D(ISM) segments and from segvn_create() for segments
13990 * with the MAP_PRIVATE and MAP_TEXT flags set. It returns a region_cookie
13991 * which is saved in the private segment data for hme segments and
13992 * the ism_map structure for ism segments.
13993 */
13994 hat_region_cookie_t
hat_join_region(struct hat * sfmmup,caddr_t r_saddr,size_t r_size,void * r_obj,u_offset_t r_objoff,uchar_t r_perm,uchar_t r_pgszc,hat_rgn_cb_func_t r_cb_function,uint_t flags)13995 hat_join_region(struct hat *sfmmup,
13996 caddr_t r_saddr,
13997 size_t r_size,
13998 void *r_obj,
13999 u_offset_t r_objoff,
14000 uchar_t r_perm,
14001 uchar_t r_pgszc,
14002 hat_rgn_cb_func_t r_cb_function,
14003 uint_t flags)
14004 {
14005 sf_srd_t *srdp = sfmmup->sfmmu_srdp;
14006 uint_t rhash;
14007 uint_t rid;
14008 hatlock_t *hatlockp;
14009 sf_region_t *rgnp;
14010 sf_region_t *new_rgnp = NULL;
14011 int i;
14012 uint16_t *nextidp;
14013 sf_region_t **freelistp;
14014 int maxids;
14015 sf_region_t **rarrp;
14016 uint16_t *busyrgnsp;
14017 ulong_t rttecnt;
14018 uchar_t tteflag;
14019 uchar_t r_type = flags & HAT_REGION_TYPE_MASK;
14020 int text = (r_type == HAT_REGION_TEXT);
14021
14022 if (srdp == NULL || r_size == 0) {
14023 return (HAT_INVALID_REGION_COOKIE);
14024 }
14025
14026 ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
14027 ASSERT(sfmmup != ksfmmup);
14028 ASSERT(AS_WRITE_HELD(sfmmup->sfmmu_as));
14029 ASSERT(srdp->srd_refcnt > 0);
14030 ASSERT(!(flags & ~HAT_REGION_TYPE_MASK));
14031 ASSERT(flags == HAT_REGION_TEXT || flags == HAT_REGION_ISM);
14032 ASSERT(r_pgszc < mmu_page_sizes);
14033 if (!IS_P2ALIGNED(r_saddr, TTEBYTES(r_pgszc)) ||
14034 !IS_P2ALIGNED(r_size, TTEBYTES(r_pgszc))) {
14035 panic("hat_join_region: region addr or size is not aligned\n");
14036 }
14037
14038
14039 r_type = (r_type == HAT_REGION_ISM) ? SFMMU_REGION_ISM :
14040 SFMMU_REGION_HME;
14041 /*
14042 * Currently only support shared hmes for the read only main text
14043 * region.
14044 */
14045 if (r_type == SFMMU_REGION_HME && ((r_obj != srdp->srd_evp) ||
14046 (r_perm & PROT_WRITE))) {
14047 return (HAT_INVALID_REGION_COOKIE);
14048 }
14049
14050 rhash = RGN_HASH_FUNCTION(r_obj);
14051
14052 if (r_type == SFMMU_REGION_ISM) {
14053 nextidp = &srdp->srd_next_ismrid;
14054 freelistp = &srdp->srd_ismrgnfree;
14055 maxids = SFMMU_MAX_ISM_REGIONS;
14056 rarrp = srdp->srd_ismrgnp;
14057 busyrgnsp = &srdp->srd_ismbusyrgns;
14058 } else {
14059 nextidp = &srdp->srd_next_hmerid;
14060 freelistp = &srdp->srd_hmergnfree;
14061 maxids = SFMMU_MAX_HME_REGIONS;
14062 rarrp = srdp->srd_hmergnp;
14063 busyrgnsp = &srdp->srd_hmebusyrgns;
14064 }
14065
14066 mutex_enter(&srdp->srd_mutex);
14067
14068 for (rgnp = srdp->srd_rgnhash[rhash]; rgnp != NULL;
14069 rgnp = rgnp->rgn_hash) {
14070 if (rgnp->rgn_saddr == r_saddr && rgnp->rgn_size == r_size &&
14071 rgnp->rgn_obj == r_obj && rgnp->rgn_objoff == r_objoff &&
14072 rgnp->rgn_perm == r_perm && rgnp->rgn_pgszc == r_pgszc) {
14073 break;
14074 }
14075 }
14076
14077 rfound:
14078 if (rgnp != NULL) {
14079 ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == r_type);
14080 ASSERT(rgnp->rgn_cb_function == r_cb_function);
14081 ASSERT(rgnp->rgn_refcnt >= 0);
14082 rid = rgnp->rgn_id;
14083 ASSERT(rid < maxids);
14084 ASSERT(rarrp[rid] == rgnp);
14085 ASSERT(rid < *nextidp);
14086 atomic_inc_32((volatile uint_t *)&rgnp->rgn_refcnt);
14087 mutex_exit(&srdp->srd_mutex);
14088 if (new_rgnp != NULL) {
14089 kmem_cache_free(region_cache, new_rgnp);
14090 }
14091 if (r_type == SFMMU_REGION_HME) {
14092 int myjoin =
14093 (sfmmup == astosfmmu(curthread->t_procp->p_as));
14094
14095 sfmmu_link_to_hmeregion(sfmmup, rgnp);
14096 /*
14097 * bitmap should be updated after linking sfmmu on
14098 * region list so that pageunload() doesn't skip
14099 * TSB/TLB flush. As soon as bitmap is updated another
14100 * thread in this process can already start accessing
14101 * this region.
14102 */
14103 /*
14104 * Normally ttecnt accounting is done as part of
14105 * pagefault handling. But a process may not take any
14106 * pagefaults on shared hmeblks created by some other
14107 * process. To compensate for this assume that the
14108 * entire region will end up faulted in using
14109 * the region's pagesize.
14110 *
14111 */
14112 if (r_pgszc > TTE8K) {
14113 tteflag = 1 << r_pgszc;
14114 if (disable_large_pages & tteflag) {
14115 tteflag = 0;
14116 }
14117 } else {
14118 tteflag = 0;
14119 }
14120 if (tteflag && !(sfmmup->sfmmu_rtteflags & tteflag)) {
14121 hatlockp = sfmmu_hat_enter(sfmmup);
14122 sfmmup->sfmmu_rtteflags |= tteflag;
14123 sfmmu_hat_exit(hatlockp);
14124 }
14125 hatlockp = sfmmu_hat_enter(sfmmup);
14126
14127 /*
14128 * Preallocate 1/4 of ttecnt's in 8K TSB for >= 4M
14129 * region to allow for large page allocation failure.
14130 */
14131 if (r_pgszc >= TTE4M) {
14132 sfmmup->sfmmu_tsb0_4minflcnt +=
14133 r_size >> (TTE_PAGE_SHIFT(TTE8K) + 2);
14134 }
14135
14136 /* update sfmmu_ttecnt with the shme rgn ttecnt */
14137 rttecnt = r_size >> TTE_PAGE_SHIFT(r_pgszc);
14138 atomic_add_long(&sfmmup->sfmmu_ttecnt[r_pgszc],
14139 rttecnt);
14140
14141 if (text && r_pgszc >= TTE4M &&
14142 (tteflag || ((disable_large_pages >> TTE4M) &
14143 ((1 << (r_pgszc - TTE4M + 1)) - 1))) &&
14144 !SFMMU_FLAGS_ISSET(sfmmup, HAT_4MTEXT_FLAG)) {
14145 SFMMU_FLAGS_SET(sfmmup, HAT_4MTEXT_FLAG);
14146 }
14147
14148 sfmmu_hat_exit(hatlockp);
14149 /*
14150 * On Panther we need to make sure TLB is programmed
14151 * to accept 32M/256M pages. Call
14152 * sfmmu_check_page_sizes() now to make sure TLB is
14153 * setup before making hmeregions visible to other
14154 * threads.
14155 */
14156 sfmmu_check_page_sizes(sfmmup, 1);
14157 hatlockp = sfmmu_hat_enter(sfmmup);
14158 SF_RGNMAP_ADD(sfmmup->sfmmu_hmeregion_map, rid);
14159
14160 /*
14161 * if context is invalid tsb miss exception code will
14162 * call sfmmu_check_page_sizes() and update tsbmiss
14163 * area later.
14164 */
14165 kpreempt_disable();
14166 if (myjoin &&
14167 (sfmmup->sfmmu_ctxs[CPU_MMU_IDX(CPU)].cnum
14168 != INVALID_CONTEXT)) {
14169 struct tsbmiss *tsbmp;
14170
14171 tsbmp = &tsbmiss_area[CPU->cpu_id];
14172 ASSERT(sfmmup == tsbmp->usfmmup);
14173 BT_SET(tsbmp->shmermap, rid);
14174 if (r_pgszc > TTE64K) {
14175 tsbmp->uhat_rtteflags |= tteflag;
14176 }
14177
14178 }
14179 kpreempt_enable();
14180
14181 sfmmu_hat_exit(hatlockp);
14182 ASSERT((hat_region_cookie_t)((uint64_t)rid) !=
14183 HAT_INVALID_REGION_COOKIE);
14184 } else {
14185 hatlockp = sfmmu_hat_enter(sfmmup);
14186 SF_RGNMAP_ADD(sfmmup->sfmmu_ismregion_map, rid);
14187 sfmmu_hat_exit(hatlockp);
14188 }
14189 ASSERT(rid < maxids);
14190
14191 if (r_type == SFMMU_REGION_ISM) {
14192 sfmmu_find_scd(sfmmup);
14193 }
14194 return ((hat_region_cookie_t)((uint64_t)rid));
14195 }
14196
14197 ASSERT(new_rgnp == NULL);
14198
14199 if (*busyrgnsp >= maxids) {
14200 mutex_exit(&srdp->srd_mutex);
14201 return (HAT_INVALID_REGION_COOKIE);
14202 }
14203
14204 ASSERT(MUTEX_HELD(&srdp->srd_mutex));
14205 if (*freelistp != NULL) {
14206 rgnp = *freelistp;
14207 *freelistp = rgnp->rgn_next;
14208 ASSERT(rgnp->rgn_id < *nextidp);
14209 ASSERT(rgnp->rgn_id < maxids);
14210 ASSERT(rgnp->rgn_flags & SFMMU_REGION_FREE);
14211 ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK)
14212 == r_type);
14213 ASSERT(rarrp[rgnp->rgn_id] == rgnp);
14214 ASSERT(rgnp->rgn_hmeflags == 0);
14215 } else {
14216 /*
14217 * release local locks before memory allocation.
14218 */
14219 mutex_exit(&srdp->srd_mutex);
14220
14221 new_rgnp = kmem_cache_alloc(region_cache, KM_SLEEP);
14222
14223 mutex_enter(&srdp->srd_mutex);
14224 for (rgnp = srdp->srd_rgnhash[rhash]; rgnp != NULL;
14225 rgnp = rgnp->rgn_hash) {
14226 if (rgnp->rgn_saddr == r_saddr &&
14227 rgnp->rgn_size == r_size &&
14228 rgnp->rgn_obj == r_obj &&
14229 rgnp->rgn_objoff == r_objoff &&
14230 rgnp->rgn_perm == r_perm &&
14231 rgnp->rgn_pgszc == r_pgszc) {
14232 break;
14233 }
14234 }
14235 if (rgnp != NULL) {
14236 goto rfound;
14237 }
14238
14239 if (*nextidp >= maxids) {
14240 mutex_exit(&srdp->srd_mutex);
14241 goto fail;
14242 }
14243 rgnp = new_rgnp;
14244 new_rgnp = NULL;
14245 rgnp->rgn_id = (*nextidp)++;
14246 ASSERT(rgnp->rgn_id < maxids);
14247 ASSERT(rarrp[rgnp->rgn_id] == NULL);
14248 rarrp[rgnp->rgn_id] = rgnp;
14249 }
14250
14251 ASSERT(rgnp->rgn_sfmmu_head == NULL);
14252 ASSERT(rgnp->rgn_hmeflags == 0);
14253 #ifdef DEBUG
14254 for (i = 0; i < MMU_PAGE_SIZES; i++) {
14255 ASSERT(rgnp->rgn_ttecnt[i] == 0);
14256 }
14257 #endif
14258 rgnp->rgn_saddr = r_saddr;
14259 rgnp->rgn_size = r_size;
14260 rgnp->rgn_obj = r_obj;
14261 rgnp->rgn_objoff = r_objoff;
14262 rgnp->rgn_perm = r_perm;
14263 rgnp->rgn_pgszc = r_pgszc;
14264 rgnp->rgn_flags = r_type;
14265 rgnp->rgn_refcnt = 0;
14266 rgnp->rgn_cb_function = r_cb_function;
14267 rgnp->rgn_hash = srdp->srd_rgnhash[rhash];
14268 srdp->srd_rgnhash[rhash] = rgnp;
14269 (*busyrgnsp)++;
14270 ASSERT(*busyrgnsp <= maxids);
14271 goto rfound;
14272
14273 fail:
14274 ASSERT(new_rgnp != NULL);
14275 kmem_cache_free(region_cache, new_rgnp);
14276 return (HAT_INVALID_REGION_COOKIE);
14277 }
14278
14279 /*
14280 * This function implements the shared context functionality required
14281 * when detaching a segment from an address space. It must be called
14282 * from hat_unshare() for all D(ISM) segments and from segvn_unmap(),
14283 * for segments with a valid region_cookie.
14284 * It will also be called from all seg_vn routines which change a
14285 * segment's attributes such as segvn_setprot(), segvn_setpagesize(),
14286 * segvn_clrszc() & segvn_advise(), as well as in the case of COW fault
14287 * from segvn_fault().
14288 */
14289 void
hat_leave_region(struct hat * sfmmup,hat_region_cookie_t rcookie,uint_t flags)14290 hat_leave_region(struct hat *sfmmup, hat_region_cookie_t rcookie, uint_t flags)
14291 {
14292 sf_srd_t *srdp = sfmmup->sfmmu_srdp;
14293 sf_scd_t *scdp;
14294 uint_t rhash;
14295 uint_t rid = (uint_t)((uint64_t)rcookie);
14296 hatlock_t *hatlockp = NULL;
14297 sf_region_t *rgnp;
14298 sf_region_t **prev_rgnpp;
14299 sf_region_t *cur_rgnp;
14300 void *r_obj;
14301 int i;
14302 caddr_t r_saddr;
14303 caddr_t r_eaddr;
14304 size_t r_size;
14305 uchar_t r_pgszc;
14306 uchar_t r_type = flags & HAT_REGION_TYPE_MASK;
14307
14308 ASSERT(sfmmup != ksfmmup);
14309 ASSERT(srdp != NULL);
14310 ASSERT(srdp->srd_refcnt > 0);
14311 ASSERT(!(flags & ~HAT_REGION_TYPE_MASK));
14312 ASSERT(flags == HAT_REGION_TEXT || flags == HAT_REGION_ISM);
14313 ASSERT(!sfmmup->sfmmu_free || sfmmup->sfmmu_scdp == NULL);
14314
14315 r_type = (r_type == HAT_REGION_ISM) ? SFMMU_REGION_ISM :
14316 SFMMU_REGION_HME;
14317
14318 if (r_type == SFMMU_REGION_ISM) {
14319 ASSERT(SFMMU_IS_ISMRID_VALID(rid));
14320 ASSERT(rid < SFMMU_MAX_ISM_REGIONS);
14321 rgnp = srdp->srd_ismrgnp[rid];
14322 } else {
14323 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
14324 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
14325 rgnp = srdp->srd_hmergnp[rid];
14326 }
14327 ASSERT(rgnp != NULL);
14328 ASSERT(rgnp->rgn_id == rid);
14329 ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == r_type);
14330 ASSERT(!(rgnp->rgn_flags & SFMMU_REGION_FREE));
14331 ASSERT(AS_LOCK_HELD(sfmmup->sfmmu_as));
14332
14333 ASSERT(sfmmup->sfmmu_xhat_provider == NULL);
14334 if (r_type == SFMMU_REGION_HME && sfmmup->sfmmu_as->a_xhat != NULL) {
14335 xhat_unload_callback_all(sfmmup->sfmmu_as, rgnp->rgn_saddr,
14336 rgnp->rgn_size, 0, NULL);
14337 }
14338
14339 if (sfmmup->sfmmu_free) {
14340 ulong_t rttecnt;
14341 r_pgszc = rgnp->rgn_pgszc;
14342 r_size = rgnp->rgn_size;
14343
14344 ASSERT(sfmmup->sfmmu_scdp == NULL);
14345 if (r_type == SFMMU_REGION_ISM) {
14346 SF_RGNMAP_DEL(sfmmup->sfmmu_ismregion_map, rid);
14347 } else {
14348 /* update shme rgns ttecnt in sfmmu_ttecnt */
14349 rttecnt = r_size >> TTE_PAGE_SHIFT(r_pgszc);
14350 ASSERT(sfmmup->sfmmu_ttecnt[r_pgszc] >= rttecnt);
14351
14352 atomic_add_long(&sfmmup->sfmmu_ttecnt[r_pgszc],
14353 -rttecnt);
14354
14355 SF_RGNMAP_DEL(sfmmup->sfmmu_hmeregion_map, rid);
14356 }
14357 } else if (r_type == SFMMU_REGION_ISM) {
14358 hatlockp = sfmmu_hat_enter(sfmmup);
14359 ASSERT(rid < srdp->srd_next_ismrid);
14360 SF_RGNMAP_DEL(sfmmup->sfmmu_ismregion_map, rid);
14361 scdp = sfmmup->sfmmu_scdp;
14362 if (scdp != NULL &&
14363 SF_RGNMAP_TEST(scdp->scd_ismregion_map, rid)) {
14364 sfmmu_leave_scd(sfmmup, r_type);
14365 ASSERT(sfmmu_hat_lock_held(sfmmup));
14366 }
14367 sfmmu_hat_exit(hatlockp);
14368 } else {
14369 ulong_t rttecnt;
14370 r_pgszc = rgnp->rgn_pgszc;
14371 r_saddr = rgnp->rgn_saddr;
14372 r_size = rgnp->rgn_size;
14373 r_eaddr = r_saddr + r_size;
14374
14375 ASSERT(r_type == SFMMU_REGION_HME);
14376 hatlockp = sfmmu_hat_enter(sfmmup);
14377 ASSERT(rid < srdp->srd_next_hmerid);
14378 SF_RGNMAP_DEL(sfmmup->sfmmu_hmeregion_map, rid);
14379
14380 /*
14381 * If region is part of an SCD call sfmmu_leave_scd().
14382 * Otherwise if process is not exiting and has valid context
14383 * just drop the context on the floor to lose stale TLB
14384 * entries and force the update of tsb miss area to reflect
14385 * the new region map. After that clean our TSB entries.
14386 */
14387 scdp = sfmmup->sfmmu_scdp;
14388 if (scdp != NULL &&
14389 SF_RGNMAP_TEST(scdp->scd_hmeregion_map, rid)) {
14390 sfmmu_leave_scd(sfmmup, r_type);
14391 ASSERT(sfmmu_hat_lock_held(sfmmup));
14392 }
14393 sfmmu_invalidate_ctx(sfmmup);
14394
14395 i = TTE8K;
14396 while (i < mmu_page_sizes) {
14397 if (rgnp->rgn_ttecnt[i] != 0) {
14398 sfmmu_unload_tsb_range(sfmmup, r_saddr,
14399 r_eaddr, i);
14400 if (i < TTE4M) {
14401 i = TTE4M;
14402 continue;
14403 } else {
14404 break;
14405 }
14406 }
14407 i++;
14408 }
14409 /* Remove the preallocated 1/4 8k ttecnt for 4M regions. */
14410 if (r_pgszc >= TTE4M) {
14411 rttecnt = r_size >> (TTE_PAGE_SHIFT(TTE8K) + 2);
14412 ASSERT(sfmmup->sfmmu_tsb0_4minflcnt >=
14413 rttecnt);
14414 sfmmup->sfmmu_tsb0_4minflcnt -= rttecnt;
14415 }
14416
14417 /* update shme rgns ttecnt in sfmmu_ttecnt */
14418 rttecnt = r_size >> TTE_PAGE_SHIFT(r_pgszc);
14419 ASSERT(sfmmup->sfmmu_ttecnt[r_pgszc] >= rttecnt);
14420 atomic_add_long(&sfmmup->sfmmu_ttecnt[r_pgszc], -rttecnt);
14421
14422 sfmmu_hat_exit(hatlockp);
14423 if (scdp != NULL && sfmmup->sfmmu_scdp == NULL) {
14424 /* sfmmup left the scd, grow private tsb */
14425 sfmmu_check_page_sizes(sfmmup, 1);
14426 } else {
14427 sfmmu_check_page_sizes(sfmmup, 0);
14428 }
14429 }
14430
14431 if (r_type == SFMMU_REGION_HME) {
14432 sfmmu_unlink_from_hmeregion(sfmmup, rgnp);
14433 }
14434
14435 r_obj = rgnp->rgn_obj;
14436 if (atomic_dec_32_nv((volatile uint_t *)&rgnp->rgn_refcnt)) {
14437 return;
14438 }
14439
14440 /*
14441 * looks like nobody uses this region anymore. Free it.
14442 */
14443 rhash = RGN_HASH_FUNCTION(r_obj);
14444 mutex_enter(&srdp->srd_mutex);
14445 for (prev_rgnpp = &srdp->srd_rgnhash[rhash];
14446 (cur_rgnp = *prev_rgnpp) != NULL;
14447 prev_rgnpp = &cur_rgnp->rgn_hash) {
14448 if (cur_rgnp == rgnp && cur_rgnp->rgn_refcnt == 0) {
14449 break;
14450 }
14451 }
14452
14453 if (cur_rgnp == NULL) {
14454 mutex_exit(&srdp->srd_mutex);
14455 return;
14456 }
14457
14458 ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == r_type);
14459 *prev_rgnpp = rgnp->rgn_hash;
14460 if (r_type == SFMMU_REGION_ISM) {
14461 rgnp->rgn_flags |= SFMMU_REGION_FREE;
14462 ASSERT(rid < srdp->srd_next_ismrid);
14463 rgnp->rgn_next = srdp->srd_ismrgnfree;
14464 srdp->srd_ismrgnfree = rgnp;
14465 ASSERT(srdp->srd_ismbusyrgns > 0);
14466 srdp->srd_ismbusyrgns--;
14467 mutex_exit(&srdp->srd_mutex);
14468 return;
14469 }
14470 mutex_exit(&srdp->srd_mutex);
14471
14472 /*
14473 * Destroy region's hmeblks.
14474 */
14475 sfmmu_unload_hmeregion(srdp, rgnp);
14476
14477 rgnp->rgn_hmeflags = 0;
14478
14479 ASSERT(rgnp->rgn_sfmmu_head == NULL);
14480 ASSERT(rgnp->rgn_id == rid);
14481 for (i = 0; i < MMU_PAGE_SIZES; i++) {
14482 rgnp->rgn_ttecnt[i] = 0;
14483 }
14484 rgnp->rgn_flags |= SFMMU_REGION_FREE;
14485 mutex_enter(&srdp->srd_mutex);
14486 ASSERT(rid < srdp->srd_next_hmerid);
14487 rgnp->rgn_next = srdp->srd_hmergnfree;
14488 srdp->srd_hmergnfree = rgnp;
14489 ASSERT(srdp->srd_hmebusyrgns > 0);
14490 srdp->srd_hmebusyrgns--;
14491 mutex_exit(&srdp->srd_mutex);
14492 }
14493
14494 /*
14495 * For now only called for hmeblk regions and not for ISM regions.
14496 */
14497 void
hat_dup_region(struct hat * sfmmup,hat_region_cookie_t rcookie)14498 hat_dup_region(struct hat *sfmmup, hat_region_cookie_t rcookie)
14499 {
14500 sf_srd_t *srdp = sfmmup->sfmmu_srdp;
14501 uint_t rid = (uint_t)((uint64_t)rcookie);
14502 sf_region_t *rgnp;
14503 sf_rgn_link_t *rlink;
14504 sf_rgn_link_t *hrlink;
14505 ulong_t rttecnt;
14506
14507 ASSERT(sfmmup != ksfmmup);
14508 ASSERT(srdp != NULL);
14509 ASSERT(srdp->srd_refcnt > 0);
14510
14511 ASSERT(rid < srdp->srd_next_hmerid);
14512 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
14513 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
14514
14515 rgnp = srdp->srd_hmergnp[rid];
14516 ASSERT(rgnp->rgn_refcnt > 0);
14517 ASSERT(rgnp->rgn_id == rid);
14518 ASSERT((rgnp->rgn_flags & SFMMU_REGION_TYPE_MASK) == SFMMU_REGION_HME);
14519 ASSERT(!(rgnp->rgn_flags & SFMMU_REGION_FREE));
14520
14521 atomic_inc_32((volatile uint_t *)&rgnp->rgn_refcnt);
14522
14523 /* LINTED: constant in conditional context */
14524 SFMMU_HMERID2RLINKP(sfmmup, rid, rlink, 1, 0);
14525 ASSERT(rlink != NULL);
14526 mutex_enter(&rgnp->rgn_mutex);
14527 ASSERT(rgnp->rgn_sfmmu_head != NULL);
14528 /* LINTED: constant in conditional context */
14529 SFMMU_HMERID2RLINKP(rgnp->rgn_sfmmu_head, rid, hrlink, 0, 0);
14530 ASSERT(hrlink != NULL);
14531 ASSERT(hrlink->prev == NULL);
14532 rlink->next = rgnp->rgn_sfmmu_head;
14533 rlink->prev = NULL;
14534 hrlink->prev = sfmmup;
14535 /*
14536 * make sure rlink's next field is correct
14537 * before making this link visible.
14538 */
14539 membar_stst();
14540 rgnp->rgn_sfmmu_head = sfmmup;
14541 mutex_exit(&rgnp->rgn_mutex);
14542
14543 /* update sfmmu_ttecnt with the shme rgn ttecnt */
14544 rttecnt = rgnp->rgn_size >> TTE_PAGE_SHIFT(rgnp->rgn_pgszc);
14545 atomic_add_long(&sfmmup->sfmmu_ttecnt[rgnp->rgn_pgszc], rttecnt);
14546 /* update tsb0 inflation count */
14547 if (rgnp->rgn_pgszc >= TTE4M) {
14548 sfmmup->sfmmu_tsb0_4minflcnt +=
14549 rgnp->rgn_size >> (TTE_PAGE_SHIFT(TTE8K) + 2);
14550 }
14551 /*
14552 * Update regionid bitmask without hat lock since no other thread
14553 * can update this region bitmask right now.
14554 */
14555 SF_RGNMAP_ADD(sfmmup->sfmmu_hmeregion_map, rid);
14556 }
14557
14558 /* ARGSUSED */
14559 static int
sfmmu_rgncache_constructor(void * buf,void * cdrarg,int kmflags)14560 sfmmu_rgncache_constructor(void *buf, void *cdrarg, int kmflags)
14561 {
14562 sf_region_t *rgnp = (sf_region_t *)buf;
14563 bzero(buf, sizeof (*rgnp));
14564
14565 mutex_init(&rgnp->rgn_mutex, NULL, MUTEX_DEFAULT, NULL);
14566
14567 return (0);
14568 }
14569
14570 /* ARGSUSED */
14571 static void
sfmmu_rgncache_destructor(void * buf,void * cdrarg)14572 sfmmu_rgncache_destructor(void *buf, void *cdrarg)
14573 {
14574 sf_region_t *rgnp = (sf_region_t *)buf;
14575 mutex_destroy(&rgnp->rgn_mutex);
14576 }
14577
14578 static int
sfrgnmap_isnull(sf_region_map_t * map)14579 sfrgnmap_isnull(sf_region_map_t *map)
14580 {
14581 int i;
14582
14583 for (i = 0; i < SFMMU_RGNMAP_WORDS; i++) {
14584 if (map->bitmap[i] != 0) {
14585 return (0);
14586 }
14587 }
14588 return (1);
14589 }
14590
14591 static int
sfhmergnmap_isnull(sf_hmeregion_map_t * map)14592 sfhmergnmap_isnull(sf_hmeregion_map_t *map)
14593 {
14594 int i;
14595
14596 for (i = 0; i < SFMMU_HMERGNMAP_WORDS; i++) {
14597 if (map->bitmap[i] != 0) {
14598 return (0);
14599 }
14600 }
14601 return (1);
14602 }
14603
14604 #ifdef DEBUG
14605 static void
check_scd_sfmmu_list(sfmmu_t ** headp,sfmmu_t * sfmmup,int onlist)14606 check_scd_sfmmu_list(sfmmu_t **headp, sfmmu_t *sfmmup, int onlist)
14607 {
14608 sfmmu_t *sp;
14609 sf_srd_t *srdp = sfmmup->sfmmu_srdp;
14610
14611 for (sp = *headp; sp != NULL; sp = sp->sfmmu_scd_link.next) {
14612 ASSERT(srdp == sp->sfmmu_srdp);
14613 if (sp == sfmmup) {
14614 if (onlist) {
14615 return;
14616 } else {
14617 panic("shctx: sfmmu 0x%p found on scd"
14618 "list 0x%p", (void *)sfmmup,
14619 (void *)*headp);
14620 }
14621 }
14622 }
14623 if (onlist) {
14624 panic("shctx: sfmmu 0x%p not found on scd list 0x%p",
14625 (void *)sfmmup, (void *)*headp);
14626 } else {
14627 return;
14628 }
14629 }
14630 #else /* DEBUG */
14631 #define check_scd_sfmmu_list(headp, sfmmup, onlist)
14632 #endif /* DEBUG */
14633
14634 /*
14635 * Removes an sfmmu from the SCD sfmmu list.
14636 */
14637 static void
sfmmu_from_scd_list(sfmmu_t ** headp,sfmmu_t * sfmmup)14638 sfmmu_from_scd_list(sfmmu_t **headp, sfmmu_t *sfmmup)
14639 {
14640 ASSERT(sfmmup->sfmmu_srdp != NULL);
14641 check_scd_sfmmu_list(headp, sfmmup, 1);
14642 if (sfmmup->sfmmu_scd_link.prev != NULL) {
14643 ASSERT(*headp != sfmmup);
14644 sfmmup->sfmmu_scd_link.prev->sfmmu_scd_link.next =
14645 sfmmup->sfmmu_scd_link.next;
14646 } else {
14647 ASSERT(*headp == sfmmup);
14648 *headp = sfmmup->sfmmu_scd_link.next;
14649 }
14650 if (sfmmup->sfmmu_scd_link.next != NULL) {
14651 sfmmup->sfmmu_scd_link.next->sfmmu_scd_link.prev =
14652 sfmmup->sfmmu_scd_link.prev;
14653 }
14654 }
14655
14656
14657 /*
14658 * Adds an sfmmu to the start of the queue.
14659 */
14660 static void
sfmmu_to_scd_list(sfmmu_t ** headp,sfmmu_t * sfmmup)14661 sfmmu_to_scd_list(sfmmu_t **headp, sfmmu_t *sfmmup)
14662 {
14663 check_scd_sfmmu_list(headp, sfmmup, 0);
14664 sfmmup->sfmmu_scd_link.prev = NULL;
14665 sfmmup->sfmmu_scd_link.next = *headp;
14666 if (*headp != NULL)
14667 (*headp)->sfmmu_scd_link.prev = sfmmup;
14668 *headp = sfmmup;
14669 }
14670
14671 /*
14672 * Remove an scd from the start of the queue.
14673 */
14674 static void
sfmmu_remove_scd(sf_scd_t ** headp,sf_scd_t * scdp)14675 sfmmu_remove_scd(sf_scd_t **headp, sf_scd_t *scdp)
14676 {
14677 if (scdp->scd_prev != NULL) {
14678 ASSERT(*headp != scdp);
14679 scdp->scd_prev->scd_next = scdp->scd_next;
14680 } else {
14681 ASSERT(*headp == scdp);
14682 *headp = scdp->scd_next;
14683 }
14684
14685 if (scdp->scd_next != NULL) {
14686 scdp->scd_next->scd_prev = scdp->scd_prev;
14687 }
14688 }
14689
14690 /*
14691 * Add an scd to the start of the queue.
14692 */
14693 static void
sfmmu_add_scd(sf_scd_t ** headp,sf_scd_t * scdp)14694 sfmmu_add_scd(sf_scd_t **headp, sf_scd_t *scdp)
14695 {
14696 scdp->scd_prev = NULL;
14697 scdp->scd_next = *headp;
14698 if (*headp != NULL) {
14699 (*headp)->scd_prev = scdp;
14700 }
14701 *headp = scdp;
14702 }
14703
14704 static int
sfmmu_alloc_scd_tsbs(sf_srd_t * srdp,sf_scd_t * scdp)14705 sfmmu_alloc_scd_tsbs(sf_srd_t *srdp, sf_scd_t *scdp)
14706 {
14707 uint_t rid;
14708 uint_t i;
14709 uint_t j;
14710 ulong_t w;
14711 sf_region_t *rgnp;
14712 ulong_t tte8k_cnt = 0;
14713 ulong_t tte4m_cnt = 0;
14714 uint_t tsb_szc;
14715 sfmmu_t *scsfmmup = scdp->scd_sfmmup;
14716 sfmmu_t *ism_hatid;
14717 struct tsb_info *newtsb;
14718 int szc;
14719
14720 ASSERT(srdp != NULL);
14721
14722 for (i = 0; i < SFMMU_RGNMAP_WORDS; i++) {
14723 if ((w = scdp->scd_region_map.bitmap[i]) == 0) {
14724 continue;
14725 }
14726 j = 0;
14727 while (w) {
14728 if (!(w & 0x1)) {
14729 j++;
14730 w >>= 1;
14731 continue;
14732 }
14733 rid = (i << BT_ULSHIFT) | j;
14734 j++;
14735 w >>= 1;
14736
14737 if (rid < SFMMU_MAX_HME_REGIONS) {
14738 rgnp = srdp->srd_hmergnp[rid];
14739 ASSERT(rgnp->rgn_id == rid);
14740 ASSERT(rgnp->rgn_refcnt > 0);
14741
14742 if (rgnp->rgn_pgszc < TTE4M) {
14743 tte8k_cnt += rgnp->rgn_size >>
14744 TTE_PAGE_SHIFT(TTE8K);
14745 } else {
14746 ASSERT(rgnp->rgn_pgszc >= TTE4M);
14747 tte4m_cnt += rgnp->rgn_size >>
14748 TTE_PAGE_SHIFT(TTE4M);
14749 /*
14750 * Inflate SCD tsb0 by preallocating
14751 * 1/4 8k ttecnt for 4M regions to
14752 * allow for lgpg alloc failure.
14753 */
14754 tte8k_cnt += rgnp->rgn_size >>
14755 (TTE_PAGE_SHIFT(TTE8K) + 2);
14756 }
14757 } else {
14758 rid -= SFMMU_MAX_HME_REGIONS;
14759 rgnp = srdp->srd_ismrgnp[rid];
14760 ASSERT(rgnp->rgn_id == rid);
14761 ASSERT(rgnp->rgn_refcnt > 0);
14762
14763 ism_hatid = (sfmmu_t *)rgnp->rgn_obj;
14764 ASSERT(ism_hatid->sfmmu_ismhat);
14765
14766 for (szc = 0; szc < TTE4M; szc++) {
14767 tte8k_cnt +=
14768 ism_hatid->sfmmu_ttecnt[szc] <<
14769 TTE_BSZS_SHIFT(szc);
14770 }
14771
14772 ASSERT(rgnp->rgn_pgszc >= TTE4M);
14773 if (rgnp->rgn_pgszc >= TTE4M) {
14774 tte4m_cnt += rgnp->rgn_size >>
14775 TTE_PAGE_SHIFT(TTE4M);
14776 }
14777 }
14778 }
14779 }
14780
14781 tsb_szc = SELECT_TSB_SIZECODE(tte8k_cnt);
14782
14783 /* Allocate both the SCD TSBs here. */
14784 if (sfmmu_tsbinfo_alloc(&scsfmmup->sfmmu_tsb,
14785 tsb_szc, TSB8K|TSB64K|TSB512K, TSB_ALLOC, scsfmmup) &&
14786 (tsb_szc <= TSB_4M_SZCODE ||
14787 sfmmu_tsbinfo_alloc(&scsfmmup->sfmmu_tsb,
14788 TSB_4M_SZCODE, TSB8K|TSB64K|TSB512K,
14789 TSB_ALLOC, scsfmmup))) {
14790
14791 SFMMU_STAT(sf_scd_1sttsb_allocfail);
14792 return (TSB_ALLOCFAIL);
14793 } else {
14794 scsfmmup->sfmmu_tsb->tsb_flags |= TSB_SHAREDCTX;
14795
14796 if (tte4m_cnt) {
14797 tsb_szc = SELECT_TSB_SIZECODE(tte4m_cnt);
14798 if (sfmmu_tsbinfo_alloc(&newtsb, tsb_szc,
14799 TSB4M|TSB32M|TSB256M, TSB_ALLOC, scsfmmup) &&
14800 (tsb_szc <= TSB_4M_SZCODE ||
14801 sfmmu_tsbinfo_alloc(&newtsb, TSB_4M_SZCODE,
14802 TSB4M|TSB32M|TSB256M,
14803 TSB_ALLOC, scsfmmup))) {
14804 /*
14805 * If we fail to allocate the 2nd shared tsb,
14806 * just free the 1st tsb, return failure.
14807 */
14808 sfmmu_tsbinfo_free(scsfmmup->sfmmu_tsb);
14809 SFMMU_STAT(sf_scd_2ndtsb_allocfail);
14810 return (TSB_ALLOCFAIL);
14811 } else {
14812 ASSERT(scsfmmup->sfmmu_tsb->tsb_next == NULL);
14813 newtsb->tsb_flags |= TSB_SHAREDCTX;
14814 scsfmmup->sfmmu_tsb->tsb_next = newtsb;
14815 SFMMU_STAT(sf_scd_2ndtsb_alloc);
14816 }
14817 }
14818 SFMMU_STAT(sf_scd_1sttsb_alloc);
14819 }
14820 return (TSB_SUCCESS);
14821 }
14822
14823 static void
sfmmu_free_scd_tsbs(sfmmu_t * scd_sfmmu)14824 sfmmu_free_scd_tsbs(sfmmu_t *scd_sfmmu)
14825 {
14826 while (scd_sfmmu->sfmmu_tsb != NULL) {
14827 struct tsb_info *next = scd_sfmmu->sfmmu_tsb->tsb_next;
14828 sfmmu_tsbinfo_free(scd_sfmmu->sfmmu_tsb);
14829 scd_sfmmu->sfmmu_tsb = next;
14830 }
14831 }
14832
14833 /*
14834 * Link the sfmmu onto the hme region list.
14835 */
14836 void
sfmmu_link_to_hmeregion(sfmmu_t * sfmmup,sf_region_t * rgnp)14837 sfmmu_link_to_hmeregion(sfmmu_t *sfmmup, sf_region_t *rgnp)
14838 {
14839 uint_t rid;
14840 sf_rgn_link_t *rlink;
14841 sfmmu_t *head;
14842 sf_rgn_link_t *hrlink;
14843
14844 rid = rgnp->rgn_id;
14845 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
14846
14847 /* LINTED: constant in conditional context */
14848 SFMMU_HMERID2RLINKP(sfmmup, rid, rlink, 1, 1);
14849 ASSERT(rlink != NULL);
14850 mutex_enter(&rgnp->rgn_mutex);
14851 if ((head = rgnp->rgn_sfmmu_head) == NULL) {
14852 rlink->next = NULL;
14853 rlink->prev = NULL;
14854 /*
14855 * make sure rlink's next field is NULL
14856 * before making this link visible.
14857 */
14858 membar_stst();
14859 rgnp->rgn_sfmmu_head = sfmmup;
14860 } else {
14861 /* LINTED: constant in conditional context */
14862 SFMMU_HMERID2RLINKP(head, rid, hrlink, 0, 0);
14863 ASSERT(hrlink != NULL);
14864 ASSERT(hrlink->prev == NULL);
14865 rlink->next = head;
14866 rlink->prev = NULL;
14867 hrlink->prev = sfmmup;
14868 /*
14869 * make sure rlink's next field is correct
14870 * before making this link visible.
14871 */
14872 membar_stst();
14873 rgnp->rgn_sfmmu_head = sfmmup;
14874 }
14875 mutex_exit(&rgnp->rgn_mutex);
14876 }
14877
14878 /*
14879 * Unlink the sfmmu from the hme region list.
14880 */
14881 void
sfmmu_unlink_from_hmeregion(sfmmu_t * sfmmup,sf_region_t * rgnp)14882 sfmmu_unlink_from_hmeregion(sfmmu_t *sfmmup, sf_region_t *rgnp)
14883 {
14884 uint_t rid;
14885 sf_rgn_link_t *rlink;
14886
14887 rid = rgnp->rgn_id;
14888 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
14889
14890 /* LINTED: constant in conditional context */
14891 SFMMU_HMERID2RLINKP(sfmmup, rid, rlink, 0, 0);
14892 ASSERT(rlink != NULL);
14893 mutex_enter(&rgnp->rgn_mutex);
14894 if (rgnp->rgn_sfmmu_head == sfmmup) {
14895 sfmmu_t *next = rlink->next;
14896 rgnp->rgn_sfmmu_head = next;
14897 /*
14898 * if we are stopped by xc_attention() after this
14899 * point the forward link walking in
14900 * sfmmu_rgntlb_demap() will work correctly since the
14901 * head correctly points to the next element.
14902 */
14903 membar_stst();
14904 rlink->next = NULL;
14905 ASSERT(rlink->prev == NULL);
14906 if (next != NULL) {
14907 sf_rgn_link_t *nrlink;
14908 /* LINTED: constant in conditional context */
14909 SFMMU_HMERID2RLINKP(next, rid, nrlink, 0, 0);
14910 ASSERT(nrlink != NULL);
14911 ASSERT(nrlink->prev == sfmmup);
14912 nrlink->prev = NULL;
14913 }
14914 } else {
14915 sfmmu_t *next = rlink->next;
14916 sfmmu_t *prev = rlink->prev;
14917 sf_rgn_link_t *prlink;
14918
14919 ASSERT(prev != NULL);
14920 /* LINTED: constant in conditional context */
14921 SFMMU_HMERID2RLINKP(prev, rid, prlink, 0, 0);
14922 ASSERT(prlink != NULL);
14923 ASSERT(prlink->next == sfmmup);
14924 prlink->next = next;
14925 /*
14926 * if we are stopped by xc_attention()
14927 * after this point the forward link walking
14928 * will work correctly since the prev element
14929 * correctly points to the next element.
14930 */
14931 membar_stst();
14932 rlink->next = NULL;
14933 rlink->prev = NULL;
14934 if (next != NULL) {
14935 sf_rgn_link_t *nrlink;
14936 /* LINTED: constant in conditional context */
14937 SFMMU_HMERID2RLINKP(next, rid, nrlink, 0, 0);
14938 ASSERT(nrlink != NULL);
14939 ASSERT(nrlink->prev == sfmmup);
14940 nrlink->prev = prev;
14941 }
14942 }
14943 mutex_exit(&rgnp->rgn_mutex);
14944 }
14945
14946 /*
14947 * Link scd sfmmu onto ism or hme region list for each region in the
14948 * scd region map.
14949 */
14950 void
sfmmu_link_scd_to_regions(sf_srd_t * srdp,sf_scd_t * scdp)14951 sfmmu_link_scd_to_regions(sf_srd_t *srdp, sf_scd_t *scdp)
14952 {
14953 uint_t rid;
14954 uint_t i;
14955 uint_t j;
14956 ulong_t w;
14957 sf_region_t *rgnp;
14958 sfmmu_t *scsfmmup;
14959
14960 scsfmmup = scdp->scd_sfmmup;
14961 ASSERT(scsfmmup->sfmmu_scdhat);
14962 for (i = 0; i < SFMMU_RGNMAP_WORDS; i++) {
14963 if ((w = scdp->scd_region_map.bitmap[i]) == 0) {
14964 continue;
14965 }
14966 j = 0;
14967 while (w) {
14968 if (!(w & 0x1)) {
14969 j++;
14970 w >>= 1;
14971 continue;
14972 }
14973 rid = (i << BT_ULSHIFT) | j;
14974 j++;
14975 w >>= 1;
14976
14977 if (rid < SFMMU_MAX_HME_REGIONS) {
14978 rgnp = srdp->srd_hmergnp[rid];
14979 ASSERT(rgnp->rgn_id == rid);
14980 ASSERT(rgnp->rgn_refcnt > 0);
14981 sfmmu_link_to_hmeregion(scsfmmup, rgnp);
14982 } else {
14983 sfmmu_t *ism_hatid = NULL;
14984 ism_ment_t *ism_ment;
14985 rid -= SFMMU_MAX_HME_REGIONS;
14986 rgnp = srdp->srd_ismrgnp[rid];
14987 ASSERT(rgnp->rgn_id == rid);
14988 ASSERT(rgnp->rgn_refcnt > 0);
14989
14990 ism_hatid = (sfmmu_t *)rgnp->rgn_obj;
14991 ASSERT(ism_hatid->sfmmu_ismhat);
14992 ism_ment = &scdp->scd_ism_links[rid];
14993 ism_ment->iment_hat = scsfmmup;
14994 ism_ment->iment_base_va = rgnp->rgn_saddr;
14995 mutex_enter(&ism_mlist_lock);
14996 iment_add(ism_ment, ism_hatid);
14997 mutex_exit(&ism_mlist_lock);
14998
14999 }
15000 }
15001 }
15002 }
15003 /*
15004 * Unlink scd sfmmu from ism or hme region list for each region in the
15005 * scd region map.
15006 */
15007 void
sfmmu_unlink_scd_from_regions(sf_srd_t * srdp,sf_scd_t * scdp)15008 sfmmu_unlink_scd_from_regions(sf_srd_t *srdp, sf_scd_t *scdp)
15009 {
15010 uint_t rid;
15011 uint_t i;
15012 uint_t j;
15013 ulong_t w;
15014 sf_region_t *rgnp;
15015 sfmmu_t *scsfmmup;
15016
15017 scsfmmup = scdp->scd_sfmmup;
15018 for (i = 0; i < SFMMU_RGNMAP_WORDS; i++) {
15019 if ((w = scdp->scd_region_map.bitmap[i]) == 0) {
15020 continue;
15021 }
15022 j = 0;
15023 while (w) {
15024 if (!(w & 0x1)) {
15025 j++;
15026 w >>= 1;
15027 continue;
15028 }
15029 rid = (i << BT_ULSHIFT) | j;
15030 j++;
15031 w >>= 1;
15032
15033 if (rid < SFMMU_MAX_HME_REGIONS) {
15034 rgnp = srdp->srd_hmergnp[rid];
15035 ASSERT(rgnp->rgn_id == rid);
15036 ASSERT(rgnp->rgn_refcnt > 0);
15037 sfmmu_unlink_from_hmeregion(scsfmmup,
15038 rgnp);
15039
15040 } else {
15041 sfmmu_t *ism_hatid = NULL;
15042 ism_ment_t *ism_ment;
15043 rid -= SFMMU_MAX_HME_REGIONS;
15044 rgnp = srdp->srd_ismrgnp[rid];
15045 ASSERT(rgnp->rgn_id == rid);
15046 ASSERT(rgnp->rgn_refcnt > 0);
15047
15048 ism_hatid = (sfmmu_t *)rgnp->rgn_obj;
15049 ASSERT(ism_hatid->sfmmu_ismhat);
15050 ism_ment = &scdp->scd_ism_links[rid];
15051 ASSERT(ism_ment->iment_hat == scdp->scd_sfmmup);
15052 ASSERT(ism_ment->iment_base_va ==
15053 rgnp->rgn_saddr);
15054 mutex_enter(&ism_mlist_lock);
15055 iment_sub(ism_ment, ism_hatid);
15056 mutex_exit(&ism_mlist_lock);
15057
15058 }
15059 }
15060 }
15061 }
15062 /*
15063 * Allocates and initialises a new SCD structure, this is called with
15064 * the srd_scd_mutex held and returns with the reference count
15065 * initialised to 1.
15066 */
15067 static sf_scd_t *
sfmmu_alloc_scd(sf_srd_t * srdp,sf_region_map_t * new_map)15068 sfmmu_alloc_scd(sf_srd_t *srdp, sf_region_map_t *new_map)
15069 {
15070 sf_scd_t *new_scdp;
15071 sfmmu_t *scsfmmup;
15072 int i;
15073
15074 ASSERT(MUTEX_HELD(&srdp->srd_scd_mutex));
15075 new_scdp = kmem_cache_alloc(scd_cache, KM_SLEEP);
15076
15077 scsfmmup = kmem_cache_alloc(sfmmuid_cache, KM_SLEEP);
15078 new_scdp->scd_sfmmup = scsfmmup;
15079 scsfmmup->sfmmu_srdp = srdp;
15080 scsfmmup->sfmmu_scdp = new_scdp;
15081 scsfmmup->sfmmu_tsb0_4minflcnt = 0;
15082 scsfmmup->sfmmu_scdhat = 1;
15083 CPUSET_ALL(scsfmmup->sfmmu_cpusran);
15084 bzero(scsfmmup->sfmmu_hmeregion_links, SFMMU_L1_HMERLINKS_SIZE);
15085
15086 ASSERT(max_mmu_ctxdoms > 0);
15087 for (i = 0; i < max_mmu_ctxdoms; i++) {
15088 scsfmmup->sfmmu_ctxs[i].cnum = INVALID_CONTEXT;
15089 scsfmmup->sfmmu_ctxs[i].gnum = 0;
15090 }
15091
15092 for (i = 0; i < MMU_PAGE_SIZES; i++) {
15093 new_scdp->scd_rttecnt[i] = 0;
15094 }
15095
15096 new_scdp->scd_region_map = *new_map;
15097 new_scdp->scd_refcnt = 1;
15098 if (sfmmu_alloc_scd_tsbs(srdp, new_scdp) != TSB_SUCCESS) {
15099 kmem_cache_free(scd_cache, new_scdp);
15100 kmem_cache_free(sfmmuid_cache, scsfmmup);
15101 return (NULL);
15102 }
15103 if (&mmu_init_scd) {
15104 mmu_init_scd(new_scdp);
15105 }
15106 return (new_scdp);
15107 }
15108
15109 /*
15110 * The first phase of a process joining an SCD. The hat structure is
15111 * linked to the SCD queue and then the HAT_JOIN_SCD sfmmu flag is set
15112 * and a cross-call with context invalidation is used to cause the
15113 * remaining work to be carried out in the sfmmu_tsbmiss_exception()
15114 * routine.
15115 */
15116 static void
sfmmu_join_scd(sf_scd_t * scdp,sfmmu_t * sfmmup)15117 sfmmu_join_scd(sf_scd_t *scdp, sfmmu_t *sfmmup)
15118 {
15119 hatlock_t *hatlockp;
15120 sf_srd_t *srdp = sfmmup->sfmmu_srdp;
15121 int i;
15122 sf_scd_t *old_scdp;
15123
15124 ASSERT(srdp != NULL);
15125 ASSERT(scdp != NULL);
15126 ASSERT(scdp->scd_refcnt > 0);
15127 ASSERT(AS_WRITE_HELD(sfmmup->sfmmu_as));
15128
15129 if ((old_scdp = sfmmup->sfmmu_scdp) != NULL) {
15130 ASSERT(old_scdp != scdp);
15131
15132 mutex_enter(&old_scdp->scd_mutex);
15133 sfmmu_from_scd_list(&old_scdp->scd_sf_list, sfmmup);
15134 mutex_exit(&old_scdp->scd_mutex);
15135 /*
15136 * sfmmup leaves the old scd. Update sfmmu_ttecnt to
15137 * include the shme rgn ttecnt for rgns that
15138 * were in the old SCD
15139 */
15140 for (i = 0; i < mmu_page_sizes; i++) {
15141 ASSERT(sfmmup->sfmmu_scdrttecnt[i] ==
15142 old_scdp->scd_rttecnt[i]);
15143 atomic_add_long(&sfmmup->sfmmu_ttecnt[i],
15144 sfmmup->sfmmu_scdrttecnt[i]);
15145 }
15146 }
15147
15148 /*
15149 * Move sfmmu to the scd lists.
15150 */
15151 mutex_enter(&scdp->scd_mutex);
15152 sfmmu_to_scd_list(&scdp->scd_sf_list, sfmmup);
15153 mutex_exit(&scdp->scd_mutex);
15154 SF_SCD_INCR_REF(scdp);
15155
15156 hatlockp = sfmmu_hat_enter(sfmmup);
15157 /*
15158 * For a multi-thread process, we must stop
15159 * all the other threads before joining the scd.
15160 */
15161
15162 SFMMU_FLAGS_SET(sfmmup, HAT_JOIN_SCD);
15163
15164 sfmmu_invalidate_ctx(sfmmup);
15165 sfmmup->sfmmu_scdp = scdp;
15166
15167 /*
15168 * Copy scd_rttecnt into sfmmup's sfmmu_scdrttecnt, and update
15169 * sfmmu_ttecnt to not include the rgn ttecnt just joined in SCD.
15170 */
15171 for (i = 0; i < mmu_page_sizes; i++) {
15172 sfmmup->sfmmu_scdrttecnt[i] = scdp->scd_rttecnt[i];
15173 ASSERT(sfmmup->sfmmu_ttecnt[i] >= scdp->scd_rttecnt[i]);
15174 atomic_add_long(&sfmmup->sfmmu_ttecnt[i],
15175 -sfmmup->sfmmu_scdrttecnt[i]);
15176 }
15177 /* update tsb0 inflation count */
15178 if (old_scdp != NULL) {
15179 sfmmup->sfmmu_tsb0_4minflcnt +=
15180 old_scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt;
15181 }
15182 ASSERT(sfmmup->sfmmu_tsb0_4minflcnt >=
15183 scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt);
15184 sfmmup->sfmmu_tsb0_4minflcnt -= scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt;
15185
15186 sfmmu_hat_exit(hatlockp);
15187
15188 if (old_scdp != NULL) {
15189 SF_SCD_DECR_REF(srdp, old_scdp);
15190 }
15191
15192 }
15193
15194 /*
15195 * This routine is called by a process to become part of an SCD. It is called
15196 * from sfmmu_tsbmiss_exception() once most of the initial work has been
15197 * done by sfmmu_join_scd(). This routine must not drop the hat lock.
15198 */
15199 static void
sfmmu_finish_join_scd(sfmmu_t * sfmmup)15200 sfmmu_finish_join_scd(sfmmu_t *sfmmup)
15201 {
15202 struct tsb_info *tsbinfop;
15203
15204 ASSERT(sfmmu_hat_lock_held(sfmmup));
15205 ASSERT(sfmmup->sfmmu_scdp != NULL);
15206 ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD));
15207 ASSERT(!SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY));
15208 ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_ALLCTX_INVALID));
15209
15210 for (tsbinfop = sfmmup->sfmmu_tsb; tsbinfop != NULL;
15211 tsbinfop = tsbinfop->tsb_next) {
15212 if (tsbinfop->tsb_flags & TSB_SWAPPED) {
15213 continue;
15214 }
15215 ASSERT(!(tsbinfop->tsb_flags & TSB_RELOC_FLAG));
15216
15217 sfmmu_inv_tsb(tsbinfop->tsb_va,
15218 TSB_BYTES(tsbinfop->tsb_szc));
15219 }
15220
15221 /* Set HAT_CTX1_FLAG for all SCD ISMs */
15222 sfmmu_ism_hatflags(sfmmup, 1);
15223
15224 SFMMU_STAT(sf_join_scd);
15225 }
15226
15227 /*
15228 * This routine is called in order to check if there is an SCD which matches
15229 * the process's region map if not then a new SCD may be created.
15230 */
15231 static void
sfmmu_find_scd(sfmmu_t * sfmmup)15232 sfmmu_find_scd(sfmmu_t *sfmmup)
15233 {
15234 sf_srd_t *srdp = sfmmup->sfmmu_srdp;
15235 sf_scd_t *scdp, *new_scdp;
15236 int ret;
15237
15238 ASSERT(srdp != NULL);
15239 ASSERT(AS_WRITE_HELD(sfmmup->sfmmu_as));
15240
15241 mutex_enter(&srdp->srd_scd_mutex);
15242 for (scdp = srdp->srd_scdp; scdp != NULL;
15243 scdp = scdp->scd_next) {
15244 SF_RGNMAP_EQUAL(&scdp->scd_region_map,
15245 &sfmmup->sfmmu_region_map, ret);
15246 if (ret == 1) {
15247 SF_SCD_INCR_REF(scdp);
15248 mutex_exit(&srdp->srd_scd_mutex);
15249 sfmmu_join_scd(scdp, sfmmup);
15250 ASSERT(scdp->scd_refcnt >= 2);
15251 atomic_dec_32((volatile uint32_t *)&scdp->scd_refcnt);
15252 return;
15253 } else {
15254 /*
15255 * If the sfmmu region map is a subset of the scd
15256 * region map, then the assumption is that this process
15257 * will continue attaching to ISM segments until the
15258 * region maps are equal.
15259 */
15260 SF_RGNMAP_IS_SUBSET(&scdp->scd_region_map,
15261 &sfmmup->sfmmu_region_map, ret);
15262 if (ret == 1) {
15263 mutex_exit(&srdp->srd_scd_mutex);
15264 return;
15265 }
15266 }
15267 }
15268
15269 ASSERT(scdp == NULL);
15270 /*
15271 * No matching SCD has been found, create a new one.
15272 */
15273 if ((new_scdp = sfmmu_alloc_scd(srdp, &sfmmup->sfmmu_region_map)) ==
15274 NULL) {
15275 mutex_exit(&srdp->srd_scd_mutex);
15276 return;
15277 }
15278
15279 /*
15280 * sfmmu_alloc_scd() returns with a ref count of 1 on the scd.
15281 */
15282
15283 /* Set scd_rttecnt for shme rgns in SCD */
15284 sfmmu_set_scd_rttecnt(srdp, new_scdp);
15285
15286 /*
15287 * Link scd onto srd_scdp list and scd sfmmu onto region/iment lists.
15288 */
15289 sfmmu_link_scd_to_regions(srdp, new_scdp);
15290 sfmmu_add_scd(&srdp->srd_scdp, new_scdp);
15291 SFMMU_STAT_ADD(sf_create_scd, 1);
15292
15293 mutex_exit(&srdp->srd_scd_mutex);
15294 sfmmu_join_scd(new_scdp, sfmmup);
15295 ASSERT(new_scdp->scd_refcnt >= 2);
15296 atomic_dec_32((volatile uint32_t *)&new_scdp->scd_refcnt);
15297 }
15298
15299 /*
15300 * This routine is called by a process to remove itself from an SCD. It is
15301 * either called when the processes has detached from a segment or from
15302 * hat_free_start() as a result of calling exit.
15303 */
15304 static void
sfmmu_leave_scd(sfmmu_t * sfmmup,uchar_t r_type)15305 sfmmu_leave_scd(sfmmu_t *sfmmup, uchar_t r_type)
15306 {
15307 sf_scd_t *scdp = sfmmup->sfmmu_scdp;
15308 sf_srd_t *srdp = sfmmup->sfmmu_srdp;
15309 hatlock_t *hatlockp = TSB_HASH(sfmmup);
15310 int i;
15311
15312 ASSERT(scdp != NULL);
15313 ASSERT(srdp != NULL);
15314
15315 if (sfmmup->sfmmu_free) {
15316 /*
15317 * If the process is part of an SCD the sfmmu is unlinked
15318 * from scd_sf_list.
15319 */
15320 mutex_enter(&scdp->scd_mutex);
15321 sfmmu_from_scd_list(&scdp->scd_sf_list, sfmmup);
15322 mutex_exit(&scdp->scd_mutex);
15323 /*
15324 * Update sfmmu_ttecnt to include the rgn ttecnt for rgns that
15325 * are about to leave the SCD
15326 */
15327 for (i = 0; i < mmu_page_sizes; i++) {
15328 ASSERT(sfmmup->sfmmu_scdrttecnt[i] ==
15329 scdp->scd_rttecnt[i]);
15330 atomic_add_long(&sfmmup->sfmmu_ttecnt[i],
15331 sfmmup->sfmmu_scdrttecnt[i]);
15332 sfmmup->sfmmu_scdrttecnt[i] = 0;
15333 }
15334 sfmmup->sfmmu_scdp = NULL;
15335
15336 SF_SCD_DECR_REF(srdp, scdp);
15337 return;
15338 }
15339
15340 ASSERT(r_type != SFMMU_REGION_ISM ||
15341 SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY));
15342 ASSERT(scdp->scd_refcnt);
15343 ASSERT(!sfmmup->sfmmu_free);
15344 ASSERT(sfmmu_hat_lock_held(sfmmup));
15345 ASSERT(AS_LOCK_HELD(sfmmup->sfmmu_as));
15346
15347 /*
15348 * Wait for ISM maps to be updated.
15349 */
15350 if (r_type != SFMMU_REGION_ISM) {
15351 while (SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY) &&
15352 sfmmup->sfmmu_scdp != NULL) {
15353 cv_wait(&sfmmup->sfmmu_tsb_cv,
15354 HATLOCK_MUTEXP(hatlockp));
15355 }
15356
15357 if (sfmmup->sfmmu_scdp == NULL) {
15358 sfmmu_hat_exit(hatlockp);
15359 return;
15360 }
15361 SFMMU_FLAGS_SET(sfmmup, HAT_ISMBUSY);
15362 }
15363
15364 if (SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD)) {
15365 SFMMU_FLAGS_CLEAR(sfmmup, HAT_JOIN_SCD);
15366 /*
15367 * Since HAT_JOIN_SCD was set our context
15368 * is still invalid.
15369 */
15370 } else {
15371 /*
15372 * For a multi-thread process, we must stop
15373 * all the other threads before leaving the scd.
15374 */
15375
15376 sfmmu_invalidate_ctx(sfmmup);
15377 }
15378
15379 /* Clear all the rid's for ISM, delete flags, etc */
15380 ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY));
15381 sfmmu_ism_hatflags(sfmmup, 0);
15382
15383 /*
15384 * Update sfmmu_ttecnt to include the rgn ttecnt for rgns that
15385 * are in SCD before this sfmmup leaves the SCD.
15386 */
15387 for (i = 0; i < mmu_page_sizes; i++) {
15388 ASSERT(sfmmup->sfmmu_scdrttecnt[i] ==
15389 scdp->scd_rttecnt[i]);
15390 atomic_add_long(&sfmmup->sfmmu_ttecnt[i],
15391 sfmmup->sfmmu_scdrttecnt[i]);
15392 sfmmup->sfmmu_scdrttecnt[i] = 0;
15393 /* update ismttecnt to include SCD ism before hat leaves SCD */
15394 sfmmup->sfmmu_ismttecnt[i] += sfmmup->sfmmu_scdismttecnt[i];
15395 sfmmup->sfmmu_scdismttecnt[i] = 0;
15396 }
15397 /* update tsb0 inflation count */
15398 sfmmup->sfmmu_tsb0_4minflcnt += scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt;
15399
15400 if (r_type != SFMMU_REGION_ISM) {
15401 SFMMU_FLAGS_CLEAR(sfmmup, HAT_ISMBUSY);
15402 }
15403 sfmmup->sfmmu_scdp = NULL;
15404
15405 sfmmu_hat_exit(hatlockp);
15406
15407 /*
15408 * Unlink sfmmu from scd_sf_list this can be done without holding
15409 * the hat lock as we hold the sfmmu_as lock which prevents
15410 * hat_join_region from adding this thread to the scd again. Other
15411 * threads check if sfmmu_scdp is NULL under hat lock and if it's NULL
15412 * they won't get here, since sfmmu_leave_scd() clears sfmmu_scdp
15413 * while holding the hat lock.
15414 */
15415 mutex_enter(&scdp->scd_mutex);
15416 sfmmu_from_scd_list(&scdp->scd_sf_list, sfmmup);
15417 mutex_exit(&scdp->scd_mutex);
15418 SFMMU_STAT(sf_leave_scd);
15419
15420 SF_SCD_DECR_REF(srdp, scdp);
15421 hatlockp = sfmmu_hat_enter(sfmmup);
15422
15423 }
15424
15425 /*
15426 * Unlink and free up an SCD structure with a reference count of 0.
15427 */
15428 static void
sfmmu_destroy_scd(sf_srd_t * srdp,sf_scd_t * scdp,sf_region_map_t * scd_rmap)15429 sfmmu_destroy_scd(sf_srd_t *srdp, sf_scd_t *scdp, sf_region_map_t *scd_rmap)
15430 {
15431 sfmmu_t *scsfmmup;
15432 sf_scd_t *sp;
15433 hatlock_t *shatlockp;
15434 int i, ret;
15435
15436 mutex_enter(&srdp->srd_scd_mutex);
15437 for (sp = srdp->srd_scdp; sp != NULL; sp = sp->scd_next) {
15438 if (sp == scdp)
15439 break;
15440 }
15441 if (sp == NULL || sp->scd_refcnt) {
15442 mutex_exit(&srdp->srd_scd_mutex);
15443 return;
15444 }
15445
15446 /*
15447 * It is possible that the scd has been freed and reallocated with a
15448 * different region map while we've been waiting for the srd_scd_mutex.
15449 */
15450 SF_RGNMAP_EQUAL(scd_rmap, &sp->scd_region_map, ret);
15451 if (ret != 1) {
15452 mutex_exit(&srdp->srd_scd_mutex);
15453 return;
15454 }
15455
15456 ASSERT(scdp->scd_sf_list == NULL);
15457 /*
15458 * Unlink scd from srd_scdp list.
15459 */
15460 sfmmu_remove_scd(&srdp->srd_scdp, scdp);
15461 mutex_exit(&srdp->srd_scd_mutex);
15462
15463 sfmmu_unlink_scd_from_regions(srdp, scdp);
15464
15465 /* Clear shared context tsb and release ctx */
15466 scsfmmup = scdp->scd_sfmmup;
15467
15468 /*
15469 * create a barrier so that scd will not be destroyed
15470 * if other thread still holds the same shared hat lock.
15471 * E.g., sfmmu_tsbmiss_exception() needs to acquire the
15472 * shared hat lock before checking the shared tsb reloc flag.
15473 */
15474 shatlockp = sfmmu_hat_enter(scsfmmup);
15475 sfmmu_hat_exit(shatlockp);
15476
15477 sfmmu_free_scd_tsbs(scsfmmup);
15478
15479 for (i = 0; i < SFMMU_L1_HMERLINKS; i++) {
15480 if (scsfmmup->sfmmu_hmeregion_links[i] != NULL) {
15481 kmem_free(scsfmmup->sfmmu_hmeregion_links[i],
15482 SFMMU_L2_HMERLINKS_SIZE);
15483 scsfmmup->sfmmu_hmeregion_links[i] = NULL;
15484 }
15485 }
15486 kmem_cache_free(sfmmuid_cache, scsfmmup);
15487 kmem_cache_free(scd_cache, scdp);
15488 SFMMU_STAT(sf_destroy_scd);
15489 }
15490
15491 /*
15492 * Modifies the HAT_CTX1_FLAG for each of the ISM segments which correspond to
15493 * bits which are set in the ism_region_map parameter. This flag indicates to
15494 * the tsbmiss handler that mapping for these segments should be loaded using
15495 * the shared context.
15496 */
15497 static void
sfmmu_ism_hatflags(sfmmu_t * sfmmup,int addflag)15498 sfmmu_ism_hatflags(sfmmu_t *sfmmup, int addflag)
15499 {
15500 sf_scd_t *scdp = sfmmup->sfmmu_scdp;
15501 ism_blk_t *ism_blkp;
15502 ism_map_t *ism_map;
15503 int i, rid;
15504
15505 ASSERT(sfmmup->sfmmu_iblk != NULL);
15506 ASSERT(scdp != NULL);
15507 /*
15508 * Note that the caller either set HAT_ISMBUSY flag or checked
15509 * under hat lock that HAT_ISMBUSY was not set by another thread.
15510 */
15511 ASSERT(sfmmu_hat_lock_held(sfmmup));
15512
15513 ism_blkp = sfmmup->sfmmu_iblk;
15514 while (ism_blkp != NULL) {
15515 ism_map = ism_blkp->iblk_maps;
15516 for (i = 0; ism_map[i].imap_ismhat && i < ISM_MAP_SLOTS; i++) {
15517 rid = ism_map[i].imap_rid;
15518 if (rid == SFMMU_INVALID_ISMRID) {
15519 continue;
15520 }
15521 ASSERT(rid >= 0 && rid < SFMMU_MAX_ISM_REGIONS);
15522 if (SF_RGNMAP_TEST(scdp->scd_ismregion_map, rid) &&
15523 addflag) {
15524 ism_map[i].imap_hatflags |=
15525 HAT_CTX1_FLAG;
15526 } else {
15527 ism_map[i].imap_hatflags &=
15528 ~HAT_CTX1_FLAG;
15529 }
15530 }
15531 ism_blkp = ism_blkp->iblk_next;
15532 }
15533 }
15534
15535 static int
sfmmu_srd_lock_held(sf_srd_t * srdp)15536 sfmmu_srd_lock_held(sf_srd_t *srdp)
15537 {
15538 return (MUTEX_HELD(&srdp->srd_mutex));
15539 }
15540
15541 /* ARGSUSED */
15542 static int
sfmmu_scdcache_constructor(void * buf,void * cdrarg,int kmflags)15543 sfmmu_scdcache_constructor(void *buf, void *cdrarg, int kmflags)
15544 {
15545 sf_scd_t *scdp = (sf_scd_t *)buf;
15546
15547 bzero(buf, sizeof (sf_scd_t));
15548 mutex_init(&scdp->scd_mutex, NULL, MUTEX_DEFAULT, NULL);
15549 return (0);
15550 }
15551
15552 /* ARGSUSED */
15553 static void
sfmmu_scdcache_destructor(void * buf,void * cdrarg)15554 sfmmu_scdcache_destructor(void *buf, void *cdrarg)
15555 {
15556 sf_scd_t *scdp = (sf_scd_t *)buf;
15557
15558 mutex_destroy(&scdp->scd_mutex);
15559 }
15560
15561 /*
15562 * The listp parameter is a pointer to a list of hmeblks which are partially
15563 * freed as result of calling sfmmu_hblk_hash_rm(), the last phase of the
15564 * freeing process is to cross-call all cpus to ensure that there are no
15565 * remaining cached references.
15566 *
15567 * If the local generation number is less than the global then we can free
15568 * hmeblks which are already on the pending queue as another cpu has completed
15569 * the cross-call.
15570 *
15571 * We cross-call to make sure that there are no threads on other cpus accessing
15572 * these hmblks and then complete the process of freeing them under the
15573 * following conditions:
15574 * The total number of pending hmeblks is greater than the threshold
15575 * The reserve list has fewer than HBLK_RESERVE_CNT hmeblks
15576 * It is at least 1 second since the last time we cross-called
15577 *
15578 * Otherwise, we add the hmeblks to the per-cpu pending queue.
15579 */
15580 static void
sfmmu_hblks_list_purge(struct hme_blk ** listp,int dontfree)15581 sfmmu_hblks_list_purge(struct hme_blk **listp, int dontfree)
15582 {
15583 struct hme_blk *hblkp, *pr_hblkp = NULL;
15584 int count = 0;
15585 cpuset_t cpuset = cpu_ready_set;
15586 cpu_hme_pend_t *cpuhp;
15587 timestruc_t now;
15588 int one_second_expired = 0;
15589
15590 gethrestime_lasttick(&now);
15591
15592 for (hblkp = *listp; hblkp != NULL; hblkp = hblkp->hblk_next) {
15593 ASSERT(hblkp->hblk_shw_bit == 0);
15594 ASSERT(hblkp->hblk_shared == 0);
15595 count++;
15596 pr_hblkp = hblkp;
15597 }
15598
15599 cpuhp = &cpu_hme_pend[CPU->cpu_seqid];
15600 mutex_enter(&cpuhp->chp_mutex);
15601
15602 if ((cpuhp->chp_count + count) == 0) {
15603 mutex_exit(&cpuhp->chp_mutex);
15604 return;
15605 }
15606
15607 if ((now.tv_sec - cpuhp->chp_timestamp) > 1) {
15608 one_second_expired = 1;
15609 }
15610
15611 if (!dontfree && (freehblkcnt < HBLK_RESERVE_CNT ||
15612 (cpuhp->chp_count + count) > cpu_hme_pend_thresh ||
15613 one_second_expired)) {
15614 /* Append global list to local */
15615 if (pr_hblkp == NULL) {
15616 *listp = cpuhp->chp_listp;
15617 } else {
15618 pr_hblkp->hblk_next = cpuhp->chp_listp;
15619 }
15620 cpuhp->chp_listp = NULL;
15621 cpuhp->chp_count = 0;
15622 cpuhp->chp_timestamp = now.tv_sec;
15623 mutex_exit(&cpuhp->chp_mutex);
15624
15625 kpreempt_disable();
15626 CPUSET_DEL(cpuset, CPU->cpu_id);
15627 xt_sync(cpuset);
15628 xt_sync(cpuset);
15629 kpreempt_enable();
15630
15631 /*
15632 * At this stage we know that no trap handlers on other
15633 * cpus can have references to hmeblks on the list.
15634 */
15635 sfmmu_hblk_free(listp);
15636 } else if (*listp != NULL) {
15637 pr_hblkp->hblk_next = cpuhp->chp_listp;
15638 cpuhp->chp_listp = *listp;
15639 cpuhp->chp_count += count;
15640 *listp = NULL;
15641 mutex_exit(&cpuhp->chp_mutex);
15642 } else {
15643 mutex_exit(&cpuhp->chp_mutex);
15644 }
15645 }
15646
15647 /*
15648 * Add an hmeblk to the the hash list.
15649 */
15650 void
sfmmu_hblk_hash_add(struct hmehash_bucket * hmebp,struct hme_blk * hmeblkp,uint64_t hblkpa)15651 sfmmu_hblk_hash_add(struct hmehash_bucket *hmebp, struct hme_blk *hmeblkp,
15652 uint64_t hblkpa)
15653 {
15654 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));
15655 #ifdef DEBUG
15656 if (hmebp->hmeblkp == NULL) {
15657 ASSERT(hmebp->hmeh_nextpa == HMEBLK_ENDPA);
15658 }
15659 #endif /* DEBUG */
15660
15661 hmeblkp->hblk_nextpa = hmebp->hmeh_nextpa;
15662 /*
15663 * Since the TSB miss handler now does not lock the hash chain before
15664 * walking it, make sure that the hmeblks nextpa is globally visible
15665 * before we make the hmeblk globally visible by updating the chain root
15666 * pointer in the hash bucket.
15667 */
15668 membar_producer();
15669 hmebp->hmeh_nextpa = hblkpa;
15670 hmeblkp->hblk_next = hmebp->hmeblkp;
15671 hmebp->hmeblkp = hmeblkp;
15672
15673 }
15674
15675 /*
15676 * This function is the first part of a 2 part process to remove an hmeblk
15677 * from the hash chain. In this phase we unlink the hmeblk from the hash chain
15678 * but leave the next physical pointer unchanged. The hmeblk is then linked onto
15679 * a per-cpu pending list using the virtual address pointer.
15680 *
15681 * TSB miss trap handlers that start after this phase will no longer see
15682 * this hmeblk. TSB miss handlers that still cache this hmeblk in a register
15683 * can still use it for further chain traversal because we haven't yet modifed
15684 * the next physical pointer or freed it.
15685 *
15686 * In the second phase of hmeblk removal we'll issue a barrier xcall before
15687 * we reuse or free this hmeblk. This will make sure all lingering references to
15688 * the hmeblk after first phase disappear before we finally reclaim it.
15689 * This scheme eliminates the need for TSB miss handlers to lock hmeblk chains
15690 * during their traversal.
15691 *
15692 * The hmehash_mutex must be held when calling this function.
15693 *
15694 * Input:
15695 * hmebp - hme hash bucket pointer
15696 * hmeblkp - address of hmeblk to be removed
15697 * pr_hblk - virtual address of previous hmeblkp
15698 * listp - pointer to list of hmeblks linked by virtual address
15699 * free_now flag - indicates that a complete removal from the hash chains
15700 * is necessary.
15701 *
15702 * It is inefficient to use the free_now flag as a cross-call is required to
15703 * remove a single hmeblk from the hash chain but is necessary when hmeblks are
15704 * in short supply.
15705 */
15706 void
sfmmu_hblk_hash_rm(struct hmehash_bucket * hmebp,struct hme_blk * hmeblkp,struct hme_blk * pr_hblk,struct hme_blk ** listp,int free_now)15707 sfmmu_hblk_hash_rm(struct hmehash_bucket *hmebp, struct hme_blk *hmeblkp,
15708 struct hme_blk *pr_hblk, struct hme_blk **listp,
15709 int free_now)
15710 {
15711 int shw_size, vshift;
15712 struct hme_blk *shw_hblkp;
15713 uint_t shw_mask, newshw_mask;
15714 caddr_t vaddr;
15715 int size;
15716 cpuset_t cpuset = cpu_ready_set;
15717
15718 ASSERT(SFMMU_HASH_LOCK_ISHELD(hmebp));
15719
15720 if (hmebp->hmeblkp == hmeblkp) {
15721 hmebp->hmeh_nextpa = hmeblkp->hblk_nextpa;
15722 hmebp->hmeblkp = hmeblkp->hblk_next;
15723 } else {
15724 pr_hblk->hblk_nextpa = hmeblkp->hblk_nextpa;
15725 pr_hblk->hblk_next = hmeblkp->hblk_next;
15726 }
15727
15728 size = get_hblk_ttesz(hmeblkp);
15729 shw_hblkp = hmeblkp->hblk_shadow;
15730 if (shw_hblkp) {
15731 ASSERT(hblktosfmmu(hmeblkp) != KHATID);
15732 ASSERT(!hmeblkp->hblk_shared);
15733 #ifdef DEBUG
15734 if (mmu_page_sizes == max_mmu_page_sizes) {
15735 ASSERT(size < TTE256M);
15736 } else {
15737 ASSERT(size < TTE4M);
15738 }
15739 #endif /* DEBUG */
15740
15741 shw_size = get_hblk_ttesz(shw_hblkp);
15742 vaddr = (caddr_t)get_hblk_base(hmeblkp);
15743 vshift = vaddr_to_vshift(shw_hblkp->hblk_tag, vaddr, shw_size);
15744 ASSERT(vshift < 8);
15745 /*
15746 * Atomically clear shadow mask bit
15747 */
15748 do {
15749 shw_mask = shw_hblkp->hblk_shw_mask;
15750 ASSERT(shw_mask & (1 << vshift));
15751 newshw_mask = shw_mask & ~(1 << vshift);
15752 newshw_mask = atomic_cas_32(&shw_hblkp->hblk_shw_mask,
15753 shw_mask, newshw_mask);
15754 } while (newshw_mask != shw_mask);
15755 hmeblkp->hblk_shadow = NULL;
15756 }
15757 hmeblkp->hblk_shw_bit = 0;
15758
15759 if (hmeblkp->hblk_shared) {
15760 #ifdef DEBUG
15761 sf_srd_t *srdp;
15762 sf_region_t *rgnp;
15763 uint_t rid;
15764
15765 srdp = hblktosrd(hmeblkp);
15766 ASSERT(srdp != NULL && srdp->srd_refcnt != 0);
15767 rid = hmeblkp->hblk_tag.htag_rid;
15768 ASSERT(SFMMU_IS_SHMERID_VALID(rid));
15769 ASSERT(rid < SFMMU_MAX_HME_REGIONS);
15770 rgnp = srdp->srd_hmergnp[rid];
15771 ASSERT(rgnp != NULL);
15772 SFMMU_VALIDATE_SHAREDHBLK(hmeblkp, srdp, rgnp, rid);
15773 #endif /* DEBUG */
15774 hmeblkp->hblk_shared = 0;
15775 }
15776 if (free_now) {
15777 kpreempt_disable();
15778 CPUSET_DEL(cpuset, CPU->cpu_id);
15779 xt_sync(cpuset);
15780 xt_sync(cpuset);
15781 kpreempt_enable();
15782
15783 hmeblkp->hblk_nextpa = HMEBLK_ENDPA;
15784 hmeblkp->hblk_next = NULL;
15785 } else {
15786 /* Append hmeblkp to listp for processing later. */
15787 hmeblkp->hblk_next = *listp;
15788 *listp = hmeblkp;
15789 }
15790 }
15791
15792 /*
15793 * This routine is called when memory is in short supply and returns a free
15794 * hmeblk of the requested size from the cpu pending lists.
15795 */
15796 static struct hme_blk *
sfmmu_check_pending_hblks(int size)15797 sfmmu_check_pending_hblks(int size)
15798 {
15799 int i;
15800 struct hme_blk *hmeblkp = NULL, *last_hmeblkp;
15801 int found_hmeblk;
15802 cpuset_t cpuset = cpu_ready_set;
15803 cpu_hme_pend_t *cpuhp;
15804
15805 /* Flush cpu hblk pending queues */
15806 for (i = 0; i < NCPU; i++) {
15807 cpuhp = &cpu_hme_pend[i];
15808 if (cpuhp->chp_listp != NULL) {
15809 mutex_enter(&cpuhp->chp_mutex);
15810 if (cpuhp->chp_listp == NULL) {
15811 mutex_exit(&cpuhp->chp_mutex);
15812 continue;
15813 }
15814 found_hmeblk = 0;
15815 last_hmeblkp = NULL;
15816 for (hmeblkp = cpuhp->chp_listp; hmeblkp != NULL;
15817 hmeblkp = hmeblkp->hblk_next) {
15818 if (get_hblk_ttesz(hmeblkp) == size) {
15819 if (last_hmeblkp == NULL) {
15820 cpuhp->chp_listp =
15821 hmeblkp->hblk_next;
15822 } else {
15823 last_hmeblkp->hblk_next =
15824 hmeblkp->hblk_next;
15825 }
15826 ASSERT(cpuhp->chp_count > 0);
15827 cpuhp->chp_count--;
15828 found_hmeblk = 1;
15829 break;
15830 } else {
15831 last_hmeblkp = hmeblkp;
15832 }
15833 }
15834 mutex_exit(&cpuhp->chp_mutex);
15835
15836 if (found_hmeblk) {
15837 kpreempt_disable();
15838 CPUSET_DEL(cpuset, CPU->cpu_id);
15839 xt_sync(cpuset);
15840 xt_sync(cpuset);
15841 kpreempt_enable();
15842 return (hmeblkp);
15843 }
15844 }
15845 }
15846 return (NULL);
15847 }
15848