xref: /freebsd/sys/powerpc/aim/mmu_oea64.c (revision 5405b282e1f319b6f3597bb77f68be903e7f248c)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2008-2015 Nathan Whitehorn
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 /*
33  * Manages physical address maps.
34  *
35  * Since the information managed by this module is also stored by the
36  * logical address mapping module, this module may throw away valid virtual
37  * to physical mappings at almost any time.  However, invalidations of
38  * mappings must be done as requested.
39  *
40  * In order to cope with hardware architectures which make virtual to
41  * physical map invalidates expensive, this module may delay invalidate
42  * or reduced protection operations until such time as they are actually
43  * necessary.  This module is given full information as to which processors
44  * are currently using which maps, and to when physical maps must be made
45  * correct.
46  */
47 
48 #include "opt_kstack_pages.h"
49 
50 #include <sys/param.h>
51 #include <sys/kernel.h>
52 #include <sys/conf.h>
53 #include <sys/queue.h>
54 #include <sys/cpuset.h>
55 #include <sys/kerneldump.h>
56 #include <sys/ktr.h>
57 #include <sys/lock.h>
58 #include <sys/msgbuf.h>
59 #include <sys/malloc.h>
60 #include <sys/mutex.h>
61 #include <sys/proc.h>
62 #include <sys/rwlock.h>
63 #include <sys/sched.h>
64 #include <sys/sysctl.h>
65 #include <sys/systm.h>
66 #include <sys/vmmeter.h>
67 #include <sys/smp.h>
68 
69 #include <sys/kdb.h>
70 
71 #include <dev/ofw/openfirm.h>
72 
73 #include <vm/vm.h>
74 #include <vm/vm_param.h>
75 #include <vm/vm_kern.h>
76 #include <vm/vm_page.h>
77 #include <vm/vm_map.h>
78 #include <vm/vm_object.h>
79 #include <vm/vm_extern.h>
80 #include <vm/vm_pageout.h>
81 #include <vm/uma.h>
82 
83 #include <machine/_inttypes.h>
84 #include <machine/cpu.h>
85 #include <machine/platform.h>
86 #include <machine/frame.h>
87 #include <machine/md_var.h>
88 #include <machine/psl.h>
89 #include <machine/bat.h>
90 #include <machine/hid.h>
91 #include <machine/pte.h>
92 #include <machine/sr.h>
93 #include <machine/trap.h>
94 #include <machine/mmuvar.h>
95 
96 #include "mmu_oea64.h"
97 #include "mmu_if.h"
98 #include "moea64_if.h"
99 
100 void moea64_release_vsid(uint64_t vsid);
101 uintptr_t moea64_get_unique_vsid(void);
102 
103 #define DISABLE_TRANS(msr)	msr = mfmsr(); mtmsr(msr & ~PSL_DR)
104 #define ENABLE_TRANS(msr)	mtmsr(msr)
105 
106 #define	VSID_MAKE(sr, hash)	((sr) | (((hash) & 0xfffff) << 4))
107 #define	VSID_TO_HASH(vsid)	(((vsid) >> 4) & 0xfffff)
108 #define	VSID_HASH_MASK		0x0000007fffffffffULL
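/*
 * VSID layout implied by VSID_MAKE()/VSID_TO_HASH(): the low four bits
 * carry the segment register number and bits 4-23 carry the 20-bit hash,
 * so VSID_TO_HASH(VSID_MAKE(sr, hash)) recovers (hash & 0xfffff).
 * VSID_HASH_MASK, by contrast, selects the VSID bits that participate in
 * the PTEG hash computed in init_pvo_entry().
 */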
109 
110 /*
111  * Locking semantics:
112  *
113  * There are two locks of interest: the page locks and the pmap locks, which
114  * protect their individual PVO lists and are locked in that order. The contents
115  * of all PVO entries are protected by the locks of their respective pmaps.
116  * The pmap of any PVO is guaranteed not to change so long as the PVO is linked
117  * into any list.
118  *
119  */
120 
121 #define PV_LOCK_PER_DOM	PA_LOCK_COUNT*3
122 #define PV_LOCK_COUNT	PV_LOCK_PER_DOM*MAXMEMDOM
123 static struct mtx_padalign pv_lock[PV_LOCK_COUNT];
124 
125 /*
126  * Cheap NUMA-izing of the pv locks, to reduce contention across domains.
127  * NUMA domains on POWER9 appear to be indexed as sparse memory spaces, with the
128  * index at (N << 45).
129  */
130 #ifdef __powerpc64__
131 #define PV_LOCK_IDX(pa)	(pa_index(pa) % PV_LOCK_PER_DOM + \
132 			(((pa) >> 45) % MAXMEMDOM) * PV_LOCK_PER_DOM)
133 #else
134 #define PV_LOCK_IDX(pa)	(pa_index(pa) % PV_LOCK_COUNT)
135 #endif
136 #define PV_LOCKPTR(pa)	((struct mtx *)(&pv_lock[PV_LOCK_IDX(pa)]))
137 #define PV_LOCK(pa)		mtx_lock(PV_LOCKPTR(pa))
138 #define PV_UNLOCK(pa)		mtx_unlock(PV_LOCKPTR(pa))
139 #define PV_LOCKASSERT(pa) 	mtx_assert(PV_LOCKPTR(pa), MA_OWNED)
140 #define PV_PAGE_LOCK(m)		PV_LOCK(VM_PAGE_TO_PHYS(m))
141 #define PV_PAGE_UNLOCK(m)	PV_UNLOCK(VM_PAGE_TO_PHYS(m))
142 #define PV_PAGE_LOCKASSERT(m)	PV_LOCKASSERT(VM_PAGE_TO_PHYS(m))
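/*
 * Sketch of the locking pattern these macros support, following the order
 * described above (page lock first, then each mapping's pmap lock), as
 * used by e.g. moea64_remove_write():
 *
 *	PV_PAGE_LOCK(m);
 *	LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
 *		PMAP_LOCK(pvo->pvo_pmap);
 *		... examine or update the PVO ...
 *		PMAP_UNLOCK(pvo->pvo_pmap);
 *	}
 *	PV_PAGE_UNLOCK(m);
 *
 * All mappings of a given physical page hash to the same lock; with
 * MAXMEMDOM > 1 on powerpc64, pages in different NUMA domains (PA bits
 * 45 and up) fall into disjoint groups of locks.
 */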
143 
144 struct ofw_map {
145 	cell_t	om_va;
146 	cell_t	om_len;
147 	uint64_t om_pa;
148 	cell_t	om_mode;
149 };
150 
151 extern unsigned char _etext[];
152 extern unsigned char _end[];
153 
154 extern void *slbtrap, *slbtrapend;
155 
156 /*
157  * Map of physical memory regions.
158  */
159 static struct	mem_region *regions;
160 static struct	mem_region *pregions;
161 static struct	numa_mem_region *numa_pregions;
162 static u_int	phys_avail_count;
163 static int	regions_sz, pregions_sz, numapregions_sz;
164 
165 extern void bs_remap_earlyboot(void);
166 
167 /*
168  * Lock for the SLB tables.
169  */
170 struct mtx	moea64_slb_mutex;
171 
172 /*
173  * PTEG data.
174  */
175 u_long		moea64_pteg_count;
176 u_long		moea64_pteg_mask;
177 
178 /*
179  * PVO data.
180  */
181 
182 uma_zone_t	moea64_pvo_zone; /* zone for pvo entries */
183 
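/*
 * Statically sized pool of PVO entries for use before the UMA zone is
 * available (and, on systems without a direct map, for allocations made
 * from within the UMA page allocator itself); see alloc_pvo_entry().
 * The size is tunable via machdep.moea64_bpvo_pool_size.
 */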
184 static struct	pvo_entry *moea64_bpvo_pool;
185 static int	moea64_bpvo_pool_index = 0;
186 static int	moea64_bpvo_pool_size = 327680;
187 TUNABLE_INT("machdep.moea64_bpvo_pool_size", &moea64_bpvo_pool_size);
188 SYSCTL_INT(_machdep, OID_AUTO, moea64_allocated_bpvo_entries, CTLFLAG_RD,
189     &moea64_bpvo_pool_index, 0, "");
190 
191 #define	VSID_NBPW	(sizeof(u_int32_t) * 8)
192 #ifdef __powerpc64__
193 #define	NVSIDS		(NPMAPS * 16)
194 #define VSID_HASHMASK	0xffffffffUL
195 #else
196 #define NVSIDS		NPMAPS
197 #define VSID_HASHMASK	0xfffffUL
198 #endif
199 static u_int	moea64_vsid_bitmap[NVSIDS / VSID_NBPW];
200 
201 static boolean_t moea64_initialized = FALSE;
202 
203 /*
204  * Statistics.
205  */
206 u_int	moea64_pte_valid = 0;
207 u_int	moea64_pte_overflow = 0;
208 u_int	moea64_pvo_entries = 0;
209 u_int	moea64_pvo_enter_calls = 0;
210 u_int	moea64_pvo_remove_calls = 0;
211 SYSCTL_INT(_machdep, OID_AUTO, moea64_pte_valid, CTLFLAG_RD,
212     &moea64_pte_valid, 0, "");
213 SYSCTL_INT(_machdep, OID_AUTO, moea64_pte_overflow, CTLFLAG_RD,
214     &moea64_pte_overflow, 0, "");
215 SYSCTL_INT(_machdep, OID_AUTO, moea64_pvo_entries, CTLFLAG_RD,
216     &moea64_pvo_entries, 0, "");
217 SYSCTL_INT(_machdep, OID_AUTO, moea64_pvo_enter_calls, CTLFLAG_RD,
218     &moea64_pvo_enter_calls, 0, "");
219 SYSCTL_INT(_machdep, OID_AUTO, moea64_pvo_remove_calls, CTLFLAG_RD,
220     &moea64_pvo_remove_calls, 0, "");
221 
222 vm_offset_t	moea64_scratchpage_va[2];
223 struct pvo_entry *moea64_scratchpage_pvo[2];
224 struct	mtx	moea64_scratchpage_mtx;
225 
226 uint64_t 	moea64_large_page_mask = 0;
227 uint64_t	moea64_large_page_size = 0;
228 int		moea64_large_page_shift = 0;
229 
230 /*
231  * PVO calls.
232  */
233 static int	moea64_pvo_enter(mmu_t mmu, struct pvo_entry *pvo,
234 		    struct pvo_head *pvo_head);
235 static void	moea64_pvo_remove_from_pmap(mmu_t mmu, struct pvo_entry *pvo);
236 static void	moea64_pvo_remove_from_page(mmu_t mmu, struct pvo_entry *pvo);
237 static struct	pvo_entry *moea64_pvo_find_va(pmap_t, vm_offset_t);
238 
239 /*
240  * Utility routines.
241  */
242 static boolean_t	moea64_query_bit(mmu_t, vm_page_t, uint64_t);
243 static u_int		moea64_clear_bit(mmu_t, vm_page_t, uint64_t);
244 static void		moea64_kremove(mmu_t, vm_offset_t);
245 static void		moea64_syncicache(mmu_t, pmap_t pmap, vm_offset_t va,
246 			    vm_paddr_t pa, vm_size_t sz);
247 static void		moea64_pmap_init_qpages(void);
248 
249 /*
250  * Kernel MMU interface
251  */
252 void moea64_clear_modify(mmu_t, vm_page_t);
253 void moea64_copy_page(mmu_t, vm_page_t, vm_page_t);
254 void moea64_copy_pages(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset,
255     vm_page_t *mb, vm_offset_t b_offset, int xfersize);
256 int moea64_enter(mmu_t, pmap_t, vm_offset_t, vm_page_t, vm_prot_t,
257     u_int flags, int8_t psind);
258 void moea64_enter_object(mmu_t, pmap_t, vm_offset_t, vm_offset_t, vm_page_t,
259     vm_prot_t);
260 void moea64_enter_quick(mmu_t, pmap_t, vm_offset_t, vm_page_t, vm_prot_t);
261 vm_paddr_t moea64_extract(mmu_t, pmap_t, vm_offset_t);
262 vm_page_t moea64_extract_and_hold(mmu_t, pmap_t, vm_offset_t, vm_prot_t);
263 void moea64_init(mmu_t);
264 boolean_t moea64_is_modified(mmu_t, vm_page_t);
265 boolean_t moea64_is_prefaultable(mmu_t, pmap_t, vm_offset_t);
266 boolean_t moea64_is_referenced(mmu_t, vm_page_t);
267 int moea64_ts_referenced(mmu_t, vm_page_t);
268 vm_offset_t moea64_map(mmu_t, vm_offset_t *, vm_paddr_t, vm_paddr_t, int);
269 boolean_t moea64_page_exists_quick(mmu_t, pmap_t, vm_page_t);
270 void moea64_page_init(mmu_t, vm_page_t);
271 int moea64_page_wired_mappings(mmu_t, vm_page_t);
272 void moea64_pinit(mmu_t, pmap_t);
273 void moea64_pinit0(mmu_t, pmap_t);
274 void moea64_protect(mmu_t, pmap_t, vm_offset_t, vm_offset_t, vm_prot_t);
275 void moea64_qenter(mmu_t, vm_offset_t, vm_page_t *, int);
276 void moea64_qremove(mmu_t, vm_offset_t, int);
277 void moea64_release(mmu_t, pmap_t);
278 void moea64_remove(mmu_t, pmap_t, vm_offset_t, vm_offset_t);
279 void moea64_remove_pages(mmu_t, pmap_t);
280 void moea64_remove_all(mmu_t, vm_page_t);
281 void moea64_remove_write(mmu_t, vm_page_t);
282 void moea64_unwire(mmu_t, pmap_t, vm_offset_t, vm_offset_t);
283 void moea64_zero_page(mmu_t, vm_page_t);
284 void moea64_zero_page_area(mmu_t, vm_page_t, int, int);
285 void moea64_activate(mmu_t, struct thread *);
286 void moea64_deactivate(mmu_t, struct thread *);
287 void *moea64_mapdev(mmu_t, vm_paddr_t, vm_size_t);
288 void *moea64_mapdev_attr(mmu_t, vm_paddr_t, vm_size_t, vm_memattr_t);
289 void moea64_unmapdev(mmu_t, vm_offset_t, vm_size_t);
290 vm_paddr_t moea64_kextract(mmu_t, vm_offset_t);
291 void moea64_page_set_memattr(mmu_t, vm_page_t m, vm_memattr_t ma);
292 void moea64_kenter_attr(mmu_t, vm_offset_t, vm_paddr_t, vm_memattr_t ma);
293 void moea64_kenter(mmu_t, vm_offset_t, vm_paddr_t);
294 boolean_t moea64_dev_direct_mapped(mmu_t, vm_paddr_t, vm_size_t);
295 static void moea64_sync_icache(mmu_t, pmap_t, vm_offset_t, vm_size_t);
296 void moea64_dumpsys_map(mmu_t mmu, vm_paddr_t pa, size_t sz,
297     void **va);
298 void moea64_scan_init(mmu_t mmu);
299 vm_offset_t moea64_quick_enter_page(mmu_t mmu, vm_page_t m);
300 void moea64_quick_remove_page(mmu_t mmu, vm_offset_t addr);
301 static int moea64_map_user_ptr(mmu_t mmu, pmap_t pm,
302     volatile const void *uaddr, void **kaddr, size_t ulen, size_t *klen);
303 static int moea64_decode_kernel_ptr(mmu_t mmu, vm_offset_t addr,
304     int *is_user, vm_offset_t *decoded_addr);
305 
306 
307 static mmu_method_t moea64_methods[] = {
308 	MMUMETHOD(mmu_clear_modify,	moea64_clear_modify),
309 	MMUMETHOD(mmu_copy_page,	moea64_copy_page),
310 	MMUMETHOD(mmu_copy_pages,	moea64_copy_pages),
311 	MMUMETHOD(mmu_enter,		moea64_enter),
312 	MMUMETHOD(mmu_enter_object,	moea64_enter_object),
313 	MMUMETHOD(mmu_enter_quick,	moea64_enter_quick),
314 	MMUMETHOD(mmu_extract,		moea64_extract),
315 	MMUMETHOD(mmu_extract_and_hold,	moea64_extract_and_hold),
316 	MMUMETHOD(mmu_init,		moea64_init),
317 	MMUMETHOD(mmu_is_modified,	moea64_is_modified),
318 	MMUMETHOD(mmu_is_prefaultable,	moea64_is_prefaultable),
319 	MMUMETHOD(mmu_is_referenced,	moea64_is_referenced),
320 	MMUMETHOD(mmu_ts_referenced,	moea64_ts_referenced),
321 	MMUMETHOD(mmu_map,     		moea64_map),
322 	MMUMETHOD(mmu_page_exists_quick,moea64_page_exists_quick),
323 	MMUMETHOD(mmu_page_init,	moea64_page_init),
324 	MMUMETHOD(mmu_page_wired_mappings,moea64_page_wired_mappings),
325 	MMUMETHOD(mmu_pinit,		moea64_pinit),
326 	MMUMETHOD(mmu_pinit0,		moea64_pinit0),
327 	MMUMETHOD(mmu_protect,		moea64_protect),
328 	MMUMETHOD(mmu_qenter,		moea64_qenter),
329 	MMUMETHOD(mmu_qremove,		moea64_qremove),
330 	MMUMETHOD(mmu_release,		moea64_release),
331 	MMUMETHOD(mmu_remove,		moea64_remove),
332 	MMUMETHOD(mmu_remove_pages,	moea64_remove_pages),
333 	MMUMETHOD(mmu_remove_all,      	moea64_remove_all),
334 	MMUMETHOD(mmu_remove_write,	moea64_remove_write),
335 	MMUMETHOD(mmu_sync_icache,	moea64_sync_icache),
336 	MMUMETHOD(mmu_unwire,		moea64_unwire),
337 	MMUMETHOD(mmu_zero_page,       	moea64_zero_page),
338 	MMUMETHOD(mmu_zero_page_area,	moea64_zero_page_area),
339 	MMUMETHOD(mmu_activate,		moea64_activate),
340 	MMUMETHOD(mmu_deactivate,      	moea64_deactivate),
341 	MMUMETHOD(mmu_page_set_memattr,	moea64_page_set_memattr),
342 	MMUMETHOD(mmu_quick_enter_page, moea64_quick_enter_page),
343 	MMUMETHOD(mmu_quick_remove_page, moea64_quick_remove_page),
344 
345 	/* Internal interfaces */
346 	MMUMETHOD(mmu_mapdev,		moea64_mapdev),
347 	MMUMETHOD(mmu_mapdev_attr,	moea64_mapdev_attr),
348 	MMUMETHOD(mmu_unmapdev,		moea64_unmapdev),
349 	MMUMETHOD(mmu_kextract,		moea64_kextract),
350 	MMUMETHOD(mmu_kenter,		moea64_kenter),
351 	MMUMETHOD(mmu_kenter_attr,	moea64_kenter_attr),
352 	MMUMETHOD(mmu_dev_direct_mapped,moea64_dev_direct_mapped),
353 	MMUMETHOD(mmu_scan_init,	moea64_scan_init),
354 	MMUMETHOD(mmu_dumpsys_map,	moea64_dumpsys_map),
355 	MMUMETHOD(mmu_map_user_ptr,	moea64_map_user_ptr),
356 	MMUMETHOD(mmu_decode_kernel_ptr, moea64_decode_kernel_ptr),
357 
358 	{ 0, 0 }
359 };
360 
361 MMU_DEF(oea64_mmu, "mmu_oea64_base", moea64_methods, 0);
362 
363 static struct pvo_head *
364 vm_page_to_pvoh(vm_page_t m)
365 {
366 
367 	mtx_assert(PV_LOCKPTR(VM_PAGE_TO_PHYS(m)), MA_OWNED);
368 	return (&m->md.mdpg_pvoh);
369 }
370 
371 static struct pvo_entry *
372 alloc_pvo_entry(int bootstrap)
373 {
374 	struct pvo_entry *pvo;
375 
376 	if (!moea64_initialized || bootstrap) {
377 		if (moea64_bpvo_pool_index >= moea64_bpvo_pool_size) {
378 			panic("alloc_pvo_entry: bpvo pool exhausted, %d, %d, %zd",
379 			      moea64_bpvo_pool_index, moea64_bpvo_pool_size,
380 			      moea64_bpvo_pool_size * sizeof(struct pvo_entry));
381 		}
382 		pvo = &moea64_bpvo_pool[
383 		    atomic_fetchadd_int(&moea64_bpvo_pool_index, 1)];
384 		bzero(pvo, sizeof(*pvo));
385 		pvo->pvo_vaddr = PVO_BOOTSTRAP;
386 	} else {
387 		pvo = uma_zalloc(moea64_pvo_zone, M_NOWAIT);
388 		bzero(pvo, sizeof(*pvo));
389 	}
390 
391 	return (pvo);
392 }
393 
394 
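/*
 * Bind a PVO to a pmap and virtual address: record the VA, compute the
 * VSID-based virtual page number, and precompute the slot of the primary
 * PTEG from the hash of the VSID and the page index.
 */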
395 static void
396 init_pvo_entry(struct pvo_entry *pvo, pmap_t pmap, vm_offset_t va)
397 {
398 	uint64_t vsid;
399 	uint64_t hash;
400 	int shift;
401 
402 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
403 
404 	pvo->pvo_pmap = pmap;
405 	va &= ~ADDR_POFF;
406 	pvo->pvo_vaddr |= va;
407 	vsid = va_to_vsid(pmap, va);
408 	pvo->pvo_vpn = (uint64_t)((va & ADDR_PIDX) >> ADDR_PIDX_SHFT)
409 	    | (vsid << 16);
410 
411 	shift = (pvo->pvo_vaddr & PVO_LARGE) ? moea64_large_page_shift :
412 	    ADDR_PIDX_SHFT;
413 	hash = (vsid & VSID_HASH_MASK) ^ (((uint64_t)va & ADDR_PIDX) >> shift);
414 	pvo->pvo_pte.slot = (hash & moea64_pteg_mask) << 3;
415 }
416 
417 static void
418 free_pvo_entry(struct pvo_entry *pvo)
419 {
420 
421 	if (!(pvo->pvo_vaddr & PVO_BOOTSTRAP))
422 		uma_zfree(moea64_pvo_zone, pvo);
423 }
424 
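/*
 * Expand a PVO into its architectural PTE: the abbreviated VPN, valid bit
 * and the large-page/wired/secondary-hash flags go into the high
 * doubleword; the physical address with its WIMG attributes and the page
 * protection (BW or BR, plus NOEXEC) go into the low doubleword.
 */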
425 void
426 moea64_pte_from_pvo(const struct pvo_entry *pvo, struct lpte *lpte)
427 {
428 
429 	lpte->pte_hi = (pvo->pvo_vpn >> (ADDR_API_SHFT64 - ADDR_PIDX_SHFT)) &
430 	    LPTE_AVPN_MASK;
431 	lpte->pte_hi |= LPTE_VALID;
432 
433 	if (pvo->pvo_vaddr & PVO_LARGE)
434 		lpte->pte_hi |= LPTE_BIG;
435 	if (pvo->pvo_vaddr & PVO_WIRED)
436 		lpte->pte_hi |= LPTE_WIRED;
437 	if (pvo->pvo_vaddr & PVO_HID)
438 		lpte->pte_hi |= LPTE_HID;
439 
440 	lpte->pte_lo = pvo->pvo_pte.pa; /* Includes WIMG bits */
441 	if (pvo->pvo_pte.prot & VM_PROT_WRITE)
442 		lpte->pte_lo |= LPTE_BW;
443 	else
444 		lpte->pte_lo |= LPTE_BR;
445 
446 	if (!(pvo->pvo_pte.prot & VM_PROT_EXECUTE))
447 		lpte->pte_lo |= LPTE_NOEXEC;
448 }
449 
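/*
 * Compute the WIMG storage-control bits for a mapping: W (write-through),
 * I (caching-inhibited), M (memory coherence required) and G (guarded).
 * Addresses outside the known physical memory regions (i.e. device space)
 * default to I | G; ordinary RAM defaults to M.
 */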
450 static __inline uint64_t
451 moea64_calc_wimg(vm_paddr_t pa, vm_memattr_t ma)
452 {
453 	uint64_t pte_lo;
454 	int i;
455 
456 	if (ma != VM_MEMATTR_DEFAULT) {
457 		switch (ma) {
458 		case VM_MEMATTR_UNCACHEABLE:
459 			return (LPTE_I | LPTE_G);
460 		case VM_MEMATTR_CACHEABLE:
461 			return (LPTE_M);
462 		case VM_MEMATTR_WRITE_COMBINING:
463 		case VM_MEMATTR_WRITE_BACK:
464 		case VM_MEMATTR_PREFETCHABLE:
465 			return (LPTE_I);
466 		case VM_MEMATTR_WRITE_THROUGH:
467 			return (LPTE_W | LPTE_M);
468 		}
469 	}
470 
471 	/*
472 	 * Assume the page is cache inhibited and access is guarded unless
473 	 * it's in our available memory array.
474 	 */
475 	pte_lo = LPTE_I | LPTE_G;
476 	for (i = 0; i < pregions_sz; i++) {
477 		if ((pa >= pregions[i].mr_start) &&
478 		    (pa < (pregions[i].mr_start + pregions[i].mr_size))) {
479 			pte_lo &= ~(LPTE_I | LPTE_G);
480 			pte_lo |= LPTE_M;
481 			break;
482 		}
483 	}
484 
485 	return pte_lo;
486 }
487 
488 /*
489  * Quick sort callout for comparing memory regions.
490  */
491 static int	om_cmp(const void *a, const void *b);
492 
493 static int
494 om_cmp(const void *a, const void *b)
495 {
496 	const struct	ofw_map *mapa;
497 	const struct	ofw_map *mapb;
498 
499 	mapa = a;
500 	mapb = b;
501 	if (mapa->om_pa < mapb->om_pa)
502 		return (-1);
503 	else if (mapa->om_pa > mapb->om_pa)
504 		return (1);
505 	else
506 		return (0);
507 }
508 
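/*
 * Re-enter the firmware's virtual-to-physical translations into the
 * kernel pmap.  Each entry of the OF "translations" property holds at
 * least four cells: virtual address, length, physical address (one or
 * two cells, depending on #address-cells of the root node) and mode.
 */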
509 static void
510 moea64_add_ofw_mappings(mmu_t mmup, phandle_t mmu, size_t sz)
511 {
512 	struct ofw_map	translations[sz/(4*sizeof(cell_t))]; /* >= 4 cells each */
513 	pcell_t		acells, trans_cells[sz/sizeof(cell_t)];
514 	struct pvo_entry *pvo;
515 	register_t	msr;
516 	vm_offset_t	off;
517 	vm_paddr_t	pa_base;
518 	int		i, j;
519 
520 	bzero(translations, sz);
521 	OF_getencprop(OF_finddevice("/"), "#address-cells", &acells,
522 	    sizeof(acells));
523 	if (OF_getencprop(mmu, "translations", trans_cells, sz) == -1)
524 		panic("moea64_bootstrap: can't get ofw translations");
525 
526 	CTR0(KTR_PMAP, "moea64_add_ofw_mappings: translations");
527 	sz /= sizeof(cell_t);
528 	for (i = 0, j = 0; i < sz; j++) {
529 		translations[j].om_va = trans_cells[i++];
530 		translations[j].om_len = trans_cells[i++];
531 		translations[j].om_pa = trans_cells[i++];
532 		if (acells == 2) {
533 			translations[j].om_pa <<= 32;
534 			translations[j].om_pa |= trans_cells[i++];
535 		}
536 		translations[j].om_mode = trans_cells[i++];
537 	}
538 	KASSERT(i == sz, ("Translations map has incorrect cell count (%d/%zd)",
539 	    i, sz));
540 
541 	sz = j;
542 	qsort(translations, sz, sizeof (*translations), om_cmp);
543 
544 	for (i = 0; i < sz; i++) {
545 		pa_base = translations[i].om_pa;
546 	      #ifndef __powerpc64__
547 		if ((translations[i].om_pa >> 32) != 0)
548 			panic("OFW translations above 32-bit boundary!");
549 	      #endif
550 
551 		if (pa_base % PAGE_SIZE)
552 			panic("OFW translation not page-aligned (phys)!");
553 		if (translations[i].om_va % PAGE_SIZE)
554 			panic("OFW translation not page-aligned (virt)!");
555 
556 		CTR3(KTR_PMAP, "translation: pa=%#zx va=%#x len=%#x",
557 		    pa_base, translations[i].om_va, translations[i].om_len);
558 
559 		/* Now enter the pages for this mapping */
560 
561 		DISABLE_TRANS(msr);
562 		for (off = 0; off < translations[i].om_len; off += PAGE_SIZE) {
563 			/* If this address is direct-mapped, skip remapping */
564 			if (hw_direct_map &&
565 			    translations[i].om_va == PHYS_TO_DMAP(pa_base) &&
566 			    moea64_calc_wimg(pa_base + off, VM_MEMATTR_DEFAULT)
567  			    == LPTE_M)
568 				continue;
569 
570 			PMAP_LOCK(kernel_pmap);
571 			pvo = moea64_pvo_find_va(kernel_pmap,
572 			    translations[i].om_va + off);
573 			PMAP_UNLOCK(kernel_pmap);
574 			if (pvo != NULL)
575 				continue;
576 
577 			moea64_kenter(mmup, translations[i].om_va + off,
578 			    pa_base + off);
579 		}
580 		ENABLE_TRANS(msr);
581 	}
582 }
583 
584 #ifdef __powerpc64__
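/*
 * Determine the large-page size available for the direct map.  The 970
 * (G5) family boots with large pages disabled in HID4, so re-enable them
 * here; unless the platform code has already chosen a size, 16 MB pages
 * are assumed.
 */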
585 static void
586 moea64_probe_large_page(void)
587 {
588 	uint16_t pvr = mfpvr() >> 16;
589 
590 	switch (pvr) {
591 	case IBM970:
592 	case IBM970FX:
593 	case IBM970MP:
594 		powerpc_sync(); isync();
595 		mtspr(SPR_HID4, mfspr(SPR_HID4) & ~HID4_970_DISABLE_LG_PG);
596 		powerpc_sync(); isync();
597 
598 		/* FALLTHROUGH */
599 	default:
600 		if (moea64_large_page_size == 0) {
601 			moea64_large_page_size = 0x1000000; /* 16 MB */
602 			moea64_large_page_shift = 24;
603 		}
604 	}
605 
606 	moea64_large_page_mask = moea64_large_page_size - 1;
607 }
608 
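/*
 * Pre-load an SLB entry covering the given kernel VA so early boot code
 * can touch that segment without taking an SLB fault.  If a matching
 * entry is already present in the per-CPU SLB cache, nothing is done.
 */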
609 static void
610 moea64_bootstrap_slb_prefault(vm_offset_t va, int large)
611 {
612 	struct slb *cache;
613 	struct slb entry;
614 	uint64_t esid, slbe;
615 	uint64_t i;
616 
617 	cache = PCPU_GET(aim.slb);
618 	esid = va >> ADDR_SR_SHFT;
619 	slbe = (esid << SLBE_ESID_SHIFT) | SLBE_VALID;
620 
621 	for (i = 0; i < 64; i++) {
622 		if (cache[i].slbe == (slbe | i))
623 			return;
624 	}
625 
626 	entry.slbe = slbe;
627 	entry.slbv = KERNEL_VSID(esid) << SLBV_VSID_SHIFT;
628 	if (large)
629 		entry.slbv |= SLBV_L;
630 
631 	slb_insert_kernel(entry.slbe, entry.slbv);
632 }
633 #endif
634 
635 static void
636 moea64_setup_direct_map(mmu_t mmup, vm_offset_t kernelstart,
637     vm_offset_t kernelend)
638 {
639 	struct pvo_entry *pvo;
640 	register_t msr;
641 	vm_paddr_t pa;
642 	vm_offset_t size, off;
643 	uint64_t pte_lo;
644 	int i;
645 
646 	if (moea64_large_page_size == 0)
647 		hw_direct_map = 0;
648 
649 	DISABLE_TRANS(msr);
650 	if (hw_direct_map) {
651 		PMAP_LOCK(kernel_pmap);
652 		for (i = 0; i < pregions_sz; i++) {
653 		  for (pa = pregions[i].mr_start; pa < pregions[i].mr_start +
654 		     pregions[i].mr_size; pa += moea64_large_page_size) {
655 			pte_lo = LPTE_M;
656 
657 			pvo = alloc_pvo_entry(1 /* bootstrap */);
658 			pvo->pvo_vaddr |= PVO_WIRED | PVO_LARGE;
659 			init_pvo_entry(pvo, kernel_pmap, PHYS_TO_DMAP(pa));
660 
661 			/*
662 			 * Set memory access as guarded if a prefetch within
663 			 * the page could reach outside the available physmem area.
664 			 */
665 			if (pa & moea64_large_page_mask) {
666 				pa &= moea64_large_page_mask;
667 				pte_lo |= LPTE_G;
668 			}
669 			if (pa + moea64_large_page_size >
670 			    pregions[i].mr_start + pregions[i].mr_size)
671 				pte_lo |= LPTE_G;
672 
673 			pvo->pvo_pte.prot = VM_PROT_READ | VM_PROT_WRITE |
674 			    VM_PROT_EXECUTE;
675 			pvo->pvo_pte.pa = pa | pte_lo;
676 			moea64_pvo_enter(mmup, pvo, NULL);
677 		  }
678 		}
679 		PMAP_UNLOCK(kernel_pmap);
680 	}
681 
682 	/*
683 	 * Make sure the kernel and BPVO pool stay mapped on systems either
684 	 * without a direct map or on which the kernel is not already executing
685 	 * out of the direct-mapped region.
686 	 */
687 
688 	if (!hw_direct_map || kernelstart < DMAP_BASE_ADDRESS) {
689 		for (pa = kernelstart & ~PAGE_MASK; pa < kernelend;
690 		    pa += PAGE_SIZE)
691 			moea64_kenter(mmup, pa, pa);
692 	}
693 
694 	if (!hw_direct_map) {
695 		size = moea64_bpvo_pool_size*sizeof(struct pvo_entry);
696 		off = (vm_offset_t)(moea64_bpvo_pool);
697 		for (pa = off; pa < off + size; pa += PAGE_SIZE)
698 			moea64_kenter(mmup, pa, pa);
699 	}
700 	ENABLE_TRANS(msr);
701 
702 	/*
703 	 * Allow user to override unmapped_buf_allowed for testing.
704 	 * XXXKIB Only direct map implementation was tested.
705 	 */
706 	if (!TUNABLE_INT_FETCH("vfs.unmapped_buf_allowed",
707 	    &unmapped_buf_allowed))
708 		unmapped_buf_allowed = hw_direct_map;
709 }
710 
711 /* Quick sort callout for comparing physical addresses. */
712 static int
713 pa_cmp(const void *a, const void *b)
714 {
715 	const vm_paddr_t *pa = a, *pb = b;
716 
717 	if (*pa < *pb)
718 		return (-1);
719 	else if (*pa > *pb)
720 		return (1);
721 	else
722 		return (0);
723 }
724 
725 void
726 moea64_early_bootstrap(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend)
727 {
728 	int		i, j;
729 	vm_size_t	physsz, hwphyssz;
730 	vm_paddr_t	kernelphysstart, kernelphysend;
731 	int		rm_pavail;
732 
733 #ifndef __powerpc64__
734 	/* We don't have a direct map since there is no BAT */
735 	hw_direct_map = 0;
736 
737 	/* Make sure battable is zero, since we have no BAT */
738 	for (i = 0; i < 16; i++) {
739 		battable[i].batu = 0;
740 		battable[i].batl = 0;
741 	}
742 #else
743 	moea64_probe_large_page();
744 
745 	/* Use a direct map if we have large page support */
746 	if (moea64_large_page_size > 0)
747 		hw_direct_map = 1;
748 	else
749 		hw_direct_map = 0;
750 
751 	/* Install trap handlers for SLBs */
752 	bcopy(&slbtrap, (void *)EXC_DSE,(size_t)&slbtrapend - (size_t)&slbtrap);
753 	bcopy(&slbtrap, (void *)EXC_ISE,(size_t)&slbtrapend - (size_t)&slbtrap);
754 	__syncicache((void *)EXC_DSE, 0x80);
755 	__syncicache((void *)EXC_ISE, 0x80);
756 #endif
757 
758 	kernelphysstart = kernelstart & ~DMAP_BASE_ADDRESS;
759 	kernelphysend = kernelend & ~DMAP_BASE_ADDRESS;
760 
761 	/* Get physical memory regions from firmware */
762 	mem_regions(&pregions, &pregions_sz, &regions, &regions_sz);
763 	CTR0(KTR_PMAP, "moea64_bootstrap: physical memory");
764 
765 	if (sizeof(phys_avail)/sizeof(phys_avail[0]) < regions_sz)
766 		panic("moea64_bootstrap: phys_avail too small");
767 
768 	phys_avail_count = 0;
769 	physsz = 0;
770 	hwphyssz = 0;
771 	TUNABLE_ULONG_FETCH("hw.physmem", (u_long *) &hwphyssz);
772 	for (i = 0, j = 0; i < regions_sz; i++, j += 2) {
773 		CTR3(KTR_PMAP, "region: %#zx - %#zx (%#zx)",
774 		    regions[i].mr_start, regions[i].mr_start +
775 		    regions[i].mr_size, regions[i].mr_size);
776 		if (hwphyssz != 0 &&
777 		    (physsz + regions[i].mr_size) >= hwphyssz) {
778 			if (physsz < hwphyssz) {
779 				phys_avail[j] = regions[i].mr_start;
780 				phys_avail[j + 1] = regions[i].mr_start +
781 				    hwphyssz - physsz;
782 				physsz = hwphyssz;
783 				phys_avail_count++;
784 			}
785 			break;
786 		}
787 		phys_avail[j] = regions[i].mr_start;
788 		phys_avail[j + 1] = regions[i].mr_start + regions[i].mr_size;
789 		phys_avail_count++;
790 		physsz += regions[i].mr_size;
791 	}
792 
793 	/* Check for overlap with the kernel and exception vectors */
794 	rm_pavail = 0;
795 	for (j = 0; j < 2*phys_avail_count; j+=2) {
796 		if (phys_avail[j] < EXC_LAST)
797 			phys_avail[j] += EXC_LAST;
798 
799 		if (phys_avail[j] >= kernelphysstart &&
800 		    phys_avail[j+1] <= kernelphysend) {
801 			phys_avail[j] = phys_avail[j+1] = ~0;
802 			rm_pavail++;
803 			continue;
804 		}
805 
806 		if (kernelphysstart >= phys_avail[j] &&
807 		    kernelphysstart < phys_avail[j+1]) {
808 			if (kernelphysend < phys_avail[j+1]) {
809 				phys_avail[2*phys_avail_count] =
810 				    (kernelphysend & ~PAGE_MASK) + PAGE_SIZE;
811 				phys_avail[2*phys_avail_count + 1] =
812 				    phys_avail[j+1];
813 				phys_avail_count++;
814 			}
815 
816 			phys_avail[j+1] = kernelphysstart & ~PAGE_MASK;
817 		}
818 
819 		if (kernelphysend >= phys_avail[j] &&
820 		    kernelphysend < phys_avail[j+1]) {
821 			if (kernelphysstart > phys_avail[j]) {
822 				phys_avail[2*phys_avail_count] = phys_avail[j];
823 				phys_avail[2*phys_avail_count + 1] =
824 				    kernelphysstart & ~PAGE_MASK;
825 				phys_avail_count++;
826 			}
827 
828 			phys_avail[j] = (kernelphysend & ~PAGE_MASK) +
829 			    PAGE_SIZE;
830 		}
831 	}
832 
833 	/* Remove physical available regions marked for removal (~0) */
834 	if (rm_pavail) {
835 		qsort(phys_avail, 2*phys_avail_count, sizeof(phys_avail[0]),
836 			pa_cmp);
837 		phys_avail_count -= rm_pavail;
838 		for (i = 2*phys_avail_count;
839 		     i < 2*(phys_avail_count + rm_pavail); i+=2)
840 			phys_avail[i] = phys_avail[i+1] = 0;
841 	}
842 
843 	physmem = btoc(physsz);
844 
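/*
 * Size the page table.  Unless the kernel configuration fixes PTEGCOUNT,
 * use the largest power of two below the number of physical pages, with
 * a floor of 0x800 PTEGs; at eight PTEs per PTEG this leaves several PTE
 * slots per page of managed RAM.
 */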
845 #ifdef PTEGCOUNT
846 	moea64_pteg_count = PTEGCOUNT;
847 #else
848 	moea64_pteg_count = 0x1000;
849 
850 	while (moea64_pteg_count < physmem)
851 		moea64_pteg_count <<= 1;
852 
853 	moea64_pteg_count >>= 1;
854 #endif /* PTEGCOUNT */
855 }
856 
857 void
858 moea64_mid_bootstrap(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend)
859 {
860 	int		i;
861 
862 	/*
863 	 * Set PTEG mask
864 	 */
865 	moea64_pteg_mask = moea64_pteg_count - 1;
866 
867 	/*
868 	 * Initialize SLB table lock and page locks
869 	 */
870 	mtx_init(&moea64_slb_mutex, "SLB table", NULL, MTX_DEF);
871 	for (i = 0; i < PV_LOCK_COUNT; i++)
872 		mtx_init(&pv_lock[i], "page pv", NULL, MTX_DEF);
873 
874 	/*
875 	 * Initialise the bootstrap pvo pool.
876 	 */
877 	moea64_bpvo_pool = (struct pvo_entry *)moea64_bootstrap_alloc(
878 		moea64_bpvo_pool_size*sizeof(struct pvo_entry), 0);
879 	moea64_bpvo_pool_index = 0;
880 
881 	/* Place at address usable through the direct map */
882 	if (hw_direct_map)
883 		moea64_bpvo_pool = (struct pvo_entry *)
884 		    PHYS_TO_DMAP((uintptr_t)moea64_bpvo_pool);
885 
886 	/*
887 	 * Make sure kernel vsid is allocated as well as VSID 0.
888 	 */
889 	#ifndef __powerpc64__
890 	moea64_vsid_bitmap[(KERNEL_VSIDBITS & (NVSIDS - 1)) / VSID_NBPW]
891 		|= 1 << (KERNEL_VSIDBITS % VSID_NBPW);
892 	moea64_vsid_bitmap[0] |= 1;
893 	#endif
894 
895 	/*
896 	 * Initialize the kernel pmap (which is statically allocated).
897 	 */
898 	#ifdef __powerpc64__
899 	for (i = 0; i < 64; i++) {
900 		pcpup->pc_aim.slb[i].slbv = 0;
901 		pcpup->pc_aim.slb[i].slbe = 0;
902 	}
903 	#else
904 	for (i = 0; i < 16; i++)
905 		kernel_pmap->pm_sr[i] = EMPTY_SEGMENT + i;
906 	#endif
907 
908 	kernel_pmap->pmap_phys = kernel_pmap;
909 	CPU_FILL(&kernel_pmap->pm_active);
910 	RB_INIT(&kernel_pmap->pmap_pvo);
911 
912 	PMAP_LOCK_INIT(kernel_pmap);
913 
914 	/*
915 	 * Now map in all the other buffers we allocated earlier
916 	 */
917 
918 	moea64_setup_direct_map(mmup, kernelstart, kernelend);
919 }
920 
921 void
922 moea64_late_bootstrap(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend)
923 {
924 	ihandle_t	mmui;
925 	phandle_t	chosen;
926 	phandle_t	mmu;
927 	ssize_t		sz;
928 	int		i;
929 	vm_offset_t	pa, va;
930 	void		*dpcpu;
931 
932 	/*
933 	 * Set up the Open Firmware pmap and add its mappings if not in real
934 	 * mode.
935 	 */
936 
937 	chosen = OF_finddevice("/chosen");
938 	if (chosen != -1 && OF_getencprop(chosen, "mmu", &mmui, 4) != -1) {
939 		mmu = OF_instance_to_package(mmui);
940 		if (mmu == -1 ||
941 		    (sz = OF_getproplen(mmu, "translations")) == -1)
942 			sz = 0;
943 		if (sz > 6144 /* tmpstksz - 2 KB headroom */)
944 			panic("moea64_bootstrap: too many ofw translations");
945 
946 		if (sz > 0)
947 			moea64_add_ofw_mappings(mmup, mmu, sz);
948 	}
949 
950 	/*
951 	 * Calculate the last available physical address.
952 	 */
953 	Maxmem = 0;
954 	for (i = 0; phys_avail[i + 2] != 0; i += 2)
955 		Maxmem = MAX(Maxmem, powerpc_btop(phys_avail[i + 1]));
956 
957 	/*
958 	 * Initialize MMU.
959 	 */
960 	MMU_CPU_BOOTSTRAP(mmup,0);
961 	mtmsr(mfmsr() | PSL_DR | PSL_IR);
962 	pmap_bootstrapped++;
963 
964 	/*
965 	 * Set the start and end of kva.
966 	 */
967 	virtual_avail = VM_MIN_KERNEL_ADDRESS;
968 	virtual_end = VM_MAX_SAFE_KERNEL_ADDRESS;
969 
970 	/*
971 	 * Map the entire KVA range into the SLB. We must not fault there.
972 	 */
973 	#ifdef __powerpc64__
974 	for (va = virtual_avail; va < virtual_end; va += SEGMENT_LENGTH)
975 		moea64_bootstrap_slb_prefault(va, 0);
976 	#endif
977 
978 	/*
979 	 * Remap any early IO mappings (console framebuffer, etc.)
980 	 */
981 	bs_remap_earlyboot();
982 
983 	/*
984 	 * Figure out how far we can extend virtual_end into segment 16
985 	 * without running into existing mappings. Segment 16 is guaranteed
986 	 * to contain neither RAM nor devices (at least on Apple hardware),
987 	 * but will generally contain some OFW mappings we should not
988 	 * step on.
989 	 */
990 
991 	#ifndef __powerpc64__	/* KVA is in high memory on PPC64 */
992 	PMAP_LOCK(kernel_pmap);
993 	while (virtual_end < VM_MAX_KERNEL_ADDRESS &&
994 	    moea64_pvo_find_va(kernel_pmap, virtual_end+1) == NULL)
995 		virtual_end += PAGE_SIZE;
996 	PMAP_UNLOCK(kernel_pmap);
997 	#endif
998 
999 	/*
1000 	 * Allocate a kernel stack with a guard page for thread0 and map it
1001 	 * into the kernel page map.
1002 	 */
1003 	pa = moea64_bootstrap_alloc(kstack_pages * PAGE_SIZE, PAGE_SIZE);
1004 	va = virtual_avail + KSTACK_GUARD_PAGES * PAGE_SIZE;
1005 	virtual_avail = va + kstack_pages * PAGE_SIZE;
1006 	CTR2(KTR_PMAP, "moea64_bootstrap: kstack0 at %#x (%#x)", pa, va);
1007 	thread0.td_kstack = va;
1008 	thread0.td_kstack_pages = kstack_pages;
1009 	for (i = 0; i < kstack_pages; i++) {
1010 		moea64_kenter(mmup, va, pa);
1011 		pa += PAGE_SIZE;
1012 		va += PAGE_SIZE;
1013 	}
1014 
1015 	/*
1016 	 * Allocate virtual address space for the message buffer.
1017 	 */
1018 	pa = msgbuf_phys = moea64_bootstrap_alloc(msgbufsize, PAGE_SIZE);
1019 	msgbufp = (struct msgbuf *)virtual_avail;
1020 	va = virtual_avail;
1021 	virtual_avail += round_page(msgbufsize);
1022 	while (va < virtual_avail) {
1023 		moea64_kenter(mmup, va, pa);
1024 		pa += PAGE_SIZE;
1025 		va += PAGE_SIZE;
1026 	}
1027 
1028 	/*
1029 	 * Allocate virtual address space for the dynamic percpu area.
1030 	 */
1031 	pa = moea64_bootstrap_alloc(DPCPU_SIZE, PAGE_SIZE);
1032 	dpcpu = (void *)virtual_avail;
1033 	va = virtual_avail;
1034 	virtual_avail += DPCPU_SIZE;
1035 	while (va < virtual_avail) {
1036 		moea64_kenter(mmup, va, pa);
1037 		pa += PAGE_SIZE;
1038 		va += PAGE_SIZE;
1039 	}
1040 	dpcpu_init(dpcpu, curcpu);
1041 
1042 	/*
1043 	 * Allocate scratch-page mappings for page zeroing and copying. We
1044 	 * put these directly in the page table and use MOEA64_PTE_REPLACE
1045 	 * to keep the PVO book-keeping and other parts of the VM system
1046 	 * from even knowing that this hack exists.
1047 	 */
1048 
1049 	if (!hw_direct_map) {
1050 		mtx_init(&moea64_scratchpage_mtx, "pvo zero page", NULL,
1051 		    MTX_DEF);
1052 		for (i = 0; i < 2; i++) {
1053 			moea64_scratchpage_va[i] = (virtual_end+1) - PAGE_SIZE;
1054 			virtual_end -= PAGE_SIZE;
1055 
1056 			moea64_kenter(mmup, moea64_scratchpage_va[i], 0);
1057 
1058 			PMAP_LOCK(kernel_pmap);
1059 			moea64_scratchpage_pvo[i] = moea64_pvo_find_va(
1060 			    kernel_pmap, (vm_offset_t)moea64_scratchpage_va[i]);
1061 			PMAP_UNLOCK(kernel_pmap);
1062 		}
1063 	}
1064 
1065 	numa_mem_regions(&numa_pregions, &numapregions_sz);
1066 }
1067 
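/*
 * Set up the per-CPU quick-map state used by moea64_quick_enter_page():
 * a page of KVA, its PVO in the kernel pmap, and the mutex that
 * serializes its use.  Machines with a direct map never need these
 * temporary mappings, so skip the setup there.
 */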
1068 static void
1069 moea64_pmap_init_qpages(void)
1070 {
1071 	struct pcpu *pc;
1072 	int i;
1073 
1074 	if (hw_direct_map)
1075 		return;
1076 
1077 	CPU_FOREACH(i) {
1078 		pc = pcpu_find(i);
1079 		pc->pc_qmap_addr = kva_alloc(PAGE_SIZE);
1080 		if (pc->pc_qmap_addr == 0)
1081 			panic("pmap_init_qpages: unable to allocate KVA");
1082 		PMAP_LOCK(kernel_pmap);
1083 		pc->pc_aim.qmap_pvo =
1084 		    moea64_pvo_find_va(kernel_pmap, pc->pc_qmap_addr);
1085 		PMAP_UNLOCK(kernel_pmap);
1086 		mtx_init(&pc->pc_aim.qmap_lock, "qmap lock", NULL, MTX_DEF);
1087 	}
1088 }
1089 
1090 SYSINIT(qpages_init, SI_SUB_CPU, SI_ORDER_ANY, moea64_pmap_init_qpages, NULL);
1091 
1092 /*
1093  * Activate a user pmap.  This mostly involves setting some non-CPU
1094  * state.
1095  */
1096 void
1097 moea64_activate(mmu_t mmu, struct thread *td)
1098 {
1099 	pmap_t	pm;
1100 
1101 	pm = &td->td_proc->p_vmspace->vm_pmap;
1102 	CPU_SET(PCPU_GET(cpuid), &pm->pm_active);
1103 
1104 	#ifdef __powerpc64__
1105 	PCPU_SET(aim.userslb, pm->pm_slb);
1106 	__asm __volatile("slbmte %0, %1; isync" ::
1107 	    "r"(td->td_pcb->pcb_cpu.aim.usr_vsid), "r"(USER_SLB_SLBE));
1108 	#else
1109 	PCPU_SET(curpmap, pm->pmap_phys);
1110 	mtsrin(USER_SR << ADDR_SR_SHFT, td->td_pcb->pcb_cpu.aim.usr_vsid);
1111 	#endif
1112 }
1113 
1114 void
1115 moea64_deactivate(mmu_t mmu, struct thread *td)
1116 {
1117 	pmap_t	pm;
1118 
1119 	__asm __volatile("isync; slbie %0" :: "r"(USER_ADDR));
1120 
1121 	pm = &td->td_proc->p_vmspace->vm_pmap;
1122 	CPU_CLR(PCPU_GET(cpuid), &pm->pm_active);
1123 	#ifdef __powerpc64__
1124 	PCPU_SET(aim.userslb, NULL);
1125 	#else
1126 	PCPU_SET(curpmap, NULL);
1127 	#endif
1128 }
1129 
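/*
 * Clear the wired attribute from every mapping in [sva, eva).  The PTE is
 * updated in place, any returned reference/change bits are folded back
 * into the page's attributes, and the pmap's wired count is adjusted.
 */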
1130 void
1131 moea64_unwire(mmu_t mmu, pmap_t pm, vm_offset_t sva, vm_offset_t eva)
1132 {
1133 	struct	pvo_entry key, *pvo;
1134 	vm_page_t m;
1135 	int64_t	refchg;
1136 
1137 	key.pvo_vaddr = sva;
1138 	PMAP_LOCK(pm);
1139 	for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key);
1140 	    pvo != NULL && PVO_VADDR(pvo) < eva;
1141 	    pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) {
1142 		if ((pvo->pvo_vaddr & PVO_WIRED) == 0)
1143 			panic("moea64_unwire: pvo %p is missing PVO_WIRED",
1144 			    pvo);
1145 		pvo->pvo_vaddr &= ~PVO_WIRED;
1146 		refchg = MOEA64_PTE_REPLACE(mmu, pvo, 0 /* No invalidation */);
1147 		if ((pvo->pvo_vaddr & PVO_MANAGED) &&
1148 		    (pvo->pvo_pte.prot & VM_PROT_WRITE)) {
1149 			if (refchg < 0)
1150 				refchg = LPTE_CHG;
1151 			m = PHYS_TO_VM_PAGE(pvo->pvo_pte.pa & LPTE_RPGN);
1152 
1153 			refchg |= atomic_readandclear_32(&m->md.mdpg_attrs);
1154 			if (refchg & LPTE_CHG)
1155 				vm_page_dirty(m);
1156 			if (refchg & LPTE_REF)
1157 				vm_page_aflag_set(m, PGA_REFERENCED);
1158 		}
1159 		pm->pm_stats.wired_count--;
1160 	}
1161 	PMAP_UNLOCK(pm);
1162 }
1163 
1164 /*
1165  * Set the physical address of our special scratch PTE to the PA we
1166  * want to zero or copy.  Because of locking issues (this can get
1167  * called in pvo_enter() by the UMA allocator), we can't use most
1168  * other utility functions here.
1169  */
1170 
1171 static __inline void
1172 moea64_set_scratchpage_pa(mmu_t mmup, int which, vm_paddr_t pa)
1173 {
1174 	KASSERT(!hw_direct_map, ("Using OEA64 scratchpage with a direct map!"));
1175 	mtx_assert(&moea64_scratchpage_mtx, MA_OWNED);
1176 
1177 	moea64_scratchpage_pvo[which]->pvo_pte.pa =
1178 	    moea64_calc_wimg(pa, VM_MEMATTR_DEFAULT) | (uint64_t)pa;
1179 	MOEA64_PTE_REPLACE(mmup, moea64_scratchpage_pvo[which],
1180 	    MOEA64_PTE_INVALIDATE);
1181 	isync();
1182 }
1183 
1184 void
1185 moea64_copy_page(mmu_t mmu, vm_page_t msrc, vm_page_t mdst)
1186 {
1187 	vm_offset_t	dst;
1188 	vm_offset_t	src;
1189 
1190 	dst = VM_PAGE_TO_PHYS(mdst);
1191 	src = VM_PAGE_TO_PHYS(msrc);
1192 
1193 	if (hw_direct_map) {
1194 		bcopy((void *)PHYS_TO_DMAP(src), (void *)PHYS_TO_DMAP(dst),
1195 		    PAGE_SIZE);
1196 	} else {
1197 		mtx_lock(&moea64_scratchpage_mtx);
1198 
1199 		moea64_set_scratchpage_pa(mmu, 0, src);
1200 		moea64_set_scratchpage_pa(mmu, 1, dst);
1201 
1202 		bcopy((void *)moea64_scratchpage_va[0],
1203 		    (void *)moea64_scratchpage_va[1], PAGE_SIZE);
1204 
1205 		mtx_unlock(&moea64_scratchpage_mtx);
1206 	}
1207 }
1208 
1209 static inline void
1210 moea64_copy_pages_dmap(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset,
1211     vm_page_t *mb, vm_offset_t b_offset, int xfersize)
1212 {
1213 	void *a_cp, *b_cp;
1214 	vm_offset_t a_pg_offset, b_pg_offset;
1215 	int cnt;
1216 
1217 	while (xfersize > 0) {
1218 		a_pg_offset = a_offset & PAGE_MASK;
1219 		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
1220 		a_cp = (char *)(uintptr_t)PHYS_TO_DMAP(
1221 		    VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT])) +
1222 		    a_pg_offset;
1223 		b_pg_offset = b_offset & PAGE_MASK;
1224 		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
1225 		b_cp = (char *)(uintptr_t)PHYS_TO_DMAP(
1226 		    VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT])) +
1227 		    b_pg_offset;
1228 		bcopy(a_cp, b_cp, cnt);
1229 		a_offset += cnt;
1230 		b_offset += cnt;
1231 		xfersize -= cnt;
1232 	}
1233 }
1234 
1235 static inline void
1236 moea64_copy_pages_nodmap(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset,
1237     vm_page_t *mb, vm_offset_t b_offset, int xfersize)
1238 {
1239 	void *a_cp, *b_cp;
1240 	vm_offset_t a_pg_offset, b_pg_offset;
1241 	int cnt;
1242 
1243 	mtx_lock(&moea64_scratchpage_mtx);
1244 	while (xfersize > 0) {
1245 		a_pg_offset = a_offset & PAGE_MASK;
1246 		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
1247 		moea64_set_scratchpage_pa(mmu, 0,
1248 		    VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT]));
1249 		a_cp = (char *)moea64_scratchpage_va[0] + a_pg_offset;
1250 		b_pg_offset = b_offset & PAGE_MASK;
1251 		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
1252 		moea64_set_scratchpage_pa(mmu, 1,
1253 		    VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT]));
1254 		b_cp = (char *)moea64_scratchpage_va[1] + b_pg_offset;
1255 		bcopy(a_cp, b_cp, cnt);
1256 		a_offset += cnt;
1257 		b_offset += cnt;
1258 		xfersize -= cnt;
1259 	}
1260 	mtx_unlock(&moea64_scratchpage_mtx);
1261 }
1262 
1263 void
1264 moea64_copy_pages(mmu_t mmu, vm_page_t *ma, vm_offset_t a_offset,
1265     vm_page_t *mb, vm_offset_t b_offset, int xfersize)
1266 {
1267 
1268 	if (hw_direct_map) {
1269 		moea64_copy_pages_dmap(mmu, ma, a_offset, mb, b_offset,
1270 		    xfersize);
1271 	} else {
1272 		moea64_copy_pages_nodmap(mmu, ma, a_offset, mb, b_offset,
1273 		    xfersize);
1274 	}
1275 }
1276 
1277 void
1278 moea64_zero_page_area(mmu_t mmu, vm_page_t m, int off, int size)
1279 {
1280 	vm_paddr_t pa = VM_PAGE_TO_PHYS(m);
1281 
1282 	if (size + off > PAGE_SIZE)
1283 		panic("moea64_zero_page: size + off > PAGE_SIZE");
1284 
1285 	if (hw_direct_map) {
1286 		bzero((caddr_t)(uintptr_t)PHYS_TO_DMAP(pa) + off, size);
1287 	} else {
1288 		mtx_lock(&moea64_scratchpage_mtx);
1289 		moea64_set_scratchpage_pa(mmu, 0, pa);
1290 		bzero((caddr_t)moea64_scratchpage_va[0] + off, size);
1291 		mtx_unlock(&moea64_scratchpage_mtx);
1292 	}
1293 }
1294 
1295 /*
1296  * Zero a page of physical memory by temporarily mapping it
1297  */
1298 void
1299 moea64_zero_page(mmu_t mmu, vm_page_t m)
1300 {
1301 	vm_paddr_t pa = VM_PAGE_TO_PHYS(m);
1302 	vm_offset_t va, off;
1303 
1304 	if (!hw_direct_map) {
1305 		mtx_lock(&moea64_scratchpage_mtx);
1306 
1307 		moea64_set_scratchpage_pa(mmu, 0, pa);
1308 		va = moea64_scratchpage_va[0];
1309 	} else {
1310 		va = PHYS_TO_DMAP(pa);
1311 	}
1312 
1313 	for (off = 0; off < PAGE_SIZE; off += cacheline_size)
1314 		__asm __volatile("dcbz 0,%0" :: "r"(va + off));
1315 
1316 	if (!hw_direct_map)
1317 		mtx_unlock(&moea64_scratchpage_mtx);
1318 }
1319 
1320 vm_offset_t
1321 moea64_quick_enter_page(mmu_t mmu, vm_page_t m)
1322 {
1323 	struct pvo_entry *pvo;
1324 	vm_paddr_t pa = VM_PAGE_TO_PHYS(m);
1325 
1326 	if (hw_direct_map)
1327 		return (PHYS_TO_DMAP(pa));
1328 
1329 	/*
1330  	 * MOEA64_PTE_REPLACE does some locking, so we can't just grab
1331 	 * a critical section and access the PCPU data like on i386.
1332 	 * Instead, pin the thread and grab the PCPU lock to prevent
1333 	 * a preempting thread from using the same PCPU data.
1334 	 */
1335 	sched_pin();
1336 
1337 	mtx_assert(PCPU_PTR(aim.qmap_lock), MA_NOTOWNED);
1338 	pvo = PCPU_GET(aim.qmap_pvo);
1339 
1340 	mtx_lock(PCPU_PTR(aim.qmap_lock));
1341 	pvo->pvo_pte.pa = moea64_calc_wimg(pa, pmap_page_get_memattr(m)) |
1342 	    (uint64_t)pa;
1343 	MOEA64_PTE_REPLACE(mmu, pvo, MOEA64_PTE_INVALIDATE);
1344 	isync();
1345 
1346 	return (PCPU_GET(qmap_addr));
1347 }
1348 
1349 void
1350 moea64_quick_remove_page(mmu_t mmu, vm_offset_t addr)
1351 {
1352 	if (hw_direct_map)
1353 		return;
1354 
1355 	mtx_assert(PCPU_PTR(aim.qmap_lock), MA_OWNED);
1356 	KASSERT(PCPU_GET(qmap_addr) == addr,
1357 	    ("moea64_quick_remove_page: invalid address"));
1358 	mtx_unlock(PCPU_PTR(aim.qmap_lock));
1359 	sched_unpin();
1360 }
1361 
1362 /*
1363  * Map the given physical page at the specified virtual address in the
1364  * target pmap with the protection requested.  If specified the page
1365  * will be wired down.
1366  */
1367 
1368 int
1369 moea64_enter(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_page_t m,
1370     vm_prot_t prot, u_int flags, int8_t psind)
1371 {
1372 	struct		pvo_entry *pvo, *oldpvo;
1373 	struct		pvo_head *pvo_head;
1374 	uint64_t	pte_lo;
1375 	int		error;
1376 
1377 	if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
1378 		VM_OBJECT_ASSERT_LOCKED(m->object);
1379 
1380 	pvo = alloc_pvo_entry(0);
1381 	pvo->pvo_pmap = NULL; /* to be filled in later */
1382 	pvo->pvo_pte.prot = prot;
1383 
1384 	pte_lo = moea64_calc_wimg(VM_PAGE_TO_PHYS(m), pmap_page_get_memattr(m));
1385 	pvo->pvo_pte.pa = VM_PAGE_TO_PHYS(m) | pte_lo;
1386 
1387 	if ((flags & PMAP_ENTER_WIRED) != 0)
1388 		pvo->pvo_vaddr |= PVO_WIRED;
1389 
1390 	if ((m->oflags & VPO_UNMANAGED) != 0 || !moea64_initialized) {
1391 		pvo_head = NULL;
1392 	} else {
1393 		pvo_head = &m->md.mdpg_pvoh;
1394 		pvo->pvo_vaddr |= PVO_MANAGED;
1395 	}
1396 
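	/*
	 * Insert the new PVO, retrying after vm_wait() if the page table
	 * insertion fails with ENOMEM and the caller allows sleeping.  An
	 * existing identical mapping is left alone; a conflicting one is
	 * removed first.
	 */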
1397 	for (;;) {
1398 		PV_PAGE_LOCK(m);
1399 		PMAP_LOCK(pmap);
1400 		if (pvo->pvo_pmap == NULL)
1401 			init_pvo_entry(pvo, pmap, va);
1402 		if (prot & VM_PROT_WRITE)
1403 			if (pmap_bootstrapped &&
1404 			    (m->oflags & VPO_UNMANAGED) == 0)
1405 				vm_page_aflag_set(m, PGA_WRITEABLE);
1406 
1407 		oldpvo = moea64_pvo_find_va(pmap, va);
1408 		if (oldpvo != NULL) {
1409 			if (oldpvo->pvo_vaddr == pvo->pvo_vaddr &&
1410 			    oldpvo->pvo_pte.pa == pvo->pvo_pte.pa &&
1411 			    oldpvo->pvo_pte.prot == prot) {
1412 				/* Identical mapping already exists */
1413 				error = 0;
1414 
1415 				/* If not in page table, reinsert it */
1416 				if (MOEA64_PTE_SYNCH(mmu, oldpvo) < 0) {
1417 					moea64_pte_overflow--;
1418 					MOEA64_PTE_INSERT(mmu, oldpvo);
1419 				}
1420 
1421 				/* Then just clean up and go home */
1422 				PV_PAGE_UNLOCK(m);
1423 				PMAP_UNLOCK(pmap);
1424 				free_pvo_entry(pvo);
1425 				break;
1426 			}
1427 
1428 			/* Otherwise, need to kill it first */
1429 			KASSERT(oldpvo->pvo_pmap == pmap, ("pmap of old "
1430 			    "mapping does not match new mapping"));
1431 			moea64_pvo_remove_from_pmap(mmu, oldpvo);
1432 		}
1433 		error = moea64_pvo_enter(mmu, pvo, pvo_head);
1434 		PV_PAGE_UNLOCK(m);
1435 		PMAP_UNLOCK(pmap);
1436 
1437 		/* Free any dead pages */
1438 		if (oldpvo != NULL) {
1439 			PV_LOCK(oldpvo->pvo_pte.pa & LPTE_RPGN);
1440 			moea64_pvo_remove_from_page(mmu, oldpvo);
1441 			PV_UNLOCK(oldpvo->pvo_pte.pa & LPTE_RPGN);
1442 			free_pvo_entry(oldpvo);
1443 		}
1444 
1445 		if (error != ENOMEM)
1446 			break;
1447 		if ((flags & PMAP_ENTER_NOSLEEP) != 0)
1448 			return (KERN_RESOURCE_SHORTAGE);
1449 		VM_OBJECT_ASSERT_UNLOCKED(m->object);
1450 		vm_wait(NULL);
1451 	}
1452 
1453 	/*
1454 	 * Flush the page from the instruction cache if this page is
1455 	 * mapped executable and cacheable.
1456 	 */
1457 	if (pmap != kernel_pmap && !(m->aflags & PGA_EXECUTABLE) &&
1458 	    (pte_lo & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) {
1459 		vm_page_aflag_set(m, PGA_EXECUTABLE);
1460 		moea64_syncicache(mmu, pmap, va, VM_PAGE_TO_PHYS(m), PAGE_SIZE);
1461 	}
1462 	return (KERN_SUCCESS);
1463 }
1464 
1465 static void
1466 moea64_syncicache(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
1467     vm_size_t sz)
1468 {
1469 
1470 	/*
1471 	 * This is much trickier than on older systems because
1472 	 * we can't sync the icache on physical addresses directly
1473 	 * without a direct map. Instead we check a couple of cases
1474 	 * where the memory is already mapped in and, failing that,
1475 	 * use the same trick we use for page zeroing to create
1476 	 * a temporary mapping for this physical address.
1477 	 */
1478 
1479 	if (!pmap_bootstrapped) {
1480 		/*
1481 		 * If PMAP is not bootstrapped, we are likely to be
1482 		 * in real mode.
1483 		 */
1484 		__syncicache((void *)(uintptr_t)pa, sz);
1485 	} else if (pmap == kernel_pmap) {
1486 		__syncicache((void *)va, sz);
1487 	} else if (hw_direct_map) {
1488 		__syncicache((void *)(uintptr_t)PHYS_TO_DMAP(pa), sz);
1489 	} else {
1490 		/* Use the scratch page to set up a temp mapping */
1491 
1492 		mtx_lock(&moea64_scratchpage_mtx);
1493 
1494 		moea64_set_scratchpage_pa(mmu, 1, pa & ~ADDR_POFF);
1495 		__syncicache((void *)(moea64_scratchpage_va[1] +
1496 		    (va & ADDR_POFF)), sz);
1497 
1498 		mtx_unlock(&moea64_scratchpage_mtx);
1499 	}
1500 }
1501 
1502 /*
1503  * Maps a sequence of resident pages belonging to the same object.
1504  * The sequence begins with the given page m_start.  This page is
1505  * mapped at the given virtual address start.  Each subsequent page is
1506  * mapped at a virtual address that is offset from start by the same
1507  * amount as the page is offset from m_start within the object.  The
1508  * last page in the sequence is the page with the largest offset from
1509  * m_start that can be mapped at a virtual address less than the given
1510  * virtual address end.  Not every virtual page between start and end
1511  * is mapped; only those for which a resident page exists with the
1512  * corresponding offset from m_start are mapped.
1513  */
1514 void
1515 moea64_enter_object(mmu_t mmu, pmap_t pm, vm_offset_t start, vm_offset_t end,
1516     vm_page_t m_start, vm_prot_t prot)
1517 {
1518 	vm_page_t m;
1519 	vm_pindex_t diff, psize;
1520 
1521 	VM_OBJECT_ASSERT_LOCKED(m_start->object);
1522 
1523 	psize = atop(end - start);
1524 	m = m_start;
1525 	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
1526 		moea64_enter(mmu, pm, start + ptoa(diff), m, prot &
1527 		    (VM_PROT_READ | VM_PROT_EXECUTE), PMAP_ENTER_NOSLEEP, 0);
1528 		m = TAILQ_NEXT(m, listq);
1529 	}
1530 }
1531 
1532 void
1533 moea64_enter_quick(mmu_t mmu, pmap_t pm, vm_offset_t va, vm_page_t m,
1534     vm_prot_t prot)
1535 {
1536 
1537 	moea64_enter(mmu, pm, va, m, prot & (VM_PROT_READ | VM_PROT_EXECUTE),
1538 	    PMAP_ENTER_NOSLEEP, 0);
1539 }
1540 
1541 vm_paddr_t
1542 moea64_extract(mmu_t mmu, pmap_t pm, vm_offset_t va)
1543 {
1544 	struct	pvo_entry *pvo;
1545 	vm_paddr_t pa;
1546 
1547 	PMAP_LOCK(pm);
1548 	pvo = moea64_pvo_find_va(pm, va);
1549 	if (pvo == NULL)
1550 		pa = 0;
1551 	else
1552 		pa = (pvo->pvo_pte.pa & LPTE_RPGN) | (va - PVO_VADDR(pvo));
1553 	PMAP_UNLOCK(pm);
1554 
1555 	return (pa);
1556 }
1557 
1558 /*
1559  * Atomically extract and hold the physical page with the given
1560  * pmap and virtual address pair if that mapping permits the given
1561  * protection.
1562  */
1563 vm_page_t
1564 moea64_extract_and_hold(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1565 {
1566 	struct	pvo_entry *pvo;
1567 	vm_page_t m;
1568 	vm_paddr_t pa;
1569 
1570 	m = NULL;
1571 	pa = 0;
1572 	PMAP_LOCK(pmap);
1573 retry:
1574 	pvo = moea64_pvo_find_va(pmap, va & ~ADDR_POFF);
1575 	if (pvo != NULL && (pvo->pvo_pte.prot & prot) == prot) {
1576 		if (vm_page_pa_tryrelock(pmap,
1577 		    pvo->pvo_pte.pa & LPTE_RPGN, &pa))
1578 			goto retry;
1579 		m = PHYS_TO_VM_PAGE(pvo->pvo_pte.pa & LPTE_RPGN);
1580 		vm_page_hold(m);
1581 	}
1582 	PA_UNLOCK_COND(pa);
1583 	PMAP_UNLOCK(pmap);
1584 	return (m);
1585 }
1586 
1587 static mmu_t installed_mmu;
1588 
1589 static void *
1590 moea64_uma_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain,
1591     uint8_t *flags, int wait)
1592 {
1593 	struct pvo_entry *pvo;
1594 	vm_offset_t va;
1595 	vm_page_t m;
1596 	int needed_lock;
1597 
1598 	/*
1599 	 * This entire routine is a horrible hack to avoid bothering kmem
1600 	 * for new KVA addresses. Because this can get called from inside
1601 	 * kmem allocation routines, calling kmem for a new address here
1602 	 * can lead to multiply locking non-recursive mutexes.
1603 	 */
1604 
1605 	*flags = UMA_SLAB_PRIV;
1606 	needed_lock = !PMAP_LOCKED(kernel_pmap);
1607 
1608 	m = vm_page_alloc_domain(NULL, 0, domain,
1609 	    malloc2vm_flags(wait) | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ);
1610 	if (m == NULL)
1611 		return (NULL);
1612 
1613 	va = VM_PAGE_TO_PHYS(m);
1614 
1615 	pvo = alloc_pvo_entry(1 /* bootstrap */);
1616 
1617 	pvo->pvo_pte.prot = VM_PROT_READ | VM_PROT_WRITE;
1618 	pvo->pvo_pte.pa = VM_PAGE_TO_PHYS(m) | LPTE_M;
1619 
1620 	if (needed_lock)
1621 		PMAP_LOCK(kernel_pmap);
1622 
1623 	init_pvo_entry(pvo, kernel_pmap, va);
1624 	pvo->pvo_vaddr |= PVO_WIRED;
1625 
1626 	moea64_pvo_enter(installed_mmu, pvo, NULL);
1627 
1628 	if (needed_lock)
1629 		PMAP_UNLOCK(kernel_pmap);
1630 
1631 	if ((wait & M_ZERO) && (m->flags & PG_ZERO) == 0)
1632 		bzero((void *)va, PAGE_SIZE);
1633 
1634 	return (void *)va;
1635 }
1636 
1637 extern int elf32_nxstack;
1638 
1639 void
1640 moea64_init(mmu_t mmu)
1641 {
1642 
1643 	CTR0(KTR_PMAP, "moea64_init");
1644 
1645 	moea64_pvo_zone = uma_zcreate("UPVO entry", sizeof (struct pvo_entry),
1646 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
1647 	    UMA_ZONE_VM | UMA_ZONE_NOFREE);
1648 
1649 	if (!hw_direct_map) {
1650 		installed_mmu = mmu;
1651 		uma_zone_set_allocf(moea64_pvo_zone, moea64_uma_page_alloc);
1652 	}
1653 
1654 #ifdef COMPAT_FREEBSD32
1655 	elf32_nxstack = 1;
1656 #endif
1657 
1658 	moea64_initialized = TRUE;
1659 }
1660 
1661 boolean_t
1662 moea64_is_referenced(mmu_t mmu, vm_page_t m)
1663 {
1664 
1665 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1666 	    ("moea64_is_referenced: page %p is not managed", m));
1667 
1668 	return (moea64_query_bit(mmu, m, LPTE_REF));
1669 }
1670 
1671 boolean_t
1672 moea64_is_modified(mmu_t mmu, vm_page_t m)
1673 {
1674 
1675 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1676 	    ("moea64_is_modified: page %p is not managed", m));
1677 
1678 	/*
1679 	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
1680 	 * concurrently set while the object is locked.  Thus, if PGA_WRITEABLE
1681 	 * is clear, no PTEs can have LPTE_CHG set.
1682 	 */
1683 	VM_OBJECT_ASSERT_LOCKED(m->object);
1684 	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
1685 		return (FALSE);
1686 	return (moea64_query_bit(mmu, m, LPTE_CHG));
1687 }
1688 
1689 boolean_t
1690 moea64_is_prefaultable(mmu_t mmu, pmap_t pmap, vm_offset_t va)
1691 {
1692 	struct pvo_entry *pvo;
1693 	boolean_t rv = TRUE;
1694 
1695 	PMAP_LOCK(pmap);
1696 	pvo = moea64_pvo_find_va(pmap, va & ~ADDR_POFF);
1697 	if (pvo != NULL)
1698 		rv = FALSE;
1699 	PMAP_UNLOCK(pmap);
1700 	return (rv);
1701 }
1702 
1703 void
1704 moea64_clear_modify(mmu_t mmu, vm_page_t m)
1705 {
1706 
1707 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1708 	    ("moea64_clear_modify: page %p is not managed", m));
1709 	VM_OBJECT_ASSERT_WLOCKED(m->object);
1710 	KASSERT(!vm_page_xbusied(m),
1711 	    ("moea64_clear_modify: page %p is exclusive busied", m));
1712 
1713 	/*
1714 	 * If the page is not PGA_WRITEABLE, then no PTEs can have LPTE_CHG
1715 	 * set.  If the object containing the page is locked and the page is
1716 	 * not exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
1717 	 */
1718 	if ((m->aflags & PGA_WRITEABLE) == 0)
1719 		return;
1720 	moea64_clear_bit(mmu, m, LPTE_CHG);
1721 }
1722 
1723 /*
1724  * Clear the write and modified bits in each of the given page's mappings.
1725  */
1726 void
1727 moea64_remove_write(mmu_t mmu, vm_page_t m)
1728 {
1729 	struct	pvo_entry *pvo;
1730 	int64_t	refchg, ret;
1731 	pmap_t	pmap;
1732 
1733 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1734 	    ("moea64_remove_write: page %p is not managed", m));
1735 
1736 	/*
1737 	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
1738 	 * set by another thread while the object is locked.  Thus,
1739 	 * if PGA_WRITEABLE is clear, no page table entries need updating.
1740 	 */
1741 	VM_OBJECT_ASSERT_WLOCKED(m->object);
1742 	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
1743 		return;
1744 	powerpc_sync();
1745 	PV_PAGE_LOCK(m);
1746 	refchg = 0;
1747 	LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
1748 		pmap = pvo->pvo_pmap;
1749 		PMAP_LOCK(pmap);
1750 		if (!(pvo->pvo_vaddr & PVO_DEAD) &&
1751 		    (pvo->pvo_pte.prot & VM_PROT_WRITE)) {
1752 			pvo->pvo_pte.prot &= ~VM_PROT_WRITE;
1753 			ret = MOEA64_PTE_REPLACE(mmu, pvo,
1754 			    MOEA64_PTE_PROT_UPDATE);
1755 			if (ret < 0)
1756 				ret = LPTE_CHG;
1757 			refchg |= ret;
1758 			if (pvo->pvo_pmap == kernel_pmap)
1759 				isync();
1760 		}
1761 		PMAP_UNLOCK(pmap);
1762 	}
1763 	if ((refchg | atomic_readandclear_32(&m->md.mdpg_attrs)) & LPTE_CHG)
1764 		vm_page_dirty(m);
1765 	vm_page_aflag_clear(m, PGA_WRITEABLE);
1766 	PV_PAGE_UNLOCK(m);
1767 }
1768 
1769 /*
1770  *	moea64_ts_referenced:
1771  *
1772  *	Return a count of reference bits for a page, clearing those bits.
1773  *	It is not necessary for every reference bit to be cleared, but it
1774  *	is necessary that 0 only be returned when there are truly no
1775  *	reference bits set.
1776  *
1777  *	XXX: The exact number of bits to check and clear is a matter that
1778  *	should be tested and standardized at some point in the future for
1779  *	optimal aging of shared pages.
1780  */
1781 int
1782 moea64_ts_referenced(mmu_t mmu, vm_page_t m)
1783 {
1784 
1785 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1786 	    ("moea64_ts_referenced: page %p is not managed", m));
1787 	return (moea64_clear_bit(mmu, m, LPTE_REF));
1788 }
1789 
1790 /*
1791  * Modify the WIMG settings of all mappings for a page.
1792  */
1793 void
1794 moea64_page_set_memattr(mmu_t mmu, vm_page_t m, vm_memattr_t ma)
1795 {
1796 	struct	pvo_entry *pvo;
1797 	int64_t	refchg;
1798 	pmap_t	pmap;
1799 	uint64_t lo;
1800 
1801 	if ((m->oflags & VPO_UNMANAGED) != 0) {
1802 		m->md.mdpg_cache_attrs = ma;
1803 		return;
1804 	}
1805 
1806 	lo = moea64_calc_wimg(VM_PAGE_TO_PHYS(m), ma);
1807 
1808 	PV_PAGE_LOCK(m);
1809 	LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
1810 		pmap = pvo->pvo_pmap;
1811 		PMAP_LOCK(pmap);
1812 		if (!(pvo->pvo_vaddr & PVO_DEAD)) {
1813 			pvo->pvo_pte.pa &= ~LPTE_WIMG;
1814 			pvo->pvo_pte.pa |= lo;
1815 			refchg = MOEA64_PTE_REPLACE(mmu, pvo,
1816 			    MOEA64_PTE_INVALIDATE);
1817 			if (refchg < 0)
1818 				refchg = (pvo->pvo_pte.prot & VM_PROT_WRITE) ?
1819 				    LPTE_CHG : 0;
1820 			if ((pvo->pvo_vaddr & PVO_MANAGED) &&
1821 			    (pvo->pvo_pte.prot & VM_PROT_WRITE)) {
1822 				refchg |=
1823 				    atomic_readandclear_32(&m->md.mdpg_attrs);
1824 				if (refchg & LPTE_CHG)
1825 					vm_page_dirty(m);
1826 				if (refchg & LPTE_REF)
1827 					vm_page_aflag_set(m, PGA_REFERENCED);
1828 			}
1829 			if (pvo->pvo_pmap == kernel_pmap)
1830 				isync();
1831 		}
1832 		PMAP_UNLOCK(pmap);
1833 	}
1834 	m->md.mdpg_cache_attrs = ma;
1835 	PV_PAGE_UNLOCK(m);
1836 }
1837 
1838 /*
1839  * Map a wired page into kernel virtual address space.
1840  */
1841 void
1842 moea64_kenter_attr(mmu_t mmu, vm_offset_t va, vm_paddr_t pa, vm_memattr_t ma)
1843 {
1844 	int		error;
1845 	struct pvo_entry *pvo, *oldpvo;
1846 
1847 	pvo = alloc_pvo_entry(0);
1848 	pvo->pvo_pte.prot = VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE;
1849 	pvo->pvo_pte.pa = (pa & ~ADDR_POFF) | moea64_calc_wimg(pa, ma);
1850 	pvo->pvo_vaddr |= PVO_WIRED;
1851 
1852 	PMAP_LOCK(kernel_pmap);
1853 	oldpvo = moea64_pvo_find_va(kernel_pmap, va);
1854 	if (oldpvo != NULL)
1855 		moea64_pvo_remove_from_pmap(mmu, oldpvo);
1856 	init_pvo_entry(pvo, kernel_pmap, va);
1857 	error = moea64_pvo_enter(mmu, pvo, NULL);
1858 	PMAP_UNLOCK(kernel_pmap);
1859 
1860 	/* Free any dead pages */
1861 	if (oldpvo != NULL) {
1862 		PV_LOCK(oldpvo->pvo_pte.pa & LPTE_RPGN);
1863 		moea64_pvo_remove_from_page(mmu, oldpvo);
1864 		PV_UNLOCK(oldpvo->pvo_pte.pa & LPTE_RPGN);
1865 		free_pvo_entry(oldpvo);
1866 	}
1867 
1868 	if (error != 0 && error != ENOENT)
1869 		panic("moea64_kenter: failed to enter va %#zx pa %#jx: %d", va,
1870 		    (uintmax_t)pa, error);
1871 }
1872 
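/*
 * Map a wired page into kernel virtual address space with the default
 * memory attributes.
 */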
1873 void
1874 moea64_kenter(mmu_t mmu, vm_offset_t va, vm_paddr_t pa)
1875 {
1876 
1877 	moea64_kenter_attr(mmu, va, pa, VM_MEMATTR_DEFAULT);
1878 }
1879 
1880 /*
1881  * Extract the physical page address associated with the given kernel virtual
1882  * address.
1883  */
1884 vm_paddr_t
1885 moea64_kextract(mmu_t mmu, vm_offset_t va)
1886 {
1887 	struct		pvo_entry *pvo;
1888 	vm_paddr_t pa;
1889 
1890 	/*
1891 	 * Shortcut the direct-mapped case when applicable.  We never put
1892 	 * anything but 1:1 (or 62-bit aliased) mappings below
1893 	 * VM_MIN_KERNEL_ADDRESS.
1894 	 */
1895 	if (va < VM_MIN_KERNEL_ADDRESS)
1896 		return (va & ~DMAP_BASE_ADDRESS);
1897 
1898 	PMAP_LOCK(kernel_pmap);
1899 	pvo = moea64_pvo_find_va(kernel_pmap, va);
1900 	KASSERT(pvo != NULL, ("moea64_kextract: no addr found for %#" PRIxPTR,
1901 	    va));
1902 	pa = (pvo->pvo_pte.pa & LPTE_RPGN) | (va - PVO_VADDR(pvo));
1903 	PMAP_UNLOCK(kernel_pmap);
1904 	return (pa);
1905 }
1906 
1907 /*
1908  * Remove a wired page from kernel virtual address space.
1909  */
1910 void
1911 moea64_kremove(mmu_t mmu, vm_offset_t va)
1912 {
1913 	moea64_remove(mmu, kernel_pmap, va, va + PAGE_SIZE);
1914 }
1915 
1916 /*
1917  * Provide a kernel pointer corresponding to a given userland pointer.
1918  * The returned pointer is valid until the next time this function is
1919  * called in this thread. This is used internally in copyin/copyout.
1920  */
1921 static int
1922 moea64_map_user_ptr(mmu_t mmu, pmap_t pm, volatile const void *uaddr,
1923     void **kaddr, size_t ulen, size_t *klen)
1924 {
1925 	size_t l;
1926 #ifdef __powerpc64__
1927 	struct slb *slb;
1928 #endif
1929 	register_t slbv;
1930 
1931 	*kaddr = (char *)USER_ADDR + ((uintptr_t)uaddr & ~SEGMENT_MASK);
1932 	l = ((char *)USER_ADDR + SEGMENT_LENGTH) - (char *)(*kaddr);
1933 	if (l > ulen)
1934 		l = ulen;
1935 	if (klen)
1936 		*klen = l;
1937 	else if (l != ulen)
1938 		return (EFAULT);
1939 
1940 #ifdef __powerpc64__
1941 	/* Try lockless look-up first */
1942 	slb = user_va_to_slb_entry(pm, (vm_offset_t)uaddr);
1943 
1944 	if (slb == NULL) {
1945 		/* If it isn't there, we need to pre-fault the VSID */
1946 		PMAP_LOCK(pm);
1947 		slbv = va_to_vsid(pm, (vm_offset_t)uaddr) << SLBV_VSID_SHIFT;
1948 		PMAP_UNLOCK(pm);
1949 	} else {
1950 		slbv = slb->slbv;
1951 	}
1952 
1953 	/* Mark segment no-execute */
1954 	slbv |= SLBV_N;
1955 #else
1956 	slbv = va_to_vsid(pm, (vm_offset_t)uaddr);
1957 
1958 	/* Mark segment no-execute */
1959 	slbv |= SR_N;
1960 #endif
1961 
1962 	/* If we have already set this VSID, we can just return */
1963 	if (curthread->td_pcb->pcb_cpu.aim.usr_vsid == slbv)
1964 		return (0);
1965 
1966 	__asm __volatile("isync");
1967 	curthread->td_pcb->pcb_cpu.aim.usr_segm =
1968 	    (uintptr_t)uaddr >> ADDR_SR_SHFT;
1969 	curthread->td_pcb->pcb_cpu.aim.usr_vsid = slbv;
1970 #ifdef __powerpc64__
1971 	__asm __volatile ("slbie %0; slbmte %1, %2; isync" ::
1972 	    "r"(USER_ADDR), "r"(slbv), "r"(USER_SLB_SLBE));
1973 #else
1974 	__asm __volatile("mtsr %0,%1; isync" :: "n"(USER_SR), "r"(slbv));
1975 #endif
1976 
1977 	return (0);
1978 }
1979 
1980 /*
1981  * Figure out where a given kernel pointer (usually in a fault) points
1982  * to from the VM's perspective, potentially remapping into userland's
1983  * address space.
1984  */
1985 static int
1986 moea64_decode_kernel_ptr(mmu_t mmu, vm_offset_t addr, int *is_user,
1987     vm_offset_t *decoded_addr)
1988 {
1989 	vm_offset_t user_sr;
1990 
1991 	if ((addr >> ADDR_SR_SHFT) == (USER_ADDR >> ADDR_SR_SHFT)) {
1992 		user_sr = curthread->td_pcb->pcb_cpu.aim.usr_segm;
1993 		addr &= ADDR_PIDX | ADDR_POFF;
1994 		addr |= user_sr << ADDR_SR_SHFT;
1995 		*decoded_addr = addr;
1996 		*is_user = 1;
1997 	} else {
1998 		*decoded_addr = addr;
1999 		*is_user = 0;
2000 	}
2001 
2002 	return (0);
2003 }
2004 
2005 /*
2006  * Map a range of physical addresses into kernel virtual address space.
2007  *
2008  * The value passed in *virt is a suggested virtual address for the mapping.
2009  * Architectures which can support a direct-mapped physical to virtual region
2010  * can return the appropriate address within that region, leaving '*virt'
2011  * unchanged.  Other architectures should map the pages starting at '*virt' and
2012  * update '*virt' with the first usable address after the mapped region.
2013  */
2014 vm_offset_t
2015 moea64_map(mmu_t mmu, vm_offset_t *virt, vm_paddr_t pa_start,
2016     vm_paddr_t pa_end, int prot)
2017 {
2018 	vm_offset_t	sva, va;
2019 
2020 	if (hw_direct_map) {
2021 		/*
2022 		 * Check if every page in the region is covered by the direct
2023 		 * map.  Since the direct map covers all of physical memory,
2024 		 * moea64_calc_wimg() is used as a shortcut to test whether
2025 		 * each page is backed by physical memory.
2026 		 */
2027 		for (va = pa_start; va < pa_end; va += PAGE_SIZE)
2028 			if (moea64_calc_wimg(va, VM_MEMATTR_DEFAULT) != LPTE_M)
2029 				break;
2030 		if (va == pa_end)
2031 			return (PHYS_TO_DMAP(pa_start));
2032 	}
2033 	sva = *virt;
2034 	va = sva;
2035 	/* XXX respect prot argument */
2036 	for (; pa_start < pa_end; pa_start += PAGE_SIZE, va += PAGE_SIZE)
2037 		moea64_kenter(mmu, va, pa_start);
2038 	*virt = va;
2039 
2040 	return (sva);
2041 }
2042 
2043 /*
2044  * Returns true if the pmap's pv is one of the first
2045  * 16 pvs linked to from this page.  This count may
2046  * be changed upwards or downwards in the future; it
2047  * is only necessary that true be returned for a small
2048  * subset of pmaps for proper page aging.
2049  */
2050 boolean_t
2051 moea64_page_exists_quick(mmu_t mmu, pmap_t pmap, vm_page_t m)
2052 {
2053 	int loops;
2054 	struct pvo_entry *pvo;
2055 	boolean_t rv;
2056 
2057 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2058 	    ("moea64_page_exists_quick: page %p is not managed", m));
2059 	loops = 0;
2060 	rv = FALSE;
2061 	PV_PAGE_LOCK(m);
2062 	LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
2063 		if (!(pvo->pvo_vaddr & PVO_DEAD) && pvo->pvo_pmap == pmap) {
2064 			rv = TRUE;
2065 			break;
2066 		}
2067 		if (++loops >= 16)
2068 			break;
2069 	}
2070 	PV_PAGE_UNLOCK(m);
2071 	return (rv);
2072 }
2073 
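/*
 * Initialize the machine-dependent fields of a newly allocated vm_page.
 */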
2074 void
2075 moea64_page_init(mmu_t mmu __unused, vm_page_t m)
2076 {
2077 
2078 	m->md.mdpg_attrs = 0;
2079 	m->md.mdpg_cache_attrs = VM_MEMATTR_DEFAULT;
2080 	LIST_INIT(&m->md.mdpg_pvoh);
2081 }
2082 
2083 /*
2084  * Return the number of managed mappings to the given physical page
2085  * that are wired.
2086  */
2087 int
2088 moea64_page_wired_mappings(mmu_t mmu, vm_page_t m)
2089 {
2090 	struct pvo_entry *pvo;
2091 	int count;
2092 
2093 	count = 0;
2094 	if ((m->oflags & VPO_UNMANAGED) != 0)
2095 		return (count);
2096 	PV_PAGE_LOCK(m);
2097 	LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink)
2098 		if ((pvo->pvo_vaddr & (PVO_DEAD | PVO_WIRED)) == PVO_WIRED)
2099 			count++;
2100 	PV_PAGE_UNLOCK(m);
2101 	return (count);
2102 }
2103 
2104 static uintptr_t	moea64_vsidcontext;
2105 
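/*
 * Allocate an unused VSID, mixing in entropy from the timebase register so
 * that allocations spread across the page table hash.
 */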
2106 uintptr_t
2107 moea64_get_unique_vsid(void) {
2108 	u_int entropy;
2109 	register_t hash;
2110 	uint32_t mask;
2111 	int i;
2112 
2113 	entropy = 0;
2114 	__asm __volatile("mftb %0" : "=r"(entropy));
2115 
2116 	mtx_lock(&moea64_slb_mutex);
2117 	for (i = 0; i < NVSIDS; i += VSID_NBPW) {
2118 		u_int	n;
2119 
2120 		/*
2121 		 * Create a new value by multiplying by a prime and adding in
2122 		 * entropy from the timebase register.  This is to make the
2123 		 * VSID more random so that the PT hash function collides
2124 		 * less often.  (Note that the prime causes gcc to do shifts
2125 		 * instead of a multiply.)
2126 		 */
2127 		moea64_vsidcontext = (moea64_vsidcontext * 0x1105) + entropy;
2128 		hash = moea64_vsidcontext & (NVSIDS - 1);
2129 		if (hash == 0)		/* 0 is special, avoid it */
2130 			continue;
2131 		n = hash >> 5;
2132 		mask = 1 << (hash & (VSID_NBPW - 1));
2133 		hash = (moea64_vsidcontext & VSID_HASHMASK);
2134 		if (moea64_vsid_bitmap[n] & mask) {	/* collision? */
2135 			/* anything free in this bucket? */
2136 			if (moea64_vsid_bitmap[n] == 0xffffffff) {
2137 				entropy = (moea64_vsidcontext >> 20);
2138 				continue;
2139 			}
2140 			i = ffs(~moea64_vsid_bitmap[n]) - 1;
2141 			mask = 1 << i;
2142 			hash &= rounddown2(VSID_HASHMASK, VSID_NBPW);
2143 			hash |= i;
2144 		}
2145 		if (hash == VSID_VRMA)	/* also special, avoid this too */
2146 			continue;
2147 		KASSERT(!(moea64_vsid_bitmap[n] & mask),
2148 		    ("Allocating in-use VSID %#zx\n", hash));
2149 		moea64_vsid_bitmap[n] |= mask;
2150 		mtx_unlock(&moea64_slb_mutex);
2151 		return (hash);
2152 	}
2153 
2154 	mtx_unlock(&moea64_slb_mutex);
2155 	panic("%s: out of segments", __func__);
2156 }
2157 
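/*
 * Initialize a pmap: set up its PVO tree and either its SLB cache (64-bit)
 * or its segment registers (32-bit).
 */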
2158 #ifdef __powerpc64__
2159 void
2160 moea64_pinit(mmu_t mmu, pmap_t pmap)
2161 {
2162 
2163 	RB_INIT(&pmap->pmap_pvo);
2164 
2165 	pmap->pm_slb_tree_root = slb_alloc_tree();
2166 	pmap->pm_slb = slb_alloc_user_cache();
2167 	pmap->pm_slb_len = 0;
2168 }
2169 #else
2170 void
2171 moea64_pinit(mmu_t mmu, pmap_t pmap)
2172 {
2173 	int	i;
2174 	uint32_t hash;
2175 
2176 	RB_INIT(&pmap->pmap_pvo);
2177 
2178 	if (pmap_bootstrapped)
2179 		pmap->pmap_phys = (pmap_t)moea64_kextract(mmu,
2180 		    (vm_offset_t)pmap);
2181 	else
2182 		pmap->pmap_phys = pmap;
2183 
2184 	/*
2185 	 * Allocate some segment registers for this pmap.
2186 	 */
2187 	hash = moea64_get_unique_vsid();
2188 
2189 	for (i = 0; i < 16; i++)
2190 		pmap->pm_sr[i] = VSID_MAKE(i, hash);
2191 
2192 	KASSERT(pmap->pm_sr[0] != 0, ("moea64_pinit: pm_sr[0] = 0"));
2193 }
2194 #endif
2195 
2196 /*
2197  * Initialize the pmap associated with process 0.
2198  */
2199 void
2200 moea64_pinit0(mmu_t mmu, pmap_t pm)
2201 {
2202 
2203 	PMAP_LOCK_INIT(pm);
2204 	moea64_pinit(mmu, pm);
2205 	bzero(&pm->pm_stats, sizeof(pm->pm_stats));
2206 }
2207 
2208 /*
2209  * Change the protection on a single mapping and update its page table entry.
2210  */
2211 static void
2212 moea64_pvo_protect(mmu_t mmu, pmap_t pm, struct pvo_entry *pvo, vm_prot_t prot)
2213 {
2214 	struct vm_page *pg;
2215 	vm_prot_t oldprot;
2216 	int32_t refchg;
2217 
2218 	PMAP_LOCK_ASSERT(pm, MA_OWNED);
2219 
2220 	/*
2221 	 * Change the protection of the page.
2222 	 */
2223 	oldprot = pvo->pvo_pte.prot;
2224 	pvo->pvo_pte.prot = prot;
2225 	pg = PHYS_TO_VM_PAGE(pvo->pvo_pte.pa & LPTE_RPGN);
2226 
2227 	/*
2228 	 * If the PVO is in the page table, update mapping
2229 	 */
2230 	refchg = MOEA64_PTE_REPLACE(mmu, pvo, MOEA64_PTE_PROT_UPDATE);
2231 	if (refchg < 0)
2232 		refchg = (oldprot & VM_PROT_WRITE) ? LPTE_CHG : 0;
2233 
2234 	if (pm != kernel_pmap && pg != NULL && !(pg->aflags & PGA_EXECUTABLE) &&
2235 	    (pvo->pvo_pte.pa & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) {
2236 		if ((pg->oflags & VPO_UNMANAGED) == 0)
2237 			vm_page_aflag_set(pg, PGA_EXECUTABLE);
2238 		moea64_syncicache(mmu, pm, PVO_VADDR(pvo),
2239 		    pvo->pvo_pte.pa & LPTE_RPGN, PAGE_SIZE);
2240 	}
2241 
2242 	/*
2243 	 * Update vm about the REF/CHG bits if the page is managed and we have
2244 	 * removed write access.
2245 	 */
2246 	if (pg != NULL && (pvo->pvo_vaddr & PVO_MANAGED) &&
2247 	    (oldprot & VM_PROT_WRITE)) {
2248 		refchg |= atomic_readandclear_32(&pg->md.mdpg_attrs);
2249 		if (refchg & LPTE_CHG)
2250 			vm_page_dirty(pg);
2251 		if (refchg & LPTE_REF)
2252 			vm_page_aflag_set(pg, PGA_REFERENCED);
2253 	}
2254 }
2255 
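/*
 * Set the physical protection on the specified range of this map as
 * requested.
 */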
2256 void
2257 moea64_protect(mmu_t mmu, pmap_t pm, vm_offset_t sva, vm_offset_t eva,
2258     vm_prot_t prot)
2259 {
2260 	struct	pvo_entry *pvo, *tpvo, key;
2261 
2262 	CTR4(KTR_PMAP, "moea64_protect: pm=%p sva=%#x eva=%#x prot=%#x", pm,
2263 	    sva, eva, prot);
2264 
2265 	KASSERT(pm == &curproc->p_vmspace->vm_pmap || pm == kernel_pmap,
2266 	    ("moea64_protect: non current pmap"));
2267 
2268 	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
2269 		moea64_remove(mmu, pm, sva, eva);
2270 		return;
2271 	}
2272 
2273 	PMAP_LOCK(pm);
2274 	key.pvo_vaddr = sva;
2275 	for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key);
2276 	    pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) {
2277 		tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo);
2278 		moea64_pvo_protect(mmu, pm, pvo, prot);
2279 	}
2280 	PMAP_UNLOCK(pm);
2281 }
2282 
2283 /*
2284  * Map a list of wired pages into kernel virtual address space.  This is
2285  * intended for temporary mappings which do not need page modification or
2286  * references recorded.  Existing mappings in the region are overwritten.
2287  */
2288 void
2289 moea64_qenter(mmu_t mmu, vm_offset_t va, vm_page_t *m, int count)
2290 {
2291 	while (count-- > 0) {
2292 		moea64_kenter(mmu, va, VM_PAGE_TO_PHYS(*m));
2293 		va += PAGE_SIZE;
2294 		m++;
2295 	}
2296 }
2297 
2298 /*
2299  * Remove page mappings from kernel virtual address space.  Intended for
2300  * temporary mappings entered by moea64_qenter.
2301  */
2302 void
2303 moea64_qremove(mmu_t mmu, vm_offset_t va, int count)
2304 {
2305 	while (count-- > 0) {
2306 		moea64_kremove(mmu, va);
2307 		va += PAGE_SIZE;
2308 	}
2309 }
2310 
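/*
 * Return a VSID allocated by moea64_get_unique_vsid() to the free bitmap.
 */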
2311 void
2312 moea64_release_vsid(uint64_t vsid)
2313 {
2314 	int idx, mask;
2315 
2316 	mtx_lock(&moea64_slb_mutex);
2317 	idx = vsid & (NVSIDS-1);
2318 	mask = 1 << (idx % VSID_NBPW);
2319 	idx /= VSID_NBPW;
2320 	KASSERT(moea64_vsid_bitmap[idx] & mask,
2321 	    ("Freeing unallocated VSID %#jx", vsid));
2322 	moea64_vsid_bitmap[idx] &= ~mask;
2323 	mtx_unlock(&moea64_slb_mutex);
2324 }
2325 
2326 
2327 void
2328 moea64_release(mmu_t mmu, pmap_t pmap)
2329 {
2330 
2331 	/*
2332 	 * Free segment registers' VSIDs
2333 	 */
2334 #ifdef __powerpc64__
2335 	slb_free_tree(pmap);
2336 	slb_free_user_cache(pmap->pm_slb);
2337 #else
2338 	KASSERT(pmap->pm_sr[0] != 0, ("moea64_release: pm_sr[0] = 0"));
2339 
2340 	moea64_release_vsid(VSID_TO_HASH(pmap->pm_sr[0]));
2341 #endif
2342 }
2343 
2344 /*
2345  * Remove all pages mapped by the specified pmap
2346  */
2347 void
2348 moea64_remove_pages(mmu_t mmu, pmap_t pm)
2349 {
2350 	struct pvo_entry *pvo, *tpvo;
2351 	struct pvo_tree tofree;
2352 
2353 	RB_INIT(&tofree);
2354 
2355 	PMAP_LOCK(pm);
2356 	RB_FOREACH_SAFE(pvo, pvo_tree, &pm->pmap_pvo, tpvo) {
2357 		if (pvo->pvo_vaddr & PVO_WIRED)
2358 			continue;
2359 
2360 		/*
2361 		 * For locking reasons, remove this from the page table and
2362 		 * pmap, but save delinking from the vm_page for a second
2363 		 * pass
2364 		 */
2365 		moea64_pvo_remove_from_pmap(mmu, pvo);
2366 		RB_INSERT(pvo_tree, &tofree, pvo);
2367 	}
2368 	PMAP_UNLOCK(pm);
2369 
2370 	RB_FOREACH_SAFE(pvo, pvo_tree, &tofree, tpvo) {
2371 		PV_LOCK(pvo->pvo_pte.pa & LPTE_RPGN);
2372 		moea64_pvo_remove_from_page(mmu, pvo);
2373 		PV_UNLOCK(pvo->pvo_pte.pa & LPTE_RPGN);
2374 		RB_REMOVE(pvo_tree, &tofree, pvo);
2375 		free_pvo_entry(pvo);
2376 	}
2377 }
2378 
2379 /*
2380  * Remove the given range of addresses from the specified map.
2381  */
2382 void
2383 moea64_remove(mmu_t mmu, pmap_t pm, vm_offset_t sva, vm_offset_t eva)
2384 {
2385 	struct  pvo_entry *pvo, *tpvo, key;
2386 	struct pvo_tree tofree;
2387 
2388 	/*
2389 	 * Perform an unsynchronized read.  This is, however, safe.
2390 	 */
2391 	if (pm->pm_stats.resident_count == 0)
2392 		return;
2393 
2394 	key.pvo_vaddr = sva;
2395 
2396 	RB_INIT(&tofree);
2397 
2398 	PMAP_LOCK(pm);
2399 	for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key);
2400 	    pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) {
2401 		tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo);
2402 
2403 		/*
2404 		 * For locking reasons, remove this from the page table and
2405 		 * pmap, but save delinking from the vm_page for a second
2406 		 * pass
2407 		 */
2408 		moea64_pvo_remove_from_pmap(mmu, pvo);
2409 		RB_INSERT(pvo_tree, &tofree, pvo);
2410 	}
2411 	PMAP_UNLOCK(pm);
2412 
2413 	RB_FOREACH_SAFE(pvo, pvo_tree, &tofree, tpvo) {
2414 		PV_LOCK(pvo->pvo_pte.pa & LPTE_RPGN);
2415 		moea64_pvo_remove_from_page(mmu, pvo);
2416 		PV_UNLOCK(pvo->pvo_pte.pa & LPTE_RPGN);
2417 		RB_REMOVE(pvo_tree, &tofree, pvo);
2418 		free_pvo_entry(pvo);
2419 	}
2420 }
2421 
2422 /*
2423  * Remove physical page from all pmaps in which it resides. moea64_pvo_remove()
2424  * will reflect changes in pte's back to the vm_page.
2425  */
2426 void
2427 moea64_remove_all(mmu_t mmu, vm_page_t m)
2428 {
2429 	struct	pvo_entry *pvo, *next_pvo;
2430 	struct	pvo_head freequeue;
2431 	int	wasdead;
2432 	pmap_t	pmap;
2433 
2434 	LIST_INIT(&freequeue);
2435 
2436 	PV_PAGE_LOCK(m);
2437 	LIST_FOREACH_SAFE(pvo, vm_page_to_pvoh(m), pvo_vlink, next_pvo) {
2438 		pmap = pvo->pvo_pmap;
2439 		PMAP_LOCK(pmap);
2440 		wasdead = (pvo->pvo_vaddr & PVO_DEAD);
2441 		if (!wasdead)
2442 			moea64_pvo_remove_from_pmap(mmu, pvo);
2443 		moea64_pvo_remove_from_page(mmu, pvo);
2444 		if (!wasdead)
2445 			LIST_INSERT_HEAD(&freequeue, pvo, pvo_vlink);
2446 		PMAP_UNLOCK(pmap);
2448 	}
2449 	KASSERT(!pmap_page_is_mapped(m), ("Page still has mappings"));
2450 	KASSERT(!(m->aflags & PGA_WRITEABLE), ("Page still writable"));
2451 	PV_PAGE_UNLOCK(m);
2452 
2453 	/* Clean up UMA allocations */
2454 	LIST_FOREACH_SAFE(pvo, &freequeue, pvo_vlink, next_pvo)
2455 		free_pvo_entry(pvo);
2456 }
2457 
2458 /*
2459  * Allocate a physical page of memory directly from the phys_avail map.
2460  * Can only be called from moea64_bootstrap before avail start and end are
2461  * calculated.
2462  */
2463 vm_offset_t
2464 moea64_bootstrap_alloc(vm_size_t size, vm_size_t align)
2465 {
2466 	vm_offset_t	s, e;
2467 	int		i, j;
2468 
2469 	size = round_page(size);
2470 	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
2471 		if (align != 0)
2472 			s = roundup2(phys_avail[i], align);
2473 		else
2474 			s = phys_avail[i];
2475 		e = s + size;
2476 
2477 		if (s < phys_avail[i] || e > phys_avail[i + 1])
2478 			continue;
2479 
2480 		if (s + size > platform_real_maxaddr())
2481 			continue;
2482 
2483 		if (s == phys_avail[i]) {
2484 			phys_avail[i] += size;
2485 		} else if (e == phys_avail[i + 1]) {
2486 			phys_avail[i + 1] -= size;
2487 		} else {
2488 			for (j = phys_avail_count * 2; j > i; j -= 2) {
2489 				phys_avail[j] = phys_avail[j - 2];
2490 				phys_avail[j + 1] = phys_avail[j - 1];
2491 			}
2492 
2493 			phys_avail[i + 3] = phys_avail[i + 1];
2494 			phys_avail[i + 1] = s;
2495 			phys_avail[i + 2] = e;
2496 			phys_avail_count++;
2497 		}
2498 
2499 		return (s);
2500 	}
2501 	panic("moea64_bootstrap_alloc: could not allocate memory");
2502 }
2503 
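/*
 * Insert an initialized PVO into the pmap's PVO tree, the optional per-page
 * PVO list, and the hardware page table.  Returns ENOENT if this is the
 * page's first mapping and 0 otherwise.
 */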
2504 static int
2505 moea64_pvo_enter(mmu_t mmu, struct pvo_entry *pvo, struct pvo_head *pvo_head)
2506 {
2507 	int first = 0, err;
2508 
2509 	PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED);
2510 	KASSERT(moea64_pvo_find_va(pvo->pvo_pmap, PVO_VADDR(pvo)) == NULL,
2511 	    ("Existing mapping for VA %#jx", (uintmax_t)PVO_VADDR(pvo)));
2512 
2513 	moea64_pvo_enter_calls++;
2514 
2515 	/*
2516 	 * Add to pmap list
2517 	 */
2518 	RB_INSERT(pvo_tree, &pvo->pvo_pmap->pmap_pvo, pvo);
2519 
2520 	/*
2521 	 * Remember if the list was empty and therefore will be the first
2522 	 * item.
2523 	 */
2524 	if (pvo_head != NULL) {
2525 		if (LIST_FIRST(pvo_head) == NULL)
2526 			first = 1;
2527 		LIST_INSERT_HEAD(pvo_head, pvo, pvo_vlink);
2528 	}
2529 
2530 	if (pvo->pvo_vaddr & PVO_WIRED)
2531 		pvo->pvo_pmap->pm_stats.wired_count++;
2532 	pvo->pvo_pmap->pm_stats.resident_count++;
2533 
2534 	/*
2535 	 * Insert it into the hardware page table
2536 	 */
2537 	err = MOEA64_PTE_INSERT(mmu, pvo);
2538 	if (err != 0) {
2539 		panic("moea64_pvo_enter: overflow");
2540 	}
2541 
2542 	moea64_pvo_entries++;
2543 
2544 	if (pvo->pvo_pmap == kernel_pmap)
2545 		isync();
2546 
2547 #ifdef __powerpc64__
2548 	/*
2549 	 * Make sure all our bootstrap mappings are in the SLB as soon
2550 	 * as virtual memory is switched on.
2551 	 */
2552 	if (!pmap_bootstrapped)
2553 		moea64_bootstrap_slb_prefault(PVO_VADDR(pvo),
2554 		    pvo->pvo_vaddr & PVO_LARGE);
2555 #endif
2556 
2557 	return (first ? ENOENT : 0);
2558 }
2559 
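/*
 * Remove a PVO from its pmap and the page table: unset the PTE, update the
 * pmap statistics, push harvested REF/CHG state back to the VM layer, and
 * mark the PVO dead.  Delinking from the vm_page is left to
 * moea64_pvo_remove_from_page().
 */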
2560 static void
2561 moea64_pvo_remove_from_pmap(mmu_t mmu, struct pvo_entry *pvo)
2562 {
2563 	struct	vm_page *pg;
2564 	int32_t refchg;
2565 
2566 	KASSERT(pvo->pvo_pmap != NULL, ("Trying to remove PVO with no pmap"));
2567 	PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED);
2568 	KASSERT(!(pvo->pvo_vaddr & PVO_DEAD), ("Trying to remove dead PVO"));
2569 
2570 	/*
2571 	 * If there is an active pte entry, we need to deactivate it
2572 	 */
2573 	refchg = MOEA64_PTE_UNSET(mmu, pvo);
2574 	if (refchg < 0) {
2575 		/*
2576 		 * If it was evicted from the page table, be pessimistic and
2577 		 * dirty the page.
2578 		 */
2579 		if (pvo->pvo_pte.prot & VM_PROT_WRITE)
2580 			refchg = LPTE_CHG;
2581 		else
2582 			refchg = 0;
2583 	}
2584 
2585 	/*
2586 	 * Update our statistics.
2587 	 */
2588 	pvo->pvo_pmap->pm_stats.resident_count--;
2589 	if (pvo->pvo_vaddr & PVO_WIRED)
2590 		pvo->pvo_pmap->pm_stats.wired_count--;
2591 
2592 	/*
2593 	 * Remove this PVO from the pmap list.
2594 	 */
2595 	RB_REMOVE(pvo_tree, &pvo->pvo_pmap->pmap_pvo, pvo);
2596 
2597 	/*
2598 	 * Mark this for the next sweep
2599 	 */
2600 	pvo->pvo_vaddr |= PVO_DEAD;
2601 
2602 	/* Send RC bits to VM */
2603 	if ((pvo->pvo_vaddr & PVO_MANAGED) &&
2604 	    (pvo->pvo_pte.prot & VM_PROT_WRITE)) {
2605 		pg = PHYS_TO_VM_PAGE(pvo->pvo_pte.pa & LPTE_RPGN);
2606 		if (pg != NULL) {
2607 			refchg |= atomic_readandclear_32(&pg->md.mdpg_attrs);
2608 			if (refchg & LPTE_CHG)
2609 				vm_page_dirty(pg);
2610 			if (refchg & LPTE_REF)
2611 				vm_page_aflag_set(pg, PGA_REFERENCED);
2612 		}
2613 	}
2614 }
2615 
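/*
 * Delink a dead PVO from its vm_page, clearing PGA_WRITEABLE and
 * PGA_EXECUTABLE once the page's last mapping is removed.
 */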
2616 static void
2617 moea64_pvo_remove_from_page(mmu_t mmu, struct pvo_entry *pvo)
2618 {
2619 	struct	vm_page *pg;
2620 
2621 	KASSERT(pvo->pvo_vaddr & PVO_DEAD, ("Trying to delink live page"));
2622 
2623 	/* Use NULL pmaps as a sentinel for races in page deletion */
2624 	if (pvo->pvo_pmap == NULL)
2625 		return;
2626 	pvo->pvo_pmap = NULL;
2627 
2628 	/*
2629 	 * Update vm about page writeability/executability if managed
2630 	 */
2631 	PV_LOCKASSERT(pvo->pvo_pte.pa & LPTE_RPGN);
2632 	if (pvo->pvo_vaddr & PVO_MANAGED) {
2633 		pg = PHYS_TO_VM_PAGE(pvo->pvo_pte.pa & LPTE_RPGN);
2634 
2635 		if (pg != NULL) {
2636 			LIST_REMOVE(pvo, pvo_vlink);
2637 			if (LIST_EMPTY(vm_page_to_pvoh(pg)))
2638 				vm_page_aflag_clear(pg,
2639 				    PGA_WRITEABLE | PGA_EXECUTABLE);
2640 		}
2641 	}
2642 
2643 	moea64_pvo_entries--;
2644 	moea64_pvo_remove_calls++;
2645 }
2646 
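/*
 * Find the PVO, if any, for a virtual address in the given pmap.
 */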
2647 static struct pvo_entry *
2648 moea64_pvo_find_va(pmap_t pm, vm_offset_t va)
2649 {
2650 	struct pvo_entry key;
2651 
2652 	PMAP_LOCK_ASSERT(pm, MA_OWNED);
2653 
2654 	key.pvo_vaddr = va & ~ADDR_POFF;
2655 	return (RB_FIND(pvo_tree, &pm->pmap_pvo, &key));
2656 }
2657 
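/*
 * Check whether any mapping of the page has the given REF/CHG bit set,
 * caching bits harvested from the PTEs in the page's md attributes.
 */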
2658 static boolean_t
2659 moea64_query_bit(mmu_t mmu, vm_page_t m, uint64_t ptebit)
2660 {
2661 	struct	pvo_entry *pvo;
2662 	int64_t ret;
2663 	boolean_t rv;
2664 
2665 	/*
2666 	 * See if this bit is stored in the page already.
2667 	 */
2668 	if (m->md.mdpg_attrs & ptebit)
2669 		return (TRUE);
2670 
2671 	/*
2672 	 * Examine each PTE.  Sync so that any pending REF/CHG bits are
2673 	 * flushed to the PTEs.
2674 	 */
2675 	rv = FALSE;
2676 	powerpc_sync();
2677 	PV_PAGE_LOCK(m);
2678 	LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
2679 		ret = 0;
2680 
2681 		/*
2682 		 * See if this pvo has a valid PTE.  If so, fetch the
2683 		 * REF/CHG bits from the valid PTE.  If the appropriate
2684 		 * ptebit is set, return success.
2685 		 */
2686 		PMAP_LOCK(pvo->pvo_pmap);
2687 		if (!(pvo->pvo_vaddr & PVO_DEAD))
2688 			ret = MOEA64_PTE_SYNCH(mmu, pvo);
2689 		PMAP_UNLOCK(pvo->pvo_pmap);
2690 
2691 		if (ret > 0) {
2692 			atomic_set_32(&m->md.mdpg_attrs,
2693 			    ret & (LPTE_CHG | LPTE_REF));
2694 			if (ret & ptebit) {
2695 				rv = TRUE;
2696 				break;
2697 			}
2698 		}
2699 	}
2700 	PV_PAGE_UNLOCK(m);
2701 
2702 	return (rv);
2703 }
2704 
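/*
 * Clear the given REF/CHG bit in every mapping of the page and return the
 * number of mappings in which it was set.
 */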
2705 static u_int
2706 moea64_clear_bit(mmu_t mmu, vm_page_t m, uint64_t ptebit)
2707 {
2708 	u_int	count;
2709 	struct	pvo_entry *pvo;
2710 	int64_t ret;
2711 
2712 	/*
2713 	 * Sync so that any pending REF/CHG bits are flushed to the PTEs (so
2714 	 * we can reset the right ones).
2715 	 */
2716 	powerpc_sync();
2717 
2718 	/*
2719 	 * For each pvo entry, clear the pte's ptebit.
2720 	 */
2721 	count = 0;
2722 	PV_PAGE_LOCK(m);
2723 	LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
2724 		ret = 0;
2725 
2726 		PMAP_LOCK(pvo->pvo_pmap);
2727 		if (!(pvo->pvo_vaddr & PVO_DEAD))
2728 			ret = MOEA64_PTE_CLEAR(mmu, pvo, ptebit);
2729 		PMAP_UNLOCK(pvo->pvo_pmap);
2730 
2731 		if (ret > 0 && (ret & ptebit))
2732 			count++;
2733 	}
2734 	atomic_clear_32(&m->md.mdpg_attrs, ptebit);
2735 	PV_PAGE_UNLOCK(m);
2736 
2737 	return (count);
2738 }
2739 
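/*
 * Return 0 if the given physical range is entirely covered by the direct
 * map or by existing 1:1 kernel mappings, EFAULT otherwise.
 */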
2740 boolean_t
2741 moea64_dev_direct_mapped(mmu_t mmu, vm_paddr_t pa, vm_size_t size)
2742 {
2743 	struct pvo_entry *pvo, key;
2744 	vm_offset_t ppa;
2745 	int error = 0;
2746 
2747 	if (hw_direct_map && mem_valid(pa, size) == 0)
2748 		return (0);
2749 
2750 	PMAP_LOCK(kernel_pmap);
2751 	ppa = pa & ~ADDR_POFF;
2752 	key.pvo_vaddr = DMAP_BASE_ADDRESS + ppa;
2753 	for (pvo = RB_FIND(pvo_tree, &kernel_pmap->pmap_pvo, &key);
2754 	    ppa < pa + size; ppa += PAGE_SIZE,
2755 	    pvo = RB_NEXT(pvo_tree, &kernel_pmap->pmap_pvo, pvo)) {
2756 		if (pvo == NULL || (pvo->pvo_pte.pa & LPTE_RPGN) != ppa) {
2757 			error = EFAULT;
2758 			break;
2759 		}
2760 	}
2761 	PMAP_UNLOCK(kernel_pmap);
2762 
2763 	return (error);
2764 }
2765 
2766 /*
2767  * Map a set of physical memory pages into the kernel virtual
2768  * address space. Return a pointer to where it is mapped. This
2769  * routine is intended to be used for mapping device memory,
2770  * NOT real memory.
2771  */
2772 void *
2773 moea64_mapdev_attr(mmu_t mmu, vm_paddr_t pa, vm_size_t size, vm_memattr_t ma)
2774 {
2775 	vm_offset_t va, tmpva, ppa, offset;
2776 
2777 	ppa = trunc_page(pa);
2778 	offset = pa & PAGE_MASK;
2779 	size = roundup2(offset + size, PAGE_SIZE);
2780 
2781 	va = kva_alloc(size);
2782 
2783 	if (!va)
2784 		panic("moea64_mapdev: Couldn't alloc kernel virtual memory");
2785 
2786 	for (tmpva = va; size > 0;) {
2787 		moea64_kenter_attr(mmu, tmpva, ppa, ma);
2788 		size -= PAGE_SIZE;
2789 		tmpva += PAGE_SIZE;
2790 		ppa += PAGE_SIZE;
2791 	}
2792 
2793 	return ((void *)(va + offset));
2794 }
2795 
2796 void *
2797 moea64_mapdev(mmu_t mmu, vm_paddr_t pa, vm_size_t size)
2798 {
2799 
2800 	return (moea64_mapdev_attr(mmu, pa, size, VM_MEMATTR_DEFAULT));
2801 }
2802 
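/*
 * Release the kernel virtual address range backing a mapping created by
 * moea64_mapdev_attr().
 */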
2803 void
2804 moea64_unmapdev(mmu_t mmu, vm_offset_t va, vm_size_t size)
2805 {
2806 	vm_offset_t base, offset;
2807 
2808 	base = trunc_page(va);
2809 	offset = va & PAGE_MASK;
2810 	size = roundup2(offset + size, PAGE_SIZE);
2811 
2812 	kva_free(base, size);
2813 }
2814 
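/*
 * Synchronize the instruction cache over a range of addresses in the given
 * pmap, one page at a time, skipping cache-inhibited mappings.
 */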
2815 void
2816 moea64_sync_icache(mmu_t mmu, pmap_t pm, vm_offset_t va, vm_size_t sz)
2817 {
2818 	struct pvo_entry *pvo;
2819 	vm_offset_t lim;
2820 	vm_paddr_t pa;
2821 	vm_size_t len;
2822 
2823 	PMAP_LOCK(pm);
2824 	while (sz > 0) {
2825 		lim = round_page(va+1);
2826 		len = MIN(lim - va, sz);
2827 		pvo = moea64_pvo_find_va(pm, va & ~ADDR_POFF);
2828 		if (pvo != NULL && !(pvo->pvo_pte.pa & LPTE_I)) {
2829 			pa = (pvo->pvo_pte.pa & LPTE_RPGN) | (va & ADDR_POFF);
2830 			moea64_syncicache(mmu, pm, va, pa, len);
2831 		}
2832 		va += len;
2833 		sz -= len;
2834 	}
2835 	PMAP_UNLOCK(pm);
2836 }
2837 
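/*
 * Map a physical page for dumpsys(); the physical address is used directly
 * as the kernel virtual address.
 */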
2838 void
2839 moea64_dumpsys_map(mmu_t mmu, vm_paddr_t pa, size_t sz, void **va)
2840 {
2841 
2842 	*va = (void *)(uintptr_t)pa;
2843 }
2844 
2845 extern struct dump_pa dump_map[PHYS_AVAIL_SZ + 1];
2846 
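/*
 * Populate dump_map[] with the physical regions (full dump) or kernel
 * virtual regions (minidump) to be written by dumpsys().
 */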
2847 void
2848 moea64_scan_init(mmu_t mmu)
2849 {
2850 	struct pvo_entry *pvo;
2851 	vm_offset_t va;
2852 	int i;
2853 
2854 	if (!do_minidump) {
2855 		/* Initialize phys. segments for dumpsys(). */
2856 		memset(&dump_map, 0, sizeof(dump_map));
2857 		mem_regions(&pregions, &pregions_sz, &regions, &regions_sz);
2858 		for (i = 0; i < pregions_sz; i++) {
2859 			dump_map[i].pa_start = pregions[i].mr_start;
2860 			dump_map[i].pa_size = pregions[i].mr_size;
2861 		}
2862 		return;
2863 	}
2864 
2865 	/* Virtual segments for minidumps: */
2866 	memset(&dump_map, 0, sizeof(dump_map));
2867 
2868 	/* 1st: kernel .data and .bss. */
2869 	dump_map[0].pa_start = trunc_page((uintptr_t)_etext);
2870 	dump_map[0].pa_size = round_page((uintptr_t)_end) -
2871 	    dump_map[0].pa_start;
2872 
2873 	/* 2nd: msgbuf and tables (see pmap_bootstrap()). */
2874 	dump_map[1].pa_start = (vm_paddr_t)(uintptr_t)msgbufp->msg_ptr;
2875 	dump_map[1].pa_size = round_page(msgbufp->msg_size);
2876 
2877 	/* 3rd: kernel VM. */
2878 	va = dump_map[1].pa_start + dump_map[1].pa_size;
2879 	/* Find start of next chunk (from va). */
2880 	while (va < virtual_end) {
2881 		/* Don't dump the buffer cache. */
2882 		if (va >= kmi.buffer_sva && va < kmi.buffer_eva) {
2883 			va = kmi.buffer_eva;
2884 			continue;
2885 		}
2886 		pvo = moea64_pvo_find_va(kernel_pmap, va & ~ADDR_POFF);
2887 		if (pvo != NULL && !(pvo->pvo_vaddr & PVO_DEAD))
2888 			break;
2889 		va += PAGE_SIZE;
2890 	}
2891 	if (va < virtual_end) {
2892 		dump_map[2].pa_start = va;
2893 		va += PAGE_SIZE;
2894 		/* Find last page in chunk. */
2895 		while (va < virtual_end) {
2896 			/* Don't run into the buffer cache. */
2897 			if (va == kmi.buffer_sva)
2898 				break;
2899 			pvo = moea64_pvo_find_va(kernel_pmap, va & ~ADDR_POFF);
2900 			if (pvo == NULL || (pvo->pvo_vaddr & PVO_DEAD))
2901 				break;
2902 			va += PAGE_SIZE;
2903 		}
2904 		dump_map[2].pa_size = va - dump_map[2].pa_start;
2905 	}
2906 }
2907 
2908