xref: /titanic_51/usr/src/uts/sun4u/os/ppage.c (revision b3697b90e692e3e5d859fb77d285d4c056d99eda)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/systm.h>
30 #include <sys/archsystm.h>
31 #include <sys/machsystm.h>
32 #include <sys/t_lock.h>
33 #include <sys/vmem.h>
34 #include <sys/mman.h>
35 #include <sys/vm.h>
36 #include <sys/cpu.h>
37 #include <sys/cmn_err.h>
38 #include <sys/cpuvar.h>
39 #include <sys/atomic.h>
40 #include <vm/as.h>
41 #include <vm/hat.h>
42 #include <vm/as.h>
43 #include <vm/page.h>
44 #include <vm/seg.h>
45 #include <vm/seg_kmem.h>
46 #include <vm/seg_kpm.h>
47 #include <vm/hat_sfmmu.h>
48 #include <sys/debug.h>
49 #include <sys/cpu_module.h>
50 #include <sys/mem_cage.h>
51 
52 /*
53  * A quick way to generate a cache consistent address to map in a page.
54  * users: ppcopy, pagezero, /proc, dev/mem
55  *
56  * The ppmapin/ppmapout routines provide a quick way of generating a cache
57  * consistent address by reserving a given amount of kernel address space.
58  * The base is PPMAPBASE and its size is PPMAPSIZE.  This memory is divided
59  * into x number of sets, where x is the number of colors for the virtual
60  * cache. The number of colors is how many times a page can be mapped
61  * simultaneously in the cache.  For direct map caches this translates to
62  * the number of pages in the cache.
63  * Each set will be assigned a group of virtual pages from the reserved memory
64  * depending on its virtual color.
65  * When trying to assign a virtual address we will find out the color for the
66  * physical page in question (if applicable).  Then we will try to find an
67  * available virtual page from the set of the appropriate color.
68  */
69 
/*
 * Convert a (color, set) pair into an index into ppmap_vaddrs[].
 * All entries of one color are contiguous, nsets entries per color.
 * Both arguments are parenthesized so that expression arguments
 * (e.g. "c + 1") are evaluated as a unit.
 */
#define	clsettoarray(color, set) (((color) * nsets) + (set))
71 
72 int pp_slots = 4;		/* small default, tuned by cpu module */
73 
74 /* tuned by cpu module, default is "safe" */
75 int pp_consistent_coloring = PPAGE_STORES_POLLUTE | PPAGE_LOADS_POLLUTE;
76 
77 static caddr_t	ppmap_vaddrs[PPMAPSIZE / MMU_PAGESIZE];
78 static int	nsets;			/* number of sets */
79 static int	ppmap_pages;		/* generate align mask */
80 static int	ppmap_shift;		/* set selector */
81 
82 #ifdef PPDEBUG
83 #define		MAXCOLORS	16	/* for debug only */
84 static int	ppalloc_noslot = 0;	/* # of allocations from kernelmap */
85 static int	align_hits[MAXCOLORS];
86 static int	pp_allocs;		/* # of ppmapin requests */
87 #endif /* PPDEBUG */
88 
89 /*
90  * There are only 64 TLB entries on spitfire, 16 on cheetah
91  * (fully-associative TLB) so we allow the cpu module to tune the
92  * number to use here via pp_slots.
93  */
94 static struct ppmap_va {
95 	caddr_t	ppmap_slots[MAXPP_SLOTS];
96 } ppmap_va[NCPU];
97 
98 void
99 ppmapinit(void)
100 {
101 	int color, nset, setsize;
102 	caddr_t va;
103 
104 	ASSERT(pp_slots <= MAXPP_SLOTS);
105 
106 	va = (caddr_t)PPMAPBASE;
107 	if (cache & CACHE_VAC) {
108 		int a;
109 
110 		ppmap_pages = mmu_btop(shm_alignment);
111 		nsets = PPMAPSIZE / shm_alignment;
112 		setsize = shm_alignment;
113 		ppmap_shift = MMU_PAGESHIFT;
114 		a = ppmap_pages;
115 		while (a >>= 1)
116 			ppmap_shift++;
117 	} else {
118 		/*
119 		 * If we do not have a virtual indexed cache we simply
120 		 * have only one set containing all pages.
121 		 */
122 		ppmap_pages = 1;
123 		nsets = mmu_btop(PPMAPSIZE);
124 		setsize = MMU_PAGESIZE;
125 		ppmap_shift = MMU_PAGESHIFT;
126 	}
127 	for (color = 0; color < ppmap_pages; color++) {
128 		for (nset = 0; nset < nsets; nset++) {
129 			ppmap_vaddrs[clsettoarray(color, nset)] =
130 			    (caddr_t)((uintptr_t)va + (nset * setsize));
131 		}
132 		va += MMU_PAGESIZE;
133 	}
134 }
135 
136 /*
137  * Allocate a cache consistent virtual address to map a page, pp,
138  * with protection, vprot; and map it in the MMU, using the most
139  * efficient means possible.  The argument avoid is a virtual address
140  * hint which when masked yields an offset into a virtual cache
141  * that should be avoided when allocating an address to map in a
142  * page.  An avoid arg of -1 means you don't care, for instance pagezero.
143  *
144  * machine dependent, depends on virtual address space layout,
145  * understands that all kernel addresses have bit 31 set.
146  *
147  * NOTE: For sun4 platforms the meaning of the hint argument is opposite from
148  * that found in other architectures.  In other architectures the hint
149  * (called avoid) was used to ask ppmapin to NOT use the specified cache color.
150  * This was used to avoid virtual cache trashing in the bcopy.  Unfortunately
151  * in the case of a COW,  this later on caused a cache aliasing conflict.  In
152  * sun4, the bcopy routine uses the block ld/st instructions so we don't have
153  * to worry about virtual cache thrashing.  Actually, by using the hint to choose
154  * the right color we can almost guarantee a cache conflict will not occur.
155  */
156 
157 caddr_t
158 ppmapin(page_t *pp, uint_t vprot, caddr_t hint)
159 {
160 	int color, nset, index, start;
161 	caddr_t va;
162 
163 #ifdef PPDEBUG
164 	pp_allocs++;
165 #endif /* PPDEBUG */
166 	if (cache & CACHE_VAC) {
167 		color = sfmmu_get_ppvcolor(pp);
168 		if (color == -1) {
169 			if ((intptr_t)hint != -1L) {
170 				color = addr_to_vcolor(hint);
171 			} else {
172 				color = addr_to_vcolor(mmu_ptob(pp->p_pagenum));
173 			}
174 		}
175 
176 	} else {
177 		/*
178 		 * For physical caches, we can pick any address we want.
179 		 */
180 		color = 0;
181 	}
182 
183 	start = color;
184 	do {
185 		for (nset = 0; nset < nsets; nset++) {
186 			index = clsettoarray(color, nset);
187 			va = ppmap_vaddrs[index];
188 			if (va != NULL) {
189 #ifdef PPDEBUG
190 				align_hits[color]++;
191 #endif /* PPDEBUG */
192 				if (casptr(&ppmap_vaddrs[index],
193 				    va, NULL) == va) {
194 					hat_memload(kas.a_hat, va, pp,
195 					    vprot | HAT_NOSYNC,
196 					    HAT_LOAD_LOCK);
197 					return (va);
198 				}
199 			}
200 		}
201 		/*
202 		 * first pick didn't succeed, try another
203 		 */
204 		if (++color == ppmap_pages)
205 			color = 0;
206 	} while (color != start);
207 
208 #ifdef PPDEBUG
209 	ppalloc_noslot++;
210 #endif /* PPDEBUG */
211 
212 	/*
213 	 * No free slots; get a random one from the kernel heap area.
214 	 */
215 	va = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
216 
217 	hat_memload(kas.a_hat, va, pp, vprot | HAT_NOSYNC, HAT_LOAD_LOCK);
218 
219 	return (va);
220 
221 }
222 
223 void
224 ppmapout(caddr_t va)
225 {
226 	int color, nset, index;
227 
228 	if (va >= kernelheap && va < ekernelheap) {
229 		/*
230 		 * Space came from kernelmap, flush the page and
231 		 * return the space.
232 		 */
233 		hat_unload(kas.a_hat, va, PAGESIZE,
234 		    (HAT_UNLOAD_NOSYNC | HAT_UNLOAD_UNLOCK));
235 		vmem_free(heap_arena, va, PAGESIZE);
236 	} else {
237 		/*
238 		 * Space came from ppmap_vaddrs[], give it back.
239 		 */
240 		color = addr_to_vcolor(va);
241 		ASSERT((cache & CACHE_VAC)? (color < ppmap_pages) : 1);
242 
243 		nset = ((uintptr_t)va >> ppmap_shift) & (nsets - 1);
244 		index = clsettoarray(color, nset);
245 		hat_unload(kas.a_hat, va, PAGESIZE,
246 		    (HAT_UNLOAD_NOSYNC | HAT_UNLOAD_UNLOCK));
247 
248 		ASSERT(ppmap_vaddrs[index] == NULL);
249 		ppmap_vaddrs[index] = va;
250 	}
251 }
252 
/*
 * Statistics counters, maintained on DEBUG kernels only.
 * PP_STAT_ADD() always expands to a single, fully parenthesized
 * expression so it is safe both as a statement and inside a larger
 * expression; on non-DEBUG kernels it is a no-op.
 */
#ifdef DEBUG
#define	PP_STAT_ADD(stat)	((stat)++)
uint_t pload, ploadfail;
uint_t ppzero, ppzero_short;
#else
#define	PP_STAT_ADD(stat)	((void)0)
#endif /* DEBUG */
260 
261 /*
262  * Find a slot in per CPU page copy area. Load up a locked TLB in the
263  * running cpu. We don't call hat layer to load up the tte since the
264  * mapping is only temporary. If the thread migrates it'll get a TLB
265  * miss trap and TLB/TSB miss handler will panic since there is no
266  * official hat record of this mapping.
267  */
268 static caddr_t
269 pp_load_tlb(processorid_t cpu, caddr_t **pslot, page_t *pp, uint_t prot)
270 {
271 	struct ppmap_va	*ppmap;
272 	tte_t		tte;
273 	caddr_t		*myslot;
274 	caddr_t		va;
275 	long		i, start, stride;
276 	int		vcolor;
277 	uint_t		flags, strict_flag;
278 
279 	PP_STAT_ADD(pload);
280 
281 	ppmap = &ppmap_va[cpu];
282 	va = (caddr_t)(PPMAP_FAST_BASE + (MMU_PAGESIZE * MAXPP_SLOTS) * cpu);
283 	myslot = ppmap->ppmap_slots;
284 	ASSERT(addr_to_vcolor(va) == 0);
285 
286 	if (prot & TTE_HWWR_INT) {
287 		flags = PPAGE_STORE_VCOLORING | PPAGE_STORES_POLLUTE;
288 		strict_flag = PPAGE_STORES_POLLUTE;
289 	} else {
290 		flags = PPAGE_LOAD_VCOLORING | PPAGE_LOADS_POLLUTE;
291 		strict_flag = PPAGE_LOADS_POLLUTE;
292 	}
293 
294 	/*
295 	 * If consistent handling is required then keep the current
296 	 * vcolor of the page.  Furthermore, if loads or stores can
297 	 * pollute the VAC then using a "new" page (unassigned vcolor)
298 	 * won't work and we have to return a failure.
299 	 */
300 	if (pp_consistent_coloring & flags) {
301 		vcolor = sfmmu_get_ppvcolor(pp);
302 		if ((vcolor == -1) &&
303 		    (pp_consistent_coloring & strict_flag))
304 			return (NULL);
305 		/* else keep the current vcolor of the page */
306 	} else {
307 		vcolor = -1;
308 	}
309 
310 	if (vcolor != -1) {
311 		va += MMU_PAGESIZE * vcolor;
312 		start = vcolor;
313 		stride = ppmap_pages; /* number of colors */
314 		myslot += vcolor;
315 	} else {
316 		start = 0;
317 		stride = 1;
318 	}
319 
320 	for (i = start; i < pp_slots; i += stride) {
321 		if (*myslot == NULL) {
322 			if (casptr(myslot, NULL, va) == NULL)
323 				break;
324 		}
325 		myslot += stride;
326 		va += MMU_PAGESIZE * stride;
327 	}
328 
329 	if (i >= pp_slots) {
330 		PP_STAT_ADD(ploadfail);
331 		return (NULL);
332 	}
333 
334 	ASSERT(vcolor == -1 || addr_to_vcolor(va) == vcolor);
335 
336 	/*
337 	 * Now we have a slot we can use, make the tte.
338 	 */
339 	tte.tte_inthi = TTE_VALID_INT | TTE_PFN_INTHI(pp->p_pagenum);
340 	tte.tte_intlo = TTE_PFN_INTLO(pp->p_pagenum) | TTE_CP_INT |
341 	    TTE_CV_INT | TTE_PRIV_INT | TTE_LCK_INT | prot;
342 
343 	ASSERT(CPU->cpu_id == cpu);
344 	sfmmu_dtlb_ld_kva(va, &tte);
345 
346 	*pslot = myslot;	/* Return ptr to the slot we used. */
347 
348 	return (va);
349 }
350 
351 static void
352 pp_unload_tlb(caddr_t *pslot, caddr_t va)
353 {
354 	ASSERT(*pslot == va);
355 
356 	vtag_flushpage(va, (uint64_t)ksfmmup);
357 	*pslot = NULL;				/* release the slot */
358 }
359 
360 /*
361  * Common copy routine which attempts to use hwblkpagecopy.  If this routine
362  * can't be used, failure (0) will be returned.  Otherwise, a PAGESIZE page
363  * will be copied and success (1) will be returned.
364  */
365 int
366 ppcopy_common(page_t *fm_pp, page_t *to_pp)
367 {
368 	caddr_t fm_va, to_va;
369 	caddr_t	*fm_slot, *to_slot;
370 	processorid_t cpu;
371 	label_t ljb;
372 	int ret = 1;
373 
374 	ASSERT(fm_pp != NULL && PAGE_LOCKED(fm_pp));
375 	ASSERT(to_pp != NULL && PAGE_LOCKED(to_pp));
376 
377 	/*
378 	 * If we can't use VIS block loads and stores we can't use
379 	 * pp_load_tlb/pp_unload_tlb due to the possibility of
380 	 * d$ aliasing.
381 	 */
382 	if (!use_hw_bcopy && (cache & CACHE_VAC))
383 		return (0);
384 
385 	kpreempt_disable();
386 	cpu = CPU->cpu_id;
387 	fm_va = pp_load_tlb(cpu, &fm_slot, fm_pp, 0);
388 	if (fm_va == NULL) {
389 		kpreempt_enable();
390 		return (0);
391 	}
392 	to_va = pp_load_tlb(cpu, &to_slot, to_pp, TTE_HWWR_INT);
393 	if (to_va == NULL) {
394 		pp_unload_tlb(fm_slot, fm_va);
395 		kpreempt_enable();
396 		return (0);
397 	}
398 	if (on_fault(&ljb)) {
399 		ret = 0;
400 		goto faulted;
401 	}
402 	hwblkpagecopy(fm_va, to_va);
403 	no_fault();
404 faulted:
405 	ASSERT(CPU->cpu_id == cpu);
406 	pp_unload_tlb(fm_slot, fm_va);
407 	pp_unload_tlb(to_slot, to_va);
408 	kpreempt_enable();
409 	return (ret);
410 }
411 
412 /*
413  * Routine to copy kernel pages during relocation.  It will copy one
414  * PAGESIZE page to another PAGESIZE page.  This function may be called
415  * above LOCK_LEVEL so it should not grab any locks.
416  */
417 void
418 ppcopy_kernel__relocatable(page_t *fm_pp, page_t *to_pp)
419 {
420 	uint64_t fm_pa, to_pa;
421 	size_t nbytes;
422 
423 	fm_pa = (uint64_t)(fm_pp->p_pagenum) << MMU_PAGESHIFT;
424 	to_pa = (uint64_t)(to_pp->p_pagenum) << MMU_PAGESHIFT;
425 
426 	nbytes = MMU_PAGESIZE;
427 
428 	for (; nbytes > 0; fm_pa += 32, to_pa += 32, nbytes -= 32)
429 		hw_pa_bcopy32(fm_pa, to_pa);
430 }
431 
432 /*
433  * Copy the data from the physical page represented by "frompp" to
434  * that represented by "topp".
435  *
436  * Try to use per cpu mapping first, if that fails then call pp_mapin
437  * to load it.
438  *
439  * Returns one on success or zero on some sort of fault while doing the copy.
440  */
441 int
442 ppcopy(page_t *fm_pp, page_t *to_pp)
443 {
444 	caddr_t fm_va, to_va;
445 	label_t ljb;
446 	int ret = 1;
447 	boolean_t	use_kpm = B_FALSE;
448 
449 	/* Try the fast path first */
450 	if (ppcopy_common(fm_pp, to_pp))
451 		return (1);
452 
453 	/*
454 	 * Try to map using KPM if enabled and we are the cageout thread.
455 	 * If it fails, fall back to ppmapin/ppmaput
456 	 */
457 
458 	if (kpm_enable) {
459 		if (curthread == kcage_cageout_thread)
460 			use_kpm = B_TRUE;
461 	}
462 
463 	if (use_kpm) {
464 		if ((fm_va = hat_kpm_mapin(fm_pp, NULL)) == NULL ||
465 		    (to_va = hat_kpm_mapin(to_pp, NULL)) == NULL) {
466 			if (fm_va != NULL)
467 				hat_kpm_mapout(fm_pp, NULL, fm_va);
468 			use_kpm = B_FALSE;
469 		}
470 	}
471 
472 	if (use_kpm == B_FALSE) {
473 		/* do the slow path */
474 		fm_va = ppmapin(fm_pp, PROT_READ, (caddr_t)-1);
475 		to_va = ppmapin(to_pp, PROT_READ | PROT_WRITE, fm_va);
476 		if (on_fault(&ljb)) {
477 			ret = 0;
478 			goto faulted;
479 		}
480 	}
481 	bcopy(fm_va, to_va, PAGESIZE);
482 	no_fault();
483 faulted:
484 	/* unmap */
485 	if (use_kpm == B_TRUE) {
486 		hat_kpm_mapout(fm_pp, NULL, fm_va);
487 		hat_kpm_mapout(to_pp, NULL, to_va);
488 	} else {
489 		ppmapout(fm_va);
490 		ppmapout(to_va);
491 	}
492 	return (ret);
493 }
494 
495 /*
496  * Zero the physical page from off to off + len given by `pp'
497  * without changing the reference and modified bits of page.
498  *
499  * Again, we'll try per cpu mapping first.
500  */
501 void
502 pagezero(page_t *pp, uint_t off, uint_t len)
503 {
504 	caddr_t va;
505 	caddr_t *slot;
506 	int fast = 1;
507 	processorid_t cpu;
508 	extern int hwblkclr(void *, size_t);
509 	extern int use_hw_bzero;
510 
511 	ASSERT((int)len > 0 && (int)off >= 0 && off + len <= PAGESIZE);
512 	ASSERT(PAGE_LOCKED(pp));
513 
514 	PP_STAT_ADD(ppzero);
515 
516 	if (len != MMU_PAGESIZE || !use_hw_bzero) {
517 		/*
518 		 * Since the fast path doesn't do anything about
519 		 * VAC coloring, we make sure bcopy h/w will be used.
520 		 */
521 		fast = 0;
522 		va = NULL;
523 		PP_STAT_ADD(ppzero_short);
524 	}
525 
526 	kpreempt_disable();
527 
528 	if (fast) {
529 		cpu = CPU->cpu_id;
530 		va = pp_load_tlb(cpu, &slot, pp, TTE_HWWR_INT);
531 	}
532 
533 	if (va == NULL) {
534 		/*
535 		 * We are here either length != MMU_PAGESIZE or pp_load_tlb()
536 		 * returns NULL or use_hw_bzero is disabled.
537 		 */
538 		va = ppmapin(pp, PROT_READ | PROT_WRITE, (caddr_t)-1);
539 		fast = 0;
540 	}
541 
542 	if (hwblkclr(va + off, len)) {
543 		/*
544 		 * We may not have used block commit asi.
545 		 * So flush the I-$ manually
546 		 */
547 
548 		ASSERT(fast == 0);
549 
550 		sync_icache(va + off, len);
551 	} else {
552 		/*
553 		 * We have used blk commit, and flushed the I-$. However we
554 		 * still may have an instruction in the pipeline. Only a flush
555 		 * instruction will invalidate that.
556 		 */
557 		doflush(va);
558 	}
559 
560 	if (fast) {
561 		ASSERT(CPU->cpu_id == cpu);
562 		pp_unload_tlb(slot, va);
563 	} else {
564 		ppmapout(va);
565 	}
566 
567 	kpreempt_enable();
568 }
569