xref: /linux/arch/powerpc/mm/hugetlbpage.c (revision 9ce7677cfd7cd871adb457c80bea3b581b839641)
1 /*
2  * PPC64 (POWER4) Huge TLB Page Support for Kernel.
3  *
4  * Copyright (C) 2003 David Gibson, IBM Corporation.
5  *
6  * Based on the IA-32 version:
7  * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
8  */
9 
10 #include <linux/init.h>
11 #include <linux/fs.h>
12 #include <linux/mm.h>
13 #include <linux/hugetlb.h>
14 #include <linux/pagemap.h>
15 #include <linux/smp_lock.h>
16 #include <linux/slab.h>
17 #include <linux/err.h>
18 #include <linux/sysctl.h>
19 #include <asm/mman.h>
20 #include <asm/pgalloc.h>
21 #include <asm/tlb.h>
22 #include <asm/tlbflush.h>
23 #include <asm/mmu_context.h>
24 #include <asm/machdep.h>
25 #include <asm/cputable.h>
29 
30 #define NUM_LOW_AREAS	(0x100000000UL >> SID_SHIFT)
31 #define NUM_HIGH_AREAS	(PGTABLE_RANGE >> HTLB_AREA_SHIFT)
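/*
 * NUM_LOW_AREAS/NUM_HIGH_AREAS size the per-mm bitmaps of address
 * ranges reserved for hugepages.  Below 4GB the granule is one SLB
 * segment (1UL << SID_SHIFT, i.e. 256MB with the usual SID_SHIFT of
 * 28), tracked in the 16-bit low_htlb_areas field; above 4GB the
 * granule is a larger (1UL << HTLB_AREA_SHIFT) area tracked in
 * high_htlb_areas.  A set bit means the range is reserved for
 * hugepage mappings.
 */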
32 
33 /* Modelled after find_linux_pte() */
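/*
 * Walk the page tables and return a pointer to the (huge) PTE for
 * @addr, or NULL if an intermediate level is not populated yet.  With
 * 64K base pages the huge PTE sits in an ordinary PTE page; with 4K
 * base pages the PMD entry itself is reused as the huge PTE.
 */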
34 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
35 {
36 	pgd_t *pg;
37 	pud_t *pu;
38 	pmd_t *pm;
39 	pte_t *pt;
40 
41 	BUG_ON(! in_hugepage_area(mm->context, addr));
42 
43 	addr &= HPAGE_MASK;
44 
45 	pg = pgd_offset(mm, addr);
46 	if (!pgd_none(*pg)) {
47 		pu = pud_offset(pg, addr);
48 		if (!pud_none(*pu)) {
49 			pm = pmd_offset(pu, addr);
50 #ifdef CONFIG_PPC_64K_PAGES
51 			/* Currently, we use the normal PTE offset within full
52 			 * size PTE pages, thus our huge PTEs are scattered in
53 			 * the PTE page and we do waste some. We may change
54 			 * that in the future, but the current mechanism keeps
55 			 * things much simpler
56 			 */
57 			if (!pmd_none(*pm)) {
58 				/* Note: pte_offset_* are all equivalent on
59 				 * ppc64 as we don't have HIGHMEM
60 				 */
61 				pt = pte_offset_kernel(pm, addr);
62 				return pt;
63 			}
64 #else /* CONFIG_PPC_64K_PAGES */
65 			/* On 4k pages, we put huge PTEs in the PMD page */
66 			pt = (pte_t *)pm;
67 			return pt;
68 #endif /* CONFIG_PPC_64K_PAGES */
69 		}
70 	}
71 
72 	return NULL;
73 }
74 
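/*
 * Like huge_pte_offset(), but allocate any missing PUD/PMD (and, with
 * 64K base pages, PTE) levels on the way down.  Returns NULL only if
 * one of those allocations fails.
 */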
75 pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
76 {
77 	pgd_t *pg;
78 	pud_t *pu;
79 	pmd_t *pm;
80 	pte_t *pt;
81 
82 	BUG_ON(! in_hugepage_area(mm->context, addr));
83 
84 	addr &= HPAGE_MASK;
85 
86 	pg = pgd_offset(mm, addr);
87 	pu = pud_alloc(mm, pg, addr);
88 
89 	if (pu) {
90 		pm = pmd_alloc(mm, pu, addr);
91 		if (pm) {
92 #ifdef CONFIG_PPC_64K_PAGES
93 			/* See comment in huge_pte_offset. Note that if we ever
94 			 * want to put the page size in the PMD, we would have
95 			 * to open code our own pte_alloc* function in order
96 			 * to populate and set the size atomically
97 			 */
98 			pt = pte_alloc_map(mm, pm, addr);
99 #else /* CONFIG_PPC_64K_PAGES */
100 			pt = (pte_t *)pm;
101 #endif /* CONFIG_PPC_64K_PAGES */
102 			return pt;
103 		}
104 	}
105 
106 	return NULL;
107 }
108 
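/*
 * Setting or clearing a huge PTE must also get rid of any hash table
 * entry backing the old value: pte_update() returns the old PTE and,
 * if _PAGE_HASHPTE was set, hpte_update() is called with the "huge"
 * flag so a hugepage-sized HPTE (not a base-page one) is invalidated.
 */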
109 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
110 		     pte_t *ptep, pte_t pte)
111 {
112 	if (pte_present(*ptep)) {
113 		/* We open-code pte_clear because we need to pass the right
114 		 * argument to hpte_update (huge / !huge)
115 		 */
116 		unsigned long old = pte_update(ptep, ~0UL);
117 		if (old & _PAGE_HASHPTE)
118 			hpte_update(mm, addr & HPAGE_MASK, ptep, old, 1);
119 		flush_tlb_pending();
120 	}
121 	*ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
122 }
123 
124 pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
125 			      pte_t *ptep)
126 {
127 	unsigned long old = pte_update(ptep, ~0UL);
128 
129 	if (old & _PAGE_HASHPTE)
130 		hpte_update(mm, addr & HPAGE_MASK, ptep, old, 1);
131 	*ptep = __pte(0);
132 
133 	return __pte(old);
134 }
135 
136 /*
137  * This function checks for proper alignment of input addr and len parameters.
138  */
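/* Alignment alone is not enough: the range must also lie entirely
 * within the low or the high hugepage region. */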
139 int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
140 {
141 	if (len & ~HPAGE_MASK)
142 		return -EINVAL;
143 	if (addr & ~HPAGE_MASK)
144 		return -EINVAL;
145 	if (! (within_hugepage_low_range(addr, len)
146 	       || within_hugepage_high_range(addr, len)) )
147 		return -EINVAL;
148 	return 0;
149 }
150 
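/*
 * Opening new hugepage areas changes segment attributes that other
 * CPUs may already have cached in their SLBs and in their paca copy of
 * the context.  The flush_*_segments() helpers below are run on every
 * CPU: they refresh the paca context and slbie each affected segment
 * so the next SLB miss reloads it with the new (hugepage) attributes.
 */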
151 struct slb_flush_info {
152 	struct mm_struct *mm;
153 	u16 newareas;
154 };
155 
156 static void flush_low_segments(void *parm)
157 {
158 	struct slb_flush_info *fi = parm;
159 	unsigned long i;
160 
161 	BUILD_BUG_ON((sizeof(fi->newareas)*8) != NUM_LOW_AREAS);
162 
163 	if (current->active_mm != fi->mm)
164 		return;
165 
166 	/* Only need to do anything if this CPU is working in the same
167 	 * mm as the one which has changed */
168 
169 	/* update the paca copy of the context struct */
170 	get_paca()->context = current->active_mm->context;
171 
172 	asm volatile("isync" : : : "memory");
173 	for (i = 0; i < NUM_LOW_AREAS; i++) {
174 		if (! (fi->newareas & (1U << i)))
175 			continue;
176 		asm volatile("slbie %0"
177 			     : : "r" ((i << SID_SHIFT) | SLBIE_C));
178 	}
179 	asm volatile("isync" : : : "memory");
180 }
181 
182 static void flush_high_segments(void *parm)
183 {
184 	struct slb_flush_info *fi = parm;
185 	unsigned long i, j;
186 
187 
188 	BUILD_BUG_ON((sizeof(fi->newareas)*8) != NUM_HIGH_AREAS);
189 
190 	if (current->active_mm != fi->mm)
191 		return;
192 
193 	/* Only need to do anything if this CPU is working in the same
194 	 * mm as the one which has changed */
195 
196 	/* update the paca copy of the context struct */
197 	get_paca()->context = current->active_mm->context;
198 
199 	asm volatile("isync" : : : "memory");
200 	for (i = 0; i < NUM_HIGH_AREAS; i++) {
201 		if (! (fi->newareas & (1U << i)))
202 			continue;
203 		for (j = 0; j < (1UL << (HTLB_AREA_SHIFT-SID_SHIFT)); j++)
204 			asm volatile("slbie %0"
205 				     :: "r" (((i << HTLB_AREA_SHIFT)
206 					      + (j << SID_SHIFT)) | SLBIE_C));
207 	}
208 	asm volatile("isync" : : : "memory");
209 }
210 
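/*
 * An area can only be handed over to hugepages if nothing is mapped in
 * it yet: a single find_vma() from the area base tells us whether any
 * VMA starts before the area's end.
 */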
211 static int prepare_low_area_for_htlb(struct mm_struct *mm, unsigned long area)
212 {
213 	unsigned long start = area << SID_SHIFT;
214 	unsigned long end = (area+1) << SID_SHIFT;
215 	struct vm_area_struct *vma;
216 
217 	BUG_ON(area >= NUM_LOW_AREAS);
218 
219 	/* Check no VMAs are in the region */
220 	vma = find_vma(mm, start);
221 	if (vma && (vma->vm_start < end))
222 		return -EBUSY;
223 
224 	return 0;
225 }
226 
227 static int prepare_high_area_for_htlb(struct mm_struct *mm, unsigned long area)
228 {
229 	unsigned long start = area << HTLB_AREA_SHIFT;
230 	unsigned long end = (area+1) << HTLB_AREA_SHIFT;
231 	struct vm_area_struct *vma;
232 
233 	BUG_ON(area >= NUM_HIGH_AREAS);
234 
235 	/* Hack, so that each address is controlled by exactly one
236 	 * of the high or low area bitmaps, the first high area starts
237 	 * at 4GB, not 0 */
238 	if (start == 0)
239 		start = 0x100000000UL;
240 
241 	/* Check no VMAs are in the region */
242 	vma = find_vma(mm, start);
243 	if (vma && (vma->vm_start < end))
244 		return -EBUSY;
245 
246 	return 0;
247 }
248 
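/*
 * Open the areas named in @newareas: check that each one is empty, set
 * the bits in the mm context, then (after a barrier so the context
 * update is visible to SLB miss handlers) flush the relevant SLB
 * entries on every CPU.
 */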
249 static int open_low_hpage_areas(struct mm_struct *mm, u16 newareas)
250 {
251 	unsigned long i;
252 	struct slb_flush_info fi;
253 
254 	BUILD_BUG_ON((sizeof(newareas)*8) != NUM_LOW_AREAS);
255 	BUILD_BUG_ON((sizeof(mm->context.low_htlb_areas)*8) != NUM_LOW_AREAS);
256 
257 	newareas &= ~(mm->context.low_htlb_areas);
258 	if (! newareas)
259 		return 0; /* The segments we want are already open */
260 
261 	for (i = 0; i < NUM_LOW_AREAS; i++)
262 		if ((1 << i) & newareas)
263 			if (prepare_low_area_for_htlb(mm, i) != 0)
264 				return -EBUSY;
265 
266 	mm->context.low_htlb_areas |= newareas;
267 
268 	/* the context change must make it to memory before the flush,
269 	 * so that further SLB misses do the right thing. */
270 	mb();
271 
272 	fi.mm = mm;
273 	fi.newareas = newareas;
274 	on_each_cpu(flush_low_segments, &fi, 0, 1);
275 
276 	return 0;
277 }
278 
279 static int open_high_hpage_areas(struct mm_struct *mm, u16 newareas)
280 {
281 	struct slb_flush_info fi;
282 	unsigned long i;
283 
284 	BUILD_BUG_ON((sizeof(newareas)*8) != NUM_HIGH_AREAS);
285 	BUILD_BUG_ON((sizeof(mm->context.high_htlb_areas)*8)
286 		     != NUM_HIGH_AREAS);
287 
288 	newareas &= ~(mm->context.high_htlb_areas);
289 	if (! newareas)
290 		return 0; /* The areas we want are already open */
291 
292 	for (i = 0; i < NUM_HIGH_AREAS; i++)
293 		if ((1 << i) & newareas)
294 			if (prepare_high_area_for_htlb(mm, i) != 0)
295 				return -EBUSY;
296 
297 	mm->context.high_htlb_areas |= newareas;
298 
299 	/* update the paca copy of the context struct */
300 	get_paca()->context = mm->context;
301 
302 	/* the context change must make it to memory before the flush,
303 	 * so that further SLB misses do the right thing. */
304 	mb();
305 
306 	fi.mm = mm;
307 	fi.newareas = newareas;
308 	on_each_cpu(flush_high_segments, &fi, 0, 1);
309 
310 	return 0;
311 }
312 
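/*
 * Called when a hugepage mapping of [addr, addr+len) is about to be
 * created: open whichever low (below 4GB) and/or high (above 4GB)
 * areas the range touches.  As a rough, hypothetical example, assuming
 * LOW_ESID_MASK() returns a bitmask of the 256MB segments a range
 * touches, a 32MB mapping at 0x30000000 lies entirely in segment 3, so
 * only bit 3 (mask 0x0008) needs to be opened.
 */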
313 int prepare_hugepage_range(unsigned long addr, unsigned long len)
314 {
315 	int err = 0;
316 
317 	if ( (addr+len) < addr )
318 		return -EINVAL;
319 
320 	if (addr < 0x100000000UL)
321 		err = open_low_hpage_areas(current->mm,
322 					  LOW_ESID_MASK(addr, len));
323 	if ((addr + len) > 0x100000000UL)
324 		err = open_high_hpage_areas(current->mm,
325 					    HTLB_AREA_MASK(addr, len));
326 	if (err) {
327 		printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)"
328 		       " failed (lowmask: 0x%04hx, highmask: 0x%04hx)\n",
329 		       addr, len,
330 		       LOW_ESID_MASK(addr, len), HTLB_AREA_MASK(addr, len));
331 		return err;
332 	}
333 
334 	return 0;
335 }
336 
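/*
 * Because hugepages are confined to the dedicated areas, a hugepage
 * lookup can be answered directly from huge_pte_offset().  pmd_huge()
 * is always false here, so the generic code should never reach
 * follow_huge_pmd(); hence the BUG() below.
 */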
337 struct page *
338 follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
339 {
340 	pte_t *ptep;
341 	struct page *page;
342 
343 	if (! in_hugepage_area(mm->context, address))
344 		return ERR_PTR(-EINVAL);
345 
346 	ptep = huge_pte_offset(mm, address);
347 	page = pte_page(*ptep);
348 	if (page)
349 		page += (address % HPAGE_SIZE) / PAGE_SIZE;
350 
351 	return page;
352 }
353 
354 int pmd_huge(pmd_t pmd)
355 {
356 	return 0;
357 }
358 
359 struct page *
360 follow_huge_pmd(struct mm_struct *mm, unsigned long address,
361 		pmd_t *pmd, int write)
362 {
363 	BUG();
364 	return NULL;
365 }
366 
367 /* Because we have an exclusive hugepage region which lies within the
368  * normal user address space, we have to take special measures to make
369  * non-huge mmap()s evade the hugepage reserved regions. */
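/* The search below is an ordinary first-fit walk, except that any
 * candidate range touching a hugepage-reserved segment or area is
 * skipped by rounding addr up to the next segment/area boundary. */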
370 unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
371 				     unsigned long len, unsigned long pgoff,
372 				     unsigned long flags)
373 {
374 	struct mm_struct *mm = current->mm;
375 	struct vm_area_struct *vma;
376 	unsigned long start_addr;
377 
378 	if (len > TASK_SIZE)
379 		return -ENOMEM;
380 
381 	if (addr) {
382 		addr = PAGE_ALIGN(addr);
383 		vma = find_vma(mm, addr);
384 		if (((TASK_SIZE - len) >= addr)
385 		    && (!vma || (addr+len) <= vma->vm_start)
386 		    && !is_hugepage_only_range(mm, addr,len))
387 			return addr;
388 	}
389 	if (len > mm->cached_hole_size) {
390 	        start_addr = addr = mm->free_area_cache;
391 	} else {
392 	        start_addr = addr = TASK_UNMAPPED_BASE;
393 	        mm->cached_hole_size = 0;
394 	}
395 
396 full_search:
397 	vma = find_vma(mm, addr);
398 	while (TASK_SIZE - len >= addr) {
399 		BUG_ON(vma && (addr >= vma->vm_end));
400 
401 		if (touches_hugepage_low_range(mm, addr, len)) {
402 			addr = ALIGN(addr+1, 1<<SID_SHIFT);
403 			vma = find_vma(mm, addr);
404 			continue;
405 		}
406 		if (touches_hugepage_high_range(mm, addr, len)) {
407 			addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT);
408 			vma = find_vma(mm, addr);
409 			continue;
410 		}
411 		if (!vma || addr + len <= vma->vm_start) {
412 			/*
413 			 * Remember the place where we stopped the search:
414 			 */
415 			mm->free_area_cache = addr + len;
416 			return addr;
417 		}
418 		if (addr + mm->cached_hole_size < vma->vm_start)
419 		        mm->cached_hole_size = vma->vm_start - addr;
420 		addr = vma->vm_end;
421 		vma = vma->vm_next;
422 	}
423 
424 	/* Make sure we didn't miss any holes */
425 	if (start_addr != TASK_UNMAPPED_BASE) {
426 		start_addr = addr = TASK_UNMAPPED_BASE;
427 		mm->cached_hole_size = 0;
428 		goto full_search;
429 	}
430 	return -ENOMEM;
431 }
432 
433 /*
434  * This mmap-allocator allocates new areas top-down from below the
435  * stack's low limit (the base):
436  *
437  * Because we have an exclusive hugepage region which lies within the
438  * normal user address space, we have to take special measures to make
439  * non-huge mmap()s evade the hugepage reserved regions.
440  */
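/* Same idea, walking down from mmap_base: whenever a candidate range
 * would touch a hugepage area, addr is moved down so the mapping ends
 * at the start of the offending segment or area, and the check is
 * repeated until the range is clear of reserved regions. */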
441 unsigned long
442 arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
443 			  const unsigned long len, const unsigned long pgoff,
444 			  const unsigned long flags)
445 {
446 	struct vm_area_struct *vma, *prev_vma;
447 	struct mm_struct *mm = current->mm;
448 	unsigned long base = mm->mmap_base, addr = addr0;
449 	unsigned long largest_hole = mm->cached_hole_size;
450 	int first_time = 1;
451 
452 	/* requested length too big for entire address space */
453 	if (len > TASK_SIZE)
454 		return -ENOMEM;
455 
456 	/* don't allow allocations above the current base */
457 	if (mm->free_area_cache > base)
458 		mm->free_area_cache = base;
459 
460 	/* requesting a specific address */
461 	if (addr) {
462 		addr = PAGE_ALIGN(addr);
463 		vma = find_vma(mm, addr);
464 		if (TASK_SIZE - len >= addr &&
465 				(!vma || addr + len <= vma->vm_start)
466 				&& !is_hugepage_only_range(mm, addr,len))
467 			return addr;
468 	}
469 
470 	if (len <= largest_hole) {
471 	        largest_hole = 0;
472 		mm->free_area_cache = base;
473 	}
474 try_again:
475 	/* make sure it can fit in the remaining address space */
476 	if (mm->free_area_cache < len)
477 		goto fail;
478 
479 	/* either no address requested, or it can't fit in the requested address hole */
480 	addr = (mm->free_area_cache - len) & PAGE_MASK;
481 	do {
482 hugepage_recheck:
483 		if (touches_hugepage_low_range(mm, addr, len)) {
484 			addr = (addr & ((~0) << SID_SHIFT)) - len;
485 			goto hugepage_recheck;
486 		} else if (touches_hugepage_high_range(mm, addr, len)) {
487 			addr = (addr & ((~0UL) << HTLB_AREA_SHIFT)) - len;
488 			goto hugepage_recheck;
489 		}
490 
491 		/*
492 		 * Lookup failure means no vma is above this address,
493 		 * i.e. return with success:
494 		 */
495 		if (!(vma = find_vma_prev(mm, addr, &prev_vma)))
496 			return addr;
497 
498 		/*
499 		 * new region fits between prev_vma->vm_end and
500 		 * vma->vm_start, use it:
501 		 */
502 		if (addr+len <= vma->vm_start &&
503 		          (!prev_vma || (addr >= prev_vma->vm_end))) {
504 			/* remember the address as a hint for next time */
505 		        mm->cached_hole_size = largest_hole;
506 		        return (mm->free_area_cache = addr);
507 		} else {
508 			/* pull free_area_cache down to the first hole */
509 		        if (mm->free_area_cache == vma->vm_end) {
510 				mm->free_area_cache = vma->vm_start;
511 				mm->cached_hole_size = largest_hole;
512 			}
513 		}
514 
515 		/* remember the largest hole we saw so far */
516 		if (addr + largest_hole < vma->vm_start)
517 		        largest_hole = vma->vm_start - addr;
518 
519 		/* try just below the current vma->vm_start */
520 		addr = vma->vm_start-len;
521 	} while (len <= vma->vm_start);
522 
523 fail:
524 	/*
525 	 * if hint left us with no space for the requested
526 	 * mapping then try again:
527 	 */
528 	if (first_time) {
529 		mm->free_area_cache = base;
530 		largest_hole = 0;
531 		first_time = 0;
532 		goto try_again;
533 	}
534 	/*
535 	 * A failed mmap() very likely causes application failure,
536 	 * so fall back to the bottom-up function here. This scenario
537 	 * can happen with large stack limits and large mmap()
538 	 * allocations.
539 	 */
540 	mm->free_area_cache = TASK_UNMAPPED_BASE;
541 	mm->cached_hole_size = ~0UL;
542 	addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
543 	/*
544 	 * Restore the topdown base:
545 	 */
546 	mm->free_area_cache = base;
547 	mm->cached_hole_size = ~0UL;
548 
549 	return addr;
550 }
551 
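/*
 * Find a free, hugepage-aligned range of @len bytes for a hugepage
 * mapping.  Only ranges lying entirely within segments/areas allowed
 * by the mask are considered; anything else is skipped by jumping to
 * the next segment or area boundary.  Returns -ENOMEM if nothing fits.
 */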
552 static unsigned long htlb_get_low_area(unsigned long len, u16 segmask)
553 {
554 	unsigned long addr = 0;
555 	struct vm_area_struct *vma;
556 
557 	vma = find_vma(current->mm, addr);
558 	while (addr + len <= 0x100000000UL) {
559 		BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */
560 
561 		if (! __within_hugepage_low_range(addr, len, segmask)) {
562 			addr = ALIGN(addr+1, 1<<SID_SHIFT);
563 			vma = find_vma(current->mm, addr);
564 			continue;
565 		}
566 
567 		if (!vma || (addr + len) <= vma->vm_start)
568 			return addr;
569 		addr = ALIGN(vma->vm_end, HPAGE_SIZE);
570 		/* Depending on segmask this might not be a confirmed
571 		 * hugepage region, so the ALIGN could have skipped
572 		 * some VMAs */
573 		vma = find_vma(current->mm, addr);
574 	}
575 
576 	return -ENOMEM;
577 }
578 
579 static unsigned long htlb_get_high_area(unsigned long len, u16 areamask)
580 {
581 	unsigned long addr = 0x100000000UL;
582 	struct vm_area_struct *vma;
583 
584 	vma = find_vma(current->mm, addr);
585 	while (addr + len <= TASK_SIZE_USER64) {
586 		BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */
587 
588 		if (! __within_hugepage_high_range(addr, len, areamask)) {
589 			addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT);
590 			vma = find_vma(current->mm, addr);
591 			continue;
592 		}
593 
594 		if (!vma || (addr + len) <= vma->vm_start)
595 			return addr;
596 		addr = ALIGN(vma->vm_end, HPAGE_SIZE);
597 		/* Depending on areamask this might not be a confirmed
598 		 * hugepage region, so the ALIGN could have skipped
599 		 * some VMAs */
600 		vma = find_vma(current->mm, addr);
601 	}
602 
603 	return -ENOMEM;
604 }
605 
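/*
 * 32-bit tasks (TIF_32BIT) are placed in the low (below 4GB) areas,
 * 64-bit tasks in the high areas.  First try the areas that are
 * already open; if that fails, walk the area mask from the top of the
 * usable address space downwards, trying to open one more area at a
 * time until a fit is found or every candidate bit has been tried.
 */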
606 unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
607 					unsigned long len, unsigned long pgoff,
608 					unsigned long flags)
609 {
610 	int lastshift;
611 	u16 areamask, curareas;
612 
613 	if (HPAGE_SHIFT == 0)
614 		return -EINVAL;
615 	if (len & ~HPAGE_MASK)
616 		return -EINVAL;
617 
618 	if (!cpu_has_feature(CPU_FTR_16M_PAGE))
619 		return -EINVAL;
620 
621 	if (test_thread_flag(TIF_32BIT)) {
622 		curareas = current->mm->context.low_htlb_areas;
623 
624 		/* First see if we can do the mapping in the existing
625 		 * low areas */
626 		addr = htlb_get_low_area(len, curareas);
627 		if (addr != -ENOMEM)
628 			return addr;
629 
630 		lastshift = 0;
631 		for (areamask = LOW_ESID_MASK(0x100000000UL-len, len);
632 		     ! lastshift; areamask >>=1) {
633 			if (areamask & 1)
634 				lastshift = 1;
635 
636 			addr = htlb_get_low_area(len, curareas | areamask);
637 			if ((addr != -ENOMEM)
638 			    && open_low_hpage_areas(current->mm, areamask) == 0)
639 				return addr;
640 		}
641 	} else {
642 		curareas = current->mm->context.high_htlb_areas;
643 
644 		/* First see if we can do the mapping in the existing
645 		 * high areas */
646 		addr = htlb_get_high_area(len, curareas);
647 		if (addr != -ENOMEM)
648 			return addr;
649 
650 		lastshift = 0;
651 		for (areamask = HTLB_AREA_MASK(TASK_SIZE_USER64-len, len);
652 		     ! lastshift; areamask >>=1) {
653 			if (areamask & 1)
654 				lastshift = 1;
655 
656 			addr = htlb_get_high_area(len, curareas | areamask);
657 			if ((addr != -ENOMEM)
658 			    && open_high_hpage_areas(current->mm, areamask) == 0)
659 				return addr;
660 		}
661 	}
662 	printk(KERN_DEBUG "hugetlb_get_unmapped_area() unable to open"
663 	       " enough areas\n");
664 	return -ENOMEM;
665 }
666 
667 /*
668  * Called by asm hashtable.S for doing lazy icache flush
669  */
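/*
 * Trap 0x400 is an instruction storage (execute) fault: in that case
 * flush the data cache to the instruction cache for every base page of
 * the hugepage and mark it clean with PG_arch_1.  For data faults we
 * just map the page no-execute (HPTE_R_N) and defer the flush.
 */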
670 static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
671 						  pte_t pte, int trap)
672 {
673 	struct page *page;
674 	int i;
675 
676 	if (!pfn_valid(pte_pfn(pte)))
677 		return rflags;
678 
679 	page = pte_page(pte);
680 
681 	/* page is dirty */
682 	if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
683 		if (trap == 0x400) {
684 			for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++)
685 				__flush_dcache_icache(page_address(page+i));
686 			set_bit(PG_arch_1, &page->flags);
687 		} else {
688 			rflags |= HPTE_R_N;
689 		}
690 	}
691 	return rflags;
692 }
693 
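/*
 * Hugepage counterpart of hash_page(): translate a hash fault on @ea
 * into an HPTE.  The cmpxchg loop marks the Linux PTE busy/accessed,
 * then we either update an existing HPTE (slot recovered from
 * _PAGE_F_SECOND/_PAGE_F_GIX) or insert a new one, trying the primary
 * hash group first, then the secondary, and evicting a more or less
 * random entry if both are full.  The chosen slot is written back into
 * the PTE before _PAGE_BUSY is cleared.  Returns 0 on success, 1 to
 * send the fault up to do_page_fault().
 */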
694 int hash_huge_page(struct mm_struct *mm, unsigned long access,
695 		   unsigned long ea, unsigned long vsid, int local,
696 		   unsigned long trap)
697 {
698 	pte_t *ptep;
699 	unsigned long old_pte, new_pte;
700 	unsigned long va, rflags, pa;
701 	long slot;
702 	int err = 1;
703 
704 	ptep = huge_pte_offset(mm, ea);
705 
706 	/* Search the Linux page table for a match with va */
707 	va = (vsid << 28) | (ea & 0x0fffffff);
708 
709 	/*
710 	 * If no pte found or not present, send the problem up to
711 	 * do_page_fault
712 	 */
713 	if (unlikely(!ptep || pte_none(*ptep)))
714 		goto out;
715 
716 	/*
717 	 * Check the user's access rights to the page.  If access should be
718 	 * prevented then send the problem up to do_page_fault.
719 	 */
720 	if (unlikely(access & ~pte_val(*ptep)))
721 		goto out;
722 	/*
723 	 * At this point, we have a pte (old_pte) which can be used to build
724 	 * or update an HPTE. There are 2 cases:
725 	 *
726 	 * 1. There is a valid (present) pte with no associated HPTE (this is
727 	 *	the most common case)
728 	 * 2. There is a valid (present) pte with an associated HPTE. The
729 	 *	current values of the pp bits in the HPTE prevent access
730 	 *	because we are doing software DIRTY bit management and the
731 	 *	page is currently not DIRTY.
732 	 */
733 
734 
735 	do {
736 		old_pte = pte_val(*ptep);
737 		if (old_pte & _PAGE_BUSY)
738 			goto out;
739 		new_pte = old_pte | _PAGE_BUSY |
740 			_PAGE_ACCESSED | _PAGE_HASHPTE;
741 	} while(old_pte != __cmpxchg_u64((unsigned long *)ptep,
742 					 old_pte, new_pte));
743 
744 	rflags = 0x2 | (!(new_pte & _PAGE_RW));
745 	/* _PAGE_EXEC -> HPTE_R_N (no-execute) since the sense is inverted */
746 	rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
747 	if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
748 		/* No CPU that supports hugepages lacks no-execute, so we
749 		 * don't need to worry about that case */
750 		rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
751 						       trap);
752 
753 	/* Check if pte already has an hpte (case 2) */
754 	if (unlikely(old_pte & _PAGE_HASHPTE)) {
755 		/* There MIGHT be an HPTE for this pte */
756 		unsigned long hash, slot;
757 
758 		hash = hpt_hash(va, HPAGE_SHIFT);
759 		if (old_pte & _PAGE_F_SECOND)
760 			hash = ~hash;
761 		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
762 		slot += (old_pte & _PAGE_F_GIX) >> 12;
763 
764 		if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_huge_psize,
765 					 local) == -1)
766 			old_pte &= ~_PAGE_HPTEFLAGS;
767 	}
768 
769 	if (likely(!(old_pte & _PAGE_HASHPTE))) {
770 		unsigned long hash = hpt_hash(va, HPAGE_SHIFT);
771 		unsigned long hpte_group;
772 
773 		pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
774 
775 repeat:
776 		hpte_group = ((hash & htab_hash_mask) *
777 			      HPTES_PER_GROUP) & ~0x7UL;
778 
779 		/* clear HPTE slot information in the new PTE */
780 		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
781 
782 		/* Add in WIMG bits */
783 		/* XXX We should store these in the pte */
784 		/* --BenH: I think they are ... */
785 		rflags |= _PAGE_COHERENT;
786 
787 		/* Insert into the hash table, primary slot */
788 		slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
789 					  mmu_huge_psize);
790 
791 		/* Primary is full, try the secondary */
792 		if (unlikely(slot == -1)) {
793 			new_pte |= _PAGE_F_SECOND;
794 			hpte_group = ((~hash & htab_hash_mask) *
795 				      HPTES_PER_GROUP) & ~0x7UL;
796 			slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
797 						  HPTE_V_SECONDARY,
798 						  mmu_huge_psize);
799 			if (slot == -1) {
800 				if (mftb() & 0x1)
801 					hpte_group = ((hash & htab_hash_mask) *
802 						      HPTES_PER_GROUP)&~0x7UL;
803 
804 				ppc_md.hpte_remove(hpte_group);
805 				goto repeat;
806 			}
807 		}
808 
809 		if (unlikely(slot == -2))
810 			panic("hash_huge_page: pte_insert failed\n");
811 
812 		new_pte |= (slot << 12) & _PAGE_F_GIX;
813 	}
814 
815 	/*
816 	 * No need to use ldarx/stdcx here
817 	 */
818 	*ptep = __pte(new_pte & ~_PAGE_BUSY);
819 
820 	err = 0;
821 
822  out:
823 	return err;
824 }
825