xref: /linux/arch/powerpc/mm/hugetlbpage.c (revision de2fe5e07d58424bc286fff3fd3c1b0bf933cd58)
/*
 * PPC64 (POWER4) Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp_lock.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/sysctl.h>
#include <asm/mman.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/machdep.h>
#include <asm/cputable.h>
#define NUM_LOW_AREAS	(0x100000000UL >> SID_SHIFT)
#define NUM_HIGH_AREAS	(PGTABLE_RANGE >> HTLB_AREA_SHIFT)

/* Modelled after find_linux_pte() */
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pg;
	pud_t *pu;
	pmd_t *pm;
	pte_t *pt;

	BUG_ON(! in_hugepage_area(mm->context, addr));

	addr &= HPAGE_MASK;

	pg = pgd_offset(mm, addr);
	if (!pgd_none(*pg)) {
		pu = pud_offset(pg, addr);
		if (!pud_none(*pu)) {
			pm = pmd_offset(pu, addr);
#ifdef CONFIG_PPC_64K_PAGES
			/* Currently, we use the normal PTE offset within full
			 * size PTE pages, thus our huge PTEs are scattered in
			 * the PTE page and we do waste some. We may change
			 * that in the future, but the current mechanism keeps
			 * things much simpler.
			 */
			if (!pmd_none(*pm)) {
				/* Note: pte_offset_* are all equivalent on
				 * ppc64 as we don't have HIGHMEM
				 */
				pt = pte_offset_kernel(pm, addr);
				return pt;
			}
#else /* CONFIG_PPC_64K_PAGES */
			/* On 4k pages, we put huge PTEs in the PMD page */
			pt = (pte_t *)pm;
			return pt;
#endif /* CONFIG_PPC_64K_PAGES */
		}
	}

	return NULL;
}

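/*
 * Walk (and, where needed, allocate) the page-table levels covering a
 * hugepage address and return a pointer to the slot holding the huge PTE,
 * or NULL if an intermediate allocation fails.  With 64K base pages the
 * huge PTE lives in an ordinary PTE page; with 4K base pages the PMD
 * entry itself is used as the huge PTE (see huge_pte_offset above).
 */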
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pg;
	pud_t *pu;
	pmd_t *pm;
	pte_t *pt;

	BUG_ON(! in_hugepage_area(mm->context, addr));

	addr &= HPAGE_MASK;

	pg = pgd_offset(mm, addr);
	pu = pud_alloc(mm, pg, addr);

	if (pu) {
		pm = pmd_alloc(mm, pu, addr);
		if (pm) {
#ifdef CONFIG_PPC_64K_PAGES
			/* See comment in huge_pte_offset. Note that if we ever
			 * want to put the page size in the PMD, we would have
			 * to open code our own pte_alloc* function in order
			 * to populate and set the size atomically
			 */
			pt = pte_alloc_map(mm, pm, addr);
#else /* CONFIG_PPC_64K_PAGES */
			pt = (pte_t *)pm;
#endif /* CONFIG_PPC_64K_PAGES */
			return pt;
		}
	}

	return NULL;
}

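/*
 * Install a huge PTE.  If a translation is already present, clear it
 * first and push out any pending hash-table invalidation, so the new
 * value never coexists with a stale HPTE.
 */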
void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, pte_t pte)
{
	if (pte_present(*ptep)) {
		/* We open-code pte_clear because we need to pass the right
		 * argument to hpte_update (huge / !huge)
		 */
		unsigned long old = pte_update(ptep, ~0UL);
		if (old & _PAGE_HASHPTE)
			hpte_update(mm, addr & HPAGE_MASK, ptep, old, 1);
		flush_tlb_pending();
	}
	*ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
}

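/*
 * Atomically clear a huge PTE, queueing a hash-table invalidation if the
 * old value had been hashed, and return the old PTE value.
 */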
pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
			      pte_t *ptep)
{
	unsigned long old = pte_update(ptep, ~0UL);

	if (old & _PAGE_HASHPTE)
		hpte_update(mm, addr & HPAGE_MASK, ptep, old, 1);
	*ptep = __pte(0);

	return __pte(old);
}

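/*
 * Argument block handed to the per-CPU SLB flush callbacks below: the mm
 * whose hugepage areas have just been opened and the bitmap of newly
 * opened areas whose SLB entries must be dropped.
 */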
struct slb_flush_info {
	struct mm_struct *mm;
	u16 newareas;
};

static void flush_low_segments(void *parm)
{
	struct slb_flush_info *fi = parm;
	unsigned long i;

	BUILD_BUG_ON((sizeof(fi->newareas)*8) != NUM_LOW_AREAS);

	if (current->active_mm != fi->mm)
		return;

	/* Only need to do anything if this CPU is working in the same
	 * mm as the one which has changed */

	/* update the paca copy of the context struct */
	get_paca()->context = current->active_mm->context;

	asm volatile("isync" : : : "memory");
	for (i = 0; i < NUM_LOW_AREAS; i++) {
		if (! (fi->newareas & (1U << i)))
			continue;
		asm volatile("slbie %0"
			     : : "r" ((i << SID_SHIFT) | SLBIE_C));
	}
	asm volatile("isync" : : : "memory");
}

static void flush_high_segments(void *parm)
{
	struct slb_flush_info *fi = parm;
	unsigned long i, j;

	BUILD_BUG_ON((sizeof(fi->newareas)*8) != NUM_HIGH_AREAS);

	if (current->active_mm != fi->mm)
		return;

	/* Only need to do anything if this CPU is working in the same
	 * mm as the one which has changed */

	/* update the paca copy of the context struct */
	get_paca()->context = current->active_mm->context;

	asm volatile("isync" : : : "memory");
	for (i = 0; i < NUM_HIGH_AREAS; i++) {
		if (! (fi->newareas & (1U << i)))
			continue;
		for (j = 0; j < (1UL << (HTLB_AREA_SHIFT-SID_SHIFT)); j++)
			asm volatile("slbie %0"
				     :: "r" (((i << HTLB_AREA_SHIFT)
					      + (j << SID_SHIFT)) | SLBIE_C));
	}
	asm volatile("isync" : : : "memory");
}

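/*
 * Check that the given low (below 4GB) segment contains no existing VMAs,
 * so it can safely be converted into a hugepage-only area.  Returns 0 if
 * the segment is empty, -EBUSY otherwise.
 */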
static int prepare_low_area_for_htlb(struct mm_struct *mm, unsigned long area)
{
	unsigned long start = area << SID_SHIFT;
	unsigned long end = (area+1) << SID_SHIFT;
	struct vm_area_struct *vma;

	BUG_ON(area >= NUM_LOW_AREAS);

	/* Check no VMAs are in the region */
	vma = find_vma(mm, start);
	if (vma && (vma->vm_start < end))
		return -EBUSY;

	return 0;
}

static int prepare_high_area_for_htlb(struct mm_struct *mm, unsigned long area)
{
	unsigned long start = area << HTLB_AREA_SHIFT;
	unsigned long end = (area+1) << HTLB_AREA_SHIFT;
	struct vm_area_struct *vma;

	BUG_ON(area >= NUM_HIGH_AREAS);

	/* Hack, so that each address is controlled by exactly one
	 * of the high or low area bitmaps, the first high area starts
	 * at 4GB, not 0 */
	if (start == 0)
		start = 0x100000000UL;

	/* Check no VMAs are in the region */
	vma = find_vma(mm, start);
	if (vma && (vma->vm_start < end))
		return -EBUSY;

	return 0;
}

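/*
 * Mark the requested low areas as hugepage-only in the mm context.  Each
 * area must currently be empty of VMAs; once the context is updated,
 * every CPU running this mm is asked (via IPI) to drop its stale SLB
 * entries for the affected segments.
 */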
static int open_low_hpage_areas(struct mm_struct *mm, u16 newareas)
{
	unsigned long i;
	struct slb_flush_info fi;

	BUILD_BUG_ON((sizeof(newareas)*8) != NUM_LOW_AREAS);
	BUILD_BUG_ON((sizeof(mm->context.low_htlb_areas)*8) != NUM_LOW_AREAS);

	newareas &= ~(mm->context.low_htlb_areas);
	if (! newareas)
		return 0; /* The segments we want are already open */

	for (i = 0; i < NUM_LOW_AREAS; i++)
		if ((1 << i) & newareas)
			if (prepare_low_area_for_htlb(mm, i) != 0)
				return -EBUSY;

	mm->context.low_htlb_areas |= newareas;

	/* the context change must make it to memory before the flush,
	 * so that further SLB misses do the right thing. */
	mb();

	fi.mm = mm;
	fi.newareas = newareas;
	on_each_cpu(flush_low_segments, &fi, 0, 1);

	return 0;
}

static int open_high_hpage_areas(struct mm_struct *mm, u16 newareas)
{
	struct slb_flush_info fi;
	unsigned long i;

	BUILD_BUG_ON((sizeof(newareas)*8) != NUM_HIGH_AREAS);
	BUILD_BUG_ON((sizeof(mm->context.high_htlb_areas)*8)
		     != NUM_HIGH_AREAS);

	newareas &= ~(mm->context.high_htlb_areas);
	if (! newareas)
		return 0; /* The areas we want are already open */

	for (i = 0; i < NUM_HIGH_AREAS; i++)
		if ((1 << i) & newareas)
			if (prepare_high_area_for_htlb(mm, i) != 0)
				return -EBUSY;

	mm->context.high_htlb_areas |= newareas;

	/* update the paca copy of the context struct */
	get_paca()->context = mm->context;

	/* the context change must make it to memory before the flush,
	 * so that further SLB misses do the right thing. */
	mb();

	fi.mm = mm;
	fi.newareas = newareas;
	on_each_cpu(flush_high_segments, &fi, 0, 1);

	return 0;
}

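/*
 * Called before a hugepage mapping is set up: open whichever low (below
 * 4GB) and/or high hugepage areas the range [addr, addr+len) touches.
 */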
int prepare_hugepage_range(unsigned long addr, unsigned long len)
{
	int err = 0;

	if ((addr + len) < addr)
		return -EINVAL;

	if (addr < 0x100000000UL)
		err = open_low_hpage_areas(current->mm,
					  LOW_ESID_MASK(addr, len));
	if ((addr + len) > 0x100000000UL)
		err = open_high_hpage_areas(current->mm,
					    HTLB_AREA_MASK(addr, len));
	if (err) {
		printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)"
		       " failed (lowmask: 0x%04hx, highmask: 0x%04hx)\n",
		       addr, len,
		       LOW_ESID_MASK(addr, len), HTLB_AREA_MASK(addr, len));
		return err;
	}

	return 0;
}

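/*
 * Translate an address inside the hugepage region to its struct page,
 * adjusted to the base-page-sized sub-page the address falls in.
 */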
struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
	pte_t *ptep;
	struct page *page;

	if (! in_hugepage_area(mm->context, address))
		return ERR_PTR(-EINVAL);

	ptep = huge_pte_offset(mm, address);
	page = pte_page(*ptep);
	if (page)
		page += (address % HPAGE_SIZE) / PAGE_SIZE;

	return page;
}

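/*
 * Generic code never sees ppc64 huge pages at the pmd level (they are
 * handled via follow_huge_addr above), so pmd_huge() is always false and
 * follow_huge_pmd() should be unreachable.
 */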
int pmd_huge(pmd_t pmd)
{
	return 0;
}

struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
		pmd_t *pmd, int write)
{
	BUG();
	return NULL;
}

/* Because we have an exclusive hugepage region which lies within the
 * normal user address space, we have to take special measures to make
 * non-huge mmap()s evade the hugepage reserved regions. */
unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
				     unsigned long len, unsigned long pgoff,
				     unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long start_addr;

	if (len > TASK_SIZE)
		return -ENOMEM;

	if (addr) {
		addr = PAGE_ALIGN(addr);
		vma = find_vma(mm, addr);
		if (((TASK_SIZE - len) >= addr)
		    && (!vma || (addr+len) <= vma->vm_start)
		    && !is_hugepage_only_range(mm, addr,len))
			return addr;
	}
	if (len > mm->cached_hole_size) {
		start_addr = addr = mm->free_area_cache;
	} else {
		start_addr = addr = TASK_UNMAPPED_BASE;
		mm->cached_hole_size = 0;
	}

full_search:
	vma = find_vma(mm, addr);
	while (TASK_SIZE - len >= addr) {
		BUG_ON(vma && (addr >= vma->vm_end));

		if (touches_hugepage_low_range(mm, addr, len)) {
			addr = ALIGN(addr+1, 1<<SID_SHIFT);
			vma = find_vma(mm, addr);
			continue;
		}
		if (touches_hugepage_high_range(mm, addr, len)) {
			addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT);
			vma = find_vma(mm, addr);
			continue;
		}
		if (!vma || addr + len <= vma->vm_start) {
			/*
			 * Remember the place where we stopped the search:
			 */
			mm->free_area_cache = addr + len;
			return addr;
		}
		if (addr + mm->cached_hole_size < vma->vm_start)
			mm->cached_hole_size = vma->vm_start - addr;
		addr = vma->vm_end;
		vma = vma->vm_next;
	}

	/* Make sure we didn't miss any holes */
	if (start_addr != TASK_UNMAPPED_BASE) {
		start_addr = addr = TASK_UNMAPPED_BASE;
		mm->cached_hole_size = 0;
		goto full_search;
	}
	return -ENOMEM;
}

/*
 * This mmap-allocator allocates new areas top-down from below the
 * stack's low limit (the base):
 *
 * Because we have an exclusive hugepage region which lies within the
 * normal user address space, we have to take special measures to make
 * non-huge mmap()s evade the hugepage reserved regions.
 */
unsigned long
arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
			  const unsigned long len, const unsigned long pgoff,
			  const unsigned long flags)
{
	struct vm_area_struct *vma, *prev_vma;
	struct mm_struct *mm = current->mm;
	unsigned long base = mm->mmap_base, addr = addr0;
	unsigned long largest_hole = mm->cached_hole_size;
	int first_time = 1;

	/* requested length too big for entire address space */
	if (len > TASK_SIZE)
		return -ENOMEM;

	/* don't allow allocations above current base */
	if (mm->free_area_cache > base)
		mm->free_area_cache = base;

	/* requesting a specific address */
	if (addr) {
		addr = PAGE_ALIGN(addr);
		vma = find_vma(mm, addr);
		if (TASK_SIZE - len >= addr &&
				(!vma || addr + len <= vma->vm_start)
				&& !is_hugepage_only_range(mm, addr,len))
			return addr;
	}

	if (len <= largest_hole) {
		largest_hole = 0;
		mm->free_area_cache = base;
	}
try_again:
	/* make sure it can fit in the remaining address space */
	if (mm->free_area_cache < len)
		goto fail;

	/* either no address requested or can't fit in requested address hole */
	addr = (mm->free_area_cache - len) & PAGE_MASK;
	do {
hugepage_recheck:
		if (touches_hugepage_low_range(mm, addr, len)) {
			addr = (addr & ((~0) << SID_SHIFT)) - len;
			goto hugepage_recheck;
		} else if (touches_hugepage_high_range(mm, addr, len)) {
			addr = (addr & ((~0UL) << HTLB_AREA_SHIFT)) - len;
			goto hugepage_recheck;
		}

		/*
		 * Lookup failure means no vma is above this address,
		 * i.e. return with success:
		 */
		if (!(vma = find_vma_prev(mm, addr, &prev_vma)))
			return addr;

		/*
		 * new region fits between prev_vma->vm_end and
		 * vma->vm_start, use it:
		 */
		if (addr+len <= vma->vm_start &&
		          (!prev_vma || (addr >= prev_vma->vm_end))) {
			/* remember the address as a hint for next time */
			mm->cached_hole_size = largest_hole;
			return (mm->free_area_cache = addr);
		} else {
			/* pull free_area_cache down to the first hole */
			if (mm->free_area_cache == vma->vm_end) {
				mm->free_area_cache = vma->vm_start;
				mm->cached_hole_size = largest_hole;
			}
		}

		/* remember the largest hole we saw so far */
		if (addr + largest_hole < vma->vm_start)
			largest_hole = vma->vm_start - addr;

		/* try just below the current vma->vm_start */
		addr = vma->vm_start-len;
	} while (len <= vma->vm_start);

fail:
	/*
	 * if hint left us with no space for the requested
	 * mapping then try again:
	 */
	if (first_time) {
		mm->free_area_cache = base;
		largest_hole = 0;
		first_time = 0;
		goto try_again;
	}
	/*
	 * A failed mmap() very likely causes application failure,
	 * so fall back to the bottom-up function here. This scenario
	 * can happen with large stack limits and large mmap()
	 * allocations.
	 */
	mm->free_area_cache = TASK_UNMAPPED_BASE;
	mm->cached_hole_size = ~0UL;
	addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
	/*
	 * Restore the topdown base:
	 */
	mm->free_area_cache = base;
	mm->cached_hole_size = ~0UL;

	return addr;
}

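/*
 * Check whether the hinted range [addr, addr+len) is free of VMAs.
 * Returns 0 if it is, -ENOMEM otherwise.
 */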
static int htlb_check_hinted_area(unsigned long addr, unsigned long len)
{
	struct vm_area_struct *vma;

	vma = find_vma(current->mm, addr);
	if (!vma || ((addr + len) <= vma->vm_start))
		return 0;

	return -ENOMEM;
}

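/*
 * Scan the low (below 4GB) segments selected by segmask for a free,
 * hugepage-aligned range of len bytes.  Returns the address found, or
 * -ENOMEM if nothing fits.
 */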
static unsigned long htlb_get_low_area(unsigned long len, u16 segmask)
{
	unsigned long addr = 0;
	struct vm_area_struct *vma;

	vma = find_vma(current->mm, addr);
	while (addr + len <= 0x100000000UL) {
		BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */

		if (! __within_hugepage_low_range(addr, len, segmask)) {
			addr = ALIGN(addr+1, 1<<SID_SHIFT);
			vma = find_vma(current->mm, addr);
			continue;
		}

		if (!vma || (addr + len) <= vma->vm_start)
			return addr;
		addr = ALIGN(vma->vm_end, HPAGE_SIZE);
		/* Depending on segmask this might not be a confirmed
		 * hugepage region, so the ALIGN could have skipped
		 * some VMAs */
		vma = find_vma(current->mm, addr);
	}

	return -ENOMEM;
}

static unsigned long htlb_get_high_area(unsigned long len, u16 areamask)
{
	unsigned long addr = 0x100000000UL;
	struct vm_area_struct *vma;

	vma = find_vma(current->mm, addr);
	while (addr + len <= TASK_SIZE_USER64) {
		BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */

		if (! __within_hugepage_high_range(addr, len, areamask)) {
			addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT);
			vma = find_vma(current->mm, addr);
			continue;
		}

		if (!vma || (addr + len) <= vma->vm_start)
			return addr;
		addr = ALIGN(vma->vm_end, HPAGE_SIZE);
		/* Depending on areamask this might not be a confirmed
		 * hugepage region, so the ALIGN could have skipped
		 * some VMAs */
		vma = find_vma(current->mm, addr);
	}

	return -ENOMEM;
}

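/*
 * Find a place for a hugepage mapping: try the caller's hint first, then
 * the hugepage areas already open for this mm, and finally try opening
 * additional areas one at a time until the request fits or we run out of
 * candidates.  32-bit tasks use the low (segment) areas, 64-bit tasks the
 * high areas above 4GB.
 */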
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
					unsigned long len, unsigned long pgoff,
					unsigned long flags)
{
	int lastshift;
	u16 areamask, curareas;

	if (HPAGE_SHIFT == 0)
		return -EINVAL;
	if (len & ~HPAGE_MASK)
		return -EINVAL;

	if (!cpu_has_feature(CPU_FTR_16M_PAGE))
		return -EINVAL;

	/* Paranoia, caller should have dealt with this */
	BUG_ON((addr + len) < addr);

	if (test_thread_flag(TIF_32BIT)) {
		/* Paranoia, caller should have dealt with this */
		BUG_ON((addr + len) > 0x100000000UL);

		curareas = current->mm->context.low_htlb_areas;

		/* First see if we can use the hint address */
		if (addr && (htlb_check_hinted_area(addr, len) == 0)) {
			areamask = LOW_ESID_MASK(addr, len);
			if (open_low_hpage_areas(current->mm, areamask) == 0)
				return addr;
		}

		/* Next see if we can map in the existing low areas */
		addr = htlb_get_low_area(len, curareas);
		if (addr != -ENOMEM)
			return addr;

		/* Finally go looking for areas to open */
		lastshift = 0;
		for (areamask = LOW_ESID_MASK(0x100000000UL-len, len);
		     ! lastshift; areamask >>=1) {
			if (areamask & 1)
				lastshift = 1;

			addr = htlb_get_low_area(len, curareas | areamask);
			if ((addr != -ENOMEM)
			    && open_low_hpage_areas(current->mm, areamask) == 0)
				return addr;
		}
	} else {
		curareas = current->mm->context.high_htlb_areas;

		/* First see if we can use the hint address */
		/* We discourage 64-bit processes from doing hugepage
		 * mappings below 4GB (must use MAP_FIXED) */
		if ((addr >= 0x100000000UL)
		    && (htlb_check_hinted_area(addr, len) == 0)) {
			areamask = HTLB_AREA_MASK(addr, len);
			if (open_high_hpage_areas(current->mm, areamask) == 0)
				return addr;
		}

		/* Next see if we can map in the existing high areas */
		addr = htlb_get_high_area(len, curareas);
		if (addr != -ENOMEM)
			return addr;

		/* Finally go looking for areas to open */
		lastshift = 0;
		for (areamask = HTLB_AREA_MASK(TASK_SIZE_USER64-len, len);
		     ! lastshift; areamask >>=1) {
			if (areamask & 1)
				lastshift = 1;

			addr = htlb_get_high_area(len, curareas | areamask);
			if ((addr != -ENOMEM)
			    && open_high_hpage_areas(current->mm, areamask) == 0)
				return addr;
		}
	}
	printk(KERN_DEBUG "hugetlb_get_unmapped_area() unable to open"
	       " enough areas\n");
	return -ENOMEM;
}

/*
 * Called by asm hashtable.S for doing lazy icache flush
 */
static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
						  pte_t pte, int trap)
{
	struct page *page;
	int i;

	if (!pfn_valid(pte_pfn(pte)))
		return rflags;

	page = pte_page(pte);

	/* page is dirty */
	if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
		if (trap == 0x400) {
			for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++)
				__flush_dcache_icache(page_address(page+i));
			set_bit(PG_arch_1, &page->flags);
		} else {
			rflags |= HPTE_R_N;
		}
	}
	return rflags;
}

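/*
 * Hash-fault handler for addresses in the hugepage region: look up the
 * Linux huge PTE, check access permissions, mark the PTE BUSY/ACCESSED
 * atomically, then create or update the matching hash-table entry
 * (falling back to the secondary group, and evicting an existing entry
 * if both groups are full).  Returns 0 on success, 1 to send the fault
 * up to do_page_fault().
 */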
int hash_huge_page(struct mm_struct *mm, unsigned long access,
		   unsigned long ea, unsigned long vsid, int local,
		   unsigned long trap)
{
	pte_t *ptep;
	unsigned long old_pte, new_pte;
	unsigned long va, rflags, pa;
	long slot;
	int err = 1;

	ptep = huge_pte_offset(mm, ea);

	/* Search the Linux page table for a match with va */
	va = (vsid << 28) | (ea & 0x0fffffff);

	/*
	 * If no pte found or not present, send the problem up to
	 * do_page_fault
	 */
	if (unlikely(!ptep || pte_none(*ptep)))
		goto out;

	/*
	 * Check the user's access rights to the page.  If access should be
	 * prevented then send the problem up to do_page_fault.
	 */
	if (unlikely(access & ~pte_val(*ptep)))
		goto out;
	/*
	 * At this point, we have a pte (old_pte) which can be used to build
	 * or update an HPTE. There are 2 cases:
	 *
	 * 1. There is a valid (present) pte with no associated HPTE (this is
	 *	the most common case)
	 * 2. There is a valid (present) pte with an associated HPTE. The
	 *	current values of the pp bits in the HPTE prevent access
	 *	because we are doing software DIRTY bit management and the
	 *	page is currently not DIRTY.
	 */

	do {
		old_pte = pte_val(*ptep);
		if (old_pte & _PAGE_BUSY)
			goto out;
		new_pte = old_pte | _PAGE_BUSY |
			_PAGE_ACCESSED | _PAGE_HASHPTE;
	} while(old_pte != __cmpxchg_u64((unsigned long *)ptep,
					 old_pte, new_pte));

	rflags = 0x2 | (!(new_pte & _PAGE_RW));
	/* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
	rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
	if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
		/* No CPU has hugepages but lacks no-execute, so we
		 * don't need to worry about that case */
		rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
						       trap);

	/* Check if pte already has an hpte (case 2) */
	if (unlikely(old_pte & _PAGE_HASHPTE)) {
		/* There MIGHT be an HPTE for this pte */
		unsigned long hash, slot;

		hash = hpt_hash(va, HPAGE_SHIFT);
		if (old_pte & _PAGE_F_SECOND)
			hash = ~hash;
		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
		slot += (old_pte & _PAGE_F_GIX) >> 12;

		if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_huge_psize,
					 local) == -1)
			old_pte &= ~_PAGE_HPTEFLAGS;
	}

	if (likely(!(old_pte & _PAGE_HASHPTE))) {
		unsigned long hash = hpt_hash(va, HPAGE_SHIFT);
		unsigned long hpte_group;

		pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;

repeat:
		hpte_group = ((hash & htab_hash_mask) *
			      HPTES_PER_GROUP) & ~0x7UL;

		/* clear HPTE slot information in new PTE */
		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;

		/* Add in WIMG bits */
		/* XXX We should store these in the pte */
		/* --BenH: I think they are ... */
		rflags |= _PAGE_COHERENT;

		/* Insert into the hash table, primary slot */
		slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
					  mmu_huge_psize);

		/* Primary is full, try the secondary */
		if (unlikely(slot == -1)) {
			new_pte |= _PAGE_F_SECOND;
			hpte_group = ((~hash & htab_hash_mask) *
				      HPTES_PER_GROUP) & ~0x7UL;
			slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
						  HPTE_V_SECONDARY,
						  mmu_huge_psize);
			if (slot == -1) {
				if (mftb() & 0x1)
					hpte_group = ((hash & htab_hash_mask) *
						      HPTES_PER_GROUP)&~0x7UL;

				ppc_md.hpte_remove(hpte_group);
				goto repeat;
			}
		}

		if (unlikely(slot == -2))
			panic("hash_huge_page: pte_insert failed\n");

		new_pte |= (slot << 12) & _PAGE_F_GIX;
	}

	/*
	 * No need to use ldarx/stdcx here
	 */
	*ptep = __pte(new_pte & ~_PAGE_BUSY);

	err = 0;

 out:
	return err;
}