xref: /linux/arch/powerpc/mm/hugetlbpage.c (revision 7b12b9137930eb821b68e1bfa11e9de692208620)
1 /*
2  * PPC64 (POWER4) Huge TLB Page Support for Kernel.
3  *
4  * Copyright (C) 2003 David Gibson, IBM Corporation.
5  *
6  * Based on the IA-32 version:
7  * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
8  */
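
/*
 * Rough map of how these routines get used (a sketch, not a spec):
 * hugetlb_get_unmapped_area() and prepare_hugepage_range() choose and
 * "open" address areas for huge mappings, huge_pte_alloc() /
 * huge_pte_offset() manage the hugepte tables hanging off the normal
 * page tables, set_huge_pte_at() / huge_ptep_get_and_clear() are the
 * pte accessors used by the generic hugetlb code, and
 * hash_huge_page() fills in the hardware hash table when a huge page
 * is touched.
 *
 * A minimal userspace sketch that exercises this path, assuming a
 * hugetlbfs mount at /mnt/huge (the path and the 16MB length are
 * purely illustrative); the first store through p then faults in a
 * huge page:
 *
 *	int fd = open("/mnt/huge/example", O_CREAT | O_RDWR, 0600);
 *	char *p = mmap(NULL, 16UL << 20, PROT_READ | PROT_WRITE,
 *		       MAP_SHARED, fd, 0);
 *	p[0] = 1;
 */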
9 
10 #include <linux/init.h>
11 #include <linux/fs.h>
12 #include <linux/mm.h>
13 #include <linux/hugetlb.h>
14 #include <linux/pagemap.h>
15 #include <linux/smp_lock.h>
16 #include <linux/slab.h>
17 #include <linux/err.h>
18 #include <linux/sysctl.h>
19 #include <asm/mman.h>
20 #include <asm/pgalloc.h>
21 #include <asm/tlb.h>
22 #include <asm/tlbflush.h>
23 #include <asm/mmu_context.h>
24 #include <asm/machdep.h>
25 #include <asm/cputable.h>
27 
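/*
 * Huge pages live in dedicated address "areas", tracked per-mm as
 * bitmaps in mm->context: the low areas are the 16 x 256MB SLB
 * segments below 4GB (SID_SHIFT is 28), the high areas divide the
 * whole user range into 1 << HTLB_AREA_SHIFT sized chunks (the first
 * one is treated as starting at 4GB so the two bitmaps never
 * overlap).  A set bit means the area has been opened for huge pages
 * and normal mmap()s are steered around it.
 */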
30 #define NUM_LOW_AREAS	(0x100000000UL >> SID_SHIFT)
31 #define NUM_HIGH_AREAS	(PGTABLE_RANGE >> HTLB_AREA_SHIFT)
32 
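/*
 * One hugepd points at a table of PTRS_PER_HUGEPTE hugeptes, so it
 * maps HUGEPD_SIZE == PTRS_PER_HUGEPTE * HPAGE_SIZE of address space.
 * Rough worked example (exact values depend on the page size
 * configuration): with 16MB huge pages and a 4-bit hugepte index that
 * is 16 * 16MB == 256MB, i.e. one hugepd per 256MB segment.
 */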
33 #ifdef CONFIG_PPC_64K_PAGES
34 #define HUGEPTE_INDEX_SIZE	(PMD_SHIFT-HPAGE_SHIFT)
35 #else
36 #define HUGEPTE_INDEX_SIZE	(PUD_SHIFT-HPAGE_SHIFT)
37 #endif
38 #define PTRS_PER_HUGEPTE	(1 << HUGEPTE_INDEX_SIZE)
39 #define HUGEPTE_TABLE_SIZE	(sizeof(pte_t) << HUGEPTE_INDEX_SIZE)
40 
41 #define HUGEPD_SHIFT		(HPAGE_SHIFT + HUGEPTE_INDEX_SIZE)
42 #define HUGEPD_SIZE		(1UL << HUGEPD_SHIFT)
43 #define HUGEPD_MASK		(~(HUGEPD_SIZE-1))
44 
45 #define huge_pgtable_cache	(pgtable_cache[HUGEPTE_CACHE_NUM])
46 
47 /* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
48  * will choke on pointers to hugepte tables, which is handy for
49  * catching screwups early. */
50 #define HUGEPD_OK	0x1
51 
52 typedef struct { unsigned long pd; } hugepd_t;
53 
54 #define hugepd_none(hpd)	((hpd).pd == 0)
55 
56 static inline pte_t *hugepd_page(hugepd_t hpd)
57 {
58 	BUG_ON(!(hpd.pd & HUGEPD_OK));
59 	return (pte_t *)(hpd.pd & ~HUGEPD_OK);
60 }
61 
62 static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr)
63 {
64 	unsigned long idx = ((addr >> HPAGE_SHIFT) & (PTRS_PER_HUGEPTE-1));
65 	pte_t *dir = hugepd_page(*hpdp);
66 
67 	return dir + idx;
68 }
69 
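/* Allocate a hugepte table and hook it into *hpdp.  The allocation is
 * done without the page table lock held; if another thread beat us to
 * populating the hugepd we just free our table and use theirs, the
 * same pattern the generic pte allocation code follows. */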
70 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
71 			   unsigned long address)
72 {
73 	pte_t *new = kmem_cache_alloc(huge_pgtable_cache,
74 				      GFP_KERNEL|__GFP_REPEAT);
75 
76 	if (! new)
77 		return -ENOMEM;
78 
79 	spin_lock(&mm->page_table_lock);
80 	if (!hugepd_none(*hpdp))
81 		kmem_cache_free(huge_pgtable_cache, new);
82 	else
83 		hpdp->pd = (unsigned long)new | HUGEPD_OK;
84 	spin_unlock(&mm->page_table_lock);
85 	return 0;
86 }
87 
88 /* Modelled after find_linux_pte() */
89 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
90 {
91 	pgd_t *pg;
92 	pud_t *pu;
93 
94 	BUG_ON(! in_hugepage_area(mm->context, addr));
95 
96 	addr &= HPAGE_MASK;
97 
98 	pg = pgd_offset(mm, addr);
99 	if (!pgd_none(*pg)) {
100 		pu = pud_offset(pg, addr);
101 		if (!pud_none(*pu)) {
102 #ifdef CONFIG_PPC_64K_PAGES
103 			pmd_t *pm;
104 			pm = pmd_offset(pu, addr);
105 			if (!pmd_none(*pm))
106 				return hugepte_offset((hugepd_t *)pm, addr);
107 #else
108 			return hugepte_offset((hugepd_t *)pu, addr);
109 #endif
110 		}
111 	}
112 
113 	return NULL;
114 }
115 
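/* Walk, allocating as needed, down to the slot that holds the hugepd
 * (a PMD slot with 64K base pages, a PUD slot with 4K base pages),
 * then make sure a hugepte table is installed there and return the
 * hugepte for addr. */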
116 pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
117 {
118 	pgd_t *pg;
119 	pud_t *pu;
120 	hugepd_t *hpdp = NULL;
121 
122 	BUG_ON(! in_hugepage_area(mm->context, addr));
123 
124 	addr &= HPAGE_MASK;
125 
126 	pg = pgd_offset(mm, addr);
127 	pu = pud_alloc(mm, pg, addr);
128 
129 	if (pu) {
130 #ifdef CONFIG_PPC_64K_PAGES
131 		pmd_t *pm;
132 		pm = pmd_alloc(mm, pu, addr);
133 		if (pm)
134 			hpdp = (hugepd_t *)pm;
135 #else
136 		hpdp = (hugepd_t *)pu;
137 #endif
138 	}
139 
140 	if (! hpdp)
141 		return NULL;
142 
143 	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr))
144 		return NULL;
145 
146 	return hugepte_offset(hpdp, addr);
147 }
148 
149 static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp)
150 {
151 	pte_t *hugepte = hugepd_page(*hpdp);
152 
153 	hpdp->pd = 0;
154 	tlb->need_flush = 1;
155 	pgtable_free_tlb(tlb, pgtable_free_cache(hugepte, HUGEPTE_CACHE_NUM,
156 						 HUGEPTE_TABLE_SIZE-1));
157 }
158 
159 #ifdef CONFIG_PPC_64K_PAGES
160 static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
161 				   unsigned long addr, unsigned long end,
162 				   unsigned long floor, unsigned long ceiling)
163 {
164 	pmd_t *pmd;
165 	unsigned long next;
166 	unsigned long start;
167 
168 	start = addr;
169 	pmd = pmd_offset(pud, addr);
170 	do {
171 		next = pmd_addr_end(addr, end);
172 		if (pmd_none(*pmd))
173 			continue;
174 		free_hugepte_range(tlb, (hugepd_t *)pmd);
175 	} while (pmd++, addr = next, addr != end);
176 
177 	start &= PUD_MASK;
178 	if (start < floor)
179 		return;
180 	if (ceiling) {
181 		ceiling &= PUD_MASK;
182 		if (!ceiling)
183 			return;
184 	}
185 	if (end - 1 > ceiling - 1)
186 		return;
187 
188 	pmd = pmd_offset(pud, start);
189 	pud_clear(pud);
190 	pmd_free_tlb(tlb, pmd);
191 }
192 #endif
193 
194 static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
195 				   unsigned long addr, unsigned long end,
196 				   unsigned long floor, unsigned long ceiling)
197 {
198 	pud_t *pud;
199 	unsigned long next;
200 	unsigned long start;
201 
202 	start = addr;
203 	pud = pud_offset(pgd, addr);
204 	do {
205 		next = pud_addr_end(addr, end);
206 #ifdef CONFIG_PPC_64K_PAGES
207 		if (pud_none_or_clear_bad(pud))
208 			continue;
209 		hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling);
210 #else
211 		if (pud_none(*pud))
212 			continue;
213 		free_hugepte_range(tlb, (hugepd_t *)pud);
214 #endif
215 	} while (pud++, addr = next, addr != end);
216 
217 	start &= PGDIR_MASK;
218 	if (start < floor)
219 		return;
220 	if (ceiling) {
221 		ceiling &= PGDIR_MASK;
222 		if (!ceiling)
223 			return;
224 	}
225 	if (end - 1 > ceiling - 1)
226 		return;
227 
228 	pud = pud_offset(pgd, start);
229 	pgd_clear(pgd);
230 	pud_free_tlb(tlb, pud);
231 }
232 
233 /*
234  * This function frees user-level page tables of a process.
235  *
236  * Must be called with pagetable lock held.
237  */
238 void hugetlb_free_pgd_range(struct mmu_gather **tlb,
239 			    unsigned long addr, unsigned long end,
240 			    unsigned long floor, unsigned long ceiling)
241 {
242 	pgd_t *pgd;
243 	unsigned long next;
244 	unsigned long start;
245 
246 	/*
247 	 * Comments below taken from the normal free_pgd_range().  They
248 	 * apply here too.  The tests against HUGEPD_MASK below are
249 	 * essential, because we *don't* test for this at the bottom
250 	 * level.  Without them we'll attempt to free a hugepte table
251 	 * when we unmap just part of it, even if there are other
252 	 * active mappings using it.
253 	 *
254 	 * The next few lines have given us lots of grief...
255 	 *
256 	 * Why are we testing HUGEPD* at this top level?  Because
257 	 * often there will be no work to do at all, and we'd prefer
258 	 * not to go all the way down to the bottom just to discover
259 	 * that.
260 	 *
261 	 * Why all these "- 1"s?  Because 0 represents both the bottom
262 	 * of the address space and the top of it (using -1 for the
263 	 * top wouldn't help much: the masks would do the wrong thing).
264 	 * The rule is that addr 0 and floor 0 refer to the bottom of
265 	 * the address space, but end 0 and ceiling 0 refer to the top.
266 	 * Comparisons need to use "end - 1" and "ceiling - 1" (though
267 	 * that end 0 case should be mythical).
268 	 *
269 	 * Wherever addr is brought up or ceiling brought down, we
270 	 * must be careful to reject "the opposite 0" before it
271 	 * confuses the subsequent tests.  But what about where end is
272 	 * brought down by HUGEPD_SIZE below? no, end can't go down to
273 	 * 0 there.
274 	 *
275 	 * Whereas we round start (addr) and ceiling down, by different
276 	 * masks at different levels, in order to test whether a table
277 	 * now has no other vmas using it, so can be freed, we don't
278 	 * bother to round floor or end up - the tests don't need that.
279 	 */
280 
281 	addr &= HUGEPD_MASK;
282 	if (addr < floor) {
283 		addr += HUGEPD_SIZE;
284 		if (!addr)
285 			return;
286 	}
287 	if (ceiling) {
288 		ceiling &= HUGEPD_MASK;
289 		if (!ceiling)
290 			return;
291 	}
292 	if (end - 1 > ceiling - 1)
293 		end -= HUGEPD_SIZE;
294 	if (addr > end - 1)
295 		return;
296 
297 	start = addr;
298 	pgd = pgd_offset((*tlb)->mm, addr);
299 	do {
300 		BUG_ON(! in_hugepage_area((*tlb)->mm->context, addr));
301 		next = pgd_addr_end(addr, end);
302 		if (pgd_none_or_clear_bad(pgd))
303 			continue;
304 		hugetlb_free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
305 	} while (pgd++, addr = next, addr != end);
306 }
307 
308 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
309 		     pte_t *ptep, pte_t pte)
310 {
311 	if (pte_present(*ptep)) {
312 		/* We open-code pte_clear because we need to pass the right
313 		 * argument to hpte_update (huge / !huge)
314 		 */
315 		unsigned long old = pte_update(ptep, ~0UL);
316 		if (old & _PAGE_HASHPTE)
317 			hpte_update(mm, addr & HPAGE_MASK, ptep, old, 1);
318 		flush_tlb_pending();
319 	}
320 	*ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
321 }
322 
323 pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
324 			      pte_t *ptep)
325 {
326 	unsigned long old = pte_update(ptep, ~0UL);
327 
328 	if (old & _PAGE_HASHPTE)
329 		hpte_update(mm, addr & HPAGE_MASK, ptep, old, 1);
330 	*ptep = __pte(0);
331 
332 	return __pte(old);
333 }
334 
335 struct slb_flush_info {
336 	struct mm_struct *mm;
337 	u16 newareas;
338 };
339 
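/* IPI handler: invalidate the SLB entry for every 256MB segment just
 * opened for huge pages, so the next SLB miss reloads it with the new
 * huge-page attributes from the freshly copied paca context.  Only a
 * CPU currently running this mm has anything to do. */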
340 static void flush_low_segments(void *parm)
341 {
342 	struct slb_flush_info *fi = parm;
343 	unsigned long i;
344 
345 	BUILD_BUG_ON((sizeof(fi->newareas)*8) != NUM_LOW_AREAS);
346 
347 	if (current->active_mm != fi->mm)
348 		return;
349 
350 	/* Only need to do anything if this CPU is working in the same
351 	 * mm as the one which has changed */
352 
353 	/* update the paca copy of the context struct */
354 	get_paca()->context = current->active_mm->context;
355 
356 	asm volatile("isync" : : : "memory");
357 	for (i = 0; i < NUM_LOW_AREAS; i++) {
358 		if (! (fi->newareas & (1U << i)))
359 			continue;
360 		asm volatile("slbie %0"
361 			     : : "r" ((i << SID_SHIFT) | SLBIE_C));
362 	}
363 	asm volatile("isync" : : : "memory");
364 }
365 
366 static void flush_high_segments(void *parm)
367 {
368 	struct slb_flush_info *fi = parm;
369 	unsigned long i, j;
370 
372 	BUILD_BUG_ON((sizeof(fi->newareas)*8) != NUM_HIGH_AREAS);
373 
374 	if (current->active_mm != fi->mm)
375 		return;
376 
377 	/* Only need to do anything if this CPU is working in the same
378 	 * mm as the one which has changed */
379 
380 	/* update the paca copy of the context struct */
381 	get_paca()->context = current->active_mm->context;
382 
383 	asm volatile("isync" : : : "memory");
384 	for (i = 0; i < NUM_HIGH_AREAS; i++) {
385 		if (! (fi->newareas & (1U << i)))
386 			continue;
387 		for (j = 0; j < (1UL << (HTLB_AREA_SHIFT-SID_SHIFT)); j++)
388 			asm volatile("slbie %0"
389 				     :: "r" (((i << HTLB_AREA_SHIFT)
390 					      + (j << SID_SHIFT)) | SLBIE_C));
391 	}
392 	asm volatile("isync" : : : "memory");
393 }
394 
395 static int prepare_low_area_for_htlb(struct mm_struct *mm, unsigned long area)
396 {
397 	unsigned long start = area << SID_SHIFT;
398 	unsigned long end = (area+1) << SID_SHIFT;
399 	struct vm_area_struct *vma;
400 
401 	BUG_ON(area >= NUM_LOW_AREAS);
402 
403 	/* Check no VMAs are in the region */
404 	vma = find_vma(mm, start);
405 	if (vma && (vma->vm_start < end))
406 		return -EBUSY;
407 
408 	return 0;
409 }
410 
411 static int prepare_high_area_for_htlb(struct mm_struct *mm, unsigned long area)
412 {
413 	unsigned long start = area << HTLB_AREA_SHIFT;
414 	unsigned long end = (area+1) << HTLB_AREA_SHIFT;
415 	struct vm_area_struct *vma;
416 
417 	BUG_ON(area >= NUM_HIGH_AREAS);
418 
419 	/* Hack: so that each address is controlled by exactly one
420 	 * of the high or low area bitmaps, the first high area starts
421 	 * at 4GB, not 0 */
422 	if (start == 0)
423 		start = 0x100000000UL;
424 
425 	/* Check no VMAs are in the region */
426 	vma = find_vma(mm, start);
427 	if (vma && (vma->vm_start < end))
428 		return -EBUSY;
429 
430 	return 0;
431 }
432 
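/* Mark the requested low areas as huge-page-only for this mm: drop
 * areas that are already open, fail if a normal VMA already lives in
 * one of them, then update the context bitmap and IPI all CPUs so any
 * stale SLB entries for those segments are flushed. */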
433 static int open_low_hpage_areas(struct mm_struct *mm, u16 newareas)
434 {
435 	unsigned long i;
436 	struct slb_flush_info fi;
437 
438 	BUILD_BUG_ON((sizeof(newareas)*8) != NUM_LOW_AREAS);
439 	BUILD_BUG_ON((sizeof(mm->context.low_htlb_areas)*8) != NUM_LOW_AREAS);
440 
441 	newareas &= ~(mm->context.low_htlb_areas);
442 	if (! newareas)
443 		return 0; /* The segments we want are already open */
444 
445 	for (i = 0; i < NUM_LOW_AREAS; i++)
446 		if ((1 << i) & newareas)
447 			if (prepare_low_area_for_htlb(mm, i) != 0)
448 				return -EBUSY;
449 
450 	mm->context.low_htlb_areas |= newareas;
451 
452 	/* the context change must make it to memory before the flush,
453 	 * so that further SLB misses do the right thing. */
454 	mb();
455 
456 	fi.mm = mm;
457 	fi.newareas = newareas;
458 	on_each_cpu(flush_low_segments, &fi, 0, 1);
459 
460 	return 0;
461 }
462 
463 static int open_high_hpage_areas(struct mm_struct *mm, u16 newareas)
464 {
465 	struct slb_flush_info fi;
466 	unsigned long i;
467 
468 	BUILD_BUG_ON((sizeof(newareas)*8) != NUM_HIGH_AREAS);
469 	BUILD_BUG_ON((sizeof(mm->context.high_htlb_areas)*8)
470 		     != NUM_HIGH_AREAS);
471 
472 	newareas &= ~(mm->context.high_htlb_areas);
473 	if (! newareas)
474 		return 0; /* The areas we want are already open */
475 
476 	for (i = 0; i < NUM_HIGH_AREAS; i++)
477 		if ((1 << i) & newareas)
478 			if (prepare_high_area_for_htlb(mm, i) != 0)
479 				return -EBUSY;
480 
481 	mm->context.high_htlb_areas |= newareas;
482 
483 	/* update the paca copy of the context struct */
484 	get_paca()->context = mm->context;
485 
486 	/* the context change must make it to memory before the flush,
487 	 * so that further SLB misses do the right thing. */
488 	mb();
489 
490 	fi.mm = mm;
491 	fi.newareas = newareas;
492 	on_each_cpu(flush_high_segments, &fi, 0, 1);
493 
494 	return 0;
495 }
496 
497 int prepare_hugepage_range(unsigned long addr, unsigned long len)
498 {
499 	int err = 0;
500 
501 	if ( (addr+len) < addr )
502 		return -EINVAL;
503 
504 	if (addr < 0x100000000UL)
505 		err = open_low_hpage_areas(current->mm,
506 					  LOW_ESID_MASK(addr, len));
507 	if (!err && ((addr + len) > 0x100000000UL))
508 		err = open_high_hpage_areas(current->mm,
509 					    HTLB_AREA_MASK(addr, len));
510 	if (err) {
511 		printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)"
512 		       " failed (lowmask: 0x%04hx, highmask: 0x%04hx)\n",
513 		       addr, len,
514 		       LOW_ESID_MASK(addr, len), HTLB_AREA_MASK(addr, len));
515 		return err;
516 	}
517 
518 	return 0;
519 }
520 
521 struct page *
522 follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
523 {
524 	pte_t *ptep;
525 	struct page *page;
526 
527 	if (! in_hugepage_area(mm->context, address))
528 		return ERR_PTR(-EINVAL);
529 
530 	ptep = huge_pte_offset(mm, address);
531 	page = pte_page(*ptep);
532 	if (page)
533 		page += (address % HPAGE_SIZE) / PAGE_SIZE;
534 
535 	return page;
536 }
537 
538 int pmd_huge(pmd_t pmd)
539 {
540 	return 0;
541 }
542 
543 struct page *
544 follow_huge_pmd(struct mm_struct *mm, unsigned long address,
545 		pmd_t *pmd, int write)
546 {
547 	BUG();
548 	return NULL;
549 }
550 
551 /* Because we have an exclusive hugepage region which lies within the
552  * normal user address space, we have to take special measures to make
553  * non-huge mmap()s evade the hugepage reserved regions. */
554 unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
555 				     unsigned long len, unsigned long pgoff,
556 				     unsigned long flags)
557 {
558 	struct mm_struct *mm = current->mm;
559 	struct vm_area_struct *vma;
560 	unsigned long start_addr;
561 
562 	if (len > TASK_SIZE)
563 		return -ENOMEM;
564 
565 	if (addr) {
566 		addr = PAGE_ALIGN(addr);
567 		vma = find_vma(mm, addr);
568 		if (((TASK_SIZE - len) >= addr)
569 		    && (!vma || (addr+len) <= vma->vm_start)
570 		    && !is_hugepage_only_range(mm, addr,len))
571 			return addr;
572 	}
573 	if (len > mm->cached_hole_size) {
574 	        start_addr = addr = mm->free_area_cache;
575 	} else {
576 	        start_addr = addr = TASK_UNMAPPED_BASE;
577 	        mm->cached_hole_size = 0;
578 	}
579 
580 full_search:
581 	vma = find_vma(mm, addr);
582 	while (TASK_SIZE - len >= addr) {
583 		BUG_ON(vma && (addr >= vma->vm_end));
584 
585 		if (touches_hugepage_low_range(mm, addr, len)) {
586 			addr = ALIGN(addr+1, 1<<SID_SHIFT);
587 			vma = find_vma(mm, addr);
588 			continue;
589 		}
590 		if (touches_hugepage_high_range(mm, addr, len)) {
591 			addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT);
592 			vma = find_vma(mm, addr);
593 			continue;
594 		}
595 		if (!vma || addr + len <= vma->vm_start) {
596 			/*
597 			 * Remember the place where we stopped the search:
598 			 */
599 			mm->free_area_cache = addr + len;
600 			return addr;
601 		}
602 		if (addr + mm->cached_hole_size < vma->vm_start)
603 		        mm->cached_hole_size = vma->vm_start - addr;
604 		addr = vma->vm_end;
605 		vma = vma->vm_next;
606 	}
607 
608 	/* Make sure we didn't miss any holes */
609 	if (start_addr != TASK_UNMAPPED_BASE) {
610 		start_addr = addr = TASK_UNMAPPED_BASE;
611 		mm->cached_hole_size = 0;
612 		goto full_search;
613 	}
614 	return -ENOMEM;
615 }
616 
617 /*
618  * This mmap-allocator allocates new areas top-down from below the
619  * stack's low limit (the base):
620  *
621  * Because we have an exclusive hugepage region which lies within the
622  * normal user address space, we have to take special measures to make
623  * non-huge mmap()s evade the hugepage reserved regions.
624  */
625 unsigned long
626 arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
627 			  const unsigned long len, const unsigned long pgoff,
628 			  const unsigned long flags)
629 {
630 	struct vm_area_struct *vma, *prev_vma;
631 	struct mm_struct *mm = current->mm;
632 	unsigned long base = mm->mmap_base, addr = addr0;
633 	unsigned long largest_hole = mm->cached_hole_size;
634 	int first_time = 1;
635 
636 	/* requested length too big for entire address space */
637 	if (len > TASK_SIZE)
638 		return -ENOMEM;
639 
640 	/* dont allow allocations above current base */
641 	if (mm->free_area_cache > base)
642 		mm->free_area_cache = base;
643 
644 	/* requesting a specific address */
645 	if (addr) {
646 		addr = PAGE_ALIGN(addr);
647 		vma = find_vma(mm, addr);
648 		if (TASK_SIZE - len >= addr &&
649 				(!vma || addr + len <= vma->vm_start)
650 				&& !is_hugepage_only_range(mm, addr,len))
651 			return addr;
652 	}
653 
654 	if (len <= largest_hole) {
655 	        largest_hole = 0;
656 		mm->free_area_cache = base;
657 	}
658 try_again:
659 	/* make sure it can fit in the remaining address space */
660 	if (mm->free_area_cache < len)
661 		goto fail;
662 
663 	/* either no address requested or cant fit in requested address hole */
664 	addr = (mm->free_area_cache - len) & PAGE_MASK;
665 	do {
666 hugepage_recheck:
667 		if (touches_hugepage_low_range(mm, addr, len)) {
668 			addr = (addr & ((~0) << SID_SHIFT)) - len;
669 			goto hugepage_recheck;
670 		} else if (touches_hugepage_high_range(mm, addr, len)) {
671 			addr = (addr & ((~0UL) << HTLB_AREA_SHIFT)) - len;
672 			goto hugepage_recheck;
673 		}
674 
675 		/*
676 		 * Lookup failure means no vma is above this address,
677 		 * i.e. return with success:
678 		 */
679  	 	if (!(vma = find_vma_prev(mm, addr, &prev_vma)))
680 			return addr;
681 
682 		/*
683 		 * new region fits between prev_vma->vm_end and
684 		 * vma->vm_start, use it:
685 		 */
686 		if (addr+len <= vma->vm_start &&
687 		          (!prev_vma || (addr >= prev_vma->vm_end))) {
688 			/* remember the address as a hint for next time */
689 		        mm->cached_hole_size = largest_hole;
690 		        return (mm->free_area_cache = addr);
691 		} else {
692 			/* pull free_area_cache down to the first hole */
693 		        if (mm->free_area_cache == vma->vm_end) {
694 				mm->free_area_cache = vma->vm_start;
695 				mm->cached_hole_size = largest_hole;
696 			}
697 		}
698 
699 		/* remember the largest hole we saw so far */
700 		if (addr + largest_hole < vma->vm_start)
701 		        largest_hole = vma->vm_start - addr;
702 
703 		/* try just below the current vma->vm_start */
704 		addr = vma->vm_start-len;
705 	} while (len <= vma->vm_start);
706 
707 fail:
708 	/*
709 	 * if hint left us with no space for the requested
710 	 * mapping then try again:
711 	 */
712 	if (first_time) {
713 		mm->free_area_cache = base;
714 		largest_hole = 0;
715 		first_time = 0;
716 		goto try_again;
717 	}
718 	/*
719 	 * A failed mmap() very likely causes application failure,
720 	 * so fall back to the bottom-up function here. This scenario
721 	 * can happen with large stack limits and large mmap()
722 	 * allocations.
723 	 */
724 	mm->free_area_cache = TASK_UNMAPPED_BASE;
725 	mm->cached_hole_size = ~0UL;
726 	addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
727 	/*
728 	 * Restore the topdown base:
729 	 */
730 	mm->free_area_cache = base;
731 	mm->cached_hole_size = ~0UL;
732 
733 	return addr;
734 }
735 
736 static int htlb_check_hinted_area(unsigned long addr, unsigned long len)
737 {
738 	struct vm_area_struct *vma;
739 
740 	vma = find_vma(current->mm, addr);
741 	if (!vma || ((addr + len) <= vma->vm_start))
742 		return 0;
743 
744 	return -ENOMEM;
745 }
746 
747 static unsigned long htlb_get_low_area(unsigned long len, u16 segmask)
748 {
749 	unsigned long addr = 0;
750 	struct vm_area_struct *vma;
751 
752 	vma = find_vma(current->mm, addr);
753 	while (addr + len <= 0x100000000UL) {
754 		BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */
755 
756 		if (! __within_hugepage_low_range(addr, len, segmask)) {
757 			addr = ALIGN(addr+1, 1<<SID_SHIFT);
758 			vma = find_vma(current->mm, addr);
759 			continue;
760 		}
761 
762 		if (!vma || (addr + len) <= vma->vm_start)
763 			return addr;
764 		addr = ALIGN(vma->vm_end, HPAGE_SIZE);
765 		/* Depending on segmask this might not be a confirmed
766 		 * hugepage region, so the ALIGN could have skipped
767 		 * some VMAs */
768 		vma = find_vma(current->mm, addr);
769 	}
770 
771 	return -ENOMEM;
772 }
773 
774 static unsigned long htlb_get_high_area(unsigned long len, u16 areamask)
775 {
776 	unsigned long addr = 0x100000000UL;
777 	struct vm_area_struct *vma;
778 
779 	vma = find_vma(current->mm, addr);
780 	while (addr + len <= TASK_SIZE_USER64) {
781 		BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */
782 
783 		if (! __within_hugepage_high_range(addr, len, areamask)) {
784 			addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT);
785 			vma = find_vma(current->mm, addr);
786 			continue;
787 		}
788 
789 		if (!vma || (addr + len) <= vma->vm_start)
790 			return addr;
791 		addr = ALIGN(vma->vm_end, HPAGE_SIZE);
792 		/* Depending on areamask this might not be a confirmed
793 		 * hugepage region, so the ALIGN could have skipped
794 		 * some VMAs */
795 		vma = find_vma(current->mm, addr);
796 	}
797 
798 	return -ENOMEM;
799 }
800 
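/* Pick an address for a huge page mapping.  In order: honour the hint
 * address if that range is free and its areas can be opened; then
 * search the areas this mm already has open; finally try opening
 * additional areas, starting from the top of the usable range and
 * working down.  32-bit tasks use the low (below 4GB) areas, 64-bit
 * tasks the high ones. */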
801 unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
802 					unsigned long len, unsigned long pgoff,
803 					unsigned long flags)
804 {
805 	int lastshift;
806 	u16 areamask, curareas;
807 
808 	if (HPAGE_SHIFT == 0)
809 		return -EINVAL;
810 	if (len & ~HPAGE_MASK)
811 		return -EINVAL;
812 
813 	if (!cpu_has_feature(CPU_FTR_16M_PAGE))
814 		return -EINVAL;
815 
816 	/* Paranoia, caller should have dealt with this */
817 	BUG_ON((addr + len)  < addr);
818 
819 	if (test_thread_flag(TIF_32BIT)) {
820 		/* Paranoia, caller should have dealt with this */
821 		BUG_ON((addr + len) > 0x100000000UL);
822 
823 		curareas = current->mm->context.low_htlb_areas;
824 
825 		/* First see if we can use the hint address */
826 		if (addr && (htlb_check_hinted_area(addr, len) == 0)) {
827 			areamask = LOW_ESID_MASK(addr, len);
828 			if (open_low_hpage_areas(current->mm, areamask) == 0)
829 				return addr;
830 		}
831 
832 		/* Next see if we can map in the existing low areas */
833 		addr = htlb_get_low_area(len, curareas);
834 		if (addr != -ENOMEM)
835 			return addr;
836 
837 		/* Finally go looking for areas to open */
838 		lastshift = 0;
839 		for (areamask = LOW_ESID_MASK(0x100000000UL-len, len);
840 		     ! lastshift; areamask >>=1) {
841 			if (areamask & 1)
842 				lastshift = 1;
843 
844 			addr = htlb_get_low_area(len, curareas | areamask);
845 			if ((addr != -ENOMEM)
846 			    && open_low_hpage_areas(current->mm, areamask) == 0)
847 				return addr;
848 		}
849 	} else {
850 		curareas = current->mm->context.high_htlb_areas;
851 
852 		/* First see if we can use the hint address */
853 		/* We discourage 64-bit processes from doing hugepage
854 		 * mappings below 4GB (must use MAP_FIXED) */
855 		if ((addr >= 0x100000000UL)
856 		    && (htlb_check_hinted_area(addr, len) == 0)) {
857 			areamask = HTLB_AREA_MASK(addr, len);
858 			if (open_high_hpage_areas(current->mm, areamask) == 0)
859 				return addr;
860 		}
861 
862 		/* Next see if we can map in the existing high areas */
863 		addr = htlb_get_high_area(len, curareas);
864 		if (addr != -ENOMEM)
865 			return addr;
866 
867 		/* Finally go looking for areas to open */
868 		lastshift = 0;
869 		for (areamask = HTLB_AREA_MASK(TASK_SIZE_USER64-len, len);
870 		     ! lastshift; areamask >>=1) {
871 			if (areamask & 1)
872 				lastshift = 1;
873 
874 			addr = htlb_get_high_area(len, curareas | areamask);
875 			if ((addr != -ENOMEM)
876 			    && open_high_hpage_areas(current->mm, areamask) == 0)
877 				return addr;
878 		}
879 	}
880 	printk(KERN_DEBUG "hugetlb_get_unmapped_area() unable to open"
881 	       " enough areas\n");
882 	return -ENOMEM;
883 }
884 
885 /*
886  * Called by asm hashtable.S for doing lazy icache flush
887  */
888 static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
889 						  pte_t pte, int trap)
890 {
891 	struct page *page;
892 	int i;
893 
894 	if (!pfn_valid(pte_pfn(pte)))
895 		return rflags;
896 
897 	page = pte_page(pte);
898 
899 	/* page is dirty */
900 	if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
901 		if (trap == 0x400) {
902 			for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++)
903 				__flush_dcache_icache(page_address(page+i));
904 			set_bit(PG_arch_1, &page->flags);
905 		} else {
906 			rflags |= HPTE_R_N;
907 		}
908 	}
909 	return rflags;
910 }
911 
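/* Called from the hash fault path when a huge page address misses in
 * the hardware hash table: revalidate the Linux hugepte (atomically
 * setting _PAGE_BUSY and _PAGE_ACCESSED), then update the existing
 * HPTE or insert a new one, trying the secondary bucket and evicting
 * an entry if both groups are full.  Returns 0 on success, nonzero to
 * send the fault up to do_page_fault(). */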
912 int hash_huge_page(struct mm_struct *mm, unsigned long access,
913 		   unsigned long ea, unsigned long vsid, int local,
914 		   unsigned long trap)
915 {
916 	pte_t *ptep;
917 	unsigned long old_pte, new_pte;
918 	unsigned long va, rflags, pa;
919 	long slot;
920 	int err = 1;
921 
922 	ptep = huge_pte_offset(mm, ea);
923 
924 	/* Search the Linux page table for a match with va */
925 	va = (vsid << 28) | (ea & 0x0fffffff);
926 
927 	/*
928 	 * If no pte found or not present, send the problem up to
929 	 * do_page_fault
930 	 */
931 	if (unlikely(!ptep || pte_none(*ptep)))
932 		goto out;
933 
934 	/*
935 	 * Check the user's access rights to the page.  If access should be
936 	 * prevented then send the problem up to do_page_fault.
937 	 */
938 	if (unlikely(access & ~pte_val(*ptep)))
939 		goto out;
940 	/*
941 	 * At this point, we have a pte (old_pte) which can be used to build
942 	 * or update an HPTE. There are 2 cases:
943 	 *
944 	 * 1. There is a valid (present) pte with no associated HPTE (this is
945 	 *	the most common case)
946 	 * 2. There is a valid (present) pte with an associated HPTE. The
947 	 *	current values of the pp bits in the HPTE prevent access
948 	 *	because we are doing software DIRTY bit management and the
949 	 *	page is currently not DIRTY.
950 	 */
951 
952 
953 	do {
954 		old_pte = pte_val(*ptep);
955 		if (old_pte & _PAGE_BUSY)
956 			goto out;
957 		new_pte = old_pte | _PAGE_BUSY |
958 			_PAGE_ACCESSED | _PAGE_HASHPTE;
959 	} while(old_pte != __cmpxchg_u64((unsigned long *)ptep,
960 					 old_pte, new_pte));
961 
962 	rflags = 0x2 | (!(new_pte & _PAGE_RW));
963 	/* _PAGE_EXEC -> HPTE_R_N since it's inverted */
964 	rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
965 	if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
966 		/* No CPU that supports hugepages lacks no-execute, so we
967 		 * don't need to worry about that case */
968 		rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
969 						       trap);
970 
971 	/* Check if pte already has an hpte (case 2) */
972 	if (unlikely(old_pte & _PAGE_HASHPTE)) {
973 		/* There MIGHT be an HPTE for this pte */
974 		unsigned long hash, slot;
975 
976 		hash = hpt_hash(va, HPAGE_SHIFT);
977 		if (old_pte & _PAGE_F_SECOND)
978 			hash = ~hash;
979 		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
980 		slot += (old_pte & _PAGE_F_GIX) >> 12;
981 
982 		if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_huge_psize,
983 					 local) == -1)
984 			old_pte &= ~_PAGE_HPTEFLAGS;
985 	}
986 
987 	if (likely(!(old_pte & _PAGE_HASHPTE))) {
988 		unsigned long hash = hpt_hash(va, HPAGE_SHIFT);
989 		unsigned long hpte_group;
990 
991 		pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
992 
993 repeat:
994 		hpte_group = ((hash & htab_hash_mask) *
995 			      HPTES_PER_GROUP) & ~0x7UL;
996 
997 		/* clear HPTE slot information in new PTE */
998 		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
999 
1000 		/* Add in WIMG bits */
1001 		/* XXX We should store these in the pte */
1002 		/* --BenH: I think they are ... */
1003 		rflags |= _PAGE_COHERENT;
1004 
1005 		/* Insert into the hash table, primary slot */
1006 		slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
1007 					  mmu_huge_psize);
1008 
1009 		/* Primary is full, try the secondary */
1010 		if (unlikely(slot == -1)) {
1011 			new_pte |= _PAGE_F_SECOND;
1012 			hpte_group = ((~hash & htab_hash_mask) *
1013 				      HPTES_PER_GROUP) & ~0x7UL;
1014 			slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
1015 						  HPTE_V_SECONDARY,
1016 						  mmu_huge_psize);
1017 			if (slot == -1) {
1018 				if (mftb() & 0x1)
1019 					hpte_group = ((hash & htab_hash_mask) *
1020 						      HPTES_PER_GROUP)&~0x7UL;
1021 
1022 				ppc_md.hpte_remove(hpte_group);
1023 				goto repeat;
1024 			}
1025 		}
1026 
1027 		if (unlikely(slot == -2))
1028 			panic("hash_huge_page: pte_insert failed\n");
1029 
1030 		new_pte |= (slot << 12) & _PAGE_F_GIX;
1031 	}
1032 
1033 	/*
1034 	 * No need to use ldarx/stdcx here
1035 	 */
1036 	*ptep = __pte(new_pte & ~_PAGE_BUSY);
1037 
1038 	err = 0;
1039 
1040  out:
1041 	return err;
1042 }
1043 
1044 static void zero_ctor(void *addr, kmem_cache_t *cache, unsigned long flags)
1045 {
1046 	memset(addr, 0, kmem_cache_size(cache));
1047 }
1048 
1049 static int __init hugetlbpage_init(void)
1050 {
1051 	if (!cpu_has_feature(CPU_FTR_16M_PAGE))
1052 		return -ENODEV;
1053 
1054 	huge_pgtable_cache = kmem_cache_create("hugepte_cache",
1055 					       HUGEPTE_TABLE_SIZE,
1056 					       HUGEPTE_TABLE_SIZE,
1057 					       SLAB_HWCACHE_ALIGN |
1058 					       SLAB_MUST_HWCACHE_ALIGN,
1059 					       zero_ctor, NULL);
1060 	if (! huge_pgtable_cache)
1061 		panic("hugetlbpage_init(): could not create hugepte cache\n");
1062 
1063 	return 0;
1064 }
1065 
1066 module_init(hugetlbpage_init);
1067