1 /* 2 * PPC64 (POWER4) Huge TLB Page Support for Kernel. 3 * 4 * Copyright (C) 2003 David Gibson, IBM Corporation. 5 * 6 * Based on the IA-32 version: 7 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com> 8 */ 9 10 #include <linux/init.h> 11 #include <linux/fs.h> 12 #include <linux/mm.h> 13 #include <linux/hugetlb.h> 14 #include <linux/pagemap.h> 15 #include <linux/smp_lock.h> 16 #include <linux/slab.h> 17 #include <linux/err.h> 18 #include <linux/sysctl.h> 19 #include <asm/mman.h> 20 #include <asm/pgalloc.h> 21 #include <asm/tlb.h> 22 #include <asm/tlbflush.h> 23 #include <asm/mmu_context.h> 24 #include <asm/machdep.h> 25 #include <asm/cputable.h> 26 #include <asm/tlb.h> 27 #include <asm/spu.h> 28 29 #include <linux/sysctl.h> 30 31 #define NUM_LOW_AREAS (0x100000000UL >> SID_SHIFT) 32 #define NUM_HIGH_AREAS (PGTABLE_RANGE >> HTLB_AREA_SHIFT) 33 34 #ifdef CONFIG_PPC_64K_PAGES 35 #define HUGEPTE_INDEX_SIZE (PMD_SHIFT-HPAGE_SHIFT) 36 #else 37 #define HUGEPTE_INDEX_SIZE (PUD_SHIFT-HPAGE_SHIFT) 38 #endif 39 #define PTRS_PER_HUGEPTE (1 << HUGEPTE_INDEX_SIZE) 40 #define HUGEPTE_TABLE_SIZE (sizeof(pte_t) << HUGEPTE_INDEX_SIZE) 41 42 #define HUGEPD_SHIFT (HPAGE_SHIFT + HUGEPTE_INDEX_SIZE) 43 #define HUGEPD_SIZE (1UL << HUGEPD_SHIFT) 44 #define HUGEPD_MASK (~(HUGEPD_SIZE-1)) 45 46 #define huge_pgtable_cache (pgtable_cache[HUGEPTE_CACHE_NUM]) 47 48 /* Flag to mark huge PD pointers. This means pmd_bad() and pud_bad() 49 * will choke on pointers to hugepte tables, which is handy for 50 * catching screwups early. */ 51 #define HUGEPD_OK 0x1 52 53 typedef struct { unsigned long pd; } hugepd_t; 54 55 #define hugepd_none(hpd) ((hpd).pd == 0) 56 57 static inline pte_t *hugepd_page(hugepd_t hpd) 58 { 59 BUG_ON(!(hpd.pd & HUGEPD_OK)); 60 return (pte_t *)(hpd.pd & ~HUGEPD_OK); 61 } 62 63 static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr) 64 { 65 unsigned long idx = ((addr >> HPAGE_SHIFT) & (PTRS_PER_HUGEPTE-1)); 66 pte_t *dir = hugepd_page(*hpdp); 67 68 return dir + idx; 69 } 70 71 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, 72 unsigned long address) 73 { 74 pte_t *new = kmem_cache_alloc(huge_pgtable_cache, 75 GFP_KERNEL|__GFP_REPEAT); 76 77 if (! new) 78 return -ENOMEM; 79 80 spin_lock(&mm->page_table_lock); 81 if (!hugepd_none(*hpdp)) 82 kmem_cache_free(huge_pgtable_cache, new); 83 else 84 hpdp->pd = (unsigned long)new | HUGEPD_OK; 85 spin_unlock(&mm->page_table_lock); 86 return 0; 87 } 88 89 /* Modelled after find_linux_pte() */ 90 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) 91 { 92 pgd_t *pg; 93 pud_t *pu; 94 95 BUG_ON(! in_hugepage_area(mm->context, addr)); 96 97 addr &= HPAGE_MASK; 98 99 pg = pgd_offset(mm, addr); 100 if (!pgd_none(*pg)) { 101 pu = pud_offset(pg, addr); 102 if (!pud_none(*pu)) { 103 #ifdef CONFIG_PPC_64K_PAGES 104 pmd_t *pm; 105 pm = pmd_offset(pu, addr); 106 if (!pmd_none(*pm)) 107 return hugepte_offset((hugepd_t *)pm, addr); 108 #else 109 return hugepte_offset((hugepd_t *)pu, addr); 110 #endif 111 } 112 } 113 114 return NULL; 115 } 116 117 pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) 118 { 119 pgd_t *pg; 120 pud_t *pu; 121 hugepd_t *hpdp = NULL; 122 123 BUG_ON(! in_hugepage_area(mm->context, addr)); 124 125 addr &= HPAGE_MASK; 126 127 pg = pgd_offset(mm, addr); 128 pu = pud_alloc(mm, pg, addr); 129 130 if (pu) { 131 #ifdef CONFIG_PPC_64K_PAGES 132 pmd_t *pm; 133 pm = pmd_alloc(mm, pu, addr); 134 if (pm) 135 hpdp = (hugepd_t *)pm; 136 #else 137 hpdp = (hugepd_t *)pu; 138 #endif 139 } 140 141 if (! hpdp) 142 return NULL; 143 144 if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr)) 145 return NULL; 146 147 return hugepte_offset(hpdp, addr); 148 } 149 150 int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) 151 { 152 return 0; 153 } 154 155 static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp) 156 { 157 pte_t *hugepte = hugepd_page(*hpdp); 158 159 hpdp->pd = 0; 160 tlb->need_flush = 1; 161 pgtable_free_tlb(tlb, pgtable_free_cache(hugepte, HUGEPTE_CACHE_NUM, 162 PGF_CACHENUM_MASK)); 163 } 164 165 #ifdef CONFIG_PPC_64K_PAGES 166 static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, 167 unsigned long addr, unsigned long end, 168 unsigned long floor, unsigned long ceiling) 169 { 170 pmd_t *pmd; 171 unsigned long next; 172 unsigned long start; 173 174 start = addr; 175 pmd = pmd_offset(pud, addr); 176 do { 177 next = pmd_addr_end(addr, end); 178 if (pmd_none(*pmd)) 179 continue; 180 free_hugepte_range(tlb, (hugepd_t *)pmd); 181 } while (pmd++, addr = next, addr != end); 182 183 start &= PUD_MASK; 184 if (start < floor) 185 return; 186 if (ceiling) { 187 ceiling &= PUD_MASK; 188 if (!ceiling) 189 return; 190 } 191 if (end - 1 > ceiling - 1) 192 return; 193 194 pmd = pmd_offset(pud, start); 195 pud_clear(pud); 196 pmd_free_tlb(tlb, pmd); 197 } 198 #endif 199 200 static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, 201 unsigned long addr, unsigned long end, 202 unsigned long floor, unsigned long ceiling) 203 { 204 pud_t *pud; 205 unsigned long next; 206 unsigned long start; 207 208 start = addr; 209 pud = pud_offset(pgd, addr); 210 do { 211 next = pud_addr_end(addr, end); 212 #ifdef CONFIG_PPC_64K_PAGES 213 if (pud_none_or_clear_bad(pud)) 214 continue; 215 hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling); 216 #else 217 if (pud_none(*pud)) 218 continue; 219 free_hugepte_range(tlb, (hugepd_t *)pud); 220 #endif 221 } while (pud++, addr = next, addr != end); 222 223 start &= PGDIR_MASK; 224 if (start < floor) 225 return; 226 if (ceiling) { 227 ceiling &= PGDIR_MASK; 228 if (!ceiling) 229 return; 230 } 231 if (end - 1 > ceiling - 1) 232 return; 233 234 pud = pud_offset(pgd, start); 235 pgd_clear(pgd); 236 pud_free_tlb(tlb, pud); 237 } 238 239 /* 240 * This function frees user-level page tables of a process. 241 * 242 * Must be called with pagetable lock held. 243 */ 244 void hugetlb_free_pgd_range(struct mmu_gather **tlb, 245 unsigned long addr, unsigned long end, 246 unsigned long floor, unsigned long ceiling) 247 { 248 pgd_t *pgd; 249 unsigned long next; 250 unsigned long start; 251 252 /* 253 * Comments below take from the normal free_pgd_range(). They 254 * apply here too. The tests against HUGEPD_MASK below are 255 * essential, because we *don't* test for this at the bottom 256 * level. Without them we'll attempt to free a hugepte table 257 * when we unmap just part of it, even if there are other 258 * active mappings using it. 259 * 260 * The next few lines have given us lots of grief... 261 * 262 * Why are we testing HUGEPD* at this top level? Because 263 * often there will be no work to do at all, and we'd prefer 264 * not to go all the way down to the bottom just to discover 265 * that. 266 * 267 * Why all these "- 1"s? Because 0 represents both the bottom 268 * of the address space and the top of it (using -1 for the 269 * top wouldn't help much: the masks would do the wrong thing). 270 * The rule is that addr 0 and floor 0 refer to the bottom of 271 * the address space, but end 0 and ceiling 0 refer to the top 272 * Comparisons need to use "end - 1" and "ceiling - 1" (though 273 * that end 0 case should be mythical). 274 * 275 * Wherever addr is brought up or ceiling brought down, we 276 * must be careful to reject "the opposite 0" before it 277 * confuses the subsequent tests. But what about where end is 278 * brought down by HUGEPD_SIZE below? no, end can't go down to 279 * 0 there. 280 * 281 * Whereas we round start (addr) and ceiling down, by different 282 * masks at different levels, in order to test whether a table 283 * now has no other vmas using it, so can be freed, we don't 284 * bother to round floor or end up - the tests don't need that. 285 */ 286 287 addr &= HUGEPD_MASK; 288 if (addr < floor) { 289 addr += HUGEPD_SIZE; 290 if (!addr) 291 return; 292 } 293 if (ceiling) { 294 ceiling &= HUGEPD_MASK; 295 if (!ceiling) 296 return; 297 } 298 if (end - 1 > ceiling - 1) 299 end -= HUGEPD_SIZE; 300 if (addr > end - 1) 301 return; 302 303 start = addr; 304 pgd = pgd_offset((*tlb)->mm, addr); 305 do { 306 BUG_ON(! in_hugepage_area((*tlb)->mm->context, addr)); 307 next = pgd_addr_end(addr, end); 308 if (pgd_none_or_clear_bad(pgd)) 309 continue; 310 hugetlb_free_pud_range(*tlb, pgd, addr, next, floor, ceiling); 311 } while (pgd++, addr = next, addr != end); 312 } 313 314 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, 315 pte_t *ptep, pte_t pte) 316 { 317 if (pte_present(*ptep)) { 318 /* We open-code pte_clear because we need to pass the right 319 * argument to hpte_update (huge / !huge) 320 */ 321 unsigned long old = pte_update(ptep, ~0UL); 322 if (old & _PAGE_HASHPTE) 323 hpte_update(mm, addr & HPAGE_MASK, ptep, old, 1); 324 flush_tlb_pending(); 325 } 326 *ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS); 327 } 328 329 pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, 330 pte_t *ptep) 331 { 332 unsigned long old = pte_update(ptep, ~0UL); 333 334 if (old & _PAGE_HASHPTE) 335 hpte_update(mm, addr & HPAGE_MASK, ptep, old, 1); 336 *ptep = __pte(0); 337 338 return __pte(old); 339 } 340 341 struct slb_flush_info { 342 struct mm_struct *mm; 343 u16 newareas; 344 }; 345 346 static void flush_low_segments(void *parm) 347 { 348 struct slb_flush_info *fi = parm; 349 unsigned long i; 350 351 BUILD_BUG_ON((sizeof(fi->newareas)*8) != NUM_LOW_AREAS); 352 353 if (current->active_mm != fi->mm) 354 return; 355 356 /* Only need to do anything if this CPU is working in the same 357 * mm as the one which has changed */ 358 359 /* update the paca copy of the context struct */ 360 get_paca()->context = current->active_mm->context; 361 362 asm volatile("isync" : : : "memory"); 363 for (i = 0; i < NUM_LOW_AREAS; i++) { 364 if (! (fi->newareas & (1U << i))) 365 continue; 366 asm volatile("slbie %0" 367 : : "r" ((i << SID_SHIFT) | SLBIE_C)); 368 } 369 asm volatile("isync" : : : "memory"); 370 } 371 372 static void flush_high_segments(void *parm) 373 { 374 struct slb_flush_info *fi = parm; 375 unsigned long i, j; 376 377 378 BUILD_BUG_ON((sizeof(fi->newareas)*8) != NUM_HIGH_AREAS); 379 380 if (current->active_mm != fi->mm) 381 return; 382 383 /* Only need to do anything if this CPU is working in the same 384 * mm as the one which has changed */ 385 386 /* update the paca copy of the context struct */ 387 get_paca()->context = current->active_mm->context; 388 389 asm volatile("isync" : : : "memory"); 390 for (i = 0; i < NUM_HIGH_AREAS; i++) { 391 if (! (fi->newareas & (1U << i))) 392 continue; 393 for (j = 0; j < (1UL << (HTLB_AREA_SHIFT-SID_SHIFT)); j++) 394 asm volatile("slbie %0" 395 :: "r" (((i << HTLB_AREA_SHIFT) 396 + (j << SID_SHIFT)) | SLBIE_C)); 397 } 398 asm volatile("isync" : : : "memory"); 399 } 400 401 static int prepare_low_area_for_htlb(struct mm_struct *mm, unsigned long area) 402 { 403 unsigned long start = area << SID_SHIFT; 404 unsigned long end = (area+1) << SID_SHIFT; 405 struct vm_area_struct *vma; 406 407 BUG_ON(area >= NUM_LOW_AREAS); 408 409 /* Check no VMAs are in the region */ 410 vma = find_vma(mm, start); 411 if (vma && (vma->vm_start < end)) 412 return -EBUSY; 413 414 return 0; 415 } 416 417 static int prepare_high_area_for_htlb(struct mm_struct *mm, unsigned long area) 418 { 419 unsigned long start = area << HTLB_AREA_SHIFT; 420 unsigned long end = (area+1) << HTLB_AREA_SHIFT; 421 struct vm_area_struct *vma; 422 423 BUG_ON(area >= NUM_HIGH_AREAS); 424 425 /* Hack, so that each addresses is controlled by exactly one 426 * of the high or low area bitmaps, the first high area starts 427 * at 4GB, not 0 */ 428 if (start == 0) 429 start = 0x100000000UL; 430 431 /* Check no VMAs are in the region */ 432 vma = find_vma(mm, start); 433 if (vma && (vma->vm_start < end)) 434 return -EBUSY; 435 436 return 0; 437 } 438 439 static int open_low_hpage_areas(struct mm_struct *mm, u16 newareas) 440 { 441 unsigned long i; 442 struct slb_flush_info fi; 443 444 BUILD_BUG_ON((sizeof(newareas)*8) != NUM_LOW_AREAS); 445 BUILD_BUG_ON((sizeof(mm->context.low_htlb_areas)*8) != NUM_LOW_AREAS); 446 447 newareas &= ~(mm->context.low_htlb_areas); 448 if (! newareas) 449 return 0; /* The segments we want are already open */ 450 451 for (i = 0; i < NUM_LOW_AREAS; i++) 452 if ((1 << i) & newareas) 453 if (prepare_low_area_for_htlb(mm, i) != 0) 454 return -EBUSY; 455 456 mm->context.low_htlb_areas |= newareas; 457 458 /* the context change must make it to memory before the flush, 459 * so that further SLB misses do the right thing. */ 460 mb(); 461 462 fi.mm = mm; 463 fi.newareas = newareas; 464 on_each_cpu(flush_low_segments, &fi, 0, 1); 465 466 return 0; 467 } 468 469 static int open_high_hpage_areas(struct mm_struct *mm, u16 newareas) 470 { 471 struct slb_flush_info fi; 472 unsigned long i; 473 474 BUILD_BUG_ON((sizeof(newareas)*8) != NUM_HIGH_AREAS); 475 BUILD_BUG_ON((sizeof(mm->context.high_htlb_areas)*8) 476 != NUM_HIGH_AREAS); 477 478 newareas &= ~(mm->context.high_htlb_areas); 479 if (! newareas) 480 return 0; /* The areas we want are already open */ 481 482 for (i = 0; i < NUM_HIGH_AREAS; i++) 483 if ((1 << i) & newareas) 484 if (prepare_high_area_for_htlb(mm, i) != 0) 485 return -EBUSY; 486 487 mm->context.high_htlb_areas |= newareas; 488 489 /* the context change must make it to memory before the flush, 490 * so that further SLB misses do the right thing. */ 491 mb(); 492 493 fi.mm = mm; 494 fi.newareas = newareas; 495 on_each_cpu(flush_high_segments, &fi, 0, 1); 496 497 return 0; 498 } 499 500 int prepare_hugepage_range(unsigned long addr, unsigned long len, pgoff_t pgoff) 501 { 502 int err = 0; 503 504 if (pgoff & (~HPAGE_MASK >> PAGE_SHIFT)) 505 return -EINVAL; 506 if (len & ~HPAGE_MASK) 507 return -EINVAL; 508 if (addr & ~HPAGE_MASK) 509 return -EINVAL; 510 511 if (addr < 0x100000000UL) 512 err = open_low_hpage_areas(current->mm, 513 LOW_ESID_MASK(addr, len)); 514 if ((addr + len) > 0x100000000UL) 515 err = open_high_hpage_areas(current->mm, 516 HTLB_AREA_MASK(addr, len)); 517 #ifdef CONFIG_SPE_BASE 518 spu_flush_all_slbs(current->mm); 519 #endif 520 if (err) { 521 printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)" 522 " failed (lowmask: 0x%04hx, highmask: 0x%04hx)\n", 523 addr, len, 524 LOW_ESID_MASK(addr, len), HTLB_AREA_MASK(addr, len)); 525 return err; 526 } 527 528 return 0; 529 } 530 531 struct page * 532 follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) 533 { 534 pte_t *ptep; 535 struct page *page; 536 537 if (! in_hugepage_area(mm->context, address)) 538 return ERR_PTR(-EINVAL); 539 540 ptep = huge_pte_offset(mm, address); 541 page = pte_page(*ptep); 542 if (page) 543 page += (address % HPAGE_SIZE) / PAGE_SIZE; 544 545 return page; 546 } 547 548 int pmd_huge(pmd_t pmd) 549 { 550 return 0; 551 } 552 553 struct page * 554 follow_huge_pmd(struct mm_struct *mm, unsigned long address, 555 pmd_t *pmd, int write) 556 { 557 BUG(); 558 return NULL; 559 } 560 561 /* Because we have an exclusive hugepage region which lies within the 562 * normal user address space, we have to take special measures to make 563 * non-huge mmap()s evade the hugepage reserved regions. */ 564 unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, 565 unsigned long len, unsigned long pgoff, 566 unsigned long flags) 567 { 568 struct mm_struct *mm = current->mm; 569 struct vm_area_struct *vma; 570 unsigned long start_addr; 571 572 if (len > TASK_SIZE) 573 return -ENOMEM; 574 575 if (addr) { 576 addr = PAGE_ALIGN(addr); 577 vma = find_vma(mm, addr); 578 if (((TASK_SIZE - len) >= addr) 579 && (!vma || (addr+len) <= vma->vm_start) 580 && !is_hugepage_only_range(mm, addr,len)) 581 return addr; 582 } 583 if (len > mm->cached_hole_size) { 584 start_addr = addr = mm->free_area_cache; 585 } else { 586 start_addr = addr = TASK_UNMAPPED_BASE; 587 mm->cached_hole_size = 0; 588 } 589 590 full_search: 591 vma = find_vma(mm, addr); 592 while (TASK_SIZE - len >= addr) { 593 BUG_ON(vma && (addr >= vma->vm_end)); 594 595 if (touches_hugepage_low_range(mm, addr, len)) { 596 addr = ALIGN(addr+1, 1<<SID_SHIFT); 597 vma = find_vma(mm, addr); 598 continue; 599 } 600 if (touches_hugepage_high_range(mm, addr, len)) { 601 addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT); 602 vma = find_vma(mm, addr); 603 continue; 604 } 605 if (!vma || addr + len <= vma->vm_start) { 606 /* 607 * Remember the place where we stopped the search: 608 */ 609 mm->free_area_cache = addr + len; 610 return addr; 611 } 612 if (addr + mm->cached_hole_size < vma->vm_start) 613 mm->cached_hole_size = vma->vm_start - addr; 614 addr = vma->vm_end; 615 vma = vma->vm_next; 616 } 617 618 /* Make sure we didn't miss any holes */ 619 if (start_addr != TASK_UNMAPPED_BASE) { 620 start_addr = addr = TASK_UNMAPPED_BASE; 621 mm->cached_hole_size = 0; 622 goto full_search; 623 } 624 return -ENOMEM; 625 } 626 627 /* 628 * This mmap-allocator allocates new areas top-down from below the 629 * stack's low limit (the base): 630 * 631 * Because we have an exclusive hugepage region which lies within the 632 * normal user address space, we have to take special measures to make 633 * non-huge mmap()s evade the hugepage reserved regions. 634 */ 635 unsigned long 636 arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, 637 const unsigned long len, const unsigned long pgoff, 638 const unsigned long flags) 639 { 640 struct vm_area_struct *vma, *prev_vma; 641 struct mm_struct *mm = current->mm; 642 unsigned long base = mm->mmap_base, addr = addr0; 643 unsigned long largest_hole = mm->cached_hole_size; 644 int first_time = 1; 645 646 /* requested length too big for entire address space */ 647 if (len > TASK_SIZE) 648 return -ENOMEM; 649 650 /* dont allow allocations above current base */ 651 if (mm->free_area_cache > base) 652 mm->free_area_cache = base; 653 654 /* requesting a specific address */ 655 if (addr) { 656 addr = PAGE_ALIGN(addr); 657 vma = find_vma(mm, addr); 658 if (TASK_SIZE - len >= addr && 659 (!vma || addr + len <= vma->vm_start) 660 && !is_hugepage_only_range(mm, addr,len)) 661 return addr; 662 } 663 664 if (len <= largest_hole) { 665 largest_hole = 0; 666 mm->free_area_cache = base; 667 } 668 try_again: 669 /* make sure it can fit in the remaining address space */ 670 if (mm->free_area_cache < len) 671 goto fail; 672 673 /* either no address requested or cant fit in requested address hole */ 674 addr = (mm->free_area_cache - len) & PAGE_MASK; 675 do { 676 hugepage_recheck: 677 if (touches_hugepage_low_range(mm, addr, len)) { 678 addr = (addr & ((~0) << SID_SHIFT)) - len; 679 goto hugepage_recheck; 680 } else if (touches_hugepage_high_range(mm, addr, len)) { 681 addr = (addr & ((~0UL) << HTLB_AREA_SHIFT)) - len; 682 goto hugepage_recheck; 683 } 684 685 /* 686 * Lookup failure means no vma is above this address, 687 * i.e. return with success: 688 */ 689 if (!(vma = find_vma_prev(mm, addr, &prev_vma))) 690 return addr; 691 692 /* 693 * new region fits between prev_vma->vm_end and 694 * vma->vm_start, use it: 695 */ 696 if (addr+len <= vma->vm_start && 697 (!prev_vma || (addr >= prev_vma->vm_end))) { 698 /* remember the address as a hint for next time */ 699 mm->cached_hole_size = largest_hole; 700 return (mm->free_area_cache = addr); 701 } else { 702 /* pull free_area_cache down to the first hole */ 703 if (mm->free_area_cache == vma->vm_end) { 704 mm->free_area_cache = vma->vm_start; 705 mm->cached_hole_size = largest_hole; 706 } 707 } 708 709 /* remember the largest hole we saw so far */ 710 if (addr + largest_hole < vma->vm_start) 711 largest_hole = vma->vm_start - addr; 712 713 /* try just below the current vma->vm_start */ 714 addr = vma->vm_start-len; 715 } while (len <= vma->vm_start); 716 717 fail: 718 /* 719 * if hint left us with no space for the requested 720 * mapping then try again: 721 */ 722 if (first_time) { 723 mm->free_area_cache = base; 724 largest_hole = 0; 725 first_time = 0; 726 goto try_again; 727 } 728 /* 729 * A failed mmap() very likely causes application failure, 730 * so fall back to the bottom-up function here. This scenario 731 * can happen with large stack limits and large mmap() 732 * allocations. 733 */ 734 mm->free_area_cache = TASK_UNMAPPED_BASE; 735 mm->cached_hole_size = ~0UL; 736 addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); 737 /* 738 * Restore the topdown base: 739 */ 740 mm->free_area_cache = base; 741 mm->cached_hole_size = ~0UL; 742 743 return addr; 744 } 745 746 static int htlb_check_hinted_area(unsigned long addr, unsigned long len) 747 { 748 struct vm_area_struct *vma; 749 750 vma = find_vma(current->mm, addr); 751 if (TASK_SIZE - len >= addr && 752 (!vma || ((addr + len) <= vma->vm_start))) 753 return 0; 754 755 return -ENOMEM; 756 } 757 758 static unsigned long htlb_get_low_area(unsigned long len, u16 segmask) 759 { 760 unsigned long addr = 0; 761 struct vm_area_struct *vma; 762 763 vma = find_vma(current->mm, addr); 764 while (addr + len <= 0x100000000UL) { 765 BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */ 766 767 if (! __within_hugepage_low_range(addr, len, segmask)) { 768 addr = ALIGN(addr+1, 1<<SID_SHIFT); 769 vma = find_vma(current->mm, addr); 770 continue; 771 } 772 773 if (!vma || (addr + len) <= vma->vm_start) 774 return addr; 775 addr = ALIGN(vma->vm_end, HPAGE_SIZE); 776 /* Depending on segmask this might not be a confirmed 777 * hugepage region, so the ALIGN could have skipped 778 * some VMAs */ 779 vma = find_vma(current->mm, addr); 780 } 781 782 return -ENOMEM; 783 } 784 785 static unsigned long htlb_get_high_area(unsigned long len, u16 areamask) 786 { 787 unsigned long addr = 0x100000000UL; 788 struct vm_area_struct *vma; 789 790 vma = find_vma(current->mm, addr); 791 while (addr + len <= TASK_SIZE_USER64) { 792 BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */ 793 794 if (! __within_hugepage_high_range(addr, len, areamask)) { 795 addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT); 796 vma = find_vma(current->mm, addr); 797 continue; 798 } 799 800 if (!vma || (addr + len) <= vma->vm_start) 801 return addr; 802 addr = ALIGN(vma->vm_end, HPAGE_SIZE); 803 /* Depending on segmask this might not be a confirmed 804 * hugepage region, so the ALIGN could have skipped 805 * some VMAs */ 806 vma = find_vma(current->mm, addr); 807 } 808 809 return -ENOMEM; 810 } 811 812 unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, 813 unsigned long len, unsigned long pgoff, 814 unsigned long flags) 815 { 816 int lastshift; 817 u16 areamask, curareas; 818 819 if (HPAGE_SHIFT == 0) 820 return -EINVAL; 821 if (len & ~HPAGE_MASK) 822 return -EINVAL; 823 if (len > TASK_SIZE) 824 return -ENOMEM; 825 826 if (!cpu_has_feature(CPU_FTR_16M_PAGE)) 827 return -EINVAL; 828 829 /* Paranoia, caller should have dealt with this */ 830 BUG_ON((addr + len) < addr); 831 832 if (test_thread_flag(TIF_32BIT)) { 833 curareas = current->mm->context.low_htlb_areas; 834 835 /* First see if we can use the hint address */ 836 if (addr && (htlb_check_hinted_area(addr, len) == 0)) { 837 areamask = LOW_ESID_MASK(addr, len); 838 if (open_low_hpage_areas(current->mm, areamask) == 0) 839 return addr; 840 } 841 842 /* Next see if we can map in the existing low areas */ 843 addr = htlb_get_low_area(len, curareas); 844 if (addr != -ENOMEM) 845 return addr; 846 847 /* Finally go looking for areas to open */ 848 lastshift = 0; 849 for (areamask = LOW_ESID_MASK(0x100000000UL-len, len); 850 ! lastshift; areamask >>=1) { 851 if (areamask & 1) 852 lastshift = 1; 853 854 addr = htlb_get_low_area(len, curareas | areamask); 855 if ((addr != -ENOMEM) 856 && open_low_hpage_areas(current->mm, areamask) == 0) 857 return addr; 858 } 859 } else { 860 curareas = current->mm->context.high_htlb_areas; 861 862 /* First see if we can use the hint address */ 863 /* We discourage 64-bit processes from doing hugepage 864 * mappings below 4GB (must use MAP_FIXED) */ 865 if ((addr >= 0x100000000UL) 866 && (htlb_check_hinted_area(addr, len) == 0)) { 867 areamask = HTLB_AREA_MASK(addr, len); 868 if (open_high_hpage_areas(current->mm, areamask) == 0) 869 return addr; 870 } 871 872 /* Next see if we can map in the existing high areas */ 873 addr = htlb_get_high_area(len, curareas); 874 if (addr != -ENOMEM) 875 return addr; 876 877 /* Finally go looking for areas to open */ 878 lastshift = 0; 879 for (areamask = HTLB_AREA_MASK(TASK_SIZE_USER64-len, len); 880 ! lastshift; areamask >>=1) { 881 if (areamask & 1) 882 lastshift = 1; 883 884 addr = htlb_get_high_area(len, curareas | areamask); 885 if ((addr != -ENOMEM) 886 && open_high_hpage_areas(current->mm, areamask) == 0) 887 return addr; 888 } 889 } 890 printk(KERN_DEBUG "hugetlb_get_unmapped_area() unable to open" 891 " enough areas\n"); 892 return -ENOMEM; 893 } 894 895 /* 896 * Called by asm hashtable.S for doing lazy icache flush 897 */ 898 static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags, 899 pte_t pte, int trap) 900 { 901 struct page *page; 902 int i; 903 904 if (!pfn_valid(pte_pfn(pte))) 905 return rflags; 906 907 page = pte_page(pte); 908 909 /* page is dirty */ 910 if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) { 911 if (trap == 0x400) { 912 for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) 913 __flush_dcache_icache(page_address(page+i)); 914 set_bit(PG_arch_1, &page->flags); 915 } else { 916 rflags |= HPTE_R_N; 917 } 918 } 919 return rflags; 920 } 921 922 int hash_huge_page(struct mm_struct *mm, unsigned long access, 923 unsigned long ea, unsigned long vsid, int local, 924 unsigned long trap) 925 { 926 pte_t *ptep; 927 unsigned long old_pte, new_pte; 928 unsigned long va, rflags, pa; 929 long slot; 930 int err = 1; 931 932 ptep = huge_pte_offset(mm, ea); 933 934 /* Search the Linux page table for a match with va */ 935 va = (vsid << 28) | (ea & 0x0fffffff); 936 937 /* 938 * If no pte found or not present, send the problem up to 939 * do_page_fault 940 */ 941 if (unlikely(!ptep || pte_none(*ptep))) 942 goto out; 943 944 /* 945 * Check the user's access rights to the page. If access should be 946 * prevented then send the problem up to do_page_fault. 947 */ 948 if (unlikely(access & ~pte_val(*ptep))) 949 goto out; 950 /* 951 * At this point, we have a pte (old_pte) which can be used to build 952 * or update an HPTE. There are 2 cases: 953 * 954 * 1. There is a valid (present) pte with no associated HPTE (this is 955 * the most common case) 956 * 2. There is a valid (present) pte with an associated HPTE. The 957 * current values of the pp bits in the HPTE prevent access 958 * because we are doing software DIRTY bit management and the 959 * page is currently not DIRTY. 960 */ 961 962 963 do { 964 old_pte = pte_val(*ptep); 965 if (old_pte & _PAGE_BUSY) 966 goto out; 967 new_pte = old_pte | _PAGE_BUSY | 968 _PAGE_ACCESSED | _PAGE_HASHPTE; 969 } while(old_pte != __cmpxchg_u64((unsigned long *)ptep, 970 old_pte, new_pte)); 971 972 rflags = 0x2 | (!(new_pte & _PAGE_RW)); 973 /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */ 974 rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N); 975 if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) 976 /* No CPU has hugepages but lacks no execute, so we 977 * don't need to worry about that case */ 978 rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte), 979 trap); 980 981 /* Check if pte already has an hpte (case 2) */ 982 if (unlikely(old_pte & _PAGE_HASHPTE)) { 983 /* There MIGHT be an HPTE for this pte */ 984 unsigned long hash, slot; 985 986 hash = hpt_hash(va, HPAGE_SHIFT); 987 if (old_pte & _PAGE_F_SECOND) 988 hash = ~hash; 989 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; 990 slot += (old_pte & _PAGE_F_GIX) >> 12; 991 992 if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_huge_psize, 993 local) == -1) 994 old_pte &= ~_PAGE_HPTEFLAGS; 995 } 996 997 if (likely(!(old_pte & _PAGE_HASHPTE))) { 998 unsigned long hash = hpt_hash(va, HPAGE_SHIFT); 999 unsigned long hpte_group; 1000 1001 pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; 1002 1003 repeat: 1004 hpte_group = ((hash & htab_hash_mask) * 1005 HPTES_PER_GROUP) & ~0x7UL; 1006 1007 /* clear HPTE slot informations in new PTE */ 1008 new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE; 1009 1010 /* Add in WIMG bits */ 1011 /* XXX We should store these in the pte */ 1012 /* --BenH: I think they are ... */ 1013 rflags |= _PAGE_COHERENT; 1014 1015 /* Insert into the hash table, primary slot */ 1016 slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0, 1017 mmu_huge_psize); 1018 1019 /* Primary is full, try the secondary */ 1020 if (unlikely(slot == -1)) { 1021 hpte_group = ((~hash & htab_hash_mask) * 1022 HPTES_PER_GROUP) & ~0x7UL; 1023 slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 1024 HPTE_V_SECONDARY, 1025 mmu_huge_psize); 1026 if (slot == -1) { 1027 if (mftb() & 0x1) 1028 hpte_group = ((hash & htab_hash_mask) * 1029 HPTES_PER_GROUP)&~0x7UL; 1030 1031 ppc_md.hpte_remove(hpte_group); 1032 goto repeat; 1033 } 1034 } 1035 1036 if (unlikely(slot == -2)) 1037 panic("hash_huge_page: pte_insert failed\n"); 1038 1039 new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX); 1040 } 1041 1042 /* 1043 * No need to use ldarx/stdcx here 1044 */ 1045 *ptep = __pte(new_pte & ~_PAGE_BUSY); 1046 1047 err = 0; 1048 1049 out: 1050 return err; 1051 } 1052 1053 static void zero_ctor(void *addr, struct kmem_cache *cache, unsigned long flags) 1054 { 1055 memset(addr, 0, kmem_cache_size(cache)); 1056 } 1057 1058 static int __init hugetlbpage_init(void) 1059 { 1060 if (!cpu_has_feature(CPU_FTR_16M_PAGE)) 1061 return -ENODEV; 1062 1063 huge_pgtable_cache = kmem_cache_create("hugepte_cache", 1064 HUGEPTE_TABLE_SIZE, 1065 HUGEPTE_TABLE_SIZE, 1066 SLAB_HWCACHE_ALIGN | 1067 SLAB_MUST_HWCACHE_ALIGN, 1068 zero_ctor, NULL); 1069 if (! huge_pgtable_cache) 1070 panic("hugetlbpage_init(): could not create hugepte cache\n"); 1071 1072 return 0; 1073 } 1074 1075 module_init(hugetlbpage_init); 1076