/*
 * PPC64 (POWER4) Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp_lock.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/sysctl.h>
#include <asm/mman.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/machdep.h>
#include <asm/cputable.h>

#define NUM_LOW_AREAS	(0x100000000UL >> SID_SHIFT)
#define NUM_HIGH_AREAS	(PGTABLE_RANGE >> HTLB_AREA_SHIFT)

#ifdef CONFIG_PPC_64K_PAGES
#define HUGEPTE_INDEX_SIZE	(PMD_SHIFT - HPAGE_SHIFT)
#else
#define HUGEPTE_INDEX_SIZE	(PUD_SHIFT - HPAGE_SHIFT)
#endif
#define PTRS_PER_HUGEPTE	(1 << HUGEPTE_INDEX_SIZE)
#define HUGEPTE_TABLE_SIZE	(sizeof(pte_t) << HUGEPTE_INDEX_SIZE)

#define HUGEPD_SHIFT		(HPAGE_SHIFT + HUGEPTE_INDEX_SIZE)
#define HUGEPD_SIZE		(1UL << HUGEPD_SHIFT)
#define HUGEPD_MASK		(~(HUGEPD_SIZE - 1))

#define huge_pgtable_cache	(pgtable_cache[HUGEPTE_CACHE_NUM])

/* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
 * will choke on pointers to hugepte tables, which is handy for
 * catching screwups early. */
#define HUGEPD_OK	0x1

typedef struct { unsigned long pd; } hugepd_t;

#define hugepd_none(hpd)	((hpd).pd == 0)

static inline pte_t *hugepd_page(hugepd_t hpd)
{
        BUG_ON(!(hpd.pd & HUGEPD_OK));
        return (pte_t *)(hpd.pd & ~HUGEPD_OK);
}

static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr)
{
        unsigned long idx = ((addr >> HPAGE_SHIFT) & (PTRS_PER_HUGEPTE - 1));
        pte_t *dir = hugepd_page(*hpdp);

        return dir + idx;
}

static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
                           unsigned long address)
{
        pte_t *new = kmem_cache_alloc(huge_pgtable_cache,
                                      GFP_KERNEL | __GFP_REPEAT);

        if (!new)
                return -ENOMEM;

        /* Another thread may have populated this hugepd while we slept
         * in the allocation; only install our table if it is still empty. */
        spin_lock(&mm->page_table_lock);
        if (!hugepd_none(*hpdp))
                kmem_cache_free(huge_pgtable_cache, new);
        else
                hpdp->pd = (unsigned long)new | HUGEPD_OK;
        spin_unlock(&mm->page_table_lock);
        return 0;
}

/* Modelled after find_linux_pte() */
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
        pgd_t *pg;
        pud_t *pu;

        BUG_ON(!in_hugepage_area(mm->context, addr));

        addr &= HPAGE_MASK;

        pg = pgd_offset(mm, addr);
        if (!pgd_none(*pg)) {
                pu = pud_offset(pg, addr);
                if (!pud_none(*pu)) {
#ifdef CONFIG_PPC_64K_PAGES
                        pmd_t *pm;
                        pm = pmd_offset(pu, addr);
                        if (!pmd_none(*pm))
                                return hugepte_offset((hugepd_t *)pm, addr);
#else
                        return hugepte_offset((hugepd_t *)pu, addr);
#endif
                }
        }

        return NULL;
}

pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
{
        pgd_t *pg;
        pud_t *pu;
        hugepd_t *hpdp = NULL;

        BUG_ON(!in_hugepage_area(mm->context, addr));

        addr &= HPAGE_MASK;

        pg = pgd_offset(mm, addr);
        pu = pud_alloc(mm, pg, addr);

        if (pu) {
#ifdef CONFIG_PPC_64K_PAGES
                pmd_t *pm;
                pm = pmd_alloc(mm, pu, addr);
                if (pm)
                        hpdp = (hugepd_t *)pm;
#else
                hpdp = (hugepd_t *)pu;
#endif
        }

        if (!hpdp)
                return NULL;

        if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr))
                return NULL;

        return hugepte_offset(hpdp, addr);
}
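
/*
 * Layout note (illustrative): within a hugepage area the normal bottom
 * level(s) of the page table are replaced by a single "hugepd" -- the
 * pud entry (pmd entry with 64K base pages) points at a table of
 * PTRS_PER_HUGEPTE hugepage PTEs and is tagged with HUGEPD_OK, so that
 * pud_bad()/pmd_bad() trip if it is ever treated as a normal table.
 * huge_pte_offset()/huge_pte_alloc() above index that table with
 * (addr >> HPAGE_SHIFT) & (PTRS_PER_HUGEPTE - 1); with the usual 16MB
 * hugepages (HPAGE_SHIFT == 24) addresses 16MB apart within one
 * HUGEPD_SIZE block land in adjacent hugepte slots.
 */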

static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp)
{
        pte_t *hugepte = hugepd_page(*hpdp);

        hpdp->pd = 0;
        tlb->need_flush = 1;
        pgtable_free_tlb(tlb, pgtable_free_cache(hugepte, HUGEPTE_CACHE_NUM,
                                                 HUGEPTE_TABLE_SIZE - 1));
}

#ifdef CONFIG_PPC_64K_PAGES
static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
                                   unsigned long addr, unsigned long end,
                                   unsigned long floor, unsigned long ceiling)
{
        pmd_t *pmd;
        unsigned long next;
        unsigned long start;

        start = addr;
        pmd = pmd_offset(pud, addr);
        do {
                next = pmd_addr_end(addr, end);
                if (pmd_none(*pmd))
                        continue;
                free_hugepte_range(tlb, (hugepd_t *)pmd);
        } while (pmd++, addr = next, addr != end);

        start &= PUD_MASK;
        if (start < floor)
                return;
        if (ceiling) {
                ceiling &= PUD_MASK;
                if (!ceiling)
                        return;
        }
        if (end - 1 > ceiling - 1)
                return;

        pmd = pmd_offset(pud, start);
        pud_clear(pud);
        pmd_free_tlb(tlb, pmd);
}
#endif

static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
                                   unsigned long addr, unsigned long end,
                                   unsigned long floor, unsigned long ceiling)
{
        pud_t *pud;
        unsigned long next;
        unsigned long start;

        start = addr;
        pud = pud_offset(pgd, addr);
        do {
                next = pud_addr_end(addr, end);
#ifdef CONFIG_PPC_64K_PAGES
                if (pud_none_or_clear_bad(pud))
                        continue;
                hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling);
#else
                if (pud_none(*pud))
                        continue;
                free_hugepte_range(tlb, (hugepd_t *)pud);
#endif
        } while (pud++, addr = next, addr != end);

        start &= PGDIR_MASK;
        if (start < floor)
                return;
        if (ceiling) {
                ceiling &= PGDIR_MASK;
                if (!ceiling)
                        return;
        }
        if (end - 1 > ceiling - 1)
                return;

        pud = pud_offset(pgd, start);
        pgd_clear(pgd);
        pud_free_tlb(tlb, pud);
}

/*
 * This function frees user-level page tables of a process.
 *
 * Must be called with pagetable lock held.
 */
void hugetlb_free_pgd_range(struct mmu_gather **tlb,
                            unsigned long addr, unsigned long end,
                            unsigned long floor, unsigned long ceiling)
{
        pgd_t *pgd;
        unsigned long next;
        unsigned long start;

        /*
         * Comments below are taken from the normal free_pgd_range().  They
         * apply here too.  The tests against HUGEPD_MASK below are
         * essential, because we *don't* test for this at the bottom
         * level.  Without them we'll attempt to free a hugepte table
         * when we unmap just part of it, even if there are other
         * active mappings using it.
         *
         * The next few lines have given us lots of grief...
         *
         * Why are we testing HUGEPD* at this top level?  Because
         * often there will be no work to do at all, and we'd prefer
         * not to go all the way down to the bottom just to discover
         * that.
         *
         * Why all these "- 1"s?  Because 0 represents both the bottom
         * of the address space and the top of it (using -1 for the
         * top wouldn't help much: the masks would do the wrong thing).
         * The rule is that addr 0 and floor 0 refer to the bottom of
         * the address space, but end 0 and ceiling 0 refer to the top.
         * Comparisons need to use "end - 1" and "ceiling - 1" (though
         * that end 0 case should be mythical).
         *
         * Wherever addr is brought up or ceiling brought down, we
         * must be careful to reject "the opposite 0" before it
         * confuses the subsequent tests.  But what about where end is
         * brought down by HUGEPD_SIZE below?  No, end can't go down to
         * 0 there.
         *
         * Whereas we round start (addr) and ceiling down, by different
         * masks at different levels, in order to test whether a table
         * now has no other vmas using it, so can be freed, we don't
         * bother to round floor or end up - the tests don't need that.
         */

        addr &= HUGEPD_MASK;
        if (addr < floor) {
                addr += HUGEPD_SIZE;
                if (!addr)
                        return;
        }
        if (ceiling) {
                ceiling &= HUGEPD_MASK;
                if (!ceiling)
                        return;
        }
        if (end - 1 > ceiling - 1)
                end -= HUGEPD_SIZE;
        if (addr > end - 1)
                return;

        start = addr;
        pgd = pgd_offset((*tlb)->mm, addr);
        do {
                BUG_ON(!in_hugepage_area((*tlb)->mm->context, addr));
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd))
                        continue;
                hugetlb_free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
        } while (pgd++, addr = next, addr != end);
}
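
/*
 * Example of the floor/ceiling dance above (numbers illustrative,
 * assuming a 256MB HUGEPD_SIZE): if we unmap only the upper part of a
 * hugepd block while a neighbouring vma below us still uses the lower
 * part, floor lies inside the block, so addr is rounded up past it and
 * the shared hugepte table is left alone.  Likewise ceiling is rounded
 * down, and ceiling == 0 stands for "top of the address space", which
 * is why the comparisons are done on "end - 1" and "ceiling - 1".
 */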

void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
                     pte_t *ptep, pte_t pte)
{
        if (pte_present(*ptep)) {
                /* We open-code pte_clear because we need to pass the right
                 * argument to hpte_update (huge / !huge)
                 */
                unsigned long old = pte_update(ptep, ~0UL);
                if (old & _PAGE_HASHPTE)
                        hpte_update(mm, addr & HPAGE_MASK, ptep, old, 1);
                flush_tlb_pending();
        }
        *ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
}

pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
                              pte_t *ptep)
{
        unsigned long old = pte_update(ptep, ~0UL);

        if (old & _PAGE_HASHPTE)
                hpte_update(mm, addr & HPAGE_MASK, ptep, old, 1);
        *ptep = __pte(0);

        return __pte(old);
}

struct slb_flush_info {
        struct mm_struct *mm;
        u16 newareas;
};

static void flush_low_segments(void *parm)
{
        struct slb_flush_info *fi = parm;
        unsigned long i;

        BUILD_BUG_ON((sizeof(fi->newareas) * 8) != NUM_LOW_AREAS);

        /* Only need to do anything if this CPU is working in the same
         * mm as the one which has changed */
        if (current->active_mm != fi->mm)
                return;

        /* update the paca copy of the context struct */
        get_paca()->context = current->active_mm->context;

        asm volatile("isync" : : : "memory");
        for (i = 0; i < NUM_LOW_AREAS; i++) {
                if (!(fi->newareas & (1U << i)))
                        continue;
                asm volatile("slbie %0"
                             : : "r" ((i << SID_SHIFT) | SLBIE_C));
        }
        asm volatile("isync" : : : "memory");
}
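
/*
 * Each high hugepage area spans more than one SLB segment, so
 * flush_high_segments() below has to issue one slbie per 256MB
 * segment -- 2^(HTLB_AREA_SHIFT - SID_SHIFT) of them for every
 * newly-opened area -- where flush_low_segments() above needs only
 * one slbie per area.  Both run via on_each_cpu() and only act on
 * CPUs currently running the affected mm.
 */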

static void flush_high_segments(void *parm)
{
        struct slb_flush_info *fi = parm;
        unsigned long i, j;

        BUILD_BUG_ON((sizeof(fi->newareas) * 8) != NUM_HIGH_AREAS);

        /* Only need to do anything if this CPU is working in the same
         * mm as the one which has changed */
        if (current->active_mm != fi->mm)
                return;

        /* update the paca copy of the context struct */
        get_paca()->context = current->active_mm->context;

        asm volatile("isync" : : : "memory");
        for (i = 0; i < NUM_HIGH_AREAS; i++) {
                if (!(fi->newareas & (1U << i)))
                        continue;
                for (j = 0; j < (1UL << (HTLB_AREA_SHIFT - SID_SHIFT)); j++)
                        asm volatile("slbie %0"
                                     : : "r" (((i << HTLB_AREA_SHIFT)
                                               + (j << SID_SHIFT)) | SLBIE_C));
        }
        asm volatile("isync" : : : "memory");
}

static int prepare_low_area_for_htlb(struct mm_struct *mm, unsigned long area)
{
        unsigned long start = area << SID_SHIFT;
        unsigned long end = (area + 1) << SID_SHIFT;
        struct vm_area_struct *vma;

        BUG_ON(area >= NUM_LOW_AREAS);

        /* Check no VMAs are in the region */
        vma = find_vma(mm, start);
        if (vma && (vma->vm_start < end))
                return -EBUSY;

        return 0;
}

static int prepare_high_area_for_htlb(struct mm_struct *mm, unsigned long area)
{
        unsigned long start = area << HTLB_AREA_SHIFT;
        unsigned long end = (area + 1) << HTLB_AREA_SHIFT;
        struct vm_area_struct *vma;

        BUG_ON(area >= NUM_HIGH_AREAS);

        /* Hack, so that each address is controlled by exactly one
         * of the high or low area bitmaps, the first high area starts
         * at 4GB, not 0 */
        if (start == 0)
                start = 0x100000000UL;

        /* Check no VMAs are in the region */
        vma = find_vma(mm, start);
        if (vma && (vma->vm_start < end))
                return -EBUSY;

        return 0;
}

static int open_low_hpage_areas(struct mm_struct *mm, u16 newareas)
{
        unsigned long i;
        struct slb_flush_info fi;

        BUILD_BUG_ON((sizeof(newareas) * 8) != NUM_LOW_AREAS);
        BUILD_BUG_ON((sizeof(mm->context.low_htlb_areas) * 8) != NUM_LOW_AREAS);

        newareas &= ~(mm->context.low_htlb_areas);
        if (!newareas)
                return 0; /* The segments we want are already open */

        for (i = 0; i < NUM_LOW_AREAS; i++)
                if ((1 << i) & newareas)
                        if (prepare_low_area_for_htlb(mm, i) != 0)
                                return -EBUSY;

        mm->context.low_htlb_areas |= newareas;

        /* the context change must make it to memory before the flush,
         * so that further SLB misses do the right thing. */
        mb();

        fi.mm = mm;
        fi.newareas = newareas;
        on_each_cpu(flush_low_segments, &fi, 0, 1);

        return 0;
}
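
/*
 * The "newareas" bitmaps are one bit per area.  For the low range that
 * means one bit per 256MB SLB segment below 4GB (16 bits, hence the
 * BUILD_BUG_ONs against a u16): e.g. a hugepage mapping covering
 * 0x10000000-0x30000000 touches segments 1 and 2, so LOW_ESID_MASK()
 * for it is 0x0006 and both segments must be free of normal VMAs
 * before they can be opened.  (Example values only; LOW_ESID_MASK()
 * and HTLB_AREA_MASK() are defined outside this file.)
 */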

static int open_high_hpage_areas(struct mm_struct *mm, u16 newareas)
{
        struct slb_flush_info fi;
        unsigned long i;

        BUILD_BUG_ON((sizeof(newareas) * 8) != NUM_HIGH_AREAS);
        BUILD_BUG_ON((sizeof(mm->context.high_htlb_areas) * 8)
                     != NUM_HIGH_AREAS);

        newareas &= ~(mm->context.high_htlb_areas);
        if (!newareas)
                return 0; /* The areas we want are already open */

        for (i = 0; i < NUM_HIGH_AREAS; i++)
                if ((1 << i) & newareas)
                        if (prepare_high_area_for_htlb(mm, i) != 0)
                                return -EBUSY;

        mm->context.high_htlb_areas |= newareas;

        /* update the paca copy of the context struct */
        get_paca()->context = mm->context;

        /* the context change must make it to memory before the flush,
         * so that further SLB misses do the right thing. */
        mb();

        fi.mm = mm;
        fi.newareas = newareas;
        on_each_cpu(flush_high_segments, &fi, 0, 1);

        return 0;
}

int prepare_hugepage_range(unsigned long addr, unsigned long len)
{
        int err = 0;

        if ((addr + len) < addr)
                return -EINVAL;

        if (addr < 0x100000000UL)
                err = open_low_hpage_areas(current->mm,
                                           LOW_ESID_MASK(addr, len));
        if ((addr + len) > 0x100000000UL)
                err = open_high_hpage_areas(current->mm,
                                            HTLB_AREA_MASK(addr, len));
        if (err) {
                printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)"
                       " failed (lowmask: 0x%04hx, highmask: 0x%04hx)\n",
                       addr, len,
                       LOW_ESID_MASK(addr, len), HTLB_AREA_MASK(addr, len));
                return err;
        }

        return 0;
}

struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
        pte_t *ptep;
        struct page *page;

        if (!in_hugepage_area(mm->context, address))
                return ERR_PTR(-EINVAL);

        ptep = huge_pte_offset(mm, address);
        page = pte_page(*ptep);
        if (page)
                page += (address % HPAGE_SIZE) / PAGE_SIZE;

        return page;
}

int pmd_huge(pmd_t pmd)
{
        return 0;
}

struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
                pmd_t *pmd, int write)
{
        BUG();
        return NULL;
}

/* Because we have an exclusive hugepage region which lies within the
 * normal user address space, we have to take special measures to make
 * non-huge mmap()s evade the hugepage reserved regions. */
unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
                                     unsigned long len, unsigned long pgoff,
                                     unsigned long flags)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        unsigned long start_addr;

        if (len > TASK_SIZE)
                return -ENOMEM;

        if (addr) {
                addr = PAGE_ALIGN(addr);
                vma = find_vma(mm, addr);
                if (((TASK_SIZE - len) >= addr)
                    && (!vma || (addr + len) <= vma->vm_start)
                    && !is_hugepage_only_range(mm, addr, len))
                        return addr;
        }
        if (len > mm->cached_hole_size) {
                start_addr = addr = mm->free_area_cache;
        } else {
                start_addr = addr = TASK_UNMAPPED_BASE;
                mm->cached_hole_size = 0;
        }

full_search:
        vma = find_vma(mm, addr);
        while (TASK_SIZE - len >= addr) {
                BUG_ON(vma && (addr >= vma->vm_end));

                if (touches_hugepage_low_range(mm, addr, len)) {
                        addr = ALIGN(addr + 1, 1 << SID_SHIFT);
                        vma = find_vma(mm, addr);
                        continue;
                }
                if (touches_hugepage_high_range(mm, addr, len)) {
                        addr = ALIGN(addr + 1, 1UL << HTLB_AREA_SHIFT);
                        vma = find_vma(mm, addr);
                        continue;
                }
                if (!vma || addr + len <= vma->vm_start) {
                        /*
                         * Remember the place where we stopped the search:
                         */
                        mm->free_area_cache = addr + len;
                        return addr;
                }
                if (addr + mm->cached_hole_size < vma->vm_start)
                        mm->cached_hole_size = vma->vm_start - addr;
                addr = vma->vm_end;
                vma = vma->vm_next;
        }

        /* Make sure we didn't miss any holes */
        if (start_addr != TASK_UNMAPPED_BASE) {
                start_addr = addr = TASK_UNMAPPED_BASE;
                mm->cached_hole_size = 0;
                goto full_search;
        }
        return -ENOMEM;
}
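
/*
 * Note on the search above: whenever a candidate range touches an
 * already-opened hugepage area, we jump straight to the next segment
 * boundary (1 << SID_SHIFT, i.e. 256MB) or the next high-area boundary
 * above 4GB rather than probing page by page, then re-check against
 * the VMA list.  mm->free_area_cache and mm->cached_hole_size are the
 * usual first-fit caching used by the generic allocator.
 */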

/*
 * This mmap-allocator allocates new areas top-down from below the
 * stack's low limit (the base):
 *
 * Because we have an exclusive hugepage region which lies within the
 * normal user address space, we have to take special measures to make
 * non-huge mmap()s evade the hugepage reserved regions.
 */
unsigned long
arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
                               const unsigned long len, const unsigned long pgoff,
                               const unsigned long flags)
{
        struct vm_area_struct *vma, *prev_vma;
        struct mm_struct *mm = current->mm;
        unsigned long base = mm->mmap_base, addr = addr0;
        unsigned long largest_hole = mm->cached_hole_size;
        int first_time = 1;

        /* requested length too big for entire address space */
        if (len > TASK_SIZE)
                return -ENOMEM;

        /* don't allow allocations above current base */
        if (mm->free_area_cache > base)
                mm->free_area_cache = base;

        /* requesting a specific address */
        if (addr) {
                addr = PAGE_ALIGN(addr);
                vma = find_vma(mm, addr);
                if (TASK_SIZE - len >= addr
                    && (!vma || addr + len <= vma->vm_start)
                    && !is_hugepage_only_range(mm, addr, len))
                        return addr;
        }

        if (len <= largest_hole) {
                largest_hole = 0;
                mm->free_area_cache = base;
        }
try_again:
        /* make sure it can fit in the remaining address space */
        if (mm->free_area_cache < len)
                goto fail;

        /* either no address requested or can't fit in requested address hole */
        addr = (mm->free_area_cache - len) & PAGE_MASK;
        do {
hugepage_recheck:
                if (touches_hugepage_low_range(mm, addr, len)) {
                        addr = (addr & ((~0) << SID_SHIFT)) - len;
                        goto hugepage_recheck;
                } else if (touches_hugepage_high_range(mm, addr, len)) {
                        addr = (addr & ((~0UL) << HTLB_AREA_SHIFT)) - len;
                        goto hugepage_recheck;
                }

                /*
                 * Lookup failure means no vma is above this address,
                 * i.e. return with success:
                 */
                if (!(vma = find_vma_prev(mm, addr, &prev_vma)))
                        return addr;

                /*
                 * new region fits between prev_vma->vm_end and
                 * vma->vm_start, use it:
                 */
                if (addr + len <= vma->vm_start &&
                    (!prev_vma || (addr >= prev_vma->vm_end))) {
                        /* remember the address as a hint for next time */
                        mm->cached_hole_size = largest_hole;
                        return (mm->free_area_cache = addr);
                } else {
                        /* pull free_area_cache down to the first hole */
                        if (mm->free_area_cache == vma->vm_end) {
                                mm->free_area_cache = vma->vm_start;
                                mm->cached_hole_size = largest_hole;
                        }
                }

                /* remember the largest hole we saw so far */
                if (addr + largest_hole < vma->vm_start)
                        largest_hole = vma->vm_start - addr;

                /* try just below the current vma->vm_start */
                addr = vma->vm_start - len;
        } while (len <= vma->vm_start);

fail:
        /*
         * if hint left us with no space for the requested
         * mapping then try again:
         */
        if (first_time) {
                mm->free_area_cache = base;
                largest_hole = 0;
                first_time = 0;
                goto try_again;
        }
        /*
         * A failed mmap() very likely causes application failure,
         * so fall back to the bottom-up function here. This scenario
         * can happen with large stack limits and large mmap()
         * allocations.
         */
        mm->free_area_cache = TASK_UNMAPPED_BASE;
        mm->cached_hole_size = ~0UL;
        addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
        /*
         * Restore the topdown base:
         */
        mm->free_area_cache = base;
        mm->cached_hole_size = ~0UL;

        return addr;
}
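
/*
 * The helpers below do the actual searching for
 * hugetlb_get_unmapped_area(): htlb_check_hinted_area() verifies that
 * a hinted [addr, addr + len) does not collide with an existing VMA,
 * while htlb_get_low_area()/htlb_get_high_area() look for a free range
 * below 4GB or above it, restricted to the areas enabled in the
 * bitmask they are given.
 */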

static int htlb_check_hinted_area(unsigned long addr, unsigned long len)
{
        struct vm_area_struct *vma;

        vma = find_vma(current->mm, addr);
        if (!vma || ((addr + len) <= vma->vm_start))
                return 0;

        return -ENOMEM;
}

static unsigned long htlb_get_low_area(unsigned long len, u16 segmask)
{
        unsigned long addr = 0;
        struct vm_area_struct *vma;

        vma = find_vma(current->mm, addr);
        while (addr + len <= 0x100000000UL) {
                BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */

                if (!__within_hugepage_low_range(addr, len, segmask)) {
                        addr = ALIGN(addr + 1, 1 << SID_SHIFT);
                        vma = find_vma(current->mm, addr);
                        continue;
                }

                if (!vma || (addr + len) <= vma->vm_start)
                        return addr;
                addr = ALIGN(vma->vm_end, HPAGE_SIZE);
                /* Depending on segmask this might not be a confirmed
                 * hugepage region, so the ALIGN could have skipped
                 * some VMAs */
                vma = find_vma(current->mm, addr);
        }

        return -ENOMEM;
}

static unsigned long htlb_get_high_area(unsigned long len, u16 areamask)
{
        unsigned long addr = 0x100000000UL;
        struct vm_area_struct *vma;

        vma = find_vma(current->mm, addr);
        while (addr + len <= TASK_SIZE_USER64) {
                BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */

                if (!__within_hugepage_high_range(addr, len, areamask)) {
                        addr = ALIGN(addr + 1, 1UL << HTLB_AREA_SHIFT);
                        vma = find_vma(current->mm, addr);
                        continue;
                }

                if (!vma || (addr + len) <= vma->vm_start)
                        return addr;
                addr = ALIGN(vma->vm_end, HPAGE_SIZE);
                /* Depending on areamask this might not be a confirmed
                 * hugepage region, so the ALIGN could have skipped
                 * some VMAs */
                vma = find_vma(current->mm, addr);
        }

        return -ENOMEM;
}
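
/*
 * hugetlb_get_unmapped_area() tries, in order:
 *   1. the caller's hint address, if it does not collide with a VMA
 *      and its areas can be opened;
 *   2. a fit within the areas this mm already has open;
 *   3. progressively opening further areas, working down from the top
 *      of the low or high range, until a fit is found.
 * 32-bit tasks use the low (below 4GB) areas, 64-bit tasks the high
 * ones.
 */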

unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
                                        unsigned long len, unsigned long pgoff,
                                        unsigned long flags)
{
        int lastshift;
        u16 areamask, curareas;

        if (HPAGE_SHIFT == 0)
                return -EINVAL;
        if (len & ~HPAGE_MASK)
                return -EINVAL;

        if (!cpu_has_feature(CPU_FTR_16M_PAGE))
                return -EINVAL;

        /* Paranoia, caller should have dealt with this */
        BUG_ON((addr + len) < addr);

        if (test_thread_flag(TIF_32BIT)) {
                /* Paranoia, caller should have dealt with this */
                BUG_ON((addr + len) > 0x100000000UL);

                curareas = current->mm->context.low_htlb_areas;

                /* First see if we can use the hint address */
                if (addr && (htlb_check_hinted_area(addr, len) == 0)) {
                        areamask = LOW_ESID_MASK(addr, len);
                        if (open_low_hpage_areas(current->mm, areamask) == 0)
                                return addr;
                }

                /* Next see if we can map in the existing low areas */
                addr = htlb_get_low_area(len, curareas);
                if (addr != -ENOMEM)
                        return addr;

                /* Finally go looking for areas to open */
                lastshift = 0;
                for (areamask = LOW_ESID_MASK(0x100000000UL - len, len);
                     !lastshift; areamask >>= 1) {
                        if (areamask & 1)
                                lastshift = 1;

                        addr = htlb_get_low_area(len, curareas | areamask);
                        if ((addr != -ENOMEM)
                            && open_low_hpage_areas(current->mm, areamask) == 0)
                                return addr;
                }
        } else {
                curareas = current->mm->context.high_htlb_areas;

                /* First see if we can use the hint address */
                /* We discourage 64-bit processes from doing hugepage
                 * mappings below 4GB (must use MAP_FIXED) */
                if ((addr >= 0x100000000UL)
                    && (htlb_check_hinted_area(addr, len) == 0)) {
                        areamask = HTLB_AREA_MASK(addr, len);
                        if (open_high_hpage_areas(current->mm, areamask) == 0)
                                return addr;
                }

                /* Next see if we can map in the existing high areas */
                addr = htlb_get_high_area(len, curareas);
                if (addr != -ENOMEM)
                        return addr;

                /* Finally go looking for areas to open */
                lastshift = 0;
                for (areamask = HTLB_AREA_MASK(TASK_SIZE_USER64 - len, len);
                     !lastshift; areamask >>= 1) {
                        if (areamask & 1)
                                lastshift = 1;

                        addr = htlb_get_high_area(len, curareas | areamask);
                        if ((addr != -ENOMEM)
                            && open_high_hpage_areas(current->mm, areamask) == 0)
                                return addr;
                }
        }
        printk(KERN_DEBUG "hugetlb_get_unmapped_area() unable to open"
               " enough areas\n");
        return -ENOMEM;
}

/*
 * Called by asm hashtable.S for doing lazy icache flush
 */
static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
                                                  pte_t pte, int trap)
{
        struct page *page;
        int i;

        if (!pfn_valid(pte_pfn(pte)))
                return rflags;

        page = pte_page(pte);

        /* page is dirty */
        if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
                if (trap == 0x400) {
                        for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++)
                                __flush_dcache_icache(page_address(page + i));
                        set_bit(PG_arch_1, &page->flags);
                } else {
                        rflags |= HPTE_R_N;
                }
        }
        return rflags;
}
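
/*
 * hash_huge_page() is the hugepage counterpart of the normal
 * hash_page() path: it is reached on a hash fault whose address lies
 * in a hugepage area, and it builds or updates the hardware HPTE for
 * the Linux PTE found via huge_pte_offset().  It returns 0 once an
 * HPTE is in place, or non-zero so the caller hands the fault on to
 * do_page_fault().
 */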

int hash_huge_page(struct mm_struct *mm, unsigned long access,
                   unsigned long ea, unsigned long vsid, int local,
                   unsigned long trap)
{
        pte_t *ptep;
        unsigned long old_pte, new_pte;
        unsigned long va, rflags, pa;
        long slot;
        int err = 1;

        ptep = huge_pte_offset(mm, ea);

        /* Search the Linux page table for a match with va */
        va = (vsid << 28) | (ea & 0x0fffffff);

        /*
         * If no pte found or not present, send the problem up to
         * do_page_fault
         */
        if (unlikely(!ptep || pte_none(*ptep)))
                goto out;

        /*
         * Check the user's access rights to the page.  If access should be
         * prevented then send the problem up to do_page_fault.
         */
        if (unlikely(access & ~pte_val(*ptep)))
                goto out;
        /*
         * At this point, we have a pte (old_pte) which can be used to build
         * or update an HPTE. There are 2 cases:
         *
         * 1. There is a valid (present) pte with no associated HPTE (this is
         *	the most common case)
         * 2. There is a valid (present) pte with an associated HPTE. The
         *	current values of the pp bits in the HPTE prevent access
         *	because we are doing software DIRTY bit management and the
         *	page is currently not DIRTY.
         */

        do {
                old_pte = pte_val(*ptep);
                if (old_pte & _PAGE_BUSY)
                        goto out;
                new_pte = old_pte | _PAGE_BUSY |
                        _PAGE_ACCESSED | _PAGE_HASHPTE;
        } while (old_pte != __cmpxchg_u64((unsigned long *)ptep,
                                          old_pte, new_pte));

        rflags = 0x2 | (!(new_pte & _PAGE_RW));
        /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
        rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
        if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
                /* No CPU has hugepages but lacks no execute, so we
                 * don't need to worry about that case */
                rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
                                                       trap);

        /* Check if pte already has an hpte (case 2) */
        if (unlikely(old_pte & _PAGE_HASHPTE)) {
                /* There MIGHT be an HPTE for this pte */
                unsigned long hash, slot;

                hash = hpt_hash(va, HPAGE_SHIFT);
                if (old_pte & _PAGE_F_SECOND)
                        hash = ~hash;
                slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
                slot += (old_pte & _PAGE_F_GIX) >> 12;

                if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_huge_psize,
                                         local) == -1)
                        old_pte &= ~_PAGE_HPTEFLAGS;
        }

        if (likely(!(old_pte & _PAGE_HASHPTE))) {
                unsigned long hash = hpt_hash(va, HPAGE_SHIFT);
                unsigned long hpte_group;

                pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;

repeat:
                hpte_group = ((hash & htab_hash_mask) *
                              HPTES_PER_GROUP) & ~0x7UL;

                /* clear HPTE slot information in new PTE */
                new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;

                /* Add in WIMG bits */
                /* XXX We should store these in the pte */
                /* --BenH: I think they are ... */
                rflags |= _PAGE_COHERENT;

                /* Insert into the hash table, primary slot */
                slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
                                          mmu_huge_psize);

                /* Primary is full, try the secondary */
                if (unlikely(slot == -1)) {
                        new_pte |= _PAGE_F_SECOND;
                        hpte_group = ((~hash & htab_hash_mask) *
                                      HPTES_PER_GROUP) & ~0x7UL;
                        slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
                                                  HPTE_V_SECONDARY,
                                                  mmu_huge_psize);
                        if (slot == -1) {
                                if (mftb() & 0x1)
                                        hpte_group = ((hash & htab_hash_mask) *
                                                      HPTES_PER_GROUP) & ~0x7UL;

                                ppc_md.hpte_remove(hpte_group);
                                goto repeat;
                        }
                }

                if (unlikely(slot == -2))
                        panic("hash_huge_page: pte_insert failed\n");

                new_pte |= (slot << 12) & _PAGE_F_GIX;
        }

        /*
         * No need to use ldarx/stdcx here
         */
        *ptep = __pte(new_pte & ~_PAGE_BUSY);

        err = 0;

out:
        return err;
}

static void zero_ctor(void *addr, kmem_cache_t *cache, unsigned long flags)
{
        memset(addr, 0, kmem_cache_size(cache));
}

static int __init hugetlbpage_init(void)
{
        if (!cpu_has_feature(CPU_FTR_16M_PAGE))
                return -ENODEV;

        huge_pgtable_cache = kmem_cache_create("hugepte_cache",
                                               HUGEPTE_TABLE_SIZE,
                                               HUGEPTE_TABLE_SIZE,
                                               SLAB_HWCACHE_ALIGN |
                                               SLAB_MUST_HWCACHE_ALIGN,
                                               zero_ctor, NULL);
        if (!huge_pgtable_cache)
                panic("hugetlbpage_init(): could not create hugepte cache\n");

        return 0;
}

module_init(hugetlbpage_init);