1 /* 2 * Generic hugetlb support. 3 * (C) Nadia Yvette Chambers, April 2004 4 */ 5 #include <linux/list.h> 6 #include <linux/init.h> 7 #include <linux/module.h> 8 #include <linux/mm.h> 9 #include <linux/seq_file.h> 10 #include <linux/sysctl.h> 11 #include <linux/highmem.h> 12 #include <linux/mmu_notifier.h> 13 #include <linux/nodemask.h> 14 #include <linux/pagemap.h> 15 #include <linux/mempolicy.h> 16 #include <linux/compiler.h> 17 #include <linux/cpuset.h> 18 #include <linux/mutex.h> 19 #include <linux/bootmem.h> 20 #include <linux/sysfs.h> 21 #include <linux/slab.h> 22 #include <linux/rmap.h> 23 #include <linux/swap.h> 24 #include <linux/swapops.h> 25 #include <linux/page-isolation.h> 26 #include <linux/jhash.h> 27 28 #include <asm/page.h> 29 #include <asm/pgtable.h> 30 #include <asm/tlb.h> 31 32 #include <linux/io.h> 33 #include <linux/hugetlb.h> 34 #include <linux/hugetlb_cgroup.h> 35 #include <linux/node.h> 36 #include "internal.h" 37 38 int hugepages_treat_as_movable; 39 40 int hugetlb_max_hstate __read_mostly; 41 unsigned int default_hstate_idx; 42 struct hstate hstates[HUGE_MAX_HSTATE]; 43 /* 44 * Minimum page order among possible hugepage sizes, set to a proper value 45 * at boot time. 46 */ 47 static unsigned int minimum_order __read_mostly = UINT_MAX; 48 49 __initdata LIST_HEAD(huge_boot_pages); 50 51 /* for command line parsing */ 52 static struct hstate * __initdata parsed_hstate; 53 static unsigned long __initdata default_hstate_max_huge_pages; 54 static unsigned long __initdata default_hstate_size; 55 56 /* 57 * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages, 58 * free_huge_pages, and surplus_huge_pages. 59 */ 60 DEFINE_SPINLOCK(hugetlb_lock); 61 62 /* 63 * Serializes faults on the same logical page. This is used to 64 * prevent spurious OOMs when the hugepage pool is fully utilized. 65 */ 66 static int num_fault_mutexes; 67 static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp; 68 69 /* Forward declaration */ 70 static int hugetlb_acct_memory(struct hstate *h, long delta); 71 72 static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) 73 { 74 bool free = (spool->count == 0) && (spool->used_hpages == 0); 75 76 spin_unlock(&spool->lock); 77 78 /* If no pages are used, and no other handles to the subpool 79 * remain, give up any reservations mased on minimum size and 80 * free the subpool */ 81 if (free) { 82 if (spool->min_hpages != -1) 83 hugetlb_acct_memory(spool->hstate, 84 -spool->min_hpages); 85 kfree(spool); 86 } 87 } 88 89 struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, 90 long min_hpages) 91 { 92 struct hugepage_subpool *spool; 93 94 spool = kzalloc(sizeof(*spool), GFP_KERNEL); 95 if (!spool) 96 return NULL; 97 98 spin_lock_init(&spool->lock); 99 spool->count = 1; 100 spool->max_hpages = max_hpages; 101 spool->hstate = h; 102 spool->min_hpages = min_hpages; 103 104 if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) { 105 kfree(spool); 106 return NULL; 107 } 108 spool->rsv_hpages = min_hpages; 109 110 return spool; 111 } 112 113 void hugepage_put_subpool(struct hugepage_subpool *spool) 114 { 115 spin_lock(&spool->lock); 116 BUG_ON(!spool->count); 117 spool->count--; 118 unlock_or_release_subpool(spool); 119 } 120 121 /* 122 * Subpool accounting for allocating and reserving pages. 123 * Return -ENOMEM if there are not enough resources to satisfy the 124 * the request. Otherwise, return the number of pages by which the 125 * global pools must be adjusted (upward). 
The returned value may 126 * only be different than the passed value (delta) in the case where 127 * a subpool minimum size must be manitained. 128 */ 129 static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, 130 long delta) 131 { 132 long ret = delta; 133 134 if (!spool) 135 return ret; 136 137 spin_lock(&spool->lock); 138 139 if (spool->max_hpages != -1) { /* maximum size accounting */ 140 if ((spool->used_hpages + delta) <= spool->max_hpages) 141 spool->used_hpages += delta; 142 else { 143 ret = -ENOMEM; 144 goto unlock_ret; 145 } 146 } 147 148 if (spool->min_hpages != -1) { /* minimum size accounting */ 149 if (delta > spool->rsv_hpages) { 150 /* 151 * Asking for more reserves than those already taken on 152 * behalf of subpool. Return difference. 153 */ 154 ret = delta - spool->rsv_hpages; 155 spool->rsv_hpages = 0; 156 } else { 157 ret = 0; /* reserves already accounted for */ 158 spool->rsv_hpages -= delta; 159 } 160 } 161 162 unlock_ret: 163 spin_unlock(&spool->lock); 164 return ret; 165 } 166 167 /* 168 * Subpool accounting for freeing and unreserving pages. 169 * Return the number of global page reservations that must be dropped. 170 * The return value may only be different than the passed value (delta) 171 * in the case where a subpool minimum size must be maintained. 172 */ 173 static long hugepage_subpool_put_pages(struct hugepage_subpool *spool, 174 long delta) 175 { 176 long ret = delta; 177 178 if (!spool) 179 return delta; 180 181 spin_lock(&spool->lock); 182 183 if (spool->max_hpages != -1) /* maximum size accounting */ 184 spool->used_hpages -= delta; 185 186 if (spool->min_hpages != -1) { /* minimum size accounting */ 187 if (spool->rsv_hpages + delta <= spool->min_hpages) 188 ret = 0; 189 else 190 ret = spool->rsv_hpages + delta - spool->min_hpages; 191 192 spool->rsv_hpages += delta; 193 if (spool->rsv_hpages > spool->min_hpages) 194 spool->rsv_hpages = spool->min_hpages; 195 } 196 197 /* 198 * If hugetlbfs_put_super couldn't free spool due to an outstanding 199 * quota reference, free it now. 200 */ 201 unlock_or_release_subpool(spool); 202 203 return ret; 204 } 205 206 static inline struct hugepage_subpool *subpool_inode(struct inode *inode) 207 { 208 return HUGETLBFS_SB(inode->i_sb)->spool; 209 } 210 211 static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) 212 { 213 return subpool_inode(file_inode(vma->vm_file)); 214 } 215 216 /* 217 * Region tracking -- allows tracking of reservations and instantiated pages 218 * across the pages in a mapping. 219 * 220 * The region data structures are embedded into a resv_map and protected 221 * by a resv_map's lock. The set of regions within the resv_map represent 222 * reservations for huge pages, or huge pages that have already been 223 * instantiated within the map. The from and to elements are huge page 224 * indicies into the associated mapping. from indicates the starting index 225 * of the region. to represents the first index past the end of the region. 226 * 227 * For example, a file region structure with from == 0 and to == 4 represents 228 * four huge pages in a mapping. It is important to note that the to element 229 * represents the first element past the end of the region. This is used in 230 * arithmetic as 4(to) - 0(from) = 4 huge pages in the region. 231 * 232 * Interval notation of the form [from, to) will be used to indicate that 233 * the endpoint from is inclusive and to is exclusive. 
234 */ 235 struct file_region { 236 struct list_head link; 237 long from; 238 long to; 239 }; 240 241 /* 242 * Add the huge page range represented by [f, t) to the reserve 243 * map. Existing regions will be expanded to accommodate the 244 * specified range. We know only existing regions need to be 245 * expanded, because region_add is only called after region_chg 246 * with the same range. If a new file_region structure must 247 * be allocated, it is done in region_chg. 248 * 249 * Return the number of new huge pages added to the map. This 250 * number is greater than or equal to zero. 251 */ 252 static long region_add(struct resv_map *resv, long f, long t) 253 { 254 struct list_head *head = &resv->regions; 255 struct file_region *rg, *nrg, *trg; 256 long add = 0; 257 258 spin_lock(&resv->lock); 259 /* Locate the region we are either in or before. */ 260 list_for_each_entry(rg, head, link) 261 if (f <= rg->to) 262 break; 263 264 /* Round our left edge to the current segment if it encloses us. */ 265 if (f > rg->from) 266 f = rg->from; 267 268 /* Check for and consume any regions we now overlap with. */ 269 nrg = rg; 270 list_for_each_entry_safe(rg, trg, rg->link.prev, link) { 271 if (&rg->link == head) 272 break; 273 if (rg->from > t) 274 break; 275 276 /* If this area reaches higher then extend our area to 277 * include it completely. If this is not the first area 278 * which we intend to reuse, free it. */ 279 if (rg->to > t) 280 t = rg->to; 281 if (rg != nrg) { 282 /* Decrement return value by the deleted range. 283 * Another range will span this area so that by 284 * end of routine add will be >= zero 285 */ 286 add -= (rg->to - rg->from); 287 list_del(&rg->link); 288 kfree(rg); 289 } 290 } 291 292 add += (nrg->from - f); /* Added to beginning of region */ 293 nrg->from = f; 294 add += t - nrg->to; /* Added to end of region */ 295 nrg->to = t; 296 297 spin_unlock(&resv->lock); 298 VM_BUG_ON(add < 0); 299 return add; 300 } 301 302 /* 303 * Examine the existing reserve map and determine how many 304 * huge pages in the specified range [f, t) are NOT currently 305 * represented. This routine is called before a subsequent 306 * call to region_add that will actually modify the reserve 307 * map to add the specified range [f, t). region_chg does 308 * not change the number of huge pages represented by the 309 * map. However, if the existing regions in the map can not 310 * be expanded to represent the new range, a new file_region 311 * structure is added to the map as a placeholder. This is 312 * so that the subsequent region_add call will have all the 313 * regions it needs and will not fail. 314 * 315 * Returns the number of huge pages that need to be added 316 * to the existing reservation map for the range [f, t). 317 * This number is greater or equal to zero. -ENOMEM is 318 * returned if a new file_region structure is needed and can 319 * not be allocated. 320 */ 321 static long region_chg(struct resv_map *resv, long f, long t) 322 { 323 struct list_head *head = &resv->regions; 324 struct file_region *rg, *nrg = NULL; 325 long chg = 0; 326 327 retry: 328 spin_lock(&resv->lock); 329 /* Locate the region we are before or in. */ 330 list_for_each_entry(rg, head, link) 331 if (f <= rg->to) 332 break; 333 334 /* If we are below the current region then a new region is required. 335 * Subtle, allocate a new region at the position but make it zero 336 * size such that we can guarantee to record the reservation. 
*/ 337 if (&rg->link == head || t < rg->from) { 338 if (!nrg) { 339 spin_unlock(&resv->lock); 340 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); 341 if (!nrg) 342 return -ENOMEM; 343 344 nrg->from = f; 345 nrg->to = f; 346 INIT_LIST_HEAD(&nrg->link); 347 goto retry; 348 } 349 350 list_add(&nrg->link, rg->link.prev); 351 chg = t - f; 352 goto out_nrg; 353 } 354 355 /* Round our left edge to the current segment if it encloses us. */ 356 if (f > rg->from) 357 f = rg->from; 358 chg = t - f; 359 360 /* Check for and consume any regions we now overlap with. */ 361 list_for_each_entry(rg, rg->link.prev, link) { 362 if (&rg->link == head) 363 break; 364 if (rg->from > t) 365 goto out; 366 367 /* We overlap with this area, if it extends further than 368 * us then we must extend ourselves. Account for its 369 * existing reservation. */ 370 if (rg->to > t) { 371 chg += rg->to - t; 372 t = rg->to; 373 } 374 chg -= rg->to - rg->from; 375 } 376 377 out: 378 spin_unlock(&resv->lock); 379 /* We already know we raced and no longer need the new region */ 380 kfree(nrg); 381 return chg; 382 out_nrg: 383 spin_unlock(&resv->lock); 384 return chg; 385 } 386 387 /* 388 * Truncate the reserve map at index 'end'. Modify/truncate any 389 * region which contains end. Delete any regions past end. 390 * Return the number of huge pages removed from the map. 391 */ 392 static long region_truncate(struct resv_map *resv, long end) 393 { 394 struct list_head *head = &resv->regions; 395 struct file_region *rg, *trg; 396 long chg = 0; 397 398 spin_lock(&resv->lock); 399 /* Locate the region we are either in or before. */ 400 list_for_each_entry(rg, head, link) 401 if (end <= rg->to) 402 break; 403 if (&rg->link == head) 404 goto out; 405 406 /* If we are in the middle of a region then adjust it. */ 407 if (end > rg->from) { 408 chg = rg->to - end; 409 rg->to = end; 410 rg = list_entry(rg->link.next, typeof(*rg), link); 411 } 412 413 /* Drop any remaining regions. */ 414 list_for_each_entry_safe(rg, trg, rg->link.prev, link) { 415 if (&rg->link == head) 416 break; 417 chg += rg->to - rg->from; 418 list_del(&rg->link); 419 kfree(rg); 420 } 421 422 out: 423 spin_unlock(&resv->lock); 424 return chg; 425 } 426 427 /* 428 * Count and return the number of huge pages in the reserve map 429 * that intersect with the range [f, t). 430 */ 431 static long region_count(struct resv_map *resv, long f, long t) 432 { 433 struct list_head *head = &resv->regions; 434 struct file_region *rg; 435 long chg = 0; 436 437 spin_lock(&resv->lock); 438 /* Locate each segment we overlap with, and count that overlap. */ 439 list_for_each_entry(rg, head, link) { 440 long seg_from; 441 long seg_to; 442 443 if (rg->to <= f) 444 continue; 445 if (rg->from >= t) 446 break; 447 448 seg_from = max(rg->from, f); 449 seg_to = min(rg->to, t); 450 451 chg += seg_to - seg_from; 452 } 453 spin_unlock(&resv->lock); 454 455 return chg; 456 } 457 458 /* 459 * Convert the address within this vma to the page offset within 460 * the mapping, in pagecache page units; huge pages here. 461 */ 462 static pgoff_t vma_hugecache_offset(struct hstate *h, 463 struct vm_area_struct *vma, unsigned long address) 464 { 465 return ((address - vma->vm_start) >> huge_page_shift(h)) + 466 (vma->vm_pgoff >> huge_page_order(h)); 467 } 468 469 pgoff_t linear_hugepage_index(struct vm_area_struct *vma, 470 unsigned long address) 471 { 472 return vma_hugecache_offset(hstate_vma(vma), vma, address); 473 } 474 475 /* 476 * Return the size of the pages allocated when backing a VMA. 
In the majority 477 * cases this will be same size as used by the page table entries. 478 */ 479 unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) 480 { 481 struct hstate *hstate; 482 483 if (!is_vm_hugetlb_page(vma)) 484 return PAGE_SIZE; 485 486 hstate = hstate_vma(vma); 487 488 return 1UL << huge_page_shift(hstate); 489 } 490 EXPORT_SYMBOL_GPL(vma_kernel_pagesize); 491 492 /* 493 * Return the page size being used by the MMU to back a VMA. In the majority 494 * of cases, the page size used by the kernel matches the MMU size. On 495 * architectures where it differs, an architecture-specific version of this 496 * function is required. 497 */ 498 #ifndef vma_mmu_pagesize 499 unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) 500 { 501 return vma_kernel_pagesize(vma); 502 } 503 #endif 504 505 /* 506 * Flags for MAP_PRIVATE reservations. These are stored in the bottom 507 * bits of the reservation map pointer, which are always clear due to 508 * alignment. 509 */ 510 #define HPAGE_RESV_OWNER (1UL << 0) 511 #define HPAGE_RESV_UNMAPPED (1UL << 1) 512 #define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED) 513 514 /* 515 * These helpers are used to track how many pages are reserved for 516 * faults in a MAP_PRIVATE mapping. Only the process that called mmap() 517 * is guaranteed to have their future faults succeed. 518 * 519 * With the exception of reset_vma_resv_huge_pages() which is called at fork(), 520 * the reserve counters are updated with the hugetlb_lock held. It is safe 521 * to reset the VMA at fork() time as it is not in use yet and there is no 522 * chance of the global counters getting corrupted as a result of the values. 523 * 524 * The private mapping reservation is represented in a subtly different 525 * manner to a shared mapping. A shared mapping has a region map associated 526 * with the underlying file, this region map represents the backing file 527 * pages which have ever had a reservation assigned which this persists even 528 * after the page is instantiated. A private mapping has a region map 529 * associated with the original mmap which is attached to all VMAs which 530 * reference it, this region map represents those offsets which have consumed 531 * reservation ie. where pages have been instantiated. 532 */ 533 static unsigned long get_vma_private_data(struct vm_area_struct *vma) 534 { 535 return (unsigned long)vma->vm_private_data; 536 } 537 538 static void set_vma_private_data(struct vm_area_struct *vma, 539 unsigned long value) 540 { 541 vma->vm_private_data = (void *)value; 542 } 543 544 struct resv_map *resv_map_alloc(void) 545 { 546 struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL); 547 if (!resv_map) 548 return NULL; 549 550 kref_init(&resv_map->refs); 551 spin_lock_init(&resv_map->lock); 552 INIT_LIST_HEAD(&resv_map->regions); 553 554 return resv_map; 555 } 556 557 void resv_map_release(struct kref *ref) 558 { 559 struct resv_map *resv_map = container_of(ref, struct resv_map, refs); 560 561 /* Clear out any active regions before we release the map. 
*/ 562 region_truncate(resv_map, 0); 563 kfree(resv_map); 564 } 565 566 static inline struct resv_map *inode_resv_map(struct inode *inode) 567 { 568 return inode->i_mapping->private_data; 569 } 570 571 static struct resv_map *vma_resv_map(struct vm_area_struct *vma) 572 { 573 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); 574 if (vma->vm_flags & VM_MAYSHARE) { 575 struct address_space *mapping = vma->vm_file->f_mapping; 576 struct inode *inode = mapping->host; 577 578 return inode_resv_map(inode); 579 580 } else { 581 return (struct resv_map *)(get_vma_private_data(vma) & 582 ~HPAGE_RESV_MASK); 583 } 584 } 585 586 static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) 587 { 588 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); 589 VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); 590 591 set_vma_private_data(vma, (get_vma_private_data(vma) & 592 HPAGE_RESV_MASK) | (unsigned long)map); 593 } 594 595 static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) 596 { 597 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); 598 VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); 599 600 set_vma_private_data(vma, get_vma_private_data(vma) | flags); 601 } 602 603 static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) 604 { 605 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); 606 607 return (get_vma_private_data(vma) & flag) != 0; 608 } 609 610 /* Reset counters to 0 and clear all HPAGE_RESV_* flags */ 611 void reset_vma_resv_huge_pages(struct vm_area_struct *vma) 612 { 613 VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); 614 if (!(vma->vm_flags & VM_MAYSHARE)) 615 vma->vm_private_data = (void *)0; 616 } 617 618 /* Returns true if the VMA has associated reserve pages */ 619 static int vma_has_reserves(struct vm_area_struct *vma, long chg) 620 { 621 if (vma->vm_flags & VM_NORESERVE) { 622 /* 623 * This address is already reserved by other process(chg == 0), 624 * so, we should decrement reserved count. Without decrementing, 625 * reserve count remains after releasing inode, because this 626 * allocated page will go into page cache and is regarded as 627 * coming from reserved pool in releasing step. Currently, we 628 * don't have any other solution to deal with this situation 629 * properly, so add work-around here. 630 */ 631 if (vma->vm_flags & VM_MAYSHARE && chg == 0) 632 return 1; 633 else 634 return 0; 635 } 636 637 /* Shared mappings always use reserves */ 638 if (vma->vm_flags & VM_MAYSHARE) 639 return 1; 640 641 /* 642 * Only the process that called mmap() has reserves for 643 * private mappings. 644 */ 645 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) 646 return 1; 647 648 return 0; 649 } 650 651 static void enqueue_huge_page(struct hstate *h, struct page *page) 652 { 653 int nid = page_to_nid(page); 654 list_move(&page->lru, &h->hugepage_freelists[nid]); 655 h->free_huge_pages++; 656 h->free_huge_pages_node[nid]++; 657 } 658 659 static struct page *dequeue_huge_page_node(struct hstate *h, int nid) 660 { 661 struct page *page; 662 663 list_for_each_entry(page, &h->hugepage_freelists[nid], lru) 664 if (!is_migrate_isolate_page(page)) 665 break; 666 /* 667 * if 'non-isolated free hugepage' not found on the list, 668 * the allocation fails. 669 */ 670 if (&h->hugepage_freelists[nid] == &page->lru) 671 return NULL; 672 list_move(&page->lru, &h->hugepage_activelist); 673 set_page_refcounted(page); 674 h->free_huge_pages--; 675 h->free_huge_pages_node[nid]--; 676 return page; 677 } 678 679 /* Movability of hugepages depends on migration support. 
*/ 680 static inline gfp_t htlb_alloc_mask(struct hstate *h) 681 { 682 if (hugepages_treat_as_movable || hugepage_migration_supported(h)) 683 return GFP_HIGHUSER_MOVABLE; 684 else 685 return GFP_HIGHUSER; 686 } 687 688 static struct page *dequeue_huge_page_vma(struct hstate *h, 689 struct vm_area_struct *vma, 690 unsigned long address, int avoid_reserve, 691 long chg) 692 { 693 struct page *page = NULL; 694 struct mempolicy *mpol; 695 nodemask_t *nodemask; 696 struct zonelist *zonelist; 697 struct zone *zone; 698 struct zoneref *z; 699 unsigned int cpuset_mems_cookie; 700 701 /* 702 * A child process with MAP_PRIVATE mappings created by their parent 703 * have no page reserves. This check ensures that reservations are 704 * not "stolen". The child may still get SIGKILLed 705 */ 706 if (!vma_has_reserves(vma, chg) && 707 h->free_huge_pages - h->resv_huge_pages == 0) 708 goto err; 709 710 /* If reserves cannot be used, ensure enough pages are in the pool */ 711 if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) 712 goto err; 713 714 retry_cpuset: 715 cpuset_mems_cookie = read_mems_allowed_begin(); 716 zonelist = huge_zonelist(vma, address, 717 htlb_alloc_mask(h), &mpol, &nodemask); 718 719 for_each_zone_zonelist_nodemask(zone, z, zonelist, 720 MAX_NR_ZONES - 1, nodemask) { 721 if (cpuset_zone_allowed(zone, htlb_alloc_mask(h))) { 722 page = dequeue_huge_page_node(h, zone_to_nid(zone)); 723 if (page) { 724 if (avoid_reserve) 725 break; 726 if (!vma_has_reserves(vma, chg)) 727 break; 728 729 SetPagePrivate(page); 730 h->resv_huge_pages--; 731 break; 732 } 733 } 734 } 735 736 mpol_cond_put(mpol); 737 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) 738 goto retry_cpuset; 739 return page; 740 741 err: 742 return NULL; 743 } 744 745 /* 746 * common helper functions for hstate_next_node_to_{alloc|free}. 747 * We may have allocated or freed a huge page based on a different 748 * nodes_allowed previously, so h->next_node_to_{alloc|free} might 749 * be outside of *nodes_allowed. Ensure that we use an allowed 750 * node for alloc or free. 751 */ 752 static int next_node_allowed(int nid, nodemask_t *nodes_allowed) 753 { 754 nid = next_node(nid, *nodes_allowed); 755 if (nid == MAX_NUMNODES) 756 nid = first_node(*nodes_allowed); 757 VM_BUG_ON(nid >= MAX_NUMNODES); 758 759 return nid; 760 } 761 762 static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) 763 { 764 if (!node_isset(nid, *nodes_allowed)) 765 nid = next_node_allowed(nid, nodes_allowed); 766 return nid; 767 } 768 769 /* 770 * returns the previously saved node ["this node"] from which to 771 * allocate a persistent huge page for the pool and advance the 772 * next node from which to allocate, handling wrap at end of node 773 * mask. 774 */ 775 static int hstate_next_node_to_alloc(struct hstate *h, 776 nodemask_t *nodes_allowed) 777 { 778 int nid; 779 780 VM_BUG_ON(!nodes_allowed); 781 782 nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed); 783 h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed); 784 785 return nid; 786 } 787 788 /* 789 * helper for free_pool_huge_page() - return the previously saved 790 * node ["this node"] from which to free a huge page. Advance the 791 * next node id whether or not we find a free huge page to free so 792 * that the next attempt to free addresses the next node. 
793 */ 794 static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) 795 { 796 int nid; 797 798 VM_BUG_ON(!nodes_allowed); 799 800 nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed); 801 h->next_nid_to_free = next_node_allowed(nid, nodes_allowed); 802 803 return nid; 804 } 805 806 #define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \ 807 for (nr_nodes = nodes_weight(*mask); \ 808 nr_nodes > 0 && \ 809 ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \ 810 nr_nodes--) 811 812 #define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \ 813 for (nr_nodes = nodes_weight(*mask); \ 814 nr_nodes > 0 && \ 815 ((node = hstate_next_node_to_free(hs, mask)) || 1); \ 816 nr_nodes--) 817 818 #if defined(CONFIG_CMA) && defined(CONFIG_X86_64) 819 static void destroy_compound_gigantic_page(struct page *page, 820 unsigned long order) 821 { 822 int i; 823 int nr_pages = 1 << order; 824 struct page *p = page + 1; 825 826 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { 827 __ClearPageTail(p); 828 set_page_refcounted(p); 829 p->first_page = NULL; 830 } 831 832 set_compound_order(page, 0); 833 __ClearPageHead(page); 834 } 835 836 static void free_gigantic_page(struct page *page, unsigned order) 837 { 838 free_contig_range(page_to_pfn(page), 1 << order); 839 } 840 841 static int __alloc_gigantic_page(unsigned long start_pfn, 842 unsigned long nr_pages) 843 { 844 unsigned long end_pfn = start_pfn + nr_pages; 845 return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE); 846 } 847 848 static bool pfn_range_valid_gigantic(unsigned long start_pfn, 849 unsigned long nr_pages) 850 { 851 unsigned long i, end_pfn = start_pfn + nr_pages; 852 struct page *page; 853 854 for (i = start_pfn; i < end_pfn; i++) { 855 if (!pfn_valid(i)) 856 return false; 857 858 page = pfn_to_page(i); 859 860 if (PageReserved(page)) 861 return false; 862 863 if (page_count(page) > 0) 864 return false; 865 866 if (PageHuge(page)) 867 return false; 868 } 869 870 return true; 871 } 872 873 static bool zone_spans_last_pfn(const struct zone *zone, 874 unsigned long start_pfn, unsigned long nr_pages) 875 { 876 unsigned long last_pfn = start_pfn + nr_pages - 1; 877 return zone_spans_pfn(zone, last_pfn); 878 } 879 880 static struct page *alloc_gigantic_page(int nid, unsigned order) 881 { 882 unsigned long nr_pages = 1 << order; 883 unsigned long ret, pfn, flags; 884 struct zone *z; 885 886 z = NODE_DATA(nid)->node_zones; 887 for (; z - NODE_DATA(nid)->node_zones < MAX_NR_ZONES; z++) { 888 spin_lock_irqsave(&z->lock, flags); 889 890 pfn = ALIGN(z->zone_start_pfn, nr_pages); 891 while (zone_spans_last_pfn(z, pfn, nr_pages)) { 892 if (pfn_range_valid_gigantic(pfn, nr_pages)) { 893 /* 894 * We release the zone lock here because 895 * alloc_contig_range() will also lock the zone 896 * at some point. If there's an allocation 897 * spinning on this lock, it may win the race 898 * and cause alloc_contig_range() to fail... 
899 */ 900 spin_unlock_irqrestore(&z->lock, flags); 901 ret = __alloc_gigantic_page(pfn, nr_pages); 902 if (!ret) 903 return pfn_to_page(pfn); 904 spin_lock_irqsave(&z->lock, flags); 905 } 906 pfn += nr_pages; 907 } 908 909 spin_unlock_irqrestore(&z->lock, flags); 910 } 911 912 return NULL; 913 } 914 915 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid); 916 static void prep_compound_gigantic_page(struct page *page, unsigned long order); 917 918 static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid) 919 { 920 struct page *page; 921 922 page = alloc_gigantic_page(nid, huge_page_order(h)); 923 if (page) { 924 prep_compound_gigantic_page(page, huge_page_order(h)); 925 prep_new_huge_page(h, page, nid); 926 } 927 928 return page; 929 } 930 931 static int alloc_fresh_gigantic_page(struct hstate *h, 932 nodemask_t *nodes_allowed) 933 { 934 struct page *page = NULL; 935 int nr_nodes, node; 936 937 for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { 938 page = alloc_fresh_gigantic_page_node(h, node); 939 if (page) 940 return 1; 941 } 942 943 return 0; 944 } 945 946 static inline bool gigantic_page_supported(void) { return true; } 947 #else 948 static inline bool gigantic_page_supported(void) { return false; } 949 static inline void free_gigantic_page(struct page *page, unsigned order) { } 950 static inline void destroy_compound_gigantic_page(struct page *page, 951 unsigned long order) { } 952 static inline int alloc_fresh_gigantic_page(struct hstate *h, 953 nodemask_t *nodes_allowed) { return 0; } 954 #endif 955 956 static void update_and_free_page(struct hstate *h, struct page *page) 957 { 958 int i; 959 960 if (hstate_is_gigantic(h) && !gigantic_page_supported()) 961 return; 962 963 h->nr_huge_pages--; 964 h->nr_huge_pages_node[page_to_nid(page)]--; 965 for (i = 0; i < pages_per_huge_page(h); i++) { 966 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 967 1 << PG_referenced | 1 << PG_dirty | 968 1 << PG_active | 1 << PG_private | 969 1 << PG_writeback); 970 } 971 VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); 972 set_compound_page_dtor(page, NULL); 973 set_page_refcounted(page); 974 if (hstate_is_gigantic(h)) { 975 destroy_compound_gigantic_page(page, huge_page_order(h)); 976 free_gigantic_page(page, huge_page_order(h)); 977 } else { 978 __free_pages(page, huge_page_order(h)); 979 } 980 } 981 982 struct hstate *size_to_hstate(unsigned long size) 983 { 984 struct hstate *h; 985 986 for_each_hstate(h) { 987 if (huge_page_size(h) == size) 988 return h; 989 } 990 return NULL; 991 } 992 993 /* 994 * Test to determine whether the hugepage is "active/in-use" (i.e. being linked 995 * to hstate->hugepage_activelist.) 996 * 997 * This function can be called for tail pages, but never returns true for them. 998 */ 999 bool page_huge_active(struct page *page) 1000 { 1001 VM_BUG_ON_PAGE(!PageHuge(page), page); 1002 return PageHead(page) && PagePrivate(&page[1]); 1003 } 1004 1005 /* never called for tail page */ 1006 static void set_page_huge_active(struct page *page) 1007 { 1008 VM_BUG_ON_PAGE(!PageHeadHuge(page), page); 1009 SetPagePrivate(&page[1]); 1010 } 1011 1012 static void clear_page_huge_active(struct page *page) 1013 { 1014 VM_BUG_ON_PAGE(!PageHeadHuge(page), page); 1015 ClearPagePrivate(&page[1]); 1016 } 1017 1018 void free_huge_page(struct page *page) 1019 { 1020 /* 1021 * Can't pass hstate in here because it is called from the 1022 * compound page destructor. 
1023 */ 1024 struct hstate *h = page_hstate(page); 1025 int nid = page_to_nid(page); 1026 struct hugepage_subpool *spool = 1027 (struct hugepage_subpool *)page_private(page); 1028 bool restore_reserve; 1029 1030 set_page_private(page, 0); 1031 page->mapping = NULL; 1032 BUG_ON(page_count(page)); 1033 BUG_ON(page_mapcount(page)); 1034 restore_reserve = PagePrivate(page); 1035 ClearPagePrivate(page); 1036 1037 /* 1038 * A return code of zero implies that the subpool will be under its 1039 * minimum size if the reservation is not restored after page is free. 1040 * Therefore, force restore_reserve operation. 1041 */ 1042 if (hugepage_subpool_put_pages(spool, 1) == 0) 1043 restore_reserve = true; 1044 1045 spin_lock(&hugetlb_lock); 1046 clear_page_huge_active(page); 1047 hugetlb_cgroup_uncharge_page(hstate_index(h), 1048 pages_per_huge_page(h), page); 1049 if (restore_reserve) 1050 h->resv_huge_pages++; 1051 1052 if (h->surplus_huge_pages_node[nid]) { 1053 /* remove the page from active list */ 1054 list_del(&page->lru); 1055 update_and_free_page(h, page); 1056 h->surplus_huge_pages--; 1057 h->surplus_huge_pages_node[nid]--; 1058 } else { 1059 arch_clear_hugepage_flags(page); 1060 enqueue_huge_page(h, page); 1061 } 1062 spin_unlock(&hugetlb_lock); 1063 } 1064 1065 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) 1066 { 1067 INIT_LIST_HEAD(&page->lru); 1068 set_compound_page_dtor(page, free_huge_page); 1069 spin_lock(&hugetlb_lock); 1070 set_hugetlb_cgroup(page, NULL); 1071 h->nr_huge_pages++; 1072 h->nr_huge_pages_node[nid]++; 1073 spin_unlock(&hugetlb_lock); 1074 put_page(page); /* free it into the hugepage allocator */ 1075 } 1076 1077 static void prep_compound_gigantic_page(struct page *page, unsigned long order) 1078 { 1079 int i; 1080 int nr_pages = 1 << order; 1081 struct page *p = page + 1; 1082 1083 /* we rely on prep_new_huge_page to set the destructor */ 1084 set_compound_order(page, order); 1085 __SetPageHead(page); 1086 __ClearPageReserved(page); 1087 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { 1088 /* 1089 * For gigantic hugepages allocated through bootmem at 1090 * boot, it's safer to be consistent with the not-gigantic 1091 * hugepages and clear the PG_reserved bit from all tail pages 1092 * too. Otherwse drivers using get_user_pages() to access tail 1093 * pages may get the reference counting wrong if they see 1094 * PG_reserved set on a tail page (despite the head page not 1095 * having PG_reserved set). Enforcing this consistency between 1096 * head and tail pages allows drivers to optimize away a check 1097 * on the head page when they need know if put_page() is needed 1098 * after get_user_pages(). 1099 */ 1100 __ClearPageReserved(p); 1101 set_page_count(p, 0); 1102 p->first_page = page; 1103 /* Make sure p->first_page is always valid for PageTail() */ 1104 smp_wmb(); 1105 __SetPageTail(p); 1106 } 1107 } 1108 1109 /* 1110 * PageHuge() only returns true for hugetlbfs pages, but not for normal or 1111 * transparent huge pages. See the PageTransHuge() documentation for more 1112 * details. 1113 */ 1114 int PageHuge(struct page *page) 1115 { 1116 if (!PageCompound(page)) 1117 return 0; 1118 1119 page = compound_head(page); 1120 return get_compound_page_dtor(page) == free_huge_page; 1121 } 1122 EXPORT_SYMBOL_GPL(PageHuge); 1123 1124 /* 1125 * PageHeadHuge() only returns true for hugetlbfs head page, but not for 1126 * normal or transparent huge pages. 
1127 */ 1128 int PageHeadHuge(struct page *page_head) 1129 { 1130 if (!PageHead(page_head)) 1131 return 0; 1132 1133 return get_compound_page_dtor(page_head) == free_huge_page; 1134 } 1135 1136 pgoff_t __basepage_index(struct page *page) 1137 { 1138 struct page *page_head = compound_head(page); 1139 pgoff_t index = page_index(page_head); 1140 unsigned long compound_idx; 1141 1142 if (!PageHuge(page_head)) 1143 return page_index(page); 1144 1145 if (compound_order(page_head) >= MAX_ORDER) 1146 compound_idx = page_to_pfn(page) - page_to_pfn(page_head); 1147 else 1148 compound_idx = page - page_head; 1149 1150 return (index << compound_order(page_head)) + compound_idx; 1151 } 1152 1153 static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) 1154 { 1155 struct page *page; 1156 1157 page = alloc_pages_exact_node(nid, 1158 htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| 1159 __GFP_REPEAT|__GFP_NOWARN, 1160 huge_page_order(h)); 1161 if (page) { 1162 prep_new_huge_page(h, page, nid); 1163 } 1164 1165 return page; 1166 } 1167 1168 static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) 1169 { 1170 struct page *page; 1171 int nr_nodes, node; 1172 int ret = 0; 1173 1174 for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { 1175 page = alloc_fresh_huge_page_node(h, node); 1176 if (page) { 1177 ret = 1; 1178 break; 1179 } 1180 } 1181 1182 if (ret) 1183 count_vm_event(HTLB_BUDDY_PGALLOC); 1184 else 1185 count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); 1186 1187 return ret; 1188 } 1189 1190 /* 1191 * Free huge page from pool from next node to free. 1192 * Attempt to keep persistent huge pages more or less 1193 * balanced over allowed nodes. 1194 * Called with hugetlb_lock locked. 1195 */ 1196 static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, 1197 bool acct_surplus) 1198 { 1199 int nr_nodes, node; 1200 int ret = 0; 1201 1202 for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { 1203 /* 1204 * If we're returning unused surplus pages, only examine 1205 * nodes with surplus pages. 1206 */ 1207 if ((!acct_surplus || h->surplus_huge_pages_node[node]) && 1208 !list_empty(&h->hugepage_freelists[node])) { 1209 struct page *page = 1210 list_entry(h->hugepage_freelists[node].next, 1211 struct page, lru); 1212 list_del(&page->lru); 1213 h->free_huge_pages--; 1214 h->free_huge_pages_node[node]--; 1215 if (acct_surplus) { 1216 h->surplus_huge_pages--; 1217 h->surplus_huge_pages_node[node]--; 1218 } 1219 update_and_free_page(h, page); 1220 ret = 1; 1221 break; 1222 } 1223 } 1224 1225 return ret; 1226 } 1227 1228 /* 1229 * Dissolve a given free hugepage into free buddy pages. This function does 1230 * nothing for in-use (including surplus) hugepages. 1231 */ 1232 static void dissolve_free_huge_page(struct page *page) 1233 { 1234 spin_lock(&hugetlb_lock); 1235 if (PageHuge(page) && !page_count(page)) { 1236 struct hstate *h = page_hstate(page); 1237 int nid = page_to_nid(page); 1238 list_del(&page->lru); 1239 h->free_huge_pages--; 1240 h->free_huge_pages_node[nid]--; 1241 update_and_free_page(h, page); 1242 } 1243 spin_unlock(&hugetlb_lock); 1244 } 1245 1246 /* 1247 * Dissolve free hugepages in a given pfn range. Used by memory hotplug to 1248 * make specified memory blocks removable from the system. 1249 * Note that start_pfn should aligned with (minimum) hugepage size. 
1250 */ 1251 void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) 1252 { 1253 unsigned long pfn; 1254 1255 if (!hugepages_supported()) 1256 return; 1257 1258 VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << minimum_order)); 1259 for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) 1260 dissolve_free_huge_page(pfn_to_page(pfn)); 1261 } 1262 1263 static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) 1264 { 1265 struct page *page; 1266 unsigned int r_nid; 1267 1268 if (hstate_is_gigantic(h)) 1269 return NULL; 1270 1271 /* 1272 * Assume we will successfully allocate the surplus page to 1273 * prevent racing processes from causing the surplus to exceed 1274 * overcommit 1275 * 1276 * This however introduces a different race, where a process B 1277 * tries to grow the static hugepage pool while alloc_pages() is 1278 * called by process A. B will only examine the per-node 1279 * counters in determining if surplus huge pages can be 1280 * converted to normal huge pages in adjust_pool_surplus(). A 1281 * won't be able to increment the per-node counter, until the 1282 * lock is dropped by B, but B doesn't drop hugetlb_lock until 1283 * no more huge pages can be converted from surplus to normal 1284 * state (and doesn't try to convert again). Thus, we have a 1285 * case where a surplus huge page exists, the pool is grown, and 1286 * the surplus huge page still exists after, even though it 1287 * should just have been converted to a normal huge page. This 1288 * does not leak memory, though, as the hugepage will be freed 1289 * once it is out of use. It also does not allow the counters to 1290 * go out of whack in adjust_pool_surplus() as we don't modify 1291 * the node values until we've gotten the hugepage and only the 1292 * per-node value is checked there. 1293 */ 1294 spin_lock(&hugetlb_lock); 1295 if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { 1296 spin_unlock(&hugetlb_lock); 1297 return NULL; 1298 } else { 1299 h->nr_huge_pages++; 1300 h->surplus_huge_pages++; 1301 } 1302 spin_unlock(&hugetlb_lock); 1303 1304 if (nid == NUMA_NO_NODE) 1305 page = alloc_pages(htlb_alloc_mask(h)|__GFP_COMP| 1306 __GFP_REPEAT|__GFP_NOWARN, 1307 huge_page_order(h)); 1308 else 1309 page = alloc_pages_exact_node(nid, 1310 htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| 1311 __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); 1312 1313 spin_lock(&hugetlb_lock); 1314 if (page) { 1315 INIT_LIST_HEAD(&page->lru); 1316 r_nid = page_to_nid(page); 1317 set_compound_page_dtor(page, free_huge_page); 1318 set_hugetlb_cgroup(page, NULL); 1319 /* 1320 * We incremented the global counters already 1321 */ 1322 h->nr_huge_pages_node[r_nid]++; 1323 h->surplus_huge_pages_node[r_nid]++; 1324 __count_vm_event(HTLB_BUDDY_PGALLOC); 1325 } else { 1326 h->nr_huge_pages--; 1327 h->surplus_huge_pages--; 1328 __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); 1329 } 1330 spin_unlock(&hugetlb_lock); 1331 1332 return page; 1333 } 1334 1335 /* 1336 * This allocation function is useful in the context where vma is irrelevant. 1337 * E.g. soft-offlining uses this function because it only cares physical 1338 * address of error page. 
1339 */ 1340 struct page *alloc_huge_page_node(struct hstate *h, int nid) 1341 { 1342 struct page *page = NULL; 1343 1344 spin_lock(&hugetlb_lock); 1345 if (h->free_huge_pages - h->resv_huge_pages > 0) 1346 page = dequeue_huge_page_node(h, nid); 1347 spin_unlock(&hugetlb_lock); 1348 1349 if (!page) 1350 page = alloc_buddy_huge_page(h, nid); 1351 1352 return page; 1353 } 1354 1355 /* 1356 * Increase the hugetlb pool such that it can accommodate a reservation 1357 * of size 'delta'. 1358 */ 1359 static int gather_surplus_pages(struct hstate *h, int delta) 1360 { 1361 struct list_head surplus_list; 1362 struct page *page, *tmp; 1363 int ret, i; 1364 int needed, allocated; 1365 bool alloc_ok = true; 1366 1367 needed = (h->resv_huge_pages + delta) - h->free_huge_pages; 1368 if (needed <= 0) { 1369 h->resv_huge_pages += delta; 1370 return 0; 1371 } 1372 1373 allocated = 0; 1374 INIT_LIST_HEAD(&surplus_list); 1375 1376 ret = -ENOMEM; 1377 retry: 1378 spin_unlock(&hugetlb_lock); 1379 for (i = 0; i < needed; i++) { 1380 page = alloc_buddy_huge_page(h, NUMA_NO_NODE); 1381 if (!page) { 1382 alloc_ok = false; 1383 break; 1384 } 1385 list_add(&page->lru, &surplus_list); 1386 } 1387 allocated += i; 1388 1389 /* 1390 * After retaking hugetlb_lock, we need to recalculate 'needed' 1391 * because either resv_huge_pages or free_huge_pages may have changed. 1392 */ 1393 spin_lock(&hugetlb_lock); 1394 needed = (h->resv_huge_pages + delta) - 1395 (h->free_huge_pages + allocated); 1396 if (needed > 0) { 1397 if (alloc_ok) 1398 goto retry; 1399 /* 1400 * We were not able to allocate enough pages to 1401 * satisfy the entire reservation so we free what 1402 * we've allocated so far. 1403 */ 1404 goto free; 1405 } 1406 /* 1407 * The surplus_list now contains _at_least_ the number of extra pages 1408 * needed to accommodate the reservation. Add the appropriate number 1409 * of pages to the hugetlb pool and free the extras back to the buddy 1410 * allocator. Commit the entire reservation here to prevent another 1411 * process from stealing the pages as they are added to the pool but 1412 * before they are reserved. 1413 */ 1414 needed += allocated; 1415 h->resv_huge_pages += delta; 1416 ret = 0; 1417 1418 /* Free the needed pages to the hugetlb pool */ 1419 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 1420 if ((--needed) < 0) 1421 break; 1422 /* 1423 * This page is now managed by the hugetlb allocator and has 1424 * no users -- drop the buddy allocator's reference. 1425 */ 1426 put_page_testzero(page); 1427 VM_BUG_ON_PAGE(page_count(page), page); 1428 enqueue_huge_page(h, page); 1429 } 1430 free: 1431 spin_unlock(&hugetlb_lock); 1432 1433 /* Free unnecessary surplus pages to the buddy allocator */ 1434 list_for_each_entry_safe(page, tmp, &surplus_list, lru) 1435 put_page(page); 1436 spin_lock(&hugetlb_lock); 1437 1438 return ret; 1439 } 1440 1441 /* 1442 * When releasing a hugetlb pool reservation, any surplus pages that were 1443 * allocated to satisfy the reservation must be explicitly freed if they were 1444 * never used. 1445 * Called with hugetlb_lock held. 
1446 */ 1447 static void return_unused_surplus_pages(struct hstate *h, 1448 unsigned long unused_resv_pages) 1449 { 1450 unsigned long nr_pages; 1451 1452 /* Uncommit the reservation */ 1453 h->resv_huge_pages -= unused_resv_pages; 1454 1455 /* Cannot return gigantic pages currently */ 1456 if (hstate_is_gigantic(h)) 1457 return; 1458 1459 nr_pages = min(unused_resv_pages, h->surplus_huge_pages); 1460 1461 /* 1462 * We want to release as many surplus pages as possible, spread 1463 * evenly across all nodes with memory. Iterate across these nodes 1464 * until we can no longer free unreserved surplus pages. This occurs 1465 * when the nodes with surplus pages have no free pages. 1466 * free_pool_huge_page() will balance the the freed pages across the 1467 * on-line nodes with memory and will handle the hstate accounting. 1468 */ 1469 while (nr_pages--) { 1470 if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1)) 1471 break; 1472 cond_resched_lock(&hugetlb_lock); 1473 } 1474 } 1475 1476 /* 1477 * vma_needs_reservation and vma_commit_reservation are used by the huge 1478 * page allocation routines to manage reservations. 1479 * 1480 * vma_needs_reservation is called to determine if the huge page at addr 1481 * within the vma has an associated reservation. If a reservation is 1482 * needed, the value 1 is returned. The caller is then responsible for 1483 * managing the global reservation and subpool usage counts. After 1484 * the huge page has been allocated, vma_commit_reservation is called 1485 * to add the page to the reservation map. 1486 * 1487 * In the normal case, vma_commit_reservation returns the same value 1488 * as the preceding vma_needs_reservation call. The only time this 1489 * is not the case is if a reserve map was changed between calls. It 1490 * is the responsibility of the caller to notice the difference and 1491 * take appropriate action. 1492 */ 1493 static long __vma_reservation_common(struct hstate *h, 1494 struct vm_area_struct *vma, unsigned long addr, 1495 bool commit) 1496 { 1497 struct resv_map *resv; 1498 pgoff_t idx; 1499 long ret; 1500 1501 resv = vma_resv_map(vma); 1502 if (!resv) 1503 return 1; 1504 1505 idx = vma_hugecache_offset(h, vma, addr); 1506 if (commit) 1507 ret = region_add(resv, idx, idx + 1); 1508 else 1509 ret = region_chg(resv, idx, idx + 1); 1510 1511 if (vma->vm_flags & VM_MAYSHARE) 1512 return ret; 1513 else 1514 return ret < 0 ? ret : 0; 1515 } 1516 1517 static long vma_needs_reservation(struct hstate *h, 1518 struct vm_area_struct *vma, unsigned long addr) 1519 { 1520 return __vma_reservation_common(h, vma, addr, false); 1521 } 1522 1523 static long vma_commit_reservation(struct hstate *h, 1524 struct vm_area_struct *vma, unsigned long addr) 1525 { 1526 return __vma_reservation_common(h, vma, addr, true); 1527 } 1528 1529 static struct page *alloc_huge_page(struct vm_area_struct *vma, 1530 unsigned long addr, int avoid_reserve) 1531 { 1532 struct hugepage_subpool *spool = subpool_vma(vma); 1533 struct hstate *h = hstate_vma(vma); 1534 struct page *page; 1535 long chg, commit; 1536 int ret, idx; 1537 struct hugetlb_cgroup *h_cg; 1538 1539 idx = hstate_index(h); 1540 /* 1541 * Processes that did not create the mapping will have no 1542 * reserves and will not have accounted against subpool 1543 * limit. Check that the subpool limit can be made before 1544 * satisfying the allocation MAP_NORESERVE mappings may also 1545 * need pages and subpool limit allocated allocated if no reserve 1546 * mapping overlaps. 
1547 */ 1548 chg = vma_needs_reservation(h, vma, addr); 1549 if (chg < 0) 1550 return ERR_PTR(-ENOMEM); 1551 if (chg || avoid_reserve) 1552 if (hugepage_subpool_get_pages(spool, 1) < 0) 1553 return ERR_PTR(-ENOSPC); 1554 1555 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); 1556 if (ret) 1557 goto out_subpool_put; 1558 1559 spin_lock(&hugetlb_lock); 1560 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg); 1561 if (!page) { 1562 spin_unlock(&hugetlb_lock); 1563 page = alloc_buddy_huge_page(h, NUMA_NO_NODE); 1564 if (!page) 1565 goto out_uncharge_cgroup; 1566 1567 spin_lock(&hugetlb_lock); 1568 list_move(&page->lru, &h->hugepage_activelist); 1569 /* Fall through */ 1570 } 1571 hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page); 1572 spin_unlock(&hugetlb_lock); 1573 1574 set_page_private(page, (unsigned long)spool); 1575 1576 commit = vma_commit_reservation(h, vma, addr); 1577 if (unlikely(chg > commit)) { 1578 /* 1579 * The page was added to the reservation map between 1580 * vma_needs_reservation and vma_commit_reservation. 1581 * This indicates a race with hugetlb_reserve_pages. 1582 * Adjust for the subpool count incremented above AND 1583 * in hugetlb_reserve_pages for the same page. Also, 1584 * the reservation count added in hugetlb_reserve_pages 1585 * no longer applies. 1586 */ 1587 long rsv_adjust; 1588 1589 rsv_adjust = hugepage_subpool_put_pages(spool, 1); 1590 hugetlb_acct_memory(h, -rsv_adjust); 1591 } 1592 return page; 1593 1594 out_uncharge_cgroup: 1595 hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); 1596 out_subpool_put: 1597 if (chg || avoid_reserve) 1598 hugepage_subpool_put_pages(spool, 1); 1599 return ERR_PTR(-ENOSPC); 1600 } 1601 1602 /* 1603 * alloc_huge_page()'s wrapper which simply returns the page if allocation 1604 * succeeds, otherwise NULL. This function is called from new_vma_page(), 1605 * where no ERR_VALUE is expected to be returned. 1606 */ 1607 struct page *alloc_huge_page_noerr(struct vm_area_struct *vma, 1608 unsigned long addr, int avoid_reserve) 1609 { 1610 struct page *page = alloc_huge_page(vma, addr, avoid_reserve); 1611 if (IS_ERR(page)) 1612 page = NULL; 1613 return page; 1614 } 1615 1616 int __weak alloc_bootmem_huge_page(struct hstate *h) 1617 { 1618 struct huge_bootmem_page *m; 1619 int nr_nodes, node; 1620 1621 for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { 1622 void *addr; 1623 1624 addr = memblock_virt_alloc_try_nid_nopanic( 1625 huge_page_size(h), huge_page_size(h), 1626 0, BOOTMEM_ALLOC_ACCESSIBLE, node); 1627 if (addr) { 1628 /* 1629 * Use the beginning of the huge page to store the 1630 * huge_bootmem_page struct (until gather_bootmem 1631 * puts them into the mem_map). 
1632 */ 1633 m = addr; 1634 goto found; 1635 } 1636 } 1637 return 0; 1638 1639 found: 1640 BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h))); 1641 /* Put them into a private list first because mem_map is not up yet */ 1642 list_add(&m->list, &huge_boot_pages); 1643 m->hstate = h; 1644 return 1; 1645 } 1646 1647 static void __init prep_compound_huge_page(struct page *page, int order) 1648 { 1649 if (unlikely(order > (MAX_ORDER - 1))) 1650 prep_compound_gigantic_page(page, order); 1651 else 1652 prep_compound_page(page, order); 1653 } 1654 1655 /* Put bootmem huge pages into the standard lists after mem_map is up */ 1656 static void __init gather_bootmem_prealloc(void) 1657 { 1658 struct huge_bootmem_page *m; 1659 1660 list_for_each_entry(m, &huge_boot_pages, list) { 1661 struct hstate *h = m->hstate; 1662 struct page *page; 1663 1664 #ifdef CONFIG_HIGHMEM 1665 page = pfn_to_page(m->phys >> PAGE_SHIFT); 1666 memblock_free_late(__pa(m), 1667 sizeof(struct huge_bootmem_page)); 1668 #else 1669 page = virt_to_page(m); 1670 #endif 1671 WARN_ON(page_count(page) != 1); 1672 prep_compound_huge_page(page, h->order); 1673 WARN_ON(PageReserved(page)); 1674 prep_new_huge_page(h, page, page_to_nid(page)); 1675 /* 1676 * If we had gigantic hugepages allocated at boot time, we need 1677 * to restore the 'stolen' pages to totalram_pages in order to 1678 * fix confusing memory reports from free(1) and another 1679 * side-effects, like CommitLimit going negative. 1680 */ 1681 if (hstate_is_gigantic(h)) 1682 adjust_managed_page_count(page, 1 << h->order); 1683 } 1684 } 1685 1686 static void __init hugetlb_hstate_alloc_pages(struct hstate *h) 1687 { 1688 unsigned long i; 1689 1690 for (i = 0; i < h->max_huge_pages; ++i) { 1691 if (hstate_is_gigantic(h)) { 1692 if (!alloc_bootmem_huge_page(h)) 1693 break; 1694 } else if (!alloc_fresh_huge_page(h, 1695 &node_states[N_MEMORY])) 1696 break; 1697 } 1698 h->max_huge_pages = i; 1699 } 1700 1701 static void __init hugetlb_init_hstates(void) 1702 { 1703 struct hstate *h; 1704 1705 for_each_hstate(h) { 1706 if (minimum_order > huge_page_order(h)) 1707 minimum_order = huge_page_order(h); 1708 1709 /* oversize hugepages were init'ed in early boot */ 1710 if (!hstate_is_gigantic(h)) 1711 hugetlb_hstate_alloc_pages(h); 1712 } 1713 VM_BUG_ON(minimum_order == UINT_MAX); 1714 } 1715 1716 static char * __init memfmt(char *buf, unsigned long n) 1717 { 1718 if (n >= (1UL << 30)) 1719 sprintf(buf, "%lu GB", n >> 30); 1720 else if (n >= (1UL << 20)) 1721 sprintf(buf, "%lu MB", n >> 20); 1722 else 1723 sprintf(buf, "%lu KB", n >> 10); 1724 return buf; 1725 } 1726 1727 static void __init report_hugepages(void) 1728 { 1729 struct hstate *h; 1730 1731 for_each_hstate(h) { 1732 char buf[32]; 1733 pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n", 1734 memfmt(buf, huge_page_size(h)), 1735 h->free_huge_pages); 1736 } 1737 } 1738 1739 #ifdef CONFIG_HIGHMEM 1740 static void try_to_free_low(struct hstate *h, unsigned long count, 1741 nodemask_t *nodes_allowed) 1742 { 1743 int i; 1744 1745 if (hstate_is_gigantic(h)) 1746 return; 1747 1748 for_each_node_mask(i, *nodes_allowed) { 1749 struct page *page, *next; 1750 struct list_head *freel = &h->hugepage_freelists[i]; 1751 list_for_each_entry_safe(page, next, freel, lru) { 1752 if (count >= h->nr_huge_pages) 1753 return; 1754 if (PageHighMem(page)) 1755 continue; 1756 list_del(&page->lru); 1757 update_and_free_page(h, page); 1758 h->free_huge_pages--; 1759 h->free_huge_pages_node[page_to_nid(page)]--; 1760 } 1761 } 1762 } 
1763 #else 1764 static inline void try_to_free_low(struct hstate *h, unsigned long count, 1765 nodemask_t *nodes_allowed) 1766 { 1767 } 1768 #endif 1769 1770 /* 1771 * Increment or decrement surplus_huge_pages. Keep node-specific counters 1772 * balanced by operating on them in a round-robin fashion. 1773 * Returns 1 if an adjustment was made. 1774 */ 1775 static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, 1776 int delta) 1777 { 1778 int nr_nodes, node; 1779 1780 VM_BUG_ON(delta != -1 && delta != 1); 1781 1782 if (delta < 0) { 1783 for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { 1784 if (h->surplus_huge_pages_node[node]) 1785 goto found; 1786 } 1787 } else { 1788 for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { 1789 if (h->surplus_huge_pages_node[node] < 1790 h->nr_huge_pages_node[node]) 1791 goto found; 1792 } 1793 } 1794 return 0; 1795 1796 found: 1797 h->surplus_huge_pages += delta; 1798 h->surplus_huge_pages_node[node] += delta; 1799 return 1; 1800 } 1801 1802 #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) 1803 static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, 1804 nodemask_t *nodes_allowed) 1805 { 1806 unsigned long min_count, ret; 1807 1808 if (hstate_is_gigantic(h) && !gigantic_page_supported()) 1809 return h->max_huge_pages; 1810 1811 /* 1812 * Increase the pool size 1813 * First take pages out of surplus state. Then make up the 1814 * remaining difference by allocating fresh huge pages. 1815 * 1816 * We might race with alloc_buddy_huge_page() here and be unable 1817 * to convert a surplus huge page to a normal huge page. That is 1818 * not critical, though, it just means the overall size of the 1819 * pool might be one hugepage larger than it needs to be, but 1820 * within all the constraints specified by the sysctls. 1821 */ 1822 spin_lock(&hugetlb_lock); 1823 while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { 1824 if (!adjust_pool_surplus(h, nodes_allowed, -1)) 1825 break; 1826 } 1827 1828 while (count > persistent_huge_pages(h)) { 1829 /* 1830 * If this allocation races such that we no longer need the 1831 * page, free_huge_page will handle it by freeing the page 1832 * and reducing the surplus. 1833 */ 1834 spin_unlock(&hugetlb_lock); 1835 if (hstate_is_gigantic(h)) 1836 ret = alloc_fresh_gigantic_page(h, nodes_allowed); 1837 else 1838 ret = alloc_fresh_huge_page(h, nodes_allowed); 1839 spin_lock(&hugetlb_lock); 1840 if (!ret) 1841 goto out; 1842 1843 /* Bail for signals. Probably ctrl-c from user */ 1844 if (signal_pending(current)) 1845 goto out; 1846 } 1847 1848 /* 1849 * Decrease the pool size 1850 * First return free pages to the buddy allocator (being careful 1851 * to keep enough around to satisfy reservations). Then place 1852 * pages into surplus state as needed so the pool will shrink 1853 * to the desired size as pages become free. 1854 * 1855 * By placing pages into the surplus state independent of the 1856 * overcommit value, we are allowing the surplus pool size to 1857 * exceed overcommit. There are few sane options here. Since 1858 * alloc_buddy_huge_page() is checking the global counter, 1859 * though, we'll note that we're not allowed to exceed surplus 1860 * and won't grow the pool anywhere else. Not until one of the 1861 * sysctls are changed, or the surplus pages go out of use. 
1862 */ 1863 min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; 1864 min_count = max(count, min_count); 1865 try_to_free_low(h, min_count, nodes_allowed); 1866 while (min_count < persistent_huge_pages(h)) { 1867 if (!free_pool_huge_page(h, nodes_allowed, 0)) 1868 break; 1869 cond_resched_lock(&hugetlb_lock); 1870 } 1871 while (count < persistent_huge_pages(h)) { 1872 if (!adjust_pool_surplus(h, nodes_allowed, 1)) 1873 break; 1874 } 1875 out: 1876 ret = persistent_huge_pages(h); 1877 spin_unlock(&hugetlb_lock); 1878 return ret; 1879 } 1880 1881 #define HSTATE_ATTR_RO(_name) \ 1882 static struct kobj_attribute _name##_attr = __ATTR_RO(_name) 1883 1884 #define HSTATE_ATTR(_name) \ 1885 static struct kobj_attribute _name##_attr = \ 1886 __ATTR(_name, 0644, _name##_show, _name##_store) 1887 1888 static struct kobject *hugepages_kobj; 1889 static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; 1890 1891 static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp); 1892 1893 static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp) 1894 { 1895 int i; 1896 1897 for (i = 0; i < HUGE_MAX_HSTATE; i++) 1898 if (hstate_kobjs[i] == kobj) { 1899 if (nidp) 1900 *nidp = NUMA_NO_NODE; 1901 return &hstates[i]; 1902 } 1903 1904 return kobj_to_node_hstate(kobj, nidp); 1905 } 1906 1907 static ssize_t nr_hugepages_show_common(struct kobject *kobj, 1908 struct kobj_attribute *attr, char *buf) 1909 { 1910 struct hstate *h; 1911 unsigned long nr_huge_pages; 1912 int nid; 1913 1914 h = kobj_to_hstate(kobj, &nid); 1915 if (nid == NUMA_NO_NODE) 1916 nr_huge_pages = h->nr_huge_pages; 1917 else 1918 nr_huge_pages = h->nr_huge_pages_node[nid]; 1919 1920 return sprintf(buf, "%lu\n", nr_huge_pages); 1921 } 1922 1923 static ssize_t __nr_hugepages_store_common(bool obey_mempolicy, 1924 struct hstate *h, int nid, 1925 unsigned long count, size_t len) 1926 { 1927 int err; 1928 NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); 1929 1930 if (hstate_is_gigantic(h) && !gigantic_page_supported()) { 1931 err = -EINVAL; 1932 goto out; 1933 } 1934 1935 if (nid == NUMA_NO_NODE) { 1936 /* 1937 * global hstate attribute 1938 */ 1939 if (!(obey_mempolicy && 1940 init_nodemask_of_mempolicy(nodes_allowed))) { 1941 NODEMASK_FREE(nodes_allowed); 1942 nodes_allowed = &node_states[N_MEMORY]; 1943 } 1944 } else if (nodes_allowed) { 1945 /* 1946 * per node hstate attribute: adjust count to global, 1947 * but restrict alloc/free to the specified node. 
1948 */ 1949 count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; 1950 init_nodemask_of_node(nodes_allowed, nid); 1951 } else 1952 nodes_allowed = &node_states[N_MEMORY]; 1953 1954 h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed); 1955 1956 if (nodes_allowed != &node_states[N_MEMORY]) 1957 NODEMASK_FREE(nodes_allowed); 1958 1959 return len; 1960 out: 1961 NODEMASK_FREE(nodes_allowed); 1962 return err; 1963 } 1964 1965 static ssize_t nr_hugepages_store_common(bool obey_mempolicy, 1966 struct kobject *kobj, const char *buf, 1967 size_t len) 1968 { 1969 struct hstate *h; 1970 unsigned long count; 1971 int nid; 1972 int err; 1973 1974 err = kstrtoul(buf, 10, &count); 1975 if (err) 1976 return err; 1977 1978 h = kobj_to_hstate(kobj, &nid); 1979 return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len); 1980 } 1981 1982 static ssize_t nr_hugepages_show(struct kobject *kobj, 1983 struct kobj_attribute *attr, char *buf) 1984 { 1985 return nr_hugepages_show_common(kobj, attr, buf); 1986 } 1987 1988 static ssize_t nr_hugepages_store(struct kobject *kobj, 1989 struct kobj_attribute *attr, const char *buf, size_t len) 1990 { 1991 return nr_hugepages_store_common(false, kobj, buf, len); 1992 } 1993 HSTATE_ATTR(nr_hugepages); 1994 1995 #ifdef CONFIG_NUMA 1996 1997 /* 1998 * hstate attribute for optionally mempolicy-based constraint on persistent 1999 * huge page alloc/free. 2000 */ 2001 static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj, 2002 struct kobj_attribute *attr, char *buf) 2003 { 2004 return nr_hugepages_show_common(kobj, attr, buf); 2005 } 2006 2007 static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj, 2008 struct kobj_attribute *attr, const char *buf, size_t len) 2009 { 2010 return nr_hugepages_store_common(true, kobj, buf, len); 2011 } 2012 HSTATE_ATTR(nr_hugepages_mempolicy); 2013 #endif 2014 2015 2016 static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, 2017 struct kobj_attribute *attr, char *buf) 2018 { 2019 struct hstate *h = kobj_to_hstate(kobj, NULL); 2020 return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); 2021 } 2022 2023 static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, 2024 struct kobj_attribute *attr, const char *buf, size_t count) 2025 { 2026 int err; 2027 unsigned long input; 2028 struct hstate *h = kobj_to_hstate(kobj, NULL); 2029 2030 if (hstate_is_gigantic(h)) 2031 return -EINVAL; 2032 2033 err = kstrtoul(buf, 10, &input); 2034 if (err) 2035 return err; 2036 2037 spin_lock(&hugetlb_lock); 2038 h->nr_overcommit_huge_pages = input; 2039 spin_unlock(&hugetlb_lock); 2040 2041 return count; 2042 } 2043 HSTATE_ATTR(nr_overcommit_hugepages); 2044 2045 static ssize_t free_hugepages_show(struct kobject *kobj, 2046 struct kobj_attribute *attr, char *buf) 2047 { 2048 struct hstate *h; 2049 unsigned long free_huge_pages; 2050 int nid; 2051 2052 h = kobj_to_hstate(kobj, &nid); 2053 if (nid == NUMA_NO_NODE) 2054 free_huge_pages = h->free_huge_pages; 2055 else 2056 free_huge_pages = h->free_huge_pages_node[nid]; 2057 2058 return sprintf(buf, "%lu\n", free_huge_pages); 2059 } 2060 HSTATE_ATTR_RO(free_hugepages); 2061 2062 static ssize_t resv_hugepages_show(struct kobject *kobj, 2063 struct kobj_attribute *attr, char *buf) 2064 { 2065 struct hstate *h = kobj_to_hstate(kobj, NULL); 2066 return sprintf(buf, "%lu\n", h->resv_huge_pages); 2067 } 2068 HSTATE_ATTR_RO(resv_hugepages); 2069 2070 static ssize_t surplus_hugepages_show(struct kobject *kobj, 2071 struct kobj_attribute *attr, char *buf) 2072 { 
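	/*
	 * Report either the global surplus count or the per-node one,
	 * depending on whether this attribute hangs off
	 * /sys/kernel/mm/hugepages or off a node device kobject.
	 */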
2073 struct hstate *h; 2074 unsigned long surplus_huge_pages; 2075 int nid; 2076 2077 h = kobj_to_hstate(kobj, &nid); 2078 if (nid == NUMA_NO_NODE) 2079 surplus_huge_pages = h->surplus_huge_pages; 2080 else 2081 surplus_huge_pages = h->surplus_huge_pages_node[nid]; 2082 2083 return sprintf(buf, "%lu\n", surplus_huge_pages); 2084 } 2085 HSTATE_ATTR_RO(surplus_hugepages); 2086 2087 static struct attribute *hstate_attrs[] = { 2088 &nr_hugepages_attr.attr, 2089 &nr_overcommit_hugepages_attr.attr, 2090 &free_hugepages_attr.attr, 2091 &resv_hugepages_attr.attr, 2092 &surplus_hugepages_attr.attr, 2093 #ifdef CONFIG_NUMA 2094 &nr_hugepages_mempolicy_attr.attr, 2095 #endif 2096 NULL, 2097 }; 2098 2099 static struct attribute_group hstate_attr_group = { 2100 .attrs = hstate_attrs, 2101 }; 2102 2103 static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, 2104 struct kobject **hstate_kobjs, 2105 struct attribute_group *hstate_attr_group) 2106 { 2107 int retval; 2108 int hi = hstate_index(h); 2109 2110 hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); 2111 if (!hstate_kobjs[hi]) 2112 return -ENOMEM; 2113 2114 retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group); 2115 if (retval) 2116 kobject_put(hstate_kobjs[hi]); 2117 2118 return retval; 2119 } 2120 2121 static void __init hugetlb_sysfs_init(void) 2122 { 2123 struct hstate *h; 2124 int err; 2125 2126 hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj); 2127 if (!hugepages_kobj) 2128 return; 2129 2130 for_each_hstate(h) { 2131 err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, 2132 hstate_kobjs, &hstate_attr_group); 2133 if (err) 2134 pr_err("Hugetlb: Unable to add hstate %s", h->name); 2135 } 2136 } 2137 2138 #ifdef CONFIG_NUMA 2139 2140 /* 2141 * node_hstate/s - associate per node hstate attributes, via their kobjects, 2142 * with node devices in node_devices[] using a parallel array. The array 2143 * index of a node device or _hstate == node id. 2144 * This is here to avoid any static dependency of the node device driver, in 2145 * the base kernel, on the hugetlb module. 2146 */ 2147 struct node_hstate { 2148 struct kobject *hugepages_kobj; 2149 struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; 2150 }; 2151 struct node_hstate node_hstates[MAX_NUMNODES]; 2152 2153 /* 2154 * A subset of global hstate attributes for node devices 2155 */ 2156 static struct attribute *per_node_hstate_attrs[] = { 2157 &nr_hugepages_attr.attr, 2158 &free_hugepages_attr.attr, 2159 &surplus_hugepages_attr.attr, 2160 NULL, 2161 }; 2162 2163 static struct attribute_group per_node_hstate_attr_group = { 2164 .attrs = per_node_hstate_attrs, 2165 }; 2166 2167 /* 2168 * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj. 2169 * Returns node id via non-NULL nidp. 2170 */ 2171 static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) 2172 { 2173 int nid; 2174 2175 for (nid = 0; nid < nr_node_ids; nid++) { 2176 struct node_hstate *nhs = &node_hstates[nid]; 2177 int i; 2178 for (i = 0; i < HUGE_MAX_HSTATE; i++) 2179 if (nhs->hstate_kobjs[i] == kobj) { 2180 if (nidp) 2181 *nidp = nid; 2182 return &hstates[i]; 2183 } 2184 } 2185 2186 BUG(); 2187 return NULL; 2188 } 2189 2190 /* 2191 * Unregister hstate attributes from a single node device. 2192 * No-op if no hstate attributes attached. 
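 * Dropping the final kobject references also removes the
 * hugepages/hugepages-<size>kB directories from the node's sysfs tree
 * (typically /sys/devices/system/node/node<N>/hugepages/).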
2193 */ 2194 static void hugetlb_unregister_node(struct node *node) 2195 { 2196 struct hstate *h; 2197 struct node_hstate *nhs = &node_hstates[node->dev.id]; 2198 2199 if (!nhs->hugepages_kobj) 2200 return; /* no hstate attributes */ 2201 2202 for_each_hstate(h) { 2203 int idx = hstate_index(h); 2204 if (nhs->hstate_kobjs[idx]) { 2205 kobject_put(nhs->hstate_kobjs[idx]); 2206 nhs->hstate_kobjs[idx] = NULL; 2207 } 2208 } 2209 2210 kobject_put(nhs->hugepages_kobj); 2211 nhs->hugepages_kobj = NULL; 2212 } 2213 2214 /* 2215 * hugetlb module exit: unregister hstate attributes from node devices 2216 * that have them. 2217 */ 2218 static void hugetlb_unregister_all_nodes(void) 2219 { 2220 int nid; 2221 2222 /* 2223 * disable node device registrations. 2224 */ 2225 register_hugetlbfs_with_node(NULL, NULL); 2226 2227 /* 2228 * remove hstate attributes from any nodes that have them. 2229 */ 2230 for (nid = 0; nid < nr_node_ids; nid++) 2231 hugetlb_unregister_node(node_devices[nid]); 2232 } 2233 2234 /* 2235 * Register hstate attributes for a single node device. 2236 * No-op if attributes already registered. 2237 */ 2238 static void hugetlb_register_node(struct node *node) 2239 { 2240 struct hstate *h; 2241 struct node_hstate *nhs = &node_hstates[node->dev.id]; 2242 int err; 2243 2244 if (nhs->hugepages_kobj) 2245 return; /* already allocated */ 2246 2247 nhs->hugepages_kobj = kobject_create_and_add("hugepages", 2248 &node->dev.kobj); 2249 if (!nhs->hugepages_kobj) 2250 return; 2251 2252 for_each_hstate(h) { 2253 err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj, 2254 nhs->hstate_kobjs, 2255 &per_node_hstate_attr_group); 2256 if (err) { 2257 pr_err("Hugetlb: Unable to add hstate %s for node %d\n", 2258 h->name, node->dev.id); 2259 hugetlb_unregister_node(node); 2260 break; 2261 } 2262 } 2263 } 2264 2265 /* 2266 * hugetlb init time: register hstate attributes for all registered node 2267 * devices of nodes that have memory. All on-line nodes should have 2268 * registered their associated device by this time. 2269 */ 2270 static void __init hugetlb_register_all_nodes(void) 2271 { 2272 int nid; 2273 2274 for_each_node_state(nid, N_MEMORY) { 2275 struct node *node = node_devices[nid]; 2276 if (node->dev.id == nid) 2277 hugetlb_register_node(node); 2278 } 2279 2280 /* 2281 * Let the node device driver know we're here so it can 2282 * [un]register hstate attributes on node hotplug. 
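	 * The node driver remembers the two callbacks and invokes them
	 * from its own node registration/unregistration paths, so nodes
	 * hot-added after boot get their hugepage attributes as well.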
2283 */ 2284 register_hugetlbfs_with_node(hugetlb_register_node, 2285 hugetlb_unregister_node); 2286 } 2287 #else /* !CONFIG_NUMA */ 2288 2289 static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) 2290 { 2291 BUG(); 2292 if (nidp) 2293 *nidp = -1; 2294 return NULL; 2295 } 2296 2297 static void hugetlb_unregister_all_nodes(void) { } 2298 2299 static void hugetlb_register_all_nodes(void) { } 2300 2301 #endif 2302 2303 static void __exit hugetlb_exit(void) 2304 { 2305 struct hstate *h; 2306 2307 hugetlb_unregister_all_nodes(); 2308 2309 for_each_hstate(h) { 2310 kobject_put(hstate_kobjs[hstate_index(h)]); 2311 } 2312 2313 kobject_put(hugepages_kobj); 2314 kfree(htlb_fault_mutex_table); 2315 } 2316 module_exit(hugetlb_exit); 2317 2318 static int __init hugetlb_init(void) 2319 { 2320 int i; 2321 2322 if (!hugepages_supported()) 2323 return 0; 2324 2325 if (!size_to_hstate(default_hstate_size)) { 2326 default_hstate_size = HPAGE_SIZE; 2327 if (!size_to_hstate(default_hstate_size)) 2328 hugetlb_add_hstate(HUGETLB_PAGE_ORDER); 2329 } 2330 default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size)); 2331 if (default_hstate_max_huge_pages) 2332 default_hstate.max_huge_pages = default_hstate_max_huge_pages; 2333 2334 hugetlb_init_hstates(); 2335 gather_bootmem_prealloc(); 2336 report_hugepages(); 2337 2338 hugetlb_sysfs_init(); 2339 hugetlb_register_all_nodes(); 2340 hugetlb_cgroup_file_init(); 2341 2342 #ifdef CONFIG_SMP 2343 num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus()); 2344 #else 2345 num_fault_mutexes = 1; 2346 #endif 2347 htlb_fault_mutex_table = 2348 kmalloc(sizeof(struct mutex) * num_fault_mutexes, GFP_KERNEL); 2349 BUG_ON(!htlb_fault_mutex_table); 2350 2351 for (i = 0; i < num_fault_mutexes; i++) 2352 mutex_init(&htlb_fault_mutex_table[i]); 2353 return 0; 2354 } 2355 module_init(hugetlb_init); 2356 2357 /* Should be called on processing a hugepagesz=... option */ 2358 void __init hugetlb_add_hstate(unsigned order) 2359 { 2360 struct hstate *h; 2361 unsigned long i; 2362 2363 if (size_to_hstate(PAGE_SIZE << order)) { 2364 pr_warning("hugepagesz= specified twice, ignoring\n"); 2365 return; 2366 } 2367 BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); 2368 BUG_ON(order == 0); 2369 h = &hstates[hugetlb_max_hstate++]; 2370 h->order = order; 2371 h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); 2372 h->nr_huge_pages = 0; 2373 h->free_huge_pages = 0; 2374 for (i = 0; i < MAX_NUMNODES; ++i) 2375 INIT_LIST_HEAD(&h->hugepage_freelists[i]); 2376 INIT_LIST_HEAD(&h->hugepage_activelist); 2377 h->next_nid_to_alloc = first_node(node_states[N_MEMORY]); 2378 h->next_nid_to_free = first_node(node_states[N_MEMORY]); 2379 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", 2380 huge_page_size(h)/1024); 2381 2382 parsed_hstate = h; 2383 } 2384 2385 static int __init hugetlb_nrpages_setup(char *s) 2386 { 2387 unsigned long *mhp; 2388 static unsigned long *last_mhp; 2389 2390 /* 2391 * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet, 2392 * so this hugepages= parameter goes to the "default hstate". 2393 */ 2394 if (!hugetlb_max_hstate) 2395 mhp = &default_hstate_max_huge_pages; 2396 else 2397 mhp = &parsed_hstate->max_huge_pages; 2398 2399 if (mhp == last_mhp) { 2400 pr_warning("hugepages= specified twice without " 2401 "interleaving hugepagesz=, ignoring\n"); 2402 return 1; 2403 } 2404 2405 if (sscanf(s, "%lu", mhp) <= 0) 2406 *mhp = 0; 2407 2408 /* 2409 * Global state is always initialized later in hugetlb_init. 
2410 * But we need to allocate >= MAX_ORDER hstates here early to still 2411 * use the bootmem allocator. 2412 */ 2413 if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER) 2414 hugetlb_hstate_alloc_pages(parsed_hstate); 2415 2416 last_mhp = mhp; 2417 2418 return 1; 2419 } 2420 __setup("hugepages=", hugetlb_nrpages_setup); 2421 2422 static int __init hugetlb_default_setup(char *s) 2423 { 2424 default_hstate_size = memparse(s, &s); 2425 return 1; 2426 } 2427 __setup("default_hugepagesz=", hugetlb_default_setup); 2428 2429 static unsigned int cpuset_mems_nr(unsigned int *array) 2430 { 2431 int node; 2432 unsigned int nr = 0; 2433 2434 for_each_node_mask(node, cpuset_current_mems_allowed) 2435 nr += array[node]; 2436 2437 return nr; 2438 } 2439 2440 #ifdef CONFIG_SYSCTL 2441 static int hugetlb_sysctl_handler_common(bool obey_mempolicy, 2442 struct ctl_table *table, int write, 2443 void __user *buffer, size_t *length, loff_t *ppos) 2444 { 2445 struct hstate *h = &default_hstate; 2446 unsigned long tmp = h->max_huge_pages; 2447 int ret; 2448 2449 if (!hugepages_supported()) 2450 return -ENOTSUPP; 2451 2452 table->data = &tmp; 2453 table->maxlen = sizeof(unsigned long); 2454 ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); 2455 if (ret) 2456 goto out; 2457 2458 if (write) 2459 ret = __nr_hugepages_store_common(obey_mempolicy, h, 2460 NUMA_NO_NODE, tmp, *length); 2461 out: 2462 return ret; 2463 } 2464 2465 int hugetlb_sysctl_handler(struct ctl_table *table, int write, 2466 void __user *buffer, size_t *length, loff_t *ppos) 2467 { 2468 2469 return hugetlb_sysctl_handler_common(false, table, write, 2470 buffer, length, ppos); 2471 } 2472 2473 #ifdef CONFIG_NUMA 2474 int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, 2475 void __user *buffer, size_t *length, loff_t *ppos) 2476 { 2477 return hugetlb_sysctl_handler_common(true, table, write, 2478 buffer, length, ppos); 2479 } 2480 #endif /* CONFIG_NUMA */ 2481 2482 int hugetlb_overcommit_handler(struct ctl_table *table, int write, 2483 void __user *buffer, 2484 size_t *length, loff_t *ppos) 2485 { 2486 struct hstate *h = &default_hstate; 2487 unsigned long tmp; 2488 int ret; 2489 2490 if (!hugepages_supported()) 2491 return -ENOTSUPP; 2492 2493 tmp = h->nr_overcommit_huge_pages; 2494 2495 if (write && hstate_is_gigantic(h)) 2496 return -EINVAL; 2497 2498 table->data = &tmp; 2499 table->maxlen = sizeof(unsigned long); 2500 ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); 2501 if (ret) 2502 goto out; 2503 2504 if (write) { 2505 spin_lock(&hugetlb_lock); 2506 h->nr_overcommit_huge_pages = tmp; 2507 spin_unlock(&hugetlb_lock); 2508 } 2509 out: 2510 return ret; 2511 } 2512 2513 #endif /* CONFIG_SYSCTL */ 2514 2515 void hugetlb_report_meminfo(struct seq_file *m) 2516 { 2517 struct hstate *h = &default_hstate; 2518 if (!hugepages_supported()) 2519 return; 2520 seq_printf(m, 2521 "HugePages_Total: %5lu\n" 2522 "HugePages_Free: %5lu\n" 2523 "HugePages_Rsvd: %5lu\n" 2524 "HugePages_Surp: %5lu\n" 2525 "Hugepagesize: %8lu kB\n", 2526 h->nr_huge_pages, 2527 h->free_huge_pages, 2528 h->resv_huge_pages, 2529 h->surplus_huge_pages, 2530 1UL << (huge_page_order(h) + PAGE_SHIFT - 10)); 2531 } 2532 2533 int hugetlb_report_node_meminfo(int nid, char *buf) 2534 { 2535 struct hstate *h = &default_hstate; 2536 if (!hugepages_supported()) 2537 return 0; 2538 return sprintf(buf, 2539 "Node %d HugePages_Total: %5u\n" 2540 "Node %d HugePages_Free: %5u\n" 2541 "Node %d HugePages_Surp: %5u\n", 2542 nid, 
h->nr_huge_pages_node[nid], 2543 nid, h->free_huge_pages_node[nid], 2544 nid, h->surplus_huge_pages_node[nid]); 2545 } 2546 2547 void hugetlb_show_meminfo(void) 2548 { 2549 struct hstate *h; 2550 int nid; 2551 2552 if (!hugepages_supported()) 2553 return; 2554 2555 for_each_node_state(nid, N_MEMORY) 2556 for_each_hstate(h) 2557 pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n", 2558 nid, 2559 h->nr_huge_pages_node[nid], 2560 h->free_huge_pages_node[nid], 2561 h->surplus_huge_pages_node[nid], 2562 1UL << (huge_page_order(h) + PAGE_SHIFT - 10)); 2563 } 2564 2565 /* Return the number pages of memory we physically have, in PAGE_SIZE units. */ 2566 unsigned long hugetlb_total_pages(void) 2567 { 2568 struct hstate *h; 2569 unsigned long nr_total_pages = 0; 2570 2571 for_each_hstate(h) 2572 nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h); 2573 return nr_total_pages; 2574 } 2575 2576 static int hugetlb_acct_memory(struct hstate *h, long delta) 2577 { 2578 int ret = -ENOMEM; 2579 2580 spin_lock(&hugetlb_lock); 2581 /* 2582 * When cpuset is configured, it breaks the strict hugetlb page 2583 * reservation as the accounting is done on a global variable. Such 2584 * reservation is completely rubbish in the presence of cpuset because 2585 * the reservation is not checked against page availability for the 2586 * current cpuset. Application can still potentially OOM'ed by kernel 2587 * with lack of free htlb page in cpuset that the task is in. 2588 * Attempt to enforce strict accounting with cpuset is almost 2589 * impossible (or too ugly) because cpuset is too fluid that 2590 * task or memory node can be dynamically moved between cpusets. 2591 * 2592 * The change of semantics for shared hugetlb mapping with cpuset is 2593 * undesirable. However, in order to preserve some of the semantics, 2594 * we fall back to check against current free page availability as 2595 * a best attempt and hopefully to minimize the impact of changing 2596 * semantics that cpuset has. 2597 */ 2598 if (delta > 0) { 2599 if (gather_surplus_pages(h, delta) < 0) 2600 goto out; 2601 2602 if (delta > cpuset_mems_nr(h->free_huge_pages_node)) { 2603 return_unused_surplus_pages(h, delta); 2604 goto out; 2605 } 2606 } 2607 2608 ret = 0; 2609 if (delta < 0) 2610 return_unused_surplus_pages(h, (unsigned long) -delta); 2611 2612 out: 2613 spin_unlock(&hugetlb_lock); 2614 return ret; 2615 } 2616 2617 static void hugetlb_vm_op_open(struct vm_area_struct *vma) 2618 { 2619 struct resv_map *resv = vma_resv_map(vma); 2620 2621 /* 2622 * This new VMA should share its siblings reservation map if present. 2623 * The VMA will only ever have a valid reservation map pointer where 2624 * it is being copied for another still existing VMA. As that VMA 2625 * has a reference to the reservation map it cannot disappear until 2626 * after this open call completes. It is therefore safe to take a 2627 * new reference here without additional locking. 
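	 * The reference taken here is dropped by the matching kref_put()
	 * in hugetlb_vm_op_close() when this copy of the VMA goes away.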
2628 */ 2629 if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) 2630 kref_get(&resv->refs); 2631 } 2632 2633 static void hugetlb_vm_op_close(struct vm_area_struct *vma) 2634 { 2635 struct hstate *h = hstate_vma(vma); 2636 struct resv_map *resv = vma_resv_map(vma); 2637 struct hugepage_subpool *spool = subpool_vma(vma); 2638 unsigned long reserve, start, end; 2639 long gbl_reserve; 2640 2641 if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) 2642 return; 2643 2644 start = vma_hugecache_offset(h, vma, vma->vm_start); 2645 end = vma_hugecache_offset(h, vma, vma->vm_end); 2646 2647 reserve = (end - start) - region_count(resv, start, end); 2648 2649 kref_put(&resv->refs, resv_map_release); 2650 2651 if (reserve) { 2652 /* 2653 * Decrement reserve counts. The global reserve count may be 2654 * adjusted if the subpool has a minimum size. 2655 */ 2656 gbl_reserve = hugepage_subpool_put_pages(spool, reserve); 2657 hugetlb_acct_memory(h, -gbl_reserve); 2658 } 2659 } 2660 2661 /* 2662 * We cannot handle pagefaults against hugetlb pages at all. They cause 2663 * handle_mm_fault() to try to instantiate regular-sized pages in the 2664 * hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get 2665 * this far. 2666 */ 2667 static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 2668 { 2669 BUG(); 2670 return 0; 2671 } 2672 2673 const struct vm_operations_struct hugetlb_vm_ops = { 2674 .fault = hugetlb_vm_op_fault, 2675 .open = hugetlb_vm_op_open, 2676 .close = hugetlb_vm_op_close, 2677 }; 2678 2679 static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, 2680 int writable) 2681 { 2682 pte_t entry; 2683 2684 if (writable) { 2685 entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page, 2686 vma->vm_page_prot))); 2687 } else { 2688 entry = huge_pte_wrprotect(mk_huge_pte(page, 2689 vma->vm_page_prot)); 2690 } 2691 entry = pte_mkyoung(entry); 2692 entry = pte_mkhuge(entry); 2693 entry = arch_make_huge_pte(entry, vma, page, writable); 2694 2695 return entry; 2696 } 2697 2698 static void set_huge_ptep_writable(struct vm_area_struct *vma, 2699 unsigned long address, pte_t *ptep) 2700 { 2701 pte_t entry; 2702 2703 entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep))); 2704 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) 2705 update_mmu_cache(vma, address, ptep); 2706 } 2707 2708 static int is_hugetlb_entry_migration(pte_t pte) 2709 { 2710 swp_entry_t swp; 2711 2712 if (huge_pte_none(pte) || pte_present(pte)) 2713 return 0; 2714 swp = pte_to_swp_entry(pte); 2715 if (non_swap_entry(swp) && is_migration_entry(swp)) 2716 return 1; 2717 else 2718 return 0; 2719 } 2720 2721 static int is_hugetlb_entry_hwpoisoned(pte_t pte) 2722 { 2723 swp_entry_t swp; 2724 2725 if (huge_pte_none(pte) || pte_present(pte)) 2726 return 0; 2727 swp = pte_to_swp_entry(pte); 2728 if (non_swap_entry(swp) && is_hwpoison_entry(swp)) 2729 return 1; 2730 else 2731 return 0; 2732 } 2733 2734 int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, 2735 struct vm_area_struct *vma) 2736 { 2737 pte_t *src_pte, *dst_pte, entry; 2738 struct page *ptepage; 2739 unsigned long addr; 2740 int cow; 2741 struct hstate *h = hstate_vma(vma); 2742 unsigned long sz = huge_page_size(h); 2743 unsigned long mmun_start; /* For mmu_notifiers */ 2744 unsigned long mmun_end; /* For mmu_notifiers */ 2745 int ret = 0; 2746 2747 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 2748 2749 mmun_start = vma->vm_start; 2750 mmun_end = vma->vm_end; 2751 if 
(cow) 2752 mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end); 2753 2754 for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { 2755 spinlock_t *src_ptl, *dst_ptl; 2756 src_pte = huge_pte_offset(src, addr); 2757 if (!src_pte) 2758 continue; 2759 dst_pte = huge_pte_alloc(dst, addr, sz); 2760 if (!dst_pte) { 2761 ret = -ENOMEM; 2762 break; 2763 } 2764 2765 /* If the pagetables are shared don't copy or take references */ 2766 if (dst_pte == src_pte) 2767 continue; 2768 2769 dst_ptl = huge_pte_lock(h, dst, dst_pte); 2770 src_ptl = huge_pte_lockptr(h, src, src_pte); 2771 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 2772 entry = huge_ptep_get(src_pte); 2773 if (huge_pte_none(entry)) { /* skip none entry */ 2774 ; 2775 } else if (unlikely(is_hugetlb_entry_migration(entry) || 2776 is_hugetlb_entry_hwpoisoned(entry))) { 2777 swp_entry_t swp_entry = pte_to_swp_entry(entry); 2778 2779 if (is_write_migration_entry(swp_entry) && cow) { 2780 /* 2781 * COW mappings require pages in both 2782 * parent and child to be set to read. 2783 */ 2784 make_migration_entry_read(&swp_entry); 2785 entry = swp_entry_to_pte(swp_entry); 2786 set_huge_pte_at(src, addr, src_pte, entry); 2787 } 2788 set_huge_pte_at(dst, addr, dst_pte, entry); 2789 } else { 2790 if (cow) { 2791 huge_ptep_set_wrprotect(src, addr, src_pte); 2792 mmu_notifier_invalidate_range(src, mmun_start, 2793 mmun_end); 2794 } 2795 entry = huge_ptep_get(src_pte); 2796 ptepage = pte_page(entry); 2797 get_page(ptepage); 2798 page_dup_rmap(ptepage); 2799 set_huge_pte_at(dst, addr, dst_pte, entry); 2800 } 2801 spin_unlock(src_ptl); 2802 spin_unlock(dst_ptl); 2803 } 2804 2805 if (cow) 2806 mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end); 2807 2808 return ret; 2809 } 2810 2811 void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, 2812 unsigned long start, unsigned long end, 2813 struct page *ref_page) 2814 { 2815 int force_flush = 0; 2816 struct mm_struct *mm = vma->vm_mm; 2817 unsigned long address; 2818 pte_t *ptep; 2819 pte_t pte; 2820 spinlock_t *ptl; 2821 struct page *page; 2822 struct hstate *h = hstate_vma(vma); 2823 unsigned long sz = huge_page_size(h); 2824 const unsigned long mmun_start = start; /* For mmu_notifiers */ 2825 const unsigned long mmun_end = end; /* For mmu_notifiers */ 2826 2827 WARN_ON(!is_vm_hugetlb_page(vma)); 2828 BUG_ON(start & ~huge_page_mask(h)); 2829 BUG_ON(end & ~huge_page_mask(h)); 2830 2831 tlb_start_vma(tlb, vma); 2832 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 2833 address = start; 2834 again: 2835 for (; address < end; address += sz) { 2836 ptep = huge_pte_offset(mm, address); 2837 if (!ptep) 2838 continue; 2839 2840 ptl = huge_pte_lock(h, mm, ptep); 2841 if (huge_pmd_unshare(mm, &address, ptep)) 2842 goto unlock; 2843 2844 pte = huge_ptep_get(ptep); 2845 if (huge_pte_none(pte)) 2846 goto unlock; 2847 2848 /* 2849 * Migrating hugepage or HWPoisoned hugepage is already 2850 * unmapped and its refcount is dropped, so just clear pte here. 2851 */ 2852 if (unlikely(!pte_present(pte))) { 2853 huge_pte_clear(mm, address, ptep); 2854 goto unlock; 2855 } 2856 2857 page = pte_page(pte); 2858 /* 2859 * If a reference page is supplied, it is because a specific 2860 * page is being unmapped, not a range. Ensure the page we 2861 * are about to unmap is the actual page of interest. 
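		 * (A ref_page is passed in by unmap_ref_private() when a
		 * failed COW forces the owner of a MAP_PRIVATE reservation
		 * to strip this page from the other mappings.)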
2862 */ 2863 if (ref_page) { 2864 if (page != ref_page) 2865 goto unlock; 2866 2867 /* 2868 * Mark the VMA as having unmapped its page so that 2869 * future faults in this VMA will fail rather than 2870 * looking like data was lost 2871 */ 2872 set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED); 2873 } 2874 2875 pte = huge_ptep_get_and_clear(mm, address, ptep); 2876 tlb_remove_tlb_entry(tlb, ptep, address); 2877 if (huge_pte_dirty(pte)) 2878 set_page_dirty(page); 2879 2880 page_remove_rmap(page); 2881 force_flush = !__tlb_remove_page(tlb, page); 2882 if (force_flush) { 2883 address += sz; 2884 spin_unlock(ptl); 2885 break; 2886 } 2887 /* Bail out after unmapping reference page if supplied */ 2888 if (ref_page) { 2889 spin_unlock(ptl); 2890 break; 2891 } 2892 unlock: 2893 spin_unlock(ptl); 2894 } 2895 /* 2896 * mmu_gather ran out of room to batch pages, we break out of 2897 * the PTE lock to avoid doing the potential expensive TLB invalidate 2898 * and page-free while holding it. 2899 */ 2900 if (force_flush) { 2901 force_flush = 0; 2902 tlb_flush_mmu(tlb); 2903 if (address < end && !ref_page) 2904 goto again; 2905 } 2906 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2907 tlb_end_vma(tlb, vma); 2908 } 2909 2910 void __unmap_hugepage_range_final(struct mmu_gather *tlb, 2911 struct vm_area_struct *vma, unsigned long start, 2912 unsigned long end, struct page *ref_page) 2913 { 2914 __unmap_hugepage_range(tlb, vma, start, end, ref_page); 2915 2916 /* 2917 * Clear this flag so that x86's huge_pmd_share page_table_shareable 2918 * test will fail on a vma being torn down, and not grab a page table 2919 * on its way out. We're lucky that the flag has such an appropriate 2920 * name, and can in fact be safely cleared here. We could clear it 2921 * before the __unmap_hugepage_range above, but all that's necessary 2922 * is to clear it before releasing the i_mmap_rwsem. This works 2923 * because in the context this is called, the VMA is about to be 2924 * destroyed and the i_mmap_rwsem is held. 2925 */ 2926 vma->vm_flags &= ~VM_MAYSHARE; 2927 } 2928 2929 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 2930 unsigned long end, struct page *ref_page) 2931 { 2932 struct mm_struct *mm; 2933 struct mmu_gather tlb; 2934 2935 mm = vma->vm_mm; 2936 2937 tlb_gather_mmu(&tlb, mm, start, end); 2938 __unmap_hugepage_range(&tlb, vma, start, end, ref_page); 2939 tlb_finish_mmu(&tlb, start, end); 2940 } 2941 2942 /* 2943 * This is called when the original mapper is failing to COW a MAP_PRIVATE 2944 * mappping it owns the reserve page for. The intention is to unmap the page 2945 * from other VMAs and let the children be SIGKILLed if they are faulting the 2946 * same region. 2947 */ 2948 static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, 2949 struct page *page, unsigned long address) 2950 { 2951 struct hstate *h = hstate_vma(vma); 2952 struct vm_area_struct *iter_vma; 2953 struct address_space *mapping; 2954 pgoff_t pgoff; 2955 2956 /* 2957 * vm_pgoff is in PAGE_SIZE units, hence the different calculation 2958 * from page cache lookup which is in HPAGE_SIZE units. 2959 */ 2960 address = address & huge_page_mask(h); 2961 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + 2962 vma->vm_pgoff; 2963 mapping = file_inode(vma->vm_file)->i_mapping; 2964 2965 /* 2966 * Take the mapping lock for the duration of the table walk. 
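	 * (i_mmap_lock_write() also keeps the interval tree stable while
	 * the sharing VMAs are walked below.)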
As 2967 * this mapping should be shared between all the VMAs, 2968 * __unmap_hugepage_range() is called as the lock is already held 2969 */ 2970 i_mmap_lock_write(mapping); 2971 vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) { 2972 /* Do not unmap the current VMA */ 2973 if (iter_vma == vma) 2974 continue; 2975 2976 /* 2977 * Unmap the page from other VMAs without their own reserves. 2978 * They get marked to be SIGKILLed if they fault in these 2979 * areas. This is because a future no-page fault on this VMA 2980 * could insert a zeroed page instead of the data existing 2981 * from the time of fork. This would look like data corruption 2982 */ 2983 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) 2984 unmap_hugepage_range(iter_vma, address, 2985 address + huge_page_size(h), page); 2986 } 2987 i_mmap_unlock_write(mapping); 2988 } 2989 2990 /* 2991 * Hugetlb_cow() should be called with page lock of the original hugepage held. 2992 * Called with hugetlb_instantiation_mutex held and pte_page locked so we 2993 * cannot race with other handlers or page migration. 2994 * Keep the pte_same checks anyway to make transition from the mutex easier. 2995 */ 2996 static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, 2997 unsigned long address, pte_t *ptep, pte_t pte, 2998 struct page *pagecache_page, spinlock_t *ptl) 2999 { 3000 struct hstate *h = hstate_vma(vma); 3001 struct page *old_page, *new_page; 3002 int ret = 0, outside_reserve = 0; 3003 unsigned long mmun_start; /* For mmu_notifiers */ 3004 unsigned long mmun_end; /* For mmu_notifiers */ 3005 3006 old_page = pte_page(pte); 3007 3008 retry_avoidcopy: 3009 /* If no-one else is actually using this page, avoid the copy 3010 * and just make the page writable */ 3011 if (page_mapcount(old_page) == 1 && PageAnon(old_page)) { 3012 page_move_anon_rmap(old_page, vma, address); 3013 set_huge_ptep_writable(vma, address, ptep); 3014 return 0; 3015 } 3016 3017 /* 3018 * If the process that created a MAP_PRIVATE mapping is about to 3019 * perform a COW due to a shared page count, attempt to satisfy 3020 * the allocation without using the existing reserves. The pagecache 3021 * page is used to determine if the reserve at this address was 3022 * consumed or not. If reserves were used, a partial faulted mapping 3023 * at the time of fork() could consume its reserves on COW instead 3024 * of the full address range. 3025 */ 3026 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && 3027 old_page != pagecache_page) 3028 outside_reserve = 1; 3029 3030 page_cache_get(old_page); 3031 3032 /* 3033 * Drop page table lock as buddy allocator may be called. It will 3034 * be acquired again before returning to the caller, as expected. 3035 */ 3036 spin_unlock(ptl); 3037 new_page = alloc_huge_page(vma, address, outside_reserve); 3038 3039 if (IS_ERR(new_page)) { 3040 /* 3041 * If a process owning a MAP_PRIVATE mapping fails to COW, 3042 * it is due to references held by a child and an insufficient 3043 * huge page pool. To guarantee the original mappers 3044 * reliability, unmap the page from child processes. The child 3045 * may get SIGKILLed if it later faults. 
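		 * (The kill is delivered lazily: __unmap_hugepage_range()
		 * marks the child VMA with HPAGE_RESV_UNMAPPED, and
		 * hugetlb_no_page() turns a later fault on it into
		 * VM_FAULT_SIGBUS.)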
3046 */ 3047 if (outside_reserve) { 3048 page_cache_release(old_page); 3049 BUG_ON(huge_pte_none(pte)); 3050 unmap_ref_private(mm, vma, old_page, address); 3051 BUG_ON(huge_pte_none(pte)); 3052 spin_lock(ptl); 3053 ptep = huge_pte_offset(mm, address & huge_page_mask(h)); 3054 if (likely(ptep && 3055 pte_same(huge_ptep_get(ptep), pte))) 3056 goto retry_avoidcopy; 3057 /* 3058 * race occurs while re-acquiring page table 3059 * lock, and our job is done. 3060 */ 3061 return 0; 3062 } 3063 3064 ret = (PTR_ERR(new_page) == -ENOMEM) ? 3065 VM_FAULT_OOM : VM_FAULT_SIGBUS; 3066 goto out_release_old; 3067 } 3068 3069 /* 3070 * When the original hugepage is shared one, it does not have 3071 * anon_vma prepared. 3072 */ 3073 if (unlikely(anon_vma_prepare(vma))) { 3074 ret = VM_FAULT_OOM; 3075 goto out_release_all; 3076 } 3077 3078 copy_user_huge_page(new_page, old_page, address, vma, 3079 pages_per_huge_page(h)); 3080 __SetPageUptodate(new_page); 3081 set_page_huge_active(new_page); 3082 3083 mmun_start = address & huge_page_mask(h); 3084 mmun_end = mmun_start + huge_page_size(h); 3085 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 3086 3087 /* 3088 * Retake the page table lock to check for racing updates 3089 * before the page tables are altered 3090 */ 3091 spin_lock(ptl); 3092 ptep = huge_pte_offset(mm, address & huge_page_mask(h)); 3093 if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { 3094 ClearPagePrivate(new_page); 3095 3096 /* Break COW */ 3097 huge_ptep_clear_flush(vma, address, ptep); 3098 mmu_notifier_invalidate_range(mm, mmun_start, mmun_end); 3099 set_huge_pte_at(mm, address, ptep, 3100 make_huge_pte(vma, new_page, 1)); 3101 page_remove_rmap(old_page); 3102 hugepage_add_new_anon_rmap(new_page, vma, address); 3103 /* Make the old page be freed below */ 3104 new_page = old_page; 3105 } 3106 spin_unlock(ptl); 3107 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 3108 out_release_all: 3109 page_cache_release(new_page); 3110 out_release_old: 3111 page_cache_release(old_page); 3112 3113 spin_lock(ptl); /* Caller expects lock to be held */ 3114 return ret; 3115 } 3116 3117 /* Return the pagecache page at a given address within a VMA */ 3118 static struct page *hugetlbfs_pagecache_page(struct hstate *h, 3119 struct vm_area_struct *vma, unsigned long address) 3120 { 3121 struct address_space *mapping; 3122 pgoff_t idx; 3123 3124 mapping = vma->vm_file->f_mapping; 3125 idx = vma_hugecache_offset(h, vma, address); 3126 3127 return find_lock_page(mapping, idx); 3128 } 3129 3130 /* 3131 * Return whether there is a pagecache page to back given address within VMA. 3132 * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page. 
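 * A plain find_get_page()/put_page() pair is used instead: only the
 * yes/no answer matters here, not a stable reference to the page.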
3133 */ 3134 static bool hugetlbfs_pagecache_present(struct hstate *h, 3135 struct vm_area_struct *vma, unsigned long address) 3136 { 3137 struct address_space *mapping; 3138 pgoff_t idx; 3139 struct page *page; 3140 3141 mapping = vma->vm_file->f_mapping; 3142 idx = vma_hugecache_offset(h, vma, address); 3143 3144 page = find_get_page(mapping, idx); 3145 if (page) 3146 put_page(page); 3147 return page != NULL; 3148 } 3149 3150 static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, 3151 struct address_space *mapping, pgoff_t idx, 3152 unsigned long address, pte_t *ptep, unsigned int flags) 3153 { 3154 struct hstate *h = hstate_vma(vma); 3155 int ret = VM_FAULT_SIGBUS; 3156 int anon_rmap = 0; 3157 unsigned long size; 3158 struct page *page; 3159 pte_t new_pte; 3160 spinlock_t *ptl; 3161 3162 /* 3163 * Currently, we are forced to kill the process in the event the 3164 * original mapper has unmapped pages from the child due to a failed 3165 * COW. Warn that such a situation has occurred as it may not be obvious 3166 */ 3167 if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { 3168 pr_warning("PID %d killed due to inadequate hugepage pool\n", 3169 current->pid); 3170 return ret; 3171 } 3172 3173 /* 3174 * Use page lock to guard against racing truncation 3175 * before we get page_table_lock. 3176 */ 3177 retry: 3178 page = find_lock_page(mapping, idx); 3179 if (!page) { 3180 size = i_size_read(mapping->host) >> huge_page_shift(h); 3181 if (idx >= size) 3182 goto out; 3183 page = alloc_huge_page(vma, address, 0); 3184 if (IS_ERR(page)) { 3185 ret = PTR_ERR(page); 3186 if (ret == -ENOMEM) 3187 ret = VM_FAULT_OOM; 3188 else 3189 ret = VM_FAULT_SIGBUS; 3190 goto out; 3191 } 3192 clear_huge_page(page, address, pages_per_huge_page(h)); 3193 __SetPageUptodate(page); 3194 set_page_huge_active(page); 3195 3196 if (vma->vm_flags & VM_MAYSHARE) { 3197 int err; 3198 struct inode *inode = mapping->host; 3199 3200 err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); 3201 if (err) { 3202 put_page(page); 3203 if (err == -EEXIST) 3204 goto retry; 3205 goto out; 3206 } 3207 ClearPagePrivate(page); 3208 3209 spin_lock(&inode->i_lock); 3210 inode->i_blocks += blocks_per_huge_page(h); 3211 spin_unlock(&inode->i_lock); 3212 } else { 3213 lock_page(page); 3214 if (unlikely(anon_vma_prepare(vma))) { 3215 ret = VM_FAULT_OOM; 3216 goto backout_unlocked; 3217 } 3218 anon_rmap = 1; 3219 } 3220 } else { 3221 /* 3222 * If memory error occurs between mmap() and fault, some process 3223 * don't have hwpoisoned swap entry for errored virtual address. 3224 * So we need to block hugepage fault by PG_hwpoison bit check. 3225 */ 3226 if (unlikely(PageHWPoison(page))) { 3227 ret = VM_FAULT_HWPOISON | 3228 VM_FAULT_SET_HINDEX(hstate_index(h)); 3229 goto backout_unlocked; 3230 } 3231 } 3232 3233 /* 3234 * If we are going to COW a private mapping later, we examine the 3235 * pending reservations for this page now. This will ensure that 3236 * any allocations necessary to record that reservation occur outside 3237 * the spinlock. 
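	 * (vma_needs_reservation() may have to allocate a region
	 * descriptor with GFP_KERNEL, which can sleep, so it must not be
	 * called under the page table lock taken just below.)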
3238 */ 3239 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) 3240 if (vma_needs_reservation(h, vma, address) < 0) { 3241 ret = VM_FAULT_OOM; 3242 goto backout_unlocked; 3243 } 3244 3245 ptl = huge_pte_lockptr(h, mm, ptep); 3246 spin_lock(ptl); 3247 size = i_size_read(mapping->host) >> huge_page_shift(h); 3248 if (idx >= size) 3249 goto backout; 3250 3251 ret = 0; 3252 if (!huge_pte_none(huge_ptep_get(ptep))) 3253 goto backout; 3254 3255 if (anon_rmap) { 3256 ClearPagePrivate(page); 3257 hugepage_add_new_anon_rmap(page, vma, address); 3258 } else 3259 page_dup_rmap(page); 3260 new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) 3261 && (vma->vm_flags & VM_SHARED))); 3262 set_huge_pte_at(mm, address, ptep, new_pte); 3263 3264 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { 3265 /* Optimization, do the COW without a second fault */ 3266 ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl); 3267 } 3268 3269 spin_unlock(ptl); 3270 unlock_page(page); 3271 out: 3272 return ret; 3273 3274 backout: 3275 spin_unlock(ptl); 3276 backout_unlocked: 3277 unlock_page(page); 3278 put_page(page); 3279 goto out; 3280 } 3281 3282 #ifdef CONFIG_SMP 3283 static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm, 3284 struct vm_area_struct *vma, 3285 struct address_space *mapping, 3286 pgoff_t idx, unsigned long address) 3287 { 3288 unsigned long key[2]; 3289 u32 hash; 3290 3291 if (vma->vm_flags & VM_SHARED) { 3292 key[0] = (unsigned long) mapping; 3293 key[1] = idx; 3294 } else { 3295 key[0] = (unsigned long) mm; 3296 key[1] = address >> huge_page_shift(h); 3297 } 3298 3299 hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0); 3300 3301 return hash & (num_fault_mutexes - 1); 3302 } 3303 #else 3304 /* 3305 * For uniprocesor systems we always use a single mutex, so just 3306 * return 0 and avoid the hashing overhead. 3307 */ 3308 static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm, 3309 struct vm_area_struct *vma, 3310 struct address_space *mapping, 3311 pgoff_t idx, unsigned long address) 3312 { 3313 return 0; 3314 } 3315 #endif 3316 3317 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3318 unsigned long address, unsigned int flags) 3319 { 3320 pte_t *ptep, entry; 3321 spinlock_t *ptl; 3322 int ret; 3323 u32 hash; 3324 pgoff_t idx; 3325 struct page *page = NULL; 3326 struct page *pagecache_page = NULL; 3327 struct hstate *h = hstate_vma(vma); 3328 struct address_space *mapping; 3329 int need_wait_lock = 0; 3330 3331 address &= huge_page_mask(h); 3332 3333 ptep = huge_pte_offset(mm, address); 3334 if (ptep) { 3335 entry = huge_ptep_get(ptep); 3336 if (unlikely(is_hugetlb_entry_migration(entry))) { 3337 migration_entry_wait_huge(vma, mm, ptep); 3338 return 0; 3339 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) 3340 return VM_FAULT_HWPOISON_LARGE | 3341 VM_FAULT_SET_HINDEX(hstate_index(h)); 3342 } 3343 3344 ptep = huge_pte_alloc(mm, address, huge_page_size(h)); 3345 if (!ptep) 3346 return VM_FAULT_OOM; 3347 3348 mapping = vma->vm_file->f_mapping; 3349 idx = vma_hugecache_offset(h, vma, address); 3350 3351 /* 3352 * Serialize hugepage allocation and instantiation, so that we don't 3353 * get spurious allocation failures if two CPUs race to instantiate 3354 * the same page in the page cache. 
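	 * The mutex is picked by fault_mutex_hash(): shared mappings hash
	 * on (mapping, idx), so all tasks faulting the same file page
	 * contend on the same mutex, while private mappings hash on
	 * (mm, address).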
3355 */ 3356 hash = fault_mutex_hash(h, mm, vma, mapping, idx, address); 3357 mutex_lock(&htlb_fault_mutex_table[hash]); 3358 3359 entry = huge_ptep_get(ptep); 3360 if (huge_pte_none(entry)) { 3361 ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags); 3362 goto out_mutex; 3363 } 3364 3365 ret = 0; 3366 3367 /* 3368 * entry could be a migration/hwpoison entry at this point, so this 3369 * check prevents the kernel from going below assuming that we have 3370 * a active hugepage in pagecache. This goto expects the 2nd page fault, 3371 * and is_hugetlb_entry_(migration|hwpoisoned) check will properly 3372 * handle it. 3373 */ 3374 if (!pte_present(entry)) 3375 goto out_mutex; 3376 3377 /* 3378 * If we are going to COW the mapping later, we examine the pending 3379 * reservations for this page now. This will ensure that any 3380 * allocations necessary to record that reservation occur outside the 3381 * spinlock. For private mappings, we also lookup the pagecache 3382 * page now as it is used to determine if a reservation has been 3383 * consumed. 3384 */ 3385 if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) { 3386 if (vma_needs_reservation(h, vma, address) < 0) { 3387 ret = VM_FAULT_OOM; 3388 goto out_mutex; 3389 } 3390 3391 if (!(vma->vm_flags & VM_MAYSHARE)) 3392 pagecache_page = hugetlbfs_pagecache_page(h, 3393 vma, address); 3394 } 3395 3396 ptl = huge_pte_lock(h, mm, ptep); 3397 3398 /* Check for a racing update before calling hugetlb_cow */ 3399 if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) 3400 goto out_ptl; 3401 3402 /* 3403 * hugetlb_cow() requires page locks of pte_page(entry) and 3404 * pagecache_page, so here we need take the former one 3405 * when page != pagecache_page or !pagecache_page. 3406 */ 3407 page = pte_page(entry); 3408 if (page != pagecache_page) 3409 if (!trylock_page(page)) { 3410 need_wait_lock = 1; 3411 goto out_ptl; 3412 } 3413 3414 get_page(page); 3415 3416 if (flags & FAULT_FLAG_WRITE) { 3417 if (!huge_pte_write(entry)) { 3418 ret = hugetlb_cow(mm, vma, address, ptep, entry, 3419 pagecache_page, ptl); 3420 goto out_put_page; 3421 } 3422 entry = huge_pte_mkdirty(entry); 3423 } 3424 entry = pte_mkyoung(entry); 3425 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 3426 flags & FAULT_FLAG_WRITE)) 3427 update_mmu_cache(vma, address, ptep); 3428 out_put_page: 3429 if (page != pagecache_page) 3430 unlock_page(page); 3431 put_page(page); 3432 out_ptl: 3433 spin_unlock(ptl); 3434 3435 if (pagecache_page) { 3436 unlock_page(pagecache_page); 3437 put_page(pagecache_page); 3438 } 3439 out_mutex: 3440 mutex_unlock(&htlb_fault_mutex_table[hash]); 3441 /* 3442 * Generally it's safe to hold refcount during waiting page lock. But 3443 * here we just wait to defer the next page fault to avoid busy loop and 3444 * the page is not used after unlocked before returning from the current 3445 * page fault. So we are safe from accessing freed page, even if we wait 3446 * here without taking refcount. 
3447 */ 3448 if (need_wait_lock) 3449 wait_on_page_locked(page); 3450 return ret; 3451 } 3452 3453 long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, 3454 struct page **pages, struct vm_area_struct **vmas, 3455 unsigned long *position, unsigned long *nr_pages, 3456 long i, unsigned int flags) 3457 { 3458 unsigned long pfn_offset; 3459 unsigned long vaddr = *position; 3460 unsigned long remainder = *nr_pages; 3461 struct hstate *h = hstate_vma(vma); 3462 3463 while (vaddr < vma->vm_end && remainder) { 3464 pte_t *pte; 3465 spinlock_t *ptl = NULL; 3466 int absent; 3467 struct page *page; 3468 3469 /* 3470 * If we have a pending SIGKILL, don't keep faulting pages and 3471 * potentially allocating memory. 3472 */ 3473 if (unlikely(fatal_signal_pending(current))) { 3474 remainder = 0; 3475 break; 3476 } 3477 3478 /* 3479 * Some archs (sparc64, sh*) have multiple pte_ts to 3480 * each hugepage. We have to make sure we get the 3481 * first, for the page indexing below to work. 3482 * 3483 * Note that page table lock is not held when pte is null. 3484 */ 3485 pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); 3486 if (pte) 3487 ptl = huge_pte_lock(h, mm, pte); 3488 absent = !pte || huge_pte_none(huge_ptep_get(pte)); 3489 3490 /* 3491 * When coredumping, it suits get_dump_page if we just return 3492 * an error where there's an empty slot with no huge pagecache 3493 * to back it. This way, we avoid allocating a hugepage, and 3494 * the sparse dumpfile avoids allocating disk blocks, but its 3495 * huge holes still show up with zeroes where they need to be. 3496 */ 3497 if (absent && (flags & FOLL_DUMP) && 3498 !hugetlbfs_pagecache_present(h, vma, vaddr)) { 3499 if (pte) 3500 spin_unlock(ptl); 3501 remainder = 0; 3502 break; 3503 } 3504 3505 /* 3506 * We need call hugetlb_fault for both hugepages under migration 3507 * (in which case hugetlb_fault waits for the migration,) and 3508 * hwpoisoned hugepages (in which case we need to prevent the 3509 * caller from accessing to them.) In order to do this, we use 3510 * here is_swap_pte instead of is_hugetlb_entry_migration and 3511 * is_hugetlb_entry_hwpoisoned. This is because it simply covers 3512 * both cases, and because we can't follow correct pages 3513 * directly from any kind of swap entries. 3514 */ 3515 if (absent || is_swap_pte(huge_ptep_get(pte)) || 3516 ((flags & FOLL_WRITE) && 3517 !huge_pte_write(huge_ptep_get(pte)))) { 3518 int ret; 3519 3520 if (pte) 3521 spin_unlock(ptl); 3522 ret = hugetlb_fault(mm, vma, vaddr, 3523 (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0); 3524 if (!(ret & VM_FAULT_ERROR)) 3525 continue; 3526 3527 remainder = 0; 3528 break; 3529 } 3530 3531 pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT; 3532 page = pte_page(huge_ptep_get(pte)); 3533 same_page: 3534 if (pages) { 3535 pages[i] = mem_map_offset(page, pfn_offset); 3536 get_page_foll(pages[i]); 3537 } 3538 3539 if (vmas) 3540 vmas[i] = vma; 3541 3542 vaddr += PAGE_SIZE; 3543 ++pfn_offset; 3544 --remainder; 3545 ++i; 3546 if (vaddr < vma->vm_end && remainder && 3547 pfn_offset < pages_per_huge_page(h)) { 3548 /* 3549 * We use pfn_offset to avoid touching the pageframes 3550 * of this compound page. 3551 */ 3552 goto same_page; 3553 } 3554 spin_unlock(ptl); 3555 } 3556 *nr_pages = remainder; 3557 *position = vaddr; 3558 3559 return i ? 
i : -EFAULT; 3560 } 3561 3562 unsigned long hugetlb_change_protection(struct vm_area_struct *vma, 3563 unsigned long address, unsigned long end, pgprot_t newprot) 3564 { 3565 struct mm_struct *mm = vma->vm_mm; 3566 unsigned long start = address; 3567 pte_t *ptep; 3568 pte_t pte; 3569 struct hstate *h = hstate_vma(vma); 3570 unsigned long pages = 0; 3571 3572 BUG_ON(address >= end); 3573 flush_cache_range(vma, address, end); 3574 3575 mmu_notifier_invalidate_range_start(mm, start, end); 3576 i_mmap_lock_write(vma->vm_file->f_mapping); 3577 for (; address < end; address += huge_page_size(h)) { 3578 spinlock_t *ptl; 3579 ptep = huge_pte_offset(mm, address); 3580 if (!ptep) 3581 continue; 3582 ptl = huge_pte_lock(h, mm, ptep); 3583 if (huge_pmd_unshare(mm, &address, ptep)) { 3584 pages++; 3585 spin_unlock(ptl); 3586 continue; 3587 } 3588 pte = huge_ptep_get(ptep); 3589 if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { 3590 spin_unlock(ptl); 3591 continue; 3592 } 3593 if (unlikely(is_hugetlb_entry_migration(pte))) { 3594 swp_entry_t entry = pte_to_swp_entry(pte); 3595 3596 if (is_write_migration_entry(entry)) { 3597 pte_t newpte; 3598 3599 make_migration_entry_read(&entry); 3600 newpte = swp_entry_to_pte(entry); 3601 set_huge_pte_at(mm, address, ptep, newpte); 3602 pages++; 3603 } 3604 spin_unlock(ptl); 3605 continue; 3606 } 3607 if (!huge_pte_none(pte)) { 3608 pte = huge_ptep_get_and_clear(mm, address, ptep); 3609 pte = pte_mkhuge(huge_pte_modify(pte, newprot)); 3610 pte = arch_make_huge_pte(pte, vma, NULL, 0); 3611 set_huge_pte_at(mm, address, ptep, pte); 3612 pages++; 3613 } 3614 spin_unlock(ptl); 3615 } 3616 /* 3617 * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare 3618 * may have cleared our pud entry and done put_page on the page table: 3619 * once we release i_mmap_rwsem, another task can do the final put_page 3620 * and that page table be reused and filled with junk. 3621 */ 3622 flush_tlb_range(vma, start, end); 3623 mmu_notifier_invalidate_range(mm, start, end); 3624 i_mmap_unlock_write(vma->vm_file->f_mapping); 3625 mmu_notifier_invalidate_range_end(mm, start, end); 3626 3627 return pages << h->order; 3628 } 3629 3630 int hugetlb_reserve_pages(struct inode *inode, 3631 long from, long to, 3632 struct vm_area_struct *vma, 3633 vm_flags_t vm_flags) 3634 { 3635 long ret, chg; 3636 struct hstate *h = hstate_inode(inode); 3637 struct hugepage_subpool *spool = subpool_inode(inode); 3638 struct resv_map *resv_map; 3639 long gbl_reserve; 3640 3641 /* 3642 * Only apply hugepage reservation if asked. At fault time, an 3643 * attempt will be made for VM_NORESERVE to allocate a page 3644 * without using reserves 3645 */ 3646 if (vm_flags & VM_NORESERVE) 3647 return 0; 3648 3649 /* 3650 * Shared mappings base their reservation on the number of pages that 3651 * are already allocated on behalf of the file. Private mappings need 3652 * to reserve the full area even if read-only as mprotect() may be 3653 * called to make the mapping read-write. Assume !vma is a shm mapping 3654 */ 3655 if (!vma || vma->vm_flags & VM_MAYSHARE) { 3656 resv_map = inode_resv_map(inode); 3657 3658 chg = region_chg(resv_map, from, to); 3659 3660 } else { 3661 resv_map = resv_map_alloc(); 3662 if (!resv_map) 3663 return -ENOMEM; 3664 3665 chg = to - from; 3666 3667 set_vma_resv_map(vma, resv_map); 3668 set_vma_resv_flags(vma, HPAGE_RESV_OWNER); 3669 } 3670 3671 if (chg < 0) { 3672 ret = chg; 3673 goto out_err; 3674 } 3675 3676 /* 3677 * There must be enough pages in the subpool for the mapping. 
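	 * hugepage_subpool_get_pages() returns the number of pages the
	 * global pool must still provide for this reservation, or a
	 * negative value when the subpool's max_hpages limit would be
	 * exceeded.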
If 3678 * the subpool has a minimum size, there may be some global 3679 * reservations already in place (gbl_reserve). 3680 */ 3681 gbl_reserve = hugepage_subpool_get_pages(spool, chg); 3682 if (gbl_reserve < 0) { 3683 ret = -ENOSPC; 3684 goto out_err; 3685 } 3686 3687 /* 3688 * Check enough hugepages are available for the reservation. 3689 * Hand the pages back to the subpool if there are not 3690 */ 3691 ret = hugetlb_acct_memory(h, gbl_reserve); 3692 if (ret < 0) { 3693 /* put back original number of pages, chg */ 3694 (void)hugepage_subpool_put_pages(spool, chg); 3695 goto out_err; 3696 } 3697 3698 /* 3699 * Account for the reservations made. Shared mappings record regions 3700 * that have reservations as they are shared by multiple VMAs. 3701 * When the last VMA disappears, the region map says how much 3702 * the reservation was and the page cache tells how much of 3703 * the reservation was consumed. Private mappings are per-VMA and 3704 * only the consumed reservations are tracked. When the VMA 3705 * disappears, the original reservation is the VMA size and the 3706 * consumed reservations are stored in the map. Hence, nothing 3707 * else has to be done for private mappings here 3708 */ 3709 if (!vma || vma->vm_flags & VM_MAYSHARE) { 3710 long add = region_add(resv_map, from, to); 3711 3712 if (unlikely(chg > add)) { 3713 /* 3714 * pages in this range were added to the reserve 3715 * map between region_chg and region_add. This 3716 * indicates a race with alloc_huge_page. Adjust 3717 * the subpool and reserve counts modified above 3718 * based on the difference. 3719 */ 3720 long rsv_adjust; 3721 3722 rsv_adjust = hugepage_subpool_put_pages(spool, 3723 chg - add); 3724 hugetlb_acct_memory(h, -rsv_adjust); 3725 } 3726 } 3727 return 0; 3728 out_err: 3729 if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) 3730 kref_put(&resv_map->refs, resv_map_release); 3731 return ret; 3732 } 3733 3734 void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) 3735 { 3736 struct hstate *h = hstate_inode(inode); 3737 struct resv_map *resv_map = inode_resv_map(inode); 3738 long chg = 0; 3739 struct hugepage_subpool *spool = subpool_inode(inode); 3740 long gbl_reserve; 3741 3742 if (resv_map) 3743 chg = region_truncate(resv_map, offset); 3744 spin_lock(&inode->i_lock); 3745 inode->i_blocks -= (blocks_per_huge_page(h) * freed); 3746 spin_unlock(&inode->i_lock); 3747 3748 /* 3749 * If the subpool has a minimum size, the number of global 3750 * reservations to be released may be adjusted. 3751 */ 3752 gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed)); 3753 hugetlb_acct_memory(h, -gbl_reserve); 3754 } 3755 3756 #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE 3757 static unsigned long page_table_shareable(struct vm_area_struct *svma, 3758 struct vm_area_struct *vma, 3759 unsigned long addr, pgoff_t idx) 3760 { 3761 unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) + 3762 svma->vm_start; 3763 unsigned long sbase = saddr & PUD_MASK; 3764 unsigned long s_end = sbase + PUD_SIZE; 3765 3766 /* Allow segments to share if only one is marked locked */ 3767 unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED; 3768 unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED; 3769 3770 /* 3771 * match the virtual addresses, permission and the alignment of the 3772 * page table page. 
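	 * Concretely, the two VMAs must map the file with the same
	 * permissions (VM_LOCKED aside) and the address must sit at the
	 * same offset inside a PUD_SIZE-aligned region (1GB on x86_64)
	 * that the candidate VMA fully covers; otherwise 0 is returned
	 * and no sharing is attempted.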
3773 */ 3774 if (pmd_index(addr) != pmd_index(saddr) || 3775 vm_flags != svm_flags || 3776 sbase < svma->vm_start || svma->vm_end < s_end) 3777 return 0; 3778 3779 return saddr; 3780 } 3781 3782 static int vma_shareable(struct vm_area_struct *vma, unsigned long addr) 3783 { 3784 unsigned long base = addr & PUD_MASK; 3785 unsigned long end = base + PUD_SIZE; 3786 3787 /* 3788 * check on proper vm_flags and page table alignment 3789 */ 3790 if (vma->vm_flags & VM_MAYSHARE && 3791 vma->vm_start <= base && end <= vma->vm_end) 3792 return 1; 3793 return 0; 3794 } 3795 3796 /* 3797 * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() 3798 * and returns the corresponding pte. While this is not necessary for the 3799 * !shared pmd case because we can allocate the pmd later as well, it makes the 3800 * code much cleaner. pmd allocation is essential for the shared case because 3801 * pud has to be populated inside the same i_mmap_rwsem section - otherwise 3802 * racing tasks could either miss the sharing (see huge_pte_offset) or select a 3803 * bad pmd for sharing. 3804 */ 3805 pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) 3806 { 3807 struct vm_area_struct *vma = find_vma(mm, addr); 3808 struct address_space *mapping = vma->vm_file->f_mapping; 3809 pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + 3810 vma->vm_pgoff; 3811 struct vm_area_struct *svma; 3812 unsigned long saddr; 3813 pte_t *spte = NULL; 3814 pte_t *pte; 3815 spinlock_t *ptl; 3816 3817 if (!vma_shareable(vma, addr)) 3818 return (pte_t *)pmd_alloc(mm, pud, addr); 3819 3820 i_mmap_lock_write(mapping); 3821 vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { 3822 if (svma == vma) 3823 continue; 3824 3825 saddr = page_table_shareable(svma, vma, addr, idx); 3826 if (saddr) { 3827 spte = huge_pte_offset(svma->vm_mm, saddr); 3828 if (spte) { 3829 mm_inc_nr_pmds(mm); 3830 get_page(virt_to_page(spte)); 3831 break; 3832 } 3833 } 3834 } 3835 3836 if (!spte) 3837 goto out; 3838 3839 ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte); 3840 spin_lock(ptl); 3841 if (pud_none(*pud)) { 3842 pud_populate(mm, pud, 3843 (pmd_t *)((unsigned long)spte & PAGE_MASK)); 3844 } else { 3845 put_page(virt_to_page(spte)); 3846 mm_inc_nr_pmds(mm); 3847 } 3848 spin_unlock(ptl); 3849 out: 3850 pte = (pte_t *)pmd_alloc(mm, pud, addr); 3851 i_mmap_unlock_write(mapping); 3852 return pte; 3853 } 3854 3855 /* 3856 * unmap huge page backed by shared pte. 3857 * 3858 * Hugetlb pte page is ref counted at the time of mapping. If pte is shared 3859 * indicated by page_count > 1, unmap is achieved by clearing pud and 3860 * decrementing the ref count. If count == 1, the pte page is not shared. 3861 * 3862 * called with page table lock held. 
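 * When the shared pte page is unmapped here, the whole range it mapped
 * goes away at once, so *addr is adjusted to make the caller's loop
 * skip ahead over that range.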
3863 * 3864 * returns: 1 successfully unmapped a shared pte page 3865 * 0 the underlying pte page is not shared, or it is the last user 3866 */ 3867 int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) 3868 { 3869 pgd_t *pgd = pgd_offset(mm, *addr); 3870 pud_t *pud = pud_offset(pgd, *addr); 3871 3872 BUG_ON(page_count(virt_to_page(ptep)) == 0); 3873 if (page_count(virt_to_page(ptep)) == 1) 3874 return 0; 3875 3876 pud_clear(pud); 3877 put_page(virt_to_page(ptep)); 3878 mm_dec_nr_pmds(mm); 3879 *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE; 3880 return 1; 3881 } 3882 #define want_pmd_share() (1) 3883 #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ 3884 pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) 3885 { 3886 return NULL; 3887 } 3888 3889 int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) 3890 { 3891 return 0; 3892 } 3893 #define want_pmd_share() (0) 3894 #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ 3895 3896 #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB 3897 pte_t *huge_pte_alloc(struct mm_struct *mm, 3898 unsigned long addr, unsigned long sz) 3899 { 3900 pgd_t *pgd; 3901 pud_t *pud; 3902 pte_t *pte = NULL; 3903 3904 pgd = pgd_offset(mm, addr); 3905 pud = pud_alloc(mm, pgd, addr); 3906 if (pud) { 3907 if (sz == PUD_SIZE) { 3908 pte = (pte_t *)pud; 3909 } else { 3910 BUG_ON(sz != PMD_SIZE); 3911 if (want_pmd_share() && pud_none(*pud)) 3912 pte = huge_pmd_share(mm, addr, pud); 3913 else 3914 pte = (pte_t *)pmd_alloc(mm, pud, addr); 3915 } 3916 } 3917 BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte)); 3918 3919 return pte; 3920 } 3921 3922 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) 3923 { 3924 pgd_t *pgd; 3925 pud_t *pud; 3926 pmd_t *pmd = NULL; 3927 3928 pgd = pgd_offset(mm, addr); 3929 if (pgd_present(*pgd)) { 3930 pud = pud_offset(pgd, addr); 3931 if (pud_present(*pud)) { 3932 if (pud_huge(*pud)) 3933 return (pte_t *)pud; 3934 pmd = pmd_offset(pud, addr); 3935 } 3936 } 3937 return (pte_t *) pmd; 3938 } 3939 3940 #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ 3941 3942 /* 3943 * These functions are overwritable if your architecture needs its own 3944 * behavior. 3945 */ 3946 struct page * __weak 3947 follow_huge_addr(struct mm_struct *mm, unsigned long address, 3948 int write) 3949 { 3950 return ERR_PTR(-EINVAL); 3951 } 3952 3953 struct page * __weak 3954 follow_huge_pmd(struct mm_struct *mm, unsigned long address, 3955 pmd_t *pmd, int flags) 3956 { 3957 struct page *page = NULL; 3958 spinlock_t *ptl; 3959 retry: 3960 ptl = pmd_lockptr(mm, pmd); 3961 spin_lock(ptl); 3962 /* 3963 * make sure that the address range covered by this pmd is not 3964 * unmapped from other threads. 3965 */ 3966 if (!pmd_huge(*pmd)) 3967 goto out; 3968 if (pmd_present(*pmd)) { 3969 page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT); 3970 if (flags & FOLL_GET) 3971 get_page(page); 3972 } else { 3973 if (is_hugetlb_entry_migration(huge_ptep_get((pte_t *)pmd))) { 3974 spin_unlock(ptl); 3975 __migration_entry_wait(mm, (pte_t *)pmd, ptl); 3976 goto retry; 3977 } 3978 /* 3979 * hwpoisoned entry is treated as no_page_table in 3980 * follow_page_mask(). 
3981 */ 3982 } 3983 out: 3984 spin_unlock(ptl); 3985 return page; 3986 } 3987 3988 struct page * __weak 3989 follow_huge_pud(struct mm_struct *mm, unsigned long address, 3990 pud_t *pud, int flags) 3991 { 3992 if (flags & FOLL_GET) 3993 return NULL; 3994 3995 return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); 3996 } 3997 3998 #ifdef CONFIG_MEMORY_FAILURE 3999 4000 /* 4001 * This function is called from memory failure code. 4002 * Assume the caller holds page lock of the head page. 4003 */ 4004 int dequeue_hwpoisoned_huge_page(struct page *hpage) 4005 { 4006 struct hstate *h = page_hstate(hpage); 4007 int nid = page_to_nid(hpage); 4008 int ret = -EBUSY; 4009 4010 spin_lock(&hugetlb_lock); 4011 /* 4012 * Just checking !page_huge_active is not enough, because that could be 4013 * an isolated/hwpoisoned hugepage (which have >0 refcount). 4014 */ 4015 if (!page_huge_active(hpage) && !page_count(hpage)) { 4016 /* 4017 * Hwpoisoned hugepage isn't linked to activelist or freelist, 4018 * but dangling hpage->lru can trigger list-debug warnings 4019 * (this happens when we call unpoison_memory() on it), 4020 * so let it point to itself with list_del_init(). 4021 */ 4022 list_del_init(&hpage->lru); 4023 set_page_refcounted(hpage); 4024 h->free_huge_pages--; 4025 h->free_huge_pages_node[nid]--; 4026 ret = 0; 4027 } 4028 spin_unlock(&hugetlb_lock); 4029 return ret; 4030 } 4031 #endif 4032 4033 bool isolate_huge_page(struct page *page, struct list_head *list) 4034 { 4035 bool ret = true; 4036 4037 VM_BUG_ON_PAGE(!PageHead(page), page); 4038 spin_lock(&hugetlb_lock); 4039 if (!page_huge_active(page) || !get_page_unless_zero(page)) { 4040 ret = false; 4041 goto unlock; 4042 } 4043 clear_page_huge_active(page); 4044 list_move_tail(&page->lru, list); 4045 unlock: 4046 spin_unlock(&hugetlb_lock); 4047 return ret; 4048 } 4049 4050 void putback_active_hugepage(struct page *page) 4051 { 4052 VM_BUG_ON_PAGE(!PageHead(page), page); 4053 spin_lock(&hugetlb_lock); 4054 set_page_huge_active(page); 4055 list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist); 4056 spin_unlock(&hugetlb_lock); 4057 put_page(page); 4058 } 4059