/*
 * Generic hugetlb support.
 * (C) Nadia Yvette Chambers, April 2004
 */
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/mmu_notifier.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/compiler.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>
#include <linux/bootmem.h>
#include <linux/sysfs.h>
#include <linux/slab.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/page-isolation.h>
#include <linux/jhash.h>

#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/tlb.h>

#include <linux/io.h>
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/node.h>
#include "internal.h"

int hugepages_treat_as_movable;

int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
struct hstate hstates[HUGE_MAX_HSTATE];

__initdata LIST_HEAD(huge_boot_pages);

/* for command line parsing */
static struct hstate * __initdata parsed_hstate;
static unsigned long __initdata default_hstate_max_huge_pages;
static unsigned long __initdata default_hstate_size;

/*
 * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
 * free_huge_pages, and surplus_huge_pages.
 */
DEFINE_SPINLOCK(hugetlb_lock);

/*
 * Serializes faults on the same logical page.  This is used to
 * prevent spurious OOMs when the hugepage pool is fully utilized.
 */
static int num_fault_mutexes;
static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp;

/* Forward declaration */
static int hugetlb_acct_memory(struct hstate *h, long delta);

static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
{
	bool free = (spool->count == 0) && (spool->used_hpages == 0);

	spin_unlock(&spool->lock);

	/* If no pages are used, and no other handles to the subpool
	 * remain, give up any reservations based on minimum size and
	 * free the subpool */
	if (free) {
		if (spool->min_hpages != -1)
			hugetlb_acct_memory(spool->hstate,
						-spool->min_hpages);
		kfree(spool);
	}
}

struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
						long min_hpages)
{
	struct hugepage_subpool *spool;

	spool = kzalloc(sizeof(*spool), GFP_KERNEL);
	if (!spool)
		return NULL;

	spin_lock_init(&spool->lock);
	spool->count = 1;
	spool->max_hpages = max_hpages;
	spool->hstate = h;
	spool->min_hpages = min_hpages;

	if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
		kfree(spool);
		return NULL;
	}
	spool->rsv_hpages = min_hpages;

	return spool;
}

void hugepage_put_subpool(struct hugepage_subpool *spool)
{
	spin_lock(&spool->lock);
	BUG_ON(!spool->count);
	spool->count--;
	unlock_or_release_subpool(spool);
}
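
/*
 * Editorial illustration (not from the original source): assuming a subpool
 * created with min_hpages = 8 and max_hpages = -1, hugepage_new_subpool()
 * charges 8 pages against the global pool up front and sets rsv_hpages = 8.
 * A later hugepage_subpool_get_pages(spool, 3) is fully covered by that
 * reserve, so it returns 0 and leaves rsv_hpages = 5; once rsv_hpages is
 * exhausted, the same call returns the uncovered remainder, which the caller
 * must charge against the global pool.  hugepage_subpool_put_pages() works
 * in reverse: it refills rsv_hpages up to min_hpages and returns how many
 * global reservations may now be dropped.
 */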

/*
 * Subpool accounting for allocating and reserving pages.
 * Return -ENOMEM if there are not enough resources to satisfy the
 * request.  Otherwise, return the number of pages by which the
 * global pools must be adjusted (upward).  The returned value may
 * only be different than the passed value (delta) in the case where
 * a subpool minimum size must be maintained.
 */
static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
				       long delta)
{
	long ret = delta;

	if (!spool)
		return ret;

	spin_lock(&spool->lock);

	if (spool->max_hpages != -1) {		/* maximum size accounting */
		if ((spool->used_hpages + delta) <= spool->max_hpages)
			spool->used_hpages += delta;
		else {
			ret = -ENOMEM;
			goto unlock_ret;
		}
	}

	if (spool->min_hpages != -1) {		/* minimum size accounting */
		if (delta > spool->rsv_hpages) {
			/*
			 * Asking for more reserves than those already taken on
			 * behalf of subpool.  Return difference.
			 */
			ret = delta - spool->rsv_hpages;
			spool->rsv_hpages = 0;
		} else {
			ret = 0;	/* reserves already accounted for */
			spool->rsv_hpages -= delta;
		}
	}

unlock_ret:
	spin_unlock(&spool->lock);
	return ret;
}

/*
 * Subpool accounting for freeing and unreserving pages.
 * Return the number of global page reservations that must be dropped.
 * The return value may only be different than the passed value (delta)
 * in the case where a subpool minimum size must be maintained.
 */
static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
				       long delta)
{
	long ret = delta;

	if (!spool)
		return delta;

	spin_lock(&spool->lock);

	if (spool->max_hpages != -1)		/* maximum size accounting */
		spool->used_hpages -= delta;

	if (spool->min_hpages != -1) {		/* minimum size accounting */
		if (spool->rsv_hpages + delta <= spool->min_hpages)
			ret = 0;
		else
			ret = spool->rsv_hpages + delta - spool->min_hpages;

		spool->rsv_hpages += delta;
		if (spool->rsv_hpages > spool->min_hpages)
			spool->rsv_hpages = spool->min_hpages;
	}

	/*
	 * If hugetlbfs_put_super couldn't free spool due to an outstanding
	 * quota reference, free it now.
	 */
	unlock_or_release_subpool(spool);

	return ret;
}

static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
{
	return HUGETLBFS_SB(inode->i_sb)->spool;
}

static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
{
	return subpool_inode(file_inode(vma->vm_file));
}

/*
 * Region tracking -- allows tracking of reservations and instantiated pages
 * across the pages in a mapping.
 *
 * The region data structures are embedded into a resv_map and
 * protected by a resv_map's lock.
 */
struct file_region {
	struct list_head link;
	long from;
	long to;
};

static long region_add(struct resv_map *resv, long f, long t)
{
	struct list_head *head = &resv->regions;
	struct file_region *rg, *nrg, *trg;

	spin_lock(&resv->lock);
	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;

	/* Check for and consume any regions we now overlap with. */
	nrg = rg;
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			break;

		/* If this area reaches higher then extend our area to
		 * include it completely.  If this is not the first area
		 * which we intend to reuse, free it. */
		if (rg->to > t)
			t = rg->to;
		if (rg != nrg) {
			list_del(&rg->link);
			kfree(rg);
		}
	}
	nrg->from = f;
	nrg->to = t;
	spin_unlock(&resv->lock);
	return 0;
}

static long region_chg(struct resv_map *resv, long f, long t)
{
	struct list_head *head = &resv->regions;
	struct file_region *rg, *nrg = NULL;
	long chg = 0;

retry:
	spin_lock(&resv->lock);
	/* Locate the region we are before or in. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* If we are below the current region then a new region is required.
	 * Subtle, allocate a new region at the position but make it zero
	 * size such that we can guarantee to record the reservation. */
	if (&rg->link == head || t < rg->from) {
		if (!nrg) {
			spin_unlock(&resv->lock);
			nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
			if (!nrg)
				return -ENOMEM;

			nrg->from = f;
			nrg->to = f;
			INIT_LIST_HEAD(&nrg->link);
			goto retry;
		}

		list_add(&nrg->link, rg->link.prev);
		chg = t - f;
		goto out_nrg;
	}

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;
	chg = t - f;

	/* Check for and consume any regions we now overlap with. */
	list_for_each_entry(rg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			goto out;

		/* We overlap with this area, if it extends further than
		 * us then we must extend ourselves.  Account for its
		 * existing reservation. */
		if (rg->to > t) {
			chg += rg->to - t;
			t = rg->to;
		}
		chg -= rg->to - rg->from;
	}

out:
	spin_unlock(&resv->lock);
	/* We already know we raced and no longer need the new region */
	kfree(nrg);
	return chg;
out_nrg:
	spin_unlock(&resv->lock);
	return chg;
}

static long region_truncate(struct resv_map *resv, long end)
{
	struct list_head *head = &resv->regions;
	struct file_region *rg, *trg;
	long chg = 0;

	spin_lock(&resv->lock);
	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (end <= rg->to)
			break;
	if (&rg->link == head)
		goto out;

	/* If we are in the middle of a region then adjust it. */
	if (end > rg->from) {
		chg = rg->to - end;
		rg->to = end;
		rg = list_entry(rg->link.next, typeof(*rg), link);
	}

	/* Drop any remaining regions. */
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		chg += rg->to - rg->from;
		list_del(&rg->link);
		kfree(rg);
	}

out:
	spin_unlock(&resv->lock);
	return chg;
}

static long region_count(struct resv_map *resv, long f, long t)
{
	struct list_head *head = &resv->regions;
	struct file_region *rg;
	long chg = 0;

	spin_lock(&resv->lock);
	/* Locate each segment we overlap with, and count that overlap. */
	list_for_each_entry(rg, head, link) {
		long seg_from;
		long seg_to;

		if (rg->to <= f)
			continue;
		if (rg->from >= t)
			break;

		seg_from = max(rg->from, f);
		seg_to = min(rg->to, t);

		chg += seg_to - seg_from;
	}
	spin_unlock(&resv->lock);

	return chg;
}
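
/*
 * Editorial note (worked example, not from the original source): the region
 * list holds disjoint [from, to) spans in ascending order, measured in huge
 * pages.  With a map containing the single region [0, 4), region_chg(resv,
 * 2, 3) finds the offset already covered and returns 0; on an empty map the
 * same call returns 1 and leaves behind a zero-sized placeholder region so
 * that the follow-up region_add() cannot fail for lack of memory.
 */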

/*
 * Convert the address within this vma to the page offset within
 * the mapping, in pagecache page units; huge pages here.
 */
static pgoff_t vma_hugecache_offset(struct hstate *h,
			struct vm_area_struct *vma, unsigned long address)
{
	return ((address - vma->vm_start) >> huge_page_shift(h)) +
			(vma->vm_pgoff >> huge_page_order(h));
}

pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
				     unsigned long address)
{
	return vma_hugecache_offset(hstate_vma(vma), vma, address);
}

/*
 * Return the size of the pages allocated when backing a VMA. In the majority
 * of cases this will be the same size as used by the page table entries.
 */
unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
{
	struct hstate *hstate;

	if (!is_vm_hugetlb_page(vma))
		return PAGE_SIZE;

	hstate = hstate_vma(vma);

	return 1UL << huge_page_shift(hstate);
}
EXPORT_SYMBOL_GPL(vma_kernel_pagesize);

/*
 * Return the page size being used by the MMU to back a VMA. In the majority
 * of cases, the page size used by the kernel matches the MMU size. On
 * architectures where it differs, an architecture-specific version of this
 * function is required.
 */
#ifndef vma_mmu_pagesize
unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
	return vma_kernel_pagesize(vma);
}
#endif

/*
 * Flags for MAP_PRIVATE reservations.  These are stored in the bottom
 * bits of the reservation map pointer, which are always clear due to
 * alignment.
 */
#define HPAGE_RESV_OWNER    (1UL << 0)
#define HPAGE_RESV_UNMAPPED (1UL << 1)
#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)

/*
 * These helpers are used to track how many pages are reserved for
 * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
 * is guaranteed to have their future faults succeed.
 *
 * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
 * the reserve counters are updated with the hugetlb_lock held. It is safe
 * to reset the VMA at fork() time as it is not in use yet and there is no
 * chance of the global counters getting corrupted as a result of the values.
 *
 * The private mapping reservation is represented in a subtly different
 * manner to a shared mapping.  A shared mapping has a region map associated
 * with the underlying file; this region map represents the backing file
 * pages which have ever had a reservation assigned, and it persists even
 * after the page is instantiated.  A private mapping has a region map
 * associated with the original mmap which is attached to all VMAs which
 * reference it; this region map represents those offsets which have consumed
 * reservation, i.e. where pages have been instantiated.
 */
static unsigned long get_vma_private_data(struct vm_area_struct *vma)
{
	return (unsigned long)vma->vm_private_data;
}

static void set_vma_private_data(struct vm_area_struct *vma,
							unsigned long value)
{
	vma->vm_private_data = (void *)value;
}

struct resv_map *resv_map_alloc(void)
{
	struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
	if (!resv_map)
		return NULL;

	kref_init(&resv_map->refs);
	spin_lock_init(&resv_map->lock);
	INIT_LIST_HEAD(&resv_map->regions);

	return resv_map;
}

void resv_map_release(struct kref *ref)
{
	struct resv_map *resv_map = container_of(ref, struct resv_map, refs);

	/* Clear out any active regions before we release the map. */
	region_truncate(resv_map, 0);
	kfree(resv_map);
}

static inline struct resv_map *inode_resv_map(struct inode *inode)
{
	return inode->i_mapping->private_data;
}

static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
{
	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
	if (vma->vm_flags & VM_MAYSHARE) {
		struct address_space *mapping = vma->vm_file->f_mapping;
		struct inode *inode = mapping->host;

		return inode_resv_map(inode);

	} else {
		return (struct resv_map *)(get_vma_private_data(vma) &
							~HPAGE_RESV_MASK);
	}
}

static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
{
	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
	VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);

	set_vma_private_data(vma, (get_vma_private_data(vma) &
				HPAGE_RESV_MASK) | (unsigned long)map);
}

static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
{
	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
	VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);

	set_vma_private_data(vma, get_vma_private_data(vma) | flags);
}

static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
{
	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);

	return (get_vma_private_data(vma) & flag) != 0;
}

/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
{
	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
	if (!(vma->vm_flags & VM_MAYSHARE))
		vma->vm_private_data = (void *)0;
}

/* Returns true if the VMA has associated reserve pages */
static int vma_has_reserves(struct vm_area_struct *vma, long chg)
{
	if (vma->vm_flags & VM_NORESERVE) {
		/*
		 * This address is already reserved by another process
		 * (chg == 0), so we should decrement the reserved count.
		 * Without decrementing, the reserve count remains after
		 * releasing the inode, because the allocated page will go
		 * into the page cache and be regarded as coming from the
		 * reserved pool when it is released.  Currently, we don't
		 * have any other solution to deal with this situation
		 * properly, so add this work-around here.
		 */
		if (vma->vm_flags & VM_MAYSHARE && chg == 0)
			return 1;
		else
			return 0;
	}

	/* Shared mappings always use reserves */
	if (vma->vm_flags & VM_MAYSHARE)
		return 1;

	/*
	 * Only the process that called mmap() has reserves for
	 * private mappings.
	 */
	if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
		return 1;

	return 0;
}

static void enqueue_huge_page(struct hstate *h, struct page *page)
{
	int nid = page_to_nid(page);
	list_move(&page->lru, &h->hugepage_freelists[nid]);
	h->free_huge_pages++;
	h->free_huge_pages_node[nid]++;
}

static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
{
	struct page *page;

	list_for_each_entry(page, &h->hugepage_freelists[nid], lru)
		if (!is_migrate_isolate_page(page))
			break;
	/*
	 * If no non-isolated free hugepage is found on the list,
	 * the allocation fails.
	 */
	if (&h->hugepage_freelists[nid] == &page->lru)
		return NULL;
	list_move(&page->lru, &h->hugepage_activelist);
	set_page_refcounted(page);
	h->free_huge_pages--;
	h->free_huge_pages_node[nid]--;
	return page;
}

/* Movability of hugepages depends on migration support. */
static inline gfp_t htlb_alloc_mask(struct hstate *h)
{
	if (hugepages_treat_as_movable || hugepage_migration_supported(h))
		return GFP_HIGHUSER_MOVABLE;
	else
		return GFP_HIGHUSER;
}

static struct page *dequeue_huge_page_vma(struct hstate *h,
				struct vm_area_struct *vma,
				unsigned long address, int avoid_reserve,
				long chg)
{
	struct page *page = NULL;
	struct mempolicy *mpol;
	nodemask_t *nodemask;
	struct zonelist *zonelist;
	struct zone *zone;
	struct zoneref *z;
	unsigned int cpuset_mems_cookie;

	/*
	 * A child process with MAP_PRIVATE mappings created by its parent
	 * has no page reserves.  This check ensures that reservations are
	 * not "stolen".  The child may still get SIGKILLed.
	 */
	if (!vma_has_reserves(vma, chg) &&
			h->free_huge_pages - h->resv_huge_pages == 0)
		goto err;

	/* If reserves cannot be used, ensure enough pages are in the pool */
	if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
		goto err;

retry_cpuset:
	cpuset_mems_cookie = read_mems_allowed_begin();
	zonelist = huge_zonelist(vma, address,
					htlb_alloc_mask(h), &mpol, &nodemask);

	for_each_zone_zonelist_nodemask(zone, z, zonelist,
						MAX_NR_ZONES - 1, nodemask) {
		if (cpuset_zone_allowed(zone, htlb_alloc_mask(h))) {
			page = dequeue_huge_page_node(h, zone_to_nid(zone));
			if (page) {
				if (avoid_reserve)
					break;
				if (!vma_has_reserves(vma, chg))
					break;

				SetPagePrivate(page);
				h->resv_huge_pages--;
				break;
			}
		}
	}

	mpol_cond_put(mpol);
	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
		goto retry_cpuset;
	return page;

err:
	return NULL;
}
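
/*
 * Editorial note (not from the original source): the helpers and macros
 * below implement round-robin node selection for pool grow/shrink.  For
 * example, assuming nodes_allowed = {0, 2, 3} and h->next_nid_to_alloc == 2,
 * for_each_node_mask_to_alloc() visits nodes 2, 3, 0 (wrapping around via
 * first_node()) and then stops after nodes_weight() iterations.
 */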

/*
 * common helper functions for hstate_next_node_to_{alloc|free}.
 * We may have allocated or freed a huge page based on a different
 * nodes_allowed previously, so h->next_node_to_{alloc|free} might
 * be outside of *nodes_allowed.  Ensure that we use an allowed
 * node for alloc or free.
 */
static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
{
	nid = next_node(nid, *nodes_allowed);
	if (nid == MAX_NUMNODES)
		nid = first_node(*nodes_allowed);
	VM_BUG_ON(nid >= MAX_NUMNODES);

	return nid;
}

static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
{
	if (!node_isset(nid, *nodes_allowed))
		nid = next_node_allowed(nid, nodes_allowed);
	return nid;
}

/*
 * returns the previously saved node ["this node"] from which to
 * allocate a persistent huge page for the pool and advance the
 * next node from which to allocate, handling wrap at end of node
 * mask.
 */
static int hstate_next_node_to_alloc(struct hstate *h,
					nodemask_t *nodes_allowed)
{
	int nid;

	VM_BUG_ON(!nodes_allowed);

	nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
	h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);

	return nid;
}

/*
 * helper for free_pool_huge_page() - return the previously saved
 * node ["this node"] from which to free a huge page.  Advance the
 * next node id whether or not we find a free huge page to free so
 * that the next attempt to free addresses the next node.
 */
static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
{
	int nid;

	VM_BUG_ON(!nodes_allowed);

	nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
	h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);

	return nid;
}

#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask)		\
	for (nr_nodes = nodes_weight(*mask);				\
		nr_nodes > 0 &&						\
		((node = hstate_next_node_to_alloc(hs, mask)) || 1);	\
		nr_nodes--)

#define for_each_node_mask_to_free(hs, nr_nodes, node, mask)		\
	for (nr_nodes = nodes_weight(*mask);				\
		nr_nodes > 0 &&						\
		((node = hstate_next_node_to_free(hs, mask)) || 1);	\
		nr_nodes--)

#if defined(CONFIG_CMA) && defined(CONFIG_X86_64)
static void destroy_compound_gigantic_page(struct page *page,
					unsigned long order)
{
	int i;
	int nr_pages = 1 << order;
	struct page *p = page + 1;

	for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
		__ClearPageTail(p);
		set_page_refcounted(p);
		p->first_page = NULL;
	}

	set_compound_order(page, 0);
	__ClearPageHead(page);
}

static void free_gigantic_page(struct page *page, unsigned order)
{
	free_contig_range(page_to_pfn(page), 1 << order);
}

static int __alloc_gigantic_page(unsigned long start_pfn,
				unsigned long nr_pages)
{
	unsigned long end_pfn = start_pfn + nr_pages;
	return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
}

static bool pfn_range_valid_gigantic(unsigned long start_pfn,
				unsigned long nr_pages)
{
	unsigned long i, end_pfn = start_pfn + nr_pages;
	struct page *page;

	for (i = start_pfn; i < end_pfn; i++) {
		if (!pfn_valid(i))
			return false;

		page = pfn_to_page(i);

		if (PageReserved(page))
			return false;

		if (page_count(page) > 0)
			return false;

		if (PageHuge(page))
			return false;
	}

	return true;
}

static bool zone_spans_last_pfn(const struct zone *zone,
			unsigned long start_pfn, unsigned long nr_pages)
{
	unsigned long last_pfn = start_pfn + nr_pages - 1;
	return zone_spans_pfn(zone, last_pfn);
}

static struct page *alloc_gigantic_page(int nid, unsigned order)
{
	unsigned long nr_pages = 1 << order;
	unsigned long ret, pfn, flags;
	struct zone *z;

	z = NODE_DATA(nid)->node_zones;
	for (; z - NODE_DATA(nid)->node_zones < MAX_NR_ZONES; z++) {
		spin_lock_irqsave(&z->lock, flags);

		pfn = ALIGN(z->zone_start_pfn, nr_pages);
		while (zone_spans_last_pfn(z, pfn, nr_pages)) {
			if (pfn_range_valid_gigantic(pfn, nr_pages)) {
				/*
				 * We release the zone lock here because
				 * alloc_contig_range() will also lock the zone
				 * at some point. If there's an allocation
				 * spinning on this lock, it may win the race
				 * and cause alloc_contig_range() to fail...
				 */
				spin_unlock_irqrestore(&z->lock, flags);
				ret = __alloc_gigantic_page(pfn, nr_pages);
				if (!ret)
					return pfn_to_page(pfn);
				spin_lock_irqsave(&z->lock, flags);
			}
			pfn += nr_pages;
		}

		spin_unlock_irqrestore(&z->lock, flags);
	}

	return NULL;
}

static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
static void prep_compound_gigantic_page(struct page *page, unsigned long order);

static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid)
{
	struct page *page;

	page = alloc_gigantic_page(nid, huge_page_order(h));
	if (page) {
		prep_compound_gigantic_page(page, huge_page_order(h));
		prep_new_huge_page(h, page, nid);
	}

	return page;
}

static int alloc_fresh_gigantic_page(struct hstate *h,
				nodemask_t *nodes_allowed)
{
	struct page *page = NULL;
	int nr_nodes, node;

	for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
		page = alloc_fresh_gigantic_page_node(h, node);
		if (page)
			return 1;
	}

	return 0;
}

static inline bool gigantic_page_supported(void) { return true; }
#else
static inline bool gigantic_page_supported(void) { return false; }
static inline void free_gigantic_page(struct page *page, unsigned order) { }
static inline void destroy_compound_gigantic_page(struct page *page,
						unsigned long order) { }
static inline int alloc_fresh_gigantic_page(struct hstate *h,
					nodemask_t *nodes_allowed) { return 0; }
#endif

static void update_and_free_page(struct hstate *h, struct page *page)
{
	int i;

	if (hstate_is_gigantic(h) && !gigantic_page_supported())
		return;

	h->nr_huge_pages--;
	h->nr_huge_pages_node[page_to_nid(page)]--;
	for (i = 0; i < pages_per_huge_page(h); i++) {
		page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
				1 << PG_referenced | 1 << PG_dirty |
				1 << PG_active | 1 << PG_private |
				1 << PG_writeback);
	}
	VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
	set_compound_page_dtor(page, NULL);
	set_page_refcounted(page);
	if (hstate_is_gigantic(h)) {
		destroy_compound_gigantic_page(page, huge_page_order(h));
		free_gigantic_page(page, huge_page_order(h));
	} else {
		arch_release_hugepage(page);
		__free_pages(page, huge_page_order(h));
	}
}

struct hstate *size_to_hstate(unsigned long size)
{
	struct hstate *h;

	for_each_hstate(h) {
		if (huge_page_size(h) == size)
			return h;
	}
	return NULL;
}

/*
 * Test to determine whether the hugepage is "active/in-use" (i.e. being linked
 * to hstate->hugepage_activelist.)
 *
 * This function can be called for tail pages, but never returns true for them.
 */
bool page_huge_active(struct page *page)
{
	VM_BUG_ON_PAGE(!PageHuge(page), page);
	return PageHead(page) && PagePrivate(&page[1]);
}

/* never called for tail page */
static void set_page_huge_active(struct page *page)
{
	VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
	SetPagePrivate(&page[1]);
}

static void clear_page_huge_active(struct page *page)
{
	VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
	ClearPagePrivate(&page[1]);
}

void free_huge_page(struct page *page)
{
	/*
	 * Can't pass hstate in here because it is called from the
	 * compound page destructor.
	 */
	struct hstate *h = page_hstate(page);
	int nid = page_to_nid(page);
	struct hugepage_subpool *spool =
		(struct hugepage_subpool *)page_private(page);
	bool restore_reserve;

	set_page_private(page, 0);
	page->mapping = NULL;
	BUG_ON(page_count(page));
	BUG_ON(page_mapcount(page));
	restore_reserve = PagePrivate(page);
	ClearPagePrivate(page);

	/*
	 * A return code of zero implies that the subpool will be under its
	 * minimum size if the reservation is not restored after page is free.
	 * Therefore, force restore_reserve operation.
	 */
	if (hugepage_subpool_put_pages(spool, 1) == 0)
		restore_reserve = true;

	spin_lock(&hugetlb_lock);
	clear_page_huge_active(page);
	hugetlb_cgroup_uncharge_page(hstate_index(h),
				     pages_per_huge_page(h), page);
	if (restore_reserve)
		h->resv_huge_pages++;

	if (h->surplus_huge_pages_node[nid]) {
		/* remove the page from active list */
		list_del(&page->lru);
		update_and_free_page(h, page);
		h->surplus_huge_pages--;
		h->surplus_huge_pages_node[nid]--;
	} else {
		arch_clear_hugepage_flags(page);
		enqueue_huge_page(h, page);
	}
	spin_unlock(&hugetlb_lock);
}

static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
{
	INIT_LIST_HEAD(&page->lru);
	set_compound_page_dtor(page, free_huge_page);
	spin_lock(&hugetlb_lock);
	set_hugetlb_cgroup(page, NULL);
	h->nr_huge_pages++;
	h->nr_huge_pages_node[nid]++;
	spin_unlock(&hugetlb_lock);
	put_page(page); /* free it into the hugepage allocator */
}

static void prep_compound_gigantic_page(struct page *page, unsigned long order)
{
	int i;
	int nr_pages = 1 << order;
	struct page *p = page + 1;

	/* we rely on prep_new_huge_page to set the destructor */
	set_compound_order(page, order);
	__SetPageHead(page);
	__ClearPageReserved(page);
	for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
		/*
		 * For gigantic hugepages allocated through bootmem at
		 * boot, it's safer to be consistent with the not-gigantic
		 * hugepages and clear the PG_reserved bit from all tail pages
		 * too.  Otherwise drivers using get_user_pages() to access tail
		 * pages may get the reference counting wrong if they see
		 * PG_reserved set on a tail page (despite the head page not
		 * having PG_reserved set).  Enforcing this consistency between
		 * head and tail pages allows drivers to optimize away a check
		 * on the head page when they need to know if put_page() is
		 * needed after get_user_pages().
		 */
		__ClearPageReserved(p);
		set_page_count(p, 0);
		p->first_page = page;
		/* Make sure p->first_page is always valid for PageTail() */
		smp_wmb();
		__SetPageTail(p);
	}
}

/*
 * PageHuge() only returns true for hugetlbfs pages, but not for normal or
 * transparent huge pages.  See the PageTransHuge() documentation for more
 * details.
 */
int PageHuge(struct page *page)
{
	if (!PageCompound(page))
		return 0;

	page = compound_head(page);
	return get_compound_page_dtor(page) == free_huge_page;
}
EXPORT_SYMBOL_GPL(PageHuge);

/*
 * PageHeadHuge() only returns true for hugetlbfs head page, but not for
 * normal or transparent huge pages.
 */
int PageHeadHuge(struct page *page_head)
{
	if (!PageHead(page_head))
		return 0;

	return get_compound_page_dtor(page_head) == free_huge_page;
}

pgoff_t __basepage_index(struct page *page)
{
	struct page *page_head = compound_head(page);
	pgoff_t index = page_index(page_head);
	unsigned long compound_idx;

	if (!PageHuge(page_head))
		return page_index(page);

	if (compound_order(page_head) >= MAX_ORDER)
		compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
	else
		compound_idx = page - page_head;

	return (index << compound_order(page_head)) + compound_idx;
}

static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
{
	struct page *page;

	page = alloc_pages_exact_node(nid,
		htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
						__GFP_REPEAT|__GFP_NOWARN,
		huge_page_order(h));
	if (page) {
		if (arch_prepare_hugepage(page)) {
			__free_pages(page, huge_page_order(h));
			return NULL;
		}
		prep_new_huge_page(h, page, nid);
	}

	return page;
}

static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
{
	struct page *page;
	int nr_nodes, node;
	int ret = 0;

	for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
		page = alloc_fresh_huge_page_node(h, node);
		if (page) {
			ret = 1;
			break;
		}
	}

	if (ret)
		count_vm_event(HTLB_BUDDY_PGALLOC);
	else
		count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);

	return ret;
}

/*
 * Free huge page from pool from next node to free.
 * Attempt to keep persistent huge pages more or less
 * balanced over allowed nodes.
 * Called with hugetlb_lock locked.
 */
static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
							 bool acct_surplus)
{
	int nr_nodes, node;
	int ret = 0;

	for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
		/*
		 * If we're returning unused surplus pages, only examine
		 * nodes with surplus pages.
		 */
		if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
		    !list_empty(&h->hugepage_freelists[node])) {
			struct page *page =
				list_entry(h->hugepage_freelists[node].next,
					  struct page, lru);
			list_del(&page->lru);
			h->free_huge_pages--;
			h->free_huge_pages_node[node]--;
			if (acct_surplus) {
				h->surplus_huge_pages--;
				h->surplus_huge_pages_node[node]--;
			}
			update_and_free_page(h, page);
			ret = 1;
			break;
		}
	}

	return ret;
}

/*
 * Dissolve a given free hugepage into free buddy pages. This function does
 * nothing for in-use (including surplus) hugepages.
 */
static void dissolve_free_huge_page(struct page *page)
{
	spin_lock(&hugetlb_lock);
	if (PageHuge(page) && !page_count(page)) {
		struct hstate *h = page_hstate(page);
		int nid = page_to_nid(page);
		list_del(&page->lru);
		h->free_huge_pages--;
		h->free_huge_pages_node[nid]--;
		update_and_free_page(h, page);
	}
	spin_unlock(&hugetlb_lock);
}

/*
 * Dissolve free hugepages in a given pfn range. Used by memory hotplug to
 * make specified memory blocks removable from the system.
 * Note that start_pfn should be aligned with (minimum) hugepage size.
 */
void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned int order = 8 * sizeof(void *);
	unsigned long pfn;
	struct hstate *h;

	if (!hugepages_supported())
		return;

	/* Set scan step to minimum hugepage size */
	for_each_hstate(h)
		if (order > huge_page_order(h))
			order = huge_page_order(h);
	VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << order));
	for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order)
		dissolve_free_huge_page(pfn_to_page(pfn));
}

static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
{
	struct page *page;
	unsigned int r_nid;

	if (hstate_is_gigantic(h))
		return NULL;

	/*
	 * Assume we will successfully allocate the surplus page to
	 * prevent racing processes from causing the surplus to exceed
	 * overcommit.
	 *
	 * This however introduces a different race, where a process B
	 * tries to grow the static hugepage pool while alloc_pages() is
	 * called by process A. B will only examine the per-node
	 * counters in determining if surplus huge pages can be
	 * converted to normal huge pages in adjust_pool_surplus(). A
	 * won't be able to increment the per-node counter, until the
	 * lock is dropped by B, but B doesn't drop hugetlb_lock until
	 * no more huge pages can be converted from surplus to normal
	 * state (and doesn't try to convert again). Thus, we have a
	 * case where a surplus huge page exists, the pool is grown, and
	 * the surplus huge page still exists after, even though it
	 * should just have been converted to a normal huge page. This
	 * does not leak memory, though, as the hugepage will be freed
	 * once it is out of use. It also does not allow the counters to
	 * go out of whack in adjust_pool_surplus() as we don't modify
	 * the node values until we've gotten the hugepage and only the
	 * per-node value is checked there.
	 */
	spin_lock(&hugetlb_lock);
	if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
		spin_unlock(&hugetlb_lock);
		return NULL;
	} else {
		h->nr_huge_pages++;
		h->surplus_huge_pages++;
	}
	spin_unlock(&hugetlb_lock);

	if (nid == NUMA_NO_NODE)
		page = alloc_pages(htlb_alloc_mask(h)|__GFP_COMP|
				   __GFP_REPEAT|__GFP_NOWARN,
				   huge_page_order(h));
	else
		page = alloc_pages_exact_node(nid,
			htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
			__GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));

	if (page && arch_prepare_hugepage(page)) {
		__free_pages(page, huge_page_order(h));
		page = NULL;
	}

	spin_lock(&hugetlb_lock);
	if (page) {
		INIT_LIST_HEAD(&page->lru);
		r_nid = page_to_nid(page);
		set_compound_page_dtor(page, free_huge_page);
		set_hugetlb_cgroup(page, NULL);
		/*
		 * We incremented the global counters already
		 */
		h->nr_huge_pages_node[r_nid]++;
		h->surplus_huge_pages_node[r_nid]++;
		__count_vm_event(HTLB_BUDDY_PGALLOC);
	} else {
		h->nr_huge_pages--;
		h->surplus_huge_pages--;
		__count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
	}
	spin_unlock(&hugetlb_lock);

	return page;
}

/*
 * This allocation function is useful in contexts where a vma is irrelevant.
 * E.g. soft-offlining uses this function because it only cares about the
 * physical address of the error page.
 */
struct page *alloc_huge_page_node(struct hstate *h, int nid)
{
	struct page *page = NULL;

	spin_lock(&hugetlb_lock);
	if (h->free_huge_pages - h->resv_huge_pages > 0)
		page = dequeue_huge_page_node(h, nid);
	spin_unlock(&hugetlb_lock);

	if (!page)
		page = alloc_buddy_huge_page(h, nid);

	return page;
}

/*
 * Increase the hugetlb pool such that it can accommodate a reservation
 * of size 'delta'.
 */
static int gather_surplus_pages(struct hstate *h, int delta)
{
	struct list_head surplus_list;
	struct page *page, *tmp;
	int ret, i;
	int needed, allocated;
	bool alloc_ok = true;

	needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
	if (needed <= 0) {
		h->resv_huge_pages += delta;
		return 0;
	}

	allocated = 0;
	INIT_LIST_HEAD(&surplus_list);

	ret = -ENOMEM;
retry:
	spin_unlock(&hugetlb_lock);
	for (i = 0; i < needed; i++) {
		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
		if (!page) {
			alloc_ok = false;
			break;
		}
		list_add(&page->lru, &surplus_list);
	}
	allocated += i;

	/*
	 * After retaking hugetlb_lock, we need to recalculate 'needed'
	 * because either resv_huge_pages or free_huge_pages may have changed.
	 */
	spin_lock(&hugetlb_lock);
	needed = (h->resv_huge_pages + delta) -
			(h->free_huge_pages + allocated);
	if (needed > 0) {
		if (alloc_ok)
			goto retry;
		/*
		 * We were not able to allocate enough pages to
		 * satisfy the entire reservation so we free what
		 * we've allocated so far.
		 */
		goto free;
	}
	/*
	 * The surplus_list now contains _at_least_ the number of extra pages
	 * needed to accommodate the reservation.  Add the appropriate number
	 * of pages to the hugetlb pool and free the extras back to the buddy
	 * allocator.  Commit the entire reservation here to prevent another
	 * process from stealing the pages as they are added to the pool but
	 * before they are reserved.
	 */
	needed += allocated;
	h->resv_huge_pages += delta;
	ret = 0;

	/* Free the needed pages to the hugetlb pool */
	list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
		if ((--needed) < 0)
			break;
		/*
		 * This page is now managed by the hugetlb allocator and has
		 * no users -- drop the buddy allocator's reference.
		 */
		put_page_testzero(page);
		VM_BUG_ON_PAGE(page_count(page), page);
		enqueue_huge_page(h, page);
	}
free:
	spin_unlock(&hugetlb_lock);

	/* Free unnecessary surplus pages to the buddy allocator */
	list_for_each_entry_safe(page, tmp, &surplus_list, lru)
		put_page(page);
	spin_lock(&hugetlb_lock);

	return ret;
}

/*
 * When releasing a hugetlb pool reservation, any surplus pages that were
 * allocated to satisfy the reservation must be explicitly freed if they were
 * never used.
 * Called with hugetlb_lock held.
 */
static void return_unused_surplus_pages(struct hstate *h,
					unsigned long unused_resv_pages)
{
	unsigned long nr_pages;

	/* Uncommit the reservation */
	h->resv_huge_pages -= unused_resv_pages;

	/* Cannot return gigantic pages currently */
	if (hstate_is_gigantic(h))
		return;

	nr_pages = min(unused_resv_pages, h->surplus_huge_pages);

	/*
	 * We want to release as many surplus pages as possible, spread
	 * evenly across all nodes with memory. Iterate across these nodes
	 * until we can no longer free unreserved surplus pages. This occurs
	 * when the nodes with surplus pages have no free pages.
	 * free_pool_huge_page() will balance the freed pages across the
	 * on-line nodes with memory and will handle the hstate accounting.
	 */
	while (nr_pages--) {
		if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
			break;
		cond_resched_lock(&hugetlb_lock);
	}
}
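
/*
 * Editorial note (not from the original source): the usual calling pattern
 * for the two helpers below, as used by alloc_huge_page(), is
 *
 *	chg = vma_needs_reservation(h, vma, addr);
 *	...allocate the page, charging the subpool if chg != 0...
 *	vma_commit_reservation(h, vma, addr);
 *
 * i.e. the region_chg() side prepares (and never commits) the reservation
 * change, and region_add() makes it permanent only once the page has
 * actually been instantiated.
 */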

/*
 * Determine if the huge page at addr within the vma has an associated
 * reservation.  Where it does not we will need to logically increase
 * reservation and actually increase subpool usage before an allocation
 * can occur.  Where any new reservation would be required the
 * reservation change is prepared, but not committed.  Once the page
 * has been allocated from the subpool and instantiated the change should
 * be committed via vma_commit_reservation.  No action is required on
 * failure.
 */
static long vma_needs_reservation(struct hstate *h,
			struct vm_area_struct *vma, unsigned long addr)
{
	struct resv_map *resv;
	pgoff_t idx;
	long chg;

	resv = vma_resv_map(vma);
	if (!resv)
		return 1;

	idx = vma_hugecache_offset(h, vma, addr);
	chg = region_chg(resv, idx, idx + 1);

	if (vma->vm_flags & VM_MAYSHARE)
		return chg;
	else
		return chg < 0 ? chg : 0;
}

static void vma_commit_reservation(struct hstate *h,
			struct vm_area_struct *vma, unsigned long addr)
{
	struct resv_map *resv;
	pgoff_t idx;

	resv = vma_resv_map(vma);
	if (!resv)
		return;

	idx = vma_hugecache_offset(h, vma, addr);
	region_add(resv, idx, idx + 1);
}

static struct page *alloc_huge_page(struct vm_area_struct *vma,
				    unsigned long addr, int avoid_reserve)
{
	struct hugepage_subpool *spool = subpool_vma(vma);
	struct hstate *h = hstate_vma(vma);
	struct page *page;
	long chg;
	int ret, idx;
	struct hugetlb_cgroup *h_cg;

	idx = hstate_index(h);
	/*
	 * Processes that did not create the mapping will have no
	 * reserves and will not have accounted against the subpool
	 * limit.  Check that the subpool limit can be made before
	 * satisfying the allocation.  MAP_NORESERVE mappings may also
	 * need pages and a subpool limit allocated if no reserve
	 * mapping overlaps.
	 */
	chg = vma_needs_reservation(h, vma, addr);
	if (chg < 0)
		return ERR_PTR(-ENOMEM);
	if (chg || avoid_reserve)
		if (hugepage_subpool_get_pages(spool, 1) < 0)
			return ERR_PTR(-ENOSPC);

	ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
	if (ret)
		goto out_subpool_put;

	spin_lock(&hugetlb_lock);
	page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg);
	if (!page) {
		spin_unlock(&hugetlb_lock);
		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
		if (!page)
			goto out_uncharge_cgroup;

		spin_lock(&hugetlb_lock);
		list_move(&page->lru, &h->hugepage_activelist);
		/* Fall through */
	}
	hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
	spin_unlock(&hugetlb_lock);

	set_page_private(page, (unsigned long)spool);

	vma_commit_reservation(h, vma, addr);
	return page;

out_uncharge_cgroup:
	hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
out_subpool_put:
	if (chg || avoid_reserve)
		hugepage_subpool_put_pages(spool, 1);
	return ERR_PTR(-ENOSPC);
}

/*
 * alloc_huge_page()'s wrapper which simply returns the page if allocation
 * succeeds, otherwise NULL. This function is called from new_vma_page(),
 * where no ERR_VALUE is expected to be returned.
 */
struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
				unsigned long addr, int avoid_reserve)
{
	struct page *page = alloc_huge_page(vma, addr, avoid_reserve);
	if (IS_ERR(page))
		page = NULL;
	return page;
}

int __weak alloc_bootmem_huge_page(struct hstate *h)
{
	struct huge_bootmem_page *m;
	int nr_nodes, node;

	for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
		void *addr;

		addr = memblock_virt_alloc_try_nid_nopanic(
				huge_page_size(h), huge_page_size(h),
				0, BOOTMEM_ALLOC_ACCESSIBLE, node);
		if (addr) {
			/*
			 * Use the beginning of the huge page to store the
			 * huge_bootmem_page struct (until gather_bootmem
			 * puts them into the mem_map).
			 */
			m = addr;
			goto found;
		}
	}
	return 0;

found:
	BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h)));
	/* Put them into a private list first because mem_map is not up yet */
	list_add(&m->list, &huge_boot_pages);
	m->hstate = h;
	return 1;
}

static void __init prep_compound_huge_page(struct page *page, int order)
{
	if (unlikely(order > (MAX_ORDER - 1)))
		prep_compound_gigantic_page(page, order);
	else
		prep_compound_page(page, order);
}

/* Put bootmem huge pages into the standard lists after mem_map is up */
static void __init gather_bootmem_prealloc(void)
{
	struct huge_bootmem_page *m;

	list_for_each_entry(m, &huge_boot_pages, list) {
		struct hstate *h = m->hstate;
		struct page *page;

#ifdef CONFIG_HIGHMEM
		page = pfn_to_page(m->phys >> PAGE_SHIFT);
		memblock_free_late(__pa(m),
				   sizeof(struct huge_bootmem_page));
#else
		page = virt_to_page(m);
#endif
		WARN_ON(page_count(page) != 1);
		prep_compound_huge_page(page, h->order);
		WARN_ON(PageReserved(page));
		prep_new_huge_page(h, page, page_to_nid(page));
		/*
		 * If we had gigantic hugepages allocated at boot time, we need
		 * to restore the 'stolen' pages to totalram_pages in order to
		 * fix confusing memory reports from free(1) and other
		 * side effects, like CommitLimit going negative.
		 */
		if (hstate_is_gigantic(h))
			adjust_managed_page_count(page, 1 << h->order);
	}
}

static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
{
	unsigned long i;

	for (i = 0; i < h->max_huge_pages; ++i) {
		if (hstate_is_gigantic(h)) {
			if (!alloc_bootmem_huge_page(h))
				break;
		} else if (!alloc_fresh_huge_page(h,
					 &node_states[N_MEMORY]))
			break;
	}
	h->max_huge_pages = i;
}

static void __init hugetlb_init_hstates(void)
{
	struct hstate *h;

	for_each_hstate(h) {
		/* oversize hugepages were init'ed in early boot */
		if (!hstate_is_gigantic(h))
			hugetlb_hstate_alloc_pages(h);
	}
}

static char * __init memfmt(char *buf, unsigned long n)
{
	if (n >= (1UL << 30))
		sprintf(buf, "%lu GB", n >> 30);
	else if (n >= (1UL << 20))
		sprintf(buf, "%lu MB", n >> 20);
	else
		sprintf(buf, "%lu KB", n >> 10);
	return buf;
}

static void __init report_hugepages(void)
{
	struct hstate *h;

	for_each_hstate(h) {
		char buf[32];
		pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n",
			memfmt(buf, huge_page_size(h)),
			h->free_huge_pages);
	}
}

#ifdef CONFIG_HIGHMEM
static void try_to_free_low(struct hstate *h, unsigned long count,
						nodemask_t *nodes_allowed)
{
	int i;

	if (hstate_is_gigantic(h))
		return;

	for_each_node_mask(i, *nodes_allowed) {
		struct page *page, *next;
		struct list_head *freel = &h->hugepage_freelists[i];
		list_for_each_entry_safe(page, next, freel, lru) {
			if (count >= h->nr_huge_pages)
				return;
			if (PageHighMem(page))
				continue;
			list_del(&page->lru);
			update_and_free_page(h, page);
			h->free_huge_pages--;
			h->free_huge_pages_node[page_to_nid(page)]--;
		}
	}
}
#else
static inline void try_to_free_low(struct hstate *h, unsigned long count,
						nodemask_t *nodes_allowed)
{
}
#endif
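
/*
 * Editorial illustration (not from the original source): with
 * nr_huge_pages = 10 and surplus_huge_pages = 2, persistent_huge_pages(h)
 * below evaluates to 8.  Growing the pool to 9 via set_max_huge_pages()
 * therefore first converts one surplus page into a persistent one through
 * adjust_pool_surplus(h, nodes_allowed, -1) rather than allocating a fresh
 * huge page from the buddy allocator.
 */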

/*
 * Increment or decrement surplus_huge_pages.  Keep node-specific counters
 * balanced by operating on them in a round-robin fashion.
 * Returns 1 if an adjustment was made.
 */
static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
				int delta)
{
	int nr_nodes, node;

	VM_BUG_ON(delta != -1 && delta != 1);

	if (delta < 0) {
		for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
			if (h->surplus_huge_pages_node[node])
				goto found;
		}
	} else {
		for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
			if (h->surplus_huge_pages_node[node] <
					h->nr_huge_pages_node[node])
				goto found;
		}
	}
	return 0;

found:
	h->surplus_huge_pages += delta;
	h->surplus_huge_pages_node[node] += delta;
	return 1;
}

#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
						nodemask_t *nodes_allowed)
{
	unsigned long min_count, ret;

	if (hstate_is_gigantic(h) && !gigantic_page_supported())
		return h->max_huge_pages;

	/*
	 * Increase the pool size
	 * First take pages out of surplus state.  Then make up the
	 * remaining difference by allocating fresh huge pages.
	 *
	 * We might race with alloc_buddy_huge_page() here and be unable
	 * to convert a surplus huge page to a normal huge page. That is
	 * not critical, though, it just means the overall size of the
	 * pool might be one hugepage larger than it needs to be, but
	 * within all the constraints specified by the sysctls.
	 */
	spin_lock(&hugetlb_lock);
	while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
		if (!adjust_pool_surplus(h, nodes_allowed, -1))
			break;
	}

	while (count > persistent_huge_pages(h)) {
		/*
		 * If this allocation races such that we no longer need the
		 * page, free_huge_page will handle it by freeing the page
		 * and reducing the surplus.
		 */
		spin_unlock(&hugetlb_lock);
		if (hstate_is_gigantic(h))
			ret = alloc_fresh_gigantic_page(h, nodes_allowed);
		else
			ret = alloc_fresh_huge_page(h, nodes_allowed);
		spin_lock(&hugetlb_lock);
		if (!ret)
			goto out;

		/* Bail for signals. Probably ctrl-c from user */
		if (signal_pending(current))
			goto out;
	}

	/*
	 * Decrease the pool size
	 * First return free pages to the buddy allocator (being careful
	 * to keep enough around to satisfy reservations).  Then place
	 * pages into surplus state as needed so the pool will shrink
	 * to the desired size as pages become free.
	 *
	 * By placing pages into the surplus state independent of the
	 * overcommit value, we are allowing the surplus pool size to
	 * exceed overcommit. There are few sane options here. Since
	 * alloc_buddy_huge_page() is checking the global counter,
	 * though, we'll note that we're not allowed to exceed surplus
	 * and won't grow the pool anywhere else. Not until one of the
	 * sysctls are changed, or the surplus pages go out of use.
	 */
	min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
	min_count = max(count, min_count);
	try_to_free_low(h, min_count, nodes_allowed);
	while (min_count < persistent_huge_pages(h)) {
		if (!free_pool_huge_page(h, nodes_allowed, 0))
			break;
		cond_resched_lock(&hugetlb_lock);
	}
	while (count < persistent_huge_pages(h)) {
		if (!adjust_pool_surplus(h, nodes_allowed, 1))
			break;
	}
out:
	ret = persistent_huge_pages(h);
	spin_unlock(&hugetlb_lock);
	return ret;
}

#define HSTATE_ATTR_RO(_name) \
	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)

#define HSTATE_ATTR(_name) \
	static struct kobj_attribute _name##_attr = \
		__ATTR(_name, 0644, _name##_show, _name##_store)

static struct kobject *hugepages_kobj;
static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];

static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp);

static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp)
{
	int i;

	for (i = 0; i < HUGE_MAX_HSTATE; i++)
		if (hstate_kobjs[i] == kobj) {
			if (nidp)
				*nidp = NUMA_NO_NODE;
			return &hstates[i];
		}

	return kobj_to_node_hstate(kobj, nidp);
}

static ssize_t nr_hugepages_show_common(struct kobject *kobj,
					struct kobj_attribute *attr, char *buf)
{
	struct hstate *h;
	unsigned long nr_huge_pages;
	int nid;

	h = kobj_to_hstate(kobj, &nid);
	if (nid == NUMA_NO_NODE)
		nr_huge_pages = h->nr_huge_pages;
	else
		nr_huge_pages = h->nr_huge_pages_node[nid];

	return sprintf(buf, "%lu\n", nr_huge_pages);
}

static ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
					   struct hstate *h, int nid,
					   unsigned long count, size_t len)
{
	int err;
	NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);

	if (hstate_is_gigantic(h) && !gigantic_page_supported()) {
		err = -EINVAL;
		goto out;
	}

	if (nid == NUMA_NO_NODE) {
		/*
		 * global hstate attribute
		 */
		if (!(obey_mempolicy &&
				init_nodemask_of_mempolicy(nodes_allowed))) {
			NODEMASK_FREE(nodes_allowed);
			nodes_allowed = &node_states[N_MEMORY];
		}
	} else if (nodes_allowed) {
		/*
		 * per node hstate attribute: adjust count to global,
		 * but restrict alloc/free to the specified node.
		 */
		count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
		init_nodemask_of_node(nodes_allowed, nid);
	} else
		nodes_allowed = &node_states[N_MEMORY];

	h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed);

	if (nodes_allowed != &node_states[N_MEMORY])
		NODEMASK_FREE(nodes_allowed);

	return len;
out:
	NODEMASK_FREE(nodes_allowed);
	return err;
}

static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
					 struct kobject *kobj, const char *buf,
					 size_t len)
{
	struct hstate *h;
	unsigned long count;
	int nid;
	int err;

	err = kstrtoul(buf, 10, &count);
	if (err)
		return err;

	h = kobj_to_hstate(kobj, &nid);
	return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len);
}

static ssize_t nr_hugepages_show(struct kobject *kobj,
				 struct kobj_attribute *attr, char *buf)
{
	return nr_hugepages_show_common(kobj, attr, buf);
}

static ssize_t nr_hugepages_store(struct kobject *kobj,
		struct kobj_attribute *attr, const char *buf, size_t len)
{
	return nr_hugepages_store_common(false, kobj, buf, len);
}
HSTATE_ATTR(nr_hugepages);

#ifdef CONFIG_NUMA

/*
 * hstate attribute for optionally mempolicy-based constraint on persistent
 * huge page alloc/free.
 */
static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
				struct kobj_attribute *attr, char *buf)
{
	return nr_hugepages_show_common(kobj, attr, buf);
}

static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
		struct kobj_attribute *attr, const char *buf, size_t len)
{
	return nr_hugepages_store_common(true, kobj, buf, len);
}
HSTATE_ATTR(nr_hugepages_mempolicy);
#endif


static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
					struct kobj_attribute *attr, char *buf)
{
	struct hstate *h = kobj_to_hstate(kobj, NULL);
	return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
}

static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
		struct kobj_attribute *attr, const char *buf, size_t count)
{
	int err;
	unsigned long input;
	struct hstate *h = kobj_to_hstate(kobj, NULL);

	if (hstate_is_gigantic(h))
		return -EINVAL;

	err = kstrtoul(buf, 10, &input);
	if (err)
		return err;

	spin_lock(&hugetlb_lock);
	h->nr_overcommit_huge_pages = input;
	spin_unlock(&hugetlb_lock);

	return count;
}
HSTATE_ATTR(nr_overcommit_hugepages);

static ssize_t free_hugepages_show(struct kobject *kobj,
					struct kobj_attribute *attr, char *buf)
{
	struct hstate *h;
	unsigned long free_huge_pages;
	int nid;

	h = kobj_to_hstate(kobj, &nid);
	if (nid == NUMA_NO_NODE)
		free_huge_pages = h->free_huge_pages;
	else
		free_huge_pages = h->free_huge_pages_node[nid];

	return sprintf(buf, "%lu\n", free_huge_pages);
}
HSTATE_ATTR_RO(free_hugepages);

static ssize_t resv_hugepages_show(struct kobject *kobj,
					struct kobj_attribute *attr, char *buf)
{
	struct hstate *h = kobj_to_hstate(kobj, NULL);
	return sprintf(buf, "%lu\n", h->resv_hugepages);
}
HSTATE_ATTR_RO(resv_hugepages);

static ssize_t surplus_hugepages_show(struct kobject *kobj,
					struct kobj_attribute *attr, char *buf)
{
	struct hstate *h;
	unsigned long surplus_huge_pages;
	int nid;

	h = kobj_to_hstate(kobj, &nid);
	if (nid == NUMA_NO_NODE)
		surplus_huge_pages = h->surplus_huge_pages;
	else
		surplus_huge_pages = h->surplus_huge_pages_node[nid];

	return sprintf(buf, "%lu\n", surplus_huge_pages);
}
HSTATE_ATTR_RO(surplus_hugepages);
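
/*
 * Editorial note (not from the original source): the attributes above are
 * exposed by hugetlb_sysfs_init() below under /sys/kernel/mm/hugepages/ in a
 * per-hstate directory named after h->name, e.g. (on x86_64 with 2 MB huge
 * pages; exact path assumed for illustration):
 *
 *	echo 64 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
 *	cat /sys/kernel/mm/hugepages/hugepages-2048kB/free_hugepages
 */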
1993 struct hstate *h; 1994 unsigned long surplus_huge_pages; 1995 int nid; 1996 1997 h = kobj_to_hstate(kobj, &nid); 1998 if (nid == NUMA_NO_NODE) 1999 surplus_huge_pages = h->surplus_huge_pages; 2000 else 2001 surplus_huge_pages = h->surplus_huge_pages_node[nid]; 2002 2003 return sprintf(buf, "%lu\n", surplus_huge_pages); 2004 } 2005 HSTATE_ATTR_RO(surplus_hugepages); 2006 2007 static struct attribute *hstate_attrs[] = { 2008 &nr_hugepages_attr.attr, 2009 &nr_overcommit_hugepages_attr.attr, 2010 &free_hugepages_attr.attr, 2011 &resv_hugepages_attr.attr, 2012 &surplus_hugepages_attr.attr, 2013 #ifdef CONFIG_NUMA 2014 &nr_hugepages_mempolicy_attr.attr, 2015 #endif 2016 NULL, 2017 }; 2018 2019 static struct attribute_group hstate_attr_group = { 2020 .attrs = hstate_attrs, 2021 }; 2022 2023 static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, 2024 struct kobject **hstate_kobjs, 2025 struct attribute_group *hstate_attr_group) 2026 { 2027 int retval; 2028 int hi = hstate_index(h); 2029 2030 hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); 2031 if (!hstate_kobjs[hi]) 2032 return -ENOMEM; 2033 2034 retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group); 2035 if (retval) 2036 kobject_put(hstate_kobjs[hi]); 2037 2038 return retval; 2039 } 2040 2041 static void __init hugetlb_sysfs_init(void) 2042 { 2043 struct hstate *h; 2044 int err; 2045 2046 hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj); 2047 if (!hugepages_kobj) 2048 return; 2049 2050 for_each_hstate(h) { 2051 err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, 2052 hstate_kobjs, &hstate_attr_group); 2053 if (err) 2054 pr_err("Hugetlb: Unable to add hstate %s", h->name); 2055 } 2056 } 2057 2058 #ifdef CONFIG_NUMA 2059 2060 /* 2061 * node_hstate/s - associate per node hstate attributes, via their kobjects, 2062 * with node devices in node_devices[] using a parallel array. The array 2063 * index of a node device or _hstate == node id. 2064 * This is here to avoid any static dependency of the node device driver, in 2065 * the base kernel, on the hugetlb module. 2066 */ 2067 struct node_hstate { 2068 struct kobject *hugepages_kobj; 2069 struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; 2070 }; 2071 struct node_hstate node_hstates[MAX_NUMNODES]; 2072 2073 /* 2074 * A subset of global hstate attributes for node devices 2075 */ 2076 static struct attribute *per_node_hstate_attrs[] = { 2077 &nr_hugepages_attr.attr, 2078 &free_hugepages_attr.attr, 2079 &surplus_hugepages_attr.attr, 2080 NULL, 2081 }; 2082 2083 static struct attribute_group per_node_hstate_attr_group = { 2084 .attrs = per_node_hstate_attrs, 2085 }; 2086 2087 /* 2088 * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj. 2089 * Returns node id via non-NULL nidp. 2090 */ 2091 static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) 2092 { 2093 int nid; 2094 2095 for (nid = 0; nid < nr_node_ids; nid++) { 2096 struct node_hstate *nhs = &node_hstates[nid]; 2097 int i; 2098 for (i = 0; i < HUGE_MAX_HSTATE; i++) 2099 if (nhs->hstate_kobjs[i] == kobj) { 2100 if (nidp) 2101 *nidp = nid; 2102 return &hstates[i]; 2103 } 2104 } 2105 2106 BUG(); 2107 return NULL; 2108 } 2109 2110 /* 2111 * Unregister hstate attributes from a single node device. 2112 * No-op if no hstate attributes attached. 
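 * Dropping the last reference to each per-hstate kobject removes its
 * sysfs directory; dropping hugepages_kobj then removes the node's
 * "hugepages" directory itself.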
2113 */ 2114 static void hugetlb_unregister_node(struct node *node) 2115 { 2116 struct hstate *h; 2117 struct node_hstate *nhs = &node_hstates[node->dev.id]; 2118 2119 if (!nhs->hugepages_kobj) 2120 return; /* no hstate attributes */ 2121 2122 for_each_hstate(h) { 2123 int idx = hstate_index(h); 2124 if (nhs->hstate_kobjs[idx]) { 2125 kobject_put(nhs->hstate_kobjs[idx]); 2126 nhs->hstate_kobjs[idx] = NULL; 2127 } 2128 } 2129 2130 kobject_put(nhs->hugepages_kobj); 2131 nhs->hugepages_kobj = NULL; 2132 } 2133 2134 /* 2135 * hugetlb module exit: unregister hstate attributes from node devices 2136 * that have them. 2137 */ 2138 static void hugetlb_unregister_all_nodes(void) 2139 { 2140 int nid; 2141 2142 /* 2143 * disable node device registrations. 2144 */ 2145 register_hugetlbfs_with_node(NULL, NULL); 2146 2147 /* 2148 * remove hstate attributes from any nodes that have them. 2149 */ 2150 for (nid = 0; nid < nr_node_ids; nid++) 2151 hugetlb_unregister_node(node_devices[nid]); 2152 } 2153 2154 /* 2155 * Register hstate attributes for a single node device. 2156 * No-op if attributes already registered. 2157 */ 2158 static void hugetlb_register_node(struct node *node) 2159 { 2160 struct hstate *h; 2161 struct node_hstate *nhs = &node_hstates[node->dev.id]; 2162 int err; 2163 2164 if (nhs->hugepages_kobj) 2165 return; /* already allocated */ 2166 2167 nhs->hugepages_kobj = kobject_create_and_add("hugepages", 2168 &node->dev.kobj); 2169 if (!nhs->hugepages_kobj) 2170 return; 2171 2172 for_each_hstate(h) { 2173 err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj, 2174 nhs->hstate_kobjs, 2175 &per_node_hstate_attr_group); 2176 if (err) { 2177 pr_err("Hugetlb: Unable to add hstate %s for node %d\n", 2178 h->name, node->dev.id); 2179 hugetlb_unregister_node(node); 2180 break; 2181 } 2182 } 2183 } 2184 2185 /* 2186 * hugetlb init time: register hstate attributes for all registered node 2187 * devices of nodes that have memory. All on-line nodes should have 2188 * registered their associated device by this time. 2189 */ 2190 static void __init hugetlb_register_all_nodes(void) 2191 { 2192 int nid; 2193 2194 for_each_node_state(nid, N_MEMORY) { 2195 struct node *node = node_devices[nid]; 2196 if (node->dev.id == nid) 2197 hugetlb_register_node(node); 2198 } 2199 2200 /* 2201 * Let the node device driver know we're here so it can 2202 * [un]register hstate attributes on node hotplug. 
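 * With these callbacks installed, the node driver calls
 * hugetlb_register_node() / hugetlb_unregister_node() as nodes come
 * and go, keeping /sys/devices/system/node/nodeN/hugepages/ in sync.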
2203 */ 2204 register_hugetlbfs_with_node(hugetlb_register_node, 2205 hugetlb_unregister_node); 2206 } 2207 #else /* !CONFIG_NUMA */ 2208 2209 static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) 2210 { 2211 BUG(); 2212 if (nidp) 2213 *nidp = -1; 2214 return NULL; 2215 } 2216 2217 static void hugetlb_unregister_all_nodes(void) { } 2218 2219 static void hugetlb_register_all_nodes(void) { } 2220 2221 #endif 2222 2223 static void __exit hugetlb_exit(void) 2224 { 2225 struct hstate *h; 2226 2227 hugetlb_unregister_all_nodes(); 2228 2229 for_each_hstate(h) { 2230 kobject_put(hstate_kobjs[hstate_index(h)]); 2231 } 2232 2233 kobject_put(hugepages_kobj); 2234 kfree(htlb_fault_mutex_table); 2235 } 2236 module_exit(hugetlb_exit); 2237 2238 static int __init hugetlb_init(void) 2239 { 2240 int i; 2241 2242 if (!hugepages_supported()) 2243 return 0; 2244 2245 if (!size_to_hstate(default_hstate_size)) { 2246 default_hstate_size = HPAGE_SIZE; 2247 if (!size_to_hstate(default_hstate_size)) 2248 hugetlb_add_hstate(HUGETLB_PAGE_ORDER); 2249 } 2250 default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size)); 2251 if (default_hstate_max_huge_pages) 2252 default_hstate.max_huge_pages = default_hstate_max_huge_pages; 2253 2254 hugetlb_init_hstates(); 2255 gather_bootmem_prealloc(); 2256 report_hugepages(); 2257 2258 hugetlb_sysfs_init(); 2259 hugetlb_register_all_nodes(); 2260 hugetlb_cgroup_file_init(); 2261 2262 #ifdef CONFIG_SMP 2263 num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus()); 2264 #else 2265 num_fault_mutexes = 1; 2266 #endif 2267 htlb_fault_mutex_table = 2268 kmalloc(sizeof(struct mutex) * num_fault_mutexes, GFP_KERNEL); 2269 BUG_ON(!htlb_fault_mutex_table); 2270 2271 for (i = 0; i < num_fault_mutexes; i++) 2272 mutex_init(&htlb_fault_mutex_table[i]); 2273 return 0; 2274 } 2275 module_init(hugetlb_init); 2276 2277 /* Should be called on processing a hugepagesz=... option */ 2278 void __init hugetlb_add_hstate(unsigned order) 2279 { 2280 struct hstate *h; 2281 unsigned long i; 2282 2283 if (size_to_hstate(PAGE_SIZE << order)) { 2284 pr_warning("hugepagesz= specified twice, ignoring\n"); 2285 return; 2286 } 2287 BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); 2288 BUG_ON(order == 0); 2289 h = &hstates[hugetlb_max_hstate++]; 2290 h->order = order; 2291 h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); 2292 h->nr_huge_pages = 0; 2293 h->free_huge_pages = 0; 2294 for (i = 0; i < MAX_NUMNODES; ++i) 2295 INIT_LIST_HEAD(&h->hugepage_freelists[i]); 2296 INIT_LIST_HEAD(&h->hugepage_activelist); 2297 h->next_nid_to_alloc = first_node(node_states[N_MEMORY]); 2298 h->next_nid_to_free = first_node(node_states[N_MEMORY]); 2299 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", 2300 huge_page_size(h)/1024); 2301 2302 parsed_hstate = h; 2303 } 2304 2305 static int __init hugetlb_nrpages_setup(char *s) 2306 { 2307 unsigned long *mhp; 2308 static unsigned long *last_mhp; 2309 2310 /* 2311 * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet, 2312 * so this hugepages= parameter goes to the "default hstate". 2313 */ 2314 if (!hugetlb_max_hstate) 2315 mhp = &default_hstate_max_huge_pages; 2316 else 2317 mhp = &parsed_hstate->max_huge_pages; 2318 2319 if (mhp == last_mhp) { 2320 pr_warning("hugepages= specified twice without " 2321 "interleaving hugepagesz=, ignoring\n"); 2322 return 1; 2323 } 2324 2325 if (sscanf(s, "%lu", mhp) <= 0) 2326 *mhp = 0; 2327 2328 /* 2329 * Global state is always initialized later in hugetlb_init. 
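 * For example, "hugepagesz=1G hugepages=4" selects parsed_hstate via
 * the hugepagesz= handler and then stores 4 in its max_huge_pages
 * here, while a bare "hugepages=64" applies to the default hstate.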
2330 * But we need to allocate >= MAX_ORDER hstates here early to still 2331 * use the bootmem allocator. 2332 */ 2333 if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER) 2334 hugetlb_hstate_alloc_pages(parsed_hstate); 2335 2336 last_mhp = mhp; 2337 2338 return 1; 2339 } 2340 __setup("hugepages=", hugetlb_nrpages_setup); 2341 2342 static int __init hugetlb_default_setup(char *s) 2343 { 2344 default_hstate_size = memparse(s, &s); 2345 return 1; 2346 } 2347 __setup("default_hugepagesz=", hugetlb_default_setup); 2348 2349 static unsigned int cpuset_mems_nr(unsigned int *array) 2350 { 2351 int node; 2352 unsigned int nr = 0; 2353 2354 for_each_node_mask(node, cpuset_current_mems_allowed) 2355 nr += array[node]; 2356 2357 return nr; 2358 } 2359 2360 #ifdef CONFIG_SYSCTL 2361 static int hugetlb_sysctl_handler_common(bool obey_mempolicy, 2362 struct ctl_table *table, int write, 2363 void __user *buffer, size_t *length, loff_t *ppos) 2364 { 2365 struct hstate *h = &default_hstate; 2366 unsigned long tmp = h->max_huge_pages; 2367 int ret; 2368 2369 if (!hugepages_supported()) 2370 return -ENOTSUPP; 2371 2372 table->data = &tmp; 2373 table->maxlen = sizeof(unsigned long); 2374 ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); 2375 if (ret) 2376 goto out; 2377 2378 if (write) 2379 ret = __nr_hugepages_store_common(obey_mempolicy, h, 2380 NUMA_NO_NODE, tmp, *length); 2381 out: 2382 return ret; 2383 } 2384 2385 int hugetlb_sysctl_handler(struct ctl_table *table, int write, 2386 void __user *buffer, size_t *length, loff_t *ppos) 2387 { 2388 2389 return hugetlb_sysctl_handler_common(false, table, write, 2390 buffer, length, ppos); 2391 } 2392 2393 #ifdef CONFIG_NUMA 2394 int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, 2395 void __user *buffer, size_t *length, loff_t *ppos) 2396 { 2397 return hugetlb_sysctl_handler_common(true, table, write, 2398 buffer, length, ppos); 2399 } 2400 #endif /* CONFIG_NUMA */ 2401 2402 int hugetlb_overcommit_handler(struct ctl_table *table, int write, 2403 void __user *buffer, 2404 size_t *length, loff_t *ppos) 2405 { 2406 struct hstate *h = &default_hstate; 2407 unsigned long tmp; 2408 int ret; 2409 2410 if (!hugepages_supported()) 2411 return -ENOTSUPP; 2412 2413 tmp = h->nr_overcommit_huge_pages; 2414 2415 if (write && hstate_is_gigantic(h)) 2416 return -EINVAL; 2417 2418 table->data = &tmp; 2419 table->maxlen = sizeof(unsigned long); 2420 ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); 2421 if (ret) 2422 goto out; 2423 2424 if (write) { 2425 spin_lock(&hugetlb_lock); 2426 h->nr_overcommit_huge_pages = tmp; 2427 spin_unlock(&hugetlb_lock); 2428 } 2429 out: 2430 return ret; 2431 } 2432 2433 #endif /* CONFIG_SYSCTL */ 2434 2435 void hugetlb_report_meminfo(struct seq_file *m) 2436 { 2437 struct hstate *h = &default_hstate; 2438 if (!hugepages_supported()) 2439 return; 2440 seq_printf(m, 2441 "HugePages_Total: %5lu\n" 2442 "HugePages_Free: %5lu\n" 2443 "HugePages_Rsvd: %5lu\n" 2444 "HugePages_Surp: %5lu\n" 2445 "Hugepagesize: %8lu kB\n", 2446 h->nr_huge_pages, 2447 h->free_huge_pages, 2448 h->resv_huge_pages, 2449 h->surplus_huge_pages, 2450 1UL << (huge_page_order(h) + PAGE_SHIFT - 10)); 2451 } 2452 2453 int hugetlb_report_node_meminfo(int nid, char *buf) 2454 { 2455 struct hstate *h = &default_hstate; 2456 if (!hugepages_supported()) 2457 return 0; 2458 return sprintf(buf, 2459 "Node %d HugePages_Total: %5u\n" 2460 "Node %d HugePages_Free: %5u\n" 2461 "Node %d HugePages_Surp: %5u\n", 2462 nid, 
h->nr_huge_pages_node[nid], 2463 nid, h->free_huge_pages_node[nid], 2464 nid, h->surplus_huge_pages_node[nid]); 2465 } 2466 2467 void hugetlb_show_meminfo(void) 2468 { 2469 struct hstate *h; 2470 int nid; 2471 2472 if (!hugepages_supported()) 2473 return; 2474 2475 for_each_node_state(nid, N_MEMORY) 2476 for_each_hstate(h) 2477 pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n", 2478 nid, 2479 h->nr_huge_pages_node[nid], 2480 h->free_huge_pages_node[nid], 2481 h->surplus_huge_pages_node[nid], 2482 1UL << (huge_page_order(h) + PAGE_SHIFT - 10)); 2483 } 2484 2485 /* Return the number pages of memory we physically have, in PAGE_SIZE units. */ 2486 unsigned long hugetlb_total_pages(void) 2487 { 2488 struct hstate *h; 2489 unsigned long nr_total_pages = 0; 2490 2491 for_each_hstate(h) 2492 nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h); 2493 return nr_total_pages; 2494 } 2495 2496 static int hugetlb_acct_memory(struct hstate *h, long delta) 2497 { 2498 int ret = -ENOMEM; 2499 2500 spin_lock(&hugetlb_lock); 2501 /* 2502 * When cpuset is configured, it breaks the strict hugetlb page 2503 * reservation as the accounting is done on a global variable. Such 2504 * reservation is completely rubbish in the presence of cpuset because 2505 * the reservation is not checked against page availability for the 2506 * current cpuset. Application can still potentially OOM'ed by kernel 2507 * with lack of free htlb page in cpuset that the task is in. 2508 * Attempt to enforce strict accounting with cpuset is almost 2509 * impossible (or too ugly) because cpuset is too fluid that 2510 * task or memory node can be dynamically moved between cpusets. 2511 * 2512 * The change of semantics for shared hugetlb mapping with cpuset is 2513 * undesirable. However, in order to preserve some of the semantics, 2514 * we fall back to check against current free page availability as 2515 * a best attempt and hopefully to minimize the impact of changing 2516 * semantics that cpuset has. 2517 */ 2518 if (delta > 0) { 2519 if (gather_surplus_pages(h, delta) < 0) 2520 goto out; 2521 2522 if (delta > cpuset_mems_nr(h->free_huge_pages_node)) { 2523 return_unused_surplus_pages(h, delta); 2524 goto out; 2525 } 2526 } 2527 2528 ret = 0; 2529 if (delta < 0) 2530 return_unused_surplus_pages(h, (unsigned long) -delta); 2531 2532 out: 2533 spin_unlock(&hugetlb_lock); 2534 return ret; 2535 } 2536 2537 static void hugetlb_vm_op_open(struct vm_area_struct *vma) 2538 { 2539 struct resv_map *resv = vma_resv_map(vma); 2540 2541 /* 2542 * This new VMA should share its siblings reservation map if present. 2543 * The VMA will only ever have a valid reservation map pointer where 2544 * it is being copied for another still existing VMA. As that VMA 2545 * has a reference to the reservation map it cannot disappear until 2546 * after this open call completes. It is therefore safe to take a 2547 * new reference here without additional locking. 
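 * Only the reservation owner takes the reference (the
 * HPAGE_RESV_OWNER check below); the matching kref_put() is done in
 * hugetlb_vm_op_close().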
2548 */ 2549 if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) 2550 kref_get(&resv->refs); 2551 } 2552 2553 static void hugetlb_vm_op_close(struct vm_area_struct *vma) 2554 { 2555 struct hstate *h = hstate_vma(vma); 2556 struct resv_map *resv = vma_resv_map(vma); 2557 struct hugepage_subpool *spool = subpool_vma(vma); 2558 unsigned long reserve, start, end; 2559 long gbl_reserve; 2560 2561 if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) 2562 return; 2563 2564 start = vma_hugecache_offset(h, vma, vma->vm_start); 2565 end = vma_hugecache_offset(h, vma, vma->vm_end); 2566 2567 reserve = (end - start) - region_count(resv, start, end); 2568 2569 kref_put(&resv->refs, resv_map_release); 2570 2571 if (reserve) { 2572 /* 2573 * Decrement reserve counts. The global reserve count may be 2574 * adjusted if the subpool has a minimum size. 2575 */ 2576 gbl_reserve = hugepage_subpool_put_pages(spool, reserve); 2577 hugetlb_acct_memory(h, -gbl_reserve); 2578 } 2579 } 2580 2581 /* 2582 * We cannot handle pagefaults against hugetlb pages at all. They cause 2583 * handle_mm_fault() to try to instantiate regular-sized pages in the 2584 * hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get 2585 * this far. 2586 */ 2587 static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 2588 { 2589 BUG(); 2590 return 0; 2591 } 2592 2593 const struct vm_operations_struct hugetlb_vm_ops = { 2594 .fault = hugetlb_vm_op_fault, 2595 .open = hugetlb_vm_op_open, 2596 .close = hugetlb_vm_op_close, 2597 }; 2598 2599 static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, 2600 int writable) 2601 { 2602 pte_t entry; 2603 2604 if (writable) { 2605 entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page, 2606 vma->vm_page_prot))); 2607 } else { 2608 entry = huge_pte_wrprotect(mk_huge_pte(page, 2609 vma->vm_page_prot)); 2610 } 2611 entry = pte_mkyoung(entry); 2612 entry = pte_mkhuge(entry); 2613 entry = arch_make_huge_pte(entry, vma, page, writable); 2614 2615 return entry; 2616 } 2617 2618 static void set_huge_ptep_writable(struct vm_area_struct *vma, 2619 unsigned long address, pte_t *ptep) 2620 { 2621 pte_t entry; 2622 2623 entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep))); 2624 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) 2625 update_mmu_cache(vma, address, ptep); 2626 } 2627 2628 static int is_hugetlb_entry_migration(pte_t pte) 2629 { 2630 swp_entry_t swp; 2631 2632 if (huge_pte_none(pte) || pte_present(pte)) 2633 return 0; 2634 swp = pte_to_swp_entry(pte); 2635 if (non_swap_entry(swp) && is_migration_entry(swp)) 2636 return 1; 2637 else 2638 return 0; 2639 } 2640 2641 static int is_hugetlb_entry_hwpoisoned(pte_t pte) 2642 { 2643 swp_entry_t swp; 2644 2645 if (huge_pte_none(pte) || pte_present(pte)) 2646 return 0; 2647 swp = pte_to_swp_entry(pte); 2648 if (non_swap_entry(swp) && is_hwpoison_entry(swp)) 2649 return 1; 2650 else 2651 return 0; 2652 } 2653 2654 int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, 2655 struct vm_area_struct *vma) 2656 { 2657 pte_t *src_pte, *dst_pte, entry; 2658 struct page *ptepage; 2659 unsigned long addr; 2660 int cow; 2661 struct hstate *h = hstate_vma(vma); 2662 unsigned long sz = huge_page_size(h); 2663 unsigned long mmun_start; /* For mmu_notifiers */ 2664 unsigned long mmun_end; /* For mmu_notifiers */ 2665 int ret = 0; 2666 2667 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 2668 2669 mmun_start = vma->vm_start; 2670 mmun_end = vma->vm_end; 2671 if 
(cow) 2672 mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end); 2673 2674 for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { 2675 spinlock_t *src_ptl, *dst_ptl; 2676 src_pte = huge_pte_offset(src, addr); 2677 if (!src_pte) 2678 continue; 2679 dst_pte = huge_pte_alloc(dst, addr, sz); 2680 if (!dst_pte) { 2681 ret = -ENOMEM; 2682 break; 2683 } 2684 2685 /* If the pagetables are shared don't copy or take references */ 2686 if (dst_pte == src_pte) 2687 continue; 2688 2689 dst_ptl = huge_pte_lock(h, dst, dst_pte); 2690 src_ptl = huge_pte_lockptr(h, src, src_pte); 2691 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 2692 entry = huge_ptep_get(src_pte); 2693 if (huge_pte_none(entry)) { /* skip none entry */ 2694 ; 2695 } else if (unlikely(is_hugetlb_entry_migration(entry) || 2696 is_hugetlb_entry_hwpoisoned(entry))) { 2697 swp_entry_t swp_entry = pte_to_swp_entry(entry); 2698 2699 if (is_write_migration_entry(swp_entry) && cow) { 2700 /* 2701 * COW mappings require pages in both 2702 * parent and child to be set to read. 2703 */ 2704 make_migration_entry_read(&swp_entry); 2705 entry = swp_entry_to_pte(swp_entry); 2706 set_huge_pte_at(src, addr, src_pte, entry); 2707 } 2708 set_huge_pte_at(dst, addr, dst_pte, entry); 2709 } else { 2710 if (cow) { 2711 huge_ptep_set_wrprotect(src, addr, src_pte); 2712 mmu_notifier_invalidate_range(src, mmun_start, 2713 mmun_end); 2714 } 2715 entry = huge_ptep_get(src_pte); 2716 ptepage = pte_page(entry); 2717 get_page(ptepage); 2718 page_dup_rmap(ptepage); 2719 set_huge_pte_at(dst, addr, dst_pte, entry); 2720 } 2721 spin_unlock(src_ptl); 2722 spin_unlock(dst_ptl); 2723 } 2724 2725 if (cow) 2726 mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end); 2727 2728 return ret; 2729 } 2730 2731 void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, 2732 unsigned long start, unsigned long end, 2733 struct page *ref_page) 2734 { 2735 int force_flush = 0; 2736 struct mm_struct *mm = vma->vm_mm; 2737 unsigned long address; 2738 pte_t *ptep; 2739 pte_t pte; 2740 spinlock_t *ptl; 2741 struct page *page; 2742 struct hstate *h = hstate_vma(vma); 2743 unsigned long sz = huge_page_size(h); 2744 const unsigned long mmun_start = start; /* For mmu_notifiers */ 2745 const unsigned long mmun_end = end; /* For mmu_notifiers */ 2746 2747 WARN_ON(!is_vm_hugetlb_page(vma)); 2748 BUG_ON(start & ~huge_page_mask(h)); 2749 BUG_ON(end & ~huge_page_mask(h)); 2750 2751 tlb_start_vma(tlb, vma); 2752 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 2753 address = start; 2754 again: 2755 for (; address < end; address += sz) { 2756 ptep = huge_pte_offset(mm, address); 2757 if (!ptep) 2758 continue; 2759 2760 ptl = huge_pte_lock(h, mm, ptep); 2761 if (huge_pmd_unshare(mm, &address, ptep)) 2762 goto unlock; 2763 2764 pte = huge_ptep_get(ptep); 2765 if (huge_pte_none(pte)) 2766 goto unlock; 2767 2768 /* 2769 * Migrating hugepage or HWPoisoned hugepage is already 2770 * unmapped and its refcount is dropped, so just clear pte here. 2771 */ 2772 if (unlikely(!pte_present(pte))) { 2773 huge_pte_clear(mm, address, ptep); 2774 goto unlock; 2775 } 2776 2777 page = pte_page(pte); 2778 /* 2779 * If a reference page is supplied, it is because a specific 2780 * page is being unmapped, not a range. Ensure the page we 2781 * are about to unmap is the actual page of interest. 
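 * (unmap_ref_private() is the caller that supplies a ref_page, when a
 * failed private COW requires the page to be unmapped from the other
 * VMAs mapping it.)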
2782 */ 2783 if (ref_page) { 2784 if (page != ref_page) 2785 goto unlock; 2786 2787 /* 2788 * Mark the VMA as having unmapped its page so that 2789 * future faults in this VMA will fail rather than 2790 * looking like data was lost 2791 */ 2792 set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED); 2793 } 2794 2795 pte = huge_ptep_get_and_clear(mm, address, ptep); 2796 tlb_remove_tlb_entry(tlb, ptep, address); 2797 if (huge_pte_dirty(pte)) 2798 set_page_dirty(page); 2799 2800 page_remove_rmap(page); 2801 force_flush = !__tlb_remove_page(tlb, page); 2802 if (force_flush) { 2803 address += sz; 2804 spin_unlock(ptl); 2805 break; 2806 } 2807 /* Bail out after unmapping reference page if supplied */ 2808 if (ref_page) { 2809 spin_unlock(ptl); 2810 break; 2811 } 2812 unlock: 2813 spin_unlock(ptl); 2814 } 2815 /* 2816 * mmu_gather ran out of room to batch pages, we break out of 2817 * the PTE lock to avoid doing the potential expensive TLB invalidate 2818 * and page-free while holding it. 2819 */ 2820 if (force_flush) { 2821 force_flush = 0; 2822 tlb_flush_mmu(tlb); 2823 if (address < end && !ref_page) 2824 goto again; 2825 } 2826 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2827 tlb_end_vma(tlb, vma); 2828 } 2829 2830 void __unmap_hugepage_range_final(struct mmu_gather *tlb, 2831 struct vm_area_struct *vma, unsigned long start, 2832 unsigned long end, struct page *ref_page) 2833 { 2834 __unmap_hugepage_range(tlb, vma, start, end, ref_page); 2835 2836 /* 2837 * Clear this flag so that x86's huge_pmd_share page_table_shareable 2838 * test will fail on a vma being torn down, and not grab a page table 2839 * on its way out. We're lucky that the flag has such an appropriate 2840 * name, and can in fact be safely cleared here. We could clear it 2841 * before the __unmap_hugepage_range above, but all that's necessary 2842 * is to clear it before releasing the i_mmap_rwsem. This works 2843 * because in the context this is called, the VMA is about to be 2844 * destroyed and the i_mmap_rwsem is held. 2845 */ 2846 vma->vm_flags &= ~VM_MAYSHARE; 2847 } 2848 2849 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 2850 unsigned long end, struct page *ref_page) 2851 { 2852 struct mm_struct *mm; 2853 struct mmu_gather tlb; 2854 2855 mm = vma->vm_mm; 2856 2857 tlb_gather_mmu(&tlb, mm, start, end); 2858 __unmap_hugepage_range(&tlb, vma, start, end, ref_page); 2859 tlb_finish_mmu(&tlb, start, end); 2860 } 2861 2862 /* 2863 * This is called when the original mapper is failing to COW a MAP_PRIVATE 2864 * mappping it owns the reserve page for. The intention is to unmap the page 2865 * from other VMAs and let the children be SIGKILLed if they are faulting the 2866 * same region. 2867 */ 2868 static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, 2869 struct page *page, unsigned long address) 2870 { 2871 struct hstate *h = hstate_vma(vma); 2872 struct vm_area_struct *iter_vma; 2873 struct address_space *mapping; 2874 pgoff_t pgoff; 2875 2876 /* 2877 * vm_pgoff is in PAGE_SIZE units, hence the different calculation 2878 * from page cache lookup which is in HPAGE_SIZE units. 2879 */ 2880 address = address & huge_page_mask(h); 2881 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + 2882 vma->vm_pgoff; 2883 mapping = file_inode(vma->vm_file)->i_mapping; 2884 2885 /* 2886 * Take the mapping lock for the duration of the table walk. 
As 2887 * this mapping should be shared between all the VMAs, 2888 * __unmap_hugepage_range() is called as the lock is already held 2889 */ 2890 i_mmap_lock_write(mapping); 2891 vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) { 2892 /* Do not unmap the current VMA */ 2893 if (iter_vma == vma) 2894 continue; 2895 2896 /* 2897 * Unmap the page from other VMAs without their own reserves. 2898 * They get marked to be SIGKILLed if they fault in these 2899 * areas. This is because a future no-page fault on this VMA 2900 * could insert a zeroed page instead of the data existing 2901 * from the time of fork. This would look like data corruption 2902 */ 2903 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) 2904 unmap_hugepage_range(iter_vma, address, 2905 address + huge_page_size(h), page); 2906 } 2907 i_mmap_unlock_write(mapping); 2908 } 2909 2910 /* 2911 * Hugetlb_cow() should be called with page lock of the original hugepage held. 2912 * Called with hugetlb_instantiation_mutex held and pte_page locked so we 2913 * cannot race with other handlers or page migration. 2914 * Keep the pte_same checks anyway to make transition from the mutex easier. 2915 */ 2916 static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, 2917 unsigned long address, pte_t *ptep, pte_t pte, 2918 struct page *pagecache_page, spinlock_t *ptl) 2919 { 2920 struct hstate *h = hstate_vma(vma); 2921 struct page *old_page, *new_page; 2922 int ret = 0, outside_reserve = 0; 2923 unsigned long mmun_start; /* For mmu_notifiers */ 2924 unsigned long mmun_end; /* For mmu_notifiers */ 2925 2926 old_page = pte_page(pte); 2927 2928 retry_avoidcopy: 2929 /* If no-one else is actually using this page, avoid the copy 2930 * and just make the page writable */ 2931 if (page_mapcount(old_page) == 1 && PageAnon(old_page)) { 2932 page_move_anon_rmap(old_page, vma, address); 2933 set_huge_ptep_writable(vma, address, ptep); 2934 return 0; 2935 } 2936 2937 /* 2938 * If the process that created a MAP_PRIVATE mapping is about to 2939 * perform a COW due to a shared page count, attempt to satisfy 2940 * the allocation without using the existing reserves. The pagecache 2941 * page is used to determine if the reserve at this address was 2942 * consumed or not. If reserves were used, a partial faulted mapping 2943 * at the time of fork() could consume its reserves on COW instead 2944 * of the full address range. 2945 */ 2946 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && 2947 old_page != pagecache_page) 2948 outside_reserve = 1; 2949 2950 page_cache_get(old_page); 2951 2952 /* 2953 * Drop page table lock as buddy allocator may be called. It will 2954 * be acquired again before returning to the caller, as expected. 2955 */ 2956 spin_unlock(ptl); 2957 new_page = alloc_huge_page(vma, address, outside_reserve); 2958 2959 if (IS_ERR(new_page)) { 2960 /* 2961 * If a process owning a MAP_PRIVATE mapping fails to COW, 2962 * it is due to references held by a child and an insufficient 2963 * huge page pool. To guarantee the original mappers 2964 * reliability, unmap the page from child processes. The child 2965 * may get SIGKILLed if it later faults. 
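 * Once unmap_ref_private() returns, the fault is retried via
 * retry_avoidcopy: with the other mappings gone, the mapcount check at
 * the top can reuse the old page in place instead of copying.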
2966 */ 2967 if (outside_reserve) { 2968 page_cache_release(old_page); 2969 BUG_ON(huge_pte_none(pte)); 2970 unmap_ref_private(mm, vma, old_page, address); 2971 BUG_ON(huge_pte_none(pte)); 2972 spin_lock(ptl); 2973 ptep = huge_pte_offset(mm, address & huge_page_mask(h)); 2974 if (likely(ptep && 2975 pte_same(huge_ptep_get(ptep), pte))) 2976 goto retry_avoidcopy; 2977 /* 2978 * race occurs while re-acquiring page table 2979 * lock, and our job is done. 2980 */ 2981 return 0; 2982 } 2983 2984 ret = (PTR_ERR(new_page) == -ENOMEM) ? 2985 VM_FAULT_OOM : VM_FAULT_SIGBUS; 2986 goto out_release_old; 2987 } 2988 2989 /* 2990 * When the original hugepage is shared one, it does not have 2991 * anon_vma prepared. 2992 */ 2993 if (unlikely(anon_vma_prepare(vma))) { 2994 ret = VM_FAULT_OOM; 2995 goto out_release_all; 2996 } 2997 2998 copy_user_huge_page(new_page, old_page, address, vma, 2999 pages_per_huge_page(h)); 3000 __SetPageUptodate(new_page); 3001 set_page_huge_active(new_page); 3002 3003 mmun_start = address & huge_page_mask(h); 3004 mmun_end = mmun_start + huge_page_size(h); 3005 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 3006 3007 /* 3008 * Retake the page table lock to check for racing updates 3009 * before the page tables are altered 3010 */ 3011 spin_lock(ptl); 3012 ptep = huge_pte_offset(mm, address & huge_page_mask(h)); 3013 if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { 3014 ClearPagePrivate(new_page); 3015 3016 /* Break COW */ 3017 huge_ptep_clear_flush(vma, address, ptep); 3018 mmu_notifier_invalidate_range(mm, mmun_start, mmun_end); 3019 set_huge_pte_at(mm, address, ptep, 3020 make_huge_pte(vma, new_page, 1)); 3021 page_remove_rmap(old_page); 3022 hugepage_add_new_anon_rmap(new_page, vma, address); 3023 /* Make the old page be freed below */ 3024 new_page = old_page; 3025 } 3026 spin_unlock(ptl); 3027 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 3028 out_release_all: 3029 page_cache_release(new_page); 3030 out_release_old: 3031 page_cache_release(old_page); 3032 3033 spin_lock(ptl); /* Caller expects lock to be held */ 3034 return ret; 3035 } 3036 3037 /* Return the pagecache page at a given address within a VMA */ 3038 static struct page *hugetlbfs_pagecache_page(struct hstate *h, 3039 struct vm_area_struct *vma, unsigned long address) 3040 { 3041 struct address_space *mapping; 3042 pgoff_t idx; 3043 3044 mapping = vma->vm_file->f_mapping; 3045 idx = vma_hugecache_offset(h, vma, address); 3046 3047 return find_lock_page(mapping, idx); 3048 } 3049 3050 /* 3051 * Return whether there is a pagecache page to back given address within VMA. 3052 * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page. 
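 * find_get_page() is used instead and the reference is dropped
 * immediately; only the existence of the page matters to the caller.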
3053 */ 3054 static bool hugetlbfs_pagecache_present(struct hstate *h, 3055 struct vm_area_struct *vma, unsigned long address) 3056 { 3057 struct address_space *mapping; 3058 pgoff_t idx; 3059 struct page *page; 3060 3061 mapping = vma->vm_file->f_mapping; 3062 idx = vma_hugecache_offset(h, vma, address); 3063 3064 page = find_get_page(mapping, idx); 3065 if (page) 3066 put_page(page); 3067 return page != NULL; 3068 } 3069 3070 static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, 3071 struct address_space *mapping, pgoff_t idx, 3072 unsigned long address, pte_t *ptep, unsigned int flags) 3073 { 3074 struct hstate *h = hstate_vma(vma); 3075 int ret = VM_FAULT_SIGBUS; 3076 int anon_rmap = 0; 3077 unsigned long size; 3078 struct page *page; 3079 pte_t new_pte; 3080 spinlock_t *ptl; 3081 3082 /* 3083 * Currently, we are forced to kill the process in the event the 3084 * original mapper has unmapped pages from the child due to a failed 3085 * COW. Warn that such a situation has occurred as it may not be obvious 3086 */ 3087 if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { 3088 pr_warning("PID %d killed due to inadequate hugepage pool\n", 3089 current->pid); 3090 return ret; 3091 } 3092 3093 /* 3094 * Use page lock to guard against racing truncation 3095 * before we get page_table_lock. 3096 */ 3097 retry: 3098 page = find_lock_page(mapping, idx); 3099 if (!page) { 3100 size = i_size_read(mapping->host) >> huge_page_shift(h); 3101 if (idx >= size) 3102 goto out; 3103 page = alloc_huge_page(vma, address, 0); 3104 if (IS_ERR(page)) { 3105 ret = PTR_ERR(page); 3106 if (ret == -ENOMEM) 3107 ret = VM_FAULT_OOM; 3108 else 3109 ret = VM_FAULT_SIGBUS; 3110 goto out; 3111 } 3112 clear_huge_page(page, address, pages_per_huge_page(h)); 3113 __SetPageUptodate(page); 3114 set_page_huge_active(page); 3115 3116 if (vma->vm_flags & VM_MAYSHARE) { 3117 int err; 3118 struct inode *inode = mapping->host; 3119 3120 err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); 3121 if (err) { 3122 put_page(page); 3123 if (err == -EEXIST) 3124 goto retry; 3125 goto out; 3126 } 3127 ClearPagePrivate(page); 3128 3129 spin_lock(&inode->i_lock); 3130 inode->i_blocks += blocks_per_huge_page(h); 3131 spin_unlock(&inode->i_lock); 3132 } else { 3133 lock_page(page); 3134 if (unlikely(anon_vma_prepare(vma))) { 3135 ret = VM_FAULT_OOM; 3136 goto backout_unlocked; 3137 } 3138 anon_rmap = 1; 3139 } 3140 } else { 3141 /* 3142 * If memory error occurs between mmap() and fault, some process 3143 * don't have hwpoisoned swap entry for errored virtual address. 3144 * So we need to block hugepage fault by PG_hwpoison bit check. 3145 */ 3146 if (unlikely(PageHWPoison(page))) { 3147 ret = VM_FAULT_HWPOISON | 3148 VM_FAULT_SET_HINDEX(hstate_index(h)); 3149 goto backout_unlocked; 3150 } 3151 } 3152 3153 /* 3154 * If we are going to COW a private mapping later, we examine the 3155 * pending reservations for this page now. This will ensure that 3156 * any allocations necessary to record that reservation occur outside 3157 * the spinlock. 
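 * (vma_needs_reservation() may have to allocate a region entry in the
 * reserve map, which must not be attempted under the page table lock.)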
3158 */ 3159 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) 3160 if (vma_needs_reservation(h, vma, address) < 0) { 3161 ret = VM_FAULT_OOM; 3162 goto backout_unlocked; 3163 } 3164 3165 ptl = huge_pte_lockptr(h, mm, ptep); 3166 spin_lock(ptl); 3167 size = i_size_read(mapping->host) >> huge_page_shift(h); 3168 if (idx >= size) 3169 goto backout; 3170 3171 ret = 0; 3172 if (!huge_pte_none(huge_ptep_get(ptep))) 3173 goto backout; 3174 3175 if (anon_rmap) { 3176 ClearPagePrivate(page); 3177 hugepage_add_new_anon_rmap(page, vma, address); 3178 } else 3179 page_dup_rmap(page); 3180 new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) 3181 && (vma->vm_flags & VM_SHARED))); 3182 set_huge_pte_at(mm, address, ptep, new_pte); 3183 3184 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { 3185 /* Optimization, do the COW without a second fault */ 3186 ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl); 3187 } 3188 3189 spin_unlock(ptl); 3190 unlock_page(page); 3191 out: 3192 return ret; 3193 3194 backout: 3195 spin_unlock(ptl); 3196 backout_unlocked: 3197 unlock_page(page); 3198 put_page(page); 3199 goto out; 3200 } 3201 3202 #ifdef CONFIG_SMP 3203 static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm, 3204 struct vm_area_struct *vma, 3205 struct address_space *mapping, 3206 pgoff_t idx, unsigned long address) 3207 { 3208 unsigned long key[2]; 3209 u32 hash; 3210 3211 if (vma->vm_flags & VM_SHARED) { 3212 key[0] = (unsigned long) mapping; 3213 key[1] = idx; 3214 } else { 3215 key[0] = (unsigned long) mm; 3216 key[1] = address >> huge_page_shift(h); 3217 } 3218 3219 hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0); 3220 3221 return hash & (num_fault_mutexes - 1); 3222 } 3223 #else 3224 /* 3225 * For uniprocesor systems we always use a single mutex, so just 3226 * return 0 and avoid the hashing overhead. 3227 */ 3228 static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm, 3229 struct vm_area_struct *vma, 3230 struct address_space *mapping, 3231 pgoff_t idx, unsigned long address) 3232 { 3233 return 0; 3234 } 3235 #endif 3236 3237 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3238 unsigned long address, unsigned int flags) 3239 { 3240 pte_t *ptep, entry; 3241 spinlock_t *ptl; 3242 int ret; 3243 u32 hash; 3244 pgoff_t idx; 3245 struct page *page = NULL; 3246 struct page *pagecache_page = NULL; 3247 struct hstate *h = hstate_vma(vma); 3248 struct address_space *mapping; 3249 int need_wait_lock = 0; 3250 3251 address &= huge_page_mask(h); 3252 3253 ptep = huge_pte_offset(mm, address); 3254 if (ptep) { 3255 entry = huge_ptep_get(ptep); 3256 if (unlikely(is_hugetlb_entry_migration(entry))) { 3257 migration_entry_wait_huge(vma, mm, ptep); 3258 return 0; 3259 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) 3260 return VM_FAULT_HWPOISON_LARGE | 3261 VM_FAULT_SET_HINDEX(hstate_index(h)); 3262 } 3263 3264 ptep = huge_pte_alloc(mm, address, huge_page_size(h)); 3265 if (!ptep) 3266 return VM_FAULT_OOM; 3267 3268 mapping = vma->vm_file->f_mapping; 3269 idx = vma_hugecache_offset(h, vma, address); 3270 3271 /* 3272 * Serialize hugepage allocation and instantiation, so that we don't 3273 * get spurious allocation failures if two CPUs race to instantiate 3274 * the same page in the page cache. 
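 * The mutex is chosen by fault_mutex_hash(): shared mappings hash on
 * (mapping, idx) while private mappings hash on (mm, address), so
 * unrelated faults still proceed in parallel.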
3275 */ 3276 hash = fault_mutex_hash(h, mm, vma, mapping, idx, address); 3277 mutex_lock(&htlb_fault_mutex_table[hash]); 3278 3279 entry = huge_ptep_get(ptep); 3280 if (huge_pte_none(entry)) { 3281 ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags); 3282 goto out_mutex; 3283 } 3284 3285 ret = 0; 3286 3287 /* 3288 * entry could be a migration/hwpoison entry at this point, so this 3289 * check prevents the kernel from going below assuming that we have 3290 * a active hugepage in pagecache. This goto expects the 2nd page fault, 3291 * and is_hugetlb_entry_(migration|hwpoisoned) check will properly 3292 * handle it. 3293 */ 3294 if (!pte_present(entry)) 3295 goto out_mutex; 3296 3297 /* 3298 * If we are going to COW the mapping later, we examine the pending 3299 * reservations for this page now. This will ensure that any 3300 * allocations necessary to record that reservation occur outside the 3301 * spinlock. For private mappings, we also lookup the pagecache 3302 * page now as it is used to determine if a reservation has been 3303 * consumed. 3304 */ 3305 if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) { 3306 if (vma_needs_reservation(h, vma, address) < 0) { 3307 ret = VM_FAULT_OOM; 3308 goto out_mutex; 3309 } 3310 3311 if (!(vma->vm_flags & VM_MAYSHARE)) 3312 pagecache_page = hugetlbfs_pagecache_page(h, 3313 vma, address); 3314 } 3315 3316 ptl = huge_pte_lock(h, mm, ptep); 3317 3318 /* Check for a racing update before calling hugetlb_cow */ 3319 if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) 3320 goto out_ptl; 3321 3322 /* 3323 * hugetlb_cow() requires page locks of pte_page(entry) and 3324 * pagecache_page, so here we need take the former one 3325 * when page != pagecache_page or !pagecache_page. 3326 */ 3327 page = pte_page(entry); 3328 if (page != pagecache_page) 3329 if (!trylock_page(page)) { 3330 need_wait_lock = 1; 3331 goto out_ptl; 3332 } 3333 3334 get_page(page); 3335 3336 if (flags & FAULT_FLAG_WRITE) { 3337 if (!huge_pte_write(entry)) { 3338 ret = hugetlb_cow(mm, vma, address, ptep, entry, 3339 pagecache_page, ptl); 3340 goto out_put_page; 3341 } 3342 entry = huge_pte_mkdirty(entry); 3343 } 3344 entry = pte_mkyoung(entry); 3345 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 3346 flags & FAULT_FLAG_WRITE)) 3347 update_mmu_cache(vma, address, ptep); 3348 out_put_page: 3349 if (page != pagecache_page) 3350 unlock_page(page); 3351 put_page(page); 3352 out_ptl: 3353 spin_unlock(ptl); 3354 3355 if (pagecache_page) { 3356 unlock_page(pagecache_page); 3357 put_page(pagecache_page); 3358 } 3359 out_mutex: 3360 mutex_unlock(&htlb_fault_mutex_table[hash]); 3361 /* 3362 * Generally it's safe to hold refcount during waiting page lock. But 3363 * here we just wait to defer the next page fault to avoid busy loop and 3364 * the page is not used after unlocked before returning from the current 3365 * page fault. So we are safe from accessing freed page, even if we wait 3366 * here without taking refcount. 
3367 */ 3368 if (need_wait_lock) 3369 wait_on_page_locked(page); 3370 return ret; 3371 } 3372 3373 long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, 3374 struct page **pages, struct vm_area_struct **vmas, 3375 unsigned long *position, unsigned long *nr_pages, 3376 long i, unsigned int flags) 3377 { 3378 unsigned long pfn_offset; 3379 unsigned long vaddr = *position; 3380 unsigned long remainder = *nr_pages; 3381 struct hstate *h = hstate_vma(vma); 3382 3383 while (vaddr < vma->vm_end && remainder) { 3384 pte_t *pte; 3385 spinlock_t *ptl = NULL; 3386 int absent; 3387 struct page *page; 3388 3389 /* 3390 * If we have a pending SIGKILL, don't keep faulting pages and 3391 * potentially allocating memory. 3392 */ 3393 if (unlikely(fatal_signal_pending(current))) { 3394 remainder = 0; 3395 break; 3396 } 3397 3398 /* 3399 * Some archs (sparc64, sh*) have multiple pte_ts to 3400 * each hugepage. We have to make sure we get the 3401 * first, for the page indexing below to work. 3402 * 3403 * Note that page table lock is not held when pte is null. 3404 */ 3405 pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); 3406 if (pte) 3407 ptl = huge_pte_lock(h, mm, pte); 3408 absent = !pte || huge_pte_none(huge_ptep_get(pte)); 3409 3410 /* 3411 * When coredumping, it suits get_dump_page if we just return 3412 * an error where there's an empty slot with no huge pagecache 3413 * to back it. This way, we avoid allocating a hugepage, and 3414 * the sparse dumpfile avoids allocating disk blocks, but its 3415 * huge holes still show up with zeroes where they need to be. 3416 */ 3417 if (absent && (flags & FOLL_DUMP) && 3418 !hugetlbfs_pagecache_present(h, vma, vaddr)) { 3419 if (pte) 3420 spin_unlock(ptl); 3421 remainder = 0; 3422 break; 3423 } 3424 3425 /* 3426 * We need call hugetlb_fault for both hugepages under migration 3427 * (in which case hugetlb_fault waits for the migration,) and 3428 * hwpoisoned hugepages (in which case we need to prevent the 3429 * caller from accessing to them.) In order to do this, we use 3430 * here is_swap_pte instead of is_hugetlb_entry_migration and 3431 * is_hugetlb_entry_hwpoisoned. This is because it simply covers 3432 * both cases, and because we can't follow correct pages 3433 * directly from any kind of swap entries. 3434 */ 3435 if (absent || is_swap_pte(huge_ptep_get(pte)) || 3436 ((flags & FOLL_WRITE) && 3437 !huge_pte_write(huge_ptep_get(pte)))) { 3438 int ret; 3439 3440 if (pte) 3441 spin_unlock(ptl); 3442 ret = hugetlb_fault(mm, vma, vaddr, 3443 (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0); 3444 if (!(ret & VM_FAULT_ERROR)) 3445 continue; 3446 3447 remainder = 0; 3448 break; 3449 } 3450 3451 pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT; 3452 page = pte_page(huge_ptep_get(pte)); 3453 same_page: 3454 if (pages) { 3455 pages[i] = mem_map_offset(page, pfn_offset); 3456 get_page_foll(pages[i]); 3457 } 3458 3459 if (vmas) 3460 vmas[i] = vma; 3461 3462 vaddr += PAGE_SIZE; 3463 ++pfn_offset; 3464 --remainder; 3465 ++i; 3466 if (vaddr < vma->vm_end && remainder && 3467 pfn_offset < pages_per_huge_page(h)) { 3468 /* 3469 * We use pfn_offset to avoid touching the pageframes 3470 * of this compound page. 3471 */ 3472 goto same_page; 3473 } 3474 spin_unlock(ptl); 3475 } 3476 *nr_pages = remainder; 3477 *position = vaddr; 3478 3479 return i ? 
i : -EFAULT; 3480 } 3481 3482 unsigned long hugetlb_change_protection(struct vm_area_struct *vma, 3483 unsigned long address, unsigned long end, pgprot_t newprot) 3484 { 3485 struct mm_struct *mm = vma->vm_mm; 3486 unsigned long start = address; 3487 pte_t *ptep; 3488 pte_t pte; 3489 struct hstate *h = hstate_vma(vma); 3490 unsigned long pages = 0; 3491 3492 BUG_ON(address >= end); 3493 flush_cache_range(vma, address, end); 3494 3495 mmu_notifier_invalidate_range_start(mm, start, end); 3496 i_mmap_lock_write(vma->vm_file->f_mapping); 3497 for (; address < end; address += huge_page_size(h)) { 3498 spinlock_t *ptl; 3499 ptep = huge_pte_offset(mm, address); 3500 if (!ptep) 3501 continue; 3502 ptl = huge_pte_lock(h, mm, ptep); 3503 if (huge_pmd_unshare(mm, &address, ptep)) { 3504 pages++; 3505 spin_unlock(ptl); 3506 continue; 3507 } 3508 pte = huge_ptep_get(ptep); 3509 if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { 3510 spin_unlock(ptl); 3511 continue; 3512 } 3513 if (unlikely(is_hugetlb_entry_migration(pte))) { 3514 swp_entry_t entry = pte_to_swp_entry(pte); 3515 3516 if (is_write_migration_entry(entry)) { 3517 pte_t newpte; 3518 3519 make_migration_entry_read(&entry); 3520 newpte = swp_entry_to_pte(entry); 3521 set_huge_pte_at(mm, address, ptep, newpte); 3522 pages++; 3523 } 3524 spin_unlock(ptl); 3525 continue; 3526 } 3527 if (!huge_pte_none(pte)) { 3528 pte = huge_ptep_get_and_clear(mm, address, ptep); 3529 pte = pte_mkhuge(huge_pte_modify(pte, newprot)); 3530 pte = arch_make_huge_pte(pte, vma, NULL, 0); 3531 set_huge_pte_at(mm, address, ptep, pte); 3532 pages++; 3533 } 3534 spin_unlock(ptl); 3535 } 3536 /* 3537 * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare 3538 * may have cleared our pud entry and done put_page on the page table: 3539 * once we release i_mmap_rwsem, another task can do the final put_page 3540 * and that page table be reused and filled with junk. 3541 */ 3542 flush_tlb_range(vma, start, end); 3543 mmu_notifier_invalidate_range(mm, start, end); 3544 i_mmap_unlock_write(vma->vm_file->f_mapping); 3545 mmu_notifier_invalidate_range_end(mm, start, end); 3546 3547 return pages << h->order; 3548 } 3549 3550 int hugetlb_reserve_pages(struct inode *inode, 3551 long from, long to, 3552 struct vm_area_struct *vma, 3553 vm_flags_t vm_flags) 3554 { 3555 long ret, chg; 3556 struct hstate *h = hstate_inode(inode); 3557 struct hugepage_subpool *spool = subpool_inode(inode); 3558 struct resv_map *resv_map; 3559 long gbl_reserve; 3560 3561 /* 3562 * Only apply hugepage reservation if asked. At fault time, an 3563 * attempt will be made for VM_NORESERVE to allocate a page 3564 * without using reserves 3565 */ 3566 if (vm_flags & VM_NORESERVE) 3567 return 0; 3568 3569 /* 3570 * Shared mappings base their reservation on the number of pages that 3571 * are already allocated on behalf of the file. Private mappings need 3572 * to reserve the full area even if read-only as mprotect() may be 3573 * called to make the mapping read-write. Assume !vma is a shm mapping 3574 */ 3575 if (!vma || vma->vm_flags & VM_MAYSHARE) { 3576 resv_map = inode_resv_map(inode); 3577 3578 chg = region_chg(resv_map, from, to); 3579 3580 } else { 3581 resv_map = resv_map_alloc(); 3582 if (!resv_map) 3583 return -ENOMEM; 3584 3585 chg = to - from; 3586 3587 set_vma_resv_map(vma, resv_map); 3588 set_vma_resv_flags(vma, HPAGE_RESV_OWNER); 3589 } 3590 3591 if (chg < 0) { 3592 ret = chg; 3593 goto out_err; 3594 } 3595 3596 /* 3597 * There must be enough pages in the subpool for the mapping. 
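 * hugepage_subpool_get_pages() charges chg pages against the subpool
 * and returns how many of them still need a global reservation.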
If 3598 * the subpool has a minimum size, there may be some global 3599 * reservations already in place (gbl_reserve). 3600 */ 3601 gbl_reserve = hugepage_subpool_get_pages(spool, chg); 3602 if (gbl_reserve < 0) { 3603 ret = -ENOSPC; 3604 goto out_err; 3605 } 3606 3607 /* 3608 * Check enough hugepages are available for the reservation. 3609 * Hand the pages back to the subpool if there are not 3610 */ 3611 ret = hugetlb_acct_memory(h, gbl_reserve); 3612 if (ret < 0) { 3613 /* put back original number of pages, chg */ 3614 (void)hugepage_subpool_put_pages(spool, chg); 3615 goto out_err; 3616 } 3617 3618 /* 3619 * Account for the reservations made. Shared mappings record regions 3620 * that have reservations as they are shared by multiple VMAs. 3621 * When the last VMA disappears, the region map says how much 3622 * the reservation was and the page cache tells how much of 3623 * the reservation was consumed. Private mappings are per-VMA and 3624 * only the consumed reservations are tracked. When the VMA 3625 * disappears, the original reservation is the VMA size and the 3626 * consumed reservations are stored in the map. Hence, nothing 3627 * else has to be done for private mappings here 3628 */ 3629 if (!vma || vma->vm_flags & VM_MAYSHARE) 3630 region_add(resv_map, from, to); 3631 return 0; 3632 out_err: 3633 if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) 3634 kref_put(&resv_map->refs, resv_map_release); 3635 return ret; 3636 } 3637 3638 void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) 3639 { 3640 struct hstate *h = hstate_inode(inode); 3641 struct resv_map *resv_map = inode_resv_map(inode); 3642 long chg = 0; 3643 struct hugepage_subpool *spool = subpool_inode(inode); 3644 long gbl_reserve; 3645 3646 if (resv_map) 3647 chg = region_truncate(resv_map, offset); 3648 spin_lock(&inode->i_lock); 3649 inode->i_blocks -= (blocks_per_huge_page(h) * freed); 3650 spin_unlock(&inode->i_lock); 3651 3652 /* 3653 * If the subpool has a minimum size, the number of global 3654 * reservations to be released may be adjusted. 3655 */ 3656 gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed)); 3657 hugetlb_acct_memory(h, -gbl_reserve); 3658 } 3659 3660 #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE 3661 static unsigned long page_table_shareable(struct vm_area_struct *svma, 3662 struct vm_area_struct *vma, 3663 unsigned long addr, pgoff_t idx) 3664 { 3665 unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) + 3666 svma->vm_start; 3667 unsigned long sbase = saddr & PUD_MASK; 3668 unsigned long s_end = sbase + PUD_SIZE; 3669 3670 /* Allow segments to share if only one is marked locked */ 3671 unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED; 3672 unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED; 3673 3674 /* 3675 * match the virtual addresses, permission and the alignment of the 3676 * page table page. 3677 */ 3678 if (pmd_index(addr) != pmd_index(saddr) || 3679 vm_flags != svm_flags || 3680 sbase < svma->vm_start || svma->vm_end < s_end) 3681 return 0; 3682 3683 return saddr; 3684 } 3685 3686 static int vma_shareable(struct vm_area_struct *vma, unsigned long addr) 3687 { 3688 unsigned long base = addr & PUD_MASK; 3689 unsigned long end = base + PUD_SIZE; 3690 3691 /* 3692 * check on proper vm_flags and page table alignment 3693 */ 3694 if (vma->vm_flags & VM_MAYSHARE && 3695 vma->vm_start <= base && end <= vma->vm_end) 3696 return 1; 3697 return 0; 3698 } 3699 3700 /* 3701 * Search for a shareable pmd page for hugetlb. 
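 * Sharing lets processes that map the same file at suitably aligned,
 * compatible ranges point their PUD entries at one common PMD page.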
In any case calls pmd_alloc() 3702 * and returns the corresponding pte. While this is not necessary for the 3703 * !shared pmd case because we can allocate the pmd later as well, it makes the 3704 * code much cleaner. pmd allocation is essential for the shared case because 3705 * pud has to be populated inside the same i_mmap_rwsem section - otherwise 3706 * racing tasks could either miss the sharing (see huge_pte_offset) or select a 3707 * bad pmd for sharing. 3708 */ 3709 pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) 3710 { 3711 struct vm_area_struct *vma = find_vma(mm, addr); 3712 struct address_space *mapping = vma->vm_file->f_mapping; 3713 pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + 3714 vma->vm_pgoff; 3715 struct vm_area_struct *svma; 3716 unsigned long saddr; 3717 pte_t *spte = NULL; 3718 pte_t *pte; 3719 spinlock_t *ptl; 3720 3721 if (!vma_shareable(vma, addr)) 3722 return (pte_t *)pmd_alloc(mm, pud, addr); 3723 3724 i_mmap_lock_write(mapping); 3725 vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { 3726 if (svma == vma) 3727 continue; 3728 3729 saddr = page_table_shareable(svma, vma, addr, idx); 3730 if (saddr) { 3731 spte = huge_pte_offset(svma->vm_mm, saddr); 3732 if (spte) { 3733 mm_inc_nr_pmds(mm); 3734 get_page(virt_to_page(spte)); 3735 break; 3736 } 3737 } 3738 } 3739 3740 if (!spte) 3741 goto out; 3742 3743 ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte); 3744 spin_lock(ptl); 3745 if (pud_none(*pud)) { 3746 pud_populate(mm, pud, 3747 (pmd_t *)((unsigned long)spte & PAGE_MASK)); 3748 } else { 3749 put_page(virt_to_page(spte)); 3750 mm_inc_nr_pmds(mm); 3751 } 3752 spin_unlock(ptl); 3753 out: 3754 pte = (pte_t *)pmd_alloc(mm, pud, addr); 3755 i_mmap_unlock_write(mapping); 3756 return pte; 3757 } 3758 3759 /* 3760 * unmap huge page backed by shared pte. 3761 * 3762 * Hugetlb pte page is ref counted at the time of mapping. If pte is shared 3763 * indicated by page_count > 1, unmap is achieved by clearing pud and 3764 * decrementing the ref count. If count == 1, the pte page is not shared. 3765 * 3766 * called with page table lock held. 
3767 * 3768 * returns: 1 successfully unmapped a shared pte page 3769 * 0 the underlying pte page is not shared, or it is the last user 3770 */ 3771 int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) 3772 { 3773 pgd_t *pgd = pgd_offset(mm, *addr); 3774 pud_t *pud = pud_offset(pgd, *addr); 3775 3776 BUG_ON(page_count(virt_to_page(ptep)) == 0); 3777 if (page_count(virt_to_page(ptep)) == 1) 3778 return 0; 3779 3780 pud_clear(pud); 3781 put_page(virt_to_page(ptep)); 3782 mm_dec_nr_pmds(mm); 3783 *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE; 3784 return 1; 3785 } 3786 #define want_pmd_share() (1) 3787 #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ 3788 pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) 3789 { 3790 return NULL; 3791 } 3792 #define want_pmd_share() (0) 3793 #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ 3794 3795 #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB 3796 pte_t *huge_pte_alloc(struct mm_struct *mm, 3797 unsigned long addr, unsigned long sz) 3798 { 3799 pgd_t *pgd; 3800 pud_t *pud; 3801 pte_t *pte = NULL; 3802 3803 pgd = pgd_offset(mm, addr); 3804 pud = pud_alloc(mm, pgd, addr); 3805 if (pud) { 3806 if (sz == PUD_SIZE) { 3807 pte = (pte_t *)pud; 3808 } else { 3809 BUG_ON(sz != PMD_SIZE); 3810 if (want_pmd_share() && pud_none(*pud)) 3811 pte = huge_pmd_share(mm, addr, pud); 3812 else 3813 pte = (pte_t *)pmd_alloc(mm, pud, addr); 3814 } 3815 } 3816 BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte)); 3817 3818 return pte; 3819 } 3820 3821 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) 3822 { 3823 pgd_t *pgd; 3824 pud_t *pud; 3825 pmd_t *pmd = NULL; 3826 3827 pgd = pgd_offset(mm, addr); 3828 if (pgd_present(*pgd)) { 3829 pud = pud_offset(pgd, addr); 3830 if (pud_present(*pud)) { 3831 if (pud_huge(*pud)) 3832 return (pte_t *)pud; 3833 pmd = pmd_offset(pud, addr); 3834 } 3835 } 3836 return (pte_t *) pmd; 3837 } 3838 3839 #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ 3840 3841 /* 3842 * These functions are overwritable if your architecture needs its own 3843 * behavior. 3844 */ 3845 struct page * __weak 3846 follow_huge_addr(struct mm_struct *mm, unsigned long address, 3847 int write) 3848 { 3849 return ERR_PTR(-EINVAL); 3850 } 3851 3852 struct page * __weak 3853 follow_huge_pmd(struct mm_struct *mm, unsigned long address, 3854 pmd_t *pmd, int flags) 3855 { 3856 struct page *page = NULL; 3857 spinlock_t *ptl; 3858 retry: 3859 ptl = pmd_lockptr(mm, pmd); 3860 spin_lock(ptl); 3861 /* 3862 * make sure that the address range covered by this pmd is not 3863 * unmapped from other threads. 3864 */ 3865 if (!pmd_huge(*pmd)) 3866 goto out; 3867 if (pmd_present(*pmd)) { 3868 page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT); 3869 if (flags & FOLL_GET) 3870 get_page(page); 3871 } else { 3872 if (is_hugetlb_entry_migration(huge_ptep_get((pte_t *)pmd))) { 3873 spin_unlock(ptl); 3874 __migration_entry_wait(mm, (pte_t *)pmd, ptl); 3875 goto retry; 3876 } 3877 /* 3878 * hwpoisoned entry is treated as no_page_table in 3879 * follow_page_mask(). 3880 */ 3881 } 3882 out: 3883 spin_unlock(ptl); 3884 return page; 3885 } 3886 3887 struct page * __weak 3888 follow_huge_pud(struct mm_struct *mm, unsigned long address, 3889 pud_t *pud, int flags) 3890 { 3891 if (flags & FOLL_GET) 3892 return NULL; 3893 3894 return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); 3895 } 3896 3897 #ifdef CONFIG_MEMORY_FAILURE 3898 3899 /* 3900 * This function is called from memory failure code. 
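 * It pulls a hwpoisoned huge page that is still sitting in the free
 * pool off the free lists, so the page can never be handed out again.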
 * Assume the caller holds page lock of the head page.
 */
int dequeue_hwpoisoned_huge_page(struct page *hpage)
{
	struct hstate *h = page_hstate(hpage);
	int nid = page_to_nid(hpage);
	int ret = -EBUSY;

	spin_lock(&hugetlb_lock);
	/*
	 * Just checking !page_huge_active is not enough, because that could be
	 * an isolated/hwpoisoned hugepage (which have >0 refcount).
	 */
	if (!page_huge_active(hpage) && !page_count(hpage)) {
		/*
		 * Hwpoisoned hugepage isn't linked to activelist or freelist,
		 * but dangling hpage->lru can trigger list-debug warnings
		 * (this happens when we call unpoison_memory() on it),
		 * so let it point to itself with list_del_init().
		 */
		list_del_init(&hpage->lru);
		set_page_refcounted(hpage);
		h->free_huge_pages--;
		h->free_huge_pages_node[nid]--;
		ret = 0;
	}
	spin_unlock(&hugetlb_lock);
	return ret;
}
#endif

bool isolate_huge_page(struct page *page, struct list_head *list)
{
	bool ret = true;

	VM_BUG_ON_PAGE(!PageHead(page), page);
	spin_lock(&hugetlb_lock);
	if (!page_huge_active(page) || !get_page_unless_zero(page)) {
		ret = false;
		goto unlock;
	}
	clear_page_huge_active(page);
	list_move_tail(&page->lru, list);
unlock:
	spin_unlock(&hugetlb_lock);
	return ret;
}

void putback_active_hugepage(struct page *page)
{
	VM_BUG_ON_PAGE(!PageHead(page), page);
	spin_lock(&hugetlb_lock);
	set_page_huge_active(page);
	list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
	spin_unlock(&hugetlb_lock);
	put_page(page);
}
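
/*
 * Illustration only (not part of the original source): a rough
 * userspace view of the interfaces implemented by the sysctl and
 * sysfs handlers above. Paths are the conventional ones; availability
 * depends on kernel configuration and the huge page sizes present.
 *
 *   # size the default persistent huge page pool
 *   echo 64 > /proc/sys/vm/nr_hugepages
 *
 *   # allow up to 32 surplus (overcommit) huge pages
 *   echo 32 > /proc/sys/vm/nr_overcommit_hugepages
 *
 *   # per-node control, e.g. 2MB pages on node 1
 *   echo 16 > /sys/devices/system/node/node1/hugepages/hugepages-2048kB/nr_hugepages
 */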