/*
 * Generic hugetlb support.
 * (C) William Irwin, April 2004
 */
#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <asm/page.h>
#include <asm/pgtable.h>

#include <linux/hugetlb.h>

const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
static unsigned long nr_huge_pages, free_huge_pages;
unsigned long max_huge_pages;
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
static unsigned int free_huge_pages_node[MAX_NUMNODES];
static DEFINE_SPINLOCK(hugetlb_lock);

static void enqueue_huge_page(struct page *page)
{
        int nid = page_to_nid(page);
        list_add(&page->lru, &hugepage_freelists[nid]);
        free_huge_pages++;
        free_huge_pages_node[nid]++;
}

static struct page *dequeue_huge_page(void)
{
        int nid = numa_node_id();
        struct page *page = NULL;

        if (list_empty(&hugepage_freelists[nid])) {
                for (nid = 0; nid < MAX_NUMNODES; ++nid)
                        if (!list_empty(&hugepage_freelists[nid]))
                                break;
        }
        if (nid >= 0 && nid < MAX_NUMNODES &&
            !list_empty(&hugepage_freelists[nid])) {
                page = list_entry(hugepage_freelists[nid].next,
                                  struct page, lru);
                list_del(&page->lru);
                free_huge_pages--;
                free_huge_pages_node[nid]--;
        }
        return page;
}

static struct page *alloc_fresh_huge_page(void)
{
        static int nid = 0;
        struct page *page;
        page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
                                HUGETLB_PAGE_ORDER);
        nid = (nid + 1) % num_online_nodes();
        if (page) {
                nr_huge_pages++;
                nr_huge_pages_node[page_to_nid(page)]++;
        }
        return page;
}

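/*
 * free_huge_page() is wired up by alloc_huge_page() below as the
 * compound-page destructor (stashed in page[1].mapping); the generic
 * compound-page code is expected to call it when the last reference is
 * dropped, so the page goes back onto the hugepage freelists instead of
 * the buddy allocator.
 */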
void free_huge_page(struct page *page)
{
        BUG_ON(page_count(page));

        INIT_LIST_HEAD(&page->lru);
        page[1].mapping = NULL;

        spin_lock(&hugetlb_lock);
        enqueue_huge_page(page);
        spin_unlock(&hugetlb_lock);
}

struct page *alloc_huge_page(void)
{
        struct page *page;
        int i;

        spin_lock(&hugetlb_lock);
        page = dequeue_huge_page();
        if (!page) {
                spin_unlock(&hugetlb_lock);
                return NULL;
        }
        spin_unlock(&hugetlb_lock);
        set_page_count(page, 1);
        page[1].mapping = (void *)free_huge_page;
        for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
                clear_highpage(&page[i]);
        return page;
}

static int __init hugetlb_init(void)
{
        unsigned long i;
        struct page *page;

        for (i = 0; i < MAX_NUMNODES; ++i)
                INIT_LIST_HEAD(&hugepage_freelists[i]);

        for (i = 0; i < max_huge_pages; ++i) {
                page = alloc_fresh_huge_page();
                if (!page)
                        break;
                spin_lock(&hugetlb_lock);
                enqueue_huge_page(page);
                spin_unlock(&hugetlb_lock);
        }
        max_huge_pages = free_huge_pages = nr_huge_pages = i;
        printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
        return 0;
}
module_init(hugetlb_init);

static int __init hugetlb_setup(char *s)
{
        if (sscanf(s, "%lu", &max_huge_pages) <= 0)
                max_huge_pages = 0;
        return 1;
}
__setup("hugepages=", hugetlb_setup);

#ifdef CONFIG_SYSCTL
static void update_and_free_page(struct page *page)
{
        int i;
        nr_huge_pages--;
        nr_huge_pages_node[page_zone(page)->zone_pgdat->node_id]--;
        for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
                page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
                                1 << PG_referenced | 1 << PG_dirty |
                                1 << PG_active | 1 << PG_reserved |
                                1 << PG_private | 1 << PG_writeback);
                set_page_count(&page[i], 0);
        }
        set_page_count(page, 1);
        __free_pages(page, HUGETLB_PAGE_ORDER);
}

#ifdef CONFIG_HIGHMEM
static void try_to_free_low(unsigned long count)
{
        int i, nid;
        for (i = 0; i < MAX_NUMNODES; ++i) {
                struct page *page, *next;
                list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
                        if (PageHighMem(page))
                                continue;
                        list_del(&page->lru);
                        update_and_free_page(page);
                        nid = page_zone(page)->zone_pgdat->node_id;
                        free_huge_pages--;
                        free_huge_pages_node[nid]--;
                        if (count >= nr_huge_pages)
                                return;
                }
        }
}
#else
static inline void try_to_free_low(unsigned long count)
{
}
#endif

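/*
 * Resize the hugepage pool to @count pages.  Growing allocates fresh huge
 * pages outside hugetlb_lock, since the allocation may sleep; shrinking
 * hands surplus pages back to the buddy allocator via
 * update_and_free_page(), with try_to_free_low() releasing non-highmem
 * pages first on CONFIG_HIGHMEM.  Returns the pool size actually reached.
 */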
static unsigned long set_max_huge_pages(unsigned long count)
{
        while (count > nr_huge_pages) {
                struct page *page = alloc_fresh_huge_page();
                if (!page)
                        return nr_huge_pages;
                spin_lock(&hugetlb_lock);
                enqueue_huge_page(page);
                spin_unlock(&hugetlb_lock);
        }
        if (count >= nr_huge_pages)
                return nr_huge_pages;

        spin_lock(&hugetlb_lock);
        try_to_free_low(count);
        while (count < nr_huge_pages) {
                struct page *page = dequeue_huge_page();
                if (!page)
                        break;
                update_and_free_page(page);
        }
        spin_unlock(&hugetlb_lock);
        return nr_huge_pages;
}

int hugetlb_sysctl_handler(struct ctl_table *table, int write,
                           struct file *file, void __user *buffer,
                           size_t *length, loff_t *ppos)
{
        proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
        max_huge_pages = set_max_huge_pages(max_huge_pages);
        return 0;
}
#endif /* CONFIG_SYSCTL */

int hugetlb_report_meminfo(char *buf)
{
        return sprintf(buf,
                        "HugePages_Total: %5lu\n"
                        "HugePages_Free:  %5lu\n"
                        "Hugepagesize:    %5lu kB\n",
                        nr_huge_pages,
                        free_huge_pages,
                        HPAGE_SIZE/1024);
}

int hugetlb_report_node_meminfo(int nid, char *buf)
{
        return sprintf(buf,
                        "Node %d HugePages_Total: %5u\n"
                        "Node %d HugePages_Free:  %5u\n",
                        nid, nr_huge_pages_node[nid],
                        nid, free_huge_pages_node[nid]);
}

int is_hugepage_mem_enough(size_t size)
{
        return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages;
}

/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
        return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
}
EXPORT_SYMBOL(hugetlb_total_pages);

/*
 * We cannot handle pagefaults against hugetlb pages at all.  They cause
 * handle_mm_fault() to try to instantiate regular-sized pages in the
 * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we
 * get this far.
 */
static struct page *hugetlb_nopage(struct vm_area_struct *vma,
                                   unsigned long address, int *unused)
{
        BUG();
        return NULL;
}

struct vm_operations_struct hugetlb_vm_ops = {
        .nopage = hugetlb_nopage,
};

static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page)
{
        pte_t entry;

        if (vma->vm_flags & VM_WRITE) {
                entry =
                    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
        } else {
                entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
        }
        entry = pte_mkyoung(entry);
        entry = pte_mkhuge(entry);

        return entry;
}

int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                            struct vm_area_struct *vma)
{
        pte_t *src_pte, *dst_pte, entry;
        struct page *ptepage;
        unsigned long addr = vma->vm_start;
        unsigned long end = vma->vm_end;

        while (addr < end) {
                dst_pte = huge_pte_alloc(dst, addr);
                if (!dst_pte)
                        goto nomem;
                src_pte = huge_pte_offset(src, addr);
                BUG_ON(!src_pte || pte_none(*src_pte)); /* prefaulted */
                entry = *src_pte;
                ptepage = pte_page(entry);
                get_page(ptepage);
                add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE);
                set_huge_pte_at(dst, addr, dst_pte, entry);
                addr += HPAGE_SIZE;
        }
        return 0;

nomem:
        return -ENOMEM;
}

void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
                          unsigned long end)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned long address;
        pte_t *ptep;
        pte_t pte;
        struct page *page;

        WARN_ON(!is_vm_hugetlb_page(vma));
        BUG_ON(start & ~HPAGE_MASK);
        BUG_ON(end & ~HPAGE_MASK);

        for (address = start; address < end; address += HPAGE_SIZE) {
                ptep = huge_pte_offset(mm, address);
                if (!ptep)
                        /* This can happen on truncate, or if an
                         * mmap() is aborted due to an error before
                         * the prefault */
                        continue;

                pte = huge_ptep_get_and_clear(mm, address, ptep);
                if (pte_none(pte))
                        continue;

                page = pte_page(pte);
                put_page(page);
        }
        add_mm_counter(mm, rss, -((end - start) >> PAGE_SHIFT));
        flush_tlb_range(vma, start, end);
}

void zap_hugepage_range(struct vm_area_struct *vma,
                        unsigned long start, unsigned long length)
{
        struct mm_struct *mm = vma->vm_mm;

        spin_lock(&mm->page_table_lock);
        unmap_hugepage_range(vma, start, start + length);
        spin_unlock(&mm->page_table_lock);
}

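/*
 * Prefault the entire VMA: each huge page is looked up in (or added to)
 * the backing file's page cache and mapped here, so no hugetlb faults are
 * taken later.  The filesystem quota is charged before a page is taken
 * from the pool, and the page-cache insertion uses GFP_ATOMIC because
 * mm->page_table_lock is held for the whole walk.
 */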
int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
{
        struct mm_struct *mm = current->mm;
        unsigned long addr;
        int ret = 0;

        WARN_ON(!is_vm_hugetlb_page(vma));
        BUG_ON(vma->vm_start & ~HPAGE_MASK);
        BUG_ON(vma->vm_end & ~HPAGE_MASK);

        hugetlb_prefault_arch_hook(mm);

        spin_lock(&mm->page_table_lock);
        for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
                unsigned long idx;
                pte_t *pte = huge_pte_alloc(mm, addr);
                struct page *page;

                if (!pte) {
                        ret = -ENOMEM;
                        goto out;
                }

                idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
                        + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
                page = find_get_page(mapping, idx);
                if (!page) {
                        /* charge the fs quota first */
                        if (hugetlb_get_quota(mapping)) {
                                ret = -ENOMEM;
                                goto out;
                        }
                        page = alloc_huge_page();
                        if (!page) {
                                hugetlb_put_quota(mapping);
                                ret = -ENOMEM;
                                goto out;
                        }
                        ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
                        if (!ret) {
                                unlock_page(page);
                        } else {
                                hugetlb_put_quota(mapping);
                                free_huge_page(page);
                                goto out;
                        }
                }
                add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE);
                set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page));
        }
out:
        spin_unlock(&mm->page_table_lock);
        return ret;
}

int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                        struct page **pages, struct vm_area_struct **vmas,
                        unsigned long *position, int *length, int i)
{
        unsigned long vpfn, vaddr = *position;
        int remainder = *length;

        BUG_ON(!is_vm_hugetlb_page(vma));

        vpfn = vaddr/PAGE_SIZE;
        while (vaddr < vma->vm_end && remainder) {

                if (pages) {
                        pte_t *pte;
                        struct page *page;

                        /* Some archs (sparc64, sh*) have multiple
                         * pte_ts to each hugepage.  We have to make
                         * sure we get the first, for the page
                         * indexing below to work. */
                        pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);

                        /* hugetlb should be locked, and hence, prefaulted */
                        WARN_ON(!pte || pte_none(*pte));

                        page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];

                        WARN_ON(!PageCompound(page));

                        get_page(page);
                        pages[i] = page;
                }

                if (vmas)
                        vmas[i] = vma;

                vaddr += PAGE_SIZE;
                ++vpfn;
                --remainder;
                ++i;
        }

        *length = remainder;
        *position = vaddr;

        return i;
}

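/*
 * Illustrative userspace sketch (not part of this file; it assumes a
 * hugetlbfs mount at /mnt/huge and a length that is a multiple of the
 * huge page size):
 *
 *        int fd = open("/mnt/huge/map", O_CREAT | O_RDWR, 0600);
 *        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 * hugetlbfs mmap() prefaults the whole mapping through hugetlb_prefault()
 * above, drawing pages from the pool sized by the "hugepages=" boot
 * parameter or the nr_hugepages sysctl; pool usage is reported via
 * /proc/meminfo (HugePages_Total, HugePages_Free, Hugepagesize).
 */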