xref: /linux/mm/hugetlb.c (revision 9ce7677cfd7cd871adb457c80bea3b581b839641)
/*
 * Generic hugetlb support.
 * (C) William Irwin, April 2004
 */
#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <asm/page.h>
#include <asm/pgtable.h>

#include <linux/hugetlb.h>

const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
static unsigned long nr_huge_pages, free_huge_pages;
unsigned long max_huge_pages;
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
static unsigned int free_huge_pages_node[MAX_NUMNODES];

/*
 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
 */
static DEFINE_SPINLOCK(hugetlb_lock);

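/*
 * Add a huge page to its node's free list and bump the free-page
 * counters.  Caller must hold hugetlb_lock.
 */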
static void enqueue_huge_page(struct page *page)
{
	int nid = page_to_nid(page);
	list_add(&page->lru, &hugepage_freelists[nid]);
	free_huge_pages++;
	free_huge_pages_node[nid]++;
}

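/*
 * Take a page off the free lists, preferring the local NUMA node and
 * falling back to the first node that has free huge pages.  Caller
 * must hold hugetlb_lock.  Returns NULL if the pool is empty.
 */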
static struct page *dequeue_huge_page(void)
{
	int nid = numa_node_id();
	struct page *page = NULL;

	if (list_empty(&hugepage_freelists[nid])) {
		for (nid = 0; nid < MAX_NUMNODES; ++nid)
			if (!list_empty(&hugepage_freelists[nid]))
				break;
	}
	if (nid >= 0 && nid < MAX_NUMNODES &&
	    !list_empty(&hugepage_freelists[nid])) {
		page = list_entry(hugepage_freelists[nid].next,
				  struct page, lru);
		list_del(&page->lru);
		free_huge_pages--;
		free_huge_pages_node[nid]--;
	}
	return page;
}

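/*
 * Allocate a fresh huge page from the buddy allocator, round-robining
 * the allocation across nodes, and account for it in the pool totals.
 */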
static struct page *alloc_fresh_huge_page(void)
{
	static int nid = 0;
	struct page *page;
	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
					HUGETLB_PAGE_ORDER);
	nid = (nid + 1) % num_online_nodes();
	if (page) {
		spin_lock(&hugetlb_lock);
		nr_huge_pages++;
		nr_huge_pages_node[page_to_nid(page)]++;
		spin_unlock(&hugetlb_lock);
	}
	return page;
}

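/*
 * Destructor for huge pages: alloc_huge_page() stashes this function
 * pointer in page[1].mapping so that it runs when the last reference
 * is dropped, returning the page to the free lists.
 */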
void free_huge_page(struct page *page)
{
	BUG_ON(page_count(page));

	INIT_LIST_HEAD(&page->lru);
	page[1].mapping = NULL;

	spin_lock(&hugetlb_lock);
	enqueue_huge_page(page);
	spin_unlock(&hugetlb_lock);
}

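/*
 * Hand out a zeroed huge page from the free pool, installing
 * free_huge_page() as its destructor.  Returns NULL if the pool is
 * exhausted.
 */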
struct page *alloc_huge_page(void)
{
	struct page *page;
	int i;

	spin_lock(&hugetlb_lock);
	page = dequeue_huge_page();
	if (!page) {
		spin_unlock(&hugetlb_lock);
		return NULL;
	}
	spin_unlock(&hugetlb_lock);
	set_page_count(page, 1);
	page[1].mapping = (void *)free_huge_page;
	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
		clear_highpage(&page[i]);
	return page;
}

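/*
 * Boot-time initialisation: set up the per-node free lists and try to
 * allocate the pool requested with the hugepages= parameter.
 */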
static int __init hugetlb_init(void)
{
	unsigned long i;
	struct page *page;

	if (HPAGE_SHIFT == 0)
		return 0;

	for (i = 0; i < MAX_NUMNODES; ++i)
		INIT_LIST_HEAD(&hugepage_freelists[i]);

	for (i = 0; i < max_huge_pages; ++i) {
		page = alloc_fresh_huge_page();
		if (!page)
			break;
		spin_lock(&hugetlb_lock);
		enqueue_huge_page(page);
		spin_unlock(&hugetlb_lock);
	}
	max_huge_pages = free_huge_pages = nr_huge_pages = i;
	printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
	return 0;
}
module_init(hugetlb_init);

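/*
 * Parse the hugepages= boot parameter into max_huge_pages.
 */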
static int __init hugetlb_setup(char *s)
{
	if (sscanf(s, "%lu", &max_huge_pages) <= 0)
		max_huge_pages = 0;
	return 1;
}
__setup("hugepages=", hugetlb_setup);

#ifdef CONFIG_SYSCTL
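/*
 * Drop a huge page from the pool accounting, clear the page flags and
 * counts on its constituent pages, and give it back to the buddy
 * allocator.  Caller must hold hugetlb_lock.
 */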
static void update_and_free_page(struct page *page)
{
	int i;
	nr_huge_pages--;
	nr_huge_pages_node[page_zone(page)->zone_pgdat->node_id]--;
	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
				1 << PG_private | 1 << PG_writeback);
		set_page_count(&page[i], 0);
	}
	set_page_count(page, 1);
	__free_pages(page, HUGETLB_PAGE_ORDER);
}

#ifdef CONFIG_HIGHMEM
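/*
 * When shrinking the pool, free huge pages backed by lowmem first,
 * since lowmem is the scarcer resource; stop once the pool is down to
 * 'count' pages.  Caller must hold hugetlb_lock.
 */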
static void try_to_free_low(unsigned long count)
{
	int i, nid;
	for (i = 0; i < MAX_NUMNODES; ++i) {
		struct page *page, *next;
		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
			if (PageHighMem(page))
				continue;
			list_del(&page->lru);
			update_and_free_page(page);
			nid = page_zone(page)->zone_pgdat->node_id;
			free_huge_pages--;
			free_huge_pages_node[nid]--;
			if (count >= nr_huge_pages)
				return;
		}
	}
}
#else
static inline void try_to_free_low(unsigned long count)
{
}
#endif

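/*
 * Grow or shrink the huge page pool towards 'count' pages.  Growing can
 * stop early if the allocator runs out of memory; shrinking can stop
 * early once the free lists are empty (in-use pages are not reclaimed).
 * Returns the resulting pool size.
 */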
static unsigned long set_max_huge_pages(unsigned long count)
{
	while (count > nr_huge_pages) {
		struct page *page = alloc_fresh_huge_page();
		if (!page)
			return nr_huge_pages;
		spin_lock(&hugetlb_lock);
		enqueue_huge_page(page);
		spin_unlock(&hugetlb_lock);
	}
	if (count >= nr_huge_pages)
		return nr_huge_pages;

	spin_lock(&hugetlb_lock);
	try_to_free_low(count);
	while (count < nr_huge_pages) {
		struct page *page = dequeue_huge_page();
		if (!page)
			break;
		update_and_free_page(page);
	}
	spin_unlock(&hugetlb_lock);
	return nr_huge_pages;
}

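/*
 * Sysctl handler for vm.nr_hugepages: read the requested pool size from
 * userspace and resize the pool accordingly.
 */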
int hugetlb_sysctl_handler(struct ctl_table *table, int write,
			   struct file *file, void __user *buffer,
			   size_t *length, loff_t *ppos)
{
	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
	max_huge_pages = set_max_huge_pages(max_huge_pages);
	return 0;
}
#endif /* CONFIG_SYSCTL */

int hugetlb_report_meminfo(char *buf)
{
	return sprintf(buf,
			"HugePages_Total: %5lu\n"
			"HugePages_Free:  %5lu\n"
			"Hugepagesize:    %5lu kB\n",
			nr_huge_pages,
			free_huge_pages,
			HPAGE_SIZE/1024);
}

int hugetlb_report_node_meminfo(int nid, char *buf)
{
	return sprintf(buf,
		"Node %d HugePages_Total: %5u\n"
		"Node %d HugePages_Free:  %5u\n",
		nid, nr_huge_pages_node[nid],
		nid, free_huge_pages_node[nid]);
}

int is_hugepage_mem_enough(size_t size)
{
	return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages;
}

/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
	return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
}

/*
 * We cannot handle pagefaults against hugetlb pages at all.  They cause
 * handle_mm_fault() to try to instantiate regular-sized pages in the
 * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we
 * get this far.
 */
static struct page *hugetlb_nopage(struct vm_area_struct *vma,
				unsigned long address, int *unused)
{
	BUG();
	return NULL;
}

struct vm_operations_struct hugetlb_vm_ops = {
	.nopage = hugetlb_nopage,
};

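/*
 * Build the huge-page PTE for 'page': writable and dirty for VM_WRITE
 * mappings, write-protected otherwise, and always young and huge.
 */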
static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page)
{
	pte_t entry;

	if (vma->vm_flags & VM_WRITE) {
		entry =
		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
	} else {
		entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
	}
	entry = pte_mkyoung(entry);
	entry = pte_mkhuge(entry);

	return entry;
}

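/*
 * Duplicate the huge-page mappings of a VMA from the parent mm into the
 * child at fork time.  The underlying huge pages are shared: each one
 * gets an extra reference and is accounted in the child's RSS.
 */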
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
			    struct vm_area_struct *vma)
{
	pte_t *src_pte, *dst_pte, entry;
	struct page *ptepage;
	unsigned long addr;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
		src_pte = huge_pte_offset(src, addr);
		if (!src_pte)
			continue;
		dst_pte = huge_pte_alloc(dst, addr);
		if (!dst_pte)
			goto nomem;
		spin_lock(&dst->page_table_lock);
		spin_lock(&src->page_table_lock);
		if (!pte_none(*src_pte)) {
			entry = *src_pte;
			ptepage = pte_page(entry);
			get_page(ptepage);
			add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE);
			set_huge_pte_at(dst, addr, dst_pte, entry);
		}
		spin_unlock(&src->page_table_lock);
		spin_unlock(&dst->page_table_lock);
	}
	return 0;

nomem:
	return -ENOMEM;
}

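/*
 * Tear down the huge-page mappings in [start, end), dropping the page
 * references and the RSS accounting.  start and end must be huge-page
 * aligned.
 */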
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			  unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address;
	pte_t *ptep;
	pte_t pte;
	struct page *page;

	WARN_ON(!is_vm_hugetlb_page(vma));
	BUG_ON(start & ~HPAGE_MASK);
	BUG_ON(end & ~HPAGE_MASK);

	spin_lock(&mm->page_table_lock);

	/* Update high watermark before we lower rss */
	update_hiwater_rss(mm);

	for (address = start; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;

		pte = huge_ptep_get_and_clear(mm, address, ptep);
		if (pte_none(pte))
			continue;

		page = pte_page(pte);
		put_page(page);
		add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE));
	}

	spin_unlock(&mm->page_table_lock);
	flush_tlb_range(vma, start, end);
}

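/*
 * Find the huge page at index 'idx' in the page cache, allocating and
 * inserting a new one if it is not present.  Returns the page locked,
 * or NULL if the file was truncated, the quota is exhausted, or no huge
 * page is available.
 */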
static struct page *find_lock_huge_page(struct address_space *mapping,
			unsigned long idx)
{
	struct page *page;
	int err;
	struct inode *inode = mapping->host;
	unsigned long size;

retry:
	page = find_lock_page(mapping, idx);
	if (page)
		goto out;

	/* Check to make sure the mapping hasn't been truncated */
	size = i_size_read(inode) >> HPAGE_SHIFT;
	if (idx >= size)
		goto out;

	if (hugetlb_get_quota(mapping))
		goto out;
	page = alloc_huge_page();
	if (!page) {
		hugetlb_put_quota(mapping);
		goto out;
	}

	err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
	if (err) {
		put_page(page);
		hugetlb_put_quota(mapping);
		if (err == -EEXIST)
			goto retry;
		page = NULL;
	}
out:
	return page;
}

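/*
 * Fault handler for hugetlb VMAs: look up (or allocate) the backing huge
 * page for the faulting address and install a huge PTE for it.  Returns
 * VM_FAULT_MINOR on success and VM_FAULT_SIGBUS if the index is beyond
 * i_size or no page could be found.
 */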
int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, int write_access)
{
	int ret = VM_FAULT_SIGBUS;
	unsigned long idx;
	unsigned long size;
	pte_t *pte;
	struct page *page;
	struct address_space *mapping;

	pte = huge_pte_alloc(mm, address);
	if (!pte)
		goto out;

	mapping = vma->vm_file->f_mapping;
	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));

	/*
	 * Use page lock to guard against racing truncation
	 * before we get page_table_lock.
	 */
	page = find_lock_huge_page(mapping, idx);
	if (!page)
		goto out;

	spin_lock(&mm->page_table_lock);
	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
	if (idx >= size)
		goto backout;

	ret = VM_FAULT_MINOR;
	if (!pte_none(*pte))
		goto backout;

	add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
	set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page));
	spin_unlock(&mm->page_table_lock);
	unlock_page(page);
out:
	return ret;

backout:
	spin_unlock(&mm->page_table_lock);
	hugetlb_put_quota(mapping);
	unlock_page(page);
	put_page(page);
	goto out;
}

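/*
 * get_user_pages() helper for hugetlb VMAs: walk the requested range,
 * faulting in huge pages where necessary, and fill in pages[] and
 * vmas[] at small-page granularity.  Returns the updated page count,
 * or -EFAULT if nothing could be gathered.
 */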
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
			struct page **pages, struct vm_area_struct **vmas,
			unsigned long *position, int *length, int i)
{
	unsigned long vpfn, vaddr = *position;
	int remainder = *length;

	vpfn = vaddr/PAGE_SIZE;
	spin_lock(&mm->page_table_lock);
	while (vaddr < vma->vm_end && remainder) {
		pte_t *pte;
		struct page *page;

		/*
		 * Some archs (sparc64, sh*) have multiple pte_ts to
		 * each hugepage.  We have to make sure we get the
		 * first, for the page indexing below to work.
		 */
		pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);

		if (!pte || pte_none(*pte)) {
			int ret;

			spin_unlock(&mm->page_table_lock);
			ret = hugetlb_fault(mm, vma, vaddr, 0);
			spin_lock(&mm->page_table_lock);
			if (ret == VM_FAULT_MINOR)
				continue;

			remainder = 0;
			if (!i)
				i = -EFAULT;
			break;
		}

		if (pages) {
			page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
			get_page(page);
			pages[i] = page;
		}

		if (vmas)
			vmas[i] = vma;

		vaddr += PAGE_SIZE;
		++vpfn;
		--remainder;
		++i;
	}
	spin_unlock(&mm->page_table_lock);
	*length = remainder;
	*position = vaddr;

	return i;
}