xref: /linux/mm/hugetlb.c (revision f3d9478b2ce468c3115b02ecae7e975990697f15)
1 /*
2  * Generic hugetlb support.
3  * (C) William Irwin, April 2004
4  */
5 #include <linux/gfp.h>
6 #include <linux/list.h>
7 #include <linux/init.h>
8 #include <linux/module.h>
9 #include <linux/mm.h>
10 #include <linux/sysctl.h>
11 #include <linux/highmem.h>
12 #include <linux/nodemask.h>
13 #include <linux/pagemap.h>
14 #include <linux/mempolicy.h>
15 #include <linux/cpuset.h>
16 #include <linux/mutex.h>
17 
18 #include <asm/page.h>
19 #include <asm/pgtable.h>
20 
21 #include <linux/hugetlb.h>
22 #include "internal.h"
23 
24 const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
25 static unsigned long nr_huge_pages, free_huge_pages, reserved_huge_pages;
26 unsigned long max_huge_pages;
27 static struct list_head hugepage_freelists[MAX_NUMNODES];
28 static unsigned int nr_huge_pages_node[MAX_NUMNODES];
29 static unsigned int free_huge_pages_node[MAX_NUMNODES];
30 /*
31  * Protects updates to hugepage_freelists, nr_huge_pages, free_huge_pages
32  * and reserved_huge_pages */
33 static DEFINE_SPINLOCK(hugetlb_lock);
34 
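/*
 * clear_huge_page() and copy_huge_page() work on a hugepage one base page
 * at a time, calling cond_resched() between base pages since clearing or
 * copying HPAGE_SIZE of memory can take a while.
 */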
35 static void clear_huge_page(struct page *page, unsigned long addr)
36 {
37 	int i;
38 
39 	might_sleep();
40 	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
41 		cond_resched();
42 		clear_user_highpage(page + i, addr);
43 	}
44 }
45 
46 static void copy_huge_page(struct page *dst, struct page *src,
47 			   unsigned long addr)
48 {
49 	int i;
50 
51 	might_sleep();
52 	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
53 		cond_resched();
54 		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE);
55 	}
56 }
57 
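/*
 * Return a hugepage to the appropriate per-node free list and update the
 * free page counters.  Caller must hold hugetlb_lock.
 */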
58 static void enqueue_huge_page(struct page *page)
59 {
60 	int nid = page_to_nid(page);
61 	list_add(&page->lru, &hugepage_freelists[nid]);
62 	free_huge_pages++;
63 	free_huge_pages_node[nid]++;
64 }
65 
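/*
 * Take a hugepage off a free list, preferring nodes in zonelist order for
 * the faulting VMA/address (NUMA policy aware via huge_zonelist()) and
 * skipping zones the current task's cpuset does not allow.  Called with
 * hugetlb_lock held; returns NULL if no suitable page is free.
 */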
66 static struct page *dequeue_huge_page(struct vm_area_struct *vma,
67 				unsigned long address)
68 {
69 	int nid = numa_node_id();
70 	struct page *page = NULL;
71 	struct zonelist *zonelist = huge_zonelist(vma, address);
72 	struct zone **z;
73 
74 	for (z = zonelist->zones; *z; z++) {
75 		nid = (*z)->zone_pgdat->node_id;
76 		if (cpuset_zone_allowed(*z, GFP_HIGHUSER) &&
77 		    !list_empty(&hugepage_freelists[nid]))
78 			break;
79 	}
80 
81 	if (*z) {
82 		page = list_entry(hugepage_freelists[nid].next,
83 				  struct page, lru);
84 		list_del(&page->lru);
85 		free_huge_pages--;
86 		free_huge_pages_node[nid]--;
87 	}
88 	return page;
89 }
90 
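/*
 * Compound page destructor, installed in page[1].lru.next by
 * alloc_fresh_huge_page().  Invoked when the last reference to a hugepage
 * is dropped; instead of going back to the buddy allocator the page is
 * returned to the hugepage free pool.
 */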
91 static void free_huge_page(struct page *page)
92 {
93 	BUG_ON(page_count(page));
94 
95 	INIT_LIST_HEAD(&page->lru);
96 
97 	spin_lock(&hugetlb_lock);
98 	enqueue_huge_page(page);
99 	spin_unlock(&hugetlb_lock);
100 }
101 
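/*
 * Allocate one fresh hugepage from the buddy allocator, round-robining
 * across the online nodes.  The free_huge_page() destructor is stashed in
 * page[1].lru.next, so the put_page() in this function drops the initial
 * reference and lands the page in the hugepage free pool rather than back
 * in the buddy allocator.
 */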
102 static int alloc_fresh_huge_page(void)
103 {
104 	static int nid = 0;
105 	struct page *page;
106 	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
107 					HUGETLB_PAGE_ORDER);
108 	nid = next_node(nid, node_online_map);
109 	if (nid == MAX_NUMNODES)
110 		nid = first_node(node_online_map);
111 	if (page) {
112 		page[1].lru.next = (void *)free_huge_page;	/* dtor */
113 		spin_lock(&hugetlb_lock);
114 		nr_huge_pages++;
115 		nr_huge_pages_node[page_to_nid(page)]++;
116 		spin_unlock(&hugetlb_lock);
117 		put_page(page); /* free it into the hugepage allocator */
118 		return 1;
119 	}
120 	return 0;
121 }
122 
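/*
 * Allocate a hugepage to back a fault in the given VMA.  For shared
 * mappings within the inode's prereserved window the page is drawn from
 * the reserved pool; otherwise the allocation only succeeds while the
 * free pool exceeds the outstanding reservations.
 */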
123 static struct page *alloc_huge_page(struct vm_area_struct *vma,
124 				    unsigned long addr)
125 {
126 	struct inode *inode = vma->vm_file->f_dentry->d_inode;
127 	struct page *page;
128 	int use_reserve = 0;
129 	unsigned long idx;
130 
131 	spin_lock(&hugetlb_lock);
132 
133 	if (vma->vm_flags & VM_MAYSHARE) {
134 
135 		/* idx = radix tree index, i.e. offset into file in
136 		 * HPAGE_SIZE units */
137 		idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
138 			+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
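		/* e.g. with 2MB hugepages (HPAGE_SHIFT == 21, PAGE_SHIFT == 12),
		 * a fault at vm_start + 4MB in a mapping with vm_pgoff == 0
		 * yields idx == 2, the third hugepage of the file. */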
139 
140 		/* The hugetlbfs specific inode info stores the number
141 		 * of "guaranteed available" (huge) pages.  That is,
142 		 * the first 'prereserved_hpages' pages of the inode
143 		 * are either already instantiated, or have been
144 		 * pre-reserved (by hugetlb_reserve_for_inode()). Here
145 		 * we're in the process of instantiating the page, so
146 		 * we use this to determine whether to draw from the
147 		 * pre-reserved pool or the truly free pool. */
148 		if (idx < HUGETLBFS_I(inode)->prereserved_hpages)
149 			use_reserve = 1;
150 	}
151 
152 	if (!use_reserve) {
153 		if (free_huge_pages <= reserved_huge_pages)
154 			goto fail;
155 	} else {
156 		BUG_ON(reserved_huge_pages == 0);
157 		reserved_huge_pages--;
158 	}
159 
160 	page = dequeue_huge_page(vma, addr);
161 	if (!page)
162 		goto fail;
163 
164 	spin_unlock(&hugetlb_lock);
165 	set_page_refcounted(page);
166 	return page;
167 
168  fail:
169 	WARN_ON(use_reserve); /* reserved allocations shouldn't fail */
170 	spin_unlock(&hugetlb_lock);
171 	return NULL;
172 }
173 
174 /* hugetlb_extend_reservation()
175  *
176  * Ensure that at least 'atleast' hugepages are, and will remain,
177  * available to instantiate the first 'atleast' pages of the given
178  * inode.  If the inode doesn't already have this many pages reserved
179  * or instantiated, set aside some hugepages in the reserved pool to
180  * satisfy later faults (or fail now if there aren't enough, rather
181  * than getting the SIGBUS later).
182  */
183 int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info,
184 			       unsigned long atleast)
185 {
186 	struct inode *inode = &info->vfs_inode;
187 	unsigned long change_in_reserve = 0;
188 	int ret = 0;
189 
190 	spin_lock(&hugetlb_lock);
191 	read_lock_irq(&inode->i_mapping->tree_lock);
192 
193 	if (info->prereserved_hpages >= atleast)
194 		goto out;
195 
196 	/* Because we always call this on shared mappings, none of the
197 	 * pages beyond info->prereserved_hpages can have been
198 	 * instantiated, so we need to reserve all of them now. */
199 	change_in_reserve = atleast - info->prereserved_hpages;
200 
201 	if ((reserved_huge_pages + change_in_reserve) > free_huge_pages) {
202 		ret = -ENOMEM;
203 		goto out;
204 	}
205 
206 	reserved_huge_pages += change_in_reserve;
207 	info->prereserved_hpages = atleast;
208 
209  out:
210 	read_unlock_irq(&inode->i_mapping->tree_lock);
211 	spin_unlock(&hugetlb_lock);
212 
213 	return ret;
214 }
215 
216 /* hugetlb_truncate_reservation()
217  *
218  * This returns pages reserved for the given inode to the general free
219  * hugepage pool.  If the inode has any pages prereserved, but not
220  * instantiated, beyond offset (atmost << HPAGE_SIZE), then release
221  * instantiated, beyond offset (atmost << HPAGE_SHIFT), then release
222  */
223 void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info,
224 				  unsigned long atmost)
225 {
226 	struct inode *inode = &info->vfs_inode;
227 	struct address_space *mapping = inode->i_mapping;
228 	unsigned long idx;
229 	unsigned long change_in_reserve = 0;
230 	struct page *page;
231 
232 	spin_lock(&hugetlb_lock);
233 	read_lock_irq(&inode->i_mapping->tree_lock);
234 
235 	if (info->prereserved_hpages <= atmost)
236 		goto out;
237 
238 	/* Count pages which were reserved, but not instantiated, and
239 	 * which we can now release. */
240 	for (idx = atmost; idx < info->prereserved_hpages; idx++) {
241 		page = radix_tree_lookup(&mapping->page_tree, idx);
242 		if (!page)
243 			/* Pages which are already instantiated can't
244 			 * be unreserved (and in fact have already
245 			 * been removed from the reserved pool) */
246 			change_in_reserve++;
247 	}
248 
249 	BUG_ON(reserved_huge_pages < change_in_reserve);
250 	reserved_huge_pages -= change_in_reserve;
251 	info->prereserved_hpages = atmost;
252 
253  out:
254 	read_unlock_irq(&inode->i_mapping->tree_lock);
255 	spin_unlock(&hugetlb_lock);
256 }
257 
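/*
 * Boot-time initialisation: set up the per-node free lists and
 * pre-allocate max_huge_pages hugepages (as requested with the
 * "hugepages=" boot parameter parsed below).  Does nothing if the
 * architecture reports no hugepage support (HPAGE_SHIFT == 0).
 */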
258 static int __init hugetlb_init(void)
259 {
260 	unsigned long i;
261 
262 	if (HPAGE_SHIFT == 0)
263 		return 0;
264 
265 	for (i = 0; i < MAX_NUMNODES; ++i)
266 		INIT_LIST_HEAD(&hugepage_freelists[i]);
267 
268 	for (i = 0; i < max_huge_pages; ++i) {
269 		if (!alloc_fresh_huge_page())
270 			break;
271 	}
272 	max_huge_pages = free_huge_pages = nr_huge_pages = i;
273 	printk(KERN_INFO "Total HugeTLB memory allocated, %lu\n", free_huge_pages);
274 	return 0;
275 }
276 module_init(hugetlb_init);
277 
278 static int __init hugetlb_setup(char *s)
279 {
280 	if (sscanf(s, "%lu", &max_huge_pages) <= 0)
281 		max_huge_pages = 0;
282 	return 1;
283 }
284 __setup("hugepages=", hugetlb_setup);
285 
286 #ifdef CONFIG_SYSCTL
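/*
 * Hand a hugepage back to the buddy allocator: clear the compound
 * destructor, scrub the per-page flags, reinitialise the refcount and
 * free the whole HUGETLB_PAGE_ORDER block.  Called with hugetlb_lock held.
 */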
287 static void update_and_free_page(struct page *page)
288 {
289 	int i;
290 	nr_huge_pages--;
291 	nr_huge_pages_node[page_zone(page)->zone_pgdat->node_id]--;
292 	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
293 		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
294 				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
295 				1 << PG_private | 1<< PG_writeback);
296 	}
297 	page[1].lru.next = NULL;
298 	set_page_refcounted(page);
299 	__free_pages(page, HUGETLB_PAGE_ORDER);
300 }
301 
302 #ifdef CONFIG_HIGHMEM
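/*
 * When the pool is shrunk, release lowmem hugepages first: lowmem is the
 * scarcer resource for the rest of the kernel, so prefer to keep the
 * remaining hugepage pool in highmem.
 */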
303 static void try_to_free_low(unsigned long count)
304 {
305 	int i, nid;
306 	for (i = 0; i < MAX_NUMNODES; ++i) {
307 		struct page *page, *next;
308 		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
309 			if (PageHighMem(page))
310 				continue;
311 			list_del(&page->lru);
312 			update_and_free_page(page);
313 			nid = page_zone(page)->zone_pgdat->node_id;
314 			free_huge_pages--;
315 			free_huge_pages_node[nid]--;
316 			if (count >= nr_huge_pages)
317 				return;
318 		}
319 	}
320 }
321 #else
322 static inline void try_to_free_low(unsigned long count)
323 {
324 }
325 #endif
326 
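/*
 * Resize the hugepage pool to 'count' pages.  Growing allocates fresh
 * hugepages from the buddy allocator; shrinking frees unused pages back
 * to it (lowmem first on HIGHMEM systems, via try_to_free_low()), but
 * never below the number of reserved pages and never touching pages that
 * are currently in use.  Returns the resulting pool size, which may be
 * less than requested.
 */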
327 static unsigned long set_max_huge_pages(unsigned long count)
328 {
329 	while (count > nr_huge_pages) {
330 		if (!alloc_fresh_huge_page())
331 			return nr_huge_pages;
332 	}
333 	if (count >= nr_huge_pages)
334 		return nr_huge_pages;
335 
336 	spin_lock(&hugetlb_lock);
337 	count = max(count, reserved_huge_pages);
338 	try_to_free_low(count);
339 	while (count < nr_huge_pages) {
340 		struct page *page = dequeue_huge_page(NULL, 0);
341 		if (!page)
342 			break;
343 		update_and_free_page(page);
344 	}
345 	spin_unlock(&hugetlb_lock);
346 	return nr_huge_pages;
347 }
348 
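/*
 * Sysctl handler for nr_hugepages: parse the new value into
 * max_huge_pages and resize the pool accordingly.
 */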
349 int hugetlb_sysctl_handler(struct ctl_table *table, int write,
350 			   struct file *file, void __user *buffer,
351 			   size_t *length, loff_t *ppos)
352 {
353 	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
354 	max_huge_pages = set_max_huge_pages(max_huge_pages);
355 	return 0;
356 }
357 #endif /* CONFIG_SYSCTL */
358 
359 int hugetlb_report_meminfo(char *buf)
360 {
361 	return sprintf(buf,
362 			"HugePages_Total: %5lu\n"
363 			"HugePages_Free:  %5lu\n"
364 		        "HugePages_Rsvd:  %5lu\n"
365 			"Hugepagesize:    %5lu kB\n",
366 			nr_huge_pages,
367 			free_huge_pages,
368 		        reserved_huge_pages,
369 			HPAGE_SIZE/1024);
370 }
371 
372 int hugetlb_report_node_meminfo(int nid, char *buf)
373 {
374 	return sprintf(buf,
375 		"Node %d HugePages_Total: %5u\n"
376 		"Node %d HugePages_Free:  %5u\n",
377 		nid, nr_huge_pages_node[nid],
378 		nid, free_huge_pages_node[nid]);
379 }
380 
381 /* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
382 unsigned long hugetlb_total_pages(void)
383 {
384 	return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
385 }
386 
387 /*
388  * We cannot handle pagefaults against hugetlb pages at all.  They cause
389  * handle_mm_fault() to try to instantiate regular-sized pages in the
390  * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we get
391  * this far.
392  */
393 static struct page *hugetlb_nopage(struct vm_area_struct *vma,
394 				unsigned long address, int *unused)
395 {
396 	BUG();
397 	return NULL;
398 }
399 
400 struct vm_operations_struct hugetlb_vm_ops = {
401 	.nopage = hugetlb_nopage,
402 };
403 
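/*
 * Construct the huge PTE for a newly mapped page: writable VMAs get a
 * writable, dirty entry, read-only ones a write-protected entry.  The
 * entry is always marked young and huge.
 */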
404 static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
405 				int writable)
406 {
407 	pte_t entry;
408 
409 	if (writable) {
410 		entry =
411 		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
412 	} else {
413 		entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
414 	}
415 	entry = pte_mkyoung(entry);
416 	entry = pte_mkhuge(entry);
417 
418 	return entry;
419 }
420 
421 static void set_huge_ptep_writable(struct vm_area_struct *vma,
422 				   unsigned long address, pte_t *ptep)
423 {
424 	pte_t entry;
425 
426 	entry = pte_mkwrite(pte_mkdirty(*ptep));
427 	ptep_set_access_flags(vma, address, ptep, entry, 1);
428 	update_mmu_cache(vma, address, entry);
429 	lazy_mmu_prot_update(entry);
430 }
431 
432 
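/*
 * Called at fork() time to copy the hugepage mappings from the parent to
 * the child.  For private (COW) mappings the parent's entries are
 * write-protected as well, so the next write by either side triggers
 * hugetlb_cow().
 */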
433 int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
434 			    struct vm_area_struct *vma)
435 {
436 	pte_t *src_pte, *dst_pte, entry;
437 	struct page *ptepage;
438 	unsigned long addr;
439 	int cow;
440 
441 	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
442 
443 	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
444 		src_pte = huge_pte_offset(src, addr);
445 		if (!src_pte)
446 			continue;
447 		dst_pte = huge_pte_alloc(dst, addr);
448 		if (!dst_pte)
449 			goto nomem;
450 		spin_lock(&dst->page_table_lock);
451 		spin_lock(&src->page_table_lock);
452 		if (!pte_none(*src_pte)) {
453 			if (cow)
454 				ptep_set_wrprotect(src, addr, src_pte);
455 			entry = *src_pte;
456 			ptepage = pte_page(entry);
457 			get_page(ptepage);
458 			add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE);
459 			set_huge_pte_at(dst, addr, dst_pte, entry);
460 		}
461 		spin_unlock(&src->page_table_lock);
462 		spin_unlock(&dst->page_table_lock);
463 	}
464 	return 0;
465 
466 nomem:
467 	return -ENOMEM;
468 }
469 
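/*
 * Tear down all huge PTEs in [start, end): clear each entry, drop the
 * page reference and the rss accounting, then flush the TLB for the
 * range.  start and end must be hugepage aligned.
 */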
470 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
471 			  unsigned long end)
472 {
473 	struct mm_struct *mm = vma->vm_mm;
474 	unsigned long address;
475 	pte_t *ptep;
476 	pte_t pte;
477 	struct page *page;
478 
479 	WARN_ON(!is_vm_hugetlb_page(vma));
480 	BUG_ON(start & ~HPAGE_MASK);
481 	BUG_ON(end & ~HPAGE_MASK);
482 
483 	spin_lock(&mm->page_table_lock);
484 
485 	/* Update high watermark before we lower rss */
486 	update_hiwater_rss(mm);
487 
488 	for (address = start; address < end; address += HPAGE_SIZE) {
489 		ptep = huge_pte_offset(mm, address);
490 		if (!ptep)
491 			continue;
492 
493 		pte = huge_ptep_get_and_clear(mm, address, ptep);
494 		if (pte_none(pte))
495 			continue;
496 
497 		page = pte_page(pte);
498 		put_page(page);
499 		add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE));
500 	}
501 
502 	spin_unlock(&mm->page_table_lock);
503 	flush_tlb_range(vma, start, end);
504 }
505 
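/*
 * Handle a write fault on a present but read-only huge PTE.  If we are
 * the only user of the page, just make the PTE writable; otherwise
 * allocate a new hugepage, copy the data (dropping page_table_lock around
 * the copy), and swap it in if the PTE has not changed in the meantime.
 * Called with page_table_lock held.
 */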
506 static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
507 			unsigned long address, pte_t *ptep, pte_t pte)
508 {
509 	struct page *old_page, *new_page;
510 	int avoidcopy;
511 
512 	old_page = pte_page(pte);
513 
514 	/* If no-one else is actually using this page, avoid the copy
515 	 * and just make the page writable */
516 	avoidcopy = (page_count(old_page) == 1);
517 	if (avoidcopy) {
518 		set_huge_ptep_writable(vma, address, ptep);
519 		return VM_FAULT_MINOR;
520 	}
521 
522 	page_cache_get(old_page);
523 	new_page = alloc_huge_page(vma, address);
524 
525 	if (!new_page) {
526 		page_cache_release(old_page);
527 		return VM_FAULT_OOM;
528 	}
529 
530 	spin_unlock(&mm->page_table_lock);
531 	copy_huge_page(new_page, old_page, address);
532 	spin_lock(&mm->page_table_lock);
533 
534 	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
535 	if (likely(pte_same(*ptep, pte))) {
536 		/* Break COW */
537 		set_huge_pte_at(mm, address, ptep,
538 				make_huge_pte(vma, new_page, 1));
539 		/* Make the old page be freed below */
540 		new_page = old_page;
541 	}
542 	page_cache_release(new_page);
543 	page_cache_release(old_page);
544 	return VM_FAULT_MINOR;
545 }
546 
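/*
 * Fault handler for a huge PTE that is still pte_none: find the page in
 * the page cache, or allocate and zero a fresh one.  Shared mappings add
 * the new page to the page cache (retrying if another thread beat us to
 * it); for private mappings the new page is just locked.  A filesystem
 * quota reference is taken for each page allocated here and dropped again
 * on failure.  The size check and the PTE update are redone under
 * page_table_lock so a racing truncate makes us back out.
 */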
547 int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
548 			unsigned long address, pte_t *ptep, int write_access)
549 {
550 	int ret = VM_FAULT_SIGBUS;
551 	unsigned long idx;
552 	unsigned long size;
553 	struct page *page;
554 	struct address_space *mapping;
555 	pte_t new_pte;
556 
557 	mapping = vma->vm_file->f_mapping;
558 	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
559 		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
560 
561 	/*
562 	 * Use page lock to guard against racing truncation
563 	 * before we get page_table_lock.
564 	 */
565 retry:
566 	page = find_lock_page(mapping, idx);
567 	if (!page) {
568 		if (hugetlb_get_quota(mapping))
569 			goto out;
570 		page = alloc_huge_page(vma, address);
571 		if (!page) {
572 			hugetlb_put_quota(mapping);
573 			ret = VM_FAULT_OOM;
574 			goto out;
575 		}
576 		clear_huge_page(page, address);
577 
578 		if (vma->vm_flags & VM_SHARED) {
579 			int err;
580 
581 			err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
582 			if (err) {
583 				put_page(page);
584 				hugetlb_put_quota(mapping);
585 				if (err == -EEXIST)
586 					goto retry;
587 				goto out;
588 			}
589 		} else
590 			lock_page(page);
591 	}
592 
593 	spin_lock(&mm->page_table_lock);
594 	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
595 	if (idx >= size)
596 		goto backout;
597 
598 	ret = VM_FAULT_MINOR;
599 	if (!pte_none(*ptep))
600 		goto backout;
601 
602 	add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
603 	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
604 				&& (vma->vm_flags & VM_SHARED)));
605 	set_huge_pte_at(mm, address, ptep, new_pte);
606 
607 	if (write_access && !(vma->vm_flags & VM_SHARED)) {
608 		/* Optimization, do the COW without a second fault */
609 		ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
610 	}
611 
612 	spin_unlock(&mm->page_table_lock);
613 	unlock_page(page);
614 out:
615 	return ret;
616 
617 backout:
618 	spin_unlock(&mm->page_table_lock);
619 	hugetlb_put_quota(mapping);
620 	unlock_page(page);
621 	put_page(page);
622 	goto out;
623 }
624 
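/*
 * Entry point for faults in a hugetlb VMA.  Allocates the huge PTE if
 * necessary, then either instantiates a missing page via hugetlb_no_page()
 * or breaks COW via hugetlb_cow() for a write to a read-only entry.
 */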
625 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
626 			unsigned long address, int write_access)
627 {
628 	pte_t *ptep;
629 	pte_t entry;
630 	int ret;
631 	static DEFINE_MUTEX(hugetlb_instantiation_mutex);
632 
633 	ptep = huge_pte_alloc(mm, address);
634 	if (!ptep)
635 		return VM_FAULT_OOM;
636 
637 	/*
638 	 * Serialize hugepage allocation and instantiation, so that we don't
639 	 * get spurious allocation failures if two CPUs race to instantiate
640 	 * the same page in the page cache.
641 	 */
642 	mutex_lock(&hugetlb_instantiation_mutex);
643 	entry = *ptep;
644 	if (pte_none(entry)) {
645 		ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
646 		mutex_unlock(&hugetlb_instantiation_mutex);
647 		return ret;
648 	}
649 
650 	ret = VM_FAULT_MINOR;
651 
652 	spin_lock(&mm->page_table_lock);
653 	/* Check for a racing update before calling hugetlb_cow */
654 	if (likely(pte_same(entry, *ptep)))
655 		if (write_access && !pte_write(entry))
656 			ret = hugetlb_cow(mm, vma, address, ptep, entry);
657 	spin_unlock(&mm->page_table_lock);
658 	mutex_unlock(&hugetlb_instantiation_mutex);
659 
660 	return ret;
661 }
662 
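/*
 * get_user_pages() back-end for hugetlb VMAs: walk the requested range,
 * faulting in missing hugepages, and fill pages[]/vmas[] with one entry
 * per base page, using pfn_offset to step through each compound hugepage.
 */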
663 int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
664 			struct page **pages, struct vm_area_struct **vmas,
665 			unsigned long *position, int *length, int i)
666 {
667 	unsigned long pfn_offset;
668 	unsigned long vaddr = *position;
669 	int remainder = *length;
670 
671 	spin_lock(&mm->page_table_lock);
672 	while (vaddr < vma->vm_end && remainder) {
673 		pte_t *pte;
674 		struct page *page;
675 
676 		/*
677 		 * Some archs (sparc64, sh*) have multiple pte_ts to
678 		 * each hugepage.  We have to make sure we get the
679 		 * first, for the page indexing below to work.
680 		 */
681 		pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
682 
683 		if (!pte || pte_none(*pte)) {
684 			int ret;
685 
686 			spin_unlock(&mm->page_table_lock);
687 			ret = hugetlb_fault(mm, vma, vaddr, 0);
688 			spin_lock(&mm->page_table_lock);
689 			if (ret == VM_FAULT_MINOR)
690 				continue;
691 
692 			remainder = 0;
693 			if (!i)
694 				i = -EFAULT;
695 			break;
696 		}
697 
698 		pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
699 		page = pte_page(*pte);
700 same_page:
701 		if (pages) {
702 			get_page(page);
703 			pages[i] = page + pfn_offset;
704 		}
705 
706 		if (vmas)
707 			vmas[i] = vma;
708 
709 		vaddr += PAGE_SIZE;
710 		++pfn_offset;
711 		--remainder;
712 		++i;
713 		if (vaddr < vma->vm_end && remainder &&
714 				pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
715 			/*
716 			 * We use pfn_offset to avoid touching the pageframes
717 			 * of this compound page.
718 			 */
719 			goto same_page;
720 		}
721 	}
722 	spin_unlock(&mm->page_table_lock);
723 	*length = remainder;
724 	*position = vaddr;
725 
726 	return i;
727 }
728 
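/*
 * mprotect() support for hugetlb VMAs: rewrite every present huge PTE
 * with the new protection bits and flush the cache and TLB for the range.
 */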
729 void hugetlb_change_protection(struct vm_area_struct *vma,
730 		unsigned long address, unsigned long end, pgprot_t newprot)
731 {
732 	struct mm_struct *mm = vma->vm_mm;
733 	unsigned long start = address;
734 	pte_t *ptep;
735 	pte_t pte;
736 
737 	BUG_ON(address >= end);
738 	flush_cache_range(vma, address, end);
739 
740 	spin_lock(&mm->page_table_lock);
741 	for (; address < end; address += HPAGE_SIZE) {
742 		ptep = huge_pte_offset(mm, address);
743 		if (!ptep)
744 			continue;
745 		if (!pte_none(*ptep)) {
746 			pte = huge_ptep_get_and_clear(mm, address, ptep);
747 			pte = pte_mkhuge(pte_modify(pte, newprot));
748 			set_huge_pte_at(mm, address, ptep, pte);
749 			lazy_mmu_prot_update(pte);
750 		}
751 	}
752 	spin_unlock(&mm->page_table_lock);
753 
754 	flush_tlb_range(vma, start, end);
755 }
756 
757