xref: /linux/fs/hugetlbfs/inode.c (revision 9cfc5c90ad38c8fc11bfd39de42a107da00871ba)
1 /*
2  * hugetlbpage-backed filesystem.  Based on ramfs.
3  *
4  * Nadia Yvette Chambers, 2002
5  *
6  * Copyright (C) 2002 Linus Torvalds.
7  */
8 
9 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
10 
11 #include <linux/module.h>
12 #include <linux/thread_info.h>
13 #include <asm/current.h>
14 #include <linux/sched.h>		/* remove ASAP */
15 #include <linux/falloc.h>
16 #include <linux/fs.h>
17 #include <linux/mount.h>
18 #include <linux/file.h>
19 #include <linux/kernel.h>
20 #include <linux/writeback.h>
21 #include <linux/pagemap.h>
22 #include <linux/highmem.h>
23 #include <linux/init.h>
24 #include <linux/string.h>
25 #include <linux/capability.h>
26 #include <linux/ctype.h>
27 #include <linux/backing-dev.h>
28 #include <linux/hugetlb.h>
29 #include <linux/pagevec.h>
30 #include <linux/parser.h>
31 #include <linux/mman.h>
32 #include <linux/slab.h>
33 #include <linux/dnotify.h>
34 #include <linux/statfs.h>
35 #include <linux/security.h>
36 #include <linux/magic.h>
37 #include <linux/migrate.h>
38 #include <linux/uio.h>
39 
40 #include <asm/uaccess.h>
41 
/*
 * Forward declarations of the operation tables.  They are referenced by the
 * inode set-up code below and defined further down in this file.
 */
static const struct super_operations hugetlbfs_ops;
static const struct address_space_operations hugetlbfs_aops;
const struct file_operations hugetlbfs_file_operations;
static const struct inode_operations hugetlbfs_dir_inode_operations;
static const struct inode_operations hugetlbfs_inode_operations;
47 
/*
 * Mount configuration: defaults overridden by the values parsed in
 * hugetlbfs_parse_options().
 */
struct hugetlbfs_config {
	kuid_t   uid;		/* "uid=" option */
	kgid_t   gid;		/* "gid=" option */
	umode_t mode;		/* "mode=" option, masked with 01777 */
	long	max_hpages;	/* "size=" in huge pages, -1 when not given */
	long	nr_inodes;	/* "nr_inodes=" option */
	struct hstate *hstate;	/* huge page size chosen by "pagesize=" */
	long    min_hpages;	/* "min_size=" in huge pages, -1 when not given */
};
57 
/*
 * Per-inode data: the VFS inode is embedded here next to the file's shared
 * memory policy.  HUGETLBFS_I() converts from the embedded inode back to
 * this structure.
 */
struct hugetlbfs_inode_info {
	struct shared_policy policy;
	struct inode vfs_inode;
};
62 
63 static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
64 {
65 	return container_of(inode, struct hugetlbfs_inode_info, vfs_inode);
66 }
67 
/*
 * NOTE(review): not referenced in this part of the file; presumably the
 * backing variable of a sysctl naming a group id -- confirm against users.
 */
int sysctl_hugetlb_shm_group;
69 
/*
 * Identifiers for the recognised mount options; Opt_err terminates the
 * tokens[] match table below.
 */
enum {
	Opt_size, Opt_nr_inodes,
	Opt_mode, Opt_uid, Opt_gid,
	Opt_pagesize, Opt_min_size,
	Opt_err,
};
76 
/*
 * Patterns handed to match_token() by hugetlbfs_parse_options().  The size
 * style options are matched as strings (%s) and converted with memparse()
 * by the parser.
 */
static const match_table_t tokens = {
	{Opt_size,	"size=%s"},
	{Opt_nr_inodes,	"nr_inodes=%s"},
	{Opt_mode,	"mode=%o"},
	{Opt_uid,	"uid=%u"},
	{Opt_gid,	"gid=%u"},
	{Opt_pagesize,	"pagesize=%s"},
	{Opt_min_size,	"min_size=%s"},
	{Opt_err,	NULL},
};
87 
88 #ifdef CONFIG_NUMA
89 static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
90 					struct inode *inode, pgoff_t index)
91 {
92 	vma->vm_policy = mpol_shared_policy_lookup(&HUGETLBFS_I(inode)->policy,
93 							index);
94 }
95 
96 static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
97 {
98 	mpol_cond_put(vma->vm_policy);
99 }
100 #else
101 static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
102 					struct inode *inode, pgoff_t index)
103 {
104 }
105 
106 static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
107 {
108 }
109 #endif
110 
111 static void huge_pagevec_release(struct pagevec *pvec)
112 {
113 	int i;
114 
115 	for (i = 0; i < pagevec_count(pvec); ++i)
116 		put_page(pvec->pages[i]);
117 
118 	pagevec_reinit(pvec);
119 }
120 
121 static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
122 {
123 	struct inode *inode = file_inode(file);
124 	loff_t len, vma_len;
125 	int ret;
126 	struct hstate *h = hstate_file(file);
127 
128 	/*
129 	 * vma address alignment (but not the pgoff alignment) has
130 	 * already been checked by prepare_hugepage_range.  If you add
131 	 * any error returns here, do so after setting VM_HUGETLB, so
132 	 * is_vm_hugetlb_page tests below unmap_region go the right
133 	 * way when do_mmap_pgoff unwinds (may be important on powerpc
134 	 * and ia64).
135 	 */
136 	vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND;
137 	vma->vm_ops = &hugetlb_vm_ops;
138 
139 	if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
140 		return -EINVAL;
141 
142 	vma_len = (loff_t)(vma->vm_end - vma->vm_start);
143 
144 	mutex_lock(&inode->i_mutex);
145 	file_accessed(file);
146 
147 	ret = -ENOMEM;
148 	len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
149 
150 	if (hugetlb_reserve_pages(inode,
151 				vma->vm_pgoff >> huge_page_order(h),
152 				len >> huge_page_shift(h), vma,
153 				vma->vm_flags))
154 		goto out;
155 
156 	ret = 0;
157 	if (vma->vm_flags & VM_WRITE && inode->i_size < len)
158 		inode->i_size = len;
159 out:
160 	mutex_unlock(&inode->i_mutex);
161 
162 	return ret;
163 }
164 
165 /*
166  * Called under down_write(mmap_sem).
167  */
168 
169 #ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
170 static unsigned long
171 hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
172 		unsigned long len, unsigned long pgoff, unsigned long flags)
173 {
174 	struct mm_struct *mm = current->mm;
175 	struct vm_area_struct *vma;
176 	struct hstate *h = hstate_file(file);
177 	struct vm_unmapped_area_info info;
178 
179 	if (len & ~huge_page_mask(h))
180 		return -EINVAL;
181 	if (len > TASK_SIZE)
182 		return -ENOMEM;
183 
184 	if (flags & MAP_FIXED) {
185 		if (prepare_hugepage_range(file, addr, len))
186 			return -EINVAL;
187 		return addr;
188 	}
189 
190 	if (addr) {
191 		addr = ALIGN(addr, huge_page_size(h));
192 		vma = find_vma(mm, addr);
193 		if (TASK_SIZE - len >= addr &&
194 		    (!vma || addr + len <= vma->vm_start))
195 			return addr;
196 	}
197 
198 	info.flags = 0;
199 	info.length = len;
200 	info.low_limit = TASK_UNMAPPED_BASE;
201 	info.high_limit = TASK_SIZE;
202 	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
203 	info.align_offset = 0;
204 	return vm_unmapped_area(&info);
205 }
206 #endif
207 
208 static size_t
209 hugetlbfs_read_actor(struct page *page, unsigned long offset,
210 			struct iov_iter *to, unsigned long size)
211 {
212 	size_t copied = 0;
213 	int i, chunksize;
214 
215 	/* Find which 4k chunk and offset with in that chunk */
216 	i = offset >> PAGE_CACHE_SHIFT;
217 	offset = offset & ~PAGE_CACHE_MASK;
218 
219 	while (size) {
220 		size_t n;
221 		chunksize = PAGE_CACHE_SIZE;
222 		if (offset)
223 			chunksize -= offset;
224 		if (chunksize > size)
225 			chunksize = size;
226 		n = copy_page_to_iter(&page[i], offset, chunksize, to);
227 		copied += n;
228 		if (n != chunksize)
229 			return copied;
230 		offset = 0;
231 		size -= chunksize;
232 		i++;
233 	}
234 	return copied;
235 }
236 
/*
 * Support for read() - Find the page attached to f_mapping and copy out the
 * data. It's *very* similar to do_generic_mapping_read(), we can't use that
 * since it has PAGE_CACHE_SIZE assumptions.
 *
 * The file is walked one huge page at a time starting at iocb->ki_pos.
 * Huge pages missing from the page cache are treated as holes and read
 * back as zeroes.  iocb->ki_pos is advanced past the bytes consumed.
 *
 * Returns the number of bytes read, or -EFAULT if a copy came up short
 * before anything was transferred.
 */
static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct hstate *h = hstate_file(file);
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	/* Position split into huge page index and byte offset within it. */
	unsigned long index = iocb->ki_pos >> huge_page_shift(h);
	unsigned long offset = iocb->ki_pos & ~huge_page_mask(h);
	unsigned long end_index;
	loff_t isize;
	ssize_t retval = 0;

	while (iov_iter_count(to)) {
		struct page *page;
		size_t nr, copied;

		/* nr is the maximum number of bytes to copy from this page */
		nr = huge_page_size(h);
		/* i_size is re-read on every iteration. */
		isize = i_size_read(inode);
		if (!isize)
			break;
		end_index = (isize - 1) >> huge_page_shift(h);
		if (index > end_index)
			break;
		if (index == end_index) {
			/* Last huge page: only the bytes below i_size count. */
			nr = ((isize - 1) & ~huge_page_mask(h)) + 1;
			if (nr <= offset)
				break;
		}
		nr = nr - offset;

		/* Find the page */
		page = find_lock_page(mapping, index);
		if (unlikely(page == NULL)) {
			/*
			 * We have a HOLE, zero out the user-buffer for the
			 * length of the hole or request.
			 */
			copied = iov_iter_zero(nr, to);
		} else {
			/* The lock is dropped before copying; the reference is kept. */
			unlock_page(page);

			/*
			 * We have the page, copy it to user space buffer.
			 */
			copied = hugetlbfs_read_actor(page, offset, to, nr);
			page_cache_release(page);
		}
		offset += copied;
		retval += copied;
		/* Short copy although the caller still wants data: stop. */
		if (copied != nr && iov_iter_count(to)) {
			if (!retval)
				retval = -EFAULT;
			break;
		}
		/* Carry a full huge page of offset over into the index. */
		index += offset >> huge_page_shift(h);
		offset &= ~huge_page_mask(h);
	}
	iocb->ki_pos = ((loff_t)index << huge_page_shift(h)) + offset;
	return retval;
}
303 
/*
 * ->write_begin stub: unconditionally fails with -EINVAL; none of the
 * arguments are examined.
 */
static int hugetlbfs_write_begin(struct file *file,
			struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata)
{
	return -EINVAL;
}
311 
/*
 * ->write_end stub.  hugetlbfs_write_begin() always fails, so reaching this
 * function is treated as a kernel bug.
 */
static int hugetlbfs_write_end(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned copied,
			struct page *page, void *fsdata)
{
	BUG();
	return -EINVAL;
}
319 
/*
 * Clear the dirty and uptodate flags of @page and remove it from the page
 * cache.  The only caller in this file, remove_inode_hugepages(), holds the
 * page lock.
 */
static void remove_huge_page(struct page *page)
{
	ClearPageDirty(page);
	ClearPageUptodate(page);
	delete_from_page_cache(page);
}
326 
327 
328 /*
329  * remove_inode_hugepages handles two distinct cases: truncation and hole
330  * punch.  There are subtle differences in operation for each case.
331 
332  * truncation is indicated by end of range being LLONG_MAX
333  *	In this case, we first scan the range and release found pages.
334  *	After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
335  *	maps and global counts.
336  * hole punch is indicated if end is not LLONG_MAX
337  *	In the hole punch case we scan the range and release found pages.
338  *	Only when releasing a page is the associated region/reserv map
339  *	deleted.  The region/reserv map for ranges without associated
340  *	pages are not modified.
341  * Note: If the passed end of range value is beyond the end of file, but
342  * not LLONG_MAX this routine still performs a hole punch operation.
343  */
344 static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
345 				   loff_t lend)
346 {
347 	struct hstate *h = hstate_inode(inode);
348 	struct address_space *mapping = &inode->i_data;
349 	const pgoff_t start = lstart >> huge_page_shift(h);
350 	const pgoff_t end = lend >> huge_page_shift(h);
351 	struct vm_area_struct pseudo_vma;
352 	struct pagevec pvec;
353 	pgoff_t next;
354 	int i, freed = 0;
355 	long lookup_nr = PAGEVEC_SIZE;
356 	bool truncate_op = (lend == LLONG_MAX);
357 
358 	memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
359 	pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
360 	pagevec_init(&pvec, 0);
361 	next = start;
362 	while (next < end) {
363 		/*
364 		 * Make sure to never grab more pages that we
365 		 * might possibly need.
366 		 */
367 		if (end - next < lookup_nr)
368 			lookup_nr = end - next;
369 
370 		/*
371 		 * This pagevec_lookup() may return pages past 'end',
372 		 * so we must check for page->index > end.
373 		 */
374 		if (!pagevec_lookup(&pvec, mapping, next, lookup_nr)) {
375 			if (next == start)
376 				break;
377 			next = start;
378 			continue;
379 		}
380 
381 		for (i = 0; i < pagevec_count(&pvec); ++i) {
382 			struct page *page = pvec.pages[i];
383 			u32 hash;
384 
385 			hash = hugetlb_fault_mutex_hash(h, current->mm,
386 							&pseudo_vma,
387 							mapping, next, 0);
388 			mutex_lock(&hugetlb_fault_mutex_table[hash]);
389 
390 			lock_page(page);
391 			if (page->index >= end) {
392 				unlock_page(page);
393 				mutex_unlock(&hugetlb_fault_mutex_table[hash]);
394 				next = end;	/* we are done */
395 				break;
396 			}
397 
398 			/*
399 			 * If page is mapped, it was faulted in after being
400 			 * unmapped.  Do nothing in this race case.  In the
401 			 * normal case page is not mapped.
402 			 */
403 			if (!page_mapped(page)) {
404 				bool rsv_on_error = !PagePrivate(page);
405 				/*
406 				 * We must free the huge page and remove
407 				 * from page cache (remove_huge_page) BEFORE
408 				 * removing the region/reserve map
409 				 * (hugetlb_unreserve_pages).  In rare out
410 				 * of memory conditions, removal of the
411 				 * region/reserve map could fail.  Before
412 				 * free'ing the page, note PagePrivate which
413 				 * is used in case of error.
414 				 */
415 				remove_huge_page(page);
416 				freed++;
417 				if (!truncate_op) {
418 					if (unlikely(hugetlb_unreserve_pages(
419 							inode, next,
420 							next + 1, 1)))
421 						hugetlb_fix_reserve_counts(
422 							inode, rsv_on_error);
423 				}
424 			}
425 
426 			if (page->index > next)
427 				next = page->index;
428 
429 			++next;
430 			unlock_page(page);
431 
432 			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
433 		}
434 		huge_pagevec_release(&pvec);
435 	}
436 
437 	if (truncate_op)
438 		(void)hugetlb_unreserve_pages(inode, start, LONG_MAX, freed);
439 }
440 
441 static void hugetlbfs_evict_inode(struct inode *inode)
442 {
443 	struct resv_map *resv_map;
444 
445 	remove_inode_hugepages(inode, 0, LLONG_MAX);
446 	resv_map = (struct resv_map *)inode->i_mapping->private_data;
447 	/* root inode doesn't have the resv_map, so we should check it */
448 	if (resv_map)
449 		resv_map_release(&resv_map->refs);
450 	clear_inode(inode);
451 }
452 
453 static inline void
454 hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
455 {
456 	struct vm_area_struct *vma;
457 
458 	/*
459 	 * end == 0 indicates that the entire range after
460 	 * start should be unmapped.
461 	 */
462 	vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
463 		unsigned long v_offset;
464 
465 		/*
466 		 * Can the expression below overflow on 32-bit arches?
467 		 * No, because the interval tree returns us only those vmas
468 		 * which overlap the truncated area starting at pgoff,
469 		 * and no vma on a 32-bit arch can span beyond the 4GB.
470 		 */
471 		if (vma->vm_pgoff < start)
472 			v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
473 		else
474 			v_offset = 0;
475 
476 		if (end) {
477 			end = ((end - start) << PAGE_SHIFT) +
478 			       vma->vm_start + v_offset;
479 			if (end > vma->vm_end)
480 				end = vma->vm_end;
481 		} else
482 			end = vma->vm_end;
483 
484 		unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL);
485 	}
486 }
487 
488 static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
489 {
490 	pgoff_t pgoff;
491 	struct address_space *mapping = inode->i_mapping;
492 	struct hstate *h = hstate_inode(inode);
493 
494 	BUG_ON(offset & ~huge_page_mask(h));
495 	pgoff = offset >> PAGE_SHIFT;
496 
497 	i_size_write(inode, offset);
498 	i_mmap_lock_write(mapping);
499 	if (!RB_EMPTY_ROOT(&mapping->i_mmap))
500 		hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
501 	i_mmap_unlock_write(mapping);
502 	remove_inode_hugepages(inode, offset, LLONG_MAX);
503 	return 0;
504 }
505 
506 static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
507 {
508 	struct hstate *h = hstate_inode(inode);
509 	loff_t hpage_size = huge_page_size(h);
510 	loff_t hole_start, hole_end;
511 
512 	/*
513 	 * For hole punch round up the beginning offset of the hole and
514 	 * round down the end.
515 	 */
516 	hole_start = round_up(offset, hpage_size);
517 	hole_end = round_down(offset + len, hpage_size);
518 
519 	if (hole_end > hole_start) {
520 		struct address_space *mapping = inode->i_mapping;
521 
522 		mutex_lock(&inode->i_mutex);
523 		i_mmap_lock_write(mapping);
524 		if (!RB_EMPTY_ROOT(&mapping->i_mmap))
525 			hugetlb_vmdelete_list(&mapping->i_mmap,
526 						hole_start >> PAGE_SHIFT,
527 						hole_end  >> PAGE_SHIFT);
528 		i_mmap_unlock_write(mapping);
529 		remove_inode_hugepages(inode, hole_start, hole_end);
530 		mutex_unlock(&inode->i_mutex);
531 	}
532 
533 	return 0;
534 }
535 
/*
 * ->fallocate for hugetlbfs files.
 *
 * Only FALLOC_FL_KEEP_SIZE and FALLOC_FL_PUNCH_HOLE are accepted; any other
 * mode bit yields -EOPNOTSUPP.  Hole punching is delegated to
 * hugetlbfs_punch_hole().  Otherwise every huge page overlapping
 * [offset, offset + len) that is not yet in the page cache is allocated,
 * zeroed and inserted, and i_size is extended unless FALLOC_FL_KEEP_SIZE
 * was given.
 *
 * Returns 0 on success, -EINTR if a signal became pending, or the error
 * from inode_newsize_ok(), alloc_huge_page() or huge_add_to_page_cache().
 */
static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
				loff_t len)
{
	struct inode *inode = file_inode(file);
	struct address_space *mapping = inode->i_mapping;
	struct hstate *h = hstate_inode(inode);
	struct vm_area_struct pseudo_vma;
	struct mm_struct *mm = current->mm;
	loff_t hpage_size = huge_page_size(h);
	unsigned long hpage_shift = huge_page_shift(h);
	pgoff_t start, index, end;
	int error;
	u32 hash;

	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
		return -EOPNOTSUPP;

	if (mode & FALLOC_FL_PUNCH_HOLE)
		return hugetlbfs_punch_hole(inode, offset, len);

	/*
	 * Default preallocate case.
	 * For this range, start is rounded down and end is rounded up
	 * as well as being converted to page offsets.
	 */
	start = offset >> hpage_shift;
	end = (offset + len + hpage_size - 1) >> hpage_shift;

	mutex_lock(&inode->i_mutex);

	/* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
	error = inode_newsize_ok(inode, offset + len);
	if (error)
		goto out;

	/*
	 * Initialize a pseudo vma as this is required by the huge page
	 * allocation routines.  If NUMA is configured, use page index
	 * as input to create an allocation policy.
	 */
	memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
	pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
	pseudo_vma.vm_file = file;

	for (index = start; index < end; index++) {
		/*
		 * addr is supposed to be the vaddr where the page is being
		 * faulted in, but we have no vaddr here.
		 */
		struct page *page;
		unsigned long addr;
		int avoid_reserve = 0;

		cond_resched();

		/*
		 * fallocate(2) manpage permits EINTR; we may have been
		 * interrupted because we are using up too much memory.
		 */
		if (signal_pending(current)) {
			error = -EINTR;
			break;
		}

		/* Set numa allocation policy based on index */
		hugetlb_set_vma_policy(&pseudo_vma, inode, index);

		/* addr is the offset within the file (zero based) */
		addr = index * hpage_size;

		/* mutex taken here, fault path and hole punch */
		hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
						index, addr);
		mutex_lock(&hugetlb_fault_mutex_table[hash]);

		/* See if already present in mapping to avoid alloc/free */
		page = find_get_page(mapping, index);
		if (page) {
			put_page(page);
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			hugetlb_drop_vma_policy(&pseudo_vma);
			continue;
		}

		/* Allocate page and add to page cache */
		page = alloc_huge_page(&pseudo_vma, addr, avoid_reserve);
		hugetlb_drop_vma_policy(&pseudo_vma);
		if (IS_ERR(page)) {
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			error = PTR_ERR(page);
			goto out;
		}
		clear_huge_page(page, addr, pages_per_huge_page(h));
		__SetPageUptodate(page);
		error = huge_add_to_page_cache(page, mapping, index);
		if (unlikely(error)) {
			/* Insertion failed: drop the reference from the allocation. */
			put_page(page);
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			goto out;
		}

		mutex_unlock(&hugetlb_fault_mutex_table[hash]);

		/*
		 * page_put due to reference from alloc_huge_page()
		 * unlock_page because locked by add_to_page_cache()
		 */
		put_page(page);
		unlock_page(page);
	}

	/*
	 * Reached both after a complete pass and after the -EINTR break;
	 * the allocation failures above jump straight to 'out' instead.
	 */
	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
		i_size_write(inode, offset + len);
	inode->i_ctime = CURRENT_TIME;
	spin_lock(&inode->i_lock);
	inode->i_private = NULL;
	spin_unlock(&inode->i_lock);
out:
	mutex_unlock(&inode->i_mutex);
	return error;
}
657 
658 static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
659 {
660 	struct inode *inode = d_inode(dentry);
661 	struct hstate *h = hstate_inode(inode);
662 	int error;
663 	unsigned int ia_valid = attr->ia_valid;
664 
665 	BUG_ON(!inode);
666 
667 	error = inode_change_ok(inode, attr);
668 	if (error)
669 		return error;
670 
671 	if (ia_valid & ATTR_SIZE) {
672 		error = -EINVAL;
673 		if (attr->ia_size & ~huge_page_mask(h))
674 			return -EINVAL;
675 		error = hugetlb_vmtruncate(inode, attr->ia_size);
676 		if (error)
677 			return error;
678 	}
679 
680 	setattr_copy(inode, attr);
681 	mark_inode_dirty(inode);
682 	return 0;
683 }
684 
685 static struct inode *hugetlbfs_get_root(struct super_block *sb,
686 					struct hugetlbfs_config *config)
687 {
688 	struct inode *inode;
689 
690 	inode = new_inode(sb);
691 	if (inode) {
692 		struct hugetlbfs_inode_info *info;
693 		inode->i_ino = get_next_ino();
694 		inode->i_mode = S_IFDIR | config->mode;
695 		inode->i_uid = config->uid;
696 		inode->i_gid = config->gid;
697 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
698 		info = HUGETLBFS_I(inode);
699 		mpol_shared_policy_init(&info->policy, NULL);
700 		inode->i_op = &hugetlbfs_dir_inode_operations;
701 		inode->i_fop = &simple_dir_operations;
702 		/* directory inodes start off with i_nlink == 2 (for "." entry) */
703 		inc_nlink(inode);
704 		lockdep_annotate_inode_mutex_key(inode);
705 	}
706 	return inode;
707 }
708 
/*
 * Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never
 * be taken from reclaim -- unlike regular filesystems. This needs an
 * annotation because huge_pmd_share() does an allocation under
 * i_mmap_rwsem.
 *
 * The key is applied to each new inode's i_mapping->i_mmap_rwsem in
 * hugetlbfs_get_inode().
 */
static struct lock_class_key hugetlbfs_i_mmap_rwsem_key;
716 
717 static struct inode *hugetlbfs_get_inode(struct super_block *sb,
718 					struct inode *dir,
719 					umode_t mode, dev_t dev)
720 {
721 	struct inode *inode;
722 	struct resv_map *resv_map;
723 
724 	resv_map = resv_map_alloc();
725 	if (!resv_map)
726 		return NULL;
727 
728 	inode = new_inode(sb);
729 	if (inode) {
730 		struct hugetlbfs_inode_info *info;
731 		inode->i_ino = get_next_ino();
732 		inode_init_owner(inode, dir, mode);
733 		lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
734 				&hugetlbfs_i_mmap_rwsem_key);
735 		inode->i_mapping->a_ops = &hugetlbfs_aops;
736 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
737 		inode->i_mapping->private_data = resv_map;
738 		info = HUGETLBFS_I(inode);
739 		/*
740 		 * The policy is initialized here even if we are creating a
741 		 * private inode because initialization simply creates an
742 		 * an empty rb tree and calls spin_lock_init(), later when we
743 		 * call mpol_free_shared_policy() it will just return because
744 		 * the rb tree will still be empty.
745 		 */
746 		mpol_shared_policy_init(&info->policy, NULL);
747 		switch (mode & S_IFMT) {
748 		default:
749 			init_special_inode(inode, mode, dev);
750 			break;
751 		case S_IFREG:
752 			inode->i_op = &hugetlbfs_inode_operations;
753 			inode->i_fop = &hugetlbfs_file_operations;
754 			break;
755 		case S_IFDIR:
756 			inode->i_op = &hugetlbfs_dir_inode_operations;
757 			inode->i_fop = &simple_dir_operations;
758 
759 			/* directory inodes start off with i_nlink == 2 (for "." entry) */
760 			inc_nlink(inode);
761 			break;
762 		case S_IFLNK:
763 			inode->i_op = &page_symlink_inode_operations;
764 			break;
765 		}
766 		lockdep_annotate_inode_mutex_key(inode);
767 	} else
768 		kref_put(&resv_map->refs, resv_map_release);
769 
770 	return inode;
771 }
772 
773 /*
774  * File creation. Allocate an inode, and we're done..
775  */
776 static int hugetlbfs_mknod(struct inode *dir,
777 			struct dentry *dentry, umode_t mode, dev_t dev)
778 {
779 	struct inode *inode;
780 	int error = -ENOSPC;
781 
782 	inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev);
783 	if (inode) {
784 		dir->i_ctime = dir->i_mtime = CURRENT_TIME;
785 		d_instantiate(dentry, inode);
786 		dget(dentry);	/* Extra count - pin the dentry in core */
787 		error = 0;
788 	}
789 	return error;
790 }
791 
792 static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
793 {
794 	int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0);
795 	if (!retval)
796 		inc_nlink(dir);
797 	return retval;
798 }
799 
/*
 * ->create: make a regular file via hugetlbfs_mknod().  @excl is not used.
 */
static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl)
{
	return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0);
}
804 
805 static int hugetlbfs_symlink(struct inode *dir,
806 			struct dentry *dentry, const char *symname)
807 {
808 	struct inode *inode;
809 	int error = -ENOSPC;
810 
811 	inode = hugetlbfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0);
812 	if (inode) {
813 		int l = strlen(symname)+1;
814 		error = page_symlink(inode, symname, l);
815 		if (!error) {
816 			d_instantiate(dentry, inode);
817 			dget(dentry);
818 		} else
819 			iput(inode);
820 	}
821 	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
822 
823 	return error;
824 }
825 
/*
 * ->set_page_dirty: the dirty flag is kept on the head page of the
 * compound page.  Always returns 0.
 */
static int hugetlbfs_set_page_dirty(struct page *page)
{
	SetPageDirty(compound_head(page));
	return 0;
}
836 
837 static int hugetlbfs_migrate_page(struct address_space *mapping,
838 				struct page *newpage, struct page *page,
839 				enum migrate_mode mode)
840 {
841 	int rc;
842 
843 	rc = migrate_huge_page_move_mapping(mapping, newpage, page);
844 	if (rc != MIGRATEPAGE_SUCCESS)
845 		return rc;
846 	migrate_page_copy(newpage, page);
847 
848 	return MIGRATEPAGE_SUCCESS;
849 }
850 
851 static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
852 {
853 	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
854 	struct hstate *h = hstate_inode(d_inode(dentry));
855 
856 	buf->f_type = HUGETLBFS_MAGIC;
857 	buf->f_bsize = huge_page_size(h);
858 	if (sbinfo) {
859 		spin_lock(&sbinfo->stat_lock);
860 		/* If no limits set, just report 0 for max/free/used
861 		 * blocks, like simple_statfs() */
862 		if (sbinfo->spool) {
863 			long free_pages;
864 
865 			spin_lock(&sbinfo->spool->lock);
866 			buf->f_blocks = sbinfo->spool->max_hpages;
867 			free_pages = sbinfo->spool->max_hpages
868 				- sbinfo->spool->used_hpages;
869 			buf->f_bavail = buf->f_bfree = free_pages;
870 			spin_unlock(&sbinfo->spool->lock);
871 			buf->f_files = sbinfo->max_inodes;
872 			buf->f_ffree = sbinfo->free_inodes;
873 		}
874 		spin_unlock(&sbinfo->stat_lock);
875 	}
876 	buf->f_namelen = NAME_MAX;
877 	return 0;
878 }
879 
880 static void hugetlbfs_put_super(struct super_block *sb)
881 {
882 	struct hugetlbfs_sb_info *sbi = HUGETLBFS_SB(sb);
883 
884 	if (sbi) {
885 		sb->s_fs_info = NULL;
886 
887 		if (sbi->spool)
888 			hugepage_put_subpool(sbi->spool);
889 
890 		kfree(sbi);
891 	}
892 }
893 
894 static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo)
895 {
896 	if (sbinfo->free_inodes >= 0) {
897 		spin_lock(&sbinfo->stat_lock);
898 		if (unlikely(!sbinfo->free_inodes)) {
899 			spin_unlock(&sbinfo->stat_lock);
900 			return 0;
901 		}
902 		sbinfo->free_inodes--;
903 		spin_unlock(&sbinfo->stat_lock);
904 	}
905 
906 	return 1;
907 }
908 
909 static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo)
910 {
911 	if (sbinfo->free_inodes >= 0) {
912 		spin_lock(&sbinfo->stat_lock);
913 		sbinfo->free_inodes++;
914 		spin_unlock(&sbinfo->stat_lock);
915 	}
916 }
917 
918 
/* Slab cache from which struct hugetlbfs_inode_info objects are allocated. */
static struct kmem_cache *hugetlbfs_inode_cachep;
920 
921 static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
922 {
923 	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb);
924 	struct hugetlbfs_inode_info *p;
925 
926 	if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo)))
927 		return NULL;
928 	p = kmem_cache_alloc(hugetlbfs_inode_cachep, GFP_KERNEL);
929 	if (unlikely(!p)) {
930 		hugetlbfs_inc_free_inodes(sbinfo);
931 		return NULL;
932 	}
933 	return &p->vfs_inode;
934 }
935 
936 static void hugetlbfs_i_callback(struct rcu_head *head)
937 {
938 	struct inode *inode = container_of(head, struct inode, i_rcu);
939 	kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
940 }
941 
942 static void hugetlbfs_destroy_inode(struct inode *inode)
943 {
944 	hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb));
945 	mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy);
946 	call_rcu(&inode->i_rcu, hugetlbfs_i_callback);
947 }
948 
/*
 * Address space operations, installed on every inode created by
 * hugetlbfs_get_inode().  The write_begin/write_end entries are stubs:
 * write_begin always returns -EINVAL.
 */
static const struct address_space_operations hugetlbfs_aops = {
	.write_begin	= hugetlbfs_write_begin,
	.write_end	= hugetlbfs_write_end,
	.set_page_dirty	= hugetlbfs_set_page_dirty,
	.migratepage    = hugetlbfs_migrate_page,
};
955 
956 
957 static void init_once(void *foo)
958 {
959 	struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo;
960 
961 	inode_init_once(&ei->vfs_inode);
962 }
963 
/*
 * File operations for regular hugetlbfs files (set for S_IFREG inodes in
 * hugetlbfs_get_inode()).  No write method is provided in this table.
 */
const struct file_operations hugetlbfs_file_operations = {
	.read_iter		= hugetlbfs_read_iter,
	.mmap			= hugetlbfs_file_mmap,
	.fsync			= noop_fsync,
	.get_unmapped_area	= hugetlb_get_unmapped_area,
	.llseek			= default_llseek,
	.fallocate		= hugetlbfs_fallocate,
};
972 
/*
 * Inode operations for directories (root and S_IFDIR inodes).  Creation
 * goes through the hugetlbfs helpers above; the remaining entries use the
 * generic simple_*() implementations.
 */
static const struct inode_operations hugetlbfs_dir_inode_operations = {
	.create		= hugetlbfs_create,
	.lookup		= simple_lookup,
	.link		= simple_link,
	.unlink		= simple_unlink,
	.symlink	= hugetlbfs_symlink,
	.mkdir		= hugetlbfs_mkdir,
	.rmdir		= simple_rmdir,
	.mknod		= hugetlbfs_mknod,
	.rename		= simple_rename,
	.setattr	= hugetlbfs_setattr,
};
985 
/* Inode operations for regular files: only setattr is overridden. */
static const struct inode_operations hugetlbfs_inode_operations = {
	.setattr	= hugetlbfs_setattr,
};
989 
/*
 * Superblock operations: inode allocation/teardown with quota accounting,
 * statfs reporting and superblock release.
 */
static const struct super_operations hugetlbfs_ops = {
	.alloc_inode    = hugetlbfs_alloc_inode,
	.destroy_inode  = hugetlbfs_destroy_inode,
	.evict_inode	= hugetlbfs_evict_inode,
	.statfs		= hugetlbfs_statfs,
	.put_super	= hugetlbfs_put_super,
	.show_options	= generic_show_options,
};
998 
/*
 * How a size/min_size mount option was given: not at all, as a byte count,
 * or as a percentage of the huge page pool.
 */
enum { NO_SIZE, SIZE_STD, SIZE_PERCENT };
1000 
1001 /*
1002  * Convert size option passed from command line to number of huge pages
1003  * in the pool specified by hstate.  Size option could be in bytes
1004  * (val_type == SIZE_STD) or percentage of the pool (val_type == SIZE_PERCENT).
1005  */
1006 static long long
1007 hugetlbfs_size_to_hpages(struct hstate *h, unsigned long long size_opt,
1008 								int val_type)
1009 {
1010 	if (val_type == NO_SIZE)
1011 		return -1;
1012 
1013 	if (val_type == SIZE_PERCENT) {
1014 		size_opt <<= huge_page_shift(h);
1015 		size_opt *= h->max_huge_pages;
1016 		do_div(size_opt, 100);
1017 	}
1018 
1019 	size_opt >>= huge_page_shift(h);
1020 	return size_opt;
1021 }
1022 
1023 static int
1024 hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
1025 {
1026 	char *p, *rest;
1027 	substring_t args[MAX_OPT_ARGS];
1028 	int option;
1029 	unsigned long long max_size_opt = 0, min_size_opt = 0;
1030 	int max_val_type = NO_SIZE, min_val_type = NO_SIZE;
1031 
1032 	if (!options)
1033 		return 0;
1034 
1035 	while ((p = strsep(&options, ",")) != NULL) {
1036 		int token;
1037 		if (!*p)
1038 			continue;
1039 
1040 		token = match_token(p, tokens, args);
1041 		switch (token) {
1042 		case Opt_uid:
1043 			if (match_int(&args[0], &option))
1044  				goto bad_val;
1045 			pconfig->uid = make_kuid(current_user_ns(), option);
1046 			if (!uid_valid(pconfig->uid))
1047 				goto bad_val;
1048 			break;
1049 
1050 		case Opt_gid:
1051 			if (match_int(&args[0], &option))
1052  				goto bad_val;
1053 			pconfig->gid = make_kgid(current_user_ns(), option);
1054 			if (!gid_valid(pconfig->gid))
1055 				goto bad_val;
1056 			break;
1057 
1058 		case Opt_mode:
1059 			if (match_octal(&args[0], &option))
1060  				goto bad_val;
1061 			pconfig->mode = option & 01777U;
1062 			break;
1063 
1064 		case Opt_size: {
1065 			/* memparse() will accept a K/M/G without a digit */
1066 			if (!isdigit(*args[0].from))
1067 				goto bad_val;
1068 			max_size_opt = memparse(args[0].from, &rest);
1069 			max_val_type = SIZE_STD;
1070 			if (*rest == '%')
1071 				max_val_type = SIZE_PERCENT;
1072 			break;
1073 		}
1074 
1075 		case Opt_nr_inodes:
1076 			/* memparse() will accept a K/M/G without a digit */
1077 			if (!isdigit(*args[0].from))
1078 				goto bad_val;
1079 			pconfig->nr_inodes = memparse(args[0].from, &rest);
1080 			break;
1081 
1082 		case Opt_pagesize: {
1083 			unsigned long ps;
1084 			ps = memparse(args[0].from, &rest);
1085 			pconfig->hstate = size_to_hstate(ps);
1086 			if (!pconfig->hstate) {
1087 				pr_err("Unsupported page size %lu MB\n",
1088 					ps >> 20);
1089 				return -EINVAL;
1090 			}
1091 			break;
1092 		}
1093 
1094 		case Opt_min_size: {
1095 			/* memparse() will accept a K/M/G without a digit */
1096 			if (!isdigit(*args[0].from))
1097 				goto bad_val;
1098 			min_size_opt = memparse(args[0].from, &rest);
1099 			min_val_type = SIZE_STD;
1100 			if (*rest == '%')
1101 				min_val_type = SIZE_PERCENT;
1102 			break;
1103 		}
1104 
1105 		default:
1106 			pr_err("Bad mount option: \"%s\"\n", p);
1107 			return -EINVAL;
1108 			break;
1109 		}
1110 	}
1111 
1112 	/*
1113 	 * Use huge page pool size (in hstate) to convert the size
1114 	 * options to number of huge pages.  If NO_SIZE, -1 is returned.
1115 	 */
1116 	pconfig->max_hpages = hugetlbfs_size_to_hpages(pconfig->hstate,
1117 						max_size_opt, max_val_type);
1118 	pconfig->min_hpages = hugetlbfs_size_to_hpages(pconfig->hstate,
1119 						min_size_opt, min_val_type);
1120 
1121 	/*
1122 	 * If max_size was specified, then min_size must be smaller
1123 	 */
1124 	if (max_val_type > NO_SIZE &&
1125 	    pconfig->min_hpages > pconfig->max_hpages) {
1126 		pr_err("minimum size can not be greater than maximum size\n");
1127 		return -EINVAL;
1128 	}
1129 
1130 	return 0;
1131 
1132 bad_val:
1133 	pr_err("Bad value '%s' for mount option '%s'\n", args[0].from, p);
1134  	return -EINVAL;
1135 }
1136 
1137 static int
1138 hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
1139 {
1140 	int ret;
1141 	struct hugetlbfs_config config;
1142 	struct hugetlbfs_sb_info *sbinfo;
1143 
1144 	save_mount_options(sb, data);
1145 
1146 	config.max_hpages = -1; /* No limit on size by default */
1147 	config.nr_inodes = -1; /* No limit on number of inodes by default */
1148 	config.uid = current_fsuid();
1149 	config.gid = current_fsgid();
1150 	config.mode = 0755;
1151 	config.hstate = &default_hstate;
1152 	config.min_hpages = -1; /* No default minimum size */
1153 	ret = hugetlbfs_parse_options(data, &config);
1154 	if (ret)
1155 		return ret;
1156 
1157 	sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), GFP_KERNEL);
1158 	if (!sbinfo)
1159 		return -ENOMEM;
1160 	sb->s_fs_info = sbinfo;
1161 	sbinfo->hstate = config.hstate;
1162 	spin_lock_init(&sbinfo->stat_lock);
1163 	sbinfo->max_inodes = config.nr_inodes;
1164 	sbinfo->free_inodes = config.nr_inodes;
1165 	sbinfo->spool = NULL;
1166 	/*
1167 	 * Allocate and initialize subpool if maximum or minimum size is
1168 	 * specified.  Any needed reservations (for minimim size) are taken
1169 	 * taken when the subpool is created.
1170 	 */
1171 	if (config.max_hpages != -1 || config.min_hpages != -1) {
1172 		sbinfo->spool = hugepage_new_subpool(config.hstate,
1173 							config.max_hpages,
1174 							config.min_hpages);
1175 		if (!sbinfo->spool)
1176 			goto out_free;
1177 	}
1178 	sb->s_maxbytes = MAX_LFS_FILESIZE;
1179 	sb->s_blocksize = huge_page_size(config.hstate);
1180 	sb->s_blocksize_bits = huge_page_shift(config.hstate);
1181 	sb->s_magic = HUGETLBFS_MAGIC;
1182 	sb->s_op = &hugetlbfs_ops;
1183 	sb->s_time_gran = 1;
1184 	sb->s_root = d_make_root(hugetlbfs_get_root(sb, &config));
1185 	if (!sb->s_root)
1186 		goto out_free;
1187 	return 0;
1188 out_free:
1189 	kfree(sbinfo->spool);
1190 	kfree(sbinfo);
1191 	return -ENOMEM;
1192 }
1193 
1194 static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type,
1195 	int flags, const char *dev_name, void *data)
1196 {
1197 	return mount_nodev(fs_type, flags, data, hugetlbfs_fill_super);
1198 }
1199 
/*
 * Filesystem type record for "hugetlbfs".  Mounts are created through
 * hugetlbfs_mount() and super blocks are torn down by kill_litter_super().
 */
static struct file_system_type hugetlbfs_fs_type = {
	.name		= "hugetlbfs",
	.mount		= hugetlbfs_mount,
	.kill_sb	= kill_litter_super,
};
MODULE_ALIAS_FS("hugetlbfs");

/*
 * Internal kernel mounts, one slot per hstate.  Filled in by
 * init_hugetlbfs_fs() and looked up by hstate index in
 * hugetlb_file_setup(); a NULL slot means that no internal mount is
 * available for that hstate.
 */
static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];
1208 
1209 static int can_do_hugetlb_shm(void)
1210 {
1211 	kgid_t shm_group;
1212 	shm_group = make_kgid(&init_user_ns, sysctl_hugetlb_shm_group);
1213 	return capable(CAP_IPC_LOCK) || in_group_p(shm_group);
1214 }
1215 
1216 static int get_hstate_idx(int page_size_log)
1217 {
1218 	struct hstate *h = hstate_sizelog(page_size_log);
1219 
1220 	if (!h)
1221 		return -1;
1222 	return h - hstates;
1223 }
1224 
/*
 * Dentry operations installed on the pseudo dentries that
 * hugetlb_file_setup() allocates; their names come from simple_dname().
 */
static const struct dentry_operations anon_ops = {
	.d_dname = simple_dname
};
1228 
1229 /*
1230  * Note that size should be aligned to proper hugepage size in caller side,
1231  * otherwise hugetlb_reserve_pages reserves one less hugepages than intended.
1232  */
1233 struct file *hugetlb_file_setup(const char *name, size_t size,
1234 				vm_flags_t acctflag, struct user_struct **user,
1235 				int creat_flags, int page_size_log)
1236 {
1237 	struct file *file = ERR_PTR(-ENOMEM);
1238 	struct inode *inode;
1239 	struct path path;
1240 	struct super_block *sb;
1241 	struct qstr quick_string;
1242 	int hstate_idx;
1243 
1244 	hstate_idx = get_hstate_idx(page_size_log);
1245 	if (hstate_idx < 0)
1246 		return ERR_PTR(-ENODEV);
1247 
1248 	*user = NULL;
1249 	if (!hugetlbfs_vfsmount[hstate_idx])
1250 		return ERR_PTR(-ENOENT);
1251 
1252 	if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
1253 		*user = current_user();
1254 		if (user_shm_lock(size, *user)) {
1255 			task_lock(current);
1256 			pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n",
1257 				current->comm, current->pid);
1258 			task_unlock(current);
1259 		} else {
1260 			*user = NULL;
1261 			return ERR_PTR(-EPERM);
1262 		}
1263 	}
1264 
1265 	sb = hugetlbfs_vfsmount[hstate_idx]->mnt_sb;
1266 	quick_string.name = name;
1267 	quick_string.len = strlen(quick_string.name);
1268 	quick_string.hash = 0;
1269 	path.dentry = d_alloc_pseudo(sb, &quick_string);
1270 	if (!path.dentry)
1271 		goto out_shm_unlock;
1272 
1273 	d_set_d_op(path.dentry, &anon_ops);
1274 	path.mnt = mntget(hugetlbfs_vfsmount[hstate_idx]);
1275 	file = ERR_PTR(-ENOSPC);
1276 	inode = hugetlbfs_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0);
1277 	if (!inode)
1278 		goto out_dentry;
1279 	if (creat_flags == HUGETLB_SHMFS_INODE)
1280 		inode->i_flags |= S_PRIVATE;
1281 
1282 	file = ERR_PTR(-ENOMEM);
1283 	if (hugetlb_reserve_pages(inode, 0,
1284 			size >> huge_page_shift(hstate_inode(inode)), NULL,
1285 			acctflag))
1286 		goto out_inode;
1287 
1288 	d_instantiate(path.dentry, inode);
1289 	inode->i_size = size;
1290 	clear_nlink(inode);
1291 
1292 	file = alloc_file(&path, FMODE_WRITE | FMODE_READ,
1293 			&hugetlbfs_file_operations);
1294 	if (IS_ERR(file))
1295 		goto out_dentry; /* inode is already attached */
1296 
1297 	return file;
1298 
1299 out_inode:
1300 	iput(inode);
1301 out_dentry:
1302 	path_put(&path);
1303 out_shm_unlock:
1304 	if (*user) {
1305 		user_shm_unlock(size, *user);
1306 		*user = NULL;
1307 	}
1308 	return file;
1309 }
1310 
1311 static int __init init_hugetlbfs_fs(void)
1312 {
1313 	struct hstate *h;
1314 	int error;
1315 	int i;
1316 
1317 	if (!hugepages_supported()) {
1318 		pr_info("disabling because there are no supported hugepage sizes\n");
1319 		return -ENOTSUPP;
1320 	}
1321 
1322 	error = -ENOMEM;
1323 	hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
1324 					sizeof(struct hugetlbfs_inode_info),
1325 					0, 0, init_once);
1326 	if (hugetlbfs_inode_cachep == NULL)
1327 		goto out2;
1328 
1329 	error = register_filesystem(&hugetlbfs_fs_type);
1330 	if (error)
1331 		goto out;
1332 
1333 	i = 0;
1334 	for_each_hstate(h) {
1335 		char buf[50];
1336 		unsigned ps_kb = 1U << (h->order + PAGE_SHIFT - 10);
1337 
1338 		snprintf(buf, sizeof(buf), "pagesize=%uK", ps_kb);
1339 		hugetlbfs_vfsmount[i] = kern_mount_data(&hugetlbfs_fs_type,
1340 							buf);
1341 
1342 		if (IS_ERR(hugetlbfs_vfsmount[i])) {
1343 			pr_err("Cannot mount internal hugetlbfs for "
1344 				"page size %uK", ps_kb);
1345 			error = PTR_ERR(hugetlbfs_vfsmount[i]);
1346 			hugetlbfs_vfsmount[i] = NULL;
1347 		}
1348 		i++;
1349 	}
1350 	/* Non default hstates are optional */
1351 	if (!IS_ERR_OR_NULL(hugetlbfs_vfsmount[default_hstate_idx]))
1352 		return 0;
1353 
1354  out:
1355 	kmem_cache_destroy(hugetlbfs_inode_cachep);
1356  out2:
1357 	return error;
1358 }
1359 
1360 static void __exit exit_hugetlbfs_fs(void)
1361 {
1362 	struct hstate *h;
1363 	int i;
1364 
1365 
1366 	/*
1367 	 * Make sure all delayed rcu free inodes are flushed before we
1368 	 * destroy cache.
1369 	 */
1370 	rcu_barrier();
1371 	kmem_cache_destroy(hugetlbfs_inode_cachep);
1372 	i = 0;
1373 	for_each_hstate(h)
1374 		kern_unmount(hugetlbfs_vfsmount[i++]);
1375 	unregister_filesystem(&hugetlbfs_fs_type);
1376 }
1377 
/* Module entry and exit points, and the module license tag. */
module_init(init_hugetlbfs_fs)
module_exit(exit_hugetlbfs_fs)

MODULE_LICENSE("GPL");
1382