1 /* 2 * hugetlbpage-backed filesystem. Based on ramfs. 3 * 4 * Nadia Yvette Chambers, 2002 5 * 6 * Copyright (C) 2002 Linus Torvalds. 7 */ 8 9 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 10 11 #include <linux/module.h> 12 #include <linux/thread_info.h> 13 #include <asm/current.h> 14 #include <linux/sched.h> /* remove ASAP */ 15 #include <linux/fs.h> 16 #include <linux/mount.h> 17 #include <linux/file.h> 18 #include <linux/kernel.h> 19 #include <linux/writeback.h> 20 #include <linux/pagemap.h> 21 #include <linux/highmem.h> 22 #include <linux/init.h> 23 #include <linux/string.h> 24 #include <linux/capability.h> 25 #include <linux/ctype.h> 26 #include <linux/backing-dev.h> 27 #include <linux/hugetlb.h> 28 #include <linux/pagevec.h> 29 #include <linux/parser.h> 30 #include <linux/mman.h> 31 #include <linux/slab.h> 32 #include <linux/dnotify.h> 33 #include <linux/statfs.h> 34 #include <linux/security.h> 35 #include <linux/magic.h> 36 #include <linux/migrate.h> 37 #include <linux/uio.h> 38 39 #include <asm/uaccess.h> 40 41 static const struct super_operations hugetlbfs_ops; 42 static const struct address_space_operations hugetlbfs_aops; 43 const struct file_operations hugetlbfs_file_operations; 44 static const struct inode_operations hugetlbfs_dir_inode_operations; 45 static const struct inode_operations hugetlbfs_inode_operations; 46 47 struct hugetlbfs_config { 48 kuid_t uid; 49 kgid_t gid; 50 umode_t mode; 51 long max_hpages; 52 long nr_inodes; 53 struct hstate *hstate; 54 long min_hpages; 55 }; 56 57 struct hugetlbfs_inode_info { 58 struct shared_policy policy; 59 struct inode vfs_inode; 60 }; 61 62 static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode) 63 { 64 return container_of(inode, struct hugetlbfs_inode_info, vfs_inode); 65 } 66 67 int sysctl_hugetlb_shm_group; 68 69 enum { 70 Opt_size, Opt_nr_inodes, 71 Opt_mode, Opt_uid, Opt_gid, 72 Opt_pagesize, Opt_min_size, 73 Opt_err, 74 }; 75 76 static const match_table_t tokens = { 77 {Opt_size, "size=%s"}, 78 {Opt_nr_inodes, "nr_inodes=%s"}, 79 {Opt_mode, "mode=%o"}, 80 {Opt_uid, "uid=%u"}, 81 {Opt_gid, "gid=%u"}, 82 {Opt_pagesize, "pagesize=%s"}, 83 {Opt_min_size, "min_size=%s"}, 84 {Opt_err, NULL}, 85 }; 86 87 static void huge_pagevec_release(struct pagevec *pvec) 88 { 89 int i; 90 91 for (i = 0; i < pagevec_count(pvec); ++i) 92 put_page(pvec->pages[i]); 93 94 pagevec_reinit(pvec); 95 } 96 97 static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) 98 { 99 struct inode *inode = file_inode(file); 100 loff_t len, vma_len; 101 int ret; 102 struct hstate *h = hstate_file(file); 103 104 /* 105 * vma address alignment (but not the pgoff alignment) has 106 * already been checked by prepare_hugepage_range. If you add 107 * any error returns here, do so after setting VM_HUGETLB, so 108 * is_vm_hugetlb_page tests below unmap_region go the right 109 * way when do_mmap_pgoff unwinds (may be important on powerpc 110 * and ia64). 111 */ 112 vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND; 113 vma->vm_ops = &hugetlb_vm_ops; 114 115 if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) 116 return -EINVAL; 117 118 vma_len = (loff_t)(vma->vm_end - vma->vm_start); 119 120 mutex_lock(&inode->i_mutex); 121 file_accessed(file); 122 123 ret = -ENOMEM; 124 len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); 125 126 if (hugetlb_reserve_pages(inode, 127 vma->vm_pgoff >> huge_page_order(h), 128 len >> huge_page_shift(h), vma, 129 vma->vm_flags)) 130 goto out; 131 132 ret = 0; 133 if (vma->vm_flags & VM_WRITE && inode->i_size < len) 134 inode->i_size = len; 135 out: 136 mutex_unlock(&inode->i_mutex); 137 138 return ret; 139 } 140 141 /* 142 * Called under down_write(mmap_sem). 143 */ 144 145 #ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA 146 static unsigned long 147 hugetlb_get_unmapped_area(struct file *file, unsigned long addr, 148 unsigned long len, unsigned long pgoff, unsigned long flags) 149 { 150 struct mm_struct *mm = current->mm; 151 struct vm_area_struct *vma; 152 struct hstate *h = hstate_file(file); 153 struct vm_unmapped_area_info info; 154 155 if (len & ~huge_page_mask(h)) 156 return -EINVAL; 157 if (len > TASK_SIZE) 158 return -ENOMEM; 159 160 if (flags & MAP_FIXED) { 161 if (prepare_hugepage_range(file, addr, len)) 162 return -EINVAL; 163 return addr; 164 } 165 166 if (addr) { 167 addr = ALIGN(addr, huge_page_size(h)); 168 vma = find_vma(mm, addr); 169 if (TASK_SIZE - len >= addr && 170 (!vma || addr + len <= vma->vm_start)) 171 return addr; 172 } 173 174 info.flags = 0; 175 info.length = len; 176 info.low_limit = TASK_UNMAPPED_BASE; 177 info.high_limit = TASK_SIZE; 178 info.align_mask = PAGE_MASK & ~huge_page_mask(h); 179 info.align_offset = 0; 180 return vm_unmapped_area(&info); 181 } 182 #endif 183 184 static size_t 185 hugetlbfs_read_actor(struct page *page, unsigned long offset, 186 struct iov_iter *to, unsigned long size) 187 { 188 size_t copied = 0; 189 int i, chunksize; 190 191 /* Find which 4k chunk and offset with in that chunk */ 192 i = offset >> PAGE_CACHE_SHIFT; 193 offset = offset & ~PAGE_CACHE_MASK; 194 195 while (size) { 196 size_t n; 197 chunksize = PAGE_CACHE_SIZE; 198 if (offset) 199 chunksize -= offset; 200 if (chunksize > size) 201 chunksize = size; 202 n = copy_page_to_iter(&page[i], offset, chunksize, to); 203 copied += n; 204 if (n != chunksize) 205 return copied; 206 offset = 0; 207 size -= chunksize; 208 i++; 209 } 210 return copied; 211 } 212 213 /* 214 * Support for read() - Find the page attached to f_mapping and copy out the 215 * data. Its *very* similar to do_generic_mapping_read(), we can't use that 216 * since it has PAGE_CACHE_SIZE assumptions. 217 */ 218 static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) 219 { 220 struct file *file = iocb->ki_filp; 221 struct hstate *h = hstate_file(file); 222 struct address_space *mapping = file->f_mapping; 223 struct inode *inode = mapping->host; 224 unsigned long index = iocb->ki_pos >> huge_page_shift(h); 225 unsigned long offset = iocb->ki_pos & ~huge_page_mask(h); 226 unsigned long end_index; 227 loff_t isize; 228 ssize_t retval = 0; 229 230 while (iov_iter_count(to)) { 231 struct page *page; 232 size_t nr, copied; 233 234 /* nr is the maximum number of bytes to copy from this page */ 235 nr = huge_page_size(h); 236 isize = i_size_read(inode); 237 if (!isize) 238 break; 239 end_index = (isize - 1) >> huge_page_shift(h); 240 if (index > end_index) 241 break; 242 if (index == end_index) { 243 nr = ((isize - 1) & ~huge_page_mask(h)) + 1; 244 if (nr <= offset) 245 break; 246 } 247 nr = nr - offset; 248 249 /* Find the page */ 250 page = find_lock_page(mapping, index); 251 if (unlikely(page == NULL)) { 252 /* 253 * We have a HOLE, zero out the user-buffer for the 254 * length of the hole or request. 255 */ 256 copied = iov_iter_zero(nr, to); 257 } else { 258 unlock_page(page); 259 260 /* 261 * We have the page, copy it to user space buffer. 262 */ 263 copied = hugetlbfs_read_actor(page, offset, to, nr); 264 page_cache_release(page); 265 } 266 offset += copied; 267 retval += copied; 268 if (copied != nr && iov_iter_count(to)) { 269 if (!retval) 270 retval = -EFAULT; 271 break; 272 } 273 index += offset >> huge_page_shift(h); 274 offset &= ~huge_page_mask(h); 275 } 276 iocb->ki_pos = ((loff_t)index << huge_page_shift(h)) + offset; 277 return retval; 278 } 279 280 static int hugetlbfs_write_begin(struct file *file, 281 struct address_space *mapping, 282 loff_t pos, unsigned len, unsigned flags, 283 struct page **pagep, void **fsdata) 284 { 285 return -EINVAL; 286 } 287 288 static int hugetlbfs_write_end(struct file *file, struct address_space *mapping, 289 loff_t pos, unsigned len, unsigned copied, 290 struct page *page, void *fsdata) 291 { 292 BUG(); 293 return -EINVAL; 294 } 295 296 static void truncate_huge_page(struct page *page) 297 { 298 ClearPageDirty(page); 299 ClearPageUptodate(page); 300 delete_from_page_cache(page); 301 } 302 303 static void truncate_hugepages(struct inode *inode, loff_t lstart) 304 { 305 struct hstate *h = hstate_inode(inode); 306 struct address_space *mapping = &inode->i_data; 307 const pgoff_t start = lstart >> huge_page_shift(h); 308 struct pagevec pvec; 309 pgoff_t next; 310 int i, freed = 0; 311 312 pagevec_init(&pvec, 0); 313 next = start; 314 while (1) { 315 if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { 316 if (next == start) 317 break; 318 next = start; 319 continue; 320 } 321 322 for (i = 0; i < pagevec_count(&pvec); ++i) { 323 struct page *page = pvec.pages[i]; 324 325 lock_page(page); 326 if (page->index > next) 327 next = page->index; 328 ++next; 329 truncate_huge_page(page); 330 unlock_page(page); 331 freed++; 332 } 333 huge_pagevec_release(&pvec); 334 } 335 BUG_ON(!lstart && mapping->nrpages); 336 hugetlb_unreserve_pages(inode, start, freed); 337 } 338 339 static void hugetlbfs_evict_inode(struct inode *inode) 340 { 341 struct resv_map *resv_map; 342 343 truncate_hugepages(inode, 0); 344 resv_map = (struct resv_map *)inode->i_mapping->private_data; 345 /* root inode doesn't have the resv_map, so we should check it */ 346 if (resv_map) 347 resv_map_release(&resv_map->refs); 348 clear_inode(inode); 349 } 350 351 static inline void 352 hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff) 353 { 354 struct vm_area_struct *vma; 355 356 vma_interval_tree_foreach(vma, root, pgoff, ULONG_MAX) { 357 unsigned long v_offset; 358 359 /* 360 * Can the expression below overflow on 32-bit arches? 361 * No, because the interval tree returns us only those vmas 362 * which overlap the truncated area starting at pgoff, 363 * and no vma on a 32-bit arch can span beyond the 4GB. 364 */ 365 if (vma->vm_pgoff < pgoff) 366 v_offset = (pgoff - vma->vm_pgoff) << PAGE_SHIFT; 367 else 368 v_offset = 0; 369 370 unmap_hugepage_range(vma, vma->vm_start + v_offset, 371 vma->vm_end, NULL); 372 } 373 } 374 375 static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) 376 { 377 pgoff_t pgoff; 378 struct address_space *mapping = inode->i_mapping; 379 struct hstate *h = hstate_inode(inode); 380 381 BUG_ON(offset & ~huge_page_mask(h)); 382 pgoff = offset >> PAGE_SHIFT; 383 384 i_size_write(inode, offset); 385 i_mmap_lock_write(mapping); 386 if (!RB_EMPTY_ROOT(&mapping->i_mmap)) 387 hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); 388 i_mmap_unlock_write(mapping); 389 truncate_hugepages(inode, offset); 390 return 0; 391 } 392 393 static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) 394 { 395 struct inode *inode = d_inode(dentry); 396 struct hstate *h = hstate_inode(inode); 397 int error; 398 unsigned int ia_valid = attr->ia_valid; 399 400 BUG_ON(!inode); 401 402 error = inode_change_ok(inode, attr); 403 if (error) 404 return error; 405 406 if (ia_valid & ATTR_SIZE) { 407 error = -EINVAL; 408 if (attr->ia_size & ~huge_page_mask(h)) 409 return -EINVAL; 410 error = hugetlb_vmtruncate(inode, attr->ia_size); 411 if (error) 412 return error; 413 } 414 415 setattr_copy(inode, attr); 416 mark_inode_dirty(inode); 417 return 0; 418 } 419 420 static struct inode *hugetlbfs_get_root(struct super_block *sb, 421 struct hugetlbfs_config *config) 422 { 423 struct inode *inode; 424 425 inode = new_inode(sb); 426 if (inode) { 427 struct hugetlbfs_inode_info *info; 428 inode->i_ino = get_next_ino(); 429 inode->i_mode = S_IFDIR | config->mode; 430 inode->i_uid = config->uid; 431 inode->i_gid = config->gid; 432 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 433 info = HUGETLBFS_I(inode); 434 mpol_shared_policy_init(&info->policy, NULL); 435 inode->i_op = &hugetlbfs_dir_inode_operations; 436 inode->i_fop = &simple_dir_operations; 437 /* directory inodes start off with i_nlink == 2 (for "." entry) */ 438 inc_nlink(inode); 439 lockdep_annotate_inode_mutex_key(inode); 440 } 441 return inode; 442 } 443 444 /* 445 * Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never 446 * be taken from reclaim -- unlike regular filesystems. This needs an 447 * annotation because huge_pmd_share() does an allocation under 448 * i_mmap_rwsem. 449 */ 450 static struct lock_class_key hugetlbfs_i_mmap_rwsem_key; 451 452 static struct inode *hugetlbfs_get_inode(struct super_block *sb, 453 struct inode *dir, 454 umode_t mode, dev_t dev) 455 { 456 struct inode *inode; 457 struct resv_map *resv_map; 458 459 resv_map = resv_map_alloc(); 460 if (!resv_map) 461 return NULL; 462 463 inode = new_inode(sb); 464 if (inode) { 465 struct hugetlbfs_inode_info *info; 466 inode->i_ino = get_next_ino(); 467 inode_init_owner(inode, dir, mode); 468 lockdep_set_class(&inode->i_mapping->i_mmap_rwsem, 469 &hugetlbfs_i_mmap_rwsem_key); 470 inode->i_mapping->a_ops = &hugetlbfs_aops; 471 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 472 inode->i_mapping->private_data = resv_map; 473 info = HUGETLBFS_I(inode); 474 /* 475 * The policy is initialized here even if we are creating a 476 * private inode because initialization simply creates an 477 * an empty rb tree and calls spin_lock_init(), later when we 478 * call mpol_free_shared_policy() it will just return because 479 * the rb tree will still be empty. 480 */ 481 mpol_shared_policy_init(&info->policy, NULL); 482 switch (mode & S_IFMT) { 483 default: 484 init_special_inode(inode, mode, dev); 485 break; 486 case S_IFREG: 487 inode->i_op = &hugetlbfs_inode_operations; 488 inode->i_fop = &hugetlbfs_file_operations; 489 break; 490 case S_IFDIR: 491 inode->i_op = &hugetlbfs_dir_inode_operations; 492 inode->i_fop = &simple_dir_operations; 493 494 /* directory inodes start off with i_nlink == 2 (for "." entry) */ 495 inc_nlink(inode); 496 break; 497 case S_IFLNK: 498 inode->i_op = &page_symlink_inode_operations; 499 break; 500 } 501 lockdep_annotate_inode_mutex_key(inode); 502 } else 503 kref_put(&resv_map->refs, resv_map_release); 504 505 return inode; 506 } 507 508 /* 509 * File creation. Allocate an inode, and we're done.. 510 */ 511 static int hugetlbfs_mknod(struct inode *dir, 512 struct dentry *dentry, umode_t mode, dev_t dev) 513 { 514 struct inode *inode; 515 int error = -ENOSPC; 516 517 inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev); 518 if (inode) { 519 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 520 d_instantiate(dentry, inode); 521 dget(dentry); /* Extra count - pin the dentry in core */ 522 error = 0; 523 } 524 return error; 525 } 526 527 static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 528 { 529 int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0); 530 if (!retval) 531 inc_nlink(dir); 532 return retval; 533 } 534 535 static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) 536 { 537 return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0); 538 } 539 540 static int hugetlbfs_symlink(struct inode *dir, 541 struct dentry *dentry, const char *symname) 542 { 543 struct inode *inode; 544 int error = -ENOSPC; 545 546 inode = hugetlbfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0); 547 if (inode) { 548 int l = strlen(symname)+1; 549 error = page_symlink(inode, symname, l); 550 if (!error) { 551 d_instantiate(dentry, inode); 552 dget(dentry); 553 } else 554 iput(inode); 555 } 556 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 557 558 return error; 559 } 560 561 /* 562 * mark the head page dirty 563 */ 564 static int hugetlbfs_set_page_dirty(struct page *page) 565 { 566 struct page *head = compound_head(page); 567 568 SetPageDirty(head); 569 return 0; 570 } 571 572 static int hugetlbfs_migrate_page(struct address_space *mapping, 573 struct page *newpage, struct page *page, 574 enum migrate_mode mode) 575 { 576 int rc; 577 578 rc = migrate_huge_page_move_mapping(mapping, newpage, page); 579 if (rc != MIGRATEPAGE_SUCCESS) 580 return rc; 581 migrate_page_copy(newpage, page); 582 583 return MIGRATEPAGE_SUCCESS; 584 } 585 586 static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) 587 { 588 struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb); 589 struct hstate *h = hstate_inode(d_inode(dentry)); 590 591 buf->f_type = HUGETLBFS_MAGIC; 592 buf->f_bsize = huge_page_size(h); 593 if (sbinfo) { 594 spin_lock(&sbinfo->stat_lock); 595 /* If no limits set, just report 0 for max/free/used 596 * blocks, like simple_statfs() */ 597 if (sbinfo->spool) { 598 long free_pages; 599 600 spin_lock(&sbinfo->spool->lock); 601 buf->f_blocks = sbinfo->spool->max_hpages; 602 free_pages = sbinfo->spool->max_hpages 603 - sbinfo->spool->used_hpages; 604 buf->f_bavail = buf->f_bfree = free_pages; 605 spin_unlock(&sbinfo->spool->lock); 606 buf->f_files = sbinfo->max_inodes; 607 buf->f_ffree = sbinfo->free_inodes; 608 } 609 spin_unlock(&sbinfo->stat_lock); 610 } 611 buf->f_namelen = NAME_MAX; 612 return 0; 613 } 614 615 static void hugetlbfs_put_super(struct super_block *sb) 616 { 617 struct hugetlbfs_sb_info *sbi = HUGETLBFS_SB(sb); 618 619 if (sbi) { 620 sb->s_fs_info = NULL; 621 622 if (sbi->spool) 623 hugepage_put_subpool(sbi->spool); 624 625 kfree(sbi); 626 } 627 } 628 629 static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo) 630 { 631 if (sbinfo->free_inodes >= 0) { 632 spin_lock(&sbinfo->stat_lock); 633 if (unlikely(!sbinfo->free_inodes)) { 634 spin_unlock(&sbinfo->stat_lock); 635 return 0; 636 } 637 sbinfo->free_inodes--; 638 spin_unlock(&sbinfo->stat_lock); 639 } 640 641 return 1; 642 } 643 644 static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo) 645 { 646 if (sbinfo->free_inodes >= 0) { 647 spin_lock(&sbinfo->stat_lock); 648 sbinfo->free_inodes++; 649 spin_unlock(&sbinfo->stat_lock); 650 } 651 } 652 653 654 static struct kmem_cache *hugetlbfs_inode_cachep; 655 656 static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) 657 { 658 struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb); 659 struct hugetlbfs_inode_info *p; 660 661 if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo))) 662 return NULL; 663 p = kmem_cache_alloc(hugetlbfs_inode_cachep, GFP_KERNEL); 664 if (unlikely(!p)) { 665 hugetlbfs_inc_free_inodes(sbinfo); 666 return NULL; 667 } 668 return &p->vfs_inode; 669 } 670 671 static void hugetlbfs_i_callback(struct rcu_head *head) 672 { 673 struct inode *inode = container_of(head, struct inode, i_rcu); 674 kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode)); 675 } 676 677 static void hugetlbfs_destroy_inode(struct inode *inode) 678 { 679 hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb)); 680 mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy); 681 call_rcu(&inode->i_rcu, hugetlbfs_i_callback); 682 } 683 684 static const struct address_space_operations hugetlbfs_aops = { 685 .write_begin = hugetlbfs_write_begin, 686 .write_end = hugetlbfs_write_end, 687 .set_page_dirty = hugetlbfs_set_page_dirty, 688 .migratepage = hugetlbfs_migrate_page, 689 }; 690 691 692 static void init_once(void *foo) 693 { 694 struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo; 695 696 inode_init_once(&ei->vfs_inode); 697 } 698 699 const struct file_operations hugetlbfs_file_operations = { 700 .read_iter = hugetlbfs_read_iter, 701 .mmap = hugetlbfs_file_mmap, 702 .fsync = noop_fsync, 703 .get_unmapped_area = hugetlb_get_unmapped_area, 704 .llseek = default_llseek, 705 }; 706 707 static const struct inode_operations hugetlbfs_dir_inode_operations = { 708 .create = hugetlbfs_create, 709 .lookup = simple_lookup, 710 .link = simple_link, 711 .unlink = simple_unlink, 712 .symlink = hugetlbfs_symlink, 713 .mkdir = hugetlbfs_mkdir, 714 .rmdir = simple_rmdir, 715 .mknod = hugetlbfs_mknod, 716 .rename = simple_rename, 717 .setattr = hugetlbfs_setattr, 718 }; 719 720 static const struct inode_operations hugetlbfs_inode_operations = { 721 .setattr = hugetlbfs_setattr, 722 }; 723 724 static const struct super_operations hugetlbfs_ops = { 725 .alloc_inode = hugetlbfs_alloc_inode, 726 .destroy_inode = hugetlbfs_destroy_inode, 727 .evict_inode = hugetlbfs_evict_inode, 728 .statfs = hugetlbfs_statfs, 729 .put_super = hugetlbfs_put_super, 730 .show_options = generic_show_options, 731 }; 732 733 enum { NO_SIZE, SIZE_STD, SIZE_PERCENT }; 734 735 /* 736 * Convert size option passed from command line to number of huge pages 737 * in the pool specified by hstate. Size option could be in bytes 738 * (val_type == SIZE_STD) or percentage of the pool (val_type == SIZE_PERCENT). 739 */ 740 static long long 741 hugetlbfs_size_to_hpages(struct hstate *h, unsigned long long size_opt, 742 int val_type) 743 { 744 if (val_type == NO_SIZE) 745 return -1; 746 747 if (val_type == SIZE_PERCENT) { 748 size_opt <<= huge_page_shift(h); 749 size_opt *= h->max_huge_pages; 750 do_div(size_opt, 100); 751 } 752 753 size_opt >>= huge_page_shift(h); 754 return size_opt; 755 } 756 757 static int 758 hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) 759 { 760 char *p, *rest; 761 substring_t args[MAX_OPT_ARGS]; 762 int option; 763 unsigned long long max_size_opt = 0, min_size_opt = 0; 764 int max_val_type = NO_SIZE, min_val_type = NO_SIZE; 765 766 if (!options) 767 return 0; 768 769 while ((p = strsep(&options, ",")) != NULL) { 770 int token; 771 if (!*p) 772 continue; 773 774 token = match_token(p, tokens, args); 775 switch (token) { 776 case Opt_uid: 777 if (match_int(&args[0], &option)) 778 goto bad_val; 779 pconfig->uid = make_kuid(current_user_ns(), option); 780 if (!uid_valid(pconfig->uid)) 781 goto bad_val; 782 break; 783 784 case Opt_gid: 785 if (match_int(&args[0], &option)) 786 goto bad_val; 787 pconfig->gid = make_kgid(current_user_ns(), option); 788 if (!gid_valid(pconfig->gid)) 789 goto bad_val; 790 break; 791 792 case Opt_mode: 793 if (match_octal(&args[0], &option)) 794 goto bad_val; 795 pconfig->mode = option & 01777U; 796 break; 797 798 case Opt_size: { 799 /* memparse() will accept a K/M/G without a digit */ 800 if (!isdigit(*args[0].from)) 801 goto bad_val; 802 max_size_opt = memparse(args[0].from, &rest); 803 max_val_type = SIZE_STD; 804 if (*rest == '%') 805 max_val_type = SIZE_PERCENT; 806 break; 807 } 808 809 case Opt_nr_inodes: 810 /* memparse() will accept a K/M/G without a digit */ 811 if (!isdigit(*args[0].from)) 812 goto bad_val; 813 pconfig->nr_inodes = memparse(args[0].from, &rest); 814 break; 815 816 case Opt_pagesize: { 817 unsigned long ps; 818 ps = memparse(args[0].from, &rest); 819 pconfig->hstate = size_to_hstate(ps); 820 if (!pconfig->hstate) { 821 pr_err("Unsupported page size %lu MB\n", 822 ps >> 20); 823 return -EINVAL; 824 } 825 break; 826 } 827 828 case Opt_min_size: { 829 /* memparse() will accept a K/M/G without a digit */ 830 if (!isdigit(*args[0].from)) 831 goto bad_val; 832 min_size_opt = memparse(args[0].from, &rest); 833 min_val_type = SIZE_STD; 834 if (*rest == '%') 835 min_val_type = SIZE_PERCENT; 836 break; 837 } 838 839 default: 840 pr_err("Bad mount option: \"%s\"\n", p); 841 return -EINVAL; 842 break; 843 } 844 } 845 846 /* 847 * Use huge page pool size (in hstate) to convert the size 848 * options to number of huge pages. If NO_SIZE, -1 is returned. 849 */ 850 pconfig->max_hpages = hugetlbfs_size_to_hpages(pconfig->hstate, 851 max_size_opt, max_val_type); 852 pconfig->min_hpages = hugetlbfs_size_to_hpages(pconfig->hstate, 853 min_size_opt, min_val_type); 854 855 /* 856 * If max_size was specified, then min_size must be smaller 857 */ 858 if (max_val_type > NO_SIZE && 859 pconfig->min_hpages > pconfig->max_hpages) { 860 pr_err("minimum size can not be greater than maximum size\n"); 861 return -EINVAL; 862 } 863 864 return 0; 865 866 bad_val: 867 pr_err("Bad value '%s' for mount option '%s'\n", args[0].from, p); 868 return -EINVAL; 869 } 870 871 static int 872 hugetlbfs_fill_super(struct super_block *sb, void *data, int silent) 873 { 874 int ret; 875 struct hugetlbfs_config config; 876 struct hugetlbfs_sb_info *sbinfo; 877 878 save_mount_options(sb, data); 879 880 config.max_hpages = -1; /* No limit on size by default */ 881 config.nr_inodes = -1; /* No limit on number of inodes by default */ 882 config.uid = current_fsuid(); 883 config.gid = current_fsgid(); 884 config.mode = 0755; 885 config.hstate = &default_hstate; 886 config.min_hpages = -1; /* No default minimum size */ 887 ret = hugetlbfs_parse_options(data, &config); 888 if (ret) 889 return ret; 890 891 sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), GFP_KERNEL); 892 if (!sbinfo) 893 return -ENOMEM; 894 sb->s_fs_info = sbinfo; 895 sbinfo->hstate = config.hstate; 896 spin_lock_init(&sbinfo->stat_lock); 897 sbinfo->max_inodes = config.nr_inodes; 898 sbinfo->free_inodes = config.nr_inodes; 899 sbinfo->spool = NULL; 900 /* 901 * Allocate and initialize subpool if maximum or minimum size is 902 * specified. Any needed reservations (for minimim size) are taken 903 * taken when the subpool is created. 904 */ 905 if (config.max_hpages != -1 || config.min_hpages != -1) { 906 sbinfo->spool = hugepage_new_subpool(config.hstate, 907 config.max_hpages, 908 config.min_hpages); 909 if (!sbinfo->spool) 910 goto out_free; 911 } 912 sb->s_maxbytes = MAX_LFS_FILESIZE; 913 sb->s_blocksize = huge_page_size(config.hstate); 914 sb->s_blocksize_bits = huge_page_shift(config.hstate); 915 sb->s_magic = HUGETLBFS_MAGIC; 916 sb->s_op = &hugetlbfs_ops; 917 sb->s_time_gran = 1; 918 sb->s_root = d_make_root(hugetlbfs_get_root(sb, &config)); 919 if (!sb->s_root) 920 goto out_free; 921 return 0; 922 out_free: 923 kfree(sbinfo->spool); 924 kfree(sbinfo); 925 return -ENOMEM; 926 } 927 928 static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type, 929 int flags, const char *dev_name, void *data) 930 { 931 return mount_nodev(fs_type, flags, data, hugetlbfs_fill_super); 932 } 933 934 static struct file_system_type hugetlbfs_fs_type = { 935 .name = "hugetlbfs", 936 .mount = hugetlbfs_mount, 937 .kill_sb = kill_litter_super, 938 }; 939 MODULE_ALIAS_FS("hugetlbfs"); 940 941 static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE]; 942 943 static int can_do_hugetlb_shm(void) 944 { 945 kgid_t shm_group; 946 shm_group = make_kgid(&init_user_ns, sysctl_hugetlb_shm_group); 947 return capable(CAP_IPC_LOCK) || in_group_p(shm_group); 948 } 949 950 static int get_hstate_idx(int page_size_log) 951 { 952 struct hstate *h = hstate_sizelog(page_size_log); 953 954 if (!h) 955 return -1; 956 return h - hstates; 957 } 958 959 static const struct dentry_operations anon_ops = { 960 .d_dname = simple_dname 961 }; 962 963 /* 964 * Note that size should be aligned to proper hugepage size in caller side, 965 * otherwise hugetlb_reserve_pages reserves one less hugepages than intended. 966 */ 967 struct file *hugetlb_file_setup(const char *name, size_t size, 968 vm_flags_t acctflag, struct user_struct **user, 969 int creat_flags, int page_size_log) 970 { 971 struct file *file = ERR_PTR(-ENOMEM); 972 struct inode *inode; 973 struct path path; 974 struct super_block *sb; 975 struct qstr quick_string; 976 int hstate_idx; 977 978 hstate_idx = get_hstate_idx(page_size_log); 979 if (hstate_idx < 0) 980 return ERR_PTR(-ENODEV); 981 982 *user = NULL; 983 if (!hugetlbfs_vfsmount[hstate_idx]) 984 return ERR_PTR(-ENOENT); 985 986 if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { 987 *user = current_user(); 988 if (user_shm_lock(size, *user)) { 989 task_lock(current); 990 pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n", 991 current->comm, current->pid); 992 task_unlock(current); 993 } else { 994 *user = NULL; 995 return ERR_PTR(-EPERM); 996 } 997 } 998 999 sb = hugetlbfs_vfsmount[hstate_idx]->mnt_sb; 1000 quick_string.name = name; 1001 quick_string.len = strlen(quick_string.name); 1002 quick_string.hash = 0; 1003 path.dentry = d_alloc_pseudo(sb, &quick_string); 1004 if (!path.dentry) 1005 goto out_shm_unlock; 1006 1007 d_set_d_op(path.dentry, &anon_ops); 1008 path.mnt = mntget(hugetlbfs_vfsmount[hstate_idx]); 1009 file = ERR_PTR(-ENOSPC); 1010 inode = hugetlbfs_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0); 1011 if (!inode) 1012 goto out_dentry; 1013 if (creat_flags == HUGETLB_SHMFS_INODE) 1014 inode->i_flags |= S_PRIVATE; 1015 1016 file = ERR_PTR(-ENOMEM); 1017 if (hugetlb_reserve_pages(inode, 0, 1018 size >> huge_page_shift(hstate_inode(inode)), NULL, 1019 acctflag)) 1020 goto out_inode; 1021 1022 d_instantiate(path.dentry, inode); 1023 inode->i_size = size; 1024 clear_nlink(inode); 1025 1026 file = alloc_file(&path, FMODE_WRITE | FMODE_READ, 1027 &hugetlbfs_file_operations); 1028 if (IS_ERR(file)) 1029 goto out_dentry; /* inode is already attached */ 1030 1031 return file; 1032 1033 out_inode: 1034 iput(inode); 1035 out_dentry: 1036 path_put(&path); 1037 out_shm_unlock: 1038 if (*user) { 1039 user_shm_unlock(size, *user); 1040 *user = NULL; 1041 } 1042 return file; 1043 } 1044 1045 static int __init init_hugetlbfs_fs(void) 1046 { 1047 struct hstate *h; 1048 int error; 1049 int i; 1050 1051 if (!hugepages_supported()) { 1052 pr_info("disabling because there are no supported hugepage sizes\n"); 1053 return -ENOTSUPP; 1054 } 1055 1056 error = -ENOMEM; 1057 hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache", 1058 sizeof(struct hugetlbfs_inode_info), 1059 0, 0, init_once); 1060 if (hugetlbfs_inode_cachep == NULL) 1061 goto out2; 1062 1063 error = register_filesystem(&hugetlbfs_fs_type); 1064 if (error) 1065 goto out; 1066 1067 i = 0; 1068 for_each_hstate(h) { 1069 char buf[50]; 1070 unsigned ps_kb = 1U << (h->order + PAGE_SHIFT - 10); 1071 1072 snprintf(buf, sizeof(buf), "pagesize=%uK", ps_kb); 1073 hugetlbfs_vfsmount[i] = kern_mount_data(&hugetlbfs_fs_type, 1074 buf); 1075 1076 if (IS_ERR(hugetlbfs_vfsmount[i])) { 1077 pr_err("Cannot mount internal hugetlbfs for " 1078 "page size %uK", ps_kb); 1079 error = PTR_ERR(hugetlbfs_vfsmount[i]); 1080 hugetlbfs_vfsmount[i] = NULL; 1081 } 1082 i++; 1083 } 1084 /* Non default hstates are optional */ 1085 if (!IS_ERR_OR_NULL(hugetlbfs_vfsmount[default_hstate_idx])) 1086 return 0; 1087 1088 out: 1089 kmem_cache_destroy(hugetlbfs_inode_cachep); 1090 out2: 1091 return error; 1092 } 1093 1094 static void __exit exit_hugetlbfs_fs(void) 1095 { 1096 struct hstate *h; 1097 int i; 1098 1099 1100 /* 1101 * Make sure all delayed rcu free inodes are flushed before we 1102 * destroy cache. 1103 */ 1104 rcu_barrier(); 1105 kmem_cache_destroy(hugetlbfs_inode_cachep); 1106 i = 0; 1107 for_each_hstate(h) 1108 kern_unmount(hugetlbfs_vfsmount[i++]); 1109 unregister_filesystem(&hugetlbfs_fs_type); 1110 } 1111 1112 module_init(init_hugetlbfs_fs) 1113 module_exit(exit_hugetlbfs_fs) 1114 1115 MODULE_LICENSE("GPL"); 1116