1 /* 2 * hugetlbpage-backed filesystem. Based on ramfs. 3 * 4 * William Irwin, 2002 5 * 6 * Copyright (C) 2002 Linus Torvalds. 7 */ 8 9 #include <linux/module.h> 10 #include <linux/thread_info.h> 11 #include <asm/current.h> 12 #include <linux/sched.h> /* remove ASAP */ 13 #include <linux/fs.h> 14 #include <linux/mount.h> 15 #include <linux/file.h> 16 #include <linux/kernel.h> 17 #include <linux/writeback.h> 18 #include <linux/pagemap.h> 19 #include <linux/highmem.h> 20 #include <linux/init.h> 21 #include <linux/string.h> 22 #include <linux/capability.h> 23 #include <linux/ctype.h> 24 #include <linux/backing-dev.h> 25 #include <linux/hugetlb.h> 26 #include <linux/pagevec.h> 27 #include <linux/parser.h> 28 #include <linux/mman.h> 29 #include <linux/quotaops.h> 30 #include <linux/slab.h> 31 #include <linux/dnotify.h> 32 #include <linux/statfs.h> 33 #include <linux/security.h> 34 35 #include <asm/uaccess.h> 36 37 /* some random number */ 38 #define HUGETLBFS_MAGIC 0x958458f6 39 40 static const struct super_operations hugetlbfs_ops; 41 static const struct address_space_operations hugetlbfs_aops; 42 const struct file_operations hugetlbfs_file_operations; 43 static const struct inode_operations hugetlbfs_dir_inode_operations; 44 static const struct inode_operations hugetlbfs_inode_operations; 45 46 static struct backing_dev_info hugetlbfs_backing_dev_info = { 47 .ra_pages = 0, /* No readahead */ 48 .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, 49 }; 50 51 int sysctl_hugetlb_shm_group; 52 53 enum { 54 Opt_size, Opt_nr_inodes, 55 Opt_mode, Opt_uid, Opt_gid, 56 Opt_err, 57 }; 58 59 static match_table_t tokens = { 60 {Opt_size, "size=%s"}, 61 {Opt_nr_inodes, "nr_inodes=%s"}, 62 {Opt_mode, "mode=%o"}, 63 {Opt_uid, "uid=%u"}, 64 {Opt_gid, "gid=%u"}, 65 {Opt_err, NULL}, 66 }; 67 68 static void huge_pagevec_release(struct pagevec *pvec) 69 { 70 int i; 71 72 for (i = 0; i < pagevec_count(pvec); ++i) 73 put_page(pvec->pages[i]); 74 75 pagevec_reinit(pvec); 76 } 77 78 static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) 79 { 80 struct inode *inode = file->f_path.dentry->d_inode; 81 loff_t len, vma_len; 82 int ret; 83 84 /* 85 * vma address alignment (but not the pgoff alignment) has 86 * already been checked by prepare_hugepage_range. If you add 87 * any error returns here, do so after setting VM_HUGETLB, so 88 * is_vm_hugetlb_page tests below unmap_region go the right 89 * way when do_mmap_pgoff unwinds (may be important on powerpc 90 * and ia64). 91 */ 92 vma->vm_flags |= VM_HUGETLB | VM_RESERVED; 93 vma->vm_ops = &hugetlb_vm_ops; 94 95 if (vma->vm_pgoff & ~(HPAGE_MASK >> PAGE_SHIFT)) 96 return -EINVAL; 97 98 vma_len = (loff_t)(vma->vm_end - vma->vm_start); 99 100 mutex_lock(&inode->i_mutex); 101 file_accessed(file); 102 103 ret = -ENOMEM; 104 len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); 105 106 if (vma->vm_flags & VM_MAYSHARE && 107 hugetlb_reserve_pages(inode, vma->vm_pgoff >> (HPAGE_SHIFT-PAGE_SHIFT), 108 len >> HPAGE_SHIFT)) 109 goto out; 110 111 ret = 0; 112 hugetlb_prefault_arch_hook(vma->vm_mm); 113 if (vma->vm_flags & VM_WRITE && inode->i_size < len) 114 inode->i_size = len; 115 out: 116 mutex_unlock(&inode->i_mutex); 117 118 return ret; 119 } 120 121 /* 122 * Called under down_write(mmap_sem). 123 */ 124 125 #ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA 126 static unsigned long 127 hugetlb_get_unmapped_area(struct file *file, unsigned long addr, 128 unsigned long len, unsigned long pgoff, unsigned long flags) 129 { 130 struct mm_struct *mm = current->mm; 131 struct vm_area_struct *vma; 132 unsigned long start_addr; 133 134 if (len & ~HPAGE_MASK) 135 return -EINVAL; 136 if (len > TASK_SIZE) 137 return -ENOMEM; 138 139 if (flags & MAP_FIXED) { 140 if (prepare_hugepage_range(addr, len)) 141 return -EINVAL; 142 return addr; 143 } 144 145 if (addr) { 146 addr = ALIGN(addr, HPAGE_SIZE); 147 vma = find_vma(mm, addr); 148 if (TASK_SIZE - len >= addr && 149 (!vma || addr + len <= vma->vm_start)) 150 return addr; 151 } 152 153 start_addr = mm->free_area_cache; 154 155 if (len <= mm->cached_hole_size) 156 start_addr = TASK_UNMAPPED_BASE; 157 158 full_search: 159 addr = ALIGN(start_addr, HPAGE_SIZE); 160 161 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { 162 /* At this point: (!vma || addr < vma->vm_end). */ 163 if (TASK_SIZE - len < addr) { 164 /* 165 * Start a new search - just in case we missed 166 * some holes. 167 */ 168 if (start_addr != TASK_UNMAPPED_BASE) { 169 start_addr = TASK_UNMAPPED_BASE; 170 goto full_search; 171 } 172 return -ENOMEM; 173 } 174 175 if (!vma || addr + len <= vma->vm_start) 176 return addr; 177 addr = ALIGN(vma->vm_end, HPAGE_SIZE); 178 } 179 } 180 #endif 181 182 static int 183 hugetlbfs_read_actor(struct page *page, unsigned long offset, 184 char __user *buf, unsigned long count, 185 unsigned long size) 186 { 187 char *kaddr; 188 unsigned long left, copied = 0; 189 int i, chunksize; 190 191 if (size > count) 192 size = count; 193 194 /* Find which 4k chunk and offset with in that chunk */ 195 i = offset >> PAGE_CACHE_SHIFT; 196 offset = offset & ~PAGE_CACHE_MASK; 197 198 while (size) { 199 chunksize = PAGE_CACHE_SIZE; 200 if (offset) 201 chunksize -= offset; 202 if (chunksize > size) 203 chunksize = size; 204 kaddr = kmap(&page[i]); 205 left = __copy_to_user(buf, kaddr + offset, chunksize); 206 kunmap(&page[i]); 207 if (left) { 208 copied += (chunksize - left); 209 break; 210 } 211 offset = 0; 212 size -= chunksize; 213 buf += chunksize; 214 copied += chunksize; 215 i++; 216 } 217 return copied ? copied : -EFAULT; 218 } 219 220 /* 221 * Support for read() - Find the page attached to f_mapping and copy out the 222 * data. Its *very* similar to do_generic_mapping_read(), we can't use that 223 * since it has PAGE_CACHE_SIZE assumptions. 224 */ 225 static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, 226 size_t len, loff_t *ppos) 227 { 228 struct address_space *mapping = filp->f_mapping; 229 struct inode *inode = mapping->host; 230 unsigned long index = *ppos >> HPAGE_SHIFT; 231 unsigned long offset = *ppos & ~HPAGE_MASK; 232 unsigned long end_index; 233 loff_t isize; 234 ssize_t retval = 0; 235 236 mutex_lock(&inode->i_mutex); 237 238 /* validate length */ 239 if (len == 0) 240 goto out; 241 242 isize = i_size_read(inode); 243 if (!isize) 244 goto out; 245 246 end_index = (isize - 1) >> HPAGE_SHIFT; 247 for (;;) { 248 struct page *page; 249 int nr, ret; 250 251 /* nr is the maximum number of bytes to copy from this page */ 252 nr = HPAGE_SIZE; 253 if (index >= end_index) { 254 if (index > end_index) 255 goto out; 256 nr = ((isize - 1) & ~HPAGE_MASK) + 1; 257 if (nr <= offset) { 258 goto out; 259 } 260 } 261 nr = nr - offset; 262 263 /* Find the page */ 264 page = find_get_page(mapping, index); 265 if (unlikely(page == NULL)) { 266 /* 267 * We have a HOLE, zero out the user-buffer for the 268 * length of the hole or request. 269 */ 270 ret = len < nr ? len : nr; 271 if (clear_user(buf, ret)) 272 ret = -EFAULT; 273 } else { 274 /* 275 * We have the page, copy it to user space buffer. 276 */ 277 ret = hugetlbfs_read_actor(page, offset, buf, len, nr); 278 } 279 if (ret < 0) { 280 if (retval == 0) 281 retval = ret; 282 if (page) 283 page_cache_release(page); 284 goto out; 285 } 286 287 offset += ret; 288 retval += ret; 289 len -= ret; 290 index += offset >> HPAGE_SHIFT; 291 offset &= ~HPAGE_MASK; 292 293 if (page) 294 page_cache_release(page); 295 296 /* short read or no more work */ 297 if ((ret != nr) || (len == 0)) 298 break; 299 } 300 out: 301 *ppos = ((loff_t)index << HPAGE_SHIFT) + offset; 302 mutex_unlock(&inode->i_mutex); 303 return retval; 304 } 305 306 /* 307 * Read a page. Again trivial. If it didn't already exist 308 * in the page cache, it is zero-filled. 309 */ 310 static int hugetlbfs_readpage(struct file *file, struct page * page) 311 { 312 unlock_page(page); 313 return -EINVAL; 314 } 315 316 static int hugetlbfs_write_begin(struct file *file, 317 struct address_space *mapping, 318 loff_t pos, unsigned len, unsigned flags, 319 struct page **pagep, void **fsdata) 320 { 321 return -EINVAL; 322 } 323 324 static int hugetlbfs_write_end(struct file *file, struct address_space *mapping, 325 loff_t pos, unsigned len, unsigned copied, 326 struct page *page, void *fsdata) 327 { 328 BUG(); 329 return -EINVAL; 330 } 331 332 static void truncate_huge_page(struct page *page) 333 { 334 cancel_dirty_page(page, /* No IO accounting for huge pages? */0); 335 ClearPageUptodate(page); 336 remove_from_page_cache(page); 337 put_page(page); 338 } 339 340 static void truncate_hugepages(struct inode *inode, loff_t lstart) 341 { 342 struct address_space *mapping = &inode->i_data; 343 const pgoff_t start = lstart >> HPAGE_SHIFT; 344 struct pagevec pvec; 345 pgoff_t next; 346 int i, freed = 0; 347 348 pagevec_init(&pvec, 0); 349 next = start; 350 while (1) { 351 if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { 352 if (next == start) 353 break; 354 next = start; 355 continue; 356 } 357 358 for (i = 0; i < pagevec_count(&pvec); ++i) { 359 struct page *page = pvec.pages[i]; 360 361 lock_page(page); 362 if (page->index > next) 363 next = page->index; 364 ++next; 365 truncate_huge_page(page); 366 unlock_page(page); 367 hugetlb_put_quota(mapping); 368 freed++; 369 } 370 huge_pagevec_release(&pvec); 371 } 372 BUG_ON(!lstart && mapping->nrpages); 373 hugetlb_unreserve_pages(inode, start, freed); 374 } 375 376 static void hugetlbfs_delete_inode(struct inode *inode) 377 { 378 truncate_hugepages(inode, 0); 379 clear_inode(inode); 380 } 381 382 static void hugetlbfs_forget_inode(struct inode *inode) __releases(inode_lock) 383 { 384 struct super_block *sb = inode->i_sb; 385 386 if (!hlist_unhashed(&inode->i_hash)) { 387 if (!(inode->i_state & (I_DIRTY|I_LOCK))) 388 list_move(&inode->i_list, &inode_unused); 389 inodes_stat.nr_unused++; 390 if (!sb || (sb->s_flags & MS_ACTIVE)) { 391 spin_unlock(&inode_lock); 392 return; 393 } 394 inode->i_state |= I_WILL_FREE; 395 spin_unlock(&inode_lock); 396 /* 397 * write_inode_now is a noop as we set BDI_CAP_NO_WRITEBACK 398 * in our backing_dev_info. 399 */ 400 write_inode_now(inode, 1); 401 spin_lock(&inode_lock); 402 inode->i_state &= ~I_WILL_FREE; 403 inodes_stat.nr_unused--; 404 hlist_del_init(&inode->i_hash); 405 } 406 list_del_init(&inode->i_list); 407 list_del_init(&inode->i_sb_list); 408 inode->i_state |= I_FREEING; 409 inodes_stat.nr_inodes--; 410 spin_unlock(&inode_lock); 411 truncate_hugepages(inode, 0); 412 clear_inode(inode); 413 destroy_inode(inode); 414 } 415 416 static void hugetlbfs_drop_inode(struct inode *inode) 417 { 418 if (!inode->i_nlink) 419 generic_delete_inode(inode); 420 else 421 hugetlbfs_forget_inode(inode); 422 } 423 424 static inline void 425 hugetlb_vmtruncate_list(struct prio_tree_root *root, pgoff_t pgoff) 426 { 427 struct vm_area_struct *vma; 428 struct prio_tree_iter iter; 429 430 vma_prio_tree_foreach(vma, &iter, root, pgoff, ULONG_MAX) { 431 unsigned long v_offset; 432 433 /* 434 * Can the expression below overflow on 32-bit arches? 435 * No, because the prio_tree returns us only those vmas 436 * which overlap the truncated area starting at pgoff, 437 * and no vma on a 32-bit arch can span beyond the 4GB. 438 */ 439 if (vma->vm_pgoff < pgoff) 440 v_offset = (pgoff - vma->vm_pgoff) << PAGE_SHIFT; 441 else 442 v_offset = 0; 443 444 __unmap_hugepage_range(vma, 445 vma->vm_start + v_offset, vma->vm_end); 446 } 447 } 448 449 static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) 450 { 451 pgoff_t pgoff; 452 struct address_space *mapping = inode->i_mapping; 453 454 BUG_ON(offset & ~HPAGE_MASK); 455 pgoff = offset >> PAGE_SHIFT; 456 457 i_size_write(inode, offset); 458 spin_lock(&mapping->i_mmap_lock); 459 if (!prio_tree_empty(&mapping->i_mmap)) 460 hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); 461 spin_unlock(&mapping->i_mmap_lock); 462 truncate_hugepages(inode, offset); 463 return 0; 464 } 465 466 static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) 467 { 468 struct inode *inode = dentry->d_inode; 469 int error; 470 unsigned int ia_valid = attr->ia_valid; 471 472 BUG_ON(!inode); 473 474 error = inode_change_ok(inode, attr); 475 if (error) 476 goto out; 477 478 if (ia_valid & ATTR_SIZE) { 479 error = -EINVAL; 480 if (!(attr->ia_size & ~HPAGE_MASK)) 481 error = hugetlb_vmtruncate(inode, attr->ia_size); 482 if (error) 483 goto out; 484 attr->ia_valid &= ~ATTR_SIZE; 485 } 486 error = inode_setattr(inode, attr); 487 out: 488 return error; 489 } 490 491 static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid, 492 gid_t gid, int mode, dev_t dev) 493 { 494 struct inode *inode; 495 496 inode = new_inode(sb); 497 if (inode) { 498 struct hugetlbfs_inode_info *info; 499 inode->i_mode = mode; 500 inode->i_uid = uid; 501 inode->i_gid = gid; 502 inode->i_blocks = 0; 503 inode->i_mapping->a_ops = &hugetlbfs_aops; 504 inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; 505 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 506 INIT_LIST_HEAD(&inode->i_mapping->private_list); 507 info = HUGETLBFS_I(inode); 508 mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, NULL); 509 switch (mode & S_IFMT) { 510 default: 511 init_special_inode(inode, mode, dev); 512 break; 513 case S_IFREG: 514 inode->i_op = &hugetlbfs_inode_operations; 515 inode->i_fop = &hugetlbfs_file_operations; 516 break; 517 case S_IFDIR: 518 inode->i_op = &hugetlbfs_dir_inode_operations; 519 inode->i_fop = &simple_dir_operations; 520 521 /* directory inodes start off with i_nlink == 2 (for "." entry) */ 522 inc_nlink(inode); 523 break; 524 case S_IFLNK: 525 inode->i_op = &page_symlink_inode_operations; 526 break; 527 } 528 } 529 return inode; 530 } 531 532 /* 533 * File creation. Allocate an inode, and we're done.. 534 */ 535 static int hugetlbfs_mknod(struct inode *dir, 536 struct dentry *dentry, int mode, dev_t dev) 537 { 538 struct inode *inode; 539 int error = -ENOSPC; 540 gid_t gid; 541 542 if (dir->i_mode & S_ISGID) { 543 gid = dir->i_gid; 544 if (S_ISDIR(mode)) 545 mode |= S_ISGID; 546 } else { 547 gid = current->fsgid; 548 } 549 inode = hugetlbfs_get_inode(dir->i_sb, current->fsuid, gid, mode, dev); 550 if (inode) { 551 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 552 d_instantiate(dentry, inode); 553 dget(dentry); /* Extra count - pin the dentry in core */ 554 error = 0; 555 } 556 return error; 557 } 558 559 static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) 560 { 561 int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0); 562 if (!retval) 563 inc_nlink(dir); 564 return retval; 565 } 566 567 static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd) 568 { 569 return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0); 570 } 571 572 static int hugetlbfs_symlink(struct inode *dir, 573 struct dentry *dentry, const char *symname) 574 { 575 struct inode *inode; 576 int error = -ENOSPC; 577 gid_t gid; 578 579 if (dir->i_mode & S_ISGID) 580 gid = dir->i_gid; 581 else 582 gid = current->fsgid; 583 584 inode = hugetlbfs_get_inode(dir->i_sb, current->fsuid, 585 gid, S_IFLNK|S_IRWXUGO, 0); 586 if (inode) { 587 int l = strlen(symname)+1; 588 error = page_symlink(inode, symname, l); 589 if (!error) { 590 d_instantiate(dentry, inode); 591 dget(dentry); 592 } else 593 iput(inode); 594 } 595 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 596 597 return error; 598 } 599 600 /* 601 * mark the head page dirty 602 */ 603 static int hugetlbfs_set_page_dirty(struct page *page) 604 { 605 struct page *head = compound_head(page); 606 607 SetPageDirty(head); 608 return 0; 609 } 610 611 static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) 612 { 613 struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb); 614 615 buf->f_type = HUGETLBFS_MAGIC; 616 buf->f_bsize = HPAGE_SIZE; 617 if (sbinfo) { 618 spin_lock(&sbinfo->stat_lock); 619 /* If no limits set, just report 0 for max/free/used 620 * blocks, like simple_statfs() */ 621 if (sbinfo->max_blocks >= 0) { 622 buf->f_blocks = sbinfo->max_blocks; 623 buf->f_bavail = buf->f_bfree = sbinfo->free_blocks; 624 buf->f_files = sbinfo->max_inodes; 625 buf->f_ffree = sbinfo->free_inodes; 626 } 627 spin_unlock(&sbinfo->stat_lock); 628 } 629 buf->f_namelen = NAME_MAX; 630 return 0; 631 } 632 633 static void hugetlbfs_put_super(struct super_block *sb) 634 { 635 struct hugetlbfs_sb_info *sbi = HUGETLBFS_SB(sb); 636 637 if (sbi) { 638 sb->s_fs_info = NULL; 639 kfree(sbi); 640 } 641 } 642 643 static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo) 644 { 645 if (sbinfo->free_inodes >= 0) { 646 spin_lock(&sbinfo->stat_lock); 647 if (unlikely(!sbinfo->free_inodes)) { 648 spin_unlock(&sbinfo->stat_lock); 649 return 0; 650 } 651 sbinfo->free_inodes--; 652 spin_unlock(&sbinfo->stat_lock); 653 } 654 655 return 1; 656 } 657 658 static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo) 659 { 660 if (sbinfo->free_inodes >= 0) { 661 spin_lock(&sbinfo->stat_lock); 662 sbinfo->free_inodes++; 663 spin_unlock(&sbinfo->stat_lock); 664 } 665 } 666 667 668 static struct kmem_cache *hugetlbfs_inode_cachep; 669 670 static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) 671 { 672 struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb); 673 struct hugetlbfs_inode_info *p; 674 675 if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo))) 676 return NULL; 677 p = kmem_cache_alloc(hugetlbfs_inode_cachep, GFP_KERNEL); 678 if (unlikely(!p)) { 679 hugetlbfs_inc_free_inodes(sbinfo); 680 return NULL; 681 } 682 return &p->vfs_inode; 683 } 684 685 static void hugetlbfs_destroy_inode(struct inode *inode) 686 { 687 hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb)); 688 mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy); 689 kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode)); 690 } 691 692 static const struct address_space_operations hugetlbfs_aops = { 693 .readpage = hugetlbfs_readpage, 694 .write_begin = hugetlbfs_write_begin, 695 .write_end = hugetlbfs_write_end, 696 .set_page_dirty = hugetlbfs_set_page_dirty, 697 }; 698 699 700 static void init_once(void *foo, struct kmem_cache *cachep, unsigned long flags) 701 { 702 struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo; 703 704 inode_init_once(&ei->vfs_inode); 705 } 706 707 const struct file_operations hugetlbfs_file_operations = { 708 .read = hugetlbfs_read, 709 .mmap = hugetlbfs_file_mmap, 710 .fsync = simple_sync_file, 711 .get_unmapped_area = hugetlb_get_unmapped_area, 712 }; 713 714 static const struct inode_operations hugetlbfs_dir_inode_operations = { 715 .create = hugetlbfs_create, 716 .lookup = simple_lookup, 717 .link = simple_link, 718 .unlink = simple_unlink, 719 .symlink = hugetlbfs_symlink, 720 .mkdir = hugetlbfs_mkdir, 721 .rmdir = simple_rmdir, 722 .mknod = hugetlbfs_mknod, 723 .rename = simple_rename, 724 .setattr = hugetlbfs_setattr, 725 }; 726 727 static const struct inode_operations hugetlbfs_inode_operations = { 728 .setattr = hugetlbfs_setattr, 729 }; 730 731 static const struct super_operations hugetlbfs_ops = { 732 .alloc_inode = hugetlbfs_alloc_inode, 733 .destroy_inode = hugetlbfs_destroy_inode, 734 .statfs = hugetlbfs_statfs, 735 .delete_inode = hugetlbfs_delete_inode, 736 .drop_inode = hugetlbfs_drop_inode, 737 .put_super = hugetlbfs_put_super, 738 }; 739 740 static int 741 hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) 742 { 743 char *p, *rest; 744 substring_t args[MAX_OPT_ARGS]; 745 int option; 746 747 if (!options) 748 return 0; 749 750 while ((p = strsep(&options, ",")) != NULL) { 751 int token; 752 if (!*p) 753 continue; 754 755 token = match_token(p, tokens, args); 756 switch (token) { 757 case Opt_uid: 758 if (match_int(&args[0], &option)) 759 goto bad_val; 760 pconfig->uid = option; 761 break; 762 763 case Opt_gid: 764 if (match_int(&args[0], &option)) 765 goto bad_val; 766 pconfig->gid = option; 767 break; 768 769 case Opt_mode: 770 if (match_octal(&args[0], &option)) 771 goto bad_val; 772 pconfig->mode = option & 0777U; 773 break; 774 775 case Opt_size: { 776 unsigned long long size; 777 /* memparse() will accept a K/M/G without a digit */ 778 if (!isdigit(*args[0].from)) 779 goto bad_val; 780 size = memparse(args[0].from, &rest); 781 if (*rest == '%') { 782 size <<= HPAGE_SHIFT; 783 size *= max_huge_pages; 784 do_div(size, 100); 785 } 786 pconfig->nr_blocks = (size >> HPAGE_SHIFT); 787 break; 788 } 789 790 case Opt_nr_inodes: 791 /* memparse() will accept a K/M/G without a digit */ 792 if (!isdigit(*args[0].from)) 793 goto bad_val; 794 pconfig->nr_inodes = memparse(args[0].from, &rest); 795 break; 796 797 default: 798 printk(KERN_ERR "hugetlbfs: Bad mount option: \"%s\"\n", 799 p); 800 return -EINVAL; 801 break; 802 } 803 } 804 return 0; 805 806 bad_val: 807 printk(KERN_ERR "hugetlbfs: Bad value '%s' for mount option '%s'\n", 808 args[0].from, p); 809 return 1; 810 } 811 812 static int 813 hugetlbfs_fill_super(struct super_block *sb, void *data, int silent) 814 { 815 struct inode * inode; 816 struct dentry * root; 817 int ret; 818 struct hugetlbfs_config config; 819 struct hugetlbfs_sb_info *sbinfo; 820 821 config.nr_blocks = -1; /* No limit on size by default */ 822 config.nr_inodes = -1; /* No limit on number of inodes by default */ 823 config.uid = current->fsuid; 824 config.gid = current->fsgid; 825 config.mode = 0755; 826 ret = hugetlbfs_parse_options(data, &config); 827 if (ret) 828 return ret; 829 830 sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), GFP_KERNEL); 831 if (!sbinfo) 832 return -ENOMEM; 833 sb->s_fs_info = sbinfo; 834 spin_lock_init(&sbinfo->stat_lock); 835 sbinfo->max_blocks = config.nr_blocks; 836 sbinfo->free_blocks = config.nr_blocks; 837 sbinfo->max_inodes = config.nr_inodes; 838 sbinfo->free_inodes = config.nr_inodes; 839 sb->s_maxbytes = MAX_LFS_FILESIZE; 840 sb->s_blocksize = HPAGE_SIZE; 841 sb->s_blocksize_bits = HPAGE_SHIFT; 842 sb->s_magic = HUGETLBFS_MAGIC; 843 sb->s_op = &hugetlbfs_ops; 844 sb->s_time_gran = 1; 845 inode = hugetlbfs_get_inode(sb, config.uid, config.gid, 846 S_IFDIR | config.mode, 0); 847 if (!inode) 848 goto out_free; 849 850 root = d_alloc_root(inode); 851 if (!root) { 852 iput(inode); 853 goto out_free; 854 } 855 sb->s_root = root; 856 return 0; 857 out_free: 858 kfree(sbinfo); 859 return -ENOMEM; 860 } 861 862 int hugetlb_get_quota(struct address_space *mapping) 863 { 864 int ret = 0; 865 struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb); 866 867 if (sbinfo->free_blocks > -1) { 868 spin_lock(&sbinfo->stat_lock); 869 if (sbinfo->free_blocks > 0) 870 sbinfo->free_blocks--; 871 else 872 ret = -ENOMEM; 873 spin_unlock(&sbinfo->stat_lock); 874 } 875 876 return ret; 877 } 878 879 void hugetlb_put_quota(struct address_space *mapping) 880 { 881 struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb); 882 883 if (sbinfo->free_blocks > -1) { 884 spin_lock(&sbinfo->stat_lock); 885 sbinfo->free_blocks++; 886 spin_unlock(&sbinfo->stat_lock); 887 } 888 } 889 890 static int hugetlbfs_get_sb(struct file_system_type *fs_type, 891 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 892 { 893 return get_sb_nodev(fs_type, flags, data, hugetlbfs_fill_super, mnt); 894 } 895 896 static struct file_system_type hugetlbfs_fs_type = { 897 .name = "hugetlbfs", 898 .get_sb = hugetlbfs_get_sb, 899 .kill_sb = kill_litter_super, 900 }; 901 902 static struct vfsmount *hugetlbfs_vfsmount; 903 904 static int can_do_hugetlb_shm(void) 905 { 906 return likely(capable(CAP_IPC_LOCK) || 907 in_group_p(sysctl_hugetlb_shm_group) || 908 can_do_mlock()); 909 } 910 911 struct file *hugetlb_file_setup(const char *name, size_t size) 912 { 913 int error = -ENOMEM; 914 struct file *file; 915 struct inode *inode; 916 struct dentry *dentry, *root; 917 struct qstr quick_string; 918 919 if (!hugetlbfs_vfsmount) 920 return ERR_PTR(-ENOENT); 921 922 if (!can_do_hugetlb_shm()) 923 return ERR_PTR(-EPERM); 924 925 if (!user_shm_lock(size, current->user)) 926 return ERR_PTR(-ENOMEM); 927 928 root = hugetlbfs_vfsmount->mnt_root; 929 quick_string.name = name; 930 quick_string.len = strlen(quick_string.name); 931 quick_string.hash = 0; 932 dentry = d_alloc(root, &quick_string); 933 if (!dentry) 934 goto out_shm_unlock; 935 936 error = -ENFILE; 937 file = get_empty_filp(); 938 if (!file) 939 goto out_dentry; 940 941 error = -ENOSPC; 942 inode = hugetlbfs_get_inode(root->d_sb, current->fsuid, 943 current->fsgid, S_IFREG | S_IRWXUGO, 0); 944 if (!inode) 945 goto out_file; 946 947 error = -ENOMEM; 948 if (hugetlb_reserve_pages(inode, 0, size >> HPAGE_SHIFT)) 949 goto out_inode; 950 951 d_instantiate(dentry, inode); 952 inode->i_size = size; 953 inode->i_nlink = 0; 954 file->f_path.mnt = mntget(hugetlbfs_vfsmount); 955 file->f_path.dentry = dentry; 956 file->f_mapping = inode->i_mapping; 957 file->f_op = &hugetlbfs_file_operations; 958 file->f_mode = FMODE_WRITE | FMODE_READ; 959 return file; 960 961 out_inode: 962 iput(inode); 963 out_file: 964 put_filp(file); 965 out_dentry: 966 dput(dentry); 967 out_shm_unlock: 968 user_shm_unlock(size, current->user); 969 return ERR_PTR(error); 970 } 971 972 static int __init init_hugetlbfs_fs(void) 973 { 974 int error; 975 struct vfsmount *vfsmount; 976 977 hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache", 978 sizeof(struct hugetlbfs_inode_info), 979 0, 0, init_once); 980 if (hugetlbfs_inode_cachep == NULL) 981 return -ENOMEM; 982 983 error = register_filesystem(&hugetlbfs_fs_type); 984 if (error) 985 goto out; 986 987 vfsmount = kern_mount(&hugetlbfs_fs_type); 988 989 if (!IS_ERR(vfsmount)) { 990 hugetlbfs_vfsmount = vfsmount; 991 return 0; 992 } 993 994 error = PTR_ERR(vfsmount); 995 996 out: 997 if (error) 998 kmem_cache_destroy(hugetlbfs_inode_cachep); 999 return error; 1000 } 1001 1002 static void __exit exit_hugetlbfs_fs(void) 1003 { 1004 kmem_cache_destroy(hugetlbfs_inode_cachep); 1005 unregister_filesystem(&hugetlbfs_fs_type); 1006 } 1007 1008 module_init(init_hugetlbfs_fs) 1009 module_exit(exit_hugetlbfs_fs) 1010 1011 MODULE_LICENSE("GPL"); 1012