1 /* 2 * linux/fs/file_table.c 3 * 4 * Copyright (C) 1991, 1992 Linus Torvalds 5 * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) 6 */ 7 8 #include <linux/string.h> 9 #include <linux/slab.h> 10 #include <linux/file.h> 11 #include <linux/init.h> 12 #include <linux/module.h> 13 #include <linux/fs.h> 14 #include <linux/security.h> 15 #include <linux/eventpoll.h> 16 #include <linux/rcupdate.h> 17 #include <linux/mount.h> 18 #include <linux/capability.h> 19 #include <linux/cdev.h> 20 #include <linux/fsnotify.h> 21 #include <linux/sysctl.h> 22 #include <linux/percpu_counter.h> 23 24 #include <asm/atomic.h> 25 26 /* sysctl tunables... */ 27 struct files_stat_struct files_stat = { 28 .max_files = NR_FILE 29 }; 30 31 /* public. Not pretty! */ 32 __cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock); 33 34 static struct percpu_counter nr_files __cacheline_aligned_in_smp; 35 36 static inline void file_free_rcu(struct rcu_head *head) 37 { 38 struct file *f = container_of(head, struct file, f_u.fu_rcuhead); 39 kmem_cache_free(filp_cachep, f); 40 } 41 42 static inline void file_free(struct file *f) 43 { 44 percpu_counter_dec(&nr_files); 45 file_check_state(f); 46 call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); 47 } 48 49 /* 50 * Return the total number of open files in the system 51 */ 52 static int get_nr_files(void) 53 { 54 return percpu_counter_read_positive(&nr_files); 55 } 56 57 /* 58 * Return the maximum number of open files in the system 59 */ 60 int get_max_files(void) 61 { 62 return files_stat.max_files; 63 } 64 EXPORT_SYMBOL_GPL(get_max_files); 65 66 /* 67 * Handle nr_files sysctl 68 */ 69 #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) 70 int proc_nr_files(ctl_table *table, int write, struct file *filp, 71 void __user *buffer, size_t *lenp, loff_t *ppos) 72 { 73 files_stat.nr_files = get_nr_files(); 74 return proc_dointvec(table, write, filp, buffer, lenp, ppos); 75 } 76 #else 77 int proc_nr_files(ctl_table *table, int write, struct file *filp, 78 void __user *buffer, size_t *lenp, loff_t *ppos) 79 { 80 return -ENOSYS; 81 } 82 #endif 83 84 /* Find an unused file structure and return a pointer to it. 85 * Returns NULL, if there are no more free file structures or 86 * we run out of memory. 87 * 88 * Be very careful using this. You are responsible for 89 * getting write access to any mount that you might assign 90 * to this filp, if it is opened for write. If this is not 91 * done, you will imbalance int the mount's writer count 92 * and a warning at __fput() time. 93 */ 94 struct file *get_empty_filp(void) 95 { 96 struct task_struct *tsk; 97 static int old_max; 98 struct file * f; 99 100 /* 101 * Privileged users can go above max_files 102 */ 103 if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) { 104 /* 105 * percpu_counters are inaccurate. Do an expensive check before 106 * we go and fail. 107 */ 108 if (percpu_counter_sum_positive(&nr_files) >= files_stat.max_files) 109 goto over; 110 } 111 112 f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL); 113 if (f == NULL) 114 goto fail; 115 116 percpu_counter_inc(&nr_files); 117 if (security_file_alloc(f)) 118 goto fail_sec; 119 120 tsk = current; 121 INIT_LIST_HEAD(&f->f_u.fu_list); 122 atomic_set(&f->f_count, 1); 123 rwlock_init(&f->f_owner.lock); 124 f->f_uid = tsk->fsuid; 125 f->f_gid = tsk->fsgid; 126 eventpoll_init_file(f); 127 /* f->f_version: 0 */ 128 return f; 129 130 over: 131 /* Ran out of filps - report that */ 132 if (get_nr_files() > old_max) { 133 printk(KERN_INFO "VFS: file-max limit %d reached\n", 134 get_max_files()); 135 old_max = get_nr_files(); 136 } 137 goto fail; 138 139 fail_sec: 140 file_free(f); 141 fail: 142 return NULL; 143 } 144 145 EXPORT_SYMBOL(get_empty_filp); 146 147 /** 148 * alloc_file - allocate and initialize a 'struct file' 149 * @mnt: the vfsmount on which the file will reside 150 * @dentry: the dentry representing the new file 151 * @mode: the mode with which the new file will be opened 152 * @fop: the 'struct file_operations' for the new file 153 * 154 * Use this instead of get_empty_filp() to get a new 155 * 'struct file'. Do so because of the same initialization 156 * pitfalls reasons listed for init_file(). This is a 157 * preferred interface to using init_file(). 158 * 159 * If all the callers of init_file() are eliminated, its 160 * code should be moved into this function. 161 */ 162 struct file *alloc_file(struct vfsmount *mnt, struct dentry *dentry, 163 mode_t mode, const struct file_operations *fop) 164 { 165 struct file *file; 166 struct path; 167 168 file = get_empty_filp(); 169 if (!file) 170 return NULL; 171 172 init_file(file, mnt, dentry, mode, fop); 173 return file; 174 } 175 EXPORT_SYMBOL(alloc_file); 176 177 /** 178 * init_file - initialize a 'struct file' 179 * @file: the already allocated 'struct file' to initialized 180 * @mnt: the vfsmount on which the file resides 181 * @dentry: the dentry representing this file 182 * @mode: the mode the file is opened with 183 * @fop: the 'struct file_operations' for this file 184 * 185 * Use this instead of setting the members directly. Doing so 186 * avoids making mistakes like forgetting the mntget() or 187 * forgetting to take a write on the mnt. 188 * 189 * Note: This is a crappy interface. It is here to make 190 * merging with the existing users of get_empty_filp() 191 * who have complex failure logic easier. All users 192 * of this should be moving to alloc_file(). 193 */ 194 int init_file(struct file *file, struct vfsmount *mnt, struct dentry *dentry, 195 mode_t mode, const struct file_operations *fop) 196 { 197 int error = 0; 198 file->f_path.dentry = dentry; 199 file->f_path.mnt = mntget(mnt); 200 file->f_mapping = dentry->d_inode->i_mapping; 201 file->f_mode = mode; 202 file->f_op = fop; 203 204 /* 205 * These mounts don't really matter in practice 206 * for r/o bind mounts. They aren't userspace- 207 * visible. We do this for consistency, and so 208 * that we can do debugging checks at __fput() 209 */ 210 if ((mode & FMODE_WRITE) && !special_file(dentry->d_inode->i_mode)) { 211 file_take_write(file); 212 error = mnt_want_write(mnt); 213 WARN_ON(error); 214 } 215 return error; 216 } 217 EXPORT_SYMBOL(init_file); 218 219 void fput(struct file *file) 220 { 221 if (atomic_dec_and_test(&file->f_count)) 222 __fput(file); 223 } 224 225 EXPORT_SYMBOL(fput); 226 227 /** 228 * drop_file_write_access - give up ability to write to a file 229 * @file: the file to which we will stop writing 230 * 231 * This is a central place which will give up the ability 232 * to write to @file, along with access to write through 233 * its vfsmount. 234 */ 235 void drop_file_write_access(struct file *file) 236 { 237 struct vfsmount *mnt = file->f_path.mnt; 238 struct dentry *dentry = file->f_path.dentry; 239 struct inode *inode = dentry->d_inode; 240 241 put_write_access(inode); 242 243 if (special_file(inode->i_mode)) 244 return; 245 if (file_check_writeable(file) != 0) 246 return; 247 mnt_drop_write(mnt); 248 file_release_write(file); 249 } 250 EXPORT_SYMBOL_GPL(drop_file_write_access); 251 252 /* __fput is called from task context when aio completion releases the last 253 * last use of a struct file *. Do not use otherwise. 254 */ 255 void __fput(struct file *file) 256 { 257 struct dentry *dentry = file->f_path.dentry; 258 struct vfsmount *mnt = file->f_path.mnt; 259 struct inode *inode = dentry->d_inode; 260 261 might_sleep(); 262 263 fsnotify_close(file); 264 /* 265 * The function eventpoll_release() should be the first called 266 * in the file cleanup chain. 267 */ 268 eventpoll_release(file); 269 locks_remove_flock(file); 270 271 if (file->f_op && file->f_op->release) 272 file->f_op->release(inode, file); 273 security_file_free(file); 274 if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL)) 275 cdev_put(inode->i_cdev); 276 fops_put(file->f_op); 277 put_pid(file->f_owner.pid); 278 file_kill(file); 279 if (file->f_mode & FMODE_WRITE) 280 drop_file_write_access(file); 281 file->f_path.dentry = NULL; 282 file->f_path.mnt = NULL; 283 file_free(file); 284 dput(dentry); 285 mntput(mnt); 286 } 287 288 struct file *fget(unsigned int fd) 289 { 290 struct file *file; 291 struct files_struct *files = current->files; 292 293 rcu_read_lock(); 294 file = fcheck_files(files, fd); 295 if (file) { 296 if (!atomic_inc_not_zero(&file->f_count)) { 297 /* File object ref couldn't be taken */ 298 rcu_read_unlock(); 299 return NULL; 300 } 301 } 302 rcu_read_unlock(); 303 304 return file; 305 } 306 307 EXPORT_SYMBOL(fget); 308 309 /* 310 * Lightweight file lookup - no refcnt increment if fd table isn't shared. 311 * You can use this only if it is guranteed that the current task already 312 * holds a refcnt to that file. That check has to be done at fget() only 313 * and a flag is returned to be passed to the corresponding fput_light(). 314 * There must not be a cloning between an fget_light/fput_light pair. 315 */ 316 struct file *fget_light(unsigned int fd, int *fput_needed) 317 { 318 struct file *file; 319 struct files_struct *files = current->files; 320 321 *fput_needed = 0; 322 if (likely((atomic_read(&files->count) == 1))) { 323 file = fcheck_files(files, fd); 324 } else { 325 rcu_read_lock(); 326 file = fcheck_files(files, fd); 327 if (file) { 328 if (atomic_inc_not_zero(&file->f_count)) 329 *fput_needed = 1; 330 else 331 /* Didn't get the reference, someone's freed */ 332 file = NULL; 333 } 334 rcu_read_unlock(); 335 } 336 337 return file; 338 } 339 340 341 void put_filp(struct file *file) 342 { 343 if (atomic_dec_and_test(&file->f_count)) { 344 security_file_free(file); 345 file_kill(file); 346 file_free(file); 347 } 348 } 349 350 void file_move(struct file *file, struct list_head *list) 351 { 352 if (!list) 353 return; 354 file_list_lock(); 355 list_move(&file->f_u.fu_list, list); 356 file_list_unlock(); 357 } 358 359 void file_kill(struct file *file) 360 { 361 if (!list_empty(&file->f_u.fu_list)) { 362 file_list_lock(); 363 list_del_init(&file->f_u.fu_list); 364 file_list_unlock(); 365 } 366 } 367 368 int fs_may_remount_ro(struct super_block *sb) 369 { 370 struct file *file; 371 372 /* Check that no files are currently opened for writing. */ 373 file_list_lock(); 374 list_for_each_entry(file, &sb->s_files, f_u.fu_list) { 375 struct inode *inode = file->f_path.dentry->d_inode; 376 377 /* File with pending delete? */ 378 if (inode->i_nlink == 0) 379 goto too_bad; 380 381 /* Writeable file? */ 382 if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE)) 383 goto too_bad; 384 } 385 file_list_unlock(); 386 return 1; /* Tis' cool bro. */ 387 too_bad: 388 file_list_unlock(); 389 return 0; 390 } 391 392 void __init files_init(unsigned long mempages) 393 { 394 int n; 395 /* One file with associated inode and dcache is very roughly 1K. 396 * Per default don't use more than 10% of our memory for files. 397 */ 398 399 n = (mempages * (PAGE_SIZE / 1024)) / 10; 400 files_stat.max_files = n; 401 if (files_stat.max_files < NR_FILE) 402 files_stat.max_files = NR_FILE; 403 files_defer_init(); 404 percpu_counter_init(&nr_files, 0); 405 } 406