xref: /linux/fs/btrfs/ioctl.c (revision a58130ddc896e5a15e4de2bf50a1d89247118c23)
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18 
19 #include <linux/kernel.h>
20 #include <linux/bio.h>
21 #include <linux/buffer_head.h>
22 #include <linux/file.h>
23 #include <linux/fs.h>
24 #include <linux/fsnotify.h>
25 #include <linux/pagemap.h>
26 #include <linux/highmem.h>
27 #include <linux/time.h>
28 #include <linux/init.h>
29 #include <linux/string.h>
30 #include <linux/backing-dev.h>
31 #include <linux/mount.h>
32 #include <linux/mpage.h>
33 #include <linux/namei.h>
34 #include <linux/swap.h>
35 #include <linux/writeback.h>
36 #include <linux/statfs.h>
37 #include <linux/compat.h>
38 #include <linux/bit_spinlock.h>
39 #include <linux/security.h>
40 #include <linux/xattr.h>
41 #include <linux/vmalloc.h>
42 #include <linux/slab.h>
43 #include <linux/blkdev.h>
44 #include <linux/uuid.h>
45 #include "compat.h"
46 #include "ctree.h"
47 #include "disk-io.h"
48 #include "transaction.h"
49 #include "btrfs_inode.h"
50 #include "ioctl.h"
51 #include "print-tree.h"
52 #include "volumes.h"
53 #include "locking.h"
54 #include "inode-map.h"
55 #include "backref.h"
56 #include "rcu-string.h"
57 #include "send.h"
58 
59 /* Mask out flags that are inappropriate for the given type of inode. */
60 static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
61 {
62 	if (S_ISDIR(mode))
63 		return flags;
64 	else if (S_ISREG(mode))
65 		return flags & ~FS_DIRSYNC_FL;
66 	else
67 		return flags & (FS_NODUMP_FL | FS_NOATIME_FL);
68 }
69 
70 /*
71  * Export inode flags to the format expected by the FS_IOC_GETFLAGS ioctl.
72  */
73 static unsigned int btrfs_flags_to_ioctl(unsigned int flags)
74 {
75 	unsigned int iflags = 0;
76 
77 	if (flags & BTRFS_INODE_SYNC)
78 		iflags |= FS_SYNC_FL;
79 	if (flags & BTRFS_INODE_IMMUTABLE)
80 		iflags |= FS_IMMUTABLE_FL;
81 	if (flags & BTRFS_INODE_APPEND)
82 		iflags |= FS_APPEND_FL;
83 	if (flags & BTRFS_INODE_NODUMP)
84 		iflags |= FS_NODUMP_FL;
85 	if (flags & BTRFS_INODE_NOATIME)
86 		iflags |= FS_NOATIME_FL;
87 	if (flags & BTRFS_INODE_DIRSYNC)
88 		iflags |= FS_DIRSYNC_FL;
89 	if (flags & BTRFS_INODE_NODATACOW)
90 		iflags |= FS_NOCOW_FL;
91 
92 	if ((flags & BTRFS_INODE_COMPRESS) && !(flags & BTRFS_INODE_NOCOMPRESS))
93 		iflags |= FS_COMPR_FL;
94 	else if (flags & BTRFS_INODE_NOCOMPRESS)
95 		iflags |= FS_NOCOMP_FL;
96 
97 	return iflags;
98 }
99 
100 /*
101  * Update inode->i_flags based on the btrfs internal flags.
102  */
103 void btrfs_update_iflags(struct inode *inode)
104 {
105 	struct btrfs_inode *ip = BTRFS_I(inode);
106 
107 	inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
108 
109 	if (ip->flags & BTRFS_INODE_SYNC)
110 		inode->i_flags |= S_SYNC;
111 	if (ip->flags & BTRFS_INODE_IMMUTABLE)
112 		inode->i_flags |= S_IMMUTABLE;
113 	if (ip->flags & BTRFS_INODE_APPEND)
114 		inode->i_flags |= S_APPEND;
115 	if (ip->flags & BTRFS_INODE_NOATIME)
116 		inode->i_flags |= S_NOATIME;
117 	if (ip->flags & BTRFS_INODE_DIRSYNC)
118 		inode->i_flags |= S_DIRSYNC;
119 }
120 
121 /*
122  * Inherit flags from the parent inode.
123  *
124  * Currently only the compression flags and the cow flags are inherited.
125  */
126 void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
127 {
128 	unsigned int flags;
129 
130 	if (!dir)
131 		return;
132 
133 	flags = BTRFS_I(dir)->flags;
134 
135 	if (flags & BTRFS_INODE_NOCOMPRESS) {
136 		BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
137 		BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
138 	} else if (flags & BTRFS_INODE_COMPRESS) {
139 		BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
140 		BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
141 	}
142 
143 	if (flags & BTRFS_INODE_NODATACOW)
144 		BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
145 
146 	btrfs_update_iflags(inode);
147 }
148 
149 static int btrfs_ioctl_getflags(struct file *file, void __user *arg)
150 {
151 	struct btrfs_inode *ip = BTRFS_I(file->f_path.dentry->d_inode);
152 	unsigned int flags = btrfs_flags_to_ioctl(ip->flags);
153 
154 	if (copy_to_user(arg, &flags, sizeof(flags)))
155 		return -EFAULT;
156 	return 0;
157 }
158 
/*
 * Validate an FS_*_FL word supplied by user space.
 *
 * Returns -EOPNOTSUPP if any bit outside the supported set is present,
 * -EINVAL if FS_NOCOMP_FL and FS_COMPR_FL are both requested, 0 otherwise.
 */
static int check_flags(unsigned int flags)
{
	const unsigned int supported =
		FS_IMMUTABLE_FL | FS_APPEND_FL |
		FS_NOATIME_FL | FS_NODUMP_FL |
		FS_SYNC_FL | FS_DIRSYNC_FL |
		FS_NOCOMP_FL | FS_COMPR_FL |
		FS_NOCOW_FL;
	const unsigned int compr_pair = FS_NOCOMP_FL | FS_COMPR_FL;

	if (flags & ~supported)
		return -EOPNOTSUPP;

	/* "compress" and "never compress" are mutually exclusive. */
	if ((flags & compr_pair) == compr_pair)
		return -EINVAL;

	return 0;
}
173 
174 static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
175 {
176 	struct inode *inode = file->f_path.dentry->d_inode;
177 	struct btrfs_inode *ip = BTRFS_I(inode);
178 	struct btrfs_root *root = ip->root;
179 	struct btrfs_trans_handle *trans;
180 	unsigned int flags, oldflags;
181 	int ret;
182 	u64 ip_oldflags;
183 	unsigned int i_oldflags;
184 	umode_t mode;
185 
186 	if (btrfs_root_readonly(root))
187 		return -EROFS;
188 
189 	if (copy_from_user(&flags, arg, sizeof(flags)))
190 		return -EFAULT;
191 
192 	ret = check_flags(flags);
193 	if (ret)
194 		return ret;
195 
196 	if (!inode_owner_or_capable(inode))
197 		return -EACCES;
198 
199 	ret = mnt_want_write_file(file);
200 	if (ret)
201 		return ret;
202 
203 	mutex_lock(&inode->i_mutex);
204 
205 	ip_oldflags = ip->flags;
206 	i_oldflags = inode->i_flags;
207 	mode = inode->i_mode;
208 
209 	flags = btrfs_mask_flags(inode->i_mode, flags);
210 	oldflags = btrfs_flags_to_ioctl(ip->flags);
211 	if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
212 		if (!capable(CAP_LINUX_IMMUTABLE)) {
213 			ret = -EPERM;
214 			goto out_unlock;
215 		}
216 	}
217 
218 	if (flags & FS_SYNC_FL)
219 		ip->flags |= BTRFS_INODE_SYNC;
220 	else
221 		ip->flags &= ~BTRFS_INODE_SYNC;
222 	if (flags & FS_IMMUTABLE_FL)
223 		ip->flags |= BTRFS_INODE_IMMUTABLE;
224 	else
225 		ip->flags &= ~BTRFS_INODE_IMMUTABLE;
226 	if (flags & FS_APPEND_FL)
227 		ip->flags |= BTRFS_INODE_APPEND;
228 	else
229 		ip->flags &= ~BTRFS_INODE_APPEND;
230 	if (flags & FS_NODUMP_FL)
231 		ip->flags |= BTRFS_INODE_NODUMP;
232 	else
233 		ip->flags &= ~BTRFS_INODE_NODUMP;
234 	if (flags & FS_NOATIME_FL)
235 		ip->flags |= BTRFS_INODE_NOATIME;
236 	else
237 		ip->flags &= ~BTRFS_INODE_NOATIME;
238 	if (flags & FS_DIRSYNC_FL)
239 		ip->flags |= BTRFS_INODE_DIRSYNC;
240 	else
241 		ip->flags &= ~BTRFS_INODE_DIRSYNC;
242 	if (flags & FS_NOCOW_FL) {
243 		if (S_ISREG(mode)) {
244 			/*
245 			 * It's safe to turn csums off here, no extents exist.
246 			 * Otherwise we want the flag to reflect the real COW
247 			 * status of the file and will not set it.
248 			 */
249 			if (inode->i_size == 0)
250 				ip->flags |= BTRFS_INODE_NODATACOW
251 					   | BTRFS_INODE_NODATASUM;
252 		} else {
253 			ip->flags |= BTRFS_INODE_NODATACOW;
254 		}
255 	} else {
256 		/*
257 		 * Revert back under same assuptions as above
258 		 */
259 		if (S_ISREG(mode)) {
260 			if (inode->i_size == 0)
261 				ip->flags &= ~(BTRFS_INODE_NODATACOW
262 				             | BTRFS_INODE_NODATASUM);
263 		} else {
264 			ip->flags &= ~BTRFS_INODE_NODATACOW;
265 		}
266 	}
267 
268 	/*
269 	 * The COMPRESS flag can only be changed by users, while the NOCOMPRESS
270 	 * flag may be changed automatically if compression code won't make
271 	 * things smaller.
272 	 */
273 	if (flags & FS_NOCOMP_FL) {
274 		ip->flags &= ~BTRFS_INODE_COMPRESS;
275 		ip->flags |= BTRFS_INODE_NOCOMPRESS;
276 	} else if (flags & FS_COMPR_FL) {
277 		ip->flags |= BTRFS_INODE_COMPRESS;
278 		ip->flags &= ~BTRFS_INODE_NOCOMPRESS;
279 	} else {
280 		ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
281 	}
282 
283 	trans = btrfs_start_transaction(root, 1);
284 	if (IS_ERR(trans)) {
285 		ret = PTR_ERR(trans);
286 		goto out_drop;
287 	}
288 
289 	btrfs_update_iflags(inode);
290 	inode_inc_iversion(inode);
291 	inode->i_ctime = CURRENT_TIME;
292 	ret = btrfs_update_inode(trans, root, inode);
293 
294 	btrfs_end_transaction(trans, root);
295  out_drop:
296 	if (ret) {
297 		ip->flags = ip_oldflags;
298 		inode->i_flags = i_oldflags;
299 	}
300 
301  out_unlock:
302 	mutex_unlock(&inode->i_mutex);
303 	mnt_drop_write_file(file);
304 	return ret;
305 }
306 
307 static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
308 {
309 	struct inode *inode = file->f_path.dentry->d_inode;
310 
311 	return put_user(inode->i_generation, arg);
312 }
313 
314 static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
315 {
316 	struct btrfs_fs_info *fs_info = btrfs_sb(fdentry(file)->d_sb);
317 	struct btrfs_device *device;
318 	struct request_queue *q;
319 	struct fstrim_range range;
320 	u64 minlen = ULLONG_MAX;
321 	u64 num_devices = 0;
322 	u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
323 	int ret;
324 
325 	if (!capable(CAP_SYS_ADMIN))
326 		return -EPERM;
327 
328 	rcu_read_lock();
329 	list_for_each_entry_rcu(device, &fs_info->fs_devices->devices,
330 				dev_list) {
331 		if (!device->bdev)
332 			continue;
333 		q = bdev_get_queue(device->bdev);
334 		if (blk_queue_discard(q)) {
335 			num_devices++;
336 			minlen = min((u64)q->limits.discard_granularity,
337 				     minlen);
338 		}
339 	}
340 	rcu_read_unlock();
341 
342 	if (!num_devices)
343 		return -EOPNOTSUPP;
344 	if (copy_from_user(&range, arg, sizeof(range)))
345 		return -EFAULT;
346 	if (range.start > total_bytes ||
347 	    range.len < fs_info->sb->s_blocksize)
348 		return -EINVAL;
349 
350 	range.len = min(range.len, total_bytes - range.start);
351 	range.minlen = max(range.minlen, minlen);
352 	ret = btrfs_trim_fs(fs_info->tree_root, &range);
353 	if (ret < 0)
354 		return ret;
355 
356 	if (copy_to_user(arg, &range, sizeof(range)))
357 		return -EFAULT;
358 
359 	return 0;
360 }
361 
362 static noinline int create_subvol(struct btrfs_root *root,
363 				  struct dentry *dentry,
364 				  char *name, int namelen,
365 				  u64 *async_transid,
366 				  struct btrfs_qgroup_inherit **inherit)
367 {
368 	struct btrfs_trans_handle *trans;
369 	struct btrfs_key key;
370 	struct btrfs_root_item root_item;
371 	struct btrfs_inode_item *inode_item;
372 	struct extent_buffer *leaf;
373 	struct btrfs_root *new_root;
374 	struct dentry *parent = dentry->d_parent;
375 	struct inode *dir;
376 	struct timespec cur_time = CURRENT_TIME;
377 	int ret;
378 	int err;
379 	u64 objectid;
380 	u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
381 	u64 index = 0;
382 	uuid_le new_uuid;
383 
384 	ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid);
385 	if (ret)
386 		return ret;
387 
388 	dir = parent->d_inode;
389 
390 	/*
391 	 * 1 - inode item
392 	 * 2 - refs
393 	 * 1 - root item
394 	 * 2 - dir items
395 	 */
396 	trans = btrfs_start_transaction(root, 6);
397 	if (IS_ERR(trans))
398 		return PTR_ERR(trans);
399 
400 	ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid,
401 				   inherit ? *inherit : NULL);
402 	if (ret)
403 		goto fail;
404 
405 	leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
406 				      0, objectid, NULL, 0, 0, 0);
407 	if (IS_ERR(leaf)) {
408 		ret = PTR_ERR(leaf);
409 		goto fail;
410 	}
411 
412 	memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
413 	btrfs_set_header_bytenr(leaf, leaf->start);
414 	btrfs_set_header_generation(leaf, trans->transid);
415 	btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
416 	btrfs_set_header_owner(leaf, objectid);
417 
418 	write_extent_buffer(leaf, root->fs_info->fsid,
419 			    (unsigned long)btrfs_header_fsid(leaf),
420 			    BTRFS_FSID_SIZE);
421 	write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
422 			    (unsigned long)btrfs_header_chunk_tree_uuid(leaf),
423 			    BTRFS_UUID_SIZE);
424 	btrfs_mark_buffer_dirty(leaf);
425 
426 	memset(&root_item, 0, sizeof(root_item));
427 
428 	inode_item = &root_item.inode;
429 	inode_item->generation = cpu_to_le64(1);
430 	inode_item->size = cpu_to_le64(3);
431 	inode_item->nlink = cpu_to_le32(1);
432 	inode_item->nbytes = cpu_to_le64(root->leafsize);
433 	inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
434 
435 	root_item.flags = 0;
436 	root_item.byte_limit = 0;
437 	inode_item->flags = cpu_to_le64(BTRFS_INODE_ROOT_ITEM_INIT);
438 
439 	btrfs_set_root_bytenr(&root_item, leaf->start);
440 	btrfs_set_root_generation(&root_item, trans->transid);
441 	btrfs_set_root_level(&root_item, 0);
442 	btrfs_set_root_refs(&root_item, 1);
443 	btrfs_set_root_used(&root_item, leaf->len);
444 	btrfs_set_root_last_snapshot(&root_item, 0);
445 
446 	btrfs_set_root_generation_v2(&root_item,
447 			btrfs_root_generation(&root_item));
448 	uuid_le_gen(&new_uuid);
449 	memcpy(root_item.uuid, new_uuid.b, BTRFS_UUID_SIZE);
450 	root_item.otime.sec = cpu_to_le64(cur_time.tv_sec);
451 	root_item.otime.nsec = cpu_to_le32(cur_time.tv_nsec);
452 	root_item.ctime = root_item.otime;
453 	btrfs_set_root_ctransid(&root_item, trans->transid);
454 	btrfs_set_root_otransid(&root_item, trans->transid);
455 
456 	btrfs_tree_unlock(leaf);
457 	free_extent_buffer(leaf);
458 	leaf = NULL;
459 
460 	btrfs_set_root_dirid(&root_item, new_dirid);
461 
462 	key.objectid = objectid;
463 	key.offset = 0;
464 	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
465 	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
466 				&root_item);
467 	if (ret)
468 		goto fail;
469 
470 	key.offset = (u64)-1;
471 	new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
472 	if (IS_ERR(new_root)) {
473 		btrfs_abort_transaction(trans, root, PTR_ERR(new_root));
474 		ret = PTR_ERR(new_root);
475 		goto fail;
476 	}
477 
478 	btrfs_record_root_in_trans(trans, new_root);
479 
480 	ret = btrfs_create_subvol_root(trans, new_root, new_dirid);
481 	if (ret) {
482 		/* We potentially lose an unused inode item here */
483 		btrfs_abort_transaction(trans, root, ret);
484 		goto fail;
485 	}
486 
487 	/*
488 	 * insert the directory item
489 	 */
490 	ret = btrfs_set_inode_index(dir, &index);
491 	if (ret) {
492 		btrfs_abort_transaction(trans, root, ret);
493 		goto fail;
494 	}
495 
496 	ret = btrfs_insert_dir_item(trans, root,
497 				    name, namelen, dir, &key,
498 				    BTRFS_FT_DIR, index);
499 	if (ret) {
500 		btrfs_abort_transaction(trans, root, ret);
501 		goto fail;
502 	}
503 
504 	btrfs_i_size_write(dir, dir->i_size + namelen * 2);
505 	ret = btrfs_update_inode(trans, root, dir);
506 	BUG_ON(ret);
507 
508 	ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
509 				 objectid, root->root_key.objectid,
510 				 btrfs_ino(dir), index, name, namelen);
511 
512 	BUG_ON(ret);
513 
514 	d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
515 fail:
516 	if (async_transid) {
517 		*async_transid = trans->transid;
518 		err = btrfs_commit_transaction_async(trans, root, 1);
519 	} else {
520 		err = btrfs_commit_transaction(trans, root);
521 	}
522 	if (err && !ret)
523 		ret = err;
524 	return ret;
525 }
526 
527 static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
528 			   char *name, int namelen, u64 *async_transid,
529 			   bool readonly, struct btrfs_qgroup_inherit **inherit)
530 {
531 	struct inode *inode;
532 	struct btrfs_pending_snapshot *pending_snapshot;
533 	struct btrfs_trans_handle *trans;
534 	int ret;
535 
536 	if (!root->ref_cows)
537 		return -EINVAL;
538 
539 	pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
540 	if (!pending_snapshot)
541 		return -ENOMEM;
542 
543 	btrfs_init_block_rsv(&pending_snapshot->block_rsv,
544 			     BTRFS_BLOCK_RSV_TEMP);
545 	pending_snapshot->dentry = dentry;
546 	pending_snapshot->root = root;
547 	pending_snapshot->readonly = readonly;
548 	if (inherit) {
549 		pending_snapshot->inherit = *inherit;
550 		*inherit = NULL;	/* take responsibility to free it */
551 	}
552 
553 	trans = btrfs_start_transaction(root->fs_info->extent_root, 6);
554 	if (IS_ERR(trans)) {
555 		ret = PTR_ERR(trans);
556 		goto fail;
557 	}
558 
559 	ret = btrfs_snap_reserve_metadata(trans, pending_snapshot);
560 	BUG_ON(ret);
561 
562 	spin_lock(&root->fs_info->trans_lock);
563 	list_add(&pending_snapshot->list,
564 		 &trans->transaction->pending_snapshots);
565 	spin_unlock(&root->fs_info->trans_lock);
566 	if (async_transid) {
567 		*async_transid = trans->transid;
568 		ret = btrfs_commit_transaction_async(trans,
569 				     root->fs_info->extent_root, 1);
570 	} else {
571 		ret = btrfs_commit_transaction(trans,
572 					       root->fs_info->extent_root);
573 	}
574 	if (ret)
575 		goto fail;
576 
577 	ret = pending_snapshot->error;
578 	if (ret)
579 		goto fail;
580 
581 	ret = btrfs_orphan_cleanup(pending_snapshot->snap);
582 	if (ret)
583 		goto fail;
584 
585 	inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
586 	if (IS_ERR(inode)) {
587 		ret = PTR_ERR(inode);
588 		goto fail;
589 	}
590 	BUG_ON(!inode);
591 	d_instantiate(dentry, inode);
592 	ret = 0;
593 fail:
594 	kfree(pending_snapshot);
595 	return ret;
596 }
597 
598 /*  copy of check_sticky in fs/namei.c()
599 * It's inline, so penalty for filesystems that don't use sticky bit is
600 * minimal.
601 */
602 static inline int btrfs_check_sticky(struct inode *dir, struct inode *inode)
603 {
604 	kuid_t fsuid = current_fsuid();
605 
606 	if (!(dir->i_mode & S_ISVTX))
607 		return 0;
608 	if (uid_eq(inode->i_uid, fsuid))
609 		return 0;
610 	if (uid_eq(dir->i_uid, fsuid))
611 		return 0;
612 	return !capable(CAP_FOWNER);
613 }
614 
615 /*  copy of may_delete in fs/namei.c()
616  *	Check whether we can remove a link victim from directory dir, check
617  *  whether the type of victim is right.
618  *  1. We can't do it if dir is read-only (done in permission())
619  *  2. We should have write and exec permissions on dir
620  *  3. We can't remove anything from append-only dir
621  *  4. We can't do anything with immutable dir (done in permission())
622  *  5. If the sticky bit on dir is set we should either
623  *	a. be owner of dir, or
624  *	b. be owner of victim, or
625  *	c. have CAP_FOWNER capability
626  *  6. If the victim is append-only or immutable we can't do antyhing with
627  *     links pointing to it.
628  *  7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
629  *  8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
630  *  9. We can't remove a root or mountpoint.
631  * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
632  *     nfs_async_unlink().
633  */
634 
635 static int btrfs_may_delete(struct inode *dir,struct dentry *victim,int isdir)
636 {
637 	int error;
638 
639 	if (!victim->d_inode)
640 		return -ENOENT;
641 
642 	BUG_ON(victim->d_parent->d_inode != dir);
643 	audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
644 
645 	error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
646 	if (error)
647 		return error;
648 	if (IS_APPEND(dir))
649 		return -EPERM;
650 	if (btrfs_check_sticky(dir, victim->d_inode)||
651 		IS_APPEND(victim->d_inode)||
652 	    IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode))
653 		return -EPERM;
654 	if (isdir) {
655 		if (!S_ISDIR(victim->d_inode->i_mode))
656 			return -ENOTDIR;
657 		if (IS_ROOT(victim))
658 			return -EBUSY;
659 	} else if (S_ISDIR(victim->d_inode->i_mode))
660 		return -EISDIR;
661 	if (IS_DEADDIR(dir))
662 		return -ENOENT;
663 	if (victim->d_flags & DCACHE_NFSFS_RENAMED)
664 		return -EBUSY;
665 	return 0;
666 }
667 
668 /* copy of may_create in fs/namei.c() */
669 static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
670 {
671 	if (child->d_inode)
672 		return -EEXIST;
673 	if (IS_DEADDIR(dir))
674 		return -ENOENT;
675 	return inode_permission(dir, MAY_WRITE | MAY_EXEC);
676 }
677 
678 /*
679  * Create a new subvolume below @parent.  This is largely modeled after
680  * sys_mkdirat and vfs_mkdir, but we only do a single component lookup
681  * inside this filesystem so it's quite a bit simpler.
682  */
683 static noinline int btrfs_mksubvol(struct path *parent,
684 				   char *name, int namelen,
685 				   struct btrfs_root *snap_src,
686 				   u64 *async_transid, bool readonly,
687 				   struct btrfs_qgroup_inherit **inherit)
688 {
689 	struct inode *dir  = parent->dentry->d_inode;
690 	struct dentry *dentry;
691 	int error;
692 
693 	mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
694 
695 	dentry = lookup_one_len(name, parent->dentry, namelen);
696 	error = PTR_ERR(dentry);
697 	if (IS_ERR(dentry))
698 		goto out_unlock;
699 
700 	error = -EEXIST;
701 	if (dentry->d_inode)
702 		goto out_dput;
703 
704 	error = btrfs_may_create(dir, dentry);
705 	if (error)
706 		goto out_dput;
707 
708 	down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
709 
710 	if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)
711 		goto out_up_read;
712 
713 	if (snap_src) {
714 		error = create_snapshot(snap_src, dentry, name, namelen,
715 					async_transid, readonly, inherit);
716 	} else {
717 		error = create_subvol(BTRFS_I(dir)->root, dentry,
718 				      name, namelen, async_transid, inherit);
719 	}
720 	if (!error)
721 		fsnotify_mkdir(dir, dentry);
722 out_up_read:
723 	up_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
724 out_dput:
725 	dput(dentry);
726 out_unlock:
727 	mutex_unlock(&dir->i_mutex);
728 	return error;
729 }
730 
731 /*
732  * When we're defragging a range, we don't want to kick it off again
733  * if it is really just waiting for delalloc to send it down.
734  * If we find a nice big extent or delalloc range for the bytes in the
735  * file you want to defrag, we return 0 to let you know to skip this
736  * part of the file
737  */
738 static int check_defrag_in_cache(struct inode *inode, u64 offset, int thresh)
739 {
740 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
741 	struct extent_map *em = NULL;
742 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
743 	u64 end;
744 
745 	read_lock(&em_tree->lock);
746 	em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
747 	read_unlock(&em_tree->lock);
748 
749 	if (em) {
750 		end = extent_map_end(em);
751 		free_extent_map(em);
752 		if (end - offset > thresh)
753 			return 0;
754 	}
755 	/* if we already have a nice delalloc here, just stop */
756 	thresh /= 2;
757 	end = count_range_bits(io_tree, &offset, offset + thresh,
758 			       thresh, EXTENT_DELALLOC, 1);
759 	if (end >= thresh)
760 		return 0;
761 	return 1;
762 }
763 
764 /*
765  * helper function to walk through a file and find extents
766  * newer than a specific transid, and smaller than thresh.
767  *
768  * This is used by the defragging code to find new and small
769  * extents
770  */
771 static int find_new_extents(struct btrfs_root *root,
772 			    struct inode *inode, u64 newer_than,
773 			    u64 *off, int thresh)
774 {
775 	struct btrfs_path *path;
776 	struct btrfs_key min_key;
777 	struct btrfs_key max_key;
778 	struct extent_buffer *leaf;
779 	struct btrfs_file_extent_item *extent;
780 	int type;
781 	int ret;
782 	u64 ino = btrfs_ino(inode);
783 
784 	path = btrfs_alloc_path();
785 	if (!path)
786 		return -ENOMEM;
787 
788 	min_key.objectid = ino;
789 	min_key.type = BTRFS_EXTENT_DATA_KEY;
790 	min_key.offset = *off;
791 
792 	max_key.objectid = ino;
793 	max_key.type = (u8)-1;
794 	max_key.offset = (u64)-1;
795 
796 	path->keep_locks = 1;
797 
798 	while(1) {
799 		ret = btrfs_search_forward(root, &min_key, &max_key,
800 					   path, 0, newer_than);
801 		if (ret != 0)
802 			goto none;
803 		if (min_key.objectid != ino)
804 			goto none;
805 		if (min_key.type != BTRFS_EXTENT_DATA_KEY)
806 			goto none;
807 
808 		leaf = path->nodes[0];
809 		extent = btrfs_item_ptr(leaf, path->slots[0],
810 					struct btrfs_file_extent_item);
811 
812 		type = btrfs_file_extent_type(leaf, extent);
813 		if (type == BTRFS_FILE_EXTENT_REG &&
814 		    btrfs_file_extent_num_bytes(leaf, extent) < thresh &&
815 		    check_defrag_in_cache(inode, min_key.offset, thresh)) {
816 			*off = min_key.offset;
817 			btrfs_free_path(path);
818 			return 0;
819 		}
820 
821 		if (min_key.offset == (u64)-1)
822 			goto none;
823 
824 		min_key.offset++;
825 		btrfs_release_path(path);
826 	}
827 none:
828 	btrfs_free_path(path);
829 	return -ENOENT;
830 }
831 
832 static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
833 {
834 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
835 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
836 	struct extent_map *em;
837 	u64 len = PAGE_CACHE_SIZE;
838 
839 	/*
840 	 * hopefully we have this extent in the tree already, try without
841 	 * the full extent lock
842 	 */
843 	read_lock(&em_tree->lock);
844 	em = lookup_extent_mapping(em_tree, start, len);
845 	read_unlock(&em_tree->lock);
846 
847 	if (!em) {
848 		/* get the big lock and read metadata off disk */
849 		lock_extent(io_tree, start, start + len - 1);
850 		em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
851 		unlock_extent(io_tree, start, start + len - 1);
852 
853 		if (IS_ERR(em))
854 			return NULL;
855 	}
856 
857 	return em;
858 }
859 
860 static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
861 {
862 	struct extent_map *next;
863 	bool ret = true;
864 
865 	/* this is the last extent */
866 	if (em->start + em->len >= i_size_read(inode))
867 		return false;
868 
869 	next = defrag_lookup_extent(inode, em->start + em->len);
870 	if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
871 		ret = false;
872 
873 	free_extent_map(next);
874 	return ret;
875 }
876 
877 static int should_defrag_range(struct inode *inode, u64 start, int thresh,
878 			       u64 *last_len, u64 *skip, u64 *defrag_end,
879 			       int compress)
880 {
881 	struct extent_map *em;
882 	int ret = 1;
883 	bool next_mergeable = true;
884 
885 	/*
886 	 * make sure that once we start defragging an extent, we keep on
887 	 * defragging it
888 	 */
889 	if (start < *defrag_end)
890 		return 1;
891 
892 	*skip = 0;
893 
894 	em = defrag_lookup_extent(inode, start);
895 	if (!em)
896 		return 0;
897 
898 	/* this will cover holes, and inline extents */
899 	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
900 		ret = 0;
901 		goto out;
902 	}
903 
904 	next_mergeable = defrag_check_next_extent(inode, em);
905 
906 	/*
907 	 * we hit a real extent, if it is big or the next extent is not a
908 	 * real extent, don't bother defragging it
909 	 */
910 	if (!compress && (*last_len == 0 || *last_len >= thresh) &&
911 	    (em->len >= thresh || !next_mergeable))
912 		ret = 0;
913 out:
914 	/*
915 	 * last_len ends up being a counter of how many bytes we've defragged.
916 	 * every time we choose not to defrag an extent, we reset *last_len
917 	 * so that the next tiny extent will force a defrag.
918 	 *
919 	 * The end result of this is that tiny extents before a single big
920 	 * extent will force at least part of that big extent to be defragged.
921 	 */
922 	if (ret) {
923 		*defrag_end = extent_map_end(em);
924 	} else {
925 		*last_len = 0;
926 		*skip = extent_map_end(em);
927 		*defrag_end = 0;
928 	}
929 
930 	free_extent_map(em);
931 	return ret;
932 }
933 
934 /*
935  * it doesn't do much good to defrag one or two pages
936  * at a time.  This pulls in a nice chunk of pages
937  * to COW and defrag.
938  *
939  * It also makes sure the delalloc code has enough
940  * dirty data to avoid making new small extents as part
941  * of the defrag
942  *
943  * It's a good idea to start RA on this range
944  * before calling this.
945  */
946 static int cluster_pages_for_defrag(struct inode *inode,
947 				    struct page **pages,
948 				    unsigned long start_index,
949 				    int num_pages)
950 {
951 	unsigned long file_end;
952 	u64 isize = i_size_read(inode);
953 	u64 page_start;
954 	u64 page_end;
955 	u64 page_cnt;
956 	int ret;
957 	int i;
958 	int i_done;
959 	struct btrfs_ordered_extent *ordered;
960 	struct extent_state *cached_state = NULL;
961 	struct extent_io_tree *tree;
962 	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
963 
964 	file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
965 	if (!isize || start_index > file_end)
966 		return 0;
967 
968 	page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
969 
970 	ret = btrfs_delalloc_reserve_space(inode,
971 					   page_cnt << PAGE_CACHE_SHIFT);
972 	if (ret)
973 		return ret;
974 	i_done = 0;
975 	tree = &BTRFS_I(inode)->io_tree;
976 
977 	/* step one, lock all the pages */
978 	for (i = 0; i < page_cnt; i++) {
979 		struct page *page;
980 again:
981 		page = find_or_create_page(inode->i_mapping,
982 					   start_index + i, mask);
983 		if (!page)
984 			break;
985 
986 		page_start = page_offset(page);
987 		page_end = page_start + PAGE_CACHE_SIZE - 1;
988 		while (1) {
989 			lock_extent(tree, page_start, page_end);
990 			ordered = btrfs_lookup_ordered_extent(inode,
991 							      page_start);
992 			unlock_extent(tree, page_start, page_end);
993 			if (!ordered)
994 				break;
995 
996 			unlock_page(page);
997 			btrfs_start_ordered_extent(inode, ordered, 1);
998 			btrfs_put_ordered_extent(ordered);
999 			lock_page(page);
1000 			/*
1001 			 * we unlocked the page above, so we need check if
1002 			 * it was released or not.
1003 			 */
1004 			if (page->mapping != inode->i_mapping) {
1005 				unlock_page(page);
1006 				page_cache_release(page);
1007 				goto again;
1008 			}
1009 		}
1010 
1011 		if (!PageUptodate(page)) {
1012 			btrfs_readpage(NULL, page);
1013 			lock_page(page);
1014 			if (!PageUptodate(page)) {
1015 				unlock_page(page);
1016 				page_cache_release(page);
1017 				ret = -EIO;
1018 				break;
1019 			}
1020 		}
1021 
1022 		if (page->mapping != inode->i_mapping) {
1023 			unlock_page(page);
1024 			page_cache_release(page);
1025 			goto again;
1026 		}
1027 
1028 		pages[i] = page;
1029 		i_done++;
1030 	}
1031 	if (!i_done || ret)
1032 		goto out;
1033 
1034 	if (!(inode->i_sb->s_flags & MS_ACTIVE))
1035 		goto out;
1036 
1037 	/*
1038 	 * so now we have a nice long stream of locked
1039 	 * and up to date pages, lets wait on them
1040 	 */
1041 	for (i = 0; i < i_done; i++)
1042 		wait_on_page_writeback(pages[i]);
1043 
1044 	page_start = page_offset(pages[0]);
1045 	page_end = page_offset(pages[i_done - 1]) + PAGE_CACHE_SIZE;
1046 
1047 	lock_extent_bits(&BTRFS_I(inode)->io_tree,
1048 			 page_start, page_end - 1, 0, &cached_state);
1049 	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
1050 			  page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
1051 			  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
1052 			  &cached_state, GFP_NOFS);
1053 
1054 	if (i_done != page_cnt) {
1055 		spin_lock(&BTRFS_I(inode)->lock);
1056 		BTRFS_I(inode)->outstanding_extents++;
1057 		spin_unlock(&BTRFS_I(inode)->lock);
1058 		btrfs_delalloc_release_space(inode,
1059 				     (page_cnt - i_done) << PAGE_CACHE_SHIFT);
1060 	}
1061 
1062 
1063 	set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1,
1064 			  &cached_state, GFP_NOFS);
1065 
1066 	unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1067 			     page_start, page_end - 1, &cached_state,
1068 			     GFP_NOFS);
1069 
1070 	for (i = 0; i < i_done; i++) {
1071 		clear_page_dirty_for_io(pages[i]);
1072 		ClearPageChecked(pages[i]);
1073 		set_page_extent_mapped(pages[i]);
1074 		set_page_dirty(pages[i]);
1075 		unlock_page(pages[i]);
1076 		page_cache_release(pages[i]);
1077 	}
1078 	return i_done;
1079 out:
1080 	for (i = 0; i < i_done; i++) {
1081 		unlock_page(pages[i]);
1082 		page_cache_release(pages[i]);
1083 	}
1084 	btrfs_delalloc_release_space(inode, page_cnt << PAGE_CACHE_SHIFT);
1085 	return ret;
1086 
1087 }
1088 
1089 int btrfs_defrag_file(struct inode *inode, struct file *file,
1090 		      struct btrfs_ioctl_defrag_range_args *range,
1091 		      u64 newer_than, unsigned long max_to_defrag)
1092 {
1093 	struct btrfs_root *root = BTRFS_I(inode)->root;
1094 	struct file_ra_state *ra = NULL;
1095 	unsigned long last_index;
1096 	u64 isize = i_size_read(inode);
1097 	u64 last_len = 0;
1098 	u64 skip = 0;
1099 	u64 defrag_end = 0;
1100 	u64 newer_off = range->start;
1101 	unsigned long i;
1102 	unsigned long ra_index = 0;
1103 	int ret;
1104 	int defrag_count = 0;
1105 	int compress_type = BTRFS_COMPRESS_ZLIB;
1106 	int extent_thresh = range->extent_thresh;
1107 	int max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
1108 	int cluster = max_cluster;
1109 	u64 new_align = ~((u64)128 * 1024 - 1);
1110 	struct page **pages = NULL;
1111 
1112 	if (extent_thresh == 0)
1113 		extent_thresh = 256 * 1024;
1114 
1115 	if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
1116 		if (range->compress_type > BTRFS_COMPRESS_TYPES)
1117 			return -EINVAL;
1118 		if (range->compress_type)
1119 			compress_type = range->compress_type;
1120 	}
1121 
1122 	if (isize == 0)
1123 		return 0;
1124 
1125 	/*
1126 	 * if we were not given a file, allocate a readahead
1127 	 * context
1128 	 */
1129 	if (!file) {
1130 		ra = kzalloc(sizeof(*ra), GFP_NOFS);
1131 		if (!ra)
1132 			return -ENOMEM;
1133 		file_ra_state_init(ra, inode->i_mapping);
1134 	} else {
1135 		ra = &file->f_ra;
1136 	}
1137 
1138 	pages = kmalloc(sizeof(struct page *) * max_cluster,
1139 			GFP_NOFS);
1140 	if (!pages) {
1141 		ret = -ENOMEM;
1142 		goto out_ra;
1143 	}
1144 
1145 	/* find the last page to defrag */
1146 	if (range->start + range->len > range->start) {
1147 		last_index = min_t(u64, isize - 1,
1148 			 range->start + range->len - 1) >> PAGE_CACHE_SHIFT;
1149 	} else {
1150 		last_index = (isize - 1) >> PAGE_CACHE_SHIFT;
1151 	}
1152 
1153 	if (newer_than) {
1154 		ret = find_new_extents(root, inode, newer_than,
1155 				       &newer_off, 64 * 1024);
1156 		if (!ret) {
1157 			range->start = newer_off;
1158 			/*
1159 			 * we always align our defrag to help keep
1160 			 * the extents in the file evenly spaced
1161 			 */
1162 			i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
1163 		} else
1164 			goto out_ra;
1165 	} else {
1166 		i = range->start >> PAGE_CACHE_SHIFT;
1167 	}
1168 	if (!max_to_defrag)
1169 		max_to_defrag = last_index + 1;
1170 
1171 	/*
1172 	 * make writeback starts from i, so the defrag range can be
1173 	 * written sequentially.
1174 	 */
1175 	if (i < inode->i_mapping->writeback_index)
1176 		inode->i_mapping->writeback_index = i;
1177 
1178 	while (i <= last_index && defrag_count < max_to_defrag &&
1179 	       (i < (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
1180 		PAGE_CACHE_SHIFT)) {
1181 		/*
1182 		 * make sure we stop running if someone unmounts
1183 		 * the FS
1184 		 */
1185 		if (!(inode->i_sb->s_flags & MS_ACTIVE))
1186 			break;
1187 
1188 		if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
1189 					 extent_thresh, &last_len, &skip,
1190 					 &defrag_end, range->flags &
1191 					 BTRFS_DEFRAG_RANGE_COMPRESS)) {
1192 			unsigned long next;
1193 			/*
1194 			 * the should_defrag function tells us how much to skip
1195 			 * bump our counter by the suggested amount
1196 			 */
1197 			next = (skip + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1198 			i = max(i + 1, next);
1199 			continue;
1200 		}
1201 
1202 		if (!newer_than) {
1203 			cluster = (PAGE_CACHE_ALIGN(defrag_end) >>
1204 				   PAGE_CACHE_SHIFT) - i;
1205 			cluster = min(cluster, max_cluster);
1206 		} else {
1207 			cluster = max_cluster;
1208 		}
1209 
1210 		if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
1211 			BTRFS_I(inode)->force_compress = compress_type;
1212 
1213 		if (i + cluster > ra_index) {
1214 			ra_index = max(i, ra_index);
1215 			btrfs_force_ra(inode->i_mapping, ra, file, ra_index,
1216 				       cluster);
1217 			ra_index += max_cluster;
1218 		}
1219 
1220 		mutex_lock(&inode->i_mutex);
1221 		ret = cluster_pages_for_defrag(inode, pages, i, cluster);
1222 		if (ret < 0) {
1223 			mutex_unlock(&inode->i_mutex);
1224 			goto out_ra;
1225 		}
1226 
1227 		defrag_count += ret;
1228 		balance_dirty_pages_ratelimited(inode->i_mapping);
1229 		mutex_unlock(&inode->i_mutex);
1230 
1231 		if (newer_than) {
1232 			if (newer_off == (u64)-1)
1233 				break;
1234 
1235 			if (ret > 0)
1236 				i += ret;
1237 
1238 			newer_off = max(newer_off + 1,
1239 					(u64)i << PAGE_CACHE_SHIFT);
1240 
1241 			ret = find_new_extents(root, inode,
1242 					       newer_than, &newer_off,
1243 					       64 * 1024);
1244 			if (!ret) {
1245 				range->start = newer_off;
1246 				i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
1247 			} else {
1248 				break;
1249 			}
1250 		} else {
1251 			if (ret > 0) {
1252 				i += ret;
1253 				last_len += ret << PAGE_CACHE_SHIFT;
1254 			} else {
1255 				i++;
1256 				last_len = 0;
1257 			}
1258 		}
1259 	}
1260 
1261 	if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO))
1262 		filemap_flush(inode->i_mapping);
1263 
1264 	if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
1265 		/* the filemap_flush will queue IO into the worker threads, but
1266 		 * we have to make sure the IO is actually started and that
1267 		 * ordered extents get created before we return
1268 		 */
1269 		atomic_inc(&root->fs_info->async_submit_draining);
1270 		while (atomic_read(&root->fs_info->nr_async_submits) ||
1271 		      atomic_read(&root->fs_info->async_delalloc_pages)) {
1272 			wait_event(root->fs_info->async_submit_wait,
1273 			   (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
1274 			    atomic_read(&root->fs_info->async_delalloc_pages) == 0));
1275 		}
1276 		atomic_dec(&root->fs_info->async_submit_draining);
1277 
1278 		mutex_lock(&inode->i_mutex);
1279 		BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE;
1280 		mutex_unlock(&inode->i_mutex);
1281 	}
1282 
1283 	if (range->compress_type == BTRFS_COMPRESS_LZO) {
1284 		btrfs_set_fs_incompat(root->fs_info, COMPRESS_LZO);
1285 	}
1286 
1287 	ret = defrag_count;
1288 
1289 out_ra:
1290 	if (!file)
1291 		kfree(ra);
1292 	kfree(pages);
1293 	return ret;
1294 }
1295 
1296 static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1297 					void __user *arg)
1298 {
1299 	u64 new_size;
1300 	u64 old_size;
1301 	u64 devid = 1;
1302 	struct btrfs_ioctl_vol_args *vol_args;
1303 	struct btrfs_trans_handle *trans;
1304 	struct btrfs_device *device = NULL;
1305 	char *sizestr;
1306 	char *devstr = NULL;
1307 	int ret = 0;
1308 	int mod = 0;
1309 
1310 	if (root->fs_info->sb->s_flags & MS_RDONLY)
1311 		return -EROFS;
1312 
1313 	if (!capable(CAP_SYS_ADMIN))
1314 		return -EPERM;
1315 
1316 	mutex_lock(&root->fs_info->volume_mutex);
1317 	if (root->fs_info->balance_ctl) {
1318 		printk(KERN_INFO "btrfs: balance in progress\n");
1319 		ret = -EINVAL;
1320 		goto out;
1321 	}
1322 
1323 	vol_args = memdup_user(arg, sizeof(*vol_args));
1324 	if (IS_ERR(vol_args)) {
1325 		ret = PTR_ERR(vol_args);
1326 		goto out;
1327 	}
1328 
1329 	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
1330 
1331 	sizestr = vol_args->name;
1332 	devstr = strchr(sizestr, ':');
1333 	if (devstr) {
1334 		char *end;
1335 		sizestr = devstr + 1;
1336 		*devstr = '\0';
1337 		devstr = vol_args->name;
1338 		devid = simple_strtoull(devstr, &end, 10);
1339 		printk(KERN_INFO "btrfs: resizing devid %llu\n",
1340 		       (unsigned long long)devid);
1341 	}
1342 	device = btrfs_find_device(root, devid, NULL, NULL);
1343 	if (!device) {
1344 		printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
1345 		       (unsigned long long)devid);
1346 		ret = -EINVAL;
1347 		goto out_free;
1348 	}
1349 	if (device->fs_devices && device->fs_devices->seeding) {
1350 		printk(KERN_INFO "btrfs: resizer unable to apply on "
1351 		       "seeding device %llu\n",
1352 		       (unsigned long long)devid);
1353 		ret = -EINVAL;
1354 		goto out_free;
1355 	}
1356 
1357 	if (!strcmp(sizestr, "max"))
1358 		new_size = device->bdev->bd_inode->i_size;
1359 	else {
1360 		if (sizestr[0] == '-') {
1361 			mod = -1;
1362 			sizestr++;
1363 		} else if (sizestr[0] == '+') {
1364 			mod = 1;
1365 			sizestr++;
1366 		}
1367 		new_size = memparse(sizestr, NULL);
1368 		if (new_size == 0) {
1369 			ret = -EINVAL;
1370 			goto out_free;
1371 		}
1372 	}
1373 
1374 	old_size = device->total_bytes;
1375 
1376 	if (mod < 0) {
1377 		if (new_size > old_size) {
1378 			ret = -EINVAL;
1379 			goto out_free;
1380 		}
1381 		new_size = old_size - new_size;
1382 	} else if (mod > 0) {
1383 		new_size = old_size + new_size;
1384 	}
1385 
1386 	if (new_size < 256 * 1024 * 1024) {
1387 		ret = -EINVAL;
1388 		goto out_free;
1389 	}
1390 	if (new_size > device->bdev->bd_inode->i_size) {
1391 		ret = -EFBIG;
1392 		goto out_free;
1393 	}
1394 
1395 	do_div(new_size, root->sectorsize);
1396 	new_size *= root->sectorsize;
1397 
1398 	printk_in_rcu(KERN_INFO "btrfs: new size for %s is %llu\n",
1399 		      rcu_str_deref(device->name),
1400 		      (unsigned long long)new_size);
1401 
1402 	if (new_size > old_size) {
1403 		trans = btrfs_start_transaction(root, 0);
1404 		if (IS_ERR(trans)) {
1405 			ret = PTR_ERR(trans);
1406 			goto out_free;
1407 		}
1408 		ret = btrfs_grow_device(trans, device, new_size);
1409 		btrfs_commit_transaction(trans, root);
1410 	} else if (new_size < old_size) {
1411 		ret = btrfs_shrink_device(device, new_size);
1412 	}
1413 
1414 out_free:
1415 	kfree(vol_args);
1416 out:
1417 	mutex_unlock(&root->fs_info->volume_mutex);
1418 	return ret;
1419 }
1420 
1421 static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
1422 				char *name, unsigned long fd, int subvol,
1423 				u64 *transid, bool readonly,
1424 				struct btrfs_qgroup_inherit **inherit)
1425 {
1426 	int namelen;
1427 	int ret = 0;
1428 
1429 	ret = mnt_want_write_file(file);
1430 	if (ret)
1431 		goto out;
1432 
1433 	namelen = strlen(name);
1434 	if (strchr(name, '/')) {
1435 		ret = -EINVAL;
1436 		goto out_drop_write;
1437 	}
1438 
1439 	if (name[0] == '.' &&
1440 	   (namelen == 1 || (name[1] == '.' && namelen == 2))) {
1441 		ret = -EEXIST;
1442 		goto out_drop_write;
1443 	}
1444 
1445 	if (subvol) {
1446 		ret = btrfs_mksubvol(&file->f_path, name, namelen,
1447 				     NULL, transid, readonly, inherit);
1448 	} else {
1449 		struct fd src = fdget(fd);
1450 		struct inode *src_inode;
1451 		if (!src.file) {
1452 			ret = -EINVAL;
1453 			goto out_drop_write;
1454 		}
1455 
1456 		src_inode = src.file->f_path.dentry->d_inode;
1457 		if (src_inode->i_sb != file->f_path.dentry->d_inode->i_sb) {
1458 			printk(KERN_INFO "btrfs: Snapshot src from "
1459 			       "another FS\n");
1460 			ret = -EINVAL;
1461 		} else {
1462 			ret = btrfs_mksubvol(&file->f_path, name, namelen,
1463 					     BTRFS_I(src_inode)->root,
1464 					     transid, readonly, inherit);
1465 		}
1466 		fdput(src);
1467 	}
1468 out_drop_write:
1469 	mnt_drop_write_file(file);
1470 out:
1471 	return ret;
1472 }
1473 
1474 static noinline int btrfs_ioctl_snap_create(struct file *file,
1475 					    void __user *arg, int subvol)
1476 {
1477 	struct btrfs_ioctl_vol_args *vol_args;
1478 	int ret;
1479 
1480 	vol_args = memdup_user(arg, sizeof(*vol_args));
1481 	if (IS_ERR(vol_args))
1482 		return PTR_ERR(vol_args);
1483 	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
1484 
1485 	ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
1486 					      vol_args->fd, subvol,
1487 					      NULL, false, NULL);
1488 
1489 	kfree(vol_args);
1490 	return ret;
1491 }
1492 
1493 static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
1494 					       void __user *arg, int subvol)
1495 {
1496 	struct btrfs_ioctl_vol_args_v2 *vol_args;
1497 	int ret;
1498 	u64 transid = 0;
1499 	u64 *ptr = NULL;
1500 	bool readonly = false;
1501 	struct btrfs_qgroup_inherit *inherit = NULL;
1502 
1503 	vol_args = memdup_user(arg, sizeof(*vol_args));
1504 	if (IS_ERR(vol_args))
1505 		return PTR_ERR(vol_args);
1506 	vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
1507 
1508 	if (vol_args->flags &
1509 	    ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY |
1510 	      BTRFS_SUBVOL_QGROUP_INHERIT)) {
1511 		ret = -EOPNOTSUPP;
1512 		goto out;
1513 	}
1514 
1515 	if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC)
1516 		ptr = &transid;
1517 	if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
1518 		readonly = true;
1519 	if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
1520 		if (vol_args->size > PAGE_CACHE_SIZE) {
1521 			ret = -EINVAL;
1522 			goto out;
1523 		}
1524 		inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size);
1525 		if (IS_ERR(inherit)) {
1526 			ret = PTR_ERR(inherit);
1527 			goto out;
1528 		}
1529 	}
1530 
1531 	ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
1532 					      vol_args->fd, subvol, ptr,
1533 					      readonly, &inherit);
1534 
1535 	if (ret == 0 && ptr &&
1536 	    copy_to_user(arg +
1537 			 offsetof(struct btrfs_ioctl_vol_args_v2,
1538 				  transid), ptr, sizeof(*ptr)))
1539 		ret = -EFAULT;
1540 out:
1541 	kfree(vol_args);
1542 	kfree(inherit);
1543 	return ret;
1544 }
1545 
1546 static noinline int btrfs_ioctl_subvol_getflags(struct file *file,
1547 						void __user *arg)
1548 {
1549 	struct inode *inode = fdentry(file)->d_inode;
1550 	struct btrfs_root *root = BTRFS_I(inode)->root;
1551 	int ret = 0;
1552 	u64 flags = 0;
1553 
1554 	if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID)
1555 		return -EINVAL;
1556 
1557 	down_read(&root->fs_info->subvol_sem);
1558 	if (btrfs_root_readonly(root))
1559 		flags |= BTRFS_SUBVOL_RDONLY;
1560 	up_read(&root->fs_info->subvol_sem);
1561 
1562 	if (copy_to_user(arg, &flags, sizeof(flags)))
1563 		ret = -EFAULT;
1564 
1565 	return ret;
1566 }
1567 
1568 static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
1569 					      void __user *arg)
1570 {
1571 	struct inode *inode = fdentry(file)->d_inode;
1572 	struct btrfs_root *root = BTRFS_I(inode)->root;
1573 	struct btrfs_trans_handle *trans;
1574 	u64 root_flags;
1575 	u64 flags;
1576 	int ret = 0;
1577 
1578 	ret = mnt_want_write_file(file);
1579 	if (ret)
1580 		goto out;
1581 
1582 	if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) {
1583 		ret = -EINVAL;
1584 		goto out_drop_write;
1585 	}
1586 
1587 	if (copy_from_user(&flags, arg, sizeof(flags))) {
1588 		ret = -EFAULT;
1589 		goto out_drop_write;
1590 	}
1591 
1592 	if (flags & BTRFS_SUBVOL_CREATE_ASYNC) {
1593 		ret = -EINVAL;
1594 		goto out_drop_write;
1595 	}
1596 
1597 	if (flags & ~BTRFS_SUBVOL_RDONLY) {
1598 		ret = -EOPNOTSUPP;
1599 		goto out_drop_write;
1600 	}
1601 
1602 	if (!inode_owner_or_capable(inode)) {
1603 		ret = -EACCES;
1604 		goto out_drop_write;
1605 	}
1606 
1607 	down_write(&root->fs_info->subvol_sem);
1608 
1609 	/* nothing to do */
1610 	if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root))
1611 		goto out_drop_sem;
1612 
1613 	root_flags = btrfs_root_flags(&root->root_item);
1614 	if (flags & BTRFS_SUBVOL_RDONLY)
1615 		btrfs_set_root_flags(&root->root_item,
1616 				     root_flags | BTRFS_ROOT_SUBVOL_RDONLY);
1617 	else
1618 		btrfs_set_root_flags(&root->root_item,
1619 				     root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY);
1620 
1621 	trans = btrfs_start_transaction(root, 1);
1622 	if (IS_ERR(trans)) {
1623 		ret = PTR_ERR(trans);
1624 		goto out_reset;
1625 	}
1626 
1627 	ret = btrfs_update_root(trans, root->fs_info->tree_root,
1628 				&root->root_key, &root->root_item);
1629 
1630 	btrfs_commit_transaction(trans, root);
1631 out_reset:
1632 	if (ret)
1633 		btrfs_set_root_flags(&root->root_item, root_flags);
1634 out_drop_sem:
1635 	up_write(&root->fs_info->subvol_sem);
1636 out_drop_write:
1637 	mnt_drop_write_file(file);
1638 out:
1639 	return ret;
1640 }
1641 
1642 /*
1643  * helper to check if the subvolume references other subvolumes
1644  */
1645 static noinline int may_destroy_subvol(struct btrfs_root *root)
1646 {
1647 	struct btrfs_path *path;
1648 	struct btrfs_key key;
1649 	int ret;
1650 
1651 	path = btrfs_alloc_path();
1652 	if (!path)
1653 		return -ENOMEM;
1654 
1655 	key.objectid = root->root_key.objectid;
1656 	key.type = BTRFS_ROOT_REF_KEY;
1657 	key.offset = (u64)-1;
1658 
1659 	ret = btrfs_search_slot(NULL, root->fs_info->tree_root,
1660 				&key, path, 0, 0);
1661 	if (ret < 0)
1662 		goto out;
1663 	BUG_ON(ret == 0);
1664 
1665 	ret = 0;
1666 	if (path->slots[0] > 0) {
1667 		path->slots[0]--;
1668 		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1669 		if (key.objectid == root->root_key.objectid &&
1670 		    key.type == BTRFS_ROOT_REF_KEY)
1671 			ret = -ENOTEMPTY;
1672 	}
1673 out:
1674 	btrfs_free_path(path);
1675 	return ret;
1676 }
1677 
1678 static noinline int key_in_sk(struct btrfs_key *key,
1679 			      struct btrfs_ioctl_search_key *sk)
1680 {
1681 	struct btrfs_key test;
1682 	int ret;
1683 
1684 	test.objectid = sk->min_objectid;
1685 	test.type = sk->min_type;
1686 	test.offset = sk->min_offset;
1687 
1688 	ret = btrfs_comp_cpu_keys(key, &test);
1689 	if (ret < 0)
1690 		return 0;
1691 
1692 	test.objectid = sk->max_objectid;
1693 	test.type = sk->max_type;
1694 	test.offset = sk->max_offset;
1695 
1696 	ret = btrfs_comp_cpu_keys(key, &test);
1697 	if (ret > 0)
1698 		return 0;
1699 	return 1;
1700 }
1701 
1702 static noinline int copy_to_sk(struct btrfs_root *root,
1703 			       struct btrfs_path *path,
1704 			       struct btrfs_key *key,
1705 			       struct btrfs_ioctl_search_key *sk,
1706 			       char *buf,
1707 			       unsigned long *sk_offset,
1708 			       int *num_found)
1709 {
1710 	u64 found_transid;
1711 	struct extent_buffer *leaf;
1712 	struct btrfs_ioctl_search_header sh;
1713 	unsigned long item_off;
1714 	unsigned long item_len;
1715 	int nritems;
1716 	int i;
1717 	int slot;
1718 	int ret = 0;
1719 
1720 	leaf = path->nodes[0];
1721 	slot = path->slots[0];
1722 	nritems = btrfs_header_nritems(leaf);
1723 
1724 	if (btrfs_header_generation(leaf) > sk->max_transid) {
1725 		i = nritems;
1726 		goto advance_key;
1727 	}
1728 	found_transid = btrfs_header_generation(leaf);
1729 
1730 	for (i = slot; i < nritems; i++) {
1731 		item_off = btrfs_item_ptr_offset(leaf, i);
1732 		item_len = btrfs_item_size_nr(leaf, i);
1733 
1734 		if (item_len > BTRFS_SEARCH_ARGS_BUFSIZE)
1735 			item_len = 0;
1736 
1737 		if (sizeof(sh) + item_len + *sk_offset >
1738 		    BTRFS_SEARCH_ARGS_BUFSIZE) {
1739 			ret = 1;
1740 			goto overflow;
1741 		}
1742 
1743 		btrfs_item_key_to_cpu(leaf, key, i);
1744 		if (!key_in_sk(key, sk))
1745 			continue;
1746 
1747 		sh.objectid = key->objectid;
1748 		sh.offset = key->offset;
1749 		sh.type = key->type;
1750 		sh.len = item_len;
1751 		sh.transid = found_transid;
1752 
1753 		/* copy search result header */
1754 		memcpy(buf + *sk_offset, &sh, sizeof(sh));
1755 		*sk_offset += sizeof(sh);
1756 
1757 		if (item_len) {
1758 			char *p = buf + *sk_offset;
1759 			/* copy the item */
1760 			read_extent_buffer(leaf, p,
1761 					   item_off, item_len);
1762 			*sk_offset += item_len;
1763 		}
1764 		(*num_found)++;
1765 
1766 		if (*num_found >= sk->nr_items)
1767 			break;
1768 	}
1769 advance_key:
1770 	ret = 0;
1771 	if (key->offset < (u64)-1 && key->offset < sk->max_offset)
1772 		key->offset++;
1773 	else if (key->type < (u8)-1 && key->type < sk->max_type) {
1774 		key->offset = 0;
1775 		key->type++;
1776 	} else if (key->objectid < (u64)-1 && key->objectid < sk->max_objectid) {
1777 		key->offset = 0;
1778 		key->type = 0;
1779 		key->objectid++;
1780 	} else
1781 		ret = 1;
1782 overflow:
1783 	return ret;
1784 }
1785 
1786 static noinline int search_ioctl(struct inode *inode,
1787 				 struct btrfs_ioctl_search_args *args)
1788 {
1789 	struct btrfs_root *root;
1790 	struct btrfs_key key;
1791 	struct btrfs_key max_key;
1792 	struct btrfs_path *path;
1793 	struct btrfs_ioctl_search_key *sk = &args->key;
1794 	struct btrfs_fs_info *info = BTRFS_I(inode)->root->fs_info;
1795 	int ret;
1796 	int num_found = 0;
1797 	unsigned long sk_offset = 0;
1798 
1799 	path = btrfs_alloc_path();
1800 	if (!path)
1801 		return -ENOMEM;
1802 
1803 	if (sk->tree_id == 0) {
1804 		/* search the root of the inode that was passed */
1805 		root = BTRFS_I(inode)->root;
1806 	} else {
1807 		key.objectid = sk->tree_id;
1808 		key.type = BTRFS_ROOT_ITEM_KEY;
1809 		key.offset = (u64)-1;
1810 		root = btrfs_read_fs_root_no_name(info, &key);
1811 		if (IS_ERR(root)) {
1812 			printk(KERN_ERR "could not find root %llu\n",
1813 			       sk->tree_id);
1814 			btrfs_free_path(path);
1815 			return -ENOENT;
1816 		}
1817 	}
1818 
1819 	key.objectid = sk->min_objectid;
1820 	key.type = sk->min_type;
1821 	key.offset = sk->min_offset;
1822 
1823 	max_key.objectid = sk->max_objectid;
1824 	max_key.type = sk->max_type;
1825 	max_key.offset = sk->max_offset;
1826 
1827 	path->keep_locks = 1;
1828 
1829 	while(1) {
1830 		ret = btrfs_search_forward(root, &key, &max_key, path, 0,
1831 					   sk->min_transid);
1832 		if (ret != 0) {
1833 			if (ret > 0)
1834 				ret = 0;
1835 			goto err;
1836 		}
1837 		ret = copy_to_sk(root, path, &key, sk, args->buf,
1838 				 &sk_offset, &num_found);
1839 		btrfs_release_path(path);
1840 		if (ret || num_found >= sk->nr_items)
1841 			break;
1842 
1843 	}
1844 	ret = 0;
1845 err:
1846 	sk->nr_items = num_found;
1847 	btrfs_free_path(path);
1848 	return ret;
1849 }
1850 
1851 static noinline int btrfs_ioctl_tree_search(struct file *file,
1852 					   void __user *argp)
1853 {
1854 	 struct btrfs_ioctl_search_args *args;
1855 	 struct inode *inode;
1856 	 int ret;
1857 
1858 	if (!capable(CAP_SYS_ADMIN))
1859 		return -EPERM;
1860 
1861 	args = memdup_user(argp, sizeof(*args));
1862 	if (IS_ERR(args))
1863 		return PTR_ERR(args);
1864 
1865 	inode = fdentry(file)->d_inode;
1866 	ret = search_ioctl(inode, args);
1867 	if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
1868 		ret = -EFAULT;
1869 	kfree(args);
1870 	return ret;
1871 }
1872 
1873 /*
1874  * Search INODE_REFs to identify path name of 'dirid' directory
1875  * in a 'tree_id' tree. and sets path name to 'name'.
1876  */
1877 static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
1878 				u64 tree_id, u64 dirid, char *name)
1879 {
1880 	struct btrfs_root *root;
1881 	struct btrfs_key key;
1882 	char *ptr;
1883 	int ret = -1;
1884 	int slot;
1885 	int len;
1886 	int total_len = 0;
1887 	struct btrfs_inode_ref *iref;
1888 	struct extent_buffer *l;
1889 	struct btrfs_path *path;
1890 
1891 	if (dirid == BTRFS_FIRST_FREE_OBJECTID) {
1892 		name[0]='\0';
1893 		return 0;
1894 	}
1895 
1896 	path = btrfs_alloc_path();
1897 	if (!path)
1898 		return -ENOMEM;
1899 
1900 	ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX];
1901 
1902 	key.objectid = tree_id;
1903 	key.type = BTRFS_ROOT_ITEM_KEY;
1904 	key.offset = (u64)-1;
1905 	root = btrfs_read_fs_root_no_name(info, &key);
1906 	if (IS_ERR(root)) {
1907 		printk(KERN_ERR "could not find root %llu\n", tree_id);
1908 		ret = -ENOENT;
1909 		goto out;
1910 	}
1911 
1912 	key.objectid = dirid;
1913 	key.type = BTRFS_INODE_REF_KEY;
1914 	key.offset = (u64)-1;
1915 
1916 	while(1) {
1917 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1918 		if (ret < 0)
1919 			goto out;
1920 
1921 		l = path->nodes[0];
1922 		slot = path->slots[0];
1923 		if (ret > 0 && slot > 0)
1924 			slot--;
1925 		btrfs_item_key_to_cpu(l, &key, slot);
1926 
1927 		if (ret > 0 && (key.objectid != dirid ||
1928 				key.type != BTRFS_INODE_REF_KEY)) {
1929 			ret = -ENOENT;
1930 			goto out;
1931 		}
1932 
1933 		iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref);
1934 		len = btrfs_inode_ref_name_len(l, iref);
1935 		ptr -= len + 1;
1936 		total_len += len + 1;
1937 		if (ptr < name)
1938 			goto out;
1939 
1940 		*(ptr + len) = '/';
1941 		read_extent_buffer(l, ptr,(unsigned long)(iref + 1), len);
1942 
1943 		if (key.offset == BTRFS_FIRST_FREE_OBJECTID)
1944 			break;
1945 
1946 		btrfs_release_path(path);
1947 		key.objectid = key.offset;
1948 		key.offset = (u64)-1;
1949 		dirid = key.objectid;
1950 	}
1951 	if (ptr < name)
1952 		goto out;
1953 	memmove(name, ptr, total_len);
1954 	name[total_len]='\0';
1955 	ret = 0;
1956 out:
1957 	btrfs_free_path(path);
1958 	return ret;
1959 }
1960 
1961 static noinline int btrfs_ioctl_ino_lookup(struct file *file,
1962 					   void __user *argp)
1963 {
1964 	 struct btrfs_ioctl_ino_lookup_args *args;
1965 	 struct inode *inode;
1966 	 int ret;
1967 
1968 	if (!capable(CAP_SYS_ADMIN))
1969 		return -EPERM;
1970 
1971 	args = memdup_user(argp, sizeof(*args));
1972 	if (IS_ERR(args))
1973 		return PTR_ERR(args);
1974 
1975 	inode = fdentry(file)->d_inode;
1976 
1977 	if (args->treeid == 0)
1978 		args->treeid = BTRFS_I(inode)->root->root_key.objectid;
1979 
1980 	ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info,
1981 					args->treeid, args->objectid,
1982 					args->name);
1983 
1984 	if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
1985 		ret = -EFAULT;
1986 
1987 	kfree(args);
1988 	return ret;
1989 }
1990 
1991 static noinline int btrfs_ioctl_snap_destroy(struct file *file,
1992 					     void __user *arg)
1993 {
1994 	struct dentry *parent = fdentry(file);
1995 	struct dentry *dentry;
1996 	struct inode *dir = parent->d_inode;
1997 	struct inode *inode;
1998 	struct btrfs_root *root = BTRFS_I(dir)->root;
1999 	struct btrfs_root *dest = NULL;
2000 	struct btrfs_ioctl_vol_args *vol_args;
2001 	struct btrfs_trans_handle *trans;
2002 	int namelen;
2003 	int ret;
2004 	int err = 0;
2005 
2006 	vol_args = memdup_user(arg, sizeof(*vol_args));
2007 	if (IS_ERR(vol_args))
2008 		return PTR_ERR(vol_args);
2009 
2010 	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
2011 	namelen = strlen(vol_args->name);
2012 	if (strchr(vol_args->name, '/') ||
2013 	    strncmp(vol_args->name, "..", namelen) == 0) {
2014 		err = -EINVAL;
2015 		goto out;
2016 	}
2017 
2018 	err = mnt_want_write_file(file);
2019 	if (err)
2020 		goto out;
2021 
2022 	mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
2023 	dentry = lookup_one_len(vol_args->name, parent, namelen);
2024 	if (IS_ERR(dentry)) {
2025 		err = PTR_ERR(dentry);
2026 		goto out_unlock_dir;
2027 	}
2028 
2029 	if (!dentry->d_inode) {
2030 		err = -ENOENT;
2031 		goto out_dput;
2032 	}
2033 
2034 	inode = dentry->d_inode;
2035 	dest = BTRFS_I(inode)->root;
2036 	if (!capable(CAP_SYS_ADMIN)){
2037 		/*
2038 		 * Regular user.  Only allow this with a special mount
2039 		 * option, when the user has write+exec access to the
2040 		 * subvol root, and when rmdir(2) would have been
2041 		 * allowed.
2042 		 *
2043 		 * Note that this is _not_ check that the subvol is
2044 		 * empty or doesn't contain data that we wouldn't
2045 		 * otherwise be able to delete.
2046 		 *
2047 		 * Users who want to delete empty subvols should try
2048 		 * rmdir(2).
2049 		 */
2050 		err = -EPERM;
2051 		if (!btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
2052 			goto out_dput;
2053 
2054 		/*
2055 		 * Do not allow deletion if the parent dir is the same
2056 		 * as the dir to be deleted.  That means the ioctl
2057 		 * must be called on the dentry referencing the root
2058 		 * of the subvol, not a random directory contained
2059 		 * within it.
2060 		 */
2061 		err = -EINVAL;
2062 		if (root == dest)
2063 			goto out_dput;
2064 
2065 		err = inode_permission(inode, MAY_WRITE | MAY_EXEC);
2066 		if (err)
2067 			goto out_dput;
2068 
2069 		/* check if subvolume may be deleted by a non-root user */
2070 		err = btrfs_may_delete(dir, dentry, 1);
2071 		if (err)
2072 			goto out_dput;
2073 	}
2074 
2075 	if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) {
2076 		err = -EINVAL;
2077 		goto out_dput;
2078 	}
2079 
2080 	mutex_lock(&inode->i_mutex);
2081 	err = d_invalidate(dentry);
2082 	if (err)
2083 		goto out_unlock;
2084 
2085 	down_write(&root->fs_info->subvol_sem);
2086 
2087 	err = may_destroy_subvol(dest);
2088 	if (err)
2089 		goto out_up_write;
2090 
2091 	trans = btrfs_start_transaction(root, 0);
2092 	if (IS_ERR(trans)) {
2093 		err = PTR_ERR(trans);
2094 		goto out_up_write;
2095 	}
2096 	trans->block_rsv = &root->fs_info->global_block_rsv;
2097 
2098 	ret = btrfs_unlink_subvol(trans, root, dir,
2099 				dest->root_key.objectid,
2100 				dentry->d_name.name,
2101 				dentry->d_name.len);
2102 	if (ret) {
2103 		err = ret;
2104 		btrfs_abort_transaction(trans, root, ret);
2105 		goto out_end_trans;
2106 	}
2107 
2108 	btrfs_record_root_in_trans(trans, dest);
2109 
2110 	memset(&dest->root_item.drop_progress, 0,
2111 		sizeof(dest->root_item.drop_progress));
2112 	dest->root_item.drop_level = 0;
2113 	btrfs_set_root_refs(&dest->root_item, 0);
2114 
2115 	if (!xchg(&dest->orphan_item_inserted, 1)) {
2116 		ret = btrfs_insert_orphan_item(trans,
2117 					root->fs_info->tree_root,
2118 					dest->root_key.objectid);
2119 		if (ret) {
2120 			btrfs_abort_transaction(trans, root, ret);
2121 			err = ret;
2122 			goto out_end_trans;
2123 		}
2124 	}
2125 out_end_trans:
2126 	ret = btrfs_end_transaction(trans, root);
2127 	if (ret && !err)
2128 		err = ret;
2129 	inode->i_flags |= S_DEAD;
2130 out_up_write:
2131 	up_write(&root->fs_info->subvol_sem);
2132 out_unlock:
2133 	mutex_unlock(&inode->i_mutex);
2134 	if (!err) {
2135 		shrink_dcache_sb(root->fs_info->sb);
2136 		btrfs_invalidate_inodes(dest);
2137 		d_delete(dentry);
2138 	}
2139 out_dput:
2140 	dput(dentry);
2141 out_unlock_dir:
2142 	mutex_unlock(&dir->i_mutex);
2143 	mnt_drop_write_file(file);
2144 out:
2145 	kfree(vol_args);
2146 	return err;
2147 }
2148 
2149 static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
2150 {
2151 	struct inode *inode = fdentry(file)->d_inode;
2152 	struct btrfs_root *root = BTRFS_I(inode)->root;
2153 	struct btrfs_ioctl_defrag_range_args *range;
2154 	int ret;
2155 
2156 	if (btrfs_root_readonly(root))
2157 		return -EROFS;
2158 
2159 	ret = mnt_want_write_file(file);
2160 	if (ret)
2161 		return ret;
2162 
2163 	switch (inode->i_mode & S_IFMT) {
2164 	case S_IFDIR:
2165 		if (!capable(CAP_SYS_ADMIN)) {
2166 			ret = -EPERM;
2167 			goto out;
2168 		}
2169 		ret = btrfs_defrag_root(root, 0);
2170 		if (ret)
2171 			goto out;
2172 		ret = btrfs_defrag_root(root->fs_info->extent_root, 0);
2173 		break;
2174 	case S_IFREG:
2175 		if (!(file->f_mode & FMODE_WRITE)) {
2176 			ret = -EINVAL;
2177 			goto out;
2178 		}
2179 
2180 		range = kzalloc(sizeof(*range), GFP_KERNEL);
2181 		if (!range) {
2182 			ret = -ENOMEM;
2183 			goto out;
2184 		}
2185 
2186 		if (argp) {
2187 			if (copy_from_user(range, argp,
2188 					   sizeof(*range))) {
2189 				ret = -EFAULT;
2190 				kfree(range);
2191 				goto out;
2192 			}
2193 			/* compression requires us to start the IO */
2194 			if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
2195 				range->flags |= BTRFS_DEFRAG_RANGE_START_IO;
2196 				range->extent_thresh = (u32)-1;
2197 			}
2198 		} else {
2199 			/* the rest are all set to zero by kzalloc */
2200 			range->len = (u64)-1;
2201 		}
2202 		ret = btrfs_defrag_file(fdentry(file)->d_inode, file,
2203 					range, 0, 0);
2204 		if (ret > 0)
2205 			ret = 0;
2206 		kfree(range);
2207 		break;
2208 	default:
2209 		ret = -EINVAL;
2210 	}
2211 out:
2212 	mnt_drop_write_file(file);
2213 	return ret;
2214 }
2215 
2216 static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
2217 {
2218 	struct btrfs_ioctl_vol_args *vol_args;
2219 	int ret;
2220 
2221 	if (!capable(CAP_SYS_ADMIN))
2222 		return -EPERM;
2223 
2224 	mutex_lock(&root->fs_info->volume_mutex);
2225 	if (root->fs_info->balance_ctl) {
2226 		printk(KERN_INFO "btrfs: balance in progress\n");
2227 		ret = -EINVAL;
2228 		goto out;
2229 	}
2230 
2231 	vol_args = memdup_user(arg, sizeof(*vol_args));
2232 	if (IS_ERR(vol_args)) {
2233 		ret = PTR_ERR(vol_args);
2234 		goto out;
2235 	}
2236 
2237 	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
2238 	ret = btrfs_init_new_device(root, vol_args->name);
2239 
2240 	kfree(vol_args);
2241 out:
2242 	mutex_unlock(&root->fs_info->volume_mutex);
2243 	return ret;
2244 }
2245 
2246 static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
2247 {
2248 	struct btrfs_ioctl_vol_args *vol_args;
2249 	int ret;
2250 
2251 	if (!capable(CAP_SYS_ADMIN))
2252 		return -EPERM;
2253 
2254 	if (root->fs_info->sb->s_flags & MS_RDONLY)
2255 		return -EROFS;
2256 
2257 	mutex_lock(&root->fs_info->volume_mutex);
2258 	if (root->fs_info->balance_ctl) {
2259 		printk(KERN_INFO "btrfs: balance in progress\n");
2260 		ret = -EINVAL;
2261 		goto out;
2262 	}
2263 
2264 	vol_args = memdup_user(arg, sizeof(*vol_args));
2265 	if (IS_ERR(vol_args)) {
2266 		ret = PTR_ERR(vol_args);
2267 		goto out;
2268 	}
2269 
2270 	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
2271 	ret = btrfs_rm_device(root, vol_args->name);
2272 
2273 	kfree(vol_args);
2274 out:
2275 	mutex_unlock(&root->fs_info->volume_mutex);
2276 	return ret;
2277 }
2278 
2279 static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
2280 {
2281 	struct btrfs_ioctl_fs_info_args *fi_args;
2282 	struct btrfs_device *device;
2283 	struct btrfs_device *next;
2284 	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
2285 	int ret = 0;
2286 
2287 	if (!capable(CAP_SYS_ADMIN))
2288 		return -EPERM;
2289 
2290 	fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL);
2291 	if (!fi_args)
2292 		return -ENOMEM;
2293 
2294 	fi_args->num_devices = fs_devices->num_devices;
2295 	memcpy(&fi_args->fsid, root->fs_info->fsid, sizeof(fi_args->fsid));
2296 
2297 	mutex_lock(&fs_devices->device_list_mutex);
2298 	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
2299 		if (device->devid > fi_args->max_id)
2300 			fi_args->max_id = device->devid;
2301 	}
2302 	mutex_unlock(&fs_devices->device_list_mutex);
2303 
2304 	if (copy_to_user(arg, fi_args, sizeof(*fi_args)))
2305 		ret = -EFAULT;
2306 
2307 	kfree(fi_args);
2308 	return ret;
2309 }
2310 
2311 static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
2312 {
2313 	struct btrfs_ioctl_dev_info_args *di_args;
2314 	struct btrfs_device *dev;
2315 	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
2316 	int ret = 0;
2317 	char *s_uuid = NULL;
2318 	char empty_uuid[BTRFS_UUID_SIZE] = {0};
2319 
2320 	if (!capable(CAP_SYS_ADMIN))
2321 		return -EPERM;
2322 
2323 	di_args = memdup_user(arg, sizeof(*di_args));
2324 	if (IS_ERR(di_args))
2325 		return PTR_ERR(di_args);
2326 
2327 	if (memcmp(empty_uuid, di_args->uuid, BTRFS_UUID_SIZE) != 0)
2328 		s_uuid = di_args->uuid;
2329 
2330 	mutex_lock(&fs_devices->device_list_mutex);
2331 	dev = btrfs_find_device(root, di_args->devid, s_uuid, NULL);
2332 	mutex_unlock(&fs_devices->device_list_mutex);
2333 
2334 	if (!dev) {
2335 		ret = -ENODEV;
2336 		goto out;
2337 	}
2338 
2339 	di_args->devid = dev->devid;
2340 	di_args->bytes_used = dev->bytes_used;
2341 	di_args->total_bytes = dev->total_bytes;
2342 	memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid));
2343 	if (dev->name) {
2344 		struct rcu_string *name;
2345 
2346 		rcu_read_lock();
2347 		name = rcu_dereference(dev->name);
2348 		strncpy(di_args->path, name->str, sizeof(di_args->path));
2349 		rcu_read_unlock();
2350 		di_args->path[sizeof(di_args->path) - 1] = 0;
2351 	} else {
2352 		di_args->path[0] = '\0';
2353 	}
2354 
2355 out:
2356 	if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args)))
2357 		ret = -EFAULT;
2358 
2359 	kfree(di_args);
2360 	return ret;
2361 }
2362 
/*
 * Clone a byte range of one file into another by copying file extent
 * items.
 *
 * @file:    destination file; must be open for writing and not O_APPEND
 * @srcfd:   file descriptor of the source file; must be open for reading
 * @off:     start offset in the source
 * @olen:    number of bytes to clone; 0 means "from @off to the source's
 *           end of file"
 * @destoff: start offset in the destination
 *
 * @off, @destoff and the end of the range must be aligned to the
 * superblock block size; a range ending exactly at the source's i_size is
 * extended to the next block boundary for the purpose of selecting
 * extents, but the destination's i_size is not rounded up.
 *
 * Returns 0 on success or a negative errno: -EINVAL (bad open mode, same
 * inode, NODATASUM mismatch, bad range or alignment, partial clone of a
 * compressed inline extent), -EROFS, -EBADF, -EXDEV (different mount or
 * superblock), -EISDIR, -ENOMEM, or an error from the tree and
 * transaction helpers.
 */
static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
				       u64 off, u64 olen, u64 destoff)
{
	struct inode *inode = fdentry(file)->d_inode;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct fd src_file;
	struct inode *src;
	struct btrfs_trans_handle *trans;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	char *buf;
	struct btrfs_key key;
	u32 nritems;
	int slot;
	int ret;
	u64 len = olen;
	u64 bs = root->fs_info->sb->s_blocksize;

	/*
	 * TODO:
	 * - split compressed inline extents.  annoying: we need to
	 *   decompress into destination's address_space (the file offset
	 *   may change, so source mapping won't do), then recompress (or
	 *   otherwise reinsert) a subrange.
	 * - allow ranges within the same file to be cloned (provided
	 *   they don't overlap)?
	 */

	/* the destination must be opened for writing */
	if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND))
		return -EINVAL;

	if (btrfs_root_readonly(root))
		return -EROFS;

	ret = mnt_want_write_file(file);
	if (ret)
		return ret;

	src_file = fdget(srcfd);
	if (!src_file.file) {
		ret = -EBADF;
		goto out_drop_write;
	}

	/* source and destination must live on the same mount */
	ret = -EXDEV;
	if (src_file.file->f_path.mnt != file->f_path.mnt)
		goto out_fput;

	src = src_file.file->f_dentry->d_inode;

	/* cloning a file onto itself is not supported (see TODO above) */
	ret = -EINVAL;
	if (src == inode)
		goto out_fput;

	/* the src must be open for reading */
	if (!(src_file.file->f_mode & FMODE_READ))
		goto out_fput;

	/* don't make the dst file partly checksummed */
	if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
	    (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
		goto out_fput;

	ret = -EISDIR;
	if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
		goto out_fput;

	ret = -EXDEV;
	if (src->i_sb != inode->i_sb)
		goto out_fput;

	/* scratch buffer sized for one leaf, used to stage each copied item */
	ret = -ENOMEM;
	buf = vmalloc(btrfs_level_size(root, 0));
	if (!buf)
		goto out_fput;

	path = btrfs_alloc_path();
	if (!path) {
		vfree(buf);
		goto out_fput;
	}
	path->reada = 2;

	/*
	 * Take both i_mutexes, lower inode address first, so the order does
	 * not depend on which file is source and which is destination
	 * (presumably to avoid an ABBA deadlock between two concurrent
	 * clones).
	 */
	if (inode < src) {
		mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
		mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD);
	} else {
		mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT);
		mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
	}

	/* determine range to clone */
	ret = -EINVAL;
	/* reject ranges past the source's EOF and u64 wrap-around */
	if (off + len > src->i_size || off + len < off)
		goto out_unlock;
	if (len == 0)
		olen = len = src->i_size - off;
	/* if we extend to eof, continue to block boundary */
	if (off + len == src->i_size)
		len = ALIGN(src->i_size, bs) - off;

	/* verify the end result is block aligned */
	if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) ||
	    !IS_ALIGNED(destoff, bs))
		goto out_unlock;

	/* grow the destination up to destoff first if it is shorter */
	if (destoff > inode->i_size) {
		ret = btrfs_cont_expand(inode, inode->i_size, destoff);
		if (ret)
			goto out_unlock;
	}

	/* truncate page cache pages from target inode range */
	truncate_inode_pages_range(&inode->i_data, destoff,
				   PAGE_CACHE_ALIGN(destoff + len) - 1);

	/* do any pending delalloc/csum calc on src, one way or
	   another, and lock file content */
	while (1) {
		struct btrfs_ordered_extent *ordered;
		lock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
		ordered = btrfs_lookup_first_ordered_extent(src, off + len - 1);
		/* leave the loop with the extent range still locked */
		if (!ordered &&
		    !test_range_bit(&BTRFS_I(src)->io_tree, off, off + len - 1,
				    EXTENT_DELALLOC, 0, NULL))
			break;
		unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
		if (ordered)
			btrfs_put_ordered_extent(ordered);
		btrfs_wait_ordered_range(src, off, len);
	}

	/* clone data */
	key.objectid = btrfs_ino(src);
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = 0;

	while (1) {
		/*
		 * note the key will change type as we walk through the
		 * tree.
		 */
		ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path,
				0, 0);
		if (ret < 0)
			goto out;

		nritems = btrfs_header_nritems(path->nodes[0]);
		if (path->slots[0] >= nritems) {
			ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
			if (ret < 0)
				goto out;
			if (ret > 0)
				break;
			nritems = btrfs_header_nritems(path->nodes[0]);
		}
		leaf = path->nodes[0];
		slot = path->slots[0];

		/* stop once we walk past the source inode's extent items */
		btrfs_item_key_to_cpu(leaf, &key, slot);
		if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
		    key.objectid != btrfs_ino(src))
			break;

		if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
			struct btrfs_file_extent_item *extent;
			int type;
			u32 size;
			struct btrfs_key new_key;
			u64 disko = 0, diskl = 0;
			u64 datao = 0, datal = 0;
			u8 comp;
			u64 endoff;

			/* stage the raw item in buf before dropping the path */
			size = btrfs_item_size_nr(leaf, slot);
			read_extent_buffer(leaf, buf,
					   btrfs_item_ptr_offset(leaf, slot),
					   size);

			extent = btrfs_item_ptr(leaf, slot,
						struct btrfs_file_extent_item);
			comp = btrfs_file_extent_compression(leaf, extent);
			type = btrfs_file_extent_type(leaf, extent);
			if (type == BTRFS_FILE_EXTENT_REG ||
			    type == BTRFS_FILE_EXTENT_PREALLOC) {
				disko = btrfs_file_extent_disk_bytenr(leaf,
								      extent);
				diskl = btrfs_file_extent_disk_num_bytes(leaf,
								 extent);
				datao = btrfs_file_extent_offset(leaf, extent);
				datal = btrfs_file_extent_num_bytes(leaf,
								    extent);
			} else if (type == BTRFS_FILE_EXTENT_INLINE) {
				/* take upper bound, may be compressed */
				datal = btrfs_file_extent_ram_bytes(leaf,
								    extent);
			}
			btrfs_release_path(path);

			/* skip extents entirely outside the requested range */
			if (key.offset + datal <= off ||
			    key.offset >= off + len - 1)
				goto next;

			/* build the key for the item in the destination inode */
			memcpy(&new_key, &key, sizeof(new_key));
			new_key.objectid = btrfs_ino(inode);
			if (off <= key.offset)
				new_key.offset = key.offset + destoff - off;
			else
				new_key.offset = destoff;

			/*
			 * 1 - adjusting old extent (we may have to split it)
			 * 1 - add new extent
			 * 1 - inode update
			 */
			trans = btrfs_start_transaction(root, 3);
			if (IS_ERR(trans)) {
				ret = PTR_ERR(trans);
				goto out;
			}

			if (type == BTRFS_FILE_EXTENT_REG ||
			    type == BTRFS_FILE_EXTENT_PREALLOC) {
				/*
				 *    a  | --- range to clone ---|  b
				 * | ------------- extent ------------- |
				 */

				/* subtract range b */
				if (key.offset + datal > off + len)
					datal = off + len - key.offset;

				/* subtract range a */
				if (off > key.offset) {
					datao += off - key.offset;
					datal -= off - key.offset;
				}

				/* make room in the destination for the new item */
				ret = btrfs_drop_extents(trans, root, inode,
							 new_key.offset,
							 new_key.offset + datal,
							 1);
				if (ret) {
					btrfs_abort_transaction(trans, root,
								ret);
					btrfs_end_transaction(trans, root);
					goto out;
				}

				ret = btrfs_insert_empty_item(trans, root, path,
							      &new_key, size);
				if (ret) {
					btrfs_abort_transaction(trans, root,
								ret);
					btrfs_end_transaction(trans, root);
					goto out;
				}

				leaf = path->nodes[0];
				slot = path->slots[0];
				write_extent_buffer(leaf, buf,
					    btrfs_item_ptr_offset(leaf, slot),
					    size);

				extent = btrfs_item_ptr(leaf, slot,
						struct btrfs_file_extent_item);

				/* disko == 0 means it's a hole */
				if (!disko)
					datao = 0;

				btrfs_set_file_extent_offset(leaf, extent,
							     datao);
				btrfs_set_file_extent_num_bytes(leaf, extent,
								datal);
				/* a real extent gains a reference from the dest */
				if (disko) {
					inode_add_bytes(inode, datal);
					ret = btrfs_inc_extent_ref(trans, root,
							disko, diskl, 0,
							root->root_key.objectid,
							btrfs_ino(inode),
							new_key.offset - datao,
							0);
					if (ret) {
						btrfs_abort_transaction(trans,
									root,
									ret);
						btrfs_end_transaction(trans,
								      root);
						goto out;

					}
				}
			} else if (type == BTRFS_FILE_EXTENT_INLINE) {
				/* bytes to cut from the front / back of the data */
				u64 skip = 0;
				u64 trim = 0;
				if (off > key.offset) {
					skip = off - key.offset;
					new_key.offset += skip;
				}

				if (key.offset + datal > off + len)
					trim = key.offset + datal - (off + len);

				/* compressed inline data can't be split (see TODO) */
				if (comp && (skip || trim)) {
					ret = -EINVAL;
					btrfs_end_transaction(trans, root);
					goto out;
				}
				size -= skip + trim;
				datal -= skip + trim;

				ret = btrfs_drop_extents(trans, root, inode,
							 new_key.offset,
							 new_key.offset + datal,
							 1);
				if (ret) {
					btrfs_abort_transaction(trans, root,
								ret);
					btrfs_end_transaction(trans, root);
					goto out;
				}

				ret = btrfs_insert_empty_item(trans, root, path,
							      &new_key, size);
				if (ret) {
					btrfs_abort_transaction(trans, root,
								ret);
					btrfs_end_transaction(trans, root);
					goto out;
				}

				/* shift the kept data down over the skipped bytes */
				if (skip) {
					u32 start =
					  btrfs_file_extent_calc_inline_size(0);
					memmove(buf+start, buf+start+skip,
						datal);
				}

				leaf = path->nodes[0];
				slot = path->slots[0];
				write_extent_buffer(leaf, buf,
					    btrfs_item_ptr_offset(leaf, slot),
					    size);
				inode_add_bytes(inode, datal);
			}

			btrfs_mark_buffer_dirty(leaf);
			btrfs_release_path(path);

			inode_inc_iversion(inode);
			inode->i_mtime = inode->i_ctime = CURRENT_TIME;

			/*
			 * we round up to the block size at eof when
			 * determining which extents to clone above,
			 * but shouldn't round up the file size
			 */
			endoff = new_key.offset + datal;
			if (endoff > destoff+olen)
				endoff = destoff+olen;
			if (endoff > inode->i_size)
				btrfs_i_size_write(inode, endoff);

			ret = btrfs_update_inode(trans, root, inode);
			if (ret) {
				btrfs_abort_transaction(trans, root, ret);
				btrfs_end_transaction(trans, root);
				goto out;
			}
			/*
			 * NOTE(review): this return value is overwritten by the
			 * next btrfs_search_slot() or by "ret = 0" below, so a
			 * failure here is never reported - confirm intended.
			 */
			ret = btrfs_end_transaction(trans, root);
		}
next:
		/* advance past the item just handled and search again */
		btrfs_release_path(path);
		key.offset++;
	}
	ret = 0;
out:
	btrfs_release_path(path);
	unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
out_unlock:
	mutex_unlock(&src->i_mutex);
	mutex_unlock(&inode->i_mutex);
	vfree(buf);
	btrfs_free_path(path);
out_fput:
	fdput(src_file);
out_drop_write:
	mnt_drop_write_file(file);
	return ret;
}
2755 
2756 static long btrfs_ioctl_clone_range(struct file *file, void __user *argp)
2757 {
2758 	struct btrfs_ioctl_clone_range_args args;
2759 
2760 	if (copy_from_user(&args, argp, sizeof(args)))
2761 		return -EFAULT;
2762 	return btrfs_ioctl_clone(file, args.src_fd, args.src_offset,
2763 				 args.src_length, args.dest_offset);
2764 }
2765 
2766 /*
2767  * there are many ways the trans_start and trans_end ioctls can lead
2768  * to deadlocks.  They should only be used by applications that
2769  * basically own the machine, and have a very in depth understanding
2770  * of all the possible deadlocks and enospc problems.
2771  */
2772 static long btrfs_ioctl_trans_start(struct file *file)
2773 {
2774 	struct inode *inode = fdentry(file)->d_inode;
2775 	struct btrfs_root *root = BTRFS_I(inode)->root;
2776 	struct btrfs_trans_handle *trans;
2777 	int ret;
2778 
2779 	ret = -EPERM;
2780 	if (!capable(CAP_SYS_ADMIN))
2781 		goto out;
2782 
2783 	ret = -EINPROGRESS;
2784 	if (file->private_data)
2785 		goto out;
2786 
2787 	ret = -EROFS;
2788 	if (btrfs_root_readonly(root))
2789 		goto out;
2790 
2791 	ret = mnt_want_write_file(file);
2792 	if (ret)
2793 		goto out;
2794 
2795 	atomic_inc(&root->fs_info->open_ioctl_trans);
2796 
2797 	ret = -ENOMEM;
2798 	trans = btrfs_start_ioctl_transaction(root);
2799 	if (IS_ERR(trans))
2800 		goto out_drop;
2801 
2802 	file->private_data = trans;
2803 	return 0;
2804 
2805 out_drop:
2806 	atomic_dec(&root->fs_info->open_ioctl_trans);
2807 	mnt_drop_write_file(file);
2808 out:
2809 	return ret;
2810 }
2811 
2812 static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2813 {
2814 	struct inode *inode = fdentry(file)->d_inode;
2815 	struct btrfs_root *root = BTRFS_I(inode)->root;
2816 	struct btrfs_root *new_root;
2817 	struct btrfs_dir_item *di;
2818 	struct btrfs_trans_handle *trans;
2819 	struct btrfs_path *path;
2820 	struct btrfs_key location;
2821 	struct btrfs_disk_key disk_key;
2822 	u64 objectid = 0;
2823 	u64 dir_id;
2824 
2825 	if (!capable(CAP_SYS_ADMIN))
2826 		return -EPERM;
2827 
2828 	if (copy_from_user(&objectid, argp, sizeof(objectid)))
2829 		return -EFAULT;
2830 
2831 	if (!objectid)
2832 		objectid = root->root_key.objectid;
2833 
2834 	location.objectid = objectid;
2835 	location.type = BTRFS_ROOT_ITEM_KEY;
2836 	location.offset = (u64)-1;
2837 
2838 	new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
2839 	if (IS_ERR(new_root))
2840 		return PTR_ERR(new_root);
2841 
2842 	if (btrfs_root_refs(&new_root->root_item) == 0)
2843 		return -ENOENT;
2844 
2845 	path = btrfs_alloc_path();
2846 	if (!path)
2847 		return -ENOMEM;
2848 	path->leave_spinning = 1;
2849 
2850 	trans = btrfs_start_transaction(root, 1);
2851 	if (IS_ERR(trans)) {
2852 		btrfs_free_path(path);
2853 		return PTR_ERR(trans);
2854 	}
2855 
2856 	dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
2857 	di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path,
2858 				   dir_id, "default", 7, 1);
2859 	if (IS_ERR_OR_NULL(di)) {
2860 		btrfs_free_path(path);
2861 		btrfs_end_transaction(trans, root);
2862 		printk(KERN_ERR "Umm, you don't have the default dir item, "
2863 		       "this isn't going to work\n");
2864 		return -ENOENT;
2865 	}
2866 
2867 	btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
2868 	btrfs_set_dir_item_key(path->nodes[0], di, &disk_key);
2869 	btrfs_mark_buffer_dirty(path->nodes[0]);
2870 	btrfs_free_path(path);
2871 
2872 	btrfs_set_fs_incompat(root->fs_info, DEFAULT_SUBVOL);
2873 	btrfs_end_transaction(trans, root);
2874 
2875 	return 0;
2876 }
2877 
2878 void btrfs_get_block_group_info(struct list_head *groups_list,
2879 				struct btrfs_ioctl_space_info *space)
2880 {
2881 	struct btrfs_block_group_cache *block_group;
2882 
2883 	space->total_bytes = 0;
2884 	space->used_bytes = 0;
2885 	space->flags = 0;
2886 	list_for_each_entry(block_group, groups_list, list) {
2887 		space->flags = block_group->flags;
2888 		space->total_bytes += block_group->key.offset;
2889 		space->used_bytes +=
2890 			btrfs_block_group_used(&block_group->item);
2891 	}
2892 }
2893 
2894 long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
2895 {
2896 	struct btrfs_ioctl_space_args space_args;
2897 	struct btrfs_ioctl_space_info space;
2898 	struct btrfs_ioctl_space_info *dest;
2899 	struct btrfs_ioctl_space_info *dest_orig;
2900 	struct btrfs_ioctl_space_info __user *user_dest;
2901 	struct btrfs_space_info *info;
2902 	u64 types[] = {BTRFS_BLOCK_GROUP_DATA,
2903 		       BTRFS_BLOCK_GROUP_SYSTEM,
2904 		       BTRFS_BLOCK_GROUP_METADATA,
2905 		       BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA};
2906 	int num_types = 4;
2907 	int alloc_size;
2908 	int ret = 0;
2909 	u64 slot_count = 0;
2910 	int i, c;
2911 
2912 	if (copy_from_user(&space_args,
2913 			   (struct btrfs_ioctl_space_args __user *)arg,
2914 			   sizeof(space_args)))
2915 		return -EFAULT;
2916 
2917 	for (i = 0; i < num_types; i++) {
2918 		struct btrfs_space_info *tmp;
2919 
2920 		info = NULL;
2921 		rcu_read_lock();
2922 		list_for_each_entry_rcu(tmp, &root->fs_info->space_info,
2923 					list) {
2924 			if (tmp->flags == types[i]) {
2925 				info = tmp;
2926 				break;
2927 			}
2928 		}
2929 		rcu_read_unlock();
2930 
2931 		if (!info)
2932 			continue;
2933 
2934 		down_read(&info->groups_sem);
2935 		for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
2936 			if (!list_empty(&info->block_groups[c]))
2937 				slot_count++;
2938 		}
2939 		up_read(&info->groups_sem);
2940 	}
2941 
2942 	/* space_slots == 0 means they are asking for a count */
2943 	if (space_args.space_slots == 0) {
2944 		space_args.total_spaces = slot_count;
2945 		goto out;
2946 	}
2947 
2948 	slot_count = min_t(u64, space_args.space_slots, slot_count);
2949 
2950 	alloc_size = sizeof(*dest) * slot_count;
2951 
2952 	/* we generally have at most 6 or so space infos, one for each raid
2953 	 * level.  So, a whole page should be more than enough for everyone
2954 	 */
2955 	if (alloc_size > PAGE_CACHE_SIZE)
2956 		return -ENOMEM;
2957 
2958 	space_args.total_spaces = 0;
2959 	dest = kmalloc(alloc_size, GFP_NOFS);
2960 	if (!dest)
2961 		return -ENOMEM;
2962 	dest_orig = dest;
2963 
2964 	/* now we have a buffer to copy into */
2965 	for (i = 0; i < num_types; i++) {
2966 		struct btrfs_space_info *tmp;
2967 
2968 		if (!slot_count)
2969 			break;
2970 
2971 		info = NULL;
2972 		rcu_read_lock();
2973 		list_for_each_entry_rcu(tmp, &root->fs_info->space_info,
2974 					list) {
2975 			if (tmp->flags == types[i]) {
2976 				info = tmp;
2977 				break;
2978 			}
2979 		}
2980 		rcu_read_unlock();
2981 
2982 		if (!info)
2983 			continue;
2984 		down_read(&info->groups_sem);
2985 		for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
2986 			if (!list_empty(&info->block_groups[c])) {
2987 				btrfs_get_block_group_info(
2988 					&info->block_groups[c], &space);
2989 				memcpy(dest, &space, sizeof(space));
2990 				dest++;
2991 				space_args.total_spaces++;
2992 				slot_count--;
2993 			}
2994 			if (!slot_count)
2995 				break;
2996 		}
2997 		up_read(&info->groups_sem);
2998 	}
2999 
3000 	user_dest = (struct btrfs_ioctl_space_info __user *)
3001 		(arg + sizeof(struct btrfs_ioctl_space_args));
3002 
3003 	if (copy_to_user(user_dest, dest_orig, alloc_size))
3004 		ret = -EFAULT;
3005 
3006 	kfree(dest_orig);
3007 out:
3008 	if (ret == 0 && copy_to_user(arg, &space_args, sizeof(space_args)))
3009 		ret = -EFAULT;
3010 
3011 	return ret;
3012 }
3013 
3014 /*
3015  * there are many ways the trans_start and trans_end ioctls can lead
3016  * to deadlocks.  They should only be used by applications that
3017  * basically own the machine, and have a very in depth understanding
3018  * of all the possible deadlocks and enospc problems.
3019  */
3020 long btrfs_ioctl_trans_end(struct file *file)
3021 {
3022 	struct inode *inode = fdentry(file)->d_inode;
3023 	struct btrfs_root *root = BTRFS_I(inode)->root;
3024 	struct btrfs_trans_handle *trans;
3025 
3026 	trans = file->private_data;
3027 	if (!trans)
3028 		return -EINVAL;
3029 	file->private_data = NULL;
3030 
3031 	btrfs_end_transaction(trans, root);
3032 
3033 	atomic_dec(&root->fs_info->open_ioctl_trans);
3034 
3035 	mnt_drop_write_file(file);
3036 	return 0;
3037 }
3038 
3039 static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp)
3040 {
3041 	struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
3042 	struct btrfs_trans_handle *trans;
3043 	u64 transid;
3044 	int ret;
3045 
3046 	trans = btrfs_start_transaction(root, 0);
3047 	if (IS_ERR(trans))
3048 		return PTR_ERR(trans);
3049 	transid = trans->transid;
3050 	ret = btrfs_commit_transaction_async(trans, root, 0);
3051 	if (ret) {
3052 		btrfs_end_transaction(trans, root);
3053 		return ret;
3054 	}
3055 
3056 	if (argp)
3057 		if (copy_to_user(argp, &transid, sizeof(transid)))
3058 			return -EFAULT;
3059 	return 0;
3060 }
3061 
3062 static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp)
3063 {
3064 	struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
3065 	u64 transid;
3066 
3067 	if (argp) {
3068 		if (copy_from_user(&transid, argp, sizeof(transid)))
3069 			return -EFAULT;
3070 	} else {
3071 		transid = 0;  /* current trans */
3072 	}
3073 	return btrfs_wait_for_commit(root, transid);
3074 }
3075 
3076 static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg)
3077 {
3078 	int ret;
3079 	struct btrfs_ioctl_scrub_args *sa;
3080 
3081 	if (!capable(CAP_SYS_ADMIN))
3082 		return -EPERM;
3083 
3084 	sa = memdup_user(arg, sizeof(*sa));
3085 	if (IS_ERR(sa))
3086 		return PTR_ERR(sa);
3087 
3088 	ret = btrfs_scrub_dev(root, sa->devid, sa->start, sa->end,
3089 			      &sa->progress, sa->flags & BTRFS_SCRUB_READONLY);
3090 
3091 	if (copy_to_user(arg, sa, sizeof(*sa)))
3092 		ret = -EFAULT;
3093 
3094 	kfree(sa);
3095 	return ret;
3096 }
3097 
3098 static long btrfs_ioctl_scrub_cancel(struct btrfs_root *root, void __user *arg)
3099 {
3100 	if (!capable(CAP_SYS_ADMIN))
3101 		return -EPERM;
3102 
3103 	return btrfs_scrub_cancel(root);
3104 }
3105 
3106 static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
3107 				       void __user *arg)
3108 {
3109 	struct btrfs_ioctl_scrub_args *sa;
3110 	int ret;
3111 
3112 	if (!capable(CAP_SYS_ADMIN))
3113 		return -EPERM;
3114 
3115 	sa = memdup_user(arg, sizeof(*sa));
3116 	if (IS_ERR(sa))
3117 		return PTR_ERR(sa);
3118 
3119 	ret = btrfs_scrub_progress(root, sa->devid, &sa->progress);
3120 
3121 	if (copy_to_user(arg, sa, sizeof(*sa)))
3122 		ret = -EFAULT;
3123 
3124 	kfree(sa);
3125 	return ret;
3126 }
3127 
3128 static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root,
3129 				      void __user *arg)
3130 {
3131 	struct btrfs_ioctl_get_dev_stats *sa;
3132 	int ret;
3133 
3134 	sa = memdup_user(arg, sizeof(*sa));
3135 	if (IS_ERR(sa))
3136 		return PTR_ERR(sa);
3137 
3138 	if ((sa->flags & BTRFS_DEV_STATS_RESET) && !capable(CAP_SYS_ADMIN)) {
3139 		kfree(sa);
3140 		return -EPERM;
3141 	}
3142 
3143 	ret = btrfs_get_dev_stats(root, sa);
3144 
3145 	if (copy_to_user(arg, sa, sizeof(*sa)))
3146 		ret = -EFAULT;
3147 
3148 	kfree(sa);
3149 	return ret;
3150 }
3151 
3152 static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
3153 {
3154 	int ret = 0;
3155 	int i;
3156 	u64 rel_ptr;
3157 	int size;
3158 	struct btrfs_ioctl_ino_path_args *ipa = NULL;
3159 	struct inode_fs_paths *ipath = NULL;
3160 	struct btrfs_path *path;
3161 
3162 	if (!capable(CAP_SYS_ADMIN))
3163 		return -EPERM;
3164 
3165 	path = btrfs_alloc_path();
3166 	if (!path) {
3167 		ret = -ENOMEM;
3168 		goto out;
3169 	}
3170 
3171 	ipa = memdup_user(arg, sizeof(*ipa));
3172 	if (IS_ERR(ipa)) {
3173 		ret = PTR_ERR(ipa);
3174 		ipa = NULL;
3175 		goto out;
3176 	}
3177 
3178 	size = min_t(u32, ipa->size, 4096);
3179 	ipath = init_ipath(size, root, path);
3180 	if (IS_ERR(ipath)) {
3181 		ret = PTR_ERR(ipath);
3182 		ipath = NULL;
3183 		goto out;
3184 	}
3185 
3186 	ret = paths_from_inode(ipa->inum, ipath);
3187 	if (ret < 0)
3188 		goto out;
3189 
3190 	for (i = 0; i < ipath->fspath->elem_cnt; ++i) {
3191 		rel_ptr = ipath->fspath->val[i] -
3192 			  (u64)(unsigned long)ipath->fspath->val;
3193 		ipath->fspath->val[i] = rel_ptr;
3194 	}
3195 
3196 	ret = copy_to_user((void *)(unsigned long)ipa->fspath,
3197 			   (void *)(unsigned long)ipath->fspath, size);
3198 	if (ret) {
3199 		ret = -EFAULT;
3200 		goto out;
3201 	}
3202 
3203 out:
3204 	btrfs_free_path(path);
3205 	free_ipath(ipath);
3206 	kfree(ipa);
3207 
3208 	return ret;
3209 }
3210 
3211 static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx)
3212 {
3213 	struct btrfs_data_container *inodes = ctx;
3214 	const size_t c = 3 * sizeof(u64);
3215 
3216 	if (inodes->bytes_left >= c) {
3217 		inodes->bytes_left -= c;
3218 		inodes->val[inodes->elem_cnt] = inum;
3219 		inodes->val[inodes->elem_cnt + 1] = offset;
3220 		inodes->val[inodes->elem_cnt + 2] = root;
3221 		inodes->elem_cnt += 3;
3222 	} else {
3223 		inodes->bytes_missing += c - inodes->bytes_left;
3224 		inodes->bytes_left = 0;
3225 		inodes->elem_missed += 3;
3226 	}
3227 
3228 	return 0;
3229 }
3230 
3231 static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
3232 					void __user *arg)
3233 {
3234 	int ret = 0;
3235 	int size;
3236 	struct btrfs_ioctl_logical_ino_args *loi;
3237 	struct btrfs_data_container *inodes = NULL;
3238 	struct btrfs_path *path = NULL;
3239 
3240 	if (!capable(CAP_SYS_ADMIN))
3241 		return -EPERM;
3242 
3243 	loi = memdup_user(arg, sizeof(*loi));
3244 	if (IS_ERR(loi)) {
3245 		ret = PTR_ERR(loi);
3246 		loi = NULL;
3247 		goto out;
3248 	}
3249 
3250 	path = btrfs_alloc_path();
3251 	if (!path) {
3252 		ret = -ENOMEM;
3253 		goto out;
3254 	}
3255 
3256 	size = min_t(u32, loi->size, 64 * 1024);
3257 	inodes = init_data_container(size);
3258 	if (IS_ERR(inodes)) {
3259 		ret = PTR_ERR(inodes);
3260 		inodes = NULL;
3261 		goto out;
3262 	}
3263 
3264 	ret = iterate_inodes_from_logical(loi->logical, root->fs_info, path,
3265 					  build_ino_list, inodes);
3266 	if (ret == -EINVAL)
3267 		ret = -ENOENT;
3268 	if (ret < 0)
3269 		goto out;
3270 
3271 	ret = copy_to_user((void *)(unsigned long)loi->inodes,
3272 			   (void *)(unsigned long)inodes, size);
3273 	if (ret)
3274 		ret = -EFAULT;
3275 
3276 out:
3277 	btrfs_free_path(path);
3278 	vfree(inodes);
3279 	kfree(loi);
3280 
3281 	return ret;
3282 }
3283 
3284 void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
3285 			       struct btrfs_ioctl_balance_args *bargs)
3286 {
3287 	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3288 
3289 	bargs->flags = bctl->flags;
3290 
3291 	if (atomic_read(&fs_info->balance_running))
3292 		bargs->state |= BTRFS_BALANCE_STATE_RUNNING;
3293 	if (atomic_read(&fs_info->balance_pause_req))
3294 		bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ;
3295 	if (atomic_read(&fs_info->balance_cancel_req))
3296 		bargs->state |= BTRFS_BALANCE_STATE_CANCEL_REQ;
3297 
3298 	memcpy(&bargs->data, &bctl->data, sizeof(bargs->data));
3299 	memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta));
3300 	memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys));
3301 
3302 	if (lock) {
3303 		spin_lock(&fs_info->balance_lock);
3304 		memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
3305 		spin_unlock(&fs_info->balance_lock);
3306 	} else {
3307 		memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
3308 	}
3309 }
3310 
3311 static long btrfs_ioctl_balance(struct file *file, void __user *arg)
3312 {
3313 	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3314 	struct btrfs_fs_info *fs_info = root->fs_info;
3315 	struct btrfs_ioctl_balance_args *bargs;
3316 	struct btrfs_balance_control *bctl;
3317 	int ret;
3318 
3319 	if (!capable(CAP_SYS_ADMIN))
3320 		return -EPERM;
3321 
3322 	ret = mnt_want_write_file(file);
3323 	if (ret)
3324 		return ret;
3325 
3326 	mutex_lock(&fs_info->volume_mutex);
3327 	mutex_lock(&fs_info->balance_mutex);
3328 
3329 	if (arg) {
3330 		bargs = memdup_user(arg, sizeof(*bargs));
3331 		if (IS_ERR(bargs)) {
3332 			ret = PTR_ERR(bargs);
3333 			goto out;
3334 		}
3335 
3336 		if (bargs->flags & BTRFS_BALANCE_RESUME) {
3337 			if (!fs_info->balance_ctl) {
3338 				ret = -ENOTCONN;
3339 				goto out_bargs;
3340 			}
3341 
3342 			bctl = fs_info->balance_ctl;
3343 			spin_lock(&fs_info->balance_lock);
3344 			bctl->flags |= BTRFS_BALANCE_RESUME;
3345 			spin_unlock(&fs_info->balance_lock);
3346 
3347 			goto do_balance;
3348 		}
3349 	} else {
3350 		bargs = NULL;
3351 	}
3352 
3353 	if (fs_info->balance_ctl) {
3354 		ret = -EINPROGRESS;
3355 		goto out_bargs;
3356 	}
3357 
3358 	bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
3359 	if (!bctl) {
3360 		ret = -ENOMEM;
3361 		goto out_bargs;
3362 	}
3363 
3364 	bctl->fs_info = fs_info;
3365 	if (arg) {
3366 		memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
3367 		memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
3368 		memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys));
3369 
3370 		bctl->flags = bargs->flags;
3371 	} else {
3372 		/* balance everything - no filters */
3373 		bctl->flags |= BTRFS_BALANCE_TYPE_MASK;
3374 	}
3375 
3376 do_balance:
3377 	ret = btrfs_balance(bctl, bargs);
3378 	/*
3379 	 * bctl is freed in __cancel_balance or in free_fs_info if
3380 	 * restriper was paused all the way until unmount
3381 	 */
3382 	if (arg) {
3383 		if (copy_to_user(arg, bargs, sizeof(*bargs)))
3384 			ret = -EFAULT;
3385 	}
3386 
3387 out_bargs:
3388 	kfree(bargs);
3389 out:
3390 	mutex_unlock(&fs_info->balance_mutex);
3391 	mutex_unlock(&fs_info->volume_mutex);
3392 	mnt_drop_write_file(file);
3393 	return ret;
3394 }
3395 
3396 static long btrfs_ioctl_balance_ctl(struct btrfs_root *root, int cmd)
3397 {
3398 	if (!capable(CAP_SYS_ADMIN))
3399 		return -EPERM;
3400 
3401 	switch (cmd) {
3402 	case BTRFS_BALANCE_CTL_PAUSE:
3403 		return btrfs_pause_balance(root->fs_info);
3404 	case BTRFS_BALANCE_CTL_CANCEL:
3405 		return btrfs_cancel_balance(root->fs_info);
3406 	}
3407 
3408 	return -EINVAL;
3409 }
3410 
3411 static long btrfs_ioctl_balance_progress(struct btrfs_root *root,
3412 					 void __user *arg)
3413 {
3414 	struct btrfs_fs_info *fs_info = root->fs_info;
3415 	struct btrfs_ioctl_balance_args *bargs;
3416 	int ret = 0;
3417 
3418 	if (!capable(CAP_SYS_ADMIN))
3419 		return -EPERM;
3420 
3421 	mutex_lock(&fs_info->balance_mutex);
3422 	if (!fs_info->balance_ctl) {
3423 		ret = -ENOTCONN;
3424 		goto out;
3425 	}
3426 
3427 	bargs = kzalloc(sizeof(*bargs), GFP_NOFS);
3428 	if (!bargs) {
3429 		ret = -ENOMEM;
3430 		goto out;
3431 	}
3432 
3433 	update_ioctl_balance_args(fs_info, 1, bargs);
3434 
3435 	if (copy_to_user(arg, bargs, sizeof(*bargs)))
3436 		ret = -EFAULT;
3437 
3438 	kfree(bargs);
3439 out:
3440 	mutex_unlock(&fs_info->balance_mutex);
3441 	return ret;
3442 }
3443 
3444 static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg)
3445 {
3446 	struct btrfs_ioctl_quota_ctl_args *sa;
3447 	struct btrfs_trans_handle *trans = NULL;
3448 	int ret;
3449 	int err;
3450 
3451 	if (!capable(CAP_SYS_ADMIN))
3452 		return -EPERM;
3453 
3454 	if (root->fs_info->sb->s_flags & MS_RDONLY)
3455 		return -EROFS;
3456 
3457 	sa = memdup_user(arg, sizeof(*sa));
3458 	if (IS_ERR(sa))
3459 		return PTR_ERR(sa);
3460 
3461 	if (sa->cmd != BTRFS_QUOTA_CTL_RESCAN) {
3462 		trans = btrfs_start_transaction(root, 2);
3463 		if (IS_ERR(trans)) {
3464 			ret = PTR_ERR(trans);
3465 			goto out;
3466 		}
3467 	}
3468 
3469 	switch (sa->cmd) {
3470 	case BTRFS_QUOTA_CTL_ENABLE:
3471 		ret = btrfs_quota_enable(trans, root->fs_info);
3472 		break;
3473 	case BTRFS_QUOTA_CTL_DISABLE:
3474 		ret = btrfs_quota_disable(trans, root->fs_info);
3475 		break;
3476 	case BTRFS_QUOTA_CTL_RESCAN:
3477 		ret = btrfs_quota_rescan(root->fs_info);
3478 		break;
3479 	default:
3480 		ret = -EINVAL;
3481 		break;
3482 	}
3483 
3484 	if (copy_to_user(arg, sa, sizeof(*sa)))
3485 		ret = -EFAULT;
3486 
3487 	if (trans) {
3488 		err = btrfs_commit_transaction(trans, root);
3489 		if (err && !ret)
3490 			ret = err;
3491 	}
3492 
3493 out:
3494 	kfree(sa);
3495 	return ret;
3496 }
3497 
3498 static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg)
3499 {
3500 	struct btrfs_ioctl_qgroup_assign_args *sa;
3501 	struct btrfs_trans_handle *trans;
3502 	int ret;
3503 	int err;
3504 
3505 	if (!capable(CAP_SYS_ADMIN))
3506 		return -EPERM;
3507 
3508 	if (root->fs_info->sb->s_flags & MS_RDONLY)
3509 		return -EROFS;
3510 
3511 	sa = memdup_user(arg, sizeof(*sa));
3512 	if (IS_ERR(sa))
3513 		return PTR_ERR(sa);
3514 
3515 	trans = btrfs_join_transaction(root);
3516 	if (IS_ERR(trans)) {
3517 		ret = PTR_ERR(trans);
3518 		goto out;
3519 	}
3520 
3521 	/* FIXME: check if the IDs really exist */
3522 	if (sa->assign) {
3523 		ret = btrfs_add_qgroup_relation(trans, root->fs_info,
3524 						sa->src, sa->dst);
3525 	} else {
3526 		ret = btrfs_del_qgroup_relation(trans, root->fs_info,
3527 						sa->src, sa->dst);
3528 	}
3529 
3530 	err = btrfs_end_transaction(trans, root);
3531 	if (err && !ret)
3532 		ret = err;
3533 
3534 out:
3535 	kfree(sa);
3536 	return ret;
3537 }
3538 
3539 static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg)
3540 {
3541 	struct btrfs_ioctl_qgroup_create_args *sa;
3542 	struct btrfs_trans_handle *trans;
3543 	int ret;
3544 	int err;
3545 
3546 	if (!capable(CAP_SYS_ADMIN))
3547 		return -EPERM;
3548 
3549 	if (root->fs_info->sb->s_flags & MS_RDONLY)
3550 		return -EROFS;
3551 
3552 	sa = memdup_user(arg, sizeof(*sa));
3553 	if (IS_ERR(sa))
3554 		return PTR_ERR(sa);
3555 
3556 	trans = btrfs_join_transaction(root);
3557 	if (IS_ERR(trans)) {
3558 		ret = PTR_ERR(trans);
3559 		goto out;
3560 	}
3561 
3562 	/* FIXME: check if the IDs really exist */
3563 	if (sa->create) {
3564 		ret = btrfs_create_qgroup(trans, root->fs_info, sa->qgroupid,
3565 					  NULL);
3566 	} else {
3567 		ret = btrfs_remove_qgroup(trans, root->fs_info, sa->qgroupid);
3568 	}
3569 
3570 	err = btrfs_end_transaction(trans, root);
3571 	if (err && !ret)
3572 		ret = err;
3573 
3574 out:
3575 	kfree(sa);
3576 	return ret;
3577 }
3578 
3579 static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg)
3580 {
3581 	struct btrfs_ioctl_qgroup_limit_args *sa;
3582 	struct btrfs_trans_handle *trans;
3583 	int ret;
3584 	int err;
3585 	u64 qgroupid;
3586 
3587 	if (!capable(CAP_SYS_ADMIN))
3588 		return -EPERM;
3589 
3590 	if (root->fs_info->sb->s_flags & MS_RDONLY)
3591 		return -EROFS;
3592 
3593 	sa = memdup_user(arg, sizeof(*sa));
3594 	if (IS_ERR(sa))
3595 		return PTR_ERR(sa);
3596 
3597 	trans = btrfs_join_transaction(root);
3598 	if (IS_ERR(trans)) {
3599 		ret = PTR_ERR(trans);
3600 		goto out;
3601 	}
3602 
3603 	qgroupid = sa->qgroupid;
3604 	if (!qgroupid) {
3605 		/* take the current subvol as qgroup */
3606 		qgroupid = root->root_key.objectid;
3607 	}
3608 
3609 	/* FIXME: check if the IDs really exist */
3610 	ret = btrfs_limit_qgroup(trans, root->fs_info, qgroupid, &sa->lim);
3611 
3612 	err = btrfs_end_transaction(trans, root);
3613 	if (err && !ret)
3614 		ret = err;
3615 
3616 out:
3617 	kfree(sa);
3618 	return ret;
3619 }
3620 
3621 static long btrfs_ioctl_set_received_subvol(struct file *file,
3622 					    void __user *arg)
3623 {
3624 	struct btrfs_ioctl_received_subvol_args *sa = NULL;
3625 	struct inode *inode = fdentry(file)->d_inode;
3626 	struct btrfs_root *root = BTRFS_I(inode)->root;
3627 	struct btrfs_root_item *root_item = &root->root_item;
3628 	struct btrfs_trans_handle *trans;
3629 	struct timespec ct = CURRENT_TIME;
3630 	int ret = 0;
3631 
3632 	ret = mnt_want_write_file(file);
3633 	if (ret < 0)
3634 		return ret;
3635 
3636 	down_write(&root->fs_info->subvol_sem);
3637 
3638 	if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) {
3639 		ret = -EINVAL;
3640 		goto out;
3641 	}
3642 
3643 	if (btrfs_root_readonly(root)) {
3644 		ret = -EROFS;
3645 		goto out;
3646 	}
3647 
3648 	if (!inode_owner_or_capable(inode)) {
3649 		ret = -EACCES;
3650 		goto out;
3651 	}
3652 
3653 	sa = memdup_user(arg, sizeof(*sa));
3654 	if (IS_ERR(sa)) {
3655 		ret = PTR_ERR(sa);
3656 		sa = NULL;
3657 		goto out;
3658 	}
3659 
3660 	trans = btrfs_start_transaction(root, 1);
3661 	if (IS_ERR(trans)) {
3662 		ret = PTR_ERR(trans);
3663 		trans = NULL;
3664 		goto out;
3665 	}
3666 
3667 	sa->rtransid = trans->transid;
3668 	sa->rtime.sec = ct.tv_sec;
3669 	sa->rtime.nsec = ct.tv_nsec;
3670 
3671 	memcpy(root_item->received_uuid, sa->uuid, BTRFS_UUID_SIZE);
3672 	btrfs_set_root_stransid(root_item, sa->stransid);
3673 	btrfs_set_root_rtransid(root_item, sa->rtransid);
3674 	root_item->stime.sec = cpu_to_le64(sa->stime.sec);
3675 	root_item->stime.nsec = cpu_to_le32(sa->stime.nsec);
3676 	root_item->rtime.sec = cpu_to_le64(sa->rtime.sec);
3677 	root_item->rtime.nsec = cpu_to_le32(sa->rtime.nsec);
3678 
3679 	ret = btrfs_update_root(trans, root->fs_info->tree_root,
3680 				&root->root_key, &root->root_item);
3681 	if (ret < 0) {
3682 		btrfs_end_transaction(trans, root);
3683 		trans = NULL;
3684 		goto out;
3685 	} else {
3686 		ret = btrfs_commit_transaction(trans, root);
3687 		if (ret < 0)
3688 			goto out;
3689 	}
3690 
3691 	ret = copy_to_user(arg, sa, sizeof(*sa));
3692 	if (ret)
3693 		ret = -EFAULT;
3694 
3695 out:
3696 	kfree(sa);
3697 	up_write(&root->fs_info->subvol_sem);
3698 	mnt_drop_write_file(file);
3699 	return ret;
3700 }
3701 
3702 long btrfs_ioctl(struct file *file, unsigned int
3703 		cmd, unsigned long arg)
3704 {
3705 	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
3706 	void __user *argp = (void __user *)arg;
3707 
3708 	switch (cmd) {
3709 	case FS_IOC_GETFLAGS:
3710 		return btrfs_ioctl_getflags(file, argp);
3711 	case FS_IOC_SETFLAGS:
3712 		return btrfs_ioctl_setflags(file, argp);
3713 	case FS_IOC_GETVERSION:
3714 		return btrfs_ioctl_getversion(file, argp);
3715 	case FITRIM:
3716 		return btrfs_ioctl_fitrim(file, argp);
3717 	case BTRFS_IOC_SNAP_CREATE:
3718 		return btrfs_ioctl_snap_create(file, argp, 0);
3719 	case BTRFS_IOC_SNAP_CREATE_V2:
3720 		return btrfs_ioctl_snap_create_v2(file, argp, 0);
3721 	case BTRFS_IOC_SUBVOL_CREATE:
3722 		return btrfs_ioctl_snap_create(file, argp, 1);
3723 	case BTRFS_IOC_SUBVOL_CREATE_V2:
3724 		return btrfs_ioctl_snap_create_v2(file, argp, 1);
3725 	case BTRFS_IOC_SNAP_DESTROY:
3726 		return btrfs_ioctl_snap_destroy(file, argp);
3727 	case BTRFS_IOC_SUBVOL_GETFLAGS:
3728 		return btrfs_ioctl_subvol_getflags(file, argp);
3729 	case BTRFS_IOC_SUBVOL_SETFLAGS:
3730 		return btrfs_ioctl_subvol_setflags(file, argp);
3731 	case BTRFS_IOC_DEFAULT_SUBVOL:
3732 		return btrfs_ioctl_default_subvol(file, argp);
3733 	case BTRFS_IOC_DEFRAG:
3734 		return btrfs_ioctl_defrag(file, NULL);
3735 	case BTRFS_IOC_DEFRAG_RANGE:
3736 		return btrfs_ioctl_defrag(file, argp);
3737 	case BTRFS_IOC_RESIZE:
3738 		return btrfs_ioctl_resize(root, argp);
3739 	case BTRFS_IOC_ADD_DEV:
3740 		return btrfs_ioctl_add_dev(root, argp);
3741 	case BTRFS_IOC_RM_DEV:
3742 		return btrfs_ioctl_rm_dev(root, argp);
3743 	case BTRFS_IOC_FS_INFO:
3744 		return btrfs_ioctl_fs_info(root, argp);
3745 	case BTRFS_IOC_DEV_INFO:
3746 		return btrfs_ioctl_dev_info(root, argp);
3747 	case BTRFS_IOC_BALANCE:
3748 		return btrfs_ioctl_balance(file, NULL);
3749 	case BTRFS_IOC_CLONE:
3750 		return btrfs_ioctl_clone(file, arg, 0, 0, 0);
3751 	case BTRFS_IOC_CLONE_RANGE:
3752 		return btrfs_ioctl_clone_range(file, argp);
3753 	case BTRFS_IOC_TRANS_START:
3754 		return btrfs_ioctl_trans_start(file);
3755 	case BTRFS_IOC_TRANS_END:
3756 		return btrfs_ioctl_trans_end(file);
3757 	case BTRFS_IOC_TREE_SEARCH:
3758 		return btrfs_ioctl_tree_search(file, argp);
3759 	case BTRFS_IOC_INO_LOOKUP:
3760 		return btrfs_ioctl_ino_lookup(file, argp);
3761 	case BTRFS_IOC_INO_PATHS:
3762 		return btrfs_ioctl_ino_to_path(root, argp);
3763 	case BTRFS_IOC_LOGICAL_INO:
3764 		return btrfs_ioctl_logical_to_ino(root, argp);
3765 	case BTRFS_IOC_SPACE_INFO:
3766 		return btrfs_ioctl_space_info(root, argp);
3767 	case BTRFS_IOC_SYNC:
3768 		btrfs_sync_fs(file->f_dentry->d_sb, 1);
3769 		return 0;
3770 	case BTRFS_IOC_START_SYNC:
3771 		return btrfs_ioctl_start_sync(file, argp);
3772 	case BTRFS_IOC_WAIT_SYNC:
3773 		return btrfs_ioctl_wait_sync(file, argp);
3774 	case BTRFS_IOC_SCRUB:
3775 		return btrfs_ioctl_scrub(root, argp);
3776 	case BTRFS_IOC_SCRUB_CANCEL:
3777 		return btrfs_ioctl_scrub_cancel(root, argp);
3778 	case BTRFS_IOC_SCRUB_PROGRESS:
3779 		return btrfs_ioctl_scrub_progress(root, argp);
3780 	case BTRFS_IOC_BALANCE_V2:
3781 		return btrfs_ioctl_balance(file, argp);
3782 	case BTRFS_IOC_BALANCE_CTL:
3783 		return btrfs_ioctl_balance_ctl(root, arg);
3784 	case BTRFS_IOC_BALANCE_PROGRESS:
3785 		return btrfs_ioctl_balance_progress(root, argp);
3786 	case BTRFS_IOC_SET_RECEIVED_SUBVOL:
3787 		return btrfs_ioctl_set_received_subvol(file, argp);
3788 	case BTRFS_IOC_SEND:
3789 		return btrfs_ioctl_send(file, argp);
3790 	case BTRFS_IOC_GET_DEV_STATS:
3791 		return btrfs_ioctl_get_dev_stats(root, argp);
3792 	case BTRFS_IOC_QUOTA_CTL:
3793 		return btrfs_ioctl_quota_ctl(root, argp);
3794 	case BTRFS_IOC_QGROUP_ASSIGN:
3795 		return btrfs_ioctl_qgroup_assign(root, argp);
3796 	case BTRFS_IOC_QGROUP_CREATE:
3797 		return btrfs_ioctl_qgroup_create(root, argp);
3798 	case BTRFS_IOC_QGROUP_LIMIT:
3799 		return btrfs_ioctl_qgroup_limit(root, argp);
3800 	}
3801 
3802 	return -ENOTTY;
3803 }
3804