xref: /linux/fs/btrfs/inode.c (revision a1ad803322a904a250fa901020b4a4dfaf51a829)
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18 
19 #include <linux/kernel.h>
20 #include <linux/bio.h>
21 #include <linux/buffer_head.h>
22 #include <linux/file.h>
23 #include <linux/fs.h>
24 #include <linux/pagemap.h>
25 #include <linux/highmem.h>
26 #include <linux/time.h>
27 #include <linux/init.h>
28 #include <linux/string.h>
29 #include <linux/backing-dev.h>
30 #include <linux/mpage.h>
31 #include <linux/swap.h>
32 #include <linux/writeback.h>
33 #include <linux/statfs.h>
34 #include <linux/compat.h>
35 #include <linux/bit_spinlock.h>
36 #include <linux/xattr.h>
37 #include <linux/posix_acl.h>
38 #include <linux/falloc.h>
39 #include <linux/slab.h>
40 #include <linux/ratelimit.h>
41 #include <linux/mount.h>
42 #include "compat.h"
43 #include "ctree.h"
44 #include "disk-io.h"
45 #include "transaction.h"
46 #include "btrfs_inode.h"
47 #include "ioctl.h"
48 #include "print-tree.h"
49 #include "ordered-data.h"
50 #include "xattr.h"
51 #include "tree-log.h"
52 #include "volumes.h"
53 #include "compression.h"
54 #include "locking.h"
55 #include "free-space-cache.h"
56 #include "inode-map.h"
57 
58 struct btrfs_iget_args {
59 	u64 ino;
60 	struct btrfs_root *root;
61 };
62 
63 static const struct inode_operations btrfs_dir_inode_operations;
64 static const struct inode_operations btrfs_symlink_inode_operations;
65 static const struct inode_operations btrfs_dir_ro_inode_operations;
66 static const struct inode_operations btrfs_special_inode_operations;
67 static const struct inode_operations btrfs_file_inode_operations;
68 static const struct address_space_operations btrfs_aops;
69 static const struct address_space_operations btrfs_symlink_aops;
70 static const struct file_operations btrfs_dir_file_operations;
71 static struct extent_io_ops btrfs_extent_io_ops;
72 
73 static struct kmem_cache *btrfs_inode_cachep;
74 struct kmem_cache *btrfs_trans_handle_cachep;
75 struct kmem_cache *btrfs_transaction_cachep;
76 struct kmem_cache *btrfs_path_cachep;
77 struct kmem_cache *btrfs_free_space_cachep;
78 
79 #define S_SHIFT 12
80 static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
81 	[S_IFREG >> S_SHIFT]	= BTRFS_FT_REG_FILE,
82 	[S_IFDIR >> S_SHIFT]	= BTRFS_FT_DIR,
83 	[S_IFCHR >> S_SHIFT]	= BTRFS_FT_CHRDEV,
84 	[S_IFBLK >> S_SHIFT]	= BTRFS_FT_BLKDEV,
85 	[S_IFIFO >> S_SHIFT]	= BTRFS_FT_FIFO,
86 	[S_IFSOCK >> S_SHIFT]	= BTRFS_FT_SOCK,
87 	[S_IFLNK >> S_SHIFT]	= BTRFS_FT_SYMLINK,
88 };
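/*
 * the table above is indexed by the format bits of i_mode shifted down
 * by S_SHIFT.  A sketch of the lookup (the helper that actually does
 * this lives further down in this file):
 *
 *	u8 type = btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
 */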
89 
90 static int btrfs_setsize(struct inode *inode, loff_t newsize);
91 static int btrfs_truncate(struct inode *inode);
92 static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
93 static noinline int cow_file_range(struct inode *inode,
94 				   struct page *locked_page,
95 				   u64 start, u64 end, int *page_started,
96 				   unsigned long *nr_written, int unlock);
97 static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
98 				struct btrfs_root *root, struct inode *inode);
99 
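/*
 * set up the security attributes of a freshly created inode: first the
 * ACLs inherited from the parent directory, then the security xattrs
 */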
100 static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
101 				     struct inode *inode,  struct inode *dir,
102 				     const struct qstr *qstr)
103 {
104 	int err;
105 
106 	err = btrfs_init_acl(trans, inode, dir);
107 	if (!err)
108 		err = btrfs_xattr_security_init(trans, inode, dir, qstr);
109 	return err;
110 }
111 
112 /*
113  * this does all the hard work for inserting an inline extent into
114  * the btree.  The caller should have done a btrfs_drop_extents so that
115  * no overlapping inline items exist in the btree
116  */
117 static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
118 				struct btrfs_root *root, struct inode *inode,
119 				u64 start, size_t size, size_t compressed_size,
120 				int compress_type,
121 				struct page **compressed_pages)
122 {
123 	struct btrfs_key key;
124 	struct btrfs_path *path;
125 	struct extent_buffer *leaf;
126 	struct page *page = NULL;
127 	char *kaddr;
128 	unsigned long ptr;
129 	struct btrfs_file_extent_item *ei;
130 	int err = 0;
131 	int ret;
132 	size_t cur_size = size;
133 	size_t datasize;
134 	unsigned long offset;
135 
136 	if (compressed_size && compressed_pages)
137 		cur_size = compressed_size;
138 
139 	path = btrfs_alloc_path();
140 	if (!path)
141 		return -ENOMEM;
142 
143 	path->leave_spinning = 1;
144 
145 	key.objectid = btrfs_ino(inode);
146 	key.offset = start;
147 	btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
148 	datasize = btrfs_file_extent_calc_inline_size(cur_size);
149 
150 	inode_add_bytes(inode, size);
151 	ret = btrfs_insert_empty_item(trans, root, path, &key,
152 				      datasize);
154 	if (ret) {
155 		err = ret;
156 		goto fail;
157 	}
158 	leaf = path->nodes[0];
159 	ei = btrfs_item_ptr(leaf, path->slots[0],
160 			    struct btrfs_file_extent_item);
161 	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
162 	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
163 	btrfs_set_file_extent_encryption(leaf, ei, 0);
164 	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
165 	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
166 	ptr = btrfs_file_extent_inline_start(ei);
167 
168 	if (compress_type != BTRFS_COMPRESS_NONE) {
169 		struct page *cpage;
170 		int i = 0;
171 		while (compressed_size > 0) {
172 			cpage = compressed_pages[i];
173 			cur_size = min_t(unsigned long, compressed_size,
174 				       PAGE_CACHE_SIZE);
175 
176 			kaddr = kmap_atomic(cpage, KM_USER0);
177 			write_extent_buffer(leaf, kaddr, ptr, cur_size);
178 			kunmap_atomic(kaddr, KM_USER0);
179 
180 			i++;
181 			ptr += cur_size;
182 			compressed_size -= cur_size;
183 		}
184 		btrfs_set_file_extent_compression(leaf, ei,
185 						  compress_type);
186 	} else {
187 		page = find_get_page(inode->i_mapping,
188 				     start >> PAGE_CACHE_SHIFT);
189 		btrfs_set_file_extent_compression(leaf, ei, 0);
190 		kaddr = kmap_atomic(page, KM_USER0);
191 		offset = start & (PAGE_CACHE_SIZE - 1);
192 		write_extent_buffer(leaf, kaddr + offset, ptr, size);
193 		kunmap_atomic(kaddr, KM_USER0);
194 		page_cache_release(page);
195 	}
196 	btrfs_mark_buffer_dirty(leaf);
197 	btrfs_free_path(path);
198 
199 	/*
200 	 * we're an inline extent, so nobody can
201 	 * extend the file past i_size without locking
202 	 * a page we already have locked.
203 	 *
204 	 * We must do any isize and inode updates
205 	 * before we unlock the pages.  Otherwise we
206 	 * could end up racing with unlink.
207 	 */
208 	BTRFS_I(inode)->disk_i_size = inode->i_size;
209 	btrfs_update_inode(trans, root, inode);
210 
211 	return 0;
212 fail:
213 	btrfs_free_path(path);
214 	return err;
215 }
216 
217 
218 /*
219  * conditionally insert an inline extent into the file.  This
220  * does the checks required to make sure the data is small enough
221  * to fit as an inline extent.
222  */
223 static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
224 				 struct btrfs_root *root,
225 				 struct inode *inode, u64 start, u64 end,
226 				 size_t compressed_size, int compress_type,
227 				 struct page **compressed_pages)
228 {
229 	u64 isize = i_size_read(inode);
230 	u64 actual_end = min(end + 1, isize);
231 	u64 inline_len = actual_end - start;
232 	u64 aligned_end = (end + root->sectorsize - 1) &
233 			~((u64)root->sectorsize - 1);
234 	u64 hint_byte;
235 	u64 data_len = inline_len;
236 	int ret;
237 
238 	if (compressed_size)
239 		data_len = compressed_size;
240 
241 	if (start > 0 ||
242 	    actual_end >= PAGE_CACHE_SIZE ||
243 	    data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
244 	    (!compressed_size &&
245 	    (actual_end & (root->sectorsize - 1)) == 0) ||
246 	    end + 1 < isize ||
247 	    data_len > root->fs_info->max_inline) {
248 		return 1;
249 	}
250 
251 	ret = btrfs_drop_extents(trans, inode, start, aligned_end,
252 				 &hint_byte, 1);
253 	BUG_ON(ret);
254 
255 	if (isize > actual_end)
256 		inline_len = min_t(u64, isize, actual_end);
257 	ret = insert_inline_extent(trans, root, inode, start,
258 				   inline_len, compressed_size,
259 				   compress_type, compressed_pages);
260 	BUG_ON(ret);
261 	btrfs_delalloc_release_metadata(inode, end + 1 - start);
262 	btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
263 	return 0;
264 }
265 
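/*
 * one unit of deferred COW work.  Phase one (compress_file_range) fills
 * these in and chains them onto async_cow->extents; phase two
 * (submit_compressed_extents) walks that list and does the actual
 * allocation and IO submission
 */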
266 struct async_extent {
267 	u64 start;
268 	u64 ram_size;
269 	u64 compressed_size;
270 	struct page **pages;
271 	unsigned long nr_pages;
272 	int compress_type;
273 	struct list_head list;
274 };
275 
276 struct async_cow {
277 	struct inode *inode;
278 	struct btrfs_root *root;
279 	struct page *locked_page;
280 	u64 start;
281 	u64 end;
282 	struct list_head extents;
283 	struct btrfs_work work;
284 };
285 
286 static noinline int add_async_extent(struct async_cow *cow,
287 				     u64 start, u64 ram_size,
288 				     u64 compressed_size,
289 				     struct page **pages,
290 				     unsigned long nr_pages,
291 				     int compress_type)
292 {
293 	struct async_extent *async_extent;
294 
295 	async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
296 	BUG_ON(!async_extent);
297 	async_extent->start = start;
298 	async_extent->ram_size = ram_size;
299 	async_extent->compressed_size = compressed_size;
300 	async_extent->pages = pages;
301 	async_extent->nr_pages = nr_pages;
302 	async_extent->compress_type = compress_type;
303 	list_add_tail(&async_extent->list, &cow->extents);
304 	return 0;
305 }
306 
307 /*
308  * we create compressed extents in two phases.  The first
309  * phase compresses a range of pages that have already been
310  * locked (both pages and state bits are locked).
311  *
312  * This is done inside an ordered work queue, and the compression
313  * is spread across many cpus.  The actual IO submission is step
314  * two, and the ordered work queue takes care of making sure that
315  * happens in the same order things were put onto the queue by
316  * writepages and friends.
317  *
318  * If this code finds it can't get good compression, it puts an
319  * entry onto the work queue to write the uncompressed bytes.  This
320  * makes sure that both compressed inodes and uncompressed inodes
321  * are written in the same order that pdflush sent them down.
322  */
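/*
 * a rough sketch of the flow described above:
 *
 *	writepages -> run_delalloc_range -> cow_file_range_async
 *		work.func	  = async_cow_start  (phase one, compress)
 *		work.ordered_func = async_cow_submit (phase two, submit)
 *		work.ordered_free = async_cow_free
 */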
323 static noinline int compress_file_range(struct inode *inode,
324 					struct page *locked_page,
325 					u64 start, u64 end,
326 					struct async_cow *async_cow,
327 					int *num_added)
328 {
329 	struct btrfs_root *root = BTRFS_I(inode)->root;
330 	struct btrfs_trans_handle *trans;
331 	u64 num_bytes;
332 	u64 blocksize = root->sectorsize;
333 	u64 actual_end;
334 	u64 isize = i_size_read(inode);
335 	int ret = 0;
336 	struct page **pages = NULL;
337 	unsigned long nr_pages;
338 	unsigned long nr_pages_ret = 0;
339 	unsigned long total_compressed = 0;
340 	unsigned long total_in = 0;
341 	unsigned long max_compressed = 128 * 1024;
342 	unsigned long max_uncompressed = 128 * 1024;
343 	int i;
344 	int will_compress;
345 	int compress_type = root->fs_info->compress_type;
346 
347 	/* if this is a small write inside eof, kick off a defrag */
348 	if (end <= BTRFS_I(inode)->disk_i_size && (end - start + 1) < 16 * 1024)
349 		btrfs_add_inode_defrag(NULL, inode);
350 
351 	actual_end = min_t(u64, isize, end + 1);
352 again:
353 	will_compress = 0;
354 	nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
355 	nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
356 
357 	/*
358 	 * we don't want to send crud past the end of i_size through
359 	 * compression, that's just a waste of CPU time.  So, if the
360 	 * end of the file is before the start of our current
361 	 * requested range of bytes, we bail out to the uncompressed
362 	 * cleanup code that can deal with all of this.
363 	 *
364 	 * It isn't really the fastest way to fix things, but this is a
365 	 * very uncommon corner.
366 	 */
367 	if (actual_end <= start)
368 		goto cleanup_and_bail_uncompressed;
369 
370 	total_compressed = actual_end - start;
371 
372 	/* we want to make sure that amount of ram required to uncompress
373 	 * an extent is reasonable, so we limit the total size in ram
374 	 * of a compressed extent to 128k.  This is a crucial number
375 	 * because it also controls how easily we can spread reads across
376 	 * cpus for decompression.
377 	 *
378 	 * We also want to make sure the amount of IO required to do
379 	 * a random read is reasonably small, so we limit the size of
380 	 * a compressed extent to 128k.
381 	 */
382 	total_compressed = min(total_compressed, max_uncompressed);
383 	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
384 	num_bytes = max(blocksize,  num_bytes);
385 	total_in = 0;
386 	ret = 0;
387 
388 	/*
389 	 * we do compression for mount -o compress and when the
390 	 * inode has not been flagged as nocompress.  This flag can
391 	 * change at any time if we discover bad compression ratios.
392 	 */
393 	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
394 	    (btrfs_test_opt(root, COMPRESS) ||
395 	     (BTRFS_I(inode)->force_compress) ||
396 	     (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
397 		WARN_ON(pages);
398 		pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
399 		if (!pages) {
400 			/* just bail out to the uncompressed code */
401 			goto cont;
402 		}
403 
404 		if (BTRFS_I(inode)->force_compress)
405 			compress_type = BTRFS_I(inode)->force_compress;
406 
407 		ret = btrfs_compress_pages(compress_type,
408 					   inode->i_mapping, start,
409 					   total_compressed, pages,
410 					   nr_pages, &nr_pages_ret,
411 					   &total_in,
412 					   &total_compressed,
413 					   max_compressed);
414 
415 		if (!ret) {
416 			unsigned long offset = total_compressed &
417 				(PAGE_CACHE_SIZE - 1);
418 			struct page *page = pages[nr_pages_ret - 1];
419 			char *kaddr;
420 
421 			/* zero the tail end of the last page, we might be
422 			 * sending it down to disk
423 			 */
424 			if (offset) {
425 				kaddr = kmap_atomic(page, KM_USER0);
426 				memset(kaddr + offset, 0,
427 				       PAGE_CACHE_SIZE - offset);
428 				kunmap_atomic(kaddr, KM_USER0);
429 			}
430 			will_compress = 1;
431 		}
432 	}
433 cont:
434 	if (start == 0) {
435 		trans = btrfs_join_transaction(root);
436 		BUG_ON(IS_ERR(trans));
437 		trans->block_rsv = &root->fs_info->delalloc_block_rsv;
438 
439 		/* lets try to make an inline extent */
440 		if (ret || total_in < (actual_end - start)) {
441 			/* we didn't compress the entire range, try
442 			 * to make an uncompressed inline extent.
443 			 */
444 			ret = cow_file_range_inline(trans, root, inode,
445 						    start, end, 0, 0, NULL);
446 		} else {
447 			/* try making a compressed inline extent */
448 			ret = cow_file_range_inline(trans, root, inode,
449 						    start, end,
450 						    total_compressed,
451 						    compress_type, pages);
452 		}
453 		if (ret == 0) {
454 			/*
455 			 * inline extent creation worked, we don't need
456 			 * to create any more async work items.  Unlock
457 			 * and free up our temp pages.
458 			 */
459 			extent_clear_unlock_delalloc(inode,
460 			     &BTRFS_I(inode)->io_tree,
461 			     start, end, NULL,
462 			     EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
463 			     EXTENT_CLEAR_DELALLOC |
464 			     EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);
465 
466 			btrfs_end_transaction(trans, root);
467 			goto free_pages_out;
468 		}
469 		btrfs_end_transaction(trans, root);
470 	}
471 
472 	if (will_compress) {
473 		/*
474 		 * we aren't doing an inline extent, so round the compressed size
475 		 * up to a block size boundary so the allocator does sane
476 		 * things
477 		 */
478 		total_compressed = (total_compressed + blocksize - 1) &
479 			~(blocksize - 1);
480 
481 		/*
482 		 * one last check to make sure the compression is really a
483 		 * win, compare the page count read with the blocks on disk
484 		 */
485 		total_in = (total_in + PAGE_CACHE_SIZE - 1) &
486 			~(PAGE_CACHE_SIZE - 1);
487 		if (total_compressed >= total_in) {
488 			will_compress = 0;
489 		} else {
490 			num_bytes = total_in;
491 		}
492 	}
493 	if (!will_compress && pages) {
494 		/*
495 		 * the compression code ran but failed to make things smaller,
496 		 * free any pages it allocated and our page pointer array
497 		 */
498 		for (i = 0; i < nr_pages_ret; i++) {
499 			WARN_ON(pages[i]->mapping);
500 			page_cache_release(pages[i]);
501 		}
502 		kfree(pages);
503 		pages = NULL;
504 		total_compressed = 0;
505 		nr_pages_ret = 0;
506 
507 		/* flag the file so we don't compress in the future */
508 		if (!btrfs_test_opt(root, FORCE_COMPRESS) &&
509 		    !(BTRFS_I(inode)->force_compress)) {
510 			BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
511 		}
512 	}
513 	if (will_compress) {
514 		*num_added += 1;
515 
516 		/* the async work queues will take care of doing actual
517 		 * allocation on disk for these compressed pages,
518 		 * and will submit them to the elevator.
519 		 */
520 		add_async_extent(async_cow, start, num_bytes,
521 				 total_compressed, pages, nr_pages_ret,
522 				 compress_type);
523 
524 		if (start + num_bytes < end) {
525 			start += num_bytes;
526 			pages = NULL;
527 			cond_resched();
528 			goto again;
529 		}
530 	} else {
531 cleanup_and_bail_uncompressed:
532 		/*
533 		 * No compression, but we still need to write the pages in
534 		 * the file we've been given so far.  redirty the locked
535 		 * page if it corresponds to our extent and set things up
536 		 * for the async work queue to run cow_file_range to do
537 		 * the normal delalloc dance
538 		 */
539 		if (page_offset(locked_page) >= start &&
540 		    page_offset(locked_page) <= end) {
541 			__set_page_dirty_nobuffers(locked_page);
542 			/* unlocked later on in the async handlers */
543 		}
544 		add_async_extent(async_cow, start, end - start + 1,
545 				 0, NULL, 0, BTRFS_COMPRESS_NONE);
546 		*num_added += 1;
547 	}
548 
549 out:
550 	return 0;
551 
552 free_pages_out:
553 	for (i = 0; i < nr_pages_ret; i++) {
554 		WARN_ON(pages[i]->mapping);
555 		page_cache_release(pages[i]);
556 	}
557 	kfree(pages);
558 
559 	goto out;
560 }
561 
562 /*
563  * phase two of compressed writeback.  This is the ordered portion
564  * of the code, which only gets called in the order the work was
565  * queued.  We walk all the async extents created by compress_file_range
566  * and send them down to the disk.
567  */
568 static noinline int submit_compressed_extents(struct inode *inode,
569 					      struct async_cow *async_cow)
570 {
571 	struct async_extent *async_extent;
572 	u64 alloc_hint = 0;
573 	struct btrfs_trans_handle *trans;
574 	struct btrfs_key ins;
575 	struct extent_map *em;
576 	struct btrfs_root *root = BTRFS_I(inode)->root;
577 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
578 	struct extent_io_tree *io_tree;
579 	int ret = 0;
580 
581 	if (list_empty(&async_cow->extents))
582 		return 0;
583 
584 
585 	while (!list_empty(&async_cow->extents)) {
586 		async_extent = list_entry(async_cow->extents.next,
587 					  struct async_extent, list);
588 		list_del(&async_extent->list);
589 
590 		io_tree = &BTRFS_I(inode)->io_tree;
591 
592 retry:
593 		/* did the compression code fall back to uncompressed IO? */
594 		if (!async_extent->pages) {
595 			int page_started = 0;
596 			unsigned long nr_written = 0;
597 
598 			lock_extent(io_tree, async_extent->start,
599 					 async_extent->start +
600 					 async_extent->ram_size - 1, GFP_NOFS);
601 
602 			/* allocate blocks */
603 			ret = cow_file_range(inode, async_cow->locked_page,
604 					     async_extent->start,
605 					     async_extent->start +
606 					     async_extent->ram_size - 1,
607 					     &page_started, &nr_written, 0);
608 
609 			/*
610 			 * if page_started, cow_file_range inserted an
611 			 * inline extent and took care of all the unlocking
612 			 * and IO for us.  Otherwise, we need to submit
613 			 * all those pages down to the drive.
614 			 */
615 			if (!page_started && !ret)
616 				extent_write_locked_range(io_tree,
617 						  inode, async_extent->start,
618 						  async_extent->start +
619 						  async_extent->ram_size - 1,
620 						  btrfs_get_extent,
621 						  WB_SYNC_ALL);
622 			kfree(async_extent);
623 			cond_resched();
624 			continue;
625 		}
626 
627 		lock_extent(io_tree, async_extent->start,
628 			    async_extent->start + async_extent->ram_size - 1,
629 			    GFP_NOFS);
630 
631 		trans = btrfs_join_transaction(root);
632 		BUG_ON(IS_ERR(trans));
633 		trans->block_rsv = &root->fs_info->delalloc_block_rsv;
634 		ret = btrfs_reserve_extent(trans, root,
635 					   async_extent->compressed_size,
636 					   async_extent->compressed_size,
637 					   0, alloc_hint,
638 					   (u64)-1, &ins, 1);
639 		btrfs_end_transaction(trans, root);
640 
641 		if (ret) {
642 			int i;
643 			for (i = 0; i < async_extent->nr_pages; i++) {
644 				WARN_ON(async_extent->pages[i]->mapping);
645 				page_cache_release(async_extent->pages[i]);
646 			}
647 			kfree(async_extent->pages);
648 			async_extent->nr_pages = 0;
649 			async_extent->pages = NULL;
650 			unlock_extent(io_tree, async_extent->start,
651 				      async_extent->start +
652 				      async_extent->ram_size - 1, GFP_NOFS);
653 			goto retry;
654 		}
655 
656 		/*
657 		 * here we're doing allocation and writeback of the
658 		 * compressed pages
659 		 */
660 		btrfs_drop_extent_cache(inode, async_extent->start,
661 					async_extent->start +
662 					async_extent->ram_size - 1, 0);
663 
664 		em = alloc_extent_map();
665 		BUG_ON(!em);
666 		em->start = async_extent->start;
667 		em->len = async_extent->ram_size;
668 		em->orig_start = em->start;
669 
670 		em->block_start = ins.objectid;
671 		em->block_len = ins.offset;
672 		em->bdev = root->fs_info->fs_devices->latest_bdev;
673 		em->compress_type = async_extent->compress_type;
674 		set_bit(EXTENT_FLAG_PINNED, &em->flags);
675 		set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
676 
677 		while (1) {
678 			write_lock(&em_tree->lock);
679 			ret = add_extent_mapping(em_tree, em);
680 			write_unlock(&em_tree->lock);
681 			if (ret != -EEXIST) {
682 				free_extent_map(em);
683 				break;
684 			}
685 			btrfs_drop_extent_cache(inode, async_extent->start,
686 						async_extent->start +
687 						async_extent->ram_size - 1, 0);
688 		}
689 
690 		ret = btrfs_add_ordered_extent_compress(inode,
691 						async_extent->start,
692 						ins.objectid,
693 						async_extent->ram_size,
694 						ins.offset,
695 						BTRFS_ORDERED_COMPRESSED,
696 						async_extent->compress_type);
697 		BUG_ON(ret);
698 
699 		/*
700 		 * clear dirty, set writeback and unlock the pages.
701 		 */
702 		extent_clear_unlock_delalloc(inode,
703 				&BTRFS_I(inode)->io_tree,
704 				async_extent->start,
705 				async_extent->start +
706 				async_extent->ram_size - 1,
707 				NULL, EXTENT_CLEAR_UNLOCK_PAGE |
708 				EXTENT_CLEAR_UNLOCK |
709 				EXTENT_CLEAR_DELALLOC |
710 				EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK);
711 
712 		ret = btrfs_submit_compressed_write(inode,
713 				    async_extent->start,
714 				    async_extent->ram_size,
715 				    ins.objectid,
716 				    ins.offset, async_extent->pages,
717 				    async_extent->nr_pages);
718 
719 		BUG_ON(ret);
720 		alloc_hint = ins.objectid + ins.offset;
721 		kfree(async_extent);
722 		cond_resched();
723 	}
724 
725 	return 0;
726 }
727 
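/*
 * pick an allocation hint for a delalloc write: prefer the disk block
 * that already backs this range, fall back to the first real block
 * mapped in this inode, and return 0 (no hint) if neither exists
 */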
728 static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
729 				      u64 num_bytes)
730 {
731 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
732 	struct extent_map *em;
733 	u64 alloc_hint = 0;
734 
735 	read_lock(&em_tree->lock);
736 	em = search_extent_mapping(em_tree, start, num_bytes);
737 	if (em) {
738 		/*
739 		 * if block start isn't an actual block number then find the
740 		 * first block in this inode and use that as a hint.  If that
741 		 * block is also bogus then just don't worry about it.
742 		 */
743 		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
744 			free_extent_map(em);
745 			em = search_extent_mapping(em_tree, 0, 0);
746 			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
747 				alloc_hint = em->block_start;
748 			if (em)
749 				free_extent_map(em);
750 		} else {
751 			alloc_hint = em->block_start;
752 			free_extent_map(em);
753 		}
754 	}
755 	read_unlock(&em_tree->lock);
756 
757 	return alloc_hint;
758 }
759 
760 /*
761  * when extent_io.c finds a delayed allocation range in the file,
762  * the call backs end up in this code.  The basic idea is to
763  * allocate extents on disk for the range, and create ordered data structs
764  * in ram to track those extents.
765  *
766  * locked_page is the page that writepage had locked already.  We use
767  * it to make sure we don't do extra locks or unlocks.
768  *
769  * *page_started is set to one if we unlock locked_page and do everything
770  * required to start IO on it.  It may be clean and already done with
771  * IO when we return.
772  */
773 static noinline int cow_file_range(struct inode *inode,
774 				   struct page *locked_page,
775 				   u64 start, u64 end, int *page_started,
776 				   unsigned long *nr_written,
777 				   int unlock)
778 {
779 	struct btrfs_root *root = BTRFS_I(inode)->root;
780 	struct btrfs_trans_handle *trans;
781 	u64 alloc_hint = 0;
782 	u64 num_bytes;
783 	unsigned long ram_size;
784 	u64 disk_num_bytes;
785 	u64 cur_alloc_size;
786 	u64 blocksize = root->sectorsize;
787 	struct btrfs_key ins;
788 	struct extent_map *em;
789 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
790 	int ret = 0;
791 
792 	BUG_ON(btrfs_is_free_space_inode(root, inode));
793 	trans = btrfs_join_transaction(root);
794 	BUG_ON(IS_ERR(trans));
795 	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
796 
797 	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
798 	num_bytes = max(blocksize,  num_bytes);
799 	disk_num_bytes = num_bytes;
800 	ret = 0;
801 
802 	/* if this is a small write inside eof, kick off defrag */
803 	if (end <= BTRFS_I(inode)->disk_i_size && num_bytes < 64 * 1024)
804 		btrfs_add_inode_defrag(trans, inode);
805 
806 	if (start == 0) {
807 		/* lets try to make an inline extent */
808 		ret = cow_file_range_inline(trans, root, inode,
809 					    start, end, 0, 0, NULL);
810 		if (ret == 0) {
811 			extent_clear_unlock_delalloc(inode,
812 				     &BTRFS_I(inode)->io_tree,
813 				     start, end, NULL,
814 				     EXTENT_CLEAR_UNLOCK_PAGE |
815 				     EXTENT_CLEAR_UNLOCK |
816 				     EXTENT_CLEAR_DELALLOC |
817 				     EXTENT_CLEAR_DIRTY |
818 				     EXTENT_SET_WRITEBACK |
819 				     EXTENT_END_WRITEBACK);
820 
821 			*nr_written = *nr_written +
822 			     (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
823 			*page_started = 1;
824 			ret = 0;
825 			goto out;
826 		}
827 	}
828 
829 	BUG_ON(disk_num_bytes >
830 	       btrfs_super_total_bytes(root->fs_info->super_copy));
831 
832 	alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
833 	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
834 
835 	while (disk_num_bytes > 0) {
836 		unsigned long op;
837 
838 		cur_alloc_size = disk_num_bytes;
839 		ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
840 					   root->sectorsize, 0, alloc_hint,
841 					   (u64)-1, &ins, 1);
842 		BUG_ON(ret);
843 
844 		em = alloc_extent_map();
845 		BUG_ON(!em);
846 		em->start = start;
847 		em->orig_start = em->start;
848 		ram_size = ins.offset;
849 		em->len = ins.offset;
850 
851 		em->block_start = ins.objectid;
852 		em->block_len = ins.offset;
853 		em->bdev = root->fs_info->fs_devices->latest_bdev;
854 		set_bit(EXTENT_FLAG_PINNED, &em->flags);
855 
856 		while (1) {
857 			write_lock(&em_tree->lock);
858 			ret = add_extent_mapping(em_tree, em);
859 			write_unlock(&em_tree->lock);
860 			if (ret != -EEXIST) {
861 				free_extent_map(em);
862 				break;
863 			}
864 			btrfs_drop_extent_cache(inode, start,
865 						start + ram_size - 1, 0);
866 		}
867 
868 		cur_alloc_size = ins.offset;
869 		ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
870 					       ram_size, cur_alloc_size, 0);
871 		BUG_ON(ret);
872 
873 		if (root->root_key.objectid ==
874 		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
875 			ret = btrfs_reloc_clone_csums(inode, start,
876 						      cur_alloc_size);
877 			BUG_ON(ret);
878 		}
879 
880 		if (disk_num_bytes < cur_alloc_size)
881 			break;
882 
883 		/* we're not doing compressed IO, don't unlock the first
884 		 * page (which the caller expects to stay locked), don't
885 		 * clear any dirty bits and don't set any writeback bits
886 		 *
887 		 * Do set the Private2 bit so we know this page was properly
888 		 * setup for writepage
889 		 */
890 		op = unlock ? EXTENT_CLEAR_UNLOCK_PAGE : 0;
891 		op |= EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
892 			EXTENT_SET_PRIVATE2;
893 
894 		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
895 					     start, start + ram_size - 1,
896 					     locked_page, op);
897 		disk_num_bytes -= cur_alloc_size;
898 		num_bytes -= cur_alloc_size;
899 		alloc_hint = ins.objectid + ins.offset;
900 		start += cur_alloc_size;
901 	}
902 out:
903 	ret = 0;
904 	btrfs_end_transaction(trans, root);
905 
906 	return ret;
907 }
908 
909 /*
910  * work queue call back to start compression on a file and pages
911  */
912 static noinline void async_cow_start(struct btrfs_work *work)
913 {
914 	struct async_cow *async_cow;
915 	int num_added = 0;
916 	async_cow = container_of(work, struct async_cow, work);
917 
918 	compress_file_range(async_cow->inode, async_cow->locked_page,
919 			    async_cow->start, async_cow->end, async_cow,
920 			    &num_added);
921 	if (num_added == 0)
922 		async_cow->inode = NULL;
923 }
924 
925 /*
926  * work queue call back to submit previously compressed pages
927  */
928 static noinline void async_cow_submit(struct btrfs_work *work)
929 {
930 	struct async_cow *async_cow;
931 	struct btrfs_root *root;
932 	unsigned long nr_pages;
933 
934 	async_cow = container_of(work, struct async_cow, work);
935 
936 	root = async_cow->root;
937 	nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
938 		PAGE_CACHE_SHIFT;
939 
940 	atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages);
941 
942 	if (atomic_read(&root->fs_info->async_delalloc_pages) <
943 	    5 * 1024 * 1024 &&
944 	    waitqueue_active(&root->fs_info->async_submit_wait))
945 		wake_up(&root->fs_info->async_submit_wait);
946 
947 	if (async_cow->inode)
948 		submit_compressed_extents(async_cow->inode, async_cow);
949 }
950 
951 static noinline void async_cow_free(struct btrfs_work *work)
952 {
953 	struct async_cow *async_cow;
954 	async_cow = container_of(work, struct async_cow, work);
955 	kfree(async_cow);
956 }
957 
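/*
 * kick off the async COW machinery: chop [start, end] into chunks of at
 * most 512k (or one chunk for nocompress inodes), queue an async_cow
 * work item per chunk on the delalloc workers, and throttle the caller
 * once too many async delalloc pages are in flight
 */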
958 static int cow_file_range_async(struct inode *inode, struct page *locked_page,
959 				u64 start, u64 end, int *page_started,
960 				unsigned long *nr_written)
961 {
962 	struct async_cow *async_cow;
963 	struct btrfs_root *root = BTRFS_I(inode)->root;
964 	unsigned long nr_pages;
965 	u64 cur_end;
966 	int limit = 10 * 1024 * 1024;
967 
968 	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
969 			 1, 0, NULL, GFP_NOFS);
970 	while (start < end) {
971 		async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
972 		BUG_ON(!async_cow);
973 		async_cow->inode = inode;
974 		async_cow->root = root;
975 		async_cow->locked_page = locked_page;
976 		async_cow->start = start;
977 
978 		if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
979 			cur_end = end;
980 		else
981 			cur_end = min(end, start + 512 * 1024 - 1);
982 
983 		async_cow->end = cur_end;
984 		INIT_LIST_HEAD(&async_cow->extents);
985 
986 		async_cow->work.func = async_cow_start;
987 		async_cow->work.ordered_func = async_cow_submit;
988 		async_cow->work.ordered_free = async_cow_free;
989 		async_cow->work.flags = 0;
990 
991 		nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
992 			PAGE_CACHE_SHIFT;
993 		atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
994 
995 		btrfs_queue_worker(&root->fs_info->delalloc_workers,
996 				   &async_cow->work);
997 
998 		if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
999 			wait_event(root->fs_info->async_submit_wait,
1000 			   (atomic_read(&root->fs_info->async_delalloc_pages) <
1001 			    limit));
1002 		}
1003 
1004 		while (atomic_read(&root->fs_info->async_submit_draining) &&
1005 		      atomic_read(&root->fs_info->async_delalloc_pages)) {
1006 			wait_event(root->fs_info->async_submit_wait,
1007 			  (atomic_read(&root->fs_info->async_delalloc_pages) ==
1008 			   0));
1009 		}
1010 
1011 		*nr_written += nr_pages;
1012 		start = cur_end + 1;
1013 	}
1014 	*page_started = 1;
1015 	return 0;
1016 }
1017 
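/*
 * returns 1 if any checksums exist for the given byte range, 0 if none
 * do.  We only care whether the list that comes back is empty, so free
 * any sums it contains
 */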
1018 static noinline int csum_exist_in_range(struct btrfs_root *root,
1019 					u64 bytenr, u64 num_bytes)
1020 {
1021 	int ret;
1022 	struct btrfs_ordered_sum *sums;
1023 	LIST_HEAD(list);
1024 
1025 	ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
1026 				       bytenr + num_bytes - 1, &list, 0);
1027 	if (ret == 0 && list_empty(&list))
1028 		return 0;
1029 
1030 	while (!list_empty(&list)) {
1031 		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
1032 		list_del(&sums->list);
1033 		kfree(sums);
1034 	}
1035 	return 1;
1036 }
1037 
1038 /*
1039  * when nocow writeback calls back.  This checks for snapshots or COW copies
1040  * of the extents that exist in the file, and COWs the file as required.
1041  *
1042  * If no cow copies or snapshots exist, we write directly to the existing
1043  * blocks on disk
1044  */
1045 static noinline int run_delalloc_nocow(struct inode *inode,
1046 				       struct page *locked_page,
1047 			      u64 start, u64 end, int *page_started, int force,
1048 			      unsigned long *nr_written)
1049 {
1050 	struct btrfs_root *root = BTRFS_I(inode)->root;
1051 	struct btrfs_trans_handle *trans;
1052 	struct extent_buffer *leaf;
1053 	struct btrfs_path *path;
1054 	struct btrfs_file_extent_item *fi;
1055 	struct btrfs_key found_key;
1056 	u64 cow_start;
1057 	u64 cur_offset;
1058 	u64 extent_end;
1059 	u64 extent_offset;
1060 	u64 disk_bytenr;
1061 	u64 num_bytes;
1062 	int extent_type;
1063 	int ret;
1064 	int type;
1065 	int nocow;
1066 	int check_prev = 1;
1067 	bool nolock;
1068 	u64 ino = btrfs_ino(inode);
1069 
1070 	path = btrfs_alloc_path();
1071 	if (!path)
1072 		return -ENOMEM;
1073 
1074 	nolock = btrfs_is_free_space_inode(root, inode);
1075 
1076 	if (nolock)
1077 		trans = btrfs_join_transaction_nolock(root);
1078 	else
1079 		trans = btrfs_join_transaction(root);
1080 
1081 	BUG_ON(IS_ERR(trans));
1082 	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1083 
1084 	cow_start = (u64)-1;
1085 	cur_offset = start;
1086 	while (1) {
1087 		ret = btrfs_lookup_file_extent(trans, root, path, ino,
1088 					       cur_offset, 0);
1089 		BUG_ON(ret < 0);
1090 		if (ret > 0 && path->slots[0] > 0 && check_prev) {
1091 			leaf = path->nodes[0];
1092 			btrfs_item_key_to_cpu(leaf, &found_key,
1093 					      path->slots[0] - 1);
1094 			if (found_key.objectid == ino &&
1095 			    found_key.type == BTRFS_EXTENT_DATA_KEY)
1096 				path->slots[0]--;
1097 		}
1098 		check_prev = 0;
1099 next_slot:
1100 		leaf = path->nodes[0];
1101 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1102 			ret = btrfs_next_leaf(root, path);
1103 			BUG_ON(ret < 0);
1105 			if (ret > 0)
1106 				break;
1107 			leaf = path->nodes[0];
1108 		}
1109 
1110 		nocow = 0;
1111 		disk_bytenr = 0;
1112 		num_bytes = 0;
1113 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1114 
1115 		if (found_key.objectid > ino ||
1116 		    found_key.type > BTRFS_EXTENT_DATA_KEY ||
1117 		    found_key.offset > end)
1118 			break;
1119 
1120 		if (found_key.offset > cur_offset) {
1121 			extent_end = found_key.offset;
1122 			extent_type = 0;
1123 			goto out_check;
1124 		}
1125 
1126 		fi = btrfs_item_ptr(leaf, path->slots[0],
1127 				    struct btrfs_file_extent_item);
1128 		extent_type = btrfs_file_extent_type(leaf, fi);
1129 
1130 		if (extent_type == BTRFS_FILE_EXTENT_REG ||
1131 		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1132 			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1133 			extent_offset = btrfs_file_extent_offset(leaf, fi);
1134 			extent_end = found_key.offset +
1135 				btrfs_file_extent_num_bytes(leaf, fi);
1136 			if (extent_end <= start) {
1137 				path->slots[0]++;
1138 				goto next_slot;
1139 			}
1140 			if (disk_bytenr == 0)
1141 				goto out_check;
1142 			if (btrfs_file_extent_compression(leaf, fi) ||
1143 			    btrfs_file_extent_encryption(leaf, fi) ||
1144 			    btrfs_file_extent_other_encoding(leaf, fi))
1145 				goto out_check;
1146 			if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
1147 				goto out_check;
1148 			if (btrfs_extent_readonly(root, disk_bytenr))
1149 				goto out_check;
1150 			if (btrfs_cross_ref_exist(trans, root, ino,
1151 						  found_key.offset -
1152 						  extent_offset, disk_bytenr))
1153 				goto out_check;
1154 			disk_bytenr += extent_offset;
1155 			disk_bytenr += cur_offset - found_key.offset;
1156 			num_bytes = min(end + 1, extent_end) - cur_offset;
1157 			/*
1158 			 * force cow if csum exists in the range.
1159 			 * this ensures that csums for a given extent are
1160 			 * either valid or do not exist.
1161 			 */
1162 			if (csum_exist_in_range(root, disk_bytenr, num_bytes))
1163 				goto out_check;
1164 			nocow = 1;
1165 		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1166 			extent_end = found_key.offset +
1167 				btrfs_file_extent_inline_len(leaf, fi);
1168 			extent_end = ALIGN(extent_end, root->sectorsize);
1169 		} else {
1170 			BUG_ON(1);
1171 		}
1172 out_check:
1173 		if (extent_end <= start) {
1174 			path->slots[0]++;
1175 			goto next_slot;
1176 		}
1177 		if (!nocow) {
1178 			if (cow_start == (u64)-1)
1179 				cow_start = cur_offset;
1180 			cur_offset = extent_end;
1181 			if (cur_offset > end)
1182 				break;
1183 			path->slots[0]++;
1184 			goto next_slot;
1185 		}
1186 
1187 		btrfs_release_path(path);
1188 		if (cow_start != (u64)-1) {
1189 			ret = cow_file_range(inode, locked_page, cow_start,
1190 					found_key.offset - 1, page_started,
1191 					nr_written, 1);
1192 			BUG_ON(ret);
1193 			cow_start = (u64)-1;
1194 		}
1195 
1196 		if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1197 			struct extent_map *em;
1198 			struct extent_map_tree *em_tree;
1199 			em_tree = &BTRFS_I(inode)->extent_tree;
1200 			em = alloc_extent_map();
1201 			BUG_ON(!em);
1202 			em->start = cur_offset;
1203 			em->orig_start = em->start;
1204 			em->len = num_bytes;
1205 			em->block_len = num_bytes;
1206 			em->block_start = disk_bytenr;
1207 			em->bdev = root->fs_info->fs_devices->latest_bdev;
1208 			set_bit(EXTENT_FLAG_PINNED, &em->flags);
1209 			while (1) {
1210 				write_lock(&em_tree->lock);
1211 				ret = add_extent_mapping(em_tree, em);
1212 				write_unlock(&em_tree->lock);
1213 				if (ret != -EEXIST) {
1214 					free_extent_map(em);
1215 					break;
1216 				}
1217 				btrfs_drop_extent_cache(inode, em->start,
1218 						em->start + em->len - 1, 0);
1219 			}
1220 			type = BTRFS_ORDERED_PREALLOC;
1221 		} else {
1222 			type = BTRFS_ORDERED_NOCOW;
1223 		}
1224 
1225 		ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
1226 					       num_bytes, num_bytes, type);
1227 		BUG_ON(ret);
1228 
1229 		if (root->root_key.objectid ==
1230 		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
1231 			ret = btrfs_reloc_clone_csums(inode, cur_offset,
1232 						      num_bytes);
1233 			BUG_ON(ret);
1234 		}
1235 
1236 		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
1237 				cur_offset, cur_offset + num_bytes - 1,
1238 				locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
1239 				EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
1240 				EXTENT_SET_PRIVATE2);
1241 		cur_offset = extent_end;
1242 		if (cur_offset > end)
1243 			break;
1244 	}
1245 	btrfs_release_path(path);
1246 
1247 	if (cur_offset <= end && cow_start == (u64)-1)
1248 		cow_start = cur_offset;
1249 	if (cow_start != (u64)-1) {
1250 		ret = cow_file_range(inode, locked_page, cow_start, end,
1251 				     page_started, nr_written, 1);
1252 		BUG_ON(ret);
1253 	}
1254 
1255 	if (nolock) {
1256 		ret = btrfs_end_transaction_nolock(trans, root);
1257 		BUG_ON(ret);
1258 	} else {
1259 		ret = btrfs_end_transaction(trans, root);
1260 		BUG_ON(ret);
1261 	}
1262 	btrfs_free_path(path);
1263 	return 0;
1264 }
1265 
1266 /*
1267  * extent_io.c call back to do delayed allocation processing
1268  */
1269 static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1270 			      u64 start, u64 end, int *page_started,
1271 			      unsigned long *nr_written)
1272 {
1273 	int ret;
1274 	struct btrfs_root *root = BTRFS_I(inode)->root;
1275 
1276 	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)
1277 		ret = run_delalloc_nocow(inode, locked_page, start, end,
1278 					 page_started, 1, nr_written);
1279 	else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)
1280 		ret = run_delalloc_nocow(inode, locked_page, start, end,
1281 					 page_started, 0, nr_written);
1282 	else if (!btrfs_test_opt(root, COMPRESS) &&
1283 		 !(BTRFS_I(inode)->force_compress) &&
1284 		 !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))
1285 		ret = cow_file_range(inode, locked_page, start, end,
1286 				      page_started, nr_written, 1);
1287 	else
1288 		ret = cow_file_range_async(inode, locked_page, start, end,
1289 					   page_started, nr_written);
1290 	return ret;
1291 }
1292 
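/*
 * extent_io.c split_extent_hook, used when one delalloc extent is split
 * in two: that means one more outstanding extent to reserve metadata
 * for.  The merge hook below is the inverse
 */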
1293 static void btrfs_split_extent_hook(struct inode *inode,
1294 				    struct extent_state *orig, u64 split)
1295 {
1296 	/* not delalloc, ignore it */
1297 	if (!(orig->state & EXTENT_DELALLOC))
1298 		return;
1299 
1300 	spin_lock(&BTRFS_I(inode)->lock);
1301 	BTRFS_I(inode)->outstanding_extents++;
1302 	spin_unlock(&BTRFS_I(inode)->lock);
1303 }
1304 
1305 /*
1306  * extent_io.c merge_extent_hook, used to track merged delayed allocation
1307  * extents so we can keep track of new extents that are just merged onto old
1308  * extents, such as when we are doing sequential writes, so we can properly
1309  * account for the metadata space we'll need.
1310  */
1311 static void btrfs_merge_extent_hook(struct inode *inode,
1312 				    struct extent_state *new,
1313 				    struct extent_state *other)
1314 {
1315 	/* not delalloc, ignore it */
1316 	if (!(other->state & EXTENT_DELALLOC))
1317 		return;
1318 
1319 	spin_lock(&BTRFS_I(inode)->lock);
1320 	BTRFS_I(inode)->outstanding_extents--;
1321 	spin_unlock(&BTRFS_I(inode)->lock);
1322 }
1323 
1324 /*
1325  * extent_io.c set_bit_hook, used to track delayed allocation
1326  * bytes in this file, and to maintain the list of inodes that
1327  * have pending delalloc work to be done.
1328  */
1329 static void btrfs_set_bit_hook(struct inode *inode,
1330 			       struct extent_state *state, int *bits)
1331 {
1332 
1333 	/*
1334 	 * set_bit and clear_bit hooks normally require _irqsave/restore
1335 	 * but in this case, we are only testing for the DELALLOC
1336 	 * bit, which is only set or cleared with irqs on
1337 	 */
1338 	if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1339 		struct btrfs_root *root = BTRFS_I(inode)->root;
1340 		u64 len = state->end + 1 - state->start;
1341 		bool do_list = !btrfs_is_free_space_inode(root, inode);
1342 
1343 		if (*bits & EXTENT_FIRST_DELALLOC) {
1344 			*bits &= ~EXTENT_FIRST_DELALLOC;
1345 		} else {
1346 			spin_lock(&BTRFS_I(inode)->lock);
1347 			BTRFS_I(inode)->outstanding_extents++;
1348 			spin_unlock(&BTRFS_I(inode)->lock);
1349 		}
1350 
1351 		spin_lock(&root->fs_info->delalloc_lock);
1352 		BTRFS_I(inode)->delalloc_bytes += len;
1353 		root->fs_info->delalloc_bytes += len;
1354 		if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1355 			list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1356 				      &root->fs_info->delalloc_inodes);
1357 		}
1358 		spin_unlock(&root->fs_info->delalloc_lock);
1359 	}
1360 }
1361 
1362 /*
1363  * extent_io.c clear_bit_hook, see set_bit_hook for why
1364  */
1365 static void btrfs_clear_bit_hook(struct inode *inode,
1366 				 struct extent_state *state, int *bits)
1367 {
1368 	/*
1369 	 * set_bit and clear_bit hooks normally require _irqsave/restore
1370 	 * but in this case, we are only testing for the DELALLOC
1371 	 * bit, which is only set or cleared with irqs on
1372 	 */
1373 	if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1374 		struct btrfs_root *root = BTRFS_I(inode)->root;
1375 		u64 len = state->end + 1 - state->start;
1376 		bool do_list = !btrfs_is_free_space_inode(root, inode);
1377 
1378 		if (*bits & EXTENT_FIRST_DELALLOC) {
1379 			*bits &= ~EXTENT_FIRST_DELALLOC;
1380 		} else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
1381 			spin_lock(&BTRFS_I(inode)->lock);
1382 			BTRFS_I(inode)->outstanding_extents--;
1383 			spin_unlock(&BTRFS_I(inode)->lock);
1384 		}
1385 
1386 		if (*bits & EXTENT_DO_ACCOUNTING)
1387 			btrfs_delalloc_release_metadata(inode, len);
1388 
1389 		if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
1390 		    && do_list)
1391 			btrfs_free_reserved_data_space(inode, len);
1392 
1393 		spin_lock(&root->fs_info->delalloc_lock);
1394 		root->fs_info->delalloc_bytes -= len;
1395 		BTRFS_I(inode)->delalloc_bytes -= len;
1396 
1397 		if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
1398 		    !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1399 			list_del_init(&BTRFS_I(inode)->delalloc_inodes);
1400 		}
1401 		spin_unlock(&root->fs_info->delalloc_lock);
1402 	}
1403 }
1404 
1405 /*
1406  * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
1407  * we don't create bios that span stripes or chunks
1408  */
1409 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1410 			 size_t size, struct bio *bio,
1411 			 unsigned long bio_flags)
1412 {
1413 	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
1414 	struct btrfs_mapping_tree *map_tree;
1415 	u64 logical = (u64)bio->bi_sector << 9;
1416 	u64 length = 0;
1417 	u64 map_length;
1418 	int ret;
1419 
1420 	if (bio_flags & EXTENT_BIO_COMPRESSED)
1421 		return 0;
1422 
1423 	length = bio->bi_size;
1424 	map_tree = &root->fs_info->mapping_tree;
1425 	map_length = length;
1426 	ret = btrfs_map_block(map_tree, READ, logical,
1427 			      &map_length, NULL, 0);
1428 
1429 	if (map_length < length + size)
1430 		return 1;
1431 	return ret;
1432 }
1433 
1434 /*
1435  * in order to insert checksums into the metadata in large chunks,
1436  * we wait until bio submission time.   All the pages in the bio are
1437  * checksummed and sums are attached onto the ordered extent record.
1438  *
1439  * At IO completion time the csums attached on the ordered extent record
1440  * are inserted into the btree
1441  */
1442 static int __btrfs_submit_bio_start(struct inode *inode, int rw,
1443 				    struct bio *bio, int mirror_num,
1444 				    unsigned long bio_flags,
1445 				    u64 bio_offset)
1446 {
1447 	struct btrfs_root *root = BTRFS_I(inode)->root;
1448 	int ret = 0;
1449 
1450 	ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
1451 	BUG_ON(ret);
1452 	return 0;
1453 }
1454 
1455 /*
1456  * second half of the async checksumming: by the time this is called the
1457  * csums have already been attached to the ordered extent record by
1458  * __btrfs_submit_bio_start, so all that is left is to map and submit
1459  * the bio
1460  */
1463 static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
1464 			  int mirror_num, unsigned long bio_flags,
1465 			  u64 bio_offset)
1466 {
1467 	struct btrfs_root *root = BTRFS_I(inode)->root;
1468 	return btrfs_map_bio(root, rw, bio, mirror_num, 1);
1469 }
1470 
1471 /*
1472  * extent_io.c submission hook. This does the right thing for csum calculation
1473  * on write, or reading the csums from the tree before a read
1474  */
1475 static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1476 			  int mirror_num, unsigned long bio_flags,
1477 			  u64 bio_offset)
1478 {
1479 	struct btrfs_root *root = BTRFS_I(inode)->root;
1480 	int ret = 0;
1481 	int skip_sum;
1482 
1483 	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
1484 
1485 	if (btrfs_is_free_space_inode(root, inode))
1486 		ret = btrfs_bio_wq_end_io(root->fs_info, bio, 2);
1487 	else
1488 		ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
1489 	BUG_ON(ret);
1490 
1491 	if (!(rw & REQ_WRITE)) {
1492 		if (bio_flags & EXTENT_BIO_COMPRESSED) {
1493 			return btrfs_submit_compressed_read(inode, bio,
1494 						    mirror_num, bio_flags);
1495 		} else if (!skip_sum) {
1496 			ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
1497 			if (ret)
1498 				return ret;
1499 		}
1500 		goto mapit;
1501 	} else if (!skip_sum) {
1502 		/* csum items have already been cloned */
1503 		if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
1504 			goto mapit;
1505 		/* we're doing a write, do the async checksumming */
1506 		return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
1507 				   inode, rw, bio, mirror_num,
1508 				   bio_flags, bio_offset,
1509 				   __btrfs_submit_bio_start,
1510 				   __btrfs_submit_bio_done);
1511 	}
1512 
1513 mapit:
1514 	return btrfs_map_bio(root, rw, bio, mirror_num, 0);
1515 }
1516 
1517 /*
1518  * given a list of ordered sums record them in the inode.  This happens
1519  * at IO completion time based on sums calculated at bio submission time.
1520  */
1521 static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
1522 			     struct inode *inode, u64 file_offset,
1523 			     struct list_head *list)
1524 {
1525 	struct btrfs_ordered_sum *sum;
1526 
1527 	list_for_each_entry(sum, list, list) {
1528 		btrfs_csum_file_blocks(trans,
1529 		       BTRFS_I(inode)->root->fs_info->csum_root, sum);
1530 	}
1531 	return 0;
1532 }
1533 
1534 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
1535 			      struct extent_state **cached_state)
1536 {
1537 	WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0);
1539 	return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
1540 				   cached_state, GFP_NOFS);
1541 }
1542 
1543 /* see btrfs_writepage_start_hook for details on why this is required */
1544 struct btrfs_writepage_fixup {
1545 	struct page *page;
1546 	struct btrfs_work work;
1547 };
1548 
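/*
 * worker body for the fixup: retake the page lock, wait out any ordered
 * extent that raced in, and flag the range as delalloc so writepage
 * does the full COW/ordered dance on it
 */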
1549 static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
1550 {
1551 	struct btrfs_writepage_fixup *fixup;
1552 	struct btrfs_ordered_extent *ordered;
1553 	struct extent_state *cached_state = NULL;
1554 	struct page *page;
1555 	struct inode *inode;
1556 	u64 page_start;
1557 	u64 page_end;
1558 
1559 	fixup = container_of(work, struct btrfs_writepage_fixup, work);
1560 	page = fixup->page;
1561 again:
1562 	lock_page(page);
1563 	if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
1564 		ClearPageChecked(page);
1565 		goto out_page;
1566 	}
1567 
1568 	inode = page->mapping->host;
1569 	page_start = page_offset(page);
1570 	page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
1571 
1572 	lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0,
1573 			 &cached_state, GFP_NOFS);
1574 
1575 	/* already ordered? We're done */
1576 	if (PagePrivate2(page))
1577 		goto out;
1578 
1579 	ordered = btrfs_lookup_ordered_extent(inode, page_start);
1580 	if (ordered) {
1581 		unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
1582 				     page_end, &cached_state, GFP_NOFS);
1583 		unlock_page(page);
1584 		btrfs_start_ordered_extent(inode, ordered, 1);
1585 		goto again;
1586 	}
1587 
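	/*
	 * no ordered extent and no Private2 bit means the page was dirtied
	 * completely outside of the normal write paths; the bare BUG()
	 * below documents the assumption that this cannot happen
	 */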
1588 	BUG();
1589 	btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
1590 	ClearPageChecked(page);
1591 out:
1592 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
1593 			     &cached_state, GFP_NOFS);
1594 out_page:
1595 	unlock_page(page);
1596 	page_cache_release(page);
1597 	kfree(fixup);
1598 }
1599 
1600 /*
1601  * There are a few paths in the higher layers of the kernel that directly
1602  * set the page dirty bit without asking the filesystem if it is a
1603  * good idea.  This causes problems because we want to make sure COW
1604  * properly happens and the data=ordered rules are followed.
1605  *
1606  * In our case any range that doesn't have the ORDERED bit set
1607  * hasn't been properly setup for IO.  We kick off an async process
1608  * to fix it up.  The async helper will wait for ordered extents, set
1609  * the delalloc bit and make it safe to write the page.
1610  */
1611 static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
1612 {
1613 	struct inode *inode = page->mapping->host;
1614 	struct btrfs_writepage_fixup *fixup;
1615 	struct btrfs_root *root = BTRFS_I(inode)->root;
1616 
1617 	/* this page is properly in the ordered list */
1618 	if (TestClearPagePrivate2(page))
1619 		return 0;
1620 
1621 	if (PageChecked(page))
1622 		return -EAGAIN;
1623 
1624 	fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
1625 	if (!fixup)
1626 		return -EAGAIN;
1627 
1628 	SetPageChecked(page);
1629 	page_cache_get(page);
1630 	fixup->work.func = btrfs_writepage_fixup_worker;
1631 	fixup->page = page;
1632 	btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work);
1633 	return -EAGAIN;
1634 }
1635 
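/*
 * an ordered extent has finished its IO: insert the file extent item
 * pointing at the reserved disk space, then add the extent item that
 * records the allocation itself
 */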
1636 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1637 				       struct inode *inode, u64 file_pos,
1638 				       u64 disk_bytenr, u64 disk_num_bytes,
1639 				       u64 num_bytes, u64 ram_bytes,
1640 				       u8 compression, u8 encryption,
1641 				       u16 other_encoding, int extent_type)
1642 {
1643 	struct btrfs_root *root = BTRFS_I(inode)->root;
1644 	struct btrfs_file_extent_item *fi;
1645 	struct btrfs_path *path;
1646 	struct extent_buffer *leaf;
1647 	struct btrfs_key ins;
1648 	u64 hint;
1649 	int ret;
1650 
1651 	path = btrfs_alloc_path();
1652 	if (!path)
1653 		return -ENOMEM;
1654 
1655 	path->leave_spinning = 1;
1656 
1657 	/*
1658 	 * we may be replacing one extent in the tree with another.
1659 	 * The new extent is pinned in the extent map, and we don't want
1660 	 * to drop it from the cache until it is completely in the btree.
1661 	 *
1662 	 * So, tell btrfs_drop_extents to leave this extent in the cache.
1663 	 * the caller is expected to unpin it and allow it to be merged
1664 	 * with the others.
1665 	 */
1666 	ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes,
1667 				 &hint, 0);
1668 	BUG_ON(ret);
1669 
1670 	ins.objectid = btrfs_ino(inode);
1671 	ins.offset = file_pos;
1672 	ins.type = BTRFS_EXTENT_DATA_KEY;
1673 	ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi));
1674 	BUG_ON(ret);
1675 	leaf = path->nodes[0];
1676 	fi = btrfs_item_ptr(leaf, path->slots[0],
1677 			    struct btrfs_file_extent_item);
1678 	btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1679 	btrfs_set_file_extent_type(leaf, fi, extent_type);
1680 	btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
1681 	btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
1682 	btrfs_set_file_extent_offset(leaf, fi, 0);
1683 	btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
1684 	btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
1685 	btrfs_set_file_extent_compression(leaf, fi, compression);
1686 	btrfs_set_file_extent_encryption(leaf, fi, encryption);
1687 	btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
1688 
1689 	btrfs_unlock_up_safe(path, 1);
1690 	btrfs_set_lock_blocking(leaf);
1691 
1692 	btrfs_mark_buffer_dirty(leaf);
1693 
1694 	inode_add_bytes(inode, num_bytes);
1695 
1696 	ins.objectid = disk_bytenr;
1697 	ins.offset = disk_num_bytes;
1698 	ins.type = BTRFS_EXTENT_ITEM_KEY;
1699 	ret = btrfs_alloc_reserved_file_extent(trans, root,
1700 					root->root_key.objectid,
1701 					btrfs_ino(inode), file_pos, &ins);
1702 	BUG_ON(ret);
1703 	btrfs_free_path(path);
1704 
1705 	return 0;
1706 }
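
/*
 * Editor's sketch, not part of the original source: an illustration of
 * the parameter relationships above for a compressed extent.  ram_bytes
 * is the uncompressed length the extent represents in the file, while
 * disk_num_bytes is the (smaller) compressed size actually allocated on
 * disk; for uncompressed extents the two coincide.  All of the values
 * and the function name here are hypothetical; in real use the disk
 * values come from a prior reservation, as in btrfs_finish_ordered_io()
 * below.
 */
static int example_insert_compressed_extent(struct btrfs_trans_handle *trans,
					    struct inode *inode)
{
	u64 file_pos = 0;		/* logical offset in the file */
	u64 disk_bytenr = 1024 * 1024;	/* hypothetical allocation */
	u64 disk_bytes = 4096;		/* compressed size on disk */
	u64 uncompressed = 16384;	/* file bytes the extent covers */

	return insert_reserved_file_extent(trans, inode, file_pos,
					   disk_bytenr, disk_bytes,
					   uncompressed, uncompressed,
					   BTRFS_COMPRESS_ZLIB, 0, 0,
					   BTRFS_FILE_EXTENT_REG);
}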
1707 
1714 /* As ordered data IO finishes, this gets called so we can finish
1715  * an ordered extent if the range of bytes in the file it covers is
1716  * fully written.
1717  */
1718 static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1719 {
1720 	struct btrfs_root *root = BTRFS_I(inode)->root;
1721 	struct btrfs_trans_handle *trans = NULL;
1722 	struct btrfs_ordered_extent *ordered_extent = NULL;
1723 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1724 	struct extent_state *cached_state = NULL;
1725 	int compress_type = 0;
1726 	int ret;
1727 	bool nolock;
1728 
1729 	ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
1730 					     end - start + 1);
1731 	if (!ret)
1732 		return 0;
1733 	BUG_ON(!ordered_extent);
1734 
1735 	nolock = btrfs_is_free_space_inode(root, inode);
1736 
1737 	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
1738 		BUG_ON(!list_empty(&ordered_extent->list));
1739 		ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1740 		if (!ret) {
1741 			if (nolock)
1742 				trans = btrfs_join_transaction_nolock(root);
1743 			else
1744 				trans = btrfs_join_transaction(root);
1745 			BUG_ON(IS_ERR(trans));
1746 			trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1747 			ret = btrfs_update_inode_fallback(trans, root, inode);
1748 			BUG_ON(ret);
1749 		}
1750 		goto out;
1751 	}
1752 
1753 	lock_extent_bits(io_tree, ordered_extent->file_offset,
1754 			 ordered_extent->file_offset + ordered_extent->len - 1,
1755 			 0, &cached_state, GFP_NOFS);
1756 
1757 	if (nolock)
1758 		trans = btrfs_join_transaction_nolock(root);
1759 	else
1760 		trans = btrfs_join_transaction(root);
1761 	BUG_ON(IS_ERR(trans));
1762 	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1763 
1764 	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
1765 		compress_type = ordered_extent->compress_type;
1766 	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
1767 		BUG_ON(compress_type);
1768 		ret = btrfs_mark_extent_written(trans, inode,
1769 						ordered_extent->file_offset,
1770 						ordered_extent->file_offset +
1771 						ordered_extent->len);
1772 		BUG_ON(ret);
1773 	} else {
1774 		BUG_ON(root == root->fs_info->tree_root);
1775 		ret = insert_reserved_file_extent(trans, inode,
1776 						ordered_extent->file_offset,
1777 						ordered_extent->start,
1778 						ordered_extent->disk_len,
1779 						ordered_extent->len,
1780 						ordered_extent->len,
1781 						compress_type, 0, 0,
1782 						BTRFS_FILE_EXTENT_REG);
1783 		unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
1784 				   ordered_extent->file_offset,
1785 				   ordered_extent->len);
1786 		BUG_ON(ret);
1787 	}
1788 	unlock_extent_cached(io_tree, ordered_extent->file_offset,
1789 			     ordered_extent->file_offset +
1790 			     ordered_extent->len - 1, &cached_state, GFP_NOFS);
1791 
1792 	add_pending_csums(trans, inode, ordered_extent->file_offset,
1793 			  &ordered_extent->list);
1794 
1795 	ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1796 	if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
1797 		ret = btrfs_update_inode_fallback(trans, root, inode);
1798 		BUG_ON(ret);
1799 	}
1800 	ret = 0;
1801 out:
1802 	if (root != root->fs_info->tree_root)
1803 		btrfs_delalloc_release_metadata(inode, ordered_extent->len);
1804 	if (trans) {
1805 		if (nolock)
1806 			btrfs_end_transaction_nolock(trans, root);
1807 		else
1808 			btrfs_end_transaction(trans, root);
1809 	}
1810 
1811 	/* once for us */
1812 	btrfs_put_ordered_extent(ordered_extent);
1813 	/* once for the tree */
1814 	btrfs_put_ordered_extent(ordered_extent);
1815 
1816 	return 0;
1817 }
1818 
1819 static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
1820 				struct extent_state *state, int uptodate)
1821 {
1822 	trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
1823 
1824 	ClearPagePrivate2(page);
1825 	return btrfs_finish_ordered_io(page->mapping->host, start, end);
1826 }
1827 
1828 /*
1829  * when reads are done, we need to check csums to verify the data is correct.
1830  * If there's a match, we allow the bio to finish.  If not, the code in
1831  * extent_io.c will try to find good copies for us.
1832  */
1833 static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
1834 			       struct extent_state *state)
1835 {
1836 	size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT);
1837 	struct inode *inode = page->mapping->host;
1838 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1839 	char *kaddr;
1840 	u64 private = ~(u32)0;
1841 	int ret;
1842 	struct btrfs_root *root = BTRFS_I(inode)->root;
1843 	u32 csum = ~(u32)0;
1844 
1845 	if (PageChecked(page)) {
1846 		ClearPageChecked(page);
1847 		goto good;
1848 	}
1849 
1850 	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
1851 		goto good;
1852 
1853 	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
1854 	    test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
1855 		clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
1856 				  GFP_NOFS);
1857 		return 0;
1858 	}
1859 
1860 	if (state && state->start == start) {
1861 		private = state->private;
1862 		ret = 0;
1863 	} else {
1864 		ret = get_state_private(io_tree, start, &private);
1865 	}
1866 	kaddr = kmap_atomic(page, KM_USER0);
1867 	if (ret)
1868 		goto zeroit;
1869 
1870 	csum = btrfs_csum_data(root, kaddr + offset, csum,  end - start + 1);
1871 	btrfs_csum_final(csum, (char *)&csum);
1872 	if (csum != private)
1873 		goto zeroit;
1874 
1875 	kunmap_atomic(kaddr, KM_USER0);
1876 good:
1877 	return 0;
1878 
1879 zeroit:
1880 	printk_ratelimited(KERN_INFO "btrfs csum failed ino %llu off %llu csum %u "
1881 		       "private %llu\n",
1882 		       (unsigned long long)btrfs_ino(page->mapping->host),
1883 		       (unsigned long long)start, csum,
1884 		       (unsigned long long)private);
1885 	memset(kaddr + offset, 1, end - start + 1);
1886 	flush_dcache_page(page);
1887 	kunmap_atomic(kaddr, KM_USER0);
1888 	if (private == 0)
1889 		return 0;
1890 	return -EIO;
1891 }
1892 
1893 struct delayed_iput {
1894 	struct list_head list;
1895 	struct inode *inode;
1896 };
1897 
1898 void btrfs_add_delayed_iput(struct inode *inode)
1899 {
1900 	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
1901 	struct delayed_iput *delayed;
1902 
1903 	if (atomic_add_unless(&inode->i_count, -1, 1))
1904 		return;
1905 
1906 	delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL);
1907 	delayed->inode = inode;
1908 
1909 	spin_lock(&fs_info->delayed_iput_lock);
1910 	list_add_tail(&delayed->list, &fs_info->delayed_iputs);
1911 	spin_unlock(&fs_info->delayed_iput_lock);
1912 }
1913 
1914 void btrfs_run_delayed_iputs(struct btrfs_root *root)
1915 {
1916 	LIST_HEAD(list);
1917 	struct btrfs_fs_info *fs_info = root->fs_info;
1918 	struct delayed_iput *delayed;
1919 	int empty;
1920 
1921 	spin_lock(&fs_info->delayed_iput_lock);
1922 	empty = list_empty(&fs_info->delayed_iputs);
1923 	spin_unlock(&fs_info->delayed_iput_lock);
1924 	if (empty)
1925 		return;
1926 
1927 	down_read(&root->fs_info->cleanup_work_sem);
1928 	spin_lock(&fs_info->delayed_iput_lock);
1929 	list_splice_init(&fs_info->delayed_iputs, &list);
1930 	spin_unlock(&fs_info->delayed_iput_lock);
1931 
1932 	while (!list_empty(&list)) {
1933 		delayed = list_entry(list.next, struct delayed_iput, list);
1934 		list_del(&delayed->list);
1935 		iput(delayed->inode);
1936 		kfree(delayed);
1937 	}
1938 	up_read(&root->fs_info->cleanup_work_sem);
1939 }
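
/*
 * Editor's sketch, not part of the original source: how the two helpers
 * above pair up.  A context that must not drop the final inode
 * reference inline (an end-io completion, for instance, where the
 * resulting eviction could deadlock) defers the iput, and a safe
 * context such as the cleaner thread later drains the list.  The
 * function name is illustrative only.
 */
static void example_deferred_iput(struct btrfs_root *root,
				  struct inode *inode)
{
	/* may be the last reference: queue it instead of calling iput() */
	btrfs_add_delayed_iput(inode);

	/* ... later, from a context that can safely evict inodes ... */
	btrfs_run_delayed_iputs(root);
}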
1940 
1941 enum btrfs_orphan_cleanup_state {
1942 	ORPHAN_CLEANUP_STARTED	= 1,
1943 	ORPHAN_CLEANUP_DONE	= 2,
1944 };
1945 
1946 /*
1947  * This is called at transaction commit time. If there are no orphan
1948  * files in the subvolume, it removes the orphan item and frees the
1949  * block_rsv structure.
1950  */
1951 void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
1952 			      struct btrfs_root *root)
1953 {
1954 	struct btrfs_block_rsv *block_rsv;
1955 	int ret;
1956 
1957 	if (!list_empty(&root->orphan_list) ||
1958 	    root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
1959 		return;
1960 
1961 	spin_lock(&root->orphan_lock);
1962 	if (!list_empty(&root->orphan_list)) {
1963 		spin_unlock(&root->orphan_lock);
1964 		return;
1965 	}
1966 
1967 	if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) {
1968 		spin_unlock(&root->orphan_lock);
1969 		return;
1970 	}
1971 
1972 	block_rsv = root->orphan_block_rsv;
1973 	root->orphan_block_rsv = NULL;
1974 	spin_unlock(&root->orphan_lock);
1975 
1976 	if (root->orphan_item_inserted &&
1977 	    btrfs_root_refs(&root->root_item) > 0) {
1978 		ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
1979 					    root->root_key.objectid);
1980 		BUG_ON(ret);
1981 		root->orphan_item_inserted = 0;
1982 	}
1983 
1984 	if (block_rsv) {
1985 		WARN_ON(block_rsv->size > 0);
1986 		btrfs_free_block_rsv(root, block_rsv);
1987 	}
1988 }
1989 
1990 /*
1991  * This creates an orphan entry for the given inode in case something goes
1992  * wrong in the middle of an unlink/truncate.
1993  *
1994  * NOTE: the caller should reserve 5 units of metadata before
1995  *	 calling this function.
1996  */
1997 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
1998 {
1999 	struct btrfs_root *root = BTRFS_I(inode)->root;
2000 	struct btrfs_block_rsv *block_rsv = NULL;
2001 	int reserve = 0;
2002 	int insert = 0;
2003 	int ret;
2004 
2005 	if (!root->orphan_block_rsv) {
2006 		block_rsv = btrfs_alloc_block_rsv(root);
2007 		if (!block_rsv)
2008 			return -ENOMEM;
2009 	}
2010 
2011 	spin_lock(&root->orphan_lock);
2012 	if (!root->orphan_block_rsv) {
2013 		root->orphan_block_rsv = block_rsv;
2014 	} else if (block_rsv) {
2015 		btrfs_free_block_rsv(root, block_rsv);
2016 		block_rsv = NULL;
2017 	}
2018 
2019 	if (list_empty(&BTRFS_I(inode)->i_orphan)) {
2020 		list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
2021 #if 0
2022 		/*
2023 		 * For proper ENOSPC handling, we should do orphan
2024 		 * cleanup when mounting. But this introduces backward
2025 		 * compatibility issue.
2026 		 */
2027 		if (!xchg(&root->orphan_item_inserted, 1))
2028 			insert = 2;
2029 		else
2030 			insert = 1;
2031 #endif
2032 		insert = 1;
2033 	}
2034 
2035 	if (!BTRFS_I(inode)->orphan_meta_reserved) {
2036 		BTRFS_I(inode)->orphan_meta_reserved = 1;
2037 		reserve = 1;
2038 	}
2039 	spin_unlock(&root->orphan_lock);
2040 
2041 	/* grab metadata reservation from transaction handle */
2042 	if (reserve) {
2043 		ret = btrfs_orphan_reserve_metadata(trans, inode);
2044 		BUG_ON(ret);
2045 	}
2046 
2047 	/* insert an orphan item to track this unlinked/truncated file */
2048 	if (insert >= 1) {
2049 		ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
2050 		BUG_ON(ret && ret != -EEXIST);
2051 	}
2052 
2053 	/* insert an orphan item to record that this subvolume contains orphan files */
2054 	if (insert >= 2) {
2055 		ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
2056 					       root->root_key.objectid);
2057 		BUG_ON(ret);
2058 	}
2059 	return 0;
2060 }
2061 
2062 /*
2063  * We have done the truncate/delete so we can go ahead and remove the orphan
2064  * item for this particular inode.
2065  */
2066 int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
2067 {
2068 	struct btrfs_root *root = BTRFS_I(inode)->root;
2069 	int delete_item = 0;
2070 	int release_rsv = 0;
2071 	int ret = 0;
2072 
2073 	spin_lock(&root->orphan_lock);
2074 	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
2075 		list_del_init(&BTRFS_I(inode)->i_orphan);
2076 		delete_item = 1;
2077 	}
2078 
2079 	if (BTRFS_I(inode)->orphan_meta_reserved) {
2080 		BTRFS_I(inode)->orphan_meta_reserved = 0;
2081 		release_rsv = 1;
2082 	}
2083 	spin_unlock(&root->orphan_lock);
2084 
2085 	if (trans && delete_item) {
2086 		ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode));
2087 		BUG_ON(ret);
2088 	}
2089 
2090 	if (release_rsv)
2091 		btrfs_orphan_release_metadata(inode);
2092 
2093 	return 0;
2094 }
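
/*
 * Editor's sketch, not part of the original source: the usual life
 * cycle of an orphan item around a risky operation.  The add marks the
 * inode so that a crash midway is handled by btrfs_orphan_cleanup() on
 * the next mount; the del removes the marker once the work completes.
 * The function name is illustrative only.
 */
static int example_orphan_protected_op(struct btrfs_trans_handle *trans,
				       struct inode *inode)
{
	int ret;

	ret = btrfs_orphan_add(trans, inode);
	if (ret)
		return ret;

	/* ... the truncate/unlink work that must survive a crash ... */

	return btrfs_orphan_del(trans, inode);
}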
2095 
2096 /*
2097  * this cleans up any orphans that may be left on the list from the last use
2098  * of this root.
2099  */
2100 int btrfs_orphan_cleanup(struct btrfs_root *root)
2101 {
2102 	struct btrfs_path *path;
2103 	struct extent_buffer *leaf;
2104 	struct btrfs_key key, found_key;
2105 	struct btrfs_trans_handle *trans;
2106 	struct inode *inode;
2107 	u64 last_objectid = 0;
2108 	int ret = 0, nr_unlink = 0, nr_truncate = 0;
2109 
2110 	if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
2111 		return 0;
2112 
2113 	path = btrfs_alloc_path();
2114 	if (!path) {
2115 		ret = -ENOMEM;
2116 		goto out;
2117 	}
2118 	path->reada = -1;
2119 
2120 	key.objectid = BTRFS_ORPHAN_OBJECTID;
2121 	btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
2122 	key.offset = (u64)-1;
2123 
2124 	while (1) {
2125 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2126 		if (ret < 0)
2127 			goto out;
2128 
2129 		/*
2130 		 * ret == 0 means we found what we were searching for, which
2131 		 * is weird, but possible.  So only adjust the path if we didn't
2132 		 * find the key, and check whether the previous item matches.
2133 		 */
2134 		if (ret > 0) {
2135 			ret = 0;
2136 			if (path->slots[0] == 0)
2137 				break;
2138 			path->slots[0]--;
2139 		}
2140 
2141 		/* pull out the item */
2142 		leaf = path->nodes[0];
2143 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2144 
2145 		/* make sure the item matches what we want */
2146 		if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
2147 			break;
2148 		if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY)
2149 			break;
2150 
2151 		/* release the path since we're done with it */
2152 		btrfs_release_path(path);
2153 
2154 		/*
2155 		 * this is basically btrfs_lookup, without the root-crossing
2156 		 * logic.  We store the inode number in the offset of the
2157 		 * orphan item.
2158 		 */
2159 
2160 		if (found_key.offset == last_objectid) {
2161 			printk(KERN_ERR "btrfs: Error removing orphan entry, "
2162 			       "stopping orphan cleanup\n");
2163 			ret = -EINVAL;
2164 			goto out;
2165 		}
2166 
2167 		last_objectid = found_key.offset;
2168 
2169 		found_key.objectid = found_key.offset;
2170 		found_key.type = BTRFS_INODE_ITEM_KEY;
2171 		found_key.offset = 0;
2172 		inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
2173 		ret = PTR_RET(inode);
2174 		if (ret && ret != -ESTALE)
2175 			goto out;
2176 
2177 		if (ret == -ESTALE && root == root->fs_info->tree_root) {
2178 			struct btrfs_root *dead_root;
2179 			struct btrfs_fs_info *fs_info = root->fs_info;
2180 			int is_dead_root = 0;
2181 
2182 			/*
2183 			 * this is an orphan in the tree root. Currently these
2184 			 * could come from 2 sources:
2185 			 *  a) a snapshot deletion in progress
2186 			 *  b) a free space cache inode
2187 			 * We need to distinguish those two, as the snapshot
2188 			 * orphan must not get deleted.
2189 			 * find_dead_roots already ran before us, so if this
2190 			 * is a snapshot deletion, we should find the root
2191 			 * in the dead_roots list
2192 			 */
2193 			spin_lock(&fs_info->trans_lock);
2194 			list_for_each_entry(dead_root, &fs_info->dead_roots,
2195 					    root_list) {
2196 				if (dead_root->root_key.objectid ==
2197 				    found_key.objectid) {
2198 					is_dead_root = 1;
2199 					break;
2200 				}
2201 			}
2202 			spin_unlock(&fs_info->trans_lock);
2203 			if (is_dead_root) {
2204 				/* prevent this orphan from being found again */
2205 				key.offset = found_key.objectid - 1;
2206 				continue;
2207 			}
2208 		}
2209 		/*
2210 		 * Inode is already gone but the orphan item is still there,
2211 		 * kill the orphan item.
2212 		 */
2213 		if (ret == -ESTALE) {
2214 			trans = btrfs_start_transaction(root, 1);
2215 			if (IS_ERR(trans)) {
2216 				ret = PTR_ERR(trans);
2217 				goto out;
2218 			}
2219 			ret = btrfs_del_orphan_item(trans, root,
2220 						    found_key.objectid);
2221 			BUG_ON(ret);
2222 			btrfs_end_transaction(trans, root);
2223 			continue;
2224 		}
2225 
2226 		/*
2227 		 * add this inode to the orphan list so btrfs_orphan_del does
2228 		 * the proper thing when we hit it
2229 		 */
2230 		spin_lock(&root->orphan_lock);
2231 		list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
2232 		spin_unlock(&root->orphan_lock);
2233 
2234 		/* if we have links, this was a truncate, let's do that */
2235 		if (inode->i_nlink) {
2236 			if (!S_ISREG(inode->i_mode)) {
2237 				WARN_ON(1);
2238 				iput(inode);
2239 				continue;
2240 			}
2241 			nr_truncate++;
2242 			ret = btrfs_truncate(inode);
2243 		} else {
2244 			nr_unlink++;
2245 		}
2246 
2247 		/* this will do delete_inode and everything for us */
2248 		iput(inode);
2249 		if (ret)
2250 			goto out;
2251 	}
2252 	/* release the path since we're done with it */
2253 	btrfs_release_path(path);
2254 
2255 	root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
2256 
2257 	if (root->orphan_block_rsv)
2258 		btrfs_block_rsv_release(root, root->orphan_block_rsv,
2259 					(u64)-1);
2260 
2261 	if (root->orphan_block_rsv || root->orphan_item_inserted) {
2262 		trans = btrfs_join_transaction(root);
2263 		if (!IS_ERR(trans))
2264 			btrfs_end_transaction(trans, root);
2265 	}
2266 
2267 	if (nr_unlink)
2268 		printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
2269 	if (nr_truncate)
2270 		printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
2271 
2272 out:
2273 	if (ret)
2274 		printk(KERN_CRIT "btrfs: could not do orphan cleanup %d\n", ret);
2275 	btrfs_free_path(path);
2276 	return ret;
2277 }
2278 
2279 /*
2280  * very simple check to peek ahead in the leaf looking for xattrs.  If we
2281  * don't find any xattrs, we know there can't be any acls.
2282  *
2283  * slot is the slot the inode is in, objectid is the objectid of the inode
2284  */
2285 static noinline int acls_after_inode_item(struct extent_buffer *leaf,
2286 					  int slot, u64 objectid)
2287 {
2288 	u32 nritems = btrfs_header_nritems(leaf);
2289 	struct btrfs_key found_key;
2290 	int scanned = 0;
2291 
2292 	slot++;
2293 	while (slot < nritems) {
2294 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
2295 
2296 		/* we found a different objectid, there must not be acls */
2297 		if (found_key.objectid != objectid)
2298 			return 0;
2299 
2300 		/* we found an xattr, assume we've got an acl */
2301 		if (found_key.type == BTRFS_XATTR_ITEM_KEY)
2302 			return 1;
2303 
2304 		/*
2305 		 * we found a key greater than an xattr key, there can't
2306 		 * be any acls later on
2307 		 */
2308 		if (found_key.type > BTRFS_XATTR_ITEM_KEY)
2309 			return 0;
2310 
2311 		slot++;
2312 		scanned++;
2313 
2314 		/*
2315 		 * it goes inode, inode backrefs, xattrs, extents,
2316 		 * so if there are a ton of hard links to an inode there can
2317 		 * be a lot of backrefs.  Don't waste time searching too hard,
2318 		 * this is just an optimization
2319 		 */
2320 		if (scanned >= 8)
2321 			break;
2322 	}
2323 	/* we hit the end of the leaf before we found an xattr or
2324 	 * something larger than an xattr.  We have to assume the inode
2325 	 * has acls
2326 	 */
2327 	return 1;
2328 }
2329 
2330 /*
2331  * read an inode from the btree into the in-memory inode
2332  */
2333 static void btrfs_read_locked_inode(struct inode *inode)
2334 {
2335 	struct btrfs_path *path;
2336 	struct extent_buffer *leaf;
2337 	struct btrfs_inode_item *inode_item;
2338 	struct btrfs_timespec *tspec;
2339 	struct btrfs_root *root = BTRFS_I(inode)->root;
2340 	struct btrfs_key location;
2341 	int maybe_acls;
2342 	u32 rdev;
2343 	int ret;
2344 	bool filled = false;
2345 
2346 	ret = btrfs_fill_inode(inode, &rdev);
2347 	if (!ret)
2348 		filled = true;
2349 
2350 	path = btrfs_alloc_path();
2351 	if (!path)
2352 		goto make_bad;
2353 
2354 	path->leave_spinning = 1;
2355 	memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
2356 
2357 	ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
2358 	if (ret)
2359 		goto make_bad;
2360 
2361 	leaf = path->nodes[0];
2362 
2363 	if (filled)
2364 		goto cache_acl;
2365 
2366 	inode_item = btrfs_item_ptr(leaf, path->slots[0],
2367 				    struct btrfs_inode_item);
2368 	inode->i_mode = btrfs_inode_mode(leaf, inode_item);
2369 	set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
2370 	inode->i_uid = btrfs_inode_uid(leaf, inode_item);
2371 	inode->i_gid = btrfs_inode_gid(leaf, inode_item);
2372 	btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
2373 
2374 	tspec = btrfs_inode_atime(inode_item);
2375 	inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec);
2376 	inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
2377 
2378 	tspec = btrfs_inode_mtime(inode_item);
2379 	inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec);
2380 	inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
2381 
2382 	tspec = btrfs_inode_ctime(inode_item);
2383 	inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec);
2384 	inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
2385 
2386 	inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
2387 	BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
2388 	BTRFS_I(inode)->sequence = btrfs_inode_sequence(leaf, inode_item);
2389 	inode->i_generation = BTRFS_I(inode)->generation;
2390 	inode->i_rdev = 0;
2391 	rdev = btrfs_inode_rdev(leaf, inode_item);
2392 
2393 	BTRFS_I(inode)->index_cnt = (u64)-1;
2394 	BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
2395 cache_acl:
2396 	/*
2397 	 * try to precache a NULL acl entry for files that don't have
2398 	 * any xattrs or acls
2399 	 */
2400 	maybe_acls = acls_after_inode_item(leaf, path->slots[0],
2401 					   btrfs_ino(inode));
2402 	if (!maybe_acls)
2403 		cache_no_acl(inode);
2404 
2405 	btrfs_free_path(path);
2406 
2407 	switch (inode->i_mode & S_IFMT) {
2408 	case S_IFREG:
2409 		inode->i_mapping->a_ops = &btrfs_aops;
2410 		inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
2411 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
2412 		inode->i_fop = &btrfs_file_operations;
2413 		inode->i_op = &btrfs_file_inode_operations;
2414 		break;
2415 	case S_IFDIR:
2416 		inode->i_fop = &btrfs_dir_file_operations;
2417 		if (root == root->fs_info->tree_root)
2418 			inode->i_op = &btrfs_dir_ro_inode_operations;
2419 		else
2420 			inode->i_op = &btrfs_dir_inode_operations;
2421 		break;
2422 	case S_IFLNK:
2423 		inode->i_op = &btrfs_symlink_inode_operations;
2424 		inode->i_mapping->a_ops = &btrfs_symlink_aops;
2425 		inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
2426 		break;
2427 	default:
2428 		inode->i_op = &btrfs_special_inode_operations;
2429 		init_special_inode(inode, inode->i_mode, rdev);
2430 		break;
2431 	}
2432 
2433 	btrfs_update_iflags(inode);
2434 	return;
2435 
2436 make_bad:
2437 	btrfs_free_path(path);
2438 	make_bad_inode(inode);
2439 }
2440 
2441 /*
2442  * given a leaf and an inode, copy the inode fields into the leaf
2443  */
2444 static void fill_inode_item(struct btrfs_trans_handle *trans,
2445 			    struct extent_buffer *leaf,
2446 			    struct btrfs_inode_item *item,
2447 			    struct inode *inode)
2448 {
2449 	btrfs_set_inode_uid(leaf, item, inode->i_uid);
2450 	btrfs_set_inode_gid(leaf, item, inode->i_gid);
2451 	btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
2452 	btrfs_set_inode_mode(leaf, item, inode->i_mode);
2453 	btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
2454 
2455 	btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item),
2456 			       inode->i_atime.tv_sec);
2457 	btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item),
2458 				inode->i_atime.tv_nsec);
2459 
2460 	btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item),
2461 			       inode->i_mtime.tv_sec);
2462 	btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item),
2463 				inode->i_mtime.tv_nsec);
2464 
2465 	btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item),
2466 			       inode->i_ctime.tv_sec);
2467 	btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
2468 				inode->i_ctime.tv_nsec);
2469 
2470 	btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
2471 	btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
2472 	btrfs_set_inode_sequence(leaf, item, BTRFS_I(inode)->sequence);
2473 	btrfs_set_inode_transid(leaf, item, trans->transid);
2474 	btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
2475 	btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
2476 	btrfs_set_inode_block_group(leaf, item, 0);
2477 }
2478 
2479 /*
2480  * copy everything in the in-memory inode into the btree.
2481  */
2482 static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
2483 				struct btrfs_root *root, struct inode *inode)
2484 {
2485 	struct btrfs_inode_item *inode_item;
2486 	struct btrfs_path *path;
2487 	struct extent_buffer *leaf;
2488 	int ret;
2489 
2490 	path = btrfs_alloc_path();
2491 	if (!path)
2492 		return -ENOMEM;
2493 
2494 	path->leave_spinning = 1;
2495 	ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location,
2496 				 1);
2497 	if (ret) {
2498 		if (ret > 0)
2499 			ret = -ENOENT;
2500 		goto failed;
2501 	}
2502 
2503 	btrfs_unlock_up_safe(path, 1);
2504 	leaf = path->nodes[0];
2505 	inode_item = btrfs_item_ptr(leaf, path->slots[0],
2506 				    struct btrfs_inode_item);
2507 
2508 	fill_inode_item(trans, leaf, inode_item, inode);
2509 	btrfs_mark_buffer_dirty(leaf);
2510 	btrfs_set_inode_last_trans(trans, inode);
2511 	ret = 0;
2512 failed:
2513 	btrfs_free_path(path);
2514 	return ret;
2515 }
2516 
2517 /*
2518  * copy everything in the in-memory inode into the btree.
2519  */
2520 noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2521 				struct btrfs_root *root, struct inode *inode)
2522 {
2523 	int ret;
2524 
2525 	/*
2526 	 * If the inode is a free space inode, we can deadlock during commit
2527 	 * if we put it into the delayed code.
2528 	 *
2529 	 * The data relocation inode should also be directly updated
2530 	 * without delay
2531 	 */
2532 	if (!btrfs_is_free_space_inode(root, inode)
2533 	    && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
2534 		ret = btrfs_delayed_update_inode(trans, root, inode);
2535 		if (!ret)
2536 			btrfs_set_inode_last_trans(trans, inode);
2537 		return ret;
2538 	}
2539 
2540 	return btrfs_update_inode_item(trans, root, inode);
2541 }
2542 
2543 static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
2544 				struct btrfs_root *root, struct inode *inode)
2545 {
2546 	int ret;
2547 
2548 	ret = btrfs_update_inode(trans, root, inode);
2549 	if (ret == -ENOSPC)
2550 		return btrfs_update_inode_item(trans, root, inode);
2551 	return ret;
2552 }
2553 
2554 /*
2555  * unlink helper that gets used here in inode.c and in the tree logging
2556  * recovery code.  It removes a link in a directory with a given name, and
2557  * also drops the back refs in the inode to the directory.
2558  */
2559 static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2560 				struct btrfs_root *root,
2561 				struct inode *dir, struct inode *inode,
2562 				const char *name, int name_len)
2563 {
2564 	struct btrfs_path *path;
2565 	int ret = 0;
2566 	struct extent_buffer *leaf;
2567 	struct btrfs_dir_item *di;
2568 	struct btrfs_key key;
2569 	u64 index;
2570 	u64 ino = btrfs_ino(inode);
2571 	u64 dir_ino = btrfs_ino(dir);
2572 
2573 	path = btrfs_alloc_path();
2574 	if (!path) {
2575 		ret = -ENOMEM;
2576 		goto out;
2577 	}
2578 
2579 	path->leave_spinning = 1;
2580 	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
2581 				    name, name_len, -1);
2582 	if (IS_ERR(di)) {
2583 		ret = PTR_ERR(di);
2584 		goto err;
2585 	}
2586 	if (!di) {
2587 		ret = -ENOENT;
2588 		goto err;
2589 	}
2590 	leaf = path->nodes[0];
2591 	btrfs_dir_item_key_to_cpu(leaf, di, &key);
2592 	ret = btrfs_delete_one_dir_name(trans, root, path, di);
2593 	if (ret)
2594 		goto err;
2595 	btrfs_release_path(path);
2596 
2597 	ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
2598 				  dir_ino, &index);
2599 	if (ret) {
2600 		printk(KERN_INFO "btrfs failed to delete reference to %.*s, "
2601 		       "inode %llu parent %llu\n", name_len, name,
2602 		       (unsigned long long)ino, (unsigned long long)dir_ino);
2603 		goto err;
2604 	}
2605 
2606 	ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
2607 	if (ret)
2608 		goto err;
2609 
2610 	ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
2611 					 inode, dir_ino);
2612 	BUG_ON(ret != 0 && ret != -ENOENT);
2613 
2614 	ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
2615 					   dir, index);
2616 	if (ret == -ENOENT)
2617 		ret = 0;
2618 err:
2619 	btrfs_free_path(path);
2620 	if (ret)
2621 		goto out;
2622 
2623 	btrfs_i_size_write(dir, dir->i_size - name_len * 2);
2624 	inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
2625 	btrfs_update_inode(trans, root, dir);
2626 out:
2627 	return ret;
2628 }
2629 
2630 int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2631 		       struct btrfs_root *root,
2632 		       struct inode *dir, struct inode *inode,
2633 		       const char *name, int name_len)
2634 {
2635 	int ret;
2636 	ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
2637 	if (!ret) {
2638 		btrfs_drop_nlink(inode);
2639 		ret = btrfs_update_inode(trans, root, inode);
2640 	}
2641 	return ret;
2642 }
2643 
2644 
2645 /* helper to check if there is any shared block in the path */
2646 static int check_path_shared(struct btrfs_root *root,
2647 			     struct btrfs_path *path)
2648 {
2649 	struct extent_buffer *eb;
2650 	int level;
2651 	u64 refs = 1;
2652 
2653 	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2654 		int ret;
2655 
2656 		if (!path->nodes[level])
2657 			break;
2658 		eb = path->nodes[level];
2659 		if (!btrfs_block_can_be_shared(root, eb))
2660 			continue;
2661 		ret = btrfs_lookup_extent_info(NULL, root, eb->start, eb->len,
2662 					       &refs, NULL);
2663 		if (refs > 1)
2664 			return 1;
2665 	}
2666 	return 0;
2667 }
2668 
2669 /*
2670  * helper to start transaction for unlink and rmdir.
2671  *
2672  * unlink and rmdir are special in btrfs: they do not always free space.
2673  * So in the ENOSPC case, we should make sure they will free space before
2674  * allowing them to use the global metadata reservation.
2675  */
2676 static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2677 						       struct dentry *dentry)
2678 {
2679 	struct btrfs_trans_handle *trans;
2680 	struct btrfs_root *root = BTRFS_I(dir)->root;
2681 	struct btrfs_path *path;
2682 	struct btrfs_inode_ref *ref;
2683 	struct btrfs_dir_item *di;
2684 	struct inode *inode = dentry->d_inode;
2685 	u64 index;
2686 	int check_link = 1;
2687 	int err = -ENOSPC;
2688 	int ret;
2689 	u64 ino = btrfs_ino(inode);
2690 	u64 dir_ino = btrfs_ino(dir);
2691 
2692 	/*
2693 	 * 1 for the possible orphan item
2694 	 * 1 for the dir item
2695 	 * 1 for the dir index
2696 	 * 1 for the inode ref
2697 	 * 1 for the inode ref in the tree log
2698 	 * 2 for the dir entries in the log
2699 	 * 1 for the inode
2700 	 */
2701 	trans = btrfs_start_transaction(root, 8);
2702 	if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
2703 		return trans;
2704 
2705 	if (ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
2706 		return ERR_PTR(-ENOSPC);
2707 
2708 	/* check if someone else holds a reference */
2709 	if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1)
2710 		return ERR_PTR(-ENOSPC);
2711 
2712 	if (atomic_read(&inode->i_count) > 2)
2713 		return ERR_PTR(-ENOSPC);
2714 
2715 	if (xchg(&root->fs_info->enospc_unlink, 1))
2716 		return ERR_PTR(-ENOSPC);
2717 
2718 	path = btrfs_alloc_path();
2719 	if (!path) {
2720 		root->fs_info->enospc_unlink = 0;
2721 		return ERR_PTR(-ENOMEM);
2722 	}
2723 
2724 	/* 1 for the orphan item */
2725 	trans = btrfs_start_transaction(root, 1);
2726 	if (IS_ERR(trans)) {
2727 		btrfs_free_path(path);
2728 		root->fs_info->enospc_unlink = 0;
2729 		return trans;
2730 	}
2731 
2732 	path->skip_locking = 1;
2733 	path->search_commit_root = 1;
2734 
2735 	ret = btrfs_lookup_inode(trans, root, path,
2736 				&BTRFS_I(dir)->location, 0);
2737 	if (ret < 0) {
2738 		err = ret;
2739 		goto out;
2740 	}
2741 	if (ret == 0) {
2742 		if (check_path_shared(root, path))
2743 			goto out;
2744 	} else {
2745 		check_link = 0;
2746 	}
2747 	btrfs_release_path(path);
2748 
2749 	ret = btrfs_lookup_inode(trans, root, path,
2750 				&BTRFS_I(inode)->location, 0);
2751 	if (ret < 0) {
2752 		err = ret;
2753 		goto out;
2754 	}
2755 	if (ret == 0) {
2756 		if (check_path_shared(root, path))
2757 			goto out;
2758 	} else {
2759 		check_link = 0;
2760 	}
2761 	btrfs_release_path(path);
2762 
2763 	if (ret == 0 && S_ISREG(inode->i_mode)) {
2764 		ret = btrfs_lookup_file_extent(trans, root, path,
2765 					       ino, (u64)-1, 0);
2766 		if (ret < 0) {
2767 			err = ret;
2768 			goto out;
2769 		}
2770 		BUG_ON(ret == 0);
2771 		if (check_path_shared(root, path))
2772 			goto out;
2773 		btrfs_release_path(path);
2774 	}
2775 
2776 	if (!check_link) {
2777 		err = 0;
2778 		goto out;
2779 	}
2780 
2781 	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
2782 				dentry->d_name.name, dentry->d_name.len, 0);
2783 	if (IS_ERR(di)) {
2784 		err = PTR_ERR(di);
2785 		goto out;
2786 	}
2787 	if (di) {
2788 		if (check_path_shared(root, path))
2789 			goto out;
2790 	} else {
2791 		err = 0;
2792 		goto out;
2793 	}
2794 	btrfs_release_path(path);
2795 
2796 	ref = btrfs_lookup_inode_ref(trans, root, path,
2797 				dentry->d_name.name, dentry->d_name.len,
2798 				ino, dir_ino, 0);
2799 	if (IS_ERR(ref)) {
2800 		err = PTR_ERR(ref);
2801 		goto out;
2802 	}
2803 	BUG_ON(!ref);
2804 	if (check_path_shared(root, path))
2805 		goto out;
2806 	index = btrfs_inode_ref_index(path->nodes[0], ref);
2807 	btrfs_release_path(path);
2808 
2809 	/*
2810 	 * This is a commit root search, so if we can look up the inode item
2811 	 * and other related items in the commit root, it means the transaction
2812 	 * of dir/file creation has been committed, and the dir index item whose
2813 	 * insertion we delayed has also made it into the commit root. So we
2814 	 * needn't worry about the delayed insertion of the dir index item
2815 	 * here.
2816 	 */
2817 	di = btrfs_lookup_dir_index_item(trans, root, path, dir_ino, index,
2818 				dentry->d_name.name, dentry->d_name.len, 0);
2819 	if (IS_ERR(di)) {
2820 		err = PTR_ERR(di);
2821 		goto out;
2822 	}
2823 	BUG_ON(ret == -ENOENT);
2824 	if (check_path_shared(root, path))
2825 		goto out;
2826 
2827 	err = 0;
2828 out:
2829 	btrfs_free_path(path);
2830 	/* Migrate the orphan reservation over */
2831 	if (!err)
2832 		err = btrfs_block_rsv_migrate(trans->block_rsv,
2833 				&root->fs_info->global_block_rsv,
2834 				trans->bytes_reserved);
2835 
2836 	if (err) {
2837 		btrfs_end_transaction(trans, root);
2838 		root->fs_info->enospc_unlink = 0;
2839 		return ERR_PTR(err);
2840 	}
2841 
2842 	trans->block_rsv = &root->fs_info->global_block_rsv;
2843 	return trans;
2844 }
2845 
2846 static void __unlink_end_trans(struct btrfs_trans_handle *trans,
2847 			       struct btrfs_root *root)
2848 {
2849 	if (trans->block_rsv == &root->fs_info->global_block_rsv) {
2850 		btrfs_block_rsv_release(root, trans->block_rsv,
2851 					trans->bytes_reserved);
2852 		trans->block_rsv = &root->fs_info->trans_block_rsv;
2853 		BUG_ON(!root->fs_info->enospc_unlink);
2854 		root->fs_info->enospc_unlink = 0;
2855 	}
2856 	btrfs_end_transaction(trans, root);
2857 }
2858 
2859 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2860 {
2861 	struct btrfs_root *root = BTRFS_I(dir)->root;
2862 	struct btrfs_trans_handle *trans;
2863 	struct inode *inode = dentry->d_inode;
2864 	int ret;
2865 	unsigned long nr = 0;
2866 
2867 	trans = __unlink_start_trans(dir, dentry);
2868 	if (IS_ERR(trans))
2869 		return PTR_ERR(trans);
2870 
2871 	btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
2872 
2873 	ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
2874 				 dentry->d_name.name, dentry->d_name.len);
2875 	if (ret)
2876 		goto out;
2877 
2878 	if (inode->i_nlink == 0) {
2879 		ret = btrfs_orphan_add(trans, inode);
2880 		if (ret)
2881 			goto out;
2882 	}
2883 
2884 out:
2885 	nr = trans->blocks_used;
2886 	__unlink_end_trans(trans, root);
2887 	btrfs_btree_balance_dirty(root, nr);
2888 	return ret;
2889 }
2890 
2891 int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
2892 			struct btrfs_root *root,
2893 			struct inode *dir, u64 objectid,
2894 			const char *name, int name_len)
2895 {
2896 	struct btrfs_path *path;
2897 	struct extent_buffer *leaf;
2898 	struct btrfs_dir_item *di;
2899 	struct btrfs_key key;
2900 	u64 index;
2901 	int ret;
2902 	u64 dir_ino = btrfs_ino(dir);
2903 
2904 	path = btrfs_alloc_path();
2905 	if (!path)
2906 		return -ENOMEM;
2907 
2908 	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
2909 				   name, name_len, -1);
2910 	BUG_ON(IS_ERR_OR_NULL(di));
2911 
2912 	leaf = path->nodes[0];
2913 	btrfs_dir_item_key_to_cpu(leaf, di, &key);
2914 	WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
2915 	ret = btrfs_delete_one_dir_name(trans, root, path, di);
2916 	BUG_ON(ret);
2917 	btrfs_release_path(path);
2918 
2919 	ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,
2920 				 objectid, root->root_key.objectid,
2921 				 dir_ino, &index, name, name_len);
2922 	if (ret < 0) {
2923 		BUG_ON(ret != -ENOENT);
2924 		di = btrfs_search_dir_index_item(root, path, dir_ino,
2925 						 name, name_len);
2926 		BUG_ON(IS_ERR_OR_NULL(di));
2927 
2928 		leaf = path->nodes[0];
2929 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2930 		btrfs_release_path(path);
2931 		index = key.offset;
2932 	}
2933 	btrfs_release_path(path);
2934 
2935 	ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
2936 	BUG_ON(ret);
2937 
2938 	btrfs_i_size_write(dir, dir->i_size - name_len * 2);
2939 	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
2940 	ret = btrfs_update_inode(trans, root, dir);
2941 	BUG_ON(ret);
2942 
2943 	btrfs_free_path(path);
2944 	return 0;
2945 }
2946 
2947 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2948 {
2949 	struct inode *inode = dentry->d_inode;
2950 	int err = 0;
2951 	struct btrfs_root *root = BTRFS_I(dir)->root;
2952 	struct btrfs_trans_handle *trans;
2953 	unsigned long nr = 0;
2954 
2955 	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE ||
2956 	    btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
2957 		return -ENOTEMPTY;
2958 
2959 	trans = __unlink_start_trans(dir, dentry);
2960 	if (IS_ERR(trans))
2961 		return PTR_ERR(trans);
2962 
2963 	if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
2964 		err = btrfs_unlink_subvol(trans, root, dir,
2965 					  BTRFS_I(inode)->location.objectid,
2966 					  dentry->d_name.name,
2967 					  dentry->d_name.len);
2968 		goto out;
2969 	}
2970 
2971 	err = btrfs_orphan_add(trans, inode);
2972 	if (err)
2973 		goto out;
2974 
2975 	/* now the directory is empty */
2976 	err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
2977 				 dentry->d_name.name, dentry->d_name.len);
2978 	if (!err)
2979 		btrfs_i_size_write(inode, 0);
2980 out:
2981 	nr = trans->blocks_used;
2982 	__unlink_end_trans(trans, root);
2983 	btrfs_btree_balance_dirty(root, nr);
2984 
2985 	return err;
2986 }
2987 
2988 /*
2989  * this can truncate away extent items, csum items and directory items.
2990  * It starts at a high offset and removes keys until it can't find
2991  * any higher than new_size
2992  *
2993  * csum items that cross the new i_size are truncated to the new size
2994  * as well.
2995  *
2996  * min_type is the minimum key type to truncate down to.  If set to 0, this
2997  * will kill all the items on this inode, including the INODE_ITEM_KEY.
2998  */
2999 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3000 			       struct btrfs_root *root,
3001 			       struct inode *inode,
3002 			       u64 new_size, u32 min_type)
3003 {
3004 	struct btrfs_path *path;
3005 	struct extent_buffer *leaf;
3006 	struct btrfs_file_extent_item *fi;
3007 	struct btrfs_key key;
3008 	struct btrfs_key found_key;
3009 	u64 extent_start = 0;
3010 	u64 extent_num_bytes = 0;
3011 	u64 extent_offset = 0;
3012 	u64 item_end = 0;
3013 	u64 mask = root->sectorsize - 1;
3014 	u32 found_type = (u8)-1;
3015 	int found_extent;
3016 	int del_item;
3017 	int pending_del_nr = 0;
3018 	int pending_del_slot = 0;
3019 	int extent_type = -1;
3020 	int ret;
3021 	int err = 0;
3022 	u64 ino = btrfs_ino(inode);
3023 
3024 	BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
3025 
3026 	path = btrfs_alloc_path();
3027 	if (!path)
3028 		return -ENOMEM;
3029 	path->reada = -1;
3030 
3031 	if (root->ref_cows || root == root->fs_info->tree_root)
3032 		btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
3033 
3034 	/*
3035 	 * This function is also used to drop the items in the log tree before
3036 	 * we relog the inode, so if root != BTRFS_I(inode)->root, it means
3037 	 * it is used to drop the logged items. So we shouldn't kill the delayed
3038 	 * items.
3039 	 */
3040 	if (min_type == 0 && root == BTRFS_I(inode)->root)
3041 		btrfs_kill_delayed_inode_items(inode);
3042 
3043 	key.objectid = ino;
3044 	key.offset = (u64)-1;
3045 	key.type = (u8)-1;
3046 
3047 search_again:
3048 	path->leave_spinning = 1;
3049 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
3050 	if (ret < 0) {
3051 		err = ret;
3052 		goto out;
3053 	}
3054 
3055 	if (ret > 0) {
3056 		/* there are no items in the tree for us to truncate, we're
3057 		 * done
3058 		 */
3059 		if (path->slots[0] == 0)
3060 			goto out;
3061 		path->slots[0]--;
3062 	}
3063 
3064 	while (1) {
3065 		fi = NULL;
3066 		leaf = path->nodes[0];
3067 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3068 		found_type = btrfs_key_type(&found_key);
3069 
3070 		if (found_key.objectid != ino)
3071 			break;
3072 
3073 		if (found_type < min_type)
3074 			break;
3075 
3076 		item_end = found_key.offset;
3077 		if (found_type == BTRFS_EXTENT_DATA_KEY) {
3078 			fi = btrfs_item_ptr(leaf, path->slots[0],
3079 					    struct btrfs_file_extent_item);
3080 			extent_type = btrfs_file_extent_type(leaf, fi);
3081 			if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
3082 				item_end +=
3083 				    btrfs_file_extent_num_bytes(leaf, fi);
3084 			} else {
3085 				item_end += btrfs_file_extent_inline_len(leaf,
3086 									 fi);
3087 			}
3088 			item_end--;
3089 		}
3090 		if (found_type > min_type) {
3091 			del_item = 1;
3092 		} else {
3093 			if (item_end < new_size)
3094 				break;
3095 			if (found_key.offset >= new_size)
3096 				del_item = 1;
3097 			else
3098 				del_item = 0;
3099 		}
3100 		found_extent = 0;
3101 		/* FIXME, shrink the extent if the ref count is only 1 */
3102 		if (found_type != BTRFS_EXTENT_DATA_KEY)
3103 			goto delete;
3104 
3105 		if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
3106 			u64 num_dec;
3107 			extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
3108 			if (!del_item) {
3109 				u64 orig_num_bytes =
3110 					btrfs_file_extent_num_bytes(leaf, fi);
3111 				extent_num_bytes = new_size -
3112 					found_key.offset + root->sectorsize - 1;
3113 				extent_num_bytes = extent_num_bytes &
3114 					~((u64)root->sectorsize - 1);
3115 				btrfs_set_file_extent_num_bytes(leaf, fi,
3116 							 extent_num_bytes);
3117 				num_dec = (orig_num_bytes -
3118 					   extent_num_bytes);
3119 				if (root->ref_cows && extent_start != 0)
3120 					inode_sub_bytes(inode, num_dec);
3121 				btrfs_mark_buffer_dirty(leaf);
3122 			} else {
3123 				extent_num_bytes =
3124 					btrfs_file_extent_disk_num_bytes(leaf,
3125 									 fi);
3126 				extent_offset = found_key.offset -
3127 					btrfs_file_extent_offset(leaf, fi);
3128 
3129 				/* FIXME blocksize != 4096 */
3130 				num_dec = btrfs_file_extent_num_bytes(leaf, fi);
3131 				if (extent_start != 0) {
3132 					found_extent = 1;
3133 					if (root->ref_cows)
3134 						inode_sub_bytes(inode, num_dec);
3135 				}
3136 			}
3137 		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
3138 			/*
3139 			 * we can't truncate inline items that have had
3140 			 * special encodings
3141 			 */
3142 			if (!del_item &&
3143 			    btrfs_file_extent_compression(leaf, fi) == 0 &&
3144 			    btrfs_file_extent_encryption(leaf, fi) == 0 &&
3145 			    btrfs_file_extent_other_encoding(leaf, fi) == 0) {
3146 				u32 size = new_size - found_key.offset;
3147 
3148 				if (root->ref_cows) {
3149 					inode_sub_bytes(inode, item_end + 1 -
3150 							new_size);
3151 				}
3152 				size =
3153 				    btrfs_file_extent_calc_inline_size(size);
3154 				ret = btrfs_truncate_item(trans, root, path,
3155 							  size, 1);
3156 			} else if (root->ref_cows) {
3157 				inode_sub_bytes(inode, item_end + 1 -
3158 						found_key.offset);
3159 			}
3160 		}
3161 delete:
3162 		if (del_item) {
3163 			if (!pending_del_nr) {
3164 				/* no pending yet, add ourselves */
3165 				pending_del_slot = path->slots[0];
3166 				pending_del_nr = 1;
3167 			} else if (pending_del_nr &&
3168 				   path->slots[0] + 1 == pending_del_slot) {
3169 				/* hop on the pending chunk */
3170 				pending_del_nr++;
3171 				pending_del_slot = path->slots[0];
3172 			} else {
3173 				BUG();
3174 			}
3175 		} else {
3176 			break;
3177 		}
3178 		if (found_extent && (root->ref_cows ||
3179 				     root == root->fs_info->tree_root)) {
3180 			btrfs_set_path_blocking(path);
3181 			ret = btrfs_free_extent(trans, root, extent_start,
3182 						extent_num_bytes, 0,
3183 						btrfs_header_owner(leaf),
3184 						ino, extent_offset, 0);
3185 			BUG_ON(ret);
3186 		}
3187 
3188 		if (found_type == BTRFS_INODE_ITEM_KEY)
3189 			break;
3190 
3191 		if (path->slots[0] == 0 ||
3192 		    path->slots[0] != pending_del_slot) {
3193 			if (root->ref_cows &&
3194 			    BTRFS_I(inode)->location.objectid !=
3195 						BTRFS_FREE_INO_OBJECTID) {
3196 				err = -EAGAIN;
3197 				goto out;
3198 			}
3199 			if (pending_del_nr) {
3200 				ret = btrfs_del_items(trans, root, path,
3201 						pending_del_slot,
3202 						pending_del_nr);
3203 				BUG_ON(ret);
3204 				pending_del_nr = 0;
3205 			}
3206 			btrfs_release_path(path);
3207 			goto search_again;
3208 		} else {
3209 			path->slots[0]--;
3210 		}
3211 	}
3212 out:
3213 	if (pending_del_nr) {
3214 		ret = btrfs_del_items(trans, root, path, pending_del_slot,
3215 				      pending_del_nr);
3216 		BUG_ON(ret);
3217 	}
3218 	btrfs_free_path(path);
3219 	return err;
3220 }
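
/*
 * Editor's sketch, not part of the original source: the -EAGAIN
 * contract of btrfs_truncate_inode_items().  The function returns
 * -EAGAIN when the caller should drop its transaction handle and
 * retry, so one handle is not held across an arbitrarily large
 * deletion; btrfs_evict_inode() below uses the same pattern (with
 * block reservations on top).  The function name is illustrative only.
 */
static int example_truncate_everything(struct btrfs_root *root,
				       struct inode *inode)
{
	struct btrfs_trans_handle *trans;
	int ret;

	do {
		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans))
			return PTR_ERR(trans);
		ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
		btrfs_end_transaction(trans, root);
	} while (ret == -EAGAIN);

	return ret;
}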
3221 
3222 /*
3223  * taken from block_truncate_page, but does COW as it zeros out
3224  * any bytes left in the last page in the file.
3225  */
3226 static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3227 {
3228 	struct inode *inode = mapping->host;
3229 	struct btrfs_root *root = BTRFS_I(inode)->root;
3230 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3231 	struct btrfs_ordered_extent *ordered;
3232 	struct extent_state *cached_state = NULL;
3233 	char *kaddr;
3234 	u32 blocksize = root->sectorsize;
3235 	pgoff_t index = from >> PAGE_CACHE_SHIFT;
3236 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
3237 	struct page *page;
3238 	gfp_t mask = btrfs_alloc_write_mask(mapping);
3239 	int ret = 0;
3240 	u64 page_start;
3241 	u64 page_end;
3242 
3243 	if ((offset & (blocksize - 1)) == 0)
3244 		goto out;
3245 	ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
3246 	if (ret)
3247 		goto out;
3248 
3249 	ret = -ENOMEM;
3250 again:
3251 	page = find_or_create_page(mapping, index, mask);
3252 	if (!page) {
3253 		btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3254 		goto out;
3255 	}
3256 
3257 	page_start = page_offset(page);
3258 	page_end = page_start + PAGE_CACHE_SIZE - 1;
3259 
3260 	if (!PageUptodate(page)) {
3261 		ret = btrfs_readpage(NULL, page);
3262 		lock_page(page);
3263 		if (page->mapping != mapping) {
3264 			unlock_page(page);
3265 			page_cache_release(page);
3266 			goto again;
3267 		}
3268 		if (!PageUptodate(page)) {
3269 			ret = -EIO;
3270 			goto out_unlock;
3271 		}
3272 	}
3273 	wait_on_page_writeback(page);
3274 
3275 	lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state,
3276 			 GFP_NOFS);
3277 	set_page_extent_mapped(page);
3278 
3279 	ordered = btrfs_lookup_ordered_extent(inode, page_start);
3280 	if (ordered) {
3281 		unlock_extent_cached(io_tree, page_start, page_end,
3282 				     &cached_state, GFP_NOFS);
3283 		unlock_page(page);
3284 		page_cache_release(page);
3285 		btrfs_start_ordered_extent(inode, ordered, 1);
3286 		btrfs_put_ordered_extent(ordered);
3287 		goto again;
3288 	}
3289 
3290 	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
3291 			  EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
3292 			  0, 0, &cached_state, GFP_NOFS);
3293 
3294 	ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
3295 					&cached_state);
3296 	if (ret) {
3297 		unlock_extent_cached(io_tree, page_start, page_end,
3298 				     &cached_state, GFP_NOFS);
3299 		goto out_unlock;
3300 	}
3301 
3302 	ret = 0;
3303 	if (offset != PAGE_CACHE_SIZE) {
3304 		kaddr = kmap(page);
3305 		memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
3306 		flush_dcache_page(page);
3307 		kunmap(page);
3308 	}
3309 	ClearPageChecked(page);
3310 	set_page_dirty(page);
3311 	unlock_extent_cached(io_tree, page_start, page_end, &cached_state,
3312 			     GFP_NOFS);
3313 
3314 out_unlock:
3315 	if (ret)
3316 		btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3317 	unlock_page(page);
3318 	page_cache_release(page);
3319 out:
3320 	return ret;
3321 }
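
/*
 * Editor's sketch, not part of the original source: typical use of
 * btrfs_truncate_page().  When i_size shrinks to an offset that is not
 * block aligned, the tail of the final block must be zeroed so a later
 * expansion of the file cannot expose stale data; the helper returns 0
 * immediately for aligned offsets.  The function name is illustrative
 * only.
 */
static int example_zero_tail_block(struct inode *inode, loff_t new_size)
{
	return btrfs_truncate_page(inode->i_mapping, new_size);
}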
3322 
3323 /*
3324  * This function puts in dummy file extents for the area we're creating a hole
3325  * for.  So if we are truncating this file to a larger size we need to insert
3326  * these file extents so that btrfs_get_extent will return an EXTENT_MAP_HOLE
3327  * for the range between oldsize and size.
3328  */
3329 int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3330 {
3331 	struct btrfs_trans_handle *trans;
3332 	struct btrfs_root *root = BTRFS_I(inode)->root;
3333 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3334 	struct extent_map *em = NULL;
3335 	struct extent_state *cached_state = NULL;
3336 	u64 mask = root->sectorsize - 1;
3337 	u64 hole_start = (oldsize + mask) & ~mask;
3338 	u64 block_end = (size + mask) & ~mask;
3339 	u64 last_byte;
3340 	u64 cur_offset;
3341 	u64 hole_size;
3342 	int err = 0;
3343 
3344 	if (size <= hole_start)
3345 		return 0;
3346 
3347 	while (1) {
3348 		struct btrfs_ordered_extent *ordered;
3349 		btrfs_wait_ordered_range(inode, hole_start,
3350 					 block_end - hole_start);
3351 		lock_extent_bits(io_tree, hole_start, block_end - 1, 0,
3352 				 &cached_state, GFP_NOFS);
3353 		ordered = btrfs_lookup_ordered_extent(inode, hole_start);
3354 		if (!ordered)
3355 			break;
3356 		unlock_extent_cached(io_tree, hole_start, block_end - 1,
3357 				     &cached_state, GFP_NOFS);
3358 		btrfs_put_ordered_extent(ordered);
3359 	}
3360 
3361 	cur_offset = hole_start;
3362 	while (1) {
3363 		em = btrfs_get_extent(inode, NULL, 0, cur_offset,
3364 				block_end - cur_offset, 0);
3365 		BUG_ON(IS_ERR_OR_NULL(em));
3366 		last_byte = min(extent_map_end(em), block_end);
3367 		last_byte = (last_byte + mask) & ~mask;
3368 		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
3369 			u64 hint_byte = 0;
3370 			hole_size = last_byte - cur_offset;
3371 
3372 			trans = btrfs_start_transaction(root, 3);
3373 			if (IS_ERR(trans)) {
3374 				err = PTR_ERR(trans);
3375 				break;
3376 			}
3377 
3378 			err = btrfs_drop_extents(trans, inode, cur_offset,
3379 						 cur_offset + hole_size,
3380 						 &hint_byte, 1);
3381 			if (err) {
3382 				btrfs_update_inode(trans, root, inode);
3383 				btrfs_end_transaction(trans, root);
3384 				break;
3385 			}
3386 
3387 			err = btrfs_insert_file_extent(trans, root,
3388 					btrfs_ino(inode), cur_offset, 0,
3389 					0, hole_size, 0, hole_size,
3390 					0, 0, 0);
3391 			if (err) {
3392 				btrfs_update_inode(trans, root, inode);
3393 				btrfs_end_transaction(trans, root);
3394 				break;
3395 			}
3396 
3397 			btrfs_drop_extent_cache(inode, hole_start,
3398 					last_byte - 1, 0);
3399 
3400 			btrfs_update_inode(trans, root, inode);
3401 			btrfs_end_transaction(trans, root);
3402 		}
3403 		free_extent_map(em);
3404 		em = NULL;
3405 		cur_offset = last_byte;
3406 		if (cur_offset >= block_end)
3407 			break;
3408 	}
3409 
3410 	free_extent_map(em);
3411 	unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
3412 			     GFP_NOFS);
3413 	return err;
3414 }
3415 
3416 static int btrfs_setsize(struct inode *inode, loff_t newsize)
3417 {
3418 	struct btrfs_root *root = BTRFS_I(inode)->root;
3419 	struct btrfs_trans_handle *trans;
3420 	loff_t oldsize = i_size_read(inode);
3421 	int ret;
3422 
3423 	if (newsize == oldsize)
3424 		return 0;
3425 
3426 	if (newsize > oldsize) {
3427 		truncate_pagecache(inode, oldsize, newsize);
3428 		ret = btrfs_cont_expand(inode, oldsize, newsize);
3429 		if (ret)
3430 			return ret;
3431 
3432 		trans = btrfs_start_transaction(root, 1);
3433 		if (IS_ERR(trans))
3434 			return PTR_ERR(trans);
3435 
3436 		i_size_write(inode, newsize);
3437 		btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
3438 		ret = btrfs_update_inode(trans, root, inode);
3439 		btrfs_end_transaction(trans, root);
3440 	} else {
3441 
3442 		/*
3443 		 * We're truncating a file that used to have good data down to
3444 		 * zero. Make sure it gets into the ordered flush list so that
3445 		 * any new writes get down to disk quickly.
3446 		 */
3447 		if (newsize == 0)
3448 			BTRFS_I(inode)->ordered_data_close = 1;
3449 
3450 		/* we don't support swapfiles, so vmtruncate shouldn't fail */
3451 		truncate_setsize(inode, newsize);
3452 		ret = btrfs_truncate(inode);
3453 	}
3454 
3455 	return ret;
3456 }
3457 
3458 static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
3459 {
3460 	struct inode *inode = dentry->d_inode;
3461 	struct btrfs_root *root = BTRFS_I(inode)->root;
3462 	int err;
3463 
3464 	if (btrfs_root_readonly(root))
3465 		return -EROFS;
3466 
3467 	err = inode_change_ok(inode, attr);
3468 	if (err)
3469 		return err;
3470 
3471 	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
3472 		err = btrfs_setsize(inode, attr->ia_size);
3473 		if (err)
3474 			return err;
3475 	}
3476 
3477 	if (attr->ia_valid) {
3478 		setattr_copy(inode, attr);
3479 		err = btrfs_dirty_inode(inode);
3480 
3481 		if (!err && attr->ia_valid & ATTR_MODE)
3482 			err = btrfs_acl_chmod(inode);
3483 	}
3484 
3485 	return err;
3486 }
3487 
3488 void btrfs_evict_inode(struct inode *inode)
3489 {
3490 	struct btrfs_trans_handle *trans;
3491 	struct btrfs_root *root = BTRFS_I(inode)->root;
3492 	struct btrfs_block_rsv *rsv, *global_rsv;
3493 	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
3494 	unsigned long nr;
3495 	int ret;
3496 
3497 	trace_btrfs_inode_evict(inode);
3498 
3499 	truncate_inode_pages(&inode->i_data, 0);
3500 	if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 ||
3501 			       btrfs_is_free_space_inode(root, inode)))
3502 		goto no_delete;
3503 
3504 	if (is_bad_inode(inode)) {
3505 		btrfs_orphan_del(NULL, inode);
3506 		goto no_delete;
3507 	}
3508 	/* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
3509 	btrfs_wait_ordered_range(inode, 0, (u64)-1);
3510 
3511 	if (root->fs_info->log_root_recovering) {
3512 		BUG_ON(!list_empty(&BTRFS_I(inode)->i_orphan));
3513 		goto no_delete;
3514 	}
3515 
3516 	if (inode->i_nlink > 0) {
3517 		BUG_ON(btrfs_root_refs(&root->root_item) != 0);
3518 		goto no_delete;
3519 	}
3520 
3521 	rsv = btrfs_alloc_block_rsv(root);
3522 	if (!rsv) {
3523 		btrfs_orphan_del(NULL, inode);
3524 		goto no_delete;
3525 	}
3526 	rsv->size = min_size;
3527 	global_rsv = &root->fs_info->global_block_rsv;
3528 
3529 	btrfs_i_size_write(inode, 0);
3530 
3531 	/*
3532 	 * This is a bit simpler than btrfs_truncate since
3533 	 *
3534 	 * 1) We've already reserved our space for our orphan item in the
3535 	 *    unlink.
3536 	 * 2) We're going to delete the inode item, so we don't need to update
3537 	 *    it at all.
3538 	 *
3539 	 * So we just need to reserve some slack space in case we add bytes when
3540 	 * doing the truncate.
3541 	 */
3542 	while (1) {
3543 		ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size);
3544 
3545 		/*
3546 		 * Try to steal from the global reserve since we will
3547 		 * likely not use this space anyway; we want to try as
3548 		 * hard as possible to get this to work.
3549 		 */
3550 		if (ret)
3551 			ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size);
3552 
3553 		if (ret) {
3554 			printk(KERN_WARNING "Could not get space for a "
3555 			       "delete, will truncate on mount %d\n", ret);
3556 			btrfs_orphan_del(NULL, inode);
3557 			btrfs_free_block_rsv(root, rsv);
3558 			goto no_delete;
3559 		}
3560 
3561 		trans = btrfs_start_transaction(root, 0);
3562 		if (IS_ERR(trans)) {
3563 			btrfs_orphan_del(NULL, inode);
3564 			btrfs_free_block_rsv(root, rsv);
3565 			goto no_delete;
3566 		}
3567 
3568 		trans->block_rsv = rsv;
3569 
3570 		ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
3571 		if (ret != -EAGAIN)
3572 			break;
3573 
3574 		nr = trans->blocks_used;
3575 		btrfs_end_transaction(trans, root);
3576 		trans = NULL;
3577 		btrfs_btree_balance_dirty(root, nr);
3578 	}
3579 
3580 	btrfs_free_block_rsv(root, rsv);
3581 
3582 	if (ret == 0) {
3583 		trans->block_rsv = root->orphan_block_rsv;
3584 		ret = btrfs_orphan_del(trans, inode);
3585 		BUG_ON(ret);
3586 	}
3587 
3588 	trans->block_rsv = &root->fs_info->trans_block_rsv;
3589 	if (!(root == root->fs_info->tree_root ||
3590 	      root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
3591 		btrfs_return_ino(root, btrfs_ino(inode));
3592 
3593 	nr = trans->blocks_used;
3594 	btrfs_end_transaction(trans, root);
3595 	btrfs_btree_balance_dirty(root, nr);
3596 no_delete:
3597 	end_writeback(inode);
3598 	return;
3599 }
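
/*
 * Editor's note: illustrative sketch (not part of the original source) of
 * the refill-or-steal reservation pattern used by the eviction loop above;
 * the helper name is hypothetical, the two callees are the ones used there.
 */
#if 0
static int evict_refill_or_steal(struct btrfs_root *root,
				 struct btrfs_block_rsv *rsv, u64 min_size)
{
	int ret;

	/* first try to top up the private reserve without flushing */
	ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size);
	if (ret)
		/* then try to steal from the global reserve */
		ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv,
					      rsv, min_size);
	return ret;
}
#endif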
3600 
3601 /*
3602  * this returns the key found in the dir entry in the location pointer.
3603  * If no dir entries were found, location->objectid is 0.
3604  */
3605 static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
3606 			       struct btrfs_key *location)
3607 {
3608 	const char *name = dentry->d_name.name;
3609 	int namelen = dentry->d_name.len;
3610 	struct btrfs_dir_item *di;
3611 	struct btrfs_path *path;
3612 	struct btrfs_root *root = BTRFS_I(dir)->root;
3613 	int ret = 0;
3614 
3615 	path = btrfs_alloc_path();
3616 	if (!path)
3617 		return -ENOMEM;
3618 
3619 	di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name,
3620 				    namelen, 0);
3621 	if (IS_ERR(di))
3622 		ret = PTR_ERR(di);
3623 
3624 	if (IS_ERR_OR_NULL(di))
3625 		goto out_err;
3626 
3627 	btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
3628 out:
3629 	btrfs_free_path(path);
3630 	return ret;
3631 out_err:
3632 	location->objectid = 0;
3633 	goto out;
3634 }
3635 
3636 /*
3637  * when we hit a tree root in a directory, the btrfs part of the inode
3638  * needs to be changed to reflect the root directory of the tree root.  This
3639  * is kind of like crossing a mount point.
3640  */
3641 static int fixup_tree_root_location(struct btrfs_root *root,
3642 				    struct inode *dir,
3643 				    struct dentry *dentry,
3644 				    struct btrfs_key *location,
3645 				    struct btrfs_root **sub_root)
3646 {
3647 	struct btrfs_path *path;
3648 	struct btrfs_root *new_root;
3649 	struct btrfs_root_ref *ref;
3650 	struct extent_buffer *leaf;
3651 	int ret;
3652 	int err = 0;
3653 
3654 	path = btrfs_alloc_path();
3655 	if (!path) {
3656 		err = -ENOMEM;
3657 		goto out;
3658 	}
3659 
3660 	err = -ENOENT;
3661 	ret = btrfs_find_root_ref(root->fs_info->tree_root, path,
3662 				  BTRFS_I(dir)->root->root_key.objectid,
3663 				  location->objectid);
3664 	if (ret) {
3665 		if (ret < 0)
3666 			err = ret;
3667 		goto out;
3668 	}
3669 
3670 	leaf = path->nodes[0];
3671 	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
3672 	if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) ||
3673 	    btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
3674 		goto out;
3675 
3676 	ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
3677 				   (unsigned long)(ref + 1),
3678 				   dentry->d_name.len);
3679 	if (ret)
3680 		goto out;
3681 
3682 	btrfs_release_path(path);
3683 
3684 	new_root = btrfs_read_fs_root_no_name(root->fs_info, location);
3685 	if (IS_ERR(new_root)) {
3686 		err = PTR_ERR(new_root);
3687 		goto out;
3688 	}
3689 
3690 	if (btrfs_root_refs(&new_root->root_item) == 0) {
3691 		err = -ENOENT;
3692 		goto out;
3693 	}
3694 
3695 	*sub_root = new_root;
3696 	location->objectid = btrfs_root_dirid(&new_root->root_item);
3697 	location->type = BTRFS_INODE_ITEM_KEY;
3698 	location->offset = 0;
3699 	err = 0;
3700 out:
3701 	btrfs_free_path(path);
3702 	return err;
3703 }
3704 
3705 static void inode_tree_add(struct inode *inode)
3706 {
3707 	struct btrfs_root *root = BTRFS_I(inode)->root;
3708 	struct btrfs_inode *entry;
3709 	struct rb_node **p;
3710 	struct rb_node *parent;
3711 	u64 ino = btrfs_ino(inode);
3712 again:
3713 	p = &root->inode_tree.rb_node;
3714 	parent = NULL;
3715 
3716 	if (inode_unhashed(inode))
3717 		return;
3718 
3719 	spin_lock(&root->inode_lock);
3720 	while (*p) {
3721 		parent = *p;
3722 		entry = rb_entry(parent, struct btrfs_inode, rb_node);
3723 
3724 		if (ino < btrfs_ino(&entry->vfs_inode))
3725 			p = &parent->rb_left;
3726 		else if (ino > btrfs_ino(&entry->vfs_inode))
3727 			p = &parent->rb_right;
3728 		else {
3729 			WARN_ON(!(entry->vfs_inode.i_state &
3730 				  (I_WILL_FREE | I_FREEING)));
3731 			rb_erase(parent, &root->inode_tree);
3732 			RB_CLEAR_NODE(parent);
3733 			spin_unlock(&root->inode_lock);
3734 			goto again;
3735 		}
3736 	}
3737 	rb_link_node(&BTRFS_I(inode)->rb_node, parent, p);
3738 	rb_insert_color(&BTRFS_I(inode)->rb_node, &root->inode_tree);
3739 	spin_unlock(&root->inode_lock);
3740 }
3741 
3742 static void inode_tree_del(struct inode *inode)
3743 {
3744 	struct btrfs_root *root = BTRFS_I(inode)->root;
3745 	int empty = 0;
3746 
3747 	spin_lock(&root->inode_lock);
3748 	if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
3749 		rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
3750 		RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
3751 		empty = RB_EMPTY_ROOT(&root->inode_tree);
3752 	}
3753 	spin_unlock(&root->inode_lock);
3754 
3755 	/*
3756 	 * Free space cache has inodes in the tree root, but the tree root has a
3757 	 * root_refs of 0, so this could end up dropping the tree root as a
3758 	 * snapshot, so we need the extra !root->fs_info->tree_root check to
3759 	 * make sure we don't drop it.
3760 	 */
3761 	if (empty && btrfs_root_refs(&root->root_item) == 0 &&
3762 	    root != root->fs_info->tree_root) {
3763 		synchronize_srcu(&root->fs_info->subvol_srcu);
3764 		spin_lock(&root->inode_lock);
3765 		empty = RB_EMPTY_ROOT(&root->inode_tree);
3766 		spin_unlock(&root->inode_lock);
3767 		if (empty)
3768 			btrfs_add_dead_root(root);
3769 	}
3770 }
3771 
3772 int btrfs_invalidate_inodes(struct btrfs_root *root)
3773 {
3774 	struct rb_node *node;
3775 	struct rb_node *prev;
3776 	struct btrfs_inode *entry;
3777 	struct inode *inode;
3778 	u64 objectid = 0;
3779 
3780 	WARN_ON(btrfs_root_refs(&root->root_item) != 0);
3781 
3782 	spin_lock(&root->inode_lock);
3783 again:
3784 	node = root->inode_tree.rb_node;
3785 	prev = NULL;
3786 	while (node) {
3787 		prev = node;
3788 		entry = rb_entry(node, struct btrfs_inode, rb_node);
3789 
3790 		if (objectid < btrfs_ino(&entry->vfs_inode))
3791 			node = node->rb_left;
3792 		else if (objectid > btrfs_ino(&entry->vfs_inode))
3793 			node = node->rb_right;
3794 		else
3795 			break;
3796 	}
3797 	if (!node) {
3798 		while (prev) {
3799 			entry = rb_entry(prev, struct btrfs_inode, rb_node);
3800 			if (objectid <= btrfs_ino(&entry->vfs_inode)) {
3801 				node = prev;
3802 				break;
3803 			}
3804 			prev = rb_next(prev);
3805 		}
3806 	}
3807 	while (node) {
3808 		entry = rb_entry(node, struct btrfs_inode, rb_node);
3809 		objectid = btrfs_ino(&entry->vfs_inode) + 1;
3810 		inode = igrab(&entry->vfs_inode);
3811 		if (inode) {
3812 			spin_unlock(&root->inode_lock);
3813 			if (atomic_read(&inode->i_count) > 1)
3814 				d_prune_aliases(inode);
3815 			/*
3816 			 * btrfs_drop_inode will have it removed from
3817 			 * the inode cache when its usage count
3818 			 * hits zero.
3819 			 */
3820 			iput(inode);
3821 			cond_resched();
3822 			spin_lock(&root->inode_lock);
3823 			goto again;
3824 		}
3825 
3826 		if (cond_resched_lock(&root->inode_lock))
3827 			goto again;
3828 
3829 		node = rb_next(node);
3830 	}
3831 	spin_unlock(&root->inode_lock);
3832 	return 0;
3833 }
3834 
3835 static int btrfs_init_locked_inode(struct inode *inode, void *p)
3836 {
3837 	struct btrfs_iget_args *args = p;
3838 	inode->i_ino = args->ino;
3839 	BTRFS_I(inode)->root = args->root;
3840 	btrfs_set_inode_space_info(args->root, inode);
3841 	return 0;
3842 }
3843 
3844 static int btrfs_find_actor(struct inode *inode, void *opaque)
3845 {
3846 	struct btrfs_iget_args *args = opaque;
3847 	return args->ino == btrfs_ino(inode) &&
3848 		args->root == BTRFS_I(inode)->root;
3849 }
3850 
3851 static struct inode *btrfs_iget_locked(struct super_block *s,
3852 				       u64 objectid,
3853 				       struct btrfs_root *root)
3854 {
3855 	struct inode *inode;
3856 	struct btrfs_iget_args args;
3857 	args.ino = objectid;
3858 	args.root = root;
3859 
3860 	inode = iget5_locked(s, objectid, btrfs_find_actor,
3861 			     btrfs_init_locked_inode,
3862 			     (void *)&args);
3863 	return inode;
3864 }
3865 
3866 /* Get an inode object given its location and corresponding root.
3867  * Returns in *new whether the inode was read from disk.
3868  */
3869 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
3870 			 struct btrfs_root *root, int *new)
3871 {
3872 	struct inode *inode;
3873 
3874 	inode = btrfs_iget_locked(s, location->objectid, root);
3875 	if (!inode)
3876 		return ERR_PTR(-ENOMEM);
3877 
3878 	if (inode->i_state & I_NEW) {
3879 		BTRFS_I(inode)->root = root;
3880 		memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
3881 		btrfs_read_locked_inode(inode);
3882 		if (!is_bad_inode(inode)) {
3883 			inode_tree_add(inode);
3884 			unlock_new_inode(inode);
3885 			if (new)
3886 				*new = 1;
3887 		} else {
3888 			unlock_new_inode(inode);
3889 			iput(inode);
3890 			inode = ERR_PTR(-ESTALE);
3891 		}
3892 	}
3893 
3894 	return inode;
3895 }
3896 
3897 static struct inode *new_simple_dir(struct super_block *s,
3898 				    struct btrfs_key *key,
3899 				    struct btrfs_root *root)
3900 {
3901 	struct inode *inode = new_inode(s);
3902 
3903 	if (!inode)
3904 		return ERR_PTR(-ENOMEM);
3905 
3906 	BTRFS_I(inode)->root = root;
3907 	memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
3908 	BTRFS_I(inode)->dummy_inode = 1;
3909 
3910 	inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
3911 	inode->i_op = &simple_dir_inode_operations;
3912 	inode->i_fop = &simple_dir_operations;
3913 	inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
3914 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
3915 
3916 	return inode;
3917 }
3918 
3919 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
3920 {
3921 	struct inode *inode;
3922 	struct btrfs_root *root = BTRFS_I(dir)->root;
3923 	struct btrfs_root *sub_root = root;
3924 	struct btrfs_key location;
3925 	int index;
3926 	int ret = 0;
3927 
3928 	if (dentry->d_name.len > BTRFS_NAME_LEN)
3929 		return ERR_PTR(-ENAMETOOLONG);
3930 
3931 	if (unlikely(d_need_lookup(dentry))) {
3932 		memcpy(&location, dentry->d_fsdata, sizeof(struct btrfs_key));
3933 		kfree(dentry->d_fsdata);
3934 		dentry->d_fsdata = NULL;
3935 		/* This thing is hashed, drop it for now */
3936 		d_drop(dentry);
3937 	} else {
3938 		ret = btrfs_inode_by_name(dir, dentry, &location);
3939 	}
3940 
3941 	if (ret < 0)
3942 		return ERR_PTR(ret);
3943 
3944 	if (location.objectid == 0)
3945 		return NULL;
3946 
3947 	if (location.type == BTRFS_INODE_ITEM_KEY) {
3948 		inode = btrfs_iget(dir->i_sb, &location, root, NULL);
3949 		return inode;
3950 	}
3951 
3952 	BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY);
3953 
3954 	index = srcu_read_lock(&root->fs_info->subvol_srcu);
3955 	ret = fixup_tree_root_location(root, dir, dentry,
3956 				       &location, &sub_root);
3957 	if (ret < 0) {
3958 		if (ret != -ENOENT)
3959 			inode = ERR_PTR(ret);
3960 		else
3961 			inode = new_simple_dir(dir->i_sb, &location, sub_root);
3962 	} else {
3963 		inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL);
3964 	}
3965 	srcu_read_unlock(&root->fs_info->subvol_srcu, index);
3966 
3967 	if (!IS_ERR(inode) && root != sub_root) {
3968 		down_read(&root->fs_info->cleanup_work_sem);
3969 		if (!(inode->i_sb->s_flags & MS_RDONLY))
3970 			ret = btrfs_orphan_cleanup(sub_root);
3971 		up_read(&root->fs_info->cleanup_work_sem);
3972 		if (ret)
3973 			inode = ERR_PTR(ret);
3974 	}
3975 
3976 	return inode;
3977 }
3978 
3979 static int btrfs_dentry_delete(const struct dentry *dentry)
3980 {
3981 	struct btrfs_root *root;
3982 
3983 	if (!dentry->d_inode && !IS_ROOT(dentry))
3984 		dentry = dentry->d_parent;
3985 
3986 	if (dentry->d_inode) {
3987 		root = BTRFS_I(dentry->d_inode)->root;
3988 		if (btrfs_root_refs(&root->root_item) == 0)
3989 			return 1;
3990 	}
3991 	return 0;
3992 }
3993 
3994 static void btrfs_dentry_release(struct dentry *dentry)
3995 {
3996 	if (dentry->d_fsdata)
3997 		kfree(dentry->d_fsdata);
3998 }
3999 
4000 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
4001 				   struct nameidata *nd)
4002 {
4003 	struct dentry *ret;
4004 
4005 	ret = d_splice_alias(btrfs_lookup_dentry(dir, dentry), dentry);
4006 	if (unlikely(d_need_lookup(dentry))) {
4007 		spin_lock(&dentry->d_lock);
4008 		dentry->d_flags &= ~DCACHE_NEED_LOOKUP;
4009 		spin_unlock(&dentry->d_lock);
4010 	}
4011 	return ret;
4012 }
4013 
4014 unsigned char btrfs_filetype_table[] = {
4015 	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
4016 };
4017 
4018 static int btrfs_real_readdir(struct file *filp, void *dirent,
4019 			      filldir_t filldir)
4020 {
4021 	struct inode *inode = filp->f_dentry->d_inode;
4022 	struct btrfs_root *root = BTRFS_I(inode)->root;
4023 	struct btrfs_item *item;
4024 	struct btrfs_dir_item *di;
4025 	struct btrfs_key key;
4026 	struct btrfs_key found_key;
4027 	struct btrfs_path *path;
4028 	struct list_head ins_list;
4029 	struct list_head del_list;
4030 	struct qstr q;
4031 	int ret;
4032 	struct extent_buffer *leaf;
4033 	int slot;
4034 	unsigned char d_type;
4035 	int over = 0;
4036 	u32 di_cur;
4037 	u32 di_total;
4038 	u32 di_len;
4039 	int key_type = BTRFS_DIR_INDEX_KEY;
4040 	char tmp_name[32];
4041 	char *name_ptr;
4042 	int name_len;
4043 	int is_curr = 0;	/* filp->f_pos points to the current index? */
4044 
4045 	/* FIXME, use a real flag for deciding about the key type */
4046 	if (root->fs_info->tree_root == root)
4047 		key_type = BTRFS_DIR_ITEM_KEY;
4048 
4049 	/* special case for "." */
4050 	if (filp->f_pos == 0) {
4051 		over = filldir(dirent, ".", 1,
4052 			       filp->f_pos, btrfs_ino(inode), DT_DIR);
4053 		if (over)
4054 			return 0;
4055 		filp->f_pos = 1;
4056 	}
4057 	/* special case for .., just use the back ref */
4058 	if (filp->f_pos == 1) {
4059 		u64 pino = parent_ino(filp->f_path.dentry);
4060 		over = filldir(dirent, "..", 2,
4061 			       filp->f_pos, pino, DT_DIR);
4062 		if (over)
4063 			return 0;
4064 		filp->f_pos = 2;
4065 	}
4066 	path = btrfs_alloc_path();
4067 	if (!path)
4068 		return -ENOMEM;
4069 
4070 	path->reada = 1;
4071 
4072 	if (key_type == BTRFS_DIR_INDEX_KEY) {
4073 		INIT_LIST_HEAD(&ins_list);
4074 		INIT_LIST_HEAD(&del_list);
4075 		btrfs_get_delayed_items(inode, &ins_list, &del_list);
4076 	}
4077 
4078 	btrfs_set_key_type(&key, key_type);
4079 	key.offset = filp->f_pos;
4080 	key.objectid = btrfs_ino(inode);
4081 
4082 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4083 	if (ret < 0)
4084 		goto err;
4085 
4086 	while (1) {
4087 		leaf = path->nodes[0];
4088 		slot = path->slots[0];
4089 		if (slot >= btrfs_header_nritems(leaf)) {
4090 			ret = btrfs_next_leaf(root, path);
4091 			if (ret < 0)
4092 				goto err;
4093 			else if (ret > 0)
4094 				break;
4095 			continue;
4096 		}
4097 
4098 		item = btrfs_item_nr(leaf, slot);
4099 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
4100 
4101 		if (found_key.objectid != key.objectid)
4102 			break;
4103 		if (btrfs_key_type(&found_key) != key_type)
4104 			break;
4105 		if (found_key.offset < filp->f_pos)
4106 			goto next;
4107 		if (key_type == BTRFS_DIR_INDEX_KEY &&
4108 		    btrfs_should_delete_dir_index(&del_list,
4109 						  found_key.offset))
4110 			goto next;
4111 
4112 		filp->f_pos = found_key.offset;
4113 		is_curr = 1;
4114 
4115 		di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
4116 		di_cur = 0;
4117 		di_total = btrfs_item_size(leaf, item);
4118 
4119 		while (di_cur < di_total) {
4120 			struct btrfs_key location;
4121 			struct dentry *tmp;
4122 
4123 			if (verify_dir_item(root, leaf, di))
4124 				break;
4125 
4126 			name_len = btrfs_dir_name_len(leaf, di);
4127 			if (name_len <= sizeof(tmp_name)) {
4128 				name_ptr = tmp_name;
4129 			} else {
4130 				name_ptr = kmalloc(name_len, GFP_NOFS);
4131 				if (!name_ptr) {
4132 					ret = -ENOMEM;
4133 					goto err;
4134 				}
4135 			}
4136 			read_extent_buffer(leaf, name_ptr,
4137 					   (unsigned long)(di + 1), name_len);
4138 
4139 			d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
4140 			btrfs_dir_item_key_to_cpu(leaf, di, &location);
4141 
4142 			q.name = name_ptr;
4143 			q.len = name_len;
4144 			q.hash = full_name_hash(q.name, q.len);
4145 			tmp = d_lookup(filp->f_dentry, &q);
4146 			if (!tmp) {
4147 				struct btrfs_key *newkey;
4148 
4149 				newkey = kzalloc(sizeof(struct btrfs_key),
4150 						 GFP_NOFS);
4151 				if (!newkey)
4152 					goto no_dentry;
4153 				tmp = d_alloc(filp->f_dentry, &q);
4154 				if (!tmp) {
4155 					kfree(newkey);
4156 					dput(tmp);
4157 					goto no_dentry;
4158 				}
4159 				memcpy(newkey, &location,
4160 				       sizeof(struct btrfs_key));
4161 				tmp->d_fsdata = newkey;
4162 				tmp->d_flags |= DCACHE_NEED_LOOKUP;
4163 				d_rehash(tmp);
4164 				dput(tmp);
4165 			} else {
4166 				dput(tmp);
4167 			}
4168 no_dentry:
4169 			/* is this a reference to our own snapshot? If so
4170 			 * skip it
4171 			 */
4172 			if (location.type == BTRFS_ROOT_ITEM_KEY &&
4173 			    location.objectid == root->root_key.objectid) {
4174 				over = 0;
4175 				goto skip;
4176 			}
4177 			over = filldir(dirent, name_ptr, name_len,
4178 				       found_key.offset, location.objectid,
4179 				       d_type);
4180 
4181 skip:
4182 			if (name_ptr != tmp_name)
4183 				kfree(name_ptr);
4184 
4185 			if (over)
4186 				goto nopos;
4187 			di_len = btrfs_dir_name_len(leaf, di) +
4188 				 btrfs_dir_data_len(leaf, di) + sizeof(*di);
4189 			di_cur += di_len;
4190 			di = (struct btrfs_dir_item *)((char *)di + di_len);
4191 		}
4192 next:
4193 		path->slots[0]++;
4194 	}
4195 
4196 	if (key_type == BTRFS_DIR_INDEX_KEY) {
4197 		if (is_curr)
4198 			filp->f_pos++;
4199 		ret = btrfs_readdir_delayed_dir_index(filp, dirent, filldir,
4200 						      &ins_list);
4201 		if (ret)
4202 			goto nopos;
4203 	}
4204 
4205 	/* Reached end of directory/root. Bump pos past the last item. */
4206 	if (key_type == BTRFS_DIR_INDEX_KEY)
4207 		/*
4208 		 * 32-bit glibc will use getdents64, but then strtol,
4209 		 * so 0x7fffffff is the largest offset we can serve.
4210 		 */
4211 		filp->f_pos = 0x7fffffff;
4212 	else
4213 		filp->f_pos++;
4214 nopos:
4215 	ret = 0;
4216 err:
4217 	if (key_type == BTRFS_DIR_INDEX_KEY)
4218 		btrfs_put_delayed_items(&ins_list, &del_list);
4219 	btrfs_free_path(path);
4220 	return ret;
4221 }
4222 
4223 int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
4224 {
4225 	struct btrfs_root *root = BTRFS_I(inode)->root;
4226 	struct btrfs_trans_handle *trans;
4227 	int ret = 0;
4228 	bool nolock = false;
4229 
4230 	if (BTRFS_I(inode)->dummy_inode)
4231 		return 0;
4232 
4233 	if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(root, inode))
4234 		nolock = true;
4235 
4236 	if (wbc->sync_mode == WB_SYNC_ALL) {
4237 		if (nolock)
4238 			trans = btrfs_join_transaction_nolock(root);
4239 		else
4240 			trans = btrfs_join_transaction(root);
4241 		if (IS_ERR(trans))
4242 			return PTR_ERR(trans);
4243 		if (nolock)
4244 			ret = btrfs_end_transaction_nolock(trans, root);
4245 		else
4246 			ret = btrfs_commit_transaction(trans, root);
4247 	}
4248 	return ret;
4249 }
4250 
4251 /*
4252  * This is somewhat expensive, updating the tree every time the
4253  * inode changes.  But, it is most likely to find the inode in cache.
4254  * FIXME, needs more benchmarking...there are no reasons other than performance
4255  * to keep or drop this code.
4256  */
4257 int btrfs_dirty_inode(struct inode *inode)
4258 {
4259 	struct btrfs_root *root = BTRFS_I(inode)->root;
4260 	struct btrfs_trans_handle *trans;
4261 	int ret;
4262 
4263 	if (BTRFS_I(inode)->dummy_inode)
4264 		return 0;
4265 
4266 	trans = btrfs_join_transaction(root);
4267 	if (IS_ERR(trans))
4268 		return PTR_ERR(trans);
4269 
4270 	ret = btrfs_update_inode(trans, root, inode);
4271 	if (ret == -ENOSPC) {
4272 		/* whoops, let's try again with a full transaction */
4273 		btrfs_end_transaction(trans, root);
4274 		trans = btrfs_start_transaction(root, 1);
4275 		if (IS_ERR(trans))
4276 			return PTR_ERR(trans);
4277 
4278 		ret = btrfs_update_inode(trans, root, inode);
4279 	}
4280 	btrfs_end_transaction(trans, root);
4281 	if (BTRFS_I(inode)->delayed_node)
4282 		btrfs_balance_delayed_items(root);
4283 
4284 	return ret;
4285 }
4286 
4287 /*
4288  * This is a copy of file_update_time.  We need this so we can return error on
4289  * ENOSPC for updating the inode in the case of file write and mmap writes.
4290  */
4291 int btrfs_update_time(struct file *file)
4292 {
4293 	struct inode *inode = file->f_path.dentry->d_inode;
4294 	struct timespec now;
4295 	int ret;
4296 	enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0;
4297 
4298 	/* First try to exhaust all avenues to not sync */
4299 	if (IS_NOCMTIME(inode))
4300 		return 0;
4301 
4302 	now = current_fs_time(inode->i_sb);
4303 	if (!timespec_equal(&inode->i_mtime, &now))
4304 		sync_it = S_MTIME;
4305 
4306 	if (!timespec_equal(&inode->i_ctime, &now))
4307 		sync_it |= S_CTIME;
4308 
4309 	if (IS_I_VERSION(inode))
4310 		sync_it |= S_VERSION;
4311 
4312 	if (!sync_it)
4313 		return 0;
4314 
4315 	/* Finally allowed to write? Takes lock. */
4316 	if (mnt_want_write_file(file))
4317 		return 0;
4318 
4319 	/* Only change inode inside the lock region */
4320 	if (sync_it & S_VERSION)
4321 		inode_inc_iversion(inode);
4322 	if (sync_it & S_CTIME)
4323 		inode->i_ctime = now;
4324 	if (sync_it & S_MTIME)
4325 		inode->i_mtime = now;
4326 	ret = btrfs_dirty_inode(inode);
4327 	if (!ret)
4328 		mark_inode_dirty_sync(inode);
4329 	mnt_drop_write(file->f_path.mnt);
4330 	return ret;
4331 }
4332 
4333 /*
4334  * find the highest existing sequence number in a directory
4335  * and then set the in-memory index_cnt variable to reflect
4336  * free sequence numbers
4337  */
4338 static int btrfs_set_inode_index_count(struct inode *inode)
4339 {
4340 	struct btrfs_root *root = BTRFS_I(inode)->root;
4341 	struct btrfs_key key, found_key;
4342 	struct btrfs_path *path;
4343 	struct extent_buffer *leaf;
4344 	int ret;
4345 
4346 	key.objectid = btrfs_ino(inode);
4347 	btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
4348 	key.offset = (u64)-1;
4349 
4350 	path = btrfs_alloc_path();
4351 	if (!path)
4352 		return -ENOMEM;
4353 
4354 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4355 	if (ret < 0)
4356 		goto out;
4357 	/* FIXME: we should be able to handle this */
4358 	if (ret == 0)
4359 		goto out;
4360 	ret = 0;
4361 
4362 	/*
4363 	 * MAGIC NUMBER EXPLANATION:
4364 	 * since we search a directory based on f_pos, and '.' and '..' have
4365 	 * f_pos of 0 and 1 respectively, everybody else has to start at 2
4367 	 */
4368 	if (path->slots[0] == 0) {
4369 		BTRFS_I(inode)->index_cnt = 2;
4370 		goto out;
4371 	}
4372 
4373 	path->slots[0]--;
4374 
4375 	leaf = path->nodes[0];
4376 	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4377 
4378 	if (found_key.objectid != btrfs_ino(inode) ||
4379 	    btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) {
4380 		BTRFS_I(inode)->index_cnt = 2;
4381 		goto out;
4382 	}
4383 
4384 	BTRFS_I(inode)->index_cnt = found_key.offset + 1;
4385 out:
4386 	btrfs_free_path(path);
4387 	return ret;
4388 }
4389 
4390 /*
4391  * helper to find a free sequence number in a given directory.  This current
4392  * code is very simple, later versions will do smarter things in the btree
4393  */
4394 int btrfs_set_inode_index(struct inode *dir, u64 *index)
4395 {
4396 	int ret = 0;
4397 
4398 	if (BTRFS_I(dir)->index_cnt == (u64)-1) {
4399 		ret = btrfs_inode_delayed_dir_index_count(dir);
4400 		if (ret) {
4401 			ret = btrfs_set_inode_index_count(dir);
4402 			if (ret)
4403 				return ret;
4404 		}
4405 	}
4406 
4407 	*index = BTRFS_I(dir)->index_cnt;
4408 	BTRFS_I(dir)->index_cnt++;
4409 
4410 	return ret;
4411 }
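
/*
 * Editor's note: illustrative sketch (not part of the original source) of
 * how a caller pairs btrfs_set_inode_index() with btrfs_add_link(); the
 * helper name is hypothetical.  For a fresh directory the first index
 * handed out is 2, since f_pos 0 and 1 are reserved for "." and "..".
 */
#if 0
static int example_link_name(struct btrfs_trans_handle *trans,
			     struct inode *dir, struct inode *inode,
			     const char *name, int name_len)
{
	u64 index;
	int ret;

	ret = btrfs_set_inode_index(dir, &index);
	if (ret)
		return ret;
	/* index is unique within 'dir' and becomes the DIR_INDEX offset */
	return btrfs_add_link(trans, dir, inode, name, name_len, 1, index);
}
#endif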
4412 
4413 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4414 				     struct btrfs_root *root,
4415 				     struct inode *dir,
4416 				     const char *name, int name_len,
4417 				     u64 ref_objectid, u64 objectid,
4418 				     umode_t mode, u64 *index)
4419 {
4420 	struct inode *inode;
4421 	struct btrfs_inode_item *inode_item;
4422 	struct btrfs_key *location;
4423 	struct btrfs_path *path;
4424 	struct btrfs_inode_ref *ref;
4425 	struct btrfs_key key[2];
4426 	u32 sizes[2];
4427 	unsigned long ptr;
4428 	int ret;
4429 	int owner;
4430 
4431 	path = btrfs_alloc_path();
4432 	if (!path)
4433 		return ERR_PTR(-ENOMEM);
4434 
4435 	inode = new_inode(root->fs_info->sb);
4436 	if (!inode) {
4437 		btrfs_free_path(path);
4438 		return ERR_PTR(-ENOMEM);
4439 	}
4440 
4441 	/*
4442 	 * we have to initialize this early, so we can reclaim the inode
4443 	 * number if we fail afterwards in this function.
4444 	 */
4445 	inode->i_ino = objectid;
4446 
4447 	if (dir) {
4448 		trace_btrfs_inode_request(dir);
4449 
4450 		ret = btrfs_set_inode_index(dir, index);
4451 		if (ret) {
4452 			btrfs_free_path(path);
4453 			iput(inode);
4454 			return ERR_PTR(ret);
4455 		}
4456 	}
4457 	/*
4458 	 * index_cnt is ignored for everything but a dir,
4459 	 * btrfs_set_inode_index_count has an explanation for the magic
4460 	 * number
4461 	 */
4462 	BTRFS_I(inode)->index_cnt = 2;
4463 	BTRFS_I(inode)->root = root;
4464 	BTRFS_I(inode)->generation = trans->transid;
4465 	inode->i_generation = BTRFS_I(inode)->generation;
4466 	btrfs_set_inode_space_info(root, inode);
4467 
4468 	if (S_ISDIR(mode))
4469 		owner = 0;
4470 	else
4471 		owner = 1;
4472 
4473 	key[0].objectid = objectid;
4474 	btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
4475 	key[0].offset = 0;
4476 
4477 	key[1].objectid = objectid;
4478 	btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
4479 	key[1].offset = ref_objectid;
4480 
4481 	sizes[0] = sizeof(struct btrfs_inode_item);
4482 	sizes[1] = name_len + sizeof(*ref);
4483 
4484 	path->leave_spinning = 1;
4485 	ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2);
4486 	if (ret != 0)
4487 		goto fail;
4488 
4489 	inode_init_owner(inode, dir, mode);
4490 	inode_set_bytes(inode, 0);
4491 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
4492 	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4493 				  struct btrfs_inode_item);
4494 	fill_inode_item(trans, path->nodes[0], inode_item, inode);
4495 
4496 	ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
4497 			     struct btrfs_inode_ref);
4498 	btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
4499 	btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
4500 	ptr = (unsigned long)(ref + 1);
4501 	write_extent_buffer(path->nodes[0], name, ptr, name_len);
4502 
4503 	btrfs_mark_buffer_dirty(path->nodes[0]);
4504 	btrfs_free_path(path);
4505 
4506 	location = &BTRFS_I(inode)->location;
4507 	location->objectid = objectid;
4508 	location->offset = 0;
4509 	btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
4510 
4511 	btrfs_inherit_iflags(inode, dir);
4512 
4513 	if (S_ISREG(mode)) {
4514 		if (btrfs_test_opt(root, NODATASUM))
4515 			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
4516 		if (btrfs_test_opt(root, NODATACOW) ||
4517 		    (BTRFS_I(dir)->flags & BTRFS_INODE_NODATACOW))
4518 			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
4519 	}
4520 
4521 	insert_inode_hash(inode);
4522 	inode_tree_add(inode);
4523 
4524 	trace_btrfs_inode_new(inode);
4525 	btrfs_set_inode_last_trans(trans, inode);
4526 
4527 	return inode;
4528 fail:
4529 	if (dir)
4530 		BTRFS_I(dir)->index_cnt--;
4531 	btrfs_free_path(path);
4532 	iput(inode);
4533 	return ERR_PTR(ret);
4534 }
4535 
4536 static inline u8 btrfs_inode_type(struct inode *inode)
4537 {
4538 	return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
4539 }
4540 
4541 /*
4542  * utility function to add 'inode' into 'parent_inode' with
4543  * a given name and a given sequence number.
4544  * if 'add_backref' is true, also insert a backref from the
4545  * inode to the parent directory.
4546  */
4547 int btrfs_add_link(struct btrfs_trans_handle *trans,
4548 		   struct inode *parent_inode, struct inode *inode,
4549 		   const char *name, int name_len, int add_backref, u64 index)
4550 {
4551 	int ret = 0;
4552 	struct btrfs_key key;
4553 	struct btrfs_root *root = BTRFS_I(parent_inode)->root;
4554 	u64 ino = btrfs_ino(inode);
4555 	u64 parent_ino = btrfs_ino(parent_inode);
4556 
4557 	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
4558 		memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
4559 	} else {
4560 		key.objectid = ino;
4561 		btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
4562 		key.offset = 0;
4563 	}
4564 
4565 	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
4566 		ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
4567 					 key.objectid, root->root_key.objectid,
4568 					 parent_ino, index, name, name_len);
4569 	} else if (add_backref) {
4570 		ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino,
4571 					     parent_ino, index);
4572 	}
4573 
4574 	if (ret == 0) {
4575 		ret = btrfs_insert_dir_item(trans, root, name, name_len,
4576 					    parent_inode, &key,
4577 					    btrfs_inode_type(inode), index);
4578 		BUG_ON(ret);
4579 
4580 		btrfs_i_size_write(parent_inode, parent_inode->i_size +
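		/*
		 * A directory's i_size counts the name bytes of both the
		 * dir item and the dir index item, hence name_len * 2.
		 */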
4581 				   name_len * 2);
4582 		parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
4583 		ret = btrfs_update_inode(trans, root, parent_inode);
4584 	}
4585 	return ret;
4586 }
4587 
4588 static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
4589 			    struct inode *dir, struct dentry *dentry,
4590 			    struct inode *inode, int backref, u64 index)
4591 {
4592 	int err = btrfs_add_link(trans, dir, inode,
4593 				 dentry->d_name.name, dentry->d_name.len,
4594 				 backref, index);
4595 	if (err > 0)
4596 		err = -EEXIST;
4597 	return err;
4598 }
4599 
4600 static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4601 			umode_t mode, dev_t rdev)
4602 {
4603 	struct btrfs_trans_handle *trans;
4604 	struct btrfs_root *root = BTRFS_I(dir)->root;
4605 	struct inode *inode = NULL;
4606 	int err;
4607 	int drop_inode = 0;
4608 	u64 objectid;
4609 	unsigned long nr = 0;
4610 	u64 index = 0;
4611 
4612 	if (!new_valid_dev(rdev))
4613 		return -EINVAL;
4614 
4615 	/*
4616 	 * 2 for inode item and ref
4617 	 * 2 for dir items
4618 	 * 1 for xattr if selinux is on
4619 	 */
4620 	trans = btrfs_start_transaction(root, 5);
4621 	if (IS_ERR(trans))
4622 		return PTR_ERR(trans);
4623 
4624 	err = btrfs_find_free_ino(root, &objectid);
4625 	if (err)
4626 		goto out_unlock;
4627 
4628 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4629 				dentry->d_name.len, btrfs_ino(dir), objectid,
4630 				mode, &index);
4631 	if (IS_ERR(inode)) {
4632 		err = PTR_ERR(inode);
4633 		goto out_unlock;
4634 	}
4635 
4636 	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
4637 	if (err) {
4638 		drop_inode = 1;
4639 		goto out_unlock;
4640 	}
4641 
4642 	/*
4643 	 * If the active LSM wants to access the inode during
4644 	 * d_instantiate it needs these. Smack checks to see
4645 	 * if the filesystem supports xattrs by looking at the
4646 	 * ops vector.
4647 	 */
4648 
4649 	inode->i_op = &btrfs_special_inode_operations;
4650 	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
4651 	if (err)
4652 		drop_inode = 1;
4653 	else {
4654 		init_special_inode(inode, inode->i_mode, rdev);
4655 		btrfs_update_inode(trans, root, inode);
4656 		d_instantiate(dentry, inode);
4657 	}
4658 out_unlock:
4659 	nr = trans->blocks_used;
4660 	btrfs_end_transaction(trans, root);
4661 	btrfs_btree_balance_dirty(root, nr);
4662 	if (drop_inode) {
4663 		inode_dec_link_count(inode);
4664 		iput(inode);
4665 	}
4666 	return err;
4667 }
4668 
4669 static int btrfs_create(struct inode *dir, struct dentry *dentry,
4670 			umode_t mode, struct nameidata *nd)
4671 {
4672 	struct btrfs_trans_handle *trans;
4673 	struct btrfs_root *root = BTRFS_I(dir)->root;
4674 	struct inode *inode = NULL;
4675 	int drop_inode = 0;
4676 	int err;
4677 	unsigned long nr = 0;
4678 	u64 objectid;
4679 	u64 index = 0;
4680 
4681 	/*
4682 	 * 2 for inode item and ref
4683 	 * 2 for dir items
4684 	 * 1 for xattr if selinux is on
4685 	 */
4686 	trans = btrfs_start_transaction(root, 5);
4687 	if (IS_ERR(trans))
4688 		return PTR_ERR(trans);
4689 
4690 	err = btrfs_find_free_ino(root, &objectid);
4691 	if (err)
4692 		goto out_unlock;
4693 
4694 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4695 				dentry->d_name.len, btrfs_ino(dir), objectid,
4696 				mode, &index);
4697 	if (IS_ERR(inode)) {
4698 		err = PTR_ERR(inode);
4699 		goto out_unlock;
4700 	}
4701 
4702 	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
4703 	if (err) {
4704 		drop_inode = 1;
4705 		goto out_unlock;
4706 	}
4707 
4708 	/*
4709 	 * If the active LSM wants to access the inode during
4710 	 * d_instantiate it needs these. Smack checks to see
4711 	 * if the filesystem supports xattrs by looking at the
4712 	 * ops vector.
4713 	 */
4714 	inode->i_fop = &btrfs_file_operations;
4715 	inode->i_op = &btrfs_file_inode_operations;
4716 
4717 	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
4718 	if (err)
4719 		drop_inode = 1;
4720 	else {
4721 		inode->i_mapping->a_ops = &btrfs_aops;
4722 		inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
4723 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
4724 		d_instantiate(dentry, inode);
4725 	}
4726 out_unlock:
4727 	nr = trans->blocks_used;
4728 	btrfs_end_transaction(trans, root);
4729 	if (drop_inode) {
4730 		inode_dec_link_count(inode);
4731 		iput(inode);
4732 	}
4733 	btrfs_btree_balance_dirty(root, nr);
4734 	return err;
4735 }
4736 
4737 static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4738 		      struct dentry *dentry)
4739 {
4740 	struct btrfs_trans_handle *trans;
4741 	struct btrfs_root *root = BTRFS_I(dir)->root;
4742 	struct inode *inode = old_dentry->d_inode;
4743 	u64 index;
4744 	unsigned long nr = 0;
4745 	int err;
4746 	int drop_inode = 0;
4747 
4748 	/* do not allow sys_link's with other subvols of the same device */
4749 	if (root->objectid != BTRFS_I(inode)->root->objectid)
4750 		return -EXDEV;
4751 
4752 	if (inode->i_nlink == ~0U)
4753 		return -EMLINK;
4754 
4755 	err = btrfs_set_inode_index(dir, &index);
4756 	if (err)
4757 		goto fail;
4758 
4759 	/*
4760 	 * 2 items for inode and inode ref
4761 	 * 2 items for dir items
4762 	 * 1 item for parent inode
4763 	 */
4764 	trans = btrfs_start_transaction(root, 5);
4765 	if (IS_ERR(trans)) {
4766 		err = PTR_ERR(trans);
4767 		goto fail;
4768 	}
4769 
4770 	btrfs_inc_nlink(inode);
4771 	inode->i_ctime = CURRENT_TIME;
4772 	ihold(inode);
4773 
4774 	err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
4775 
4776 	if (err) {
4777 		drop_inode = 1;
4778 	} else {
4779 		struct dentry *parent = dentry->d_parent;
4780 		err = btrfs_update_inode(trans, root, inode);
4781 		BUG_ON(err);
4782 		d_instantiate(dentry, inode);
4783 		btrfs_log_new_name(trans, inode, NULL, parent);
4784 	}
4785 
4786 	nr = trans->blocks_used;
4787 	btrfs_end_transaction(trans, root);
4788 fail:
4789 	if (drop_inode) {
4790 		inode_dec_link_count(inode);
4791 		iput(inode);
4792 	}
4793 	btrfs_btree_balance_dirty(root, nr);
4794 	return err;
4795 }
4796 
4797 static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
4798 {
4799 	struct inode *inode = NULL;
4800 	struct btrfs_trans_handle *trans;
4801 	struct btrfs_root *root = BTRFS_I(dir)->root;
4802 	int err = 0;
4803 	int drop_on_err = 0;
4804 	u64 objectid = 0;
4805 	u64 index = 0;
4806 	unsigned long nr = 1;
4807 
4808 	/*
4809 	 * 2 items for inode and ref
4810 	 * 2 items for dir items
4811 	 * 1 for xattr if selinux is on
4812 	 */
4813 	trans = btrfs_start_transaction(root, 5);
4814 	if (IS_ERR(trans))
4815 		return PTR_ERR(trans);
4816 
4817 	err = btrfs_find_free_ino(root, &objectid);
4818 	if (err)
4819 		goto out_fail;
4820 
4821 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4822 				dentry->d_name.len, btrfs_ino(dir), objectid,
4823 				S_IFDIR | mode, &index);
4824 	if (IS_ERR(inode)) {
4825 		err = PTR_ERR(inode);
4826 		goto out_fail;
4827 	}
4828 
4829 	drop_on_err = 1;
4830 
4831 	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
4832 	if (err)
4833 		goto out_fail;
4834 
4835 	inode->i_op = &btrfs_dir_inode_operations;
4836 	inode->i_fop = &btrfs_dir_file_operations;
4837 
4838 	btrfs_i_size_write(inode, 0);
4839 	err = btrfs_update_inode(trans, root, inode);
4840 	if (err)
4841 		goto out_fail;
4842 
4843 	err = btrfs_add_link(trans, dir, inode, dentry->d_name.name,
4844 			     dentry->d_name.len, 0, index);
4845 	if (err)
4846 		goto out_fail;
4847 
4848 	d_instantiate(dentry, inode);
4849 	drop_on_err = 0;
4850 
4851 out_fail:
4852 	nr = trans->blocks_used;
4853 	btrfs_end_transaction(trans, root);
4854 	if (drop_on_err)
4855 		iput(inode);
4856 	btrfs_btree_balance_dirty(root, nr);
4857 	return err;
4858 }
4859 
4860 /* helper for btrfs_get_extent.  Given an existing extent in the tree,
4861  * and an extent that you want to insert, deal with overlap and insert
4862  * the new extent into the tree.
4863  */
4864 static int merge_extent_mapping(struct extent_map_tree *em_tree,
4865 				struct extent_map *existing,
4866 				struct extent_map *em,
4867 				u64 map_start, u64 map_len)
4868 {
4869 	u64 start_diff;
4870 
4871 	BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
4872 	start_diff = map_start - em->start;
4873 	em->start = map_start;
4874 	em->len = map_len;
4875 	if (em->block_start < EXTENT_MAP_LAST_BYTE &&
4876 	    !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
4877 		em->block_start += start_diff;
4878 		em->block_len -= start_diff;
4879 	}
4880 	return add_extent_mapping(em_tree, em);
4881 }
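
/*
 * Editor's note: worked example (illustrative, not part of the original
 * source).  If the new em described [0, 1M) at block_start B but part of
 * that range is already mapped, clamping to map_start = 128K and
 * map_len = 4K gives start_diff = 128K, so the mapping actually inserted
 * is [128K, 132K) at block B + 128K.  For compressed extents
 * block_start/block_len describe the whole compressed extent on disk and
 * are left untouched.
 */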
4882 
4883 static noinline int uncompress_inline(struct btrfs_path *path,
4884 				      struct inode *inode, struct page *page,
4885 				      size_t pg_offset, u64 extent_offset,
4886 				      struct btrfs_file_extent_item *item)
4887 {
4888 	int ret;
4889 	struct extent_buffer *leaf = path->nodes[0];
4890 	char *tmp;
4891 	size_t max_size;
4892 	unsigned long inline_size;
4893 	unsigned long ptr;
4894 	int compress_type;
4895 
4896 	WARN_ON(pg_offset != 0);
4897 	compress_type = btrfs_file_extent_compression(leaf, item);
4898 	max_size = btrfs_file_extent_ram_bytes(leaf, item);
4899 	inline_size = btrfs_file_extent_inline_item_len(leaf,
4900 					btrfs_item_nr(leaf, path->slots[0]));
4901 	tmp = kmalloc(inline_size, GFP_NOFS);
4902 	if (!tmp)
4903 		return -ENOMEM;
4904 	ptr = btrfs_file_extent_inline_start(item);
4905 
4906 	read_extent_buffer(leaf, tmp, ptr, inline_size);
4907 
4908 	max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
4909 	ret = btrfs_decompress(compress_type, tmp, page,
4910 			       extent_offset, inline_size, max_size);
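	/*
	 * If decompression failed, zero the rest of the page below; note
	 * that the error is not propagated and we still return 0.
	 */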
4911 	if (ret) {
4912 		char *kaddr = kmap_atomic(page, KM_USER0);
4913 		unsigned long copy_size = min_t(u64,
4914 				  PAGE_CACHE_SIZE - pg_offset,
4915 				  max_size - extent_offset);
4916 		memset(kaddr + pg_offset, 0, copy_size);
4917 		kunmap_atomic(kaddr, KM_USER0);
4918 	}
4919 	kfree(tmp);
4920 	return 0;
4921 }
4922 
4923 /*
4924  * a bit scary, this does extent mapping from logical file offset to the disk.
4925  * the ugly parts come from merging extents from the disk with the in-ram
4926  * representation.  This gets more complex because of the data=ordered code,
4927  * where the in-ram extents might be locked pending data=ordered completion.
4928  *
4929  * This also copies inline extents directly into the page.
4930  */
4931 
4932 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
4933 				    size_t pg_offset, u64 start, u64 len,
4934 				    int create)
4935 {
4936 	int ret;
4937 	int err = 0;
4938 	u64 bytenr;
4939 	u64 extent_start = 0;
4940 	u64 extent_end = 0;
4941 	u64 objectid = btrfs_ino(inode);
4942 	u32 found_type;
4943 	struct btrfs_path *path = NULL;
4944 	struct btrfs_root *root = BTRFS_I(inode)->root;
4945 	struct btrfs_file_extent_item *item;
4946 	struct extent_buffer *leaf;
4947 	struct btrfs_key found_key;
4948 	struct extent_map *em = NULL;
4949 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
4950 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4951 	struct btrfs_trans_handle *trans = NULL;
4952 	int compress_type;
4953 
4954 again:
4955 	read_lock(&em_tree->lock);
4956 	em = lookup_extent_mapping(em_tree, start, len);
4957 	if (em)
4958 		em->bdev = root->fs_info->fs_devices->latest_bdev;
4959 	read_unlock(&em_tree->lock);
4960 
4961 	if (em) {
4962 		if (em->start > start || em->start + em->len <= start)
4963 			free_extent_map(em);
4964 		else if (em->block_start == EXTENT_MAP_INLINE && page)
4965 			free_extent_map(em);
4966 		else
4967 			goto out;
4968 	}
4969 	em = alloc_extent_map();
4970 	if (!em) {
4971 		err = -ENOMEM;
4972 		goto out;
4973 	}
4974 	em->bdev = root->fs_info->fs_devices->latest_bdev;
4975 	em->start = EXTENT_MAP_HOLE;
4976 	em->orig_start = EXTENT_MAP_HOLE;
4977 	em->len = (u64)-1;
4978 	em->block_len = (u64)-1;
4979 
4980 	if (!path) {
4981 		path = btrfs_alloc_path();
4982 		if (!path) {
4983 			err = -ENOMEM;
4984 			goto out;
4985 		}
4986 		/*
4987 		 * Chances are we'll be called again, so go ahead and do
4988 		 * readahead
4989 		 */
4990 		path->reada = 1;
4991 	}
4992 
4993 	ret = btrfs_lookup_file_extent(trans, root, path,
4994 				       objectid, start, trans != NULL);
4995 	if (ret < 0) {
4996 		err = ret;
4997 		goto out;
4998 	}
4999 
5000 	if (ret != 0) {
5001 		if (path->slots[0] == 0)
5002 			goto not_found;
5003 		path->slots[0]--;
5004 	}
5005 
5006 	leaf = path->nodes[0];
5007 	item = btrfs_item_ptr(leaf, path->slots[0],
5008 			      struct btrfs_file_extent_item);
5009 	/* are we inside the extent that was found? */
5010 	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5011 	found_type = btrfs_key_type(&found_key);
5012 	if (found_key.objectid != objectid ||
5013 	    found_type != BTRFS_EXTENT_DATA_KEY) {
5014 		goto not_found;
5015 	}
5016 
5017 	found_type = btrfs_file_extent_type(leaf, item);
5018 	extent_start = found_key.offset;
5019 	compress_type = btrfs_file_extent_compression(leaf, item);
5020 	if (found_type == BTRFS_FILE_EXTENT_REG ||
5021 	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
5022 		extent_end = extent_start +
5023 		       btrfs_file_extent_num_bytes(leaf, item);
5024 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
5025 		size_t size;
5026 		size = btrfs_file_extent_inline_len(leaf, item);
5027 		extent_end = (extent_start + size + root->sectorsize - 1) &
5028 			~((u64)root->sectorsize - 1);
5029 	}
5030 
5031 	if (start >= extent_end) {
5032 		path->slots[0]++;
5033 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
5034 			ret = btrfs_next_leaf(root, path);
5035 			if (ret < 0) {
5036 				err = ret;
5037 				goto out;
5038 			}
5039 			if (ret > 0)
5040 				goto not_found;
5041 			leaf = path->nodes[0];
5042 		}
5043 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5044 		if (found_key.objectid != objectid ||
5045 		    found_key.type != BTRFS_EXTENT_DATA_KEY)
5046 			goto not_found;
5047 		if (start + len <= found_key.offset)
5048 			goto not_found;
5049 		em->start = start;
5050 		em->len = found_key.offset - start;
5051 		goto not_found_em;
5052 	}
5053 
5054 	if (found_type == BTRFS_FILE_EXTENT_REG ||
5055 	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
5056 		em->start = extent_start;
5057 		em->len = extent_end - extent_start;
5058 		em->orig_start = extent_start -
5059 				 btrfs_file_extent_offset(leaf, item);
5060 		bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
5061 		if (bytenr == 0) {
5062 			em->block_start = EXTENT_MAP_HOLE;
5063 			goto insert;
5064 		}
5065 		if (compress_type != BTRFS_COMPRESS_NONE) {
5066 			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
5067 			em->compress_type = compress_type;
5068 			em->block_start = bytenr;
5069 			em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
5070 									 item);
5071 		} else {
5072 			bytenr += btrfs_file_extent_offset(leaf, item);
5073 			em->block_start = bytenr;
5074 			em->block_len = em->len;
5075 			if (found_type == BTRFS_FILE_EXTENT_PREALLOC)
5076 				set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
5077 		}
5078 		goto insert;
5079 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
5080 		unsigned long ptr;
5081 		char *map;
5082 		size_t size;
5083 		size_t extent_offset;
5084 		size_t copy_size;
5085 
5086 		em->block_start = EXTENT_MAP_INLINE;
5087 		if (!page || create) {
5088 			em->start = extent_start;
5089 			em->len = extent_end - extent_start;
5090 			goto out;
5091 		}
5092 
5093 		size = btrfs_file_extent_inline_len(leaf, item);
5094 		extent_offset = page_offset(page) + pg_offset - extent_start;
5095 		copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
5096 				size - extent_offset);
5097 		em->start = extent_start + extent_offset;
5098 		em->len = (copy_size + root->sectorsize - 1) &
5099 			~((u64)root->sectorsize - 1);
5100 		em->orig_start = EXTENT_MAP_INLINE;
5101 		if (compress_type) {
5102 			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
5103 			em->compress_type = compress_type;
5104 		}
5105 		ptr = btrfs_file_extent_inline_start(item) + extent_offset;
5106 		if (create == 0 && !PageUptodate(page)) {
5107 			if (btrfs_file_extent_compression(leaf, item) !=
5108 			    BTRFS_COMPRESS_NONE) {
5109 				ret = uncompress_inline(path, inode, page,
5110 							pg_offset,
5111 							extent_offset, item);
5112 				BUG_ON(ret);
5113 			} else {
5114 				map = kmap(page);
5115 				read_extent_buffer(leaf, map + pg_offset, ptr,
5116 						   copy_size);
5117 				if (pg_offset + copy_size < PAGE_CACHE_SIZE) {
5118 					memset(map + pg_offset + copy_size, 0,
5119 					       PAGE_CACHE_SIZE - pg_offset -
5120 					       copy_size);
5121 				}
5122 				kunmap(page);
5123 			}
5124 			flush_dcache_page(page);
5125 		} else if (create && PageUptodate(page)) {
5126 			BUG();
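			/* everything below this BUG() is unreachable */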
5127 			if (!trans) {
5128 				kunmap(page);
5129 				free_extent_map(em);
5130 				em = NULL;
5131 
5132 				btrfs_release_path(path);
5133 				trans = btrfs_join_transaction(root);
5134 
5135 				if (IS_ERR(trans))
5136 					return ERR_CAST(trans);
5137 				goto again;
5138 			}
5139 			map = kmap(page);
5140 			write_extent_buffer(leaf, map + pg_offset, ptr,
5141 					    copy_size);
5142 			kunmap(page);
5143 			btrfs_mark_buffer_dirty(leaf);
5144 		}
5145 		set_extent_uptodate(io_tree, em->start,
5146 				    extent_map_end(em) - 1, NULL, GFP_NOFS);
5147 		goto insert;
5148 	} else {
5149 		printk(KERN_ERR "btrfs unknown found_type %d\n", found_type);
5150 		WARN_ON(1);
5151 	}
5152 not_found:
5153 	em->start = start;
5154 	em->len = len;
5155 not_found_em:
5156 	em->block_start = EXTENT_MAP_HOLE;
5157 	set_bit(EXTENT_FLAG_VACANCY, &em->flags);
5158 insert:
5159 	btrfs_release_path(path);
5160 	if (em->start > start || extent_map_end(em) <= start) {
5161 		printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed "
5162 		       "[%llu %llu]\n", (unsigned long long)em->start,
5163 		       (unsigned long long)em->len,
5164 		       (unsigned long long)start,
5165 		       (unsigned long long)len);
5166 		err = -EIO;
5167 		goto out;
5168 	}
5169 
5170 	err = 0;
5171 	write_lock(&em_tree->lock);
5172 	ret = add_extent_mapping(em_tree, em);
5173 	/* it is possible that someone inserted the extent into the tree
5174 	 * while we had the lock dropped.  It is also possible that
5175 	 * an overlapping map exists in the tree
5176 	 */
5177 	if (ret == -EEXIST) {
5178 		struct extent_map *existing;
5179 
5180 		ret = 0;
5181 
5182 		existing = lookup_extent_mapping(em_tree, start, len);
5183 		if (existing && (existing->start > start ||
5184 		    existing->start + existing->len <= start)) {
5185 			free_extent_map(existing);
5186 			existing = NULL;
5187 		}
5188 		if (!existing) {
5189 			existing = lookup_extent_mapping(em_tree, em->start,
5190 							 em->len);
5191 			if (existing) {
5192 				err = merge_extent_mapping(em_tree, existing,
5193 							   em, start,
5194 							   root->sectorsize);
5195 				free_extent_map(existing);
5196 				if (err) {
5197 					free_extent_map(em);
5198 					em = NULL;
5199 				}
5200 			} else {
5201 				err = -EIO;
5202 				free_extent_map(em);
5203 				em = NULL;
5204 			}
5205 		} else {
5206 			free_extent_map(em);
5207 			em = existing;
5208 			err = 0;
5209 		}
5210 	}
5211 	write_unlock(&em_tree->lock);
5212 out:
5213 
5214 	trace_btrfs_get_extent(root, em);
5215 
5216 	if (path)
5217 		btrfs_free_path(path);
5218 	if (trans) {
5219 		ret = btrfs_end_transaction(trans, root);
5220 		if (!err)
5221 			err = ret;
5222 	}
5223 	if (err) {
5224 		free_extent_map(em);
5225 		return ERR_PTR(err);
5226 	}
5227 	return em;
5228 }
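
/*
 * Editor's note: illustrative usage sketch (not part of the original
 * source); the helper name is hypothetical.  A read-only caller maps a
 * byte range and checks for the sentinel block_start values before using
 * the mapping.
 */
#if 0
static int example_probe_range(struct inode *inode, u64 start, u64 len)
{
	struct extent_map *em;
	int hole;

	em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
	if (IS_ERR(em))
		return PTR_ERR(em);

	hole = (em->block_start == EXTENT_MAP_HOLE);
	free_extent_map(em);
	return hole;	/* 1 if the range is a hole, 0 otherwise */
}
#endif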
5229 
5230 struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
5231 					   size_t pg_offset, u64 start, u64 len,
5232 					   int create)
5233 {
5234 	struct extent_map *em;
5235 	struct extent_map *hole_em = NULL;
5236 	u64 range_start = start;
5237 	u64 end;
5238 	u64 found;
5239 	u64 found_end;
5240 	int err = 0;
5241 
5242 	em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
5243 	if (IS_ERR(em))
5244 		return em;
5245 	if (em) {
5246 		/*
5247 		 * if our em maps to a hole, there might
5248 		 * actually be delalloc bytes behind it
5249 		 */
5250 		if (em->block_start != EXTENT_MAP_HOLE)
5251 			return em;
5252 		else
5253 			hole_em = em;
5254 	}
5255 
5256 	/* check to see if we've wrapped (len == -1 or similar) */
5257 	end = start + len;
5258 	if (end < start)
5259 		end = (u64)-1;
5260 	else
5261 		end -= 1;
5262 
5263 	em = NULL;
5264 
5265 	/* ok, we didn't find anything, let's look for delalloc */
5266 	found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start,
5267 				 end, len, EXTENT_DELALLOC, 1);
5268 	found_end = range_start + found;
5269 	if (found_end < range_start)
5270 		found_end = (u64)-1;
5271 
5272 	/*
5273 	 * we didn't find anything useful, return
5274 	 * the original results from get_extent()
5275 	 */
5276 	if (range_start > end || found_end <= start) {
5277 		em = hole_em;
5278 		hole_em = NULL;
5279 		goto out;
5280 	}
5281 
5282 	/* adjust the range_start to make sure it doesn't
5283 	 * go backwards from the start they passed in
5284 	 */
5285 	range_start = max(start, range_start);
5286 	found = found_end - range_start;
5287 
5288 	if (found > 0) {
5289 		u64 hole_start = start;
5290 		u64 hole_len = len;
5291 
5292 		em = alloc_extent_map();
5293 		if (!em) {
5294 			err = -ENOMEM;
5295 			goto out;
5296 		}
5297 		/*
5298 		 * when btrfs_get_extent can't find anything it
5299 		 * returns one huge hole
5300 		 *
5301 		 * make sure what it found really fits our range, and
5302 		 * adjust to make sure it is based on the start from
5303 		 * the caller
5304 		 */
5305 		if (hole_em) {
5306 			u64 calc_end = extent_map_end(hole_em);
5307 
5308 			if (calc_end <= start || (hole_em->start > end)) {
5309 				free_extent_map(hole_em);
5310 				hole_em = NULL;
5311 			} else {
5312 				hole_start = max(hole_em->start, start);
5313 				hole_len = calc_end - hole_start;
5314 			}
5315 		}
5316 		em->bdev = NULL;
5317 		if (hole_em && range_start > hole_start) {
5318 			/* our hole starts before our delalloc, so we
5319 			 * have to return just the parts of the hole
5320 			 * that go until the delalloc starts
5321 			 */
5322 			em->len = min(hole_len,
5323 				      range_start - hole_start);
5324 			em->start = hole_start;
5325 			em->orig_start = hole_start;
5326 			/*
5327 			 * don't adjust block start at all,
5328 			 * it is fixed at EXTENT_MAP_HOLE
5329 			 */
5330 			em->block_start = hole_em->block_start;
5331 			em->block_len = hole_len;
5332 		} else {
5333 			em->start = range_start;
5334 			em->len = found;
5335 			em->orig_start = range_start;
5336 			em->block_start = EXTENT_MAP_DELALLOC;
5337 			em->block_len = found;
5338 		}
5339 	} else if (hole_em) {
5340 		return hole_em;
5341 	}
5342 out:
5343 
5344 	free_extent_map(hole_em);
5345 	if (err) {
5346 		free_extent_map(em);
5347 		return ERR_PTR(err);
5348 	}
5349 	return em;
5350 }
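
/*
 * Editor's note: worked example for the delalloc probing above
 * (illustrative, not part of the original source).  Suppose
 * btrfs_get_extent() reports a hole covering [0, 1M) but dirty delalloc
 * bytes exist at [128K, 192K).  For start = 0 this function returns a
 * hole em for [0, 128K), clipped where the delalloc begins; for
 * start = 128K it returns an em with block_start == EXTENT_MAP_DELALLOC
 * covering [128K, 192K).
 */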
5351 
5352 static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5353 						  struct extent_map *em,
5354 						  u64 start, u64 len)
5355 {
5356 	struct btrfs_root *root = BTRFS_I(inode)->root;
5357 	struct btrfs_trans_handle *trans;
5358 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
5359 	struct btrfs_key ins;
5360 	u64 alloc_hint;
5361 	int ret;
5362 	bool insert = false;
5363 
5364 	/*
5365 	 * Ok if the extent map we looked up is a hole and is for the exact
5366 	 * range we want, there is no reason to allocate a new one, however if
5367 	 * it is not right then we need to free this one and drop the cache for
5368 	 * our range.
5369 	 */
5370 	if (em->block_start != EXTENT_MAP_HOLE || em->start != start ||
5371 	    em->len != len) {
5372 		free_extent_map(em);
5373 		em = NULL;
5374 		insert = true;
5375 		btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
5376 	}
5377 
5378 	trans = btrfs_join_transaction(root);
5379 	if (IS_ERR(trans))
5380 		return ERR_CAST(trans);
5381 
5382 	if (start <= BTRFS_I(inode)->disk_i_size && len < 64 * 1024)
5383 		btrfs_add_inode_defrag(trans, inode);
5384 
5385 	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
5386 
5387 	alloc_hint = get_extent_allocation_hint(inode, start, len);
5388 	ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0,
5389 				   alloc_hint, (u64)-1, &ins, 1);
5390 	if (ret) {
5391 		em = ERR_PTR(ret);
5392 		goto out;
5393 	}
5394 
5395 	if (!em) {
5396 		em = alloc_extent_map();
5397 		if (!em) {
5398 			em = ERR_PTR(-ENOMEM);
5399 			goto out;
5400 		}
5401 	}
5402 
5403 	em->start = start;
5404 	em->orig_start = em->start;
5405 	em->len = ins.offset;
5406 
5407 	em->block_start = ins.objectid;
5408 	em->block_len = ins.offset;
5409 	em->bdev = root->fs_info->fs_devices->latest_bdev;
5410 
5411 	/*
5412 	 * We need to do this because if we're using the original em we searched
5413 	 * for, we could have EXTENT_FLAG_VACANCY set, and we don't want that.
5414 	 */
5415 	em->flags = 0;
5416 	set_bit(EXTENT_FLAG_PINNED, &em->flags);
5417 
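	/*
	 * Keep dropping overlapping cached mappings until our new one can
	 * be inserted into the extent map tree.
	 */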
5418 	while (insert) {
5419 		write_lock(&em_tree->lock);
5420 		ret = add_extent_mapping(em_tree, em);
5421 		write_unlock(&em_tree->lock);
5422 		if (ret != -EEXIST)
5423 			break;
5424 		btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0);
5425 	}
5426 
5427 	ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
5428 					   ins.offset, ins.offset, 0);
5429 	if (ret) {
5430 		btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
5431 		em = ERR_PTR(ret);
5432 	}
5433 out:
5434 	btrfs_end_transaction(trans, root);
5435 	return em;
5436 }
5437 
5438 /*
5439  * returns 1 when the nocow is safe, < 0 on error, 0 if the
5440  * block must be cow'd
5441  */
5442 static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
5443 				      struct inode *inode, u64 offset, u64 len)
5444 {
5445 	struct btrfs_path *path;
5446 	int ret;
5447 	struct extent_buffer *leaf;
5448 	struct btrfs_root *root = BTRFS_I(inode)->root;
5449 	struct btrfs_file_extent_item *fi;
5450 	struct btrfs_key key;
5451 	u64 disk_bytenr;
5452 	u64 backref_offset;
5453 	u64 extent_end;
5454 	u64 num_bytes;
5455 	int slot;
5456 	int found_type;
5457 
5458 	path = btrfs_alloc_path();
5459 	if (!path)
5460 		return -ENOMEM;
5461 
5462 	ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode),
5463 				       offset, 0);
5464 	if (ret < 0)
5465 		goto out;
5466 
5467 	slot = path->slots[0];
5468 	if (ret == 1) {
5469 		if (slot == 0) {
5470 			/* can't find the item, must cow */
5471 			ret = 0;
5472 			goto out;
5473 		}
5474 		slot--;
5475 	}
5476 	ret = 0;
5477 	leaf = path->nodes[0];
5478 	btrfs_item_key_to_cpu(leaf, &key, slot);
5479 	if (key.objectid != btrfs_ino(inode) ||
5480 	    key.type != BTRFS_EXTENT_DATA_KEY) {
5481 		/* not our file or wrong item type, must cow */
5482 		goto out;
5483 	}
5484 
5485 	if (key.offset > offset) {
5486 		/* Wrong offset, must cow */
5487 		goto out;
5488 	}
5489 
5490 	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
5491 	found_type = btrfs_file_extent_type(leaf, fi);
5492 	if (found_type != BTRFS_FILE_EXTENT_REG &&
5493 	    found_type != BTRFS_FILE_EXTENT_PREALLOC) {
5494 		/* not a regular extent, must cow */
5495 		goto out;
5496 	}
5497 	disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
5498 	backref_offset = btrfs_file_extent_offset(leaf, fi);
5499 
5500 	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
5501 	if (extent_end < offset + len) {
5502 		/* extent doesn't include our full range, must cow */
5503 		goto out;
5504 	}
5505 
5506 	if (btrfs_extent_readonly(root, disk_bytenr))
5507 		goto out;
5508 
5509 	/*
5510 	 * look for other files referencing this extent; if we
5511 	 * find any we must cow
5512 	 */
5513 	if (btrfs_cross_ref_exist(trans, root, btrfs_ino(inode),
5514 				  key.offset - backref_offset, disk_bytenr))
5515 		goto out;
5516 
5517 	/*
5518 	 * adjust disk_bytenr and num_bytes to cover just the bytes
5519 	 * in this extent we are about to write.  If there
5520 	 * are any csums in that range we have to cow in order
5521 	 * to keep the csums correct
5522 	 */
5523 	disk_bytenr += backref_offset;
5524 	disk_bytenr += offset - key.offset;
5525 	num_bytes = min(offset + len, extent_end) - offset;
5526 	if (csum_exist_in_range(root, disk_bytenr, num_bytes))
5527 		goto out;
5528 	/*
5529 	 * all of the above have passed, it is safe to overwrite this extent
5530 	 * without cow
5531 	 */
5532 	ret = 1;
5533 out:
5534 	btrfs_free_path(path);
5535 	return ret;
5536 }
5537 
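/*
 * get_block callback for O_DIRECT: map the range starting at iblock into
 * bh_result for __blockdev_direct_IO.  Inline and compressed extents get
 * -ENOTBLK so the generic code falls back to buffered IO; for writes we
 * reuse nocow/prealloc extents when it is safe and allocate a new extent
 * otherwise.
 */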
5538 static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
5539 				   struct buffer_head *bh_result, int create)
5540 {
5541 	struct extent_map *em;
5542 	struct btrfs_root *root = BTRFS_I(inode)->root;
5543 	u64 start = iblock << inode->i_blkbits;
5544 	u64 len = bh_result->b_size;
5545 	struct btrfs_trans_handle *trans;
5546 
5547 	em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
5548 	if (IS_ERR(em))
5549 		return PTR_ERR(em);
5550 
5551 	/*
5552 	 * Ok, for INLINE and COMPRESSED extents we need to fall back to
5553 	 * buffered IO.  INLINE is special, and we could probably kludge it in
5554 	 * here, but it's still buffered so for safety let's just fall back to
5555 	 * the generic buffered path.
5556 	 *
5557 	 * For COMPRESSED we _have_ to read the entire extent in so we can
5558 	 * decompress it, so there will be buffering required no matter what we
5559 	 * do, so go ahead and fall back to buffered.
5560 	 *
5561 	 * We return -ENOTBLK because that's what makes DIO go ahead and fall
5562 	 * back to buffered IO.  Don't blame me, this is the price we pay for
5563 	 * using the generic code.
5564 	 */
5565 	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
5566 	    em->block_start == EXTENT_MAP_INLINE) {
5567 		free_extent_map(em);
5568 		return -ENOTBLK;
5569 	}
5570 
5571 	/* Just a good old-fashioned hole, return */
5572 	if (!create && (em->block_start == EXTENT_MAP_HOLE ||
5573 			test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
5574 		free_extent_map(em);
5575 		/* DIO will do one hole at a time, so just unlock a sector */
5576 		unlock_extent(&BTRFS_I(inode)->io_tree, start,
5577 			      start + root->sectorsize - 1, GFP_NOFS);
5578 		return 0;
5579 	}
5580 
5581 	/*
5582 	 * We don't allocate a new extent in the following cases:
5583 	 *
5584 	 * 1) The inode is marked as NODATACOW.  In this case we'll just use the
5585 	 * existing extent.
5586 	 * 2) The extent is marked as PREALLOC.  We're good to go here and can
5587 	 * just use the extent.
5589 	 */
5590 	if (!create) {
5591 		len = em->len - (start - em->start);
5592 		goto map;
5593 	}
5594 
5595 	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
5596 	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
5597 	     em->block_start != EXTENT_MAP_HOLE)) {
5598 		int type;
5599 		int ret;
5600 		u64 block_start;
5601 
5602 		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
5603 			type = BTRFS_ORDERED_PREALLOC;
5604 		else
5605 			type = BTRFS_ORDERED_NOCOW;
5606 		len = min(len, em->len - (start - em->start));
5607 		block_start = em->block_start + (start - em->start);
5608 
5609 		/*
5610 		 * we're not going to log anything, but we do need
5611 		 * to make sure the current transaction stays open
5612 		 * while we look for nocow cross refs
5613 		 */
5614 		trans = btrfs_join_transaction(root);
5615 		if (IS_ERR(trans))
5616 			goto must_cow;
5617 
5618 		if (can_nocow_odirect(trans, inode, start, len) == 1) {
5619 			ret = btrfs_add_ordered_extent_dio(inode, start,
5620 					   block_start, len, len, type);
5621 			btrfs_end_transaction(trans, root);
5622 			if (ret) {
5623 				free_extent_map(em);
5624 				return ret;
5625 			}
5626 			goto unlock;
5627 		}
5628 		btrfs_end_transaction(trans, root);
5629 	}
5630 must_cow:
5631 	/*
5632 	 * this will cow the extent; reset the len in case we changed
5633 	 * it above
5634 	 */
5635 	len = bh_result->b_size;
5636 	em = btrfs_new_extent_direct(inode, em, start, len);
5637 	if (IS_ERR(em))
5638 		return PTR_ERR(em);
5639 	len = min(len, em->len - (start - em->start));
5640 unlock:
5641 	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1,
5642 			  EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1,
5643 			  0, NULL, GFP_NOFS);
5644 map:
5645 	bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
5646 		inode->i_blkbits;
5647 	bh_result->b_size = len;
5648 	bh_result->b_bdev = em->bdev;
5649 	set_buffer_mapped(bh_result);
5650 	if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
5651 		set_buffer_new(bh_result);
5652 
5653 	free_extent_map(em);
5654 
5655 	return 0;
5656 }
5657 
5658 struct btrfs_dio_private {
5659 	struct inode *inode;
5660 	u64 logical_offset;
5661 	u64 disk_bytenr;
5662 	u64 bytes;
5663 	u32 *csums;
5664 	void *private;
5665 
5666 	/* number of bios pending for this dio */
5667 	atomic_t pending_bios;
5668 
5669 	/* IO errors */
5670 	int errors;
5671 
5672 	struct bio *orig_bio;
5673 };
5674 
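/*
 * End IO handler for O_DIRECT reads: unless the inode is NODATASUM, verify
 * each bio_vec against the csums stashed in dip->csums, then unlock the
 * extent range and complete the original dio bio.
 */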
5675 static void btrfs_endio_direct_read(struct bio *bio, int err)
5676 {
5677 	struct btrfs_dio_private *dip = bio->bi_private;
5678 	struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
5679 	struct bio_vec *bvec = bio->bi_io_vec;
5680 	struct inode *inode = dip->inode;
5681 	struct btrfs_root *root = BTRFS_I(inode)->root;
5682 	u64 start;
5683 	u32 *private = dip->csums;
5684 
5685 	start = dip->logical_offset;
5686 	do {
5687 		if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
5688 			struct page *page = bvec->bv_page;
5689 			char *kaddr;
5690 			u32 csum = ~(u32)0;
5691 			unsigned long flags;
5692 
5693 			local_irq_save(flags);
5694 			kaddr = kmap_atomic(page, KM_IRQ0);
5695 			csum = btrfs_csum_data(root, kaddr + bvec->bv_offset,
5696 					       csum, bvec->bv_len);
5697 			btrfs_csum_final(csum, (char *)&csum);
5698 			kunmap_atomic(kaddr, KM_IRQ0);
5699 			local_irq_restore(flags);
5700 
5701 			flush_dcache_page(bvec->bv_page);
5702 			if (csum != *private) {
5703 				printk(KERN_ERR "btrfs csum failed ino %llu off"
5704 				      " %llu csum %u private %u\n",
5705 				      (unsigned long long)btrfs_ino(inode),
5706 				      (unsigned long long)start,
5707 				      csum, *private);
5708 				err = -EIO;
5709 			}
5710 		}
5711 
5712 		start += bvec->bv_len;
5713 		private++;
5714 		bvec++;
5715 	} while (bvec <= bvec_end);
5716 
5717 	unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
5718 		      dip->logical_offset + dip->bytes - 1, GFP_NOFS);
5719 	bio->bi_private = dip->private;
5720 
5721 	kfree(dip->csums);
5722 	kfree(dip);
5723 
5724 	/* If we had a csum failure make sure to clear the uptodate flag */
5725 	if (err)
5726 		clear_bit(BIO_UPTODATE, &bio->bi_flags);
5727 	dio_end_io(bio, err);
5728 }
5729 
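/*
 * End IO handler for O_DIRECT writes: walk every ordered extent covered by
 * the bio, insert or mark the corresponding file extent items, add the
 * pending csums and update i_size, then complete the original dio bio.
 */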
5730 static void btrfs_endio_direct_write(struct bio *bio, int err)
5731 {
5732 	struct btrfs_dio_private *dip = bio->bi_private;
5733 	struct inode *inode = dip->inode;
5734 	struct btrfs_root *root = BTRFS_I(inode)->root;
5735 	struct btrfs_trans_handle *trans;
5736 	struct btrfs_ordered_extent *ordered = NULL;
5737 	struct extent_state *cached_state = NULL;
5738 	u64 ordered_offset = dip->logical_offset;
5739 	u64 ordered_bytes = dip->bytes;
5740 	int ret;
5741 
5742 	if (err)
5743 		goto out_done;
5744 again:
5745 	ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
5746 						   &ordered_offset,
5747 						   ordered_bytes);
5748 	if (!ret)
5749 		goto out_test;
5750 
5751 	BUG_ON(!ordered);
5752 
5753 	trans = btrfs_join_transaction(root);
5754 	if (IS_ERR(trans)) {
5755 		err = -ENOMEM;
5756 		goto out;
5757 	}
5758 	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
5759 
5760 	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
5761 		ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5762 		if (!ret)
5763 			err = btrfs_update_inode_fallback(trans, root, inode);
5764 		goto out;
5765 	}
5766 
5767 	lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset,
5768 			 ordered->file_offset + ordered->len - 1, 0,
5769 			 &cached_state, GFP_NOFS);
5770 
5771 	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
5772 		ret = btrfs_mark_extent_written(trans, inode,
5773 						ordered->file_offset,
5774 						ordered->file_offset +
5775 						ordered->len);
5776 		if (ret) {
5777 			err = ret;
5778 			goto out_unlock;
5779 		}
5780 	} else {
5781 		ret = insert_reserved_file_extent(trans, inode,
5782 						  ordered->file_offset,
5783 						  ordered->start,
5784 						  ordered->disk_len,
5785 						  ordered->len,
5786 						  ordered->len,
5787 						  0, 0, 0,
5788 						  BTRFS_FILE_EXTENT_REG);
5789 		unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
5790 				   ordered->file_offset, ordered->len);
5791 		if (ret) {
5792 			err = ret;
5793 			WARN_ON(1);
5794 			goto out_unlock;
5795 		}
5796 	}
5797 
5798 	add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
5799 	ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5800 	if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags))
5801 		btrfs_update_inode_fallback(trans, root, inode);
5802 	ret = 0;
5803 out_unlock:
5804 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
5805 			     ordered->file_offset + ordered->len - 1,
5806 			     &cached_state, GFP_NOFS);
5807 out:
5808 	btrfs_delalloc_release_metadata(inode, ordered->len);
5809 	btrfs_end_transaction(trans, root);
5810 	ordered_offset = ordered->file_offset + ordered->len;
5811 	btrfs_put_ordered_extent(ordered);
5812 	btrfs_put_ordered_extent(ordered);
5813 
5814 out_test:
5815 	/*
5816 	 * our bio might span multiple ordered extents.  If we haven't
5817 	 * completed the accounting for the whole dio, go back and try again
5818 	 */
5819 	if (ordered_offset < dip->logical_offset + dip->bytes) {
5820 		ordered_bytes = dip->logical_offset + dip->bytes -
5821 			ordered_offset;
5822 		goto again;
5823 	}
5824 out_done:
5825 	bio->bi_private = dip->private;
5826 
5827 	kfree(dip->csums);
5828 	kfree(dip);
5829 
5830 	/* If we had an error make sure to clear the uptodate flag */
5831 	if (err)
5832 		clear_bit(BIO_UPTODATE, &bio->bi_flags);
5833 	dio_end_io(bio, err);
5834 }
5835 
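/*
 * Async submit hook: csum the bio's data before it goes down to the
 * device.  Called from the helper threads set up by btrfs_wq_submit_bio.
 */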
5836 static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
5837 				    struct bio *bio, int mirror_num,
5838 				    unsigned long bio_flags, u64 offset)
5839 {
5840 	int ret;
5841 	struct btrfs_root *root = BTRFS_I(inode)->root;
5842 	ret = btrfs_csum_one_bio(root, inode, bio, offset, 1);
5843 	BUG_ON(ret);
5844 	return 0;
5845 }
5846 
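/*
 * Completion handler for each split dio bio: record any error, and when
 * the last pending bio for this dio finishes, end the original bio with
 * success or failure.
 */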
5847 static void btrfs_end_dio_bio(struct bio *bio, int err)
5848 {
5849 	struct btrfs_dio_private *dip = bio->bi_private;
5850 
5851 	if (err) {
5852 		printk(KERN_ERR "btrfs direct IO failed ino %llu rw %lu "
5853 		      "sector %#Lx len %u err no %d\n",
5854 		      (unsigned long long)btrfs_ino(dip->inode), bio->bi_rw,
5855 		      (unsigned long long)bio->bi_sector, bio->bi_size, err);
5856 		dip->errors = 1;
5857 
5858 		/*
5859 		 * before the atomic variable goes to zero, we must make sure
5860 		 * dip->errors is perceived to be set.
5861 		 */
5862 		smp_mb__before_atomic_dec();
5863 	}
5864 
5865 	/* if there are more bios still pending for this dio, just exit */
5866 	if (!atomic_dec_and_test(&dip->pending_bios))
5867 		goto out;
5868 
5869 	if (dip->errors)
5870 		bio_io_error(dip->orig_bio);
5871 	else {
5872 		set_bit(BIO_UPTODATE, &dip->orig_bio->bi_flags);
5873 		bio_endio(dip->orig_bio, 0);
5874 	}
5875 out:
5876 	bio_put(bio);
5877 }
5878 
5879 static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
5880 				       u64 first_sector, gfp_t gfp_flags)
5881 {
5882 	int nr_vecs = bio_get_nr_vecs(bdev);
5883 	return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags);
5884 }
5885 
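/*
 * Send one dio bio down to the disk: hook up the end_io workqueue, csum
 * the data for writes (inline or via the async helper threads) or look up
 * the expected csums for reads, then map and submit the bio.
 */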
5886 static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
5887 					 int rw, u64 file_offset, int skip_sum,
5888 					 u32 *csums, int async_submit)
5889 {
5890 	int write = rw & REQ_WRITE;
5891 	struct btrfs_root *root = BTRFS_I(inode)->root;
5892 	int ret;
5893 
5894 	bio_get(bio);
5895 	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
5896 	if (ret)
5897 		goto err;
5898 
5899 	if (skip_sum)
5900 		goto map;
5901 
5902 	if (write && async_submit) {
5903 		ret = btrfs_wq_submit_bio(root->fs_info,
5904 				   inode, rw, bio, 0, 0,
5905 				   file_offset,
5906 				   __btrfs_submit_bio_start_direct_io,
5907 				   __btrfs_submit_bio_done);
5908 		goto err;
5909 	} else if (write) {
5910 		/*
5911 		 * If we aren't doing async submit, calculate the csum of the
5912 		 * bio now.
5913 		 */
5914 		ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1);
5915 		if (ret)
5916 			goto err;
5917 	} else if (!skip_sum) {
5918 		ret = btrfs_lookup_bio_sums_dio(root, inode, bio,
5919 					  file_offset, csums);
5920 		if (ret)
5921 			goto err;
5922 	}
5923 
5924 map:
5925 	ret = btrfs_map_bio(root, rw, bio, 0, async_submit);
5926 err:
5927 	bio_put(bio);
5928 	return ret;
5929 }
5930 
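/*
 * Split the original dio bio into bios that each fit within a single
 * chunk mapping (as reported by btrfs_map_block) and submit them, using
 * dip->pending_bios to track how many are still in flight.
 */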
5931 static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
5932 				    int skip_sum)
5933 {
5934 	struct inode *inode = dip->inode;
5935 	struct btrfs_root *root = BTRFS_I(inode)->root;
5936 	struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
5937 	struct bio *bio;
5938 	struct bio *orig_bio = dip->orig_bio;
5939 	struct bio_vec *bvec = orig_bio->bi_io_vec;
5940 	u64 start_sector = orig_bio->bi_sector;
5941 	u64 file_offset = dip->logical_offset;
5942 	u64 submit_len = 0;
5943 	u64 map_length;
5944 	int nr_pages = 0;
5945 	u32 *csums = dip->csums;
5946 	int ret = 0;
5947 	int async_submit = 0;
5948 	int write = rw & REQ_WRITE;
5949 
5950 	map_length = orig_bio->bi_size;
5951 	ret = btrfs_map_block(map_tree, READ, start_sector << 9,
5952 			      &map_length, NULL, 0);
5953 	if (ret) {
5954 		bio_put(orig_bio);
5955 		return -EIO;
5956 	}
5957 
5958 	if (map_length >= orig_bio->bi_size) {
5959 		bio = orig_bio;
5960 		goto submit;
5961 	}
5962 
5963 	async_submit = 1;
5964 	bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
5965 	if (!bio)
5966 		return -ENOMEM;
5967 	bio->bi_private = dip;
5968 	bio->bi_end_io = btrfs_end_dio_bio;
5969 	atomic_inc(&dip->pending_bios);
5970 
5971 	while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
5972 		if (unlikely(map_length < submit_len + bvec->bv_len ||
5973 		    bio_add_page(bio, bvec->bv_page, bvec->bv_len,
5974 				 bvec->bv_offset) < bvec->bv_len)) {
5975 			/*
5976 			 * inc the count before we submit the bio so the
5977 			 * end IO handler can't run before we've done the
5978 			 * inc.  Otherwise, the dip might get freed before
5979 			 * we're done setting it up
5980 			 */
5981 			atomic_inc(&dip->pending_bios);
5982 			ret = __btrfs_submit_dio_bio(bio, inode, rw,
5983 						     file_offset, skip_sum,
5984 						     csums, async_submit);
5985 			if (ret) {
5986 				bio_put(bio);
5987 				atomic_dec(&dip->pending_bios);
5988 				goto out_err;
5989 			}
5990 
5991 			/* Writes use the ordered csums */
5992 			if (!write && !skip_sum)
5993 				csums = csums + nr_pages;
5994 			start_sector += submit_len >> 9;
5995 			file_offset += submit_len;
5996 
5997 			submit_len = 0;
5998 			nr_pages = 0;
5999 
6000 			bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev,
6001 						  start_sector, GFP_NOFS);
6002 			if (!bio)
6003 				goto out_err;
6004 			bio->bi_private = dip;
6005 			bio->bi_end_io = btrfs_end_dio_bio;
6006 
6007 			map_length = orig_bio->bi_size;
6008 			ret = btrfs_map_block(map_tree, READ, start_sector << 9,
6009 					      &map_length, NULL, 0);
6010 			if (ret) {
6011 				bio_put(bio);
6012 				goto out_err;
6013 			}
6014 		} else {
6015 			submit_len += bvec->bv_len;
6016 			nr_pages++;
6017 			bvec++;
6018 		}
6019 	}
6020 
6021 submit:
6022 	ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum,
6023 				     csums, async_submit);
6024 	if (!ret)
6025 		return 0;
6026 
6027 	bio_put(bio);
6028 out_err:
6029 	dip->errors = 1;
6030 	/*
6031 	 * before the atomic variable goes to zero, we must
6032 	 * make sure dip->errors is perceived to be set.
6033 	 */
6034 	smp_mb__before_atomic_dec();
6035 	if (atomic_dec_and_test(&dip->pending_bios))
6036 		bio_io_error(dip->orig_bio);
6037 
6038 	/* bio_end_io() will handle the error, so we needn't return it */
6039 	return 0;
6040 }
6041 
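/*
 * Submission entry point handed to __blockdev_direct_IO: set up the
 * btrfs_dio_private for this bio, pick the read or write completion
 * handler, and submit.  If submission fails for a write, the reserved
 * extent and the ordered extent are cleaned up here.
 */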
6042 static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
6043 				loff_t file_offset)
6044 {
6045 	struct btrfs_root *root = BTRFS_I(inode)->root;
6046 	struct btrfs_dio_private *dip;
6047 	struct bio_vec *bvec = bio->bi_io_vec;
6048 	int skip_sum;
6049 	int write = rw & REQ_WRITE;
6050 	int ret = 0;
6051 
6052 	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
6053 
6054 	dip = kmalloc(sizeof(*dip), GFP_NOFS);
6055 	if (!dip) {
6056 		ret = -ENOMEM;
6057 		goto free_ordered;
6058 	}
6059 	dip->csums = NULL;
6060 
6061 	/* Writes use the ordered csum stuff, so we don't need dip->csums */
6062 	if (!write && !skip_sum) {
6063 		dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
6064 		if (!dip->csums) {
6065 			kfree(dip);
6066 			ret = -ENOMEM;
6067 			goto free_ordered;
6068 		}
6069 	}
6070 
6071 	dip->private = bio->bi_private;
6072 	dip->inode = inode;
6073 	dip->logical_offset = file_offset;
6074 
6075 	dip->bytes = 0;
6076 	do {
6077 		dip->bytes += bvec->bv_len;
6078 		bvec++;
6079 	} while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1));
6080 
6081 	dip->disk_bytenr = (u64)bio->bi_sector << 9;
6082 	bio->bi_private = dip;
6083 	dip->errors = 0;
6084 	dip->orig_bio = bio;
6085 	atomic_set(&dip->pending_bios, 0);
6086 
6087 	if (write)
6088 		bio->bi_end_io = btrfs_endio_direct_write;
6089 	else
6090 		bio->bi_end_io = btrfs_endio_direct_read;
6091 
6092 	ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
6093 	if (!ret)
6094 		return;
6095 free_ordered:
6096 	/*
6097 	 * If this is a write, we need to clean up the reserved space and kill
6098 	 * the ordered extent.
6099 	 */
6100 	if (write) {
6101 		struct btrfs_ordered_extent *ordered;
6102 		ordered = btrfs_lookup_ordered_extent(inode, file_offset);
6103 		if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
6104 		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
6105 			btrfs_free_reserved_extent(root, ordered->start,
6106 						   ordered->disk_len);
6107 		btrfs_put_ordered_extent(ordered);
6108 		btrfs_put_ordered_extent(ordered);
6109 	}
6110 	bio_endio(bio, ret);
6111 }
6112 
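/*
 * Sanity check a dio request: the file offset and every iovec base and
 * length must be sector aligned, and reads may not reuse an iov_base.
 * Returns 0 if the request is usable, -EINVAL if the caller should fall
 * back to buffered IO.
 */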
6113 static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
6114 			const struct iovec *iov, loff_t offset,
6115 			unsigned long nr_segs)
6116 {
6117 	int seg;
6118 	int i;
6119 	size_t size;
6120 	unsigned long addr;
6121 	unsigned blocksize_mask = root->sectorsize - 1;
6122 	ssize_t retval = -EINVAL;
6123 	loff_t end = offset;
6124 
6125 	if (offset & blocksize_mask)
6126 		goto out;
6127 
6128 	/* Check the memory alignment.  Blocks cannot straddle pages */
6129 	for (seg = 0; seg < nr_segs; seg++) {
6130 		addr = (unsigned long)iov[seg].iov_base;
6131 		size = iov[seg].iov_len;
6132 		end += size;
6133 		if ((addr & blocksize_mask) || (size & blocksize_mask))
6134 			goto out;
6135 
6136 		/* If this is a write we don't need to check anymore */
6137 		if (rw & WRITE)
6138 			continue;
6139 
6140 		/*
6141 		 * Check to make sure we don't have duplicate iov_base's in this
6142 		 * iovec, if so return -EINVAL, otherwise we'll get csum errors
6143 		 * when reading back.
6144 		 */
6145 		for (i = seg + 1; i < nr_segs; i++) {
6146 			if (iov[seg].iov_base == iov[i].iov_base)
6147 				goto out;
6148 		}
6149 	}
6150 	retval = 0;
6151 out:
6152 	return retval;
6153 }
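
/*
 * ->direct_IO: lock the range, wait out any ordered extents, reserve space
 * and set the delalloc bits for writes, then hand everything to
 * __blockdev_direct_IO with our get_blocks and submit hooks.
 */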
6154 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
6155 			const struct iovec *iov, loff_t offset,
6156 			unsigned long nr_segs)
6157 {
6158 	struct file *file = iocb->ki_filp;
6159 	struct inode *inode = file->f_mapping->host;
6160 	struct btrfs_ordered_extent *ordered;
6161 	struct extent_state *cached_state = NULL;
6162 	u64 lockstart, lockend;
6163 	ssize_t ret;
6164 	int writing = rw & WRITE;
6165 	int write_bits = 0;
6166 	size_t count = iov_length(iov, nr_segs);
6167 
6168 	if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
6169 			    offset, nr_segs)) {
6170 		return 0;
6171 	}
6172 
6173 	lockstart = offset;
6174 	lockend = offset + count - 1;
6175 
6176 	if (writing) {
6177 		ret = btrfs_delalloc_reserve_space(inode, count);
6178 		if (ret)
6179 			goto out;
6180 	}
6181 
6182 	while (1) {
6183 		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
6184 				 0, &cached_state, GFP_NOFS);
6185 		/*
6186 		 * We're concerned with the entire range that we're going to be
6187 		 * doing DIO to, so we need to make sure there are no ordered
6188 		 * extents in this range.
6189 		 */
6190 		ordered = btrfs_lookup_ordered_range(inode, lockstart,
6191 						     lockend - lockstart + 1);
6192 		if (!ordered)
6193 			break;
6194 		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
6195 				     &cached_state, GFP_NOFS);
6196 		btrfs_start_ordered_extent(inode, ordered, 1);
6197 		btrfs_put_ordered_extent(ordered);
6198 		cond_resched();
6199 	}
6200 
6201 	/*
6202 	 * we don't use btrfs_set_extent_delalloc because we don't want
6203 	 * the dirty or uptodate bits
6204 	 */
6205 	if (writing) {
6206 		write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING;
6207 		ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
6208 				     EXTENT_DELALLOC, 0, NULL, &cached_state,
6209 				     GFP_NOFS);
6210 		if (ret) {
6211 			clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
6212 					 lockend, EXTENT_LOCKED | write_bits,
6213 					 1, 0, &cached_state, GFP_NOFS);
6214 			goto out;
6215 		}
6216 	}
6217 
6218 	free_extent_state(cached_state);
6219 	cached_state = NULL;
6220 
6221 	ret = __blockdev_direct_IO(rw, iocb, inode,
6222 		   BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
6223 		   iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
6224 		   btrfs_submit_direct, 0);
6225 
6226 	if (ret < 0 && ret != -EIOCBQUEUED) {
6227 		clear_extent_bit(&BTRFS_I(inode)->io_tree, offset,
6228 			      offset + iov_length(iov, nr_segs) - 1,
6229 			      EXTENT_LOCKED | write_bits, 1, 0,
6230 			      &cached_state, GFP_NOFS);
6231 	} else if (ret >= 0 && ret < iov_length(iov, nr_segs)) {
6232 		/*
6233 		 * We're falling back to buffered, unlock the section we didn't
6234 		 * do IO on.
6235 		 */
6236 		clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret,
6237 			      offset + iov_length(iov, nr_segs) - 1,
6238 			      EXTENT_LOCKED | write_bits, 1, 0,
6239 			      &cached_state, GFP_NOFS);
6240 	}
6241 out:
6242 	free_extent_state(cached_state);
6243 	return ret;
6244 }
6245 
6246 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
6247 		__u64 start, __u64 len)
6248 {
6249 	return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
6250 }
6251 
6252 int btrfs_readpage(struct file *file, struct page *page)
6253 {
6254 	struct extent_io_tree *tree;
6255 	tree = &BTRFS_I(page->mapping->host)->io_tree;
6256 	return extent_read_full_page(tree, page, btrfs_get_extent, 0);
6257 }
6258 
6259 static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
6260 {
6261 	struct extent_io_tree *tree;
6262 
6264 	if (current->flags & PF_MEMALLOC) {
6265 		redirty_page_for_writepage(wbc, page);
6266 		unlock_page(page);
6267 		return 0;
6268 	}
6269 	tree = &BTRFS_I(page->mapping->host)->io_tree;
6270 	return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
6271 }
6272 
6273 int btrfs_writepages(struct address_space *mapping,
6274 		     struct writeback_control *wbc)
6275 {
6276 	struct extent_io_tree *tree;
6277 
6278 	tree = &BTRFS_I(mapping->host)->io_tree;
6279 	return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
6280 }
6281 
6282 static int
6283 btrfs_readpages(struct file *file, struct address_space *mapping,
6284 		struct list_head *pages, unsigned nr_pages)
6285 {
6286 	struct extent_io_tree *tree;
6287 	tree = &BTRFS_I(mapping->host)->io_tree;
6288 	return extent_readpages(tree, mapping, pages, nr_pages,
6289 				btrfs_get_extent);
6290 }
6291 static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
6292 {
6293 	struct extent_io_tree *tree;
6294 	struct extent_map_tree *map;
6295 	int ret;
6296 
6297 	tree = &BTRFS_I(page->mapping->host)->io_tree;
6298 	map = &BTRFS_I(page->mapping->host)->extent_tree;
6299 	ret = try_release_extent_mapping(map, tree, page, gfp_flags);
6300 	if (ret == 1) {
6301 		ClearPagePrivate(page);
6302 		set_page_private(page, 0);
6303 		page_cache_release(page);
6304 	}
6305 	return ret;
6306 }
6307 
6308 static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
6309 {
6310 	if (PageWriteback(page) || PageDirty(page))
6311 		return 0;
6312 	return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
6313 }
6314 
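/*
 * Called when a page is being removed from the page cache: wait for any
 * writeback, account for ordered extents whose IO will now never start,
 * clear the extent state for the range and drop the page's private state.
 */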
6315 static void btrfs_invalidatepage(struct page *page, unsigned long offset)
6316 {
6317 	struct extent_io_tree *tree;
6318 	struct btrfs_ordered_extent *ordered;
6319 	struct extent_state *cached_state = NULL;
6320 	u64 page_start = page_offset(page);
6321 	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
6322 
6324 	/*
6325 	 * we have the page locked, so new writeback can't start,
6326 	 * and the dirty bit won't be cleared while we are here.
6327 	 *
6328 	 * Wait for IO on this page so that we can safely clear
6329 	 * the PagePrivate2 bit and do ordered accounting
6330 	 */
6331 	wait_on_page_writeback(page);
6332 
6333 	tree = &BTRFS_I(page->mapping->host)->io_tree;
6334 	if (offset) {
6335 		btrfs_releasepage(page, GFP_NOFS);
6336 		return;
6337 	}
6338 	lock_extent_bits(tree, page_start, page_end, 0, &cached_state,
6339 			 GFP_NOFS);
6340 	ordered = btrfs_lookup_ordered_extent(page->mapping->host,
6341 					   page_offset(page));
6342 	if (ordered) {
6343 		/*
6344 		 * IO on this page will never be started, so we need
6345 		 * to account for any ordered extents now
6346 		 */
6347 		clear_extent_bit(tree, page_start, page_end,
6348 				 EXTENT_DIRTY | EXTENT_DELALLOC |
6349 				 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0,
6350 				 &cached_state, GFP_NOFS);
6351 		/*
6352 		 * whoever cleared the private bit is responsible
6353 		 * for the finish_ordered_io
6354 		 */
6355 		if (TestClearPagePrivate2(page)) {
6356 			btrfs_finish_ordered_io(page->mapping->host,
6357 						page_start, page_end);
6358 		}
6359 		btrfs_put_ordered_extent(ordered);
6360 		cached_state = NULL;
6361 		lock_extent_bits(tree, page_start, page_end, 0, &cached_state,
6362 				 GFP_NOFS);
6363 	}
6364 	clear_extent_bit(tree, page_start, page_end,
6365 		 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
6366 		 EXTENT_DO_ACCOUNTING, 1, 1, &cached_state, GFP_NOFS);
6367 	__btrfs_releasepage(page, GFP_NOFS);
6368 
6369 	ClearPageChecked(page);
6370 	if (PagePrivate(page)) {
6371 		ClearPagePrivate(page);
6372 		set_page_private(page, 0);
6373 		page_cache_release(page);
6374 	}
6375 }
6376 
6377 /*
6378  * btrfs_page_mkwrite() is not allowed to change the file size as it gets
6379  * called from a page fault handler when a page is first dirtied. Hence we must
6380  * be careful to check for EOF conditions here. We set the page up correctly
6381  * for a written page which means we get ENOSPC checking when writing into
6382  * holes and correct delalloc and unwritten extent mapping on filesystems that
6383  * support these features.
6384  *
6385  * We are not allowed to take the i_mutex here so we have to play games to
6386  * protect against truncate races as the page could now be beyond EOF.  Because
6387  * vmtruncate() writes the inode size before removing pages, once we have the
6388  * page lock we can determine safely if the page is beyond EOF. If it is not
6389  * beyond EOF, then the page is guaranteed safe against truncation until we
6390  * unlock the page.
6391  */
6392 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
6393 {
6394 	struct page *page = vmf->page;
6395 	struct inode *inode = fdentry(vma->vm_file)->d_inode;
6396 	struct btrfs_root *root = BTRFS_I(inode)->root;
6397 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
6398 	struct btrfs_ordered_extent *ordered;
6399 	struct extent_state *cached_state = NULL;
6400 	char *kaddr;
6401 	unsigned long zero_start;
6402 	loff_t size;
6403 	int ret;
6404 	u64 page_start;
6405 	u64 page_end;
6406 
6407 	ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
6408 	if (!ret)
6409 		ret = btrfs_update_time(vma->vm_file);
6410 	if (ret) {
6411 		if (ret == -ENOMEM)
6412 			ret = VM_FAULT_OOM;
6413 		else /* -ENOSPC, -EIO, etc */
6414 			ret = VM_FAULT_SIGBUS;
6415 		goto out;
6416 	}
6417 
6418 	ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
6419 again:
6420 	lock_page(page);
6421 	size = i_size_read(inode);
6422 	page_start = page_offset(page);
6423 	page_end = page_start + PAGE_CACHE_SIZE - 1;
6424 
6425 	if ((page->mapping != inode->i_mapping) ||
6426 	    (page_start >= size)) {
6427 		/* page got truncated out from underneath us */
6428 		goto out_unlock;
6429 	}
6430 	wait_on_page_writeback(page);
6431 
6432 	lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state,
6433 			 GFP_NOFS);
6434 	set_page_extent_mapped(page);
6435 
6436 	/*
6437 	 * we can't set the delalloc bits if there are pending ordered
6438 	 * extents.  Drop our locks and wait for them to finish
6439 	 */
6440 	ordered = btrfs_lookup_ordered_extent(inode, page_start);
6441 	if (ordered) {
6442 		unlock_extent_cached(io_tree, page_start, page_end,
6443 				     &cached_state, GFP_NOFS);
6444 		unlock_page(page);
6445 		btrfs_start_ordered_extent(inode, ordered, 1);
6446 		btrfs_put_ordered_extent(ordered);
6447 		goto again;
6448 	}
6449 
6450 	/*
6451 	 * XXX - page_mkwrite gets called every time the page is dirtied, even
6452 	 * if it was already dirty, so for space accounting reasons we need to
6453 	 * clear any delalloc bits for the range we are fixing to save.  There
6454 	 * is probably a better way to do this, but for now keep consistent with
6455 	 * prepare_pages in the normal write path.
6456 	 */
6457 	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
6458 			  EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
6459 			  0, 0, &cached_state, GFP_NOFS);
6460 
6461 	ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
6462 					&cached_state);
6463 	if (ret) {
6464 		unlock_extent_cached(io_tree, page_start, page_end,
6465 				     &cached_state, GFP_NOFS);
6466 		ret = VM_FAULT_SIGBUS;
6467 		goto out_unlock;
6468 	}
6469 	ret = 0;
6470 
6471 	/* page is wholly or partially inside EOF */
6472 	if (page_start + PAGE_CACHE_SIZE > size)
6473 		zero_start = size & ~PAGE_CACHE_MASK;
6474 	else
6475 		zero_start = PAGE_CACHE_SIZE;
6476 
6477 	if (zero_start != PAGE_CACHE_SIZE) {
6478 		kaddr = kmap(page);
6479 		memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
6480 		flush_dcache_page(page);
6481 		kunmap(page);
6482 	}
6483 	ClearPageChecked(page);
6484 	set_page_dirty(page);
6485 	SetPageUptodate(page);
6486 
6487 	BTRFS_I(inode)->last_trans = root->fs_info->generation;
6488 	BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
6489 
6490 	unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
6491 
6492 out_unlock:
6493 	if (!ret)
6494 		return VM_FAULT_LOCKED;
6495 	unlock_page(page);
6496 out:
6497 	btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
6498 	return ret;
6499 }
6500 
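/*
 * Truncate the inode down to its current i_size, dropping file extent
 * items one transaction at a time and keeping an orphan item in place so
 * a crash mid-truncate can be cleaned up on the next mount.
 */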
6501 static int btrfs_truncate(struct inode *inode)
6502 {
6503 	struct btrfs_root *root = BTRFS_I(inode)->root;
6504 	struct btrfs_block_rsv *rsv;
6505 	int ret;
6506 	int err = 0;
6507 	struct btrfs_trans_handle *trans;
6508 	unsigned long nr;
6509 	u64 mask = root->sectorsize - 1;
6510 	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
6511 
6512 	ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
6513 	if (ret)
6514 		return ret;
6515 
6516 	btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
6517 	btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
6518 
6519 	/*
6520 	 * Yes, ladies and gentlemen, this is indeed ugly.  The fact is we have
6521 	 * 3 things going on here:
6522 	 *
6523 	 * 1) We need to reserve space for our orphan item and the space to
6524 	 * delete our orphan item.  Lord knows we don't want to have a dangling
6525 	 * orphan item because we didn't reserve space to remove it.
6526 	 *
6527 	 * 2) We need to reserve space to update our inode.
6528 	 *
6529 	 * 3) We need to have something to cache all the space that is going to
6530 	 * be freed up by the truncate operation, but also have some slack
6531 	 * space reserved in case it uses space during the truncate (thank you
6532 	 * very much snapshotting).
6533 	 *
6534 	 * And we need these to all be separate.  The fact is we can use a lot
6535 	 * of space doing the truncate, and we have no earthly idea how much
6536 	 * space we will use, so we need the truncate reservation to be separate
6537 	 * so it doesn't end up using space reserved for updating the inode or
6538 	 * removing the orphan item.  We also need to be able to stop the
6539 	 * transaction and start a new one, which means we need to be able to
6540 	 * update the inode several times, and we have no way of knowing how
6541 	 * many times that will be, so we can't just reserve 1 item for the
6542 	 * entirety of the operation, so that has to be done separately as well.
6543 	 * Then there is the orphan item, which does indeed need to be held on
6544 	 * to for the whole operation, and we need nobody to touch this reserved
6545 	 * space except the orphan code.
6546 	 *
6547 	 * So that leaves us with
6548 	 *
6549 	 * 1) root->orphan_block_rsv - for the orphan deletion.
6550 	 * 2) rsv - for the truncate reservation, which we will steal from the
6551 	 * transaction reservation.
6552 	 * 3) fs_info->trans_block_rsv - this will have 1 item's worth left for
6553 	 * updating the inode.
6554 	 */
6555 	rsv = btrfs_alloc_block_rsv(root);
6556 	if (!rsv)
6557 		return -ENOMEM;
6558 	rsv->size = min_size;
6559 
6560 	/*
6561 	 * 1 for the truncate slack space
6562 	 * 1 for the orphan item we're going to add
6563 	 * 1 for the orphan item deletion
6564 	 * 1 for updating the inode.
6565 	 */
6566 	trans = btrfs_start_transaction(root, 4);
6567 	if (IS_ERR(trans)) {
6568 		err = PTR_ERR(trans);
6569 		goto out;
6570 	}
6571 
6572 	/* Migrate the slack space for the truncate to our reserve */
6573 	ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
6574 				      min_size);
6575 	BUG_ON(ret);
6576 
6577 	ret = btrfs_orphan_add(trans, inode);
6578 	if (ret) {
6579 		btrfs_end_transaction(trans, root);
6580 		goto out;
6581 	}
6582 
6583 	/*
6584 	 * setattr is responsible for setting the ordered_data_close flag,
6585 	 * but that is only tested during the last file release.  That
6586 	 * could happen well after the next commit, leaving a great big
6587 	 * window where new writes may get lost if someone chooses to write
6588 	 * to this file after truncating to zero
6589 	 *
6590 	 * The inode doesn't have any dirty data here, and so if we commit
6591 	 * this is a noop.  If someone immediately starts writing to the inode
6592 	 * it is very likely we'll catch some of their writes in this
6593 	 * transaction, and the commit will find this file on the ordered
6594 	 * data list with good things to send down.
6595 	 *
6596 	 * This is a best effort solution, there is still a window where
6597 	 * using truncate to replace the contents of the file will
6598 	 * end up with a zero length file after a crash.
6599 	 */
6600 	if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close)
6601 		btrfs_add_ordered_operation(trans, root, inode);
6602 
6603 	while (1) {
6604 		ret = btrfs_block_rsv_refill(root, rsv, min_size);
6605 		if (ret) {
6606 			/*
6607 			 * This can only happen with the original transaction we
6608 			 * started above; every other time we shouldn't have a
6609 			 * transaction started yet.
6610 			 */
6611 			if (ret == -EAGAIN)
6612 				goto end_trans;
6613 			err = ret;
6614 			break;
6615 		}
6616 
6617 		if (!trans) {
6618 			/* Just need the 1 for updating the inode */
6619 			trans = btrfs_start_transaction(root, 1);
6620 			if (IS_ERR(trans)) {
6621 				ret = err = PTR_ERR(trans);
6622 				trans = NULL;
6623 				break;
6624 			}
6625 		}
6626 
6627 		trans->block_rsv = rsv;
6628 
6629 		ret = btrfs_truncate_inode_items(trans, root, inode,
6630 						 inode->i_size,
6631 						 BTRFS_EXTENT_DATA_KEY);
6632 		if (ret != -EAGAIN) {
6633 			err = ret;
6634 			break;
6635 		}
6636 
6637 		trans->block_rsv = &root->fs_info->trans_block_rsv;
6638 		ret = btrfs_update_inode(trans, root, inode);
6639 		if (ret) {
6640 			err = ret;
6641 			break;
6642 		}
6643 end_trans:
6644 		nr = trans->blocks_used;
6645 		btrfs_end_transaction(trans, root);
6646 		trans = NULL;
6647 		btrfs_btree_balance_dirty(root, nr);
6648 	}
6649 
6650 	if (ret == 0 && inode->i_nlink > 0) {
6651 		trans->block_rsv = root->orphan_block_rsv;
6652 		ret = btrfs_orphan_del(trans, inode);
6653 		if (ret)
6654 			err = ret;
6655 	} else if (ret && inode->i_nlink > 0) {
6656 		/*
6657 		 * Failed to do the truncate, remove us from the in-memory
6658 		 * orphan list.
6659 		 */
6660 		ret = btrfs_orphan_del(NULL, inode);
6661 	}
6662 
6663 	if (trans) {
6664 		trans->block_rsv = &root->fs_info->trans_block_rsv;
6665 		ret = btrfs_update_inode(trans, root, inode);
6666 		if (ret && !err)
6667 			err = ret;
6668 
6669 		nr = trans->blocks_used;
6670 		ret = btrfs_end_transaction(trans, root);
6671 		btrfs_btree_balance_dirty(root, nr);
6672 	}
6673 
6674 out:
6675 	btrfs_free_block_rsv(root, rsv);
6676 
6677 	if (ret && !err)
6678 		err = ret;
6679 
6680 	return err;
6681 }
6682 
6683 /*
6684  * create a new subvolume directory/inode (helper for the ioctl).
6685  */
6686 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
6687 			     struct btrfs_root *new_root, u64 new_dirid)
6688 {
6689 	struct inode *inode;
6690 	int err;
6691 	u64 index = 0;
6692 
6693 	inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
6694 				new_dirid, S_IFDIR | 0700, &index);
6695 	if (IS_ERR(inode))
6696 		return PTR_ERR(inode);
6697 	inode->i_op = &btrfs_dir_inode_operations;
6698 	inode->i_fop = &btrfs_dir_file_operations;
6699 
6700 	set_nlink(inode, 1);
6701 	btrfs_i_size_write(inode, 0);
6702 
6703 	err = btrfs_update_inode(trans, new_root, inode);
6704 	BUG_ON(err);
6705 
6706 	iput(inode);
6707 	return 0;
6708 }
6709 
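/*
 * Allocate an in-memory btrfs inode from the inode slab and initialize
 * all of its trees, lists and counters.
 */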
6710 struct inode *btrfs_alloc_inode(struct super_block *sb)
6711 {
6712 	struct btrfs_inode *ei;
6713 	struct inode *inode;
6714 
6715 	ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
6716 	if (!ei)
6717 		return NULL;
6718 
6719 	ei->root = NULL;
6720 	ei->space_info = NULL;
6721 	ei->generation = 0;
6722 	ei->sequence = 0;
6723 	ei->last_trans = 0;
6724 	ei->last_sub_trans = 0;
6725 	ei->logged_trans = 0;
6726 	ei->delalloc_bytes = 0;
6727 	ei->disk_i_size = 0;
6728 	ei->flags = 0;
6729 	ei->csum_bytes = 0;
6730 	ei->index_cnt = (u64)-1;
6731 	ei->last_unlink_trans = 0;
6732 
6733 	spin_lock_init(&ei->lock);
6734 	ei->outstanding_extents = 0;
6735 	ei->reserved_extents = 0;
6736 
6737 	ei->ordered_data_close = 0;
6738 	ei->orphan_meta_reserved = 0;
6739 	ei->dummy_inode = 0;
6740 	ei->in_defrag = 0;
6741 	ei->delalloc_meta_reserved = 0;
6742 	ei->force_compress = BTRFS_COMPRESS_NONE;
6743 
6744 	ei->delayed_node = NULL;
6745 
6746 	inode = &ei->vfs_inode;
6747 	extent_map_tree_init(&ei->extent_tree);
6748 	extent_io_tree_init(&ei->io_tree, &inode->i_data);
6749 	extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
6750 	mutex_init(&ei->log_mutex);
6751 	mutex_init(&ei->delalloc_mutex);
6752 	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
6753 	INIT_LIST_HEAD(&ei->i_orphan);
6754 	INIT_LIST_HEAD(&ei->delalloc_inodes);
6755 	INIT_LIST_HEAD(&ei->ordered_operations);
6756 	RB_CLEAR_NODE(&ei->rb_node);
6757 
6758 	return inode;
6759 }
6760 
6761 static void btrfs_i_callback(struct rcu_head *head)
6762 {
6763 	struct inode *inode = container_of(head, struct inode, i_rcu);
6764 	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
6765 }
6766 
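/*
 * Tear down an in-memory btrfs inode: warn about anything still
 * outstanding (reserved space, orphan list membership, ordered extents),
 * drop the cached extent maps and free the inode via RCU.
 */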
6767 void btrfs_destroy_inode(struct inode *inode)
6768 {
6769 	struct btrfs_ordered_extent *ordered;
6770 	struct btrfs_root *root = BTRFS_I(inode)->root;
6771 
6772 	WARN_ON(!list_empty(&inode->i_dentry));
6773 	WARN_ON(inode->i_data.nrpages);
6774 	WARN_ON(BTRFS_I(inode)->outstanding_extents);
6775 	WARN_ON(BTRFS_I(inode)->reserved_extents);
6776 	WARN_ON(BTRFS_I(inode)->delalloc_bytes);
6777 	WARN_ON(BTRFS_I(inode)->csum_bytes);
6778 
6779 	/*
6780 	 * This can happen where we create an inode, but somebody else also
6781 	 * created the same inode and we need to destroy the one we already
6782 	 * created.
6783 	 */
6784 	if (!root)
6785 		goto free;
6786 
6787 	/*
6788 	 * Make sure we're properly removed from the ordered operation
6789 	 * lists.
6790 	 */
6791 	smp_mb();
6792 	if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
6793 		spin_lock(&root->fs_info->ordered_extent_lock);
6794 		list_del_init(&BTRFS_I(inode)->ordered_operations);
6795 		spin_unlock(&root->fs_info->ordered_extent_lock);
6796 	}
6797 
6798 	spin_lock(&root->orphan_lock);
6799 	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
6800 		printk(KERN_INFO "BTRFS: inode %llu still on the orphan list\n",
6801 		       (unsigned long long)btrfs_ino(inode));
6802 		list_del_init(&BTRFS_I(inode)->i_orphan);
6803 	}
6804 	spin_unlock(&root->orphan_lock);
6805 
6806 	while (1) {
6807 		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
6808 		if (!ordered)
6809 			break;
6810 		else {
6811 			printk(KERN_ERR "btrfs found ordered "
6812 			       "extent %llu %llu on inode cleanup\n",
6813 			       (unsigned long long)ordered->file_offset,
6814 			       (unsigned long long)ordered->len);
6815 			btrfs_remove_ordered_extent(inode, ordered);
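			/* once for us, once for the ordered tree */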
6816 			btrfs_put_ordered_extent(ordered);
6817 			btrfs_put_ordered_extent(ordered);
6818 		}
6819 	}
6820 	inode_tree_del(inode);
6821 	btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
6822 free:
6823 	btrfs_remove_delayed_node(inode);
6824 	call_rcu(&inode->i_rcu, btrfs_i_callback);
6825 }
6826 
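/*
 * Decide whether an unused inode should be evicted immediately: inodes of
 * a dead root (other than the free space cache inode) are always dropped,
 * everything else gets the generic behaviour.
 */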
6827 int btrfs_drop_inode(struct inode *inode)
6828 {
6829 	struct btrfs_root *root = BTRFS_I(inode)->root;
6830 
6831 	if (btrfs_root_refs(&root->root_item) == 0 &&
6832 	    !btrfs_is_free_space_inode(root, inode))
6833 		return 1;
6834 	else
6835 		return generic_drop_inode(inode);
6836 }
6837 
6838 static void init_once(void *foo)
6839 {
6840 	struct btrfs_inode *ei = (struct btrfs_inode *) foo;
6841 
6842 	inode_init_once(&ei->vfs_inode);
6843 }
6844 
6845 void btrfs_destroy_cachep(void)
6846 {
6847 	if (btrfs_inode_cachep)
6848 		kmem_cache_destroy(btrfs_inode_cachep);
6849 	if (btrfs_trans_handle_cachep)
6850 		kmem_cache_destroy(btrfs_trans_handle_cachep);
6851 	if (btrfs_transaction_cachep)
6852 		kmem_cache_destroy(btrfs_transaction_cachep);
6853 	if (btrfs_path_cachep)
6854 		kmem_cache_destroy(btrfs_path_cachep);
6855 	if (btrfs_free_space_cachep)
6856 		kmem_cache_destroy(btrfs_free_space_cachep);
6857 }
6858 
6859 int btrfs_init_cachep(void)
6860 {
6861 	btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache",
6862 			sizeof(struct btrfs_inode), 0,
6863 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
6864 	if (!btrfs_inode_cachep)
6865 		goto fail;
6866 
6867 	btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache",
6868 			sizeof(struct btrfs_trans_handle), 0,
6869 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
6870 	if (!btrfs_trans_handle_cachep)
6871 		goto fail;
6872 
6873 	btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache",
6874 			sizeof(struct btrfs_transaction), 0,
6875 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
6876 	if (!btrfs_transaction_cachep)
6877 		goto fail;
6878 
6879 	btrfs_path_cachep = kmem_cache_create("btrfs_path_cache",
6880 			sizeof(struct btrfs_path), 0,
6881 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
6882 	if (!btrfs_path_cachep)
6883 		goto fail;
6884 
6885 	btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space_cache",
6886 			sizeof(struct btrfs_free_space), 0,
6887 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
6888 	if (!btrfs_free_space_cachep)
6889 		goto fail;
6890 
6891 	return 0;
6892 fail:
6893 	btrfs_destroy_cachep();
6894 	return -ENOMEM;
6895 }
6896 
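/*
 * stat(): report the block count as allocated bytes plus outstanding
 * delalloc bytes, so recently written data is accounted for before it is
 * flushed to disk.
 */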
6897 static int btrfs_getattr(struct vfsmount *mnt,
6898 			 struct dentry *dentry, struct kstat *stat)
6899 {
6900 	struct inode *inode = dentry->d_inode;
6901 	u32 blocksize = inode->i_sb->s_blocksize;
6902 
6903 	generic_fillattr(inode, stat);
6904 	stat->dev = BTRFS_I(inode)->root->anon_dev;
6905 	stat->blksize = PAGE_CACHE_SIZE;
6906 	stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
6907 		ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9;
6908 	return 0;
6909 }
6910 
6911 /*
6912  * If a file is moved, it will inherit the cow and compression flags of the new
6913  * directory.
6914  */
6915 static void fixup_inode_flags(struct inode *dir, struct inode *inode)
6916 {
6917 	struct btrfs_inode *b_dir = BTRFS_I(dir);
6918 	struct btrfs_inode *b_inode = BTRFS_I(inode);
6919 
6920 	if (b_dir->flags & BTRFS_INODE_NODATACOW)
6921 		b_inode->flags |= BTRFS_INODE_NODATACOW;
6922 	else
6923 		b_inode->flags &= ~BTRFS_INODE_NODATACOW;
6924 
6925 	if (b_dir->flags & BTRFS_INODE_COMPRESS)
6926 		b_inode->flags |= BTRFS_INODE_COMPRESS;
6927 	else
6928 		b_inode->flags &= ~BTRFS_INODE_COMPRESS;
6929 }
6930 
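/*
 * Rename old_dentry to new_dentry: unlink the old name (and the target it
 * replaces, if any), add the new link, and pin the tree log so a crash
 * leaves the inode reachable under either the old or the new name.
 */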
6931 static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
6932 			   struct inode *new_dir, struct dentry *new_dentry)
6933 {
6934 	struct btrfs_trans_handle *trans;
6935 	struct btrfs_root *root = BTRFS_I(old_dir)->root;
6936 	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
6937 	struct inode *new_inode = new_dentry->d_inode;
6938 	struct inode *old_inode = old_dentry->d_inode;
6939 	struct timespec ctime = CURRENT_TIME;
6940 	u64 index = 0;
6941 	u64 root_objectid;
6942 	int ret;
6943 	u64 old_ino = btrfs_ino(old_inode);
6944 
6945 	if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
6946 		return -EPERM;
6947 
6948 	/* we only allow rename subvolume link between subvolumes */
6949 	if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
6950 		return -EXDEV;
6951 
6952 	if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
6953 	    (new_inode && btrfs_ino(new_inode) == BTRFS_FIRST_FREE_OBJECTID))
6954 		return -ENOTEMPTY;
6955 
6956 	if (S_ISDIR(old_inode->i_mode) && new_inode &&
6957 	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
6958 		return -ENOTEMPTY;
6959 	/*
6960 	 * we're using rename to replace one file with another,
6961 	 * and the replacement file is large.  Start IO on it now so
6962 	 * we don't add too much work to the end of the transaction
6963 	 */
6964 	if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size &&
6965 	    old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
6966 		filemap_flush(old_inode->i_mapping);
6967 
6968 	/* close the racy window with snapshot create/destroy ioctl */
6969 	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
6970 		down_read(&root->fs_info->subvol_sem);
6971 	/*
6972 	 * We want to reserve the absolute worst case amount of items.  So if
6973 	 * both inodes are subvols and we need to unlink them then that would
6974 	 * require 4 item modifications, but if they are both normal inodes it
6975 	 * would require 5 item modifications, so we'll assume they're normal
6976 	 * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
6977 	 * should cover the worst case number of items we'll modify.
6978 	 */
6979 	trans = btrfs_start_transaction(root, 20);
6980 	if (IS_ERR(trans)) {
6981 		ret = PTR_ERR(trans);
6982 		goto out_notrans;
6983 	}
6984 
6985 	if (dest != root)
6986 		btrfs_record_root_in_trans(trans, dest);
6987 
6988 	ret = btrfs_set_inode_index(new_dir, &index);
6989 	if (ret)
6990 		goto out_fail;
6991 
6992 	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
6993 		/* force full log commit if subvolume involved. */
6994 		root->fs_info->last_trans_log_full_commit = trans->transid;
6995 	} else {
6996 		ret = btrfs_insert_inode_ref(trans, dest,
6997 					     new_dentry->d_name.name,
6998 					     new_dentry->d_name.len,
6999 					     old_ino,
7000 					     btrfs_ino(new_dir), index);
7001 		if (ret)
7002 			goto out_fail;
7003 		/*
7004 		 * this is an ugly little race, but the rename is required
7005 		 * to make sure that if we crash, the inode is either at the
7006 		 * old name or the new one.  pinning the log transaction lets
7007 		 * us make sure we don't allow a log commit to come in after
7008 		 * we unlink the name but before we add the new name back in.
7009 		 */
7010 		btrfs_pin_log_trans(root);
7011 	}
7012 	/*
7013 	 * make sure the inode gets flushed if it is replacing
7014 	 * something.
7015 	 */
7016 	if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode))
7017 		btrfs_add_ordered_operation(trans, root, old_inode);
7018 
7019 	old_dir->i_ctime = old_dir->i_mtime = ctime;
7020 	new_dir->i_ctime = new_dir->i_mtime = ctime;
7021 	old_inode->i_ctime = ctime;
7022 
7023 	if (old_dentry->d_parent != new_dentry->d_parent)
7024 		btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
7025 
7026 	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
7027 		root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
7028 		ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid,
7029 					old_dentry->d_name.name,
7030 					old_dentry->d_name.len);
7031 	} else {
7032 		ret = __btrfs_unlink_inode(trans, root, old_dir,
7033 					old_dentry->d_inode,
7034 					old_dentry->d_name.name,
7035 					old_dentry->d_name.len);
7036 		if (!ret)
7037 			ret = btrfs_update_inode(trans, root, old_inode);
7038 	}
7039 	BUG_ON(ret);
7040 
7041 	if (new_inode) {
7042 		new_inode->i_ctime = CURRENT_TIME;
7043 		if (unlikely(btrfs_ino(new_inode) ==
7044 			     BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
7045 			root_objectid = BTRFS_I(new_inode)->location.objectid;
7046 			ret = btrfs_unlink_subvol(trans, dest, new_dir,
7047 						root_objectid,
7048 						new_dentry->d_name.name,
7049 						new_dentry->d_name.len);
7050 			BUG_ON(new_inode->i_nlink == 0);
7051 		} else {
7052 			ret = btrfs_unlink_inode(trans, dest, new_dir,
7053 						 new_dentry->d_inode,
7054 						 new_dentry->d_name.name,
7055 						 new_dentry->d_name.len);
7056 		}
7057 		BUG_ON(ret);
7058 		if (new_inode->i_nlink == 0) {
7059 			ret = btrfs_orphan_add(trans, new_dentry->d_inode);
7060 			BUG_ON(ret);
7061 		}
7062 	}
7063 
7064 	fixup_inode_flags(new_dir, old_inode);
7065 
7066 	ret = btrfs_add_link(trans, new_dir, old_inode,
7067 			     new_dentry->d_name.name,
7068 			     new_dentry->d_name.len, 0, index);
7069 	BUG_ON(ret);
7070 
7071 	if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
7072 		struct dentry *parent = new_dentry->d_parent;
7073 		btrfs_log_new_name(trans, old_inode, old_dir, parent);
7074 		btrfs_end_log_trans(root);
7075 	}
7076 out_fail:
7077 	btrfs_end_transaction(trans, root);
7078 out_notrans:
7079 	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
7080 		up_read(&root->fs_info->subvol_sem);
7081 
7082 	return ret;
7083 }
7084 
7085 /*
7086  * some fairly slow code that needs optimization. This walks the list
7087  * of all the inodes with pending delalloc and forces them to disk.
7088  */
7089 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
7090 {
7091 	struct list_head *head = &root->fs_info->delalloc_inodes;
7092 	struct btrfs_inode *binode;
7093 	struct inode *inode;
7094 
7095 	if (root->fs_info->sb->s_flags & MS_RDONLY)
7096 		return -EROFS;
7097 
7098 	spin_lock(&root->fs_info->delalloc_lock);
7099 	while (!list_empty(head)) {
7100 		binode = list_entry(head->next, struct btrfs_inode,
7101 				    delalloc_inodes);
7102 		inode = igrab(&binode->vfs_inode);
7103 		if (!inode)
7104 			list_del_init(&binode->delalloc_inodes);
7105 		spin_unlock(&root->fs_info->delalloc_lock);
7106 		if (inode) {
7107 			filemap_flush(inode->i_mapping);
7108 			if (delay_iput)
7109 				btrfs_add_delayed_iput(inode);
7110 			else
7111 				iput(inode);
7112 		}
7113 		cond_resched();
7114 		spin_lock(&root->fs_info->delalloc_lock);
7115 	}
7116 	spin_unlock(&root->fs_info->delalloc_lock);
7117 
7118 	/* the filemap_flush will queue IO into the worker threads, but
7119 	 * we have to make sure the IO is actually started and that
7120 	 * ordered extents get created before we return
7121 	 */
7122 	atomic_inc(&root->fs_info->async_submit_draining);
7123 	while (atomic_read(&root->fs_info->nr_async_submits) ||
7124 	      atomic_read(&root->fs_info->async_delalloc_pages)) {
7125 		wait_event(root->fs_info->async_submit_wait,
7126 		   (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
7127 		    atomic_read(&root->fs_info->async_delalloc_pages) == 0));
7128 	}
7129 	atomic_dec(&root->fs_info->async_submit_draining);
7130 	return 0;
7131 }
7132 
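/*
 * Create a symlink: a new inode whose target string is stored as an
 * inline file extent item.
 */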
7133 static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
7134 			 const char *symname)
7135 {
7136 	struct btrfs_trans_handle *trans;
7137 	struct btrfs_root *root = BTRFS_I(dir)->root;
7138 	struct btrfs_path *path;
7139 	struct btrfs_key key;
7140 	struct inode *inode = NULL;
7141 	int err;
7142 	int drop_inode = 0;
7143 	u64 objectid;
7144 	u64 index = 0;
7145 	int name_len;
7146 	int datasize;
7147 	unsigned long ptr;
7148 	struct btrfs_file_extent_item *ei;
7149 	struct extent_buffer *leaf;
7150 	unsigned long nr = 0;
7151 
7152 	name_len = strlen(symname) + 1;
7153 	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
7154 		return -ENAMETOOLONG;
7155 
7156 	/*
7157 	 * 2 items for inode item and ref
7158 	 * 2 items for dir items
7159 	 * 1 item for xattr if selinux is on
7160 	 */
7161 	trans = btrfs_start_transaction(root, 5);
7162 	if (IS_ERR(trans))
7163 		return PTR_ERR(trans);
7164 
7165 	err = btrfs_find_free_ino(root, &objectid);
7166 	if (err)
7167 		goto out_unlock;
7168 
7169 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
7170 				dentry->d_name.len, btrfs_ino(dir), objectid,
7171 				S_IFLNK|S_IRWXUGO, &index);
7172 	if (IS_ERR(inode)) {
7173 		err = PTR_ERR(inode);
7174 		goto out_unlock;
7175 	}
7176 
7177 	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
7178 	if (err) {
7179 		drop_inode = 1;
7180 		goto out_unlock;
7181 	}
7182 
7183 	/*
7184 	 * If the active LSM wants to access the inode during
7185 	 * d_instantiate it needs these. Smack checks to see
7186 	 * if the filesystem supports xattrs by looking at the
7187 	 * ops vector.
7188 	 */
7189 	inode->i_fop = &btrfs_file_operations;
7190 	inode->i_op = &btrfs_file_inode_operations;
7191 
7192 	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
7193 	if (err) {
7194 		drop_inode = 1;
7195 	} else {
7196 		inode->i_mapping->a_ops = &btrfs_aops;
7197 		inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
7198 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
7199 	}
7200 	if (drop_inode)
7201 		goto out_unlock;
7202 
7203 	path = btrfs_alloc_path();
7204 	if (!path) {
7205 		err = -ENOMEM;
7206 		drop_inode = 1;
7207 		goto out_unlock;
7208 	}
7209 	key.objectid = btrfs_ino(inode);
7210 	key.offset = 0;
7211 	btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
7212 	datasize = btrfs_file_extent_calc_inline_size(name_len);
7213 	err = btrfs_insert_empty_item(trans, root, path, &key,
7214 				      datasize);
7215 	if (err) {
7216 		drop_inode = 1;
7217 		btrfs_free_path(path);
7218 		goto out_unlock;
7219 	}
7220 	leaf = path->nodes[0];
7221 	ei = btrfs_item_ptr(leaf, path->slots[0],
7222 			    struct btrfs_file_extent_item);
7223 	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
7224 	btrfs_set_file_extent_type(leaf, ei,
7225 				   BTRFS_FILE_EXTENT_INLINE);
7226 	btrfs_set_file_extent_encryption(leaf, ei, 0);
7227 	btrfs_set_file_extent_compression(leaf, ei, 0);
7228 	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
7229 	btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
7230 
7231 	ptr = btrfs_file_extent_inline_start(ei);
7232 	write_extent_buffer(leaf, symname, ptr, name_len);
7233 	btrfs_mark_buffer_dirty(leaf);
7234 	btrfs_free_path(path);
7235 
7236 	inode->i_op = &btrfs_symlink_inode_operations;
7237 	inode->i_mapping->a_ops = &btrfs_symlink_aops;
7238 	inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
7239 	inode_set_bytes(inode, name_len);
7240 	btrfs_i_size_write(inode, name_len - 1);
7241 	err = btrfs_update_inode(trans, root, inode);
7242 	if (err)
7243 		drop_inode = 1;
7244 
7245 out_unlock:
7246 	if (!err)
7247 		d_instantiate(dentry, inode);
7248 	nr = trans->blocks_used;
7249 	btrfs_end_transaction(trans, root);
7250 	if (drop_inode) {
7251 		inode_dec_link_count(inode);
7252 		iput(inode);
7253 	}
7254 	btrfs_btree_balance_dirty(root, nr);
7255 	return err;
7256 }
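
/*
 * Illustration (userspace, not part of the build): because the target
 * string must fit in a single inline extent, the effective limit is
 * BTRFS_MAX_INLINE_DATA_SIZE() -- a bit under one leaf on a 4K-leaf
 * filesystem -- rather than PATH_MAX.  A made-up sketch:
 *
 *	#include <errno.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		char target[8192];
 *
 *		memset(target, 'a', sizeof(target) - 1);
 *		target[sizeof(target) - 1] = '\0';
 *		if (symlink(target, "/mnt/btrfs/link") < 0 &&
 *		    errno == ENAMETOOLONG)
 *			puts("target too long for an inline extent");
 *		return 0;
 *	}
 */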
7257 
7258 static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
7259 				       u64 start, u64 num_bytes, u64 min_size,
7260 				       loff_t actual_len, u64 *alloc_hint,
7261 				       struct btrfs_trans_handle *trans)
7262 {
7263 	struct btrfs_root *root = BTRFS_I(inode)->root;
7264 	struct btrfs_key ins;
7265 	u64 cur_offset = start;
7266 	u64 i_size;
7267 	int ret = 0;
7268 	bool own_trans = true;
7269 
7270 	if (trans)
7271 		own_trans = false;
7272 	while (num_bytes > 0) {
7273 		if (own_trans) {
7274 			trans = btrfs_start_transaction(root, 3);
7275 			if (IS_ERR(trans)) {
7276 				ret = PTR_ERR(trans);
7277 				break;
7278 			}
7279 		}
7280 
7281 		ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
7282 					   0, *alloc_hint, (u64)-1, &ins, 1);
7283 		if (ret) {
7284 			if (own_trans)
7285 				btrfs_end_transaction(trans, root);
7286 			break;
7287 		}
7288 
7289 		ret = insert_reserved_file_extent(trans, inode,
7290 						  cur_offset, ins.objectid,
7291 						  ins.offset, ins.offset,
7292 						  ins.offset, 0, 0, 0,
7293 						  BTRFS_FILE_EXTENT_PREALLOC);
7294 		BUG_ON(ret);
7295 		btrfs_drop_extent_cache(inode, cur_offset,
7296 					cur_offset + ins.offset - 1, 0);
7297 
7298 		num_bytes -= ins.offset;
7299 		cur_offset += ins.offset;
7300 		*alloc_hint = ins.objectid + ins.offset;
7301 
7302 		inode->i_ctime = CURRENT_TIME;
7303 		BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
7304 		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
7305 		    (actual_len > inode->i_size) &&
7306 		    (cur_offset > inode->i_size)) {
7307 			if (cur_offset > actual_len)
7308 				i_size = actual_len;
7309 			else
7310 				i_size = cur_offset;
7311 			i_size_write(inode, i_size);
7312 			btrfs_ordered_update_i_size(inode, i_size, NULL);
7313 		}
7314 
7315 		ret = btrfs_update_inode(trans, root, inode);
7316 		BUG_ON(ret);
7317 
7318 		if (own_trans)
7319 			btrfs_end_transaction(trans, root);
7320 	}
7321 	return ret;
7322 }
7323 
7324 int btrfs_prealloc_file_range(struct inode *inode, int mode,
7325 			      u64 start, u64 num_bytes, u64 min_size,
7326 			      loff_t actual_len, u64 *alloc_hint)
7327 {
7328 	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
7329 					   min_size, actual_len, alloc_hint,
7330 					   NULL);
7331 }
7332 
7333 int btrfs_prealloc_file_range_trans(struct inode *inode,
7334 				    struct btrfs_trans_handle *trans, int mode,
7335 				    u64 start, u64 num_bytes, u64 min_size,
7336 				    loff_t actual_len, u64 *alloc_hint)
7337 {
7338 	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
7339 					   min_size, actual_len, alloc_hint, trans);
7340 }
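
/*
 * Illustration (userspace, not part of the build): the i_size handling
 * in __btrfs_prealloc_file_range() is what gives fallocate() its two
 * behaviours -- without FALLOC_FL_KEEP_SIZE the preallocated range
 * extends the file, with it the extents exist but st_size is untouched:
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *
 *	int prealloc(int fd, off_t len, int keep_size)
 *	{
 *		return fallocate(fd, keep_size ? FALLOC_FL_KEEP_SIZE : 0,
 *				 0, len);
 *	}
 *
 * After prealloc(fd, 1 << 20, 0) a previously empty file reports a
 * 1 MiB st_size; after prealloc(fd, 1 << 20, 1) it still reports 0,
 * even though the PREALLOC extents are on disk in both cases.
 */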
7341 
7342 static int btrfs_set_page_dirty(struct page *page)
7343 {
7344 	return __set_page_dirty_nobuffers(page);
7345 }
7346 
7347 static int btrfs_permission(struct inode *inode, int mask)
7348 {
7349 	struct btrfs_root *root = BTRFS_I(inode)->root;
7350 	umode_t mode = inode->i_mode;
7351 
7352 	if (mask & MAY_WRITE &&
7353 	    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
7354 		if (btrfs_root_readonly(root))
7355 			return -EROFS;
7356 		if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
7357 			return -EACCES;
7358 	}
7359 	return generic_permission(inode, mask);
7360 }
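
/*
 * Illustration (userspace, not part of the build): the
 * btrfs_root_readonly() check above is why opening a file for write
 * inside a read-only snapshot fails with EROFS even on a read-write
 * mount.  With made-up paths:
 *
 *	#include <errno.h>
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		if (open("/mnt/btrfs/ro-snap/file", O_WRONLY) < 0 &&
 *		    errno == EROFS)
 *			puts("subvolume is read-only");
 *		return 0;
 *	}
 *
 * Any file in a subvolume created with "btrfs subvolume snapshot -r"
 * behaves this way.
 */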
7361 
7362 static const struct inode_operations btrfs_dir_inode_operations = {
7363 	.getattr	= btrfs_getattr,
7364 	.lookup		= btrfs_lookup,
7365 	.create		= btrfs_create,
7366 	.unlink		= btrfs_unlink,
7367 	.link		= btrfs_link,
7368 	.mkdir		= btrfs_mkdir,
7369 	.rmdir		= btrfs_rmdir,
7370 	.rename		= btrfs_rename,
7371 	.symlink	= btrfs_symlink,
7372 	.setattr	= btrfs_setattr,
7373 	.mknod		= btrfs_mknod,
7374 	.setxattr	= btrfs_setxattr,
7375 	.getxattr	= btrfs_getxattr,
7376 	.listxattr	= btrfs_listxattr,
7377 	.removexattr	= btrfs_removexattr,
7378 	.permission	= btrfs_permission,
7379 	.get_acl	= btrfs_get_acl,
7380 };
7381 static const struct inode_operations btrfs_dir_ro_inode_operations = {
7382 	.lookup		= btrfs_lookup,
7383 	.permission	= btrfs_permission,
7384 	.get_acl	= btrfs_get_acl,
7385 };
7386 
7387 static const struct file_operations btrfs_dir_file_operations = {
7388 	.llseek		= generic_file_llseek,
7389 	.read		= generic_read_dir,
7390 	.readdir	= btrfs_real_readdir,
7391 	.unlocked_ioctl	= btrfs_ioctl,
7392 #ifdef CONFIG_COMPAT
7393 	.compat_ioctl	= btrfs_ioctl,
7394 #endif
7395 	.release        = btrfs_release_file,
7396 	.fsync		= btrfs_sync_file,
7397 };
7398 
7399 static struct extent_io_ops btrfs_extent_io_ops = {
7400 	.fill_delalloc = run_delalloc_range,
7401 	.submit_bio_hook = btrfs_submit_bio_hook,
7402 	.merge_bio_hook = btrfs_merge_bio_hook,
7403 	.readpage_end_io_hook = btrfs_readpage_end_io_hook,
7404 	.writepage_end_io_hook = btrfs_writepage_end_io_hook,
7405 	.writepage_start_hook = btrfs_writepage_start_hook,
7406 	.set_bit_hook = btrfs_set_bit_hook,
7407 	.clear_bit_hook = btrfs_clear_bit_hook,
7408 	.merge_extent_hook = btrfs_merge_extent_hook,
7409 	.split_extent_hook = btrfs_split_extent_hook,
7410 };
7411 
7412 /*
7413  * btrfs doesn't support the bmap operation because swapfiles
7414  * use bmap to make a mapping of extents in the file.  They assume
7415  * these extents won't change over the life of the file and they
7416  * use the bmap result to do IO directly to the drive.
7417  *
7418  * A btrfs bmap call would return logical addresses that aren't
7419  * suitable for IO, and they would also change frequently as COW
7420  * operations happen.  So, swapfile + btrfs == corruption.
7421  *
7422  * For now we're avoiding this by dropping bmap.
7423  */
7424 static const struct address_space_operations btrfs_aops = {
7425 	.readpage	= btrfs_readpage,
7426 	.writepage	= btrfs_writepage,
7427 	.writepages	= btrfs_writepages,
7428 	.readpages	= btrfs_readpages,
7429 	.direct_IO	= btrfs_direct_IO,
7430 	.invalidatepage = btrfs_invalidatepage,
7431 	.releasepage	= btrfs_releasepage,
7432 	.set_page_dirty	= btrfs_set_page_dirty,
7433 	.error_remove_page = generic_error_remove_page,
7434 };
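
/*
 * Illustration (userspace, not part of the build): with no .bmap in the
 * table above, the FIBMAP ioctl fails up front with EINVAL, which is
 * also why swapon(8) of this era rejects files on btrfs.  Made-up path:
 *
 *	#include <errno.h>
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/fs.h>
 *
 *	int main(void)
 *	{
 *		int block = 0;
 *		int fd = open("/mnt/btrfs/file", O_RDONLY);
 *
 *		if (fd >= 0 && ioctl(fd, FIBMAP, &block) < 0 &&
 *		    errno == EINVAL)
 *			puts("btrfs has no bmap");
 *		return 0;
 *	}
 */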
7435 
7436 static const struct address_space_operations btrfs_symlink_aops = {
7437 	.readpage	= btrfs_readpage,
7438 	.writepage	= btrfs_writepage,
7439 	.invalidatepage = btrfs_invalidatepage,
7440 	.releasepage	= btrfs_releasepage,
7441 };
7442 
7443 static const struct inode_operations btrfs_file_inode_operations = {
7444 	.getattr	= btrfs_getattr,
7445 	.setattr	= btrfs_setattr,
7446 	.setxattr	= btrfs_setxattr,
7447 	.getxattr	= btrfs_getxattr,
7448 	.listxattr      = btrfs_listxattr,
7449 	.removexattr	= btrfs_removexattr,
7450 	.permission	= btrfs_permission,
7451 	.fiemap		= btrfs_fiemap,
7452 	.get_acl	= btrfs_get_acl,
7453 };
7454 static const struct inode_operations btrfs_special_inode_operations = {
7455 	.getattr	= btrfs_getattr,
7456 	.setattr	= btrfs_setattr,
7457 	.permission	= btrfs_permission,
7458 	.setxattr	= btrfs_setxattr,
7459 	.getxattr	= btrfs_getxattr,
7460 	.listxattr	= btrfs_listxattr,
7461 	.removexattr	= btrfs_removexattr,
7462 	.get_acl	= btrfs_get_acl,
7463 };
7464 static const struct inode_operations btrfs_symlink_inode_operations = {
7465 	.readlink	= generic_readlink,
7466 	.follow_link	= page_follow_link_light,
7467 	.put_link	= page_put_link,
7468 	.getattr	= btrfs_getattr,
7469 	.setattr	= btrfs_setattr,
7470 	.permission	= btrfs_permission,
7471 	.setxattr	= btrfs_setxattr,
7472 	.getxattr	= btrfs_getxattr,
7473 	.listxattr	= btrfs_listxattr,
7474 	.removexattr	= btrfs_removexattr,
7475 	.get_acl	= btrfs_get_acl,
7476 };
7477 
7478 const struct dentry_operations btrfs_dentry_operations = {
7479 	.d_delete	= btrfs_dentry_delete,
7480 	.d_release	= btrfs_dentry_release,
7481 };
7482