/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
#include <linux/falloc.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/statfs.h>
#include <linux/compat.h>
#include <linux/slab.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "ioctl.h"
#include "print-tree.h"
#include "tree-log.h"
#include "locking.h"
#include "compat.h"


/* simple helper to fault in pages and copy.  This should go away
 * and be replaced with calls into generic code.
 */
static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
					 int write_bytes,
					 struct page **prepared_pages,
					 struct iov_iter *i)
{
	size_t copied = 0;
	int pg = 0;
	int offset = pos & (PAGE_CACHE_SIZE - 1);
	int total_copied = 0;

	while (write_bytes > 0) {
		size_t count = min_t(size_t,
				     PAGE_CACHE_SIZE - offset, write_bytes);
		struct page *page = prepared_pages[pg];
		/*
		 * Copy data from userspace to the current page
		 *
		 * Disable pagefault to avoid recursive lock since
		 * the pages are already locked
		 */
		pagefault_disable();
		copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
		pagefault_enable();

		/* Flush processor's dcache for this page */
		flush_dcache_page(page);

		/*
		 * if we get a partial write, we can end up with
		 * partially up to date pages.  These add
		 * a lot of complexity, so make sure they don't
		 * happen by forcing this copy to be retried.
		 *
		 * The rest of the btrfs_file_write code will fall
		 * back to page at a time copies after we return 0.
		 */
		if (!PageUptodate(page) && copied < count)
			copied = 0;

		iov_iter_advance(i, copied);
		write_bytes -= copied;
		total_copied += copied;

		/* Return to btrfs_file_aio_write to fault page */
		if (unlikely(copied == 0)) {
			break;
		}

		if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
			offset += copied;
		} else {
			pg++;
			offset = 0;
		}
	}
	return total_copied;
}

/*
 * unlocks pages after btrfs_file_write is done with them
 */
static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages)
{
	size_t i;
	for (i = 0; i < num_pages; i++) {
		if (!pages[i])
			break;
		/* PageChecked is some magic around finding pages that
		 * have been modified without going through
		 * btrfs_set_page_dirty.  Clear it here.
		 */
		ClearPageChecked(pages[i]);
		unlock_page(pages[i]);
		mark_page_accessed(pages[i]);
		page_cache_release(pages[i]);
	}
}

/*
 * after copy_from_user, pages need to be dirtied and we need to make
 * sure holes are created between the current EOF and the start of
 * any next extents (if required).
 *
 * this also makes the decision about creating an inline extent vs
 * doing real data extents, marking pages dirty and delalloc as required.
 */
static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct file *file,
				   struct page **pages,
				   size_t num_pages,
				   loff_t pos,
				   size_t write_bytes)
{
	int err = 0;
	int i;
	struct inode *inode = fdentry(file)->d_inode;
	u64 num_bytes;
	u64 start_pos;
	u64 end_of_last_block;
	u64 end_pos = pos + write_bytes;
	loff_t isize = i_size_read(inode);

	start_pos = pos & ~((u64)root->sectorsize - 1);
	num_bytes = (write_bytes + pos - start_pos +
		    root->sectorsize - 1) & ~((u64)root->sectorsize - 1);

	end_of_last_block = start_pos + num_bytes - 1;
	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
					NULL);
	BUG_ON(err);

	for (i = 0; i < num_pages; i++) {
		struct page *p = pages[i];
		SetPageUptodate(p);
		ClearPageChecked(p);
		set_page_dirty(p);
	}
	if (end_pos > isize) {
		i_size_write(inode, end_pos);
		/* we've only changed i_size in ram, and we haven't updated
		 * the disk i_size.  There is no need to log the inode
		 * at this time.
		 */
	}
	return 0;
}

/*
 * this drops all the extents in the cache that intersect the range
 * [start, end].  Existing extents are split as required.
 */
int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
			    int skip_pinned)
{
	struct extent_map *em;
	struct extent_map *split = NULL;
	struct extent_map *split2 = NULL;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	u64 len = end - start + 1;
	int ret;
	int testend = 1;
	unsigned long flags;
	int compressed = 0;

	WARN_ON(end < start);
	if (end == (u64)-1) {
		len = (u64)-1;
		testend = 0;
	}
	while (1) {
		if (!split)
			split = alloc_extent_map(GFP_NOFS);
		if (!split2)
			split2 = alloc_extent_map(GFP_NOFS);
		BUG_ON(!split || !split2);

		write_lock(&em_tree->lock);
		em = lookup_extent_mapping(em_tree, start, len);
		if (!em) {
			write_unlock(&em_tree->lock);
			break;
		}
		flags = em->flags;
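		/*
		 * when skip_pinned is set, leave pinned extents (those
		 * still under ordered IO) in the tree; just advance
		 * past them and shrink the search range accordingly
		 */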
		if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
			if (testend && em->start + em->len >= start + len) {
				free_extent_map(em);
				write_unlock(&em_tree->lock);
				break;
			}
			start = em->start + em->len;
			if (testend)
				len = start + len - (em->start + em->len);
			free_extent_map(em);
			write_unlock(&em_tree->lock);
			continue;
		}
		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
		remove_extent_mapping(em_tree, em);

		if (em->block_start < EXTENT_MAP_LAST_BYTE &&
		    em->start < start) {
			split->start = em->start;
			split->len = start - em->start;
			split->orig_start = em->orig_start;
			split->block_start = em->block_start;

			if (compressed)
				split->block_len = em->block_len;
			else
				split->block_len = split->len;

			split->bdev = em->bdev;
			split->flags = flags;
			split->compress_type = em->compress_type;
			ret = add_extent_mapping(em_tree, split);
			BUG_ON(ret);
			free_extent_map(split);
			split = split2;
			split2 = NULL;
		}
		if (em->block_start < EXTENT_MAP_LAST_BYTE &&
		    testend && em->start + em->len > start + len) {
			u64 diff = start + len - em->start;

			split->start = start + len;
			split->len = em->start + em->len - (start + len);
			split->bdev = em->bdev;
			split->flags = flags;
			split->compress_type = em->compress_type;

			if (compressed) {
				split->block_len = em->block_len;
				split->block_start = em->block_start;
				split->orig_start = em->orig_start;
			} else {
				split->block_len = split->len;
				split->block_start = em->block_start + diff;
				split->orig_start = split->start;
			}

			ret = add_extent_mapping(em_tree, split);
			BUG_ON(ret);
			free_extent_map(split);
			split = NULL;
		}
		write_unlock(&em_tree->lock);

		/* once for us */
		free_extent_map(em);
		/* once for the tree */
		free_extent_map(em);
	}
	if (split)
		free_extent_map(split);
	if (split2)
		free_extent_map(split2);
	return 0;
}

/*
 * this is very complex, but the basic idea is to drop all extents
 * in the range start - end.  hint_byte is filled in with a block number
 * that would be a good hint to the block allocator for this file.
 *
 * If an extent intersects the range but is not entirely inside the range
 * it is either truncated or split.  Anything entirely inside the range
 * is deleted from the tree.
 */
int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
		       u64 start, u64 end, u64 *hint_byte, int drop_cache)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *fi;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_key new_key;
	u64 search_start = start;
	u64 disk_bytenr = 0;
	u64 num_bytes = 0;
	u64 extent_offset = 0;
	u64 extent_end = 0;
	int del_nr = 0;
	int del_slot = 0;
	int extent_type;
	int recow;
	int ret;

	if (drop_cache)
		btrfs_drop_extent_cache(inode, start, end - 1, 0);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	while (1) {
		recow = 0;
		ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
					       search_start, -1);
		if (ret < 0)
			break;
		if (ret > 0 && path->slots[0] > 0 && search_start == start) {
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
			if (key.objectid == inode->i_ino &&
			    key.type == BTRFS_EXTENT_DATA_KEY)
				path->slots[0]--;
		}
		ret = 0;
next_slot:
		leaf = path->nodes[0];
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			BUG_ON(del_nr > 0);
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				break;
			if (ret > 0) {
				ret = 0;
				break;
			}
			leaf = path->nodes[0];
			recow = 1;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid > inode->i_ino ||
		    key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
			break;

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_type = btrfs_file_extent_type(leaf, fi);

		if (extent_type == BTRFS_FILE_EXTENT_REG ||
		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
			num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
			extent_offset = btrfs_file_extent_offset(leaf, fi);
			extent_end = key.offset +
				btrfs_file_extent_num_bytes(leaf, fi);
		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
			extent_end = key.offset +
				btrfs_file_extent_inline_len(leaf, fi);
		} else {
			WARN_ON(1);
			extent_end = search_start;
		}

		if (extent_end <= search_start) {
			path->slots[0]++;
			goto next_slot;
		}

		search_start = max(key.offset, start);
		if (recow) {
			btrfs_release_path(root, path);
			continue;
		}

		/*
		 *     | - range to drop - |
		 *  | -------- extent -------- |
		 */
		if (start > key.offset && end < extent_end) {
			BUG_ON(del_nr > 0);
			BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);

			memcpy(&new_key, &key, sizeof(new_key));
			new_key.offset = start;
			ret = btrfs_duplicate_item(trans, root, path,
						   &new_key);
			if (ret == -EAGAIN) {
				btrfs_release_path(root, path);
				continue;
			}
			if (ret < 0)
				break;

			leaf = path->nodes[0];
			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							start - key.offset);

			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);

			extent_offset += start - key.offset;
			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - start);
			btrfs_mark_buffer_dirty(leaf);

			if (disk_bytenr > 0) {
				ret = btrfs_inc_extent_ref(trans, root,
						disk_bytenr, num_bytes, 0,
						root->root_key.objectid,
						new_key.objectid,
						start - extent_offset);
				BUG_ON(ret);
				*hint_byte = disk_bytenr;
			}
			key.offset = start;
		}
		/*
		 *  | ---- range to drop ----- |
		 *      | -------- extent -------- |
		 */
		if (start <= key.offset && end < extent_end) {
			BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);

			memcpy(&new_key, &key, sizeof(new_key));
			new_key.offset = end;
			btrfs_set_item_key_safe(trans, root, path, &new_key);

			extent_offset += end - key.offset;
			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - end);
			btrfs_mark_buffer_dirty(leaf);
			if (disk_bytenr > 0) {
				inode_sub_bytes(inode, end - key.offset);
				*hint_byte = disk_bytenr;
			}
			break;
		}

		search_start = extent_end;
		/*
		 *       | ---- range to drop ----- |
		 *  | -------- extent -------- |
		 */
		if (start > key.offset && end >= extent_end) {
			BUG_ON(del_nr > 0);
			BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);

			btrfs_set_file_extent_num_bytes(leaf, fi,
							start - key.offset);
			btrfs_mark_buffer_dirty(leaf);
			if (disk_bytenr > 0) {
				inode_sub_bytes(inode, extent_end - start);
				*hint_byte = disk_bytenr;
			}
			if (end == extent_end)
				break;

			path->slots[0]++;
			goto next_slot;
		}

		/*
		 *  | ---- range to drop ----- |
		 *    | ------ extent ------ |
		 */
		if (start <= key.offset && end >= extent_end) {
			if (del_nr == 0) {
				del_slot = path->slots[0];
				del_nr = 1;
			} else {
				BUG_ON(del_slot + del_nr != path->slots[0]);
				del_nr++;
			}

			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
				inode_sub_bytes(inode,
						extent_end - key.offset);
				extent_end = ALIGN(extent_end,
						   root->sectorsize);
			} else if (disk_bytenr > 0) {
				ret = btrfs_free_extent(trans, root,
						disk_bytenr, num_bytes, 0,
						root->root_key.objectid,
						key.objectid, key.offset -
						extent_offset);
				BUG_ON(ret);
				inode_sub_bytes(inode,
						extent_end - key.offset);
				*hint_byte = disk_bytenr;
			}

			if (end == extent_end)
				break;

			if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
				path->slots[0]++;
				goto next_slot;
			}

			ret = btrfs_del_items(trans, root, path, del_slot,
					      del_nr);
			BUG_ON(ret);

			del_nr = 0;
			del_slot = 0;

			btrfs_release_path(root, path);
			continue;
		}

		BUG_ON(1);
	}

	if (del_nr > 0) {
		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
		BUG_ON(ret);
	}

	btrfs_free_path(path);
	return ret;
}

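/*
 * helper used when splitting preallocated extents: returns 1 if the
 * file extent item in 'slot' is a regular, uncompressed extent that
 * points at the same disk extent (bytenr/orig_offset) and so can be
 * merged with its neighbour.  *start and *end are filled in with the
 * item's file offset range.
 */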
static int extent_mergeable(struct extent_buffer *leaf, int slot,
			    u64 objectid, u64 bytenr, u64 orig_offset,
			    u64 *start, u64 *end)
{
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
	u64 extent_end;

	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
		return 0;

	btrfs_item_key_to_cpu(leaf, &key, slot);
	if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
		return 0;

	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
	    btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
	    btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
	    btrfs_file_extent_compression(leaf, fi) ||
	    btrfs_file_extent_encryption(leaf, fi) ||
	    btrfs_file_extent_other_encoding(leaf, fi))
		return 0;

	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
	if ((*start && *start != key.offset) || (*end && *end != extent_end))
		return 0;

	*start = key.offset;
	*end = extent_end;
	return 1;
}

/*
 * Mark extent in the range start - end as written.
 *
 * This changes extent type from 'pre-allocated' to 'regular'.  If only
 * part of the extent is marked as written, the extent will be split
 * into two or three.
 */
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
			      struct inode *inode, u64 start, u64 end)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_buffer *leaf;
	struct btrfs_path *path;
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
	struct btrfs_key new_key;
	u64 bytenr;
	u64 num_bytes;
	u64 extent_end;
	u64 orig_offset;
	u64 other_start;
	u64 other_end;
	u64 split;
	int del_nr = 0;
	int del_slot = 0;
	int recow;
	int ret;

	btrfs_drop_extent_cache(inode, start, end - 1, 0);

	path = btrfs_alloc_path();
	BUG_ON(!path);
again:
	recow = 0;
	split = start;
	key.objectid = inode->i_ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = split;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0 && path->slots[0] > 0)
		path->slots[0]--;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	BUG_ON(key.objectid != inode->i_ino ||
	       key.type != BTRFS_EXTENT_DATA_KEY);
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	BUG_ON(btrfs_file_extent_type(leaf, fi) !=
	       BTRFS_FILE_EXTENT_PREALLOC);
	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
	BUG_ON(key.offset > start || extent_end < end);

	bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
	num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
	orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
	memcpy(&new_key, &key, sizeof(new_key));

	if (start == key.offset && end < extent_end) {
		other_start = 0;
		other_end = start;
		if (extent_mergeable(leaf, path->slots[0] - 1,
				     inode->i_ino, bytenr, orig_offset,
				     &other_start, &other_end)) {
			new_key.offset = end;
			btrfs_set_item_key_safe(trans, root, path, &new_key);
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - end);
			btrfs_set_file_extent_offset(leaf, fi,
						     end - orig_offset);
			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							end - other_start);
			btrfs_mark_buffer_dirty(leaf);
			goto out;
		}
	}

	if (start > key.offset && end == extent_end) {
		other_start = end;
		other_end = 0;
		if (extent_mergeable(leaf, path->slots[0] + 1,
				     inode->i_ino, bytenr, orig_offset,
				     &other_start, &other_end)) {
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							start - key.offset);
			path->slots[0]++;
			new_key.offset = start;
			btrfs_set_item_key_safe(trans, root, path, &new_key);

			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							other_end - start);
			btrfs_set_file_extent_offset(leaf, fi,
						     start - orig_offset);
			btrfs_mark_buffer_dirty(leaf);
			goto out;
		}
	}

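	/*
	 * no luck merging; split the item so that [start, end) gets its
	 * own file extent item.  Each new item takes another reference
	 * on the preallocated disk extent.
	 */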
	while (start > key.offset || end < extent_end) {
		if (key.offset == start)
			split = end;

		new_key.offset = split;
		ret = btrfs_duplicate_item(trans, root, path, &new_key);
		if (ret == -EAGAIN) {
			btrfs_release_path(root, path);
			goto again;
		}
		BUG_ON(ret < 0);

		leaf = path->nodes[0];
		fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
				    struct btrfs_file_extent_item);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						split - key.offset);

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);

		btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						extent_end - split);
		btrfs_mark_buffer_dirty(leaf);

		ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
					   root->root_key.objectid,
					   inode->i_ino, orig_offset);
		BUG_ON(ret);

		if (split == start) {
			key.offset = start;
		} else {
			BUG_ON(start != key.offset);
			path->slots[0]--;
			extent_end = end;
		}
		recow = 1;
	}

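	/*
	 * now that [start, end) has its own item, try to merge it with
	 * the neighbouring extents; a merge removes an item and drops
	 * one of the references taken on the disk extent above.
	 */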
	other_start = end;
	other_end = 0;
	if (extent_mergeable(leaf, path->slots[0] + 1,
			     inode->i_ino, bytenr, orig_offset,
			     &other_start, &other_end)) {
		if (recow) {
			btrfs_release_path(root, path);
			goto again;
		}
		extent_end = other_end;
		del_slot = path->slots[0] + 1;
		del_nr++;
		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
					0, root->root_key.objectid,
					inode->i_ino, orig_offset);
		BUG_ON(ret);
	}
	other_start = 0;
	other_end = start;
	if (extent_mergeable(leaf, path->slots[0] - 1,
			     inode->i_ino, bytenr, orig_offset,
			     &other_start, &other_end)) {
		if (recow) {
			btrfs_release_path(root, path);
			goto again;
		}
		key.offset = other_start;
		del_slot = path->slots[0];
		del_nr++;
		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
					0, root->root_key.objectid,
					inode->i_ino, orig_offset);
		BUG_ON(ret);
	}
	if (del_nr == 0) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
			   struct btrfs_file_extent_item);
		btrfs_set_file_extent_type(leaf, fi,
					   BTRFS_FILE_EXTENT_REG);
		btrfs_mark_buffer_dirty(leaf);
	} else {
		fi = btrfs_item_ptr(leaf, del_slot - 1,
			   struct btrfs_file_extent_item);
		btrfs_set_file_extent_type(leaf, fi,
					   BTRFS_FILE_EXTENT_REG);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						extent_end - key.offset);
		btrfs_mark_buffer_dirty(leaf);

		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
		BUG_ON(ret);
	}
out:
	btrfs_free_path(path);
	return 0;
}

/*
 * on error we return an unlocked page and the error value;
 * on success we return a locked page and 0
 */
static int prepare_uptodate_page(struct page *page, u64 pos)
{
	int ret = 0;

	if ((pos & (PAGE_CACHE_SIZE - 1)) && !PageUptodate(page)) {
		ret = btrfs_readpage(NULL, page);
		if (ret)
			return ret;
		lock_page(page);
		if (!PageUptodate(page)) {
			unlock_page(page);
			return -EIO;
		}
	}
	return 0;
}

800  * this gets pages into the page cache and locks them down, it also properly
801  * waits for data=ordered extents to finish before allowing the pages to be
802  * modified.
803  */
static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
			 struct page **pages, size_t num_pages,
			 loff_t pos, unsigned long first_index,
			 unsigned long last_index, size_t write_bytes)
{
	struct extent_state *cached_state = NULL;
	int i;
	unsigned long index = pos >> PAGE_CACHE_SHIFT;
	struct inode *inode = fdentry(file)->d_inode;
	int err = 0;
	int faili = 0;
	u64 start_pos;
	u64 last_pos;

	start_pos = pos & ~((u64)root->sectorsize - 1);
	last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;

	if (start_pos > inode->i_size) {
		err = btrfs_cont_expand(inode, start_pos);
		if (err)
			return err;
	}

	memset(pages, 0, num_pages * sizeof(struct page *));
again:
	for (i = 0; i < num_pages; i++) {
		pages[i] = grab_cache_page(inode->i_mapping, index + i);
		if (!pages[i]) {
			faili = i - 1;
			err = -ENOMEM;
			goto fail;
		}

		if (i == 0)
			err = prepare_uptodate_page(pages[i], pos);
		if (i == num_pages - 1)
			err = prepare_uptodate_page(pages[i],
						    pos + write_bytes);
		if (err) {
			page_cache_release(pages[i]);
			faili = i - 1;
			goto fail;
		}
		wait_on_page_writeback(pages[i]);
	}
	err = 0;
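	/*
	 * if we're writing inside i_size, lock the range and make sure
	 * no ordered extent overlaps it; if one does, drop everything,
	 * wait for the ordered IO and start over.
	 */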
	if (start_pos < inode->i_size) {
		struct btrfs_ordered_extent *ordered;
		lock_extent_bits(&BTRFS_I(inode)->io_tree,
				 start_pos, last_pos - 1, 0, &cached_state,
				 GFP_NOFS);
		ordered = btrfs_lookup_first_ordered_extent(inode,
							    last_pos - 1);
		if (ordered &&
		    ordered->file_offset + ordered->len > start_pos &&
		    ordered->file_offset < last_pos) {
			btrfs_put_ordered_extent(ordered);
			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
					     start_pos, last_pos - 1,
					     &cached_state, GFP_NOFS);
			for (i = 0; i < num_pages; i++) {
				unlock_page(pages[i]);
				page_cache_release(pages[i]);
			}
			btrfs_wait_ordered_range(inode, start_pos,
						 last_pos - start_pos);
			goto again;
		}
		if (ordered)
			btrfs_put_ordered_extent(ordered);

		clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
				  last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
				  EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
				  GFP_NOFS);
		unlock_extent_cached(&BTRFS_I(inode)->io_tree,
				     start_pos, last_pos - 1, &cached_state,
				     GFP_NOFS);
	}
	for (i = 0; i < num_pages; i++) {
		clear_page_dirty_for_io(pages[i]);
		set_page_extent_mapped(pages[i]);
		WARN_ON(!PageLocked(pages[i]));
	}
	return 0;
fail:
	while (faili >= 0) {
		unlock_page(pages[faili]);
		page_cache_release(pages[faili]);
		faili--;
	}
	return err;
}

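/*
 * the main write path.  O_DIRECT writes go through the generic direct
 * IO code first; whatever it can't finish (and all buffered writes)
 * is copied in here a batch of pages at a time: reserve delalloc
 * space, prepare and lock the pages, copy the user data in, then
 * dirty and release them.  Synchronous writers wait on ordered IO
 * and possibly commit the tree log or the transaction at the end.
 */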
static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
				    const struct iovec *iov,
				    unsigned long nr_segs, loff_t pos)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = fdentry(file)->d_inode;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct page **pages = NULL;
	struct iov_iter i;
	loff_t *ppos = &iocb->ki_pos;
	loff_t start_pos;
	ssize_t num_written = 0;
	ssize_t err = 0;
	size_t count;
	size_t ocount;
	int ret = 0;
	int nrptrs;
	unsigned long first_index;
	unsigned long last_index;
	int will_write;
	int buffered = 0;
	int copied = 0;
	int dirty_pages = 0;

	will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
		      (file->f_flags & O_DIRECT));

	start_pos = pos;

	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);

	mutex_lock(&inode->i_mutex);

	err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
	if (err)
		goto out;
	count = ocount;

	current->backing_dev_info = inode->i_mapping->backing_dev_info;
	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
	if (err)
		goto out;

	if (count == 0)
		goto out;

	err = file_remove_suid(file);
	if (err)
		goto out;

	/*
	 * If BTRFS flips readonly due to some impossible error
	 * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
	 * although we have opened a file as writable, we have
	 * to stop this write operation to ensure FS consistency.
	 */
	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
		err = -EROFS;
		goto out;
	}

	file_update_time(file);
	BTRFS_I(inode)->sequence++;

	if (unlikely(file->f_flags & O_DIRECT)) {
		num_written = generic_file_direct_write(iocb, iov, &nr_segs,
							pos, ppos, count,
							ocount);
		/*
		 * the generic O_DIRECT will update in-memory i_size after the
		 * DIOs are done.  But our endio handlers that update the on
		 * disk i_size never update past the in memory i_size.  So we
		 * need one more update here to catch any additions to the
		 * file
		 */
		if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
			btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
			mark_inode_dirty(inode);
		}

		if (num_written < 0) {
			ret = num_written;
			num_written = 0;
			goto out;
		} else if (num_written == count) {
			/* pick up pos changes done by the generic code */
			pos = *ppos;
			goto out;
		}
		/*
		 * We are going to do buffered for the rest of the range, so we
		 * need to make sure to invalidate the buffered pages when we're
		 * done.
		 */
		buffered = 1;
		pos += num_written;
	}

	iov_iter_init(&i, iov, nr_segs, count, num_written);
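	/* cap the scratch array at one page worth of page pointers */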
	nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) /
		     PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
		     (sizeof(struct page *)));
	pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
	if (!pages) {
		ret = -ENOMEM;
		goto out;
	}

	/* generic_write_checks can change our pos */
	start_pos = pos;

	first_index = pos >> PAGE_CACHE_SHIFT;
	last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT;

	while (iov_iter_count(&i) > 0) {
		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
		size_t write_bytes = min(iov_iter_count(&i),
					 nrptrs * (size_t)PAGE_CACHE_SIZE -
					 offset);
		size_t num_pages = (write_bytes + offset +
				    PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

		WARN_ON(num_pages > nrptrs);
		memset(pages, 0, sizeof(struct page *) * nrptrs);

		/*
		 * Fault pages before locking them in prepare_pages
		 * to avoid recursive lock
		 */
		if (unlikely(iov_iter_fault_in_readable(&i, write_bytes))) {
			ret = -EFAULT;
			goto out;
		}

		ret = btrfs_delalloc_reserve_space(inode,
					num_pages << PAGE_CACHE_SHIFT);
		if (ret)
			goto out;

		ret = prepare_pages(root, file, pages, num_pages,
				    pos, first_index, last_index,
				    write_bytes);
		if (ret) {
			btrfs_delalloc_release_space(inode,
					num_pages << PAGE_CACHE_SHIFT);
			goto out;
		}

		copied = btrfs_copy_from_user(pos, num_pages,
					   write_bytes, pages, &i);

		/*
		 * if we have trouble faulting in the pages, fall
		 * back to one page at a time
		 */
		if (copied < write_bytes)
			nrptrs = 1;

		if (copied == 0)
			dirty_pages = 0;
		else
			dirty_pages = (copied + offset +
				       PAGE_CACHE_SIZE - 1) >>
				       PAGE_CACHE_SHIFT;

		if (num_pages > dirty_pages) {
			if (copied > 0)
				atomic_inc(
					&BTRFS_I(inode)->outstanding_extents);
			btrfs_delalloc_release_space(inode,
					(num_pages - dirty_pages) <<
					PAGE_CACHE_SHIFT);
		}

		if (copied > 0) {
			dirty_and_release_pages(NULL, root, file, pages,
						dirty_pages, pos, copied);
		}

		btrfs_drop_pages(pages, num_pages);

		if (copied > 0) {
			if (will_write) {
				filemap_fdatawrite_range(inode->i_mapping, pos,
							 pos + copied - 1);
			} else {
				balance_dirty_pages_ratelimited_nr(
							inode->i_mapping,
							dirty_pages);
				if (dirty_pages <
				(root->leafsize >> PAGE_CACHE_SHIFT) + 1)
					btrfs_btree_balance_dirty(root, 1);
				btrfs_throttle(root);
			}
		}

		pos += copied;
		num_written += copied;

		cond_resched();
	}
out:
	mutex_unlock(&inode->i_mutex);
	if (ret)
		err = ret;

	kfree(pages);
	*ppos = pos;

	/*
	 * we want to make sure fsync finds this change
	 * but we haven't joined a transaction running right now.
	 *
	 * Later on, someone is sure to update the inode and get the
	 * real transid recorded.
	 *
	 * We set last_trans now to the fs_info generation + 1,
	 * this will either be one more than the running transaction
	 * or the generation used for the next transaction if there isn't
	 * one running right now.
	 */
	BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;

	if (num_written > 0 && will_write) {
		struct btrfs_trans_handle *trans;

		err = btrfs_wait_ordered_range(inode, start_pos, num_written);
		if (err)
			num_written = err;

		if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
			trans = btrfs_start_transaction(root, 0);
			if (IS_ERR(trans)) {
				num_written = PTR_ERR(trans);
				goto done;
			}
			mutex_lock(&inode->i_mutex);
			ret = btrfs_log_dentry_safe(trans, root,
						    file->f_dentry);
			mutex_unlock(&inode->i_mutex);
			if (ret == 0) {
				ret = btrfs_sync_log(trans, root);
				if (ret == 0)
					btrfs_end_transaction(trans, root);
				else
					btrfs_commit_transaction(trans, root);
			} else if (ret != BTRFS_NO_LOG_SYNC) {
				btrfs_commit_transaction(trans, root);
			} else {
				btrfs_end_transaction(trans, root);
			}
		}
		if (file->f_flags & O_DIRECT && buffered) {
			invalidate_mapping_pages(inode->i_mapping,
			      start_pos >> PAGE_CACHE_SHIFT,
			     (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
		}
	}
done:
	current->backing_dev_info = NULL;
	return num_written ? num_written : err;
}

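/*
 * called when the last reference to this open file is dropped
 */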
int btrfs_release_file(struct inode *inode, struct file *filp)
{
	/*
	 * ordered_data_close is set by setattr when we are about to truncate
	 * a file from a non-zero size to a zero size.  This tries to
	 * flush down new bytes that may have been written if the
	 * application were using truncate to replace a file in place.
	 */
	if (BTRFS_I(inode)->ordered_data_close) {
		BTRFS_I(inode)->ordered_data_close = 0;
		btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
		if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
			filemap_flush(inode->i_mapping);
	}
	if (filp->private_data)
		btrfs_ioctl_trans_end(filp);
	return 0;
}

/*
 * fsync call for both files and directories.  This logs the inode into
 * the tree log instead of forcing full commits whenever possible.
 *
 * It needs to call filemap_fdatawait so that all ordered extent updates
 * in the metadata btree are up to date for copying to the log.
 *
 * It drops the inode mutex before doing the tree log commit.  This is an
 * important optimization for directories because holding the mutex prevents
 * new operations on the dir while we write to disk.
 */
int btrfs_sync_file(struct file *file, int datasync)
{
	struct dentry *dentry = file->f_path.dentry;
	struct inode *inode = dentry->d_inode;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret = 0;
	struct btrfs_trans_handle *trans;

	/* we wait first, since the writeback may change the inode */
	root->log_batch++;
	/* the VFS called filemap_fdatawrite for us */
	btrfs_wait_ordered_range(inode, 0, (u64)-1);
	root->log_batch++;

	/*
	 * check the transaction that last modified this inode
	 * and see if it's already been committed
	 */
	if (!BTRFS_I(inode)->last_trans)
		goto out;

	/*
	 * if the last transaction that changed this file was before
	 * the current transaction, we can bail out now without any
	 * syncing
	 */
	mutex_lock(&root->fs_info->trans_mutex);
	if (BTRFS_I(inode)->last_trans <=
	    root->fs_info->last_trans_committed) {
		BTRFS_I(inode)->last_trans = 0;
		mutex_unlock(&root->fs_info->trans_mutex);
		goto out;
	}
	mutex_unlock(&root->fs_info->trans_mutex);

	/*
	 * ok we haven't committed the transaction yet, let's do a commit
	 */
	if (file->private_data)
		btrfs_ioctl_trans_end(file);

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out;
	}

	ret = btrfs_log_dentry_safe(trans, root, dentry);
	if (ret < 0)
		goto out;

	/* we've logged all the items and now have a consistent
	 * version of the file in the log.  It is possible that
	 * someone will come in and modify the file, but that's
	 * fine because the log is consistent on disk, and we
	 * have references to all of the file's extents
	 *
	 * It is possible that someone will come in and log the
	 * file again, but that will end up using the synchronization
	 * inside btrfs_sync_log to keep things safe.
	 */
	mutex_unlock(&dentry->d_inode->i_mutex);

	if (ret != BTRFS_NO_LOG_SYNC) {
		if (ret > 0) {
			ret = btrfs_commit_transaction(trans, root);
		} else {
			ret = btrfs_sync_log(trans, root);
			if (ret == 0)
				ret = btrfs_end_transaction(trans, root);
			else
				ret = btrfs_commit_transaction(trans, root);
		}
	} else {
		ret = btrfs_end_transaction(trans, root);
	}
	mutex_lock(&dentry->d_inode->i_mutex);
out:
	return ret > 0 ? -EIO : ret;
}

static const struct vm_operations_struct btrfs_file_vm_ops = {
	.fault		= filemap_fault,
	.page_mkwrite	= btrfs_page_mkwrite,
};

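/*
 * hook in our own vm_ops so that writable mappings go through
 * btrfs_page_mkwrite before a shared page is dirtied
 */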
static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
{
	struct address_space *mapping = filp->f_mapping;

	if (!mapping->a_ops->readpage)
		return -ENOEXEC;

	file_accessed(filp);
	vma->vm_ops = &btrfs_file_vm_ops;
	vma->vm_flags |= VM_CAN_NONLINEAR;

	return 0;
}

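/*
 * preallocate space for the range [offset, offset + len).  After
 * waiting for any overlapping ordered IO, the range is locked and
 * walked one extent map at a time; holes (and anything past i_size
 * that isn't already preallocated) are filled with preallocated
 * extents.
 */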
static long btrfs_fallocate(struct file *file, int mode,
			    loff_t offset, loff_t len)
{
	struct inode *inode = file->f_path.dentry->d_inode;
	struct extent_state *cached_state = NULL;
	u64 cur_offset;
	u64 last_byte;
	u64 alloc_start;
	u64 alloc_end;
	u64 alloc_hint = 0;
	u64 locked_end;
	u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
	struct extent_map *em;
	int ret;

	alloc_start = offset & ~mask;
	alloc_end = (offset + len + mask) & ~mask;

	/* We only support the FALLOC_FL_KEEP_SIZE mode */
	if (mode & ~FALLOC_FL_KEEP_SIZE)
		return -EOPNOTSUPP;

	/*
	 * wait for ordered IO before we have any locks.  We'll loop again
	 * below with the locks held.
	 */
	btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);

	mutex_lock(&inode->i_mutex);
	ret = inode_newsize_ok(inode, alloc_end);
	if (ret)
		goto out;

	if (alloc_start > inode->i_size) {
		ret = btrfs_cont_expand(inode, alloc_start);
		if (ret)
			goto out;
	}

	ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
	if (ret)
		goto out;

	locked_end = alloc_end - 1;
	while (1) {
		struct btrfs_ordered_extent *ordered;

		/* the extent lock is ordered inside the running
		 * transaction
		 */
		lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
				 locked_end, 0, &cached_state, GFP_NOFS);
		ordered = btrfs_lookup_first_ordered_extent(inode,
							    alloc_end - 1);
		if (ordered &&
		    ordered->file_offset + ordered->len > alloc_start &&
		    ordered->file_offset < alloc_end) {
			btrfs_put_ordered_extent(ordered);
			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
					     alloc_start, locked_end,
					     &cached_state, GFP_NOFS);
			/*
			 * we can't wait on the range with the transaction
			 * running or with the extent lock held
			 */
			btrfs_wait_ordered_range(inode, alloc_start,
						 alloc_end - alloc_start);
		} else {
			if (ordered)
				btrfs_put_ordered_extent(ordered);
			break;
		}
	}

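	/*
	 * walk the range one extent map at a time; anything that comes
	 * back as a hole (or sits past i_size without the prealloc
	 * flag) gets a preallocated extent
	 */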
	cur_offset = alloc_start;
	while (1) {
		em = btrfs_get_extent(inode, NULL, 0, cur_offset,
				      alloc_end - cur_offset, 0);
		BUG_ON(IS_ERR(em) || !em);
		last_byte = min(extent_map_end(em), alloc_end);
		last_byte = (last_byte + mask) & ~mask;
		if (em->block_start == EXTENT_MAP_HOLE ||
		    (cur_offset >= inode->i_size &&
		     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
			ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
							last_byte - cur_offset,
							1 << inode->i_blkbits,
							offset + len,
							&alloc_hint);
			if (ret < 0) {
				free_extent_map(em);
				break;
			}
		}
		free_extent_map(em);

		cur_offset = last_byte;
		if (cur_offset >= alloc_end) {
			ret = 0;
			break;
		}
	}
	unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
			     &cached_state, GFP_NOFS);

	btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
out:
	mutex_unlock(&inode->i_mutex);
	return ret;
}

const struct file_operations btrfs_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= do_sync_read,
	.write		= do_sync_write,
	.aio_read	= generic_file_aio_read,
	.splice_read	= generic_file_splice_read,
	.aio_write	= btrfs_file_aio_write,
	.mmap		= btrfs_file_mmap,
	.open		= generic_file_open,
	.release	= btrfs_release_file,
	.fsync		= btrfs_sync_file,
	.fallocate	= btrfs_fallocate,
	.unlocked_ioctl	= btrfs_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= btrfs_ioctl,
#endif
};