xref: /linux/fs/ext4/extents_status.c (revision 0d456bad36d42d16022be045c8a53ddbb59ee478)
/*
 *  fs/ext4/extents_status.c
 *
 * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
 * Modified by
 *	Allison Henderson <achender@linux.vnet.ibm.com>
 *	Hugh Dickins <hughd@google.com>
 *	Zheng Liu <wenqing.lz@taobao.com>
 *
 * Ext4 extents status tree core functions.
 */
#include <linux/rbtree.h>
#include "ext4.h"
#include "extents_status.h"
#include "ext4_extents.h"

#include <trace/events/ext4.h>

/*
 * According to previous discussions at the Ext4 Developer Workshop, we
 * will introduce a new structure called the io tree to track all extent
 * status in order to solve some problems that we have met
 * (e.g. the reservation space warning), and to provide extent-level
 * locking.  The delayed extent tree is the first step toward this
 * goal.  It was originally built by Yongqiang Yang.  At that time it
 * was called the delay extent tree, whose only goal was to track
 * delayed extents in memory to simplify the implementation of fiemap
 * and bigalloc, and to introduce lseek SEEK_DATA/SEEK_HOLE support.
 * That is why it is still called the delay extent tree in the comments
 * below.  But to better reflect what it does, it has been renamed to
 * the extent status tree.
 *
 * Currently the first step has been done.  All delayed extents are
 * tracked in the tree.  The tree is updated when a delayed allocation
 * is issued, and when the delayed extent is written out or
 * invalidated.  Therefore the implementation of fiemap and bigalloc is
 * simplified, and SEEK_DATA/SEEK_HOLE are supported.
 *
 * The following comment describes the implementation of the extent
 * status tree and future work.
 */

/*
 * extents status tree implementation for ext4.
 *
 *
 * ==========================================================================
 * Extents status encompasses delayed extents and extent locks
 *
 * 1. Why a delayed extent implementation ?
 *
 * Without delayed extents, ext4 identifies a delayed extent by looking
 * up the page cache, which has several deficiencies: complicated,
 * buggy and inefficient code.
 *
 * FIEMAP, SEEK_HOLE/DATA, bigalloc, punch hole and writeout all need
 * to know whether a block or a range of blocks belongs to a delayed
 * extent.
 *
 * Let us have a look at how they work without a delayed extents
 * implementation.
 *   --	FIEMAP
 *	FIEMAP looks up the page cache to distinguish delayed
 *	allocations from holes.
 *
 *   --	SEEK_HOLE/DATA
 *	SEEK_HOLE/DATA has the same problem as FIEMAP.
 *
 *   --	bigalloc
 *	bigalloc looks up the page cache to figure out whether a block
 *	is already under delayed allocation, to determine whether a
 *	quota reservation is needed for the cluster.
 *
 *   -- punch hole
 *	punch hole looks up the page cache to identify a delayed extent.
 *
 *   --	writeout
 *	Writeout looks up the whole page cache to see if a buffer is
 *	mapped.  If there are not very many delayed buffers, this is
 *	time consuming.
 *
 * With a delayed extents implementation, FIEMAP, SEEK_HOLE/DATA,
 * bigalloc and writeout can figure out whether a block or a range of
 * blocks is under delayed allocation (belongs to a delayed extent)
 * simply by searching the delayed extent tree.
 *
 *
 * ==========================================================================
 * 2. ext4 delayed extents implementation
 *
 *   --	delayed extent
 *	A delayed extent is a range of blocks which are logically
 *	contiguous and under delayed allocation.  Unlike an extent in
 *	ext4, a delayed extent is an in-memory structure and has no
 *	corresponding on-disk data.  There is no limit on the length of
 *	a delayed extent, so a delayed extent can contain as many
 *	blocks as are logically contiguous.
 *
 *   --	delayed extent tree
 *	Every inode has a delayed extent tree, and all blocks under
 *	delayed allocation are added to the tree as delayed extents.
 *	Delayed extents in the tree are ordered by logical block number.
 *
 *   --	operations on a delayed extent tree
 *	There are three operations on a delayed extent tree: finding
 *	the next delayed extent, adding a space (a range of blocks) and
 *	removing a space.  See the usage sketch following this comment.
 *
 *   --	race on a delayed extent tree
 *	The delayed extent tree is protected by inode->i_es_lock.
 *
 *
 * ==========================================================================
 * 3. performance analysis
 *   --	overhead
 *	1. There is a cached extent for write access, so if writes are
 *	not very random, adding-space operations run in O(1) time.
 *
 *   --	gain
 *	2. Code is much simpler, more readable, more maintainable and
 *	more efficient.
 *
 *
 * ==========================================================================
 * 4. TODO list
 *   -- Track all extent status
 *
 *   -- Improve the get block process
 *
 *   -- Extent-level locking
 */

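/*
 * A minimal usage sketch of the three operations above.  This is not
 * lifted from a real caller: the inode, the block numbers and the
 * control flow are made up for illustration, and error handling is
 * omitted.
 *
 *	struct extent_status es;
 *	ext4_lblk_t next;
 *
 *	ext4_es_insert_extent(inode, 100, 8);	adds blocks [100, 108)
 *	ext4_es_remove_extent(inode, 102, 2);	punches out [102, 104)
 *	es.start = 100;
 *	next = ext4_es_find_extent(inode, &es);	es is now [100, 102),
 *						next is 104
 */
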
static struct kmem_cache *ext4_es_cachep;

int __init ext4_init_es(void)
{
	ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT);
	if (ext4_es_cachep == NULL)
		return -ENOMEM;
	return 0;
}

void ext4_exit_es(void)
{
	if (ext4_es_cachep)
		kmem_cache_destroy(ext4_es_cachep);
}

void ext4_es_init_tree(struct ext4_es_tree *tree)
{
	tree->root = RB_ROOT;
	tree->cache_es = NULL;
}

#ifdef ES_DEBUG__
static void ext4_es_print_tree(struct inode *inode)
{
	struct ext4_es_tree *tree;
	struct rb_node *node;

	printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino);
	tree = &EXT4_I(inode)->i_es_tree;
	node = rb_first(&tree->root);
	while (node) {
		struct extent_status *es;
		es = rb_entry(node, struct extent_status, rb_node);
		printk(KERN_DEBUG " [%u/%u)", es->start, es->len);
		node = rb_next(node);
	}
	printk(KERN_DEBUG "\n");
}
#else
#define ext4_es_print_tree(inode)
#endif

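/*
 * es->len counts blocks, so the last block covered by @es is
 * start + len - 1; e.g. start == 100, len == 3 covers blocks 100..102.
 * The BUG_ON below catches start + len wrapping past the top of the
 * ext4_lblk_t range.
 */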
static inline ext4_lblk_t extent_status_end(struct extent_status *es)
{
	BUG_ON(es->start + es->len < es->start);
	return es->start + es->len - 1;
}

/*
 * Search the tree for a delayed extent containing a given offset.  If
 * one can't be found, return the next extent after the offset instead.
 */
static struct extent_status *__es_tree_search(struct rb_root *root,
					      ext4_lblk_t offset)
{
	struct rb_node *node = root->rb_node;
	struct extent_status *es = NULL;

	while (node) {
		es = rb_entry(node, struct extent_status, rb_node);
		if (offset < es->start)
			node = node->rb_left;
		else if (offset > extent_status_end(es))
			node = node->rb_right;
		else
			return es;
	}

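	/*
	 * No extent contains @offset.  @es is the last node examined on
	 * the search path, so the next extent in the tree is either @es
	 * itself or its in-order successor.
	 */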
	if (es && offset < es->start)
		return es;

	if (es && offset > extent_status_end(es)) {
		node = rb_next(&es->rb_node);
		return node ? rb_entry(node, struct extent_status, rb_node) :
			      NULL;
	}

	return NULL;
}

/*
 * ext4_es_find_extent: find the 1st delayed extent covering @es->start
 * if it exists, otherwise the next extent after @es->start.
 *
 * @inode: the inode which owns delayed extents
 * @es: delayed extent that we found
 *
 * Returns the first block of the extent following the one found, or
 * EXT_MAX_BLOCKS if there is no such extent.  The found delayed extent
 * is returned via @es; @es->len == 0 means no extent was found.
 */
ext4_lblk_t ext4_es_find_extent(struct inode *inode, struct extent_status *es)
{
	struct ext4_es_tree *tree = NULL;
	struct extent_status *es1 = NULL;
	struct rb_node *node;
	ext4_lblk_t ret = EXT_MAX_BLOCKS;

	trace_ext4_es_find_extent_enter(inode, es->start);

	read_lock(&EXT4_I(inode)->i_es_lock);
	tree = &EXT4_I(inode)->i_es_tree;

	/* find the delayed extent in the cache first */
	if (tree->cache_es) {
		es1 = tree->cache_es;
		if (in_range(es->start, es1->start, es1->len)) {
			es_debug("%u cached by [%u/%u)\n",
				 es->start, es1->start, es1->len);
			goto out;
		}
	}

	es->len = 0;
	es1 = __es_tree_search(&tree->root, es->start);

out:
	if (es1) {
		tree->cache_es = es1;
		es->start = es1->start;
		es->len = es1->len;
		node = rb_next(&es1->rb_node);
		if (node) {
			es1 = rb_entry(node, struct extent_status, rb_node);
			ret = es1->start;
		}
	}

	read_unlock(&EXT4_I(inode)->i_es_lock);

	trace_ext4_es_find_extent_exit(inode, es, ret);
	return ret;
}

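/*
 * A sketch of how a caller might walk every delayed extent with the
 * function above (hypothetical code, not taken from a real caller;
 * error handling omitted):
 *
 *	struct extent_status es;
 *	ext4_lblk_t next = 0;
 *
 *	do {
 *		es.start = next;
 *		next = ext4_es_find_extent(inode, &es);
 *		if (es.len == 0)
 *			break;	   no extent at or after es.start
 *		... use [es.start, es.start + es.len) ...
 *	} while (next != EXT_MAX_BLOCKS);
 */
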
static struct extent_status *
ext4_es_alloc_extent(ext4_lblk_t start, ext4_lblk_t len)
{
	struct extent_status *es;
	es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
	if (es == NULL)
		return NULL;
	es->start = start;
	es->len = len;
	return es;
}

static void ext4_es_free_extent(struct extent_status *es)
{
	kmem_cache_free(ext4_es_cachep, es);
}

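/*
 * Merge helpers: if @es is logically contiguous with its neighbour in
 * the tree, fold the two nodes into one.  For example (made-up block
 * numbers), merging left when es1 == [100/5) precedes es == [105/3)
 * erases @es and returns es1 grown to [100/8).
 */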
static struct extent_status *
ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es)
{
	struct extent_status *es1;
	struct rb_node *node;

	node = rb_prev(&es->rb_node);
	if (!node)
		return es;

	es1 = rb_entry(node, struct extent_status, rb_node);
	if (es->start == extent_status_end(es1) + 1) {
		es1->len += es->len;
		rb_erase(&es->rb_node, &tree->root);
		ext4_es_free_extent(es);
		es = es1;
	}

	return es;
}

static struct extent_status *
ext4_es_try_to_merge_right(struct ext4_es_tree *tree, struct extent_status *es)
{
	struct extent_status *es1;
	struct rb_node *node;

	node = rb_next(&es->rb_node);
	if (!node)
		return es;

	es1 = rb_entry(node, struct extent_status, rb_node);
	if (es1->start == extent_status_end(es) + 1) {
		es->len += es1->len;
		rb_erase(node, &tree->root);
		ext4_es_free_extent(es1);
	}

	return es;
}

static int __es_insert_extent(struct ext4_es_tree *tree, ext4_lblk_t offset,
			      ext4_lblk_t len)
{
	struct rb_node **p = &tree->root.rb_node;
	struct rb_node *parent = NULL;
	struct extent_status *es;
	ext4_lblk_t end = offset + len - 1;

	BUG_ON(end < offset);
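
	/*
	 * Fast path: try to extend or reuse the cached extent before
	 * falling back to a full tree walk.
	 */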
	es = tree->cache_es;
	if (es && offset == (extent_status_end(es) + 1)) {
		es_debug("cached by [%u/%u)\n", es->start, es->len);
		es->len += len;
		es = ext4_es_try_to_merge_right(tree, es);
		goto out;
	} else if (es && es->start == end + 1) {
		es_debug("cached by [%u/%u)\n", es->start, es->len);
		es->start = offset;
		es->len += len;
		es = ext4_es_try_to_merge_left(tree, es);
		goto out;
	} else if (es && es->start <= offset &&
		   end <= extent_status_end(es)) {
		es_debug("cached by [%u/%u)\n", es->start, es->len);
		goto out;
	}

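	/*
	 * Slow path: standard rbtree descent from the root, merging the
	 * new range into an adjacent or overlapping extent if one is
	 * found on the way down.
	 */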
	while (*p) {
		parent = *p;
		es = rb_entry(parent, struct extent_status, rb_node);

		if (offset < es->start) {
			if (es->start == end + 1) {
				es->start = offset;
				es->len += len;
				es = ext4_es_try_to_merge_left(tree, es);
				goto out;
			}
			p = &(*p)->rb_left;
		} else if (offset > extent_status_end(es)) {
			if (offset == extent_status_end(es) + 1) {
				es->len += len;
				es = ext4_es_try_to_merge_right(tree, es);
				goto out;
			}
			p = &(*p)->rb_right;
		} else {
			if (extent_status_end(es) <= end)
				es->len = offset - es->start + len;
			goto out;
		}
	}

	es = ext4_es_alloc_extent(offset, len);
	if (!es)
		return -ENOMEM;
	rb_link_node(&es->rb_node, parent, p);
	rb_insert_color(&es->rb_node, &tree->root);

out:
	tree->cache_es = es;
	return 0;
}

/*
 * ext4_es_insert_extent() adds a space to a delayed extent tree.
 * It takes i_es_lock itself, so the caller must not hold it.
 *
 * ext4_es_insert_extent is called by ext4_da_write_begin;
 * ext4_es_remove_extent() reuses the unlocked helper
 * __es_insert_extent() when it splits an extent.
 *
 * Return 0 on success, error code on failure.
 */
int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t offset,
			  ext4_lblk_t len)
{
	struct ext4_es_tree *tree;
	int err = 0;

	trace_ext4_es_insert_extent(inode, offset, len);
	es_debug("add [%u/%u) to extent status tree of inode %lu\n",
		 offset, len, inode->i_ino);

	write_lock(&EXT4_I(inode)->i_es_lock);
	tree = &EXT4_I(inode)->i_es_tree;
	err = __es_insert_extent(tree, offset, len);
	write_unlock(&EXT4_I(inode)->i_es_lock);

	ext4_es_print_tree(inode);

	return err;
}

/*
 * ext4_es_remove_extent() removes a space from a delayed extent tree.
 * It takes i_es_lock itself, so the caller must not hold it.
 *
 * Return 0 on success, error code on failure.
 */
int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset,
			  ext4_lblk_t len)
{
	struct rb_node *node;
	struct ext4_es_tree *tree;
	struct extent_status *es;
	struct extent_status orig_es;
	ext4_lblk_t len1, len2, end;
	int err = 0;

	trace_ext4_es_remove_extent(inode, offset, len);
	es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
		 offset, len, inode->i_ino);

	end = offset + len - 1;
	BUG_ON(end < offset);
	write_lock(&EXT4_I(inode)->i_es_lock);
	tree = &EXT4_I(inode)->i_es_tree;
	es = __es_tree_search(&tree->root, offset);
	if (!es)
		goto out;
	if (es->start > end)
		goto out;

	/* Simply invalidate cache_es. */
	tree->cache_es = NULL;

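	/*
	 * len1/len2 are the parts of the first overlapping extent that
	 * stick out in front of and behind the removed range.  For
	 * example (made-up numbers), removing blocks [10, 20) from an
	 * extent [5/20) (blocks 5..24) leaves len1 == 5 blocks in front
	 * and len2 == 5 blocks behind.
	 */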
	orig_es.start = es->start;
	orig_es.len = es->len;
	len1 = offset > es->start ? offset - es->start : 0;
	len2 = extent_status_end(es) > end ?
	       extent_status_end(es) - end : 0;
	if (len1 > 0)
		es->len = len1;
	if (len2 > 0) {
		if (len1 > 0) {
			err = __es_insert_extent(tree, end + 1, len2);
			if (err) {
				es->start = orig_es.start;
				es->len = orig_es.len;
				goto out;
			}
		} else {
			es->start = end + 1;
			es->len = len2;
		}
		goto out;
	}

	if (len1 > 0) {
		node = rb_next(&es->rb_node);
		if (node)
			es = rb_entry(node, struct extent_status, rb_node);
		else
			es = NULL;
	}

	while (es && extent_status_end(es) <= end) {
		node = rb_next(&es->rb_node);
		rb_erase(&es->rb_node, &tree->root);
		ext4_es_free_extent(es);
		if (!node) {
			es = NULL;
			break;
		}
		es = rb_entry(node, struct extent_status, rb_node);
	}

	if (es && es->start < end + 1) {
		len1 = extent_status_end(es) - end;
		es->start = end + 1;
		es->len = len1;
	}

out:
	write_unlock(&EXT4_I(inode)->i_es_lock);
	ext4_es_print_tree(inode);
	return err;
}