xref: /linux/fs/btrfs/raid-stripe-tree.c (revision 73082fbdb10aba317e8469f51e3411814f2e65b4)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2023 Western Digital Corporation or its affiliates.
4  */
5 
6 #include <linux/btrfs_tree.h>
7 #include "ctree.h"
8 #include "fs.h"
9 #include "accessors.h"
10 #include "transaction.h"
11 #include "disk-io.h"
12 #include "raid-stripe-tree.h"
13 #include "volumes.h"
14 #include "print-tree.h"
15 
/*
 * Shrink an existing RAID stripe extent item.
 *
 * @trans:    transaction handle
 * @path:     btree path positioned at the stripe extent item to shrink
 * @oldkey:   key of the existing stripe extent item
 * @newlen:   length (key offset) of the shrunk stripe extent
 * @frontpad: number of bytes dropped from the front of the extent
 *
 * Builds a stack copy of the item with every per-device stride's physical
 * address advanced by @frontpad, deletes the old item and re-inserts it
 * under a key whose objectid is shifted forward by @frontpad and whose
 * offset is @newlen.
 *
 * Returns 0 on success or a negative errno on failure.
 */
static int btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans,
					       struct btrfs_path *path,
					       const struct btrfs_key *oldkey,
					       u64 newlen, u64 frontpad)
{
	struct btrfs_root *stripe_root = trans->fs_info->stripe_root;
	struct btrfs_stripe_extent *extent, AUTO_KFREE(newitem);
	struct extent_buffer *leaf;
	int slot;
	size_t item_size;
	struct btrfs_key newkey = {
		.objectid = oldkey->objectid + frontpad,
		.type = BTRFS_RAID_STRIPE_KEY,
		.offset = newlen,
	};
	int ret;

	ASSERT(newlen > 0);
	ASSERT(oldkey->type == BTRFS_RAID_STRIPE_KEY);

	leaf = path->nodes[0];
	slot = path->slots[0];
	item_size = btrfs_item_size(leaf, slot);

	newitem = kzalloc(item_size, GFP_NOFS);
	if (!newitem)
		return -ENOMEM;

	extent = btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent);

	/* Copy every stride, shifting its physical address by @frontpad. */
	for (int i = 0; i < btrfs_num_raid_stripes(item_size); i++) {
		struct btrfs_raid_stride *stride = &extent->strides[i];
		u64 devid;
		u64 phys;

		devid = btrfs_raid_stride_devid(leaf, stride);
		btrfs_set_stack_raid_stride_devid(&newitem->strides[i], devid);
		phys = btrfs_raid_stride_physical(leaf, stride) + frontpad;
		btrfs_set_stack_raid_stride_physical(&newitem->strides[i], phys);
	}

	ret = btrfs_del_item(trans, stripe_root, path);
	if (ret)
		return ret;

	/* The path still points at the deleted slot; drop it before insert. */
	btrfs_release_path(path);
	return btrfs_insert_item(trans, stripe_root, &newkey, newitem, item_size);
}
64 
/*
 * Remove RAID stripe extent items for a freed logical range.
 *
 * @trans:  transaction handle
 * @start:  logical start of the range being deleted
 * @length: length of the range being deleted
 *
 * Deletes or truncates all stripe extent items overlapping
 * [@start, @start + @length). Items that only partially overlap the range
 * are shrunk via btrfs_partially_delete_raid_extent(); an item fully
 * containing the range is split into a left and a right remainder.
 *
 * Returns 0 on success (including when the filesystem has no stripe tree)
 * or a negative errno on failure.
 */
int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *stripe_root = fs_info->stripe_root;
	BTRFS_PATH_AUTO_FREE(path);
	struct btrfs_key key;
	struct extent_buffer *leaf;
	u64 found_start;
	u64 found_end;
	u64 end = start + length;
	int slot;
	int ret;

	if (!btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE) || !stripe_root)
		return 0;

	if (!btrfs_is_testing(fs_info)) {
		struct btrfs_chunk_map *map;
		bool use_rst;

		/* Only chunks whose profile uses the stripe tree have items. */
		map = btrfs_find_chunk_map(fs_info, start, length);
		if (!map)
			return -EINVAL;
		use_rst = btrfs_need_stripe_tree_update(fs_info, map->type);
		btrfs_free_chunk_map(map);
		if (!use_rst)
			return 0;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* Loop until the whole range has been consumed or an error occurs. */
	while (1) {
		key.objectid = start;
		key.type = BTRFS_RAID_STRIPE_KEY;
		key.offset = (u64)-1;

		ret = btrfs_search_slot(trans, stripe_root, &key, path, -1, 1);
		if (ret < 0)
			break;

		/*
		 * Search with offset=(u64)-1 ensures we land on the correct
		 * leaf even when the target entry is the first item on a leaf.
		 * Since no real entry has offset=(u64)-1, ret is always 1 and
		 * slot points past the last entry with objectid==start (or
		 * past the end of the leaf if that entry is the last item).
		 * Back up one slot to find the actual entry.
		 */
		if (path->slots[0] == 0) {
			/* No entry with objectid <= start exists. */
			ret = 0;
			break;
		}
		path->slots[0]--;

		leaf = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(leaf, &key, slot);
		found_start = key.objectid;
		found_end = found_start + key.offset;
		ret = 0;

		/*
		 * The stripe extent starts before the range we want to delete,
		 * but the range spans more than one stripe extent:
		 *
		 * |--- RAID Stripe Extent ---||--- RAID Stripe Extent ---|
		 *        |--- keep  ---|--- drop ---|
		 *
		 * This means we have to get the previous item, truncate its
		 * length and then restart the search.
		 */
		if (found_start > start) {
			if (slot == 0) {
				ret = btrfs_previous_item(stripe_root, path, 0,
							  BTRFS_RAID_STRIPE_KEY);
				if (ret) {
					if (ret > 0)
						ret = -ENOENT;
					break;
				}
			} else {
				path->slots[0]--;
			}

			/* Re-read the key of the item we stepped back to. */
			leaf = path->nodes[0];
			slot = path->slots[0];
			btrfs_item_key_to_cpu(leaf, &key, slot);
			found_start = key.objectid;
			found_end = found_start + key.offset;
			if (found_start > start || found_end <= start) {
				ret = -ENOENT;
				break;
			}
		}

		if (key.type != BTRFS_RAID_STRIPE_KEY)
			break;

		/* That stripe ends before we start, we're done. */
		if (found_end <= start)
			break;

		trace_btrfs_raid_extent_delete(fs_info, start, end,
					       found_start, found_end);

		/*
		 * The stripe extent starts before the range we want to delete
		 * and ends after the range we want to delete, i.e. we're
		 * punching a hole in the stripe extent:
		 *
		 *  |--- RAID Stripe Extent ---|
		 *  | keep |--- drop ---| keep |
		 *
		 * This means we need to a) truncate the existing item and b)
		 * create a second item for the remaining range.
		 */
		if (found_start < start && found_end > end) {
			size_t item_size;
			u64 diff_start = start - found_start;
			u64 diff_end = found_end - end;
			struct btrfs_stripe_extent *extent;
			struct btrfs_key newkey = {
				.objectid = end,
				.type = BTRFS_RAID_STRIPE_KEY,
				.offset = diff_end,
			};

			/* The "right" item. */
			ret = btrfs_duplicate_item(trans, stripe_root, path, &newkey);
			if (ret == -EAGAIN) {
				btrfs_release_path(path);
				continue;
			}
			if (ret)
				break;

			/*
			 * btrfs_duplicate_item() may have triggered a leaf
			 * split via setup_leaf_for_split(), so we must refresh
			 * our leaf pointer from the path.
			 */
			leaf = path->nodes[0];
			item_size = btrfs_item_size(leaf, path->slots[0]);
			extent = btrfs_item_ptr(leaf, path->slots[0],
						struct btrfs_stripe_extent);

			/* Shift the right item's strides past the punched hole. */
			for (int i = 0; i < btrfs_num_raid_stripes(item_size); i++) {
				struct btrfs_raid_stride *stride = &extent->strides[i];
				u64 phys;

				phys = btrfs_raid_stride_physical(leaf, stride);
				phys += diff_start + length;
				btrfs_set_raid_stride_physical(leaf, stride, phys);
			}

			/* The "left" item. */
			path->slots[0]--;
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
			ret = btrfs_partially_delete_raid_extent(trans, path,
								 &key,
								 diff_start, 0);
			break;
		}

		/*
		 * The stripe extent starts before the range we want to delete:
		 *
		 * |--- RAID Stripe Extent ---|
		 * |--- keep  ---|--- drop ---|
		 *
		 * This means we have to duplicate the tree item, truncate the
		 * length to the new size and then re-insert the item.
		 */
		if (found_start < start) {
			u64 diff_start = start - found_start;

			ret = btrfs_partially_delete_raid_extent(trans, path,
								 &key,
								 diff_start, 0);
			if (ret)
				break;

			/* Advance past the portion just dropped and retry. */
			start += (key.offset - diff_start);
			length -= (key.offset - diff_start);
			if (length == 0)
				break;

			btrfs_release_path(path);
			continue;
		}

		/*
		 * The stripe extent ends after the range we want to delete:
		 *
		 * |--- RAID Stripe Extent ---|
		 * |--- drop  ---|--- keep ---|
		 *
		 * This means we have to duplicate the tree item, truncate the
		 * length to the new size and then re-insert the item.
		 */
		if (found_end > end) {
			u64 diff_end = found_end - end;

			ret = btrfs_partially_delete_raid_extent(trans, path,
								 &key,
								 key.offset - length,
								 length);
			ASSERT(key.offset - diff_end == length);
			break;
		}

		/* Finally we can delete the whole item, no more special cases. */
		ret = btrfs_del_item(trans, stripe_root, path);
		if (ret)
			break;

		start += key.offset;
		length -= key.offset;
		if (length == 0)
			break;

		btrfs_release_path(path);
	}

	return ret;
}
294 
update_raid_extent_item(struct btrfs_trans_handle * trans,struct btrfs_key * key,struct btrfs_stripe_extent * stripe_extent,const size_t item_size)295 static int update_raid_extent_item(struct btrfs_trans_handle *trans,
296 				   struct btrfs_key *key,
297 				   struct btrfs_stripe_extent *stripe_extent,
298 				   const size_t item_size)
299 {
300 	BTRFS_PATH_AUTO_FREE(path);
301 	struct extent_buffer *leaf;
302 	int ret;
303 	int slot;
304 
305 	path = btrfs_alloc_path();
306 	if (!path)
307 		return -ENOMEM;
308 
309 	ret = btrfs_search_slot(trans, trans->fs_info->stripe_root, key, path,
310 				0, 1);
311 	if (ret)
312 		return (ret == 1 ? ret : -EINVAL);
313 
314 	leaf = path->nodes[0];
315 	slot = path->slots[0];
316 
317 	write_extent_buffer(leaf, stripe_extent, btrfs_item_ptr_offset(leaf, slot),
318 			    item_size);
319 
320 	return ret;
321 }
322 
323 EXPORT_FOR_TESTS
btrfs_insert_one_raid_extent(struct btrfs_trans_handle * trans,struct btrfs_io_context * bioc)324 int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
325 				 struct btrfs_io_context *bioc)
326 {
327 	struct btrfs_fs_info *fs_info = trans->fs_info;
328 	struct btrfs_key stripe_key;
329 	struct btrfs_root *stripe_root = fs_info->stripe_root;
330 	const int num_stripes = btrfs_bg_type_to_factor(bioc->map_type);
331 	struct btrfs_stripe_extent AUTO_KFREE(stripe_extent);
332 	const size_t item_size = struct_size(stripe_extent, strides, num_stripes);
333 	int ret;
334 
335 	stripe_extent = kzalloc(item_size, GFP_NOFS);
336 	if (unlikely(!stripe_extent)) {
337 		btrfs_abort_transaction(trans, -ENOMEM);
338 		btrfs_end_transaction(trans);
339 		return -ENOMEM;
340 	}
341 
342 	trace_btrfs_insert_one_raid_extent(fs_info, bioc->logical, bioc->size,
343 					   num_stripes);
344 	for (int i = 0; i < num_stripes; i++) {
345 		u64 devid = bioc->stripes[i].dev->devid;
346 		u64 physical = bioc->stripes[i].physical;
347 		struct btrfs_raid_stride *raid_stride = &stripe_extent->strides[i];
348 
349 		btrfs_set_stack_raid_stride_devid(raid_stride, devid);
350 		btrfs_set_stack_raid_stride_physical(raid_stride, physical);
351 	}
352 
353 	stripe_key.objectid = bioc->logical;
354 	stripe_key.type = BTRFS_RAID_STRIPE_KEY;
355 	stripe_key.offset = bioc->size;
356 
357 	ret = btrfs_insert_item(trans, stripe_root, &stripe_key, stripe_extent,
358 				item_size);
359 	if (ret == -EEXIST) {
360 		ret = update_raid_extent_item(trans, &stripe_key, stripe_extent,
361 					      item_size);
362 		if (ret)
363 			btrfs_abort_transaction(trans, ret);
364 	} else if (ret) {
365 		btrfs_abort_transaction(trans, ret);
366 	}
367 
368 	return ret;
369 }
370 
btrfs_insert_raid_extent(struct btrfs_trans_handle * trans,struct btrfs_ordered_extent * ordered_extent)371 int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans,
372 			     struct btrfs_ordered_extent *ordered_extent)
373 {
374 	struct btrfs_io_context *bioc;
375 	int ret;
376 
377 	if (!btrfs_fs_incompat(trans->fs_info, RAID_STRIPE_TREE))
378 		return 0;
379 
380 	list_for_each_entry(bioc, &ordered_extent->bioc_list, rst_ordered_entry) {
381 		ret = btrfs_insert_one_raid_extent(trans, bioc);
382 		if (ret)
383 			return ret;
384 	}
385 
386 	while (!list_empty(&ordered_extent->bioc_list)) {
387 		bioc = list_first_entry(&ordered_extent->bioc_list,
388 					typeof(*bioc), rst_ordered_entry);
389 		list_del(&bioc->rst_ordered_entry);
390 		btrfs_put_bioc(bioc);
391 	}
392 
393 	return 0;
394 }
395 
/*
 * Look up the physical address for a logical range on a specific device.
 *
 * @fs_info:      filesystem info
 * @logical:      logical address to map
 * @length:       in: length of the range; out: trimmed to the portion
 *                covered by a single stripe extent (callers split the bio
 *                at that boundary)
 * @map_type:     block group profile of the chunk
 * @stripe_index: which stride to pick when multiple strides share a devid
 *                (only consulted for DUP profiles)
 * @stripe:       in: ->dev identifies the target device (and
 *                ->rst_search_commit_root selects commit-root search);
 *                out: ->physical is set to the mapped address
 *
 * Returns 0 on success, -ENODATA when no matching stripe extent or devid
 * is found, or another negative errno on failure.
 */
int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
				 u64 logical, u64 *length, u64 map_type,
				 u32 stripe_index, struct btrfs_io_stripe *stripe)
{
	struct btrfs_root *stripe_root = fs_info->stripe_root;
	struct btrfs_stripe_extent *stripe_extent;
	struct btrfs_key stripe_key;
	struct btrfs_key found_key;
	BTRFS_PATH_AUTO_FREE(path);
	struct extent_buffer *leaf;
	const u64 end = logical + *length;
	int num_stripes;
	u64 offset;
	u64 found_logical;
	u64 found_length;
	u64 found_end;
	int slot;
	int ret;

	stripe_key.objectid = logical;
	stripe_key.type = BTRFS_RAID_STRIPE_KEY;
	stripe_key.offset = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	if (stripe->rst_search_commit_root) {
		path->skip_locking = true;
		path->search_commit_root = true;
	}

	ret = btrfs_search_slot(NULL, stripe_root, &stripe_key, path, 0, 0);
	if (ret < 0)
		return ret;
	if (ret) {
		/*
		 * Not found exactly; step back one slot so we inspect the
		 * item that may still cover @logical.
		 */
		if (path->slots[0] != 0)
			path->slots[0]--;
	}

	/* Walk forward until an item containing @logical is found. */
	while (1) {
		leaf = path->nodes[0];
		slot = path->slots[0];

		btrfs_item_key_to_cpu(leaf, &found_key, slot);
		found_logical = found_key.objectid;
		found_length = found_key.offset;
		found_end = found_logical + found_length;

		if (found_logical > end) {
			ret = -ENODATA;
			goto out;
		}

		if (in_range(logical, found_logical, found_length))
			break;

		ret = btrfs_next_item(stripe_root, path);
		if (ret)
			goto out;
	}

	offset = logical - found_logical;

	/*
	 * If we have a logically contiguous, but physically non-continuous
	 * range, we need to split the bio. Record the length after which we
	 * must split the bio.
	 */
	if (end > found_end)
		*length -= end - found_end;

	num_stripes = btrfs_num_raid_stripes(btrfs_item_size(leaf, slot));
	stripe_extent = btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent);

	/* Find the stride that belongs to the requested device. */
	for (int i = 0; i < num_stripes; i++) {
		struct btrfs_raid_stride *stride = &stripe_extent->strides[i];
		u64 devid = btrfs_raid_stride_devid(leaf, stride);
		u64 physical = btrfs_raid_stride_physical(leaf, stride);

		if (devid != stripe->dev->devid)
			continue;

		/*
		 * Presumably DUP strides share a devid, so the caller's
		 * stripe_index disambiguates which copy to use -- confirm
		 * against the chunk mapping code.
		 */
		if ((map_type & BTRFS_BLOCK_GROUP_DUP) && stripe_index != i)
			continue;

		stripe->physical = physical + offset;

		trace_btrfs_get_raid_extent_offset(fs_info, logical, *length,
						   stripe->physical, devid);

		return 0;
	}

	/* If we're here, we haven't found the requested devid in the stripe. */
	ret = -ENODATA;
out:
	/* btrfs_next_item() returning 1 means we ran off the tree. */
	if (ret > 0)
		ret = -ENODATA;
	if (ret && ret != -EIO && !stripe->rst_search_commit_root) {
		btrfs_debug(fs_info,
		"cannot find raid-stripe for logical [%llu, %llu] devid %llu, profile %s",
			  logical, logical + *length, stripe->dev->devid,
			  btrfs_bg_type_to_raid_name(map_type));
	}

	return ret;
}
504