xref: /illumos-gate/usr/src/uts/common/fs/zfs/vdev_indirect_mapping.c (revision ea4ea50f064c5468142b24627acad09a41f060cb)
1 /*
2  * CDDL HEADER START
3  *
4  * This file and its contents are supplied under the terms of the
5  * Common Development and Distribution License ("CDDL"), version 1.0.
6  * You may only use this file in accordance with the terms of version
7  * 1.0 of the CDDL.
8  *
9  * A full copy of the text of the CDDL should have accompanied this
10  * source.  A copy of the CDDL is also available via the Internet at
11  * http://www.illumos.org/license/CDDL.
12  *
13  * CDDL HEADER END
14  */
15 
16 /*
17  * Copyright (c) 2015, 2017 by Delphix. All rights reserved.
18  */
19 
20 #include <sys/dmu_tx.h>
21 #include <sys/dsl_pool.h>
22 #include <sys/spa.h>
23 #include <sys/vdev_impl.h>
24 #include <sys/vdev_indirect_mapping.h>
25 #include <sys/zfeature.h>
26 #include <sys/dmu_objset.h>
27 
28 static boolean_t
29 vdev_indirect_mapping_verify(vdev_indirect_mapping_t *vim)
30 {
31 	ASSERT(vim != NULL);
32 
33 	ASSERT(vim->vim_object != 0);
34 	ASSERT(vim->vim_objset != NULL);
35 	ASSERT(vim->vim_phys != NULL);
36 	ASSERT(vim->vim_dbuf != NULL);
37 
38 	EQUIV(vim->vim_phys->vimp_num_entries > 0,
39 	    vim->vim_entries != NULL);
40 	if (vim->vim_phys->vimp_num_entries > 0) {
41 		vdev_indirect_mapping_entry_phys_t *last_entry =
42 		    &vim->vim_entries[vim->vim_phys->vimp_num_entries - 1];
43 		uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(last_entry);
44 		uint64_t size = DVA_GET_ASIZE(&last_entry->vimep_dst);
45 
46 		ASSERT3U(vim->vim_phys->vimp_max_offset, >=, offset + size);
47 	}
48 	if (vim->vim_havecounts) {
49 		ASSERT(vim->vim_phys->vimp_counts_object != 0);
50 	}
51 
52 	return (B_TRUE);
53 }
54 
55 uint64_t
56 vdev_indirect_mapping_num_entries(vdev_indirect_mapping_t *vim)
57 {
58 	ASSERT(vdev_indirect_mapping_verify(vim));
59 
60 	return (vim->vim_phys->vimp_num_entries);
61 }
62 
63 uint64_t
64 vdev_indirect_mapping_max_offset(vdev_indirect_mapping_t *vim)
65 {
66 	ASSERT(vdev_indirect_mapping_verify(vim));
67 
68 	return (vim->vim_phys->vimp_max_offset);
69 }
70 
71 uint64_t
72 vdev_indirect_mapping_object(vdev_indirect_mapping_t *vim)
73 {
74 	ASSERT(vdev_indirect_mapping_verify(vim));
75 
76 	return (vim->vim_object);
77 }
78 
79 uint64_t
80 vdev_indirect_mapping_bytes_mapped(vdev_indirect_mapping_t *vim)
81 {
82 	ASSERT(vdev_indirect_mapping_verify(vim));
83 
84 	return (vim->vim_phys->vimp_bytes_mapped);
85 }
86 
87 /*
88  * The length (in bytes) of the mapping object array in memory and
89  * (logically) on disk.
90  *
91  * Note that unlike most of our accessor functions,
92  * we don't assert that the struct is consistent; therefore it can be
93  * called while there may be concurrent changes, if we don't care about
94  * the value being immediately stale (e.g. from spa_removal_get_stats()).
95  */
96 uint64_t
97 vdev_indirect_mapping_size(vdev_indirect_mapping_t *vim)
98 {
99 	return (vim->vim_phys->vimp_num_entries * sizeof (*vim->vim_entries));
100 }
101 
102 /*
103  * Compare an offset with an indirect mapping entry; there are three
104  * possible scenarios:
105  *
106  *     1. The offset is "less than" the mapping entry; meaning the
107  *        offset is less than the source offset of the mapping entry. In
108  *        this case, there is no overlap between the offset and the
109  *        mapping entry and -1 will be returned.
110  *
111  *     2. The offset is "greater than" the mapping entry; meaning the
112  *        offset is greater than the mapping entry's source offset plus
113  *        the entry's size. In this case, there is no overlap between
114  *        the offset and the mapping entry and 1 will be returned.
115  *
116  *        NOTE: If the offset is actually equal to the entry's offset
117  *        plus size, this is considered to be "greater" than the entry,
118  *        and this case applies (i.e. 1 will be returned). Thus, the
119  *        entry's "range" can be considered to be inclusive at its
120  *        start, but exclusive at its end: e.g. [src, src + size).
121  *
122  *     3. The last case to consider is if the offset actually falls
123  *        within the mapping entry's range. If this is the case, the
124  *        offset is considered to be "equal to" the mapping entry and
125  *        0 will be returned.
126  *
127  *        NOTE: If the offset is equal to the entry's source offset,
128  *        this case applies and 0 will be returned. If the offset is
129  *        equal to the entry's source plus its size, this case does
130  *        *not* apply (see "NOTE" above for scenario 2), and 1 will be
131  *        returned.
132  */
133 static int
134 dva_mapping_overlap_compare(const void *v_key, const void *v_array_elem)
135 {
136 	const uint64_t *key = v_key;
137 	const vdev_indirect_mapping_entry_phys_t *array_elem =
138 	    v_array_elem;
139 	uint64_t src_offset = DVA_MAPPING_GET_SRC_OFFSET(array_elem);
140 
141 	if (*key < src_offset) {
142 		return (-1);
143 	} else if (*key < src_offset + DVA_GET_ASIZE(&array_elem->vimep_dst)) {
144 		return (0);
145 	} else {
146 		return (1);
147 	}
148 }
149 
150 /*
151  * Returns the mapping entry for the given offset.
152  *
153  * It's possible that the given offset will not be in the mapping table
154  * (i.e. no mapping entries contain this offset), in which case, the
155  * return value value depends on the "next_if_missing" parameter.
156  *
157  * If the offset is not found in the table and "next_if_missing" is
158  * B_FALSE, then NULL will always be returned. The behavior is intended
159  * to allow consumers to get the entry corresponding to the offset
160  * parameter, iff the offset overlaps with an entry in the table.
161  *
162  * If the offset is not found in the table and "next_if_missing" is
163  * B_TRUE, then the entry nearest to the given offset will be returned,
164  * such that the entry's source offset is greater than the offset
165  * passed in (i.e. the "next" mapping entry in the table is returned, if
166  * the offset is missing from the table). If there are no entries whose
167  * source offset is greater than the passed in offset, NULL is returned.
168  */
static vdev_indirect_mapping_entry_phys_t *
vdev_indirect_mapping_entry_for_offset_impl(vdev_indirect_mapping_t *vim,
    uint64_t offset, boolean_t next_if_missing)
{
	ASSERT(vdev_indirect_mapping_verify(vim));
	/* The binary search below requires a non-empty entry array. */
	ASSERT(vim->vim_phys->vimp_num_entries > 0);

	vdev_indirect_mapping_entry_phys_t *entry = NULL;

	uint64_t last = vim->vim_phys->vimp_num_entries - 1;
	uint64_t base = 0;

	/*
	 * We don't define these inside of the while loop because we use
	 * their value in the case that offset isn't in the mapping.
	 */
	uint64_t mid;
	int result;

	/*
	 * Standard binary search over the entry array, which is sorted
	 * by source offset (see dva_mapping_overlap_compare()).
	 */
	while (last >= base) {
		mid = base + ((last - base) >> 1);

		result = dva_mapping_overlap_compare(&offset,
		    &vim->vim_entries[mid]);

		if (result == 0) {
			entry = &vim->vim_entries[mid];
			break;
		} else if (result < 0) {
			/*
			 * NOTE(review): if mid == 0 here (i.e. offset
			 * precedes every entry in the table), this
			 * subtraction wraps last to UINT64_MAX and the
			 * next iteration indexes out of bounds.  Callers
			 * appear to only look up offsets at or beyond
			 * the first entry — TODO confirm.
			 */
			last = mid - 1;
		} else {
			base = mid + 1;
		}
	}

	if (entry == NULL && next_if_missing) {
		/* The search terminated without a hit; base crossed last. */
		ASSERT3U(base, ==, last + 1);
		ASSERT(mid == base || mid == last);
		ASSERT3S(result, !=, 0);

		/*
		 * The offset we're looking for isn't actually contained
		 * in the mapping table, thus we need to return the
		 * closest mapping entry that is greater than the
		 * offset. We reuse the result of the last comparison,
		 * comparing the mapping entry at index "mid" and the
		 * offset. The offset is guaranteed to lie between
		 * indices one less than "mid", and one greater than
		 * "mid"; we just need to determine if offset is greater
		 * than, or less than the mapping entry contained at
		 * index "mid".
		 */

		uint64_t index;
		if (result < 0)
			index = mid;
		else
			index = mid + 1;

		ASSERT3U(index, <=, vim->vim_phys->vimp_num_entries);

		if (index == vim->vim_phys->vimp_num_entries) {
			/*
			 * If "index" is past the end of the entries
			 * array, then not only is the offset not in the
			 * mapping table, but it's actually greater than
			 * all entries in the table. In this case, we
			 * can't return a mapping entry greater than the
			 * offset (since none exist), so we return NULL.
			 */

			ASSERT3S(dva_mapping_overlap_compare(&offset,
			    &vim->vim_entries[index - 1]), >, 0);

			return (NULL);
		} else {
			/*
			 * Just to be safe, we verify the offset falls
			 * in between the mapping entries at index and
			 * one less than index. Since we know the offset
			 * doesn't overlap an entry, and we're supposed
			 * to return the entry just greater than the
			 * offset, both of the following tests must be
			 * true.
			 */
			ASSERT3S(dva_mapping_overlap_compare(&offset,
			    &vim->vim_entries[index]), <, 0);
			IMPLY(index >= 1, dva_mapping_overlap_compare(&offset,
			    &vim->vim_entries[index - 1]) > 0);

			return (&vim->vim_entries[index]);
		}
	} else {
		/* Exact hit, or miss with next_if_missing == B_FALSE. */
		return (entry);
	}
}
265 
266 vdev_indirect_mapping_entry_phys_t *
267 vdev_indirect_mapping_entry_for_offset(vdev_indirect_mapping_t *vim,
268     uint64_t offset)
269 {
270 	return (vdev_indirect_mapping_entry_for_offset_impl(vim, offset,
271 	    B_FALSE));
272 }
273 
274 vdev_indirect_mapping_entry_phys_t *
275 vdev_indirect_mapping_entry_for_offset_or_next(vdev_indirect_mapping_t *vim,
276     uint64_t offset)
277 {
278 	return (vdev_indirect_mapping_entry_for_offset_impl(vim, offset,
279 	    B_TRUE));
280 }
281 
282 
283 void
284 vdev_indirect_mapping_close(vdev_indirect_mapping_t *vim)
285 {
286 	ASSERT(vdev_indirect_mapping_verify(vim));
287 
288 	if (vim->vim_phys->vimp_num_entries > 0) {
289 		uint64_t map_size = vdev_indirect_mapping_size(vim);
290 		kmem_free(vim->vim_entries, map_size);
291 		vim->vim_entries = NULL;
292 	}
293 
294 	dmu_buf_rele(vim->vim_dbuf, vim);
295 
296 	vim->vim_objset = NULL;
297 	vim->vim_object = 0;
298 	vim->vim_dbuf = NULL;
299 	vim->vim_phys = NULL;
300 
301 	kmem_free(vim, sizeof (*vim));
302 }
303 
304 uint64_t
305 vdev_indirect_mapping_alloc(objset_t *os, dmu_tx_t *tx)
306 {
307 	uint64_t object;
308 	ASSERT(dmu_tx_is_syncing(tx));
309 	uint64_t bonus_size = VDEV_INDIRECT_MAPPING_SIZE_V0;
310 
311 	if (spa_feature_is_enabled(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
312 		bonus_size = sizeof (vdev_indirect_mapping_phys_t);
313 	}
314 
315 	object = dmu_object_alloc(os,
316 	    DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE,
317 	    DMU_OTN_UINT64_METADATA, bonus_size,
318 	    tx);
319 
320 	if (spa_feature_is_enabled(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
321 		dmu_buf_t *dbuf;
322 		vdev_indirect_mapping_phys_t *vimp;
323 
324 		VERIFY0(dmu_bonus_hold(os, object, FTAG, &dbuf));
325 		dmu_buf_will_dirty(dbuf, tx);
326 		vimp = dbuf->db_data;
327 		vimp->vimp_counts_object = dmu_object_alloc(os,
328 		    DMU_OTN_UINT32_METADATA, SPA_OLD_MAXBLOCKSIZE,
329 		    DMU_OT_NONE, 0, tx);
330 		spa_feature_incr(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
331 		dmu_buf_rele(dbuf, FTAG);
332 	}
333 
334 	return (object);
335 }
336 
337 
338 vdev_indirect_mapping_t *
339 vdev_indirect_mapping_open(objset_t *os, uint64_t mapping_object)
340 {
341 	vdev_indirect_mapping_t *vim = kmem_zalloc(sizeof (*vim), KM_SLEEP);
342 	dmu_object_info_t doi;
343 	VERIFY0(dmu_object_info(os, mapping_object, &doi));
344 
345 	vim->vim_objset = os;
346 	vim->vim_object = mapping_object;
347 
348 	VERIFY0(dmu_bonus_hold(os, vim->vim_object, vim,
349 	    &vim->vim_dbuf));
350 	vim->vim_phys = vim->vim_dbuf->db_data;
351 
352 	vim->vim_havecounts =
353 	    (doi.doi_bonus_size > VDEV_INDIRECT_MAPPING_SIZE_V0);
354 
355 	if (vim->vim_phys->vimp_num_entries > 0) {
356 		uint64_t map_size = vdev_indirect_mapping_size(vim);
357 		vim->vim_entries = kmem_alloc(map_size, KM_SLEEP);
358 		VERIFY0(dmu_read(os, vim->vim_object, 0, map_size,
359 		    vim->vim_entries, DMU_READ_PREFETCH));
360 	}
361 
362 	ASSERT(vdev_indirect_mapping_verify(vim));
363 
364 	return (vim);
365 }
366 
367 void
368 vdev_indirect_mapping_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
369 {
370 	vdev_indirect_mapping_t *vim = vdev_indirect_mapping_open(os, object);
371 	if (vim->vim_havecounts) {
372 		VERIFY0(dmu_object_free(os, vim->vim_phys->vimp_counts_object,
373 		    tx));
374 		spa_feature_decr(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
375 	}
376 	vdev_indirect_mapping_close(vim);
377 
378 	VERIFY0(dmu_object_free(os, object, tx));
379 }
380 
381 /*
382  * Append the list of vdev_indirect_mapping_entry_t's to the on-disk
383  * mapping object.  Also remove the entries from the list and free them.
384  * This also implicitly extends the max_offset of the mapping (to the end
385  * of the last entry).
386  */
387 void
388 vdev_indirect_mapping_add_entries(vdev_indirect_mapping_t *vim,
389     list_t *list, dmu_tx_t *tx)
390 {
391 	vdev_indirect_mapping_entry_phys_t *mapbuf;
392 	uint64_t old_size;
393 	uint32_t *countbuf = NULL;
394 	vdev_indirect_mapping_entry_phys_t *old_entries;
395 	uint64_t old_count;
396 	uint64_t entries_written = 0;
397 
398 	ASSERT(vdev_indirect_mapping_verify(vim));
399 	ASSERT(dmu_tx_is_syncing(tx));
400 	ASSERT(dsl_pool_sync_context(dmu_tx_pool(tx)));
401 	ASSERT(!list_is_empty(list));
402 
403 	old_size = vdev_indirect_mapping_size(vim);
404 	old_entries = vim->vim_entries;
405 	old_count = vim->vim_phys->vimp_num_entries;
406 
407 	dmu_buf_will_dirty(vim->vim_dbuf, tx);
408 
409 	mapbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
410 	if (vim->vim_havecounts) {
411 		countbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
412 		ASSERT(spa_feature_is_active(vim->vim_objset->os_spa,
413 		    SPA_FEATURE_OBSOLETE_COUNTS));
414 	}
415 	while (!list_is_empty(list)) {
416 		uint64_t i;
417 		/*
418 		 * Write entries from the list to the
419 		 * vdev_im_object in batches of size SPA_OLD_MAXBLOCKSIZE.
420 		 */
421 		for (i = 0; i < SPA_OLD_MAXBLOCKSIZE / sizeof (*mapbuf); i++) {
422 			vdev_indirect_mapping_entry_t *entry =
423 			    list_remove_head(list);
424 			if (entry == NULL)
425 				break;
426 
427 			uint64_t size =
428 			    DVA_GET_ASIZE(&entry->vime_mapping.vimep_dst);
429 			uint64_t src_offset =
430 			    DVA_MAPPING_GET_SRC_OFFSET(&entry->vime_mapping);
431 
432 			/*
433 			 * We shouldn't be adding an entry which is fully
434 			 * obsolete.
435 			 */
436 			ASSERT3U(entry->vime_obsolete_count, <, size);
437 			IMPLY(entry->vime_obsolete_count != 0,
438 			    vim->vim_havecounts);
439 
440 			mapbuf[i] = entry->vime_mapping;
441 			if (vim->vim_havecounts)
442 				countbuf[i] = entry->vime_obsolete_count;
443 
444 			vim->vim_phys->vimp_bytes_mapped += size;
445 			ASSERT3U(src_offset, >=,
446 			    vim->vim_phys->vimp_max_offset);
447 			vim->vim_phys->vimp_max_offset = src_offset + size;
448 
449 			entries_written++;
450 
451 			kmem_free(entry, sizeof (*entry));
452 		}
453 		dmu_write(vim->vim_objset, vim->vim_object,
454 		    vim->vim_phys->vimp_num_entries * sizeof (*mapbuf),
455 		    i * sizeof (*mapbuf),
456 		    mapbuf, tx);
457 		if (vim->vim_havecounts) {
458 			dmu_write(vim->vim_objset,
459 			    vim->vim_phys->vimp_counts_object,
460 			    vim->vim_phys->vimp_num_entries *
461 			    sizeof (*countbuf),
462 			    i * sizeof (*countbuf), countbuf, tx);
463 		}
464 		vim->vim_phys->vimp_num_entries += i;
465 	}
466 	zio_buf_free(mapbuf, SPA_OLD_MAXBLOCKSIZE);
467 	if (vim->vim_havecounts)
468 		zio_buf_free(countbuf, SPA_OLD_MAXBLOCKSIZE);
469 
470 	/*
471 	 * Update the entry array to reflect the new entries. First, copy
472 	 * over any old entries then read back the new entries we just wrote.
473 	 */
474 	uint64_t new_size = vdev_indirect_mapping_size(vim);
475 	ASSERT3U(new_size, >, old_size);
476 	ASSERT3U(new_size - old_size, ==,
477 	    entries_written * sizeof (vdev_indirect_mapping_entry_phys_t));
478 	vim->vim_entries = kmem_alloc(new_size, KM_SLEEP);
479 	if (old_size > 0) {
480 		bcopy(old_entries, vim->vim_entries, old_size);
481 		kmem_free(old_entries, old_size);
482 	}
483 	VERIFY0(dmu_read(vim->vim_objset, vim->vim_object, old_size,
484 	    new_size - old_size, &vim->vim_entries[old_count],
485 	    DMU_READ_PREFETCH));
486 
487 	zfs_dbgmsg("txg %llu: wrote %llu entries to "
488 	    "indirect mapping obj %llu; max offset=0x%llx",
489 	    (u_longlong_t)dmu_tx_get_txg(tx),
490 	    (u_longlong_t)entries_written,
491 	    (u_longlong_t)vim->vim_object,
492 	    (u_longlong_t)vim->vim_phys->vimp_max_offset);
493 }
494 
495 /*
496  * Increment the relevant counts for the specified offset and length.
497  * The counts array must be obtained from
498  * vdev_indirect_mapping_load_obsolete_counts().
499  */
500 void
501 vdev_indirect_mapping_increment_obsolete_count(vdev_indirect_mapping_t *vim,
502     uint64_t offset, uint64_t length, uint32_t *counts)
503 {
504 	vdev_indirect_mapping_entry_phys_t *mapping;
505 	uint64_t index;
506 
507 	mapping = vdev_indirect_mapping_entry_for_offset(vim,  offset);
508 
509 	ASSERT(length > 0);
510 	ASSERT3P(mapping, !=, NULL);
511 
512 	index = mapping - vim->vim_entries;
513 
514 	while (length > 0) {
515 		ASSERT3U(index, <, vdev_indirect_mapping_num_entries(vim));
516 
517 		uint64_t size = DVA_GET_ASIZE(&mapping->vimep_dst);
518 		uint64_t inner_offset = offset -
519 		    DVA_MAPPING_GET_SRC_OFFSET(mapping);
520 		VERIFY3U(inner_offset, <, size);
521 		uint64_t inner_size = MIN(length, size - inner_offset);
522 
523 		VERIFY3U(counts[index] + inner_size, <=, size);
524 		counts[index] += inner_size;
525 
526 		offset += inner_size;
527 		length -= inner_size;
528 		mapping++;
529 		index++;
530 	}
531 }
532 
/* Argument bundle passed through space_map_iterate() to the callback. */
typedef struct load_obsolete_space_map_arg {
	vdev_indirect_mapping_t	*losma_vim;	/* mapping being updated */
	uint32_t		*losma_counts;	/* per-entry obsolete counts */
} load_obsolete_space_map_arg_t;
537 
538 static int
539 load_obsolete_sm_callback(space_map_entry_t *sme, void *arg)
540 {
541 	load_obsolete_space_map_arg_t *losma = arg;
542 	ASSERT3S(sme->sme_type, ==, SM_ALLOC);
543 
544 	vdev_indirect_mapping_increment_obsolete_count(losma->losma_vim,
545 	    sme->sme_offset, sme->sme_run, losma->losma_counts);
546 
547 	return (0);
548 }
549 
550 /*
551  * Modify the counts (increment them) based on the spacemap.
552  */
553 void
554 vdev_indirect_mapping_load_obsolete_spacemap(vdev_indirect_mapping_t *vim,
555     uint32_t *counts, space_map_t *obsolete_space_sm)
556 {
557 	load_obsolete_space_map_arg_t losma;
558 	losma.losma_counts = counts;
559 	losma.losma_vim = vim;
560 	VERIFY0(space_map_iterate(obsolete_space_sm,
561 	    load_obsolete_sm_callback, &losma));
562 }
563 
564 /*
565  * Read the obsolete counts from disk, returning them in an array.
566  */
567 uint32_t *
568 vdev_indirect_mapping_load_obsolete_counts(vdev_indirect_mapping_t *vim)
569 {
570 	ASSERT(vdev_indirect_mapping_verify(vim));
571 
572 	uint64_t counts_size =
573 	    vim->vim_phys->vimp_num_entries * sizeof (uint32_t);
574 	uint32_t *counts = kmem_alloc(counts_size, KM_SLEEP);
575 	if (vim->vim_havecounts) {
576 		VERIFY0(dmu_read(vim->vim_objset,
577 		    vim->vim_phys->vimp_counts_object,
578 		    0, counts_size,
579 		    counts, DMU_READ_PREFETCH));
580 	} else {
581 		bzero(counts, counts_size);
582 	}
583 	return (counts);
584 }
585 
586 extern void
587 vdev_indirect_mapping_free_obsolete_counts(vdev_indirect_mapping_t *vim,
588     uint32_t *counts)
589 {
590 	ASSERT(vdev_indirect_mapping_verify(vim));
591 
592 	kmem_free(counts, vim->vim_phys->vimp_num_entries * sizeof (uint32_t));
593 }
594