xref: /freebsd/sys/contrib/openzfs/module/zfs/abd.c (revision 3a56015a2f5d630910177fa79a522bb95511ccf7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or https://opensource.org/licenses/CDDL-1.0.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
23  * Copyright (c) 2019 by Delphix. All rights reserved.
24  */
25 
26 /*
27  * ARC buffer data (ABD).
28  *
29  * ABDs are an abstract data structure for the ARC which can use two
30  * different ways of storing the underlying data:
31  *
32  * (a) Linear buffer. In this case, all the data in the ABD is stored in one
33  *     contiguous buffer in memory (from a zio_[data_]buf_* kmem cache).
34  *
35  *         +-------------------+
36  *         | ABD (linear)      |
37  *         |   abd_flags = ... |
38  *         |   abd_size = ...  |     +--------------------------------+
39  *         |   abd_buf ------------->| raw buffer of size abd_size    |
40  *         +-------------------+     +--------------------------------+
41  *              no abd_chunks
42  *
43  * (b) Scattered buffer. In this case, the data in the ABD is split into
44  *     equal-sized chunks (from the abd_chunk_cache kmem_cache), with pointers
45  *     to the chunks recorded in an array at the end of the ABD structure.
46  *
47  *         +-------------------+
48  *         | ABD (scattered)   |
49  *         |   abd_flags = ... |
50  *         |   abd_size = ...  |
51  *         |   abd_offset = 0  |                           +-----------+
52  *         |   abd_chunks[0] ----------------------------->| chunk 0   |
53  *         |   abd_chunks[1] ---------------------+        +-----------+
54  *         |   ...             |                  |        +-----------+
55  *         |   abd_chunks[N-1] ---------+         +------->| chunk 1   |
56  *         +-------------------+        |                  +-----------+
57  *                                      |                      ...
58  *                                      |                  +-----------+
59  *                                      +----------------->| chunk N-1 |
60  *                                                         +-----------+
61  *
62  * In addition to directly allocating a linear or scattered ABD, it is also
63  * possible to create an ABD by requesting the "sub-ABD" starting at an offset
64  * within an existing ABD. In linear buffers this is simple (set abd_buf of
65  * the new ABD to the starting point within the original raw buffer), but
66  * scattered ABDs are a little more complex. The new ABD makes a copy of the
67  * relevant abd_chunks pointers (but not the underlying data). However, to
68  * provide arbitrary rather than only chunk-aligned starting offsets, it also
69  * tracks an abd_offset field which represents the starting point of the data
70  * within the first chunk in abd_chunks. For both linear and scattered ABDs,
71  * creating an offset ABD marks the original ABD as the offset's parent, and the
72  * original ABD's abd_children refcount is incremented. This data allows us to
73  * ensure the root ABD isn't deleted before its children.
74  *
75  * Most consumers should never need to know what type of ABD they're using --
76  * the ABD public API ensures that it's possible to transparently switch from
77  * using a linear ABD to a scattered one when doing so would be beneficial.
78  *
79  * If you need to use the data within an ABD directly, if you know it's linear
80  * (because you allocated it) you can use abd_to_buf() to access the underlying
81  * raw buffer. Otherwise, you should use one of the abd_borrow_buf* functions
82  * which will allocate a raw buffer if necessary. Use the abd_return_buf*
83  * functions to return any raw buffers that are no longer necessary when you're
84  * done using them.
85  *
86  * There are a variety of ABD APIs that implement basic buffer operations:
87  * compare, copy, read, write, and fill with zeroes. If you need a custom
88  * function which progressively accesses the whole ABD, use the abd_iterate_*
89  * functions.
90  *
91  * As an additional feature, linear and scatter ABD's can be stitched together
92  * by using the gang ABD type (abd_alloc_gang()). This allows for multiple ABDs
93  * to be viewed as a singular ABD.
94  *
95  * It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled to
96  * B_FALSE.
97  */
98 
99 #include <sys/abd_impl.h>
100 #include <sys/param.h>
101 #include <sys/zio.h>
102 #include <sys/zfs_context.h>
103 #include <sys/zfs_znode.h>
104 
/*
 * Controls whether scattered ABDs are used. Initialized to B_TRUE here; per
 * the block comment at the top of this file, setting it to B_FALSE makes all
 * ABDs linear.
 */
int zfs_abd_scatter_enabled = B_TRUE;
107 
108 void
109 abd_verify(abd_t *abd)
110 {
111 #ifdef ZFS_DEBUG
112 	if (abd_is_from_pages(abd)) {
113 		ASSERT3U(abd->abd_size, <=, DMU_MAX_ACCESS);
114 	} else {
115 		ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE);
116 	}
117 	ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |
118 	    ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE |
119 	    ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE | ABD_FLAG_GANG |
120 	    ABD_FLAG_GANG_FREE | ABD_FLAG_ALLOCD | ABD_FLAG_FROM_PAGES));
121 	IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));
122 	IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);
123 	if (abd_is_linear(abd)) {
124 		ASSERT3U(abd->abd_size, >, 0);
125 		ASSERT3P(ABD_LINEAR_BUF(abd), !=, NULL);
126 	} else if (abd_is_gang(abd)) {
127 		uint_t child_sizes = 0;
128 		for (abd_t *cabd = list_head(&ABD_GANG(abd).abd_gang_chain);
129 		    cabd != NULL;
130 		    cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
131 			ASSERT(list_link_active(&cabd->abd_gang_link));
132 			child_sizes += cabd->abd_size;
133 			abd_verify(cabd);
134 		}
135 		ASSERT3U(abd->abd_size, ==, child_sizes);
136 	} else {
137 		ASSERT3U(abd->abd_size, >, 0);
138 		abd_verify_scatter(abd);
139 	}
140 #endif
141 }
142 
143 void
144 abd_init_struct(abd_t *abd)
145 {
146 	list_link_init(&abd->abd_gang_link);
147 	mutex_init(&abd->abd_mtx, NULL, MUTEX_DEFAULT, NULL);
148 	abd->abd_flags = 0;
149 #ifdef ZFS_DEBUG
150 	zfs_refcount_create(&abd->abd_children);
151 	abd->abd_parent = NULL;
152 #endif
153 	abd->abd_size = 0;
154 }
155 
156 static void
157 abd_fini_struct(abd_t *abd)
158 {
159 	mutex_destroy(&abd->abd_mtx);
160 	ASSERT(!list_link_active(&abd->abd_gang_link));
161 #ifdef ZFS_DEBUG
162 	zfs_refcount_destroy(&abd->abd_children);
163 #endif
164 }
165 
166 abd_t *
167 abd_alloc_struct(size_t size)
168 {
169 	abd_t *abd = abd_alloc_struct_impl(size);
170 	abd_init_struct(abd);
171 	abd->abd_flags |= ABD_FLAG_ALLOCD;
172 	return (abd);
173 }
174 
175 void
176 abd_free_struct(abd_t *abd)
177 {
178 	abd_fini_struct(abd);
179 	abd_free_struct_impl(abd);
180 }
181 
182 /*
183  * Allocate an ABD, along with its own underlying data buffers. Use this if you
184  * don't care whether the ABD is linear or not.
185  */
186 abd_t *
187 abd_alloc(size_t size, boolean_t is_metadata)
188 {
189 	if (abd_size_alloc_linear(size))
190 		return (abd_alloc_linear(size, is_metadata));
191 
192 	VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
193 
194 	abd_t *abd = abd_alloc_struct(size);
195 	abd->abd_flags |= ABD_FLAG_OWNER;
196 	abd->abd_u.abd_scatter.abd_offset = 0;
197 	abd_alloc_chunks(abd, size);
198 
199 	if (is_metadata) {
200 		abd->abd_flags |= ABD_FLAG_META;
201 	}
202 	abd->abd_size = size;
203 
204 	abd_update_scatter_stats(abd, ABDSTAT_INCR);
205 
206 	return (abd);
207 }
208 
209 /*
210  * Allocate an ABD that must be linear, along with its own underlying data
211  * buffer. Only use this when it would be very annoying to write your ABD
212  * consumer with a scattered ABD.
213  */
214 abd_t *
215 abd_alloc_linear(size_t size, boolean_t is_metadata)
216 {
217 	abd_t *abd = abd_alloc_struct(0);
218 
219 	VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
220 
221 	abd->abd_flags |= ABD_FLAG_LINEAR | ABD_FLAG_OWNER;
222 	if (is_metadata) {
223 		abd->abd_flags |= ABD_FLAG_META;
224 	}
225 	abd->abd_size = size;
226 
227 	if (is_metadata) {
228 		ABD_LINEAR_BUF(abd) = zio_buf_alloc(size);
229 	} else {
230 		ABD_LINEAR_BUF(abd) = zio_data_buf_alloc(size);
231 	}
232 
233 	abd_update_linear_stats(abd, ABDSTAT_INCR);
234 
235 	return (abd);
236 }
237 
238 static void
239 abd_free_linear(abd_t *abd)
240 {
241 	if (abd_is_linear_page(abd)) {
242 		abd_free_linear_page(abd);
243 		return;
244 	}
245 
246 	if (abd->abd_flags & ABD_FLAG_META) {
247 		zio_buf_free(ABD_LINEAR_BUF(abd), abd->abd_size);
248 	} else {
249 		zio_data_buf_free(ABD_LINEAR_BUF(abd), abd->abd_size);
250 	}
251 
252 	abd_update_linear_stats(abd, ABDSTAT_DECR);
253 }
254 
255 static void
256 abd_free_gang(abd_t *abd)
257 {
258 	ASSERT(abd_is_gang(abd));
259 	abd_t *cabd;
260 
261 	while ((cabd = list_head(&ABD_GANG(abd).abd_gang_chain)) != NULL) {
262 		/*
263 		 * We must acquire the child ABDs mutex to ensure that if it
264 		 * is being added to another gang ABD we will set the link
265 		 * as inactive when removing it from this gang ABD and before
266 		 * adding it to the other gang ABD.
267 		 */
268 		mutex_enter(&cabd->abd_mtx);
269 		ASSERT(list_link_active(&cabd->abd_gang_link));
270 		list_remove(&ABD_GANG(abd).abd_gang_chain, cabd);
271 		mutex_exit(&cabd->abd_mtx);
272 		if (cabd->abd_flags & ABD_FLAG_GANG_FREE)
273 			abd_free(cabd);
274 	}
275 	list_destroy(&ABD_GANG(abd).abd_gang_chain);
276 }
277 
278 static void
279 abd_free_scatter(abd_t *abd)
280 {
281 	abd_free_chunks(abd);
282 	abd_update_scatter_stats(abd, ABDSTAT_DECR);
283 }
284 
285 /*
286  * Free an ABD.  Use with any kind of abd: those created with abd_alloc_*()
287  * and abd_get_*(), including abd_get_offset_struct().
288  *
289  * If the ABD was created with abd_alloc_*(), the underlying data
290  * (scatterlist or linear buffer) will also be freed.  (Subject to ownership
291  * changes via abd_*_ownership_of_buf().)
292  *
293  * Unless the ABD was created with abd_get_offset_struct(), the abd_t will
294  * also be freed.
295  */
296 void
297 abd_free(abd_t *abd)
298 {
299 	if (abd == NULL)
300 		return;
301 
302 	abd_verify(abd);
303 #ifdef ZFS_DEBUG
304 	IMPLY(abd->abd_flags & ABD_FLAG_OWNER, abd->abd_parent == NULL);
305 #endif
306 
307 	if (abd_is_gang(abd)) {
308 		abd_free_gang(abd);
309 	} else if (abd_is_linear(abd)) {
310 		if (abd->abd_flags & ABD_FLAG_OWNER)
311 			abd_free_linear(abd);
312 	} else {
313 		if (abd->abd_flags & ABD_FLAG_OWNER)
314 			abd_free_scatter(abd);
315 	}
316 
317 #ifdef ZFS_DEBUG
318 	if (abd->abd_parent != NULL) {
319 		(void) zfs_refcount_remove_many(&abd->abd_parent->abd_children,
320 		    abd->abd_size, abd);
321 	}
322 #endif
323 
324 	abd_fini_struct(abd);
325 	if (abd->abd_flags & ABD_FLAG_ALLOCD)
326 		abd_free_struct_impl(abd);
327 }
328 
329 /*
330  * Allocate an ABD of the same format (same metadata flag, same scatterize
331  * setting) as another ABD.
332  */
333 abd_t *
334 abd_alloc_sametype(abd_t *sabd, size_t size)
335 {
336 	boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0;
337 	if (abd_is_linear(sabd) &&
338 	    !abd_is_linear_page(sabd)) {
339 		return (abd_alloc_linear(size, is_metadata));
340 	} else {
341 		return (abd_alloc(size, is_metadata));
342 	}
343 }
344 
345 /*
346  * Create gang ABD that will be the head of a list of ABD's. This is used
347  * to "chain" scatter/gather lists together when constructing aggregated
348  * IO's. To free this abd, abd_free() must be called.
349  */
350 abd_t *
351 abd_alloc_gang(void)
352 {
353 	abd_t *abd = abd_alloc_struct(0);
354 	abd->abd_flags |= ABD_FLAG_GANG | ABD_FLAG_OWNER;
355 	list_create(&ABD_GANG(abd).abd_gang_chain,
356 	    sizeof (abd_t), offsetof(abd_t, abd_gang_link));
357 	return (abd);
358 }
359 
360 /*
361  * Add a child gang ABD to a parent gang ABDs chained list.
362  */
363 static void
364 abd_gang_add_gang(abd_t *pabd, abd_t *cabd, boolean_t free_on_free)
365 {
366 	ASSERT(abd_is_gang(pabd));
367 	ASSERT(abd_is_gang(cabd));
368 
369 	if (free_on_free) {
370 		/*
371 		 * If the parent is responsible for freeing the child gang
372 		 * ABD we will just splice the child's children ABD list to
373 		 * the parent's list and immediately free the child gang ABD
374 		 * struct. The parent gang ABDs children from the child gang
375 		 * will retain all the free_on_free settings after being
376 		 * added to the parents list.
377 		 */
378 #ifdef ZFS_DEBUG
379 		/*
380 		 * If cabd had abd_parent, we have to drop it here.  We can't
381 		 * transfer it to pabd, nor we can clear abd_size leaving it.
382 		 */
383 		if (cabd->abd_parent != NULL) {
384 			(void) zfs_refcount_remove_many(
385 			    &cabd->abd_parent->abd_children,
386 			    cabd->abd_size, cabd);
387 			cabd->abd_parent = NULL;
388 		}
389 #endif
390 		pabd->abd_size += cabd->abd_size;
391 		cabd->abd_size = 0;
392 		list_move_tail(&ABD_GANG(pabd).abd_gang_chain,
393 		    &ABD_GANG(cabd).abd_gang_chain);
394 		ASSERT(list_is_empty(&ABD_GANG(cabd).abd_gang_chain));
395 		abd_verify(pabd);
396 		abd_free(cabd);
397 	} else {
398 		for (abd_t *child = list_head(&ABD_GANG(cabd).abd_gang_chain);
399 		    child != NULL;
400 		    child = list_next(&ABD_GANG(cabd).abd_gang_chain, child)) {
401 			/*
402 			 * We always pass B_FALSE for free_on_free as it is the
403 			 * original child gang ABDs responsibility to determine
404 			 * if any of its child ABDs should be free'd on the call
405 			 * to abd_free().
406 			 */
407 			abd_gang_add(pabd, child, B_FALSE);
408 		}
409 		abd_verify(pabd);
410 	}
411 }
412 
413 /*
414  * Add a child ABD to a gang ABD's chained list.
415  */
416 void
417 abd_gang_add(abd_t *pabd, abd_t *cabd, boolean_t free_on_free)
418 {
419 	ASSERT(abd_is_gang(pabd));
420 	abd_t *child_abd = NULL;
421 
422 	/*
423 	 * If the child being added is a gang ABD, we will add the
424 	 * child's ABDs to the parent gang ABD. This allows us to account
425 	 * for the offset correctly in the parent gang ABD.
426 	 */
427 	if (abd_is_gang(cabd)) {
428 		ASSERT(!list_link_active(&cabd->abd_gang_link));
429 		return (abd_gang_add_gang(pabd, cabd, free_on_free));
430 	}
431 	ASSERT(!abd_is_gang(cabd));
432 
433 	/*
434 	 * In order to verify that an ABD is not already part of
435 	 * another gang ABD, we must lock the child ABD's abd_mtx
436 	 * to check its abd_gang_link status. We unlock the abd_mtx
437 	 * only after it is has been added to a gang ABD, which
438 	 * will update the abd_gang_link's status. See comment below
439 	 * for how an ABD can be in multiple gang ABD's simultaneously.
440 	 */
441 	mutex_enter(&cabd->abd_mtx);
442 	if (list_link_active(&cabd->abd_gang_link)) {
443 		/*
444 		 * If the child ABD is already part of another
445 		 * gang ABD then we must allocate a new
446 		 * ABD to use a separate link. We mark the newly
447 		 * allocated ABD with ABD_FLAG_GANG_FREE, before
448 		 * adding it to the gang ABD's list, to make the
449 		 * gang ABD aware that it is responsible to call
450 		 * abd_free(). We use abd_get_offset() in order
451 		 * to just allocate a new ABD but avoid copying the
452 		 * data over into the newly allocated ABD.
453 		 *
454 		 * An ABD may become part of multiple gang ABD's. For
455 		 * example, when writing ditto bocks, the same ABD
456 		 * is used to write 2 or 3 locations with 2 or 3
457 		 * zio_t's. Each of the zio's may be aggregated with
458 		 * different adjacent zio's. zio aggregation uses gang
459 		 * zio's, so the single ABD can become part of multiple
460 		 * gang zio's.
461 		 *
462 		 * The ASSERT below is to make sure that if
463 		 * free_on_free is passed as B_TRUE, the ABD can
464 		 * not be in multiple gang ABD's. The gang ABD
465 		 * can not be responsible for cleaning up the child
466 		 * ABD memory allocation if the ABD can be in
467 		 * multiple gang ABD's at one time.
468 		 */
469 		ASSERT3B(free_on_free, ==, B_FALSE);
470 		child_abd = abd_get_offset(cabd, 0);
471 		child_abd->abd_flags |= ABD_FLAG_GANG_FREE;
472 	} else {
473 		child_abd = cabd;
474 		if (free_on_free)
475 			child_abd->abd_flags |= ABD_FLAG_GANG_FREE;
476 	}
477 	ASSERT3P(child_abd, !=, NULL);
478 
479 	list_insert_tail(&ABD_GANG(pabd).abd_gang_chain, child_abd);
480 	mutex_exit(&cabd->abd_mtx);
481 	pabd->abd_size += child_abd->abd_size;
482 }
483 
484 /*
485  * Locate the ABD for the supplied offset in the gang ABD.
486  * Return a new offset relative to the returned ABD.
487  */
488 abd_t *
489 abd_gang_get_offset(abd_t *abd, size_t *off)
490 {
491 	abd_t *cabd;
492 
493 	ASSERT(abd_is_gang(abd));
494 	ASSERT3U(*off, <, abd->abd_size);
495 	for (cabd = list_head(&ABD_GANG(abd).abd_gang_chain); cabd != NULL;
496 	    cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
497 		if (*off >= cabd->abd_size)
498 			*off -= cabd->abd_size;
499 		else
500 			return (cabd);
501 	}
502 	VERIFY3P(cabd, !=, NULL);
503 	return (cabd);
504 }
505 
506 /*
507  * Allocate a new ABD, using the provided struct (if non-NULL, and if
508  * circumstances allow - otherwise allocate the struct).  The returned ABD will
509  * point to offset off of sabd. It shares the underlying buffer data with sabd.
510  * Use abd_free() to free.  sabd must not be freed while any derived ABDs exist.
511  */
512 static abd_t *
513 abd_get_offset_impl(abd_t *abd, abd_t *sabd, size_t off, size_t size)
514 {
515 	abd_verify(sabd);
516 	ASSERT3U(off + size, <=, sabd->abd_size);
517 
518 	if (abd_is_linear(sabd)) {
519 		if (abd == NULL)
520 			abd = abd_alloc_struct(0);
521 		/*
522 		 * Even if this buf is filesystem metadata, we only track that
523 		 * if we own the underlying data buffer, which is not true in
524 		 * this case. Therefore, we don't ever use ABD_FLAG_META here.
525 		 */
526 		abd->abd_flags |= ABD_FLAG_LINEAR;
527 
528 		/*
529 		 * User pages from Direct I/O requests may be in a single page
530 		 * (ABD_FLAG_LINEAR_PAGE), and we must make sure to still flag
531 		 * that here for abd. This is required because we have to be
532 		 * careful when borrowing the buffer from the ABD because we
533 		 * can not place user pages under write protection on Linux.
534 		 * See the comments in abd_os.c for abd_borrow_buf(),
535 		 * abd_borrow_buf_copy(), abd_return_buf() and
536 		 * abd_return_buf_copy().
537 		 */
538 		if (abd_is_from_pages(sabd)) {
539 			abd->abd_flags |= ABD_FLAG_FROM_PAGES |
540 			    ABD_FLAG_LINEAR_PAGE;
541 		}
542 
543 		ABD_LINEAR_BUF(abd) = (char *)ABD_LINEAR_BUF(sabd) + off;
544 	} else if (abd_is_gang(sabd)) {
545 		size_t left = size;
546 		if (abd == NULL) {
547 			abd = abd_alloc_gang();
548 		} else {
549 			abd->abd_flags |= ABD_FLAG_GANG;
550 			list_create(&ABD_GANG(abd).abd_gang_chain,
551 			    sizeof (abd_t), offsetof(abd_t, abd_gang_link));
552 		}
553 
554 		abd->abd_flags &= ~ABD_FLAG_OWNER;
555 		for (abd_t *cabd = abd_gang_get_offset(sabd, &off);
556 		    cabd != NULL && left > 0;
557 		    cabd = list_next(&ABD_GANG(sabd).abd_gang_chain, cabd)) {
558 			int csize = MIN(left, cabd->abd_size - off);
559 
560 			abd_t *nabd = abd_get_offset_size(cabd, off, csize);
561 			abd_gang_add(abd, nabd, B_TRUE);
562 			left -= csize;
563 			off = 0;
564 		}
565 		ASSERT3U(left, ==, 0);
566 	} else {
567 		abd = abd_get_offset_scatter(abd, sabd, off, size);
568 	}
569 
570 	ASSERT3P(abd, !=, NULL);
571 	abd->abd_size = size;
572 #ifdef ZFS_DEBUG
573 	abd->abd_parent = sabd;
574 	(void) zfs_refcount_add_many(&sabd->abd_children, abd->abd_size, abd);
575 #endif
576 	return (abd);
577 }
578 
579 /*
580  * Like abd_get_offset_size(), but memory for the abd_t is provided by the
581  * caller.  Using this routine can improve performance by avoiding the cost
582  * of allocating memory for the abd_t struct, and updating the abd stats.
583  * Usually, the provided abd is returned, but in some circumstances (FreeBSD,
584  * if sabd is scatter and size is more than 2 pages) a new abd_t may need to
585  * be allocated.  Therefore callers should be careful to use the returned
586  * abd_t*.
587  */
588 abd_t *
589 abd_get_offset_struct(abd_t *abd, abd_t *sabd, size_t off, size_t size)
590 {
591 	abd_t *result;
592 	abd_init_struct(abd);
593 	result = abd_get_offset_impl(abd, sabd, off, size);
594 	if (result != abd)
595 		abd_fini_struct(abd);
596 	return (result);
597 }
598 
599 abd_t *
600 abd_get_offset(abd_t *sabd, size_t off)
601 {
602 	size_t size = sabd->abd_size > off ? sabd->abd_size - off : 0;
603 	VERIFY3U(size, >, 0);
604 	return (abd_get_offset_impl(NULL, sabd, off, size));
605 }
606 
607 abd_t *
608 abd_get_offset_size(abd_t *sabd, size_t off, size_t size)
609 {
610 	ASSERT3U(off + size, <=, sabd->abd_size);
611 	return (abd_get_offset_impl(NULL, sabd, off, size));
612 }
613 
614 /*
615  * Return a size scatter ABD containing only zeros.
616  */
617 abd_t *
618 abd_get_zeros(size_t size)
619 {
620 	ASSERT3P(abd_zero_scatter, !=, NULL);
621 	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
622 	return (abd_get_offset_size(abd_zero_scatter, 0, size));
623 }
624 
625 /*
626  * Create a linear ABD for an existing buf.
627  */
628 static abd_t *
629 abd_get_from_buf_impl(abd_t *abd, void *buf, size_t size)
630 {
631 	VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
632 
633 	/*
634 	 * Even if this buf is filesystem metadata, we only track that if we
635 	 * own the underlying data buffer, which is not true in this case.
636 	 * Therefore, we don't ever use ABD_FLAG_META here.
637 	 */
638 	abd->abd_flags |= ABD_FLAG_LINEAR;
639 	abd->abd_size = size;
640 
641 	ABD_LINEAR_BUF(abd) = buf;
642 
643 	return (abd);
644 }
645 
646 abd_t *
647 abd_get_from_buf(void *buf, size_t size)
648 {
649 	abd_t *abd = abd_alloc_struct(0);
650 	return (abd_get_from_buf_impl(abd, buf, size));
651 }
652 
653 abd_t *
654 abd_get_from_buf_struct(abd_t *abd, void *buf, size_t size)
655 {
656 	abd_init_struct(abd);
657 	return (abd_get_from_buf_impl(abd, buf, size));
658 }
659 
660 /*
661  * Get the raw buffer associated with a linear ABD.
662  */
663 void *
664 abd_to_buf(abd_t *abd)
665 {
666 	ASSERT(abd_is_linear(abd));
667 	abd_verify(abd);
668 	return (ABD_LINEAR_BUF(abd));
669 }
670 
671 void
672 abd_release_ownership_of_buf(abd_t *abd)
673 {
674 	ASSERT(abd_is_linear(abd));
675 	ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
676 
677 	/*
678 	 * abd_free() needs to handle LINEAR_PAGE ABD's specially.
679 	 * Since that flag does not survive the
680 	 * abd_release_ownership_of_buf() -> abd_get_from_buf() ->
681 	 * abd_take_ownership_of_buf() sequence, we don't allow releasing
682 	 * these "linear but not zio_[data_]buf_alloc()'ed" ABD's.
683 	 */
684 	ASSERT(!abd_is_linear_page(abd));
685 
686 	abd_verify(abd);
687 
688 	abd->abd_flags &= ~ABD_FLAG_OWNER;
689 	/* Disable this flag since we no longer own the data buffer */
690 	abd->abd_flags &= ~ABD_FLAG_META;
691 
692 	abd_update_linear_stats(abd, ABDSTAT_DECR);
693 }
694 
695 
696 /*
697  * Give this ABD ownership of the buffer that it's storing. Can only be used on
698  * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated
699  * with abd_alloc_linear() which subsequently released ownership of their buf
700  * with abd_release_ownership_of_buf().
701  */
702 void
703 abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata)
704 {
705 	ASSERT(abd_is_linear(abd));
706 	ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER));
707 	abd_verify(abd);
708 
709 	abd->abd_flags |= ABD_FLAG_OWNER;
710 	if (is_metadata) {
711 		abd->abd_flags |= ABD_FLAG_META;
712 	}
713 
714 	abd_update_linear_stats(abd, ABDSTAT_INCR);
715 }
716 
717 /*
718  * Initializes an abd_iter based on whether the abd is a gang ABD
719  * or just a single ABD.
720  */
721 static inline abd_t *
722 abd_init_abd_iter(abd_t *abd, struct abd_iter *aiter, size_t off)
723 {
724 	abd_t *cabd = NULL;
725 
726 	if (abd_is_gang(abd)) {
727 		cabd = abd_gang_get_offset(abd, &off);
728 		if (cabd) {
729 			abd_iter_init(aiter, cabd);
730 			abd_iter_advance(aiter, off);
731 		}
732 	} else {
733 		abd_iter_init(aiter, abd);
734 		abd_iter_advance(aiter, off);
735 	}
736 	return (cabd);
737 }
738 
739 /*
740  * Advances an abd_iter. We have to be careful with gang ABD as
741  * advancing could mean that we are at the end of a particular ABD and
742  * must grab the ABD in the gang ABD's list.
743  */
744 static inline abd_t *
745 abd_advance_abd_iter(abd_t *abd, abd_t *cabd, struct abd_iter *aiter,
746     size_t len)
747 {
748 	abd_iter_advance(aiter, len);
749 	if (abd_is_gang(abd) && abd_iter_at_end(aiter)) {
750 		ASSERT3P(cabd, !=, NULL);
751 		cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd);
752 		if (cabd) {
753 			abd_iter_init(aiter, cabd);
754 			abd_iter_advance(aiter, 0);
755 		}
756 	}
757 	return (cabd);
758 }
759 
760 int
761 abd_iterate_func(abd_t *abd, size_t off, size_t size,
762     abd_iter_func_t *func, void *private)
763 {
764 	struct abd_iter aiter;
765 	int ret = 0;
766 
767 	if (size == 0)
768 		return (0);
769 
770 	abd_verify(abd);
771 	ASSERT3U(off + size, <=, abd->abd_size);
772 
773 	abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off);
774 
775 	while (size > 0) {
776 		IMPLY(abd_is_gang(abd), c_abd != NULL);
777 
778 		abd_iter_map(&aiter);
779 
780 		size_t len = MIN(aiter.iter_mapsize, size);
781 		ASSERT3U(len, >, 0);
782 
783 		ret = func(aiter.iter_mapaddr, len, private);
784 
785 		abd_iter_unmap(&aiter);
786 
787 		if (ret != 0)
788 			break;
789 
790 		size -= len;
791 		c_abd = abd_advance_abd_iter(abd, c_abd, &aiter, len);
792 	}
793 
794 	return (ret);
795 }
796 
#if defined(__linux__) && defined(_KERNEL)
/*
 * Page-based counterpart of abd_iterate_func(): func is called with the
 * page, the data offset inside it and the data length for each piece of the
 * byte range [off, off + size). Iteration stops at the first non-zero return
 * from func, and that value is returned. A zero size is a no-op.
 */
int
abd_iterate_page_func(abd_t *abd, size_t off, size_t size,
    abd_iter_page_func_t *func, void *private)
{
	struct abd_iter it;
	abd_t *cur;
	int ret = 0;

	if (size == 0)
		return (0);

	abd_verify(abd);
	ASSERT3U(off + size, <=, abd->abd_size);

	cur = abd_init_abd_iter(abd, &it, off);

	while (ret == 0 && size > 0) {
		size_t len;

		IMPLY(abd_is_gang(abd), cur != NULL);

		abd_iter_page(&it);

		len = MIN(it.iter_page_dsize, size);
		ASSERT3U(len, >, 0);

		ret = func(it.iter_page, it.iter_page_doff,
		    len, private);

		/* Reset the page fields of the iterator after each call. */
		it.iter_page = NULL;
		it.iter_page_doff = 0;
		it.iter_page_dsize = 0;

		if (ret == 0) {
			size -= len;
			cur = abd_advance_abd_iter(abd, cur, &it, len);
		}
	}

	return (ret);
}
#endif
838 
/*
 * Cursor handed to the abd_*_buf_off callbacks below through their private
 * argument. arg_buf is the current position in the caller's flat buffer;
 * each callback advances it by the number of bytes it processed.
 */
struct buf_arg {
	void *arg_buf;
};
842 
843 static int
844 abd_copy_to_buf_off_cb(void *buf, size_t size, void *private)
845 {
846 	struct buf_arg *ba_ptr = private;
847 
848 	(void) memcpy(ba_ptr->arg_buf, buf, size);
849 	ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
850 
851 	return (0);
852 }
853 
854 /*
855  * Copy abd to buf. (off is the offset in abd.)
856  */
857 void
858 abd_copy_to_buf_off(void *buf, abd_t *abd, size_t off, size_t size)
859 {
860 	struct buf_arg ba_ptr = { buf };
861 
862 	(void) abd_iterate_func(abd, off, size, abd_copy_to_buf_off_cb,
863 	    &ba_ptr);
864 }
865 
866 static int
867 abd_cmp_buf_off_cb(void *buf, size_t size, void *private)
868 {
869 	int ret;
870 	struct buf_arg *ba_ptr = private;
871 
872 	ret = memcmp(buf, ba_ptr->arg_buf, size);
873 	ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
874 
875 	return (ret);
876 }
877 
878 /*
879  * Compare the contents of abd to buf. (off is the offset in abd.)
880  */
881 int
882 abd_cmp_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
883 {
884 	struct buf_arg ba_ptr = { (void *) buf };
885 
886 	return (abd_iterate_func(abd, off, size, abd_cmp_buf_off_cb, &ba_ptr));
887 }
888 
889 static int
890 abd_copy_from_buf_off_cb(void *buf, size_t size, void *private)
891 {
892 	struct buf_arg *ba_ptr = private;
893 
894 	(void) memcpy(buf, ba_ptr->arg_buf, size);
895 	ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
896 
897 	return (0);
898 }
899 
900 /*
901  * Copy from buf to abd. (off is the offset in abd.)
902  */
903 void
904 abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
905 {
906 	struct buf_arg ba_ptr = { (void *) buf };
907 
908 	(void) abd_iterate_func(abd, off, size, abd_copy_from_buf_off_cb,
909 	    &ba_ptr);
910 }
911 
/*
 * abd_iterate_func() callback: fill the size bytes at buf with zeroes.
 * private is unused. Always returns 0.
 */
static int
abd_zero_off_cb(void *buf, size_t size, void *private)
{
	(void) private;

	(void) memset(buf, 0, size);

	return (0);
}
919 
920 /*
921  * Zero out the abd from a particular offset to the end.
922  */
923 void
924 abd_zero_off(abd_t *abd, size_t off, size_t size)
925 {
926 	(void) abd_iterate_func(abd, off, size, abd_zero_off_cb, NULL);
927 }
928 
929 /*
930  * Iterate over two ABDs and call func incrementally on the two ABDs' data in
931  * equal-sized chunks (passed to func as raw buffers). func could be called many
932  * times during this iteration.
933  */
934 int
935 abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff,
936     size_t size, abd_iter_func2_t *func, void *private)
937 {
938 	int ret = 0;
939 	struct abd_iter daiter, saiter;
940 	abd_t *c_dabd, *c_sabd;
941 
942 	if (size == 0)
943 		return (0);
944 
945 	abd_verify(dabd);
946 	abd_verify(sabd);
947 
948 	ASSERT3U(doff + size, <=, dabd->abd_size);
949 	ASSERT3U(soff + size, <=, sabd->abd_size);
950 
951 	c_dabd = abd_init_abd_iter(dabd, &daiter, doff);
952 	c_sabd = abd_init_abd_iter(sabd, &saiter, soff);
953 
954 	while (size > 0) {
955 		IMPLY(abd_is_gang(dabd), c_dabd != NULL);
956 		IMPLY(abd_is_gang(sabd), c_sabd != NULL);
957 
958 		abd_iter_map(&daiter);
959 		abd_iter_map(&saiter);
960 
961 		size_t dlen = MIN(daiter.iter_mapsize, size);
962 		size_t slen = MIN(saiter.iter_mapsize, size);
963 		size_t len = MIN(dlen, slen);
964 		ASSERT(dlen > 0 || slen > 0);
965 
966 		ret = func(daiter.iter_mapaddr, saiter.iter_mapaddr, len,
967 		    private);
968 
969 		abd_iter_unmap(&saiter);
970 		abd_iter_unmap(&daiter);
971 
972 		if (ret != 0)
973 			break;
974 
975 		size -= len;
976 		c_dabd =
977 		    abd_advance_abd_iter(dabd, c_dabd, &daiter, len);
978 		c_sabd =
979 		    abd_advance_abd_iter(sabd, c_sabd, &saiter, len);
980 	}
981 
982 	return (ret);
983 }
984 
/*
 * abd_iterate_func2() callback: copy size bytes from sbuf to dbuf. private
 * is unused. Always returns 0.
 */
static int
abd_copy_off_cb(void *dbuf, void *sbuf, size_t size, void *private)
{
	(void) private;

	(void) memcpy(dbuf, sbuf, size);

	return (0);
}
992 
993 /*
994  * Copy from sabd to dabd starting from soff and doff.
995  */
996 void
997 abd_copy_off(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size)
998 {
999 	(void) abd_iterate_func2(dabd, sabd, doff, soff, size,
1000 	    abd_copy_off_cb, NULL);
1001 }
1002 
/*
 * abd_iterate_func2() callback: memcmp() size bytes of bufa against bufb and
 * return the result. private is unused.
 */
static int
abd_cmp_cb(void *bufa, void *bufb, size_t size, void *private)
{
	int diff;

	(void) private;

	diff = memcmp(bufa, bufb, size);
	return (diff);
}
1009 
1010 /*
1011  * Compares the contents of two ABDs.
1012  */
1013 int
1014 abd_cmp(abd_t *dabd, abd_t *sabd)
1015 {
1016 	ASSERT3U(dabd->abd_size, ==, sabd->abd_size);
1017 	return (abd_iterate_func2(dabd, sabd, 0, 0, dabd->abd_size,
1018 	    abd_cmp_cb, NULL));
1019 }
1020 
/*
 * abd_iterate_func() callback used to check whether ABD content is
 * all-zeroes. Returns 1 as soon as a non-zero 64-bit word is found in the
 * len bytes at data, and 0 if every word is zero. private is unused.
 */
static int
abd_cmp_zero_off_cb(void *data, size_t len, void *private)
{
	(void) private;

	/* Only whole uint64_t words can be checked; enforce that. */
	ASSERT0(P2PHASE(len, 8));

	uint64_t *word = (uint64_t *)data;
	uint64_t *limit = (uint64_t *)((char *)data + len);

	while (word < limit) {
		if (*word != 0)
			return (1);
		word++;
	}

	return (0);
}
1039 
1040 int
1041 abd_cmp_zero_off(abd_t *abd, size_t off, size_t size)
1042 {
1043 	return (abd_iterate_func(abd, off, size, abd_cmp_zero_off_cb, NULL));
1044 }
1045 
1046 /*
1047  * Iterate over code ABDs and a data ABD and call @func_raidz_gen.
1048  *
1049  * @cabds          parity ABDs, must have equal size
1050  * @dabd           data ABD. Can be NULL (in this case @dsize = 0)
1051  * @func_raidz_gen should be implemented so that its behaviour
1052  *                 is the same when taking linear and when taking scatter
1053  */
1054 void
1055 abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, size_t off,
1056     size_t csize, size_t dsize, const unsigned parity,
1057     void (*func_raidz_gen)(void **, const void *, size_t, size_t))
1058 {
1059 	int i;
1060 	size_t len, dlen;
1061 	struct abd_iter caiters[3];
1062 	struct abd_iter daiter;
1063 	void *caddrs[3], *daddr;
1064 	unsigned long flags __maybe_unused = 0;
1065 	abd_t *c_cabds[3];
1066 	abd_t *c_dabd = NULL;
1067 
1068 	ASSERT3U(parity, <=, 3);
1069 	for (i = 0; i < parity; i++) {
1070 		abd_verify(cabds[i]);
1071 		ASSERT3U(off + csize, <=, cabds[i]->abd_size);
1072 		c_cabds[i] = abd_init_abd_iter(cabds[i], &caiters[i], off);
1073 	}
1074 
1075 	if (dsize > 0) {
1076 		ASSERT(dabd);
1077 		abd_verify(dabd);
1078 		ASSERT3U(off + dsize, <=, dabd->abd_size);
1079 		c_dabd = abd_init_abd_iter(dabd, &daiter, off);
1080 	}
1081 
1082 	abd_enter_critical(flags);
1083 	while (csize > 0) {
1084 		len = csize;
1085 		for (i = 0; i < parity; i++) {
1086 			IMPLY(abd_is_gang(cabds[i]), c_cabds[i] != NULL);
1087 			abd_iter_map(&caiters[i]);
1088 			caddrs[i] = caiters[i].iter_mapaddr;
1089 			len = MIN(caiters[i].iter_mapsize, len);
1090 		}
1091 
1092 		if (dsize > 0) {
1093 			IMPLY(abd_is_gang(dabd), c_dabd != NULL);
1094 			abd_iter_map(&daiter);
1095 			daddr = daiter.iter_mapaddr;
1096 			len = MIN(daiter.iter_mapsize, len);
1097 			dlen = len;
1098 		} else {
1099 			daddr = NULL;
1100 			dlen = 0;
1101 		}
1102 
1103 		/* must be progressive */
1104 		ASSERT3U(len, >, 0);
1105 		/*
1106 		 * The iterated function likely will not do well if each
1107 		 * segment except the last one is not multiple of 512 (raidz).
1108 		 */
1109 		ASSERT3U(((uint64_t)len & 511ULL), ==, 0);
1110 
1111 		func_raidz_gen(caddrs, daddr, len, dlen);
1112 
1113 		for (i = parity-1; i >= 0; i--) {
1114 			abd_iter_unmap(&caiters[i]);
1115 			c_cabds[i] =
1116 			    abd_advance_abd_iter(cabds[i], c_cabds[i],
1117 			    &caiters[i], len);
1118 		}
1119 
1120 		if (dsize > 0) {
1121 			abd_iter_unmap(&daiter);
1122 			c_dabd =
1123 			    abd_advance_abd_iter(dabd, c_dabd, &daiter,
1124 			    dlen);
1125 			dsize -= dlen;
1126 		}
1127 
1128 		csize -= len;
1129 	}
1130 	abd_exit_critical(flags);
1131 }
1132 
1133 /*
1134  * Iterate over code ABDs and data reconstruction target ABDs and call
1135  * @func_raidz_rec. Function maps at most 6 pages atomically.
1136  *
1137  * @cabds           parity ABDs, must have equal size
1138  * @tabds           rec target ABDs, at most 3
1139  * @tsize           size of data target columns
1140  * @func_raidz_rec  expects syndrome data in target columns. Function
1141  *                  reconstructs data and overwrites target columns.
1142  */
1143 void
1144 abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds,
1145     size_t tsize, const unsigned parity,
1146     void (*func_raidz_rec)(void **t, const size_t tsize, void **c,
1147     const unsigned *mul),
1148     const unsigned *mul)
1149 {
1150 	int i;
1151 	size_t len;
1152 	struct abd_iter citers[3];
1153 	struct abd_iter xiters[3];
1154 	void *caddrs[3], *xaddrs[3];
1155 	unsigned long flags __maybe_unused = 0;
1156 	abd_t *c_cabds[3];
1157 	abd_t *c_tabds[3];
1158 
1159 	ASSERT3U(parity, <=, 3);
1160 
1161 	for (i = 0; i < parity; i++) {
1162 		abd_verify(cabds[i]);
1163 		abd_verify(tabds[i]);
1164 		ASSERT3U(tsize, <=, cabds[i]->abd_size);
1165 		ASSERT3U(tsize, <=, tabds[i]->abd_size);
1166 		c_cabds[i] =
1167 		    abd_init_abd_iter(cabds[i], &citers[i], 0);
1168 		c_tabds[i] =
1169 		    abd_init_abd_iter(tabds[i], &xiters[i], 0);
1170 	}
1171 
1172 	abd_enter_critical(flags);
1173 	while (tsize > 0) {
1174 		len = tsize;
1175 		for (i = 0; i < parity; i++) {
1176 			IMPLY(abd_is_gang(cabds[i]), c_cabds[i] != NULL);
1177 			IMPLY(abd_is_gang(tabds[i]), c_tabds[i] != NULL);
1178 			abd_iter_map(&citers[i]);
1179 			abd_iter_map(&xiters[i]);
1180 			caddrs[i] = citers[i].iter_mapaddr;
1181 			xaddrs[i] = xiters[i].iter_mapaddr;
1182 			len = MIN(citers[i].iter_mapsize, len);
1183 			len = MIN(xiters[i].iter_mapsize, len);
1184 		}
1185 
1186 		/* must be progressive */
1187 		ASSERT3S(len, >, 0);
1188 		/*
1189 		 * The iterated function likely will not do well if each
1190 		 * segment except the last one is not multiple of 512 (raidz).
1191 		 */
1192 		ASSERT3U(((uint64_t)len & 511ULL), ==, 0);
1193 
1194 		func_raidz_rec(xaddrs, len, caddrs, mul);
1195 
1196 		for (i = parity-1; i >= 0; i--) {
1197 			abd_iter_unmap(&xiters[i]);
1198 			abd_iter_unmap(&citers[i]);
1199 			c_tabds[i] =
1200 			    abd_advance_abd_iter(tabds[i], c_tabds[i],
1201 			    &xiters[i], len);
1202 			c_cabds[i] =
1203 			    abd_advance_abd_iter(cabds[i], c_cabds[i],
1204 			    &citers[i], len);
1205 		}
1206 
1207 		tsize -= len;
1208 		ASSERT3S(tsize, >=, 0);
1209 	}
1210 	abd_exit_critical(flags);
1211 }
1212