// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
 * Copyright (c) 2019 by Delphix. All rights reserved.
 */

/*
 * ARC buffer data (ABD).
 *
 * ABDs are an abstract data structure for the ARC which can use two
 * different ways of storing the underlying data:
 *
 * (a) Linear buffer. In this case, all the data in the ABD is stored in one
 *     contiguous buffer in memory (from a zio_[data_]buf_* kmem cache).
 *
 *         +-------------------+
 *         | ABD (linear)      |
 *         | abd_flags = ...   |
 *         | abd_size = ...    |     +--------------------------------+
 *         | abd_buf --------------->| raw buffer of size abd_size    |
 *         +-------------------+     +--------------------------------+
 *              no abd_chunks
 *
 * (b) Scattered buffer. In this case, the data in the ABD is split into
 *     equal-sized chunks (from the abd_chunk_cache kmem_cache), with pointers
 *     to the chunks recorded in an array at the end of the ABD structure.
 *
 *         +-------------------+
 *         | ABD (scattered)   |
 *         | abd_flags = ...   |
 *         | abd_size = ...    |
 *         | abd_offset = 0    |                          +-----------+
 *         | abd_chunks[0] -------------------------------->| chunk 0   |
 *         | abd_chunks[1] ---------------------+        +-----------+
 *         | ...               |                |        +-----------+
 *         | abd_chunks[N-1] ---------+         +------->| chunk 1   |
 *         +-------------------+      |                  +-----------+
 *                                    |                      ...
 *                                    |                  +-----------+
 *                                    +----------------->| chunk N-1 |
 *                                                       +-----------+
 *
 * In addition to directly allocating a linear or scattered ABD, it is also
 * possible to create an ABD by requesting the "sub-ABD" starting at an offset
 * within an existing ABD. In linear buffers this is simple (set abd_buf of
 * the new ABD to the starting point within the original raw buffer), but
 * scattered ABDs are a little more complex. The new ABD makes a copy of the
 * relevant abd_chunks pointers (but not the underlying data). However, to
 * provide arbitrary rather than only chunk-aligned starting offsets, it also
 * tracks an abd_offset field which represents the starting point of the data
 * within the first chunk in abd_chunks. For both linear and scattered ABDs,
 * creating an offset ABD marks the original ABD as the offset's parent, and
 * the original ABD's abd_children refcount is incremented. This data allows
 * us to ensure the root ABD isn't deleted before its children.
 *
 * Most consumers should never need to know what type of ABD they're using --
 * the ABD public API ensures that it's possible to transparently switch from
 * using a linear ABD to a scattered one when doing so would be beneficial.
 *
 * If you need to use the data within an ABD directly, and you know it's
 * linear (because you allocated it), you can use abd_to_buf() to access the
 * underlying raw buffer. Otherwise, you should use one of the
 * abd_borrow_buf* functions, which will allocate a raw buffer if necessary.
 * Use the abd_return_buf* functions to return any raw buffers that are no
 * longer necessary when you're done using them.
 *
 * There are a variety of ABD APIs that implement basic buffer operations:
 * compare, copy, read, write, and fill with zeroes. If you need a custom
 * function which progressively accesses the whole ABD, use the abd_iterate_*
 * functions.
 *
 * As an additional feature, linear and scatter ABDs can be stitched together
 * by using the gang ABD type (abd_alloc_gang()). This allows for multiple
 * ABDs to be viewed as a singular ABD.
 *
 * It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled
 * to B_FALSE.
 */

#include <sys/abd_impl.h>
#include <sys/param.h>
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/zfs_znode.h>

/* see block comment above for description */
int zfs_abd_scatter_enabled = B_TRUE;
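
/*
 * Example usage (an illustrative sketch only, not part of the ABD API;
 * the abd_example_* names are hypothetical). A typical consumer
 * allocates an ABD without caring about its layout, moves data in and
 * out with the copy routines, and uses abd_borrow_buf_copy() when it
 * needs a raw, contiguous view regardless of layout.
 */
static void __maybe_unused
abd_example_basic_usage(void)
{
	char local[512] = { 0 };
	abd_t *abd = abd_alloc(sizeof (local), B_FALSE);

	/* Fill the ABD from a local buffer, then read it back. */
	abd_copy_from_buf_off(abd, local, 0, sizeof (local));
	abd_copy_to_buf_off(local, abd, 0, sizeof (local));

	/*
	 * Direct access requires a raw buffer. abd_to_buf() is only
	 * legal on linear ABDs; borrowing works for any layout.
	 */
	void *raw = abd_borrow_buf_copy(abd, sizeof (local));
	/* ... inspect or modify raw ... */
	abd_return_buf_copy(abd, raw, sizeof (local));

	abd_free(abd);
}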

void
abd_verify(abd_t *abd)
{
#ifdef ZFS_DEBUG
	if (abd_is_from_pages(abd)) {
		ASSERT3U(abd->abd_size, <=, DMU_MAX_ACCESS);
	} else {
		ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE);
	}
	ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |
	    ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE |
	    ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE | ABD_FLAG_GANG |
	    ABD_FLAG_GANG_FREE | ABD_FLAG_ALLOCD | ABD_FLAG_FROM_PAGES));
	IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));
	IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);
	if (abd_is_linear(abd)) {
		ASSERT3U(abd->abd_size, >, 0);
		ASSERT3P(ABD_LINEAR_BUF(abd), !=, NULL);
	} else if (abd_is_gang(abd)) {
		uint_t child_sizes = 0;
		for (abd_t *cabd = list_head(&ABD_GANG(abd).abd_gang_chain);
		    cabd != NULL;
		    cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
			ASSERT(list_link_active(&cabd->abd_gang_link));
			child_sizes += cabd->abd_size;
			abd_verify(cabd);
		}
		ASSERT3U(abd->abd_size, ==, child_sizes);
	} else {
		ASSERT3U(abd->abd_size, >, 0);
		abd_verify_scatter(abd);
	}
#endif
}

void
abd_init_struct(abd_t *abd)
{
	list_link_init(&abd->abd_gang_link);
	mutex_init(&abd->abd_mtx, NULL, MUTEX_DEFAULT, NULL);
	abd->abd_flags = 0;
#ifdef ZFS_DEBUG
	zfs_refcount_create(&abd->abd_children);
	abd->abd_parent = NULL;
#endif
	abd->abd_size = 0;
}

static void
abd_fini_struct(abd_t *abd)
{
	mutex_destroy(&abd->abd_mtx);
	ASSERT(!list_link_active(&abd->abd_gang_link));
#ifdef ZFS_DEBUG
	zfs_refcount_destroy(&abd->abd_children);
#endif
}

abd_t *
abd_alloc_struct(size_t size)
{
	abd_t *abd = abd_alloc_struct_impl(size);
	abd_init_struct(abd);
	abd->abd_flags |= ABD_FLAG_ALLOCD;
	return (abd);
}

void
abd_free_struct(abd_t *abd)
{
	abd_fini_struct(abd);
	abd_free_struct_impl(abd);
}

/*
 * Allocate an ABD, along with its own underlying data buffers. Use this if you
 * don't care whether the ABD is linear or not.
 */
abd_t *
abd_alloc(size_t size, boolean_t is_metadata)
{
	if (abd_size_alloc_linear(size))
		return (abd_alloc_linear(size, is_metadata));

	VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);

	abd_t *abd = abd_alloc_struct(size);
	abd->abd_flags |= ABD_FLAG_OWNER;
	abd->abd_u.abd_scatter.abd_offset = 0;
	abd_alloc_chunks(abd, size);

	if (is_metadata) {
		abd->abd_flags |= ABD_FLAG_META;
	}
	abd->abd_size = size;

	abd_update_scatter_stats(abd, ABDSTAT_INCR);

	return (abd);
}

/*
 * Allocate an ABD that must be linear, along with its own underlying data
 * buffer. Only use this when it would be very annoying to write your ABD
 * consumer with a scattered ABD.
 */
abd_t *
abd_alloc_linear(size_t size, boolean_t is_metadata)
{
	abd_t *abd = abd_alloc_struct(0);

	VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);

	abd->abd_flags |= ABD_FLAG_LINEAR | ABD_FLAG_OWNER;
	if (is_metadata) {
		abd->abd_flags |= ABD_FLAG_META;
	}
	abd->abd_size = size;

	if (is_metadata) {
		ABD_LINEAR_BUF(abd) = zio_buf_alloc(size);
	} else {
		ABD_LINEAR_BUF(abd) = zio_data_buf_alloc(size);
	}

	abd_update_linear_stats(abd, ABDSTAT_INCR);

	return (abd);
}

static void
abd_free_linear(abd_t *abd)
{
	if (abd_is_linear_page(abd)) {
		abd_free_linear_page(abd);
		return;
	}

	if (abd->abd_flags & ABD_FLAG_META) {
		zio_buf_free(ABD_LINEAR_BUF(abd), abd->abd_size);
	} else {
		zio_data_buf_free(ABD_LINEAR_BUF(abd), abd->abd_size);
	}

	abd_update_linear_stats(abd, ABDSTAT_DECR);
}

static void
abd_free_gang(abd_t *abd)
{
	ASSERT(abd_is_gang(abd));
	abd_t *cabd;

	while ((cabd = list_head(&ABD_GANG(abd).abd_gang_chain)) != NULL) {
		/*
		 * We must acquire the child ABD's mutex to ensure that if it
		 * is being added to another gang ABD we will set the link
		 * as inactive when removing it from this gang ABD and before
		 * adding it to the other gang ABD.
		 */
		mutex_enter(&cabd->abd_mtx);
		ASSERT(list_link_active(&cabd->abd_gang_link));
		list_remove(&ABD_GANG(abd).abd_gang_chain, cabd);
		mutex_exit(&cabd->abd_mtx);
		if (cabd->abd_flags & ABD_FLAG_GANG_FREE)
			abd_free(cabd);
	}
	list_destroy(&ABD_GANG(abd).abd_gang_chain);
}

static void
abd_free_scatter(abd_t *abd)
{
	abd_free_chunks(abd);
	abd_update_scatter_stats(abd, ABDSTAT_DECR);
}

/*
 * Free an ABD. Use with any kind of abd: those created with abd_alloc_*()
 * and abd_get_*(), including abd_get_offset_struct().
 *
 * If the ABD was created with abd_alloc_*(), the underlying data
 * (scatterlist or linear buffer) will also be freed. (Subject to ownership
 * changes via abd_*_ownership_of_buf().)
 *
 * Unless the ABD was created with abd_get_offset_struct(), the abd_t will
 * also be freed.
 */
void
abd_free(abd_t *abd)
{
	if (abd == NULL)
		return;

	abd_verify(abd);
#ifdef ZFS_DEBUG
	IMPLY(abd->abd_flags & ABD_FLAG_OWNER, abd->abd_parent == NULL);
#endif

	if (abd_is_gang(abd)) {
		abd_free_gang(abd);
	} else if (abd_is_linear(abd)) {
		if (abd->abd_flags & ABD_FLAG_OWNER)
			abd_free_linear(abd);
	} else {
		if (abd->abd_flags & ABD_FLAG_OWNER)
			abd_free_scatter(abd);
	}

#ifdef ZFS_DEBUG
	if (abd->abd_parent != NULL) {
		(void) zfs_refcount_remove_many(&abd->abd_parent->abd_children,
		    abd->abd_size, abd);
	}
#endif

	abd_fini_struct(abd);
	if (abd->abd_flags & ABD_FLAG_ALLOCD)
		abd_free_struct_impl(abd);
}

/*
 * Allocate an ABD of the same format (same metadata flag, same scatterize
 * setting) as another ABD.
 */
abd_t *
abd_alloc_sametype(abd_t *sabd, size_t size)
{
	boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0;
	if (abd_is_linear(sabd) &&
	    !abd_is_linear_page(sabd)) {
		return (abd_alloc_linear(size, is_metadata));
	} else {
		return (abd_alloc(size, is_metadata));
	}
}
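
/*
 * Illustrative sketch (a hypothetical helper, not part of the original
 * file): abd_alloc_sametype() is convenient when staging a transformed
 * copy of an existing ABD, since the scratch ABD inherits the source's
 * metadata flag and linear/scatter layout.
 */
static abd_t * __maybe_unused
abd_example_clone_layout(abd_t *src)
{
	abd_t *dst = abd_alloc_sametype(src, src->abd_size);

	/* Same-size copy; a real caller might transform the data instead. */
	abd_copy_off(dst, src, 0, 0, src->abd_size);
	return (dst);
}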

/*
 * Create a gang ABD that will be the head of a list of ABDs. This is used
 * to "chain" scatter/gather lists together when constructing aggregated
 * I/Os. To free this abd, abd_free() must be called.
 */
abd_t *
abd_alloc_gang(void)
{
	abd_t *abd = abd_alloc_struct(0);
	abd->abd_flags |= ABD_FLAG_GANG | ABD_FLAG_OWNER;
	list_create(&ABD_GANG(abd).abd_gang_chain,
	    sizeof (abd_t), offsetof(abd_t, abd_gang_link));
	return (abd);
}

/*
 * Add a child gang ABD to a parent gang ABD's chained list.
 */
static void
abd_gang_add_gang(abd_t *pabd, abd_t *cabd, boolean_t free_on_free)
{
	ASSERT(abd_is_gang(pabd));
	ASSERT(abd_is_gang(cabd));

	if (free_on_free) {
		/*
		 * If the parent is responsible for freeing the child gang
		 * ABD, we will just splice the child's children ABD list to
		 * the parent's list and immediately free the child gang ABD
		 * struct. The children taken from the child gang will retain
		 * their free_on_free settings after being added to the
		 * parent's list.
		 */
#ifdef ZFS_DEBUG
		/*
		 * If cabd had abd_parent, we have to drop it here. We can't
		 * transfer it to pabd, nor can we clear abd_size while
		 * leaving it in place.
		 */
		if (cabd->abd_parent != NULL) {
			(void) zfs_refcount_remove_many(
			    &cabd->abd_parent->abd_children,
			    cabd->abd_size, cabd);
			cabd->abd_parent = NULL;
		}
#endif
		pabd->abd_size += cabd->abd_size;
		cabd->abd_size = 0;
		list_move_tail(&ABD_GANG(pabd).abd_gang_chain,
		    &ABD_GANG(cabd).abd_gang_chain);
		ASSERT(list_is_empty(&ABD_GANG(cabd).abd_gang_chain));
		abd_verify(pabd);
		abd_free(cabd);
	} else {
		for (abd_t *child = list_head(&ABD_GANG(cabd).abd_gang_chain);
		    child != NULL;
		    child = list_next(&ABD_GANG(cabd).abd_gang_chain, child)) {
			/*
			 * We always pass B_FALSE for free_on_free as it is
			 * the original child gang ABD's responsibility to
			 * determine if any of its child ABDs should be freed
			 * on the call to abd_free().
			 */
			abd_gang_add(pabd, child, B_FALSE);
		}
		abd_verify(pabd);
	}
}

/*
 * Add a child ABD to a gang ABD's chained list.
 */
void
abd_gang_add(abd_t *pabd, abd_t *cabd, boolean_t free_on_free)
{
	ASSERT(abd_is_gang(pabd));
	abd_t *child_abd = NULL;

	/*
	 * If the child being added is a gang ABD, we will add the
	 * child's ABDs to the parent gang ABD. This allows us to account
	 * for the offset correctly in the parent gang ABD.
	 */
	if (abd_is_gang(cabd)) {
		ASSERT(!list_link_active(&cabd->abd_gang_link));
		return (abd_gang_add_gang(pabd, cabd, free_on_free));
	}
	ASSERT(!abd_is_gang(cabd));

	/*
	 * In order to verify that an ABD is not already part of
	 * another gang ABD, we must lock the child ABD's abd_mtx
	 * to check its abd_gang_link status. We unlock the abd_mtx
	 * only after it has been added to a gang ABD, which
	 * will update the abd_gang_link's status. See comment below
	 * for how an ABD can be in multiple gang ABDs simultaneously.
	 */
	mutex_enter(&cabd->abd_mtx);
	if (list_link_active(&cabd->abd_gang_link)) {
		/*
		 * If the child ABD is already part of another
		 * gang ABD then we must allocate a new
		 * ABD to use a separate link. We mark the newly
		 * allocated ABD with ABD_FLAG_GANG_FREE, before
		 * adding it to the gang ABD's list, to make the
		 * gang ABD aware that it is responsible to call
		 * abd_free(). We use abd_get_offset() in order
		 * to just allocate a new ABD but avoid copying the
		 * data over into the newly allocated ABD.
		 *
		 * An ABD may become part of multiple gang ABDs. For
		 * example, when writing ditto blocks, the same ABD
		 * is used to write 2 or 3 locations with 2 or 3
		 * zio_t's. Each of the zio's may be aggregated with
		 * different adjacent zio's. zio aggregation uses gang
		 * zio's, so the single ABD can become part of multiple
		 * gang zio's.
		 *
		 * The ASSERT below is to make sure that if
		 * free_on_free is passed as B_TRUE, the ABD can
		 * not be in multiple gang ABDs. The gang ABD
		 * can not be responsible for cleaning up the child
		 * ABD memory allocation if the ABD can be in
		 * multiple gang ABDs at one time.
		 */
		ASSERT3B(free_on_free, ==, B_FALSE);
		child_abd = abd_get_offset(cabd, 0);
		child_abd->abd_flags |= ABD_FLAG_GANG_FREE;
	} else {
		child_abd = cabd;
		if (free_on_free)
			child_abd->abd_flags |= ABD_FLAG_GANG_FREE;
	}
	ASSERT3P(child_abd, !=, NULL);

	list_insert_tail(&ABD_GANG(pabd).abd_gang_chain, child_abd);
	mutex_exit(&cabd->abd_mtx);
	pabd->abd_size += child_abd->abd_size;
}
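
/*
 * Illustrative sketch (a hypothetical helper): stitching two
 * independently allocated ABDs into one gang view. With free_on_free =
 * B_TRUE the gang takes responsibility for its children, so a single
 * abd_free() of the gang tears the whole chain down.
 */
static abd_t * __maybe_unused
abd_example_build_gang(abd_t *a, abd_t *b)
{
	abd_t *gang = abd_alloc_gang();

	abd_gang_add(gang, a, B_TRUE);
	abd_gang_add(gang, b, B_TRUE);
	ASSERT3U(gang->abd_size, ==, a->abd_size + b->abd_size);

	return (gang);	/* abd_free(gang) will also free a and b */
}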

/*
 * Locate the ABD for the supplied offset in the gang ABD.
 * Return a new offset relative to the returned ABD.
 */
abd_t *
abd_gang_get_offset(abd_t *abd, size_t *off)
{
	abd_t *cabd;

	ASSERT(abd_is_gang(abd));
	ASSERT3U(*off, <, abd->abd_size);
	for (cabd = list_head(&ABD_GANG(abd).abd_gang_chain); cabd != NULL;
	    cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
		if (*off >= cabd->abd_size)
			*off -= cabd->abd_size;
		else
			return (cabd);
	}
	VERIFY3P(cabd, !=, NULL);
	return (cabd);
}

/*
 * Allocate a new ABD, using the provided struct (if non-NULL, and if
 * circumstances allow - otherwise allocate the struct). The returned ABD will
 * point to offset off of sabd. It shares the underlying buffer data with sabd.
 * Use abd_free() to free. sabd must not be freed while any derived ABDs exist.
 */
static abd_t *
abd_get_offset_impl(abd_t *abd, abd_t *sabd, size_t off, size_t size)
{
	abd_verify(sabd);
	ASSERT3U(off + size, <=, sabd->abd_size);

	if (abd_is_linear(sabd)) {
		if (abd == NULL)
			abd = abd_alloc_struct(0);
		/*
		 * Even if this buf is filesystem metadata, we only track that
		 * if we own the underlying data buffer, which is not true in
		 * this case. Therefore, we don't ever use ABD_FLAG_META here.
		 */
		abd->abd_flags |= ABD_FLAG_LINEAR;

		/*
		 * User pages from Direct I/O requests may be in a single page
		 * (ABD_FLAG_LINEAR_PAGE), and we must make sure to still flag
		 * that here for abd. This is required because we have to be
		 * careful when borrowing the buffer from the ABD because we
		 * cannot place user pages under write protection on Linux.
		 * See the comments in abd_os.c for abd_borrow_buf(),
		 * abd_borrow_buf_copy(), abd_return_buf() and
		 * abd_return_buf_copy().
		 */
		if (abd_is_from_pages(sabd)) {
			abd->abd_flags |= ABD_FLAG_FROM_PAGES |
			    ABD_FLAG_LINEAR_PAGE;
		}

		ABD_LINEAR_BUF(abd) = (char *)ABD_LINEAR_BUF(sabd) + off;
	} else if (abd_is_gang(sabd)) {
		size_t left = size;
		if (abd == NULL) {
			abd = abd_alloc_gang();
		} else {
			abd->abd_flags |= ABD_FLAG_GANG;
			list_create(&ABD_GANG(abd).abd_gang_chain,
			    sizeof (abd_t), offsetof(abd_t, abd_gang_link));
		}

		abd->abd_flags &= ~ABD_FLAG_OWNER;
		for (abd_t *cabd = abd_gang_get_offset(sabd, &off);
		    cabd != NULL && left > 0;
		    cabd = list_next(&ABD_GANG(sabd).abd_gang_chain, cabd)) {
			int csize = MIN(left, cabd->abd_size - off);

			abd_t *nabd = abd_get_offset_size(cabd, off, csize);
			abd_gang_add(abd, nabd, B_TRUE);
			left -= csize;
			off = 0;
		}
		ASSERT3U(left, ==, 0);
	} else {
		abd = abd_get_offset_scatter(abd, sabd, off, size);
	}

	ASSERT3P(abd, !=, NULL);
	abd->abd_size = size;
#ifdef ZFS_DEBUG
	abd->abd_parent = sabd;
	(void) zfs_refcount_add_many(&sabd->abd_children, abd->abd_size, abd);
#endif
	return (abd);
}

/*
 * Like abd_get_offset_size(), but memory for the abd_t is provided by the
 * caller. Using this routine can improve performance by avoiding the cost
 * of allocating memory for the abd_t struct, and updating the abd stats.
 * Usually, the provided abd is returned, but in some circumstances (FreeBSD,
 * if sabd is scatter and size is more than 2 pages) a new abd_t may need to
 * be allocated. Therefore callers should be careful to use the returned
 * abd_t*.
 */
abd_t *
abd_get_offset_struct(abd_t *abd, abd_t *sabd, size_t off, size_t size)
{
	abd_t *result;
	abd_init_struct(abd);
	result = abd_get_offset_impl(abd, sabd, off, size);
	if (result != abd)
		abd_fini_struct(abd);
	return (result);
}

abd_t *
abd_get_offset(abd_t *sabd, size_t off)
{
	size_t size = sabd->abd_size > off ? sabd->abd_size - off : 0;
	VERIFY3U(size, >, 0);
	return (abd_get_offset_impl(NULL, sabd, off, size));
}

abd_t *
abd_get_offset_size(abd_t *sabd, size_t off, size_t size)
{
	ASSERT3U(off + size, <=, sabd->abd_size);
	return (abd_get_offset_impl(NULL, sabd, off, size));
}
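
/*
 * Illustrative sketch (a hypothetical helper): taking a sub-view through
 * a caller-provided abd_t, avoiding a heap allocation for the struct.
 * Only the returned pointer may be used, since on some platforms a new
 * abd_t is allocated anyway.
 */
static void __maybe_unused
abd_example_stack_view(abd_t *sabd, size_t off, size_t size)
{
	abd_t abd_storage;
	abd_t *view = abd_get_offset_struct(&abd_storage, sabd, off, size);

	/* ... read through view, e.g. with abd_copy_to_buf_off() ... */

	/* Frees only what abd_get_offset_impl() allocated, if anything. */
	abd_free(view);
}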

/*
 * Return a scatter ABD of the requested size, containing only zeros.
 */
abd_t *
abd_get_zeros(size_t size)
{
	ASSERT3P(abd_zero_scatter, !=, NULL);
	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
	return (abd_get_offset_size(abd_zero_scatter, 0, size));
}

/*
 * Create a linear ABD for an existing buf.
 */
static abd_t *
abd_get_from_buf_impl(abd_t *abd, void *buf, size_t size)
{
	VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);

	/*
	 * Even if this buf is filesystem metadata, we only track that if we
	 * own the underlying data buffer, which is not true in this case.
	 * Therefore, we don't ever use ABD_FLAG_META here.
	 */
	abd->abd_flags |= ABD_FLAG_LINEAR;
	abd->abd_size = size;

	ABD_LINEAR_BUF(abd) = buf;

	return (abd);
}

abd_t *
abd_get_from_buf(void *buf, size_t size)
{
	abd_t *abd = abd_alloc_struct(0);
	return (abd_get_from_buf_impl(abd, buf, size));
}

abd_t *
abd_get_from_buf_struct(abd_t *abd, void *buf, size_t size)
{
	abd_init_struct(abd);
	return (abd_get_from_buf_impl(abd, buf, size));
}

/*
 * Get the raw buffer associated with a linear ABD.
 */
void *
abd_to_buf(abd_t *abd)
{
	ASSERT(abd_is_linear(abd));
	abd_verify(abd);
	return (ABD_LINEAR_BUF(abd));
}

void
abd_release_ownership_of_buf(abd_t *abd)
{
	ASSERT(abd_is_linear(abd));
	ASSERT(abd->abd_flags & ABD_FLAG_OWNER);

	/*
	 * abd_free() needs to handle LINEAR_PAGE ABDs specially.
	 * Since that flag does not survive the
	 * abd_release_ownership_of_buf() -> abd_get_from_buf() ->
	 * abd_take_ownership_of_buf() sequence, we don't allow releasing
	 * these "linear but not zio_[data_]buf_alloc()'ed" ABDs.
	 */
	ASSERT(!abd_is_linear_page(abd));

	abd_verify(abd);

	abd->abd_flags &= ~ABD_FLAG_OWNER;
	/* Disable this flag since we no longer own the data buffer */
	abd->abd_flags &= ~ABD_FLAG_META;

	abd_update_linear_stats(abd, ABDSTAT_DECR);
}


/*
 * Give this ABD ownership of the buffer that it's storing. Can only be used on
 * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated
 * with abd_alloc_linear() which subsequently released ownership of their buf
 * with abd_release_ownership_of_buf().
 */
void
abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata)
{
	ASSERT(abd_is_linear(abd));
	ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER));
	abd_verify(abd);

	abd->abd_flags |= ABD_FLAG_OWNER;
	if (is_metadata) {
		abd->abd_flags |= ABD_FLAG_META;
	}

	abd_update_linear_stats(abd, ABDSTAT_INCR);
}
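
/*
 * Illustrative sketch (a hypothetical helper): wrapping an existing
 * zio_data_buf_alloc()'ed buffer and handing the ABD ownership of it,
 * so that abd_free() will free the buffer too. Ownership may only be
 * taken for buffers from the zio_[data_]buf_* caches, because that is
 * where abd_free_linear() will return them.
 */
static abd_t * __maybe_unused
abd_example_adopt_buf(size_t size)
{
	void *buf = zio_data_buf_alloc(size);
	abd_t *abd = abd_get_from_buf(buf, size);

	abd_take_ownership_of_buf(abd, B_FALSE);
	return (abd);	/* abd_free() now frees buf as well */
}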

/*
 * Initializes an abd_iter based on whether the abd is a gang ABD
 * or just a single ABD.
 */
static inline abd_t *
abd_init_abd_iter(abd_t *abd, struct abd_iter *aiter, size_t off)
{
	abd_t *cabd = NULL;

	if (abd_is_gang(abd)) {
		cabd = abd_gang_get_offset(abd, &off);
		if (cabd) {
			abd_iter_init(aiter, cabd);
			abd_iter_advance(aiter, off);
		}
	} else {
		abd_iter_init(aiter, abd);
		abd_iter_advance(aiter, off);
	}
	return (cabd);
}

/*
 * Advances an abd_iter. We have to be careful with gang ABDs as
 * advancing could mean that we are at the end of a particular ABD and
 * must grab the next ABD from the gang ABD's list.
 */
static inline abd_t *
abd_advance_abd_iter(abd_t *abd, abd_t *cabd, struct abd_iter *aiter,
    size_t len)
{
	abd_iter_advance(aiter, len);
	if (abd_is_gang(abd) && abd_iter_at_end(aiter)) {
		ASSERT3P(cabd, !=, NULL);
		cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd);
		if (cabd) {
			abd_iter_init(aiter, cabd);
			abd_iter_advance(aiter, 0);
		}
	}
	return (cabd);
}

int
abd_iterate_func(abd_t *abd, size_t off, size_t size,
    abd_iter_func_t *func, void *private)
{
	struct abd_iter aiter;
	int ret = 0;

	if (size == 0)
		return (0);

	abd_verify(abd);
	ASSERT3U(off + size, <=, abd->abd_size);

	abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off);

	while (size > 0) {
		IMPLY(abd_is_gang(abd), c_abd != NULL);

		abd_iter_map(&aiter);

		size_t len = MIN(aiter.iter_mapsize, size);
		ASSERT3U(len, >, 0);

		ret = func(aiter.iter_mapaddr, len, private);

		abd_iter_unmap(&aiter);

		if (ret != 0)
			break;

		size -= len;
		c_abd = abd_advance_abd_iter(abd, c_abd, &aiter, len);
	}

	return (ret);
}
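
/*
 * Illustrative sketch (hypothetical helpers): a custom iterator callback
 * that counts the non-zero bytes in an ABD, whatever its layout. The
 * callback is invoked once per mapped segment and returns zero to
 * continue the iteration.
 */
static int
abd_example_count_nonzero_cb(void *buf, size_t len, void *private)
{
	uint64_t *count = private;
	const uchar_t *p = buf;

	for (size_t i = 0; i < len; i++) {
		if (p[i] != 0)
			(*count)++;
	}
	return (0);	/* a non-zero return would abort the iteration */
}

static uint64_t __maybe_unused
abd_example_count_nonzero(abd_t *abd)
{
	uint64_t count = 0;

	(void) abd_iterate_func(abd, 0, abd->abd_size,
	    abd_example_count_nonzero_cb, &count);
	return (count);
}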

#if defined(__linux__) && defined(_KERNEL)
int
abd_iterate_page_func(abd_t *abd, size_t off, size_t size,
    abd_iter_page_func_t *func, void *private)
{
	struct abd_iter aiter;
	int ret = 0;

	if (size == 0)
		return (0);

	abd_verify(abd);
	ASSERT3U(off + size, <=, abd->abd_size);

	abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off);

	while (size > 0) {
		IMPLY(abd_is_gang(abd), c_abd != NULL);

		abd_iter_page(&aiter);

		size_t len = MIN(aiter.iter_page_dsize, size);
		ASSERT3U(len, >, 0);

		ret = func(aiter.iter_page, aiter.iter_page_doff,
		    len, private);

		aiter.iter_page = NULL;
		aiter.iter_page_doff = 0;
		aiter.iter_page_dsize = 0;

		if (ret != 0)
			break;

		size -= len;
		c_abd = abd_advance_abd_iter(abd, c_abd, &aiter, len);
	}

	return (ret);
}
#endif

struct buf_arg {
	void *arg_buf;
};

static int
abd_copy_to_buf_off_cb(void *buf, size_t size, void *private)
{
	struct buf_arg *ba_ptr = private;

	(void) memcpy(ba_ptr->arg_buf, buf, size);
	ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;

	return (0);
}

/*
 * Copy abd to buf. (off is the offset in abd.)
 */
void
abd_copy_to_buf_off(void *buf, abd_t *abd, size_t off, size_t size)
{
	struct buf_arg ba_ptr = { buf };

	(void) abd_iterate_func(abd, off, size, abd_copy_to_buf_off_cb,
	    &ba_ptr);
}

static int
abd_cmp_buf_off_cb(void *buf, size_t size, void *private)
{
	int ret;
	struct buf_arg *ba_ptr = private;

	ret = memcmp(buf, ba_ptr->arg_buf, size);
	ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;

	return (ret);
}

/*
 * Compare the contents of abd to buf. (off is the offset in abd.)
 */
int
abd_cmp_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
{
	struct buf_arg ba_ptr = { (void *) buf };

	return (abd_iterate_func(abd, off, size, abd_cmp_buf_off_cb, &ba_ptr));
}

static int
abd_copy_from_buf_off_cb(void *buf, size_t size, void *private)
{
	struct buf_arg *ba_ptr = private;

	(void) memcpy(buf, ba_ptr->arg_buf, size);
	ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;

	return (0);
}

/*
 * Copy from buf to abd. (off is the offset in abd.)
 */
void
abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
{
	struct buf_arg ba_ptr = { (void *) buf };

	(void) abd_iterate_func(abd, off, size, abd_copy_from_buf_off_cb,
	    &ba_ptr);
}

static int
abd_zero_off_cb(void *buf, size_t size, void *private)
{
	(void) private;
	(void) memset(buf, 0, size);
	return (0);
}

/*
 * Zero out the abd from a particular offset to the end.
 */
void
abd_zero_off(abd_t *abd, size_t off, size_t size)
{
	(void) abd_iterate_func(abd, off, size, abd_zero_off_cb, NULL);
}

/*
 * Iterate over two ABDs and call func incrementally on the two ABDs' data in
 * equal-sized chunks (passed to func as raw buffers). func could be called
 * many times during this iteration.
 */
int
abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff,
    size_t size, abd_iter_func2_t *func, void *private)
{
	int ret = 0;
	struct abd_iter daiter, saiter;
	abd_t *c_dabd, *c_sabd;

	if (size == 0)
		return (0);

	abd_verify(dabd);
	abd_verify(sabd);

	ASSERT3U(doff + size, <=, dabd->abd_size);
	ASSERT3U(soff + size, <=, sabd->abd_size);

	c_dabd = abd_init_abd_iter(dabd, &daiter, doff);
	c_sabd = abd_init_abd_iter(sabd, &saiter, soff);

	while (size > 0) {
		IMPLY(abd_is_gang(dabd), c_dabd != NULL);
		IMPLY(abd_is_gang(sabd), c_sabd != NULL);

		abd_iter_map(&daiter);
		abd_iter_map(&saiter);

		size_t dlen = MIN(daiter.iter_mapsize, size);
		size_t slen = MIN(saiter.iter_mapsize, size);
		size_t len = MIN(dlen, slen);
		ASSERT(dlen > 0 || slen > 0);

		ret = func(daiter.iter_mapaddr, saiter.iter_mapaddr, len,
		    private);

		abd_iter_unmap(&saiter);
		abd_iter_unmap(&daiter);

		if (ret != 0)
			break;

		size -= len;
		c_dabd = abd_advance_abd_iter(dabd, c_dabd, &daiter, len);
		c_sabd = abd_advance_abd_iter(sabd, c_sabd, &saiter, len);
	}

	return (ret);
}
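
/*
 * Illustrative sketch (hypothetical helpers): XOR a source ABD into a
 * destination ABD using the two-buffer iterator. The callback always
 * sees equal-sized raw chunks of both ABDs.
 */
static int
abd_example_xor_cb(void *dbuf, void *sbuf, size_t size, void *private)
{
	(void) private;
	char *d = dbuf;
	const char *s = sbuf;

	for (size_t i = 0; i < size; i++)
		d[i] ^= s[i];
	return (0);
}

static void __maybe_unused
abd_example_xor(abd_t *dabd, abd_t *sabd, size_t size)
{
	(void) abd_iterate_func2(dabd, sabd, 0, 0, size,
	    abd_example_xor_cb, NULL);
}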

static int
abd_copy_off_cb(void *dbuf, void *sbuf, size_t size, void *private)
{
	(void) private;
	(void) memcpy(dbuf, sbuf, size);
	return (0);
}

/*
 * Copy from sabd to dabd starting from soff and doff.
 */
void
abd_copy_off(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size)
{
	(void) abd_iterate_func2(dabd, sabd, doff, soff, size,
	    abd_copy_off_cb, NULL);
}

static int
abd_cmp_cb(void *bufa, void *bufb, size_t size, void *private)
{
	(void) private;
	return (memcmp(bufa, bufb, size));
}

/*
 * Compares the contents of two ABDs.
 */
int
abd_cmp(abd_t *dabd, abd_t *sabd)
{
	ASSERT3U(dabd->abd_size, ==, sabd->abd_size);
	return (abd_iterate_func2(dabd, sabd, 0, 0, dabd->abd_size,
	    abd_cmp_cb, NULL));
}

/*
 * Check if ABD content is all-zeroes.
 */
static int
abd_cmp_zero_off_cb(void *data, size_t len, void *private)
{
	(void) private;

	/* This function can only check whole uint64s. Enforce that. */
	ASSERT0(P2PHASE(len, 8));

	uint64_t *end = (uint64_t *)((char *)data + len);
	for (uint64_t *word = (uint64_t *)data; word < end; word++)
		if (*word != 0)
			return (1);

	return (0);
}

int
abd_cmp_zero_off(abd_t *abd, size_t off, size_t size)
{
	return (abd_iterate_func(abd, off, size, abd_cmp_zero_off_cb, NULL));
}

/*
 * Iterate over code ABDs and a data ABD and call @func_raidz_gen.
 *
 * @cabds          parity ABDs, must have equal size
 * @dabd           data ABD. Can be NULL (in this case @dsize = 0)
 * @func_raidz_gen should be implemented so that it behaves the same
 *                 whether it is given linear or scatter buffers
 */
void
abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, size_t off,
    size_t csize, size_t dsize, const unsigned parity,
    void (*func_raidz_gen)(void **, const void *, size_t, size_t))
{
	int i;
	size_t len, dlen;
	struct abd_iter caiters[3];
	struct abd_iter daiter;
	void *caddrs[3], *daddr;
	unsigned long flags __maybe_unused = 0;
	abd_t *c_cabds[3];
	abd_t *c_dabd = NULL;

	ASSERT3U(parity, <=, 3);
	for (i = 0; i < parity; i++) {
		abd_verify(cabds[i]);
		ASSERT3U(off + csize, <=, cabds[i]->abd_size);
		c_cabds[i] = abd_init_abd_iter(cabds[i], &caiters[i], off);
	}

	if (dsize > 0) {
		ASSERT(dabd);
		abd_verify(dabd);
		ASSERT3U(off + dsize, <=, dabd->abd_size);
		c_dabd = abd_init_abd_iter(dabd, &daiter, off);
	}

	abd_enter_critical(flags);
	while (csize > 0) {
		len = csize;
		for (i = 0; i < parity; i++) {
			IMPLY(abd_is_gang(cabds[i]), c_cabds[i] != NULL);
			abd_iter_map(&caiters[i]);
			caddrs[i] = caiters[i].iter_mapaddr;
			len = MIN(caiters[i].iter_mapsize, len);
		}

		if (dsize > 0) {
			IMPLY(abd_is_gang(dabd), c_dabd != NULL);
			abd_iter_map(&daiter);
			daddr = daiter.iter_mapaddr;
			len = MIN(daiter.iter_mapsize, len);
			dlen = len;
		} else {
			daddr = NULL;
			dlen = 0;
		}

		/* must be progressive */
		ASSERT3U(len, >, 0);
		/*
		 * The iterated function likely will not do well if each
		 * segment except the last one is not a multiple of 512
		 * (raidz).
		 */
		ASSERT3U(((uint64_t)len & 511ULL), ==, 0);

		func_raidz_gen(caddrs, daddr, len, dlen);

		for (i = parity-1; i >= 0; i--) {
			abd_iter_unmap(&caiters[i]);
			c_cabds[i] =
			    abd_advance_abd_iter(cabds[i], c_cabds[i],
			    &caiters[i], len);
		}

		if (dsize > 0) {
			abd_iter_unmap(&daiter);
			c_dabd =
			    abd_advance_abd_iter(dabd, c_dabd, &daiter,
			    dlen);
			dsize -= dlen;
		}

		csize -= len;
	}
	abd_exit_critical(flags);
}
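
/*
 * Illustrative sketch (hypothetical helpers): a single-parity XOR
 * generator in the shape abd_raidz_gen_iterate() expects. It assumes
 * the parity column is zeroed first and that dsize <= csize; the
 * 512-byte segment alignment enforced above makes the uint64_t loop
 * safe.
 */
static void
abd_example_xor_gen(void **c, const void *d, size_t csize, size_t dsize)
{
	uint64_t *p = c[0];
	const uint64_t *src = d;

	(void) csize;	/* parity past dsize stays as-is (XOR with zeros) */
	for (size_t i = 0; i < dsize / sizeof (uint64_t); i++)
		p[i] ^= src[i];
}

static void __maybe_unused
abd_example_gen_parity(abd_t *parity_abd, abd_t *data_abd, size_t size)
{
	abd_t *cabds[1] = { parity_abd };

	abd_zero_off(parity_abd, 0, size);
	abd_raidz_gen_iterate(cabds, data_abd, 0, size, size, 1,
	    abd_example_xor_gen);
}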

/*
 * Iterate over code ABDs and data reconstruction target ABDs and call
 * @func_raidz_rec. Function maps at most 6 pages atomically.
 *
 * @cabds          parity ABDs, must have equal size
 * @tabds          rec target ABDs, at most 3
 * @tsize          size of data target columns
 * @func_raidz_rec expects syndrome data in target columns. Function
 *                 reconstructs data and overwrites target columns.
 */
void
abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds,
    size_t tsize, const unsigned parity,
    void (*func_raidz_rec)(void **t, const size_t tsize, void **c,
    const unsigned *mul),
    const unsigned *mul)
{
	int i;
	size_t len;
	struct abd_iter citers[3];
	struct abd_iter xiters[3];
	void *caddrs[3], *xaddrs[3];
	unsigned long flags __maybe_unused = 0;
	abd_t *c_cabds[3];
	abd_t *c_tabds[3];

	ASSERT3U(parity, <=, 3);

	for (i = 0; i < parity; i++) {
		abd_verify(cabds[i]);
		abd_verify(tabds[i]);
		ASSERT3U(tsize, <=, cabds[i]->abd_size);
		ASSERT3U(tsize, <=, tabds[i]->abd_size);
		c_cabds[i] =
		    abd_init_abd_iter(cabds[i], &citers[i], 0);
		c_tabds[i] =
		    abd_init_abd_iter(tabds[i], &xiters[i], 0);
	}

	abd_enter_critical(flags);
	while (tsize > 0) {
		len = tsize;
		for (i = 0; i < parity; i++) {
			IMPLY(abd_is_gang(cabds[i]), c_cabds[i] != NULL);
			IMPLY(abd_is_gang(tabds[i]), c_tabds[i] != NULL);
			abd_iter_map(&citers[i]);
			abd_iter_map(&xiters[i]);
			caddrs[i] = citers[i].iter_mapaddr;
			xaddrs[i] = xiters[i].iter_mapaddr;
			len = MIN(citers[i].iter_mapsize, len);
			len = MIN(xiters[i].iter_mapsize, len);
		}

		/* must be progressive */
		ASSERT3S(len, >, 0);
		/*
		 * The iterated function likely will not do well if each
		 * segment except the last one is not a multiple of 512
		 * (raidz).
		 */
		ASSERT3U(((uint64_t)len & 511ULL), ==, 0);

		func_raidz_rec(xaddrs, len, caddrs, mul);

		for (i = parity-1; i >= 0; i--) {
			abd_iter_unmap(&xiters[i]);
			abd_iter_unmap(&citers[i]);
			c_tabds[i] =
			    abd_advance_abd_iter(tabds[i], c_tabds[i],
			    &xiters[i], len);
			c_cabds[i] =
			    abd_advance_abd_iter(cabds[i], c_cabds[i],
			    &citers[i], len);
		}

		tsize -= len;
		ASSERT3S(tsize, >=, 0);
	}
	abd_exit_critical(flags);
}

EXPORT_SYMBOL(abd_free);