/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
 * Copyright (c) 2019 by Delphix. All rights reserved.
 * Copyright 2020 Joyent, Inc.
 * Copyright 2023 RackTop Systems, Inc.
 */

/*
 * ARC buffer data (ABD).
 *
 * ABDs are an abstract data structure for the ARC which can use two
 * different ways of storing the underlying data:
 *
 * (a) Linear buffer. In this case, all the data in the ABD is stored in one
 *     contiguous buffer in memory (from a zio_[data_]buf_* kmem cache).
 *
 *         +-------------------+
 *         | ABD (linear)      |
 *         | abd_flags = ...   |
 *         | abd_size = ...    |     +--------------------------------+
 *         | abd_buf --------------->| raw buffer of size abd_size    |
 *         +-------------------+     +--------------------------------+
 *           no abd_chunks
 *
 * (b) Scattered buffer. In this case, the data in the ABD is split into
 *     equal-sized chunks (from the abd_chunk_cache kmem_cache), with pointers
 *     to the chunks recorded in an array at the end of the ABD structure.
 *
 *         +-------------------+
 *         | ABD (scattered)   |
 *         | abd_flags = ...   |
 *         | abd_size = ...    |
 *         | abd_offset = 0    |                       +-----------+
 *         | abd_chunks[0] --------------------------->| chunk 0   |
 *         | abd_chunks[1] ---------------------+      +-----------+
 *         | ...               |                |      +-----------+
 *         | abd_chunks[N-1] ---------+         +----->| chunk 1   |
 *         +-------------------+      |                +-----------+
 *                                    |                    ...
 *                                    |                +-----------+
 *                                    +--------------->| chunk N-1 |
 *                                                     +-----------+
 *
 * Using a large proportion of scattered ABDs decreases ARC fragmentation since
 * when we are at the limit of allocatable space, using equal-size chunks will
 * allow us to quickly reclaim enough space for a new large allocation (assuming
 * it is also scattered).
 *
 * In addition to directly allocating a linear or scattered ABD, it is also
 * possible to create an ABD by requesting the "sub-ABD" starting at an offset
 * within an existing ABD. In linear buffers this is simple (set abd_buf of
 * the new ABD to the starting point within the original raw buffer), but
 * scattered ABDs are a little more complex. The new ABD makes a copy of the
 * relevant abd_chunks pointers (but not the underlying data). However, to
 * provide arbitrary rather than only chunk-aligned starting offsets, it also
 * tracks an abd_offset field which represents the starting point of the data
 * within the first chunk in abd_chunks. For both linear and scattered ABDs,
 * creating an offset ABD marks the original ABD as the offset's parent, and the
 * original ABD's abd_children refcount is incremented. This data allows us to
 * ensure the root ABD isn't deleted before its children.
 *
 * Most consumers should never need to know what type of ABD they're using --
 * the ABD public API ensures that it's possible to transparently switch from
 * using a linear ABD to a scattered one when doing so would be beneficial.
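 *
 * For example, a consumer that needs a scratch buffer might do something like
 * the following (an illustrative sketch only; "src", "dst" and "len" are
 * hypothetical):
 *
 *	abd_t *abd = abd_alloc(len, B_FALSE);
 *	abd_copy_from_buf(abd, src, len);
 *	...	use the ABD via its public API (copy, compare, iterate, ...)
 *	abd_copy_to_buf(dst, abd, len);
 *	abd_free(abd);
 *
 * The same code works whether abd_alloc() returned a linear or a scattered
 * ABD.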
 *
 * If you need to use the data within an ABD directly and you know it's linear
 * (because you allocated it), you can use abd_to_buf() to access the
 * underlying raw buffer. Otherwise, you should use one of the abd_borrow_buf*
 * functions, which will allocate a raw buffer if necessary. Use the
 * abd_return_buf* functions to return any raw buffers that are no longer
 * necessary when you're done using them.
 *
 * There are a variety of ABD APIs that implement basic buffer operations:
 * compare, copy, read, write, and fill with zeroes. If you need a custom
 * function which progressively accesses the whole ABD, use the abd_iterate_*
 * functions.
 */

#include <sys/abd.h>
#include <sys/param.h>
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/zfs_znode.h>

typedef struct abd_stats {
	kstat_named_t abdstat_struct_size;
	kstat_named_t abdstat_scatter_cnt;
	kstat_named_t abdstat_scatter_data_size;
	kstat_named_t abdstat_scatter_chunk_waste;
	kstat_named_t abdstat_linear_cnt;
	kstat_named_t abdstat_linear_data_size;
} abd_stats_t;

static abd_stats_t abd_stats = {
	/* Amount of memory occupied by all of the abd_t struct allocations */
	{ "struct_size", KSTAT_DATA_UINT64 },
	/*
	 * The number of scatter ABDs which are currently allocated, excluding
	 * ABDs which don't own their data (for instance the ones which were
	 * allocated through abd_get_offset()).
	 */
	{ "scatter_cnt", KSTAT_DATA_UINT64 },
	/* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
	{ "scatter_data_size", KSTAT_DATA_UINT64 },
	/*
	 * The amount of space wasted at the end of the last chunk across all
	 * scatter ABDs tracked by scatter_cnt.
	 */
	{ "scatter_chunk_waste", KSTAT_DATA_UINT64 },
	/*
	 * The number of linear ABDs which are currently allocated, excluding
	 * ABDs which don't own their data (for instance the ones which were
	 * allocated through abd_get_offset() and abd_get_from_buf()). If an
	 * ABD takes ownership of its buf then it will become tracked.
	 */
	{ "linear_cnt", KSTAT_DATA_UINT64 },
	/* Amount of data stored in all linear ABDs tracked by linear_cnt */
	{ "linear_data_size", KSTAT_DATA_UINT64 },
};

#define	ABDSTAT(stat)		(abd_stats.stat.value.ui64)
#define	ABDSTAT_INCR(stat, val) \
	atomic_add_64(&abd_stats.stat.value.ui64, (val))
#define	ABDSTAT_BUMP(stat)	ABDSTAT_INCR(stat, 1)
#define	ABDSTAT_BUMPDOWN(stat)	ABDSTAT_INCR(stat, -1)

/*
 * It is possible to make all future ABDs be linear by setting this to B_FALSE.
 * Otherwise, ABDs are allocated scattered by default unless the caller uses
 * abd_alloc_linear().
 */
boolean_t zfs_abd_scatter_enabled = B_TRUE;

/*
 * zfs_abd_scatter_min_size is the minimum allocation size to use scatter
 * ABDs. Smaller allocations will use linear ABDs, which use
 * zio_[data_]buf_alloc().
 *
 * Scatter ABDs use at least one page each, so sub-page allocations waste
 * some space when allocated as scatter (e.g. 2KB scatter allocation wastes
 * half of each page). Using linear ABDs for small allocations means that
 * they will be put on slabs which contain many allocations. This can
 * improve memory efficiency, but it also makes it much harder for ARC
 * evictions to actually free pages, because all the buffers on one slab need
 * to be freed in order for the slab (and underlying pages) to be freed.
 * Typically, 512B and 1KB kmem caches have 16 buffers per slab, so it's
 * possible for them to actually waste more memory than scatter (one page per
 * buf = wasting 3/4 or 7/8th; one buf per slab = wasting 15/16th).
 *
 * Spill blocks are typically 512B and are heavily used on systems running
 * selinux with the default dnode size and the `xattr=sa` property set.
 *
 * By default we use linear allocations for 512B and 1KB, and scatter
 * allocations for larger (1.5KB and up).
 */
int zfs_abd_scatter_min_size = 512 * 3;

/*
 * The size of the chunks ABD allocates. Because the sizes allocated from the
 * kmem_cache can't change, this tunable can only be modified at boot. Changing
 * it at runtime would cause ABD iteration to work incorrectly for ABDs which
 * were allocated with the old size, so a safeguard has been put in place which
 * will cause the machine to panic if you change it and try to access the data
 * within a scattered ABD.
 */
size_t zfs_abd_chunk_size = 4096;

#ifdef _KERNEL
extern vmem_t *zio_alloc_arena;
#endif

kmem_cache_t *abd_chunk_cache;
static kstat_t *abd_ksp;

extern inline boolean_t abd_is_linear(abd_t *abd);
extern inline void abd_copy(abd_t *dabd, abd_t *sabd, size_t size);
extern inline void abd_copy_from_buf(abd_t *abd, const void *buf, size_t size);
extern inline void abd_copy_to_buf(void* buf, abd_t *abd, size_t size);
extern inline int abd_cmp_buf(abd_t *abd, const void *buf, size_t size);
extern inline void abd_zero(abd_t *abd, size_t size);

static void *
abd_alloc_chunk()
{
	void *c = kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE);
	ASSERT3P(c, !=, NULL);
	return (c);
}

static void
abd_free_chunk(void *c)
{
	kmem_cache_free(abd_chunk_cache, c);
}

void
abd_init(void)
{
	vmem_t *data_alloc_arena = NULL;

#ifdef _KERNEL
	data_alloc_arena = zio_alloc_arena;
#endif

	/*
	 * Since ABD chunks do not appear in crash dumps, we pass KMC_NOTOUCH
	 * so that no allocator metadata is stored with the buffers.
	 */
	abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 64,
	    NULL, NULL, NULL, NULL, data_alloc_arena, KMC_NOTOUCH);

	abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED,
	    sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
	if (abd_ksp != NULL) {
		abd_ksp->ks_data = &abd_stats;
		kstat_install(abd_ksp);
	}
}

void
abd_fini(void)
{
	if (abd_ksp != NULL) {
		kstat_delete(abd_ksp);
		abd_ksp = NULL;
	}

	kmem_cache_destroy(abd_chunk_cache);
	abd_chunk_cache = NULL;
}

static inline size_t
abd_chunkcnt_for_bytes(size_t size)
{
	return (P2ROUNDUP(size, zfs_abd_chunk_size) / zfs_abd_chunk_size);
}

static inline size_t
abd_scatter_chunkcnt(abd_t *abd)
{
	ASSERT(!abd_is_linear(abd));
	return (abd_chunkcnt_for_bytes(
	    abd->abd_u.abd_scatter.abd_offset + abd->abd_size));
}

static inline void
abd_verify(abd_t *abd)
{
	ASSERT3U(abd->abd_size, >, 0);
	ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE);
	ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |
	    ABD_FLAG_OWNER | ABD_FLAG_META));
	IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));
	IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);
	if (abd_is_linear(abd)) {
		ASSERT3P(abd->abd_u.abd_linear.abd_buf, !=, NULL);
	} else {
		ASSERT3U(abd->abd_u.abd_scatter.abd_offset, <,
		    zfs_abd_chunk_size);
		size_t n = abd_scatter_chunkcnt(abd);
		for (int i = 0; i < n; i++) {
			ASSERT3P(
			    abd->abd_u.abd_scatter.abd_chunks[i], !=, NULL);
		}
	}
}

static inline abd_t *
abd_alloc_struct(size_t chunkcnt)
{
	size_t size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]);
	abd_t *abd = kmem_alloc(size, KM_PUSHPAGE);
	ASSERT3P(abd, !=, NULL);
	ABDSTAT_INCR(abdstat_struct_size, size);

	return (abd);
}

static inline void
abd_free_struct(abd_t *abd)
{
	size_t chunkcnt = abd_is_linear(abd) ? 0 : abd_scatter_chunkcnt(abd);
	int size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]);
	kmem_free(abd, size);
	ABDSTAT_INCR(abdstat_struct_size, -size);
}

/*
 * Allocate an ABD, along with its own underlying data buffers. Use this if you
 * don't care whether the ABD is linear or not.
 */
abd_t *
abd_alloc(size_t size, boolean_t is_metadata)
{
	/* see the comment above zfs_abd_scatter_min_size */
	if (!zfs_abd_scatter_enabled || size < zfs_abd_scatter_min_size)
		return (abd_alloc_linear(size, is_metadata));

	VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);

	size_t n = abd_chunkcnt_for_bytes(size);
	abd_t *abd = abd_alloc_struct(n);

	abd->abd_flags = ABD_FLAG_OWNER;
	if (is_metadata) {
		abd->abd_flags |= ABD_FLAG_META;
	}
	abd->abd_size = size;
	abd->abd_parent = NULL;
	zfs_refcount_create(&abd->abd_children);

	abd->abd_u.abd_scatter.abd_offset = 0;
	abd->abd_u.abd_scatter.abd_chunk_size = zfs_abd_chunk_size;

	for (int i = 0; i < n; i++) {
		void *c = abd_alloc_chunk();
		ASSERT3P(c, !=, NULL);
		abd->abd_u.abd_scatter.abd_chunks[i] = c;
	}

	ABDSTAT_BUMP(abdstat_scatter_cnt);
	ABDSTAT_INCR(abdstat_scatter_data_size, size);
	ABDSTAT_INCR(abdstat_scatter_chunk_waste,
	    n * zfs_abd_chunk_size - size);

	return (abd);
}

static void
abd_free_scatter(abd_t *abd)
{
	size_t n = abd_scatter_chunkcnt(abd);
	for (int i = 0; i < n; i++) {
		abd_free_chunk(abd->abd_u.abd_scatter.abd_chunks[i]);
	}

	zfs_refcount_destroy(&abd->abd_children);
	ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
	ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size);
	ABDSTAT_INCR(abdstat_scatter_chunk_waste,
	    abd->abd_size - n * zfs_abd_chunk_size);

	abd_free_struct(abd);
}

/*
 * Allocate an ABD that must be linear, along with its own underlying data
 * buffer. Only use this when it would be very annoying to write your ABD
 * consumer with a scattered ABD.
 */
abd_t *
abd_alloc_linear(size_t size, boolean_t is_metadata)
{
	abd_t *abd = abd_alloc_struct(0);

	VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);

	abd->abd_flags = ABD_FLAG_LINEAR | ABD_FLAG_OWNER;
	if (is_metadata) {
		abd->abd_flags |= ABD_FLAG_META;
	}
	abd->abd_size = size;
	abd->abd_parent = NULL;
	zfs_refcount_create(&abd->abd_children);

	if (is_metadata) {
		abd->abd_u.abd_linear.abd_buf = zio_buf_alloc(size);
	} else {
		abd->abd_u.abd_linear.abd_buf = zio_data_buf_alloc(size);
	}

	ABDSTAT_BUMP(abdstat_linear_cnt);
	ABDSTAT_INCR(abdstat_linear_data_size, size);

	return (abd);
}

static void
abd_free_linear(abd_t *abd)
{
	if (abd->abd_flags & ABD_FLAG_META) {
		zio_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size);
	} else {
		zio_data_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size);
	}

	zfs_refcount_destroy(&abd->abd_children);
	ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
	ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);

	abd_free_struct(abd);
}

/*
 * Free an ABD. Only use this on ABDs allocated with abd_alloc() or
 * abd_alloc_linear().
 */
void
abd_free(abd_t *abd)
{
	abd_verify(abd);
	ASSERT3P(abd->abd_parent, ==, NULL);
	ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
	if (abd_is_linear(abd))
		abd_free_linear(abd);
	else
		abd_free_scatter(abd);
}

/*
 * Allocate an ABD of the same format (same metadata flag, same scatterize
 * setting) as another ABD.
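 *
 * For example, a transform stage that wants its output buffer to match its
 * input might do (an illustrative sketch; "src" is a hypothetical source ABD):
 *
 *	abd_t *dst = abd_alloc_sametype(src, src->abd_size);
 *	abd_copy(dst, src, src->abd_size);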
 */
abd_t *
abd_alloc_sametype(abd_t *sabd, size_t size)
{
	boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0;
	if (abd_is_linear(sabd)) {
		return (abd_alloc_linear(size, is_metadata));
	} else {
		return (abd_alloc(size, is_metadata));
	}
}

/*
 * If we're going to use this ABD for doing I/O using the block layer, the
 * consumer of the ABD data doesn't care if it's scattered or not, and we don't
 * plan to store this ABD in memory for a long period of time, we should
 * allocate the ABD type that requires the least data copying to do the I/O.
 *
 * Currently this is linear ABDs, however if ldi_strategy() can ever issue I/Os
 * using a scatter/gather list we should switch to that and replace this call
 * with vanilla abd_alloc().
 */
abd_t *
abd_alloc_for_io(size_t size, boolean_t is_metadata)
{
	return (abd_alloc_linear(size, is_metadata));
}

/*
 * Allocate a new ABD to point to offset off of sabd. It shares the underlying
 * buffer data with sabd. Use abd_put() to free. sabd must not be freed while
 * any derived ABDs exist.
 */
/* ARGSUSED */
static inline abd_t *
abd_get_offset_impl(abd_t *sabd, size_t off, size_t size)
{
	abd_t *abd;

	abd_verify(sabd);
	ASSERT3U(off, <=, sabd->abd_size);

	if (abd_is_linear(sabd)) {
		abd = abd_alloc_struct(0);

		/*
		 * Even if this buf is filesystem metadata, we only track that
		 * if we own the underlying data buffer, which is not true in
		 * this case. Therefore, we don't ever use ABD_FLAG_META here.
		 */
		abd->abd_flags = ABD_FLAG_LINEAR;

		abd->abd_u.abd_linear.abd_buf =
		    (char *)sabd->abd_u.abd_linear.abd_buf + off;
	} else {
		size_t new_offset = sabd->abd_u.abd_scatter.abd_offset + off;
		size_t chunkcnt = abd_scatter_chunkcnt(sabd) -
		    (new_offset / zfs_abd_chunk_size);

		abd = abd_alloc_struct(chunkcnt);

		/*
		 * Even if this buf is filesystem metadata, we only track that
		 * if we own the underlying data buffer, which is not true in
		 * this case. Therefore, we don't ever use ABD_FLAG_META here.
		 */
		abd->abd_flags = 0;

		abd->abd_u.abd_scatter.abd_offset =
		    new_offset % zfs_abd_chunk_size;
		abd->abd_u.abd_scatter.abd_chunk_size = zfs_abd_chunk_size;

		/* Copy the scatterlist starting at the correct offset */
		(void) memcpy(&abd->abd_u.abd_scatter.abd_chunks,
		    &sabd->abd_u.abd_scatter.abd_chunks[new_offset /
		    zfs_abd_chunk_size],
		    chunkcnt * sizeof (void *));
	}

	abd->abd_size = sabd->abd_size - off;
	abd->abd_parent = sabd;
	zfs_refcount_create(&abd->abd_children);
	(void) zfs_refcount_add_many(&sabd->abd_children, abd->abd_size, abd);

	return (abd);
}

abd_t *
abd_get_offset(abd_t *sabd, size_t off)
{
	size_t size = sabd->abd_size > off ? sabd->abd_size - off : 0;

	VERIFY3U(size, >, 0);

	return (abd_get_offset_impl(sabd, off, size));
}

abd_t *
abd_get_offset_size(abd_t *sabd, size_t off, size_t size)
{
	ASSERT3U(off + size, <=, sabd->abd_size);

	return (abd_get_offset_impl(sabd, off, size));
}


/*
 * Allocate a linear ABD structure for buf. You must free this with abd_put()
 * since the resulting ABD doesn't own its own buffer.
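 *
 * For example, code that already has a raw buffer but needs to call an
 * ABD-based interface might do (an illustrative sketch; "buf" and "len" are
 * hypothetical):
 *
 *	abd_t *abd = abd_get_from_buf(buf, len);
 *	...	hand abd to code that expects an abd_t
 *	abd_put(abd);	(buf itself is not freed)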
 */
abd_t *
abd_get_from_buf(void *buf, size_t size)
{
	abd_t *abd = abd_alloc_struct(0);

	VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);

	/*
	 * Even if this buf is filesystem metadata, we only track that if we
	 * own the underlying data buffer, which is not true in this case.
	 * Therefore, we don't ever use ABD_FLAG_META here.
	 */
	abd->abd_flags = ABD_FLAG_LINEAR;
	abd->abd_size = size;
	abd->abd_parent = NULL;
	zfs_refcount_create(&abd->abd_children);

	abd->abd_u.abd_linear.abd_buf = buf;

	return (abd);
}

/*
 * Free an ABD allocated from abd_get_offset() or abd_get_from_buf(). Will not
 * free the underlying scatterlist or buffer.
 */
void
abd_put(abd_t *abd)
{
	abd_verify(abd);
	ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER));

	if (abd->abd_parent != NULL) {
		(void) zfs_refcount_remove_many(&abd->abd_parent->abd_children,
		    abd->abd_size, abd);
	}

	zfs_refcount_destroy(&abd->abd_children);
	abd_free_struct(abd);
}

/*
 * Get the raw buffer associated with a linear ABD.
 */
void *
abd_to_buf(abd_t *abd)
{
	ASSERT(abd_is_linear(abd));
	abd_verify(abd);
	return (abd->abd_u.abd_linear.abd_buf);
}

/*
 * Borrow a raw buffer from an ABD without copying the contents of the ABD
 * into the buffer. If the ABD is scattered, this will allocate a raw buffer
 * whose contents are undefined. To copy over the existing data in the ABD, use
 * abd_borrow_buf_copy() instead.
 */
void *
abd_borrow_buf(abd_t *abd, size_t n)
{
	void *buf;
	abd_verify(abd);
	ASSERT3U(abd->abd_size, >=, n);
	if (abd_is_linear(abd)) {
		buf = abd_to_buf(abd);
	} else if ((abd->abd_flags & ABD_FLAG_META) != 0) {
		buf = zio_buf_alloc(n);
	} else {
		buf = zio_data_buf_alloc(n);
	}
	(void) zfs_refcount_add_many(&abd->abd_children, n, buf);

	return (buf);
}

void *
abd_borrow_buf_copy(abd_t *abd, size_t n)
{
	void *buf = abd_borrow_buf(abd, n);
	if (!abd_is_linear(abd)) {
		abd_copy_to_buf(buf, abd, n);
	}
	return (buf);
}

/*
 * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will
 * not change the contents of the ABD and will ASSERT that you didn't modify
 * the buffer since it was borrowed. If you want any changes you made to buf to
 * be copied back to abd, use abd_return_buf_copy() instead.
 */
void
abd_return_buf(abd_t *abd, void *buf, size_t n)
{
	abd_verify(abd);
	ASSERT3U(abd->abd_size, >=, n);
	if (abd_is_linear(abd)) {
		ASSERT3P(buf, ==, abd_to_buf(abd));
	} else if ((abd->abd_flags & ABD_FLAG_META) != 0) {
		ASSERT0(abd_cmp_buf(abd, buf, n));
		zio_buf_free(buf, n);
	} else {
		ASSERT0(abd_cmp_buf(abd, buf, n));
		zio_data_buf_free(buf, n);
	}
	(void) zfs_refcount_remove_many(&abd->abd_children, n, buf);
}

void
abd_return_buf_copy(abd_t *abd, void *buf, size_t n)
{
	if (!abd_is_linear(abd)) {
		abd_copy_from_buf(abd, buf, n);
	}
	abd_return_buf(abd, buf, n);
}

/*
 * Give this ABD ownership of the buffer that it's storing. Can only be used on
 * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated
 * with abd_alloc_linear() which subsequently released ownership of their buf
 * with abd_release_ownership_of_buf().
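 *
 * For example (an illustrative sketch; "len" is hypothetical):
 *
 *	void *buf = zio_data_buf_alloc(len);
 *	abd_t *abd = abd_get_from_buf(buf, len);
 *	abd_take_ownership_of_buf(abd, B_FALSE);
 *	...
 *	abd_free(abd);	(this now frees buf as well)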
 */
void
abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata)
{
	ASSERT(abd_is_linear(abd));
	ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER));
	abd_verify(abd);

	abd->abd_flags |= ABD_FLAG_OWNER;
	if (is_metadata) {
		abd->abd_flags |= ABD_FLAG_META;
	}

	ABDSTAT_BUMP(abdstat_linear_cnt);
	ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size);
}

void
abd_release_ownership_of_buf(abd_t *abd)
{
	ASSERT(abd_is_linear(abd));
	ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
	abd_verify(abd);

	abd->abd_flags &= ~ABD_FLAG_OWNER;
	/* Disable this flag since we no longer own the data buffer */
	abd->abd_flags &= ~ABD_FLAG_META;

	ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
	ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
}

struct abd_iter {
	abd_t		*iter_abd;	/* ABD being iterated through */
	size_t		iter_pos;	/* position (relative to abd_offset) */
	void		*iter_mapaddr;	/* addr corresponding to iter_pos */
	size_t		iter_mapsize;	/* length of data valid at mapaddr */
};

static inline size_t
abd_iter_scatter_chunk_offset(struct abd_iter *aiter)
{
	ASSERT(!abd_is_linear(aiter->iter_abd));
	return ((aiter->iter_abd->abd_u.abd_scatter.abd_offset +
	    aiter->iter_pos) % zfs_abd_chunk_size);
}

static inline size_t
abd_iter_scatter_chunk_index(struct abd_iter *aiter)
{
	ASSERT(!abd_is_linear(aiter->iter_abd));
	return ((aiter->iter_abd->abd_u.abd_scatter.abd_offset +
	    aiter->iter_pos) / zfs_abd_chunk_size);
}

/*
 * Initialize the abd_iter.
 */
static void
abd_iter_init(struct abd_iter *aiter, abd_t *abd)
{
	abd_verify(abd);
	aiter->iter_abd = abd;
	aiter->iter_pos = 0;
	aiter->iter_mapaddr = NULL;
	aiter->iter_mapsize = 0;
}

/*
 * Advance the iterator by a certain amount. Cannot be called when a chunk is
 * in use. This can be safely called when aiter has already been exhausted,
 * in which case this does nothing.
 */
static void
abd_iter_advance(struct abd_iter *aiter, size_t amount)
{
	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
	ASSERT0(aiter->iter_mapsize);

	/* There's nothing left to advance to, so do nothing */
	if (aiter->iter_pos == aiter->iter_abd->abd_size)
		return;

	aiter->iter_pos += amount;
}

/*
 * Map the current chunk into aiter. This can be safely called when aiter has
 * already been exhausted, in which case this does nothing.
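 *
 * The internal iterator functions are normally used together in the pattern
 * that abd_iterate_func() below follows (an illustrative sketch):
 *
 *	struct abd_iter aiter;
 *	abd_iter_init(&aiter, abd);
 *	abd_iter_advance(&aiter, off);
 *	while (size > 0) {
 *		abd_iter_map(&aiter);
 *		size_t len = MIN(aiter.iter_mapsize, size);
 *		...	operate on aiter.iter_mapaddr for len bytes ...
 *		abd_iter_unmap(&aiter);
 *		size -= len;
 *		abd_iter_advance(&aiter, len);
 *	}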
 */
static void
abd_iter_map(struct abd_iter *aiter)
{
	void *paddr;
	size_t offset = 0;

	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
	ASSERT0(aiter->iter_mapsize);

	/* Panic if someone has changed zfs_abd_chunk_size */
	IMPLY(!abd_is_linear(aiter->iter_abd), zfs_abd_chunk_size ==
	    aiter->iter_abd->abd_u.abd_scatter.abd_chunk_size);

	/* There's nothing left to iterate over, so do nothing */
	if (aiter->iter_pos == aiter->iter_abd->abd_size)
		return;

	if (abd_is_linear(aiter->iter_abd)) {
		offset = aiter->iter_pos;
		aiter->iter_mapsize = aiter->iter_abd->abd_size - offset;
		paddr = aiter->iter_abd->abd_u.abd_linear.abd_buf;
	} else {
		size_t index = abd_iter_scatter_chunk_index(aiter);
		offset = abd_iter_scatter_chunk_offset(aiter);
		aiter->iter_mapsize = MIN(zfs_abd_chunk_size - offset,
		    aiter->iter_abd->abd_size - aiter->iter_pos);
		paddr = aiter->iter_abd->abd_u.abd_scatter.abd_chunks[index];
	}
	aiter->iter_mapaddr = (char *)paddr + offset;
}

/*
 * Unmap the current chunk from aiter. This can be safely called when aiter
 * has already been exhausted, in which case this does nothing.
 */
static void
abd_iter_unmap(struct abd_iter *aiter)
{
	/* There's nothing left to unmap, so do nothing */
	if (aiter->iter_pos == aiter->iter_abd->abd_size)
		return;

	ASSERT3P(aiter->iter_mapaddr, !=, NULL);
	ASSERT3U(aiter->iter_mapsize, >, 0);

	aiter->iter_mapaddr = NULL;
	aiter->iter_mapsize = 0;
}

int
abd_iterate_func(abd_t *abd, size_t off, size_t size,
    abd_iter_func_t *func, void *private)
{
	int ret = 0;
	struct abd_iter aiter;

	abd_verify(abd);
	ASSERT3U(off + size, <=, abd->abd_size);

	abd_iter_init(&aiter, abd);
	abd_iter_advance(&aiter, off);

	while (size > 0) {
		abd_iter_map(&aiter);

		size_t len = MIN(aiter.iter_mapsize, size);
		ASSERT3U(len, >, 0);

		ret = func(aiter.iter_mapaddr, len, private);

		abd_iter_unmap(&aiter);

		if (ret != 0)
			break;

		size -= len;
		abd_iter_advance(&aiter, len);
	}

	return (ret);
}

struct buf_arg {
	void *arg_buf;
};

static int
abd_copy_to_buf_off_cb(void *buf, size_t size, void *private)
{
	struct buf_arg *ba_ptr = private;

	(void) memcpy(ba_ptr->arg_buf, buf, size);
	ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;

	return (0);
}

/*
 * Copy abd to buf. (off is the offset in abd.)
 */
void
abd_copy_to_buf_off(void *buf, abd_t *abd, size_t off, size_t size)
{
	struct buf_arg ba_ptr = { buf };

	(void) abd_iterate_func(abd, off, size, abd_copy_to_buf_off_cb,
	    &ba_ptr);
}

static int
abd_cmp_buf_off_cb(void *buf, size_t size, void *private)
{
	int ret;
	struct buf_arg *ba_ptr = private;

	ret = memcmp(buf, ba_ptr->arg_buf, size);
	ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;

	return (ret);
}

/*
 * Compare the contents of abd to buf. (off is the offset in abd.)
 */
int
abd_cmp_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
{
	struct buf_arg ba_ptr = { (void *) buf };

	return (abd_iterate_func(abd, off, size, abd_cmp_buf_off_cb, &ba_ptr));
}

static int
abd_copy_from_buf_off_cb(void *buf, size_t size, void *private)
{
	struct buf_arg *ba_ptr = private;

	(void) memcpy(buf, ba_ptr->arg_buf, size);
	ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;

	return (0);
}

/*
 * Copy from buf to abd. (off is the offset in abd.)
 */
void
abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
{
	struct buf_arg ba_ptr = { (void *) buf };

	(void) abd_iterate_func(abd, off, size, abd_copy_from_buf_off_cb,
	    &ba_ptr);
}

/*ARGSUSED*/
static int
abd_zero_off_cb(void *buf, size_t size, void *private)
{
	(void) memset(buf, 0, size);
	return (0);
}

/*
 * Zero out the abd from a particular offset to the end.
 */
void
abd_zero_off(abd_t *abd, size_t off, size_t size)
{
	(void) abd_iterate_func(abd, off, size, abd_zero_off_cb, NULL);
}

/*
 * Iterate over two ABDs and call func incrementally on the two ABDs' data in
 * equal-sized chunks (passed to func as raw buffers). func could be called many
 * times during this iteration.
 */
int
abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff,
    size_t size, abd_iter_func2_t *func, void *private)
{
	int ret = 0;
	struct abd_iter daiter, saiter;

	abd_verify(dabd);
	abd_verify(sabd);

	ASSERT3U(doff + size, <=, dabd->abd_size);
	ASSERT3U(soff + size, <=, sabd->abd_size);

	abd_iter_init(&daiter, dabd);
	abd_iter_init(&saiter, sabd);
	abd_iter_advance(&daiter, doff);
	abd_iter_advance(&saiter, soff);

	while (size > 0) {
		abd_iter_map(&daiter);
		abd_iter_map(&saiter);

		size_t dlen = MIN(daiter.iter_mapsize, size);
		size_t slen = MIN(saiter.iter_mapsize, size);
		size_t len = MIN(dlen, slen);
		ASSERT(dlen > 0 || slen > 0);

		ret = func(daiter.iter_mapaddr, saiter.iter_mapaddr, len,
		    private);

		abd_iter_unmap(&saiter);
		abd_iter_unmap(&daiter);

		if (ret != 0)
			break;

		size -= len;
		abd_iter_advance(&daiter, len);
		abd_iter_advance(&saiter, len);
	}

	return (ret);
}

/*ARGSUSED*/
static int
abd_copy_off_cb(void *dbuf, void *sbuf, size_t size, void *private)
{
	(void) memcpy(dbuf, sbuf, size);
	return (0);
}

/*
 * Copy from sabd to dabd starting from soff and doff.
 */
void
abd_copy_off(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size)
{
	(void) abd_iterate_func2(dabd, sabd, doff, soff, size,
	    abd_copy_off_cb, NULL);
}

/*ARGSUSED*/
static int
abd_cmp_cb(void *bufa, void *bufb, size_t size, void *private)
{
	return (memcmp(bufa, bufb, size));
}

/*
 * Compares the first size bytes of two ABDs.
 */
int
abd_cmp(abd_t *dabd, abd_t *sabd, size_t size)
{
	return (abd_iterate_func2(dabd, sabd, 0, 0, size, abd_cmp_cb, NULL));
}

/*
 * Iterate over code ABDs and a data ABD and call @func_raidz_gen.
 *
 * @cabds		parity ABDs, must have equal size
 * @dabd		data ABD. Can be NULL (in this case @dsize = 0)
 * @func_raidz_gen	should be implemented so that it behaves the same
 *			whether it is given linear or scattered buffers
 */
void
abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd,
    ssize_t csize, ssize_t dsize, const unsigned parity,
    void (*func_raidz_gen)(void **, const void *, size_t, size_t))
{
	int i;
	ssize_t len, dlen;
	struct abd_iter caiters[3];
	struct abd_iter daiter = {0};
	void *caddrs[3];

	ASSERT3U(parity, <=, 3);

	for (i = 0; i < parity; i++)
		abd_iter_init(&caiters[i], cabds[i]);

	if (dabd)
		abd_iter_init(&daiter, dabd);

	ASSERT3S(dsize, >=, 0);

#ifdef _KERNEL
	kpreempt_disable();
#endif
	while (csize > 0) {
		len = csize;

		if (dabd && dsize > 0)
			abd_iter_map(&daiter);

		for (i = 0; i < parity; i++) {
			abd_iter_map(&caiters[i]);
			caddrs[i] = caiters[i].iter_mapaddr;
		}

		switch (parity) {
		case 3:
			len = MIN(caiters[2].iter_mapsize, len);
			/* falls through */
		case 2:
			len = MIN(caiters[1].iter_mapsize, len);
			/* falls through */
		case 1:
			len = MIN(caiters[0].iter_mapsize, len);
		}

		/* must be progressive */
		ASSERT3S(len, >, 0);

		if (dabd && dsize > 0) {
			/* this needs precise iter.length */
			len = MIN(daiter.iter_mapsize, len);
			len = MIN(dsize, len);
			dlen = len;
		} else
			dlen = 0;

		/* must be progressive */
		ASSERT3S(len, >, 0);
		/*
		 * The iterated function likely will not do well if each
		 * segment except the last one is not a multiple of 512
		 * (raidz).
		 */
		ASSERT3U(((uint64_t)len & 511ULL), ==, 0);

		func_raidz_gen(caddrs, daiter.iter_mapaddr, len, dlen);

		for (i = parity-1; i >= 0; i--) {
			abd_iter_unmap(&caiters[i]);
			abd_iter_advance(&caiters[i], len);
		}

		if (dabd && dsize > 0) {
			abd_iter_unmap(&daiter);
			abd_iter_advance(&daiter, dlen);
			dsize -= dlen;
		}

		csize -= len;

		ASSERT3S(dsize, >=, 0);
		ASSERT3S(csize, >=, 0);
	}
#ifdef _KERNEL
	kpreempt_enable();
#endif
}

/*
 * Iterate over code ABDs and data reconstruction target ABDs and call
 * @func_raidz_rec. Function maps at most 6 pages atomically.
 *
 * @cabds		parity ABDs, must have equal size
 * @tabds		rec target ABDs, at most 3
 * @tsize		size of data target columns
 * @func_raidz_rec	expects syndrome data in target columns. Function
 *			reconstructs data and overwrites target columns.
 */
void
abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds,
    ssize_t tsize, const unsigned parity,
    void (*func_raidz_rec)(void **t, const size_t tsize, void **c,
    const unsigned *mul),
    const unsigned *mul)
{
	int i;
	ssize_t len;
	struct abd_iter citers[3];
	struct abd_iter xiters[3];
	void *caddrs[3], *xaddrs[3];

	ASSERT3U(parity, <=, 3);

	for (i = 0; i < parity; i++) {
		abd_iter_init(&citers[i], cabds[i]);
		abd_iter_init(&xiters[i], tabds[i]);
	}

#ifdef _KERNEL
	kpreempt_disable();
#endif
	while (tsize > 0) {

		for (i = 0; i < parity; i++) {
			abd_iter_map(&citers[i]);
			abd_iter_map(&xiters[i]);
			caddrs[i] = citers[i].iter_mapaddr;
			xaddrs[i] = xiters[i].iter_mapaddr;
		}

		len = tsize;
		switch (parity) {
		case 3:
			len = MIN(xiters[2].iter_mapsize, len);
			len = MIN(citers[2].iter_mapsize, len);
			/* falls through */
		case 2:
			len = MIN(xiters[1].iter_mapsize, len);
			len = MIN(citers[1].iter_mapsize, len);
			/* falls through */
		case 1:
			len = MIN(xiters[0].iter_mapsize, len);
			len = MIN(citers[0].iter_mapsize, len);
		}
		/* must be progressive */
		ASSERT3S(len, >, 0);
		/*
		 * The iterated function likely will not do well if each
		 * segment except the last one is not a multiple of 512
		 * (raidz).
		 */
		ASSERT3U(((uint64_t)len & 511ULL), ==, 0);

		func_raidz_rec(xaddrs, len, caddrs, mul);

		for (i = parity-1; i >= 0; i--) {
			abd_iter_unmap(&xiters[i]);
			abd_iter_unmap(&citers[i]);
			abd_iter_advance(&xiters[i], len);
			abd_iter_advance(&citers[i], len);
		}

		tsize -= len;
		ASSERT3S(tsize, >=, 0);
	}
#ifdef _KERNEL
	kpreempt_enable();
#endif
}