/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
 * Copyright (c) 2019 by Delphix. All rights reserved.
 * Copyright 2020 Joyent, Inc.
 */

/*
 * ARC buffer data (ABD).
 *
 * ABDs are an abstract data structure for the ARC which can use two
 * different ways of storing the underlying data:
 *
 * (a) Linear buffer. In this case, all the data in the ABD is stored in one
 *     contiguous buffer in memory (from a zio_[data_]buf_* kmem cache).
 *
 *     +-------------------+
 *     | ABD (linear)      |
 *     | abd_flags = ...   |
 *     | abd_size = ...    |     +--------------------------------+
 *     | abd_buf --------------->| raw buffer of size abd_size    |
 *     +-------------------+     +--------------------------------+
 *          no abd_chunks
 *
 * (b) Scattered buffer. In this case, the data in the ABD is split into
 *     equal-sized chunks (from the abd_chunk_cache kmem_cache), with pointers
 *     to the chunks recorded in an array at the end of the ABD structure.
 *
 *     +-------------------+
 *     | ABD (scattered)   |
 *     | abd_flags = ...   |
 *     | abd_size = ...    |
 *     | abd_offset = 0    |                          +-----------+
 *     | abd_chunks[0] ----------------------------->| chunk 0   |
 *     | abd_chunks[1] ---------------------+        +-----------+
 *     | ...               |                |        +-----------+
 *     | abd_chunks[N-1] ---------+         +------->| chunk 1   |
 *     +-------------------+      |                  +-----------+
 *                                |                        ...
 *                                |                  +-----------+
 *                                +----------------->| chunk N-1 |
 *                                                   +-----------+
 *
 * Using a large proportion of scattered ABDs decreases ARC fragmentation:
 * when we are at the limit of allocatable space, using equal-size chunks
 * allows us to quickly reclaim enough space for a new large allocation
 * (assuming it is also scattered).
 *
 * In addition to directly allocating a linear or scattered ABD, it is also
 * possible to create an ABD by requesting the "sub-ABD" starting at an offset
 * within an existing ABD. In linear buffers this is simple (set abd_buf of
 * the new ABD to the starting point within the original raw buffer), but
 * scattered ABDs are a little more complex. The new ABD makes a copy of the
 * relevant abd_chunks pointers (but not the underlying data). However, to
 * provide arbitrary rather than only chunk-aligned starting offsets, it also
 * tracks an abd_offset field which represents the starting point of the data
 * within the first chunk in abd_chunks. For both linear and scattered ABDs,
 * creating an offset ABD marks the original ABD as the offset's parent, and
 * the original ABD's abd_children refcount is incremented. This data allows
 * us to ensure the root ABD isn't deleted before its children.
 *
 * Most consumers should never need to know what type of ABD they're using --
 * the ABD public API ensures that it's possible to transparently switch from
 * using a linear ABD to a scattered one when doing so would be beneficial.
 *
 * If you need to use the data within an ABD directly, and you know it's
 * linear (because you allocated it), you can use abd_to_buf() to access the
 * underlying raw buffer. Otherwise, you should use one of the abd_borrow_buf*
 * functions, which will allocate a raw buffer if necessary. Use the
 * abd_return_buf* functions to return any raw buffers that are no longer
 * necessary when you're done using them.
 *
 * There are a variety of ABD APIs that implement basic buffer operations:
 * compare, copy, read, write, and fill with zeroes. If you need a custom
 * function which progressively accesses the whole ABD, use the abd_iterate_*
 * functions.
 */
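
/*
 * Example (illustrative sketch only, not part of the implementation): a
 * typical consumer allocates an ABD, fills it, and frees it without ever
 * caring whether the buffer is linear or scattered. Here src, dst and size
 * are hypothetical caller-provided names:
 *
 *	abd_t *abd = abd_alloc(size, B_FALSE);
 *
 *	abd_copy_from_buf(abd, src, size);	// fill from an existing buffer
 *	...
 *	abd_copy_to_buf(dst, abd, size);	// read the data back out
 *	abd_free(abd);
 *
 * Code that truly needs a contiguous view of the data would instead wrap the
 * access in abd_borrow_buf_copy()/abd_return_buf_copy(), as described above.
 */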

#include <sys/abd.h>
#include <sys/param.h>
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/zfs_znode.h>

typedef struct abd_stats {
	kstat_named_t	abdstat_struct_size;
	kstat_named_t	abdstat_scatter_cnt;
	kstat_named_t	abdstat_scatter_data_size;
	kstat_named_t	abdstat_scatter_chunk_waste;
	kstat_named_t	abdstat_linear_cnt;
	kstat_named_t	abdstat_linear_data_size;
} abd_stats_t;

static abd_stats_t abd_stats = {
	/* Amount of memory occupied by all of the abd_t struct allocations */
	{ "struct_size",	KSTAT_DATA_UINT64 },
	/*
	 * The number of scatter ABDs which are currently allocated, excluding
	 * ABDs which don't own their data (for instance the ones which were
	 * allocated through abd_get_offset()).
	 */
	{ "scatter_cnt",	KSTAT_DATA_UINT64 },
	/* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
	{ "scatter_data_size",	KSTAT_DATA_UINT64 },
	/*
	 * The amount of space wasted at the end of the last chunk across all
	 * scatter ABDs tracked by scatter_cnt.
	 */
	{ "scatter_chunk_waste", KSTAT_DATA_UINT64 },
	/*
	 * The number of linear ABDs which are currently allocated, excluding
	 * ABDs which don't own their data (for instance the ones which were
	 * allocated through abd_get_offset() and abd_get_from_buf()). If an
	 * ABD takes ownership of its buf then it will become tracked.
	 */
	{ "linear_cnt",		KSTAT_DATA_UINT64 },
	/* Amount of data stored in all linear ABDs tracked by linear_cnt */
	{ "linear_data_size",	KSTAT_DATA_UINT64 },
};

#define	ABDSTAT(stat)		(abd_stats.stat.value.ui64)
#define	ABDSTAT_INCR(stat, val) \
	atomic_add_64(&abd_stats.stat.value.ui64, (val))
#define	ABDSTAT_BUMP(stat)	ABDSTAT_INCR(stat, 1)
#define	ABDSTAT_BUMPDOWN(stat)	ABDSTAT_INCR(stat, -1)

/*
 * It is possible to make all future ABDs be linear by setting this to B_FALSE.
 * Otherwise, ABDs are allocated scattered by default unless the caller uses
 * abd_alloc_linear().
 */
boolean_t zfs_abd_scatter_enabled = B_TRUE;

/*
 * zfs_abd_scatter_min_size is the minimum allocation size to use scatter
 * ABDs. Smaller allocations will use linear ABDs, which use
 * zio_[data_]buf_alloc().
 *
 * Scatter ABDs use at least one page each, so sub-page allocations waste
 * some space when allocated as scatter (e.g. a 2KB scatter allocation wastes
 * half of each page). Using linear ABDs for small allocations means that
 * they will be put on slabs which contain many allocations. This can
 * improve memory efficiency, but it also makes it much harder for ARC
 * evictions to actually free pages, because all the buffers on one slab need
 * to be freed in order for the slab (and underlying pages) to be freed.
 * Typically, 512B and 1KB kmem caches have 16 buffers per slab, so it's
 * possible for them to actually waste more memory than scatter (one page per
 * buf = wasting 3/4 or 7/8th; one buf per slab = wasting 15/16th).
 *
 * Spill blocks are typically 512B and are heavily used on systems running
 * selinux with the default dnode size and the `xattr=sa` property set.
 *
 * By default we use linear allocations for 512B and 1KB, and scatter
 * allocations for larger (1.5KB and up).
 */
int zfs_abd_scatter_min_size = 512 * 3;

/*
 * The size of the chunks ABD allocates. Because the sizes allocated from the
 * kmem_cache can't change, this tunable can only be modified at boot. Changing
 * it at runtime would cause ABD iteration to work incorrectly for ABDs which
 * were allocated with the old size, so a safeguard has been put in place which
 * will cause the machine to panic if you change it and try to access the data
 * within a scattered ABD.
 */
size_t zfs_abd_chunk_size = 4096;

#ifdef _KERNEL
extern vmem_t *zio_alloc_arena;
#endif

kmem_cache_t *abd_chunk_cache;
static kstat_t *abd_ksp;

extern inline boolean_t abd_is_linear(abd_t *abd);
extern inline void abd_copy(abd_t *dabd, abd_t *sabd, size_t size);
extern inline void abd_copy_from_buf(abd_t *abd, const void *buf, size_t size);
extern inline void abd_copy_to_buf(void *buf, abd_t *abd, size_t size);
extern inline int abd_cmp_buf(abd_t *abd, const void *buf, size_t size);
extern inline void abd_zero(abd_t *abd, size_t size);

static void *
abd_alloc_chunk(void)
{
	void *c = kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE);
	ASSERT3P(c, !=, NULL);
	return (c);
}

static void
abd_free_chunk(void *c)
{
	kmem_cache_free(abd_chunk_cache, c);
}

void
abd_init(void)
{
	vmem_t *data_alloc_arena = NULL;

#ifdef _KERNEL
	data_alloc_arena = zio_alloc_arena;
#endif

	/*
	 * Since ABD chunks do not appear in crash dumps, we pass KMC_NOTOUCH
	 * so that no allocator metadata is stored with the buffers.
	 */
	abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 64,
	    NULL, NULL, NULL, NULL, data_alloc_arena, KMC_NOTOUCH);

	abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED,
	    sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
	if (abd_ksp != NULL) {
		abd_ksp->ks_data = &abd_stats;
		kstat_install(abd_ksp);
	}
}

void
abd_fini(void)
{
	if (abd_ksp != NULL) {
		kstat_delete(abd_ksp);
		abd_ksp = NULL;
	}

	kmem_cache_destroy(abd_chunk_cache);
	abd_chunk_cache = NULL;
}

static inline size_t
abd_chunkcnt_for_bytes(size_t size)
{
	return (P2ROUNDUP(size, zfs_abd_chunk_size) / zfs_abd_chunk_size);
}

static inline size_t
abd_scatter_chunkcnt(abd_t *abd)
{
	ASSERT(!abd_is_linear(abd));
	return (abd_chunkcnt_for_bytes(
	    abd->abd_u.abd_scatter.abd_offset + abd->abd_size));
}

static inline void
abd_verify(abd_t *abd)
{
	ASSERT3U(abd->abd_size, >, 0);
	ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE);
	ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |
	    ABD_FLAG_OWNER | ABD_FLAG_META));
	IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));
	IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);
	if (abd_is_linear(abd)) {
		ASSERT3P(abd->abd_u.abd_linear.abd_buf, !=, NULL);
	} else {
		ASSERT3U(abd->abd_u.abd_scatter.abd_offset, <,
		    zfs_abd_chunk_size);
		size_t n = abd_scatter_chunkcnt(abd);
		for (int i = 0; i < n; i++) {
			ASSERT3P(
			    abd->abd_u.abd_scatter.abd_chunks[i], !=, NULL);
		}
	}
}

static inline abd_t *
abd_alloc_struct(size_t chunkcnt)
{
	size_t size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]);
	abd_t *abd = kmem_alloc(size, KM_PUSHPAGE);
	ASSERT3P(abd, !=, NULL);
	ABDSTAT_INCR(abdstat_struct_size, size);

	return (abd);
}

static inline void
abd_free_struct(abd_t *abd)
{
	size_t chunkcnt = abd_is_linear(abd) ? 0 : abd_scatter_chunkcnt(abd);
	int size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]);
	kmem_free(abd, size);
	ABDSTAT_INCR(abdstat_struct_size, -size);
}
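
/*
 * For example (illustrative only, assuming the default 4KB zfs_abd_chunk_size):
 * a 5000-byte scatter ABD needs abd_chunkcnt_for_bytes(5000) =
 * P2ROUNDUP(5000, 4096) / 4096 = 2 chunks, so abd_alloc_struct(2) sizes the
 * header as offsetof(abd_t, abd_u.abd_scatter.abd_chunks[2]), i.e. just large
 * enough to hold two chunk pointers. The 3192 bytes left over at the end of
 * the second chunk are what abdstat_scatter_chunk_waste accounts for.
 */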

/*
 * Allocate an ABD, along with its own underlying data buffers. Use this if you
 * don't care whether the ABD is linear or not.
 */
abd_t *
abd_alloc(size_t size, boolean_t is_metadata)
{
	/* see the comment above zfs_abd_scatter_min_size */
	if (!zfs_abd_scatter_enabled || size < zfs_abd_scatter_min_size)
		return (abd_alloc_linear(size, is_metadata));

	VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);

	size_t n = abd_chunkcnt_for_bytes(size);
	abd_t *abd = abd_alloc_struct(n);

	abd->abd_flags = ABD_FLAG_OWNER;
	if (is_metadata) {
		abd->abd_flags |= ABD_FLAG_META;
	}
	abd->abd_size = size;
	abd->abd_parent = NULL;
	zfs_refcount_create(&abd->abd_children);

	abd->abd_u.abd_scatter.abd_offset = 0;
	abd->abd_u.abd_scatter.abd_chunk_size = zfs_abd_chunk_size;

	for (int i = 0; i < n; i++) {
		void *c = abd_alloc_chunk();
		ASSERT3P(c, !=, NULL);
		abd->abd_u.abd_scatter.abd_chunks[i] = c;
	}

	ABDSTAT_BUMP(abdstat_scatter_cnt);
	ABDSTAT_INCR(abdstat_scatter_data_size, size);
	ABDSTAT_INCR(abdstat_scatter_chunk_waste,
	    n * zfs_abd_chunk_size - size);

	return (abd);
}

static void
abd_free_scatter(abd_t *abd)
{
	size_t n = abd_scatter_chunkcnt(abd);
	for (int i = 0; i < n; i++) {
		abd_free_chunk(abd->abd_u.abd_scatter.abd_chunks[i]);
	}

	zfs_refcount_destroy(&abd->abd_children);
	ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
	ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size);
	ABDSTAT_INCR(abdstat_scatter_chunk_waste,
	    abd->abd_size - n * zfs_abd_chunk_size);

	abd_free_struct(abd);
}

/*
 * Allocate an ABD that must be linear, along with its own underlying data
 * buffer. Only use this when it would be very annoying to write your ABD
 * consumer with a scattered ABD.
 */
abd_t *
abd_alloc_linear(size_t size, boolean_t is_metadata)
{
	abd_t *abd = abd_alloc_struct(0);

	VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);

	abd->abd_flags = ABD_FLAG_LINEAR | ABD_FLAG_OWNER;
	if (is_metadata) {
		abd->abd_flags |= ABD_FLAG_META;
	}
	abd->abd_size = size;
	abd->abd_parent = NULL;
	zfs_refcount_create(&abd->abd_children);

	if (is_metadata) {
		abd->abd_u.abd_linear.abd_buf = zio_buf_alloc(size);
	} else {
		abd->abd_u.abd_linear.abd_buf = zio_data_buf_alloc(size);
	}

	ABDSTAT_BUMP(abdstat_linear_cnt);
	ABDSTAT_INCR(abdstat_linear_data_size, size);

	return (abd);
}

static void
abd_free_linear(abd_t *abd)
{
	if (abd->abd_flags & ABD_FLAG_META) {
		zio_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size);
	} else {
		zio_data_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size);
	}

	zfs_refcount_destroy(&abd->abd_children);
	ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
	ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);

	abd_free_struct(abd);
}

/*
 * Free an ABD. Only use this on ABDs allocated with abd_alloc() or
 * abd_alloc_linear().
 */
void
abd_free(abd_t *abd)
{
	abd_verify(abd);
	ASSERT3P(abd->abd_parent, ==, NULL);
	ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
	if (abd_is_linear(abd))
		abd_free_linear(abd);
	else
		abd_free_scatter(abd);
}

/*
 * Allocate an ABD of the same format (same metadata flag, same scatterize
 * setting) as another ABD.
 */
abd_t *
abd_alloc_sametype(abd_t *sabd, size_t size)
{
	boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0;
	if (abd_is_linear(sabd)) {
		return (abd_alloc_linear(size, is_metadata));
	} else {
		return (abd_alloc(size, is_metadata));
	}
}

/*
 * If we're going to use this ABD for doing I/O using the block layer, the
 * consumer of the ABD data doesn't care if it's scattered or not, and we
 * don't plan to store this ABD in memory for a long period of time, then we
 * should allocate the ABD type that requires the least data copying to do
 * the I/O.
 *
 * Currently this is linear ABDs, however if ldi_strategy() can ever issue I/Os
 * using a scatter/gather list we should switch to that and replace this call
 * with vanilla abd_alloc().
 */
abd_t *
abd_alloc_for_io(size_t size, boolean_t is_metadata)
{
	return (abd_alloc_linear(size, is_metadata));
}

/*
 * Allocate a new ABD to point to offset off of sabd. It shares the underlying
 * buffer data with sabd. Use abd_put() to free. sabd must not be freed while
 * any derived ABDs exist.
 */
/* ARGSUSED */
static inline abd_t *
abd_get_offset_impl(abd_t *sabd, size_t off, size_t size)
{
	abd_t *abd;

	abd_verify(sabd);
	ASSERT3U(off, <=, sabd->abd_size);

	if (abd_is_linear(sabd)) {
		abd = abd_alloc_struct(0);

		/*
		 * Even if this buf is filesystem metadata, we only track that
		 * if we own the underlying data buffer, which is not true in
		 * this case. Therefore, we don't ever use ABD_FLAG_META here.
		 */
		abd->abd_flags = ABD_FLAG_LINEAR;

		abd->abd_u.abd_linear.abd_buf =
		    (char *)sabd->abd_u.abd_linear.abd_buf + off;
	} else {
		size_t new_offset = sabd->abd_u.abd_scatter.abd_offset + off;
		size_t chunkcnt = abd_scatter_chunkcnt(sabd) -
		    (new_offset / zfs_abd_chunk_size);

		abd = abd_alloc_struct(chunkcnt);

		/*
		 * Even if this buf is filesystem metadata, we only track that
		 * if we own the underlying data buffer, which is not true in
		 * this case. Therefore, we don't ever use ABD_FLAG_META here.
		 */
		abd->abd_flags = 0;

		abd->abd_u.abd_scatter.abd_offset =
		    new_offset % zfs_abd_chunk_size;
		abd->abd_u.abd_scatter.abd_chunk_size = zfs_abd_chunk_size;

		/* Copy the scatterlist starting at the correct offset */
		(void) memcpy(&abd->abd_u.abd_scatter.abd_chunks,
		    &sabd->abd_u.abd_scatter.abd_chunks[new_offset /
		    zfs_abd_chunk_size],
		    chunkcnt * sizeof (void *));
	}

	abd->abd_size = sabd->abd_size - off;
	abd->abd_parent = sabd;
	zfs_refcount_create(&abd->abd_children);
	(void) zfs_refcount_add_many(&sabd->abd_children, abd->abd_size, abd);

	return (abd);
}

abd_t *
abd_get_offset(abd_t *sabd, size_t off)
{
	size_t size = sabd->abd_size > off ? sabd->abd_size - off : 0;

	VERIFY3U(size, >, 0);

	return (abd_get_offset_impl(sabd, off, size));
}

abd_t *
abd_get_offset_size(abd_t *sabd, size_t off, size_t size)
{
	ASSERT3U(off + size, <=, sabd->abd_size);

	return (abd_get_offset_impl(sabd, off, size));
}
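
/*
 * Example (illustrative sketch): to operate on the second half of an existing
 * ABD without copying it, take an offset ABD and release it with abd_put()
 * when done. The parent must stay allocated at least as long as the child;
 * sabd, half and tail are hypothetical caller-side names:
 *
 *	size_t half = sabd->abd_size / 2;
 *	abd_t *tail = abd_get_offset_size(sabd, half, sabd->abd_size - half);
 *	...
 *	abd_put(tail);
 */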

/*
 * Allocate a linear ABD structure for buf. You must free this with abd_put()
 * since the resulting ABD doesn't own its own buffer.
 */
abd_t *
abd_get_from_buf(void *buf, size_t size)
{
	abd_t *abd = abd_alloc_struct(0);

	VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);

	/*
	 * Even if this buf is filesystem metadata, we only track that if we
	 * own the underlying data buffer, which is not true in this case.
	 * Therefore, we don't ever use ABD_FLAG_META here.
	 */
	abd->abd_flags = ABD_FLAG_LINEAR;
	abd->abd_size = size;
	abd->abd_parent = NULL;
	zfs_refcount_create(&abd->abd_children);

	abd->abd_u.abd_linear.abd_buf = buf;

	return (abd);
}

/*
 * Free an ABD allocated from abd_get_offset() or abd_get_from_buf(). Will not
 * free the underlying scatterlist or buffer.
 */
void
abd_put(abd_t *abd)
{
	abd_verify(abd);
	ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER));

	if (abd->abd_parent != NULL) {
		(void) zfs_refcount_remove_many(&abd->abd_parent->abd_children,
		    abd->abd_size, abd);
	}

	zfs_refcount_destroy(&abd->abd_children);
	abd_free_struct(abd);
}

/*
 * Get the raw buffer associated with a linear ABD.
 */
void *
abd_to_buf(abd_t *abd)
{
	ASSERT(abd_is_linear(abd));
	abd_verify(abd);
	return (abd->abd_u.abd_linear.abd_buf);
}

/*
 * Borrow a raw buffer from an ABD without copying the contents of the ABD
 * into the buffer. If the ABD is scattered, this will allocate a raw buffer
 * whose contents are undefined. To copy over the existing data in the ABD, use
 * abd_borrow_buf_copy() instead.
 */
void *
abd_borrow_buf(abd_t *abd, size_t n)
{
	void *buf;
	abd_verify(abd);
	ASSERT3U(abd->abd_size, >=, n);
	if (abd_is_linear(abd)) {
		buf = abd_to_buf(abd);
	} else {
		buf = zio_buf_alloc(n);
	}
	(void) zfs_refcount_add_many(&abd->abd_children, n, buf);

	return (buf);
}

void *
abd_borrow_buf_copy(abd_t *abd, size_t n)
{
	void *buf = abd_borrow_buf(abd, n);
	if (!abd_is_linear(abd)) {
		abd_copy_to_buf(buf, abd, n);
	}
	return (buf);
}

/*
 * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will
 * not change the contents of the ABD and will ASSERT that you didn't modify
 * the buffer since it was borrowed. If you want any changes you made to buf to
 * be copied back to abd, use abd_return_buf_copy() instead.
 */
void
abd_return_buf(abd_t *abd, void *buf, size_t n)
{
	abd_verify(abd);
	ASSERT3U(abd->abd_size, >=, n);
	if (abd_is_linear(abd)) {
		ASSERT3P(buf, ==, abd_to_buf(abd));
	} else {
		ASSERT0(abd_cmp_buf(abd, buf, n));
		zio_buf_free(buf, n);
	}
	(void) zfs_refcount_remove_many(&abd->abd_children, n, buf);
}

void
abd_return_buf_copy(abd_t *abd, void *buf, size_t n)
{
	if (!abd_is_linear(abd)) {
		abd_copy_from_buf(abd, buf, n);
	}
	abd_return_buf(abd, buf, n);
}
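
/*
 * Example (illustrative sketch): a read-modify-write of an ABD whose type is
 * unknown to the caller. abd_borrow_buf_copy() hands back a contiguous buffer
 * seeded with the ABD's contents (a no-copy alias if the ABD is linear), and
 * abd_return_buf_copy() writes any modifications back. modify() is a
 * hypothetical in-place transform:
 *
 *	char *buf = abd_borrow_buf_copy(abd, abd->abd_size);
 *	modify(buf);
 *	abd_return_buf_copy(abd, buf, abd->abd_size);
 *
 * A read-only consumer would pair abd_borrow_buf_copy() with abd_return_buf()
 * instead, which (for scattered ABDs) asserts that the buffer was not
 * modified.
 */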

/*
 * Give this ABD ownership of the buffer that it's storing. Can only be used on
 * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated
 * with abd_alloc_linear() which subsequently released ownership of their buf
 * with abd_release_ownership_of_buf().
 */
void
abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata)
{
	ASSERT(abd_is_linear(abd));
	ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER));
	abd_verify(abd);

	abd->abd_flags |= ABD_FLAG_OWNER;
	if (is_metadata) {
		abd->abd_flags |= ABD_FLAG_META;
	}

	ABDSTAT_BUMP(abdstat_linear_cnt);
	ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size);
}

void
abd_release_ownership_of_buf(abd_t *abd)
{
	ASSERT(abd_is_linear(abd));
	ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
	abd_verify(abd);

	abd->abd_flags &= ~ABD_FLAG_OWNER;
	/* Disable this flag since we no longer own the data buffer */
	abd->abd_flags &= ~ABD_FLAG_META;

	ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
	ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
}
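
/*
 * Example (illustrative sketch): handing a pre-existing zio buffer over to an
 * ABD so that it is freed by abd_free() and tracked in the linear kstats.
 * size is a hypothetical caller-provided length:
 *
 *	void *buf = zio_data_buf_alloc(size);
 *	abd_t *abd = abd_get_from_buf(buf, size);
 *	abd_take_ownership_of_buf(abd, B_FALSE);
 *	...
 *	abd_free(abd);		// now frees buf as well
 *
 * Note that abd_free() on a linear ABD calls zio_[data_]buf_free() on the
 * underlying buffer, so this pattern only makes sense for buffers that came
 * from those same caches.
 */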

struct abd_iter {
	abd_t		*iter_abd;	/* ABD being iterated through */
	size_t		iter_pos;	/* position (relative to abd_offset) */
	void		*iter_mapaddr;	/* addr corresponding to iter_pos */
	size_t		iter_mapsize;	/* length of data valid at mapaddr */
};

static inline size_t
abd_iter_scatter_chunk_offset(struct abd_iter *aiter)
{
	ASSERT(!abd_is_linear(aiter->iter_abd));
	return ((aiter->iter_abd->abd_u.abd_scatter.abd_offset +
	    aiter->iter_pos) % zfs_abd_chunk_size);
}

static inline size_t
abd_iter_scatter_chunk_index(struct abd_iter *aiter)
{
	ASSERT(!abd_is_linear(aiter->iter_abd));
	return ((aiter->iter_abd->abd_u.abd_scatter.abd_offset +
	    aiter->iter_pos) / zfs_abd_chunk_size);
}

/*
 * Initialize the abd_iter.
 */
static void
abd_iter_init(struct abd_iter *aiter, abd_t *abd)
{
	abd_verify(abd);
	aiter->iter_abd = abd;
	aiter->iter_pos = 0;
	aiter->iter_mapaddr = NULL;
	aiter->iter_mapsize = 0;
}

/*
 * Advance the iterator by a certain amount. Cannot be called when a chunk is
 * in use. This can be safely called when the iterator has already been
 * exhausted, in which case this does nothing.
 */
static void
abd_iter_advance(struct abd_iter *aiter, size_t amount)
{
	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
	ASSERT0(aiter->iter_mapsize);

	/* There's nothing left to advance to, so do nothing */
	if (aiter->iter_pos == aiter->iter_abd->abd_size)
		return;

	aiter->iter_pos += amount;
}

/*
 * Map the current chunk into aiter. This can be safely called when the
 * iterator has already been exhausted, in which case this does nothing.
 */
static void
abd_iter_map(struct abd_iter *aiter)
{
	void *paddr;
	size_t offset = 0;

	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
	ASSERT0(aiter->iter_mapsize);

	/* Panic if someone has changed zfs_abd_chunk_size */
	IMPLY(!abd_is_linear(aiter->iter_abd), zfs_abd_chunk_size ==
	    aiter->iter_abd->abd_u.abd_scatter.abd_chunk_size);

	/* There's nothing left to iterate over, so do nothing */
	if (aiter->iter_pos == aiter->iter_abd->abd_size)
		return;

	if (abd_is_linear(aiter->iter_abd)) {
		offset = aiter->iter_pos;
		aiter->iter_mapsize = aiter->iter_abd->abd_size - offset;
		paddr = aiter->iter_abd->abd_u.abd_linear.abd_buf;
	} else {
		size_t index = abd_iter_scatter_chunk_index(aiter);
		offset = abd_iter_scatter_chunk_offset(aiter);
		aiter->iter_mapsize = MIN(zfs_abd_chunk_size - offset,
		    aiter->iter_abd->abd_size - aiter->iter_pos);
		paddr = aiter->iter_abd->abd_u.abd_scatter.abd_chunks[index];
	}
	aiter->iter_mapaddr = (char *)paddr + offset;
}

/*
 * Unmap the current chunk from aiter. This can be safely called when the
 * iterator has already been exhausted, in which case this does nothing.
 */
static void
abd_iter_unmap(struct abd_iter *aiter)
{
	/* There's nothing left to unmap, so do nothing */
	if (aiter->iter_pos == aiter->iter_abd->abd_size)
		return;

	ASSERT3P(aiter->iter_mapaddr, !=, NULL);
	ASSERT3U(aiter->iter_mapsize, >, 0);

	aiter->iter_mapaddr = NULL;
	aiter->iter_mapsize = 0;
}

int
abd_iterate_func(abd_t *abd, size_t off, size_t size,
    abd_iter_func_t *func, void *private)
{
	int ret = 0;
	struct abd_iter aiter;

	abd_verify(abd);
	ASSERT3U(off + size, <=, abd->abd_size);

	abd_iter_init(&aiter, abd);
	abd_iter_advance(&aiter, off);

	while (size > 0) {
		abd_iter_map(&aiter);

		size_t len = MIN(aiter.iter_mapsize, size);
		ASSERT3U(len, >, 0);

		ret = func(aiter.iter_mapaddr, len, private);

		abd_iter_unmap(&aiter);

		if (ret != 0)
			break;

		size -= len;
		abd_iter_advance(&aiter, len);
	}

	return (ret);
}
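
/*
 * Example (illustrative sketch): a custom consumer that walks an ABD segment
 * by segment through abd_iterate_func(), e.g. to accumulate a trivial byte
 * sum. The callback, the sum_arg struct and the abd variable are hypothetical;
 * the built-in copy/compare/zero operations below follow the same pattern.
 *
 *	struct sum_arg { uint64_t sum; };
 *
 *	static int
 *	abd_sum_cb(void *buf, size_t size, void *private)
 *	{
 *		struct sum_arg *sa = private;
 *		for (size_t i = 0; i < size; i++)
 *			sa->sum += ((uint8_t *)buf)[i];
 *		return (0);	// nonzero would abort the iteration
 *	}
 *
 *	struct sum_arg sa = { 0 };
 *	(void) abd_iterate_func(abd, 0, abd->abd_size, abd_sum_cb, &sa);
 */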

struct buf_arg {
	void *arg_buf;
};

static int
abd_copy_to_buf_off_cb(void *buf, size_t size, void *private)
{
	struct buf_arg *ba_ptr = private;

	(void) memcpy(ba_ptr->arg_buf, buf, size);
	ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;

	return (0);
}

/*
 * Copy abd to buf. (off is the offset in abd.)
 */
void
abd_copy_to_buf_off(void *buf, abd_t *abd, size_t off, size_t size)
{
	struct buf_arg ba_ptr = { buf };

	(void) abd_iterate_func(abd, off, size, abd_copy_to_buf_off_cb,
	    &ba_ptr);
}

static int
abd_cmp_buf_off_cb(void *buf, size_t size, void *private)
{
	int ret;
	struct buf_arg *ba_ptr = private;

	ret = memcmp(buf, ba_ptr->arg_buf, size);
	ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;

	return (ret);
}

/*
 * Compare the contents of abd to buf. (off is the offset in abd.)
 */
int
abd_cmp_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
{
	struct buf_arg ba_ptr = { (void *) buf };

	return (abd_iterate_func(abd, off, size, abd_cmp_buf_off_cb, &ba_ptr));
}

static int
abd_copy_from_buf_off_cb(void *buf, size_t size, void *private)
{
	struct buf_arg *ba_ptr = private;

	(void) memcpy(buf, ba_ptr->arg_buf, size);
	ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;

	return (0);
}

/*
 * Copy from buf to abd. (off is the offset in abd.)
 */
void
abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
{
	struct buf_arg ba_ptr = { (void *) buf };

	(void) abd_iterate_func(abd, off, size, abd_copy_from_buf_off_cb,
	    &ba_ptr);
}

/*ARGSUSED*/
static int
abd_zero_off_cb(void *buf, size_t size, void *private)
{
	(void) memset(buf, 0, size);
	return (0);
}

/*
 * Zero out the abd from a particular offset to the end.
 */
void
abd_zero_off(abd_t *abd, size_t off, size_t size)
{
	(void) abd_iterate_func(abd, off, size, abd_zero_off_cb, NULL);
}

/*
 * Iterate over two ABDs and call func incrementally on the two ABDs' data in
 * equal-sized chunks (passed to func as raw buffers). func could be called many
 * times during this iteration.
 */
int
abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff,
    size_t size, abd_iter_func2_t *func, void *private)
{
	int ret = 0;
	struct abd_iter daiter, saiter;

	abd_verify(dabd);
	abd_verify(sabd);

	ASSERT3U(doff + size, <=, dabd->abd_size);
	ASSERT3U(soff + size, <=, sabd->abd_size);

	abd_iter_init(&daiter, dabd);
	abd_iter_init(&saiter, sabd);
	abd_iter_advance(&daiter, doff);
	abd_iter_advance(&saiter, soff);

	while (size > 0) {
		abd_iter_map(&daiter);
		abd_iter_map(&saiter);

		size_t dlen = MIN(daiter.iter_mapsize, size);
		size_t slen = MIN(saiter.iter_mapsize, size);
		size_t len = MIN(dlen, slen);
		ASSERT(dlen > 0 || slen > 0);

		ret = func(daiter.iter_mapaddr, saiter.iter_mapaddr, len,
		    private);

		abd_iter_unmap(&saiter);
		abd_iter_unmap(&daiter);

		if (ret != 0)
			break;

		size -= len;
		abd_iter_advance(&daiter, len);
		abd_iter_advance(&saiter, len);
	}

	return (ret);
}

/*ARGSUSED*/
static int
abd_copy_off_cb(void *dbuf, void *sbuf, size_t size, void *private)
{
	(void) memcpy(dbuf, sbuf, size);
	return (0);
}

/*
 * Copy from sabd to dabd starting from soff and doff.
 */
void
abd_copy_off(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size)
{
	(void) abd_iterate_func2(dabd, sabd, doff, soff, size,
	    abd_copy_off_cb, NULL);
}

/*ARGSUSED*/
static int
abd_cmp_cb(void *bufa, void *bufb, size_t size, void *private)
{
	return (memcmp(bufa, bufb, size));
}

/*
 * Compares the first size bytes of two ABDs.
 */
int
abd_cmp(abd_t *dabd, abd_t *sabd, size_t size)
{
	return (abd_iterate_func2(dabd, sabd, 0, 0, size, abd_cmp_cb, NULL));
}
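
/*
 * Example (illustrative sketch): the two-ABD helpers make it easy to move data
 * between ABDs of different types, e.g. staging a scattered ABD into a linear
 * one for a consumer that needs contiguous memory, then verifying the copy.
 * Here src is a hypothetical pre-existing ABD, and abd_copy() is one of the
 * extern inline wrappers declared near the top of this file:
 *
 *	abd_t *dst = abd_alloc_linear(src->abd_size, B_FALSE);
 *	abd_copy(dst, src, src->abd_size);
 *	ASSERT0(abd_cmp(dst, src, src->abd_size));
 */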

/*
 * Iterate over code ABDs and a data ABD and call @func_raidz_gen.
 *
 * @cabds           parity ABDs, must have equal size
 * @dabd            data ABD. Can be NULL (in this case @dsize = 0)
 * @func_raidz_gen  should be implemented so that it behaves the same
 *                  whether it is given linear or scattered buffers
 */
void
abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd,
    ssize_t csize, ssize_t dsize, const unsigned parity,
    void (*func_raidz_gen)(void **, const void *, size_t, size_t))
{
	int i;
	ssize_t len, dlen;
	struct abd_iter caiters[3];
	struct abd_iter daiter = {0};
	void *caddrs[3];

	ASSERT3U(parity, <=, 3);

	for (i = 0; i < parity; i++)
		abd_iter_init(&caiters[i], cabds[i]);

	if (dabd)
		abd_iter_init(&daiter, dabd);

	ASSERT3S(dsize, >=, 0);

#ifdef _KERNEL
	kpreempt_disable();
#endif
	while (csize > 0) {
		len = csize;

		if (dabd && dsize > 0)
			abd_iter_map(&daiter);

		for (i = 0; i < parity; i++) {
			abd_iter_map(&caiters[i]);
			caddrs[i] = caiters[i].iter_mapaddr;
		}

		switch (parity) {
		case 3:
			len = MIN(caiters[2].iter_mapsize, len);
			/* falls through */
		case 2:
			len = MIN(caiters[1].iter_mapsize, len);
			/* falls through */
		case 1:
			len = MIN(caiters[0].iter_mapsize, len);
		}

		/* must be progressive */
		ASSERT3S(len, >, 0);

		if (dabd && dsize > 0) {
			/* this needs precise iter.length */
			len = MIN(daiter.iter_mapsize, len);
			len = MIN(dsize, len);
			dlen = len;
		} else
			dlen = 0;

		/* must be progressive */
		ASSERT3S(len, >, 0);
		/*
		 * The iterated function likely will not do well if each
		 * segment except the last one is not a multiple of 512
		 * (raidz).
		 */
		ASSERT3U(((uint64_t)len & 511ULL), ==, 0);

		func_raidz_gen(caddrs, daiter.iter_mapaddr, len, dlen);

		for (i = parity-1; i >= 0; i--) {
			abd_iter_unmap(&caiters[i]);
			abd_iter_advance(&caiters[i], len);
		}

		if (dabd && dsize > 0) {
			abd_iter_unmap(&daiter);
			abd_iter_advance(&daiter, dlen);
			dsize -= dlen;
		}

		csize -= len;

		ASSERT3S(dsize, >=, 0);
		ASSERT3S(csize, >=, 0);
	}
#ifdef _KERNEL
	kpreempt_enable();
#endif
}

/*
 * Iterate over code ABDs and data reconstruction target ABDs and call
 * @func_raidz_rec. Function maps at most 6 pages atomically.
 *
 * @cabds           parity ABDs, must have equal size
 * @tabds           rec target ABDs, at most 3
 * @tsize           size of data target columns
 * @func_raidz_rec  expects syndrome data in target columns. Function
 *                  reconstructs data and overwrites target columns.
 */
void
abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds,
    ssize_t tsize, const unsigned parity,
    void (*func_raidz_rec)(void **t, const size_t tsize, void **c,
    const unsigned *mul),
    const unsigned *mul)
{
	int i;
	ssize_t len;
	struct abd_iter citers[3];
	struct abd_iter xiters[3];
	void *caddrs[3], *xaddrs[3];

	ASSERT3U(parity, <=, 3);

	for (i = 0; i < parity; i++) {
		abd_iter_init(&citers[i], cabds[i]);
		abd_iter_init(&xiters[i], tabds[i]);
	}

#ifdef _KERNEL
	kpreempt_disable();
#endif
	while (tsize > 0) {

		for (i = 0; i < parity; i++) {
			abd_iter_map(&citers[i]);
			abd_iter_map(&xiters[i]);
			caddrs[i] = citers[i].iter_mapaddr;
			xaddrs[i] = xiters[i].iter_mapaddr;
		}

		len = tsize;
		switch (parity) {
		case 3:
			len = MIN(xiters[2].iter_mapsize, len);
			len = MIN(citers[2].iter_mapsize, len);
			/* falls through */
		case 2:
			len = MIN(xiters[1].iter_mapsize, len);
			len = MIN(citers[1].iter_mapsize, len);
			/* falls through */
		case 1:
			len = MIN(xiters[0].iter_mapsize, len);
			len = MIN(citers[0].iter_mapsize, len);
		}
		/* must be progressive */
		ASSERT3S(len, >, 0);
		/*
		 * The iterated function likely will not do well if each
		 * segment except the last one is not a multiple of 512
		 * (raidz).
		 */
		ASSERT3U(((uint64_t)len & 511ULL), ==, 0);

		func_raidz_rec(xaddrs, len, caddrs, mul);

		for (i = parity-1; i >= 0; i--) {
			abd_iter_unmap(&xiters[i]);
			abd_iter_unmap(&citers[i]);
			abd_iter_advance(&xiters[i], len);
			abd_iter_advance(&citers[i], len);
		}

		tsize -= len;
		ASSERT3S(tsize, >=, 0);
	}
#ifdef _KERNEL
	kpreempt_enable();
#endif
}