/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
 * Copyright (c) 2019 by Delphix. All rights reserved.
 * Copyright (c) 2023, 2024, Klara Inc.
 */

/*
 * See abd.c for a general overview of the arc buffered data (ABD).
 *
 * Linear buffers act exactly like normal buffers and are always mapped into
 * the kernel's virtual memory space, while scattered ABD data chunks are
 * allocated as physical pages and then mapped in only while they are actually
 * being accessed through one of the abd_* library functions. Using scattered
 * ABDs provides several benefits:
 *
 * (1) They avoid use of kmem_*, preventing performance problems where running
 *     kmem_reap on very large memory systems never finishes and causes
 *     constant TLB shootdowns.
 *
 * (2) Fragmentation is less of an issue since when we are at the limit of
 *     allocatable space, we won't have to search around for a long free
 *     hole in the VA space for large ARC allocations. Each chunk is mapped in
 *     individually, so even if we are using HIGHMEM (see next point) we
 *     wouldn't need to worry about finding a contiguous address range.
 *
 * (3) If we are not using HIGHMEM, then all physical memory is always
 *     mapped into the kernel's address space, so we also avoid the map /
 *     unmap costs on each ABD access.
 *
 * If we are not using HIGHMEM, scattered buffers which have only one chunk
 * can be treated as linear buffers, because they are contiguous in the
 * kernel's virtual address space. See abd_alloc_chunks() for details.
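 *
 * For example, an 8K scatter allocation that happens to be satisfied by a
 * single order-1 compound page is physically contiguous, so abd_alloc_chunks()
 * flags it ABD_FLAG_LINEAR_PAGE and the copy in abd_borrow_buf_copy() /
 * abd_return_buf_copy() is avoided.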
 */

#include <sys/abd_impl.h>
#include <sys/param.h>
#include <sys/zio.h>
#include <sys/arc.h>
#include <sys/zfs_context.h>
#include <sys/zfs_znode.h>
#ifdef _KERNEL
#include <linux/kmap_compat.h>
#include <linux/mm_compat.h>
#include <linux/scatterlist.h>
#include <linux/version.h>
#endif

#ifdef _KERNEL
#if defined(MAX_ORDER)
#define	ABD_MAX_ORDER	(MAX_ORDER)
#elif defined(MAX_PAGE_ORDER)
#define	ABD_MAX_ORDER	(MAX_PAGE_ORDER)
#endif
#else
#define	ABD_MAX_ORDER	(1)
#endif

typedef struct abd_stats {
	kstat_named_t abdstat_struct_size;
	kstat_named_t abdstat_linear_cnt;
	kstat_named_t abdstat_linear_data_size;
	kstat_named_t abdstat_scatter_cnt;
	kstat_named_t abdstat_scatter_data_size;
	kstat_named_t abdstat_scatter_chunk_waste;
	kstat_named_t abdstat_scatter_orders[ABD_MAX_ORDER];
	kstat_named_t abdstat_scatter_page_multi_chunk;
	kstat_named_t abdstat_scatter_page_multi_zone;
	kstat_named_t abdstat_scatter_page_alloc_retry;
	kstat_named_t abdstat_scatter_sg_table_retry;
} abd_stats_t;

static abd_stats_t abd_stats = {
	/* Amount of memory occupied by all of the abd_t struct allocations */
	{ "struct_size",		KSTAT_DATA_UINT64 },
	/*
	 * The number of linear ABDs which are currently allocated, excluding
	 * ABDs which don't own their data (for instance the ones which were
	 * allocated through abd_get_offset() and abd_get_from_buf()). If an
	 * ABD takes ownership of its buf then it will become tracked.
	 */
	{ "linear_cnt",			KSTAT_DATA_UINT64 },
	/* Amount of data stored in all linear ABDs tracked by linear_cnt */
	{ "linear_data_size",		KSTAT_DATA_UINT64 },
	/*
	 * The number of scatter ABDs which are currently allocated, excluding
	 * ABDs which don't own their data (for instance the ones which were
	 * allocated through abd_get_offset()).
	 */
	{ "scatter_cnt",		KSTAT_DATA_UINT64 },
	/* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
	{ "scatter_data_size",		KSTAT_DATA_UINT64 },
	/*
	 * The amount of space wasted at the end of the last chunk across all
	 * scatter ABDs tracked by scatter_cnt.
	 */
	{ "scatter_chunk_waste",	KSTAT_DATA_UINT64 },
	/*
	 * The number of compound allocations of a given order. These
	 * allocations are spread over all currently allocated ABDs, and
	 * act as a measure of memory fragmentation.
	 */
	{ { "scatter_order_N",		KSTAT_DATA_UINT64 } },
	/*
	 * The number of scatter ABDs which contain multiple chunks.
	 * ABDs are preferentially allocated from the minimum number of
	 * contiguous multi-page chunks; a single chunk is optimal.
	 */
	{ "scatter_page_multi_chunk",	KSTAT_DATA_UINT64 },
	/*
	 * The number of scatter ABDs which are split across memory zones.
	 * ABDs are preferentially allocated using pages from a single zone.
	 */
	{ "scatter_page_multi_zone",	KSTAT_DATA_UINT64 },
	/*
	 * The total number of retries encountered when attempting to
	 * allocate the pages to populate the scatter ABD.
	 */
	{ "scatter_page_alloc_retry",	KSTAT_DATA_UINT64 },
	/*
	 * The total number of retries encountered when attempting to
	 * allocate the sg table for an ABD.
	 */
	{ "scatter_sg_table_retry",	KSTAT_DATA_UINT64 },
};

static struct {
	wmsum_t abdstat_struct_size;
	wmsum_t abdstat_linear_cnt;
	wmsum_t abdstat_linear_data_size;
	wmsum_t abdstat_scatter_cnt;
	wmsum_t abdstat_scatter_data_size;
	wmsum_t abdstat_scatter_chunk_waste;
	wmsum_t abdstat_scatter_orders[ABD_MAX_ORDER];
	wmsum_t abdstat_scatter_page_multi_chunk;
	wmsum_t abdstat_scatter_page_multi_zone;
	wmsum_t abdstat_scatter_page_alloc_retry;
	wmsum_t abdstat_scatter_sg_table_retry;
} abd_sums;

#define	abd_for_each_sg(abd, sg, n, i)	\
	for_each_sg(ABD_SCATTER(abd).abd_sgl, sg, n, i)

/*
 * zfs_abd_scatter_min_size is the minimum allocation size to use scatter
 * ABD's. Smaller allocations will use linear ABD's, which use
 * zio_[data_]buf_alloc().
 *
 * Scatter ABD's use at least one page each, so sub-page allocations waste
 * some space when allocated as scatter (e.g. a 2KB scatter allocation wastes
 * half of each page). Using linear ABD's for small allocations means that
 * they will be put on slabs which contain many allocations. This can
 * improve memory efficiency, but it also makes it much harder for ARC
 * evictions to actually free pages, because all the buffers on one slab need
 * to be freed in order for the slab (and underlying pages) to be freed.
 * Typically, 512B and 1KB kmem caches have 16 buffers per slab, so it's
 * possible for them to actually waste more memory than scatter (one page per
 * buf = wasting 3/4 or 7/8th; one buf per slab = wasting 15/16th).
 *
 * Spill blocks are typically 512B and are heavily used on systems running
 * selinux with the default dnode size and the `xattr=sa` property set.
 *
 * By default we use linear allocations for 512B and 1KB, and scatter
 * allocations for larger (1.5KB and up).
 */
static int zfs_abd_scatter_min_size = 512 * 3;

/*
 * We use a scattered SPA_MAXBLOCKSIZE sized ABD whose pages are
 * just a single zero'd page. This allows us to conserve memory by
 * only using a single zero page for the scatterlist.
 */
abd_t *abd_zero_scatter = NULL;

struct page;
/*
 * _KERNEL   - Will point to ZERO_PAGE if it is available or it will be
 *             an allocated zero'd PAGESIZE buffer.
 * Userspace - Will be an allocated zero'ed PAGESIZE buffer.
 *
 * abd_zero_page is assigned to each of the pages of abd_zero_scatter.
 */
static struct page *abd_zero_page = NULL;

static kmem_cache_t *abd_cache = NULL;
static kstat_t *abd_ksp;

static uint_t
abd_chunkcnt_for_bytes(size_t size)
{
	return (P2ROUNDUP(size, PAGESIZE) / PAGESIZE);
}

abd_t *
abd_alloc_struct_impl(size_t size)
{
	/*
	 * In Linux we do not use the size passed in during ABD
	 * allocation, so we just ignore it.
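	 * Every abd_t header comes from the same fixed-size abd_cache kmem
	 * cache regardless of the eventual payload size, so there is nothing
	 * useful to do with the size here.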
	 */
	(void) size;
	abd_t *abd = kmem_cache_alloc(abd_cache, KM_PUSHPAGE);
	ASSERT3P(abd, !=, NULL);
	ABDSTAT_INCR(abdstat_struct_size, sizeof (abd_t));

	return (abd);
}

void
abd_free_struct_impl(abd_t *abd)
{
	kmem_cache_free(abd_cache, abd);
	ABDSTAT_INCR(abdstat_struct_size, -(int)sizeof (abd_t));
}

#ifdef _KERNEL
static unsigned zfs_abd_scatter_max_order = ABD_MAX_ORDER - 1;

/*
 * Mark zfs data pages so they can be excluded from kernel crash dumps
 */
#ifdef _LP64
#define	ABD_FILE_CACHE_PAGE	0x2F5ABDF11ECAC4E

static inline void
abd_mark_zfs_page(struct page *page)
{
	get_page(page);
	SetPagePrivate(page);
	set_page_private(page, ABD_FILE_CACHE_PAGE);
}

static inline void
abd_unmark_zfs_page(struct page *page)
{
	set_page_private(page, 0UL);
	ClearPagePrivate(page);
	put_page(page);
}
#else
#define	abd_mark_zfs_page(page)
#define	abd_unmark_zfs_page(page)
#endif /* _LP64 */

#ifndef CONFIG_HIGHMEM

#ifndef __GFP_RECLAIM
#define	__GFP_RECLAIM		__GFP_WAIT
#endif

/*
 * The goal is to minimize fragmentation by preferentially populating ABDs
 * with higher order compound pages from a single zone. Allocation size is
 * progressively decreased until it can be satisfied without performing
 * reclaim or compaction. When necessary this function will degenerate to
 * allocating individual pages and allowing reclaim to satisfy allocations.
 */
void
abd_alloc_chunks(abd_t *abd, size_t size)
{
	struct list_head pages;
	struct sg_table table;
	struct scatterlist *sg;
	struct page *page, *tmp_page = NULL;
	gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
	gfp_t gfp_comp = (gfp | __GFP_NORETRY | __GFP_COMP) & ~__GFP_RECLAIM;
	unsigned int max_order = MIN(zfs_abd_scatter_max_order,
	    ABD_MAX_ORDER - 1);
	unsigned int nr_pages = abd_chunkcnt_for_bytes(size);
	unsigned int chunks = 0, zones = 0;
	size_t remaining_size;
	int nid = NUMA_NO_NODE;
	unsigned int alloc_pages = 0;

	INIT_LIST_HEAD(&pages);

	ASSERT3U(alloc_pages, <, nr_pages);

	while (alloc_pages < nr_pages) {
		unsigned int chunk_pages;
		unsigned int order;

		order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order);
		chunk_pages = (1U << order);

		page = alloc_pages_node(nid, order ?
		    gfp_comp : gfp, order);
		if (page == NULL) {
			if (order == 0) {
				ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
				schedule_timeout_interruptible(1);
			} else {
				max_order = MAX(0, order - 1);
			}
			continue;
		}

		list_add_tail(&page->lru, &pages);

		if ((nid != NUMA_NO_NODE) && (page_to_nid(page) != nid))
			zones++;

		nid = page_to_nid(page);
		ABDSTAT_BUMP(abdstat_scatter_orders[order]);
		chunks++;
		alloc_pages += chunk_pages;
	}

	ASSERT3S(alloc_pages, ==, nr_pages);

	while (sg_alloc_table(&table, chunks, gfp)) {
		ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
		schedule_timeout_interruptible(1);
	}

	sg = table.sgl;
	remaining_size = size;
	list_for_each_entry_safe(page, tmp_page, &pages, lru) {
		size_t sg_size = MIN(PAGESIZE << compound_order(page),
		    remaining_size);
		sg_set_page(sg, page, sg_size, 0);
		abd_mark_zfs_page(page);
		remaining_size -= sg_size;

		sg = sg_next(sg);
		list_del(&page->lru);
	}

	/*
	 * These conditions ensure that a possible transformation to a linear
	 * ABD would be valid.
	 */
	ASSERT(!PageHighMem(sg_page(table.sgl)));
	ASSERT0(ABD_SCATTER(abd).abd_offset);

	if (table.nents == 1) {
		/*
		 * Since there is only one entry, this ABD can be represented
		 * as a linear buffer. All single-page (4K) ABD's can be
		 * represented this way. Some multi-page ABD's can also be
		 * represented this way, if we were able to allocate a single
		 * "chunk" (higher-order "page" which represents a power-of-2
		 * series of physically-contiguous pages). This is often the
		 * case for 2-page (8K) ABD's.
		 *
		 * Representing a single-entry scatter ABD as a linear ABD
		 * has the performance advantage of avoiding the copy (and
		 * allocation) in abd_borrow_buf_copy / abd_return_buf_copy.
		 * A performance increase of around 5% has been observed for
		 * ARC-cached reads (of small blocks which can take advantage
		 * of this).
		 *
		 * Note that this optimization is only possible because the
		 * pages are always mapped into the kernel's address space.
		 * This is not the case for highmem pages, so the
		 * optimization can not be made there.
		 */
		abd->abd_flags |= ABD_FLAG_LINEAR;
		abd->abd_flags |= ABD_FLAG_LINEAR_PAGE;
		abd->abd_u.abd_linear.abd_sgl = table.sgl;
		ABD_LINEAR_BUF(abd) = page_address(sg_page(table.sgl));
	} else if (table.nents > 1) {
		ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
		abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;

		if (zones) {
			ABDSTAT_BUMP(abdstat_scatter_page_multi_zone);
			abd->abd_flags |= ABD_FLAG_MULTI_ZONE;
		}

		ABD_SCATTER(abd).abd_sgl = table.sgl;
		ABD_SCATTER(abd).abd_nents = table.nents;
	}
}
#else

/*
 * Allocate N individual pages to construct a scatter ABD. This function
 * makes no attempt to request contiguous pages and requires the minimal
 * number of kernel interfaces. It's designed for maximum compatibility.
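 *
 * For example, on a 4K-page kernel a 128KB request simply becomes 32
 * independent order-0 page allocations, one per scatterlist entry.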
 */
void
abd_alloc_chunks(abd_t *abd, size_t size)
{
	struct scatterlist *sg = NULL;
	struct sg_table table;
	struct page *page;
	gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
	int nr_pages = abd_chunkcnt_for_bytes(size);
	int i = 0;

	while (sg_alloc_table(&table, nr_pages, gfp)) {
		ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
		schedule_timeout_interruptible(1);
	}

	ASSERT3U(table.nents, ==, nr_pages);
	ABD_SCATTER(abd).abd_sgl = table.sgl;
	ABD_SCATTER(abd).abd_nents = nr_pages;

	abd_for_each_sg(abd, sg, nr_pages, i) {
		while ((page = __page_cache_alloc(gfp)) == NULL) {
			ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
			schedule_timeout_interruptible(1);
		}

		ABDSTAT_BUMP(abdstat_scatter_orders[0]);
		sg_set_page(sg, page, PAGESIZE, 0);
		abd_mark_zfs_page(page);
	}

	if (nr_pages > 1) {
		ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
		abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
	}
}
#endif /* !CONFIG_HIGHMEM */

/*
 * This must be called if any of the sg_table allocation functions
 * are called.
 */
static void
abd_free_sg_table(abd_t *abd)
{
	struct sg_table table;

	table.sgl = ABD_SCATTER(abd).abd_sgl;
	table.nents = table.orig_nents = ABD_SCATTER(abd).abd_nents;
	sg_free_table(&table);
}

void
abd_free_chunks(abd_t *abd)
{
	struct scatterlist *sg = NULL;
	struct page *page;
	int nr_pages = ABD_SCATTER(abd).abd_nents;
	int order, i = 0;

	if (abd->abd_flags & ABD_FLAG_MULTI_ZONE)
		ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_zone);

	if (abd->abd_flags & ABD_FLAG_MULTI_CHUNK)
		ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk);

	abd_for_each_sg(abd, sg, nr_pages, i) {
		page = sg_page(sg);
		abd_unmark_zfs_page(page);
		order = compound_order(page);
		__free_pages(page, order);
		ASSERT3U(sg->length, <=, PAGE_SIZE << order);
		ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]);
	}
	abd_free_sg_table(abd);
}

/*
 * Allocate scatter ABD of size SPA_MAXBLOCKSIZE, where each page in
 * the scatterlist will be set to the zero'd out buffer abd_zero_page.
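 * Every scatterlist entry references the same physical page, so the full
 * SPA_MAXBLOCKSIZE logical buffer is backed by only one PAGESIZE page of
 * memory (plus the scatterlist itself).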
 */
static void
abd_alloc_zero_scatter(void)
{
	struct scatterlist *sg = NULL;
	struct sg_table table;
	gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
	int nr_pages = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
	int i = 0;

#if defined(HAVE_ZERO_PAGE_GPL_ONLY)
	gfp_t gfp_zero_page = gfp | __GFP_ZERO;
	while ((abd_zero_page = __page_cache_alloc(gfp_zero_page)) == NULL) {
		ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
		schedule_timeout_interruptible(1);
	}
	abd_mark_zfs_page(abd_zero_page);
#else
	abd_zero_page = ZERO_PAGE(0);
#endif /* HAVE_ZERO_PAGE_GPL_ONLY */

	while (sg_alloc_table(&table, nr_pages, gfp)) {
		ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
		schedule_timeout_interruptible(1);
	}
	ASSERT3U(table.nents, ==, nr_pages);

	abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
	abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER;
	ABD_SCATTER(abd_zero_scatter).abd_offset = 0;
	ABD_SCATTER(abd_zero_scatter).abd_sgl = table.sgl;
	ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages;
	abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
	abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK | ABD_FLAG_ZEROS;

	abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) {
		sg_set_page(sg, abd_zero_page, PAGESIZE, 0);
	}

	ABDSTAT_BUMP(abdstat_scatter_cnt);
	ABDSTAT_INCR(abdstat_scatter_data_size, PAGESIZE);
	ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
}

#else /* _KERNEL */

#ifndef PAGE_SHIFT
#define	PAGE_SHIFT (highbit64(PAGESIZE)-1)
#endif

#define	zfs_kmap_atomic(chunk)		((void *)chunk)
#define	zfs_kunmap_atomic(addr)		do { (void)(addr); } while (0)
#define	local_irq_save(flags)		do { (void)(flags); } while (0)
#define	local_irq_restore(flags)	do { (void)(flags); } while (0)
#define	nth_page(pg, i) \
	((struct page *)((void *)(pg) + (i) * PAGESIZE))

struct scatterlist {
	struct page *page;
	int length;
	int end;
};

static void
sg_init_table(struct scatterlist *sg, int nr)
{
	memset(sg, 0, nr * sizeof (struct scatterlist));
	sg[nr - 1].end = 1;
}

/*
 * This must be called if any of the sg_table allocation functions
 * are called.
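 *
 * In the userspace build there is no kernel sg_table; the emulated
 * scatterlist is just a vmem array of abd_nents entries, so freeing it is a
 * single vmem_free() of that array.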
 */
static void
abd_free_sg_table(abd_t *abd)
{
	int nents = ABD_SCATTER(abd).abd_nents;
	vmem_free(ABD_SCATTER(abd).abd_sgl,
	    nents * sizeof (struct scatterlist));
}

#define	for_each_sg(sgl, sg, nr, i)	\
	for ((i) = 0, (sg) = (sgl); (i) < (nr); (i)++, (sg) = sg_next(sg))

static inline void
sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len,
    unsigned int offset)
{
	/* currently we don't use offset */
	ASSERT(offset == 0);
	sg->page = page;
	sg->length = len;
}

static inline struct page *
sg_page(struct scatterlist *sg)
{
	return (sg->page);
}

static inline struct scatterlist *
sg_next(struct scatterlist *sg)
{
	if (sg->end)
		return (NULL);

	return (sg + 1);
}

void
abd_alloc_chunks(abd_t *abd, size_t size)
{
	unsigned nr_pages = abd_chunkcnt_for_bytes(size);
	struct scatterlist *sg;
	int i;

	ABD_SCATTER(abd).abd_sgl = vmem_alloc(nr_pages *
	    sizeof (struct scatterlist), KM_SLEEP);
	sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages);

	abd_for_each_sg(abd, sg, nr_pages, i) {
		struct page *p = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
		sg_set_page(sg, p, PAGESIZE, 0);
	}
	ABD_SCATTER(abd).abd_nents = nr_pages;
}

void
abd_free_chunks(abd_t *abd)
{
	int i, n = ABD_SCATTER(abd).abd_nents;
	struct scatterlist *sg;

	abd_for_each_sg(abd, sg, n, i) {
		struct page *p = nth_page(sg_page(sg), 0);
		umem_free_aligned(p, PAGESIZE);
	}
	abd_free_sg_table(abd);
}

static void
abd_alloc_zero_scatter(void)
{
	unsigned nr_pages = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
	struct scatterlist *sg;
	int i;

	abd_zero_page = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
	memset(abd_zero_page, 0, PAGESIZE);
	abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
	abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER;
	abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK | ABD_FLAG_ZEROS;
	ABD_SCATTER(abd_zero_scatter).abd_offset = 0;
	ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages;
	abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
	ABD_SCATTER(abd_zero_scatter).abd_sgl = vmem_alloc(nr_pages *
	    sizeof (struct scatterlist), KM_SLEEP);

	sg_init_table(ABD_SCATTER(abd_zero_scatter).abd_sgl, nr_pages);

	abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) {
		sg_set_page(sg, abd_zero_page, PAGESIZE, 0);
	}

	ABDSTAT_BUMP(abdstat_scatter_cnt);
	ABDSTAT_INCR(abdstat_scatter_data_size, PAGESIZE);
	ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
}

#endif /* _KERNEL */

boolean_t
abd_size_alloc_linear(size_t size)
{
	return (!zfs_abd_scatter_enabled || size < zfs_abd_scatter_min_size);
}

void
abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op)
{
	ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
	int waste = P2ROUNDUP(abd->abd_size, PAGESIZE) - abd->abd_size;
	if (op == ABDSTAT_INCR) {
		ABDSTAT_BUMP(abdstat_scatter_cnt);
		ABDSTAT_INCR(abdstat_scatter_data_size, abd->abd_size);
		ABDSTAT_INCR(abdstat_scatter_chunk_waste, waste);
		arc_space_consume(waste, ARC_SPACE_ABD_CHUNK_WASTE);
	} else {
		ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
		ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size);
		ABDSTAT_INCR(abdstat_scatter_chunk_waste, -waste);
		arc_space_return(waste, ARC_SPACE_ABD_CHUNK_WASTE);
	}
}

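/*
 * Update the linear ABD kstats (linear_cnt and linear_data_size, described
 * at the top of this file). Unlike abd_update_scatter_stats() above there is
 * no per-chunk waste to account for, so no arc_space adjustment is made.
 */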
void
abd_update_linear_stats(abd_t *abd, abd_stats_op_t op)
{
	ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
	if (op == ABDSTAT_INCR) {
		ABDSTAT_BUMP(abdstat_linear_cnt);
		ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size);
	} else {
		ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
		ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
	}
}

void
abd_verify_scatter(abd_t *abd)
{
	size_t n;
	int i = 0;
	struct scatterlist *sg = NULL;

	ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0);
	ASSERT3U(ABD_SCATTER(abd).abd_offset, <,
	    ABD_SCATTER(abd).abd_sgl->length);
	n = ABD_SCATTER(abd).abd_nents;
	abd_for_each_sg(abd, sg, n, i) {
		ASSERT3P(sg_page(sg), !=, NULL);
	}
}

static void
abd_free_zero_scatter(void)
{
	ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
	ABDSTAT_INCR(abdstat_scatter_data_size, -(int)PAGESIZE);
	ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk);

	abd_free_sg_table(abd_zero_scatter);
	abd_free_struct(abd_zero_scatter);
	abd_zero_scatter = NULL;
	ASSERT3P(abd_zero_page, !=, NULL);
#if defined(_KERNEL)
#if defined(HAVE_ZERO_PAGE_GPL_ONLY)
	abd_unmark_zfs_page(abd_zero_page);
	__free_page(abd_zero_page);
#endif /* HAVE_ZERO_PAGE_GPL_ONLY */
#else
	umem_free_aligned(abd_zero_page, PAGESIZE);
#endif /* _KERNEL */
}

static int
abd_kstats_update(kstat_t *ksp, int rw)
{
	abd_stats_t *as = ksp->ks_data;

	if (rw == KSTAT_WRITE)
		return (EACCES);
	as->abdstat_struct_size.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_struct_size);
	as->abdstat_linear_cnt.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_linear_cnt);
	as->abdstat_linear_data_size.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_linear_data_size);
	as->abdstat_scatter_cnt.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_cnt);
	as->abdstat_scatter_data_size.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_data_size);
	as->abdstat_scatter_chunk_waste.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_chunk_waste);
	for (int i = 0; i < ABD_MAX_ORDER; i++) {
		as->abdstat_scatter_orders[i].value.ui64 =
		    wmsum_value(&abd_sums.abdstat_scatter_orders[i]);
	}
	as->abdstat_scatter_page_multi_chunk.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_page_multi_chunk);
	as->abdstat_scatter_page_multi_zone.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_page_multi_zone);
	as->abdstat_scatter_page_alloc_retry.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_page_alloc_retry);
	as->abdstat_scatter_sg_table_retry.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_sg_table_retry);
	return (0);
}

void
abd_init(void)
{
	int i;

	abd_cache = kmem_cache_create("abd_t", sizeof (abd_t),
	    0, NULL, NULL, NULL, NULL, NULL, 0);

	wmsum_init(&abd_sums.abdstat_struct_size, 0);
	wmsum_init(&abd_sums.abdstat_linear_cnt, 0);
	wmsum_init(&abd_sums.abdstat_linear_data_size, 0);
	wmsum_init(&abd_sums.abdstat_scatter_cnt, 0);
	wmsum_init(&abd_sums.abdstat_scatter_data_size, 0);
	wmsum_init(&abd_sums.abdstat_scatter_chunk_waste, 0);
	for (i = 0; i < ABD_MAX_ORDER; i++)
		wmsum_init(&abd_sums.abdstat_scatter_orders[i], 0);
	wmsum_init(&abd_sums.abdstat_scatter_page_multi_chunk, 0);
	wmsum_init(&abd_sums.abdstat_scatter_page_multi_zone, 0);
	wmsum_init(&abd_sums.abdstat_scatter_page_alloc_retry, 0);
	wmsum_init(&abd_sums.abdstat_scatter_sg_table_retry, 0);

	abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED,
	    sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
	if (abd_ksp != NULL) {
		for (i = 0; i < ABD_MAX_ORDER; i++) {
			snprintf(abd_stats.abdstat_scatter_orders[i].name,
			    KSTAT_STRLEN, "scatter_order_%d", i);
			abd_stats.abdstat_scatter_orders[i].data_type =
			    KSTAT_DATA_UINT64;
		}
		abd_ksp->ks_data = &abd_stats;
		abd_ksp->ks_update = abd_kstats_update;
		kstat_install(abd_ksp);
	}

	abd_alloc_zero_scatter();
}

void
abd_fini(void)
{
	abd_free_zero_scatter();

	if (abd_ksp != NULL) {
		kstat_delete(abd_ksp);
		abd_ksp = NULL;
	}

	wmsum_fini(&abd_sums.abdstat_struct_size);
	wmsum_fini(&abd_sums.abdstat_linear_cnt);
	wmsum_fini(&abd_sums.abdstat_linear_data_size);
	wmsum_fini(&abd_sums.abdstat_scatter_cnt);
	wmsum_fini(&abd_sums.abdstat_scatter_data_size);
	wmsum_fini(&abd_sums.abdstat_scatter_chunk_waste);
	for (int i = 0; i < ABD_MAX_ORDER; i++)
		wmsum_fini(&abd_sums.abdstat_scatter_orders[i]);
	wmsum_fini(&abd_sums.abdstat_scatter_page_multi_chunk);
	wmsum_fini(&abd_sums.abdstat_scatter_page_multi_zone);
	wmsum_fini(&abd_sums.abdstat_scatter_page_alloc_retry);
	wmsum_fini(&abd_sums.abdstat_scatter_sg_table_retry);

	if (abd_cache) {
		kmem_cache_destroy(abd_cache);
		abd_cache = NULL;
	}
}

void
abd_free_linear_page(abd_t *abd)
{
	/* Transform it back into a scatter ABD for freeing */
	struct scatterlist *sg = abd->abd_u.abd_linear.abd_sgl;
	abd->abd_flags &= ~ABD_FLAG_LINEAR;
	abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE;
	ABD_SCATTER(abd).abd_nents = 1;
	ABD_SCATTER(abd).abd_offset = 0;
	ABD_SCATTER(abd).abd_sgl = sg;
	abd_free_chunks(abd);

	abd_update_scatter_stats(abd, ABDSTAT_DECR);
}

/*
 * If we're going to use this ABD for doing I/O using the block layer, the
 * consumer of the ABD data doesn't care whether it's scattered or not, and
 * we don't plan to store this ABD in memory for a long period of time, then
 * we should allocate the ABD type that requires the least data copying to
 * do the I/O.
 *
 * On Linux the optimal thing to do would be to use abd_get_offset() and
 * construct a new ABD which shares the original pages, thereby eliminating
 * the copy. But for the moment a new linear ABD is allocated until this
 * performance optimization can be implemented.
 */
abd_t *
abd_alloc_for_io(size_t size, boolean_t is_metadata)
{
	return (abd_alloc(size, is_metadata));
}

abd_t *
abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off,
    size_t size)
{
	(void) size;
	int i = 0;
	struct scatterlist *sg = NULL;

	abd_verify(sabd);
	ASSERT3U(off, <=, sabd->abd_size);

	size_t new_offset = ABD_SCATTER(sabd).abd_offset + off;

	if (abd == NULL)
		abd = abd_alloc_struct(0);

	/*
	 * Even if this buf is filesystem metadata, we only track that
	 * if we own the underlying data buffer, which is not true in
	 * this case. Therefore, we don't ever use ABD_FLAG_META here.
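	 *
	 * The loop below walks the source scatterlist until it reaches the
	 * entry containing the requested offset; the new ABD then shares the
	 * remaining entries with the source rather than copying any data.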
	 */

	abd_for_each_sg(sabd, sg, ABD_SCATTER(sabd).abd_nents, i) {
		if (new_offset < sg->length)
			break;
		new_offset -= sg->length;
	}

	ABD_SCATTER(abd).abd_sgl = sg;
	ABD_SCATTER(abd).abd_offset = new_offset;
	ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - i;

	return (abd);
}

/*
 * Initialize the abd_iter.
 */
void
abd_iter_init(struct abd_iter *aiter, abd_t *abd)
{
	ASSERT(!abd_is_gang(abd));
	abd_verify(abd);
	memset(aiter, 0, sizeof (struct abd_iter));
	aiter->iter_abd = abd;
	if (!abd_is_linear(abd)) {
		aiter->iter_offset = ABD_SCATTER(abd).abd_offset;
		aiter->iter_sg = ABD_SCATTER(abd).abd_sgl;
	}
}

/*
 * This is just a helper function to see if we have exhausted the
 * abd_iter and reached the end.
 */
boolean_t
abd_iter_at_end(struct abd_iter *aiter)
{
	ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size);
	return (aiter->iter_pos == aiter->iter_abd->abd_size);
}

/*
 * Advance the iterator by a certain amount. Cannot be called when a chunk is
 * in use. This can be safely called when the iterator has already been
 * exhausted, in which case this does nothing.
 */
void
abd_iter_advance(struct abd_iter *aiter, size_t amount)
{
	/*
	 * Ensure that the last chunk is not in use. abd_iterate_*() must
	 * clear this state (directly or via abd_iter_unmap()) before
	 * advancing.
	 */
	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
	ASSERT0(aiter->iter_mapsize);
	ASSERT3P(aiter->iter_page, ==, NULL);
	ASSERT0(aiter->iter_page_doff);
	ASSERT0(aiter->iter_page_dsize);

	/* There's nothing left to advance to, so do nothing */
	if (abd_iter_at_end(aiter))
		return;

	aiter->iter_pos += amount;
	aiter->iter_offset += amount;
	if (!abd_is_linear(aiter->iter_abd)) {
		while (aiter->iter_offset >= aiter->iter_sg->length) {
			aiter->iter_offset -= aiter->iter_sg->length;
			aiter->iter_sg = sg_next(aiter->iter_sg);
			if (aiter->iter_sg == NULL) {
				ASSERT0(aiter->iter_offset);
				break;
			}
		}
	}
}

/*
 * Map the current chunk into aiter. This can be safely called when the
 * iterator has already been exhausted, in which case this does nothing.
 */
void
abd_iter_map(struct abd_iter *aiter)
{
	void *paddr;
	size_t offset = 0;

	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
	ASSERT0(aiter->iter_mapsize);

	/* There's nothing left to iterate over, so do nothing */
	if (abd_iter_at_end(aiter))
		return;

	if (abd_is_linear(aiter->iter_abd)) {
		ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);
		offset = aiter->iter_offset;
		aiter->iter_mapsize = aiter->iter_abd->abd_size - offset;
		paddr = ABD_LINEAR_BUF(aiter->iter_abd);
	} else {
		offset = aiter->iter_offset;
		aiter->iter_mapsize = MIN(aiter->iter_sg->length - offset,
		    aiter->iter_abd->abd_size - aiter->iter_pos);

		paddr = zfs_kmap_atomic(sg_page(aiter->iter_sg));
	}

	aiter->iter_mapaddr = (char *)paddr + offset;
}

/*
 * Unmap the current chunk from aiter. This can be safely called when the
 * iterator has already been exhausted, in which case this does nothing.
 */
void
abd_iter_unmap(struct abd_iter *aiter)
{
	/* There's nothing left to unmap, so do nothing */
	if (abd_iter_at_end(aiter))
		return;

	if (!abd_is_linear(aiter->iter_abd)) {
		/* LINTED E_FUNC_SET_NOT_USED */
		zfs_kunmap_atomic(aiter->iter_mapaddr - aiter->iter_offset);
	}

	ASSERT3P(aiter->iter_mapaddr, !=, NULL);
	ASSERT3U(aiter->iter_mapsize, >, 0);

	aiter->iter_mapaddr = NULL;
	aiter->iter_mapsize = 0;
}

void
abd_cache_reap_now(void)
{
}

#if defined(_KERNEL)

/*
 * This is abd_iter_page(), the function underneath abd_iterate_page_func().
 * It yields the next page struct and data offset and size within it, without
 * mapping it into the address space.
 */

/*
 * "Compound pages" are a group of pages that can be referenced from a single
 * struct page *. It's organised as a "head" page, followed by a series of
 * "tail" pages.
 *
 * In OpenZFS, compound pages are allocated using the __GFP_COMP flag, which we
 * get from scatter ABDs and SPL vmalloc slabs (ie >16K allocations). So a
 * great many of the IO buffers we get are going to be of this type.
 *
 * The tail pages are just regular PAGESIZE pages, and can be safely used
 * as-is. However, the head page has length covering itself and all the tail
 * pages. If the ABD chunk spans multiple pages, then we can use the head page
 * and a >PAGESIZE length, which is far more efficient.
 *
 * Before kernel 4.5 however, compound page heads were refcounted separately
 * from tail pages, such that moving back to the head page would require us to
 * take a reference to it and release it once we're completely finished with
 * it. In practice, that means when our caller is done with the ABD, which we
 * have no insight into from here. Rather than contort this API to track head
 * page references on such ancient kernels, we disable this special compound
 * page handling on them, instead just treating each page within the compound
 * page as a regular PAGESIZE page (which it is). This is slightly less
 * efficient, but makes everything far simpler.
 *
 * The below test sets/clears ABD_ITER_COMPOUND_PAGES to enable/disable the
 * special handling, and also defines the ABD_ITER_PAGE_SIZE(page) macro to
 * understand compound pages, or not, as required.
 */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0)
#define	ABD_ITER_COMPOUND_PAGES		1
#define	ABD_ITER_PAGE_SIZE(page)	\
	(PageCompound(page) ? page_size(page) : PAGESIZE)
#else
#undef ABD_ITER_COMPOUND_PAGES
#define	ABD_ITER_PAGE_SIZE(page)	(PAGESIZE)
#endif

void
abd_iter_page(struct abd_iter *aiter)
{
	if (abd_iter_at_end(aiter)) {
		aiter->iter_page = NULL;
		aiter->iter_page_doff = 0;
		aiter->iter_page_dsize = 0;
		return;
	}

	struct page *page;
	size_t doff, dsize;

	/*
	 * Find the page, and the start of the data within it. This is
	 * computed differently for linear and scatter ABDs; linear is
	 * referenced by virtual memory location, while scatter is referenced
	 * by page pointer.
	 */
	if (abd_is_linear(aiter->iter_abd)) {
		ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);

		/* memory address at iter_pos */
		void *paddr = ABD_LINEAR_BUF(aiter->iter_abd) + aiter->iter_pos;

		/* struct page for address */
		page = is_vmalloc_addr(paddr) ?
		    vmalloc_to_page(paddr) : virt_to_page(paddr);

		/* offset of address within the page */
		doff = offset_in_page(paddr);
	} else {
		ASSERT(!abd_is_gang(aiter->iter_abd));

		/* current scatter page */
		page = nth_page(sg_page(aiter->iter_sg),
		    aiter->iter_offset >> PAGE_SHIFT);

		/* position within page */
		doff = aiter->iter_offset & (PAGESIZE - 1);
	}

#ifdef ABD_ITER_COMPOUND_PAGES
	if (PageTail(page)) {
		/*
		 * If this is a compound tail page, move back to the head, and
		 * adjust the offset to match. This may let us yield a much
		 * larger amount of data from a single logical page, and so
		 * leave our caller with fewer pages to process.
		 */
		struct page *head = compound_head(page);
		doff += ((page - head) * PAGESIZE);
		page = head;
	}
#endif

	ASSERT(page);

	/*
	 * Compute the maximum amount of data we can take from this page. This
	 * is the smaller of:
	 * - the remaining space in the page
	 * - the remaining space in this scatterlist entry (which may not cover
	 *   the entire page)
	 * - the remaining space in the abd (which may not cover the entire
	 *   scatterlist entry)
	 */
	dsize = MIN(ABD_ITER_PAGE_SIZE(page) - doff,
	    aiter->iter_abd->abd_size - aiter->iter_pos);
	if (!abd_is_linear(aiter->iter_abd))
		dsize = MIN(dsize, aiter->iter_sg->length - aiter->iter_offset);
	ASSERT3U(dsize, >, 0);

	/* final iterator outputs */
	aiter->iter_page = page;
	aiter->iter_page_doff = doff;
	aiter->iter_page_dsize = dsize;
}

/*
 * Note: ABD BIO functions only needed to support vdev_classic. See comments in
 * vdev_disk.c.
 */

/*
 * bio_nr_pages for ABD.
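 * Returns the number of memory pages spanned by [off, off + size); for gang
 * ABDs the count is accumulated across each child ABD covered by the range.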
 * @off is the offset in @abd
 */
unsigned long
abd_nr_pages_off(abd_t *abd, unsigned int size, size_t off)
{
	unsigned long pos;

	if (abd_is_gang(abd)) {
		unsigned long count = 0;

		for (abd_t *cabd = abd_gang_get_offset(abd, &off);
		    cabd != NULL && size != 0;
		    cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
			ASSERT3U(off, <, cabd->abd_size);
			int mysize = MIN(size, cabd->abd_size - off);
			count += abd_nr_pages_off(cabd, mysize, off);
			size -= mysize;
			off = 0;
		}
		return (count);
	}

	if (abd_is_linear(abd))
		pos = (unsigned long)abd_to_buf(abd) + off;
	else
		pos = ABD_SCATTER(abd).abd_offset + off;

	return (((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) -
	    (pos >> PAGE_SHIFT));
}

static unsigned int
bio_map(struct bio *bio, void *buf_ptr, unsigned int bio_size)
{
	unsigned int offset, size, i;
	struct page *page;

	offset = offset_in_page(buf_ptr);
	for (i = 0; i < bio->bi_max_vecs; i++) {
		size = PAGE_SIZE - offset;

		if (bio_size <= 0)
			break;

		if (size > bio_size)
			size = bio_size;

		if (is_vmalloc_addr(buf_ptr))
			page = vmalloc_to_page(buf_ptr);
		else
			page = virt_to_page(buf_ptr);

		/*
		 * Some network-related block devices use tcp_sendpage, which
		 * doesn't behave well when given a 0-count page; this is a
		 * safety net to catch them.
		 */
		ASSERT3S(page_count(page), >, 0);

		if (bio_add_page(bio, page, size, offset) != size)
			break;

		buf_ptr += size;
		bio_size -= size;
		offset = 0;
	}

	return (bio_size);
}

/*
 * bio_map for gang ABD.
 */
static unsigned int
abd_gang_bio_map_off(struct bio *bio, abd_t *abd,
    unsigned int io_size, size_t off)
{
	ASSERT(abd_is_gang(abd));

	for (abd_t *cabd = abd_gang_get_offset(abd, &off);
	    cabd != NULL;
	    cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
		ASSERT3U(off, <, cabd->abd_size);
		int size = MIN(io_size, cabd->abd_size - off);
		int remainder = abd_bio_map_off(bio, cabd, size, off);
		io_size -= (size - remainder);
		if (io_size == 0 || remainder > 0)
			return (io_size);
		off = 0;
	}
	ASSERT0(io_size);
	return (io_size);
}

/*
 * bio_map for ABD.
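 * Pages are added to the bio one PAGESIZE vec at a time until either the
 * whole range is mapped or the bio runs out of iovecs.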
 * @off is the offset in @abd
 * Remaining IO size is returned
 */
unsigned int
abd_bio_map_off(struct bio *bio, abd_t *abd,
    unsigned int io_size, size_t off)
{
	struct abd_iter aiter;

	ASSERT3U(io_size, <=, abd->abd_size - off);
	if (abd_is_linear(abd))
		return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, io_size));

	ASSERT(!abd_is_linear(abd));
	if (abd_is_gang(abd))
		return (abd_gang_bio_map_off(bio, abd, io_size, off));

	abd_iter_init(&aiter, abd);
	abd_iter_advance(&aiter, off);

	for (int i = 0; i < bio->bi_max_vecs; i++) {
		struct page *pg;
		size_t len, sgoff, pgoff;
		struct scatterlist *sg;

		if (io_size <= 0)
			break;

		sg = aiter.iter_sg;
		sgoff = aiter.iter_offset;
		pgoff = sgoff & (PAGESIZE - 1);
		len = MIN(io_size, PAGESIZE - pgoff);
		ASSERT(len > 0);

		pg = nth_page(sg_page(sg), sgoff >> PAGE_SHIFT);
		if (bio_add_page(bio, pg, len, pgoff) != len)
			break;

		io_size -= len;
		abd_iter_advance(&aiter, len);
	}

	return (io_size);
}

/* Tunable Parameters */
module_param(zfs_abd_scatter_enabled, int, 0644);
MODULE_PARM_DESC(zfs_abd_scatter_enabled,
	"Toggle whether ABD allocations must be linear.");
module_param(zfs_abd_scatter_min_size, int, 0644);
MODULE_PARM_DESC(zfs_abd_scatter_min_size,
	"Minimum size of scatter allocations.");
/* CSTYLED */
module_param(zfs_abd_scatter_max_order, uint, 0644);
MODULE_PARM_DESC(zfs_abd_scatter_max_order,
	"Maximum order allocation used for a scatter ABD.");

#endif /* _KERNEL */