// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
 * Copyright (c) 2019 by Delphix. All rights reserved.
 * Copyright (c) 2023, 2024, Klara Inc.
 */

/*
 * See abd.c for a general overview of the arc buffered data (ABD).
 *
 * Linear buffers act exactly like normal buffers and are always mapped into
 * the kernel's virtual memory space, while scattered ABD data chunks are
 * allocated as physical pages and then mapped in only while they are actually
 * being accessed through one of the abd_* library functions. Using scattered
 * ABDs provides several benefits:
 *
 * (1) They avoid use of kmem_*, preventing performance problems where running
 *     kmem_reap on very large memory systems never finishes and causes
 *     constant TLB shootdowns.
 *
 * (2) Fragmentation is less of an issue since when we are at the limit of
 *     allocatable space, we won't have to search around for a long free
 *     hole in the VA space for large ARC allocations. Each chunk is mapped in
 *     individually, so even if we are using HIGHMEM (see next point) we
 *     wouldn't need to worry about finding a contiguous address range.
 *
 * (3) If we are not using HIGHMEM, then all physical memory is always
 *     mapped into the kernel's address space, so we also avoid the map /
 *     unmap costs on each ABD access.
 *
 * If we are not using HIGHMEM, scattered buffers which have only one chunk
 * can be treated as linear buffers, because they are contiguous in the
 * kernel's virtual address space. See abd_alloc_chunks() for details.
 */

#include <sys/abd_impl.h>
#include <sys/param.h>
#include <sys/zio.h>
#include <sys/arc.h>
#include <sys/zfs_context.h>
#include <sys/zfs_znode.h>
#include <linux/kmap_compat.h>
#include <linux/mm_compat.h>
#include <linux/scatterlist.h>
#include <linux/version.h>

#if defined(MAX_ORDER)
#define	ABD_MAX_ORDER	(MAX_ORDER)
#elif defined(MAX_PAGE_ORDER)
#define	ABD_MAX_ORDER	(MAX_PAGE_ORDER)
#endif

typedef struct abd_stats {
	kstat_named_t abdstat_struct_size;
	kstat_named_t abdstat_linear_cnt;
	kstat_named_t abdstat_linear_data_size;
	kstat_named_t abdstat_scatter_cnt;
	kstat_named_t abdstat_scatter_data_size;
	kstat_named_t abdstat_scatter_chunk_waste;
	kstat_named_t abdstat_scatter_orders[ABD_MAX_ORDER];
	kstat_named_t abdstat_scatter_page_multi_chunk;
	kstat_named_t abdstat_scatter_page_multi_zone;
	kstat_named_t abdstat_scatter_page_alloc_retry;
	kstat_named_t abdstat_scatter_sg_table_retry;
} abd_stats_t;

static abd_stats_t abd_stats = {
	/* Amount of memory occupied by all of the abd_t struct allocations */
	{ "struct_size",			KSTAT_DATA_UINT64 },
	/*
	 * The number of linear ABDs which are currently allocated, excluding
	 * ABDs which don't own their data (for instance the ones which were
	 * allocated through abd_get_offset() and abd_get_from_buf()). If an
	 * ABD takes ownership of its buf then it will become tracked.
	 */
	{ "linear_cnt",				KSTAT_DATA_UINT64 },
	/* Amount of data stored in all linear ABDs tracked by linear_cnt */
	{ "linear_data_size",			KSTAT_DATA_UINT64 },
	/*
	 * The number of scatter ABDs which are currently allocated, excluding
	 * ABDs which don't own their data (for instance the ones which were
	 * allocated through abd_get_offset()).
	 */
	{ "scatter_cnt",			KSTAT_DATA_UINT64 },
	/* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
	{ "scatter_data_size",			KSTAT_DATA_UINT64 },
	/*
	 * The amount of space wasted at the end of the last chunk across all
	 * scatter ABDs tracked by scatter_cnt.
	 */
	{ "scatter_chunk_waste",		KSTAT_DATA_UINT64 },
	/*
	 * The number of compound allocations of a given order. These
	 * allocations are spread over all currently allocated ABDs, and
	 * act as a measure of memory fragmentation.
	 */
	{ { "scatter_order_N",			KSTAT_DATA_UINT64 } },
	/*
	 * The number of scatter ABDs which contain multiple chunks.
	 * ABDs are preferentially allocated from the minimum number of
	 * contiguous multi-page chunks; a single chunk is optimal.
	 */
	{ "scatter_page_multi_chunk",		KSTAT_DATA_UINT64 },
	/*
	 * The number of scatter ABDs which are split across memory zones.
	 * ABDs are preferentially allocated using pages from a single zone.
	 */
	{ "scatter_page_multi_zone",		KSTAT_DATA_UINT64 },
	/*
	 * The total number of retries encountered when attempting to
	 * allocate the pages to populate the scatter ABD.
	 */
	{ "scatter_page_alloc_retry",		KSTAT_DATA_UINT64 },
	/*
	 * The total number of retries encountered when attempting to
	 * allocate the sg table for an ABD.
	 */
	{ "scatter_sg_table_retry",		KSTAT_DATA_UINT64 },
};

static struct {
	wmsum_t abdstat_struct_size;
	wmsum_t abdstat_linear_cnt;
	wmsum_t abdstat_linear_data_size;
	wmsum_t abdstat_scatter_cnt;
	wmsum_t abdstat_scatter_data_size;
	wmsum_t abdstat_scatter_chunk_waste;
	wmsum_t abdstat_scatter_orders[ABD_MAX_ORDER];
	wmsum_t abdstat_scatter_page_multi_chunk;
	wmsum_t abdstat_scatter_page_multi_zone;
	wmsum_t abdstat_scatter_page_alloc_retry;
	wmsum_t abdstat_scatter_sg_table_retry;
} abd_sums;

#define	abd_for_each_sg(abd, sg, n, i)	\
	for_each_sg(ABD_SCATTER(abd).abd_sgl, sg, n, i)

/*
 * zfs_abd_scatter_min_size is the minimum allocation size to use scatter
 * ABD's. Smaller allocations will use linear ABD's, which use
 * zio_[data_]buf_alloc().
 *
 * Scatter ABD's use at least one page each, so sub-page allocations waste
 * some space when allocated as scatter (e.g. 2KB scatter allocation wastes
 * half of each page). Using linear ABD's for small allocations means that
 * they will be put on slabs which contain many allocations. This can
 * improve memory efficiency, but it also makes it much harder for ARC
 * evictions to actually free pages, because all the buffers on one slab need
 * to be freed in order for the slab (and underlying pages) to be freed.
 * Typically, 512B and 1KB kmem caches have 16 buffers per slab, so it's
 * possible for them to actually waste more memory than scatter (one page per
 * buf = wasting 3/4 or 7/8th; one buf per slab = wasting 15/16th).
 *
 * Spill blocks are typically 512B and are heavily used on systems running
 * selinux with the default dnode size and the `xattr=sa` property set.
 *
 * By default we use linear allocations for 512B and 1KB, and scatter
 * allocations for larger (1.5KB and up).
 */
static int zfs_abd_scatter_min_size = 512 * 3;

/*
 * We use a scattered SPA_MAXBLOCKSIZE sized ABD whose pages are
 * just a single zero'd page. This allows us to conserve memory by
 * only using a single zero page for the scatterlist.
 */
abd_t *abd_zero_scatter = NULL;

struct page;

/*
 * abd_zero_page is assigned to each of the pages of abd_zero_scatter. It will
 * point to ZERO_PAGE if it is available or it will be an allocated zero'd
 * PAGESIZE buffer.
 */
static struct page *abd_zero_page = NULL;

static kmem_cache_t *abd_cache = NULL;
static kstat_t *abd_ksp;

static uint_t
abd_chunkcnt_for_bytes(size_t size)
{
	return (P2ROUNDUP(size, PAGESIZE) / PAGESIZE);
}
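
/*
 * For example, with a 4K PAGESIZE, abd_chunkcnt_for_bytes() rounds a 6K
 * request up to 8K and returns 2; requests that are already a multiple of
 * PAGESIZE are unchanged by the round-up.
 */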

abd_t *
abd_alloc_struct_impl(size_t size)
{
	/*
	 * In Linux we do not use the size passed in during ABD
	 * allocation, so we just ignore it.
	 */
	(void) size;
	abd_t *abd = kmem_cache_alloc(abd_cache, KM_PUSHPAGE);
	ASSERT3P(abd, !=, NULL);
	ABDSTAT_INCR(abdstat_struct_size, sizeof (abd_t));

	return (abd);
}

void
abd_free_struct_impl(abd_t *abd)
{
	kmem_cache_free(abd_cache, abd);
	ABDSTAT_INCR(abdstat_struct_size, -(int)sizeof (abd_t));
}

static unsigned zfs_abd_scatter_max_order = ABD_MAX_ORDER - 1;

/*
 * Mark zfs data pages so they can be excluded from kernel crash dumps
 */
#ifdef _LP64
#define	ABD_FILE_CACHE_PAGE	0x2F5ABDF11ECAC4E

static inline void
abd_mark_zfs_page(struct page *page)
{
	get_page(page);
	SetPagePrivate(page);
	set_page_private(page, ABD_FILE_CACHE_PAGE);
}

static inline void
abd_unmark_zfs_page(struct page *page)
{
	set_page_private(page, 0UL);
	ClearPagePrivate(page);
	put_page(page);
}
#else
#define	abd_mark_zfs_page(page)
#define	abd_unmark_zfs_page(page)
#endif /* _LP64 */

#ifndef CONFIG_HIGHMEM

#ifndef __GFP_RECLAIM
#define	__GFP_RECLAIM		__GFP_WAIT
#endif

/*
 * The goal is to minimize fragmentation by preferentially populating ABDs
 * with higher order compound pages from a single zone. Allocation size is
 * progressively decreased until it can be satisfied without performing
 * reclaim or compaction. When necessary this function will degenerate to
 * allocating individual pages and allowing reclaim to satisfy allocations.
 */
void
abd_alloc_chunks(abd_t *abd, size_t size)
{
	struct list_head pages;
	struct sg_table table;
	struct scatterlist *sg;
	struct page *page, *tmp_page = NULL;
	gfp_t gfp = __GFP_RECLAIMABLE | __GFP_NOWARN | GFP_NOIO;
	gfp_t gfp_comp = (gfp | __GFP_NORETRY | __GFP_COMP) & ~__GFP_RECLAIM;
	unsigned int max_order = MIN(zfs_abd_scatter_max_order,
	    ABD_MAX_ORDER - 1);
	unsigned int nr_pages = abd_chunkcnt_for_bytes(size);
	unsigned int chunks = 0, zones = 0;
	size_t remaining_size;
	int nid = NUMA_NO_NODE;
	unsigned int alloc_pages = 0;

	INIT_LIST_HEAD(&pages);

	ASSERT3U(alloc_pages, <, nr_pages);

	while (alloc_pages < nr_pages) {
		unsigned int chunk_pages;
		unsigned int order;

		order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order);
		chunk_pages = (1U << order);

		page = alloc_pages_node(nid, order ? gfp_comp : gfp, order);
		if (page == NULL) {
			if (order == 0) {
				ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
				schedule_timeout_interruptible(1);
			} else {
				max_order = MAX(0, order - 1);
			}
			continue;
		}

		list_add_tail(&page->lru, &pages);

		if ((nid != NUMA_NO_NODE) && (page_to_nid(page) != nid))
			zones++;

		nid = page_to_nid(page);
		ABDSTAT_BUMP(abdstat_scatter_orders[order]);
		chunks++;
		alloc_pages += chunk_pages;
	}

	ASSERT3S(alloc_pages, ==, nr_pages);

	while (sg_alloc_table(&table, chunks, gfp)) {
		ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
		schedule_timeout_interruptible(1);
	}

	sg = table.sgl;
	remaining_size = size;
	list_for_each_entry_safe(page, tmp_page, &pages, lru) {
		size_t sg_size = MIN(PAGESIZE << compound_order(page),
		    remaining_size);
		sg_set_page(sg, page, sg_size, 0);
		abd_mark_zfs_page(page);
		remaining_size -= sg_size;

		sg = sg_next(sg);
		list_del(&page->lru);
	}

	/*
	 * These conditions ensure that a possible transformation to a linear
	 * ABD would be valid.
	 */
	ASSERT(!PageHighMem(sg_page(table.sgl)));
	ASSERT0(ABD_SCATTER(abd).abd_offset);

	if (table.nents == 1) {
		/*
		 * Since there is only one entry, this ABD can be represented
		 * as a linear buffer. All single-page (4K) ABD's can be
		 * represented this way. Some multi-page ABD's can also be
		 * represented this way, if we were able to allocate a single
		 * "chunk" (higher-order "page" which represents a power-of-2
		 * series of physically-contiguous pages). This is often the
		 * case for 2-page (8K) ABD's.
		 *
		 * Representing a single-entry scatter ABD as a linear ABD
		 * has the performance advantage of avoiding the copy (and
		 * allocation) in abd_borrow_buf_copy / abd_return_buf_copy.
		 * A performance increase of around 5% has been observed for
		 * ARC-cached reads (of small blocks which can take advantage
		 * of this).
		 *
		 * Note that this optimization is only possible because the
		 * pages are always mapped into the kernel's address space.
		 * This is not the case for highmem pages, so the
		 * optimization can not be made there.
		 */
		abd->abd_flags |= ABD_FLAG_LINEAR;
		abd->abd_flags |= ABD_FLAG_LINEAR_PAGE;
		abd->abd_u.abd_linear.abd_sgl = table.sgl;
		ABD_LINEAR_BUF(abd) = page_address(sg_page(table.sgl));
	} else if (table.nents > 1) {
		ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
		abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;

		if (zones) {
			ABDSTAT_BUMP(abdstat_scatter_page_multi_zone);
			abd->abd_flags |= ABD_FLAG_MULTI_ZONE;
		}

		ABD_SCATTER(abd).abd_sgl = table.sgl;
		ABD_SCATTER(abd).abd_nents = table.nents;
	}
}
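
/*
 * For illustration of abd_alloc_chunks() above: on a 4K PAGESIZE system a
 * 128K request needs 32 pages, so the loop first attempts a single order-5
 * (32-page) compound allocation with reclaim disabled (assuming the default
 * zfs_abd_scatter_max_order permits order 5). If that fails, max_order is
 * lowered and the remainder is filled with progressively smaller chunks,
 * falling back to individual order-0 pages (with reclaim permitted) when
 * necessary.
 */
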
#else

/*
 * Allocate N individual pages to construct a scatter ABD. This function
 * makes no attempt to request contiguous pages and requires the minimal
 * number of kernel interfaces. It's designed for maximum compatibility.
 */
void
abd_alloc_chunks(abd_t *abd, size_t size)
{
	struct scatterlist *sg = NULL;
	struct sg_table table;
	struct page *page;
	gfp_t gfp = __GFP_RECLAIMABLE | __GFP_NOWARN | GFP_NOIO;
	int nr_pages = abd_chunkcnt_for_bytes(size);
	int i = 0;

	while (sg_alloc_table(&table, nr_pages, gfp)) {
		ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
		schedule_timeout_interruptible(1);
	}

	ASSERT3U(table.nents, ==, nr_pages);
	ABD_SCATTER(abd).abd_sgl = table.sgl;
	ABD_SCATTER(abd).abd_nents = nr_pages;

	abd_for_each_sg(abd, sg, nr_pages, i) {
		while ((page = __page_cache_alloc(gfp)) == NULL) {
			ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
			schedule_timeout_interruptible(1);
		}

		ABDSTAT_BUMP(abdstat_scatter_orders[0]);
		sg_set_page(sg, page, PAGESIZE, 0);
		abd_mark_zfs_page(page);
	}

	if (nr_pages > 1) {
		ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
		abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
	}
}
#endif /* !CONFIG_HIGHMEM */

/*
 * This must be called if any of the sg_table allocation functions
 * are called.
 */
static void
abd_free_sg_table(abd_t *abd)
{
	struct sg_table table;

	table.sgl = ABD_SCATTER(abd).abd_sgl;
	table.nents = table.orig_nents = ABD_SCATTER(abd).abd_nents;
	sg_free_table(&table);
}

void
abd_free_chunks(abd_t *abd)
{
	struct scatterlist *sg = NULL;
	struct page *page;
	int nr_pages = ABD_SCATTER(abd).abd_nents;
	int order, i = 0;

	if (abd->abd_flags & ABD_FLAG_MULTI_ZONE)
		ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_zone);

	if (abd->abd_flags & ABD_FLAG_MULTI_CHUNK)
		ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk);

	/*
	 * Scatter ABDs may be constructed by abd_alloc_from_pages() from
	 * an array of pages, in which case they should not be freed.
	 */
	if (!abd_is_from_pages(abd)) {
		abd_for_each_sg(abd, sg, nr_pages, i) {
			page = sg_page(sg);
			abd_unmark_zfs_page(page);
			order = compound_order(page);
			__free_pages(page, order);
			ASSERT3U(sg->length, <=, PAGE_SIZE << order);
			ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]);
		}
	}

	abd_free_sg_table(abd);
}

/*
 * Allocate scatter ABD of size SPA_MAXBLOCKSIZE, where each page in
 * the scatterlist will be set to the zero'd out buffer abd_zero_page.
 */
static void
abd_alloc_zero_scatter(void)
{
	struct scatterlist *sg = NULL;
	struct sg_table table;
	gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
	int nr_pages = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
	int i = 0;

#if defined(HAVE_ZERO_PAGE_GPL_ONLY)
	gfp_t gfp_zero_page = gfp | __GFP_ZERO;
	while ((abd_zero_page = __page_cache_alloc(gfp_zero_page)) == NULL) {
		ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
		schedule_timeout_interruptible(1);
	}
	abd_mark_zfs_page(abd_zero_page);
#else
	abd_zero_page = ZERO_PAGE(0);
#endif /* HAVE_ZERO_PAGE_GPL_ONLY */

	while (sg_alloc_table(&table, nr_pages, gfp)) {
		ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
		schedule_timeout_interruptible(1);
	}
	ASSERT3U(table.nents, ==, nr_pages);

	abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
	abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER;
	ABD_SCATTER(abd_zero_scatter).abd_offset = 0;
	ABD_SCATTER(abd_zero_scatter).abd_sgl = table.sgl;
	ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages;
	abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
	abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK;

	abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) {
		sg_set_page(sg, abd_zero_page, PAGESIZE, 0);
	}

	ABDSTAT_BUMP(abdstat_scatter_cnt);
	ABDSTAT_INCR(abdstat_scatter_data_size, PAGESIZE);
	ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
}

boolean_t
abd_size_alloc_linear(size_t size)
{
	return (!zfs_abd_scatter_enabled || size < zfs_abd_scatter_min_size);
}

void
abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op)
{
	ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
	int waste = P2ROUNDUP(abd->abd_size, PAGESIZE) - abd->abd_size;
	if (op == ABDSTAT_INCR) {
		ABDSTAT_BUMP(abdstat_scatter_cnt);
		ABDSTAT_INCR(abdstat_scatter_data_size, abd->abd_size);
		ABDSTAT_INCR(abdstat_scatter_chunk_waste, waste);
		arc_space_consume(waste, ARC_SPACE_ABD_CHUNK_WASTE);
	} else {
		ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
		ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size);
		ABDSTAT_INCR(abdstat_scatter_chunk_waste, -waste);
		arc_space_return(waste, ARC_SPACE_ABD_CHUNK_WASTE);
	}
}

void
abd_update_linear_stats(abd_t *abd, abd_stats_op_t op)
{
	ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
	if (op == ABDSTAT_INCR) {
		ABDSTAT_BUMP(abdstat_linear_cnt);
		ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size);
	} else {
		ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
		ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
	}
}

void
abd_verify_scatter(abd_t *abd)
{
	ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0);
	ASSERT3U(ABD_SCATTER(abd).abd_offset, <,
	    ABD_SCATTER(abd).abd_sgl->length);

#ifdef ZFS_DEBUG
	struct scatterlist *sg = NULL;
	size_t n = ABD_SCATTER(abd).abd_nents;
	int i = 0;

	abd_for_each_sg(abd, sg, n, i) {
		ASSERT3P(sg_page(sg), !=, NULL);
	}
#endif
}

static void
abd_free_zero_scatter(void)
{
	ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
	ABDSTAT_INCR(abdstat_scatter_data_size, -(int)PAGESIZE);
	ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk);

	abd_free_sg_table(abd_zero_scatter);
	abd_free_struct(abd_zero_scatter);
	abd_zero_scatter = NULL;
	ASSERT3P(abd_zero_page, !=, NULL);
#if defined(HAVE_ZERO_PAGE_GPL_ONLY)
	abd_unmark_zfs_page(abd_zero_page);
	__free_page(abd_zero_page);
#endif /* HAVE_ZERO_PAGE_GPL_ONLY */
}

static int
abd_kstats_update(kstat_t *ksp, int rw)
{
	abd_stats_t *as = ksp->ks_data;

	if (rw == KSTAT_WRITE)
		return (EACCES);
	as->abdstat_struct_size.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_struct_size);
	as->abdstat_linear_cnt.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_linear_cnt);
	as->abdstat_linear_data_size.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_linear_data_size);
	as->abdstat_scatter_cnt.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_cnt);
	as->abdstat_scatter_data_size.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_data_size);
	as->abdstat_scatter_chunk_waste.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_chunk_waste);
	for (int i = 0; i < ABD_MAX_ORDER; i++) {
		as->abdstat_scatter_orders[i].value.ui64 =
		    wmsum_value(&abd_sums.abdstat_scatter_orders[i]);
	}
	as->abdstat_scatter_page_multi_chunk.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_page_multi_chunk);
	as->abdstat_scatter_page_multi_zone.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_page_multi_zone);
	as->abdstat_scatter_page_alloc_retry.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_page_alloc_retry);
	as->abdstat_scatter_sg_table_retry.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_sg_table_retry);
	return (0);
}

void
abd_init(void)
{
	int i;

	abd_cache = kmem_cache_create("abd_t", sizeof (abd_t),
	    0, NULL, NULL, NULL, NULL, NULL, KMC_RECLAIMABLE);

	wmsum_init(&abd_sums.abdstat_struct_size, 0);
	wmsum_init(&abd_sums.abdstat_linear_cnt, 0);
	wmsum_init(&abd_sums.abdstat_linear_data_size, 0);
	wmsum_init(&abd_sums.abdstat_scatter_cnt, 0);
	wmsum_init(&abd_sums.abdstat_scatter_data_size, 0);
	wmsum_init(&abd_sums.abdstat_scatter_chunk_waste, 0);
	for (i = 0; i < ABD_MAX_ORDER; i++)
		wmsum_init(&abd_sums.abdstat_scatter_orders[i], 0);
	wmsum_init(&abd_sums.abdstat_scatter_page_multi_chunk, 0);
	wmsum_init(&abd_sums.abdstat_scatter_page_multi_zone, 0);
	wmsum_init(&abd_sums.abdstat_scatter_page_alloc_retry, 0);
	wmsum_init(&abd_sums.abdstat_scatter_sg_table_retry, 0);

	abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED,
	    sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
	if (abd_ksp != NULL) {
		for (i = 0; i < ABD_MAX_ORDER; i++) {
			snprintf(abd_stats.abdstat_scatter_orders[i].name,
			    KSTAT_STRLEN, "scatter_order_%d", i);
			abd_stats.abdstat_scatter_orders[i].data_type =
			    KSTAT_DATA_UINT64;
		}
		abd_ksp->ks_data = &abd_stats;
		abd_ksp->ks_update = abd_kstats_update;
		kstat_install(abd_ksp);
	}

	abd_alloc_zero_scatter();
}
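
/*
 * Note: the "abdstats" kstat created in abd_init() is exported through the
 * SPL kstat framework; on Linux it can be read from
 * /proc/spl/kstat/zfs/abdstats.
 */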

void
abd_fini(void)
{
	abd_free_zero_scatter();

	if (abd_ksp != NULL) {
		kstat_delete(abd_ksp);
		abd_ksp = NULL;
	}

	wmsum_fini(&abd_sums.abdstat_struct_size);
	wmsum_fini(&abd_sums.abdstat_linear_cnt);
	wmsum_fini(&abd_sums.abdstat_linear_data_size);
	wmsum_fini(&abd_sums.abdstat_scatter_cnt);
	wmsum_fini(&abd_sums.abdstat_scatter_data_size);
	wmsum_fini(&abd_sums.abdstat_scatter_chunk_waste);
	for (int i = 0; i < ABD_MAX_ORDER; i++)
		wmsum_fini(&abd_sums.abdstat_scatter_orders[i]);
	wmsum_fini(&abd_sums.abdstat_scatter_page_multi_chunk);
	wmsum_fini(&abd_sums.abdstat_scatter_page_multi_zone);
	wmsum_fini(&abd_sums.abdstat_scatter_page_alloc_retry);
	wmsum_fini(&abd_sums.abdstat_scatter_sg_table_retry);

	if (abd_cache) {
		kmem_cache_destroy(abd_cache);
		abd_cache = NULL;
	}
}

void
abd_free_linear_page(abd_t *abd)
{
	/* Transform it back into a scatter ABD for freeing */
	struct scatterlist *sg = abd->abd_u.abd_linear.abd_sgl;

	/* When backed by user page unmap it */
	if (abd_is_from_pages(abd))
		zfs_kunmap(sg_page(sg));
	else
		abd_update_scatter_stats(abd, ABDSTAT_DECR);

	abd->abd_flags &= ~ABD_FLAG_LINEAR;
	abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE;
	ABD_SCATTER(abd).abd_nents = 1;
	ABD_SCATTER(abd).abd_offset = 0;
	ABD_SCATTER(abd).abd_sgl = sg;
	abd_free_chunks(abd);
}

/*
 * Allocate a scatter ABD structure from user pages. The pages must be
 * pinned with get_user_pages, or similar, but need not be mapped via
 * the kmap interfaces.
 */
abd_t *
abd_alloc_from_pages(struct page **pages, unsigned long offset, uint64_t size)
{
	uint_t npages = DIV_ROUND_UP(size, PAGE_SIZE);
	struct sg_table table;

	VERIFY3U(size, <=, DMU_MAX_ACCESS);
	ASSERT3U(offset, <, PAGE_SIZE);
	ASSERT3P(pages, !=, NULL);

	/*
	 * Even if this buf is filesystem metadata, we only track that we
	 * own the underlying data buffer, which is not true in this case.
	 * Therefore, we don't ever use ABD_FLAG_META here.
	 */
	abd_t *abd = abd_alloc_struct(0);
	abd->abd_flags |= ABD_FLAG_FROM_PAGES | ABD_FLAG_OWNER;
	abd->abd_size = size;

	while (sg_alloc_table_from_pages(&table, pages, npages, offset,
	    size, __GFP_NOWARN | GFP_NOIO) != 0) {
		ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
		schedule_timeout_interruptible(1);
	}

	if ((offset + size) <= PAGE_SIZE) {
		/*
		 * Since there is only one entry, this ABD can be represented
		 * as a linear buffer. All single-page (4K) ABD's constructed
		 * from a user page can be represented this way as long as the
		 * page is mapped to a virtual address. This allows us to
		 * apply an offset into the mapped page.
		 *
		 * Note that kmap() must be used, not kmap_atomic(), because
		 * the mapping needs to be set up on all CPUs. Using kmap()
		 * also enables the use of highmem pages when required.
		 */
		abd->abd_flags |= ABD_FLAG_LINEAR | ABD_FLAG_LINEAR_PAGE;
		abd->abd_u.abd_linear.abd_sgl = table.sgl;
		zfs_kmap(sg_page(table.sgl));
		ABD_LINEAR_BUF(abd) = sg_virt(table.sgl);
	} else {
		ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
		abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;

		ABD_SCATTER(abd).abd_offset = offset;
		ABD_SCATTER(abd).abd_sgl = table.sgl;
		ABD_SCATTER(abd).abd_nents = table.nents;

		ASSERT0(ABD_SCATTER(abd).abd_offset);
	}

	return (abd);
}
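
/*
 * Illustrative sketch only (the names below are hypothetical): a Direct I/O
 * style caller that has already pinned the pages backing a user buffer,
 * e.g. with get_user_pages(), could wrap them with abd_alloc_from_pages()
 * above without copying:
 *
 *	abd_t *abd = abd_alloc_from_pages(pages,
 *	    user_addr & (PAGE_SIZE - 1), len);
 *
 * The ABD does not take over the page references; as abd_free_chunks()
 * skips ABD_FLAG_FROM_PAGES buffers, the caller remains responsible for
 * unpinning the pages once the ABD has been freed.
 */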

/*
 * If we're going to use this ABD for doing I/O using the block layer, the
 * consumer of the ABD data doesn't care if it's scattered or not, and we don't
 * plan to store this ABD in memory for a long period of time, then we should
 * allocate the ABD type that requires the least data copying to do the I/O.
 *
 * On Linux the optimal thing to do would be to use abd_get_offset() and
 * construct a new ABD which shares the original pages thereby eliminating
 * the copy. But for the moment a new linear ABD is allocated until this
 * performance optimization can be implemented.
 */
abd_t *
abd_alloc_for_io(size_t size, boolean_t is_metadata)
{
	return (abd_alloc(size, is_metadata));
}

abd_t *
abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off,
    size_t size)
{
	(void) size;
	int i = 0;
	struct scatterlist *sg = NULL;

	abd_verify(sabd);
	ASSERT3U(off, <=, sabd->abd_size);

	size_t new_offset = ABD_SCATTER(sabd).abd_offset + off;

	if (abd == NULL)
		abd = abd_alloc_struct(0);

	/*
	 * Even if this buf is filesystem metadata, we only track that
	 * if we own the underlying data buffer, which is not true in
	 * this case. Therefore, we don't ever use ABD_FLAG_META here.
	 */

	abd_for_each_sg(sabd, sg, ABD_SCATTER(sabd).abd_nents, i) {
		if (new_offset < sg->length)
			break;
		new_offset -= sg->length;
	}

	ABD_SCATTER(abd).abd_sgl = sg;
	ABD_SCATTER(abd).abd_offset = new_offset;
	ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - i;

	if (abd_is_from_pages(sabd))
		abd->abd_flags |= ABD_FLAG_FROM_PAGES;

	return (abd);
}
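
/*
 * The iterator callbacks below are driven by the generic code in abd.c,
 * which (roughly) does: abd_iter_init(); then, until abd_iter_at_end(),
 * abd_iter_map(), operate on iter_mapaddr/iter_mapsize, abd_iter_unmap(),
 * and abd_iter_advance() by the amount consumed.
 */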

/*
 * Initialize the abd_iter.
 */
void
abd_iter_init(struct abd_iter *aiter, abd_t *abd)
{
	ASSERT(!abd_is_gang(abd));
	abd_verify(abd);
	memset(aiter, 0, sizeof (struct abd_iter));
	aiter->iter_abd = abd;
	if (!abd_is_linear(abd)) {
		aiter->iter_offset = ABD_SCATTER(abd).abd_offset;
		aiter->iter_sg = ABD_SCATTER(abd).abd_sgl;
	}
}

/*
 * This is just a helper function to see if we have exhausted the
 * abd_iter and reached the end.
 */
boolean_t
abd_iter_at_end(struct abd_iter *aiter)
{
	ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size);
	return (aiter->iter_pos == aiter->iter_abd->abd_size);
}

/*
 * Advance the iterator by a certain amount. Cannot be called when a chunk is
 * in use. This can be safely called when the aiter has already been
 * exhausted, in which case this does nothing.
 */
void
abd_iter_advance(struct abd_iter *aiter, size_t amount)
{
	/*
	 * Ensure that last chunk is not in use. abd_iterate_*() must clear
	 * this state (directly or abd_iter_unmap()) before advancing.
	 */
	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
	ASSERT0(aiter->iter_mapsize);
	ASSERT3P(aiter->iter_page, ==, NULL);
	ASSERT0(aiter->iter_page_doff);
	ASSERT0(aiter->iter_page_dsize);

	/* There's nothing left to advance to, so do nothing */
	if (abd_iter_at_end(aiter))
		return;

	aiter->iter_pos += amount;
	aiter->iter_offset += amount;
	if (!abd_is_linear(aiter->iter_abd)) {
		while (aiter->iter_offset >= aiter->iter_sg->length) {
			aiter->iter_offset -= aiter->iter_sg->length;
			aiter->iter_sg = sg_next(aiter->iter_sg);
			if (aiter->iter_sg == NULL) {
				ASSERT0(aiter->iter_offset);
				break;
			}
		}
	}
}

/*
 * Map the current chunk into aiter. This can be safely called when the aiter
 * has already been exhausted, in which case this does nothing.
 */
void
abd_iter_map(struct abd_iter *aiter)
{
	void *paddr;
	size_t offset = 0;

	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
	ASSERT0(aiter->iter_mapsize);

	/* There's nothing left to iterate over, so do nothing */
	if (abd_iter_at_end(aiter))
		return;

	if (abd_is_linear(aiter->iter_abd)) {
		ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);
		offset = aiter->iter_offset;
		aiter->iter_mapsize = aiter->iter_abd->abd_size - offset;
		paddr = ABD_LINEAR_BUF(aiter->iter_abd);
	} else {
		offset = aiter->iter_offset;
		aiter->iter_mapsize = MIN(aiter->iter_sg->length - offset,
		    aiter->iter_abd->abd_size - aiter->iter_pos);

		paddr = zfs_kmap_local(sg_page(aiter->iter_sg));
	}

	aiter->iter_mapaddr = (char *)paddr + offset;
}

/*
 * Unmap the current chunk from aiter. This can be safely called when the aiter
 * has already been exhausted, in which case this does nothing.
 */
void
abd_iter_unmap(struct abd_iter *aiter)
{
	/* There's nothing left to unmap, so do nothing */
	if (abd_iter_at_end(aiter))
		return;

	if (!abd_is_linear(aiter->iter_abd)) {
		/* LINTED E_FUNC_SET_NOT_USED */
		zfs_kunmap_local(aiter->iter_mapaddr - aiter->iter_offset);
	}

	ASSERT3P(aiter->iter_mapaddr, !=, NULL);
	ASSERT3U(aiter->iter_mapsize, >, 0);

	aiter->iter_mapaddr = NULL;
	aiter->iter_mapsize = 0;
}

void
abd_cache_reap_now(void)
{
}

/*
 * Borrow a raw buffer from an ABD without copying the contents of the ABD
 * into the buffer. If the ABD is scattered, this will allocate a raw buffer
 * whose contents are undefined. To copy over the existing data in the ABD, use
 * abd_borrow_buf_copy() instead.
 */
void *
abd_borrow_buf(abd_t *abd, size_t n)
{
	void *buf;
	abd_verify(abd);
	ASSERT3U(abd->abd_size, >=, 0);
	/*
	 * In the event the ABD is composed of a single user page from Direct
	 * I/O we can not directly return the raw buffer. This is a consequence
	 * of not being able to write protect the page and the contents of the
	 * page can be changed at any time by the user.
	 */
	if (abd_is_from_pages(abd)) {
		buf = zio_buf_alloc(n);
	} else if (abd_is_linear(abd)) {
		buf = abd_to_buf(abd);
	} else {
		buf = zio_buf_alloc(n);
	}

#ifdef ZFS_DEBUG
	(void) zfs_refcount_add_many(&abd->abd_children, n, buf);
#endif
	return (buf);
}

void *
abd_borrow_buf_copy(abd_t *abd, size_t n)
{
	void *buf = abd_borrow_buf(abd, n);

	/*
	 * In the event the ABD is composed of a single user page from Direct
	 * I/O we must make sure to copy the data over into the newly allocated
	 * buffer. This is a consequence of the fact that we can not write
	 * protect the user page and there is a risk the contents of the page
	 * could be changed by the user at any moment.
	 */
	if (!abd_is_linear(abd) || abd_is_from_pages(abd)) {
		abd_copy_to_buf(buf, abd, n);
	}
	return (buf);
}

/*
 * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will
 * not change the contents of the ABD. If you want any changes you made to
 * buf to be copied back to abd, use abd_return_buf_copy() instead. If the
 * ABD is not constructed from user pages for Direct I/O then an ASSERT
 * checks to make sure the contents of the buffer have not changed since it
 * was borrowed.
 * We can not ASSERT that the contents of the buffer have not changed
 * if it is composed of user pages because the pages can not be placed under
 * write protection and the user could have possibly changed the contents in
 * the pages at any time. This is also an issue for Direct I/O reads. Checksum
 * verifications in the ZIO pipeline check for this issue and handle it by
 * returning an error on checksum verification failure.
 */
void
abd_return_buf(abd_t *abd, void *buf, size_t n)
{
	abd_verify(abd);
	ASSERT3U(abd->abd_size, >=, n);
#ifdef ZFS_DEBUG
	(void) zfs_refcount_remove_many(&abd->abd_children, n, buf);
#endif
	if (abd_is_from_pages(abd)) {
		zio_buf_free(buf, n);
	} else if (abd_is_linear(abd)) {
		ASSERT3P(buf, ==, abd_to_buf(abd));
	} else if (abd_is_gang(abd)) {
#ifdef ZFS_DEBUG
		/*
		 * We have to be careful with gang ABD's that we do not ASSERT0
		 * for any ABD's that contain user pages from Direct I/O. In
		 * order to handle this, we just iterate through the gang ABD
		 * and only verify ABDs that are not from user pages.
		 */
		void *cmp_buf = buf;

		for (abd_t *cabd = list_head(&ABD_GANG(abd).abd_gang_chain);
		    cabd != NULL;
		    cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
			if (!abd_is_from_pages(cabd)) {
				ASSERT0(abd_cmp_buf(cabd, cmp_buf,
				    cabd->abd_size));
			}
			cmp_buf = (char *)cmp_buf + cabd->abd_size;
		}
#endif
		zio_buf_free(buf, n);
	} else {
		ASSERT0(abd_cmp_buf(abd, buf, n));
		zio_buf_free(buf, n);
	}
}

void
abd_return_buf_copy(abd_t *abd, void *buf, size_t n)
{
	if (!abd_is_linear(abd) || abd_is_from_pages(abd)) {
		abd_copy_from_buf(abd, buf, n);
	}
	abd_return_buf(abd, buf, n);
}

/*
 * This is abd_iter_page(), the function underneath abd_iterate_page_func().
 * It yields the next page struct and data offset and size within it, without
 * mapping it into the address space.
 */

/*
 * "Compound pages" are a group of pages that can be referenced from a single
 * struct page *. It's organised as a "head" page, followed by a series of
 * "tail" pages.
 *
 * In OpenZFS, compound pages are allocated using the __GFP_COMP flag, which we
 * get from scatter ABDs and SPL vmalloc slabs (ie >16K allocations). So a
 * great many of the IO buffers we get are going to be of this type.
 *
 * The tail pages are just regular PAGESIZE pages, and can be safely used
 * as-is. However, the head page has length covering itself and all the tail
 * pages. If the ABD chunk spans multiple pages, then we can use the head page
 * and a >PAGESIZE length, which is far more efficient.
 *
 * Before kernel 4.5 however, compound page heads were refcounted separately
 * from tail pages, such that moving back to the head page would require us to
 * take a reference to it and release it once we're completely finished with
 * it. In practice, that meant when our caller is done with the ABD, which we
 * have no insight into from here. Rather than contort this API to track head
 * page references on such ancient kernels, we disabled this special compound
 * page handling on kernels before 4.5, instead just treating each page
 * within it as a regular PAGESIZE page (which it is). This is slightly less
 * efficient, but makes everything far simpler.
 *
 * We no longer support kernels before 4.5, so in theory none of this is
 * necessary. However, this code is still relatively new in the grand scheme of
 * things, so I'm leaving the ability to compile this out for the moment.
 *
 * Setting/clearing ABD_ITER_COMPOUND_PAGES below enables/disables the special
 * handling, by defining the ABD_ITER_PAGE_SIZE(page) macro to understand
 * compound pages, or not, and compiling in/out the support to detect compound
 * tail pages and move back to the start.
 */

/* On by default */
#define	ABD_ITER_COMPOUND_PAGES

#ifdef ABD_ITER_COMPOUND_PAGES
#define	ABD_ITER_PAGE_SIZE(page)	\
	(PageCompound(page) ? page_size(page) : PAGESIZE)
#else
#define	ABD_ITER_PAGE_SIZE(page)	(PAGESIZE)
#endif

void
abd_iter_page(struct abd_iter *aiter)
{
	if (abd_iter_at_end(aiter)) {
		aiter->iter_page = NULL;
		aiter->iter_page_doff = 0;
		aiter->iter_page_dsize = 0;
		return;
	}

	struct page *page;
	size_t doff, dsize;

	/*
	 * Find the page, and the start of the data within it. This is computed
	 * differently for linear and scatter ABDs; linear is referenced by
	 * virtual memory location, while scatter is referenced by page
	 * pointer.
	 */
	if (abd_is_linear(aiter->iter_abd)) {
		ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);

		/* memory address at iter_pos */
		void *paddr = ABD_LINEAR_BUF(aiter->iter_abd) +
		    aiter->iter_pos;

		/* struct page for address */
		page = is_vmalloc_addr(paddr) ?
		    vmalloc_to_page(paddr) : virt_to_page(paddr);

		/* offset of address within the page */
		doff = offset_in_page(paddr);
	} else {
		ASSERT(!abd_is_gang(aiter->iter_abd));

		/* current scatter page */
		page = nth_page(sg_page(aiter->iter_sg),
		    aiter->iter_offset >> PAGE_SHIFT);

		/* position within page */
		doff = aiter->iter_offset & (PAGESIZE - 1);
	}

#ifdef ABD_ITER_COMPOUND_PAGES
	if (PageTail(page)) {
		/*
		 * If this is a compound tail page, move back to the head, and
		 * adjust the offset to match. This may let us yield a much
		 * larger amount of data from a single logical page, and so
		 * leave our caller with fewer pages to process.
		 */
		struct page *head = compound_head(page);
		doff += ((page - head) * PAGESIZE);
		page = head;
	}
#endif

	ASSERT(page);

	/*
	 * Compute the maximum amount of data we can take from this page. This
	 * is the smaller of:
	 * - the remaining space in the page
	 * - the remaining space in this scatterlist entry (which may not cover
	 *   the entire page)
	 * - the remaining space in the abd (which may not cover the entire
	 *   scatterlist entry)
	 */
	dsize = MIN(ABD_ITER_PAGE_SIZE(page) - doff,
	    aiter->iter_abd->abd_size - aiter->iter_pos);
	if (!abd_is_linear(aiter->iter_abd))
		dsize = MIN(dsize, aiter->iter_sg->length - aiter->iter_offset);
	ASSERT3U(dsize, >, 0);

	/* final iterator outputs */
	aiter->iter_page = page;
	aiter->iter_page_doff = doff;
	aiter->iter_page_dsize = dsize;
}

/*
 * Note: ABD BIO functions only needed to support vdev_classic. See comments in
 * vdev_disk.c.
 */

/*
 * bio_nr_pages for ABD.
 * @off is the offset in @abd
 */
unsigned long
abd_nr_pages_off(abd_t *abd, unsigned int size, size_t off)
{
	unsigned long pos;

	if (abd_is_gang(abd)) {
		unsigned long count = 0;

		for (abd_t *cabd = abd_gang_get_offset(abd, &off);
		    cabd != NULL && size != 0;
		    cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
			ASSERT3U(off, <, cabd->abd_size);
			int mysize = MIN(size, cabd->abd_size - off);
			count += abd_nr_pages_off(cabd, mysize, off);
			size -= mysize;
			off = 0;
		}
		return (count);
	}

	if (abd_is_linear(abd))
		pos = (unsigned long)abd_to_buf(abd) + off;
	else
		pos = ABD_SCATTER(abd).abd_offset + off;

	return (((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) -
	    (pos >> PAGE_SHIFT));
}
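
/*
 * For example, with 4K pages an 8K span that begins 512 bytes into its first
 * page is reported by abd_nr_pages_off() above as covering three pages:
 * ((512 + 8192 + 4095) >> PAGE_SHIFT) - (512 >> PAGE_SHIFT) == 3, whereas the
 * same span starting page-aligned would cover only two.
 */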

static unsigned int
bio_map(struct bio *bio, void *buf_ptr, unsigned int bio_size)
{
	unsigned int offset, size, i;
	struct page *page;

	offset = offset_in_page(buf_ptr);
	for (i = 0; i < bio->bi_max_vecs; i++) {
		size = PAGE_SIZE - offset;

		if (bio_size <= 0)
			break;

		if (size > bio_size)
			size = bio_size;

		if (is_vmalloc_addr(buf_ptr))
			page = vmalloc_to_page(buf_ptr);
		else
			page = virt_to_page(buf_ptr);

		/*
		 * Some network-related block devices use tcp_sendpage, which
		 * doesn't behave well with 0-count pages; this is a safety
		 * net to catch them.
		 */
		ASSERT3S(page_count(page), >, 0);

		if (bio_add_page(bio, page, size, offset) != size)
			break;

		buf_ptr += size;
		bio_size -= size;
		offset = 0;
	}

	return (bio_size);
}

/*
 * bio_map for gang ABD.
 */
static unsigned int
abd_gang_bio_map_off(struct bio *bio, abd_t *abd,
    unsigned int io_size, size_t off)
{
	ASSERT(abd_is_gang(abd));

	for (abd_t *cabd = abd_gang_get_offset(abd, &off);
	    cabd != NULL;
	    cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
		ASSERT3U(off, <, cabd->abd_size);
		int size = MIN(io_size, cabd->abd_size - off);
		int remainder = abd_bio_map_off(bio, cabd, size, off);
		io_size -= (size - remainder);
		if (io_size == 0 || remainder > 0)
			return (io_size);
		off = 0;
	}
	ASSERT0(io_size);
	return (io_size);
}

/*
 * bio_map for ABD.
 * @off is the offset in @abd
 * Remaining IO size is returned
 */
unsigned int
abd_bio_map_off(struct bio *bio, abd_t *abd,
    unsigned int io_size, size_t off)
{
	struct abd_iter aiter;

	ASSERT3U(io_size, <=, abd->abd_size - off);
	if (abd_is_linear(abd))
		return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, io_size));

	ASSERT(!abd_is_linear(abd));
	if (abd_is_gang(abd))
		return (abd_gang_bio_map_off(bio, abd, io_size, off));

	abd_iter_init(&aiter, abd);
	abd_iter_advance(&aiter, off);

	for (int i = 0; i < bio->bi_max_vecs; i++) {
		struct page *pg;
		size_t len, sgoff, pgoff;
		struct scatterlist *sg;

		if (io_size <= 0)
			break;

		sg = aiter.iter_sg;
		sgoff = aiter.iter_offset;
		pgoff = sgoff & (PAGESIZE - 1);
		len = MIN(io_size, PAGESIZE - pgoff);
		ASSERT(len > 0);

		pg = nth_page(sg_page(sg), sgoff >> PAGE_SHIFT);
		if (bio_add_page(bio, pg, len, pgoff) != len)
			break;

		io_size -= len;
		abd_iter_advance(&aiter, len);
	}

	return (io_size);
}

EXPORT_SYMBOL(abd_alloc_from_pages);

/* Tunable Parameters */
module_param(zfs_abd_scatter_enabled, int, 0644);
MODULE_PARM_DESC(zfs_abd_scatter_enabled,
	"Toggle whether ABD allocations must be linear.");
module_param(zfs_abd_scatter_min_size, int, 0644);
MODULE_PARM_DESC(zfs_abd_scatter_min_size,
	"Minimum size of scatter allocations.");
module_param(zfs_abd_scatter_max_order, uint, 0644);
MODULE_PARM_DESC(zfs_abd_scatter_max_order,
	"Maximum order allocation used for a scatter ABD.");