/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
 * Copyright (c) 2019 by Delphix. All rights reserved.
 */

/*
 * See abd.c for a general overview of the ARC buffered data (ABD).
 *
 * Linear buffers act exactly like normal buffers and are always mapped into
 * the kernel's virtual memory space, while scattered ABD data chunks are
 * allocated as physical pages and then mapped in only while they are actually
 * being accessed through one of the abd_* library functions. Using scattered
 * ABDs provides several benefits:
 *
 * (1) They avoid use of kmem_*, preventing performance problems where running
 *     kmem_reap on very large memory systems never finishes and causes
 *     constant TLB shootdowns.
 *
 * (2) Fragmentation is less of an issue since when we are at the limit of
 *     allocatable space, we won't have to search around for a long free
 *     hole in the VA space for large ARC allocations. Each chunk is mapped in
 *     individually, so even if we are using HIGHMEM (see next point) we
 *     wouldn't need to worry about finding a contiguous address range.
 *
 * (3) If we are not using HIGHMEM, then all physical memory is always
 *     mapped into the kernel's address space, so we also avoid the map /
 *     unmap costs on each ABD access.
 *
 * If we are not using HIGHMEM, scattered buffers which have only one chunk
 * can be treated as linear buffers, because they are contiguous in the
 * kernel's virtual address space.  See abd_alloc_chunks() for details.
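 *
 * As an illustrative sketch only (not an interface defined in this file),
 * a typical consumer goes through the generic routines in abd.c rather
 * than the per-OS helpers below:
 *
 *	abd_t *abd = abd_alloc(size, B_FALSE);
 *	abd_copy_from_buf(abd, buf, size);
 *	...
 *	abd_free(abd);
 *
 * Whether such an allocation ends up linear or scatter is decided by
 * abd_size_alloc_linear() below.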
 */

#include <sys/abd_impl.h>
#include <sys/param.h>
#include <sys/zio.h>
#include <sys/arc.h>
#include <sys/zfs_context.h>
#include <sys/zfs_znode.h>
#ifdef _KERNEL
#include <linux/kmap_compat.h>
#include <linux/scatterlist.h>
#endif

#ifdef _KERNEL
#if defined(MAX_ORDER)
#define	ABD_MAX_ORDER	(MAX_ORDER)
#elif defined(MAX_PAGE_ORDER)
#define	ABD_MAX_ORDER	(MAX_PAGE_ORDER)
#endif
#else
#define	ABD_MAX_ORDER	(1)
#endif

typedef struct abd_stats {
	kstat_named_t abdstat_struct_size;
	kstat_named_t abdstat_linear_cnt;
	kstat_named_t abdstat_linear_data_size;
	kstat_named_t abdstat_scatter_cnt;
	kstat_named_t abdstat_scatter_data_size;
	kstat_named_t abdstat_scatter_chunk_waste;
	kstat_named_t abdstat_scatter_orders[ABD_MAX_ORDER];
	kstat_named_t abdstat_scatter_page_multi_chunk;
	kstat_named_t abdstat_scatter_page_multi_zone;
	kstat_named_t abdstat_scatter_page_alloc_retry;
	kstat_named_t abdstat_scatter_sg_table_retry;
} abd_stats_t;

static abd_stats_t abd_stats = {
	/* Amount of memory occupied by all of the abd_t struct allocations */
	{ "struct_size",	KSTAT_DATA_UINT64 },
	/*
	 * The number of linear ABDs which are currently allocated, excluding
	 * ABDs which don't own their data (for instance the ones which were
	 * allocated through abd_get_offset() and abd_get_from_buf()). If an
	 * ABD takes ownership of its buf then it will become tracked.
	 */
	{ "linear_cnt",	KSTAT_DATA_UINT64 },
	/* Amount of data stored in all linear ABDs tracked by linear_cnt */
	{ "linear_data_size",	KSTAT_DATA_UINT64 },
	/*
	 * The number of scatter ABDs which are currently allocated, excluding
	 * ABDs which don't own their data (for instance the ones which were
	 * allocated through abd_get_offset()).
	 */
	{ "scatter_cnt",	KSTAT_DATA_UINT64 },
	/* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
	{ "scatter_data_size",	KSTAT_DATA_UINT64 },
	/*
	 * The amount of space wasted at the end of the last chunk across all
	 * scatter ABDs tracked by scatter_cnt.
	 */
	{ "scatter_chunk_waste",	KSTAT_DATA_UINT64 },
	/*
	 * The number of compound allocations of a given order.  These
	 * allocations are spread over all currently allocated ABDs, and
	 * act as a measure of memory fragmentation.
	 */
	{ { "scatter_order_N",	KSTAT_DATA_UINT64 } },
	/*
	 * The number of scatter ABDs which contain multiple chunks.
	 * ABDs are preferentially allocated from the minimum number of
	 * contiguous multi-page chunks; a single chunk is optimal.
	 */
	{ "scatter_page_multi_chunk",	KSTAT_DATA_UINT64 },
	/*
	 * The number of scatter ABDs which are split across memory zones.
	 * ABDs are preferentially allocated using pages from a single zone.
	 */
	{ "scatter_page_multi_zone",	KSTAT_DATA_UINT64 },
	/*
	 * The total number of retries encountered when attempting to
	 * allocate the pages to populate the scatter ABD.
	 */
	{ "scatter_page_alloc_retry",	KSTAT_DATA_UINT64 },
	/*
	 * The total number of retries encountered when attempting to
	 * allocate the sg table for an ABD.
	 */
	{ "scatter_sg_table_retry",	KSTAT_DATA_UINT64 },
};

static struct {
	wmsum_t abdstat_struct_size;
	wmsum_t abdstat_linear_cnt;
	wmsum_t abdstat_linear_data_size;
	wmsum_t abdstat_scatter_cnt;
	wmsum_t abdstat_scatter_data_size;
	wmsum_t abdstat_scatter_chunk_waste;
	wmsum_t abdstat_scatter_orders[ABD_MAX_ORDER];
	wmsum_t abdstat_scatter_page_multi_chunk;
	wmsum_t abdstat_scatter_page_multi_zone;
	wmsum_t abdstat_scatter_page_alloc_retry;
	wmsum_t abdstat_scatter_sg_table_retry;
} abd_sums;

#define	abd_for_each_sg(abd, sg, n, i)	\
	for_each_sg(ABD_SCATTER(abd).abd_sgl, sg, n, i)

/*
 * zfs_abd_scatter_min_size is the minimum allocation size to use scatter
 * ABD's.  Smaller allocations will use linear ABD's, which use
 * zio_[data_]buf_alloc().
 *
 * Scatter ABD's use at least one page each, so sub-page allocations waste
 * some space when allocated as scatter (e.g. 2KB scatter allocation wastes
 * half of each page).  Using linear ABD's for small allocations means that
 * they will be put on slabs which contain many allocations.  This can
 * improve memory efficiency, but it also makes it much harder for ARC
 * evictions to actually free pages, because all the buffers on one slab need
 * to be freed in order for the slab (and underlying pages) to be freed.
 * Typically, 512B and 1KB kmem caches have 16 buffers per slab, so it's
 * possible for them to actually waste more memory than scatter (one page per
 * buf = wasting 3/4 or 7/8th; one buf per slab = wasting 15/16th).
 *
 * Spill blocks are typically 512B and are heavily used on systems running
 * selinux with the default dnode size and the `xattr=sa` property set.
 *
 * By default we use linear allocations for 512B and 1KB, and scatter
 * allocations for larger (1.5KB and up).
 */
static int zfs_abd_scatter_min_size = 512 * 3;

/*
 * We use a scattered SPA_MAXBLOCKSIZE sized ABD whose pages are
 * just a single zero'd page. This allows us to conserve memory by
 * only using a single zero page for the scatterlist.
 */
abd_t *abd_zero_scatter = NULL;

struct page;
/*
 * _KERNEL   - Will point to ZERO_PAGE if it is available or it will be
 *             an allocated zero'd PAGESIZE buffer.
 * Userspace - Will be an allocated zero'd PAGESIZE buffer.
 *
 * abd_zero_page is assigned to each of the pages of abd_zero_scatter.
 */
static struct page *abd_zero_page = NULL;

static kmem_cache_t *abd_cache = NULL;
static kstat_t *abd_ksp;

static uint_t
abd_chunkcnt_for_bytes(size_t size)
{
	return (P2ROUNDUP(size, PAGESIZE) / PAGESIZE);
}

abd_t *
abd_alloc_struct_impl(size_t size)
{
	/*
	 * In Linux we do not use the size passed in during ABD
	 * allocation, so we just ignore it.
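	 * The abd_t itself comes from the fixed-size abd_cache kmem cache
	 * created in abd_init(), so no size-dependent work is needed here.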
	 */
	(void) size;
	abd_t *abd = kmem_cache_alloc(abd_cache, KM_PUSHPAGE);
	ASSERT3P(abd, !=, NULL);
	ABDSTAT_INCR(abdstat_struct_size, sizeof (abd_t));

	return (abd);
}

void
abd_free_struct_impl(abd_t *abd)
{
	kmem_cache_free(abd_cache, abd);
	ABDSTAT_INCR(abdstat_struct_size, -(int)sizeof (abd_t));
}

#ifdef _KERNEL
static unsigned zfs_abd_scatter_max_order = ABD_MAX_ORDER - 1;

/*
 * Mark zfs data pages so they can be excluded from kernel crash dumps
 */
#ifdef _LP64
#define	ABD_FILE_CACHE_PAGE	0x2F5ABDF11ECAC4E

static inline void
abd_mark_zfs_page(struct page *page)
{
	get_page(page);
	SetPagePrivate(page);
	set_page_private(page, ABD_FILE_CACHE_PAGE);
}

static inline void
abd_unmark_zfs_page(struct page *page)
{
	set_page_private(page, 0UL);
	ClearPagePrivate(page);
	put_page(page);
}
#else
#define	abd_mark_zfs_page(page)
#define	abd_unmark_zfs_page(page)
#endif /* _LP64 */

#ifndef CONFIG_HIGHMEM

#ifndef __GFP_RECLAIM
#define	__GFP_RECLAIM	__GFP_WAIT
#endif

/*
 * The goal is to minimize fragmentation by preferentially populating ABDs
 * with higher order compound pages from a single zone.  Allocation size is
 * progressively decreased until it can be satisfied without performing
 * reclaim or compaction.  When necessary this function will degenerate to
 * allocating individual pages and allowing reclaim to satisfy allocations.
 */
void
abd_alloc_chunks(abd_t *abd, size_t size)
{
	struct list_head pages;
	struct sg_table table;
	struct scatterlist *sg;
	struct page *page, *tmp_page = NULL;
	gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
	gfp_t gfp_comp = (gfp | __GFP_NORETRY | __GFP_COMP) & ~__GFP_RECLAIM;
	unsigned int max_order = MIN(zfs_abd_scatter_max_order,
	    ABD_MAX_ORDER - 1);
	unsigned int nr_pages = abd_chunkcnt_for_bytes(size);
	unsigned int chunks = 0, zones = 0;
	size_t remaining_size;
	int nid = NUMA_NO_NODE;
	unsigned int alloc_pages = 0;

	INIT_LIST_HEAD(&pages);

	ASSERT3U(alloc_pages, <, nr_pages);

	while (alloc_pages < nr_pages) {
		unsigned int chunk_pages;
		unsigned int order;

		order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order);
		chunk_pages = (1U << order);

		page = alloc_pages_node(nid, order ? gfp_comp : gfp, order);
		if (page == NULL) {
			if (order == 0) {
				ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
				schedule_timeout_interruptible(1);
			} else {
				max_order = MAX(0, order - 1);
			}
			continue;
		}

		list_add_tail(&page->lru, &pages);

		if ((nid != NUMA_NO_NODE) && (page_to_nid(page) != nid))
			zones++;

		nid = page_to_nid(page);
		ABDSTAT_BUMP(abdstat_scatter_orders[order]);
		chunks++;
		alloc_pages += chunk_pages;
	}

	ASSERT3S(alloc_pages, ==, nr_pages);

	while (sg_alloc_table(&table, chunks, gfp)) {
		ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
		schedule_timeout_interruptible(1);
	}

	sg = table.sgl;
	remaining_size = size;
	list_for_each_entry_safe(page, tmp_page, &pages, lru) {
		size_t sg_size = MIN(PAGESIZE << compound_order(page),
		    remaining_size);
		sg_set_page(sg, page, sg_size, 0);
		abd_mark_zfs_page(page);
		remaining_size -= sg_size;

		sg = sg_next(sg);
		list_del(&page->lru);
	}

	/*
	 * These conditions ensure that a possible transformation to a linear
	 * ABD would be valid.
	 */
	ASSERT(!PageHighMem(sg_page(table.sgl)));
	ASSERT0(ABD_SCATTER(abd).abd_offset);

	if (table.nents == 1) {
		/*
		 * Since there is only one entry, this ABD can be represented
		 * as a linear buffer.  All single-page (4K) ABD's can be
		 * represented this way.  Some multi-page ABD's can also be
		 * represented this way, if we were able to allocate a single
		 * "chunk" (higher-order "page" which represents a power-of-2
		 * series of physically-contiguous pages).  This is often the
		 * case for 2-page (8K) ABD's.
		 *
		 * Representing a single-entry scatter ABD as a linear ABD
		 * has the performance advantage of avoiding the copy (and
		 * allocation) in abd_borrow_buf_copy / abd_return_buf_copy.
		 * A performance increase of around 5% has been observed for
		 * ARC-cached reads (of small blocks which can take advantage
		 * of this).
		 *
		 * Note that this optimization is only possible because the
		 * pages are always mapped into the kernel's address space.
		 * This is not the case for highmem pages, so the
		 * optimization cannot be made there.
		 */
		abd->abd_flags |= ABD_FLAG_LINEAR;
		abd->abd_flags |= ABD_FLAG_LINEAR_PAGE;
		abd->abd_u.abd_linear.abd_sgl = table.sgl;
		ABD_LINEAR_BUF(abd) = page_address(sg_page(table.sgl));
	} else if (table.nents > 1) {
		ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
		abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;

		if (zones) {
			ABDSTAT_BUMP(abdstat_scatter_page_multi_zone);
			abd->abd_flags |= ABD_FLAG_MULTI_ZONE;
		}

		ABD_SCATTER(abd).abd_sgl = table.sgl;
		ABD_SCATTER(abd).abd_nents = table.nents;
	}
}
#else

/*
 * Allocate N individual pages to construct a scatter ABD.  This function
 * makes no attempt to request contiguous pages and requires the minimal
 * number of kernel interfaces.  It's designed for maximum compatibility.
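 * Unlike the !CONFIG_HIGHMEM variant above, every chunk here is a single
 * page, so the single-chunk linear-page optimization is never applied.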
 */
void
abd_alloc_chunks(abd_t *abd, size_t size)
{
	struct scatterlist *sg = NULL;
	struct sg_table table;
	struct page *page;
	gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
	int nr_pages = abd_chunkcnt_for_bytes(size);
	int i = 0;

	while (sg_alloc_table(&table, nr_pages, gfp)) {
		ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
		schedule_timeout_interruptible(1);
	}

	ASSERT3U(table.nents, ==, nr_pages);
	ABD_SCATTER(abd).abd_sgl = table.sgl;
	ABD_SCATTER(abd).abd_nents = nr_pages;

	abd_for_each_sg(abd, sg, nr_pages, i) {
		while ((page = __page_cache_alloc(gfp)) == NULL) {
			ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
			schedule_timeout_interruptible(1);
		}

		ABDSTAT_BUMP(abdstat_scatter_orders[0]);
		sg_set_page(sg, page, PAGESIZE, 0);
		abd_mark_zfs_page(page);
	}

	if (nr_pages > 1) {
		ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
		abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
	}
}
#endif /* !CONFIG_HIGHMEM */

/*
 * This must be called if any of the sg_table allocation functions
 * are called.
 */
static void
abd_free_sg_table(abd_t *abd)
{
	struct sg_table table;

	table.sgl = ABD_SCATTER(abd).abd_sgl;
	table.nents = table.orig_nents = ABD_SCATTER(abd).abd_nents;
	sg_free_table(&table);
}

void
abd_free_chunks(abd_t *abd)
{
	struct scatterlist *sg = NULL;
	struct page *page;
	int nr_pages = ABD_SCATTER(abd).abd_nents;
	int order, i = 0;

	if (abd->abd_flags & ABD_FLAG_MULTI_ZONE)
		ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_zone);

	if (abd->abd_flags & ABD_FLAG_MULTI_CHUNK)
		ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk);

	abd_for_each_sg(abd, sg, nr_pages, i) {
		page = sg_page(sg);
		abd_unmark_zfs_page(page);
		order = compound_order(page);
		__free_pages(page, order);
		ASSERT3U(sg->length, <=, PAGE_SIZE << order);
		ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]);
	}
	abd_free_sg_table(abd);
}

/*
 * Allocate scatter ABD of size SPA_MAXBLOCKSIZE, where each page in
 * the scatterlist will be set to the zero'd out buffer abd_zero_page.
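 * Every scatterlist entry maps the same abd_zero_page, so this
 * SPA_MAXBLOCKSIZE ABD is backed by a single physical page of zeros.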
 */
static void
abd_alloc_zero_scatter(void)
{
	struct scatterlist *sg = NULL;
	struct sg_table table;
	gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
	int nr_pages = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
	int i = 0;

#if defined(HAVE_ZERO_PAGE_GPL_ONLY)
	gfp_t gfp_zero_page = gfp | __GFP_ZERO;
	while ((abd_zero_page = __page_cache_alloc(gfp_zero_page)) == NULL) {
		ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
		schedule_timeout_interruptible(1);
	}
	abd_mark_zfs_page(abd_zero_page);
#else
	abd_zero_page = ZERO_PAGE(0);
#endif /* HAVE_ZERO_PAGE_GPL_ONLY */

	while (sg_alloc_table(&table, nr_pages, gfp)) {
		ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
		schedule_timeout_interruptible(1);
	}
	ASSERT3U(table.nents, ==, nr_pages);

	abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
	abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER;
	ABD_SCATTER(abd_zero_scatter).abd_offset = 0;
	ABD_SCATTER(abd_zero_scatter).abd_sgl = table.sgl;
	ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages;
	abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
	abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK | ABD_FLAG_ZEROS;

	abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) {
		sg_set_page(sg, abd_zero_page, PAGESIZE, 0);
	}

	ABDSTAT_BUMP(abdstat_scatter_cnt);
	ABDSTAT_INCR(abdstat_scatter_data_size, PAGESIZE);
	ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
}

#else /* _KERNEL */

#ifndef PAGE_SHIFT
#define	PAGE_SHIFT (highbit64(PAGESIZE)-1)
#endif

#define	zfs_kmap_atomic(chunk)		((void *)chunk)
#define	zfs_kunmap_atomic(addr)		do { (void)(addr); } while (0)
#define	local_irq_save(flags)		do { (void)(flags); } while (0)
#define	local_irq_restore(flags)	do { (void)(flags); } while (0)
#define	nth_page(pg, i) \
	((struct page *)((void *)(pg) + (i) * PAGESIZE))

struct scatterlist {
	struct page *page;
	int length;
	int end;
};

static void
sg_init_table(struct scatterlist *sg, int nr)
{
	memset(sg, 0, nr * sizeof (struct scatterlist));
	sg[nr - 1].end = 1;
}

/*
 * This must be called if any of the sg_table allocation functions
 * are called.
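 * In user space the "sg table" is just a flat vmem array of emulated
 * scatterlist entries, so tearing it down is a single vmem_free().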
 */
static void
abd_free_sg_table(abd_t *abd)
{
	int nents = ABD_SCATTER(abd).abd_nents;
	vmem_free(ABD_SCATTER(abd).abd_sgl,
	    nents * sizeof (struct scatterlist));
}

#define	for_each_sg(sgl, sg, nr, i)	\
	for ((i) = 0, (sg) = (sgl); (i) < (nr); (i)++, (sg) = sg_next(sg))

static inline void
sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len,
    unsigned int offset)
{
	/* currently we don't use offset */
	ASSERT(offset == 0);
	sg->page = page;
	sg->length = len;
}

static inline struct page *
sg_page(struct scatterlist *sg)
{
	return (sg->page);
}

static inline struct scatterlist *
sg_next(struct scatterlist *sg)
{
	if (sg->end)
		return (NULL);

	return (sg + 1);
}

void
abd_alloc_chunks(abd_t *abd, size_t size)
{
	unsigned nr_pages = abd_chunkcnt_for_bytes(size);
	struct scatterlist *sg;
	int i;

	ABD_SCATTER(abd).abd_sgl = vmem_alloc(nr_pages *
	    sizeof (struct scatterlist), KM_SLEEP);
	sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages);

	abd_for_each_sg(abd, sg, nr_pages, i) {
		struct page *p = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
		sg_set_page(sg, p, PAGESIZE, 0);
	}
	ABD_SCATTER(abd).abd_nents = nr_pages;
}

void
abd_free_chunks(abd_t *abd)
{
	int i, n = ABD_SCATTER(abd).abd_nents;
	struct scatterlist *sg;

	abd_for_each_sg(abd, sg, n, i) {
		struct page *p = nth_page(sg_page(sg), 0);
		umem_free_aligned(p, PAGESIZE);
	}
	abd_free_sg_table(abd);
}

static void
abd_alloc_zero_scatter(void)
{
	unsigned nr_pages = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
	struct scatterlist *sg;
	int i;

	abd_zero_page = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
	memset(abd_zero_page, 0, PAGESIZE);
	abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
	abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER;
	abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK | ABD_FLAG_ZEROS;
	ABD_SCATTER(abd_zero_scatter).abd_offset = 0;
	ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages;
	abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
	ABD_SCATTER(abd_zero_scatter).abd_sgl = vmem_alloc(nr_pages *
	    sizeof (struct scatterlist), KM_SLEEP);

	sg_init_table(ABD_SCATTER(abd_zero_scatter).abd_sgl, nr_pages);

	abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) {
		sg_set_page(sg, abd_zero_page, PAGESIZE, 0);
	}

	ABDSTAT_BUMP(abdstat_scatter_cnt);
	ABDSTAT_INCR(abdstat_scatter_data_size, PAGESIZE);
	ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
}

#endif /* _KERNEL */

boolean_t
abd_size_alloc_linear(size_t size)
{
	return (!zfs_abd_scatter_enabled || size < zfs_abd_scatter_min_size);
}

void
abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op)
{
	ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
	int waste = P2ROUNDUP(abd->abd_size, PAGESIZE) - abd->abd_size;
	if (op == ABDSTAT_INCR) {
		ABDSTAT_BUMP(abdstat_scatter_cnt);
		ABDSTAT_INCR(abdstat_scatter_data_size, abd->abd_size);
		ABDSTAT_INCR(abdstat_scatter_chunk_waste, waste);
		arc_space_consume(waste, ARC_SPACE_ABD_CHUNK_WASTE);
	} else {
		ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
		ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size);
		ABDSTAT_INCR(abdstat_scatter_chunk_waste, -waste);
		arc_space_return(waste, ARC_SPACE_ABD_CHUNK_WASTE);
	}
}

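/*
 * Update the linear allocation kstats (linear_cnt and linear_data_size)
 * as linear ABDs which own their data are allocated and freed.
 */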
void
abd_update_linear_stats(abd_t *abd, abd_stats_op_t op)
{
	ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
	if (op == ABDSTAT_INCR) {
		ABDSTAT_BUMP(abdstat_linear_cnt);
		ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size);
	} else {
		ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
		ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
	}
}

void
abd_verify_scatter(abd_t *abd)
{
	size_t n;
	int i = 0;
	struct scatterlist *sg = NULL;

	ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0);
	ASSERT3U(ABD_SCATTER(abd).abd_offset, <,
	    ABD_SCATTER(abd).abd_sgl->length);
	n = ABD_SCATTER(abd).abd_nents;
	abd_for_each_sg(abd, sg, n, i) {
		ASSERT3P(sg_page(sg), !=, NULL);
	}
}

static void
abd_free_zero_scatter(void)
{
	ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
	ABDSTAT_INCR(abdstat_scatter_data_size, -(int)PAGESIZE);
	ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk);

	abd_free_sg_table(abd_zero_scatter);
	abd_free_struct(abd_zero_scatter);
	abd_zero_scatter = NULL;
	ASSERT3P(abd_zero_page, !=, NULL);
#if defined(_KERNEL)
#if defined(HAVE_ZERO_PAGE_GPL_ONLY)
	abd_unmark_zfs_page(abd_zero_page);
	__free_page(abd_zero_page);
#endif /* HAVE_ZERO_PAGE_GPL_ONLY */
#else
	umem_free_aligned(abd_zero_page, PAGESIZE);
#endif /* _KERNEL */
}

static int
abd_kstats_update(kstat_t *ksp, int rw)
{
	abd_stats_t *as = ksp->ks_data;

	if (rw == KSTAT_WRITE)
		return (EACCES);
	as->abdstat_struct_size.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_struct_size);
	as->abdstat_linear_cnt.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_linear_cnt);
	as->abdstat_linear_data_size.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_linear_data_size);
	as->abdstat_scatter_cnt.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_cnt);
	as->abdstat_scatter_data_size.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_data_size);
	as->abdstat_scatter_chunk_waste.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_chunk_waste);
	for (int i = 0; i < ABD_MAX_ORDER; i++) {
		as->abdstat_scatter_orders[i].value.ui64 =
		    wmsum_value(&abd_sums.abdstat_scatter_orders[i]);
	}
	as->abdstat_scatter_page_multi_chunk.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_page_multi_chunk);
	as->abdstat_scatter_page_multi_zone.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_page_multi_zone);
	as->abdstat_scatter_page_alloc_retry.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_page_alloc_retry);
	as->abdstat_scatter_sg_table_retry.value.ui64 =
	    wmsum_value(&abd_sums.abdstat_scatter_sg_table_retry);
	return (0);
}

void
abd_init(void)
{
	int i;

	abd_cache = kmem_cache_create("abd_t", sizeof (abd_t),
	    0, NULL, NULL, NULL, NULL, NULL, 0);

	wmsum_init(&abd_sums.abdstat_struct_size, 0);
	wmsum_init(&abd_sums.abdstat_linear_cnt, 0);
	wmsum_init(&abd_sums.abdstat_linear_data_size, 0);
	wmsum_init(&abd_sums.abdstat_scatter_cnt, 0);
	wmsum_init(&abd_sums.abdstat_scatter_data_size, 0);
	wmsum_init(&abd_sums.abdstat_scatter_chunk_waste, 0);
	for (i = 0; i < ABD_MAX_ORDER; i++)
		wmsum_init(&abd_sums.abdstat_scatter_orders[i], 0);
	wmsum_init(&abd_sums.abdstat_scatter_page_multi_chunk, 0);
	wmsum_init(&abd_sums.abdstat_scatter_page_multi_zone, 0);
	wmsum_init(&abd_sums.abdstat_scatter_page_alloc_retry, 0);
	wmsum_init(&abd_sums.abdstat_scatter_sg_table_retry, 0);

	abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED,
	    sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
	if (abd_ksp != NULL) {
		for (i = 0; i < ABD_MAX_ORDER; i++) {
			snprintf(abd_stats.abdstat_scatter_orders[i].name,
			    KSTAT_STRLEN, "scatter_order_%d", i);
			abd_stats.abdstat_scatter_orders[i].data_type =
			    KSTAT_DATA_UINT64;
		}
		abd_ksp->ks_data = &abd_stats;
		abd_ksp->ks_update = abd_kstats_update;
		kstat_install(abd_ksp);
	}

	abd_alloc_zero_scatter();
}

void
abd_fini(void)
{
	abd_free_zero_scatter();

	if (abd_ksp != NULL) {
		kstat_delete(abd_ksp);
		abd_ksp = NULL;
	}

	wmsum_fini(&abd_sums.abdstat_struct_size);
	wmsum_fini(&abd_sums.abdstat_linear_cnt);
	wmsum_fini(&abd_sums.abdstat_linear_data_size);
	wmsum_fini(&abd_sums.abdstat_scatter_cnt);
	wmsum_fini(&abd_sums.abdstat_scatter_data_size);
	wmsum_fini(&abd_sums.abdstat_scatter_chunk_waste);
	for (int i = 0; i < ABD_MAX_ORDER; i++)
		wmsum_fini(&abd_sums.abdstat_scatter_orders[i]);
	wmsum_fini(&abd_sums.abdstat_scatter_page_multi_chunk);
	wmsum_fini(&abd_sums.abdstat_scatter_page_multi_zone);
	wmsum_fini(&abd_sums.abdstat_scatter_page_alloc_retry);
	wmsum_fini(&abd_sums.abdstat_scatter_sg_table_retry);

	if (abd_cache) {
		kmem_cache_destroy(abd_cache);
		abd_cache = NULL;
	}
}

void
abd_free_linear_page(abd_t *abd)
{
	/* Transform it back into a scatter ABD for freeing */
	struct scatterlist *sg = abd->abd_u.abd_linear.abd_sgl;
	abd->abd_flags &= ~ABD_FLAG_LINEAR;
	abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE;
	ABD_SCATTER(abd).abd_nents = 1;
	ABD_SCATTER(abd).abd_offset = 0;
	ABD_SCATTER(abd).abd_sgl = sg;
	abd_free_chunks(abd);

	abd_update_scatter_stats(abd, ABDSTAT_DECR);
}

/*
 * If we're going to use this ABD for doing I/O using the block layer, the
 * consumer of the ABD data doesn't care if it's scattered or not, and we don't
 * plan to store this ABD in memory for a long period of time, we should
 * allocate the ABD type that requires the least data copying to do the I/O.
 *
 * On Linux the optimal thing to do would be to use abd_get_offset() and
 * construct a new ABD which shares the original pages, thereby eliminating
 * the copy.  But for the moment a new linear ABD is allocated until this
 * performance optimization can be implemented.
 */
abd_t *
abd_alloc_for_io(size_t size, boolean_t is_metadata)
{
	return (abd_alloc(size, is_metadata));
}

abd_t *
abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off,
    size_t size)
{
	(void) size;
	int i = 0;
	struct scatterlist *sg = NULL;

	abd_verify(sabd);
	ASSERT3U(off, <=, sabd->abd_size);

	size_t new_offset = ABD_SCATTER(sabd).abd_offset + off;

	if (abd == NULL)
		abd = abd_alloc_struct(0);

	/*
	 * Even if this buf is filesystem metadata, we only track that
	 * if we own the underlying data buffer, which is not true in
	 * this case. Therefore, we don't ever use ABD_FLAG_META here.
	 */

	abd_for_each_sg(sabd, sg, ABD_SCATTER(sabd).abd_nents, i) {
		if (new_offset < sg->length)
			break;
		new_offset -= sg->length;
	}

	ABD_SCATTER(abd).abd_sgl = sg;
	ABD_SCATTER(abd).abd_offset = new_offset;
	ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - i;

	return (abd);
}

/*
 * Initialize the abd_iter.
 */
void
abd_iter_init(struct abd_iter *aiter, abd_t *abd)
{
	ASSERT(!abd_is_gang(abd));
	abd_verify(abd);
	aiter->iter_abd = abd;
	aiter->iter_mapaddr = NULL;
	aiter->iter_mapsize = 0;
	aiter->iter_pos = 0;
	if (abd_is_linear(abd)) {
		aiter->iter_offset = 0;
		aiter->iter_sg = NULL;
	} else {
		aiter->iter_offset = ABD_SCATTER(abd).abd_offset;
		aiter->iter_sg = ABD_SCATTER(abd).abd_sgl;
	}
}

/*
 * This is just a helper function to see if we have exhausted the
 * abd_iter and reached the end.
 */
boolean_t
abd_iter_at_end(struct abd_iter *aiter)
{
	return (aiter->iter_pos == aiter->iter_abd->abd_size);
}

/*
 * Advance the iterator by a certain amount. Cannot be called when a chunk is
 * in use. This can be safely called when the aiter is already exhausted, in
 * which case this does nothing.
 */
void
abd_iter_advance(struct abd_iter *aiter, size_t amount)
{
	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
	ASSERT0(aiter->iter_mapsize);

	/* There's nothing left to advance to, so do nothing */
	if (abd_iter_at_end(aiter))
		return;

	aiter->iter_pos += amount;
	aiter->iter_offset += amount;
	if (!abd_is_linear(aiter->iter_abd)) {
		while (aiter->iter_offset >= aiter->iter_sg->length) {
			aiter->iter_offset -= aiter->iter_sg->length;
			aiter->iter_sg = sg_next(aiter->iter_sg);
			if (aiter->iter_sg == NULL) {
				ASSERT0(aiter->iter_offset);
				break;
			}
		}
	}
}

/*
 * Map the current chunk into aiter. This can be safely called when the aiter
 * is already exhausted, in which case this does nothing.
 */
void
abd_iter_map(struct abd_iter *aiter)
{
	void *paddr;
	size_t offset = 0;

	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
	ASSERT0(aiter->iter_mapsize);

	/* There's nothing left to iterate over, so do nothing */
	if (abd_iter_at_end(aiter))
		return;

	if (abd_is_linear(aiter->iter_abd)) {
		ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);
		offset = aiter->iter_offset;
		aiter->iter_mapsize = aiter->iter_abd->abd_size - offset;
		paddr = ABD_LINEAR_BUF(aiter->iter_abd);
	} else {
		offset = aiter->iter_offset;
		aiter->iter_mapsize = MIN(aiter->iter_sg->length - offset,
		    aiter->iter_abd->abd_size - aiter->iter_pos);

		paddr = zfs_kmap_atomic(sg_page(aiter->iter_sg));
	}

	aiter->iter_mapaddr = (char *)paddr + offset;
}

/*
 * Unmap the current chunk from aiter. This can be safely called when the aiter
 * is already exhausted, in which case this does nothing.
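 *
 * As an illustrative sketch only (the real loops live in abd.c, e.g.
 * abd_iterate_func()), the map/unmap pair is normally used like this:
 *
 *	abd_iter_init(&aiter, abd);
 *	while (!abd_iter_at_end(&aiter)) {
 *		abd_iter_map(&aiter);
 *		size_t len = aiter.iter_mapsize;
 *		... operate on len bytes at aiter.iter_mapaddr ...
 *		abd_iter_unmap(&aiter);
 *		abd_iter_advance(&aiter, len);
 *	}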
 */
void
abd_iter_unmap(struct abd_iter *aiter)
{
	/* There's nothing left to unmap, so do nothing */
	if (abd_iter_at_end(aiter))
		return;

	if (!abd_is_linear(aiter->iter_abd)) {
		/* LINTED E_FUNC_SET_NOT_USED */
		zfs_kunmap_atomic(aiter->iter_mapaddr - aiter->iter_offset);
	}

	ASSERT3P(aiter->iter_mapaddr, !=, NULL);
	ASSERT3U(aiter->iter_mapsize, >, 0);

	aiter->iter_mapaddr = NULL;
	aiter->iter_mapsize = 0;
}

void
abd_cache_reap_now(void)
{
}

#if defined(_KERNEL)
/*
 * bio_nr_pages for ABD.
 * @off is the offset in @abd
 */
unsigned long
abd_nr_pages_off(abd_t *abd, unsigned int size, size_t off)
{
	unsigned long pos;

	if (abd_is_gang(abd)) {
		unsigned long count = 0;

		for (abd_t *cabd = abd_gang_get_offset(abd, &off);
		    cabd != NULL && size != 0;
		    cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
			ASSERT3U(off, <, cabd->abd_size);
			int mysize = MIN(size, cabd->abd_size - off);
			count += abd_nr_pages_off(cabd, mysize, off);
			size -= mysize;
			off = 0;
		}
		return (count);
	}

	if (abd_is_linear(abd))
		pos = (unsigned long)abd_to_buf(abd) + off;
	else
		pos = ABD_SCATTER(abd).abd_offset + off;

	return (((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) -
	    (pos >> PAGE_SHIFT));
}

static unsigned int
bio_map(struct bio *bio, void *buf_ptr, unsigned int bio_size)
{
	unsigned int offset, size, i;
	struct page *page;

	offset = offset_in_page(buf_ptr);
	for (i = 0; i < bio->bi_max_vecs; i++) {
		size = PAGE_SIZE - offset;

		if (bio_size <= 0)
			break;

		if (size > bio_size)
			size = bio_size;

		if (is_vmalloc_addr(buf_ptr))
			page = vmalloc_to_page(buf_ptr);
		else
			page = virt_to_page(buf_ptr);

		/*
		 * Some network related block devices use tcp_sendpage, which
		 * doesn't behave well when given a 0-count page; this is a
		 * safety net to catch them.
		 */
		ASSERT3S(page_count(page), >, 0);

		if (bio_add_page(bio, page, size, offset) != size)
			break;

		buf_ptr += size;
		bio_size -= size;
		offset = 0;
	}

	return (bio_size);
}

/*
 * bio_map for gang ABD.
 */
static unsigned int
abd_gang_bio_map_off(struct bio *bio, abd_t *abd,
    unsigned int io_size, size_t off)
{
	ASSERT(abd_is_gang(abd));

	for (abd_t *cabd = abd_gang_get_offset(abd, &off);
	    cabd != NULL;
	    cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
		ASSERT3U(off, <, cabd->abd_size);
		int size = MIN(io_size, cabd->abd_size - off);
		int remainder = abd_bio_map_off(bio, cabd, size, off);
		io_size -= (size - remainder);
		if (io_size == 0 || remainder > 0)
			return (io_size);
		off = 0;
	}
	ASSERT0(io_size);
	return (io_size);
}

/*
 * bio_map for ABD.
 * @off is the offset in @abd
 * Remaining IO size is returned
 */
unsigned int
abd_bio_map_off(struct bio *bio, abd_t *abd,
    unsigned int io_size, size_t off)
{
	struct abd_iter aiter;

	ASSERT3U(io_size, <=, abd->abd_size - off);
	if (abd_is_linear(abd))
		return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, io_size));

	ASSERT(!abd_is_linear(abd));
	if (abd_is_gang(abd))
		return (abd_gang_bio_map_off(bio, abd, io_size, off));

	abd_iter_init(&aiter, abd);
	abd_iter_advance(&aiter, off);

	for (int i = 0; i < bio->bi_max_vecs; i++) {
		struct page *pg;
		size_t len, sgoff, pgoff;
		struct scatterlist *sg;

		if (io_size <= 0)
			break;

		sg = aiter.iter_sg;
		sgoff = aiter.iter_offset;
		pgoff = sgoff & (PAGESIZE - 1);
		len = MIN(io_size, PAGESIZE - pgoff);
		ASSERT(len > 0);

		pg = nth_page(sg_page(sg), sgoff >> PAGE_SHIFT);
		if (bio_add_page(bio, pg, len, pgoff) != len)
			break;

		io_size -= len;
		abd_iter_advance(&aiter, len);
	}

	return (io_size);
}

/* Tunable Parameters */
module_param(zfs_abd_scatter_enabled, int, 0644);
MODULE_PARM_DESC(zfs_abd_scatter_enabled,
	"Toggle whether ABD allocations must be linear.");
module_param(zfs_abd_scatter_min_size, int, 0644);
MODULE_PARM_DESC(zfs_abd_scatter_min_size,
	"Minimum size of scatter allocations.");
/* CSTYLED */
module_param(zfs_abd_scatter_max_order, uint, 0644);
MODULE_PARM_DESC(zfs_abd_scatter_max_order,
	"Maximum order allocation used for a scatter ABD.");
#endif