// SPDX-License-Identifier: GPL-2.0

#include <linux/slab.h>
#include "ctree.h"
#include "subpage.h"
#include "btrfs_inode.h"

/*
 * Subpage (sectorsize < PAGE_SIZE) support overview:
 *
 * Limitations:
 *
 * - Only support 64K page size for now
 *   This is to make metadata handling easier, as a 64K page ensures that
 *   every supported nodesize fits inside one page, thus we don't need to
 *   handle cases where a tree block crosses several pages.
 *
 * - Only metadata read-write for now
 *   The data read-write part is in development.
 *
 * - Metadata can't cross 64K page boundary
 *   btrfs-progs and the kernel have enforced this alignment for a long time,
 *   thus only ancient filesystems could have such a problem.  For such a
 *   case, do a graceful rejection.
 *
 * Special behavior:
 *
 * - Metadata
 *   Metadata read is fully supported.
 *   This means reading one tree block only triggers the read for the needed
 *   range; other unrelated ranges in the same page are not touched.
 *
 *   Metadata write support is partial.
 *   The writeback is still for the full page, but we will only submit the
 *   dirty extent buffers in the page.
 *
 *   This means, if we have a metadata page like this:
 *
 *   Page offset
 *     0         16K         32K         48K        64K
 *     |/////////|           |///////////|
 *        \- Tree block A        \- Tree block B
 *
 *   Even if we just want to write back tree block A, we will also write back
 *   tree block B if it's also dirty.
 *
 *   This may cause extra metadata writeback, which results in more COW.
 *
 * Implementation:
 *
 * - Common
 *   Both metadata and data will use a new structure, btrfs_subpage, to
 *   record the status of each sector inside a page.  This provides the extra
 *   granularity needed.
 *
 * - Metadata
 *   Since we have multiple tree blocks inside one page, we can't rely on page
 *   locking anymore, or we will have greatly reduced concurrency or even
 *   deadlocks (hold one tree lock while trying to lock another tree lock in
 *   the same page).
 *
 *   Thus for metadata locking, subpage support relies on io_tree locking only.
 *   This means a slightly higher tree locking latency.
 */
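
/*
 * A quick illustration of the bookkeeping (4K sectorsize on a 64K page is
 * just an example; any sectorsize < PAGE_SIZE follows the same scheme):
 * each page gets a btrfs_subpage structure whose u16 bitmaps track the state
 * of every sector, one bit per sector:
 *
 *   Page offset   0    4K   8K   ...   60K   64K
 *                 |    |    |    ...   |     |
 *   Bitmap bit    0    1    2    ...   15
 *
 * How the page-level flag is derived from a bitmap differs per flag, see the
 * individual helpers below (e.g. PageUptodate() is only set once the whole
 * uptodate_bitmap is filled).
 */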

int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
			 struct page *page, enum btrfs_subpage_type type)
{
	struct btrfs_subpage *subpage = NULL;
	int ret;

	/*
	 * We have cases like a dummy extent buffer page, which is not mapped
	 * and doesn't need to be locked.
	 */
	if (page->mapping)
		ASSERT(PageLocked(page));
	/* Either not subpage, or the page already has private attached */
	if (fs_info->sectorsize == PAGE_SIZE || PagePrivate(page))
		return 0;

	ret = btrfs_alloc_subpage(fs_info, &subpage, type);
	if (ret < 0)
		return ret;
	attach_page_private(page, subpage);
	return 0;
}

void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
			  struct page *page)
{
	struct btrfs_subpage *subpage;

	/* Either not subpage, or already detached */
	if (fs_info->sectorsize == PAGE_SIZE || !PagePrivate(page))
		return;

	subpage = (struct btrfs_subpage *)detach_page_private(page);
	ASSERT(subpage);
	btrfs_free_subpage(subpage);
}

int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
			struct btrfs_subpage **ret,
			enum btrfs_subpage_type type)
{
	if (fs_info->sectorsize == PAGE_SIZE)
		return 0;

	*ret = kzalloc(sizeof(struct btrfs_subpage), GFP_NOFS);
	if (!*ret)
		return -ENOMEM;
	spin_lock_init(&(*ret)->lock);
	if (type == BTRFS_SUBPAGE_METADATA) {
		atomic_set(&(*ret)->eb_refs, 0);
	} else {
		atomic_set(&(*ret)->readers, 0);
		atomic_set(&(*ret)->writers, 0);
	}
	return 0;
}

void btrfs_free_subpage(struct btrfs_subpage *subpage)
{
	kfree(subpage);
}

/*
 * Increase the eb_refs of current subpage.
 *
 * This is important for eb allocation, to prevent a race with the freeing of
 * the last eb in the same page.
 * With eb_refs increased before the eb is inserted into the radix tree,
 * detach_extent_buffer_page() won't detach the page private while we're still
 * allocating the extent buffer.
 */
void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
			    struct page *page)
{
	struct btrfs_subpage *subpage;

	if (fs_info->sectorsize == PAGE_SIZE)
		return;

	ASSERT(PagePrivate(page) && page->mapping);
	lockdep_assert_held(&page->mapping->private_lock);

	subpage = (struct btrfs_subpage *)page->private;
	atomic_inc(&subpage->eb_refs);
}

void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info,
			    struct page *page)
{
	struct btrfs_subpage *subpage;

	if (fs_info->sectorsize == PAGE_SIZE)
		return;

	ASSERT(PagePrivate(page) && page->mapping);
	lockdep_assert_held(&page->mapping->private_lock);

	subpage = (struct btrfs_subpage *)page->private;
	ASSERT(atomic_read(&subpage->eb_refs));
	atomic_dec(&subpage->eb_refs);
}
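
/*
 * A rough sketch of how the two helpers above are expected to pair up.  This
 * is a hypothetical caller for illustration only; the real callers live in
 * the extent buffer code and may differ in detail:
 *
 *	spin_lock(&page->mapping->private_lock);
 *	btrfs_page_inc_eb_refs(fs_info, page);
 *	spin_unlock(&page->mapping->private_lock);
 *	// now it is safe to insert the eb into the buffer radix tree
 *	...
 *	spin_lock(&page->mapping->private_lock);
 *	btrfs_page_dec_eb_refs(fs_info, page);
 *	spin_unlock(&page->mapping->private_lock);
 *
 * While eb_refs is non-zero, detach_extent_buffer_page() won't detach the
 * page private under us.
 */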

static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info,
				 struct page *page, u64 start, u32 len)
{
	/* Basic checks */
	ASSERT(PagePrivate(page) && page->private);
	ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
	       IS_ALIGNED(len, fs_info->sectorsize));
	/*
	 * The range check only works for mapped pages, we can still have
	 * unmapped pages like dummy extent buffer pages.
	 */
	if (page->mapping)
		ASSERT(page_offset(page) <= start &&
		       start + len <= page_offset(page) + PAGE_SIZE);
}

void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
				struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const int nbits = len >> fs_info->sectorsize_bits;

	btrfs_subpage_assert(fs_info, page, start, len);

	atomic_add(nbits, &subpage->readers);
}

void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
			      struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const int nbits = len >> fs_info->sectorsize_bits;
	bool is_data;
	bool last;

	btrfs_subpage_assert(fs_info, page, start, len);
	is_data = is_data_inode(page->mapping->host);
	ASSERT(atomic_read(&subpage->readers) >= nbits);
	last = atomic_sub_and_test(nbits, &subpage->readers);

	/*
	 * For data we need to unlock the page if the last read has finished.
	 *
	 * Please don't replace @last with an atomic_sub_and_test() call inside
	 * the if () condition, as we want atomic_sub_and_test() to always be
	 * executed.
	 */
	if (is_data && last)
		unlock_page(page);
}
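
/*
 * Clamp [start, start + len) to the boundaries of @page.
 *
 * A worked example (illustrative numbers only): with page_offset(page) == 0
 * and PAGE_SIZE == 64K, a caller passing start = 60K, len = 8K gets the range
 * clamped to start = 60K, len = 4K, so it never extends past the page.
 */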
static void btrfs_subpage_clamp_range(struct page *page, u64 *start, u32 *len)
{
	u64 orig_start = *start;
	u32 orig_len = *len;

	*start = max_t(u64, page_offset(page), orig_start);
	*len = min_t(u64, page_offset(page) + PAGE_SIZE,
		     orig_start + orig_len) - *start;
}

void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
				struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const int nbits = (len >> fs_info->sectorsize_bits);
	int ret;

	btrfs_subpage_assert(fs_info, page, start, len);

	ASSERT(atomic_read(&subpage->readers) == 0);
	ret = atomic_add_return(nbits, &subpage->writers);
	ASSERT(ret == nbits);
}

bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
				       struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const int nbits = (len >> fs_info->sectorsize_bits);

	btrfs_subpage_assert(fs_info, page, start, len);

	ASSERT(atomic_read(&subpage->writers) >= nbits);
	return atomic_sub_and_test(nbits, &subpage->writers);
}

/*
 * Lock a page for delalloc page writeback.
 *
 * Return -EAGAIN if the page is not properly initialized.
 * Return 0 with the page locked, and writer counter updated.
 *
 * Even with 0 returned, the page still needs extra checks to make sure it's
 * really the correct page, as the caller is using find_get_pages_contig(),
 * which can race with page invalidation.
 */
int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info,
				 struct page *page, u64 start, u32 len)
{
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {
		lock_page(page);
		return 0;
	}
	lock_page(page);
	if (!PagePrivate(page) || !page->private) {
		unlock_page(page);
		return -EAGAIN;
	}
	btrfs_subpage_clamp_range(page, &start, &len);
	btrfs_subpage_start_writer(fs_info, page, start, len);
	return 0;
}

void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
				struct page *page, u64 start, u32 len)
{
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE)
		return unlock_page(page);
	btrfs_subpage_clamp_range(page, &start, &len);
	if (btrfs_subpage_end_and_test_writer(fs_info, page, start, len))
		unlock_page(page);
}

/*
 * Convert the [start, start + len) range into a u16 bitmap
 *
 * For example: if start == page_offset() + 16K, len = 16K, we get 0x00f0
 * (with a 4K sectorsize, that is bit_start = 4 and nbits = 4).
 */
static u16 btrfs_subpage_calc_bitmap(const struct btrfs_fs_info *fs_info,
				     struct page *page, u64 start, u32 len)
{
	const int bit_start = offset_in_page(start) >> fs_info->sectorsize_bits;
	const int nbits = len >> fs_info->sectorsize_bits;

	btrfs_subpage_assert(fs_info, page, start, len);

	/*
	 * Here nbits can be 16, thus the shift can go beyond the u16 range.
	 * Do the first left shift in unsigned long (at least u32), then
	 * truncate the result to u16.
	 */
	return (u16)(((1UL << nbits) - 1) << bit_start);
}

void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info,
				struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->uptodate_bitmap |= tmp;
	if (subpage->uptodate_bitmap == U16_MAX)
		SetPageUptodate(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info,
				  struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->uptodate_bitmap &= ~tmp;
	ClearPageUptodate(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}
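
/*
 * An example of how the two helpers above interact (illustrative only,
 * assuming 4K sectorsize on a 64K page):
 *
 *	btrfs_subpage_set_uptodate(fs_info, page, page_offset(page), SZ_32K);
 *		// uptodate_bitmap == 0x00ff, PageUptodate() still clear
 *	btrfs_subpage_set_uptodate(fs_info, page, page_offset(page) + SZ_32K,
 *				   SZ_32K);
 *		// uptodate_bitmap == 0xffff, SetPageUptodate() is called
 *
 * Clearing any sub-range clears PageUptodate() right away, as the page is no
 * longer fully uptodate.
 */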

void btrfs_subpage_set_error(const struct btrfs_fs_info *fs_info,
			     struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->error_bitmap |= tmp;
	SetPageError(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_clear_error(const struct btrfs_fs_info *fs_info,
			       struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->error_bitmap &= ~tmp;
	if (subpage->error_bitmap == 0)
		ClearPageError(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info,
			     struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->dirty_bitmap |= tmp;
	spin_unlock_irqrestore(&subpage->lock, flags);
	set_page_dirty(page);
}

/*
 * Extra clear_and_test function for subpage dirty bitmap.
 *
 * Return true if we cleared the last bits of the dirty_bitmap, i.e. the
 * bitmap is now empty.
 * Return false otherwise.
 *
 * NOTE: Callers should manually clear the page dirty flag in the true case,
 * as we have extra handling for tree blocks.
 */
bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
					struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;
	bool last = false;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->dirty_bitmap &= ~tmp;
	if (subpage->dirty_bitmap == 0)
		last = true;
	spin_unlock_irqrestore(&subpage->lock, flags);
	return last;
}

void btrfs_subpage_clear_dirty(const struct btrfs_fs_info *fs_info,
			       struct page *page, u64 start, u32 len)
{
	bool last;

	last = btrfs_subpage_clear_and_test_dirty(fs_info, page, start, len);
	if (last)
		clear_page_dirty_for_io(page);
}

void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info,
				 struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->writeback_bitmap |= tmp;
	set_page_writeback(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info,
				   struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->writeback_bitmap &= ~tmp;
	if (subpage->writeback_bitmap == 0) {
		ASSERT(PageWriteback(page));
		end_page_writeback(page);
	}
	spin_unlock_irqrestore(&subpage->lock, flags);
}
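
/*
 * A rough sketch of how the writeback helpers above are meant to be used
 * (hypothetical caller, for illustration only):
 *
 *	btrfs_subpage_set_writeback(fs_info, page, start, len);
 *	// submit the bio covering [start, start + len)
 *	...
 *	// and in the endio handler:
 *	btrfs_subpage_clear_writeback(fs_info, page, start, len);
 *
 * end_page_writeback() is only called once the last range under writeback in
 * the page has finished, so the page-level writeback flag stays in sync with
 * the subpage writeback_bitmap.
 */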

void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info,
			       struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->ordered_bitmap |= tmp;
	SetPageOrdered(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info,
				 struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->ordered_bitmap &= ~tmp;
	if (subpage->ordered_bitmap == 0)
		ClearPageOrdered(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

/*
 * Unlike set/clear, which depends on each page status, all test operations
 * work the same way: every bit of the range must be set.
 */
#define IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(name)				\
bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info,	\
			       struct page *page, u64 start, u32 len)	\
{									\
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; \
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); \
	unsigned long flags;						\
	bool ret;							\
									\
	spin_lock_irqsave(&subpage->lock, flags);			\
	ret = ((subpage->name##_bitmap & tmp) == tmp);			\
	spin_unlock_irqrestore(&subpage->lock, flags);			\
	return ret;							\
}
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(error);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(writeback);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(ordered);

/*
 * Note that in selftests (extent-io-tests) we can have a NULL fs_info passed
 * in.  We only test sectorsize == PAGE_SIZE cases so far, thus we can fall
 * back to the regular sectorsize branch.
 */
#define IMPLEMENT_BTRFS_PAGE_OPS(name, set_page_func, clear_page_func,	\
				 test_page_func)			\
void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info,	\
		struct page *page, u64 start, u32 len)			\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {	\
		set_page_func(page);					\
		return;							\
	}								\
	btrfs_subpage_set_##name(fs_info, page, start, len);		\
}									\
void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info,	\
		struct page *page, u64 start, u32 len)			\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {	\
		clear_page_func(page);					\
		return;							\
	}								\
	btrfs_subpage_clear_##name(fs_info, page, start, len);		\
}									\
bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info,	\
		struct page *page, u64 start, u32 len)			\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE)	\
		return test_page_func(page);				\
	return btrfs_subpage_test_##name(fs_info, page, start, len);	\
}									\
void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info,	\
		struct page *page, u64 start, u32 len)			\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {	\
		set_page_func(page);					\
		return;							\
	}								\
	btrfs_subpage_clamp_range(page, &start, &len);			\
	btrfs_subpage_set_##name(fs_info, page, start, len);		\
}									\
void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
		struct page *page, u64 start, u32 len)			\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {	\
		clear_page_func(page);					\
		return;							\
	}								\
	btrfs_subpage_clamp_range(page, &start, &len);			\
	btrfs_subpage_clear_##name(fs_info, page, start, len);		\
}									\
bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info,	\
		struct page *page, u64 start, u32 len)			\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE)	\
		return test_page_func(page);				\
	btrfs_subpage_clamp_range(page, &start, &len);			\
	return btrfs_subpage_test_##name(fs_info, page, start, len);	\
}
IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate,
			 PageUptodate);
IMPLEMENT_BTRFS_PAGE_OPS(error, SetPageError, ClearPageError, PageError);
IMPLEMENT_BTRFS_PAGE_OPS(dirty, set_page_dirty, clear_page_dirty_for_io,
			 PageDirty);
IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback,
			 PageWriteback);
IMPLEMENT_BTRFS_PAGE_OPS(ordered, SetPageOrdered, ClearPageOrdered,
			 PageOrdered);
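
/*
 * The btrfs_page_*() wrappers generated above act on the whole page when
 * sectorsize == PAGE_SIZE (or when fs_info is NULL in selftests) and fall
 * back to the subpage bitmaps otherwise.  For example (illustrative only):
 *
 *	if (btrfs_page_test_uptodate(fs_info, page, start, len))
 *		// every sector in [start, start + len) is uptodate
 *
 * The btrfs_page_clamp_*() variants additionally clamp the range to the page
 * boundaries first, so the caller may pass a range that extends beyond this
 * page.
 */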

/*
 * Make sure not only the page dirty bit is cleared, but also the subpage
 * dirty bit is cleared.
 */
void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
				 struct page *page)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;

	if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
		return;

	ASSERT(!PageDirty(page));
	if (fs_info->sectorsize == PAGE_SIZE)
		return;

	ASSERT(PagePrivate(page) && page->private);
	ASSERT(subpage->dirty_bitmap == 0);
}