// SPDX-License-Identifier: GPL-2.0

#include <linux/slab.h>
#include "ctree.h"
#include "subpage.h"

/*
 * Subpage (sectorsize < PAGE_SIZE) support overview:
 *
 * Limitations:
 *
 * - Only support 64K page size for now
 *   This is to make metadata handling easier, as a 64K page ensures that
 *   every nodesize fits inside one page, thus we don't need to handle
 *   cases where a tree block crosses several pages.
 *
 * - Only metadata read-write for now
 *   The data read-write part is in development.
 *
 * - Metadata can't cross 64K page boundary
 *   btrfs-progs and the kernel have enforced this for a while, thus only
 *   ancient filesystems could have such a problem. For such a case, do a
 *   graceful rejection.
 *
 * Special behavior:
 *
 * - Metadata
 *   Metadata read is fully supported.
 *   That means reading one tree block will only trigger the read for the
 *   needed range; other unrelated ranges in the same page will not be
 *   touched.
 *
 *   Metadata write support is partial.
 *   The writeback is still for the full page, but we will only submit
 *   the dirty extent buffers in the page.
 *
 *   This means, if we have a metadata page like this:
 *
 *     Page offset
 *       0         16K         32K         48K          64K
 *       |/////////|           |///////////|
 *           \- Tree block A       \- Tree block B
 *
 *   Even if we just want to writeback tree block A, we will also writeback
 *   tree block B if it's also dirty.
 *
 *   This may cause extra metadata writeback, which results in more COW.
 *
 * Implementation:
 *
 * - Common
 *   Both metadata and data will use a new structure, btrfs_subpage, to
 *   record the status of each sector inside a page. This provides the extra
 *   granularity needed.
 *
 * - Metadata
 *   Since we have multiple tree blocks inside one page, we can't rely on page
 *   locking anymore, or we will have greatly reduced concurrency or even
 *   deadlocks (hold one tree lock while trying to lock another tree lock in
 *   the same page).
 *
 *   Thus for metadata locking, subpage support relies on io_tree locking
 *   only. This means a slightly higher tree locking latency.
 */
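
/*
 * For reference: the per-page state handled below lives in struct
 * btrfs_subpage, defined in subpage.h. A rough sketch of the members this
 * file relies on (see subpage.h for the authoritative layout):
 *
 *	spinlock_t lock;	   protects the bitmaps below
 *	u16 uptodate_bitmap;	   one bit per sector in the page
 *	u16 error_bitmap;
 *	u16 dirty_bitmap;
 *	u16 writeback_bitmap;
 *	atomic_t eb_refs;	   used by metadata pages only
 *	atomic_t readers;	   used by data pages only
 */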

int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
			 struct page *page, enum btrfs_subpage_type type)
{
	struct btrfs_subpage *subpage = NULL;
	int ret;

	/*
	 * We have cases like a dummy extent buffer page, which is not mapped
	 * and doesn't need to be locked.
	 */
	if (page->mapping)
		ASSERT(PageLocked(page));
	/* Either not subpage, or the page already has private attached */
	if (fs_info->sectorsize == PAGE_SIZE || PagePrivate(page))
		return 0;

	ret = btrfs_alloc_subpage(fs_info, &subpage, type);
	if (ret < 0)
		return ret;
	attach_page_private(page, subpage);
	return 0;
}

void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
			  struct page *page)
{
	struct btrfs_subpage *subpage;

	/* Either not subpage, or already detached */
	if (fs_info->sectorsize == PAGE_SIZE || !PagePrivate(page))
		return;

	subpage = (struct btrfs_subpage *)detach_page_private(page);
	ASSERT(subpage);
	btrfs_free_subpage(subpage);
}

int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
			struct btrfs_subpage **ret,
			enum btrfs_subpage_type type)
{
	if (fs_info->sectorsize == PAGE_SIZE)
		return 0;

	*ret = kzalloc(sizeof(struct btrfs_subpage), GFP_NOFS);
	if (!*ret)
		return -ENOMEM;
	spin_lock_init(&(*ret)->lock);
	if (type == BTRFS_SUBPAGE_METADATA)
		atomic_set(&(*ret)->eb_refs, 0);
	else
		atomic_set(&(*ret)->readers, 0);
	return 0;
}

void btrfs_free_subpage(struct btrfs_subpage *subpage)
{
	kfree(subpage);
}

/*
 * Increase the eb_refs of the current subpage.
 *
 * This is important for eb allocation, to prevent a race with the freeing of
 * the last eb in the same page.
 * With eb_refs increased before the eb is inserted into the radix tree,
 * detach_extent_buffer_page() won't detach the page private while we're still
 * allocating the extent buffer.
 */
void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
			    struct page *page)
{
	struct btrfs_subpage *subpage;

	if (fs_info->sectorsize == PAGE_SIZE)
		return;

	ASSERT(PagePrivate(page) && page->mapping);
	lockdep_assert_held(&page->mapping->private_lock);

	subpage = (struct btrfs_subpage *)page->private;
	atomic_inc(&subpage->eb_refs);
}

void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info,
			    struct page *page)
{
	struct btrfs_subpage *subpage;

	if (fs_info->sectorsize == PAGE_SIZE)
		return;

	ASSERT(PagePrivate(page) && page->mapping);
	lockdep_assert_held(&page->mapping->private_lock);

	subpage = (struct btrfs_subpage *)page->private;
	ASSERT(atomic_read(&subpage->eb_refs));
	atomic_dec(&subpage->eb_refs);
}

static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info,
				 struct page *page, u64 start, u32 len)
{
	/* Basic checks */
	ASSERT(PagePrivate(page) && page->private);
	ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
	       IS_ALIGNED(len, fs_info->sectorsize));
	/*
	 * The range check only works for mapped pages, we can still have
	 * unmapped pages like dummy extent buffer pages.
	 */
	if (page->mapping)
		ASSERT(page_offset(page) <= start &&
		       start + len <= page_offset(page) + PAGE_SIZE);
}

void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
				struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const int nbits = len >> fs_info->sectorsize_bits;
	int ret;

	btrfs_subpage_assert(fs_info, page, start, len);

	ret = atomic_add_return(nbits, &subpage->readers);
	ASSERT(ret == nbits);
}

void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
			      struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const int nbits = len >> fs_info->sectorsize_bits;

	btrfs_subpage_assert(fs_info, page, start, len);
	ASSERT(atomic_read(&subpage->readers) >= nbits);
	if (atomic_sub_and_test(nbits, &subpage->readers))
		unlock_page(page);
}
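
/*
 * Illustration only (hypothetical caller, not the real read path): for a
 * locked data page, the sectors to be read are accounted up front, and each
 * completed range drops its share of the reader count:
 *
 *	btrfs_subpage_start_reader(fs_info, page, start, len);
 *	... submit and complete the read for [start, start + len) ...
 *	btrfs_subpage_end_reader(fs_info, page, start, len);
 *
 * The page is only unlocked when the reader count drops back to zero.
 */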

/*
 * Convert the [start, start + len) range into a u16 bitmap
 *
 * For example: if start == page_offset() + 16K, len = 16K, we get 0x00f0.
 */
static u16 btrfs_subpage_calc_bitmap(const struct btrfs_fs_info *fs_info,
				     struct page *page, u64 start, u32 len)
{
	const int bit_start = offset_in_page(start) >> fs_info->sectorsize_bits;
	const int nbits = len >> fs_info->sectorsize_bits;

	btrfs_subpage_assert(fs_info, page, start, len);

	/*
	 * Here nbits can be 16, thus the shifted value can go beyond the u16
	 * range. Do the left shift in unsigned long (at least u32) first,
	 * then truncate the result to u16.
	 */
	return (u16)(((1UL << nbits) - 1) << bit_start);
}
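
/*
 * Another worked example of the bitmap calculation above, assuming 4K
 * sectorsize and a 64K page: start == page_offset() + 32K and len == 8K give
 * bit_start = 8 and nbits = 2, so the returned bitmap is
 * ((1UL << 2) - 1) << 8 == 0x0300.
 */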

void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info,
				struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->uptodate_bitmap |= tmp;
	if (subpage->uptodate_bitmap == U16_MAX)
		SetPageUptodate(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info,
				  struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->uptodate_bitmap &= ~tmp;
	ClearPageUptodate(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_set_error(const struct btrfs_fs_info *fs_info,
			     struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->error_bitmap |= tmp;
	SetPageError(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_clear_error(const struct btrfs_fs_info *fs_info,
			       struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->error_bitmap &= ~tmp;
	if (subpage->error_bitmap == 0)
		ClearPageError(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info,
			     struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->dirty_bitmap |= tmp;
	spin_unlock_irqrestore(&subpage->lock, flags);
	set_page_dirty(page);
}

/*
 * Extra clear_and_test function for the subpage dirty bitmap.
 *
 * Clear the given range in the dirty bitmap.
 * Return true if the cleared bits were the last dirty bits in the bitmap
 * (the whole dirty_bitmap is now zero).
 * Return false otherwise.
 *
 * NOTE: In the true case callers must clear the page dirty flag themselves,
 * as we have extra handling for tree blocks.
 */
bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
					struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;
	bool last = false;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->dirty_bitmap &= ~tmp;
	if (subpage->dirty_bitmap == 0)
		last = true;
	spin_unlock_irqrestore(&subpage->lock, flags);
	return last;
}

void btrfs_subpage_clear_dirty(const struct btrfs_fs_info *fs_info,
			       struct page *page, u64 start, u32 len)
{
	bool last;

	last = btrfs_subpage_clear_and_test_dirty(fs_info, page, start, len);
	if (last)
		clear_page_dirty_for_io(page);
}

void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info,
				 struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->writeback_bitmap |= tmp;
	set_page_writeback(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info,
				   struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->writeback_bitmap &= ~tmp;
	if (subpage->writeback_bitmap == 0)
		end_page_writeback(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

/*
 * Unlike set/clear, which depends on each page's status, all the test
 * operations check the bits in the same way.
 */
#define IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(name)				\
bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info,	\
			       struct page *page, u64 start, u32 len)	\
{									\
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; \
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); \
	unsigned long flags;						\
	bool ret;							\
									\
	spin_lock_irqsave(&subpage->lock, flags);			\
	ret = ((subpage->name##_bitmap & tmp) == tmp);			\
	spin_unlock_irqrestore(&subpage->lock, flags);			\
	return ret;							\
}
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(error);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(writeback);
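
/*
 * For example, IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty) above roughly expands
 * to:
 *
 *	bool btrfs_subpage_test_dirty(const struct btrfs_fs_info *fs_info,
 *				      struct page *page, u64 start, u32 len)
 *	{
 *		...
 *		ret = ((subpage->dirty_bitmap & tmp) == tmp);
 *		...
 *	}
 *
 * i.e. the range is considered dirty only if every sector in it is dirty.
 */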

/*
 * Note that in selftests (extent-io-tests), we can have a NULL fs_info passed
 * in. Those tests only cover sectorsize == PAGE_SIZE cases so far, thus we
 * can fall back to the regular sectorsize branch.
 */
#define IMPLEMENT_BTRFS_PAGE_OPS(name, set_page_func, clear_page_func,	\
				 test_page_func)			\
void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info,	\
			   struct page *page, u64 start, u32 len)	\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {	\
		set_page_func(page);					\
		return;							\
	}								\
	btrfs_subpage_set_##name(fs_info, page, start, len);		\
}									\
void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info,	\
			     struct page *page, u64 start, u32 len)	\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {	\
		clear_page_func(page);					\
		return;							\
	}								\
	btrfs_subpage_clear_##name(fs_info, page, start, len);		\
}									\
bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info,	\
			    struct page *page, u64 start, u32 len)	\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE)	\
		return test_page_func(page);				\
	return btrfs_subpage_test_##name(fs_info, page, start, len);	\
}
IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate,
			 PageUptodate);
IMPLEMENT_BTRFS_PAGE_OPS(error, SetPageError, ClearPageError, PageError);
IMPLEMENT_BTRFS_PAGE_OPS(dirty, set_page_dirty, clear_page_dirty_for_io,
			 PageDirty);
IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback,
			 PageWriteback);
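
/*
 * Usage sketch (hypothetical caller): code shared between regular and subpage
 * cases can simply call
 *
 *	btrfs_page_set_uptodate(fs_info, page, start, len);
 *
 * With regular sectorsize (or a NULL fs_info in selftests) this degrades to
 * SetPageUptodate(page); with subpage it only marks the given range and sets
 * the page flag once the whole page is uptodate.
 */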