/*
 * Copyright (C) 2015 Shaohua Li <shli@fb.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 */
#include <linux/kernel.h>
#include <linux/wait.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/raid/md_p.h>
#include <linux/crc32c.h>
#include <linux/random.h>
#include "md.h"
#include "raid5.h"

/*
 * Metadata/data are stored on disk in 4k units (blocks) regardless of the
 * underlying hardware sector size. This only works with PAGE_SIZE == 4096.
 */
#define BLOCK_SECTORS (8)

/*
 * Reclaim runs once 1/4 of the disk size or 10G of reclaimable space has
 * accumulated, whichever is smaller. This prevents recovery from having to
 * scan a very long log.
 */
#define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
#define RECLAIM_MAX_FREE_SPACE_SHIFT (2)

/*
 * We only need 2 bios per I/O unit to make progress, but ensure we
 * have a few more available to not get too tight.
 */
#define R5L_POOL_SIZE	4

struct r5l_log {
	struct md_rdev *rdev;

	u32 uuid_checksum;

	sector_t device_size;		/* log device size, rounded to
					 * BLOCK_SECTORS */
	sector_t max_free_space;	/* reclaim runs if free space reaches
					 * this size */

	sector_t last_checkpoint;	/* log tail. where recovery scan
					 * starts from */
	u64 last_cp_seq;		/* log tail sequence */

	sector_t log_start;		/* log head. where new data appends */
	u64 seq;			/* log head sequence */

	sector_t next_checkpoint;
	u64 next_cp_seq;

	struct mutex io_mutex;
	struct r5l_io_unit *current_io;	/* current io_unit accepting new data */

	spinlock_t io_list_lock;
	struct list_head running_ios;	/* io_units which are still running,
					 * and have not yet been completely
					 * written to the log */
	struct list_head io_end_ios;	/* io_units which have been completely
					 * written to the log but not yet
					 * written to the RAID */
	struct list_head flushing_ios;	/* io_units which are waiting for log
					 * cache flush */
	struct list_head finished_ios;	/* io_units which have settled down in
					 * the log disk */
	struct bio flush_bio;

	struct list_head no_mem_stripes; /* pending stripes, -ENOMEM */

	struct kmem_cache *io_kc;
	mempool_t *io_pool;
	struct bio_set *bs;
	mempool_t *meta_pool;

	struct md_thread *reclaim_thread;
	unsigned long reclaim_target;	/* amount of space that needs to be
					 * reclaimed. if it's 0, reclaim spaces
					 * used by io_units which are in
					 * IO_UNIT_STRIPE_END state (i.e.
					 * reclaim doesn't wait for a specific
					 * io_unit to switch to
					 * IO_UNIT_STRIPE_END state) */
	wait_queue_head_t iounit_wait;

	struct list_head no_space_stripes; /* pending stripes, log has no space */
	spinlock_t no_space_stripes_lock;

	bool need_cache_flush;
};
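
/*
 * Rough life cycle of an io_unit across the lists above (see also the
 * r5l_io_unit_state enum below): while it accepts new stripe data it sits on
 * running_ios (IO_UNIT_RUNNING, then IO_UNIT_IO_START once its bio has been
 * submitted); when the log write completes it becomes IO_UNIT_IO_END and is
 * moved either straight to finished_ios, or to io_end_ios and later
 * flushing_ios when the log device needs an explicit cache flush; once all of
 * its stripes have been written to the raid disks it reaches
 * IO_UNIT_STRIPE_END and reclaim may reuse its log space.
 */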

/*
 * An IO range starts from a meta data block and ends at the next meta data
 * block. The io_unit's meta data block tracks the data/parity that follows
 * it. The io_unit is written to the log disk with a normal write; as we
 * always flush the log disk first and only then start moving data to the
 * raid disks, there is no requirement to write the io_unit with FLUSH/FUA.
 */
struct r5l_io_unit {
	struct r5l_log *log;

	struct page *meta_page;	/* store meta block */
	int meta_offset;	/* current offset in meta_page */

	struct bio *current_bio;/* current_bio accepting new data */

	atomic_t pending_stripe;/* how many stripes not flushed to raid */
	u64 seq;		/* seq number of the metablock */
	sector_t log_start;	/* where the io_unit starts */
	sector_t log_end;	/* where the io_unit ends */
	struct list_head log_sibling; /* log->running_ios */
	struct list_head stripe_list; /* stripes added to the io_unit */

	int state;
	bool need_split_bio;
};

/* r5l_io_unit state */
enum r5l_io_unit_state {
	IO_UNIT_RUNNING = 0,	/* accepting new IO */
	IO_UNIT_IO_START = 1,	/* io_unit bio started writing to the log,
				 * doesn't accept new bio */
	IO_UNIT_IO_END = 2,	/* io_unit bio finished writing to the log */
	IO_UNIT_STRIPE_END = 3,	/* stripe data finished writing to raid */
};

static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
{
	start += inc;
	if (start >= log->device_size)
		start = start - log->device_size;
	return start;
}

static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
				  sector_t end)
{
	if (end >= start)
		return end - start;
	else
		return end + log->device_size - start;
}

static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
{
	sector_t used_size;

	used_size = r5l_ring_distance(log, log->last_checkpoint,
				      log->log_start);

	return log->device_size > used_size + size;
}
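
/*
 * For example (illustrative numbers only): with a 1024-sector log,
 * last_checkpoint == 900 and log_start == 100, the used range wraps, so
 * r5l_ring_distance() returns 100 + 1024 - 900 = 224 sectors;
 * r5l_has_free_space(log, 792) then succeeds (224 + 792 < 1024) while
 * r5l_has_free_space(log, 800) does not.
 */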

static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
				    enum r5l_io_unit_state state)
{
	if (WARN_ON(io->state >= state))
		return;
	io->state = state;
}

static void r5l_io_run_stripes(struct r5l_io_unit *io)
{
	struct stripe_head *sh, *next;

	list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
		list_del_init(&sh->log_list);
		set_bit(STRIPE_HANDLE, &sh->state);
		raid5_release_stripe(sh);
	}
}

static void r5l_log_run_stripes(struct r5l_log *log)
{
	struct r5l_io_unit *io, *next;

	assert_spin_locked(&log->io_list_lock);

	list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
		/* don't change list order */
		if (io->state < IO_UNIT_IO_END)
			break;

		list_move_tail(&io->log_sibling, &log->finished_ios);
		r5l_io_run_stripes(io);
	}
}

static void r5l_move_to_end_ios(struct r5l_log *log)
{
	struct r5l_io_unit *io, *next;

	assert_spin_locked(&log->io_list_lock);

	list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
		/* don't change list order */
		if (io->state < IO_UNIT_IO_END)
			break;
		list_move_tail(&io->log_sibling, &log->io_end_ios);
	}
}

static void r5l_log_endio(struct bio *bio)
{
	struct r5l_io_unit *io = bio->bi_private;
	struct r5l_log *log = io->log;
	unsigned long flags;

	if (bio->bi_error)
		md_error(log->rdev->mddev, log->rdev);

	bio_put(bio);
	mempool_free(io->meta_page, log->meta_pool);

	spin_lock_irqsave(&log->io_list_lock, flags);
	__r5l_set_io_unit_state(io, IO_UNIT_IO_END);
	if (log->need_cache_flush)
		r5l_move_to_end_ios(log);
	else
		r5l_log_run_stripes(log);
	spin_unlock_irqrestore(&log->io_list_lock, flags);

	if (log->need_cache_flush)
		md_wakeup_thread(log->rdev->mddev->thread);
}

static void r5l_submit_current_io(struct r5l_log *log)
{
	struct r5l_io_unit *io = log->current_io;
	struct r5l_meta_block *block;
	unsigned long flags;
	u32 crc;

	if (!io)
		return;

	block = page_address(io->meta_page);
	block->meta_size = cpu_to_le32(io->meta_offset);
	crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
	block->checksum = cpu_to_le32(crc);

	log->current_io = NULL;
	spin_lock_irqsave(&log->io_list_lock, flags);
	__r5l_set_io_unit_state(io, IO_UNIT_IO_START);
	spin_unlock_irqrestore(&log->io_list_lock, flags);

	submit_bio(io->current_bio);
}

static struct bio *r5l_bio_alloc(struct r5l_log *log)
{
	struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs);

	bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
	bio->bi_bdev = log->rdev->bdev;
	bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start;

	return bio;
}

static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io)
{
	log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);

	/*
	 * If we filled up the log device, start from the beginning again,
	 * which will require a new bio.
	 *
	 * Note: for this to work properly the log size needs to be a multiple
	 * of BLOCK_SECTORS.
	 */
	if (log->log_start == 0)
		io->need_split_bio = true;

	io->log_end = log->log_start;
}

static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
{
	struct r5l_io_unit *io;
	struct r5l_meta_block *block;

	io = mempool_alloc(log->io_pool, GFP_ATOMIC);
	if (!io)
		return NULL;
	memset(io, 0, sizeof(*io));

	io->log = log;
	INIT_LIST_HEAD(&io->log_sibling);
	INIT_LIST_HEAD(&io->stripe_list);
	io->state = IO_UNIT_RUNNING;

	io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO);
	block = page_address(io->meta_page);
	clear_page(block);
	block->magic = cpu_to_le32(R5LOG_MAGIC);
	block->version = R5LOG_VERSION;
	block->seq = cpu_to_le64(log->seq);
	block->position = cpu_to_le64(log->log_start);

	io->log_start = log->log_start;
	io->meta_offset = sizeof(struct r5l_meta_block);
	io->seq = log->seq++;

	io->current_bio = r5l_bio_alloc(log);
	io->current_bio->bi_end_io = r5l_log_endio;
	io->current_bio->bi_private = io;
	bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0);

	r5_reserve_log_entry(log, io);

	spin_lock_irq(&log->io_list_lock);
	list_add_tail(&io->log_sibling, &log->running_ios);
	spin_unlock_irq(&log->io_list_lock);

	return io;
}

static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
{
	if (log->current_io &&
	    log->current_io->meta_offset + payload_size > PAGE_SIZE)
		r5l_submit_current_io(log);

	if (!log->current_io) {
		log->current_io = r5l_new_meta(log);
		if (!log->current_io)
			return -ENOMEM;
	}

	return 0;
}

static void r5l_append_payload_meta(struct r5l_log *log, u16 type,
				    sector_t location,
				    u32 checksum1, u32 checksum2,
				    bool checksum2_valid)
{
	struct r5l_io_unit *io = log->current_io;
	struct r5l_payload_data_parity *payload;

	payload = page_address(io->meta_page) + io->meta_offset;
	payload->header.type = cpu_to_le16(type);
	payload->header.flags = cpu_to_le16(0);
	payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
				    (PAGE_SHIFT - 9));
	payload->location = cpu_to_le64(location);
	payload->checksum[0] = cpu_to_le32(checksum1);
	if (checksum2_valid)
		payload->checksum[1] = cpu_to_le32(checksum2);

	io->meta_offset += sizeof(struct r5l_payload_data_parity) +
		sizeof(__le32) * (1 + !!checksum2_valid);
}
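
/*
 * Meta page layout, roughly: a struct r5l_meta_block header followed by a
 * sequence of struct r5l_payload_data_parity entries, each carrying one
 * checksum for a data page or one/two checksums for the parity page(s).
 * E.g. a data payload advances meta_offset by
 * sizeof(struct r5l_payload_data_parity) + sizeof(__le32) bytes, a RAID6
 * parity payload by the same plus one more __le32.
 */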

static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
{
	struct r5l_io_unit *io = log->current_io;

	if (io->need_split_bio) {
		struct bio *prev = io->current_bio;

		io->current_bio = r5l_bio_alloc(log);
		bio_chain(io->current_bio, prev);

		submit_bio(prev);
	}

	if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0))
		BUG();

	r5_reserve_log_entry(log, io);
}

static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
			  int data_pages, int parity_pages)
{
	int i;
	int meta_size;
	int ret;
	struct r5l_io_unit *io;

	meta_size =
		((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
		 * data_pages) +
		sizeof(struct r5l_payload_data_parity) +
		sizeof(__le32) * parity_pages;

	ret = r5l_get_meta(log, meta_size);
	if (ret)
		return ret;

	io = log->current_io;

	for (i = 0; i < sh->disks; i++) {
		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
			continue;
		if (i == sh->pd_idx || i == sh->qd_idx)
			continue;
		r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
					raid5_compute_blocknr(sh, i, 0),
					sh->dev[i].log_checksum, 0, false);
		r5l_append_payload_page(log, sh->dev[i].page);
	}

	if (sh->qd_idx >= 0) {
		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
					sh->sector, sh->dev[sh->pd_idx].log_checksum,
					sh->dev[sh->qd_idx].log_checksum, true);
		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
		r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
	} else {
		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
					sh->sector, sh->dev[sh->pd_idx].log_checksum,
					0, false);
		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
	}

	list_add_tail(&sh->log_list, &io->stripe_list);
	atomic_inc(&io->pending_stripe);
	sh->log_io = io;

	return 0;
}

static void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
/*
 * This runs in raid5d, and reclaim could in turn wait for raid5d (when it
 * flushes data from the log to the raid disks), so we must not wait for
 * reclaim here.
 */
int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
{
	int write_disks = 0;
	int data_pages, parity_pages;
	int meta_size;
	int reserve;
	int i;
	int ret = 0;

	if (!log)
		return -EAGAIN;
	/* Don't support stripe batch */
	if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
	    test_bit(STRIPE_SYNCING, &sh->state)) {
		/* the stripe is written to log, we start writing it to raid */
		clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
		return -EAGAIN;
	}

	for (i = 0; i < sh->disks; i++) {
		void *addr;

		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
			continue;
		write_disks++;
		/* checksum is already calculated in last run */
		if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
			continue;
		addr = kmap_atomic(sh->dev[i].page);
		sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
						    addr, PAGE_SIZE);
		kunmap_atomic(addr);
	}
	parity_pages = 1 + !!(sh->qd_idx >= 0);
	data_pages = write_disks - parity_pages;

	meta_size =
		((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
		 * data_pages) +
		sizeof(struct r5l_payload_data_parity) +
		sizeof(__le32) * parity_pages;
	/* Doesn't work with very big raid array */
	if (meta_size + sizeof(struct r5l_meta_block) > PAGE_SIZE)
		return -EINVAL;

	set_bit(STRIPE_LOG_TRAPPED, &sh->state);
	/*
	 * The stripe must enter state machine again to finish the write, so
	 * don't delay.
	 */
	clear_bit(STRIPE_DELAYED, &sh->state);
	atomic_inc(&sh->count);

	mutex_lock(&log->io_mutex);
	/* meta + data */
	reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
	if (!r5l_has_free_space(log, reserve)) {
		spin_lock(&log->no_space_stripes_lock);
		list_add_tail(&sh->log_list, &log->no_space_stripes);
		spin_unlock(&log->no_space_stripes_lock);

		r5l_wake_reclaim(log, reserve);
	} else {
		ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
		if (ret) {
			spin_lock_irq(&log->io_list_lock);
			list_add_tail(&sh->log_list, &log->no_mem_stripes);
			spin_unlock_irq(&log->io_list_lock);
		}
	}

	mutex_unlock(&log->io_mutex);
	return 0;
}
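
/*
 * Note on r5l_write_stripe()'s space reservation: one meta block plus one
 * block per written page, expressed in sectors.  For instance (illustrative
 * numbers), a full-stripe write on a 4+2 RAID6 array has write_disks == 6,
 * so reserve == (1 + 6) << (PAGE_SHIFT - 9) == 56 sectors of log space.
 */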

void r5l_write_stripe_run(struct r5l_log *log)
{
	if (!log)
		return;
	mutex_lock(&log->io_mutex);
	r5l_submit_current_io(log);
	mutex_unlock(&log->io_mutex);
}

int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
{
	if (!log)
		return -ENODEV;
	/*
	 * We flush the log disk cache first, then write stripe data to the
	 * raid disks. So if this bio is finished, the log disk cache is
	 * already flushed. Recovery guarantees we can recover the bio from
	 * the log disk, so we don't need to flush again.
	 */
	if (bio->bi_iter.bi_size == 0) {
		bio_endio(bio);
		return 0;
	}
	bio->bi_opf &= ~REQ_PREFLUSH;
	return -EAGAIN;
}

/* This will run after log space is reclaimed */
static void r5l_run_no_space_stripes(struct r5l_log *log)
{
	struct stripe_head *sh;

	spin_lock(&log->no_space_stripes_lock);
	while (!list_empty(&log->no_space_stripes)) {
		sh = list_first_entry(&log->no_space_stripes,
				      struct stripe_head, log_list);
		list_del_init(&sh->log_list);
		set_bit(STRIPE_HANDLE, &sh->state);
		raid5_release_stripe(sh);
	}
	spin_unlock(&log->no_space_stripes_lock);
}

static sector_t r5l_reclaimable_space(struct r5l_log *log)
{
	return r5l_ring_distance(log, log->last_checkpoint,
				 log->next_checkpoint);
}

static void r5l_run_no_mem_stripe(struct r5l_log *log)
{
	struct stripe_head *sh;

	assert_spin_locked(&log->io_list_lock);

	if (!list_empty(&log->no_mem_stripes)) {
		sh = list_first_entry(&log->no_mem_stripes,
				      struct stripe_head, log_list);
		list_del_init(&sh->log_list);
		set_bit(STRIPE_HANDLE, &sh->state);
		raid5_release_stripe(sh);
	}
}

static bool r5l_complete_finished_ios(struct r5l_log *log)
{
	struct r5l_io_unit *io, *next;
	bool found = false;

	assert_spin_locked(&log->io_list_lock);

	list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) {
		/* don't change list order */
		if (io->state < IO_UNIT_STRIPE_END)
			break;

		log->next_checkpoint = io->log_start;
		log->next_cp_seq = io->seq;

		list_del(&io->log_sibling);
		mempool_free(io, log->io_pool);

		r5l_run_no_mem_stripe(log);

		found = true;
	}

	return found;
}

static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
{
	struct r5l_log *log = io->log;
	unsigned long flags;

	spin_lock_irqsave(&log->io_list_lock, flags);
	__r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);

	if (!r5l_complete_finished_ios(log)) {
		spin_unlock_irqrestore(&log->io_list_lock, flags);
		return;
	}

	if (r5l_reclaimable_space(log) > log->max_free_space)
		r5l_wake_reclaim(log, 0);

	spin_unlock_irqrestore(&log->io_list_lock, flags);
	wake_up(&log->iounit_wait);
}

void r5l_stripe_write_finished(struct stripe_head *sh)
{
	struct r5l_io_unit *io;

	io = sh->log_io;
	sh->log_io = NULL;

	if (io && atomic_dec_and_test(&io->pending_stripe))
		__r5l_stripe_write_finished(io);
}

static void r5l_log_flush_endio(struct bio *bio)
{
	struct r5l_log *log = container_of(bio, struct r5l_log,
					   flush_bio);
	unsigned long flags;
	struct r5l_io_unit *io;

	if (bio->bi_error)
		md_error(log->rdev->mddev, log->rdev);

	spin_lock_irqsave(&log->io_list_lock, flags);
	list_for_each_entry(io, &log->flushing_ios, log_sibling)
		r5l_io_run_stripes(io);
	list_splice_tail_init(&log->flushing_ios, &log->finished_ios);
	spin_unlock_irqrestore(&log->io_list_lock, flags);
}

/*
 * Start dispatching IO to the raid disks.
 * A log consists of a sequence of io_units, each headed by a meta block.
 * There is one situation we want to avoid: a broken meta block in the middle
 * of the log means recovery cannot find the meta blocks after it (towards the
 * log head). So if an operation requires a meta block near the head to be
 * persistent in the log, we must make sure the meta blocks before it are
 * persistent too. A case is:
 *
 * stripe data/parity is in the log and we start writing the stripe to the
 * raid disks. The stripe data/parity must be persistent in the log before we
 * do the write to the raid disks.
 *
 * The solution is that we strictly maintain io_unit list order. In this case,
 * we only write the stripes of an io_unit to the raid disks once it has
 * become the first io_unit in the list, i.e. everything before it is already
 * persistent in the log.
 */
void r5l_flush_stripe_to_raid(struct r5l_log *log)
{
	bool do_flush;

	if (!log || !log->need_cache_flush)
		return;

	spin_lock_irq(&log->io_list_lock);
	/* flush bio is running */
	if (!list_empty(&log->flushing_ios)) {
		spin_unlock_irq(&log->io_list_lock);
		return;
	}
	list_splice_tail_init(&log->io_end_ios, &log->flushing_ios);
	do_flush = !list_empty(&log->flushing_ios);
	spin_unlock_irq(&log->io_list_lock);

	if (!do_flush)
		return;
	bio_reset(&log->flush_bio);
	log->flush_bio.bi_bdev = log->rdev->bdev;
	log->flush_bio.bi_end_io = r5l_log_flush_endio;
	bio_set_op_attrs(&log->flush_bio, REQ_OP_WRITE, WRITE_FLUSH);
	submit_bio(&log->flush_bio);
}

static void r5l_write_super(struct r5l_log *log, sector_t cp);
static void r5l_write_super_and_discard_space(struct r5l_log *log,
					      sector_t end)
{
	struct block_device *bdev = log->rdev->bdev;
	struct mddev *mddev;

	r5l_write_super(log, end);

	if (!blk_queue_discard(bdev_get_queue(bdev)))
		return;

	mddev = log->rdev->mddev;
	/*
	 * Discard could zero data, so before discarding we must make sure the
	 * superblock is updated to the new log tail. Updating the superblock
	 * (either by calling md_update_sb() directly or depending on the md
	 * thread) must hold the reconfig mutex. On the other hand,
	 * raid5_quiesce is called with the reconfig_mutex held. The first step
	 * of raid5_quiesce() is waiting for all IO to finish, hence waiting
	 * for the reclaim thread, while the reclaim thread is calling this
	 * function and waiting for the reconfig mutex. So there is a deadlock.
	 * We work around this issue with a trylock.
	 * FIXME: we could miss a discard if we can't take the reconfig mutex.
	 */
	set_mask_bits(&mddev->flags, 0,
		      BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
	if (!mddev_trylock(mddev))
		return;
	md_update_sb(mddev, 1);
	mddev_unlock(mddev);

	/* discard IO error really doesn't matter, ignore it */
	if (log->last_checkpoint < end) {
		blkdev_issue_discard(bdev,
				log->last_checkpoint + log->rdev->data_offset,
				end - log->last_checkpoint, GFP_NOIO, 0);
	} else {
		blkdev_issue_discard(bdev,
				log->last_checkpoint + log->rdev->data_offset,
				log->device_size - log->last_checkpoint,
				GFP_NOIO, 0);
		blkdev_issue_discard(bdev, log->rdev->data_offset, end,
				GFP_NOIO, 0);
	}
}

static void r5l_do_reclaim(struct r5l_log *log)
{
	sector_t reclaim_target = xchg(&log->reclaim_target, 0);
	sector_t reclaimable;
	sector_t next_checkpoint;
	u64 next_cp_seq;

	spin_lock_irq(&log->io_list_lock);
	/*
	 * Move the proper io_units to the reclaim list. We should not change
	 * the order: reclaimable and unreclaimable io_units can be mixed in
	 * the list, and we shouldn't reuse the space of an unreclaimable
	 * io_unit.
	 */
	while (1) {
		reclaimable = r5l_reclaimable_space(log);
		if (reclaimable >= reclaim_target ||
		    (list_empty(&log->running_ios) &&
		     list_empty(&log->io_end_ios) &&
		     list_empty(&log->flushing_ios) &&
		     list_empty(&log->finished_ios)))
			break;

		md_wakeup_thread(log->rdev->mddev->thread);
		wait_event_lock_irq(log->iounit_wait,
				    r5l_reclaimable_space(log) > reclaimable,
				    log->io_list_lock);
	}

	next_checkpoint = log->next_checkpoint;
	next_cp_seq = log->next_cp_seq;
	spin_unlock_irq(&log->io_list_lock);

	BUG_ON(reclaimable < 0);
	if (reclaimable == 0)
		return;

	/*
	 * write_super will flush the cache of each raid disk. We must write
	 * the super here, because the log area might be reused soon and we
	 * don't want to confuse recovery.
	 */
	r5l_write_super_and_discard_space(log, next_checkpoint);

	mutex_lock(&log->io_mutex);
	log->last_checkpoint = next_checkpoint;
	log->last_cp_seq = next_cp_seq;
	mutex_unlock(&log->io_mutex);

	r5l_run_no_space_stripes(log);
}

static void r5l_reclaim_thread(struct md_thread *thread)
{
	struct mddev *mddev = thread->mddev;
	struct r5conf *conf = mddev->private;
	struct r5l_log *log = conf->log;

	if (!log)
		return;
	r5l_do_reclaim(log);
}

static void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
{
	unsigned long target;
	unsigned long new = (unsigned long)space; /* overflow in theory */

	do {
		target = log->reclaim_target;
		if (new < target)
			return;
	} while (cmpxchg(&log->reclaim_target, target, new) != target);
	md_wakeup_thread(log->reclaim_thread);
}
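
/*
 * Note: r5l_wake_reclaim() only ever raises the pending reclaim target (a
 * smaller request is ignored), and r5l_do_reclaim() consumes it with xchg().
 * r5l_quiesce() passes -1L so that everything reclaimable is reclaimed before
 * the reclaim thread is unregistered.
 */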

void r5l_quiesce(struct r5l_log *log, int state)
{
	struct mddev *mddev;
	if (!log || state == 2)
		return;
	if (state == 0) {
		/*
		 * This is a special case for hotadd. In suspend, the array has
		 * no journal. In resume, journal is initialized as well as the
		 * reclaim thread.
		 */
		if (log->reclaim_thread)
			return;
		log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
					log->rdev->mddev, "reclaim");
	} else if (state == 1) {
		/* make sure r5l_write_super_and_discard_space exits */
		mddev = log->rdev->mddev;
		wake_up(&mddev->sb_wait);
		r5l_wake_reclaim(log, -1L);
		md_unregister_thread(&log->reclaim_thread);
		r5l_do_reclaim(log);
	}
}

bool r5l_log_disk_error(struct r5conf *conf)
{
	struct r5l_log *log;
	bool ret;
	/* don't allow write if journal disk is missing */
	rcu_read_lock();
	log = rcu_dereference(conf->log);

	if (!log)
		ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
	else
		ret = test_bit(Faulty, &log->rdev->flags);
	rcu_read_unlock();
	return ret;
}

struct r5l_recovery_ctx {
	struct page *meta_page;		/* current meta */
	sector_t meta_total_blocks;	/* total size of current meta and data */
	sector_t pos;			/* recovery position */
	u64 seq;			/* recovery position seq */
};

static int r5l_read_meta_block(struct r5l_log *log,
			       struct r5l_recovery_ctx *ctx)
{
	struct page *page = ctx->meta_page;
	struct r5l_meta_block *mb;
	u32 crc, stored_crc;

	if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, REQ_OP_READ, 0,
			  false))
		return -EIO;

	mb = page_address(page);
	stored_crc = le32_to_cpu(mb->checksum);
	mb->checksum = 0;

	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
	    le64_to_cpu(mb->seq) != ctx->seq ||
	    mb->version != R5LOG_VERSION ||
	    le64_to_cpu(mb->position) != ctx->pos)
		return -EINVAL;

	crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
	if (stored_crc != crc)
		return -EINVAL;

	if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
		return -EINVAL;

	ctx->meta_total_blocks = BLOCK_SECTORS;

	return 0;
}

static int r5l_recovery_flush_one_stripe(struct r5l_log *log,
					 struct r5l_recovery_ctx *ctx,
					 sector_t stripe_sect,
					 int *offset, sector_t *log_offset)
{
	struct r5conf *conf = log->rdev->mddev->private;
	struct stripe_head *sh;
	struct r5l_payload_data_parity *payload;
	int disk_index;

	sh = raid5_get_active_stripe(conf, stripe_sect, 0, 0, 0);
	while (1) {
		payload = page_address(ctx->meta_page) + *offset;

		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
			raid5_compute_sector(conf,
					     le64_to_cpu(payload->location), 0,
					     &disk_index, sh);

			sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
				     sh->dev[disk_index].page, REQ_OP_READ, 0,
				     false);
			sh->dev[disk_index].log_checksum =
				le32_to_cpu(payload->checksum[0]);
			set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
			ctx->meta_total_blocks += BLOCK_SECTORS;
		} else {
			disk_index = sh->pd_idx;
			sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
				     sh->dev[disk_index].page, REQ_OP_READ, 0,
				     false);
			sh->dev[disk_index].log_checksum =
				le32_to_cpu(payload->checksum[0]);
			set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);

			if (sh->qd_idx >= 0) {
				disk_index = sh->qd_idx;
				sync_page_io(log->rdev,
					     r5l_ring_add(log, *log_offset, BLOCK_SECTORS),
					     PAGE_SIZE, sh->dev[disk_index].page,
					     REQ_OP_READ, 0, false);
				sh->dev[disk_index].log_checksum =
					le32_to_cpu(payload->checksum[1]);
				set_bit(R5_Wantwrite,
					&sh->dev[disk_index].flags);
			}
			ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
		}

		*log_offset = r5l_ring_add(log, *log_offset,
					   le32_to_cpu(payload->size));
		*offset += sizeof(struct r5l_payload_data_parity) +
			sizeof(__le32) *
			(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
			break;
	}

	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
		void *addr;
		u32 checksum;

		if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
			continue;
		addr = kmap_atomic(sh->dev[disk_index].page);
		checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
		kunmap_atomic(addr);
		if (checksum != sh->dev[disk_index].log_checksum)
			goto error;
	}

	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
		struct md_rdev *rdev, *rrdev;

		if (!test_and_clear_bit(R5_Wantwrite,
					&sh->dev[disk_index].flags))
			continue;

		/* in case device is broken */
		rdev = rcu_dereference(conf->disks[disk_index].rdev);
		if (rdev)
			sync_page_io(rdev, stripe_sect, PAGE_SIZE,
				     sh->dev[disk_index].page, REQ_OP_WRITE, 0,
				     false);
		rrdev = rcu_dereference(conf->disks[disk_index].replacement);
		if (rrdev)
			sync_page_io(rrdev, stripe_sect, PAGE_SIZE,
				     sh->dev[disk_index].page, REQ_OP_WRITE, 0,
				     false);
	}
	raid5_release_stripe(sh);
	return 0;

error:
	for (disk_index = 0; disk_index < sh->disks; disk_index++)
		sh->dev[disk_index].flags = 0;
	raid5_release_stripe(sh);
	return -EINVAL;
}

static int r5l_recovery_flush_one_meta(struct r5l_log *log,
				       struct r5l_recovery_ctx *ctx)
{
	struct r5conf *conf = log->rdev->mddev->private;
	struct r5l_payload_data_parity *payload;
	struct r5l_meta_block *mb;
	int offset;
	sector_t log_offset;
	sector_t stripe_sector;

	mb = page_address(ctx->meta_page);
	offset = sizeof(struct r5l_meta_block);
	log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);

	while (offset < le32_to_cpu(mb->meta_size)) {
		int dd;

		payload = (void *)mb + offset;
		stripe_sector = raid5_compute_sector(conf,
						     le64_to_cpu(payload->location), 0, &dd, NULL);
		if (r5l_recovery_flush_one_stripe(log, ctx, stripe_sector,
						  &offset, &log_offset))
			return -EINVAL;
	}
	return 0;
}

/* copy data/parity from log to raid disks */
static void r5l_recovery_flush_log(struct r5l_log *log,
				   struct r5l_recovery_ctx *ctx)
{
	while (1) {
		if (r5l_read_meta_block(log, ctx))
			return;
		if (r5l_recovery_flush_one_meta(log, ctx))
			return;
		ctx->seq++;
		ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
	}
}
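
/*
 * Recovery, in short: starting at last_checkpoint/last_cp_seq, keep reading
 * meta blocks and replaying their data/parity pages onto the raid disks until
 * a block fails the magic/seq/position/crc checks in r5l_read_meta_block();
 * that block marks the end of the valid log.
 */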

static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
					  u64 seq)
{
	struct page *page;
	struct r5l_meta_block *mb;
	u32 crc;

	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!page)
		return -ENOMEM;
	mb = page_address(page);
	mb->magic = cpu_to_le32(R5LOG_MAGIC);
	mb->version = R5LOG_VERSION;
	mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
	mb->seq = cpu_to_le64(seq);
	mb->position = cpu_to_le64(pos);
	crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
	mb->checksum = cpu_to_le32(crc);

	if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE,
			  WRITE_FUA, false)) {
		__free_page(page);
		return -EIO;
	}
	__free_page(page);
	return 0;
}

static int r5l_recovery_log(struct r5l_log *log)
{
	struct r5l_recovery_ctx ctx;

	ctx.pos = log->last_checkpoint;
	ctx.seq = log->last_cp_seq;
	ctx.meta_page = alloc_page(GFP_KERNEL);
	if (!ctx.meta_page)
		return -ENOMEM;

	r5l_recovery_flush_log(log, &ctx);
	__free_page(ctx.meta_page);

	/*
	 * We did a recovery. Now ctx.pos points to an invalid meta block. The
	 * new log will start here, but we can't let the superblock point to
	 * the last valid meta block. The log might look like:
	 * | meta 1| meta 2| meta 3|
	 * meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If the
	 * superblock points to meta 1 and we write a new valid meta 2n, and a
	 * crash happens again, the new recovery will start from meta 1. Since
	 * meta 2n is valid now, recovery will think meta 3 is valid, which is
	 * wrong. The solution is to create a new meta in meta2's place with
	 * its seq == meta 1's seq + 10 and let the superblock point to meta2.
	 * The same recovery will then not treat meta 3 as a valid meta,
	 * because its seq doesn't match.
	 */
	if (ctx.seq > log->last_cp_seq + 1) {
		int ret;

		ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 10);
		if (ret)
			return ret;
		log->seq = ctx.seq + 11;
		log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
		r5l_write_super(log, ctx.pos);
	} else {
		log->log_start = ctx.pos;
		log->seq = ctx.seq;
	}
	return 0;
}

static void r5l_write_super(struct r5l_log *log, sector_t cp)
{
	struct mddev *mddev = log->rdev->mddev;

	log->rdev->journal_tail = cp;
	set_bit(MD_CHANGE_DEVS, &mddev->flags);
}

static int r5l_load_log(struct r5l_log *log)
{
	struct md_rdev *rdev = log->rdev;
	struct page *page;
	struct r5l_meta_block *mb;
	sector_t cp = log->rdev->journal_tail;
	u32 stored_crc, expected_crc;
	bool create_super = false;
	int ret;

	/* Make sure it's valid */
	if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
		cp = 0;
	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	if (!sync_page_io(rdev, cp, PAGE_SIZE, page, REQ_OP_READ, 0, false)) {
		ret = -EIO;
		goto ioerr;
	}
	mb = page_address(page);

	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
	    mb->version != R5LOG_VERSION) {
		create_super = true;
		goto create;
	}
	stored_crc = le32_to_cpu(mb->checksum);
	mb->checksum = 0;
	expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
	if (stored_crc != expected_crc) {
		create_super = true;
		goto create;
	}
	if (le64_to_cpu(mb->position) != cp) {
		create_super = true;
		goto create;
	}
create:
	if (create_super) {
		log->last_cp_seq = prandom_u32();
		cp = 0;
		/*
		 * Make sure the super points to the correct address. The log
		 * might have data very soon. If the super doesn't have the
		 * correct log tail address, recovery can't find the log.
		 */
		r5l_write_super(log, cp);
	} else
		log->last_cp_seq = le64_to_cpu(mb->seq);

	log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
	log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
	if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
		log->max_free_space = RECLAIM_MAX_FREE_SPACE;
	log->last_checkpoint = cp;

	__free_page(page);

	return r5l_recovery_log(log);
ioerr:
	__free_page(page);
	return ret;
}

int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
{
	struct request_queue *q = bdev_get_queue(rdev->bdev);
	struct r5l_log *log;

	if (PAGE_SIZE != 4096)
		return -EINVAL;
	log = kzalloc(sizeof(*log), GFP_KERNEL);
	if (!log)
		return -ENOMEM;
	log->rdev = rdev;

	log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0;

	log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
				       sizeof(rdev->mddev->uuid));

	mutex_init(&log->io_mutex);

	spin_lock_init(&log->io_list_lock);
	INIT_LIST_HEAD(&log->running_ios);
	INIT_LIST_HEAD(&log->io_end_ios);
	INIT_LIST_HEAD(&log->flushing_ios);
	INIT_LIST_HEAD(&log->finished_ios);
	bio_init(&log->flush_bio);

	log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
	if (!log->io_kc)
		goto io_kc;

	log->io_pool = mempool_create_slab_pool(R5L_POOL_SIZE, log->io_kc);
	if (!log->io_pool)
		goto io_pool;

	log->bs = bioset_create(R5L_POOL_SIZE, 0);
	if (!log->bs)
		goto io_bs;

	log->meta_pool = mempool_create_page_pool(R5L_POOL_SIZE, 0);
	if (!log->meta_pool)
		goto out_mempool;

	log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
						 log->rdev->mddev, "reclaim");
	if (!log->reclaim_thread)
		goto reclaim_thread;
	init_waitqueue_head(&log->iounit_wait);

	INIT_LIST_HEAD(&log->no_mem_stripes);

	INIT_LIST_HEAD(&log->no_space_stripes);
	spin_lock_init(&log->no_space_stripes_lock);

	if (r5l_load_log(log))
		goto error;

	rcu_assign_pointer(conf->log, log);
	set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
	return 0;

error:
	md_unregister_thread(&log->reclaim_thread);
reclaim_thread:
	mempool_destroy(log->meta_pool);
out_mempool:
	bioset_free(log->bs);
io_bs:
	mempool_destroy(log->io_pool);
io_pool:
	kmem_cache_destroy(log->io_kc);
io_kc:
	kfree(log);
	return -EINVAL;
}

void r5l_exit_log(struct r5l_log *log)
{
	md_unregister_thread(&log->reclaim_thread);
	mempool_destroy(log->meta_pool);
	bioset_free(log->bs);
	mempool_destroy(log->io_pool);
	kmem_cache_destroy(log->io_kc);
	kfree(log);
}