1*f6bed0efSShaohua Li /* 2*f6bed0efSShaohua Li * Copyright (C) 2015 Shaohua Li <shli@fb.com> 3*f6bed0efSShaohua Li * 4*f6bed0efSShaohua Li * This program is free software; you can redistribute it and/or modify it 5*f6bed0efSShaohua Li * under the terms and conditions of the GNU General Public License, 6*f6bed0efSShaohua Li * version 2, as published by the Free Software Foundation. 7*f6bed0efSShaohua Li * 8*f6bed0efSShaohua Li * This program is distributed in the hope it will be useful, but WITHOUT 9*f6bed0efSShaohua Li * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 10*f6bed0efSShaohua Li * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 11*f6bed0efSShaohua Li * more details. 12*f6bed0efSShaohua Li * 13*f6bed0efSShaohua Li */ 14*f6bed0efSShaohua Li #include <linux/kernel.h> 15*f6bed0efSShaohua Li #include <linux/wait.h> 16*f6bed0efSShaohua Li #include <linux/blkdev.h> 17*f6bed0efSShaohua Li #include <linux/slab.h> 18*f6bed0efSShaohua Li #include <linux/raid/md_p.h> 19*f6bed0efSShaohua Li #include <linux/crc32.h> 20*f6bed0efSShaohua Li #include <linux/random.h> 21*f6bed0efSShaohua Li #include "md.h" 22*f6bed0efSShaohua Li #include "raid5.h" 23*f6bed0efSShaohua Li 24*f6bed0efSShaohua Li /* 25*f6bed0efSShaohua Li * metadata/data stored in disk with 4k size unit (a block) regardless 26*f6bed0efSShaohua Li * underneath hardware sector size. only works with PAGE_SIZE == 4096 27*f6bed0efSShaohua Li */ 28*f6bed0efSShaohua Li #define BLOCK_SECTORS (8) 29*f6bed0efSShaohua Li 30*f6bed0efSShaohua Li struct r5l_log { 31*f6bed0efSShaohua Li struct md_rdev *rdev; 32*f6bed0efSShaohua Li 33*f6bed0efSShaohua Li u32 uuid_checksum; 34*f6bed0efSShaohua Li 35*f6bed0efSShaohua Li sector_t device_size; /* log device size, round to 36*f6bed0efSShaohua Li * BLOCK_SECTORS */ 37*f6bed0efSShaohua Li 38*f6bed0efSShaohua Li sector_t last_checkpoint; /* log tail. where recovery scan 39*f6bed0efSShaohua Li * starts from */ 40*f6bed0efSShaohua Li u64 last_cp_seq; /* log tail sequence */ 41*f6bed0efSShaohua Li 42*f6bed0efSShaohua Li sector_t log_start; /* log head. where new data appends */ 43*f6bed0efSShaohua Li u64 seq; /* log head sequence */ 44*f6bed0efSShaohua Li 45*f6bed0efSShaohua Li struct mutex io_mutex; 46*f6bed0efSShaohua Li struct r5l_io_unit *current_io; /* current io_unit accepting new data */ 47*f6bed0efSShaohua Li 48*f6bed0efSShaohua Li spinlock_t io_list_lock; 49*f6bed0efSShaohua Li struct list_head running_ios; /* io_units which are still running, 50*f6bed0efSShaohua Li * and have not yet been completely 51*f6bed0efSShaohua Li * written to the log */ 52*f6bed0efSShaohua Li struct list_head io_end_ios; /* io_units which have been completely 53*f6bed0efSShaohua Li * written to the log but not yet written 54*f6bed0efSShaohua Li * to the RAID */ 55*f6bed0efSShaohua Li 56*f6bed0efSShaohua Li struct kmem_cache *io_kc; 57*f6bed0efSShaohua Li 58*f6bed0efSShaohua Li struct list_head no_space_stripes; /* pending stripes, log has no space */ 59*f6bed0efSShaohua Li spinlock_t no_space_stripes_lock; 60*f6bed0efSShaohua Li }; 61*f6bed0efSShaohua Li 62*f6bed0efSShaohua Li /* 63*f6bed0efSShaohua Li * an IO range starts from a meta data block and end at the next meta data 64*f6bed0efSShaohua Li * block. The io unit's the meta data block tracks data/parity followed it. io 65*f6bed0efSShaohua Li * unit is written to log disk with normal write, as we always flush log disk 66*f6bed0efSShaohua Li * first and then start move data to raid disks, there is no requirement to 67*f6bed0efSShaohua Li * write io unit with FLUSH/FUA 68*f6bed0efSShaohua Li */ 69*f6bed0efSShaohua Li struct r5l_io_unit { 70*f6bed0efSShaohua Li struct r5l_log *log; 71*f6bed0efSShaohua Li 72*f6bed0efSShaohua Li struct page *meta_page; /* store meta block */ 73*f6bed0efSShaohua Li int meta_offset; /* current offset in meta_page */ 74*f6bed0efSShaohua Li 75*f6bed0efSShaohua Li struct bio_list bios; 76*f6bed0efSShaohua Li atomic_t pending_io; /* pending bios not written to log yet */ 77*f6bed0efSShaohua Li struct bio *current_bio;/* current_bio accepting new data */ 78*f6bed0efSShaohua Li 79*f6bed0efSShaohua Li atomic_t pending_stripe;/* how many stripes not flushed to raid */ 80*f6bed0efSShaohua Li u64 seq; /* seq number of the metablock */ 81*f6bed0efSShaohua Li sector_t log_start; /* where the io_unit starts */ 82*f6bed0efSShaohua Li sector_t log_end; /* where the io_unit ends */ 83*f6bed0efSShaohua Li struct list_head log_sibling; /* log->running_ios */ 84*f6bed0efSShaohua Li struct list_head stripe_list; /* stripes added to the io_unit */ 85*f6bed0efSShaohua Li 86*f6bed0efSShaohua Li int state; 87*f6bed0efSShaohua Li wait_queue_head_t wait_state; 88*f6bed0efSShaohua Li }; 89*f6bed0efSShaohua Li 90*f6bed0efSShaohua Li /* r5l_io_unit state */ 91*f6bed0efSShaohua Li enum r5l_io_unit_state { 92*f6bed0efSShaohua Li IO_UNIT_RUNNING = 0, /* accepting new IO */ 93*f6bed0efSShaohua Li IO_UNIT_IO_START = 1, /* io_unit bio start writing to log, 94*f6bed0efSShaohua Li * don't accepting new bio */ 95*f6bed0efSShaohua Li IO_UNIT_IO_END = 2, /* io_unit bio finish writing to log */ 96*f6bed0efSShaohua Li IO_UNIT_STRIPE_START = 3, /* stripes of io_unit are flushing to raid */ 97*f6bed0efSShaohua Li IO_UNIT_STRIPE_END = 4, /* stripes data finished writing to raid */ 98*f6bed0efSShaohua Li }; 99*f6bed0efSShaohua Li 100*f6bed0efSShaohua Li static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc) 101*f6bed0efSShaohua Li { 102*f6bed0efSShaohua Li start += inc; 103*f6bed0efSShaohua Li if (start >= log->device_size) 104*f6bed0efSShaohua Li start = start - log->device_size; 105*f6bed0efSShaohua Li return start; 106*f6bed0efSShaohua Li } 107*f6bed0efSShaohua Li 108*f6bed0efSShaohua Li static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start, 109*f6bed0efSShaohua Li sector_t end) 110*f6bed0efSShaohua Li { 111*f6bed0efSShaohua Li if (end >= start) 112*f6bed0efSShaohua Li return end - start; 113*f6bed0efSShaohua Li else 114*f6bed0efSShaohua Li return end + log->device_size - start; 115*f6bed0efSShaohua Li } 116*f6bed0efSShaohua Li 117*f6bed0efSShaohua Li static bool r5l_has_free_space(struct r5l_log *log, sector_t size) 118*f6bed0efSShaohua Li { 119*f6bed0efSShaohua Li sector_t used_size; 120*f6bed0efSShaohua Li 121*f6bed0efSShaohua Li used_size = r5l_ring_distance(log, log->last_checkpoint, 122*f6bed0efSShaohua Li log->log_start); 123*f6bed0efSShaohua Li 124*f6bed0efSShaohua Li return log->device_size > used_size + size; 125*f6bed0efSShaohua Li } 126*f6bed0efSShaohua Li 127*f6bed0efSShaohua Li static struct r5l_io_unit *r5l_alloc_io_unit(struct r5l_log *log) 128*f6bed0efSShaohua Li { 129*f6bed0efSShaohua Li struct r5l_io_unit *io; 130*f6bed0efSShaohua Li /* We can't handle memory allocate failure so far */ 131*f6bed0efSShaohua Li gfp_t gfp = GFP_NOIO | __GFP_NOFAIL; 132*f6bed0efSShaohua Li 133*f6bed0efSShaohua Li io = kmem_cache_zalloc(log->io_kc, gfp); 134*f6bed0efSShaohua Li io->log = log; 135*f6bed0efSShaohua Li io->meta_page = alloc_page(gfp | __GFP_ZERO); 136*f6bed0efSShaohua Li 137*f6bed0efSShaohua Li bio_list_init(&io->bios); 138*f6bed0efSShaohua Li INIT_LIST_HEAD(&io->log_sibling); 139*f6bed0efSShaohua Li INIT_LIST_HEAD(&io->stripe_list); 140*f6bed0efSShaohua Li io->state = IO_UNIT_RUNNING; 141*f6bed0efSShaohua Li init_waitqueue_head(&io->wait_state); 142*f6bed0efSShaohua Li return io; 143*f6bed0efSShaohua Li } 144*f6bed0efSShaohua Li 145*f6bed0efSShaohua Li static void r5l_free_io_unit(struct r5l_log *log, struct r5l_io_unit *io) 146*f6bed0efSShaohua Li { 147*f6bed0efSShaohua Li __free_page(io->meta_page); 148*f6bed0efSShaohua Li kmem_cache_free(log->io_kc, io); 149*f6bed0efSShaohua Li } 150*f6bed0efSShaohua Li 151*f6bed0efSShaohua Li static void r5l_move_io_unit_list(struct list_head *from, struct list_head *to, 152*f6bed0efSShaohua Li enum r5l_io_unit_state state) 153*f6bed0efSShaohua Li { 154*f6bed0efSShaohua Li struct r5l_io_unit *io; 155*f6bed0efSShaohua Li 156*f6bed0efSShaohua Li while (!list_empty(from)) { 157*f6bed0efSShaohua Li io = list_first_entry(from, struct r5l_io_unit, log_sibling); 158*f6bed0efSShaohua Li /* don't change list order */ 159*f6bed0efSShaohua Li if (io->state >= state) 160*f6bed0efSShaohua Li list_move_tail(&io->log_sibling, to); 161*f6bed0efSShaohua Li else 162*f6bed0efSShaohua Li break; 163*f6bed0efSShaohua Li } 164*f6bed0efSShaohua Li } 165*f6bed0efSShaohua Li 166*f6bed0efSShaohua Li static void r5l_wake_reclaim(struct r5l_log *log, sector_t space); 167*f6bed0efSShaohua Li static void __r5l_set_io_unit_state(struct r5l_io_unit *io, 168*f6bed0efSShaohua Li enum r5l_io_unit_state state) 169*f6bed0efSShaohua Li { 170*f6bed0efSShaohua Li struct r5l_log *log = io->log; 171*f6bed0efSShaohua Li 172*f6bed0efSShaohua Li if (WARN_ON(io->state >= state)) 173*f6bed0efSShaohua Li return; 174*f6bed0efSShaohua Li io->state = state; 175*f6bed0efSShaohua Li if (state == IO_UNIT_IO_END) 176*f6bed0efSShaohua Li r5l_move_io_unit_list(&log->running_ios, &log->io_end_ios, 177*f6bed0efSShaohua Li IO_UNIT_IO_END); 178*f6bed0efSShaohua Li wake_up(&io->wait_state); 179*f6bed0efSShaohua Li } 180*f6bed0efSShaohua Li 181*f6bed0efSShaohua Li static void r5l_set_io_unit_state(struct r5l_io_unit *io, 182*f6bed0efSShaohua Li enum r5l_io_unit_state state) 183*f6bed0efSShaohua Li { 184*f6bed0efSShaohua Li struct r5l_log *log = io->log; 185*f6bed0efSShaohua Li unsigned long flags; 186*f6bed0efSShaohua Li 187*f6bed0efSShaohua Li spin_lock_irqsave(&log->io_list_lock, flags); 188*f6bed0efSShaohua Li __r5l_set_io_unit_state(io, state); 189*f6bed0efSShaohua Li spin_unlock_irqrestore(&log->io_list_lock, flags); 190*f6bed0efSShaohua Li } 191*f6bed0efSShaohua Li 192*f6bed0efSShaohua Li /* XXX: totally ignores I/O errors */ 193*f6bed0efSShaohua Li static void r5l_log_endio(struct bio *bio) 194*f6bed0efSShaohua Li { 195*f6bed0efSShaohua Li struct r5l_io_unit *io = bio->bi_private; 196*f6bed0efSShaohua Li struct r5l_log *log = io->log; 197*f6bed0efSShaohua Li 198*f6bed0efSShaohua Li bio_put(bio); 199*f6bed0efSShaohua Li 200*f6bed0efSShaohua Li if (!atomic_dec_and_test(&io->pending_io)) 201*f6bed0efSShaohua Li return; 202*f6bed0efSShaohua Li 203*f6bed0efSShaohua Li r5l_set_io_unit_state(io, IO_UNIT_IO_END); 204*f6bed0efSShaohua Li md_wakeup_thread(log->rdev->mddev->thread); 205*f6bed0efSShaohua Li } 206*f6bed0efSShaohua Li 207*f6bed0efSShaohua Li static void r5l_submit_current_io(struct r5l_log *log) 208*f6bed0efSShaohua Li { 209*f6bed0efSShaohua Li struct r5l_io_unit *io = log->current_io; 210*f6bed0efSShaohua Li struct r5l_meta_block *block; 211*f6bed0efSShaohua Li struct bio *bio; 212*f6bed0efSShaohua Li u32 crc; 213*f6bed0efSShaohua Li 214*f6bed0efSShaohua Li if (!io) 215*f6bed0efSShaohua Li return; 216*f6bed0efSShaohua Li 217*f6bed0efSShaohua Li block = page_address(io->meta_page); 218*f6bed0efSShaohua Li block->meta_size = cpu_to_le32(io->meta_offset); 219*f6bed0efSShaohua Li crc = crc32_le(log->uuid_checksum, (void *)block, PAGE_SIZE); 220*f6bed0efSShaohua Li block->checksum = cpu_to_le32(crc); 221*f6bed0efSShaohua Li 222*f6bed0efSShaohua Li log->current_io = NULL; 223*f6bed0efSShaohua Li r5l_set_io_unit_state(io, IO_UNIT_IO_START); 224*f6bed0efSShaohua Li 225*f6bed0efSShaohua Li while ((bio = bio_list_pop(&io->bios))) { 226*f6bed0efSShaohua Li /* all IO must start from rdev->data_offset */ 227*f6bed0efSShaohua Li bio->bi_iter.bi_sector += log->rdev->data_offset; 228*f6bed0efSShaohua Li submit_bio(WRITE, bio); 229*f6bed0efSShaohua Li } 230*f6bed0efSShaohua Li } 231*f6bed0efSShaohua Li 232*f6bed0efSShaohua Li static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log) 233*f6bed0efSShaohua Li { 234*f6bed0efSShaohua Li struct r5l_io_unit *io; 235*f6bed0efSShaohua Li struct r5l_meta_block *block; 236*f6bed0efSShaohua Li struct bio *bio; 237*f6bed0efSShaohua Li 238*f6bed0efSShaohua Li io = r5l_alloc_io_unit(log); 239*f6bed0efSShaohua Li 240*f6bed0efSShaohua Li block = page_address(io->meta_page); 241*f6bed0efSShaohua Li block->magic = cpu_to_le32(R5LOG_MAGIC); 242*f6bed0efSShaohua Li block->version = R5LOG_VERSION; 243*f6bed0efSShaohua Li block->seq = cpu_to_le64(log->seq); 244*f6bed0efSShaohua Li block->position = cpu_to_le64(log->log_start); 245*f6bed0efSShaohua Li 246*f6bed0efSShaohua Li io->log_start = log->log_start; 247*f6bed0efSShaohua Li io->meta_offset = sizeof(struct r5l_meta_block); 248*f6bed0efSShaohua Li io->seq = log->seq; 249*f6bed0efSShaohua Li 250*f6bed0efSShaohua Li bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES); 251*f6bed0efSShaohua Li io->current_bio = bio; 252*f6bed0efSShaohua Li bio->bi_rw = WRITE; 253*f6bed0efSShaohua Li bio->bi_bdev = log->rdev->bdev; 254*f6bed0efSShaohua Li bio->bi_iter.bi_sector = log->log_start; 255*f6bed0efSShaohua Li bio_add_page(bio, io->meta_page, PAGE_SIZE, 0); 256*f6bed0efSShaohua Li bio->bi_end_io = r5l_log_endio; 257*f6bed0efSShaohua Li bio->bi_private = io; 258*f6bed0efSShaohua Li 259*f6bed0efSShaohua Li bio_list_add(&io->bios, bio); 260*f6bed0efSShaohua Li atomic_inc(&io->pending_io); 261*f6bed0efSShaohua Li 262*f6bed0efSShaohua Li log->seq++; 263*f6bed0efSShaohua Li log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS); 264*f6bed0efSShaohua Li io->log_end = log->log_start; 265*f6bed0efSShaohua Li /* current bio hit disk end */ 266*f6bed0efSShaohua Li if (log->log_start == 0) 267*f6bed0efSShaohua Li io->current_bio = NULL; 268*f6bed0efSShaohua Li 269*f6bed0efSShaohua Li spin_lock_irq(&log->io_list_lock); 270*f6bed0efSShaohua Li list_add_tail(&io->log_sibling, &log->running_ios); 271*f6bed0efSShaohua Li spin_unlock_irq(&log->io_list_lock); 272*f6bed0efSShaohua Li 273*f6bed0efSShaohua Li return io; 274*f6bed0efSShaohua Li } 275*f6bed0efSShaohua Li 276*f6bed0efSShaohua Li static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size) 277*f6bed0efSShaohua Li { 278*f6bed0efSShaohua Li struct r5l_io_unit *io; 279*f6bed0efSShaohua Li 280*f6bed0efSShaohua Li io = log->current_io; 281*f6bed0efSShaohua Li if (io && io->meta_offset + payload_size > PAGE_SIZE) 282*f6bed0efSShaohua Li r5l_submit_current_io(log); 283*f6bed0efSShaohua Li io = log->current_io; 284*f6bed0efSShaohua Li if (io) 285*f6bed0efSShaohua Li return 0; 286*f6bed0efSShaohua Li 287*f6bed0efSShaohua Li log->current_io = r5l_new_meta(log); 288*f6bed0efSShaohua Li return 0; 289*f6bed0efSShaohua Li } 290*f6bed0efSShaohua Li 291*f6bed0efSShaohua Li static void r5l_append_payload_meta(struct r5l_log *log, u16 type, 292*f6bed0efSShaohua Li sector_t location, 293*f6bed0efSShaohua Li u32 checksum1, u32 checksum2, 294*f6bed0efSShaohua Li bool checksum2_valid) 295*f6bed0efSShaohua Li { 296*f6bed0efSShaohua Li struct r5l_io_unit *io = log->current_io; 297*f6bed0efSShaohua Li struct r5l_payload_data_parity *payload; 298*f6bed0efSShaohua Li 299*f6bed0efSShaohua Li payload = page_address(io->meta_page) + io->meta_offset; 300*f6bed0efSShaohua Li payload->header.type = cpu_to_le16(type); 301*f6bed0efSShaohua Li payload->header.flags = cpu_to_le16(0); 302*f6bed0efSShaohua Li payload->size = cpu_to_le32((1 + !!checksum2_valid) << 303*f6bed0efSShaohua Li (PAGE_SHIFT - 9)); 304*f6bed0efSShaohua Li payload->location = cpu_to_le64(location); 305*f6bed0efSShaohua Li payload->checksum[0] = cpu_to_le32(checksum1); 306*f6bed0efSShaohua Li if (checksum2_valid) 307*f6bed0efSShaohua Li payload->checksum[1] = cpu_to_le32(checksum2); 308*f6bed0efSShaohua Li 309*f6bed0efSShaohua Li io->meta_offset += sizeof(struct r5l_payload_data_parity) + 310*f6bed0efSShaohua Li sizeof(__le32) * (1 + !!checksum2_valid); 311*f6bed0efSShaohua Li } 312*f6bed0efSShaohua Li 313*f6bed0efSShaohua Li static void r5l_append_payload_page(struct r5l_log *log, struct page *page) 314*f6bed0efSShaohua Li { 315*f6bed0efSShaohua Li struct r5l_io_unit *io = log->current_io; 316*f6bed0efSShaohua Li 317*f6bed0efSShaohua Li alloc_bio: 318*f6bed0efSShaohua Li if (!io->current_bio) { 319*f6bed0efSShaohua Li struct bio *bio; 320*f6bed0efSShaohua Li 321*f6bed0efSShaohua Li bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES); 322*f6bed0efSShaohua Li bio->bi_rw = WRITE; 323*f6bed0efSShaohua Li bio->bi_bdev = log->rdev->bdev; 324*f6bed0efSShaohua Li bio->bi_iter.bi_sector = log->log_start; 325*f6bed0efSShaohua Li bio->bi_end_io = r5l_log_endio; 326*f6bed0efSShaohua Li bio->bi_private = io; 327*f6bed0efSShaohua Li bio_list_add(&io->bios, bio); 328*f6bed0efSShaohua Li atomic_inc(&io->pending_io); 329*f6bed0efSShaohua Li io->current_bio = bio; 330*f6bed0efSShaohua Li } 331*f6bed0efSShaohua Li if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0)) { 332*f6bed0efSShaohua Li io->current_bio = NULL; 333*f6bed0efSShaohua Li goto alloc_bio; 334*f6bed0efSShaohua Li } 335*f6bed0efSShaohua Li log->log_start = r5l_ring_add(log, log->log_start, 336*f6bed0efSShaohua Li BLOCK_SECTORS); 337*f6bed0efSShaohua Li /* current bio hit disk end */ 338*f6bed0efSShaohua Li if (log->log_start == 0) 339*f6bed0efSShaohua Li io->current_bio = NULL; 340*f6bed0efSShaohua Li 341*f6bed0efSShaohua Li io->log_end = log->log_start; 342*f6bed0efSShaohua Li } 343*f6bed0efSShaohua Li 344*f6bed0efSShaohua Li static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh, 345*f6bed0efSShaohua Li int data_pages, int parity_pages) 346*f6bed0efSShaohua Li { 347*f6bed0efSShaohua Li int i; 348*f6bed0efSShaohua Li int meta_size; 349*f6bed0efSShaohua Li struct r5l_io_unit *io; 350*f6bed0efSShaohua Li 351*f6bed0efSShaohua Li meta_size = 352*f6bed0efSShaohua Li ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) 353*f6bed0efSShaohua Li * data_pages) + 354*f6bed0efSShaohua Li sizeof(struct r5l_payload_data_parity) + 355*f6bed0efSShaohua Li sizeof(__le32) * parity_pages; 356*f6bed0efSShaohua Li 357*f6bed0efSShaohua Li r5l_get_meta(log, meta_size); 358*f6bed0efSShaohua Li io = log->current_io; 359*f6bed0efSShaohua Li 360*f6bed0efSShaohua Li for (i = 0; i < sh->disks; i++) { 361*f6bed0efSShaohua Li if (!test_bit(R5_Wantwrite, &sh->dev[i].flags)) 362*f6bed0efSShaohua Li continue; 363*f6bed0efSShaohua Li if (i == sh->pd_idx || i == sh->qd_idx) 364*f6bed0efSShaohua Li continue; 365*f6bed0efSShaohua Li r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA, 366*f6bed0efSShaohua Li raid5_compute_blocknr(sh, i, 0), 367*f6bed0efSShaohua Li sh->dev[i].log_checksum, 0, false); 368*f6bed0efSShaohua Li r5l_append_payload_page(log, sh->dev[i].page); 369*f6bed0efSShaohua Li } 370*f6bed0efSShaohua Li 371*f6bed0efSShaohua Li if (sh->qd_idx >= 0) { 372*f6bed0efSShaohua Li r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, 373*f6bed0efSShaohua Li sh->sector, sh->dev[sh->pd_idx].log_checksum, 374*f6bed0efSShaohua Li sh->dev[sh->qd_idx].log_checksum, true); 375*f6bed0efSShaohua Li r5l_append_payload_page(log, sh->dev[sh->pd_idx].page); 376*f6bed0efSShaohua Li r5l_append_payload_page(log, sh->dev[sh->qd_idx].page); 377*f6bed0efSShaohua Li } else { 378*f6bed0efSShaohua Li r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, 379*f6bed0efSShaohua Li sh->sector, sh->dev[sh->pd_idx].log_checksum, 380*f6bed0efSShaohua Li 0, false); 381*f6bed0efSShaohua Li r5l_append_payload_page(log, sh->dev[sh->pd_idx].page); 382*f6bed0efSShaohua Li } 383*f6bed0efSShaohua Li 384*f6bed0efSShaohua Li list_add_tail(&sh->log_list, &io->stripe_list); 385*f6bed0efSShaohua Li atomic_inc(&io->pending_stripe); 386*f6bed0efSShaohua Li sh->log_io = io; 387*f6bed0efSShaohua Li } 388*f6bed0efSShaohua Li 389*f6bed0efSShaohua Li /* 390*f6bed0efSShaohua Li * running in raid5d, where reclaim could wait for raid5d too (when it flushes 391*f6bed0efSShaohua Li * data from log to raid disks), so we shouldn't wait for reclaim here 392*f6bed0efSShaohua Li */ 393*f6bed0efSShaohua Li int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh) 394*f6bed0efSShaohua Li { 395*f6bed0efSShaohua Li int write_disks = 0; 396*f6bed0efSShaohua Li int data_pages, parity_pages; 397*f6bed0efSShaohua Li int meta_size; 398*f6bed0efSShaohua Li int reserve; 399*f6bed0efSShaohua Li int i; 400*f6bed0efSShaohua Li 401*f6bed0efSShaohua Li if (!log) 402*f6bed0efSShaohua Li return -EAGAIN; 403*f6bed0efSShaohua Li /* Don't support stripe batch */ 404*f6bed0efSShaohua Li if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) || 405*f6bed0efSShaohua Li test_bit(STRIPE_SYNCING, &sh->state)) { 406*f6bed0efSShaohua Li /* the stripe is written to log, we start writing it to raid */ 407*f6bed0efSShaohua Li clear_bit(STRIPE_LOG_TRAPPED, &sh->state); 408*f6bed0efSShaohua Li return -EAGAIN; 409*f6bed0efSShaohua Li } 410*f6bed0efSShaohua Li 411*f6bed0efSShaohua Li for (i = 0; i < sh->disks; i++) { 412*f6bed0efSShaohua Li void *addr; 413*f6bed0efSShaohua Li 414*f6bed0efSShaohua Li if (!test_bit(R5_Wantwrite, &sh->dev[i].flags)) 415*f6bed0efSShaohua Li continue; 416*f6bed0efSShaohua Li write_disks++; 417*f6bed0efSShaohua Li /* checksum is already calculated in last run */ 418*f6bed0efSShaohua Li if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) 419*f6bed0efSShaohua Li continue; 420*f6bed0efSShaohua Li addr = kmap_atomic(sh->dev[i].page); 421*f6bed0efSShaohua Li sh->dev[i].log_checksum = crc32_le(log->uuid_checksum, 422*f6bed0efSShaohua Li addr, PAGE_SIZE); 423*f6bed0efSShaohua Li kunmap_atomic(addr); 424*f6bed0efSShaohua Li } 425*f6bed0efSShaohua Li parity_pages = 1 + !!(sh->qd_idx >= 0); 426*f6bed0efSShaohua Li data_pages = write_disks - parity_pages; 427*f6bed0efSShaohua Li 428*f6bed0efSShaohua Li meta_size = 429*f6bed0efSShaohua Li ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) 430*f6bed0efSShaohua Li * data_pages) + 431*f6bed0efSShaohua Li sizeof(struct r5l_payload_data_parity) + 432*f6bed0efSShaohua Li sizeof(__le32) * parity_pages; 433*f6bed0efSShaohua Li /* Doesn't work with very big raid array */ 434*f6bed0efSShaohua Li if (meta_size + sizeof(struct r5l_meta_block) > PAGE_SIZE) 435*f6bed0efSShaohua Li return -EINVAL; 436*f6bed0efSShaohua Li 437*f6bed0efSShaohua Li set_bit(STRIPE_LOG_TRAPPED, &sh->state); 438*f6bed0efSShaohua Li atomic_inc(&sh->count); 439*f6bed0efSShaohua Li 440*f6bed0efSShaohua Li mutex_lock(&log->io_mutex); 441*f6bed0efSShaohua Li /* meta + data */ 442*f6bed0efSShaohua Li reserve = (1 + write_disks) << (PAGE_SHIFT - 9); 443*f6bed0efSShaohua Li if (r5l_has_free_space(log, reserve)) 444*f6bed0efSShaohua Li r5l_log_stripe(log, sh, data_pages, parity_pages); 445*f6bed0efSShaohua Li else { 446*f6bed0efSShaohua Li spin_lock(&log->no_space_stripes_lock); 447*f6bed0efSShaohua Li list_add_tail(&sh->log_list, &log->no_space_stripes); 448*f6bed0efSShaohua Li spin_unlock(&log->no_space_stripes_lock); 449*f6bed0efSShaohua Li 450*f6bed0efSShaohua Li r5l_wake_reclaim(log, reserve); 451*f6bed0efSShaohua Li } 452*f6bed0efSShaohua Li mutex_unlock(&log->io_mutex); 453*f6bed0efSShaohua Li 454*f6bed0efSShaohua Li return 0; 455*f6bed0efSShaohua Li } 456*f6bed0efSShaohua Li 457*f6bed0efSShaohua Li void r5l_write_stripe_run(struct r5l_log *log) 458*f6bed0efSShaohua Li { 459*f6bed0efSShaohua Li if (!log) 460*f6bed0efSShaohua Li return; 461*f6bed0efSShaohua Li mutex_lock(&log->io_mutex); 462*f6bed0efSShaohua Li r5l_submit_current_io(log); 463*f6bed0efSShaohua Li mutex_unlock(&log->io_mutex); 464*f6bed0efSShaohua Li } 465*f6bed0efSShaohua Li 466*f6bed0efSShaohua Li /* This will run after log space is reclaimed */ 467*f6bed0efSShaohua Li static void r5l_run_no_space_stripes(struct r5l_log *log) 468*f6bed0efSShaohua Li { 469*f6bed0efSShaohua Li struct stripe_head *sh; 470*f6bed0efSShaohua Li 471*f6bed0efSShaohua Li spin_lock(&log->no_space_stripes_lock); 472*f6bed0efSShaohua Li while (!list_empty(&log->no_space_stripes)) { 473*f6bed0efSShaohua Li sh = list_first_entry(&log->no_space_stripes, 474*f6bed0efSShaohua Li struct stripe_head, log_list); 475*f6bed0efSShaohua Li list_del_init(&sh->log_list); 476*f6bed0efSShaohua Li set_bit(STRIPE_HANDLE, &sh->state); 477*f6bed0efSShaohua Li raid5_release_stripe(sh); 478*f6bed0efSShaohua Li } 479*f6bed0efSShaohua Li spin_unlock(&log->no_space_stripes_lock); 480*f6bed0efSShaohua Li } 481*f6bed0efSShaohua Li 482*f6bed0efSShaohua Li static void r5l_wake_reclaim(struct r5l_log *log, sector_t space) 483*f6bed0efSShaohua Li { 484*f6bed0efSShaohua Li /* will implement later */ 485*f6bed0efSShaohua Li } 486*f6bed0efSShaohua Li 487*f6bed0efSShaohua Li static int r5l_recovery_log(struct r5l_log *log) 488*f6bed0efSShaohua Li { 489*f6bed0efSShaohua Li /* fake recovery */ 490*f6bed0efSShaohua Li log->seq = log->last_cp_seq + 1; 491*f6bed0efSShaohua Li log->log_start = r5l_ring_add(log, log->last_checkpoint, BLOCK_SECTORS); 492*f6bed0efSShaohua Li return 0; 493*f6bed0efSShaohua Li } 494*f6bed0efSShaohua Li 495*f6bed0efSShaohua Li static void r5l_write_super(struct r5l_log *log, sector_t cp) 496*f6bed0efSShaohua Li { 497*f6bed0efSShaohua Li struct mddev *mddev = log->rdev->mddev; 498*f6bed0efSShaohua Li 499*f6bed0efSShaohua Li log->rdev->journal_tail = cp; 500*f6bed0efSShaohua Li set_bit(MD_CHANGE_DEVS, &mddev->flags); 501*f6bed0efSShaohua Li } 502*f6bed0efSShaohua Li 503*f6bed0efSShaohua Li static int r5l_load_log(struct r5l_log *log) 504*f6bed0efSShaohua Li { 505*f6bed0efSShaohua Li struct md_rdev *rdev = log->rdev; 506*f6bed0efSShaohua Li struct page *page; 507*f6bed0efSShaohua Li struct r5l_meta_block *mb; 508*f6bed0efSShaohua Li sector_t cp = log->rdev->journal_tail; 509*f6bed0efSShaohua Li u32 stored_crc, expected_crc; 510*f6bed0efSShaohua Li bool create_super = false; 511*f6bed0efSShaohua Li int ret; 512*f6bed0efSShaohua Li 513*f6bed0efSShaohua Li /* Make sure it's valid */ 514*f6bed0efSShaohua Li if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp) 515*f6bed0efSShaohua Li cp = 0; 516*f6bed0efSShaohua Li page = alloc_page(GFP_KERNEL); 517*f6bed0efSShaohua Li if (!page) 518*f6bed0efSShaohua Li return -ENOMEM; 519*f6bed0efSShaohua Li 520*f6bed0efSShaohua Li if (!sync_page_io(rdev, cp, PAGE_SIZE, page, READ, false)) { 521*f6bed0efSShaohua Li ret = -EIO; 522*f6bed0efSShaohua Li goto ioerr; 523*f6bed0efSShaohua Li } 524*f6bed0efSShaohua Li mb = page_address(page); 525*f6bed0efSShaohua Li 526*f6bed0efSShaohua Li if (le32_to_cpu(mb->magic) != R5LOG_MAGIC || 527*f6bed0efSShaohua Li mb->version != R5LOG_VERSION) { 528*f6bed0efSShaohua Li create_super = true; 529*f6bed0efSShaohua Li goto create; 530*f6bed0efSShaohua Li } 531*f6bed0efSShaohua Li stored_crc = le32_to_cpu(mb->checksum); 532*f6bed0efSShaohua Li mb->checksum = 0; 533*f6bed0efSShaohua Li expected_crc = crc32_le(log->uuid_checksum, (void *)mb, PAGE_SIZE); 534*f6bed0efSShaohua Li if (stored_crc != expected_crc) { 535*f6bed0efSShaohua Li create_super = true; 536*f6bed0efSShaohua Li goto create; 537*f6bed0efSShaohua Li } 538*f6bed0efSShaohua Li if (le64_to_cpu(mb->position) != cp) { 539*f6bed0efSShaohua Li create_super = true; 540*f6bed0efSShaohua Li goto create; 541*f6bed0efSShaohua Li } 542*f6bed0efSShaohua Li create: 543*f6bed0efSShaohua Li if (create_super) { 544*f6bed0efSShaohua Li log->last_cp_seq = prandom_u32(); 545*f6bed0efSShaohua Li cp = 0; 546*f6bed0efSShaohua Li /* 547*f6bed0efSShaohua Li * Make sure super points to correct address. Log might have 548*f6bed0efSShaohua Li * data very soon. If super hasn't correct log tail address, 549*f6bed0efSShaohua Li * recovery can't find the log 550*f6bed0efSShaohua Li */ 551*f6bed0efSShaohua Li r5l_write_super(log, cp); 552*f6bed0efSShaohua Li } else 553*f6bed0efSShaohua Li log->last_cp_seq = le64_to_cpu(mb->seq); 554*f6bed0efSShaohua Li 555*f6bed0efSShaohua Li log->device_size = round_down(rdev->sectors, BLOCK_SECTORS); 556*f6bed0efSShaohua Li log->last_checkpoint = cp; 557*f6bed0efSShaohua Li 558*f6bed0efSShaohua Li __free_page(page); 559*f6bed0efSShaohua Li 560*f6bed0efSShaohua Li return r5l_recovery_log(log); 561*f6bed0efSShaohua Li ioerr: 562*f6bed0efSShaohua Li __free_page(page); 563*f6bed0efSShaohua Li return ret; 564*f6bed0efSShaohua Li } 565*f6bed0efSShaohua Li 566*f6bed0efSShaohua Li int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) 567*f6bed0efSShaohua Li { 568*f6bed0efSShaohua Li struct r5l_log *log; 569*f6bed0efSShaohua Li 570*f6bed0efSShaohua Li if (PAGE_SIZE != 4096) 571*f6bed0efSShaohua Li return -EINVAL; 572*f6bed0efSShaohua Li log = kzalloc(sizeof(*log), GFP_KERNEL); 573*f6bed0efSShaohua Li if (!log) 574*f6bed0efSShaohua Li return -ENOMEM; 575*f6bed0efSShaohua Li log->rdev = rdev; 576*f6bed0efSShaohua Li 577*f6bed0efSShaohua Li log->uuid_checksum = crc32_le(~0, (void *)rdev->mddev->uuid, 578*f6bed0efSShaohua Li sizeof(rdev->mddev->uuid)); 579*f6bed0efSShaohua Li 580*f6bed0efSShaohua Li mutex_init(&log->io_mutex); 581*f6bed0efSShaohua Li 582*f6bed0efSShaohua Li spin_lock_init(&log->io_list_lock); 583*f6bed0efSShaohua Li INIT_LIST_HEAD(&log->running_ios); 584*f6bed0efSShaohua Li 585*f6bed0efSShaohua Li log->io_kc = KMEM_CACHE(r5l_io_unit, 0); 586*f6bed0efSShaohua Li if (!log->io_kc) 587*f6bed0efSShaohua Li goto io_kc; 588*f6bed0efSShaohua Li 589*f6bed0efSShaohua Li INIT_LIST_HEAD(&log->no_space_stripes); 590*f6bed0efSShaohua Li spin_lock_init(&log->no_space_stripes_lock); 591*f6bed0efSShaohua Li 592*f6bed0efSShaohua Li if (r5l_load_log(log)) 593*f6bed0efSShaohua Li goto error; 594*f6bed0efSShaohua Li 595*f6bed0efSShaohua Li conf->log = log; 596*f6bed0efSShaohua Li return 0; 597*f6bed0efSShaohua Li error: 598*f6bed0efSShaohua Li kmem_cache_destroy(log->io_kc); 599*f6bed0efSShaohua Li io_kc: 600*f6bed0efSShaohua Li kfree(log); 601*f6bed0efSShaohua Li return -EINVAL; 602*f6bed0efSShaohua Li } 603*f6bed0efSShaohua Li 604*f6bed0efSShaohua Li void r5l_exit_log(struct r5l_log *log) 605*f6bed0efSShaohua Li { 606*f6bed0efSShaohua Li kmem_cache_destroy(log->io_kc); 607*f6bed0efSShaohua Li kfree(log); 608*f6bed0efSShaohua Li } 609