1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 drbd_actlog.c 4 5 This file is part of DRBD by Philipp Reisner and Lars Ellenberg. 6 7 Copyright (C) 2003-2008, LINBIT Information Technologies GmbH. 8 Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>. 9 Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. 10 11 12 */ 13 14 #include <linux/slab.h> 15 #include <linux/crc32c.h> 16 #include <linux/drbd.h> 17 #include <linux/drbd_limits.h> 18 #include "drbd_int.h" 19 20 21 enum al_transaction_types { 22 AL_TR_UPDATE = 0, 23 AL_TR_INITIALIZED = 0xffff 24 }; 25 /* all fields on disc in big endian */ 26 struct __packed al_transaction_on_disk { 27 /* don't we all like magic */ 28 __be32 magic; 29 30 /* to identify the most recent transaction block 31 * in the on disk ring buffer */ 32 __be32 tr_number; 33 34 /* checksum on the full 4k block, with this field set to 0. */ 35 __be32 crc32c; 36 37 /* type of transaction, special transaction types like: 38 * purge-all, set-all-idle, set-all-active, ... to-be-defined 39 * see also enum al_transaction_types */ 40 __be16 transaction_type; 41 42 /* we currently allow only a few thousand extents, 43 * so 16bit will be enough for the slot number. */ 44 45 /* how many updates in this transaction */ 46 __be16 n_updates; 47 48 /* maximum slot number, "al-extents" in drbd.conf speak. 49 * Having this in each transaction should make reconfiguration 50 * of that parameter easier. */ 51 __be16 context_size; 52 53 /* slot number the context starts with */ 54 __be16 context_start_slot_nr; 55 56 /* Some reserved bytes. Expected usage is a 64bit counter of 57 * sectors-written since device creation, and other data generation tag 58 * supporting usage */ 59 __be32 __reserved[4]; 60 61 /* --- 36 byte used --- */ 62 63 /* Reserve space for up to AL_UPDATES_PER_TRANSACTION changes 64 * in one transaction, then use the remaining byte in the 4k block for 65 * context information. "Flexible" number of updates per transaction 66 * does not help, as we have to account for the case when all update 67 * slots are used anyways, so it would only complicate code without 68 * additional benefit. 69 */ 70 __be16 update_slot_nr[AL_UPDATES_PER_TRANSACTION]; 71 72 /* but the extent number is 32bit, which at an extent size of 4 MiB 73 * allows to cover device sizes of up to 2**54 Byte (16 PiB) */ 74 __be32 update_extent_nr[AL_UPDATES_PER_TRANSACTION]; 75 76 /* --- 420 bytes used (36 + 64*6) --- */ 77 78 /* 4096 - 420 = 3676 = 919 * 4 */ 79 __be32 context[AL_CONTEXT_PER_TRANSACTION]; 80 }; 81 82 void *drbd_md_get_buffer(struct drbd_device *device, const char *intent) 83 { 84 int r; 85 86 wait_event(device->misc_wait, 87 (r = atomic_cmpxchg(&device->md_io.in_use, 0, 1)) == 0 || 88 device->state.disk <= D_FAILED); 89 90 if (r) 91 return NULL; 92 93 device->md_io.current_use = intent; 94 device->md_io.start_jif = jiffies; 95 device->md_io.submit_jif = device->md_io.start_jif - 1; 96 return page_address(device->md_io.page); 97 } 98 99 void drbd_md_put_buffer(struct drbd_device *device) 100 { 101 if (atomic_dec_and_test(&device->md_io.in_use)) 102 wake_up(&device->misc_wait); 103 } 104 105 void wait_until_done_or_force_detached(struct drbd_device *device, struct drbd_backing_dev *bdev, 106 unsigned int *done) 107 { 108 long dt; 109 110 rcu_read_lock(); 111 dt = rcu_dereference(bdev->disk_conf)->disk_timeout; 112 rcu_read_unlock(); 113 dt = dt * HZ / 10; 114 if (dt == 0) 115 dt = MAX_SCHEDULE_TIMEOUT; 116 117 dt = wait_event_timeout(device->misc_wait, 118 *done || test_bit(FORCE_DETACH, &device->flags), dt); 119 if (dt == 0) { 120 drbd_err(device, "meta-data IO operation timed out\n"); 121 drbd_chk_io_error(device, 1, DRBD_FORCE_DETACH); 122 } 123 } 124 125 static int _drbd_md_sync_page_io(struct drbd_device *device, 126 struct drbd_backing_dev *bdev, 127 sector_t sector, enum req_op op) 128 { 129 struct bio *bio; 130 /* we do all our meta data IO in aligned 4k blocks. */ 131 const int size = 4096; 132 int err; 133 blk_opf_t op_flags = 0; 134 135 device->md_io.done = 0; 136 device->md_io.error = -ENODEV; 137 138 if ((op == REQ_OP_WRITE) && !test_bit(MD_NO_FUA, &device->flags)) 139 op_flags |= REQ_FUA | REQ_PREFLUSH; 140 op_flags |= REQ_SYNC; 141 142 bio = bio_alloc_bioset(bdev->md_bdev, 1, op | op_flags, GFP_NOIO, 143 &drbd_md_io_bio_set); 144 bio->bi_iter.bi_sector = sector; 145 err = -EIO; 146 if (bio_add_page(bio, device->md_io.page, size, 0) != size) 147 goto out; 148 bio->bi_private = device; 149 bio->bi_end_io = drbd_md_endio; 150 151 if (op != REQ_OP_WRITE && device->state.disk == D_DISKLESS && device->ldev == NULL) 152 /* special case, drbd_md_read() during drbd_adm_attach(): no get_ldev */ 153 ; 154 else if (!get_ldev_if_state(device, D_ATTACHING)) { 155 /* Corresponding put_ldev in drbd_md_endio() */ 156 drbd_err(device, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n"); 157 err = -ENODEV; 158 goto out; 159 } 160 161 bio_get(bio); /* one bio_put() is in the completion handler */ 162 atomic_inc(&device->md_io.in_use); /* drbd_md_put_buffer() is in the completion handler */ 163 device->md_io.submit_jif = jiffies; 164 if (drbd_insert_fault(device, (op == REQ_OP_WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) 165 bio_io_error(bio); 166 else 167 submit_bio(bio); 168 wait_until_done_or_force_detached(device, bdev, &device->md_io.done); 169 if (!bio->bi_status) 170 err = device->md_io.error; 171 172 out: 173 bio_put(bio); 174 return err; 175 } 176 177 int drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bdev, 178 sector_t sector, enum req_op op) 179 { 180 int err; 181 D_ASSERT(device, atomic_read(&device->md_io.in_use) == 1); 182 183 BUG_ON(!bdev->md_bdev); 184 185 dynamic_drbd_dbg(device, "meta_data io: %s [%d]:%s(,%llus,%s) %pS\n", 186 current->comm, current->pid, __func__, 187 (unsigned long long)sector, (op == REQ_OP_WRITE) ? "WRITE" : "READ", 188 (void*)_RET_IP_ ); 189 190 if (sector < drbd_md_first_sector(bdev) || 191 sector + 7 > drbd_md_last_sector(bdev)) 192 drbd_alert(device, "%s [%d]:%s(,%llus,%s) out of range md access!\n", 193 current->comm, current->pid, __func__, 194 (unsigned long long)sector, 195 (op == REQ_OP_WRITE) ? "WRITE" : "READ"); 196 197 err = _drbd_md_sync_page_io(device, bdev, sector, op); 198 if (err) { 199 drbd_err(device, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n", 200 (unsigned long long)sector, 201 (op == REQ_OP_WRITE) ? "WRITE" : "READ", err); 202 } 203 return err; 204 } 205 206 static struct bm_extent *find_active_resync_extent(struct drbd_device *device, unsigned int enr) 207 { 208 struct lc_element *tmp; 209 tmp = lc_find(device->resync, enr/AL_EXT_PER_BM_SECT); 210 if (unlikely(tmp != NULL)) { 211 struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); 212 if (test_bit(BME_NO_WRITES, &bm_ext->flags)) 213 return bm_ext; 214 } 215 return NULL; 216 } 217 218 static struct lc_element *_al_get(struct drbd_device *device, unsigned int enr, bool nonblock) 219 { 220 struct lc_element *al_ext; 221 struct bm_extent *bm_ext; 222 int wake; 223 224 spin_lock_irq(&device->al_lock); 225 bm_ext = find_active_resync_extent(device, enr); 226 if (bm_ext) { 227 wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags); 228 spin_unlock_irq(&device->al_lock); 229 if (wake) 230 wake_up(&device->al_wait); 231 return NULL; 232 } 233 if (nonblock) 234 al_ext = lc_try_get(device->act_log, enr); 235 else 236 al_ext = lc_get(device->act_log, enr); 237 spin_unlock_irq(&device->al_lock); 238 return al_ext; 239 } 240 241 bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval *i) 242 { 243 /* for bios crossing activity log extent boundaries, 244 * we may need to activate two extents in one go */ 245 unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); 246 unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); 247 248 D_ASSERT(device, first <= last); 249 D_ASSERT(device, atomic_read(&device->local_cnt) > 0); 250 251 /* FIXME figure out a fast path for bios crossing AL extent boundaries */ 252 if (first != last) 253 return false; 254 255 return _al_get(device, first, true); 256 } 257 258 bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i) 259 { 260 /* for bios crossing activity log extent boundaries, 261 * we may need to activate two extents in one go */ 262 unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); 263 unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); 264 unsigned enr; 265 bool need_transaction = false; 266 267 D_ASSERT(device, first <= last); 268 D_ASSERT(device, atomic_read(&device->local_cnt) > 0); 269 270 for (enr = first; enr <= last; enr++) { 271 struct lc_element *al_ext; 272 wait_event(device->al_wait, 273 (al_ext = _al_get(device, enr, false)) != NULL); 274 if (al_ext->lc_number != enr) 275 need_transaction = true; 276 } 277 return need_transaction; 278 } 279 280 #if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT) 281 /* Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT 282 * are still coupled, or assume too much about their relation. 283 * Code below will not work if this is violated. 284 * Will be cleaned up with some followup patch. 285 */ 286 # error FIXME 287 #endif 288 289 static unsigned int al_extent_to_bm_page(unsigned int al_enr) 290 { 291 return al_enr >> 292 /* bit to page */ 293 ((PAGE_SHIFT + 3) - 294 /* al extent number to bit */ 295 (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)); 296 } 297 298 static sector_t al_tr_number_to_on_disk_sector(struct drbd_device *device) 299 { 300 const unsigned int stripes = device->ldev->md.al_stripes; 301 const unsigned int stripe_size_4kB = device->ldev->md.al_stripe_size_4k; 302 303 /* transaction number, modulo on-disk ring buffer wrap around */ 304 unsigned int t = device->al_tr_number % (device->ldev->md.al_size_4k); 305 306 /* ... to aligned 4k on disk block */ 307 t = ((t % stripes) * stripe_size_4kB) + t/stripes; 308 309 /* ... to 512 byte sector in activity log */ 310 t *= 8; 311 312 /* ... plus offset to the on disk position */ 313 return device->ldev->md.md_offset + device->ldev->md.al_offset + t; 314 } 315 316 static int __al_write_transaction(struct drbd_device *device, struct al_transaction_on_disk *buffer) 317 { 318 struct lc_element *e; 319 sector_t sector; 320 int i, mx; 321 unsigned extent_nr; 322 unsigned crc = 0; 323 int err = 0; 324 325 memset(buffer, 0, sizeof(*buffer)); 326 buffer->magic = cpu_to_be32(DRBD_AL_MAGIC); 327 buffer->tr_number = cpu_to_be32(device->al_tr_number); 328 329 i = 0; 330 331 drbd_bm_reset_al_hints(device); 332 333 /* Even though no one can start to change this list 334 * once we set the LC_LOCKED -- from drbd_al_begin_io(), 335 * lc_try_lock_for_transaction() --, someone may still 336 * be in the process of changing it. */ 337 spin_lock_irq(&device->al_lock); 338 list_for_each_entry(e, &device->act_log->to_be_changed, list) { 339 if (i == AL_UPDATES_PER_TRANSACTION) { 340 i++; 341 break; 342 } 343 buffer->update_slot_nr[i] = cpu_to_be16(e->lc_index); 344 buffer->update_extent_nr[i] = cpu_to_be32(e->lc_new_number); 345 if (e->lc_number != LC_FREE) 346 drbd_bm_mark_for_writeout(device, 347 al_extent_to_bm_page(e->lc_number)); 348 i++; 349 } 350 spin_unlock_irq(&device->al_lock); 351 BUG_ON(i > AL_UPDATES_PER_TRANSACTION); 352 353 buffer->n_updates = cpu_to_be16(i); 354 for ( ; i < AL_UPDATES_PER_TRANSACTION; i++) { 355 buffer->update_slot_nr[i] = cpu_to_be16(-1); 356 buffer->update_extent_nr[i] = cpu_to_be32(LC_FREE); 357 } 358 359 buffer->context_size = cpu_to_be16(device->act_log->nr_elements); 360 buffer->context_start_slot_nr = cpu_to_be16(device->al_tr_cycle); 361 362 mx = min_t(int, AL_CONTEXT_PER_TRANSACTION, 363 device->act_log->nr_elements - device->al_tr_cycle); 364 for (i = 0; i < mx; i++) { 365 unsigned idx = device->al_tr_cycle + i; 366 extent_nr = lc_element_by_index(device->act_log, idx)->lc_number; 367 buffer->context[i] = cpu_to_be32(extent_nr); 368 } 369 for (; i < AL_CONTEXT_PER_TRANSACTION; i++) 370 buffer->context[i] = cpu_to_be32(LC_FREE); 371 372 device->al_tr_cycle += AL_CONTEXT_PER_TRANSACTION; 373 if (device->al_tr_cycle >= device->act_log->nr_elements) 374 device->al_tr_cycle = 0; 375 376 sector = al_tr_number_to_on_disk_sector(device); 377 378 crc = crc32c(0, buffer, 4096); 379 buffer->crc32c = cpu_to_be32(crc); 380 381 if (drbd_bm_write_hinted(device)) 382 err = -EIO; 383 else { 384 bool write_al_updates; 385 rcu_read_lock(); 386 write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates; 387 rcu_read_unlock(); 388 if (write_al_updates) { 389 if (drbd_md_sync_page_io(device, device->ldev, sector, REQ_OP_WRITE)) { 390 err = -EIO; 391 drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR); 392 } else { 393 device->al_tr_number++; 394 device->al_writ_cnt++; 395 } 396 } 397 } 398 399 return err; 400 } 401 402 static int al_write_transaction(struct drbd_device *device) 403 { 404 struct al_transaction_on_disk *buffer; 405 int err; 406 407 if (!get_ldev(device)) { 408 drbd_err(device, "disk is %s, cannot start al transaction\n", 409 drbd_disk_str(device->state.disk)); 410 return -EIO; 411 } 412 413 /* The bitmap write may have failed, causing a state change. */ 414 if (device->state.disk < D_INCONSISTENT) { 415 drbd_err(device, 416 "disk is %s, cannot write al transaction\n", 417 drbd_disk_str(device->state.disk)); 418 put_ldev(device); 419 return -EIO; 420 } 421 422 /* protects md_io_buffer, al_tr_cycle, ... */ 423 buffer = drbd_md_get_buffer(device, __func__); 424 if (!buffer) { 425 drbd_err(device, "disk failed while waiting for md_io buffer\n"); 426 put_ldev(device); 427 return -ENODEV; 428 } 429 430 err = __al_write_transaction(device, buffer); 431 432 drbd_md_put_buffer(device); 433 put_ldev(device); 434 435 return err; 436 } 437 438 439 void drbd_al_begin_io_commit(struct drbd_device *device) 440 { 441 bool locked = false; 442 443 /* Serialize multiple transactions. 444 * This uses test_and_set_bit, memory barrier is implicit. 445 */ 446 wait_event(device->al_wait, 447 device->act_log->pending_changes == 0 || 448 (locked = lc_try_lock_for_transaction(device->act_log))); 449 450 if (locked) { 451 /* Double check: it may have been committed by someone else, 452 * while we have been waiting for the lock. */ 453 if (device->act_log->pending_changes) { 454 bool write_al_updates; 455 456 rcu_read_lock(); 457 write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates; 458 rcu_read_unlock(); 459 460 if (write_al_updates) 461 al_write_transaction(device); 462 spin_lock_irq(&device->al_lock); 463 /* FIXME 464 if (err) 465 we need an "lc_cancel" here; 466 */ 467 lc_committed(device->act_log); 468 spin_unlock_irq(&device->al_lock); 469 } 470 lc_unlock(device->act_log); 471 wake_up(&device->al_wait); 472 } 473 } 474 475 /* 476 * @delegate: delegate activity log I/O to the worker thread 477 */ 478 void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i) 479 { 480 if (drbd_al_begin_io_prepare(device, i)) 481 drbd_al_begin_io_commit(device); 482 } 483 484 int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *i) 485 { 486 struct lru_cache *al = device->act_log; 487 /* for bios crossing activity log extent boundaries, 488 * we may need to activate two extents in one go */ 489 unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); 490 unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); 491 unsigned nr_al_extents; 492 unsigned available_update_slots; 493 unsigned enr; 494 495 D_ASSERT(device, first <= last); 496 497 nr_al_extents = 1 + last - first; /* worst case: all touched extends are cold. */ 498 available_update_slots = min(al->nr_elements - al->used, 499 al->max_pending_changes - al->pending_changes); 500 501 /* We want all necessary updates for a given request within the same transaction 502 * We could first check how many updates are *actually* needed, 503 * and use that instead of the worst-case nr_al_extents */ 504 if (available_update_slots < nr_al_extents) { 505 /* Too many activity log extents are currently "hot". 506 * 507 * If we have accumulated pending changes already, 508 * we made progress. 509 * 510 * If we cannot get even a single pending change through, 511 * stop the fast path until we made some progress, 512 * or requests to "cold" extents could be starved. */ 513 if (!al->pending_changes) 514 __set_bit(__LC_STARVING, &device->act_log->flags); 515 return -ENOBUFS; 516 } 517 518 /* Is resync active in this area? */ 519 for (enr = first; enr <= last; enr++) { 520 struct lc_element *tmp; 521 tmp = lc_find(device->resync, enr/AL_EXT_PER_BM_SECT); 522 if (unlikely(tmp != NULL)) { 523 struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); 524 if (test_bit(BME_NO_WRITES, &bm_ext->flags)) { 525 if (!test_and_set_bit(BME_PRIORITY, &bm_ext->flags)) 526 return -EBUSY; 527 return -EWOULDBLOCK; 528 } 529 } 530 } 531 532 /* Checkout the refcounts. 533 * Given that we checked for available elements and update slots above, 534 * this has to be successful. */ 535 for (enr = first; enr <= last; enr++) { 536 struct lc_element *al_ext; 537 al_ext = lc_get_cumulative(device->act_log, enr); 538 if (!al_ext) 539 drbd_info(device, "LOGIC BUG for enr=%u\n", enr); 540 } 541 return 0; 542 } 543 544 void drbd_al_complete_io(struct drbd_device *device, struct drbd_interval *i) 545 { 546 /* for bios crossing activity log extent boundaries, 547 * we may need to activate two extents in one go */ 548 unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); 549 unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); 550 unsigned enr; 551 struct lc_element *extent; 552 unsigned long flags; 553 554 D_ASSERT(device, first <= last); 555 spin_lock_irqsave(&device->al_lock, flags); 556 557 for (enr = first; enr <= last; enr++) { 558 extent = lc_find(device->act_log, enr); 559 if (!extent) { 560 drbd_err(device, "al_complete_io() called on inactive extent %u\n", enr); 561 continue; 562 } 563 lc_put(device->act_log, extent); 564 } 565 spin_unlock_irqrestore(&device->al_lock, flags); 566 wake_up(&device->al_wait); 567 } 568 569 static int _try_lc_del(struct drbd_device *device, struct lc_element *al_ext) 570 { 571 int rv; 572 573 spin_lock_irq(&device->al_lock); 574 rv = (al_ext->refcnt == 0); 575 if (likely(rv)) 576 lc_del(device->act_log, al_ext); 577 spin_unlock_irq(&device->al_lock); 578 579 return rv; 580 } 581 582 /** 583 * drbd_al_shrink() - Removes all active extents form the activity log 584 * @device: DRBD device. 585 * 586 * Removes all active extents form the activity log, waiting until 587 * the reference count of each entry dropped to 0 first, of course. 588 * 589 * You need to lock device->act_log with lc_try_lock() / lc_unlock() 590 */ 591 void drbd_al_shrink(struct drbd_device *device) 592 { 593 struct lc_element *al_ext; 594 int i; 595 596 D_ASSERT(device, test_bit(__LC_LOCKED, &device->act_log->flags)); 597 598 for (i = 0; i < device->act_log->nr_elements; i++) { 599 al_ext = lc_element_by_index(device->act_log, i); 600 if (al_ext->lc_number == LC_FREE) 601 continue; 602 wait_event(device->al_wait, _try_lc_del(device, al_ext)); 603 } 604 605 wake_up(&device->al_wait); 606 } 607 608 int drbd_al_initialize(struct drbd_device *device, void *buffer) 609 { 610 struct al_transaction_on_disk *al = buffer; 611 struct drbd_md *md = &device->ldev->md; 612 int al_size_4k = md->al_stripes * md->al_stripe_size_4k; 613 int i; 614 615 __al_write_transaction(device, al); 616 /* There may or may not have been a pending transaction. */ 617 spin_lock_irq(&device->al_lock); 618 lc_committed(device->act_log); 619 spin_unlock_irq(&device->al_lock); 620 621 /* The rest of the transactions will have an empty "updates" list, and 622 * are written out only to provide the context, and to initialize the 623 * on-disk ring buffer. */ 624 for (i = 1; i < al_size_4k; i++) { 625 int err = __al_write_transaction(device, al); 626 if (err) 627 return err; 628 } 629 return 0; 630 } 631 632 static const char *drbd_change_sync_fname[] = { 633 [RECORD_RS_FAILED] = "drbd_rs_failed_io", 634 [SET_IN_SYNC] = "drbd_set_in_sync", 635 [SET_OUT_OF_SYNC] = "drbd_set_out_of_sync" 636 }; 637 638 /* ATTENTION. The AL's extents are 4MB each, while the extents in the 639 * resync LRU-cache are 16MB each. 640 * The caller of this function has to hold an get_ldev() reference. 641 * 642 * Adjusts the caching members ->rs_left (success) or ->rs_failed (!success), 643 * potentially pulling in (and recounting the corresponding bits) 644 * this resync extent into the resync extent lru cache. 645 * 646 * Returns whether all bits have been cleared for this resync extent, 647 * precisely: (rs_left <= rs_failed) 648 * 649 * TODO will be obsoleted once we have a caching lru of the on disk bitmap 650 */ 651 static bool update_rs_extent(struct drbd_device *device, 652 unsigned int enr, int count, 653 enum update_sync_bits_mode mode) 654 { 655 struct lc_element *e; 656 657 D_ASSERT(device, atomic_read(&device->local_cnt)); 658 659 /* When setting out-of-sync bits, 660 * we don't need it cached (lc_find). 661 * But if it is present in the cache, 662 * we should update the cached bit count. 663 * Otherwise, that extent should be in the resync extent lru cache 664 * already -- or we want to pull it in if necessary -- (lc_get), 665 * then update and check rs_left and rs_failed. */ 666 if (mode == SET_OUT_OF_SYNC) 667 e = lc_find(device->resync, enr); 668 else 669 e = lc_get(device->resync, enr); 670 if (e) { 671 struct bm_extent *ext = lc_entry(e, struct bm_extent, lce); 672 if (ext->lce.lc_number == enr) { 673 if (mode == SET_IN_SYNC) 674 ext->rs_left -= count; 675 else if (mode == SET_OUT_OF_SYNC) 676 ext->rs_left += count; 677 else 678 ext->rs_failed += count; 679 if (ext->rs_left < ext->rs_failed) { 680 drbd_warn(device, "BAD! enr=%u rs_left=%d " 681 "rs_failed=%d count=%d cstate=%s\n", 682 ext->lce.lc_number, ext->rs_left, 683 ext->rs_failed, count, 684 drbd_conn_str(device->state.conn)); 685 686 /* We don't expect to be able to clear more bits 687 * than have been set when we originally counted 688 * the set bits to cache that value in ext->rs_left. 689 * Whatever the reason (disconnect during resync, 690 * delayed local completion of an application write), 691 * try to fix it up by recounting here. */ 692 ext->rs_left = drbd_bm_e_weight(device, enr); 693 } 694 } else { 695 /* Normally this element should be in the cache, 696 * since drbd_rs_begin_io() pulled it already in. 697 * 698 * But maybe an application write finished, and we set 699 * something outside the resync lru_cache in sync. 700 */ 701 int rs_left = drbd_bm_e_weight(device, enr); 702 if (ext->flags != 0) { 703 drbd_warn(device, "changing resync lce: %d[%u;%02lx]" 704 " -> %d[%u;00]\n", 705 ext->lce.lc_number, ext->rs_left, 706 ext->flags, enr, rs_left); 707 ext->flags = 0; 708 } 709 if (ext->rs_failed) { 710 drbd_warn(device, "Kicking resync_lru element enr=%u " 711 "out with rs_failed=%d\n", 712 ext->lce.lc_number, ext->rs_failed); 713 } 714 ext->rs_left = rs_left; 715 ext->rs_failed = (mode == RECORD_RS_FAILED) ? count : 0; 716 /* we don't keep a persistent log of the resync lru, 717 * we can commit any change right away. */ 718 lc_committed(device->resync); 719 } 720 if (mode != SET_OUT_OF_SYNC) 721 lc_put(device->resync, &ext->lce); 722 /* no race, we are within the al_lock! */ 723 724 if (ext->rs_left <= ext->rs_failed) { 725 ext->rs_failed = 0; 726 return true; 727 } 728 } else if (mode != SET_OUT_OF_SYNC) { 729 /* be quiet if lc_find() did not find it. */ 730 drbd_err(device, "lc_get() failed! locked=%d/%d flags=%lu\n", 731 device->resync_locked, 732 device->resync->nr_elements, 733 device->resync->flags); 734 } 735 return false; 736 } 737 738 void drbd_advance_rs_marks(struct drbd_peer_device *peer_device, unsigned long still_to_go) 739 { 740 struct drbd_device *device = peer_device->device; 741 unsigned long now = jiffies; 742 unsigned long last = device->rs_mark_time[device->rs_last_mark]; 743 int next = (device->rs_last_mark + 1) % DRBD_SYNC_MARKS; 744 if (time_after_eq(now, last + DRBD_SYNC_MARK_STEP)) { 745 if (device->rs_mark_left[device->rs_last_mark] != still_to_go && 746 device->state.conn != C_PAUSED_SYNC_T && 747 device->state.conn != C_PAUSED_SYNC_S) { 748 device->rs_mark_time[next] = now; 749 device->rs_mark_left[next] = still_to_go; 750 device->rs_last_mark = next; 751 } 752 } 753 } 754 755 /* It is called lazy update, so don't do write-out too often. */ 756 static bool lazy_bitmap_update_due(struct drbd_device *device) 757 { 758 return time_after(jiffies, device->rs_last_bcast + 2*HZ); 759 } 760 761 static void maybe_schedule_on_disk_bitmap_update(struct drbd_device *device, bool rs_done) 762 { 763 if (rs_done) { 764 struct drbd_connection *connection = first_peer_device(device)->connection; 765 if (connection->agreed_pro_version <= 95 || 766 is_sync_target_state(device->state.conn)) 767 set_bit(RS_DONE, &device->flags); 768 /* and also set RS_PROGRESS below */ 769 770 /* Else: rather wait for explicit notification via receive_state, 771 * to avoid uuids-rotated-too-fast causing full resync 772 * in next handshake, in case the replication link breaks 773 * at the most unfortunate time... */ 774 } else if (!lazy_bitmap_update_due(device)) 775 return; 776 777 drbd_device_post_work(device, RS_PROGRESS); 778 } 779 780 static int update_sync_bits(struct drbd_device *device, 781 unsigned long sbnr, unsigned long ebnr, 782 enum update_sync_bits_mode mode) 783 { 784 /* 785 * We keep a count of set bits per resync-extent in the ->rs_left 786 * caching member, so we need to loop and work within the resync extent 787 * alignment. Typically this loop will execute exactly once. 788 */ 789 unsigned long flags; 790 unsigned long count = 0; 791 unsigned int cleared = 0; 792 while (sbnr <= ebnr) { 793 /* set temporary boundary bit number to last bit number within 794 * the resync extent of the current start bit number, 795 * but cap at provided end bit number */ 796 unsigned long tbnr = min(ebnr, sbnr | BM_BLOCKS_PER_BM_EXT_MASK); 797 unsigned long c; 798 799 if (mode == RECORD_RS_FAILED) 800 /* Only called from drbd_rs_failed_io(), bits 801 * supposedly still set. Recount, maybe some 802 * of the bits have been successfully cleared 803 * by application IO meanwhile. 804 */ 805 c = drbd_bm_count_bits(device, sbnr, tbnr); 806 else if (mode == SET_IN_SYNC) 807 c = drbd_bm_clear_bits(device, sbnr, tbnr); 808 else /* if (mode == SET_OUT_OF_SYNC) */ 809 c = drbd_bm_set_bits(device, sbnr, tbnr); 810 811 if (c) { 812 spin_lock_irqsave(&device->al_lock, flags); 813 cleared += update_rs_extent(device, BM_BIT_TO_EXT(sbnr), c, mode); 814 spin_unlock_irqrestore(&device->al_lock, flags); 815 count += c; 816 } 817 sbnr = tbnr + 1; 818 } 819 if (count) { 820 if (mode == SET_IN_SYNC) { 821 unsigned long still_to_go = drbd_bm_total_weight(device); 822 bool rs_is_done = (still_to_go <= device->rs_failed); 823 drbd_advance_rs_marks(first_peer_device(device), still_to_go); 824 if (cleared || rs_is_done) 825 maybe_schedule_on_disk_bitmap_update(device, rs_is_done); 826 } else if (mode == RECORD_RS_FAILED) 827 device->rs_failed += count; 828 wake_up(&device->al_wait); 829 } 830 return count; 831 } 832 833 static bool plausible_request_size(int size) 834 { 835 return size > 0 836 && size <= DRBD_MAX_BATCH_BIO_SIZE 837 && IS_ALIGNED(size, 512); 838 } 839 840 /* clear the bit corresponding to the piece of storage in question: 841 * size byte of data starting from sector. Only clear bits of the affected 842 * one or more _aligned_ BM_BLOCK_SIZE blocks. 843 * 844 * called by worker on C_SYNC_TARGET and receiver on SyncSource. 845 * 846 */ 847 int __drbd_change_sync(struct drbd_peer_device *peer_device, sector_t sector, int size, 848 enum update_sync_bits_mode mode) 849 { 850 /* Is called from worker and receiver context _only_ */ 851 struct drbd_device *device = peer_device->device; 852 unsigned long sbnr, ebnr, lbnr; 853 unsigned long count = 0; 854 sector_t esector, nr_sectors; 855 856 /* This would be an empty REQ_PREFLUSH, be silent. */ 857 if ((mode == SET_OUT_OF_SYNC) && size == 0) 858 return 0; 859 860 if (!plausible_request_size(size)) { 861 drbd_err(device, "%s: sector=%llus size=%d nonsense!\n", 862 drbd_change_sync_fname[mode], 863 (unsigned long long)sector, size); 864 return 0; 865 } 866 867 if (!get_ldev(device)) 868 return 0; /* no disk, no metadata, no bitmap to manipulate bits in */ 869 870 nr_sectors = get_capacity(device->vdisk); 871 esector = sector + (size >> 9) - 1; 872 873 if (!expect(device, sector < nr_sectors)) 874 goto out; 875 if (!expect(device, esector < nr_sectors)) 876 esector = nr_sectors - 1; 877 878 lbnr = BM_SECT_TO_BIT(nr_sectors-1); 879 880 if (mode == SET_IN_SYNC) { 881 /* Round up start sector, round down end sector. We make sure 882 * we only clear full, aligned, BM_BLOCK_SIZE blocks. */ 883 if (unlikely(esector < BM_SECT_PER_BIT-1)) 884 goto out; 885 if (unlikely(esector == (nr_sectors-1))) 886 ebnr = lbnr; 887 else 888 ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1)); 889 sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1); 890 } else { 891 /* We set it out of sync, or record resync failure. 892 * Should not round anything here. */ 893 sbnr = BM_SECT_TO_BIT(sector); 894 ebnr = BM_SECT_TO_BIT(esector); 895 } 896 897 count = update_sync_bits(device, sbnr, ebnr, mode); 898 out: 899 put_ldev(device); 900 return count; 901 } 902 903 static 904 struct bm_extent *_bme_get(struct drbd_device *device, unsigned int enr) 905 { 906 struct lc_element *e; 907 struct bm_extent *bm_ext; 908 int wakeup = 0; 909 unsigned long rs_flags; 910 911 spin_lock_irq(&device->al_lock); 912 if (device->resync_locked > device->resync->nr_elements/2) { 913 spin_unlock_irq(&device->al_lock); 914 return NULL; 915 } 916 e = lc_get(device->resync, enr); 917 bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; 918 if (bm_ext) { 919 if (bm_ext->lce.lc_number != enr) { 920 bm_ext->rs_left = drbd_bm_e_weight(device, enr); 921 bm_ext->rs_failed = 0; 922 lc_committed(device->resync); 923 wakeup = 1; 924 } 925 if (bm_ext->lce.refcnt == 1) 926 device->resync_locked++; 927 set_bit(BME_NO_WRITES, &bm_ext->flags); 928 } 929 rs_flags = device->resync->flags; 930 spin_unlock_irq(&device->al_lock); 931 if (wakeup) 932 wake_up(&device->al_wait); 933 934 if (!bm_ext) { 935 if (rs_flags & LC_STARVING) 936 drbd_warn(device, "Have to wait for element" 937 " (resync LRU too small?)\n"); 938 BUG_ON(rs_flags & LC_LOCKED); 939 } 940 941 return bm_ext; 942 } 943 944 static int _is_in_al(struct drbd_device *device, unsigned int enr) 945 { 946 int rv; 947 948 spin_lock_irq(&device->al_lock); 949 rv = lc_is_used(device->act_log, enr); 950 spin_unlock_irq(&device->al_lock); 951 952 return rv; 953 } 954 955 /** 956 * drbd_rs_begin_io() - Gets an extent in the resync LRU cache and sets it to BME_LOCKED 957 * @device: DRBD device. 958 * @sector: The sector number. 959 * 960 * This functions sleeps on al_wait. 961 * 962 * Returns: %0 on success, -EINTR if interrupted. 963 */ 964 int drbd_rs_begin_io(struct drbd_device *device, sector_t sector) 965 { 966 unsigned int enr = BM_SECT_TO_EXT(sector); 967 struct bm_extent *bm_ext; 968 int i, sig; 969 bool sa; 970 971 retry: 972 sig = wait_event_interruptible(device->al_wait, 973 (bm_ext = _bme_get(device, enr))); 974 if (sig) 975 return -EINTR; 976 977 if (test_bit(BME_LOCKED, &bm_ext->flags)) 978 return 0; 979 980 /* step aside only while we are above c-min-rate; unless disabled. */ 981 sa = drbd_rs_c_min_rate_throttle(device); 982 983 for (i = 0; i < AL_EXT_PER_BM_SECT; i++) { 984 sig = wait_event_interruptible(device->al_wait, 985 !_is_in_al(device, enr * AL_EXT_PER_BM_SECT + i) || 986 (sa && test_bit(BME_PRIORITY, &bm_ext->flags))); 987 988 if (sig || (sa && test_bit(BME_PRIORITY, &bm_ext->flags))) { 989 spin_lock_irq(&device->al_lock); 990 if (lc_put(device->resync, &bm_ext->lce) == 0) { 991 bm_ext->flags = 0; /* clears BME_NO_WRITES and eventually BME_PRIORITY */ 992 device->resync_locked--; 993 wake_up(&device->al_wait); 994 } 995 spin_unlock_irq(&device->al_lock); 996 if (sig) 997 return -EINTR; 998 if (schedule_timeout_interruptible(HZ/10)) 999 return -EINTR; 1000 goto retry; 1001 } 1002 } 1003 set_bit(BME_LOCKED, &bm_ext->flags); 1004 return 0; 1005 } 1006 1007 /** 1008 * drbd_try_rs_begin_io() - Gets an extent in the resync LRU cache, does not sleep 1009 * @peer_device: DRBD device. 1010 * @sector: The sector number. 1011 * 1012 * Gets an extent in the resync LRU cache, sets it to BME_NO_WRITES, then 1013 * tries to set it to BME_LOCKED. 1014 * 1015 * Returns: %0 upon success, and -EAGAIN 1016 * if there is still application IO going on in this area. 1017 */ 1018 int drbd_try_rs_begin_io(struct drbd_peer_device *peer_device, sector_t sector) 1019 { 1020 struct drbd_device *device = peer_device->device; 1021 unsigned int enr = BM_SECT_TO_EXT(sector); 1022 const unsigned int al_enr = enr*AL_EXT_PER_BM_SECT; 1023 struct lc_element *e; 1024 struct bm_extent *bm_ext; 1025 int i; 1026 bool throttle = drbd_rs_should_slow_down(peer_device, sector, true); 1027 1028 /* If we need to throttle, a half-locked (only marked BME_NO_WRITES, 1029 * not yet BME_LOCKED) extent needs to be kicked out explicitly if we 1030 * need to throttle. There is at most one such half-locked extent, 1031 * which is remembered in resync_wenr. */ 1032 1033 if (throttle && device->resync_wenr != enr) 1034 return -EAGAIN; 1035 1036 spin_lock_irq(&device->al_lock); 1037 if (device->resync_wenr != LC_FREE && device->resync_wenr != enr) { 1038 /* in case you have very heavy scattered io, it may 1039 * stall the syncer undefined if we give up the ref count 1040 * when we try again and requeue. 1041 * 1042 * if we don't give up the refcount, but the next time 1043 * we are scheduled this extent has been "synced" by new 1044 * application writes, we'd miss the lc_put on the 1045 * extent we keep the refcount on. 1046 * so we remembered which extent we had to try again, and 1047 * if the next requested one is something else, we do 1048 * the lc_put here... 1049 * we also have to wake_up 1050 */ 1051 e = lc_find(device->resync, device->resync_wenr); 1052 bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; 1053 if (bm_ext) { 1054 D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags)); 1055 D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags)); 1056 clear_bit(BME_NO_WRITES, &bm_ext->flags); 1057 device->resync_wenr = LC_FREE; 1058 if (lc_put(device->resync, &bm_ext->lce) == 0) { 1059 bm_ext->flags = 0; 1060 device->resync_locked--; 1061 } 1062 wake_up(&device->al_wait); 1063 } else { 1064 drbd_alert(device, "LOGIC BUG\n"); 1065 } 1066 } 1067 /* TRY. */ 1068 e = lc_try_get(device->resync, enr); 1069 bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; 1070 if (bm_ext) { 1071 if (test_bit(BME_LOCKED, &bm_ext->flags)) 1072 goto proceed; 1073 if (!test_and_set_bit(BME_NO_WRITES, &bm_ext->flags)) { 1074 device->resync_locked++; 1075 } else { 1076 /* we did set the BME_NO_WRITES, 1077 * but then could not set BME_LOCKED, 1078 * so we tried again. 1079 * drop the extra reference. */ 1080 bm_ext->lce.refcnt--; 1081 D_ASSERT(device, bm_ext->lce.refcnt > 0); 1082 } 1083 goto check_al; 1084 } else { 1085 /* do we rather want to try later? */ 1086 if (device->resync_locked > device->resync->nr_elements-3) 1087 goto try_again; 1088 /* Do or do not. There is no try. -- Yoda */ 1089 e = lc_get(device->resync, enr); 1090 bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; 1091 if (!bm_ext) { 1092 const unsigned long rs_flags = device->resync->flags; 1093 if (rs_flags & LC_STARVING) 1094 drbd_warn(device, "Have to wait for element" 1095 " (resync LRU too small?)\n"); 1096 BUG_ON(rs_flags & LC_LOCKED); 1097 goto try_again; 1098 } 1099 if (bm_ext->lce.lc_number != enr) { 1100 bm_ext->rs_left = drbd_bm_e_weight(device, enr); 1101 bm_ext->rs_failed = 0; 1102 lc_committed(device->resync); 1103 wake_up(&device->al_wait); 1104 D_ASSERT(device, test_bit(BME_LOCKED, &bm_ext->flags) == 0); 1105 } 1106 set_bit(BME_NO_WRITES, &bm_ext->flags); 1107 D_ASSERT(device, bm_ext->lce.refcnt == 1); 1108 device->resync_locked++; 1109 goto check_al; 1110 } 1111 check_al: 1112 for (i = 0; i < AL_EXT_PER_BM_SECT; i++) { 1113 if (lc_is_used(device->act_log, al_enr+i)) 1114 goto try_again; 1115 } 1116 set_bit(BME_LOCKED, &bm_ext->flags); 1117 proceed: 1118 device->resync_wenr = LC_FREE; 1119 spin_unlock_irq(&device->al_lock); 1120 return 0; 1121 1122 try_again: 1123 if (bm_ext) { 1124 if (throttle) { 1125 D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags)); 1126 D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags)); 1127 clear_bit(BME_NO_WRITES, &bm_ext->flags); 1128 device->resync_wenr = LC_FREE; 1129 if (lc_put(device->resync, &bm_ext->lce) == 0) { 1130 bm_ext->flags = 0; 1131 device->resync_locked--; 1132 } 1133 wake_up(&device->al_wait); 1134 } else 1135 device->resync_wenr = enr; 1136 } 1137 spin_unlock_irq(&device->al_lock); 1138 return -EAGAIN; 1139 } 1140 1141 void drbd_rs_complete_io(struct drbd_device *device, sector_t sector) 1142 { 1143 unsigned int enr = BM_SECT_TO_EXT(sector); 1144 struct lc_element *e; 1145 struct bm_extent *bm_ext; 1146 unsigned long flags; 1147 1148 spin_lock_irqsave(&device->al_lock, flags); 1149 e = lc_find(device->resync, enr); 1150 bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; 1151 if (!bm_ext) { 1152 spin_unlock_irqrestore(&device->al_lock, flags); 1153 if (drbd_ratelimit()) 1154 drbd_err(device, "drbd_rs_complete_io() called, but extent not found\n"); 1155 return; 1156 } 1157 1158 if (bm_ext->lce.refcnt == 0) { 1159 spin_unlock_irqrestore(&device->al_lock, flags); 1160 drbd_err(device, "drbd_rs_complete_io(,%llu [=%u]) called, " 1161 "but refcnt is 0!?\n", 1162 (unsigned long long)sector, enr); 1163 return; 1164 } 1165 1166 if (lc_put(device->resync, &bm_ext->lce) == 0) { 1167 bm_ext->flags = 0; /* clear BME_LOCKED, BME_NO_WRITES and BME_PRIORITY */ 1168 device->resync_locked--; 1169 wake_up(&device->al_wait); 1170 } 1171 1172 spin_unlock_irqrestore(&device->al_lock, flags); 1173 } 1174 1175 /** 1176 * drbd_rs_cancel_all() - Removes all extents from the resync LRU (even BME_LOCKED) 1177 * @device: DRBD device. 1178 */ 1179 void drbd_rs_cancel_all(struct drbd_device *device) 1180 { 1181 spin_lock_irq(&device->al_lock); 1182 1183 if (get_ldev_if_state(device, D_FAILED)) { /* Makes sure ->resync is there. */ 1184 lc_reset(device->resync); 1185 put_ldev(device); 1186 } 1187 device->resync_locked = 0; 1188 device->resync_wenr = LC_FREE; 1189 spin_unlock_irq(&device->al_lock); 1190 wake_up(&device->al_wait); 1191 } 1192 1193 /** 1194 * drbd_rs_del_all() - Gracefully remove all extents from the resync LRU 1195 * @device: DRBD device. 1196 * 1197 * Returns: %0 upon success, -EAGAIN if at least one reference count was 1198 * not zero. 1199 */ 1200 int drbd_rs_del_all(struct drbd_device *device) 1201 { 1202 struct lc_element *e; 1203 struct bm_extent *bm_ext; 1204 int i; 1205 1206 spin_lock_irq(&device->al_lock); 1207 1208 if (get_ldev_if_state(device, D_FAILED)) { 1209 /* ok, ->resync is there. */ 1210 for (i = 0; i < device->resync->nr_elements; i++) { 1211 e = lc_element_by_index(device->resync, i); 1212 bm_ext = lc_entry(e, struct bm_extent, lce); 1213 if (bm_ext->lce.lc_number == LC_FREE) 1214 continue; 1215 if (bm_ext->lce.lc_number == device->resync_wenr) { 1216 drbd_info(device, "dropping %u in drbd_rs_del_all, apparently" 1217 " got 'synced' by application io\n", 1218 device->resync_wenr); 1219 D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags)); 1220 D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags)); 1221 clear_bit(BME_NO_WRITES, &bm_ext->flags); 1222 device->resync_wenr = LC_FREE; 1223 lc_put(device->resync, &bm_ext->lce); 1224 } 1225 if (bm_ext->lce.refcnt != 0) { 1226 drbd_info(device, "Retrying drbd_rs_del_all() later. " 1227 "refcnt=%d\n", bm_ext->lce.refcnt); 1228 put_ldev(device); 1229 spin_unlock_irq(&device->al_lock); 1230 return -EAGAIN; 1231 } 1232 D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags)); 1233 D_ASSERT(device, !test_bit(BME_NO_WRITES, &bm_ext->flags)); 1234 lc_del(device->resync, &bm_ext->lce); 1235 } 1236 D_ASSERT(device, device->resync->used == 0); 1237 put_ldev(device); 1238 } 1239 spin_unlock_irq(&device->al_lock); 1240 wake_up(&device->al_wait); 1241 1242 return 0; 1243 } 1244