1 /* 2 drbd_actlog.c 3 4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg. 5 6 Copyright (C) 2003-2008, LINBIT Information Technologies GmbH. 7 Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>. 8 Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. 9 10 drbd is free software; you can redistribute it and/or modify 11 it under the terms of the GNU General Public License as published by 12 the Free Software Foundation; either version 2, or (at your option) 13 any later version. 14 15 drbd is distributed in the hope that it will be useful, 16 but WITHOUT ANY WARRANTY; without even the implied warranty of 17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 GNU General Public License for more details. 19 20 You should have received a copy of the GNU General Public License 21 along with drbd; see the file COPYING. If not, write to 22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 23 24 */ 25 26 #include <linux/slab.h> 27 #include <linux/crc32c.h> 28 #include <linux/drbd.h> 29 #include <linux/drbd_limits.h> 30 #include <linux/dynamic_debug.h> 31 #include "drbd_int.h" 32 #include "drbd_wrappers.h" 33 34 35 enum al_transaction_types { 36 AL_TR_UPDATE = 0, 37 AL_TR_INITIALIZED = 0xffff 38 }; 39 /* all fields on disc in big endian */ 40 struct __packed al_transaction_on_disk { 41 /* don't we all like magic */ 42 __be32 magic; 43 44 /* to identify the most recent transaction block 45 * in the on disk ring buffer */ 46 __be32 tr_number; 47 48 /* checksum on the full 4k block, with this field set to 0. */ 49 __be32 crc32c; 50 51 /* type of transaction, special transaction types like: 52 * purge-all, set-all-idle, set-all-active, ... to-be-defined 53 * see also enum al_transaction_types */ 54 __be16 transaction_type; 55 56 /* we currently allow only a few thousand extents, 57 * so 16bit will be enough for the slot number. */ 58 59 /* how many updates in this transaction */ 60 __be16 n_updates; 61 62 /* maximum slot number, "al-extents" in drbd.conf speak. 63 * Having this in each transaction should make reconfiguration 64 * of that parameter easier. */ 65 __be16 context_size; 66 67 /* slot number the context starts with */ 68 __be16 context_start_slot_nr; 69 70 /* Some reserved bytes. Expected usage is a 64bit counter of 71 * sectors-written since device creation, and other data generation tag 72 * supporting usage */ 73 __be32 __reserved[4]; 74 75 /* --- 36 byte used --- */ 76 77 /* Reserve space for up to AL_UPDATES_PER_TRANSACTION changes 78 * in one transaction, then use the remaining byte in the 4k block for 79 * context information. "Flexible" number of updates per transaction 80 * does not help, as we have to account for the case when all update 81 * slots are used anyways, so it would only complicate code without 82 * additional benefit. 83 */ 84 __be16 update_slot_nr[AL_UPDATES_PER_TRANSACTION]; 85 86 /* but the extent number is 32bit, which at an extent size of 4 MiB 87 * allows to cover device sizes of up to 2**54 Byte (16 PiB) */ 88 __be32 update_extent_nr[AL_UPDATES_PER_TRANSACTION]; 89 90 /* --- 420 bytes used (36 + 64*6) --- */ 91 92 /* 4096 - 420 = 3676 = 919 * 4 */ 93 __be32 context[AL_CONTEXT_PER_TRANSACTION]; 94 }; 95 96 struct update_odbm_work { 97 struct drbd_work w; 98 unsigned int enr; 99 }; 100 101 struct update_al_work { 102 struct drbd_work w; 103 struct completion event; 104 int err; 105 }; 106 107 static int al_write_transaction(struct drbd_conf *mdev); 108 109 void *drbd_md_get_buffer(struct drbd_conf *mdev) 110 { 111 int r; 112 113 wait_event(mdev->misc_wait, 114 (r = atomic_cmpxchg(&mdev->md_io_in_use, 0, 1)) == 0 || 115 mdev->state.disk <= D_FAILED); 116 117 return r ? NULL : page_address(mdev->md_io_page); 118 } 119 120 void drbd_md_put_buffer(struct drbd_conf *mdev) 121 { 122 if (atomic_dec_and_test(&mdev->md_io_in_use)) 123 wake_up(&mdev->misc_wait); 124 } 125 126 void wait_until_done_or_force_detached(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, 127 unsigned int *done) 128 { 129 long dt; 130 131 rcu_read_lock(); 132 dt = rcu_dereference(bdev->disk_conf)->disk_timeout; 133 rcu_read_unlock(); 134 dt = dt * HZ / 10; 135 if (dt == 0) 136 dt = MAX_SCHEDULE_TIMEOUT; 137 138 dt = wait_event_timeout(mdev->misc_wait, 139 *done || test_bit(FORCE_DETACH, &mdev->flags), dt); 140 if (dt == 0) { 141 dev_err(DEV, "meta-data IO operation timed out\n"); 142 drbd_chk_io_error(mdev, 1, DRBD_FORCE_DETACH); 143 } 144 } 145 146 static int _drbd_md_sync_page_io(struct drbd_conf *mdev, 147 struct drbd_backing_dev *bdev, 148 struct page *page, sector_t sector, 149 int rw, int size) 150 { 151 struct bio *bio; 152 int err; 153 154 mdev->md_io.done = 0; 155 mdev->md_io.error = -ENODEV; 156 157 if ((rw & WRITE) && !test_bit(MD_NO_FUA, &mdev->flags)) 158 rw |= REQ_FUA | REQ_FLUSH; 159 rw |= REQ_SYNC; 160 161 bio = bio_alloc_drbd(GFP_NOIO); 162 bio->bi_bdev = bdev->md_bdev; 163 bio->bi_sector = sector; 164 err = -EIO; 165 if (bio_add_page(bio, page, size, 0) != size) 166 goto out; 167 bio->bi_private = &mdev->md_io; 168 bio->bi_end_io = drbd_md_io_complete; 169 bio->bi_rw = rw; 170 171 if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* Corresponding put_ldev in drbd_md_io_complete() */ 172 dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n"); 173 err = -ENODEV; 174 goto out; 175 } 176 177 bio_get(bio); /* one bio_put() is in the completion handler */ 178 atomic_inc(&mdev->md_io_in_use); /* drbd_md_put_buffer() is in the completion handler */ 179 if (drbd_insert_fault(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) 180 bio_endio(bio, -EIO); 181 else 182 submit_bio(rw, bio); 183 wait_until_done_or_force_detached(mdev, bdev, &mdev->md_io.done); 184 if (bio_flagged(bio, BIO_UPTODATE)) 185 err = mdev->md_io.error; 186 187 out: 188 bio_put(bio); 189 return err; 190 } 191 192 int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, 193 sector_t sector, int rw) 194 { 195 int err; 196 struct page *iop = mdev->md_io_page; 197 198 D_ASSERT(atomic_read(&mdev->md_io_in_use) == 1); 199 200 BUG_ON(!bdev->md_bdev); 201 202 dev_dbg(DEV, "meta_data io: %s [%d]:%s(,%llus,%s)\n", 203 current->comm, current->pid, __func__, 204 (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); 205 206 if (sector < drbd_md_first_sector(bdev) || 207 sector + 7 > drbd_md_last_sector(bdev)) 208 dev_alert(DEV, "%s [%d]:%s(,%llus,%s) out of range md access!\n", 209 current->comm, current->pid, __func__, 210 (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); 211 212 err = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, MD_BLOCK_SIZE); 213 if (err) { 214 dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n", 215 (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err); 216 } 217 return err; 218 } 219 220 static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr) 221 { 222 struct lc_element *al_ext; 223 struct lc_element *tmp; 224 int wake; 225 226 spin_lock_irq(&mdev->al_lock); 227 tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT); 228 if (unlikely(tmp != NULL)) { 229 struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); 230 if (test_bit(BME_NO_WRITES, &bm_ext->flags)) { 231 wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags); 232 spin_unlock_irq(&mdev->al_lock); 233 if (wake) 234 wake_up(&mdev->al_wait); 235 return NULL; 236 } 237 } 238 al_ext = lc_get(mdev->act_log, enr); 239 spin_unlock_irq(&mdev->al_lock); 240 return al_ext; 241 } 242 243 void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i) 244 { 245 /* for bios crossing activity log extent boundaries, 246 * we may need to activate two extents in one go */ 247 unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); 248 unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); 249 unsigned enr; 250 bool locked = false; 251 252 253 D_ASSERT(first <= last); 254 D_ASSERT(atomic_read(&mdev->local_cnt) > 0); 255 256 for (enr = first; enr <= last; enr++) 257 wait_event(mdev->al_wait, _al_get(mdev, enr) != NULL); 258 259 /* Serialize multiple transactions. 260 * This uses test_and_set_bit, memory barrier is implicit. 261 */ 262 wait_event(mdev->al_wait, 263 mdev->act_log->pending_changes == 0 || 264 (locked = lc_try_lock_for_transaction(mdev->act_log))); 265 266 if (locked) { 267 /* drbd_al_write_transaction(mdev,al_ext,enr); 268 * recurses into generic_make_request(), which 269 * disallows recursion, bios being serialized on the 270 * current->bio_tail list now. 271 * we have to delegate updates to the activity log 272 * to the worker thread. */ 273 274 /* Double check: it may have been committed by someone else, 275 * while we have been waiting for the lock. */ 276 if (mdev->act_log->pending_changes) { 277 bool write_al_updates; 278 279 rcu_read_lock(); 280 write_al_updates = rcu_dereference(mdev->ldev->disk_conf)->al_updates; 281 rcu_read_unlock(); 282 283 if (write_al_updates) { 284 al_write_transaction(mdev); 285 mdev->al_writ_cnt++; 286 } 287 288 spin_lock_irq(&mdev->al_lock); 289 /* FIXME 290 if (err) 291 we need an "lc_cancel" here; 292 */ 293 lc_committed(mdev->act_log); 294 spin_unlock_irq(&mdev->al_lock); 295 } 296 lc_unlock(mdev->act_log); 297 wake_up(&mdev->al_wait); 298 } 299 } 300 301 void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i) 302 { 303 /* for bios crossing activity log extent boundaries, 304 * we may need to activate two extents in one go */ 305 unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); 306 unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); 307 unsigned enr; 308 struct lc_element *extent; 309 unsigned long flags; 310 311 D_ASSERT(first <= last); 312 spin_lock_irqsave(&mdev->al_lock, flags); 313 314 for (enr = first; enr <= last; enr++) { 315 extent = lc_find(mdev->act_log, enr); 316 if (!extent) { 317 dev_err(DEV, "al_complete_io() called on inactive extent %u\n", enr); 318 continue; 319 } 320 lc_put(mdev->act_log, extent); 321 } 322 spin_unlock_irqrestore(&mdev->al_lock, flags); 323 wake_up(&mdev->al_wait); 324 } 325 326 #if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT) 327 /* Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT 328 * are still coupled, or assume too much about their relation. 329 * Code below will not work if this is violated. 330 * Will be cleaned up with some followup patch. 331 */ 332 # error FIXME 333 #endif 334 335 static unsigned int al_extent_to_bm_page(unsigned int al_enr) 336 { 337 return al_enr >> 338 /* bit to page */ 339 ((PAGE_SHIFT + 3) - 340 /* al extent number to bit */ 341 (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)); 342 } 343 344 static unsigned int rs_extent_to_bm_page(unsigned int rs_enr) 345 { 346 return rs_enr >> 347 /* bit to page */ 348 ((PAGE_SHIFT + 3) - 349 /* resync extent number to bit */ 350 (BM_EXT_SHIFT - BM_BLOCK_SHIFT)); 351 } 352 353 static int 354 _al_write_transaction(struct drbd_conf *mdev) 355 { 356 struct al_transaction_on_disk *buffer; 357 struct lc_element *e; 358 sector_t sector; 359 int i, mx; 360 unsigned extent_nr; 361 unsigned crc = 0; 362 int err = 0; 363 364 if (!get_ldev(mdev)) { 365 dev_err(DEV, "disk is %s, cannot start al transaction\n", 366 drbd_disk_str(mdev->state.disk)); 367 return -EIO; 368 } 369 370 /* The bitmap write may have failed, causing a state change. */ 371 if (mdev->state.disk < D_INCONSISTENT) { 372 dev_err(DEV, 373 "disk is %s, cannot write al transaction\n", 374 drbd_disk_str(mdev->state.disk)); 375 put_ldev(mdev); 376 return -EIO; 377 } 378 379 buffer = drbd_md_get_buffer(mdev); /* protects md_io_buffer, al_tr_cycle, ... */ 380 if (!buffer) { 381 dev_err(DEV, "disk failed while waiting for md_io buffer\n"); 382 put_ldev(mdev); 383 return -ENODEV; 384 } 385 386 memset(buffer, 0, sizeof(*buffer)); 387 buffer->magic = cpu_to_be32(DRBD_AL_MAGIC); 388 buffer->tr_number = cpu_to_be32(mdev->al_tr_number); 389 390 i = 0; 391 392 /* Even though no one can start to change this list 393 * once we set the LC_LOCKED -- from drbd_al_begin_io(), 394 * lc_try_lock_for_transaction() --, someone may still 395 * be in the process of changing it. */ 396 spin_lock_irq(&mdev->al_lock); 397 list_for_each_entry(e, &mdev->act_log->to_be_changed, list) { 398 if (i == AL_UPDATES_PER_TRANSACTION) { 399 i++; 400 break; 401 } 402 buffer->update_slot_nr[i] = cpu_to_be16(e->lc_index); 403 buffer->update_extent_nr[i] = cpu_to_be32(e->lc_new_number); 404 if (e->lc_number != LC_FREE) 405 drbd_bm_mark_for_writeout(mdev, 406 al_extent_to_bm_page(e->lc_number)); 407 i++; 408 } 409 spin_unlock_irq(&mdev->al_lock); 410 BUG_ON(i > AL_UPDATES_PER_TRANSACTION); 411 412 buffer->n_updates = cpu_to_be16(i); 413 for ( ; i < AL_UPDATES_PER_TRANSACTION; i++) { 414 buffer->update_slot_nr[i] = cpu_to_be16(-1); 415 buffer->update_extent_nr[i] = cpu_to_be32(LC_FREE); 416 } 417 418 buffer->context_size = cpu_to_be16(mdev->act_log->nr_elements); 419 buffer->context_start_slot_nr = cpu_to_be16(mdev->al_tr_cycle); 420 421 mx = min_t(int, AL_CONTEXT_PER_TRANSACTION, 422 mdev->act_log->nr_elements - mdev->al_tr_cycle); 423 for (i = 0; i < mx; i++) { 424 unsigned idx = mdev->al_tr_cycle + i; 425 extent_nr = lc_element_by_index(mdev->act_log, idx)->lc_number; 426 buffer->context[i] = cpu_to_be32(extent_nr); 427 } 428 for (; i < AL_CONTEXT_PER_TRANSACTION; i++) 429 buffer->context[i] = cpu_to_be32(LC_FREE); 430 431 mdev->al_tr_cycle += AL_CONTEXT_PER_TRANSACTION; 432 if (mdev->al_tr_cycle >= mdev->act_log->nr_elements) 433 mdev->al_tr_cycle = 0; 434 435 sector = mdev->ldev->md.md_offset 436 + mdev->ldev->md.al_offset 437 + mdev->al_tr_pos * (MD_BLOCK_SIZE>>9); 438 439 crc = crc32c(0, buffer, 4096); 440 buffer->crc32c = cpu_to_be32(crc); 441 442 if (drbd_bm_write_hinted(mdev)) 443 err = -EIO; 444 /* drbd_chk_io_error done already */ 445 else if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { 446 err = -EIO; 447 drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); 448 } else { 449 /* advance ringbuffer position and transaction counter */ 450 mdev->al_tr_pos = (mdev->al_tr_pos + 1) % (MD_AL_SECTORS*512/MD_BLOCK_SIZE); 451 mdev->al_tr_number++; 452 } 453 454 drbd_md_put_buffer(mdev); 455 put_ldev(mdev); 456 457 return err; 458 } 459 460 461 static int w_al_write_transaction(struct drbd_work *w, int unused) 462 { 463 struct update_al_work *aw = container_of(w, struct update_al_work, w); 464 struct drbd_conf *mdev = w->mdev; 465 int err; 466 467 err = _al_write_transaction(mdev); 468 aw->err = err; 469 complete(&aw->event); 470 471 return err != -EIO ? err : 0; 472 } 473 474 /* Calls from worker context (see w_restart_disk_io()) need to write the 475 transaction directly. Others came through generic_make_request(), 476 those need to delegate it to the worker. */ 477 static int al_write_transaction(struct drbd_conf *mdev) 478 { 479 struct update_al_work al_work; 480 481 if (current == mdev->tconn->worker.task) 482 return _al_write_transaction(mdev); 483 484 init_completion(&al_work.event); 485 al_work.w.cb = w_al_write_transaction; 486 al_work.w.mdev = mdev; 487 drbd_queue_work_front(&mdev->tconn->sender_work, &al_work.w); 488 wait_for_completion(&al_work.event); 489 490 return al_work.err; 491 } 492 493 static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext) 494 { 495 int rv; 496 497 spin_lock_irq(&mdev->al_lock); 498 rv = (al_ext->refcnt == 0); 499 if (likely(rv)) 500 lc_del(mdev->act_log, al_ext); 501 spin_unlock_irq(&mdev->al_lock); 502 503 return rv; 504 } 505 506 /** 507 * drbd_al_shrink() - Removes all active extents form the activity log 508 * @mdev: DRBD device. 509 * 510 * Removes all active extents form the activity log, waiting until 511 * the reference count of each entry dropped to 0 first, of course. 512 * 513 * You need to lock mdev->act_log with lc_try_lock() / lc_unlock() 514 */ 515 void drbd_al_shrink(struct drbd_conf *mdev) 516 { 517 struct lc_element *al_ext; 518 int i; 519 520 D_ASSERT(test_bit(__LC_LOCKED, &mdev->act_log->flags)); 521 522 for (i = 0; i < mdev->act_log->nr_elements; i++) { 523 al_ext = lc_element_by_index(mdev->act_log, i); 524 if (al_ext->lc_number == LC_FREE) 525 continue; 526 wait_event(mdev->al_wait, _try_lc_del(mdev, al_ext)); 527 } 528 529 wake_up(&mdev->al_wait); 530 } 531 532 static int w_update_odbm(struct drbd_work *w, int unused) 533 { 534 struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w); 535 struct drbd_conf *mdev = w->mdev; 536 struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, }; 537 538 if (!get_ldev(mdev)) { 539 if (__ratelimit(&drbd_ratelimit_state)) 540 dev_warn(DEV, "Can not update on disk bitmap, local IO disabled.\n"); 541 kfree(udw); 542 return 0; 543 } 544 545 drbd_bm_write_page(mdev, rs_extent_to_bm_page(udw->enr)); 546 put_ldev(mdev); 547 548 kfree(udw); 549 550 if (drbd_bm_total_weight(mdev) <= mdev->rs_failed) { 551 switch (mdev->state.conn) { 552 case C_SYNC_SOURCE: case C_SYNC_TARGET: 553 case C_PAUSED_SYNC_S: case C_PAUSED_SYNC_T: 554 drbd_resync_finished(mdev); 555 default: 556 /* nothing to do */ 557 break; 558 } 559 } 560 drbd_bcast_event(mdev, &sib); 561 562 return 0; 563 } 564 565 566 /* ATTENTION. The AL's extents are 4MB each, while the extents in the 567 * resync LRU-cache are 16MB each. 568 * The caller of this function has to hold an get_ldev() reference. 569 * 570 * TODO will be obsoleted once we have a caching lru of the on disk bitmap 571 */ 572 static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector, 573 int count, int success) 574 { 575 struct lc_element *e; 576 struct update_odbm_work *udw; 577 578 unsigned int enr; 579 580 D_ASSERT(atomic_read(&mdev->local_cnt)); 581 582 /* I simply assume that a sector/size pair never crosses 583 * a 16 MB extent border. (Currently this is true...) */ 584 enr = BM_SECT_TO_EXT(sector); 585 586 e = lc_get(mdev->resync, enr); 587 if (e) { 588 struct bm_extent *ext = lc_entry(e, struct bm_extent, lce); 589 if (ext->lce.lc_number == enr) { 590 if (success) 591 ext->rs_left -= count; 592 else 593 ext->rs_failed += count; 594 if (ext->rs_left < ext->rs_failed) { 595 dev_warn(DEV, "BAD! sector=%llus enr=%u rs_left=%d " 596 "rs_failed=%d count=%d cstate=%s\n", 597 (unsigned long long)sector, 598 ext->lce.lc_number, ext->rs_left, 599 ext->rs_failed, count, 600 drbd_conn_str(mdev->state.conn)); 601 602 /* We don't expect to be able to clear more bits 603 * than have been set when we originally counted 604 * the set bits to cache that value in ext->rs_left. 605 * Whatever the reason (disconnect during resync, 606 * delayed local completion of an application write), 607 * try to fix it up by recounting here. */ 608 ext->rs_left = drbd_bm_e_weight(mdev, enr); 609 } 610 } else { 611 /* Normally this element should be in the cache, 612 * since drbd_rs_begin_io() pulled it already in. 613 * 614 * But maybe an application write finished, and we set 615 * something outside the resync lru_cache in sync. 616 */ 617 int rs_left = drbd_bm_e_weight(mdev, enr); 618 if (ext->flags != 0) { 619 dev_warn(DEV, "changing resync lce: %d[%u;%02lx]" 620 " -> %d[%u;00]\n", 621 ext->lce.lc_number, ext->rs_left, 622 ext->flags, enr, rs_left); 623 ext->flags = 0; 624 } 625 if (ext->rs_failed) { 626 dev_warn(DEV, "Kicking resync_lru element enr=%u " 627 "out with rs_failed=%d\n", 628 ext->lce.lc_number, ext->rs_failed); 629 } 630 ext->rs_left = rs_left; 631 ext->rs_failed = success ? 0 : count; 632 /* we don't keep a persistent log of the resync lru, 633 * we can commit any change right away. */ 634 lc_committed(mdev->resync); 635 } 636 lc_put(mdev->resync, &ext->lce); 637 /* no race, we are within the al_lock! */ 638 639 if (ext->rs_left == ext->rs_failed) { 640 ext->rs_failed = 0; 641 642 udw = kmalloc(sizeof(*udw), GFP_ATOMIC); 643 if (udw) { 644 udw->enr = ext->lce.lc_number; 645 udw->w.cb = w_update_odbm; 646 udw->w.mdev = mdev; 647 drbd_queue_work_front(&mdev->tconn->sender_work, &udw->w); 648 } else { 649 dev_warn(DEV, "Could not kmalloc an udw\n"); 650 } 651 } 652 } else { 653 dev_err(DEV, "lc_get() failed! locked=%d/%d flags=%lu\n", 654 mdev->resync_locked, 655 mdev->resync->nr_elements, 656 mdev->resync->flags); 657 } 658 } 659 660 void drbd_advance_rs_marks(struct drbd_conf *mdev, unsigned long still_to_go) 661 { 662 unsigned long now = jiffies; 663 unsigned long last = mdev->rs_mark_time[mdev->rs_last_mark]; 664 int next = (mdev->rs_last_mark + 1) % DRBD_SYNC_MARKS; 665 if (time_after_eq(now, last + DRBD_SYNC_MARK_STEP)) { 666 if (mdev->rs_mark_left[mdev->rs_last_mark] != still_to_go && 667 mdev->state.conn != C_PAUSED_SYNC_T && 668 mdev->state.conn != C_PAUSED_SYNC_S) { 669 mdev->rs_mark_time[next] = now; 670 mdev->rs_mark_left[next] = still_to_go; 671 mdev->rs_last_mark = next; 672 } 673 } 674 } 675 676 /* clear the bit corresponding to the piece of storage in question: 677 * size byte of data starting from sector. Only clear a bits of the affected 678 * one ore more _aligned_ BM_BLOCK_SIZE blocks. 679 * 680 * called by worker on C_SYNC_TARGET and receiver on SyncSource. 681 * 682 */ 683 void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size, 684 const char *file, const unsigned int line) 685 { 686 /* Is called from worker and receiver context _only_ */ 687 unsigned long sbnr, ebnr, lbnr; 688 unsigned long count = 0; 689 sector_t esector, nr_sectors; 690 int wake_up = 0; 691 unsigned long flags; 692 693 if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) { 694 dev_err(DEV, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n", 695 (unsigned long long)sector, size); 696 return; 697 } 698 699 if (!get_ldev(mdev)) 700 return; /* no disk, no metadata, no bitmap to clear bits in */ 701 702 nr_sectors = drbd_get_capacity(mdev->this_bdev); 703 esector = sector + (size >> 9) - 1; 704 705 if (!expect(sector < nr_sectors)) 706 goto out; 707 if (!expect(esector < nr_sectors)) 708 esector = nr_sectors - 1; 709 710 lbnr = BM_SECT_TO_BIT(nr_sectors-1); 711 712 /* we clear it (in sync). 713 * round up start sector, round down end sector. we make sure we only 714 * clear full, aligned, BM_BLOCK_SIZE (4K) blocks */ 715 if (unlikely(esector < BM_SECT_PER_BIT-1)) 716 goto out; 717 if (unlikely(esector == (nr_sectors-1))) 718 ebnr = lbnr; 719 else 720 ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1)); 721 sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1); 722 723 if (sbnr > ebnr) 724 goto out; 725 726 /* 727 * ok, (capacity & 7) != 0 sometimes, but who cares... 728 * we count rs_{total,left} in bits, not sectors. 729 */ 730 count = drbd_bm_clear_bits(mdev, sbnr, ebnr); 731 if (count) { 732 drbd_advance_rs_marks(mdev, drbd_bm_total_weight(mdev)); 733 spin_lock_irqsave(&mdev->al_lock, flags); 734 drbd_try_clear_on_disk_bm(mdev, sector, count, true); 735 spin_unlock_irqrestore(&mdev->al_lock, flags); 736 737 /* just wake_up unconditional now, various lc_chaged(), 738 * lc_put() in drbd_try_clear_on_disk_bm(). */ 739 wake_up = 1; 740 } 741 out: 742 put_ldev(mdev); 743 if (wake_up) 744 wake_up(&mdev->al_wait); 745 } 746 747 /* 748 * this is intended to set one request worth of data out of sync. 749 * affects at least 1 bit, 750 * and at most 1+DRBD_MAX_BIO_SIZE/BM_BLOCK_SIZE bits. 751 * 752 * called by tl_clear and drbd_send_dblock (==drbd_make_request). 753 * so this can be _any_ process. 754 */ 755 int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size, 756 const char *file, const unsigned int line) 757 { 758 unsigned long sbnr, ebnr, flags; 759 sector_t esector, nr_sectors; 760 unsigned int enr, count = 0; 761 struct lc_element *e; 762 763 /* this should be an empty REQ_FLUSH */ 764 if (size == 0) 765 return 0; 766 767 if (size < 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) { 768 dev_err(DEV, "sector: %llus, size: %d\n", 769 (unsigned long long)sector, size); 770 return 0; 771 } 772 773 if (!get_ldev(mdev)) 774 return 0; /* no disk, no metadata, no bitmap to set bits in */ 775 776 nr_sectors = drbd_get_capacity(mdev->this_bdev); 777 esector = sector + (size >> 9) - 1; 778 779 if (!expect(sector < nr_sectors)) 780 goto out; 781 if (!expect(esector < nr_sectors)) 782 esector = nr_sectors - 1; 783 784 /* we set it out of sync, 785 * we do not need to round anything here */ 786 sbnr = BM_SECT_TO_BIT(sector); 787 ebnr = BM_SECT_TO_BIT(esector); 788 789 /* ok, (capacity & 7) != 0 sometimes, but who cares... 790 * we count rs_{total,left} in bits, not sectors. */ 791 spin_lock_irqsave(&mdev->al_lock, flags); 792 count = drbd_bm_set_bits(mdev, sbnr, ebnr); 793 794 enr = BM_SECT_TO_EXT(sector); 795 e = lc_find(mdev->resync, enr); 796 if (e) 797 lc_entry(e, struct bm_extent, lce)->rs_left += count; 798 spin_unlock_irqrestore(&mdev->al_lock, flags); 799 800 out: 801 put_ldev(mdev); 802 803 return count; 804 } 805 806 static 807 struct bm_extent *_bme_get(struct drbd_conf *mdev, unsigned int enr) 808 { 809 struct lc_element *e; 810 struct bm_extent *bm_ext; 811 int wakeup = 0; 812 unsigned long rs_flags; 813 814 spin_lock_irq(&mdev->al_lock); 815 if (mdev->resync_locked > mdev->resync->nr_elements/2) { 816 spin_unlock_irq(&mdev->al_lock); 817 return NULL; 818 } 819 e = lc_get(mdev->resync, enr); 820 bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; 821 if (bm_ext) { 822 if (bm_ext->lce.lc_number != enr) { 823 bm_ext->rs_left = drbd_bm_e_weight(mdev, enr); 824 bm_ext->rs_failed = 0; 825 lc_committed(mdev->resync); 826 wakeup = 1; 827 } 828 if (bm_ext->lce.refcnt == 1) 829 mdev->resync_locked++; 830 set_bit(BME_NO_WRITES, &bm_ext->flags); 831 } 832 rs_flags = mdev->resync->flags; 833 spin_unlock_irq(&mdev->al_lock); 834 if (wakeup) 835 wake_up(&mdev->al_wait); 836 837 if (!bm_ext) { 838 if (rs_flags & LC_STARVING) 839 dev_warn(DEV, "Have to wait for element" 840 " (resync LRU too small?)\n"); 841 BUG_ON(rs_flags & LC_LOCKED); 842 } 843 844 return bm_ext; 845 } 846 847 static int _is_in_al(struct drbd_conf *mdev, unsigned int enr) 848 { 849 int rv; 850 851 spin_lock_irq(&mdev->al_lock); 852 rv = lc_is_used(mdev->act_log, enr); 853 spin_unlock_irq(&mdev->al_lock); 854 855 return rv; 856 } 857 858 /** 859 * drbd_rs_begin_io() - Gets an extent in the resync LRU cache and sets it to BME_LOCKED 860 * @mdev: DRBD device. 861 * @sector: The sector number. 862 * 863 * This functions sleeps on al_wait. Returns 0 on success, -EINTR if interrupted. 864 */ 865 int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector) 866 { 867 unsigned int enr = BM_SECT_TO_EXT(sector); 868 struct bm_extent *bm_ext; 869 int i, sig; 870 int sa = 200; /* Step aside 200 times, then grab the extent and let app-IO wait. 871 200 times -> 20 seconds. */ 872 873 retry: 874 sig = wait_event_interruptible(mdev->al_wait, 875 (bm_ext = _bme_get(mdev, enr))); 876 if (sig) 877 return -EINTR; 878 879 if (test_bit(BME_LOCKED, &bm_ext->flags)) 880 return 0; 881 882 for (i = 0; i < AL_EXT_PER_BM_SECT; i++) { 883 sig = wait_event_interruptible(mdev->al_wait, 884 !_is_in_al(mdev, enr * AL_EXT_PER_BM_SECT + i) || 885 test_bit(BME_PRIORITY, &bm_ext->flags)); 886 887 if (sig || (test_bit(BME_PRIORITY, &bm_ext->flags) && sa)) { 888 spin_lock_irq(&mdev->al_lock); 889 if (lc_put(mdev->resync, &bm_ext->lce) == 0) { 890 bm_ext->flags = 0; /* clears BME_NO_WRITES and eventually BME_PRIORITY */ 891 mdev->resync_locked--; 892 wake_up(&mdev->al_wait); 893 } 894 spin_unlock_irq(&mdev->al_lock); 895 if (sig) 896 return -EINTR; 897 if (schedule_timeout_interruptible(HZ/10)) 898 return -EINTR; 899 if (sa && --sa == 0) 900 dev_warn(DEV,"drbd_rs_begin_io() stepped aside for 20sec." 901 "Resync stalled?\n"); 902 goto retry; 903 } 904 } 905 set_bit(BME_LOCKED, &bm_ext->flags); 906 return 0; 907 } 908 909 /** 910 * drbd_try_rs_begin_io() - Gets an extent in the resync LRU cache, does not sleep 911 * @mdev: DRBD device. 912 * @sector: The sector number. 913 * 914 * Gets an extent in the resync LRU cache, sets it to BME_NO_WRITES, then 915 * tries to set it to BME_LOCKED. Returns 0 upon success, and -EAGAIN 916 * if there is still application IO going on in this area. 917 */ 918 int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector) 919 { 920 unsigned int enr = BM_SECT_TO_EXT(sector); 921 const unsigned int al_enr = enr*AL_EXT_PER_BM_SECT; 922 struct lc_element *e; 923 struct bm_extent *bm_ext; 924 int i; 925 926 spin_lock_irq(&mdev->al_lock); 927 if (mdev->resync_wenr != LC_FREE && mdev->resync_wenr != enr) { 928 /* in case you have very heavy scattered io, it may 929 * stall the syncer undefined if we give up the ref count 930 * when we try again and requeue. 931 * 932 * if we don't give up the refcount, but the next time 933 * we are scheduled this extent has been "synced" by new 934 * application writes, we'd miss the lc_put on the 935 * extent we keep the refcount on. 936 * so we remembered which extent we had to try again, and 937 * if the next requested one is something else, we do 938 * the lc_put here... 939 * we also have to wake_up 940 */ 941 e = lc_find(mdev->resync, mdev->resync_wenr); 942 bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; 943 if (bm_ext) { 944 D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags)); 945 D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags)); 946 clear_bit(BME_NO_WRITES, &bm_ext->flags); 947 mdev->resync_wenr = LC_FREE; 948 if (lc_put(mdev->resync, &bm_ext->lce) == 0) 949 mdev->resync_locked--; 950 wake_up(&mdev->al_wait); 951 } else { 952 dev_alert(DEV, "LOGIC BUG\n"); 953 } 954 } 955 /* TRY. */ 956 e = lc_try_get(mdev->resync, enr); 957 bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; 958 if (bm_ext) { 959 if (test_bit(BME_LOCKED, &bm_ext->flags)) 960 goto proceed; 961 if (!test_and_set_bit(BME_NO_WRITES, &bm_ext->flags)) { 962 mdev->resync_locked++; 963 } else { 964 /* we did set the BME_NO_WRITES, 965 * but then could not set BME_LOCKED, 966 * so we tried again. 967 * drop the extra reference. */ 968 bm_ext->lce.refcnt--; 969 D_ASSERT(bm_ext->lce.refcnt > 0); 970 } 971 goto check_al; 972 } else { 973 /* do we rather want to try later? */ 974 if (mdev->resync_locked > mdev->resync->nr_elements-3) 975 goto try_again; 976 /* Do or do not. There is no try. -- Yoda */ 977 e = lc_get(mdev->resync, enr); 978 bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; 979 if (!bm_ext) { 980 const unsigned long rs_flags = mdev->resync->flags; 981 if (rs_flags & LC_STARVING) 982 dev_warn(DEV, "Have to wait for element" 983 " (resync LRU too small?)\n"); 984 BUG_ON(rs_flags & LC_LOCKED); 985 goto try_again; 986 } 987 if (bm_ext->lce.lc_number != enr) { 988 bm_ext->rs_left = drbd_bm_e_weight(mdev, enr); 989 bm_ext->rs_failed = 0; 990 lc_committed(mdev->resync); 991 wake_up(&mdev->al_wait); 992 D_ASSERT(test_bit(BME_LOCKED, &bm_ext->flags) == 0); 993 } 994 set_bit(BME_NO_WRITES, &bm_ext->flags); 995 D_ASSERT(bm_ext->lce.refcnt == 1); 996 mdev->resync_locked++; 997 goto check_al; 998 } 999 check_al: 1000 for (i = 0; i < AL_EXT_PER_BM_SECT; i++) { 1001 if (lc_is_used(mdev->act_log, al_enr+i)) 1002 goto try_again; 1003 } 1004 set_bit(BME_LOCKED, &bm_ext->flags); 1005 proceed: 1006 mdev->resync_wenr = LC_FREE; 1007 spin_unlock_irq(&mdev->al_lock); 1008 return 0; 1009 1010 try_again: 1011 if (bm_ext) 1012 mdev->resync_wenr = enr; 1013 spin_unlock_irq(&mdev->al_lock); 1014 return -EAGAIN; 1015 } 1016 1017 void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector) 1018 { 1019 unsigned int enr = BM_SECT_TO_EXT(sector); 1020 struct lc_element *e; 1021 struct bm_extent *bm_ext; 1022 unsigned long flags; 1023 1024 spin_lock_irqsave(&mdev->al_lock, flags); 1025 e = lc_find(mdev->resync, enr); 1026 bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; 1027 if (!bm_ext) { 1028 spin_unlock_irqrestore(&mdev->al_lock, flags); 1029 if (__ratelimit(&drbd_ratelimit_state)) 1030 dev_err(DEV, "drbd_rs_complete_io() called, but extent not found\n"); 1031 return; 1032 } 1033 1034 if (bm_ext->lce.refcnt == 0) { 1035 spin_unlock_irqrestore(&mdev->al_lock, flags); 1036 dev_err(DEV, "drbd_rs_complete_io(,%llu [=%u]) called, " 1037 "but refcnt is 0!?\n", 1038 (unsigned long long)sector, enr); 1039 return; 1040 } 1041 1042 if (lc_put(mdev->resync, &bm_ext->lce) == 0) { 1043 bm_ext->flags = 0; /* clear BME_LOCKED, BME_NO_WRITES and BME_PRIORITY */ 1044 mdev->resync_locked--; 1045 wake_up(&mdev->al_wait); 1046 } 1047 1048 spin_unlock_irqrestore(&mdev->al_lock, flags); 1049 } 1050 1051 /** 1052 * drbd_rs_cancel_all() - Removes all extents from the resync LRU (even BME_LOCKED) 1053 * @mdev: DRBD device. 1054 */ 1055 void drbd_rs_cancel_all(struct drbd_conf *mdev) 1056 { 1057 spin_lock_irq(&mdev->al_lock); 1058 1059 if (get_ldev_if_state(mdev, D_FAILED)) { /* Makes sure ->resync is there. */ 1060 lc_reset(mdev->resync); 1061 put_ldev(mdev); 1062 } 1063 mdev->resync_locked = 0; 1064 mdev->resync_wenr = LC_FREE; 1065 spin_unlock_irq(&mdev->al_lock); 1066 wake_up(&mdev->al_wait); 1067 } 1068 1069 /** 1070 * drbd_rs_del_all() - Gracefully remove all extents from the resync LRU 1071 * @mdev: DRBD device. 1072 * 1073 * Returns 0 upon success, -EAGAIN if at least one reference count was 1074 * not zero. 1075 */ 1076 int drbd_rs_del_all(struct drbd_conf *mdev) 1077 { 1078 struct lc_element *e; 1079 struct bm_extent *bm_ext; 1080 int i; 1081 1082 spin_lock_irq(&mdev->al_lock); 1083 1084 if (get_ldev_if_state(mdev, D_FAILED)) { 1085 /* ok, ->resync is there. */ 1086 for (i = 0; i < mdev->resync->nr_elements; i++) { 1087 e = lc_element_by_index(mdev->resync, i); 1088 bm_ext = lc_entry(e, struct bm_extent, lce); 1089 if (bm_ext->lce.lc_number == LC_FREE) 1090 continue; 1091 if (bm_ext->lce.lc_number == mdev->resync_wenr) { 1092 dev_info(DEV, "dropping %u in drbd_rs_del_all, apparently" 1093 " got 'synced' by application io\n", 1094 mdev->resync_wenr); 1095 D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags)); 1096 D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags)); 1097 clear_bit(BME_NO_WRITES, &bm_ext->flags); 1098 mdev->resync_wenr = LC_FREE; 1099 lc_put(mdev->resync, &bm_ext->lce); 1100 } 1101 if (bm_ext->lce.refcnt != 0) { 1102 dev_info(DEV, "Retrying drbd_rs_del_all() later. " 1103 "refcnt=%d\n", bm_ext->lce.refcnt); 1104 put_ldev(mdev); 1105 spin_unlock_irq(&mdev->al_lock); 1106 return -EAGAIN; 1107 } 1108 D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags)); 1109 D_ASSERT(!test_bit(BME_NO_WRITES, &bm_ext->flags)); 1110 lc_del(mdev->resync, &bm_ext->lce); 1111 } 1112 D_ASSERT(mdev->resync->used == 0); 1113 put_ldev(mdev); 1114 } 1115 spin_unlock_irq(&mdev->al_lock); 1116 wake_up(&mdev->al_wait); 1117 1118 return 0; 1119 } 1120 1121 /** 1122 * drbd_rs_failed_io() - Record information on a failure to resync the specified blocks 1123 * @mdev: DRBD device. 1124 * @sector: The sector number. 1125 * @size: Size of failed IO operation, in byte. 1126 */ 1127 void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size) 1128 { 1129 /* Is called from worker and receiver context _only_ */ 1130 unsigned long sbnr, ebnr, lbnr; 1131 unsigned long count; 1132 sector_t esector, nr_sectors; 1133 int wake_up = 0; 1134 1135 if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) { 1136 dev_err(DEV, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n", 1137 (unsigned long long)sector, size); 1138 return; 1139 } 1140 nr_sectors = drbd_get_capacity(mdev->this_bdev); 1141 esector = sector + (size >> 9) - 1; 1142 1143 if (!expect(sector < nr_sectors)) 1144 return; 1145 if (!expect(esector < nr_sectors)) 1146 esector = nr_sectors - 1; 1147 1148 lbnr = BM_SECT_TO_BIT(nr_sectors-1); 1149 1150 /* 1151 * round up start sector, round down end sector. we make sure we only 1152 * handle full, aligned, BM_BLOCK_SIZE (4K) blocks */ 1153 if (unlikely(esector < BM_SECT_PER_BIT-1)) 1154 return; 1155 if (unlikely(esector == (nr_sectors-1))) 1156 ebnr = lbnr; 1157 else 1158 ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1)); 1159 sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1); 1160 1161 if (sbnr > ebnr) 1162 return; 1163 1164 /* 1165 * ok, (capacity & 7) != 0 sometimes, but who cares... 1166 * we count rs_{total,left} in bits, not sectors. 1167 */ 1168 spin_lock_irq(&mdev->al_lock); 1169 count = drbd_bm_count_bits(mdev, sbnr, ebnr); 1170 if (count) { 1171 mdev->rs_failed += count; 1172 1173 if (get_ldev(mdev)) { 1174 drbd_try_clear_on_disk_bm(mdev, sector, count, false); 1175 put_ldev(mdev); 1176 } 1177 1178 /* just wake_up unconditional now, various lc_chaged(), 1179 * lc_put() in drbd_try_clear_on_disk_bm(). */ 1180 wake_up = 1; 1181 } 1182 spin_unlock_irq(&mdev->al_lock); 1183 if (wake_up) 1184 wake_up(&mdev->al_wait); 1185 } 1186