/*
   drbd_actlog.c

   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.

 */

#include <linux/slab.h>
#include <linux/crc32c.h>
#include <linux/drbd.h>
#include <linux/drbd_limits.h>
#include <linux/dynamic_debug.h>
#include "drbd_int.h"


enum al_transaction_types {
	AL_TR_UPDATE = 0,
	AL_TR_INITIALIZED = 0xffff
};
/* all fields on disk in big endian */
struct __packed al_transaction_on_disk {
	/* don't we all like magic */
	__be32	magic;

	/* to identify the most recent transaction block
	 * in the on disk ring buffer */
	__be32	tr_number;

	/* checksum on the full 4k block, with this field set to 0. */
	__be32	crc32c;

	/* type of transaction, special transaction types like:
	 * purge-all, set-all-idle, set-all-active, ... to-be-defined
	 * see also enum al_transaction_types */
	__be16	transaction_type;

	/* we currently allow only a few thousand extents,
	 * so 16bit will be enough for the slot number. */

	/* how many updates in this transaction */
	__be16	n_updates;

	/* maximum slot number, "al-extents" in drbd.conf speak.
	 * Having this in each transaction should make reconfiguration
	 * of that parameter easier. */
	__be16	context_size;

	/* slot number the context starts with */
	__be16	context_start_slot_nr;

	/* Some reserved bytes.  Expected usage is a 64bit counter of
	 * sectors-written since device creation, and other data generation tag
	 * supporting usage */
	__be32	__reserved[4];

	/* --- 36 bytes used --- */

	/* Reserve space for up to AL_UPDATES_PER_TRANSACTION changes
	 * in one transaction, then use the remaining bytes in the 4k block for
	 * context information.  "Flexible" number of updates per transaction
	 * does not help, as we have to account for the case when all update
	 * slots are used anyways, so it would only complicate code without
	 * additional benefit.
	 */
	__be16	update_slot_nr[AL_UPDATES_PER_TRANSACTION];

	/* but the extent number is 32bit, which at an extent size of 4 MiB
	 * allows to cover device sizes of up to 2**54 Byte (16 PiB) */
	__be32	update_extent_nr[AL_UPDATES_PER_TRANSACTION];

	/* --- 420 bytes used (36 + 64*6) --- */

	/* 4096 - 420 = 3676 = 919 * 4 */
	__be32	context[AL_CONTEXT_PER_TRANSACTION];
};
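
/*
 * Layout check, worked through (assuming AL_UPDATES_PER_TRANSACTION == 64
 * and AL_CONTEXT_PER_TRANSACTION == 919, as the size comments above imply):
 * the fixed header is 3 * 4 + 4 * 2 + 4 * 4 = 36 bytes; the two update
 * arrays add 64 * (2 + 4) = 384 bytes, for 420 bytes total; the remaining
 * 4096 - 420 = 3676 bytes hold 919 context slots of 4 bytes each, so one
 * transaction fills exactly one aligned 4k block.
 */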

void *drbd_md_get_buffer(struct drbd_device *device, const char *intent)
{
	int r;

	wait_event(device->misc_wait,
		   (r = atomic_cmpxchg(&device->md_io.in_use, 0, 1)) == 0 ||
		   device->state.disk <= D_FAILED);

	if (r)
		return NULL;

	device->md_io.current_use = intent;
	device->md_io.start_jif = jiffies;
	device->md_io.submit_jif = device->md_io.start_jif - 1;
	return page_address(device->md_io.page);
}

void drbd_md_put_buffer(struct drbd_device *device)
{
	if (atomic_dec_and_test(&device->md_io.in_use))
		wake_up(&device->misc_wait);
}

void wait_until_done_or_force_detached(struct drbd_device *device, struct drbd_backing_dev *bdev,
				       unsigned int *done)
{
	long dt;

	rcu_read_lock();
	dt = rcu_dereference(bdev->disk_conf)->disk_timeout;
	rcu_read_unlock();
	dt = dt * HZ / 10;
	if (dt == 0)
		dt = MAX_SCHEDULE_TIMEOUT;

	dt = wait_event_timeout(device->misc_wait,
			*done || test_bit(FORCE_DETACH, &device->flags), dt);
	if (dt == 0) {
		drbd_err(device, "meta-data IO operation timed out\n");
		drbd_chk_io_error(device, 1, DRBD_FORCE_DETACH);
	}
}

static int _drbd_md_sync_page_io(struct drbd_device *device,
				 struct drbd_backing_dev *bdev,
				 sector_t sector, int rw)
{
	struct bio *bio;
	/* we do all our meta data IO in aligned 4k blocks. */
	const int size = 4096;
	int err;

	device->md_io.done = 0;
	device->md_io.error = -ENODEV;

	if ((rw & WRITE) && !test_bit(MD_NO_FUA, &device->flags))
		rw |= REQ_FUA | REQ_FLUSH;
	rw |= REQ_SYNC | REQ_NOIDLE;

	bio = bio_alloc_drbd(GFP_NOIO);
	bio->bi_bdev = bdev->md_bdev;
	bio->bi_iter.bi_sector = sector;
	err = -EIO;
	if (bio_add_page(bio, device->md_io.page, size, 0) != size)
		goto out;
	bio->bi_private = device;
	bio->bi_end_io = drbd_md_endio;
	bio->bi_rw = rw;

	if (!(rw & WRITE) && device->state.disk == D_DISKLESS && device->ldev == NULL)
		/* special case, drbd_md_read() during drbd_adm_attach(): no get_ldev */
		;
	else if (!get_ldev_if_state(device, D_ATTACHING)) {
		/* Corresponding put_ldev in drbd_md_endio() */
		drbd_err(device, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n");
		err = -ENODEV;
		goto out;
	}

	bio_get(bio); /* one bio_put() is in the completion handler */
	atomic_inc(&device->md_io.in_use); /* drbd_md_put_buffer() is in the completion handler */
	device->md_io.submit_jif = jiffies;
	if (drbd_insert_fault(device, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
		bio_io_error(bio);
	else
		submit_bio(rw, bio);
	wait_until_done_or_force_detached(device, bdev, &device->md_io.done);
	if (!bio->bi_error)
		err = device->md_io.error;

 out:
	bio_put(bio);
	return err;
}
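
/*
 * Typical pairing of the helpers above (a sketch mirroring what
 * al_write_transaction() further down actually does; not an additional API):
 *
 *	buffer = drbd_md_get_buffer(device, __func__);	// claims md_io.page
 *	if (buffer) {
 *		// ... fill the aligned 4k buffer ...
 *		err = drbd_md_sync_page_io(device, device->ldev, sector, WRITE);
 *		drbd_md_put_buffer(device);		// releases md_io.in_use
 *	}
 */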

int drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bdev,
			 sector_t sector, int rw)
{
	int err;
	D_ASSERT(device, atomic_read(&device->md_io.in_use) == 1);

	BUG_ON(!bdev->md_bdev);

	dynamic_drbd_dbg(device, "meta_data io: %s [%d]:%s(,%llus,%s) %pS\n",
	     current->comm, current->pid, __func__,
	     (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ",
	     (void*)_RET_IP_ );

	if (sector < drbd_md_first_sector(bdev) ||
	    sector + 7 > drbd_md_last_sector(bdev))
		drbd_alert(device, "%s [%d]:%s(,%llus,%s) out of range md access!\n",
		     current->comm, current->pid, __func__,
		     (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");

	err = _drbd_md_sync_page_io(device, bdev, sector, rw);
	if (err) {
		drbd_err(device, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n",
		    (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err);
	}
	return err;
}

static struct bm_extent *find_active_resync_extent(struct drbd_device *device, unsigned int enr)
{
	struct lc_element *tmp;
	tmp = lc_find(device->resync, enr/AL_EXT_PER_BM_SECT);
	if (unlikely(tmp != NULL)) {
		struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
		if (test_bit(BME_NO_WRITES, &bm_ext->flags))
			return bm_ext;
	}
	return NULL;
}

static struct lc_element *_al_get(struct drbd_device *device, unsigned int enr, bool nonblock)
{
	struct lc_element *al_ext;
	struct bm_extent *bm_ext;
	int wake;

	spin_lock_irq(&device->al_lock);
	bm_ext = find_active_resync_extent(device, enr);
	if (bm_ext) {
		wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags);
		spin_unlock_irq(&device->al_lock);
		if (wake)
			wake_up(&device->al_wait);
		return NULL;
	}
	if (nonblock)
		al_ext = lc_try_get(device->act_log, enr);
	else
		al_ext = lc_get(device->act_log, enr);
	spin_unlock_irq(&device->al_lock);
	return al_ext;
}

bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval *i)
{
	/* for bios crossing activity log extent boundaries,
	 * we may need to activate two extents in one go */
	unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
	unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);

	D_ASSERT(device, (unsigned)(last - first) <= 1);
	D_ASSERT(device, atomic_read(&device->local_cnt) > 0);

	/* FIXME figure out a fast path for bios crossing AL extent boundaries */
	if (first != last)
		return false;

	return _al_get(device, first, true);
}
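
/*
 * Worked example for the first/last computation above (assuming the usual
 * AL_EXTENT_SHIFT of 22, i.e. 4 MiB activity log extents as the on-disk
 * format comment assumes; sectors are 512 bytes, hence ">> (22-9)"):
 * a 4096 byte write at sector 8184 spans sectors 8184..8191, so
 * first = 8184 >> 13 = 0 and last = 8191 >> 13 = 0: one extent suffices.
 * The same write at sector 8188 spans sectors 8188..8195, so first = 0
 * but last = 8195 >> 13 = 1: it crosses an extent boundary and needs
 * both extents active, which the fastpath above refuses to handle.
 */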

bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i)
{
	/* for bios crossing activity log extent boundaries,
	 * we may need to activate two extents in one go */
	unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
	unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
	unsigned enr;
	bool need_transaction = false;

	D_ASSERT(device, first <= last);
	D_ASSERT(device, atomic_read(&device->local_cnt) > 0);

	for (enr = first; enr <= last; enr++) {
		struct lc_element *al_ext;
		wait_event(device->al_wait,
				(al_ext = _al_get(device, enr, false)) != NULL);
		if (al_ext->lc_number != enr)
			need_transaction = true;
	}
	return need_transaction;
}

#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)
/* Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT
 * are still coupled, or assume too much about their relation.
 * Code below will not work if this is violated.
 * Will be cleaned up with some followup patch.
 */
# error FIXME
#endif

static unsigned int al_extent_to_bm_page(unsigned int al_enr)
{
	return al_enr >>
		/* bit to page */
		((PAGE_SHIFT + 3) -
		/* al extent number to bit */
		 (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT));
}
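
/*
 * The shift arithmetic above, worked through (assuming 4k pages, 4 MiB AL
 * extents and the 4k bitmap block granularity the #if above checks for):
 * one bitmap page holds PAGE_SIZE * 8 = 2^(12+3) bits; one AL extent
 * covers 2^(22-12) = 1024 bitmap bits; so each bitmap page covers
 * 2^(15-10) = 32 AL extents, and al_enr >> 5 is the bitmap page index.
 */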

static sector_t al_tr_number_to_on_disk_sector(struct drbd_device *device)
{
	const unsigned int stripes = device->ldev->md.al_stripes;
	const unsigned int stripe_size_4kB = device->ldev->md.al_stripe_size_4k;

	/* transaction number, modulo on-disk ring buffer wrap around */
	unsigned int t = device->al_tr_number % (device->ldev->md.al_size_4k);

	/* ... to aligned 4k on disk block */
	t = ((t % stripes) * stripe_size_4kB) + t/stripes;

	/* ... to 512 byte sector in activity log */
	t *= 8;

	/* ... plus offset to the on disk position */
	return device->ldev->md.md_offset + device->ldev->md.al_offset + t;
}
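
/*
 * Example of the striping math above (hypothetical geometry, purely for
 * illustration): with al_stripes = 2 and al_stripe_size_4k = 4, the ring
 * holds al_size_4k = 8 transactions, and transaction numbers 0..7 map to
 * the 4k blocks 0, 4, 1, 5, 2, 6, 3, 7 -- consecutive transactions
 * alternate between the stripes instead of hammering one on-disk spot.
 */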

static int __al_write_transaction(struct drbd_device *device, struct al_transaction_on_disk *buffer)
{
	struct lc_element *e;
	sector_t sector;
	int i, mx;
	unsigned extent_nr;
	unsigned crc = 0;
	int err = 0;

	memset(buffer, 0, sizeof(*buffer));
	buffer->magic = cpu_to_be32(DRBD_AL_MAGIC);
	buffer->tr_number = cpu_to_be32(device->al_tr_number);

	i = 0;

	/* Even though no one can start to change this list
	 * once we set the LC_LOCKED -- from drbd_al_begin_io(),
	 * lc_try_lock_for_transaction() --, someone may still
	 * be in the process of changing it. */
	spin_lock_irq(&device->al_lock);
	list_for_each_entry(e, &device->act_log->to_be_changed, list) {
		if (i == AL_UPDATES_PER_TRANSACTION) {
			i++;
			break;
		}
		buffer->update_slot_nr[i] = cpu_to_be16(e->lc_index);
		buffer->update_extent_nr[i] = cpu_to_be32(e->lc_new_number);
		if (e->lc_number != LC_FREE)
			drbd_bm_mark_for_writeout(device,
					al_extent_to_bm_page(e->lc_number));
		i++;
	}
	spin_unlock_irq(&device->al_lock);
	BUG_ON(i > AL_UPDATES_PER_TRANSACTION);

	buffer->n_updates = cpu_to_be16(i);
	for ( ; i < AL_UPDATES_PER_TRANSACTION; i++) {
		buffer->update_slot_nr[i] = cpu_to_be16(-1);
		buffer->update_extent_nr[i] = cpu_to_be32(LC_FREE);
	}

	buffer->context_size = cpu_to_be16(device->act_log->nr_elements);
	buffer->context_start_slot_nr = cpu_to_be16(device->al_tr_cycle);

	mx = min_t(int, AL_CONTEXT_PER_TRANSACTION,
		   device->act_log->nr_elements - device->al_tr_cycle);
	for (i = 0; i < mx; i++) {
		unsigned idx = device->al_tr_cycle + i;
		extent_nr = lc_element_by_index(device->act_log, idx)->lc_number;
		buffer->context[i] = cpu_to_be32(extent_nr);
	}
	for (; i < AL_CONTEXT_PER_TRANSACTION; i++)
		buffer->context[i] = cpu_to_be32(LC_FREE);

	device->al_tr_cycle += AL_CONTEXT_PER_TRANSACTION;
	if (device->al_tr_cycle >= device->act_log->nr_elements)
		device->al_tr_cycle = 0;

	sector = al_tr_number_to_on_disk_sector(device);

	crc = crc32c(0, buffer, 4096);
	buffer->crc32c = cpu_to_be32(crc);

	if (drbd_bm_write_hinted(device))
		err = -EIO;
	else {
		bool write_al_updates;
		rcu_read_lock();
		write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates;
		rcu_read_unlock();
		if (write_al_updates) {
			if (drbd_md_sync_page_io(device, device->ldev, sector, WRITE)) {
				err = -EIO;
				drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR);
			} else {
				device->al_tr_number++;
				device->al_writ_cnt++;
			}
		}
	}

	return err;
}

static int al_write_transaction(struct drbd_device *device)
{
	struct al_transaction_on_disk *buffer;
	int err;

	if (!get_ldev(device)) {
		drbd_err(device, "disk is %s, cannot start al transaction\n",
			drbd_disk_str(device->state.disk));
		return -EIO;
	}

	/* The bitmap write may have failed, causing a state change. */
	if (device->state.disk < D_INCONSISTENT) {
		drbd_err(device,
			"disk is %s, cannot write al transaction\n",
			drbd_disk_str(device->state.disk));
		put_ldev(device);
		return -EIO;
	}

	/* protects md_io_buffer, al_tr_cycle, ... */
	buffer = drbd_md_get_buffer(device, __func__);
	if (!buffer) {
		drbd_err(device, "disk failed while waiting for md_io buffer\n");
		put_ldev(device);
		return -ENODEV;
	}

	err = __al_write_transaction(device, buffer);

	drbd_md_put_buffer(device);
	put_ldev(device);

	return err;
}
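
/*
 * Note on the checksum written above: the buffer is memset to 0 and
 * crc32c is computed before buffer->crc32c is filled in, so the CRC
 * covers the full 4k block with the crc32c field still zero, matching
 * the on-disk format comment.  A reader validating the ring buffer
 * (a sketch; no such reader lives in this file) would save and zero the
 * crc32c field, recompute crc32c(0, buffer, 4096), compare, check the
 * magic, and among valid blocks pick the highest tr_number as the most
 * recent transaction.
 */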

void drbd_al_begin_io_commit(struct drbd_device *device)
{
	bool locked = false;

	/* Serialize multiple transactions.
	 * This uses test_and_set_bit, memory barrier is implicit.
	 */
	wait_event(device->al_wait,
			device->act_log->pending_changes == 0 ||
			(locked = lc_try_lock_for_transaction(device->act_log)));

	if (locked) {
		/* Double check: it may have been committed by someone else,
		 * while we have been waiting for the lock. */
		if (device->act_log->pending_changes) {
			bool write_al_updates;

			rcu_read_lock();
			write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates;
			rcu_read_unlock();

			if (write_al_updates)
				al_write_transaction(device);
			spin_lock_irq(&device->al_lock);
			/* FIXME
			if (err)
				we need an "lc_cancel" here;
			*/
			lc_committed(device->act_log);
			spin_unlock_irq(&device->al_lock);
		}
		lc_unlock(device->act_log);
		wake_up(&device->al_wait);
	}
}

/*
 * drbd_al_begin_io() - Activates the extents covered by @i, then commits
 * any pending activity log transaction.
 */
void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i)
{
	if (drbd_al_begin_io_prepare(device, i))
		drbd_al_begin_io_commit(device);
}

int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *i)
{
	struct lru_cache *al = device->act_log;
	/* for bios crossing activity log extent boundaries,
	 * we may need to activate two extents in one go */
	unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
	unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
	unsigned nr_al_extents;
	unsigned available_update_slots;
	unsigned enr;

	D_ASSERT(device, first <= last);

	nr_al_extents = 1 + last - first; /* worst case: all touched extents are cold. */
	available_update_slots = min(al->nr_elements - al->used,
				al->max_pending_changes - al->pending_changes);

	/* We want all necessary updates for a given request within the same transaction.
	 * We could first check how many updates are *actually* needed,
	 * and use that instead of the worst-case nr_al_extents */
	if (available_update_slots < nr_al_extents) {
		/* Too many activity log extents are currently "hot".
		 *
		 * If we have accumulated pending changes already,
		 * we made progress.
		 *
		 * If we cannot get even a single pending change through,
		 * stop the fast path until we made some progress,
		 * or requests to "cold" extents could be starved. */
		if (!al->pending_changes)
			__set_bit(__LC_STARVING, &device->act_log->flags);
		return -ENOBUFS;
	}

	/* Is resync active in this area? */
	for (enr = first; enr <= last; enr++) {
		struct lc_element *tmp;
		tmp = lc_find(device->resync, enr/AL_EXT_PER_BM_SECT);
		if (unlikely(tmp != NULL)) {
			struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
			if (test_bit(BME_NO_WRITES, &bm_ext->flags)) {
				if (!test_and_set_bit(BME_PRIORITY, &bm_ext->flags))
					return -EBUSY;
				return -EWOULDBLOCK;
			}
		}
	}

	/* Check out the refcounts.
	 * Given that we checked for available elements and update slots above,
	 * this has to be successful. */
	for (enr = first; enr <= last; enr++) {
		struct lc_element *al_ext;
		al_ext = lc_get_cumulative(device->act_log, enr);
		if (!al_ext)
			drbd_info(device, "LOGIC BUG for enr=%u\n", enr);
	}
	return 0;
}

void drbd_al_complete_io(struct drbd_device *device, struct drbd_interval *i)
{
	/* for bios crossing activity log extent boundaries,
	 * we may need to activate two extents in one go */
	unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
	unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
	unsigned enr;
	struct lc_element *extent;
	unsigned long flags;

	D_ASSERT(device, first <= last);
	spin_lock_irqsave(&device->al_lock, flags);

	for (enr = first; enr <= last; enr++) {
		extent = lc_find(device->act_log, enr);
		if (!extent) {
			drbd_err(device, "al_complete_io() called on inactive extent %u\n", enr);
			continue;
		}
		lc_put(device->act_log, extent);
	}
	spin_unlock_irqrestore(&device->al_lock, flags);
	wake_up(&device->al_wait);
}

static int _try_lc_del(struct drbd_device *device, struct lc_element *al_ext)
{
	int rv;

	spin_lock_irq(&device->al_lock);
	rv = (al_ext->refcnt == 0);
	if (likely(rv))
		lc_del(device->act_log, al_ext);
	spin_unlock_irq(&device->al_lock);

	return rv;
}

/**
 * drbd_al_shrink() - Removes all active extents from the activity log
 * @device:	DRBD device.
 *
 * Removes all active extents from the activity log, waiting until
 * the reference count of each entry dropped to 0 first, of course.
 *
 * You need to lock device->act_log with lc_try_lock() / lc_unlock()
 */
void drbd_al_shrink(struct drbd_device *device)
{
	struct lc_element *al_ext;
	int i;

	D_ASSERT(device, test_bit(__LC_LOCKED, &device->act_log->flags));

	for (i = 0; i < device->act_log->nr_elements; i++) {
		al_ext = lc_element_by_index(device->act_log, i);
		if (al_ext->lc_number == LC_FREE)
			continue;
		wait_event(device->al_wait, _try_lc_del(device, al_ext));
	}

	wake_up(&device->al_wait);
}

int drbd_al_initialize(struct drbd_device *device, void *buffer)
{
	struct al_transaction_on_disk *al = buffer;
	struct drbd_md *md = &device->ldev->md;
	int al_size_4k = md->al_stripes * md->al_stripe_size_4k;
	int i;

	__al_write_transaction(device, al);
	/* There may or may not have been a pending transaction. */
	spin_lock_irq(&device->al_lock);
	lc_committed(device->act_log);
	spin_unlock_irq(&device->al_lock);

	/* The rest of the transactions will have an empty "updates" list, and
	 * are written out only to provide the context, and to initialize the
	 * on-disk ring buffer. */
	for (i = 1; i < al_size_4k; i++) {
		int err = __al_write_transaction(device, al);
		if (err)
			return err;
	}
	return 0;
}

static const char *drbd_change_sync_fname[] = {
	[RECORD_RS_FAILED] = "drbd_rs_failed_io",
	[SET_IN_SYNC] = "drbd_set_in_sync",
	[SET_OUT_OF_SYNC] = "drbd_set_out_of_sync"
};

/* ATTENTION. The AL's extents are 4MB each, while the extents in the
 * resync LRU-cache are 16MB each.
 * The caller of this function has to hold a get_ldev() reference.
 *
 * Adjusts the caching members ->rs_left (success) or ->rs_failed (!success),
 * potentially pulling in (and recounting the corresponding bits)
 * this resync extent into the resync extent lru cache.
 *
 * Returns whether all bits have been cleared for this resync extent,
 * precisely: (rs_left <= rs_failed)
 *
 * TODO will be obsoleted once we have a caching lru of the on disk bitmap
 */
static bool update_rs_extent(struct drbd_device *device,
		unsigned int enr, int count,
		enum update_sync_bits_mode mode)
{
	struct lc_element *e;

	D_ASSERT(device, atomic_read(&device->local_cnt));

	/* When setting out-of-sync bits,
	 * we don't need it cached (lc_find).
	 * But if it is present in the cache,
	 * we should update the cached bit count.
	 * Otherwise, that extent should be in the resync extent lru cache
	 * already -- or we want to pull it in if necessary -- (lc_get),
	 * then update and check rs_left and rs_failed. */
	if (mode == SET_OUT_OF_SYNC)
		e = lc_find(device->resync, enr);
	else
		e = lc_get(device->resync, enr);
	if (e) {
		struct bm_extent *ext = lc_entry(e, struct bm_extent, lce);
		if (ext->lce.lc_number == enr) {
			if (mode == SET_IN_SYNC)
				ext->rs_left -= count;
			else if (mode == SET_OUT_OF_SYNC)
				ext->rs_left += count;
			else
				ext->rs_failed += count;
			if (ext->rs_left < ext->rs_failed) {
				drbd_warn(device, "BAD! enr=%u rs_left=%d "
				    "rs_failed=%d count=%d cstate=%s\n",
				     ext->lce.lc_number, ext->rs_left,
				     ext->rs_failed, count,
				     drbd_conn_str(device->state.conn));

				/* We don't expect to be able to clear more bits
				 * than have been set when we originally counted
				 * the set bits to cache that value in ext->rs_left.
				 * Whatever the reason (disconnect during resync,
				 * delayed local completion of an application write),
				 * try to fix it up by recounting here. */
				ext->rs_left = drbd_bm_e_weight(device, enr);
			}
		} else {
			/* Normally this element should be in the cache,
			 * since drbd_rs_begin_io() pulled it already in.
			 *
			 * But maybe an application write finished, and we set
			 * something outside the resync lru_cache in sync.
			 */
			int rs_left = drbd_bm_e_weight(device, enr);
			if (ext->flags != 0) {
				drbd_warn(device, "changing resync lce: %d[%u;%02lx]"
				     " -> %d[%u;00]\n",
				     ext->lce.lc_number, ext->rs_left,
				     ext->flags, enr, rs_left);
				ext->flags = 0;
			}
			if (ext->rs_failed) {
				drbd_warn(device, "Kicking resync_lru element enr=%u "
				     "out with rs_failed=%d\n",
				     ext->lce.lc_number, ext->rs_failed);
			}
			ext->rs_left = rs_left;
			ext->rs_failed = (mode == RECORD_RS_FAILED) ? count : 0;
			/* we don't keep a persistent log of the resync lru,
			 * we can commit any change right away. */
			lc_committed(device->resync);
		}
		if (mode != SET_OUT_OF_SYNC)
			lc_put(device->resync, &ext->lce);
		/* no race, we are within the al_lock! */

		if (ext->rs_left <= ext->rs_failed) {
			ext->rs_failed = 0;
			return true;
		}
	} else if (mode != SET_OUT_OF_SYNC) {
		/* be quiet if lc_find() did not find it. */
		drbd_err(device, "lc_get() failed! locked=%d/%d flags=%lu\n",
		    device->resync_locked,
		    device->resync->nr_elements,
		    device->resync->flags);
	}
	return false;
}

void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go)
{
	unsigned long now = jiffies;
	unsigned long last = device->rs_mark_time[device->rs_last_mark];
	int next = (device->rs_last_mark + 1) % DRBD_SYNC_MARKS;
	if (time_after_eq(now, last + DRBD_SYNC_MARK_STEP)) {
		if (device->rs_mark_left[device->rs_last_mark] != still_to_go &&
		    device->state.conn != C_PAUSED_SYNC_T &&
		    device->state.conn != C_PAUSED_SYNC_S) {
			device->rs_mark_time[next] = now;
			device->rs_mark_left[next] = still_to_go;
			device->rs_last_mark = next;
		}
	}
}

/* It is called lazy update, so don't do write-out too often. */
static bool lazy_bitmap_update_due(struct drbd_device *device)
{
	return time_after(jiffies, device->rs_last_bcast + 2*HZ);
}

static void maybe_schedule_on_disk_bitmap_update(struct drbd_device *device, bool rs_done)
{
	if (rs_done)
		set_bit(RS_DONE, &device->flags);
		/* and also set RS_PROGRESS below */
	else if (!lazy_bitmap_update_due(device))
		return;

	drbd_device_post_work(device, RS_PROGRESS);
}

static int update_sync_bits(struct drbd_device *device,
		unsigned long sbnr, unsigned long ebnr,
		enum update_sync_bits_mode mode)
{
	/*
	 * We keep a count of set bits per resync-extent in the ->rs_left
	 * caching member, so we need to loop and work within the resync extent
	 * alignment.  Typically this loop will execute exactly once.
	 */
	unsigned long flags;
	unsigned long count = 0;
	unsigned int cleared = 0;
	while (sbnr <= ebnr) {
		/* set temporary boundary bit number to last bit number within
		 * the resync extent of the current start bit number,
		 * but cap at provided end bit number */
		unsigned long tbnr = min(ebnr, sbnr | BM_BLOCKS_PER_BM_EXT_MASK);
		unsigned long c;

		if (mode == RECORD_RS_FAILED)
			/* Only called from drbd_rs_failed_io(), bits
			 * supposedly still set.  Recount, maybe some
			 * of the bits have been successfully cleared
			 * by application IO meanwhile.
			 */
			c = drbd_bm_count_bits(device, sbnr, tbnr);
		else if (mode == SET_IN_SYNC)
			c = drbd_bm_clear_bits(device, sbnr, tbnr);
		else /* if (mode == SET_OUT_OF_SYNC) */
			c = drbd_bm_set_bits(device, sbnr, tbnr);

		if (c) {
			spin_lock_irqsave(&device->al_lock, flags);
			cleared += update_rs_extent(device, BM_BIT_TO_EXT(sbnr), c, mode);
			spin_unlock_irqrestore(&device->al_lock, flags);
			count += c;
		}
		sbnr = tbnr + 1;
	}
	if (count) {
		if (mode == SET_IN_SYNC) {
			unsigned long still_to_go = drbd_bm_total_weight(device);
			bool rs_is_done = (still_to_go <= device->rs_failed);
			drbd_advance_rs_marks(device, still_to_go);
			if (cleared || rs_is_done)
				maybe_schedule_on_disk_bitmap_update(device, rs_is_done);
		} else if (mode == RECORD_RS_FAILED)
			device->rs_failed += count;
		wake_up(&device->al_wait);
	}
	return count;
}
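
/*
 * Illustration of the tbnr capping in the loop above (assuming 16 MiB
 * resync extents with 4 KiB bitmap blocks, i.e. 4096 bits per extent and
 * BM_BLOCKS_PER_BM_EXT_MASK == 0xfff): for sbnr = 4000 and ebnr = 4200,
 * the first pass works on bits 4000..4095 (4000 | 0xfff = 4095, the last
 * bit of extent 0) and the second on bits 4096..4200 within extent 1,
 * keeping each update_rs_extent() call confined to one resync extent.
 */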

/* clear the bit corresponding to the piece of storage in question:
 * size bytes of data starting from sector.  Only clear the bits of the
 * affected one or more _aligned_ BM_BLOCK_SIZE blocks.
 *
 * called by worker on C_SYNC_TARGET and receiver on SyncSource.
 *
 */
int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size,
		       enum update_sync_bits_mode mode)
{
	/* Is called from worker and receiver context _only_ */
	unsigned long sbnr, ebnr, lbnr;
	unsigned long count = 0;
	sector_t esector, nr_sectors;

	/* This would be an empty REQ_FLUSH, be silent. */
	if ((mode == SET_OUT_OF_SYNC) && size == 0)
		return 0;

	if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) {
		drbd_err(device, "%s: sector=%llus size=%d nonsense!\n",
				drbd_change_sync_fname[mode],
				(unsigned long long)sector, size);
		return 0;
	}

	if (!get_ldev(device))
		return 0; /* no disk, no metadata, no bitmap to manipulate bits in */

	nr_sectors = drbd_get_capacity(device->this_bdev);
	esector = sector + (size >> 9) - 1;

	if (!expect(sector < nr_sectors))
		goto out;
	if (!expect(esector < nr_sectors))
		esector = nr_sectors - 1;

	lbnr = BM_SECT_TO_BIT(nr_sectors-1);

	if (mode == SET_IN_SYNC) {
		/* Round up start sector, round down end sector.  We make sure
		 * we only clear full, aligned, BM_BLOCK_SIZE blocks. */
		if (unlikely(esector < BM_SECT_PER_BIT-1))
			goto out;
		if (unlikely(esector == (nr_sectors-1)))
			ebnr = lbnr;
		else
			ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
		sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
	} else {
		/* We set it out of sync, or record resync failure.
		 * Should not round anything here. */
		sbnr = BM_SECT_TO_BIT(sector);
		ebnr = BM_SECT_TO_BIT(esector);
	}

	count = update_sync_bits(device, sbnr, ebnr, mode);
out:
	put_ldev(device);
	return count;
}
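
/*
 * Rounding example for the SET_IN_SYNC branch above (assuming 4 KiB bitmap
 * blocks, i.e. BM_SECT_PER_BIT == 8): a request covering sectors 10..29
 * may only mark fully covered 4k blocks in sync, so
 * sbnr = BM_SECT_TO_BIT(10 + 7) = 2 (first block starting at or after
 * sector 10) and ebnr = BM_SECT_TO_BIT(29 - 7) = 2 (last block ending at
 * or before sector 29): only the bit for block 2 (sectors 16..23) is
 * cleared; the partially covered blocks at either end stay out of sync.
 */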
953 */ 954 int drbd_rs_begin_io(struct drbd_device *device, sector_t sector) 955 { 956 unsigned int enr = BM_SECT_TO_EXT(sector); 957 struct bm_extent *bm_ext; 958 int i, sig; 959 bool sa; 960 961 retry: 962 sig = wait_event_interruptible(device->al_wait, 963 (bm_ext = _bme_get(device, enr))); 964 if (sig) 965 return -EINTR; 966 967 if (test_bit(BME_LOCKED, &bm_ext->flags)) 968 return 0; 969 970 /* step aside only while we are above c-min-rate; unless disabled. */ 971 sa = drbd_rs_c_min_rate_throttle(device); 972 973 for (i = 0; i < AL_EXT_PER_BM_SECT; i++) { 974 sig = wait_event_interruptible(device->al_wait, 975 !_is_in_al(device, enr * AL_EXT_PER_BM_SECT + i) || 976 (sa && test_bit(BME_PRIORITY, &bm_ext->flags))); 977 978 if (sig || (sa && test_bit(BME_PRIORITY, &bm_ext->flags))) { 979 spin_lock_irq(&device->al_lock); 980 if (lc_put(device->resync, &bm_ext->lce) == 0) { 981 bm_ext->flags = 0; /* clears BME_NO_WRITES and eventually BME_PRIORITY */ 982 device->resync_locked--; 983 wake_up(&device->al_wait); 984 } 985 spin_unlock_irq(&device->al_lock); 986 if (sig) 987 return -EINTR; 988 if (schedule_timeout_interruptible(HZ/10)) 989 return -EINTR; 990 goto retry; 991 } 992 } 993 set_bit(BME_LOCKED, &bm_ext->flags); 994 return 0; 995 } 996 997 /** 998 * drbd_try_rs_begin_io() - Gets an extent in the resync LRU cache, does not sleep 999 * @device: DRBD device. 1000 * @sector: The sector number. 1001 * 1002 * Gets an extent in the resync LRU cache, sets it to BME_NO_WRITES, then 1003 * tries to set it to BME_LOCKED. Returns 0 upon success, and -EAGAIN 1004 * if there is still application IO going on in this area. 1005 */ 1006 int drbd_try_rs_begin_io(struct drbd_device *device, sector_t sector) 1007 { 1008 unsigned int enr = BM_SECT_TO_EXT(sector); 1009 const unsigned int al_enr = enr*AL_EXT_PER_BM_SECT; 1010 struct lc_element *e; 1011 struct bm_extent *bm_ext; 1012 int i; 1013 bool throttle = drbd_rs_should_slow_down(device, sector, true); 1014 1015 /* If we need to throttle, a half-locked (only marked BME_NO_WRITES, 1016 * not yet BME_LOCKED) extent needs to be kicked out explicitly if we 1017 * need to throttle. There is at most one such half-locked extent, 1018 * which is remembered in resync_wenr. */ 1019 1020 if (throttle && device->resync_wenr != enr) 1021 return -EAGAIN; 1022 1023 spin_lock_irq(&device->al_lock); 1024 if (device->resync_wenr != LC_FREE && device->resync_wenr != enr) { 1025 /* in case you have very heavy scattered io, it may 1026 * stall the syncer undefined if we give up the ref count 1027 * when we try again and requeue. 1028 * 1029 * if we don't give up the refcount, but the next time 1030 * we are scheduled this extent has been "synced" by new 1031 * application writes, we'd miss the lc_put on the 1032 * extent we keep the refcount on. 1033 * so we remembered which extent we had to try again, and 1034 * if the next requested one is something else, we do 1035 * the lc_put here... 1036 * we also have to wake_up 1037 */ 1038 e = lc_find(device->resync, device->resync_wenr); 1039 bm_ext = e ? 

/**
 * drbd_try_rs_begin_io() - Gets an extent in the resync LRU cache, does not sleep
 * @device:	DRBD device.
 * @sector:	The sector number.
 *
 * Gets an extent in the resync LRU cache, sets it to BME_NO_WRITES, then
 * tries to set it to BME_LOCKED.  Returns 0 upon success, and -EAGAIN
 * if there is still application IO going on in this area.
 */
int drbd_try_rs_begin_io(struct drbd_device *device, sector_t sector)
{
	unsigned int enr = BM_SECT_TO_EXT(sector);
	const unsigned int al_enr = enr*AL_EXT_PER_BM_SECT;
	struct lc_element *e;
	struct bm_extent *bm_ext;
	int i;
	bool throttle = drbd_rs_should_slow_down(device, sector, true);

	/* If we need to throttle, a half-locked (only marked BME_NO_WRITES,
	 * not yet BME_LOCKED) extent needs to be kicked out explicitly if we
	 * need to throttle.  There is at most one such half-locked extent,
	 * which is remembered in resync_wenr. */

	if (throttle && device->resync_wenr != enr)
		return -EAGAIN;

	spin_lock_irq(&device->al_lock);
	if (device->resync_wenr != LC_FREE && device->resync_wenr != enr) {
		/* in case you have very heavy scattered io, it may
		 * stall the syncer indefinitely if we give up the ref count
		 * when we try again and requeue.
		 *
		 * if we don't give up the refcount, but the next time
		 * we are scheduled this extent has been "synced" by new
		 * application writes, we'd miss the lc_put on the
		 * extent we keep the refcount on.
		 * so we remembered which extent we had to try again, and
		 * if the next requested one is something else, we do
		 * the lc_put here...
		 * we also have to wake_up
		 */
		e = lc_find(device->resync, device->resync_wenr);
		bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
		if (bm_ext) {
			D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags));
			D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags));
			clear_bit(BME_NO_WRITES, &bm_ext->flags);
			device->resync_wenr = LC_FREE;
			if (lc_put(device->resync, &bm_ext->lce) == 0) {
				bm_ext->flags = 0;
				device->resync_locked--;
			}
			wake_up(&device->al_wait);
		} else {
			drbd_alert(device, "LOGIC BUG\n");
		}
	}
	/* TRY. */
	e = lc_try_get(device->resync, enr);
	bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
	if (bm_ext) {
		if (test_bit(BME_LOCKED, &bm_ext->flags))
			goto proceed;
		if (!test_and_set_bit(BME_NO_WRITES, &bm_ext->flags)) {
			device->resync_locked++;
		} else {
			/* we did set the BME_NO_WRITES,
			 * but then could not set BME_LOCKED,
			 * so we tried again.
			 * drop the extra reference. */
			bm_ext->lce.refcnt--;
			D_ASSERT(device, bm_ext->lce.refcnt > 0);
		}
		goto check_al;
	} else {
		/* do we rather want to try later? */
		if (device->resync_locked > device->resync->nr_elements-3)
			goto try_again;
		/* Do or do not. There is no try. -- Yoda */
		e = lc_get(device->resync, enr);
		bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
		if (!bm_ext) {
			const unsigned long rs_flags = device->resync->flags;
			if (rs_flags & LC_STARVING)
				drbd_warn(device, "Have to wait for element"
				     " (resync LRU too small?)\n");
			BUG_ON(rs_flags & LC_LOCKED);
			goto try_again;
		}
		if (bm_ext->lce.lc_number != enr) {
			bm_ext->rs_left = drbd_bm_e_weight(device, enr);
			bm_ext->rs_failed = 0;
			lc_committed(device->resync);
			wake_up(&device->al_wait);
			D_ASSERT(device, test_bit(BME_LOCKED, &bm_ext->flags) == 0);
		}
		set_bit(BME_NO_WRITES, &bm_ext->flags);
		D_ASSERT(device, bm_ext->lce.refcnt == 1);
		device->resync_locked++;
		goto check_al;
	}
check_al:
	for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
		if (lc_is_used(device->act_log, al_enr+i))
			goto try_again;
	}
	set_bit(BME_LOCKED, &bm_ext->flags);
proceed:
	device->resync_wenr = LC_FREE;
	spin_unlock_irq(&device->al_lock);
	return 0;

try_again:
	if (bm_ext) {
		if (throttle) {
			D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags));
			D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags));
			clear_bit(BME_NO_WRITES, &bm_ext->flags);
			device->resync_wenr = LC_FREE;
			if (lc_put(device->resync, &bm_ext->lce) == 0) {
				bm_ext->flags = 0;
				device->resync_locked--;
			}
			wake_up(&device->al_wait);
		} else
			device->resync_wenr = enr;
	}
	spin_unlock_irq(&device->al_lock);
	return -EAGAIN;
}
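
/*
 * Lifecycle of the BME_* flags as used above (a summary of this file's
 * conventions, assuming no concurrent forced detach): an extent is first
 * marked BME_NO_WRITES to fence new application writes, then promoted to
 * BME_LOCKED once no overlapping activity log extent is in use;
 * application IO blocked by it may set BME_PRIORITY to ask the resync
 * side to step aside.  All flags are cleared together (bm_ext->flags = 0)
 * when the last reference is dropped via lc_put().
 */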

void drbd_rs_complete_io(struct drbd_device *device, sector_t sector)
{
	unsigned int enr = BM_SECT_TO_EXT(sector);
	struct lc_element *e;
	struct bm_extent *bm_ext;
	unsigned long flags;

	spin_lock_irqsave(&device->al_lock, flags);
	e = lc_find(device->resync, enr);
	bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
	if (!bm_ext) {
		spin_unlock_irqrestore(&device->al_lock, flags);
		if (__ratelimit(&drbd_ratelimit_state))
			drbd_err(device, "drbd_rs_complete_io() called, but extent not found\n");
		return;
	}

	if (bm_ext->lce.refcnt == 0) {
		spin_unlock_irqrestore(&device->al_lock, flags);
		drbd_err(device, "drbd_rs_complete_io(,%llu [=%u]) called, "
		    "but refcnt is 0!?\n",
		    (unsigned long long)sector, enr);
		return;
	}

	if (lc_put(device->resync, &bm_ext->lce) == 0) {
		bm_ext->flags = 0; /* clear BME_LOCKED, BME_NO_WRITES and BME_PRIORITY */
		device->resync_locked--;
		wake_up(&device->al_wait);
	}

	spin_unlock_irqrestore(&device->al_lock, flags);
}

/**
 * drbd_rs_cancel_all() - Removes all extents from the resync LRU (even BME_LOCKED)
 * @device:	DRBD device.
 */
void drbd_rs_cancel_all(struct drbd_device *device)
{
	spin_lock_irq(&device->al_lock);

	if (get_ldev_if_state(device, D_FAILED)) { /* Makes sure ->resync is there. */
		lc_reset(device->resync);
		put_ldev(device);
	}
	device->resync_locked = 0;
	device->resync_wenr = LC_FREE;
	spin_unlock_irq(&device->al_lock);
	wake_up(&device->al_wait);
}

/**
 * drbd_rs_del_all() - Gracefully remove all extents from the resync LRU
 * @device:	DRBD device.
 *
 * Returns 0 upon success, -EAGAIN if at least one reference count was
 * not zero.
 */
int drbd_rs_del_all(struct drbd_device *device)
{
	struct lc_element *e;
	struct bm_extent *bm_ext;
	int i;

	spin_lock_irq(&device->al_lock);

	if (get_ldev_if_state(device, D_FAILED)) {
		/* ok, ->resync is there. */
		for (i = 0; i < device->resync->nr_elements; i++) {
			e = lc_element_by_index(device->resync, i);
			bm_ext = lc_entry(e, struct bm_extent, lce);
			if (bm_ext->lce.lc_number == LC_FREE)
				continue;
			if (bm_ext->lce.lc_number == device->resync_wenr) {
				drbd_info(device, "dropping %u in drbd_rs_del_all, apparently"
				     " got 'synced' by application io\n",
				     device->resync_wenr);
				D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags));
				D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags));
				clear_bit(BME_NO_WRITES, &bm_ext->flags);
				device->resync_wenr = LC_FREE;
				lc_put(device->resync, &bm_ext->lce);
			}
			if (bm_ext->lce.refcnt != 0) {
				drbd_info(device, "Retrying drbd_rs_del_all() later. "
				     "refcnt=%d\n", bm_ext->lce.refcnt);
				put_ldev(device);
				spin_unlock_irq(&device->al_lock);
				return -EAGAIN;
			}
			D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags));
			D_ASSERT(device, !test_bit(BME_NO_WRITES, &bm_ext->flags));
			lc_del(device->resync, &bm_ext->lce);
		}
		D_ASSERT(device, device->resync->used == 0);
		put_ldev(device);
	}
	spin_unlock_irq(&device->al_lock);
	wake_up(&device->al_wait);

	return 0;
}