/*
 * dm-exception-store.c
 *
 * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
 * Copyright (C) 2006 Red Hat GmbH
 *
 * This file is released under the GPL.
 */

#include "dm.h"
#include "dm-snap.h"
#include "dm-io.h"
#include "kcopyd.h"

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>

#define DM_MSG_PREFIX "snapshots"
#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32	/* 16KB */

/*-----------------------------------------------------------------
 * Persistent snapshots: by "persistent" we mean that the snapshot
 * will survive a reboot.
 *---------------------------------------------------------------*/

/*
 * We need to store a record of which parts of the origin have
 * been copied to the snapshot device.  The snapshot code requires
 * that we copy exception chunks to chunk-aligned areas of the COW
 * store, so it makes sense to store the metadata in chunk-sized
 * blocks as well.
 *
 * There is no backward or forward compatibility implemented:
 * snapshots with a different disk version than the kernel will
 * not be usable.  It is expected that "lvcreate" will blank out
 * the start of a fresh COW device before calling the snapshot
 * constructor.
 *
 * The first chunk of the COW device just contains the header.
 * After this there is a chunk filled with exception metadata,
 * followed by as many exception data chunks as that metadata
 * chunk can describe; the pattern then repeats.
 *
 * All on-disk structures are in little-endian format.  The end
 * of the exceptions info is indicated by an exception with a
 * new_chunk of 0, which is invalid since it would point to the
 * header chunk.
 */

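/*
 * Illustration (added; not part of the original comments): with the
 * default 32-sector (16KiB) chunk and 16-byte disk_exception
 * records, one metadata chunk describes 16384 / 16 = 1024 data
 * chunks, so the COW device is laid out as:
 *
 *	chunk 0			header
 *	chunk 1			metadata for area 0
 *	chunks 2-1025		exception data for area 0
 *	chunk 1026		metadata for area 1
 *	chunks 1027-2050	exception data for area 1
 *	...
 *
 * In general the metadata chunk for area 'a' lives at chunk
 * 1 + (exceptions_per_area + 1) * a, which is the formula used by
 * area_io() below.
 */
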
/*
 * Magic for persistent snapshots: "SnAp" - feeble, isn't it.
 */
#define SNAP_MAGIC 0x70416e53

/*
 * The on-disk version of the metadata.
 */
#define SNAPSHOT_DISK_VERSION 1

struct disk_header {
	uint32_t magic;

	/*
	 * Is this snapshot valid?  There is no way of recovering
	 * an invalid snapshot.
	 */
	uint32_t valid;

	/*
	 * Simple, incrementing version.  No backward
	 * compatibility.
	 */
	uint32_t version;

	/* In sectors */
	uint32_t chunk_size;
};

struct disk_exception {
	uint64_t old_chunk;
	uint64_t new_chunk;
};

struct commit_callback {
	void (*callback)(void *, int success);
	void *context;
};

/*
 * The top level structure for a persistent exception store.
 */
struct pstore {
	struct dm_snapshot *snap;	/* up pointer to my snapshot */
	int version;
	int valid;
	uint32_t exceptions_per_area;

	/*
	 * Now that we have an asynchronous kcopyd there is no
	 * need for large chunk sizes, so it won't hurt to have a
	 * whole chunk's worth of metadata in memory at once.
	 */
	void *area;

	/*
	 * Used to keep track of which metadata area the data in
	 * 'area' refers to.
	 */
	uint32_t current_area;

	/*
	 * The next free chunk for an exception.
	 */
	uint32_t next_free;

	/*
	 * The index of the next free exception in the current
	 * metadata area.
	 */
	uint32_t current_committed;

	atomic_t pending_count;
	uint32_t callback_count;
	struct commit_callback *callbacks;
	struct dm_io_client *io_client;
};

static inline unsigned int sectors_to_pages(unsigned int sectors)
{
	return sectors / (PAGE_SIZE >> 9);
}

static int alloc_area(struct pstore *ps)
{
	int r = -ENOMEM;
	size_t len;

	len = ps->snap->chunk_size << SECTOR_SHIFT;

	/*
	 * Allocate the chunk_size block of memory that will hold
	 * a single metadata area.
	 */
	ps->area = vmalloc(len);
	if (!ps->area)
		return r;

	return 0;
}

static void free_area(struct pstore *ps)
{
	vfree(ps->area);
	ps->area = NULL;
}

/*
 * Read or write a chunk-aligned and chunk-sized block of data from
 * a device.  With notify.fn left NULL, dm_io() performs the I/O
 * synchronously.
 */
static int chunk_io(struct pstore *ps, uint32_t chunk, int rw)
{
	struct io_region where = {
		.bdev = ps->snap->cow->bdev,
		.sector = ps->snap->chunk_size * chunk,
		.count = ps->snap->chunk_size,
	};
	struct dm_io_request io_req = {
		.bi_rw = rw,
		.mem.type = DM_IO_VMA,
		.mem.ptr.vma = ps->area,
		.client = ps->io_client,
		.notify.fn = NULL,
	};

	return dm_io(&io_req, 1, &where, NULL);
}

/*
 * Read or write a metadata area, remembering to skip the first
 * chunk, which holds the header.
 */
static int area_io(struct pstore *ps, uint32_t area, int rw)
{
	int r;
	uint32_t chunk;

	/* convert a metadata area index to a chunk index */
	chunk = 1 + ((ps->exceptions_per_area + 1) * area);

	r = chunk_io(ps, chunk, rw);
	if (r)
		return r;

	ps->current_area = area;
	return 0;
}

static int zero_area(struct pstore *ps, uint32_t area)
{
	memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
	return area_io(ps, area, WRITE);
}

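/*
 * Note (added): zeroing an area also lays down the
 * end-of-exceptions marker, since a disk_exception whose new_chunk
 * is 0 is exactly the terminator that insert_exceptions() looks
 * for when the store is next loaded.
 */
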
static int read_header(struct pstore *ps, int *new_snapshot)
{
	int r;
	struct disk_header *dh;
	chunk_t chunk_size;
	int chunk_size_supplied = 1;

	/*
	 * Use default chunk size (or hardsect_size, if larger) if none supplied
	 */
	if (!ps->snap->chunk_size) {
		ps->snap->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS,
		    bdev_hardsect_size(ps->snap->cow->bdev) >> 9);
		ps->snap->chunk_mask = ps->snap->chunk_size - 1;
		ps->snap->chunk_shift = ffs(ps->snap->chunk_size) - 1;
		chunk_size_supplied = 0;
	}

	ps->io_client = dm_io_client_create(sectors_to_pages(ps->snap->chunk_size));
	if (IS_ERR(ps->io_client))
		return PTR_ERR(ps->io_client);

	r = alloc_area(ps);
	if (r)
		return r;

	r = chunk_io(ps, 0, READ);
	if (r)
		goto bad;

	dh = (struct disk_header *) ps->area;

	/* An all-zeroes first chunk means a freshly created COW device. */
	if (le32_to_cpu(dh->magic) == 0) {
		*new_snapshot = 1;
		return 0;
	}

	if (le32_to_cpu(dh->magic) != SNAP_MAGIC) {
		DMWARN("Invalid or corrupt snapshot");
		r = -ENXIO;
		goto bad;
	}

	*new_snapshot = 0;
	ps->valid = le32_to_cpu(dh->valid);
	ps->version = le32_to_cpu(dh->version);
	chunk_size = le32_to_cpu(dh->chunk_size);

	if (!chunk_size_supplied || ps->snap->chunk_size == chunk_size)
		return 0;

	DMWARN("chunk size %llu in device metadata overrides "
	       "table chunk size of %llu.",
	       (unsigned long long)chunk_size,
	       (unsigned long long)ps->snap->chunk_size);

	/* We had a bogus chunk_size.  Fix stuff up. */
	free_area(ps);

	ps->snap->chunk_size = chunk_size;
	ps->snap->chunk_mask = chunk_size - 1;
	ps->snap->chunk_shift = ffs(chunk_size) - 1;

	r = dm_io_client_resize(sectors_to_pages(ps->snap->chunk_size),
				ps->io_client);
	if (r)
		return r;

	r = alloc_area(ps);
	return r;

bad:
	free_area(ps);
	return r;
}

static int write_header(struct pstore *ps)
{
	struct disk_header *dh;

	memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);

	dh = (struct disk_header *) ps->area;
	dh->magic = cpu_to_le32(SNAP_MAGIC);
	dh->valid = cpu_to_le32(ps->valid);
	dh->version = cpu_to_le32(ps->version);
	dh->chunk_size = cpu_to_le32(ps->snap->chunk_size);

	return chunk_io(ps, 0, WRITE);
}

/*
 * Access functions for the disk exceptions, these do the endian conversions.
 */
static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
{
	BUG_ON(index >= ps->exceptions_per_area);

	return ((struct disk_exception *) ps->area) + index;
}

static void read_exception(struct pstore *ps,
			   uint32_t index, struct disk_exception *result)
{
	struct disk_exception *e = get_exception(ps, index);

	/* copy it */
	result->old_chunk = le64_to_cpu(e->old_chunk);
	result->new_chunk = le64_to_cpu(e->new_chunk);
}

static void write_exception(struct pstore *ps,
			    uint32_t index, struct disk_exception *de)
{
	struct disk_exception *e = get_exception(ps, index);

	/* copy it */
	e->old_chunk = cpu_to_le64(de->old_chunk);
	e->new_chunk = cpu_to_le64(de->new_chunk);
}

/*
 * Registers the exceptions that are present in the current area.
 * 'full' is filled in to indicate whether the area has been
 * filled.
 */
static int insert_exceptions(struct pstore *ps, int *full)
{
	int r;
	unsigned int i;
	struct disk_exception de;

	/* presume the area is full */
	*full = 1;

	for (i = 0; i < ps->exceptions_per_area; i++) {
		read_exception(ps, i, &de);

		/*
		 * If new_chunk points at the start of the COW
		 * device, where the header lives, we know we've
		 * hit the end of the exceptions.  Therefore the
		 * area is not full.
		 */
		if (de.new_chunk == 0LL) {
			ps->current_committed = i;
			*full = 0;
			break;
		}

		/*
		 * Keep track of the start of the free chunks.
		 */
		if (ps->next_free <= de.new_chunk)
			ps->next_free = de.new_chunk + 1;

		/*
		 * Otherwise we add the exception to the snapshot.
		 */
		r = dm_add_exception(ps->snap, de.old_chunk, de.new_chunk);
		if (r)
			return r;
	}

	return 0;
}

static int read_exceptions(struct pstore *ps)
{
	uint32_t area;
	int r, full = 1;

	/*
	 * Keep reading chunks and inserting exceptions until
	 * we find a partially full area.
	 */
	for (area = 0; full; area++) {
		r = area_io(ps, area, READ);
		if (r)
			return r;

		r = insert_exceptions(ps, &full);
		if (r)
			return r;
	}

	return 0;
}

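/*
 * Note (added): after a successful scan, ps->current_area is the
 * first area that is not completely full, ps->current_committed is
 * the number of valid entries in it, and ps->next_free points past
 * the highest COW chunk already in use, so allocation resumes
 * exactly where a previous activation left off.
 */
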
static inline struct pstore *get_info(struct exception_store *store)
{
	return (struct pstore *) store->context;
}

static void persistent_fraction_full(struct exception_store *store,
				     sector_t *numerator, sector_t *denominator)
{
	*numerator = get_info(store)->next_free * store->snap->chunk_size;
	*denominator = get_dev_size(store->snap->cow->bdev);
}

static void persistent_destroy(struct exception_store *store)
{
	struct pstore *ps = get_info(store);

	dm_io_client_destroy(ps->io_client);
	vfree(ps->callbacks);
	free_area(ps);
	kfree(ps);
}

static int persistent_read_metadata(struct exception_store *store)
{
	int r, new_snapshot;
	struct pstore *ps = get_info(store);

	/*
	 * Read the snapshot header.
	 */
	r = read_header(ps, &new_snapshot);
	if (r)
		return r;

	/*
	 * Now that we know the correct chunk_size, complete the
	 * initialisation.
	 */
	ps->exceptions_per_area = (ps->snap->chunk_size << SECTOR_SHIFT) /
				  sizeof(struct disk_exception);
	ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
				   sizeof(*ps->callbacks));
	if (!ps->callbacks)
		return -ENOMEM;

	/*
	 * Do we need to set up a new snapshot?
	 */
	if (new_snapshot) {
		r = write_header(ps);
		if (r) {
			DMWARN("write_header failed");
			return r;
		}

		r = zero_area(ps, 0);
		if (r) {
			DMWARN("zero_area(0) failed");
			return r;
		}

	} else {
		/*
		 * Sanity checks.
		 */
		if (!ps->valid) {
			DMWARN("snapshot is marked invalid");
			return -EINVAL;
		}

		if (ps->version != SNAPSHOT_DISK_VERSION) {
			DMWARN("unable to handle snapshot disk version %d",
			       ps->version);
			return -EINVAL;
		}

		/*
		 * Read the metadata.
		 */
		r = read_exceptions(ps);
		if (r)
			return r;
	}

	return 0;
}

static int persistent_prepare(struct exception_store *store,
			      struct exception *e)
{
	struct pstore *ps = get_info(store);
	uint32_t stride;
	sector_t size = get_dev_size(store->snap->cow->bdev);

	/* Is there enough room? */
	if (size < ((ps->next_free + 1) * store->snap->chunk_size))
		return -ENOSPC;

	e->new_chunk = ps->next_free;

	/*
	 * Move on to the next free chunk, skipping over the
	 * metadata chunks: every (exceptions_per_area + 1)th
	 * chunk, starting at chunk 1, holds metadata.
	 */
	stride = (ps->exceptions_per_area + 1);
	if ((++ps->next_free % stride) == 1)
		ps->next_free++;

	atomic_inc(&ps->pending_count);
	return 0;
}

static void persistent_commit(struct exception_store *store,
			      struct exception *e,
			      void (*callback) (void *, int success),
			      void *callback_context)
{
	int r;
	unsigned int i;
	struct pstore *ps = get_info(store);
	struct disk_exception de;
	struct commit_callback *cb;

	de.old_chunk = e->old_chunk;
	de.new_chunk = e->new_chunk;
	write_exception(ps, ps->current_committed++, &de);

	/*
	 * Add the callback to the back of the array.  This code
	 * is the only place where the callback array is
	 * manipulated, and we know that it will never be called
	 * multiple times concurrently.
	 */
	cb = ps->callbacks + ps->callback_count++;
	cb->callback = callback;
	cb->context = callback_context;

	/*
	 * If there are no more exceptions in flight, or we have
	 * filled this metadata area, we commit the exceptions to
	 * disk.
	 */
	if (atomic_dec_and_test(&ps->pending_count) ||
	    (ps->current_committed == ps->exceptions_per_area)) {
		r = area_io(ps, ps->current_area, WRITE);
		if (r)
			ps->valid = 0;

		/*
		 * Have we completely filled the current area?
		 */
		if (ps->current_committed == ps->exceptions_per_area) {
			ps->current_committed = 0;
			r = zero_area(ps, ps->current_area + 1);
			if (r)
				ps->valid = 0;
		}

		for (i = 0; i < ps->callback_count; i++) {
			cb = ps->callbacks + i;
			cb->callback(cb->context, r == 0 ? 1 : 0);
		}

		ps->callback_count = 0;
	}
}

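/*
 * Usage sketch (hypothetical caller, for illustration only; the
 * real caller is the snapshot code in dm-snap.c, and the names
 * 'pe' and 'copy_done' are made up):
 *
 *	if (!store->prepare_exception(store, &pe->e)) {
 *		... have kcopyd copy the origin chunk to e->new_chunk ...
 *		store->commit_exception(store, &pe->e, copy_done, pe);
 *	}
 *
 * copy_done() only runs once the metadata area containing the new
 * record has been written to the COW device; commits may be
 * batched with other in-flight exceptions.
 */
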
static void persistent_drop(struct exception_store *store)
{
	struct pstore *ps = get_info(store);

	ps->valid = 0;
	if (write_header(ps))
		DMWARN("write header failed");
}

int dm_create_persistent(struct exception_store *store)
{
	struct pstore *ps;

	/* allocate the pstore */
	ps = kmalloc(sizeof(*ps), GFP_KERNEL);
	if (!ps)
		return -ENOMEM;

	ps->snap = store->snap;
	ps->valid = 1;
	ps->version = SNAPSHOT_DISK_VERSION;
	ps->area = NULL;
	ps->next_free = 2;	/* skipping the header and first area */
	ps->current_committed = 0;

	ps->callback_count = 0;
	atomic_set(&ps->pending_count, 0);
	ps->callbacks = NULL;

	store->destroy = persistent_destroy;
	store->read_metadata = persistent_read_metadata;
	store->prepare_exception = persistent_prepare;
	store->commit_exception = persistent_commit;
	store->drop_snapshot = persistent_drop;
	store->fraction_full = persistent_fraction_full;
	store->context = ps;

	return 0;
}

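/*
 * Note (added): dm_create_persistent() only fills in the method
 * table and the cheap in-core state.  The expensive work - reading
 * the header, sizing the dm-io client, allocating the area and
 * callback buffers - happens later in ->read_metadata(), once the
 * chunk size is actually known.
 */
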
604 *---------------------------------------------------------------*/ 605 struct transient_c { 606 sector_t next_free; 607 }; 608 609 static void transient_destroy(struct exception_store *store) 610 { 611 kfree(store->context); 612 } 613 614 static int transient_read_metadata(struct exception_store *store) 615 { 616 return 0; 617 } 618 619 static int transient_prepare(struct exception_store *store, struct exception *e) 620 { 621 struct transient_c *tc = (struct transient_c *) store->context; 622 sector_t size = get_dev_size(store->snap->cow->bdev); 623 624 if (size < (tc->next_free + store->snap->chunk_size)) 625 return -1; 626 627 e->new_chunk = sector_to_chunk(store->snap, tc->next_free); 628 tc->next_free += store->snap->chunk_size; 629 630 return 0; 631 } 632 633 static void transient_commit(struct exception_store *store, 634 struct exception *e, 635 void (*callback) (void *, int success), 636 void *callback_context) 637 { 638 /* Just succeed */ 639 callback(callback_context, 1); 640 } 641 642 static void transient_fraction_full(struct exception_store *store, 643 sector_t *numerator, sector_t *denominator) 644 { 645 *numerator = ((struct transient_c *) store->context)->next_free; 646 *denominator = get_dev_size(store->snap->cow->bdev); 647 } 648 649 int dm_create_transient(struct exception_store *store) 650 { 651 struct transient_c *tc; 652 653 store->destroy = transient_destroy; 654 store->read_metadata = transient_read_metadata; 655 store->prepare_exception = transient_prepare; 656 store->commit_exception = transient_commit; 657 store->drop_snapshot = NULL; 658 store->fraction_full = transient_fraction_full; 659 660 tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL); 661 if (!tc) 662 return -ENOMEM; 663 664 tc->next_free = 0; 665 store->context = tc; 666 667 return 0; 668 } 669