/*
 * dm-snapshot.c
 *
 * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
 *
 * This file is released under the GPL.
 */

#include "dm.h"
#include "dm-snap.h"
#include "dm-io.h"
#include "kcopyd.h"

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>

#define DM_MSG_PREFIX "snapshots"

/*-----------------------------------------------------------------
 * Persistent snapshots, by persistent we mean that the snapshot
 * will survive a reboot.
 *---------------------------------------------------------------*/

/*
 * We need to store a record of which parts of the origin have
 * been copied to the snapshot device.  The snapshot code
 * requires that we copy exception chunks to chunk aligned areas
 * of the COW store.  It makes sense, therefore, to store the
 * metadata in chunk size blocks.
 *
 * There is no backward or forward compatibility implemented:
 * snapshots with different disk versions than the kernel will
 * not be usable.  It is expected that "lvcreate" will blank out
 * the start of a fresh COW device before calling the snapshot
 * constructor.
 *
 * The first chunk of the COW device just contains the header.
 * After this there is a chunk filled with exception metadata,
 * followed by as many exception chunks as can fit in the
 * metadata areas.
 *
 * All on disk structures are in little-endian format.  The end
 * of the exceptions info is indicated by an exception with a
 * new_chunk of 0, which is invalid since it would point to the
 * header chunk.
 */

/*
 * Magic for persistent snapshots: "SnAp" - Feeble isn't it.
 */
#define SNAP_MAGIC 0x70416e53

/*
 * The on-disk version of the metadata.
 */
#define SNAPSHOT_DISK_VERSION 1

struct disk_header {
	uint32_t magic;

	/*
	 * Is this snapshot valid?  There is no way of recovering
	 * an invalid snapshot.
	 */
	uint32_t valid;

	/*
	 * Simple, incrementing version.  No backward
	 * compatibility.
	 */
	uint32_t version;

	/* In sectors */
	uint32_t chunk_size;
};

struct disk_exception {
	uint64_t old_chunk;
	uint64_t new_chunk;
};

struct commit_callback {
	void (*callback)(void *, int success);
	void *context;
};

/*
 * The top level structure for a persistent exception store.
 */
struct pstore {
	struct dm_snapshot *snap;	/* up pointer to my snapshot */
	int version;
	int valid;
	uint32_t exceptions_per_area;

	/*
	 * Now that we have an asynchronous kcopyd there is no
	 * need for large chunk sizes, so it won't hurt to have a
	 * whole chunk's worth of metadata in memory at once.
	 */
	void *area;

	/*
	 * Used to keep track of which metadata area the data in
	 * 'area' refers to.
	 */
	uint32_t current_area;

	/*
	 * The next free chunk for an exception.
	 */
	uint32_t next_free;

	/*
	 * The index of the next free exception in the current
	 * metadata area.
	 */
	uint32_t current_committed;

	atomic_t pending_count;
	uint32_t callback_count;
	struct commit_callback *callbacks;
};
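
/*
 * A worked example of the layout described above (illustrative
 * numbers only): with a 64-sector (32KiB) chunk and a 16-byte
 * struct disk_exception, one metadata area indexes
 * 32768 / 16 = 2048 exceptions, so the COW device looks like:
 *
 *	chunk 0			header
 *	chunk 1			metadata for area 0
 *	chunks 2 - 2049		exception data for area 0
 *	chunk 2050		metadata for area 1
 *	chunks 2051 - 4098	exception data for area 1
 *	...
 *
 * i.e. every chunk whose index is congruent to 1 modulo
 * (exceptions_per_area + 1) holds metadata.
 */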

static inline unsigned int sectors_to_pages(unsigned int sectors)
{
	return sectors / (PAGE_SIZE >> 9);
}

static int alloc_area(struct pstore *ps)
{
	int r = -ENOMEM;
	size_t len;

	len = ps->snap->chunk_size << SECTOR_SHIFT;

	/*
	 * Allocate the chunk_size block of memory that will hold
	 * a single metadata area.
	 */
	ps->area = vmalloc(len);
	if (!ps->area)
		return r;

	return 0;
}

static void free_area(struct pstore *ps)
{
	vfree(ps->area);
}

/*
 * Read or write a chunk-aligned and chunk-sized block of data
 * from a device.
 */
static int chunk_io(struct pstore *ps, uint32_t chunk, int rw)
{
	struct io_region where;
	unsigned long bits;

	where.bdev = ps->snap->cow->bdev;
	where.sector = ps->snap->chunk_size * chunk;
	where.count = ps->snap->chunk_size;

	return dm_io_sync_vm(1, &where, rw, ps->area, &bits);
}

/*
 * Read or write a metadata area, remembering to skip the first
 * chunk which holds the header.
 */
static int area_io(struct pstore *ps, uint32_t area, int rw)
{
	int r;
	uint32_t chunk;

	/* convert a metadata area index to a chunk index */
	chunk = 1 + ((ps->exceptions_per_area + 1) * area);

	r = chunk_io(ps, chunk, rw);
	if (r)
		return r;

	ps->current_area = area;
	return 0;
}

static int zero_area(struct pstore *ps, uint32_t area)
{
	memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
	return area_io(ps, area, WRITE);
}

static int read_header(struct pstore *ps, int *new_snapshot)
{
	int r;
	struct disk_header *dh;
	chunk_t chunk_size;

	r = chunk_io(ps, 0, READ);
	if (r)
		return r;

	dh = (struct disk_header *) ps->area;

	if (le32_to_cpu(dh->magic) == 0) {
		*new_snapshot = 1;

	} else if (le32_to_cpu(dh->magic) == SNAP_MAGIC) {
		*new_snapshot = 0;
		ps->valid = le32_to_cpu(dh->valid);
		ps->version = le32_to_cpu(dh->version);
		chunk_size = le32_to_cpu(dh->chunk_size);
		if (ps->snap->chunk_size != chunk_size) {
			DMWARN("chunk size %llu in device metadata overrides "
			       "table chunk size of %llu.",
			       (unsigned long long)chunk_size,
			       (unsigned long long)ps->snap->chunk_size);

			/* We had a bogus chunk_size. Fix stuff up. */
			dm_io_put(sectors_to_pages(ps->snap->chunk_size));
			free_area(ps);

			ps->snap->chunk_size = chunk_size;
			ps->snap->chunk_mask = chunk_size - 1;
			ps->snap->chunk_shift = ffs(chunk_size) - 1;

			r = alloc_area(ps);
			if (r)
				return r;

			r = dm_io_get(sectors_to_pages(chunk_size));
			if (r)
				return r;
		}
	} else {
		DMWARN("Invalid/corrupt snapshot");
		r = -ENXIO;
	}

	return r;
}

static int write_header(struct pstore *ps)
{
	struct disk_header *dh;

	memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);

	dh = (struct disk_header *) ps->area;
	dh->magic = cpu_to_le32(SNAP_MAGIC);
	dh->valid = cpu_to_le32(ps->valid);
	dh->version = cpu_to_le32(ps->version);
	dh->chunk_size = cpu_to_le32(ps->snap->chunk_size);

	return chunk_io(ps, 0, WRITE);
}
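
/*
 * Illustrative only (not used above): the area -> chunk mapping
 * that area_io() computes inline, written out as a helper.  The
 * name area_location() is ours and not part of this file's API;
 * it is a minimal sketch of the arithmetic, nothing more.
 */
static inline uint32_t area_location(struct pstore *ps, uint32_t area)
{
	/* skip the header chunk, then one metadata chunk per earlier area */
	return 1 + ((ps->exceptions_per_area + 1) * area);
}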

/*
 * Access functions for the disk exceptions, these do the endian conversions.
 */
static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
{
	if (index >= ps->exceptions_per_area)
		return NULL;

	return ((struct disk_exception *) ps->area) + index;
}

static int read_exception(struct pstore *ps,
			  uint32_t index, struct disk_exception *result)
{
	struct disk_exception *e;

	e = get_exception(ps, index);
	if (!e)
		return -EINVAL;

	/* copy it */
	result->old_chunk = le64_to_cpu(e->old_chunk);
	result->new_chunk = le64_to_cpu(e->new_chunk);

	return 0;
}

static int write_exception(struct pstore *ps,
			   uint32_t index, struct disk_exception *de)
{
	struct disk_exception *e;

	e = get_exception(ps, index);
	if (!e)
		return -EINVAL;

	/* copy it */
	e->old_chunk = cpu_to_le64(de->old_chunk);
	e->new_chunk = cpu_to_le64(de->new_chunk);

	return 0;
}

/*
 * Registers the exceptions that are present in the current area.
 * 'full' is filled in to indicate whether the area has been
 * filled.
 */
static int insert_exceptions(struct pstore *ps, int *full)
{
	int r;
	unsigned int i;
	struct disk_exception de;

	/* presume the area is full */
	*full = 1;

	for (i = 0; i < ps->exceptions_per_area; i++) {
		r = read_exception(ps, i, &de);
		if (r)
			return r;

		/*
		 * If the new_chunk is pointing at the start of
		 * the COW device, where the header chunk lives,
		 * we know that we've hit the end of the
		 * exceptions.  Therefore the area is not full.
		 */
		if (de.new_chunk == 0LL) {
			ps->current_committed = i;
			*full = 0;
			break;
		}

		/*
		 * Keep track of the start of the free chunks.
		 */
		if (ps->next_free <= de.new_chunk)
			ps->next_free = de.new_chunk + 1;

		/*
		 * Otherwise we add the exception to the snapshot.
		 */
		r = dm_add_exception(ps->snap, de.old_chunk, de.new_chunk);
		if (r)
			return r;
	}

	return 0;
}

static int read_exceptions(struct pstore *ps)
{
	uint32_t area;
	int r, full = 1;

	/*
	 * Keep reading chunks and inserting exceptions until
	 * we find a partially full area.
	 */
	for (area = 0; full; area++) {
		r = area_io(ps, area, READ);
		if (r)
			return r;

		r = insert_exceptions(ps, &full);
		if (r)
			return r;
	}

	return 0;
}
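
/*
 * Worked example for the scan above (illustrative numbers): with
 * 2048 exceptions per area, a snapshot holding 3000 exceptions has
 * area 0 completely full and area 1 holding 952 entries followed
 * by a zeroed entry.  read_exceptions() reads area 0 (full stays
 * 1), then area 1, where insert_exceptions() hits the
 * new_chunk == 0 terminator at index 952, sets
 * current_committed = 952 and full = 0, ending the loop.
 */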

static inline struct pstore *get_info(struct exception_store *store)
{
	return (struct pstore *) store->context;
}

static void persistent_fraction_full(struct exception_store *store,
				     sector_t *numerator, sector_t *denominator)
{
	*numerator = get_info(store)->next_free * store->snap->chunk_size;
	*denominator = get_dev_size(store->snap->cow->bdev);
}

static void persistent_destroy(struct exception_store *store)
{
	struct pstore *ps = get_info(store);

	dm_io_put(sectors_to_pages(ps->snap->chunk_size));
	vfree(ps->callbacks);
	free_area(ps);
	kfree(ps);
}

static int persistent_read_metadata(struct exception_store *store)
{
	int r, new_snapshot;
	struct pstore *ps = get_info(store);

	/*
	 * Read the snapshot header.
	 */
	r = read_header(ps, &new_snapshot);
	if (r)
		return r;

	/*
	 * Now we know the correct chunk_size, complete the initialisation.
	 */
	ps->exceptions_per_area = (ps->snap->chunk_size << SECTOR_SHIFT) /
				  sizeof(struct disk_exception);
	ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
				   sizeof(*ps->callbacks));
	if (!ps->callbacks)
		return -ENOMEM;

	/*
	 * Do we need to set up a new snapshot?
	 */
	if (new_snapshot) {
		r = write_header(ps);
		if (r) {
			DMWARN("write_header failed");
			return r;
		}

		r = zero_area(ps, 0);
		if (r) {
			DMWARN("zero_area(0) failed");
			return r;
		}

	} else {
		/*
		 * Sanity checks.
		 */
		if (!ps->valid) {
			DMWARN("snapshot is marked invalid");
			return -EINVAL;
		}

		if (ps->version != SNAPSHOT_DISK_VERSION) {
			DMWARN("unable to handle snapshot disk version %d",
			       ps->version);
			return -EINVAL;
		}

		/*
		 * Read the metadata.
		 */
		r = read_exceptions(ps);
		if (r)
			return r;
	}

	return 0;
}

static int persistent_prepare(struct exception_store *store,
			      struct exception *e)
{
	struct pstore *ps = get_info(store);
	uint32_t stride;
	sector_t size = get_dev_size(store->snap->cow->bdev);

	/* Is there enough room? */
	if (size < ((ps->next_free + 1) * store->snap->chunk_size))
		return -ENOSPC;

	e->new_chunk = ps->next_free;

	/*
	 * Move on to the next free chunk, skipping over the chunks
	 * that hold metadata (every chunk whose index is congruent
	 * to 1 modulo the stride).
	 */
	stride = (ps->exceptions_per_area + 1);
	if ((++ps->next_free % stride) == 1)
		ps->next_free++;

	atomic_inc(&ps->pending_count);
	return 0;
}

static void persistent_commit(struct exception_store *store,
			      struct exception *e,
			      void (*callback) (void *, int success),
			      void *callback_context)
{
	int r;
	unsigned int i;
	struct pstore *ps = get_info(store);
	struct disk_exception de;
	struct commit_callback *cb;

	de.old_chunk = e->old_chunk;
	de.new_chunk = e->new_chunk;
	write_exception(ps, ps->current_committed++, &de);

	/*
	 * Add the callback to the back of the array.  This code
	 * is the only place where the callback array is
	 * manipulated, and we know that it will never be called
	 * multiple times concurrently.
	 */
	cb = ps->callbacks + ps->callback_count++;
	cb->callback = callback;
	cb->context = callback_context;

	/*
	 * If there are no more exceptions in flight, or we have
	 * filled this metadata area, we commit the exceptions to
	 * disk.
	 */
	if (atomic_dec_and_test(&ps->pending_count) ||
	    (ps->current_committed == ps->exceptions_per_area)) {
		r = area_io(ps, ps->current_area, WRITE);
		if (r)
			ps->valid = 0;

		for (i = 0; i < ps->callback_count; i++) {
			cb = ps->callbacks + i;
			cb->callback(cb->context, r == 0 ? 1 : 0);
		}

		ps->callback_count = 0;
	}

	/*
	 * Have we completely filled the current area?
	 */
	if (ps->current_committed == ps->exceptions_per_area) {
		ps->current_committed = 0;
		r = zero_area(ps, ps->current_area + 1);
		if (r)
			ps->valid = 0;
	}
}
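
/*
 * Illustrative only: the shape of a callback as persistent_commit()
 * invokes it once the metadata area has been written.  The name and
 * body below are ours, not part of this file; a real caller (e.g.
 * the snapshot target) would complete its pending exception here.
 */
static inline void example_commit_callback(void *context, int success)
{
	/* 'success' is 1 if the metadata write hit the disk, 0 otherwise */
	if (!success)
		DMWARN("exception failed to commit");
}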

static void persistent_drop(struct exception_store *store)
{
	struct pstore *ps = get_info(store);

	ps->valid = 0;
	if (write_header(ps))
		DMWARN("write header failed");
}

int dm_create_persistent(struct exception_store *store, uint32_t chunk_size)
{
	int r;
	struct pstore *ps;

	r = dm_io_get(sectors_to_pages(chunk_size));
	if (r)
		return r;

	/* allocate the pstore */
	ps = kmalloc(sizeof(*ps), GFP_KERNEL);
	if (!ps) {
		r = -ENOMEM;
		goto bad;
	}

	ps->snap = store->snap;
	ps->valid = 1;
	ps->version = SNAPSHOT_DISK_VERSION;
	ps->next_free = 2;	/* skipping the header and first area */
	ps->current_committed = 0;

	r = alloc_area(ps);
	if (r)
		goto bad;

	ps->callback_count = 0;
	atomic_set(&ps->pending_count, 0);
	ps->callbacks = NULL;

	store->destroy = persistent_destroy;
	store->read_metadata = persistent_read_metadata;
	store->prepare_exception = persistent_prepare;
	store->commit_exception = persistent_commit;
	store->drop_snapshot = persistent_drop;
	store->fraction_full = persistent_fraction_full;
	store->context = ps;

	return 0;

 bad:
	dm_io_put(sectors_to_pages(chunk_size));
	if (ps && ps->area)
		free_area(ps);
	kfree(ps);
	return r;
}

/*-----------------------------------------------------------------
 * Implementation of the store for non-persistent snapshots.
 *---------------------------------------------------------------*/
struct transient_c {
	sector_t next_free;
};

static void transient_destroy(struct exception_store *store)
{
	kfree(store->context);
}

static int transient_read_metadata(struct exception_store *store)
{
	return 0;
}

static int transient_prepare(struct exception_store *store, struct exception *e)
{
	struct transient_c *tc = (struct transient_c *) store->context;
	sector_t size = get_dev_size(store->snap->cow->bdev);

	if (size < (tc->next_free + store->snap->chunk_size))
		return -ENOSPC;

	e->new_chunk = sector_to_chunk(store->snap, tc->next_free);
	tc->next_free += store->snap->chunk_size;

	return 0;
}

static void transient_commit(struct exception_store *store,
			     struct exception *e,
			     void (*callback) (void *, int success),
			     void *callback_context)
{
	/* Just succeed */
	callback(callback_context, 1);
}

static void transient_fraction_full(struct exception_store *store,
				    sector_t *numerator, sector_t *denominator)
{
	*numerator = ((struct transient_c *) store->context)->next_free;
	*denominator = get_dev_size(store->snap->cow->bdev);
}

int dm_create_transient(struct exception_store *store,
			struct dm_snapshot *s, int blocksize)
{
	struct transient_c *tc;

	memset(store, 0, sizeof(*store));
	store->destroy = transient_destroy;
	store->read_metadata = transient_read_metadata;
	store->prepare_exception = transient_prepare;
	store->commit_exception = transient_commit;
	store->fraction_full = transient_fraction_full;
	store->snap = s;

	tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL);
	if (!tc)
		return -ENOMEM;

	tc->next_free = 0;
	store->context = tc;

	return 0;
}
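
/*
 * Illustrative only: how a caller such as the snapshot target's
 * constructor might pick a store from its "p"/"n" table argument.
 * The function name, the 's->store' member and the argument
 * handling are our assumptions about the caller, not part of this
 * file's API; a minimal sketch, nothing more.
 */
static inline int example_create_store(struct dm_snapshot *s,
				       char persistent, int blocksize)
{
	s->store.snap = s;	/* dm_create_persistent() reads store->snap */

	if (persistent == 'P' || persistent == 'p')
		return dm_create_persistent(&s->store, s->chunk_size);
	else if (persistent == 'N' || persistent == 'n')
		return dm_create_transient(&s->store, s, blocksize);

	return -EINVAL;
}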