/*
 * dm-exception-store.c
 *
 * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
 *
 * This file is released under the GPL.
 */

#include "dm.h"
#include "dm-snap.h"
#include "dm-io.h"
#include "kcopyd.h"

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>

#define DM_MSG_PREFIX "snapshots"

/*-----------------------------------------------------------------
 * Persistent snapshots: by "persistent" we mean that the snapshot
 * will survive a reboot.
 *---------------------------------------------------------------*/

/*
 * We need to store a record of which parts of the origin have
 * been copied to the snapshot device.  The snapshot code
 * requires that we copy exception chunks to chunk aligned areas
 * of the COW store.  It makes sense, therefore, to store the
 * metadata in chunk size blocks.
 *
 * There is no backward or forward compatibility implemented;
 * snapshots with different disk versions than the kernel will
 * not be usable.  It is expected that "lvcreate" will blank out
 * the start of a fresh COW device before calling the snapshot
 * constructor.
 *
 * The first chunk of the COW device just contains the header.
 * After this there is a chunk filled with exception metadata,
 * followed by as many exception chunks as can fit in the
 * metadata areas.
 *
 * All on disk structures are in little-endian format.  The end
 * of the exceptions info is indicated by an exception with a
 * new_chunk of 0, which is invalid since it would point to the
 * header chunk.
 */
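
/*
 * Illustrative layout sketch (these figures are only an example and are
 * not relied upon anywhere in the code).  Suppose the chunk size is 64
 * sectors (32KiB) and struct disk_exception is 16 bytes; then one
 * metadata area indexes (64 << 9) / 16 = 2048 exceptions, and the COW
 * device looks like:
 *
 *	chunk 0		header (struct disk_header)
 *	chunk 1		metadata area 0 (up to 2048 disk exceptions)
 *	chunks 2-2049	exception data referenced by area 0
 *	chunk 2050	metadata area 1
 *	...
 */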

/*
 * Magic for persistent snapshots: "SnAp" - Feeble isn't it.
 */
#define SNAP_MAGIC 0x70416e53

/*
 * The on-disk version of the metadata.
 */
#define SNAPSHOT_DISK_VERSION 1

struct disk_header {
	uint32_t magic;

	/*
	 * Is this snapshot valid?  There is no way of recovering
	 * an invalid snapshot.
	 */
	uint32_t valid;

	/*
	 * Simple, incrementing version.  No backward
	 * compatibility.
	 */
	uint32_t version;

	/* In sectors */
	uint32_t chunk_size;
};

struct disk_exception {
	uint64_t old_chunk;
	uint64_t new_chunk;
};
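
/*
 * Purely as an illustration: a record with old_chunk == 5 and
 * new_chunk == 7 means the data originally in chunk 5 of the origin has
 * been copied to chunk 7 of the COW device.  new_chunk == 0 can never
 * describe a real exception (chunk 0 holds the header), which is why a
 * zero record marks the end of the list.
 */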

struct commit_callback {
	void (*callback)(void *, int success);
	void *context;
};

/*
 * The top level structure for a persistent exception store.
 */
struct pstore {
	struct dm_snapshot *snap;	/* up pointer to my snapshot */
	int version;
	int valid;
	uint32_t exceptions_per_area;

	/*
	 * Now that we have an asynchronous kcopyd there is no
	 * need for large chunk sizes, so it won't hurt to have a
	 * whole chunk's worth of metadata in memory at once.
	 */
	void *area;

	/*
	 * Used to keep track of which metadata area the data in
	 * 'area' refers to.
	 */
	uint32_t current_area;

	/*
	 * The next free chunk for an exception.
	 */
	uint32_t next_free;

	/*
	 * The index of next free exception in the current
	 * metadata area.
	 */
	uint32_t current_committed;

	atomic_t pending_count;
	uint32_t callback_count;
	struct commit_callback *callbacks;
};

static inline unsigned int sectors_to_pages(unsigned int sectors)
{
	return sectors / (PAGE_SIZE >> 9);
}
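
/*
 * For instance, assuming 4KiB pages (PAGE_SIZE >> 9 == 8),
 * sectors_to_pages(64) == 8: a 32KiB chunk needs eight pages of dm-io
 * buffer, which is what dm_create_persistent() reserves via dm_io_get()
 * below.  The page size is architecture dependent, so the numbers are
 * illustrative only.
 */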

static int alloc_area(struct pstore *ps)
{
	int r = -ENOMEM;
	size_t len;

	len = ps->snap->chunk_size << SECTOR_SHIFT;

	/*
	 * Allocate the chunk_size block of memory that will hold
	 * a single metadata area.
	 */
	ps->area = vmalloc(len);
	if (!ps->area)
		return r;

	return 0;
}

static void free_area(struct pstore *ps)
{
	vfree(ps->area);
}

/*
 * Read or write a chunk aligned and sized block of data from a device.
 */
static int chunk_io(struct pstore *ps, uint32_t chunk, int rw)
{
	struct io_region where;
	unsigned long bits;

	where.bdev = ps->snap->cow->bdev;
	where.sector = ps->snap->chunk_size * chunk;
	where.count = ps->snap->chunk_size;

	return dm_io_sync_vm(1, &where, rw, ps->area, &bits);
}

/*
 * Read or write a metadata area, remembering to skip the first
 * chunk, which holds the header.
 */
static int area_io(struct pstore *ps, uint32_t area, int rw)
{
	int r;
	uint32_t chunk;

	/* convert a metadata area index to a chunk index */
	chunk = 1 + ((ps->exceptions_per_area + 1) * area);

	r = chunk_io(ps, chunk, rw);
	if (r)
		return r;

	ps->current_area = area;
	return 0;
}

static int zero_area(struct pstore *ps, uint32_t area)
{
	memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
	return area_io(ps, area, WRITE);
}

static int read_header(struct pstore *ps, int *new_snapshot)
{
	int r;
	struct disk_header *dh;
	chunk_t chunk_size;

	r = chunk_io(ps, 0, READ);
	if (r)
		return r;

	dh = (struct disk_header *) ps->area;

	if (le32_to_cpu(dh->magic) == 0) {
		*new_snapshot = 1;

	} else if (le32_to_cpu(dh->magic) == SNAP_MAGIC) {
		*new_snapshot = 0;
		ps->valid = le32_to_cpu(dh->valid);
		ps->version = le32_to_cpu(dh->version);
		chunk_size = le32_to_cpu(dh->chunk_size);
		if (ps->snap->chunk_size != chunk_size) {
			DMWARN("chunk size %llu in device metadata overrides "
			       "table chunk size of %llu.",
			       (unsigned long long)chunk_size,
			       (unsigned long long)ps->snap->chunk_size);

			/* We had a bogus chunk_size. Fix stuff up. */
			dm_io_put(sectors_to_pages(ps->snap->chunk_size));
			free_area(ps);

			ps->snap->chunk_size = chunk_size;
			ps->snap->chunk_mask = chunk_size - 1;
			ps->snap->chunk_shift = ffs(chunk_size) - 1;

			r = alloc_area(ps);
			if (r)
				return r;

			r = dm_io_get(sectors_to_pages(chunk_size));
			if (r)
				return r;
		}
	} else {
		DMWARN("Invalid/corrupt snapshot");
		r = -ENXIO;
	}

	return r;
}

static int write_header(struct pstore *ps)
{
	struct disk_header *dh;

	memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);

	dh = (struct disk_header *) ps->area;
	dh->magic = cpu_to_le32(SNAP_MAGIC);
	dh->valid = cpu_to_le32(ps->valid);
	dh->version = cpu_to_le32(ps->version);
	dh->chunk_size = cpu_to_le32(ps->snap->chunk_size);

	return chunk_io(ps, 0, WRITE);
}

/*
 * Access functions for the disk exceptions; these do the endian conversions.
 */
static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
{
	if (index >= ps->exceptions_per_area)
		return NULL;

	return ((struct disk_exception *) ps->area) + index;
}

static int read_exception(struct pstore *ps,
			  uint32_t index, struct disk_exception *result)
{
	struct disk_exception *e;

	e = get_exception(ps, index);
	if (!e)
		return -EINVAL;

	/* copy it */
	result->old_chunk = le64_to_cpu(e->old_chunk);
	result->new_chunk = le64_to_cpu(e->new_chunk);

	return 0;
}

static int write_exception(struct pstore *ps,
			   uint32_t index, struct disk_exception *de)
{
	struct disk_exception *e;

	e = get_exception(ps, index);
	if (!e)
		return -EINVAL;

	/* copy it */
	e->old_chunk = cpu_to_le64(de->old_chunk);
	e->new_chunk = cpu_to_le64(de->new_chunk);

	return 0;
}

/*
 * Registers the exceptions that are present in the current area.
 * 'full' is filled in to indicate if the area has been
 * filled.
 */
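/*
 * For instance (figures purely illustrative): if the area just read
 * holds 100 valid records followed by one with new_chunk == 0, the loop
 * below registers those 100 exceptions, sets current_committed to 100
 * and reports *full == 0.  If every slot holds a valid record, *full
 * stays 1 and the caller moves on to the next area.
 */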
static int insert_exceptions(struct pstore *ps, int *full)
{
	int r;
	unsigned int i;
	struct disk_exception de;

	/* presume the area is full */
	*full = 1;

	for (i = 0; i < ps->exceptions_per_area; i++) {
		r = read_exception(ps, i, &de);

		if (r)
			return r;

		/*
		 * If the new_chunk is pointing at the start of
		 * the COW device, where the first metadata area
		 * is, we know that we've hit the end of the
		 * exceptions.  Therefore the area is not full.
		 */
		if (de.new_chunk == 0LL) {
			ps->current_committed = i;
			*full = 0;
			break;
		}

		/*
		 * Keep track of the start of the free chunks.
		 */
		if (ps->next_free <= de.new_chunk)
			ps->next_free = de.new_chunk + 1;

		/*
		 * Otherwise we add the exception to the snapshot.
		 */
		r = dm_add_exception(ps->snap, de.old_chunk, de.new_chunk);
		if (r)
			return r;
	}

	return 0;
}

static int read_exceptions(struct pstore *ps)
{
	uint32_t area;
	int r, full = 1;

	/*
	 * Keep reading chunks and inserting exceptions until
	 * we find a partially full area.
	 */
	for (area = 0; full; area++) {
		r = area_io(ps, area, READ);
		if (r)
			return r;

		r = insert_exceptions(ps, &full);
		if (r)
			return r;
	}

	return 0;
}

static inline struct pstore *get_info(struct exception_store *store)
{
	return (struct pstore *) store->context;
}

static void persistent_fraction_full(struct exception_store *store,
				     sector_t *numerator, sector_t *denominator)
{
	*numerator = get_info(store)->next_free * store->snap->chunk_size;
	*denominator = get_dev_size(store->snap->cow->bdev);
}

static void persistent_destroy(struct exception_store *store)
{
	struct pstore *ps = get_info(store);

	dm_io_put(sectors_to_pages(ps->snap->chunk_size));
	vfree(ps->callbacks);
	free_area(ps);
	kfree(ps);
}

static int persistent_read_metadata(struct exception_store *store)
{
	int r, new_snapshot;
	struct pstore *ps = get_info(store);

	/*
	 * Read the snapshot header.
	 */
	r = read_header(ps, &new_snapshot);
	if (r)
		return r;

	/*
	 * Now we know the correct chunk_size, complete the initialisation.
	 */
	ps->exceptions_per_area = (ps->snap->chunk_size << SECTOR_SHIFT) /
				  sizeof(struct disk_exception);
	ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
			sizeof(*ps->callbacks));
	if (!ps->callbacks)
		return -ENOMEM;

	/*
	 * Do we need to set up a new snapshot?
	 */
	if (new_snapshot) {
		r = write_header(ps);
		if (r) {
			DMWARN("write_header failed");
			return r;
		}

		r = zero_area(ps, 0);
		if (r) {
			DMWARN("zero_area(0) failed");
			return r;
		}

	} else {
		/*
		 * Sanity checks.
		 */
		if (!ps->valid) {
			DMWARN("snapshot is marked invalid");
			return -EINVAL;
		}

		if (ps->version != SNAPSHOT_DISK_VERSION) {
			DMWARN("unable to handle snapshot disk version %d",
			       ps->version);
			return -EINVAL;
		}

		/*
		 * Read the metadata.
		 */
		r = read_exceptions(ps);
		if (r)
			return r;
	}

	return 0;
}

static int persistent_prepare(struct exception_store *store,
			      struct exception *e)
{
	struct pstore *ps = get_info(store);
	uint32_t stride;
	sector_t size = get_dev_size(store->snap->cow->bdev);

	/* Is there enough room ? */
	if (size < ((ps->next_free + 1) * store->snap->chunk_size))
		return -ENOSPC;

	e->new_chunk = ps->next_free;

	/*
	 * Move on to the next free chunk for a pending exception,
	 * making sure to take into account the location of the
	 * metadata chunks.
	 */
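	/*
	 * Worked example (hypothetical figures): with 2048 exceptions per
	 * area the stride below is 2049 and chunks 1, 2050, 4099, ... hold
	 * metadata.  If next_free was 2049, incrementing it gives 2050;
	 * 2050 % 2049 == 1, so it is bumped once more to 2051, the next
	 * data chunk.
	 */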
	stride = (ps->exceptions_per_area + 1);
	if ((++ps->next_free % stride) == 1)
		ps->next_free++;

	atomic_inc(&ps->pending_count);
	return 0;
}

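/*
 * A rough sketch of how the snapshot code is expected to drive this
 * store (the real caller lives in dm-snap.c; this is only an outline):
 *
 *	store->prepare_exception(store, e);
 *		- reserves e->new_chunk and bumps pending_count
 *	... kcopyd copies the origin chunk into e->new_chunk ...
 *	store->commit_exception(store, e, callback, context);
 *		- records the exception and queues the callback; the
 *		  metadata area is only written out (and the callbacks
 *		  run) once pending_count drains to zero or the area
 *		  fills up
 */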
static void persistent_commit(struct exception_store *store,
			      struct exception *e,
			      void (*callback) (void *, int success),
			      void *callback_context)
{
	int r;
	unsigned int i;
	struct pstore *ps = get_info(store);
	struct disk_exception de;
	struct commit_callback *cb;

	de.old_chunk = e->old_chunk;
	de.new_chunk = e->new_chunk;
	write_exception(ps, ps->current_committed++, &de);

	/*
	 * Add the callback to the back of the array.  This code
	 * is the only place where the callback array is
	 * manipulated, and we know that it will never be called
	 * multiple times concurrently.
	 */
	cb = ps->callbacks + ps->callback_count++;
	cb->callback = callback;
	cb->context = callback_context;

	/*
	 * If there are no more exceptions in flight, or we have
	 * filled this metadata area, we commit the exceptions to
	 * disk.
	 */
	if (atomic_dec_and_test(&ps->pending_count) ||
	    (ps->current_committed == ps->exceptions_per_area)) {
		r = area_io(ps, ps->current_area, WRITE);
		if (r)
			ps->valid = 0;

		for (i = 0; i < ps->callback_count; i++) {
			cb = ps->callbacks + i;
			cb->callback(cb->context, r == 0 ? 1 : 0);
		}

		ps->callback_count = 0;
	}

	/*
	 * Have we completely filled the current area ?
	 */
	if (ps->current_committed == ps->exceptions_per_area) {
		ps->current_committed = 0;
		r = zero_area(ps, ps->current_area + 1);
		if (r)
			ps->valid = 0;
	}
}

static void persistent_drop(struct exception_store *store)
{
	struct pstore *ps = get_info(store);

	ps->valid = 0;
	if (write_header(ps))
		DMWARN("write header failed");
}

int dm_create_persistent(struct exception_store *store, uint32_t chunk_size)
{
	int r;
	struct pstore *ps;

	r = dm_io_get(sectors_to_pages(chunk_size));
	if (r)
		return r;

	/* allocate the pstore */
	ps = kmalloc(sizeof(*ps), GFP_KERNEL);
	if (!ps) {
		r = -ENOMEM;
		goto bad;
	}

	ps->snap = store->snap;
	ps->valid = 1;
	ps->version = SNAPSHOT_DISK_VERSION;
	ps->next_free = 2;	/* skipping the header and first area */
	ps->current_committed = 0;

	r = alloc_area(ps);
	if (r)
		goto bad;

	ps->callback_count = 0;
	atomic_set(&ps->pending_count, 0);
	ps->callbacks = NULL;

	store->destroy = persistent_destroy;
	store->read_metadata = persistent_read_metadata;
	store->prepare_exception = persistent_prepare;
	store->commit_exception = persistent_commit;
	store->drop_snapshot = persistent_drop;
	store->fraction_full = persistent_fraction_full;
	store->context = ps;

	return 0;

      bad:
	dm_io_put(sectors_to_pages(chunk_size));
	if (ps && ps->area)
		free_area(ps);
	kfree(ps);
	return r;
}

/*-----------------------------------------------------------------
 * Implementation of the store for non-persistent snapshots.
 *---------------------------------------------------------------*/
struct transient_c {
	sector_t next_free;
};

static void transient_destroy(struct exception_store *store)
{
	kfree(store->context);
}

static int transient_read_metadata(struct exception_store *store)
{
	return 0;
}

static int transient_prepare(struct exception_store *store, struct exception *e)
{
	struct transient_c *tc = (struct transient_c *) store->context;
	sector_t size = get_dev_size(store->snap->cow->bdev);

	if (size < (tc->next_free + store->snap->chunk_size))
		return -1;

	e->new_chunk = sector_to_chunk(store->snap, tc->next_free);
	tc->next_free += store->snap->chunk_size;

	return 0;
}

static void transient_commit(struct exception_store *store,
		      struct exception *e,
		      void (*callback) (void *, int success),
		      void *callback_context)
{
	/* Just succeed */
	callback(callback_context, 1);
}

static void transient_fraction_full(struct exception_store *store,
				    sector_t *numerator, sector_t *denominator)
{
	*numerator = ((struct transient_c *) store->context)->next_free;
	*denominator = get_dev_size(store->snap->cow->bdev);
}

int dm_create_transient(struct exception_store *store,
			struct dm_snapshot *s, int blocksize)
{
	struct transient_c *tc;

	memset(store, 0, sizeof(*store));
	store->destroy = transient_destroy;
	store->read_metadata = transient_read_metadata;
	store->prepare_exception = transient_prepare;
	store->commit_exception = transient_commit;
	store->fraction_full = transient_fraction_full;
	store->snap = s;

	tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL);
	if (!tc)
		return -ENOMEM;

	tc->next_free = 0;
	store->context = tc;

	return 0;
}
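
/*
 * A minimal sketch of how a caller might pick one of these stores.  The
 * real user is the snapshot target in dm-snap.c; the 'persistent' flag,
 * 's' and 'blocksize' names below are only illustrative, and s->store.snap
 * is assumed to have been set to s beforehand:
 *
 *	if (persistent)
 *		r = dm_create_persistent(&s->store, s->chunk_size);
 *	else
 *		r = dm_create_transient(&s->store, s, blocksize);
 *	if (r)
 *		goto bad_store;
 */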
667