xref: /linux/drivers/md/dm-snap.c (revision 87c2ce3b9305b9b723faeedf6e32ef703ec9b33a)
1 /*
2  * dm-snapshot.c
3  *
4  * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
5  *
6  * This file is released under the GPL.
7  */
8 
9 #include <linux/blkdev.h>
10 #include <linux/config.h>
11 #include <linux/ctype.h>
12 #include <linux/device-mapper.h>
13 #include <linux/fs.h>
14 #include <linux/init.h>
15 #include <linux/kdev_t.h>
16 #include <linux/list.h>
17 #include <linux/mempool.h>
18 #include <linux/module.h>
19 #include <linux/slab.h>
20 #include <linux/vmalloc.h>
21 
22 #include "dm-snap.h"
23 #include "dm-bio-list.h"
24 #include "kcopyd.h"
25 
26 /*
27  * The percentage increment we will wake up users at
28  */
29 #define WAKE_UP_PERCENT 5
30 
31 /*
32  * kcopyd priority of snapshot operations
33  */
34 #define SNAPSHOT_COPY_PRIORITY 2
35 
36 /*
37  * Each snapshot reserves this many pages for io
38  */
39 #define SNAPSHOT_PAGES 256
40 
/*
 * An exception whose chunk is still being copied to the COW device.
 * Instances come from pending_pool and are hashed into a snapshot's
 * 'pending' table through the embedded 'e'.
 */
struct pending_exception {
	/*
	 * The embedded exception.  Entries found in the pending table
	 * are converted back to a pending_exception with container_of().
	 * NOTE(review): exit_exception_table() passes the struct
	 * exception pointer straight to kmem_cache_free() for the
	 * pending table as well, so this presumably has to stay the
	 * first member - confirm before reordering.
	 */
	struct exception e;

	/*
	 * Origin buffers waiting for this to complete are held
	 * in a bio list
	 */
	struct bio_list origin_bios;
	/* Writes to the snapshot device waiting for the same copy */
	struct bio_list snapshot_bios;

	/*
	 * Other pending_exceptions that are processing this
	 * chunk.  When this list is empty, we know we can
	 * complete the origins.
	 */
	struct list_head siblings;

	/* Pointer back to snapshot context */
	struct dm_snapshot *snap;

	/*
	 * 1 indicates the exception has already been sent to
	 * kcopyd.  Read and written under snap->lock.
	 */
	int started;
};
67 
/*
 * Slab caches for completed exceptions (struct exception) and for
 * in-flight ones (struct pending_exception), plus the mempool, backed
 * by pending_cache, that pending exceptions are allocated from.
 * All three are created in dm_snapshot_init().
 */
static kmem_cache_t *exception_cache;
static kmem_cache_t *pending_cache;
static mempool_t *pending_pool;
75 
/*
 * One of these per registered origin, held in the _origins hash table.
 * Allocated by register_snapshot() when the first snapshot of a device
 * is registered and freed by unregister_snapshot() once its snapshot
 * list becomes empty.
 */
struct origin {
	/* The origin device */
	struct block_device *bdev;

	/* Link in the _origins bucket selected by origin_hash(bdev) */
	struct list_head hash_list;

	/* List of snapshots for this origin */
	struct list_head snapshots;
};
88 
89 /*
90  * Size of the hash table for origin volumes. If we make this
91  * the size of the minors list then it should be nearly perfect
92  */
93 #define ORIGIN_HASH_SIZE 256
94 #define ORIGIN_MASK      0xFF
95 static struct list_head *_origins;
96 static struct rw_semaphore _origins_lock;
97 
98 static int init_origin_hash(void)
99 {
100 	int i;
101 
102 	_origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head),
103 			   GFP_KERNEL);
104 	if (!_origins) {
105 		DMERR("Device mapper: Snapshot: unable to allocate memory");
106 		return -ENOMEM;
107 	}
108 
109 	for (i = 0; i < ORIGIN_HASH_SIZE; i++)
110 		INIT_LIST_HEAD(_origins + i);
111 	init_rwsem(&_origins_lock);
112 
113 	return 0;
114 }
115 
/*
 * Release the bucket array allocated by init_origin_hash().  Any
 * struct origin still linked into the table is not freed here.
 */
static void exit_origin_hash(void)
{
	kfree(_origins);
}
120 
121 static inline unsigned int origin_hash(struct block_device *bdev)
122 {
123 	return bdev->bd_dev & ORIGIN_MASK;
124 }
125 
126 static struct origin *__lookup_origin(struct block_device *origin)
127 {
128 	struct list_head *ol;
129 	struct origin *o;
130 
131 	ol = &_origins[origin_hash(origin)];
132 	list_for_each_entry (o, ol, hash_list)
133 		if (bdev_equal(o->bdev, origin))
134 			return o;
135 
136 	return NULL;
137 }
138 
139 static void __insert_origin(struct origin *o)
140 {
141 	struct list_head *sl = &_origins[origin_hash(o->bdev)];
142 	list_add_tail(&o->hash_list, sl);
143 }
144 
145 /*
146  * Make a note of the snapshot and its origin so we can look it
147  * up when the origin has a write on it.
148  */
149 static int register_snapshot(struct dm_snapshot *snap)
150 {
151 	struct origin *o;
152 	struct block_device *bdev = snap->origin->bdev;
153 
154 	down_write(&_origins_lock);
155 	o = __lookup_origin(bdev);
156 
157 	if (!o) {
158 		/* New origin */
159 		o = kmalloc(sizeof(*o), GFP_KERNEL);
160 		if (!o) {
161 			up_write(&_origins_lock);
162 			return -ENOMEM;
163 		}
164 
165 		/* Initialise the struct */
166 		INIT_LIST_HEAD(&o->snapshots);
167 		o->bdev = bdev;
168 
169 		__insert_origin(o);
170 	}
171 
172 	list_add_tail(&snap->list, &o->snapshots);
173 
174 	up_write(&_origins_lock);
175 	return 0;
176 }
177 
178 static void unregister_snapshot(struct dm_snapshot *s)
179 {
180 	struct origin *o;
181 
182 	down_write(&_origins_lock);
183 	o = __lookup_origin(s->origin->bdev);
184 
185 	list_del(&s->list);
186 	if (list_empty(&o->snapshots)) {
187 		list_del(&o->hash_list);
188 		kfree(o);
189 	}
190 
191 	up_write(&_origins_lock);
192 }
193 
194 /*
195  * Implementation of the exception hash tables.
196  */
197 static int init_exception_table(struct exception_table *et, uint32_t size)
198 {
199 	unsigned int i;
200 
201 	et->hash_mask = size - 1;
202 	et->table = dm_vcalloc(size, sizeof(struct list_head));
203 	if (!et->table)
204 		return -ENOMEM;
205 
206 	for (i = 0; i < size; i++)
207 		INIT_LIST_HEAD(et->table + i);
208 
209 	return 0;
210 }
211 
212 static void exit_exception_table(struct exception_table *et, kmem_cache_t *mem)
213 {
214 	struct list_head *slot;
215 	struct exception *ex, *next;
216 	int i, size;
217 
218 	size = et->hash_mask + 1;
219 	for (i = 0; i < size; i++) {
220 		slot = et->table + i;
221 
222 		list_for_each_entry_safe (ex, next, slot, hash_list)
223 			kmem_cache_free(mem, ex);
224 	}
225 
226 	vfree(et->table);
227 }
228 
229 static inline uint32_t exception_hash(struct exception_table *et, chunk_t chunk)
230 {
231 	return chunk & et->hash_mask;
232 }
233 
234 static void insert_exception(struct exception_table *eh, struct exception *e)
235 {
236 	struct list_head *l = &eh->table[exception_hash(eh, e->old_chunk)];
237 	list_add(&e->hash_list, l);
238 }
239 
/*
 * Unlink an exception from whichever hash bucket it is on.  The
 * exception itself is not freed.
 */
static inline void remove_exception(struct exception *e)
{
	list_del(&e->hash_list);
}
244 
245 /*
246  * Return the exception data for a sector, or NULL if not
247  * remapped.
248  */
249 static struct exception *lookup_exception(struct exception_table *et,
250 					  chunk_t chunk)
251 {
252 	struct list_head *slot;
253 	struct exception *e;
254 
255 	slot = &et->table[exception_hash(et, chunk)];
256 	list_for_each_entry (e, slot, hash_list)
257 		if (e->old_chunk == chunk)
258 			return e;
259 
260 	return NULL;
261 }
262 
263 static inline struct exception *alloc_exception(void)
264 {
265 	struct exception *e;
266 
267 	e = kmem_cache_alloc(exception_cache, GFP_NOIO);
268 	if (!e)
269 		e = kmem_cache_alloc(exception_cache, GFP_ATOMIC);
270 
271 	return e;
272 }
273 
/*
 * Return an exception obtained from alloc_exception() to
 * exception_cache.
 */
static inline void free_exception(struct exception *e)
{
	kmem_cache_free(exception_cache, e);
}
278 
/*
 * Take a pending_exception from pending_pool using GFP_NOIO.  The
 * fields of the returned object are not initialised here; see
 * __find_pending_exception().
 */
static inline struct pending_exception *alloc_pending_exception(void)
{
	return mempool_alloc(pending_pool, GFP_NOIO);
}
283 
/*
 * Give a pending_exception back to pending_pool.  The caller is
 * responsible for having unhashed it first.
 */
static inline void free_pending_exception(struct pending_exception *pe)
{
	mempool_free(pe, pending_pool);
}
288 
289 int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new)
290 {
291 	struct exception *e;
292 
293 	e = alloc_exception();
294 	if (!e)
295 		return -ENOMEM;
296 
297 	e->old_chunk = old;
298 	e->new_chunk = new;
299 	insert_exception(&s->complete, e);
300 	return 0;
301 }
302 
303 /*
304  * Hard coded magic.
305  */
306 static int calc_max_buckets(void)
307 {
308 	/* use a fixed size of 2MB */
309 	unsigned long mem = 2 * 1024 * 1024;
310 	mem /= sizeof(struct list_head);
311 
312 	return mem;
313 }
314 
/*
 * Rounds a number down to a power of 2.  Zero is returned unchanged.
 */
static inline uint32_t round_down(uint32_t n)
{
	uint32_t rest;

	/* Strip the lowest set bit until only the highest one is left */
	for (rest = n & (n - 1); rest != 0; rest = n & (n - 1))
		n = rest;

	return n;
}
324 
325 /*
326  * Allocate room for a suitable hash table.
327  */
328 static int init_hash_tables(struct dm_snapshot *s)
329 {
330 	sector_t hash_size, cow_dev_size, origin_dev_size, max_buckets;
331 
332 	/*
333 	 * Calculate based on the size of the original volume or
334 	 * the COW volume...
335 	 */
336 	cow_dev_size = get_dev_size(s->cow->bdev);
337 	origin_dev_size = get_dev_size(s->origin->bdev);
338 	max_buckets = calc_max_buckets();
339 
340 	hash_size = min(origin_dev_size, cow_dev_size) >> s->chunk_shift;
341 	hash_size = min(hash_size, max_buckets);
342 
343 	/* Round it down to a power of 2 */
344 	hash_size = round_down(hash_size);
345 	if (init_exception_table(&s->complete, hash_size))
346 		return -ENOMEM;
347 
348 	/*
349 	 * Allocate hash table for in-flight exceptions
350 	 * Make this smaller than the real hash table
351 	 */
352 	hash_size >>= 3;
353 	if (hash_size < 64)
354 		hash_size = 64;
355 
356 	if (init_exception_table(&s->pending, hash_size)) {
357 		exit_exception_table(&s->complete, exception_cache);
358 		return -ENOMEM;
359 	}
360 
361 	return 0;
362 }
363 
364 /*
365  * Round a number up to the nearest 'size' boundary.  size must
366  * be a power of 2.
367  */
368 static inline ulong round_up(ulong n, ulong size)
369 {
370 	size--;
371 	return (n + size) & ~size;
372 }
373 
374 static void read_snapshot_metadata(struct dm_snapshot *s)
375 {
376 	if (s->have_metadata)
377 		return;
378 
379 	if (s->store.read_metadata(&s->store)) {
380 		down_write(&s->lock);
381 		s->valid = 0;
382 		up_write(&s->lock);
383 	}
384 
385 	s->have_metadata = 1;
386 }
387 
388 /*
389  * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size>
390  */
391 static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
392 {
393 	struct dm_snapshot *s;
394 	unsigned long chunk_size;
395 	int r = -EINVAL;
396 	char persistent;
397 	char *origin_path;
398 	char *cow_path;
399 	char *value;
400 	int blocksize;
401 
402 	if (argc < 4) {
403 		ti->error = "dm-snapshot: requires exactly 4 arguments";
404 		r = -EINVAL;
405 		goto bad1;
406 	}
407 
408 	origin_path = argv[0];
409 	cow_path = argv[1];
410 	persistent = toupper(*argv[2]);
411 
412 	if (persistent != 'P' && persistent != 'N') {
413 		ti->error = "Persistent flag is not P or N";
414 		r = -EINVAL;
415 		goto bad1;
416 	}
417 
418 	chunk_size = simple_strtoul(argv[3], &value, 10);
419 	if (chunk_size == 0 || value == NULL) {
420 		ti->error = "Invalid chunk size";
421 		r = -EINVAL;
422 		goto bad1;
423 	}
424 
425 	s = kmalloc(sizeof(*s), GFP_KERNEL);
426 	if (s == NULL) {
427 		ti->error = "Cannot allocate snapshot context private "
428 		    "structure";
429 		r = -ENOMEM;
430 		goto bad1;
431 	}
432 
433 	r = dm_get_device(ti, origin_path, 0, ti->len, FMODE_READ, &s->origin);
434 	if (r) {
435 		ti->error = "Cannot get origin device";
436 		goto bad2;
437 	}
438 
439 	r = dm_get_device(ti, cow_path, 0, 0,
440 			  FMODE_READ | FMODE_WRITE, &s->cow);
441 	if (r) {
442 		dm_put_device(ti, s->origin);
443 		ti->error = "Cannot get COW device";
444 		goto bad2;
445 	}
446 
447 	/*
448 	 * Chunk size must be multiple of page size.  Silently
449 	 * round up if it's not.
450 	 */
451 	chunk_size = round_up(chunk_size, PAGE_SIZE >> 9);
452 
453 	/* Validate the chunk size against the device block size */
454 	blocksize = s->cow->bdev->bd_disk->queue->hardsect_size;
455 	if (chunk_size % (blocksize >> 9)) {
456 		ti->error = "Chunk size is not a multiple of device blocksize";
457 		r = -EINVAL;
458 		goto bad3;
459 	}
460 
461 	/* Check chunk_size is a power of 2 */
462 	if (chunk_size & (chunk_size - 1)) {
463 		ti->error = "Chunk size is not a power of 2";
464 		r = -EINVAL;
465 		goto bad3;
466 	}
467 
468 	s->chunk_size = chunk_size;
469 	s->chunk_mask = chunk_size - 1;
470 	s->type = persistent;
471 	s->chunk_shift = ffs(chunk_size) - 1;
472 
473 	s->valid = 1;
474 	s->have_metadata = 0;
475 	s->last_percent = 0;
476 	init_rwsem(&s->lock);
477 	s->table = ti->table;
478 
479 	/* Allocate hash table for COW data */
480 	if (init_hash_tables(s)) {
481 		ti->error = "Unable to allocate hash table space";
482 		r = -ENOMEM;
483 		goto bad3;
484 	}
485 
486 	/*
487 	 * Check the persistent flag - done here because we need the iobuf
488 	 * to check the LV header
489 	 */
490 	s->store.snap = s;
491 
492 	if (persistent == 'P')
493 		r = dm_create_persistent(&s->store, chunk_size);
494 	else
495 		r = dm_create_transient(&s->store, s, blocksize);
496 
497 	if (r) {
498 		ti->error = "Couldn't create exception store";
499 		r = -EINVAL;
500 		goto bad4;
501 	}
502 
503 	r = kcopyd_client_create(SNAPSHOT_PAGES, &s->kcopyd_client);
504 	if (r) {
505 		ti->error = "Could not create kcopyd client";
506 		goto bad5;
507 	}
508 
509 	/* Add snapshot to the list of snapshots for this origin */
510 	if (register_snapshot(s)) {
511 		r = -EINVAL;
512 		ti->error = "Cannot register snapshot origin";
513 		goto bad6;
514 	}
515 
516 	ti->private = s;
517 	ti->split_io = chunk_size;
518 
519 	return 0;
520 
521  bad6:
522 	kcopyd_client_destroy(s->kcopyd_client);
523 
524  bad5:
525 	s->store.destroy(&s->store);
526 
527  bad4:
528 	exit_exception_table(&s->pending, pending_cache);
529 	exit_exception_table(&s->complete, exception_cache);
530 
531  bad3:
532 	dm_put_device(ti, s->cow);
533 	dm_put_device(ti, s->origin);
534 
535  bad2:
536 	kfree(s);
537 
538  bad1:
539 	return r;
540 }
541 
542 static void snapshot_dtr(struct dm_target *ti)
543 {
544 	struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
545 
546 	unregister_snapshot(s);
547 
548 	exit_exception_table(&s->pending, pending_cache);
549 	exit_exception_table(&s->complete, exception_cache);
550 
551 	/* Deallocate memory used */
552 	s->store.destroy(&s->store);
553 
554 	dm_put_device(ti, s->origin);
555 	dm_put_device(ti, s->cow);
556 	kcopyd_client_destroy(s->kcopyd_client);
557 	kfree(s);
558 }
559 
560 /*
561  * Flush a list of buffers.
562  */
563 static void flush_bios(struct bio *bio)
564 {
565 	struct bio *n;
566 
567 	while (bio) {
568 		n = bio->bi_next;
569 		bio->bi_next = NULL;
570 		generic_make_request(bio);
571 		bio = n;
572 	}
573 }
574 
575 /*
576  * Error a list of buffers.
577  */
578 static void error_bios(struct bio *bio)
579 {
580 	struct bio *n;
581 
582 	while (bio) {
583 		n = bio->bi_next;
584 		bio->bi_next = NULL;
585 		bio_io_error(bio, bio->bi_size);
586 		bio = n;
587 	}
588 }
589 
590 static struct bio *__flush_bios(struct pending_exception *pe)
591 {
592 	struct pending_exception *sibling;
593 
594 	if (list_empty(&pe->siblings))
595 		return bio_list_get(&pe->origin_bios);
596 
597 	sibling = list_entry(pe->siblings.next,
598 			     struct pending_exception, siblings);
599 
600 	list_del(&pe->siblings);
601 
602 	/* This is fine as long as kcopyd is single-threaded. If kcopyd
603 	 * becomes multi-threaded, we'll need some locking here.
604 	 */
605 	bio_list_merge(&sibling->origin_bios, &pe->origin_bios);
606 
607 	return NULL;
608 }
609 
610 static void pending_complete(struct pending_exception *pe, int success)
611 {
612 	struct exception *e;
613 	struct dm_snapshot *s = pe->snap;
614 	struct bio *flush = NULL;
615 
616 	if (success) {
617 		e = alloc_exception();
618 		if (!e) {
619 			DMWARN("Unable to allocate exception.");
620 			down_write(&s->lock);
621 			s->store.drop_snapshot(&s->store);
622 			s->valid = 0;
623 			flush = __flush_bios(pe);
624 			up_write(&s->lock);
625 
626 			error_bios(bio_list_get(&pe->snapshot_bios));
627 			goto out;
628 		}
629 		*e = pe->e;
630 
631 		/*
632 		 * Add a proper exception, and remove the
633 		 * in-flight exception from the list.
634 		 */
635 		down_write(&s->lock);
636 		insert_exception(&s->complete, e);
637 		remove_exception(&pe->e);
638 		flush = __flush_bios(pe);
639 
640 		/* Submit any pending write bios */
641 		up_write(&s->lock);
642 
643 		flush_bios(bio_list_get(&pe->snapshot_bios));
644 	} else {
645 		/* Read/write error - snapshot is unusable */
646 		down_write(&s->lock);
647 		if (s->valid)
648 			DMERR("Error reading/writing snapshot");
649 		s->store.drop_snapshot(&s->store);
650 		s->valid = 0;
651 		remove_exception(&pe->e);
652 		flush = __flush_bios(pe);
653 		up_write(&s->lock);
654 
655 		error_bios(bio_list_get(&pe->snapshot_bios));
656 
657 		dm_table_event(s->table);
658 	}
659 
660  out:
661 	free_pending_exception(pe);
662 
663 	if (flush)
664 		flush_bios(flush);
665 }
666 
/*
 * Completion callback handed to the exception store's
 * commit_exception(); 'context' is the pending exception.
 */
static void commit_callback(void *context, int success)
{
	pending_complete(context, success);
}
672 
673 /*
674  * Called when the copy I/O has finished.  kcopyd actually runs
675  * this code so don't block.
676  */
677 static void copy_callback(int read_err, unsigned int write_err, void *context)
678 {
679 	struct pending_exception *pe = (struct pending_exception *) context;
680 	struct dm_snapshot *s = pe->snap;
681 
682 	if (read_err || write_err)
683 		pending_complete(pe, 0);
684 
685 	else
686 		/* Update the metadata if we are persistent */
687 		s->store.commit_exception(&s->store, &pe->e, commit_callback,
688 					  pe);
689 }
690 
691 /*
692  * Dispatches the copy operation to kcopyd.
693  */
694 static inline void start_copy(struct pending_exception *pe)
695 {
696 	struct dm_snapshot *s = pe->snap;
697 	struct io_region src, dest;
698 	struct block_device *bdev = s->origin->bdev;
699 	sector_t dev_size;
700 
701 	dev_size = get_dev_size(bdev);
702 
703 	src.bdev = bdev;
704 	src.sector = chunk_to_sector(s, pe->e.old_chunk);
705 	src.count = min(s->chunk_size, dev_size - src.sector);
706 
707 	dest.bdev = s->cow->bdev;
708 	dest.sector = chunk_to_sector(s, pe->e.new_chunk);
709 	dest.count = src.count;
710 
711 	/* Hand over to kcopyd */
712 	kcopyd_copy(s->kcopyd_client,
713 		    &src, 1, &dest, 0, copy_callback, pe);
714 }
715 
716 /*
717  * Looks to see if this snapshot already has a pending exception
718  * for this chunk, otherwise it allocates a new one and inserts
719  * it into the pending table.
720  *
721  * NOTE: a write lock must be held on snap->lock before calling
722  * this.
723  */
724 static struct pending_exception *
725 __find_pending_exception(struct dm_snapshot *s, struct bio *bio)
726 {
727 	struct exception *e;
728 	struct pending_exception *pe;
729 	chunk_t chunk = sector_to_chunk(s, bio->bi_sector);
730 
731 	/*
732 	 * Is there a pending exception for this already ?
733 	 */
734 	e = lookup_exception(&s->pending, chunk);
735 	if (e) {
736 		/* cast the exception to a pending exception */
737 		pe = container_of(e, struct pending_exception, e);
738 
739 	} else {
740 		/*
741 		 * Create a new pending exception, we don't want
742 		 * to hold the lock while we do this.
743 		 */
744 		up_write(&s->lock);
745 		pe = alloc_pending_exception();
746 		down_write(&s->lock);
747 
748 		e = lookup_exception(&s->pending, chunk);
749 		if (e) {
750 			free_pending_exception(pe);
751 			pe = container_of(e, struct pending_exception, e);
752 		} else {
753 			pe->e.old_chunk = chunk;
754 			bio_list_init(&pe->origin_bios);
755 			bio_list_init(&pe->snapshot_bios);
756 			INIT_LIST_HEAD(&pe->siblings);
757 			pe->snap = s;
758 			pe->started = 0;
759 
760 			if (s->store.prepare_exception(&s->store, &pe->e)) {
761 				free_pending_exception(pe);
762 				s->valid = 0;
763 				return NULL;
764 			}
765 
766 			insert_exception(&s->pending, &pe->e);
767 		}
768 	}
769 
770 	return pe;
771 }
772 
773 static inline void remap_exception(struct dm_snapshot *s, struct exception *e,
774 				   struct bio *bio)
775 {
776 	bio->bi_bdev = s->cow->bdev;
777 	bio->bi_sector = chunk_to_sector(s, e->new_chunk) +
778 		(bio->bi_sector & s->chunk_mask);
779 }
780 
781 static int snapshot_map(struct dm_target *ti, struct bio *bio,
782 			union map_info *map_context)
783 {
784 	struct exception *e;
785 	struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
786 	int r = 1;
787 	chunk_t chunk;
788 	struct pending_exception *pe;
789 
790 	chunk = sector_to_chunk(s, bio->bi_sector);
791 
792 	/* Full snapshots are not usable */
793 	if (!s->valid)
794 		return -EIO;
795 
796 	/*
797 	 * Write to snapshot - higher level takes care of RW/RO
798 	 * flags so we should only get this if we are
799 	 * writeable.
800 	 */
801 	if (bio_rw(bio) == WRITE) {
802 
803 		/* FIXME: should only take write lock if we need
804 		 * to copy an exception */
805 		down_write(&s->lock);
806 
807 		/* If the block is already remapped - use that, else remap it */
808 		e = lookup_exception(&s->complete, chunk);
809 		if (e) {
810 			remap_exception(s, e, bio);
811 			up_write(&s->lock);
812 
813 		} else {
814 			pe = __find_pending_exception(s, bio);
815 
816 			if (!pe) {
817 				if (s->store.drop_snapshot)
818 					s->store.drop_snapshot(&s->store);
819 				s->valid = 0;
820 				r = -EIO;
821 				up_write(&s->lock);
822 			} else {
823 				remap_exception(s, &pe->e, bio);
824 				bio_list_add(&pe->snapshot_bios, bio);
825 
826 				if (!pe->started) {
827 					/* this is protected by snap->lock */
828 					pe->started = 1;
829 					up_write(&s->lock);
830 					start_copy(pe);
831 				} else
832 					up_write(&s->lock);
833 				r = 0;
834 			}
835 		}
836 
837 	} else {
838 		/*
839 		 * FIXME: this read path scares me because we
840 		 * always use the origin when we have a pending
841 		 * exception.  However I can't think of a
842 		 * situation where this is wrong - ejt.
843 		 */
844 
845 		/* Do reads */
846 		down_read(&s->lock);
847 
848 		/* See if it it has been remapped */
849 		e = lookup_exception(&s->complete, chunk);
850 		if (e)
851 			remap_exception(s, e, bio);
852 		else
853 			bio->bi_bdev = s->origin->bdev;
854 
855 		up_read(&s->lock);
856 	}
857 
858 	return r;
859 }
860 
/*
 * Resume hook for the snapshot target: makes sure the exception
 * store's metadata has been read (done only on the first call, see
 * read_snapshot_metadata()).
 */
static void snapshot_resume(struct dm_target *ti)
{
	struct dm_snapshot *s = (struct dm_snapshot *) ti->private;

	read_snapshot_metadata(s);
}
867 
868 static int snapshot_status(struct dm_target *ti, status_type_t type,
869 			   char *result, unsigned int maxlen)
870 {
871 	struct dm_snapshot *snap = (struct dm_snapshot *) ti->private;
872 
873 	switch (type) {
874 	case STATUSTYPE_INFO:
875 		if (!snap->valid)
876 			snprintf(result, maxlen, "Invalid");
877 		else {
878 			if (snap->store.fraction_full) {
879 				sector_t numerator, denominator;
880 				snap->store.fraction_full(&snap->store,
881 							  &numerator,
882 							  &denominator);
883 				snprintf(result, maxlen,
884 					 SECTOR_FORMAT "/" SECTOR_FORMAT,
885 					 numerator, denominator);
886 			}
887 			else
888 				snprintf(result, maxlen, "Unknown");
889 		}
890 		break;
891 
892 	case STATUSTYPE_TABLE:
893 		/*
894 		 * kdevname returns a static pointer so we need
895 		 * to make private copies if the output is to
896 		 * make sense.
897 		 */
898 		snprintf(result, maxlen, "%s %s %c " SECTOR_FORMAT,
899 			 snap->origin->name, snap->cow->name,
900 			 snap->type, snap->chunk_size);
901 		break;
902 	}
903 
904 	return 0;
905 }
906 
907 /*-----------------------------------------------------------------
908  * Origin methods
909  *---------------------------------------------------------------*/
910 static void list_merge(struct list_head *l1, struct list_head *l2)
911 {
912 	struct list_head *l1_n, *l2_p;
913 
914 	l1_n = l1->next;
915 	l2_p = l2->prev;
916 
917 	l1->next = l2;
918 	l2->prev = l1;
919 
920 	l2_p->next = l1_n;
921 	l1_n->prev = l2_p;
922 }
923 
924 static int __origin_write(struct list_head *snapshots, struct bio *bio)
925 {
926 	int r = 1, first = 1;
927 	struct dm_snapshot *snap;
928 	struct exception *e;
929 	struct pending_exception *pe, *last = NULL;
930 	chunk_t chunk;
931 
932 	/* Do all the snapshots on this origin */
933 	list_for_each_entry (snap, snapshots, list) {
934 
935 		/* Only deal with valid snapshots */
936 		if (!snap->valid)
937 			continue;
938 
939 		/* Nothing to do if writing beyond end of snapshot */
940 		if (bio->bi_sector >= dm_table_get_size(snap->table))
941 			continue;
942 
943 		down_write(&snap->lock);
944 
945 		/*
946 		 * Remember, different snapshots can have
947 		 * different chunk sizes.
948 		 */
949 		chunk = sector_to_chunk(snap, bio->bi_sector);
950 
951 		/*
952 		 * Check exception table to see if block
953 		 * is already remapped in this snapshot
954 		 * and trigger an exception if not.
955 		 */
956 		e = lookup_exception(&snap->complete, chunk);
957 		if (!e) {
958 			pe = __find_pending_exception(snap, bio);
959 			if (!pe) {
960 				snap->store.drop_snapshot(&snap->store);
961 				snap->valid = 0;
962 
963 			} else {
964 				if (last)
965 					list_merge(&pe->siblings,
966 						   &last->siblings);
967 
968 				last = pe;
969 				r = 0;
970 			}
971 		}
972 
973 		up_write(&snap->lock);
974 	}
975 
976 	/*
977 	 * Now that we have a complete pe list we can start the copying.
978 	 */
979 	if (last) {
980 		pe = last;
981 		do {
982 			down_write(&pe->snap->lock);
983 			if (first)
984 				bio_list_add(&pe->origin_bios, bio);
985 			if (!pe->started) {
986 				pe->started = 1;
987 				up_write(&pe->snap->lock);
988 				start_copy(pe);
989 			} else
990 				up_write(&pe->snap->lock);
991 			first = 0;
992 			pe = list_entry(pe->siblings.next,
993 					struct pending_exception, siblings);
994 
995 		} while (pe != last);
996 	}
997 
998 	return r;
999 }
1000 
1001 /*
1002  * Called on a write from the origin driver.
1003  */
1004 static int do_origin(struct dm_dev *origin, struct bio *bio)
1005 {
1006 	struct origin *o;
1007 	int r = 1;
1008 
1009 	down_read(&_origins_lock);
1010 	o = __lookup_origin(origin->bdev);
1011 	if (o)
1012 		r = __origin_write(&o->snapshots, bio);
1013 	up_read(&_origins_lock);
1014 
1015 	return r;
1016 }
1017 
1018 /*
1019  * Origin: maps a linear range of a device, with hooks for snapshotting.
1020  */
1021 
1022 /*
1023  * Construct an origin mapping: <dev_path>
1024  * The context for an origin is merely a 'struct dm_dev *'
1025  * pointing to the real device.
1026  */
1027 static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1028 {
1029 	int r;
1030 	struct dm_dev *dev;
1031 
1032 	if (argc != 1) {
1033 		ti->error = "dm-origin: incorrect number of arguments";
1034 		return -EINVAL;
1035 	}
1036 
1037 	r = dm_get_device(ti, argv[0], 0, ti->len,
1038 			  dm_table_get_mode(ti->table), &dev);
1039 	if (r) {
1040 		ti->error = "Cannot get target device";
1041 		return r;
1042 	}
1043 
1044 	ti->private = dev;
1045 	return 0;
1046 }
1047 
1048 static void origin_dtr(struct dm_target *ti)
1049 {
1050 	struct dm_dev *dev = (struct dm_dev *) ti->private;
1051 	dm_put_device(ti, dev);
1052 }
1053 
1054 static int origin_map(struct dm_target *ti, struct bio *bio,
1055 		      union map_info *map_context)
1056 {
1057 	struct dm_dev *dev = (struct dm_dev *) ti->private;
1058 	bio->bi_bdev = dev->bdev;
1059 
1060 	/* Only tell snapshots if this is a write */
1061 	return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : 1;
1062 }
1063 
/*
 * Smaller of two values, treating 0 as "not set": if either argument
 * is 0 the other one is returned.  Arguments and the expansion are
 * fully parenthesised so the macro can be used inside any expression.
 * Arguments may be evaluated more than once.
 */
#define min_not_zero(l, r) \
	(((l) == 0) ? (r) : (((r) == 0) ? (l) : min((l), (r))))
1065 
1066 /*
1067  * Set the target "split_io" field to the minimum of all the snapshots'
1068  * chunk sizes.
1069  */
1070 static void origin_resume(struct dm_target *ti)
1071 {
1072 	struct dm_dev *dev = (struct dm_dev *) ti->private;
1073 	struct dm_snapshot *snap;
1074 	struct origin *o;
1075 	chunk_t chunk_size = 0;
1076 
1077 	down_read(&_origins_lock);
1078 	o = __lookup_origin(dev->bdev);
1079 	if (o)
1080 		list_for_each_entry (snap, &o->snapshots, list)
1081 			chunk_size = min_not_zero(chunk_size, snap->chunk_size);
1082 	up_read(&_origins_lock);
1083 
1084 	ti->split_io = chunk_size;
1085 }
1086 
1087 static int origin_status(struct dm_target *ti, status_type_t type, char *result,
1088 			 unsigned int maxlen)
1089 {
1090 	struct dm_dev *dev = (struct dm_dev *) ti->private;
1091 
1092 	switch (type) {
1093 	case STATUSTYPE_INFO:
1094 		result[0] = '\0';
1095 		break;
1096 
1097 	case STATUSTYPE_TABLE:
1098 		snprintf(result, maxlen, "%s", dev->name);
1099 		break;
1100 	}
1101 
1102 	return 0;
1103 }
1104 
/*
 * The "snapshot-origin" target: passes I/O through to the real device
 * and hooks writes so that snapshots of it can copy data first.
 * Registered in dm_snapshot_init().
 */
static struct target_type origin_target = {
	.name    = "snapshot-origin",
	.version = {1, 0, 1},
	.module  = THIS_MODULE,
	.ctr     = origin_ctr,
	.dtr     = origin_dtr,
	.map     = origin_map,
	.resume  = origin_resume,
	.status  = origin_status,
};
1115 
/*
 * The "snapshot" target: the snapshot device itself, backed by an
 * origin device and a COW device.  Registered in dm_snapshot_init().
 */
static struct target_type snapshot_target = {
	.name    = "snapshot",
	.version = {1, 0, 1},
	.module  = THIS_MODULE,
	.ctr     = snapshot_ctr,
	.dtr     = snapshot_dtr,
	.map     = snapshot_map,
	.resume  = snapshot_resume,
	.status  = snapshot_status,
};
1126 
1127 static int __init dm_snapshot_init(void)
1128 {
1129 	int r;
1130 
1131 	r = dm_register_target(&snapshot_target);
1132 	if (r) {
1133 		DMERR("snapshot target register failed %d", r);
1134 		return r;
1135 	}
1136 
1137 	r = dm_register_target(&origin_target);
1138 	if (r < 0) {
1139 		DMERR("Device mapper: Origin: register failed %d\n", r);
1140 		goto bad1;
1141 	}
1142 
1143 	r = init_origin_hash();
1144 	if (r) {
1145 		DMERR("init_origin_hash failed.");
1146 		goto bad2;
1147 	}
1148 
1149 	exception_cache = kmem_cache_create("dm-snapshot-ex",
1150 					    sizeof(struct exception),
1151 					    __alignof__(struct exception),
1152 					    0, NULL, NULL);
1153 	if (!exception_cache) {
1154 		DMERR("Couldn't create exception cache.");
1155 		r = -ENOMEM;
1156 		goto bad3;
1157 	}
1158 
1159 	pending_cache =
1160 	    kmem_cache_create("dm-snapshot-in",
1161 			      sizeof(struct pending_exception),
1162 			      __alignof__(struct pending_exception),
1163 			      0, NULL, NULL);
1164 	if (!pending_cache) {
1165 		DMERR("Couldn't create pending cache.");
1166 		r = -ENOMEM;
1167 		goto bad4;
1168 	}
1169 
1170 	pending_pool = mempool_create(128, mempool_alloc_slab,
1171 				      mempool_free_slab, pending_cache);
1172 	if (!pending_pool) {
1173 		DMERR("Couldn't create pending pool.");
1174 		r = -ENOMEM;
1175 		goto bad5;
1176 	}
1177 
1178 	return 0;
1179 
1180       bad5:
1181 	kmem_cache_destroy(pending_cache);
1182       bad4:
1183 	kmem_cache_destroy(exception_cache);
1184       bad3:
1185 	exit_origin_hash();
1186       bad2:
1187 	dm_unregister_target(&origin_target);
1188       bad1:
1189 	dm_unregister_target(&snapshot_target);
1190 	return r;
1191 }
1192 
1193 static void __exit dm_snapshot_exit(void)
1194 {
1195 	int r;
1196 
1197 	r = dm_unregister_target(&snapshot_target);
1198 	if (r)
1199 		DMERR("snapshot unregister failed %d", r);
1200 
1201 	r = dm_unregister_target(&origin_target);
1202 	if (r)
1203 		DMERR("origin unregister failed %d", r);
1204 
1205 	exit_origin_hash();
1206 	mempool_destroy(pending_pool);
1207 	kmem_cache_destroy(pending_cache);
1208 	kmem_cache_destroy(exception_cache);
1209 }
1210 
/* Module hooks: entry and exit points, followed by module metadata */
module_init(dm_snapshot_init);
module_exit(dm_snapshot_exit);

MODULE_DESCRIPTION(DM_NAME " snapshot target");
MODULE_AUTHOR("Joe Thornber");
MODULE_LICENSE("GPL");
1218