xref: /linux/drivers/md/dm-snap.c (revision 14b42963f64b98ab61fa9723c03d71aa5ef4f862)
1 /*
2  * dm-snapshot.c
3  *
4  * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
5  *
6  * This file is released under the GPL.
7  */
8 
9 #include <linux/blkdev.h>
10 #include <linux/ctype.h>
11 #include <linux/device-mapper.h>
12 #include <linux/fs.h>
13 #include <linux/init.h>
14 #include <linux/kdev_t.h>
15 #include <linux/list.h>
16 #include <linux/mempool.h>
17 #include <linux/module.h>
18 #include <linux/slab.h>
19 #include <linux/vmalloc.h>
20 
21 #include "dm-snap.h"
22 #include "dm-bio-list.h"
23 #include "kcopyd.h"
24 
25 #define DM_MSG_PREFIX "snapshots"
26 
27 /*
28  * The percentage increment we will wake up users at
29  */
30 #define WAKE_UP_PERCENT 5
31 
32 /*
33  * kcopyd priority of snapshot operations
34  */
35 #define SNAPSHOT_COPY_PRIORITY 2
36 
37 /*
38  * Each snapshot reserves this many pages for io
39  */
40 #define SNAPSHOT_PAGES 256
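/*
 * A rough sizing note: with the common 4 KiB PAGE_SIZE this reserves
 * 256 * 4 KiB = 1 MiB of kcopyd buffer per snapshot; see the
 * kcopyd_client_create() call in snapshot_ctr() below.
 */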
41 
42 struct pending_exception {
43 	struct exception e;
44 
45 	/*
46 	 * Origin buffers waiting for this to complete are held
47 	 * in a bio list
48 	 */
49 	struct bio_list origin_bios;
50 	struct bio_list snapshot_bios;
51 
52 	/*
53 	 * Short-term queue of pending exceptions prior to submission.
54 	 */
55 	struct list_head list;
56 
57 	/*
58 	 * The primary pending_exception is the one that holds
59 	 * the sibling_count and the list of origin_bios for a
60 	 * group of pending_exceptions.  It is always last to get freed.
61 	 * These fields get set up when writing to the origin.
62 	 */
63 	struct pending_exception *primary_pe;
64 
65 	/*
66 	 * Number of pending_exceptions processing this chunk.
67 	 * When this drops to zero we must complete the origin bios.
68 	 * If incrementing or decrementing this, hold pe->snap->lock for
69 	 * the sibling concerned and not pe->primary_pe->snap->lock unless
70 	 * they are the same.
71 	 */
72 	atomic_t sibling_count;
73 
74 	/* Pointer back to snapshot context */
75 	struct dm_snapshot *snap;
76 
77 	/*
78 	 * 1 indicates the exception has already been sent to
79 	 * kcopyd.
80 	 */
81 	int started;
82 };
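/*
 * In outline, a pending_exception moves through this file as follows:
 * __find_pending_exception() allocates it and reserves a new COW chunk
 * via store.prepare_exception(); start_copy() hands the origin-to-COW
 * chunk copy to kcopyd; copy_callback() asks the store to commit the
 * metadata; pending_complete() then installs the completed exception,
 * releases any queued bios and frees the structure (the primary only
 * once sibling_count has dropped to zero).
 */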
83 
84 /*
85  * Slab caches for the exception structures, plus a mempool that
86  * guarantees pending exceptions can still be allocated under load
87  */
88 static kmem_cache_t *exception_cache;
89 static kmem_cache_t *pending_cache;
90 static mempool_t *pending_pool;
91 
92 /*
93  * One of these per registered origin, held in the snapshot_origins hash
94  */
95 struct origin {
96 	/* The origin device */
97 	struct block_device *bdev;
98 
99 	struct list_head hash_list;
100 
101 	/* List of snapshots for this origin */
102 	struct list_head snapshots;
103 };
104 
105 /*
106  * Size of the hash table for origin volumes. If we make this
107  * the size of the minors list then it should be nearly perfect
108  */
109 #define ORIGIN_HASH_SIZE 256
110 #define ORIGIN_MASK      0xFF
111 static struct list_head *_origins;
112 static struct rw_semaphore _origins_lock;
113 
114 static int init_origin_hash(void)
115 {
116 	int i;
117 
118 	_origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head),
119 			   GFP_KERNEL);
120 	if (!_origins) {
121 		DMERR("unable to allocate memory");
122 		return -ENOMEM;
123 	}
124 
125 	for (i = 0; i < ORIGIN_HASH_SIZE; i++)
126 		INIT_LIST_HEAD(_origins + i);
127 	init_rwsem(&_origins_lock);
128 
129 	return 0;
130 }
131 
132 static void exit_origin_hash(void)
133 {
134 	kfree(_origins);
135 }
136 
137 static inline unsigned int origin_hash(struct block_device *bdev)
138 {
139 	return bdev->bd_dev & ORIGIN_MASK;
140 }
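/*
 * The hash keys on the low byte of the device number, so, for example,
 * an origin at dev_t 253:3 lands in bucket 3.
 */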
141 
142 static struct origin *__lookup_origin(struct block_device *origin)
143 {
144 	struct list_head *ol;
145 	struct origin *o;
146 
147 	ol = &_origins[origin_hash(origin)];
148 	list_for_each_entry (o, ol, hash_list)
149 		if (bdev_equal(o->bdev, origin))
150 			return o;
151 
152 	return NULL;
153 }
154 
155 static void __insert_origin(struct origin *o)
156 {
157 	struct list_head *sl = &_origins[origin_hash(o->bdev)];
158 	list_add_tail(&o->hash_list, sl);
159 }
160 
161 /*
162  * Make a note of the snapshot and its origin so we can look it
163  * up when the origin has a write on it.
164  */
165 static int register_snapshot(struct dm_snapshot *snap)
166 {
167 	struct origin *o;
168 	struct block_device *bdev = snap->origin->bdev;
169 
170 	down_write(&_origins_lock);
171 	o = __lookup_origin(bdev);
172 
173 	if (!o) {
174 		/* New origin */
175 		o = kmalloc(sizeof(*o), GFP_KERNEL);
176 		if (!o) {
177 			up_write(&_origins_lock);
178 			return -ENOMEM;
179 		}
180 
181 		/* Initialise the struct */
182 		INIT_LIST_HEAD(&o->snapshots);
183 		o->bdev = bdev;
184 
185 		__insert_origin(o);
186 	}
187 
188 	list_add_tail(&snap->list, &o->snapshots);
189 
190 	up_write(&_origins_lock);
191 	return 0;
192 }
193 
194 static void unregister_snapshot(struct dm_snapshot *s)
195 {
196 	struct origin *o;
197 
198 	down_write(&_origins_lock);
199 	o = __lookup_origin(s->origin->bdev);
200 
201 	list_del(&s->list);
202 	if (list_empty(&o->snapshots)) {
203 		list_del(&o->hash_list);
204 		kfree(o);
205 	}
206 
207 	up_write(&_origins_lock);
208 }
209 
210 /*
211  * Implementation of the exception hash tables.
212  */
213 static int init_exception_table(struct exception_table *et, uint32_t size)
214 {
215 	unsigned int i;
216 
217 	et->hash_mask = size - 1;
218 	et->table = dm_vcalloc(size, sizeof(struct list_head));
219 	if (!et->table)
220 		return -ENOMEM;
221 
222 	for (i = 0; i < size; i++)
223 		INIT_LIST_HEAD(et->table + i);
224 
225 	return 0;
226 }
227 
228 static void exit_exception_table(struct exception_table *et, kmem_cache_t *mem)
229 {
230 	struct list_head *slot;
231 	struct exception *ex, *next;
232 	int i, size;
233 
234 	size = et->hash_mask + 1;
235 	for (i = 0; i < size; i++) {
236 		slot = et->table + i;
237 
238 		list_for_each_entry_safe (ex, next, slot, hash_list)
239 			kmem_cache_free(mem, ex);
240 	}
241 
242 	vfree(et->table);
243 }
244 
245 static inline uint32_t exception_hash(struct exception_table *et, chunk_t chunk)
246 {
247 	return chunk & et->hash_mask;
248 }
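/*
 * E.g. with a 4096-bucket table hash_mask is 0xfff, so chunk 0x12345
 * falls into bucket 0x345.
 */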
249 
250 static void insert_exception(struct exception_table *eh, struct exception *e)
251 {
252 	struct list_head *l = &eh->table[exception_hash(eh, e->old_chunk)];
253 	list_add(&e->hash_list, l);
254 }
255 
256 static inline void remove_exception(struct exception *e)
257 {
258 	list_del(&e->hash_list);
259 }
260 
261 /*
262  * Return the exception data for a sector, or NULL if not
263  * remapped.
264  */
265 static struct exception *lookup_exception(struct exception_table *et,
266 					  chunk_t chunk)
267 {
268 	struct list_head *slot;
269 	struct exception *e;
270 
271 	slot = &et->table[exception_hash(et, chunk)];
272 	list_for_each_entry (e, slot, hash_list)
273 		if (e->old_chunk == chunk)
274 			return e;
275 
276 	return NULL;
277 }
278 
279 static inline struct exception *alloc_exception(void)
280 {
281 	struct exception *e;
282 
283 	e = kmem_cache_alloc(exception_cache, GFP_NOIO);
284 	if (!e)
285 		e = kmem_cache_alloc(exception_cache, GFP_ATOMIC);
286 
287 	return e;
288 }
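/*
 * The GFP_ATOMIC retry above is a last resort: unlike GFP_NOIO it may
 * dip into the emergency reserves, giving the completion path one more
 * chance to record an exception when normal allocation fails.
 */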
289 
290 static inline void free_exception(struct exception *e)
291 {
292 	kmem_cache_free(exception_cache, e);
293 }
294 
295 static inline struct pending_exception *alloc_pending_exception(void)
296 {
297 	return mempool_alloc(pending_pool, GFP_NOIO);
298 }
299 
300 static inline void free_pending_exception(struct pending_exception *pe)
301 {
302 	mempool_free(pe, pending_pool);
303 }
304 
305 int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new)
306 {
307 	struct exception *e;
308 
309 	e = alloc_exception();
310 	if (!e)
311 		return -ENOMEM;
312 
313 	e->old_chunk = old;
314 	e->new_chunk = new;
315 	insert_exception(&s->complete, e);
316 	return 0;
317 }
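/*
 * This is called back by the exception store code (dm-exception-store.c)
 * for each exception found while reading the metadata, repopulating the
 * completed table before the snapshot goes live.
 */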
318 
319 /*
320  * Hard coded magic.
321  */
322 static int calc_max_buckets(void)
323 {
324 	/* use a fixed size of 2MB */
325 	unsigned long mem = 2 * 1024 * 1024;
326 	mem /= sizeof(struct list_head);
327 
328 	return mem;
329 }
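/*
 * With sizeof(struct list_head) being two pointers, this caps the table
 * at 2 MiB / 16 = 131072 buckets on a 64-bit kernel (262144 on 32-bit).
 */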
330 
331 /*
332  * Rounds a number down to a power of 2.
333  */
334 static inline uint32_t round_down(uint32_t n)
335 {
336 	while (n & (n - 1))
337 		n &= (n - 1);
338 	return n;
339 }
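/*
 * Each iteration clears the lowest set bit, e.g. round_down(1000) goes
 * 992 -> 960 -> 896 -> 768 -> 512, i.e. the largest power of two not
 * exceeding the argument.
 */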
340 
341 /*
342  * Allocate room for a suitable hash table.
343  */
344 static int init_hash_tables(struct dm_snapshot *s)
345 {
346 	sector_t hash_size, cow_dev_size, origin_dev_size, max_buckets;
347 
348 	/*
349 	 * Calculate based on the size of the original volume or
350 	 * the COW volume...
351 	 */
352 	cow_dev_size = get_dev_size(s->cow->bdev);
353 	origin_dev_size = get_dev_size(s->origin->bdev);
354 	max_buckets = calc_max_buckets();
355 
356 	hash_size = min(origin_dev_size, cow_dev_size) >> s->chunk_shift;
357 	hash_size = min(hash_size, max_buckets);
358 
359 	/* Round it down to a power of 2 */
360 	hash_size = round_down(hash_size);
361 	if (init_exception_table(&s->complete, hash_size))
362 		return -ENOMEM;
363 
364 	/*
365 	 * Allocate hash table for in-flight exceptions
366 	 * Make this smaller than the real hash table
367 	 */
368 	hash_size >>= 3;
369 	if (hash_size < 64)
370 		hash_size = 64;
371 
372 	if (init_exception_table(&s->pending, hash_size)) {
373 		exit_exception_table(&s->complete, exception_cache);
374 		return -ENOMEM;
375 	}
376 
377 	return 0;
378 }
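/*
 * Worked example, assuming a COW device at least as large as the
 * origin: a 1 GiB origin with 8 KiB chunks has 131072 chunks, so the
 * completed table gets 131072 buckets (within max_buckets) and the
 * pending table 131072 >> 3 = 16384.
 */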
379 
380 /*
381  * Round a number up to the nearest 'size' boundary.  size must
382  * be a power of 2.
383  */
384 static inline ulong round_up(ulong n, ulong size)
385 {
386 	size--;
387 	return (n + size) & ~size;
388 }
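/*
 * E.g. round_up(5, 8) == 8 and round_up(16, 8) == 16.  snapshot_ctr()
 * below uses round_up(chunk_size, PAGE_SIZE >> 9): with 4 KiB pages the
 * chunk size (in 512-byte sectors) is rounded up to a multiple of 8.
 */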
389 
390 static void read_snapshot_metadata(struct dm_snapshot *s)
391 {
392 	if (s->store.read_metadata(&s->store)) {
393 		down_write(&s->lock);
394 		s->valid = 0;
395 		up_write(&s->lock);
396 
397 		dm_table_event(s->table);
398 	}
399 }
400 
401 /*
402  * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size>
403  */
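/*
 * The corresponding dmsetup table line looks like this (device names
 * and sizes purely illustrative):
 *
 *	echo "0 2097152 snapshot /dev/vg0/base /dev/vg0/base-cow P 16" | \
 *		dmsetup create snap0
 *
 * i.e. a 1 GiB mapping backed by a persistent store with 16-sector
 * (8 KiB) chunks.
 */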
404 static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
405 {
406 	struct dm_snapshot *s;
407 	unsigned long chunk_size;
408 	int r = -EINVAL;
409 	char persistent;
410 	char *origin_path;
411 	char *cow_path;
412 	char *value;
413 	int blocksize;
414 
415 	if (argc < 4) {
416 		ti->error = "requires exactly 4 arguments";
417 		r = -EINVAL;
418 		goto bad1;
419 	}
420 
421 	origin_path = argv[0];
422 	cow_path = argv[1];
423 	persistent = toupper(*argv[2]);
424 
425 	if (persistent != 'P' && persistent != 'N') {
426 		ti->error = "Persistent flag is not P or N";
427 		r = -EINVAL;
428 		goto bad1;
429 	}
430 
431 	chunk_size = simple_strtoul(argv[3], &value, 10);
432 	if (chunk_size == 0 || value == NULL) {
433 		ti->error = "Invalid chunk size";
434 		r = -EINVAL;
435 		goto bad1;
436 	}
437 
438 	s = kmalloc(sizeof(*s), GFP_KERNEL);
439 	if (s == NULL) {
440 		ti->error = "Cannot allocate snapshot context private "
441 		    "structure";
442 		r = -ENOMEM;
443 		goto bad1;
444 	}
445 
446 	r = dm_get_device(ti, origin_path, 0, ti->len, FMODE_READ, &s->origin);
447 	if (r) {
448 		ti->error = "Cannot get origin device";
449 		goto bad2;
450 	}
451 
452 	r = dm_get_device(ti, cow_path, 0, 0,
453 			  FMODE_READ | FMODE_WRITE, &s->cow);
454 	if (r) {
455 		dm_put_device(ti, s->origin);
456 		ti->error = "Cannot get COW device";
457 		goto bad2;
458 	}
459 
460 	/*
461 	 * Chunk size must be multiple of page size.  Silently
462 	 * round up if it's not.
463 	 */
464 	chunk_size = round_up(chunk_size, PAGE_SIZE >> 9);
465 
466 	/* Validate the chunk size against the device block size */
467 	blocksize = s->cow->bdev->bd_disk->queue->hardsect_size;
468 	if (chunk_size % (blocksize >> 9)) {
469 		ti->error = "Chunk size is not a multiple of device blocksize";
470 		r = -EINVAL;
471 		goto bad3;
472 	}
473 
474 	/* Check chunk_size is a power of 2 */
475 	if (chunk_size & (chunk_size - 1)) {
476 		ti->error = "Chunk size is not a power of 2";
477 		r = -EINVAL;
478 		goto bad3;
479 	}
480 
481 	s->chunk_size = chunk_size;
482 	s->chunk_mask = chunk_size - 1;
483 	s->type = persistent;
484 	s->chunk_shift = ffs(chunk_size) - 1;
485 
486 	s->valid = 1;
487 	s->active = 0;
488 	s->last_percent = 0;
489 	init_rwsem(&s->lock);
490 	s->table = ti->table;
491 
492 	/* Allocate hash table for COW data */
493 	if (init_hash_tables(s)) {
494 		ti->error = "Unable to allocate hash table space";
495 		r = -ENOMEM;
496 		goto bad3;
497 	}
498 
499 	/*
500 	 * Set up the exception store: persistent (on-disk metadata) or
501 	 * transient, according to the flag parsed above
502 	 */
503 	s->store.snap = s;
504 
505 	if (persistent == 'P')
506 		r = dm_create_persistent(&s->store, chunk_size);
507 	else
508 		r = dm_create_transient(&s->store, s, blocksize);
509 
510 	if (r) {
511 		ti->error = "Couldn't create exception store";
512 		r = -EINVAL;
513 		goto bad4;
514 	}
515 
516 	r = kcopyd_client_create(SNAPSHOT_PAGES, &s->kcopyd_client);
517 	if (r) {
518 		ti->error = "Could not create kcopyd client";
519 		goto bad5;
520 	}
521 
522 	/* Metadata must only be loaded into one table at once */
523 	read_snapshot_metadata(s);
524 
525 	/* Add snapshot to the list of snapshots for this origin */
526 	/* Exceptions aren't triggered till snapshot_resume() is called */
527 	if (register_snapshot(s)) {
528 		r = -EINVAL;
529 		ti->error = "Cannot register snapshot origin";
530 		goto bad6;
531 	}
532 
533 	ti->private = s;
534 	ti->split_io = s->chunk_size;
535 
536 	return 0;
537 
538  bad6:
539 	kcopyd_client_destroy(s->kcopyd_client);
540 
541  bad5:
542 	s->store.destroy(&s->store);
543 
544  bad4:
545 	exit_exception_table(&s->pending, pending_cache);
546 	exit_exception_table(&s->complete, exception_cache);
547 
548  bad3:
549 	dm_put_device(ti, s->cow);
550 	dm_put_device(ti, s->origin);
551 
552  bad2:
553 	kfree(s);
554 
555  bad1:
556 	return r;
557 }
558 
559 static void snapshot_dtr(struct dm_target *ti)
560 {
561 	struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
562 
563 	/* Prevent further origin writes from using this snapshot. */
564 	/* After this returns there can be no new kcopyd jobs. */
565 	unregister_snapshot(s);
566 
567 	kcopyd_client_destroy(s->kcopyd_client);
568 
569 	exit_exception_table(&s->pending, pending_cache);
570 	exit_exception_table(&s->complete, exception_cache);
571 
572 	/* Deallocate memory used */
573 	s->store.destroy(&s->store);
574 
575 	dm_put_device(ti, s->origin);
576 	dm_put_device(ti, s->cow);
577 
578 	kfree(s);
579 }
580 
581 /*
582  * Flush a list of buffers.
583  */
584 static void flush_bios(struct bio *bio)
585 {
586 	struct bio *n;
587 
588 	while (bio) {
589 		n = bio->bi_next;
590 		bio->bi_next = NULL;
591 		generic_make_request(bio);
592 		bio = n;
593 	}
594 }
595 
596 /*
597  * Error a list of buffers.
598  */
599 static void error_bios(struct bio *bio)
600 {
601 	struct bio *n;
602 
603 	while (bio) {
604 		n = bio->bi_next;
605 		bio->bi_next = NULL;
606 		bio_io_error(bio, bio->bi_size);
607 		bio = n;
608 	}
609 }
610 
611 static inline void error_snapshot_bios(struct pending_exception *pe)
612 {
613 	error_bios(bio_list_get(&pe->snapshot_bios));
614 }
615 
616 static struct bio *__flush_bios(struct pending_exception *pe)
617 {
618 	/*
619 	 * If this pe is involved in a write to the origin and
620 	 * it is the last sibling to complete then release
621 	 * the bios for the original write to the origin.
622 	 */
623 
624 	if (pe->primary_pe &&
625 	    atomic_dec_and_test(&pe->primary_pe->sibling_count))
626 		return bio_list_get(&pe->primary_pe->origin_bios);
627 
628 	return NULL;
629 }
630 
631 static void __invalidate_snapshot(struct dm_snapshot *s,
632 				struct pending_exception *pe, int err)
633 {
634 	if (!s->valid)
635 		return;
636 
637 	if (err == -EIO)
638 		DMERR("Invalidating snapshot: Error reading/writing.");
639 	else if (err == -ENOMEM)
640 		DMERR("Invalidating snapshot: Unable to allocate exception.");
641 
642 	if (pe)
643 		remove_exception(&pe->e);
644 
645 	if (s->store.drop_snapshot)
646 		s->store.drop_snapshot(&s->store);
647 
648 	s->valid = 0;
649 
650 	dm_table_event(s->table);
651 }
652 
653 static void pending_complete(struct pending_exception *pe, int success)
654 {
655 	struct exception *e;
656 	struct pending_exception *primary_pe;
657 	struct dm_snapshot *s = pe->snap;
658 	struct bio *flush = NULL;
659 
660 	if (!success) {
661 		/* Read/write error - snapshot is unusable */
662 		down_write(&s->lock);
663 		__invalidate_snapshot(s, pe, -EIO);
664 		flush = __flush_bios(pe);
665 		up_write(&s->lock);
666 
667 		error_snapshot_bios(pe);
668 		goto out;
669 	}
670 
671 	e = alloc_exception();
672 	if (!e) {
673 		down_write(&s->lock);
674 		__invalidate_snapshot(s, pe, -ENOMEM);
675 		flush = __flush_bios(pe);
676 		up_write(&s->lock);
677 
678 		error_snapshot_bios(pe);
679 		goto out;
680 	}
681 	*e = pe->e;
682 
683 	/*
684 	 * Add a proper exception, and remove the
685 	 * in-flight exception from the list.
686 	 */
687 	down_write(&s->lock);
688 	if (!s->valid) {
689 		flush = __flush_bios(pe);
690 		up_write(&s->lock);
691 
692 		free_exception(e);
693 
694 		error_snapshot_bios(pe);
695 		goto out;
696 	}
697 
698 	insert_exception(&s->complete, e);
699 	remove_exception(&pe->e);
700 	flush = __flush_bios(pe);
701 
702 	up_write(&s->lock);
703 
704 	/* Submit any pending write bios */
705 	flush_bios(bio_list_get(&pe->snapshot_bios));
706 
707  out:
708 	primary_pe = pe->primary_pe;
709 
710 	/*
711 	 * Free the pe if it's not linked to an origin write or if
712 	 * it's not itself a primary pe.
713 	 */
714 	if (!primary_pe || primary_pe != pe)
715 		free_pending_exception(pe);
716 
717 	/*
718 	 * Free the primary pe if nothing references it.
719 	 */
720 	if (primary_pe && !atomic_read(&primary_pe->sibling_count))
721 		free_pending_exception(primary_pe);
722 
723 	if (flush)
724 		flush_bios(flush);
725 }
726 
727 static void commit_callback(void *context, int success)
728 {
729 	struct pending_exception *pe = (struct pending_exception *) context;
730 	pending_complete(pe, success);
731 }
732 
733 /*
734  * Called when the copy I/O has finished.  kcopyd actually runs
735  * this code so don't block.
736  */
737 static void copy_callback(int read_err, unsigned int write_err, void *context)
738 {
739 	struct pending_exception *pe = (struct pending_exception *) context;
740 	struct dm_snapshot *s = pe->snap;
741 
742 	if (read_err || write_err)
743 		pending_complete(pe, 0);
744 
745 	else
746 		/* Update the metadata if we are persistent */
747 		s->store.commit_exception(&s->store, &pe->e, commit_callback,
748 					  pe);
749 }
750 
751 /*
752  * Dispatches the copy operation to kcopyd.
753  */
754 static void start_copy(struct pending_exception *pe)
755 {
756 	struct dm_snapshot *s = pe->snap;
757 	struct io_region src, dest;
758 	struct block_device *bdev = s->origin->bdev;
759 	sector_t dev_size;
760 
761 	dev_size = get_dev_size(bdev);
762 
763 	src.bdev = bdev;
764 	src.sector = chunk_to_sector(s, pe->e.old_chunk);
765 	src.count = min(s->chunk_size, dev_size - src.sector);
766 
767 	dest.bdev = s->cow->bdev;
768 	dest.sector = chunk_to_sector(s, pe->e.new_chunk);
769 	dest.count = src.count;
770 
771 	/* Hand over to kcopyd */
772 	kcopyd_copy(s->kcopyd_client,
773 		    &src, 1, &dest, 0, copy_callback, pe);
774 }
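/*
 * For instance, with 16-sector chunks an exception for old_chunk 5
 * copies origin sectors 80-95 (clipped by the min() above at the end
 * of the device) into the COW chunk reserved by prepare_exception().
 */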
775 
776 /*
777  * Looks to see if this snapshot already has a pending exception
778  * for this chunk, otherwise it allocates a new one and inserts
779  * it into the pending table.
780  *
781  * NOTE: a write lock must be held on snap->lock before calling
782  * this.
783  */
784 static struct pending_exception *
785 __find_pending_exception(struct dm_snapshot *s, struct bio *bio)
786 {
787 	struct exception *e;
788 	struct pending_exception *pe;
789 	chunk_t chunk = sector_to_chunk(s, bio->bi_sector);
790 
791 	/*
792 	 * Is there a pending exception for this already ?
793 	 */
794 	e = lookup_exception(&s->pending, chunk);
795 	if (e) {
796 		/* cast the exception to a pending exception */
797 		pe = container_of(e, struct pending_exception, e);
798 		goto out;
799 	}
800 
801 	/*
802 	 * Create a new pending exception, we don't want
803 	 * to hold the lock while we do this.
804 	 */
805 	up_write(&s->lock);
806 	pe = alloc_pending_exception();
807 	down_write(&s->lock);
808 
809 	if (!s->valid) {
810 		free_pending_exception(pe);
811 		return NULL;
812 	}
813 
814 	e = lookup_exception(&s->pending, chunk);
815 	if (e) {
816 		free_pending_exception(pe);
817 		pe = container_of(e, struct pending_exception, e);
818 		goto out;
819 	}
820 
821 	pe->e.old_chunk = chunk;
822 	bio_list_init(&pe->origin_bios);
823 	bio_list_init(&pe->snapshot_bios);
824 	pe->primary_pe = NULL;
825 	atomic_set(&pe->sibling_count, 1);
826 	pe->snap = s;
827 	pe->started = 0;
828 
829 	if (s->store.prepare_exception(&s->store, &pe->e)) {
830 		free_pending_exception(pe);
831 		return NULL;
832 	}
833 
834 	insert_exception(&s->pending, &pe->e);
835 
836  out:
837 	return pe;
838 }
839 
840 static inline void remap_exception(struct dm_snapshot *s, struct exception *e,
841 				   struct bio *bio)
842 {
843 	bio->bi_bdev = s->cow->bdev;
844 	bio->bi_sector = chunk_to_sector(s, e->new_chunk) +
845 		(bio->bi_sector & s->chunk_mask);
846 }
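/*
 * E.g. with 16-sector chunks (chunk_mask 0xf), a bio for origin sector
 * 83 whose chunk was remapped to new_chunk 7 is redirected to COW
 * sector 7 * 16 + (83 & 0xf) = 115.
 */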
847 
848 static int snapshot_map(struct dm_target *ti, struct bio *bio,
849 			union map_info *map_context)
850 {
851 	struct exception *e;
852 	struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
853 	int copy_needed = 0;
854 	int r = 1;
855 	chunk_t chunk;
856 	struct pending_exception *pe = NULL;
857 
858 	chunk = sector_to_chunk(s, bio->bi_sector);
859 
860 	/* Full snapshots are not usable */
861 	/* To get here the table must be live so s->active is always set. */
862 	if (!s->valid)
863 		return -EIO;
864 
865 	if (unlikely(bio_barrier(bio)))
866 		return -EOPNOTSUPP;
867 
868 	/*
869 	 * Write to snapshot - higher level takes care of RW/RO
870 	 * flags so we should only get this if we are
871 	 * writeable.
872 	 */
873 	if (bio_rw(bio) == WRITE) {
874 
875 		/* FIXME: should only take write lock if we need
876 		 * to copy an exception */
877 		down_write(&s->lock);
878 
879 		if (!s->valid) {
880 			r = -EIO;
881 			goto out_unlock;
882 		}
883 
884 		/* If the block is already remapped - use that, else remap it */
885 		e = lookup_exception(&s->complete, chunk);
886 		if (e) {
887 			remap_exception(s, e, bio);
888 			goto out_unlock;
889 		}
890 
891 		pe = __find_pending_exception(s, bio);
892 		if (!pe) {
893 			__invalidate_snapshot(s, pe, -ENOMEM);
894 			r = -EIO;
895 			goto out_unlock;
896 		}
897 
898 		remap_exception(s, &pe->e, bio);
899 		bio_list_add(&pe->snapshot_bios, bio);
900 
901 		if (!pe->started) {
902 			/* this is protected by snap->lock */
903 			pe->started = 1;
904 			copy_needed = 1;
905 		}
906 
907 		r = 0;
908 
909  out_unlock:
910 		up_write(&s->lock);
911 
912 		if (copy_needed)
913 			start_copy(pe);
914 	} else {
915 		/*
916 		 * FIXME: this read path scares me because we
917 		 * always use the origin when we have a pending
918 		 * exception.  However I can't think of a
919 		 * situation where this is wrong - ejt.
920 		 */
921 
922 		/* Do reads */
923 		down_read(&s->lock);
924 
925 		if (!s->valid) {
926 			up_read(&s->lock);
927 			return -EIO;
928 		}
929 
930 		/* See if it has been remapped */
931 		e = lookup_exception(&s->complete, chunk);
932 		if (e)
933 			remap_exception(s, e, bio);
934 		else
935 			bio->bi_bdev = s->origin->bdev;
936 
937 		up_read(&s->lock);
938 	}
939 
940 	return r;
941 }
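/*
 * Return values above: 1 tells device-mapper to submit the (possibly
 * remapped) bio itself, 0 means the bio has been queued on the pending
 * exception and will be issued from pending_complete(), and a negative
 * errno fails the I/O.
 */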
942 
943 static void snapshot_resume(struct dm_target *ti)
944 {
945 	struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
946 
947 	down_write(&s->lock);
948 	s->active = 1;
949 	up_write(&s->lock);
950 }
951 
952 static int snapshot_status(struct dm_target *ti, status_type_t type,
953 			   char *result, unsigned int maxlen)
954 {
955 	struct dm_snapshot *snap = (struct dm_snapshot *) ti->private;
956 
957 	switch (type) {
958 	case STATUSTYPE_INFO:
959 		if (!snap->valid)
960 			snprintf(result, maxlen, "Invalid");
961 		else {
962 			if (snap->store.fraction_full) {
963 				sector_t numerator, denominator;
964 				snap->store.fraction_full(&snap->store,
965 							  &numerator,
966 							  &denominator);
967 				snprintf(result, maxlen, "%llu/%llu",
968 					(unsigned long long)numerator,
969 					(unsigned long long)denominator);
970 			}
971 			else
972 				snprintf(result, maxlen, "Unknown");
973 		}
974 		break;
975 
976 	case STATUSTYPE_TABLE:
977 		/*
978 		 * The table line mirrors the constructor arguments:
979 		 * <origin_dev> <COW-dev> <p/n> <chunk-size>, using the
980 		 * device names cached in the dm_dev structures.
981 		 */
982 		snprintf(result, maxlen, "%s %s %c %llu",
983 			 snap->origin->name, snap->cow->name,
984 			 snap->type,
985 			 (unsigned long long)snap->chunk_size);
986 		break;
987 	}
988 
989 	return 0;
990 }
991 
992 /*-----------------------------------------------------------------
993  * Origin methods
994  *---------------------------------------------------------------*/
995 static int __origin_write(struct list_head *snapshots, struct bio *bio)
996 {
997 	int r = 1, first = 0;
998 	struct dm_snapshot *snap;
999 	struct exception *e;
1000 	struct pending_exception *pe, *next_pe, *primary_pe = NULL;
1001 	chunk_t chunk;
1002 	LIST_HEAD(pe_queue);
1003 
1004 	/* Do all the snapshots on this origin */
1005 	list_for_each_entry (snap, snapshots, list) {
1006 
1007 		down_write(&snap->lock);
1008 
1009 		/* Only deal with valid and active snapshots */
1010 		if (!snap->valid || !snap->active)
1011 			goto next_snapshot;
1012 
1013 		/* Nothing to do if writing beyond end of snapshot */
1014 		if (bio->bi_sector >= dm_table_get_size(snap->table))
1015 			goto next_snapshot;
1016 
1017 		/*
1018 		 * Remember, different snapshots can have
1019 		 * different chunk sizes.
1020 		 */
1021 		chunk = sector_to_chunk(snap, bio->bi_sector);
1022 
1023 		/*
1024 		 * Check exception table to see if block
1025 		 * is already remapped in this snapshot
1026 		 * and trigger an exception if not.
1027 		 *
1028 		 * sibling_count is initialised to 1 so pending_complete()
1029 		 * won't destroy the primary_pe while we're inside this loop.
1030 		 */
1031 		e = lookup_exception(&snap->complete, chunk);
1032 		if (e)
1033 			goto next_snapshot;
1034 
1035 		pe = __find_pending_exception(snap, bio);
1036 		if (!pe) {
1037 			__invalidate_snapshot(snap, pe, -ENOMEM);
1038 			goto next_snapshot;
1039 		}
1040 
1041 		if (!primary_pe) {
1042 			/*
1043 			 * Either every pe here has same
1044 			 * primary_pe or none has one yet.
1045 			 */
1046 			if (pe->primary_pe)
1047 				primary_pe = pe->primary_pe;
1048 			else {
1049 				primary_pe = pe;
1050 				first = 1;
1051 			}
1052 
1053 			bio_list_add(&primary_pe->origin_bios, bio);
1054 
1055 			r = 0;
1056 		}
1057 
1058 		if (!pe->primary_pe) {
1059 			atomic_inc(&primary_pe->sibling_count);
1060 			pe->primary_pe = primary_pe;
1061 		}
1062 
1063 		if (!pe->started) {
1064 			pe->started = 1;
1065 			list_add_tail(&pe->list, &pe_queue);
1066 		}
1067 
1068  next_snapshot:
1069 		up_write(&snap->lock);
1070 	}
1071 
1072 	if (!primary_pe)
1073 		goto out;
1074 
1075 	/*
1076 	 * If this is the first time we're processing this chunk and
1077 	 * sibling_count is now 1 it means all the pending exceptions
1078 	 * got completed while we were in the loop above, so it falls to
1079 	 * us here to remove the primary_pe and submit any origin_bios.
1080 	 */
1081 
1082 	if (first && atomic_dec_and_test(&primary_pe->sibling_count)) {
1083 		flush_bios(bio_list_get(&primary_pe->origin_bios));
1084 		free_pending_exception(primary_pe);
1085 		/* If we got here, pe_queue is necessarily empty. */
1086 		goto out;
1087 	}
1088 
1089 	/*
1090 	 * Now that we have a complete pe list we can start the copying.
1091 	 */
1092 	list_for_each_entry_safe(pe, next_pe, &pe_queue, list)
1093 		start_copy(pe);
1094 
1095  out:
1096 	return r;
1097 }
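/*
 * As with snapshot_map(), returning 0 means the bio has been queued on
 * primary_pe->origin_bios and will be resubmitted once every affected
 * snapshot holds its copy of the chunk; returning 1 means no snapshot
 * needs the old data and the write can proceed immediately.
 */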
1098 
1099 /*
1100  * Called on a write from the origin driver.
1101  */
1102 static int do_origin(struct dm_dev *origin, struct bio *bio)
1103 {
1104 	struct origin *o;
1105 	int r = 1;
1106 
1107 	down_read(&_origins_lock);
1108 	o = __lookup_origin(origin->bdev);
1109 	if (o)
1110 		r = __origin_write(&o->snapshots, bio);
1111 	up_read(&_origins_lock);
1112 
1113 	return r;
1114 }
1115 
1116 /*
1117  * Origin: maps a linear range of a device, with hooks for snapshotting.
1118  */
1119 
1120 /*
1121  * Construct an origin mapping: <dev_path>
1122  * The context for an origin is merely a 'struct dm_dev *'
1123  * pointing to the real device.
1124  */
1125 static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1126 {
1127 	int r;
1128 	struct dm_dev *dev;
1129 
1130 	if (argc != 1) {
1131 		ti->error = "origin: incorrect number of arguments";
1132 		return -EINVAL;
1133 	}
1134 
1135 	r = dm_get_device(ti, argv[0], 0, ti->len,
1136 			  dm_table_get_mode(ti->table), &dev);
1137 	if (r) {
1138 		ti->error = "Cannot get target device";
1139 		return r;
1140 	}
1141 
1142 	ti->private = dev;
1143 	return 0;
1144 }
1145 
1146 static void origin_dtr(struct dm_target *ti)
1147 {
1148 	struct dm_dev *dev = (struct dm_dev *) ti->private;
1149 	dm_put_device(ti, dev);
1150 }
1151 
1152 static int origin_map(struct dm_target *ti, struct bio *bio,
1153 		      union map_info *map_context)
1154 {
1155 	struct dm_dev *dev = (struct dm_dev *) ti->private;
1156 	bio->bi_bdev = dev->bdev;
1157 
1158 	if (unlikely(bio_barrier(bio)))
1159 		return -EOPNOTSUPP;
1160 
1161 	/* Only tell snapshots if this is a write */
1162 	return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : 1;
1163 }
1164 
1165 #define min_not_zero(l, r) (((l) == 0) ? (r) : (((r) == 0) ? (l) : min((l), (r))))
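/* E.g. min_not_zero(0, 16) == 16 and min_not_zero(8, 16) == 8. */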
1166 
1167 /*
1168  * Set the target "split_io" field to the minimum of all the snapshots'
1169  * chunk sizes.
1170  */
1171 static void origin_resume(struct dm_target *ti)
1172 {
1173 	struct dm_dev *dev = (struct dm_dev *) ti->private;
1174 	struct dm_snapshot *snap;
1175 	struct origin *o;
1176 	chunk_t chunk_size = 0;
1177 
1178 	down_read(&_origins_lock);
1179 	o = __lookup_origin(dev->bdev);
1180 	if (o)
1181 		list_for_each_entry (snap, &o->snapshots, list)
1182 			chunk_size = min_not_zero(chunk_size, snap->chunk_size);
1183 	up_read(&_origins_lock);
1184 
1185 	ti->split_io = chunk_size;
1186 }
1187 
1188 static int origin_status(struct dm_target *ti, status_type_t type, char *result,
1189 			 unsigned int maxlen)
1190 {
1191 	struct dm_dev *dev = (struct dm_dev *) ti->private;
1192 
1193 	switch (type) {
1194 	case STATUSTYPE_INFO:
1195 		result[0] = '\0';
1196 		break;
1197 
1198 	case STATUSTYPE_TABLE:
1199 		snprintf(result, maxlen, "%s", dev->name);
1200 		break;
1201 	}
1202 
1203 	return 0;
1204 }
1205 
1206 static struct target_type origin_target = {
1207 	.name    = "snapshot-origin",
1208 	.version = {1, 4, 0},
1209 	.module  = THIS_MODULE,
1210 	.ctr     = origin_ctr,
1211 	.dtr     = origin_dtr,
1212 	.map     = origin_map,
1213 	.resume  = origin_resume,
1214 	.status  = origin_status,
1215 };
1216 
1217 static struct target_type snapshot_target = {
1218 	.name    = "snapshot",
1219 	.version = {1, 4, 0},
1220 	.module  = THIS_MODULE,
1221 	.ctr     = snapshot_ctr,
1222 	.dtr     = snapshot_dtr,
1223 	.map     = snapshot_map,
1224 	.resume  = snapshot_resume,
1225 	.status  = snapshot_status,
1226 };
1227 
1228 static int __init dm_snapshot_init(void)
1229 {
1230 	int r;
1231 
1232 	r = dm_register_target(&snapshot_target);
1233 	if (r) {
1234 		DMERR("snapshot target register failed %d", r);
1235 		return r;
1236 	}
1237 
1238 	r = dm_register_target(&origin_target);
1239 	if (r < 0) {
1240 		DMERR("Origin target register failed %d", r);
1241 		goto bad1;
1242 	}
1243 
1244 	r = init_origin_hash();
1245 	if (r) {
1246 		DMERR("init_origin_hash failed.");
1247 		goto bad2;
1248 	}
1249 
1250 	exception_cache = kmem_cache_create("dm-snapshot-ex",
1251 					    sizeof(struct exception),
1252 					    __alignof__(struct exception),
1253 					    0, NULL, NULL);
1254 	if (!exception_cache) {
1255 		DMERR("Couldn't create exception cache.");
1256 		r = -ENOMEM;
1257 		goto bad3;
1258 	}
1259 
1260 	pending_cache =
1261 	    kmem_cache_create("dm-snapshot-in",
1262 			      sizeof(struct pending_exception),
1263 			      __alignof__(struct pending_exception),
1264 			      0, NULL, NULL);
1265 	if (!pending_cache) {
1266 		DMERR("Couldn't create pending cache.");
1267 		r = -ENOMEM;
1268 		goto bad4;
1269 	}
1270 
1271 	pending_pool = mempool_create_slab_pool(128, pending_cache);
1272 	if (!pending_pool) {
1273 		DMERR("Couldn't create pending pool.");
1274 		r = -ENOMEM;
1275 		goto bad5;
1276 	}
1277 
1278 	return 0;
1279 
1280       bad5:
1281 	kmem_cache_destroy(pending_cache);
1282       bad4:
1283 	kmem_cache_destroy(exception_cache);
1284       bad3:
1285 	exit_origin_hash();
1286       bad2:
1287 	dm_unregister_target(&origin_target);
1288       bad1:
1289 	dm_unregister_target(&snapshot_target);
1290 	return r;
1291 }
1292 
1293 static void __exit dm_snapshot_exit(void)
1294 {
1295 	int r;
1296 
1297 	r = dm_unregister_target(&snapshot_target);
1298 	if (r)
1299 		DMERR("snapshot unregister failed %d", r);
1300 
1301 	r = dm_unregister_target(&origin_target);
1302 	if (r)
1303 		DMERR("origin unregister failed %d", r);
1304 
1305 	exit_origin_hash();
1306 	mempool_destroy(pending_pool);
1307 	kmem_cache_destroy(pending_cache);
1308 	kmem_cache_destroy(exception_cache);
1309 }
1310 
1311 /* Module hooks */
1312 module_init(dm_snapshot_init);
1313 module_exit(dm_snapshot_exit);
1314 
1315 MODULE_DESCRIPTION(DM_NAME " snapshot target");
1316 MODULE_AUTHOR("Joe Thornber");
1317 MODULE_LICENSE("GPL");
1318