xref: /linux/drivers/md/dm-snap.c (revision 32a92f8c89326985e05dce8b22d3f0aa07a3e1bd)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
4  *
5  * This file is released under the GPL.
6  */
7 
8 #include <linux/blkdev.h>
9 #include <linux/device-mapper.h>
10 #include <linux/delay.h>
11 #include <linux/fs.h>
12 #include <linux/init.h>
13 #include <linux/kdev_t.h>
14 #include <linux/list.h>
15 #include <linux/list_bl.h>
16 #include <linux/mempool.h>
17 #include <linux/module.h>
18 #include <linux/slab.h>
19 #include <linux/vmalloc.h>
20 #include <linux/log2.h>
21 #include <linux/dm-kcopyd.h>
22 
23 #include "dm.h"
24 
25 #include "dm-exception-store.h"
26 
27 #define DM_MSG_PREFIX "snapshots"
28 
/*
 * Name string of the snapshot-merge target type.
 *
 * dm_target_is_snapshot_merge() compares the target type's name POINTER
 * against this array (address identity, not strcmp()), so it only matches
 * a target type whose ->name points at this very array.
 * NOTE(review): the target_type that uses this array is not visible in this
 * part of the file - confirm at its definition.
 */
29 static const char dm_snapshot_merge_target_name[] = "snapshot-merge";
30 
31 #define dm_target_is_snapshot_merge(ti) \
32 	((ti)->type->name == dm_snapshot_merge_target_name)
33 
34 /*
35  * The size of the mempool used to track chunks in use.
36  */
37 #define MIN_IOS 256
38 
/*
 * Number of buckets in dm_snapshot.tracked_chunk_hash and the function that
 * maps a chunk number to a bucket index.  The index is formed by masking
 * with (size - 1), so the size has to stay a power of two.
 */
39 #define DM_TRACKED_CHUNK_HASH_SIZE	16
40 #define DM_TRACKED_CHUNK_HASH(x)	((unsigned long)(x) & \
41 					 (DM_TRACKED_CHUNK_HASH_SIZE - 1))
42 
/*
 * One bucket of an exception hash table: the list of entries that hash to
 * this slot, paired with a per-bucket spinlock (see
 * dm_exception_table_lock_init(), which hands out pointers to 'lock').
 */
43 struct dm_hlist_head {
44 	struct hlist_head head;
45 	spinlock_t lock;
46 };
47 
/*
 * A hash table of exceptions.
 *
 * hash_mask  - number of buckets minus one (set from 'size - 1' in
 *              dm_exception_table_init()).
 * hash_shift - number of low chunk-number bits discarded before masking
 *              (see exception_hash()).
 * table      - array of hash_mask + 1 buckets.
 */
48 struct dm_exception_table {
49 	uint32_t hash_mask;
50 	unsigned int hash_shift;
51 	struct dm_hlist_head *table;
52 };
53 
/*
 * Per-snapshot state.  One of these exists for every snapshot (or
 * snapshot-merge) target; instances that share an origin device are linked
 * through 'list' onto that origin's 'snapshots' list.
 */
54 struct dm_snapshot {
	/* Taken for read/write around accesses to fields such as 'active' and the merge range. */
55 	struct rw_semaphore lock;
56 
57 	struct dm_dev *origin;
58 	struct dm_dev *cow;
59 
60 	struct dm_target *ti;
61 
62 	/* List of snapshots per Origin */
63 	struct list_head list;
64 
65 	/*
66 	 * You can't use a snapshot if this is 0 (e.g. if full).
67 	 * A snapshot-merge target never clears this.
68 	 */
69 	int valid;
70 
71 	/*
72 	 * The snapshot overflowed because of a write to the snapshot device.
73 	 * We don't have to invalidate the snapshot in this case, but we need
74 	 * to prevent further writes.
75 	 */
76 	int snapshot_overflowed;
77 
78 	/* Origin writes don't trigger exceptions until this is set */
79 	int active;
80 
	/* Incremented in alloc_pending_exception(), decremented in free_pending_exception(). */
81 	atomic_t pending_exceptions_count;
82 
83 	spinlock_t pe_allocation_lock;
84 
85 	/* Protected by "pe_allocation_lock" */
86 	sector_t exception_start_sequence;
87 
88 	/* Protected by kcopyd single-threaded callback */
89 	sector_t exception_complete_sequence;
90 
91 	/*
92 	 * A list of pending exceptions that completed out of order.
93 	 * Protected by kcopyd single-threaded callback.
94 	 */
95 	struct rb_root out_of_order_tree;
96 
	/* Backing pool for struct dm_snap_pending_exception allocations. */
97 	mempool_t pending_pool;
98 
	/* In-flight and completed exception hash tables (see init_hash_tables()). */
99 	struct dm_exception_table pending;
100 	struct dm_exception_table complete;
101 
102 	/*
103 	 * pe_lock protects all pending_exception operations and access
104 	 * as well as the snapshot_bios list.
105 	 */
106 	spinlock_t pe_lock;
107 
108 	/* Chunks with outstanding reads */
109 	spinlock_t tracked_chunk_lock;
110 	struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE];
111 
112 	/* The on disk metadata handler */
113 	struct dm_exception_store *store;
114 
115 	unsigned int in_progress;
116 	struct wait_queue_head in_progress_wait;
117 
118 	struct dm_kcopyd_client *kcopyd_client;
119 
120 	/* Wait for events based on state_bits */
121 	unsigned long state_bits;
122 
123 	/* Range of chunks currently being merged. */
124 	chunk_t first_merging_chunk;
125 	int num_merging_chunks;
126 
127 	/*
128 	 * The merge operation failed if this flag is set.
129 	 * Failure modes are handled as follows:
130 	 * - I/O error reading the header
131 	 *	=> don't load the target; abort.
132 	 * - Header does not have "valid" flag set
133 	 *	=> use the origin; forget about the snapshot.
134 	 * - I/O error when reading exceptions
135 	 *	=> don't load the target; abort.
136 	 *         (We can't use the intermediate origin state.)
137 	 * - I/O error while merging
138 	 *	=> stop merging; set merge_failed; process I/O normally.
139 	 */
140 	bool merge_failed:1;
141 
142 	bool discard_zeroes_cow:1;
143 	bool discard_passdown_origin:1;
144 
145 	/*
146 	 * Incoming bios that overlap with chunks being merged must wait
147 	 * for them to be committed.
148 	 */
149 	struct bio_list bios_queued_during_merge;
150 };
151 
152 /*
153  * state_bits:
154  *   RUNNING_MERGE  - Merge operation is in progress.
155  *   SHUTDOWN_MERGE - Set to signal that merge needs to be stopped;
156  *                    cleared afterwards.
 *
 * These are bit numbers within dm_snapshot.state_bits, intended for the
 * bit helpers (test_bit(), clear_bit_unlock(), wake_up_bit()), not masks.
157  */
158 #define RUNNING_MERGE          0
159 #define SHUTDOWN_MERGE         1
160 
161 /*
162  * Maximum number of chunks being copied on write.
163  *
164  * The value was decided experimentally as a trade-off between memory
165  * consumption, stalling the kernel's workqueues and maintaining a high enough
166  * throughput.
167  */
168 #define DEFAULT_COW_THRESHOLD 2048
169 
/*
 * Runtime value of the threshold above, exposed as the module parameter
 * "snapshot_cow_threshold" (mode 0644).
 */
170 static unsigned int cow_threshold = DEFAULT_COW_THRESHOLD;
171 module_param_named(snapshot_cow_threshold, cow_threshold, uint, 0644);
172 MODULE_PARM_DESC(snapshot_cow_threshold, "Maximum number of chunks being copied on write");
173 
/* Declares the kcopyd throttle object 'snapshot_copy_throttle' together with its module parameter. */
174 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
175 		"A percentage of time allocated for copy on write");
176 
dm_snap_origin(struct dm_snapshot * s)177 struct dm_dev *dm_snap_origin(struct dm_snapshot *s)
178 {
179 	return s->origin;
180 }
181 EXPORT_SYMBOL(dm_snap_origin);
182 
dm_snap_cow(struct dm_snapshot * s)183 struct dm_dev *dm_snap_cow(struct dm_snapshot *s)
184 {
185 	return s->cow;
186 }
187 EXPORT_SYMBOL(dm_snap_cow);
188 
chunk_to_sector(struct dm_exception_store * store,chunk_t chunk)189 static sector_t chunk_to_sector(struct dm_exception_store *store,
190 				chunk_t chunk)
191 {
192 	return chunk << store->chunk_shift;
193 }
194 
/*
 * Report whether two block_device pointers refer to the same device.
 *
 * A given block device only ever has a single instance, so comparing the
 * pointers themselves is enough.  Returns 1 when equal, 0 otherwise.
 */
static int bdev_equal(struct block_device *lhs, struct block_device *rhs)
{
	if (lhs != rhs)
		return 0;

	return 1;
}
203 
/*
 * An exception that is still being set up (not yet in the completed table).
 * Allocated from dm_snapshot.pending_pool by alloc_pending_exception() and
 * returned to it by free_pending_exception().
 */
204 struct dm_snap_pending_exception {
205 	struct dm_exception e;
206 
207 	/*
208 	 * Origin buffers waiting for this to complete are held
209 	 * in a bio list
210 	 */
211 	struct bio_list origin_bios;
212 	struct bio_list snapshot_bios;
213 
214 	/* Pointer back to snapshot context */
215 	struct dm_snapshot *snap;
216 
217 	/*
218 	 * 1 indicates the exception has already been sent to
219 	 * kcopyd.
220 	 */
221 	int started;
222 
223 	/* There was copying error. */
224 	int copy_error;
225 
226 	/* A sequence number, it is used for in-order completion. */
227 	sector_t exception_sequence;
228 
	/* Node for dm_snapshot.out_of_order_tree. */
229 	struct rb_node out_of_order_node;
230 
231 	/*
232 	 * For writing a complete chunk, bypassing the copy.
233 	 */
234 	struct bio *full_bio;
235 	bio_end_io_t *full_bio_end_io;
236 };
237 
238 /*
239  * Slab caches used by this file.
240  *
 * exception_cache - completed exceptions (struct dm_exception); see
 *                   alloc_completed_exception() / free_completed_exception().
 * pending_cache   - presumably backs the per-snapshot pending_pool of
 *                   struct dm_snap_pending_exception; its users are not
 *                   visible in this part of the file - TODO confirm.
241  */
242 static struct kmem_cache *exception_cache;
243 static struct kmem_cache *pending_cache;
244 
/*
 * Per-bio record (obtained through dm_per_bio_data()) that links a bio into
 * dm_snapshot.tracked_chunk_hash under the chunk number it touches.
 */
245 struct dm_snap_tracked_chunk {
246 	struct hlist_node node;
247 	chunk_t chunk;
248 };
249 
init_tracked_chunk(struct bio * bio)250 static void init_tracked_chunk(struct bio *bio)
251 {
252 	struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
253 
254 	INIT_HLIST_NODE(&c->node);
255 }
256 
is_bio_tracked(struct bio * bio)257 static bool is_bio_tracked(struct bio *bio)
258 {
259 	struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
260 
261 	return !hlist_unhashed(&c->node);
262 }
263 
/*
 * Start tracking @bio as an outstanding user of @chunk: record the chunk in
 * the bio's per-bio data and link it into the snapshot's tracked-chunk hash
 * under tracked_chunk_lock.
 */
static void track_chunk(struct dm_snapshot *s, struct bio *bio, chunk_t chunk)
{
	struct dm_snap_tracked_chunk *tracked;
	struct hlist_head *bucket;

	tracked = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
	tracked->chunk = chunk;

	bucket = &s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)];

	spin_lock_irq(&s->tracked_chunk_lock);
	hlist_add_head(&tracked->node, bucket);
	spin_unlock_irq(&s->tracked_chunk_lock);
}
275 
stop_tracking_chunk(struct dm_snapshot * s,struct bio * bio)276 static void stop_tracking_chunk(struct dm_snapshot *s, struct bio *bio)
277 {
278 	struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
279 	unsigned long flags;
280 
281 	spin_lock_irqsave(&s->tracked_chunk_lock, flags);
282 	hlist_del(&c->node);
283 	spin_unlock_irqrestore(&s->tracked_chunk_lock, flags);
284 }
285 
/*
 * Check whether any tracked bio currently refers to @chunk.
 *
 * Walks the hash bucket for @chunk under tracked_chunk_lock.
 * Returns 1 if a matching record exists, 0 otherwise.
 */
static int __chunk_is_tracked(struct dm_snapshot *s, chunk_t chunk)
{
	struct hlist_head *bucket;
	struct dm_snap_tracked_chunk *entry;
	int tracked = 0;

	bucket = &s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)];

	spin_lock_irq(&s->tracked_chunk_lock);

	hlist_for_each_entry(entry, bucket, node) {
		if (entry->chunk != chunk)
			continue;

		tracked = 1;
		break;
	}

	spin_unlock_irq(&s->tracked_chunk_lock);

	return tracked;
}
305 
306 /*
307  * This conflicting I/O is extremely improbable in the caller,
308  * so fsleep(1000) is sufficient and there is no need for a wait queue.
309  */
__check_for_conflicting_io(struct dm_snapshot * s,chunk_t chunk)310 static void __check_for_conflicting_io(struct dm_snapshot *s, chunk_t chunk)
311 {
312 	while (__chunk_is_tracked(s, chunk))
313 		fsleep(1000);
314 }
315 
316 /*
317  * One of these per registered origin, held in the snapshot_origins hash
 * (the _origins table, indexed by origin_hash()).  Created by
 * register_snapshot() for the first snapshot of a device and freed by
 * unregister_snapshot() once its 'snapshots' list becomes empty.
318  */
319 struct origin {
320 	/* The origin device */
321 	struct block_device *bdev;
322 
	/* Link in the _origins[] bucket for this device */
323 	struct list_head hash_list;
324 
325 	/* List of snapshots for this origin */
326 	struct list_head snapshots;
327 };
328 
329 /*
330  * This structure is allocated for each origin target
 * and is linked into the _dm_origins table through 'hash_list'
 * (see __insert_dm_origin() / __remove_dm_origin()).
331  */
332 struct dm_origin {
333 	struct dm_dev *dev;
334 	struct dm_target *ti;
335 	unsigned int split_boundary;
336 	struct list_head hash_list;
337 };
338 
339 /*
340  * Size of the hash table for origin volumes. If we make this
341  * the size of the minors list then it should be nearly perfect
 *
 * ORIGIN_MASK must stay equal to ORIGIN_HASH_SIZE - 1: origin_hash() uses it
 * to reduce a device number to a bucket index.
 *
 * _origins     - buckets of struct origin, allocated in init_origin_hash().
 * _dm_origins  - buckets of struct dm_origin, allocated in init_origin_hash().
 * _origins_lock - rwsem taken around lookups/updates of the origin lists
 *                 (see register_snapshot(), unregister_snapshot()).
342  */
343 #define ORIGIN_HASH_SIZE 256
344 #define ORIGIN_MASK      0xFF
345 static struct list_head *_origins;
346 static struct list_head *_dm_origins;
347 static struct rw_semaphore _origins_lock;
348 
/*
 * Global completion counter for pending exceptions: the count is guarded by
 * the spinlock, and waiters on the wait queue are woken on every increment
 * (see increment_pending_exceptions_done_count()).
 */
349 static DECLARE_WAIT_QUEUE_HEAD(_pending_exceptions_done);
350 static DEFINE_SPINLOCK(_pending_exceptions_done_spinlock);
351 static uint64_t _pending_exceptions_done_count;
352 
init_origin_hash(void)353 static int init_origin_hash(void)
354 {
355 	int i;
356 
357 	_origins = kmalloc_objs(struct list_head, ORIGIN_HASH_SIZE);
358 	if (!_origins) {
359 		DMERR("unable to allocate memory for _origins");
360 		return -ENOMEM;
361 	}
362 	for (i = 0; i < ORIGIN_HASH_SIZE; i++)
363 		INIT_LIST_HEAD(_origins + i);
364 
365 	_dm_origins = kmalloc_objs(struct list_head, ORIGIN_HASH_SIZE);
366 	if (!_dm_origins) {
367 		DMERR("unable to allocate memory for _dm_origins");
368 		kfree(_origins);
369 		return -ENOMEM;
370 	}
371 	for (i = 0; i < ORIGIN_HASH_SIZE; i++)
372 		INIT_LIST_HEAD(_dm_origins + i);
373 
374 	init_rwsem(&_origins_lock);
375 
376 	return 0;
377 }
378 
exit_origin_hash(void)379 static void exit_origin_hash(void)
380 {
381 	kfree(_origins);
382 	kfree(_dm_origins);
383 }
384 
origin_hash(struct block_device * bdev)385 static unsigned int origin_hash(struct block_device *bdev)
386 {
387 	return bdev->bd_dev & ORIGIN_MASK;
388 }
389 
__lookup_origin(struct block_device * origin)390 static struct origin *__lookup_origin(struct block_device *origin)
391 {
392 	struct list_head *ol;
393 	struct origin *o;
394 
395 	ol = &_origins[origin_hash(origin)];
396 	list_for_each_entry(o, ol, hash_list)
397 		if (bdev_equal(o->bdev, origin))
398 			return o;
399 
400 	return NULL;
401 }
402 
__insert_origin(struct origin * o)403 static void __insert_origin(struct origin *o)
404 {
405 	struct list_head *sl = &_origins[origin_hash(o->bdev)];
406 
407 	list_add_tail(&o->hash_list, sl);
408 }
409 
__lookup_dm_origin(struct block_device * origin)410 static struct dm_origin *__lookup_dm_origin(struct block_device *origin)
411 {
412 	struct list_head *ol;
413 	struct dm_origin *o;
414 
415 	ol = &_dm_origins[origin_hash(origin)];
416 	list_for_each_entry(o, ol, hash_list)
417 		if (bdev_equal(o->dev->bdev, origin))
418 			return o;
419 
420 	return NULL;
421 }
422 
__insert_dm_origin(struct dm_origin * o)423 static void __insert_dm_origin(struct dm_origin *o)
424 {
425 	struct list_head *sl = &_dm_origins[origin_hash(o->dev->bdev)];
426 
427 	list_add_tail(&o->hash_list, sl);
428 }
429 
__remove_dm_origin(struct dm_origin * o)430 static void __remove_dm_origin(struct dm_origin *o)
431 {
432 	list_del(&o->hash_list);
433 }
434 
435 /*
436  * _origins_lock must be held when calling this function.
437  * Returns number of snapshots registered using the supplied cow device, plus:
438  * snap_src - a snapshot suitable for use as a source of exception handover
439  * snap_dest - a snapshot capable of receiving exception handover.
440  * snap_merge - an existing snapshot-merge target linked to the same origin.
441  *   There can be at most one snapshot-merge target. The parameter is optional.
442  *
443  * Possible return values and states of snap_src and snap_dest.
444  *   0: NULL, NULL  - first new snapshot
445  *   1: snap_src, NULL - normal snapshot
446  *   2: snap_src, snap_dest  - waiting for handover
447  *   2: snap_src, NULL - handed over, waiting for old to be deleted
448  *   1: NULL, snap_dest - source got destroyed without handover
449  */
__find_snapshots_sharing_cow(struct dm_snapshot * snap,struct dm_snapshot ** snap_src,struct dm_snapshot ** snap_dest,struct dm_snapshot ** snap_merge)450 static int __find_snapshots_sharing_cow(struct dm_snapshot *snap,
451 					struct dm_snapshot **snap_src,
452 					struct dm_snapshot **snap_dest,
453 					struct dm_snapshot **snap_merge)
454 {
455 	struct dm_snapshot *s;
456 	struct origin *o;
457 	int count = 0;
458 	int active;
459 
460 	o = __lookup_origin(snap->origin->bdev);
461 	if (!o)
462 		goto out;
463 
464 	list_for_each_entry(s, &o->snapshots, list) {
465 		if (dm_target_is_snapshot_merge(s->ti) && snap_merge)
466 			*snap_merge = s;
467 		if (!bdev_equal(s->cow->bdev, snap->cow->bdev))
468 			continue;
469 
470 		down_read(&s->lock);
471 		active = s->active;
472 		up_read(&s->lock);
473 
474 		if (active) {
475 			if (snap_src)
476 				*snap_src = s;
477 		} else if (snap_dest)
478 			*snap_dest = s;
479 
480 		count++;
481 	}
482 
483 out:
484 	return count;
485 }
486 
487 /*
488  * On success, returns 1 if this snapshot is a handover destination,
489  * otherwise returns 0.
490  */
__validate_exception_handover(struct dm_snapshot * snap)491 static int __validate_exception_handover(struct dm_snapshot *snap)
492 {
493 	struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
494 	struct dm_snapshot *snap_merge = NULL;
495 
496 	/* Does snapshot need exceptions handed over to it? */
497 	if ((__find_snapshots_sharing_cow(snap, &snap_src, &snap_dest,
498 					  &snap_merge) == 2) ||
499 	    snap_dest) {
500 		snap->ti->error = "Snapshot cow pairing for exception table handover failed";
501 		return -EINVAL;
502 	}
503 
504 	/*
505 	 * If no snap_src was found, snap cannot become a handover
506 	 * destination.
507 	 */
508 	if (!snap_src)
509 		return 0;
510 
511 	/*
512 	 * Non-snapshot-merge handover?
513 	 */
514 	if (!dm_target_is_snapshot_merge(snap->ti))
515 		return 1;
516 
517 	/*
518 	 * Do not allow more than one merging snapshot.
519 	 */
520 	if (snap_merge) {
521 		snap->ti->error = "A snapshot is already merging.";
522 		return -EINVAL;
523 	}
524 
525 	if (!snap_src->store->type->prepare_merge ||
526 	    !snap_src->store->type->commit_merge) {
527 		snap->ti->error = "Snapshot exception store does not support snapshot-merge.";
528 		return -EINVAL;
529 	}
530 
531 	return 1;
532 }
533 
__insert_snapshot(struct origin * o,struct dm_snapshot * s)534 static void __insert_snapshot(struct origin *o, struct dm_snapshot *s)
535 {
536 	struct dm_snapshot *l;
537 
538 	/* Sort the list according to chunk size, largest-first smallest-last */
539 	list_for_each_entry(l, &o->snapshots, list)
540 		if (l->store->chunk_size < s->store->chunk_size)
541 			break;
542 	list_add_tail(&s->list, &l->list);
543 }
544 
545 /*
546  * Make a note of the snapshot and its origin so we can look it
547  * up when the origin has a write on it.
548  *
549  * Also validate snapshot exception store handovers.
550  * On success, returns 1 if this registration is a handover destination,
551  * otherwise returns 0.
552  */
register_snapshot(struct dm_snapshot * snap)553 static int register_snapshot(struct dm_snapshot *snap)
554 {
555 	struct origin *o, *new_o = NULL;
556 	struct block_device *bdev = snap->origin->bdev;
557 	int r = 0;
558 
559 	new_o = kmalloc_obj(*new_o);
560 	if (!new_o)
561 		return -ENOMEM;
562 
563 	down_write(&_origins_lock);
564 
565 	r = __validate_exception_handover(snap);
566 	if (r < 0) {
567 		kfree(new_o);
568 		goto out;
569 	}
570 
571 	o = __lookup_origin(bdev);
572 	if (o)
573 		kfree(new_o);
574 	else {
575 		/* New origin */
576 		o = new_o;
577 
578 		/* Initialise the struct */
579 		INIT_LIST_HEAD(&o->snapshots);
580 		o->bdev = bdev;
581 
582 		__insert_origin(o);
583 	}
584 
585 	__insert_snapshot(o, snap);
586 
587 out:
588 	up_write(&_origins_lock);
589 
590 	return r;
591 }
592 
593 /*
594  * Move snapshot to correct place in list according to chunk size.
595  */
reregister_snapshot(struct dm_snapshot * s)596 static void reregister_snapshot(struct dm_snapshot *s)
597 {
598 	struct block_device *bdev = s->origin->bdev;
599 
600 	down_write(&_origins_lock);
601 
602 	list_del(&s->list);
603 	__insert_snapshot(__lookup_origin(bdev), s);
604 
605 	up_write(&_origins_lock);
606 }
607 
unregister_snapshot(struct dm_snapshot * s)608 static void unregister_snapshot(struct dm_snapshot *s)
609 {
610 	struct origin *o;
611 
612 	down_write(&_origins_lock);
613 	o = __lookup_origin(s->origin->bdev);
614 
615 	list_del(&s->list);
616 	if (o && list_empty(&o->snapshots)) {
617 		list_del(&o->hash_list);
618 		kfree(o);
619 	}
620 
621 	up_write(&_origins_lock);
622 }
623 
624 /*
625  * Implementation of the exception hash tables.
626  * The lowest hash_shift bits of the chunk number are ignored, allowing
627  * some consecutive chunks to be grouped together.
628  */
629 static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk);
630 
631 /* Lock to protect access to the completed and pending exception hash tables. */
/*
 * Holds pointers to the two per-bucket spinlocks (one in the 'complete'
 * table, one in the 'pending' table) that cover a single chunk; filled in
 * by dm_exception_table_lock_init().
 */
632 struct dm_exception_table_lock {
633 	spinlock_t *complete_slot;
634 	spinlock_t *pending_slot;
635 };
636 
/*
 * Fill in @lock with the bucket locks that cover @chunk in the snapshot's
 * completed and pending exception tables.  No lock is taken here.
 */
static void dm_exception_table_lock_init(struct dm_snapshot *s, chunk_t chunk,
					 struct dm_exception_table_lock *lock)
{
	uint32_t complete_idx = exception_hash(&s->complete, chunk);
	uint32_t pending_idx = exception_hash(&s->pending, chunk);

	lock->complete_slot = &s->complete.table[complete_idx].lock;
	lock->pending_slot = &s->pending.table[pending_idx].lock;
}
646 
dm_exception_table_lock(struct dm_exception_table_lock * lock)647 static void dm_exception_table_lock(struct dm_exception_table_lock *lock)
648 {
649 	spin_lock_nested(lock->complete_slot, 1);
650 	spin_lock_nested(lock->pending_slot, 2);
651 }
652 
dm_exception_table_unlock(struct dm_exception_table_lock * lock)653 static void dm_exception_table_unlock(struct dm_exception_table_lock *lock)
654 {
655 	spin_unlock(lock->pending_slot);
656 	spin_unlock(lock->complete_slot);
657 }
658 
/*
 * Set up exception table @et with @size buckets.
 *
 * @size is turned into a mask (size - 1), so callers are expected to pass a
 * power of two.  @hash_shift is the number of low chunk bits ignored when
 * hashing.
 *
 * Returns 0 on success, -ENOMEM if the bucket array cannot be allocated.
 */
static int dm_exception_table_init(struct dm_exception_table *et,
				   uint32_t size, unsigned int hash_shift)
{
	unsigned int i;

	et->hash_mask = size - 1;
	et->hash_shift = hash_shift;

	et->table = kvmalloc_objs(struct dm_hlist_head, size);
	if (!et->table)
		return -ENOMEM;

	/* Every bucket gets an empty list and its own spinlock. */
	for (i = 0; i < size; i++) {
		INIT_HLIST_HEAD(&et->table[i].head);
		spin_lock_init(&et->table[i].lock);
	}

	return 0;
}
677 
dm_exception_table_exit(struct dm_exception_table * et,struct kmem_cache * mem)678 static void dm_exception_table_exit(struct dm_exception_table *et,
679 				    struct kmem_cache *mem)
680 {
681 	struct dm_hlist_head *slot;
682 	struct dm_exception *ex;
683 	struct hlist_node *pos;
684 	int i, size;
685 
686 	size = et->hash_mask + 1;
687 	for (i = 0; i < size; i++) {
688 		slot = et->table + i;
689 
690 		hlist_for_each_entry_safe(ex, pos, &slot->head, hash_list) {
691 			hlist_del(&ex->hash_list);
692 			kmem_cache_free(mem, ex);
693 			cond_resched();
694 		}
695 	}
696 
697 	kvfree(et->table);
698 }
699 
exception_hash(struct dm_exception_table * et,chunk_t chunk)700 static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk)
701 {
702 	return (chunk >> et->hash_shift) & et->hash_mask;
703 }
704 
dm_remove_exception(struct dm_exception * e)705 static void dm_remove_exception(struct dm_exception *e)
706 {
707 	hlist_del(&e->hash_list);
708 }
709 
710 /*
711  * Return the exception data for a sector, or NULL if not
712  * remapped.
713  */
dm_lookup_exception(struct dm_exception_table * et,chunk_t chunk)714 static struct dm_exception *dm_lookup_exception(struct dm_exception_table *et,
715 						chunk_t chunk)
716 {
717 	struct hlist_head *slot;
718 	struct dm_exception *e;
719 
720 	slot = &et->table[exception_hash(et, chunk)].head;
721 	hlist_for_each_entry(e, slot, hash_list)
722 		if (chunk >= e->old_chunk &&
723 		    chunk <= e->old_chunk + dm_consecutive_chunk_count(e))
724 			return e;
725 
726 	return NULL;
727 }
728 
/*
 * Allocate a completed exception from exception_cache using @gfp.
 *
 * If the allocation fails and @gfp is exactly GFP_NOIO, one more attempt is
 * made with GFP_ATOMIC.  Returns the new exception or NULL.
 */
static struct dm_exception *alloc_completed_exception(gfp_t gfp)
{
	struct dm_exception *e = kmem_cache_alloc(exception_cache, gfp);

	if (e)
		return e;

	if (gfp != GFP_NOIO)
		return NULL;

	return kmem_cache_alloc(exception_cache, GFP_ATOMIC);
}
739 
free_completed_exception(struct dm_exception * e)740 static void free_completed_exception(struct dm_exception *e)
741 {
742 	kmem_cache_free(exception_cache, e);
743 }
744 
alloc_pending_exception(struct dm_snapshot * s)745 static struct dm_snap_pending_exception *alloc_pending_exception(struct dm_snapshot *s)
746 {
747 	struct dm_snap_pending_exception *pe = mempool_alloc(&s->pending_pool,
748 							     GFP_NOIO);
749 
750 	atomic_inc(&s->pending_exceptions_count);
751 	pe->snap = s;
752 
753 	return pe;
754 }
755 
free_pending_exception(struct dm_snap_pending_exception * pe)756 static void free_pending_exception(struct dm_snap_pending_exception *pe)
757 {
758 	struct dm_snapshot *s = pe->snap;
759 
760 	mempool_free(pe, &s->pending_pool);
761 	smp_mb__before_atomic();
762 	atomic_dec(&s->pending_exceptions_count);
763 }
764 
dm_insert_exception(struct dm_exception_table * eh,struct dm_exception * new_e)765 static void dm_insert_exception(struct dm_exception_table *eh,
766 				struct dm_exception *new_e)
767 {
768 	struct hlist_head *l;
769 	struct dm_exception *e = NULL;
770 
771 	l = &eh->table[exception_hash(eh, new_e->old_chunk)].head;
772 
773 	/* Add immediately if this table doesn't support consecutive chunks */
774 	if (!eh->hash_shift)
775 		goto out;
776 
777 	/* List is ordered by old_chunk */
778 	hlist_for_each_entry(e, l, hash_list) {
779 		/* Insert after an existing chunk? */
780 		if (new_e->old_chunk == (e->old_chunk +
781 					 dm_consecutive_chunk_count(e) + 1) &&
782 		    new_e->new_chunk == (dm_chunk_number(e->new_chunk) +
783 					 dm_consecutive_chunk_count(e) + 1)) {
784 			dm_consecutive_chunk_count_inc(e);
785 			free_completed_exception(new_e);
786 			return;
787 		}
788 
789 		/* Insert before an existing chunk? */
790 		if (new_e->old_chunk == (e->old_chunk - 1) &&
791 		    new_e->new_chunk == (dm_chunk_number(e->new_chunk) - 1)) {
792 			dm_consecutive_chunk_count_inc(e);
793 			e->old_chunk--;
794 			e->new_chunk--;
795 			free_completed_exception(new_e);
796 			return;
797 		}
798 
799 		if (new_e->old_chunk < e->old_chunk)
800 			break;
801 	}
802 
803 out:
804 	if (!e) {
805 		/*
806 		 * Either the table doesn't support consecutive chunks or slot
807 		 * l is empty.
808 		 */
809 		hlist_add_head(&new_e->hash_list, l);
810 	} else if (new_e->old_chunk < e->old_chunk) {
811 		/* Add before an existing exception */
812 		hlist_add_before(&new_e->hash_list, &e->hash_list);
813 	} else {
814 		/* Add to l's tail: e is the last exception in this slot */
815 		hlist_add_behind(&new_e->hash_list, &e->hash_list);
816 	}
817 }
818 
819 /*
820  * Callback used by the exception stores to load exceptions when
821  * initialising.
822  */
dm_add_exception(void * context,chunk_t old,chunk_t new)823 static int dm_add_exception(void *context, chunk_t old, chunk_t new)
824 {
825 	struct dm_snapshot *s = context;
826 	struct dm_exception *e;
827 
828 	e = alloc_completed_exception(GFP_KERNEL);
829 	if (!e)
830 		return -ENOMEM;
831 
832 	e->old_chunk = old;
833 
834 	/* Consecutive_count is implicitly initialised to zero */
835 	e->new_chunk = new;
836 
837 	dm_insert_exception(&s->complete, e);
838 
839 	return 0;
840 }
841 
842 /*
843  * Return a minimum chunk size of all snapshots that have the specified origin.
844  * Return zero if the origin has no snapshots.
845  */
__minimum_chunk_size(struct origin * o)846 static uint32_t __minimum_chunk_size(struct origin *o)
847 {
848 	struct dm_snapshot *snap;
849 	unsigned int chunk_size = rounddown_pow_of_two(UINT_MAX);
850 
851 	if (o)
852 		list_for_each_entry(snap, &o->snapshots, list)
853 			chunk_size = min_not_zero(chunk_size,
854 						  snap->store->chunk_size);
855 
856 	return (uint32_t) chunk_size;
857 }
858 
859 /*
860  * Hard coded magic.
861  */
calc_max_buckets(void)862 static int calc_max_buckets(void)
863 {
864 	/* use a fixed size of 2MB */
865 	unsigned long mem = 2 * 1024 * 1024;
866 
867 	mem /= sizeof(struct dm_hlist_head);
868 
869 	return mem;
870 }
871 
872 /*
873  * Allocate room for a suitable hash table.
874  */
init_hash_tables(struct dm_snapshot * s)875 static int init_hash_tables(struct dm_snapshot *s)
876 {
877 	sector_t hash_size, cow_dev_size, max_buckets;
878 
879 	/*
880 	 * Calculate based on the size of the original volume or
881 	 * the COW volume...
882 	 */
883 	cow_dev_size = get_dev_size(s->cow->bdev);
884 	max_buckets = calc_max_buckets();
885 
886 	hash_size = cow_dev_size >> s->store->chunk_shift;
887 	hash_size = min(hash_size, max_buckets);
888 
889 	if (hash_size < 64)
890 		hash_size = 64;
891 	hash_size = rounddown_pow_of_two(hash_size);
892 	if (dm_exception_table_init(&s->complete, hash_size,
893 				    DM_CHUNK_CONSECUTIVE_BITS))
894 		return -ENOMEM;
895 
896 	/*
897 	 * Allocate hash table for in-flight exceptions
898 	 * Make this smaller than the real hash table
899 	 */
900 	hash_size >>= 3;
901 	if (hash_size < 64)
902 		hash_size = 64;
903 
904 	if (dm_exception_table_init(&s->pending, hash_size, 0)) {
905 		dm_exception_table_exit(&s->complete, exception_cache);
906 		return -ENOMEM;
907 	}
908 
909 	return 0;
910 }
911 
merge_shutdown(struct dm_snapshot * s)912 static void merge_shutdown(struct dm_snapshot *s)
913 {
914 	clear_bit_unlock(RUNNING_MERGE, &s->state_bits);
915 	smp_mb__after_atomic();
916 	wake_up_bit(&s->state_bits, RUNNING_MERGE);
917 }
918 
__release_queued_bios_after_merge(struct dm_snapshot * s)919 static struct bio *__release_queued_bios_after_merge(struct dm_snapshot *s)
920 {
921 	s->first_merging_chunk = 0;
922 	s->num_merging_chunks = 0;
923 
924 	return bio_list_get(&s->bios_queued_during_merge);
925 }
926 
927 /*
928  * Remove one chunk from the index of completed exceptions.
929  */
__remove_single_exception_chunk(struct dm_snapshot * s,chunk_t old_chunk)930 static int __remove_single_exception_chunk(struct dm_snapshot *s,
931 					   chunk_t old_chunk)
932 {
933 	struct dm_exception *e;
934 
935 	e = dm_lookup_exception(&s->complete, old_chunk);
936 	if (!e) {
937 		DMERR("Corruption detected: exception for block %llu is on disk but not in memory",
938 		      (unsigned long long)old_chunk);
939 		return -EINVAL;
940 	}
941 
942 	/*
943 	 * If this is the only chunk using this exception, remove exception.
944 	 */
945 	if (!dm_consecutive_chunk_count(e)) {
946 		dm_remove_exception(e);
947 		free_completed_exception(e);
948 		return 0;
949 	}
950 
951 	/*
952 	 * The chunk may be either at the beginning or the end of a
953 	 * group of consecutive chunks - never in the middle.  We are
954 	 * removing chunks in the opposite order to that in which they
955 	 * were added, so this should always be true.
956 	 * Decrement the consecutive chunk counter and adjust the
957 	 * starting point if necessary.
958 	 */
959 	if (old_chunk == e->old_chunk) {
960 		e->old_chunk++;
961 		e->new_chunk++;
962 	} else if (old_chunk != e->old_chunk +
963 		   dm_consecutive_chunk_count(e)) {
964 		DMERR("Attempt to merge block %llu from the middle of a chunk range [%llu - %llu]",
965 		      (unsigned long long)old_chunk,
966 		      (unsigned long long)e->old_chunk,
967 		      (unsigned long long)
968 		      e->old_chunk + dm_consecutive_chunk_count(e));
969 		return -EINVAL;
970 	}
971 
972 	dm_consecutive_chunk_count_dec(e);
973 
974 	return 0;
975 }
976 
977 static void flush_bios(struct bio *bio);
978 
remove_single_exception_chunk(struct dm_snapshot * s)979 static int remove_single_exception_chunk(struct dm_snapshot *s)
980 {
981 	struct bio *b = NULL;
982 	int r;
983 	chunk_t old_chunk = s->first_merging_chunk + s->num_merging_chunks - 1;
984 
985 	down_write(&s->lock);
986 
987 	/*
988 	 * Process chunks (and associated exceptions) in reverse order
989 	 * so that dm_consecutive_chunk_count_dec() accounting works.
990 	 */
991 	do {
992 		r = __remove_single_exception_chunk(s, old_chunk);
993 		if (r)
994 			goto out;
995 	} while (old_chunk-- > s->first_merging_chunk);
996 
997 	b = __release_queued_bios_after_merge(s);
998 
999 out:
1000 	up_write(&s->lock);
1001 	if (b)
1002 		flush_bios(b);
1003 
1004 	return r;
1005 }
1006 
1007 static int origin_write_extent(struct dm_snapshot *merging_snap,
1008 			       sector_t sector, unsigned int chunk_size);
1009 
1010 static void merge_callback(int read_err, unsigned long write_err,
1011 			   void *context);
1012 
read_pending_exceptions_done_count(void)1013 static uint64_t read_pending_exceptions_done_count(void)
1014 {
1015 	uint64_t pending_exceptions_done;
1016 
1017 	spin_lock(&_pending_exceptions_done_spinlock);
1018 	pending_exceptions_done = _pending_exceptions_done_count;
1019 	spin_unlock(&_pending_exceptions_done_spinlock);
1020 
1021 	return pending_exceptions_done;
1022 }
1023 
increment_pending_exceptions_done_count(void)1024 static void increment_pending_exceptions_done_count(void)
1025 {
1026 	spin_lock(&_pending_exceptions_done_spinlock);
1027 	_pending_exceptions_done_count++;
1028 	spin_unlock(&_pending_exceptions_done_spinlock);
1029 
1030 	wake_up_all(&_pending_exceptions_done);
1031 }
1032 
/*
 * Merge the next run of consecutive chunks from the COW device back into
 * the origin.
 *
 * The exception store's prepare_merge() reports how many consecutive chunks
 * can be merged and the last old/new chunk of that run.  The run is then
 * copied with a single kcopyd request; merge_callback() is invoked when the
 * copy completes.
 *
 * The merge is shut down (merge_shutdown()) instead when SHUTDOWN_MERGE has
 * been requested, when the snapshot is invalid, or when prepare_merge()
 * returns zero or an error.  Only the error case sets s->merge_failed.
 *
 * Must be called with RUNNING_MERGE set.
 */
static void snapshot_merge_next_chunks(struct dm_snapshot *s)
{
	int i, linear_chunks;
	chunk_t old_chunk, new_chunk;
	struct dm_io_region src, dest;
	sector_t io_size;
	uint64_t previous_count;

	BUG_ON(!test_bit(RUNNING_MERGE, &s->state_bits));
	/* stop_merge() asked us to stop: do not start another run. */
	if (unlikely(test_bit(SHUTDOWN_MERGE, &s->state_bits)))
		goto shut;

	/*
	 * valid flag never changes during merge, so no lock required.
	 */
	if (!s->valid) {
		DMERR("Snapshot is invalid: can't merge");
		goto shut;
	}

	linear_chunks = s->store->type->prepare_merge(s->store, &old_chunk,
						      &new_chunk);
	if (linear_chunks <= 0) {
		/* Negative: error.  Zero: shut down without flagging failure. */
		if (linear_chunks < 0) {
			DMERR("Read error in exception store: shutting down merge");
			down_write(&s->lock);
			s->merge_failed = true;
			up_write(&s->lock);
		}
		goto shut;
	}

	/* Adjust old_chunk and new_chunk to reflect start of linear region */
	old_chunk = old_chunk + 1 - linear_chunks;
	new_chunk = new_chunk + 1 - linear_chunks;

	/*
	 * Use one (potentially large) I/O to copy all 'linear_chunks'
	 * from the exception store to the origin
	 */
	io_size = linear_chunks * s->store->chunk_size;

	dest.bdev = s->origin->bdev;
	dest.sector = chunk_to_sector(s->store, old_chunk);
	/* Clamp the copy so it does not run past the end of the origin. */
	dest.count = min(io_size, get_dev_size(dest.bdev) - dest.sector);

	src.bdev = s->cow->bdev;
	src.sector = chunk_to_sector(s->store, new_chunk);
	src.count = dest.count;

	/*
	 * Reallocate any exceptions needed in other snapshots then
	 * wait for the pending exceptions to complete.
	 * Each time any pending exception (globally on the system)
	 * completes we are woken and repeat the process to find out
	 * if we can proceed.  While this may not seem a particularly
	 * efficient algorithm, it is not expected to have any
	 * significant impact on performance.
	 */
	previous_count = read_pending_exceptions_done_count();
	while (origin_write_extent(s, dest.sector, io_size)) {
		wait_event(_pending_exceptions_done,
			   (read_pending_exceptions_done_count() !=
			    previous_count));
		/* Retry after the wait, until all exceptions are done. */
		previous_count = read_pending_exceptions_done_count();
	}

	/* Publish the range being merged; written under s->lock. */
	down_write(&s->lock);
	s->first_merging_chunk = old_chunk;
	s->num_merging_chunks = linear_chunks;
	up_write(&s->lock);

	/* Wait until writes to all 'linear_chunks' drain */
	for (i = 0; i < linear_chunks; i++)
		__check_for_conflicting_io(s, old_chunk + i);

	dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, merge_callback, s);
	return;

shut:
	merge_shutdown(s);
}
1116 
1117 static void error_bios(struct bio *bio);
1118 
merge_callback(int read_err,unsigned long write_err,void * context)1119 static void merge_callback(int read_err, unsigned long write_err, void *context)
1120 {
1121 	struct dm_snapshot *s = context;
1122 	struct bio *b = NULL;
1123 
1124 	if (read_err || write_err) {
1125 		if (read_err)
1126 			DMERR("Read error: shutting down merge.");
1127 		else
1128 			DMERR("Write error: shutting down merge.");
1129 		goto shut;
1130 	}
1131 
1132 	if (blkdev_issue_flush(s->origin->bdev) < 0) {
1133 		DMERR("Flush after merge failed: shutting down merge");
1134 		goto shut;
1135 	}
1136 
1137 	if (s->store->type->commit_merge(s->store,
1138 					 s->num_merging_chunks) < 0) {
1139 		DMERR("Write error in exception store: shutting down merge");
1140 		goto shut;
1141 	}
1142 
1143 	if (remove_single_exception_chunk(s) < 0)
1144 		goto shut;
1145 
1146 	snapshot_merge_next_chunks(s);
1147 
1148 	return;
1149 
1150 shut:
1151 	down_write(&s->lock);
1152 	s->merge_failed = true;
1153 	b = __release_queued_bios_after_merge(s);
1154 	up_write(&s->lock);
1155 	error_bios(b);
1156 
1157 	merge_shutdown(s);
1158 }
1159 
start_merge(struct dm_snapshot * s)1160 static void start_merge(struct dm_snapshot *s)
1161 {
1162 	if (!test_and_set_bit(RUNNING_MERGE, &s->state_bits))
1163 		snapshot_merge_next_chunks(s);
1164 }
1165 
/*
 * Stop the merging process and wait until it finishes.
 *
 * Setting SHUTDOWN_MERGE requests the stop; snapshot_merge_next_chunks()
 * tests it before starting each new run of chunks.  The caller then sleeps
 * uninterruptibly until RUNNING_MERGE is cleared (presumably by
 * merge_shutdown(), which is defined outside this section) and finally
 * clears the request bit again.
 */
static void stop_merge(struct dm_snapshot *s)
{
	set_bit(SHUTDOWN_MERGE, &s->state_bits);
	wait_on_bit(&s->state_bits, RUNNING_MERGE, TASK_UNINTERRUPTIBLE);
	clear_bit(SHUTDOWN_MERGE, &s->state_bits);
}
1175 
parse_snapshot_features(struct dm_arg_set * as,struct dm_snapshot * s,struct dm_target * ti)1176 static int parse_snapshot_features(struct dm_arg_set *as, struct dm_snapshot *s,
1177 				   struct dm_target *ti)
1178 {
1179 	int r;
1180 	unsigned int argc;
1181 	const char *arg_name;
1182 
1183 	static const struct dm_arg _args[] = {
1184 		{0, 2, "Invalid number of feature arguments"},
1185 	};
1186 
1187 	/*
1188 	 * No feature arguments supplied.
1189 	 */
1190 	if (!as->argc)
1191 		return 0;
1192 
1193 	r = dm_read_arg_group(_args, as, &argc, &ti->error);
1194 	if (r)
1195 		return -EINVAL;
1196 
1197 	while (argc && !r) {
1198 		arg_name = dm_shift_arg(as);
1199 		argc--;
1200 
1201 		if (!strcasecmp(arg_name, "discard_zeroes_cow"))
1202 			s->discard_zeroes_cow = true;
1203 
1204 		else if (!strcasecmp(arg_name, "discard_passdown_origin"))
1205 			s->discard_passdown_origin = true;
1206 
1207 		else {
1208 			ti->error = "Unrecognised feature requested";
1209 			r = -EINVAL;
1210 			break;
1211 		}
1212 	}
1213 
1214 	if (!s->discard_zeroes_cow && s->discard_passdown_origin) {
1215 		/*
1216 		 * TODO: really these are disjoint.. but ti->num_discard_bios
1217 		 * and dm_bio_get_target_bio_nr() require rigid constraints.
1218 		 */
1219 		ti->error = "discard_passdown_origin feature depends on discard_zeroes_cow";
1220 		r = -EINVAL;
1221 	}
1222 
1223 	return r;
1224 }
1225 
/*
 * Construct a snapshot mapping:
 * <origin_dev> <COW-dev> <p|po|n> <chunk-size> [<# feature args> [<arg>]*]
 *
 * Also used for snapshot-merge targets, which open the origin for writing
 * and request two flush bios instead of one.
 *
 * Returns 0 on success.  On failure a negative errno is returned,
 * ti->error describes the problem and everything acquired so far is
 * released again via the unwind labels at the end of the function.
 */
static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	struct dm_snapshot *s;
	struct dm_arg_set as;
	int i;
	int r = -EINVAL;
	char *origin_path, *cow_path;
	unsigned int args_used, num_flush_bios = 1;
	blk_mode_t origin_mode = BLK_OPEN_READ;

	if (argc < 4) {
		ti->error = "requires 4 or more arguments";
		r = -EINVAL;
		goto bad;
	}

	/*
	 * A merging snapshot copies data into the origin (see
	 * snapshot_merge_next_chunks()), so it needs write access to it.
	 */
	if (dm_target_is_snapshot_merge(ti)) {
		num_flush_bios = 2;
		origin_mode = BLK_OPEN_WRITE;
	}

	s = kzalloc_obj(*s);
	if (!s) {
		ti->error = "Cannot allocate private snapshot structure";
		r = -ENOMEM;
		goto bad;
	}

	/*
	 * Feature arguments follow the four mandatory ones; they are parsed
	 * first, before any device is opened.
	 */
	as.argc = argc;
	as.argv = argv;
	dm_consume_args(&as, 4);
	r = parse_snapshot_features(&as, s, ti);
	if (r)
		goto bad_features;

	origin_path = argv[0];
	argv++;
	argc--;

	r = dm_get_device(ti, origin_path, origin_mode, &s->origin);
	if (r) {
		ti->error = "Cannot get origin device";
		goto bad_origin;
	}

	cow_path = argv[0];
	argv++;
	argc--;

	r = dm_get_device(ti, cow_path, dm_table_get_mode(ti->table), &s->cow);
	if (r) {
		ti->error = "Cannot get COW device";
		goto bad_cow;
	}
	if (s->cow->bdev && s->cow->bdev == s->origin->bdev) {
		ti->error = "COW device cannot be the same as origin device";
		r = -EINVAL;
		goto bad_store;
	}

	/* The remaining arguments are handed to the exception store. */
	r = dm_exception_store_create(ti, argc, argv, s, &args_used, &s->store);
	if (r) {
		ti->error = "Couldn't create exception store";
		r = -EINVAL;
		goto bad_store;
	}

	argv += args_used;
	argc -= args_used;

	s->ti = ti;
	s->valid = 1;
	s->snapshot_overflowed = 0;
	s->active = 0;
	atomic_set(&s->pending_exceptions_count, 0);
	spin_lock_init(&s->pe_allocation_lock);
	s->exception_start_sequence = 0;
	s->exception_complete_sequence = 0;
	s->out_of_order_tree = RB_ROOT;
	init_rwsem(&s->lock);
	INIT_LIST_HEAD(&s->list);
	spin_lock_init(&s->pe_lock);
	s->state_bits = 0;
	s->merge_failed = false;
	s->first_merging_chunk = 0;
	s->num_merging_chunks = 0;
	bio_list_init(&s->bios_queued_during_merge);

	/* Allocate hash table for COW data */
	if (init_hash_tables(s)) {
		ti->error = "Unable to allocate hash table space";
		r = -ENOMEM;
		goto bad_hash_tables;
	}

	init_waitqueue_head(&s->in_progress_wait);

	s->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle);
	if (IS_ERR(s->kcopyd_client)) {
		r = PTR_ERR(s->kcopyd_client);
		ti->error = "Could not create kcopyd client";
		goto bad_kcopyd;
	}

	r = mempool_init_slab_pool(&s->pending_pool, MIN_IOS, pending_cache);
	if (r) {
		ti->error = "Could not allocate mempool for pending exceptions";
		goto bad_pending_pool;
	}

	for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&s->tracked_chunk_hash[i]);

	spin_lock_init(&s->tracked_chunk_lock);

	ti->private = s;
	ti->num_flush_bios = num_flush_bios;
	/* One discard bio, or two when discards are also passed to the origin. */
	if (s->discard_zeroes_cow)
		ti->num_discard_bios = (s->discard_passdown_origin ? 2 : 1);
	ti->per_io_data_size = sizeof(struct dm_snap_tracked_chunk);

	/* Add snapshot to the list of snapshots for this origin */
	/* Exceptions aren't triggered till snapshot_resume() is called */
	r = register_snapshot(s);
	if (r == -ENOMEM) {
		ti->error = "Snapshot origin struct allocation failed";
		goto bad_load_and_register;
	} else if (r < 0) {
		/* invalid handover, register_snapshot has set ti->error */
		goto bad_load_and_register;
	}

	/*
	 * Metadata must only be loaded into one table at once, so skip this
	 * if metadata will be handed over during resume.
	 * Chunk size will be set during the handover - set it to zero to
	 * ensure it's ignored.
	 */
	if (r > 0) {
		s->store->chunk_size = 0;
		return 0;
	}

	/* A positive return means the store reports the snapshot as invalid. */
	r = s->store->type->read_metadata(s->store, dm_add_exception,
					  (void *)s);
	if (r < 0) {
		ti->error = "Failed to read snapshot metadata";
		goto bad_read_metadata;
	} else if (r > 0) {
		s->valid = 0;
		DMWARN("Snapshot is marked invalid.");
	}

	if (!s->store->chunk_size) {
		ti->error = "Chunk size not set";
		r = -EINVAL;
		goto bad_read_metadata;
	}

	r = dm_set_target_max_io_len(ti, s->store->chunk_size);
	if (r)
		goto bad_read_metadata;

	return 0;

	/* Error unwind: release resources in reverse order of acquisition. */
bad_read_metadata:
	unregister_snapshot(s);
bad_load_and_register:
	mempool_exit(&s->pending_pool);
bad_pending_pool:
	dm_kcopyd_client_destroy(s->kcopyd_client);
bad_kcopyd:
	dm_exception_table_exit(&s->pending, pending_cache);
	dm_exception_table_exit(&s->complete, exception_cache);
bad_hash_tables:
	dm_exception_store_destroy(s->store);
bad_store:
	dm_put_device(ti, s->cow);
bad_cow:
	dm_put_device(ti, s->origin);
bad_origin:
bad_features:
	kfree(s);
bad:
	return r;
}
1416 
/*
 * Destroy the snapshot's kcopyd client and tear down both exception tables
 * (pending and complete).  The client pointer is cleared after it has been
 * destroyed.
 */
static void __free_exceptions(struct dm_snapshot *s)
{
	dm_kcopyd_client_destroy(s->kcopyd_client);
	s->kcopyd_client = NULL;

	dm_exception_table_exit(&s->pending, pending_cache);
	dm_exception_table_exit(&s->complete, exception_cache);
}
1425 
__handover_exceptions(struct dm_snapshot * snap_src,struct dm_snapshot * snap_dest)1426 static void __handover_exceptions(struct dm_snapshot *snap_src,
1427 				  struct dm_snapshot *snap_dest)
1428 {
1429 	union {
1430 		struct dm_exception_table table_swap;
1431 		struct dm_exception_store *store_swap;
1432 	} u;
1433 
1434 	/*
1435 	 * Swap all snapshot context information between the two instances.
1436 	 */
1437 	u.table_swap = snap_dest->complete;
1438 	snap_dest->complete = snap_src->complete;
1439 	snap_src->complete = u.table_swap;
1440 
1441 	u.store_swap = snap_dest->store;
1442 	snap_dest->store = snap_src->store;
1443 	snap_dest->store->userspace_supports_overflow = u.store_swap->userspace_supports_overflow;
1444 	snap_src->store = u.store_swap;
1445 
1446 	snap_dest->store->snap = snap_dest;
1447 	snap_src->store->snap = snap_src;
1448 
1449 	snap_dest->ti->max_io_len = snap_dest->store->chunk_size;
1450 	snap_dest->valid = snap_src->valid;
1451 	snap_dest->snapshot_overflowed = snap_src->snapshot_overflowed;
1452 
1453 	/*
1454 	 * Set source invalid to ensure it receives no further I/O.
1455 	 */
1456 	snap_src->valid = 0;
1457 }
1458 
/*
 * Destroy a snapshot (or snapshot-merge) target.
 *
 * Cancels a pending exception handover if this snapshot is its source,
 * stops a running merge, unregisters the snapshot, waits for all pending
 * exceptions to finish and then releases everything snapshot_ctr() set up.
 */
static void snapshot_dtr(struct dm_target *ti)
{
#ifdef CONFIG_DM_DEBUG
	int i;
#endif
	struct dm_snapshot *s = ti->private;
	struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;

	down_read(&_origins_lock);
	/* Check whether exception handover must be cancelled */
	(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
	if (snap_src && snap_dest && (s == snap_src)) {
		/* We are the handover source: the destination cannot be used. */
		down_write(&snap_dest->lock);
		snap_dest->valid = 0;
		up_write(&snap_dest->lock);
		DMERR("Cancelling snapshot handover.");
	}
	up_read(&_origins_lock);

	if (dm_target_is_snapshot_merge(ti))
		stop_merge(s);

	/* Prevent further origin writes from using this snapshot. */
	/* After this returns there can be no new kcopyd jobs. */
	unregister_snapshot(s);

	/* Poll until every outstanding pending exception has been released. */
	while (atomic_read(&s->pending_exceptions_count))
		fsleep(1000);
	/*
	 * Ensure instructions in mempool_exit aren't reordered
	 * before atomic_read.
	 */
	smp_mb();

#ifdef CONFIG_DM_DEBUG
	/* No chunk may still be tracked at this point. */
	for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++)
		BUG_ON(!hlist_empty(&s->tracked_chunk_hash[i]));
#endif

	__free_exceptions(s);

	mempool_exit(&s->pending_pool);

	dm_exception_store_destroy(s->store);

	dm_put_device(ti, s->cow);

	dm_put_device(ti, s->origin);

	/* Every started copy should have been accounted as finished by now. */
	WARN_ON(s->in_progress);

	kfree(s);
}
1512 
account_start_copy(struct dm_snapshot * s)1513 static void account_start_copy(struct dm_snapshot *s)
1514 {
1515 	spin_lock(&s->in_progress_wait.lock);
1516 	s->in_progress++;
1517 	spin_unlock(&s->in_progress_wait.lock);
1518 }
1519 
account_end_copy(struct dm_snapshot * s)1520 static void account_end_copy(struct dm_snapshot *s)
1521 {
1522 	spin_lock(&s->in_progress_wait.lock);
1523 	BUG_ON(!s->in_progress);
1524 	s->in_progress--;
1525 	if (likely(s->in_progress <= cow_threshold) &&
1526 	    unlikely(waitqueue_active(&s->in_progress_wait)))
1527 		wake_up_locked(&s->in_progress_wait);
1528 	spin_unlock(&s->in_progress_wait.lock);
1529 }
1530 
/*
 * Throttle the caller while more than cow_threshold copies are in flight.
 *
 * @s: snapshot whose in_progress counter is checked.
 * @unlock_origins: if true, _origins_lock (held for reading by the caller)
 *                  is released before sleeping.
 *
 * Returns true if the caller may proceed without having slept.  Returns
 * false if the caller slept once; in that case _origins_lock has been
 * dropped when @unlock_origins was set, and the caller is expected to try
 * again (snapshot_map() loops until true is returned).
 */
static bool wait_for_in_progress(struct dm_snapshot *s, bool unlock_origins)
{
	/* Unlocked check first; it is repeated under the lock below. */
	if (unlikely(s->in_progress > cow_threshold)) {
		spin_lock(&s->in_progress_wait.lock);
		if (likely(s->in_progress > cow_threshold)) {
			/*
			 * NOTE: this throttle doesn't account for whether
			 * the caller is servicing an IO that will trigger a COW
			 * so excess throttling may result for chunks not required
			 * to be COW'd.  But if cow_threshold was reached, extra
			 * throttling is unlikely to negatively impact performance.
			 */
			DECLARE_WAITQUEUE(wait, current);

			/*
			 * Queue ourselves and set the task state while still
			 * holding the lock that account_end_copy() takes
			 * before waking waiters.
			 */
			__add_wait_queue(&s->in_progress_wait, &wait);
			__set_current_state(TASK_UNINTERRUPTIBLE);
			spin_unlock(&s->in_progress_wait.lock);
			if (unlock_origins)
				up_read(&_origins_lock);
			io_schedule();
			remove_wait_queue(&s->in_progress_wait, &wait);
			return false;
		}
		spin_unlock(&s->in_progress_wait.lock);
	}
	return true;
}
1558 
1559 /*
1560  * Flush a list of buffers.
1561  */
flush_bios(struct bio * bio)1562 static void flush_bios(struct bio *bio)
1563 {
1564 	struct bio *n;
1565 
1566 	while (bio) {
1567 		n = bio->bi_next;
1568 		bio->bi_next = NULL;
1569 		submit_bio_noacct(bio);
1570 		bio = n;
1571 	}
1572 }
1573 
1574 static int do_origin(struct dm_dev *origin, struct bio *bio, bool limit);
1575 
1576 /*
1577  * Flush a list of buffers.
1578  */
retry_origin_bios(struct dm_snapshot * s,struct bio * bio)1579 static void retry_origin_bios(struct dm_snapshot *s, struct bio *bio)
1580 {
1581 	struct bio *n;
1582 	int r;
1583 
1584 	while (bio) {
1585 		n = bio->bi_next;
1586 		bio->bi_next = NULL;
1587 		r = do_origin(s->origin, bio, false);
1588 		if (r == DM_MAPIO_REMAPPED)
1589 			submit_bio_noacct(bio);
1590 		bio = n;
1591 	}
1592 }
1593 
1594 /*
1595  * Error a list of buffers.
1596  */
error_bios(struct bio * bio)1597 static void error_bios(struct bio *bio)
1598 {
1599 	struct bio *n;
1600 
1601 	while (bio) {
1602 		n = bio->bi_next;
1603 		bio->bi_next = NULL;
1604 		bio_io_error(bio);
1605 		bio = n;
1606 	}
1607 }
1608 
__invalidate_snapshot(struct dm_snapshot * s,int err)1609 static void __invalidate_snapshot(struct dm_snapshot *s, int err)
1610 {
1611 	if (!s->valid)
1612 		return;
1613 
1614 	if (err == -EIO)
1615 		DMERR("Invalidating snapshot: Error reading/writing.");
1616 	else if (err == -ENOMEM)
1617 		DMERR("Invalidating snapshot: Unable to allocate exception.");
1618 
1619 	if (s->store->type->drop_snapshot)
1620 		s->store->type->drop_snapshot(s->store);
1621 
1622 	s->valid = 0;
1623 
1624 	dm_table_event(s->ti->table);
1625 }
1626 
invalidate_snapshot(struct dm_snapshot * s,int err)1627 static void invalidate_snapshot(struct dm_snapshot *s, int err)
1628 {
1629 	down_write(&s->lock);
1630 	__invalidate_snapshot(s, err);
1631 	up_write(&s->lock);
1632 }
1633 
/*
 * Completion callback passed to the exception store's commit_exception()
 * by complete_exception().
 *
 * @context: the struct dm_snap_pending_exception being completed.
 * @success: zero means failure; the snapshot is then invalidated.
 *
 * On success a completed exception is inserted into s->complete.  In all
 * cases the pending exception is removed from its table, the bios attached
 * to it are completed, submitted or errored, queued origin bios are
 * retried, and the pending exception is freed.
 */
static void pending_complete(void *context, int success)
{
	struct dm_snap_pending_exception *pe = context;
	struct dm_exception *e;
	struct dm_snapshot *s = pe->snap;
	struct bio *origin_bios = NULL;
	struct bio *snapshot_bios = NULL;
	struct bio *full_bio = NULL;
	struct dm_exception_table_lock lock;
	int error = 0;

	dm_exception_table_lock_init(s, pe->e.old_chunk, &lock);

	if (!success) {
		/* Read/write error - snapshot is unusable */
		invalidate_snapshot(s, -EIO);
		error = 1;

		/* The 'out' path expects the exception table lock to be held. */
		dm_exception_table_lock(&lock);
		goto out;
	}

	e = alloc_completed_exception(GFP_NOIO);
	if (!e) {
		invalidate_snapshot(s, -ENOMEM);
		error = 1;

		dm_exception_table_lock(&lock);
		goto out;
	}
	*e = pe->e;

	down_read(&s->lock);
	dm_exception_table_lock(&lock);
	/* The snapshot was invalidated in the meantime: drop the new exception. */
	if (!s->valid) {
		up_read(&s->lock);
		free_completed_exception(e);
		error = 1;

		goto out;
	}

	/*
	 * Add a proper exception. After inserting the completed exception all
	 * subsequent snapshot reads to this chunk will be redirected to the
	 * COW device.  This ensures that we do not starve. Moreover, as long
	 * as the pending exception exists, neither origin writes nor snapshot
	 * merging can overwrite the chunk in origin.
	 */
	dm_insert_exception(&s->complete, e);
	up_read(&s->lock);

	/* Wait for conflicting reads to drain */
	if (__chunk_is_tracked(s, pe->e.old_chunk)) {
		/* Drop the table lock while waiting, then take it again. */
		dm_exception_table_unlock(&lock);
		__check_for_conflicting_io(s, pe->e.old_chunk);
		dm_exception_table_lock(&lock);
	}

out:
	/* Remove the in-flight exception from the list */
	dm_remove_exception(&pe->e);

	dm_exception_table_unlock(&lock);

	snapshot_bios = bio_list_get(&pe->snapshot_bios);
	origin_bios = bio_list_get(&pe->origin_bios);
	full_bio = pe->full_bio;
	/* Restore the end_io handler that start_full_bio() replaced. */
	if (full_bio)
		full_bio->bi_end_io = pe->full_bio_end_io;
	increment_pending_exceptions_done_count();

	/* Submit any pending write bios */
	if (error) {
		if (full_bio)
			bio_io_error(full_bio);
		error_bios(snapshot_bios);
	} else {
		if (full_bio)
			bio_endio(full_bio);
		flush_bios(snapshot_bios);
	}

	/* Origin bios are retried whether or not this exception succeeded. */
	retry_origin_bios(s, origin_bios);

	free_pending_exception(pe);
}
1721 
complete_exception(struct dm_snap_pending_exception * pe)1722 static void complete_exception(struct dm_snap_pending_exception *pe)
1723 {
1724 	struct dm_snapshot *s = pe->snap;
1725 
1726 	/* Update the metadata if we are persistent */
1727 	s->store->type->commit_exception(s->store, &pe->e, !pe->copy_error,
1728 					 pending_complete, pe);
1729 }
1730 
1731 /*
1732  * Called when the copy I/O has finished.  kcopyd actually runs
1733  * this code so don't block.
1734  */
copy_callback(int read_err,unsigned long write_err,void * context)1735 static void copy_callback(int read_err, unsigned long write_err, void *context)
1736 {
1737 	struct dm_snap_pending_exception *pe = context;
1738 	struct dm_snapshot *s = pe->snap;
1739 
1740 	pe->copy_error = read_err || write_err;
1741 
1742 	if (pe->exception_sequence == s->exception_complete_sequence) {
1743 		struct rb_node *next;
1744 
1745 		s->exception_complete_sequence++;
1746 		complete_exception(pe);
1747 
1748 		next = rb_first(&s->out_of_order_tree);
1749 		while (next) {
1750 			pe = rb_entry(next, struct dm_snap_pending_exception,
1751 					out_of_order_node);
1752 			if (pe->exception_sequence != s->exception_complete_sequence)
1753 				break;
1754 			next = rb_next(next);
1755 			s->exception_complete_sequence++;
1756 			rb_erase(&pe->out_of_order_node, &s->out_of_order_tree);
1757 			complete_exception(pe);
1758 			cond_resched();
1759 		}
1760 	} else {
1761 		struct rb_node *parent = NULL;
1762 		struct rb_node **p = &s->out_of_order_tree.rb_node;
1763 		struct dm_snap_pending_exception *pe2;
1764 
1765 		while (*p) {
1766 			pe2 = rb_entry(*p, struct dm_snap_pending_exception, out_of_order_node);
1767 			parent = *p;
1768 
1769 			BUG_ON(pe->exception_sequence == pe2->exception_sequence);
1770 			if (pe->exception_sequence < pe2->exception_sequence)
1771 				p = &((*p)->rb_left);
1772 			else
1773 				p = &((*p)->rb_right);
1774 		}
1775 
1776 		rb_link_node(&pe->out_of_order_node, parent, p);
1777 		rb_insert_color(&pe->out_of_order_node, &s->out_of_order_tree);
1778 	}
1779 	account_end_copy(s);
1780 }
1781 
1782 /*
1783  * Dispatches the copy operation to kcopyd.
1784  */
start_copy(struct dm_snap_pending_exception * pe)1785 static void start_copy(struct dm_snap_pending_exception *pe)
1786 {
1787 	struct dm_snapshot *s = pe->snap;
1788 	struct dm_io_region src, dest;
1789 	struct block_device *bdev = s->origin->bdev;
1790 	sector_t dev_size;
1791 
1792 	dev_size = get_dev_size(bdev);
1793 
1794 	src.bdev = bdev;
1795 	src.sector = chunk_to_sector(s->store, pe->e.old_chunk);
1796 	src.count = min((sector_t)s->store->chunk_size, dev_size - src.sector);
1797 
1798 	dest.bdev = s->cow->bdev;
1799 	dest.sector = chunk_to_sector(s->store, pe->e.new_chunk);
1800 	dest.count = src.count;
1801 
1802 	/* Hand over to kcopyd */
1803 	account_start_copy(s);
1804 	dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe);
1805 }
1806 
full_bio_end_io(struct bio * bio)1807 static void full_bio_end_io(struct bio *bio)
1808 {
1809 	void *callback_data = bio->bi_private;
1810 
1811 	dm_kcopyd_do_callback(callback_data, 0, bio->bi_status ? 1 : 0);
1812 }
1813 
start_full_bio(struct dm_snap_pending_exception * pe,struct bio * bio)1814 static void start_full_bio(struct dm_snap_pending_exception *pe,
1815 			   struct bio *bio)
1816 {
1817 	struct dm_snapshot *s = pe->snap;
1818 	void *callback_data;
1819 
1820 	pe->full_bio = bio;
1821 	pe->full_bio_end_io = bio->bi_end_io;
1822 
1823 	account_start_copy(s);
1824 	callback_data = dm_kcopyd_prepare_callback(s->kcopyd_client,
1825 						   copy_callback, pe);
1826 
1827 	bio->bi_end_io = full_bio_end_io;
1828 	bio->bi_private = callback_data;
1829 
1830 	submit_bio_noacct(bio);
1831 }
1832 
1833 static struct dm_snap_pending_exception *
__lookup_pending_exception(struct dm_snapshot * s,chunk_t chunk)1834 __lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk)
1835 {
1836 	struct dm_exception *e = dm_lookup_exception(&s->pending, chunk);
1837 
1838 	if (!e)
1839 		return NULL;
1840 
1841 	return container_of(e, struct dm_snap_pending_exception, e);
1842 }
1843 
1844 /*
1845  * Inserts a pending exception into the pending table.
1846  *
1847  * NOTE: a write lock must be held on the chunk's pending exception table slot
1848  * before calling this.
1849  */
1850 static struct dm_snap_pending_exception *
__insert_pending_exception(struct dm_snapshot * s,struct dm_snap_pending_exception * pe,chunk_t chunk)1851 __insert_pending_exception(struct dm_snapshot *s,
1852 			   struct dm_snap_pending_exception *pe, chunk_t chunk)
1853 {
1854 	pe->e.old_chunk = chunk;
1855 	bio_list_init(&pe->origin_bios);
1856 	bio_list_init(&pe->snapshot_bios);
1857 	pe->started = 0;
1858 	pe->full_bio = NULL;
1859 
1860 	spin_lock(&s->pe_allocation_lock);
1861 	if (s->store->type->prepare_exception(s->store, &pe->e)) {
1862 		spin_unlock(&s->pe_allocation_lock);
1863 		free_pending_exception(pe);
1864 		return NULL;
1865 	}
1866 
1867 	pe->exception_sequence = s->exception_start_sequence++;
1868 	spin_unlock(&s->pe_allocation_lock);
1869 
1870 	dm_insert_exception(&s->pending, &pe->e);
1871 
1872 	return pe;
1873 }
1874 
1875 /*
1876  * Looks to see if this snapshot already has a pending exception
1877  * for this chunk, otherwise it allocates a new one and inserts
1878  * it into the pending table.
1879  *
1880  * NOTE: a write lock must be held on the chunk's pending exception table slot
1881  * before calling this.
1882  */
1883 static struct dm_snap_pending_exception *
__find_pending_exception(struct dm_snapshot * s,struct dm_snap_pending_exception * pe,chunk_t chunk)1884 __find_pending_exception(struct dm_snapshot *s,
1885 			 struct dm_snap_pending_exception *pe, chunk_t chunk)
1886 {
1887 	struct dm_snap_pending_exception *pe2;
1888 
1889 	pe2 = __lookup_pending_exception(s, chunk);
1890 	if (pe2) {
1891 		free_pending_exception(pe);
1892 		return pe2;
1893 	}
1894 
1895 	return __insert_pending_exception(s, pe, chunk);
1896 }
1897 
/*
 * Redirect @bio to the COW device according to exception @e.
 *
 * The target chunk is the exception's new chunk plus the distance of @chunk
 * from the exception's old chunk; the bio's offset within its chunk is
 * preserved.
 */
static void remap_exception(struct dm_snapshot *s, struct dm_exception *e,
			    struct bio *bio, chunk_t chunk)
{
	chunk_t cow_chunk = dm_chunk_number(e->new_chunk) +
			    (chunk - e->old_chunk);
	sector_t offset_in_chunk = bio->bi_iter.bi_sector & s->store->chunk_mask;

	bio_set_dev(bio, s->cow->bdev);
	bio->bi_iter.bi_sector = chunk_to_sector(s->store, cow_chunk) +
				 offset_in_chunk;
}
1907 
zero_callback(int read_err,unsigned long write_err,void * context)1908 static void zero_callback(int read_err, unsigned long write_err, void *context)
1909 {
1910 	struct bio *bio = context;
1911 	struct dm_snapshot *s = bio->bi_private;
1912 
1913 	account_end_copy(s);
1914 	bio->bi_status = write_err ? BLK_STS_IOERR : 0;
1915 	bio_endio(bio);
1916 }
1917 
/*
 * Zero one chunk on the COW device, starting at @bio's current sector
 * (snapshot_map() has already remapped the bio before calling this).
 *
 * The snapshot is stashed in bio->bi_private for zero_callback(), which
 * completes the bio.  @e and @chunk are not used.
 */
static void zero_exception(struct dm_snapshot *s, struct dm_exception *e,
			   struct bio *bio, chunk_t chunk)
{
	struct dm_io_region dest = {
		.bdev = s->cow->bdev,
		.sector = bio->bi_iter.bi_sector,
		.count = s->store->chunk_size,
	};

	account_start_copy(s);
	WARN_ON_ONCE(bio->bi_private);
	bio->bi_private = s;
	dm_kcopyd_zero(s->kcopyd_client, 1, &dest, 0, zero_callback, bio);
}
1932 
io_overlaps_chunk(struct dm_snapshot * s,struct bio * bio)1933 static bool io_overlaps_chunk(struct dm_snapshot *s, struct bio *bio)
1934 {
1935 	return bio->bi_iter.bi_size ==
1936 		(s->store->chunk_size << SECTOR_SHIFT);
1937 }
1938 
/*
 * Map a bio submitted to a snapshot target.
 *
 * Returns:
 *  DM_MAPIO_REMAPPED  - the bio has been redirected to the COW device or
 *                       to the origin.
 *  DM_MAPIO_SUBMITTED - the bio has been completed here, queued on a
 *                       pending exception, or passed on to kcopyd or the
 *                       block layer by this function.
 *  DM_MAPIO_KILL      - the snapshot is invalid, a write arrived after the
 *                       snapshot overflowed, or no exception could be
 *                       allocated for a write.
 */
static int snapshot_map(struct dm_target *ti, struct bio *bio)
{
	struct dm_exception *e;
	struct dm_snapshot *s = ti->private;
	int r = DM_MAPIO_REMAPPED;
	chunk_t chunk;
	struct dm_snap_pending_exception *pe = NULL;
	struct dm_exception_table_lock lock;

	init_tracked_chunk(bio);

	/* Flushes are sent to the COW device. */
	if (bio->bi_opf & REQ_PREFLUSH) {
		bio_set_dev(bio, s->cow->bdev);
		return DM_MAPIO_REMAPPED;
	}

	chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector);
	dm_exception_table_lock_init(s, chunk, &lock);

	/* Full snapshots are not usable */
	/* To get here the table must be live so s->active is always set. */
	if (!s->valid)
		return DM_MAPIO_KILL;

	/* Throttle writers while too many copies are in flight. */
	if (bio_data_dir(bio) == WRITE) {
		while (unlikely(!wait_for_in_progress(s, false)))
			; /* wait_for_in_progress() has slept */
	}

	down_read(&s->lock);
	dm_exception_table_lock(&lock);

	/* Re-check validity now that the locks are held. */
	if (!s->valid || (unlikely(s->snapshot_overflowed) &&
	    bio_data_dir(bio) == WRITE)) {
		r = DM_MAPIO_KILL;
		goto out_unlock;
	}

	if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
		if (s->discard_passdown_origin && dm_bio_get_target_bio_nr(bio)) {
			/*
			 * passdown discard to origin (without triggering
			 * snapshot exceptions via do_origin; doing so would
			 * defeat the goal of freeing space in origin that is
			 * implied by the "discard_passdown_origin" feature)
			 */
			bio_set_dev(bio, s->origin->bdev);
			track_chunk(s, bio, chunk);
			goto out_unlock;
		}
		/* discard to snapshot (target_bio_nr == 0) zeroes exceptions */
	}

	/* If the block is already remapped - use that, else remap it */
	e = dm_lookup_exception(&s->complete, chunk);
	if (e) {
		remap_exception(s, e, bio, chunk);
		/* A discard of exactly one chunk zeroes that chunk in the COW. */
		if (unlikely(bio_op(bio) == REQ_OP_DISCARD) &&
		    io_overlaps_chunk(s, bio)) {
			dm_exception_table_unlock(&lock);
			up_read(&s->lock);
			zero_exception(s, e, bio, chunk);
			r = DM_MAPIO_SUBMITTED; /* discard is not issued */
			goto out;
		}
		goto out_unlock;
	}

	if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
		/*
		 * If no exception exists, complete discard immediately
		 * otherwise it'll trigger copy-out.
		 */
		bio_endio(bio);
		r = DM_MAPIO_SUBMITTED;
		goto out_unlock;
	}

	/*
	 * Write to snapshot - higher level takes care of RW/RO
	 * flags so we should only get this if we are
	 * writable.
	 */
	if (bio_data_dir(bio) == WRITE) {
		pe = __lookup_pending_exception(s, chunk);
		if (!pe) {
			/*
			 * The table lock is dropped around the allocation, so
			 * both tables are checked again once it is retaken.
			 */
			dm_exception_table_unlock(&lock);
			pe = alloc_pending_exception(s);
			dm_exception_table_lock(&lock);

			e = dm_lookup_exception(&s->complete, chunk);
			if (e) {
				free_pending_exception(pe);
				remap_exception(s, e, bio, chunk);
				goto out_unlock;
			}

			pe = __find_pending_exception(s, pe, chunk);
			if (!pe) {
				/*
				 * No exception could be prepared.  Swap the
				 * read lock for the write lock to update the
				 * snapshot state.
				 */
				dm_exception_table_unlock(&lock);
				up_read(&s->lock);

				down_write(&s->lock);

				if (s->store->userspace_supports_overflow) {
					if (s->valid && !s->snapshot_overflowed) {
						s->snapshot_overflowed = 1;
						DMERR("Snapshot overflowed: Unable to allocate exception.");
					}
				} else
					__invalidate_snapshot(s, -ENOMEM);
				up_write(&s->lock);

				r = DM_MAPIO_KILL;
				goto out;
			}
		}

		remap_exception(s, &pe->e, bio, chunk);

		r = DM_MAPIO_SUBMITTED;

		/*
		 * A write covering the whole chunk is submitted directly
		 * instead of copying the chunk first.
		 */
		if (!pe->started && io_overlaps_chunk(s, bio)) {
			pe->started = 1;

			dm_exception_table_unlock(&lock);
			up_read(&s->lock);

			start_full_bio(pe, bio);
			goto out;
		}

		/* Otherwise queue the bio on the pending exception. */
		bio_list_add(&pe->snapshot_bios, bio);

		if (!pe->started) {
			/* this is protected by the exception table lock */
			pe->started = 1;

			dm_exception_table_unlock(&lock);
			up_read(&s->lock);

			start_copy(pe);
			goto out;
		}
	} else {
		/* Read of a chunk with no exception: send it to the origin. */
		bio_set_dev(bio, s->origin->bdev);
		track_chunk(s, bio, chunk);
	}

out_unlock:
	dm_exception_table_unlock(&lock);
	up_read(&s->lock);
out:
	return r;
}
2094 
2095 /*
2096  * A snapshot-merge target behaves like a combination of a snapshot
2097  * target and a snapshot-origin target.  It only generates new
2098  * exceptions in other snapshots and not in the one that is being
2099  * merged.
2100  *
2101  * For each chunk, if there is an existing exception, it is used to
2102  * redirect I/O to the cow device.  Otherwise I/O is sent to the origin,
2103  * which in turn might generate exceptions in other snapshots.
2104  * If merging is currently taking place on the chunk in question, the
2105  * I/O is deferred by adding it to s->bios_queued_during_merge.
2106  */
snapshot_merge_map(struct dm_target * ti,struct bio * bio)2107 static int snapshot_merge_map(struct dm_target *ti, struct bio *bio)
2108 {
2109 	struct dm_exception *e;
2110 	struct dm_snapshot *s = ti->private;
2111 	int r = DM_MAPIO_REMAPPED;
2112 	chunk_t chunk;
2113 
2114 	init_tracked_chunk(bio);
2115 
2116 	if (bio->bi_opf & REQ_PREFLUSH) {
2117 		if (!dm_bio_get_target_bio_nr(bio))
2118 			bio_set_dev(bio, s->origin->bdev);
2119 		else
2120 			bio_set_dev(bio, s->cow->bdev);
2121 		return DM_MAPIO_REMAPPED;
2122 	}
2123 
2124 	if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
2125 		/* Once merging, discards no longer effect change */
2126 		bio_endio(bio);
2127 		return DM_MAPIO_SUBMITTED;
2128 	}
2129 
2130 	chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector);
2131 
2132 	down_write(&s->lock);
2133 
2134 	/* Full merging snapshots are redirected to the origin */
2135 	if (!s->valid)
2136 		goto redirect_to_origin;
2137 
2138 	/* If the block is already remapped - use that */
2139 	e = dm_lookup_exception(&s->complete, chunk);
2140 	if (e) {
2141 		/* Queue writes overlapping with chunks being merged */
2142 		if (bio_data_dir(bio) == WRITE &&
2143 		    chunk >= s->first_merging_chunk &&
2144 		    chunk < (s->first_merging_chunk +
2145 			     s->num_merging_chunks)) {
2146 			bio_set_dev(bio, s->origin->bdev);
2147 			bio_list_add(&s->bios_queued_during_merge, bio);
2148 			r = DM_MAPIO_SUBMITTED;
2149 			goto out_unlock;
2150 		}
2151 
2152 		remap_exception(s, e, bio, chunk);
2153 
2154 		if (bio_data_dir(bio) == WRITE)
2155 			track_chunk(s, bio, chunk);
2156 		goto out_unlock;
2157 	}
2158 
2159 redirect_to_origin:
2160 	bio_set_dev(bio, s->origin->bdev);
2161 
2162 	if (bio_data_dir(bio) == WRITE) {
2163 		up_write(&s->lock);
2164 		return do_origin(s->origin, bio, false);
2165 	}
2166 
2167 out_unlock:
2168 	up_write(&s->lock);
2169 
2170 	return r;
2171 }
2172 
/*
 * End-io hook shared by the snapshot and snapshot-merge targets.
 *
 * If the bio was being tracked, stop tracking its chunk.  @error is not
 * examined.  Always returns DM_ENDIO_DONE.
 */
static int snapshot_end_io(struct dm_target *ti, struct bio *bio,
		blk_status_t *error)
{
	struct dm_snapshot *snap = ti->private;

	if (!is_bio_tracked(bio))
		return DM_ENDIO_DONE;

	stop_tracking_chunk(snap, bio);

	return DM_ENDIO_DONE;
}
2183 
snapshot_merge_presuspend(struct dm_target * ti)2184 static void snapshot_merge_presuspend(struct dm_target *ti)
2185 {
2186 	struct dm_snapshot *s = ti->private;
2187 
2188 	stop_merge(s);
2189 }
2190 
snapshot_preresume(struct dm_target * ti)2191 static int snapshot_preresume(struct dm_target *ti)
2192 {
2193 	int r = 0;
2194 	struct dm_snapshot *s = ti->private;
2195 	struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
2196 
2197 	down_read(&_origins_lock);
2198 	(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
2199 	if (snap_src && snap_dest) {
2200 		down_read(&snap_src->lock);
2201 		if (s == snap_src) {
2202 			DMERR("Unable to resume snapshot source until handover completes.");
2203 			r = -EINVAL;
2204 		} else if (!dm_suspended(snap_src->ti)) {
2205 			DMERR("Unable to perform snapshot handover until source is suspended.");
2206 			r = -EINVAL;
2207 		}
2208 		up_read(&snap_src->lock);
2209 	}
2210 	up_read(&_origins_lock);
2211 
2212 	return r;
2213 }
2214 
snapshot_resume(struct dm_target * ti)2215 static void snapshot_resume(struct dm_target *ti)
2216 {
2217 	struct dm_snapshot *s = ti->private;
2218 	struct dm_snapshot *snap_src = NULL, *snap_dest = NULL, *snap_merging = NULL;
2219 	struct dm_origin *o;
2220 	struct mapped_device *origin_md = NULL;
2221 	bool must_restart_merging = false;
2222 
2223 	down_read(&_origins_lock);
2224 
2225 	o = __lookup_dm_origin(s->origin->bdev);
2226 	if (o)
2227 		origin_md = dm_table_get_md(o->ti->table);
2228 	if (!origin_md) {
2229 		(void) __find_snapshots_sharing_cow(s, NULL, NULL, &snap_merging);
2230 		if (snap_merging)
2231 			origin_md = dm_table_get_md(snap_merging->ti->table);
2232 	}
2233 	if (origin_md == dm_table_get_md(ti->table))
2234 		origin_md = NULL;
2235 	if (origin_md) {
2236 		if (dm_hold(origin_md))
2237 			origin_md = NULL;
2238 	}
2239 
2240 	up_read(&_origins_lock);
2241 
2242 	if (origin_md) {
2243 		dm_internal_suspend_fast(origin_md);
2244 		if (snap_merging && test_bit(RUNNING_MERGE, &snap_merging->state_bits)) {
2245 			must_restart_merging = true;
2246 			stop_merge(snap_merging);
2247 		}
2248 	}
2249 
2250 	down_read(&_origins_lock);
2251 
2252 	(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
2253 	if (snap_src && snap_dest) {
2254 		down_write(&snap_src->lock);
2255 		down_write_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING);
2256 		__handover_exceptions(snap_src, snap_dest);
2257 		up_write(&snap_dest->lock);
2258 		up_write(&snap_src->lock);
2259 	}
2260 
2261 	up_read(&_origins_lock);
2262 
2263 	if (origin_md) {
2264 		if (must_restart_merging)
2265 			start_merge(snap_merging);
2266 		dm_internal_resume_fast(origin_md);
2267 		dm_put(origin_md);
2268 	}
2269 
2270 	/* Now we have correct chunk size, reregister */
2271 	reregister_snapshot(s);
2272 
2273 	down_write(&s->lock);
2274 	s->active = 1;
2275 	up_write(&s->lock);
2276 }
2277 
get_origin_minimum_chunksize(struct block_device * bdev)2278 static uint32_t get_origin_minimum_chunksize(struct block_device *bdev)
2279 {
2280 	uint32_t min_chunksize;
2281 
2282 	down_read(&_origins_lock);
2283 	min_chunksize = __minimum_chunk_size(__lookup_origin(bdev));
2284 	up_read(&_origins_lock);
2285 
2286 	return min_chunksize;
2287 }
2288 
snapshot_merge_resume(struct dm_target * ti)2289 static void snapshot_merge_resume(struct dm_target *ti)
2290 {
2291 	struct dm_snapshot *s = ti->private;
2292 
2293 	/*
2294 	 * Handover exceptions from existing snapshot.
2295 	 */
2296 	snapshot_resume(ti);
2297 
2298 	/*
2299 	 * snapshot-merge acts as an origin, so set ti->max_io_len
2300 	 */
2301 	ti->max_io_len = get_origin_minimum_chunksize(s->origin->bdev);
2302 
2303 	start_merge(s);
2304 }
2305 
/*
 * Status hook shared by the snapshot and snapshot-merge targets.
 * Output is appended to @result (at most @maxlen bytes) via DMEMIT.
 *
 * STATUSTYPE_INFO:  "Invalid", "Merge failed", "Overflow", the usage
 *                   figures reported by the exception store, or
 *                   "Unknown" when the store has no usage method.
 * STATUSTYPE_TABLE: origin and cow device names, the exception store's
 *                   own status, then the optional feature arguments.
 * STATUSTYPE_IMA:   target name/version followed by key=value pairs.
 *
 * @status_flags is not used.
 */
static void snapshot_status(struct dm_target *ti, status_type_t type,
			    unsigned int status_flags, char *result, unsigned int maxlen)
{
	struct dm_snapshot *s = ti->private;
	unsigned int sz = 0;	/* write offset; DMEMIT relies on this name */

	switch (type) {
	case STATUSTYPE_INFO:
		/* The state flags and store usage are read under s->lock */
		down_write(&s->lock);

		if (!s->valid) {
			DMEMIT("Invalid");
		} else if (s->merge_failed) {
			DMEMIT("Merge failed");
		} else if (s->snapshot_overflowed) {
			DMEMIT("Overflow");
		} else if (!s->store->type->usage) {
			DMEMIT("Unknown");
		} else {
			sector_t total, allocated, metadata;

			s->store->type->usage(s->store, &total, &allocated,
					      &metadata);
			DMEMIT("%llu/%llu %llu",
			       (unsigned long long)allocated,
			       (unsigned long long)total,
			       (unsigned long long)metadata);
		}

		up_write(&s->lock);
		break;

	case STATUSTYPE_TABLE: {
		unsigned int nr_features;

		DMEMIT("%s %s", s->origin->name, s->cow->name);

		/* The exception store appends its own part of the line */
		sz += s->store->type->status(s->store, type, result + sz,
					     maxlen - sz);

		nr_features = s->discard_zeroes_cow + s->discard_passdown_origin;
		if (nr_features) {
			DMEMIT(" %u", nr_features);
			if (s->discard_zeroes_cow)
				DMEMIT(" discard_zeroes_cow");
			if (s->discard_passdown_origin)
				DMEMIT(" discard_passdown_origin");
		}
		break;
	}

	case STATUSTYPE_IMA:
		DMEMIT_TARGET_NAME_VERSION(ti->type);
		DMEMIT(",snap_origin_name=%s", s->origin->name);
		DMEMIT(",snap_cow_name=%s", s->cow->name);
		DMEMIT(",snap_valid=%c", s->valid ? 'y' : 'n');
		DMEMIT(",snap_merge_failed=%c", s->merge_failed ? 'y' : 'n');
		DMEMIT(",snapshot_overflowed=%c", s->snapshot_overflowed ? 'y' : 'n');
		DMEMIT(";");
		break;
	}
}
2374 
/*
 * Call @fn for the origin device (start 0, length ti->len) and, if that
 * returned 0, for the cow device (start 0, length get_dev_size() of the
 * cow block device).
 *
 * Returns the first nonzero result of @fn, else the result of the
 * second call.
 */
static int snapshot_iterate_devices(struct dm_target *ti,
				    iterate_devices_callout_fn fn, void *data)
{
	struct dm_snapshot *s = ti->private;
	int ret;

	ret = fn(ti, s->origin, 0, ti->len, data);
	if (ret)
		return ret;

	return fn(ti, s->cow, 0, get_dev_size(s->cow->bdev), data);
}
2388 
snapshot_io_hints(struct dm_target * ti,struct queue_limits * limits)2389 static void snapshot_io_hints(struct dm_target *ti, struct queue_limits *limits)
2390 {
2391 	struct dm_snapshot *snap = ti->private;
2392 
2393 	if (snap->discard_zeroes_cow) {
2394 		struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
2395 
2396 		down_read(&_origins_lock);
2397 
2398 		(void) __find_snapshots_sharing_cow(snap, &snap_src, &snap_dest, NULL);
2399 		if (snap_src && snap_dest)
2400 			snap = snap_src;
2401 
2402 		/* All discards are split on chunk_size boundary */
2403 		limits->discard_granularity = snap->store->chunk_size;
2404 		limits->max_hw_discard_sectors = snap->store->chunk_size;
2405 
2406 		up_read(&_origins_lock);
2407 	}
2408 }
2409 
/*
 *---------------------------------------------------------------
 * Origin methods
 *---------------------------------------------------------------
 */
2415 /*
2416  * If no exceptions need creating, DM_MAPIO_REMAPPED is returned and any
2417  * supplied bio was ignored.  The caller may submit it immediately.
2418  * (No remapping actually occurs as the origin is always a direct linear
2419  * map.)
2420  *
2421  * If further exceptions are required, DM_MAPIO_SUBMITTED is returned
2422  * and any supplied bio is added to a list to be submitted once all
2423  * the necessary exceptions exist.
2424  */
__origin_write(struct list_head * snapshots,sector_t sector,struct bio * bio)2425 static int __origin_write(struct list_head *snapshots, sector_t sector,
2426 			  struct bio *bio)
2427 {
2428 	int r = DM_MAPIO_REMAPPED;
2429 	struct dm_snapshot *snap;
2430 	struct dm_exception *e;
2431 	struct dm_snap_pending_exception *pe, *pe2;
2432 	struct dm_snap_pending_exception *pe_to_start_now = NULL;
2433 	struct dm_snap_pending_exception *pe_to_start_last = NULL;
2434 	struct dm_exception_table_lock lock;
2435 	chunk_t chunk;
2436 
2437 	/* Do all the snapshots on this origin */
2438 	list_for_each_entry(snap, snapshots, list) {
2439 		/*
2440 		 * Don't make new exceptions in a merging snapshot
2441 		 * because it has effectively been deleted
2442 		 */
2443 		if (dm_target_is_snapshot_merge(snap->ti))
2444 			continue;
2445 
2446 		/* Nothing to do if writing beyond end of snapshot */
2447 		if (sector >= dm_table_get_size(snap->ti->table))
2448 			continue;
2449 
2450 		/*
2451 		 * Remember, different snapshots can have
2452 		 * different chunk sizes.
2453 		 */
2454 		chunk = sector_to_chunk(snap->store, sector);
2455 		dm_exception_table_lock_init(snap, chunk, &lock);
2456 
2457 		down_read(&snap->lock);
2458 		dm_exception_table_lock(&lock);
2459 
2460 		/* Only deal with valid and active snapshots */
2461 		if (!snap->valid || !snap->active)
2462 			goto next_snapshot;
2463 
2464 		pe = __lookup_pending_exception(snap, chunk);
2465 		if (!pe) {
2466 			/*
2467 			 * Check exception table to see if block is already
2468 			 * remapped in this snapshot and trigger an exception
2469 			 * if not.
2470 			 */
2471 			e = dm_lookup_exception(&snap->complete, chunk);
2472 			if (e)
2473 				goto next_snapshot;
2474 
2475 			dm_exception_table_unlock(&lock);
2476 			pe = alloc_pending_exception(snap);
2477 			dm_exception_table_lock(&lock);
2478 
2479 			pe2 = __lookup_pending_exception(snap, chunk);
2480 
2481 			if (!pe2) {
2482 				e = dm_lookup_exception(&snap->complete, chunk);
2483 				if (e) {
2484 					free_pending_exception(pe);
2485 					goto next_snapshot;
2486 				}
2487 
2488 				pe = __insert_pending_exception(snap, pe, chunk);
2489 				if (!pe) {
2490 					dm_exception_table_unlock(&lock);
2491 					up_read(&snap->lock);
2492 
2493 					invalidate_snapshot(snap, -ENOMEM);
2494 					continue;
2495 				}
2496 			} else {
2497 				free_pending_exception(pe);
2498 				pe = pe2;
2499 			}
2500 		}
2501 
2502 		r = DM_MAPIO_SUBMITTED;
2503 
2504 		/*
2505 		 * If an origin bio was supplied, queue it to wait for the
2506 		 * completion of this exception, and start this one last,
2507 		 * at the end of the function.
2508 		 */
2509 		if (bio) {
2510 			bio_list_add(&pe->origin_bios, bio);
2511 			bio = NULL;
2512 
2513 			if (!pe->started) {
2514 				pe->started = 1;
2515 				pe_to_start_last = pe;
2516 			}
2517 		}
2518 
2519 		if (!pe->started) {
2520 			pe->started = 1;
2521 			pe_to_start_now = pe;
2522 		}
2523 
2524 next_snapshot:
2525 		dm_exception_table_unlock(&lock);
2526 		up_read(&snap->lock);
2527 
2528 		if (pe_to_start_now) {
2529 			start_copy(pe_to_start_now);
2530 			pe_to_start_now = NULL;
2531 		}
2532 	}
2533 
2534 	/*
2535 	 * Submit the exception against which the bio is queued last,
2536 	 * to give the other exceptions a head start.
2537 	 */
2538 	if (pe_to_start_last)
2539 		start_copy(pe_to_start_last);
2540 
2541 	return r;
2542 }
2543 
2544 /*
2545  * Called on a write from the origin driver.
2546  */
do_origin(struct dm_dev * origin,struct bio * bio,bool limit)2547 static int do_origin(struct dm_dev *origin, struct bio *bio, bool limit)
2548 {
2549 	struct origin *o;
2550 	int r = DM_MAPIO_REMAPPED;
2551 
2552 again:
2553 	down_read(&_origins_lock);
2554 	o = __lookup_origin(origin->bdev);
2555 	if (o) {
2556 		if (limit) {
2557 			struct dm_snapshot *s;
2558 
2559 			list_for_each_entry(s, &o->snapshots, list)
2560 				if (unlikely(!wait_for_in_progress(s, true)))
2561 					goto again;
2562 		}
2563 
2564 		r = __origin_write(&o->snapshots, bio->bi_iter.bi_sector, bio);
2565 	}
2566 	up_read(&_origins_lock);
2567 
2568 	return r;
2569 }
2570 
2571 /*
2572  * Trigger exceptions in all non-merging snapshots.
2573  *
2574  * The chunk size of the merging snapshot may be larger than the chunk
2575  * size of some other snapshot so we may need to reallocate multiple
2576  * chunks in other snapshots.
2577  *
2578  * We scan all the overlapping exceptions in the other snapshots.
2579  * Returns 1 if anything was reallocated and must be waited for,
2580  * otherwise returns 0.
2581  *
2582  * size must be a multiple of merging_snap's chunk_size.
2583  */
origin_write_extent(struct dm_snapshot * merging_snap,sector_t sector,unsigned int size)2584 static int origin_write_extent(struct dm_snapshot *merging_snap,
2585 			       sector_t sector, unsigned int size)
2586 {
2587 	int must_wait = 0;
2588 	sector_t n;
2589 	struct origin *o;
2590 
2591 	/*
2592 	 * The origin's __minimum_chunk_size() got stored in max_io_len
2593 	 * by snapshot_merge_resume().
2594 	 */
2595 	down_read(&_origins_lock);
2596 	o = __lookup_origin(merging_snap->origin->bdev);
2597 	for (n = 0; n < size; n += merging_snap->ti->max_io_len)
2598 		if (__origin_write(&o->snapshots, sector + n, NULL) ==
2599 		    DM_MAPIO_SUBMITTED)
2600 			must_wait = 1;
2601 	up_read(&_origins_lock);
2602 
2603 	return must_wait;
2604 }
2605 
/*
 * Origin: maps a linear range of a device, with hooks for snapshotting.
 */
2609 
2610 /*
2611  * Construct an origin mapping: <dev_path>
2612  * The context for an origin is merely a 'struct dm_dev *'
2613  * pointing to the real device.
2614  */
origin_ctr(struct dm_target * ti,unsigned int argc,char ** argv)2615 static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
2616 {
2617 	int r;
2618 	struct dm_origin *o;
2619 
2620 	if (argc != 1) {
2621 		ti->error = "origin: incorrect number of arguments";
2622 		return -EINVAL;
2623 	}
2624 
2625 	o = kmalloc_obj(struct dm_origin);
2626 	if (!o) {
2627 		ti->error = "Cannot allocate private origin structure";
2628 		r = -ENOMEM;
2629 		goto bad_alloc;
2630 	}
2631 
2632 	r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &o->dev);
2633 	if (r) {
2634 		ti->error = "Cannot get target device";
2635 		goto bad_open;
2636 	}
2637 
2638 	o->ti = ti;
2639 	ti->private = o;
2640 	ti->num_flush_bios = 1;
2641 
2642 	return 0;
2643 
2644 bad_open:
2645 	kfree(o);
2646 bad_alloc:
2647 	return r;
2648 }
2649 
origin_dtr(struct dm_target * ti)2650 static void origin_dtr(struct dm_target *ti)
2651 {
2652 	struct dm_origin *o = ti->private;
2653 
2654 	dm_put_device(ti, o->dev);
2655 	kfree(o);
2656 }
2657 
origin_map(struct dm_target * ti,struct bio * bio)2658 static int origin_map(struct dm_target *ti, struct bio *bio)
2659 {
2660 	struct dm_origin *o = ti->private;
2661 	unsigned int available_sectors;
2662 
2663 	bio_set_dev(bio, o->dev->bdev);
2664 
2665 	if (unlikely(bio->bi_opf & REQ_PREFLUSH))
2666 		return DM_MAPIO_REMAPPED;
2667 
2668 	if (bio_data_dir(bio) != WRITE)
2669 		return DM_MAPIO_REMAPPED;
2670 
2671 	available_sectors = o->split_boundary -
2672 		((unsigned int)bio->bi_iter.bi_sector & (o->split_boundary - 1));
2673 
2674 	if (bio_sectors(bio) > available_sectors)
2675 		dm_accept_partial_bio(bio, available_sectors);
2676 
2677 	/* Only tell snapshots if this is a write */
2678 	return do_origin(o->dev, bio, true);
2679 }
2680 
2681 /*
2682  * Set the target "max_io_len" field to the minimum of all the snapshots'
2683  * chunk sizes.
2684  */
origin_resume(struct dm_target * ti)2685 static void origin_resume(struct dm_target *ti)
2686 {
2687 	struct dm_origin *o = ti->private;
2688 
2689 	o->split_boundary = get_origin_minimum_chunksize(o->dev->bdev);
2690 
2691 	down_write(&_origins_lock);
2692 	__insert_dm_origin(o);
2693 	up_write(&_origins_lock);
2694 }
2695 
origin_postsuspend(struct dm_target * ti)2696 static void origin_postsuspend(struct dm_target *ti)
2697 {
2698 	struct dm_origin *o = ti->private;
2699 
2700 	down_write(&_origins_lock);
2701 	__remove_dm_origin(o);
2702 	up_write(&_origins_lock);
2703 }
2704 
/*
 * Status hook of the snapshot-origin target.
 *
 * STATUSTYPE_TABLE writes the name of the underlying device into
 * @result (bounded by @maxlen); STATUSTYPE_INFO and STATUSTYPE_IMA
 * produce an empty string.  @status_flags is not used.
 */
static void origin_status(struct dm_target *ti, status_type_t type,
			  unsigned int status_flags, char *result, unsigned int maxlen)
{
	struct dm_origin *org = ti->private;

	switch (type) {
	case STATUSTYPE_TABLE:
		snprintf(result, maxlen, "%s", org->dev->name);
		break;

	case STATUSTYPE_INFO:
	case STATUSTYPE_IMA:
		/* Nothing to report for these types */
		result[0] = '\0';
		break;
	}
}
2723 
/*
 * Call @fn once for the underlying device, covering the range from 0 up
 * to ti->len, and return its result.
 */
static int origin_iterate_devices(struct dm_target *ti,
				  iterate_devices_callout_fn fn, void *data)
{
	struct dm_origin *org = ti->private;
	struct dm_dev *dev = org->dev;

	return fn(ti, dev, 0, ti->len, data);
}
2731 
2732 static struct target_type origin_target = {
2733 	.name    = "snapshot-origin",
2734 	.version = {1, 9, 0},
2735 	.module  = THIS_MODULE,
2736 	.ctr     = origin_ctr,
2737 	.dtr     = origin_dtr,
2738 	.map     = origin_map,
2739 	.resume  = origin_resume,
2740 	.postsuspend = origin_postsuspend,
2741 	.status  = origin_status,
2742 	.iterate_devices = origin_iterate_devices,
2743 };
2744 
2745 static struct target_type snapshot_target = {
2746 	.name    = "snapshot",
2747 	.version = {1, 16, 0},
2748 	.module  = THIS_MODULE,
2749 	.ctr     = snapshot_ctr,
2750 	.dtr     = snapshot_dtr,
2751 	.map     = snapshot_map,
2752 	.end_io  = snapshot_end_io,
2753 	.preresume  = snapshot_preresume,
2754 	.resume  = snapshot_resume,
2755 	.status  = snapshot_status,
2756 	.iterate_devices = snapshot_iterate_devices,
2757 	.io_hints = snapshot_io_hints,
2758 };
2759 
2760 static struct target_type merge_target = {
2761 	.name    = dm_snapshot_merge_target_name,
2762 	.version = {1, 5, 0},
2763 	.module  = THIS_MODULE,
2764 	.ctr     = snapshot_ctr,
2765 	.dtr     = snapshot_dtr,
2766 	.map     = snapshot_merge_map,
2767 	.end_io  = snapshot_end_io,
2768 	.presuspend = snapshot_merge_presuspend,
2769 	.preresume  = snapshot_preresume,
2770 	.resume  = snapshot_merge_resume,
2771 	.status  = snapshot_status,
2772 	.iterate_devices = snapshot_iterate_devices,
2773 	.io_hints = snapshot_io_hints,
2774 };
2775 
dm_snapshot_init(void)2776 static int __init dm_snapshot_init(void)
2777 {
2778 	int r;
2779 
2780 	r = dm_exception_store_init();
2781 	if (r) {
2782 		DMERR("Failed to initialize exception stores");
2783 		return r;
2784 	}
2785 
2786 	r = init_origin_hash();
2787 	if (r) {
2788 		DMERR("init_origin_hash failed.");
2789 		goto bad_origin_hash;
2790 	}
2791 
2792 	exception_cache = KMEM_CACHE(dm_exception, 0);
2793 	if (!exception_cache) {
2794 		DMERR("Couldn't create exception cache.");
2795 		r = -ENOMEM;
2796 		goto bad_exception_cache;
2797 	}
2798 
2799 	pending_cache = KMEM_CACHE(dm_snap_pending_exception, 0);
2800 	if (!pending_cache) {
2801 		DMERR("Couldn't create pending cache.");
2802 		r = -ENOMEM;
2803 		goto bad_pending_cache;
2804 	}
2805 
2806 	r = dm_register_target(&snapshot_target);
2807 	if (r < 0)
2808 		goto bad_register_snapshot_target;
2809 
2810 	r = dm_register_target(&origin_target);
2811 	if (r < 0)
2812 		goto bad_register_origin_target;
2813 
2814 	r = dm_register_target(&merge_target);
2815 	if (r < 0)
2816 		goto bad_register_merge_target;
2817 
2818 	return 0;
2819 
2820 bad_register_merge_target:
2821 	dm_unregister_target(&origin_target);
2822 bad_register_origin_target:
2823 	dm_unregister_target(&snapshot_target);
2824 bad_register_snapshot_target:
2825 	kmem_cache_destroy(pending_cache);
2826 bad_pending_cache:
2827 	kmem_cache_destroy(exception_cache);
2828 bad_exception_cache:
2829 	exit_origin_hash();
2830 bad_origin_hash:
2831 	dm_exception_store_exit();
2832 
2833 	return r;
2834 }
2835 
dm_snapshot_exit(void)2836 static void __exit dm_snapshot_exit(void)
2837 {
2838 	dm_unregister_target(&snapshot_target);
2839 	dm_unregister_target(&origin_target);
2840 	dm_unregister_target(&merge_target);
2841 
2842 	exit_origin_hash();
2843 	kmem_cache_destroy(pending_cache);
2844 	kmem_cache_destroy(exception_cache);
2845 
2846 	dm_exception_store_exit();
2847 }
2848 
2849 /* Module hooks */
2850 module_init(dm_snapshot_init);
2851 module_exit(dm_snapshot_exit);
2852 
2853 MODULE_DESCRIPTION(DM_NAME " snapshot target");
2854 MODULE_AUTHOR("Joe Thornber");
2855 MODULE_LICENSE("GPL");
2856 MODULE_ALIAS("dm-snapshot-origin");
2857 MODULE_ALIAS("dm-snapshot-merge");
2858