xref: /linux/drivers/md/dm-cache-target.c (revision 64f0962c33d52524deb32d7c34ab8b2c271ee1a3)
1 /*
2  * Copyright (C) 2012 Red Hat. All rights reserved.
3  *
4  * This file is released under the GPL.
5  */
6 
7 #include "dm.h"
8 #include "dm-bio-prison.h"
9 #include "dm-cache-metadata.h"
10 
11 #include <linux/dm-io.h>
12 #include <linux/dm-kcopyd.h>
13 #include <linux/init.h>
14 #include <linux/mempool.h>
15 #include <linux/module.h>
16 #include <linux/slab.h>
17 #include <linux/vmalloc.h>
18 
19 #define DM_MSG_PREFIX "cache"
20 
21 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
22 	"A percentage of time allocated for copying to and/or from cache");
23 
24 /*----------------------------------------------------------------*/
25 
26 /*
27  * Glossary:
28  *
29  * oblock: index of an origin block
30  * cblock: index of a cache block
31  * promotion: movement of a block from origin to cache
32  * demotion: movement of a block from cache to origin
33  * migration: movement of a block between the origin and cache device,
34  *	      either direction
35  */
36 
37 /*----------------------------------------------------------------*/
38 
39 static size_t bitset_size_in_bytes(unsigned nr_entries)
40 {
41 	return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
42 }
43 
44 static unsigned long *alloc_bitset(unsigned nr_entries)
45 {
46 	size_t s = bitset_size_in_bytes(nr_entries);
47 	return vzalloc(s);
48 }
49 
50 static void clear_bitset(void *bitset, unsigned nr_entries)
51 {
52 	size_t s = bitset_size_in_bytes(nr_entries);
53 	memset(bitset, 0, s);
54 }
55 
56 static void free_bitset(unsigned long *bits)
57 {
58 	vfree(bits);
59 }
60 
61 /*----------------------------------------------------------------*/
62 
63 #define PRISON_CELLS 1024
64 #define MIGRATION_POOL_SIZE 128
65 #define COMMIT_PERIOD HZ
66 #define MIGRATION_COUNT_WINDOW 10
67 
68 /*
69  * The block size of the device holding cache data must be >= 32KB
70  */
71 #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
72 
73 /*
74  * FIXME: the cache is read/write for the time being.
75  */
76 enum cache_mode {
77 	CM_WRITE,		/* metadata may be changed */
78 	CM_READ_ONLY,		/* metadata may not be changed */
79 };
80 
81 struct cache_features {
82 	enum cache_mode mode;
83 	bool write_through:1;
84 };
85 
86 struct cache_stats {
87 	atomic_t read_hit;
88 	atomic_t read_miss;
89 	atomic_t write_hit;
90 	atomic_t write_miss;
91 	atomic_t demotion;
92 	atomic_t promotion;
93 	atomic_t copies_avoided;
94 	atomic_t cache_cell_clash;
95 	atomic_t commit_count;
96 	atomic_t discard_count;
97 };
98 
99 struct cache {
100 	struct dm_target *ti;
101 	struct dm_target_callbacks callbacks;
102 
103 	/*
104 	 * Metadata is written to this device.
105 	 */
106 	struct dm_dev *metadata_dev;
107 
108 	/*
109 	 * The slower of the two data devices.  Typically a spindle.
110 	 */
111 	struct dm_dev *origin_dev;
112 
113 	/*
114 	 * The faster of the two data devices.  Typically an SSD.
115 	 */
116 	struct dm_dev *cache_dev;
117 
118 	/*
119 	 * Cache features such as write-through.
120 	 */
121 	struct cache_features features;
122 
123 	/*
124 	 * Size of the origin device in _complete_ blocks and native sectors.
125 	 */
126 	dm_oblock_t origin_blocks;
127 	sector_t origin_sectors;
128 
129 	/*
130 	 * Size of the cache device in blocks.
131 	 */
132 	dm_cblock_t cache_size;
133 
134 	/*
135 	 * Fields for converting from sectors to blocks.
136 	 */
137 	uint32_t sectors_per_block;
138 	int sectors_per_block_shift;
139 
140 	struct dm_cache_metadata *cmd;
141 
142 	spinlock_t lock;
143 	struct bio_list deferred_bios;
144 	struct bio_list deferred_flush_bios;
145 	struct bio_list deferred_writethrough_bios;
146 	struct list_head quiesced_migrations;
147 	struct list_head completed_migrations;
148 	struct list_head need_commit_migrations;
149 	sector_t migration_threshold;
150 	atomic_t nr_migrations;
151 	wait_queue_head_t migration_wait;
152 
153 	/*
154 	 * cache_size entries, dirty if set
155 	 */
156 	dm_cblock_t nr_dirty;
157 	unsigned long *dirty_bitset;
158 
159 	/*
160 	 * origin_blocks entries, discarded if set.
161 	 */
162 	uint32_t discard_block_size; /* a power of 2 times sectors per block */
163 	dm_dblock_t discard_nr_blocks;
164 	unsigned long *discard_bitset;
165 
166 	struct dm_kcopyd_client *copier;
167 	struct workqueue_struct *wq;
168 	struct work_struct worker;
169 
170 	struct delayed_work waker;
171 	unsigned long last_commit_jiffies;
172 
173 	struct dm_bio_prison *prison;
174 	struct dm_deferred_set *all_io_ds;
175 
176 	mempool_t *migration_pool;
177 	struct dm_cache_migration *next_migration;
178 
179 	struct dm_cache_policy *policy;
180 	unsigned policy_nr_args;
181 
182 	bool need_tick_bio:1;
183 	bool sized:1;
184 	bool quiescing:1;
185 	bool commit_requested:1;
186 	bool loaded_mappings:1;
187 	bool loaded_discards:1;
188 
189 	struct cache_stats stats;
190 
191 	/*
192 	 * Rather than reconstructing the table line for the status, we just
193 	 * save it and regurgitate it when needed.
194 	 */
195 	unsigned nr_ctr_args;
196 	const char **ctr_args;
197 };
198 
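/*
 * Per-bio state, held in the per-bio data area that device-mapper reserves
 * for us (ti->per_bio_data_size is set to sizeof(struct per_bio_data) in
 * cache_create()).  The writethrough fields are only valid for bios that
 * were remapped via remap_to_origin_then_cache().
 */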
199 struct per_bio_data {
200 	bool tick:1;
201 	unsigned req_nr:2;
202 	struct dm_deferred_entry *all_io_entry;
203 
204 	/* writethrough fields */
205 	struct cache *cache;
206 	dm_cblock_t cblock;
207 	bio_end_io_t *saved_bi_end_io;
208 };
209 
210 struct dm_cache_migration {
211 	struct list_head list;
212 	struct cache *cache;
213 
214 	unsigned long start_jiffies;
215 	dm_oblock_t old_oblock;
216 	dm_oblock_t new_oblock;
217 	dm_cblock_t cblock;
218 
219 	bool err:1;
220 	bool writeback:1;
221 	bool demote:1;
222 	bool promote:1;
223 
224 	struct dm_bio_prison_cell *old_ocell;
225 	struct dm_bio_prison_cell *new_ocell;
226 };
227 
228 /*
229  * Processing a bio in the worker thread may require these memory
230  * allocations.  We prealloc to avoid deadlocks: the worker thread that
231  * frees them back to the mempool must never block waiting to allocate one.
232  */
233 struct prealloc {
234 	struct dm_cache_migration *mg;
235 	struct dm_bio_prison_cell *cell1;
236 	struct dm_bio_prison_cell *cell2;
237 };
238 
239 static void wake_worker(struct cache *cache)
240 {
241 	queue_work(cache->wq, &cache->worker);
242 }
243 
244 /*----------------------------------------------------------------*/
245 
246 static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache)
247 {
248 	/* FIXME: change to use a local slab. */
249 	return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT);
250 }
251 
252 static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell)
253 {
254 	dm_bio_prison_free_cell(cache->prison, cell);
255 }
256 
257 static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
258 {
259 	if (!p->mg) {
260 		p->mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
261 		if (!p->mg)
262 			return -ENOMEM;
263 	}
264 
265 	if (!p->cell1) {
266 		p->cell1 = alloc_prison_cell(cache);
267 		if (!p->cell1)
268 			return -ENOMEM;
269 	}
270 
271 	if (!p->cell2) {
272 		p->cell2 = alloc_prison_cell(cache);
273 		if (!p->cell2)
274 			return -ENOMEM;
275 	}
276 
277 	return 0;
278 }
279 
280 static void prealloc_free_structs(struct cache *cache, struct prealloc *p)
281 {
282 	if (p->cell2)
283 		free_prison_cell(cache, p->cell2);
284 
285 	if (p->cell1)
286 		free_prison_cell(cache, p->cell1);
287 
288 	if (p->mg)
289 		mempool_free(p->mg, cache->migration_pool);
290 }
291 
292 static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p)
293 {
294 	struct dm_cache_migration *mg = p->mg;
295 
296 	BUG_ON(!mg);
297 	p->mg = NULL;
298 
299 	return mg;
300 }
301 
302 /*
303  * You must have a cell within the prealloc struct to return.  If not, this
304  * function will BUG() rather than return NULL.
305  */
306 static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p)
307 {
308 	struct dm_bio_prison_cell *r = NULL;
309 
310 	if (p->cell1) {
311 		r = p->cell1;
312 		p->cell1 = NULL;
313 
314 	} else if (p->cell2) {
315 		r = p->cell2;
316 		p->cell2 = NULL;
317 	} else
318 		BUG();
319 
320 	return r;
321 }
322 
323 /*
324  * You can't have more than two cells in a prealloc struct.  BUG() will be
325  * called if you try to overfill.
326  */
327 static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell)
328 {
329 	if (!p->cell2)
330 		p->cell2 = cell;
331 
332 	else if (!p->cell1)
333 		p->cell1 = cell;
334 
335 	else
336 		BUG();
337 }
338 
339 /*----------------------------------------------------------------*/
340 
341 static void build_key(dm_oblock_t oblock, struct dm_cell_key *key)
342 {
343 	key->virtual = 0;
344 	key->dev = 0;
345 	key->block = from_oblock(oblock);
346 }
347 
348 /*
349  * The caller hands in a preallocated cell, and a free function for it.
350  * The cell will be freed if there's an error, or if it wasn't used because
351  * a cell with that key already exists.
352  */
353 typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell);
354 
355 static int bio_detain(struct cache *cache, dm_oblock_t oblock,
356 		      struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
357 		      cell_free_fn free_fn, void *free_context,
358 		      struct dm_bio_prison_cell **cell_result)
359 {
360 	int r;
361 	struct dm_cell_key key;
362 
363 	build_key(oblock, &key);
364 	r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result);
365 	if (r)
366 		free_fn(free_context, cell_prealloc);
367 
368 	return r;
369 }
370 
371 static int get_cell(struct cache *cache,
372 		    dm_oblock_t oblock,
373 		    struct prealloc *structs,
374 		    struct dm_bio_prison_cell **cell_result)
375 {
376 	int r;
377 	struct dm_cell_key key;
378 	struct dm_bio_prison_cell *cell_prealloc;
379 
380 	cell_prealloc = prealloc_get_cell(structs);
381 
382 	build_key(oblock, &key);
383 	r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result);
384 	if (r)
385 		prealloc_put_cell(structs, cell_prealloc);
386 
387 	return r;
388 }
389 
390 /*----------------------------------------------------------------*/
391 
392 static bool is_dirty(struct cache *cache, dm_cblock_t b)
393 {
394 	return test_bit(from_cblock(b), cache->dirty_bitset);
395 }
396 
397 static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
398 {
399 	if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
400 		cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) + 1);
401 		policy_set_dirty(cache->policy, oblock);
402 	}
403 }
404 
405 static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
406 {
407 	if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
408 		policy_clear_dirty(cache->policy, oblock);
409 		cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) - 1);
410 		if (!from_cblock(cache->nr_dirty))
411 			dm_table_event(cache->ti->table);
412 	}
413 }
414 
415 /*----------------------------------------------------------------*/
416 static bool block_size_is_power_of_two(struct cache *cache)
417 {
418 	return cache->sectors_per_block_shift >= 0;
419 }
420 
421 static dm_block_t block_div(dm_block_t b, uint32_t n)
422 {
423 	do_div(b, n);
424 
425 	return b;
426 }
427 
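/*
 * Scale an origin block index down to a discard block index.  The discard
 * block size is held in sectors, so it is first converted into a number of
 * cache-sized blocks.  E.g. (hypothetical sizes) with 1024-sector discard
 * blocks and 128-sector cache blocks, each discard block covers 8 blocks,
 * so oblock 100 lands in dblock 12.
 */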
428 static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
429 {
430 	uint32_t discard_blocks = cache->discard_block_size;
431 	dm_block_t b = from_oblock(oblock);
432 
433 	if (!block_size_is_power_of_two(cache))
434 		discard_blocks = discard_blocks / cache->sectors_per_block;
435 	else
436 		discard_blocks >>= cache->sectors_per_block_shift;
437 
438 	b = block_div(b, discard_blocks);
439 
440 	return to_dblock(b);
441 }
442 
443 static void set_discard(struct cache *cache, dm_dblock_t b)
444 {
445 	unsigned long flags;
446 
447 	atomic_inc(&cache->stats.discard_count);
448 
449 	spin_lock_irqsave(&cache->lock, flags);
450 	set_bit(from_dblock(b), cache->discard_bitset);
451 	spin_unlock_irqrestore(&cache->lock, flags);
452 }
453 
454 static void clear_discard(struct cache *cache, dm_dblock_t b)
455 {
456 	unsigned long flags;
457 
458 	spin_lock_irqsave(&cache->lock, flags);
459 	clear_bit(from_dblock(b), cache->discard_bitset);
460 	spin_unlock_irqrestore(&cache->lock, flags);
461 }
462 
463 static bool is_discarded(struct cache *cache, dm_dblock_t b)
464 {
465 	int r;
466 	unsigned long flags;
467 
468 	spin_lock_irqsave(&cache->lock, flags);
469 	r = test_bit(from_dblock(b), cache->discard_bitset);
470 	spin_unlock_irqrestore(&cache->lock, flags);
471 
472 	return r;
473 }
474 
475 static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
476 {
477 	int r;
478 	unsigned long flags;
479 
480 	spin_lock_irqsave(&cache->lock, flags);
481 	r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
482 		     cache->discard_bitset);
483 	spin_unlock_irqrestore(&cache->lock, flags);
484 
485 	return r;
486 }
487 
488 /*----------------------------------------------------------------*/
489 
490 static void load_stats(struct cache *cache)
491 {
492 	struct dm_cache_statistics stats;
493 
494 	dm_cache_metadata_get_stats(cache->cmd, &stats);
495 	atomic_set(&cache->stats.read_hit, stats.read_hits);
496 	atomic_set(&cache->stats.read_miss, stats.read_misses);
497 	atomic_set(&cache->stats.write_hit, stats.write_hits);
498 	atomic_set(&cache->stats.write_miss, stats.write_misses);
499 }
500 
501 static void save_stats(struct cache *cache)
502 {
503 	struct dm_cache_statistics stats;
504 
505 	stats.read_hits = atomic_read(&cache->stats.read_hit);
506 	stats.read_misses = atomic_read(&cache->stats.read_miss);
507 	stats.write_hits = atomic_read(&cache->stats.write_hit);
508 	stats.write_misses = atomic_read(&cache->stats.write_miss);
509 
510 	dm_cache_metadata_set_stats(cache->cmd, &stats);
511 }
512 
513 /*----------------------------------------------------------------
514  * Per bio data
515  *--------------------------------------------------------------*/
516 static struct per_bio_data *get_per_bio_data(struct bio *bio)
517 {
518 	struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
519 	BUG_ON(!pb);
520 	return pb;
521 }
522 
523 static struct per_bio_data *init_per_bio_data(struct bio *bio)
524 {
525 	struct per_bio_data *pb = get_per_bio_data(bio);
526 
527 	pb->tick = false;
528 	pb->req_nr = dm_bio_get_target_bio_nr(bio);
529 	pb->all_io_entry = NULL;
530 
531 	return pb;
532 }
533 
534 /*----------------------------------------------------------------
535  * Remapping
536  *--------------------------------------------------------------*/
537 static void remap_to_origin(struct cache *cache, struct bio *bio)
538 {
539 	bio->bi_bdev = cache->origin_dev->bdev;
540 }
541 
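/*
 * Example of the remapping arithmetic below (hypothetical sizes): with a
 * power-of-two block size of 128 sectors (shift 7), a bio at sector 1000
 * destined for cblock 5 is remapped to (5 << 7) | (1000 & 127), i.e.
 * sector 744 of the cache device.
 */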
542 static void remap_to_cache(struct cache *cache, struct bio *bio,
543 			   dm_cblock_t cblock)
544 {
545 	sector_t bi_sector = bio->bi_sector;
546 
547 	bio->bi_bdev = cache->cache_dev->bdev;
548 	if (!block_size_is_power_of_two(cache))
549 		bio->bi_sector = (from_cblock(cblock) * cache->sectors_per_block) +
550 				sector_div(bi_sector, cache->sectors_per_block);
551 	else
552 		bio->bi_sector = (from_cblock(cblock) << cache->sectors_per_block_shift) |
553 				(bi_sector & (cache->sectors_per_block - 1));
554 }
555 
556 static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
557 {
558 	unsigned long flags;
559 	struct per_bio_data *pb = get_per_bio_data(bio);
560 
561 	spin_lock_irqsave(&cache->lock, flags);
562 	if (cache->need_tick_bio &&
563 	    !(bio->bi_rw & (REQ_FUA | REQ_FLUSH | REQ_DISCARD))) {
564 		pb->tick = true;
565 		cache->need_tick_bio = false;
566 	}
567 	spin_unlock_irqrestore(&cache->lock, flags);
568 }
569 
570 static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
571 				  dm_oblock_t oblock)
572 {
573 	check_if_tick_bio_needed(cache, bio);
574 	remap_to_origin(cache, bio);
575 	if (bio_data_dir(bio) == WRITE)
576 		clear_discard(cache, oblock_to_dblock(cache, oblock));
577 }
578 
579 static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
580 				 dm_oblock_t oblock, dm_cblock_t cblock)
581 {
582 	remap_to_cache(cache, bio, cblock);
583 	if (bio_data_dir(bio) == WRITE) {
584 		set_dirty(cache, oblock, cblock);
585 		clear_discard(cache, oblock_to_dblock(cache, oblock));
586 	}
587 }
588 
589 static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
590 {
591 	sector_t block_nr = bio->bi_sector;
592 
593 	if (!block_size_is_power_of_two(cache))
594 		(void) sector_div(block_nr, cache->sectors_per_block);
595 	else
596 		block_nr >>= cache->sectors_per_block_shift;
597 
598 	return to_oblock(block_nr);
599 }
600 
601 static int bio_triggers_commit(struct cache *cache, struct bio *bio)
602 {
603 	return bio->bi_rw & (REQ_FLUSH | REQ_FUA);
604 }
605 
606 static void issue(struct cache *cache, struct bio *bio)
607 {
608 	unsigned long flags;
609 
610 	if (!bio_triggers_commit(cache, bio)) {
611 		generic_make_request(bio);
612 		return;
613 	}
614 
615 	/*
616 	 * Batch together any bios that trigger commits and then issue a
617 	 * single commit for them in do_worker().
618 	 */
619 	spin_lock_irqsave(&cache->lock, flags);
620 	cache->commit_requested = true;
621 	bio_list_add(&cache->deferred_flush_bios, bio);
622 	spin_unlock_irqrestore(&cache->lock, flags);
623 }
624 
625 static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
626 {
627 	unsigned long flags;
628 
629 	spin_lock_irqsave(&cache->lock, flags);
630 	bio_list_add(&cache->deferred_writethrough_bios, bio);
631 	spin_unlock_irqrestore(&cache->lock, flags);
632 
633 	wake_worker(cache);
634 }
635 
636 static void writethrough_endio(struct bio *bio, int err)
637 {
638 	struct per_bio_data *pb = get_per_bio_data(bio);
639 	bio->bi_end_io = pb->saved_bi_end_io;
640 
641 	if (err) {
642 		bio_endio(bio, err);
643 		return;
644 	}
645 
646 	remap_to_cache(pb->cache, bio, pb->cblock);
647 
648 	/*
649 	 * We can't issue this bio directly, since we're in interrupt
650 	 * context.  So it gets put on a bio list for processing by the
651 	 * worker thread.
652 	 */
653 	defer_writethrough_bio(pb->cache, bio);
654 }
655 
656 /*
657  * When running in writethrough mode we need to send writes to clean blocks
658  * to both the cache and origin devices.  In future we'd like to clone the
659  * bio and issue the two writes in parallel, but for now we do them in
660  * series as this is simpler.
661  */
662 static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
663 				       dm_oblock_t oblock, dm_cblock_t cblock)
664 {
665 	struct per_bio_data *pb = get_per_bio_data(bio);
666 
667 	pb->cache = cache;
668 	pb->cblock = cblock;
669 	pb->saved_bi_end_io = bio->bi_end_io;
670 	bio->bi_end_io = writethrough_endio;
671 
672 	remap_to_origin_clear_discard(pb->cache, bio, oblock);
673 }
674 
675 /*----------------------------------------------------------------
676  * Migration processing
677  *
678  * Migration covers moving data from the origin device to the cache, or
679  * vice versa.
680  *--------------------------------------------------------------*/
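/*
 * Rough lifecycle of a migration, as implemented below: promote(),
 * writeback() or demote_then_promote() builds one and quiesces it against
 * in-flight io (all_io_ds).  Once quiesced it sits on quiesced_migrations
 * until the worker calls issue_copy(), which hands the block to kcopyd.
 * copy_complete() moves it to completed_migrations; the worker then updates
 * the policy and on-disk metadata (migration_success_pre_commit) and, except
 * for writebacks, parks it on need_commit_migrations until the next commit,
 * after which migration_success_post_commit() releases the detained cells.
 */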
681 static void free_migration(struct dm_cache_migration *mg)
682 {
683 	mempool_free(mg, mg->cache->migration_pool);
684 }
685 
686 static void inc_nr_migrations(struct cache *cache)
687 {
688 	atomic_inc(&cache->nr_migrations);
689 }
690 
691 static void dec_nr_migrations(struct cache *cache)
692 {
693 	atomic_dec(&cache->nr_migrations);
694 
695 	/*
696 	 * Wake the worker in case we're suspending the target.
697 	 */
698 	wake_up(&cache->migration_wait);
699 }
700 
701 static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
702 			 bool holder)
703 {
704 	(holder ? dm_cell_release : dm_cell_release_no_holder)
705 		(cache->prison, cell, &cache->deferred_bios);
706 	free_prison_cell(cache, cell);
707 }
708 
709 static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
710 		       bool holder)
711 {
712 	unsigned long flags;
713 
714 	spin_lock_irqsave(&cache->lock, flags);
715 	__cell_defer(cache, cell, holder);
716 	spin_unlock_irqrestore(&cache->lock, flags);
717 
718 	wake_worker(cache);
719 }
720 
721 static void cleanup_migration(struct dm_cache_migration *mg)
722 {
723 	dec_nr_migrations(mg->cache);
724 	free_migration(mg);
725 }
726 
727 static void migration_failure(struct dm_cache_migration *mg)
728 {
729 	struct cache *cache = mg->cache;
730 
731 	if (mg->writeback) {
732 		DMWARN_LIMIT("writeback failed; couldn't copy block");
733 		set_dirty(cache, mg->old_oblock, mg->cblock);
734 		cell_defer(cache, mg->old_ocell, false);
735 
736 	} else if (mg->demote) {
737 		DMWARN_LIMIT("demotion failed; couldn't copy block");
738 		policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
739 
740 		cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1);
741 		if (mg->promote)
742 			cell_defer(cache, mg->new_ocell, 1);
743 	} else {
744 		DMWARN_LIMIT("promotion failed; couldn't copy block");
745 		policy_remove_mapping(cache->policy, mg->new_oblock);
746 		cell_defer(cache, mg->new_ocell, 1);
747 	}
748 
749 	cleanup_migration(mg);
750 }
751 
752 static void migration_success_pre_commit(struct dm_cache_migration *mg)
753 {
754 	unsigned long flags;
755 	struct cache *cache = mg->cache;
756 
757 	if (mg->writeback) {
758 		cell_defer(cache, mg->old_ocell, false);
759 		clear_dirty(cache, mg->old_oblock, mg->cblock);
760 		cleanup_migration(mg);
761 		return;
762 
763 	} else if (mg->demote) {
764 		if (dm_cache_remove_mapping(cache->cmd, mg->cblock)) {
765 			DMWARN_LIMIT("demotion failed; couldn't update on disk metadata");
766 			policy_force_mapping(cache->policy, mg->new_oblock,
767 					     mg->old_oblock);
768 			if (mg->promote)
769 				cell_defer(cache, mg->new_ocell, true);
770 			cleanup_migration(mg);
771 			return;
772 		}
773 	} else {
774 		if (dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock)) {
775 			DMWARN_LIMIT("promotion failed; couldn't update on disk metadata");
776 			policy_remove_mapping(cache->policy, mg->new_oblock);
777 			cleanup_migration(mg);
778 			return;
779 		}
780 	}
781 
782 	spin_lock_irqsave(&cache->lock, flags);
783 	list_add_tail(&mg->list, &cache->need_commit_migrations);
784 	cache->commit_requested = true;
785 	spin_unlock_irqrestore(&cache->lock, flags);
786 }
787 
788 static void migration_success_post_commit(struct dm_cache_migration *mg)
789 {
790 	unsigned long flags;
791 	struct cache *cache = mg->cache;
792 
793 	if (mg->writeback) {
794 		DMWARN("writeback unexpectedly triggered commit");
795 		return;
796 
797 	} else if (mg->demote) {
798 		cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1);
799 
800 		if (mg->promote) {
801 			mg->demote = false;
802 
803 			spin_lock_irqsave(&cache->lock, flags);
804 			list_add_tail(&mg->list, &cache->quiesced_migrations);
805 			spin_unlock_irqrestore(&cache->lock, flags);
806 
807 		} else
808 			cleanup_migration(mg);
809 
810 	} else {
811 		cell_defer(cache, mg->new_ocell, true);
812 		clear_dirty(cache, mg->new_oblock, mg->cblock);
813 		cleanup_migration(mg);
814 	}
815 }
816 
817 static void copy_complete(int read_err, unsigned long write_err, void *context)
818 {
819 	unsigned long flags;
820 	struct dm_cache_migration *mg = (struct dm_cache_migration *) context;
821 	struct cache *cache = mg->cache;
822 
823 	if (read_err || write_err)
824 		mg->err = true;
825 
826 	spin_lock_irqsave(&cache->lock, flags);
827 	list_add_tail(&mg->list, &cache->completed_migrations);
828 	spin_unlock_irqrestore(&cache->lock, flags);
829 
830 	wake_worker(cache);
831 }
832 
833 static void issue_copy_real(struct dm_cache_migration *mg)
834 {
835 	int r;
836 	struct dm_io_region o_region, c_region;
837 	struct cache *cache = mg->cache;
838 
839 	o_region.bdev = cache->origin_dev->bdev;
840 	o_region.count = cache->sectors_per_block;
841 
842 	c_region.bdev = cache->cache_dev->bdev;
843 	c_region.sector = from_cblock(mg->cblock) * cache->sectors_per_block;
844 	c_region.count = cache->sectors_per_block;
845 
846 	if (mg->writeback || mg->demote) {
847 		/* demote */
848 		o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block;
849 		r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg);
850 	} else {
851 		/* promote */
852 		o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block;
853 		r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
854 	}
855 
856 	if (r < 0)
857 		migration_failure(mg);
858 }
859 
860 static void avoid_copy(struct dm_cache_migration *mg)
861 {
862 	atomic_inc(&mg->cache->stats.copies_avoided);
863 	migration_success_pre_commit(mg);
864 }
865 
866 static void issue_copy(struct dm_cache_migration *mg)
867 {
868 	bool avoid;
869 	struct cache *cache = mg->cache;
870 
871 	if (mg->writeback || mg->demote)
872 		avoid = !is_dirty(cache, mg->cblock) ||
873 			is_discarded_oblock(cache, mg->old_oblock);
874 	else
875 		avoid = is_discarded_oblock(cache, mg->new_oblock);
876 
877 	avoid ? avoid_copy(mg) : issue_copy_real(mg);
878 }
879 
880 static void complete_migration(struct dm_cache_migration *mg)
881 {
882 	if (mg->err)
883 		migration_failure(mg);
884 	else
885 		migration_success_pre_commit(mg);
886 }
887 
888 static void process_migrations(struct cache *cache, struct list_head *head,
889 			       void (*fn)(struct dm_cache_migration *))
890 {
891 	unsigned long flags;
892 	struct list_head list;
893 	struct dm_cache_migration *mg, *tmp;
894 
895 	INIT_LIST_HEAD(&list);
896 	spin_lock_irqsave(&cache->lock, flags);
897 	list_splice_init(head, &list);
898 	spin_unlock_irqrestore(&cache->lock, flags);
899 
900 	list_for_each_entry_safe(mg, tmp, &list, list)
901 		fn(mg);
902 }
903 
904 static void __queue_quiesced_migration(struct dm_cache_migration *mg)
905 {
906 	list_add_tail(&mg->list, &mg->cache->quiesced_migrations);
907 }
908 
909 static void queue_quiesced_migration(struct dm_cache_migration *mg)
910 {
911 	unsigned long flags;
912 	struct cache *cache = mg->cache;
913 
914 	spin_lock_irqsave(&cache->lock, flags);
915 	__queue_quiesced_migration(mg);
916 	spin_unlock_irqrestore(&cache->lock, flags);
917 
918 	wake_worker(cache);
919 }
920 
921 static void queue_quiesced_migrations(struct cache *cache, struct list_head *work)
922 {
923 	unsigned long flags;
924 	struct dm_cache_migration *mg, *tmp;
925 
926 	spin_lock_irqsave(&cache->lock, flags);
927 	list_for_each_entry_safe(mg, tmp, work, list)
928 		__queue_quiesced_migration(mg);
929 	spin_unlock_irqrestore(&cache->lock, flags);
930 
931 	wake_worker(cache);
932 }
933 
934 static void check_for_quiesced_migrations(struct cache *cache,
935 					  struct per_bio_data *pb)
936 {
937 	struct list_head work;
938 
939 	if (!pb->all_io_entry)
940 		return;
941 
942 	INIT_LIST_HEAD(&work);
943 	if (pb->all_io_entry)
944 		dm_deferred_entry_dec(pb->all_io_entry, &work);
945 
946 	if (!list_empty(&work))
947 		queue_quiesced_migrations(cache, &work);
948 }
949 
950 static void quiesce_migration(struct dm_cache_migration *mg)
951 {
952 	if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list))
953 		queue_quiesced_migration(mg);
954 }
955 
956 static void promote(struct cache *cache, struct prealloc *structs,
957 		    dm_oblock_t oblock, dm_cblock_t cblock,
958 		    struct dm_bio_prison_cell *cell)
959 {
960 	struct dm_cache_migration *mg = prealloc_get_migration(structs);
961 
962 	mg->err = false;
963 	mg->writeback = false;
964 	mg->demote = false;
965 	mg->promote = true;
966 	mg->cache = cache;
967 	mg->new_oblock = oblock;
968 	mg->cblock = cblock;
969 	mg->old_ocell = NULL;
970 	mg->new_ocell = cell;
971 	mg->start_jiffies = jiffies;
972 
973 	inc_nr_migrations(cache);
974 	quiesce_migration(mg);
975 }
976 
977 static void writeback(struct cache *cache, struct prealloc *structs,
978 		      dm_oblock_t oblock, dm_cblock_t cblock,
979 		      struct dm_bio_prison_cell *cell)
980 {
981 	struct dm_cache_migration *mg = prealloc_get_migration(structs);
982 
983 	mg->err = false;
984 	mg->writeback = true;
985 	mg->demote = false;
986 	mg->promote = false;
987 	mg->cache = cache;
988 	mg->old_oblock = oblock;
989 	mg->cblock = cblock;
990 	mg->old_ocell = cell;
991 	mg->new_ocell = NULL;
992 	mg->start_jiffies = jiffies;
993 
994 	inc_nr_migrations(cache);
995 	quiesce_migration(mg);
996 }
997 
998 static void demote_then_promote(struct cache *cache, struct prealloc *structs,
999 				dm_oblock_t old_oblock, dm_oblock_t new_oblock,
1000 				dm_cblock_t cblock,
1001 				struct dm_bio_prison_cell *old_ocell,
1002 				struct dm_bio_prison_cell *new_ocell)
1003 {
1004 	struct dm_cache_migration *mg = prealloc_get_migration(structs);
1005 
1006 	mg->err = false;
1007 	mg->writeback = false;
1008 	mg->demote = true;
1009 	mg->promote = true;
1010 	mg->cache = cache;
1011 	mg->old_oblock = old_oblock;
1012 	mg->new_oblock = new_oblock;
1013 	mg->cblock = cblock;
1014 	mg->old_ocell = old_ocell;
1015 	mg->new_ocell = new_ocell;
1016 	mg->start_jiffies = jiffies;
1017 
1018 	inc_nr_migrations(cache);
1019 	quiesce_migration(mg);
1020 }
1021 
1022 /*----------------------------------------------------------------
1023  * bio processing
1024  *--------------------------------------------------------------*/
1025 static void defer_bio(struct cache *cache, struct bio *bio)
1026 {
1027 	unsigned long flags;
1028 
1029 	spin_lock_irqsave(&cache->lock, flags);
1030 	bio_list_add(&cache->deferred_bios, bio);
1031 	spin_unlock_irqrestore(&cache->lock, flags);
1032 
1033 	wake_worker(cache);
1034 }
1035 
1036 static void process_flush_bio(struct cache *cache, struct bio *bio)
1037 {
1038 	struct per_bio_data *pb = get_per_bio_data(bio);
1039 
1040 	BUG_ON(bio->bi_size);
1041 	if (!pb->req_nr)
1042 		remap_to_origin(cache, bio);
1043 	else
1044 		remap_to_cache(cache, bio, 0);
1045 
1046 	issue(cache, bio);
1047 }
1048 
1049 /*
1050  * People generally discard large parts of a device, eg, the whole device
1051  * People generally discard large parts of a device, e.g. the whole device
1052  * when formatting.  Splitting these large discards up into cache block
1053  * sized ios and then quiescing (always necessary for discard) takes too
1054  *
1055  * We keep it simple, and allow any size of discard to come in, and just
1056  * mark off blocks on the discard bitset.  No passdown occurs!
1057  *
1058  * To implement passdown we need to change the bio_prison such that a cell
1059  * can have a key that spans many blocks.
1060  */
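/*
 * Note the rounding below: start_block rounds up and end_block rounds down,
 * so partially covered discard blocks at either end of the bio are left
 * unmarked.  E.g. (hypothetical sizes) with 1024-sector discard blocks, a
 * discard of sectors 1500..4999 marks only dblocks 2 and 3.
 */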
1061 static void process_discard_bio(struct cache *cache, struct bio *bio)
1062 {
1063 	dm_block_t start_block = dm_sector_div_up(bio->bi_sector,
1064 						  cache->discard_block_size);
1065 	dm_block_t end_block = bio->bi_sector + bio_sectors(bio);
1066 	dm_block_t b;
1067 
1068 	end_block = block_div(end_block, cache->discard_block_size);
1069 
1070 	for (b = start_block; b < end_block; b++)
1071 		set_discard(cache, to_dblock(b));
1072 
1073 	bio_endio(bio, 0);
1074 }
1075 
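/*
 * Throttle for background copies: a new migration is only started while the
 * data already in flight stays below migration_threshold.  E.g. with the
 * default threshold of 204800 sectors (100MiB, see DEFAULT_MIGRATION_THRESHOLD
 * below) and a hypothetical 512-sector block size, roughly 400 migrations may
 * be in flight at once.
 */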
1076 static bool spare_migration_bandwidth(struct cache *cache)
1077 {
1078 	sector_t current_volume = (atomic_read(&cache->nr_migrations) + 1) *
1079 		cache->sectors_per_block;
1080 	return current_volume < cache->migration_threshold;
1081 }
1082 
1083 static bool is_writethrough_io(struct cache *cache, struct bio *bio,
1084 			       dm_cblock_t cblock)
1085 {
1086 	return bio_data_dir(bio) == WRITE &&
1087 		cache->features.write_through && !is_dirty(cache, cblock);
1088 }
1089 
1090 static void inc_hit_counter(struct cache *cache, struct bio *bio)
1091 {
1092 	atomic_inc(bio_data_dir(bio) == READ ?
1093 		   &cache->stats.read_hit : &cache->stats.write_hit);
1094 }
1095 
1096 static void inc_miss_counter(struct cache *cache, struct bio *bio)
1097 {
1098 	atomic_inc(bio_data_dir(bio) == READ ?
1099 		   &cache->stats.read_miss : &cache->stats.write_miss);
1100 }
1101 
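/*
 * process_bio() asks the policy where the block should live.  POLICY_HIT
 * remaps to the cache (or origin-then-cache for writethrough writes),
 * POLICY_MISS remaps to the origin, POLICY_NEW kicks off a promotion and
 * POLICY_REPLACE demotes the old mapping before promoting the new one.
 * For the migration cases the bio stays detained in its cell until the
 * migration completes; otherwise the cell is released once the bio has
 * been issued.
 */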
1102 static void process_bio(struct cache *cache, struct prealloc *structs,
1103 			struct bio *bio)
1104 {
1105 	int r;
1106 	bool release_cell = true;
1107 	dm_oblock_t block = get_bio_block(cache, bio);
1108 	struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell;
1109 	struct policy_result lookup_result;
1110 	struct per_bio_data *pb = get_per_bio_data(bio);
1111 	bool discarded_block = is_discarded_oblock(cache, block);
1112 	bool can_migrate = discarded_block || spare_migration_bandwidth(cache);
1113 
1114 	/*
1115 	 * Check to see if that block is currently migrating.
1116 	 */
1117 	cell_prealloc = prealloc_get_cell(structs);
1118 	r = bio_detain(cache, block, bio, cell_prealloc,
1119 		       (cell_free_fn) prealloc_put_cell,
1120 		       structs, &new_ocell);
1121 	if (r > 0)
1122 		return;
1123 
1124 	r = policy_map(cache->policy, block, true, can_migrate, discarded_block,
1125 		       bio, &lookup_result);
1126 
1127 	if (r == -EWOULDBLOCK)
1128 		/* migration has been denied */
1129 		lookup_result.op = POLICY_MISS;
1130 
1131 	switch (lookup_result.op) {
1132 	case POLICY_HIT:
1133 		inc_hit_counter(cache, bio);
1134 		pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
1135 
1136 		if (is_writethrough_io(cache, bio, lookup_result.cblock))
1137 			remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
1138 		else
1139 			remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
1140 
1141 		issue(cache, bio);
1142 		break;
1143 
1144 	case POLICY_MISS:
1145 		inc_miss_counter(cache, bio);
1146 		pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
1147 		remap_to_origin_clear_discard(cache, bio, block);
1148 		issue(cache, bio);
1149 		break;
1150 
1151 	case POLICY_NEW:
1152 		atomic_inc(&cache->stats.promotion);
1153 		promote(cache, structs, block, lookup_result.cblock, new_ocell);
1154 		release_cell = false;
1155 		break;
1156 
1157 	case POLICY_REPLACE:
1158 		cell_prealloc = prealloc_get_cell(structs);
1159 		r = bio_detain(cache, lookup_result.old_oblock, bio, cell_prealloc,
1160 			       (cell_free_fn) prealloc_put_cell,
1161 			       structs, &old_ocell);
1162 		if (r > 0) {
1163 			/*
1164 			 * We have to be careful to avoid lock inversion of
1165 			 * the cells.  So we back off, and wait for the
1166 			 * old_ocell to become free.
1167 			 */
1168 			policy_force_mapping(cache->policy, block,
1169 					     lookup_result.old_oblock);
1170 			atomic_inc(&cache->stats.cache_cell_clash);
1171 			break;
1172 		}
1173 		atomic_inc(&cache->stats.demotion);
1174 		atomic_inc(&cache->stats.promotion);
1175 
1176 		demote_then_promote(cache, structs, lookup_result.old_oblock,
1177 				    block, lookup_result.cblock,
1178 				    old_ocell, new_ocell);
1179 		release_cell = false;
1180 		break;
1181 
1182 	default:
1183 		DMERR_LIMIT("%s: erroring bio, unknown policy op: %u", __func__,
1184 			    (unsigned) lookup_result.op);
1185 		bio_io_error(bio);
1186 	}
1187 
1188 	if (release_cell)
1189 		cell_defer(cache, new_ocell, false);
1190 }
1191 
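/*
 * Commit if a full COMMIT_PERIOD has passed since the last commit, or if
 * jiffies has wrapped around since then (the first clause below).
 */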
1192 static int need_commit_due_to_time(struct cache *cache)
1193 {
1194 	return jiffies < cache->last_commit_jiffies ||
1195 	       jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
1196 }
1197 
1198 static int commit_if_needed(struct cache *cache)
1199 {
1200 	if (dm_cache_changed_this_transaction(cache->cmd) &&
1201 	    (cache->commit_requested || need_commit_due_to_time(cache))) {
1202 		atomic_inc(&cache->stats.commit_count);
1203 		cache->last_commit_jiffies = jiffies;
1204 		cache->commit_requested = false;
1205 		return dm_cache_commit(cache->cmd, false);
1206 	}
1207 
1208 	return 0;
1209 }
1210 
1211 static void process_deferred_bios(struct cache *cache)
1212 {
1213 	unsigned long flags;
1214 	struct bio_list bios;
1215 	struct bio *bio;
1216 	struct prealloc structs;
1217 
1218 	memset(&structs, 0, sizeof(structs));
1219 	bio_list_init(&bios);
1220 
1221 	spin_lock_irqsave(&cache->lock, flags);
1222 	bio_list_merge(&bios, &cache->deferred_bios);
1223 	bio_list_init(&cache->deferred_bios);
1224 	spin_unlock_irqrestore(&cache->lock, flags);
1225 
1226 	while (!bio_list_empty(&bios)) {
1227 		/*
1228 		 * If we've got no free migration structs, and processing
1229 		 * this bio might require one, we push the remaining bios back
1230 		 * onto the deferred list and retry once migrations have freed some.
1231 		 */
1232 		if (prealloc_data_structs(cache, &structs)) {
1233 			spin_lock_irqsave(&cache->lock, flags);
1234 			bio_list_merge(&cache->deferred_bios, &bios);
1235 			spin_unlock_irqrestore(&cache->lock, flags);
1236 			break;
1237 		}
1238 
1239 		bio = bio_list_pop(&bios);
1240 
1241 		if (bio->bi_rw & REQ_FLUSH)
1242 			process_flush_bio(cache, bio);
1243 		else if (bio->bi_rw & REQ_DISCARD)
1244 			process_discard_bio(cache, bio);
1245 		else
1246 			process_bio(cache, &structs, bio);
1247 	}
1248 
1249 	prealloc_free_structs(cache, &structs);
1250 }
1251 
1252 static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
1253 {
1254 	unsigned long flags;
1255 	struct bio_list bios;
1256 	struct bio *bio;
1257 
1258 	bio_list_init(&bios);
1259 
1260 	spin_lock_irqsave(&cache->lock, flags);
1261 	bio_list_merge(&bios, &cache->deferred_flush_bios);
1262 	bio_list_init(&cache->deferred_flush_bios);
1263 	spin_unlock_irqrestore(&cache->lock, flags);
1264 
1265 	while ((bio = bio_list_pop(&bios)))
1266 		submit_bios ? generic_make_request(bio) : bio_io_error(bio);
1267 }
1268 
1269 static void process_deferred_writethrough_bios(struct cache *cache)
1270 {
1271 	unsigned long flags;
1272 	struct bio_list bios;
1273 	struct bio *bio;
1274 
1275 	bio_list_init(&bios);
1276 
1277 	spin_lock_irqsave(&cache->lock, flags);
1278 	bio_list_merge(&bios, &cache->deferred_writethrough_bios);
1279 	bio_list_init(&cache->deferred_writethrough_bios);
1280 	spin_unlock_irqrestore(&cache->lock, flags);
1281 
1282 	while ((bio = bio_list_pop(&bios)))
1283 		generic_make_request(bio);
1284 }
1285 
1286 static void writeback_some_dirty_blocks(struct cache *cache)
1287 {
1288 	int r = 0;
1289 	dm_oblock_t oblock;
1290 	dm_cblock_t cblock;
1291 	struct prealloc structs;
1292 	struct dm_bio_prison_cell *old_ocell;
1293 
1294 	memset(&structs, 0, sizeof(structs));
1295 
1296 	while (spare_migration_bandwidth(cache)) {
1297 		if (prealloc_data_structs(cache, &structs))
1298 			break;
1299 
1300 		r = policy_writeback_work(cache->policy, &oblock, &cblock);
1301 		if (r)
1302 			break;
1303 
1304 		r = get_cell(cache, oblock, &structs, &old_ocell);
1305 		if (r) {
1306 			policy_set_dirty(cache->policy, oblock);
1307 			break;
1308 		}
1309 
1310 		writeback(cache, &structs, oblock, cblock, old_ocell);
1311 	}
1312 
1313 	prealloc_free_structs(cache, &structs);
1314 }
1315 
1316 /*----------------------------------------------------------------
1317  * Main worker loop
1318  *--------------------------------------------------------------*/
1319 static void start_quiescing(struct cache *cache)
1320 {
1321 	unsigned long flags;
1322 
1323 	spin_lock_irqsave(&cache->lock, flags);
1324 	cache->quiescing = 1;
1325 	spin_unlock_irqrestore(&cache->lock, flags);
1326 }
1327 
1328 static void stop_quiescing(struct cache *cache)
1329 {
1330 	unsigned long flags;
1331 
1332 	spin_lock_irqsave(&cache->lock, flags);
1333 	cache->quiescing = 0;
1334 	spin_unlock_irqrestore(&cache->lock, flags);
1335 }
1336 
1337 static bool is_quiescing(struct cache *cache)
1338 {
1339 	int r;
1340 	unsigned long flags;
1341 
1342 	spin_lock_irqsave(&cache->lock, flags);
1343 	r = cache->quiescing;
1344 	spin_unlock_irqrestore(&cache->lock, flags);
1345 
1346 	return r;
1347 }
1348 
1349 static void wait_for_migrations(struct cache *cache)
1350 {
1351 	wait_event(cache->migration_wait, !atomic_read(&cache->nr_migrations));
1352 }
1353 
1354 static void stop_worker(struct cache *cache)
1355 {
1356 	cancel_delayed_work(&cache->waker);
1357 	flush_workqueue(cache->wq);
1358 }
1359 
1360 static void requeue_deferred_io(struct cache *cache)
1361 {
1362 	struct bio *bio;
1363 	struct bio_list bios;
1364 
1365 	bio_list_init(&bios);
1366 	bio_list_merge(&bios, &cache->deferred_bios);
1367 	bio_list_init(&cache->deferred_bios);
1368 
1369 	while ((bio = bio_list_pop(&bios)))
1370 		bio_endio(bio, DM_ENDIO_REQUEUE);
1371 }
1372 
1373 static int more_work(struct cache *cache)
1374 {
1375 	if (is_quiescing(cache))
1376 		return !list_empty(&cache->quiesced_migrations) ||
1377 			!list_empty(&cache->completed_migrations) ||
1378 			!list_empty(&cache->need_commit_migrations);
1379 	else
1380 		return !bio_list_empty(&cache->deferred_bios) ||
1381 			!bio_list_empty(&cache->deferred_flush_bios) ||
1382 			!bio_list_empty(&cache->deferred_writethrough_bios) ||
1383 			!list_empty(&cache->quiesced_migrations) ||
1384 			!list_empty(&cache->completed_migrations) ||
1385 			!list_empty(&cache->need_commit_migrations);
1386 }
1387 
1388 static void do_worker(struct work_struct *ws)
1389 {
1390 	struct cache *cache = container_of(ws, struct cache, worker);
1391 
1392 	do {
1393 		if (!is_quiescing(cache))
1394 			process_deferred_bios(cache);
1395 
1396 		process_migrations(cache, &cache->quiesced_migrations, issue_copy);
1397 		process_migrations(cache, &cache->completed_migrations, complete_migration);
1398 
1399 		writeback_some_dirty_blocks(cache);
1400 
1401 		process_deferred_writethrough_bios(cache);
1402 
1403 		if (commit_if_needed(cache)) {
1404 			process_deferred_flush_bios(cache, false);
1405 
1406 			/*
1407 			 * FIXME: rollback metadata or just go into a
1408 			 * failure mode and error everything
1409 			 */
1410 		} else {
1411 			process_deferred_flush_bios(cache, true);
1412 			process_migrations(cache, &cache->need_commit_migrations,
1413 					   migration_success_post_commit);
1414 		}
1415 	} while (more_work(cache));
1416 }
1417 
1418 /*
1419  * We want to commit periodically so that not too much
1420  * unwritten metadata builds up.
1421  */
1422 static void do_waker(struct work_struct *ws)
1423 {
1424 	struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
1425 	wake_worker(cache);
1426 	queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
1427 }
1428 
1429 /*----------------------------------------------------------------*/
1430 
1431 static int is_congested(struct dm_dev *dev, int bdi_bits)
1432 {
1433 	struct request_queue *q = bdev_get_queue(dev->bdev);
1434 	return bdi_congested(&q->backing_dev_info, bdi_bits);
1435 }
1436 
1437 static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
1438 {
1439 	struct cache *cache = container_of(cb, struct cache, callbacks);
1440 
1441 	return is_congested(cache->origin_dev, bdi_bits) ||
1442 		is_congested(cache->cache_dev, bdi_bits);
1443 }
1444 
1445 /*----------------------------------------------------------------
1446  * Target methods
1447  *--------------------------------------------------------------*/
1448 
1449 /*
1450  * This function gets called on the error paths of the constructor, so we
1451  * have to cope with a partially initialised struct.
1452  */
1453 static void destroy(struct cache *cache)
1454 {
1455 	unsigned i;
1456 
1457 	if (cache->next_migration)
1458 		mempool_free(cache->next_migration, cache->migration_pool);
1459 
1460 	if (cache->migration_pool)
1461 		mempool_destroy(cache->migration_pool);
1462 
1463 	if (cache->all_io_ds)
1464 		dm_deferred_set_destroy(cache->all_io_ds);
1465 
1466 	if (cache->prison)
1467 		dm_bio_prison_destroy(cache->prison);
1468 
1469 	if (cache->wq)
1470 		destroy_workqueue(cache->wq);
1471 
1472 	if (cache->dirty_bitset)
1473 		free_bitset(cache->dirty_bitset);
1474 
1475 	if (cache->discard_bitset)
1476 		free_bitset(cache->discard_bitset);
1477 
1478 	if (cache->copier)
1479 		dm_kcopyd_client_destroy(cache->copier);
1480 
1481 	if (cache->cmd)
1482 		dm_cache_metadata_close(cache->cmd);
1483 
1484 	if (cache->metadata_dev)
1485 		dm_put_device(cache->ti, cache->metadata_dev);
1486 
1487 	if (cache->origin_dev)
1488 		dm_put_device(cache->ti, cache->origin_dev);
1489 
1490 	if (cache->cache_dev)
1491 		dm_put_device(cache->ti, cache->cache_dev);
1492 
1493 	if (cache->policy)
1494 		dm_cache_policy_destroy(cache->policy);
1495 
1496 	for (i = 0; i < cache->nr_ctr_args ; i++)
1497 		kfree(cache->ctr_args[i]);
1498 	kfree(cache->ctr_args);
1499 
1500 	kfree(cache);
1501 }
1502 
1503 static void cache_dtr(struct dm_target *ti)
1504 {
1505 	struct cache *cache = ti->private;
1506 
1507 	destroy(cache);
1508 }
1509 
1510 static sector_t get_dev_size(struct dm_dev *dev)
1511 {
1512 	return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
1513 }
1514 
1515 /*----------------------------------------------------------------*/
1516 
1517 /*
1518  * Construct a cache device mapping.
1519  *
1520  * cache <metadata dev> <cache dev> <origin dev> <block size>
1521  *       <#feature args> [<feature arg>]*
1522  *       <policy> <#policy args> [<policy arg>]*
1523  *
1524  * metadata dev    : fast device holding the persistent metadata
1525  * cache dev	   : fast device holding cached data blocks
1526  * origin dev	   : slow device holding original data blocks
1527  * block size	   : cache unit size in sectors
1528  *
1529  * #feature args   : number of feature arguments passed
1530  * feature args    : writethrough.  (The default is writeback.)
1531  *
1532  * policy	   : the replacement policy to use
1533  * #policy args    : an even number of policy arguments corresponding
1534  *		     to key/value pairs passed to the policy
1535  * policy args	   : key/value pairs passed to the policy
1536  *		     E.g. 'sequential_threshold 1024'
1537  *		     See cache-policies.txt for details.
1538  *
1539  * Optional feature arguments are:
1540  *   writethrough  : write through caching that prohibits cache block
1541  *		     content from being different from origin block content.
1542  *		     Without this argument, the default behaviour is to write
1543  *		     back cache block contents later for performance reasons,
1544  *		     so they may differ from the corresponding origin blocks.
1545  */
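/*
 * A minimal example table line (hypothetical device names and sizes):
 *
 *   0 41943040 cache /dev/sdc1 /dev/sdc2 /dev/sdb 512 1 writethrough default 0
 *
 * i.e. cache a 20GiB origin (/dev/sdb) using 512-sector (256KiB) blocks in
 * writethrough mode, with the 'default' policy and no policy arguments.
 */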
1546 struct cache_args {
1547 	struct dm_target *ti;
1548 
1549 	struct dm_dev *metadata_dev;
1550 
1551 	struct dm_dev *cache_dev;
1552 	sector_t cache_sectors;
1553 
1554 	struct dm_dev *origin_dev;
1555 	sector_t origin_sectors;
1556 
1557 	uint32_t block_size;
1558 
1559 	const char *policy_name;
1560 	int policy_argc;
1561 	const char **policy_argv;
1562 
1563 	struct cache_features features;
1564 };
1565 
1566 static void destroy_cache_args(struct cache_args *ca)
1567 {
1568 	if (ca->metadata_dev)
1569 		dm_put_device(ca->ti, ca->metadata_dev);
1570 
1571 	if (ca->cache_dev)
1572 		dm_put_device(ca->ti, ca->cache_dev);
1573 
1574 	if (ca->origin_dev)
1575 		dm_put_device(ca->ti, ca->origin_dev);
1576 
1577 	kfree(ca);
1578 }
1579 
1580 static bool at_least_one_arg(struct dm_arg_set *as, char **error)
1581 {
1582 	if (!as->argc) {
1583 		*error = "Insufficient args";
1584 		return false;
1585 	}
1586 
1587 	return true;
1588 }
1589 
1590 static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as,
1591 			      char **error)
1592 {
1593 	int r;
1594 	sector_t metadata_dev_size;
1595 	char b[BDEVNAME_SIZE];
1596 
1597 	if (!at_least_one_arg(as, error))
1598 		return -EINVAL;
1599 
1600 	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1601 			  &ca->metadata_dev);
1602 	if (r) {
1603 		*error = "Error opening metadata device";
1604 		return r;
1605 	}
1606 
1607 	metadata_dev_size = get_dev_size(ca->metadata_dev);
1608 	if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING)
1609 		DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
1610 		       bdevname(ca->metadata_dev->bdev, b), DM_CACHE_METADATA_MAX_SECTORS_WARNING);
1611 
1612 	return 0;
1613 }
1614 
1615 static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as,
1616 			   char **error)
1617 {
1618 	int r;
1619 
1620 	if (!at_least_one_arg(as, error))
1621 		return -EINVAL;
1622 
1623 	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1624 			  &ca->cache_dev);
1625 	if (r) {
1626 		*error = "Error opening cache device";
1627 		return r;
1628 	}
1629 	ca->cache_sectors = get_dev_size(ca->cache_dev);
1630 
1631 	return 0;
1632 }
1633 
1634 static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as,
1635 			    char **error)
1636 {
1637 	int r;
1638 
1639 	if (!at_least_one_arg(as, error))
1640 		return -EINVAL;
1641 
1642 	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1643 			  &ca->origin_dev);
1644 	if (r) {
1645 		*error = "Error opening origin device";
1646 		return r;
1647 	}
1648 
1649 	ca->origin_sectors = get_dev_size(ca->origin_dev);
1650 	if (ca->ti->len > ca->origin_sectors) {
1651 		*error = "Device size larger than cached device";
1652 		return -EINVAL;
1653 	}
1654 
1655 	return 0;
1656 }
1657 
1658 static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as,
1659 			    char **error)
1660 {
1661 	unsigned long tmp;
1662 
1663 	if (!at_least_one_arg(as, error))
1664 		return -EINVAL;
1665 
1666 	if (kstrtoul(dm_shift_arg(as), 10, &tmp) || !tmp ||
1667 	    tmp < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
1668 	    tmp & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
1669 		*error = "Invalid data block size";
1670 		return -EINVAL;
1671 	}
1672 
1673 	if (tmp > ca->cache_sectors) {
1674 		*error = "Data block size is larger than the cache device";
1675 		return -EINVAL;
1676 	}
1677 
1678 	ca->block_size = tmp;
1679 
1680 	return 0;
1681 }
1682 
1683 static void init_features(struct cache_features *cf)
1684 {
1685 	cf->mode = CM_WRITE;
1686 	cf->write_through = false;
1687 }
1688 
1689 static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
1690 			  char **error)
1691 {
1692 	static struct dm_arg _args[] = {
1693 		{0, 1, "Invalid number of cache feature arguments"},
1694 	};
1695 
1696 	int r;
1697 	unsigned argc;
1698 	const char *arg;
1699 	struct cache_features *cf = &ca->features;
1700 
1701 	init_features(cf);
1702 
1703 	r = dm_read_arg_group(_args, as, &argc, error);
1704 	if (r)
1705 		return -EINVAL;
1706 
1707 	while (argc--) {
1708 		arg = dm_shift_arg(as);
1709 
1710 		if (!strcasecmp(arg, "writeback"))
1711 			cf->write_through = false;
1712 
1713 		else if (!strcasecmp(arg, "writethrough"))
1714 			cf->write_through = true;
1715 
1716 		else {
1717 			*error = "Unrecognised cache feature requested";
1718 			return -EINVAL;
1719 		}
1720 	}
1721 
1722 	return 0;
1723 }
1724 
1725 static int parse_policy(struct cache_args *ca, struct dm_arg_set *as,
1726 			char **error)
1727 {
1728 	static struct dm_arg _args[] = {
1729 		{0, 1024, "Invalid number of policy arguments"},
1730 	};
1731 
1732 	int r;
1733 
1734 	if (!at_least_one_arg(as, error))
1735 		return -EINVAL;
1736 
1737 	ca->policy_name = dm_shift_arg(as);
1738 
1739 	r = dm_read_arg_group(_args, as, &ca->policy_argc, error);
1740 	if (r)
1741 		return -EINVAL;
1742 
1743 	ca->policy_argv = (const char **)as->argv;
1744 	dm_consume_args(as, ca->policy_argc);
1745 
1746 	return 0;
1747 }
1748 
1749 static int parse_cache_args(struct cache_args *ca, int argc, char **argv,
1750 			    char **error)
1751 {
1752 	int r;
1753 	struct dm_arg_set as;
1754 
1755 	as.argc = argc;
1756 	as.argv = argv;
1757 
1758 	r = parse_metadata_dev(ca, &as, error);
1759 	if (r)
1760 		return r;
1761 
1762 	r = parse_cache_dev(ca, &as, error);
1763 	if (r)
1764 		return r;
1765 
1766 	r = parse_origin_dev(ca, &as, error);
1767 	if (r)
1768 		return r;
1769 
1770 	r = parse_block_size(ca, &as, error);
1771 	if (r)
1772 		return r;
1773 
1774 	r = parse_features(ca, &as, error);
1775 	if (r)
1776 		return r;
1777 
1778 	r = parse_policy(ca, &as, error);
1779 	if (r)
1780 		return r;
1781 
1782 	return 0;
1783 }
1784 
1785 /*----------------------------------------------------------------*/
1786 
1787 static struct kmem_cache *migration_cache;
1788 
1789 static int set_config_values(struct dm_cache_policy *p, int argc, const char **argv)
1790 {
1791 	int r = 0;
1792 
1793 	if (argc & 1) {
1794 		DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs.");
1795 		return -EINVAL;
1796 	}
1797 
1798 	while (argc) {
1799 		r = policy_set_config_value(p, argv[0], argv[1]);
1800 		if (r) {
1801 			DMWARN("policy_set_config_value failed: key = '%s', value = '%s'",
1802 			       argv[0], argv[1]);
1803 			return r;
1804 		}
1805 
1806 		argc -= 2;
1807 		argv += 2;
1808 	}
1809 
1810 	return r;
1811 }
1812 
1813 static int create_cache_policy(struct cache *cache, struct cache_args *ca,
1814 			       char **error)
1815 {
1816 	int r;
1817 
1818 	cache->policy =	dm_cache_policy_create(ca->policy_name,
1819 					       cache->cache_size,
1820 					       cache->origin_sectors,
1821 					       cache->sectors_per_block);
1822 	if (!cache->policy) {
1823 		*error = "Error creating cache's policy";
1824 		return -ENOMEM;
1825 	}
1826 
1827 	r = set_config_values(cache->policy, ca->policy_argc, ca->policy_argv);
1828 	if (r) {
1829 		*error = "Error setting cache policy's config values";
1830 		dm_cache_policy_destroy(cache->policy);
1831 		cache->policy = NULL;
1832 	}
1833 
1834 	return r;
1835 }
1836 
1837 /*
1838  * We want the discard block size to be a power of two, at least the size
1839  * We want the discard block size to be a power of two, at least as large
1840  * as the cache block size, and to give no more than 2^14 discard blocks
1841  */
1842 #define MAX_DISCARD_BLOCKS (1 << 14)
1843 
1844 static bool too_many_discard_blocks(sector_t discard_block_size,
1845 				    sector_t origin_size)
1846 {
1847 	(void) sector_div(origin_size, discard_block_size);
1848 
1849 	return origin_size > MAX_DISCARD_BLOCKS;
1850 }
1851 
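/*
 * E.g. (hypothetical sizes): with 512-sector cache blocks and a 2TiB
 * (2^32 sector) origin, the discard block size starts at 512 sectors and
 * doubles until 2^32 / size <= 2^14, ending up at 2^18 sectors (128MiB).
 */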
1852 static sector_t calculate_discard_block_size(sector_t cache_block_size,
1853 					     sector_t origin_size)
1854 {
1855 	sector_t discard_block_size;
1856 
1857 	discard_block_size = roundup_pow_of_two(cache_block_size);
1858 
1859 	if (origin_size)
1860 		while (too_many_discard_blocks(discard_block_size, origin_size))
1861 			discard_block_size *= 2;
1862 
1863 	return discard_block_size;
1864 }
1865 
1866 #define DEFAULT_MIGRATION_THRESHOLD (2048 * 100)
1867 
1868 static int cache_create(struct cache_args *ca, struct cache **result)
1869 {
1870 	int r = 0;
1871 	char **error = &ca->ti->error;
1872 	struct cache *cache;
1873 	struct dm_target *ti = ca->ti;
1874 	dm_block_t origin_blocks;
1875 	struct dm_cache_metadata *cmd;
1876 	bool may_format = ca->features.mode == CM_WRITE;
1877 
1878 	cache = kzalloc(sizeof(*cache), GFP_KERNEL);
1879 	if (!cache)
1880 		return -ENOMEM;
1881 
1882 	cache->ti = ca->ti;
1883 	ti->private = cache;
1884 	ti->per_bio_data_size = sizeof(struct per_bio_data);
1885 	ti->num_flush_bios = 2;
1886 	ti->flush_supported = true;
1887 
1888 	ti->num_discard_bios = 1;
1889 	ti->discards_supported = true;
1890 	ti->discard_zeroes_data_unsupported = true;
1891 
1892 	memcpy(&cache->features, &ca->features, sizeof(cache->features));
1893 
1894 	cache->callbacks.congested_fn = cache_is_congested;
1895 	dm_table_add_target_callbacks(ti->table, &cache->callbacks);
1896 
1897 	cache->metadata_dev = ca->metadata_dev;
1898 	cache->origin_dev = ca->origin_dev;
1899 	cache->cache_dev = ca->cache_dev;
1900 
1901 	ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
1902 
1903 	/* FIXME: factor out this whole section */
1904 	origin_blocks = cache->origin_sectors = ca->origin_sectors;
1905 	origin_blocks = block_div(origin_blocks, ca->block_size);
1906 	cache->origin_blocks = to_oblock(origin_blocks);
1907 
1908 	cache->sectors_per_block = ca->block_size;
1909 	if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) {
1910 		r = -EINVAL;
1911 		goto bad;
1912 	}
1913 
1914 	if (ca->block_size & (ca->block_size - 1)) {
1915 		dm_block_t cache_size = ca->cache_sectors;
1916 
1917 		cache->sectors_per_block_shift = -1;
1918 		cache_size = block_div(cache_size, ca->block_size);
1919 		cache->cache_size = to_cblock(cache_size);
1920 	} else {
1921 		cache->sectors_per_block_shift = __ffs(ca->block_size);
1922 		cache->cache_size = to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift);
1923 	}
1924 
1925 	r = create_cache_policy(cache, ca, error);
1926 	if (r)
1927 		goto bad;
1928 	cache->policy_nr_args = ca->policy_argc;
1929 
1930 	cmd = dm_cache_metadata_open(cache->metadata_dev->bdev,
1931 				     ca->block_size, may_format,
1932 				     dm_cache_policy_get_hint_size(cache->policy));
1933 	if (IS_ERR(cmd)) {
1934 		*error = "Error creating metadata object";
1935 		r = PTR_ERR(cmd);
1936 		goto bad;
1937 	}
1938 	cache->cmd = cmd;
1939 
1940 	spin_lock_init(&cache->lock);
1941 	bio_list_init(&cache->deferred_bios);
1942 	bio_list_init(&cache->deferred_flush_bios);
1943 	bio_list_init(&cache->deferred_writethrough_bios);
1944 	INIT_LIST_HEAD(&cache->quiesced_migrations);
1945 	INIT_LIST_HEAD(&cache->completed_migrations);
1946 	INIT_LIST_HEAD(&cache->need_commit_migrations);
1947 	cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD;
1948 	atomic_set(&cache->nr_migrations, 0);
1949 	init_waitqueue_head(&cache->migration_wait);
1950 
1951 	cache->nr_dirty = 0;
1952 	cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
1953 	if (!cache->dirty_bitset) {
1954 		*error = "could not allocate dirty bitset";
		r = -ENOMEM;
1955 		goto bad;
1956 	}
1957 	clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));
1958 
1959 	cache->discard_block_size =
1960 		calculate_discard_block_size(cache->sectors_per_block,
1961 					     cache->origin_sectors);
1962 	cache->discard_nr_blocks = oblock_to_dblock(cache, cache->origin_blocks);
1963 	cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
1964 	if (!cache->discard_bitset) {
1965 		*error = "could not allocate discard bitset";
		r = -ENOMEM;
1966 		goto bad;
1967 	}
1968 	clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
1969 
1970 	cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
1971 	if (IS_ERR(cache->copier)) {
1972 		*error = "could not create kcopyd client";
1973 		r = PTR_ERR(cache->copier);
1974 		goto bad;
1975 	}
1976 
1977 	cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
1978 	if (!cache->wq) {
1979 		*error = "could not create workqueue for metadata object";
		r = -ENOMEM;
1980 		goto bad;
1981 	}
1982 	INIT_WORK(&cache->worker, do_worker);
1983 	INIT_DELAYED_WORK(&cache->waker, do_waker);
1984 	cache->last_commit_jiffies = jiffies;
1985 
1986 	cache->prison = dm_bio_prison_create(PRISON_CELLS);
1987 	if (!cache->prison) {
1988 		*error = "could not create bio prison";
		r = -ENOMEM;
1989 		goto bad;
1990 	}
1991 
1992 	cache->all_io_ds = dm_deferred_set_create();
1993 	if (!cache->all_io_ds) {
1994 		*error = "could not create all_io deferred set";
		r = -ENOMEM;
1995 		goto bad;
1996 	}
1997 
1998 	cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE,
1999 							 migration_cache);
2000 	if (!cache->migration_pool) {
2001 		*error = "Error creating cache's migration mempool";
		r = -ENOMEM;
2002 		goto bad;
2003 	}
2004 
2005 	cache->next_migration = NULL;
2006 
2007 	cache->need_tick_bio = true;
2008 	cache->sized = false;
2009 	cache->quiescing = false;
2010 	cache->commit_requested = false;
2011 	cache->loaded_mappings = false;
2012 	cache->loaded_discards = false;
2013 
2014 	load_stats(cache);
2015 
2016 	atomic_set(&cache->stats.demotion, 0);
2017 	atomic_set(&cache->stats.promotion, 0);
2018 	atomic_set(&cache->stats.copies_avoided, 0);
2019 	atomic_set(&cache->stats.cache_cell_clash, 0);
2020 	atomic_set(&cache->stats.commit_count, 0);
2021 	atomic_set(&cache->stats.discard_count, 0);
2022 
2023 	*result = cache;
2024 	return 0;
2025 
2026 bad:
2027 	destroy(cache);
2028 	return r;
2029 }
2030 
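/*
 * copy_ctr_args() keeps a copy of the constructor arguments so that
 * cache_status(STATUSTYPE_TABLE) can reproduce the table line; the
 * caller passes everything after the three device arguments (metadata,
 * cache and origin devices).
 */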
2031 static int copy_ctr_args(struct cache *cache, int argc, const char **argv)
2032 {
2033 	unsigned i;
2034 	const char **copy;
2035 
2036 	copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
2037 	if (!copy)
2038 		return -ENOMEM;
2039 	for (i = 0; i < argc; i++) {
2040 		copy[i] = kstrdup(argv[i], GFP_KERNEL);
2041 		if (!copy[i]) {
2042 			while (i--)
2043 				kfree(copy[i]);
2044 			kfree(copy);
2045 			return -ENOMEM;
2046 		}
2047 	}
2048 
2049 	cache->nr_ctr_args = argc;
2050 	cache->ctr_args = copy;
2051 
2052 	return 0;
2053 }
2054 
2055 static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
2056 {
2057 	int r = -EINVAL;
2058 	struct cache_args *ca;
2059 	struct cache *cache = NULL;
2060 
2061 	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
2062 	if (!ca) {
2063 		ti->error = "Error allocating memory for cache";
2064 		return -ENOMEM;
2065 	}
2066 	ca->ti = ti;
2067 
2068 	r = parse_cache_args(ca, argc, argv, &ti->error);
2069 	if (r)
2070 		goto out;
2071 
2072 	r = cache_create(ca, &cache);
2073 	if (r)
2074 		goto out;
2075 
2076 	r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
2077 	if (r) {
2078 		destroy(cache);
2079 		goto out;
2080 	}
2081 
2082 	ti->private = cache;
2083 
2084 out:
2085 	destroy_cache_args(ca);
2086 	return r;
2087 }
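/*
 * For reference, a constructed table line looks something like the
 * following (device paths, sizes and policy are purely illustrative):
 *
 *	0 41943040 cache /dev/mapper/fast-meta /dev/mapper/fast \
 *		/dev/mapper/slow 512 1 writethrough default 0
 *
 * i.e. metadata dev, cache dev, origin dev, block size in sectors,
 * feature args, then the policy name and its key/value arguments.
 */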
2088 
2089 static int cache_map(struct dm_target *ti, struct bio *bio)
2090 {
2091 	struct cache *cache = ti->private;
2092 
2093 	int r;
2094 	dm_oblock_t block = get_bio_block(cache, bio);
2095 	bool can_migrate = false;
2096 	bool discarded_block;
2097 	struct dm_bio_prison_cell *cell;
2098 	struct policy_result lookup_result;
2099 	struct per_bio_data *pb;
2100 
2101 	if (from_oblock(block) >= from_oblock(cache->origin_blocks)) {
2102 		/*
2103 		 * This can only occur if the io goes to a partial block at
2104 		 * the end of the origin device.  We don't cache these.
2105 		 * Just remap to the origin and carry on.
2106 		 */
2107 		remap_to_origin_clear_discard(cache, bio, block);
2108 		return DM_MAPIO_REMAPPED;
2109 	}
2110 
2111 	pb = init_per_bio_data(bio);
2112 
2113 	if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) {
2114 		defer_bio(cache, bio);
2115 		return DM_MAPIO_SUBMITTED;
2116 	}
2117 
2118 	/*
2119 	 * Check to see if that block is currently migrating.
2120 	 */
2121 	cell = alloc_prison_cell(cache);
2122 	if (!cell) {
2123 		defer_bio(cache, bio);
2124 		return DM_MAPIO_SUBMITTED;
2125 	}
2126 
2127 	r = bio_detain(cache, block, bio, cell,
2128 		       (cell_free_fn) free_prison_cell,
2129 		       cache, &cell);
2130 	if (r) {
2131 		if (r < 0)
2132 			defer_bio(cache, bio);
2133 
2134 		return DM_MAPIO_SUBMITTED;
2135 	}
2136 
2137 	discarded_block = is_discarded_oblock(cache, block);
2138 
2139 	r = policy_map(cache->policy, block, false, can_migrate, discarded_block,
2140 		       bio, &lookup_result);
2141 	if (r == -EWOULDBLOCK) {
2142 		cell_defer(cache, cell, true);
2143 		return DM_MAPIO_SUBMITTED;
2144 
2145 	} else if (r) {
2146 		DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r);
2147 		bio_io_error(bio);
2148 		return DM_MAPIO_SUBMITTED;
2149 	}
2150 
2151 	switch (lookup_result.op) {
2152 	case POLICY_HIT:
2153 		inc_hit_counter(cache, bio);
2154 		pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
2155 
2156 		if (is_writethrough_io(cache, bio, lookup_result.cblock))
2157 			remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
2158 		else
2159 			remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
2160 
2161 		cell_defer(cache, cell, false);
2162 		break;
2163 
2164 	case POLICY_MISS:
2165 		inc_miss_counter(cache, bio);
2166 		pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
2167 
2168 		if (pb->req_nr != 0) {
2169 			/*
2170 			 * This is a duplicate writethrough io that is no
2171 			 * longer needed because the block has been demoted.
2172 			 */
2173 			bio_endio(bio, 0);
2174 			cell_defer(cache, cell, false);
2175 			return DM_MAPIO_SUBMITTED;
2176 		} else {
2177 			remap_to_origin_clear_discard(cache, bio, block);
2178 			cell_defer(cache, cell, false);
2179 		}
2180 		break;
2181 
2182 	default:
2183 		DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__,
2184 			    (unsigned) lookup_result.op);
2185 		bio_io_error(bio);
2186 		return DM_MAPIO_SUBMITTED;
2187 	}
2188 
2189 	return DM_MAPIO_REMAPPED;
2190 }
2191 
2192 static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
2193 {
2194 	struct cache *cache = ti->private;
2195 	unsigned long flags;
2196 	struct per_bio_data *pb = get_per_bio_data(bio);
2197 
2198 	if (pb->tick) {
2199 		policy_tick(cache->policy);
2200 
2201 		spin_lock_irqsave(&cache->lock, flags);
2202 		cache->need_tick_bio = true;
2203 		spin_unlock_irqrestore(&cache->lock, flags);
2204 	}
2205 
2206 	check_for_quiesced_migrations(cache, pb);
2207 
2208 	return 0;
2209 }
2210 
2211 static int write_dirty_bitset(struct cache *cache)
2212 {
2213 	unsigned i;
	int r;
2214 
2215 	for (i = 0; i < from_cblock(cache->cache_size); i++) {
2216 		r = dm_cache_set_dirty(cache->cmd, to_cblock(i),
2217 				       is_dirty(cache, to_cblock(i)));
2218 		if (r)
2219 			return r;
2220 	}
2221 
2222 	return 0;
2223 }
2224 
2225 static int write_discard_bitset(struct cache *cache)
2226 {
2227 	unsigned i;
	int r;
2228 
2229 	r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
2230 					   cache->discard_nr_blocks);
2231 	if (r) {
2232 		DMERR("could not resize on-disk discard bitset");
2233 		return r;
2234 	}
2235 
2236 	for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
2237 		r = dm_cache_set_discard(cache->cmd, to_dblock(i),
2238 					 is_discarded(cache, to_dblock(i)));
2239 		if (r)
2240 			return r;
2241 	}
2242 
2243 	return 0;
2244 }
2245 
2246 static int save_hint(void *context, dm_cblock_t cblock, dm_oblock_t oblock,
2247 		     uint32_t hint)
2248 {
2249 	struct cache *cache = context;
2250 	return dm_cache_save_hint(cache->cmd, cblock, hint);
2251 }
2252 
2253 static int write_hints(struct cache *cache)
2254 {
2255 	int r;
2256 
2257 	r = dm_cache_begin_hints(cache->cmd, cache->policy);
2258 	if (r) {
2259 		DMERR("dm_cache_begin_hints failed");
2260 		return r;
2261 	}
2262 
2263 	r = policy_walk_mappings(cache->policy, save_hint, cache);
2264 	if (r)
2265 		DMERR("policy_walk_mappings failed");
2266 
2267 	return r;
2268 }
2269 
2270 /*
2271  * returns true on success
2272  */
2273 static bool sync_metadata(struct cache *cache)
2274 {
2275 	int r1, r2, r3, r4;
2276 
2277 	r1 = write_dirty_bitset(cache);
2278 	if (r1)
2279 		DMERR("could not write dirty bitset");
2280 
2281 	r2 = write_discard_bitset(cache);
2282 	if (r2)
2283 		DMERR("could not write discard bitset");
2284 
2285 	save_stats(cache);
2286 
2287 	r3 = write_hints(cache);
2288 	if (r3)
2289 		DMERR("could not write hints");
2290 
2291 	/*
2292 	 * If writing the above metadata failed, we still commit, but don't
2293 	 * set the clean shutdown flag.  This will effectively force every
2294 	 * dirty bit to be set on reload.
2295 	 */
2296 	r4 = dm_cache_commit(cache->cmd, !r1 && !r2 && !r3);
2297 	if (r4)
2298 		DMERR("could not write cache metadata.  Data loss may occur.");
2299 
2300 	return !r1 && !r2 && !r3 && !r4;
2301 }
2302 
2303 static void cache_postsuspend(struct dm_target *ti)
2304 {
2305 	struct cache *cache = ti->private;
2306 
2307 	start_quiescing(cache);
2308 	wait_for_migrations(cache);
2309 	stop_worker(cache);
2310 	requeue_deferred_io(cache);
2311 	stop_quiescing(cache);
2312 
2313 	(void) sync_metadata(cache);
2314 }
2315 
2316 static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
2317 			bool dirty, uint32_t hint, bool hint_valid)
2318 {
2319 	int r;
2320 	struct cache *cache = context;
2321 
2322 	r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid);
2323 	if (r)
2324 		return r;
2325 
2326 	if (dirty)
2327 		set_dirty(cache, oblock, cblock);
2328 	else
2329 		clear_dirty(cache, oblock, cblock);
2330 
2331 	return 0;
2332 }
2333 
2334 static int load_discard(void *context, sector_t discard_block_size,
2335 			dm_dblock_t dblock, bool discard)
2336 {
2337 	struct cache *cache = context;
2338 
2339 	/* FIXME: handle mis-matched block size */
2340 
2341 	if (discard)
2342 		set_discard(cache, dblock);
2343 	else
2344 		clear_discard(cache, dblock);
2345 
2346 	return 0;
2347 }
2348 
2349 static int cache_preresume(struct dm_target *ti)
2350 {
2351 	int r = 0;
2352 	struct cache *cache = ti->private;
2353 	sector_t actual_cache_size = get_dev_size(cache->cache_dev);
2354 	(void) sector_div(actual_cache_size, cache->sectors_per_block);
2355 
2356 	/*
2357 	 * Check to see if the cache device has been resized.
2358 	 */
2359 	if (from_cblock(cache->cache_size) != actual_cache_size || !cache->sized) {
2360 		cache->cache_size = to_cblock(actual_cache_size);
2361 
2362 		r = dm_cache_resize(cache->cmd, cache->cache_size);
2363 		if (r) {
2364 			DMERR("could not resize cache metadata");
2365 			return r;
2366 		}
2367 
2368 		cache->sized = true;
2369 	}
2370 
2371 	if (!cache->loaded_mappings) {
2372 		r = dm_cache_load_mappings(cache->cmd, cache->policy,
2373 					   load_mapping, cache);
2374 		if (r) {
2375 			DMERR("could not load cache mappings");
2376 			return r;
2377 		}
2378 
2379 		cache->loaded_mappings = true;
2380 	}
2381 
2382 	if (!cache->loaded_discards) {
2383 		r = dm_cache_load_discards(cache->cmd, load_discard, cache);
2384 		if (r) {
2385 			DMERR("could not load origin discards");
2386 			return r;
2387 		}
2388 
2389 		cache->loaded_discards = true;
2390 	}
2391 
2392 	return r;
2393 }
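/*
 * Growing the underlying cache device (e.g. via lvextend on an
 * LVM-backed cache; the tooling here is illustrative) is picked up by
 * cache_preresume() above on the next resume: actual_cache_size no
 * longer matches cache->cache_size, so dm_cache_resize() grows the
 * on-disk metadata accordingly.
 */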
2394 
2395 static void cache_resume(struct dm_target *ti)
2396 {
2397 	struct cache *cache = ti->private;
2398 
2399 	cache->need_tick_bio = true;
2400 	do_waker(&cache->waker.work);
2401 }
2402 
2403 /*
2404  * Status format:
2405  *
2406  * <#used metadata blocks>/<#total metadata blocks>
2407  * <#read hits> <#read misses> <#write hits> <#write misses>
2408  * <#demotions> <#promotions> <#blocks in cache> <#dirty>
2409  * <#features> <features>*
2410  * <#core args> <core args>*
2411  * <#policy args> <policy args>*
2412  */
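/*
 * An illustrative INFO line (all numbers made up) for a writethrough
 * cache whose policy emits no config values:
 *
 *	89/1024 1432 87 1174 206 5 14 1980 0 1 writethrough 2 migration_threshold 204800
 */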
2413 static void cache_status(struct dm_target *ti, status_type_t type,
2414 			 unsigned status_flags, char *result, unsigned maxlen)
2415 {
2416 	int r = 0;
2417 	unsigned i;
2418 	ssize_t sz = 0;
2419 	dm_block_t nr_free_blocks_metadata = 0;
2420 	dm_block_t nr_blocks_metadata = 0;
2421 	char buf[BDEVNAME_SIZE];
2422 	struct cache *cache = ti->private;
2423 	dm_cblock_t residency;
2424 
2425 	switch (type) {
2426 	case STATUSTYPE_INFO:
2427 		/* Commit to ensure statistics aren't out-of-date */
2428 		if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) {
2429 			r = dm_cache_commit(cache->cmd, false);
2430 			if (r)
2431 				DMERR("could not commit metadata for accurate status");
2432 		}
2433 
2434 		r = dm_cache_get_free_metadata_block_count(cache->cmd,
2435 							   &nr_free_blocks_metadata);
2436 		if (r) {
2437 			DMERR("could not get metadata free block count");
2438 			goto err;
2439 		}
2440 
2441 		r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
2442 		if (r) {
2443 			DMERR("could not get metadata device size");
2444 			goto err;
2445 		}
2446 
2447 		residency = policy_residency(cache->policy);
2448 
2449 		DMEMIT("%llu/%llu %u %u %u %u %u %u %llu %u ",
2450 		       (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
2451 		       (unsigned long long)nr_blocks_metadata,
2452 		       (unsigned) atomic_read(&cache->stats.read_hit),
2453 		       (unsigned) atomic_read(&cache->stats.read_miss),
2454 		       (unsigned) atomic_read(&cache->stats.write_hit),
2455 		       (unsigned) atomic_read(&cache->stats.write_miss),
2456 		       (unsigned) atomic_read(&cache->stats.demotion),
2457 		       (unsigned) atomic_read(&cache->stats.promotion),
2458 		       (unsigned long long) from_cblock(residency),
2459 		       cache->nr_dirty);
2460 
2461 		if (cache->features.write_through)
2462 			DMEMIT("1 writethrough ");
2463 		else
2464 			DMEMIT("0 ");
2465 
2466 		DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
2467 		if (sz < maxlen) {
2468 			r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz);
2469 			if (r)
2470 				DMERR("policy_emit_config_values returned %d", r);
2471 		}
2472 
2473 		break;
2474 
2475 	case STATUSTYPE_TABLE:
2476 		format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
2477 		DMEMIT("%s ", buf);
2478 		format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
2479 		DMEMIT("%s ", buf);
2480 		format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
2481 		DMEMIT("%s", buf);
2482 
2483 		for (i = 0; i < cache->nr_ctr_args - 1; i++)
2484 			DMEMIT(" %s", cache->ctr_args[i]);
2485 		if (cache->nr_ctr_args)
2486 			DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]);
2487 	}
2488 
2489 	return;
2490 
2491 err:
2492 	DMEMIT("Error");
2493 }
2494 
2495 #define NOT_CORE_OPTION 1
2496 
2497 static int process_config_option(struct cache *cache, char **argv)
2498 {
2499 	unsigned long tmp;
2500 
2501 	if (!strcasecmp(argv[0], "migration_threshold")) {
2502 		if (kstrtoul(argv[1], 10, &tmp))
2503 			return -EINVAL;
2504 
2505 		cache->migration_threshold = tmp;
2506 		return 0;
2507 	}
2508 
2509 	return NOT_CORE_OPTION;
2510 }
2511 
2512 /*
2513  * Supports <key> <value>.
2514  *
2515  * The key migration_threshold is supported by the cache target core.
2516  */
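/*
 * For example, from userspace (the device name is hypothetical):
 *
 *	dmsetup message my-cache 0 migration_threshold 204800
 *
 * Keys the core does not recognise are handed to the policy.
 */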
2517 static int cache_message(struct dm_target *ti, unsigned argc, char **argv)
2518 {
2519 	int r;
2520 	struct cache *cache = ti->private;
2521 
2522 	if (argc != 2)
2523 		return -EINVAL;
2524 
2525 	r = process_config_option(cache, argv);
2526 	if (r == NOT_CORE_OPTION)
2527 		return policy_set_config_value(cache->policy, argv[0], argv[1]);
2528 
2529 	return r;
2530 }
2531 
2532 static int cache_iterate_devices(struct dm_target *ti,
2533 				 iterate_devices_callout_fn fn, void *data)
2534 {
2535 	int r = 0;
2536 	struct cache *cache = ti->private;
2537 
2538 	r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data);
2539 	if (!r)
2540 		r = fn(ti, cache->origin_dev, 0, ti->len, data);
2541 
2542 	return r;
2543 }
2544 
2545 /*
2546  * We assume I/O is going to the origin (which is the volume
2547  * more likely to have restrictions e.g. by being striped).
2548  * (Looking up the exact location of the data would be expensive
2549  * and could always be out of date by the time the bio is submitted.)
2550  */
2551 static int cache_bvec_merge(struct dm_target *ti,
2552 			    struct bvec_merge_data *bvm,
2553 			    struct bio_vec *biovec, int max_size)
2554 {
2555 	struct cache *cache = ti->private;
2556 	struct request_queue *q = bdev_get_queue(cache->origin_dev->bdev);
2557 
2558 	if (!q->merge_bvec_fn)
2559 		return max_size;
2560 
2561 	bvm->bi_bdev = cache->origin_dev->bdev;
2562 	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
2563 }
2564 
2565 static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
2566 {
2567 	/*
2568 	 * FIXME: these limits may be incompatible with the cache device
2569 	 */
2570 	limits->max_discard_sectors = cache->discard_block_size * 1024;
2571 	limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
2572 }
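/*
 * For example (figures are illustrative only): a discard block size of
 * 2^17 sectors (64MiB) yields a discard_granularity of 64MiB and a
 * max_discard_sectors of 2^27 sectors (64GiB).
 */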
2573 
2574 static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
2575 {
2576 	struct cache *cache = ti->private;
2577 
2578 	blk_limits_io_min(limits, 0);
2579 	blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
2580 	set_discard_limits(cache, limits);
2581 }
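/*
 * For example (illustrative): with 512 sector cache blocks the optimal
 * I/O size advertised above is 512 << SECTOR_SHIFT, i.e. 256KiB.
 */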
2582 
2583 /*----------------------------------------------------------------*/
2584 
2585 static struct target_type cache_target = {
2586 	.name = "cache",
2587 	.version = {1, 1, 0},
2588 	.module = THIS_MODULE,
2589 	.ctr = cache_ctr,
2590 	.dtr = cache_dtr,
2591 	.map = cache_map,
2592 	.end_io = cache_end_io,
2593 	.postsuspend = cache_postsuspend,
2594 	.preresume = cache_preresume,
2595 	.resume = cache_resume,
2596 	.status = cache_status,
2597 	.message = cache_message,
2598 	.iterate_devices = cache_iterate_devices,
2599 	.merge = cache_bvec_merge,
2600 	.io_hints = cache_io_hints,
2601 };
2602 
2603 static int __init dm_cache_init(void)
2604 {
2605 	int r;
2606 
2607 	r = dm_register_target(&cache_target);
2608 	if (r) {
2609 		DMERR("cache target registration failed: %d", r);
2610 		return r;
2611 	}
2612 
2613 	migration_cache = KMEM_CACHE(dm_cache_migration, 0);
2614 	if (!migration_cache) {
2615 		dm_unregister_target(&cache_target);
2616 		return -ENOMEM;
2617 	}
2618 
2619 	return 0;
2620 }
2621 
2622 static void __exit dm_cache_exit(void)
2623 {
2624 	dm_unregister_target(&cache_target);
2625 	kmem_cache_destroy(migration_cache);
2626 }
2627 
2628 module_init(dm_cache_init);
2629 module_exit(dm_cache_exit);
2630 
2631 MODULE_DESCRIPTION(DM_NAME " cache target");
2632 MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
2633 MODULE_LICENSE("GPL");
2634