xref: /linux/drivers/md/dm-cache-policy-smq.c (revision 0883c2c06fb5bcf5b9e008270827e63c09a88c1e)
1 /*
2  * Copyright (C) 2015 Red Hat. All rights reserved.
3  *
4  * This file is released under the GPL.
5  */
6 
7 #include "dm-cache-policy.h"
8 #include "dm-cache-policy-internal.h"
9 #include "dm.h"
10 
11 #include <linux/hash.h>
12 #include <linux/jiffies.h>
13 #include <linux/module.h>
14 #include <linux/mutex.h>
15 #include <linux/vmalloc.h>
16 #include <linux/math64.h>
17 
/*
 * Message prefix string for this policy.
 * NOTE(review): presumably consumed by the DM logging macros declared
 * outside this file -- confirm.
 */
#define DM_MSG_PREFIX "cache-policy-smq"
19 
20 /*----------------------------------------------------------------*/
21 
22 /*
23  * Safe division functions that return zero on divide by zero.
24  */
/* Returns n / d, or 0 when the divisor is zero. */
static unsigned safe_div(unsigned n, unsigned d)
{
	if (!d)
		return 0u;

	return n / d;
}
29 
/* Returns n % d, or 0 when the divisor is zero. */
static unsigned safe_mod(unsigned n, unsigned d)
{
	if (!d)
		return 0u;

	return n % d;
}
34 
35 /*----------------------------------------------------------------*/
36 
37 struct entry {
38 	unsigned hash_next:28;
39 	unsigned prev:28;
40 	unsigned next:28;
41 	unsigned level:7;
42 	bool dirty:1;
43 	bool allocated:1;
44 	bool sentinel:1;
45 
46 	dm_oblock_t oblock;
47 };
48 
49 /*----------------------------------------------------------------*/
50 
/*
 * Index value meaning "no entry": all 28 bits set, matching the width of
 * the hash_next/prev/next fields of struct entry.
 */
#define INDEXER_NULL ((1u << 28u) - 1u)
52 
53 /*
54  * An entry_space manages a set of entries that we use for the queues.
55  * The clean and dirty queues share entries, so this object is separate
56  * from the queue itself.
57  */
struct entry_space {
	struct entry *begin;	/* first entry of the array (NULL when empty) */
	struct entry *end;	/* one past the last entry */
};
62 
63 static int space_init(struct entry_space *es, unsigned nr_entries)
64 {
65 	if (!nr_entries) {
66 		es->begin = es->end = NULL;
67 		return 0;
68 	}
69 
70 	es->begin = vzalloc(sizeof(struct entry) * nr_entries);
71 	if (!es->begin)
72 		return -ENOMEM;
73 
74 	es->end = es->begin + nr_entries;
75 	return 0;
76 }
77 
/* Releases the entry array set up by space_init(). */
static void space_exit(struct entry_space *es)
{
	vfree(es->begin);
}
82 
83 static struct entry *__get_entry(struct entry_space *es, unsigned block)
84 {
85 	struct entry *e;
86 
87 	e = es->begin + block;
88 	BUG_ON(e >= es->end);
89 
90 	return e;
91 }
92 
93 static unsigned to_index(struct entry_space *es, struct entry *e)
94 {
95 	BUG_ON(e < es->begin || e >= es->end);
96 	return e - es->begin;
97 }
98 
99 static struct entry *to_entry(struct entry_space *es, unsigned block)
100 {
101 	if (block == INDEXER_NULL)
102 		return NULL;
103 
104 	return __get_entry(es, block);
105 }
106 
107 /*----------------------------------------------------------------*/
108 
/* A doubly linked list of entries, linked by index rather than pointer. */
struct ilist {
	unsigned int nr_elts;	/* excluding sentinel entries */
	unsigned int head;	/* index of the first entry, or INDEXER_NULL */
	unsigned int tail;	/* index of the last entry, or INDEXER_NULL */
};
113 
114 static void l_init(struct ilist *l)
115 {
116 	l->nr_elts = 0;
117 	l->head = l->tail = INDEXER_NULL;
118 }
119 
/* Returns the first entry of l, or NULL if the list is empty. */
static struct entry *l_head(struct entry_space *es, struct ilist *l)
{
	return to_entry(es, l->head);
}
124 
/* Returns the last entry of l, or NULL if the list is empty. */
static struct entry *l_tail(struct entry_space *es, struct ilist *l)
{
	return to_entry(es, l->tail);
}
129 
/* Returns the entry following e, or NULL if e->next is INDEXER_NULL. */
static struct entry *l_next(struct entry_space *es, struct entry *e)
{
	return to_entry(es, e->next);
}
134 
/* Returns the entry preceding e, or NULL if e->prev is INDEXER_NULL. */
static struct entry *l_prev(struct entry_space *es, struct entry *e)
{
	return to_entry(es, e->prev);
}
139 
/*
 * True when nothing is linked into l.  This tests the head index, so a
 * list holding only sentinels is not considered empty.
 */
static bool l_empty(struct ilist *l)
{
	return l->head == INDEXER_NULL;
}
144 
145 static void l_add_head(struct entry_space *es, struct ilist *l, struct entry *e)
146 {
147 	struct entry *head = l_head(es, l);
148 
149 	e->next = l->head;
150 	e->prev = INDEXER_NULL;
151 
152 	if (head)
153 		head->prev = l->head = to_index(es, e);
154 	else
155 		l->head = l->tail = to_index(es, e);
156 
157 	if (!e->sentinel)
158 		l->nr_elts++;
159 }
160 
161 static void l_add_tail(struct entry_space *es, struct ilist *l, struct entry *e)
162 {
163 	struct entry *tail = l_tail(es, l);
164 
165 	e->next = INDEXER_NULL;
166 	e->prev = l->tail;
167 
168 	if (tail)
169 		tail->next = l->tail = to_index(es, e);
170 	else
171 		l->head = l->tail = to_index(es, e);
172 
173 	if (!e->sentinel)
174 		l->nr_elts++;
175 }
176 
177 static void l_add_before(struct entry_space *es, struct ilist *l,
178 			 struct entry *old, struct entry *e)
179 {
180 	struct entry *prev = l_prev(es, old);
181 
182 	if (!prev)
183 		l_add_head(es, l, e);
184 
185 	else {
186 		e->prev = old->prev;
187 		e->next = to_index(es, old);
188 		prev->next = old->prev = to_index(es, e);
189 
190 		if (!e->sentinel)
191 			l->nr_elts++;
192 	}
193 }
194 
195 static void l_del(struct entry_space *es, struct ilist *l, struct entry *e)
196 {
197 	struct entry *prev = l_prev(es, e);
198 	struct entry *next = l_next(es, e);
199 
200 	if (prev)
201 		prev->next = e->next;
202 	else
203 		l->head = e->next;
204 
205 	if (next)
206 		next->prev = e->prev;
207 	else
208 		l->tail = e->prev;
209 
210 	if (!e->sentinel)
211 		l->nr_elts--;
212 }
213 
214 static struct entry *l_pop_tail(struct entry_space *es, struct ilist *l)
215 {
216 	struct entry *e;
217 
218 	for (e = l_tail(es, l); e; e = l_prev(es, e))
219 		if (!e->sentinel) {
220 			l_del(es, l, e);
221 			return e;
222 		}
223 
224 	return NULL;
225 }
226 
227 /*----------------------------------------------------------------*/
228 
229 /*
230  * The stochastic-multi-queue is a set of lru lists stacked into levels.
231  * Entries are moved up levels when they are used, which loosely orders the
232  * most accessed entries in the top levels and least in the bottom.  This
233  * structure is *much* better than a single lru list.
234  */
235 #define MAX_LEVELS 64u
236 
237 struct queue {
238 	struct entry_space *es;
239 
240 	unsigned nr_elts;
241 	unsigned nr_levels;
242 	struct ilist qs[MAX_LEVELS];
243 
244 	/*
245 	 * We maintain a count of the number of entries we would like in each
246 	 * level.
247 	 */
248 	unsigned last_target_nr_elts;
249 	unsigned nr_top_levels;
250 	unsigned nr_in_top_levels;
251 	unsigned target_count[MAX_LEVELS];
252 };
253 
254 static void q_init(struct queue *q, struct entry_space *es, unsigned nr_levels)
255 {
256 	unsigned i;
257 
258 	q->es = es;
259 	q->nr_elts = 0;
260 	q->nr_levels = nr_levels;
261 
262 	for (i = 0; i < q->nr_levels; i++) {
263 		l_init(q->qs + i);
264 		q->target_count[i] = 0u;
265 	}
266 
267 	q->last_target_nr_elts = 0u;
268 	q->nr_top_levels = 0u;
269 	q->nr_in_top_levels = 0u;
270 }
271 
/* Number of non-sentinel entries currently held across all levels. */
static unsigned q_size(struct queue *q)
{
	return q->nr_elts;
}
276 
277 /*
278  * Insert an entry to the back of the given level.
279  */
280 static void q_push(struct queue *q, struct entry *e)
281 {
282 	if (!e->sentinel)
283 		q->nr_elts++;
284 
285 	l_add_tail(q->es, q->qs + e->level, e);
286 }
287 
288 static void q_push_before(struct queue *q, struct entry *old, struct entry *e)
289 {
290 	if (!e->sentinel)
291 		q->nr_elts++;
292 
293 	l_add_before(q->es, q->qs + e->level, old, e);
294 }
295 
296 static void q_del(struct queue *q, struct entry *e)
297 {
298 	l_del(q->es, q->qs + e->level, e);
299 	if (!e->sentinel)
300 		q->nr_elts--;
301 }
302 
303 /*
304  * Return the oldest entry of the lowest populated level.
305  */
306 static struct entry *q_peek(struct queue *q, unsigned max_level, bool can_cross_sentinel)
307 {
308 	unsigned level;
309 	struct entry *e;
310 
311 	max_level = min(max_level, q->nr_levels);
312 
313 	for (level = 0; level < max_level; level++)
314 		for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e)) {
315 			if (e->sentinel) {
316 				if (can_cross_sentinel)
317 					continue;
318 				else
319 					break;
320 			}
321 
322 			return e;
323 		}
324 
325 	return NULL;
326 }
327 
328 static struct entry *q_pop(struct queue *q)
329 {
330 	struct entry *e = q_peek(q, q->nr_levels, true);
331 
332 	if (e)
333 		q_del(q, e);
334 
335 	return e;
336 }
337 
338 /*
339  * Pops an entry from a level that is not past a sentinel.
340  */
341 static struct entry *q_pop_old(struct queue *q, unsigned max_level)
342 {
343 	struct entry *e = q_peek(q, max_level, false);
344 
345 	if (e)
346 		q_del(q, e);
347 
348 	return e;
349 }
350 
351 /*
352  * This function assumes there is a non-sentinel entry to pop.  It's only
353  * used by redistribute, so we know this is true.  It also doesn't adjust
354  * the q->nr_elts count.
355  */
356 static struct entry *__redist_pop_from(struct queue *q, unsigned level)
357 {
358 	struct entry *e;
359 
360 	for (; level < q->nr_levels; level++)
361 		for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e))
362 			if (!e->sentinel) {
363 				l_del(q->es, q->qs + e->level, e);
364 				return e;
365 			}
366 
367 	return NULL;
368 }
369 
370 static void q_set_targets_subrange_(struct queue *q, unsigned nr_elts, unsigned lbegin, unsigned lend)
371 {
372 	unsigned level, nr_levels, entries_per_level, remainder;
373 
374 	BUG_ON(lbegin > lend);
375 	BUG_ON(lend > q->nr_levels);
376 	nr_levels = lend - lbegin;
377 	entries_per_level = safe_div(nr_elts, nr_levels);
378 	remainder = safe_mod(nr_elts, nr_levels);
379 
380 	for (level = lbegin; level < lend; level++)
381 		q->target_count[level] =
382 			(level < (lbegin + remainder)) ? entries_per_level + 1u : entries_per_level;
383 }
384 
385 /*
386  * Typically we have fewer elements in the top few levels which allows us
387  * to adjust the promote threshold nicely.
388  */
389 static void q_set_targets(struct queue *q)
390 {
391 	if (q->last_target_nr_elts == q->nr_elts)
392 		return;
393 
394 	q->last_target_nr_elts = q->nr_elts;
395 
396 	if (q->nr_top_levels > q->nr_levels)
397 		q_set_targets_subrange_(q, q->nr_elts, 0, q->nr_levels);
398 
399 	else {
400 		q_set_targets_subrange_(q, q->nr_in_top_levels,
401 					q->nr_levels - q->nr_top_levels, q->nr_levels);
402 
403 		if (q->nr_in_top_levels < q->nr_elts)
404 			q_set_targets_subrange_(q, q->nr_elts - q->nr_in_top_levels,
405 						0, q->nr_levels - q->nr_top_levels);
406 		else
407 			q_set_targets_subrange_(q, 0, 0, q->nr_levels - q->nr_top_levels);
408 	}
409 }
410 
411 static void q_redistribute(struct queue *q)
412 {
413 	unsigned target, level;
414 	struct ilist *l, *l_above;
415 	struct entry *e;
416 
417 	q_set_targets(q);
418 
419 	for (level = 0u; level < q->nr_levels - 1u; level++) {
420 		l = q->qs + level;
421 		target = q->target_count[level];
422 
423 		/*
424 		 * Pull down some entries from the level above.
425 		 */
426 		while (l->nr_elts < target) {
427 			e = __redist_pop_from(q, level + 1u);
428 			if (!e) {
429 				/* bug in nr_elts */
430 				break;
431 			}
432 
433 			e->level = level;
434 			l_add_tail(q->es, l, e);
435 		}
436 
437 		/*
438 		 * Push some entries up.
439 		 */
440 		l_above = q->qs + level + 1u;
441 		while (l->nr_elts > target) {
442 			e = l_pop_tail(q->es, l);
443 
444 			if (!e)
445 				/* bug in nr_elts */
446 				break;
447 
448 			e->level = level + 1u;
449 			l_add_head(q->es, l_above, e);
450 		}
451 	}
452 }
453 
454 static void q_requeue_before(struct queue *q, struct entry *dest, struct entry *e, unsigned extra_levels)
455 {
456 	struct entry *de;
457 	unsigned new_level;
458 
459 	q_del(q, e);
460 
461 	if (extra_levels && (e->level < q->nr_levels - 1u)) {
462 		new_level = min(q->nr_levels - 1u, e->level + extra_levels);
463 		for (de = l_head(q->es, q->qs + new_level); de; de = l_next(q->es, de)) {
464 			if (de->sentinel)
465 				continue;
466 
467 			q_del(q, de);
468 			de->level = e->level;
469 
470 			if (dest)
471 				q_push_before(q, dest, de);
472 			else
473 				q_push(q, de);
474 			break;
475 		}
476 
477 		e->level = new_level;
478 	}
479 
480 	q_push(q, e);
481 }
482 
/*
 * q_requeue_before() with no destination entry: anything swapped down is
 * appended to the tail of e's old level.
 */
static void q_requeue(struct queue *q, struct entry *e, unsigned extra_levels)
{
	q_requeue_before(q, NULL, e, extra_levels);
}
487 
488 /*----------------------------------------------------------------*/
489 
/*
 * stats_assess() expresses the hit ratio in fixed point with FP_SHIFT
 * fractional bits, so 1.0 is (1u << FP_SHIFT).
 */
#define FP_SHIFT 8
#define SIXTEENTH (1u << (FP_SHIFT - 4u))	/* 1/16 in that format */
#define EIGHTH (1u << (FP_SHIFT - 3u))		/* 1/8 in that format */
493 
/* Hit/miss counters for one queue over the current period. */
struct stats {
	unsigned int hit_threshold;	/* accesses at or above this level are hits */
	unsigned int hits;
	unsigned int misses;
};
499 
/* Verdicts returned by stats_assess(), from worst to best. */
enum performance {
	Q_POOR,		/* hit ratio below 1/16 */
	Q_FAIR,		/* hit ratio below 1/8 */
	Q_WELL,		/* anything better */
};
505 
506 static void stats_init(struct stats *s, unsigned nr_levels)
507 {
508 	s->hit_threshold = (nr_levels * 3u) / 4u;
509 	s->hits = 0u;
510 	s->misses = 0u;
511 }
512 
513 static void stats_reset(struct stats *s)
514 {
515 	s->hits = s->misses = 0u;
516 }
517 
518 static void stats_level_accessed(struct stats *s, unsigned level)
519 {
520 	if (level >= s->hit_threshold)
521 		s->hits++;
522 	else
523 		s->misses++;
524 }
525 
/* Records a lookup that found no entry at all. */
static void stats_miss(struct stats *s)
{
	s->misses++;
}
530 
531 /*
532  * There are times when we don't have any confidence in the hotspot queue.
533  * Such as when a fresh cache is created and the blocks have been spread
534  * out across the levels, or if an io load changes.  We detect this by
535  * seeing how often a lookup is in the top levels of the hotspot queue.
536  */
537 static enum performance stats_assess(struct stats *s)
538 {
539 	unsigned confidence = safe_div(s->hits << FP_SHIFT, s->hits + s->misses);
540 
541 	if (confidence < SIXTEENTH)
542 		return Q_POOR;
543 
544 	else if (confidence < EIGHTH)
545 		return Q_FAIR;
546 
547 	else
548 		return Q_WELL;
549 }
550 
551 /*----------------------------------------------------------------*/
552 
/* Chained hash table whose buckets and chains hold entry indexes. */
struct hash_table {
	struct entry_space *es;
	unsigned long long hash_bits;	/* log2 of the number of buckets */
	unsigned int *buckets;		/* head index per bucket, or INDEXER_NULL */
};
558 
559 /*
560  * All cache entries are stored in a chained hash table.  To save space we
561  * use indexing again, and only store indexes to the next entry.
562  */
563 static int h_init(struct hash_table *ht, struct entry_space *es, unsigned nr_entries)
564 {
565 	unsigned i, nr_buckets;
566 
567 	ht->es = es;
568 	nr_buckets = roundup_pow_of_two(max(nr_entries / 4u, 16u));
569 	ht->hash_bits = __ffs(nr_buckets);
570 
571 	ht->buckets = vmalloc(sizeof(*ht->buckets) * nr_buckets);
572 	if (!ht->buckets)
573 		return -ENOMEM;
574 
575 	for (i = 0; i < nr_buckets; i++)
576 		ht->buckets[i] = INDEXER_NULL;
577 
578 	return 0;
579 }
580 
/* Releases the bucket array allocated by h_init(). */
static void h_exit(struct hash_table *ht)
{
	vfree(ht->buckets);
}
585 
/* Returns the first entry chained on the given bucket, or NULL if empty. */
static struct entry *h_head(struct hash_table *ht, unsigned bucket)
{
	return to_entry(ht->es, ht->buckets[bucket]);
}
590 
/* Returns the entry after e on its hash chain, or NULL at the end. */
static struct entry *h_next(struct hash_table *ht, struct entry *e)
{
	return to_entry(ht->es, e->hash_next);
}
595 
596 static void __h_insert(struct hash_table *ht, unsigned bucket, struct entry *e)
597 {
598 	e->hash_next = ht->buckets[bucket];
599 	ht->buckets[bucket] = to_index(ht->es, e);
600 }
601 
602 static void h_insert(struct hash_table *ht, struct entry *e)
603 {
604 	unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
605 	__h_insert(ht, h, e);
606 }
607 
608 static struct entry *__h_lookup(struct hash_table *ht, unsigned h, dm_oblock_t oblock,
609 				struct entry **prev)
610 {
611 	struct entry *e;
612 
613 	*prev = NULL;
614 	for (e = h_head(ht, h); e; e = h_next(ht, e)) {
615 		if (e->oblock == oblock)
616 			return e;
617 
618 		*prev = e;
619 	}
620 
621 	return NULL;
622 }
623 
624 static void __h_unlink(struct hash_table *ht, unsigned h,
625 		       struct entry *e, struct entry *prev)
626 {
627 	if (prev)
628 		prev->hash_next = e->hash_next;
629 	else
630 		ht->buckets[h] = e->hash_next;
631 }
632 
633 /*
634  * Also moves each entry to the front of the bucket.
635  */
636 static struct entry *h_lookup(struct hash_table *ht, dm_oblock_t oblock)
637 {
638 	struct entry *e, *prev;
639 	unsigned h = hash_64(from_oblock(oblock), ht->hash_bits);
640 
641 	e = __h_lookup(ht, h, oblock, &prev);
642 	if (e && prev) {
643 		/*
644 		 * Move to the front because this entry is likely
645 		 * to be hit again.
646 		 */
647 		__h_unlink(ht, h, e, prev);
648 		__h_insert(ht, h, e);
649 	}
650 
651 	return e;
652 }
653 
654 static void h_remove(struct hash_table *ht, struct entry *e)
655 {
656 	unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
657 	struct entry *prev;
658 
659 	/*
660 	 * The down side of using a singly linked list is we have to
661 	 * iterate the bucket to remove an item.
662 	 */
663 	e = __h_lookup(ht, h, e->oblock, &prev);
664 	if (e)
665 		__h_unlink(ht, h, e, prev);
666 }
667 
668 /*----------------------------------------------------------------*/
669 
670 struct entry_alloc {
671 	struct entry_space *es;
672 	unsigned begin;
673 
674 	unsigned nr_allocated;
675 	struct ilist free;
676 };
677 
678 static void init_allocator(struct entry_alloc *ea, struct entry_space *es,
679 			   unsigned begin, unsigned end)
680 {
681 	unsigned i;
682 
683 	ea->es = es;
684 	ea->nr_allocated = 0u;
685 	ea->begin = begin;
686 
687 	l_init(&ea->free);
688 	for (i = begin; i != end; i++)
689 		l_add_tail(ea->es, &ea->free, __get_entry(ea->es, i));
690 }
691 
692 static void init_entry(struct entry *e)
693 {
694 	/*
695 	 * We can't memset because that would clear the hotspot and
696 	 * sentinel bits which remain constant.
697 	 */
698 	e->hash_next = INDEXER_NULL;
699 	e->next = INDEXER_NULL;
700 	e->prev = INDEXER_NULL;
701 	e->level = 0u;
702 	e->allocated = true;
703 }
704 
705 static struct entry *alloc_entry(struct entry_alloc *ea)
706 {
707 	struct entry *e;
708 
709 	if (l_empty(&ea->free))
710 		return NULL;
711 
712 	e = l_pop_tail(ea->es, &ea->free);
713 	init_entry(e);
714 	ea->nr_allocated++;
715 
716 	return e;
717 }
718 
719 /*
720  * This assumes the cblock hasn't already been allocated.
721  */
722 static struct entry *alloc_particular_entry(struct entry_alloc *ea, unsigned i)
723 {
724 	struct entry *e = __get_entry(ea->es, ea->begin + i);
725 
726 	BUG_ON(e->allocated);
727 
728 	l_del(ea->es, &ea->free, e);
729 	init_entry(e);
730 	ea->nr_allocated++;
731 
732 	return e;
733 }
734 
735 static void free_entry(struct entry_alloc *ea, struct entry *e)
736 {
737 	BUG_ON(!ea->nr_allocated);
738 	BUG_ON(!e->allocated);
739 
740 	ea->nr_allocated--;
741 	e->allocated = false;
742 	l_add_tail(ea->es, &ea->free, e);
743 }
744 
/* True when the allocator has no free entries left to hand out. */
static bool allocator_empty(struct entry_alloc *ea)
{
	return l_empty(&ea->free);
}
749 
/* Returns e's offset relative to the start of this allocator's range. */
static unsigned get_index(struct entry_alloc *ea, struct entry *e)
{
	return to_index(ea->es, e) - ea->begin;
}
754 
/*
 * Returns the entry at the given offset within this allocator's range,
 * whether or not it is currently allocated.
 */
static struct entry *get_entry(struct entry_alloc *ea, unsigned index)
{
	return __get_entry(ea->es, ea->begin + index);
}
759 
760 /*----------------------------------------------------------------*/
761 
/* Number of levels in the hotspot queue and in the clean/dirty queues. */
#define NR_HOTSPOT_LEVELS 64u
#define NR_CACHE_LEVELS 64u

/* Intervals, in jiffies, between moves of the two kinds of sentinel. */
#define WRITEBACK_PERIOD (10 * HZ)
#define DEMOTE_PERIOD (60 * HZ)

/* Minimum intervals, in jiffies, between redistributions of the queues. */
#define HOTSPOT_UPDATE_PERIOD (HZ)
#define CACHE_UPDATE_PERIOD (10u * HZ)
770 
771 struct smq_policy {
772 	struct dm_cache_policy policy;
773 
774 	/* protects everything */
775 	spinlock_t lock;
776 	dm_cblock_t cache_size;
777 	sector_t cache_block_size;
778 
779 	sector_t hotspot_block_size;
780 	unsigned nr_hotspot_blocks;
781 	unsigned cache_blocks_per_hotspot_block;
782 	unsigned hotspot_level_jump;
783 
784 	struct entry_space es;
785 	struct entry_alloc writeback_sentinel_alloc;
786 	struct entry_alloc demote_sentinel_alloc;
787 	struct entry_alloc hotspot_alloc;
788 	struct entry_alloc cache_alloc;
789 
790 	unsigned long *hotspot_hit_bits;
791 	unsigned long *cache_hit_bits;
792 
793 	/*
794 	 * We maintain three queues of entries.  The cache proper,
795 	 * consisting of a clean and dirty queue, containing the currently
796 	 * active mappings.  The hotspot queue uses a larger block size to
797 	 * track blocks that are being hit frequently and potential
798 	 * candidates for promotion to the cache.
799 	 */
800 	struct queue hotspot;
801 	struct queue clean;
802 	struct queue dirty;
803 
804 	struct stats hotspot_stats;
805 	struct stats cache_stats;
806 
807 	/*
808 	 * Keeps track of time, incremented by the core.  We use this to
809 	 * avoid attributing multiple hits within the same tick.
810 	 */
811 	unsigned tick;
812 
813 	/*
814 	 * The hash tables allows us to quickly find an entry by origin
815 	 * block.
816 	 */
817 	struct hash_table table;
818 	struct hash_table hotspot_table;
819 
820 	bool current_writeback_sentinels;
821 	unsigned long next_writeback_period;
822 
823 	bool current_demote_sentinels;
824 	unsigned long next_demote_period;
825 
826 	unsigned write_promote_level;
827 	unsigned read_promote_level;
828 
829 	unsigned long next_hotspot_period;
830 	unsigned long next_cache_period;
831 };
832 
833 /*----------------------------------------------------------------*/
834 
835 static struct entry *get_sentinel(struct entry_alloc *ea, unsigned level, bool which)
836 {
837 	return get_entry(ea, which ? level : NR_CACHE_LEVELS + level);
838 }
839 
/* Returns the current writeback sentinel for the given level. */
static struct entry *writeback_sentinel(struct smq_policy *mq, unsigned level)
{
	return get_sentinel(&mq->writeback_sentinel_alloc, level, mq->current_writeback_sentinels);
}
844 
/* Returns the current demote sentinel for the given level. */
static struct entry *demote_sentinel(struct smq_policy *mq, unsigned level)
{
	return get_sentinel(&mq->demote_sentinel_alloc, level, mq->current_demote_sentinels);
}
849 
850 static void __update_writeback_sentinels(struct smq_policy *mq)
851 {
852 	unsigned level;
853 	struct queue *q = &mq->dirty;
854 	struct entry *sentinel;
855 
856 	for (level = 0; level < q->nr_levels; level++) {
857 		sentinel = writeback_sentinel(mq, level);
858 		q_del(q, sentinel);
859 		q_push(q, sentinel);
860 	}
861 }
862 
863 static void __update_demote_sentinels(struct smq_policy *mq)
864 {
865 	unsigned level;
866 	struct queue *q = &mq->clean;
867 	struct entry *sentinel;
868 
869 	for (level = 0; level < q->nr_levels; level++) {
870 		sentinel = demote_sentinel(mq, level);
871 		q_del(q, sentinel);
872 		q_push(q, sentinel);
873 	}
874 }
875 
876 static void update_sentinels(struct smq_policy *mq)
877 {
878 	if (time_after(jiffies, mq->next_writeback_period)) {
879 		__update_writeback_sentinels(mq);
880 		mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
881 		mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
882 	}
883 
884 	if (time_after(jiffies, mq->next_demote_period)) {
885 		__update_demote_sentinels(mq);
886 		mq->next_demote_period = jiffies + DEMOTE_PERIOD;
887 		mq->current_demote_sentinels = !mq->current_demote_sentinels;
888 	}
889 }
890 
891 static void __sentinels_init(struct smq_policy *mq)
892 {
893 	unsigned level;
894 	struct entry *sentinel;
895 
896 	for (level = 0; level < NR_CACHE_LEVELS; level++) {
897 		sentinel = writeback_sentinel(mq, level);
898 		sentinel->level = level;
899 		q_push(&mq->dirty, sentinel);
900 
901 		sentinel = demote_sentinel(mq, level);
902 		sentinel->level = level;
903 		q_push(&mq->clean, sentinel);
904 	}
905 }
906 
907 static void sentinels_init(struct smq_policy *mq)
908 {
909 	mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
910 	mq->next_demote_period = jiffies + DEMOTE_PERIOD;
911 
912 	mq->current_writeback_sentinels = false;
913 	mq->current_demote_sentinels = false;
914 	__sentinels_init(mq);
915 
916 	mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
917 	mq->current_demote_sentinels = !mq->current_demote_sentinels;
918 	__sentinels_init(mq);
919 }
920 
921 /*----------------------------------------------------------------*/
922 
923 /*
924  * These methods tie together the dirty queue, clean queue and hash table.
925  */
926 static void push_new(struct smq_policy *mq, struct entry *e)
927 {
928 	struct queue *q = e->dirty ? &mq->dirty : &mq->clean;
929 	h_insert(&mq->table, e);
930 	q_push(q, e);
931 }
932 
933 static void push(struct smq_policy *mq, struct entry *e)
934 {
935 	struct entry *sentinel;
936 
937 	h_insert(&mq->table, e);
938 
939 	/*
940 	 * Punch this into the queue just in front of the sentinel, to
941 	 * ensure it's cleaned straight away.
942 	 */
943 	if (e->dirty) {
944 		sentinel = writeback_sentinel(mq, e->level);
945 		q_push_before(&mq->dirty, sentinel, e);
946 	} else {
947 		sentinel = demote_sentinel(mq, e->level);
948 		q_push_before(&mq->clean, sentinel, e);
949 	}
950 }
951 
/*
 * Removes an entry from the given queue and from the cache hash table.
 * The entry itself is not freed.
 */
static void __del(struct smq_policy *mq, struct queue *q, struct entry *e)
{
	q_del(q, e);
	h_remove(&mq->table, e);
}
960 
961 static void del(struct smq_policy *mq, struct entry *e)
962 {
963 	__del(mq, e->dirty ? &mq->dirty : &mq->clean, e);
964 }
965 
966 static struct entry *pop_old(struct smq_policy *mq, struct queue *q, unsigned max_level)
967 {
968 	struct entry *e = q_pop_old(q, max_level);
969 	if (e)
970 		h_remove(&mq->table, e);
971 	return e;
972 }
973 
/*
 * The cache block of an entry is implied by its position: it is the
 * entry's offset within the cache allocator's range.
 */
static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e)
{
	return to_cblock(get_index(&mq->cache_alloc, e));
}
978 
979 static void requeue(struct smq_policy *mq, struct entry *e)
980 {
981 	struct entry *sentinel;
982 
983 	if (!test_and_set_bit(from_cblock(infer_cblock(mq, e)), mq->cache_hit_bits)) {
984 		if (e->dirty) {
985 			sentinel = writeback_sentinel(mq, e->level);
986 			q_requeue_before(&mq->dirty, sentinel, e, 1u);
987 		} else {
988 			sentinel = demote_sentinel(mq, e->level);
989 			q_requeue_before(&mq->clean, sentinel, e, 1u);
990 		}
991 	}
992 }
993 
994 static unsigned default_promote_level(struct smq_policy *mq)
995 {
996 	/*
997 	 * The promote level depends on the current performance of the
998 	 * cache.
999 	 *
1000 	 * If the cache is performing badly, then we can't afford
1001 	 * to promote much without causing performance to drop below that
1002 	 * of the origin device.
1003 	 *
1004 	 * If the cache is performing well, then we don't need to promote
1005 	 * much.  If it isn't broken, don't fix it.
1006 	 *
1007 	 * If the cache is middling then we promote more.
1008 	 *
1009 	 * This scheme reminds me of a graph of entropy vs probability of a
1010 	 * binary variable.
1011 	 */
1012 	static unsigned table[] = {1, 1, 1, 2, 4, 6, 7, 8, 7, 6, 4, 4, 3, 3, 2, 2, 1};
1013 
1014 	unsigned hits = mq->cache_stats.hits;
1015 	unsigned misses = mq->cache_stats.misses;
1016 	unsigned index = safe_div(hits << 4u, hits + misses);
1017 	return table[index];
1018 }
1019 
1020 static void update_promote_levels(struct smq_policy *mq)
1021 {
1022 	/*
1023 	 * If there are unused cache entries then we want to be really
1024 	 * eager to promote.
1025 	 */
1026 	unsigned threshold_level = allocator_empty(&mq->cache_alloc) ?
1027 		default_promote_level(mq) : (NR_HOTSPOT_LEVELS / 2u);
1028 
1029 	/*
1030 	 * If the hotspot queue is performing badly then we have little
1031 	 * confidence that we know which blocks to promote.  So we cut down
1032 	 * the amount of promotions.
1033 	 */
1034 	switch (stats_assess(&mq->hotspot_stats)) {
1035 	case Q_POOR:
1036 		threshold_level /= 4u;
1037 		break;
1038 
1039 	case Q_FAIR:
1040 		threshold_level /= 2u;
1041 		break;
1042 
1043 	case Q_WELL:
1044 		break;
1045 	}
1046 
1047 	mq->read_promote_level = NR_HOTSPOT_LEVELS - threshold_level;
1048 	mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level) + 2u;
1049 }
1050 
1051 /*
1052  * If the hotspot queue is performing badly, then we try and move entries
1053  * around more quickly.
1054  */
1055 static void update_level_jump(struct smq_policy *mq)
1056 {
1057 	switch (stats_assess(&mq->hotspot_stats)) {
1058 	case Q_POOR:
1059 		mq->hotspot_level_jump = 4u;
1060 		break;
1061 
1062 	case Q_FAIR:
1063 		mq->hotspot_level_jump = 2u;
1064 		break;
1065 
1066 	case Q_WELL:
1067 		mq->hotspot_level_jump = 1u;
1068 		break;
1069 	}
1070 }
1071 
1072 static void end_hotspot_period(struct smq_policy *mq)
1073 {
1074 	clear_bitset(mq->hotspot_hit_bits, mq->nr_hotspot_blocks);
1075 	update_promote_levels(mq);
1076 
1077 	if (time_after(jiffies, mq->next_hotspot_period)) {
1078 		update_level_jump(mq);
1079 		q_redistribute(&mq->hotspot);
1080 		stats_reset(&mq->hotspot_stats);
1081 		mq->next_hotspot_period = jiffies + HOTSPOT_UPDATE_PERIOD;
1082 	}
1083 }
1084 
1085 static void end_cache_period(struct smq_policy *mq)
1086 {
1087 	if (time_after(jiffies, mq->next_cache_period)) {
1088 		clear_bitset(mq->cache_hit_bits, from_cblock(mq->cache_size));
1089 
1090 		q_redistribute(&mq->dirty);
1091 		q_redistribute(&mq->clean);
1092 		stats_reset(&mq->cache_stats);
1093 
1094 		mq->next_cache_period = jiffies + CACHE_UPDATE_PERIOD;
1095 	}
1096 }
1097 
1098 static int demote_cblock(struct smq_policy *mq,
1099 			 struct policy_locker *locker,
1100 			 dm_oblock_t *oblock)
1101 {
1102 	struct entry *demoted = q_peek(&mq->clean, mq->clean.nr_levels, false);
1103 	if (!demoted)
1104 		/*
1105 		 * We could get a block from mq->dirty, but that
1106 		 * would add extra latency to the triggering bio as it
1107 		 * waits for the writeback.  Better to not promote this
1108 		 * time and hope there's a clean block next time this block
1109 		 * is hit.
1110 		 */
1111 		return -ENOSPC;
1112 
1113 	if (locker->fn(locker, demoted->oblock))
1114 		/*
1115 		 * We couldn't lock this block.
1116 		 */
1117 		return -EBUSY;
1118 
1119 	del(mq, demoted);
1120 	*oblock = demoted->oblock;
1121 	free_entry(&mq->cache_alloc, demoted);
1122 
1123 	return 0;
1124 }
1125 
/* Outcome of should_promote(). */
enum promote_result {
	PROMOTE_NOT,		/* leave the block on the origin */
	PROMOTE_TEMPORARY,	/* promote, queued just in front of a sentinel */
	PROMOTE_PERMANENT,	/* promote, queued at the tail of its level */
};
1131 
1132 /*
1133  * Converts a boolean into a promote result.
1134  */
1135 static enum promote_result maybe_promote(bool promote)
1136 {
1137 	return promote ? PROMOTE_PERMANENT : PROMOTE_NOT;
1138 }
1139 
1140 static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e, struct bio *bio,
1141 					  bool fast_promote)
1142 {
1143 	if (bio_data_dir(bio) == WRITE) {
1144 		if (!allocator_empty(&mq->cache_alloc) && fast_promote)
1145 			return PROMOTE_TEMPORARY;
1146 
1147 		else
1148 			return maybe_promote(hs_e->level >= mq->write_promote_level);
1149 	} else
1150 		return maybe_promote(hs_e->level >= mq->read_promote_level);
1151 }
1152 
1153 static void insert_in_cache(struct smq_policy *mq, dm_oblock_t oblock,
1154 			    struct policy_locker *locker,
1155 			    struct policy_result *result, enum promote_result pr)
1156 {
1157 	int r;
1158 	struct entry *e;
1159 
1160 	if (allocator_empty(&mq->cache_alloc)) {
1161 		result->op = POLICY_REPLACE;
1162 		r = demote_cblock(mq, locker, &result->old_oblock);
1163 		if (r) {
1164 			result->op = POLICY_MISS;
1165 			return;
1166 		}
1167 
1168 	} else
1169 		result->op = POLICY_NEW;
1170 
1171 	e = alloc_entry(&mq->cache_alloc);
1172 	BUG_ON(!e);
1173 	e->oblock = oblock;
1174 
1175 	if (pr == PROMOTE_TEMPORARY)
1176 		push(mq, e);
1177 	else
1178 		push_new(mq, e);
1179 
1180 	result->cblock = infer_cblock(mq, e);
1181 }
1182 
/*
 * Converts an origin block number into the number of the hotspot block
 * containing it, by dividing by cache_blocks_per_hotspot_block.
 */
static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b)
{
	sector_t r = from_oblock(b);
	/* sector_div() leaves the quotient in r; its return value is unused */
	(void) sector_div(r, mq->cache_blocks_per_hotspot_block);
	return to_oblock(r);
}
1189 
1190 static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b, struct bio *bio)
1191 {
1192 	unsigned hi;
1193 	dm_oblock_t hb = to_hblock(mq, b);
1194 	struct entry *e = h_lookup(&mq->hotspot_table, hb);
1195 
1196 	if (e) {
1197 		stats_level_accessed(&mq->hotspot_stats, e->level);
1198 
1199 		hi = get_index(&mq->hotspot_alloc, e);
1200 		q_requeue(&mq->hotspot, e,
1201 			  test_and_set_bit(hi, mq->hotspot_hit_bits) ?
1202 			  0u : mq->hotspot_level_jump);
1203 
1204 	} else {
1205 		stats_miss(&mq->hotspot_stats);
1206 
1207 		e = alloc_entry(&mq->hotspot_alloc);
1208 		if (!e) {
1209 			e = q_pop(&mq->hotspot);
1210 			if (e) {
1211 				h_remove(&mq->hotspot_table, e);
1212 				hi = get_index(&mq->hotspot_alloc, e);
1213 				clear_bit(hi, mq->hotspot_hit_bits);
1214 			}
1215 
1216 		}
1217 
1218 		if (e) {
1219 			e->oblock = hb;
1220 			q_push(&mq->hotspot, e);
1221 			h_insert(&mq->hotspot_table, e);
1222 		}
1223 	}
1224 
1225 	return e;
1226 }
1227 
1228 /*
1229  * Looks the oblock up in the hash table, then decides whether to put in
1230  * pre_cache, or cache etc.
1231  */
1232 static int map(struct smq_policy *mq, struct bio *bio, dm_oblock_t oblock,
1233 	       bool can_migrate, bool fast_promote,
1234 	       struct policy_locker *locker, struct policy_result *result)
1235 {
1236 	struct entry *e, *hs_e;
1237 	enum promote_result pr;
1238 
1239 	hs_e = update_hotspot_queue(mq, oblock, bio);
1240 
1241 	e = h_lookup(&mq->table, oblock);
1242 	if (e) {
1243 		stats_level_accessed(&mq->cache_stats, e->level);
1244 
1245 		requeue(mq, e);
1246 		result->op = POLICY_HIT;
1247 		result->cblock = infer_cblock(mq, e);
1248 
1249 	} else {
1250 		stats_miss(&mq->cache_stats);
1251 
1252 		pr = should_promote(mq, hs_e, bio, fast_promote);
1253 		if (pr == PROMOTE_NOT)
1254 			result->op = POLICY_MISS;
1255 
1256 		else {
1257 			if (!can_migrate) {
1258 				result->op = POLICY_MISS;
1259 				return -EWOULDBLOCK;
1260 			}
1261 
1262 			insert_in_cache(mq, oblock, locker, result, pr);
1263 		}
1264 	}
1265 
1266 	return 0;
1267 }
1268 
1269 /*----------------------------------------------------------------*/
1270 
1271 /*
1272  * Public interface, via the policy struct.  See dm-cache-policy.h for a
1273  * description of these.
1274  */
1275 
1276 static struct smq_policy *to_smq_policy(struct dm_cache_policy *p)
1277 {
1278 	return container_of(p, struct smq_policy, policy);
1279 }
1280 
/*
 * Policy 'destroy' method: releases the hash tables, hit bitsets, entry
 * space and finally the policy object itself.
 */
static void smq_destroy(struct dm_cache_policy *p)
{
	struct smq_policy *mq = to_smq_policy(p);

	h_exit(&mq->hotspot_table);
	h_exit(&mq->table);
	free_bitset(mq->hotspot_hit_bits);
	free_bitset(mq->cache_hit_bits);
	space_exit(&mq->es);
	/* mq embeds the dm_cache_policy, so this frees @p as well. */
	kfree(mq);
}
1292 
1293 static int smq_map(struct dm_cache_policy *p, dm_oblock_t oblock,
1294 		   bool can_block, bool can_migrate, bool fast_promote,
1295 		   struct bio *bio, struct policy_locker *locker,
1296 		   struct policy_result *result)
1297 {
1298 	int r;
1299 	unsigned long flags;
1300 	struct smq_policy *mq = to_smq_policy(p);
1301 
1302 	result->op = POLICY_MISS;
1303 
1304 	spin_lock_irqsave(&mq->lock, flags);
1305 	r = map(mq, bio, oblock, can_migrate, fast_promote, locker, result);
1306 	spin_unlock_irqrestore(&mq->lock, flags);
1307 
1308 	return r;
1309 }
1310 
1311 static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock)
1312 {
1313 	int r;
1314 	unsigned long flags;
1315 	struct smq_policy *mq = to_smq_policy(p);
1316 	struct entry *e;
1317 
1318 	spin_lock_irqsave(&mq->lock, flags);
1319 	e = h_lookup(&mq->table, oblock);
1320 	if (e) {
1321 		*cblock = infer_cblock(mq, e);
1322 		r = 0;
1323 	} else
1324 		r = -ENOENT;
1325 	spin_unlock_irqrestore(&mq->lock, flags);
1326 
1327 	return r;
1328 }
1329 
1330 static void __smq_set_clear_dirty(struct smq_policy *mq, dm_oblock_t oblock, bool set)
1331 {
1332 	struct entry *e;
1333 
1334 	e = h_lookup(&mq->table, oblock);
1335 	BUG_ON(!e);
1336 
1337 	del(mq, e);
1338 	e->dirty = set;
1339 	push(mq, e);
1340 }
1341 
1342 static void smq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
1343 {
1344 	unsigned long flags;
1345 	struct smq_policy *mq = to_smq_policy(p);
1346 
1347 	spin_lock_irqsave(&mq->lock, flags);
1348 	__smq_set_clear_dirty(mq, oblock, true);
1349 	spin_unlock_irqrestore(&mq->lock, flags);
1350 }
1351 
1352 static void smq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
1353 {
1354 	struct smq_policy *mq = to_smq_policy(p);
1355 	unsigned long flags;
1356 
1357 	spin_lock_irqsave(&mq->lock, flags);
1358 	__smq_set_clear_dirty(mq, oblock, false);
1359 	spin_unlock_irqrestore(&mq->lock, flags);
1360 }
1361 
1362 static int smq_load_mapping(struct dm_cache_policy *p,
1363 			    dm_oblock_t oblock, dm_cblock_t cblock,
1364 			    uint32_t hint, bool hint_valid)
1365 {
1366 	struct smq_policy *mq = to_smq_policy(p);
1367 	struct entry *e;
1368 
1369 	e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock));
1370 	e->oblock = oblock;
1371 	e->dirty = false;	/* this gets corrected in a minute */
1372 	e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : 1;
1373 	push(mq, e);
1374 
1375 	return 0;
1376 }
1377 
1378 static int smq_save_hints(struct smq_policy *mq, struct queue *q,
1379 			  policy_walk_fn fn, void *context)
1380 {
1381 	int r;
1382 	unsigned level;
1383 	struct entry *e;
1384 
1385 	for (level = 0; level < q->nr_levels; level++)
1386 		for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e)) {
1387 			if (!e->sentinel) {
1388 				r = fn(context, infer_cblock(mq, e),
1389 				       e->oblock, e->level);
1390 				if (r)
1391 					return r;
1392 			}
1393 		}
1394 
1395 	return 0;
1396 }
1397 
1398 static int smq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn,
1399 			     void *context)
1400 {
1401 	struct smq_policy *mq = to_smq_policy(p);
1402 	int r = 0;
1403 
1404 	/*
1405 	 * We don't need to lock here since this method is only called once
1406 	 * the IO has stopped.
1407 	 */
1408 	r = smq_save_hints(mq, &mq->clean, fn, context);
1409 	if (!r)
1410 		r = smq_save_hints(mq, &mq->dirty, fn, context);
1411 
1412 	return r;
1413 }
1414 
1415 static void __remove_mapping(struct smq_policy *mq, dm_oblock_t oblock)
1416 {
1417 	struct entry *e;
1418 
1419 	e = h_lookup(&mq->table, oblock);
1420 	BUG_ON(!e);
1421 
1422 	del(mq, e);
1423 	free_entry(&mq->cache_alloc, e);
1424 }
1425 
1426 static void smq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
1427 {
1428 	struct smq_policy *mq = to_smq_policy(p);
1429 	unsigned long flags;
1430 
1431 	spin_lock_irqsave(&mq->lock, flags);
1432 	__remove_mapping(mq, oblock);
1433 	spin_unlock_irqrestore(&mq->lock, flags);
1434 }
1435 
1436 static int __remove_cblock(struct smq_policy *mq, dm_cblock_t cblock)
1437 {
1438 	struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
1439 
1440 	if (!e || !e->allocated)
1441 		return -ENODATA;
1442 
1443 	del(mq, e);
1444 	free_entry(&mq->cache_alloc, e);
1445 
1446 	return 0;
1447 }
1448 
1449 static int smq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock)
1450 {
1451 	int r;
1452 	unsigned long flags;
1453 	struct smq_policy *mq = to_smq_policy(p);
1454 
1455 	spin_lock_irqsave(&mq->lock, flags);
1456 	r = __remove_cblock(mq, cblock);
1457 	spin_unlock_irqrestore(&mq->lock, flags);
1458 
1459 	return r;
1460 }
1461 
1462 
1463 #define CLEAN_TARGET_CRITICAL 5u /* percent */
1464 
1465 static bool clean_target_met(struct smq_policy *mq, bool critical)
1466 {
1467 	if (critical) {
1468 		/*
1469 		 * Cache entries may not be populated.  So we're cannot rely on the
1470 		 * size of the clean queue.
1471 		 */
1472 		unsigned nr_clean = from_cblock(mq->cache_size) - q_size(&mq->dirty);
1473 		unsigned target = from_cblock(mq->cache_size) * CLEAN_TARGET_CRITICAL / 100u;
1474 
1475 		return nr_clean >= target;
1476 	} else
1477 		return !q_size(&mq->dirty);
1478 }
1479 
1480 static int __smq_writeback_work(struct smq_policy *mq, dm_oblock_t *oblock,
1481 				dm_cblock_t *cblock, bool critical_only)
1482 {
1483 	struct entry *e = NULL;
1484 	bool target_met = clean_target_met(mq, critical_only);
1485 
1486 	if (critical_only)
1487 		/*
1488 		 * Always try and keep the bottom level clean.
1489 		 */
1490 		e = pop_old(mq, &mq->dirty, target_met ? 1u : mq->dirty.nr_levels);
1491 
1492 	else
1493 		e = pop_old(mq, &mq->dirty, mq->dirty.nr_levels);
1494 
1495 	if (!e)
1496 		return -ENODATA;
1497 
1498 	*oblock = e->oblock;
1499 	*cblock = infer_cblock(mq, e);
1500 	e->dirty = false;
1501 	push_new(mq, e);
1502 
1503 	return 0;
1504 }
1505 
1506 static int smq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock,
1507 			      dm_cblock_t *cblock, bool critical_only)
1508 {
1509 	int r;
1510 	unsigned long flags;
1511 	struct smq_policy *mq = to_smq_policy(p);
1512 
1513 	spin_lock_irqsave(&mq->lock, flags);
1514 	r = __smq_writeback_work(mq, oblock, cblock, critical_only);
1515 	spin_unlock_irqrestore(&mq->lock, flags);
1516 
1517 	return r;
1518 }
1519 
1520 static void __force_mapping(struct smq_policy *mq,
1521 			    dm_oblock_t current_oblock, dm_oblock_t new_oblock)
1522 {
1523 	struct entry *e = h_lookup(&mq->table, current_oblock);
1524 
1525 	if (e) {
1526 		del(mq, e);
1527 		e->oblock = new_oblock;
1528 		e->dirty = true;
1529 		push(mq, e);
1530 	}
1531 }
1532 
1533 static void smq_force_mapping(struct dm_cache_policy *p,
1534 			      dm_oblock_t current_oblock, dm_oblock_t new_oblock)
1535 {
1536 	unsigned long flags;
1537 	struct smq_policy *mq = to_smq_policy(p);
1538 
1539 	spin_lock_irqsave(&mq->lock, flags);
1540 	__force_mapping(mq, current_oblock, new_oblock);
1541 	spin_unlock_irqrestore(&mq->lock, flags);
1542 }
1543 
1544 static dm_cblock_t smq_residency(struct dm_cache_policy *p)
1545 {
1546 	dm_cblock_t r;
1547 	unsigned long flags;
1548 	struct smq_policy *mq = to_smq_policy(p);
1549 
1550 	spin_lock_irqsave(&mq->lock, flags);
1551 	r = to_cblock(mq->cache_alloc.nr_allocated);
1552 	spin_unlock_irqrestore(&mq->lock, flags);
1553 
1554 	return r;
1555 }
1556 
1557 static void smq_tick(struct dm_cache_policy *p, bool can_block)
1558 {
1559 	struct smq_policy *mq = to_smq_policy(p);
1560 	unsigned long flags;
1561 
1562 	spin_lock_irqsave(&mq->lock, flags);
1563 	mq->tick++;
1564 	update_sentinels(mq);
1565 	end_hotspot_period(mq);
1566 	end_cache_period(mq);
1567 	spin_unlock_irqrestore(&mq->lock, flags);
1568 }
1569 
1570 /*
1571  * smq has no config values, but the old mq policy did.  To avoid breaking
1572  * software we continue to accept these configurables for the mq policy,
1573  * but they have no effect.
1574  */
1575 static int mq_set_config_value(struct dm_cache_policy *p,
1576 			       const char *key, const char *value)
1577 {
1578 	unsigned long tmp;
1579 
1580 	if (kstrtoul(value, 10, &tmp))
1581 		return -EINVAL;
1582 
1583 	if (!strcasecmp(key, "random_threshold") ||
1584 	    !strcasecmp(key, "sequential_threshold") ||
1585 	    !strcasecmp(key, "discard_promote_adjustment") ||
1586 	    !strcasecmp(key, "read_promote_adjustment") ||
1587 	    !strcasecmp(key, "write_promote_adjustment")) {
1588 		DMWARN("tunable '%s' no longer has any effect, mq policy is now an alias for smq", key);
1589 		return 0;
1590 	}
1591 
1592 	return -EINVAL;
1593 }
1594 
/*
 * Emits placeholder values for the legacy mq tunables: every key is
 * reported with the value 0.  The leading "10" matches the number of
 * key/value words that follow it.
 *
 * *sz_ptr is read on entry and written back on exit.  Always returns 0.
 */
static int mq_emit_config_values(struct dm_cache_policy *p, char *result,
				 unsigned maxlen, ssize_t *sz_ptr)
{
	/*
	 * NOTE(review): DMEMIT() appears to rely on the local names sz,
	 * result and maxlen - confirm against its definition before
	 * renaming any of them.
	 */
	ssize_t sz = *sz_ptr;

	DMEMIT("10 random_threshold 0 "
	       "sequential_threshold 0 "
	       "discard_promote_adjustment 0 "
	       "read_promote_adjustment 0 "
	       "write_promote_adjustment 0 ");

	*sz_ptr = sz;
	return 0;
}
1609 
1610 /* Init the policy plugin interface function pointers. */
1611 static void init_policy_functions(struct smq_policy *mq, bool mimic_mq)
1612 {
1613 	mq->policy.destroy = smq_destroy;
1614 	mq->policy.map = smq_map;
1615 	mq->policy.lookup = smq_lookup;
1616 	mq->policy.set_dirty = smq_set_dirty;
1617 	mq->policy.clear_dirty = smq_clear_dirty;
1618 	mq->policy.load_mapping = smq_load_mapping;
1619 	mq->policy.walk_mappings = smq_walk_mappings;
1620 	mq->policy.remove_mapping = smq_remove_mapping;
1621 	mq->policy.remove_cblock = smq_remove_cblock;
1622 	mq->policy.writeback_work = smq_writeback_work;
1623 	mq->policy.force_mapping = smq_force_mapping;
1624 	mq->policy.residency = smq_residency;
1625 	mq->policy.tick = smq_tick;
1626 
1627 	if (mimic_mq) {
1628 		mq->policy.set_config_value = mq_set_config_value;
1629 		mq->policy.emit_config_values = mq_emit_config_values;
1630 	}
1631 }
1632 
1633 static bool too_many_hotspot_blocks(sector_t origin_size,
1634 				    sector_t hotspot_block_size,
1635 				    unsigned nr_hotspot_blocks)
1636 {
1637 	return (hotspot_block_size * nr_hotspot_blocks) > origin_size;
1638 }
1639 
1640 static void calc_hotspot_params(sector_t origin_size,
1641 				sector_t cache_block_size,
1642 				unsigned nr_cache_blocks,
1643 				sector_t *hotspot_block_size,
1644 				unsigned *nr_hotspot_blocks)
1645 {
1646 	*hotspot_block_size = cache_block_size * 16u;
1647 	*nr_hotspot_blocks = max(nr_cache_blocks / 4u, 1024u);
1648 
1649 	while ((*hotspot_block_size > cache_block_size) &&
1650 	       too_many_hotspot_blocks(origin_size, *hotspot_block_size, *nr_hotspot_blocks))
1651 		*hotspot_block_size /= 2u;
1652 }
1653 
/*
 * Allocates and initialises an smq policy object.
 *
 * @cache_size: number of cache blocks the policy manages.
 * @origin_size: size of the origin, handed to calc_hotspot_params().
 * @cache_block_size: size of one cache block.
 * @mimic_mq: when true the legacy mq config hooks are installed too.
 *
 * Returns a pointer to the embedded dm_cache_policy, or NULL if any
 * allocation fails; in that case everything acquired so far is released.
 */
static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size,
					    sector_t origin_size,
					    sector_t cache_block_size,
					    bool mimic_mq)
{
	unsigned i;
	unsigned nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS;
	unsigned total_sentinels = 2u * nr_sentinels_per_queue;
	struct smq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL);

	if (!mq)
		return NULL;

	init_policy_functions(mq, mimic_mq);
	mq->cache_size = cache_size;
	mq->cache_block_size = cache_block_size;

	calc_hotspot_params(origin_size, cache_block_size, from_cblock(cache_size),
			    &mq->hotspot_block_size, &mq->nr_hotspot_blocks);

	mq->cache_blocks_per_hotspot_block = div64_u64(mq->hotspot_block_size, mq->cache_block_size);
	mq->hotspot_level_jump = 1u;
	/*
	 * A single entry space backs all four allocators below, laid out as
	 * [writeback sentinels | demote sentinels | hotspot | cache].
	 */
	if (space_init(&mq->es, total_sentinels + mq->nr_hotspot_blocks + from_cblock(cache_size))) {
		DMERR("couldn't initialize entry space");
		goto bad_pool_init;
	}

	/* Indices [0, nr_sentinels_per_queue): writeback sentinels. */
	init_allocator(&mq->writeback_sentinel_alloc, &mq->es, 0, nr_sentinels_per_queue);
        for (i = 0; i < nr_sentinels_per_queue; i++)
		get_entry(&mq->writeback_sentinel_alloc, i)->sentinel = true;

	/* Indices [nr_sentinels_per_queue, total_sentinels): demote sentinels. */
	init_allocator(&mq->demote_sentinel_alloc, &mq->es, nr_sentinels_per_queue, total_sentinels);
        for (i = 0; i < nr_sentinels_per_queue; i++)
		get_entry(&mq->demote_sentinel_alloc, i)->sentinel = true;

	/* Hotspot entries follow the sentinels ... */
	init_allocator(&mq->hotspot_alloc, &mq->es, total_sentinels,
		       total_sentinels + mq->nr_hotspot_blocks);

	/* ... and the cache entries take the remainder of the space. */
	init_allocator(&mq->cache_alloc, &mq->es,
		       total_sentinels + mq->nr_hotspot_blocks,
		       total_sentinels + mq->nr_hotspot_blocks + from_cblock(cache_size));

	mq->hotspot_hit_bits = alloc_bitset(mq->nr_hotspot_blocks);
	if (!mq->hotspot_hit_bits) {
		DMERR("couldn't allocate hotspot hit bitset");
		goto bad_hotspot_hit_bits;
	}
	clear_bitset(mq->hotspot_hit_bits, mq->nr_hotspot_blocks);

	/* A zero-sized cache gets no hit bitset at all. */
	if (from_cblock(cache_size)) {
		mq->cache_hit_bits = alloc_bitset(from_cblock(cache_size));
		if (!mq->cache_hit_bits) {
			DMERR("couldn't allocate cache hit bitset");
			goto bad_cache_hit_bits;
		}
		clear_bitset(mq->cache_hit_bits, from_cblock(mq->cache_size));
	} else
		mq->cache_hit_bits = NULL;

	mq->tick = 0;
	spin_lock_init(&mq->lock);

	q_init(&mq->hotspot, &mq->es, NR_HOTSPOT_LEVELS);
	mq->hotspot.nr_top_levels = 8;
	mq->hotspot.nr_in_top_levels = min(mq->nr_hotspot_blocks / NR_HOTSPOT_LEVELS,
					   from_cblock(mq->cache_size) / mq->cache_blocks_per_hotspot_block);

	q_init(&mq->clean, &mq->es, NR_CACHE_LEVELS);
	q_init(&mq->dirty, &mq->es, NR_CACHE_LEVELS);

	stats_init(&mq->hotspot_stats, NR_HOTSPOT_LEVELS);
	stats_init(&mq->cache_stats, NR_CACHE_LEVELS);

	if (h_init(&mq->table, &mq->es, from_cblock(cache_size)))
		goto bad_alloc_table;

	if (h_init(&mq->hotspot_table, &mq->es, mq->nr_hotspot_blocks))
		goto bad_alloc_hotspot_table;

	sentinels_init(mq);
	mq->write_promote_level = mq->read_promote_level = NR_HOTSPOT_LEVELS;

	/* Both periods are due immediately. */
	mq->next_hotspot_period = jiffies;
	mq->next_cache_period = jiffies;

	return &mq->policy;

	/* Error unwind: each label releases what was acquired before it. */
bad_alloc_hotspot_table:
	h_exit(&mq->table);
bad_alloc_table:
	free_bitset(mq->cache_hit_bits);
bad_cache_hit_bits:
	free_bitset(mq->hotspot_hit_bits);
bad_hotspot_hit_bits:
	space_exit(&mq->es);
bad_pool_init:
	kfree(mq);

	return NULL;
}
1754 
1755 static struct dm_cache_policy *smq_create(dm_cblock_t cache_size,
1756 					  sector_t origin_size,
1757 					  sector_t cache_block_size)
1758 {
1759 	return __smq_create(cache_size, origin_size, cache_block_size, false);
1760 }
1761 
1762 static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
1763 					 sector_t origin_size,
1764 					 sector_t cache_block_size)
1765 {
1766 	return __smq_create(cache_size, origin_size, cache_block_size, true);
1767 }
1768 
1769 /*----------------------------------------------------------------*/
1770 
1771 static struct dm_cache_policy_type smq_policy_type = {
1772 	.name = "smq",
1773 	.version = {1, 5, 0},
1774 	.hint_size = 4,
1775 	.owner = THIS_MODULE,
1776 	.create = smq_create
1777 };
1778 
1779 static struct dm_cache_policy_type mq_policy_type = {
1780 	.name = "mq",
1781 	.version = {1, 5, 0},
1782 	.hint_size = 4,
1783 	.owner = THIS_MODULE,
1784 	.create = mq_create,
1785 };
1786 
1787 static struct dm_cache_policy_type default_policy_type = {
1788 	.name = "default",
1789 	.version = {1, 5, 0},
1790 	.hint_size = 4,
1791 	.owner = THIS_MODULE,
1792 	.create = smq_create,
1793 	.real = &smq_policy_type
1794 };
1795 
1796 static int __init smq_init(void)
1797 {
1798 	int r;
1799 
1800 	r = dm_cache_policy_register(&smq_policy_type);
1801 	if (r) {
1802 		DMERR("register failed %d", r);
1803 		return -ENOMEM;
1804 	}
1805 
1806 	r = dm_cache_policy_register(&mq_policy_type);
1807 	if (r) {
1808 		DMERR("register failed (as mq) %d", r);
1809 		dm_cache_policy_unregister(&smq_policy_type);
1810 		return -ENOMEM;
1811 	}
1812 
1813 	r = dm_cache_policy_register(&default_policy_type);
1814 	if (r) {
1815 		DMERR("register failed (as default) %d", r);
1816 		dm_cache_policy_unregister(&mq_policy_type);
1817 		dm_cache_policy_unregister(&smq_policy_type);
1818 		return -ENOMEM;
1819 	}
1820 
1821 	return 0;
1822 }
1823 
/* Module exit: unregisters the three policy types added by smq_init(). */
static void __exit smq_exit(void)
{
	dm_cache_policy_unregister(&smq_policy_type);
	dm_cache_policy_unregister(&mq_policy_type);
	dm_cache_policy_unregister(&default_policy_type);
}
1830 
/* Module entry and exit points. */
module_init(smq_init);
module_exit(smq_exit);

MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("smq cache policy");

/*
 * Aliases for the "default" and "mq" policy names registered above;
 * presumably these let a request for dm-cache-<name> load this module -
 * confirm against the policy lookup code.
 */
MODULE_ALIAS("dm-cache-default");
MODULE_ALIAS("dm-cache-mq");
1840