xref: /linux/drivers/md/dm-cache-policy-smq.c (revision bd628c1bed7902ec1f24ba0fe70758949146abbe)
1 /*
2  * Copyright (C) 2015 Red Hat. All rights reserved.
3  *
4  * This file is released under the GPL.
5  */
6 
7 #include "dm-cache-background-tracker.h"
8 #include "dm-cache-policy-internal.h"
9 #include "dm-cache-policy.h"
10 #include "dm.h"
11 
12 #include <linux/hash.h>
13 #include <linux/jiffies.h>
14 #include <linux/module.h>
15 #include <linux/mutex.h>
16 #include <linux/vmalloc.h>
17 #include <linux/math64.h>
18 
19 #define DM_MSG_PREFIX "cache-policy-smq"
20 
21 /*----------------------------------------------------------------*/
22 
23 /*
24  * Safe division functions that return zero on divide by zero.
25  */
26 static unsigned safe_div(unsigned n, unsigned d)
27 {
28 	return d ? n / d : 0u;
29 }
30 
31 static unsigned safe_mod(unsigned n, unsigned d)
32 {
33 	return d ? n % d : 0u;
34 }
35 
36 /*----------------------------------------------------------------*/
37 
38 struct entry {
39 	unsigned hash_next:28;
40 	unsigned prev:28;
41 	unsigned next:28;
42 	unsigned level:6;
43 	bool dirty:1;
44 	bool allocated:1;
45 	bool sentinel:1;
46 	bool pending_work:1;
47 
48 	dm_oblock_t oblock;
49 };
50 
51 /*----------------------------------------------------------------*/
52 
53 #define INDEXER_NULL ((1u << 28u) - 1u)
54 
55 /*
56  * An entry_space manages a set of entries that we use for the queues.
57  * The clean and dirty queues share entries, so this object is separate
58  * from the queue itself.
59  */
60 struct entry_space {
61 	struct entry *begin;
62 	struct entry *end;
63 };
64 
65 static int space_init(struct entry_space *es, unsigned nr_entries)
66 {
67 	if (!nr_entries) {
68 		es->begin = es->end = NULL;
69 		return 0;
70 	}
71 
72 	es->begin = vzalloc(array_size(nr_entries, sizeof(struct entry)));
73 	if (!es->begin)
74 		return -ENOMEM;
75 
76 	es->end = es->begin + nr_entries;
77 	return 0;
78 }
79 
80 static void space_exit(struct entry_space *es)
81 {
82 	vfree(es->begin);
83 }
84 
85 static struct entry *__get_entry(struct entry_space *es, unsigned block)
86 {
87 	struct entry *e;
88 
89 	e = es->begin + block;
90 	BUG_ON(e >= es->end);
91 
92 	return e;
93 }
94 
95 static unsigned to_index(struct entry_space *es, struct entry *e)
96 {
97 	BUG_ON(e < es->begin || e >= es->end);
98 	return e - es->begin;
99 }
100 
101 static struct entry *to_entry(struct entry_space *es, unsigned block)
102 {
103 	if (block == INDEXER_NULL)
104 		return NULL;
105 
106 	return __get_entry(es, block);
107 }
108 
109 /*----------------------------------------------------------------*/
110 
111 struct ilist {
112 	unsigned nr_elts;	/* excluding sentinel entries */
113 	unsigned head, tail;
114 };
115 
116 static void l_init(struct ilist *l)
117 {
118 	l->nr_elts = 0;
119 	l->head = l->tail = INDEXER_NULL;
120 }
121 
122 static struct entry *l_head(struct entry_space *es, struct ilist *l)
123 {
124 	return to_entry(es, l->head);
125 }
126 
127 static struct entry *l_tail(struct entry_space *es, struct ilist *l)
128 {
129 	return to_entry(es, l->tail);
130 }
131 
132 static struct entry *l_next(struct entry_space *es, struct entry *e)
133 {
134 	return to_entry(es, e->next);
135 }
136 
137 static struct entry *l_prev(struct entry_space *es, struct entry *e)
138 {
139 	return to_entry(es, e->prev);
140 }
141 
142 static bool l_empty(struct ilist *l)
143 {
144 	return l->head == INDEXER_NULL;
145 }
146 
147 static void l_add_head(struct entry_space *es, struct ilist *l, struct entry *e)
148 {
149 	struct entry *head = l_head(es, l);
150 
151 	e->next = l->head;
152 	e->prev = INDEXER_NULL;
153 
154 	if (head)
155 		head->prev = l->head = to_index(es, e);
156 	else
157 		l->head = l->tail = to_index(es, e);
158 
159 	if (!e->sentinel)
160 		l->nr_elts++;
161 }
162 
163 static void l_add_tail(struct entry_space *es, struct ilist *l, struct entry *e)
164 {
165 	struct entry *tail = l_tail(es, l);
166 
167 	e->next = INDEXER_NULL;
168 	e->prev = l->tail;
169 
170 	if (tail)
171 		tail->next = l->tail = to_index(es, e);
172 	else
173 		l->head = l->tail = to_index(es, e);
174 
175 	if (!e->sentinel)
176 		l->nr_elts++;
177 }
178 
179 static void l_add_before(struct entry_space *es, struct ilist *l,
180 			 struct entry *old, struct entry *e)
181 {
182 	struct entry *prev = l_prev(es, old);
183 
184 	if (!prev)
185 		l_add_head(es, l, e);
186 
187 	else {
188 		e->prev = old->prev;
189 		e->next = to_index(es, old);
190 		prev->next = old->prev = to_index(es, e);
191 
192 		if (!e->sentinel)
193 			l->nr_elts++;
194 	}
195 }
196 
197 static void l_del(struct entry_space *es, struct ilist *l, struct entry *e)
198 {
199 	struct entry *prev = l_prev(es, e);
200 	struct entry *next = l_next(es, e);
201 
202 	if (prev)
203 		prev->next = e->next;
204 	else
205 		l->head = e->next;
206 
207 	if (next)
208 		next->prev = e->prev;
209 	else
210 		l->tail = e->prev;
211 
212 	if (!e->sentinel)
213 		l->nr_elts--;
214 }
215 
216 static struct entry *l_pop_head(struct entry_space *es, struct ilist *l)
217 {
218 	struct entry *e;
219 
220 	for (e = l_head(es, l); e; e = l_next(es, e))
221 		if (!e->sentinel) {
222 			l_del(es, l, e);
223 			return e;
224 		}
225 
226 	return NULL;
227 }
228 
229 static struct entry *l_pop_tail(struct entry_space *es, struct ilist *l)
230 {
231 	struct entry *e;
232 
233 	for (e = l_tail(es, l); e; e = l_prev(es, e))
234 		if (!e->sentinel) {
235 			l_del(es, l, e);
236 			return e;
237 		}
238 
239 	return NULL;
240 }
241 
242 /*----------------------------------------------------------------*/
243 
244 /*
245  * The stochastic-multi-queue is a set of lru lists stacked into levels.
246  * Entries are moved up levels when they are used, which loosely orders the
247  * most accessed entries in the top levels and least in the bottom.  This
248  * structure is *much* better than a single lru list.
249  */
250 #define MAX_LEVELS 64u
251 
252 struct queue {
253 	struct entry_space *es;
254 
255 	unsigned nr_elts;
256 	unsigned nr_levels;
257 	struct ilist qs[MAX_LEVELS];
258 
259 	/*
260 	 * We maintain a count of the number of entries we would like in each
261 	 * level.
262 	 */
263 	unsigned last_target_nr_elts;
264 	unsigned nr_top_levels;
265 	unsigned nr_in_top_levels;
266 	unsigned target_count[MAX_LEVELS];
267 };
268 
269 static void q_init(struct queue *q, struct entry_space *es, unsigned nr_levels)
270 {
271 	unsigned i;
272 
273 	q->es = es;
274 	q->nr_elts = 0;
275 	q->nr_levels = nr_levels;
276 
277 	for (i = 0; i < q->nr_levels; i++) {
278 		l_init(q->qs + i);
279 		q->target_count[i] = 0u;
280 	}
281 
282 	q->last_target_nr_elts = 0u;
283 	q->nr_top_levels = 0u;
284 	q->nr_in_top_levels = 0u;
285 }
286 
287 static unsigned q_size(struct queue *q)
288 {
289 	return q->nr_elts;
290 }
291 
292 /*
293  * Insert an entry to the back of the given level.
294  */
295 static void q_push(struct queue *q, struct entry *e)
296 {
297 	BUG_ON(e->pending_work);
298 
299 	if (!e->sentinel)
300 		q->nr_elts++;
301 
302 	l_add_tail(q->es, q->qs + e->level, e);
303 }
304 
305 static void q_push_front(struct queue *q, struct entry *e)
306 {
307 	BUG_ON(e->pending_work);
308 
309 	if (!e->sentinel)
310 		q->nr_elts++;
311 
312 	l_add_head(q->es, q->qs + e->level, e);
313 }
314 
315 static void q_push_before(struct queue *q, struct entry *old, struct entry *e)
316 {
317 	BUG_ON(e->pending_work);
318 
319 	if (!e->sentinel)
320 		q->nr_elts++;
321 
322 	l_add_before(q->es, q->qs + e->level, old, e);
323 }
324 
325 static void q_del(struct queue *q, struct entry *e)
326 {
327 	l_del(q->es, q->qs + e->level, e);
328 	if (!e->sentinel)
329 		q->nr_elts--;
330 }
331 
332 /*
333  * Return the oldest entry of the lowest populated level.
334  */
335 static struct entry *q_peek(struct queue *q, unsigned max_level, bool can_cross_sentinel)
336 {
337 	unsigned level;
338 	struct entry *e;
339 
340 	max_level = min(max_level, q->nr_levels);
341 
342 	for (level = 0; level < max_level; level++)
343 		for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e)) {
344 			if (e->sentinel) {
345 				if (can_cross_sentinel)
346 					continue;
347 				else
348 					break;
349 			}
350 
351 			return e;
352 		}
353 
354 	return NULL;
355 }
356 
357 static struct entry *q_pop(struct queue *q)
358 {
359 	struct entry *e = q_peek(q, q->nr_levels, true);
360 
361 	if (e)
362 		q_del(q, e);
363 
364 	return e;
365 }
366 
367 /*
368  * This function assumes there is a non-sentinel entry to pop.  It's only
369  * used by redistribute, so we know this is true.  It also doesn't adjust
370  * the q->nr_elts count.
371  */
372 static struct entry *__redist_pop_from(struct queue *q, unsigned level)
373 {
374 	struct entry *e;
375 
376 	for (; level < q->nr_levels; level++)
377 		for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e))
378 			if (!e->sentinel) {
379 				l_del(q->es, q->qs + e->level, e);
380 				return e;
381 			}
382 
383 	return NULL;
384 }
385 
386 static void q_set_targets_subrange_(struct queue *q, unsigned nr_elts, unsigned lbegin, unsigned lend)
387 {
388 	unsigned level, nr_levels, entries_per_level, remainder;
389 
390 	BUG_ON(lbegin > lend);
391 	BUG_ON(lend > q->nr_levels);
392 	nr_levels = lend - lbegin;
393 	entries_per_level = safe_div(nr_elts, nr_levels);
394 	remainder = safe_mod(nr_elts, nr_levels);
395 
396 	for (level = lbegin; level < lend; level++)
397 		q->target_count[level] =
398 			(level < (lbegin + remainder)) ? entries_per_level + 1u : entries_per_level;
399 }
400 
401 /*
402  * Typically we have fewer elements in the top few levels which allows us
403  * to adjust the promote threshold nicely.
404  */
405 static void q_set_targets(struct queue *q)
406 {
407 	if (q->last_target_nr_elts == q->nr_elts)
408 		return;
409 
410 	q->last_target_nr_elts = q->nr_elts;
411 
412 	if (q->nr_top_levels > q->nr_levels)
413 		q_set_targets_subrange_(q, q->nr_elts, 0, q->nr_levels);
414 
415 	else {
416 		q_set_targets_subrange_(q, q->nr_in_top_levels,
417 					q->nr_levels - q->nr_top_levels, q->nr_levels);
418 
419 		if (q->nr_in_top_levels < q->nr_elts)
420 			q_set_targets_subrange_(q, q->nr_elts - q->nr_in_top_levels,
421 						0, q->nr_levels - q->nr_top_levels);
422 		else
423 			q_set_targets_subrange_(q, 0, 0, q->nr_levels - q->nr_top_levels);
424 	}
425 }
426 
427 static void q_redistribute(struct queue *q)
428 {
429 	unsigned target, level;
430 	struct ilist *l, *l_above;
431 	struct entry *e;
432 
433 	q_set_targets(q);
434 
435 	for (level = 0u; level < q->nr_levels - 1u; level++) {
436 		l = q->qs + level;
437 		target = q->target_count[level];
438 
439 		/*
440 		 * Pull down some entries from the level above.
441 		 */
442 		while (l->nr_elts < target) {
443 			e = __redist_pop_from(q, level + 1u);
444 			if (!e) {
445 				/* bug in nr_elts */
446 				break;
447 			}
448 
449 			e->level = level;
450 			l_add_tail(q->es, l, e);
451 		}
452 
453 		/*
454 		 * Push some entries up.
455 		 */
456 		l_above = q->qs + level + 1u;
457 		while (l->nr_elts > target) {
458 			e = l_pop_tail(q->es, l);
459 
460 			if (!e)
461 				/* bug in nr_elts */
462 				break;
463 
464 			e->level = level + 1u;
465 			l_add_tail(q->es, l_above, e);
466 		}
467 	}
468 }
469 
470 static void q_requeue(struct queue *q, struct entry *e, unsigned extra_levels,
471 		      struct entry *s1, struct entry *s2)
472 {
473 	struct entry *de;
474 	unsigned sentinels_passed = 0;
475 	unsigned new_level = min(q->nr_levels - 1u, e->level + extra_levels);
476 
477 	/* try and find an entry to swap with */
478 	if (extra_levels && (e->level < q->nr_levels - 1u)) {
479 		for (de = l_head(q->es, q->qs + new_level); de && de->sentinel; de = l_next(q->es, de))
480 			sentinels_passed++;
481 
482 		if (de) {
483 			q_del(q, de);
484 			de->level = e->level;
485 			if (s1) {
486 				switch (sentinels_passed) {
487 				case 0:
488 					q_push_before(q, s1, de);
489 					break;
490 
491 				case 1:
492 					q_push_before(q, s2, de);
493 					break;
494 
495 				default:
496 					q_push(q, de);
497 				}
498 			} else
499 				q_push(q, de);
500 		}
501 	}
502 
503 	q_del(q, e);
504 	e->level = new_level;
505 	q_push(q, e);
506 }
507 
508 /*----------------------------------------------------------------*/
509 
510 #define FP_SHIFT 8
511 #define SIXTEENTH (1u << (FP_SHIFT - 4u))
512 #define EIGHTH (1u << (FP_SHIFT - 3u))
513 
514 struct stats {
515 	unsigned hit_threshold;
516 	unsigned hits;
517 	unsigned misses;
518 };
519 
520 enum performance {
521 	Q_POOR,
522 	Q_FAIR,
523 	Q_WELL
524 };
525 
526 static void stats_init(struct stats *s, unsigned nr_levels)
527 {
528 	s->hit_threshold = (nr_levels * 3u) / 4u;
529 	s->hits = 0u;
530 	s->misses = 0u;
531 }
532 
533 static void stats_reset(struct stats *s)
534 {
535 	s->hits = s->misses = 0u;
536 }
537 
538 static void stats_level_accessed(struct stats *s, unsigned level)
539 {
540 	if (level >= s->hit_threshold)
541 		s->hits++;
542 	else
543 		s->misses++;
544 }
545 
546 static void stats_miss(struct stats *s)
547 {
548 	s->misses++;
549 }
550 
551 /*
552  * There are times when we don't have any confidence in the hotspot queue.
553  * Such as when a fresh cache is created and the blocks have been spread
554  * out across the levels, or if an io load changes.  We detect this by
555  * seeing how often a lookup is in the top levels of the hotspot queue.
556  */
557 static enum performance stats_assess(struct stats *s)
558 {
559 	unsigned confidence = safe_div(s->hits << FP_SHIFT, s->hits + s->misses);
560 
561 	if (confidence < SIXTEENTH)
562 		return Q_POOR;
563 
564 	else if (confidence < EIGHTH)
565 		return Q_FAIR;
566 
567 	else
568 		return Q_WELL;
569 }
570 
571 /*----------------------------------------------------------------*/
572 
573 struct smq_hash_table {
574 	struct entry_space *es;
575 	unsigned long long hash_bits;
576 	unsigned *buckets;
577 };
578 
579 /*
580  * All cache entries are stored in a chained hash table.  To save space we
581  * use indexing again, and only store indexes to the next entry.
582  */
583 static int h_init(struct smq_hash_table *ht, struct entry_space *es, unsigned nr_entries)
584 {
585 	unsigned i, nr_buckets;
586 
587 	ht->es = es;
588 	nr_buckets = roundup_pow_of_two(max(nr_entries / 4u, 16u));
589 	ht->hash_bits = __ffs(nr_buckets);
590 
591 	ht->buckets = vmalloc(array_size(nr_buckets, sizeof(*ht->buckets)));
592 	if (!ht->buckets)
593 		return -ENOMEM;
594 
595 	for (i = 0; i < nr_buckets; i++)
596 		ht->buckets[i] = INDEXER_NULL;
597 
598 	return 0;
599 }
600 
601 static void h_exit(struct smq_hash_table *ht)
602 {
603 	vfree(ht->buckets);
604 }
605 
606 static struct entry *h_head(struct smq_hash_table *ht, unsigned bucket)
607 {
608 	return to_entry(ht->es, ht->buckets[bucket]);
609 }
610 
611 static struct entry *h_next(struct smq_hash_table *ht, struct entry *e)
612 {
613 	return to_entry(ht->es, e->hash_next);
614 }
615 
616 static void __h_insert(struct smq_hash_table *ht, unsigned bucket, struct entry *e)
617 {
618 	e->hash_next = ht->buckets[bucket];
619 	ht->buckets[bucket] = to_index(ht->es, e);
620 }
621 
622 static void h_insert(struct smq_hash_table *ht, struct entry *e)
623 {
624 	unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
625 	__h_insert(ht, h, e);
626 }
627 
628 static struct entry *__h_lookup(struct smq_hash_table *ht, unsigned h, dm_oblock_t oblock,
629 				struct entry **prev)
630 {
631 	struct entry *e;
632 
633 	*prev = NULL;
634 	for (e = h_head(ht, h); e; e = h_next(ht, e)) {
635 		if (e->oblock == oblock)
636 			return e;
637 
638 		*prev = e;
639 	}
640 
641 	return NULL;
642 }
643 
644 static void __h_unlink(struct smq_hash_table *ht, unsigned h,
645 		       struct entry *e, struct entry *prev)
646 {
647 	if (prev)
648 		prev->hash_next = e->hash_next;
649 	else
650 		ht->buckets[h] = e->hash_next;
651 }
652 
653 /*
654  * Also moves each entry to the front of the bucket.
655  */
656 static struct entry *h_lookup(struct smq_hash_table *ht, dm_oblock_t oblock)
657 {
658 	struct entry *e, *prev;
659 	unsigned h = hash_64(from_oblock(oblock), ht->hash_bits);
660 
661 	e = __h_lookup(ht, h, oblock, &prev);
662 	if (e && prev) {
663 		/*
664 		 * Move to the front because this entry is likely
665 		 * to be hit again.
666 		 */
667 		__h_unlink(ht, h, e, prev);
668 		__h_insert(ht, h, e);
669 	}
670 
671 	return e;
672 }
673 
674 static void h_remove(struct smq_hash_table *ht, struct entry *e)
675 {
676 	unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
677 	struct entry *prev;
678 
679 	/*
680 	 * The down side of using a singly linked list is we have to
681 	 * iterate the bucket to remove an item.
682 	 */
683 	e = __h_lookup(ht, h, e->oblock, &prev);
684 	if (e)
685 		__h_unlink(ht, h, e, prev);
686 }
687 
688 /*----------------------------------------------------------------*/
689 
690 struct entry_alloc {
691 	struct entry_space *es;
692 	unsigned begin;
693 
694 	unsigned nr_allocated;
695 	struct ilist free;
696 };
697 
698 static void init_allocator(struct entry_alloc *ea, struct entry_space *es,
699 			   unsigned begin, unsigned end)
700 {
701 	unsigned i;
702 
703 	ea->es = es;
704 	ea->nr_allocated = 0u;
705 	ea->begin = begin;
706 
707 	l_init(&ea->free);
708 	for (i = begin; i != end; i++)
709 		l_add_tail(ea->es, &ea->free, __get_entry(ea->es, i));
710 }
711 
712 static void init_entry(struct entry *e)
713 {
714 	/*
715 	 * We can't memset because that would clear the hotspot and
716 	 * sentinel bits which remain constant.
717 	 */
718 	e->hash_next = INDEXER_NULL;
719 	e->next = INDEXER_NULL;
720 	e->prev = INDEXER_NULL;
721 	e->level = 0u;
722 	e->dirty = true;	/* FIXME: audit */
723 	e->allocated = true;
724 	e->sentinel = false;
725 	e->pending_work = false;
726 }
727 
728 static struct entry *alloc_entry(struct entry_alloc *ea)
729 {
730 	struct entry *e;
731 
732 	if (l_empty(&ea->free))
733 		return NULL;
734 
735 	e = l_pop_head(ea->es, &ea->free);
736 	init_entry(e);
737 	ea->nr_allocated++;
738 
739 	return e;
740 }
741 
742 /*
743  * This assumes the cblock hasn't already been allocated.
744  */
745 static struct entry *alloc_particular_entry(struct entry_alloc *ea, unsigned i)
746 {
747 	struct entry *e = __get_entry(ea->es, ea->begin + i);
748 
749 	BUG_ON(e->allocated);
750 
751 	l_del(ea->es, &ea->free, e);
752 	init_entry(e);
753 	ea->nr_allocated++;
754 
755 	return e;
756 }
757 
758 static void free_entry(struct entry_alloc *ea, struct entry *e)
759 {
760 	BUG_ON(!ea->nr_allocated);
761 	BUG_ON(!e->allocated);
762 
763 	ea->nr_allocated--;
764 	e->allocated = false;
765 	l_add_tail(ea->es, &ea->free, e);
766 }
767 
768 static bool allocator_empty(struct entry_alloc *ea)
769 {
770 	return l_empty(&ea->free);
771 }
772 
773 static unsigned get_index(struct entry_alloc *ea, struct entry *e)
774 {
775 	return to_index(ea->es, e) - ea->begin;
776 }
777 
778 static struct entry *get_entry(struct entry_alloc *ea, unsigned index)
779 {
780 	return __get_entry(ea->es, ea->begin + index);
781 }
782 
783 /*----------------------------------------------------------------*/
784 
785 #define NR_HOTSPOT_LEVELS 64u
786 #define NR_CACHE_LEVELS 64u
787 
788 #define WRITEBACK_PERIOD (10ul * HZ)
789 #define DEMOTE_PERIOD (60ul * HZ)
790 
791 #define HOTSPOT_UPDATE_PERIOD (HZ)
792 #define CACHE_UPDATE_PERIOD (60ul * HZ)
793 
794 struct smq_policy {
795 	struct dm_cache_policy policy;
796 
797 	/* protects everything */
798 	spinlock_t lock;
799 	dm_cblock_t cache_size;
800 	sector_t cache_block_size;
801 
802 	sector_t hotspot_block_size;
803 	unsigned nr_hotspot_blocks;
804 	unsigned cache_blocks_per_hotspot_block;
805 	unsigned hotspot_level_jump;
806 
807 	struct entry_space es;
808 	struct entry_alloc writeback_sentinel_alloc;
809 	struct entry_alloc demote_sentinel_alloc;
810 	struct entry_alloc hotspot_alloc;
811 	struct entry_alloc cache_alloc;
812 
813 	unsigned long *hotspot_hit_bits;
814 	unsigned long *cache_hit_bits;
815 
816 	/*
817 	 * We maintain three queues of entries.  The cache proper,
818 	 * consisting of a clean and dirty queue, containing the currently
819 	 * active mappings.  The hotspot queue uses a larger block size to
820 	 * track blocks that are being hit frequently and potential
821 	 * candidates for promotion to the cache.
822 	 */
823 	struct queue hotspot;
824 	struct queue clean;
825 	struct queue dirty;
826 
827 	struct stats hotspot_stats;
828 	struct stats cache_stats;
829 
830 	/*
831 	 * Keeps track of time, incremented by the core.  We use this to
832 	 * avoid attributing multiple hits within the same tick.
833 	 */
834 	unsigned tick;
835 
836 	/*
837 	 * The hash tables allows us to quickly find an entry by origin
838 	 * block.
839 	 */
840 	struct smq_hash_table table;
841 	struct smq_hash_table hotspot_table;
842 
843 	bool current_writeback_sentinels;
844 	unsigned long next_writeback_period;
845 
846 	bool current_demote_sentinels;
847 	unsigned long next_demote_period;
848 
849 	unsigned write_promote_level;
850 	unsigned read_promote_level;
851 
852 	unsigned long next_hotspot_period;
853 	unsigned long next_cache_period;
854 
855 	struct background_tracker *bg_work;
856 
857 	bool migrations_allowed;
858 };
859 
860 /*----------------------------------------------------------------*/
861 
862 static struct entry *get_sentinel(struct entry_alloc *ea, unsigned level, bool which)
863 {
864 	return get_entry(ea, which ? level : NR_CACHE_LEVELS + level);
865 }
866 
867 static struct entry *writeback_sentinel(struct smq_policy *mq, unsigned level)
868 {
869 	return get_sentinel(&mq->writeback_sentinel_alloc, level, mq->current_writeback_sentinels);
870 }
871 
872 static struct entry *demote_sentinel(struct smq_policy *mq, unsigned level)
873 {
874 	return get_sentinel(&mq->demote_sentinel_alloc, level, mq->current_demote_sentinels);
875 }
876 
877 static void __update_writeback_sentinels(struct smq_policy *mq)
878 {
879 	unsigned level;
880 	struct queue *q = &mq->dirty;
881 	struct entry *sentinel;
882 
883 	for (level = 0; level < q->nr_levels; level++) {
884 		sentinel = writeback_sentinel(mq, level);
885 		q_del(q, sentinel);
886 		q_push(q, sentinel);
887 	}
888 }
889 
890 static void __update_demote_sentinels(struct smq_policy *mq)
891 {
892 	unsigned level;
893 	struct queue *q = &mq->clean;
894 	struct entry *sentinel;
895 
896 	for (level = 0; level < q->nr_levels; level++) {
897 		sentinel = demote_sentinel(mq, level);
898 		q_del(q, sentinel);
899 		q_push(q, sentinel);
900 	}
901 }
902 
903 static void update_sentinels(struct smq_policy *mq)
904 {
905 	if (time_after(jiffies, mq->next_writeback_period)) {
906 		mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
907 		mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
908 		__update_writeback_sentinels(mq);
909 	}
910 
911 	if (time_after(jiffies, mq->next_demote_period)) {
912 		mq->next_demote_period = jiffies + DEMOTE_PERIOD;
913 		mq->current_demote_sentinels = !mq->current_demote_sentinels;
914 		__update_demote_sentinels(mq);
915 	}
916 }
917 
918 static void __sentinels_init(struct smq_policy *mq)
919 {
920 	unsigned level;
921 	struct entry *sentinel;
922 
923 	for (level = 0; level < NR_CACHE_LEVELS; level++) {
924 		sentinel = writeback_sentinel(mq, level);
925 		sentinel->level = level;
926 		q_push(&mq->dirty, sentinel);
927 
928 		sentinel = demote_sentinel(mq, level);
929 		sentinel->level = level;
930 		q_push(&mq->clean, sentinel);
931 	}
932 }
933 
934 static void sentinels_init(struct smq_policy *mq)
935 {
936 	mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
937 	mq->next_demote_period = jiffies + DEMOTE_PERIOD;
938 
939 	mq->current_writeback_sentinels = false;
940 	mq->current_demote_sentinels = false;
941 	__sentinels_init(mq);
942 
943 	mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
944 	mq->current_demote_sentinels = !mq->current_demote_sentinels;
945 	__sentinels_init(mq);
946 }
947 
948 /*----------------------------------------------------------------*/
949 
950 static void del_queue(struct smq_policy *mq, struct entry *e)
951 {
952 	q_del(e->dirty ? &mq->dirty : &mq->clean, e);
953 }
954 
955 static void push_queue(struct smq_policy *mq, struct entry *e)
956 {
957 	if (e->dirty)
958 		q_push(&mq->dirty, e);
959 	else
960 		q_push(&mq->clean, e);
961 }
962 
963 // !h, !q, a -> h, q, a
964 static void push(struct smq_policy *mq, struct entry *e)
965 {
966 	h_insert(&mq->table, e);
967 	if (!e->pending_work)
968 		push_queue(mq, e);
969 }
970 
971 static void push_queue_front(struct smq_policy *mq, struct entry *e)
972 {
973 	if (e->dirty)
974 		q_push_front(&mq->dirty, e);
975 	else
976 		q_push_front(&mq->clean, e);
977 }
978 
979 static void push_front(struct smq_policy *mq, struct entry *e)
980 {
981 	h_insert(&mq->table, e);
982 	if (!e->pending_work)
983 		push_queue_front(mq, e);
984 }
985 
986 static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e)
987 {
988 	return to_cblock(get_index(&mq->cache_alloc, e));
989 }
990 
991 static void requeue(struct smq_policy *mq, struct entry *e)
992 {
993 	/*
994 	 * Pending work has temporarily been taken out of the queues.
995 	 */
996 	if (e->pending_work)
997 		return;
998 
999 	if (!test_and_set_bit(from_cblock(infer_cblock(mq, e)), mq->cache_hit_bits)) {
1000 		if (!e->dirty) {
1001 			q_requeue(&mq->clean, e, 1u, NULL, NULL);
1002 			return;
1003 		}
1004 
1005 		q_requeue(&mq->dirty, e, 1u,
1006 			  get_sentinel(&mq->writeback_sentinel_alloc, e->level, !mq->current_writeback_sentinels),
1007 			  get_sentinel(&mq->writeback_sentinel_alloc, e->level, mq->current_writeback_sentinels));
1008 	}
1009 }
1010 
1011 static unsigned default_promote_level(struct smq_policy *mq)
1012 {
1013 	/*
1014 	 * The promote level depends on the current performance of the
1015 	 * cache.
1016 	 *
1017 	 * If the cache is performing badly, then we can't afford
1018 	 * to promote much without causing performance to drop below that
1019 	 * of the origin device.
1020 	 *
1021 	 * If the cache is performing well, then we don't need to promote
1022 	 * much.  If it isn't broken, don't fix it.
1023 	 *
1024 	 * If the cache is middling then we promote more.
1025 	 *
1026 	 * This scheme reminds me of a graph of entropy vs probability of a
1027 	 * binary variable.
1028 	 */
1029 	static unsigned table[] = {1, 1, 1, 2, 4, 6, 7, 8, 7, 6, 4, 4, 3, 3, 2, 2, 1};
1030 
1031 	unsigned hits = mq->cache_stats.hits;
1032 	unsigned misses = mq->cache_stats.misses;
1033 	unsigned index = safe_div(hits << 4u, hits + misses);
1034 	return table[index];
1035 }
1036 
1037 static void update_promote_levels(struct smq_policy *mq)
1038 {
1039 	/*
1040 	 * If there are unused cache entries then we want to be really
1041 	 * eager to promote.
1042 	 */
1043 	unsigned threshold_level = allocator_empty(&mq->cache_alloc) ?
1044 		default_promote_level(mq) : (NR_HOTSPOT_LEVELS / 2u);
1045 
1046 	threshold_level = max(threshold_level, NR_HOTSPOT_LEVELS);
1047 
1048 	/*
1049 	 * If the hotspot queue is performing badly then we have little
1050 	 * confidence that we know which blocks to promote.  So we cut down
1051 	 * the amount of promotions.
1052 	 */
1053 	switch (stats_assess(&mq->hotspot_stats)) {
1054 	case Q_POOR:
1055 		threshold_level /= 4u;
1056 		break;
1057 
1058 	case Q_FAIR:
1059 		threshold_level /= 2u;
1060 		break;
1061 
1062 	case Q_WELL:
1063 		break;
1064 	}
1065 
1066 	mq->read_promote_level = NR_HOTSPOT_LEVELS - threshold_level;
1067 	mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level);
1068 }
1069 
1070 /*
1071  * If the hotspot queue is performing badly, then we try and move entries
1072  * around more quickly.
1073  */
1074 static void update_level_jump(struct smq_policy *mq)
1075 {
1076 	switch (stats_assess(&mq->hotspot_stats)) {
1077 	case Q_POOR:
1078 		mq->hotspot_level_jump = 4u;
1079 		break;
1080 
1081 	case Q_FAIR:
1082 		mq->hotspot_level_jump = 2u;
1083 		break;
1084 
1085 	case Q_WELL:
1086 		mq->hotspot_level_jump = 1u;
1087 		break;
1088 	}
1089 }
1090 
1091 static void end_hotspot_period(struct smq_policy *mq)
1092 {
1093 	clear_bitset(mq->hotspot_hit_bits, mq->nr_hotspot_blocks);
1094 	update_promote_levels(mq);
1095 
1096 	if (time_after(jiffies, mq->next_hotspot_period)) {
1097 		update_level_jump(mq);
1098 		q_redistribute(&mq->hotspot);
1099 		stats_reset(&mq->hotspot_stats);
1100 		mq->next_hotspot_period = jiffies + HOTSPOT_UPDATE_PERIOD;
1101 	}
1102 }
1103 
1104 static void end_cache_period(struct smq_policy *mq)
1105 {
1106 	if (time_after(jiffies, mq->next_cache_period)) {
1107 		clear_bitset(mq->cache_hit_bits, from_cblock(mq->cache_size));
1108 
1109 		q_redistribute(&mq->dirty);
1110 		q_redistribute(&mq->clean);
1111 		stats_reset(&mq->cache_stats);
1112 
1113 		mq->next_cache_period = jiffies + CACHE_UPDATE_PERIOD;
1114 	}
1115 }
1116 
1117 /*----------------------------------------------------------------*/
1118 
1119 /*
1120  * Targets are given as a percentage.
1121  */
1122 #define CLEAN_TARGET 25u
1123 #define FREE_TARGET 25u
1124 
1125 static unsigned percent_to_target(struct smq_policy *mq, unsigned p)
1126 {
1127 	return from_cblock(mq->cache_size) * p / 100u;
1128 }
1129 
1130 static bool clean_target_met(struct smq_policy *mq, bool idle)
1131 {
1132 	/*
1133 	 * Cache entries may not be populated.  So we cannot rely on the
1134 	 * size of the clean queue.
1135 	 */
1136 	if (idle) {
1137 		/*
1138 		 * We'd like to clean everything.
1139 		 */
1140 		return q_size(&mq->dirty) == 0u;
1141 	}
1142 
1143 	/*
1144 	 * If we're busy we don't worry about cleaning at all.
1145 	 */
1146 	return true;
1147 }
1148 
1149 static bool free_target_met(struct smq_policy *mq)
1150 {
1151 	unsigned nr_free;
1152 
1153 	nr_free = from_cblock(mq->cache_size) - mq->cache_alloc.nr_allocated;
1154 	return (nr_free + btracker_nr_demotions_queued(mq->bg_work)) >=
1155 		percent_to_target(mq, FREE_TARGET);
1156 }
1157 
1158 /*----------------------------------------------------------------*/
1159 
1160 static void mark_pending(struct smq_policy *mq, struct entry *e)
1161 {
1162 	BUG_ON(e->sentinel);
1163 	BUG_ON(!e->allocated);
1164 	BUG_ON(e->pending_work);
1165 	e->pending_work = true;
1166 }
1167 
1168 static void clear_pending(struct smq_policy *mq, struct entry *e)
1169 {
1170 	BUG_ON(!e->pending_work);
1171 	e->pending_work = false;
1172 }
1173 
1174 static void queue_writeback(struct smq_policy *mq, bool idle)
1175 {
1176 	int r;
1177 	struct policy_work work;
1178 	struct entry *e;
1179 
1180 	e = q_peek(&mq->dirty, mq->dirty.nr_levels, idle);
1181 	if (e) {
1182 		mark_pending(mq, e);
1183 		q_del(&mq->dirty, e);
1184 
1185 		work.op = POLICY_WRITEBACK;
1186 		work.oblock = e->oblock;
1187 		work.cblock = infer_cblock(mq, e);
1188 
1189 		r = btracker_queue(mq->bg_work, &work, NULL);
1190 		if (r) {
1191 			clear_pending(mq, e);
1192 			q_push_front(&mq->dirty, e);
1193 		}
1194 	}
1195 }
1196 
1197 static void queue_demotion(struct smq_policy *mq)
1198 {
1199 	int r;
1200 	struct policy_work work;
1201 	struct entry *e;
1202 
1203 	if (WARN_ON_ONCE(!mq->migrations_allowed))
1204 		return;
1205 
1206 	e = q_peek(&mq->clean, mq->clean.nr_levels / 2, true);
1207 	if (!e) {
1208 		if (!clean_target_met(mq, true))
1209 			queue_writeback(mq, false);
1210 		return;
1211 	}
1212 
1213 	mark_pending(mq, e);
1214 	q_del(&mq->clean, e);
1215 
1216 	work.op = POLICY_DEMOTE;
1217 	work.oblock = e->oblock;
1218 	work.cblock = infer_cblock(mq, e);
1219 	r = btracker_queue(mq->bg_work, &work, NULL);
1220 	if (r) {
1221 		clear_pending(mq, e);
1222 		q_push_front(&mq->clean, e);
1223 	}
1224 }
1225 
1226 static void queue_promotion(struct smq_policy *mq, dm_oblock_t oblock,
1227 			    struct policy_work **workp)
1228 {
1229 	int r;
1230 	struct entry *e;
1231 	struct policy_work work;
1232 
1233 	if (!mq->migrations_allowed)
1234 		return;
1235 
1236 	if (allocator_empty(&mq->cache_alloc)) {
1237 		/*
1238 		 * We always claim to be 'idle' to ensure some demotions happen
1239 		 * with continuous loads.
1240 		 */
1241 		if (!free_target_met(mq))
1242 			queue_demotion(mq);
1243 		return;
1244 	}
1245 
1246 	if (btracker_promotion_already_present(mq->bg_work, oblock))
1247 		return;
1248 
1249 	/*
1250 	 * We allocate the entry now to reserve the cblock.  If the
1251 	 * background work is aborted we must remember to free it.
1252 	 */
1253 	e = alloc_entry(&mq->cache_alloc);
1254 	BUG_ON(!e);
1255 	e->pending_work = true;
1256 	work.op = POLICY_PROMOTE;
1257 	work.oblock = oblock;
1258 	work.cblock = infer_cblock(mq, e);
1259 	r = btracker_queue(mq->bg_work, &work, workp);
1260 	if (r)
1261 		free_entry(&mq->cache_alloc, e);
1262 }
1263 
1264 /*----------------------------------------------------------------*/
1265 
1266 enum promote_result {
1267 	PROMOTE_NOT,
1268 	PROMOTE_TEMPORARY,
1269 	PROMOTE_PERMANENT
1270 };
1271 
1272 /*
1273  * Converts a boolean into a promote result.
1274  */
1275 static enum promote_result maybe_promote(bool promote)
1276 {
1277 	return promote ? PROMOTE_PERMANENT : PROMOTE_NOT;
1278 }
1279 
1280 static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e,
1281 					  int data_dir, bool fast_promote)
1282 {
1283 	if (data_dir == WRITE) {
1284 		if (!allocator_empty(&mq->cache_alloc) && fast_promote)
1285 			return PROMOTE_TEMPORARY;
1286 
1287 		return maybe_promote(hs_e->level >= mq->write_promote_level);
1288 	} else
1289 		return maybe_promote(hs_e->level >= mq->read_promote_level);
1290 }
1291 
1292 static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b)
1293 {
1294 	sector_t r = from_oblock(b);
1295 	(void) sector_div(r, mq->cache_blocks_per_hotspot_block);
1296 	return to_oblock(r);
1297 }
1298 
1299 static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b)
1300 {
1301 	unsigned hi;
1302 	dm_oblock_t hb = to_hblock(mq, b);
1303 	struct entry *e = h_lookup(&mq->hotspot_table, hb);
1304 
1305 	if (e) {
1306 		stats_level_accessed(&mq->hotspot_stats, e->level);
1307 
1308 		hi = get_index(&mq->hotspot_alloc, e);
1309 		q_requeue(&mq->hotspot, e,
1310 			  test_and_set_bit(hi, mq->hotspot_hit_bits) ?
1311 			  0u : mq->hotspot_level_jump,
1312 			  NULL, NULL);
1313 
1314 	} else {
1315 		stats_miss(&mq->hotspot_stats);
1316 
1317 		e = alloc_entry(&mq->hotspot_alloc);
1318 		if (!e) {
1319 			e = q_pop(&mq->hotspot);
1320 			if (e) {
1321 				h_remove(&mq->hotspot_table, e);
1322 				hi = get_index(&mq->hotspot_alloc, e);
1323 				clear_bit(hi, mq->hotspot_hit_bits);
1324 			}
1325 
1326 		}
1327 
1328 		if (e) {
1329 			e->oblock = hb;
1330 			q_push(&mq->hotspot, e);
1331 			h_insert(&mq->hotspot_table, e);
1332 		}
1333 	}
1334 
1335 	return e;
1336 }
1337 
1338 /*----------------------------------------------------------------*/
1339 
1340 /*
1341  * Public interface, via the policy struct.  See dm-cache-policy.h for a
1342  * description of these.
1343  */
1344 
1345 static struct smq_policy *to_smq_policy(struct dm_cache_policy *p)
1346 {
1347 	return container_of(p, struct smq_policy, policy);
1348 }
1349 
1350 static void smq_destroy(struct dm_cache_policy *p)
1351 {
1352 	struct smq_policy *mq = to_smq_policy(p);
1353 
1354 	btracker_destroy(mq->bg_work);
1355 	h_exit(&mq->hotspot_table);
1356 	h_exit(&mq->table);
1357 	free_bitset(mq->hotspot_hit_bits);
1358 	free_bitset(mq->cache_hit_bits);
1359 	space_exit(&mq->es);
1360 	kfree(mq);
1361 }
1362 
1363 /*----------------------------------------------------------------*/
1364 
1365 static int __lookup(struct smq_policy *mq, dm_oblock_t oblock, dm_cblock_t *cblock,
1366 		    int data_dir, bool fast_copy,
1367 		    struct policy_work **work, bool *background_work)
1368 {
1369 	struct entry *e, *hs_e;
1370 	enum promote_result pr;
1371 
1372 	*background_work = false;
1373 
1374 	e = h_lookup(&mq->table, oblock);
1375 	if (e) {
1376 		stats_level_accessed(&mq->cache_stats, e->level);
1377 
1378 		requeue(mq, e);
1379 		*cblock = infer_cblock(mq, e);
1380 		return 0;
1381 
1382 	} else {
1383 		stats_miss(&mq->cache_stats);
1384 
1385 		/*
1386 		 * The hotspot queue only gets updated with misses.
1387 		 */
1388 		hs_e = update_hotspot_queue(mq, oblock);
1389 
1390 		pr = should_promote(mq, hs_e, data_dir, fast_copy);
1391 		if (pr != PROMOTE_NOT) {
1392 			queue_promotion(mq, oblock, work);
1393 			*background_work = true;
1394 		}
1395 
1396 		return -ENOENT;
1397 	}
1398 }
1399 
1400 static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock,
1401 		      int data_dir, bool fast_copy,
1402 		      bool *background_work)
1403 {
1404 	int r;
1405 	unsigned long flags;
1406 	struct smq_policy *mq = to_smq_policy(p);
1407 
1408 	spin_lock_irqsave(&mq->lock, flags);
1409 	r = __lookup(mq, oblock, cblock,
1410 		     data_dir, fast_copy,
1411 		     NULL, background_work);
1412 	spin_unlock_irqrestore(&mq->lock, flags);
1413 
1414 	return r;
1415 }
1416 
1417 static int smq_lookup_with_work(struct dm_cache_policy *p,
1418 				dm_oblock_t oblock, dm_cblock_t *cblock,
1419 				int data_dir, bool fast_copy,
1420 				struct policy_work **work)
1421 {
1422 	int r;
1423 	bool background_queued;
1424 	unsigned long flags;
1425 	struct smq_policy *mq = to_smq_policy(p);
1426 
1427 	spin_lock_irqsave(&mq->lock, flags);
1428 	r = __lookup(mq, oblock, cblock, data_dir, fast_copy, work, &background_queued);
1429 	spin_unlock_irqrestore(&mq->lock, flags);
1430 
1431 	return r;
1432 }
1433 
1434 static int smq_get_background_work(struct dm_cache_policy *p, bool idle,
1435 				   struct policy_work **result)
1436 {
1437 	int r;
1438 	unsigned long flags;
1439 	struct smq_policy *mq = to_smq_policy(p);
1440 
1441 	spin_lock_irqsave(&mq->lock, flags);
1442 	r = btracker_issue(mq->bg_work, result);
1443 	if (r == -ENODATA) {
1444 		if (!clean_target_met(mq, idle)) {
1445 			queue_writeback(mq, idle);
1446 			r = btracker_issue(mq->bg_work, result);
1447 		}
1448 	}
1449 	spin_unlock_irqrestore(&mq->lock, flags);
1450 
1451 	return r;
1452 }
1453 
1454 /*
1455  * We need to clear any pending work flags that have been set, and in the
1456  * case of promotion free the entry for the destination cblock.
1457  */
1458 static void __complete_background_work(struct smq_policy *mq,
1459 				       struct policy_work *work,
1460 				       bool success)
1461 {
1462 	struct entry *e = get_entry(&mq->cache_alloc,
1463 				    from_cblock(work->cblock));
1464 
1465 	switch (work->op) {
1466 	case POLICY_PROMOTE:
1467 		// !h, !q, a
1468 		clear_pending(mq, e);
1469 		if (success) {
1470 			e->oblock = work->oblock;
1471 			e->level = NR_CACHE_LEVELS - 1;
1472 			push(mq, e);
1473 			// h, q, a
1474 		} else {
1475 			free_entry(&mq->cache_alloc, e);
1476 			// !h, !q, !a
1477 		}
1478 		break;
1479 
1480 	case POLICY_DEMOTE:
1481 		// h, !q, a
1482 		if (success) {
1483 			h_remove(&mq->table, e);
1484 			free_entry(&mq->cache_alloc, e);
1485 			// !h, !q, !a
1486 		} else {
1487 			clear_pending(mq, e);
1488 			push_queue(mq, e);
1489 			// h, q, a
1490 		}
1491 		break;
1492 
1493 	case POLICY_WRITEBACK:
1494 		// h, !q, a
1495 		clear_pending(mq, e);
1496 		push_queue(mq, e);
1497 		// h, q, a
1498 		break;
1499 	}
1500 
1501 	btracker_complete(mq->bg_work, work);
1502 }
1503 
1504 static void smq_complete_background_work(struct dm_cache_policy *p,
1505 					 struct policy_work *work,
1506 					 bool success)
1507 {
1508 	unsigned long flags;
1509 	struct smq_policy *mq = to_smq_policy(p);
1510 
1511 	spin_lock_irqsave(&mq->lock, flags);
1512 	__complete_background_work(mq, work, success);
1513 	spin_unlock_irqrestore(&mq->lock, flags);
1514 }
1515 
1516 // in_hash(oblock) -> in_hash(oblock)
1517 static void __smq_set_clear_dirty(struct smq_policy *mq, dm_cblock_t cblock, bool set)
1518 {
1519 	struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
1520 
1521 	if (e->pending_work)
1522 		e->dirty = set;
1523 	else {
1524 		del_queue(mq, e);
1525 		e->dirty = set;
1526 		push_queue(mq, e);
1527 	}
1528 }
1529 
1530 static void smq_set_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
1531 {
1532 	unsigned long flags;
1533 	struct smq_policy *mq = to_smq_policy(p);
1534 
1535 	spin_lock_irqsave(&mq->lock, flags);
1536 	__smq_set_clear_dirty(mq, cblock, true);
1537 	spin_unlock_irqrestore(&mq->lock, flags);
1538 }
1539 
1540 static void smq_clear_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
1541 {
1542 	struct smq_policy *mq = to_smq_policy(p);
1543 	unsigned long flags;
1544 
1545 	spin_lock_irqsave(&mq->lock, flags);
1546 	__smq_set_clear_dirty(mq, cblock, false);
1547 	spin_unlock_irqrestore(&mq->lock, flags);
1548 }
1549 
1550 static unsigned random_level(dm_cblock_t cblock)
1551 {
1552 	return hash_32(from_cblock(cblock), 9) & (NR_CACHE_LEVELS - 1);
1553 }
1554 
1555 static int smq_load_mapping(struct dm_cache_policy *p,
1556 			    dm_oblock_t oblock, dm_cblock_t cblock,
1557 			    bool dirty, uint32_t hint, bool hint_valid)
1558 {
1559 	struct smq_policy *mq = to_smq_policy(p);
1560 	struct entry *e;
1561 
1562 	e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock));
1563 	e->oblock = oblock;
1564 	e->dirty = dirty;
1565 	e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : random_level(cblock);
1566 	e->pending_work = false;
1567 
1568 	/*
1569 	 * When we load mappings we push ahead of both sentinels in order to
1570 	 * allow demotions and cleaning to occur immediately.
1571 	 */
1572 	push_front(mq, e);
1573 
1574 	return 0;
1575 }
1576 
1577 static int smq_invalidate_mapping(struct dm_cache_policy *p, dm_cblock_t cblock)
1578 {
1579 	struct smq_policy *mq = to_smq_policy(p);
1580 	struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
1581 
1582 	if (!e->allocated)
1583 		return -ENODATA;
1584 
1585 	// FIXME: what if this block has pending background work?
1586 	del_queue(mq, e);
1587 	h_remove(&mq->table, e);
1588 	free_entry(&mq->cache_alloc, e);
1589 	return 0;
1590 }
1591 
1592 static uint32_t smq_get_hint(struct dm_cache_policy *p, dm_cblock_t cblock)
1593 {
1594 	struct smq_policy *mq = to_smq_policy(p);
1595 	struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
1596 
1597 	if (!e->allocated)
1598 		return 0;
1599 
1600 	return e->level;
1601 }
1602 
1603 static dm_cblock_t smq_residency(struct dm_cache_policy *p)
1604 {
1605 	dm_cblock_t r;
1606 	unsigned long flags;
1607 	struct smq_policy *mq = to_smq_policy(p);
1608 
1609 	spin_lock_irqsave(&mq->lock, flags);
1610 	r = to_cblock(mq->cache_alloc.nr_allocated);
1611 	spin_unlock_irqrestore(&mq->lock, flags);
1612 
1613 	return r;
1614 }
1615 
1616 static void smq_tick(struct dm_cache_policy *p, bool can_block)
1617 {
1618 	struct smq_policy *mq = to_smq_policy(p);
1619 	unsigned long flags;
1620 
1621 	spin_lock_irqsave(&mq->lock, flags);
1622 	mq->tick++;
1623 	update_sentinels(mq);
1624 	end_hotspot_period(mq);
1625 	end_cache_period(mq);
1626 	spin_unlock_irqrestore(&mq->lock, flags);
1627 }
1628 
1629 static void smq_allow_migrations(struct dm_cache_policy *p, bool allow)
1630 {
1631 	struct smq_policy *mq = to_smq_policy(p);
1632 	mq->migrations_allowed = allow;
1633 }
1634 
1635 /*
1636  * smq has no config values, but the old mq policy did.  To avoid breaking
1637  * software we continue to accept these configurables for the mq policy,
1638  * but they have no effect.
1639  */
1640 static int mq_set_config_value(struct dm_cache_policy *p,
1641 			       const char *key, const char *value)
1642 {
1643 	unsigned long tmp;
1644 
1645 	if (kstrtoul(value, 10, &tmp))
1646 		return -EINVAL;
1647 
1648 	if (!strcasecmp(key, "random_threshold") ||
1649 	    !strcasecmp(key, "sequential_threshold") ||
1650 	    !strcasecmp(key, "discard_promote_adjustment") ||
1651 	    !strcasecmp(key, "read_promote_adjustment") ||
1652 	    !strcasecmp(key, "write_promote_adjustment")) {
1653 		DMWARN("tunable '%s' no longer has any effect, mq policy is now an alias for smq", key);
1654 		return 0;
1655 	}
1656 
1657 	return -EINVAL;
1658 }
1659 
1660 static int mq_emit_config_values(struct dm_cache_policy *p, char *result,
1661 				 unsigned maxlen, ssize_t *sz_ptr)
1662 {
1663 	ssize_t sz = *sz_ptr;
1664 
1665 	DMEMIT("10 random_threshold 0 "
1666 	       "sequential_threshold 0 "
1667 	       "discard_promote_adjustment 0 "
1668 	       "read_promote_adjustment 0 "
1669 	       "write_promote_adjustment 0 ");
1670 
1671 	*sz_ptr = sz;
1672 	return 0;
1673 }
1674 
1675 /* Init the policy plugin interface function pointers. */
1676 static void init_policy_functions(struct smq_policy *mq, bool mimic_mq)
1677 {
1678 	mq->policy.destroy = smq_destroy;
1679 	mq->policy.lookup = smq_lookup;
1680 	mq->policy.lookup_with_work = smq_lookup_with_work;
1681 	mq->policy.get_background_work = smq_get_background_work;
1682 	mq->policy.complete_background_work = smq_complete_background_work;
1683 	mq->policy.set_dirty = smq_set_dirty;
1684 	mq->policy.clear_dirty = smq_clear_dirty;
1685 	mq->policy.load_mapping = smq_load_mapping;
1686 	mq->policy.invalidate_mapping = smq_invalidate_mapping;
1687 	mq->policy.get_hint = smq_get_hint;
1688 	mq->policy.residency = smq_residency;
1689 	mq->policy.tick = smq_tick;
1690 	mq->policy.allow_migrations = smq_allow_migrations;
1691 
1692 	if (mimic_mq) {
1693 		mq->policy.set_config_value = mq_set_config_value;
1694 		mq->policy.emit_config_values = mq_emit_config_values;
1695 	}
1696 }
1697 
1698 static bool too_many_hotspot_blocks(sector_t origin_size,
1699 				    sector_t hotspot_block_size,
1700 				    unsigned nr_hotspot_blocks)
1701 {
1702 	return (hotspot_block_size * nr_hotspot_blocks) > origin_size;
1703 }
1704 
1705 static void calc_hotspot_params(sector_t origin_size,
1706 				sector_t cache_block_size,
1707 				unsigned nr_cache_blocks,
1708 				sector_t *hotspot_block_size,
1709 				unsigned *nr_hotspot_blocks)
1710 {
1711 	*hotspot_block_size = cache_block_size * 16u;
1712 	*nr_hotspot_blocks = max(nr_cache_blocks / 4u, 1024u);
1713 
1714 	while ((*hotspot_block_size > cache_block_size) &&
1715 	       too_many_hotspot_blocks(origin_size, *hotspot_block_size, *nr_hotspot_blocks))
1716 		*hotspot_block_size /= 2u;
1717 }
1718 
1719 static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size,
1720 					    sector_t origin_size,
1721 					    sector_t cache_block_size,
1722 					    bool mimic_mq,
1723 					    bool migrations_allowed)
1724 {
1725 	unsigned i;
1726 	unsigned nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS;
1727 	unsigned total_sentinels = 2u * nr_sentinels_per_queue;
1728 	struct smq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL);
1729 
1730 	if (!mq)
1731 		return NULL;
1732 
1733 	init_policy_functions(mq, mimic_mq);
1734 	mq->cache_size = cache_size;
1735 	mq->cache_block_size = cache_block_size;
1736 
1737 	calc_hotspot_params(origin_size, cache_block_size, from_cblock(cache_size),
1738 			    &mq->hotspot_block_size, &mq->nr_hotspot_blocks);
1739 
1740 	mq->cache_blocks_per_hotspot_block = div64_u64(mq->hotspot_block_size, mq->cache_block_size);
1741 	mq->hotspot_level_jump = 1u;
1742 	if (space_init(&mq->es, total_sentinels + mq->nr_hotspot_blocks + from_cblock(cache_size))) {
1743 		DMERR("couldn't initialize entry space");
1744 		goto bad_pool_init;
1745 	}
1746 
1747 	init_allocator(&mq->writeback_sentinel_alloc, &mq->es, 0, nr_sentinels_per_queue);
1748 	for (i = 0; i < nr_sentinels_per_queue; i++)
1749 		get_entry(&mq->writeback_sentinel_alloc, i)->sentinel = true;
1750 
1751 	init_allocator(&mq->demote_sentinel_alloc, &mq->es, nr_sentinels_per_queue, total_sentinels);
1752 	for (i = 0; i < nr_sentinels_per_queue; i++)
1753 		get_entry(&mq->demote_sentinel_alloc, i)->sentinel = true;
1754 
1755 	init_allocator(&mq->hotspot_alloc, &mq->es, total_sentinels,
1756 		       total_sentinels + mq->nr_hotspot_blocks);
1757 
1758 	init_allocator(&mq->cache_alloc, &mq->es,
1759 		       total_sentinels + mq->nr_hotspot_blocks,
1760 		       total_sentinels + mq->nr_hotspot_blocks + from_cblock(cache_size));
1761 
1762 	mq->hotspot_hit_bits = alloc_bitset(mq->nr_hotspot_blocks);
1763 	if (!mq->hotspot_hit_bits) {
1764 		DMERR("couldn't allocate hotspot hit bitset");
1765 		goto bad_hotspot_hit_bits;
1766 	}
1767 	clear_bitset(mq->hotspot_hit_bits, mq->nr_hotspot_blocks);
1768 
1769 	if (from_cblock(cache_size)) {
1770 		mq->cache_hit_bits = alloc_bitset(from_cblock(cache_size));
1771 		if (!mq->cache_hit_bits) {
1772 			DMERR("couldn't allocate cache hit bitset");
1773 			goto bad_cache_hit_bits;
1774 		}
1775 		clear_bitset(mq->cache_hit_bits, from_cblock(mq->cache_size));
1776 	} else
1777 		mq->cache_hit_bits = NULL;
1778 
1779 	mq->tick = 0;
1780 	spin_lock_init(&mq->lock);
1781 
1782 	q_init(&mq->hotspot, &mq->es, NR_HOTSPOT_LEVELS);
1783 	mq->hotspot.nr_top_levels = 8;
1784 	mq->hotspot.nr_in_top_levels = min(mq->nr_hotspot_blocks / NR_HOTSPOT_LEVELS,
1785 					   from_cblock(mq->cache_size) / mq->cache_blocks_per_hotspot_block);
1786 
1787 	q_init(&mq->clean, &mq->es, NR_CACHE_LEVELS);
1788 	q_init(&mq->dirty, &mq->es, NR_CACHE_LEVELS);
1789 
1790 	stats_init(&mq->hotspot_stats, NR_HOTSPOT_LEVELS);
1791 	stats_init(&mq->cache_stats, NR_CACHE_LEVELS);
1792 
1793 	if (h_init(&mq->table, &mq->es, from_cblock(cache_size)))
1794 		goto bad_alloc_table;
1795 
1796 	if (h_init(&mq->hotspot_table, &mq->es, mq->nr_hotspot_blocks))
1797 		goto bad_alloc_hotspot_table;
1798 
1799 	sentinels_init(mq);
1800 	mq->write_promote_level = mq->read_promote_level = NR_HOTSPOT_LEVELS;
1801 
1802 	mq->next_hotspot_period = jiffies;
1803 	mq->next_cache_period = jiffies;
1804 
1805 	mq->bg_work = btracker_create(4096); /* FIXME: hard coded value */
1806 	if (!mq->bg_work)
1807 		goto bad_btracker;
1808 
1809 	mq->migrations_allowed = migrations_allowed;
1810 
1811 	return &mq->policy;
1812 
1813 bad_btracker:
1814 	h_exit(&mq->hotspot_table);
1815 bad_alloc_hotspot_table:
1816 	h_exit(&mq->table);
1817 bad_alloc_table:
1818 	free_bitset(mq->cache_hit_bits);
1819 bad_cache_hit_bits:
1820 	free_bitset(mq->hotspot_hit_bits);
1821 bad_hotspot_hit_bits:
1822 	space_exit(&mq->es);
1823 bad_pool_init:
1824 	kfree(mq);
1825 
1826 	return NULL;
1827 }
1828 
1829 static struct dm_cache_policy *smq_create(dm_cblock_t cache_size,
1830 					  sector_t origin_size,
1831 					  sector_t cache_block_size)
1832 {
1833 	return __smq_create(cache_size, origin_size, cache_block_size, false, true);
1834 }
1835 
1836 static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
1837 					 sector_t origin_size,
1838 					 sector_t cache_block_size)
1839 {
1840 	return __smq_create(cache_size, origin_size, cache_block_size, true, true);
1841 }
1842 
1843 static struct dm_cache_policy *cleaner_create(dm_cblock_t cache_size,
1844 					      sector_t origin_size,
1845 					      sector_t cache_block_size)
1846 {
1847 	return __smq_create(cache_size, origin_size, cache_block_size, false, false);
1848 }
1849 
1850 /*----------------------------------------------------------------*/
1851 
1852 static struct dm_cache_policy_type smq_policy_type = {
1853 	.name = "smq",
1854 	.version = {2, 0, 0},
1855 	.hint_size = 4,
1856 	.owner = THIS_MODULE,
1857 	.create = smq_create
1858 };
1859 
1860 static struct dm_cache_policy_type mq_policy_type = {
1861 	.name = "mq",
1862 	.version = {2, 0, 0},
1863 	.hint_size = 4,
1864 	.owner = THIS_MODULE,
1865 	.create = mq_create,
1866 };
1867 
1868 static struct dm_cache_policy_type cleaner_policy_type = {
1869 	.name = "cleaner",
1870 	.version = {2, 0, 0},
1871 	.hint_size = 4,
1872 	.owner = THIS_MODULE,
1873 	.create = cleaner_create,
1874 };
1875 
1876 static struct dm_cache_policy_type default_policy_type = {
1877 	.name = "default",
1878 	.version = {2, 0, 0},
1879 	.hint_size = 4,
1880 	.owner = THIS_MODULE,
1881 	.create = smq_create,
1882 	.real = &smq_policy_type
1883 };
1884 
1885 static int __init smq_init(void)
1886 {
1887 	int r;
1888 
1889 	r = dm_cache_policy_register(&smq_policy_type);
1890 	if (r) {
1891 		DMERR("register failed %d", r);
1892 		return -ENOMEM;
1893 	}
1894 
1895 	r = dm_cache_policy_register(&mq_policy_type);
1896 	if (r) {
1897 		DMERR("register failed (as mq) %d", r);
1898 		goto out_mq;
1899 	}
1900 
1901 	r = dm_cache_policy_register(&cleaner_policy_type);
1902 	if (r) {
1903 		DMERR("register failed (as cleaner) %d", r);
1904 		goto out_cleaner;
1905 	}
1906 
1907 	r = dm_cache_policy_register(&default_policy_type);
1908 	if (r) {
1909 		DMERR("register failed (as default) %d", r);
1910 		goto out_default;
1911 	}
1912 
1913 	return 0;
1914 
1915 out_default:
1916 	dm_cache_policy_unregister(&cleaner_policy_type);
1917 out_cleaner:
1918 	dm_cache_policy_unregister(&mq_policy_type);
1919 out_mq:
1920 	dm_cache_policy_unregister(&smq_policy_type);
1921 
1922 	return -ENOMEM;
1923 }
1924 
1925 static void __exit smq_exit(void)
1926 {
1927 	dm_cache_policy_unregister(&cleaner_policy_type);
1928 	dm_cache_policy_unregister(&smq_policy_type);
1929 	dm_cache_policy_unregister(&mq_policy_type);
1930 	dm_cache_policy_unregister(&default_policy_type);
1931 }
1932 
1933 module_init(smq_init);
1934 module_exit(smq_exit);
1935 
1936 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
1937 MODULE_LICENSE("GPL");
1938 MODULE_DESCRIPTION("smq cache policy");
1939 
1940 MODULE_ALIAS("dm-cache-default");
1941 MODULE_ALIAS("dm-cache-mq");
1942 MODULE_ALIAS("dm-cache-cleaner");
1943