xref: /linux/drivers/md/dm-ps-historical-service-time.c (revision 9a87ffc99ec8eb8d35eed7c4f816d75f5cc9662e)
1*3bd94003SHeinz Mauelshagen // SPDX-License-Identifier: GPL-2.0-only
2298fb372SMike Snitzer /*
3298fb372SMike Snitzer  * Historical Service Time
4298fb372SMike Snitzer  *
5298fb372SMike Snitzer  *  Keeps a time-weighted exponential moving average of the historical
6298fb372SMike Snitzer  *  service time. Estimates future service time based on the historical
7298fb372SMike Snitzer  *  service time and the number of outstanding requests.
8298fb372SMike Snitzer  *
9298fb372SMike Snitzer  *  Marks paths stale if they have not finished within hst *
10298fb372SMike Snitzer  *  num_paths. If a path is stale and unused, we will send a single
11298fb372SMike Snitzer  *  request to probe in case the path has improved. This situation
12298fb372SMike Snitzer  *  generally arises if the path is so much worse than others that it
13298fb372SMike Snitzer  *  will never have the best estimated service time, or if the entire
14298fb372SMike Snitzer  *  multipath device is unused. If a path is stale and in use, limit the
15298fb372SMike Snitzer  *  number of requests it can receive with the assumption that the path
16298fb372SMike Snitzer  *  has become degraded.
17298fb372SMike Snitzer  *
18298fb372SMike Snitzer  *  To avoid repeatedly calculating exponents for time weighting, times
19298fb372SMike Snitzer  *  are split into HST_WEIGHT_COUNT buckets each (1 >> HST_BUCKET_SHIFT)
20298fb372SMike Snitzer  *  ns, and the weighting is pre-calculated.
21298fb372SMike Snitzer  *
22298fb372SMike Snitzer  */
23298fb372SMike Snitzer 
24298fb372SMike Snitzer #include "dm.h"
25298fb372SMike Snitzer #include "dm-path-selector.h"
26298fb372SMike Snitzer 
27298fb372SMike Snitzer #include <linux/blkdev.h>
28298fb372SMike Snitzer #include <linux/slab.h>
29298fb372SMike Snitzer #include <linux/module.h>
30298fb372SMike Snitzer 
31298fb372SMike Snitzer 
32298fb372SMike Snitzer #define DM_MSG_PREFIX	"multipath historical-service-time"
33298fb372SMike Snitzer #define HST_MIN_IO 1
34298fb372SMike Snitzer #define HST_VERSION "0.1.1"
35298fb372SMike Snitzer 
36298fb372SMike Snitzer #define HST_FIXED_SHIFT 10  /* 10 bits of decimal precision */
37298fb372SMike Snitzer #define HST_FIXED_MAX (ULLONG_MAX >> HST_FIXED_SHIFT)
38298fb372SMike Snitzer #define HST_FIXED_1 (1 << HST_FIXED_SHIFT)
39298fb372SMike Snitzer #define HST_FIXED_95 972
40298fb372SMike Snitzer #define HST_MAX_INFLIGHT HST_FIXED_1
41298fb372SMike Snitzer #define HST_BUCKET_SHIFT 24 /* Buckets are ~ 16ms */
42298fb372SMike Snitzer #define HST_WEIGHT_COUNT 64ULL
43298fb372SMike Snitzer 
44298fb372SMike Snitzer struct selector {
45298fb372SMike Snitzer 	struct list_head valid_paths;
46298fb372SMike Snitzer 	struct list_head failed_paths;
47298fb372SMike Snitzer 	int valid_count;
48298fb372SMike Snitzer 	spinlock_t lock;
49298fb372SMike Snitzer 
50298fb372SMike Snitzer 	unsigned int weights[HST_WEIGHT_COUNT];
51298fb372SMike Snitzer 	unsigned int threshold_multiplier;
52298fb372SMike Snitzer };
53298fb372SMike Snitzer 
54298fb372SMike Snitzer struct path_info {
55298fb372SMike Snitzer 	struct list_head list;
56298fb372SMike Snitzer 	struct dm_path *path;
57298fb372SMike Snitzer 	unsigned int repeat_count;
58298fb372SMike Snitzer 
59298fb372SMike Snitzer 	spinlock_t lock;
60298fb372SMike Snitzer 
61298fb372SMike Snitzer 	u64 historical_service_time; /* Fixed point */
62298fb372SMike Snitzer 
63298fb372SMike Snitzer 	u64 stale_after;
64298fb372SMike Snitzer 	u64 last_finish;
65298fb372SMike Snitzer 
66298fb372SMike Snitzer 	u64 outstanding;
67298fb372SMike Snitzer };
68298fb372SMike Snitzer 
69298fb372SMike Snitzer /**
70298fb372SMike Snitzer  * fixed_power - compute: x^n, in O(log n) time
71298fb372SMike Snitzer  *
72298fb372SMike Snitzer  * @x:         base of the power
73298fb372SMike Snitzer  * @frac_bits: fractional bits of @x
74298fb372SMike Snitzer  * @n:         power to raise @x to.
75298fb372SMike Snitzer  *
76298fb372SMike Snitzer  * By exploiting the relation between the definition of the natural power
77298fb372SMike Snitzer  * function: x^n := x*x*...*x (x multiplied by itself for n times), and
78298fb372SMike Snitzer  * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
79298fb372SMike Snitzer  * (where: n_i \elem {0, 1}, the binary vector representing n),
80298fb372SMike Snitzer  * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
81298fb372SMike Snitzer  * of course trivially computable in O(log_2 n), the length of our binary
82298fb372SMike Snitzer  * vector.
83298fb372SMike Snitzer  *
84298fb372SMike Snitzer  * (see: kernel/sched/loadavg.c)
85298fb372SMike Snitzer  */
fixed_power(u64 x,unsigned int frac_bits,unsigned int n)86298fb372SMike Snitzer static u64 fixed_power(u64 x, unsigned int frac_bits, unsigned int n)
87298fb372SMike Snitzer {
88298fb372SMike Snitzer 	unsigned long result = 1UL << frac_bits;
89298fb372SMike Snitzer 
90298fb372SMike Snitzer 	if (n) {
91298fb372SMike Snitzer 		for (;;) {
92298fb372SMike Snitzer 			if (n & 1) {
93298fb372SMike Snitzer 				result *= x;
94298fb372SMike Snitzer 				result += 1UL << (frac_bits - 1);
95298fb372SMike Snitzer 				result >>= frac_bits;
96298fb372SMike Snitzer 			}
97298fb372SMike Snitzer 			n >>= 1;
98298fb372SMike Snitzer 			if (!n)
99298fb372SMike Snitzer 				break;
100298fb372SMike Snitzer 			x *= x;
101298fb372SMike Snitzer 			x += 1UL << (frac_bits - 1);
102298fb372SMike Snitzer 			x >>= frac_bits;
103298fb372SMike Snitzer 		}
104298fb372SMike Snitzer 	}
105298fb372SMike Snitzer 
106298fb372SMike Snitzer 	return result;
107298fb372SMike Snitzer }
108298fb372SMike Snitzer 
109298fb372SMike Snitzer /*
110298fb372SMike Snitzer  * Calculate the next value of an exponential moving average
111298fb372SMike Snitzer  * a_1 = a_0 * e + a * (1 - e)
112298fb372SMike Snitzer  *
113298fb372SMike Snitzer  * @last: [0, ULLONG_MAX >> HST_FIXED_SHIFT]
114298fb372SMike Snitzer  * @next: [0, ULLONG_MAX >> HST_FIXED_SHIFT]
115298fb372SMike Snitzer  * @weight: [0, HST_FIXED_1]
116298fb372SMike Snitzer  *
117298fb372SMike Snitzer  * Note:
118298fb372SMike Snitzer  *   To account for multiple periods in the same calculation,
119298fb372SMike Snitzer  *   a_n = a_0 * e^n + a * (1 - e^n),
120298fb372SMike Snitzer  *   so call fixed_ema(last, next, pow(weight, N))
121298fb372SMike Snitzer  */
fixed_ema(u64 last,u64 next,u64 weight)122298fb372SMike Snitzer static u64 fixed_ema(u64 last, u64 next, u64 weight)
123298fb372SMike Snitzer {
124298fb372SMike Snitzer 	last *= weight;
125298fb372SMike Snitzer 	last += next * (HST_FIXED_1 - weight);
126298fb372SMike Snitzer 	last += 1ULL << (HST_FIXED_SHIFT - 1);
127298fb372SMike Snitzer 	return last >> HST_FIXED_SHIFT;
128298fb372SMike Snitzer }
129298fb372SMike Snitzer 
alloc_selector(void)130298fb372SMike Snitzer static struct selector *alloc_selector(void)
131298fb372SMike Snitzer {
132298fb372SMike Snitzer 	struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
133298fb372SMike Snitzer 
134298fb372SMike Snitzer 	if (s) {
135298fb372SMike Snitzer 		INIT_LIST_HEAD(&s->valid_paths);
136298fb372SMike Snitzer 		INIT_LIST_HEAD(&s->failed_paths);
137298fb372SMike Snitzer 		spin_lock_init(&s->lock);
138298fb372SMike Snitzer 		s->valid_count = 0;
139298fb372SMike Snitzer 	}
140298fb372SMike Snitzer 
141298fb372SMike Snitzer 	return s;
142298fb372SMike Snitzer }
143298fb372SMike Snitzer 
144298fb372SMike Snitzer /*
145298fb372SMike Snitzer  * Get the weight for a given time span.
146298fb372SMike Snitzer  */
hst_weight(struct path_selector * ps,u64 delta)147298fb372SMike Snitzer static u64 hst_weight(struct path_selector *ps, u64 delta)
148298fb372SMike Snitzer {
149298fb372SMike Snitzer 	struct selector *s = ps->context;
150298fb372SMike Snitzer 	int bucket = clamp(delta >> HST_BUCKET_SHIFT, 0ULL,
151298fb372SMike Snitzer 			   HST_WEIGHT_COUNT - 1);
152298fb372SMike Snitzer 
153298fb372SMike Snitzer 	return s->weights[bucket];
154298fb372SMike Snitzer }
155298fb372SMike Snitzer 
156298fb372SMike Snitzer /*
157298fb372SMike Snitzer  * Set up the weights array.
158298fb372SMike Snitzer  *
159298fb372SMike Snitzer  * weights[len-1] = 0
160298fb372SMike Snitzer  * weights[n] = base ^ (n + 1)
161298fb372SMike Snitzer  */
hst_set_weights(struct path_selector * ps,unsigned int base)162298fb372SMike Snitzer static void hst_set_weights(struct path_selector *ps, unsigned int base)
163298fb372SMike Snitzer {
164298fb372SMike Snitzer 	struct selector *s = ps->context;
165298fb372SMike Snitzer 	int i;
166298fb372SMike Snitzer 
167298fb372SMike Snitzer 	if (base >= HST_FIXED_1)
168298fb372SMike Snitzer 		return;
169298fb372SMike Snitzer 
170298fb372SMike Snitzer 	for (i = 0; i < HST_WEIGHT_COUNT - 1; i++)
171298fb372SMike Snitzer 		s->weights[i] = fixed_power(base, HST_FIXED_SHIFT, i + 1);
172298fb372SMike Snitzer 	s->weights[HST_WEIGHT_COUNT - 1] = 0;
173298fb372SMike Snitzer }
174298fb372SMike Snitzer 
hst_create(struct path_selector * ps,unsigned int argc,char ** argv)175298fb372SMike Snitzer static int hst_create(struct path_selector *ps, unsigned int argc, char **argv)
176298fb372SMike Snitzer {
177298fb372SMike Snitzer 	struct selector *s;
178298fb372SMike Snitzer 	unsigned int base_weight = HST_FIXED_95;
179298fb372SMike Snitzer 	unsigned int threshold_multiplier = 0;
180298fb372SMike Snitzer 	char dummy;
181298fb372SMike Snitzer 
182298fb372SMike Snitzer 	/*
183298fb372SMike Snitzer 	 * Arguments: [<base_weight> [<threshold_multiplier>]]
184298fb372SMike Snitzer 	 *   <base_weight>: Base weight for ema [0, 1024) 10-bit fixed point. A
185298fb372SMike Snitzer 	 *                  value of 0 will completely ignore any history.
186298fb372SMike Snitzer 	 *                  If not given, default (HST_FIXED_95) is used.
187298fb372SMike Snitzer 	 *   <threshold_multiplier>: Minimum threshold multiplier for paths to
188298fb372SMike Snitzer 	 *                  be considered different. That is, a path is
189298fb372SMike Snitzer 	 *                  considered different iff (p1 > N * p2) where p1
190298fb372SMike Snitzer 	 *                  is the path with higher service time. A threshold
191298fb372SMike Snitzer 	 *                  of 1 or 0 has no effect. Defaults to 0.
192298fb372SMike Snitzer 	 */
193298fb372SMike Snitzer 	if (argc > 2)
194298fb372SMike Snitzer 		return -EINVAL;
195298fb372SMike Snitzer 
196298fb372SMike Snitzer 	if (argc && (sscanf(argv[0], "%u%c", &base_weight, &dummy) != 1 ||
197298fb372SMike Snitzer 	     base_weight >= HST_FIXED_1)) {
198298fb372SMike Snitzer 		return -EINVAL;
199298fb372SMike Snitzer 	}
200298fb372SMike Snitzer 
201298fb372SMike Snitzer 	if (argc > 1 && (sscanf(argv[1], "%u%c",
202298fb372SMike Snitzer 				&threshold_multiplier, &dummy) != 1)) {
203298fb372SMike Snitzer 		return -EINVAL;
204298fb372SMike Snitzer 	}
205298fb372SMike Snitzer 
206298fb372SMike Snitzer 	s = alloc_selector();
207298fb372SMike Snitzer 	if (!s)
208298fb372SMike Snitzer 		return -ENOMEM;
209298fb372SMike Snitzer 
210298fb372SMike Snitzer 	ps->context = s;
211298fb372SMike Snitzer 
212298fb372SMike Snitzer 	hst_set_weights(ps, base_weight);
213298fb372SMike Snitzer 	s->threshold_multiplier = threshold_multiplier;
214298fb372SMike Snitzer 	return 0;
215298fb372SMike Snitzer }
216298fb372SMike Snitzer 
free_paths(struct list_head * paths)217298fb372SMike Snitzer static void free_paths(struct list_head *paths)
218298fb372SMike Snitzer {
219298fb372SMike Snitzer 	struct path_info *pi, *next;
220298fb372SMike Snitzer 
221298fb372SMike Snitzer 	list_for_each_entry_safe(pi, next, paths, list) {
222298fb372SMike Snitzer 		list_del(&pi->list);
223298fb372SMike Snitzer 		kfree(pi);
224298fb372SMike Snitzer 	}
225298fb372SMike Snitzer }
226298fb372SMike Snitzer 
hst_destroy(struct path_selector * ps)227298fb372SMike Snitzer static void hst_destroy(struct path_selector *ps)
228298fb372SMike Snitzer {
229298fb372SMike Snitzer 	struct selector *s = ps->context;
230298fb372SMike Snitzer 
231298fb372SMike Snitzer 	free_paths(&s->valid_paths);
232298fb372SMike Snitzer 	free_paths(&s->failed_paths);
233298fb372SMike Snitzer 	kfree(s);
234298fb372SMike Snitzer 	ps->context = NULL;
235298fb372SMike Snitzer }
236298fb372SMike Snitzer 
hst_status(struct path_selector * ps,struct dm_path * path,status_type_t type,char * result,unsigned int maxlen)237298fb372SMike Snitzer static int hst_status(struct path_selector *ps, struct dm_path *path,
238298fb372SMike Snitzer 		     status_type_t type, char *result, unsigned int maxlen)
239298fb372SMike Snitzer {
240298fb372SMike Snitzer 	unsigned int sz = 0;
241298fb372SMike Snitzer 	struct path_info *pi;
242298fb372SMike Snitzer 
243298fb372SMike Snitzer 	if (!path) {
244298fb372SMike Snitzer 		struct selector *s = ps->context;
245298fb372SMike Snitzer 
246298fb372SMike Snitzer 		DMEMIT("2 %u %u ", s->weights[0], s->threshold_multiplier);
247298fb372SMike Snitzer 	} else {
248298fb372SMike Snitzer 		pi = path->pscontext;
249298fb372SMike Snitzer 
250298fb372SMike Snitzer 		switch (type) {
251298fb372SMike Snitzer 		case STATUSTYPE_INFO:
252298fb372SMike Snitzer 			DMEMIT("%llu %llu %llu ", pi->historical_service_time,
253298fb372SMike Snitzer 			       pi->outstanding, pi->stale_after);
254298fb372SMike Snitzer 			break;
255298fb372SMike Snitzer 		case STATUSTYPE_TABLE:
256298fb372SMike Snitzer 			DMEMIT("0 ");
257298fb372SMike Snitzer 			break;
2588ec45662STushar Sugandhi 		case STATUSTYPE_IMA:
2598ec45662STushar Sugandhi 			*result = '\0';
2608ec45662STushar Sugandhi 			break;
261298fb372SMike Snitzer 		}
262298fb372SMike Snitzer 	}
263298fb372SMike Snitzer 
264298fb372SMike Snitzer 	return sz;
265298fb372SMike Snitzer }
266298fb372SMike Snitzer 
hst_add_path(struct path_selector * ps,struct dm_path * path,int argc,char ** argv,char ** error)267298fb372SMike Snitzer static int hst_add_path(struct path_selector *ps, struct dm_path *path,
268298fb372SMike Snitzer 		       int argc, char **argv, char **error)
269298fb372SMike Snitzer {
270298fb372SMike Snitzer 	struct selector *s = ps->context;
271298fb372SMike Snitzer 	struct path_info *pi;
272298fb372SMike Snitzer 	unsigned int repeat_count = HST_MIN_IO;
273298fb372SMike Snitzer 	char dummy;
274298fb372SMike Snitzer 	unsigned long flags;
275298fb372SMike Snitzer 
276298fb372SMike Snitzer 	/*
277298fb372SMike Snitzer 	 * Arguments: [<repeat_count>]
278298fb372SMike Snitzer 	 *   <repeat_count>: The number of I/Os before switching path.
279298fb372SMike Snitzer 	 *                   If not given, default (HST_MIN_IO) is used.
280298fb372SMike Snitzer 	 */
281298fb372SMike Snitzer 	if (argc > 1) {
282298fb372SMike Snitzer 		*error = "historical-service-time ps: incorrect number of arguments";
283298fb372SMike Snitzer 		return -EINVAL;
284298fb372SMike Snitzer 	}
285298fb372SMike Snitzer 
286298fb372SMike Snitzer 	if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
287298fb372SMike Snitzer 		*error = "historical-service-time ps: invalid repeat count";
288298fb372SMike Snitzer 		return -EINVAL;
289298fb372SMike Snitzer 	}
290298fb372SMike Snitzer 
291298fb372SMike Snitzer 	/* allocate the path */
292298fb372SMike Snitzer 	pi = kmalloc(sizeof(*pi), GFP_KERNEL);
293298fb372SMike Snitzer 	if (!pi) {
294298fb372SMike Snitzer 		*error = "historical-service-time ps: Error allocating path context";
295298fb372SMike Snitzer 		return -ENOMEM;
296298fb372SMike Snitzer 	}
297298fb372SMike Snitzer 
298298fb372SMike Snitzer 	pi->path = path;
299298fb372SMike Snitzer 	pi->repeat_count = repeat_count;
300298fb372SMike Snitzer 
301298fb372SMike Snitzer 	pi->historical_service_time = HST_FIXED_1;
302298fb372SMike Snitzer 
303298fb372SMike Snitzer 	spin_lock_init(&pi->lock);
304298fb372SMike Snitzer 	pi->outstanding = 0;
305298fb372SMike Snitzer 
306298fb372SMike Snitzer 	pi->stale_after = 0;
307298fb372SMike Snitzer 	pi->last_finish = 0;
308298fb372SMike Snitzer 
309298fb372SMike Snitzer 	path->pscontext = pi;
310298fb372SMike Snitzer 
311298fb372SMike Snitzer 	spin_lock_irqsave(&s->lock, flags);
312298fb372SMike Snitzer 	list_add_tail(&pi->list, &s->valid_paths);
313298fb372SMike Snitzer 	s->valid_count++;
314298fb372SMike Snitzer 	spin_unlock_irqrestore(&s->lock, flags);
315298fb372SMike Snitzer 
316298fb372SMike Snitzer 	return 0;
317298fb372SMike Snitzer }
318298fb372SMike Snitzer 
hst_fail_path(struct path_selector * ps,struct dm_path * path)319298fb372SMike Snitzer static void hst_fail_path(struct path_selector *ps, struct dm_path *path)
320298fb372SMike Snitzer {
321298fb372SMike Snitzer 	struct selector *s = ps->context;
322298fb372SMike Snitzer 	struct path_info *pi = path->pscontext;
323298fb372SMike Snitzer 	unsigned long flags;
324298fb372SMike Snitzer 
325298fb372SMike Snitzer 	spin_lock_irqsave(&s->lock, flags);
326298fb372SMike Snitzer 	list_move(&pi->list, &s->failed_paths);
327298fb372SMike Snitzer 	s->valid_count--;
328298fb372SMike Snitzer 	spin_unlock_irqrestore(&s->lock, flags);
329298fb372SMike Snitzer }
330298fb372SMike Snitzer 
hst_reinstate_path(struct path_selector * ps,struct dm_path * path)331298fb372SMike Snitzer static int hst_reinstate_path(struct path_selector *ps, struct dm_path *path)
332298fb372SMike Snitzer {
333298fb372SMike Snitzer 	struct selector *s = ps->context;
334298fb372SMike Snitzer 	struct path_info *pi = path->pscontext;
335298fb372SMike Snitzer 	unsigned long flags;
336298fb372SMike Snitzer 
337298fb372SMike Snitzer 	spin_lock_irqsave(&s->lock, flags);
338298fb372SMike Snitzer 	list_move_tail(&pi->list, &s->valid_paths);
339298fb372SMike Snitzer 	s->valid_count++;
340298fb372SMike Snitzer 	spin_unlock_irqrestore(&s->lock, flags);
341298fb372SMike Snitzer 
342298fb372SMike Snitzer 	return 0;
343298fb372SMike Snitzer }
344298fb372SMike Snitzer 
hst_fill_compare(struct path_info * pi,u64 * hst,u64 * out,u64 * stale)345298fb372SMike Snitzer static void hst_fill_compare(struct path_info *pi, u64 *hst,
346298fb372SMike Snitzer 			     u64 *out, u64 *stale)
347298fb372SMike Snitzer {
348298fb372SMike Snitzer 	unsigned long flags;
349298fb372SMike Snitzer 
350298fb372SMike Snitzer 	spin_lock_irqsave(&pi->lock, flags);
351298fb372SMike Snitzer 	*hst = pi->historical_service_time;
352298fb372SMike Snitzer 	*out = pi->outstanding;
353298fb372SMike Snitzer 	*stale = pi->stale_after;
354298fb372SMike Snitzer 	spin_unlock_irqrestore(&pi->lock, flags);
355298fb372SMike Snitzer }
356298fb372SMike Snitzer 
357298fb372SMike Snitzer /*
358298fb372SMike Snitzer  * Compare the estimated service time of 2 paths, pi1 and pi2,
359298fb372SMike Snitzer  * for the incoming I/O.
360298fb372SMike Snitzer  *
361298fb372SMike Snitzer  * Returns:
362298fb372SMike Snitzer  * < 0 : pi1 is better
363298fb372SMike Snitzer  * 0   : no difference between pi1 and pi2
364298fb372SMike Snitzer  * > 0 : pi2 is better
365298fb372SMike Snitzer  *
366298fb372SMike Snitzer  */
hst_compare(struct path_info * pi1,struct path_info * pi2,u64 time_now,struct path_selector * ps)367298fb372SMike Snitzer static long long hst_compare(struct path_info *pi1, struct path_info *pi2,
368298fb372SMike Snitzer 			     u64 time_now, struct path_selector *ps)
369298fb372SMike Snitzer {
370298fb372SMike Snitzer 	struct selector *s = ps->context;
371298fb372SMike Snitzer 	u64 hst1, hst2;
372298fb372SMike Snitzer 	long long out1, out2, stale1, stale2;
373298fb372SMike Snitzer 	int pi2_better, over_threshold;
374298fb372SMike Snitzer 
375298fb372SMike Snitzer 	hst_fill_compare(pi1, &hst1, &out1, &stale1);
376298fb372SMike Snitzer 	hst_fill_compare(pi2, &hst2, &out2, &stale2);
377298fb372SMike Snitzer 
378298fb372SMike Snitzer 	/* Check here if estimated latency for two paths are too similar.
379298fb372SMike Snitzer 	 * If this is the case, we skip extra calculation and just compare
380298fb372SMike Snitzer 	 * outstanding requests. In this case, any unloaded paths will
381298fb372SMike Snitzer 	 * be preferred.
382298fb372SMike Snitzer 	 */
383298fb372SMike Snitzer 	if (hst1 > hst2)
384298fb372SMike Snitzer 		over_threshold = hst1 > (s->threshold_multiplier * hst2);
385298fb372SMike Snitzer 	else
386298fb372SMike Snitzer 		over_threshold = hst2 > (s->threshold_multiplier * hst1);
387298fb372SMike Snitzer 
388298fb372SMike Snitzer 	if (!over_threshold)
389298fb372SMike Snitzer 		return out1 - out2;
390298fb372SMike Snitzer 
391298fb372SMike Snitzer 	/*
392298fb372SMike Snitzer 	 * If an unloaded path is stale, choose it. If both paths are unloaded,
393298fb372SMike Snitzer 	 * choose path that is the most stale.
394298fb372SMike Snitzer 	 * (If one path is loaded, choose the other)
395298fb372SMike Snitzer 	 */
396298fb372SMike Snitzer 	if ((!out1 && stale1 < time_now) || (!out2 && stale2 < time_now) ||
397298fb372SMike Snitzer 	    (!out1 && !out2))
398298fb372SMike Snitzer 		return (!out2 * stale1) - (!out1 * stale2);
399298fb372SMike Snitzer 
400298fb372SMike Snitzer 	/* Compare estimated service time. If outstanding is the same, we
401298fb372SMike Snitzer 	 * don't need to multiply
402298fb372SMike Snitzer 	 */
403298fb372SMike Snitzer 	if (out1 == out2) {
404298fb372SMike Snitzer 		pi2_better = hst1 > hst2;
405298fb372SMike Snitzer 	} else {
406298fb372SMike Snitzer 		/* Potential overflow with out >= 1024 */
407298fb372SMike Snitzer 		if (unlikely(out1 >= HST_MAX_INFLIGHT ||
408298fb372SMike Snitzer 			     out2 >= HST_MAX_INFLIGHT)) {
409298fb372SMike Snitzer 			/* If over 1023 in-flights, we may overflow if hst
410298fb372SMike Snitzer 			 * is at max. (With this shift we still overflow at
411298fb372SMike Snitzer 			 * 1048576 in-flights, which is high enough).
412298fb372SMike Snitzer 			 */
413298fb372SMike Snitzer 			hst1 >>= HST_FIXED_SHIFT;
414298fb372SMike Snitzer 			hst2 >>= HST_FIXED_SHIFT;
415298fb372SMike Snitzer 		}
416298fb372SMike Snitzer 		pi2_better = (1 + out1) * hst1 > (1 + out2) * hst2;
417298fb372SMike Snitzer 	}
418298fb372SMike Snitzer 
419298fb372SMike Snitzer 	/* In the case that the 'winner' is stale, limit to equal usage. */
420298fb372SMike Snitzer 	if (pi2_better) {
421298fb372SMike Snitzer 		if (stale2 < time_now)
422298fb372SMike Snitzer 			return out1 - out2;
423298fb372SMike Snitzer 		return 1;
424298fb372SMike Snitzer 	}
425298fb372SMike Snitzer 	if (stale1 < time_now)
426298fb372SMike Snitzer 		return out1 - out2;
427298fb372SMike Snitzer 	return -1;
428298fb372SMike Snitzer }
429298fb372SMike Snitzer 
hst_select_path(struct path_selector * ps,size_t nr_bytes)430298fb372SMike Snitzer static struct dm_path *hst_select_path(struct path_selector *ps,
431298fb372SMike Snitzer 				       size_t nr_bytes)
432298fb372SMike Snitzer {
433298fb372SMike Snitzer 	struct selector *s = ps->context;
434298fb372SMike Snitzer 	struct path_info *pi = NULL, *best = NULL;
435ce40426fSKhazhismel Kumykov 	u64 time_now = ktime_get_ns();
436298fb372SMike Snitzer 	struct dm_path *ret = NULL;
437298fb372SMike Snitzer 	unsigned long flags;
438298fb372SMike Snitzer 
439298fb372SMike Snitzer 	spin_lock_irqsave(&s->lock, flags);
440298fb372SMike Snitzer 	if (list_empty(&s->valid_paths))
441298fb372SMike Snitzer 		goto out;
442298fb372SMike Snitzer 
443298fb372SMike Snitzer 	list_for_each_entry(pi, &s->valid_paths, list) {
444298fb372SMike Snitzer 		if (!best || (hst_compare(pi, best, time_now, ps) < 0))
445298fb372SMike Snitzer 			best = pi;
446298fb372SMike Snitzer 	}
447298fb372SMike Snitzer 
448298fb372SMike Snitzer 	if (!best)
449298fb372SMike Snitzer 		goto out;
450298fb372SMike Snitzer 
451298fb372SMike Snitzer 	/* Move last used path to end (least preferred in case of ties) */
452298fb372SMike Snitzer 	list_move_tail(&best->list, &s->valid_paths);
453298fb372SMike Snitzer 
454298fb372SMike Snitzer 	ret = best->path;
455298fb372SMike Snitzer 
456298fb372SMike Snitzer out:
457298fb372SMike Snitzer 	spin_unlock_irqrestore(&s->lock, flags);
458298fb372SMike Snitzer 	return ret;
459298fb372SMike Snitzer }
460298fb372SMike Snitzer 
hst_start_io(struct path_selector * ps,struct dm_path * path,size_t nr_bytes)461298fb372SMike Snitzer static int hst_start_io(struct path_selector *ps, struct dm_path *path,
462298fb372SMike Snitzer 			size_t nr_bytes)
463298fb372SMike Snitzer {
464298fb372SMike Snitzer 	struct path_info *pi = path->pscontext;
465298fb372SMike Snitzer 	unsigned long flags;
466298fb372SMike Snitzer 
467298fb372SMike Snitzer 	spin_lock_irqsave(&pi->lock, flags);
468298fb372SMike Snitzer 	pi->outstanding++;
469298fb372SMike Snitzer 	spin_unlock_irqrestore(&pi->lock, flags);
470298fb372SMike Snitzer 
471298fb372SMike Snitzer 	return 0;
472298fb372SMike Snitzer }
473298fb372SMike Snitzer 
path_service_time(struct path_info * pi,u64 start_time)474298fb372SMike Snitzer static u64 path_service_time(struct path_info *pi, u64 start_time)
475298fb372SMike Snitzer {
476ce40426fSKhazhismel Kumykov 	u64 now = ktime_get_ns();
477298fb372SMike Snitzer 
478298fb372SMike Snitzer 	/* if a previous disk request has finished after this IO was
479298fb372SMike Snitzer 	 * sent to the hardware, pretend the submission happened
480298fb372SMike Snitzer 	 * serially.
481298fb372SMike Snitzer 	 */
482298fb372SMike Snitzer 	if (time_after64(pi->last_finish, start_time))
483298fb372SMike Snitzer 		start_time = pi->last_finish;
484298fb372SMike Snitzer 
485ce40426fSKhazhismel Kumykov 	pi->last_finish = now;
486ce40426fSKhazhismel Kumykov 	if (time_before64(now, start_time))
487298fb372SMike Snitzer 		return 0;
488298fb372SMike Snitzer 
489ce40426fSKhazhismel Kumykov 	return now - start_time;
490298fb372SMike Snitzer }
491298fb372SMike Snitzer 
hst_end_io(struct path_selector * ps,struct dm_path * path,size_t nr_bytes,u64 start_time)492298fb372SMike Snitzer static int hst_end_io(struct path_selector *ps, struct dm_path *path,
493298fb372SMike Snitzer 		      size_t nr_bytes, u64 start_time)
494298fb372SMike Snitzer {
495298fb372SMike Snitzer 	struct path_info *pi = path->pscontext;
496298fb372SMike Snitzer 	struct selector *s = ps->context;
497298fb372SMike Snitzer 	unsigned long flags;
498298fb372SMike Snitzer 	u64 st;
499298fb372SMike Snitzer 
500298fb372SMike Snitzer 	spin_lock_irqsave(&pi->lock, flags);
501298fb372SMike Snitzer 
502298fb372SMike Snitzer 	st = path_service_time(pi, start_time);
503298fb372SMike Snitzer 	pi->outstanding--;
504298fb372SMike Snitzer 	pi->historical_service_time =
505298fb372SMike Snitzer 		fixed_ema(pi->historical_service_time,
506298fb372SMike Snitzer 			  min(st * HST_FIXED_1, HST_FIXED_MAX),
507298fb372SMike Snitzer 			  hst_weight(ps, st));
508298fb372SMike Snitzer 
509298fb372SMike Snitzer 	/*
510298fb372SMike Snitzer 	 * On request end, mark path as fresh. If a path hasn't
511298fb372SMike Snitzer 	 * finished any requests within the fresh period, the estimated
512298fb372SMike Snitzer 	 * service time is considered too optimistic and we limit the
513298fb372SMike Snitzer 	 * maximum requests on that path.
514298fb372SMike Snitzer 	 */
515298fb372SMike Snitzer 	pi->stale_after = pi->last_finish +
516298fb372SMike Snitzer 		(s->valid_count * (pi->historical_service_time >> HST_FIXED_SHIFT));
517298fb372SMike Snitzer 
518298fb372SMike Snitzer 	spin_unlock_irqrestore(&pi->lock, flags);
519298fb372SMike Snitzer 
520298fb372SMike Snitzer 	return 0;
521298fb372SMike Snitzer }
522298fb372SMike Snitzer 
523298fb372SMike Snitzer static struct path_selector_type hst_ps = {
524298fb372SMike Snitzer 	.name		= "historical-service-time",
525298fb372SMike Snitzer 	.module		= THIS_MODULE,
526c06dfd12SGabriel Krisman Bertazi 	.features	= DM_PS_USE_HR_TIMER,
527298fb372SMike Snitzer 	.table_args	= 1,
528298fb372SMike Snitzer 	.info_args	= 3,
529298fb372SMike Snitzer 	.create		= hst_create,
530298fb372SMike Snitzer 	.destroy	= hst_destroy,
531298fb372SMike Snitzer 	.status		= hst_status,
532298fb372SMike Snitzer 	.add_path	= hst_add_path,
533298fb372SMike Snitzer 	.fail_path	= hst_fail_path,
534298fb372SMike Snitzer 	.reinstate_path	= hst_reinstate_path,
535298fb372SMike Snitzer 	.select_path	= hst_select_path,
536298fb372SMike Snitzer 	.start_io	= hst_start_io,
537298fb372SMike Snitzer 	.end_io		= hst_end_io,
538298fb372SMike Snitzer };
539298fb372SMike Snitzer 
dm_hst_init(void)540298fb372SMike Snitzer static int __init dm_hst_init(void)
541298fb372SMike Snitzer {
542298fb372SMike Snitzer 	int r = dm_register_path_selector(&hst_ps);
543298fb372SMike Snitzer 
544298fb372SMike Snitzer 	if (r < 0)
545298fb372SMike Snitzer 		DMERR("register failed %d", r);
546298fb372SMike Snitzer 
547298fb372SMike Snitzer 	DMINFO("version " HST_VERSION " loaded");
548298fb372SMike Snitzer 
549298fb372SMike Snitzer 	return r;
550298fb372SMike Snitzer }
551298fb372SMike Snitzer 
dm_hst_exit(void)552298fb372SMike Snitzer static void __exit dm_hst_exit(void)
553298fb372SMike Snitzer {
554298fb372SMike Snitzer 	int r = dm_unregister_path_selector(&hst_ps);
555298fb372SMike Snitzer 
556298fb372SMike Snitzer 	if (r < 0)
557298fb372SMike Snitzer 		DMERR("unregister failed %d", r);
558298fb372SMike Snitzer }
559298fb372SMike Snitzer 
560298fb372SMike Snitzer module_init(dm_hst_init);
561298fb372SMike Snitzer module_exit(dm_hst_exit);
562298fb372SMike Snitzer 
563298fb372SMike Snitzer MODULE_DESCRIPTION(DM_NAME " measured service time oriented path selector");
564298fb372SMike Snitzer MODULE_AUTHOR("Khazhismel Kumykov <khazhy@google.com>");
565298fb372SMike Snitzer MODULE_LICENSE("GPL");
566