1298fb372SMike Snitzer // SPDX-License-Identifier: GPL-2.0 2298fb372SMike Snitzer /* 3298fb372SMike Snitzer * Historical Service Time 4298fb372SMike Snitzer * 5298fb372SMike Snitzer * Keeps a time-weighted exponential moving average of the historical 6298fb372SMike Snitzer * service time. Estimates future service time based on the historical 7298fb372SMike Snitzer * service time and the number of outstanding requests. 8298fb372SMike Snitzer * 9298fb372SMike Snitzer * Marks paths stale if they have not finished within hst * 10298fb372SMike Snitzer * num_paths. If a path is stale and unused, we will send a single 11298fb372SMike Snitzer * request to probe in case the path has improved. This situation 12298fb372SMike Snitzer * generally arises if the path is so much worse than others that it 13298fb372SMike Snitzer * will never have the best estimated service time, or if the entire 14298fb372SMike Snitzer * multipath device is unused. If a path is stale and in use, limit the 15298fb372SMike Snitzer * number of requests it can receive with the assumption that the path 16298fb372SMike Snitzer * has become degraded. 17298fb372SMike Snitzer * 18298fb372SMike Snitzer * To avoid repeatedly calculating exponents for time weighting, times 19298fb372SMike Snitzer * are split into HST_WEIGHT_COUNT buckets each (1 >> HST_BUCKET_SHIFT) 20298fb372SMike Snitzer * ns, and the weighting is pre-calculated. 21298fb372SMike Snitzer * 22298fb372SMike Snitzer */ 23298fb372SMike Snitzer 24298fb372SMike Snitzer #include "dm.h" 25298fb372SMike Snitzer #include "dm-path-selector.h" 26298fb372SMike Snitzer 27298fb372SMike Snitzer #include <linux/blkdev.h> 28298fb372SMike Snitzer #include <linux/slab.h> 29298fb372SMike Snitzer #include <linux/module.h> 30298fb372SMike Snitzer 31298fb372SMike Snitzer 32298fb372SMike Snitzer #define DM_MSG_PREFIX "multipath historical-service-time" 33298fb372SMike Snitzer #define HST_MIN_IO 1 34298fb372SMike Snitzer #define HST_VERSION "0.1.1" 35298fb372SMike Snitzer 36298fb372SMike Snitzer #define HST_FIXED_SHIFT 10 /* 10 bits of decimal precision */ 37298fb372SMike Snitzer #define HST_FIXED_MAX (ULLONG_MAX >> HST_FIXED_SHIFT) 38298fb372SMike Snitzer #define HST_FIXED_1 (1 << HST_FIXED_SHIFT) 39298fb372SMike Snitzer #define HST_FIXED_95 972 40298fb372SMike Snitzer #define HST_MAX_INFLIGHT HST_FIXED_1 41298fb372SMike Snitzer #define HST_BUCKET_SHIFT 24 /* Buckets are ~ 16ms */ 42298fb372SMike Snitzer #define HST_WEIGHT_COUNT 64ULL 43298fb372SMike Snitzer 44298fb372SMike Snitzer struct selector { 45298fb372SMike Snitzer struct list_head valid_paths; 46298fb372SMike Snitzer struct list_head failed_paths; 47298fb372SMike Snitzer int valid_count; 48298fb372SMike Snitzer spinlock_t lock; 49298fb372SMike Snitzer 50298fb372SMike Snitzer unsigned int weights[HST_WEIGHT_COUNT]; 51298fb372SMike Snitzer unsigned int threshold_multiplier; 52298fb372SMike Snitzer }; 53298fb372SMike Snitzer 54298fb372SMike Snitzer struct path_info { 55298fb372SMike Snitzer struct list_head list; 56298fb372SMike Snitzer struct dm_path *path; 57298fb372SMike Snitzer unsigned int repeat_count; 58298fb372SMike Snitzer 59298fb372SMike Snitzer spinlock_t lock; 60298fb372SMike Snitzer 61298fb372SMike Snitzer u64 historical_service_time; /* Fixed point */ 62298fb372SMike Snitzer 63298fb372SMike Snitzer u64 stale_after; 64298fb372SMike Snitzer u64 last_finish; 65298fb372SMike Snitzer 66298fb372SMike Snitzer u64 outstanding; 67298fb372SMike Snitzer }; 68298fb372SMike Snitzer 69298fb372SMike Snitzer /** 70298fb372SMike Snitzer * fixed_power - compute: x^n, in O(log n) time 71298fb372SMike Snitzer * 72298fb372SMike Snitzer * @x: base of the power 73298fb372SMike Snitzer * @frac_bits: fractional bits of @x 74298fb372SMike Snitzer * @n: power to raise @x to. 75298fb372SMike Snitzer * 76298fb372SMike Snitzer * By exploiting the relation between the definition of the natural power 77298fb372SMike Snitzer * function: x^n := x*x*...*x (x multiplied by itself for n times), and 78298fb372SMike Snitzer * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, 79298fb372SMike Snitzer * (where: n_i \elem {0, 1}, the binary vector representing n), 80298fb372SMike Snitzer * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is 81298fb372SMike Snitzer * of course trivially computable in O(log_2 n), the length of our binary 82298fb372SMike Snitzer * vector. 83298fb372SMike Snitzer * 84298fb372SMike Snitzer * (see: kernel/sched/loadavg.c) 85298fb372SMike Snitzer */ 86298fb372SMike Snitzer static u64 fixed_power(u64 x, unsigned int frac_bits, unsigned int n) 87298fb372SMike Snitzer { 88298fb372SMike Snitzer unsigned long result = 1UL << frac_bits; 89298fb372SMike Snitzer 90298fb372SMike Snitzer if (n) { 91298fb372SMike Snitzer for (;;) { 92298fb372SMike Snitzer if (n & 1) { 93298fb372SMike Snitzer result *= x; 94298fb372SMike Snitzer result += 1UL << (frac_bits - 1); 95298fb372SMike Snitzer result >>= frac_bits; 96298fb372SMike Snitzer } 97298fb372SMike Snitzer n >>= 1; 98298fb372SMike Snitzer if (!n) 99298fb372SMike Snitzer break; 100298fb372SMike Snitzer x *= x; 101298fb372SMike Snitzer x += 1UL << (frac_bits - 1); 102298fb372SMike Snitzer x >>= frac_bits; 103298fb372SMike Snitzer } 104298fb372SMike Snitzer } 105298fb372SMike Snitzer 106298fb372SMike Snitzer return result; 107298fb372SMike Snitzer } 108298fb372SMike Snitzer 109298fb372SMike Snitzer /* 110298fb372SMike Snitzer * Calculate the next value of an exponential moving average 111298fb372SMike Snitzer * a_1 = a_0 * e + a * (1 - e) 112298fb372SMike Snitzer * 113298fb372SMike Snitzer * @last: [0, ULLONG_MAX >> HST_FIXED_SHIFT] 114298fb372SMike Snitzer * @next: [0, ULLONG_MAX >> HST_FIXED_SHIFT] 115298fb372SMike Snitzer * @weight: [0, HST_FIXED_1] 116298fb372SMike Snitzer * 117298fb372SMike Snitzer * Note: 118298fb372SMike Snitzer * To account for multiple periods in the same calculation, 119298fb372SMike Snitzer * a_n = a_0 * e^n + a * (1 - e^n), 120298fb372SMike Snitzer * so call fixed_ema(last, next, pow(weight, N)) 121298fb372SMike Snitzer */ 122298fb372SMike Snitzer static u64 fixed_ema(u64 last, u64 next, u64 weight) 123298fb372SMike Snitzer { 124298fb372SMike Snitzer last *= weight; 125298fb372SMike Snitzer last += next * (HST_FIXED_1 - weight); 126298fb372SMike Snitzer last += 1ULL << (HST_FIXED_SHIFT - 1); 127298fb372SMike Snitzer return last >> HST_FIXED_SHIFT; 128298fb372SMike Snitzer } 129298fb372SMike Snitzer 130298fb372SMike Snitzer static struct selector *alloc_selector(void) 131298fb372SMike Snitzer { 132298fb372SMike Snitzer struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); 133298fb372SMike Snitzer 134298fb372SMike Snitzer if (s) { 135298fb372SMike Snitzer INIT_LIST_HEAD(&s->valid_paths); 136298fb372SMike Snitzer INIT_LIST_HEAD(&s->failed_paths); 137298fb372SMike Snitzer spin_lock_init(&s->lock); 138298fb372SMike Snitzer s->valid_count = 0; 139298fb372SMike Snitzer } 140298fb372SMike Snitzer 141298fb372SMike Snitzer return s; 142298fb372SMike Snitzer } 143298fb372SMike Snitzer 144298fb372SMike Snitzer /* 145298fb372SMike Snitzer * Get the weight for a given time span. 146298fb372SMike Snitzer */ 147298fb372SMike Snitzer static u64 hst_weight(struct path_selector *ps, u64 delta) 148298fb372SMike Snitzer { 149298fb372SMike Snitzer struct selector *s = ps->context; 150298fb372SMike Snitzer int bucket = clamp(delta >> HST_BUCKET_SHIFT, 0ULL, 151298fb372SMike Snitzer HST_WEIGHT_COUNT - 1); 152298fb372SMike Snitzer 153298fb372SMike Snitzer return s->weights[bucket]; 154298fb372SMike Snitzer } 155298fb372SMike Snitzer 156298fb372SMike Snitzer /* 157298fb372SMike Snitzer * Set up the weights array. 158298fb372SMike Snitzer * 159298fb372SMike Snitzer * weights[len-1] = 0 160298fb372SMike Snitzer * weights[n] = base ^ (n + 1) 161298fb372SMike Snitzer */ 162298fb372SMike Snitzer static void hst_set_weights(struct path_selector *ps, unsigned int base) 163298fb372SMike Snitzer { 164298fb372SMike Snitzer struct selector *s = ps->context; 165298fb372SMike Snitzer int i; 166298fb372SMike Snitzer 167298fb372SMike Snitzer if (base >= HST_FIXED_1) 168298fb372SMike Snitzer return; 169298fb372SMike Snitzer 170298fb372SMike Snitzer for (i = 0; i < HST_WEIGHT_COUNT - 1; i++) 171298fb372SMike Snitzer s->weights[i] = fixed_power(base, HST_FIXED_SHIFT, i + 1); 172298fb372SMike Snitzer s->weights[HST_WEIGHT_COUNT - 1] = 0; 173298fb372SMike Snitzer } 174298fb372SMike Snitzer 175298fb372SMike Snitzer static int hst_create(struct path_selector *ps, unsigned int argc, char **argv) 176298fb372SMike Snitzer { 177298fb372SMike Snitzer struct selector *s; 178298fb372SMike Snitzer unsigned int base_weight = HST_FIXED_95; 179298fb372SMike Snitzer unsigned int threshold_multiplier = 0; 180298fb372SMike Snitzer char dummy; 181298fb372SMike Snitzer 182298fb372SMike Snitzer /* 183298fb372SMike Snitzer * Arguments: [<base_weight> [<threshold_multiplier>]] 184298fb372SMike Snitzer * <base_weight>: Base weight for ema [0, 1024) 10-bit fixed point. A 185298fb372SMike Snitzer * value of 0 will completely ignore any history. 186298fb372SMike Snitzer * If not given, default (HST_FIXED_95) is used. 187298fb372SMike Snitzer * <threshold_multiplier>: Minimum threshold multiplier for paths to 188298fb372SMike Snitzer * be considered different. That is, a path is 189298fb372SMike Snitzer * considered different iff (p1 > N * p2) where p1 190298fb372SMike Snitzer * is the path with higher service time. A threshold 191298fb372SMike Snitzer * of 1 or 0 has no effect. Defaults to 0. 192298fb372SMike Snitzer */ 193298fb372SMike Snitzer if (argc > 2) 194298fb372SMike Snitzer return -EINVAL; 195298fb372SMike Snitzer 196298fb372SMike Snitzer if (argc && (sscanf(argv[0], "%u%c", &base_weight, &dummy) != 1 || 197298fb372SMike Snitzer base_weight >= HST_FIXED_1)) { 198298fb372SMike Snitzer return -EINVAL; 199298fb372SMike Snitzer } 200298fb372SMike Snitzer 201298fb372SMike Snitzer if (argc > 1 && (sscanf(argv[1], "%u%c", 202298fb372SMike Snitzer &threshold_multiplier, &dummy) != 1)) { 203298fb372SMike Snitzer return -EINVAL; 204298fb372SMike Snitzer } 205298fb372SMike Snitzer 206298fb372SMike Snitzer s = alloc_selector(); 207298fb372SMike Snitzer if (!s) 208298fb372SMike Snitzer return -ENOMEM; 209298fb372SMike Snitzer 210298fb372SMike Snitzer ps->context = s; 211298fb372SMike Snitzer 212298fb372SMike Snitzer hst_set_weights(ps, base_weight); 213298fb372SMike Snitzer s->threshold_multiplier = threshold_multiplier; 214298fb372SMike Snitzer return 0; 215298fb372SMike Snitzer } 216298fb372SMike Snitzer 217298fb372SMike Snitzer static void free_paths(struct list_head *paths) 218298fb372SMike Snitzer { 219298fb372SMike Snitzer struct path_info *pi, *next; 220298fb372SMike Snitzer 221298fb372SMike Snitzer list_for_each_entry_safe(pi, next, paths, list) { 222298fb372SMike Snitzer list_del(&pi->list); 223298fb372SMike Snitzer kfree(pi); 224298fb372SMike Snitzer } 225298fb372SMike Snitzer } 226298fb372SMike Snitzer 227298fb372SMike Snitzer static void hst_destroy(struct path_selector *ps) 228298fb372SMike Snitzer { 229298fb372SMike Snitzer struct selector *s = ps->context; 230298fb372SMike Snitzer 231298fb372SMike Snitzer free_paths(&s->valid_paths); 232298fb372SMike Snitzer free_paths(&s->failed_paths); 233298fb372SMike Snitzer kfree(s); 234298fb372SMike Snitzer ps->context = NULL; 235298fb372SMike Snitzer } 236298fb372SMike Snitzer 237298fb372SMike Snitzer static int hst_status(struct path_selector *ps, struct dm_path *path, 238298fb372SMike Snitzer status_type_t type, char *result, unsigned int maxlen) 239298fb372SMike Snitzer { 240298fb372SMike Snitzer unsigned int sz = 0; 241298fb372SMike Snitzer struct path_info *pi; 242298fb372SMike Snitzer 243298fb372SMike Snitzer if (!path) { 244298fb372SMike Snitzer struct selector *s = ps->context; 245298fb372SMike Snitzer 246298fb372SMike Snitzer DMEMIT("2 %u %u ", s->weights[0], s->threshold_multiplier); 247298fb372SMike Snitzer } else { 248298fb372SMike Snitzer pi = path->pscontext; 249298fb372SMike Snitzer 250298fb372SMike Snitzer switch (type) { 251298fb372SMike Snitzer case STATUSTYPE_INFO: 252298fb372SMike Snitzer DMEMIT("%llu %llu %llu ", pi->historical_service_time, 253298fb372SMike Snitzer pi->outstanding, pi->stale_after); 254298fb372SMike Snitzer break; 255298fb372SMike Snitzer case STATUSTYPE_TABLE: 256298fb372SMike Snitzer DMEMIT("0 "); 257298fb372SMike Snitzer break; 2588ec45662STushar Sugandhi case STATUSTYPE_IMA: 2598ec45662STushar Sugandhi *result = '\0'; 2608ec45662STushar Sugandhi break; 261298fb372SMike Snitzer } 262298fb372SMike Snitzer } 263298fb372SMike Snitzer 264298fb372SMike Snitzer return sz; 265298fb372SMike Snitzer } 266298fb372SMike Snitzer 267298fb372SMike Snitzer static int hst_add_path(struct path_selector *ps, struct dm_path *path, 268298fb372SMike Snitzer int argc, char **argv, char **error) 269298fb372SMike Snitzer { 270298fb372SMike Snitzer struct selector *s = ps->context; 271298fb372SMike Snitzer struct path_info *pi; 272298fb372SMike Snitzer unsigned int repeat_count = HST_MIN_IO; 273298fb372SMike Snitzer char dummy; 274298fb372SMike Snitzer unsigned long flags; 275298fb372SMike Snitzer 276298fb372SMike Snitzer /* 277298fb372SMike Snitzer * Arguments: [<repeat_count>] 278298fb372SMike Snitzer * <repeat_count>: The number of I/Os before switching path. 279298fb372SMike Snitzer * If not given, default (HST_MIN_IO) is used. 280298fb372SMike Snitzer */ 281298fb372SMike Snitzer if (argc > 1) { 282298fb372SMike Snitzer *error = "historical-service-time ps: incorrect number of arguments"; 283298fb372SMike Snitzer return -EINVAL; 284298fb372SMike Snitzer } 285298fb372SMike Snitzer 286298fb372SMike Snitzer if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { 287298fb372SMike Snitzer *error = "historical-service-time ps: invalid repeat count"; 288298fb372SMike Snitzer return -EINVAL; 289298fb372SMike Snitzer } 290298fb372SMike Snitzer 291298fb372SMike Snitzer /* allocate the path */ 292298fb372SMike Snitzer pi = kmalloc(sizeof(*pi), GFP_KERNEL); 293298fb372SMike Snitzer if (!pi) { 294298fb372SMike Snitzer *error = "historical-service-time ps: Error allocating path context"; 295298fb372SMike Snitzer return -ENOMEM; 296298fb372SMike Snitzer } 297298fb372SMike Snitzer 298298fb372SMike Snitzer pi->path = path; 299298fb372SMike Snitzer pi->repeat_count = repeat_count; 300298fb372SMike Snitzer 301298fb372SMike Snitzer pi->historical_service_time = HST_FIXED_1; 302298fb372SMike Snitzer 303298fb372SMike Snitzer spin_lock_init(&pi->lock); 304298fb372SMike Snitzer pi->outstanding = 0; 305298fb372SMike Snitzer 306298fb372SMike Snitzer pi->stale_after = 0; 307298fb372SMike Snitzer pi->last_finish = 0; 308298fb372SMike Snitzer 309298fb372SMike Snitzer path->pscontext = pi; 310298fb372SMike Snitzer 311298fb372SMike Snitzer spin_lock_irqsave(&s->lock, flags); 312298fb372SMike Snitzer list_add_tail(&pi->list, &s->valid_paths); 313298fb372SMike Snitzer s->valid_count++; 314298fb372SMike Snitzer spin_unlock_irqrestore(&s->lock, flags); 315298fb372SMike Snitzer 316298fb372SMike Snitzer return 0; 317298fb372SMike Snitzer } 318298fb372SMike Snitzer 319298fb372SMike Snitzer static void hst_fail_path(struct path_selector *ps, struct dm_path *path) 320298fb372SMike Snitzer { 321298fb372SMike Snitzer struct selector *s = ps->context; 322298fb372SMike Snitzer struct path_info *pi = path->pscontext; 323298fb372SMike Snitzer unsigned long flags; 324298fb372SMike Snitzer 325298fb372SMike Snitzer spin_lock_irqsave(&s->lock, flags); 326298fb372SMike Snitzer list_move(&pi->list, &s->failed_paths); 327298fb372SMike Snitzer s->valid_count--; 328298fb372SMike Snitzer spin_unlock_irqrestore(&s->lock, flags); 329298fb372SMike Snitzer } 330298fb372SMike Snitzer 331298fb372SMike Snitzer static int hst_reinstate_path(struct path_selector *ps, struct dm_path *path) 332298fb372SMike Snitzer { 333298fb372SMike Snitzer struct selector *s = ps->context; 334298fb372SMike Snitzer struct path_info *pi = path->pscontext; 335298fb372SMike Snitzer unsigned long flags; 336298fb372SMike Snitzer 337298fb372SMike Snitzer spin_lock_irqsave(&s->lock, flags); 338298fb372SMike Snitzer list_move_tail(&pi->list, &s->valid_paths); 339298fb372SMike Snitzer s->valid_count++; 340298fb372SMike Snitzer spin_unlock_irqrestore(&s->lock, flags); 341298fb372SMike Snitzer 342298fb372SMike Snitzer return 0; 343298fb372SMike Snitzer } 344298fb372SMike Snitzer 345298fb372SMike Snitzer static void hst_fill_compare(struct path_info *pi, u64 *hst, 346298fb372SMike Snitzer u64 *out, u64 *stale) 347298fb372SMike Snitzer { 348298fb372SMike Snitzer unsigned long flags; 349298fb372SMike Snitzer 350298fb372SMike Snitzer spin_lock_irqsave(&pi->lock, flags); 351298fb372SMike Snitzer *hst = pi->historical_service_time; 352298fb372SMike Snitzer *out = pi->outstanding; 353298fb372SMike Snitzer *stale = pi->stale_after; 354298fb372SMike Snitzer spin_unlock_irqrestore(&pi->lock, flags); 355298fb372SMike Snitzer } 356298fb372SMike Snitzer 357298fb372SMike Snitzer /* 358298fb372SMike Snitzer * Compare the estimated service time of 2 paths, pi1 and pi2, 359298fb372SMike Snitzer * for the incoming I/O. 360298fb372SMike Snitzer * 361298fb372SMike Snitzer * Returns: 362298fb372SMike Snitzer * < 0 : pi1 is better 363298fb372SMike Snitzer * 0 : no difference between pi1 and pi2 364298fb372SMike Snitzer * > 0 : pi2 is better 365298fb372SMike Snitzer * 366298fb372SMike Snitzer */ 367298fb372SMike Snitzer static long long hst_compare(struct path_info *pi1, struct path_info *pi2, 368298fb372SMike Snitzer u64 time_now, struct path_selector *ps) 369298fb372SMike Snitzer { 370298fb372SMike Snitzer struct selector *s = ps->context; 371298fb372SMike Snitzer u64 hst1, hst2; 372298fb372SMike Snitzer long long out1, out2, stale1, stale2; 373298fb372SMike Snitzer int pi2_better, over_threshold; 374298fb372SMike Snitzer 375298fb372SMike Snitzer hst_fill_compare(pi1, &hst1, &out1, &stale1); 376298fb372SMike Snitzer hst_fill_compare(pi2, &hst2, &out2, &stale2); 377298fb372SMike Snitzer 378298fb372SMike Snitzer /* Check here if estimated latency for two paths are too similar. 379298fb372SMike Snitzer * If this is the case, we skip extra calculation and just compare 380298fb372SMike Snitzer * outstanding requests. In this case, any unloaded paths will 381298fb372SMike Snitzer * be preferred. 382298fb372SMike Snitzer */ 383298fb372SMike Snitzer if (hst1 > hst2) 384298fb372SMike Snitzer over_threshold = hst1 > (s->threshold_multiplier * hst2); 385298fb372SMike Snitzer else 386298fb372SMike Snitzer over_threshold = hst2 > (s->threshold_multiplier * hst1); 387298fb372SMike Snitzer 388298fb372SMike Snitzer if (!over_threshold) 389298fb372SMike Snitzer return out1 - out2; 390298fb372SMike Snitzer 391298fb372SMike Snitzer /* 392298fb372SMike Snitzer * If an unloaded path is stale, choose it. If both paths are unloaded, 393298fb372SMike Snitzer * choose path that is the most stale. 394298fb372SMike Snitzer * (If one path is loaded, choose the other) 395298fb372SMike Snitzer */ 396298fb372SMike Snitzer if ((!out1 && stale1 < time_now) || (!out2 && stale2 < time_now) || 397298fb372SMike Snitzer (!out1 && !out2)) 398298fb372SMike Snitzer return (!out2 * stale1) - (!out1 * stale2); 399298fb372SMike Snitzer 400298fb372SMike Snitzer /* Compare estimated service time. If outstanding is the same, we 401298fb372SMike Snitzer * don't need to multiply 402298fb372SMike Snitzer */ 403298fb372SMike Snitzer if (out1 == out2) { 404298fb372SMike Snitzer pi2_better = hst1 > hst2; 405298fb372SMike Snitzer } else { 406298fb372SMike Snitzer /* Potential overflow with out >= 1024 */ 407298fb372SMike Snitzer if (unlikely(out1 >= HST_MAX_INFLIGHT || 408298fb372SMike Snitzer out2 >= HST_MAX_INFLIGHT)) { 409298fb372SMike Snitzer /* If over 1023 in-flights, we may overflow if hst 410298fb372SMike Snitzer * is at max. (With this shift we still overflow at 411298fb372SMike Snitzer * 1048576 in-flights, which is high enough). 412298fb372SMike Snitzer */ 413298fb372SMike Snitzer hst1 >>= HST_FIXED_SHIFT; 414298fb372SMike Snitzer hst2 >>= HST_FIXED_SHIFT; 415298fb372SMike Snitzer } 416298fb372SMike Snitzer pi2_better = (1 + out1) * hst1 > (1 + out2) * hst2; 417298fb372SMike Snitzer } 418298fb372SMike Snitzer 419298fb372SMike Snitzer /* In the case that the 'winner' is stale, limit to equal usage. */ 420298fb372SMike Snitzer if (pi2_better) { 421298fb372SMike Snitzer if (stale2 < time_now) 422298fb372SMike Snitzer return out1 - out2; 423298fb372SMike Snitzer return 1; 424298fb372SMike Snitzer } 425298fb372SMike Snitzer if (stale1 < time_now) 426298fb372SMike Snitzer return out1 - out2; 427298fb372SMike Snitzer return -1; 428298fb372SMike Snitzer } 429298fb372SMike Snitzer 430298fb372SMike Snitzer static struct dm_path *hst_select_path(struct path_selector *ps, 431298fb372SMike Snitzer size_t nr_bytes) 432298fb372SMike Snitzer { 433298fb372SMike Snitzer struct selector *s = ps->context; 434298fb372SMike Snitzer struct path_info *pi = NULL, *best = NULL; 435*ce40426fSKhazhismel Kumykov u64 time_now = ktime_get_ns(); 436298fb372SMike Snitzer struct dm_path *ret = NULL; 437298fb372SMike Snitzer unsigned long flags; 438298fb372SMike Snitzer 439298fb372SMike Snitzer spin_lock_irqsave(&s->lock, flags); 440298fb372SMike Snitzer if (list_empty(&s->valid_paths)) 441298fb372SMike Snitzer goto out; 442298fb372SMike Snitzer 443298fb372SMike Snitzer list_for_each_entry(pi, &s->valid_paths, list) { 444298fb372SMike Snitzer if (!best || (hst_compare(pi, best, time_now, ps) < 0)) 445298fb372SMike Snitzer best = pi; 446298fb372SMike Snitzer } 447298fb372SMike Snitzer 448298fb372SMike Snitzer if (!best) 449298fb372SMike Snitzer goto out; 450298fb372SMike Snitzer 451298fb372SMike Snitzer /* Move last used path to end (least preferred in case of ties) */ 452298fb372SMike Snitzer list_move_tail(&best->list, &s->valid_paths); 453298fb372SMike Snitzer 454298fb372SMike Snitzer ret = best->path; 455298fb372SMike Snitzer 456298fb372SMike Snitzer out: 457298fb372SMike Snitzer spin_unlock_irqrestore(&s->lock, flags); 458298fb372SMike Snitzer return ret; 459298fb372SMike Snitzer } 460298fb372SMike Snitzer 461298fb372SMike Snitzer static int hst_start_io(struct path_selector *ps, struct dm_path *path, 462298fb372SMike Snitzer size_t nr_bytes) 463298fb372SMike Snitzer { 464298fb372SMike Snitzer struct path_info *pi = path->pscontext; 465298fb372SMike Snitzer unsigned long flags; 466298fb372SMike Snitzer 467298fb372SMike Snitzer spin_lock_irqsave(&pi->lock, flags); 468298fb372SMike Snitzer pi->outstanding++; 469298fb372SMike Snitzer spin_unlock_irqrestore(&pi->lock, flags); 470298fb372SMike Snitzer 471298fb372SMike Snitzer return 0; 472298fb372SMike Snitzer } 473298fb372SMike Snitzer 474298fb372SMike Snitzer static u64 path_service_time(struct path_info *pi, u64 start_time) 475298fb372SMike Snitzer { 476*ce40426fSKhazhismel Kumykov u64 now = ktime_get_ns(); 477298fb372SMike Snitzer 478298fb372SMike Snitzer /* if a previous disk request has finished after this IO was 479298fb372SMike Snitzer * sent to the hardware, pretend the submission happened 480298fb372SMike Snitzer * serially. 481298fb372SMike Snitzer */ 482298fb372SMike Snitzer if (time_after64(pi->last_finish, start_time)) 483298fb372SMike Snitzer start_time = pi->last_finish; 484298fb372SMike Snitzer 485*ce40426fSKhazhismel Kumykov pi->last_finish = now; 486*ce40426fSKhazhismel Kumykov if (time_before64(now, start_time)) 487298fb372SMike Snitzer return 0; 488298fb372SMike Snitzer 489*ce40426fSKhazhismel Kumykov return now - start_time; 490298fb372SMike Snitzer } 491298fb372SMike Snitzer 492298fb372SMike Snitzer static int hst_end_io(struct path_selector *ps, struct dm_path *path, 493298fb372SMike Snitzer size_t nr_bytes, u64 start_time) 494298fb372SMike Snitzer { 495298fb372SMike Snitzer struct path_info *pi = path->pscontext; 496298fb372SMike Snitzer struct selector *s = ps->context; 497298fb372SMike Snitzer unsigned long flags; 498298fb372SMike Snitzer u64 st; 499298fb372SMike Snitzer 500298fb372SMike Snitzer spin_lock_irqsave(&pi->lock, flags); 501298fb372SMike Snitzer 502298fb372SMike Snitzer st = path_service_time(pi, start_time); 503298fb372SMike Snitzer pi->outstanding--; 504298fb372SMike Snitzer pi->historical_service_time = 505298fb372SMike Snitzer fixed_ema(pi->historical_service_time, 506298fb372SMike Snitzer min(st * HST_FIXED_1, HST_FIXED_MAX), 507298fb372SMike Snitzer hst_weight(ps, st)); 508298fb372SMike Snitzer 509298fb372SMike Snitzer /* 510298fb372SMike Snitzer * On request end, mark path as fresh. If a path hasn't 511298fb372SMike Snitzer * finished any requests within the fresh period, the estimated 512298fb372SMike Snitzer * service time is considered too optimistic and we limit the 513298fb372SMike Snitzer * maximum requests on that path. 514298fb372SMike Snitzer */ 515298fb372SMike Snitzer pi->stale_after = pi->last_finish + 516298fb372SMike Snitzer (s->valid_count * (pi->historical_service_time >> HST_FIXED_SHIFT)); 517298fb372SMike Snitzer 518298fb372SMike Snitzer spin_unlock_irqrestore(&pi->lock, flags); 519298fb372SMike Snitzer 520298fb372SMike Snitzer return 0; 521298fb372SMike Snitzer } 522298fb372SMike Snitzer 523298fb372SMike Snitzer static struct path_selector_type hst_ps = { 524298fb372SMike Snitzer .name = "historical-service-time", 525298fb372SMike Snitzer .module = THIS_MODULE, 526298fb372SMike Snitzer .table_args = 1, 527298fb372SMike Snitzer .info_args = 3, 528298fb372SMike Snitzer .create = hst_create, 529298fb372SMike Snitzer .destroy = hst_destroy, 530298fb372SMike Snitzer .status = hst_status, 531298fb372SMike Snitzer .add_path = hst_add_path, 532298fb372SMike Snitzer .fail_path = hst_fail_path, 533298fb372SMike Snitzer .reinstate_path = hst_reinstate_path, 534298fb372SMike Snitzer .select_path = hst_select_path, 535298fb372SMike Snitzer .start_io = hst_start_io, 536298fb372SMike Snitzer .end_io = hst_end_io, 537298fb372SMike Snitzer }; 538298fb372SMike Snitzer 539298fb372SMike Snitzer static int __init dm_hst_init(void) 540298fb372SMike Snitzer { 541298fb372SMike Snitzer int r = dm_register_path_selector(&hst_ps); 542298fb372SMike Snitzer 543298fb372SMike Snitzer if (r < 0) 544298fb372SMike Snitzer DMERR("register failed %d", r); 545298fb372SMike Snitzer 546298fb372SMike Snitzer DMINFO("version " HST_VERSION " loaded"); 547298fb372SMike Snitzer 548298fb372SMike Snitzer return r; 549298fb372SMike Snitzer } 550298fb372SMike Snitzer 551298fb372SMike Snitzer static void __exit dm_hst_exit(void) 552298fb372SMike Snitzer { 553298fb372SMike Snitzer int r = dm_unregister_path_selector(&hst_ps); 554298fb372SMike Snitzer 555298fb372SMike Snitzer if (r < 0) 556298fb372SMike Snitzer DMERR("unregister failed %d", r); 557298fb372SMike Snitzer } 558298fb372SMike Snitzer 559298fb372SMike Snitzer module_init(dm_hst_init); 560298fb372SMike Snitzer module_exit(dm_hst_exit); 561298fb372SMike Snitzer 562298fb372SMike Snitzer MODULE_DESCRIPTION(DM_NAME " measured service time oriented path selector"); 563298fb372SMike Snitzer MODULE_AUTHOR("Khazhismel Kumykov <khazhy@google.com>"); 564298fb372SMike Snitzer MODULE_LICENSE("GPL"); 565