1 /* 2 * buffered writeback throttling. loosely based on CoDel. We can't drop 3 * packets for IO scheduling, so the logic is something like this: 4 * 5 * - Monitor latencies in a defined window of time. 6 * - If the minimum latency in the above window exceeds some target, increment 7 * scaling step and scale down queue depth by a factor of 2x. The monitoring 8 * window is then shrunk to 100 / sqrt(scaling step + 1). 9 * - For any window where we don't have solid data on what the latencies 10 * look like, retain status quo. 11 * - If latencies look good, decrement scaling step. 12 * - If we're only doing writes, allow the scaling step to go negative. This 13 * will temporarily boost write performance, snapping back to a stable 14 * scaling step of 0 if reads show up or the heavy writers finish. Unlike 15 * positive scaling steps where we shrink the monitoring window, a negative 16 * scaling step retains the default step==0 window size. 17 * 18 * Copyright (C) 2016 Jens Axboe 19 * 20 */ 21 #include <linux/kernel.h> 22 #include <linux/blk_types.h> 23 #include <linux/slab.h> 24 #include <linux/backing-dev.h> 25 #include <linux/swap.h> 26 27 #include "blk-wbt.h" 28 29 #define CREATE_TRACE_POINTS 30 #include <trace/events/wbt.h> 31 32 enum { 33 /* 34 * Default setting, we'll scale up (to 75% of QD max) or down (min 1) 35 * from here depending on device stats 36 */ 37 RWB_DEF_DEPTH = 16, 38 39 /* 40 * 100msec window 41 */ 42 RWB_WINDOW_NSEC = 100 * 1000 * 1000ULL, 43 44 /* 45 * Disregard stats, if we don't meet this minimum 46 */ 47 RWB_MIN_WRITE_SAMPLES = 3, 48 49 /* 50 * If we have this number of consecutive windows with not enough 51 * information to scale up or down, scale up. 52 */ 53 RWB_UNKNOWN_BUMP = 5, 54 }; 55 56 static inline bool rwb_enabled(struct rq_wb *rwb) 57 { 58 return rwb && rwb->wb_normal != 0; 59 } 60 61 /* 62 * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded, 63 * false if 'v' + 1 would be bigger than 'below'. 64 */ 65 static bool atomic_inc_below(atomic_t *v, int below) 66 { 67 int cur = atomic_read(v); 68 69 for (;;) { 70 int old; 71 72 if (cur >= below) 73 return false; 74 old = atomic_cmpxchg(v, cur, cur + 1); 75 if (old == cur) 76 break; 77 cur = old; 78 } 79 80 return true; 81 } 82 83 static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) 84 { 85 if (rwb_enabled(rwb)) { 86 const unsigned long cur = jiffies; 87 88 if (cur != *var) 89 *var = cur; 90 } 91 } 92 93 /* 94 * If a task was rate throttled in balance_dirty_pages() within the last 95 * second or so, use that to indicate a higher cleaning rate. 96 */ 97 static bool wb_recent_wait(struct rq_wb *rwb) 98 { 99 struct bdi_writeback *wb = &rwb->queue->backing_dev_info->wb; 100 101 return time_before(jiffies, wb->dirty_sleep + HZ); 102 } 103 104 static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb, bool is_kswapd) 105 { 106 return &rwb->rq_wait[is_kswapd]; 107 } 108 109 static void rwb_wake_all(struct rq_wb *rwb) 110 { 111 int i; 112 113 for (i = 0; i < WBT_NUM_RWQ; i++) { 114 struct rq_wait *rqw = &rwb->rq_wait[i]; 115 116 if (waitqueue_active(&rqw->wait)) 117 wake_up_all(&rqw->wait); 118 } 119 } 120 121 void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct) 122 { 123 struct rq_wait *rqw; 124 int inflight, limit; 125 126 if (!(wb_acct & WBT_TRACKED)) 127 return; 128 129 rqw = get_rq_wait(rwb, wb_acct & WBT_KSWAPD); 130 inflight = atomic_dec_return(&rqw->inflight); 131 132 /* 133 * wbt got disabled with IO in flight. Wake up any potential 134 * waiters, we don't have to do more than that. 135 */ 136 if (unlikely(!rwb_enabled(rwb))) { 137 rwb_wake_all(rwb); 138 return; 139 } 140 141 /* 142 * If the device does write back caching, drop further down 143 * before we wake people up. 144 */ 145 if (rwb->wc && !wb_recent_wait(rwb)) 146 limit = 0; 147 else 148 limit = rwb->wb_normal; 149 150 /* 151 * Don't wake anyone up if we are above the normal limit. 152 */ 153 if (inflight && inflight >= limit) 154 return; 155 156 if (waitqueue_active(&rqw->wait)) { 157 int diff = limit - inflight; 158 159 if (!inflight || diff >= rwb->wb_background / 2) 160 wake_up_all(&rqw->wait); 161 } 162 } 163 164 /* 165 * Called on completion of a request. Note that it's also called when 166 * a request is merged, when the request gets freed. 167 */ 168 void wbt_done(struct rq_wb *rwb, struct blk_issue_stat *stat) 169 { 170 if (!rwb) 171 return; 172 173 if (!wbt_is_tracked(stat)) { 174 if (rwb->sync_cookie == stat) { 175 rwb->sync_issue = 0; 176 rwb->sync_cookie = NULL; 177 } 178 179 if (wbt_is_read(stat)) 180 wb_timestamp(rwb, &rwb->last_comp); 181 } else { 182 WARN_ON_ONCE(stat == rwb->sync_cookie); 183 __wbt_done(rwb, wbt_stat_to_mask(stat)); 184 } 185 wbt_clear_state(stat); 186 } 187 188 /* 189 * Return true, if we can't increase the depth further by scaling 190 */ 191 static bool calc_wb_limits(struct rq_wb *rwb) 192 { 193 unsigned int depth; 194 bool ret = false; 195 196 if (!rwb->min_lat_nsec) { 197 rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0; 198 return false; 199 } 200 201 /* 202 * For QD=1 devices, this is a special case. It's important for those 203 * to have one request ready when one completes, so force a depth of 204 * 2 for those devices. On the backend, it'll be a depth of 1 anyway, 205 * since the device can't have more than that in flight. If we're 206 * scaling down, then keep a setting of 1/1/1. 207 */ 208 if (rwb->queue_depth == 1) { 209 if (rwb->scale_step > 0) 210 rwb->wb_max = rwb->wb_normal = 1; 211 else { 212 rwb->wb_max = rwb->wb_normal = 2; 213 ret = true; 214 } 215 rwb->wb_background = 1; 216 } else { 217 /* 218 * scale_step == 0 is our default state. If we have suffered 219 * latency spikes, step will be > 0, and we shrink the 220 * allowed write depths. If step is < 0, we're only doing 221 * writes, and we allow a temporarily higher depth to 222 * increase performance. 223 */ 224 depth = min_t(unsigned int, RWB_DEF_DEPTH, rwb->queue_depth); 225 if (rwb->scale_step > 0) 226 depth = 1 + ((depth - 1) >> min(31, rwb->scale_step)); 227 else if (rwb->scale_step < 0) { 228 unsigned int maxd = 3 * rwb->queue_depth / 4; 229 230 depth = 1 + ((depth - 1) << -rwb->scale_step); 231 if (depth > maxd) { 232 depth = maxd; 233 ret = true; 234 } 235 } 236 237 /* 238 * Set our max/normal/bg queue depths based on how far 239 * we have scaled down (->scale_step). 240 */ 241 rwb->wb_max = depth; 242 rwb->wb_normal = (rwb->wb_max + 1) / 2; 243 rwb->wb_background = (rwb->wb_max + 3) / 4; 244 } 245 246 return ret; 247 } 248 249 static inline bool stat_sample_valid(struct blk_rq_stat *stat) 250 { 251 /* 252 * We need at least one read sample, and a minimum of 253 * RWB_MIN_WRITE_SAMPLES. We require some write samples to know 254 * that it's writes impacting us, and not just some sole read on 255 * a device that is in a lower power state. 256 */ 257 return (stat[READ].nr_samples >= 1 && 258 stat[WRITE].nr_samples >= RWB_MIN_WRITE_SAMPLES); 259 } 260 261 static u64 rwb_sync_issue_lat(struct rq_wb *rwb) 262 { 263 u64 now, issue = READ_ONCE(rwb->sync_issue); 264 265 if (!issue || !rwb->sync_cookie) 266 return 0; 267 268 now = ktime_to_ns(ktime_get()); 269 return now - issue; 270 } 271 272 enum { 273 LAT_OK = 1, 274 LAT_UNKNOWN, 275 LAT_UNKNOWN_WRITES, 276 LAT_EXCEEDED, 277 }; 278 279 static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) 280 { 281 struct backing_dev_info *bdi = rwb->queue->backing_dev_info; 282 u64 thislat; 283 284 /* 285 * If our stored sync issue exceeds the window size, or it 286 * exceeds our min target AND we haven't logged any entries, 287 * flag the latency as exceeded. wbt works off completion latencies, 288 * but for a flooded device, a single sync IO can take a long time 289 * to complete after being issued. If this time exceeds our 290 * monitoring window AND we didn't see any other completions in that 291 * window, then count that sync IO as a violation of the latency. 292 */ 293 thislat = rwb_sync_issue_lat(rwb); 294 if (thislat > rwb->cur_win_nsec || 295 (thislat > rwb->min_lat_nsec && !stat[READ].nr_samples)) { 296 trace_wbt_lat(bdi, thislat); 297 return LAT_EXCEEDED; 298 } 299 300 /* 301 * No read/write mix, if stat isn't valid 302 */ 303 if (!stat_sample_valid(stat)) { 304 /* 305 * If we had writes in this stat window and the window is 306 * current, we're only doing writes. If a task recently 307 * waited or still has writes in flights, consider us doing 308 * just writes as well. 309 */ 310 if (stat[WRITE].nr_samples || wb_recent_wait(rwb) || 311 wbt_inflight(rwb)) 312 return LAT_UNKNOWN_WRITES; 313 return LAT_UNKNOWN; 314 } 315 316 /* 317 * If the 'min' latency exceeds our target, step down. 318 */ 319 if (stat[READ].min > rwb->min_lat_nsec) { 320 trace_wbt_lat(bdi, stat[READ].min); 321 trace_wbt_stat(bdi, stat); 322 return LAT_EXCEEDED; 323 } 324 325 if (rwb->scale_step) 326 trace_wbt_stat(bdi, stat); 327 328 return LAT_OK; 329 } 330 331 static void rwb_trace_step(struct rq_wb *rwb, const char *msg) 332 { 333 struct backing_dev_info *bdi = rwb->queue->backing_dev_info; 334 335 trace_wbt_step(bdi, msg, rwb->scale_step, rwb->cur_win_nsec, 336 rwb->wb_background, rwb->wb_normal, rwb->wb_max); 337 } 338 339 static void scale_up(struct rq_wb *rwb) 340 { 341 /* 342 * Hit max in previous round, stop here 343 */ 344 if (rwb->scaled_max) 345 return; 346 347 rwb->scale_step--; 348 rwb->unknown_cnt = 0; 349 350 rwb->scaled_max = calc_wb_limits(rwb); 351 352 rwb_wake_all(rwb); 353 354 rwb_trace_step(rwb, "step up"); 355 } 356 357 /* 358 * Scale rwb down. If 'hard_throttle' is set, do it quicker, since we 359 * had a latency violation. 360 */ 361 static void scale_down(struct rq_wb *rwb, bool hard_throttle) 362 { 363 /* 364 * Stop scaling down when we've hit the limit. This also prevents 365 * ->scale_step from going to crazy values, if the device can't 366 * keep up. 367 */ 368 if (rwb->wb_max == 1) 369 return; 370 371 if (rwb->scale_step < 0 && hard_throttle) 372 rwb->scale_step = 0; 373 else 374 rwb->scale_step++; 375 376 rwb->scaled_max = false; 377 rwb->unknown_cnt = 0; 378 calc_wb_limits(rwb); 379 rwb_trace_step(rwb, "step down"); 380 } 381 382 static void rwb_arm_timer(struct rq_wb *rwb) 383 { 384 if (rwb->scale_step > 0) { 385 /* 386 * We should speed this up, using some variant of a fast 387 * integer inverse square root calculation. Since we only do 388 * this for every window expiration, it's not a huge deal, 389 * though. 390 */ 391 rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4, 392 int_sqrt((rwb->scale_step + 1) << 8)); 393 } else { 394 /* 395 * For step < 0, we don't want to increase/decrease the 396 * window size. 397 */ 398 rwb->cur_win_nsec = rwb->win_nsec; 399 } 400 401 blk_stat_activate_nsecs(rwb->cb, rwb->cur_win_nsec); 402 } 403 404 static void wb_timer_fn(struct blk_stat_callback *cb) 405 { 406 struct rq_wb *rwb = cb->data; 407 unsigned int inflight = wbt_inflight(rwb); 408 int status; 409 410 status = latency_exceeded(rwb, cb->stat); 411 412 trace_wbt_timer(rwb->queue->backing_dev_info, status, rwb->scale_step, 413 inflight); 414 415 /* 416 * If we exceeded the latency target, step down. If we did not, 417 * step one level up. If we don't know enough to say either exceeded 418 * or ok, then don't do anything. 419 */ 420 switch (status) { 421 case LAT_EXCEEDED: 422 scale_down(rwb, true); 423 break; 424 case LAT_OK: 425 scale_up(rwb); 426 break; 427 case LAT_UNKNOWN_WRITES: 428 /* 429 * We started a the center step, but don't have a valid 430 * read/write sample, but we do have writes going on. 431 * Allow step to go negative, to increase write perf. 432 */ 433 scale_up(rwb); 434 break; 435 case LAT_UNKNOWN: 436 if (++rwb->unknown_cnt < RWB_UNKNOWN_BUMP) 437 break; 438 /* 439 * We get here when previously scaled reduced depth, and we 440 * currently don't have a valid read/write sample. For that 441 * case, slowly return to center state (step == 0). 442 */ 443 if (rwb->scale_step > 0) 444 scale_up(rwb); 445 else if (rwb->scale_step < 0) 446 scale_down(rwb, false); 447 break; 448 default: 449 break; 450 } 451 452 /* 453 * Re-arm timer, if we have IO in flight 454 */ 455 if (rwb->scale_step || inflight) 456 rwb_arm_timer(rwb); 457 } 458 459 void wbt_update_limits(struct rq_wb *rwb) 460 { 461 rwb->scale_step = 0; 462 rwb->scaled_max = false; 463 calc_wb_limits(rwb); 464 465 rwb_wake_all(rwb); 466 } 467 468 static bool close_io(struct rq_wb *rwb) 469 { 470 const unsigned long now = jiffies; 471 472 return time_before(now, rwb->last_issue + HZ / 10) || 473 time_before(now, rwb->last_comp + HZ / 10); 474 } 475 476 #define REQ_HIPRIO (REQ_SYNC | REQ_META | REQ_PRIO) 477 478 static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw) 479 { 480 unsigned int limit; 481 482 /* 483 * At this point we know it's a buffered write. If this is 484 * kswapd trying to free memory, or REQ_SYNC is set, then 485 * it's WB_SYNC_ALL writeback, and we'll use the max limit for 486 * that. If the write is marked as a background write, then use 487 * the idle limit, or go to normal if we haven't had competing 488 * IO for a bit. 489 */ 490 if ((rw & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd()) 491 limit = rwb->wb_max; 492 else if ((rw & REQ_BACKGROUND) || close_io(rwb)) { 493 /* 494 * If less than 100ms since we completed unrelated IO, 495 * limit us to half the depth for background writeback. 496 */ 497 limit = rwb->wb_background; 498 } else 499 limit = rwb->wb_normal; 500 501 return limit; 502 } 503 504 static inline bool may_queue(struct rq_wb *rwb, struct rq_wait *rqw, 505 wait_queue_entry_t *wait, unsigned long rw) 506 { 507 /* 508 * inc it here even if disabled, since we'll dec it at completion. 509 * this only happens if the task was sleeping in __wbt_wait(), 510 * and someone turned it off at the same time. 511 */ 512 if (!rwb_enabled(rwb)) { 513 atomic_inc(&rqw->inflight); 514 return true; 515 } 516 517 /* 518 * If the waitqueue is already active and we are not the next 519 * in line to be woken up, wait for our turn. 520 */ 521 if (waitqueue_active(&rqw->wait) && 522 rqw->wait.head.next != &wait->entry) 523 return false; 524 525 return atomic_inc_below(&rqw->inflight, get_limit(rwb, rw)); 526 } 527 528 /* 529 * Block if we will exceed our limit, or if we are currently waiting for 530 * the timer to kick off queuing again. 531 */ 532 static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock) 533 __releases(lock) 534 __acquires(lock) 535 { 536 struct rq_wait *rqw = get_rq_wait(rwb, current_is_kswapd()); 537 DEFINE_WAIT(wait); 538 539 if (may_queue(rwb, rqw, &wait, rw)) 540 return; 541 542 do { 543 prepare_to_wait_exclusive(&rqw->wait, &wait, 544 TASK_UNINTERRUPTIBLE); 545 546 if (may_queue(rwb, rqw, &wait, rw)) 547 break; 548 549 if (lock) { 550 spin_unlock_irq(lock); 551 io_schedule(); 552 spin_lock_irq(lock); 553 } else 554 io_schedule(); 555 } while (1); 556 557 finish_wait(&rqw->wait, &wait); 558 } 559 560 static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio) 561 { 562 const int op = bio_op(bio); 563 564 /* 565 * If not a WRITE, do nothing 566 */ 567 if (op != REQ_OP_WRITE) 568 return false; 569 570 /* 571 * Don't throttle WRITE_ODIRECT 572 */ 573 if ((bio->bi_opf & (REQ_SYNC | REQ_IDLE)) == (REQ_SYNC | REQ_IDLE)) 574 return false; 575 576 return true; 577 } 578 579 /* 580 * Returns true if the IO request should be accounted, false if not. 581 * May sleep, if we have exceeded the writeback limits. Caller can pass 582 * in an irq held spinlock, if it holds one when calling this function. 583 * If we do sleep, we'll release and re-grab it. 584 */ 585 enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock) 586 { 587 unsigned int ret = 0; 588 589 if (!rwb_enabled(rwb)) 590 return 0; 591 592 if (bio_op(bio) == REQ_OP_READ) 593 ret = WBT_READ; 594 595 if (!wbt_should_throttle(rwb, bio)) { 596 if (ret & WBT_READ) 597 wb_timestamp(rwb, &rwb->last_issue); 598 return ret; 599 } 600 601 __wbt_wait(rwb, bio->bi_opf, lock); 602 603 if (!blk_stat_is_active(rwb->cb)) 604 rwb_arm_timer(rwb); 605 606 if (current_is_kswapd()) 607 ret |= WBT_KSWAPD; 608 609 return ret | WBT_TRACKED; 610 } 611 612 void wbt_issue(struct rq_wb *rwb, struct blk_issue_stat *stat) 613 { 614 if (!rwb_enabled(rwb)) 615 return; 616 617 /* 618 * Track sync issue, in case it takes a long time to complete. Allows 619 * us to react quicker, if a sync IO takes a long time to complete. 620 * Note that this is just a hint. 'stat' can go away when the 621 * request completes, so it's important we never dereference it. We 622 * only use the address to compare with, which is why we store the 623 * sync_issue time locally. 624 */ 625 if (wbt_is_read(stat) && !rwb->sync_issue) { 626 rwb->sync_cookie = stat; 627 rwb->sync_issue = blk_stat_time(stat); 628 } 629 } 630 631 void wbt_requeue(struct rq_wb *rwb, struct blk_issue_stat *stat) 632 { 633 if (!rwb_enabled(rwb)) 634 return; 635 if (stat == rwb->sync_cookie) { 636 rwb->sync_issue = 0; 637 rwb->sync_cookie = NULL; 638 } 639 } 640 641 void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth) 642 { 643 if (rwb) { 644 rwb->queue_depth = depth; 645 wbt_update_limits(rwb); 646 } 647 } 648 649 void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on) 650 { 651 if (rwb) 652 rwb->wc = write_cache_on; 653 } 654 655 /* 656 * Disable wbt, if enabled by default. 657 */ 658 void wbt_disable_default(struct request_queue *q) 659 { 660 struct rq_wb *rwb = q->rq_wb; 661 662 if (rwb && rwb->enable_state == WBT_STATE_ON_DEFAULT) 663 wbt_exit(q); 664 } 665 EXPORT_SYMBOL_GPL(wbt_disable_default); 666 667 /* 668 * Enable wbt if defaults are configured that way 669 */ 670 void wbt_enable_default(struct request_queue *q) 671 { 672 /* Throttling already enabled? */ 673 if (q->rq_wb) 674 return; 675 676 /* Queue not registered? Maybe shutting down... */ 677 if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags)) 678 return; 679 680 if ((q->mq_ops && IS_ENABLED(CONFIG_BLK_WBT_MQ)) || 681 (q->request_fn && IS_ENABLED(CONFIG_BLK_WBT_SQ))) 682 wbt_init(q); 683 } 684 EXPORT_SYMBOL_GPL(wbt_enable_default); 685 686 u64 wbt_default_latency_nsec(struct request_queue *q) 687 { 688 /* 689 * We default to 2msec for non-rotational storage, and 75msec 690 * for rotational storage. 691 */ 692 if (blk_queue_nonrot(q)) 693 return 2000000ULL; 694 else 695 return 75000000ULL; 696 } 697 698 static int wbt_data_dir(const struct request *rq) 699 { 700 const int op = req_op(rq); 701 702 if (op == REQ_OP_READ) 703 return READ; 704 else if (op == REQ_OP_WRITE || op == REQ_OP_FLUSH) 705 return WRITE; 706 707 /* don't account */ 708 return -1; 709 } 710 711 int wbt_init(struct request_queue *q) 712 { 713 struct rq_wb *rwb; 714 int i; 715 716 BUILD_BUG_ON(WBT_NR_BITS > BLK_STAT_RES_BITS); 717 718 rwb = kzalloc(sizeof(*rwb), GFP_KERNEL); 719 if (!rwb) 720 return -ENOMEM; 721 722 rwb->cb = blk_stat_alloc_callback(wb_timer_fn, wbt_data_dir, 2, rwb); 723 if (!rwb->cb) { 724 kfree(rwb); 725 return -ENOMEM; 726 } 727 728 for (i = 0; i < WBT_NUM_RWQ; i++) { 729 atomic_set(&rwb->rq_wait[i].inflight, 0); 730 init_waitqueue_head(&rwb->rq_wait[i].wait); 731 } 732 733 rwb->last_comp = rwb->last_issue = jiffies; 734 rwb->queue = q; 735 rwb->win_nsec = RWB_WINDOW_NSEC; 736 rwb->enable_state = WBT_STATE_ON_DEFAULT; 737 wbt_update_limits(rwb); 738 739 /* 740 * Assign rwb and add the stats callback. 741 */ 742 q->rq_wb = rwb; 743 blk_stat_add_callback(q, rwb->cb); 744 745 rwb->min_lat_nsec = wbt_default_latency_nsec(q); 746 747 wbt_set_queue_depth(rwb, blk_queue_depth(q)); 748 wbt_set_write_cache(rwb, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); 749 750 return 0; 751 } 752 753 void wbt_exit(struct request_queue *q) 754 { 755 struct rq_wb *rwb = q->rq_wb; 756 757 if (rwb) { 758 blk_stat_remove_callback(q, rwb->cb); 759 blk_stat_free_callback(rwb->cb); 760 q->rq_wb = NULL; 761 kfree(rwb); 762 } 763 } 764