/*
 * buffered writeback throttling. loosely based on CoDel. We can't drop
 * packets for IO scheduling, so the logic is something like this:
 *
 * - Monitor latencies in a defined window of time.
 * - If the minimum latency in the above window exceeds some target, increment
 *   scaling step and scale down queue depth by a factor of 2x. The monitoring
 *   window is then shrunk to 100 / sqrt(scaling step + 1).
 * - For any window where we don't have solid data on what the latencies
 *   look like, retain status quo.
 * - If latencies look good, decrement scaling step.
 * - If we're only doing writes, allow the scaling step to go negative. This
 *   will temporarily boost write performance, snapping back to a stable
 *   scaling step of 0 if reads show up or the heavy writers finish. Unlike
 *   positive scaling steps where we shrink the monitoring window, a negative
 *   scaling step retains the default step==0 window size.
 *
 * Copyright (C) 2016 Jens Axboe
 *
 */
#include <linux/kernel.h>
#include <linux/blk_types.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/swap.h>

#include "blk-wbt.h"
#include "blk-rq-qos.h"

#define CREATE_TRACE_POINTS
#include <trace/events/wbt.h>

static inline void wbt_clear_state(struct request *rq)
{
	rq->wbt_flags = 0;
}

static inline enum wbt_flags wbt_flags(struct request *rq)
{
	return rq->wbt_flags;
}

static inline bool wbt_is_tracked(struct request *rq)
{
	return rq->wbt_flags & WBT_TRACKED;
}

static inline bool wbt_is_read(struct request *rq)
{
	return rq->wbt_flags & WBT_READ;
}

enum {
	/*
	 * Default setting, we'll scale up (to 75% of QD max) or down (min 1)
	 * from here depending on device stats
	 */
	RWB_DEF_DEPTH		= 16,

	/*
	 * 100msec window
	 */
	RWB_WINDOW_NSEC		= 100 * 1000 * 1000ULL,

	/*
	 * Disregard stats, if we don't meet this minimum
	 */
	RWB_MIN_WRITE_SAMPLES	= 3,

	/*
	 * If we have this number of consecutive windows with not enough
	 * information to scale up or down, scale up.
	 */
	RWB_UNKNOWN_BUMP	= 5,
};

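/*
 * For illustration, assuming the default 100msec RWB_WINDOW_NSEC above
 * (the actual value is computed with integer math in rwb_arm_timer()):
 *
 *	scale_step 0   -> ~100 msec window
 *	scale_step 1   ->  ~72 msec window
 *	scale_step 2   ->  ~59 msec window
 *	scale_step 3   ->   50 msec window
 *	scale_step < 0 -> ~100 msec window (negative steps keep the default
 *			   window size)
 */
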
static inline bool rwb_enabled(struct rq_wb *rwb)
{
	return rwb && rwb->wb_normal != 0;
}

static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
{
	if (rwb_enabled(rwb)) {
		const unsigned long cur = jiffies;

		if (cur != *var)
			*var = cur;
	}
}

/*
 * If a task was rate throttled in balance_dirty_pages() within the last
 * second or so, use that to indicate a higher cleaning rate.
 */
static bool wb_recent_wait(struct rq_wb *rwb)
{
	struct bdi_writeback *wb = &rwb->rqos.q->backing_dev_info->wb;

	return time_before(jiffies, wb->dirty_sleep + HZ);
}

static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb,
					  enum wbt_flags wb_acct)
{
	if (wb_acct & WBT_KSWAPD)
		return &rwb->rq_wait[WBT_RWQ_KSWAPD];
	else if (wb_acct & WBT_DISCARD)
		return &rwb->rq_wait[WBT_RWQ_DISCARD];

	return &rwb->rq_wait[WBT_RWQ_BG];
}

static void rwb_wake_all(struct rq_wb *rwb)
{
	int i;

	for (i = 0; i < WBT_NUM_RWQ; i++) {
		struct rq_wait *rqw = &rwb->rq_wait[i];

		if (wq_has_sleeper(&rqw->wait))
			wake_up_all(&rqw->wait);
	}
}

static void wbt_rqw_done(struct rq_wb *rwb, struct rq_wait *rqw,
			 enum wbt_flags wb_acct)
{
	int inflight, limit;

	inflight = atomic_dec_return(&rqw->inflight);

	/*
	 * wbt got disabled with IO in flight. Wake up any potential
	 * waiters, we don't have to do more than that.
	 */
	if (unlikely(!rwb_enabled(rwb))) {
		rwb_wake_all(rwb);
		return;
	}

	/*
	 * For discards, our limit is always the background. For writes, if
	 * the device does write back caching, drop further down before we
	 * wake people up.
	 */
	if (wb_acct & WBT_DISCARD)
		limit = rwb->wb_background;
	else if (rwb->wc && !wb_recent_wait(rwb))
		limit = 0;
	else
		limit = rwb->wb_normal;

	/*
	 * Don't wake anyone up if we are above the normal limit.
	 */
	if (inflight && inflight >= limit)
		return;

	if (wq_has_sleeper(&rqw->wait)) {
		int diff = limit - inflight;

		if (!inflight || diff >= rwb->wb_background / 2)
			wake_up_all(&rqw->wait);
	}
}

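/*
 * Illustrative limits for the above, assuming max_depth == 16 so that
 * wb_normal == 8 and wb_background == 4 (see calc_wb_limits()):
 *
 *	- a DISCARD completion uses wb_background (4) as the wake limit
 *	- a write completion on a write-back-cached device with no recent
 *	  dirty-page throttling uses a limit of 0, so waiters are only
 *	  woken once the inflight count drains to zero
 *	- any other write completion uses wb_normal (8)
 */
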
static void __wbt_done(struct rq_qos *rqos, enum wbt_flags wb_acct)
{
	struct rq_wb *rwb = RQWB(rqos);
	struct rq_wait *rqw;

	if (!(wb_acct & WBT_TRACKED))
		return;

	rqw = get_rq_wait(rwb, wb_acct);
	wbt_rqw_done(rwb, rqw, wb_acct);
}

/*
 * Called on completion of a request. Note that it's also called when
 * a request is merged, at which point the request is freed.
 */
static void wbt_done(struct rq_qos *rqos, struct request *rq)
{
	struct rq_wb *rwb = RQWB(rqos);

	if (!wbt_is_tracked(rq)) {
		if (rwb->sync_cookie == rq) {
			rwb->sync_issue = 0;
			rwb->sync_cookie = NULL;
		}

		if (wbt_is_read(rq))
			wb_timestamp(rwb, &rwb->last_comp);
	} else {
		WARN_ON_ONCE(rq == rwb->sync_cookie);
		__wbt_done(rqos, wbt_flags(rq));
	}
	wbt_clear_state(rq);
}

static inline bool stat_sample_valid(struct blk_rq_stat *stat)
{
	/*
	 * We need at least one read sample, and a minimum of
	 * RWB_MIN_WRITE_SAMPLES. We require some write samples to know
	 * that it's writes impacting us, and not just some sole read on
	 * a device that is in a lower power state.
	 */
	return (stat[READ].nr_samples >= 1 &&
		stat[WRITE].nr_samples >= RWB_MIN_WRITE_SAMPLES);
}

static u64 rwb_sync_issue_lat(struct rq_wb *rwb)
{
	u64 now, issue = READ_ONCE(rwb->sync_issue);

	if (!issue || !rwb->sync_cookie)
		return 0;

	now = ktime_to_ns(ktime_get());
	return now - issue;
}

enum {
	LAT_OK = 1,
	LAT_UNKNOWN,
	LAT_UNKNOWN_WRITES,
	LAT_EXCEEDED,
};

static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
{
	struct backing_dev_info *bdi = rwb->rqos.q->backing_dev_info;
	struct rq_depth *rqd = &rwb->rq_depth;
	u64 thislat;

	/*
	 * If our stored sync issue exceeds the window size, or it
	 * exceeds our min target AND we haven't logged any entries,
	 * flag the latency as exceeded. wbt works off completion latencies,
	 * but for a flooded device, a single sync IO can take a long time
	 * to complete after being issued. If this time exceeds our
	 * monitoring window AND we didn't see any other completions in that
	 * window, then count that sync IO as a violation of the latency.
	 */
	thislat = rwb_sync_issue_lat(rwb);
	if (thislat > rwb->cur_win_nsec ||
	    (thislat > rwb->min_lat_nsec && !stat[READ].nr_samples)) {
		trace_wbt_lat(bdi, thislat);
		return LAT_EXCEEDED;
	}

	/*
	 * No read/write mix, if stat isn't valid
	 */
	if (!stat_sample_valid(stat)) {
		/*
		 * If we had writes in this stat window and the window is
		 * current, we're only doing writes. If a task recently
		 * waited or still has writes in flight, consider us doing
		 * just writes as well.
		 */
		if (stat[WRITE].nr_samples || wb_recent_wait(rwb) ||
		    wbt_inflight(rwb))
			return LAT_UNKNOWN_WRITES;
		return LAT_UNKNOWN;
	}

	/*
	 * If the 'min' latency exceeds our target, step down.
	 */
	if (stat[READ].min > rwb->min_lat_nsec) {
		trace_wbt_lat(bdi, stat[READ].min);
		trace_wbt_stat(bdi, stat);
		return LAT_EXCEEDED;
	}

	if (rqd->scale_step)
		trace_wbt_stat(bdi, stat);

	return LAT_OK;
}

static void rwb_trace_step(struct rq_wb *rwb, const char *msg)
{
	struct backing_dev_info *bdi = rwb->rqos.q->backing_dev_info;
	struct rq_depth *rqd = &rwb->rq_depth;

	trace_wbt_step(bdi, msg, rqd->scale_step, rwb->cur_win_nsec,
			rwb->wb_background, rwb->wb_normal, rqd->max_depth);
}

static void calc_wb_limits(struct rq_wb *rwb)
{
	if (rwb->min_lat_nsec == 0) {
		rwb->wb_normal = rwb->wb_background = 0;
	} else if (rwb->rq_depth.max_depth <= 2) {
		rwb->wb_normal = rwb->rq_depth.max_depth;
		rwb->wb_background = 1;
	} else {
		rwb->wb_normal = (rwb->rq_depth.max_depth + 1) / 2;
		rwb->wb_background = (rwb->rq_depth.max_depth + 3) / 4;
	}
}

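/*
 * For example (illustrative only): max_depth == 16 yields wb_normal == 8
 * and wb_background == 4; max_depth == 2 hits the low-depth case and
 * yields wb_normal == 2 and wb_background == 1; and min_lat_nsec == 0
 * zeroes both limits, which rwb_enabled() treats as throttling disabled.
 */
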
static void scale_up(struct rq_wb *rwb)
{
	rq_depth_scale_up(&rwb->rq_depth);
	calc_wb_limits(rwb);
	rwb->unknown_cnt = 0;
	rwb_trace_step(rwb, "scale up");
}

static void scale_down(struct rq_wb *rwb, bool hard_throttle)
{
	rq_depth_scale_down(&rwb->rq_depth, hard_throttle);
	calc_wb_limits(rwb);
	rwb->unknown_cnt = 0;
	rwb_wake_all(rwb);
	rwb_trace_step(rwb, "scale down");
}

static void rwb_arm_timer(struct rq_wb *rwb)
{
	struct rq_depth *rqd = &rwb->rq_depth;

	if (rqd->scale_step > 0) {
		/*
		 * We should speed this up, using some variant of a fast
		 * integer inverse square root calculation. Since we only do
		 * this for every window expiration, it's not a huge deal,
		 * though.
		 */
		rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4,
					int_sqrt((rqd->scale_step + 1) << 8));
	} else {
		/*
		 * For step < 0, we don't want to increase/decrease the
		 * window size.
		 */
		rwb->cur_win_nsec = rwb->win_nsec;
	}

	blk_stat_activate_nsecs(rwb->cb, rwb->cur_win_nsec);
}

static void wb_timer_fn(struct blk_stat_callback *cb)
{
	struct rq_wb *rwb = cb->data;
	struct rq_depth *rqd = &rwb->rq_depth;
	unsigned int inflight = wbt_inflight(rwb);
	int status;

	status = latency_exceeded(rwb, cb->stat);

	trace_wbt_timer(rwb->rqos.q->backing_dev_info, status, rqd->scale_step,
			inflight);

	/*
	 * If we exceeded the latency target, step down. If we did not,
	 * step one level up. If we don't know enough to say either exceeded
	 * or ok, then don't do anything.
	 */
	switch (status) {
	case LAT_EXCEEDED:
		scale_down(rwb, true);
		break;
	case LAT_OK:
		scale_up(rwb);
		break;
	case LAT_UNKNOWN_WRITES:
		/*
		 * We started at the center step, and don't have a valid
		 * read/write sample, but we do have writes going on.
		 * Allow step to go negative, to increase write perf.
		 */
		scale_up(rwb);
		break;
	case LAT_UNKNOWN:
		if (++rwb->unknown_cnt < RWB_UNKNOWN_BUMP)
			break;
		/*
		 * We get here when we previously scaled the depth, and we
		 * currently don't have a valid read/write sample. For that
		 * case, slowly return to center state (step == 0).
		 */
		if (rqd->scale_step > 0)
			scale_up(rwb);
		else if (rqd->scale_step < 0)
			scale_down(rwb, false);
		break;
	default:
		break;
	}

	/*
	 * Re-arm timer, if we have IO in flight
	 */
	if (rqd->scale_step || inflight)
		rwb_arm_timer(rwb);
}

static void __wbt_update_limits(struct rq_wb *rwb)
{
	struct rq_depth *rqd = &rwb->rq_depth;

	rqd->scale_step = 0;
	rqd->scaled_max = false;

	rq_depth_calc_max_depth(rqd);
	calc_wb_limits(rwb);

	rwb_wake_all(rwb);
}

void wbt_update_limits(struct request_queue *q)
{
	struct rq_qos *rqos = wbt_rq_qos(q);
	if (!rqos)
		return;
	__wbt_update_limits(RQWB(rqos));
}

u64 wbt_get_min_lat(struct request_queue *q)
{
	struct rq_qos *rqos = wbt_rq_qos(q);
	if (!rqos)
		return 0;
	return RQWB(rqos)->min_lat_nsec;
}

void wbt_set_min_lat(struct request_queue *q, u64 val)
{
	struct rq_qos *rqos = wbt_rq_qos(q);
	if (!rqos)
		return;
	RQWB(rqos)->min_lat_nsec = val;
	RQWB(rqos)->enable_state = WBT_STATE_ON_MANUAL;
	__wbt_update_limits(RQWB(rqos));
}

static bool close_io(struct rq_wb *rwb)
{
	const unsigned long now = jiffies;

	return time_before(now, rwb->last_issue + HZ / 10) ||
		time_before(now, rwb->last_comp + HZ / 10);
}

#define REQ_HIPRIO	(REQ_SYNC | REQ_META | REQ_PRIO)

static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
{
	unsigned int limit;

	/*
	 * If we got disabled, just return UINT_MAX. This ensures that
	 * we'll properly inc a new IO, and dec+wakeup at the end.
	 */
	if (!rwb_enabled(rwb))
		return UINT_MAX;

	if ((rw & REQ_OP_MASK) == REQ_OP_DISCARD)
		return rwb->wb_background;

	/*
	 * At this point we know it's a buffered write. If this is
	 * kswapd trying to free memory, or REQ_SYNC is set, then
	 * it's WB_SYNC_ALL writeback, and we'll use the max limit for
	 * that. If the write is marked as a background write, or we've
	 * had competing IO recently, use the background limit; otherwise
	 * use the normal limit.
	 */
	if ((rw & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd())
		limit = rwb->rq_depth.max_depth;
	else if ((rw & REQ_BACKGROUND) || close_io(rwb)) {
		/*
		 * If less than 100ms since we completed unrelated IO,
		 * limit us to half the depth for background writeback.
		 */
		limit = rwb->wb_background;
	} else
		limit = rwb->wb_normal;

	return limit;
}

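/*
 * Illustrative summary of get_limit() for a few common cases, again
 * assuming max_depth == 16, wb_normal == 8 and wb_background == 4:
 *
 *	- REQ_OP_DISCARD: 4 (wb_background)
 *	- kswapd, REQ_SYNC/REQ_META/REQ_PRIO, or recent dirty-page
 *	  throttling: 16 (max_depth)
 *	- REQ_BACKGROUND, or unrelated IO within the last 100ms: 4
 *	  (wb_background)
 *	- any other buffered write: 8 (wb_normal)
 */
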
struct wbt_wait_data {
	struct wait_queue_entry wq;
	struct task_struct *task;
	struct rq_wb *rwb;
	struct rq_wait *rqw;
	unsigned long rw;
	bool got_token;
};

static int wbt_wake_function(struct wait_queue_entry *curr, unsigned int mode,
			     int wake_flags, void *key)
{
	struct wbt_wait_data *data = container_of(curr, struct wbt_wait_data,
						  wq);

	/*
	 * If we fail to get a budget, return -1 to interrupt the wake up
	 * loop in __wake_up_common.
	 */
	if (!rq_wait_inc_below(data->rqw, get_limit(data->rwb, data->rw)))
		return -1;

	data->got_token = true;
	list_del_init(&curr->entry);
	wake_up_process(data->task);
	return 1;
}

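/*
 * Sketch of the token hand-off (illustrative): a completion runs
 * wbt_rqw_done() and wakes the queue; the wakeup path then invokes
 * wbt_wake_function(), which claims an inflight slot on behalf of the
 * sleeper via rq_wait_inc_below() before waking it, and returns -1 to
 * stop the scan once the budget runs out. The race where the sleeper
 * also claimed a slot on its own is resolved in __wbt_wait() below by
 * handing the spare token back through wbt_rqw_done().
 */
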
/*
 * Block if we will exceed our limit, or if we are currently waiting for
 * the timer to kick off queuing again.
 */
static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
		       unsigned long rw, spinlock_t *lock)
	__releases(lock)
	__acquires(lock)
{
	struct rq_wait *rqw = get_rq_wait(rwb, wb_acct);
	struct wbt_wait_data data = {
		.wq = {
			.func = wbt_wake_function,
			.entry = LIST_HEAD_INIT(data.wq.entry),
		},
		.task = current,
		.rwb = rwb,
		.rqw = rqw,
		.rw = rw,
	};
	bool has_sleeper;

	has_sleeper = wq_has_sleeper(&rqw->wait);
	if (!has_sleeper && rq_wait_inc_below(rqw, get_limit(rwb, rw)))
		return;

	prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE);
	do {
		if (data.got_token)
			break;

		if (!has_sleeper &&
		    rq_wait_inc_below(rqw, get_limit(rwb, rw))) {
			finish_wait(&rqw->wait, &data.wq);

			/*
			 * We raced with wbt_wake_function() getting a token,
			 * which means we now have two. Put our local token
			 * and wake anyone else potentially waiting for one.
			 */
			if (data.got_token)
				wbt_rqw_done(rwb, rqw, wb_acct);
			break;
		}

		if (lock) {
			spin_unlock_irq(lock);
			io_schedule();
			spin_lock_irq(lock);
		} else
			io_schedule();

		has_sleeper = false;
	} while (1);

	finish_wait(&rqw->wait, &data.wq);
}

static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
{
	switch (bio_op(bio)) {
	case REQ_OP_WRITE:
		/*
		 * Don't throttle WRITE_ODIRECT
		 */
		if ((bio->bi_opf & (REQ_SYNC | REQ_IDLE)) ==
		    (REQ_SYNC | REQ_IDLE))
			return false;
		/* fallthrough */
	case REQ_OP_DISCARD:
		return true;
	default:
		return false;
	}
}

static enum wbt_flags bio_to_wbt_flags(struct rq_wb *rwb, struct bio *bio)
{
	enum wbt_flags flags = 0;

	if (!rwb_enabled(rwb))
		return 0;

	if (bio_op(bio) == REQ_OP_READ) {
		flags = WBT_READ;
	} else if (wbt_should_throttle(rwb, bio)) {
		if (current_is_kswapd())
			flags |= WBT_KSWAPD;
		if (bio_op(bio) == REQ_OP_DISCARD)
			flags |= WBT_DISCARD;
		flags |= WBT_TRACKED;
	}
	return flags;
}

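/*
 * A few illustrative classifications from bio_to_wbt_flags():
 *
 *	- read: WBT_READ (timestamped for close_io(), never throttled)
 *	- buffered writeback write: WBT_TRACKED
 *	- the same write issued by kswapd: WBT_TRACKED | WBT_KSWAPD
 *	- discard: WBT_TRACKED | WBT_DISCARD
 *	- O_DIRECT write (REQ_SYNC | REQ_IDLE both set): untracked
 */
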
static void wbt_cleanup(struct rq_qos *rqos, struct bio *bio)
{
	struct rq_wb *rwb = RQWB(rqos);
	enum wbt_flags flags = bio_to_wbt_flags(rwb, bio);
	__wbt_done(rqos, flags);
}

/*
 * May sleep, if we have exceeded the writeback limits. Caller can pass
 * in an irq held spinlock, if it holds one when calling this function.
 * If we do sleep, we'll release and re-grab it.
 */
static void wbt_wait(struct rq_qos *rqos, struct bio *bio, spinlock_t *lock)
{
	struct rq_wb *rwb = RQWB(rqos);
	enum wbt_flags flags;

	flags = bio_to_wbt_flags(rwb, bio);
	if (!(flags & WBT_TRACKED)) {
		if (flags & WBT_READ)
			wb_timestamp(rwb, &rwb->last_issue);
		return;
	}

	__wbt_wait(rwb, flags, bio->bi_opf, lock);

	if (!blk_stat_is_active(rwb->cb))
		rwb_arm_timer(rwb);
}

static void wbt_track(struct rq_qos *rqos, struct request *rq, struct bio *bio)
{
	struct rq_wb *rwb = RQWB(rqos);
	rq->wbt_flags |= bio_to_wbt_flags(rwb, bio);
}

void wbt_issue(struct rq_qos *rqos, struct request *rq)
{
	struct rq_wb *rwb = RQWB(rqos);

	if (!rwb_enabled(rwb))
		return;

	/*
	 * Track sync issue, in case it takes a long time to complete. Allows
	 * us to react more quickly if it does. Note that this is just a
	 * hint. The request can go away when it completes, so it's important
	 * we never dereference it. We only use the address to compare with,
	 * which is why we store the sync_issue time locally.
	 */
	if (wbt_is_read(rq) && !rwb->sync_issue) {
		rwb->sync_cookie = rq;
		rwb->sync_issue = rq->io_start_time_ns;
	}
}

void wbt_requeue(struct rq_qos *rqos, struct request *rq)
{
	struct rq_wb *rwb = RQWB(rqos);
	if (!rwb_enabled(rwb))
		return;
	if (rq == rwb->sync_cookie) {
		rwb->sync_issue = 0;
		rwb->sync_cookie = NULL;
	}
}

void wbt_set_queue_depth(struct request_queue *q, unsigned int depth)
{
	struct rq_qos *rqos = wbt_rq_qos(q);
	if (rqos) {
		RQWB(rqos)->rq_depth.queue_depth = depth;
		__wbt_update_limits(RQWB(rqos));
	}
}

void wbt_set_write_cache(struct request_queue *q, bool write_cache_on)
{
	struct rq_qos *rqos = wbt_rq_qos(q);
	if (rqos)
		RQWB(rqos)->wc = write_cache_on;
}

/*
 * Enable wbt if defaults are configured that way
 */
void wbt_enable_default(struct request_queue *q)
{
	struct rq_qos *rqos = wbt_rq_qos(q);
	/* Throttling already enabled? */
	if (rqos)
		return;

	/* Queue not registered? Maybe shutting down... */
	if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
		return;

	if ((q->mq_ops && IS_ENABLED(CONFIG_BLK_WBT_MQ)) ||
	    (q->request_fn && IS_ENABLED(CONFIG_BLK_WBT_SQ)))
		wbt_init(q);
}
EXPORT_SYMBOL_GPL(wbt_enable_default);

u64 wbt_default_latency_nsec(struct request_queue *q)
{
	/*
	 * We default to 2msec for non-rotational storage, and 75msec
	 * for rotational storage.
	 */
	if (blk_queue_nonrot(q))
		return 2000000ULL;
	else
		return 75000000ULL;
}

static int wbt_data_dir(const struct request *rq)
{
	const int op = req_op(rq);

	if (op == REQ_OP_READ)
		return READ;
	else if (op_is_write(op))
		return WRITE;

	/* don't account */
	return -1;
}

static void wbt_exit(struct rq_qos *rqos)
{
	struct rq_wb *rwb = RQWB(rqos);
	struct request_queue *q = rqos->q;

	blk_stat_remove_callback(q, rwb->cb);
	blk_stat_free_callback(rwb->cb);
	kfree(rwb);
}

/*
 * Disable wbt, if enabled by default.
 */
void wbt_disable_default(struct request_queue *q)
{
	struct rq_qos *rqos = wbt_rq_qos(q);
	struct rq_wb *rwb;
	if (!rqos)
		return;
	rwb = RQWB(rqos);
	if (rwb->enable_state == WBT_STATE_ON_DEFAULT)
		rwb->wb_normal = 0;
}
EXPORT_SYMBOL_GPL(wbt_disable_default);

static struct rq_qos_ops wbt_rqos_ops = {
	.throttle = wbt_wait,
	.issue = wbt_issue,
	.track = wbt_track,
	.requeue = wbt_requeue,
	.done = wbt_done,
	.cleanup = wbt_cleanup,
	.exit = wbt_exit,
};

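/*
 * Typical bring-up (sketch): wbt_enable_default() calls wbt_init() when
 * the queue is registered. That allocates the rq_wb, hooks it into the
 * rq-qos framework and the blk-stat callback, and starts out with the
 * 100msec window, RWB_DEF_DEPTH, the write cache assumed on, and a
 * latency target from wbt_default_latency_nsec() (2msec non-rotational,
 * 75msec rotational).
 */
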
int wbt_init(struct request_queue *q)
{
	struct rq_wb *rwb;
	int i;

	rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
	if (!rwb)
		return -ENOMEM;

	rwb->cb = blk_stat_alloc_callback(wb_timer_fn, wbt_data_dir, 2, rwb);
	if (!rwb->cb) {
		kfree(rwb);
		return -ENOMEM;
	}

	for (i = 0; i < WBT_NUM_RWQ; i++)
		rq_wait_init(&rwb->rq_wait[i]);

	rwb->rqos.id = RQ_QOS_WBT;
	rwb->rqos.ops = &wbt_rqos_ops;
	rwb->rqos.q = q;
	rwb->last_comp = rwb->last_issue = jiffies;
	rwb->win_nsec = RWB_WINDOW_NSEC;
	rwb->enable_state = WBT_STATE_ON_DEFAULT;
	rwb->wc = 1;
	rwb->rq_depth.default_depth = RWB_DEF_DEPTH;
	__wbt_update_limits(rwb);

	/*
	 * Assign rwb and add the stats callback.
	 */
	rq_qos_add(q, &rwb->rqos);
	blk_stat_add_callback(q, rwb->cb);

	rwb->min_lat_nsec = wbt_default_latency_nsec(q);

	wbt_set_queue_depth(q, blk_queue_depth(q));
	wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags));

	return 0;
}