1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Interface for controlling IO bandwidth on a request queue 4 * 5 * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com> 6 */ 7 8 #include <linux/module.h> 9 #include <linux/slab.h> 10 #include <linux/blkdev.h> 11 #include <linux/bio.h> 12 #include <linux/blktrace_api.h> 13 #include "blk.h" 14 #include "blk-cgroup-rwstat.h" 15 #include "blk-throttle.h" 16 17 /* Max dispatch from a group in 1 round */ 18 #define THROTL_GRP_QUANTUM 8 19 20 /* Total max dispatch from all groups in one round */ 21 #define THROTL_QUANTUM 32 22 23 /* Throttling is performed over a slice and after that slice is renewed */ 24 #define DFL_THROTL_SLICE (HZ / 10) 25 26 /* A workqueue to queue throttle related work */ 27 static struct workqueue_struct *kthrotld_workqueue; 28 29 #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) 30 31 struct throtl_data 32 { 33 /* service tree for active throtl groups */ 34 struct throtl_service_queue service_queue; 35 36 struct request_queue *queue; 37 38 /* Total Number of queued bios on READ and WRITE lists */ 39 unsigned int nr_queued[2]; 40 41 /* Work for dispatching throttled bios */ 42 struct work_struct dispatch_work; 43 }; 44 45 static void throtl_pending_timer_fn(struct timer_list *t); 46 47 static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg) 48 { 49 return pd_to_blkg(&tg->pd); 50 } 51 52 /** 53 * sq_to_tg - return the throl_grp the specified service queue belongs to 54 * @sq: the throtl_service_queue of interest 55 * 56 * Return the throtl_grp @sq belongs to. If @sq is the top-level one 57 * embedded in throtl_data, %NULL is returned. 58 */ 59 static struct throtl_grp *sq_to_tg(struct throtl_service_queue *sq) 60 { 61 if (sq && sq->parent_sq) 62 return container_of(sq, struct throtl_grp, service_queue); 63 else 64 return NULL; 65 } 66 67 /** 68 * sq_to_td - return throtl_data the specified service queue belongs to 69 * @sq: the throtl_service_queue of interest 70 * 71 * A service_queue can be embedded in either a throtl_grp or throtl_data. 72 * Determine the associated throtl_data accordingly and return it. 73 */ 74 static struct throtl_data *sq_to_td(struct throtl_service_queue *sq) 75 { 76 struct throtl_grp *tg = sq_to_tg(sq); 77 78 if (tg) 79 return tg->td; 80 else 81 return container_of(sq, struct throtl_data, service_queue); 82 } 83 84 static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw) 85 { 86 struct blkcg_gq *blkg = tg_to_blkg(tg); 87 88 if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent) 89 return U64_MAX; 90 91 return tg->bps[rw]; 92 } 93 94 static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw) 95 { 96 struct blkcg_gq *blkg = tg_to_blkg(tg); 97 98 if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent) 99 return UINT_MAX; 100 101 return tg->iops[rw]; 102 } 103 104 /** 105 * throtl_log - log debug message via blktrace 106 * @sq: the service_queue being reported 107 * @fmt: printf format string 108 * @args: printf args 109 * 110 * The messages are prefixed with "throtl BLKG_NAME" if @sq belongs to a 111 * throtl_grp; otherwise, just "throtl". 112 */ 113 #define throtl_log(sq, fmt, args...) do { \ 114 struct throtl_grp *__tg = sq_to_tg((sq)); \ 115 struct throtl_data *__td = sq_to_td((sq)); \ 116 \ 117 (void)__td; \ 118 if (likely(!blk_trace_note_message_enabled(__td->queue))) \ 119 break; \ 120 if ((__tg)) { \ 121 blk_add_cgroup_trace_msg(__td->queue, \ 122 &tg_to_blkg(__tg)->blkcg->css, "throtl " fmt, ##args);\ 123 } else { \ 124 blk_add_trace_msg(__td->queue, "throtl " fmt, ##args); \ 125 } \ 126 } while (0) 127 128 static inline unsigned int throtl_bio_data_size(struct bio *bio) 129 { 130 /* assume it's one sector */ 131 if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) 132 return 512; 133 return bio->bi_iter.bi_size; 134 } 135 136 static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg) 137 { 138 INIT_LIST_HEAD(&qn->node); 139 bio_list_init(&qn->bios_bps); 140 bio_list_init(&qn->bios_iops); 141 qn->tg = tg; 142 } 143 144 /** 145 * throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it 146 * @bio: bio being added 147 * @qn: qnode to add bio to 148 * @sq: the service_queue @qn belongs to 149 * 150 * Add @bio to @qn and put @qn on @sq->queued if it's not already on. 151 * @qn->tg's reference count is bumped when @qn is activated. See the 152 * comment on top of throtl_qnode definition for details. 153 */ 154 static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn, 155 struct throtl_service_queue *sq) 156 { 157 bool rw = bio_data_dir(bio); 158 159 /* 160 * Split bios have already been throttled by bps, so they are 161 * directly queued into the iops path. 162 */ 163 if (bio_flagged(bio, BIO_TG_BPS_THROTTLED) || 164 bio_flagged(bio, BIO_BPS_THROTTLED)) { 165 bio_list_add(&qn->bios_iops, bio); 166 sq->nr_queued_iops[rw]++; 167 } else { 168 bio_list_add(&qn->bios_bps, bio); 169 sq->nr_queued_bps[rw]++; 170 } 171 172 if (list_empty(&qn->node)) { 173 list_add_tail(&qn->node, &sq->queued[rw]); 174 blkg_get(tg_to_blkg(qn->tg)); 175 } 176 } 177 178 /** 179 * throtl_peek_queued - peek the first bio on a qnode list 180 * @queued: the qnode list to peek 181 * 182 * Always take a bio from the head of the iops queue first. If the queue is 183 * empty, we then take it from the bps queue to maintain the overall idea of 184 * fetching bios from the head. 185 */ 186 static struct bio *throtl_peek_queued(struct list_head *queued) 187 { 188 struct throtl_qnode *qn; 189 struct bio *bio; 190 191 if (list_empty(queued)) 192 return NULL; 193 194 qn = list_first_entry(queued, struct throtl_qnode, node); 195 bio = bio_list_peek(&qn->bios_iops); 196 if (!bio) 197 bio = bio_list_peek(&qn->bios_bps); 198 WARN_ON_ONCE(!bio); 199 return bio; 200 } 201 202 /** 203 * throtl_pop_queued - pop the first bio form a qnode list 204 * @sq: the service_queue to pop a bio from 205 * @tg_to_put: optional out argument for throtl_grp to put 206 * @rw: read/write 207 * 208 * Pop the first bio from the qnode list @sq->queued. Note that we firstly 209 * focus on the iops list because bios are ultimately dispatched from it. 210 * After popping, the first qnode is removed from @sq->queued if empty or moved 211 * to the end of @sq->queued so that the popping order is round-robin. 212 * 213 * When the first qnode is removed, its associated throtl_grp should be put 214 * too. If @tg_to_put is NULL, this function automatically puts it; 215 * otherwise, *@tg_to_put is set to the throtl_grp to put and the caller is 216 * responsible for putting it. 217 */ 218 static struct bio *throtl_pop_queued(struct throtl_service_queue *sq, 219 struct throtl_grp **tg_to_put, bool rw) 220 { 221 struct list_head *queued = &sq->queued[rw]; 222 struct throtl_qnode *qn; 223 struct bio *bio; 224 225 if (list_empty(queued)) 226 return NULL; 227 228 qn = list_first_entry(queued, struct throtl_qnode, node); 229 bio = bio_list_pop(&qn->bios_iops); 230 if (bio) { 231 sq->nr_queued_iops[rw]--; 232 } else { 233 bio = bio_list_pop(&qn->bios_bps); 234 if (bio) 235 sq->nr_queued_bps[rw]--; 236 } 237 WARN_ON_ONCE(!bio); 238 239 if (bio_list_empty(&qn->bios_bps) && bio_list_empty(&qn->bios_iops)) { 240 list_del_init(&qn->node); 241 if (tg_to_put) 242 *tg_to_put = qn->tg; 243 else 244 blkg_put(tg_to_blkg(qn->tg)); 245 } else { 246 list_move_tail(&qn->node, queued); 247 } 248 249 return bio; 250 } 251 252 /* init a service_queue, assumes the caller zeroed it */ 253 static void throtl_service_queue_init(struct throtl_service_queue *sq) 254 { 255 INIT_LIST_HEAD(&sq->queued[READ]); 256 INIT_LIST_HEAD(&sq->queued[WRITE]); 257 sq->pending_tree = RB_ROOT_CACHED; 258 timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0); 259 } 260 261 static struct blkg_policy_data *throtl_pd_alloc(struct gendisk *disk, 262 struct blkcg *blkcg, gfp_t gfp) 263 { 264 struct throtl_grp *tg; 265 int rw; 266 267 tg = kzalloc_node(sizeof(*tg), gfp, disk->node_id); 268 if (!tg) 269 return NULL; 270 271 if (blkg_rwstat_init(&tg->stat_bytes, gfp)) 272 goto err_free_tg; 273 274 if (blkg_rwstat_init(&tg->stat_ios, gfp)) 275 goto err_exit_stat_bytes; 276 277 throtl_service_queue_init(&tg->service_queue); 278 279 for (rw = READ; rw <= WRITE; rw++) { 280 throtl_qnode_init(&tg->qnode_on_self[rw], tg); 281 throtl_qnode_init(&tg->qnode_on_parent[rw], tg); 282 } 283 284 RB_CLEAR_NODE(&tg->rb_node); 285 tg->bps[READ] = U64_MAX; 286 tg->bps[WRITE] = U64_MAX; 287 tg->iops[READ] = UINT_MAX; 288 tg->iops[WRITE] = UINT_MAX; 289 290 return &tg->pd; 291 292 err_exit_stat_bytes: 293 blkg_rwstat_exit(&tg->stat_bytes); 294 err_free_tg: 295 kfree(tg); 296 return NULL; 297 } 298 299 static void throtl_pd_init(struct blkg_policy_data *pd) 300 { 301 struct throtl_grp *tg = pd_to_tg(pd); 302 struct blkcg_gq *blkg = tg_to_blkg(tg); 303 struct throtl_data *td = blkg->q->td; 304 struct throtl_service_queue *sq = &tg->service_queue; 305 306 /* 307 * If on the default hierarchy, we switch to properly hierarchical 308 * behavior where limits on a given throtl_grp are applied to the 309 * whole subtree rather than just the group itself. e.g. If 16M 310 * read_bps limit is set on a parent group, summary bps of 311 * parent group and its subtree groups can't exceed 16M for the 312 * device. 313 * 314 * If not on the default hierarchy, the broken flat hierarchy 315 * behavior is retained where all throtl_grps are treated as if 316 * they're all separate root groups right below throtl_data. 317 * Limits of a group don't interact with limits of other groups 318 * regardless of the position of the group in the hierarchy. 319 */ 320 sq->parent_sq = &td->service_queue; 321 if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent) 322 sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue; 323 tg->td = td; 324 } 325 326 /* 327 * Set has_rules[] if @tg or any of its parents have limits configured. 328 * This doesn't require walking up to the top of the hierarchy as the 329 * parent's has_rules[] is guaranteed to be correct. 330 */ 331 static void tg_update_has_rules(struct throtl_grp *tg) 332 { 333 struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq); 334 int rw; 335 336 for (rw = READ; rw <= WRITE; rw++) { 337 tg->has_rules_iops[rw] = 338 (parent_tg && parent_tg->has_rules_iops[rw]) || 339 tg_iops_limit(tg, rw) != UINT_MAX; 340 tg->has_rules_bps[rw] = 341 (parent_tg && parent_tg->has_rules_bps[rw]) || 342 tg_bps_limit(tg, rw) != U64_MAX; 343 } 344 } 345 346 static void throtl_pd_online(struct blkg_policy_data *pd) 347 { 348 struct throtl_grp *tg = pd_to_tg(pd); 349 /* 350 * We don't want new groups to escape the limits of its ancestors. 351 * Update has_rules[] after a new group is brought online. 352 */ 353 tg_update_has_rules(tg); 354 } 355 356 static void tg_release(struct rcu_head *rcu) 357 { 358 struct blkg_policy_data *pd = 359 container_of(rcu, struct blkg_policy_data, rcu_head); 360 struct throtl_grp *tg = pd_to_tg(pd); 361 362 blkg_rwstat_exit(&tg->stat_bytes); 363 blkg_rwstat_exit(&tg->stat_ios); 364 kfree(tg); 365 } 366 367 static void throtl_pd_free(struct blkg_policy_data *pd) 368 { 369 struct throtl_grp *tg = pd_to_tg(pd); 370 371 timer_delete_sync(&tg->service_queue.pending_timer); 372 call_rcu(&pd->rcu_head, tg_release); 373 } 374 375 static struct throtl_grp * 376 throtl_rb_first(struct throtl_service_queue *parent_sq) 377 { 378 struct rb_node *n; 379 380 n = rb_first_cached(&parent_sq->pending_tree); 381 WARN_ON_ONCE(!n); 382 if (!n) 383 return NULL; 384 return rb_entry_tg(n); 385 } 386 387 static void throtl_rb_erase(struct rb_node *n, 388 struct throtl_service_queue *parent_sq) 389 { 390 rb_erase_cached(n, &parent_sq->pending_tree); 391 RB_CLEAR_NODE(n); 392 } 393 394 static void update_min_dispatch_time(struct throtl_service_queue *parent_sq) 395 { 396 struct throtl_grp *tg; 397 398 tg = throtl_rb_first(parent_sq); 399 if (!tg) 400 return; 401 402 parent_sq->first_pending_disptime = tg->disptime; 403 } 404 405 static void tg_service_queue_add(struct throtl_grp *tg) 406 { 407 struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq; 408 struct rb_node **node = &parent_sq->pending_tree.rb_root.rb_node; 409 struct rb_node *parent = NULL; 410 struct throtl_grp *__tg; 411 unsigned long key = tg->disptime; 412 bool leftmost = true; 413 414 while (*node != NULL) { 415 parent = *node; 416 __tg = rb_entry_tg(parent); 417 418 if (time_before(key, __tg->disptime)) 419 node = &parent->rb_left; 420 else { 421 node = &parent->rb_right; 422 leftmost = false; 423 } 424 } 425 426 rb_link_node(&tg->rb_node, parent, node); 427 rb_insert_color_cached(&tg->rb_node, &parent_sq->pending_tree, 428 leftmost); 429 } 430 431 static void throtl_enqueue_tg(struct throtl_grp *tg) 432 { 433 if (!(tg->flags & THROTL_TG_PENDING)) { 434 tg_service_queue_add(tg); 435 tg->flags |= THROTL_TG_PENDING; 436 tg->service_queue.parent_sq->nr_pending++; 437 } 438 } 439 440 static void throtl_dequeue_tg(struct throtl_grp *tg) 441 { 442 if (tg->flags & THROTL_TG_PENDING) { 443 struct throtl_service_queue *parent_sq = 444 tg->service_queue.parent_sq; 445 446 throtl_rb_erase(&tg->rb_node, parent_sq); 447 --parent_sq->nr_pending; 448 tg->flags &= ~THROTL_TG_PENDING; 449 } 450 } 451 452 /* Call with queue lock held */ 453 static void throtl_schedule_pending_timer(struct throtl_service_queue *sq, 454 unsigned long expires) 455 { 456 unsigned long max_expire = jiffies + 8 * DFL_THROTL_SLICE; 457 458 /* 459 * Since we are adjusting the throttle limit dynamically, the sleep 460 * time calculated according to previous limit might be invalid. It's 461 * possible the cgroup sleep time is very long and no other cgroups 462 * have IO running so notify the limit changes. Make sure the cgroup 463 * doesn't sleep too long to avoid the missed notification. 464 */ 465 if (time_after(expires, max_expire)) 466 expires = max_expire; 467 mod_timer(&sq->pending_timer, expires); 468 throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu", 469 expires - jiffies, jiffies); 470 } 471 472 /** 473 * throtl_schedule_next_dispatch - schedule the next dispatch cycle 474 * @sq: the service_queue to schedule dispatch for 475 * @force: force scheduling 476 * 477 * Arm @sq->pending_timer so that the next dispatch cycle starts on the 478 * dispatch time of the first pending child. Returns %true if either timer 479 * is armed or there's no pending child left. %false if the current 480 * dispatch window is still open and the caller should continue 481 * dispatching. 482 * 483 * If @force is %true, the dispatch timer is always scheduled and this 484 * function is guaranteed to return %true. This is to be used when the 485 * caller can't dispatch itself and needs to invoke pending_timer 486 * unconditionally. Note that forced scheduling is likely to induce short 487 * delay before dispatch starts even if @sq->first_pending_disptime is not 488 * in the future and thus shouldn't be used in hot paths. 489 */ 490 static bool throtl_schedule_next_dispatch(struct throtl_service_queue *sq, 491 bool force) 492 { 493 /* any pending children left? */ 494 if (!sq->nr_pending) 495 return true; 496 497 update_min_dispatch_time(sq); 498 499 /* is the next dispatch time in the future? */ 500 if (force || time_after(sq->first_pending_disptime, jiffies)) { 501 throtl_schedule_pending_timer(sq, sq->first_pending_disptime); 502 return true; 503 } 504 505 /* tell the caller to continue dispatching */ 506 return false; 507 } 508 509 static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, 510 bool rw, unsigned long start) 511 { 512 tg->bytes_disp[rw] = 0; 513 tg->io_disp[rw] = 0; 514 515 /* 516 * Previous slice has expired. We must have trimmed it after last 517 * bio dispatch. That means since start of last slice, we never used 518 * that bandwidth. Do try to make use of that bandwidth while giving 519 * credit. 520 */ 521 if (time_after(start, tg->slice_start[rw])) 522 tg->slice_start[rw] = start; 523 524 tg->slice_end[rw] = jiffies + DFL_THROTL_SLICE; 525 throtl_log(&tg->service_queue, 526 "[%c] new slice with credit start=%lu end=%lu jiffies=%lu", 527 rw == READ ? 'R' : 'W', tg->slice_start[rw], 528 tg->slice_end[rw], jiffies); 529 } 530 531 static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw, 532 bool clear) 533 { 534 if (clear) { 535 tg->bytes_disp[rw] = 0; 536 tg->io_disp[rw] = 0; 537 } 538 tg->slice_start[rw] = jiffies; 539 tg->slice_end[rw] = jiffies + DFL_THROTL_SLICE; 540 541 throtl_log(&tg->service_queue, 542 "[%c] new slice start=%lu end=%lu jiffies=%lu", 543 rw == READ ? 'R' : 'W', tg->slice_start[rw], 544 tg->slice_end[rw], jiffies); 545 } 546 547 static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw, 548 unsigned long jiffy_end) 549 { 550 tg->slice_end[rw] = roundup(jiffy_end, DFL_THROTL_SLICE); 551 } 552 553 static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw, 554 unsigned long jiffy_end) 555 { 556 if (!time_before(tg->slice_end[rw], jiffy_end)) 557 return; 558 559 throtl_set_slice_end(tg, rw, jiffy_end); 560 throtl_log(&tg->service_queue, 561 "[%c] extend slice start=%lu end=%lu jiffies=%lu", 562 rw == READ ? 'R' : 'W', tg->slice_start[rw], 563 tg->slice_end[rw], jiffies); 564 } 565 566 /* Determine if previously allocated or extended slice is complete or not */ 567 static bool throtl_slice_used(struct throtl_grp *tg, bool rw) 568 { 569 if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw])) 570 return false; 571 572 return true; 573 } 574 575 static unsigned int sq_queued(struct throtl_service_queue *sq, int type) 576 { 577 return sq->nr_queued_bps[type] + sq->nr_queued_iops[type]; 578 } 579 580 static unsigned int calculate_io_allowed(u32 iops_limit, 581 unsigned long jiffy_elapsed) 582 { 583 unsigned int io_allowed; 584 u64 tmp; 585 586 /* 587 * jiffy_elapsed should not be a big value as minimum iops can be 588 * 1 then at max jiffy elapsed should be equivalent of 1 second as we 589 * will allow dispatch after 1 second and after that slice should 590 * have been trimmed. 591 */ 592 593 tmp = (u64)iops_limit * jiffy_elapsed; 594 do_div(tmp, HZ); 595 596 if (tmp > UINT_MAX) 597 io_allowed = UINT_MAX; 598 else 599 io_allowed = tmp; 600 601 return io_allowed; 602 } 603 604 static u64 calculate_bytes_allowed(u64 bps_limit, unsigned long jiffy_elapsed) 605 { 606 /* 607 * Can result be wider than 64 bits? 608 * We check against 62, not 64, due to ilog2 truncation. 609 */ 610 if (ilog2(bps_limit) + ilog2(jiffy_elapsed) - ilog2(HZ) > 62) 611 return U64_MAX; 612 return mul_u64_u64_div_u64(bps_limit, (u64)jiffy_elapsed, (u64)HZ); 613 } 614 615 static long long throtl_trim_bps(struct throtl_grp *tg, bool rw, 616 unsigned long time_elapsed) 617 { 618 u64 bps_limit = tg_bps_limit(tg, rw); 619 long long bytes_trim; 620 621 if (bps_limit == U64_MAX) 622 return 0; 623 624 /* Need to consider the case of bytes_allowed overflow. */ 625 bytes_trim = calculate_bytes_allowed(bps_limit, time_elapsed); 626 if (bytes_trim <= 0 || tg->bytes_disp[rw] < bytes_trim) { 627 bytes_trim = tg->bytes_disp[rw]; 628 tg->bytes_disp[rw] = 0; 629 } else { 630 tg->bytes_disp[rw] -= bytes_trim; 631 } 632 633 return bytes_trim; 634 } 635 636 static int throtl_trim_iops(struct throtl_grp *tg, bool rw, 637 unsigned long time_elapsed) 638 { 639 u32 iops_limit = tg_iops_limit(tg, rw); 640 int io_trim; 641 642 if (iops_limit == UINT_MAX) 643 return 0; 644 645 /* Need to consider the case of io_allowed overflow. */ 646 io_trim = calculate_io_allowed(iops_limit, time_elapsed); 647 if (io_trim <= 0 || tg->io_disp[rw] < io_trim) { 648 io_trim = tg->io_disp[rw]; 649 tg->io_disp[rw] = 0; 650 } else { 651 tg->io_disp[rw] -= io_trim; 652 } 653 654 return io_trim; 655 } 656 657 /* Trim the used slices and adjust slice start accordingly */ 658 static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) 659 { 660 unsigned long time_elapsed; 661 long long bytes_trim; 662 int io_trim; 663 664 BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw])); 665 666 /* 667 * If bps are unlimited (-1), then time slice don't get 668 * renewed. Don't try to trim the slice if slice is used. A new 669 * slice will start when appropriate. 670 */ 671 if (throtl_slice_used(tg, rw)) 672 return; 673 674 /* 675 * A bio has been dispatched. Also adjust slice_end. It might happen 676 * that initially cgroup limit was very low resulting in high 677 * slice_end, but later limit was bumped up and bio was dispatched 678 * sooner, then we need to reduce slice_end. A high bogus slice_end 679 * is bad because it does not allow new slice to start. 680 */ 681 throtl_set_slice_end(tg, rw, jiffies + DFL_THROTL_SLICE); 682 683 time_elapsed = rounddown(jiffies - tg->slice_start[rw], 684 DFL_THROTL_SLICE); 685 /* Don't trim slice until at least 2 slices are used */ 686 if (time_elapsed < DFL_THROTL_SLICE * 2) 687 return; 688 689 /* 690 * The bio submission time may be a few jiffies more than the expected 691 * waiting time, due to 'extra_bytes' can't be divided in 692 * tg_within_bps_limit(), and also due to timer wakeup delay. In this 693 * case, adjust slice_start will discard the extra wait time, causing 694 * lower rate than expected. Therefore, other than the above rounddown, 695 * one extra slice is preserved for deviation. 696 */ 697 time_elapsed -= DFL_THROTL_SLICE; 698 bytes_trim = throtl_trim_bps(tg, rw, time_elapsed); 699 io_trim = throtl_trim_iops(tg, rw, time_elapsed); 700 if (!bytes_trim && !io_trim) 701 return; 702 703 tg->slice_start[rw] += time_elapsed; 704 705 throtl_log(&tg->service_queue, 706 "[%c] trim slice nr=%lu bytes=%lld io=%d start=%lu end=%lu jiffies=%lu", 707 rw == READ ? 'R' : 'W', time_elapsed / DFL_THROTL_SLICE, 708 bytes_trim, io_trim, tg->slice_start[rw], tg->slice_end[rw], 709 jiffies); 710 } 711 712 static void __tg_update_carryover(struct throtl_grp *tg, bool rw, 713 long long *bytes, int *ios) 714 { 715 unsigned long jiffy_elapsed = jiffies - tg->slice_start[rw]; 716 u64 bps_limit = tg_bps_limit(tg, rw); 717 u32 iops_limit = tg_iops_limit(tg, rw); 718 long long bytes_allowed; 719 int io_allowed; 720 721 /* 722 * If the queue is empty, carryover handling is not needed. In such cases, 723 * tg->[bytes/io]_disp should be reset to 0 to avoid impacting the dispatch 724 * of subsequent bios. The same handling applies when the previous BPS/IOPS 725 * limit was set to max. 726 */ 727 if (sq_queued(&tg->service_queue, rw) == 0) { 728 tg->bytes_disp[rw] = 0; 729 tg->io_disp[rw] = 0; 730 return; 731 } 732 733 /* 734 * If config is updated while bios are still throttled, calculate and 735 * accumulate how many bytes/ios are waited across changes. And use the 736 * calculated carryover (@bytes/@ios) to update [bytes/io]_disp, which 737 * will be used to calculate new wait time under new configuration. 738 * And we need to consider the case of bytes/io_allowed overflow. 739 */ 740 if (bps_limit != U64_MAX) { 741 bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed); 742 if (bytes_allowed > 0) 743 *bytes = bytes_allowed - tg->bytes_disp[rw]; 744 } 745 if (iops_limit != UINT_MAX) { 746 io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed); 747 if (io_allowed > 0) 748 *ios = io_allowed - tg->io_disp[rw]; 749 } 750 751 tg->bytes_disp[rw] = -*bytes; 752 tg->io_disp[rw] = -*ios; 753 } 754 755 static void tg_update_carryover(struct throtl_grp *tg) 756 { 757 long long bytes[2] = {0}; 758 int ios[2] = {0}; 759 760 __tg_update_carryover(tg, READ, &bytes[READ], &ios[READ]); 761 __tg_update_carryover(tg, WRITE, &bytes[WRITE], &ios[WRITE]); 762 763 /* see comments in struct throtl_grp for meaning of carryover. */ 764 throtl_log(&tg->service_queue, "%s: %lld %lld %d %d\n", __func__, 765 bytes[READ], bytes[WRITE], ios[READ], ios[WRITE]); 766 } 767 768 static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio, 769 u32 iops_limit) 770 { 771 bool rw = bio_data_dir(bio); 772 int io_allowed; 773 unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; 774 775 jiffy_elapsed = jiffies - tg->slice_start[rw]; 776 777 /* Round up to the next throttle slice, wait time must be nonzero */ 778 jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, DFL_THROTL_SLICE); 779 io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd); 780 if (io_allowed > 0 && tg->io_disp[rw] + 1 <= io_allowed) 781 return 0; 782 783 /* Calc approx time to dispatch */ 784 jiffy_wait = jiffy_elapsed_rnd - jiffy_elapsed; 785 786 /* make sure at least one io can be dispatched after waiting */ 787 jiffy_wait = max(jiffy_wait, HZ / iops_limit + 1); 788 return jiffy_wait; 789 } 790 791 static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, 792 u64 bps_limit) 793 { 794 bool rw = bio_data_dir(bio); 795 long long bytes_allowed; 796 u64 extra_bytes; 797 unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; 798 unsigned int bio_size = throtl_bio_data_size(bio); 799 800 jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; 801 802 /* Slice has just started. Consider one slice interval */ 803 if (!jiffy_elapsed) 804 jiffy_elapsed_rnd = DFL_THROTL_SLICE; 805 806 jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, DFL_THROTL_SLICE); 807 bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd); 808 /* Need to consider the case of bytes_allowed overflow. */ 809 if ((bytes_allowed > 0 && tg->bytes_disp[rw] + bio_size <= bytes_allowed) 810 || bytes_allowed < 0) 811 return 0; 812 813 /* Calc approx time to dispatch */ 814 extra_bytes = tg->bytes_disp[rw] + bio_size - bytes_allowed; 815 jiffy_wait = div64_u64(extra_bytes * HZ, bps_limit); 816 817 if (!jiffy_wait) 818 jiffy_wait = 1; 819 820 /* 821 * This wait time is without taking into consideration the rounding 822 * up we did. Add that time also. 823 */ 824 jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed); 825 return jiffy_wait; 826 } 827 828 static void throtl_charge_bps_bio(struct throtl_grp *tg, struct bio *bio) 829 { 830 unsigned int bio_size = throtl_bio_data_size(bio); 831 832 /* Charge the bio to the group */ 833 if (!bio_flagged(bio, BIO_BPS_THROTTLED) && 834 !bio_flagged(bio, BIO_TG_BPS_THROTTLED)) { 835 bio_set_flag(bio, BIO_TG_BPS_THROTTLED); 836 tg->bytes_disp[bio_data_dir(bio)] += bio_size; 837 } 838 } 839 840 static void throtl_charge_iops_bio(struct throtl_grp *tg, struct bio *bio) 841 { 842 bio_clear_flag(bio, BIO_TG_BPS_THROTTLED); 843 tg->io_disp[bio_data_dir(bio)]++; 844 } 845 846 /* 847 * If previous slice expired, start a new one otherwise renew/extend existing 848 * slice to make sure it is at least throtl_slice interval long since now. New 849 * slice is started only for empty throttle group. If there is queued bio, that 850 * means there should be an active slice and it should be extended instead. 851 */ 852 static void tg_update_slice(struct throtl_grp *tg, bool rw) 853 { 854 if (throtl_slice_used(tg, rw) && 855 sq_queued(&tg->service_queue, rw) == 0) 856 throtl_start_new_slice(tg, rw, true); 857 else 858 throtl_extend_slice(tg, rw, jiffies + DFL_THROTL_SLICE); 859 } 860 861 static unsigned long tg_dispatch_bps_time(struct throtl_grp *tg, struct bio *bio) 862 { 863 bool rw = bio_data_dir(bio); 864 u64 bps_limit = tg_bps_limit(tg, rw); 865 unsigned long bps_wait; 866 867 /* no need to throttle if this bio's bytes have been accounted */ 868 if (bps_limit == U64_MAX || tg->flags & THROTL_TG_CANCELING || 869 bio_flagged(bio, BIO_BPS_THROTTLED) || 870 bio_flagged(bio, BIO_TG_BPS_THROTTLED)) 871 return 0; 872 873 tg_update_slice(tg, rw); 874 bps_wait = tg_within_bps_limit(tg, bio, bps_limit); 875 throtl_extend_slice(tg, rw, jiffies + bps_wait); 876 877 return bps_wait; 878 } 879 880 static unsigned long tg_dispatch_iops_time(struct throtl_grp *tg, struct bio *bio) 881 { 882 bool rw = bio_data_dir(bio); 883 u32 iops_limit = tg_iops_limit(tg, rw); 884 unsigned long iops_wait; 885 886 if (iops_limit == UINT_MAX || tg->flags & THROTL_TG_CANCELING) 887 return 0; 888 889 tg_update_slice(tg, rw); 890 iops_wait = tg_within_iops_limit(tg, bio, iops_limit); 891 throtl_extend_slice(tg, rw, jiffies + iops_wait); 892 893 return iops_wait; 894 } 895 896 /* 897 * Returns approx number of jiffies to wait before this bio is with-in IO rate 898 * and can be moved to other queue or dispatched. 899 */ 900 static unsigned long tg_dispatch_time(struct throtl_grp *tg, struct bio *bio) 901 { 902 bool rw = bio_data_dir(bio); 903 unsigned long wait; 904 905 /* 906 * Currently whole state machine of group depends on first bio 907 * queued in the group bio list. So one should not be calling 908 * this function with a different bio if there are other bios 909 * queued. 910 */ 911 BUG_ON(sq_queued(&tg->service_queue, rw) && 912 bio != throtl_peek_queued(&tg->service_queue.queued[rw])); 913 914 wait = tg_dispatch_bps_time(tg, bio); 915 if (wait != 0) 916 return wait; 917 918 /* 919 * Charge bps here because @bio will be directly placed into the 920 * iops queue afterward. 921 */ 922 throtl_charge_bps_bio(tg, bio); 923 924 return tg_dispatch_iops_time(tg, bio); 925 } 926 927 /** 928 * throtl_add_bio_tg - add a bio to the specified throtl_grp 929 * @bio: bio to add 930 * @qn: qnode to use 931 * @tg: the target throtl_grp 932 * 933 * Add @bio to @tg's service_queue using @qn. If @qn is not specified, 934 * tg->qnode_on_self[] is used. 935 */ 936 static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn, 937 struct throtl_grp *tg) 938 { 939 struct throtl_service_queue *sq = &tg->service_queue; 940 bool rw = bio_data_dir(bio); 941 942 if (!qn) 943 qn = &tg->qnode_on_self[rw]; 944 945 /* 946 * If @tg doesn't currently have any bios queued in the same 947 * direction, queueing @bio can change when @tg should be 948 * dispatched. Mark that @tg was empty. This is automatically 949 * cleared on the next tg_update_disptime(). 950 */ 951 if (sq_queued(sq, rw) == 0) 952 tg->flags |= THROTL_TG_WAS_EMPTY; 953 954 throtl_qnode_add_bio(bio, qn, sq); 955 956 /* 957 * Since we have split the queues, when the iops queue is 958 * previously empty and a new @bio is added into the first @qn, 959 * we also need to update the @tg->disptime. 960 */ 961 if (bio_flagged(bio, BIO_BPS_THROTTLED) && 962 bio == throtl_peek_queued(&sq->queued[rw])) 963 tg->flags |= THROTL_TG_IOPS_WAS_EMPTY; 964 965 throtl_enqueue_tg(tg); 966 } 967 968 static void tg_update_disptime(struct throtl_grp *tg) 969 { 970 struct throtl_service_queue *sq = &tg->service_queue; 971 unsigned long read_wait = -1, write_wait = -1, min_wait, disptime; 972 struct bio *bio; 973 974 bio = throtl_peek_queued(&sq->queued[READ]); 975 if (bio) 976 read_wait = tg_dispatch_time(tg, bio); 977 978 bio = throtl_peek_queued(&sq->queued[WRITE]); 979 if (bio) 980 write_wait = tg_dispatch_time(tg, bio); 981 982 min_wait = min(read_wait, write_wait); 983 disptime = jiffies + min_wait; 984 985 /* Update dispatch time */ 986 throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq); 987 tg->disptime = disptime; 988 tg_service_queue_add(tg); 989 990 /* see throtl_add_bio_tg() */ 991 tg->flags &= ~THROTL_TG_WAS_EMPTY; 992 tg->flags &= ~THROTL_TG_IOPS_WAS_EMPTY; 993 } 994 995 static void start_parent_slice_with_credit(struct throtl_grp *child_tg, 996 struct throtl_grp *parent_tg, bool rw) 997 { 998 if (throtl_slice_used(parent_tg, rw)) { 999 throtl_start_new_slice_with_credit(parent_tg, rw, 1000 child_tg->slice_start[rw]); 1001 } 1002 1003 } 1004 1005 static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw) 1006 { 1007 struct throtl_service_queue *sq = &tg->service_queue; 1008 struct throtl_service_queue *parent_sq = sq->parent_sq; 1009 struct throtl_grp *parent_tg = sq_to_tg(parent_sq); 1010 struct throtl_grp *tg_to_put = NULL; 1011 struct bio *bio; 1012 1013 /* 1014 * @bio is being transferred from @tg to @parent_sq. Popping a bio 1015 * from @tg may put its reference and @parent_sq might end up 1016 * getting released prematurely. Remember the tg to put and put it 1017 * after @bio is transferred to @parent_sq. 1018 */ 1019 bio = throtl_pop_queued(sq, &tg_to_put, rw); 1020 1021 throtl_charge_iops_bio(tg, bio); 1022 1023 /* 1024 * If our parent is another tg, we just need to transfer @bio to 1025 * the parent using throtl_add_bio_tg(). If our parent is 1026 * @td->service_queue, @bio is ready to be issued. Put it on its 1027 * bio_lists[] and decrease total number queued. The caller is 1028 * responsible for issuing these bios. 1029 */ 1030 if (parent_tg) { 1031 throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg); 1032 start_parent_slice_with_credit(tg, parent_tg, rw); 1033 } else { 1034 bio_set_flag(bio, BIO_BPS_THROTTLED); 1035 throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw], 1036 parent_sq); 1037 BUG_ON(tg->td->nr_queued[rw] <= 0); 1038 tg->td->nr_queued[rw]--; 1039 } 1040 1041 throtl_trim_slice(tg, rw); 1042 1043 if (tg_to_put) 1044 blkg_put(tg_to_blkg(tg_to_put)); 1045 } 1046 1047 static int throtl_dispatch_tg(struct throtl_grp *tg) 1048 { 1049 struct throtl_service_queue *sq = &tg->service_queue; 1050 unsigned int nr_reads = 0, nr_writes = 0; 1051 unsigned int max_nr_reads = THROTL_GRP_QUANTUM * 3 / 4; 1052 unsigned int max_nr_writes = THROTL_GRP_QUANTUM - max_nr_reads; 1053 struct bio *bio; 1054 1055 /* Try to dispatch 75% READS and 25% WRITES */ 1056 1057 while ((bio = throtl_peek_queued(&sq->queued[READ])) && 1058 tg_dispatch_time(tg, bio) == 0) { 1059 1060 tg_dispatch_one_bio(tg, READ); 1061 nr_reads++; 1062 1063 if (nr_reads >= max_nr_reads) 1064 break; 1065 } 1066 1067 while ((bio = throtl_peek_queued(&sq->queued[WRITE])) && 1068 tg_dispatch_time(tg, bio) == 0) { 1069 1070 tg_dispatch_one_bio(tg, WRITE); 1071 nr_writes++; 1072 1073 if (nr_writes >= max_nr_writes) 1074 break; 1075 } 1076 1077 return nr_reads + nr_writes; 1078 } 1079 1080 static int throtl_select_dispatch(struct throtl_service_queue *parent_sq) 1081 { 1082 unsigned int nr_disp = 0; 1083 1084 while (1) { 1085 struct throtl_grp *tg; 1086 struct throtl_service_queue *sq; 1087 1088 if (!parent_sq->nr_pending) 1089 break; 1090 1091 tg = throtl_rb_first(parent_sq); 1092 if (!tg) 1093 break; 1094 1095 if (time_before(jiffies, tg->disptime)) 1096 break; 1097 1098 nr_disp += throtl_dispatch_tg(tg); 1099 1100 sq = &tg->service_queue; 1101 if (sq_queued(sq, READ) || sq_queued(sq, WRITE)) 1102 tg_update_disptime(tg); 1103 else 1104 throtl_dequeue_tg(tg); 1105 1106 if (nr_disp >= THROTL_QUANTUM) 1107 break; 1108 } 1109 1110 return nr_disp; 1111 } 1112 1113 /** 1114 * throtl_pending_timer_fn - timer function for service_queue->pending_timer 1115 * @t: the pending_timer member of the throtl_service_queue being serviced 1116 * 1117 * This timer is armed when a child throtl_grp with active bio's become 1118 * pending and queued on the service_queue's pending_tree and expires when 1119 * the first child throtl_grp should be dispatched. This function 1120 * dispatches bio's from the children throtl_grps to the parent 1121 * service_queue. 1122 * 1123 * If the parent's parent is another throtl_grp, dispatching is propagated 1124 * by either arming its pending_timer or repeating dispatch directly. If 1125 * the top-level service_tree is reached, throtl_data->dispatch_work is 1126 * kicked so that the ready bio's are issued. 1127 */ 1128 static void throtl_pending_timer_fn(struct timer_list *t) 1129 { 1130 struct throtl_service_queue *sq = timer_container_of(sq, t, 1131 pending_timer); 1132 struct throtl_grp *tg = sq_to_tg(sq); 1133 struct throtl_data *td = sq_to_td(sq); 1134 struct throtl_service_queue *parent_sq; 1135 struct request_queue *q; 1136 bool dispatched; 1137 int ret; 1138 1139 /* throtl_data may be gone, so figure out request queue by blkg */ 1140 if (tg) 1141 q = tg->pd.blkg->q; 1142 else 1143 q = td->queue; 1144 1145 spin_lock_irq(&q->queue_lock); 1146 1147 if (!q->root_blkg) 1148 goto out_unlock; 1149 1150 again: 1151 parent_sq = sq->parent_sq; 1152 dispatched = false; 1153 1154 while (true) { 1155 unsigned int __maybe_unused bio_cnt_r = sq_queued(sq, READ); 1156 unsigned int __maybe_unused bio_cnt_w = sq_queued(sq, WRITE); 1157 1158 throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u", 1159 bio_cnt_r + bio_cnt_w, bio_cnt_r, bio_cnt_w); 1160 1161 ret = throtl_select_dispatch(sq); 1162 if (ret) { 1163 throtl_log(sq, "bios disp=%u", ret); 1164 dispatched = true; 1165 } 1166 1167 if (throtl_schedule_next_dispatch(sq, false)) 1168 break; 1169 1170 /* this dispatch windows is still open, relax and repeat */ 1171 spin_unlock_irq(&q->queue_lock); 1172 cpu_relax(); 1173 spin_lock_irq(&q->queue_lock); 1174 } 1175 1176 if (!dispatched) 1177 goto out_unlock; 1178 1179 if (parent_sq) { 1180 /* @parent_sq is another throl_grp, propagate dispatch */ 1181 if (tg->flags & THROTL_TG_WAS_EMPTY || 1182 tg->flags & THROTL_TG_IOPS_WAS_EMPTY) { 1183 tg_update_disptime(tg); 1184 if (!throtl_schedule_next_dispatch(parent_sq, false)) { 1185 /* window is already open, repeat dispatching */ 1186 sq = parent_sq; 1187 tg = sq_to_tg(sq); 1188 goto again; 1189 } 1190 } 1191 } else { 1192 /* reached the top-level, queue issuing */ 1193 queue_work(kthrotld_workqueue, &td->dispatch_work); 1194 } 1195 out_unlock: 1196 spin_unlock_irq(&q->queue_lock); 1197 } 1198 1199 /** 1200 * blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work 1201 * @work: work item being executed 1202 * 1203 * This function is queued for execution when bios reach the bio_lists[] 1204 * of throtl_data->service_queue. Those bios are ready and issued by this 1205 * function. 1206 */ 1207 static void blk_throtl_dispatch_work_fn(struct work_struct *work) 1208 { 1209 struct throtl_data *td = container_of(work, struct throtl_data, 1210 dispatch_work); 1211 struct throtl_service_queue *td_sq = &td->service_queue; 1212 struct request_queue *q = td->queue; 1213 struct bio_list bio_list_on_stack; 1214 struct bio *bio; 1215 struct blk_plug plug; 1216 int rw; 1217 1218 bio_list_init(&bio_list_on_stack); 1219 1220 spin_lock_irq(&q->queue_lock); 1221 for (rw = READ; rw <= WRITE; rw++) 1222 while ((bio = throtl_pop_queued(td_sq, NULL, rw))) 1223 bio_list_add(&bio_list_on_stack, bio); 1224 spin_unlock_irq(&q->queue_lock); 1225 1226 if (!bio_list_empty(&bio_list_on_stack)) { 1227 blk_start_plug(&plug); 1228 while ((bio = bio_list_pop(&bio_list_on_stack))) 1229 submit_bio_noacct_nocheck(bio, false); 1230 blk_finish_plug(&plug); 1231 } 1232 } 1233 1234 static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd, 1235 int off) 1236 { 1237 struct throtl_grp *tg = pd_to_tg(pd); 1238 u64 v = *(u64 *)((void *)tg + off); 1239 1240 if (v == U64_MAX) 1241 return 0; 1242 return __blkg_prfill_u64(sf, pd, v); 1243 } 1244 1245 static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd, 1246 int off) 1247 { 1248 struct throtl_grp *tg = pd_to_tg(pd); 1249 unsigned int v = *(unsigned int *)((void *)tg + off); 1250 1251 if (v == UINT_MAX) 1252 return 0; 1253 return __blkg_prfill_u64(sf, pd, v); 1254 } 1255 1256 static int tg_print_conf_u64(struct seq_file *sf, void *v) 1257 { 1258 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_u64, 1259 &blkcg_policy_throtl, seq_cft(sf)->private, false); 1260 return 0; 1261 } 1262 1263 static int tg_print_conf_uint(struct seq_file *sf, void *v) 1264 { 1265 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_uint, 1266 &blkcg_policy_throtl, seq_cft(sf)->private, false); 1267 return 0; 1268 } 1269 1270 static void tg_conf_updated(struct throtl_grp *tg, bool global) 1271 { 1272 struct throtl_service_queue *sq = &tg->service_queue; 1273 struct cgroup_subsys_state *pos_css; 1274 struct blkcg_gq *blkg; 1275 1276 throtl_log(&tg->service_queue, 1277 "limit change rbps=%llu wbps=%llu riops=%u wiops=%u", 1278 tg_bps_limit(tg, READ), tg_bps_limit(tg, WRITE), 1279 tg_iops_limit(tg, READ), tg_iops_limit(tg, WRITE)); 1280 1281 rcu_read_lock(); 1282 /* 1283 * Update has_rules[] flags for the updated tg's subtree. A tg is 1284 * considered to have rules if either the tg itself or any of its 1285 * ancestors has rules. This identifies groups without any 1286 * restrictions in the whole hierarchy and allows them to bypass 1287 * blk-throttle. 1288 */ 1289 blkg_for_each_descendant_pre(blkg, pos_css, 1290 global ? tg->td->queue->root_blkg : tg_to_blkg(tg)) { 1291 struct throtl_grp *this_tg = blkg_to_tg(blkg); 1292 1293 tg_update_has_rules(this_tg); 1294 /* ignore root/second level */ 1295 if (!cgroup_subsys_on_dfl(io_cgrp_subsys) || !blkg->parent || 1296 !blkg->parent->parent) 1297 continue; 1298 } 1299 rcu_read_unlock(); 1300 1301 /* 1302 * We're already holding queue_lock and know @tg is valid. Let's 1303 * apply the new config directly. 1304 * 1305 * Restart the slices for both READ and WRITES. It might happen 1306 * that a group's limit are dropped suddenly and we don't want to 1307 * account recently dispatched IO with new low rate. 1308 */ 1309 throtl_start_new_slice(tg, READ, false); 1310 throtl_start_new_slice(tg, WRITE, false); 1311 1312 if (tg->flags & THROTL_TG_PENDING) { 1313 tg_update_disptime(tg); 1314 throtl_schedule_next_dispatch(sq->parent_sq, true); 1315 } 1316 } 1317 1318 static int blk_throtl_init(struct gendisk *disk) 1319 { 1320 struct request_queue *q = disk->queue; 1321 struct throtl_data *td; 1322 unsigned int memflags; 1323 int ret; 1324 1325 td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); 1326 if (!td) 1327 return -ENOMEM; 1328 1329 INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); 1330 throtl_service_queue_init(&td->service_queue); 1331 1332 memflags = blk_mq_freeze_queue(disk->queue); 1333 blk_mq_quiesce_queue(disk->queue); 1334 1335 q->td = td; 1336 td->queue = q; 1337 1338 /* activate policy, blk_throtl_activated() will return true */ 1339 ret = blkcg_activate_policy(disk, &blkcg_policy_throtl); 1340 if (ret) { 1341 q->td = NULL; 1342 kfree(td); 1343 } 1344 1345 blk_mq_unquiesce_queue(disk->queue); 1346 blk_mq_unfreeze_queue(disk->queue, memflags); 1347 1348 return ret; 1349 } 1350 1351 1352 static ssize_t tg_set_conf(struct kernfs_open_file *of, 1353 char *buf, size_t nbytes, loff_t off, bool is_u64) 1354 { 1355 struct blkcg *blkcg = css_to_blkcg(of_css(of)); 1356 struct blkg_conf_ctx ctx; 1357 struct throtl_grp *tg; 1358 int ret; 1359 u64 v; 1360 1361 blkg_conf_init(&ctx, buf); 1362 1363 ret = blkg_conf_open_bdev(&ctx); 1364 if (ret) 1365 return ret; 1366 1367 if (!blk_throtl_activated(ctx.bdev->bd_queue)) { 1368 ret = blk_throtl_init(ctx.bdev->bd_disk); 1369 if (ret) 1370 goto close_bdev; 1371 } 1372 1373 ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx); 1374 if (ret) 1375 goto close_bdev; 1376 1377 ret = -EINVAL; 1378 if (sscanf(ctx.body, "%llu", &v) != 1) 1379 goto unprep; 1380 if (!v) 1381 v = U64_MAX; 1382 1383 tg = blkg_to_tg(ctx.blkg); 1384 tg_update_carryover(tg); 1385 1386 if (is_u64) 1387 *(u64 *)((void *)tg + of_cft(of)->private) = v; 1388 else 1389 *(unsigned int *)((void *)tg + of_cft(of)->private) = v; 1390 1391 tg_conf_updated(tg, false); 1392 ret = 0; 1393 1394 unprep: 1395 blkg_conf_unprep(&ctx); 1396 1397 close_bdev: 1398 blkg_conf_close_bdev(&ctx); 1399 return ret ?: nbytes; 1400 } 1401 1402 static ssize_t tg_set_conf_u64(struct kernfs_open_file *of, 1403 char *buf, size_t nbytes, loff_t off) 1404 { 1405 return tg_set_conf(of, buf, nbytes, off, true); 1406 } 1407 1408 static ssize_t tg_set_conf_uint(struct kernfs_open_file *of, 1409 char *buf, size_t nbytes, loff_t off) 1410 { 1411 return tg_set_conf(of, buf, nbytes, off, false); 1412 } 1413 1414 static int tg_print_rwstat(struct seq_file *sf, void *v) 1415 { 1416 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), 1417 blkg_prfill_rwstat, &blkcg_policy_throtl, 1418 seq_cft(sf)->private, true); 1419 return 0; 1420 } 1421 1422 static u64 tg_prfill_rwstat_recursive(struct seq_file *sf, 1423 struct blkg_policy_data *pd, int off) 1424 { 1425 struct blkg_rwstat_sample sum; 1426 1427 blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_throtl, off, 1428 &sum); 1429 return __blkg_prfill_rwstat(sf, pd, &sum); 1430 } 1431 1432 static int tg_print_rwstat_recursive(struct seq_file *sf, void *v) 1433 { 1434 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), 1435 tg_prfill_rwstat_recursive, &blkcg_policy_throtl, 1436 seq_cft(sf)->private, true); 1437 return 0; 1438 } 1439 1440 static struct cftype throtl_legacy_files[] = { 1441 { 1442 .name = "throttle.read_bps_device", 1443 .private = offsetof(struct throtl_grp, bps[READ]), 1444 .seq_show = tg_print_conf_u64, 1445 .write = tg_set_conf_u64, 1446 }, 1447 { 1448 .name = "throttle.write_bps_device", 1449 .private = offsetof(struct throtl_grp, bps[WRITE]), 1450 .seq_show = tg_print_conf_u64, 1451 .write = tg_set_conf_u64, 1452 }, 1453 { 1454 .name = "throttle.read_iops_device", 1455 .private = offsetof(struct throtl_grp, iops[READ]), 1456 .seq_show = tg_print_conf_uint, 1457 .write = tg_set_conf_uint, 1458 }, 1459 { 1460 .name = "throttle.write_iops_device", 1461 .private = offsetof(struct throtl_grp, iops[WRITE]), 1462 .seq_show = tg_print_conf_uint, 1463 .write = tg_set_conf_uint, 1464 }, 1465 { 1466 .name = "throttle.io_service_bytes", 1467 .private = offsetof(struct throtl_grp, stat_bytes), 1468 .seq_show = tg_print_rwstat, 1469 }, 1470 { 1471 .name = "throttle.io_service_bytes_recursive", 1472 .private = offsetof(struct throtl_grp, stat_bytes), 1473 .seq_show = tg_print_rwstat_recursive, 1474 }, 1475 { 1476 .name = "throttle.io_serviced", 1477 .private = offsetof(struct throtl_grp, stat_ios), 1478 .seq_show = tg_print_rwstat, 1479 }, 1480 { 1481 .name = "throttle.io_serviced_recursive", 1482 .private = offsetof(struct throtl_grp, stat_ios), 1483 .seq_show = tg_print_rwstat_recursive, 1484 }, 1485 { } /* terminate */ 1486 }; 1487 1488 static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd, 1489 int off) 1490 { 1491 struct throtl_grp *tg = pd_to_tg(pd); 1492 const char *dname = blkg_dev_name(pd->blkg); 1493 u64 bps_dft; 1494 unsigned int iops_dft; 1495 1496 if (!dname) 1497 return 0; 1498 1499 bps_dft = U64_MAX; 1500 iops_dft = UINT_MAX; 1501 1502 if (tg->bps[READ] == bps_dft && 1503 tg->bps[WRITE] == bps_dft && 1504 tg->iops[READ] == iops_dft && 1505 tg->iops[WRITE] == iops_dft) 1506 return 0; 1507 1508 seq_printf(sf, "%s", dname); 1509 if (tg->bps[READ] == U64_MAX) 1510 seq_printf(sf, " rbps=max"); 1511 else 1512 seq_printf(sf, " rbps=%llu", tg->bps[READ]); 1513 1514 if (tg->bps[WRITE] == U64_MAX) 1515 seq_printf(sf, " wbps=max"); 1516 else 1517 seq_printf(sf, " wbps=%llu", tg->bps[WRITE]); 1518 1519 if (tg->iops[READ] == UINT_MAX) 1520 seq_printf(sf, " riops=max"); 1521 else 1522 seq_printf(sf, " riops=%u", tg->iops[READ]); 1523 1524 if (tg->iops[WRITE] == UINT_MAX) 1525 seq_printf(sf, " wiops=max"); 1526 else 1527 seq_printf(sf, " wiops=%u", tg->iops[WRITE]); 1528 1529 seq_printf(sf, "\n"); 1530 return 0; 1531 } 1532 1533 static int tg_print_limit(struct seq_file *sf, void *v) 1534 { 1535 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_limit, 1536 &blkcg_policy_throtl, seq_cft(sf)->private, false); 1537 return 0; 1538 } 1539 1540 static ssize_t tg_set_limit(struct kernfs_open_file *of, 1541 char *buf, size_t nbytes, loff_t off) 1542 { 1543 struct blkcg *blkcg = css_to_blkcg(of_css(of)); 1544 struct blkg_conf_ctx ctx; 1545 struct throtl_grp *tg; 1546 u64 v[4]; 1547 int ret; 1548 1549 blkg_conf_init(&ctx, buf); 1550 1551 ret = blkg_conf_open_bdev(&ctx); 1552 if (ret) 1553 return ret; 1554 1555 if (!blk_throtl_activated(ctx.bdev->bd_queue)) { 1556 ret = blk_throtl_init(ctx.bdev->bd_disk); 1557 if (ret) 1558 goto close_bdev; 1559 } 1560 1561 ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx); 1562 if (ret) 1563 goto close_bdev; 1564 1565 tg = blkg_to_tg(ctx.blkg); 1566 tg_update_carryover(tg); 1567 1568 v[0] = tg->bps[READ]; 1569 v[1] = tg->bps[WRITE]; 1570 v[2] = tg->iops[READ]; 1571 v[3] = tg->iops[WRITE]; 1572 1573 while (true) { 1574 char tok[27]; /* wiops=18446744073709551616 */ 1575 char *p; 1576 u64 val = U64_MAX; 1577 int len; 1578 1579 if (sscanf(ctx.body, "%26s%n", tok, &len) != 1) 1580 break; 1581 if (tok[0] == '\0') 1582 break; 1583 ctx.body += len; 1584 1585 ret = -EINVAL; 1586 p = tok; 1587 strsep(&p, "="); 1588 if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max"))) 1589 goto unprep; 1590 1591 ret = -ERANGE; 1592 if (!val) 1593 goto unprep; 1594 1595 ret = -EINVAL; 1596 if (!strcmp(tok, "rbps")) 1597 v[0] = val; 1598 else if (!strcmp(tok, "wbps")) 1599 v[1] = val; 1600 else if (!strcmp(tok, "riops")) 1601 v[2] = min_t(u64, val, UINT_MAX); 1602 else if (!strcmp(tok, "wiops")) 1603 v[3] = min_t(u64, val, UINT_MAX); 1604 else 1605 goto unprep; 1606 } 1607 1608 tg->bps[READ] = v[0]; 1609 tg->bps[WRITE] = v[1]; 1610 tg->iops[READ] = v[2]; 1611 tg->iops[WRITE] = v[3]; 1612 1613 tg_conf_updated(tg, false); 1614 ret = 0; 1615 unprep: 1616 blkg_conf_unprep(&ctx); 1617 close_bdev: 1618 blkg_conf_close_bdev(&ctx); 1619 return ret ?: nbytes; 1620 } 1621 1622 static struct cftype throtl_files[] = { 1623 { 1624 .name = "max", 1625 .flags = CFTYPE_NOT_ON_ROOT, 1626 .seq_show = tg_print_limit, 1627 .write = tg_set_limit, 1628 }, 1629 { } /* terminate */ 1630 }; 1631 1632 static void throtl_shutdown_wq(struct request_queue *q) 1633 { 1634 struct throtl_data *td = q->td; 1635 1636 cancel_work_sync(&td->dispatch_work); 1637 } 1638 1639 static void tg_flush_bios(struct throtl_grp *tg) 1640 { 1641 struct throtl_service_queue *sq = &tg->service_queue; 1642 1643 if (tg->flags & THROTL_TG_CANCELING) 1644 return; 1645 /* 1646 * Set the flag to make sure throtl_pending_timer_fn() won't 1647 * stop until all throttled bios are dispatched. 1648 */ 1649 tg->flags |= THROTL_TG_CANCELING; 1650 1651 /* 1652 * Do not dispatch cgroup without THROTL_TG_PENDING or cgroup 1653 * will be inserted to service queue without THROTL_TG_PENDING 1654 * set in tg_update_disptime below. Then IO dispatched from 1655 * child in tg_dispatch_one_bio will trigger double insertion 1656 * and corrupt the tree. 1657 */ 1658 if (!(tg->flags & THROTL_TG_PENDING)) 1659 return; 1660 1661 /* 1662 * Update disptime after setting the above flag to make sure 1663 * throtl_select_dispatch() won't exit without dispatching. 1664 */ 1665 tg_update_disptime(tg); 1666 1667 throtl_schedule_next_dispatch(sq->parent_sq, true); 1668 } 1669 1670 static void throtl_pd_offline(struct blkg_policy_data *pd) 1671 { 1672 tg_flush_bios(pd_to_tg(pd)); 1673 } 1674 1675 struct blkcg_policy blkcg_policy_throtl = { 1676 .dfl_cftypes = throtl_files, 1677 .legacy_cftypes = throtl_legacy_files, 1678 1679 .pd_alloc_fn = throtl_pd_alloc, 1680 .pd_init_fn = throtl_pd_init, 1681 .pd_online_fn = throtl_pd_online, 1682 .pd_offline_fn = throtl_pd_offline, 1683 .pd_free_fn = throtl_pd_free, 1684 }; 1685 1686 static void tg_cancel_writeback_bios(struct throtl_grp *tg, 1687 struct bio_list *cancel_bios) 1688 { 1689 struct throtl_service_queue *sq = &tg->service_queue; 1690 struct throtl_data *td = sq_to_td(sq); 1691 int rw; 1692 1693 if (tg->flags & THROTL_TG_CANCELING) 1694 return; 1695 tg->flags |= THROTL_TG_CANCELING; 1696 1697 for (rw = READ; rw <= WRITE; rw++) { 1698 struct throtl_qnode *qn, *tmp; 1699 unsigned int nr_bios = 0; 1700 1701 list_for_each_entry_safe(qn, tmp, &sq->queued[rw], node) { 1702 struct bio *bio; 1703 1704 while ((bio = bio_list_pop(&qn->bios_iops))) { 1705 sq->nr_queued_iops[rw]--; 1706 bio_list_add(&cancel_bios[rw], bio); 1707 nr_bios++; 1708 } 1709 while ((bio = bio_list_pop(&qn->bios_bps))) { 1710 sq->nr_queued_bps[rw]--; 1711 bio_list_add(&cancel_bios[rw], bio); 1712 nr_bios++; 1713 } 1714 1715 list_del_init(&qn->node); 1716 blkg_put(tg_to_blkg(qn->tg)); 1717 } 1718 1719 td->nr_queued[rw] -= nr_bios; 1720 } 1721 1722 throtl_dequeue_tg(tg); 1723 } 1724 1725 void blk_throtl_cancel_bios(struct gendisk *disk) 1726 { 1727 struct request_queue *q = disk->queue; 1728 struct cgroup_subsys_state *pos_css; 1729 struct blkcg_gq *blkg; 1730 struct bio_list cancel_bios[2] = { }; 1731 int rw; 1732 1733 if (!blk_throtl_activated(q)) 1734 return; 1735 1736 spin_lock_irq(&q->queue_lock); 1737 /* 1738 * queue_lock is held, rcu lock is not needed here technically. 1739 * However, rcu lock is still held to emphasize that following 1740 * path need RCU protection and to prevent warning from lockdep. 1741 */ 1742 rcu_read_lock(); 1743 blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) { 1744 /* 1745 * disk_release will call pd_offline_fn to cancel bios. 1746 * However, disk_release can't be called if someone get 1747 * the refcount of device and issued bios which are 1748 * inflight after del_gendisk. 1749 * Cancel bios here to ensure no bios are inflight after 1750 * del_gendisk. 1751 */ 1752 tg_cancel_writeback_bios(blkg_to_tg(blkg), cancel_bios); 1753 } 1754 rcu_read_unlock(); 1755 spin_unlock_irq(&q->queue_lock); 1756 1757 for (rw = READ; rw <= WRITE; rw++) { 1758 struct bio *bio; 1759 while ((bio = bio_list_pop(&cancel_bios[rw]))) 1760 bio_io_error(bio); 1761 } 1762 } 1763 1764 static bool tg_within_limit(struct throtl_grp *tg, struct bio *bio, bool rw) 1765 { 1766 struct throtl_service_queue *sq = &tg->service_queue; 1767 1768 /* 1769 * For a split bio, we need to specifically distinguish whether the 1770 * iops queue is empty. 1771 */ 1772 if (bio_flagged(bio, BIO_BPS_THROTTLED)) 1773 return sq->nr_queued_iops[rw] == 0 && 1774 tg_dispatch_iops_time(tg, bio) == 0; 1775 1776 /* 1777 * Throtl is FIFO - if bios are already queued, should queue. 1778 * If the bps queue is empty and @bio is within the bps limit, charge 1779 * bps here for direct placement into the iops queue. 1780 */ 1781 if (sq_queued(&tg->service_queue, rw)) { 1782 if (sq->nr_queued_bps[rw] == 0 && 1783 tg_dispatch_bps_time(tg, bio) == 0) 1784 throtl_charge_bps_bio(tg, bio); 1785 1786 return false; 1787 } 1788 1789 return tg_dispatch_time(tg, bio) == 0; 1790 } 1791 1792 bool __blk_throtl_bio(struct bio *bio) 1793 { 1794 struct request_queue *q = bdev_get_queue(bio->bi_bdev); 1795 struct blkcg_gq *blkg = bio->bi_blkg; 1796 struct throtl_qnode *qn = NULL; 1797 struct throtl_grp *tg = blkg_to_tg(blkg); 1798 struct throtl_service_queue *sq; 1799 bool rw = bio_data_dir(bio); 1800 bool throttled = false; 1801 struct throtl_data *td = tg->td; 1802 1803 rcu_read_lock(); 1804 spin_lock_irq(&q->queue_lock); 1805 sq = &tg->service_queue; 1806 1807 while (true) { 1808 if (tg_within_limit(tg, bio, rw)) { 1809 /* within limits, let's charge and dispatch directly */ 1810 throtl_charge_iops_bio(tg, bio); 1811 1812 /* 1813 * We need to trim slice even when bios are not being 1814 * queued otherwise it might happen that a bio is not 1815 * queued for a long time and slice keeps on extending 1816 * and trim is not called for a long time. Now if limits 1817 * are reduced suddenly we take into account all the IO 1818 * dispatched so far at new low rate and * newly queued 1819 * IO gets a really long dispatch time. 1820 * 1821 * So keep on trimming slice even if bio is not queued. 1822 */ 1823 throtl_trim_slice(tg, rw); 1824 } else if (bio_issue_as_root_blkg(bio)) { 1825 /* 1826 * IOs which may cause priority inversions are 1827 * dispatched directly, even if they're over limit. 1828 * 1829 * Charge and dispatch directly, and our throttle 1830 * control algorithm is adaptive, and extra IO bytes 1831 * will be throttled for paying the debt 1832 */ 1833 throtl_charge_bps_bio(tg, bio); 1834 throtl_charge_iops_bio(tg, bio); 1835 } else { 1836 /* if above limits, break to queue */ 1837 break; 1838 } 1839 1840 /* 1841 * @bio passed through this layer without being throttled. 1842 * Climb up the ladder. If we're already at the top, it 1843 * can be executed directly. 1844 */ 1845 qn = &tg->qnode_on_parent[rw]; 1846 sq = sq->parent_sq; 1847 tg = sq_to_tg(sq); 1848 if (!tg) { 1849 bio_set_flag(bio, BIO_BPS_THROTTLED); 1850 goto out_unlock; 1851 } 1852 } 1853 1854 /* out-of-limit, queue to @tg */ 1855 throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d", 1856 rw == READ ? 'R' : 'W', 1857 tg->bytes_disp[rw], bio->bi_iter.bi_size, 1858 tg_bps_limit(tg, rw), 1859 tg->io_disp[rw], tg_iops_limit(tg, rw), 1860 sq_queued(sq, READ), sq_queued(sq, WRITE)); 1861 1862 td->nr_queued[rw]++; 1863 throtl_add_bio_tg(bio, qn, tg); 1864 throttled = true; 1865 1866 /* 1867 * Update @tg's dispatch time and force schedule dispatch if @tg 1868 * was empty before @bio, or the iops queue is empty and @bio will 1869 * add to. The forced scheduling isn't likely to cause undue 1870 * delay as @bio is likely to be dispatched directly if its @tg's 1871 * disptime is not in the future. 1872 */ 1873 if (tg->flags & THROTL_TG_WAS_EMPTY || 1874 tg->flags & THROTL_TG_IOPS_WAS_EMPTY) { 1875 tg_update_disptime(tg); 1876 throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true); 1877 } 1878 1879 out_unlock: 1880 spin_unlock_irq(&q->queue_lock); 1881 1882 rcu_read_unlock(); 1883 return throttled; 1884 } 1885 1886 void blk_throtl_exit(struct gendisk *disk) 1887 { 1888 struct request_queue *q = disk->queue; 1889 1890 /* 1891 * blkg_destroy_all() already deactivate throtl policy, just check and 1892 * free throtl data. 1893 */ 1894 if (!q->td) 1895 return; 1896 1897 timer_delete_sync(&q->td->service_queue.pending_timer); 1898 throtl_shutdown_wq(q); 1899 kfree(q->td); 1900 } 1901 1902 static int __init throtl_init(void) 1903 { 1904 kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM | WQ_PERCPU, 0); 1905 if (!kthrotld_workqueue) 1906 panic("Failed to create kthrotld\n"); 1907 1908 return blkcg_policy_register(&blkcg_policy_throtl); 1909 } 1910 1911 module_init(throtl_init); 1912