// SPDX-License-Identifier: GPL-2.0
/*
 * Interface for controlling IO bandwidth on a request queue
 *
 * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com>
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/blktrace_api.h>
#include "blk.h"
#include "blk-cgroup-rwstat.h"
#include "blk-throttle.h"

/* Max dispatch from a group in 1 round */
#define THROTL_GRP_QUANTUM 8

/* Total max dispatch from all groups in one round */
#define THROTL_QUANTUM 32

/* Throttling is performed over a slice and after that slice is renewed */
#define DFL_THROTL_SLICE (HZ / 10)

/* A workqueue to queue throttle related work */
static struct workqueue_struct *kthrotld_workqueue;

#define rb_entry_tg(node)	rb_entry((node), struct throtl_grp, rb_node)

struct throtl_data
{
	/* service tree for active throtl groups */
	struct throtl_service_queue service_queue;

	struct request_queue *queue;

	/* Total Number of queued bios on READ and WRITE lists */
	unsigned int nr_queued[2];

	/* Work for dispatching throttled bios */
	struct work_struct dispatch_work;
};

static void throtl_pending_timer_fn(struct timer_list *t);

static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
{
	return pd_to_blkg(&tg->pd);
}

/**
 * sq_to_tg - return the throtl_grp the specified service queue belongs to
 * @sq: the throtl_service_queue of interest
 *
 * Return the throtl_grp @sq belongs to.  If @sq is the top-level one
 * embedded in throtl_data, %NULL is returned.
 */
static struct throtl_grp *sq_to_tg(struct throtl_service_queue *sq)
{
	if (sq && sq->parent_sq)
		return container_of(sq, struct throtl_grp, service_queue);
	else
		return NULL;
}

/**
 * sq_to_td - return throtl_data the specified service queue belongs to
 * @sq: the throtl_service_queue of interest
 *
 * A service_queue can be embedded in either a throtl_grp or throtl_data.
 * Determine the associated throtl_data accordingly and return it.
 */
static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
{
	struct throtl_grp *tg = sq_to_tg(sq);

	if (tg)
		return tg->td;
	else
		return container_of(sq, struct throtl_data, service_queue);
}

static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw)
{
	struct blkcg_gq *blkg = tg_to_blkg(tg);

	if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent)
		return U64_MAX;

	return tg->bps[rw];
}

static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw)
{
	struct blkcg_gq *blkg = tg_to_blkg(tg);

	if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent)
		return UINT_MAX;

	return tg->iops[rw];
}

/**
 * throtl_log - log debug message via blktrace
 * @sq: the service_queue being reported
 * @fmt: printf format string
 * @args: printf args
 *
 * The messages are prefixed with "throtl BLKG_NAME" if @sq belongs to a
 * throtl_grp; otherwise, just "throtl".
 */
#define throtl_log(sq, fmt, args...)	do {				\
	struct throtl_grp *__tg = sq_to_tg((sq));			\
	struct throtl_data *__td = sq_to_td((sq));			\
									\
	(void)__td;							\
	if (likely(!blk_trace_note_message_enabled(__td->queue)))	\
		break;							\
	if ((__tg)) {							\
		blk_add_cgroup_trace_msg(__td->queue,			\
			&tg_to_blkg(__tg)->blkcg->css, "throtl " fmt, ##args);\
	} else {							\
		blk_add_trace_msg(__td->queue, "throtl " fmt, ##args);	\
	}								\
} while (0)
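
/*
 * Illustrative example: with blktrace message logging enabled,
 * throtl_log(&tg->service_queue, "bios disp=%u", ret) emits a trace
 * message along the lines of "throtl <blkg-name> bios disp=2" for a
 * throtl_grp, or just "throtl bios disp=2" for the top-level service
 * queue (see the prefixing rule documented above).
 */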

static inline unsigned int throtl_bio_data_size(struct bio *bio)
{
	/* assume it's one sector */
	if (unlikely(bio_op(bio) == REQ_OP_DISCARD))
		return 512;
	return bio->bi_iter.bi_size;
}

static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
{
	INIT_LIST_HEAD(&qn->node);
	bio_list_init(&qn->bios_bps);
	bio_list_init(&qn->bios_iops);
	qn->tg = tg;
}

/**
 * throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it
 * @bio: bio being added
 * @qn: qnode to add bio to
 * @sq: the service_queue @qn belongs to
 *
 * Add @bio to @qn and put @qn on @sq->queued if it's not already on.
 * @qn->tg's reference count is bumped when @qn is activated.  See the
 * comment on top of throtl_qnode definition for details.
 */
static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn,
				 struct throtl_service_queue *sq)
{
	bool rw = bio_data_dir(bio);

	/*
	 * Split bios have already been throttled by bps, so they are
	 * directly queued into the iops path.
	 */
	if (bio_flagged(bio, BIO_TG_BPS_THROTTLED) ||
	    bio_flagged(bio, BIO_BPS_THROTTLED)) {
		bio_list_add(&qn->bios_iops, bio);
		sq->nr_queued_iops[rw]++;
	} else {
		bio_list_add(&qn->bios_bps, bio);
		sq->nr_queued_bps[rw]++;
	}

	if (list_empty(&qn->node)) {
		list_add_tail(&qn->node, &sq->queued[rw]);
		blkg_get(tg_to_blkg(qn->tg));
	}
}

/**
 * throtl_peek_queued - peek the first bio on a qnode list
 * @queued: the qnode list to peek
 *
 * Always take a bio from the head of the iops queue first.  If the queue is
 * empty, we then take it from the bps queue to maintain the overall idea of
 * fetching bios from the head.
 */
static struct bio *throtl_peek_queued(struct list_head *queued)
{
	struct throtl_qnode *qn;
	struct bio *bio;

	if (list_empty(queued))
		return NULL;

	qn = list_first_entry(queued, struct throtl_qnode, node);
	bio = bio_list_peek(&qn->bios_iops);
	if (!bio)
		bio = bio_list_peek(&qn->bios_bps);
	WARN_ON_ONCE(!bio);
	return bio;
}

/**
 * throtl_pop_queued - pop the first bio from a qnode list
 * @sq: the service_queue to pop a bio from
 * @tg_to_put: optional out argument for throtl_grp to put
 * @rw: read/write
 *
 * Pop the first bio from the qnode list @sq->queued.  Note that we look at
 * the iops list first because bios are ultimately dispatched from it.
 * After popping, the first qnode is removed from @sq->queued if empty or moved
 * to the end of @sq->queued so that the popping order is round-robin.
 *
 * When the first qnode is removed, its associated throtl_grp should be put
 * too.  If @tg_to_put is NULL, this function automatically puts it;
 * otherwise, *@tg_to_put is set to the throtl_grp to put and the caller is
 * responsible for putting it.
 */
static struct bio *throtl_pop_queued(struct throtl_service_queue *sq,
				     struct throtl_grp **tg_to_put, bool rw)
{
	struct list_head *queued = &sq->queued[rw];
	struct throtl_qnode *qn;
	struct bio *bio;

	if (list_empty(queued))
		return NULL;

	qn = list_first_entry(queued, struct throtl_qnode, node);
	bio = bio_list_pop(&qn->bios_iops);
	if (bio) {
		sq->nr_queued_iops[rw]--;
	} else {
		bio = bio_list_pop(&qn->bios_bps);
		if (bio)
			sq->nr_queued_bps[rw]--;
	}
	WARN_ON_ONCE(!bio);

	if (bio_list_empty(&qn->bios_bps) && bio_list_empty(&qn->bios_iops)) {
		list_del_init(&qn->node);
		if (tg_to_put)
			*tg_to_put = qn->tg;
		else
			blkg_put(tg_to_blkg(qn->tg));
	} else {
		list_move_tail(&qn->node, queued);
	}

	return bio;
}

/* init a service_queue, assumes the caller zeroed it */
static void throtl_service_queue_init(struct throtl_service_queue *sq)
{
	INIT_LIST_HEAD(&sq->queued[READ]);
	INIT_LIST_HEAD(&sq->queued[WRITE]);
	sq->pending_tree = RB_ROOT_CACHED;
	timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0);
}

static struct blkg_policy_data *throtl_pd_alloc(struct gendisk *disk,
		struct blkcg *blkcg, gfp_t gfp)
{
	struct throtl_grp *tg;
	int rw;

	tg = kzalloc_node(sizeof(*tg), gfp, disk->node_id);
	if (!tg)
		return NULL;

	if (blkg_rwstat_init(&tg->stat_bytes, gfp))
		goto err_free_tg;

	if (blkg_rwstat_init(&tg->stat_ios, gfp))
		goto err_exit_stat_bytes;

	throtl_service_queue_init(&tg->service_queue);

	for (rw = READ; rw <= WRITE; rw++) {
		throtl_qnode_init(&tg->qnode_on_self[rw], tg);
		throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
	}

	RB_CLEAR_NODE(&tg->rb_node);
	tg->bps[READ] = U64_MAX;
	tg->bps[WRITE] = U64_MAX;
	tg->iops[READ] = UINT_MAX;
	tg->iops[WRITE] = UINT_MAX;

	return &tg->pd;

err_exit_stat_bytes:
	blkg_rwstat_exit(&tg->stat_bytes);
err_free_tg:
	kfree(tg);
	return NULL;
}

static void throtl_pd_init(struct blkg_policy_data *pd)
{
	struct throtl_grp *tg = pd_to_tg(pd);
	struct blkcg_gq *blkg = tg_to_blkg(tg);
	struct throtl_data *td = blkg->q->td;
	struct throtl_service_queue *sq = &tg->service_queue;

	/*
	 * If on the default hierarchy, we switch to properly hierarchical
	 * behavior where limits on a given throtl_grp are applied to the
	 * whole subtree rather than just the group itself.  e.g. If 16M
	 * read_bps limit is set on a parent group, summary bps of
	 * parent group and its subtree groups can't exceed 16M for the
	 * device.
	 *
	 * If not on the default hierarchy, the broken flat hierarchy
	 * behavior is retained where all throtl_grps are treated as if
	 * they're all separate root groups right below throtl_data.
	 * Limits of a group don't interact with limits of other groups
	 * regardless of the position of the group in the hierarchy.
	 */
	sq->parent_sq = &td->service_queue;
	if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent)
		sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
	tg->td = td;
}

/*
 * Set has_rules[] if @tg or any of its parents have limits configured.
 * This doesn't require walking up to the top of the hierarchy as the
 * parent's has_rules[] is guaranteed to be correct.
 */
static void tg_update_has_rules(struct throtl_grp *tg)
{
	struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq);
	int rw;

	for (rw = READ; rw <= WRITE; rw++) {
		tg->has_rules_iops[rw] =
			(parent_tg && parent_tg->has_rules_iops[rw]) ||
			tg_iops_limit(tg, rw) != UINT_MAX;
		tg->has_rules_bps[rw] =
			(parent_tg && parent_tg->has_rules_bps[rw]) ||
			tg_bps_limit(tg, rw) != U64_MAX;
	}
}

static void throtl_pd_online(struct blkg_policy_data *pd)
{
	struct throtl_grp *tg = pd_to_tg(pd);
	/*
	 * We don't want new groups to escape the limits of their ancestors.
	 * Update has_rules[] after a new group is brought online.
	 */
	tg_update_has_rules(tg);
}

static void throtl_pd_free(struct blkg_policy_data *pd)
{
	struct throtl_grp *tg = pd_to_tg(pd);

	timer_delete_sync(&tg->service_queue.pending_timer);
	blkg_rwstat_exit(&tg->stat_bytes);
	blkg_rwstat_exit(&tg->stat_ios);
	kfree(tg);
}

static struct throtl_grp *
throtl_rb_first(struct throtl_service_queue *parent_sq)
{
	struct rb_node *n;

	n = rb_first_cached(&parent_sq->pending_tree);
	WARN_ON_ONCE(!n);
	if (!n)
		return NULL;
	return rb_entry_tg(n);
}

static void throtl_rb_erase(struct rb_node *n,
			    struct throtl_service_queue *parent_sq)
{
	rb_erase_cached(n, &parent_sq->pending_tree);
	RB_CLEAR_NODE(n);
}

static void update_min_dispatch_time(struct throtl_service_queue *parent_sq)
{
	struct throtl_grp *tg;

	tg = throtl_rb_first(parent_sq);
	if (!tg)
		return;

	parent_sq->first_pending_disptime = tg->disptime;
}

static void tg_service_queue_add(struct throtl_grp *tg)
{
	struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq;
	struct rb_node **node = &parent_sq->pending_tree.rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct throtl_grp *__tg;
	unsigned long key = tg->disptime;
	bool leftmost = true;

	while (*node != NULL) {
		parent = *node;
		__tg = rb_entry_tg(parent);

		if (time_before(key, __tg->disptime))
			node = &parent->rb_left;
		else {
			node = &parent->rb_right;
			leftmost = false;
		}
	}

	rb_link_node(&tg->rb_node, parent, node);
	rb_insert_color_cached(&tg->rb_node, &parent_sq->pending_tree,
			       leftmost);
}

static void throtl_enqueue_tg(struct throtl_grp *tg)
{
	if (!(tg->flags & THROTL_TG_PENDING)) {
		tg_service_queue_add(tg);
		tg->flags |= THROTL_TG_PENDING;
		tg->service_queue.parent_sq->nr_pending++;
	}
}

static void throtl_dequeue_tg(struct throtl_grp *tg)
{
	if (tg->flags & THROTL_TG_PENDING) {
		struct throtl_service_queue *parent_sq =
			tg->service_queue.parent_sq;

		throtl_rb_erase(&tg->rb_node, parent_sq);
		--parent_sq->nr_pending;
		tg->flags &= ~THROTL_TG_PENDING;
	}
}

/* Call with queue lock held */
static void throtl_schedule_pending_timer(struct throtl_service_queue *sq,
					  unsigned long expires)
{
	unsigned long max_expire = jiffies + 8 * DFL_THROTL_SLICE;

	/*
	 * Since we are adjusting the throttle limit dynamically, the sleep
	 * time calculated according to the previous limit might be invalid.
	 * It's possible the cgroup's sleep time is very long while no other
	 * cgroup has IO running, so it would miss the limit-change
	 * notification.  Make sure the cgroup doesn't sleep too long to
	 * avoid missing that notification.
	 */
	if (time_after(expires, max_expire))
		expires = max_expire;
	mod_timer(&sq->pending_timer, expires);
	throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu",
		   expires - jiffies, jiffies);
}
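
/*
 * Illustration of the cap above: DFL_THROTL_SLICE is HZ / 10, so assuming
 * HZ=1000 the slice is 100ms and a pending timer is never armed more than
 * 8 * 100ms = 800ms into the future, even if the wait computed under the
 * old limits was much longer.
 */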

/**
 * throtl_schedule_next_dispatch - schedule the next dispatch cycle
 * @sq: the service_queue to schedule dispatch for
 * @force: force scheduling
 *
 * Arm @sq->pending_timer so that the next dispatch cycle starts on the
 * dispatch time of the first pending child.  Returns %true if either timer
 * is armed or there's no pending child left.  %false if the current
 * dispatch window is still open and the caller should continue
 * dispatching.
 *
 * If @force is %true, the dispatch timer is always scheduled and this
 * function is guaranteed to return %true.  This is to be used when the
 * caller can't dispatch itself and needs to invoke pending_timer
 * unconditionally.  Note that forced scheduling is likely to induce short
 * delay before dispatch starts even if @sq->first_pending_disptime is not
 * in the future and thus shouldn't be used in hot paths.
 */
static bool throtl_schedule_next_dispatch(struct throtl_service_queue *sq,
					  bool force)
{
	/* any pending children left? */
	if (!sq->nr_pending)
		return true;

	update_min_dispatch_time(sq);

	/* is the next dispatch time in the future? */
	if (force || time_after(sq->first_pending_disptime, jiffies)) {
		throtl_schedule_pending_timer(sq, sq->first_pending_disptime);
		return true;
	}

	/* tell the caller to continue dispatching */
	return false;
}

static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
		bool rw, unsigned long start)
{
	tg->bytes_disp[rw] = 0;
	tg->io_disp[rw] = 0;

	/*
	 * Previous slice has expired.  We must have trimmed it after last
	 * bio dispatch.  That means since start of last slice, we never used
	 * that bandwidth.  Do try to make use of that bandwidth while giving
	 * credit.
	 */
	if (time_after(start, tg->slice_start[rw]))
		tg->slice_start[rw] = start;

	tg->slice_end[rw] = jiffies + DFL_THROTL_SLICE;
	throtl_log(&tg->service_queue,
		   "[%c] new slice with credit start=%lu end=%lu jiffies=%lu",
		   rw == READ ? 'R' : 'W', tg->slice_start[rw],
		   tg->slice_end[rw], jiffies);
}

static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw,
					  bool clear)
{
	if (clear) {
		tg->bytes_disp[rw] = 0;
		tg->io_disp[rw] = 0;
	}
	tg->slice_start[rw] = jiffies;
	tg->slice_end[rw] = jiffies + DFL_THROTL_SLICE;

	throtl_log(&tg->service_queue,
		   "[%c] new slice start=%lu end=%lu jiffies=%lu",
		   rw == READ ? 'R' : 'W', tg->slice_start[rw],
		   tg->slice_end[rw], jiffies);
}

static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
					unsigned long jiffy_end)
{
	tg->slice_end[rw] = roundup(jiffy_end, DFL_THROTL_SLICE);
}

static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
				       unsigned long jiffy_end)
{
	if (!time_before(tg->slice_end[rw], jiffy_end))
		return;

	throtl_set_slice_end(tg, rw, jiffy_end);
	throtl_log(&tg->service_queue,
		   "[%c] extend slice start=%lu end=%lu jiffies=%lu",
		   rw == READ ? 'R' : 'W', tg->slice_start[rw],
		   tg->slice_end[rw], jiffies);
}

/* Determine if previously allocated or extended slice is complete or not */
static bool throtl_slice_used(struct throtl_grp *tg, bool rw)
{
	if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
		return false;

	return true;
}

static unsigned int sq_queued(struct throtl_service_queue *sq, int type)
{
	return sq->nr_queued_bps[type] + sq->nr_queued_iops[type];
}

static unsigned int calculate_io_allowed(u32 iops_limit,
					 unsigned long jiffy_elapsed)
{
	unsigned int io_allowed;
	u64 tmp;

	/*
	 * jiffy_elapsed should not be a big value: the minimum iops is 1, so
	 * at most jiffy_elapsed should be equivalent to 1 second, as we
	 * allow dispatch after 1 second and after that the slice should
	 * have been trimmed.
	 */

	tmp = (u64)iops_limit * jiffy_elapsed;
	do_div(tmp, HZ);

	if (tmp > UINT_MAX)
		io_allowed = UINT_MAX;
	else
		io_allowed = tmp;

	return io_allowed;
}

static u64 calculate_bytes_allowed(u64 bps_limit, unsigned long jiffy_elapsed)
{
	/*
	 * Can result be wider than 64 bits?
	 * We check against 62, not 64, due to ilog2 truncation.
	 */
	if (ilog2(bps_limit) + ilog2(jiffy_elapsed) - ilog2(HZ) > 62)
		return U64_MAX;
	return mul_u64_u64_div_u64(bps_limit, (u64)jiffy_elapsed, (u64)HZ);
}
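
/*
 * Worked example for the two helpers above (illustrative, assuming HZ=1000):
 * with iops_limit=100 and jiffy_elapsed=300 (300ms), calculate_io_allowed()
 * returns 100 * 300 / 1000 = 30 ios.  With bps_limit=1048576 (1MiB/s) and
 * the same elapsed time, calculate_bytes_allowed() returns
 * 1048576 * 300 / 1000 = 314572 bytes (integer division).
 */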

static long long throtl_trim_bps(struct throtl_grp *tg, bool rw,
				 unsigned long time_elapsed)
{
	u64 bps_limit = tg_bps_limit(tg, rw);
	long long bytes_trim;

	if (bps_limit == U64_MAX)
		return 0;

	/* Need to consider the case of bytes_allowed overflow. */
	bytes_trim = calculate_bytes_allowed(bps_limit, time_elapsed);
	if (bytes_trim <= 0 || tg->bytes_disp[rw] < bytes_trim) {
		bytes_trim = tg->bytes_disp[rw];
		tg->bytes_disp[rw] = 0;
	} else {
		tg->bytes_disp[rw] -= bytes_trim;
	}

	return bytes_trim;
}

static int throtl_trim_iops(struct throtl_grp *tg, bool rw,
			    unsigned long time_elapsed)
{
	u32 iops_limit = tg_iops_limit(tg, rw);
	int io_trim;

	if (iops_limit == UINT_MAX)
		return 0;

	/* Need to consider the case of io_allowed overflow. */
	io_trim = calculate_io_allowed(iops_limit, time_elapsed);
	if (io_trim <= 0 || tg->io_disp[rw] < io_trim) {
		io_trim = tg->io_disp[rw];
		tg->io_disp[rw] = 0;
	} else {
		tg->io_disp[rw] -= io_trim;
	}

	return io_trim;
}
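
/*
 * Worked example for the trimming below (illustrative, assuming HZ=1000,
 * i.e. a 100ms DFL_THROTL_SLICE): if 270ms have elapsed since slice_start,
 * rounddown() yields 200ms; one slice (100ms) is preserved for deviation,
 * so only 100ms worth of budget is trimmed and slice_start advances by
 * 100ms.
 */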

/* Trim the used slices and adjust slice start accordingly */
static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
{
	unsigned long time_elapsed;
	long long bytes_trim;
	int io_trim;

	BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));

	/*
	 * If bps are unlimited (-1), then the time slice doesn't get
	 * renewed.  Don't try to trim the slice if the slice is used up.
	 * A new slice will start when appropriate.
	 */
	if (throtl_slice_used(tg, rw))
		return;

	/*
	 * A bio has been dispatched.  Also adjust slice_end.  It might happen
	 * that initially the cgroup limit was very low, resulting in a high
	 * slice_end, but later the limit was bumped up and the bio was
	 * dispatched sooner; then we need to reduce slice_end.  A high bogus
	 * slice_end is bad because it does not allow a new slice to start.
	 */
	throtl_set_slice_end(tg, rw, jiffies + DFL_THROTL_SLICE);

	time_elapsed = rounddown(jiffies - tg->slice_start[rw],
				 DFL_THROTL_SLICE);
	/* Don't trim slice until at least 2 slices are used */
	if (time_elapsed < DFL_THROTL_SLICE * 2)
		return;

	/*
	 * The bio submission time may be a few jiffies more than the expected
	 * waiting time, because 'extra_bytes' can't be divided evenly in
	 * tg_within_bps_limit(), and also because of timer wakeup delay.  In
	 * this case, adjusting slice_start would discard the extra wait time,
	 * causing a lower rate than expected.  Therefore, in addition to the
	 * above rounddown, one extra slice is preserved for deviation.
	 */
	time_elapsed -= DFL_THROTL_SLICE;
	bytes_trim = throtl_trim_bps(tg, rw, time_elapsed);
	io_trim = throtl_trim_iops(tg, rw, time_elapsed);
	if (!bytes_trim && !io_trim)
		return;

	tg->slice_start[rw] += time_elapsed;

	throtl_log(&tg->service_queue,
		   "[%c] trim slice nr=%lu bytes=%lld io=%d start=%lu end=%lu jiffies=%lu",
		   rw == READ ? 'R' : 'W', time_elapsed / DFL_THROTL_SLICE,
		   bytes_trim, io_trim, tg->slice_start[rw], tg->slice_end[rw],
		   jiffies);
}

static void __tg_update_carryover(struct throtl_grp *tg, bool rw,
				  long long *bytes, int *ios)
{
	unsigned long jiffy_elapsed = jiffies - tg->slice_start[rw];
	u64 bps_limit = tg_bps_limit(tg, rw);
	u32 iops_limit = tg_iops_limit(tg, rw);
	long long bytes_allowed;
	int io_allowed;

	/*
	 * If the queue is empty, carryover handling is not needed.  In such
	 * cases, tg->[bytes/io]_disp should be reset to 0 to avoid impacting
	 * the dispatch of subsequent bios.  The same handling applies when
	 * the previous BPS/IOPS limit was set to max.
	 */
	if (sq_queued(&tg->service_queue, rw) == 0) {
		tg->bytes_disp[rw] = 0;
		tg->io_disp[rw] = 0;
		return;
	}

	/*
	 * If the config is updated while bios are still throttled, calculate
	 * and accumulate how many bytes/ios have already waited across the
	 * changes, and use the calculated carryover (@bytes/@ios) to update
	 * [bytes/io]_disp, which will be used to calculate the new wait time
	 * under the new configuration.  We also need to consider the case of
	 * bytes/io_allowed overflow.
	 */
	if (bps_limit != U64_MAX) {
		bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed);
		if (bytes_allowed > 0)
			*bytes = bytes_allowed - tg->bytes_disp[rw];
	}
	if (iops_limit != UINT_MAX) {
		io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed);
		if (io_allowed > 0)
			*ios = io_allowed - tg->io_disp[rw];
	}

	tg->bytes_disp[rw] = -*bytes;
	tg->io_disp[rw] = -*ios;
}
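
/*
 * Carryover example (illustrative): suppose bps_limit was 1000000 (1MB/s),
 * 500ms of the current slice have elapsed and 300000 bytes were dispatched
 * while bios are still queued.  bytes_allowed is 500000, so *bytes becomes
 * 200000 and bytes_disp is set to -200000; under the new configuration the
 * group may dispatch that much extra before it is throttled again.
 */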

static void tg_update_carryover(struct throtl_grp *tg)
{
	long long bytes[2] = {0};
	int ios[2] = {0};

	__tg_update_carryover(tg, READ, &bytes[READ], &ios[READ]);
	__tg_update_carryover(tg, WRITE, &bytes[WRITE], &ios[WRITE]);

	/* see comments in struct throtl_grp for meaning of carryover. */
	throtl_log(&tg->service_queue, "%s: %lld %lld %d %d\n", __func__,
		   bytes[READ], bytes[WRITE], ios[READ], ios[WRITE]);
}

static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio,
					  u32 iops_limit)
{
	bool rw = bio_data_dir(bio);
	int io_allowed;
	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;

	jiffy_elapsed = jiffies - tg->slice_start[rw];

	/* Round up to the next throttle slice, wait time must be nonzero */
	jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, DFL_THROTL_SLICE);
	io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd);
	if (io_allowed > 0 && tg->io_disp[rw] + 1 <= io_allowed)
		return 0;

	/* Calc approx time to dispatch */
	jiffy_wait = jiffy_elapsed_rnd - jiffy_elapsed;

	/* make sure at least one io can be dispatched after waiting */
	jiffy_wait = max(jiffy_wait, HZ / iops_limit + 1);
	return jiffy_wait;
}

static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
					 u64 bps_limit)
{
	bool rw = bio_data_dir(bio);
	long long bytes_allowed;
	u64 extra_bytes;
	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
	unsigned int bio_size = throtl_bio_data_size(bio);

	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];

	/* Slice has just started.  Consider one slice interval */
	if (!jiffy_elapsed)
		jiffy_elapsed_rnd = DFL_THROTL_SLICE;

	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, DFL_THROTL_SLICE);
	bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd);
	/* Need to consider the case of bytes_allowed overflow. */
	if ((bytes_allowed > 0 && tg->bytes_disp[rw] + bio_size <= bytes_allowed)
	    || bytes_allowed < 0)
		return 0;

	/* Calc approx time to dispatch */
	extra_bytes = tg->bytes_disp[rw] + bio_size - bytes_allowed;
	jiffy_wait = div64_u64(extra_bytes * HZ, bps_limit);

	if (!jiffy_wait)
		jiffy_wait = 1;

	/*
	 * This wait time is without taking into consideration the rounding
	 * up we did.  Add that time also.
	 */
	jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
	return jiffy_wait;
}
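
/*
 * Example wait calculation for tg_within_bps_limit() (illustrative, assuming
 * HZ=1000): with bps_limit=1000000 (1MB/s), 900ms elapsed in the slice and
 * bytes_disp=850000, a 100000-byte bio exceeds the 900000 bytes allowed for
 * the rounded-up window, so extra_bytes=50000 and jiffy_wait works out to
 * roughly 50ms.
 */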

static void throtl_charge_bps_bio(struct throtl_grp *tg, struct bio *bio)
{
	unsigned int bio_size = throtl_bio_data_size(bio);

	/* Charge the bio to the group */
	if (!bio_flagged(bio, BIO_BPS_THROTTLED) &&
	    !bio_flagged(bio, BIO_TG_BPS_THROTTLED)) {
		bio_set_flag(bio, BIO_TG_BPS_THROTTLED);
		tg->bytes_disp[bio_data_dir(bio)] += bio_size;
	}
}

static void throtl_charge_iops_bio(struct throtl_grp *tg, struct bio *bio)
{
	bio_clear_flag(bio, BIO_TG_BPS_THROTTLED);
	tg->io_disp[bio_data_dir(bio)]++;
}

/*
 * If the previous slice expired, start a new one; otherwise renew/extend the
 * existing slice to make sure it is at least throtl_slice interval long since
 * now.  A new slice is started only for an empty throttle group.  If there is
 * a queued bio, that means there should be an active slice and it should be
 * extended instead.
 */
static void tg_update_slice(struct throtl_grp *tg, bool rw)
{
	if (throtl_slice_used(tg, rw) &&
	    sq_queued(&tg->service_queue, rw) == 0)
		throtl_start_new_slice(tg, rw, true);
	else
		throtl_extend_slice(tg, rw, jiffies + DFL_THROTL_SLICE);
}

static unsigned long tg_dispatch_bps_time(struct throtl_grp *tg, struct bio *bio)
{
	bool rw = bio_data_dir(bio);
	u64 bps_limit = tg_bps_limit(tg, rw);
	unsigned long bps_wait;

	/* no need to throttle if this bio's bytes have been accounted */
	if (bps_limit == U64_MAX || tg->flags & THROTL_TG_CANCELING ||
	    bio_flagged(bio, BIO_BPS_THROTTLED) ||
	    bio_flagged(bio, BIO_TG_BPS_THROTTLED))
		return 0;

	tg_update_slice(tg, rw);
	bps_wait = tg_within_bps_limit(tg, bio, bps_limit);
	throtl_extend_slice(tg, rw, jiffies + bps_wait);

	return bps_wait;
}

static unsigned long tg_dispatch_iops_time(struct throtl_grp *tg, struct bio *bio)
{
	bool rw = bio_data_dir(bio);
	u32 iops_limit = tg_iops_limit(tg, rw);
	unsigned long iops_wait;

	if (iops_limit == UINT_MAX || tg->flags & THROTL_TG_CANCELING)
		return 0;

	tg_update_slice(tg, rw);
	iops_wait = tg_within_iops_limit(tg, bio, iops_limit);
	throtl_extend_slice(tg, rw, jiffies + iops_wait);

	return iops_wait;
}

/*
 * Returns the approximate number of jiffies to wait before this bio is within
 * the IO rate and can be moved to another queue or dispatched.
 */
static unsigned long tg_dispatch_time(struct throtl_grp *tg, struct bio *bio)
{
	bool rw = bio_data_dir(bio);
	unsigned long wait;

	/*
	 * Currently the whole state machine of the group depends on the first
	 * bio queued in the group's bio list.  So one should not call this
	 * function with a different bio if there are other bios queued.
	 */
	BUG_ON(sq_queued(&tg->service_queue, rw) &&
	       bio != throtl_peek_queued(&tg->service_queue.queued[rw]));

	wait = tg_dispatch_bps_time(tg, bio);
	if (wait != 0)
		return wait;

	/*
	 * Charge bps here because @bio will be directly placed into the
	 * iops queue afterward.
	 */
	throtl_charge_bps_bio(tg, bio);

	return tg_dispatch_iops_time(tg, bio);
}
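
/*
 * Note on charging order: a bio is charged against the bps budget
 * (throtl_charge_bps_bio() above) as soon as it clears the bps gate and
 * moves on to iops accounting; the iops budget is charged only when the
 * bio is actually dispatched, see the throtl_charge_iops_bio() callers in
 * tg_dispatch_one_bio() and __blk_throtl_bio().
 */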

/**
 * throtl_add_bio_tg - add a bio to the specified throtl_grp
 * @bio: bio to add
 * @qn: qnode to use
 * @tg: the target throtl_grp
 *
 * Add @bio to @tg's service_queue using @qn.  If @qn is not specified,
 * tg->qnode_on_self[] is used.
 */
static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn,
			      struct throtl_grp *tg)
{
	struct throtl_service_queue *sq = &tg->service_queue;
	bool rw = bio_data_dir(bio);

	if (!qn)
		qn = &tg->qnode_on_self[rw];

	/*
	 * If @tg doesn't currently have any bios queued in the same
	 * direction, queueing @bio can change when @tg should be
	 * dispatched.  Mark that @tg was empty.  This is automatically
	 * cleared on the next tg_update_disptime().
	 */
	if (sq_queued(sq, rw) == 0)
		tg->flags |= THROTL_TG_WAS_EMPTY;

	throtl_qnode_add_bio(bio, qn, sq);

	/*
	 * Since we have split the queues, if the iops queue was previously
	 * empty and a new @bio is added into the first @qn, we also need to
	 * update @tg->disptime.
	 */
	if (bio_flagged(bio, BIO_BPS_THROTTLED) &&
	    bio == throtl_peek_queued(&sq->queued[rw]))
		tg->flags |= THROTL_TG_IOPS_WAS_EMPTY;

	throtl_enqueue_tg(tg);
}

static void tg_update_disptime(struct throtl_grp *tg)
{
	struct throtl_service_queue *sq = &tg->service_queue;
	unsigned long read_wait = -1, write_wait = -1, min_wait, disptime;
	struct bio *bio;

	bio = throtl_peek_queued(&sq->queued[READ]);
	if (bio)
		read_wait = tg_dispatch_time(tg, bio);

	bio = throtl_peek_queued(&sq->queued[WRITE]);
	if (bio)
		write_wait = tg_dispatch_time(tg, bio);

	min_wait = min(read_wait, write_wait);
	disptime = jiffies + min_wait;

	/* Update dispatch time */
	throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq);
	tg->disptime = disptime;
	tg_service_queue_add(tg);

	/* see throtl_add_bio_tg() */
	tg->flags &= ~THROTL_TG_WAS_EMPTY;
	tg->flags &= ~THROTL_TG_IOPS_WAS_EMPTY;
}

static void start_parent_slice_with_credit(struct throtl_grp *child_tg,
					   struct throtl_grp *parent_tg, bool rw)
{
	if (throtl_slice_used(parent_tg, rw)) {
		throtl_start_new_slice_with_credit(parent_tg, rw,
						   child_tg->slice_start[rw]);
	}

}

static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
{
	struct throtl_service_queue *sq = &tg->service_queue;
	struct throtl_service_queue *parent_sq = sq->parent_sq;
	struct throtl_grp *parent_tg = sq_to_tg(parent_sq);
	struct throtl_grp *tg_to_put = NULL;
	struct bio *bio;

	/*
	 * @bio is being transferred from @tg to @parent_sq.  Popping a bio
	 * from @tg may put its reference and @parent_sq might end up
	 * getting released prematurely.  Remember the tg to put and put it
	 * after @bio is transferred to @parent_sq.
	 */
	bio = throtl_pop_queued(sq, &tg_to_put, rw);

	throtl_charge_iops_bio(tg, bio);

	/*
	 * If our parent is another tg, we just need to transfer @bio to
	 * the parent using throtl_add_bio_tg().  If our parent is
	 * @td->service_queue, @bio is ready to be issued.  Put it on its
	 * bio_lists[] and decrease total number queued.  The caller is
	 * responsible for issuing these bios.
	 */
	if (parent_tg) {
		throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg);
		start_parent_slice_with_credit(tg, parent_tg, rw);
	} else {
		bio_set_flag(bio, BIO_BPS_THROTTLED);
		throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw],
				     parent_sq);
		BUG_ON(tg->td->nr_queued[rw] <= 0);
		tg->td->nr_queued[rw]--;
	}

	throtl_trim_slice(tg, rw);

	if (tg_to_put)
		blkg_put(tg_to_blkg(tg_to_put));
}
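
/*
 * With THROTL_GRP_QUANTUM of 8, throtl_dispatch_tg() below dispatches at
 * most 6 reads (8 * 3 / 4) and 2 writes per group in one round, i.e. the
 * "75% READS and 25% WRITES" split mentioned in the function body.
 */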

static int throtl_dispatch_tg(struct throtl_grp *tg)
{
	struct throtl_service_queue *sq = &tg->service_queue;
	unsigned int nr_reads = 0, nr_writes = 0;
	unsigned int max_nr_reads = THROTL_GRP_QUANTUM * 3 / 4;
	unsigned int max_nr_writes = THROTL_GRP_QUANTUM - max_nr_reads;
	struct bio *bio;

	/* Try to dispatch 75% READS and 25% WRITES */

	while ((bio = throtl_peek_queued(&sq->queued[READ])) &&
	       tg_dispatch_time(tg, bio) == 0) {

		tg_dispatch_one_bio(tg, READ);
		nr_reads++;

		if (nr_reads >= max_nr_reads)
			break;
	}

	while ((bio = throtl_peek_queued(&sq->queued[WRITE])) &&
	       tg_dispatch_time(tg, bio) == 0) {

		tg_dispatch_one_bio(tg, WRITE);
		nr_writes++;

		if (nr_writes >= max_nr_writes)
			break;
	}

	return nr_reads + nr_writes;
}

static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
{
	unsigned int nr_disp = 0;

	while (1) {
		struct throtl_grp *tg;
		struct throtl_service_queue *sq;

		if (!parent_sq->nr_pending)
			break;

		tg = throtl_rb_first(parent_sq);
		if (!tg)
			break;

		if (time_before(jiffies, tg->disptime))
			break;

		nr_disp += throtl_dispatch_tg(tg);

		sq = &tg->service_queue;
		if (sq_queued(sq, READ) || sq_queued(sq, WRITE))
			tg_update_disptime(tg);
		else
			throtl_dequeue_tg(tg);

		if (nr_disp >= THROTL_QUANTUM)
			break;
	}

	return nr_disp;
}

/**
 * throtl_pending_timer_fn - timer function for service_queue->pending_timer
 * @t: the pending_timer member of the throtl_service_queue being serviced
 *
 * This timer is armed when a child throtl_grp with active bios becomes
 * pending and queued on the service_queue's pending_tree and expires when
 * the first child throtl_grp should be dispatched.  This function
 * dispatches bios from the children throtl_grps to the parent
 * service_queue.
 *
 * If the parent's parent is another throtl_grp, dispatching is propagated
 * by either arming its pending_timer or repeating dispatch directly.  If
 * the top-level service_tree is reached, throtl_data->dispatch_work is
 * kicked so that the ready bios are issued.
 */
static void throtl_pending_timer_fn(struct timer_list *t)
{
	struct throtl_service_queue *sq = timer_container_of(sq, t,
							      pending_timer);
	struct throtl_grp *tg = sq_to_tg(sq);
	struct throtl_data *td = sq_to_td(sq);
	struct throtl_service_queue *parent_sq;
	struct request_queue *q;
	bool dispatched;
	int ret;

	/* throtl_data may be gone, so figure out request queue by blkg */
	if (tg)
		q = tg->pd.blkg->q;
	else
		q = td->queue;

	spin_lock_irq(&q->queue_lock);

	if (!q->root_blkg)
		goto out_unlock;

again:
	parent_sq = sq->parent_sq;
	dispatched = false;

	while (true) {
		unsigned int __maybe_unused bio_cnt_r = sq_queued(sq, READ);
		unsigned int __maybe_unused bio_cnt_w = sq_queued(sq, WRITE);

		throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u",
			   bio_cnt_r + bio_cnt_w, bio_cnt_r, bio_cnt_w);

		ret = throtl_select_dispatch(sq);
		if (ret) {
			throtl_log(sq, "bios disp=%u", ret);
			dispatched = true;
		}

		if (throtl_schedule_next_dispatch(sq, false))
			break;

		/* this dispatch window is still open, relax and repeat */
		spin_unlock_irq(&q->queue_lock);
		cpu_relax();
		spin_lock_irq(&q->queue_lock);
	}

	if (!dispatched)
		goto out_unlock;

	if (parent_sq) {
		/* @parent_sq is another throtl_grp, propagate dispatch */
		if (tg->flags & THROTL_TG_WAS_EMPTY ||
		    tg->flags & THROTL_TG_IOPS_WAS_EMPTY) {
			tg_update_disptime(tg);
			if (!throtl_schedule_next_dispatch(parent_sq, false)) {
				/* window is already open, repeat dispatching */
				sq = parent_sq;
				tg = sq_to_tg(sq);
				goto again;
			}
		}
	} else {
		/* reached the top-level, queue issuing */
		queue_work(kthrotld_workqueue, &td->dispatch_work);
	}
out_unlock:
	spin_unlock_irq(&q->queue_lock);
}

/**
 * blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work
 * @work: work item being executed
 *
 * This function is queued for execution when bios reach the bio_lists[]
 * of throtl_data->service_queue.  Those bios are ready and issued by this
 * function.
 */
static void blk_throtl_dispatch_work_fn(struct work_struct *work)
{
	struct throtl_data *td = container_of(work, struct throtl_data,
					      dispatch_work);
	struct throtl_service_queue *td_sq = &td->service_queue;
	struct request_queue *q = td->queue;
	struct bio_list bio_list_on_stack;
	struct bio *bio;
	struct blk_plug plug;
	int rw;

	bio_list_init(&bio_list_on_stack);

	spin_lock_irq(&q->queue_lock);
	for (rw = READ; rw <= WRITE; rw++)
		while ((bio = throtl_pop_queued(td_sq, NULL, rw)))
			bio_list_add(&bio_list_on_stack, bio);
	spin_unlock_irq(&q->queue_lock);

	if (!bio_list_empty(&bio_list_on_stack)) {
		blk_start_plug(&plug);
		while ((bio = bio_list_pop(&bio_list_on_stack)))
			submit_bio_noacct_nocheck(bio, false);
		blk_finish_plug(&plug);
	}
}

static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd,
			      int off)
{
	struct throtl_grp *tg = pd_to_tg(pd);
	u64 v = *(u64 *)((void *)tg + off);

	if (v == U64_MAX)
		return 0;
	return __blkg_prfill_u64(sf, pd, v);
}

static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd,
			       int off)
{
	struct throtl_grp *tg = pd_to_tg(pd);
	unsigned int v = *(unsigned int *)((void *)tg + off);

	if (v == UINT_MAX)
		return 0;
	return __blkg_prfill_u64(sf, pd, v);
}

static int tg_print_conf_u64(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_u64,
			  &blkcg_policy_throtl, seq_cft(sf)->private, false);
	return 0;
}

static int tg_print_conf_uint(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_uint,
			  &blkcg_policy_throtl, seq_cft(sf)->private, false);
	return 0;
}

static void tg_conf_updated(struct throtl_grp *tg, bool global)
{
	struct throtl_service_queue *sq = &tg->service_queue;
	struct cgroup_subsys_state *pos_css;
	struct blkcg_gq *blkg;

	throtl_log(&tg->service_queue,
		   "limit change rbps=%llu wbps=%llu riops=%u wiops=%u",
		   tg_bps_limit(tg, READ), tg_bps_limit(tg, WRITE),
		   tg_iops_limit(tg, READ), tg_iops_limit(tg, WRITE));

	rcu_read_lock();
	/*
	 * Update has_rules[] flags for the updated tg's subtree.  A tg is
	 * considered to have rules if either the tg itself or any of its
	 * ancestors has rules.  This identifies groups without any
	 * restrictions in the whole hierarchy and allows them to bypass
	 * blk-throttle.
	 */
	blkg_for_each_descendant_pre(blkg, pos_css,
			global ? tg->td->queue->root_blkg : tg_to_blkg(tg)) {
		struct throtl_grp *this_tg = blkg_to_tg(blkg);

		tg_update_has_rules(this_tg);
		/* ignore root/second level */
		if (!cgroup_subsys_on_dfl(io_cgrp_subsys) || !blkg->parent ||
		    !blkg->parent->parent)
			continue;
	}
	rcu_read_unlock();

	/*
	 * We're already holding queue_lock and know @tg is valid.  Let's
	 * apply the new config directly.
	 *
	 * Restart the slices for both READ and WRITES.  It might happen
	 * that a group's limits are dropped suddenly and we don't want to
	 * account recently dispatched IO at the new low rate.
	 */
	throtl_start_new_slice(tg, READ, false);
	throtl_start_new_slice(tg, WRITE, false);

	if (tg->flags & THROTL_TG_PENDING) {
		tg_update_disptime(tg);
		throtl_schedule_next_dispatch(sq->parent_sq, true);
	}
}

static int blk_throtl_init(struct gendisk *disk)
{
	struct request_queue *q = disk->queue;
	struct throtl_data *td;
	unsigned int memflags;
	int ret;

	td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
	if (!td)
		return -ENOMEM;

	INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
	throtl_service_queue_init(&td->service_queue);

	memflags = blk_mq_freeze_queue(disk->queue);
	blk_mq_quiesce_queue(disk->queue);

	q->td = td;
	td->queue = q;

	/* activate policy, blk_throtl_activated() will return true */
	ret = blkcg_activate_policy(disk, &blkcg_policy_throtl);
	if (ret) {
		q->td = NULL;
		kfree(td);
	}

	blk_mq_unquiesce_queue(disk->queue);
	blk_mq_unfreeze_queue(disk->queue, memflags);

	return ret;
}

static ssize_t tg_set_conf(struct kernfs_open_file *of,
			   char *buf, size_t nbytes, loff_t off, bool is_u64)
{
	struct blkcg *blkcg = css_to_blkcg(of_css(of));
	struct blkg_conf_ctx ctx;
	struct throtl_grp *tg;
	int ret;
	u64 v;

	blkg_conf_init(&ctx, buf);

	ret = blkg_conf_open_bdev(&ctx);
	if (ret)
		goto out_finish;

	if (!blk_throtl_activated(ctx.bdev->bd_queue)) {
		ret = blk_throtl_init(ctx.bdev->bd_disk);
		if (ret)
			goto out_finish;
	}

	ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx);
	if (ret)
		goto out_finish;

	ret = -EINVAL;
	if (sscanf(ctx.body, "%llu", &v) != 1)
		goto out_finish;
	if (!v)
		v = U64_MAX;

	tg = blkg_to_tg(ctx.blkg);
	tg_update_carryover(tg);

	if (is_u64)
		*(u64 *)((void *)tg + of_cft(of)->private) = v;
	else
		*(unsigned int *)((void *)tg + of_cft(of)->private) = v;

	tg_conf_updated(tg, false);
	ret = 0;
out_finish:
	blkg_conf_exit(&ctx);
	return ret ?: nbytes;
}

static ssize_t tg_set_conf_u64(struct kernfs_open_file *of,
			       char *buf, size_t nbytes, loff_t off)
{
	return tg_set_conf(of, buf, nbytes, off, true);
}

static ssize_t tg_set_conf_uint(struct kernfs_open_file *of,
				char *buf, size_t nbytes, loff_t off)
{
	return tg_set_conf(of, buf, nbytes, off, false);
}

static int tg_print_rwstat(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  blkg_prfill_rwstat, &blkcg_policy_throtl,
			  seq_cft(sf)->private, true);
	return 0;
}

static u64 tg_prfill_rwstat_recursive(struct seq_file *sf,
				      struct blkg_policy_data *pd, int off)
{
	struct blkg_rwstat_sample sum;

	blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_throtl, off,
				  &sum);
	return __blkg_prfill_rwstat(sf, pd, &sum);
}

static int tg_print_rwstat_recursive(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  tg_prfill_rwstat_recursive, &blkcg_policy_throtl,
			  seq_cft(sf)->private, true);
	return 0;
}
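
/*
 * The legacy (cgroup v1) files below expose one knob per limit under the
 * blkio controller.  For example (illustrative, paths may differ):
 *
 *   echo "8:16 1048576" > /sys/fs/cgroup/blkio/grp/blkio.throttle.read_bps_device
 *
 * limits reads on device 8:16 to 1MiB/s for that cgroup; writing 0 removes
 * the limit (stored internally as U64_MAX/UINT_MAX, see tg_set_conf()).
 */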

static struct cftype throtl_legacy_files[] = {
	{
		.name = "throttle.read_bps_device",
		.private = offsetof(struct throtl_grp, bps[READ]),
		.seq_show = tg_print_conf_u64,
		.write = tg_set_conf_u64,
	},
	{
		.name = "throttle.write_bps_device",
		.private = offsetof(struct throtl_grp, bps[WRITE]),
		.seq_show = tg_print_conf_u64,
		.write = tg_set_conf_u64,
	},
	{
		.name = "throttle.read_iops_device",
		.private = offsetof(struct throtl_grp, iops[READ]),
		.seq_show = tg_print_conf_uint,
		.write = tg_set_conf_uint,
	},
	{
		.name = "throttle.write_iops_device",
		.private = offsetof(struct throtl_grp, iops[WRITE]),
		.seq_show = tg_print_conf_uint,
		.write = tg_set_conf_uint,
	},
	{
		.name = "throttle.io_service_bytes",
		.private = offsetof(struct throtl_grp, stat_bytes),
		.seq_show = tg_print_rwstat,
	},
	{
		.name = "throttle.io_service_bytes_recursive",
		.private = offsetof(struct throtl_grp, stat_bytes),
		.seq_show = tg_print_rwstat_recursive,
	},
	{
		.name = "throttle.io_serviced",
		.private = offsetof(struct throtl_grp, stat_ios),
		.seq_show = tg_print_rwstat,
	},
	{
		.name = "throttle.io_serviced_recursive",
		.private = offsetof(struct throtl_grp, stat_ios),
		.seq_show = tg_print_rwstat_recursive,
	},
	{ }	/* terminate */
};

static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd,
			   int off)
{
	struct throtl_grp *tg = pd_to_tg(pd);
	const char *dname = blkg_dev_name(pd->blkg);
	u64 bps_dft;
	unsigned int iops_dft;

	if (!dname)
		return 0;

	bps_dft = U64_MAX;
	iops_dft = UINT_MAX;

	if (tg->bps[READ] == bps_dft &&
	    tg->bps[WRITE] == bps_dft &&
	    tg->iops[READ] == iops_dft &&
	    tg->iops[WRITE] == iops_dft)
		return 0;

	seq_printf(sf, "%s", dname);
	if (tg->bps[READ] == U64_MAX)
		seq_printf(sf, " rbps=max");
	else
		seq_printf(sf, " rbps=%llu", tg->bps[READ]);

	if (tg->bps[WRITE] == U64_MAX)
		seq_printf(sf, " wbps=max");
	else
		seq_printf(sf, " wbps=%llu", tg->bps[WRITE]);

	if (tg->iops[READ] == UINT_MAX)
		seq_printf(sf, " riops=max");
	else
		seq_printf(sf, " riops=%u", tg->iops[READ]);

	if (tg->iops[WRITE] == UINT_MAX)
		seq_printf(sf, " wiops=max");
	else
		seq_printf(sf, " wiops=%u", tg->iops[WRITE]);

	seq_printf(sf, "\n");
	return 0;
}

static int tg_print_limit(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_limit,
			  &blkcg_policy_throtl, seq_cft(sf)->private, false);
	return 0;
}
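
/*
 * The cgroup v2 "io.max" interface parsed below takes all four limits on a
 * single line, e.g. (illustrative):
 *
 *   echo "8:16 rbps=2097152 wiops=120" > io.max
 *
 * Keys that are not mentioned keep their current value, "max" clears a
 * limit (stored internally as U64_MAX/UINT_MAX), and 0 is rejected with
 * -ERANGE.
 */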

static ssize_t tg_set_limit(struct kernfs_open_file *of,
			    char *buf, size_t nbytes, loff_t off)
{
	struct blkcg *blkcg = css_to_blkcg(of_css(of));
	struct blkg_conf_ctx ctx;
	struct throtl_grp *tg;
	u64 v[4];
	int ret;

	blkg_conf_init(&ctx, buf);

	ret = blkg_conf_open_bdev(&ctx);
	if (ret)
		goto out_finish;

	if (!blk_throtl_activated(ctx.bdev->bd_queue)) {
		ret = blk_throtl_init(ctx.bdev->bd_disk);
		if (ret)
			goto out_finish;
	}

	ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx);
	if (ret)
		goto out_finish;

	tg = blkg_to_tg(ctx.blkg);
	tg_update_carryover(tg);

	v[0] = tg->bps[READ];
	v[1] = tg->bps[WRITE];
	v[2] = tg->iops[READ];
	v[3] = tg->iops[WRITE];

	while (true) {
		char tok[27];	/* wiops=18446744073709551616 */
		char *p;
		u64 val = U64_MAX;
		int len;

		if (sscanf(ctx.body, "%26s%n", tok, &len) != 1)
			break;
		if (tok[0] == '\0')
			break;
		ctx.body += len;

		ret = -EINVAL;
		p = tok;
		strsep(&p, "=");
		if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max")))
			goto out_finish;

		ret = -ERANGE;
		if (!val)
			goto out_finish;

		ret = -EINVAL;
		if (!strcmp(tok, "rbps"))
			v[0] = val;
		else if (!strcmp(tok, "wbps"))
			v[1] = val;
		else if (!strcmp(tok, "riops"))
			v[2] = min_t(u64, val, UINT_MAX);
		else if (!strcmp(tok, "wiops"))
			v[3] = min_t(u64, val, UINT_MAX);
		else
			goto out_finish;
	}

	tg->bps[READ] = v[0];
	tg->bps[WRITE] = v[1];
	tg->iops[READ] = v[2];
	tg->iops[WRITE] = v[3];

	tg_conf_updated(tg, false);
	ret = 0;
out_finish:
	blkg_conf_exit(&ctx);
	return ret ?: nbytes;
}

static struct cftype throtl_files[] = {
	{
		.name = "max",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = tg_print_limit,
		.write = tg_set_limit,
	},
	{ }	/* terminate */
};

static void throtl_shutdown_wq(struct request_queue *q)
{
	struct throtl_data *td = q->td;

	cancel_work_sync(&td->dispatch_work);
}

static void tg_flush_bios(struct throtl_grp *tg)
{
	struct throtl_service_queue *sq = &tg->service_queue;

	if (tg->flags & THROTL_TG_CANCELING)
		return;
	/*
	 * Set the flag to make sure throtl_pending_timer_fn() won't
	 * stop until all throttled bios are dispatched.
	 */
	tg->flags |= THROTL_TG_CANCELING;

	/*
	 * Do not dispatch a cgroup without THROTL_TG_PENDING, or
	 * tg_update_disptime() below will insert it into the service queue
	 * without THROTL_TG_PENDING set.  IO dispatched from a child in
	 * tg_dispatch_one_bio() would then trigger a double insertion and
	 * corrupt the tree.
	 */
	if (!(tg->flags & THROTL_TG_PENDING))
		return;

	/*
	 * Update disptime after setting the above flag to make sure
	 * throtl_select_dispatch() won't exit without dispatching.
	 */
	tg_update_disptime(tg);

	throtl_schedule_pending_timer(sq, jiffies + 1);
}

static void throtl_pd_offline(struct blkg_policy_data *pd)
{
	tg_flush_bios(pd_to_tg(pd));
}

struct blkcg_policy blkcg_policy_throtl = {
	.dfl_cftypes		= throtl_files,
	.legacy_cftypes		= throtl_legacy_files,

	.pd_alloc_fn		= throtl_pd_alloc,
	.pd_init_fn		= throtl_pd_init,
	.pd_online_fn		= throtl_pd_online,
	.pd_offline_fn		= throtl_pd_offline,
	.pd_free_fn		= throtl_pd_free,
};

void blk_throtl_cancel_bios(struct gendisk *disk)
{
	struct request_queue *q = disk->queue;
	struct cgroup_subsys_state *pos_css;
	struct blkcg_gq *blkg;

	if (!blk_throtl_activated(q))
		return;

	spin_lock_irq(&q->queue_lock);
	/*
	 * queue_lock is held, so the RCU lock is not technically needed here.
	 * However, it is still taken to emphasize that the following path
	 * needs RCU protection and to prevent a lockdep warning.
	 */
	rcu_read_lock();
	blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) {
		/*
		 * disk_release will call pd_offline_fn to cancel bios.
		 * However, disk_release can't be called if someone holds a
		 * reference on the device and issues bios that are still
		 * in flight after del_gendisk.
		 * Cancel bios here to ensure no bios are in flight after
		 * del_gendisk.
		 */
		tg_flush_bios(blkg_to_tg(blkg));
	}
	rcu_read_unlock();
	spin_unlock_irq(&q->queue_lock);
}

static bool tg_within_limit(struct throtl_grp *tg, struct bio *bio, bool rw)
{
	struct throtl_service_queue *sq = &tg->service_queue;

	/*
	 * For a split bio, we need to specifically distinguish whether the
	 * iops queue is empty.
	 */
	if (bio_flagged(bio, BIO_BPS_THROTTLED))
		return sq->nr_queued_iops[rw] == 0 &&
			tg_dispatch_iops_time(tg, bio) == 0;

	/*
	 * Throtl is FIFO - if bios are already queued, this one should be
	 * queued too.  If the bps queue is empty and @bio is within the bps
	 * limit, charge bps here for direct placement into the iops queue.
	 */
	if (sq_queued(&tg->service_queue, rw)) {
		if (sq->nr_queued_bps[rw] == 0 &&
		    tg_dispatch_bps_time(tg, bio) == 0)
			throtl_charge_bps_bio(tg, bio);

		return false;
	}

	return tg_dispatch_time(tg, bio) == 0;
}

bool __blk_throtl_bio(struct bio *bio)
{
	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
	struct blkcg_gq *blkg = bio->bi_blkg;
	struct throtl_qnode *qn = NULL;
	struct throtl_grp *tg = blkg_to_tg(blkg);
	struct throtl_service_queue *sq;
	bool rw = bio_data_dir(bio);
	bool throttled = false;
	struct throtl_data *td = tg->td;

	rcu_read_lock();
	spin_lock_irq(&q->queue_lock);
	sq = &tg->service_queue;

	while (true) {
		if (tg_within_limit(tg, bio, rw)) {
			/* within limits, let's charge and dispatch directly */
			throtl_charge_iops_bio(tg, bio);

			/*
			 * We need to trim the slice even when bios are not
			 * being queued, otherwise it might happen that a bio
			 * is not queued for a long time and the slice keeps
			 * on extending and trim is not called for a long
			 * time.  Now if limits are reduced suddenly we take
			 * into account all the IO dispatched so far at the
			 * new low rate and newly queued IO gets a really
			 * long dispatch time.
			 *
			 * So keep on trimming slice even if bio is not queued.
			 */
			throtl_trim_slice(tg, rw);
		} else if (bio_issue_as_root_blkg(bio)) {
			/*
			 * IOs which may cause priority inversions are
			 * dispatched directly, even if they're over limit.
			 *
			 * Charge and dispatch directly.  Our throttle
			 * control algorithm is adaptive, and the extra IO
			 * bytes will be throttled later to pay back the debt.
			 */
			throtl_charge_bps_bio(tg, bio);
			throtl_charge_iops_bio(tg, bio);
		} else {
			/* if above limits, break to queue */
			break;
		}

		/*
		 * @bio passed through this layer without being throttled.
		 * Climb up the ladder.  If we're already at the top, it
		 * can be executed directly.
		 */
		qn = &tg->qnode_on_parent[rw];
		sq = sq->parent_sq;
		tg = sq_to_tg(sq);
		if (!tg) {
			bio_set_flag(bio, BIO_BPS_THROTTLED);
			goto out_unlock;
		}
	}

	/* out-of-limit, queue to @tg */
	throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d",
		   rw == READ ? 'R' : 'W',
		   tg->bytes_disp[rw], bio->bi_iter.bi_size,
		   tg_bps_limit(tg, rw),
		   tg->io_disp[rw], tg_iops_limit(tg, rw),
		   sq_queued(sq, READ), sq_queued(sq, WRITE));

	td->nr_queued[rw]++;
	throtl_add_bio_tg(bio, qn, tg);
	throttled = true;

	/*
	 * Update @tg's dispatch time and force schedule dispatch if @tg
	 * was empty before @bio, or the iops queue was empty and @bio will
	 * be added to it.  The forced scheduling isn't likely to cause undue
	 * delay as @bio is likely to be dispatched directly if its @tg's
	 * disptime is not in the future.
	 */
	if (tg->flags & THROTL_TG_WAS_EMPTY ||
	    tg->flags & THROTL_TG_IOPS_WAS_EMPTY) {
		tg_update_disptime(tg);
		throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true);
	}

out_unlock:
	spin_unlock_irq(&q->queue_lock);

	rcu_read_unlock();
	return throttled;
}

void blk_throtl_exit(struct gendisk *disk)
{
	struct request_queue *q = disk->queue;

	/*
	 * blkg_destroy_all() already deactivates the throtl policy; just
	 * check and free the throtl data.
	 */
	if (!q->td)
		return;

	timer_delete_sync(&q->td->service_queue.pending_timer);
	throtl_shutdown_wq(q);
	kfree(q->td);
}

static int __init throtl_init(void)
{
	kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0);
	if (!kthrotld_workqueue)
		panic("Failed to create kthrotld\n");

	return blkcg_policy_register(&blkcg_policy_throtl);
}

module_init(throtl_init);