1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Interface for controlling IO bandwidth on a request queue 4 * 5 * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com> 6 */ 7 8 #include <linux/module.h> 9 #include <linux/slab.h> 10 #include <linux/blkdev.h> 11 #include <linux/bio.h> 12 #include <linux/blktrace_api.h> 13 #include "blk.h" 14 #include "blk-cgroup-rwstat.h" 15 #include "blk-stat.h" 16 #include "blk-throttle.h" 17 18 /* Max dispatch from a group in 1 round */ 19 #define THROTL_GRP_QUANTUM 8 20 21 /* Total max dispatch from all groups in one round */ 22 #define THROTL_QUANTUM 32 23 24 /* Throttling is performed over a slice and after that slice is renewed */ 25 #define DFL_THROTL_SLICE_HD (HZ / 10) 26 #define DFL_THROTL_SLICE_SSD (HZ / 50) 27 #define MAX_THROTL_SLICE (HZ) 28 29 /* A workqueue to queue throttle related work */ 30 static struct workqueue_struct *kthrotld_workqueue; 31 32 #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) 33 34 /* We measure latency for request size from <= 4k to >= 1M */ 35 #define LATENCY_BUCKET_SIZE 9 36 37 struct latency_bucket { 38 unsigned long total_latency; /* ns / 1024 */ 39 int samples; 40 }; 41 42 struct throtl_data 43 { 44 /* service tree for active throtl groups */ 45 struct throtl_service_queue service_queue; 46 47 struct request_queue *queue; 48 49 /* Total Number of queued bios on READ and WRITE lists */ 50 unsigned int nr_queued[2]; 51 52 unsigned int throtl_slice; 53 54 /* Work for dispatching throttled bios */ 55 struct work_struct dispatch_work; 56 57 bool track_bio_latency; 58 }; 59 60 static void throtl_pending_timer_fn(struct timer_list *t); 61 62 static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg) 63 { 64 return pd_to_blkg(&tg->pd); 65 } 66 67 /** 68 * sq_to_tg - return the throl_grp the specified service queue belongs to 69 * @sq: the throtl_service_queue of interest 70 * 71 * Return the throtl_grp @sq belongs to. If @sq is the top-level one 72 * embedded in throtl_data, %NULL is returned. 73 */ 74 static struct throtl_grp *sq_to_tg(struct throtl_service_queue *sq) 75 { 76 if (sq && sq->parent_sq) 77 return container_of(sq, struct throtl_grp, service_queue); 78 else 79 return NULL; 80 } 81 82 /** 83 * sq_to_td - return throtl_data the specified service queue belongs to 84 * @sq: the throtl_service_queue of interest 85 * 86 * A service_queue can be embedded in either a throtl_grp or throtl_data. 87 * Determine the associated throtl_data accordingly and return it. 88 */ 89 static struct throtl_data *sq_to_td(struct throtl_service_queue *sq) 90 { 91 struct throtl_grp *tg = sq_to_tg(sq); 92 93 if (tg) 94 return tg->td; 95 else 96 return container_of(sq, struct throtl_data, service_queue); 97 } 98 99 static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw) 100 { 101 struct blkcg_gq *blkg = tg_to_blkg(tg); 102 103 if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent) 104 return U64_MAX; 105 106 return tg->bps[rw]; 107 } 108 109 static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw) 110 { 111 struct blkcg_gq *blkg = tg_to_blkg(tg); 112 113 if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent) 114 return UINT_MAX; 115 116 return tg->iops[rw]; 117 } 118 119 #define request_bucket_index(sectors) \ 120 clamp_t(int, order_base_2(sectors) - 3, 0, LATENCY_BUCKET_SIZE - 1) 121 122 /** 123 * throtl_log - log debug message via blktrace 124 * @sq: the service_queue being reported 125 * @fmt: printf format string 126 * @args: printf args 127 * 128 * The messages are prefixed with "throtl BLKG_NAME" if @sq belongs to a 129 * throtl_grp; otherwise, just "throtl". 130 */ 131 #define throtl_log(sq, fmt, args...) do { \ 132 struct throtl_grp *__tg = sq_to_tg((sq)); \ 133 struct throtl_data *__td = sq_to_td((sq)); \ 134 \ 135 (void)__td; \ 136 if (likely(!blk_trace_note_message_enabled(__td->queue))) \ 137 break; \ 138 if ((__tg)) { \ 139 blk_add_cgroup_trace_msg(__td->queue, \ 140 &tg_to_blkg(__tg)->blkcg->css, "throtl " fmt, ##args);\ 141 } else { \ 142 blk_add_trace_msg(__td->queue, "throtl " fmt, ##args); \ 143 } \ 144 } while (0) 145 146 static inline unsigned int throtl_bio_data_size(struct bio *bio) 147 { 148 /* assume it's one sector */ 149 if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) 150 return 512; 151 return bio->bi_iter.bi_size; 152 } 153 154 static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg) 155 { 156 INIT_LIST_HEAD(&qn->node); 157 bio_list_init(&qn->bios); 158 qn->tg = tg; 159 } 160 161 /** 162 * throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it 163 * @bio: bio being added 164 * @qn: qnode to add bio to 165 * @queued: the service_queue->queued[] list @qn belongs to 166 * 167 * Add @bio to @qn and put @qn on @queued if it's not already on. 168 * @qn->tg's reference count is bumped when @qn is activated. See the 169 * comment on top of throtl_qnode definition for details. 170 */ 171 static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn, 172 struct list_head *queued) 173 { 174 bio_list_add(&qn->bios, bio); 175 if (list_empty(&qn->node)) { 176 list_add_tail(&qn->node, queued); 177 blkg_get(tg_to_blkg(qn->tg)); 178 } 179 } 180 181 /** 182 * throtl_peek_queued - peek the first bio on a qnode list 183 * @queued: the qnode list to peek 184 */ 185 static struct bio *throtl_peek_queued(struct list_head *queued) 186 { 187 struct throtl_qnode *qn; 188 struct bio *bio; 189 190 if (list_empty(queued)) 191 return NULL; 192 193 qn = list_first_entry(queued, struct throtl_qnode, node); 194 bio = bio_list_peek(&qn->bios); 195 WARN_ON_ONCE(!bio); 196 return bio; 197 } 198 199 /** 200 * throtl_pop_queued - pop the first bio form a qnode list 201 * @queued: the qnode list to pop a bio from 202 * @tg_to_put: optional out argument for throtl_grp to put 203 * 204 * Pop the first bio from the qnode list @queued. After popping, the first 205 * qnode is removed from @queued if empty or moved to the end of @queued so 206 * that the popping order is round-robin. 207 * 208 * When the first qnode is removed, its associated throtl_grp should be put 209 * too. If @tg_to_put is NULL, this function automatically puts it; 210 * otherwise, *@tg_to_put is set to the throtl_grp to put and the caller is 211 * responsible for putting it. 212 */ 213 static struct bio *throtl_pop_queued(struct list_head *queued, 214 struct throtl_grp **tg_to_put) 215 { 216 struct throtl_qnode *qn; 217 struct bio *bio; 218 219 if (list_empty(queued)) 220 return NULL; 221 222 qn = list_first_entry(queued, struct throtl_qnode, node); 223 bio = bio_list_pop(&qn->bios); 224 WARN_ON_ONCE(!bio); 225 226 if (bio_list_empty(&qn->bios)) { 227 list_del_init(&qn->node); 228 if (tg_to_put) 229 *tg_to_put = qn->tg; 230 else 231 blkg_put(tg_to_blkg(qn->tg)); 232 } else { 233 list_move_tail(&qn->node, queued); 234 } 235 236 return bio; 237 } 238 239 /* init a service_queue, assumes the caller zeroed it */ 240 static void throtl_service_queue_init(struct throtl_service_queue *sq) 241 { 242 INIT_LIST_HEAD(&sq->queued[READ]); 243 INIT_LIST_HEAD(&sq->queued[WRITE]); 244 sq->pending_tree = RB_ROOT_CACHED; 245 timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0); 246 } 247 248 static struct blkg_policy_data *throtl_pd_alloc(struct gendisk *disk, 249 struct blkcg *blkcg, gfp_t gfp) 250 { 251 struct throtl_grp *tg; 252 int rw; 253 254 tg = kzalloc_node(sizeof(*tg), gfp, disk->node_id); 255 if (!tg) 256 return NULL; 257 258 if (blkg_rwstat_init(&tg->stat_bytes, gfp)) 259 goto err_free_tg; 260 261 if (blkg_rwstat_init(&tg->stat_ios, gfp)) 262 goto err_exit_stat_bytes; 263 264 throtl_service_queue_init(&tg->service_queue); 265 266 for (rw = READ; rw <= WRITE; rw++) { 267 throtl_qnode_init(&tg->qnode_on_self[rw], tg); 268 throtl_qnode_init(&tg->qnode_on_parent[rw], tg); 269 } 270 271 RB_CLEAR_NODE(&tg->rb_node); 272 tg->bps[READ] = U64_MAX; 273 tg->bps[WRITE] = U64_MAX; 274 tg->iops[READ] = UINT_MAX; 275 tg->iops[WRITE] = UINT_MAX; 276 277 return &tg->pd; 278 279 err_exit_stat_bytes: 280 blkg_rwstat_exit(&tg->stat_bytes); 281 err_free_tg: 282 kfree(tg); 283 return NULL; 284 } 285 286 static void throtl_pd_init(struct blkg_policy_data *pd) 287 { 288 struct throtl_grp *tg = pd_to_tg(pd); 289 struct blkcg_gq *blkg = tg_to_blkg(tg); 290 struct throtl_data *td = blkg->q->td; 291 struct throtl_service_queue *sq = &tg->service_queue; 292 293 /* 294 * If on the default hierarchy, we switch to properly hierarchical 295 * behavior where limits on a given throtl_grp are applied to the 296 * whole subtree rather than just the group itself. e.g. If 16M 297 * read_bps limit is set on a parent group, summary bps of 298 * parent group and its subtree groups can't exceed 16M for the 299 * device. 300 * 301 * If not on the default hierarchy, the broken flat hierarchy 302 * behavior is retained where all throtl_grps are treated as if 303 * they're all separate root groups right below throtl_data. 304 * Limits of a group don't interact with limits of other groups 305 * regardless of the position of the group in the hierarchy. 306 */ 307 sq->parent_sq = &td->service_queue; 308 if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent) 309 sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue; 310 tg->td = td; 311 } 312 313 /* 314 * Set has_rules[] if @tg or any of its parents have limits configured. 315 * This doesn't require walking up to the top of the hierarchy as the 316 * parent's has_rules[] is guaranteed to be correct. 317 */ 318 static void tg_update_has_rules(struct throtl_grp *tg) 319 { 320 struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq); 321 int rw; 322 323 for (rw = READ; rw <= WRITE; rw++) { 324 tg->has_rules_iops[rw] = 325 (parent_tg && parent_tg->has_rules_iops[rw]) || 326 tg_iops_limit(tg, rw) != UINT_MAX; 327 tg->has_rules_bps[rw] = 328 (parent_tg && parent_tg->has_rules_bps[rw]) || 329 tg_bps_limit(tg, rw) != U64_MAX; 330 } 331 } 332 333 static void throtl_pd_online(struct blkg_policy_data *pd) 334 { 335 struct throtl_grp *tg = pd_to_tg(pd); 336 /* 337 * We don't want new groups to escape the limits of its ancestors. 338 * Update has_rules[] after a new group is brought online. 339 */ 340 tg_update_has_rules(tg); 341 } 342 343 static void throtl_pd_free(struct blkg_policy_data *pd) 344 { 345 struct throtl_grp *tg = pd_to_tg(pd); 346 347 del_timer_sync(&tg->service_queue.pending_timer); 348 blkg_rwstat_exit(&tg->stat_bytes); 349 blkg_rwstat_exit(&tg->stat_ios); 350 kfree(tg); 351 } 352 353 static struct throtl_grp * 354 throtl_rb_first(struct throtl_service_queue *parent_sq) 355 { 356 struct rb_node *n; 357 358 n = rb_first_cached(&parent_sq->pending_tree); 359 WARN_ON_ONCE(!n); 360 if (!n) 361 return NULL; 362 return rb_entry_tg(n); 363 } 364 365 static void throtl_rb_erase(struct rb_node *n, 366 struct throtl_service_queue *parent_sq) 367 { 368 rb_erase_cached(n, &parent_sq->pending_tree); 369 RB_CLEAR_NODE(n); 370 } 371 372 static void update_min_dispatch_time(struct throtl_service_queue *parent_sq) 373 { 374 struct throtl_grp *tg; 375 376 tg = throtl_rb_first(parent_sq); 377 if (!tg) 378 return; 379 380 parent_sq->first_pending_disptime = tg->disptime; 381 } 382 383 static void tg_service_queue_add(struct throtl_grp *tg) 384 { 385 struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq; 386 struct rb_node **node = &parent_sq->pending_tree.rb_root.rb_node; 387 struct rb_node *parent = NULL; 388 struct throtl_grp *__tg; 389 unsigned long key = tg->disptime; 390 bool leftmost = true; 391 392 while (*node != NULL) { 393 parent = *node; 394 __tg = rb_entry_tg(parent); 395 396 if (time_before(key, __tg->disptime)) 397 node = &parent->rb_left; 398 else { 399 node = &parent->rb_right; 400 leftmost = false; 401 } 402 } 403 404 rb_link_node(&tg->rb_node, parent, node); 405 rb_insert_color_cached(&tg->rb_node, &parent_sq->pending_tree, 406 leftmost); 407 } 408 409 static void throtl_enqueue_tg(struct throtl_grp *tg) 410 { 411 if (!(tg->flags & THROTL_TG_PENDING)) { 412 tg_service_queue_add(tg); 413 tg->flags |= THROTL_TG_PENDING; 414 tg->service_queue.parent_sq->nr_pending++; 415 } 416 } 417 418 static void throtl_dequeue_tg(struct throtl_grp *tg) 419 { 420 if (tg->flags & THROTL_TG_PENDING) { 421 struct throtl_service_queue *parent_sq = 422 tg->service_queue.parent_sq; 423 424 throtl_rb_erase(&tg->rb_node, parent_sq); 425 --parent_sq->nr_pending; 426 tg->flags &= ~THROTL_TG_PENDING; 427 } 428 } 429 430 /* Call with queue lock held */ 431 static void throtl_schedule_pending_timer(struct throtl_service_queue *sq, 432 unsigned long expires) 433 { 434 unsigned long max_expire = jiffies + 8 * sq_to_td(sq)->throtl_slice; 435 436 /* 437 * Since we are adjusting the throttle limit dynamically, the sleep 438 * time calculated according to previous limit might be invalid. It's 439 * possible the cgroup sleep time is very long and no other cgroups 440 * have IO running so notify the limit changes. Make sure the cgroup 441 * doesn't sleep too long to avoid the missed notification. 442 */ 443 if (time_after(expires, max_expire)) 444 expires = max_expire; 445 mod_timer(&sq->pending_timer, expires); 446 throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu", 447 expires - jiffies, jiffies); 448 } 449 450 /** 451 * throtl_schedule_next_dispatch - schedule the next dispatch cycle 452 * @sq: the service_queue to schedule dispatch for 453 * @force: force scheduling 454 * 455 * Arm @sq->pending_timer so that the next dispatch cycle starts on the 456 * dispatch time of the first pending child. Returns %true if either timer 457 * is armed or there's no pending child left. %false if the current 458 * dispatch window is still open and the caller should continue 459 * dispatching. 460 * 461 * If @force is %true, the dispatch timer is always scheduled and this 462 * function is guaranteed to return %true. This is to be used when the 463 * caller can't dispatch itself and needs to invoke pending_timer 464 * unconditionally. Note that forced scheduling is likely to induce short 465 * delay before dispatch starts even if @sq->first_pending_disptime is not 466 * in the future and thus shouldn't be used in hot paths. 467 */ 468 static bool throtl_schedule_next_dispatch(struct throtl_service_queue *sq, 469 bool force) 470 { 471 /* any pending children left? */ 472 if (!sq->nr_pending) 473 return true; 474 475 update_min_dispatch_time(sq); 476 477 /* is the next dispatch time in the future? */ 478 if (force || time_after(sq->first_pending_disptime, jiffies)) { 479 throtl_schedule_pending_timer(sq, sq->first_pending_disptime); 480 return true; 481 } 482 483 /* tell the caller to continue dispatching */ 484 return false; 485 } 486 487 static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, 488 bool rw, unsigned long start) 489 { 490 tg->bytes_disp[rw] = 0; 491 tg->io_disp[rw] = 0; 492 tg->carryover_bytes[rw] = 0; 493 tg->carryover_ios[rw] = 0; 494 495 /* 496 * Previous slice has expired. We must have trimmed it after last 497 * bio dispatch. That means since start of last slice, we never used 498 * that bandwidth. Do try to make use of that bandwidth while giving 499 * credit. 500 */ 501 if (time_after(start, tg->slice_start[rw])) 502 tg->slice_start[rw] = start; 503 504 tg->slice_end[rw] = jiffies + tg->td->throtl_slice; 505 throtl_log(&tg->service_queue, 506 "[%c] new slice with credit start=%lu end=%lu jiffies=%lu", 507 rw == READ ? 'R' : 'W', tg->slice_start[rw], 508 tg->slice_end[rw], jiffies); 509 } 510 511 static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw, 512 bool clear_carryover) 513 { 514 tg->bytes_disp[rw] = 0; 515 tg->io_disp[rw] = 0; 516 tg->slice_start[rw] = jiffies; 517 tg->slice_end[rw] = jiffies + tg->td->throtl_slice; 518 if (clear_carryover) { 519 tg->carryover_bytes[rw] = 0; 520 tg->carryover_ios[rw] = 0; 521 } 522 523 throtl_log(&tg->service_queue, 524 "[%c] new slice start=%lu end=%lu jiffies=%lu", 525 rw == READ ? 'R' : 'W', tg->slice_start[rw], 526 tg->slice_end[rw], jiffies); 527 } 528 529 static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw, 530 unsigned long jiffy_end) 531 { 532 tg->slice_end[rw] = roundup(jiffy_end, tg->td->throtl_slice); 533 } 534 535 static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw, 536 unsigned long jiffy_end) 537 { 538 throtl_set_slice_end(tg, rw, jiffy_end); 539 throtl_log(&tg->service_queue, 540 "[%c] extend slice start=%lu end=%lu jiffies=%lu", 541 rw == READ ? 'R' : 'W', tg->slice_start[rw], 542 tg->slice_end[rw], jiffies); 543 } 544 545 /* Determine if previously allocated or extended slice is complete or not */ 546 static bool throtl_slice_used(struct throtl_grp *tg, bool rw) 547 { 548 if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw])) 549 return false; 550 551 return true; 552 } 553 554 static unsigned int calculate_io_allowed(u32 iops_limit, 555 unsigned long jiffy_elapsed) 556 { 557 unsigned int io_allowed; 558 u64 tmp; 559 560 /* 561 * jiffy_elapsed should not be a big value as minimum iops can be 562 * 1 then at max jiffy elapsed should be equivalent of 1 second as we 563 * will allow dispatch after 1 second and after that slice should 564 * have been trimmed. 565 */ 566 567 tmp = (u64)iops_limit * jiffy_elapsed; 568 do_div(tmp, HZ); 569 570 if (tmp > UINT_MAX) 571 io_allowed = UINT_MAX; 572 else 573 io_allowed = tmp; 574 575 return io_allowed; 576 } 577 578 static u64 calculate_bytes_allowed(u64 bps_limit, unsigned long jiffy_elapsed) 579 { 580 /* 581 * Can result be wider than 64 bits? 582 * We check against 62, not 64, due to ilog2 truncation. 583 */ 584 if (ilog2(bps_limit) + ilog2(jiffy_elapsed) - ilog2(HZ) > 62) 585 return U64_MAX; 586 return mul_u64_u64_div_u64(bps_limit, (u64)jiffy_elapsed, (u64)HZ); 587 } 588 589 /* Trim the used slices and adjust slice start accordingly */ 590 static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) 591 { 592 unsigned long time_elapsed; 593 long long bytes_trim; 594 int io_trim; 595 596 BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw])); 597 598 /* 599 * If bps are unlimited (-1), then time slice don't get 600 * renewed. Don't try to trim the slice if slice is used. A new 601 * slice will start when appropriate. 602 */ 603 if (throtl_slice_used(tg, rw)) 604 return; 605 606 /* 607 * A bio has been dispatched. Also adjust slice_end. It might happen 608 * that initially cgroup limit was very low resulting in high 609 * slice_end, but later limit was bumped up and bio was dispatched 610 * sooner, then we need to reduce slice_end. A high bogus slice_end 611 * is bad because it does not allow new slice to start. 612 */ 613 614 throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice); 615 616 time_elapsed = rounddown(jiffies - tg->slice_start[rw], 617 tg->td->throtl_slice); 618 if (!time_elapsed) 619 return; 620 621 bytes_trim = calculate_bytes_allowed(tg_bps_limit(tg, rw), 622 time_elapsed) + 623 tg->carryover_bytes[rw]; 624 io_trim = calculate_io_allowed(tg_iops_limit(tg, rw), time_elapsed) + 625 tg->carryover_ios[rw]; 626 if (bytes_trim <= 0 && io_trim <= 0) 627 return; 628 629 tg->carryover_bytes[rw] = 0; 630 if ((long long)tg->bytes_disp[rw] >= bytes_trim) 631 tg->bytes_disp[rw] -= bytes_trim; 632 else 633 tg->bytes_disp[rw] = 0; 634 635 tg->carryover_ios[rw] = 0; 636 if ((int)tg->io_disp[rw] >= io_trim) 637 tg->io_disp[rw] -= io_trim; 638 else 639 tg->io_disp[rw] = 0; 640 641 tg->slice_start[rw] += time_elapsed; 642 643 throtl_log(&tg->service_queue, 644 "[%c] trim slice nr=%lu bytes=%lld io=%d start=%lu end=%lu jiffies=%lu", 645 rw == READ ? 'R' : 'W', time_elapsed / tg->td->throtl_slice, 646 bytes_trim, io_trim, tg->slice_start[rw], tg->slice_end[rw], 647 jiffies); 648 } 649 650 static void __tg_update_carryover(struct throtl_grp *tg, bool rw) 651 { 652 unsigned long jiffy_elapsed = jiffies - tg->slice_start[rw]; 653 u64 bps_limit = tg_bps_limit(tg, rw); 654 u32 iops_limit = tg_iops_limit(tg, rw); 655 656 /* 657 * If config is updated while bios are still throttled, calculate and 658 * accumulate how many bytes/ios are waited across changes. And 659 * carryover_bytes/ios will be used to calculate new wait time under new 660 * configuration. 661 */ 662 if (bps_limit != U64_MAX) 663 tg->carryover_bytes[rw] += 664 calculate_bytes_allowed(bps_limit, jiffy_elapsed) - 665 tg->bytes_disp[rw]; 666 if (iops_limit != UINT_MAX) 667 tg->carryover_ios[rw] += 668 calculate_io_allowed(iops_limit, jiffy_elapsed) - 669 tg->io_disp[rw]; 670 } 671 672 static void tg_update_carryover(struct throtl_grp *tg) 673 { 674 if (tg->service_queue.nr_queued[READ]) 675 __tg_update_carryover(tg, READ); 676 if (tg->service_queue.nr_queued[WRITE]) 677 __tg_update_carryover(tg, WRITE); 678 679 /* see comments in struct throtl_grp for meaning of these fields. */ 680 throtl_log(&tg->service_queue, "%s: %lld %lld %d %d\n", __func__, 681 tg->carryover_bytes[READ], tg->carryover_bytes[WRITE], 682 tg->carryover_ios[READ], tg->carryover_ios[WRITE]); 683 } 684 685 static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio, 686 u32 iops_limit) 687 { 688 bool rw = bio_data_dir(bio); 689 int io_allowed; 690 unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; 691 692 if (iops_limit == UINT_MAX) { 693 return 0; 694 } 695 696 jiffy_elapsed = jiffies - tg->slice_start[rw]; 697 698 /* Round up to the next throttle slice, wait time must be nonzero */ 699 jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice); 700 io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd) + 701 tg->carryover_ios[rw]; 702 if (io_allowed > 0 && tg->io_disp[rw] + 1 <= io_allowed) 703 return 0; 704 705 /* Calc approx time to dispatch */ 706 jiffy_wait = jiffy_elapsed_rnd - jiffy_elapsed; 707 return jiffy_wait; 708 } 709 710 static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, 711 u64 bps_limit) 712 { 713 bool rw = bio_data_dir(bio); 714 long long bytes_allowed; 715 u64 extra_bytes; 716 unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; 717 unsigned int bio_size = throtl_bio_data_size(bio); 718 719 /* no need to throttle if this bio's bytes have been accounted */ 720 if (bps_limit == U64_MAX || bio_flagged(bio, BIO_BPS_THROTTLED)) { 721 return 0; 722 } 723 724 jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; 725 726 /* Slice has just started. Consider one slice interval */ 727 if (!jiffy_elapsed) 728 jiffy_elapsed_rnd = tg->td->throtl_slice; 729 730 jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice); 731 bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd) + 732 tg->carryover_bytes[rw]; 733 if (bytes_allowed > 0 && tg->bytes_disp[rw] + bio_size <= bytes_allowed) 734 return 0; 735 736 /* Calc approx time to dispatch */ 737 extra_bytes = tg->bytes_disp[rw] + bio_size - bytes_allowed; 738 jiffy_wait = div64_u64(extra_bytes * HZ, bps_limit); 739 740 if (!jiffy_wait) 741 jiffy_wait = 1; 742 743 /* 744 * This wait time is without taking into consideration the rounding 745 * up we did. Add that time also. 746 */ 747 jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed); 748 return jiffy_wait; 749 } 750 751 /* 752 * Returns whether one can dispatch a bio or not. Also returns approx number 753 * of jiffies to wait before this bio is with-in IO rate and can be dispatched 754 */ 755 static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, 756 unsigned long *wait) 757 { 758 bool rw = bio_data_dir(bio); 759 unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0; 760 u64 bps_limit = tg_bps_limit(tg, rw); 761 u32 iops_limit = tg_iops_limit(tg, rw); 762 763 /* 764 * Currently whole state machine of group depends on first bio 765 * queued in the group bio list. So one should not be calling 766 * this function with a different bio if there are other bios 767 * queued. 768 */ 769 BUG_ON(tg->service_queue.nr_queued[rw] && 770 bio != throtl_peek_queued(&tg->service_queue.queued[rw])); 771 772 /* If tg->bps = -1, then BW is unlimited */ 773 if ((bps_limit == U64_MAX && iops_limit == UINT_MAX) || 774 tg->flags & THROTL_TG_CANCELING) { 775 if (wait) 776 *wait = 0; 777 return true; 778 } 779 780 /* 781 * If previous slice expired, start a new one otherwise renew/extend 782 * existing slice to make sure it is at least throtl_slice interval 783 * long since now. New slice is started only for empty throttle group. 784 * If there is queued bio, that means there should be an active 785 * slice and it should be extended instead. 786 */ 787 if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw])) 788 throtl_start_new_slice(tg, rw, true); 789 else { 790 if (time_before(tg->slice_end[rw], 791 jiffies + tg->td->throtl_slice)) 792 throtl_extend_slice(tg, rw, 793 jiffies + tg->td->throtl_slice); 794 } 795 796 bps_wait = tg_within_bps_limit(tg, bio, bps_limit); 797 iops_wait = tg_within_iops_limit(tg, bio, iops_limit); 798 if (bps_wait + iops_wait == 0) { 799 if (wait) 800 *wait = 0; 801 return true; 802 } 803 804 max_wait = max(bps_wait, iops_wait); 805 806 if (wait) 807 *wait = max_wait; 808 809 if (time_before(tg->slice_end[rw], jiffies + max_wait)) 810 throtl_extend_slice(tg, rw, jiffies + max_wait); 811 812 return false; 813 } 814 815 static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) 816 { 817 bool rw = bio_data_dir(bio); 818 unsigned int bio_size = throtl_bio_data_size(bio); 819 820 /* Charge the bio to the group */ 821 if (!bio_flagged(bio, BIO_BPS_THROTTLED)) { 822 tg->bytes_disp[rw] += bio_size; 823 tg->last_bytes_disp[rw] += bio_size; 824 } 825 826 tg->io_disp[rw]++; 827 tg->last_io_disp[rw]++; 828 } 829 830 /** 831 * throtl_add_bio_tg - add a bio to the specified throtl_grp 832 * @bio: bio to add 833 * @qn: qnode to use 834 * @tg: the target throtl_grp 835 * 836 * Add @bio to @tg's service_queue using @qn. If @qn is not specified, 837 * tg->qnode_on_self[] is used. 838 */ 839 static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn, 840 struct throtl_grp *tg) 841 { 842 struct throtl_service_queue *sq = &tg->service_queue; 843 bool rw = bio_data_dir(bio); 844 845 if (!qn) 846 qn = &tg->qnode_on_self[rw]; 847 848 /* 849 * If @tg doesn't currently have any bios queued in the same 850 * direction, queueing @bio can change when @tg should be 851 * dispatched. Mark that @tg was empty. This is automatically 852 * cleared on the next tg_update_disptime(). 853 */ 854 if (!sq->nr_queued[rw]) 855 tg->flags |= THROTL_TG_WAS_EMPTY; 856 857 throtl_qnode_add_bio(bio, qn, &sq->queued[rw]); 858 859 sq->nr_queued[rw]++; 860 throtl_enqueue_tg(tg); 861 } 862 863 static void tg_update_disptime(struct throtl_grp *tg) 864 { 865 struct throtl_service_queue *sq = &tg->service_queue; 866 unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime; 867 struct bio *bio; 868 869 bio = throtl_peek_queued(&sq->queued[READ]); 870 if (bio) 871 tg_may_dispatch(tg, bio, &read_wait); 872 873 bio = throtl_peek_queued(&sq->queued[WRITE]); 874 if (bio) 875 tg_may_dispatch(tg, bio, &write_wait); 876 877 min_wait = min(read_wait, write_wait); 878 disptime = jiffies + min_wait; 879 880 /* Update dispatch time */ 881 throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq); 882 tg->disptime = disptime; 883 tg_service_queue_add(tg); 884 885 /* see throtl_add_bio_tg() */ 886 tg->flags &= ~THROTL_TG_WAS_EMPTY; 887 } 888 889 static void start_parent_slice_with_credit(struct throtl_grp *child_tg, 890 struct throtl_grp *parent_tg, bool rw) 891 { 892 if (throtl_slice_used(parent_tg, rw)) { 893 throtl_start_new_slice_with_credit(parent_tg, rw, 894 child_tg->slice_start[rw]); 895 } 896 897 } 898 899 static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw) 900 { 901 struct throtl_service_queue *sq = &tg->service_queue; 902 struct throtl_service_queue *parent_sq = sq->parent_sq; 903 struct throtl_grp *parent_tg = sq_to_tg(parent_sq); 904 struct throtl_grp *tg_to_put = NULL; 905 struct bio *bio; 906 907 /* 908 * @bio is being transferred from @tg to @parent_sq. Popping a bio 909 * from @tg may put its reference and @parent_sq might end up 910 * getting released prematurely. Remember the tg to put and put it 911 * after @bio is transferred to @parent_sq. 912 */ 913 bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put); 914 sq->nr_queued[rw]--; 915 916 throtl_charge_bio(tg, bio); 917 918 /* 919 * If our parent is another tg, we just need to transfer @bio to 920 * the parent using throtl_add_bio_tg(). If our parent is 921 * @td->service_queue, @bio is ready to be issued. Put it on its 922 * bio_lists[] and decrease total number queued. The caller is 923 * responsible for issuing these bios. 924 */ 925 if (parent_tg) { 926 throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg); 927 start_parent_slice_with_credit(tg, parent_tg, rw); 928 } else { 929 bio_set_flag(bio, BIO_BPS_THROTTLED); 930 throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw], 931 &parent_sq->queued[rw]); 932 BUG_ON(tg->td->nr_queued[rw] <= 0); 933 tg->td->nr_queued[rw]--; 934 } 935 936 throtl_trim_slice(tg, rw); 937 938 if (tg_to_put) 939 blkg_put(tg_to_blkg(tg_to_put)); 940 } 941 942 static int throtl_dispatch_tg(struct throtl_grp *tg) 943 { 944 struct throtl_service_queue *sq = &tg->service_queue; 945 unsigned int nr_reads = 0, nr_writes = 0; 946 unsigned int max_nr_reads = THROTL_GRP_QUANTUM * 3 / 4; 947 unsigned int max_nr_writes = THROTL_GRP_QUANTUM - max_nr_reads; 948 struct bio *bio; 949 950 /* Try to dispatch 75% READS and 25% WRITES */ 951 952 while ((bio = throtl_peek_queued(&sq->queued[READ])) && 953 tg_may_dispatch(tg, bio, NULL)) { 954 955 tg_dispatch_one_bio(tg, READ); 956 nr_reads++; 957 958 if (nr_reads >= max_nr_reads) 959 break; 960 } 961 962 while ((bio = throtl_peek_queued(&sq->queued[WRITE])) && 963 tg_may_dispatch(tg, bio, NULL)) { 964 965 tg_dispatch_one_bio(tg, WRITE); 966 nr_writes++; 967 968 if (nr_writes >= max_nr_writes) 969 break; 970 } 971 972 return nr_reads + nr_writes; 973 } 974 975 static int throtl_select_dispatch(struct throtl_service_queue *parent_sq) 976 { 977 unsigned int nr_disp = 0; 978 979 while (1) { 980 struct throtl_grp *tg; 981 struct throtl_service_queue *sq; 982 983 if (!parent_sq->nr_pending) 984 break; 985 986 tg = throtl_rb_first(parent_sq); 987 if (!tg) 988 break; 989 990 if (time_before(jiffies, tg->disptime)) 991 break; 992 993 nr_disp += throtl_dispatch_tg(tg); 994 995 sq = &tg->service_queue; 996 if (sq->nr_queued[READ] || sq->nr_queued[WRITE]) 997 tg_update_disptime(tg); 998 else 999 throtl_dequeue_tg(tg); 1000 1001 if (nr_disp >= THROTL_QUANTUM) 1002 break; 1003 } 1004 1005 return nr_disp; 1006 } 1007 1008 /** 1009 * throtl_pending_timer_fn - timer function for service_queue->pending_timer 1010 * @t: the pending_timer member of the throtl_service_queue being serviced 1011 * 1012 * This timer is armed when a child throtl_grp with active bio's become 1013 * pending and queued on the service_queue's pending_tree and expires when 1014 * the first child throtl_grp should be dispatched. This function 1015 * dispatches bio's from the children throtl_grps to the parent 1016 * service_queue. 1017 * 1018 * If the parent's parent is another throtl_grp, dispatching is propagated 1019 * by either arming its pending_timer or repeating dispatch directly. If 1020 * the top-level service_tree is reached, throtl_data->dispatch_work is 1021 * kicked so that the ready bio's are issued. 1022 */ 1023 static void throtl_pending_timer_fn(struct timer_list *t) 1024 { 1025 struct throtl_service_queue *sq = from_timer(sq, t, pending_timer); 1026 struct throtl_grp *tg = sq_to_tg(sq); 1027 struct throtl_data *td = sq_to_td(sq); 1028 struct throtl_service_queue *parent_sq; 1029 struct request_queue *q; 1030 bool dispatched; 1031 int ret; 1032 1033 /* throtl_data may be gone, so figure out request queue by blkg */ 1034 if (tg) 1035 q = tg->pd.blkg->q; 1036 else 1037 q = td->queue; 1038 1039 spin_lock_irq(&q->queue_lock); 1040 1041 if (!q->root_blkg) 1042 goto out_unlock; 1043 1044 again: 1045 parent_sq = sq->parent_sq; 1046 dispatched = false; 1047 1048 while (true) { 1049 throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u", 1050 sq->nr_queued[READ] + sq->nr_queued[WRITE], 1051 sq->nr_queued[READ], sq->nr_queued[WRITE]); 1052 1053 ret = throtl_select_dispatch(sq); 1054 if (ret) { 1055 throtl_log(sq, "bios disp=%u", ret); 1056 dispatched = true; 1057 } 1058 1059 if (throtl_schedule_next_dispatch(sq, false)) 1060 break; 1061 1062 /* this dispatch windows is still open, relax and repeat */ 1063 spin_unlock_irq(&q->queue_lock); 1064 cpu_relax(); 1065 spin_lock_irq(&q->queue_lock); 1066 } 1067 1068 if (!dispatched) 1069 goto out_unlock; 1070 1071 if (parent_sq) { 1072 /* @parent_sq is another throl_grp, propagate dispatch */ 1073 if (tg->flags & THROTL_TG_WAS_EMPTY) { 1074 tg_update_disptime(tg); 1075 if (!throtl_schedule_next_dispatch(parent_sq, false)) { 1076 /* window is already open, repeat dispatching */ 1077 sq = parent_sq; 1078 tg = sq_to_tg(sq); 1079 goto again; 1080 } 1081 } 1082 } else { 1083 /* reached the top-level, queue issuing */ 1084 queue_work(kthrotld_workqueue, &td->dispatch_work); 1085 } 1086 out_unlock: 1087 spin_unlock_irq(&q->queue_lock); 1088 } 1089 1090 /** 1091 * blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work 1092 * @work: work item being executed 1093 * 1094 * This function is queued for execution when bios reach the bio_lists[] 1095 * of throtl_data->service_queue. Those bios are ready and issued by this 1096 * function. 1097 */ 1098 static void blk_throtl_dispatch_work_fn(struct work_struct *work) 1099 { 1100 struct throtl_data *td = container_of(work, struct throtl_data, 1101 dispatch_work); 1102 struct throtl_service_queue *td_sq = &td->service_queue; 1103 struct request_queue *q = td->queue; 1104 struct bio_list bio_list_on_stack; 1105 struct bio *bio; 1106 struct blk_plug plug; 1107 int rw; 1108 1109 bio_list_init(&bio_list_on_stack); 1110 1111 spin_lock_irq(&q->queue_lock); 1112 for (rw = READ; rw <= WRITE; rw++) 1113 while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL))) 1114 bio_list_add(&bio_list_on_stack, bio); 1115 spin_unlock_irq(&q->queue_lock); 1116 1117 if (!bio_list_empty(&bio_list_on_stack)) { 1118 blk_start_plug(&plug); 1119 while ((bio = bio_list_pop(&bio_list_on_stack))) 1120 submit_bio_noacct_nocheck(bio); 1121 blk_finish_plug(&plug); 1122 } 1123 } 1124 1125 static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd, 1126 int off) 1127 { 1128 struct throtl_grp *tg = pd_to_tg(pd); 1129 u64 v = *(u64 *)((void *)tg + off); 1130 1131 if (v == U64_MAX) 1132 return 0; 1133 return __blkg_prfill_u64(sf, pd, v); 1134 } 1135 1136 static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd, 1137 int off) 1138 { 1139 struct throtl_grp *tg = pd_to_tg(pd); 1140 unsigned int v = *(unsigned int *)((void *)tg + off); 1141 1142 if (v == UINT_MAX) 1143 return 0; 1144 return __blkg_prfill_u64(sf, pd, v); 1145 } 1146 1147 static int tg_print_conf_u64(struct seq_file *sf, void *v) 1148 { 1149 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_u64, 1150 &blkcg_policy_throtl, seq_cft(sf)->private, false); 1151 return 0; 1152 } 1153 1154 static int tg_print_conf_uint(struct seq_file *sf, void *v) 1155 { 1156 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_uint, 1157 &blkcg_policy_throtl, seq_cft(sf)->private, false); 1158 return 0; 1159 } 1160 1161 static void tg_conf_updated(struct throtl_grp *tg, bool global) 1162 { 1163 struct throtl_service_queue *sq = &tg->service_queue; 1164 struct cgroup_subsys_state *pos_css; 1165 struct blkcg_gq *blkg; 1166 1167 throtl_log(&tg->service_queue, 1168 "limit change rbps=%llu wbps=%llu riops=%u wiops=%u", 1169 tg_bps_limit(tg, READ), tg_bps_limit(tg, WRITE), 1170 tg_iops_limit(tg, READ), tg_iops_limit(tg, WRITE)); 1171 1172 rcu_read_lock(); 1173 /* 1174 * Update has_rules[] flags for the updated tg's subtree. A tg is 1175 * considered to have rules if either the tg itself or any of its 1176 * ancestors has rules. This identifies groups without any 1177 * restrictions in the whole hierarchy and allows them to bypass 1178 * blk-throttle. 1179 */ 1180 blkg_for_each_descendant_pre(blkg, pos_css, 1181 global ? tg->td->queue->root_blkg : tg_to_blkg(tg)) { 1182 struct throtl_grp *this_tg = blkg_to_tg(blkg); 1183 1184 tg_update_has_rules(this_tg); 1185 /* ignore root/second level */ 1186 if (!cgroup_subsys_on_dfl(io_cgrp_subsys) || !blkg->parent || 1187 !blkg->parent->parent) 1188 continue; 1189 } 1190 rcu_read_unlock(); 1191 1192 /* 1193 * We're already holding queue_lock and know @tg is valid. Let's 1194 * apply the new config directly. 1195 * 1196 * Restart the slices for both READ and WRITES. It might happen 1197 * that a group's limit are dropped suddenly and we don't want to 1198 * account recently dispatched IO with new low rate. 1199 */ 1200 throtl_start_new_slice(tg, READ, false); 1201 throtl_start_new_slice(tg, WRITE, false); 1202 1203 if (tg->flags & THROTL_TG_PENDING) { 1204 tg_update_disptime(tg); 1205 throtl_schedule_next_dispatch(sq->parent_sq, true); 1206 } 1207 } 1208 1209 static int blk_throtl_init(struct gendisk *disk) 1210 { 1211 struct request_queue *q = disk->queue; 1212 struct throtl_data *td; 1213 int ret; 1214 1215 td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); 1216 if (!td) 1217 return -ENOMEM; 1218 1219 INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); 1220 throtl_service_queue_init(&td->service_queue); 1221 1222 /* 1223 * Freeze queue before activating policy, to synchronize with IO path, 1224 * which is protected by 'q_usage_counter'. 1225 */ 1226 blk_mq_freeze_queue(disk->queue); 1227 blk_mq_quiesce_queue(disk->queue); 1228 1229 q->td = td; 1230 td->queue = q; 1231 1232 /* activate policy */ 1233 ret = blkcg_activate_policy(disk, &blkcg_policy_throtl); 1234 if (ret) { 1235 q->td = NULL; 1236 kfree(td); 1237 goto out; 1238 } 1239 1240 if (blk_queue_nonrot(q)) 1241 td->throtl_slice = DFL_THROTL_SLICE_SSD; 1242 else 1243 td->throtl_slice = DFL_THROTL_SLICE_HD; 1244 td->track_bio_latency = !queue_is_mq(q); 1245 if (!td->track_bio_latency) 1246 blk_stat_enable_accounting(q); 1247 1248 out: 1249 blk_mq_unquiesce_queue(disk->queue); 1250 blk_mq_unfreeze_queue(disk->queue); 1251 1252 return ret; 1253 } 1254 1255 1256 static ssize_t tg_set_conf(struct kernfs_open_file *of, 1257 char *buf, size_t nbytes, loff_t off, bool is_u64) 1258 { 1259 struct blkcg *blkcg = css_to_blkcg(of_css(of)); 1260 struct blkg_conf_ctx ctx; 1261 struct throtl_grp *tg; 1262 int ret; 1263 u64 v; 1264 1265 blkg_conf_init(&ctx, buf); 1266 1267 ret = blkg_conf_open_bdev(&ctx); 1268 if (ret) 1269 goto out_finish; 1270 1271 if (!blk_throtl_activated(ctx.bdev->bd_queue)) { 1272 ret = blk_throtl_init(ctx.bdev->bd_disk); 1273 if (ret) 1274 goto out_finish; 1275 } 1276 1277 ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx); 1278 if (ret) 1279 goto out_finish; 1280 1281 ret = -EINVAL; 1282 if (sscanf(ctx.body, "%llu", &v) != 1) 1283 goto out_finish; 1284 if (!v) 1285 v = U64_MAX; 1286 1287 tg = blkg_to_tg(ctx.blkg); 1288 tg_update_carryover(tg); 1289 1290 if (is_u64) 1291 *(u64 *)((void *)tg + of_cft(of)->private) = v; 1292 else 1293 *(unsigned int *)((void *)tg + of_cft(of)->private) = v; 1294 1295 tg_conf_updated(tg, false); 1296 ret = 0; 1297 out_finish: 1298 blkg_conf_exit(&ctx); 1299 return ret ?: nbytes; 1300 } 1301 1302 static ssize_t tg_set_conf_u64(struct kernfs_open_file *of, 1303 char *buf, size_t nbytes, loff_t off) 1304 { 1305 return tg_set_conf(of, buf, nbytes, off, true); 1306 } 1307 1308 static ssize_t tg_set_conf_uint(struct kernfs_open_file *of, 1309 char *buf, size_t nbytes, loff_t off) 1310 { 1311 return tg_set_conf(of, buf, nbytes, off, false); 1312 } 1313 1314 static int tg_print_rwstat(struct seq_file *sf, void *v) 1315 { 1316 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), 1317 blkg_prfill_rwstat, &blkcg_policy_throtl, 1318 seq_cft(sf)->private, true); 1319 return 0; 1320 } 1321 1322 static u64 tg_prfill_rwstat_recursive(struct seq_file *sf, 1323 struct blkg_policy_data *pd, int off) 1324 { 1325 struct blkg_rwstat_sample sum; 1326 1327 blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_throtl, off, 1328 &sum); 1329 return __blkg_prfill_rwstat(sf, pd, &sum); 1330 } 1331 1332 static int tg_print_rwstat_recursive(struct seq_file *sf, void *v) 1333 { 1334 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), 1335 tg_prfill_rwstat_recursive, &blkcg_policy_throtl, 1336 seq_cft(sf)->private, true); 1337 return 0; 1338 } 1339 1340 static struct cftype throtl_legacy_files[] = { 1341 { 1342 .name = "throttle.read_bps_device", 1343 .private = offsetof(struct throtl_grp, bps[READ]), 1344 .seq_show = tg_print_conf_u64, 1345 .write = tg_set_conf_u64, 1346 }, 1347 { 1348 .name = "throttle.write_bps_device", 1349 .private = offsetof(struct throtl_grp, bps[WRITE]), 1350 .seq_show = tg_print_conf_u64, 1351 .write = tg_set_conf_u64, 1352 }, 1353 { 1354 .name = "throttle.read_iops_device", 1355 .private = offsetof(struct throtl_grp, iops[READ]), 1356 .seq_show = tg_print_conf_uint, 1357 .write = tg_set_conf_uint, 1358 }, 1359 { 1360 .name = "throttle.write_iops_device", 1361 .private = offsetof(struct throtl_grp, iops[WRITE]), 1362 .seq_show = tg_print_conf_uint, 1363 .write = tg_set_conf_uint, 1364 }, 1365 { 1366 .name = "throttle.io_service_bytes", 1367 .private = offsetof(struct throtl_grp, stat_bytes), 1368 .seq_show = tg_print_rwstat, 1369 }, 1370 { 1371 .name = "throttle.io_service_bytes_recursive", 1372 .private = offsetof(struct throtl_grp, stat_bytes), 1373 .seq_show = tg_print_rwstat_recursive, 1374 }, 1375 { 1376 .name = "throttle.io_serviced", 1377 .private = offsetof(struct throtl_grp, stat_ios), 1378 .seq_show = tg_print_rwstat, 1379 }, 1380 { 1381 .name = "throttle.io_serviced_recursive", 1382 .private = offsetof(struct throtl_grp, stat_ios), 1383 .seq_show = tg_print_rwstat_recursive, 1384 }, 1385 { } /* terminate */ 1386 }; 1387 1388 static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd, 1389 int off) 1390 { 1391 struct throtl_grp *tg = pd_to_tg(pd); 1392 const char *dname = blkg_dev_name(pd->blkg); 1393 u64 bps_dft; 1394 unsigned int iops_dft; 1395 1396 if (!dname) 1397 return 0; 1398 1399 bps_dft = U64_MAX; 1400 iops_dft = UINT_MAX; 1401 1402 if (tg->bps[READ] == bps_dft && 1403 tg->bps[WRITE] == bps_dft && 1404 tg->iops[READ] == iops_dft && 1405 tg->iops[WRITE] == iops_dft) 1406 return 0; 1407 1408 seq_printf(sf, "%s", dname); 1409 if (tg->bps[READ] == U64_MAX) 1410 seq_printf(sf, " rbps=max"); 1411 else 1412 seq_printf(sf, " rbps=%llu", tg->bps[READ]); 1413 1414 if (tg->bps[WRITE] == U64_MAX) 1415 seq_printf(sf, " wbps=max"); 1416 else 1417 seq_printf(sf, " wbps=%llu", tg->bps[WRITE]); 1418 1419 if (tg->iops[READ] == UINT_MAX) 1420 seq_printf(sf, " riops=max"); 1421 else 1422 seq_printf(sf, " riops=%u", tg->iops[READ]); 1423 1424 if (tg->iops[WRITE] == UINT_MAX) 1425 seq_printf(sf, " wiops=max"); 1426 else 1427 seq_printf(sf, " wiops=%u", tg->iops[WRITE]); 1428 1429 seq_printf(sf, "\n"); 1430 return 0; 1431 } 1432 1433 static int tg_print_limit(struct seq_file *sf, void *v) 1434 { 1435 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_limit, 1436 &blkcg_policy_throtl, seq_cft(sf)->private, false); 1437 return 0; 1438 } 1439 1440 static ssize_t tg_set_limit(struct kernfs_open_file *of, 1441 char *buf, size_t nbytes, loff_t off) 1442 { 1443 struct blkcg *blkcg = css_to_blkcg(of_css(of)); 1444 struct blkg_conf_ctx ctx; 1445 struct throtl_grp *tg; 1446 u64 v[4]; 1447 int ret; 1448 1449 blkg_conf_init(&ctx, buf); 1450 1451 ret = blkg_conf_open_bdev(&ctx); 1452 if (ret) 1453 goto out_finish; 1454 1455 if (!blk_throtl_activated(ctx.bdev->bd_queue)) { 1456 ret = blk_throtl_init(ctx.bdev->bd_disk); 1457 if (ret) 1458 goto out_finish; 1459 } 1460 1461 ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx); 1462 if (ret) 1463 goto out_finish; 1464 1465 tg = blkg_to_tg(ctx.blkg); 1466 tg_update_carryover(tg); 1467 1468 v[0] = tg->bps[READ]; 1469 v[1] = tg->bps[WRITE]; 1470 v[2] = tg->iops[READ]; 1471 v[3] = tg->iops[WRITE]; 1472 1473 while (true) { 1474 char tok[27]; /* wiops=18446744073709551616 */ 1475 char *p; 1476 u64 val = U64_MAX; 1477 int len; 1478 1479 if (sscanf(ctx.body, "%26s%n", tok, &len) != 1) 1480 break; 1481 if (tok[0] == '\0') 1482 break; 1483 ctx.body += len; 1484 1485 ret = -EINVAL; 1486 p = tok; 1487 strsep(&p, "="); 1488 if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max"))) 1489 goto out_finish; 1490 1491 ret = -ERANGE; 1492 if (!val) 1493 goto out_finish; 1494 1495 ret = -EINVAL; 1496 if (!strcmp(tok, "rbps") && val > 1) 1497 v[0] = val; 1498 else if (!strcmp(tok, "wbps") && val > 1) 1499 v[1] = val; 1500 else if (!strcmp(tok, "riops") && val > 1) 1501 v[2] = min_t(u64, val, UINT_MAX); 1502 else if (!strcmp(tok, "wiops") && val > 1) 1503 v[3] = min_t(u64, val, UINT_MAX); 1504 else 1505 goto out_finish; 1506 } 1507 1508 tg->bps[READ] = v[0]; 1509 tg->bps[WRITE] = v[1]; 1510 tg->iops[READ] = v[2]; 1511 tg->iops[WRITE] = v[3]; 1512 1513 tg_conf_updated(tg, false); 1514 ret = 0; 1515 out_finish: 1516 blkg_conf_exit(&ctx); 1517 return ret ?: nbytes; 1518 } 1519 1520 static struct cftype throtl_files[] = { 1521 { 1522 .name = "max", 1523 .flags = CFTYPE_NOT_ON_ROOT, 1524 .seq_show = tg_print_limit, 1525 .write = tg_set_limit, 1526 }, 1527 { } /* terminate */ 1528 }; 1529 1530 static void throtl_shutdown_wq(struct request_queue *q) 1531 { 1532 struct throtl_data *td = q->td; 1533 1534 cancel_work_sync(&td->dispatch_work); 1535 } 1536 1537 struct blkcg_policy blkcg_policy_throtl = { 1538 .dfl_cftypes = throtl_files, 1539 .legacy_cftypes = throtl_legacy_files, 1540 1541 .pd_alloc_fn = throtl_pd_alloc, 1542 .pd_init_fn = throtl_pd_init, 1543 .pd_online_fn = throtl_pd_online, 1544 .pd_free_fn = throtl_pd_free, 1545 }; 1546 1547 void blk_throtl_cancel_bios(struct gendisk *disk) 1548 { 1549 struct request_queue *q = disk->queue; 1550 struct cgroup_subsys_state *pos_css; 1551 struct blkcg_gq *blkg; 1552 1553 if (!blk_throtl_activated(q)) 1554 return; 1555 1556 spin_lock_irq(&q->queue_lock); 1557 /* 1558 * queue_lock is held, rcu lock is not needed here technically. 1559 * However, rcu lock is still held to emphasize that following 1560 * path need RCU protection and to prevent warning from lockdep. 1561 */ 1562 rcu_read_lock(); 1563 blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) { 1564 struct throtl_grp *tg = blkg_to_tg(blkg); 1565 struct throtl_service_queue *sq = &tg->service_queue; 1566 1567 /* 1568 * Set the flag to make sure throtl_pending_timer_fn() won't 1569 * stop until all throttled bios are dispatched. 1570 */ 1571 tg->flags |= THROTL_TG_CANCELING; 1572 1573 /* 1574 * Do not dispatch cgroup without THROTL_TG_PENDING or cgroup 1575 * will be inserted to service queue without THROTL_TG_PENDING 1576 * set in tg_update_disptime below. Then IO dispatched from 1577 * child in tg_dispatch_one_bio will trigger double insertion 1578 * and corrupt the tree. 1579 */ 1580 if (!(tg->flags & THROTL_TG_PENDING)) 1581 continue; 1582 1583 /* 1584 * Update disptime after setting the above flag to make sure 1585 * throtl_select_dispatch() won't exit without dispatching. 1586 */ 1587 tg_update_disptime(tg); 1588 1589 throtl_schedule_pending_timer(sq, jiffies + 1); 1590 } 1591 rcu_read_unlock(); 1592 spin_unlock_irq(&q->queue_lock); 1593 } 1594 1595 bool __blk_throtl_bio(struct bio *bio) 1596 { 1597 struct request_queue *q = bdev_get_queue(bio->bi_bdev); 1598 struct blkcg_gq *blkg = bio->bi_blkg; 1599 struct throtl_qnode *qn = NULL; 1600 struct throtl_grp *tg = blkg_to_tg(blkg); 1601 struct throtl_service_queue *sq; 1602 bool rw = bio_data_dir(bio); 1603 bool throttled = false; 1604 struct throtl_data *td = tg->td; 1605 1606 rcu_read_lock(); 1607 spin_lock_irq(&q->queue_lock); 1608 sq = &tg->service_queue; 1609 1610 while (true) { 1611 if (tg->last_low_overflow_time[rw] == 0) 1612 tg->last_low_overflow_time[rw] = jiffies; 1613 /* throtl is FIFO - if bios are already queued, should queue */ 1614 if (sq->nr_queued[rw]) 1615 break; 1616 1617 /* if above limits, break to queue */ 1618 if (!tg_may_dispatch(tg, bio, NULL)) { 1619 tg->last_low_overflow_time[rw] = jiffies; 1620 break; 1621 } 1622 1623 /* within limits, let's charge and dispatch directly */ 1624 throtl_charge_bio(tg, bio); 1625 1626 /* 1627 * We need to trim slice even when bios are not being queued 1628 * otherwise it might happen that a bio is not queued for 1629 * a long time and slice keeps on extending and trim is not 1630 * called for a long time. Now if limits are reduced suddenly 1631 * we take into account all the IO dispatched so far at new 1632 * low rate and * newly queued IO gets a really long dispatch 1633 * time. 1634 * 1635 * So keep on trimming slice even if bio is not queued. 1636 */ 1637 throtl_trim_slice(tg, rw); 1638 1639 /* 1640 * @bio passed through this layer without being throttled. 1641 * Climb up the ladder. If we're already at the top, it 1642 * can be executed directly. 1643 */ 1644 qn = &tg->qnode_on_parent[rw]; 1645 sq = sq->parent_sq; 1646 tg = sq_to_tg(sq); 1647 if (!tg) { 1648 bio_set_flag(bio, BIO_BPS_THROTTLED); 1649 goto out_unlock; 1650 } 1651 } 1652 1653 /* out-of-limit, queue to @tg */ 1654 throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d", 1655 rw == READ ? 'R' : 'W', 1656 tg->bytes_disp[rw], bio->bi_iter.bi_size, 1657 tg_bps_limit(tg, rw), 1658 tg->io_disp[rw], tg_iops_limit(tg, rw), 1659 sq->nr_queued[READ], sq->nr_queued[WRITE]); 1660 1661 tg->last_low_overflow_time[rw] = jiffies; 1662 1663 td->nr_queued[rw]++; 1664 throtl_add_bio_tg(bio, qn, tg); 1665 throttled = true; 1666 1667 /* 1668 * Update @tg's dispatch time and force schedule dispatch if @tg 1669 * was empty before @bio. The forced scheduling isn't likely to 1670 * cause undue delay as @bio is likely to be dispatched directly if 1671 * its @tg's disptime is not in the future. 1672 */ 1673 if (tg->flags & THROTL_TG_WAS_EMPTY) { 1674 tg_update_disptime(tg); 1675 throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true); 1676 } 1677 1678 out_unlock: 1679 spin_unlock_irq(&q->queue_lock); 1680 1681 rcu_read_unlock(); 1682 return throttled; 1683 } 1684 1685 void blk_throtl_exit(struct gendisk *disk) 1686 { 1687 struct request_queue *q = disk->queue; 1688 1689 if (!blk_throtl_activated(q)) 1690 return; 1691 1692 del_timer_sync(&q->td->service_queue.pending_timer); 1693 throtl_shutdown_wq(q); 1694 blkcg_deactivate_policy(disk, &blkcg_policy_throtl); 1695 kfree(q->td); 1696 } 1697 1698 static int __init throtl_init(void) 1699 { 1700 kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0); 1701 if (!kthrotld_workqueue) 1702 panic("Failed to create kthrotld\n"); 1703 1704 return blkcg_policy_register(&blkcg_policy_throtl); 1705 } 1706 1707 module_init(throtl_init); 1708