1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Interface for controlling IO bandwidth on a request queue 4 * 5 * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com> 6 */ 7 8 #include <linux/module.h> 9 #include <linux/slab.h> 10 #include <linux/blkdev.h> 11 #include <linux/bio.h> 12 #include <linux/blktrace_api.h> 13 #include "blk.h" 14 #include "blk-cgroup-rwstat.h" 15 #include "blk-stat.h" 16 #include "blk-throttle.h" 17 18 /* Max dispatch from a group in 1 round */ 19 #define THROTL_GRP_QUANTUM 8 20 21 /* Total max dispatch from all groups in one round */ 22 #define THROTL_QUANTUM 32 23 24 /* Throttling is performed over a slice and after that slice is renewed */ 25 #define DFL_THROTL_SLICE_HD (HZ / 10) 26 #define DFL_THROTL_SLICE_SSD (HZ / 50) 27 #define MAX_THROTL_SLICE (HZ) 28 29 /* A workqueue to queue throttle related work */ 30 static struct workqueue_struct *kthrotld_workqueue; 31 32 #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) 33 34 /* We measure latency for request size from <= 4k to >= 1M */ 35 #define LATENCY_BUCKET_SIZE 9 36 37 struct latency_bucket { 38 unsigned long total_latency; /* ns / 1024 */ 39 int samples; 40 }; 41 42 struct throtl_data 43 { 44 /* service tree for active throtl groups */ 45 struct throtl_service_queue service_queue; 46 47 struct request_queue *queue; 48 49 /* Total Number of queued bios on READ and WRITE lists */ 50 unsigned int nr_queued[2]; 51 52 unsigned int throtl_slice; 53 54 /* Work for dispatching throttled bios */ 55 struct work_struct dispatch_work; 56 57 bool track_bio_latency; 58 }; 59 60 static void throtl_pending_timer_fn(struct timer_list *t); 61 62 static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg) 63 { 64 return pd_to_blkg(&tg->pd); 65 } 66 67 /** 68 * sq_to_tg - return the throl_grp the specified service queue belongs to 69 * @sq: the throtl_service_queue of interest 70 * 71 * Return the throtl_grp @sq belongs to. If @sq is the top-level one 72 * embedded in throtl_data, %NULL is returned. 73 */ 74 static struct throtl_grp *sq_to_tg(struct throtl_service_queue *sq) 75 { 76 if (sq && sq->parent_sq) 77 return container_of(sq, struct throtl_grp, service_queue); 78 else 79 return NULL; 80 } 81 82 /** 83 * sq_to_td - return throtl_data the specified service queue belongs to 84 * @sq: the throtl_service_queue of interest 85 * 86 * A service_queue can be embedded in either a throtl_grp or throtl_data. 87 * Determine the associated throtl_data accordingly and return it. 88 */ 89 static struct throtl_data *sq_to_td(struct throtl_service_queue *sq) 90 { 91 struct throtl_grp *tg = sq_to_tg(sq); 92 93 if (tg) 94 return tg->td; 95 else 96 return container_of(sq, struct throtl_data, service_queue); 97 } 98 99 static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw) 100 { 101 struct blkcg_gq *blkg = tg_to_blkg(tg); 102 103 if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent) 104 return U64_MAX; 105 106 return tg->bps[rw]; 107 } 108 109 static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw) 110 { 111 struct blkcg_gq *blkg = tg_to_blkg(tg); 112 113 if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent) 114 return UINT_MAX; 115 116 return tg->iops[rw]; 117 } 118 119 #define request_bucket_index(sectors) \ 120 clamp_t(int, order_base_2(sectors) - 3, 0, LATENCY_BUCKET_SIZE - 1) 121 122 /** 123 * throtl_log - log debug message via blktrace 124 * @sq: the service_queue being reported 125 * @fmt: printf format string 126 * @args: printf args 127 * 128 * The messages are prefixed with "throtl BLKG_NAME" if @sq belongs to a 129 * throtl_grp; otherwise, just "throtl". 130 */ 131 #define throtl_log(sq, fmt, args...) do { \ 132 struct throtl_grp *__tg = sq_to_tg((sq)); \ 133 struct throtl_data *__td = sq_to_td((sq)); \ 134 \ 135 (void)__td; \ 136 if (likely(!blk_trace_note_message_enabled(__td->queue))) \ 137 break; \ 138 if ((__tg)) { \ 139 blk_add_cgroup_trace_msg(__td->queue, \ 140 &tg_to_blkg(__tg)->blkcg->css, "throtl " fmt, ##args);\ 141 } else { \ 142 blk_add_trace_msg(__td->queue, "throtl " fmt, ##args); \ 143 } \ 144 } while (0) 145 146 static inline unsigned int throtl_bio_data_size(struct bio *bio) 147 { 148 /* assume it's one sector */ 149 if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) 150 return 512; 151 return bio->bi_iter.bi_size; 152 } 153 154 static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg) 155 { 156 INIT_LIST_HEAD(&qn->node); 157 bio_list_init(&qn->bios); 158 qn->tg = tg; 159 } 160 161 /** 162 * throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it 163 * @bio: bio being added 164 * @qn: qnode to add bio to 165 * @queued: the service_queue->queued[] list @qn belongs to 166 * 167 * Add @bio to @qn and put @qn on @queued if it's not already on. 168 * @qn->tg's reference count is bumped when @qn is activated. See the 169 * comment on top of throtl_qnode definition for details. 170 */ 171 static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn, 172 struct list_head *queued) 173 { 174 bio_list_add(&qn->bios, bio); 175 if (list_empty(&qn->node)) { 176 list_add_tail(&qn->node, queued); 177 blkg_get(tg_to_blkg(qn->tg)); 178 } 179 } 180 181 /** 182 * throtl_peek_queued - peek the first bio on a qnode list 183 * @queued: the qnode list to peek 184 */ 185 static struct bio *throtl_peek_queued(struct list_head *queued) 186 { 187 struct throtl_qnode *qn; 188 struct bio *bio; 189 190 if (list_empty(queued)) 191 return NULL; 192 193 qn = list_first_entry(queued, struct throtl_qnode, node); 194 bio = bio_list_peek(&qn->bios); 195 WARN_ON_ONCE(!bio); 196 return bio; 197 } 198 199 /** 200 * throtl_pop_queued - pop the first bio form a qnode list 201 * @queued: the qnode list to pop a bio from 202 * @tg_to_put: optional out argument for throtl_grp to put 203 * 204 * Pop the first bio from the qnode list @queued. After popping, the first 205 * qnode is removed from @queued if empty or moved to the end of @queued so 206 * that the popping order is round-robin. 207 * 208 * When the first qnode is removed, its associated throtl_grp should be put 209 * too. If @tg_to_put is NULL, this function automatically puts it; 210 * otherwise, *@tg_to_put is set to the throtl_grp to put and the caller is 211 * responsible for putting it. 212 */ 213 static struct bio *throtl_pop_queued(struct list_head *queued, 214 struct throtl_grp **tg_to_put) 215 { 216 struct throtl_qnode *qn; 217 struct bio *bio; 218 219 if (list_empty(queued)) 220 return NULL; 221 222 qn = list_first_entry(queued, struct throtl_qnode, node); 223 bio = bio_list_pop(&qn->bios); 224 WARN_ON_ONCE(!bio); 225 226 if (bio_list_empty(&qn->bios)) { 227 list_del_init(&qn->node); 228 if (tg_to_put) 229 *tg_to_put = qn->tg; 230 else 231 blkg_put(tg_to_blkg(qn->tg)); 232 } else { 233 list_move_tail(&qn->node, queued); 234 } 235 236 return bio; 237 } 238 239 /* init a service_queue, assumes the caller zeroed it */ 240 static void throtl_service_queue_init(struct throtl_service_queue *sq) 241 { 242 INIT_LIST_HEAD(&sq->queued[READ]); 243 INIT_LIST_HEAD(&sq->queued[WRITE]); 244 sq->pending_tree = RB_ROOT_CACHED; 245 timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0); 246 } 247 248 static struct blkg_policy_data *throtl_pd_alloc(struct gendisk *disk, 249 struct blkcg *blkcg, gfp_t gfp) 250 { 251 struct throtl_grp *tg; 252 int rw; 253 254 tg = kzalloc_node(sizeof(*tg), gfp, disk->node_id); 255 if (!tg) 256 return NULL; 257 258 if (blkg_rwstat_init(&tg->stat_bytes, gfp)) 259 goto err_free_tg; 260 261 if (blkg_rwstat_init(&tg->stat_ios, gfp)) 262 goto err_exit_stat_bytes; 263 264 throtl_service_queue_init(&tg->service_queue); 265 266 for (rw = READ; rw <= WRITE; rw++) { 267 throtl_qnode_init(&tg->qnode_on_self[rw], tg); 268 throtl_qnode_init(&tg->qnode_on_parent[rw], tg); 269 } 270 271 RB_CLEAR_NODE(&tg->rb_node); 272 tg->bps[READ] = U64_MAX; 273 tg->bps[WRITE] = U64_MAX; 274 tg->iops[READ] = UINT_MAX; 275 tg->iops[WRITE] = UINT_MAX; 276 277 return &tg->pd; 278 279 err_exit_stat_bytes: 280 blkg_rwstat_exit(&tg->stat_bytes); 281 err_free_tg: 282 kfree(tg); 283 return NULL; 284 } 285 286 static void throtl_pd_init(struct blkg_policy_data *pd) 287 { 288 struct throtl_grp *tg = pd_to_tg(pd); 289 struct blkcg_gq *blkg = tg_to_blkg(tg); 290 struct throtl_data *td = blkg->q->td; 291 struct throtl_service_queue *sq = &tg->service_queue; 292 293 /* 294 * If on the default hierarchy, we switch to properly hierarchical 295 * behavior where limits on a given throtl_grp are applied to the 296 * whole subtree rather than just the group itself. e.g. If 16M 297 * read_bps limit is set on a parent group, summary bps of 298 * parent group and its subtree groups can't exceed 16M for the 299 * device. 300 * 301 * If not on the default hierarchy, the broken flat hierarchy 302 * behavior is retained where all throtl_grps are treated as if 303 * they're all separate root groups right below throtl_data. 304 * Limits of a group don't interact with limits of other groups 305 * regardless of the position of the group in the hierarchy. 306 */ 307 sq->parent_sq = &td->service_queue; 308 if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent) 309 sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue; 310 tg->td = td; 311 } 312 313 /* 314 * Set has_rules[] if @tg or any of its parents have limits configured. 315 * This doesn't require walking up to the top of the hierarchy as the 316 * parent's has_rules[] is guaranteed to be correct. 317 */ 318 static void tg_update_has_rules(struct throtl_grp *tg) 319 { 320 struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq); 321 int rw; 322 323 for (rw = READ; rw <= WRITE; rw++) { 324 tg->has_rules_iops[rw] = 325 (parent_tg && parent_tg->has_rules_iops[rw]) || 326 tg_iops_limit(tg, rw) != UINT_MAX; 327 tg->has_rules_bps[rw] = 328 (parent_tg && parent_tg->has_rules_bps[rw]) || 329 tg_bps_limit(tg, rw) != U64_MAX; 330 } 331 } 332 333 static void throtl_pd_online(struct blkg_policy_data *pd) 334 { 335 struct throtl_grp *tg = pd_to_tg(pd); 336 /* 337 * We don't want new groups to escape the limits of its ancestors. 338 * Update has_rules[] after a new group is brought online. 339 */ 340 tg_update_has_rules(tg); 341 } 342 343 static void throtl_pd_free(struct blkg_policy_data *pd) 344 { 345 struct throtl_grp *tg = pd_to_tg(pd); 346 347 del_timer_sync(&tg->service_queue.pending_timer); 348 blkg_rwstat_exit(&tg->stat_bytes); 349 blkg_rwstat_exit(&tg->stat_ios); 350 kfree(tg); 351 } 352 353 static struct throtl_grp * 354 throtl_rb_first(struct throtl_service_queue *parent_sq) 355 { 356 struct rb_node *n; 357 358 n = rb_first_cached(&parent_sq->pending_tree); 359 WARN_ON_ONCE(!n); 360 if (!n) 361 return NULL; 362 return rb_entry_tg(n); 363 } 364 365 static void throtl_rb_erase(struct rb_node *n, 366 struct throtl_service_queue *parent_sq) 367 { 368 rb_erase_cached(n, &parent_sq->pending_tree); 369 RB_CLEAR_NODE(n); 370 } 371 372 static void update_min_dispatch_time(struct throtl_service_queue *parent_sq) 373 { 374 struct throtl_grp *tg; 375 376 tg = throtl_rb_first(parent_sq); 377 if (!tg) 378 return; 379 380 parent_sq->first_pending_disptime = tg->disptime; 381 } 382 383 static void tg_service_queue_add(struct throtl_grp *tg) 384 { 385 struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq; 386 struct rb_node **node = &parent_sq->pending_tree.rb_root.rb_node; 387 struct rb_node *parent = NULL; 388 struct throtl_grp *__tg; 389 unsigned long key = tg->disptime; 390 bool leftmost = true; 391 392 while (*node != NULL) { 393 parent = *node; 394 __tg = rb_entry_tg(parent); 395 396 if (time_before(key, __tg->disptime)) 397 node = &parent->rb_left; 398 else { 399 node = &parent->rb_right; 400 leftmost = false; 401 } 402 } 403 404 rb_link_node(&tg->rb_node, parent, node); 405 rb_insert_color_cached(&tg->rb_node, &parent_sq->pending_tree, 406 leftmost); 407 } 408 409 static void throtl_enqueue_tg(struct throtl_grp *tg) 410 { 411 if (!(tg->flags & THROTL_TG_PENDING)) { 412 tg_service_queue_add(tg); 413 tg->flags |= THROTL_TG_PENDING; 414 tg->service_queue.parent_sq->nr_pending++; 415 } 416 } 417 418 static void throtl_dequeue_tg(struct throtl_grp *tg) 419 { 420 if (tg->flags & THROTL_TG_PENDING) { 421 struct throtl_service_queue *parent_sq = 422 tg->service_queue.parent_sq; 423 424 throtl_rb_erase(&tg->rb_node, parent_sq); 425 --parent_sq->nr_pending; 426 tg->flags &= ~THROTL_TG_PENDING; 427 } 428 } 429 430 /* Call with queue lock held */ 431 static void throtl_schedule_pending_timer(struct throtl_service_queue *sq, 432 unsigned long expires) 433 { 434 unsigned long max_expire = jiffies + 8 * sq_to_td(sq)->throtl_slice; 435 436 /* 437 * Since we are adjusting the throttle limit dynamically, the sleep 438 * time calculated according to previous limit might be invalid. It's 439 * possible the cgroup sleep time is very long and no other cgroups 440 * have IO running so notify the limit changes. Make sure the cgroup 441 * doesn't sleep too long to avoid the missed notification. 442 */ 443 if (time_after(expires, max_expire)) 444 expires = max_expire; 445 mod_timer(&sq->pending_timer, expires); 446 throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu", 447 expires - jiffies, jiffies); 448 } 449 450 /** 451 * throtl_schedule_next_dispatch - schedule the next dispatch cycle 452 * @sq: the service_queue to schedule dispatch for 453 * @force: force scheduling 454 * 455 * Arm @sq->pending_timer so that the next dispatch cycle starts on the 456 * dispatch time of the first pending child. Returns %true if either timer 457 * is armed or there's no pending child left. %false if the current 458 * dispatch window is still open and the caller should continue 459 * dispatching. 460 * 461 * If @force is %true, the dispatch timer is always scheduled and this 462 * function is guaranteed to return %true. This is to be used when the 463 * caller can't dispatch itself and needs to invoke pending_timer 464 * unconditionally. Note that forced scheduling is likely to induce short 465 * delay before dispatch starts even if @sq->first_pending_disptime is not 466 * in the future and thus shouldn't be used in hot paths. 467 */ 468 static bool throtl_schedule_next_dispatch(struct throtl_service_queue *sq, 469 bool force) 470 { 471 /* any pending children left? */ 472 if (!sq->nr_pending) 473 return true; 474 475 update_min_dispatch_time(sq); 476 477 /* is the next dispatch time in the future? */ 478 if (force || time_after(sq->first_pending_disptime, jiffies)) { 479 throtl_schedule_pending_timer(sq, sq->first_pending_disptime); 480 return true; 481 } 482 483 /* tell the caller to continue dispatching */ 484 return false; 485 } 486 487 static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, 488 bool rw, unsigned long start) 489 { 490 tg->bytes_disp[rw] = 0; 491 tg->io_disp[rw] = 0; 492 tg->carryover_bytes[rw] = 0; 493 tg->carryover_ios[rw] = 0; 494 495 /* 496 * Previous slice has expired. We must have trimmed it after last 497 * bio dispatch. That means since start of last slice, we never used 498 * that bandwidth. Do try to make use of that bandwidth while giving 499 * credit. 500 */ 501 if (time_after(start, tg->slice_start[rw])) 502 tg->slice_start[rw] = start; 503 504 tg->slice_end[rw] = jiffies + tg->td->throtl_slice; 505 throtl_log(&tg->service_queue, 506 "[%c] new slice with credit start=%lu end=%lu jiffies=%lu", 507 rw == READ ? 'R' : 'W', tg->slice_start[rw], 508 tg->slice_end[rw], jiffies); 509 } 510 511 static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw, 512 bool clear_carryover) 513 { 514 tg->bytes_disp[rw] = 0; 515 tg->io_disp[rw] = 0; 516 tg->slice_start[rw] = jiffies; 517 tg->slice_end[rw] = jiffies + tg->td->throtl_slice; 518 if (clear_carryover) { 519 tg->carryover_bytes[rw] = 0; 520 tg->carryover_ios[rw] = 0; 521 } 522 523 throtl_log(&tg->service_queue, 524 "[%c] new slice start=%lu end=%lu jiffies=%lu", 525 rw == READ ? 'R' : 'W', tg->slice_start[rw], 526 tg->slice_end[rw], jiffies); 527 } 528 529 static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw, 530 unsigned long jiffy_end) 531 { 532 tg->slice_end[rw] = roundup(jiffy_end, tg->td->throtl_slice); 533 } 534 535 static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw, 536 unsigned long jiffy_end) 537 { 538 throtl_set_slice_end(tg, rw, jiffy_end); 539 throtl_log(&tg->service_queue, 540 "[%c] extend slice start=%lu end=%lu jiffies=%lu", 541 rw == READ ? 'R' : 'W', tg->slice_start[rw], 542 tg->slice_end[rw], jiffies); 543 } 544 545 /* Determine if previously allocated or extended slice is complete or not */ 546 static bool throtl_slice_used(struct throtl_grp *tg, bool rw) 547 { 548 if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw])) 549 return false; 550 551 return true; 552 } 553 554 static unsigned int calculate_io_allowed(u32 iops_limit, 555 unsigned long jiffy_elapsed) 556 { 557 unsigned int io_allowed; 558 u64 tmp; 559 560 /* 561 * jiffy_elapsed should not be a big value as minimum iops can be 562 * 1 then at max jiffy elapsed should be equivalent of 1 second as we 563 * will allow dispatch after 1 second and after that slice should 564 * have been trimmed. 565 */ 566 567 tmp = (u64)iops_limit * jiffy_elapsed; 568 do_div(tmp, HZ); 569 570 if (tmp > UINT_MAX) 571 io_allowed = UINT_MAX; 572 else 573 io_allowed = tmp; 574 575 return io_allowed; 576 } 577 578 static u64 calculate_bytes_allowed(u64 bps_limit, unsigned long jiffy_elapsed) 579 { 580 /* 581 * Can result be wider than 64 bits? 582 * We check against 62, not 64, due to ilog2 truncation. 583 */ 584 if (ilog2(bps_limit) + ilog2(jiffy_elapsed) - ilog2(HZ) > 62) 585 return U64_MAX; 586 return mul_u64_u64_div_u64(bps_limit, (u64)jiffy_elapsed, (u64)HZ); 587 } 588 589 /* Trim the used slices and adjust slice start accordingly */ 590 static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) 591 { 592 unsigned long time_elapsed; 593 long long bytes_trim; 594 int io_trim; 595 596 BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw])); 597 598 /* 599 * If bps are unlimited (-1), then time slice don't get 600 * renewed. Don't try to trim the slice if slice is used. A new 601 * slice will start when appropriate. 602 */ 603 if (throtl_slice_used(tg, rw)) 604 return; 605 606 /* 607 * A bio has been dispatched. Also adjust slice_end. It might happen 608 * that initially cgroup limit was very low resulting in high 609 * slice_end, but later limit was bumped up and bio was dispatched 610 * sooner, then we need to reduce slice_end. A high bogus slice_end 611 * is bad because it does not allow new slice to start. 612 */ 613 614 throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice); 615 616 time_elapsed = rounddown(jiffies - tg->slice_start[rw], 617 tg->td->throtl_slice); 618 if (!time_elapsed) 619 return; 620 621 bytes_trim = calculate_bytes_allowed(tg_bps_limit(tg, rw), 622 time_elapsed) + 623 tg->carryover_bytes[rw]; 624 io_trim = calculate_io_allowed(tg_iops_limit(tg, rw), time_elapsed) + 625 tg->carryover_ios[rw]; 626 if (bytes_trim <= 0 && io_trim <= 0) 627 return; 628 629 tg->carryover_bytes[rw] = 0; 630 if ((long long)tg->bytes_disp[rw] >= bytes_trim) 631 tg->bytes_disp[rw] -= bytes_trim; 632 else 633 tg->bytes_disp[rw] = 0; 634 635 tg->carryover_ios[rw] = 0; 636 if ((int)tg->io_disp[rw] >= io_trim) 637 tg->io_disp[rw] -= io_trim; 638 else 639 tg->io_disp[rw] = 0; 640 641 tg->slice_start[rw] += time_elapsed; 642 643 throtl_log(&tg->service_queue, 644 "[%c] trim slice nr=%lu bytes=%lld io=%d start=%lu end=%lu jiffies=%lu", 645 rw == READ ? 'R' : 'W', time_elapsed / tg->td->throtl_slice, 646 bytes_trim, io_trim, tg->slice_start[rw], tg->slice_end[rw], 647 jiffies); 648 } 649 650 static void __tg_update_carryover(struct throtl_grp *tg, bool rw) 651 { 652 unsigned long jiffy_elapsed = jiffies - tg->slice_start[rw]; 653 u64 bps_limit = tg_bps_limit(tg, rw); 654 u32 iops_limit = tg_iops_limit(tg, rw); 655 656 /* 657 * If config is updated while bios are still throttled, calculate and 658 * accumulate how many bytes/ios are waited across changes. And 659 * carryover_bytes/ios will be used to calculate new wait time under new 660 * configuration. 661 */ 662 if (bps_limit != U64_MAX) 663 tg->carryover_bytes[rw] += 664 calculate_bytes_allowed(bps_limit, jiffy_elapsed) - 665 tg->bytes_disp[rw]; 666 if (iops_limit != UINT_MAX) 667 tg->carryover_ios[rw] += 668 calculate_io_allowed(iops_limit, jiffy_elapsed) - 669 tg->io_disp[rw]; 670 } 671 672 static void tg_update_carryover(struct throtl_grp *tg) 673 { 674 if (tg->service_queue.nr_queued[READ]) 675 __tg_update_carryover(tg, READ); 676 if (tg->service_queue.nr_queued[WRITE]) 677 __tg_update_carryover(tg, WRITE); 678 679 /* see comments in struct throtl_grp for meaning of these fields. */ 680 throtl_log(&tg->service_queue, "%s: %lld %lld %d %d\n", __func__, 681 tg->carryover_bytes[READ], tg->carryover_bytes[WRITE], 682 tg->carryover_ios[READ], tg->carryover_ios[WRITE]); 683 } 684 685 static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio, 686 u32 iops_limit) 687 { 688 bool rw = bio_data_dir(bio); 689 int io_allowed; 690 unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; 691 692 if (iops_limit == UINT_MAX) { 693 return 0; 694 } 695 696 jiffy_elapsed = jiffies - tg->slice_start[rw]; 697 698 /* Round up to the next throttle slice, wait time must be nonzero */ 699 jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice); 700 io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd) + 701 tg->carryover_ios[rw]; 702 if (io_allowed > 0 && tg->io_disp[rw] + 1 <= io_allowed) 703 return 0; 704 705 /* Calc approx time to dispatch */ 706 jiffy_wait = jiffy_elapsed_rnd - jiffy_elapsed; 707 708 /* make sure at least one io can be dispatched after waiting */ 709 jiffy_wait = max(jiffy_wait, HZ / iops_limit + 1); 710 return jiffy_wait; 711 } 712 713 static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, 714 u64 bps_limit) 715 { 716 bool rw = bio_data_dir(bio); 717 long long bytes_allowed; 718 u64 extra_bytes; 719 unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; 720 unsigned int bio_size = throtl_bio_data_size(bio); 721 722 /* no need to throttle if this bio's bytes have been accounted */ 723 if (bps_limit == U64_MAX || bio_flagged(bio, BIO_BPS_THROTTLED)) { 724 return 0; 725 } 726 727 jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; 728 729 /* Slice has just started. Consider one slice interval */ 730 if (!jiffy_elapsed) 731 jiffy_elapsed_rnd = tg->td->throtl_slice; 732 733 jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice); 734 bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd) + 735 tg->carryover_bytes[rw]; 736 if (bytes_allowed > 0 && tg->bytes_disp[rw] + bio_size <= bytes_allowed) 737 return 0; 738 739 /* Calc approx time to dispatch */ 740 extra_bytes = tg->bytes_disp[rw] + bio_size - bytes_allowed; 741 jiffy_wait = div64_u64(extra_bytes * HZ, bps_limit); 742 743 if (!jiffy_wait) 744 jiffy_wait = 1; 745 746 /* 747 * This wait time is without taking into consideration the rounding 748 * up we did. Add that time also. 749 */ 750 jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed); 751 return jiffy_wait; 752 } 753 754 /* 755 * Returns whether one can dispatch a bio or not. Also returns approx number 756 * of jiffies to wait before this bio is with-in IO rate and can be dispatched 757 */ 758 static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, 759 unsigned long *wait) 760 { 761 bool rw = bio_data_dir(bio); 762 unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0; 763 u64 bps_limit = tg_bps_limit(tg, rw); 764 u32 iops_limit = tg_iops_limit(tg, rw); 765 766 /* 767 * Currently whole state machine of group depends on first bio 768 * queued in the group bio list. So one should not be calling 769 * this function with a different bio if there are other bios 770 * queued. 771 */ 772 BUG_ON(tg->service_queue.nr_queued[rw] && 773 bio != throtl_peek_queued(&tg->service_queue.queued[rw])); 774 775 /* If tg->bps = -1, then BW is unlimited */ 776 if ((bps_limit == U64_MAX && iops_limit == UINT_MAX) || 777 tg->flags & THROTL_TG_CANCELING) { 778 if (wait) 779 *wait = 0; 780 return true; 781 } 782 783 /* 784 * If previous slice expired, start a new one otherwise renew/extend 785 * existing slice to make sure it is at least throtl_slice interval 786 * long since now. New slice is started only for empty throttle group. 787 * If there is queued bio, that means there should be an active 788 * slice and it should be extended instead. 789 */ 790 if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw])) 791 throtl_start_new_slice(tg, rw, true); 792 else { 793 if (time_before(tg->slice_end[rw], 794 jiffies + tg->td->throtl_slice)) 795 throtl_extend_slice(tg, rw, 796 jiffies + tg->td->throtl_slice); 797 } 798 799 bps_wait = tg_within_bps_limit(tg, bio, bps_limit); 800 iops_wait = tg_within_iops_limit(tg, bio, iops_limit); 801 if (bps_wait + iops_wait == 0) { 802 if (wait) 803 *wait = 0; 804 return true; 805 } 806 807 max_wait = max(bps_wait, iops_wait); 808 809 if (wait) 810 *wait = max_wait; 811 812 if (time_before(tg->slice_end[rw], jiffies + max_wait)) 813 throtl_extend_slice(tg, rw, jiffies + max_wait); 814 815 return false; 816 } 817 818 static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) 819 { 820 bool rw = bio_data_dir(bio); 821 unsigned int bio_size = throtl_bio_data_size(bio); 822 823 /* Charge the bio to the group */ 824 if (!bio_flagged(bio, BIO_BPS_THROTTLED)) { 825 tg->bytes_disp[rw] += bio_size; 826 tg->last_bytes_disp[rw] += bio_size; 827 } 828 829 tg->io_disp[rw]++; 830 tg->last_io_disp[rw]++; 831 } 832 833 /** 834 * throtl_add_bio_tg - add a bio to the specified throtl_grp 835 * @bio: bio to add 836 * @qn: qnode to use 837 * @tg: the target throtl_grp 838 * 839 * Add @bio to @tg's service_queue using @qn. If @qn is not specified, 840 * tg->qnode_on_self[] is used. 841 */ 842 static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn, 843 struct throtl_grp *tg) 844 { 845 struct throtl_service_queue *sq = &tg->service_queue; 846 bool rw = bio_data_dir(bio); 847 848 if (!qn) 849 qn = &tg->qnode_on_self[rw]; 850 851 /* 852 * If @tg doesn't currently have any bios queued in the same 853 * direction, queueing @bio can change when @tg should be 854 * dispatched. Mark that @tg was empty. This is automatically 855 * cleared on the next tg_update_disptime(). 856 */ 857 if (!sq->nr_queued[rw]) 858 tg->flags |= THROTL_TG_WAS_EMPTY; 859 860 throtl_qnode_add_bio(bio, qn, &sq->queued[rw]); 861 862 sq->nr_queued[rw]++; 863 throtl_enqueue_tg(tg); 864 } 865 866 static void tg_update_disptime(struct throtl_grp *tg) 867 { 868 struct throtl_service_queue *sq = &tg->service_queue; 869 unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime; 870 struct bio *bio; 871 872 bio = throtl_peek_queued(&sq->queued[READ]); 873 if (bio) 874 tg_may_dispatch(tg, bio, &read_wait); 875 876 bio = throtl_peek_queued(&sq->queued[WRITE]); 877 if (bio) 878 tg_may_dispatch(tg, bio, &write_wait); 879 880 min_wait = min(read_wait, write_wait); 881 disptime = jiffies + min_wait; 882 883 /* Update dispatch time */ 884 throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq); 885 tg->disptime = disptime; 886 tg_service_queue_add(tg); 887 888 /* see throtl_add_bio_tg() */ 889 tg->flags &= ~THROTL_TG_WAS_EMPTY; 890 } 891 892 static void start_parent_slice_with_credit(struct throtl_grp *child_tg, 893 struct throtl_grp *parent_tg, bool rw) 894 { 895 if (throtl_slice_used(parent_tg, rw)) { 896 throtl_start_new_slice_with_credit(parent_tg, rw, 897 child_tg->slice_start[rw]); 898 } 899 900 } 901 902 static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw) 903 { 904 struct throtl_service_queue *sq = &tg->service_queue; 905 struct throtl_service_queue *parent_sq = sq->parent_sq; 906 struct throtl_grp *parent_tg = sq_to_tg(parent_sq); 907 struct throtl_grp *tg_to_put = NULL; 908 struct bio *bio; 909 910 /* 911 * @bio is being transferred from @tg to @parent_sq. Popping a bio 912 * from @tg may put its reference and @parent_sq might end up 913 * getting released prematurely. Remember the tg to put and put it 914 * after @bio is transferred to @parent_sq. 915 */ 916 bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put); 917 sq->nr_queued[rw]--; 918 919 throtl_charge_bio(tg, bio); 920 921 /* 922 * If our parent is another tg, we just need to transfer @bio to 923 * the parent using throtl_add_bio_tg(). If our parent is 924 * @td->service_queue, @bio is ready to be issued. Put it on its 925 * bio_lists[] and decrease total number queued. The caller is 926 * responsible for issuing these bios. 927 */ 928 if (parent_tg) { 929 throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg); 930 start_parent_slice_with_credit(tg, parent_tg, rw); 931 } else { 932 bio_set_flag(bio, BIO_BPS_THROTTLED); 933 throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw], 934 &parent_sq->queued[rw]); 935 BUG_ON(tg->td->nr_queued[rw] <= 0); 936 tg->td->nr_queued[rw]--; 937 } 938 939 throtl_trim_slice(tg, rw); 940 941 if (tg_to_put) 942 blkg_put(tg_to_blkg(tg_to_put)); 943 } 944 945 static int throtl_dispatch_tg(struct throtl_grp *tg) 946 { 947 struct throtl_service_queue *sq = &tg->service_queue; 948 unsigned int nr_reads = 0, nr_writes = 0; 949 unsigned int max_nr_reads = THROTL_GRP_QUANTUM * 3 / 4; 950 unsigned int max_nr_writes = THROTL_GRP_QUANTUM - max_nr_reads; 951 struct bio *bio; 952 953 /* Try to dispatch 75% READS and 25% WRITES */ 954 955 while ((bio = throtl_peek_queued(&sq->queued[READ])) && 956 tg_may_dispatch(tg, bio, NULL)) { 957 958 tg_dispatch_one_bio(tg, READ); 959 nr_reads++; 960 961 if (nr_reads >= max_nr_reads) 962 break; 963 } 964 965 while ((bio = throtl_peek_queued(&sq->queued[WRITE])) && 966 tg_may_dispatch(tg, bio, NULL)) { 967 968 tg_dispatch_one_bio(tg, WRITE); 969 nr_writes++; 970 971 if (nr_writes >= max_nr_writes) 972 break; 973 } 974 975 return nr_reads + nr_writes; 976 } 977 978 static int throtl_select_dispatch(struct throtl_service_queue *parent_sq) 979 { 980 unsigned int nr_disp = 0; 981 982 while (1) { 983 struct throtl_grp *tg; 984 struct throtl_service_queue *sq; 985 986 if (!parent_sq->nr_pending) 987 break; 988 989 tg = throtl_rb_first(parent_sq); 990 if (!tg) 991 break; 992 993 if (time_before(jiffies, tg->disptime)) 994 break; 995 996 nr_disp += throtl_dispatch_tg(tg); 997 998 sq = &tg->service_queue; 999 if (sq->nr_queued[READ] || sq->nr_queued[WRITE]) 1000 tg_update_disptime(tg); 1001 else 1002 throtl_dequeue_tg(tg); 1003 1004 if (nr_disp >= THROTL_QUANTUM) 1005 break; 1006 } 1007 1008 return nr_disp; 1009 } 1010 1011 /** 1012 * throtl_pending_timer_fn - timer function for service_queue->pending_timer 1013 * @t: the pending_timer member of the throtl_service_queue being serviced 1014 * 1015 * This timer is armed when a child throtl_grp with active bio's become 1016 * pending and queued on the service_queue's pending_tree and expires when 1017 * the first child throtl_grp should be dispatched. This function 1018 * dispatches bio's from the children throtl_grps to the parent 1019 * service_queue. 1020 * 1021 * If the parent's parent is another throtl_grp, dispatching is propagated 1022 * by either arming its pending_timer or repeating dispatch directly. If 1023 * the top-level service_tree is reached, throtl_data->dispatch_work is 1024 * kicked so that the ready bio's are issued. 1025 */ 1026 static void throtl_pending_timer_fn(struct timer_list *t) 1027 { 1028 struct throtl_service_queue *sq = from_timer(sq, t, pending_timer); 1029 struct throtl_grp *tg = sq_to_tg(sq); 1030 struct throtl_data *td = sq_to_td(sq); 1031 struct throtl_service_queue *parent_sq; 1032 struct request_queue *q; 1033 bool dispatched; 1034 int ret; 1035 1036 /* throtl_data may be gone, so figure out request queue by blkg */ 1037 if (tg) 1038 q = tg->pd.blkg->q; 1039 else 1040 q = td->queue; 1041 1042 spin_lock_irq(&q->queue_lock); 1043 1044 if (!q->root_blkg) 1045 goto out_unlock; 1046 1047 again: 1048 parent_sq = sq->parent_sq; 1049 dispatched = false; 1050 1051 while (true) { 1052 throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u", 1053 sq->nr_queued[READ] + sq->nr_queued[WRITE], 1054 sq->nr_queued[READ], sq->nr_queued[WRITE]); 1055 1056 ret = throtl_select_dispatch(sq); 1057 if (ret) { 1058 throtl_log(sq, "bios disp=%u", ret); 1059 dispatched = true; 1060 } 1061 1062 if (throtl_schedule_next_dispatch(sq, false)) 1063 break; 1064 1065 /* this dispatch windows is still open, relax and repeat */ 1066 spin_unlock_irq(&q->queue_lock); 1067 cpu_relax(); 1068 spin_lock_irq(&q->queue_lock); 1069 } 1070 1071 if (!dispatched) 1072 goto out_unlock; 1073 1074 if (parent_sq) { 1075 /* @parent_sq is another throl_grp, propagate dispatch */ 1076 if (tg->flags & THROTL_TG_WAS_EMPTY) { 1077 tg_update_disptime(tg); 1078 if (!throtl_schedule_next_dispatch(parent_sq, false)) { 1079 /* window is already open, repeat dispatching */ 1080 sq = parent_sq; 1081 tg = sq_to_tg(sq); 1082 goto again; 1083 } 1084 } 1085 } else { 1086 /* reached the top-level, queue issuing */ 1087 queue_work(kthrotld_workqueue, &td->dispatch_work); 1088 } 1089 out_unlock: 1090 spin_unlock_irq(&q->queue_lock); 1091 } 1092 1093 /** 1094 * blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work 1095 * @work: work item being executed 1096 * 1097 * This function is queued for execution when bios reach the bio_lists[] 1098 * of throtl_data->service_queue. Those bios are ready and issued by this 1099 * function. 1100 */ 1101 static void blk_throtl_dispatch_work_fn(struct work_struct *work) 1102 { 1103 struct throtl_data *td = container_of(work, struct throtl_data, 1104 dispatch_work); 1105 struct throtl_service_queue *td_sq = &td->service_queue; 1106 struct request_queue *q = td->queue; 1107 struct bio_list bio_list_on_stack; 1108 struct bio *bio; 1109 struct blk_plug plug; 1110 int rw; 1111 1112 bio_list_init(&bio_list_on_stack); 1113 1114 spin_lock_irq(&q->queue_lock); 1115 for (rw = READ; rw <= WRITE; rw++) 1116 while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL))) 1117 bio_list_add(&bio_list_on_stack, bio); 1118 spin_unlock_irq(&q->queue_lock); 1119 1120 if (!bio_list_empty(&bio_list_on_stack)) { 1121 blk_start_plug(&plug); 1122 while ((bio = bio_list_pop(&bio_list_on_stack))) 1123 submit_bio_noacct_nocheck(bio); 1124 blk_finish_plug(&plug); 1125 } 1126 } 1127 1128 static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd, 1129 int off) 1130 { 1131 struct throtl_grp *tg = pd_to_tg(pd); 1132 u64 v = *(u64 *)((void *)tg + off); 1133 1134 if (v == U64_MAX) 1135 return 0; 1136 return __blkg_prfill_u64(sf, pd, v); 1137 } 1138 1139 static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd, 1140 int off) 1141 { 1142 struct throtl_grp *tg = pd_to_tg(pd); 1143 unsigned int v = *(unsigned int *)((void *)tg + off); 1144 1145 if (v == UINT_MAX) 1146 return 0; 1147 return __blkg_prfill_u64(sf, pd, v); 1148 } 1149 1150 static int tg_print_conf_u64(struct seq_file *sf, void *v) 1151 { 1152 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_u64, 1153 &blkcg_policy_throtl, seq_cft(sf)->private, false); 1154 return 0; 1155 } 1156 1157 static int tg_print_conf_uint(struct seq_file *sf, void *v) 1158 { 1159 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_uint, 1160 &blkcg_policy_throtl, seq_cft(sf)->private, false); 1161 return 0; 1162 } 1163 1164 static void tg_conf_updated(struct throtl_grp *tg, bool global) 1165 { 1166 struct throtl_service_queue *sq = &tg->service_queue; 1167 struct cgroup_subsys_state *pos_css; 1168 struct blkcg_gq *blkg; 1169 1170 throtl_log(&tg->service_queue, 1171 "limit change rbps=%llu wbps=%llu riops=%u wiops=%u", 1172 tg_bps_limit(tg, READ), tg_bps_limit(tg, WRITE), 1173 tg_iops_limit(tg, READ), tg_iops_limit(tg, WRITE)); 1174 1175 rcu_read_lock(); 1176 /* 1177 * Update has_rules[] flags for the updated tg's subtree. A tg is 1178 * considered to have rules if either the tg itself or any of its 1179 * ancestors has rules. This identifies groups without any 1180 * restrictions in the whole hierarchy and allows them to bypass 1181 * blk-throttle. 1182 */ 1183 blkg_for_each_descendant_pre(blkg, pos_css, 1184 global ? tg->td->queue->root_blkg : tg_to_blkg(tg)) { 1185 struct throtl_grp *this_tg = blkg_to_tg(blkg); 1186 1187 tg_update_has_rules(this_tg); 1188 /* ignore root/second level */ 1189 if (!cgroup_subsys_on_dfl(io_cgrp_subsys) || !blkg->parent || 1190 !blkg->parent->parent) 1191 continue; 1192 } 1193 rcu_read_unlock(); 1194 1195 /* 1196 * We're already holding queue_lock and know @tg is valid. Let's 1197 * apply the new config directly. 1198 * 1199 * Restart the slices for both READ and WRITES. It might happen 1200 * that a group's limit are dropped suddenly and we don't want to 1201 * account recently dispatched IO with new low rate. 1202 */ 1203 throtl_start_new_slice(tg, READ, false); 1204 throtl_start_new_slice(tg, WRITE, false); 1205 1206 if (tg->flags & THROTL_TG_PENDING) { 1207 tg_update_disptime(tg); 1208 throtl_schedule_next_dispatch(sq->parent_sq, true); 1209 } 1210 } 1211 1212 static int blk_throtl_init(struct gendisk *disk) 1213 { 1214 struct request_queue *q = disk->queue; 1215 struct throtl_data *td; 1216 int ret; 1217 1218 td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); 1219 if (!td) 1220 return -ENOMEM; 1221 1222 INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); 1223 throtl_service_queue_init(&td->service_queue); 1224 1225 /* 1226 * Freeze queue before activating policy, to synchronize with IO path, 1227 * which is protected by 'q_usage_counter'. 1228 */ 1229 blk_mq_freeze_queue(disk->queue); 1230 blk_mq_quiesce_queue(disk->queue); 1231 1232 q->td = td; 1233 td->queue = q; 1234 1235 /* activate policy */ 1236 ret = blkcg_activate_policy(disk, &blkcg_policy_throtl); 1237 if (ret) { 1238 q->td = NULL; 1239 kfree(td); 1240 goto out; 1241 } 1242 1243 if (blk_queue_nonrot(q)) 1244 td->throtl_slice = DFL_THROTL_SLICE_SSD; 1245 else 1246 td->throtl_slice = DFL_THROTL_SLICE_HD; 1247 td->track_bio_latency = !queue_is_mq(q); 1248 if (!td->track_bio_latency) 1249 blk_stat_enable_accounting(q); 1250 1251 out: 1252 blk_mq_unquiesce_queue(disk->queue); 1253 blk_mq_unfreeze_queue(disk->queue); 1254 1255 return ret; 1256 } 1257 1258 1259 static ssize_t tg_set_conf(struct kernfs_open_file *of, 1260 char *buf, size_t nbytes, loff_t off, bool is_u64) 1261 { 1262 struct blkcg *blkcg = css_to_blkcg(of_css(of)); 1263 struct blkg_conf_ctx ctx; 1264 struct throtl_grp *tg; 1265 int ret; 1266 u64 v; 1267 1268 blkg_conf_init(&ctx, buf); 1269 1270 ret = blkg_conf_open_bdev(&ctx); 1271 if (ret) 1272 goto out_finish; 1273 1274 if (!blk_throtl_activated(ctx.bdev->bd_queue)) { 1275 ret = blk_throtl_init(ctx.bdev->bd_disk); 1276 if (ret) 1277 goto out_finish; 1278 } 1279 1280 ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx); 1281 if (ret) 1282 goto out_finish; 1283 1284 ret = -EINVAL; 1285 if (sscanf(ctx.body, "%llu", &v) != 1) 1286 goto out_finish; 1287 if (!v) 1288 v = U64_MAX; 1289 1290 tg = blkg_to_tg(ctx.blkg); 1291 tg_update_carryover(tg); 1292 1293 if (is_u64) 1294 *(u64 *)((void *)tg + of_cft(of)->private) = v; 1295 else 1296 *(unsigned int *)((void *)tg + of_cft(of)->private) = v; 1297 1298 tg_conf_updated(tg, false); 1299 ret = 0; 1300 out_finish: 1301 blkg_conf_exit(&ctx); 1302 return ret ?: nbytes; 1303 } 1304 1305 static ssize_t tg_set_conf_u64(struct kernfs_open_file *of, 1306 char *buf, size_t nbytes, loff_t off) 1307 { 1308 return tg_set_conf(of, buf, nbytes, off, true); 1309 } 1310 1311 static ssize_t tg_set_conf_uint(struct kernfs_open_file *of, 1312 char *buf, size_t nbytes, loff_t off) 1313 { 1314 return tg_set_conf(of, buf, nbytes, off, false); 1315 } 1316 1317 static int tg_print_rwstat(struct seq_file *sf, void *v) 1318 { 1319 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), 1320 blkg_prfill_rwstat, &blkcg_policy_throtl, 1321 seq_cft(sf)->private, true); 1322 return 0; 1323 } 1324 1325 static u64 tg_prfill_rwstat_recursive(struct seq_file *sf, 1326 struct blkg_policy_data *pd, int off) 1327 { 1328 struct blkg_rwstat_sample sum; 1329 1330 blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_throtl, off, 1331 &sum); 1332 return __blkg_prfill_rwstat(sf, pd, &sum); 1333 } 1334 1335 static int tg_print_rwstat_recursive(struct seq_file *sf, void *v) 1336 { 1337 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), 1338 tg_prfill_rwstat_recursive, &blkcg_policy_throtl, 1339 seq_cft(sf)->private, true); 1340 return 0; 1341 } 1342 1343 static struct cftype throtl_legacy_files[] = { 1344 { 1345 .name = "throttle.read_bps_device", 1346 .private = offsetof(struct throtl_grp, bps[READ]), 1347 .seq_show = tg_print_conf_u64, 1348 .write = tg_set_conf_u64, 1349 }, 1350 { 1351 .name = "throttle.write_bps_device", 1352 .private = offsetof(struct throtl_grp, bps[WRITE]), 1353 .seq_show = tg_print_conf_u64, 1354 .write = tg_set_conf_u64, 1355 }, 1356 { 1357 .name = "throttle.read_iops_device", 1358 .private = offsetof(struct throtl_grp, iops[READ]), 1359 .seq_show = tg_print_conf_uint, 1360 .write = tg_set_conf_uint, 1361 }, 1362 { 1363 .name = "throttle.write_iops_device", 1364 .private = offsetof(struct throtl_grp, iops[WRITE]), 1365 .seq_show = tg_print_conf_uint, 1366 .write = tg_set_conf_uint, 1367 }, 1368 { 1369 .name = "throttle.io_service_bytes", 1370 .private = offsetof(struct throtl_grp, stat_bytes), 1371 .seq_show = tg_print_rwstat, 1372 }, 1373 { 1374 .name = "throttle.io_service_bytes_recursive", 1375 .private = offsetof(struct throtl_grp, stat_bytes), 1376 .seq_show = tg_print_rwstat_recursive, 1377 }, 1378 { 1379 .name = "throttle.io_serviced", 1380 .private = offsetof(struct throtl_grp, stat_ios), 1381 .seq_show = tg_print_rwstat, 1382 }, 1383 { 1384 .name = "throttle.io_serviced_recursive", 1385 .private = offsetof(struct throtl_grp, stat_ios), 1386 .seq_show = tg_print_rwstat_recursive, 1387 }, 1388 { } /* terminate */ 1389 }; 1390 1391 static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd, 1392 int off) 1393 { 1394 struct throtl_grp *tg = pd_to_tg(pd); 1395 const char *dname = blkg_dev_name(pd->blkg); 1396 u64 bps_dft; 1397 unsigned int iops_dft; 1398 1399 if (!dname) 1400 return 0; 1401 1402 bps_dft = U64_MAX; 1403 iops_dft = UINT_MAX; 1404 1405 if (tg->bps[READ] == bps_dft && 1406 tg->bps[WRITE] == bps_dft && 1407 tg->iops[READ] == iops_dft && 1408 tg->iops[WRITE] == iops_dft) 1409 return 0; 1410 1411 seq_printf(sf, "%s", dname); 1412 if (tg->bps[READ] == U64_MAX) 1413 seq_printf(sf, " rbps=max"); 1414 else 1415 seq_printf(sf, " rbps=%llu", tg->bps[READ]); 1416 1417 if (tg->bps[WRITE] == U64_MAX) 1418 seq_printf(sf, " wbps=max"); 1419 else 1420 seq_printf(sf, " wbps=%llu", tg->bps[WRITE]); 1421 1422 if (tg->iops[READ] == UINT_MAX) 1423 seq_printf(sf, " riops=max"); 1424 else 1425 seq_printf(sf, " riops=%u", tg->iops[READ]); 1426 1427 if (tg->iops[WRITE] == UINT_MAX) 1428 seq_printf(sf, " wiops=max"); 1429 else 1430 seq_printf(sf, " wiops=%u", tg->iops[WRITE]); 1431 1432 seq_printf(sf, "\n"); 1433 return 0; 1434 } 1435 1436 static int tg_print_limit(struct seq_file *sf, void *v) 1437 { 1438 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_limit, 1439 &blkcg_policy_throtl, seq_cft(sf)->private, false); 1440 return 0; 1441 } 1442 1443 static ssize_t tg_set_limit(struct kernfs_open_file *of, 1444 char *buf, size_t nbytes, loff_t off) 1445 { 1446 struct blkcg *blkcg = css_to_blkcg(of_css(of)); 1447 struct blkg_conf_ctx ctx; 1448 struct throtl_grp *tg; 1449 u64 v[4]; 1450 int ret; 1451 1452 blkg_conf_init(&ctx, buf); 1453 1454 ret = blkg_conf_open_bdev(&ctx); 1455 if (ret) 1456 goto out_finish; 1457 1458 if (!blk_throtl_activated(ctx.bdev->bd_queue)) { 1459 ret = blk_throtl_init(ctx.bdev->bd_disk); 1460 if (ret) 1461 goto out_finish; 1462 } 1463 1464 ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx); 1465 if (ret) 1466 goto out_finish; 1467 1468 tg = blkg_to_tg(ctx.blkg); 1469 tg_update_carryover(tg); 1470 1471 v[0] = tg->bps[READ]; 1472 v[1] = tg->bps[WRITE]; 1473 v[2] = tg->iops[READ]; 1474 v[3] = tg->iops[WRITE]; 1475 1476 while (true) { 1477 char tok[27]; /* wiops=18446744073709551616 */ 1478 char *p; 1479 u64 val = U64_MAX; 1480 int len; 1481 1482 if (sscanf(ctx.body, "%26s%n", tok, &len) != 1) 1483 break; 1484 if (tok[0] == '\0') 1485 break; 1486 ctx.body += len; 1487 1488 ret = -EINVAL; 1489 p = tok; 1490 strsep(&p, "="); 1491 if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max"))) 1492 goto out_finish; 1493 1494 ret = -ERANGE; 1495 if (!val) 1496 goto out_finish; 1497 1498 ret = -EINVAL; 1499 if (!strcmp(tok, "rbps") && val > 1) 1500 v[0] = val; 1501 else if (!strcmp(tok, "wbps") && val > 1) 1502 v[1] = val; 1503 else if (!strcmp(tok, "riops") && val > 1) 1504 v[2] = min_t(u64, val, UINT_MAX); 1505 else if (!strcmp(tok, "wiops") && val > 1) 1506 v[3] = min_t(u64, val, UINT_MAX); 1507 else 1508 goto out_finish; 1509 } 1510 1511 tg->bps[READ] = v[0]; 1512 tg->bps[WRITE] = v[1]; 1513 tg->iops[READ] = v[2]; 1514 tg->iops[WRITE] = v[3]; 1515 1516 tg_conf_updated(tg, false); 1517 ret = 0; 1518 out_finish: 1519 blkg_conf_exit(&ctx); 1520 return ret ?: nbytes; 1521 } 1522 1523 static struct cftype throtl_files[] = { 1524 { 1525 .name = "max", 1526 .flags = CFTYPE_NOT_ON_ROOT, 1527 .seq_show = tg_print_limit, 1528 .write = tg_set_limit, 1529 }, 1530 { } /* terminate */ 1531 }; 1532 1533 static void throtl_shutdown_wq(struct request_queue *q) 1534 { 1535 struct throtl_data *td = q->td; 1536 1537 cancel_work_sync(&td->dispatch_work); 1538 } 1539 1540 struct blkcg_policy blkcg_policy_throtl = { 1541 .dfl_cftypes = throtl_files, 1542 .legacy_cftypes = throtl_legacy_files, 1543 1544 .pd_alloc_fn = throtl_pd_alloc, 1545 .pd_init_fn = throtl_pd_init, 1546 .pd_online_fn = throtl_pd_online, 1547 .pd_free_fn = throtl_pd_free, 1548 }; 1549 1550 void blk_throtl_cancel_bios(struct gendisk *disk) 1551 { 1552 struct request_queue *q = disk->queue; 1553 struct cgroup_subsys_state *pos_css; 1554 struct blkcg_gq *blkg; 1555 1556 if (!blk_throtl_activated(q)) 1557 return; 1558 1559 spin_lock_irq(&q->queue_lock); 1560 /* 1561 * queue_lock is held, rcu lock is not needed here technically. 1562 * However, rcu lock is still held to emphasize that following 1563 * path need RCU protection and to prevent warning from lockdep. 1564 */ 1565 rcu_read_lock(); 1566 blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) { 1567 struct throtl_grp *tg = blkg_to_tg(blkg); 1568 struct throtl_service_queue *sq = &tg->service_queue; 1569 1570 /* 1571 * Set the flag to make sure throtl_pending_timer_fn() won't 1572 * stop until all throttled bios are dispatched. 1573 */ 1574 tg->flags |= THROTL_TG_CANCELING; 1575 1576 /* 1577 * Do not dispatch cgroup without THROTL_TG_PENDING or cgroup 1578 * will be inserted to service queue without THROTL_TG_PENDING 1579 * set in tg_update_disptime below. Then IO dispatched from 1580 * child in tg_dispatch_one_bio will trigger double insertion 1581 * and corrupt the tree. 1582 */ 1583 if (!(tg->flags & THROTL_TG_PENDING)) 1584 continue; 1585 1586 /* 1587 * Update disptime after setting the above flag to make sure 1588 * throtl_select_dispatch() won't exit without dispatching. 1589 */ 1590 tg_update_disptime(tg); 1591 1592 throtl_schedule_pending_timer(sq, jiffies + 1); 1593 } 1594 rcu_read_unlock(); 1595 spin_unlock_irq(&q->queue_lock); 1596 } 1597 1598 bool __blk_throtl_bio(struct bio *bio) 1599 { 1600 struct request_queue *q = bdev_get_queue(bio->bi_bdev); 1601 struct blkcg_gq *blkg = bio->bi_blkg; 1602 struct throtl_qnode *qn = NULL; 1603 struct throtl_grp *tg = blkg_to_tg(blkg); 1604 struct throtl_service_queue *sq; 1605 bool rw = bio_data_dir(bio); 1606 bool throttled = false; 1607 struct throtl_data *td = tg->td; 1608 1609 rcu_read_lock(); 1610 spin_lock_irq(&q->queue_lock); 1611 sq = &tg->service_queue; 1612 1613 while (true) { 1614 if (tg->last_low_overflow_time[rw] == 0) 1615 tg->last_low_overflow_time[rw] = jiffies; 1616 /* throtl is FIFO - if bios are already queued, should queue */ 1617 if (sq->nr_queued[rw]) 1618 break; 1619 1620 /* if above limits, break to queue */ 1621 if (!tg_may_dispatch(tg, bio, NULL)) { 1622 tg->last_low_overflow_time[rw] = jiffies; 1623 break; 1624 } 1625 1626 /* within limits, let's charge and dispatch directly */ 1627 throtl_charge_bio(tg, bio); 1628 1629 /* 1630 * We need to trim slice even when bios are not being queued 1631 * otherwise it might happen that a bio is not queued for 1632 * a long time and slice keeps on extending and trim is not 1633 * called for a long time. Now if limits are reduced suddenly 1634 * we take into account all the IO dispatched so far at new 1635 * low rate and * newly queued IO gets a really long dispatch 1636 * time. 1637 * 1638 * So keep on trimming slice even if bio is not queued. 1639 */ 1640 throtl_trim_slice(tg, rw); 1641 1642 /* 1643 * @bio passed through this layer without being throttled. 1644 * Climb up the ladder. If we're already at the top, it 1645 * can be executed directly. 1646 */ 1647 qn = &tg->qnode_on_parent[rw]; 1648 sq = sq->parent_sq; 1649 tg = sq_to_tg(sq); 1650 if (!tg) { 1651 bio_set_flag(bio, BIO_BPS_THROTTLED); 1652 goto out_unlock; 1653 } 1654 } 1655 1656 /* out-of-limit, queue to @tg */ 1657 throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d", 1658 rw == READ ? 'R' : 'W', 1659 tg->bytes_disp[rw], bio->bi_iter.bi_size, 1660 tg_bps_limit(tg, rw), 1661 tg->io_disp[rw], tg_iops_limit(tg, rw), 1662 sq->nr_queued[READ], sq->nr_queued[WRITE]); 1663 1664 tg->last_low_overflow_time[rw] = jiffies; 1665 1666 td->nr_queued[rw]++; 1667 throtl_add_bio_tg(bio, qn, tg); 1668 throttled = true; 1669 1670 /* 1671 * Update @tg's dispatch time and force schedule dispatch if @tg 1672 * was empty before @bio. The forced scheduling isn't likely to 1673 * cause undue delay as @bio is likely to be dispatched directly if 1674 * its @tg's disptime is not in the future. 1675 */ 1676 if (tg->flags & THROTL_TG_WAS_EMPTY) { 1677 tg_update_disptime(tg); 1678 throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true); 1679 } 1680 1681 out_unlock: 1682 spin_unlock_irq(&q->queue_lock); 1683 1684 rcu_read_unlock(); 1685 return throttled; 1686 } 1687 1688 void blk_throtl_exit(struct gendisk *disk) 1689 { 1690 struct request_queue *q = disk->queue; 1691 1692 if (!blk_throtl_activated(q)) 1693 return; 1694 1695 del_timer_sync(&q->td->service_queue.pending_timer); 1696 throtl_shutdown_wq(q); 1697 blkcg_deactivate_policy(disk, &blkcg_policy_throtl); 1698 kfree(q->td); 1699 } 1700 1701 static int __init throtl_init(void) 1702 { 1703 kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0); 1704 if (!kthrotld_workqueue) 1705 panic("Failed to create kthrotld\n"); 1706 1707 return blkcg_policy_register(&blkcg_policy_throtl); 1708 } 1709 1710 module_init(throtl_init); 1711