// SPDX-License-Identifier: GPL-2.0
/*
 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
 * policies)
 */

int sched_rr_timeslice = RR_TIMESLICE;
/* More than 4 hours if BW_SHIFT equals 20. */
static const u64 max_rt_runtime = MAX_BW;

/*
 * period over which we measure -rt task CPU usage in us.
 * default: 1s
 */
int sysctl_sched_rt_period = 1000000;

/*
 * part of the period that we allow rt tasks to run in us.
 * default: 0.95s
 */
int sysctl_sched_rt_runtime = 950000;

#ifdef CONFIG_SYSCTL
static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC * RR_TIMESLICE) / HZ;
static int sched_rt_handler(const struct ctl_table *table, int write, void *buffer,
		size_t *lenp, loff_t *ppos);
static int sched_rr_handler(const struct ctl_table *table, int write, void *buffer,
		size_t *lenp, loff_t *ppos);
static const struct ctl_table sched_rt_sysctls[] = {
	{
		.procname = "sched_rt_period_us",
		.data = &sysctl_sched_rt_period,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = sched_rt_handler,
		.extra1 = SYSCTL_ONE,
		.extra2 = SYSCTL_INT_MAX,
	},
	{
		.procname = "sched_rt_runtime_us",
		.data = &sysctl_sched_rt_runtime,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = sched_rt_handler,
		.extra1 = SYSCTL_NEG_ONE,
		.extra2 = (void *)&sysctl_sched_rt_period,
	},
	{
		.procname = "sched_rr_timeslice_ms",
		.data = &sysctl_sched_rr_timeslice,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = sched_rr_handler,
	},
};

static int __init sched_rt_sysctl_init(void)
{
	register_sysctl_init("kernel", sched_rt_sysctls);
	return 0;
}
late_initcall(sched_rt_sysctl_init);
#endif

void init_rt_rq(struct rt_rq *rt_rq)
{
	struct rt_prio_array *array;
	int i;

	array = &rt_rq->active;
	for (i = 0; i < MAX_RT_PRIO; i++) {
		INIT_LIST_HEAD(array->queue + i);
		__clear_bit(i, array->bitmap);
	}
	/* delimiter for bitsearch: */
	__set_bit(MAX_RT_PRIO, array->bitmap);

#if defined CONFIG_SMP
	rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
	rt_rq->highest_prio.next = MAX_RT_PRIO-1;
	rt_rq->overloaded = 0;
	plist_head_init(&rt_rq->pushable_tasks);
#endif /* CONFIG_SMP */
	/* We start in dequeued state, because no RT tasks are queued */
	rt_rq->rt_queued = 0;

#ifdef CONFIG_RT_GROUP_SCHED
	rt_rq->rt_time = 0;
	rt_rq->rt_throttled = 0;
	rt_rq->rt_runtime = 0;
	raw_spin_lock_init(&rt_rq->rt_runtime_lock);
#endif
}

#ifdef CONFIG_RT_GROUP_SCHED

static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);

static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
{
	struct rt_bandwidth *rt_b =
		container_of(timer, struct rt_bandwidth, rt_period_timer);
	int idle = 0;
	int overrun;

	raw_spin_lock(&rt_b->rt_runtime_lock);
	for (;;) {
		overrun = hrtimer_forward_now(timer, rt_b->rt_period);
		if (!overrun)
			break;

		raw_spin_unlock(&rt_b->rt_runtime_lock);
		idle = do_sched_rt_period_timer(rt_b, overrun);
		raw_spin_lock(&rt_b->rt_runtime_lock);
	}
	if (idle)
		rt_b->rt_period_active = 0;
	raw_spin_unlock(&rt_b->rt_runtime_lock);
	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}

void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
{
	rt_b->rt_period = ns_to_ktime(period);
	rt_b->rt_runtime = runtime;

	raw_spin_lock_init(&rt_b->rt_runtime_lock);

	hrtimer_setup(&rt_b->rt_period_timer, sched_rt_period_timer, CLOCK_MONOTONIC,
		      HRTIMER_MODE_REL_HARD);
}

static inline void do_start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
	raw_spin_lock(&rt_b->rt_runtime_lock);
	if (!rt_b->rt_period_active) {
		rt_b->rt_period_active = 1;
		/*
		 * SCHED_DEADLINE updates the bandwidth, as a run away
		 * RT task with a DL task could hog a CPU. But DL does
		 * not reset the period. If a deadline task was running
		 * without an RT task running, it can cause RT tasks to
		 * throttle when they start up. Kick the timer right away
		 * to update the period.
		 */
		hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
		hrtimer_start_expires(&rt_b->rt_period_timer,
				      HRTIMER_MODE_ABS_PINNED_HARD);
	}
	raw_spin_unlock(&rt_b->rt_runtime_lock);
}

static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
		return;

	do_start_rt_bandwidth(rt_b);
}

static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
{
	hrtimer_cancel(&rt_b->rt_period_timer);
}

#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)

static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
{
#ifdef CONFIG_SCHED_DEBUG
	WARN_ON_ONCE(!rt_entity_is_task(rt_se));
#endif
	return container_of(rt_se, struct task_struct, rt);
}

static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{
	return rt_rq->rq;
}

static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
{
	return rt_se->rt_rq;
}

static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
{
	struct rt_rq *rt_rq = rt_se->rt_rq;

	return rt_rq->rq;
}

void unregister_rt_sched_group(struct task_group *tg)
{
	if (tg->rt_se)
		destroy_rt_bandwidth(&tg->rt_bandwidth);
}

void free_rt_sched_group(struct task_group *tg)
{
	int i;

	for_each_possible_cpu(i) {
		if (tg->rt_rq)
			kfree(tg->rt_rq[i]);
		if (tg->rt_se)
			kfree(tg->rt_se[i]);
	}

	kfree(tg->rt_rq);
	kfree(tg->rt_se);
}

void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
		struct sched_rt_entity *rt_se, int cpu,
		struct sched_rt_entity *parent)
{
	struct rq *rq = cpu_rq(cpu);

	rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
	rt_rq->rt_nr_boosted = 0;
	rt_rq->rq = rq;
	rt_rq->tg = tg;

	tg->rt_rq[cpu] = rt_rq;
	tg->rt_se[cpu] = rt_se;

	if (!rt_se)
		return;

	if (!parent)
		rt_se->rt_rq = &rq->rt;
	else
		rt_se->rt_rq = parent->my_q;

	rt_se->my_q = rt_rq;
	rt_se->parent = parent;
	INIT_LIST_HEAD(&rt_se->run_list);
}

int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
{
	struct rt_rq *rt_rq;
	struct sched_rt_entity *rt_se;
	int i;

	tg->rt_rq = kcalloc(nr_cpu_ids, sizeof(rt_rq), GFP_KERNEL);
	if (!tg->rt_rq)
		goto err;
	tg->rt_se = kcalloc(nr_cpu_ids, sizeof(rt_se), GFP_KERNEL);
	if (!tg->rt_se)
		goto err;

	init_rt_bandwidth(&tg->rt_bandwidth, ktime_to_ns(global_rt_period()), 0);

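	/* Set up one rt_rq and one scheduling entity per possible CPU. */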
	for_each_possible_cpu(i) {
		rt_rq = kzalloc_node(sizeof(struct rt_rq),
				     GFP_KERNEL, cpu_to_node(i));
		if (!rt_rq)
			goto err;

		rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
				     GFP_KERNEL, cpu_to_node(i));
		if (!rt_se)
			goto err_free_rq;

		init_rt_rq(rt_rq);
		rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
		init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
	}

	return 1;

err_free_rq:
	kfree(rt_rq);
err:
	return 0;
}

#else /* CONFIG_RT_GROUP_SCHED */

#define rt_entity_is_task(rt_se) (1)

static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
{
	return container_of(rt_se, struct task_struct, rt);
}

static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{
	return container_of(rt_rq, struct rq, rt);
}

static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
{
	struct task_struct *p = rt_task_of(rt_se);

	return task_rq(p);
}

static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
{
	struct rq *rq = rq_of_rt_se(rt_se);

	return &rq->rt;
}

void unregister_rt_sched_group(struct task_group *tg) { }

void free_rt_sched_group(struct task_group *tg) { }

int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
{
	return 1;
}
#endif /* CONFIG_RT_GROUP_SCHED */

#ifdef CONFIG_SMP

static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
{
	/* Try to pull RT tasks here if we lower this rq's prio */
	return rq->online && rq->rt.highest_prio.curr > prev->prio;
}

static inline int rt_overloaded(struct rq *rq)
{
	return atomic_read(&rq->rd->rto_count);
}

static inline void rt_set_overload(struct rq *rq)
{
	if (!rq->online)
		return;

	cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
	/*
	 * Make sure the mask is visible before we set
	 * the overload count. That is checked to determine
	 * if we should look at the mask. It would be a shame
	 * if we looked at the mask, but the mask was not
	 * updated yet.
	 *
	 * Matched by the barrier in pull_rt_task().
	 */
	smp_wmb();
	atomic_inc(&rq->rd->rto_count);
}

static inline void rt_clear_overload(struct rq *rq)
{
	if (!rq->online)
		return;

	/* the order here really doesn't matter */
	atomic_dec(&rq->rd->rto_count);
	cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
}

static inline int has_pushable_tasks(struct rq *rq)
{
	return !plist_head_empty(&rq->rt.pushable_tasks);
}

static DEFINE_PER_CPU(struct balance_callback, rt_push_head);
static DEFINE_PER_CPU(struct balance_callback, rt_pull_head);

static void push_rt_tasks(struct rq *);
static void pull_rt_task(struct rq *);

static inline void rt_queue_push_tasks(struct rq *rq)
{
	if (!has_pushable_tasks(rq))
		return;

	queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
}

static inline void rt_queue_pull_task(struct rq *rq)
{
	queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
}

static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
{
	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
	plist_node_init(&p->pushable_tasks, p->prio);
	plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);

	/* Update the highest prio pushable task */
	if (p->prio < rq->rt.highest_prio.next)
		rq->rt.highest_prio.next = p->prio;

	if (!rq->rt.overloaded) {
		rt_set_overload(rq);
		rq->rt.overloaded = 1;
	}
}

static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
{
	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);

	/* Update the new highest prio pushable task */
	if (has_pushable_tasks(rq)) {
		p = plist_first_entry(&rq->rt.pushable_tasks,
				      struct task_struct, pushable_tasks);
		rq->rt.highest_prio.next = p->prio;
	} else {
		rq->rt.highest_prio.next = MAX_RT_PRIO-1;

		if (rq->rt.overloaded) {
			rt_clear_overload(rq);
			rq->rt.overloaded = 0;
		}
	}
}

#else

static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
{
}

static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
{
}

static inline void rt_queue_push_tasks(struct rq *rq)
{
}
#endif /* CONFIG_SMP */

static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
static void dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count);

static inline int on_rt_rq(struct sched_rt_entity *rt_se)
{
	return rt_se->on_rq;
}

#ifdef CONFIG_UCLAMP_TASK
/*
 * Verify the fitness of task @p to run on @cpu taking into account the uclamp
 * settings.
 *
 * This check is only important for heterogeneous systems where uclamp_min value
 * is higher than the capacity of a @cpu. For non-heterogeneous system this
 * function will always return true.
 *
 * The function will return true if the capacity of the @cpu is >= the
 * uclamp_min and false otherwise.
 *
 * Note that uclamp_min will be clamped to uclamp_max if uclamp_min
 * > uclamp_max.
 */
static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
{
	unsigned int min_cap;
	unsigned int max_cap;
	unsigned int cpu_cap;

	/* Only heterogeneous systems can benefit from this check */
	if (!sched_asym_cpucap_active())
		return true;

	min_cap = uclamp_eff_value(p, UCLAMP_MIN);
	max_cap = uclamp_eff_value(p, UCLAMP_MAX);

	cpu_cap = arch_scale_cpu_capacity(cpu);

	return cpu_cap >= min(min_cap, max_cap);
}
#else
static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
{
	return true;
}
#endif

#ifdef CONFIG_RT_GROUP_SCHED

static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
{
	if (!rt_rq->tg)
		return RUNTIME_INF;

	return rt_rq->rt_runtime;
}

static inline u64 sched_rt_period(struct rt_rq *rt_rq)
{
	return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
}

typedef struct task_group *rt_rq_iter_t;

static inline struct task_group *next_task_group(struct task_group *tg)
{
	do {
		tg = list_entry_rcu(tg->list.next,
			typeof(struct task_group), list);
	} while (&tg->list != &task_groups && task_group_is_autogroup(tg));

	if (&tg->list == &task_groups)
		tg = NULL;

	return tg;
}

#define for_each_rt_rq(rt_rq, iter, rq)					\
	for (iter = container_of(&task_groups, typeof(*iter), list);	\
		(iter = next_task_group(iter)) &&			\
		(rt_rq = iter->rt_rq[cpu_of(rq)]);)

#define for_each_sched_rt_entity(rt_se) \
	for (; rt_se; rt_se = rt_se->parent)

static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
{
	return rt_se->my_q;
}

static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);

static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
	struct task_struct *donor = rq_of_rt_rq(rt_rq)->donor;
	struct rq *rq = rq_of_rt_rq(rt_rq);
	struct sched_rt_entity *rt_se;

	int cpu = cpu_of(rq);

	rt_se = rt_rq->tg->rt_se[cpu];

	if (rt_rq->rt_nr_running) {
		if (!rt_se)
			enqueue_top_rt_rq(rt_rq);
		else if (!on_rt_rq(rt_se))
			enqueue_rt_entity(rt_se, 0);

		if (rt_rq->highest_prio.curr < donor->prio)
			resched_curr(rq);
	}
}

static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
	struct sched_rt_entity *rt_se;
	int cpu = cpu_of(rq_of_rt_rq(rt_rq));

	rt_se = rt_rq->tg->rt_se[cpu];

	if (!rt_se) {
		dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
		/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
		cpufreq_update_util(rq_of_rt_rq(rt_rq), 0);
	} else if (on_rt_rq(rt_se))
		dequeue_rt_entity(rt_se, 0);
}

static inline int rt_rq_throttled(struct rt_rq *rt_rq)
{
	return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
}

static int rt_se_boosted(struct sched_rt_entity *rt_se)
{
	struct rt_rq *rt_rq = group_rt_rq(rt_se);
	struct task_struct *p;

	if (rt_rq)
		return !!rt_rq->rt_nr_boosted;

	p = rt_task_of(rt_se);
	return p->prio != p->normal_prio;
}

#ifdef CONFIG_SMP
static inline const struct cpumask *sched_rt_period_mask(void)
{
	return this_rq()->rd->span;
}
#else
static inline const struct cpumask *sched_rt_period_mask(void)
{
	return cpu_online_mask;
}
#endif

static inline
struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
{
	return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
}

static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
{
	return &rt_rq->tg->rt_bandwidth;
}

bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
{
	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);

	return (hrtimer_active(&rt_b->rt_period_timer) ||
		rt_rq->rt_time < rt_b->rt_runtime);
}

#ifdef CONFIG_SMP
/*
 * We ran out of runtime, see if we can borrow some from our neighbours.
 */
static void do_balance_runtime(struct rt_rq *rt_rq)
{
	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
	struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
	int i, weight;
	u64 rt_period;

	weight = cpumask_weight(rd->span);

	raw_spin_lock(&rt_b->rt_runtime_lock);
	rt_period = ktime_to_ns(rt_b->rt_period);
	for_each_cpu(i, rd->span) {
		struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
		s64 diff;

		if (iter == rt_rq)
			continue;

		raw_spin_lock(&iter->rt_runtime_lock);
		/*
		 * Either all rqs have inf runtime and there's nothing to steal
		 * or __disable_runtime() below sets a specific rq to inf to
		 * indicate it's been disabled and disallow stealing.
		 */
		if (iter->rt_runtime == RUNTIME_INF)
			goto next;

		/*
		 * From runqueues with spare time, take 1/n part of their
		 * spare time, but no more than our period.
		 */
		diff = iter->rt_runtime - iter->rt_time;
		if (diff > 0) {
			diff = div_u64((u64)diff, weight);
			if (rt_rq->rt_runtime + diff > rt_period)
				diff = rt_period - rt_rq->rt_runtime;
			iter->rt_runtime -= diff;
			rt_rq->rt_runtime += diff;
			if (rt_rq->rt_runtime == rt_period) {
				raw_spin_unlock(&iter->rt_runtime_lock);
				break;
			}
		}
next:
		raw_spin_unlock(&iter->rt_runtime_lock);
	}
	raw_spin_unlock(&rt_b->rt_runtime_lock);
}

/*
 * Ensure this RQ takes back all the runtime it lent to its neighbours.
 */
static void __disable_runtime(struct rq *rq)
{
	struct root_domain *rd = rq->rd;
	rt_rq_iter_t iter;
	struct rt_rq *rt_rq;

	if (unlikely(!scheduler_running))
		return;

	for_each_rt_rq(rt_rq, iter, rq) {
		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
		s64 want;
		int i;

		raw_spin_lock(&rt_b->rt_runtime_lock);
		raw_spin_lock(&rt_rq->rt_runtime_lock);
		/*
		 * Either we're all inf and nobody needs to borrow, or we're
		 * already disabled and thus have nothing to do, or we have
		 * exactly the right amount of runtime to take out.
		 */
		if (rt_rq->rt_runtime == RUNTIME_INF ||
		    rt_rq->rt_runtime == rt_b->rt_runtime)
			goto balanced;
		raw_spin_unlock(&rt_rq->rt_runtime_lock);

		/*
		 * Calculate the difference between what we started out with
		 * and what we currently have, that's the amount of runtime
		 * we lent and now have to reclaim.
		 */
		want = rt_b->rt_runtime - rt_rq->rt_runtime;

		/*
		 * Greedy reclaim, take back as much as we can.
		 */
		for_each_cpu(i, rd->span) {
			struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
			s64 diff;

			/*
			 * Can't reclaim from ourselves or disabled runqueues.
			 */
			if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
				continue;

			raw_spin_lock(&iter->rt_runtime_lock);
			if (want > 0) {
				diff = min_t(s64, iter->rt_runtime, want);
				iter->rt_runtime -= diff;
				want -= diff;
			} else {
				iter->rt_runtime -= want;
				want -= want;
			}
			raw_spin_unlock(&iter->rt_runtime_lock);

			if (!want)
				break;
		}

		raw_spin_lock(&rt_rq->rt_runtime_lock);
		/*
		 * We cannot be left wanting - that would mean some runtime
		 * leaked out of the system.
		 */
		WARN_ON_ONCE(want);
balanced:
		/*
		 * Disable all the borrow logic by pretending we have inf
		 * runtime - in which case borrowing doesn't make sense.
		 */
		rt_rq->rt_runtime = RUNTIME_INF;
		rt_rq->rt_throttled = 0;
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
		raw_spin_unlock(&rt_b->rt_runtime_lock);

		/* Make rt_rq available for pick_next_task() */
		sched_rt_rq_enqueue(rt_rq);
	}
}

static void __enable_runtime(struct rq *rq)
{
	rt_rq_iter_t iter;
	struct rt_rq *rt_rq;

	if (unlikely(!scheduler_running))
		return;

	/*
	 * Reset each runqueue's bandwidth settings
	 */
	for_each_rt_rq(rt_rq, iter, rq) {
		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);

		raw_spin_lock(&rt_b->rt_runtime_lock);
		raw_spin_lock(&rt_rq->rt_runtime_lock);
		rt_rq->rt_runtime = rt_b->rt_runtime;
		rt_rq->rt_time = 0;
		rt_rq->rt_throttled = 0;
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
		raw_spin_unlock(&rt_b->rt_runtime_lock);
	}
}

static void balance_runtime(struct rt_rq *rt_rq)
{
	if (!sched_feat(RT_RUNTIME_SHARE))
		return;

	if (rt_rq->rt_time > rt_rq->rt_runtime) {
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
		do_balance_runtime(rt_rq);
		raw_spin_lock(&rt_rq->rt_runtime_lock);
	}
}
#else /* !CONFIG_SMP */
static inline void balance_runtime(struct rt_rq *rt_rq) {}
#endif /* CONFIG_SMP */

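/*
 * Periodic replenishment: for each rt_rq covered by @rt_b, charge off up to
 * @overrun periods worth of accrued rt_time and re-enqueue runqueues that
 * are no longer throttled. The return value tells the caller (the period
 * timer) whether it can go idle.
 */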
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
{
	int i, idle = 1, throttled = 0;
	const struct cpumask *span;

	span = sched_rt_period_mask();

	/*
	 * FIXME: isolated CPUs should really leave the root task group,
	 * whether they are isolcpus or were isolated via cpusets, lest
	 * the timer run on a CPU which does not service all runqueues,
	 * potentially leaving other CPUs indefinitely throttled. If
	 * isolation is really required, the user will turn the throttle
	 * off to kill the perturbations it causes anyway. Meanwhile,
	 * this maintains functionality for boot and/or troubleshooting.
	 */
	if (rt_b == &root_task_group.rt_bandwidth)
		span = cpu_online_mask;

	for_each_cpu(i, span) {
		int enqueue = 0;
		struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
		struct rq *rq = rq_of_rt_rq(rt_rq);
		struct rq_flags rf;
		int skip;

		/*
		 * When span == cpu_online_mask, taking each rq->lock
		 * can be time-consuming. Try to avoid it when possible.
		 */
		raw_spin_lock(&rt_rq->rt_runtime_lock);
		if (!sched_feat(RT_RUNTIME_SHARE) && rt_rq->rt_runtime != RUNTIME_INF)
			rt_rq->rt_runtime = rt_b->rt_runtime;
		skip = !rt_rq->rt_time && !rt_rq->rt_nr_running;
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
		if (skip)
			continue;

		rq_lock(rq, &rf);
		update_rq_clock(rq);

		if (rt_rq->rt_time) {
			u64 runtime;

			raw_spin_lock(&rt_rq->rt_runtime_lock);
			if (rt_rq->rt_throttled)
				balance_runtime(rt_rq);
			runtime = rt_rq->rt_runtime;
			rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
			if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
				rt_rq->rt_throttled = 0;
				enqueue = 1;

				/*
				 * When we're idle and a woken (rt) task is
				 * throttled wakeup_preempt() will set
				 * skip_update and the time between the wakeup
				 * and this unthrottle will get accounted as
				 * 'runtime'.
				 */
				if (rt_rq->rt_nr_running && rq->curr == rq->idle)
					rq_clock_cancel_skipupdate(rq);
			}
			if (rt_rq->rt_time || rt_rq->rt_nr_running)
				idle = 0;
			raw_spin_unlock(&rt_rq->rt_runtime_lock);
		} else if (rt_rq->rt_nr_running) {
			idle = 0;
			if (!rt_rq_throttled(rt_rq))
				enqueue = 1;
		}
		if (rt_rq->rt_throttled)
			throttled = 1;

		if (enqueue)
			sched_rt_rq_enqueue(rt_rq);
		rq_unlock(rq, &rf);
	}

	if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
		return 1;

	return idle;
}

static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
{
	u64 runtime = sched_rt_runtime(rt_rq);

	if (rt_rq->rt_throttled)
		return rt_rq_throttled(rt_rq);

	if (runtime >= sched_rt_period(rt_rq))
		return 0;

	balance_runtime(rt_rq);
	runtime = sched_rt_runtime(rt_rq);
	if (runtime == RUNTIME_INF)
		return 0;

	if (rt_rq->rt_time > runtime) {
		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);

		/*
		 * Don't actually throttle groups that have no runtime assigned
		 * but accrue some time due to boosting.
		 */
		if (likely(rt_b->rt_runtime)) {
			rt_rq->rt_throttled = 1;
			printk_deferred_once("sched: RT throttling activated\n");
		} else {
			/*
			 * In case we did anyway, make it go away,
			 * replenishment is a joke, since it will replenish us
			 * with exactly 0 ns.
			 */
			rt_rq->rt_time = 0;
		}

		if (rt_rq_throttled(rt_rq)) {
			sched_rt_rq_dequeue(rt_rq);
			return 1;
		}
	}

	return 0;
}

#else /* !CONFIG_RT_GROUP_SCHED */

typedef struct rt_rq *rt_rq_iter_t;

#define for_each_rt_rq(rt_rq, iter, rq) \
	for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)

#define for_each_sched_rt_entity(rt_se) \
	for (; rt_se; rt_se = NULL)

static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
{
	return NULL;
}

static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

	if (!rt_rq->rt_nr_running)
		return;

	enqueue_top_rt_rq(rt_rq);
	resched_curr(rq);
}

static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
	dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
}

static inline int rt_rq_throttled(struct rt_rq *rt_rq)
{
	return false;
}

static inline const struct cpumask *sched_rt_period_mask(void)
{
	return cpu_online_mask;
}

static inline
struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
{
	return &cpu_rq(cpu)->rt;
}

#ifdef CONFIG_SMP
static void __enable_runtime(struct rq *rq) { }
static void __disable_runtime(struct rq *rq) { }
#endif

#endif /* CONFIG_RT_GROUP_SCHED */

static inline int rt_se_prio(struct sched_rt_entity *rt_se)
{
#ifdef CONFIG_RT_GROUP_SCHED
	struct rt_rq *rt_rq = group_rt_rq(rt_se);

	if (rt_rq)
		return rt_rq->highest_prio.curr;
#endif

	return rt_task_of(rt_se)->prio;
}

/*
 * Update the current task's runtime statistics. Skip current tasks that
 * are not in our scheduling class.
 */
static void update_curr_rt(struct rq *rq)
{
	struct task_struct *donor = rq->donor;
	s64 delta_exec;

	if (donor->sched_class != &rt_sched_class)
		return;

	delta_exec = update_curr_common(rq);
	if (unlikely(delta_exec <= 0))
		return;

#ifdef CONFIG_RT_GROUP_SCHED
	struct sched_rt_entity *rt_se = &donor->rt;

	if (!rt_bandwidth_enabled())
		return;

	for_each_sched_rt_entity(rt_se) {
		struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
		int exceeded;

		if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
			raw_spin_lock(&rt_rq->rt_runtime_lock);
			rt_rq->rt_time += delta_exec;
			exceeded = sched_rt_runtime_exceeded(rt_rq);
			if (exceeded)
				resched_curr(rq);
			raw_spin_unlock(&rt_rq->rt_runtime_lock);
			if (exceeded)
				do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq));
		}
	}
#endif
}

static void
dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

	BUG_ON(&rq->rt != rt_rq);

	if (!rt_rq->rt_queued)
		return;

	BUG_ON(!rq->nr_running);

	sub_nr_running(rq, count);
	rt_rq->rt_queued = 0;
}

static void
enqueue_top_rt_rq(struct rt_rq *rt_rq)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

	BUG_ON(&rq->rt != rt_rq);

	if (rt_rq->rt_queued)
		return;

	if (rt_rq_throttled(rt_rq))
		return;

	if (rt_rq->rt_nr_running) {
		add_nr_running(rq, rt_rq->rt_nr_running);
		rt_rq->rt_queued = 1;
	}

	/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
	cpufreq_update_util(rq, 0);
}

#if defined CONFIG_SMP

static void
inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

#ifdef CONFIG_RT_GROUP_SCHED
	/*
	 * Change rq's cpupri only if rt_rq is the top queue.
	 */
	if (&rq->rt != rt_rq)
		return;
#endif
	if (rq->online && prio < prev_prio)
		cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
}

static void
dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

#ifdef CONFIG_RT_GROUP_SCHED
	/*
	 * Change rq's cpupri only if rt_rq is the top queue.
	 */
	if (&rq->rt != rt_rq)
		return;
#endif
	if (rq->online && rt_rq->highest_prio.curr != prev_prio)
		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
}

#else /* CONFIG_SMP */

static inline
void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
static inline
void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}

#endif /* CONFIG_SMP */

#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
static void
inc_rt_prio(struct rt_rq *rt_rq, int prio)
{
	int prev_prio = rt_rq->highest_prio.curr;

	if (prio < prev_prio)
		rt_rq->highest_prio.curr = prio;

	inc_rt_prio_smp(rt_rq, prio, prev_prio);
}

static void
dec_rt_prio(struct rt_rq *rt_rq, int prio)
{
	int prev_prio = rt_rq->highest_prio.curr;

	if (rt_rq->rt_nr_running) {

		WARN_ON(prio < prev_prio);

		/*
		 * This may have been our highest task, and therefore
		 * we may have some re-computation to do
		 */
		if (prio == prev_prio) {
			struct rt_prio_array *array = &rt_rq->active;

			rt_rq->highest_prio.curr =
				sched_find_first_bit(array->bitmap);
		}

	} else {
		rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
	}

	dec_rt_prio_smp(rt_rq, prio, prev_prio);
}

#else

static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}

#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */

#ifdef CONFIG_RT_GROUP_SCHED

static void
inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	if (rt_se_boosted(rt_se))
		rt_rq->rt_nr_boosted++;

	if (rt_rq->tg)
		start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
}

static void
dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	if (rt_se_boosted(rt_se))
		rt_rq->rt_nr_boosted--;

	WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
}

#else /* CONFIG_RT_GROUP_SCHED */

static void
inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
}

static inline
void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}

#endif /* CONFIG_RT_GROUP_SCHED */

static inline
unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
{
	struct rt_rq *group_rq = group_rt_rq(rt_se);

	if (group_rq)
		return group_rq->rt_nr_running;
	else
		return 1;
}

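/*
 * Number of SCHED_RR tasks this entity contributes to its rt_rq: the whole
 * group's rr_nr_running for a group entity, or 1/0 for a single task.
 */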
static inline
unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se)
{
	struct rt_rq *group_rq = group_rt_rq(rt_se);
	struct task_struct *tsk;

	if (group_rq)
		return group_rq->rr_nr_running;

	tsk = rt_task_of(rt_se);

	return (tsk->policy == SCHED_RR) ? 1 : 0;
}

static inline
void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	int prio = rt_se_prio(rt_se);

	WARN_ON(!rt_prio(prio));
	rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
	rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);

	inc_rt_prio(rt_rq, prio);
	inc_rt_group(rt_se, rt_rq);
}

static inline
void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
	WARN_ON(!rt_rq->rt_nr_running);
	rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
	rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);

	dec_rt_prio(rt_rq, rt_se_prio(rt_se));
	dec_rt_group(rt_se, rt_rq);
}

/*
 * Change rt_se->run_list location unless SAVE && !MOVE
 *
 * assumes ENQUEUE/DEQUEUE flags match
 */
static inline bool move_entity(unsigned int flags)
{
	if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
		return false;

	return true;
}

static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
{
	list_del_init(&rt_se->run_list);

	if (list_empty(array->queue + rt_se_prio(rt_se)))
		__clear_bit(rt_se_prio(rt_se), array->bitmap);

	rt_se->on_list = 0;
}

static inline struct sched_statistics *
__schedstats_from_rt_se(struct sched_rt_entity *rt_se)
{
#ifdef CONFIG_RT_GROUP_SCHED
	/* schedstats is not supported for rt group. */
	if (!rt_entity_is_task(rt_se))
		return NULL;
#endif

	return &rt_task_of(rt_se)->stats;
}

static inline void
update_stats_wait_start_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
{
	struct sched_statistics *stats;
	struct task_struct *p = NULL;

	if (!schedstat_enabled())
		return;

	if (rt_entity_is_task(rt_se))
		p = rt_task_of(rt_se);

	stats = __schedstats_from_rt_se(rt_se);
	if (!stats)
		return;

	__update_stats_wait_start(rq_of_rt_rq(rt_rq), p, stats);
}

static inline void
update_stats_enqueue_sleeper_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
{
	struct sched_statistics *stats;
	struct task_struct *p = NULL;

	if (!schedstat_enabled())
		return;

	if (rt_entity_is_task(rt_se))
		p = rt_task_of(rt_se);

	stats = __schedstats_from_rt_se(rt_se);
	if (!stats)
		return;

	__update_stats_enqueue_sleeper(rq_of_rt_rq(rt_rq), p, stats);
}

static inline void
update_stats_enqueue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
			int flags)
{
	if (!schedstat_enabled())
		return;

	if (flags & ENQUEUE_WAKEUP)
		update_stats_enqueue_sleeper_rt(rt_rq, rt_se);
}

static inline void
update_stats_wait_end_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
{
	struct sched_statistics *stats;
	struct task_struct *p = NULL;

	if (!schedstat_enabled())
		return;

	if (rt_entity_is_task(rt_se))
		p = rt_task_of(rt_se);

	stats = __schedstats_from_rt_se(rt_se);
	if (!stats)
		return;

	__update_stats_wait_end(rq_of_rt_rq(rt_rq), p, stats);
}

static inline void
update_stats_dequeue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
			int flags)
{
	struct task_struct *p = NULL;

	if (!schedstat_enabled())
		return;

	if (rt_entity_is_task(rt_se))
		p = rt_task_of(rt_se);

	if ((flags & DEQUEUE_SLEEP) && p) {
		unsigned int state;

		state = READ_ONCE(p->__state);
		if (state & TASK_INTERRUPTIBLE)
			__schedstat_set(p->stats.sleep_start,
					rq_clock(rq_of_rt_rq(rt_rq)));

		if (state & TASK_UNINTERRUPTIBLE)
			__schedstat_set(p->stats.block_start,
					rq_clock(rq_of_rt_rq(rt_rq)));
	}
}

static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
	struct rt_prio_array *array = &rt_rq->active;
	struct rt_rq *group_rq = group_rt_rq(rt_se);
	struct list_head *queue = array->queue + rt_se_prio(rt_se);

	/*
	 * Don't enqueue the group if it's throttled, or when empty.
	 * The latter is a consequence of the former when a child group
	 * gets throttled and the current group doesn't have any other
	 * active members.
	 */
	if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
		if (rt_se->on_list)
			__delist_rt_entity(rt_se, array);
		return;
	}

	if (move_entity(flags)) {
		WARN_ON_ONCE(rt_se->on_list);
		if (flags & ENQUEUE_HEAD)
			list_add(&rt_se->run_list, queue);
		else
			list_add_tail(&rt_se->run_list, queue);

		__set_bit(rt_se_prio(rt_se), array->bitmap);
		rt_se->on_list = 1;
	}
	rt_se->on_rq = 1;

	inc_rt_tasks(rt_se, rt_rq);
}

static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
	struct rt_prio_array *array = &rt_rq->active;

	if (move_entity(flags)) {
		WARN_ON_ONCE(!rt_se->on_list);
		__delist_rt_entity(rt_se, array);
	}
	rt_se->on_rq = 0;

	dec_rt_tasks(rt_se, rt_rq);
}

/*
 * Because the prio of an upper entry depends on the lower
 * entries, we must remove entries top - down.
 */
static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
{
	struct sched_rt_entity *back = NULL;
	unsigned int rt_nr_running;

	for_each_sched_rt_entity(rt_se) {
		rt_se->back = back;
		back = rt_se;
	}

	rt_nr_running = rt_rq_of_se(back)->rt_nr_running;

	for (rt_se = back; rt_se; rt_se = rt_se->back) {
		if (on_rt_rq(rt_se))
			__dequeue_rt_entity(rt_se, flags);
	}

	dequeue_top_rt_rq(rt_rq_of_se(back), rt_nr_running);
}

static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
	struct rq *rq = rq_of_rt_se(rt_se);

	update_stats_enqueue_rt(rt_rq_of_se(rt_se), rt_se, flags);

	dequeue_rt_stack(rt_se, flags);
	for_each_sched_rt_entity(rt_se)
		__enqueue_rt_entity(rt_se, flags);
	enqueue_top_rt_rq(&rq->rt);
}

static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
	struct rq *rq = rq_of_rt_se(rt_se);

	update_stats_dequeue_rt(rt_rq_of_se(rt_se), rt_se, flags);

	dequeue_rt_stack(rt_se, flags);

	for_each_sched_rt_entity(rt_se) {
		struct rt_rq *rt_rq = group_rt_rq(rt_se);

		if (rt_rq && rt_rq->rt_nr_running)
			__enqueue_rt_entity(rt_se, flags);
	}
	enqueue_top_rt_rq(&rq->rt);
}

/*
 * Adding/removing a task to/from a priority array:
 */
static void
enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
{
	struct sched_rt_entity *rt_se = &p->rt;

	if (flags & ENQUEUE_WAKEUP)
		rt_se->timeout = 0;

	check_schedstat_required();
	update_stats_wait_start_rt(rt_rq_of_se(rt_se), rt_se);

	enqueue_rt_entity(rt_se, flags);

	if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
		enqueue_pushable_task(rq, p);
}

static bool dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
{
	struct sched_rt_entity *rt_se = &p->rt;

	update_curr_rt(rq);
	dequeue_rt_entity(rt_se, flags);

	dequeue_pushable_task(rq, p);

	return true;
}

/*
 * Put task to the head or the end of the run list without the overhead of
 * dequeue followed by enqueue.
 */
static void
requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
{
	if (on_rt_rq(rt_se)) {
		struct rt_prio_array *array = &rt_rq->active;
		struct list_head *queue = array->queue + rt_se_prio(rt_se);

		if (head)
			list_move(&rt_se->run_list, queue);
		else
			list_move_tail(&rt_se->run_list, queue);
	}
}

static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
{
	struct sched_rt_entity *rt_se = &p->rt;
	struct rt_rq *rt_rq;

	for_each_sched_rt_entity(rt_se) {
		rt_rq = rt_rq_of_se(rt_se);
		requeue_rt_entity(rt_rq, rt_se, head);
	}
}

static void yield_task_rt(struct rq *rq)
{
	requeue_task_rt(rq, rq->curr, 0);
}

#ifdef CONFIG_SMP
static int find_lowest_rq(struct task_struct *task);

static int
select_task_rq_rt(struct task_struct *p, int cpu, int flags)
{
	struct task_struct *curr, *donor;
	struct rq *rq;
	bool test;

	/* For anything but wake ups, just return the task_cpu */
	if (!(flags & (WF_TTWU | WF_FORK)))
		goto out;

	rq = cpu_rq(cpu);

	rcu_read_lock();
	curr = READ_ONCE(rq->curr); /* unlocked access */
	donor = READ_ONCE(rq->donor);

	/*
	 * If the current task on @p's runqueue is an RT task, then
	 * try to see if we can wake this RT task up on another
	 * runqueue. Otherwise simply start this RT task
	 * on its current runqueue.
	 *
	 * We want to avoid overloading runqueues. If the woken
	 * task is a higher priority, then it will stay on this CPU
	 * and the lower prio task should be moved to another CPU.
	 * Even though this will probably make the lower prio task
	 * lose its cache, we do not want to bounce a higher task
	 * around just because it gave up its CPU, perhaps for a
	 * lock?
	 *
	 * For equal prio tasks, we just let the scheduler sort it out.
	 *
	 * Otherwise, just let it ride on the affine RQ and the
	 * post-schedule router will push the preempted task away
	 *
	 * This test is optimistic, if we get it wrong the load-balancer
	 * will have to sort it out.
	 *
	 * We take into account the capacity of the CPU to ensure it fits the
	 * requirement of the task - which is only important on heterogeneous
	 * systems like big.LITTLE.
	 */
	test = curr &&
	       unlikely(rt_task(donor)) &&
	       (curr->nr_cpus_allowed < 2 || donor->prio <= p->prio);

	if (test || !rt_task_fits_capacity(p, cpu)) {
		int target = find_lowest_rq(p);

		/*
		 * Bail out if we were forcing a migration to find a better
		 * fitting CPU but our search failed.
		 */
		if (!test && target != -1 && !rt_task_fits_capacity(p, target))
			goto out_unlock;

		/*
		 * Don't bother moving it if the destination CPU is
		 * not running a lower priority task.
		 */
		if (target != -1 &&
		    p->prio < cpu_rq(target)->rt.highest_prio.curr)
			cpu = target;
	}

out_unlock:
	rcu_read_unlock();

out:
	return cpu;
}

static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
{
	if (rq->curr->nr_cpus_allowed == 1 ||
	    !cpupri_find(&rq->rd->cpupri, rq->donor, NULL))
		return;

	/*
	 * p is migratable, so let's not schedule it and
	 * see if it is pushed or pulled somewhere else.
	 */
	if (p->nr_cpus_allowed != 1 &&
	    cpupri_find(&rq->rd->cpupri, p, NULL))
		return;

	/*
	 * There appear to be other CPUs that can accept
	 * the current task but none can run 'p', so let's reschedule
	 * to try and push the current task away:
	 */
	requeue_task_rt(rq, p, 1);
	resched_curr(rq);
}

static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
{
	if (!on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) {
		/*
		 * This is OK, because current is on_cpu, which avoids it being
		 * picked for load-balance and preemption/IRQs are still
		 * disabled avoiding further scheduler activity on it and we've
		 * not yet started the picking loop.
		 */
		rq_unpin_lock(rq, rf);
		pull_rt_task(rq);
		rq_repin_lock(rq, rf);
	}

	return sched_stop_runnable(rq) || sched_dl_runnable(rq) || sched_rt_runnable(rq);
}
#endif /* CONFIG_SMP */

/*
 * Preempt the current task with a newly woken task if needed:
 */
static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags)
{
	struct task_struct *donor = rq->donor;

	if (p->prio < donor->prio) {
		resched_curr(rq);
		return;
	}

#ifdef CONFIG_SMP
	/*
	 * If:
	 *
	 * - the newly woken task is of equal priority to the current task
	 * - the newly woken task is non-migratable while current is migratable
	 * - current will be preempted on the next reschedule
	 *
	 * we should check to see if current can readily move to a different
	 * cpu. If so, we will reschedule to allow the push logic to try
	 * to move current somewhere else, making room for our non-migratable
	 * task.
	 */
	if (p->prio == donor->prio && !test_tsk_need_resched(rq->curr))
		check_preempt_equal_prio(rq, p);
#endif
}

static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first)
{
	struct sched_rt_entity *rt_se = &p->rt;
	struct rt_rq *rt_rq = &rq->rt;

	p->se.exec_start = rq_clock_task(rq);
	if (on_rt_rq(&p->rt))
		update_stats_wait_end_rt(rt_rq, rt_se);

	/* The running task is never eligible for pushing */
	dequeue_pushable_task(rq, p);

	if (!first)
		return;

	/*
	 * If prev task was rt, put_prev_task() has already updated the
	 * utilization.
	 * We only care about the case where we start to schedule an
	 * rt task.
	 */
	if (rq->donor->sched_class != &rt_sched_class)
		update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);

	rt_queue_push_tasks(rq);
}

static struct sched_rt_entity *pick_next_rt_entity(struct rt_rq *rt_rq)
{
	struct rt_prio_array *array = &rt_rq->active;
	struct sched_rt_entity *next = NULL;
	struct list_head *queue;
	int idx;

	idx = sched_find_first_bit(array->bitmap);
	BUG_ON(idx >= MAX_RT_PRIO);

	queue = array->queue + idx;
	if (SCHED_WARN_ON(list_empty(queue)))
		return NULL;
	next = list_entry(queue->next, struct sched_rt_entity, run_list);

	return next;
}

static struct task_struct *_pick_next_task_rt(struct rq *rq)
{
	struct sched_rt_entity *rt_se;
	struct rt_rq *rt_rq = &rq->rt;

	do {
		rt_se = pick_next_rt_entity(rt_rq);
		if (unlikely(!rt_se))
			return NULL;
		rt_rq = group_rt_rq(rt_se);
	} while (rt_rq);

	return rt_task_of(rt_se);
}

static struct task_struct *pick_task_rt(struct rq *rq)
{
	struct task_struct *p;

	if (!sched_rt_runnable(rq))
		return NULL;

	p = _pick_next_task_rt(rq);

	return p;
}

static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct task_struct *next)
{
	struct sched_rt_entity *rt_se = &p->rt;
	struct rt_rq *rt_rq = &rq->rt;

	if (on_rt_rq(&p->rt))
		update_stats_wait_start_rt(rt_rq, rt_se);

	update_curr_rt(rq);

	update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);

	/*
	 * The previous task needs to be made eligible for pushing
	 * if it is still active
	 */
	if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
		enqueue_pushable_task(rq, p);
}

#ifdef CONFIG_SMP

/* Only try algorithms three times */
#define RT_MAX_TRIES 3

/*
 * Return the highest pushable rq's task, which is suitable to be executed
 * on the CPU, NULL otherwise
 */
static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
{
	struct plist_head *head = &rq->rt.pushable_tasks;
	struct task_struct *p;

	if (!has_pushable_tasks(rq))
		return NULL;

	plist_for_each_entry(p, head, pushable_tasks) {
		if (task_is_pushable(rq, p, cpu))
			return p;
	}

	return NULL;
}

static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);

static int find_lowest_rq(struct task_struct *task)
{
	struct sched_domain *sd;
	struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
	int this_cpu = smp_processor_id();
	int cpu = task_cpu(task);
	int ret;

	/* Make sure the mask is initialized first */
	if (unlikely(!lowest_mask))
		return -1;

	if (task->nr_cpus_allowed == 1)
		return -1; /* No other targets possible */

	/*
	 * If we're on an asym system ensure we consider the different capacities
	 * of the CPUs when searching for the lowest_mask.
	 */
	if (sched_asym_cpucap_active()) {

		ret = cpupri_find_fitness(&task_rq(task)->rd->cpupri,
					  task, lowest_mask,
					  rt_task_fits_capacity);
	} else {

		ret = cpupri_find(&task_rq(task)->rd->cpupri,
				  task, lowest_mask);
	}

	if (!ret)
		return -1; /* No targets found */

	/*
	 * At this point we have built a mask of CPUs representing the
	 * lowest priority tasks in the system.  Now we want to elect
	 * the best one based on our affinity and topology.
	 *
	 * We prioritize the last CPU that the task executed on since
	 * it is most likely cache-hot in that location.
	 */
	if (cpumask_test_cpu(cpu, lowest_mask))
		return cpu;

	/*
	 * Otherwise, we consult the sched_domains span maps to figure
	 * out which CPU is logically closest to our hot cache data.
	 */
	if (!cpumask_test_cpu(this_cpu, lowest_mask))
		this_cpu = -1; /* Skip this_cpu opt if not among lowest */

	rcu_read_lock();
	for_each_domain(cpu, sd) {
		if (sd->flags & SD_WAKE_AFFINE) {
			int best_cpu;

			/*
			 * "this_cpu" is cheaper to preempt than a
			 * remote processor.
			 */
			if (this_cpu != -1 &&
			    cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
				rcu_read_unlock();
				return this_cpu;
			}

			best_cpu = cpumask_any_and_distribute(lowest_mask,
							      sched_domain_span(sd));
			if (best_cpu < nr_cpu_ids) {
				rcu_read_unlock();
				return best_cpu;
			}
		}
	}
	rcu_read_unlock();

	/*
	 * And finally, if there were no matches within the domains
	 * just give the caller *something* to work with from the compatible
	 * locations.
	 */
	if (this_cpu != -1)
		return this_cpu;

	cpu = cpumask_any_distribute(lowest_mask);
	if (cpu < nr_cpu_ids)
		return cpu;

	return -1;
}

/* Will lock the rq it finds */
static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
{
	struct rq *lowest_rq = NULL;
	int tries;
	int cpu;

	for (tries = 0; tries < RT_MAX_TRIES; tries++) {
		cpu = find_lowest_rq(task);

		if ((cpu == -1) || (cpu == rq->cpu))
			break;

		lowest_rq = cpu_rq(cpu);

		if (lowest_rq->rt.highest_prio.curr <= task->prio) {
			/*
			 * Target rq has tasks of equal or higher priority,
			 * retrying does not release any lock and is unlikely
			 * to yield a different result.
			 */
			lowest_rq = NULL;
			break;
		}

		/* if the prio of this runqueue changed, try again */
		if (double_lock_balance(rq, lowest_rq)) {
			/*
			 * We had to unlock the run queue. In
			 * the meantime, task could have
			 * migrated already or had its affinity changed.
			 * Also make sure that it wasn't scheduled on its rq.
			 * It is possible the task was scheduled, set
			 * "migrate_disabled" and then got preempted, so we must
			 * check the task migration disable flag here too.
			 */
			if (unlikely(task_rq(task) != rq ||
				     !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) ||
				     task_on_cpu(rq, task) ||
				     !rt_task(task) ||
				     is_migration_disabled(task) ||
				     !task_on_rq_queued(task))) {

				double_unlock_balance(rq, lowest_rq);
				lowest_rq = NULL;
				break;
			}
		}

		/* If this rq is still suitable use it. */
		if (lowest_rq->rt.highest_prio.curr > task->prio)
			break;

		/* try again */
		double_unlock_balance(rq, lowest_rq);
		lowest_rq = NULL;
	}

	return lowest_rq;
}

static struct task_struct *pick_next_pushable_task(struct rq *rq)
{
	struct task_struct *p;

	if (!has_pushable_tasks(rq))
		return NULL;

	p = plist_first_entry(&rq->rt.pushable_tasks,
			      struct task_struct, pushable_tasks);

	BUG_ON(rq->cpu != task_cpu(p));
	BUG_ON(task_current(rq, p));
	BUG_ON(task_current_donor(rq, p));
	BUG_ON(p->nr_cpus_allowed <= 1);

	BUG_ON(!task_on_rq_queued(p));
	BUG_ON(!rt_task(p));

	return p;
}

/*
 * If the current CPU has more than one RT task, see if the non
 * running task can migrate over to a CPU that is running a task
 * of lesser priority.
 */
static int push_rt_task(struct rq *rq, bool pull)
{
	struct task_struct *next_task;
	struct rq *lowest_rq;
	int ret = 0;

	if (!rq->rt.overloaded)
		return 0;

	next_task = pick_next_pushable_task(rq);
	if (!next_task)
		return 0;

retry:
	/*
	 * It's possible that the next_task slipped in of
	 * higher priority than current. If that's the case
	 * just reschedule current.
	 */
	if (unlikely(next_task->prio < rq->donor->prio)) {
		resched_curr(rq);
		return 0;
	}

	if (is_migration_disabled(next_task)) {
		struct task_struct *push_task = NULL;
		int cpu;

		if (!pull || rq->push_busy)
			return 0;

		/*
		 * Invoking find_lowest_rq() on anything but an RT task doesn't
		 * make sense. Per the above priority check, curr has to
		 * be of higher priority than next_task, so no need to
		 * reschedule when bailing out.
		 *
		 * Note that the stoppers are masqueraded as SCHED_FIFO
		 * (cf. sched_set_stop_task()), so we can't rely on rt_task().
		 */
		if (rq->donor->sched_class != &rt_sched_class)
			return 0;

		cpu = find_lowest_rq(rq->curr);
		if (cpu == -1 || cpu == rq->cpu)
			return 0;

		/*
		 * Given we found a CPU with lower priority than @next_task,
		 * therefore it should be running. However we cannot migrate it
		 * to this other CPU, instead attempt to push the current
		 * running task on this CPU away.
		 */
		push_task = get_push_task(rq);
		if (push_task) {
			preempt_disable();
			raw_spin_rq_unlock(rq);
			stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
					    push_task, &rq->push_work);
			preempt_enable();
			raw_spin_rq_lock(rq);
		}

		return 0;
	}

	if (WARN_ON(next_task == rq->curr))
		return 0;

	/* We might release rq lock */
	get_task_struct(next_task);

	/* find_lock_lowest_rq locks the rq if found */
	lowest_rq = find_lock_lowest_rq(next_task, rq);
	if (!lowest_rq) {
		struct task_struct *task;
		/*
		 * find_lock_lowest_rq releases rq->lock
		 * so it is possible that next_task has migrated.
		 *
		 * We need to make sure that the task is still on the same
		 * run-queue and is also still the next task eligible for
		 * pushing.
		 */
		task = pick_next_pushable_task(rq);
		if (task == next_task) {
			/*
			 * The task hasn't migrated, and is still the next
			 * eligible task, but we failed to find a run-queue
			 * to push it to. Do not retry in this case, since
			 * other CPUs will pull from us when ready.
			 */
			goto out;
		}

		if (!task)
			/* No more tasks, just exit */
			goto out;

		/*
		 * Something has shifted, try again.
		 */
		put_task_struct(next_task);
		next_task = task;
		goto retry;
	}

	move_queued_task_locked(rq, lowest_rq, next_task);
	resched_curr(lowest_rq);
	ret = 1;

	double_unlock_balance(rq, lowest_rq);
out:
	put_task_struct(next_task);

	return ret;
}

static void push_rt_tasks(struct rq *rq)
{
	/* push_rt_task will return true if it moved an RT */
	while (push_rt_task(rq, false))
		;
}

#ifdef HAVE_RT_PUSH_IPI

/*
 * When a high priority task schedules out from a CPU and a lower priority
 * task is scheduled in, a check is made to see if there are any RT tasks
 * on other CPUs that are waiting to run because a higher priority RT task
 * is currently running on its CPU. In this case, the CPU with multiple RT
 * tasks queued on it (overloaded) needs to be notified that a CPU has opened
 * up that may be able to run one of its non-running queued RT tasks.
 *
 * All CPUs with overloaded RT tasks need to be notified, as there is currently
 * no way to know which of these CPUs have the highest priority task waiting
 * to run. Instead of trying to take a spinlock on each of these CPUs,
 * which has shown to cause large latency when done on machines with many
 * CPUs, we send an IPI to those CPUs to have them push off the overloaded
 * RT tasks waiting to run.
 *
 * Just sending an IPI to each of the CPUs is also an issue, as on large
 * count CPU machines, this can cause an IPI storm on a CPU, especially
 * if it's the only CPU with multiple RT tasks queued, and a large number
 * of CPUs scheduling a lower priority task at the same time.
 *
 * Each root domain has its own IRQ work function that can iterate over
 * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
 * tasks must be checked whether there's one or many CPUs that are lowering
 * their priority, there's a single IRQ work iterator that will try to
 * push off RT tasks that are waiting to run.
 *
 * When a CPU schedules a lower priority task, it will kick off the
 * IRQ work iterator that will jump to each CPU with overloaded RT tasks.
 * As it only takes the first CPU that schedules a lower priority task
 * to start the process, the rto_start variable is incremented and if
 * the atomic result is one, then that CPU will try to take the rto_lock.
 * This prevents high contention on the lock as the process handles all
 * CPUs scheduling lower priority tasks.
 *
 * All CPUs that are scheduling a lower priority task will increment the
 * rto_loop_next variable. This will make sure that the IRQ work iterator
 * checks all RT overloaded CPUs whenever a CPU schedules a new lower
 * priority task, even if the iterator is in the middle of a scan. Incrementing
 * the rto_loop_next will cause the iterator to perform another scan.
 */
static int rto_next_cpu(struct root_domain *rd)
{
	int next;
	int cpu;

	/*
	 * When starting the IPI RT pushing, the rto_cpu is set to -1,
	 * rto_next_cpu() will simply return the first CPU found in
	 * the rto_mask.
	 *
	 * If rto_next_cpu() is called while rto_cpu is a valid CPU, it
	 * will return the next CPU found in the rto_mask.
static int rto_next_cpu(struct root_domain *rd)
{
	int next;
	int cpu;

	/*
	 * When starting the IPI RT pushing, the rto_cpu is set to -1,
	 * rto_next_cpu() will simply return the first CPU found in
	 * the rto_mask.
	 *
	 * If rto_next_cpu() is called with rto_cpu set to a valid CPU, it
	 * will return the next CPU found in the rto_mask.
	 *
	 * If there are no more CPUs left in the rto_mask, then a check is made
	 * against rto_loop and rto_loop_next. rto_loop is only updated with
	 * the rto_lock held, but any CPU may increment the rto_loop_next
	 * without any locking.
	 */
	for (;;) {

		/* When rto_cpu is -1 this acts like cpumask_first() */
		cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);

		rd->rto_cpu = cpu;

		if (cpu < nr_cpu_ids)
			return cpu;

		rd->rto_cpu = -1;

		/*
		 * ACQUIRE ensures we see the @rto_mask changes
		 * made prior to the @next value observed.
		 *
		 * Matches WMB in rt_set_overload().
		 */
		next = atomic_read_acquire(&rd->rto_loop_next);

		if (rd->rto_loop == next)
			break;

		rd->rto_loop = next;
	}

	return -1;
}

static inline bool rto_start_trylock(atomic_t *v)
{
	return !atomic_cmpxchg_acquire(v, 0, 1);
}

static inline void rto_start_unlock(atomic_t *v)
{
	atomic_set_release(v, 0);
}

static void tell_cpu_to_push(struct rq *rq)
{
	int cpu = -1;

	/* Keep the loop going if the IPI is currently active */
	atomic_inc(&rq->rd->rto_loop_next);

	/* Only one CPU can initiate a loop at a time */
	if (!rto_start_trylock(&rq->rd->rto_loop_start))
		return;

	raw_spin_lock(&rq->rd->rto_lock);

	/*
	 * The rto_cpu is updated under the lock; if it has a valid CPU
	 * then the IPI is still running and will continue due to the
	 * update to loop_next, and nothing needs to be done here.
	 * Otherwise it is finishing up and an IPI needs to be sent.
	 */
	if (rq->rd->rto_cpu < 0)
		cpu = rto_next_cpu(rq->rd);

	raw_spin_unlock(&rq->rd->rto_lock);

	rto_start_unlock(&rq->rd->rto_loop_start);

	if (cpu >= 0) {
		/* Make sure the rd does not get freed while pushing */
		sched_get_rd(rq->rd);
		irq_work_queue_on(&rq->rd->rto_push_work, cpu);
	}
}

/* Called from hardirq context */
void rto_push_irq_work_func(struct irq_work *work)
{
	struct root_domain *rd =
		container_of(work, struct root_domain, rto_push_work);
	struct rq *rq;
	int cpu;

	rq = this_rq();

	/*
	 * We do not need to grab the lock to check for has_pushable_tasks.
	 * When it gets updated, a check is made if a push is possible.
	 */
	if (has_pushable_tasks(rq)) {
		raw_spin_rq_lock(rq);
		while (push_rt_task(rq, true))
			;
		raw_spin_rq_unlock(rq);
	}

	raw_spin_lock(&rd->rto_lock);

	/* Pass the IPI to the next rt overloaded queue */
	cpu = rto_next_cpu(rd);

	raw_spin_unlock(&rd->rto_lock);

	if (cpu < 0) {
		sched_put_rd(rd);
		return;
	}

	/* Try the next RT overloaded CPU */
	irq_work_queue_on(&rd->rto_push_work, cpu);
}
#endif /* HAVE_RT_PUSH_IPI */

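/*
 * Try to pull RT tasks from other runqueues onto this one. This is called
 * when this CPU's highest RT priority may have just dropped (see
 * switched_from_rt() and prio_changed_rt() below), so a queued-but-not-running
 * RT task elsewhere might now be able to run here. With RT_PUSH_IPI the work
 * is delegated to the overloaded CPUs themselves via tell_cpu_to_push().
 */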
static void pull_rt_task(struct rq *this_rq)
{
	int this_cpu = this_rq->cpu, cpu;
	bool resched = false;
	struct task_struct *p, *push_task;
	struct rq *src_rq;
	int rt_overload_count = rt_overloaded(this_rq);

	if (likely(!rt_overload_count))
		return;

	/*
	 * Match the barrier from rt_set_overload(); this guarantees that if we
	 * see overloaded we must also see the rto_mask bit.
	 */
	smp_rmb();

	/* If we are the only overloaded CPU, do nothing */
	if (rt_overload_count == 1 &&
	    cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask))
		return;

#ifdef HAVE_RT_PUSH_IPI
	if (sched_feat(RT_PUSH_IPI)) {
		tell_cpu_to_push(this_rq);
		return;
	}
#endif

	for_each_cpu(cpu, this_rq->rd->rto_mask) {
		if (this_cpu == cpu)
			continue;

		src_rq = cpu_rq(cpu);

		/*
		 * Don't bother taking the src_rq->lock if the next highest
		 * task is known to be lower-priority than our current task.
		 * This may look racy, but if this value is about to go
		 * logically higher, the src_rq will push this task away.
		 * And if it's going logically lower, we do not care.
		 */
		if (src_rq->rt.highest_prio.next >=
		    this_rq->rt.highest_prio.curr)
			continue;

		/*
		 * We can potentially drop this_rq's lock in
		 * double_lock_balance, and another CPU could
		 * alter this_rq.
		 */
		push_task = NULL;
		double_lock_balance(this_rq, src_rq);

		/*
		 * We can only pull a task that is pushable on its rq,
		 * and no others.
		 */
		p = pick_highest_pushable_task(src_rq, this_cpu);

		/*
		 * Do we have an RT task that preempts
		 * the to-be-scheduled task?
		 */
		if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
			WARN_ON(p == src_rq->curr);
			WARN_ON(!task_on_rq_queued(p));

			/*
			 * There's a chance that p is higher in priority
			 * than what's currently running on its CPU.
			 * This can happen when p is just waking up and
			 * hasn't had a chance to schedule yet. We only
			 * pull p if it is lower in priority than the
			 * current task on the run queue.
			 */
			if (p->prio < src_rq->donor->prio)
				goto skip;

			if (is_migration_disabled(p)) {
				push_task = get_push_task(src_rq);
			} else {
				move_queued_task_locked(src_rq, this_rq, p);
				resched = true;
			}
			/*
			 * We continue with the search, just in
			 * case there's an even higher prio task
			 * in another runqueue. (low likelihood
			 * but possible)
			 */
		}
skip:
		double_unlock_balance(this_rq, src_rq);

		if (push_task) {
			preempt_disable();
			raw_spin_rq_unlock(this_rq);
			stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
					    push_task, &src_rq->push_work);
			preempt_enable();
			raw_spin_rq_lock(this_rq);
		}
	}

	if (resched)
		resched_curr(this_rq);
}

/*
 * If we are not running and we are not going to reschedule soon, we should
 * try to push tasks away now.
 */
static void task_woken_rt(struct rq *rq, struct task_struct *p)
{
	bool need_to_push = !task_on_cpu(rq, p) &&
			    !test_tsk_need_resched(rq->curr) &&
			    p->nr_cpus_allowed > 1 &&
			    (dl_task(rq->donor) || rt_task(rq->donor)) &&
			    (rq->curr->nr_cpus_allowed < 2 ||
			     rq->donor->prio <= p->prio);

	if (need_to_push)
		push_rt_tasks(rq);
}

/* Assumes rq->lock is held */
static void rq_online_rt(struct rq *rq)
{
	if (rq->rt.overloaded)
		rt_set_overload(rq);

	__enable_runtime(rq);

	cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
}

/* Assumes rq->lock is held */
static void rq_offline_rt(struct rq *rq)
{
	if (rq->rt.overloaded)
		rt_clear_overload(rq);

	__disable_runtime(rq);

	cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
}

/*
 * When switching from the RT queue, we bring ourselves to a position
 * where we might want to pull RT tasks from other runqueues.
 */
static void switched_from_rt(struct rq *rq, struct task_struct *p)
{
	/*
	 * If there are other RT tasks then we will reschedule
	 * and the scheduling of the other RT tasks will handle
	 * the balancing. But if we are the last RT task
	 * we may need to handle the pulling of RT tasks
	 * now.
	 */
	if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
		return;

	rt_queue_pull_task(rq);
}

void __init init_sched_rt_class(void)
{
	unsigned int i;

	for_each_possible_cpu(i) {
		zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
					GFP_KERNEL, cpu_to_node(i));
	}
}
#endif /* CONFIG_SMP */

/*
 * When switching a task to RT, we may overload the runqueue
 * with RT tasks. In this case we try to push them off to
 * other runqueues.
 */
static void switched_to_rt(struct rq *rq, struct task_struct *p)
{
	/*
	 * If we are running, update the avg_rt tracking, as the running time
	 * will from now on be accounted to it.
	 */
	if (task_current(rq, p)) {
		update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
		return;
	}

	/*
	 * If we are not running, we may need to preempt the currently
	 * running task. If that task is also an RT task, then see if we
	 * can push to another run queue.
	 */
	if (task_on_rq_queued(p)) {
#ifdef CONFIG_SMP
		if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
			rt_queue_push_tasks(rq);
#endif /* CONFIG_SMP */
		if (p->prio < rq->donor->prio && cpu_online(cpu_of(rq)))
			resched_curr(rq);
	}
}

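/*
 * A reminder on the numbering used in the ->prio comparisons throughout
 * this file: lower values mean higher priority. RT tasks occupy
 * 0..MAX_RT_PRIO-1, so for example a SCHED_FIFO task created with
 * sched_priority 99 ends up with p->prio == 0 and preempts one created
 * with sched_priority 1 (p->prio == 98); "p->prio < rq->donor->prio"
 * therefore reads as "p has higher priority than the donor task".
 */
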
/*
 * Priority of the task has changed. This may cause
 * us to initiate a push or pull.
 */
static void
prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
{
	if (!task_on_rq_queued(p))
		return;

	if (task_current_donor(rq, p)) {
#ifdef CONFIG_SMP
		/*
		 * If our priority decreases while running, we
		 * may need to pull tasks to this runqueue.
		 */
		if (oldprio < p->prio)
			rt_queue_pull_task(rq);

		/*
		 * If there's a higher priority task waiting to run
		 * then reschedule.
		 */
		if (p->prio > rq->rt.highest_prio.curr)
			resched_curr(rq);
#else
		/* For UP simply resched on drop of prio */
		if (oldprio < p->prio)
			resched_curr(rq);
#endif /* CONFIG_SMP */
	} else {
		/*
		 * This task is not running, but if its priority is
		 * higher than that of the currently running task,
		 * then reschedule.
		 */
		if (p->prio < rq->donor->prio)
			resched_curr(rq);
	}
}

#ifdef CONFIG_POSIX_TIMERS
static void watchdog(struct rq *rq, struct task_struct *p)
{
	unsigned long soft, hard;

	/* The limits may change after they are read; this will be fixed next tick */
	soft = task_rlimit(p, RLIMIT_RTTIME);
	hard = task_rlimit_max(p, RLIMIT_RTTIME);

	if (soft != RLIM_INFINITY) {
		unsigned long next;

		if (p->rt.watchdog_stamp != jiffies) {
			p->rt.timeout++;
			p->rt.watchdog_stamp = jiffies;
		}

		next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
		if (p->rt.timeout > next) {
			posix_cputimers_rt_watchdog(&p->posix_cputimers,
						    p->se.sum_exec_runtime);
		}
	}
}
#else
static inline void watchdog(struct rq *rq, struct task_struct *p) { }
#endif

/*
 * scheduler tick hitting a task of our scheduling class.
 *
 * NOTE: This function can be called remotely by the tick offload that
 * goes along full dynticks. Therefore no local assumption can be made
 * and everything must be accessed through the @rq and @p passed in
 * parameters.
 */
static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
{
	struct sched_rt_entity *rt_se = &p->rt;

	update_curr_rt(rq);
	update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);

	watchdog(rq, p);

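	/*
	 * watchdog() above enforces RLIMIT_RTTIME: the per-task limit is
	 * given in microseconds, converted to ticks and compared against
	 * the accumulated tick count. A minimal userspace sketch of arming
	 * it (values are illustrative only, needs <sys/resource.h>):
	 *
	 *	struct rlimit rl = {
	 *		.rlim_cur = 500000,	// soft cap: 0.5s of RT CPU time
	 *		.rlim_max = 1000000,	// hard cap: 1s
	 *	};
	 *	setrlimit(RLIMIT_RTTIME, &rl);
	 *
	 * Once the limit is exceeded, posix_cputimers_rt_watchdog() hands the
	 * decision to the POSIX CPU timer code, which implements the usual
	 * RLIMIT_RTTIME signalling.
	 */
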
	/*
	 * RR tasks need a special form of time-slice management.
	 * FIFO tasks have no timeslices.
	 */
	if (p->policy != SCHED_RR)
		return;

	if (--p->rt.time_slice)
		return;

	p->rt.time_slice = sched_rr_timeslice;

	/*
	 * Requeue to the end of the queue if we (and all of our ancestors)
	 * are not the only element on the queue.
	 */
	for_each_sched_rt_entity(rt_se) {
		if (rt_se->run_list.prev != rt_se->run_list.next) {
			requeue_task_rt(rq, p, 0);
			resched_curr(rq);
			return;
		}
	}
}

static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
{
	/*
	 * Time slice is 0 for SCHED_FIFO tasks
	 */
	if (task->policy == SCHED_RR)
		return sched_rr_timeslice;
	else
		return 0;
}

#ifdef CONFIG_SCHED_CORE
static int task_is_throttled_rt(struct task_struct *p, int cpu)
{
	struct rt_rq *rt_rq;

#ifdef CONFIG_RT_GROUP_SCHED
	rt_rq = task_group(p)->rt_rq[cpu];
#else
	rt_rq = &cpu_rq(cpu)->rt;
#endif

	return rt_rq_throttled(rt_rq);
}
#endif

DEFINE_SCHED_CLASS(rt) = {

	.enqueue_task		= enqueue_task_rt,
	.dequeue_task		= dequeue_task_rt,
	.yield_task		= yield_task_rt,

	.wakeup_preempt		= wakeup_preempt_rt,

	.pick_task		= pick_task_rt,
	.put_prev_task		= put_prev_task_rt,
	.set_next_task		= set_next_task_rt,

#ifdef CONFIG_SMP
	.balance		= balance_rt,
	.select_task_rq		= select_task_rq_rt,
	.set_cpus_allowed	= set_cpus_allowed_common,
	.rq_online		= rq_online_rt,
	.rq_offline		= rq_offline_rt,
	.task_woken		= task_woken_rt,
	.switched_from		= switched_from_rt,
	.find_lock_rq		= find_lock_lowest_rq,
#endif

	.task_tick		= task_tick_rt,

	.get_rr_interval	= get_rr_interval_rt,

	.prio_changed		= prio_changed_rt,
	.switched_to		= switched_to_rt,

	.update_curr		= update_curr_rt,

#ifdef CONFIG_SCHED_CORE
	.task_is_throttled	= task_is_throttled_rt,
#endif

#ifdef CONFIG_UCLAMP_TASK
	.uclamp_enabled		= 1,
#endif
};

#ifdef CONFIG_RT_GROUP_SCHED
/*
 * Ensure that the real-time constraints are schedulable.
 */
static DEFINE_MUTEX(rt_constraints_mutex);

static inline int tg_has_rt_tasks(struct task_group *tg)
{
	struct task_struct *task;
	struct css_task_iter it;
	int ret = 0;

	/*
	 * Autogroups do not have RT tasks; see autogroup_create().
	 */
	if (task_group_is_autogroup(tg))
		return 0;

	css_task_iter_start(&tg->css, 0, &it);
	while (!ret && (task = css_task_iter_next(&it)))
		ret |= rt_task(task);
	css_task_iter_end(&it);

	return ret;
}

struct rt_schedulable_data {
	struct task_group *tg;
	u64 rt_period;
	u64 rt_runtime;
};

static int tg_rt_schedulable(struct task_group *tg, void *data)
{
	struct rt_schedulable_data *d = data;
	struct task_group *child;
	unsigned long total, sum = 0;
	u64 period, runtime;

	period = ktime_to_ns(tg->rt_bandwidth.rt_period);
	runtime = tg->rt_bandwidth.rt_runtime;

	if (tg == d->tg) {
		period = d->rt_period;
		runtime = d->rt_runtime;
	}

	/*
	 * Cannot have more runtime than the period.
	 */
	if (runtime > period && runtime != RUNTIME_INF)
		return -EINVAL;

	/*
	 * Ensure we don't starve existing RT tasks if runtime turns zero.
	 */
	if (rt_bandwidth_enabled() && !runtime &&
	    tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg))
		return -EBUSY;

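	/*
	 * to_ratio() expresses runtime/period as a fixed-point fraction scaled
	 * by 2^BW_SHIFT, i.e. (runtime << 20) / period, so the checks below
	 * compare like with like. With the default global limits of 0.95s
	 * runtime per 1s period this works out to roughly
	 * 0.95 * 2^20 = 996147: no single group may exceed that value, and
	 * the sum over a group's children must fit under the group's own.
	 */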
	total = to_ratio(period, runtime);

	/*
	 * Nobody can have more than the global setting allows.
	 */
	if (total > to_ratio(global_rt_period(), global_rt_runtime()))
		return -EINVAL;

	/*
	 * The sum of our children's runtime should not exceed our own.
	 */
	list_for_each_entry_rcu(child, &tg->children, siblings) {
		period = ktime_to_ns(child->rt_bandwidth.rt_period);
		runtime = child->rt_bandwidth.rt_runtime;

		if (child == d->tg) {
			period = d->rt_period;
			runtime = d->rt_runtime;
		}

		sum += to_ratio(period, runtime);
	}

	if (sum > total)
		return -EINVAL;

	return 0;
}

static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
{
	int ret;

	struct rt_schedulable_data data = {
		.tg = tg,
		.rt_period = period,
		.rt_runtime = runtime,
	};

	rcu_read_lock();
	ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
	rcu_read_unlock();

	return ret;
}

static int tg_set_rt_bandwidth(struct task_group *tg,
			       u64 rt_period, u64 rt_runtime)
{
	int i, err = 0;

	/*
	 * Disallowing the root group RT runtime is BAD; it would prevent the
	 * kernel from creating (and/or operating) RT threads.
	 */
	if (tg == &root_task_group && rt_runtime == 0)
		return -EINVAL;

	/* A zero period doesn't make any sense. */
	if (rt_period == 0)
		return -EINVAL;

	/*
	 * Bound the quota to defend against overflow during the bandwidth
	 * shift.
	 */
	if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime)
		return -EINVAL;

	mutex_lock(&rt_constraints_mutex);
	err = __rt_schedulable(tg, rt_period, rt_runtime);
	if (err)
		goto unlock;

	raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
	tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
	tg->rt_bandwidth.rt_runtime = rt_runtime;

	for_each_possible_cpu(i) {
		struct rt_rq *rt_rq = tg->rt_rq[i];

		raw_spin_lock(&rt_rq->rt_runtime_lock);
		rt_rq->rt_runtime = rt_runtime;
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
	}
	raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
unlock:
	mutex_unlock(&rt_constraints_mutex);

	return err;
}

int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
{
	u64 rt_runtime, rt_period;

	rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
	rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
	if (rt_runtime_us < 0)
		rt_runtime = RUNTIME_INF;
	else if ((u64)rt_runtime_us > U64_MAX / NSEC_PER_USEC)
		return -EINVAL;

	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
}

long sched_group_rt_runtime(struct task_group *tg)
{
	u64 rt_runtime_us;

	if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
		return -1;

	rt_runtime_us = tg->rt_bandwidth.rt_runtime;
	do_div(rt_runtime_us, NSEC_PER_USEC);
	return rt_runtime_us;
}

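/*
 * These setters and getters back the cgroup v1 cpu-controller files
 * cpu.rt_period_us and cpu.rt_runtime_us. A minimal userspace sketch of
 * granting a group 100ms of RT time per period (the "mygrp" path and mount
 * point are illustrative assumptions):
 *
 *	FILE *f = fopen("/sys/fs/cgroup/cpu/mygrp/cpu.rt_runtime_us", "w");
 *	if (f) {
 *		fprintf(f, "100000");
 *		fclose(f);
 *	}
 *
 * A negative runtime maps to RUNTIME_INF; the write fails with -EINVAL or
 * -EBUSY when __rt_schedulable() rejects the new configuration, and RT tasks
 * cannot be attached to a group whose runtime is zero (see
 * sched_rt_can_attach() below).
 */
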
int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
{
	u64 rt_runtime, rt_period;

	if (rt_period_us > U64_MAX / NSEC_PER_USEC)
		return -EINVAL;

	rt_period = rt_period_us * NSEC_PER_USEC;
	rt_runtime = tg->rt_bandwidth.rt_runtime;

	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
}

long sched_group_rt_period(struct task_group *tg)
{
	u64 rt_period_us;

	rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
	do_div(rt_period_us, NSEC_PER_USEC);
	return rt_period_us;
}

#ifdef CONFIG_SYSCTL
static int sched_rt_global_constraints(void)
{
	int ret = 0;

	mutex_lock(&rt_constraints_mutex);
	ret = __rt_schedulable(NULL, 0, 0);
	mutex_unlock(&rt_constraints_mutex);

	return ret;
}
#endif /* CONFIG_SYSCTL */

int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
{
	/* Don't accept real-time tasks when there is no way for them to run */
	if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
		return 0;

	return 1;
}

#else /* !CONFIG_RT_GROUP_SCHED */

#ifdef CONFIG_SYSCTL
static int sched_rt_global_constraints(void)
{
	return 0;
}
#endif /* CONFIG_SYSCTL */
#endif /* CONFIG_RT_GROUP_SCHED */

#ifdef CONFIG_SYSCTL
static int sched_rt_global_validate(void)
{
	if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
	    ((sysctl_sched_rt_runtime > sysctl_sched_rt_period) ||
	     ((u64)sysctl_sched_rt_runtime *
			NSEC_PER_USEC > max_rt_runtime)))
		return -EINVAL;

	return 0;
}

static void sched_rt_do_global(void)
{
}

static int sched_rt_handler(const struct ctl_table *table, int write, void *buffer,
		size_t *lenp, loff_t *ppos)
{
	int old_period, old_runtime;
	static DEFINE_MUTEX(mutex);
	int ret;

	mutex_lock(&mutex);
	old_period = sysctl_sched_rt_period;
	old_runtime = sysctl_sched_rt_runtime;

	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

	if (!ret && write) {
		ret = sched_rt_global_validate();
		if (ret)
			goto undo;

		ret = sched_dl_global_validate();
		if (ret)
			goto undo;

		ret = sched_rt_global_constraints();
		if (ret)
			goto undo;

		sched_rt_do_global();
		sched_dl_do_global();
	}
	if (0) {
undo:
		sysctl_sched_rt_period = old_period;
		sysctl_sched_rt_runtime = old_runtime;
	}
	mutex_unlock(&mutex);

	return ret;
}

static int sched_rr_handler(const struct ctl_table *table, int write, void *buffer,
		size_t *lenp, loff_t *ppos)
{
	int ret;
	static DEFINE_MUTEX(mutex);

	mutex_lock(&mutex);
	ret = proc_dointvec(table, write, buffer, lenp, ppos);
	/*
	 * Make sure that internally we keep jiffies.
	 * Also, writing zero resets the time-slice to the default:
	 */
	if (!ret && write) {
		sched_rr_timeslice =
			sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
			msecs_to_jiffies(sysctl_sched_rr_timeslice);

		if (sysctl_sched_rr_timeslice <= 0)
			sysctl_sched_rr_timeslice = jiffies_to_msecs(RR_TIMESLICE);
	}
	mutex_unlock(&mutex);

	return ret;
}
#endif /* CONFIG_SYSCTL */

#ifdef CONFIG_SCHED_DEBUG
void print_rt_stats(struct seq_file *m, int cpu)
{
	rt_rq_iter_t iter;
	struct rt_rq *rt_rq;

	rcu_read_lock();
	for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
		print_rt_rq(m, cpu, rt_rq);
	rcu_read_unlock();
}
#endif /* CONFIG_SCHED_DEBUG */
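
/*
 * The handlers above back the sched_rt_period_us, sched_rt_runtime_us and
 * sched_rr_timeslice_ms sysctls, normally exposed under /proc/sys/kernel/.
 * A minimal userspace sketch (the path is an assumption about where procfs
 * is mounted; error handling elided):
 *
 *	FILE *f = fopen("/proc/sys/kernel/sched_rr_timeslice_ms", "w");
 *	if (f) {
 *		fprintf(f, "25");	// new SCHED_RR quantum in milliseconds
 *		fclose(f);		// writing 0 restores the default
 *	}
 *
 * Internally the value is kept in jiffies (sched_rr_timeslice); the per-task
 * view of the same quantum is what get_rr_interval_rt() reports, e.g. via
 * the sched_rr_get_interval(2) system call.
 */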