// SPDX-License-Identifier: GPL-2.0
/*
 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
 * policies)
 */
#include "sched.h"

#include "pelt.h"

int sched_rr_timeslice = RR_TIMESLICE;
int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
/* More than 4 hours if BW_SHIFT equals 20. */
static const u64 max_rt_runtime = MAX_BW;

static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);

struct rt_bandwidth def_rt_bandwidth;
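/*
 * Example of the unit conversion above, assuming the common definition of
 * RR_TIMESLICE as roughly 100ms worth of jiffies: sched_rr_timeslice is kept
 * in jiffies while the sysctl value is exposed in milliseconds, so with
 * HZ=250 we get RR_TIMESLICE = 25 jiffies and sysctl_sched_rr_timeslice =
 * (1000 / 250) * 25 = 100 ms.
 */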
static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
{
	struct rt_bandwidth *rt_b =
		container_of(timer, struct rt_bandwidth, rt_period_timer);
	int idle = 0;
	int overrun;

	raw_spin_lock(&rt_b->rt_runtime_lock);
	for (;;) {
		overrun = hrtimer_forward_now(timer, rt_b->rt_period);
		if (!overrun)
			break;

		raw_spin_unlock(&rt_b->rt_runtime_lock);
		idle = do_sched_rt_period_timer(rt_b, overrun);
		raw_spin_lock(&rt_b->rt_runtime_lock);
	}
	if (idle)
		rt_b->rt_period_active = 0;
	raw_spin_unlock(&rt_b->rt_runtime_lock);

	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}

void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
{
	rt_b->rt_period = ns_to_ktime(period);
	rt_b->rt_runtime = runtime;

	raw_spin_lock_init(&rt_b->rt_runtime_lock);

	hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC,
		     HRTIMER_MODE_REL_HARD);
	rt_b->rt_period_timer.function = sched_rt_period_timer;
}

static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
		return;

	raw_spin_lock(&rt_b->rt_runtime_lock);
	if (!rt_b->rt_period_active) {
		rt_b->rt_period_active = 1;
		/*
		 * SCHED_DEADLINE updates the bandwidth, as a runaway
		 * RT task with a DL task could hog a CPU. But DL does
		 * not reset the period. If a deadline task was running
		 * without an RT task running, it can cause RT tasks to
		 * throttle when they start up. Kick the timer right away
		 * to update the period.
		 */
		hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
		hrtimer_start_expires(&rt_b->rt_period_timer,
				      HRTIMER_MODE_ABS_PINNED_HARD);
	}
	raw_spin_unlock(&rt_b->rt_runtime_lock);
}

void init_rt_rq(struct rt_rq *rt_rq)
{
	struct rt_prio_array *array;
	int i;

	array = &rt_rq->active;
	for (i = 0; i < MAX_RT_PRIO; i++) {
		INIT_LIST_HEAD(array->queue + i);
		__clear_bit(i, array->bitmap);
	}
	/* delimiter for bitsearch: */
	__set_bit(MAX_RT_PRIO, array->bitmap);

#if defined CONFIG_SMP
	rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
	rt_rq->highest_prio.next = MAX_RT_PRIO-1;
	rt_rq->rt_nr_migratory = 0;
	rt_rq->overloaded = 0;
	plist_head_init(&rt_rq->pushable_tasks);
#endif /* CONFIG_SMP */
	/* We start in dequeued state, because no RT tasks are queued */
	rt_rq->rt_queued = 0;

	rt_rq->rt_time = 0;
	rt_rq->rt_throttled = 0;
	rt_rq->rt_runtime = 0;
	raw_spin_lock_init(&rt_rq->rt_runtime_lock);
}

#ifdef CONFIG_RT_GROUP_SCHED
static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
{
	hrtimer_cancel(&rt_b->rt_period_timer);
}

#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)

static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
{
#ifdef CONFIG_SCHED_DEBUG
	WARN_ON_ONCE(!rt_entity_is_task(rt_se));
#endif
	return container_of(rt_se, struct task_struct, rt);
}

static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{
	return rt_rq->rq;
}

static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
{
	return rt_se->rt_rq;
}

static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
{
	struct rt_rq *rt_rq = rt_se->rt_rq;

	return rt_rq->rq;
}

void unregister_rt_sched_group(struct task_group *tg)
{
	if (tg->rt_se)
		destroy_rt_bandwidth(&tg->rt_bandwidth);
}

void free_rt_sched_group(struct task_group *tg)
{
	int i;

	for_each_possible_cpu(i) {
		if (tg->rt_rq)
			kfree(tg->rt_rq[i]);
		if (tg->rt_se)
			kfree(tg->rt_se[i]);
	}

	kfree(tg->rt_rq);
	kfree(tg->rt_se);
}

void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
		struct sched_rt_entity *rt_se, int cpu,
		struct sched_rt_entity *parent)
{
	struct rq *rq = cpu_rq(cpu);

	rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
	rt_rq->rt_nr_boosted = 0;
	rt_rq->rq = rq;
	rt_rq->tg = tg;

	tg->rt_rq[cpu] = rt_rq;
	tg->rt_se[cpu] = rt_se;

	if (!rt_se)
		return;

	if (!parent)
		rt_se->rt_rq = &rq->rt;
	else
		rt_se->rt_rq = parent->my_q;

	rt_se->my_q = rt_rq;
	rt_se->parent = parent;
	INIT_LIST_HEAD(&rt_se->run_list);
}

int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
{
	struct rt_rq *rt_rq;
	struct sched_rt_entity *rt_se;
	int i;

	tg->rt_rq = kcalloc(nr_cpu_ids, sizeof(rt_rq), GFP_KERNEL);
	if (!tg->rt_rq)
		goto err;
	tg->rt_se = kcalloc(nr_cpu_ids, sizeof(rt_se), GFP_KERNEL);
	if (!tg->rt_se)
		goto err;

	init_rt_bandwidth(&tg->rt_bandwidth,
			ktime_to_ns(def_rt_bandwidth.rt_period), 0);

	for_each_possible_cpu(i) {
		rt_rq = kzalloc_node(sizeof(struct rt_rq),
				     GFP_KERNEL, cpu_to_node(i));
		if (!rt_rq)
			goto err;

		rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
				     GFP_KERNEL, cpu_to_node(i));
		if (!rt_se)
			goto err_free_rq;

		init_rt_rq(rt_rq);
		rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
		init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
	}

	return 1;

err_free_rq:
	kfree(rt_rq);
err:
	return 0;
}

#else /* CONFIG_RT_GROUP_SCHED */

#define rt_entity_is_task(rt_se) (1)

static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
{
	return container_of(rt_se, struct task_struct, rt);
}

static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{
	return container_of(rt_rq, struct rq, rt);
}

static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
{
	struct task_struct *p = rt_task_of(rt_se);

	return task_rq(p);
}

static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
{
	struct rq *rq = rq_of_rt_se(rt_se);

	return &rq->rt;
}

void unregister_rt_sched_group(struct task_group *tg) { }

void free_rt_sched_group(struct task_group *tg) { }

int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
{
	return 1;
}
#endif /* CONFIG_RT_GROUP_SCHED */

#ifdef CONFIG_SMP

static void pull_rt_task(struct rq *this_rq);

static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
{
	/* Try to pull RT tasks here if we lower this rq's prio */
	return rq->online && rq->rt.highest_prio.curr > prev->prio;
}

static inline int rt_overloaded(struct rq *rq)
{
	return atomic_read(&rq->rd->rto_count);
}

static inline void rt_set_overload(struct rq *rq)
{
	if (!rq->online)
		return;

	cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
	/*
	 * Make sure the mask is visible before we set
	 * the overload count. That is checked to determine
	 * if we should look at the mask. It would be a shame
	 * if we looked at the mask, but the mask was not
	 * updated yet.
	 *
	 * Matched by the barrier in pull_rt_task().
	 */
	smp_wmb();
	atomic_inc(&rq->rd->rto_count);
}

static inline void rt_clear_overload(struct rq *rq)
{
	if (!rq->online)
		return;

	/* the order here really doesn't matter */
	atomic_dec(&rq->rd->rto_count);
	cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
}

static void update_rt_migration(struct rt_rq *rt_rq)
{
	if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
		if (!rt_rq->overloaded) {
			rt_set_overload(rq_of_rt_rq(rt_rq));
			rt_rq->overloaded = 1;
		}
	} else if (rt_rq->overloaded) {
		rt_clear_overload(rq_of_rt_rq(rt_rq));
		rt_rq->overloaded = 0;
	}
}

static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	struct task_struct *p;

	if (!rt_entity_is_task(rt_se))
		return;

	p = rt_task_of(rt_se);
	rt_rq = &rq_of_rt_rq(rt_rq)->rt;

	rt_rq->rt_nr_total++;
	if (p->nr_cpus_allowed > 1)
		rt_rq->rt_nr_migratory++;

	update_rt_migration(rt_rq);
}

static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	struct task_struct *p;

	if (!rt_entity_is_task(rt_se))
		return;

	p = rt_task_of(rt_se);
	rt_rq = &rq_of_rt_rq(rt_rq)->rt;

	rt_rq->rt_nr_total--;
	if (p->nr_cpus_allowed > 1)
		rt_rq->rt_nr_migratory--;

	update_rt_migration(rt_rq);
}

static inline int has_pushable_tasks(struct rq *rq)
{
	return !plist_head_empty(&rq->rt.pushable_tasks);
}

static DEFINE_PER_CPU(struct callback_head, rt_push_head);
static DEFINE_PER_CPU(struct callback_head, rt_pull_head);

static void push_rt_tasks(struct rq *);
static void pull_rt_task(struct rq *);

static inline void rt_queue_push_tasks(struct rq *rq)
{
	if (!has_pushable_tasks(rq))
		return;

	queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
}

static inline void rt_queue_pull_task(struct rq *rq)
{
	queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
}

static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
{
	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
	plist_node_init(&p->pushable_tasks, p->prio);
	plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);

	/* Update the highest prio pushable task */
	if (p->prio < rq->rt.highest_prio.next)
		rq->rt.highest_prio.next = p->prio;
}

static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
{
	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);

	/* Update the new highest prio pushable task */
	if (has_pushable_tasks(rq)) {
		p = plist_first_entry(&rq->rt.pushable_tasks,
				      struct task_struct, pushable_tasks);
		rq->rt.highest_prio.next = p->prio;
	} else {
		rq->rt.highest_prio.next = MAX_RT_PRIO-1;
	}
}

#else

static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
{
}

static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
{
}

static inline
void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
}

static inline
void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
}

static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
{
	return false;
}

static inline void pull_rt_task(struct rq *this_rq)
{
}

static inline void rt_queue_push_tasks(struct rq *rq)
{
}
#endif /* CONFIG_SMP */

static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
static void dequeue_top_rt_rq(struct rt_rq *rt_rq);

static inline int on_rt_rq(struct sched_rt_entity *rt_se)
{
	return rt_se->on_rq;
}

#ifdef CONFIG_UCLAMP_TASK
/*
 * Verify the fitness of task @p to run on @cpu taking into account the uclamp
 * settings.
 *
 * This check is only important for heterogeneous systems where uclamp_min value
 * is higher than the capacity of a @cpu. For non-heterogeneous systems this
 * function will always return true.
 *
 * The function will return true if the capacity of the @cpu is >= the
 * uclamp_min and false otherwise.
 *
 * Note that uclamp_min will be clamped to uclamp_max if uclamp_min
 * > uclamp_max.
 */
static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
{
	unsigned int min_cap;
	unsigned int max_cap;
	unsigned int cpu_cap;

	/* Only heterogeneous systems can benefit from this check */
	if (!static_branch_unlikely(&sched_asym_cpucapacity))
		return true;

	min_cap = uclamp_eff_value(p, UCLAMP_MIN);
	max_cap = uclamp_eff_value(p, UCLAMP_MAX);

	cpu_cap = capacity_orig_of(cpu);

	return cpu_cap >= min(min_cap, max_cap);
}
#else
static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
{
	return true;
}
#endif
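/*
 * Worked example of the fitness check above, assuming typical capacity
 * values on an asymmetric system: a little CPU with capacity_orig 512 and a
 * task with uclamp_min = 768, uclamp_max = 1024 gives min(768, 1024) = 768,
 * which exceeds 512, so the task does not fit there; a big CPU with
 * capacity_orig 1024 does fit.
 */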
#ifdef CONFIG_RT_GROUP_SCHED

static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
{
	if (!rt_rq->tg)
		return RUNTIME_INF;

	return rt_rq->rt_runtime;
}

static inline u64 sched_rt_period(struct rt_rq *rt_rq)
{
	return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
}

typedef struct task_group *rt_rq_iter_t;

static inline struct task_group *next_task_group(struct task_group *tg)
{
	do {
		tg = list_entry_rcu(tg->list.next,
			typeof(struct task_group), list);
	} while (&tg->list != &task_groups && task_group_is_autogroup(tg));

	if (&tg->list == &task_groups)
		tg = NULL;

	return tg;
}

#define for_each_rt_rq(rt_rq, iter, rq)					\
	for (iter = container_of(&task_groups, typeof(*iter), list);	\
		(iter = next_task_group(iter)) &&			\
		(rt_rq = iter->rt_rq[cpu_of(rq)]);)

#define for_each_sched_rt_entity(rt_se) \
	for (; rt_se; rt_se = rt_se->parent)

static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
{
	return rt_se->my_q;
}

static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);

static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
	struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
	struct rq *rq = rq_of_rt_rq(rt_rq);
	struct sched_rt_entity *rt_se;

	int cpu = cpu_of(rq);

	rt_se = rt_rq->tg->rt_se[cpu];

	if (rt_rq->rt_nr_running) {
		if (!rt_se)
			enqueue_top_rt_rq(rt_rq);
		else if (!on_rt_rq(rt_se))
			enqueue_rt_entity(rt_se, 0);

		if (rt_rq->highest_prio.curr < curr->prio)
			resched_curr(rq);
	}
}

static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
	struct sched_rt_entity *rt_se;
	int cpu = cpu_of(rq_of_rt_rq(rt_rq));

	rt_se = rt_rq->tg->rt_se[cpu];

	if (!rt_se) {
		dequeue_top_rt_rq(rt_rq);
		/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
		cpufreq_update_util(rq_of_rt_rq(rt_rq), 0);
	}
	else if (on_rt_rq(rt_se))
		dequeue_rt_entity(rt_se, 0);
}

static inline int rt_rq_throttled(struct rt_rq *rt_rq)
{
	return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
}

static int rt_se_boosted(struct sched_rt_entity *rt_se)
{
	struct rt_rq *rt_rq = group_rt_rq(rt_se);
	struct task_struct *p;

	if (rt_rq)
		return !!rt_rq->rt_nr_boosted;

	p = rt_task_of(rt_se);
	return p->prio != p->normal_prio;
}

#ifdef CONFIG_SMP
static inline const struct cpumask *sched_rt_period_mask(void)
{
	return this_rq()->rd->span;
}
#else
static inline const struct cpumask *sched_rt_period_mask(void)
{
	return cpu_online_mask;
}
#endif

static inline
struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
{
	return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
}

static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
{
	return &rt_rq->tg->rt_bandwidth;
}

#else /* !CONFIG_RT_GROUP_SCHED */

static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
{
	return rt_rq->rt_runtime;
}

static inline u64 sched_rt_period(struct rt_rq *rt_rq)
{
	return ktime_to_ns(def_rt_bandwidth.rt_period);
}

typedef struct rt_rq *rt_rq_iter_t;

#define for_each_rt_rq(rt_rq, iter, rq) \
	for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)

#define for_each_sched_rt_entity(rt_se) \
	for (; rt_se; rt_se = NULL)

static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
{
	return NULL;
}

static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

	if (!rt_rq->rt_nr_running)
		return;

	enqueue_top_rt_rq(rt_rq);
	resched_curr(rq);
}

static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
	dequeue_top_rt_rq(rt_rq);
}

static inline int rt_rq_throttled(struct rt_rq *rt_rq)
{
	return rt_rq->rt_throttled;
}

static inline const struct cpumask *sched_rt_period_mask(void)
{
	return cpu_online_mask;
}

static inline
struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
{
	return &cpu_rq(cpu)->rt;
}

static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
{
	return &def_rt_bandwidth;
}

#endif /* CONFIG_RT_GROUP_SCHED */

bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
{
	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);

	return (hrtimer_active(&rt_b->rt_period_timer) ||
		rt_rq->rt_time < rt_b->rt_runtime);
}
#ifdef CONFIG_SMP
/*
 * We ran out of runtime, see if we can borrow some from our neighbours.
 */
static void do_balance_runtime(struct rt_rq *rt_rq)
{
	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
	struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
	int i, weight;
	u64 rt_period;

	weight = cpumask_weight(rd->span);

	raw_spin_lock(&rt_b->rt_runtime_lock);
	rt_period = ktime_to_ns(rt_b->rt_period);
	for_each_cpu(i, rd->span) {
		struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
		s64 diff;

		if (iter == rt_rq)
			continue;

		raw_spin_lock(&iter->rt_runtime_lock);
		/*
		 * Either all rqs have inf runtime and there's nothing to steal
		 * or __disable_runtime() below sets a specific rq to inf to
		 * indicate it's been disabled and disallow stealing.
		 */
		if (iter->rt_runtime == RUNTIME_INF)
			goto next;

		/*
		 * From runqueues with spare time, take 1/n part of their
		 * spare time, but no more than our period.
		 */
		diff = iter->rt_runtime - iter->rt_time;
		if (diff > 0) {
			diff = div_u64((u64)diff, weight);
			if (rt_rq->rt_runtime + diff > rt_period)
				diff = rt_period - rt_rq->rt_runtime;
			iter->rt_runtime -= diff;
			rt_rq->rt_runtime += diff;
			if (rt_rq->rt_runtime == rt_period) {
				raw_spin_unlock(&iter->rt_runtime_lock);
				break;
			}
		}
next:
		raw_spin_unlock(&iter->rt_runtime_lock);
	}
	raw_spin_unlock(&rt_b->rt_runtime_lock);
}
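/*
 * Worked example of the 1/n borrowing above, assuming a root domain with
 * weight = 4 CPUs and a 1000ms period: if a neighbour holds 950ms of
 * rt_runtime but has consumed only 150ms of rt_time, its spare is 800ms and
 * we take 800 / 4 = 200ms from it, capped so that our own rt_runtime never
 * exceeds the period.
 */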
/*
 * Ensure this RQ takes back all the runtime it lent to its neighbours.
 */
static void __disable_runtime(struct rq *rq)
{
	struct root_domain *rd = rq->rd;
	rt_rq_iter_t iter;
	struct rt_rq *rt_rq;

	if (unlikely(!scheduler_running))
		return;

	for_each_rt_rq(rt_rq, iter, rq) {
		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
		s64 want;
		int i;

		raw_spin_lock(&rt_b->rt_runtime_lock);
		raw_spin_lock(&rt_rq->rt_runtime_lock);
		/*
		 * Either we're all inf and nobody needs to borrow, or we're
		 * already disabled and thus have nothing to do, or we have
		 * exactly the right amount of runtime to take out.
		 */
		if (rt_rq->rt_runtime == RUNTIME_INF ||
		    rt_rq->rt_runtime == rt_b->rt_runtime)
			goto balanced;
		raw_spin_unlock(&rt_rq->rt_runtime_lock);

		/*
		 * Calculate the difference between what we started out with
		 * and what we currently have, that's the amount of runtime
		 * we lent and now have to reclaim.
		 */
		want = rt_b->rt_runtime - rt_rq->rt_runtime;

		/*
		 * Greedy reclaim, take back as much as we can.
		 */
		for_each_cpu(i, rd->span) {
			struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
			s64 diff;

			/*
			 * Can't reclaim from ourselves or disabled runqueues.
			 */
			if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
				continue;

			raw_spin_lock(&iter->rt_runtime_lock);
			if (want > 0) {
				diff = min_t(s64, iter->rt_runtime, want);
				iter->rt_runtime -= diff;
				want -= diff;
			} else {
				iter->rt_runtime -= want;
				want -= want;
			}
			raw_spin_unlock(&iter->rt_runtime_lock);

			if (!want)
				break;
		}

		raw_spin_lock(&rt_rq->rt_runtime_lock);
		/*
		 * We cannot be left wanting - that would mean some runtime
		 * leaked out of the system.
		 */
		BUG_ON(want);
balanced:
		/*
		 * Disable all the borrow logic by pretending we have inf
		 * runtime - in which case borrowing doesn't make sense.
		 */
		rt_rq->rt_runtime = RUNTIME_INF;
		rt_rq->rt_throttled = 0;
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
		raw_spin_unlock(&rt_b->rt_runtime_lock);

		/* Make rt_rq available for pick_next_task() */
		sched_rt_rq_enqueue(rt_rq);
	}
}

static void __enable_runtime(struct rq *rq)
{
	rt_rq_iter_t iter;
	struct rt_rq *rt_rq;

	if (unlikely(!scheduler_running))
		return;

	/*
	 * Reset each runqueue's bandwidth settings
	 */
	for_each_rt_rq(rt_rq, iter, rq) {
		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);

		raw_spin_lock(&rt_b->rt_runtime_lock);
		raw_spin_lock(&rt_rq->rt_runtime_lock);
		rt_rq->rt_runtime = rt_b->rt_runtime;
		rt_rq->rt_time = 0;
		rt_rq->rt_throttled = 0;
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
		raw_spin_unlock(&rt_b->rt_runtime_lock);
	}
}

static void balance_runtime(struct rt_rq *rt_rq)
{
	if (!sched_feat(RT_RUNTIME_SHARE))
		return;

	if (rt_rq->rt_time > rt_rq->rt_runtime) {
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
		do_balance_runtime(rt_rq);
		raw_spin_lock(&rt_rq->rt_runtime_lock);
	}
}
#else /* !CONFIG_SMP */
static inline void balance_runtime(struct rt_rq *rt_rq) {}
#endif /* CONFIG_SMP */

static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
{
	int i, idle = 1, throttled = 0;
	const struct cpumask *span;

	span = sched_rt_period_mask();
#ifdef CONFIG_RT_GROUP_SCHED
	/*
	 * FIXME: isolated CPUs should really leave the root task group,
	 * whether they are isolcpus or were isolated via cpusets, lest
	 * the timer run on a CPU which does not service all runqueues,
	 * potentially leaving other CPUs indefinitely throttled. If
	 * isolation is really required, the user will turn the throttle
	 * off to kill the perturbations it causes anyway. Meanwhile,
	 * this maintains functionality for boot and/or troubleshooting.
	 */
	if (rt_b == &root_task_group.rt_bandwidth)
		span = cpu_online_mask;
#endif
	for_each_cpu(i, span) {
		int enqueue = 0;
		struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
		struct rq *rq = rq_of_rt_rq(rt_rq);
		int skip;

		/*
		 * When span == cpu_online_mask, taking each rq->lock
		 * can be time-consuming. Try to avoid it when possible.
		 */
		raw_spin_lock(&rt_rq->rt_runtime_lock);
		if (!sched_feat(RT_RUNTIME_SHARE) && rt_rq->rt_runtime != RUNTIME_INF)
			rt_rq->rt_runtime = rt_b->rt_runtime;
		skip = !rt_rq->rt_time && !rt_rq->rt_nr_running;
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
		if (skip)
			continue;

		raw_spin_rq_lock(rq);
		update_rq_clock(rq);

		if (rt_rq->rt_time) {
			u64 runtime;

			raw_spin_lock(&rt_rq->rt_runtime_lock);
			if (rt_rq->rt_throttled)
				balance_runtime(rt_rq);
			runtime = rt_rq->rt_runtime;
			rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
			if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
				rt_rq->rt_throttled = 0;
				enqueue = 1;

				/*
				 * When we're idle and a woken (rt) task is
				 * throttled check_preempt_curr() will set
				 * skip_update and the time between the wakeup
				 * and this unthrottle will get accounted as
				 * 'runtime'.
				 */
				if (rt_rq->rt_nr_running && rq->curr == rq->idle)
					rq_clock_cancel_skipupdate(rq);
			}
			if (rt_rq->rt_time || rt_rq->rt_nr_running)
				idle = 0;
			raw_spin_unlock(&rt_rq->rt_runtime_lock);
		} else if (rt_rq->rt_nr_running) {
			idle = 0;
			if (!rt_rq_throttled(rt_rq))
				enqueue = 1;
		}
		if (rt_rq->rt_throttled)
			throttled = 1;

		if (enqueue)
			sched_rt_rq_enqueue(rt_rq);
		raw_spin_rq_unlock(rq);
	}

	if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
		return 1;

	return idle;
}

static inline int rt_se_prio(struct sched_rt_entity *rt_se)
{
#ifdef CONFIG_RT_GROUP_SCHED
	struct rt_rq *rt_rq = group_rt_rq(rt_se);

	if (rt_rq)
		return rt_rq->highest_prio.curr;
#endif

	return rt_task_of(rt_se)->prio;
}

static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
{
	u64 runtime = sched_rt_runtime(rt_rq);

	if (rt_rq->rt_throttled)
		return rt_rq_throttled(rt_rq);

	if (runtime >= sched_rt_period(rt_rq))
		return 0;

	balance_runtime(rt_rq);
	runtime = sched_rt_runtime(rt_rq);
	if (runtime == RUNTIME_INF)
		return 0;

	if (rt_rq->rt_time > runtime) {
		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);

		/*
		 * Don't actually throttle groups that have no runtime assigned
		 * but accrue some time due to boosting.
		 */
		if (likely(rt_b->rt_runtime)) {
			rt_rq->rt_throttled = 1;
			printk_deferred_once("sched: RT throttling activated\n");
		} else {
			/*
			 * In case we did anyway, make it go away,
			 * replenishment is a joke, since it will replenish us
			 * with exactly 0 ns.
			 */
			rt_rq->rt_time = 0;
		}

		if (rt_rq_throttled(rt_rq)) {
			sched_rt_rq_dequeue(rt_rq);
			return 1;
		}
	}

	return 0;
}
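/*
 * For reference, with the usual defaults of sched_rt_period_us = 1000000 and
 * sched_rt_runtime_us = 950000 (tunable via /proc/sys/kernel/), the check
 * above throttles an rt_rq once it has accumulated more than 950ms of
 * rt_time within a 1s period, leaving roughly 5% of CPU time for the other
 * scheduling classes.
 */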
/*
 * Update the current task's runtime statistics. Skip current tasks that
 * are not in our scheduling class.
 */
static void update_curr_rt(struct rq *rq)
{
	struct task_struct *curr = rq->curr;
	struct sched_rt_entity *rt_se = &curr->rt;
	u64 delta_exec;
	u64 now;

	if (curr->sched_class != &rt_sched_class)
		return;

	now = rq_clock_task(rq);
	delta_exec = now - curr->se.exec_start;
	if (unlikely((s64)delta_exec <= 0))
		return;

	schedstat_set(curr->stats.exec_max,
		      max(curr->stats.exec_max, delta_exec));

	trace_sched_stat_runtime(curr, delta_exec, 0);

	curr->se.sum_exec_runtime += delta_exec;
	account_group_exec_runtime(curr, delta_exec);

	curr->se.exec_start = now;
	cgroup_account_cputime(curr, delta_exec);

	if (!rt_bandwidth_enabled())
		return;

	for_each_sched_rt_entity(rt_se) {
		struct rt_rq *rt_rq = rt_rq_of_se(rt_se);

		if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
			raw_spin_lock(&rt_rq->rt_runtime_lock);
			rt_rq->rt_time += delta_exec;
			if (sched_rt_runtime_exceeded(rt_rq))
				resched_curr(rq);
			raw_spin_unlock(&rt_rq->rt_runtime_lock);
		}
	}
}

static void
dequeue_top_rt_rq(struct rt_rq *rt_rq)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

	BUG_ON(&rq->rt != rt_rq);

	if (!rt_rq->rt_queued)
		return;

	BUG_ON(!rq->nr_running);

	sub_nr_running(rq, rt_rq->rt_nr_running);
	rt_rq->rt_queued = 0;
}

static void
enqueue_top_rt_rq(struct rt_rq *rt_rq)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

	BUG_ON(&rq->rt != rt_rq);

	if (rt_rq->rt_queued)
		return;

	if (rt_rq_throttled(rt_rq))
		return;

	if (rt_rq->rt_nr_running) {
		add_nr_running(rq, rt_rq->rt_nr_running);
		rt_rq->rt_queued = 1;
	}

	/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
	cpufreq_update_util(rq, 0);
}

#if defined CONFIG_SMP

static void
inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

#ifdef CONFIG_RT_GROUP_SCHED
	/*
	 * Change rq's cpupri only if rt_rq is the top queue.
	 */
	if (&rq->rt != rt_rq)
		return;
#endif
	if (rq->online && prio < prev_prio)
		cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
}

static void
dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

#ifdef CONFIG_RT_GROUP_SCHED
	/*
	 * Change rq's cpupri only if rt_rq is the top queue.
	 */
	if (&rq->rt != rt_rq)
		return;
#endif
	if (rq->online && rt_rq->highest_prio.curr != prev_prio)
		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
}

#else /* CONFIG_SMP */

static inline
void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
static inline
void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}

#endif /* CONFIG_SMP */

#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
static void
inc_rt_prio(struct rt_rq *rt_rq, int prio)
{
	int prev_prio = rt_rq->highest_prio.curr;

	if (prio < prev_prio)
		rt_rq->highest_prio.curr = prio;

	inc_rt_prio_smp(rt_rq, prio, prev_prio);
}

static void
dec_rt_prio(struct rt_rq *rt_rq, int prio)
{
	int prev_prio = rt_rq->highest_prio.curr;

	if (rt_rq->rt_nr_running) {

		WARN_ON(prio < prev_prio);

		/*
		 * This may have been our highest task, and therefore
		 * we may have some recomputation to do
		 */
		if (prio == prev_prio) {
			struct rt_prio_array *array = &rt_rq->active;

			rt_rq->highest_prio.curr =
				sched_find_first_bit(array->bitmap);
		}

	} else {
		rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
	}

	dec_rt_prio_smp(rt_rq, prio, prev_prio);
}

#else

static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}

#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */

#ifdef CONFIG_RT_GROUP_SCHED

static void
inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	if (rt_se_boosted(rt_se))
		rt_rq->rt_nr_boosted++;

	if (rt_rq->tg)
		start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
}

static void
dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	if (rt_se_boosted(rt_se))
		rt_rq->rt_nr_boosted--;

	WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
}

#else /* CONFIG_RT_GROUP_SCHED */

static void
inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	start_rt_bandwidth(&def_rt_bandwidth);
}

static inline
void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}

#endif /* CONFIG_RT_GROUP_SCHED */

static inline
unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
{
	struct rt_rq *group_rq = group_rt_rq(rt_se);

	if (group_rq)
		return group_rq->rt_nr_running;
	else
		return 1;
}

static inline
unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se)
{
	struct rt_rq *group_rq = group_rt_rq(rt_se);
	struct task_struct *tsk;

	if (group_rq)
		return group_rq->rr_nr_running;

	tsk = rt_task_of(rt_se);

	return (tsk->policy == SCHED_RR) ? 1 : 0;
}
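/*
 * Note on the accounting below: with CONFIG_RT_GROUP_SCHED an rt_se may
 * represent an entire child rt_rq, in which case rt_se_nr_running() and
 * rt_se_rr_nr_running() above contribute the child's aggregate counts rather
 * than 1, keeping rt_nr_running/rr_nr_running consistent up the hierarchy.
 */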
static inline
void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	int prio = rt_se_prio(rt_se);

	WARN_ON(!rt_prio(prio));
	rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
	rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);

	inc_rt_prio(rt_rq, prio);
	inc_rt_migration(rt_se, rt_rq);
	inc_rt_group(rt_se, rt_rq);
}

static inline
void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
	WARN_ON(!rt_rq->rt_nr_running);
	rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
	rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);

	dec_rt_prio(rt_rq, rt_se_prio(rt_se));
	dec_rt_migration(rt_se, rt_rq);
	dec_rt_group(rt_se, rt_rq);
}

/*
 * Change rt_se->run_list location unless SAVE && !MOVE
 *
 * assumes ENQUEUE/DEQUEUE flags match
 */
static inline bool move_entity(unsigned int flags)
{
	if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
		return false;

	return true;
}

static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
{
	list_del_init(&rt_se->run_list);

	if (list_empty(array->queue + rt_se_prio(rt_se)))
		__clear_bit(rt_se_prio(rt_se), array->bitmap);

	rt_se->on_list = 0;
}

static inline struct sched_statistics *
__schedstats_from_rt_se(struct sched_rt_entity *rt_se)
{
#ifdef CONFIG_RT_GROUP_SCHED
	/* schedstats is not supported for rt group. */
	if (!rt_entity_is_task(rt_se))
		return NULL;
#endif

	return &rt_task_of(rt_se)->stats;
}

static inline void
update_stats_wait_start_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
{
	struct sched_statistics *stats;
	struct task_struct *p = NULL;

	if (!schedstat_enabled())
		return;

	if (rt_entity_is_task(rt_se))
		p = rt_task_of(rt_se);

	stats = __schedstats_from_rt_se(rt_se);
	if (!stats)
		return;

	__update_stats_wait_start(rq_of_rt_rq(rt_rq), p, stats);
}

static inline void
update_stats_enqueue_sleeper_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
{
	struct sched_statistics *stats;
	struct task_struct *p = NULL;

	if (!schedstat_enabled())
		return;

	if (rt_entity_is_task(rt_se))
		p = rt_task_of(rt_se);

	stats = __schedstats_from_rt_se(rt_se);
	if (!stats)
		return;

	__update_stats_enqueue_sleeper(rq_of_rt_rq(rt_rq), p, stats);
}

static inline void
update_stats_enqueue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
			int flags)
{
	if (!schedstat_enabled())
		return;

	if (flags & ENQUEUE_WAKEUP)
		update_stats_enqueue_sleeper_rt(rt_rq, rt_se);
}

static inline void
update_stats_wait_end_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
{
	struct sched_statistics *stats;
	struct task_struct *p = NULL;

	if (!schedstat_enabled())
		return;

	if (rt_entity_is_task(rt_se))
		p = rt_task_of(rt_se);

	stats = __schedstats_from_rt_se(rt_se);
	if (!stats)
		return;

	__update_stats_wait_end(rq_of_rt_rq(rt_rq), p, stats);
}
static inline void
update_stats_dequeue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
			int flags)
{
	struct task_struct *p = NULL;

	if (!schedstat_enabled())
		return;

	if (rt_entity_is_task(rt_se))
		p = rt_task_of(rt_se);

	if ((flags & DEQUEUE_SLEEP) && p) {
		unsigned int state;

		state = READ_ONCE(p->__state);
		if (state & TASK_INTERRUPTIBLE)
			__schedstat_set(p->stats.sleep_start,
					rq_clock(rq_of_rt_rq(rt_rq)));

		if (state & TASK_UNINTERRUPTIBLE)
			__schedstat_set(p->stats.block_start,
					rq_clock(rq_of_rt_rq(rt_rq)));
	}
}

static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
	struct rt_prio_array *array = &rt_rq->active;
	struct rt_rq *group_rq = group_rt_rq(rt_se);
	struct list_head *queue = array->queue + rt_se_prio(rt_se);

	/*
	 * Don't enqueue the group if it's throttled, or when empty.
	 * The latter is a consequence of the former when a child group
	 * gets throttled and the current group doesn't have any other
	 * active members.
	 */
	if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
		if (rt_se->on_list)
			__delist_rt_entity(rt_se, array);
		return;
	}

	if (move_entity(flags)) {
		WARN_ON_ONCE(rt_se->on_list);
		if (flags & ENQUEUE_HEAD)
			list_add(&rt_se->run_list, queue);
		else
			list_add_tail(&rt_se->run_list, queue);

		__set_bit(rt_se_prio(rt_se), array->bitmap);
		rt_se->on_list = 1;
	}
	rt_se->on_rq = 1;

	inc_rt_tasks(rt_se, rt_rq);
}

static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
	struct rt_prio_array *array = &rt_rq->active;

	if (move_entity(flags)) {
		WARN_ON_ONCE(!rt_se->on_list);
		__delist_rt_entity(rt_se, array);
	}
	rt_se->on_rq = 0;

	dec_rt_tasks(rt_se, rt_rq);
}

/*
 * Because the prio of an upper entry depends on the lower
 * entries, we must remove entries top - down.
 */
static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
{
	struct sched_rt_entity *back = NULL;

	for_each_sched_rt_entity(rt_se) {
		rt_se->back = back;
		back = rt_se;
	}

	dequeue_top_rt_rq(rt_rq_of_se(back));

	for (rt_se = back; rt_se; rt_se = rt_se->back) {
		if (on_rt_rq(rt_se))
			__dequeue_rt_entity(rt_se, flags);
	}
}

static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
	struct rq *rq = rq_of_rt_se(rt_se);

	update_stats_enqueue_rt(rt_rq_of_se(rt_se), rt_se, flags);

	dequeue_rt_stack(rt_se, flags);
	for_each_sched_rt_entity(rt_se)
		__enqueue_rt_entity(rt_se, flags);
	enqueue_top_rt_rq(&rq->rt);
}

static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
	struct rq *rq = rq_of_rt_se(rt_se);

	update_stats_dequeue_rt(rt_rq_of_se(rt_se), rt_se, flags);

	dequeue_rt_stack(rt_se, flags);

	for_each_sched_rt_entity(rt_se) {
		struct rt_rq *rt_rq = group_rt_rq(rt_se);

		if (rt_rq && rt_rq->rt_nr_running)
			__enqueue_rt_entity(rt_se, flags);
	}
	enqueue_top_rt_rq(&rq->rt);
}

/*
 * Adding/removing a task to/from a priority array:
 */
static void
enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
{
	struct sched_rt_entity *rt_se = &p->rt;

	if (flags & ENQUEUE_WAKEUP)
		rt_se->timeout = 0;

	check_schedstat_required();
	update_stats_wait_start_rt(rt_rq_of_se(rt_se), rt_se);

	enqueue_rt_entity(rt_se, flags);

	if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
		enqueue_pushable_task(rq, p);
}

static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
{
	struct sched_rt_entity *rt_se = &p->rt;

	update_curr_rt(rq);
	dequeue_rt_entity(rt_se, flags);

	dequeue_pushable_task(rq, p);
}
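/*
 * The requeueing below is also what backs sched_yield() for SCHED_RR tasks:
 * yield_task_rt() further down simply calls requeue_task_rt(rq, rq->curr, 0)
 * to move current to the tail of its priority list, so equal-priority
 * round-robin peers get to run next.
 */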
/*
 * Put task to the head or the end of the run list without the overhead of
 * dequeue followed by enqueue.
 */
static void
requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
{
	if (on_rt_rq(rt_se)) {
		struct rt_prio_array *array = &rt_rq->active;
		struct list_head *queue = array->queue + rt_se_prio(rt_se);

		if (head)
			list_move(&rt_se->run_list, queue);
		else
			list_move_tail(&rt_se->run_list, queue);
	}
}

static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
{
	struct sched_rt_entity *rt_se = &p->rt;
	struct rt_rq *rt_rq;

	for_each_sched_rt_entity(rt_se) {
		rt_rq = rt_rq_of_se(rt_se);
		requeue_rt_entity(rt_rq, rt_se, head);
	}
}

static void yield_task_rt(struct rq *rq)
{
	requeue_task_rt(rq, rq->curr, 0);
}

#ifdef CONFIG_SMP
static int find_lowest_rq(struct task_struct *task);

static int
select_task_rq_rt(struct task_struct *p, int cpu, int flags)
{
	struct task_struct *curr;
	struct rq *rq;
	bool test;

	/* For anything but wake ups, just return the task_cpu */
	if (!(flags & (WF_TTWU | WF_FORK)))
		goto out;

	rq = cpu_rq(cpu);

	rcu_read_lock();
	curr = READ_ONCE(rq->curr); /* unlocked access */

	/*
	 * If the current task on @p's runqueue is an RT task, then
	 * try to see if we can wake this RT task up on another
	 * runqueue. Otherwise simply start this RT task
	 * on its current runqueue.
	 *
	 * We want to avoid overloading runqueues. If the woken
	 * task is a higher priority, then it will stay on this CPU
	 * and the lower prio task should be moved to another CPU.
	 * Even though this will probably make the lower prio task
	 * lose its cache, we do not want to bounce a higher task
	 * around just because it gave up its CPU, perhaps for a
	 * lock?
	 *
	 * For equal prio tasks, we just let the scheduler sort it out.
	 *
	 * Otherwise, just let it ride on the affined RQ and the
	 * post-schedule router will push the preempted task away.
	 *
	 * This test is optimistic, if we get it wrong the load-balancer
	 * will have to sort it out.
	 *
	 * We take into account the capacity of the CPU to ensure it fits the
	 * requirement of the task - which is only important on heterogeneous
	 * systems like big.LITTLE.
	 */
	test = curr &&
	       unlikely(rt_task(curr)) &&
	       (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio);

	if (test || !rt_task_fits_capacity(p, cpu)) {
		int target = find_lowest_rq(p);

		/*
		 * Bail out if we were forcing a migration to find a better
		 * fitting CPU but our search failed.
		 */
		if (!test && target != -1 && !rt_task_fits_capacity(p, target))
			goto out_unlock;

		/*
		 * Don't bother moving it if the destination CPU is
		 * not running a lower priority task.
		 */
		if (target != -1 &&
		    p->prio < cpu_rq(target)->rt.highest_prio.curr)
			cpu = target;
	}

out_unlock:
	rcu_read_unlock();

out:
	return cpu;
}

static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
{
	/*
	 * Current can't be migrated, useless to reschedule,
	 * let's hope p can move out.
	 */
	if (rq->curr->nr_cpus_allowed == 1 ||
	    !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
		return;

	/*
	 * p is migratable, so let's not schedule it and
	 * see if it is pushed or pulled somewhere else.
	 */
	if (p->nr_cpus_allowed != 1 &&
	    cpupri_find(&rq->rd->cpupri, p, NULL))
		return;

	/*
	 * There appear to be other CPUs that can accept
	 * the current task but none can run 'p', so let's reschedule
	 * to try and push the current task away:
	 */
	requeue_task_rt(rq, p, 1);
	resched_curr(rq);
}

static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
{
	if (!on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) {
		/*
		 * This is OK, because current is on_cpu, which avoids it being
		 * picked for load-balance and preemption/IRQs are still
		 * disabled avoiding further scheduler activity on it and we've
		 * not yet started the picking loop.
		 */
		rq_unpin_lock(rq, rf);
		pull_rt_task(rq);
		rq_repin_lock(rq, rf);
	}

	return sched_stop_runnable(rq) || sched_dl_runnable(rq) || sched_rt_runnable(rq);
}
#endif /* CONFIG_SMP */

/*
 * Preempt the current task with a newly woken task if needed:
 */
static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
{
	if (p->prio < rq->curr->prio) {
		resched_curr(rq);
		return;
	}

#ifdef CONFIG_SMP
	/*
	 * If:
	 *
	 * - the newly woken task is of equal priority to the current task
	 * - the newly woken task is non-migratable while current is migratable
	 * - current will be preempted on the next reschedule
	 *
	 * we should check to see if current can readily move to a different
	 * cpu. If so, we will reschedule to allow the push logic to try
	 * to move current somewhere else, making room for our non-migratable
	 * task.
	 */
	if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
		check_preempt_equal_prio(rq, p);
#endif
}
static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first)
{
	struct sched_rt_entity *rt_se = &p->rt;
	struct rt_rq *rt_rq = &rq->rt;

	p->se.exec_start = rq_clock_task(rq);
	if (on_rt_rq(&p->rt))
		update_stats_wait_end_rt(rt_rq, rt_se);

	/* The running task is never eligible for pushing */
	dequeue_pushable_task(rq, p);

	if (!first)
		return;

	/*
	 * If prev task was rt, put_prev_task() has already updated the
	 * utilization. We only care about the case where we start to
	 * schedule an RT task.
	 */
	if (rq->curr->sched_class != &rt_sched_class)
		update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);

	rt_queue_push_tasks(rq);
}

static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
						   struct rt_rq *rt_rq)
{
	struct rt_prio_array *array = &rt_rq->active;
	struct sched_rt_entity *next = NULL;
	struct list_head *queue;
	int idx;

	idx = sched_find_first_bit(array->bitmap);
	BUG_ON(idx >= MAX_RT_PRIO);

	queue = array->queue + idx;
	next = list_entry(queue->next, struct sched_rt_entity, run_list);

	return next;
}

static struct task_struct *_pick_next_task_rt(struct rq *rq)
{
	struct sched_rt_entity *rt_se;
	struct rt_rq *rt_rq = &rq->rt;

	do {
		rt_se = pick_next_rt_entity(rq, rt_rq);
		BUG_ON(!rt_se);
		rt_rq = group_rt_rq(rt_se);
	} while (rt_rq);

	return rt_task_of(rt_se);
}

static struct task_struct *pick_task_rt(struct rq *rq)
{
	struct task_struct *p;

	if (!sched_rt_runnable(rq))
		return NULL;

	p = _pick_next_task_rt(rq);

	return p;
}

static struct task_struct *pick_next_task_rt(struct rq *rq)
{
	struct task_struct *p = pick_task_rt(rq);

	if (p)
		set_next_task_rt(rq, p, true);

	return p;
}

static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
{
	struct sched_rt_entity *rt_se = &p->rt;
	struct rt_rq *rt_rq = &rq->rt;

	if (on_rt_rq(&p->rt))
		update_stats_wait_start_rt(rt_rq, rt_se);

	update_curr_rt(rq);

	update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);

	/*
	 * The previous task needs to be made eligible for pushing
	 * if it is still active
	 */
	if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
		enqueue_pushable_task(rq, p);
}

#ifdef CONFIG_SMP

/* Only try algorithms three times */
#define RT_MAX_TRIES 3

static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
{
	if (!task_running(rq, p) &&
	    cpumask_test_cpu(cpu, &p->cpus_mask))
		return 1;

	return 0;
}

/*
 * Return the highest pushable rq's task, which is suitable to be executed
 * on the CPU, NULL otherwise
 */
static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
{
	struct plist_head *head = &rq->rt.pushable_tasks;
	struct task_struct *p;

	if (!has_pushable_tasks(rq))
		return NULL;

	plist_for_each_entry(p, head, pushable_tasks) {
		if (pick_rt_task(rq, p, cpu))
			return p;
	}

	return NULL;
}

static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);

static int find_lowest_rq(struct task_struct *task)
{
	struct sched_domain *sd;
	struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
	int this_cpu = smp_processor_id();
	int cpu = task_cpu(task);
	int ret;

	/* Make sure the mask is initialized first */
	if (unlikely(!lowest_mask))
		return -1;

	if (task->nr_cpus_allowed == 1)
		return -1; /* No other targets possible */

	/*
	 * If we're on asym system ensure we consider the different capacities
	 * of the CPUs when searching for the lowest_mask.
	 */
	if (static_branch_unlikely(&sched_asym_cpucapacity)) {

		ret = cpupri_find_fitness(&task_rq(task)->rd->cpupri,
					  task, lowest_mask,
					  rt_task_fits_capacity);
	} else {

		ret = cpupri_find(&task_rq(task)->rd->cpupri,
				  task, lowest_mask);
	}

	if (!ret)
		return -1; /* No targets found */

	/*
	 * At this point we have built a mask of CPUs representing the
	 * lowest priority tasks in the system.  Now we want to elect
	 * the best one based on our affinity and topology.
	 *
	 * We prioritize the last CPU that the task executed on since
	 * it is most likely cache-hot in that location.
	 */
	if (cpumask_test_cpu(cpu, lowest_mask))
		return cpu;

	/*
	 * Otherwise, we consult the sched_domains span maps to figure
	 * out which CPU is logically closest to our hot cache data.
	 */
	if (!cpumask_test_cpu(this_cpu, lowest_mask))
		this_cpu = -1; /* Skip this_cpu opt if not among lowest */

	rcu_read_lock();
	for_each_domain(cpu, sd) {
		if (sd->flags & SD_WAKE_AFFINE) {
			int best_cpu;

			/*
			 * "this_cpu" is cheaper to preempt than a
			 * remote processor.
			 */
			if (this_cpu != -1 &&
			    cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
				rcu_read_unlock();
				return this_cpu;
			}

			best_cpu = cpumask_any_and_distribute(lowest_mask,
							      sched_domain_span(sd));
			if (best_cpu < nr_cpu_ids) {
				rcu_read_unlock();
				return best_cpu;
			}
		}
	}
	rcu_read_unlock();

	/*
	 * And finally, if there were no matches within the domains
	 * just give the caller *something* to work with from the compatible
	 * locations.
	 */
	if (this_cpu != -1)
		return this_cpu;

	cpu = cpumask_any_distribute(lowest_mask);
	if (cpu < nr_cpu_ids)
		return cpu;

	return -1;
}
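/*
 * To summarize the selection order in find_lowest_rq() above: (1) the task's
 * last CPU if it is in lowest_mask (likely cache-hot), (2) this_cpu when it
 * shares an SD_WAKE_AFFINE domain with the task's CPU, (3) any lowest_mask
 * CPU within such a domain, (4) this_cpu as a fallback, and finally (5) any
 * CPU in lowest_mask, or -1 if no suitable mask could be built.
 */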
/* Will lock the rq it finds */
static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
{
	struct rq *lowest_rq = NULL;
	int tries;
	int cpu;

	for (tries = 0; tries < RT_MAX_TRIES; tries++) {
		cpu = find_lowest_rq(task);

		if ((cpu == -1) || (cpu == rq->cpu))
			break;

		lowest_rq = cpu_rq(cpu);

		if (lowest_rq->rt.highest_prio.curr <= task->prio) {
			/*
			 * Target rq has tasks of equal or higher priority,
			 * retrying does not release any lock and is unlikely
			 * to yield a different result.
			 */
			lowest_rq = NULL;
			break;
		}

		/* if the prio of this runqueue changed, try again */
		if (double_lock_balance(rq, lowest_rq)) {
			/*
			 * We had to unlock the run queue. In
			 * the mean time, task could have
			 * migrated already or had its affinity changed.
			 * Also make sure that it wasn't scheduled on its rq.
			 */
			if (unlikely(task_rq(task) != rq ||
				     !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) ||
				     task_running(rq, task) ||
				     !rt_task(task) ||
				     !task_on_rq_queued(task))) {

				double_unlock_balance(rq, lowest_rq);
				lowest_rq = NULL;
				break;
			}
		}

		/* If this rq is still suitable use it. */
		if (lowest_rq->rt.highest_prio.curr > task->prio)
			break;

		/* try again */
		double_unlock_balance(rq, lowest_rq);
		lowest_rq = NULL;
	}

	return lowest_rq;
}

static struct task_struct *pick_next_pushable_task(struct rq *rq)
{
	struct task_struct *p;

	if (!has_pushable_tasks(rq))
		return NULL;

	p = plist_first_entry(&rq->rt.pushable_tasks,
			      struct task_struct, pushable_tasks);

	BUG_ON(rq->cpu != task_cpu(p));
	BUG_ON(task_current(rq, p));
	BUG_ON(p->nr_cpus_allowed <= 1);

	BUG_ON(!task_on_rq_queued(p));
	BUG_ON(!rt_task(p));

	return p;
}

/*
 * If the current CPU has more than one RT task, see if the non
 * running task can migrate over to a CPU that is running a task
 * of lesser priority.
 */
static int push_rt_task(struct rq *rq, bool pull)
{
	struct task_struct *next_task;
	struct rq *lowest_rq;
	int ret = 0;

	if (!rq->rt.overloaded)
		return 0;

	next_task = pick_next_pushable_task(rq);
	if (!next_task)
		return 0;

retry:
	if (is_migration_disabled(next_task)) {
		struct task_struct *push_task = NULL;
		int cpu;

		if (!pull || rq->push_busy)
			return 0;

		cpu = find_lowest_rq(rq->curr);
		if (cpu == -1 || cpu == rq->cpu)
			return 0;

		/*
		 * Given we found a CPU with lower priority than @next_task,
		 * therefore it should be running. However we cannot migrate it
		 * to this other CPU, instead attempt to push the current
		 * running task on this CPU away.
		 */
		push_task = get_push_task(rq);
		if (push_task) {
			raw_spin_rq_unlock(rq);
			stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
					    push_task, &rq->push_work);
			raw_spin_rq_lock(rq);
		}

		return 0;
	}

	if (WARN_ON(next_task == rq->curr))
		return 0;

	/*
	 * It's possible that the next_task slipped in of
	 * higher priority than current. If that's the case
	 * just reschedule current.
	 */
	if (unlikely(next_task->prio < rq->curr->prio)) {
		resched_curr(rq);
		return 0;
	}

	/* We might release rq lock */
	get_task_struct(next_task);

	/* find_lock_lowest_rq locks the rq if found */
	lowest_rq = find_lock_lowest_rq(next_task, rq);
	if (!lowest_rq) {
		struct task_struct *task;
		/*
		 * find_lock_lowest_rq releases rq->lock
		 * so it is possible that next_task has migrated.
		 *
		 * We need to make sure that the task is still on the same
		 * run-queue and is also still the next task eligible for
		 * pushing.
		 */
		task = pick_next_pushable_task(rq);
		if (task == next_task) {
			/*
			 * The task hasn't migrated, and is still the next
			 * eligible task, but we failed to find a run-queue
			 * to push it to.  Do not retry in this case, since
			 * other CPUs will pull from us when ready.
			 */
			goto out;
		}

		if (!task)
			/* No more tasks, just exit */
			goto out;

		/*
		 * Something has shifted, try again.
		 */
		put_task_struct(next_task);
		next_task = task;
		goto retry;
	}

	deactivate_task(rq, next_task, 0);
	set_task_cpu(next_task, lowest_rq->cpu);
	activate_task(lowest_rq, next_task, 0);
	resched_curr(lowest_rq);
	ret = 1;

	double_unlock_balance(rq, lowest_rq);
out:
	put_task_struct(next_task);

	return ret;
}

static void push_rt_tasks(struct rq *rq)
{
	/* push_rt_task will return true if it moved an RT */
	while (push_rt_task(rq, false))
		;
}

#ifdef HAVE_RT_PUSH_IPI

/*
 * When a high priority task schedules out from a CPU and a lower priority
 * task is scheduled in, a check is made to see if there's any RT tasks
 * on other CPUs that are waiting to run because a higher priority RT task
 * is currently running on its CPU. In this case, the CPU with multiple RT
 * tasks queued on it (overloaded) needs to be notified that a CPU has opened
 * up that may be able to run one of its non-running queued RT tasks.
 *
 * All CPUs with overloaded RT tasks need to be notified as there is currently
 * no way to know which of these CPUs have the highest priority task waiting
 * to run. Instead of trying to take a spinlock on each of these CPUs,
 * which has been shown to cause large latency on machines with many CPUs,
 * an IPI is sent to the CPUs to have them push off the overloaded
 * RT tasks waiting to run.
 *
 * Just sending an IPI to each of the CPUs is also an issue, as on large
 * count CPU machines, this can cause an IPI storm on a CPU, especially
 * if it's the only CPU with multiple RT tasks queued, and a large number
 * of CPUs scheduling a lower priority task at the same time.
 *
 * Each root domain has its own irq work function that can iterate over
 * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
 * tasks must be checked whether one or many CPUs are lowering their
 * priority, there's a single irq work iterator that will try to
 * push off RT tasks that are waiting to run.
 *
 * When a CPU schedules a lower priority task, it will kick off the
 * irq work iterator that will jump to each CPU with overloaded RT tasks.
 * As it only takes the first CPU that schedules a lower priority task
 * to start the process, the rto_start variable is incremented and if
 * the atomic result is one, then that CPU will try to take the rto_lock.
 * This prevents high contention on the lock as the process handles all
 * CPUs scheduling lower priority tasks.
 *
 * All CPUs that are scheduling a lower priority task will increment the
 * rt_loop_next variable. This will make sure that the irq work iterator
 * checks all RT overloaded CPUs whenever a CPU schedules a new lower
 * priority task, even if the iterator is in the middle of a scan. Incrementing
 * the rt_loop_next will cause the iterator to perform another scan.
 */
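/*
 * Putting the pieces below together, the flow is roughly: a CPU that lowers
 * its priority calls tell_cpu_to_push(), which bumps rto_loop_next and, if
 * no iteration is already in flight, queues rto_push_work on the first CPU
 * returned by rto_next_cpu(). That CPU runs rto_push_irq_work_func() from
 * hardirq context, pushes away its own surplus RT tasks via
 * push_rt_task(rq, true), and then forwards the same irq work to the next
 * overloaded CPU until rto_next_cpu() returns -1.
 */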
2174 * 2175 * If there are no more CPUs left in the rto_mask, then a check is made 2176 * against rto_loop and rto_loop_next. rto_loop is only updated with 2177 * the rto_lock held, but any CPU may increment the rto_loop_next 2178 * without any locking. 2179 */ 2180 for (;;) { 2181 2182 /* When rto_cpu is -1 this acts like cpumask_first() */ 2183 cpu = cpumask_next(rd->rto_cpu, rd->rto_mask); 2184 2185 rd->rto_cpu = cpu; 2186 2187 if (cpu < nr_cpu_ids) 2188 return cpu; 2189 2190 rd->rto_cpu = -1; 2191 2192 /* 2193 * ACQUIRE ensures we see the @rto_mask changes 2194 * made prior to the @next value observed. 2195 * 2196 * Matches WMB in rt_set_overload(). 2197 */ 2198 next = atomic_read_acquire(&rd->rto_loop_next); 2199 2200 if (rd->rto_loop == next) 2201 break; 2202 2203 rd->rto_loop = next; 2204 } 2205 2206 return -1; 2207 } 2208 2209 static inline bool rto_start_trylock(atomic_t *v) 2210 { 2211 return !atomic_cmpxchg_acquire(v, 0, 1); 2212 } 2213 2214 static inline void rto_start_unlock(atomic_t *v) 2215 { 2216 atomic_set_release(v, 0); 2217 } 2218 2219 static void tell_cpu_to_push(struct rq *rq) 2220 { 2221 int cpu = -1; 2222 2223 /* Keep the loop going if the IPI is currently active */ 2224 atomic_inc(&rq->rd->rto_loop_next); 2225 2226 /* Only one CPU can initiate a loop at a time */ 2227 if (!rto_start_trylock(&rq->rd->rto_loop_start)) 2228 return; 2229 2230 raw_spin_lock(&rq->rd->rto_lock); 2231 2232 /* 2233 * The rto_cpu is updated under the lock, if it has a valid CPU 2234 * then the IPI is still running and will continue due to the 2235 * update to loop_next, and nothing needs to be done here. 2236 * Otherwise it is finishing up and an ipi needs to be sent. 2237 */ 2238 if (rq->rd->rto_cpu < 0) 2239 cpu = rto_next_cpu(rq->rd); 2240 2241 raw_spin_unlock(&rq->rd->rto_lock); 2242 2243 rto_start_unlock(&rq->rd->rto_loop_start); 2244 2245 if (cpu >= 0) { 2246 /* Make sure the rd does not get freed while pushing */ 2247 sched_get_rd(rq->rd); 2248 irq_work_queue_on(&rq->rd->rto_push_work, cpu); 2249 } 2250 } 2251 2252 /* Called from hardirq context */ 2253 void rto_push_irq_work_func(struct irq_work *work) 2254 { 2255 struct root_domain *rd = 2256 container_of(work, struct root_domain, rto_push_work); 2257 struct rq *rq; 2258 int cpu; 2259 2260 rq = this_rq(); 2261 2262 /* 2263 * We do not need to grab the lock to check for has_pushable_tasks. 2264 * When it gets updated, a check is made if a push is possible. 2265 */ 2266 if (has_pushable_tasks(rq)) { 2267 raw_spin_rq_lock(rq); 2268 while (push_rt_task(rq, true)) 2269 ; 2270 raw_spin_rq_unlock(rq); 2271 } 2272 2273 raw_spin_lock(&rd->rto_lock); 2274 2275 /* Pass the IPI to the next rt overloaded queue */ 2276 cpu = rto_next_cpu(rd); 2277 2278 raw_spin_unlock(&rd->rto_lock); 2279 2280 if (cpu < 0) { 2281 sched_put_rd(rd); 2282 return; 2283 } 2284 2285 /* Try the next RT overloaded CPU */ 2286 irq_work_queue_on(&rd->rto_push_work, cpu); 2287 } 2288 #endif /* HAVE_RT_PUSH_IPI */ 2289 2290 static void pull_rt_task(struct rq *this_rq) 2291 { 2292 int this_cpu = this_rq->cpu, cpu; 2293 bool resched = false; 2294 struct task_struct *p, *push_task; 2295 struct rq *src_rq; 2296 int rt_overload_count = rt_overloaded(this_rq); 2297 2298 if (likely(!rt_overload_count)) 2299 return; 2300 2301 /* 2302 * Match the barrier from rt_set_overloaded; this guarantees that if we 2303 * see overloaded we must also see the rto_mask bit. 
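 * The smp_rmb() below pairs with the smp_wmb() in rt_set_overload()
 * (the same write barrier referenced by the "Matches WMB in
 * rt_set_overload()" note in rto_next_cpu() above), which orders the
 * rto_mask update before the overload count read by rt_overloaded().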
2304 */ 2305 smp_rmb(); 2306 2307 /* If we are the only overloaded CPU do nothing */ 2308 if (rt_overload_count == 1 && 2309 cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask)) 2310 return; 2311 2312 #ifdef HAVE_RT_PUSH_IPI 2313 if (sched_feat(RT_PUSH_IPI)) { 2314 tell_cpu_to_push(this_rq); 2315 return; 2316 } 2317 #endif 2318 2319 for_each_cpu(cpu, this_rq->rd->rto_mask) { 2320 if (this_cpu == cpu) 2321 continue; 2322 2323 src_rq = cpu_rq(cpu); 2324 2325 /* 2326 * Don't bother taking the src_rq->lock if the next highest 2327 * task is known to be lower-priority than our current task. 2328 * This may look racy, but if this value is about to go 2329 * logically higher, the src_rq will push this task away. 2330 * And if its going logically lower, we do not care 2331 */ 2332 if (src_rq->rt.highest_prio.next >= 2333 this_rq->rt.highest_prio.curr) 2334 continue; 2335 2336 /* 2337 * We can potentially drop this_rq's lock in 2338 * double_lock_balance, and another CPU could 2339 * alter this_rq 2340 */ 2341 push_task = NULL; 2342 double_lock_balance(this_rq, src_rq); 2343 2344 /* 2345 * We can pull only a task, which is pushable 2346 * on its rq, and no others. 2347 */ 2348 p = pick_highest_pushable_task(src_rq, this_cpu); 2349 2350 /* 2351 * Do we have an RT task that preempts 2352 * the to-be-scheduled task? 2353 */ 2354 if (p && (p->prio < this_rq->rt.highest_prio.curr)) { 2355 WARN_ON(p == src_rq->curr); 2356 WARN_ON(!task_on_rq_queued(p)); 2357 2358 /* 2359 * There's a chance that p is higher in priority 2360 * than what's currently running on its CPU. 2361 * This is just that p is waking up and hasn't 2362 * had a chance to schedule. We only pull 2363 * p if it is lower in priority than the 2364 * current task on the run queue 2365 */ 2366 if (p->prio < src_rq->curr->prio) 2367 goto skip; 2368 2369 if (is_migration_disabled(p)) { 2370 push_task = get_push_task(src_rq); 2371 } else { 2372 deactivate_task(src_rq, p, 0); 2373 set_task_cpu(p, this_cpu); 2374 activate_task(this_rq, p, 0); 2375 resched = true; 2376 } 2377 /* 2378 * We continue with the search, just in 2379 * case there's an even higher prio task 2380 * in another runqueue. 
(low likelihood 2381 * but possible) 2382 */ 2383 } 2384 skip: 2385 double_unlock_balance(this_rq, src_rq); 2386 2387 if (push_task) { 2388 raw_spin_rq_unlock(this_rq); 2389 stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop, 2390 push_task, &src_rq->push_work); 2391 raw_spin_rq_lock(this_rq); 2392 } 2393 } 2394 2395 if (resched) 2396 resched_curr(this_rq); 2397 } 2398 2399 /* 2400 * If we are not running and we are not going to reschedule soon, we should 2401 * try to push tasks away now 2402 */ 2403 static void task_woken_rt(struct rq *rq, struct task_struct *p) 2404 { 2405 bool need_to_push = !task_running(rq, p) && 2406 !test_tsk_need_resched(rq->curr) && 2407 p->nr_cpus_allowed > 1 && 2408 (dl_task(rq->curr) || rt_task(rq->curr)) && 2409 (rq->curr->nr_cpus_allowed < 2 || 2410 rq->curr->prio <= p->prio); 2411 2412 if (need_to_push) 2413 push_rt_tasks(rq); 2414 } 2415 2416 /* Assumes rq->lock is held */ 2417 static void rq_online_rt(struct rq *rq) 2418 { 2419 if (rq->rt.overloaded) 2420 rt_set_overload(rq); 2421 2422 __enable_runtime(rq); 2423 2424 cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr); 2425 } 2426 2427 /* Assumes rq->lock is held */ 2428 static void rq_offline_rt(struct rq *rq) 2429 { 2430 if (rq->rt.overloaded) 2431 rt_clear_overload(rq); 2432 2433 __disable_runtime(rq); 2434 2435 cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID); 2436 } 2437 2438 /* 2439 * When switch from the rt queue, we bring ourselves to a position 2440 * that we might want to pull RT tasks from other runqueues. 2441 */ 2442 static void switched_from_rt(struct rq *rq, struct task_struct *p) 2443 { 2444 /* 2445 * If there are other RT tasks then we will reschedule 2446 * and the scheduling of the other RT tasks will handle 2447 * the balancing. But if we are the last RT task 2448 * we may need to handle the pulling of RT tasks 2449 * now. 2450 */ 2451 if (!task_on_rq_queued(p) || rq->rt.rt_nr_running) 2452 return; 2453 2454 rt_queue_pull_task(rq); 2455 } 2456 2457 void __init init_sched_rt_class(void) 2458 { 2459 unsigned int i; 2460 2461 for_each_possible_cpu(i) { 2462 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), 2463 GFP_KERNEL, cpu_to_node(i)); 2464 } 2465 } 2466 #endif /* CONFIG_SMP */ 2467 2468 /* 2469 * When switching a task to RT, we may overload the runqueue 2470 * with RT tasks. In this case we try to push them off to 2471 * other runqueues. 2472 */ 2473 static void switched_to_rt(struct rq *rq, struct task_struct *p) 2474 { 2475 /* 2476 * If we are running, update the avg_rt tracking, as the running time 2477 * will now on be accounted into the latter. 2478 */ 2479 if (task_current(rq, p)) { 2480 update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0); 2481 return; 2482 } 2483 2484 /* 2485 * If we are not running we may need to preempt the current 2486 * running task. If that current running task is also an RT task 2487 * then see if we can move to another run queue. 2488 */ 2489 if (task_on_rq_queued(p)) { 2490 #ifdef CONFIG_SMP 2491 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) 2492 rt_queue_push_tasks(rq); 2493 #endif /* CONFIG_SMP */ 2494 if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) 2495 resched_curr(rq); 2496 } 2497 } 2498 2499 /* 2500 * Priority of the task has changed. This may cause 2501 * us to initiate a push or pull. 
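 * Typical triggers are sched_setscheduler()/sched_setattr() changing
 * the RT priority, or a PI boost/deboost via rt_mutex_setprio().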
2502 */ 2503 static void 2504 prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) 2505 { 2506 if (!task_on_rq_queued(p)) 2507 return; 2508 2509 if (task_current(rq, p)) { 2510 #ifdef CONFIG_SMP 2511 /* 2512 * If our priority decreases while running, we 2513 * may need to pull tasks to this runqueue. 2514 */ 2515 if (oldprio < p->prio) 2516 rt_queue_pull_task(rq); 2517 2518 /* 2519 * If there's a higher priority task waiting to run 2520 * then reschedule. 2521 */ 2522 if (p->prio > rq->rt.highest_prio.curr) 2523 resched_curr(rq); 2524 #else 2525 /* For UP simply resched on drop of prio */ 2526 if (oldprio < p->prio) 2527 resched_curr(rq); 2528 #endif /* CONFIG_SMP */ 2529 } else { 2530 /* 2531 * This task is not running, but if it is 2532 * greater than the current running task 2533 * then reschedule. 2534 */ 2535 if (p->prio < rq->curr->prio) 2536 resched_curr(rq); 2537 } 2538 } 2539 2540 #ifdef CONFIG_POSIX_TIMERS 2541 static void watchdog(struct rq *rq, struct task_struct *p) 2542 { 2543 unsigned long soft, hard; 2544 2545 /* max may change after cur was read, this will be fixed next tick */ 2546 soft = task_rlimit(p, RLIMIT_RTTIME); 2547 hard = task_rlimit_max(p, RLIMIT_RTTIME); 2548 2549 if (soft != RLIM_INFINITY) { 2550 unsigned long next; 2551 2552 if (p->rt.watchdog_stamp != jiffies) { 2553 p->rt.timeout++; 2554 p->rt.watchdog_stamp = jiffies; 2555 } 2556 2557 next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ); 2558 if (p->rt.timeout > next) { 2559 posix_cputimers_rt_watchdog(&p->posix_cputimers, 2560 p->se.sum_exec_runtime); 2561 } 2562 } 2563 } 2564 #else 2565 static inline void watchdog(struct rq *rq, struct task_struct *p) { } 2566 #endif 2567 2568 /* 2569 * scheduler tick hitting a task of our scheduling class. 2570 * 2571 * NOTE: This function can be called remotely by the tick offload that 2572 * goes along full dynticks. Therefore no local assumption can be made 2573 * and everything must be accessed through the @rq and @curr passed in 2574 * parameters. 2575 */ 2576 static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) 2577 { 2578 struct sched_rt_entity *rt_se = &p->rt; 2579 2580 update_curr_rt(rq); 2581 update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1); 2582 2583 watchdog(rq, p); 2584 2585 /* 2586 * RR tasks need a special form of timeslice management. 2587 * FIFO tasks have no timeslices. 
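 * Each tick decrements p->rt.time_slice; only when it reaches zero is
 * the slice refilled from sched_rr_timeslice (roughly 100ms worth of
 * ticks with the default RR_TIMESLICE) and the task considered for
 * requeueing behind its same-priority peers, as handled below.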
2588 */ 2589 if (p->policy != SCHED_RR) 2590 return; 2591 2592 if (--p->rt.time_slice) 2593 return; 2594 2595 p->rt.time_slice = sched_rr_timeslice; 2596 2597 /* 2598 * Requeue to the end of queue if we (and all of our ancestors) are not 2599 * the only element on the queue 2600 */ 2601 for_each_sched_rt_entity(rt_se) { 2602 if (rt_se->run_list.prev != rt_se->run_list.next) { 2603 requeue_task_rt(rq, p, 0); 2604 resched_curr(rq); 2605 return; 2606 } 2607 } 2608 } 2609 2610 static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) 2611 { 2612 /* 2613 * Time slice is 0 for SCHED_FIFO tasks 2614 */ 2615 if (task->policy == SCHED_RR) 2616 return sched_rr_timeslice; 2617 else 2618 return 0; 2619 } 2620 2621 DEFINE_SCHED_CLASS(rt) = { 2622 2623 .enqueue_task = enqueue_task_rt, 2624 .dequeue_task = dequeue_task_rt, 2625 .yield_task = yield_task_rt, 2626 2627 .check_preempt_curr = check_preempt_curr_rt, 2628 2629 .pick_next_task = pick_next_task_rt, 2630 .put_prev_task = put_prev_task_rt, 2631 .set_next_task = set_next_task_rt, 2632 2633 #ifdef CONFIG_SMP 2634 .balance = balance_rt, 2635 .pick_task = pick_task_rt, 2636 .select_task_rq = select_task_rq_rt, 2637 .set_cpus_allowed = set_cpus_allowed_common, 2638 .rq_online = rq_online_rt, 2639 .rq_offline = rq_offline_rt, 2640 .task_woken = task_woken_rt, 2641 .switched_from = switched_from_rt, 2642 .find_lock_rq = find_lock_lowest_rq, 2643 #endif 2644 2645 .task_tick = task_tick_rt, 2646 2647 .get_rr_interval = get_rr_interval_rt, 2648 2649 .prio_changed = prio_changed_rt, 2650 .switched_to = switched_to_rt, 2651 2652 .update_curr = update_curr_rt, 2653 2654 #ifdef CONFIG_UCLAMP_TASK 2655 .uclamp_enabled = 1, 2656 #endif 2657 }; 2658 2659 #ifdef CONFIG_RT_GROUP_SCHED 2660 /* 2661 * Ensure that the real time constraints are schedulable. 2662 */ 2663 static DEFINE_MUTEX(rt_constraints_mutex); 2664 2665 static inline int tg_has_rt_tasks(struct task_group *tg) 2666 { 2667 struct task_struct *task; 2668 struct css_task_iter it; 2669 int ret = 0; 2670 2671 /* 2672 * Autogroups do not have RT tasks; see autogroup_create(). 2673 */ 2674 if (task_group_is_autogroup(tg)) 2675 return 0; 2676 2677 css_task_iter_start(&tg->css, 0, &it); 2678 while (!ret && (task = css_task_iter_next(&it))) 2679 ret |= rt_task(task); 2680 css_task_iter_end(&it); 2681 2682 return ret; 2683 } 2684 2685 struct rt_schedulable_data { 2686 struct task_group *tg; 2687 u64 rt_period; 2688 u64 rt_runtime; 2689 }; 2690 2691 static int tg_rt_schedulable(struct task_group *tg, void *data) 2692 { 2693 struct rt_schedulable_data *d = data; 2694 struct task_group *child; 2695 unsigned long total, sum = 0; 2696 u64 period, runtime; 2697 2698 period = ktime_to_ns(tg->rt_bandwidth.rt_period); 2699 runtime = tg->rt_bandwidth.rt_runtime; 2700 2701 if (tg == d->tg) { 2702 period = d->rt_period; 2703 runtime = d->rt_runtime; 2704 } 2705 2706 /* 2707 * Cannot have more runtime than the period. 2708 */ 2709 if (runtime > period && runtime != RUNTIME_INF) 2710 return -EINVAL; 2711 2712 /* 2713 * Ensure we don't starve existing RT tasks if runtime turns zero. 2714 */ 2715 if (rt_bandwidth_enabled() && !runtime && 2716 tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg)) 2717 return -EBUSY; 2718 2719 total = to_ratio(period, runtime); 2720 2721 /* 2722 * Nobody can have more than the global setting allows. 2723 */ 2724 if (total > to_ratio(global_rt_period(), global_rt_runtime())) 2725 return -EINVAL; 2726 2727 /* 2728 * The sum of our children's runtime should not exceed our own. 
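 * Both sides are compared as to_ratio() values, i.e. runtime/period in
 * fixed point. Purely illustrative numbers, assuming BW_SHIFT == 20:
 * a parent with runtime=950000us over period=1000000us amounts to
 * roughly 0.95 * 2^20 = 996147, while two children each granted
 * 500000us/1000000us contribute 524288 apiece; their sum (1048576)
 * exceeds the parent's total, so the update is rejected with -EINVAL.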
2729 */ 2730 list_for_each_entry_rcu(child, &tg->children, siblings) { 2731 period = ktime_to_ns(child->rt_bandwidth.rt_period); 2732 runtime = child->rt_bandwidth.rt_runtime; 2733 2734 if (child == d->tg) { 2735 period = d->rt_period; 2736 runtime = d->rt_runtime; 2737 } 2738 2739 sum += to_ratio(period, runtime); 2740 } 2741 2742 if (sum > total) 2743 return -EINVAL; 2744 2745 return 0; 2746 } 2747 2748 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 2749 { 2750 int ret; 2751 2752 struct rt_schedulable_data data = { 2753 .tg = tg, 2754 .rt_period = period, 2755 .rt_runtime = runtime, 2756 }; 2757 2758 rcu_read_lock(); 2759 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data); 2760 rcu_read_unlock(); 2761 2762 return ret; 2763 } 2764 2765 static int tg_set_rt_bandwidth(struct task_group *tg, 2766 u64 rt_period, u64 rt_runtime) 2767 { 2768 int i, err = 0; 2769 2770 /* 2771 * Disallowing the root group RT runtime is BAD, it would disallow the 2772 * kernel creating (and or operating) RT threads. 2773 */ 2774 if (tg == &root_task_group && rt_runtime == 0) 2775 return -EINVAL; 2776 2777 /* No period doesn't make any sense. */ 2778 if (rt_period == 0) 2779 return -EINVAL; 2780 2781 /* 2782 * Bound quota to defend quota against overflow during bandwidth shift. 2783 */ 2784 if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime) 2785 return -EINVAL; 2786 2787 mutex_lock(&rt_constraints_mutex); 2788 err = __rt_schedulable(tg, rt_period, rt_runtime); 2789 if (err) 2790 goto unlock; 2791 2792 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); 2793 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 2794 tg->rt_bandwidth.rt_runtime = rt_runtime; 2795 2796 for_each_possible_cpu(i) { 2797 struct rt_rq *rt_rq = tg->rt_rq[i]; 2798 2799 raw_spin_lock(&rt_rq->rt_runtime_lock); 2800 rt_rq->rt_runtime = rt_runtime; 2801 raw_spin_unlock(&rt_rq->rt_runtime_lock); 2802 } 2803 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 2804 unlock: 2805 mutex_unlock(&rt_constraints_mutex); 2806 2807 return err; 2808 } 2809 2810 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) 2811 { 2812 u64 rt_runtime, rt_period; 2813 2814 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); 2815 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; 2816 if (rt_runtime_us < 0) 2817 rt_runtime = RUNTIME_INF; 2818 else if ((u64)rt_runtime_us > U64_MAX / NSEC_PER_USEC) 2819 return -EINVAL; 2820 2821 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 2822 } 2823 2824 long sched_group_rt_runtime(struct task_group *tg) 2825 { 2826 u64 rt_runtime_us; 2827 2828 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) 2829 return -1; 2830 2831 rt_runtime_us = tg->rt_bandwidth.rt_runtime; 2832 do_div(rt_runtime_us, NSEC_PER_USEC); 2833 return rt_runtime_us; 2834 } 2835 2836 int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us) 2837 { 2838 u64 rt_runtime, rt_period; 2839 2840 if (rt_period_us > U64_MAX / NSEC_PER_USEC) 2841 return -EINVAL; 2842 2843 rt_period = rt_period_us * NSEC_PER_USEC; 2844 rt_runtime = tg->rt_bandwidth.rt_runtime; 2845 2846 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 2847 } 2848 2849 long sched_group_rt_period(struct task_group *tg) 2850 { 2851 u64 rt_period_us; 2852 2853 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); 2854 do_div(rt_period_us, NSEC_PER_USEC); 2855 return rt_period_us; 2856 } 2857 2858 static int sched_rt_global_constraints(void) 2859 { 2860 int ret = 0; 2861 2862 mutex_lock(&rt_constraints_mutex); 2863 ret = 
__rt_schedulable(NULL, 0, 0); 2864 mutex_unlock(&rt_constraints_mutex); 2865 2866 return ret; 2867 } 2868 2869 int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) 2870 { 2871 /* Don't accept realtime tasks when there is no way for them to run */ 2872 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) 2873 return 0; 2874 2875 return 1; 2876 } 2877 2878 #else /* !CONFIG_RT_GROUP_SCHED */ 2879 static int sched_rt_global_constraints(void) 2880 { 2881 unsigned long flags; 2882 int i; 2883 2884 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 2885 for_each_possible_cpu(i) { 2886 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 2887 2888 raw_spin_lock(&rt_rq->rt_runtime_lock); 2889 rt_rq->rt_runtime = global_rt_runtime(); 2890 raw_spin_unlock(&rt_rq->rt_runtime_lock); 2891 } 2892 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 2893 2894 return 0; 2895 } 2896 #endif /* CONFIG_RT_GROUP_SCHED */ 2897 2898 static int sched_rt_global_validate(void) 2899 { 2900 if (sysctl_sched_rt_period <= 0) 2901 return -EINVAL; 2902 2903 if ((sysctl_sched_rt_runtime != RUNTIME_INF) && 2904 ((sysctl_sched_rt_runtime > sysctl_sched_rt_period) || 2905 ((u64)sysctl_sched_rt_runtime * 2906 NSEC_PER_USEC > max_rt_runtime))) 2907 return -EINVAL; 2908 2909 return 0; 2910 } 2911 2912 static void sched_rt_do_global(void) 2913 { 2914 def_rt_bandwidth.rt_runtime = global_rt_runtime(); 2915 def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); 2916 } 2917 2918 int sched_rt_handler(struct ctl_table *table, int write, void *buffer, 2919 size_t *lenp, loff_t *ppos) 2920 { 2921 int old_period, old_runtime; 2922 static DEFINE_MUTEX(mutex); 2923 int ret; 2924 2925 mutex_lock(&mutex); 2926 old_period = sysctl_sched_rt_period; 2927 old_runtime = sysctl_sched_rt_runtime; 2928 2929 ret = proc_dointvec(table, write, buffer, lenp, ppos); 2930 2931 if (!ret && write) { 2932 ret = sched_rt_global_validate(); 2933 if (ret) 2934 goto undo; 2935 2936 ret = sched_dl_global_validate(); 2937 if (ret) 2938 goto undo; 2939 2940 ret = sched_rt_global_constraints(); 2941 if (ret) 2942 goto undo; 2943 2944 sched_rt_do_global(); 2945 sched_dl_do_global(); 2946 } 2947 if (0) { 2948 undo: 2949 sysctl_sched_rt_period = old_period; 2950 sysctl_sched_rt_runtime = old_runtime; 2951 } 2952 mutex_unlock(&mutex); 2953 2954 return ret; 2955 } 2956 2957 int sched_rr_handler(struct ctl_table *table, int write, void *buffer, 2958 size_t *lenp, loff_t *ppos) 2959 { 2960 int ret; 2961 static DEFINE_MUTEX(mutex); 2962 2963 mutex_lock(&mutex); 2964 ret = proc_dointvec(table, write, buffer, lenp, ppos); 2965 /* 2966 * Make sure that internally we keep jiffies. 2967 * Also, writing zero resets the timeslice to default: 2968 */ 2969 if (!ret && write) { 2970 sched_rr_timeslice = 2971 sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE : 2972 msecs_to_jiffies(sysctl_sched_rr_timeslice); 2973 } 2974 mutex_unlock(&mutex); 2975 2976 return ret; 2977 } 2978 2979 #ifdef CONFIG_SCHED_DEBUG 2980 void print_rt_stats(struct seq_file *m, int cpu) 2981 { 2982 rt_rq_iter_t iter; 2983 struct rt_rq *rt_rq; 2984 2985 rcu_read_lock(); 2986 for_each_rt_rq(rt_rq, iter, cpu_rq(cpu)) 2987 print_rt_rq(m, cpu, rt_rq); 2988 rcu_read_unlock(); 2989 } 2990 #endif /* CONFIG_SCHED_DEBUG */ 2991
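
/*
 * Usage notes (illustrative only; nothing below is referenced by the
 * code above):
 *
 * The handlers above back the global RT sysctls, typically exposed as
 * /proc/sys/kernel/sched_rt_period_us, sched_rt_runtime_us and
 * sched_rr_timeslice_ms. With the common defaults of period=1000000us
 * and runtime=950000us, RT tasks may consume at most 95% of every
 * period; writing -1 to sched_rt_runtime_us selects RUNTIME_INF and
 * disables RT throttling. For sched_rr_timeslice_ms, sched_rr_handler()
 * converts the written value with msecs_to_jiffies() (e.g. 100 becomes
 * 25 jiffies at HZ=250), and a zero or negative value resets
 * sched_rr_timeslice to RR_TIMESLICE.
 *
 * A minimal user-space sketch (assumed POSIX API, not part of this
 * file) that puts the calling thread under the SCHED_RR policy handled
 * here; it needs the usual RT privileges (CAP_SYS_NICE or a suitable
 * RLIMIT_RTPRIO):
 *
 *	#include <sched.h>
 *	#include <stdio.h>
 *
 *	int make_me_rr(void)
 *	{
 *		struct sched_param sp = { .sched_priority = 10 };
 *
 *		if (sched_setscheduler(0, SCHED_RR, &sp)) {
 *			perror("sched_setscheduler");
 *			return -1;
 *		}
 *		return 0;
 *	}
 *
 * Such a thread is then ticked by task_tick_rt() above and round-robins
 * with same-priority peers every sched_rr_timeslice.
 */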