/*
 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
 * policies)
 */

#include "sched.h"

#include <linux/slab.h>
#include <linux/irq_work.h>

int sched_rr_timeslice = RR_TIMESLICE;

static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);

struct rt_bandwidth def_rt_bandwidth;

static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
{
	struct rt_bandwidth *rt_b =
		container_of(timer, struct rt_bandwidth, rt_period_timer);
	int idle = 0;
	int overrun;

	raw_spin_lock(&rt_b->rt_runtime_lock);
	for (;;) {
		overrun = hrtimer_forward_now(timer, rt_b->rt_period);
		if (!overrun)
			break;

		raw_spin_unlock(&rt_b->rt_runtime_lock);
		idle = do_sched_rt_period_timer(rt_b, overrun);
		raw_spin_lock(&rt_b->rt_runtime_lock);
	}
	if (idle)
		rt_b->rt_period_active = 0;
	raw_spin_unlock(&rt_b->rt_runtime_lock);

	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}

void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
{
	rt_b->rt_period = ns_to_ktime(period);
	rt_b->rt_runtime = runtime;

	raw_spin_lock_init(&rt_b->rt_runtime_lock);

	hrtimer_init(&rt_b->rt_period_timer,
			CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	rt_b->rt_period_timer.function = sched_rt_period_timer;
}

static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
		return;

	raw_spin_lock(&rt_b->rt_runtime_lock);
	if (!rt_b->rt_period_active) {
		rt_b->rt_period_active = 1;
		hrtimer_forward_now(&rt_b->rt_period_timer, rt_b->rt_period);
		hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED);
	}
	raw_spin_unlock(&rt_b->rt_runtime_lock);
}
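/*
 * Illustrative note (not from the original source): with the default
 * sysctl values, /proc/sys/kernel/sched_rt_period_us = 1000000 and
 * /proc/sys/kernel/sched_rt_runtime_us = 950000, the rt_period timer
 * above fires once per second and RT tasks may consume at most 950 ms
 * of each period, leaving 50 ms for non-RT tasks. A sketch of how a
 * task ends up in this scheduling class from userspace:
 *
 *	struct sched_param sp = { .sched_priority = 50 };
 *
 *	if (sched_setscheduler(0, SCHED_FIFO, &sp))	// or SCHED_RR
 *		perror("sched_setscheduler");
 *
 * SCHED_RR tasks additionally round-robin within a priority level every
 * sched_rr_timeslice ticks (RR_TIMESLICE, 100 ms worth by default).
 */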
#ifdef CONFIG_SMP
static void push_irq_work_func(struct irq_work *work);
#endif

void init_rt_rq(struct rt_rq *rt_rq)
{
	struct rt_prio_array *array;
	int i;

	array = &rt_rq->active;
	for (i = 0; i < MAX_RT_PRIO; i++) {
		INIT_LIST_HEAD(array->queue + i);
		__clear_bit(i, array->bitmap);
	}
	/* delimiter for bitsearch: */
	__set_bit(MAX_RT_PRIO, array->bitmap);

#if defined CONFIG_SMP
	rt_rq->highest_prio.curr = MAX_RT_PRIO;
	rt_rq->highest_prio.next = MAX_RT_PRIO;
	rt_rq->rt_nr_migratory = 0;
	rt_rq->overloaded = 0;
	plist_head_init(&rt_rq->pushable_tasks);

#ifdef HAVE_RT_PUSH_IPI
	rt_rq->push_flags = 0;
	rt_rq->push_cpu = nr_cpu_ids;
	raw_spin_lock_init(&rt_rq->push_lock);
	init_irq_work(&rt_rq->push_work, push_irq_work_func);
#endif
#endif /* CONFIG_SMP */
	/* We start in dequeued state, because no RT tasks are queued */
	rt_rq->rt_queued = 0;

	rt_rq->rt_time = 0;
	rt_rq->rt_throttled = 0;
	rt_rq->rt_runtime = 0;
	raw_spin_lock_init(&rt_rq->rt_runtime_lock);
}

#ifdef CONFIG_RT_GROUP_SCHED
static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
{
	hrtimer_cancel(&rt_b->rt_period_timer);
}

#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)

static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
{
#ifdef CONFIG_SCHED_DEBUG
	WARN_ON_ONCE(!rt_entity_is_task(rt_se));
#endif
	return container_of(rt_se, struct task_struct, rt);
}

static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{
	return rt_rq->rq;
}

static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
{
	return rt_se->rt_rq;
}

static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
{
	struct rt_rq *rt_rq = rt_se->rt_rq;

	return rt_rq->rq;
}

void free_rt_sched_group(struct task_group *tg)
{
	int i;

	if (tg->rt_se)
		destroy_rt_bandwidth(&tg->rt_bandwidth);

	for_each_possible_cpu(i) {
		if (tg->rt_rq)
			kfree(tg->rt_rq[i]);
		if (tg->rt_se)
			kfree(tg->rt_se[i]);
	}

	kfree(tg->rt_rq);
	kfree(tg->rt_se);
}

void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
		struct sched_rt_entity *rt_se, int cpu,
		struct sched_rt_entity *parent)
{
	struct rq *rq = cpu_rq(cpu);

	rt_rq->highest_prio.curr = MAX_RT_PRIO;
	rt_rq->rt_nr_boosted = 0;
	rt_rq->rq = rq;
	rt_rq->tg = tg;

	tg->rt_rq[cpu] = rt_rq;
	tg->rt_se[cpu] = rt_se;

	if (!rt_se)
		return;

	if (!parent)
		rt_se->rt_rq = &rq->rt;
	else
		rt_se->rt_rq = parent->my_q;

	rt_se->my_q = rt_rq;
	rt_se->parent = parent;
	INIT_LIST_HEAD(&rt_se->run_list);
}
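/*
 * Illustrative note (not from the original source): under
 * CONFIG_RT_GROUP_SCHED each task_group owns one rt_rq and one
 * sched_rt_entity per CPU. For a group entity, rt_se->my_q points at
 * the rt_rq it *owns* (holding its children), while rt_se->rt_rq points
 * at the rt_rq it is *queued on* (its parent's queue, or the root
 * &rq->rt):
 *
 *	root rq->rt
 *	  `- tg A's rt_se[cpu]      (rt_se->rt_rq == &rq->rt)
 *	       `- tg A's rt_rq[cpu] (== rt_se->my_q)
 *	            `- task rt_se   (my_q == NULL, hence a task)
 */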
int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
{
	struct rt_rq *rt_rq;
	struct sched_rt_entity *rt_se;
	int i;

	tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
	if (!tg->rt_rq)
		goto err;
	tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
	if (!tg->rt_se)
		goto err;

	init_rt_bandwidth(&tg->rt_bandwidth,
			ktime_to_ns(def_rt_bandwidth.rt_period), 0);

	for_each_possible_cpu(i) {
		rt_rq = kzalloc_node(sizeof(struct rt_rq),
				     GFP_KERNEL, cpu_to_node(i));
		if (!rt_rq)
			goto err;

		rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
				     GFP_KERNEL, cpu_to_node(i));
		if (!rt_se)
			goto err_free_rq;

		init_rt_rq(rt_rq);
		rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
		init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
	}

	return 1;

err_free_rq:
	kfree(rt_rq);
err:
	return 0;
}

#else /* CONFIG_RT_GROUP_SCHED */

#define rt_entity_is_task(rt_se) (1)

static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
{
	return container_of(rt_se, struct task_struct, rt);
}

static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{
	return container_of(rt_rq, struct rq, rt);
}

static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
{
	struct task_struct *p = rt_task_of(rt_se);

	return task_rq(p);
}

static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
{
	struct rq *rq = rq_of_rt_se(rt_se);

	return &rq->rt;
}

void free_rt_sched_group(struct task_group *tg) { }

int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
{
	return 1;
}
#endif /* CONFIG_RT_GROUP_SCHED */

#ifdef CONFIG_SMP

static int pull_rt_task(struct rq *this_rq);

static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
{
	/* Try to pull RT tasks here if we lower this rq's prio */
	return rq->rt.highest_prio.curr > prev->prio;
}

static inline int rt_overloaded(struct rq *rq)
{
	return atomic_read(&rq->rd->rto_count);
}

static inline void rt_set_overload(struct rq *rq)
{
	if (!rq->online)
		return;

	cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
	/*
	 * Make sure the mask is visible before we set
	 * the overload count. That is checked to determine
	 * if we should look at the mask. It would be a shame
	 * if we looked at the mask, but the mask was not
	 * updated yet.
	 *
	 * Matched by the barrier in pull_rt_task().
	 */
	smp_wmb();
	atomic_inc(&rq->rd->rto_count);
}

static inline void rt_clear_overload(struct rq *rq)
{
	if (!rq->online)
		return;

	/* the order here really doesn't matter */
	atomic_dec(&rq->rd->rto_count);
	cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
}
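/*
 * Illustrative note (not from the original source): the pairing above is
 * the classic store/load ordering pattern. Writer (rt_set_overload):
 * set the rto_mask bit, smp_wmb(), then increment rto_count. Reader
 * (pull_rt_task): read rto_count, smp_rmb(), then read rto_mask. A
 * reader that observes rto_count != 0 is therefore guaranteed to also
 * observe the mask bit that was set before the increment.
 */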
static void update_rt_migration(struct rt_rq *rt_rq)
{
	if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
		if (!rt_rq->overloaded) {
			rt_set_overload(rq_of_rt_rq(rt_rq));
			rt_rq->overloaded = 1;
		}
	} else if (rt_rq->overloaded) {
		rt_clear_overload(rq_of_rt_rq(rt_rq));
		rt_rq->overloaded = 0;
	}
}

static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	struct task_struct *p;

	if (!rt_entity_is_task(rt_se))
		return;

	p = rt_task_of(rt_se);
	rt_rq = &rq_of_rt_rq(rt_rq)->rt;

	rt_rq->rt_nr_total++;
	if (p->nr_cpus_allowed > 1)
		rt_rq->rt_nr_migratory++;

	update_rt_migration(rt_rq);
}

static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	struct task_struct *p;

	if (!rt_entity_is_task(rt_se))
		return;

	p = rt_task_of(rt_se);
	rt_rq = &rq_of_rt_rq(rt_rq)->rt;

	rt_rq->rt_nr_total--;
	if (p->nr_cpus_allowed > 1)
		rt_rq->rt_nr_migratory--;

	update_rt_migration(rt_rq);
}

static inline int has_pushable_tasks(struct rq *rq)
{
	return !plist_head_empty(&rq->rt.pushable_tasks);
}

static inline void set_post_schedule(struct rq *rq)
{
	/*
	 * We detect this state here so that we can avoid taking the RQ
	 * lock again later if there is no need to push
	 */
	rq->post_schedule = has_pushable_tasks(rq);
}

static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
{
	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
	plist_node_init(&p->pushable_tasks, p->prio);
	plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);

	/* Update the highest prio pushable task */
	if (p->prio < rq->rt.highest_prio.next)
		rq->rt.highest_prio.next = p->prio;
}

static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
{
	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);

	/* Update the new highest prio pushable task */
	if (has_pushable_tasks(rq)) {
		p = plist_first_entry(&rq->rt.pushable_tasks,
				      struct task_struct, pushable_tasks);
		rq->rt.highest_prio.next = p->prio;
	} else
		rq->rt.highest_prio.next = MAX_RT_PRIO;
}

#else

static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
{
}

static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
{
}

static inline
void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
}

static inline
void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
}

static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
{
	return false;
}

static inline int pull_rt_task(struct rq *this_rq)
{
	return 0;
}

static inline void set_post_schedule(struct rq *rq)
{
}
#endif /* CONFIG_SMP */
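/*
 * Illustrative note (not from the original source): pushable_tasks is a
 * plist, so enqueue_pushable_task() keeps the list sorted by priority,
 * using p->prio as the node key. plist_first_entry() therefore always
 * yields the highest-priority pushable task, which is why
 * highest_prio.next can be maintained in O(1) on both enqueue and
 * dequeue above.
 */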
static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
static void dequeue_top_rt_rq(struct rt_rq *rt_rq);

static inline int on_rt_rq(struct sched_rt_entity *rt_se)
{
	return !list_empty(&rt_se->run_list);
}

#ifdef CONFIG_RT_GROUP_SCHED

static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
{
	if (!rt_rq->tg)
		return RUNTIME_INF;

	return rt_rq->rt_runtime;
}

static inline u64 sched_rt_period(struct rt_rq *rt_rq)
{
	return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
}

typedef struct task_group *rt_rq_iter_t;

static inline struct task_group *next_task_group(struct task_group *tg)
{
	do {
		tg = list_entry_rcu(tg->list.next,
			typeof(struct task_group), list);
	} while (&tg->list != &task_groups && task_group_is_autogroup(tg));

	if (&tg->list == &task_groups)
		tg = NULL;

	return tg;
}

#define for_each_rt_rq(rt_rq, iter, rq)					\
	for (iter = container_of(&task_groups, typeof(*iter), list);	\
		(iter = next_task_group(iter)) &&			\
		(rt_rq = iter->rt_rq[cpu_of(rq)]);)

#define for_each_sched_rt_entity(rt_se) \
	for (; rt_se; rt_se = rt_se->parent)

static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
{
	return rt_se->my_q;
}

static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
static void dequeue_rt_entity(struct sched_rt_entity *rt_se);

static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
	struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
	struct rq *rq = rq_of_rt_rq(rt_rq);
	struct sched_rt_entity *rt_se;

	int cpu = cpu_of(rq);

	rt_se = rt_rq->tg->rt_se[cpu];

	if (rt_rq->rt_nr_running) {
		if (!rt_se)
			enqueue_top_rt_rq(rt_rq);
		else if (!on_rt_rq(rt_se))
			enqueue_rt_entity(rt_se, false);

		if (rt_rq->highest_prio.curr < curr->prio)
			resched_curr(rq);
	}
}

static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
	struct sched_rt_entity *rt_se;
	int cpu = cpu_of(rq_of_rt_rq(rt_rq));

	rt_se = rt_rq->tg->rt_se[cpu];

	if (!rt_se)
		dequeue_top_rt_rq(rt_rq);
	else if (on_rt_rq(rt_se))
		dequeue_rt_entity(rt_se);
}

static inline int rt_rq_throttled(struct rt_rq *rt_rq)
{
	return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
}

static int rt_se_boosted(struct sched_rt_entity *rt_se)
{
	struct rt_rq *rt_rq = group_rt_rq(rt_se);
	struct task_struct *p;

	if (rt_rq)
		return !!rt_rq->rt_nr_boosted;

	p = rt_task_of(rt_se);
	return p->prio != p->normal_prio;
}

#ifdef CONFIG_SMP
static inline const struct cpumask *sched_rt_period_mask(void)
{
	return this_rq()->rd->span;
}
#else
static inline const struct cpumask *sched_rt_period_mask(void)
{
	return cpu_online_mask;
}
#endif

static inline
struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
{
	return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
}

static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
{
	return &rt_rq->tg->rt_bandwidth;
}

#else /* !CONFIG_RT_GROUP_SCHED */

static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
{
	return rt_rq->rt_runtime;
}

static inline u64 sched_rt_period(struct rt_rq *rt_rq)
{
	return ktime_to_ns(def_rt_bandwidth.rt_period);
}

typedef struct rt_rq *rt_rq_iter_t;

#define for_each_rt_rq(rt_rq, iter, rq) \
	for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)

#define for_each_sched_rt_entity(rt_se) \
	for (; rt_se; rt_se = NULL)

static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
{
	return NULL;
}

static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

	if (!rt_rq->rt_nr_running)
		return;

	enqueue_top_rt_rq(rt_rq);
	resched_curr(rq);
}

static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
	dequeue_top_rt_rq(rt_rq);
}

static inline int rt_rq_throttled(struct rt_rq *rt_rq)
{
	return rt_rq->rt_throttled;
}

static inline const struct cpumask *sched_rt_period_mask(void)
{
	return cpu_online_mask;
}

static inline
struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
{
	return &cpu_rq(cpu)->rt;
}

static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
{
	return &def_rt_bandwidth;
}

#endif /* CONFIG_RT_GROUP_SCHED */

bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
{
	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);

	return (hrtimer_active(&rt_b->rt_period_timer) ||
		rt_rq->rt_time < rt_b->rt_runtime);
}

#ifdef CONFIG_SMP
/*
 * We ran out of runtime, see if we can borrow some from our neighbours.
 */
static int do_balance_runtime(struct rt_rq *rt_rq)
{
	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
	struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
	int i, weight, more = 0;
	u64 rt_period;

	weight = cpumask_weight(rd->span);

	raw_spin_lock(&rt_b->rt_runtime_lock);
	rt_period = ktime_to_ns(rt_b->rt_period);
	for_each_cpu(i, rd->span) {
		struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
		s64 diff;

		if (iter == rt_rq)
			continue;

		raw_spin_lock(&iter->rt_runtime_lock);
		/*
		 * Either all rqs have inf runtime and there's nothing to steal
		 * or __disable_runtime() below sets a specific rq to inf to
		 * indicate it's been disabled and disallow stealing.
		 */
		if (iter->rt_runtime == RUNTIME_INF)
			goto next;

		/*
		 * From runqueues with spare time, take 1/n part of their
		 * spare time, but no more than our period.
		 */
		diff = iter->rt_runtime - iter->rt_time;
		if (diff > 0) {
			diff = div_u64((u64)diff, weight);
			if (rt_rq->rt_runtime + diff > rt_period)
				diff = rt_period - rt_rq->rt_runtime;
			iter->rt_runtime -= diff;
			rt_rq->rt_runtime += diff;
			more = 1;
			if (rt_rq->rt_runtime == rt_period) {
				raw_spin_unlock(&iter->rt_runtime_lock);
				break;
			}
		}
next:
		raw_spin_unlock(&iter->rt_runtime_lock);
	}
	raw_spin_unlock(&rt_b->rt_runtime_lock);

	return more;
}
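/*
 * Illustrative worked example (not from the original source): on a
 * 4-CPU root domain with rt_runtime = 950 ms, suppose this rt_rq has
 * exhausted its 950 ms while a neighbour has only used 150 ms. The
 * neighbour's spare time is 950 - 150 = 800 ms, of which we may take
 * 1/weight = 1/4, i.e. 200 ms, capped so our total never exceeds
 * rt_period. Repeated over all neighbours, busy CPUs soak up runtime
 * that idle CPUs are not using.
 */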
/*
 * Ensure this RQ takes back all the runtime it lent to its neighbours.
 */
static void __disable_runtime(struct rq *rq)
{
	struct root_domain *rd = rq->rd;
	rt_rq_iter_t iter;
	struct rt_rq *rt_rq;

	if (unlikely(!scheduler_running))
		return;

	for_each_rt_rq(rt_rq, iter, rq) {
		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
		s64 want;
		int i;

		raw_spin_lock(&rt_b->rt_runtime_lock);
		raw_spin_lock(&rt_rq->rt_runtime_lock);
		/*
		 * Either we're all inf and nobody needs to borrow, or we're
		 * already disabled and thus have nothing to do, or we have
		 * exactly the right amount of runtime to take out.
		 */
		if (rt_rq->rt_runtime == RUNTIME_INF ||
				rt_rq->rt_runtime == rt_b->rt_runtime)
			goto balanced;
		raw_spin_unlock(&rt_rq->rt_runtime_lock);

		/*
		 * Calculate the difference between what we started out with
		 * and what we currently have, that's the amount of runtime
		 * we lent and now have to reclaim.
		 */
		want = rt_b->rt_runtime - rt_rq->rt_runtime;

		/*
		 * Greedy reclaim, take back as much as we can.
		 */
		for_each_cpu(i, rd->span) {
			struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
			s64 diff;

			/*
			 * Can't reclaim from ourselves or disabled runqueues.
			 */
			if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
				continue;

			raw_spin_lock(&iter->rt_runtime_lock);
			if (want > 0) {
				diff = min_t(s64, iter->rt_runtime, want);
				iter->rt_runtime -= diff;
				want -= diff;
			} else {
				iter->rt_runtime -= want;
				want -= want;
			}
			raw_spin_unlock(&iter->rt_runtime_lock);

			if (!want)
				break;
		}

		raw_spin_lock(&rt_rq->rt_runtime_lock);
		/*
		 * We cannot be left wanting - that would mean some runtime
		 * leaked out of the system.
		 */
		BUG_ON(want);
balanced:
		/*
		 * Disable all the borrow logic by pretending we have inf
		 * runtime - in which case borrowing doesn't make sense.
		 */
		rt_rq->rt_runtime = RUNTIME_INF;
		rt_rq->rt_throttled = 0;
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
		raw_spin_unlock(&rt_b->rt_runtime_lock);

		/* Make rt_rq available for pick_next_task() */
		sched_rt_rq_enqueue(rt_rq);
	}
}

static void __enable_runtime(struct rq *rq)
{
	rt_rq_iter_t iter;
	struct rt_rq *rt_rq;

	if (unlikely(!scheduler_running))
		return;

	/*
	 * Reset each runqueue's bandwidth settings
	 */
	for_each_rt_rq(rt_rq, iter, rq) {
		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);

		raw_spin_lock(&rt_b->rt_runtime_lock);
		raw_spin_lock(&rt_rq->rt_runtime_lock);
		rt_rq->rt_runtime = rt_b->rt_runtime;
		rt_rq->rt_time = 0;
		rt_rq->rt_throttled = 0;
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
		raw_spin_unlock(&rt_b->rt_runtime_lock);
	}
}
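/*
 * Illustrative note (not from the original source): these two helpers
 * are the CPU-hotplug half of the borrowing scheme. rq_offline_rt()
 * below calls __disable_runtime() so a CPU going down first reclaims
 * every nanosecond it lent out, and rq_online_rt() calls
 * __enable_runtime() to restore the pristine per-rq settings when the
 * CPU comes back.
 */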
static int balance_runtime(struct rt_rq *rt_rq)
{
	int more = 0;

	if (!sched_feat(RT_RUNTIME_SHARE))
		return more;

	if (rt_rq->rt_time > rt_rq->rt_runtime) {
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
		more = do_balance_runtime(rt_rq);
		raw_spin_lock(&rt_rq->rt_runtime_lock);
	}

	return more;
}
#else /* !CONFIG_SMP */
static inline int balance_runtime(struct rt_rq *rt_rq)
{
	return 0;
}
#endif /* CONFIG_SMP */

static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
{
	int i, idle = 1, throttled = 0;
	const struct cpumask *span;

	span = sched_rt_period_mask();
#ifdef CONFIG_RT_GROUP_SCHED
	/*
	 * FIXME: isolated CPUs should really leave the root task group,
	 * whether they are isolcpus or were isolated via cpusets, lest
	 * the timer run on a CPU which does not service all runqueues,
	 * potentially leaving other CPUs indefinitely throttled. If
	 * isolation is really required, the user will turn the throttle
	 * off to kill the perturbations it causes anyway. Meanwhile,
	 * this maintains functionality for boot and/or troubleshooting.
	 */
	if (rt_b == &root_task_group.rt_bandwidth)
		span = cpu_online_mask;
#endif
	for_each_cpu(i, span) {
		int enqueue = 0;
		struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
		struct rq *rq = rq_of_rt_rq(rt_rq);

		raw_spin_lock(&rq->lock);
		if (rt_rq->rt_time) {
			u64 runtime;

			raw_spin_lock(&rt_rq->rt_runtime_lock);
			if (rt_rq->rt_throttled)
				balance_runtime(rt_rq);
			runtime = rt_rq->rt_runtime;
			rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
			if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
				rt_rq->rt_throttled = 0;
				enqueue = 1;

				/*
				 * When we're idle and a woken (rt) task is
				 * throttled check_preempt_curr() will set
				 * skip_update and the time between the wakeup
				 * and this unthrottle will get accounted as
				 * 'runtime'.
				 */
				if (rt_rq->rt_nr_running && rq->curr == rq->idle)
					rq_clock_skip_update(rq, false);
			}
			if (rt_rq->rt_time || rt_rq->rt_nr_running)
				idle = 0;
			raw_spin_unlock(&rt_rq->rt_runtime_lock);
		} else if (rt_rq->rt_nr_running) {
			idle = 0;
			if (!rt_rq_throttled(rt_rq))
				enqueue = 1;
		}
		if (rt_rq->rt_throttled)
			throttled = 1;

		if (enqueue)
			sched_rt_rq_enqueue(rt_rq);
		raw_spin_unlock(&rq->lock);
	}

	if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
		return 1;

	return idle;
}
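/*
 * Illustrative worked example (not from the original source): with
 * rt_runtime = 950 ms and rt_period = 1 s, suppose a throttled rt_rq
 * has accumulated rt_time = 970 ms when the period timer fires once
 * (overrun = 1). Its debt is reduced by min(970, 1 * 950) = 950 ms,
 * leaving rt_time = 20 ms. Since 20 ms < 950 ms the queue is
 * unthrottled and re-enqueued; had several periods been missed,
 * overrun > 1 would forgive proportionally more debt.
 */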
static inline int rt_se_prio(struct sched_rt_entity *rt_se)
{
#ifdef CONFIG_RT_GROUP_SCHED
	struct rt_rq *rt_rq = group_rt_rq(rt_se);

	if (rt_rq)
		return rt_rq->highest_prio.curr;
#endif

	return rt_task_of(rt_se)->prio;
}

static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
{
	u64 runtime = sched_rt_runtime(rt_rq);

	if (rt_rq->rt_throttled)
		return rt_rq_throttled(rt_rq);

	if (runtime >= sched_rt_period(rt_rq))
		return 0;

	balance_runtime(rt_rq);
	runtime = sched_rt_runtime(rt_rq);
	if (runtime == RUNTIME_INF)
		return 0;

	if (rt_rq->rt_time > runtime) {
		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);

		/*
		 * Don't actually throttle groups that have no runtime assigned
		 * but accrue some time due to boosting.
		 */
		if (likely(rt_b->rt_runtime)) {
			rt_rq->rt_throttled = 1;
			printk_deferred_once("sched: RT throttling activated\n");
		} else {
			/*
			 * In case we did anyway, make it go away,
			 * replenishment is a joke, since it will replenish us
			 * with exactly 0 ns.
			 */
			rt_rq->rt_time = 0;
		}

		if (rt_rq_throttled(rt_rq)) {
			sched_rt_rq_dequeue(rt_rq);
			return 1;
		}
	}

	return 0;
}

/*
 * Update the current task's runtime statistics. Skip current tasks that
 * are not in our scheduling class.
 */
static void update_curr_rt(struct rq *rq)
{
	struct task_struct *curr = rq->curr;
	struct sched_rt_entity *rt_se = &curr->rt;
	u64 delta_exec;

	if (curr->sched_class != &rt_sched_class)
		return;

	delta_exec = rq_clock_task(rq) - curr->se.exec_start;
	if (unlikely((s64)delta_exec <= 0))
		return;

	schedstat_set(curr->se.statistics.exec_max,
		      max(curr->se.statistics.exec_max, delta_exec));

	curr->se.sum_exec_runtime += delta_exec;
	account_group_exec_runtime(curr, delta_exec);

	curr->se.exec_start = rq_clock_task(rq);
	cpuacct_charge(curr, delta_exec);

	sched_rt_avg_update(rq, delta_exec);

	if (!rt_bandwidth_enabled())
		return;

	for_each_sched_rt_entity(rt_se) {
		struct rt_rq *rt_rq = rt_rq_of_se(rt_se);

		if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
			raw_spin_lock(&rt_rq->rt_runtime_lock);
			rt_rq->rt_time += delta_exec;
			if (sched_rt_runtime_exceeded(rt_rq))
				resched_curr(rq);
			raw_spin_unlock(&rt_rq->rt_runtime_lock);
		}
	}
}
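/*
 * Illustrative note (not from the original source): the
 * for_each_sched_rt_entity() walk above charges delta_exec at every
 * level of the group hierarchy, so a task in an RT cgroup consumes
 * runtime from both its group's rt_rq and the root rt_rq; any level
 * exceeding its own budget triggers a reschedule.
 */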
static void
dequeue_top_rt_rq(struct rt_rq *rt_rq)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

	BUG_ON(&rq->rt != rt_rq);

	if (!rt_rq->rt_queued)
		return;

	BUG_ON(!rq->nr_running);

	sub_nr_running(rq, rt_rq->rt_nr_running);
	rt_rq->rt_queued = 0;
}

static void
enqueue_top_rt_rq(struct rt_rq *rt_rq)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

	BUG_ON(&rq->rt != rt_rq);

	if (rt_rq->rt_queued)
		return;
	if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running)
		return;

	add_nr_running(rq, rt_rq->rt_nr_running);
	rt_rq->rt_queued = 1;
}

#if defined CONFIG_SMP

static void
inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

#ifdef CONFIG_RT_GROUP_SCHED
	/*
	 * Change rq's cpupri only if rt_rq is the top queue.
	 */
	if (&rq->rt != rt_rq)
		return;
#endif
	if (rq->online && prio < prev_prio)
		cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
}

static void
dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

#ifdef CONFIG_RT_GROUP_SCHED
	/*
	 * Change rq's cpupri only if rt_rq is the top queue.
	 */
	if (&rq->rt != rt_rq)
		return;
#endif
	if (rq->online && rt_rq->highest_prio.curr != prev_prio)
		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
}

#else /* CONFIG_SMP */

static inline
void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
static inline
void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}

#endif /* CONFIG_SMP */

#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
static void
inc_rt_prio(struct rt_rq *rt_rq, int prio)
{
	int prev_prio = rt_rq->highest_prio.curr;

	if (prio < prev_prio)
		rt_rq->highest_prio.curr = prio;

	inc_rt_prio_smp(rt_rq, prio, prev_prio);
}

static void
dec_rt_prio(struct rt_rq *rt_rq, int prio)
{
	int prev_prio = rt_rq->highest_prio.curr;

	if (rt_rq->rt_nr_running) {

		WARN_ON(prio < prev_prio);

		/*
		 * This may have been our highest task, and therefore
		 * we may have some recomputation to do
		 */
		if (prio == prev_prio) {
			struct rt_prio_array *array = &rt_rq->active;

			rt_rq->highest_prio.curr =
				sched_find_first_bit(array->bitmap);
		}

	} else
		rt_rq->highest_prio.curr = MAX_RT_PRIO;

	dec_rt_prio_smp(rt_rq, prio, prev_prio);
}

#else

static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}

#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
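/*
 * Illustrative note (not from the original source): recomputing
 * highest_prio.curr above is O(1) rather than a list walk because the
 * active array keeps a bitmap with one bit per priority queue.
 * sched_find_first_bit() returns the lowest set bit, i.e. the best
 * priority that still has runnable entities, and the delimiter bit set
 * at MAX_RT_PRIO by init_rt_rq() guarantees the search terminates even
 * when all queues are empty.
 */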
#ifdef CONFIG_RT_GROUP_SCHED

static void
inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	if (rt_se_boosted(rt_se))
		rt_rq->rt_nr_boosted++;

	if (rt_rq->tg)
		start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
}

static void
dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	if (rt_se_boosted(rt_se))
		rt_rq->rt_nr_boosted--;

	WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
}

#else /* CONFIG_RT_GROUP_SCHED */

static void
inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	start_rt_bandwidth(&def_rt_bandwidth);
}

static inline
void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}

#endif /* CONFIG_RT_GROUP_SCHED */

static inline
unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
{
	struct rt_rq *group_rq = group_rt_rq(rt_se);

	if (group_rq)
		return group_rq->rt_nr_running;
	else
		return 1;
}

static inline
void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	int prio = rt_se_prio(rt_se);

	WARN_ON(!rt_prio(prio));
	rt_rq->rt_nr_running += rt_se_nr_running(rt_se);

	inc_rt_prio(rt_rq, prio);
	inc_rt_migration(rt_se, rt_rq);
	inc_rt_group(rt_se, rt_rq);
}

static inline
void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
	WARN_ON(!rt_rq->rt_nr_running);
	rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);

	dec_rt_prio(rt_rq, rt_se_prio(rt_se));
	dec_rt_migration(rt_se, rt_rq);
	dec_rt_group(rt_se, rt_rq);
}

static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
{
	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
	struct rt_prio_array *array = &rt_rq->active;
	struct rt_rq *group_rq = group_rt_rq(rt_se);
	struct list_head *queue = array->queue + rt_se_prio(rt_se);

	/*
	 * Don't enqueue the group if it's throttled, or when empty.
	 * The latter is a consequence of the former when a child group
	 * gets throttled and the current group doesn't have any other
	 * active members.
	 */
	if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
		return;

	if (head)
		list_add(&rt_se->run_list, queue);
	else
		list_add_tail(&rt_se->run_list, queue);
	__set_bit(rt_se_prio(rt_se), array->bitmap);

	inc_rt_tasks(rt_se, rt_rq);
}

static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
{
	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
	struct rt_prio_array *array = &rt_rq->active;

	list_del_init(&rt_se->run_list);
	if (list_empty(array->queue + rt_se_prio(rt_se)))
		__clear_bit(rt_se_prio(rt_se), array->bitmap);

	dec_rt_tasks(rt_se, rt_rq);
}
/*
 * Because the prio of an upper entry depends on the lower
 * entries, we must remove entries top-down.
 */
static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
{
	struct sched_rt_entity *back = NULL;

	for_each_sched_rt_entity(rt_se) {
		rt_se->back = back;
		back = rt_se;
	}

	dequeue_top_rt_rq(rt_rq_of_se(back));

	for (rt_se = back; rt_se; rt_se = rt_se->back) {
		if (on_rt_rq(rt_se))
			__dequeue_rt_entity(rt_se);
	}
}

static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
{
	struct rq *rq = rq_of_rt_se(rt_se);

	dequeue_rt_stack(rt_se);
	for_each_sched_rt_entity(rt_se)
		__enqueue_rt_entity(rt_se, head);
	enqueue_top_rt_rq(&rq->rt);
}

static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
{
	struct rq *rq = rq_of_rt_se(rt_se);

	dequeue_rt_stack(rt_se);

	for_each_sched_rt_entity(rt_se) {
		struct rt_rq *rt_rq = group_rt_rq(rt_se);

		if (rt_rq && rt_rq->rt_nr_running)
			__enqueue_rt_entity(rt_se, false);
	}
	enqueue_top_rt_rq(&rq->rt);
}

/*
 * Adding/removing a task to/from a priority array:
 */
static void
enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
{
	struct sched_rt_entity *rt_se = &p->rt;

	if (flags & ENQUEUE_WAKEUP)
		rt_se->timeout = 0;

	enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);

	if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
		enqueue_pushable_task(rq, p);
}

static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
{
	struct sched_rt_entity *rt_se = &p->rt;

	update_curr_rt(rq);
	dequeue_rt_entity(rt_se);

	dequeue_pushable_task(rq, p);
}
/*
 * Put task to the head or the end of the run list without the overhead of
 * dequeue followed by enqueue.
 */
static void
requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
{
	if (on_rt_rq(rt_se)) {
		struct rt_prio_array *array = &rt_rq->active;
		struct list_head *queue = array->queue + rt_se_prio(rt_se);

		if (head)
			list_move(&rt_se->run_list, queue);
		else
			list_move_tail(&rt_se->run_list, queue);
	}
}

static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
{
	struct sched_rt_entity *rt_se = &p->rt;
	struct rt_rq *rt_rq;

	for_each_sched_rt_entity(rt_se) {
		rt_rq = rt_rq_of_se(rt_se);
		requeue_rt_entity(rt_rq, rt_se, head);
	}
}

static void yield_task_rt(struct rq *rq)
{
	requeue_task_rt(rq, rq->curr, 0);
}

#ifdef CONFIG_SMP
static int find_lowest_rq(struct task_struct *task);

static int
select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
{
	struct task_struct *curr;
	struct rq *rq;

	/* For anything but wake ups, just return the task_cpu */
	if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
		goto out;

	rq = cpu_rq(cpu);

	rcu_read_lock();
	curr = READ_ONCE(rq->curr); /* unlocked access */

	/*
	 * If the current task on @p's runqueue is an RT task, then
	 * try to see if we can wake this RT task up on another
	 * runqueue. Otherwise simply start this RT task
	 * on its current runqueue.
	 *
	 * We want to avoid overloading runqueues. If the woken
	 * task is a higher priority, then it will stay on this CPU
	 * and the lower prio task should be moved to another CPU.
	 * Even though this will probably make the lower prio task
	 * lose its cache, we do not want to bounce a higher task
	 * around just because it gave up its CPU, perhaps for a
	 * lock?
	 *
	 * For equal prio tasks, we just let the scheduler sort it out.
	 *
	 * Otherwise, just let it ride on the affined RQ and the
	 * post-schedule router will push the preempted task away
	 *
	 * This test is optimistic, if we get it wrong the load-balancer
	 * will have to sort it out.
	 */
	if (curr && unlikely(rt_task(curr)) &&
	    (curr->nr_cpus_allowed < 2 ||
	     curr->prio <= p->prio)) {
		int target = find_lowest_rq(p);

		/*
		 * Don't bother moving it if the destination CPU is
		 * not running a lower priority task.
		 */
		if (target != -1 &&
		    p->prio < cpu_rq(target)->rt.highest_prio.curr)
			cpu = target;
	}
	rcu_read_unlock();

out:
	return cpu;
}
static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
{
	/*
	 * Current can't be migrated, useless to reschedule,
	 * let's hope p can move out.
	 */
	if (rq->curr->nr_cpus_allowed == 1 ||
	    !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
		return;

	/*
	 * p is migratable, so let's not schedule it and
	 * see if it is pushed or pulled somewhere else.
	 */
	if (p->nr_cpus_allowed != 1
	    && cpupri_find(&rq->rd->cpupri, p, NULL))
		return;

	/*
	 * There appear to be other CPUs that can accept
	 * current and none to run 'p', so let's reschedule
	 * to try and push current away:
	 */
	requeue_task_rt(rq, p, 1);
	resched_curr(rq);
}

#endif /* CONFIG_SMP */

/*
 * Preempt the current task with a newly woken task if needed:
 */
static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
{
	if (p->prio < rq->curr->prio) {
		resched_curr(rq);
		return;
	}

#ifdef CONFIG_SMP
	/*
	 * If:
	 *
	 * - the newly woken task is of equal priority to the current task
	 * - the newly woken task is non-migratable while current is migratable
	 * - current will be preempted on the next reschedule
	 *
	 * we should check to see if current can readily move to a different
	 * cpu. If so, we will reschedule to allow the push logic to try
	 * to move current somewhere else, making room for our non-migratable
	 * task.
	 */
	if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
		check_preempt_equal_prio(rq, p);
#endif
}

static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
						   struct rt_rq *rt_rq)
{
	struct rt_prio_array *array = &rt_rq->active;
	struct sched_rt_entity *next = NULL;
	struct list_head *queue;
	int idx;

	idx = sched_find_first_bit(array->bitmap);
	BUG_ON(idx >= MAX_RT_PRIO);

	queue = array->queue + idx;
	next = list_entry(queue->next, struct sched_rt_entity, run_list);

	return next;
}

static struct task_struct *_pick_next_task_rt(struct rq *rq)
{
	struct sched_rt_entity *rt_se;
	struct task_struct *p;
	struct rt_rq *rt_rq = &rq->rt;

	do {
		rt_se = pick_next_rt_entity(rq, rt_rq);
		BUG_ON(!rt_se);
		rt_rq = group_rt_rq(rt_se);
	} while (rt_rq);

	p = rt_task_of(rt_se);
	p->se.exec_start = rq_clock_task(rq);

	return p;
}
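/*
 * Illustrative note (not from the original source): the do/while loop
 * above descends the group hierarchy. At each level it picks the
 * best-priority entity; if that entity is itself a group (my_q is not
 * NULL) the search repeats inside that group's rt_rq, until a task
 * entity is found.
 */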
static struct task_struct *
pick_next_task_rt(struct rq *rq, struct task_struct *prev)
{
	struct task_struct *p;
	struct rt_rq *rt_rq = &rq->rt;

	if (need_pull_rt_task(rq, prev)) {
		pull_rt_task(rq);
		/*
		 * pull_rt_task() can drop (and re-acquire) rq->lock; this
		 * means a dl or stop task can slip in, in which case we need
		 * to re-start task selection.
		 */
		if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) ||
			     rq->dl.dl_nr_running))
			return RETRY_TASK;
	}

	/*
	 * We may dequeue prev's rt_rq in put_prev_task().
	 * So, we update time before rt_nr_running check.
	 */
	if (prev->sched_class == &rt_sched_class)
		update_curr_rt(rq);

	if (!rt_rq->rt_queued)
		return NULL;

	put_prev_task(rq, prev);

	p = _pick_next_task_rt(rq);

	/* The running task is never eligible for pushing */
	dequeue_pushable_task(rq, p);

	set_post_schedule(rq);

	return p;
}

static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
{
	update_curr_rt(rq);

	/*
	 * The previous task needs to be made eligible for pushing
	 * if it is still active
	 */
	if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
		enqueue_pushable_task(rq, p);
}

#ifdef CONFIG_SMP

/* Only try algorithms three times */
#define RT_MAX_TRIES 3

static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
{
	if (!task_running(rq, p) &&
	    cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
		return 1;
	return 0;
}

/*
 * Return the highest pushable rq's task, which is suitable to be executed
 * on the cpu, NULL otherwise
 */
static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
{
	struct plist_head *head = &rq->rt.pushable_tasks;
	struct task_struct *p;

	if (!has_pushable_tasks(rq))
		return NULL;

	plist_for_each_entry(p, head, pushable_tasks) {
		if (pick_rt_task(rq, p, cpu))
			return p;
	}

	return NULL;
}

static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);

static int find_lowest_rq(struct task_struct *task)
{
	struct sched_domain *sd;
	struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
	int this_cpu = smp_processor_id();
	int cpu = task_cpu(task);

	/* Make sure the mask is initialized first */
	if (unlikely(!lowest_mask))
		return -1;

	if (task->nr_cpus_allowed == 1)
		return -1; /* No other targets possible */

	if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
		return -1; /* No targets found */

	/*
	 * At this point we have built a mask of cpus representing the
	 * lowest priority tasks in the system. Now we want to elect
	 * the best one based on our affinity and topology.
	 *
	 * We prioritize the last cpu that the task executed on since
	 * it is most likely cache-hot in that location.
	 */
	if (cpumask_test_cpu(cpu, lowest_mask))
		return cpu;

	/*
	 * Otherwise, we consult the sched_domains span maps to figure
	 * out which cpu is logically closest to our hot cache data.
	 */
	if (!cpumask_test_cpu(this_cpu, lowest_mask))
		this_cpu = -1; /* Skip this_cpu opt if not among lowest */

	rcu_read_lock();
	for_each_domain(cpu, sd) {
		if (sd->flags & SD_WAKE_AFFINE) {
			int best_cpu;

			/*
			 * "this_cpu" is cheaper to preempt than a
			 * remote processor.
			 */
			if (this_cpu != -1 &&
			    cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
				rcu_read_unlock();
				return this_cpu;
			}

			best_cpu = cpumask_first_and(lowest_mask,
						     sched_domain_span(sd));
			if (best_cpu < nr_cpu_ids) {
				rcu_read_unlock();
				return best_cpu;
			}
		}
	}
	rcu_read_unlock();

	/*
	 * And finally, if there were no matches within the domains
	 * just give the caller *something* to work with from the compatible
	 * locations.
	 */
	if (this_cpu != -1)
		return this_cpu;

	cpu = cpumask_any(lowest_mask);
	if (cpu < nr_cpu_ids)
		return cpu;
	return -1;
}
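/*
 * Illustrative note (not from the original source): cpupri_find() fills
 * lowest_mask with the CPUs whose running task has the lowest priority
 * class system-wide, so find_lowest_rq() only has to break ties. The
 * preference order implemented above is: the task's last CPU (cache
 * hot), then this_cpu if it shares a SD_WAKE_AFFINE domain, then the
 * first mask CPU in progressively larger domains, then anything in the
 * mask, and finally -1 for "nowhere better".
 */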
/* Will lock the rq it finds */
static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
{
	struct rq *lowest_rq = NULL;
	int tries;
	int cpu;

	for (tries = 0; tries < RT_MAX_TRIES; tries++) {
		cpu = find_lowest_rq(task);

		if ((cpu == -1) || (cpu == rq->cpu))
			break;

		lowest_rq = cpu_rq(cpu);

		if (lowest_rq->rt.highest_prio.curr <= task->prio) {
			/*
			 * Target rq has tasks of equal or higher priority,
			 * retrying does not release any lock and is unlikely
			 * to yield a different result.
			 */
			lowest_rq = NULL;
			break;
		}

		/* if the prio of this runqueue changed, try again */
		if (double_lock_balance(rq, lowest_rq)) {
			/*
			 * We had to unlock the run queue. In
			 * the mean time, task could have
			 * migrated already or had its affinity changed.
			 * Also make sure that it wasn't scheduled on its rq.
			 */
			if (unlikely(task_rq(task) != rq ||
				     !cpumask_test_cpu(lowest_rq->cpu,
						       tsk_cpus_allowed(task)) ||
				     task_running(rq, task) ||
				     !task_on_rq_queued(task))) {

				double_unlock_balance(rq, lowest_rq);
				lowest_rq = NULL;
				break;
			}
		}

		/* If this rq is still suitable use it. */
		if (lowest_rq->rt.highest_prio.curr > task->prio)
			break;

		/* try again */
		double_unlock_balance(rq, lowest_rq);
		lowest_rq = NULL;
	}

	return lowest_rq;
}

static struct task_struct *pick_next_pushable_task(struct rq *rq)
{
	struct task_struct *p;

	if (!has_pushable_tasks(rq))
		return NULL;

	p = plist_first_entry(&rq->rt.pushable_tasks,
			      struct task_struct, pushable_tasks);

	BUG_ON(rq->cpu != task_cpu(p));
	BUG_ON(task_current(rq, p));
	BUG_ON(p->nr_cpus_allowed <= 1);

	BUG_ON(!task_on_rq_queued(p));
	BUG_ON(!rt_task(p));

	return p;
}

/*
 * If the current CPU has more than one RT task, see if the non
 * running task can migrate over to a CPU that is running a task
 * of lesser priority.
 */
static int push_rt_task(struct rq *rq)
{
	struct task_struct *next_task;
	struct rq *lowest_rq;
	int ret = 0;

	if (!rq->rt.overloaded)
		return 0;

	next_task = pick_next_pushable_task(rq);
	if (!next_task)
		return 0;

retry:
	if (unlikely(next_task == rq->curr)) {
		WARN_ON(1);
		return 0;
	}

	/*
	 * It's possible that the next_task slipped in of
	 * higher priority than current. If that's the case
	 * just reschedule current.
	 */
	if (unlikely(next_task->prio < rq->curr->prio)) {
		resched_curr(rq);
		return 0;
	}

	/* We might release rq lock */
	get_task_struct(next_task);

	/* find_lock_lowest_rq locks the rq if found */
	lowest_rq = find_lock_lowest_rq(next_task, rq);
	if (!lowest_rq) {
		struct task_struct *task;
		/*
		 * find_lock_lowest_rq releases rq->lock
		 * so it is possible that next_task has migrated.
		 *
		 * We need to make sure that the task is still on the same
		 * run-queue and is also still the next task eligible for
		 * pushing.
		 */
		task = pick_next_pushable_task(rq);
		if (task_cpu(next_task) == rq->cpu && task == next_task) {
			/*
			 * The task hasn't migrated, and is still the next
			 * eligible task, but we failed to find a run-queue
			 * to push it to. Do not retry in this case, since
			 * other cpus will pull from us when ready.
			 */
			goto out;
		}

		if (!task)
			/* No more tasks, just exit */
			goto out;

		/*
		 * Something has shifted, try again.
		 */
		put_task_struct(next_task);
		next_task = task;
		goto retry;
	}

	deactivate_task(rq, next_task, 0);
	set_task_cpu(next_task, lowest_rq->cpu);
	activate_task(lowest_rq, next_task, 0);
	ret = 1;

	resched_curr(lowest_rq);

	double_unlock_balance(rq, lowest_rq);

out:
	put_task_struct(next_task);

	return ret;
}
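/*
 * Illustrative note (not from the original source): double_lock_balance()
 * may have to drop rq->lock to acquire both rq locks in a fixed order,
 * which opens a window where the world changes. That is why both
 * find_lock_lowest_rq() and push_rt_task() revalidate everything they
 * assumed (task still on this rq, still pushable, affinity unchanged)
 * once the locks are finally held, and retry or bail out otherwise.
 */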
static void push_rt_tasks(struct rq *rq)
{
	/* push_rt_task will return true if it moved an RT */
	while (push_rt_task(rq))
		;
}

#ifdef HAVE_RT_PUSH_IPI
/*
 * The search for the next cpu always starts at rq->cpu and ends
 * when we reach rq->cpu again. It will never return rq->cpu.
 * This returns the next cpu to check, or nr_cpu_ids if the loop
 * is complete.
 *
 * rq->rt.push_cpu holds the last cpu returned by this function,
 * or if this is the first instance, it must hold rq->cpu.
 */
static int rto_next_cpu(struct rq *rq)
{
	int prev_cpu = rq->rt.push_cpu;
	int cpu;

	cpu = cpumask_next(prev_cpu, rq->rd->rto_mask);

	/*
	 * If the previous cpu is less than the rq's CPU, then it already
	 * passed the end of the mask, and has started from the beginning.
	 * We end if the next CPU is greater or equal to rq's CPU.
	 */
	if (prev_cpu < rq->cpu) {
		if (cpu >= rq->cpu)
			return nr_cpu_ids;

	} else if (cpu >= nr_cpu_ids) {
		/*
		 * We passed the end of the mask, start at the beginning.
		 * If the result is greater or equal to the rq's CPU, then
		 * the loop is finished.
		 */
		cpu = cpumask_first(rq->rd->rto_mask);
		if (cpu >= rq->cpu)
			return nr_cpu_ids;
	}
	rq->rt.push_cpu = cpu;

	/* Return cpu to let the caller know if the loop is finished or not */
	return cpu;
}

static int find_next_push_cpu(struct rq *rq)
{
	struct rq *next_rq;
	int cpu;

	while (1) {
		cpu = rto_next_cpu(rq);
		if (cpu >= nr_cpu_ids)
			break;
		next_rq = cpu_rq(cpu);

		/* Make sure the next rq can push to this rq */
		if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr)
			break;
	}

	return cpu;
}

#define RT_PUSH_IPI_EXECUTING	1
#define RT_PUSH_IPI_RESTART	2

static void tell_cpu_to_push(struct rq *rq)
{
	int cpu;

	if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
		raw_spin_lock(&rq->rt.push_lock);
		/* Make sure it's still executing */
		if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
			/*
			 * Tell the IPI to restart the loop as things have
			 * changed since it started.
			 */
			rq->rt.push_flags |= RT_PUSH_IPI_RESTART;
			raw_spin_unlock(&rq->rt.push_lock);
			return;
		}
		raw_spin_unlock(&rq->rt.push_lock);
	}

	/* When here, there's no IPI going around */

	rq->rt.push_cpu = rq->cpu;
	cpu = find_next_push_cpu(rq);
	if (cpu >= nr_cpu_ids)
		return;

	rq->rt.push_flags = RT_PUSH_IPI_EXECUTING;

	irq_work_queue_on(&rq->rt.push_work, cpu);
}
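/*
 * Illustrative worked example (not from the original source): say
 * rq->cpu is 4 and rto_mask contains CPUs {1, 4, 6}. The scan starts
 * at push_cpu = 4, first yields 6, then wraps around to 1, and finally
 * stops when the wrapped cursor reaches 4 again, so every overloaded
 * CPU other than ourselves is visited exactly once. The irq_work then
 * hops from CPU to CPU instead of one CPU grabbing every remote
 * rq->lock, which is the contention this IPI scheme exists to avoid.
 */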
/* Called from hardirq context */
static void try_to_push_tasks(void *arg)
{
	struct rt_rq *rt_rq = arg;
	struct rq *rq, *src_rq;
	int this_cpu;
	int cpu;

	this_cpu = rt_rq->push_cpu;

	/* Paranoid check */
	BUG_ON(this_cpu != smp_processor_id());

	rq = cpu_rq(this_cpu);
	src_rq = rq_of_rt_rq(rt_rq);

again:
	if (has_pushable_tasks(rq)) {
		raw_spin_lock(&rq->lock);
		push_rt_task(rq);
		raw_spin_unlock(&rq->lock);
	}

	/* Pass the IPI to the next rt overloaded queue */
	raw_spin_lock(&rt_rq->push_lock);
	/*
	 * If the source queue changed since the IPI went out,
	 * we need to restart the search from that CPU again.
	 */
	if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) {
		rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART;
		rt_rq->push_cpu = src_rq->cpu;
	}

	cpu = find_next_push_cpu(src_rq);

	if (cpu >= nr_cpu_ids)
		rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING;
	raw_spin_unlock(&rt_rq->push_lock);

	if (cpu >= nr_cpu_ids)
		return;

	/*
	 * It is possible that a restart caused this CPU to be
	 * chosen again. Don't bother with an IPI, just see if we
	 * have more to push.
	 */
	if (unlikely(cpu == rq->cpu))
		goto again;

	/* Try the next RT overloaded CPU */
	irq_work_queue_on(&rt_rq->push_work, cpu);
}

static void push_irq_work_func(struct irq_work *work)
{
	struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work);

	try_to_push_tasks(rt_rq);
}
#endif /* HAVE_RT_PUSH_IPI */

static int pull_rt_task(struct rq *this_rq)
{
	int this_cpu = this_rq->cpu, ret = 0, cpu;
	struct task_struct *p;
	struct rq *src_rq;

	if (likely(!rt_overloaded(this_rq)))
		return 0;

	/*
	 * Match the barrier from rt_set_overload(); this guarantees that if we
	 * see overloaded we must also see the rto_mask bit.
	 */
	smp_rmb();

#ifdef HAVE_RT_PUSH_IPI
	if (sched_feat(RT_PUSH_IPI)) {
		tell_cpu_to_push(this_rq);
		return 0;
	}
#endif

	for_each_cpu(cpu, this_rq->rd->rto_mask) {
		if (this_cpu == cpu)
			continue;

		src_rq = cpu_rq(cpu);

		/*
		 * Don't bother taking the src_rq->lock if the next highest
		 * task is known to be lower-priority than our current task.
		 * This may look racy, but if this value is about to go
		 * logically higher, the src_rq will push this task away.
		 * And if it's going logically lower, we do not care
		 */
		if (src_rq->rt.highest_prio.next >=
		    this_rq->rt.highest_prio.curr)
			continue;

		/*
		 * We can potentially drop this_rq's lock in
		 * double_lock_balance, and another CPU could
		 * alter this_rq
		 */
		double_lock_balance(this_rq, src_rq);

		/*
		 * We can pull only a task, which is pushable
		 * on its rq, and no others.
		 */
		p = pick_highest_pushable_task(src_rq, this_cpu);

		/*
		 * Do we have an RT task that preempts
		 * the to-be-scheduled task?
		 */
		if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
			WARN_ON(p == src_rq->curr);
			WARN_ON(!task_on_rq_queued(p));

			/*
			 * There's a chance that p is higher in priority
			 * than what's currently running on its cpu.
			 * This is just that p is waking up and hasn't
			 * had a chance to schedule. We only pull
			 * p if it is lower in priority than the
			 * current task on the run queue
			 */
			if (p->prio < src_rq->curr->prio)
				goto skip;

			ret = 1;

			deactivate_task(src_rq, p, 0);
			set_task_cpu(p, this_cpu);
			activate_task(this_rq, p, 0);
			/*
			 * We continue with the search, just in
			 * case there's an even higher prio task
			 * in another runqueue. (low likelihood
			 * but possible)
			 */
		}
skip:
		double_unlock_balance(this_rq, src_rq);
	}

	return ret;
}
static void post_schedule_rt(struct rq *rq)
{
	push_rt_tasks(rq);
}

/*
 * If we are not running and we are not going to reschedule soon, we should
 * try to push tasks away now
 */
static void task_woken_rt(struct rq *rq, struct task_struct *p)
{
	if (!task_running(rq, p) &&
	    !test_tsk_need_resched(rq->curr) &&
	    has_pushable_tasks(rq) &&
	    p->nr_cpus_allowed > 1 &&
	    (dl_task(rq->curr) || rt_task(rq->curr)) &&
	    (rq->curr->nr_cpus_allowed < 2 ||
	     rq->curr->prio <= p->prio))
		push_rt_tasks(rq);
}

static void set_cpus_allowed_rt(struct task_struct *p,
				const struct cpumask *new_mask)
{
	struct rq *rq;
	int weight;

	BUG_ON(!rt_task(p));

	if (!task_on_rq_queued(p))
		return;

	weight = cpumask_weight(new_mask);

	/*
	 * Only update if the process changes its state from whether it
	 * can migrate or not.
	 */
	if ((p->nr_cpus_allowed > 1) == (weight > 1))
		return;

	rq = task_rq(p);

	/*
	 * The process used to be able to migrate OR it can now migrate
	 */
	if (weight <= 1) {
		if (!task_current(rq, p))
			dequeue_pushable_task(rq, p);
		BUG_ON(!rq->rt.rt_nr_migratory);
		rq->rt.rt_nr_migratory--;
	} else {
		if (!task_current(rq, p))
			enqueue_pushable_task(rq, p);
		rq->rt.rt_nr_migratory++;
	}

	update_rt_migration(&rq->rt);
}

/* Assumes rq->lock is held */
static void rq_online_rt(struct rq *rq)
{
	if (rq->rt.overloaded)
		rt_set_overload(rq);

	__enable_runtime(rq);

	cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
}

/* Assumes rq->lock is held */
static void rq_offline_rt(struct rq *rq)
{
	if (rq->rt.overloaded)
		rt_clear_overload(rq);

	__disable_runtime(rq);

	cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
}

/*
 * When switching from the rt queue, we bring ourselves to a position
 * that we might want to pull RT tasks from other runqueues.
 */
static void switched_from_rt(struct rq *rq, struct task_struct *p)
{
	/*
	 * If there are other RT tasks then we will reschedule
	 * and the scheduling of the other RT tasks will handle
	 * the balancing. But if we are the last RT task
	 * we may need to handle the pulling of RT tasks
	 * now.
	 */
	if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
		return;

	if (pull_rt_task(rq))
		resched_curr(rq);
}
void __init init_sched_rt_class(void)
{
	unsigned int i;

	for_each_possible_cpu(i) {
		zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
					GFP_KERNEL, cpu_to_node(i));
	}
}
#endif /* CONFIG_SMP */

/*
 * When switching a task to RT, we may overload the runqueue
 * with RT tasks. In this case we try to push them off to
 * other runqueues.
 */
static void switched_to_rt(struct rq *rq, struct task_struct *p)
{
	int check_resched = 1;

	/*
	 * If we are already running, then there's nothing
	 * that needs to be done. But if we are not running
	 * we may need to preempt the current running task.
	 * If that current running task is also an RT task
	 * then see if we can move to another run queue.
	 */
	if (task_on_rq_queued(p) && rq->curr != p) {
#ifdef CONFIG_SMP
		if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&
		    /* Don't resched if we changed runqueues */
		    push_rt_task(rq) && rq != task_rq(p))
			check_resched = 0;
#endif /* CONFIG_SMP */
		if (check_resched && p->prio < rq->curr->prio)
			resched_curr(rq);
	}
}

/*
 * Priority of the task has changed. This may cause
 * us to initiate a push or pull.
 */
static void
prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
{
	if (!task_on_rq_queued(p))
		return;

	if (rq->curr == p) {
#ifdef CONFIG_SMP
		/*
		 * If our priority decreases while running, we
		 * may need to pull tasks to this runqueue.
		 */
		if (oldprio < p->prio)
			pull_rt_task(rq);
		/*
		 * If there's a higher priority task waiting to run
		 * then reschedule. Note, the above pull_rt_task
		 * can release the rq lock and p could migrate.
		 * Only reschedule if p is still on the same runqueue.
		 */
		if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
			resched_curr(rq);
#else
		/* For UP simply resched on drop of prio */
		if (oldprio < p->prio)
			resched_curr(rq);
#endif /* CONFIG_SMP */
	} else {
		/*
		 * This task is not running, but if it is
		 * greater than the current running task
		 * then reschedule.
		 */
		if (p->prio < rq->curr->prio)
			resched_curr(rq);
	}
}

static void watchdog(struct rq *rq, struct task_struct *p)
{
	unsigned long soft, hard;

	/* max may change after cur was read, this will be fixed next tick */
	soft = task_rlimit(p, RLIMIT_RTTIME);
	hard = task_rlimit_max(p, RLIMIT_RTTIME);

	if (soft != RLIM_INFINITY) {
		unsigned long next;

		if (p->rt.watchdog_stamp != jiffies) {
			p->rt.timeout++;
			p->rt.watchdog_stamp = jiffies;
		}

		next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
		if (p->rt.timeout > next)
			p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
	}
}
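/*
 * Illustrative worked example (not from the original source):
 * RLIMIT_RTTIME is expressed in microseconds of CPU time consumed
 * without sleeping. With soft = 1000000 us and HZ = 1000, the threshold
 * is next = DIV_ROUND_UP(1000000, 1000000/1000) = 1000 ticks.
 * p->rt.timeout advances once per jiffy of uninterrupted RT execution
 * (it is reset on wakeup in enqueue_task_rt()), so after roughly one
 * second of hogging the CPU the posix-cpu-timer expiry is armed and the
 * task receives the RLIMIT_RTTIME signal (SIGXCPU for the soft limit).
 */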
static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
{
	struct sched_rt_entity *rt_se = &p->rt;

	update_curr_rt(rq);

	watchdog(rq, p);

	/*
	 * RR tasks need a special form of timeslice management.
	 * FIFO tasks have no timeslices.
	 */
	if (p->policy != SCHED_RR)
		return;

	if (--p->rt.time_slice)
		return;

	p->rt.time_slice = sched_rr_timeslice;

	/*
	 * Requeue to the end of queue if we (and all of our ancestors) are not
	 * the only element on the queue
	 */
	for_each_sched_rt_entity(rt_se) {
		if (rt_se->run_list.prev != rt_se->run_list.next) {
			requeue_task_rt(rq, p, 0);
			resched_curr(rq);
			return;
		}
	}
}

static void set_curr_task_rt(struct rq *rq)
{
	struct task_struct *p = rq->curr;

	p->se.exec_start = rq_clock_task(rq);

	/* The running task is never eligible for pushing */
	dequeue_pushable_task(rq, p);
}

static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
{
	/*
	 * Time slice is 0 for SCHED_FIFO tasks
	 */
	if (task->policy == SCHED_RR)
		return sched_rr_timeslice;
	else
		return 0;
}

const struct sched_class rt_sched_class = {
	.next			= &fair_sched_class,
	.enqueue_task		= enqueue_task_rt,
	.dequeue_task		= dequeue_task_rt,
	.yield_task		= yield_task_rt,

	.check_preempt_curr	= check_preempt_curr_rt,

	.pick_next_task		= pick_next_task_rt,
	.put_prev_task		= put_prev_task_rt,

#ifdef CONFIG_SMP
	.select_task_rq		= select_task_rq_rt,

	.set_cpus_allowed	= set_cpus_allowed_rt,
	.rq_online		= rq_online_rt,
	.rq_offline		= rq_offline_rt,
	.post_schedule		= post_schedule_rt,
	.task_woken		= task_woken_rt,
	.switched_from		= switched_from_rt,
#endif

	.set_curr_task		= set_curr_task_rt,
	.task_tick		= task_tick_rt,

	.get_rr_interval	= get_rr_interval_rt,

	.prio_changed		= prio_changed_rt,
	.switched_to		= switched_to_rt,

	.update_curr		= update_curr_rt,
};

#ifdef CONFIG_SCHED_DEBUG
extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);

void print_rt_stats(struct seq_file *m, int cpu)
{
	rt_rq_iter_t iter;
	struct rt_rq *rt_rq;

	rcu_read_lock();
	for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
		print_rt_rq(m, cpu, rt_rq);
	rcu_read_unlock();
}
#endif /* CONFIG_SCHED_DEBUG */
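/*
 * Illustrative note (not from the original source): sched_rr_timeslice
 * is kept in jiffies (RR_TIMESLICE defaults to 100 ms worth of ticks)
 * and is runtime-tunable via /proc/sys/kernel/sched_rr_timeslice_ms.
 * In task_tick_rt() above, an RR task is only rotated to the tail of
 * its priority queue when some level of its hierarchy holds more than
 * one entity; a lone RR task simply has its slice refilled.
 */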