/*
 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
 * policies)
 */

#include "sched.h"

#include <linux/slab.h>

static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);

struct rt_bandwidth def_rt_bandwidth;

static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
{
	struct rt_bandwidth *rt_b =
		container_of(timer, struct rt_bandwidth, rt_period_timer);
	ktime_t now;
	int overrun;
	int idle = 0;

	for (;;) {
		now = hrtimer_cb_get_time(timer);
		overrun = hrtimer_forward(timer, now, rt_b->rt_period);

		if (!overrun)
			break;

		idle = do_sched_rt_period_timer(rt_b, overrun);
	}

	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}

void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
{
	rt_b->rt_period = ns_to_ktime(period);
	rt_b->rt_runtime = runtime;

	raw_spin_lock_init(&rt_b->rt_runtime_lock);

	hrtimer_init(&rt_b->rt_period_timer,
			CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	rt_b->rt_period_timer.function = sched_rt_period_timer;
}

static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
		return;

	if (hrtimer_active(&rt_b->rt_period_timer))
		return;

	raw_spin_lock(&rt_b->rt_runtime_lock);
	start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
	raw_spin_unlock(&rt_b->rt_runtime_lock);
}

void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
{
	struct rt_prio_array *array;
	int i;

	array = &rt_rq->active;
	for (i = 0; i < MAX_RT_PRIO; i++) {
		INIT_LIST_HEAD(array->queue + i);
		__clear_bit(i, array->bitmap);
	}
	/* delimiter for bitsearch: */
	__set_bit(MAX_RT_PRIO, array->bitmap);

#if defined CONFIG_SMP
	rt_rq->highest_prio.curr = MAX_RT_PRIO;
	rt_rq->highest_prio.next = MAX_RT_PRIO;
	rt_rq->rt_nr_migratory = 0;
	rt_rq->overloaded = 0;
	plist_head_init(&rt_rq->pushable_tasks);
#endif

	rt_rq->rt_time = 0;
	rt_rq->rt_throttled = 0;
	rt_rq->rt_runtime = 0;
	raw_spin_lock_init(&rt_rq->rt_runtime_lock);
}

#ifdef CONFIG_RT_GROUP_SCHED
static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
{
	hrtimer_cancel(&rt_b->rt_period_timer);
}

#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)

static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
{
#ifdef CONFIG_SCHED_DEBUG
	WARN_ON_ONCE(!rt_entity_is_task(rt_se));
#endif
	return container_of(rt_se, struct task_struct, rt);
}

static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{
	return rt_rq->rq;
}

static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
{
	return rt_se->rt_rq;
}

void free_rt_sched_group(struct task_group *tg)
{
	int i;

	if (tg->rt_se)
		destroy_rt_bandwidth(&tg->rt_bandwidth);

	for_each_possible_cpu(i) {
		if (tg->rt_rq)
			kfree(tg->rt_rq[i]);
		if (tg->rt_se)
			kfree(tg->rt_se[i]);
	}

	kfree(tg->rt_rq);
	kfree(tg->rt_se);
}

void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
		struct sched_rt_entity *rt_se, int cpu,
		struct sched_rt_entity *parent)
{
	struct rq *rq = cpu_rq(cpu);

	rt_rq->highest_prio.curr = MAX_RT_PRIO;
	rt_rq->rt_nr_boosted = 0;
	rt_rq->rq = rq;
	rt_rq->tg = tg;

	tg->rt_rq[cpu] = rt_rq;
	tg->rt_se[cpu] = rt_se;

	if (!rt_se)
		return;
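
	/*
	 * Link the entity into the group hierarchy: entities of a group
	 * whose parent is the root task group are queued on the CPU's
	 * root rt_rq, while entities of nested groups are queued on the
	 * parent group's own run-queue (my_q).
	 */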
	if (!parent)
		rt_se->rt_rq = &rq->rt;
	else
		rt_se->rt_rq = parent->my_q;

	rt_se->my_q = rt_rq;
	rt_se->parent = parent;
	INIT_LIST_HEAD(&rt_se->run_list);
}

int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
{
	struct rt_rq *rt_rq;
	struct sched_rt_entity *rt_se;
	int i;

	tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
	if (!tg->rt_rq)
		goto err;
	tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
	if (!tg->rt_se)
		goto err;

	init_rt_bandwidth(&tg->rt_bandwidth,
			ktime_to_ns(def_rt_bandwidth.rt_period), 0);

	for_each_possible_cpu(i) {
		rt_rq = kzalloc_node(sizeof(struct rt_rq),
				     GFP_KERNEL, cpu_to_node(i));
		if (!rt_rq)
			goto err;

		rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
				     GFP_KERNEL, cpu_to_node(i));
		if (!rt_se)
			goto err_free_rq;

		init_rt_rq(rt_rq, cpu_rq(i));
		rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
		init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
	}

	return 1;

err_free_rq:
	kfree(rt_rq);
err:
	return 0;
}

#else /* CONFIG_RT_GROUP_SCHED */

#define rt_entity_is_task(rt_se) (1)

static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
{
	return container_of(rt_se, struct task_struct, rt);
}

static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{
	return container_of(rt_rq, struct rq, rt);
}

static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
{
	struct task_struct *p = rt_task_of(rt_se);
	struct rq *rq = task_rq(p);

	return &rq->rt;
}

void free_rt_sched_group(struct task_group *tg) { }

int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
{
	return 1;
}
#endif /* CONFIG_RT_GROUP_SCHED */

#ifdef CONFIG_SMP

static inline int rt_overloaded(struct rq *rq)
{
	return atomic_read(&rq->rd->rto_count);
}

static inline void rt_set_overload(struct rq *rq)
{
	if (!rq->online)
		return;

	cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
	/*
	 * Make sure the mask is visible before we set
	 * the overload count. That is checked to determine
	 * if we should look at the mask. It would be a shame
	 * if we looked at the mask, but the mask was not
	 * updated yet.
	 */
	wmb();
	atomic_inc(&rq->rd->rto_count);
}

static inline void rt_clear_overload(struct rq *rq)
{
	if (!rq->online)
		return;

	/* the order here really doesn't matter */
	atomic_dec(&rq->rd->rto_count);
	cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
}

static void update_rt_migration(struct rt_rq *rt_rq)
{
	if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
		if (!rt_rq->overloaded) {
			rt_set_overload(rq_of_rt_rq(rt_rq));
			rt_rq->overloaded = 1;
		}
	} else if (rt_rq->overloaded) {
		rt_clear_overload(rq_of_rt_rq(rt_rq));
		rt_rq->overloaded = 0;
	}
}

static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	if (!rt_entity_is_task(rt_se))
		return;

	rt_rq = &rq_of_rt_rq(rt_rq)->rt;

	rt_rq->rt_nr_total++;
	if (rt_se->nr_cpus_allowed > 1)
		rt_rq->rt_nr_migratory++;

	update_rt_migration(rt_rq);
}

static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	if (!rt_entity_is_task(rt_se))
		return;

	rt_rq = &rq_of_rt_rq(rt_rq)->rt;

	rt_rq->rt_nr_total--;
	if (rt_se->nr_cpus_allowed > 1)
		rt_rq->rt_nr_migratory--;

	update_rt_migration(rt_rq);
}

static inline int has_pushable_tasks(struct rq *rq)
{
	return !plist_head_empty(&rq->rt.pushable_tasks);
}

static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
{
	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
	plist_node_init(&p->pushable_tasks, p->prio);
	plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);

	/* Update the highest prio pushable task */
	if (p->prio < rq->rt.highest_prio.next)
		rq->rt.highest_prio.next = p->prio;
}

static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
{
	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);

	/* Update the new highest prio pushable task */
	if (has_pushable_tasks(rq)) {
		p = plist_first_entry(&rq->rt.pushable_tasks,
				      struct task_struct, pushable_tasks);
		rq->rt.highest_prio.next = p->prio;
	} else
		rq->rt.highest_prio.next = MAX_RT_PRIO;
}

#else

static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
{
}

static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
{
}

static inline
void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
}

static inline
void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
}

#endif /* CONFIG_SMP */

static inline int on_rt_rq(struct sched_rt_entity *rt_se)
{
	return !list_empty(&rt_se->run_list);
}

#ifdef CONFIG_RT_GROUP_SCHED

static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
{
	if (!rt_rq->tg)
		return RUNTIME_INF;

	return rt_rq->rt_runtime;
}

static inline u64 sched_rt_period(struct rt_rq *rt_rq)
{
	return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
}

typedef struct task_group *rt_rq_iter_t;

static inline struct task_group *next_task_group(struct task_group *tg)
{
	do {
		tg = list_entry_rcu(tg->list.next,
			typeof(struct task_group), list);
	} while (&tg->list != &task_groups && task_group_is_autogroup(tg));

	if (&tg->list == &task_groups)
		tg = NULL;

	return tg;
}

#define for_each_rt_rq(rt_rq, iter, rq)					\
	for (iter = container_of(&task_groups, typeof(*iter), list);	\
		(iter = next_task_group(iter)) &&			\
		(rt_rq = iter->rt_rq[cpu_of(rq)]);)

static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
{
	list_add_rcu(&rt_rq->leaf_rt_rq_list,
			&rq_of_rt_rq(rt_rq)->leaf_rt_rq_list);
}

static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
{
	list_del_rcu(&rt_rq->leaf_rt_rq_list);
}

#define for_each_leaf_rt_rq(rt_rq, rq) \
	list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)

#define for_each_sched_rt_entity(rt_se) \
	for (; rt_se; rt_se = rt_se->parent)

static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
{
	return rt_se->my_q;
}

static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
static void dequeue_rt_entity(struct sched_rt_entity *rt_se);

static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
	struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
	struct sched_rt_entity *rt_se;

	int cpu = cpu_of(rq_of_rt_rq(rt_rq));

	rt_se = rt_rq->tg->rt_se[cpu];

	if (rt_rq->rt_nr_running) {
		if (rt_se && !on_rt_rq(rt_se))
			enqueue_rt_entity(rt_se, false);
		if (rt_rq->highest_prio.curr < curr->prio)
			resched_task(curr);
	}
}

static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
	struct sched_rt_entity *rt_se;
	int cpu = cpu_of(rq_of_rt_rq(rt_rq));

	rt_se = rt_rq->tg->rt_se[cpu];

	if (rt_se && on_rt_rq(rt_se))
		dequeue_rt_entity(rt_se);
}

static inline int rt_rq_throttled(struct rt_rq *rt_rq)
{
	return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
}

static int rt_se_boosted(struct sched_rt_entity *rt_se)
{
	struct rt_rq *rt_rq = group_rt_rq(rt_se);
	struct task_struct *p;

	if (rt_rq)
		return !!rt_rq->rt_nr_boosted;

	p = rt_task_of(rt_se);
	return p->prio != p->normal_prio;
}

#ifdef CONFIG_SMP
static inline const struct cpumask *sched_rt_period_mask(void)
{
	return cpu_rq(smp_processor_id())->rd->span;
}
#else
static inline const struct cpumask *sched_rt_period_mask(void)
{
	return cpu_online_mask;
}
#endif

static inline
struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
{
	return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
}

static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
{
	return &rt_rq->tg->rt_bandwidth;
}

#else /* !CONFIG_RT_GROUP_SCHED */

static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
{
	return rt_rq->rt_runtime;
}

static inline u64 sched_rt_period(struct rt_rq *rt_rq)
{
	return ktime_to_ns(def_rt_bandwidth.rt_period);
}

typedef struct rt_rq *rt_rq_iter_t;

#define for_each_rt_rq(rt_rq, iter, rq) \
	for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)

static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
{
}

static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
{
}

#define for_each_leaf_rt_rq(rt_rq, rq) \
	for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)

#define for_each_sched_rt_entity(rt_se) \
	for (; rt_se; rt_se = NULL)

static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
{
	return NULL;
}
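
/*
 * Without group scheduling there is only the per-CPU root rt_rq, so
 * "enqueueing" it after a runtime replenishment reduces to rescheduling
 * the CPU if it has runnable RT tasks.
 */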
static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
	if (rt_rq->rt_nr_running)
		resched_task(rq_of_rt_rq(rt_rq)->curr);
}

static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
}

static inline int rt_rq_throttled(struct rt_rq *rt_rq)
{
	return rt_rq->rt_throttled;
}

static inline const struct cpumask *sched_rt_period_mask(void)
{
	return cpu_online_mask;
}

static inline
struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
{
	return &cpu_rq(cpu)->rt;
}

static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
{
	return &def_rt_bandwidth;
}

#endif /* CONFIG_RT_GROUP_SCHED */

#ifdef CONFIG_SMP
/*
 * We ran out of runtime, see if we can borrow some from our neighbours.
 */
static int do_balance_runtime(struct rt_rq *rt_rq)
{
	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
	struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
	int i, weight, more = 0;
	u64 rt_period;

	weight = cpumask_weight(rd->span);

	raw_spin_lock(&rt_b->rt_runtime_lock);
	rt_period = ktime_to_ns(rt_b->rt_period);
	for_each_cpu(i, rd->span) {
		struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
		s64 diff;

		if (iter == rt_rq)
			continue;

		raw_spin_lock(&iter->rt_runtime_lock);
		/*
		 * Either all rqs have inf runtime and there's nothing to steal
		 * or __disable_runtime() below sets a specific rq to inf to
		 * indicate it's been disabled and disallow stealing.
		 */
		if (iter->rt_runtime == RUNTIME_INF)
			goto next;

		/*
		 * From runqueues with spare time, take 1/n part of their
		 * spare time, but no more than our period.
		 */
		diff = iter->rt_runtime - iter->rt_time;
		if (diff > 0) {
			diff = div_u64((u64)diff, weight);
			if (rt_rq->rt_runtime + diff > rt_period)
				diff = rt_period - rt_rq->rt_runtime;
			iter->rt_runtime -= diff;
			rt_rq->rt_runtime += diff;
			more = 1;
			if (rt_rq->rt_runtime == rt_period) {
				raw_spin_unlock(&iter->rt_runtime_lock);
				break;
			}
		}
next:
		raw_spin_unlock(&iter->rt_runtime_lock);
	}
	raw_spin_unlock(&rt_b->rt_runtime_lock);

	return more;
}

/*
 * Ensure this RQ takes back all the runtime it lent to its neighbours.
 */
static void __disable_runtime(struct rq *rq)
{
	struct root_domain *rd = rq->rd;
	rt_rq_iter_t iter;
	struct rt_rq *rt_rq;

	if (unlikely(!scheduler_running))
		return;

	for_each_rt_rq(rt_rq, iter, rq) {
		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
		s64 want;
		int i;

		raw_spin_lock(&rt_b->rt_runtime_lock);
		raw_spin_lock(&rt_rq->rt_runtime_lock);
		/*
		 * Either we're all inf and nobody needs to borrow, or we're
		 * already disabled and thus have nothing to do, or we have
		 * exactly the right amount of runtime to take out.
		 */
		if (rt_rq->rt_runtime == RUNTIME_INF ||
				rt_rq->rt_runtime == rt_b->rt_runtime)
			goto balanced;
		raw_spin_unlock(&rt_rq->rt_runtime_lock);

		/*
		 * Calculate the difference between what we started out with
		 * and what we currently have, that's the amount of runtime
		 * we lent and now have to reclaim.
		 */
		want = rt_b->rt_runtime - rt_rq->rt_runtime;

		/*
		 * Greedy reclaim, take back as much as we can.
		 */
		for_each_cpu(i, rd->span) {
			struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
			s64 diff;

			/*
			 * Can't reclaim from ourselves or disabled runqueues.
			 */
			if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
				continue;

			raw_spin_lock(&iter->rt_runtime_lock);
			if (want > 0) {
				diff = min_t(s64, iter->rt_runtime, want);
				iter->rt_runtime -= diff;
				want -= diff;
			} else {
				iter->rt_runtime -= want;
				want -= want;
			}
			raw_spin_unlock(&iter->rt_runtime_lock);

			if (!want)
				break;
		}

		raw_spin_lock(&rt_rq->rt_runtime_lock);
		/*
		 * We cannot be left wanting - that would mean some runtime
		 * leaked out of the system.
		 */
		BUG_ON(want);
balanced:
		/*
		 * Disable all the borrow logic by pretending we have inf
		 * runtime - in which case borrowing doesn't make sense.
		 */
		rt_rq->rt_runtime = RUNTIME_INF;
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
		raw_spin_unlock(&rt_b->rt_runtime_lock);
	}
}

static void disable_runtime(struct rq *rq)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&rq->lock, flags);
	__disable_runtime(rq);
	raw_spin_unlock_irqrestore(&rq->lock, flags);
}

static void __enable_runtime(struct rq *rq)
{
	rt_rq_iter_t iter;
	struct rt_rq *rt_rq;

	if (unlikely(!scheduler_running))
		return;

	/*
	 * Reset each runqueue's bandwidth settings
	 */
	for_each_rt_rq(rt_rq, iter, rq) {
		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);

		raw_spin_lock(&rt_b->rt_runtime_lock);
		raw_spin_lock(&rt_rq->rt_runtime_lock);
		rt_rq->rt_runtime = rt_b->rt_runtime;
		rt_rq->rt_time = 0;
		rt_rq->rt_throttled = 0;
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
		raw_spin_unlock(&rt_b->rt_runtime_lock);
	}
}

static void enable_runtime(struct rq *rq)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&rq->lock, flags);
	__enable_runtime(rq);
	raw_spin_unlock_irqrestore(&rq->lock, flags);
}

int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	int cpu = (int)(long)hcpu;

	switch (action) {
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		disable_runtime(cpu_rq(cpu));
		return NOTIFY_OK;

	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		enable_runtime(cpu_rq(cpu));
		return NOTIFY_OK;

	default:
		return NOTIFY_DONE;
	}
}

static int balance_runtime(struct rt_rq *rt_rq)
{
	int more = 0;

	if (!sched_feat(RT_RUNTIME_SHARE))
		return more;

	if (rt_rq->rt_time > rt_rq->rt_runtime) {
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
		more = do_balance_runtime(rt_rq);
		raw_spin_lock(&rt_rq->rt_runtime_lock);
	}

	return more;
}
#else /* !CONFIG_SMP */
static inline int balance_runtime(struct rt_rq *rt_rq)
{
	return 0;
}
#endif /* CONFIG_SMP */

static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
{
	int i, idle = 1;
	const struct cpumask *span;

	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
		return 1;

	span = sched_rt_period_mask();
	for_each_cpu(i, span) {
		int enqueue = 0;
		struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
		struct rq *rq = rq_of_rt_rq(rt_rq);

		raw_spin_lock(&rq->lock);
		if (rt_rq->rt_time) {
			u64 runtime;
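
			/*
			 * rt_time and rt_runtime are protected by the
			 * per-rt_rq rt_runtime_lock; take it before
			 * replenishing this runqueue's budget.
			 */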
			raw_spin_lock(&rt_rq->rt_runtime_lock);
			if (rt_rq->rt_throttled)
				balance_runtime(rt_rq);
			runtime = rt_rq->rt_runtime;
			rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
			if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
				rt_rq->rt_throttled = 0;
				enqueue = 1;

				/*
				 * Force a clock update if the CPU was idle,
				 * lest wakeup -> unthrottle time accumulate.
				 */
				if (rt_rq->rt_nr_running && rq->curr == rq->idle)
					rq->skip_clock_update = -1;
			}
			if (rt_rq->rt_time || rt_rq->rt_nr_running)
				idle = 0;
			raw_spin_unlock(&rt_rq->rt_runtime_lock);
		} else if (rt_rq->rt_nr_running) {
			idle = 0;
			if (!rt_rq_throttled(rt_rq))
				enqueue = 1;
		}

		if (enqueue)
			sched_rt_rq_enqueue(rt_rq);
		raw_spin_unlock(&rq->lock);
	}

	return idle;
}

static inline int rt_se_prio(struct sched_rt_entity *rt_se)
{
#ifdef CONFIG_RT_GROUP_SCHED
	struct rt_rq *rt_rq = group_rt_rq(rt_se);

	if (rt_rq)
		return rt_rq->highest_prio.curr;
#endif

	return rt_task_of(rt_se)->prio;
}

static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
{
	u64 runtime = sched_rt_runtime(rt_rq);

	if (rt_rq->rt_throttled)
		return rt_rq_throttled(rt_rq);

	if (runtime >= sched_rt_period(rt_rq))
		return 0;

	balance_runtime(rt_rq);
	runtime = sched_rt_runtime(rt_rq);
	if (runtime == RUNTIME_INF)
		return 0;

	if (rt_rq->rt_time > runtime) {
		rt_rq->rt_throttled = 1;
		printk_once(KERN_WARNING "sched: RT throttling activated\n");
		if (rt_rq_throttled(rt_rq)) {
			sched_rt_rq_dequeue(rt_rq);
			return 1;
		}
	}

	return 0;
}

/*
 * Update the current task's runtime statistics. Skip current tasks that
 * are not in our scheduling class.
 */
static void update_curr_rt(struct rq *rq)
{
	struct task_struct *curr = rq->curr;
	struct sched_rt_entity *rt_se = &curr->rt;
	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
	u64 delta_exec;

	if (curr->sched_class != &rt_sched_class)
		return;

	delta_exec = rq->clock_task - curr->se.exec_start;
	if (unlikely((s64)delta_exec < 0))
		delta_exec = 0;

	schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec));

	curr->se.sum_exec_runtime += delta_exec;
	account_group_exec_runtime(curr, delta_exec);

	curr->se.exec_start = rq->clock_task;
	cpuacct_charge(curr, delta_exec);

	sched_rt_avg_update(rq, delta_exec);

	if (!rt_bandwidth_enabled())
		return;

	for_each_sched_rt_entity(rt_se) {
		rt_rq = rt_rq_of_se(rt_se);

		if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
			raw_spin_lock(&rt_rq->rt_runtime_lock);
			rt_rq->rt_time += delta_exec;
			if (sched_rt_runtime_exceeded(rt_rq))
				resched_task(curr);
			raw_spin_unlock(&rt_rq->rt_runtime_lock);
		}
	}
}

#if defined CONFIG_SMP

static void
inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

	if (rq->online && prio < prev_prio)
		cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
}

static void
dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

	if (rq->online && rt_rq->highest_prio.curr != prev_prio)
		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
}

#else /* CONFIG_SMP */

static inline
void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
static inline
void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}

#endif /* CONFIG_SMP */

#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
static void
inc_rt_prio(struct rt_rq *rt_rq, int prio)
{
	int prev_prio = rt_rq->highest_prio.curr;

	if (prio < prev_prio)
		rt_rq->highest_prio.curr = prio;

	inc_rt_prio_smp(rt_rq, prio, prev_prio);
}

static void
dec_rt_prio(struct rt_rq *rt_rq, int prio)
{
	int prev_prio = rt_rq->highest_prio.curr;

	if (rt_rq->rt_nr_running) {

		WARN_ON(prio < prev_prio);

		/*
		 * This may have been our highest task, and therefore
		 * we may have some recomputation to do
		 */
		if (prio == prev_prio) {
			struct rt_prio_array *array = &rt_rq->active;

			rt_rq->highest_prio.curr =
				sched_find_first_bit(array->bitmap);
		}

	} else
		rt_rq->highest_prio.curr = MAX_RT_PRIO;

	dec_rt_prio_smp(rt_rq, prio, prev_prio);
}

#else

static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}

#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */

#ifdef CONFIG_RT_GROUP_SCHED

static void
inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	if (rt_se_boosted(rt_se))
		rt_rq->rt_nr_boosted++;

	if (rt_rq->tg)
		start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
}

static void
dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	if (rt_se_boosted(rt_se))
		rt_rq->rt_nr_boosted--;

	WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
}

#else /* CONFIG_RT_GROUP_SCHED */
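
/*
 * Without group scheduling every RT task accounts against the single
 * global def_rt_bandwidth, so adding an RT entity only needs to make
 * sure the global period timer is running.
 */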
static void
inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	start_rt_bandwidth(&def_rt_bandwidth);
}

static inline
void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}

#endif /* CONFIG_RT_GROUP_SCHED */

static inline
void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	int prio = rt_se_prio(rt_se);

	WARN_ON(!rt_prio(prio));
	rt_rq->rt_nr_running++;

	inc_rt_prio(rt_rq, prio);
	inc_rt_migration(rt_se, rt_rq);
	inc_rt_group(rt_se, rt_rq);
}

static inline
void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
	WARN_ON(!rt_rq->rt_nr_running);
	rt_rq->rt_nr_running--;

	dec_rt_prio(rt_rq, rt_se_prio(rt_se));
	dec_rt_migration(rt_se, rt_rq);
	dec_rt_group(rt_se, rt_rq);
}

static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
{
	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
	struct rt_prio_array *array = &rt_rq->active;
	struct rt_rq *group_rq = group_rt_rq(rt_se);
	struct list_head *queue = array->queue + rt_se_prio(rt_se);

	/*
	 * Don't enqueue the group if it's throttled, or when it's empty.
	 * The latter is a consequence of the former when a child group
	 * gets throttled and the current group doesn't have any other
	 * active members.
	 */
	if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
		return;

	if (!rt_rq->rt_nr_running)
		list_add_leaf_rt_rq(rt_rq);

	if (head)
		list_add(&rt_se->run_list, queue);
	else
		list_add_tail(&rt_se->run_list, queue);
	__set_bit(rt_se_prio(rt_se), array->bitmap);

	inc_rt_tasks(rt_se, rt_rq);
}

static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
{
	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
	struct rt_prio_array *array = &rt_rq->active;

	list_del_init(&rt_se->run_list);
	if (list_empty(array->queue + rt_se_prio(rt_se)))
		__clear_bit(rt_se_prio(rt_se), array->bitmap);

	dec_rt_tasks(rt_se, rt_rq);
	if (!rt_rq->rt_nr_running)
		list_del_leaf_rt_rq(rt_rq);
}

/*
 * Because the prio of an upper entry depends on the lower
 * entries, we must remove entries top - down.
 */
static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
{
	struct sched_rt_entity *back = NULL;

	for_each_sched_rt_entity(rt_se) {
		rt_se->back = back;
		back = rt_se;
	}

	for (rt_se = back; rt_se; rt_se = rt_se->back) {
		if (on_rt_rq(rt_se))
			__dequeue_rt_entity(rt_se);
	}
}

static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
{
	dequeue_rt_stack(rt_se);
	for_each_sched_rt_entity(rt_se)
		__enqueue_rt_entity(rt_se, head);
}

static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
{
	dequeue_rt_stack(rt_se);

	for_each_sched_rt_entity(rt_se) {
		struct rt_rq *rt_rq = group_rt_rq(rt_se);

		if (rt_rq && rt_rq->rt_nr_running)
			__enqueue_rt_entity(rt_se, false);
	}
}

/*
 * Adding/removing a task to/from a priority array:
 */
static void
enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
{
	struct sched_rt_entity *rt_se = &p->rt;

	if (flags & ENQUEUE_WAKEUP)
		rt_se->timeout = 0;

	enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);

	if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
		enqueue_pushable_task(rq, p);

	inc_nr_running(rq);
}

static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
{
	struct sched_rt_entity *rt_se = &p->rt;

	update_curr_rt(rq);
	dequeue_rt_entity(rt_se);

	dequeue_pushable_task(rq, p);

	dec_nr_running(rq);
}

/*
 * Put task to the head or the end of the run list without the overhead of
 * dequeue followed by enqueue.
 */
static void
requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
{
	if (on_rt_rq(rt_se)) {
		struct rt_prio_array *array = &rt_rq->active;
		struct list_head *queue = array->queue + rt_se_prio(rt_se);

		if (head)
			list_move(&rt_se->run_list, queue);
		else
			list_move_tail(&rt_se->run_list, queue);
	}
}

static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
{
	struct sched_rt_entity *rt_se = &p->rt;
	struct rt_rq *rt_rq;

	for_each_sched_rt_entity(rt_se) {
		rt_rq = rt_rq_of_se(rt_se);
		requeue_rt_entity(rt_rq, rt_se, head);
	}
}

static void yield_task_rt(struct rq *rq)
{
	requeue_task_rt(rq, rq->curr, 0);
}

#ifdef CONFIG_SMP
static int find_lowest_rq(struct task_struct *task);

static int
select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
{
	struct task_struct *curr;
	struct rq *rq;
	int cpu;

	cpu = task_cpu(p);

	if (p->rt.nr_cpus_allowed == 1)
		goto out;

	/* For anything but wake ups, just return the task_cpu */
	if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
		goto out;

	rq = cpu_rq(cpu);

	rcu_read_lock();
	curr = ACCESS_ONCE(rq->curr); /* unlocked access */

	/*
	 * If the current task on @p's runqueue is an RT task, then
	 * try to see if we can wake this RT task up on another
	 * runqueue. Otherwise simply start this RT task
	 * on its current runqueue.
	 *
	 * We want to avoid overloading runqueues. If the woken
	 * task is a higher priority, then it will stay on this CPU
	 * and the lower prio task should be moved to another CPU.
	 * Even though this will probably make the lower prio task
	 * lose its cache, we do not want to bounce a higher task
	 * around just because it gave up its CPU, perhaps for a
	 * lock?
	 *
	 * For equal prio tasks, we just let the scheduler sort it out.
	 *
	 * Otherwise, just let it ride on the affined RQ and the
	 * post-schedule router will push the preempted task away.
	 *
	 * This test is optimistic, if we get it wrong the load-balancer
	 * will have to sort it out.
	 */
	if (curr && unlikely(rt_task(curr)) &&
	    (curr->rt.nr_cpus_allowed < 2 ||
	     curr->prio <= p->prio) &&
	    (p->rt.nr_cpus_allowed > 1)) {
		int target = find_lowest_rq(p);

		if (target != -1)
			cpu = target;
	}
	rcu_read_unlock();

out:
	return cpu;
}

static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
{
	if (rq->curr->rt.nr_cpus_allowed == 1)
		return;

	if (p->rt.nr_cpus_allowed != 1
	    && cpupri_find(&rq->rd->cpupri, p, NULL))
		return;

	if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
		return;

	/*
	 * There appear to be other cpus that can accept
	 * current and none to run 'p', so let's reschedule
	 * to try and push current away:
	 */
	requeue_task_rt(rq, p, 1);
	resched_task(rq->curr);
}

#endif /* CONFIG_SMP */

/*
 * Preempt the current task with a newly woken task if needed:
 */
static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
{
	if (p->prio < rq->curr->prio) {
		resched_task(rq->curr);
		return;
	}

#ifdef CONFIG_SMP
	/*
	 * If:
	 *
	 * - the newly woken task is of equal priority to the current task
	 * - the newly woken task is non-migratable while current is migratable
	 * - current will be preempted on the next reschedule
	 *
	 * we should check to see if current can readily move to a different
	 * cpu.  If so, we will reschedule to allow the push logic to try
	 * to move current somewhere else, making room for our non-migratable
	 * task.
	 */
	if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
		check_preempt_equal_prio(rq, p);
#endif
}

static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
						   struct rt_rq *rt_rq)
{
	struct rt_prio_array *array = &rt_rq->active;
	struct sched_rt_entity *next = NULL;
	struct list_head *queue;
	int idx;

	idx = sched_find_first_bit(array->bitmap);
	BUG_ON(idx >= MAX_RT_PRIO);

	queue = array->queue + idx;
	next = list_entry(queue->next, struct sched_rt_entity, run_list);

	return next;
}

static struct task_struct *_pick_next_task_rt(struct rq *rq)
{
	struct sched_rt_entity *rt_se;
	struct task_struct *p;
	struct rt_rq *rt_rq;

	rt_rq = &rq->rt;

	if (!rt_rq->rt_nr_running)
		return NULL;

	if (rt_rq_throttled(rt_rq))
		return NULL;

	do {
		rt_se = pick_next_rt_entity(rq, rt_rq);
		BUG_ON(!rt_se);
		rt_rq = group_rt_rq(rt_se);
	} while (rt_rq);

	p = rt_task_of(rt_se);
	p->se.exec_start = rq->clock_task;

	return p;
}

static struct task_struct *pick_next_task_rt(struct rq *rq)
{
	struct task_struct *p = _pick_next_task_rt(rq);

	/* The running task is never eligible for pushing */
	if (p)
		dequeue_pushable_task(rq, p);

#ifdef CONFIG_SMP
	/*
	 * We detect this state here so that we can avoid taking the RQ
	 * lock again later if there is no need to push
	 */
	rq->post_schedule = has_pushable_tasks(rq);
#endif

	return p;
}

static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
{
	update_curr_rt(rq);

	/*
	 * The previous task needs to be made eligible for pushing
	 * if it is still active
	 */
	if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1)
		enqueue_pushable_task(rq, p);
}

#ifdef CONFIG_SMP

/* Only try algorithms three times */
#define RT_MAX_TRIES 3

static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
{
	if (!task_running(rq, p) &&
	    (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) &&
	    (p->rt.nr_cpus_allowed > 1))
		return 1;
	return 0;
}

/* Return the second highest RT task, NULL otherwise */
static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
{
	struct task_struct *next = NULL;
	struct sched_rt_entity *rt_se;
	struct rt_prio_array *array;
	struct rt_rq *rt_rq;
	int idx;

	for_each_leaf_rt_rq(rt_rq, rq) {
		array = &rt_rq->active;
		idx = sched_find_first_bit(array->bitmap);
next_idx:
		if (idx >= MAX_RT_PRIO)
			continue;
		if (next && next->prio < idx)
			continue;
		list_for_each_entry(rt_se, array->queue + idx, run_list) {
			struct task_struct *p;

			if (!rt_entity_is_task(rt_se))
				continue;

			p = rt_task_of(rt_se);
			if (pick_rt_task(rq, p, cpu)) {
				next = p;
				break;
			}
		}
		if (!next) {
			idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
			goto next_idx;
		}
	}

	return next;
}

static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);

static int find_lowest_rq(struct task_struct *task)
{
	struct sched_domain *sd;
	struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
	int this_cpu = smp_processor_id();
	int cpu = task_cpu(task);

	/* Make sure the mask is initialized first */
	if (unlikely(!lowest_mask))
		return -1;

	if (task->rt.nr_cpus_allowed == 1)
		return -1; /* No other targets possible */

	if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
		return -1; /* No targets found */

	/*
	 * At this point we have built a mask of cpus representing the
	 * lowest priority tasks in the system.  Now we want to elect
	 * the best one based on our affinity and topology.
	 *
	 * We prioritize the last cpu that the task executed on since
	 * it is most likely cache-hot in that location.
	 */
	if (cpumask_test_cpu(cpu, lowest_mask))
		return cpu;

	/*
	 * Otherwise, we consult the sched_domains span maps to figure
	 * out which cpu is logically closest to our hot cache data.
	 */
	if (!cpumask_test_cpu(this_cpu, lowest_mask))
		this_cpu = -1; /* Skip this_cpu opt if not among lowest */

	rcu_read_lock();
	for_each_domain(cpu, sd) {
		if (sd->flags & SD_WAKE_AFFINE) {
			int best_cpu;

			/*
			 * "this_cpu" is cheaper to preempt than a
			 * remote processor.
			 */
			if (this_cpu != -1 &&
			    cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
				rcu_read_unlock();
				return this_cpu;
			}

			best_cpu = cpumask_first_and(lowest_mask,
						     sched_domain_span(sd));
			if (best_cpu < nr_cpu_ids) {
				rcu_read_unlock();
				return best_cpu;
			}
		}
	}
	rcu_read_unlock();

	/*
	 * And finally, if there were no matches within the domains
	 * just give the caller *something* to work with from the compatible
	 * locations.
	 */
	if (this_cpu != -1)
		return this_cpu;

	cpu = cpumask_any(lowest_mask);
	if (cpu < nr_cpu_ids)
		return cpu;
	return -1;
}

/* Will lock the rq it finds */
static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
{
	struct rq *lowest_rq = NULL;
	int tries;
	int cpu;

	for (tries = 0; tries < RT_MAX_TRIES; tries++) {
		cpu = find_lowest_rq(task);

		if ((cpu == -1) || (cpu == rq->cpu))
			break;

		lowest_rq = cpu_rq(cpu);

		/* if the prio of this runqueue changed, try again */
		if (double_lock_balance(rq, lowest_rq)) {
			/*
			 * We had to unlock the run queue. In
			 * the meantime, task could have
			 * migrated already or had its affinity changed.
			 * Also make sure that it wasn't scheduled on its rq.
			 */
			if (unlikely(task_rq(task) != rq ||
				     !cpumask_test_cpu(lowest_rq->cpu,
						       tsk_cpus_allowed(task)) ||
				     task_running(rq, task) ||
				     !task->on_rq)) {

				raw_spin_unlock(&lowest_rq->lock);
				lowest_rq = NULL;
				break;
			}
		}

		/* If this rq is still suitable use it. */
		if (lowest_rq->rt.highest_prio.curr > task->prio)
			break;

		/* try again */
		double_unlock_balance(rq, lowest_rq);
		lowest_rq = NULL;
	}

	return lowest_rq;
}

static struct task_struct *pick_next_pushable_task(struct rq *rq)
{
	struct task_struct *p;

	if (!has_pushable_tasks(rq))
		return NULL;

	p = plist_first_entry(&rq->rt.pushable_tasks,
			      struct task_struct, pushable_tasks);

	BUG_ON(rq->cpu != task_cpu(p));
	BUG_ON(task_current(rq, p));
	BUG_ON(p->rt.nr_cpus_allowed <= 1);

	BUG_ON(!p->on_rq);
	BUG_ON(!rt_task(p));

	return p;
}

/*
 * If the current CPU has more than one RT task, see if the non-running
 * task can migrate over to a CPU that is running a task of lesser
 * priority.
 */
static int push_rt_task(struct rq *rq)
{
	struct task_struct *next_task;
	struct rq *lowest_rq;
	int ret = 0;

	if (!rq->rt.overloaded)
		return 0;

	next_task = pick_next_pushable_task(rq);
	if (!next_task)
		return 0;

retry:
	if (unlikely(next_task == rq->curr)) {
		WARN_ON(1);
		return 0;
	}

	/*
	 * It's possible that the next_task slipped in of
	 * higher priority than current. If that's the case
	 * just reschedule current.
	 */
	if (unlikely(next_task->prio < rq->curr->prio)) {
		resched_task(rq->curr);
		return 0;
	}

	/* We might release rq lock */
	get_task_struct(next_task);

	/* find_lock_lowest_rq locks the rq if found */
	lowest_rq = find_lock_lowest_rq(next_task, rq);
	if (!lowest_rq) {
		struct task_struct *task;
		/*
		 * find_lock_lowest_rq releases rq->lock
		 * so it is possible that next_task has migrated.
		 *
		 * We need to make sure that the task is still on the same
		 * run-queue and is also still the next task eligible for
		 * pushing.
		 */
		task = pick_next_pushable_task(rq);
		if (task_cpu(next_task) == rq->cpu && task == next_task) {
			/*
			 * The task hasn't migrated, and is still the next
			 * eligible task, but we failed to find a run-queue
			 * to push it to.  Do not retry in this case, since
			 * other cpus will pull from us when ready.
			 */
			goto out;
		}

		if (!task)
			/* No more tasks, just exit */
			goto out;

		/*
		 * Something has shifted, try again.
		 */
		put_task_struct(next_task);
		next_task = task;
		goto retry;
	}

	deactivate_task(rq, next_task, 0);
	set_task_cpu(next_task, lowest_rq->cpu);
	activate_task(lowest_rq, next_task, 0);
	ret = 1;

	resched_task(lowest_rq->curr);

	double_unlock_balance(rq, lowest_rq);

out:
	put_task_struct(next_task);

	return ret;
}

static void push_rt_tasks(struct rq *rq)
{
	/* push_rt_task will return true if it moved an RT */
	while (push_rt_task(rq))
		;
}

static int pull_rt_task(struct rq *this_rq)
{
	int this_cpu = this_rq->cpu, ret = 0, cpu;
	struct task_struct *p;
	struct rq *src_rq;

	if (likely(!rt_overloaded(this_rq)))
		return 0;

	for_each_cpu(cpu, this_rq->rd->rto_mask) {
		if (this_cpu == cpu)
			continue;

		src_rq = cpu_rq(cpu);

		/*
		 * Don't bother taking the src_rq->lock if the next highest
		 * task is known to be lower-priority than our current task.
		 * This may look racy, but if this value is about to go
		 * logically higher, the src_rq will push this task away.
		 * And if it's going logically lower, we do not care.
		 */
		if (src_rq->rt.highest_prio.next >=
		    this_rq->rt.highest_prio.curr)
			continue;

		/*
		 * We can potentially drop this_rq's lock in
		 * double_lock_balance, and another CPU could
		 * alter this_rq
		 */
		double_lock_balance(this_rq, src_rq);

		/*
		 * Are there still pullable RT tasks?
		 */
		if (src_rq->rt.rt_nr_running <= 1)
			goto skip;

		p = pick_next_highest_task_rt(src_rq, this_cpu);

		/*
		 * Do we have an RT task that preempts
		 * the to-be-scheduled task?
		 */
		if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
			WARN_ON(p == src_rq->curr);
			WARN_ON(!p->on_rq);

			/*
			 * There's a chance that p is higher in priority
			 * than what's currently running on its cpu.
			 * This is just that p is waking up and hasn't
			 * had a chance to schedule. We only pull
			 * p if it is lower in priority than the
			 * current task on the run queue
			 */
			if (p->prio < src_rq->curr->prio)
				goto skip;

			ret = 1;

			deactivate_task(src_rq, p, 0);
			set_task_cpu(p, this_cpu);
			activate_task(this_rq, p, 0);
			/*
			 * We continue with the search, just in
			 * case there's an even higher prio task
			 * in another runqueue. (low likelihood
			 * but possible)
			 */
		}
skip:
		double_unlock_balance(this_rq, src_rq);
	}

	return ret;
}

static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
{
	/* Try to pull RT tasks here if we lower this rq's prio */
	if (rq->rt.highest_prio.curr > prev->prio)
		pull_rt_task(rq);
}

static void post_schedule_rt(struct rq *rq)
{
	push_rt_tasks(rq);
}

/*
 * If we are not running and we are not going to reschedule soon, we should
 * try to push tasks away now
 */
static void task_woken_rt(struct rq *rq, struct task_struct *p)
{
	if (!task_running(rq, p) &&
	    !test_tsk_need_resched(rq->curr) &&
	    has_pushable_tasks(rq) &&
	    p->rt.nr_cpus_allowed > 1 &&
	    rt_task(rq->curr) &&
	    (rq->curr->rt.nr_cpus_allowed < 2 ||
	     rq->curr->prio <= p->prio))
		push_rt_tasks(rq);
}

static void set_cpus_allowed_rt(struct task_struct *p,
				const struct cpumask *new_mask)
{
	int weight = cpumask_weight(new_mask);

	BUG_ON(!rt_task(p));

	/*
	 * Update the migration status of the RQ if we have an RT task
	 * which is running AND changing its weight value.
	 */
	if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) {
		struct rq *rq = task_rq(p);

		if (!task_current(rq, p)) {
			/*
			 * Make sure we dequeue this task from the pushable list
			 * before going further.  It will either remain off of
			 * the list because we are no longer pushable, or it
			 * will be requeued.
			 */
			if (p->rt.nr_cpus_allowed > 1)
				dequeue_pushable_task(rq, p);

			/*
			 * Requeue if our weight is changing and still > 1
			 */
			if (weight > 1)
				enqueue_pushable_task(rq, p);

		}

		if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) {
			rq->rt.rt_nr_migratory++;
		} else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
			BUG_ON(!rq->rt.rt_nr_migratory);
			rq->rt.rt_nr_migratory--;
		}

		update_rt_migration(&rq->rt);
	}
}

/* Assumes rq->lock is held */
static void rq_online_rt(struct rq *rq)
{
	if (rq->rt.overloaded)
		rt_set_overload(rq);

	__enable_runtime(rq);

	cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
}

/* Assumes rq->lock is held */
static void rq_offline_rt(struct rq *rq)
{
	if (rq->rt.overloaded)
		rt_clear_overload(rq);

	__disable_runtime(rq);

	cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
}

/*
 * When switching away from the RT queue, we bring ourselves to a position
 * where we might want to pull RT tasks from other runqueues.
 */
static void switched_from_rt(struct rq *rq, struct task_struct *p)
{
	/*
	 * If there are other RT tasks then we will reschedule
	 * and the scheduling of the other RT tasks will handle
	 * the balancing. But if we are the last RT task
	 * we may need to handle the pulling of RT tasks
	 * now.
	 */
	if (p->on_rq && !rq->rt.rt_nr_running)
		pull_rt_task(rq);
}

void init_sched_rt_class(void)
{
	unsigned int i;

	for_each_possible_cpu(i) {
		zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
					GFP_KERNEL, cpu_to_node(i));
	}
}
#endif /* CONFIG_SMP */

/*
 * When switching a task to RT, we may overload the runqueue
 * with RT tasks. In this case we try to push them off to
 * other runqueues.
 */
static void switched_to_rt(struct rq *rq, struct task_struct *p)
{
	int check_resched = 1;

	/*
	 * If we are already running, then there's nothing
	 * that needs to be done. But if we are not running
	 * we may need to preempt the current running task.
	 * If that current running task is also an RT task
	 * then see if we can move to another run queue.
	 */
	if (p->on_rq && rq->curr != p) {
#ifdef CONFIG_SMP
		if (rq->rt.overloaded && push_rt_task(rq) &&
		    /* Don't resched if we changed runqueues */
		    rq != task_rq(p))
			check_resched = 0;
#endif /* CONFIG_SMP */
		if (check_resched && p->prio < rq->curr->prio)
			resched_task(rq->curr);
	}
}

/*
 * Priority of the task has changed. This may cause
 * us to initiate a push or pull.
 */
static void
prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
{
	if (!p->on_rq)
		return;

	if (rq->curr == p) {
#ifdef CONFIG_SMP
		/*
		 * If our priority decreases while running, we
		 * may need to pull tasks to this runqueue.
		 */
		if (oldprio < p->prio)
			pull_rt_task(rq);
		/*
		 * If there's a higher priority task waiting to run
		 * then reschedule. Note, the above pull_rt_task
		 * can release the rq lock and p could migrate.
		 * Only reschedule if p is still on the same runqueue.
		 */
		if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
			resched_task(p);
#else
		/* For UP simply resched on drop of prio */
		if (oldprio < p->prio)
			resched_task(p);
#endif /* CONFIG_SMP */
	} else {
		/*
		 * This task is not running, but if it is
		 * greater than the current running task
		 * then reschedule.
		 */
		if (p->prio < rq->curr->prio)
			resched_task(rq->curr);
	}
}

static void watchdog(struct rq *rq, struct task_struct *p)
{
	unsigned long soft, hard;

	/* max may change after cur was read, this will be fixed next tick */
	soft = task_rlimit(p, RLIMIT_RTTIME);
	hard = task_rlimit_max(p, RLIMIT_RTTIME);

	if (soft != RLIM_INFINITY) {
		unsigned long next;

		p->rt.timeout++;
		next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
		if (p->rt.timeout > next)
			p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
	}
}

static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
{
	update_curr_rt(rq);

	watchdog(rq, p);

	/*
	 * RR tasks need a special form of timeslice management.
	 * FIFO tasks have no timeslices.
	 */
	if (p->policy != SCHED_RR)
		return;

	if (--p->rt.time_slice)
		return;

	p->rt.time_slice = DEF_TIMESLICE;

	/*
	 * Requeue to the end of queue if we are not the only element
	 * on the queue:
	 */
	if (p->rt.run_list.prev != p->rt.run_list.next) {
		requeue_task_rt(rq, p, 0);
		set_tsk_need_resched(p);
	}
}

static void set_curr_task_rt(struct rq *rq)
{
	struct task_struct *p = rq->curr;

	p->se.exec_start = rq->clock_task;

	/* The running task is never eligible for pushing */
	dequeue_pushable_task(rq, p);
}

static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
{
	/*
	 * Time slice is 0 for SCHED_FIFO tasks
	 */
	if (task->policy == SCHED_RR)
		return DEF_TIMESLICE;
	else
		return 0;
}

const struct sched_class rt_sched_class = {
	.next			= &fair_sched_class,
	.enqueue_task		= enqueue_task_rt,
	.dequeue_task		= dequeue_task_rt,
	.yield_task		= yield_task_rt,

	.check_preempt_curr	= check_preempt_curr_rt,

	.pick_next_task		= pick_next_task_rt,
	.put_prev_task		= put_prev_task_rt,

#ifdef CONFIG_SMP
	.select_task_rq		= select_task_rq_rt,

	.set_cpus_allowed	= set_cpus_allowed_rt,
	.rq_online		= rq_online_rt,
	.rq_offline		= rq_offline_rt,
	.pre_schedule		= pre_schedule_rt,
	.post_schedule		= post_schedule_rt,
	.task_woken		= task_woken_rt,
	.switched_from		= switched_from_rt,
#endif

	.set_curr_task		= set_curr_task_rt,
	.task_tick		= task_tick_rt,

	.get_rr_interval	= get_rr_interval_rt,

	.prio_changed		= prio_changed_rt,
	.switched_to		= switched_to_rt,
};

#ifdef CONFIG_SCHED_DEBUG
extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);

void print_rt_stats(struct seq_file *m, int cpu)
{
	rt_rq_iter_t iter;
	struct rt_rq *rt_rq;

	rcu_read_lock();
	for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
		print_rt_rq(m, cpu, rt_rq);
	rcu_read_unlock();
}
#endif /* CONFIG_SCHED_DEBUG */