1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Scheduler topology setup/handling methods 4 */ 5 6 #include <linux/sched/isolation.h> 7 #include <linux/sched/clock.h> 8 #include <linux/bsearch.h> 9 #include "sched.h" 10 11 DEFINE_MUTEX(sched_domains_mutex); 12 void sched_domains_mutex_lock(void) 13 { 14 mutex_lock(&sched_domains_mutex); 15 } 16 void sched_domains_mutex_unlock(void) 17 { 18 mutex_unlock(&sched_domains_mutex); 19 } 20 21 /* Protected by sched_domains_mutex: */ 22 static cpumask_var_t sched_domains_llc_id_allocmask; 23 static cpumask_var_t sched_domains_tmpmask; 24 static cpumask_var_t sched_domains_tmpmask2; 25 int max_lid; 26 27 static int __init sched_debug_setup(char *str) 28 { 29 sched_debug_verbose = true; 30 31 return 0; 32 } 33 early_param("sched_verbose", sched_debug_setup); 34 35 static inline bool sched_debug(void) 36 { 37 return sched_debug_verbose; 38 } 39 40 #define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name }, 41 const struct sd_flag_debug sd_flag_debug[] = { 42 #include <linux/sched/sd_flags.h> 43 }; 44 #undef SD_FLAG 45 46 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 47 struct cpumask *groupmask) 48 { 49 struct sched_group *group = sd->groups; 50 unsigned long flags = sd->flags; 51 unsigned int idx; 52 53 cpumask_clear(groupmask); 54 55 printk(KERN_DEBUG "%*s domain-%d: ", level, "", level); 56 printk(KERN_CONT "span=%*pbl level=%s\n", 57 cpumask_pr_args(sched_domain_span(sd)), sd->name); 58 59 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { 60 printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu); 61 } 62 if (group && !cpumask_test_cpu(cpu, sched_group_span(group))) { 63 printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); 64 } 65 66 for_each_set_bit(idx, &flags, __SD_FLAG_CNT) { 67 unsigned int flag = BIT(idx); 68 unsigned int meta_flags = sd_flag_debug[idx].meta_flags; 69 70 if ((meta_flags & SDF_SHARED_CHILD) && sd->child && 71 
!(sd->child->flags & flag)) 72 printk(KERN_ERR "ERROR: flag %s set here but not in child\n", 73 sd_flag_debug[idx].name); 74 75 if ((meta_flags & SDF_SHARED_PARENT) && sd->parent && 76 !(sd->parent->flags & flag)) 77 printk(KERN_ERR "ERROR: flag %s set here but not in parent\n", 78 sd_flag_debug[idx].name); 79 } 80 81 printk(KERN_DEBUG "%*s groups:", level + 1, ""); 82 do { 83 if (!group) { 84 printk("\n"); 85 printk(KERN_ERR "ERROR: group is NULL\n"); 86 break; 87 } 88 89 if (cpumask_empty(sched_group_span(group))) { 90 printk(KERN_CONT "\n"); 91 printk(KERN_ERR "ERROR: empty group\n"); 92 break; 93 } 94 95 if (!(sd->flags & SD_NUMA) && 96 cpumask_intersects(groupmask, sched_group_span(group))) { 97 printk(KERN_CONT "\n"); 98 printk(KERN_ERR "ERROR: repeated CPUs\n"); 99 break; 100 } 101 102 cpumask_or(groupmask, groupmask, sched_group_span(group)); 103 104 printk(KERN_CONT " %d:{ span=%*pbl", 105 group->sgc->id, 106 cpumask_pr_args(sched_group_span(group))); 107 108 if ((sd->flags & SD_NUMA) && 109 !cpumask_equal(group_balance_mask(group), sched_group_span(group))) { 110 printk(KERN_CONT " mask=%*pbl", 111 cpumask_pr_args(group_balance_mask(group))); 112 } 113 114 if (group->sgc->capacity != SCHED_CAPACITY_SCALE) 115 printk(KERN_CONT " cap=%lu", group->sgc->capacity); 116 117 if (group == sd->groups && sd->child && 118 !cpumask_equal(sched_domain_span(sd->child), 119 sched_group_span(group))) { 120 printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n"); 121 } 122 123 printk(KERN_CONT " }"); 124 125 group = group->next; 126 127 if (group != sd->groups) 128 printk(KERN_CONT ","); 129 130 } while (group != sd->groups); 131 printk(KERN_CONT "\n"); 132 133 if (!cpumask_equal(sched_domain_span(sd), groupmask)) 134 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 135 136 if (sd->parent && 137 !cpumask_subset(groupmask, sched_domain_span(sd->parent))) 138 printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n"); 139 return 0; 
140 } 141 142 static void sched_domain_debug(struct sched_domain *sd, int cpu) 143 { 144 int level = 0; 145 146 if (!sched_debug_verbose) 147 return; 148 149 if (!sd) { 150 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); 151 return; 152 } 153 154 printk(KERN_DEBUG "CPU%d attaching sched-domain(s):\n", cpu); 155 156 for (;;) { 157 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) 158 break; 159 level++; 160 sd = sd->parent; 161 if (!sd) 162 break; 163 } 164 } 165 166 /* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */ 167 #define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) | 168 static const unsigned int SD_DEGENERATE_GROUPS_MASK = 169 #include <linux/sched/sd_flags.h> 170 0; 171 #undef SD_FLAG 172 173 static int sd_degenerate(struct sched_domain *sd) 174 { 175 if (cpumask_weight(sched_domain_span(sd)) == 1) 176 return 1; 177 178 /* Following flags need at least 2 groups */ 179 if ((sd->flags & SD_DEGENERATE_GROUPS_MASK) && 180 (sd->groups != sd->groups->next)) 181 return 0; 182 183 /* Following flags don't use groups */ 184 if (sd->flags & (SD_WAKE_AFFINE)) 185 return 0; 186 187 return 1; 188 } 189 190 static int 191 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) 192 { 193 unsigned long cflags = sd->flags, pflags = parent->flags; 194 195 if (sd_degenerate(parent)) 196 return 1; 197 198 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) 199 return 0; 200 201 /* Flags needing groups don't count if only 1 group in parent */ 202 if (parent->groups == parent->groups->next) 203 pflags &= ~SD_DEGENERATE_GROUPS_MASK; 204 205 if (~cflags & pflags) 206 return 0; 207 208 return 1; 209 } 210 211 #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) 212 DEFINE_STATIC_KEY_FALSE(sched_energy_present); 213 static unsigned int sysctl_sched_energy_aware = 1; 214 static DEFINE_MUTEX(sched_energy_mutex); 215 static bool sched_energy_update; 216 217 
static bool sched_is_eas_possible(const struct cpumask *cpu_mask) 218 { 219 bool any_asym_capacity = false; 220 int i; 221 222 /* EAS is enabled for asymmetric CPU capacity topologies. */ 223 for_each_cpu(i, cpu_mask) { 224 if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, i))) { 225 any_asym_capacity = true; 226 break; 227 } 228 } 229 if (!any_asym_capacity) { 230 if (sched_debug()) { 231 pr_info("rd %*pbl: Checking EAS, CPUs do not have asymmetric capacities\n", 232 cpumask_pr_args(cpu_mask)); 233 } 234 return false; 235 } 236 237 /* EAS definitely does *not* handle SMT */ 238 if (sched_smt_active()) { 239 if (sched_debug()) { 240 pr_info("rd %*pbl: Checking EAS, SMT is not supported\n", 241 cpumask_pr_args(cpu_mask)); 242 } 243 return false; 244 } 245 246 if (!arch_scale_freq_invariant()) { 247 if (sched_debug()) { 248 pr_info("rd %*pbl: Checking EAS: frequency-invariant load tracking not yet supported", 249 cpumask_pr_args(cpu_mask)); 250 } 251 return false; 252 } 253 254 if (!cpufreq_ready_for_eas(cpu_mask)) { 255 if (sched_debug()) { 256 pr_info("rd %*pbl: Checking EAS: cpufreq is not ready\n", 257 cpumask_pr_args(cpu_mask)); 258 } 259 return false; 260 } 261 262 return true; 263 } 264 265 void rebuild_sched_domains_energy(void) 266 { 267 mutex_lock(&sched_energy_mutex); 268 sched_energy_update = true; 269 rebuild_sched_domains(); 270 sched_energy_update = false; 271 mutex_unlock(&sched_energy_mutex); 272 } 273 274 #ifdef CONFIG_PROC_SYSCTL 275 static int sched_energy_aware_handler(const struct ctl_table *table, int write, 276 void *buffer, size_t *lenp, loff_t *ppos) 277 { 278 int ret; 279 280 if (write && !capable(CAP_SYS_ADMIN)) 281 return -EPERM; 282 283 if (!sched_is_eas_possible(cpu_active_mask)) { 284 if (write) { 285 return -EOPNOTSUPP; 286 } else { 287 *lenp = 0; 288 return 0; 289 } 290 } 291 292 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 293 if (!ret && write) { 294 if (sysctl_sched_energy_aware != sched_energy_enabled()) 295 
rebuild_sched_domains_energy(); 296 } 297 298 return ret; 299 } 300 301 static const struct ctl_table sched_energy_aware_sysctls[] = { 302 { 303 .procname = "sched_energy_aware", 304 .data = &sysctl_sched_energy_aware, 305 .maxlen = sizeof(unsigned int), 306 .mode = 0644, 307 .proc_handler = sched_energy_aware_handler, 308 .extra1 = SYSCTL_ZERO, 309 .extra2 = SYSCTL_ONE, 310 }, 311 }; 312 313 static int __init sched_energy_aware_sysctl_init(void) 314 { 315 register_sysctl_init("kernel", sched_energy_aware_sysctls); 316 return 0; 317 } 318 319 late_initcall(sched_energy_aware_sysctl_init); 320 #endif /* CONFIG_PROC_SYSCTL */ 321 322 static void free_pd(struct perf_domain *pd) 323 { 324 struct perf_domain *tmp; 325 326 while (pd) { 327 tmp = pd->next; 328 kfree(pd); 329 pd = tmp; 330 } 331 } 332 333 static struct perf_domain *find_pd(struct perf_domain *pd, int cpu) 334 { 335 while (pd) { 336 if (cpumask_test_cpu(cpu, perf_domain_span(pd))) 337 return pd; 338 pd = pd->next; 339 } 340 341 return NULL; 342 } 343 344 static struct perf_domain *pd_init(int cpu) 345 { 346 struct em_perf_domain *obj = em_cpu_get(cpu); 347 struct perf_domain *pd; 348 349 if (!obj) { 350 if (sched_debug()) 351 pr_info("%s: no EM found for CPU%d\n", __func__, cpu); 352 return NULL; 353 } 354 355 pd = kzalloc_obj(*pd); 356 if (!pd) 357 return NULL; 358 pd->em_pd = obj; 359 360 return pd; 361 } 362 363 static void perf_domain_debug(const struct cpumask *cpu_map, 364 struct perf_domain *pd) 365 { 366 if (!sched_debug() || !pd) 367 return; 368 369 printk(KERN_DEBUG "root_domain %*pbl:", cpumask_pr_args(cpu_map)); 370 371 while (pd) { 372 printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_pstate=%d }", 373 cpumask_first(perf_domain_span(pd)), 374 cpumask_pr_args(perf_domain_span(pd)), 375 em_pd_nr_perf_states(pd->em_pd)); 376 pd = pd->next; 377 } 378 379 printk(KERN_CONT "\n"); 380 } 381 382 static void destroy_perf_domain_rcu(struct rcu_head *rp) 383 { 384 struct perf_domain *pd; 385 386 pd = 
container_of(rp, struct perf_domain, rcu); 387 free_pd(pd); 388 } 389 390 static void sched_energy_set(bool has_eas) 391 { 392 if (!has_eas && sched_energy_enabled()) { 393 if (sched_debug()) 394 pr_info("%s: stopping EAS\n", __func__); 395 static_branch_disable_cpuslocked(&sched_energy_present); 396 } else if (has_eas && !sched_energy_enabled()) { 397 if (sched_debug()) 398 pr_info("%s: starting EAS\n", __func__); 399 static_branch_enable_cpuslocked(&sched_energy_present); 400 } 401 } 402 403 /* 404 * EAS can be used on a root domain if it meets all the following conditions: 405 * 1. an Energy Model (EM) is available; 406 * 2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy. 407 * 3. no SMT is detected. 408 * 4. schedutil is driving the frequency of all CPUs of the rd; 409 * 5. frequency invariance support is present; 410 */ 411 static bool build_perf_domains(const struct cpumask *cpu_map) 412 { 413 int i; 414 struct perf_domain *pd = NULL, *tmp; 415 int cpu = cpumask_first(cpu_map); 416 struct root_domain *rd = cpu_rq(cpu)->rd; 417 418 if (!sysctl_sched_energy_aware) 419 goto free; 420 421 if (!sched_is_eas_possible(cpu_map)) 422 goto free; 423 424 for_each_cpu(i, cpu_map) { 425 /* Skip already covered CPUs. */ 426 if (find_pd(pd, i)) 427 continue; 428 429 /* Create the new pd and add it to the local list. */ 430 tmp = pd_init(i); 431 if (!tmp) 432 goto free; 433 tmp->next = pd; 434 pd = tmp; 435 } 436 437 perf_domain_debug(cpu_map, pd); 438 439 /* Attach the new list of performance domains to the root domain. 
*/ 440 tmp = rd->pd; 441 rcu_assign_pointer(rd->pd, pd); 442 if (tmp) 443 call_rcu(&tmp->rcu, destroy_perf_domain_rcu); 444 445 return !!pd; 446 447 free: 448 free_pd(pd); 449 tmp = rd->pd; 450 rcu_assign_pointer(rd->pd, NULL); 451 if (tmp) 452 call_rcu(&tmp->rcu, destroy_perf_domain_rcu); 453 454 return false; 455 } 456 #else /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL): */ 457 static void free_pd(struct perf_domain *pd) { } 458 #endif /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ 459 460 static void free_rootdomain(struct rcu_head *rcu) 461 { 462 struct root_domain *rd = container_of(rcu, struct root_domain, rcu); 463 464 cpupri_cleanup(&rd->cpupri); 465 cpudl_cleanup(&rd->cpudl); 466 free_cpumask_var(rd->dlo_mask); 467 free_cpumask_var(rd->rto_mask); 468 free_cpumask_var(rd->online); 469 free_cpumask_var(rd->span); 470 free_pd(rd->pd); 471 kfree(rd); 472 } 473 474 void rq_attach_root(struct rq *rq, struct root_domain *rd) 475 { 476 struct root_domain *old_rd = NULL; 477 struct rq_flags rf; 478 479 rq_lock_irqsave(rq, &rf); 480 481 if (rq->rd) { 482 old_rd = rq->rd; 483 484 if (cpumask_test_cpu(rq->cpu, old_rd->online)) 485 set_rq_offline(rq); 486 487 cpumask_clear_cpu(rq->cpu, old_rd->span); 488 489 /* 490 * If we don't want to free the old_rd yet then 491 * set old_rd to NULL to skip the freeing later 492 * in this function: 493 */ 494 if (!atomic_dec_and_test(&old_rd->refcount)) 495 old_rd = NULL; 496 } 497 498 atomic_inc(&rd->refcount); 499 rq->rd = rd; 500 501 cpumask_set_cpu(rq->cpu, rd->span); 502 if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) 503 set_rq_online(rq); 504 505 /* 506 * Because the rq is not a task, dl_add_task_root_domain() did not 507 * move the fair server bw to the rd if it already started. 508 * Add it now. 
509 */ 510 if (rq->fair_server.dl_server) 511 __dl_server_attach_root(&rq->fair_server, rq); 512 513 #ifdef CONFIG_SCHED_CLASS_EXT 514 if (rq->ext_server.dl_server) 515 __dl_server_attach_root(&rq->ext_server, rq); 516 #endif 517 518 rq_unlock_irqrestore(rq, &rf); 519 520 if (old_rd) 521 call_rcu(&old_rd->rcu, free_rootdomain); 522 } 523 524 void sched_get_rd(struct root_domain *rd) 525 { 526 atomic_inc(&rd->refcount); 527 } 528 529 void sched_put_rd(struct root_domain *rd) 530 { 531 if (!atomic_dec_and_test(&rd->refcount)) 532 return; 533 534 call_rcu(&rd->rcu, free_rootdomain); 535 } 536 537 static int init_rootdomain(struct root_domain *rd) 538 { 539 if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL)) 540 goto out; 541 if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL)) 542 goto free_span; 543 if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) 544 goto free_online; 545 if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) 546 goto free_dlo_mask; 547 548 #ifdef HAVE_RT_PUSH_IPI 549 rd->rto_cpu = -1; 550 raw_spin_lock_init(&rd->rto_lock); 551 rd->rto_push_work = IRQ_WORK_INIT_HARD(rto_push_irq_work_func); 552 #endif 553 554 rd->visit_cookie = 0; 555 init_dl_bw(&rd->dl_bw); 556 if (cpudl_init(&rd->cpudl) != 0) 557 goto free_rto_mask; 558 559 if (cpupri_init(&rd->cpupri) != 0) 560 goto free_cpudl; 561 return 0; 562 563 free_cpudl: 564 cpudl_cleanup(&rd->cpudl); 565 free_rto_mask: 566 free_cpumask_var(rd->rto_mask); 567 free_dlo_mask: 568 free_cpumask_var(rd->dlo_mask); 569 free_online: 570 free_cpumask_var(rd->online); 571 free_span: 572 free_cpumask_var(rd->span); 573 out: 574 return -ENOMEM; 575 } 576 577 /* 578 * By default the system creates a single root-domain with all CPUs as 579 * members (mimicking the global state we have today). 
580 */ 581 struct root_domain def_root_domain; 582 583 void __init init_defrootdomain(void) 584 { 585 init_rootdomain(&def_root_domain); 586 587 atomic_set(&def_root_domain.refcount, 1); 588 } 589 590 static struct root_domain *alloc_rootdomain(void) 591 { 592 struct root_domain *rd; 593 594 rd = kzalloc_obj(*rd); 595 if (!rd) 596 return NULL; 597 598 if (init_rootdomain(rd) != 0) { 599 kfree(rd); 600 return NULL; 601 } 602 603 return rd; 604 } 605 606 static void free_sched_groups(struct sched_group *sg, int free_sgc) 607 { 608 struct sched_group *tmp, *first; 609 610 if (!sg) 611 return; 612 613 first = sg; 614 do { 615 tmp = sg->next; 616 617 if (free_sgc && atomic_dec_and_test(&sg->sgc->ref)) 618 kfree(sg->sgc); 619 620 if (atomic_dec_and_test(&sg->ref)) 621 kfree(sg); 622 sg = tmp; 623 } while (sg != first); 624 } 625 626 static void free_sched_domain_shared(struct sched_domain_shared *sds) 627 { 628 if (sds && atomic_dec_and_test(&sds->ref)) 629 kfree(sds); 630 } 631 632 static void destroy_sched_domain(struct sched_domain *sd) 633 { 634 /* 635 * A normal sched domain may have multiple group references, an 636 * overlapping domain, having private groups, only one. Iterate, 637 * dropping group/capacity references, freeing where none remain. 
638 */ 639 free_sched_groups(sd->groups, 1); 640 free_sched_domain_shared(sd->shared); 641 642 #ifdef CONFIG_SCHED_CACHE 643 /* only the bottom sd has llc_counts array */ 644 kfree(sd->llc_counts); 645 #endif 646 kfree(sd); 647 } 648 649 static void destroy_sched_domains_rcu(struct rcu_head *rcu) 650 { 651 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); 652 653 while (sd) { 654 struct sched_domain *parent = sd->parent; 655 destroy_sched_domain(sd); 656 sd = parent; 657 } 658 } 659 660 static void destroy_sched_domains(struct sched_domain *sd) 661 { 662 if (sd) 663 call_rcu(&sd->rcu, destroy_sched_domains_rcu); 664 } 665 666 /* 667 * Keep a special pointer to the highest sched_domain that has SD_SHARE_LLC set 668 * (Last Level Cache Domain) for this allows us to avoid some pointer chasing 669 * select_idle_sibling(). 670 * 671 * Also keep a unique ID per domain (we use the first CPU number in the cpumask 672 * of the domain), this allows us to quickly tell if two CPUs are in the same 673 * cache domain, see cpus_share_cache(). 
*/
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
DEFINE_PER_CPU(int, sd_llc_size);
DEFINE_PER_CPU(int, sd_llc_id) = -1;
DEFINE_PER_CPU(int, sd_share_id);
DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_balance_shared);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);

DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
DEFINE_STATIC_KEY_FALSE(sched_cluster_active);

/*
 * Refresh the per-CPU shortcut pointers and values of @cpu (sd_llc,
 * sd_llc_size, sd_llc_shared, sd_share_id, sd_numa, sd_asym_packing,
 * sd_asym_cpucapacity, sd_balance_shared) from the domain hierarchy
 * currently reachable through highest_flag_domain()/lowest_flag_domain().
 */
static void update_top_cache_domain(int cpu)
{
	struct sched_domain_shared *sds = NULL;
	struct sched_domain *sd;
	int id = cpu;
	int size = 1;

	sd = highest_flag_domain(cpu, SD_SHARE_LLC);
	if (sd) {
		id = cpumask_first(sched_domain_span(sd));
		size = cpumask_weight(sched_domain_span(sd));

		/* If sd_llc exists, sd_llc_shared should exist too. */
		WARN_ON_ONCE(!sd->shared);
		sds = sd->shared;
	}

	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
	per_cpu(sd_llc_size, cpu) = size;
	rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);

	sd = lowest_flag_domain(cpu, SD_CLUSTER);
	if (sd)
		id = cpumask_first(sched_domain_span(sd));

	/*
	 * This assignment should be placed after the sd_llc_id as
	 * we want this ID to equal the cluster ID on cluster machines
	 * but the LLC ID on non-cluster machines.
	 *
	 * NOTE(review): sd_llc_id is not written anywhere in this function,
	 * so the ordering remark above may be stale -- confirm where
	 * sd_llc_id is assigned.
	 */
	per_cpu(sd_share_id, cpu) = id;

	sd = lowest_flag_domain(cpu, SD_NUMA);
	rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);

	sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
	rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd);

	sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL);
	/*
	 * The shared object is attached to sd_asym_cpucapacity only when the
	 * asym domain is non-overlapping (i.e., not built from SD_NUMA).
	 * On overlapping (NUMA) asym domains we fall back to letting the
	 * SD_SHARE_LLC path own the shared object, so sd->shared may be NULL
	 * here.
	 */
	if (sd && sd->shared)
		sds = sd->shared;

	rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
	/* sds is the asym domain's shared object if it has one, else the LLC one. */
	rcu_assign_pointer(per_cpu(sd_balance_shared, cpu), sds);
}

/*
 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
 * hold the hotplug lock.
 *
 * Degenerate levels are unlinked and destroyed first; the runqueue is then
 * attached to @rd, the new hierarchy is published through rq->sd, the old
 * one is queued for RCU destruction and the per-CPU cache-domain shortcuts
 * are refreshed.
 */
static void
cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	struct sched_domain *tmp;

	/* Remove the sched domains which do not contribute to scheduling. */
	for (tmp = sd; tmp; ) {
		struct sched_domain *parent = tmp->parent;
		if (!parent)
			break;

		if (sd_parent_degenerate(tmp, parent)) {
			/* Unlink @parent; tmp stays put so its new parent is checked next. */
			tmp->parent = parent->parent;

			/* Pick reference to parent->shared. */
			if (parent->shared) {
				/*
				 * It is safe to free a sd->shared that
				 * has not been published yet. If a
				 * sd->shared was published, the refcount
				 * will end up being non-zero and it will
				 * not be freed here.
				 */
				free_sched_domain_shared(tmp->shared);
				tmp->shared = parent->shared;
				parent->shared = NULL;
			}

			if (parent->parent) {
				parent->parent->child = tmp;
				parent->parent->groups->flags = tmp->flags;
			}

			/*
			 * Transfer SD_PREFER_SIBLING down in case of a
			 * degenerate parent; the spans match for this
			 * so the property transfers.
			 */
			if (parent->flags & SD_PREFER_SIBLING)
				tmp->flags |= SD_PREFER_SIBLING;
			destroy_sched_domain(parent);
		} else
			tmp = tmp->parent;
	}

	/* The base domain itself may be degenerate: drop it and start one level up. */
	if (sd && sd_degenerate(sd)) {
		tmp = sd;
		sd = sd->parent;

		if (sd) {
			struct sched_group *sg = sd->groups;

#ifdef CONFIG_SCHED_CACHE
			/* move buffer to parent as child is being destroyed */
			sd->llc_counts = tmp->llc_counts;
			sd->llc_max = tmp->llc_max;
			sd->llc_bytes = tmp->llc_bytes;
			/* make sure destroy_sched_domain() does not free it */
			tmp->llc_counts = NULL;
			tmp->llc_max = 0;
			tmp->llc_bytes = 0;
#endif
			/*
			 * sched groups hold the flags of the child sched
			 * domain for convenience. Clear such flags since
			 * the child is being destroyed.
			 *
			 * NOTE(review): sg is never advanced inside this
			 * loop, so the body runs exactly once and only the
			 * flags of sd->groups are cleared -- confirm whether
			 * the remaining groups were meant to be visited.
			 */
			do {
				sg->flags = 0;
			} while (sg != sd->groups);

			sd->child = NULL;
		}

		destroy_sched_domain(tmp);
	}

	sched_domain_debug(sd, cpu);

	rq_attach_root(rq, rd);
	tmp = rq->sd;
	rcu_assign_pointer(rq->sd, sd);
	dirty_sched_domain_sysctl(cpu);
	/* tmp is the previously attached hierarchy; freed after a grace period. */
	destroy_sched_domains(tmp);

	update_top_cache_domain(cpu);
}

/* Per-CPU pointer tables and root domain grouped into one structure. */
struct s_data {
	struct sched_domain_shared * __percpu *sds;
	struct sched_domain * __percpu *sd;
	struct root_domain	*rd;
};

enum s_alloc {
	sa_rootdomain,
	sa_sd,
	sa_sd_shared,
	sa_sd_storage,
	sa_none,
};

#ifdef CONFIG_SCHED_CACHE
/* hardware support for cache aware scheduling */
DEFINE_STATIC_KEY_FALSE(sched_cache_present);
/*
 * Indicator of whether cache aware scheduling
 * is active, used by the scheduler.
 */
DEFINE_STATIC_KEY_FALSE(sched_cache_active);
/* user wants cache aware scheduling [0 or 1] */
int sysctl_sched_cache_user = 1;

/*
 * Get the effective LLC size in bytes that @cpu's bottom sched_domain
 * can use.
A CPU within a cpuset partition can only use a proportion 862 * of the physical LLC, scaled by the ratio of the partition's span 863 * weight to the hardware LLC sharing weight. @sd should be the 864 * topmost domain with SD_SHARE_LLC. 865 * 866 * Returns 0 if cacheinfo is not yet populated. This happens during 867 * early boot when build_sched_domains() runs before the generic 868 * cacheinfo framework has been initialized (cacheinfo_cpu_online() 869 * is a device_initcall cpuhp callback). In that case, 870 * cacheinfo_cpu_online() will later call sched_update_llc_bytes() 871 * to fill in the bottom domain's llc_bytes once the cache attributes 872 * are available. 873 */ 874 static unsigned long get_effective_llc_bytes(int cpu, 875 struct sched_domain *sd) 876 { 877 struct cacheinfo *ci; 878 unsigned int hw_weight; 879 880 ci = get_cpu_cacheinfo_llc(cpu); 881 if (!ci) 882 return 0; 883 884 hw_weight = cpumask_weight(&ci->shared_cpu_map); 885 if (!hw_weight) 886 return 0; 887 888 return div_u64((u64)ci->size * sd->span_weight, hw_weight); 889 } 890 891 static bool alloc_sd_llc(const struct cpumask *cpu_map, 892 struct s_data *d) 893 { 894 struct sched_domain *sd, *top_llc, *parent; 895 unsigned int *p; 896 int i; 897 898 for_each_cpu(i, cpu_map) { 899 sd = *per_cpu_ptr(d->sd, i); 900 if (!sd) 901 goto err; 902 903 p = kcalloc_node(max_lid + 1, sizeof(unsigned int), 904 GFP_KERNEL, cpu_to_node(i)); 905 if (!p) 906 goto err; 907 908 top_llc = sd; 909 /* 910 * Find the topmost SD_SHARE_LLC domain. 911 * Not yet attached to the CPU, so per_cpu(sd_llc, i) 912 * can not be used. 
913 */ 914 while ((parent = rcu_dereference_protected(top_llc->parent, true)) && 915 (parent->flags & SD_SHARE_LLC)) 916 top_llc = parent; 917 918 if (top_llc->flags & SD_SHARE_LLC) { 919 sd->llc_max = max_lid + 1; 920 sd->llc_counts = p; 921 sd->llc_bytes = get_effective_llc_bytes(i, top_llc); 922 } else { 923 /* avoid memory leak */ 924 kfree(p); 925 } 926 } 927 928 return true; 929 err: 930 for_each_cpu(i, cpu_map) { 931 sd = *per_cpu_ptr(d->sd, i); 932 if (sd) { 933 kfree(sd->llc_counts); 934 sd->llc_counts = NULL; 935 sd->llc_max = 0; 936 sd->llc_bytes = 0; 937 } 938 } 939 940 return false; 941 } 942 943 /* 944 * Enable/disable cache aware scheduling according to 945 * user input and the presence of hardware support. 946 */ 947 static void _sched_cache_active_set(void) 948 { 949 lockdep_assert_cpus_held(); 950 lockdep_assert_held(&sched_domains_mutex); 951 952 /* hardware does not support */ 953 if (!static_branch_likely(&sched_cache_present)) { 954 static_branch_disable_cpuslocked(&sched_cache_active); 955 if (sched_debug()) 956 pr_info("%s: cache aware scheduling not supported on this platform\n", __func__); 957 return; 958 } 959 960 /* 961 * user wants it or not ? 962 * TBD: read before writing the static key. 963 * It is not in the critical path, leave as-is 964 * for now. 965 */ 966 if (sysctl_sched_cache_user) { 967 static_branch_enable_cpuslocked(&sched_cache_active); 968 if (sched_debug()) 969 pr_info("%s: enabling cache aware scheduling\n", __func__); 970 } else { 971 static_branch_disable_cpuslocked(&sched_cache_active); 972 if (sched_debug()) 973 pr_info("%s: disabling cache aware scheduling\n", __func__); 974 } 975 } 976 977 /* used by debugfs */ 978 void sched_cache_active_set(void) 979 { 980 cpus_read_lock(); 981 sched_domains_mutex_lock(); 982 _sched_cache_active_set(); 983 sched_domains_mutex_unlock(); 984 cpus_read_unlock(); 985 } 986 987 /* 988 * Update the bottom sched_domain's llc_bytes for @cpu and all its 989 * LLC siblings. 
Called from cacheinfo_cpu_online() or 990 * cacheinfo_cpu_pre_down() with cpu hotplug lock held. 991 * 992 * Note: get_effective_llc_bytes() returns 0 on PowerPC. 993 * thus cache aware scheduling is disabled on PowerPC for 994 * now. PowerPC does not use the generic cacheinfo framework -- 995 * it has its own cacheinfo with a separate struct cache hierarchy 996 * and does not populates the per-CPU struct cpu_cacheinfo array 997 * that get_cpu_cacheinfo_llc() reads. 998 */ 999 void sched_update_llc_bytes(unsigned int cpu) 1000 { 1001 struct sched_domain *sd, *sdp; 1002 unsigned int i; 1003 1004 sched_domains_mutex_lock(); 1005 1006 sdp = rcu_dereference_sched_domain(per_cpu(sd_llc, cpu)); 1007 if (!sdp) 1008 goto unlock; 1009 1010 /* 1011 * ci->shared_cpu_map is built incrementally as CPUs come 1012 * online, so the first CPU in an LLC initially sees 1013 * hw_weight == 1 and computes an inflated llc_bytes in 1014 * get_effective_llc_bytes(). Re-evaluating every LLC 1015 * sibling on each online event corrects this once the full 1016 * shared_cpu_map is known. 1017 */ 1018 for_each_cpu(i, sched_domain_span(sdp)) { 1019 sd = rcu_dereference_sched_domain(cpu_rq(i)->sd); 1020 if (sd) 1021 sd->llc_bytes = get_effective_llc_bytes(i, sdp); 1022 } 1023 1024 unlock: 1025 sched_domains_mutex_unlock(); 1026 } 1027 1028 static void sched_cache_set(bool has_multi_llcs) 1029 { 1030 /* 1031 * TBD: check before writing to it. sched domain rebuild 1032 * is not in the critical path, leave as-is for now. 
1033 */ 1034 if (has_multi_llcs) 1035 static_branch_enable_cpuslocked(&sched_cache_present); 1036 else 1037 static_branch_disable_cpuslocked(&sched_cache_present); 1038 1039 _sched_cache_active_set(); 1040 } 1041 #else 1042 static bool alloc_sd_llc(const struct cpumask *cpu_map, 1043 struct s_data *d) 1044 { 1045 return false; 1046 } 1047 static inline void sched_cache_set(bool has_multi_llcs) { } 1048 #endif 1049 1050 /* 1051 * Return true if @sd belongs to an LLC group whose enclosing 1052 * partition spans more than one LLC. @sd must be the topmost 1053 * SD_SHARE_LLC domain. 1054 * 1055 * Any duplicated parent domains with the same span as @sd are 1056 * skipped: before cpu_attach_domain() degeneration these still 1057 * exist, after degeneration the loop is a no-op. This makes the 1058 * helper usable both during sched domain build and against an 1059 * already-attached domain tree. 1060 * 1061 * Note: For systems with a single LLC per node, cache-aware 1062 * scheduling is still enabled when multiple nodes exist. 1063 * However, NUMA balancing decisions take precedence over 1064 * cache-aware scheduling. Conversely, if there is only one 1065 * LLC per partition, cache-aware scheduling should be disabled. 1066 */ 1067 static bool sd_in_multi_llcs(struct sched_domain *sd) 1068 { 1069 struct sched_domain *sdp = sd->parent; 1070 1071 /* it does not make sense to aggregate to 1 CPU */ 1072 if (sd->span_weight == 1) 1073 return false; 1074 1075 while (sdp && sdp->span_weight == sd->span_weight) 1076 sdp = sdp->parent; 1077 1078 return !!sdp; 1079 } 1080 1081 /* 1082 * Return the canonical balance CPU for this group, this is the first CPU 1083 * of this group that's also in the balance mask. 1084 * 1085 * The balance mask are all those CPUs that could actually end up at this 1086 * group. See build_balance_mask(). 1087 * 1088 * Also see should_we_balance(). 
1089 */ 1090 int group_balance_cpu(struct sched_group *sg) 1091 { 1092 return cpumask_first(group_balance_mask(sg)); 1093 } 1094 1095 1096 /* 1097 * NUMA topology (first read the regular topology blurb below) 1098 * 1099 * Given a node-distance table, for example: 1100 * 1101 * node 0 1 2 3 1102 * 0: 10 20 30 20 1103 * 1: 20 10 20 30 1104 * 2: 30 20 10 20 1105 * 3: 20 30 20 10 1106 * 1107 * which represents a 4 node ring topology like: 1108 * 1109 * 0 ----- 1 1110 * | | 1111 * | | 1112 * | | 1113 * 3 ----- 2 1114 * 1115 * We want to construct domains and groups to represent this. The way we go 1116 * about doing this is to build the domains on 'hops'. For each NUMA level we 1117 * construct the mask of all nodes reachable in @level hops. 1118 * 1119 * For the above NUMA topology that gives 3 levels: 1120 * 1121 * NUMA-2 0-3 0-3 0-3 0-3 1122 * groups: {0-1,3},{1-3} {0-2},{0,2-3} {1-3},{0-1,3} {0,2-3},{0-2} 1123 * 1124 * NUMA-1 0-1,3 0-2 1-3 0,2-3 1125 * groups: {0},{1},{3} {0},{1},{2} {1},{2},{3} {0},{2},{3} 1126 * 1127 * NUMA-0 0 1 2 3 1128 * 1129 * 1130 * As can be seen; things don't nicely line up as with the regular topology. 1131 * When we iterate a domain in child domain chunks some nodes can be 1132 * represented multiple times -- hence the "overlap" naming for this part of 1133 * the topology. 1134 * 1135 * In order to minimize this overlap, we only build enough groups to cover the 1136 * domain. For instance Node-0 NUMA-2 would only get groups: 0-1,3 and 1-3. 1137 * 1138 * Because: 1139 * 1140 * - the first group of each domain is its child domain; this 1141 * gets us the first 0-1,3 1142 * - the only uncovered node is 2, who's child domain is 1-3. 1143 * 1144 * However, because of the overlap, computing a unique CPU for each group is 1145 * more complicated. 
* Consider for instance the groups of NODE-1 NUMA-2, both
 * groups include the CPUs of Node-0, while those CPUs would not in fact ever
 * end up at those groups (they would end up in group: 0-1,3).
 *
 * To correct this we have to introduce the group balance mask. This mask
 * will contain those CPUs in the group that can reach this group given the
 * (child) domain tree.
 *
 * With this we can once again compute balance_cpu and sched_group_capacity
 * relations.
 *
 * XXX include words on how balance_cpu is unique and therefore can be
 * used for sched_group_capacity links.
 *
 *
 * Another 'interesting' topology is:
 *
 *   node   0   1   2   3
 *     0:  10  20  20  30
 *     1:  20  10  20  20
 *     2:  20  20  10  20
 *     3:  30  20  20  10
 *
 * Which looks a little like:
 *
 *   0 ----- 1
 *   |     / |
 *   |   /   |
 *   | /     |
 *   2 ----- 3
 *
 * This topology is asymmetric, nodes 1,2 are fully connected, but nodes 0,3
 * are not.
 *
 * This leads to a few particularly weird cases where the sched_domain's are
 * not of the same number for each CPU. Consider:
 *
 * NUMA-2	0-3						0-3
 *  groups:	{0-2},{1-3}					{1-3},{0-2}
 *
 * NUMA-1	0-2		0-3		0-3		1-3
 *
 * NUMA-0	0		1		2		3
 *
 */


/*
 * Build the balance mask; it contains only those CPUs that can arrive at this
 * group and should be considered to continue balancing.
 *
 * We do this during the group creation pass, therefore the group information
 * isn't complete yet, however since each group represents a (child) domain we
 * can fully construct this using the sched_domain bits (which are already
 * complete).
1200 */ 1201 static void 1202 build_balance_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask) 1203 { 1204 const struct cpumask *sg_span = sched_group_span(sg); 1205 struct sd_data *sdd = sd->private; 1206 struct sched_domain *sibling; 1207 int i; 1208 1209 cpumask_clear(mask); 1210 1211 for_each_cpu(i, sg_span) { 1212 sibling = *per_cpu_ptr(sdd->sd, i); 1213 1214 /* 1215 * Can happen in the asymmetric case, where these siblings are 1216 * unused. The mask will not be empty because those CPUs that 1217 * do have the top domain _should_ span the domain. 1218 */ 1219 if (!sibling->child) 1220 continue; 1221 1222 /* If we would not end up here, we can't continue from here */ 1223 if (!cpumask_equal(sg_span, sched_domain_span(sibling->child))) 1224 continue; 1225 1226 cpumask_set_cpu(i, mask); 1227 } 1228 1229 /* We must not have empty masks here */ 1230 WARN_ON_ONCE(cpumask_empty(mask)); 1231 } 1232 1233 /* 1234 * XXX: This creates per-node group entries; since the load-balancer will 1235 * immediately access remote memory to construct this group's load-balance 1236 * statistics having the groups node local is of dubious benefit. 
1237 */ 1238 static struct sched_group * 1239 build_group_from_child_sched_domain(struct sched_domain *sd, int cpu) 1240 { 1241 struct sched_group *sg; 1242 struct cpumask *sg_span; 1243 1244 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 1245 GFP_KERNEL, cpu_to_node(cpu)); 1246 1247 if (!sg) 1248 return NULL; 1249 1250 sg_span = sched_group_span(sg); 1251 if (sd->child) { 1252 cpumask_copy(sg_span, sched_domain_span(sd->child)); 1253 sg->flags = sd->child->flags; 1254 } else { 1255 cpumask_copy(sg_span, sched_domain_span(sd)); 1256 } 1257 1258 atomic_inc(&sg->ref); 1259 return sg; 1260 } 1261 1262 static void init_overlap_sched_group(struct sched_domain *sd, 1263 struct sched_group *sg) 1264 { 1265 struct cpumask *mask = sched_domains_tmpmask2; 1266 struct sd_data *sdd = sd->private; 1267 struct cpumask *sg_span; 1268 int cpu; 1269 1270 build_balance_mask(sd, sg, mask); 1271 cpu = cpumask_first(mask); 1272 1273 sg->sgc = *per_cpu_ptr(sdd->sgc, cpu); 1274 if (atomic_inc_return(&sg->sgc->ref) == 1) 1275 cpumask_copy(group_balance_mask(sg), mask); 1276 else 1277 WARN_ON_ONCE(!cpumask_equal(group_balance_mask(sg), mask)); 1278 1279 /* 1280 * Initialize sgc->capacity such that even if we mess up the 1281 * domains and no possible iteration will get us here, we won't 1282 * die on a /0 trap. 
1283 */ 1284 sg_span = sched_group_span(sg); 1285 sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); 1286 sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; 1287 sg->sgc->max_capacity = SCHED_CAPACITY_SCALE; 1288 } 1289 1290 static struct sched_domain * 1291 find_descended_sibling(struct sched_domain *sd, struct sched_domain *sibling) 1292 { 1293 /* 1294 * The proper descendant would be the one whose child won't span out 1295 * of sd 1296 */ 1297 while (sibling->child && 1298 !cpumask_subset(sched_domain_span(sibling->child), 1299 sched_domain_span(sd))) 1300 sibling = sibling->child; 1301 1302 /* 1303 * As we are referencing sgc across different topology level, we need 1304 * to go down to skip those sched_domains which don't contribute to 1305 * scheduling because they will be degenerated in cpu_attach_domain 1306 */ 1307 while (sibling->child && 1308 cpumask_equal(sched_domain_span(sibling->child), 1309 sched_domain_span(sibling))) 1310 sibling = sibling->child; 1311 1312 return sibling; 1313 } 1314 1315 static int 1316 build_overlap_sched_groups(struct sched_domain *sd, int cpu) 1317 { 1318 struct sched_group *first = NULL, *last = NULL, *sg; 1319 const struct cpumask *span = sched_domain_span(sd); 1320 struct cpumask *covered = sched_domains_tmpmask; 1321 struct sd_data *sdd = sd->private; 1322 struct sched_domain *sibling; 1323 int i; 1324 1325 cpumask_clear(covered); 1326 1327 for_each_cpu_wrap(i, span, cpu) { 1328 struct cpumask *sg_span; 1329 1330 if (cpumask_test_cpu(i, covered)) 1331 continue; 1332 1333 sibling = *per_cpu_ptr(sdd->sd, i); 1334 1335 /* 1336 * Asymmetric node setups can result in situations where the 1337 * domain tree is of unequal depth, make sure to skip domains 1338 * that already cover the entire range. 1339 * 1340 * In that case build_sched_domains() will have terminated the 1341 * iteration early and our sibling sd spans will be empty. 1342 * Domains should always include the CPU they're built on, so 1343 * check that. 
1344 */ 1345 if (!cpumask_test_cpu(i, sched_domain_span(sibling))) 1346 continue; 1347 1348 /* 1349 * Usually we build sched_group by sibling's child sched_domain 1350 * But for machines whose NUMA diameter are 3 or above, we move 1351 * to build sched_group by sibling's proper descendant's child 1352 * domain because sibling's child sched_domain will span out of 1353 * the sched_domain being built as below. 1354 * 1355 * Smallest diameter=3 topology is: 1356 * 1357 * node 0 1 2 3 1358 * 0: 10 20 30 40 1359 * 1: 20 10 20 30 1360 * 2: 30 20 10 20 1361 * 3: 40 30 20 10 1362 * 1363 * 0 --- 1 --- 2 --- 3 1364 * 1365 * NUMA-3 0-3 N/A N/A 0-3 1366 * groups: {0-2},{1-3} {1-3},{0-2} 1367 * 1368 * NUMA-2 0-2 0-3 0-3 1-3 1369 * groups: {0-1},{1-3} {0-2},{2-3} {1-3},{0-1} {2-3},{0-2} 1370 * 1371 * NUMA-1 0-1 0-2 1-3 2-3 1372 * groups: {0},{1} {1},{2},{0} {2},{3},{1} {3},{2} 1373 * 1374 * NUMA-0 0 1 2 3 1375 * 1376 * The NUMA-2 groups for nodes 0 and 3 are obviously buggered, as the 1377 * group span isn't a subset of the domain span. 1378 */ 1379 if (sibling->child && 1380 !cpumask_subset(sched_domain_span(sibling->child), span)) 1381 sibling = find_descended_sibling(sd, sibling); 1382 1383 sg = build_group_from_child_sched_domain(sibling, cpu); 1384 if (!sg) 1385 goto fail; 1386 1387 sg_span = sched_group_span(sg); 1388 cpumask_or(covered, covered, sg_span); 1389 1390 init_overlap_sched_group(sibling, sg); 1391 1392 if (!first) 1393 first = sg; 1394 if (last) 1395 last->next = sg; 1396 last = sg; 1397 last->next = first; 1398 } 1399 sd->groups = first; 1400 1401 return 0; 1402 1403 fail: 1404 free_sched_groups(first, 0); 1405 1406 return -ENOMEM; 1407 } 1408 1409 1410 /* 1411 * Package topology (also see the load-balance blurb in fair.c) 1412 * 1413 * The scheduler builds a tree structure to represent a number of important 1414 * topology features. 
 * By default (default_topology[]) these include:
 *
 *  - Simultaneous multithreading (SMT)
 *  - Multi-Core Cache (MC)
 *  - Package (PKG)
 *
 * Where the last one more or less denotes everything up to a NUMA node.
 *
 * The tree consists of 3 primary data structures:
 *
 *      sched_domain -> sched_group -> sched_group_capacity
 *          ^ ^             ^ ^
 *          `-'             `-'
 *
 * The sched_domains are per-CPU and have a two way link (parent & child) and
 * denote the ever growing mask of CPUs belonging to that level of topology.
 *
 * Each sched_domain has a circular (double) linked list of sched_groups, each
 * denoting the domains of the level below (or individual CPUs in case of the
 * first domain level). The sched_group linked by a sched_domain includes the
 * CPU of that sched_domain [*].
 *
 * Take for instance a 2 threaded, 2 core, 2 cache cluster part:
 *
 * CPU   0   1   2   3   4   5   6   7
 *
 * PKG  [                             ]
 * MC   [             ] [             ]
 * SMT  [     ] [     ] [     ] [     ]
 *
 *  - or -
 *
 * PKG  0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7
 * MC   0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7
 * SMT  0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7
 *
 * CPU   0   1   2   3   4   5   6   7
 *
 * One way to think about it is: sched_domain moves you up and down among these
 * topology levels, while sched_group moves you sideways through it, at child
 * domain granularity.
 *
 * sched_group_capacity ensures each unique sched_group has shared storage.
 *
 * There are two related construction problems, both require a CPU that
 * uniquely identifies each group (for a given domain):
 *
 *  - The first is the balance_cpu (see should_we_balance() and the
 *    load-balance blurb in fair.c); for each group we only want 1 CPU to
 *    continue balancing at a higher domain.
 *
 *  - The second is the sched_group_capacity; we want all identical groups
 *    to share a single sched_group_capacity.
1467 * 1468 * Since these topologies are exclusive by construction. That is, its 1469 * impossible for an SMT thread to belong to multiple cores, and cores to 1470 * be part of multiple caches. There is a very clear and unique location 1471 * for each CPU in the hierarchy. 1472 * 1473 * Therefore computing a unique CPU for each group is trivial (the iteration 1474 * mask is redundant and set all 1s; all CPUs in a group will end up at _that_ 1475 * group), we can simply pick the first CPU in each group. 1476 * 1477 * 1478 * [*] in other words, the first group of each domain is its child domain. 1479 */ 1480 1481 static struct sched_group *get_group(int cpu, struct sd_data *sdd) 1482 { 1483 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); 1484 struct sched_domain *child = sd->child; 1485 struct sched_group *sg; 1486 bool already_visited; 1487 1488 if (child) 1489 cpu = cpumask_first(sched_domain_span(child)); 1490 1491 sg = *per_cpu_ptr(sdd->sg, cpu); 1492 sg->sgc = *per_cpu_ptr(sdd->sgc, cpu); 1493 1494 /* Increase refcounts for claim_allocations: */ 1495 already_visited = atomic_inc_return(&sg->ref) > 1; 1496 /* sgc visits should follow a similar trend as sg */ 1497 WARN_ON(already_visited != (atomic_inc_return(&sg->sgc->ref) > 1)); 1498 1499 /* If we have already visited that group, it's already initialized. 
*/ 1500 if (already_visited) 1501 return sg; 1502 1503 if (child) { 1504 cpumask_copy(sched_group_span(sg), sched_domain_span(child)); 1505 cpumask_copy(group_balance_mask(sg), sched_group_span(sg)); 1506 sg->flags = child->flags; 1507 } else { 1508 cpumask_set_cpu(cpu, sched_group_span(sg)); 1509 cpumask_set_cpu(cpu, group_balance_mask(sg)); 1510 } 1511 1512 sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg)); 1513 sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; 1514 sg->sgc->max_capacity = SCHED_CAPACITY_SCALE; 1515 1516 return sg; 1517 } 1518 1519 /* 1520 * build_sched_groups will build a circular linked list of the groups 1521 * covered by the given span, will set each group's ->cpumask correctly, 1522 * and will initialize their ->sgc. 1523 * 1524 * Assumes the sched_domain tree is fully constructed 1525 */ 1526 static int 1527 build_sched_groups(struct sched_domain *sd, int cpu) 1528 { 1529 struct sched_group *first = NULL, *last = NULL; 1530 struct sd_data *sdd = sd->private; 1531 const struct cpumask *span = sched_domain_span(sd); 1532 struct cpumask *covered; 1533 int i; 1534 1535 lockdep_assert_held(&sched_domains_mutex); 1536 covered = sched_domains_tmpmask; 1537 1538 cpumask_clear(covered); 1539 1540 for_each_cpu_wrap(i, span, cpu) { 1541 struct sched_group *sg; 1542 1543 if (cpumask_test_cpu(i, covered)) 1544 continue; 1545 1546 sg = get_group(i, sdd); 1547 1548 cpumask_or(covered, covered, sched_group_span(sg)); 1549 1550 if (!first) 1551 first = sg; 1552 if (last) 1553 last->next = sg; 1554 last = sg; 1555 } 1556 last->next = first; 1557 sd->groups = first; 1558 1559 return 0; 1560 } 1561 1562 /* 1563 * Initialize sched groups cpu_capacity. 1564 * 1565 * cpu_capacity indicates the capacity of sched group, which is used while 1566 * distributing the load between different sched groups in a sched domain. 
1567 * Typically cpu_capacity for all the groups in a sched domain will be same 1568 * unless there are asymmetries in the topology. If there are asymmetries, 1569 * group having more cpu_capacity will pickup more load compared to the 1570 * group having less cpu_capacity. 1571 */ 1572 static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) 1573 { 1574 struct sched_group *sg = sd->groups; 1575 struct cpumask *mask = sched_domains_tmpmask2; 1576 1577 WARN_ON(!sg); 1578 1579 do { 1580 int cpu, cores = 0, max_cpu = -1; 1581 1582 sg->group_weight = cpumask_weight(sched_group_span(sg)); 1583 1584 cpumask_copy(mask, sched_group_span(sg)); 1585 for_each_cpu(cpu, mask) { 1586 cores++; 1587 cpumask_andnot(mask, mask, cpu_smt_mask(cpu)); 1588 } 1589 sg->cores = cores; 1590 1591 if (!(sd->flags & SD_ASYM_PACKING)) 1592 goto next; 1593 1594 for_each_cpu(cpu, sched_group_span(sg)) { 1595 if (max_cpu < 0) 1596 max_cpu = cpu; 1597 else if (sched_asym_prefer(cpu, max_cpu)) 1598 max_cpu = cpu; 1599 } 1600 sg->asym_prefer_cpu = max_cpu; 1601 1602 next: 1603 sg = sg->next; 1604 } while (sg != sd->groups); 1605 1606 if (cpu != group_balance_cpu(sg)) 1607 return; 1608 1609 update_group_capacity(sd, cpu); 1610 } 1611 1612 /* Update the "asym_prefer_cpu" when arch_asym_cpu_priority() changes. */ 1613 void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio) 1614 { 1615 int asym_prefer_cpu = cpu; 1616 struct sched_domain *sd; 1617 1618 guard(rcu)(); 1619 1620 for_each_domain(cpu, sd) { 1621 struct sched_group *sg; 1622 int group_cpu; 1623 1624 if (!(sd->flags & SD_ASYM_PACKING)) 1625 continue; 1626 1627 /* 1628 * Groups of overlapping domain are replicated per NUMA 1629 * node and will require updating "asym_prefer_cpu" on 1630 * each local copy. 1631 * 1632 * If you are hitting this warning, consider moving 1633 * "sg->asym_prefer_cpu" to "sg->sgc->asym_prefer_cpu" 1634 * which is shared by all the overlapping groups. 
1635 */ 1636 WARN_ON_ONCE(sd->flags & SD_NUMA); 1637 1638 sg = sd->groups; 1639 if (cpu != sg->asym_prefer_cpu) { 1640 /* 1641 * Since the parent is a superset of the current group, 1642 * if the cpu is not the "asym_prefer_cpu" at the 1643 * current level, it cannot be the preferred CPU at a 1644 * higher levels either. 1645 */ 1646 if (!sched_asym_prefer(cpu, sg->asym_prefer_cpu)) 1647 return; 1648 1649 WRITE_ONCE(sg->asym_prefer_cpu, cpu); 1650 continue; 1651 } 1652 1653 /* Ranking has improved; CPU is still the preferred one. */ 1654 if (new_prio >= old_prio) 1655 continue; 1656 1657 for_each_cpu(group_cpu, sched_group_span(sg)) { 1658 if (sched_asym_prefer(group_cpu, asym_prefer_cpu)) 1659 asym_prefer_cpu = group_cpu; 1660 } 1661 1662 WRITE_ONCE(sg->asym_prefer_cpu, asym_prefer_cpu); 1663 } 1664 } 1665 1666 /* 1667 * Set of available CPUs grouped by their corresponding capacities 1668 * Each list entry contains a CPU mask reflecting CPUs that share the same 1669 * capacity. 1670 * The lifespan of data is unlimited. 1671 */ 1672 LIST_HEAD(asym_cap_list); 1673 1674 /* 1675 * Verify whether there is any CPU capacity asymmetry in a given sched domain. 1676 * Provides sd_flags reflecting the asymmetry scope. 1677 */ 1678 static inline int 1679 asym_cpu_capacity_classify(const struct cpumask *sd_span, 1680 const struct cpumask *cpu_map) 1681 { 1682 struct asym_cap_data *entry; 1683 int count = 0, miss = 0; 1684 1685 /* 1686 * Count how many unique CPU capacities this domain spans across 1687 * (compare sched_domain CPUs mask with ones representing available 1688 * CPUs capacities). Take into account CPUs that might be offline: 1689 * skip those. 
1690 */ 1691 list_for_each_entry(entry, &asym_cap_list, link) { 1692 if (cpumask_intersects(sd_span, cpu_capacity_span(entry))) 1693 ++count; 1694 else if (cpumask_intersects(cpu_map, cpu_capacity_span(entry))) 1695 ++miss; 1696 } 1697 1698 WARN_ON_ONCE(!count && !list_empty(&asym_cap_list)); 1699 1700 /* No asymmetry detected */ 1701 if (count < 2) 1702 return 0; 1703 /* Some of the available CPU capacity values have not been detected */ 1704 if (miss) 1705 return SD_ASYM_CPUCAPACITY; 1706 1707 /* Full asymmetry */ 1708 return SD_ASYM_CPUCAPACITY | SD_ASYM_CPUCAPACITY_FULL; 1709 1710 } 1711 1712 static void free_asym_cap_entry(struct rcu_head *head) 1713 { 1714 struct asym_cap_data *entry = container_of(head, struct asym_cap_data, rcu); 1715 kfree(entry); 1716 } 1717 1718 static inline void asym_cpu_capacity_update_data(int cpu) 1719 { 1720 unsigned long capacity = arch_scale_cpu_capacity(cpu); 1721 struct asym_cap_data *insert_entry = NULL; 1722 struct asym_cap_data *entry; 1723 1724 /* 1725 * Search if capacity already exits. If not, track which the entry 1726 * where we should insert to keep the list ordered descending. 1727 */ 1728 list_for_each_entry(entry, &asym_cap_list, link) { 1729 if (capacity == entry->capacity) 1730 goto done; 1731 else if (!insert_entry && capacity > entry->capacity) 1732 insert_entry = list_prev_entry(entry, link); 1733 } 1734 1735 entry = kzalloc(sizeof(*entry) + cpumask_size(), GFP_KERNEL); 1736 if (WARN_ONCE(!entry, "Failed to allocate memory for asymmetry data\n")) 1737 return; 1738 entry->capacity = capacity; 1739 1740 /* If NULL then the new capacity is the smallest, add last. 
*/ 1741 if (!insert_entry) 1742 list_add_tail_rcu(&entry->link, &asym_cap_list); 1743 else 1744 list_add_rcu(&entry->link, &insert_entry->link); 1745 done: 1746 __cpumask_set_cpu(cpu, cpu_capacity_span(entry)); 1747 } 1748 1749 /* 1750 * Build-up/update list of CPUs grouped by their capacities 1751 * An update requires explicit request to rebuild sched domains 1752 * with state indicating CPU topology changes. 1753 */ 1754 static void asym_cpu_capacity_scan(void) 1755 { 1756 struct asym_cap_data *entry, *next; 1757 int cpu; 1758 1759 list_for_each_entry(entry, &asym_cap_list, link) 1760 cpumask_clear(cpu_capacity_span(entry)); 1761 1762 for_each_cpu_and(cpu, cpu_possible_mask, housekeeping_cpumask(HK_TYPE_DOMAIN)) 1763 asym_cpu_capacity_update_data(cpu); 1764 1765 list_for_each_entry_safe(entry, next, &asym_cap_list, link) { 1766 if (cpumask_empty(cpu_capacity_span(entry))) { 1767 list_del_rcu(&entry->link); 1768 call_rcu(&entry->rcu, free_asym_cap_entry); 1769 } 1770 } 1771 1772 /* 1773 * Only one capacity value has been detected i.e. this system is symmetric. 1774 * No need to keep this data around. 
1775 */ 1776 if (list_is_singular(&asym_cap_list)) { 1777 entry = list_first_entry(&asym_cap_list, typeof(*entry), link); 1778 list_del_rcu(&entry->link); 1779 call_rcu(&entry->rcu, free_asym_cap_entry); 1780 } 1781 } 1782 1783 /* 1784 * Initializers for schedule domains 1785 * Non-inlined to reduce accumulated stack pressure in build_sched_domains() 1786 */ 1787 1788 static int default_relax_domain_level = -1; 1789 int sched_domain_level_max; 1790 1791 static int __init setup_relax_domain_level(char *str) 1792 { 1793 if (kstrtoint(str, 0, &default_relax_domain_level)) 1794 pr_warn("Unable to set relax_domain_level\n"); 1795 1796 return 1; 1797 } 1798 __setup("relax_domain_level=", setup_relax_domain_level); 1799 1800 static void set_domain_attribute(struct sched_domain *sd, 1801 struct sched_domain_attr *attr) 1802 { 1803 int request; 1804 1805 if (!attr || attr->relax_domain_level < 0) { 1806 if (default_relax_domain_level < 0) 1807 return; 1808 request = default_relax_domain_level; 1809 } else 1810 request = attr->relax_domain_level; 1811 1812 if (sd->level >= request) { 1813 /* Turn off idle balance on this domain: */ 1814 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 1815 } 1816 } 1817 1818 static void __sdt_free(const struct cpumask *cpu_map); 1819 static int __sdt_alloc(const struct cpumask *cpu_map); 1820 1821 static void __sds_free(struct s_data *d, const struct cpumask *cpu_map); 1822 static int __sds_alloc(struct s_data *d, const struct cpumask *cpu_map); 1823 1824 static void __free_domain_allocs(struct s_data *d, enum s_alloc what, 1825 const struct cpumask *cpu_map) 1826 { 1827 switch (what) { 1828 case sa_rootdomain: 1829 if (!atomic_read(&d->rd->refcount)) 1830 free_rootdomain(&d->rd->rcu); 1831 fallthrough; 1832 case sa_sd: 1833 free_percpu(d->sd); 1834 fallthrough; 1835 case sa_sd_shared: 1836 __sds_free(d, cpu_map); 1837 fallthrough; 1838 case sa_sd_storage: 1839 __sdt_free(cpu_map); 1840 fallthrough; 1841 case sa_none: 1842 break; 1843 } 
1844 } 1845 1846 static enum s_alloc 1847 __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map) 1848 { 1849 memset(d, 0, sizeof(*d)); 1850 1851 if (__sdt_alloc(cpu_map)) 1852 return sa_sd_storage; 1853 if (__sds_alloc(d, cpu_map)) 1854 return sa_sd_shared; 1855 d->sd = alloc_percpu(struct sched_domain *); 1856 if (!d->sd) 1857 return sa_sd_shared; 1858 d->rd = alloc_rootdomain(); 1859 if (!d->rd) 1860 return sa_sd; 1861 1862 return sa_rootdomain; 1863 } 1864 1865 /* 1866 * NULL the sd_data elements we've used to build the sched_domain and 1867 * sched_group structure so that the subsequent __free_domain_allocs() 1868 * will not free the data we're using. 1869 */ 1870 static void claim_allocations(int cpu, struct s_data *d) 1871 { 1872 struct sched_domain *sd; 1873 1874 if (atomic_read(&(*per_cpu_ptr(d->sds, cpu))->ref)) 1875 *per_cpu_ptr(d->sds, cpu) = NULL; 1876 1877 for (sd = *per_cpu_ptr(d->sd, cpu); sd; sd = sd->parent) { 1878 struct sd_data *sdd = sd->private; 1879 1880 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); 1881 *per_cpu_ptr(sdd->sd, cpu) = NULL; 1882 1883 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) 1884 *per_cpu_ptr(sdd->sg, cpu) = NULL; 1885 1886 if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref)) 1887 *per_cpu_ptr(sdd->sgc, cpu) = NULL; 1888 } 1889 } 1890 1891 #ifdef CONFIG_NUMA 1892 enum numa_topology_type sched_numa_topology_type; 1893 1894 /* 1895 * sched_domains_numa_distance is derived from sched_numa_node_distance 1896 * and provides a simplified view of NUMA distances used specifically 1897 * for building NUMA scheduling domains. 1898 */ 1899 static int sched_domains_numa_levels; 1900 static int sched_numa_node_levels; 1901 1902 int sched_max_numa_distance; 1903 static int *sched_domains_numa_distance; 1904 static int *sched_numa_node_distance; 1905 static struct cpumask ***sched_domains_numa_masks; 1906 #endif /* CONFIG_NUMA */ 1907 1908 /* 1909 * SD_flags allowed in topology descriptions. 
1910 * 1911 * These flags are purely descriptive of the topology and do not prescribe 1912 * behaviour. Behaviour is artificial and mapped in the below sd_init() 1913 * function. For details, see include/linux/sched/sd_flags.h. 1914 * 1915 * SD_SHARE_CPUCAPACITY 1916 * SD_SHARE_LLC 1917 * SD_CLUSTER 1918 * SD_NUMA 1919 * 1920 * Odd one out, which beside describing the topology has a quirk also 1921 * prescribes the desired behaviour that goes along with it: 1922 * 1923 * SD_ASYM_PACKING - describes SMT quirks 1924 */ 1925 #define TOPOLOGY_SD_FLAGS \ 1926 (SD_SHARE_CPUCAPACITY | \ 1927 SD_CLUSTER | \ 1928 SD_SHARE_LLC | \ 1929 SD_NUMA | \ 1930 SD_ASYM_PACKING) 1931 1932 static struct sched_domain * 1933 sd_init(struct sched_domain_topology_level *tl, 1934 const struct cpumask *cpu_map, 1935 struct sched_domain *child, int cpu) 1936 { 1937 struct sd_data *sdd = &tl->data; 1938 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); 1939 int sd_id, sd_weight, sd_flags = 0; 1940 struct cpumask *sd_span; 1941 u64 now = sched_clock(); 1942 1943 sd_span = sched_domain_span(sd); 1944 cpumask_and(sd_span, cpu_map, tl->mask(tl, cpu)); 1945 sd_weight = cpumask_weight(sd_span); 1946 sd_id = cpumask_first(sd_span); 1947 1948 if (tl->sd_flags) 1949 sd_flags = (*tl->sd_flags)(); 1950 if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, 1951 "wrong sd_flags in topology description\n")) 1952 sd_flags &= TOPOLOGY_SD_FLAGS; 1953 sd_flags |= asym_cpu_capacity_classify(sd_span, cpu_map); 1954 1955 *sd = (struct sched_domain){ 1956 .min_interval = sd_weight, 1957 .max_interval = 2*sd_weight, 1958 .busy_factor = 16, 1959 .imbalance_pct = 117, 1960 1961 .cache_nice_tries = 0, 1962 1963 .flags = 1*SD_BALANCE_NEWIDLE 1964 | 1*SD_BALANCE_EXEC 1965 | 1*SD_BALANCE_FORK 1966 | 0*SD_BALANCE_WAKE 1967 | 1*SD_WAKE_AFFINE 1968 | 0*SD_SHARE_CPUCAPACITY 1969 | 0*SD_SHARE_LLC 1970 | 0*SD_SERIALIZE 1971 | 1*SD_PREFER_SIBLING 1972 | 0*SD_NUMA 1973 | sd_flags 1974 , 1975 1976 .last_balance = jiffies, 1977 
.balance_interval = sd_weight, 1978 1979 /* 50% success rate */ 1980 .newidle_call = 512, 1981 .newidle_success = 256, 1982 .newidle_ratio = 512, 1983 .newidle_stamp = now, 1984 1985 .max_newidle_lb_cost = 0, 1986 .last_decay_max_lb_cost = jiffies, 1987 .child = child, 1988 .name = tl->name, 1989 }; 1990 1991 WARN_ONCE((sd->flags & (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY)) == 1992 (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY), 1993 "CPU capacity asymmetry not supported on SMT\n"); 1994 1995 /* 1996 * Convert topological properties into behaviour. 1997 */ 1998 /* Don't attempt to spread across CPUs of different capacities. */ 1999 if ((sd->flags & SD_ASYM_CPUCAPACITY) && sd->child) 2000 sd->child->flags &= ~SD_PREFER_SIBLING; 2001 2002 if (sd->flags & SD_SHARE_CPUCAPACITY) { 2003 sd->imbalance_pct = 110; 2004 2005 } else if (sd->flags & SD_SHARE_LLC) { 2006 sd->imbalance_pct = 117; 2007 sd->cache_nice_tries = 1; 2008 2009 #ifdef CONFIG_NUMA 2010 } else if (sd->flags & SD_NUMA) { 2011 sd->cache_nice_tries = 2; 2012 2013 sd->flags &= ~SD_PREFER_SIBLING; 2014 sd->flags |= SD_SERIALIZE; 2015 if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) { 2016 sd->flags &= ~(SD_BALANCE_EXEC | 2017 SD_BALANCE_FORK | 2018 SD_WAKE_AFFINE); 2019 } 2020 2021 #endif /* CONFIG_NUMA */ 2022 } else { 2023 sd->cache_nice_tries = 1; 2024 } 2025 2026 sd->private = sdd; 2027 2028 return sd; 2029 } 2030 2031 #ifdef CONFIG_SCHED_SMT 2032 int cpu_smt_flags(void) 2033 { 2034 return SD_SHARE_CPUCAPACITY | SD_SHARE_LLC; 2035 } 2036 2037 const struct cpumask *tl_smt_mask(struct sched_domain_topology_level *tl, int cpu) 2038 { 2039 return cpu_smt_mask(cpu); 2040 } 2041 #endif 2042 2043 #ifdef CONFIG_SCHED_CLUSTER 2044 int cpu_cluster_flags(void) 2045 { 2046 return SD_CLUSTER | SD_SHARE_LLC; 2047 } 2048 2049 const struct cpumask *tl_cls_mask(struct sched_domain_topology_level *tl, int cpu) 2050 { 2051 return cpu_clustergroup_mask(cpu); 2052 } 2053 #endif 2054 2055 #ifdef 
CONFIG_SCHED_MC 2056 int cpu_core_flags(void) 2057 { 2058 return SD_SHARE_LLC; 2059 } 2060 2061 const struct cpumask *tl_mc_mask(struct sched_domain_topology_level *tl, int cpu) 2062 { 2063 return cpu_coregroup_mask(cpu); 2064 } 2065 2066 /* 2067 * Majority of architectures have LLC at MC domain level with exception 2068 * such as powerpc. Provide a way for arch to specify where its LLC is 2069 * if it falls in exception category 2070 */ 2071 # ifndef arch_llc_mask 2072 #define arch_llc_mask(cpu) cpu_coregroup_mask(cpu) 2073 # endif 2074 2075 #else 2076 #define arch_llc_mask(cpu) cpumask_of(cpu) 2077 #endif 2078 2079 #define llc_mask(cpu) arch_llc_mask(cpu) 2080 2081 const struct cpumask *tl_pkg_mask(struct sched_domain_topology_level *tl, int cpu) 2082 { 2083 return cpu_node_mask(cpu); 2084 } 2085 2086 /* 2087 * Topology list, bottom-up. 2088 */ 2089 static struct sched_domain_topology_level default_topology[] = { 2090 #ifdef CONFIG_SCHED_SMT 2091 SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT), 2092 #endif 2093 2094 #ifdef CONFIG_SCHED_CLUSTER 2095 SDTL_INIT(tl_cls_mask, cpu_cluster_flags, CLS), 2096 #endif 2097 2098 #ifdef CONFIG_SCHED_MC 2099 SDTL_INIT(tl_mc_mask, cpu_core_flags, MC), 2100 #endif 2101 SDTL_INIT(tl_pkg_mask, NULL, PKG), 2102 { NULL, }, 2103 }; 2104 2105 static struct sched_domain_topology_level *sched_domain_topology = 2106 default_topology; 2107 static struct sched_domain_topology_level *sched_domain_topology_saved; 2108 2109 #define for_each_sd_topology(tl) \ 2110 for (tl = sched_domain_topology; tl->mask; tl++) 2111 2112 void __init set_sched_topology(struct sched_domain_topology_level *tl) 2113 { 2114 if (WARN_ON_ONCE(sched_smp_initialized)) 2115 return; 2116 2117 sched_domain_topology = tl; 2118 sched_domain_topology_saved = NULL; 2119 } 2120 2121 #ifdef CONFIG_NUMA 2122 static int cpu_numa_flags(void) 2123 { 2124 return SD_NUMA; 2125 } 2126 2127 static const struct cpumask *sd_numa_mask(struct sched_domain_topology_level *tl, int cpu) 2128 { 
2129 return sched_domains_numa_masks[tl->numa_level][cpu_to_node(cpu)]; 2130 } 2131 2132 static void sched_numa_warn(const char *str) 2133 { 2134 static int done = false; 2135 int i,j; 2136 2137 if (done) 2138 return; 2139 2140 done = true; 2141 2142 printk(KERN_WARNING "ERROR: %s\n\n", str); 2143 2144 for (i = 0; i < nr_node_ids; i++) { 2145 printk(KERN_WARNING " "); 2146 for (j = 0; j < nr_node_ids; j++) { 2147 if (!node_state(i, N_CPU) || !node_state(j, N_CPU)) 2148 printk(KERN_CONT "(%02d) ", node_distance(i,j)); 2149 else 2150 printk(KERN_CONT " %02d ", node_distance(i,j)); 2151 } 2152 printk(KERN_CONT "\n"); 2153 } 2154 printk(KERN_WARNING "\n"); 2155 } 2156 2157 bool find_numa_distance(int distance) 2158 { 2159 bool found = false; 2160 int i, *distances; 2161 2162 if (distance == node_distance(0, 0)) 2163 return true; 2164 2165 rcu_read_lock(); 2166 distances = rcu_dereference(sched_numa_node_distance); 2167 if (!distances) 2168 goto unlock; 2169 for (i = 0; i < sched_numa_node_levels; i++) { 2170 if (distances[i] == distance) { 2171 found = true; 2172 break; 2173 } 2174 } 2175 unlock: 2176 rcu_read_unlock(); 2177 2178 return found; 2179 } 2180 2181 #define for_each_cpu_node_but(n, nbut) \ 2182 for_each_node_state(n, N_CPU) \ 2183 if (n == nbut) \ 2184 continue; \ 2185 else 2186 2187 /* 2188 * A system can have three types of NUMA topology: 2189 * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system 2190 * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes 2191 * NUMA_BACKPLANE: nodes can reach other nodes through a backplane 2192 * 2193 * The difference between a glueless mesh topology and a backplane 2194 * topology lies in whether communication between not directly 2195 * connected nodes goes through intermediary nodes (where programs 2196 * could run), or through backplane controllers. This affects 2197 * placement of programs. 
2198 * 2199 * The type of topology can be discerned with the following tests: 2200 * - If the maximum distance between any nodes is 1 hop, the system 2201 * is directly connected. 2202 * - If for two nodes A and B, located N > 1 hops away from each other, 2203 * there is an intermediary node C, which is < N hops away from both 2204 * nodes A and B, the system is a glueless mesh. 2205 */ 2206 static void init_numa_topology_type(int offline_node) 2207 { 2208 int a, b, c, n; 2209 2210 n = sched_max_numa_distance; 2211 2212 if (sched_domains_numa_levels <= 2) { 2213 sched_numa_topology_type = NUMA_DIRECT; 2214 return; 2215 } 2216 2217 for_each_cpu_node_but(a, offline_node) { 2218 for_each_cpu_node_but(b, offline_node) { 2219 /* Find two nodes furthest removed from each other. */ 2220 if (node_distance(a, b) < n) 2221 continue; 2222 2223 /* Is there an intermediary node between a and b? */ 2224 for_each_cpu_node_but(c, offline_node) { 2225 if (node_distance(a, c) < n && 2226 node_distance(b, c) < n) { 2227 sched_numa_topology_type = 2228 NUMA_GLUELESS_MESH; 2229 return; 2230 } 2231 } 2232 2233 sched_numa_topology_type = NUMA_BACKPLANE; 2234 return; 2235 } 2236 } 2237 2238 pr_err("Failed to find a NUMA topology type, defaulting to DIRECT\n"); 2239 sched_numa_topology_type = NUMA_DIRECT; 2240 } 2241 2242 2243 #define NR_DISTANCE_VALUES (1 << DISTANCE_BITS) 2244 2245 /* 2246 * An architecture could modify its NUMA distance, to change 2247 * grouping of NUMA nodes and number of NUMA levels when creating 2248 * NUMA level sched domains. 2249 * 2250 * A NUMA level is created for each unique 2251 * arch_sched_node_distance. 
2252 */ 2253 static int numa_node_dist(int i, int j) 2254 { 2255 return node_distance(i, j); 2256 } 2257 2258 int arch_sched_node_distance(int from, int to) 2259 __weak __alias(numa_node_dist); 2260 2261 static bool modified_sched_node_distance(void) 2262 { 2263 return numa_node_dist != arch_sched_node_distance; 2264 } 2265 2266 static int sched_record_numa_dist(int offline_node, int (*n_dist)(int, int), 2267 int **dist, int *levels) 2268 { 2269 unsigned long *distance_map __free(bitmap) = NULL; 2270 int nr_levels = 0; 2271 int i, j; 2272 int *distances; 2273 2274 /* 2275 * O(nr_nodes^2) de-duplicating selection sort -- in order to find the 2276 * unique distances in the node_distance() table. 2277 */ 2278 distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL); 2279 if (!distance_map) 2280 return -ENOMEM; 2281 2282 bitmap_zero(distance_map, NR_DISTANCE_VALUES); 2283 for_each_cpu_node_but(i, offline_node) { 2284 for_each_cpu_node_but(j, offline_node) { 2285 int distance = n_dist(i, j); 2286 2287 if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) { 2288 sched_numa_warn("Invalid distance value range"); 2289 return -EINVAL; 2290 } 2291 2292 bitmap_set(distance_map, distance, 1); 2293 } 2294 } 2295 /* 2296 * We can now figure out how many unique distance values there are and 2297 * allocate memory accordingly. 
2298 */ 2299 nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES); 2300 2301 distances = kzalloc_objs(int, nr_levels); 2302 if (!distances) 2303 return -ENOMEM; 2304 2305 for (i = 0, j = 0; i < nr_levels; i++, j++) { 2306 j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j); 2307 distances[i] = j; 2308 } 2309 *dist = distances; 2310 *levels = nr_levels; 2311 2312 return 0; 2313 } 2314 2315 void sched_init_numa(int offline_node) 2316 { 2317 struct sched_domain_topology_level *tl; 2318 int nr_levels, nr_node_levels; 2319 int i, j; 2320 int *distances, *domain_distances; 2321 struct cpumask ***masks; 2322 2323 /* Record the NUMA distances from SLIT table */ 2324 if (sched_record_numa_dist(offline_node, numa_node_dist, &distances, 2325 &nr_node_levels)) 2326 return; 2327 2328 /* Record modified NUMA distances for building sched domains */ 2329 if (modified_sched_node_distance()) { 2330 if (sched_record_numa_dist(offline_node, arch_sched_node_distance, 2331 &domain_distances, &nr_levels)) { 2332 kfree(distances); 2333 return; 2334 } 2335 } else { 2336 domain_distances = distances; 2337 nr_levels = nr_node_levels; 2338 } 2339 rcu_assign_pointer(sched_numa_node_distance, distances); 2340 WRITE_ONCE(sched_max_numa_distance, distances[nr_node_levels - 1]); 2341 WRITE_ONCE(sched_numa_node_levels, nr_node_levels); 2342 2343 /* 2344 * 'nr_levels' contains the number of unique distances 2345 * 2346 * The sched_domains_numa_distance[] array includes the actual distance 2347 * numbers. 2348 */ 2349 2350 /* 2351 * Here, we should temporarily reset sched_domains_numa_levels to 0. 2352 * If it fails to allocate memory for array sched_domains_numa_masks[][], 2353 * the array will contain less then 'nr_levels' members. This could be 2354 * dangerous when we use it to iterate array sched_domains_numa_masks[][] 2355 * in other functions. 2356 * 2357 * We reset it to 'nr_levels' at the end of this function. 
2358 */ 2359 rcu_assign_pointer(sched_domains_numa_distance, domain_distances); 2360 2361 sched_domains_numa_levels = 0; 2362 2363 masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL); 2364 if (!masks) 2365 return; 2366 2367 /* 2368 * Now for each level, construct a mask per node which contains all 2369 * CPUs of nodes that are that many hops away from us. 2370 */ 2371 for (i = 0; i < nr_levels; i++) { 2372 masks[i] = kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); 2373 if (!masks[i]) 2374 return; 2375 2376 for_each_cpu_node_but(j, offline_node) { 2377 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); 2378 int k; 2379 2380 if (!mask) 2381 return; 2382 2383 masks[i][j] = mask; 2384 2385 for_each_cpu_node_but(k, offline_node) { 2386 if (sched_debug() && 2387 (arch_sched_node_distance(j, k) != 2388 arch_sched_node_distance(k, j))) 2389 sched_numa_warn("Node-distance not symmetric"); 2390 2391 if (arch_sched_node_distance(j, k) > 2392 sched_domains_numa_distance[i]) 2393 continue; 2394 2395 cpumask_or(mask, mask, cpumask_of_node(k)); 2396 } 2397 } 2398 } 2399 rcu_assign_pointer(sched_domains_numa_masks, masks); 2400 2401 /* Compute default topology size */ 2402 for (i = 0; sched_domain_topology[i].mask; i++); 2403 2404 tl = kzalloc((i + nr_levels + 1) * 2405 sizeof(struct sched_domain_topology_level), GFP_KERNEL); 2406 if (!tl) 2407 return; 2408 2409 /* 2410 * Copy the default topology bits.. 2411 */ 2412 for (i = 0; sched_domain_topology[i].mask; i++) 2413 tl[i] = sched_domain_topology[i]; 2414 2415 /* 2416 * Add the NUMA identity distance, aka single NODE. 2417 */ 2418 tl[i++] = SDTL_INIT(sd_numa_mask, NULL, NODE); 2419 2420 /* 2421 * .. and append 'j' levels of NUMA goodness. 
2422 */ 2423 for (j = 1; j < nr_levels; i++, j++) { 2424 tl[i] = SDTL_INIT(sd_numa_mask, cpu_numa_flags, NUMA); 2425 tl[i].numa_level = j; 2426 } 2427 2428 sched_domain_topology_saved = sched_domain_topology; 2429 sched_domain_topology = tl; 2430 2431 sched_domains_numa_levels = nr_levels; 2432 2433 init_numa_topology_type(offline_node); 2434 } 2435 2436 2437 static void sched_reset_numa(void) 2438 { 2439 int nr_levels, *distances, *dom_distances = NULL; 2440 struct cpumask ***masks; 2441 2442 nr_levels = sched_domains_numa_levels; 2443 sched_numa_node_levels = 0; 2444 sched_domains_numa_levels = 0; 2445 sched_max_numa_distance = 0; 2446 sched_numa_topology_type = NUMA_DIRECT; 2447 distances = sched_numa_node_distance; 2448 if (sched_numa_node_distance != sched_domains_numa_distance) 2449 dom_distances = sched_domains_numa_distance; 2450 rcu_assign_pointer(sched_numa_node_distance, NULL); 2451 rcu_assign_pointer(sched_domains_numa_distance, NULL); 2452 masks = sched_domains_numa_masks; 2453 rcu_assign_pointer(sched_domains_numa_masks, NULL); 2454 if (distances || masks) { 2455 int i, j; 2456 2457 synchronize_rcu(); 2458 kfree(distances); 2459 kfree(dom_distances); 2460 for (i = 0; i < nr_levels && masks; i++) { 2461 if (!masks[i]) 2462 continue; 2463 for_each_node(j) 2464 kfree(masks[i][j]); 2465 kfree(masks[i]); 2466 } 2467 kfree(masks); 2468 } 2469 if (sched_domain_topology_saved) { 2470 kfree(sched_domain_topology); 2471 sched_domain_topology = sched_domain_topology_saved; 2472 sched_domain_topology_saved = NULL; 2473 } 2474 } 2475 2476 /* 2477 * Call with hotplug lock held 2478 */ 2479 void sched_update_numa(int cpu, bool online) 2480 { 2481 int node; 2482 2483 node = cpu_to_node(cpu); 2484 /* 2485 * Scheduler NUMA topology is updated when the first CPU of a 2486 * node is onlined or the last CPU of a node is offlined. 2487 */ 2488 if (cpumask_weight(cpumask_of_node(node)) != 1) 2489 return; 2490 2491 sched_reset_numa(); 2492 sched_init_numa(online ? 
NUMA_NO_NODE : node); 2493 } 2494 2495 void sched_domains_numa_masks_set(unsigned int cpu) 2496 { 2497 int node = cpu_to_node(cpu); 2498 int i, j; 2499 2500 for (i = 0; i < sched_domains_numa_levels; i++) { 2501 for (j = 0; j < nr_node_ids; j++) { 2502 if (!node_state(j, N_CPU)) 2503 continue; 2504 2505 /* Set ourselves in the remote node's masks */ 2506 if (arch_sched_node_distance(j, node) <= 2507 sched_domains_numa_distance[i]) 2508 cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); 2509 } 2510 } 2511 } 2512 2513 void sched_domains_numa_masks_clear(unsigned int cpu) 2514 { 2515 int i, j; 2516 2517 for (i = 0; i < sched_domains_numa_levels; i++) { 2518 for (j = 0; j < nr_node_ids; j++) { 2519 if (sched_domains_numa_masks[i][j]) 2520 cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); 2521 } 2522 } 2523 } 2524 2525 /* 2526 * sched_numa_find_closest() - given the NUMA topology, find the cpu 2527 * closest to @cpu from @cpumask. 2528 * cpumask: cpumask to find a cpu from 2529 * cpu: cpu to be close to 2530 * 2531 * returns: cpu, or nr_cpu_ids when nothing found. 
2532 */ 2533 int sched_numa_find_closest(const struct cpumask *cpus, int cpu) 2534 { 2535 int i, j = cpu_to_node(cpu), found = nr_cpu_ids; 2536 struct cpumask ***masks; 2537 2538 rcu_read_lock(); 2539 masks = rcu_dereference(sched_domains_numa_masks); 2540 if (!masks) 2541 goto unlock; 2542 for (i = 0; i < sched_domains_numa_levels; i++) { 2543 if (!masks[i][j]) 2544 break; 2545 cpu = cpumask_any_and_distribute(cpus, masks[i][j]); 2546 if (cpu < nr_cpu_ids) { 2547 found = cpu; 2548 break; 2549 } 2550 } 2551 unlock: 2552 rcu_read_unlock(); 2553 2554 return found; 2555 } 2556 2557 struct __cmp_key { 2558 const struct cpumask *cpus; 2559 struct cpumask ***masks; 2560 int node; 2561 int cpu; 2562 int w; 2563 }; 2564 2565 static int hop_cmp(const void *a, const void *b) 2566 { 2567 struct cpumask **prev_hop, **cur_hop = *(struct cpumask ***)b; 2568 struct __cmp_key *k = (struct __cmp_key *)a; 2569 2570 if (cpumask_weight_and(k->cpus, cur_hop[k->node]) <= k->cpu) 2571 return 1; 2572 2573 if (b == k->masks) { 2574 k->w = 0; 2575 return 0; 2576 } 2577 2578 prev_hop = *((struct cpumask ***)b - 1); 2579 k->w = cpumask_weight_and(k->cpus, prev_hop[k->node]); 2580 if (k->w <= k->cpu) 2581 return 0; 2582 2583 return -1; 2584 } 2585 2586 /** 2587 * sched_numa_find_nth_cpu() - given the NUMA topology, find the Nth closest CPU 2588 * from @cpus to @cpu, taking into account distance 2589 * from a given @node. 2590 * @cpus: cpumask to find a cpu from 2591 * @cpu: CPU to start searching 2592 * @node: NUMA node to order CPUs by distance 2593 * 2594 * Return: cpu, or nr_cpu_ids when nothing found. 
2595 */ 2596 int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node) 2597 { 2598 struct __cmp_key k = { .cpus = cpus, .cpu = cpu }; 2599 struct cpumask ***hop_masks; 2600 int hop, ret = nr_cpu_ids; 2601 2602 if (node == NUMA_NO_NODE) 2603 return cpumask_nth_and(cpu, cpus, cpu_online_mask); 2604 2605 rcu_read_lock(); 2606 2607 /* CPU-less node entries are uninitialized in sched_domains_numa_masks */ 2608 node = numa_nearest_node(node, N_CPU); 2609 k.node = node; 2610 2611 k.masks = rcu_dereference(sched_domains_numa_masks); 2612 if (!k.masks) 2613 goto unlock; 2614 2615 hop_masks = bsearch(&k, k.masks, sched_domains_numa_levels, sizeof(k.masks[0]), hop_cmp); 2616 if (!hop_masks) 2617 goto unlock; 2618 hop = hop_masks - k.masks; 2619 2620 ret = hop ? 2621 cpumask_nth_and_andnot(cpu - k.w, cpus, k.masks[hop][node], k.masks[hop-1][node]) : 2622 cpumask_nth_and(cpu, cpus, k.masks[0][node]); 2623 unlock: 2624 rcu_read_unlock(); 2625 return ret; 2626 } 2627 EXPORT_SYMBOL_GPL(sched_numa_find_nth_cpu); 2628 2629 /** 2630 * sched_numa_hop_mask() - Get the cpumask of CPUs at most @hops hops away from 2631 * @node 2632 * @node: The node to count hops from. 2633 * @hops: Include CPUs up to that many hops away. 0 means local node. 2634 * 2635 * Return: On success, a pointer to a cpumask of CPUs at most @hops away from 2636 * @node, an error value otherwise. 2637 * 2638 * Requires rcu_lock to be held. Returned cpumask is only valid within that 2639 * read-side section, copy it if required beyond that. 2640 * 2641 * Note that not all hops are equal in distance; see sched_init_numa() for how 2642 * distances and masks are handled. 2643 * Also note that this is a reflection of sched_domains_numa_masks, which may change 2644 * during the lifetime of the system (offline nodes are taken out of the masks). 
2645 */ 2646 const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int hops) 2647 { 2648 struct cpumask ***masks; 2649 2650 if (node >= nr_node_ids || hops >= sched_domains_numa_levels) 2651 return ERR_PTR(-EINVAL); 2652 2653 masks = rcu_dereference(sched_domains_numa_masks); 2654 if (!masks) 2655 return ERR_PTR(-EBUSY); 2656 2657 return masks[hops][node]; 2658 } 2659 EXPORT_SYMBOL_GPL(sched_numa_hop_mask); 2660 2661 #endif /* CONFIG_NUMA */ 2662 2663 static int __sdt_alloc(const struct cpumask *cpu_map) 2664 { 2665 struct sched_domain_topology_level *tl; 2666 int j; 2667 2668 for_each_sd_topology(tl) { 2669 struct sd_data *sdd = &tl->data; 2670 2671 sdd->sd = alloc_percpu(struct sched_domain *); 2672 if (!sdd->sd) 2673 return -ENOMEM; 2674 2675 sdd->sg = alloc_percpu(struct sched_group *); 2676 if (!sdd->sg) 2677 return -ENOMEM; 2678 2679 sdd->sgc = alloc_percpu(struct sched_group_capacity *); 2680 if (!sdd->sgc) 2681 return -ENOMEM; 2682 2683 for_each_cpu(j, cpu_map) { 2684 struct sched_domain *sd; 2685 struct sched_group *sg; 2686 struct sched_group_capacity *sgc; 2687 2688 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), 2689 GFP_KERNEL, cpu_to_node(j)); 2690 if (!sd) 2691 return -ENOMEM; 2692 2693 *per_cpu_ptr(sdd->sd, j) = sd; 2694 2695 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 2696 GFP_KERNEL, cpu_to_node(j)); 2697 if (!sg) 2698 return -ENOMEM; 2699 2700 sg->next = sg; 2701 2702 *per_cpu_ptr(sdd->sg, j) = sg; 2703 2704 sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(), 2705 GFP_KERNEL, cpu_to_node(j)); 2706 if (!sgc) 2707 return -ENOMEM; 2708 2709 sgc->id = j; 2710 2711 *per_cpu_ptr(sdd->sgc, j) = sgc; 2712 } 2713 } 2714 2715 return 0; 2716 } 2717 2718 static void __sdt_free(const struct cpumask *cpu_map) 2719 { 2720 struct sched_domain_topology_level *tl; 2721 int j; 2722 2723 for_each_sd_topology(tl) { 2724 struct sd_data *sdd = &tl->data; 2725 2726 for_each_cpu(j, cpu_map) { 2727 
struct sched_domain *sd; 2728 2729 if (sdd->sd) { 2730 sd = *per_cpu_ptr(sdd->sd, j); 2731 if (sd && (sd->flags & SD_NUMA)) 2732 free_sched_groups(sd->groups, 0); 2733 kfree(*per_cpu_ptr(sdd->sd, j)); 2734 } 2735 2736 if (sdd->sg) 2737 kfree(*per_cpu_ptr(sdd->sg, j)); 2738 if (sdd->sgc) 2739 kfree(*per_cpu_ptr(sdd->sgc, j)); 2740 } 2741 free_percpu(sdd->sd); 2742 sdd->sd = NULL; 2743 free_percpu(sdd->sg); 2744 sdd->sg = NULL; 2745 free_percpu(sdd->sgc); 2746 sdd->sgc = NULL; 2747 } 2748 } 2749 2750 static int __sds_alloc(struct s_data *d, const struct cpumask *cpu_map) 2751 { 2752 int j; 2753 2754 d->sds = alloc_percpu(struct sched_domain_shared *); 2755 if (!d->sds) 2756 return -ENOMEM; 2757 2758 for_each_cpu(j, cpu_map) { 2759 struct sched_domain_shared *sds; 2760 2761 sds = kzalloc_node(sizeof(struct sched_domain_shared), 2762 GFP_KERNEL, cpu_to_node(j)); 2763 if (!sds) 2764 return -ENOMEM; 2765 2766 *per_cpu_ptr(d->sds, j) = sds; 2767 } 2768 2769 return 0; 2770 } 2771 2772 static void __sds_free(struct s_data *d, const struct cpumask *cpu_map) 2773 { 2774 int j; 2775 2776 if (!d->sds) 2777 return; 2778 2779 for_each_cpu(j, cpu_map) 2780 kfree(*per_cpu_ptr(d->sds, j)); 2781 2782 free_percpu(d->sds); 2783 d->sds = NULL; 2784 } 2785 2786 static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, 2787 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 2788 struct sched_domain *child, int cpu) 2789 { 2790 struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu); 2791 2792 if (child) { 2793 sd->level = child->level + 1; 2794 sched_domain_level_max = max(sched_domain_level_max, sd->level); 2795 child->parent = sd; 2796 2797 if (!cpumask_subset(sched_domain_span(child), 2798 sched_domain_span(sd))) { 2799 pr_err("BUG: arch topology borken\n"); 2800 pr_err(" the %s domain not a subset of the %s domain\n", 2801 child->name, sd->name); 2802 /* Fixup, ensure @sd has at least @child CPUs. 
*/ 2803 cpumask_or(sched_domain_span(sd), 2804 sched_domain_span(sd), 2805 sched_domain_span(child)); 2806 } 2807 2808 } 2809 set_domain_attribute(sd, attr); 2810 2811 return sd; 2812 } 2813 2814 /* 2815 * Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for 2816 * any two given CPUs on non-NUMA topology levels. 2817 */ 2818 static bool topology_span_sane(const struct cpumask *cpu_map) 2819 { 2820 struct sched_domain_topology_level *tl; 2821 struct cpumask *covered, *id_seen; 2822 int cpu; 2823 2824 lockdep_assert_held(&sched_domains_mutex); 2825 covered = sched_domains_tmpmask; 2826 id_seen = sched_domains_tmpmask2; 2827 2828 for_each_sd_topology(tl) { 2829 int tl_common_flags = 0; 2830 2831 if (tl->sd_flags) 2832 tl_common_flags = (*tl->sd_flags)(); 2833 2834 /* NUMA levels are allowed to overlap */ 2835 if (tl_common_flags & SD_NUMA) 2836 continue; 2837 2838 cpumask_clear(covered); 2839 cpumask_clear(id_seen); 2840 2841 /* 2842 * Non-NUMA levels cannot partially overlap - they must be either 2843 * completely equal or completely disjoint. Otherwise we can end up 2844 * breaking the sched_group lists - i.e. a later get_group() pass 2845 * breaks the linking done for an earlier span. 
2846 */ 2847 for_each_cpu(cpu, cpu_map) { 2848 const struct cpumask *tl_cpu_mask = tl->mask(tl, cpu); 2849 int id; 2850 2851 /* lowest bit set in this mask is used as a unique id */ 2852 id = cpumask_first(tl_cpu_mask); 2853 2854 if (cpumask_test_cpu(id, id_seen)) { 2855 /* First CPU has already been seen, ensure identical spans */ 2856 if (!cpumask_equal(tl->mask(tl, id), tl_cpu_mask)) 2857 return false; 2858 } else { 2859 /* First CPU hasn't been seen before, ensure it's a completely new span */ 2860 if (cpumask_intersects(tl_cpu_mask, covered)) 2861 return false; 2862 2863 cpumask_or(covered, covered, tl_cpu_mask); 2864 cpumask_set_cpu(id, id_seen); 2865 } 2866 } 2867 } 2868 return true; 2869 } 2870 2871 /* 2872 * Calculate an allowed NUMA imbalance such that LLCs do not get 2873 * imbalanced. 2874 */ 2875 static void adjust_numa_imbalance(struct sched_domain *sd_llc) 2876 { 2877 struct sched_domain *parent; 2878 unsigned int imb_span = 1; 2879 unsigned int imb = 0; 2880 unsigned int nr_llcs; 2881 2882 WARN_ON(!(sd_llc->flags & SD_SHARE_LLC)); 2883 WARN_ON(!sd_llc->parent); 2884 2885 /* 2886 * For a single LLC per node, allow an 2887 * imbalance up to 12.5% of the node. This is 2888 * arbitrary cutoff based two factors -- SMT and 2889 * memory channels. For SMT-2, the intent is to 2890 * avoid premature sharing of HT resources but 2891 * SMT-4 or SMT-8 *may* benefit from a different 2892 * cutoff. For memory channels, this is a very 2893 * rough estimate of how many channels may be 2894 * active and is based on recent CPUs with 2895 * many cores. 2896 * 2897 * For multiple LLCs, allow an imbalance 2898 * until multiple tasks would share an LLC 2899 * on one node while LLCs on another node 2900 * remain idle. This assumes that there are 2901 * enough logical CPUs per LLC to avoid SMT 2902 * factors and that there is a correlation 2903 * between LLCs and memory channels. 
2904 */ 2905 nr_llcs = sd_llc->parent->span_weight / sd_llc->span_weight; 2906 if (nr_llcs == 1) 2907 imb = sd_llc->parent->span_weight >> 3; 2908 else 2909 imb = nr_llcs; 2910 2911 imb = max(1U, imb); 2912 sd_llc->parent->imb_numa_nr = imb; 2913 2914 /* 2915 * Set span based on the first NUMA domain. 2916 * 2917 * NUMA systems always add a NODE domain before 2918 * iterating the NUMA domains. Since this is before 2919 * degeneration, start from sd_llc's parent's 2920 * parent which is the lowest an SD_NUMA domain can 2921 * be relative to sd_llc. 2922 */ 2923 parent = sd_llc->parent->parent; 2924 while (parent && !(parent->flags & SD_NUMA)) 2925 parent = parent->parent; 2926 2927 imb_span = parent ? parent->span_weight : sd_llc->parent->span_weight; 2928 2929 /* Update the upper remainder of the topology */ 2930 parent = sd_llc->parent; 2931 while (parent) { 2932 int factor = max(1U, (parent->span_weight / imb_span)); 2933 2934 parent->imb_numa_nr = imb * factor; 2935 parent = parent->parent; 2936 } 2937 } 2938 2939 static void 2940 init_sched_domain_shared(struct s_data *d, struct sched_domain *sd, int flags) 2941 { 2942 struct sched_domain_shared *sds = NULL; 2943 int cpu; 2944 2945 /* 2946 * Multiple domains can try to claim a shared object like 2947 * SD_ASYM_CPUCAPACITY and SD_SHARE_LLC which can alias to 2948 * same cpumask_first(sched_domain_span(sd)) CPU and can 2949 * cause "nr_idle_scan" to be populated incorrectly during 2950 * load balancing. 2951 * 2952 * Find the first CPU in sched_domain_span(sd) with an 2953 * unclaimed domain (!alloc_flags) or where the alloc_flag 2954 * matches the requested flag (SD_* flag) 2955 * 2956 * If the domain only has single CPU, allow temporary overlap 2957 * in allocation since the domains will be degenerated later. 
2958 */ 2959 for_each_cpu(cpu, sched_domain_span(sd)) { 2960 sds = *per_cpu_ptr(d->sds, cpu); 2961 2962 if (!sds->alloc_flags || 2963 sd->span_weight == 1 || 2964 sds->alloc_flags == flags) { 2965 sds->alloc_flags = flags; 2966 sd->shared = sds; 2967 break; 2968 } 2969 } 2970 2971 /* 2972 * Use the sd_shared corresponding to the last 2973 * CPU in the span if none are avaialable. 2974 */ 2975 if (WARN_ON_ONCE(!sd->shared)) 2976 sd->shared = sds; 2977 2978 /* 2979 * nr_busy_cpus is consumed only by the NOHZ kick path via 2980 * sd_balance_shared; on the asym-capacity path it is initialized but 2981 * never read. 2982 */ 2983 atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight); 2984 atomic_inc(&sd->shared->ref); 2985 } 2986 2987 /* 2988 * For asymmetric CPU capacity, attach sched_domain_shared on the innermost 2989 * SD_ASYM_CPUCAPACITY_FULL ancestor of @cpu's base domain when that ancestor is 2990 * not an overlapping NUMA-built domain (then LLC should claim shared). 2991 * 2992 * A CPU may lack any FULL ancestor (e.g., exclusive cpuset symmetric island), 2993 * then LLC must claim shared instead. 2994 * 2995 * Note: SD_ASYM_CPUCAPACITY_FULL is only set when all CPU capacity values 2996 * are present in the domain span, so the asym domain we attach to cannot 2997 * degenerate into a single-capacity group. The relevant edge cases are instead 2998 * covered by the caveats above. 2999 * 3000 * Return true if this CPU's asym path claimed sd->shared, false otherwise. 
3001 */ 3002 static bool claim_asym_sched_domain_shared(struct s_data *d, int cpu) 3003 { 3004 struct sched_domain *sd = *per_cpu_ptr(d->sd, cpu); 3005 struct sched_domain *sd_asym; 3006 3007 if (!sd) 3008 return false; 3009 3010 sd_asym = sd; 3011 while (sd_asym && !(sd_asym->flags & SD_ASYM_CPUCAPACITY_FULL)) 3012 sd_asym = sd_asym->parent; 3013 3014 if (!sd_asym || (sd_asym->flags & SD_NUMA)) 3015 return false; 3016 3017 init_sched_domain_shared(d, sd_asym, SD_ASYM_CPUCAPACITY); 3018 return true; 3019 } 3020 3021 static int __sched_domains_alloc_llc_id(void) 3022 { 3023 int lid, max; 3024 3025 lockdep_assert_held(&sched_domains_mutex); 3026 3027 lid = cpumask_first_zero(sched_domains_llc_id_allocmask); 3028 /* 3029 * llc_id space should never grow larger than the 3030 * possible number of CPUs in the system. 3031 */ 3032 if (lid >= nr_cpu_ids) 3033 return -1; 3034 3035 __cpumask_set_cpu(lid, sched_domains_llc_id_allocmask); 3036 max = cpumask_last(sched_domains_llc_id_allocmask); 3037 if (max > max_lid) 3038 max_lid = max; 3039 3040 return lid; 3041 } 3042 3043 static void __sched_domains_free_llc_id(int cpu) 3044 { 3045 int i, lid, max; 3046 3047 lockdep_assert_held(&sched_domains_mutex); 3048 3049 lid = per_cpu(sd_llc_id, cpu); 3050 if (lid == -1 || lid >= nr_cpu_ids) 3051 return; 3052 3053 per_cpu(sd_llc_id, cpu) = -1; 3054 3055 for_each_cpu(i, llc_mask(cpu)) { 3056 /* An online CPU owns the llc_id. 
*/ 3057 if (per_cpu(sd_llc_id, i) == lid) 3058 return; 3059 } 3060 3061 __cpumask_clear_cpu(lid, sched_domains_llc_id_allocmask); 3062 3063 max = cpumask_last(sched_domains_llc_id_allocmask); 3064 /* shrink max lid to save memory */ 3065 if (max < max_lid) 3066 max_lid = max; 3067 } 3068 3069 void sched_domains_free_llc_id(int cpu) 3070 { 3071 sched_domains_mutex_lock(); 3072 __sched_domains_free_llc_id(cpu); 3073 sched_domains_mutex_unlock(); 3074 } 3075 3076 /* 3077 * Build sched domains for a given set of CPUs and attach the sched domains 3078 * to the individual CPUs 3079 */ 3080 static int 3081 build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr, 3082 bool *multi_llcs) 3083 { 3084 enum s_alloc alloc_state = sa_none; 3085 bool has_multi_llcs = false; 3086 struct sched_domain *sd; 3087 struct s_data d; 3088 struct rq *rq = NULL; 3089 int i, ret = -ENOMEM; 3090 bool has_asym = false; 3091 bool has_cluster = false; 3092 3093 if (WARN_ON(cpumask_empty(cpu_map))) 3094 goto error; 3095 3096 alloc_state = __visit_domain_allocation_hell(&d, cpu_map); 3097 if (alloc_state != sa_rootdomain) 3098 goto error; 3099 3100 /* Set up domains for CPUs specified by the cpu_map: */ 3101 for_each_cpu(i, cpu_map) { 3102 struct sched_domain_topology_level *tl; 3103 int lid; 3104 3105 sd = NULL; 3106 for_each_sd_topology(tl) { 3107 3108 sd = build_sched_domain(tl, cpu_map, attr, sd, i); 3109 3110 has_asym |= sd->flags & SD_ASYM_CPUCAPACITY; 3111 3112 if (tl == sched_domain_topology) 3113 *per_cpu_ptr(d.sd, i) = sd; 3114 if (cpumask_equal(cpu_map, sched_domain_span(sd))) 3115 break; 3116 } 3117 3118 lid = per_cpu(sd_llc_id, i); 3119 if (lid == -1) { 3120 /* try to reuse the llc_id of its siblings */ 3121 for (int j = cpumask_first(llc_mask(i)); 3122 j < nr_cpu_ids; 3123 j = cpumask_next(j, llc_mask(i))) { 3124 if (i == j) 3125 continue; 3126 3127 lid = per_cpu(sd_llc_id, j); 3128 3129 if (lid != -1) { 3130 per_cpu(sd_llc_id, i) = lid; 3131 3132 break; 3133 
} 3134 } 3135 3136 /* a new LLC is detected */ 3137 if (lid == -1) 3138 per_cpu(sd_llc_id, i) = __sched_domains_alloc_llc_id(); 3139 } 3140 } 3141 3142 if (WARN_ON(!topology_span_sane(cpu_map))) 3143 goto error; 3144 3145 /* Build the groups for the domains */ 3146 for_each_cpu(i, cpu_map) { 3147 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { 3148 sd->span_weight = cpumask_weight(sched_domain_span(sd)); 3149 if (sd->flags & SD_NUMA) { 3150 if (build_overlap_sched_groups(sd, i)) 3151 goto error; 3152 } else { 3153 if (build_sched_groups(sd, i)) 3154 goto error; 3155 } 3156 } 3157 } 3158 3159 for_each_cpu(i, cpu_map) { 3160 sd = *per_cpu_ptr(d.sd, i); 3161 if (!sd) 3162 continue; 3163 3164 if (has_asym) 3165 claim_asym_sched_domain_shared(&d, i); 3166 3167 /* First, find the topmost SD_SHARE_LLC domain */ 3168 while (sd->parent && (sd->parent->flags & SD_SHARE_LLC)) 3169 sd = sd->parent; 3170 3171 if (sd->flags & SD_SHARE_LLC) { 3172 init_sched_domain_shared(&d, sd, SD_SHARE_LLC); 3173 3174 /* 3175 * In presence of higher domains, adjust the 3176 * NUMA imbalance stats for the hierarchy. 
3177 */ 3178 if (sd->parent) { 3179 if (IS_ENABLED(CONFIG_NUMA)) 3180 adjust_numa_imbalance(sd); 3181 3182 if (sd_in_multi_llcs(sd)) 3183 has_multi_llcs = true; 3184 } 3185 } 3186 } 3187 3188 /* Calculate CPU capacity for physical packages and nodes */ 3189 for (i = nr_cpumask_bits-1; i >= 0; i--) { 3190 if (!cpumask_test_cpu(i, cpu_map)) 3191 continue; 3192 3193 claim_allocations(i, &d); 3194 3195 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) 3196 init_sched_groups_capacity(i, sd); 3197 } 3198 3199 alloc_sd_llc(cpu_map, &d); 3200 3201 /* Attach the domains */ 3202 rcu_read_lock(); 3203 for_each_cpu(i, cpu_map) { 3204 rq = cpu_rq(i); 3205 sd = *per_cpu_ptr(d.sd, i); 3206 3207 cpu_attach_domain(sd, d.rd, i); 3208 3209 if (lowest_flag_domain(i, SD_CLUSTER)) 3210 has_cluster = true; 3211 } 3212 rcu_read_unlock(); 3213 3214 if (has_asym) 3215 static_branch_inc_cpuslocked(&sched_asym_cpucapacity); 3216 3217 if (has_cluster) 3218 static_branch_inc_cpuslocked(&sched_cluster_active); 3219 3220 if (rq && sched_debug_verbose) 3221 pr_info("root domain span: %*pbl\n", cpumask_pr_args(cpu_map)); 3222 3223 ret = 0; 3224 error: 3225 *multi_llcs = has_multi_llcs; 3226 __free_domain_allocs(&d, alloc_state, cpu_map); 3227 3228 return ret; 3229 } 3230 3231 /* Current sched domains: */ 3232 static cpumask_var_t *doms_cur; 3233 3234 /* Number of sched domains in 'doms_cur': */ 3235 static int ndoms_cur; 3236 3237 /* Attributes of custom domains in 'doms_cur' */ 3238 static struct sched_domain_attr *dattr_cur; 3239 3240 /* 3241 * Special case: If a kmalloc() of a doms_cur partition (array of 3242 * cpumask) fails, then fallback to a single sched domain, 3243 * as determined by the single cpumask fallback_doms. 3244 */ 3245 static cpumask_var_t fallback_doms; 3246 3247 /* 3248 * arch_update_cpu_topology lets virtualized architectures update the 3249 * CPU core maps. It is supposed to return 1 if the topology changed 3250 * or 0 if it stayed the same. 
3251 */ 3252 int __weak arch_update_cpu_topology(void) 3253 { 3254 return 0; 3255 } 3256 3257 cpumask_var_t *alloc_sched_domains(unsigned int ndoms) 3258 { 3259 int i; 3260 cpumask_var_t *doms; 3261 3262 doms = kmalloc_objs(*doms, ndoms); 3263 if (!doms) 3264 return NULL; 3265 for (i = 0; i < ndoms; i++) { 3266 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { 3267 free_sched_domains(doms, i); 3268 return NULL; 3269 } 3270 } 3271 return doms; 3272 } 3273 3274 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) 3275 { 3276 unsigned int i; 3277 for (i = 0; i < ndoms; i++) 3278 free_cpumask_var(doms[i]); 3279 kfree(doms); 3280 } 3281 3282 /* 3283 * Set up scheduler domains and groups. For now this just excludes isolated 3284 * CPUs, but could be used to exclude other special cases in the future. 3285 */ 3286 int __init sched_init_domains(const struct cpumask *cpu_map) 3287 { 3288 bool multi_llcs; 3289 int err; 3290 3291 zalloc_cpumask_var(&sched_domains_llc_id_allocmask, GFP_KERNEL); 3292 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL); 3293 zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL); 3294 zalloc_cpumask_var(&fallback_doms, GFP_KERNEL); 3295 3296 arch_update_cpu_topology(); 3297 asym_cpu_capacity_scan(); 3298 ndoms_cur = 1; 3299 doms_cur = alloc_sched_domains(ndoms_cur); 3300 if (!doms_cur) 3301 doms_cur = &fallback_doms; 3302 cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_TYPE_DOMAIN)); 3303 err = build_sched_domains(doms_cur[0], NULL, &multi_llcs); 3304 if (!err) 3305 sched_cache_set(multi_llcs); 3306 3307 return err; 3308 } 3309 3310 /* 3311 * Detach sched domains from a group of CPUs specified in cpu_map 3312 * These CPUs will now be attached to the NULL domain 3313 */ 3314 static void detach_destroy_domains(const struct cpumask *cpu_map) 3315 { 3316 unsigned int cpu = cpumask_any(cpu_map); 3317 int i; 3318 3319 if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu))) 3320 
static_branch_dec_cpuslocked(&sched_asym_cpucapacity); 3321 3322 if (static_branch_unlikely(&sched_cluster_active)) 3323 static_branch_dec_cpuslocked(&sched_cluster_active); 3324 3325 rcu_read_lock(); 3326 for_each_cpu(i, cpu_map) 3327 cpu_attach_domain(NULL, &def_root_domain, i); 3328 rcu_read_unlock(); 3329 } 3330 3331 /* handle null as "default" */ 3332 static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, 3333 struct sched_domain_attr *new, int idx_new) 3334 { 3335 struct sched_domain_attr tmp; 3336 3337 /* Fast path: */ 3338 if (!new && !cur) 3339 return 1; 3340 3341 tmp = SD_ATTR_INIT; 3342 3343 return !memcmp(cur ? (cur + idx_cur) : &tmp, 3344 new ? (new + idx_new) : &tmp, 3345 sizeof(struct sched_domain_attr)); 3346 } 3347 3348 /* 3349 * Partition sched domains as specified by the 'ndoms_new' 3350 * cpumasks in the array doms_new[] of cpumasks. This compares 3351 * doms_new[] to the current sched domain partitioning, doms_cur[]. 3352 * It destroys each deleted domain and builds each new domain. 3353 * 3354 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. 3355 * The masks don't intersect (don't overlap.) We should setup one 3356 * sched domain for each mask. CPUs not in any of the cpumasks will 3357 * not be load balanced. If the same cpumask appears both in the 3358 * current 'doms_cur' domains and in the new 'doms_new', we can leave 3359 * it as it is. 3360 * 3361 * The passed in 'doms_new' should be allocated using 3362 * alloc_sched_domains. This routine takes ownership of it and will 3363 * free_sched_domains it when done with it. If the caller failed the 3364 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, 3365 * and partition_sched_domains() will fallback to the single partition 3366 * 'fallback_doms', it also forces the domains to be rebuilt. 3367 * 3368 * If doms_new == NULL it will be replaced with cpu_online_mask. 
3369 * ndoms_new == 0 is a special case for destroying existing domains, 3370 * and it will not create the default domain. 3371 * 3372 * Call with hotplug lock and sched_domains_mutex held 3373 */ 3374 static void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], 3375 struct sched_domain_attr *dattr_new) 3376 { 3377 bool __maybe_unused has_eas = false; 3378 bool has_multi_llcs = false, multi_llcs; 3379 int i, j, n; 3380 int new_topology; 3381 3382 lockdep_assert_held(&sched_domains_mutex); 3383 3384 /* Let the architecture update CPU core mappings: */ 3385 new_topology = arch_update_cpu_topology(); 3386 /* Trigger rebuilding CPU capacity asymmetry data */ 3387 if (new_topology) 3388 asym_cpu_capacity_scan(); 3389 3390 if (!doms_new) { 3391 WARN_ON_ONCE(dattr_new); 3392 n = 0; 3393 doms_new = alloc_sched_domains(1); 3394 if (doms_new) { 3395 n = 1; 3396 cpumask_and(doms_new[0], cpu_active_mask, 3397 housekeeping_cpumask(HK_TYPE_DOMAIN)); 3398 } 3399 } else { 3400 n = ndoms_new; 3401 } 3402 3403 /* Destroy deleted domains: */ 3404 for (i = 0; i < ndoms_cur; i++) { 3405 for (j = 0; j < n && !new_topology; j++) { 3406 if (cpumask_equal(doms_cur[i], doms_new[j]) && 3407 dattrs_equal(dattr_cur, i, dattr_new, j)) 3408 goto match1; 3409 } 3410 /* No match - a current sched domain not in new doms_new[] */ 3411 detach_destroy_domains(doms_cur[i]); 3412 match1: 3413 ; 3414 } 3415 3416 n = ndoms_cur; 3417 if (!doms_new) { 3418 n = 0; 3419 doms_new = &fallback_doms; 3420 cpumask_and(doms_new[0], cpu_active_mask, 3421 housekeeping_cpumask(HK_TYPE_DOMAIN)); 3422 } 3423 3424 /* Build new domains: */ 3425 for (i = 0; i < ndoms_new; i++) { 3426 for (j = 0; j < n && !new_topology; j++) { 3427 if (cpumask_equal(doms_new[i], doms_cur[j]) && 3428 dattrs_equal(dattr_new, i, dattr_cur, j)) { 3429 /* 3430 * Reused partition has to be taken care 3431 * of here, because there could be a corner 3432 * case that if the reused partition is skipped 3433 * and only new 
partition is considered, an 3434 * incorrect has_multi_llcs would be set. For 3435 * example: 3436 * If the only multi-LLC partition is reused 3437 * and a new single-LLC partition is built, 3438 * sched_cache_set(false) disables cache-aware 3439 * scheduling globally despite the reused 3440 * multi-LLC partition still being active. 3441 */ 3442 struct sched_domain *sd; 3443 int cpu = cpumask_first(doms_cur[j]); 3444 3445 guard(rcu)(); 3446 sd = rcu_dereference(cpu_rq(cpu)->sd); 3447 while (sd && sd->parent && (sd->parent->flags & SD_SHARE_LLC)) 3448 sd = sd->parent; 3449 if (sd && (sd->flags & SD_SHARE_LLC) && sd->parent && 3450 sd_in_multi_llcs(sd)) 3451 has_multi_llcs = true; 3452 goto match2; 3453 } 3454 } 3455 /* No match - add a new doms_new */ 3456 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL, 3457 &multi_llcs); 3458 has_multi_llcs |= multi_llcs; 3459 match2: 3460 ; 3461 } 3462 sched_cache_set(has_multi_llcs); 3463 3464 #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) 3465 /* Build perf domains: */ 3466 for (i = 0; i < ndoms_new; i++) { 3467 for (j = 0; j < n && !sched_energy_update; j++) { 3468 if (cpumask_equal(doms_new[i], doms_cur[j]) && 3469 cpu_rq(cpumask_first(doms_cur[j]))->rd->pd) { 3470 has_eas = true; 3471 goto match3; 3472 } 3473 } 3474 /* No match - add perf domains for a new rd */ 3475 has_eas |= build_perf_domains(doms_new[i]); 3476 match3: 3477 ; 3478 } 3479 sched_energy_set(has_eas); 3480 #endif 3481 3482 /* Remember the new sched domains: */ 3483 if (doms_cur != &fallback_doms) 3484 free_sched_domains(doms_cur, ndoms_cur); 3485 3486 kfree(dattr_cur); 3487 doms_cur = doms_new; 3488 dattr_cur = dattr_new; 3489 ndoms_cur = ndoms_new; 3490 3491 update_sched_domain_debugfs(); 3492 dl_rebuild_rd_accounting(); 3493 } 3494 3495 /* 3496 * Call with hotplug lock held 3497 */ 3498 void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], 3499 struct sched_domain_attr *dattr_new) 3500 { 3501 
sched_domains_mutex_lock(); 3502 partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); 3503 sched_domains_mutex_unlock(); 3504 } 3505