// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2016 Thomas Gleixner.
 * Copyright (C) 2016-2017 Christoph Hellwig.
 */
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/sort.h>
#include <linux/group_cpus.h>

#ifdef CONFIG_SMP

static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
				unsigned int cpus_per_grp)
{
	const struct cpumask *siblmsk;
	int cpu, sibl;

	for ( ; cpus_per_grp > 0; ) {
		cpu = cpumask_first(nmsk);

		/* Should not happen, but I'm too lazy to think about it */
		if (cpu >= nr_cpu_ids)
			return;

		cpumask_clear_cpu(cpu, nmsk);
		cpumask_set_cpu(cpu, irqmsk);
		cpus_per_grp--;

		/* If the cpu has siblings, use them first */
		siblmsk = topology_sibling_cpumask(cpu);
		for (sibl = -1; cpus_per_grp > 0; ) {
			sibl = cpumask_next(sibl, siblmsk);
			if (sibl >= nr_cpu_ids)
				break;
			if (!cpumask_test_and_clear_cpu(sibl, nmsk))
				continue;
			cpumask_set_cpu(sibl, irqmsk);
			cpus_per_grp--;
		}
	}
}

static cpumask_var_t *alloc_node_to_cpumask(void)
{
	cpumask_var_t *masks;
	int node;

	masks = kcalloc(nr_node_ids, sizeof(cpumask_var_t), GFP_KERNEL);
	if (!masks)
		return NULL;

	for (node = 0; node < nr_node_ids; node++) {
		if (!zalloc_cpumask_var(&masks[node], GFP_KERNEL))
			goto out_unwind;
	}

	return masks;

out_unwind:
	while (--node >= 0)
		free_cpumask_var(masks[node]);
	kfree(masks);
	return NULL;
}

static void free_node_to_cpumask(cpumask_var_t *masks)
{
	int node;

	for (node = 0; node < nr_node_ids; node++)
		free_cpumask_var(masks[node]);
	kfree(masks);
}

static void build_node_to_cpumask(cpumask_var_t *masks)
{
	int cpu;

	for_each_possible_cpu(cpu)
		cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]);
}

static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
				const struct cpumask *mask, nodemask_t *nodemsk)
{
	int n, nodes = 0;

	/* Calculate the number of nodes in the supplied affinity mask */
	for_each_node(n) {
		if (cpumask_intersects(mask, node_to_cpumask[n])) {
			node_set(n, *nodemsk);
			nodes++;
		}
	}
	return nodes;
}

struct node_groups {
	unsigned id;

	union {
		unsigned ngroups;
		unsigned ncpus;
	};
};

static int ncpus_cmp_func(const void *l, const void *r)
{
	const struct node_groups *ln = l;
	const struct node_groups *rn = r;

	return ln->ncpus - rn->ncpus;
}

static void alloc_groups_to_nodes(unsigned int numgrps,
				  unsigned int numcpus,
				  struct node_groups *node_groups,
				  unsigned int num_nodes)
{
	unsigned int n, remaining_ncpus = numcpus;
	unsigned int ngroups, ncpus;

	sort(node_groups, num_nodes, sizeof(node_groups[0]),
	     ncpus_cmp_func, NULL);

	/*
	 * Allocate groups for each node according to the ratio of this
	 * node's nr_cpus to the remaining un-assigned ncpus. 'numgrps'
	 * is bigger than the number of active NUMA nodes. Always start
	 * the allocation from the node with the smallest nr_cpus.
	 *
	 * This guarantees that each active node gets allocated at
	 * least one group, and the theory is simple: over-allocation
	 * only happens when a node is assigned a single group, so the
	 * other nodes will still be allocated >= 1 groups, since
	 * 'numgrps' is bigger than the number of NUMA nodes.
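	 *
	 * For example (illustrative numbers): with 6 groups to spread
	 * across two nodes having 3 and 5 CPUs, the loop below assigns
	 * max(1, 6 * 3 / 8) = 2 groups to the 3-CPU node first, then
	 * max(1, 4 * 5 / 5) = 4 groups to the 5-CPU node.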
	 *
	 * One perfect invariant is that the number of allocated groups
	 * for each node is <= the CPU count of this node:
	 *
	 * 1) suppose there are two nodes: A and B
	 *	ncpu(X) is the CPU count of node X
	 *	grps(X) is the group count allocated to node X via this
	 *	algorithm
	 *
	 *	ncpu(A) <= ncpu(B)
	 *	ncpu(A) + ncpu(B) = N
	 *	grps(A) + grps(B) = G
	 *
	 *	grps(A) = max(1, round_down(G * ncpu(A) / N))
	 *	grps(B) = G - grps(A)
	 *
	 *	both N and G are integers, and 2 <= G <= N, suppose
	 *	G = N - delta, and 0 <= delta <= N - 2
	 *
	 * 2) obviously grps(A) <= ncpu(A) because:
	 *
	 *	if grps(A) is 1, then grps(A) <= ncpu(A) given
	 *	ncpu(A) >= 1
	 *
	 *	otherwise,
	 *		grps(A) <= G * ncpu(A) / N <= ncpu(A), given G <= N
	 *
	 * 3) prove how grps(B) <= ncpu(B):
	 *
	 *	if round_down(G * ncpu(A) / N) == 0, grps(B) won't be
	 *	over-allocated, so grps(B) <= ncpu(B),
	 *
	 *	otherwise:
	 *
	 *	grps(A) =
	 *		round_down(G * ncpu(A) / N) =
	 *		round_down((N - delta) * ncpu(A) / N) =
	 *		round_down((N * ncpu(A) - delta * ncpu(A)) / N) >=
	 *		round_down((N * ncpu(A) - delta * N) / N) =
	 *		ncpu(A) - delta
	 *
	 *	then:
	 *
	 *	grps(A) - G >= ncpu(A) - delta - G
	 *	=>
	 *	G - grps(A) <= G + delta - ncpu(A)
	 *	=>
	 *	grps(B) <= N - ncpu(A)
	 *	=>
	 *	grps(B) <= ncpu(B)
	 *
	 * For the case of >= 3 nodes, it can be thought of as one node
	 * and another big node, given that is exactly how this algorithm
	 * is implemented, and we always re-calculate 'remaining_ncpus' &
	 * 'numgrps', so finally for each node X: grps(X) <= ncpu(X).
	 *
	 */
	for (n = 0; n < num_nodes; n++) {
		if (node_groups[n].ncpus == UINT_MAX)
			continue;

		WARN_ON_ONCE(numgrps == 0);

		ncpus = node_groups[n].ncpus;
		ngroups = max_t(unsigned, 1,
				numgrps * ncpus / remaining_ncpus);
		WARN_ON_ONCE(ngroups > ncpus);

		node_groups[n].ngroups = ngroups;

		remaining_ncpus -= ncpus;
		numgrps -= ngroups;
	}
}

/*
 * Allocate a group count for each node, so that for each node:
 *
 * 1) the allocated number is >= 1
 *
 * 2) the allocated number is <= the active CPU count of this node
 *
 * The actual total of allocated groups may be less than @numgrps when
 * the total active CPU count is less than @numgrps.
 *
 * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]'
 * for each node.
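 *
 * For example (illustrative numbers): with @numgrps = 16 but only 4
 * active CPUs in @cpu_mask, at most 4 groups are allocated, so every
 * allocated group still gets at least one CPU.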
 */
static void alloc_nodes_groups(unsigned int numgrps,
			       cpumask_var_t *node_to_cpumask,
			       const struct cpumask *cpu_mask,
			       const nodemask_t nodemsk,
			       struct cpumask *nmsk,
			       struct node_groups *node_groups)
{
	unsigned int n, numcpus = 0;

	for (n = 0; n < nr_node_ids; n++) {
		node_groups[n].id = n;
		node_groups[n].ncpus = UINT_MAX;
	}

	for_each_node_mask(n, nodemsk) {
		unsigned int ncpus;

		cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
		ncpus = cpumask_weight(nmsk);

		if (!ncpus)
			continue;
		numcpus += ncpus;
		node_groups[n].ncpus = ncpus;
	}

	numgrps = min_t(unsigned int, numcpus, numgrps);
	alloc_groups_to_nodes(numgrps, numcpus, node_groups, nr_node_ids);
}

static void assign_cpus_to_groups(unsigned int ncpus,
				  struct cpumask *nmsk,
				  struct node_groups *nv,
				  struct cpumask *masks,
				  unsigned int *curgrp,
				  unsigned int last_grp)
{
	unsigned int v, cpus_per_grp, extra_grps;

	/* Account for rounding errors */
	extra_grps = ncpus - nv->ngroups * (ncpus / nv->ngroups);

	/* Spread the allocated groups on the CPUs of the current node */
	for (v = 0; v < nv->ngroups; v++, *curgrp += 1) {
		cpus_per_grp = ncpus / nv->ngroups;

		/* Account for extra groups to compensate for rounding errors */
		if (extra_grps) {
			cpus_per_grp++;
			--extra_grps;
		}

		/*
		 * wrapping has to be considered given 'startgrp'
		 * may start anywhere
		 */
		if (*curgrp >= last_grp)
			*curgrp = 0;
		grp_spread_init_one(&masks[*curgrp], nmsk, cpus_per_grp);
	}
}

static int alloc_cluster_groups(unsigned int ncpus,
				unsigned int ngroups,
				struct cpumask *node_cpumask,
				cpumask_var_t msk,
				const struct cpumask ***clusters_ptr,
				struct node_groups **cluster_groups_ptr)
{
	unsigned int ncluster = 0;
	unsigned int cpu, nc, n;
	const struct cpumask *cluster_mask;
	const struct cpumask **clusters;
	struct node_groups *cluster_groups;

	cpumask_copy(msk, node_cpumask);

	/* Probe how many clusters are in this node. */
	while (1) {
		cpu = cpumask_first(msk);
		if (cpu >= nr_cpu_ids)
			break;

		cluster_mask = topology_cluster_cpumask(cpu);
		if (!cpumask_weight(cluster_mask))
			goto no_cluster;
		/* Clear out the CPUs on the same cluster. */
		cpumask_andnot(msk, msk, cluster_mask);
		ncluster++;
	}

	/* If ngroups < ncluster, crossing clusters is inevitable, so skip. */
	if (ncluster == 0 || ncluster > ngroups)
		goto no_cluster;

	/* Allocate memory based on the cluster count. */
	clusters = kcalloc(ncluster, sizeof(struct cpumask *), GFP_KERNEL);
	if (!clusters)
		goto no_cluster;
	cluster_groups = kcalloc(ncluster, sizeof(struct node_groups), GFP_KERNEL);
	if (!cluster_groups)
		goto fail_cluster_groups;

	/* Fill in cluster info for later processing. */
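	/* 'msk' was emptied by the probing loop above, so re-seed it. */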
	cpumask_copy(msk, node_cpumask);
	for (n = 0; n < ncluster; n++) {
		cpu = cpumask_first(msk);
		cluster_mask = topology_cluster_cpumask(cpu);
		nc = cpumask_weight_and(cluster_mask, node_cpumask);
		clusters[n] = cluster_mask;
		cluster_groups[n].id = n;
		cluster_groups[n].ncpus = nc;
		cpumask_andnot(msk, msk, cluster_mask);
	}

	alloc_groups_to_nodes(ngroups, ncpus, cluster_groups, ncluster);

	*clusters_ptr = clusters;
	*cluster_groups_ptr = cluster_groups;
	return ncluster;

fail_cluster_groups:
	kfree(clusters);
no_cluster:
	return 0;
}

/*
 * Try to group CPUs evenly for cluster locality within a NUMA node.
 *
 * Return: true on success, false otherwise.
 */
static bool __try_group_cluster_cpus(unsigned int ncpus,
				     unsigned int ngroups,
				     struct cpumask *node_cpumask,
				     struct cpumask *masks,
				     unsigned int *curgrp,
				     unsigned int last_grp)
{
	struct node_groups *cluster_groups;
	const struct cpumask **clusters;
	unsigned int ncluster;
	bool ret = false;
	cpumask_var_t nmsk;
	unsigned int i, nc;

	if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
		goto fail_nmsk_alloc;

	ncluster = alloc_cluster_groups(ncpus, ngroups, node_cpumask, nmsk,
					&clusters, &cluster_groups);

	if (ncluster == 0)
		goto fail_no_clusters;

	for (i = 0; i < ncluster; i++) {
		struct node_groups *nv = &cluster_groups[i];

		/* Get the CPUs on this cluster. */
		cpumask_and(nmsk, node_cpumask, clusters[nv->id]);
		nc = cpumask_weight(nmsk);
		if (!nc)
			continue;
		WARN_ON_ONCE(nv->ngroups > nc);

		assign_cpus_to_groups(nc, nmsk, nv, masks, curgrp, last_grp);
	}

	ret = true;
	kfree(cluster_groups);
	kfree(clusters);
fail_no_clusters:
	free_cpumask_var(nmsk);
fail_nmsk_alloc:
	return ret;
}

static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
			       cpumask_var_t *node_to_cpumask,
			       const struct cpumask *cpu_mask,
			       struct cpumask *nmsk, struct cpumask *masks)
{
	unsigned int i, n, nodes, done = 0;
	unsigned int last_grp = numgrps;
	unsigned int curgrp = startgrp;
	nodemask_t nodemsk = NODE_MASK_NONE;
	struct node_groups *node_groups;

	if (cpumask_empty(cpu_mask))
		return 0;

	nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk);

	/*
	 * If the number of nodes in the mask is greater than or equal to
	 * the number of groups, just spread the groups across the nodes.
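	 * Each group then receives all in-mask CPUs of one or more nodes
	 * (wrapping around when there are more nodes than groups).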
	 */
	if (numgrps <= nodes) {
		for_each_node_mask(n, nodemsk) {
			/* Ensure that only CPUs which are in both masks are set */
			cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
			cpumask_or(&masks[curgrp], &masks[curgrp], nmsk);
			if (++curgrp == last_grp)
				curgrp = 0;
		}
		return numgrps;
	}

	node_groups = kcalloc(nr_node_ids,
			      sizeof(struct node_groups),
			      GFP_KERNEL);
	if (!node_groups)
		return -ENOMEM;

	/* allocate a group count for each node */
	alloc_nodes_groups(numgrps, node_to_cpumask, cpu_mask,
			   nodemsk, nmsk, node_groups);
	for (i = 0; i < nr_node_ids; i++) {
		unsigned int ncpus;
		struct node_groups *nv = &node_groups[i];

		if (nv->ngroups == UINT_MAX)
			continue;

		/* Get the CPUs on this node which are in the mask */
		cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]);
		ncpus = cpumask_weight(nmsk);
		if (!ncpus)
			continue;

		WARN_ON_ONCE(nv->ngroups > ncpus);

		if (__try_group_cluster_cpus(ncpus, nv->ngroups, nmsk,
					     masks, &curgrp, last_grp)) {
			done += nv->ngroups;
			continue;
		}

		assign_cpus_to_groups(ncpus, nmsk, nv, masks, &curgrp,
				      last_grp);
		done += nv->ngroups;
	}
	kfree(node_groups);
	return done;
}

/**
 * group_cpus_evenly - Group all CPUs evenly per NUMA/CPU locality
 * @numgrps: number of groups
 * @nummasks: number of initialized cpumasks
 *
 * Return: cpumask array on success, NULL otherwise. Each element of the
 * array holds the CPUs assigned to that group. @nummasks contains the
 * number of initialized masks, which can be less than @numgrps.
 *
 * Try to put CPUs that are close from the viewpoint of CPU and NUMA
 * locality into the same group, and run a two-stage grouping:
 *	1) allocate present CPUs to these groups evenly first
 *	2) allocate other possible CPUs to these groups evenly
 *
 * The resulting grouping guarantees that all CPUs are covered and that
 * no CPU is assigned to more than one group.
 */
struct cpumask *group_cpus_evenly(unsigned int numgrps, unsigned int *nummasks)
{
	unsigned int curgrp = 0, nr_present = 0, nr_others = 0;
	cpumask_var_t *node_to_cpumask;
	cpumask_var_t nmsk, npresmsk;
	int ret = -ENOMEM;
	struct cpumask *masks = NULL;

	if (numgrps == 0)
		return NULL;

	if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
		return NULL;

	if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL))
		goto fail_nmsk;

	node_to_cpumask = alloc_node_to_cpumask();
	if (!node_to_cpumask)
		goto fail_npresmsk;

	masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL);
	if (!masks)
		goto fail_node_to_cpumask;

	build_node_to_cpumask(node_to_cpumask);

	/*
	 * Make a local cache of 'cpu_present_mask' so the two-stage spread
	 * can observe a consistent 'cpu_present_mask' without holding the
	 * CPU hotplug lock, which reduces the deadlock risk with the CPU
	 * hotplug code.
	 *
	 * CPU hotplug may happen while 'cpu_present_mask' is read, and we
	 * can live with that because it only affects whether a hotplugged
	 * CPU is handled in the 1st or the 2nd stage; either way is correct
	 * from the API user's viewpoint since the 2-stage spread is merely
	 * an optimization.
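	 *
	 * The lockless read below is wrapped in data_race() to annotate it
	 * as an intentional, benign race (e.g. for KCSAN).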
	 */
	cpumask_copy(npresmsk, data_race(cpu_present_mask));

	/* group the present CPUs first */
	ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
				  npresmsk, nmsk, masks);
	if (ret < 0)
		goto fail_node_to_cpumask;
	nr_present = ret;

	/*
	 * Allocate non-present CPUs starting from the next group to be
	 * handled. If the grouping of present CPUs already exhausted the
	 * group space, assign the non-present CPUs to the groups already
	 * allocated.
	 */
	if (nr_present >= numgrps)
		curgrp = 0;
	else
		curgrp = nr_present;
	cpumask_andnot(npresmsk, cpu_possible_mask, npresmsk);
	ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
				  npresmsk, nmsk, masks);
	if (ret >= 0)
		nr_others = ret;

fail_node_to_cpumask:
	free_node_to_cpumask(node_to_cpumask);

fail_npresmsk:
	free_cpumask_var(npresmsk);

fail_nmsk:
	free_cpumask_var(nmsk);
	if (ret < 0) {
		kfree(masks);
		return NULL;
	}
	*nummasks = min(nr_present + nr_others, numgrps);
	return masks;
}
#else /* CONFIG_SMP */
struct cpumask *group_cpus_evenly(unsigned int numgrps, unsigned int *nummasks)
{
	struct cpumask *masks;

	if (numgrps == 0)
		return NULL;

	masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL);
	if (!masks)
		return NULL;

	/* assign all CPUs (i.e. CPU 0) to the 1st group only */
	cpumask_copy(&masks[0], cpu_possible_mask);
	*nummasks = 1;
	return masks;
}
#endif /* CONFIG_SMP */
EXPORT_SYMBOL_GPL(group_cpus_evenly);