// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
/*
 * Copyright(c) 2015 - 2020 Intel Corporation.
 */

#include <linux/topology.h>
#include <linux/cpumask.h>
#include <linux/interrupt.h>
#include <linux/numa.h>

#include "hfi.h"
#include "affinity.h"
#include "sdma.h"
#include "trace.h"

struct hfi1_affinity_node_list node_affinity = {
	.list = LIST_HEAD_INIT(node_affinity.list),
	.lock = __MUTEX_INITIALIZER(node_affinity.lock)
};

/* Name of IRQ types, indexed by enum irq_type */
static const char * const irq_type_names[] = {
	"SDMA",
	"RCVCTXT",
	"NETDEVCTXT",
	"GENERAL",
	"OTHER",
};

/* Per NUMA node count of HFI devices */
static unsigned int *hfi1_per_node_cntr;

static inline void init_cpu_mask_set(struct cpu_mask_set *set)
{
	cpumask_clear(&set->mask);
	cpumask_clear(&set->used);
	set->gen = 0;
}

/* Increment generation of CPU set if needed */
static void _cpu_mask_set_gen_inc(struct cpu_mask_set *set)
{
	if (cpumask_equal(&set->mask, &set->used)) {
		/*
		 * We've used up all the CPUs, bump up the generation
		 * and reset the 'used' map
		 */
		set->gen++;
		cpumask_clear(&set->used);
	}
}

static void _cpu_mask_set_gen_dec(struct cpu_mask_set *set)
{
	if (cpumask_empty(&set->used) && set->gen) {
		set->gen--;
		cpumask_copy(&set->used, &set->mask);
	}
}

/* Get the first CPU from the list of unused CPUs in a CPU set data structure */
static int cpu_mask_set_get_first(struct cpu_mask_set *set, cpumask_var_t diff)
{
	int cpu;

	if (!diff || !set)
		return -EINVAL;

	_cpu_mask_set_gen_inc(set);

	/* Find out CPUs left in CPU mask */
	cpumask_andnot(diff, &set->mask, &set->used);

	cpu = cpumask_first(diff);
	if (cpu >= nr_cpu_ids) /* empty */
		cpu = -EINVAL;
	else
		cpumask_set_cpu(cpu, &set->used);

	return cpu;
}

static void cpu_mask_set_put(struct cpu_mask_set *set, int cpu)
{
	if (!set)
		return;

	cpumask_clear_cpu(cpu, &set->used);
	_cpu_mask_set_gen_dec(set);
}

/* Initialize non-HT cpu cores mask */
void init_real_cpu_mask(void)
{
	int possible, curr_cpu, ht;

	/* Start with cpu online mask as the real cpu mask */
	cpumask_copy(&node_affinity.real_cpu_mask, cpu_online_mask);

	/*
	 * Remove HT cores from the real cpu mask. Do this in two steps below.
	 */
	possible = cpumask_weight(&node_affinity.real_cpu_mask);
	ht = cpumask_weight(topology_sibling_cpumask(
				cpumask_first(&node_affinity.real_cpu_mask)));
	/*
	 * Step 1. Skip over the first N HT siblings and use them as the
	 * "real" cores. Assumes that HT cores are not enumerated in
	 * succession (except in the single core case).
	 */
	curr_cpu = cpumask_nth(possible / ht, &node_affinity.real_cpu_mask) + 1;
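
	/*
	 * Illustrative sketch (assumed topology, not taken from the code):
	 * with 16 online CPUs and 2 HT siblings per core, possible = 16 and
	 * ht = 2, so roughly the first possible / ht = 8 CPUs remain as the
	 * "real" cores and the remaining sibling threads are cleared in
	 * Step 2 below.
	 */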

	/* Step 2. Remove the remaining HT siblings. */
	cpumask_clear_cpus(&node_affinity.real_cpu_mask, curr_cpu, nr_cpu_ids - curr_cpu);
}

int node_affinity_init(void)
{
	int node;
	struct pci_dev *dev = NULL;
	const struct pci_device_id *ids = hfi1_pci_tbl;

	cpumask_clear(&node_affinity.proc.used);
	cpumask_copy(&node_affinity.proc.mask, cpu_online_mask);

	node_affinity.proc.gen = 0;
	node_affinity.num_core_siblings =
		cpumask_weight(topology_sibling_cpumask(
			cpumask_first(&node_affinity.proc.mask)));
	node_affinity.num_possible_nodes = num_possible_nodes();
	node_affinity.num_online_nodes = num_online_nodes();
	node_affinity.num_online_cpus = num_online_cpus();

	/*
	 * The real cpu mask is part of the affinity struct but it has to be
	 * initialized early. It is needed to calculate the number of user
	 * contexts in set_up_context_variables().
	 */
	init_real_cpu_mask();

	hfi1_per_node_cntr = kcalloc(node_affinity.num_possible_nodes,
				     sizeof(*hfi1_per_node_cntr), GFP_KERNEL);
	if (!hfi1_per_node_cntr)
		return -ENOMEM;

	while (ids->vendor) {
		dev = NULL;
		while ((dev = pci_get_device(ids->vendor, ids->device, dev))) {
			node = pcibus_to_node(dev->bus);
			if (node < 0)
				goto out;

			hfi1_per_node_cntr[node]++;
		}
		ids++;
	}

	return 0;

out:
	/*
	 * Invalid PCI NUMA node information found, note it, and populate
	 * our database 1:1.
	 */
	pr_err("HFI: Invalid PCI NUMA node. Performance may be affected\n");
	pr_err("HFI: System BIOS may need to be upgraded\n");
	for (node = 0; node < node_affinity.num_possible_nodes; node++)
		hfi1_per_node_cntr[node] = 1;

	pci_dev_put(dev);

	return 0;
}

static void node_affinity_destroy(struct hfi1_affinity_node *entry)
{
	free_percpu(entry->comp_vect_affinity);
	kfree(entry);
}

void node_affinity_destroy_all(void)
{
	struct list_head *pos, *q;
	struct hfi1_affinity_node *entry;

	mutex_lock(&node_affinity.lock);
	list_for_each_safe(pos, q, &node_affinity.list) {
		entry = list_entry(pos, struct hfi1_affinity_node,
				   list);
		list_del(pos);
		node_affinity_destroy(entry);
	}
	mutex_unlock(&node_affinity.lock);
	kfree(hfi1_per_node_cntr);
}

static struct hfi1_affinity_node *node_affinity_allocate(int node)
{
	struct hfi1_affinity_node *entry;

	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
	if (!entry)
		return NULL;
	entry->node = node;
	entry->comp_vect_affinity = alloc_percpu(u16);
	INIT_LIST_HEAD(&entry->list);

	return entry;
}
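
/*
 * Usage note, inferred from hfi1_dev_affinity_init() below:
 * node_affinity_allocate() only allocates and minimally initializes an
 * entry. The caller is expected to fill in the CPU masks and, once the
 * entry is ready, link it into the global list with
 * node_affinity_add_tail() while holding node_affinity.lock.
 */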

/*
 * It appends an entry to the list.
 * It *must* be called with node_affinity.lock held.
 */
static void node_affinity_add_tail(struct hfi1_affinity_node *entry)
{
	list_add_tail(&entry->list, &node_affinity.list);
}

/* It must be called with node_affinity.lock held */
static struct hfi1_affinity_node *node_affinity_lookup(int node)
{
	struct hfi1_affinity_node *entry;

	list_for_each_entry(entry, &node_affinity.list, list) {
		if (entry->node == node)
			return entry;
	}

	return NULL;
}

static int per_cpu_affinity_get(cpumask_var_t possible_cpumask,
				u16 __percpu *comp_vect_affinity)
{
	int curr_cpu;
	u16 cntr;
	u16 prev_cntr;
	int ret_cpu;

	if (!possible_cpumask) {
		ret_cpu = -EINVAL;
		goto fail;
	}

	if (!comp_vect_affinity) {
		ret_cpu = -EINVAL;
		goto fail;
	}

	ret_cpu = cpumask_first(possible_cpumask);
	if (ret_cpu >= nr_cpu_ids) {
		ret_cpu = -EINVAL;
		goto fail;
	}

	prev_cntr = *per_cpu_ptr(comp_vect_affinity, ret_cpu);
	for_each_cpu(curr_cpu, possible_cpumask) {
		cntr = *per_cpu_ptr(comp_vect_affinity, curr_cpu);

		if (cntr < prev_cntr) {
			ret_cpu = curr_cpu;
			prev_cntr = cntr;
		}
	}

	*per_cpu_ptr(comp_vect_affinity, ret_cpu) += 1;

fail:
	return ret_cpu;
}

static int per_cpu_affinity_put_max(cpumask_var_t possible_cpumask,
				    u16 __percpu *comp_vect_affinity)
{
	int curr_cpu;
	int max_cpu;
	u16 cntr;
	u16 prev_cntr;

	if (!possible_cpumask)
		return -EINVAL;

	if (!comp_vect_affinity)
		return -EINVAL;

	max_cpu = cpumask_first(possible_cpumask);
	if (max_cpu >= nr_cpu_ids)
		return -EINVAL;

	prev_cntr = *per_cpu_ptr(comp_vect_affinity, max_cpu);
	for_each_cpu(curr_cpu, possible_cpumask) {
		cntr = *per_cpu_ptr(comp_vect_affinity, curr_cpu);

		if (cntr > prev_cntr) {
			max_cpu = curr_cpu;
			prev_cntr = cntr;
		}
	}

	*per_cpu_ptr(comp_vect_affinity, max_cpu) -= 1;

	return max_cpu;
}
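
/*
 * Illustrative example (hypothetical counter values, not from the code):
 * per_cpu_affinity_get() picks the CPU with the lowest comp_vect_affinity
 * counter and increments it, while per_cpu_affinity_put_max() picks the CPU
 * with the highest counter and decrements it. With CPUs {2, 3, 4} holding
 * counters {1, 0, 2}, a get returns CPU 3 (new counters {1, 1, 2}) and a
 * subsequent put_max returns CPU 4 (counters back to {1, 1, 1}).
 */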

/*
 * Non-interrupt CPUs are used first, then interrupt CPUs.
 * Two already allocated cpu masks must be passed.
 */
static int _dev_comp_vect_cpu_get(struct hfi1_devdata *dd,
				  struct hfi1_affinity_node *entry,
				  cpumask_var_t non_intr_cpus,
				  cpumask_var_t available_cpus)
	__must_hold(&node_affinity.lock)
{
	int cpu;
	struct cpu_mask_set *set = dd->comp_vect;

	lockdep_assert_held(&node_affinity.lock);
	if (!non_intr_cpus) {
		cpu = -1;
		goto fail;
	}

	if (!available_cpus) {
		cpu = -1;
		goto fail;
	}

	/* Available CPUs for pinning completion vectors */
	_cpu_mask_set_gen_inc(set);
	cpumask_andnot(available_cpus, &set->mask, &set->used);

	/* Available CPUs without SDMA engine interrupts */
	cpumask_andnot(non_intr_cpus, available_cpus,
		       &entry->def_intr.used);

	/* If there are non-interrupt CPUs available, use them first */
	cpu = cpumask_first(non_intr_cpus);

	/* Otherwise, use interrupt CPUs */
	if (cpu >= nr_cpu_ids)
		cpu = cpumask_first(available_cpus);

	if (cpu >= nr_cpu_ids) { /* empty */
		cpu = -1;
		goto fail;
	}
	cpumask_set_cpu(cpu, &set->used);

fail:
	return cpu;
}

static void _dev_comp_vect_cpu_put(struct hfi1_devdata *dd, int cpu)
{
	struct cpu_mask_set *set = dd->comp_vect;

	if (cpu < 0)
		return;

	cpu_mask_set_put(set, cpu);
}

/* _dev_comp_vect_mappings_destroy() is reentrant */
static void _dev_comp_vect_mappings_destroy(struct hfi1_devdata *dd)
{
	int i, cpu;

	if (!dd->comp_vect_mappings)
		return;

	for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
		cpu = dd->comp_vect_mappings[i];
		_dev_comp_vect_cpu_put(dd, cpu);
		dd->comp_vect_mappings[i] = -1;
		hfi1_cdbg(AFFINITY,
			  "[%s] Release CPU %d from completion vector %d",
			  rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), cpu, i);
	}

	kfree(dd->comp_vect_mappings);
	dd->comp_vect_mappings = NULL;
}
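
/*
 * Sketch of the data structure built below (based on the code in this file):
 * dd->comp_vect_mappings is an array indexed by completion vector number,
 * holding the CPU each vector is pinned to, or -1 when unassigned. For
 * example, with comp_vect_possible_cpus = 3 a fully built table might look
 * like {8, 9, 10}, so hfi1_comp_vect_mappings_lookup(rdi, 1) returns CPU 9
 * (the concrete CPU numbers here are only illustrative).
 */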

/*
 * This function creates the table for looking up CPUs for completion vectors.
 * num_comp_vectors needs to have been initialized before calling this function.
 */
static int _dev_comp_vect_mappings_create(struct hfi1_devdata *dd,
					  struct hfi1_affinity_node *entry)
	__must_hold(&node_affinity.lock)
{
	int i, cpu, ret;
	cpumask_var_t non_intr_cpus;
	cpumask_var_t available_cpus;

	lockdep_assert_held(&node_affinity.lock);

	if (!zalloc_cpumask_var(&non_intr_cpus, GFP_KERNEL))
		return -ENOMEM;

	if (!zalloc_cpumask_var(&available_cpus, GFP_KERNEL)) {
		free_cpumask_var(non_intr_cpus);
		return -ENOMEM;
	}

	dd->comp_vect_mappings = kcalloc(dd->comp_vect_possible_cpus,
					 sizeof(*dd->comp_vect_mappings),
					 GFP_KERNEL);
	if (!dd->comp_vect_mappings) {
		ret = -ENOMEM;
		goto fail;
	}
	for (i = 0; i < dd->comp_vect_possible_cpus; i++)
		dd->comp_vect_mappings[i] = -1;

	for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
		cpu = _dev_comp_vect_cpu_get(dd, entry, non_intr_cpus,
					     available_cpus);
		if (cpu < 0) {
			ret = -EINVAL;
			goto fail;
		}

		dd->comp_vect_mappings[i] = cpu;
		hfi1_cdbg(AFFINITY,
			  "[%s] Completion Vector %d -> CPU %d",
			  rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), i, cpu);
	}

	free_cpumask_var(available_cpus);
	free_cpumask_var(non_intr_cpus);
	return 0;

fail:
	free_cpumask_var(available_cpus);
	free_cpumask_var(non_intr_cpus);
	_dev_comp_vect_mappings_destroy(dd);

	return ret;
}

int hfi1_comp_vectors_set_up(struct hfi1_devdata *dd)
{
	int ret;
	struct hfi1_affinity_node *entry;

	mutex_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);
	if (!entry) {
		ret = -EINVAL;
		goto unlock;
	}
	ret = _dev_comp_vect_mappings_create(dd, entry);
unlock:
	mutex_unlock(&node_affinity.lock);

	return ret;
}

void hfi1_comp_vectors_clean_up(struct hfi1_devdata *dd)
{
	_dev_comp_vect_mappings_destroy(dd);
}

int hfi1_comp_vect_mappings_lookup(struct rvt_dev_info *rdi, int comp_vect)
{
	struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi);
	struct hfi1_devdata *dd = dd_from_dev(verbs_dev);

	if (!dd->comp_vect_mappings)
		return -EINVAL;
	if (comp_vect >= dd->comp_vect_possible_cpus)
		return -EINVAL;

	return dd->comp_vect_mappings[comp_vect];
}

/*
 * It assumes dd->comp_vect_possible_cpus is available.
 */
static int _dev_comp_vect_cpu_mask_init(struct hfi1_devdata *dd,
					struct hfi1_affinity_node *entry,
					bool first_dev_init)
	__must_hold(&node_affinity.lock)
{
	int i, j, curr_cpu;
	int possible_cpus_comp_vect = 0;
	struct cpumask *dev_comp_vect_mask = &dd->comp_vect->mask;

	lockdep_assert_held(&node_affinity.lock);
	/*
	 * If there's only one CPU available for completion vectors, then
	 * there will only be one completion vector available. Otherwise,
	 * the number of completion vectors available will be the number of
	 * available CPUs divided by the number of devices in the
	 * local NUMA node.
	 */
	if (cpumask_weight(&entry->comp_vect_mask) == 1) {
		possible_cpus_comp_vect = 1;
		dd_dev_warn(dd,
			    "Number of kernel receive queues is too large for completion vector affinity to be effective\n");
	} else {
		possible_cpus_comp_vect +=
			cpumask_weight(&entry->comp_vect_mask) /
						hfi1_per_node_cntr[dd->node];
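
		/*
		 * Worked example (hypothetical numbers): 10 completion
		 * vector CPUs on this NUMA node shared by 3 devices gives
		 * 10 / 3 = 3 CPUs per device; the remainder check below
		 * hands the leftover CPU to the first device initialized,
		 * so that device ends up with 4.
		 */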
		/*
		 * If the available completion vector CPUs don't divide
		 * evenly among devices, then the first device to be
		 * initialized gets an extra CPU.
		 */
		if (first_dev_init &&
		    cpumask_weight(&entry->comp_vect_mask) %
		    hfi1_per_node_cntr[dd->node] != 0)
			possible_cpus_comp_vect++;
	}

	dd->comp_vect_possible_cpus = possible_cpus_comp_vect;

	/* Reserving CPUs for device completion vector */
	for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
		curr_cpu = per_cpu_affinity_get(&entry->comp_vect_mask,
						entry->comp_vect_affinity);
		if (curr_cpu < 0)
			goto fail;

		cpumask_set_cpu(curr_cpu, dev_comp_vect_mask);
	}

	hfi1_cdbg(AFFINITY,
		  "[%s] Completion vector affinity CPU set(s) %*pbl",
		  rvt_get_ibdev_name(&(dd)->verbs_dev.rdi),
		  cpumask_pr_args(dev_comp_vect_mask));

	return 0;

fail:
	for (j = 0; j < i; j++)
		per_cpu_affinity_put_max(&entry->comp_vect_mask,
					 entry->comp_vect_affinity);

	return curr_cpu;
}

/*
 * It assumes dd->comp_vect_possible_cpus is available.
 */
static void _dev_comp_vect_cpu_mask_clean_up(struct hfi1_devdata *dd,
					     struct hfi1_affinity_node *entry)
	__must_hold(&node_affinity.lock)
{
	int i, cpu;

	lockdep_assert_held(&node_affinity.lock);
	if (!dd->comp_vect_possible_cpus)
		return;

	for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
		cpu = per_cpu_affinity_put_max(&dd->comp_vect->mask,
					       entry->comp_vect_affinity);
		/* Clearing CPU in device completion vector cpu mask */
		if (cpu >= 0)
			cpumask_clear_cpu(cpu, &dd->comp_vect->mask);
	}

	dd->comp_vect_possible_cpus = 0;
}

/*
 * Interrupt affinity.
 *
 * non-rcv avail gets a default mask that
 * starts as possible cpus with threads reset
 * and each rcv avail reset.
 *
 * rcv avail gets node relative 1 wrapping back
 * to the node relative 1 as necessary.
 *
 */
int hfi1_dev_affinity_init(struct hfi1_devdata *dd)
{
	struct hfi1_affinity_node *entry;
	const struct cpumask *local_mask;
	int curr_cpu, possible, i, ret;
	bool new_entry = false;

	local_mask = cpumask_of_node(dd->node);
	if (cpumask_first(local_mask) >= nr_cpu_ids)
		local_mask = topology_core_cpumask(0);

	mutex_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);

	/*
	 * If this is the first time this NUMA node's affinity is used,
	 * create an entry in the global affinity structure and initialize it.
	 */
	if (!entry) {
		entry = node_affinity_allocate(dd->node);
		if (!entry) {
			dd_dev_err(dd,
				   "Unable to allocate global affinity node\n");
			ret = -ENOMEM;
			goto fail;
		}
		new_entry = true;

		init_cpu_mask_set(&entry->def_intr);
		init_cpu_mask_set(&entry->rcv_intr);
		cpumask_clear(&entry->comp_vect_mask);
		cpumask_clear(&entry->general_intr_mask);
		/* Use the "real" cpu mask of this node as the default */
		cpumask_and(&entry->def_intr.mask, &node_affinity.real_cpu_mask,
			    local_mask);

		/* fill in the receive list */
		possible = cpumask_weight(&entry->def_intr.mask);
		curr_cpu = cpumask_first(&entry->def_intr.mask);

		if (possible == 1) {
			/* only one CPU, everyone will use it */
			cpumask_set_cpu(curr_cpu, &entry->rcv_intr.mask);
			cpumask_set_cpu(curr_cpu, &entry->general_intr_mask);
		} else {
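			/*
			 * Illustrative split (hypothetical node with "real"
			 * CPUs 0-7 and a single HFI on the node with
			 * n_krcv_queues = 3): CPU 0 is moved to
			 * general_intr_mask for the general/control context,
			 * CPUs 1-2 go to rcv_intr.mask for the remaining
			 * kernel receive queues, and CPUs 3-7 stay in
			 * def_intr.mask for the SDMA engines.
			 */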
			/*
			 * The general/control context will be the first CPU in
			 * the default list, so it is removed from the default
			 * list and added to the general interrupt list.
			 */
			cpumask_clear_cpu(curr_cpu, &entry->def_intr.mask);
			cpumask_set_cpu(curr_cpu, &entry->general_intr_mask);
			curr_cpu = cpumask_next(curr_cpu,
						&entry->def_intr.mask);

			/*
			 * Remove the remaining kernel receive queues from
			 * the default list and add them to the receive list.
			 */
			for (i = 0;
			     i < (dd->n_krcv_queues - 1) *
				  hfi1_per_node_cntr[dd->node];
			     i++) {
				cpumask_clear_cpu(curr_cpu,
						  &entry->def_intr.mask);
				cpumask_set_cpu(curr_cpu,
						&entry->rcv_intr.mask);
				curr_cpu = cpumask_next(curr_cpu,
							&entry->def_intr.mask);
				if (curr_cpu >= nr_cpu_ids)
					break;
			}

			/*
			 * If there ends up being 0 CPU cores leftover for SDMA
			 * engines, use the same CPU cores as general/control
			 * context.
			 */
			if (cpumask_empty(&entry->def_intr.mask))
				cpumask_copy(&entry->def_intr.mask,
					     &entry->general_intr_mask);
		}

		/* Determine completion vector CPUs for the entire node */
		cpumask_and(&entry->comp_vect_mask,
			    &node_affinity.real_cpu_mask, local_mask);
		cpumask_andnot(&entry->comp_vect_mask,
			       &entry->comp_vect_mask,
			       &entry->rcv_intr.mask);
		cpumask_andnot(&entry->comp_vect_mask,
			       &entry->comp_vect_mask,
			       &entry->general_intr_mask);

		/*
		 * If there ends up being 0 CPU cores leftover for completion
		 * vectors, use the same CPU core as the general/control
		 * context.
		 */
		if (cpumask_empty(&entry->comp_vect_mask))
			cpumask_copy(&entry->comp_vect_mask,
				     &entry->general_intr_mask);
	}

	ret = _dev_comp_vect_cpu_mask_init(dd, entry, new_entry);
	if (ret < 0)
		goto fail;

	if (new_entry)
		node_affinity_add_tail(entry);

	dd->affinity_entry = entry;
	mutex_unlock(&node_affinity.lock);

	return 0;

fail:
	if (new_entry)
		node_affinity_destroy(entry);
	mutex_unlock(&node_affinity.lock);
	return ret;
}

void hfi1_dev_affinity_clean_up(struct hfi1_devdata *dd)
{
	struct hfi1_affinity_node *entry;

	mutex_lock(&node_affinity.lock);
	if (!dd->affinity_entry)
		goto unlock;
	entry = node_affinity_lookup(dd->node);
	if (!entry)
		goto unlock;

	/*
	 * Free device completion vector CPUs to be used by future
	 * completion vectors
	 */
	_dev_comp_vect_cpu_mask_clean_up(dd, entry);
unlock:
	dd->affinity_entry = NULL;
	mutex_unlock(&node_affinity.lock);
}
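
/*
 * SDMA IRQ affinity rebalancing, summarized from the functions below: when
 * the user changes an SDMA interrupt's affinity through the /proc/irq
 * interface, the notifier registered by hfi1_setup_sdma_notifier() invokes
 * hfi1_irq_notifier_notify(), which takes the first CPU of the new mask and
 * calls hfi1_update_sdma_affinity() to move the engine and update the
 * per-node accounting.
 */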

/*
 * Function updates the irq affinity hint for msix after it has been changed
 * by the user using the /proc/irq interface. This function only accepts
 * one cpu in the mask.
 */
static void hfi1_update_sdma_affinity(struct hfi1_msix_entry *msix, int cpu)
{
	struct sdma_engine *sde = msix->arg;
	struct hfi1_devdata *dd = sde->dd;
	struct hfi1_affinity_node *entry;
	struct cpu_mask_set *set;
	int i, old_cpu;

	if (cpu > num_online_cpus() || cpu == sde->cpu)
		return;

	mutex_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);
	if (!entry)
		goto unlock;

	old_cpu = sde->cpu;
	sde->cpu = cpu;
	cpumask_clear(&msix->mask);
	cpumask_set_cpu(cpu, &msix->mask);
	dd_dev_dbg(dd, "IRQ: %u, type %s engine %u -> cpu: %d\n",
		   msix->irq, irq_type_names[msix->type],
		   sde->this_idx, cpu);
	irq_set_affinity_hint(msix->irq, &msix->mask);

	/*
	 * Set the new cpu in the hfi1_affinity_node and clean
	 * the old cpu if it is not used by any other IRQ
	 */
	set = &entry->def_intr;
	cpumask_set_cpu(cpu, &set->mask);
	cpumask_set_cpu(cpu, &set->used);
	for (i = 0; i < dd->msix_info.max_requested; i++) {
		struct hfi1_msix_entry *other_msix;

		other_msix = &dd->msix_info.msix_entries[i];
		if (other_msix->type != IRQ_SDMA || other_msix == msix)
			continue;

		if (cpumask_test_cpu(old_cpu, &other_msix->mask))
			goto unlock;
	}
	cpumask_clear_cpu(old_cpu, &set->mask);
	cpumask_clear_cpu(old_cpu, &set->used);
unlock:
	mutex_unlock(&node_affinity.lock);
}

static void hfi1_irq_notifier_notify(struct irq_affinity_notify *notify,
				     const cpumask_t *mask)
{
	int cpu = cpumask_first(mask);
	struct hfi1_msix_entry *msix = container_of(notify,
						    struct hfi1_msix_entry,
						    notify);

	/* Only one CPU configuration supported currently */
	hfi1_update_sdma_affinity(msix, cpu);
}

static void hfi1_irq_notifier_release(struct kref *ref)
{
	/*
	 * This is required by affinity notifier. We don't have anything to
	 * free here.
	 */
}

static void hfi1_setup_sdma_notifier(struct hfi1_msix_entry *msix)
{
	struct irq_affinity_notify *notify = &msix->notify;

	notify->irq = msix->irq;
	notify->notify = hfi1_irq_notifier_notify;
	notify->release = hfi1_irq_notifier_release;

	if (irq_set_affinity_notifier(notify->irq, notify))
		pr_err("Failed to register sdma irq affinity notifier for irq %d\n",
		       notify->irq);
}

static void hfi1_cleanup_sdma_notifier(struct hfi1_msix_entry *msix)
{
	struct irq_affinity_notify *notify = &msix->notify;

	if (irq_set_affinity_notifier(notify->irq, NULL))
		pr_err("Failed to cleanup sdma irq affinity notifier for irq %d\n",
		       notify->irq);
}
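
/*
 * CPU pool selection in get_irq_affinity() below, summarized for reference:
 * IRQ_SDMA and IRQ_NETDEVCTXT draw from the node's def_intr set,
 * IRQ_RCVCTXT draws from rcv_intr (except the control context, which is
 * pinned to the general interrupt CPU), and IRQ_GENERAL is pinned to the
 * general interrupt CPU without any accounting.
 */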

/*
 * Function sets the irq affinity for msix.
 * It *must* be called with node_affinity.lock held.
 */
static int get_irq_affinity(struct hfi1_devdata *dd,
			    struct hfi1_msix_entry *msix)
{
	cpumask_var_t diff;
	struct hfi1_affinity_node *entry;
	struct cpu_mask_set *set = NULL;
	struct sdma_engine *sde = NULL;
	struct hfi1_ctxtdata *rcd = NULL;
	char extra[64];
	int cpu = -1;

	extra[0] = '\0';
	cpumask_clear(&msix->mask);

	entry = node_affinity_lookup(dd->node);

	switch (msix->type) {
	case IRQ_SDMA:
		sde = (struct sdma_engine *)msix->arg;
		scnprintf(extra, 64, "engine %u", sde->this_idx);
		set = &entry->def_intr;
		break;
	case IRQ_GENERAL:
		cpu = cpumask_first(&entry->general_intr_mask);
		break;
	case IRQ_RCVCTXT:
		rcd = (struct hfi1_ctxtdata *)msix->arg;
		if (rcd->ctxt == HFI1_CTRL_CTXT)
			cpu = cpumask_first(&entry->general_intr_mask);
		else
			set = &entry->rcv_intr;
		scnprintf(extra, 64, "ctxt %u", rcd->ctxt);
		break;
	case IRQ_NETDEVCTXT:
		rcd = (struct hfi1_ctxtdata *)msix->arg;
		set = &entry->def_intr;
		scnprintf(extra, 64, "ctxt %u", rcd->ctxt);
		break;
	default:
		dd_dev_err(dd, "Invalid IRQ type %d\n", msix->type);
		return -EINVAL;
	}

	/*
	 * The general and control contexts are placed on a particular
	 * CPU, which is set above. Skip accounting for it. Everything else
	 * finds its CPU here.
	 */
	if (cpu == -1 && set) {
		if (!zalloc_cpumask_var(&diff, GFP_KERNEL))
			return -ENOMEM;

		cpu = cpu_mask_set_get_first(set, diff);
		if (cpu < 0) {
			free_cpumask_var(diff);
			dd_dev_err(dd, "Failure to obtain CPU for IRQ\n");
			return cpu;
		}

		free_cpumask_var(diff);
	}

	cpumask_set_cpu(cpu, &msix->mask);
	dd_dev_info(dd, "IRQ: %u, type %s %s -> cpu: %d\n",
		    msix->irq, irq_type_names[msix->type],
		    extra, cpu);
	irq_set_affinity_hint(msix->irq, &msix->mask);

	if (msix->type == IRQ_SDMA) {
		sde->cpu = cpu;
		hfi1_setup_sdma_notifier(msix);
	}

	return 0;
}

int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix)
{
	int ret;

	mutex_lock(&node_affinity.lock);
	ret = get_irq_affinity(dd, msix);
	mutex_unlock(&node_affinity.lock);
	return ret;
}

void hfi1_put_irq_affinity(struct hfi1_devdata *dd,
			   struct hfi1_msix_entry *msix)
{
	struct cpu_mask_set *set = NULL;
	struct hfi1_affinity_node *entry;

	mutex_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);

	switch (msix->type) {
	case IRQ_SDMA:
		set = &entry->def_intr;
		hfi1_cleanup_sdma_notifier(msix);
		break;
	case IRQ_GENERAL:
		/* Don't do accounting for general contexts */
		break;
	case IRQ_RCVCTXT: {
		struct hfi1_ctxtdata *rcd = msix->arg;

		/* Don't do accounting for control contexts */
		if (rcd->ctxt != HFI1_CTRL_CTXT)
			set = &entry->rcv_intr;
		break;
	}
	case IRQ_NETDEVCTXT:
		set = &entry->def_intr;
		break;
	default:
		mutex_unlock(&node_affinity.lock);
		return;
	}

	if (set) {
		cpumask_andnot(&set->used, &set->used, &msix->mask);
		_cpu_mask_set_gen_dec(set);
	}

	irq_set_affinity_hint(msix->irq, NULL);
	cpumask_clear(&msix->mask);
	mutex_unlock(&node_affinity.lock);
}

/* This should be called with node_affinity.lock held */
static void find_hw_thread_mask(uint hw_thread_no, cpumask_var_t hw_thread_mask,
				struct hfi1_affinity_node_list *affinity)
{
	int curr_cpu;
	uint num_cores;
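
	/*
	 * Intent of the math below (the concrete counts depend on the
	 * topology): keep one hardware thread per physical core, then
	 * shift the mask so that hw_thread_no 0 selects the first sibling
	 * on each core, hw_thread_no 1 the second sibling, and so on.
	 */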

	cpumask_copy(hw_thread_mask, &affinity->proc.mask);

	if (affinity->num_core_siblings == 0)
		return;

	num_cores = rounddown(node_affinity.num_online_cpus / affinity->num_core_siblings,
			      node_affinity.num_online_nodes);

	/* Removing other siblings not needed for now */
	curr_cpu = cpumask_nth(num_cores * node_affinity.num_online_nodes, hw_thread_mask) + 1;
	cpumask_clear_cpus(hw_thread_mask, curr_cpu, nr_cpu_ids - curr_cpu);

	/* Identifying correct HW threads within physical cores */
	cpumask_shift_left(hw_thread_mask, hw_thread_mask, num_cores * hw_thread_no);
}

int hfi1_get_proc_affinity(int node)
{
	int cpu = -1, ret, i;
	struct hfi1_affinity_node *entry;
	cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask;
	const struct cpumask *node_mask,
		*proc_mask = current->cpus_ptr;
	struct hfi1_affinity_node_list *affinity = &node_affinity;
	struct cpu_mask_set *set = &affinity->proc;

	/*
	 * check whether process/context affinity has already
	 * been set
	 */
	if (current->nr_cpus_allowed == 1) {
		hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl",
			  current->pid, current->comm,
			  cpumask_pr_args(proc_mask));
		/*
		 * Mark the pre-set CPU as used. This is atomic so we don't
		 * need the lock
		 */
		cpu = cpumask_first(proc_mask);
		cpumask_set_cpu(cpu, &set->used);
		goto done;
	} else if (current->nr_cpus_allowed < cpumask_weight(&set->mask)) {
		hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl",
			  current->pid, current->comm,
			  cpumask_pr_args(proc_mask));
		goto done;
	}

	/*
	 * The process does not have a preset CPU affinity so find one to
	 * recommend using the following algorithm:
	 *
	 * For each user process that is opening a context on HFI Y:
	 *  a) If all cores are filled, reinitialize the bitmask
	 *  b) Fill real cores first, then HT cores (first set of HT
	 *     cores on all physical cores, then the second set of HT
	 *     cores, and so on) in the following order:
	 *
	 *     1. Same NUMA node as HFI Y and not running an IRQ
	 *        handler
	 *     2. Same NUMA node as HFI Y and running an IRQ handler
	 *     3. Different NUMA node to HFI Y and not running an IRQ
	 *        handler
	 *     4. Different NUMA node to HFI Y and running an IRQ
	 *        handler
	 *  c) Mark core as filled in the bitmask. As user processes are
	 *     done, clear cores from the bitmask.
	 */

	ret = zalloc_cpumask_var(&diff, GFP_KERNEL);
	if (!ret)
		goto done;
	ret = zalloc_cpumask_var(&hw_thread_mask, GFP_KERNEL);
	if (!ret)
		goto free_diff;
	ret = zalloc_cpumask_var(&available_mask, GFP_KERNEL);
	if (!ret)
		goto free_hw_thread_mask;
	ret = zalloc_cpumask_var(&intrs_mask, GFP_KERNEL);
	if (!ret)
		goto free_available_mask;

	mutex_lock(&affinity->lock);
	/*
	 * If we've used all available HW threads, clear the mask and start
	 * overloading.
	 */
	_cpu_mask_set_gen_inc(set);
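
	/*
	 * Illustrative walk-through (hypothetical layout): for a device on
	 * NUMA node 1 whose node CPUs are 8-15, with CPUs 8-9 acting as
	 * interrupt handler CPUs, the search below recommends one of CPUs
	 * 10-15 first, then 8-9, then CPUs on other NUMA nodes that are not
	 * running handlers, and finally any remaining CPU.
	 */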

	/*
	 * If the NUMA node has CPUs used by interrupt handlers, include them
	 * in the interrupt handler mask.
	 */
	entry = node_affinity_lookup(node);
	if (entry) {
		cpumask_copy(intrs_mask, (entry->def_intr.gen ?
					  &entry->def_intr.mask :
					  &entry->def_intr.used));
		cpumask_or(intrs_mask, intrs_mask, (entry->rcv_intr.gen ?
						    &entry->rcv_intr.mask :
						    &entry->rcv_intr.used));
		cpumask_or(intrs_mask, intrs_mask, &entry->general_intr_mask);
	}
	hfi1_cdbg(PROC, "CPUs used by interrupts: %*pbl",
		  cpumask_pr_args(intrs_mask));

	cpumask_copy(hw_thread_mask, &set->mask);

	/*
	 * If HT cores are enabled, identify which HW threads within the
	 * physical cores should be used.
	 */
	for (i = 0; i < affinity->num_core_siblings; i++) {
		find_hw_thread_mask(i, hw_thread_mask, affinity);

		/*
		 * If there's at least one available core for this HW
		 * thread number, stop looking for a core.
		 *
		 * diff will always be non-empty at least once in this
		 * loop as the used mask gets reset when
		 * (set->mask == set->used) before this loop.
		 */
		if (cpumask_andnot(diff, hw_thread_mask, &set->used))
			break;
	}
	hfi1_cdbg(PROC, "Same available HW thread on all physical CPUs: %*pbl",
		  cpumask_pr_args(hw_thread_mask));

	node_mask = cpumask_of_node(node);
	hfi1_cdbg(PROC, "Device on NUMA %u, CPUs %*pbl", node,
		  cpumask_pr_args(node_mask));

	/* Get cpumask of available CPUs on preferred NUMA */
	cpumask_and(available_mask, hw_thread_mask, node_mask);
	cpumask_andnot(available_mask, available_mask, &set->used);
	hfi1_cdbg(PROC, "Available CPUs on NUMA %u: %*pbl", node,
		  cpumask_pr_args(available_mask));

	/*
	 * At first, we don't want to place processes on the same
	 * CPUs as interrupt handlers. Then, CPUs running interrupt
	 * handlers are used.
	 *
	 * 1) If diff is not empty, then there are CPUs not running
	 *    interrupt handlers available, so diff gets copied
	 *    over to available_mask.
	 * 2) If diff is empty, then all CPUs not running interrupt
	 *    handlers are taken, so available_mask contains all
	 *    available CPUs running interrupt handlers.
	 * 3) If available_mask is empty, then all CPUs on the
	 *    preferred NUMA node are taken, so other NUMA nodes are
	 *    used for process assignments using the same method as
	 *    the preferred NUMA node.
	 */
	if (cpumask_andnot(diff, available_mask, intrs_mask))
		cpumask_copy(available_mask, diff);

	/* If we don't have CPUs on the preferred node, use other NUMA nodes */
	if (cpumask_empty(available_mask)) {
		cpumask_andnot(available_mask, hw_thread_mask, &set->used);
		/* Excluding preferred NUMA cores */
		cpumask_andnot(available_mask, available_mask, node_mask);
		hfi1_cdbg(PROC,
			  "Preferred NUMA node cores are taken, cores available in other NUMA nodes: %*pbl",
			  cpumask_pr_args(available_mask));

		/*
		 * At first, we don't want to place processes on the same
		 * CPUs as interrupt handlers.
		 */
		if (cpumask_andnot(diff, available_mask, intrs_mask))
			cpumask_copy(available_mask, diff);
	}
	hfi1_cdbg(PROC, "Possible CPUs for process: %*pbl",
		  cpumask_pr_args(available_mask));

	cpu = cpumask_first(available_mask);
	if (cpu >= nr_cpu_ids) /* empty */
		cpu = -1;
	else
		cpumask_set_cpu(cpu, &set->used);

	mutex_unlock(&affinity->lock);
	hfi1_cdbg(PROC, "Process assigned to CPU %d", cpu);

	free_cpumask_var(intrs_mask);
free_available_mask:
	free_cpumask_var(available_mask);
free_hw_thread_mask:
	free_cpumask_var(hw_thread_mask);
free_diff:
	free_cpumask_var(diff);
done:
	return cpu;
}

void hfi1_put_proc_affinity(int cpu)
{
	struct hfi1_affinity_node_list *affinity = &node_affinity;
	struct cpu_mask_set *set = &affinity->proc;

	if (cpu < 0)
		return;

	mutex_lock(&affinity->lock);
	cpu_mask_set_put(set, cpu);
	hfi1_cdbg(PROC, "Returning CPU %d for future process assignment", cpu);
	mutex_unlock(&affinity->lock);
}